fustor-source-oss 0.2 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fustor_source_oss/__init__.py +0 -0
- fustor_source_oss/config.py +70 -0
- fustor_source_oss/driver.py +274 -0
- fustor_source_oss/mapper.py +109 -0
- fustor_source_oss-0.2.dist-info/METADATA +8 -0
- fustor_source_oss-0.2.dist-info/RECORD +9 -0
- fustor_source_oss-0.2.dist-info/WHEEL +5 -0
- fustor_source_oss-0.2.dist-info/entry_points.txt +2 -0
- fustor_source_oss-0.2.dist-info/top_level.txt +1 -0
fustor_source_oss/__init__.py
File without changes
fustor_source_oss/config.py
@@ -0,0 +1,70 @@
from pydantic import BaseModel, Field, AnyHttpUrl, ValidationError, model_validator
from typing import Optional, Literal, Dict, Any
from enum import Enum


class QueueType(str, Enum):
    """Defines the supported queue types for incremental synchronization."""
    POLLING = "polling"  # Simple polling based on object modification time
    SQS = "sqs"          # AWS SQS for event notifications
    AMQP = "amqp"        # AMQP (e.g., RabbitMQ) for event notifications


class SQSQueueConfig(BaseModel):
    """Configuration for AWS SQS queue."""
    queue_url: str = Field(..., description="URL of the SQS queue.")
    region_name: Optional[str] = Field(None, description="AWS region of the SQS queue.")
    visibility_timeout: int = Field(30, ge=0, description="Message visibility timeout in seconds.")


class AMQPQueueConfig(BaseModel):
    """Configuration for AMQP (RabbitMQ) queue."""
    host: str = Field(..., description="AMQP broker hostname.")
    port: int = Field(5672, gt=0, description="AMQP broker port.")
    username: str = Field("guest", description="AMQP username.")
    password: str = Field("guest", description="AMQP password.")
    queue_name: str = Field(..., description="Name of the queue to consume.")


class PollingQueueConfig(BaseModel):
    """Configuration for polling-based incremental sync."""
    interval_seconds: int = Field(30, ge=1, description="Polling interval in seconds.")


class OssDriverParams(BaseModel):
    """
    Configuration parameters specific to the OSS Source Driver.
    These parameters are stored within the 'driver_params' field of the core SourceConfig.
    """
    endpoint_url: str = Field(..., description="S3-compatible service endpoint URL.")
    bucket_name: str = Field(..., min_length=3, description="Name of the S3 bucket.")
    region_name: Optional[str] = Field(None, description="AWS region of the S3 bucket.")
    prefix: str = Field("", description="Optional object key prefix to filter files.")
    recursive: bool = Field(True, description="Whether to recursively list objects within the prefix.")

    # Incremental synchronization strategy
    queue_type: QueueType = Field(QueueType.POLLING, description="Strategy for incremental synchronization.")

    # Specific queue configurations, validated conditionally
    sqs_queue_config: Optional[SQSQueueConfig] = Field(None, description="SQS queue configuration if queue_type is SQS.")
    amqp_queue_config: Optional[AMQPQueueConfig] = Field(None, description="AMQP queue configuration if queue_type is AMQP.")
    polling_queue_config: Optional[PollingQueueConfig] = Field(None, description="Polling queue configuration if queue_type is POLLING.")

    # Custom validator to ensure the correct queue config is provided based on queue_type
    @model_validator(mode="before")
    @classmethod
    def validate_queue_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        queue_type = values.get("queue_type")
        if queue_type == QueueType.SQS:
            if not values.get("sqs_queue_config"):
                raise ValueError("sqs_queue_config must be provided for SQS queue_type.")
            # Ensure other queue configs are not present if SQS is chosen
            values["amqp_queue_config"] = None
            values["polling_queue_config"] = None
        elif queue_type == QueueType.AMQP:
            if not values.get("amqp_queue_config"):
                raise ValueError("amqp_queue_config must be provided for AMQP queue_type.")
            # Ensure other queue configs are not present if AMQP is chosen
            values["sqs_queue_config"] = None
            values["polling_queue_config"] = None
        elif queue_type == QueueType.POLLING:
            # Polling config is optional, but if present, others are cleared
            values["sqs_queue_config"] = None
            values["amqp_queue_config"] = None
        return values
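To illustrate how the conditional validation above behaves, here is a minimal sketch (not part of the package) that builds OssDriverParams for the SQS strategy. The endpoint, bucket, and queue URL values are invented for the example.

# Illustrative only: exercising OssDriverParams validation outside the driver.
from fustor_source_oss.config import OssDriverParams, QueueType

params = OssDriverParams(
    endpoint_url="http://minio:9000",   # hypothetical endpoint
    bucket_name="my-fustor-data",       # hypothetical bucket
    prefix="logs/",
    queue_type="sqs",
    sqs_queue_config={"queue_url": "https://sqs.us-east-1.amazonaws.com/123456789012/example"},
)
assert params.queue_type is QueueType.SQS
# The model validator clears the queue configs that do not match queue_type.
assert params.amqp_queue_config is None and params.polling_queue_config is None
# Omitting sqs_queue_config while queue_type is "sqs" raises a ValidationError.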
fustor_source_oss/driver.py
@@ -0,0 +1,274 @@
import logging
import asyncio
from typing import Dict, Any, Iterator, Tuple, Optional
import boto3
from botocore.exceptions import ClientError
from datetime import datetime, timezone

from fustor_core.drivers import SourceDriver
from fustor_core.models.config import SourceConfig, PasswdCredential
from fustor_core.exceptions import DriverError
from fustor_event_model.models import EventBase, EventType

from .config import OssDriverParams, QueueType
from .mapper import map_s3_objects_to_events_batch

logger = logging.getLogger(__name__)

class OssSourceDriver(SourceDriver):
    """
    Fustor Source Driver for S3-compatible object storage.
    Supports snapshot (full listing) and polling-based incremental synchronization.
    """

    def __init__(self, id: str, config: SourceConfig):
        super().__init__(id, config)
        self.logger = logging.getLogger(f"{__name__}.{id}")

        # Parse OSS-specific parameters from driver_params
        try:
            self.oss_params = OssDriverParams(**config.driver_params)
        except Exception as e:
            raise DriverError(f"OSS configuration invalid: {e}")

        # Parse credentials
        if not isinstance(config.credential, PasswdCredential):
            raise DriverError("OSS driver requires PasswdCredential (Access Key ID / Secret Access Key).")

        self.ak = config.credential.user
        self.sk = config.credential.passwd

        self.bucket_name = self.oss_params.bucket_name
        self.endpoint_url = self.oss_params.endpoint_url
        self.region_name = self.oss_params.region_name
        self.prefix = self.oss_params.prefix

        # Initialize S3 client
        try:
            self.s3_client = boto3.client(
                "s3",
                endpoint_url=self.endpoint_url,
                aws_access_key_id=self.ak,
                aws_secret_access_key=self.sk,
                region_name=self.region_name,
            )
            self.logger.info(f"Initialized S3 client for bucket '{self.bucket_name}' at '{self.endpoint_url}'")
        except Exception as e:
            raise DriverError(f"Failed to initialize S3 client: {e}")

    async def get_snapshot_iterator(self, **kwargs) -> Iterator[EventBase]:
        """
        Performs a one-time, full snapshot of the source data by listing all objects
        matching the prefix in the specified S3 bucket.
        """
        self.logger.info(f"Starting snapshot for bucket '{self.bucket_name}' with prefix '{self.prefix}'")

        paginator = self.s3_client.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(
            Bucket=self.bucket_name,
            Prefix=self.prefix,
            PaginationConfig={"PageSize": kwargs.get("batch_size", 1000)}  # Default batch size
        )

        for page in page_iterator:
            contents = page.get("Contents", [])
            if not contents:
                continue

            # Convert to a Fustor EventBase batch
            try:
                event_batch = map_s3_objects_to_events_batch(
                    contents,
                    self.bucket_name,
                    "objects",  # Logical table name for S3 objects
                    EventType.INSERT
                )
                yield event_batch
            except Exception as e:
                self.logger.error(f"Error mapping S3 objects to event batch: {e}", exc_info=True)
                continue

        self.logger.info(f"Snapshot completed for bucket '{self.bucket_name}' with prefix '{self.prefix}'")

    def is_position_available(self, position: int) -> bool:
        """
        For OSS, positions are timestamps. We assume any past timestamp is "available"
        as long as the object was not deleted. Polling logic will handle filtering.
        """
        return True  # For polling, we can always attempt to resume from a timestamp

    async def get_message_iterator(self, start_position: int = -1, **kwargs) -> Iterator[EventBase]:
        """
        Performs incremental data capture (CDC) for OSS.

        In polling mode, this method periodically lists objects and filters them
        by a LastModified timestamp greater than start_position.
        """
        if self.oss_params.queue_type != QueueType.POLLING:
            raise DriverError(f"Unsupported queue_type '{self.oss_params.queue_type}' for get_message_iterator. "
                              "Only POLLING is currently implemented.")

        polling_interval = self.oss_params.polling_queue_config.interval_seconds \
            if self.oss_params.polling_queue_config else 30

        self.logger.info(f"Starting message polling for bucket '{self.bucket_name}' with prefix '{self.prefix}' "
                         f"from position {start_position}, interval: {polling_interval}s")

        last_known_position = start_position
        while True:
            try:
                # S3 ListObjectsV2 doesn't support filtering by LastModified directly.
                # We must list and then filter client-side.
                # This can be inefficient for very large buckets/prefixes.
                paginator = self.s3_client.get_paginator("list_objects_v2")
                page_iterator = paginator.paginate(
                    Bucket=self.bucket_name,
                    Prefix=self.prefix,
                )

                new_or_modified_objects = []
                current_max_timestamp = last_known_position

                for page in page_iterator:
                    contents = page.get("Contents", [])
                    for s3_obj in contents:
                        last_modified: datetime = s3_obj.get("LastModified")
                        if last_modified:
                            obj_timestamp = int(last_modified.timestamp())

                            # Filter for objects modified AFTER the last_known_position.
                            # We use '>=' to include items modified in the same second as the last checkpoint;
                            # the consumer (Agent) should handle exact duplicates, if any.
                            if obj_timestamp >= last_known_position:
                                new_or_modified_objects.append(s3_obj)
                                if obj_timestamp > current_max_timestamp:
                                    current_max_timestamp = obj_timestamp

                if new_or_modified_objects:
                    self.logger.debug(f"Found {len(new_or_modified_objects)} new/modified objects.")
                    # Sort by timestamp to maintain order
                    new_or_modified_objects.sort(key=lambda x: int(x.get("LastModified", datetime.min).timestamp()))

                    # Group into batches
                    batch_size = kwargs.get("batch_size", 100)  # Use the batch_size from config
                    for i in range(0, len(new_or_modified_objects), batch_size):
                        batch_objects = new_or_modified_objects[i:i + batch_size]

                        try:
                            event_batch = map_s3_objects_to_events_batch(
                                batch_objects,
                                self.bucket_name,
                                "objects",
                                EventType.UPDATE  # Treat as update/insert for changes
                            )
                            yield event_batch
                            last_known_position = int(event_batch.index)  # Update position after yielding a batch
                        except Exception as e:
                            self.logger.error(f"Error mapping S3 objects to event batch during polling: {e}", exc_info=True)
                            continue
                else:
                    self.logger.debug("No new or modified objects found in this polling cycle.")

                # Update last_known_position to the highest timestamp seen in this cycle.
                # This prevents re-processing objects already handled in this polling interval.
                last_known_position = current_max_timestamp

            except ClientError as e:
                error_code = e.response.get("Error", {}).get("Code")
                if error_code == "NoSuchBucket":
                    raise DriverError(f"S3 Bucket '{self.bucket_name}' not found: {e}")
                elif error_code == "AccessDenied":
                    raise DriverError(f"Access denied to S3 Bucket '{self.bucket_name}': {e}")
                else:
                    raise DriverError(f"S3 client error during polling: {e}")
            except Exception as e:
                self.logger.error(f"Unexpected error during S3 polling: {e}", exc_info=True)
                raise DriverError(f"Unexpected error during S3 polling: {e}")

            await asyncio.sleep(polling_interval)

    async def test_connection(self, **kwargs) -> Tuple[bool, str]:
        """
        Tests the connection to the S3 service by performing a head_bucket operation.
        """
        self.logger.info(f"Testing connection to S3 bucket '{self.bucket_name}' at '{self.endpoint_url}'")
        try:
            await asyncio.to_thread(self.s3_client.head_bucket, Bucket=self.bucket_name)
            self.logger.info("S3 connection test successful.")
            return True, "Connection successful."
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code")
            error_message = e.response.get("Error", {}).get("Message")
            if error_code == "404":  # Bucket not found
                return False, f"Bucket '{self.bucket_name}' not found. Error: {error_message}"
            elif error_code == "403":  # Access denied
                return False, f"Access denied to bucket '{self.bucket_name}'. Please check credentials and permissions. Error: {error_message}"
            else:
                return False, f"S3 connection failed with client error: {error_code} - {error_message}"
        except Exception as e:
            self.logger.error(f"S3 connection test failed with unexpected error: {e}", exc_info=True)
            return False, f"Connection failed: {str(e)}"

    @classmethod
    async def get_available_fields(cls, **kwargs) -> Dict:
        """
        Declares the data fields that this source can provide.
        For S3, these are common S3 object metadata fields.
        """
        return {
            "type": "object",
            "properties": {
                "Key": {"type": "string", "description": "Object key (path)."},
                "Size": {"type": "integer", "description": "Object size in bytes."},
                "LastModified": {"type": "string", "format": "date-time", "description": "Last modified timestamp."},
                "ETag": {"type": "string", "description": "ETag of the object."},
                "StorageClass": {"type": "string", "description": "Storage class of the object."},
                "OwnerId": {"type": "string", "description": "Canonical user ID of the object owner."},
                "OwnerDisplayName": {"type": "string", "description": "Display name of the object owner."},
            },
            "required": ["Key", "Size", "LastModified"]
        }

    @classmethod
    async def get_wizard_steps(cls) -> Dict[str, Any]:
        """
        Provides configuration wizard steps for UI integration.
        """
        return {
            "steps": [
                {
                    "title": "Basic connection settings",
                    "description": "Configure the connection to the S3-compatible object storage service.",
                    "fields": [
                        {"key": "driver_params.endpoint_url", "label": "Endpoint URL", "type": "text", "required": True, "placeholder": "e.g., https://s3.amazonaws.com or http://minio:9000"},
                        {"key": "driver_params.bucket_name", "label": "Bucket name", "type": "text", "required": True, "placeholder": "e.g., my-fustor-data"},
                        {"key": "driver_params.region_name", "label": "Region name (optional)", "type": "text", "placeholder": "e.g., us-east-1"},
                        {"key": "driver_params.prefix", "label": "Key prefix (optional)", "type": "text", "placeholder": "e.g., logs/2023/", "help_text": "Only objects whose keys start with this prefix are synchronized."},
                        {"key": "driver_params.recursive", "label": "Recursive listing", "type": "boolean", "default": True, "help_text": "Whether to recursively traverse sub-directories."},
                    ]
                },
                {
                    "title": "Credentials",
                    "description": "Provide the credentials used to access the S3 bucket.",
                    "fields": [
                        {"key": "credential.user", "label": "Access Key ID", "type": "text", "required": True, "placeholder": "Your S3 Access Key ID"},
                        {"key": "credential.passwd", "label": "Secret Access Key", "type": "password", "required": True, "placeholder": "Your S3 Secret Access Key"},
                    ]
                },
                {
                    "title": "Incremental sync strategy",
                    "description": "Choose the strategy used to capture object changes. 'Polling' suits small buckets or testing; a message queue is recommended otherwise.",
                    "fields": [
                        {"key": "driver_params.queue_type", "label": "Sync strategy", "type": "select", "required": True, "options": [
                            {"value": "polling", "label": "Polling"},
                            {"value": "sqs", "label": "SQS (AWS SQS / compatible)"},
                            {"value": "amqp", "label": "AMQP (RabbitMQ / compatible)"},
                        ]},
                        # Conditional fields for SQS
                        {"key": "driver_params.sqs_queue_config.queue_url", "label": "SQS queue URL", "type": "text", "required": True, "condition_key": "driver_params.queue_type", "condition_value": "sqs", "placeholder": "e.g., https://sqs.us-east-1.amazonaws.com/..."},
                        # ... other SQS/AMQP fields will go here when implemented
                    ]
                }
            ]
        }
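Because ListObjectsV2 cannot filter on LastModified server-side, the polling loop above relists the prefix on every cycle and filters client-side. The standalone sketch below (not part of the package; endpoint, bucket, and credentials are placeholders) shows that filtering step with plain boto3.

# Minimal sketch of the client-side LastModified filter used by the polling loop.
# Endpoint, bucket, and credential values are placeholders for illustration.
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000",
    aws_access_key_id="<access-key-id>",
    aws_secret_access_key="<secret-access-key>",
)

def list_changed_objects(bucket: str, prefix: str, since_ts: int) -> list[dict]:
    """Return objects under `prefix` whose LastModified is at or after `since_ts` (Unix seconds)."""
    changed = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if int(obj["LastModified"].timestamp()) >= since_ts:
                changed.append(obj)
    # Order by modification time, like the driver does before batching.
    changed.sort(key=lambda o: o["LastModified"])
    return changed

Each returned dict has the same Key/Size/LastModified/ETag shape that map_s3_objects_to_events_batch consumes.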
fustor_source_oss/mapper.py
@@ -0,0 +1,109 @@
from typing import Dict, Any, List
from datetime import datetime
from fustor_event_model.models import EventBase, EventType

def map_s3_object_to_event(
    s3_object: Dict[str, Any],
    bucket_name: str,
    table_name: str,
    event_type: EventType = EventType.INSERT
) -> EventBase:
    """
    Maps a single S3 object dictionary to a Fustor EventBase.

    Args:
        s3_object: A dictionary representing an S3 object (from boto3 ListObjectsV2).
        bucket_name: The name of the S3 bucket.
        table_name: The logical table name for the event (e.g., 'objects').
        event_type: The type of event (INSERT, UPDATE, DELETE).

    Returns:
        An EventBase instance.
    """

    # Extract relevant metadata from the S3 object
    key = s3_object.get("Key")
    if not key:
        raise ValueError("S3 object must have a 'Key'.")

    size = s3_object.get("Size")
    last_modified: datetime = s3_object.get("LastModified")
    etag = s3_object.get("ETag")
    storage_class = s3_object.get("StorageClass")
    owner_id = s3_object.get("Owner", {}).get("ID")
    owner_display_name = s3_object.get("Owner", {}).get("DisplayName")

    # The index is the Unix timestamp of LastModified, used for ordering.
    # Use -1 if LastModified is not available, though it usually is for objects.
    index = int(last_modified.timestamp()) if last_modified else -1

    # Prepare the row data
    row_data = {
        "Key": key,
        "Size": size,
        "LastModified": last_modified.isoformat() if last_modified else None,
        "ETag": etag,
        "StorageClass": storage_class,
        "OwnerId": owner_id,
        "OwnerDisplayName": owner_display_name,
        # Add any other relevant S3 object metadata here
    }

    # Define the fields present in the row
    fields = list(row_data.keys())

    return EventBase(
        event_type=event_type,
        event_schema=bucket_name,  # Use bucket name as the schema
        table=table_name,
        index=index,
        fields=fields,
        rows=[row_data],  # Each S3 object becomes one row in an event batch
    )

def map_s3_objects_to_events_batch(
    s3_objects: List[Dict[str, Any]],
    bucket_name: str,
    table_name: str,
    event_type: EventType = EventType.INSERT
) -> EventBase:
    """
    Maps a list of S3 objects to a single EventBase with multiple rows.
    This is useful for batching inserts/updates from a paginator.
    """
    if not s3_objects:
        raise ValueError("s3_objects list cannot be empty.")

    # Assume all objects in the batch belong to the same schema and table
    # and have a similar structure for fields.
    first_object = s3_objects[0]

    # The index for a batch event could be the largest LastModified timestamp
    # or the last object's timestamp in the batch. Use the last object's for now.
    last_modified = s3_objects[-1].get("LastModified")
    index = int(last_modified.timestamp()) if last_modified else -1

    rows: List[Dict[str, Any]] = []
    for s3_obj in s3_objects:
        row_data = {
            "Key": s3_obj.get("Key"),
            "Size": s3_obj.get("Size"),
            "LastModified": s3_obj.get("LastModified").isoformat() if s3_obj.get("LastModified") else None,
            "ETag": s3_obj.get("ETag"),
            "StorageClass": s3_obj.get("StorageClass"),
            "OwnerId": s3_obj.get("Owner", {}).get("ID"),
            "OwnerDisplayName": s3_obj.get("Owner", {}).get("DisplayName"),
        }
        rows.append(row_data)

    # All rows in the batch are assumed to have the same fields; derive them from the first one.
    fields = list(rows[0].keys()) if rows else []

    return EventBase(
        event_type=event_type,
        event_schema=bucket_name,
        table=table_name,
        index=index,
        fields=fields,
        rows=rows,
    )
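As a quick illustration of the batch mapper, the hypothetical listing entry below (the key, ETag, size, and timestamp are invented) is turned into a single-row EventBase.

# Illustrative only: feeding one fabricated ListObjectsV2 entry through the batch mapper.
from datetime import datetime, timezone
from fustor_source_oss.mapper import map_s3_objects_to_events_batch
from fustor_event_model.models import EventType

sample_object = {
    "Key": "logs/2023/app.log",   # hypothetical key
    "Size": 1024,
    "LastModified": datetime(2023, 5, 1, 12, 0, tzinfo=timezone.utc),
    "ETag": '"d41d8cd98f00b204e9800998ecf8427e"',
    "StorageClass": "STANDARD",
}

event = map_s3_objects_to_events_batch([sample_object], "my-fustor-data", "objects", EventType.INSERT)
print(event.index)           # Unix timestamp of the sample LastModified
print(event.rows[0]["Key"])  # "logs/2023/app.log"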
fustor_source_oss-0.2.dist-info/METADATA
@@ -0,0 +1,8 @@
Metadata-Version: 2.4
Name: fustor-source-oss
Version: 0.2
Summary: Fustor Source Driver for S3-compatible object storage (OSS)
Requires-Python: >=3.11
Requires-Dist: boto3<2.0.0,>=1.28.0
Requires-Dist: fustor-core
Requires-Dist: pydantic<3.0.0,>=2.0.0
fustor_source_oss-0.2.dist-info/RECORD
@@ -0,0 +1,9 @@
fustor_source_oss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
fustor_source_oss/config.py,sha256=Ozl_Aj02G66BD4maDkimHHUtr0woGKh3JIxIz3-svb4,3924
fustor_source_oss/driver.py,sha256=tnpD5tqfnLPWTkCLrUMmgDF5t_AJnLiqSS6JBaJSmVw,14381
fustor_source_oss/mapper.py,sha256=GUJSU3ylJuVWI5V4j7spx0Tr-7YcAQtAUhzVFF8kdJw,3830
fustor_source_oss-0.2.dist-info/METADATA,sha256=Dl6aJVRZWlSR24dwPby1VpdmZrfr2y35e1W2hz4ux98,253
fustor_source_oss-0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
fustor_source_oss-0.2.dist-info/entry_points.txt,sha256=3jSvXrC9TTuba3sFWtAhDVpBSZRxsYyj9nhsYRiqo8c,85
fustor_source_oss-0.2.dist-info/top_level.txt,sha256=hq80-RUZ_j94qqXmO-jGI50cZrWD143nkbuO_xeWGdg,18
fustor_source_oss-0.2.dist-info/RECORD,,
fustor_source_oss-0.2.dist-info/top_level.txt
@@ -0,0 +1 @@
fustor_source_oss