fustor-source-oss 0.2 (fustor_source_oss-0.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fustor_source_oss/__init__.py (file without changes)
fustor_source_oss/config.py
@@ -0,0 +1,70 @@
1
+ from pydantic import BaseModel, Field, AnyHttpUrl, ValidationError, model_validator
2
+ from typing import Optional, Literal, Dict, Any
3
+ from enum import Enum
4
+
5
+ class QueueType(str, Enum):
6
+ """Defines the supported queue types for incremental synchronization."""
7
+ POLLING = "polling" # Simple polling based on object modification time
8
+ SQS = "sqs" # AWS SQS for event notifications
9
+ AMQP = "amqp" # AMQP (e.g., RabbitMQ) for event notifications
10
+
11
+ class SQSQueueConfig(BaseModel):
12
+ """Configuration for AWS SQS queue."""
13
+ queue_url: str = Field(..., description="URL of the SQS queue.")
14
+ region_name: Optional[str] = Field(None, description="AWS region of the SQS queue.")
15
+ visibility_timeout: int = Field(30, ge=0, description="Message visibility timeout in seconds.")
16
+
17
+ class AMQPQueueConfig(BaseModel):
18
+ """Configuration for AMQP (RabbitMQ) queue."""
19
+ host: str = Field(..., description="AMQP broker hostname.")
20
+ port: int = Field(5672, gt=0, description="AMQP broker port.")
21
+ username: str = Field("guest", description="AMQP username.")
22
+ password: str = Field("guest", description="AMQP password.")
23
+ queue_name: str = Field(..., description="Name of the queue to consume.")
24
+
25
+ class PollingQueueConfig(BaseModel):
26
+ """Configuration for polling-based incremental sync."""
27
+ interval_seconds: int = Field(30, ge=1, description="Polling interval in seconds.")
28
+
29
+ class OssDriverParams(BaseModel):
30
+ """
31
+ Configuration parameters specific to the OSS Source Driver.
32
+ These parameters are stored within the 'driver_params' field of the core SourceConfig.
33
+ """
34
+ endpoint_url: str = Field(..., description="S3-compatible service endpoint URL.")
35
+ bucket_name: str = Field(..., min_length=3, description="Name of the S3 bucket.")
36
+ region_name: Optional[str] = Field(None, description="AWS region of the S3 bucket.")
37
+ prefix: str = Field("", description="Optional object key prefix to filter files.")
38
+ recursive: bool = Field(True, description="Whether to recursively list objects within the prefix.")
39
+
40
+ # Incremental synchronization strategy
41
+ queue_type: QueueType = Field(QueueType.POLLING, description="Strategy for incremental synchronization.")
42
+
43
+ # Specific queue configurations, validated conditionally
44
+ sqs_queue_config: Optional[SQSQueueConfig] = Field(None, description="SQS queue configuration if queue_type is SQS.")
45
+ amqp_queue_config: Optional[AMQPQueueConfig] = Field(None, description="AMQP queue configuration if queue_type is AMQP.")
46
+ polling_queue_config: Optional[PollingQueueConfig] = Field(None, description="Polling queue configuration if queue_type is POLLING.")
47
+
48
+ # Custom validator to ensure correct queue config is provided based on queue_type
49
+ @classmethod
50
+ def validate_queue_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
51
+ queue_type = values.get("queue_type")
52
+ if queue_type == QueueType.SQS:
53
+ if not values.get("sqs_queue_config"):
54
+ raise ValueError("sqs_queue_config must be provided for SQS queue_type.")
55
+ # Ensure other queue configs are not present if SQS is chosen
56
+ values["amqp_queue_config"] = None
57
+ values["polling_queue_config"] = None
58
+ elif queue_type == QueueType.AMQP:
59
+ if not values.get("amqp_queue_config"):
60
+ raise ValueError("amqp_queue_config must be provided for AMQP queue_type.")
61
+ # Ensure other queue configs are not present if AMQP is chosen
62
+ values["sqs_queue_config"] = None
63
+ values["polling_queue_config"] = None
64
+ elif queue_type == QueueType.POLLING:
65
+ # Polling config is optional, but if present, others are cleared
66
+ values["sqs_queue_config"] = None
67
+ values["amqp_queue_config"] = None
68
+ return values
69
+
70
+ model_validator(mode='before')(validate_queue_config)
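For reference, a minimal usage sketch of the OssDriverParams model defined above. It is not part of the package; it assumes Pydantic v2 semantics with the validator registered as shown, and the endpoint, bucket, and queue URL values are placeholders.

```python
from pydantic import ValidationError

from fustor_source_oss.config import OssDriverParams, SQSQueueConfig

# Default strategy is polling; no queue config is required.
params = OssDriverParams(endpoint_url="http://minio:9000", bucket_name="my-fustor-data")
assert params.queue_type == "polling"

# Selecting SQS without an sqs_queue_config is rejected by the model validator.
try:
    OssDriverParams(
        endpoint_url="http://minio:9000",
        bucket_name="my-fustor-data",
        queue_type="sqs",
    )
except ValidationError as exc:
    print(exc)  # includes "sqs_queue_config must be provided for SQS queue_type."

# Selecting SQS with a queue config passes; the other queue configs are cleared to None.
params = OssDriverParams(
    endpoint_url="http://minio:9000",
    bucket_name="my-fustor-data",
    queue_type="sqs",
    sqs_queue_config=SQSQueueConfig(queue_url="https://sqs.us-east-1.amazonaws.com/123456789012/example"),
)
assert params.amqp_queue_config is None and params.polling_queue_config is None
```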
fustor_source_oss/driver.py
@@ -0,0 +1,274 @@
1
+ import logging
2
+ import asyncio
3
+ from typing import Dict, Any, AsyncIterator, Tuple, Optional
4
+ import boto3
5
+ from botocore.exceptions import ClientError
6
+ from datetime import datetime, timezone
7
+
8
+ from fustor_core.drivers import SourceDriver
9
+ from fustor_core.models.config import SourceConfig, PasswdCredential
10
+ from fustor_core.exceptions import DriverError
11
+ from fustor_event_model.models import EventBase, EventType
12
+
13
+ from .config import OssDriverParams, QueueType
14
+ from .mapper import map_s3_objects_to_events_batch
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ class OssSourceDriver(SourceDriver):
19
+ """
20
+ Fustor Source Driver for S3-compatible object storage.
21
+ Supports snapshot (full listing) and polling-based incremental synchronization.
22
+ """
23
+
24
+ def __init__(self, id: str, config: SourceConfig):
25
+ super().__init__(id, config)
26
+ self.logger = logging.getLogger(f"{__name__}.{id}")
27
+
28
+ # Parse OSS specific parameters from driver_params
29
+ try:
30
+ self.oss_params = OssDriverParams(**config.driver_params)
31
+ except Exception as e:
32
+ raise DriverError(f"OSS configuration invalid: {e}")
33
+
34
+ # Parse credentials
35
+ if not isinstance(config.credential, PasswdCredential):
36
+ raise DriverError("OSS driver requires PasswdCredential (Access Key ID / Secret Access Key).")
37
+
38
+ self.ak = config.credential.user
39
+ self.sk = config.credential.passwd
40
+
41
+ self.bucket_name = self.oss_params.bucket_name
42
+ self.endpoint_url = self.oss_params.endpoint_url
43
+ self.region_name = self.oss_params.region_name
44
+ self.prefix = self.oss_params.prefix
45
+
46
+ # Initialize S3 client
47
+ try:
48
+ self.s3_client = boto3.client(
49
+ "s3",
50
+ endpoint_url=self.endpoint_url,
51
+ aws_access_key_id=self.ak,
52
+ aws_secret_access_key=self.sk,
53
+ region_name=self.region_name,
54
+ )
55
+ self.logger.info(f"Initialized S3 client for bucket '{self.bucket_name}' at '{self.endpoint_url}'")
56
+ except Exception as e:
57
+ raise DriverError(f"Failed to initialize S3 client: {e}")
58
+
59
+ async def get_snapshot_iterator(self, **kwargs) -> AsyncIterator[EventBase]:
60
+ """
61
+ Performs a one-time, full snapshot of the source data by listing all objects
62
+ matching the prefix in the specified S3 bucket.
63
+ """
64
+ self.logger.info(f"Starting snapshot for bucket '{self.bucket_name}' with prefix '{self.prefix}'")
65
+
66
+ paginator = self.s3_client.get_paginator("list_objects_v2")
67
+ page_iterator = paginator.paginate(
68
+ Bucket=self.bucket_name,
69
+ Prefix=self.prefix,
70
+ PaginationConfig={"PageSize": kwargs.get("batch_size", 1000)} # Default batch size
71
+ )
72
+
73
+ for page in page_iterator:
74
+ contents = page.get("Contents", [])
75
+ if not contents:
76
+ continue
77
+
78
+ # Convert to Fustor EventBase batch
79
+ try:
80
+ event_batch = map_s3_objects_to_events_batch(
81
+ contents,
82
+ self.bucket_name,
83
+ "objects", # Logical table name for S3 objects
84
+ EventType.INSERT
85
+ )
86
+ yield event_batch
87
+ except Exception as e:
88
+ self.logger.error(f"Error mapping S3 objects to event batch: {e}", exc_info=True)
89
+ continue
90
+
91
+ self.logger.info(f"Snapshot completed for bucket '{self.bucket_name}' with prefix '{self.prefix}'")
92
+
93
+ def is_position_available(self, position: int) -> bool:
94
+ """
95
+ For OSS, positions are timestamps. We assume any past timestamp is "available"
96
+ as long as the object was not deleted. Polling logic will handle filtering.
97
+ """
98
+ return True # For polling, we can always attempt to resume from a timestamp
99
+
100
+
101
+ async def get_message_iterator(self, start_position: int = -1, **kwargs) -> AsyncIterator[EventBase]:
102
+ """
103
+ Performs incremental data capture (CDC) for OSS.
104
+
105
+ In polling mode, this method periodically lists objects and filters them
106
+ by LastModified timestamp greater than start_position.
107
+ """
108
+ if self.oss_params.queue_type != QueueType.POLLING:
109
+ raise DriverError(f"Unsupported queue_type '{self.oss_params.queue_type}' for get_message_iterator. "
110
+ "Only POLLING is currently implemented.")
111
+
112
+ polling_interval = self.oss_params.polling_queue_config.interval_seconds \
113
+ if self.oss_params.polling_queue_config else 30
114
+
115
+ self.logger.info(f"Starting message polling for bucket '{self.bucket_name}' with prefix '{self.prefix}' "
116
+ f"from position {start_position}, interval: {polling_interval}s")
117
+
118
+ last_known_position = start_position
119
+ while True:
120
+ try:
121
+ # S3 ListObjectsV2 doesn't support filtering by LastModified directly.
122
+ # We must list and then filter client-side.
123
+ # This can be inefficient for very large buckets/prefixes.
124
+ paginator = self.s3_client.get_paginator("list_objects_v2")
125
+ page_iterator = paginator.paginate(
126
+ Bucket=self.bucket_name,
127
+ Prefix=self.prefix,
128
+ )
129
+
130
+ new_or_modified_objects = []
131
+ current_max_timestamp = last_known_position
132
+
133
+ for page in page_iterator:
134
+ contents = page.get("Contents", [])
135
+ for s3_obj in contents:
136
+ last_modified: datetime = s3_obj.get("LastModified")
137
+ if last_modified:
138
+ obj_timestamp = int(last_modified.timestamp())
139
+
140
+ # Filter for objects modified AFTER the last_known_position
141
+ # We use '>=' to potentially include items modified in the same second as the last checkpoint
142
+ # The consumer (Agent) should handle exact duplicates if any.
143
+ if obj_timestamp >= last_known_position:
144
+ new_or_modified_objects.append(s3_obj)
145
+ if obj_timestamp > current_max_timestamp:
146
+ current_max_timestamp = obj_timestamp
147
+
148
+ if new_or_modified_objects:
149
+ self.logger.debug(f"Found {len(new_or_modified_objects)} new/modified objects.")
150
+ # Sort by timestamp to maintain order
151
+ new_or_modified_objects.sort(key=lambda x: x["LastModified"])  # every appended entry has LastModified
152
+
153
+ # Group into batches
154
+ batch_size = kwargs.get("batch_size", 100) # Use the batch_size from config
155
+ for i in range(0, len(new_or_modified_objects), batch_size):
156
+ batch_objects = new_or_modified_objects[i:i + batch_size]
157
+
158
+ try:
159
+ event_batch = map_s3_objects_to_events_batch(
160
+ batch_objects,
161
+ self.bucket_name,
162
+ "objects",
163
+ EventType.UPDATE # Treat as update/insert for changes
164
+ )
165
+ yield event_batch
166
+ last_known_position = int(event_batch.index) # Update position after yielding a batch
167
+ except Exception as e:
168
+ self.logger.error(f"Error mapping S3 objects to event batch during polling: {e}", exc_info=True)
169
+ continue
170
+ else:
171
+ self.logger.debug("No new or modified objects found in this polling cycle.")
172
+
173
+ # Update last_known_position to the highest timestamp seen in this cycle
174
+ # This prevents re-processing objects already processed in this polling interval
175
+ last_known_position = current_max_timestamp
176
+
177
+ except ClientError as e:
178
+ error_code = e.response.get("Error", {}).get("Code")
179
+ if error_code == "NoSuchBucket":
180
+ raise DriverError(f"S3 Bucket '{self.bucket_name}' not found: {e}")
181
+ elif error_code == "AccessDenied":
182
+ raise DriverError(f"Access denied to S3 Bucket '{self.bucket_name}': {e}")
183
+ else:
184
+ raise DriverError(f"S3 client error during polling: {e}")
185
+ except Exception as e:
186
+ self.logger.error(f"Unexpected error during S3 polling: {e}", exc_info=True)
187
+ raise DriverError(f"Unexpected error during S3 polling: {e}")
188
+
189
+ await asyncio.sleep(polling_interval)
190
+
191
+ async def test_connection(self, **kwargs) -> Tuple[bool, str]:
192
+ """
193
+ Tests the connection to the S3 service by performing a head_bucket operation.
194
+ """
195
+ self.logger.info(f"Testing connection to S3 bucket '{self.bucket_name}' at '{self.endpoint_url}'")
196
+ try:
197
+ await asyncio.to_thread(self.s3_client.head_bucket, Bucket=self.bucket_name)
198
+ self.logger.info("S3 connection test successful.")
199
+ return True, "Connection successful."
200
+ except ClientError as e:
201
+ error_code = e.response.get("Error", {}).get("Code")
202
+ error_message = e.response.get("Error", {}).get("Message")
203
+ if error_code == "404": # Bucket not found
204
+ return False, f"Bucket '{self.bucket_name}' not found. Error: {error_message}"
205
+ elif error_code == "403": # Access denied
206
+ return False, f"Access denied to bucket '{self.bucket_name}'. Please check credentials and permissions. Error: {error_message}"
207
+ else:
208
+ return False, f"S3 connection failed with client error: {error_code} - {error_message}"
209
+ except Exception as e:
210
+ self.logger.error(f"S3 connection test failed with unexpected error: {e}", exc_info=True)
211
+ return False, f"Connection failed: {str(e)}"
212
+
213
+ @classmethod
214
+ async def get_available_fields(cls, **kwargs) -> Dict:
215
+ """
216
+ Declares the data fields that this source can provide.
217
+ For S3, these are common S3 object metadata fields.
218
+ """
219
+ return {
220
+ "type": "object",
221
+ "properties": {
222
+ "Key": {"type": "string", "description": "Object key (path)."},
223
+ "Size": {"type": "integer", "description": "Object size in bytes."},
224
+ "LastModified": {"type": "string", "format": "date-time", "description": "Last modified timestamp."},
225
+ "ETag": {"type": "string", "description": "ETag of the object."},
226
+ "StorageClass": {"type": "string", "description": "Storage class of the object."},
227
+ "OwnerId": {"type": "string", "description": "Canonical user ID of the object owner."},
228
+ "OwnerDisplayName": {"type": "string", "description": "Display name of the object owner."},
229
+ },
230
+ "required": ["Key", "Size", "LastModified"]
231
+ }
232
+
233
+ @classmethod
234
+ async def get_wizard_steps(cls) -> Dict[str, Any]:
235
+ """
236
+ Provides configuration wizard steps for UI integration.
237
+ """
238
+ return {
239
+ "steps": [
240
+ {
241
+ "title": "基本连接设置",
242
+ "description": "配置 S3-兼容对象存储服务的连接信息。",
243
+ "fields": [
244
+ {"key": "driver_params.endpoint_url", "label": "Endpoint URL", "type": "text", "required": True, "placeholder": "e.g., https://s3.amazonaws.com or http://minio:9000"},
245
+ {"key": "driver_params.bucket_name", "label": "存储桶名称", "type": "text", "required": True, "placeholder": "e.g., my-fustor-data"},
246
+ {"key": "driver_params.region_name", "label": "区域名称 (可选)", "type": "text", "placeholder": "e.g., us-east-1"},
247
+ {"key": "driver_params.prefix", "label": "路径前缀 (可选)", "type": "text", "placeholder": "e.g., logs/2023/", "help_text": "仅同步以该前缀开头的对象。"},
248
+ {"key": "driver_params.recursive", "label": "递归遍历", "type": "boolean", "default": True, "help_text": "是否递归遍历子目录。"},
249
+ ]
250
+ },
251
+ {
252
+ "title": "认证凭证",
253
+ "description": "提供访问 S3 存储桶的凭证。",
254
+ "fields": [
255
+ {"key": "credential.user", "label": "Access Key ID", "type": "text", "required": True, "placeholder": "您的 S3 Access Key ID"},
256
+ {"key": "credential.passwd", "label": "Secret Access Key", "type": "password", "required": True, "placeholder": "您的 S3 Secret Access Key"},
257
+ ]
258
+ },
259
+ {
260
+ "title": "增量同步策略",
261
+ "description": "选择用于捕获对象变更的策略。'Polling' 适合小规模存储或测试,更推荐使用消息队列。",
262
+ "fields": [
263
+ {"key": "driver_params.queue_type", "label": "同步策略", "type": "select", "required": True, "options": [
264
+ {"value": "polling", "label": "Polling (轮询)"},
265
+ {"value": "sqs", "label": "SQS (AWS SQS / 兼容)"},
266
+ {"value": "amqp", "label": "AMQP (RabbitMQ / 兼容)"},
267
+ ]},
268
+ # Conditional fields for SQS
269
+ {"key": "driver_params.sqs_queue_config.queue_url", "label": "SQS 队列 URL", "type": "text", "required": True, "condition_key": "driver_params.queue_type", "condition_value": "sqs", "placeholder": "e.g., https://sqs.us-east-1.amazonaws.com/..."}
270
+ # ... other SQS/AMQP fields will go here when implemented
271
+ ]
272
+ }
273
+ ]
274
+ }
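A hedged sketch of driving OssSourceDriver from the module above. How the core SourceConfig and PasswdCredential objects are built is not shown in this diff, so the make_source_config helper referenced in the final comment is hypothetical, and the driver id is arbitrary.

```python
import asyncio

from fustor_source_oss.driver import OssSourceDriver

async def run_snapshot(config) -> None:
    # `config` is a fustor_core SourceConfig carrying driver_params and a PasswdCredential.
    driver = OssSourceDriver("oss-demo", config)
    ok, message = await driver.test_connection()
    if not ok:
        raise RuntimeError(message)
    # get_snapshot_iterator is an async generator; each item is an EventBase batch
    # whose rows are S3 object metadata (up to `batch_size` objects per page).
    async for batch in driver.get_snapshot_iterator(batch_size=500):
        print(batch.table, batch.index, len(batch.rows))

# asyncio.run(run_snapshot(make_source_config(...)))  # make_source_config is hypothetical
```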
fustor_source_oss/mapper.py
@@ -0,0 +1,109 @@
1
+ from typing import Dict, Any, List
2
+ from datetime import datetime
3
+ from fustor_event_model.models import EventBase, EventType
4
+
5
+ def map_s3_object_to_event(
6
+ s3_object: Dict[str, Any],
7
+ bucket_name: str,
8
+ table_name: str,
9
+ event_type: EventType = EventType.INSERT
10
+ ) -> EventBase:
11
+ """
12
+ Maps a single S3 object dictionary to a Fustor EventBase.
13
+
14
+ Args:
15
+ s3_object: A dictionary representing an S3 object (from boto3 ListObjectsV2).
16
+ bucket_name: The name of the S3 bucket.
17
+ table_name: The logical table name for the event (e.g., 'objects').
18
+ event_type: The type of event (INSERT, UPDATE, DELETE).
19
+
20
+ Returns:
21
+ An EventBase instance.
22
+ """
23
+
24
+ # Extract relevant metadata from the S3 object
25
+ key = s3_object.get("Key")
26
+ if not key:
27
+ raise ValueError("S3 object must have a 'Key'.")
28
+
29
+ size = s3_object.get("Size")
30
+ last_modified: datetime = s3_object.get("LastModified")
31
+ etag = s3_object.get("ETag")
32
+ storage_class = s3_object.get("StorageClass")
33
+ owner_id = s3_object.get("Owner", {}).get("ID")
34
+ owner_display_name = s3_object.get("Owner", {}).get("DisplayName")
35
+
36
+ # The index will be the Unix timestamp of LastModified for ordering
37
+ # Use -1 if LastModified is not available, though it usually is for objects.
38
+ index = int(last_modified.timestamp()) if last_modified else -1
39
+
40
+ # Prepare the row data
41
+ row_data = {
42
+ "Key": key,
43
+ "Size": size,
44
+ "LastModified": last_modified.isoformat() if last_modified else None,
45
+ "ETag": etag,
46
+ "StorageClass": storage_class,
47
+ "OwnerId": owner_id,
48
+ "OwnerDisplayName": owner_display_name,
49
+ # Add any other relevant S3 object metadata here
50
+ }
51
+
52
+ # Define the fields present in the row
53
+ fields = list(row_data.keys())
54
+
55
+ return EventBase(
56
+ event_type=event_type,
57
+ event_schema=bucket_name, # Use bucket name as the schema
58
+ table=table_name,
59
+ index=index,
60
+ fields=fields,
61
+ rows=[row_data], # Each S3 object becomes one row in an event batch
62
+ )
63
+
64
+ def map_s3_objects_to_events_batch(
65
+ s3_objects: List[Dict[str, Any]],
66
+ bucket_name: str,
67
+ table_name: str,
68
+ event_type: EventType = EventType.INSERT
69
+ ) -> EventBase:
70
+ """
71
+ Maps a list of S3 objects to a single EventBase with multiple rows.
72
+ This is useful for batching inserts/updates from a paginator.
73
+ """
74
+ if not s3_objects:
75
+ raise ValueError("s3_objects list cannot be empty.")
76
+
77
+ # Assume all objects in the batch belong to the same schema and table
78
+ # and have similar structure for fields.
79
+ first_object = s3_objects[0]
80
+
81
+ # The index for a batch event could be the largest LastModified timestamp
82
+ # or the last object's timestamp in the batch. Let's use the last object for now.
83
+ last_modified = s3_objects[-1].get("LastModified")
84
+ index = int(last_modified.timestamp()) if last_modified else -1
85
+
86
+ rows: List[Dict[str, Any]] = []
87
+ for s3_obj in s3_objects:
88
+ row_data = {
89
+ "Key": s3_obj.get("Key"),
90
+ "Size": s3_obj.get("Size"),
91
+ "LastModified": s3_obj.get("LastModified").isoformat() if s3_obj.get("LastModified") else None,
92
+ "ETag": s3_obj.get("ETag"),
93
+ "StorageClass": s3_obj.get("StorageClass"),
94
+ "OwnerId": s3_obj.get("Owner", {}).get("ID"),
95
+ "OwnerDisplayName": s3_obj.get("Owner", {}).get("DisplayName"),
96
+ }
97
+ rows.append(row_data)
98
+
99
+ # All rows in the batch are assumed to have the same fields, derive from first one
100
+ fields = list(rows[0].keys()) if rows else []
101
+
102
+ return EventBase(
103
+ event_type=event_type,
104
+ event_schema=bucket_name,
105
+ table=table_name,
106
+ index=index,
107
+ fields=fields,
108
+ rows=rows,
109
+ )
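A small sketch of calling map_s3_objects_to_events_batch with dicts shaped like boto3 ListObjectsV2 "Contents" entries. The object data is invented, and it assumes EventBase exposes its constructor fields as attributes (the driver above reads .index the same way).

```python
from datetime import datetime, timezone

from fustor_event_model.models import EventType
from fustor_source_oss.mapper import map_s3_objects_to_events_batch

listing = [
    {
        "Key": "logs/2023/app-01.log",
        "Size": 2048,
        "LastModified": datetime(2023, 5, 1, 12, 0, tzinfo=timezone.utc),
        "ETag": '"abc123"',
        "StorageClass": "STANDARD",
        "Owner": {"ID": "owner-id", "DisplayName": "owner"},
    },
]

batch = map_s3_objects_to_events_batch(listing, "my-fustor-data", "objects", EventType.INSERT)
# event_schema is the bucket name, table is the logical table name, index is the
# last object's LastModified as a Unix timestamp, and each object becomes one row.
print(batch.event_schema, batch.table, batch.index)
print(batch.rows[0]["Key"], batch.rows[0]["LastModified"])
```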
fustor_source_oss-0.2.dist-info/METADATA
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: fustor-source-oss
3
+ Version: 0.2
4
+ Summary: Fustor Source Driver for S3-compatible object storage (OSS)
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: boto3<2.0.0,>=1.28.0
7
+ Requires-Dist: fustor-core
8
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
fustor_source_oss-0.2.dist-info/RECORD
@@ -0,0 +1,9 @@
1
+ fustor_source_oss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ fustor_source_oss/config.py,sha256=Ozl_Aj02G66BD4maDkimHHUtr0woGKh3JIxIz3-svb4,3924
3
+ fustor_source_oss/driver.py,sha256=tnpD5tqfnLPWTkCLrUMmgDF5t_AJnLiqSS6JBaJSmVw,14381
4
+ fustor_source_oss/mapper.py,sha256=GUJSU3ylJuVWI5V4j7spx0Tr-7YcAQtAUhzVFF8kdJw,3830
5
+ fustor_source_oss-0.2.dist-info/METADATA,sha256=Dl6aJVRZWlSR24dwPby1VpdmZrfr2y35e1W2hz4ux98,253
6
+ fustor_source_oss-0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ fustor_source_oss-0.2.dist-info/entry_points.txt,sha256=3jSvXrC9TTuba3sFWtAhDVpBSZRxsYyj9nhsYRiqo8c,85
8
+ fustor_source_oss-0.2.dist-info/top_level.txt,sha256=hq80-RUZ_j94qqXmO-jGI50cZrWD143nkbuO_xeWGdg,18
9
+ fustor_source_oss-0.2.dist-info/RECORD,,
fustor_source_oss-0.2.dist-info/WHEEL
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
fustor_source_oss-0.2.dist-info/entry_points.txt
@@ -0,0 +1,2 @@
1
+ [fustor_agent.drivers.sources]
2
+ source_oss = fustor_source_oss.driver:OssSourceDriver
fustor_source_oss-0.2.dist-info/top_level.txt
@@ -0,0 +1 @@
1
+ fustor_source_oss
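The entry_points.txt above registers the driver under the fustor_agent.drivers.sources group. Below is a sketch of how a host process could discover and load it with the standard library; the actual loading mechanism inside fustor_agent is not shown in this diff.

```python
from importlib.metadata import entry_points

for ep in entry_points(group="fustor_agent.drivers.sources"):
    if ep.name == "source_oss":
        driver_cls = ep.load()  # resolves to fustor_source_oss.driver.OssSourceDriver
        print(driver_cls.__name__)
```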