awslabs.s3-tables-mcp-server 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,485 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """AWS S3 Tables MCP Server file processing module.
+
+ This module provides functionality for processing and analyzing uploaded files,
+ particularly focusing on CSV file handling and preview capabilities.
+ """
+
+ import csv
+ import os
+ import pyarrow as pa
+ import re
+ import uuid
+ from .utils import get_s3_client, pyiceberg_load_catalog
+ from datetime import date, datetime, time
+ from decimal import Decimal
+ from io import StringIO
+ from pyiceberg.types import (
+     BinaryType,
+     BooleanType,
+     DateType,
+     DecimalType,
+     DoubleType,
+     FixedType,
+     FloatType,
+     IntegerType,
+     ListType,
+     LongType,
+     MapType,
+     StringType,
+     StructType,
+     TimestampType,
+     TimestamptzType,
+     TimeType,
+     UUIDType,
+ )
+ from typing import Dict, List, Optional
+ from urllib.parse import urlparse
+
+
+ def validate_s3_url(s3_url: str) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:
+     """Validate an S3 URL and extract its components.
+
+     Args:
+         s3_url: The S3 URL to validate (format: s3://bucket-name/key)
+
+     Returns:
+         Tuple containing:
+         - bool: Whether the URL is valid
+         - str: Error message if invalid, None if valid
+         - str: Bucket name if valid, None if invalid
+         - str: Object key if valid, None if invalid
+     """
+     try:
+         parsed = urlparse(s3_url)
+         if parsed.scheme != 's3':
+             return False, f"Invalid URL scheme: {parsed.scheme}. Must be 's3://'", None, None
+
+         if not parsed.netloc:
+             return False, 'Missing bucket name in S3 URL', None, None
+
+         bucket = parsed.netloc
+         key = parsed.path.lstrip('/')
+
+         if not key:
+             return False, 'Missing object key in S3 URL', None, None
+
+         return True, None, bucket, key
+     except Exception as e:
+         return False, f'Error parsing S3 URL: {str(e)}', None, None
+
+
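As an editorial illustration (not part of the packaged module), a minimal sketch of how the tuple returned by validate_s3_url is unpacked by the callers below; the URL is hypothetical:

    is_valid, error_msg, bucket, key = validate_s3_url('s3://my-bucket/data/file.csv')
    if not is_valid:
        print(error_msg)       # e.g. "Invalid URL scheme: https. Must be 's3://'"
    else:
        print(bucket, key)     # "my-bucket" "data/file.csv"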
+ def preview_csv_structure(s3_url: str) -> Dict:
+     """Preview the structure of a CSV file stored in S3 by reading its headers and first row.
+
+     This function provides a quick preview of a CSV file's structure by reading
+     only the headers and first row of data from an S3 location. It's useful for
+     understanding the schema and data format without downloading the entire file.
+
+     Args:
+         s3_url: The S3 URL of the CSV file (format: s3://bucket-name/key)
+
+     Returns:
+         A dictionary containing:
+         - headers: List of column names from the first row
+         - first_row: Dictionary mapping column names to their values from the first data row (empty if no data)
+         - total_columns: Number of columns in the CSV
+         - file_name: Name of the CSV file
+
+         Returns error dictionary with status and error message if:
+         - URL is not a valid S3 URL
+         - File is not a CSV file
+         - File cannot be accessed
+         - Any other error occurs
+     """
+     try:
+         # Validate S3 URL
+         is_valid, error_msg, bucket, key = validate_s3_url(s3_url)
+         if not is_valid:
+             return {'status': 'error', 'error': error_msg}
+
+         # Defensive check so type checkers know bucket and key are non-None strings
+         if bucket is None or key is None:
+             return {'status': 'error', 'error': 'Invalid S3 URL: bucket or key is None'}
+
+         # Check if file has .csv extension
+         if not key.lower().endswith('.csv'):
+             return {
+                 'status': 'error',
+                 'error': f'File {key} is not a CSV file. Only .csv files are supported.',
+             }
+
+         # Get S3 client
+         s3_client = get_s3_client()
+
+         # Get the object from S3, only downloading the first 32KB (should be enough for headers and first row)
+         response = s3_client.get_object(
+             Bucket=bucket,
+             Key=key,
+             Range='bytes=0-32768',  # First 32KB
+         )
+
+         # Read the CSV content
+         csv_content = response['Body'].read().decode('utf-8')
+
+         # Split content into lines
+         lines = csv_content.splitlines()
+         if not lines:
+             return {'status': 'error', 'error': 'File is empty'}
+
+         # Parse the headers
+         headers = next(csv.reader([lines[0]]), [])
+
+         # Try to get first row if it exists
+         first_row = next(csv.reader([lines[1]]), []) if len(lines) > 1 else []
+
+         # Create a dictionary mapping headers to first row values
+         first_row_dict = dict(zip(headers, first_row)) if headers and first_row else {}
+
+         return {
+             'headers': headers,
+             'first_row': first_row_dict,
+             'total_columns': len(headers),
+             'file_name': os.path.basename(key),
+         }
+     except Exception as e:
+         return {'status': 'error', 'error': str(e)}
+
+
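For reference (illustrative only, with a hypothetical bucket and file), a successful preview call returns a plain dictionary shaped like this:

    preview = preview_csv_structure('s3://my-bucket/imports/customers.csv')
    # Roughly:
    # {'headers': ['id', 'name', 'signup_date'],
    #  'first_row': {'id': '1', 'name': 'Alice', 'signup_date': '2024-01-01'},
    #  'total_columns': 3,
    #  'file_name': 'customers.csv'}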
+ def convert_value(value: Optional[str], iceberg_type):
+     """Convert a string value to the appropriate type based on Iceberg schema type.
+
+     Args:
+         value: The string value to convert (can be None)
+         iceberg_type: The Iceberg type to convert to
+
+     Returns:
+         The converted value of the appropriate type
+
+     Raises:
+         ValueError: If the value cannot be converted to the target type
+         NotImplementedError: For unsupported complex types
+     """
+     if value is None or value == '':
+         return None
+
+     if isinstance(iceberg_type, BooleanType):
+         return value.lower() in ('true', '1', 'yes')
+
+     elif isinstance(iceberg_type, IntegerType):
+         return int(value)
+
+     elif isinstance(iceberg_type, LongType):
+         return int(value)
+
+     elif isinstance(iceberg_type, FloatType):
+         return float(value)
+
+     elif isinstance(iceberg_type, DoubleType):
+         return float(value)
+
+     elif isinstance(iceberg_type, DecimalType):
+         return Decimal(value)
+
+     elif isinstance(iceberg_type, DateType):
+         return date.fromisoformat(value)
+
+     elif isinstance(iceberg_type, TimeType):
+         return time.fromisoformat(value)
+
+     elif isinstance(iceberg_type, TimestampType):
+         return datetime.fromisoformat(value)
+
+     elif isinstance(iceberg_type, TimestamptzType):
+         return datetime.fromisoformat(value)  # Ensure it's tz-aware if needed
+
+     elif isinstance(iceberg_type, StringType):
+         return str(value)
+
+     elif isinstance(iceberg_type, UUIDType):
+         return uuid.UUID(value)
+
+     elif isinstance(iceberg_type, BinaryType) or isinstance(iceberg_type, FixedType):
+         return bytes.fromhex(value)
+
+     elif isinstance(iceberg_type, ListType):
+         # naive split for example; you'd want better parsing logic
+         return [convert_value(v.strip(), iceberg_type.element_type) for v in value.split(',')]
+
+     elif isinstance(iceberg_type, MapType):
+         # naive: "key1:value1,key2:value2"
+         return {
+             k.strip(): convert_value(v.strip(), iceberg_type.value_type)
+             for k, v in (item.split(':') for item in value.split(','))
+         }
+
+     elif isinstance(iceberg_type, StructType):
+         raise NotImplementedError('Nested structs need structured input like JSON or dict.')
+
+     else:
+         raise ValueError(f'Unsupported Iceberg type: {iceberg_type}')
+
+
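A few representative conversions, following directly from the branches above (shown here only as an illustration; the calls are not part of the package):

    from pyiceberg.types import DateType, DecimalType, LongType

    convert_value('42', LongType())             # -> 42
    convert_value('19.99', DecimalType(10, 2))  # -> Decimal('19.99')
    convert_value('2024-01-01', DateType())     # -> datetime.date(2024, 1, 1)
    convert_value('', LongType())               # -> None (empty strings become NULL)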
+ def create_pyarrow_schema_from_iceberg(schema) -> pa.Schema:
+     """Create a PyArrow schema from an Iceberg schema, supporting basic types and decimals."""
+
+     def convert_iceberg_type_to_pyarrow(iceberg_type_str: str):
+         """Convert an Iceberg type string to a PyArrow type."""
+         iceberg_type_str = iceberg_type_str.lower()
+
+         if iceberg_type_str == 'boolean':
+             return pa.bool_()
+         elif iceberg_type_str == 'int':
+             return pa.int32()
+         elif iceberg_type_str == 'long':
+             return pa.int64()
+         elif iceberg_type_str == 'float':
+             return pa.float32()
+         elif iceberg_type_str == 'double':
+             return pa.float64()
+         elif iceberg_type_str == 'date':
+             return pa.date32()
+         elif iceberg_type_str == 'time':
+             return pa.time64('us')
+         elif iceberg_type_str == 'timestamp':
+             return pa.timestamp('us')
+         elif iceberg_type_str == 'timestamptz':
+             return pa.timestamp('us', tz='UTC')
+         elif iceberg_type_str == 'string':
+             return pa.string()
+         elif iceberg_type_str == 'uuid':
+             return pa.string()
+         elif iceberg_type_str == 'binary':
+             return pa.binary()
+         elif iceberg_type_str.startswith('fixed'):
+             size_match = re.match(r'fixed\((\d+)\)', iceberg_type_str)
+             return pa.binary(int(size_match.group(1))) if size_match else pa.binary()
+         elif iceberg_type_str.startswith('decimal'):
+             decimal_match = re.match(r'decimal\((\d+),\s*(\d+)\)', iceberg_type_str)
+             if decimal_match:
+                 precision = int(decimal_match.group(1))
+                 scale = int(decimal_match.group(2))
+                 if precision <= 18:
+                     return pa.decimal128(
+                         precision, scale
+                     )  # Will use INT64 encoding for small precision
+                 else:
+                     return pa.decimal256(precision, scale)  # For large precision decimals
+             else:
+                 raise ValueError(f'Invalid decimal type format: {iceberg_type_str}')
+         else:
+             raise ValueError(f'Unsupported Iceberg type: {iceberg_type_str}')
+
+     # Build PyArrow schema
+     pa_fields = []
+     for field in schema.fields:
+         name = field.name
+         iceberg_type_str = str(field.field_type)
+         try:
+             pa_type = convert_iceberg_type_to_pyarrow(iceberg_type_str)
+         except ValueError as e:
+             raise ValueError(f"Error in field '{name}': {e}")
+
+         pa_fields.append(pa.field(name, pa_type, nullable=not field.required))
+
+     return pa.schema(pa_fields)
+
+
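A minimal sketch of the schema mapping, assuming a small hand-built Iceberg schema (the field names and the printed output shown in the comment are illustrative):

    from pyiceberg.schema import Schema
    from pyiceberg.types import DecimalType, LongType, NestedField, StringType

    iceberg_schema = Schema(
        NestedField(field_id=1, name='id', field_type=LongType(), required=True),
        NestedField(field_id=2, name='name', field_type=StringType(), required=False),
        NestedField(field_id=3, name='price', field_type=DecimalType(10, 2), required=False),
    )
    print(create_pyarrow_schema_from_iceberg(iceberg_schema))
    # Roughly: id: int64 not null, name: string, price: decimal128(10, 2)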
+ def process_chunk(chunk: List[Dict], table, chunk_name: str = 'Chunk') -> Dict:
+     """Process a chunk of data by converting it to a PyArrow table and appending to the table.
+
+     Args:
+         chunk: List of dictionaries representing the data rows
+         table: The Iceberg table to append data to
+         chunk_name: Name identifier for the chunk (for logging purposes)
+
+     Returns:
+         Dictionary with status and message
+     """
+     try:
+         # Get the Iceberg schema and create PyArrow schema
+         schema = table.schema()
+         pyarrow_schema = create_pyarrow_schema_from_iceberg(schema)
+
+         # Convert list of dictionaries to PyArrow table with proper schema
+         table_data = pa.Table.from_pylist(chunk, schema=pyarrow_schema)
+
+         table.append(table_data)
+
+         return {
+             'status': 'success',
+             'message': f'Successfully processed {len(chunk)} rows in {chunk_name.lower()}',
+         }
+
+     except Exception as e:
+         return {'status': 'error', 'error': f'Error inserting {chunk_name.lower()}: {str(e)}'}
+
+
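Illustrative usage only (not part of the package): process_chunk expects an already-loaded PyIceberg table, so the table, namespace, and row values below are hypothetical:

    # `table` is assumed to be a PyIceberg table, e.g. catalog.load_table('sales.orders')
    rows = [{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}]
    result = process_chunk(rows, table, 'Chunk')
    # -> {'status': 'success', 'message': 'Successfully processed 2 rows in chunk'}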
+ async def import_csv_to_table(
+     warehouse: str,
+     region: str,
+     namespace: str,
+     table_name: str,
+     s3_url: str,
+     uri: str = 'https://s3tables.us-west-2.amazonaws.com/iceberg',
+     catalog_name: str = 's3tablescatalog',
+     rest_signing_name: str = 's3tables',
+     rest_sigv4_enabled: str = 'true',
+ ) -> Dict:
+     """Import data from a CSV file into an S3 table.
+
+     This function reads data from a CSV file stored in S3 and imports it into an existing S3 table.
+     The CSV file must have headers that match the table's schema. The function will validate the CSV structure
+     before attempting to import the data.
+
+     Args:
+         warehouse: Warehouse string for Iceberg catalog
+         region: AWS region for S3Tables/Iceberg REST endpoint
+         namespace: The namespace containing the table
+         table_name: The name of the table to import data into
+         s3_url: The S3 URL of the CSV file (format: s3://bucket-name/key)
+         uri: REST URI for Iceberg catalog
+         catalog_name: Catalog name
+         rest_signing_name: REST signing name
+         rest_sigv4_enabled: Enable SigV4 signing
+
+     Returns:
+         A dictionary containing:
+         - status: 'success' or 'error'
+         - message: Success message or error details
+         - rows_processed: Number of rows processed (on success)
+         - file_processed: Name of the processed file
+         - csv_headers: List of CSV headers
+
+         Returns error dictionary with status and error message if:
+         - URL is not a valid S3 URL
+         - File is not a CSV file
+         - File cannot be accessed
+         - Table does not exist
+         - CSV headers don't match table schema
+         - Any other error occurs
+     """
+     # Validate S3 URL
+     is_valid, error_msg, bucket, key = validate_s3_url(s3_url)
+     if not is_valid:
+         return {'status': 'error', 'error': error_msg}
+
+     if bucket is None or key is None:
+         return {'status': 'error', 'error': 'Invalid S3 URL: bucket or key is None'}
+
+     if not key.lower().endswith('.csv'):
+         return {
+             'status': 'error',
+             'error': f'File {key} is not a CSV file. Only .csv files are supported.',
+         }
+
+     try:
+         # Load catalog using provided parameters (see pyiceberg.py style)
+         catalog = pyiceberg_load_catalog(
+             catalog_name,
+             warehouse,
+             uri,
+             region,
+             rest_signing_name,
+             rest_sigv4_enabled,
+         )
+
+         # Load existing table
+         table = catalog.load_table(f'{namespace}.{table_name}')
+
+         # Get schema information
+         schema = table.schema()
+
+         # Get S3 client
+         s3_client = get_s3_client()
+
+         # Get the CSV file from S3
+         response = s3_client.get_object(Bucket=bucket, Key=key)
+         csv_content = response['Body'].read().decode('utf-8')
+
+         # Read CSV content
+         csv_reader = csv.DictReader(StringIO(csv_content))
+
+         # Validate headers against schema
+         csv_headers = csv_reader.fieldnames
+         schema_field_names = {field.name for field in schema.fields}
+
+         if not csv_headers:
+             return {'status': 'error', 'error': 'CSV file has no headers'}
+
+         missing_columns = schema_field_names - set(csv_headers)
+         if missing_columns:
+             return {
+                 'status': 'error',
+                 'error': f'CSV is missing required columns: {", ".join(missing_columns)}',
+             }
+
+         # Process rows in chunks
+         chunk_size = 5000
+         rows_processed = 0
+         current_chunk = []
+
+         for row in csv_reader:
+             # Transform row data according to schema types
+             transformed_row = {}
+             for field in schema.fields:
+                 value = row.get(field.name)
+
+                 # Handle required fields
+                 if field.required and (value is None or value == ''):
+                     return {
+                         'status': 'error',
+                         'error': f'Required field {field.name} is missing or empty in row {rows_processed + 1}',
+                     }
+
+                 # Transform value based on field type
+                 try:
+                     if value is None or value == '':
+                         transformed_row[field.name] = None
+                     else:
+                         transformed_row[field.name] = convert_value(value, field.field_type)
+                 except (ValueError, TypeError) as e:
+                     return {
+                         'status': 'error',
+                         'error': f'Error converting value for field {field.name} in row {rows_processed + 1}: {str(e)}',
+                     }
+
+             current_chunk.append(transformed_row)
+             rows_processed += 1
+
+             # Process chunk when it reaches the chunk size
+             if len(current_chunk) >= chunk_size:
+                 result = process_chunk(current_chunk, table, 'Chunk')
+                 if result['status'] == 'error':
+                     return result
+                 current_chunk = []
+
+         # Process any remaining rows
+         if current_chunk:
+             result = process_chunk(current_chunk, table, 'Final Chunk')
+             if result['status'] == 'error':
+                 return result
+
+         return {
+             'status': 'success',
+             'message': f'Successfully processed {rows_processed} rows',
+             'rows_processed': rows_processed,
+             'file_processed': os.path.basename(key),
+             'csv_headers': csv_headers,
+         }
+
+     except Exception as e:
+         return {'status': 'error', 'error': str(e)}
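
Since import_csv_to_table is a coroutine, a caller would drive it with an event loop. A minimal sketch (illustrative only; the warehouse ARN, account ID, namespace, table, and bucket names are hypothetical):

    import asyncio

    result = asyncio.run(
        import_csv_to_table(
            warehouse='arn:aws:s3tables:us-west-2:111122223333:bucket/my-table-bucket',
            region='us-west-2',
            namespace='sales',
            table_name='orders',
            s3_url='s3://my-bucket/imports/orders.csv',
        )
    )
    print(result.get('status'), result.get('rows_processed'))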