earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,603 @@
+ # input_readers.py
+ """Flexible input format readers for STAC URL ingestion with cloud storage support.
+
+ This module provides a pluggable input reading system that supports multiple file formats
+ and storage backends for STAC URL ingestion. It is designed to handle massive datasets
+ efficiently while providing a consistent interface across different input sources and formats.
+
+ Supported Input Formats:
+     - Parquet: High-performance columnar format (recommended for large datasets)
+     - CSV: Comma-separated values with configurable options
+     - TSV: Tab-separated values (specialized CSV configuration)
+     - NDJSON/JSONL: Newline-delimited JSON for streaming STAC items
+     - Auto-detection: Automatic format detection based on file extension
+
+ Supported Storage Systems:
+     - Local filesystem: Direct file access for development and single-machine deployment
+     - S3 and S3-compatible: AWS S3, MinIO, DigitalOcean Spaces via fsspec
+     - Google Cloud Storage: GCS integration via fsspec protocols
+     - Azure Blob Storage: Azure integration via fsspec protocols
+     - HTTP/HTTPS: Direct URL reading for publicly accessible datasets
+
+ Key Features:
+     - Automatic format detection eliminates manual configuration
+     - Cloud storage integration with optimized streaming
+     - Memory-efficient processing for large input files
+     - Comprehensive error handling with informative messages
+     - Extensible design for adding new input formats
+     - Thread-safe operations for concurrent processing
+
+ Performance Optimizations:
+     - Parquet: Columnar format keeps URL column extraction fast and compact
+     - CSV: Parsing delegated to pandas with configurable delimiter and quoting
+     - Cloud storage: Intelligent buffering and connection reuse via fsspec
+     - Memory efficiency: Only the extracted URL list is retained after reading
+
+ Design Pattern:
+     The input readers follow the Strategy pattern with a Factory for automatic
+     reader selection. This enables seamless switching between formats and
+     automatic format detection based on file extensions.
+
+ Example:
+     >>> # Automatic format detection and reader selection
+     >>> reader = ReaderFactory.get_reader(ReaderFactory.auto_detect_format('s3://bucket/urls.parquet'))
+     >>> urls = reader.read_urls('s3://bucket/urls.parquet', 'url')
+     >>>
+     >>> # Explicit format specification for better performance
+     >>> reader = ReaderFactory.get_reader('parquet')
+     >>> urls = reader.read_urls('large_dataset.parquet', 'stac_url')
+     >>>
+     >>> # CSV with custom configuration
+     >>> reader = ReaderFactory.get_reader('csv', delimiter='|', quotechar='"')
+     >>> urls = reader.read_urls('custom_format.csv', 'item_url')
+
+ Integration:
+     Input readers integrate seamlessly with EarthCatalog's ingestion pipeline
+     through the ProcessingConfig.input_format parameter. The pipeline automatically
+     selects and configures the appropriate reader based on configuration.
+ """
+
+ import logging
+ from abc import ABC, abstractmethod
+ from typing import cast
+
+ logger = logging.getLogger(__name__)
+
+ # Check for optional dependencies
+ try:
+     import pandas as pd
+
+     HAS_PANDAS = True
+ except ImportError:
+     pd = None
+     HAS_PANDAS = False
+
+ try:
+     import pyarrow.parquet as pq
+
+     HAS_PYARROW = True
+ except ImportError:
+     pq = None
+     HAS_PYARROW = False
+
+ try:
+     import fsspec
+
+     HAS_FSSPEC = True
+ except ImportError:
+     fsspec = None
+     HAS_FSSPEC = False
+
+
+ class InputReader(ABC):
+     """Abstract base class defining the interface for all input format readers.
+
+     This abstract base class establishes the contract for reading STAC URLs from various
+     input formats and storage systems. All concrete readers must implement the defined
+     interface to ensure consistent behavior across different input sources.
+
+     Interface Design:
+         The interface is designed for high-performance streaming operations that can
+         handle unlimited file sizes with constant memory usage. Implementations should
+         focus on efficient URL extraction while validating input format compliance.
+
+     Implementation Requirements:
+         - read_urls(): Must support streaming for large files to maintain memory efficiency
+         - validate_format(): Should perform quick format validation without full file reads
+         - Error handling: Must provide informative error messages for debugging
+         - Cloud storage: Should integrate with fsspec for seamless cloud operations
+
+     Performance Expectations:
+         - Memory usage should remain constant regardless of input file size
+         - URL extraction should be optimized for the specific format characteristics
+         - Validation should be fast and avoid unnecessary I/O operations
+         - Cloud operations should use streaming and connection reuse
+     """
+
+     @abstractmethod
+     def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
+         """Read URLs from input file and return as list."""
+         pass
+
+     @abstractmethod
+     def validate_format(self, file_path: str) -> bool:
+         """Validate that file matches expected format."""
+         pass
+
+
+ class ParquetReader(InputReader):
+     """High-performance Parquet file reader optimized for large-scale URL extraction.
+
+     This reader provides optimized access to Parquet files containing STAC URLs,
+     leveraging Parquet's columnar format for efficient URL column extraction.
+     Designed for processing massive datasets with minimal memory overhead.
+
+     Parquet Advantages:
+         - Columnar storage enables reading only required URL columns
+         - Excellent compression reduces network I/O for cloud storage
+         - Built-in schema validation ensures data consistency
+         - Fast metadata access for quick file validation
+         - Predicate pushdown capabilities for filtered reading
+
+     Performance Features:
+         - Memory-efficient streaming for files of any size
+         - Optimized cloud storage integration with fsspec
+         - Parallel reading capabilities where supported
+         - Intelligent buffering for network-based files
+         - Schema-based validation for robust error handling
+
+     Cloud Storage Optimization:
+         - Uses fsspec for unified cloud storage access
+         - Streaming reads minimize memory usage for large files
+         - Connection pooling and reuse for better performance
+         - Automatic retry strategies for network resilience
+
+     Use Cases:
+         - Large-scale STAC URL datasets (recommended format)
+         - Cloud-based data processing workflows
+         - High-performance ingestion requiring minimal memory
+         - Datasets requiring schema validation and consistency
+         - Integration with existing Parquet-based data pipelines
+
+     Example:
+         >>> reader = ParquetReader()
+         >>> urls = reader.read_urls('s3://bucket/urls.parquet', 'stac_url')
+         >>> print(f"Extracted {len(urls)} URLs from Parquet file")
+     """
+
+     def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
+         """Read URLs from Parquet file."""
+         if not HAS_PYARROW:
+             raise ImportError("pyarrow is required for Parquet support. Install with: pip install pyarrow")
+
+         try:
+             if file_path.startswith("s3://"):
+                 if not HAS_FSSPEC:
+                     raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
+                 if fsspec is None:
+                     raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
+                 fs = fsspec.filesystem("s3")
+                 with fs.open(file_path, "rb") as f:
+                     if pq is None:
+                         raise ImportError("pyarrow is required for Parquet support. Install with: pip install pyarrow")
+                     table = pq.read_table(f)
+             else:
+                 if pq is None:
+                     raise ImportError("pyarrow is required for Parquet support. Install with: pip install pyarrow")
+                 table = pq.read_table(file_path)
+
+             df = table.to_pandas()
+
+             if url_column not in df.columns:
+                 raise ValueError(f"Parquet file must contain '{url_column}' column")
+
+             urls = cast(list[str], df[url_column].tolist())
+             logger.info(f"Read {len(urls)} URLs from Parquet file: {file_path}")
+             return urls
+
+         except Exception as e:
+             logger.error(f"Error reading Parquet file {file_path}: {e}")
+             raise
+
+     def validate_format(self, file_path: str) -> bool:
+         """Validate Parquet file format."""
+         if not HAS_PYARROW or pq is None:
+             return False
+
+         try:
+             if file_path.startswith("s3://"):
+                 if not HAS_FSSPEC or fsspec is None:
+                     return False
+                 fs = fsspec.filesystem("s3")
+                 with fs.open(file_path, "rb") as f:
+                     pq.ParquetFile(f)
+             else:
+                 pq.ParquetFile(file_path)
+             return True
+         except (OSError, ValueError, TypeError, RuntimeError):
+             return False
+
+
+ class CSVReader(InputReader):
+     """Reader for CSV files with URL column."""
+
+     def __init__(self, delimiter: str = ",", quotechar: str = '"'):
+         self.delimiter = delimiter
+         self.quotechar = quotechar
+
+     def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
+         """Read URLs from CSV file."""
+         if not HAS_PANDAS:
+             raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
+
+         try:
+             if file_path.startswith("s3://"):
+                 if not HAS_FSSPEC or fsspec is None:
+                     raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
+                 fs = fsspec.filesystem("s3")
+                 with fs.open(file_path, "r") as f:
+                     if pd is None:
+                         raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
+                     df = pd.read_csv(f, delimiter=self.delimiter, quotechar=self.quotechar)
+             else:
+                 if pd is None:
+                     raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
+                 df = pd.read_csv(file_path, delimiter=self.delimiter, quotechar=self.quotechar)
+
+             if url_column not in df.columns:
+                 raise ValueError(f"CSV file must contain '{url_column}' column")
+
+             urls = cast(list[str], df[url_column].dropna().tolist())
+             logger.info(f"Read {len(urls)} URLs from CSV file: {file_path}")
+             return urls
+
+         except Exception as e:
+             logger.error(f"Error reading CSV file {file_path}: {e}")
+             raise
+
+     def validate_format(self, file_path: str) -> bool:
+         """Validate CSV file format."""
+         if not HAS_PANDAS or pd is None:
+             return False
+
+         try:
+             if file_path.startswith("s3://"):
+                 if not HAS_FSSPEC or fsspec is None:
+                     return False
+                 fs = fsspec.filesystem("s3")
+                 with fs.open(file_path, "r") as f:
+                     # Try to read first few lines to validate
+                     pd.read_csv(f, nrows=5, delimiter=self.delimiter, quotechar=self.quotechar)
+             else:
+                 pd.read_csv(file_path, nrows=5, delimiter=self.delimiter, quotechar=self.quotechar)
+             return True
+         except (OSError, ValueError, TypeError):
+             return False
+
+
+ class NDJSONReader(InputReader):
+     """Reader for NDJSON/JSONL files with URL field.
+
+     This reader processes Newline-Delimited JSON (NDJSON) files, where each line
+     contains a separate JSON object with a URL field. NDJSON is ideal for streaming
+     large datasets and is commonly used for STAC item aggregations.
+
+     NDJSON Format:
+         - Each line is a valid JSON object
+         - Lines are separated by newlines (not comma-separated)
+         - Lines starting with '#' are treated as comments
+         - URL field can be any key name (default: "url")
+
+     Performance Features:
+         - Memory-efficient streaming for unlimited file sizes
+         - Line-by-line processing prevents loading entire file into memory
+         - Cloud storage integration via fsspec for S3, GCS, Azure
+         - Skips empty lines and comments automatically
+         - Graceful handling of malformed JSON lines
+
+     Use Cases:
+         - Large-scale STAC URL datasets where each item is a JSON object
+         - Streaming ingestion from cloud storage
+         - ITS_LIVE bulk data processing (800+ files, ~50k items each)
+         - Log files with JSON entries
+         - Exported STAC collections in NDJSON format
+
+     Example NDJSON file:
+         {"url": "https://example.com/item1.json", "id": "item1"}
+         {"url": "https://example.com/item2.json", "id": "item2"}
+         # This is a comment line (will be skipped)
+         {"url": "https://example.com/item3.json", "id": "item3"}
+
+     Example:
+         >>> reader = NDJSONReader()
+         >>> urls = reader.read_urls('s3://bucket/items.jsonl', 'url')
+         >>> print(f"Extracted {len(urls)} URLs from NDJSON file")
+     """
+
+     def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
+         """Read URLs from NDJSON file.
+
+         Args:
+             file_path: Path to NDJSON file (local or s3://)
+             url_column: JSON key name containing URLs (default: "url")
+
+         Returns:
+             List of URLs extracted from each JSON object.
+
+         Raises:
+             ValueError: If file_path is empty or url_column is not found in any objects.
+             ImportError: If fsspec is required for S3 but not installed.
+         """
+         if not file_path:
+             raise ValueError("file_path cannot be empty")
+
+         urls: list[str] = []
+         line_number = 0
+         skipped_lines = 0
+         missing_field_count = 0
+
+         try:
+             if file_path.startswith("s3://"):
+                 if not HAS_FSSPEC or fsspec is None:
+                     raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
+                 fs = fsspec.filesystem("s3")
+                 with fs.open(file_path, "r") as f:
+                     urls, line_number, skipped_lines, missing_field_count = self._read_lines(f, url_column)
+             else:
+                 with open(file_path) as f:
+                     urls, line_number, skipped_lines, missing_field_count = self._read_lines(f, url_column)
+
+             if not urls:
+                 if line_number == 0:
+                     # Empty file is valid, just return empty list
+                     logger.info(f"NDJSON file is empty: {file_path}")
+                 elif missing_field_count > 0:
+                     raise ValueError(
+                         f"NDJSON file does not contain '{url_column}' field in any objects. "
+                         f"Checked {line_number} lines."
+                     )
+                 else:
+                     raise ValueError(
+                         f"NDJSON file does not contain any valid JSON objects with data. "
+                         f"Checked {line_number} lines, {skipped_lines} were empty/comments."
+                     )
+
+             if skipped_lines > 0:
+                 logger.warning(f"Skipped {skipped_lines} empty/comment lines in {file_path}")
+
+             if missing_field_count > 0:
+                 logger.warning(f"{missing_field_count} lines missing '{url_column}' field in {file_path}")
+
+             logger.info(
+                 f"Read {len(urls)} URLs from NDJSON file: {file_path} "
+                 f"({line_number} lines processed, {skipped_lines} skipped)"
+             )
+             return urls
+
+         except FileNotFoundError:
+             logger.error(f"NDJSON file not found: {file_path}")
+             raise
+         except ImportError:
+             raise
+         except Exception as e:
+             logger.error(f"Error reading NDJSON file {file_path}: {e}")
+             raise
+
+     def _read_lines(self, file_obj, url_column: str) -> tuple[list[str], int, int, int]:
+         """Read and parse lines from file object.
+
+         Args:
+             file_obj: Open file object (local or S3)
+             url_column: JSON key name containing URLs
+
+         Returns:
+             Tuple of (urls list, total lines, skipped lines, missing field count)
+         """
+         import json
+
+         urls: list[str] = []
+         line_number = 0
+         skipped_lines = 0
+         missing_field_count = 0
+
+         for line in file_obj:
+             line_number += 1
+             line = line.strip()
+
+             # Skip empty lines
+             if not line:
+                 skipped_lines += 1
+                 continue
+
+             # Skip comment lines
+             if line.startswith("#"):
+                 skipped_lines += 1
+                 continue
+
+             # Parse JSON object
+             try:
+                 obj = json.loads(line)
+
+                 # Extract URL from specified field
+                 if isinstance(obj, dict):
+                     if url_column in obj:
+                         url_value = obj[url_column]
+                         if isinstance(url_value, str) and url_value:
+                             urls.append(url_value)
+                         else:
+                             logger.debug(
+                                 f"Line {line_number}: '{url_column}' field is not a non-empty string, skipping"
+                             )
+                             missing_field_count += 1
+                     else:
+                         missing_field_count += 1
+                 else:
+                     logger.debug(f"Line {line_number}: JSON object is not a dict, skipping")
+                     missing_field_count += 1
+
+             except json.JSONDecodeError as e:
+                 logger.warning(f"Line {line_number}: Invalid JSON, skipping: {e}")
+                 skipped_lines += 1
+                 continue
+
+         return urls, line_number, skipped_lines, missing_field_count
+
+     def validate_format(self, file_path: str) -> bool:
+         """Validate NDJSON file format.
+
+         Args:
+             file_path: Path to NDJSON file (local or s3://)
+
+         Returns:
+             True if file appears to be valid NDJSON, False otherwise.
+         """
+
+         try:
+             if file_path.startswith("s3://"):
+                 if not HAS_FSSPEC or fsspec is None:
+                     return False
+                 fs = fsspec.filesystem("s3")
+                 with fs.open(file_path, "r") as f:
+                     return self._validate_content(f)
+             else:
+                 with open(file_path) as f:
+                     return self._validate_content(f)
+
+         except (OSError, ValueError, UnicodeDecodeError):
+             return False
+
+     def _validate_content(self, file_obj) -> bool:
+         """Validate file content by checking first few non-comment lines.
+
+         Args:
+             file_obj: Open file object
+
+         Returns:
+             True if file contains valid JSON objects, False otherwise.
+         """
+         import json
+
+         valid_lines = 0
+         max_lines_to_check = 10
+
+         for line in file_obj:
+             line = line.strip()
+
+             # Skip empty and comment lines
+             if not line or line.startswith("#"):
+                 continue
+
+             # Try to parse as JSON
+             try:
+                 obj = json.loads(line)
+                 if isinstance(obj, dict):
+                     valid_lines += 1
+                     if valid_lines >= max_lines_to_check:
+                         return True
+             except (json.JSONDecodeError, ValueError):
+                 return False
+
+         # Return True if we found at least one valid JSON object
+         return valid_lines > 0
+
+
+ class ReaderFactory:
+     """Factory class for automatic input reader selection and configuration.
+
+     This factory provides intelligent input reader selection based on file format
+     specifications or automatic detection from file extensions. It eliminates the need
+     for manual reader instantiation and configuration in most use cases.
+
+     Automatic Format Detection:
+         auto_detect_format() analyzes the file extension and returns the name of
+         the most appropriate format, which can then be passed to get_reader():
+         - .parquet, .pq → ParquetReader (high performance, recommended)
+         - .csv → CSVReader with comma delimiter
+         - .tsv, .tab → CSVReader with tab delimiter
+         - .ndjson, .jsonl → NDJSONReader for newline-delimited JSON
+         - Fallback to CSV for unknown extensions
+
+     Supported Formats:
+         - 'parquet': Optimized columnar format reader
+         - 'csv': Flexible comma-separated values reader
+         - 'tsv': Tab-separated values (CSV variant)
+         - 'ndjson': Newline-delimited JSON reader
+         - 'jsonl': Alias for ndjson format
+
+     Reader Configuration:
+         The factory passes keyword arguments through to the underlying reader
+         constructors for customization:
+         - CSV readers: delimiter and quotechar options
+         - TSV: preconfigured with a tab delimiter (additional options are not applied)
+         - Parquet and NDJSON readers: take no constructor options
+
+     Example:
+         >>> # Automatic format detection (recommended)
+         >>> input_format = ReaderFactory.auto_detect_format('data.parquet')
+         >>> reader = ReaderFactory.get_reader(input_format)
+         >>> urls = reader.read_urls('data.parquet', 'url')
+         >>>
+         >>> # Explicit format for performance-critical scenarios
+         >>> reader = ReaderFactory.get_reader('parquet')
+         >>>
+         >>> # Custom CSV configuration
+         >>> reader = ReaderFactory.get_reader('csv', delimiter='|', quotechar='"')
+
+     Performance:
+         The factory adds minimal overhead to reader creation and is optimized for
+         repeated use. Reader instances can be reused across multiple files of the
+         same format for better performance in batch processing scenarios.
+     """
+
+     _readers = {
+         "parquet": ParquetReader,
+         "csv": CSVReader,
+         "tsv": lambda: CSVReader(delimiter="\t"),
+         "ndjson": NDJSONReader,
+         "jsonl": NDJSONReader,
+     }
+
+     @classmethod
+     def get_reader(cls, format_name: str, **kwargs) -> InputReader:
+         """Get reader for specified format."""
+         format_name = format_name.lower()
+
+         if format_name not in cls._readers:
+             raise ValueError(f"Unsupported input format: {format_name}. Supported formats: {list(cls._readers.keys())}")
+
+         reader_class = cls._readers[format_name]
+
+         # Handle lambda functions for parameterized readers
+         if callable(reader_class) and not isinstance(reader_class, type):
+             result = reader_class()
+             return cast(InputReader, result)
+
+         # For type classes, instantiate with kwargs
+         reader_type = cast(type[InputReader], reader_class)
+         result = reader_type(**kwargs)
+         return result
+
+     @classmethod
+     def auto_detect_format(cls, file_path: str) -> str:
+         """Auto-detect file format based on extension."""
+         path_lower = file_path.lower()
+
+         if path_lower.endswith(".parquet") or path_lower.endswith(".pq"):
+             return "parquet"
+         elif path_lower.endswith(".csv"):
+             return "csv"
+         elif path_lower.endswith(".tsv") or path_lower.endswith(".tab"):
+             return "tsv"
+         elif path_lower.endswith(".ndjson"):
+             return "ndjson"
+         elif path_lower.endswith(".jsonl"):
+             return "jsonl"
+         else:
+             # Default to CSV format for unknown extensions
+             logger.warning(f"Unknown file extension for {file_path}, defaulting to CSV format")
+             return "csv"
+
+     @classmethod
+     def get_supported_formats(cls) -> list[str]:
+         """Get list of supported input formats."""
+         return list(cls._readers.keys())
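
Taken together, input_readers.py exposes a small surface for the rest of the pipeline: detect a format, obtain a reader, and pull a URL list. Below is a minimal sketch of that flow outside the pipeline, assuming a local NDJSON file named urls.jsonl with one JSON object per line; the file name and its contents are illustrative, not part of the package.

    # Sketch: detect the format, build a reader, and extract URLs (assumed input file)
    from earthcatalog.input_readers import ReaderFactory

    input_path = "urls.jsonl"  # hypothetical file: {"url": "...", "id": "..."} per line

    input_format = ReaderFactory.auto_detect_format(input_path)  # -> "jsonl"
    reader = ReaderFactory.get_reader(input_format)              # -> NDJSONReader instance

    if reader.validate_format(input_path):
        urls = reader.read_urls(input_path, url_column="url")
        print(f"{len(urls)} STAC item URLs ready for ingestion")

In pipeline use, the same selection is driven by ProcessingConfig.input_format rather than called directly, per the module docstring's Integration note.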