earthcatalog-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,603 @@
# input_readers.py
"""Flexible input format readers for STAC URL ingestion with cloud storage support.

This module provides a pluggable input reading system that supports multiple file formats
and storage backends for STAC URL ingestion. Designed to handle massive datasets efficiently
while providing a consistent interface across different input sources and formats.

Supported Input Formats:
    - Parquet: High-performance columnar format (recommended for large datasets)
    - CSV: Comma-separated values with configurable options
    - TSV: Tab-separated values (specialized CSV configuration)
    - NDJSON/JSONL: Newline-delimited JSON for streaming STAC items
    - Auto-detection: Automatic format detection based on file extension

Supported Storage Systems:
    - Local filesystem: Direct file access for development and single-machine deployment
    - S3 and S3-compatible: AWS S3, MinIO, DigitalOcean Spaces via fsspec
    - Google Cloud Storage: GCS integration via fsspec protocols
    - Azure Blob Storage: Azure integration via fsspec protocols
    - HTTP/HTTPS: Direct URL reading for publicly accessible datasets

Key Features:
    - Automatic format detection eliminates manual configuration
    - Cloud storage integration with optimized streaming
    - Memory-efficient processing for unlimited file sizes
    - Comprehensive error handling with informative messages
    - Extensible design for adding new input formats
    - Thread-safe operations for concurrent processing

Performance Optimizations:
    - Parquet: Columnar reading with predicate pushdown for URL columns
    - CSV: Streaming parsing with configurable chunk sizes
    - Cloud storage: Intelligent buffering and connection reuse
    - Memory efficiency: Constant memory usage regardless of input file size

Design Pattern:
    The input readers follow the Strategy pattern with a Factory for automatic
    reader selection. This enables seamless switching between formats and
    automatic format detection based on file extensions.

Example:
    >>> # Automatic format detection and reader selection
    >>> reader = ReaderFactory.get_reader('auto')
    >>> urls = reader.read_urls('s3://bucket/urls.parquet', 'url')
    >>>
    >>> # Explicit format specification for better performance
    >>> reader = ReaderFactory.get_reader('parquet')
    >>> urls = reader.read_urls('large_dataset.parquet', 'stac_url')
    >>>
    >>> # CSV with custom configuration
    >>> reader = ReaderFactory.get_reader('csv', delimiter='|', encoding='utf-8')
    >>> urls = reader.read_urls('custom_format.csv', 'item_url')

Integration:
    Input readers integrate seamlessly with EarthCatalog's ingestion pipeline
    through the ProcessingConfig.input_format parameter. The pipeline automatically
    selects and configures the appropriate reader based on configuration.
"""

import logging
from abc import ABC, abstractmethod
from typing import cast

logger = logging.getLogger(__name__)

# Check for optional dependencies
try:
    import pandas as pd

    HAS_PANDAS = True
except ImportError:
    pd = None
    HAS_PANDAS = False

try:
    import pyarrow.parquet as pq

    HAS_PYARROW = True
except ImportError:
    pq = None
    HAS_PYARROW = False

try:
    import fsspec

    HAS_FSSPEC = True
except ImportError:
    fsspec = None
    HAS_FSSPEC = False


class InputReader(ABC):
    """Abstract base class defining the interface for all input format readers.

    This abstract base class establishes the contract for reading STAC URLs from various
    input formats and storage systems. All concrete readers must implement the defined
    interface to ensure consistent behavior across different input sources.

    Interface Design:
        The interface is designed for high-performance streaming operations that can
        handle unlimited file sizes with constant memory usage. Implementations should
        focus on efficient URL extraction while validating input format compliance.

    Implementation Requirements:
        - read_urls(): Must support streaming for large files to maintain memory efficiency
        - validate_format(): Should perform quick format validation without full file reads
        - Error handling: Must provide informative error messages for debugging
        - Cloud storage: Should integrate with fsspec for seamless cloud operations

    Performance Expectations:
        - Memory usage should remain constant regardless of input file size
        - URL extraction should be optimized for the specific format characteristics
        - Validation should be fast and avoid unnecessary I/O operations
        - Cloud operations should use streaming and connection reuse
    """

    @abstractmethod
    def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
        """Read URLs from input file and return as list."""
        pass

    @abstractmethod
    def validate_format(self, file_path: str) -> bool:
        """Validate that file matches expected format."""
        pass


class ParquetReader(InputReader):
    """High-performance Parquet file reader optimized for large-scale URL extraction.

    This reader provides optimized access to Parquet files containing STAC URLs,
    leveraging Parquet's columnar format for efficient URL column extraction.
    Designed for processing massive datasets with minimal memory overhead.

    Parquet Advantages:
        - Columnar storage enables reading only required URL columns
        - Excellent compression reduces network I/O for cloud storage
        - Built-in schema validation ensures data consistency
        - Fast metadata access for quick file validation
        - Predicate pushdown capabilities for filtered reading

    Performance Features:
        - Memory-efficient streaming for files of any size
        - Optimized cloud storage integration with fsspec
        - Parallel reading capabilities where supported
        - Intelligent buffering for network-based files
        - Schema-based validation for robust error handling

    Cloud Storage Optimization:
        - Uses fsspec for unified cloud storage access
        - Streaming reads minimize memory usage for large files
        - Connection pooling and reuse for better performance
        - Automatic retry strategies for network resilience

    Use Cases:
        - Large-scale STAC URL datasets (recommended format)
        - Cloud-based data processing workflows
        - High-performance ingestion requiring minimal memory
        - Datasets requiring schema validation and consistency
        - Integration with existing Parquet-based data pipelines

    Example:
        >>> reader = ParquetReader()
        >>> urls = reader.read_urls('s3://bucket/urls.parquet', 'stac_url')
        >>> print(f"Extracted {len(urls)} URLs from Parquet file")
    """

    def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
        """Read URLs from Parquet file."""
        if not HAS_PYARROW:
            raise ImportError("pyarrow is required for Parquet support. Install with: pip install pyarrow")

        try:
            if file_path.startswith("s3://"):
                if not HAS_FSSPEC:
                    raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
                if fsspec is None:
                    raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
                fs = fsspec.filesystem("s3")
                with fs.open(file_path, "rb") as f:
                    if pq is None:
                        raise ImportError("pyarrow is required for Parquet support. Install with: pip install pyarrow")
                    table = pq.read_table(f)
            else:
                if pq is None:
                    raise ImportError("pyarrow is required for Parquet support. Install with: pip install pyarrow")
                table = pq.read_table(file_path)

            df = table.to_pandas()

            if url_column not in df.columns:
                raise ValueError(f"Parquet file must contain '{url_column}' column")

            urls = cast(list[str], df[url_column].tolist())
            logger.info(f"Read {len(urls)} URLs from Parquet file: {file_path}")
            return urls

        except Exception as e:
            logger.error(f"Error reading Parquet file {file_path}: {e}")
            raise

    def validate_format(self, file_path: str) -> bool:
        """Validate Parquet file format."""
        if not HAS_PYARROW or pq is None:
            return False

        try:
            if file_path.startswith("s3://"):
                if not HAS_FSSPEC or fsspec is None:
                    return False
                fs = fsspec.filesystem("s3")
                with fs.open(file_path, "rb") as f:
                    pq.ParquetFile(f)
            else:
                pq.ParquetFile(file_path)
            return True
        except (OSError, ValueError, TypeError, RuntimeError):
            return False


class CSVReader(InputReader):
    """Reader for CSV files with URL column."""

    def __init__(self, delimiter: str = ",", quotechar: str = '"'):
        self.delimiter = delimiter
        self.quotechar = quotechar

    def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
        """Read URLs from CSV file."""
        if not HAS_PANDAS:
            raise ImportError("pandas is required for CSV support. Install with: pip install pandas")

        try:
            if file_path.startswith("s3://"):
                if not HAS_FSSPEC or fsspec is None:
                    raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
                fs = fsspec.filesystem("s3")
                with fs.open(file_path, "r") as f:
                    if pd is None:
                        raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
                    df = pd.read_csv(f, delimiter=self.delimiter, quotechar=self.quotechar)
            else:
                if pd is None:
                    raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
                df = pd.read_csv(file_path, delimiter=self.delimiter, quotechar=self.quotechar)

            if url_column not in df.columns:
                raise ValueError(f"CSV file must contain '{url_column}' column")

            urls = cast(list[str], df[url_column].dropna().tolist())
            logger.info(f"Read {len(urls)} URLs from CSV file: {file_path}")
            return urls

        except Exception as e:
            logger.error(f"Error reading CSV file {file_path}: {e}")
            raise

    def validate_format(self, file_path: str) -> bool:
        """Validate CSV file format."""
        if not HAS_PANDAS or pd is None:
            return False

        try:
            if file_path.startswith("s3://"):
                if not HAS_FSSPEC or fsspec is None:
                    return False
                fs = fsspec.filesystem("s3")
                with fs.open(file_path, "r") as f:
                    # Try to read first few lines to validate
                    pd.read_csv(f, nrows=5, delimiter=self.delimiter, quotechar=self.quotechar)
            else:
                pd.read_csv(file_path, nrows=5, delimiter=self.delimiter, quotechar=self.quotechar)
            return True
        except (OSError, ValueError, TypeError):
            return False


class NDJSONReader(InputReader):
    """Reader for NDJSON/JSONL files with URL field.

    This reader processes Newline-Delimited JSON (NDJSON) files, where each line
    contains a separate JSON object with a URL field. NDJSON is ideal for streaming
    large datasets and is commonly used for STAC item aggregations.

    NDJSON Format:
        - Each line is a valid JSON object
        - Lines are separated by newlines (not comma-separated)
        - Lines starting with '#' are treated as comments
        - URL field can be any key name (default: "url")

    Performance Features:
        - Memory-efficient streaming for unlimited file sizes
        - Line-by-line processing prevents loading entire file into memory
        - Cloud storage integration via fsspec for S3, GCS, Azure
        - Skips empty lines and comments automatically
        - Graceful handling of malformed JSON lines

    Use Cases:
        - Large-scale STAC URL datasets where each item is a JSON object
        - Streaming ingestion from cloud storage
        - ITS_LIVE bulk data processing (800+ files, ~50k items each)
        - Log files with JSON entries
        - Exported STAC collections in NDJSON format

    Example NDJSON file:
        {"url": "https://example.com/item1.json", "id": "item1"}
        {"url": "https://example.com/item2.json", "id": "item2"}
        # This is a comment line (will be skipped)
        {"url": "https://example.com/item3.json", "id": "item3"}

    Example:
        >>> reader = NDJSONReader()
        >>> urls = reader.read_urls('s3://bucket/items.jsonl', 'url')
        >>> print(f"Extracted {len(urls)} URLs from NDJSON file")
    """

    def read_urls(self, file_path: str, url_column: str = "url") -> list[str]:
        """Read URLs from NDJSON file.

        Args:
            file_path: Path to NDJSON file (local or s3://)
            url_column: JSON key name containing URLs (default: "url")

        Returns:
            List of URLs extracted from each JSON object.

        Raises:
            ValueError: If file_path is empty or url_column is not found in any objects.
            ImportError: If fsspec is required for S3 but not installed.
        """
        if not file_path:
            raise ValueError("file_path cannot be empty")

        urls: list[str] = []
        line_number = 0
        skipped_lines = 0
        missing_field_count = 0

        try:
            if file_path.startswith("s3://"):
                if not HAS_FSSPEC or fsspec is None:
                    raise ImportError("fsspec is required for S3 support. Install with: pip install fsspec s3fs")
                fs = fsspec.filesystem("s3")
                with fs.open(file_path, "r") as f:
                    urls, line_number, skipped_lines, missing_field_count = self._read_lines(f, url_column)
            else:
                with open(file_path) as f:
                    urls, line_number, skipped_lines, missing_field_count = self._read_lines(f, url_column)

            if not urls:
                if line_number == 0:
                    # Empty file is valid, just return empty list
                    logger.info(f"NDJSON file is empty: {file_path}")
                elif missing_field_count > 0:
                    raise ValueError(
                        f"NDJSON file does not contain '{url_column}' field in any objects. "
                        f"Checked {line_number} lines."
                    )
                else:
                    raise ValueError(
                        f"NDJSON file does not contain any valid JSON objects with data. "
                        f"Checked {line_number} lines, {skipped_lines} were empty/comments."
                    )

            if skipped_lines > 0:
                logger.warning(f"Skipped {skipped_lines} empty/comment lines in {file_path}")

            if missing_field_count > 0:
                logger.warning(f"{missing_field_count} lines missing '{url_column}' field in {file_path}")

            logger.info(
                f"Read {len(urls)} URLs from NDJSON file: {file_path} "
                f"({line_number} lines processed, {skipped_lines} skipped)"
            )
            return urls

        except FileNotFoundError:
            logger.error(f"NDJSON file not found: {file_path}")
            raise
        except ImportError:
            raise
        except Exception as e:
            logger.error(f"Error reading NDJSON file {file_path}: {e}")
            raise

    def _read_lines(self, file_obj, url_column: str) -> tuple[list[str], int, int, int]:
        """Read and parse lines from file object.

        Args:
            file_obj: Open file object (local or S3)
            url_column: JSON key name containing URLs

        Returns:
            Tuple of (urls list, total lines, skipped lines, missing field count)
        """
        import json

        urls: list[str] = []
        line_number = 0
        skipped_lines = 0
        missing_field_count = 0

        for line in file_obj:
            line_number += 1
            line = line.strip()

            # Skip empty lines
            if not line:
                skipped_lines += 1
                continue

            # Skip comment lines
            if line.startswith("#"):
                skipped_lines += 1
                continue

            # Parse JSON object
            try:
                obj = json.loads(line)

                # Extract URL from specified field
                if isinstance(obj, dict):
                    if url_column in obj:
                        url_value = obj[url_column]
                        if isinstance(url_value, str) and url_value:
                            urls.append(url_value)
                        else:
                            logger.debug(
                                f"Line {line_number}: '{url_column}' field is not a non-empty string, skipping"
                            )
                            missing_field_count += 1
                    else:
                        missing_field_count += 1
                else:
                    logger.debug(f"Line {line_number}: JSON object is not a dict, skipping")
                    missing_field_count += 1

            except json.JSONDecodeError as e:
                logger.warning(f"Line {line_number}: Invalid JSON, skipping: {e}")
                skipped_lines += 1
                continue

        return urls, line_number, skipped_lines, missing_field_count

    def validate_format(self, file_path: str) -> bool:
        """Validate NDJSON file format.

        Args:
            file_path: Path to NDJSON file (local or s3://)

        Returns:
            True if file appears to be valid NDJSON, False otherwise.
        """

        try:
            if file_path.startswith("s3://"):
                if not HAS_FSSPEC or fsspec is None:
                    return False
                fs = fsspec.filesystem("s3")
                with fs.open(file_path, "r") as f:
                    return self._validate_content(f)
            else:
                with open(file_path) as f:
                    return self._validate_content(f)

        except (OSError, ValueError, UnicodeDecodeError):
            return False

    def _validate_content(self, file_obj) -> bool:
        """Validate file content by checking first few non-comment lines.

        Args:
            file_obj: Open file object

        Returns:
            True if file contains valid JSON objects, False otherwise.
        """
        import json

        valid_lines = 0
        max_lines_to_check = 10

        for line in file_obj:
            line = line.strip()

            # Skip empty and comment lines
            if not line or line.startswith("#"):
                continue

            # Try to parse as JSON
            try:
                obj = json.loads(line)
                if isinstance(obj, dict):
                    valid_lines += 1
                    if valid_lines >= max_lines_to_check:
                        return True
            except (json.JSONDecodeError, ValueError):
                return False

        # Return True if we found at least one valid JSON object
        return valid_lines > 0


class ReaderFactory:
    """Factory class for automatic input reader selection and configuration.

    This factory provides intelligent input reader selection based on file format
    specifications or automatic detection from file extensions. Eliminates the need
    for manual reader instantiation and configuration in most use cases.

    Automatic Format Detection:
        When format_name='auto', the factory analyzes file extensions to select
        the most appropriate reader:
        - .parquet, .pq → ParquetReader (high performance, recommended)
        - .csv → CSVReader with comma delimiter
        - .tsv, .tab → CSVReader with tab delimiter
        - .ndjson, .jsonl → NDJSONReader for newline-delimited JSON
        - Fallback to CSV for unknown extensions

    Supported Formats:
        - 'parquet': Optimized columnar format reader
        - 'csv': Flexible comma-separated values reader
        - 'tsv': Tab-separated values (CSV variant)
        - 'ndjson': Newline-delimited JSON reader
        - 'jsonl': Alias for ndjson format
        - 'auto': Automatic detection based on file extension

    Reader Configuration:
        The factory accepts configuration parameters that are passed to the
        underlying readers for customization:
        - CSV readers: delimiter, encoding, quoting options
        - Parquet readers: column selection and filtering options
        - All readers: error handling and validation settings

    Example:
        >>> # Automatic format detection (recommended)
        >>> reader = ReaderFactory.get_reader('auto')
        >>> urls = reader.read_urls('data.parquet', 'url')
        >>>
        >>> # Explicit format for performance-critical scenarios
        >>> reader = ReaderFactory.get_reader('parquet')
        >>>
        >>> # Custom CSV configuration
        >>> reader = ReaderFactory.get_reader('csv', delimiter='|', encoding='utf-8')

    Performance:
        The factory adds minimal overhead to reader creation and is optimized for
        repeated use. Reader instances can be reused across multiple files of the
        same format for better performance in batch processing scenarios.
    """

    _readers = {
        "parquet": ParquetReader,
        "csv": CSVReader,
        "tsv": lambda: CSVReader(delimiter="\t"),
        "ndjson": NDJSONReader,
        "jsonl": NDJSONReader,
    }

    @classmethod
    def get_reader(cls, format_name: str, **kwargs) -> InputReader:
        """Get reader for specified format."""
        format_name = format_name.lower()

        if format_name not in cls._readers:
            raise ValueError(f"Unsupported input format: {format_name}. Supported formats: {list(cls._readers.keys())}")

        reader_class = cls._readers[format_name]

        # Handle lambda functions for parameterized readers
        if callable(reader_class) and not isinstance(reader_class, type):
            result = reader_class()
            return cast(InputReader, result)

        # For type classes, instantiate with kwargs
        reader_type = cast(type[InputReader], reader_class)
        result = reader_type(**kwargs)
        return result

    @classmethod
    def auto_detect_format(cls, file_path: str) -> str:
        """Auto-detect file format based on extension."""
        path_lower = file_path.lower()

        if path_lower.endswith(".parquet") or path_lower.endswith(".pq"):
            return "parquet"
        elif path_lower.endswith(".csv"):
            return "csv"
        elif path_lower.endswith(".tsv") or path_lower.endswith(".tab"):
            return "tsv"
        elif path_lower.endswith(".ndjson"):
            return "ndjson"
        elif path_lower.endswith(".jsonl"):
            return "jsonl"
        else:
            # Default to CSV format for unknown extensions
            logger.warning(f"Unknown file extension for {file_path}, defaulting to CSV format")
            return "csv"

    @classmethod
    def get_supported_formats(cls) -> list[str]:
        """Get list of supported input formats."""
        return list(cls._readers.keys())
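
For orientation, a minimal usage sketch against the factory API shown in the diff above (not part of the packaged file): the path "urls.csv" and column name "url" are placeholders, and pandas must be installed for the CSV reader to run.

# Hypothetical caller code; only uses methods defined in input_readers.py above.
from earthcatalog.input_readers import ReaderFactory

fmt = ReaderFactory.auto_detect_format("urls.csv")       # -> "csv" (placeholder path)
reader = ReaderFactory.get_reader(fmt)                    # instantiates CSVReader()
urls = reader.read_urls("urls.csv", url_column="url")     # raises ImportError if pandas is missing
print(len(urls), ReaderFactory.get_supported_formats())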