earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/tests/test_input_readers.py
@@ -0,0 +1,666 @@
+"""Tests for the input_readers module."""
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pandas as pd
+import pytest
+
+from earthcatalog.input_readers import (
+    CSVReader,
+    InputReader,
+    NDJSONReader,
+    ParquetReader,
+    ReaderFactory,
+)
+
+
+class TestParquetReader:
+    """Tests for ParquetReader class."""
+
+    @pytest.fixture
+    def sample_parquet_file(self, tmp_path: Path) -> Path:
+        """Create a temporary parquet file with URLs."""
+        df = pd.DataFrame(
+            {
+                "url": [
+                    "https://example.com/item1.json",
+                    "https://example.com/item2.json",
+                    "https://example.com/item3.json",
+                ],
+                "name": ["item1", "item2", "item3"],
+            }
+        )
+        path = tmp_path / "urls.parquet"
+        df.to_parquet(path, index=False)
+        return path
+
+    @pytest.fixture
+    def empty_parquet_file(self, tmp_path: Path) -> Path:
+        """Create an empty parquet file."""
+        df = pd.DataFrame({"url": []})
+        path = tmp_path / "empty.parquet"
+        df.to_parquet(path, index=False)
+        return path
+
+    def test_read_urls_basic(self, sample_parquet_file: Path):
+        """Test reading URLs from parquet file."""
+        reader = ParquetReader()
+        urls = reader.read_urls(str(sample_parquet_file), "url")
+        assert len(urls) == 3
+        assert all(u.startswith("https://") for u in urls)
+        assert "https://example.com/item1.json" in urls
+
+    def test_read_urls_custom_column(self, tmp_path: Path):
+        """Test reading from custom URL column name."""
+        df = pd.DataFrame({"stac_url": ["https://example.com/item.json"]})
+        path = tmp_path / "custom.parquet"
+        df.to_parquet(path, index=False)
+
+        reader = ParquetReader()
+        urls = reader.read_urls(str(path), "stac_url")
+        assert len(urls) == 1
+        assert urls[0] == "https://example.com/item.json"
+
+    def test_read_urls_missing_column(self, sample_parquet_file: Path):
+        """Test error when URL column doesn't exist."""
+        reader = ParquetReader()
+        with pytest.raises(ValueError, match="must contain"):
+            reader.read_urls(str(sample_parquet_file), "nonexistent_column")
+
+    def test_read_urls_file_not_found(self):
+        """Test error when file doesn't exist."""
+        reader = ParquetReader()
+        with pytest.raises(FileNotFoundError):
+            reader.read_urls("/nonexistent/path/to/file.parquet", "url")
+
+    def test_read_urls_empty_file(self, empty_parquet_file: Path):
+        """Test reading from empty parquet file."""
+        reader = ParquetReader()
+        urls = reader.read_urls(str(empty_parquet_file), "url")
+        assert len(urls) == 0
+        assert isinstance(urls, list)
+
+    def test_validate_format_valid(self, sample_parquet_file: Path):
+        """Test format validation for valid parquet."""
+        reader = ParquetReader()
+        assert reader.validate_format(str(sample_parquet_file)) is True
+
+    def test_validate_format_invalid(self, tmp_path: Path):
+        """Test format validation for invalid file."""
+        invalid = tmp_path / "invalid.txt"
+        invalid.write_text("this is not a parquet file")
+        reader = ParquetReader()
+        assert reader.validate_format(str(invalid)) is False
+
+    def test_validate_format_nonexistent(self):
+        """Test format validation for nonexistent file."""
+        reader = ParquetReader()
+        assert reader.validate_format("/nonexistent/file.parquet") is False
+
+    def test_read_urls_preserves_order(self, tmp_path: Path):
+        """Test that URL order is preserved."""
+        urls = [f"https://example.com/item{i}.json" for i in range(100)]
+        df = pd.DataFrame({"url": urls})
+        path = tmp_path / "ordered.parquet"
+        df.to_parquet(path, index=False)
+
+        reader = ParquetReader()
+        result = reader.read_urls(str(path), "url")
+        assert result == urls
+
+
+class TestCSVReader:
+    """Tests for CSVReader class."""
+
+    @pytest.fixture
+    def sample_csv_file(self, tmp_path: Path) -> Path:
+        """Create a temporary CSV file with URLs."""
+        path = tmp_path / "urls.csv"
+        path.write_text("url,name\nhttps://example.com/1.json,item1\nhttps://example.com/2.json,item2\n")
+        return path
+
+    @pytest.fixture
+    def empty_csv_file(self, tmp_path: Path) -> Path:
+        """Create an empty CSV file (header only)."""
+        path = tmp_path / "empty.csv"
+        path.write_text("url,name\n")
+        return path
+
+    def test_read_urls_basic(self, sample_csv_file: Path):
+        """Test reading URLs from CSV file."""
+        reader = CSVReader()
+        urls = reader.read_urls(str(sample_csv_file), "url")
+        assert len(urls) == 2
+        assert "https://example.com/1.json" in urls
+        assert "https://example.com/2.json" in urls
+
+    def test_read_urls_custom_delimiter(self, tmp_path: Path):
+        """Test reading with custom delimiter (pipe)."""
+        path = tmp_path / "urls.psv"
+        path.write_text("url|name\nhttps://example.com/1.json|item1\n")
+        reader = CSVReader(delimiter="|")
+        urls = reader.read_urls(str(path), "url")
+        assert len(urls) == 1
+        assert urls[0] == "https://example.com/1.json"
+
+    def test_read_urls_tab_delimiter(self, tmp_path: Path):
+        """Test reading TSV file."""
+        path = tmp_path / "urls.tsv"
+        path.write_text("url\tname\nhttps://example.com/1.json\titem1\n")
+        reader = CSVReader(delimiter="\t")
+        urls = reader.read_urls(str(path), "url")
+        assert len(urls) == 1
+
+    def test_read_urls_empty_file(self, empty_csv_file: Path):
+        """Test reading empty CSV file (header only)."""
+        reader = CSVReader()
+        urls = reader.read_urls(str(empty_csv_file), "url")
+        assert len(urls) == 0
+
+    def test_read_urls_missing_column(self, sample_csv_file: Path):
+        """Test error when URL column doesn't exist."""
+        reader = CSVReader()
+        with pytest.raises(ValueError, match="must contain"):
+            reader.read_urls(str(sample_csv_file), "nonexistent")
+
+    def test_read_urls_with_null_values(self, tmp_path: Path):
+        """Test that null URL values are filtered out."""
+        path = tmp_path / "with_nulls.csv"
+        path.write_text("url,name\nhttps://example.com/1.json,item1\n,item2\nhttps://example.com/3.json,item3\n")
+        reader = CSVReader()
+        urls = reader.read_urls(str(path), "url")
+        # Should have 2 URLs (the empty one filtered out)
+        assert len(urls) == 2
+        assert "" not in urls
+
+    def test_validate_format_valid(self, sample_csv_file: Path):
+        """Test format validation for valid CSV."""
+        reader = CSVReader()
+        assert reader.validate_format(str(sample_csv_file)) is True
+
+    def test_validate_format_invalid_delimiter(self, tmp_path: Path):
+        """Test validation when delimiter doesn't match."""
+        path = tmp_path / "wrong_delim.csv"
+        path.write_text("url|name\nhttps://example.com/1.json|item1\n")
+        # With default comma delimiter, this might still "validate" as CSV
+        # but would have wrong structure
+        reader = CSVReader(delimiter=",")
+        # This should still return True as it's valid CSV (just single column)
+        assert reader.validate_format(str(path)) is True
+
+    def test_custom_quotechar(self, tmp_path: Path):
+        """Test reading with custom quote character."""
+        path = tmp_path / "quoted.csv"
+        path.write_text("url,name\n'https://example.com/1.json','item1'\n")
+        reader = CSVReader(quotechar="'")
+        urls = reader.read_urls(str(path), "url")
+        assert len(urls) == 1
+
+
+class TestNDJSONReader:
+    """Tests for NDJSONReader class."""
+
+    @pytest.fixture
+    def sample_ndjson_file(self, tmp_path: Path) -> Path:
+        """Create a temporary NDJSON file with URLs."""
+        path = tmp_path / "urls.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+            '{"url": "https://example.com/item3.json", "id": "item3"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def sample_jsonl_file(self, tmp_path: Path) -> Path:
+        """Create a temporary JSONL file with URLs."""
+        path = tmp_path / "urls.jsonl"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_comments(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with comment lines."""
+        path = tmp_path / "with_comments.ndjson"
+        path.write_text(
+            "# This is a comment\n"
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            "# Another comment\n"
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_empty_lines(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with empty lines."""
+        path = tmp_path / "with_empty.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            "\n"
+            "\n"
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_malformed(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with some malformed JSON."""
+        path = tmp_path / "with_malformed.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            "this is not json\n"
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+            '{"url": "https://example.com/item3.json", "id": "item3"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_custom_url_field(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with custom URL field name."""
+        path = tmp_path / "custom_field.ndjson"
+        path.write_text(
+            '{"stac_url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"stac_url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_missing_field(self, tmp_path: Path) -> Path:
+        """Create NDJSON file where some objects are missing the URL field."""
+        path = tmp_path / "missing_field.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"id": "item2"}\n'  # Missing url field
+            '{"url": "https://example.com/item3.json", "id": "item3"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def empty_ndjson_file(self, tmp_path: Path) -> Path:
+        """Create an empty NDJSON file."""
+        path = tmp_path / "empty.ndjson"
+        path.write_text("")
+        return path
+
+    @pytest.fixture
+    def ndjson_all_comments(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with only comments."""
+        path = tmp_path / "all_comments.ndjson"
+        path.write_text("# Comment 1\n# Comment 2\n# Comment 3\n")
+        return path
+
+    def test_read_urls_basic(self, sample_ndjson_file: Path):
+        """Test reading URLs from NDJSON file."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(sample_ndjson_file), "url")
+        assert len(urls) == 3
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+        assert "https://example.com/item3.json" in urls
+
+    def test_read_urls_from_jsonl(self, sample_jsonl_file: Path):
+        """Test reading URLs from JSONL file (.jsonl extension)."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(sample_jsonl_file), "url")
+        assert len(urls) == 2
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+
+    def test_read_urls_with_comments(self, ndjson_with_comments: Path):
+        """Test that comment lines are skipped."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_comments), "url")
+        assert len(urls) == 2
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+
+    def test_read_urls_with_empty_lines(self, ndjson_with_empty_lines: Path):
+        """Test that empty lines are skipped."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_empty_lines), "url")
+        assert len(urls) == 2
+
+    def test_read_urls_with_malformed_json(self, ndjson_with_malformed: Path):
+        """Test that malformed JSON lines are skipped."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_malformed), "url")
+        # Should skip the malformed line
+        assert len(urls) == 3
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+        assert "https://example.com/item3.json" in urls
+
+    def test_read_urls_custom_field(self, ndjson_custom_url_field: Path):
+        """Test reading from custom URL field name."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_custom_url_field), "stac_url")
+        assert len(urls) == 2
+        assert urls[0] == "https://example.com/item1.json"
+
+    def test_read_urls_missing_field(self, ndjson_with_missing_field: Path):
+        """Test handling when some objects are missing the URL field."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_missing_field), "url")
+        # Should only get URLs from objects that have the field
+        assert len(urls) == 2
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item3.json" in urls
+
+    def test_read_urls_empty_file(self, empty_ndjson_file: Path):
+        """Test reading from empty NDJSON file."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(empty_ndjson_file), "url")
+        assert len(urls) == 0
+        assert isinstance(urls, list)
+
+    def test_read_urls_all_comments(self, ndjson_all_comments: Path):
+        """Test file with only comment lines."""
+        reader = NDJSONReader()
+        with pytest.raises(ValueError, match="does not contain"):
+            reader.read_urls(str(ndjson_all_comments), "url")
+
+    def test_read_urls_file_not_found(self):
+        """Test error when file doesn't exist."""
+        reader = NDJSONReader()
+        with pytest.raises(FileNotFoundError):
+            reader.read_urls("/nonexistent/path/to/file.ndjson", "url")
+
+    def test_read_urls_empty_path(self):
+        """Test error when file path is empty."""
+        reader = NDJSONReader()
+        with pytest.raises(ValueError, match="file_path cannot be empty"):
+            reader.read_urls("", "url")
+
+    def test_read_urls_all_missing_field(self, tmp_path: Path):
+        """Test error when all objects are missing the URL field."""
+        path = tmp_path / "all_missing.ndjson"
+        path.write_text('{"id": "item1"}\n{"id": "item2"}\n')
+
+        reader = NDJSONReader()
+        with pytest.raises(ValueError, match="does not contain"):
+            reader.read_urls(str(path), "url")
+
+    def test_read_urls_preserves_order(self, tmp_path: Path):
+        """Test that URL order is preserved."""
+        urls = [f"https://example.com/item{i}.json" for i in range(100)]
+        lines = [f'{{"url": "{url}", "id": "item{i}"}}\n' for i, url in enumerate(urls)]
+
+        path = tmp_path / "ordered.ndjson"
+        path.write_text("".join(lines))
+
+        reader = NDJSONReader()
+        result = reader.read_urls(str(path), "url")
+        assert result == urls
+
+    def test_read_urls_large_file(self, tmp_path: Path):
+        """Test reading larger NDJSON files."""
+        # Create file with 10000 lines
+        lines = [f'{{"url": "https://example.com/item{i}.json"}}\n' for i in range(10000)]
+        path = tmp_path / "large.ndjson"
+        path.write_text("".join(lines))
+
+        reader = NDJSONReader()
+        result = reader.read_urls(str(path), "url")
+        assert len(result) == 10000
+        assert result[0] == "https://example.com/item0.json"
+        assert result[-1] == "https://example.com/item9999.json"
+
+    def test_validate_format_valid(self, sample_ndjson_file: Path):
+        """Test format validation for valid NDJSON."""
+        reader = NDJSONReader()
+        assert reader.validate_format(str(sample_ndjson_file)) is True
+
+    def test_validate_format_valid_jsonl(self, sample_jsonl_file: Path):
+        """Test format validation for JSONL files."""
+        reader = NDJSONReader()
+        assert reader.validate_format(str(sample_jsonl_file)) is True
+
+    def test_validate_format_invalid_json(self, tmp_path: Path):
+        """Test format validation for invalid JSON."""
+        invalid = tmp_path / "invalid.ndjson"
+        invalid.write_text("this is not json\nnot json either\n")
+        reader = NDJSONReader()
+        assert reader.validate_format(str(invalid)) is False
+
+    def test_validate_format_nonexistent(self):
+        """Test format validation for nonexistent file."""
+        reader = NDJSONReader()
+        assert reader.validate_format("/nonexistent/file.ndjson") is False
+
+    def test_validate_format_empty_file(self, empty_ndjson_file: Path):
+        """Test format validation for empty file."""
+        reader = NDJSONReader()
+        # Empty file has no valid JSON, so should be False
+        assert reader.validate_format(str(empty_ndjson_file)) is False
+
+    def test_validate_format_with_comments(self, ndjson_with_comments: Path):
+        """Test validation works with comment lines."""
+        reader = NDJSONReader()
+        # Should detect valid JSON after comments
+        assert reader.validate_format(str(ndjson_with_comments)) is True
+
+
+class TestReaderFactory:
+    """Tests for ReaderFactory class."""
+
+    @pytest.mark.parametrize(
+        "format_name,expected_type,expected_delimiter",
+        [
+            ("parquet", ParquetReader, None),
+            ("csv", CSVReader, ","),
+            ("tsv", CSVReader, "\t"),
+            ("ndjson", NDJSONReader, None),
+            ("jsonl", NDJSONReader, None),
+            ("PARQUET", ParquetReader, None),  # case insensitive
+            ("NDJSON", NDJSONReader, None),  # case insensitive
+            ("JSONL", NDJSONReader, None),  # case insensitive
+        ],
+        ids=["parquet", "csv", "tsv", "ndjson", "jsonl", "parquet_uppercase", "ndjson_uppercase", "jsonl_uppercase"],
+    )
+    def test_get_reader(self, format_name, expected_type, expected_delimiter):
+        """Test getting reader for various format types."""
+        reader = ReaderFactory.get_reader(format_name)
+        assert isinstance(reader, expected_type)
+        if expected_delimiter is not None:
+            assert reader.delimiter == expected_delimiter
+
+    def test_get_reader_unknown_format(self):
+        """Test error for unknown format."""
+        with pytest.raises(ValueError, match="Unsupported input format"):
+            ReaderFactory.get_reader("unknown_format")
+
+    @pytest.mark.parametrize(
+        "file_path,expected_format",
+        [
+            ("data.parquet", "parquet"),
+            ("s3://bucket/data.parquet", "parquet"),
+            ("/path/to/DATA.PARQUET", "parquet"),
+            ("data.csv", "csv"),
+            ("s3://bucket/data.CSV", "csv"),
+            ("data.tsv", "tsv"),
+            ("data.ndjson", "ndjson"),
+            ("s3://bucket/data.NDJSON", "ndjson"),
+            ("data.jsonl", "jsonl"),
+            ("/path/to/data.JSONL", "jsonl"),
+            ("data.unknown", "csv"),  # defaults to csv
+        ],
+        ids=[
+            "parquet",
+            "parquet_s3",
+            "parquet_upper",
+            "csv",
+            "csv_s3",
+            "tsv",
+            "ndjson",
+            "ndjson_s3",
+            "jsonl",
+            "jsonl_upper",
+            "unknown_defaults_csv",
+        ],
+    )
+    def test_auto_detect_format(self, file_path, expected_format):
+        """Test auto-detection of file formats."""
+        assert ReaderFactory.auto_detect_format(file_path) == expected_format
+
+    def test_get_supported_formats(self):
+        """Test list of supported formats."""
+        formats = ReaderFactory.get_supported_formats()
+        assert "parquet" in formats
+        assert "csv" in formats
+        assert "tsv" in formats
+        assert "ndjson" in formats
+        assert "jsonl" in formats
+        assert isinstance(formats, list)
+
+
+class TestInputReaderInterface:
+    """Tests for InputReader abstract interface compliance."""
+
+    @pytest.mark.parametrize(
+        "reader_class",
+        [ParquetReader, CSVReader, NDJSONReader],
+        ids=["parquet_reader", "csv_reader", "ndjson_reader"],
+    )
+    def test_reader_implements_interface(self, reader_class):
+        """Test that readers implement InputReader interface."""
+        reader = reader_class()
+        assert isinstance(reader, InputReader)
+        assert hasattr(reader, "read_urls")
+        assert hasattr(reader, "validate_format")
+        assert callable(reader.read_urls)
+        assert callable(reader.validate_format)
+
+
+class TestReaderIntegration:
+    """Integration tests for reader workflows."""
+
+    def test_factory_then_read_parquet(self, tmp_path: Path):
+        """Test complete workflow: factory -> read parquet."""
+        # Create test file
+        df = pd.DataFrame({"stac_url": ["https://example.com/item.json"]})
+        path = tmp_path / "test.parquet"
+        df.to_parquet(path, index=False)
+
+        # Use factory to get reader and read
+        reader = ReaderFactory.get_reader("parquet")
+        urls = reader.read_urls(str(path), "stac_url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_factory_then_read_csv(self, tmp_path: Path):
+        """Test complete workflow: factory -> read csv."""
+        path = tmp_path / "test.csv"
+        path.write_text("stac_url\nhttps://example.com/item.json\n")
+
+        reader = ReaderFactory.get_reader("csv")
+        urls = reader.read_urls(str(path), "stac_url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_factory_then_read_ndjson(self, tmp_path: Path):
+        """Test complete workflow: factory -> read ndjson."""
+        path = tmp_path / "test.ndjson"
+        path.write_text('{"stac_url": "https://example.com/item.json"}\n')
+
+        reader = ReaderFactory.get_reader("ndjson")
+        urls = reader.read_urls(str(path), "stac_url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_factory_auto_detect_ndjson(self, tmp_path: Path):
+        """Test auto-detection workflow for ndjson files."""
+        path = tmp_path / "data.jsonl"
+        path.write_text('{"url": "https://example.com/item.json"}\n')
+
+        # Auto-detect should pick jsonl format
+        format_detected = ReaderFactory.auto_detect_format(str(path))
+        assert format_detected == "jsonl"
+
+        # Get reader and read
+        reader = ReaderFactory.get_reader(format_detected)
+        urls = reader.read_urls(str(path), "url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_large_file_handling(self, tmp_path: Path):
+        """Test reading larger files works correctly."""
+        # Create file with 10000 URLs
+        urls = [f"https://example.com/item{i}.json" for i in range(10000)]
+        df = pd.DataFrame({"url": urls})
+        path = tmp_path / "large.parquet"
+        df.to_parquet(path, index=False)
+
+        reader = ParquetReader()
+        result = reader.read_urls(str(path), "url")
+        assert len(result) == 10000
+        assert result[0] == "https://example.com/item0.json"
+        assert result[-1] == "https://example.com/item9999.json"
+
+
+class TestCloudStorageMocking:
+    """Tests for cloud storage integration (mocked)."""
+
+    @patch("earthcatalog.input_readers.HAS_FSSPEC", True)
+    @patch("earthcatalog.input_readers.fsspec")
+    def test_parquet_s3_path_detection(self, mock_fsspec):
+        """Test that S3 paths trigger fsspec usage."""
+        # Setup mock
+        mock_fs = MagicMock()
+        mock_fsspec.filesystem.return_value = mock_fs
+
+        # Create a mock file object that returns parquet data
+        mock_file = MagicMock()
+        mock_fs.open.return_value.__enter__ = MagicMock(return_value=mock_file)
+        mock_fs.open.return_value.__exit__ = MagicMock(return_value=False)
+
+        reader = ParquetReader()
+
+        # This should detect S3 and use fsspec
+        try:
+            reader.read_urls("s3://bucket/data.parquet", "url")
+        except (ValueError, TypeError, OSError):
+            # Expected to fail since mock doesn't return real data
+            pass
+
+        # Verify fsspec was called with s3
+        mock_fsspec.filesystem.assert_called_once_with("s3")
+
+    @patch("earthcatalog.input_readers.HAS_FSSPEC", True)
+    @patch("earthcatalog.input_readers.fsspec")
+    def test_csv_s3_path_detection(self, mock_fsspec):
+        """Test that S3 paths trigger fsspec usage for CSV."""
+        mock_fs = MagicMock()
+        mock_fsspec.filesystem.return_value = mock_fs
+
+        mock_file = MagicMock()
+        mock_fs.open.return_value.__enter__ = MagicMock(return_value=mock_file)
+        mock_fs.open.return_value.__exit__ = MagicMock(return_value=False)
+
+        reader = CSVReader()
+
+        try:
+            reader.read_urls("s3://bucket/data.csv", "url")
+        except (ValueError, TypeError, OSError):
+            pass
+
+        mock_fsspec.filesystem.assert_called_once_with("s3")
+
+    @patch("earthcatalog.input_readers.HAS_FSSPEC", True)
+    @patch("earthcatalog.input_readers.fsspec")
+    def test_ndjson_s3_path_detection(self, mock_fsspec):
+        """Test that S3 paths trigger fsspec usage for NDJSON."""
+        mock_fs = MagicMock()
+        mock_fsspec.filesystem.return_value = mock_fs
+
+        mock_file = MagicMock()
+        mock_fs.open.return_value.__enter__ = MagicMock(return_value=mock_file)
+        mock_fs.open.return_value.__exit__ = MagicMock(return_value=False)
+
+        reader = NDJSONReader()
+
+        try:
+            reader.read_urls("s3://bucket/data.ndjson", "url")
+        except (ValueError, TypeError, OSError):
+            pass
+
+        mock_fsspec.filesystem.assert_called_once_with("s3")
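
Taken together, these tests imply the public workflow of input_readers: detect a format from the file extension, obtain the matching reader from ReaderFactory, and read an ordered list of URLs from the named column or field. Below is a minimal usage sketch assuming only the behavior the tests above assert; the path catalog/items.jsonl is illustrative, not part of the package.

    from earthcatalog.input_readers import ReaderFactory

    source = "catalog/items.jsonl"  # hypothetical input file

    # Extension-based detection; per the tests, unknown extensions fall back to "csv".
    fmt = ReaderFactory.auto_detect_format(source)  # -> "jsonl"

    # Format names are case-insensitive; unsupported names raise ValueError.
    reader = ReaderFactory.get_reader(fmt)  # -> NDJSONReader

    # validate_format returns False (rather than raising) for unreadable files.
    if reader.validate_format(source):
        # Returns URLs in file order; null values, comment lines, and
        # objects missing the field are skipped per the tests above.
        urls = reader.read_urls(source, "url")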