earthcatalog-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,666 @@
+"""Tests for the input_readers module."""
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pandas as pd
+import pytest
+
+from earthcatalog.input_readers import (
+    CSVReader,
+    InputReader,
+    NDJSONReader,
+    ParquetReader,
+    ReaderFactory,
+)
+
+
+class TestParquetReader:
+    """Tests for ParquetReader class."""
+
+    @pytest.fixture
+    def sample_parquet_file(self, tmp_path: Path) -> Path:
+        """Create a temporary parquet file with URLs."""
+        df = pd.DataFrame(
+            {
+                "url": [
+                    "https://example.com/item1.json",
+                    "https://example.com/item2.json",
+                    "https://example.com/item3.json",
+                ],
+                "name": ["item1", "item2", "item3"],
+            }
+        )
+        path = tmp_path / "urls.parquet"
+        df.to_parquet(path, index=False)
+        return path
+
+    @pytest.fixture
+    def empty_parquet_file(self, tmp_path: Path) -> Path:
+        """Create an empty parquet file."""
+        df = pd.DataFrame({"url": []})
+        path = tmp_path / "empty.parquet"
+        df.to_parquet(path, index=False)
+        return path
+
+    def test_read_urls_basic(self, sample_parquet_file: Path):
+        """Test reading URLs from parquet file."""
+        reader = ParquetReader()
+        urls = reader.read_urls(str(sample_parquet_file), "url")
+        assert len(urls) == 3
+        assert all(u.startswith("https://") for u in urls)
+        assert "https://example.com/item1.json" in urls
+
+    def test_read_urls_custom_column(self, tmp_path: Path):
+        """Test reading from custom URL column name."""
+        df = pd.DataFrame({"stac_url": ["https://example.com/item.json"]})
+        path = tmp_path / "custom.parquet"
+        df.to_parquet(path, index=False)
+
+        reader = ParquetReader()
+        urls = reader.read_urls(str(path), "stac_url")
+        assert len(urls) == 1
+        assert urls[0] == "https://example.com/item.json"
+
+    def test_read_urls_missing_column(self, sample_parquet_file: Path):
+        """Test error when URL column doesn't exist."""
+        reader = ParquetReader()
+        with pytest.raises(ValueError, match="must contain"):
+            reader.read_urls(str(sample_parquet_file), "nonexistent_column")
+
+    def test_read_urls_file_not_found(self):
+        """Test error when file doesn't exist."""
+        reader = ParquetReader()
+        with pytest.raises(FileNotFoundError):
+            reader.read_urls("/nonexistent/path/to/file.parquet", "url")
+
+    def test_read_urls_empty_file(self, empty_parquet_file: Path):
+        """Test reading from empty parquet file."""
+        reader = ParquetReader()
+        urls = reader.read_urls(str(empty_parquet_file), "url")
+        assert len(urls) == 0
+        assert isinstance(urls, list)
+
+    def test_validate_format_valid(self, sample_parquet_file: Path):
+        """Test format validation for valid parquet."""
+        reader = ParquetReader()
+        assert reader.validate_format(str(sample_parquet_file)) is True
+
+    def test_validate_format_invalid(self, tmp_path: Path):
+        """Test format validation for invalid file."""
+        invalid = tmp_path / "invalid.txt"
+        invalid.write_text("this is not a parquet file")
+        reader = ParquetReader()
+        assert reader.validate_format(str(invalid)) is False
+
+    def test_validate_format_nonexistent(self):
+        """Test format validation for nonexistent file."""
+        reader = ParquetReader()
+        assert reader.validate_format("/nonexistent/file.parquet") is False
+
+    def test_read_urls_preserves_order(self, tmp_path: Path):
+        """Test that URL order is preserved."""
+        urls = [f"https://example.com/item{i}.json" for i in range(100)]
+        df = pd.DataFrame({"url": urls})
+        path = tmp_path / "ordered.parquet"
+        df.to_parquet(path, index=False)
+
+        reader = ParquetReader()
+        result = reader.read_urls(str(path), "url")
+        assert result == urls
+
+
+class TestCSVReader:
+    """Tests for CSVReader class."""
+
+    @pytest.fixture
+    def sample_csv_file(self, tmp_path: Path) -> Path:
+        """Create a temporary CSV file with URLs."""
+        path = tmp_path / "urls.csv"
+        path.write_text("url,name\nhttps://example.com/1.json,item1\nhttps://example.com/2.json,item2\n")
+        return path
+
+    @pytest.fixture
+    def empty_csv_file(self, tmp_path: Path) -> Path:
+        """Create an empty CSV file (header only)."""
+        path = tmp_path / "empty.csv"
+        path.write_text("url,name\n")
+        return path
+
+    def test_read_urls_basic(self, sample_csv_file: Path):
+        """Test reading URLs from CSV file."""
+        reader = CSVReader()
+        urls = reader.read_urls(str(sample_csv_file), "url")
+        assert len(urls) == 2
+        assert "https://example.com/1.json" in urls
+        assert "https://example.com/2.json" in urls
+
+    def test_read_urls_custom_delimiter(self, tmp_path: Path):
+        """Test reading with custom delimiter (pipe)."""
+        path = tmp_path / "urls.psv"
+        path.write_text("url|name\nhttps://example.com/1.json|item1\n")
+        reader = CSVReader(delimiter="|")
+        urls = reader.read_urls(str(path), "url")
+        assert len(urls) == 1
+        assert urls[0] == "https://example.com/1.json"
+
+    def test_read_urls_tab_delimiter(self, tmp_path: Path):
+        """Test reading TSV file."""
+        path = tmp_path / "urls.tsv"
+        path.write_text("url\tname\nhttps://example.com/1.json\titem1\n")
+        reader = CSVReader(delimiter="\t")
+        urls = reader.read_urls(str(path), "url")
+        assert len(urls) == 1
+
+    def test_read_urls_empty_file(self, empty_csv_file: Path):
+        """Test reading empty CSV file (header only)."""
+        reader = CSVReader()
+        urls = reader.read_urls(str(empty_csv_file), "url")
+        assert len(urls) == 0
+
+    def test_read_urls_missing_column(self, sample_csv_file: Path):
+        """Test error when URL column doesn't exist."""
+        reader = CSVReader()
+        with pytest.raises(ValueError, match="must contain"):
+            reader.read_urls(str(sample_csv_file), "nonexistent")
+
+    def test_read_urls_with_null_values(self, tmp_path: Path):
+        """Test that null URL values are filtered out."""
+        path = tmp_path / "with_nulls.csv"
+        path.write_text("url,name\nhttps://example.com/1.json,item1\n,item2\nhttps://example.com/3.json,item3\n")
+        reader = CSVReader()
+        urls = reader.read_urls(str(path), "url")
+        # Should have 2 URLs (the empty one filtered out)
+        assert len(urls) == 2
+        assert "" not in urls
+
+    def test_validate_format_valid(self, sample_csv_file: Path):
+        """Test format validation for valid CSV."""
+        reader = CSVReader()
+        assert reader.validate_format(str(sample_csv_file)) is True
+
+    def test_validate_format_invalid_delimiter(self, tmp_path: Path):
+        """Test validation when delimiter doesn't match."""
+        path = tmp_path / "wrong_delim.csv"
+        path.write_text("url|name\nhttps://example.com/1.json|item1\n")
+        # With default comma delimiter, this might still "validate" as CSV
+        # but would have wrong structure
+        reader = CSVReader(delimiter=",")
+        # This should still return True as it's valid CSV (just single column)
+        assert reader.validate_format(str(path)) is True
+
+    def test_custom_quotechar(self, tmp_path: Path):
+        """Test reading with custom quote character."""
+        path = tmp_path / "quoted.csv"
+        path.write_text("url,name\n'https://example.com/1.json','item1'\n")
+        reader = CSVReader(quotechar="'")
+        urls = reader.read_urls(str(path), "url")
+        assert len(urls) == 1
+
+
+class TestNDJSONReader:
+    """Tests for NDJSONReader class."""
+
+    @pytest.fixture
+    def sample_ndjson_file(self, tmp_path: Path) -> Path:
+        """Create a temporary NDJSON file with URLs."""
+        path = tmp_path / "urls.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+            '{"url": "https://example.com/item3.json", "id": "item3"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def sample_jsonl_file(self, tmp_path: Path) -> Path:
+        """Create a temporary JSONL file with URLs."""
+        path = tmp_path / "urls.jsonl"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_comments(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with comment lines."""
+        path = tmp_path / "with_comments.ndjson"
+        path.write_text(
+            "# This is a comment\n"
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            "# Another comment\n"
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_empty_lines(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with empty lines."""
+        path = tmp_path / "with_empty.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            "\n"
+            "\n"
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_malformed(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with some malformed JSON."""
+        path = tmp_path / "with_malformed.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            "this is not json\n"
+            '{"url": "https://example.com/item2.json", "id": "item2"}\n'
+            '{"url": "https://example.com/item3.json", "id": "item3"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_custom_url_field(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with custom URL field name."""
+        path = tmp_path / "custom_field.ndjson"
+        path.write_text(
+            '{"stac_url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"stac_url": "https://example.com/item2.json", "id": "item2"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def ndjson_with_missing_field(self, tmp_path: Path) -> Path:
+        """Create NDJSON file where some objects are missing the URL field."""
+        path = tmp_path / "missing_field.ndjson"
+        path.write_text(
+            '{"url": "https://example.com/item1.json", "id": "item1"}\n'
+            '{"id": "item2"}\n'  # Missing url field
+            '{"url": "https://example.com/item3.json", "id": "item3"}\n'
+        )
+        return path
+
+    @pytest.fixture
+    def empty_ndjson_file(self, tmp_path: Path) -> Path:
+        """Create an empty NDJSON file."""
+        path = tmp_path / "empty.ndjson"
+        path.write_text("")
+        return path
+
+    @pytest.fixture
+    def ndjson_all_comments(self, tmp_path: Path) -> Path:
+        """Create NDJSON file with only comments."""
+        path = tmp_path / "all_comments.ndjson"
+        path.write_text("# Comment 1\n# Comment 2\n# Comment 3\n")
+        return path
+
+    def test_read_urls_basic(self, sample_ndjson_file: Path):
+        """Test reading URLs from NDJSON file."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(sample_ndjson_file), "url")
+        assert len(urls) == 3
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+        assert "https://example.com/item3.json" in urls
+
+    def test_read_urls_from_jsonl(self, sample_jsonl_file: Path):
+        """Test reading URLs from JSONL file (.jsonl extension)."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(sample_jsonl_file), "url")
+        assert len(urls) == 2
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+
+    def test_read_urls_with_comments(self, ndjson_with_comments: Path):
+        """Test that comment lines are skipped."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_comments), "url")
+        assert len(urls) == 2
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+
+    def test_read_urls_with_empty_lines(self, ndjson_with_empty_lines: Path):
+        """Test that empty lines are skipped."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_empty_lines), "url")
+        assert len(urls) == 2
+
+    def test_read_urls_with_malformed_json(self, ndjson_with_malformed: Path):
+        """Test that malformed JSON lines are skipped."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_malformed), "url")
+        # Should skip the malformed line
+        assert len(urls) == 3
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item2.json" in urls
+        assert "https://example.com/item3.json" in urls
+
+    def test_read_urls_custom_field(self, ndjson_custom_url_field: Path):
+        """Test reading from custom URL field name."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_custom_url_field), "stac_url")
+        assert len(urls) == 2
+        assert urls[0] == "https://example.com/item1.json"
+
+    def test_read_urls_missing_field(self, ndjson_with_missing_field: Path):
+        """Test handling when some objects are missing the URL field."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(ndjson_with_missing_field), "url")
+        # Should only get URLs from objects that have the field
+        assert len(urls) == 2
+        assert "https://example.com/item1.json" in urls
+        assert "https://example.com/item3.json" in urls
+
+    def test_read_urls_empty_file(self, empty_ndjson_file: Path):
+        """Test reading from empty NDJSON file."""
+        reader = NDJSONReader()
+        urls = reader.read_urls(str(empty_ndjson_file), "url")
+        assert len(urls) == 0
+        assert isinstance(urls, list)
+
+    def test_read_urls_all_comments(self, ndjson_all_comments: Path):
+        """Test file with only comment lines."""
+        reader = NDJSONReader()
+        with pytest.raises(ValueError, match="does not contain"):
+            reader.read_urls(str(ndjson_all_comments), "url")
+
+    def test_read_urls_file_not_found(self):
+        """Test error when file doesn't exist."""
+        reader = NDJSONReader()
+        with pytest.raises(FileNotFoundError):
+            reader.read_urls("/nonexistent/path/to/file.ndjson", "url")
+
+    def test_read_urls_empty_path(self):
+        """Test error when file path is empty."""
+        reader = NDJSONReader()
+        with pytest.raises(ValueError, match="file_path cannot be empty"):
+            reader.read_urls("", "url")
+
+    def test_read_urls_all_missing_field(self, tmp_path: Path):
+        """Test error when all objects are missing the URL field."""
+        path = tmp_path / "all_missing.ndjson"
+        path.write_text('{"id": "item1"}\n{"id": "item2"}\n')
+
+        reader = NDJSONReader()
+        with pytest.raises(ValueError, match="does not contain"):
+            reader.read_urls(str(path), "url")
+
+    def test_read_urls_preserves_order(self, tmp_path: Path):
+        """Test that URL order is preserved."""
+        urls = [f"https://example.com/item{i}.json" for i in range(100)]
+        lines = [f'{{"url": "{url}", "id": "item{i}"}}\n' for i, url in enumerate(urls)]
+
+        path = tmp_path / "ordered.ndjson"
+        path.write_text("".join(lines))
+
+        reader = NDJSONReader()
+        result = reader.read_urls(str(path), "url")
+        assert result == urls
+
+    def test_read_urls_large_file(self, tmp_path: Path):
+        """Test reading larger NDJSON files."""
+        # Create file with 10000 lines
+        lines = [f'{{"url": "https://example.com/item{i}.json"}}\n' for i in range(10000)]
+        path = tmp_path / "large.ndjson"
+        path.write_text("".join(lines))
+
+        reader = NDJSONReader()
+        result = reader.read_urls(str(path), "url")
+        assert len(result) == 10000
+        assert result[0] == "https://example.com/item0.json"
+        assert result[-1] == "https://example.com/item9999.json"
+
+    def test_validate_format_valid(self, sample_ndjson_file: Path):
+        """Test format validation for valid NDJSON."""
+        reader = NDJSONReader()
+        assert reader.validate_format(str(sample_ndjson_file)) is True
+
+    def test_validate_format_valid_jsonl(self, sample_jsonl_file: Path):
+        """Test format validation for JSONL files."""
+        reader = NDJSONReader()
+        assert reader.validate_format(str(sample_jsonl_file)) is True
+
+    def test_validate_format_invalid_json(self, tmp_path: Path):
+        """Test format validation for invalid JSON."""
+        invalid = tmp_path / "invalid.ndjson"
+        invalid.write_text("this is not json\nnot json either\n")
+        reader = NDJSONReader()
+        assert reader.validate_format(str(invalid)) is False
+
+    def test_validate_format_nonexistent(self):
+        """Test format validation for nonexistent file."""
+        reader = NDJSONReader()
+        assert reader.validate_format("/nonexistent/file.ndjson") is False
+
+    def test_validate_format_empty_file(self, empty_ndjson_file: Path):
+        """Test format validation for empty file."""
+        reader = NDJSONReader()
+        # Empty file has no valid JSON, so should be False
+        assert reader.validate_format(str(empty_ndjson_file)) is False
+
+    def test_validate_format_with_comments(self, ndjson_with_comments: Path):
+        """Test validation works with comment lines."""
+        reader = NDJSONReader()
+        # Should detect valid JSON after comments
+        assert reader.validate_format(str(ndjson_with_comments)) is True
+
+
+class TestReaderFactory:
+    """Tests for ReaderFactory class."""
+
+    @pytest.mark.parametrize(
+        "format_name,expected_type,expected_delimiter",
+        [
+            ("parquet", ParquetReader, None),
+            ("csv", CSVReader, ","),
+            ("tsv", CSVReader, "\t"),
+            ("ndjson", NDJSONReader, None),
+            ("jsonl", NDJSONReader, None),
+            ("PARQUET", ParquetReader, None),  # case insensitive
+            ("NDJSON", NDJSONReader, None),  # case insensitive
+            ("JSONL", NDJSONReader, None),  # case insensitive
+        ],
+        ids=["parquet", "csv", "tsv", "ndjson", "jsonl", "parquet_uppercase", "ndjson_uppercase", "jsonl_uppercase"],
+    )
+    def test_get_reader(self, format_name, expected_type, expected_delimiter):
+        """Test getting reader for various format types."""
+        reader = ReaderFactory.get_reader(format_name)
+        assert isinstance(reader, expected_type)
+        if expected_delimiter is not None:
+            assert reader.delimiter == expected_delimiter
+
+    def test_get_reader_unknown_format(self):
+        """Test error for unknown format."""
+        with pytest.raises(ValueError, match="Unsupported input format"):
+            ReaderFactory.get_reader("unknown_format")
+
+    @pytest.mark.parametrize(
+        "file_path,expected_format",
+        [
+            ("data.parquet", "parquet"),
+            ("s3://bucket/data.parquet", "parquet"),
+            ("/path/to/DATA.PARQUET", "parquet"),
+            ("data.csv", "csv"),
+            ("s3://bucket/data.CSV", "csv"),
+            ("data.tsv", "tsv"),
+            ("data.ndjson", "ndjson"),
+            ("s3://bucket/data.NDJSON", "ndjson"),
+            ("data.jsonl", "jsonl"),
+            ("/path/to/data.JSONL", "jsonl"),
+            ("data.unknown", "csv"),  # defaults to csv
+        ],
+        ids=[
+            "parquet",
+            "parquet_s3",
+            "parquet_upper",
+            "csv",
+            "csv_s3",
+            "tsv",
+            "ndjson",
+            "ndjson_s3",
+            "jsonl",
+            "jsonl_upper",
+            "unknown_defaults_csv",
+        ],
+    )
+    def test_auto_detect_format(self, file_path, expected_format):
+        """Test auto-detection of file formats."""
+        assert ReaderFactory.auto_detect_format(file_path) == expected_format
+
+    def test_get_supported_formats(self):
+        """Test list of supported formats."""
+        formats = ReaderFactory.get_supported_formats()
+        assert "parquet" in formats
+        assert "csv" in formats
+        assert "tsv" in formats
+        assert "ndjson" in formats
+        assert "jsonl" in formats
+        assert isinstance(formats, list)
+
+
+class TestInputReaderInterface:
+    """Tests for InputReader abstract interface compliance."""
+
+    @pytest.mark.parametrize(
+        "reader_class",
+        [ParquetReader, CSVReader, NDJSONReader],
+        ids=["parquet_reader", "csv_reader", "ndjson_reader"],
+    )
+    def test_reader_implements_interface(self, reader_class):
+        """Test that readers implement InputReader interface."""
+        reader = reader_class()
+        assert isinstance(reader, InputReader)
+        assert hasattr(reader, "read_urls")
+        assert hasattr(reader, "validate_format")
+        assert callable(reader.read_urls)
+        assert callable(reader.validate_format)
+
+
+class TestReaderIntegration:
+    """Integration tests for reader workflows."""
+
+    def test_factory_then_read_parquet(self, tmp_path: Path):
+        """Test complete workflow: factory -> read parquet."""
+        # Create test file
+        df = pd.DataFrame({"stac_url": ["https://example.com/item.json"]})
+        path = tmp_path / "test.parquet"
+        df.to_parquet(path, index=False)
+
+        # Use factory to get reader and read
+        reader = ReaderFactory.get_reader("parquet")
+        urls = reader.read_urls(str(path), "stac_url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_factory_then_read_csv(self, tmp_path: Path):
+        """Test complete workflow: factory -> read csv."""
+        path = tmp_path / "test.csv"
+        path.write_text("stac_url\nhttps://example.com/item.json\n")
+
+        reader = ReaderFactory.get_reader("csv")
+        urls = reader.read_urls(str(path), "stac_url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_factory_then_read_ndjson(self, tmp_path: Path):
+        """Test complete workflow: factory -> read ndjson."""
+        path = tmp_path / "test.ndjson"
+        path.write_text('{"stac_url": "https://example.com/item.json"}\n')
+
+        reader = ReaderFactory.get_reader("ndjson")
+        urls = reader.read_urls(str(path), "stac_url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_factory_auto_detect_ndjson(self, tmp_path: Path):
+        """Test auto-detection workflow for ndjson files."""
+        path = tmp_path / "data.jsonl"
+        path.write_text('{"url": "https://example.com/item.json"}\n')
+
+        # Auto-detect should pick jsonl format
+        format_detected = ReaderFactory.auto_detect_format(str(path))
+        assert format_detected == "jsonl"
+
+        # Get reader and read
+        reader = ReaderFactory.get_reader(format_detected)
+        urls = reader.read_urls(str(path), "url")
+        assert urls == ["https://example.com/item.json"]
+
+    def test_large_file_handling(self, tmp_path: Path):
+        """Test reading larger files works correctly."""
+        # Create file with 10000 URLs
+        urls = [f"https://example.com/item{i}.json" for i in range(10000)]
+        df = pd.DataFrame({"url": urls})
+        path = tmp_path / "large.parquet"
+        df.to_parquet(path, index=False)
+
+        reader = ParquetReader()
+        result = reader.read_urls(str(path), "url")
+        assert len(result) == 10000
+        assert result[0] == "https://example.com/item0.json"
+        assert result[-1] == "https://example.com/item9999.json"
+
+
+class TestCloudStorageMocking:
+    """Tests for cloud storage integration (mocked)."""
+
+    @patch("earthcatalog.input_readers.HAS_FSSPEC", True)
+    @patch("earthcatalog.input_readers.fsspec")
+    def test_parquet_s3_path_detection(self, mock_fsspec):
+        """Test that S3 paths trigger fsspec usage."""
+        # Setup mock
+        mock_fs = MagicMock()
+        mock_fsspec.filesystem.return_value = mock_fs
+
+        # Create a mock file object that returns parquet data
+        mock_file = MagicMock()
+        mock_fs.open.return_value.__enter__ = MagicMock(return_value=mock_file)
+        mock_fs.open.return_value.__exit__ = MagicMock(return_value=False)
+
+        reader = ParquetReader()
+
+        # This should detect S3 and use fsspec
+        try:
+            reader.read_urls("s3://bucket/data.parquet", "url")
+        except (ValueError, TypeError, OSError):
+            # Expected to fail since mock doesn't return real data
+            pass
+
+        # Verify fsspec was called with s3
+        mock_fsspec.filesystem.assert_called_once_with("s3")
+
+    @patch("earthcatalog.input_readers.HAS_FSSPEC", True)
+    @patch("earthcatalog.input_readers.fsspec")
+    def test_csv_s3_path_detection(self, mock_fsspec):
+        """Test that S3 paths trigger fsspec usage for CSV."""
+        mock_fs = MagicMock()
+        mock_fsspec.filesystem.return_value = mock_fs
+
+        mock_file = MagicMock()
+        mock_fs.open.return_value.__enter__ = MagicMock(return_value=mock_file)
+        mock_fs.open.return_value.__exit__ = MagicMock(return_value=False)
+
+        reader = CSVReader()
+
+        try:
+            reader.read_urls("s3://bucket/data.csv", "url")
+        except (ValueError, TypeError, OSError):
+            pass
+
+        mock_fsspec.filesystem.assert_called_once_with("s3")
+
+    @patch("earthcatalog.input_readers.HAS_FSSPEC", True)
+    @patch("earthcatalog.input_readers.fsspec")
+    def test_ndjson_s3_path_detection(self, mock_fsspec):
+        """Test that S3 paths trigger fsspec usage for NDJSON."""
+        mock_fs = MagicMock()
+        mock_fsspec.filesystem.return_value = mock_fs
+
+        mock_file = MagicMock()
+        mock_fs.open.return_value.__enter__ = MagicMock(return_value=mock_file)
+        mock_fs.open.return_value.__exit__ = MagicMock(return_value=False)
+
+        reader = NDJSONReader()
+
+        try:
+            reader.read_urls("s3://bucket/data.ndjson", "url")
+        except (ValueError, TypeError, OSError):
+            pass
+
+        mock_fsspec.filesystem.assert_called_once_with("s3")
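
Read together, these tests pin down the reader workflow: `ReaderFactory.auto_detect_format` maps a file extension to a format name (falling back to `"csv"` for unknown extensions), `ReaderFactory.get_reader` returns the matching reader case-insensitively, `validate_format` returns `False` rather than raising for missing or non-conforming files, and `read_urls` extracts URLs in file order from a caller-named column or field. A minimal sketch of that workflow, assuming only the behavior asserted above; the `manifest.jsonl` path and the `url` field name are illustrative, not names shipped by the package:

```python
# Sketch of the reader workflow exercised by the tests above.
# "manifest.jsonl" and the "url" field are hypothetical inputs.
from earthcatalog.input_readers import ReaderFactory

input_path = "manifest.jsonl"  # hypothetical; .jsonl maps to the NDJSON reader

# Extension-based detection; unknown extensions default to "csv".
fmt = ReaderFactory.auto_detect_format(input_path)

# Case-insensitive lookup; raises ValueError ("Unsupported input format")
# for anything outside get_supported_formats().
reader = ReaderFactory.get_reader(fmt)

# validate_format returns False instead of raising, so it can gate the read.
if reader.validate_format(input_path):
    # Returns URLs in file order; raises ValueError if the field is absent.
    urls = reader.read_urls(input_path, "url")
    print(f"loaded {len(urls)} URLs")
```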