udata-hydra 2.1.3.dev7106__tar.gz → 2.1.3.dev7204__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/PKG-INFO +2 -2
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/pyproject.toml +2 -2
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/csv.py +3 -5
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/resource.py +29 -14
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/minio.py +2 -3
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/README.md +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/helpers.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/app.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/cli.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/config_default.toml +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/context.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/calculate_next_check.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/check_resources.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/helpers.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/preprocess_check_data.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/select_batch.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/check.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/resource.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/resource_exception.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/logger.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/checks.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/resources.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/resources_exceptions.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/status.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/check.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/resource.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/resource_exception.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/__init__.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/auth.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/csv.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/db.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/errors.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/file.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/http.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/parquet.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/queue.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/reader.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/timer.py +0 -0
- {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: udata-hydra
|
|
3
|
-
Version: 2.1.3.dev7106
|
|
3
|
+
Version: 2.1.3.dev7204
|
|
4
4
|
Summary: Async crawler and parsing service for data.gouv.fr
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Opendata Team
|
|
@@ -18,7 +18,7 @@ Requires-Dist: aioresponses (>=0.7.3) ; extra == "dev"
|
|
|
18
18
|
Requires-Dist: asyncpg (>=0.29.0)
|
|
19
19
|
Requires-Dist: bumpx (>=0.3.10) ; extra == "dev"
|
|
20
20
|
Requires-Dist: coloredlogs (>=15.0.1)
|
|
21
|
-
Requires-Dist: csv-detective (==0.7.
|
|
21
|
+
Requires-Dist: csv-detective (==0.7.4)
|
|
22
22
|
Requires-Dist: dateparser (>=1.1.7)
|
|
23
23
|
Requires-Dist: gunicorn (>=20.1.0) ; extra == "dev"
|
|
24
24
|
Requires-Dist: humanfriendly (>=10.0)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "udata-hydra"
|
|
3
|
-
version = "2.1.3.dev7106"
|
|
3
|
+
version = "2.1.3.dev7204"
|
|
4
4
|
description = "Async crawler and parsing service for data.gouv.fr"
|
|
5
5
|
authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
|
|
6
6
|
dependencies = [
|
|
@@ -8,7 +8,7 @@ dependencies = [
|
|
|
8
8
|
"aiohttp>=3.10.3",
|
|
9
9
|
"asyncpg>=0.29.0",
|
|
10
10
|
"coloredlogs>=15.0.1",
|
|
11
|
-
"csv-detective==0.7.
|
|
11
|
+
"csv-detective==0.7.4",
|
|
12
12
|
"dateparser>=1.1.7",
|
|
13
13
|
"humanfriendly>=10.0",
|
|
14
14
|
"marshmallow>=3.14.1",
|
|
@@ -183,7 +183,6 @@ async def analyse_csv(
|
|
|
183
183
|
parquet_args: tuple[str, int] | None = await csv_to_parquet(
|
|
184
184
|
file_path=tmp_file.name,
|
|
185
185
|
inspection=csv_inspection,
|
|
186
|
-
table_name=table_name,
|
|
187
186
|
resource_id=resource_id,
|
|
188
187
|
)
|
|
189
188
|
timer.mark("csv-to-parquet")
|
|
@@ -294,7 +293,6 @@ def generate_records(file_path: str, inspection: dict, columns: dict) -> Iterato
|
|
|
294
293
|
async def csv_to_parquet(
|
|
295
294
|
file_path: str,
|
|
296
295
|
inspection: dict,
|
|
297
|
-
table_name: str,
|
|
298
296
|
resource_id: str | None = None,
|
|
299
297
|
) -> tuple[str, int] | None:
|
|
300
298
|
"""
|
|
@@ -315,13 +313,13 @@ async def csv_to_parquet(
|
|
|
315
313
|
|
|
316
314
|
if int(inspection.get("total_lines", 0)) < config.MIN_LINES_FOR_PARQUET:
|
|
317
315
|
log.debug(
|
|
318
|
-
f"Skipping parquet export for {table_name} because it has less than {config.MIN_LINES_FOR_PARQUET} lines."
|
|
316
|
+
f"Skipping parquet export for {resource_id} because it has less than {config.MIN_LINES_FOR_PARQUET} lines."
|
|
319
317
|
)
|
|
320
318
|
return
|
|
321
319
|
|
|
322
320
|
log.debug(
|
|
323
321
|
f"Converting from {engine_to_file.get(inspection.get('engine', ''), 'CSV')} "
|
|
324
|
-
f"to parquet for {table_name} and sending to Minio."
|
|
322
|
+
f"to parquet for {resource_id} and sending to Minio."
|
|
325
323
|
)
|
|
326
324
|
|
|
327
325
|
if resource_id:
|
|
@@ -333,7 +331,7 @@ async def csv_to_parquet(
|
|
|
333
331
|
parquet_file, _ = save_as_parquet(
|
|
334
332
|
records=generate_records(file_path, inspection, columns),
|
|
335
333
|
columns=columns,
|
|
336
|
-
output_filename=table_name,
|
|
334
|
+
output_filename=resource_id,
|
|
337
335
|
)
|
|
338
336
|
parquet_size: int = os.path.getsize(parquet_file)
|
|
339
337
|
parquet_url: str = minio_client.send_file(parquet_file)
|
|
@@ -187,11 +187,18 @@ async def detect_resource_change_from_checksum(
|
|
|
187
187
|
"analysis:last-modified-detection": "computed-checksum",
|
|
188
188
|
}
|
|
189
189
|
"""
|
|
190
|
-
if last_check and last_check.get("checksum")
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
190
|
+
if last_check and last_check.get("checksum"):
|
|
191
|
+
if last_check.get("checksum") != new_checksum:
|
|
192
|
+
return Change.HAS_CHANGED, {
|
|
193
|
+
"analysis:last-modified-at": datetime.now(timezone.utc).isoformat(),
|
|
194
|
+
"analysis:last-modified-detection": "computed-checksum",
|
|
195
|
+
}
|
|
196
|
+
elif last_check.get("detected_last_modified_at"):
|
|
197
|
+
return Change.HAS_NOT_CHANGED, {
|
|
198
|
+
"analysis:last-modified-at": last_check["detected_last_modified_at"].isoformat(),
|
|
199
|
+
"analysis:last-modified-detection": "previous-check-detection",
|
|
200
|
+
}
|
|
201
|
+
# if the previous check did not have the info, we investigate further
|
|
195
202
|
return Change.NO_GUESS, None
|
|
196
203
|
|
|
197
204
|
|
|
@@ -224,16 +231,24 @@ async def detect_resource_change_from_content_length_header(
|
|
|
224
231
|
data: dict,
|
|
225
232
|
) -> tuple[Change, dict | None]:
|
|
226
233
|
# content-length variation between current and last check
|
|
227
|
-
if len(data) <= 1 or not data[0]
|
|
234
|
+
if len(data) <= 1 or not data[0].get("content_length"):
|
|
228
235
|
return Change.NO_GUESS, None
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
236
|
+
if data[0].get("content_length") and data[1].get("content_length"):
|
|
237
|
+
if data[0]["content_length"] != data[1]["content_length"]:
|
|
238
|
+
return Change.HAS_CHANGED, {
|
|
239
|
+
# if resource has changed, set last-modified to the current check's creation
|
|
240
|
+
"analysis:last-modified-at": data[0]["created_at"].isoformat(),
|
|
241
|
+
"analysis:last-modified-detection": "content-length-header",
|
|
242
|
+
}
|
|
243
|
+
# same content_length is not 100% certainly no change, but a good tradeoff to prevent many downloads
|
|
244
|
+
elif data[1].get("detected_last_modified_at"):
|
|
245
|
+
return Change.HAS_NOT_CHANGED, {
|
|
246
|
+
# no change, using the last-modified from the previous check (passed on from check to check)
|
|
247
|
+
"analysis:last-modified-at": data[1]["detected_last_modified_at"].isoformat(),
|
|
248
|
+
"analysis:last-modified-detection": "previous-check-detection",
|
|
249
|
+
}
|
|
250
|
+
# if the previous check did not have the info, we investigate further
|
|
251
|
+
return Change.NO_GUESS, None
|
|
237
252
|
|
|
238
253
|
|
|
239
254
|
async def detect_resource_change_on_early_hints(
|
|
@@ -10,12 +10,11 @@ log = logging.getLogger("udata-hydra")
|
|
|
10
10
|
|
|
11
11
|
class MinIOClient:
|
|
12
12
|
def __init__(self, bucket=config.MINIO_BUCKET):
|
|
13
|
-
self.url = config.MINIO_URL
|
|
14
13
|
self.user = config.MINIO_USER
|
|
15
14
|
self.password = config.MINIO_PWD
|
|
16
15
|
self.bucket = bucket
|
|
17
16
|
self.client = Minio(
|
|
18
|
-
|
|
17
|
+
config.MINIO_URL or "test",
|
|
19
18
|
access_key=self.user or "test",
|
|
20
19
|
secret_key=self.password or "test",
|
|
21
20
|
secure=True,
|
|
@@ -40,6 +39,6 @@ class MinIOClient:
|
|
|
40
39
|
)
|
|
41
40
|
if delete_source:
|
|
42
41
|
os.remove(file_name)
|
|
43
|
-
return f"https://{self.url}/{self.bucket}/{config.MINIO_FOLDER}/{file_name}"
|
|
42
|
+
return f"https://{config.MINIO_URL}/{self.bucket}/{config.MINIO_FOLDER}/{file_name}"
|
|
44
43
|
else:
|
|
45
44
|
raise Exception(f"file '{file_name}' does not exists")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/calculate_next_check.py
RENAMED
|
File without changes
|
{udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/check_resources.py
RENAMED
|
File without changes
|
|
File without changes
|
{udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/preprocess_check_data.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/resources_exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|