udata-hydra 2.2.1.dev7337__tar.gz → 2.2.1.dev7367__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (73)
  1. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/PKG-INFO +1 -1
  2. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/pyproject.toml +1 -1
  3. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/geojson.py +0 -1
  4. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/cli.py +84 -12
  5. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/README.md +0 -0
  6. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/__init__.py +0 -0
  7. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/__init__.py +0 -0
  8. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/csv.py +0 -0
  9. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/helpers.py +0 -0
  10. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/resource.py +0 -0
  11. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/app.py +0 -0
  12. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/config_default.toml +0 -0
  13. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/context.py +0 -0
  14. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/__init__.py +0 -0
  15. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/calculate_next_check.py +0 -0
  16. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/check_resources.py +0 -0
  17. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/helpers.py +0 -0
  18. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/preprocess_check_data.py +0 -0
  19. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/select_batch.py +0 -0
  20. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/__init__.py +0 -0
  21. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/check.py +0 -0
  22. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/resource.py +0 -0
  23. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/resource_exception.py +0 -0
  24. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/logger.py +0 -0
  25. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/__init__.py +0 -0
  26. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  27. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  28. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  29. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  30. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  31. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  32. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  33. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  34. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  35. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  36. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  37. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  38. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  39. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  40. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  41. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  42. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  43. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  44. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  45. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  46. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  47. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  48. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
  49. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
  50. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +0 -0
  51. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/__init__.py +0 -0
  52. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/checks.py +0 -0
  53. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/resources.py +0 -0
  54. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/resources_exceptions.py +0 -0
  55. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/status.py +0 -0
  56. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/__init__.py +0 -0
  57. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/check.py +0 -0
  58. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/resource.py +0 -0
  59. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/resource_exception.py +0 -0
  60. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/__init__.py +0 -0
  61. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/auth.py +0 -0
  62. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/csv.py +0 -0
  63. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/db.py +0 -0
  64. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/errors.py +0 -0
  65. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/file.py +0 -0
  66. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/geojson.py +0 -0
  67. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/http.py +0 -0
  68. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/minio.py +0 -0
  69. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/parquet.py +0 -0
  70. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/queue.py +0 -0
  71. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/reader.py +0 -0
  72. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/timer.py +0 -0
  73. {udata_hydra-2.2.1.dev7337 → udata_hydra-2.2.1.dev7367}/udata_hydra/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: udata-hydra
3
- Version: 2.2.1.dev7337
3
+ Version: 2.2.1.dev7367
4
4
  Summary: Async crawler and parsing service for data.gouv.fr
5
5
  License: MIT
6
6
  Author: Opendata Team
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "udata-hydra"
3
- version = "2.2.1.dev7337"
3
+ version = "2.2.1.dev7367"
4
4
  description = "Async crawler and parsing service for data.gouv.fr"
5
5
  authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
6
6
  dependencies = [
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  import os
3
- import subprocess
4
3
  from datetime import datetime, timezone
5
4
 
6
5
  import tippecanoe
@@ -1,7 +1,7 @@
1
1
  import csv
2
2
  import logging
3
3
  import os
4
- from datetime import datetime, timezone
4
+ from datetime import datetime, timedelta, timezone
5
5
  from pathlib import Path
6
6
  from tempfile import NamedTemporaryFile
7
7
 
@@ -15,6 +15,7 @@ from progressist import ProgressBar
15
15
  from udata_hydra import config
16
16
  from udata_hydra.analysis.csv import analyse_csv
17
17
  from udata_hydra.analysis.geojson import analyse_geojson
18
+ from udata_hydra.analysis.resource import analyse_resource
18
19
  from udata_hydra.crawl.check_resources import check_resource as crawl_check_resource
19
20
  from udata_hydra.db.check import Check
20
21
  from udata_hydra.db.resource import Resource
@@ -159,6 +160,16 @@ async def check_resource(resource_id: str, method: str = "get", force_analysis:
159
160
  )
160
161
 
161
162
 
163
+ @cli(name="analyse-resource")
164
+ async def analyse_resource_cli(resource_id: str):
165
+ """Trigger a resource analysis, mainly useful for local debug (with breakpoints)"""
166
+ check: Record | None = await Check.get_by_resource_id(resource_id)
167
+ if not check:
168
+ log.error("Could not find a check linked to the specified resource ID")
169
+ return
170
+ await analyse_resource(check=check, last_check=None, force_analysis=True)
171
+
172
+
162
173
  @cli(name="analyse-csv")
163
174
  async def analyse_csv_cli(
164
175
  check_id: str | None = None,
@@ -349,23 +360,27 @@ async def purge_csv_tables(quiet: bool = False) -> None:
349
360
  ON checks.parsing_table = md5(c.url)
350
361
  WHERE checks.parsing_table IS NOT NULL AND (c.id IS NULL OR c.deleted = TRUE);
351
362
  """
352
- conn = await connection()
353
- res: list[Record] = await conn.fetch(q)
363
+ conn_main = await connection()
364
+ res: list[Record] = await conn_main.fetch(q)
354
365
  tables_to_delete: list[str] = [r["parsing_table"] for r in res]
355
366
 
356
367
  success_count = 0
357
368
  error_count = 0
358
369
 
370
+ conn_csv = await connection(db_name="csv")
359
371
  for table in tables_to_delete:
360
372
  try:
361
- async with conn.transaction():
362
- log.debug(f'Deleting table "{table}"')
363
- await conn.execute(f'DROP TABLE IF EXISTS "{table}"')
364
- await conn.execute("DELETE FROM tables_index WHERE parsing_table = $1", table)
365
- await conn.execute(
366
- "UPDATE checks SET parsing_table = NULL WHERE parsing_table = $1", table
367
- )
368
- success_count += 1
373
+ async with conn_main.transaction():
374
+ async with conn_csv.transaction():
375
+ log.debug(f'Deleting table "{table}"')
376
+ await conn_csv.execute(f'DROP TABLE IF EXISTS "{table}"')
377
+ await conn_main.execute(
378
+ "DELETE FROM tables_index WHERE parsing_table = $1", table
379
+ )
380
+ await conn_main.execute(
381
+ "UPDATE checks SET parsing_table = NULL WHERE parsing_table = $1", table
382
+ )
383
+ success_count += 1
369
384
  except Exception as e:
370
385
  error_count += 1
371
386
  log.error(f'Failed to delete table "{table}": {str(e)}')
@@ -417,7 +432,10 @@ async def insert_resource_into_catalog(resource_id: str):
417
432
  datetime.fromisoformat(resource["resource"]["harvest"]["modified_at"]).replace(
418
433
  tzinfo=timezone.utc
419
434
  )
420
- if resource["resource"].get("harvest")
435
+ if (
436
+ resource["resource"].get("harvest") is not None
437
+ and resource["resource"]["harvest"].get("modified_at")
438
+ )
421
439
  else None,
422
440
  )
423
441
  log.info(f"Resource {resource_id} successfully {action}ed into DB.")
@@ -425,6 +443,60 @@ async def insert_resource_into_catalog(resource_id: str):
425
443
  raise e
426
444
 
427
445
 
446
+ @cli
447
+ async def purge_selected_csv_tables(
448
+ nb_days_to_keep: int | None = None,
449
+ nb_tables_to_keep: int | None = None,
450
+ quiet: bool = False,
451
+ ) -> None:
452
+ """Delete converted CSV tables either:
453
+ - if they're more than nb_days_to_keep days old
454
+ - if they're not in the top nb_table_to_keep most recent
455
+ """
456
+ if quiet:
457
+ log.setLevel(logging.ERROR)
458
+
459
+ assert nb_days_to_keep is not None or nb_tables_to_keep is not None
460
+ conn_csv = await connection(db_name="csv")
461
+ if nb_days_to_keep is not None:
462
+ threshold = datetime.now(timezone.utc) - timedelta(days=int(nb_days_to_keep))
463
+ q = """SELECT parsing_table FROM tables_index WHERE created_at <= $1"""
464
+ res: list[Record] = await conn_csv.fetch(q, threshold)
465
+ elif nb_tables_to_keep is not None:
466
+ q = """SELECT parsing_table FROM tables_index ORDER BY created_at DESC OFFSET $1"""
467
+ res: list[Record] = await conn_csv.fetch(q, int(nb_tables_to_keep))
468
+
469
+ tables_to_delete: list[str] = [r["parsing_table"] for r in res]
470
+
471
+ success_count = 0
472
+ error_count = 0
473
+ conn_main = await connection()
474
+ for table in tables_to_delete:
475
+ try:
476
+ async with conn_main.transaction():
477
+ async with conn_csv.transaction():
478
+ log.debug(f'Deleting table "{table}"')
479
+ await conn_csv.execute(f'DROP TABLE IF EXISTS "{table}"')
480
+ await conn_csv.execute(
481
+ "DELETE FROM tables_index WHERE parsing_table = $1", table
482
+ )
483
+ await conn_main.execute(
484
+ "UPDATE checks SET parsing_table = NULL WHERE parsing_table = $1", table
485
+ )
486
+ success_count += 1
487
+ except Exception as e:
488
+ error_count += 1
489
+ log.error(f'Failed to delete table "{table}": {str(e)}')
490
+ continue
491
+
492
+ if success_count:
493
+ log.info(f"Successfully deleted {success_count} table(s).")
494
+ if error_count:
495
+ log.warning(f"Failed to delete {error_count} table(s). Check logs for details.")
496
+ if not (success_count or error_count):
497
+ log.info("Nothing to delete.")
498
+
499
+
428
500
  @wrap
429
501
  async def cli_wrapper():
430
502
  context["conn"] = {}