udata-hydra 2.2.1.dev7347__tar.gz → 2.2.1.dev7367__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/PKG-INFO +1 -1
  2. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/pyproject.toml +1 -1
  3. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/cli.py +69 -11
  4. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/README.md +0 -0
  5. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/__init__.py +0 -0
  6. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/__init__.py +0 -0
  7. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/csv.py +0 -0
  8. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/geojson.py +0 -0
  9. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/helpers.py +0 -0
  10. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/analysis/resource.py +0 -0
  11. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/app.py +0 -0
  12. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/config_default.toml +0 -0
  13. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/context.py +0 -0
  14. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/__init__.py +0 -0
  15. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/calculate_next_check.py +0 -0
  16. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/check_resources.py +0 -0
  17. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/helpers.py +0 -0
  18. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/preprocess_check_data.py +0 -0
  19. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/crawl/select_batch.py +0 -0
  20. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/__init__.py +0 -0
  21. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/check.py +0 -0
  22. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/resource.py +0 -0
  23. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/db/resource_exception.py +0 -0
  24. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/logger.py +0 -0
  25. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/__init__.py +0 -0
  26. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  27. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  28. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  29. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  30. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  31. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  32. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  33. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  34. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  35. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  36. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  37. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  38. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  39. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  40. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  41. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  42. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  43. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  44. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  45. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  46. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  47. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  48. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
  49. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
  50. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +0 -0
  51. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/__init__.py +0 -0
  52. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/checks.py +0 -0
  53. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/resources.py +0 -0
  54. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/resources_exceptions.py +0 -0
  55. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/routes/status.py +0 -0
  56. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/__init__.py +0 -0
  57. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/check.py +0 -0
  58. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/resource.py +0 -0
  59. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/schemas/resource_exception.py +0 -0
  60. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/__init__.py +0 -0
  61. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/auth.py +0 -0
  62. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/csv.py +0 -0
  63. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/db.py +0 -0
  64. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/errors.py +0 -0
  65. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/file.py +0 -0
  66. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/geojson.py +0 -0
  67. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/http.py +0 -0
  68. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/minio.py +0 -0
  69. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/parquet.py +0 -0
  70. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/queue.py +0 -0
  71. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/reader.py +0 -0
  72. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/utils/timer.py +0 -0
  73. {udata_hydra-2.2.1.dev7347 → udata_hydra-2.2.1.dev7367}/udata_hydra/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: udata-hydra
3
- Version: 2.2.1.dev7347
3
+ Version: 2.2.1.dev7367
4
4
  Summary: Async crawler and parsing service for data.gouv.fr
5
5
  License: MIT
6
6
  Author: Opendata Team
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "udata-hydra"
3
- version = "2.2.1.dev7347"
3
+ version = "2.2.1.dev7367"
4
4
  description = "Async crawler and parsing service for data.gouv.fr"
5
5
  authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
6
6
  dependencies = [
@@ -1,7 +1,7 @@
1
1
  import csv
2
2
  import logging
3
3
  import os
4
- from datetime import datetime, timezone
4
+ from datetime import datetime, timedelta, timezone
5
5
  from pathlib import Path
6
6
  from tempfile import NamedTemporaryFile
7
7
 
@@ -360,23 +360,27 @@ async def purge_csv_tables(quiet: bool = False) -> None:
360
360
  ON checks.parsing_table = md5(c.url)
361
361
  WHERE checks.parsing_table IS NOT NULL AND (c.id IS NULL OR c.deleted = TRUE);
362
362
  """
363
- conn = await connection()
364
- res: list[Record] = await conn.fetch(q)
363
+ conn_main = await connection()
364
+ res: list[Record] = await conn_main.fetch(q)
365
365
  tables_to_delete: list[str] = [r["parsing_table"] for r in res]
366
366
 
367
367
  success_count = 0
368
368
  error_count = 0
369
369
 
370
+ conn_csv = await connection(db_name="csv")
370
371
  for table in tables_to_delete:
371
372
  try:
372
- async with conn.transaction():
373
- log.debug(f'Deleting table "{table}"')
374
- await conn.execute(f'DROP TABLE IF EXISTS "{table}"')
375
- await conn.execute("DELETE FROM tables_index WHERE parsing_table = $1", table)
376
- await conn.execute(
377
- "UPDATE checks SET parsing_table = NULL WHERE parsing_table = $1", table
378
- )
379
- success_count += 1
373
+ async with conn_main.transaction():
374
+ async with conn_csv.transaction():
375
+ log.debug(f'Deleting table "{table}"')
376
+ await conn_csv.execute(f'DROP TABLE IF EXISTS "{table}"')
377
+ await conn_main.execute(
378
+ "DELETE FROM tables_index WHERE parsing_table = $1", table
379
+ )
380
+ await conn_main.execute(
381
+ "UPDATE checks SET parsing_table = NULL WHERE parsing_table = $1", table
382
+ )
383
+ success_count += 1
380
384
  except Exception as e:
381
385
  error_count += 1
382
386
  log.error(f'Failed to delete table "{table}": {str(e)}')
@@ -439,6 +443,60 @@ async def insert_resource_into_catalog(resource_id: str):
439
443
  raise e
440
444
 
441
445
 
446
+ @cli
447
+ async def purge_selected_csv_tables(
448
+ nb_days_to_keep: int | None = None,
449
+ nb_tables_to_keep: int | None = None,
450
+ quiet: bool = False,
451
+ ) -> None:
452
+ """Delete converted CSV tables either:
453
+ - if they're more than nb_days_to_keep days old
454
+ - if they're not in the top nb_table_to_keep most recent
455
+ """
456
+ if quiet:
457
+ log.setLevel(logging.ERROR)
458
+
459
+ assert nb_days_to_keep is not None or nb_tables_to_keep is not None
460
+ conn_csv = await connection(db_name="csv")
461
+ if nb_days_to_keep is not None:
462
+ threshold = datetime.now(timezone.utc) - timedelta(days=int(nb_days_to_keep))
463
+ q = """SELECT parsing_table FROM tables_index WHERE created_at <= $1"""
464
+ res: list[Record] = await conn_csv.fetch(q, threshold)
465
+ elif nb_tables_to_keep is not None:
466
+ q = """SELECT parsing_table FROM tables_index ORDER BY created_at DESC OFFSET $1"""
467
+ res: list[Record] = await conn_csv.fetch(q, int(nb_tables_to_keep))
468
+
469
+ tables_to_delete: list[str] = [r["parsing_table"] for r in res]
470
+
471
+ success_count = 0
472
+ error_count = 0
473
+ conn_main = await connection()
474
+ for table in tables_to_delete:
475
+ try:
476
+ async with conn_main.transaction():
477
+ async with conn_csv.transaction():
478
+ log.debug(f'Deleting table "{table}"')
479
+ await conn_csv.execute(f'DROP TABLE IF EXISTS "{table}"')
480
+ await conn_csv.execute(
481
+ "DELETE FROM tables_index WHERE parsing_table = $1", table
482
+ )
483
+ await conn_main.execute(
484
+ "UPDATE checks SET parsing_table = NULL WHERE parsing_table = $1", table
485
+ )
486
+ success_count += 1
487
+ except Exception as e:
488
+ error_count += 1
489
+ log.error(f'Failed to delete table "{table}": {str(e)}')
490
+ continue
491
+
492
+ if success_count:
493
+ log.info(f"Successfully deleted {success_count} table(s).")
494
+ if error_count:
495
+ log.warning(f"Failed to delete {error_count} table(s). Check logs for details.")
496
+ if not (success_count or error_count):
497
+ log.info("Nothing to delete.")
498
+
499
+
442
500
  @wrap
443
501
  async def cli_wrapper():
444
502
  context["conn"] = {}