udata-hydra 2.0.4.dev5074__tar.gz → 2.0.4.dev5083__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of udata-hydra might be problematic. Click here for more details.

Files changed (68)
  1. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/PKG-INFO +1 -1
  2. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/pyproject.toml +1 -1
  3. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/crawl/select_batch.py +6 -5
  4. udata_hydra-2.0.4.dev5083/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +8 -0
  5. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/README.md +0 -0
  6. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/__init__.py +0 -0
  7. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/analysis/__init__.py +0 -0
  8. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/analysis/csv.py +0 -0
  9. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/analysis/errors.py +0 -0
  10. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/analysis/helpers.py +0 -0
  11. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/analysis/resource.py +0 -0
  12. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/app.py +0 -0
  13. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/cli.py +0 -0
  14. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/config_default.toml +0 -0
  15. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/context.py +0 -0
  16. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/crawl/__init__.py +0 -0
  17. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/crawl/check_resources.py +0 -0
  18. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/crawl/helpers.py +0 -0
  19. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/crawl/process_check_data.py +0 -0
  20. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/db/__init__.py +0 -0
  21. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/db/check.py +0 -0
  22. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/db/resource.py +0 -0
  23. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/db/resource_exception.py +0 -0
  24. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/logger.py +0 -0
  25. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/__init__.py +0 -0
  26. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  27. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  28. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  29. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  30. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  31. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  32. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  33. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  34. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  35. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  36. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  37. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  38. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  39. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  40. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  41. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  42. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  43. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  44. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  45. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  46. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  47. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/routes/__init__.py +0 -0
  48. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/routes/checks.py +0 -0
  49. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/routes/resources.py +0 -0
  50. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/routes/resources_exceptions.py +0 -0
  51. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/routes/resources_legacy.py +0 -0
  52. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/routes/status.py +0 -0
  53. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/schemas/__init__.py +0 -0
  54. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/schemas/check.py +0 -0
  55. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/schemas/resource.py +0 -0
  56. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/schemas/resource_exception.py +0 -0
  57. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/__init__.py +0 -0
  58. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/auth.py +0 -0
  59. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/csv.py +0 -0
  60. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/db.py +0 -0
  61. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/file.py +0 -0
  62. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/http.py +0 -0
  63. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/minio.py +0 -0
  64. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/parquet.py +0 -0
  65. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/queue.py +0 -0
  66. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/reader.py +0 -0
  67. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/utils/timer.py +0 -0
  68. {udata_hydra-2.0.4.dev5074 → udata_hydra-2.0.4.dev5083}/udata_hydra/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: udata-hydra
3
- Version: 2.0.4.dev5074
3
+ Version: 2.0.4.dev5083
4
4
  Summary: Async crawler and parsing service for data.gouv.fr
5
5
  License: MIT
6
6
  Author: Opendata Team
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "udata-hydra"
3
- version = "2.0.4.dev5074"
3
+ version = "2.0.4.dev5083"
4
4
  description = "Async crawler and parsing service for data.gouv.fr"
5
5
  authors = ["Opendata Team <opendatateam@data.gouv.fr>"]
6
6
  license = "MIT"
@@ -30,8 +30,8 @@ async def select_rows_based_on_query(connection, q: str, *args) -> list[Record]:
30
30
  async def select_batch_resources_to_check() -> list[Record]:
31
31
  """Select a batch of resources to check from the catalog
32
32
  - It first selects resources with priority=True
33
- - ...then resources without checks
34
- - and if the total number of selected resources is still less than the batch size, it will also add resources with outdated checks in the batch.
33
+ - ...then resources without last check
34
+ - and if the total number of selected resources is still less than the batch size, it will also add resources with outdated last check in the batch
35
35
  """
36
36
  context.monitor().set_status("Getting a batch from catalog...")
37
37
 
@@ -39,7 +39,7 @@ async def select_batch_resources_to_check() -> list[Record]:
39
39
  async with pool.acquire() as connection:
40
40
  excluded = Resource.get_excluded_clause()
41
41
 
42
- # first urls that are prioritised
42
+ # first resources that are prioritised
43
43
  q = f"""
44
44
  SELECT * FROM (
45
45
  SELECT catalog.url, dataset_id, resource_id
@@ -51,7 +51,8 @@ async def select_batch_resources_to_check() -> list[Record]:
51
51
  """
52
52
  to_check: list[Record] = await select_rows_based_on_query(connection, q)
53
53
 
54
- # then urls without checks
54
+ # then resources with no last check
55
+ # (either because they have never been checked before, or because the last check has been deleted)
55
56
  if len(to_check) < config.BATCH_SIZE:
56
57
  q = f"""
57
58
  SELECT * FROM (
@@ -65,7 +66,7 @@ async def select_batch_resources_to_check() -> list[Record]:
65
66
  """
66
67
  to_check += await select_rows_based_on_query(connection, q)
67
68
 
68
- # if not enough for our batch size, handle outdated checks
69
+ # if not enough for our batch size, handle resources with outdated last check
69
70
  if len(to_check) < config.BATCH_SIZE:
70
71
  since = parse_timespan(config.SINCE) # in seconds
71
72
  since = datetime.now(timezone.utc) - timedelta(seconds=since)
@@ -0,0 +1,8 @@
1
+ -- Change the column `last_check` of the `catalog` table to be a foreign key to the `checks` table.
2
+ -- (assuming `last_check` is already an INT)
3
+ -- When a check is deleted, the `last_check` column of the `catalog` table will be set to NULL.
4
+
5
+ ALTER TABLE catalog
6
+ ADD CONSTRAINT fk_last_check
7
+ FOREIGN KEY (last_check) REFERENCES checks(id)
8
+ ON DELETE SET NULL;