udata-hydra 2.0.5.dev5485__tar.gz → 2.0.5.dev5527__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/PKG-INFO +1 -1
  2. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/pyproject.toml +1 -1
  3. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/resource.py +6 -4
  4. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/cli.py +2 -1
  5. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/check_resources.py +14 -2
  6. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/checks.py +6 -1
  7. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/README.md +0 -0
  8. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/__init__.py +0 -0
  9. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/__init__.py +0 -0
  10. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/csv.py +0 -0
  11. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/errors.py +0 -0
  12. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/helpers.py +0 -0
  13. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/app.py +0 -0
  14. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/config_default.toml +0 -0
  15. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/context.py +0 -0
  16. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/__init__.py +0 -0
  17. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/helpers.py +0 -0
  18. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/process_check_data.py +0 -0
  19. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/select_batch.py +0 -0
  20. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/__init__.py +0 -0
  21. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/check.py +0 -0
  22. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/resource.py +0 -0
  23. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/resource_exception.py +0 -0
  24. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/logger.py +0 -0
  25. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/__init__.py +0 -0
  26. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  27. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  28. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  29. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  30. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  31. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  32. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  33. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  34. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  35. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  36. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  37. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  38. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  39. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  40. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  41. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  42. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  43. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  44. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  45. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  46. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  47. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  48. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/__init__.py +0 -0
  49. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources.py +0 -0
  50. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources_exceptions.py +0 -0
  51. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources_legacy.py +0 -0
  52. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/status.py +0 -0
  53. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/__init__.py +0 -0
  54. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/check.py +0 -0
  55. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/resource.py +0 -0
  56. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/resource_exception.py +0 -0
  57. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/__init__.py +0 -0
  58. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/auth.py +0 -0
  59. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/csv.py +0 -0
  60. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/db.py +0 -0
  61. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/file.py +0 -0
  62. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/http.py +0 -0
  63. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/minio.py +0 -0
  64. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/parquet.py +0 -0
  65. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/queue.py +0 -0
  66. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/reader.py +0 -0
  67. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/timer.py +0 -0
  68. {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: udata-hydra
3
- Version: 2.0.5.dev5485
3
+ Version: 2.0.5.dev5527
4
4
  Summary: Async crawler and parsing service for data.gouv.fr
5
5
  License: MIT
6
6
  Author: Opendata Team
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "udata-hydra"
3
- version = "2.0.5.dev5485"
3
+ version = "2.0.5.dev5527"
4
4
  description = "Async crawler and parsing service for data.gouv.fr"
5
5
  authors = ["Opendata Team <opendatateam@data.gouv.fr>"]
6
6
  license = "MIT"
@@ -31,7 +31,9 @@ class Change(Enum):
31
31
  log = logging.getLogger("udata-hydra")
32
32
 
33
33
 
34
- async def analyse_resource(check_id: int, is_first_check: bool) -> None:
34
+ async def analyse_resource(
35
+ check_id: int, is_first_check: bool, force_analysis: bool = False
36
+ ) -> None:
35
37
  """
36
38
  Perform analysis on the resource designated by check_id:
37
39
  - change analysis
@@ -70,7 +72,7 @@ async def analyse_resource(check_id: int, is_first_check: bool) -> None:
70
72
  # if the change status is NO_GUESS or HAS_CHANGED, let's download the file to get more infos
71
73
  dl_analysis = {}
72
74
  tmp_file = None
73
- if change_status != Change.HAS_NOT_CHANGED:
75
+ if change_status != Change.HAS_NOT_CHANGED or force_analysis:
74
76
  try:
75
77
  tmp_file = await download_resource(url, headers, max_size_allowed)
76
78
  except IOError:
@@ -107,7 +109,7 @@ async def analyse_resource(check_id: int, is_first_check: bool) -> None:
107
109
 
108
110
  analysis_results = {**dl_analysis, **(change_payload or {})}
109
111
 
110
- if change_status == Change.HAS_CHANGED or is_first_check:
112
+ if change_status == Change.HAS_CHANGED or is_first_check or force_analysis:
111
113
  if is_tabular and tmp_file:
112
114
  # Change status to TO_ANALYSE_CSV
113
115
  await Resource.update(resource_id, data={"status": "TO_ANALYSE_CSV"})
@@ -260,7 +262,7 @@ async def detect_resource_change_on_early_hints(
260
262
 
261
263
 
262
264
  async def detect_resource_change_from_harvest(
263
- checks_data: list, resource_id: str
265
+ checks_data: tuple, resource_id: str
264
266
  ) -> tuple[Change, dict | None]:
265
267
  """
266
268
  Checks if resource has a harvest.modified_at
@@ -137,7 +137,7 @@ async def crawl_url(url: str, method: str = "get"):
137
137
 
138
138
 
139
139
  @cli
140
- async def check_resource(resource_id: str, method: str = "get"):
140
+ async def check_resource(resource_id: str, method: str = "get", force_analysis: bool = True):
141
141
  """Trigger a complete check for a given resource_id"""
142
142
  resource: asyncpg.Record | None = await Resource.get(resource_id)
143
143
  if not resource:
@@ -149,6 +149,7 @@ async def check_resource(resource_id: str, method: str = "get"):
149
149
  resource_id=resource_id,
150
150
  session=session,
151
151
  method=method,
152
+ force_analysis=force_analysis,
152
153
  worker_priority="high",
153
154
  )
154
155
 
@@ -61,6 +61,7 @@ async def check_resource(
61
61
  sleep: float = 0,
62
62
  method: str = "head",
63
63
  worker_priority: str = "default",
64
+ force_analysis: bool = False,
64
65
  ) -> str:
65
66
  log.debug(f"check {url}, sleep {sleep}, method {method}")
66
67
 
@@ -101,7 +102,12 @@ async def check_resource(
101
102
  end = time.time()
102
103
  if method != "get" and not has_nice_head(resp):
103
104
  return await check_resource(
104
- url, resource_id, session, method="get", worker_priority=worker_priority
105
+ url,
106
+ resource_id,
107
+ session,
108
+ force_analysis=force_analysis,
109
+ method="get",
110
+ worker_priority=worker_priority,
105
111
  )
106
112
  resp.raise_for_status()
107
113
 
@@ -122,7 +128,13 @@ async def check_resource(
122
128
  await Resource.update(resource_id, data={"status": "TO_ANALYSE_RESOURCE"})
123
129
 
124
130
  # Enqueue the resource for analysis
125
- queue.enqueue(analyse_resource, check["id"], is_first_check, _priority=worker_priority)
131
+ queue.enqueue(
132
+ analyse_resource,
133
+ check["id"],
134
+ is_first_check,
135
+ force_analysis,
136
+ _priority=worker_priority,
137
+ )
126
138
 
127
139
  return RESOURCE_RESPONSE_STATUSES["OK"]
128
140
 
@@ -64,6 +64,7 @@ async def create_check(request: web.Request) -> web.Response:
64
64
  try:
65
65
  payload: dict = await request.json()
66
66
  resource_id: str = payload["resource_id"]
67
+ force_analysis: bool = payload.get("force_analysis", True)
67
68
  except Exception as err:
68
69
  raise web.HTTPBadRequest(text=json.dumps({"error": str(err)}))
69
70
 
@@ -80,7 +81,11 @@ async def create_check(request: web.Request) -> web.Response:
80
81
  timeout=None, headers={"user-agent": config.USER_AGENT}
81
82
  ) as session:
82
83
  status: str = await check_resource(
83
- url=url, resource_id=resource_id, session=session, worker_priority="high"
84
+ url=url,
85
+ resource_id=resource_id,
86
+ force_analysis=force_analysis,
87
+ session=session,
88
+ worker_priority="high",
84
89
  )
85
90
  context.monitor().refresh(status)
86
91