udata-hydra 2.1.3.dev7106__tar.gz → 2.1.3.dev7204__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/PKG-INFO +2 -2
  2. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/pyproject.toml +2 -2
  3. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/csv.py +3 -5
  4. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/resource.py +29 -14
  5. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/minio.py +2 -3
  6. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/README.md +0 -0
  7. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/__init__.py +0 -0
  8. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/__init__.py +0 -0
  9. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/analysis/helpers.py +0 -0
  10. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/app.py +0 -0
  11. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/cli.py +0 -0
  12. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/config_default.toml +0 -0
  13. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/context.py +0 -0
  14. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/__init__.py +0 -0
  15. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/calculate_next_check.py +0 -0
  16. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/check_resources.py +0 -0
  17. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/helpers.py +0 -0
  18. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/preprocess_check_data.py +0 -0
  19. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/crawl/select_batch.py +0 -0
  20. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/__init__.py +0 -0
  21. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/check.py +0 -0
  22. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/resource.py +0 -0
  23. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/db/resource_exception.py +0 -0
  24. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/logger.py +0 -0
  25. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/__init__.py +0 -0
  26. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  27. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  28. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  29. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  30. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  31. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  32. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  33. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  34. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  35. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  36. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  37. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  38. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  39. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  40. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  41. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  42. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  43. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  44. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  45. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  46. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  47. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  48. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
  49. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
  50. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/__init__.py +0 -0
  51. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/checks.py +0 -0
  52. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/resources.py +0 -0
  53. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/resources_exceptions.py +0 -0
  54. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/routes/status.py +0 -0
  55. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/__init__.py +0 -0
  56. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/check.py +0 -0
  57. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/resource.py +0 -0
  58. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/schemas/resource_exception.py +0 -0
  59. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/__init__.py +0 -0
  60. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/auth.py +0 -0
  61. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/csv.py +0 -0
  62. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/db.py +0 -0
  63. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/errors.py +0 -0
  64. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/file.py +0 -0
  65. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/http.py +0 -0
  66. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/parquet.py +0 -0
  67. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/queue.py +0 -0
  68. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/reader.py +0 -0
  69. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/utils/timer.py +0 -0
  70. {udata_hydra-2.1.3.dev7106 → udata_hydra-2.1.3.dev7204}/udata_hydra/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: udata-hydra
3
- Version: 2.1.3.dev7106
3
+ Version: 2.1.3.dev7204
4
4
  Summary: Async crawler and parsing service for data.gouv.fr
5
5
  License: MIT
6
6
  Author: Opendata Team
@@ -18,7 +18,7 @@ Requires-Dist: aioresponses (>=0.7.3) ; extra == "dev"
18
18
  Requires-Dist: asyncpg (>=0.29.0)
19
19
  Requires-Dist: bumpx (>=0.3.10) ; extra == "dev"
20
20
  Requires-Dist: coloredlogs (>=15.0.1)
21
- Requires-Dist: csv-detective (==0.7.3)
21
+ Requires-Dist: csv-detective (==0.7.4)
22
22
  Requires-Dist: dateparser (>=1.1.7)
23
23
  Requires-Dist: gunicorn (>=20.1.0) ; extra == "dev"
24
24
  Requires-Dist: humanfriendly (>=10.0)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "udata-hydra"
3
- version = "2.1.3.dev7106"
3
+ version = "2.1.3.dev7204"
4
4
  description = "Async crawler and parsing service for data.gouv.fr"
5
5
  authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
6
6
  dependencies = [
@@ -8,7 +8,7 @@ dependencies = [
8
8
  "aiohttp>=3.10.3",
9
9
  "asyncpg>=0.29.0",
10
10
  "coloredlogs>=15.0.1",
11
- "csv-detective==0.7.3",
11
+ "csv-detective==0.7.4",
12
12
  "dateparser>=1.1.7",
13
13
  "humanfriendly>=10.0",
14
14
  "marshmallow>=3.14.1",
@@ -183,7 +183,6 @@ async def analyse_csv(
183
183
  parquet_args: tuple[str, int] | None = await csv_to_parquet(
184
184
  file_path=tmp_file.name,
185
185
  inspection=csv_inspection,
186
- table_name=table_name,
187
186
  resource_id=resource_id,
188
187
  )
189
188
  timer.mark("csv-to-parquet")
@@ -294,7 +293,6 @@ def generate_records(file_path: str, inspection: dict, columns: dict) -> Iterato
294
293
  async def csv_to_parquet(
295
294
  file_path: str,
296
295
  inspection: dict,
297
- table_name: str,
298
296
  resource_id: str | None = None,
299
297
  ) -> tuple[str, int] | None:
300
298
  """
@@ -315,13 +313,13 @@ async def csv_to_parquet(
315
313
 
316
314
  if int(inspection.get("total_lines", 0)) < config.MIN_LINES_FOR_PARQUET:
317
315
  log.debug(
318
- f"Skipping parquet export for {table_name} because it has less than {config.MIN_LINES_FOR_PARQUET} lines."
316
+ f"Skipping parquet export for {resource_id} because it has less than {config.MIN_LINES_FOR_PARQUET} lines."
319
317
  )
320
318
  return
321
319
 
322
320
  log.debug(
323
321
  f"Converting from {engine_to_file.get(inspection.get('engine', ''), 'CSV')} "
324
- f"to parquet for {table_name} and sending to Minio."
322
+ f"to parquet for {resource_id} and sending to Minio."
325
323
  )
326
324
 
327
325
  if resource_id:
@@ -333,7 +331,7 @@ async def csv_to_parquet(
333
331
  parquet_file, _ = save_as_parquet(
334
332
  records=generate_records(file_path, inspection, columns),
335
333
  columns=columns,
336
- output_filename=table_name,
334
+ output_filename=resource_id,
337
335
  )
338
336
  parquet_size: int = os.path.getsize(parquet_file)
339
337
  parquet_url: str = minio_client.send_file(parquet_file)
@@ -187,11 +187,18 @@ async def detect_resource_change_from_checksum(
187
187
  "analysis:last-modified-detection": "computed-checksum",
188
188
  }
189
189
  """
190
- if last_check and last_check.get("checksum") != new_checksum:
191
- return Change.HAS_CHANGED, {
192
- "analysis:last-modified-at": datetime.now(timezone.utc).isoformat(),
193
- "analysis:last-modified-detection": "computed-checksum",
194
- }
190
+ if last_check and last_check.get("checksum"):
191
+ if last_check.get("checksum") != new_checksum:
192
+ return Change.HAS_CHANGED, {
193
+ "analysis:last-modified-at": datetime.now(timezone.utc).isoformat(),
194
+ "analysis:last-modified-detection": "computed-checksum",
195
+ }
196
+ elif last_check.get("detected_last_modified_at"):
197
+ return Change.HAS_NOT_CHANGED, {
198
+ "analysis:last-modified-at": last_check["detected_last_modified_at"].isoformat(),
199
+ "analysis:last-modified-detection": "previous-check-detection",
200
+ }
201
+ # if the previous check did not have the info, we investigate further
195
202
  return Change.NO_GUESS, None
196
203
 
197
204
 
@@ -224,16 +231,24 @@ async def detect_resource_change_from_content_length_header(
224
231
  data: dict,
225
232
  ) -> tuple[Change, dict | None]:
226
233
  # content-length variation between current and last check
227
- if len(data) <= 1 or not data[0]["content_length"]:
234
+ if len(data) <= 1 or not data[0].get("content_length"):
228
235
  return Change.NO_GUESS, None
229
- changed_at = data[0]["created_at"]
230
- payload = {
231
- "analysis:last-modified-at": changed_at.isoformat(),
232
- "analysis:last-modified-detection": "content-length-header",
233
- }
234
- if data[0]["content_length"] != data[1]["content_length"]:
235
- return Change.HAS_CHANGED, payload
236
- return Change.HAS_NOT_CHANGED, payload
236
+ if data[0].get("content_length") and data[1].get("content_length"):
237
+ if data[0]["content_length"] != data[1]["content_length"]:
238
+ return Change.HAS_CHANGED, {
239
+ # if resource has changed, set last-modified to the current check's creation
240
+ "analysis:last-modified-at": data[0]["created_at"].isoformat(),
241
+ "analysis:last-modified-detection": "content-length-header",
242
+ }
243
+ # same content_length is not 100% certainly no change, but a good tradeoff to prevent many downloads
244
+ elif data[1].get("detected_last_modified_at"):
245
+ return Change.HAS_NOT_CHANGED, {
246
+ # no change, using the last-modified from the previous check (passed on from check to check)
247
+ "analysis:last-modified-at": data[1]["detected_last_modified_at"].isoformat(),
248
+ "analysis:last-modified-detection": "previous-check-detection",
249
+ }
250
+ # if the previous check did not have the info, we investigate further
251
+ return Change.NO_GUESS, None
237
252
 
238
253
 
239
254
  async def detect_resource_change_on_early_hints(
@@ -10,12 +10,11 @@ log = logging.getLogger("udata-hydra")
10
10
 
11
11
  class MinIOClient:
12
12
  def __init__(self, bucket=config.MINIO_BUCKET):
13
- self.url = config.MINIO_URL
14
13
  self.user = config.MINIO_USER
15
14
  self.password = config.MINIO_PWD
16
15
  self.bucket = bucket
17
16
  self.client = Minio(
18
- self.url or "test",
17
+ config.MINIO_URL or "test",
19
18
  access_key=self.user or "test",
20
19
  secret_key=self.password or "test",
21
20
  secure=True,
@@ -40,6 +39,6 @@ class MinIOClient:
40
39
  )
41
40
  if delete_source:
42
41
  os.remove(file_name)
43
- return f"https://{self.url}/{self.bucket}/{config.MINIO_FOLDER}/{file_name}"
42
+ return f"https://{config.MINIO_URL}/{self.bucket}/{config.MINIO_FOLDER}/{file_name}"
44
43
  else:
45
44
  raise Exception(f"file '{file_name}' does not exists")