udata-hydra 2.1.3.dev7204__tar.gz → 2.1.3.dev7241__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/PKG-INFO +7 -3
  2. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/README.md +5 -2
  3. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/pyproject.toml +2 -1
  4. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/analysis/csv.py +9 -41
  5. udata_hydra-2.1.3.dev7241/udata_hydra/analysis/geojson.py +130 -0
  6. udata_hydra-2.1.3.dev7241/udata_hydra/analysis/helpers.py +77 -0
  7. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/analysis/resource.py +16 -4
  8. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/cli.py +32 -0
  9. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/config_default.toml +12 -4
  10. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/resource.py +3 -0
  11. udata_hydra-2.1.3.dev7241/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +5 -0
  12. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/check.py +2 -0
  13. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/__init__.py +1 -0
  14. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/csv.py +1 -1
  15. udata_hydra-2.1.3.dev7241/udata_hydra/utils/geojson.py +14 -0
  16. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/http.py +9 -1
  17. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/minio.py +11 -9
  18. udata_hydra-2.1.3.dev7204/udata_hydra/analysis/helpers.py +0 -27
  19. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/__init__.py +0 -0
  20. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/analysis/__init__.py +0 -0
  21. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/app.py +0 -0
  22. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/context.py +0 -0
  23. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/__init__.py +0 -0
  24. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/calculate_next_check.py +0 -0
  25. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/check_resources.py +0 -0
  26. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/helpers.py +0 -0
  27. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/preprocess_check_data.py +0 -0
  28. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/select_batch.py +0 -0
  29. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/__init__.py +0 -0
  30. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/check.py +0 -0
  31. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/resource_exception.py +0 -0
  32. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/logger.py +0 -0
  33. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/__init__.py +0 -0
  34. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  35. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  36. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  37. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  38. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  39. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  40. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  41. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  42. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  43. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  44. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  45. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  46. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  47. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  48. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  49. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  50. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  51. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  52. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  53. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  54. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  55. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  56. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
  57. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
  58. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/__init__.py +0 -0
  59. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/checks.py +0 -0
  60. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/resources.py +0 -0
  61. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/resources_exceptions.py +0 -0
  62. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/status.py +0 -0
  63. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/__init__.py +0 -0
  64. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/resource.py +0 -0
  65. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/resource_exception.py +0 -0
  66. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/auth.py +0 -0
  67. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/db.py +0 -0
  68. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/errors.py +0 -0
  69. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/file.py +0 -0
  70. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/parquet.py +0 -0
  71. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/queue.py +0 -0
  72. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/reader.py +0 -0
  73. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/timer.py +0 -0
  74. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/worker.py +0 -0
--- udata_hydra-2.1.3.dev7204/PKG-INFO
+++ udata_hydra-2.1.3.dev7241/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: udata-hydra
- Version: 2.1.3.dev7204
+ Version: 2.1.3.dev7241
  Summary: Async crawler and parsing service for data.gouv.fr
  License: MIT
  Author: Opendata Team
@@ -44,6 +44,7 @@ Requires-Dist: setuptools (>=70.3.0)
  Requires-Dist: sqlalchemy (>=1.4.46)
  Requires-Dist: str2bool (>=1.1)
  Requires-Dist: str2float (>=0.0.9)
+ Requires-Dist: tippecanoe (>=2.72.0)
  Requires-Dist: toml (>=0.10.2)
  Description-Content-Type: text/markdown
 
@@ -55,7 +56,8 @@ URLs are crawled via _aiohttp_, catalog and crawled metadata are stored in a _Po
 
  Since it's called _hydra_, it also has mythical powers embedded:
  - analyse remote resource metadata over time to detect changes in the smartest way possible
- - if the remote resource is a CSV, convert it to a PostgreSQL table, ready for APIfication
+ - if the remote resource is tabular (csv or excel-like), convert it to a PostgreSQL table, ready for APIfication, and to parquet to offer another distribution of the data
+ - if the remote resource is a geojson, convert it to PMTiles to offer another distribution of the data
  - send crawl and analysis info to a udata instance
 
  ## Architecture schema
@@ -126,6 +128,8 @@ Converted CSV tables will be stored in the database specified via `config.DATABA
 
  To run the tests, you need to launch the database, the test database, and the Redis broker with `docker compose -f docker-compose.yml -f docker-compose.test.yml -f docker-compose.broker.yml up -d`.
 
+ Make sure the dev dependencies are installed with `poetry install --extras dev`.
+
  Then you can run the tests with `poetry run pytest`.
 
  To run a specific test file, you can pass the path to the file to pytest, like this: `poetry run pytest tests/test_app.py`.
@@ -181,7 +185,7 @@ The API serves the following endpoints:
  - `PUT` on `/api/resources/{resource_id}` to update a resource in the DB "catalog" table
  - `DELETE` on `/api/resources/{resource_id}` to delete a resource in the DB "catalog" table
 
- > :warning: **Warning: the following routes are deprecated and need be removed in the future:**
+ > :warning: **Warning: the following routes are deprecated and will be removed in the future:**
  > - `POST` on `/api/resource/created` -> use `POST` on `/api/resources/` instead
  > - `POST` on `/api/resource/updated` -> use `PUT` on `/api/resources/` instead
  > - `POST` on `/api/resource/deleted` -> use `DELETE` on `/api/resources/` instead
--- udata_hydra-2.1.3.dev7204/README.md
+++ udata_hydra-2.1.3.dev7241/README.md
@@ -6,7 +6,8 @@ URLs are crawled via _aiohttp_, catalog and crawled metadata are stored in a _Po
 
  Since it's called _hydra_, it also has mythical powers embedded:
  - analyse remote resource metadata over time to detect changes in the smartest way possible
- - if the remote resource is a CSV, convert it to a PostgreSQL table, ready for APIfication
+ - if the remote resource is tabular (csv or excel-like), convert it to a PostgreSQL table, ready for APIfication, and to parquet to offer another distribution of the data
+ - if the remote resource is a geojson, convert it to PMTiles to offer another distribution of the data
  - send crawl and analysis info to a udata instance
 
  ## Architecture schema
@@ -77,6 +78,8 @@ Converted CSV tables will be stored in the database specified via `config.DATABA
 
  To run the tests, you need to launch the database, the test database, and the Redis broker with `docker compose -f docker-compose.yml -f docker-compose.test.yml -f docker-compose.broker.yml up -d`.
 
+ Make sure the dev dependencies are installed with `poetry install --extras dev`.
+
  Then you can run the tests with `poetry run pytest`.
 
  To run a specific test file, you can pass the path to the file to pytest, like this: `poetry run pytest tests/test_app.py`.
@@ -132,7 +135,7 @@ The API serves the following endpoints:
  - `PUT` on `/api/resources/{resource_id}` to update a resource in the DB "catalog" table
  - `DELETE` on `/api/resources/{resource_id}` to delete a resource in the DB "catalog" table
 
- > :warning: **Warning: the following routes are deprecated and need be removed in the future:**
+ > :warning: **Warning: the following routes are deprecated and will be removed in the future:**
  > - `POST` on `/api/resource/created` -> use `POST` on `/api/resources/` instead
  > - `POST` on `/api/resource/updated` -> use `PUT` on `/api/resources/` instead
  > - `POST` on `/api/resource/deleted` -> use `DELETE` on `/api/resources/` instead
--- udata_hydra-2.1.3.dev7204/pyproject.toml
+++ udata_hydra-2.1.3.dev7241/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "udata-hydra"
- version = "2.1.3.dev7204"
+ version = "2.1.3.dev7241"
  description = "Async crawler and parsing service for data.gouv.fr"
  authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
  dependencies = [
@@ -27,6 +27,7 @@ dependencies = [
      "str2bool>=1.1",
      "str2float>=0.0.9",
      "toml>=0.10.2",
+     "tippecanoe>=2.72.0",
  ]
  requires-python = ">=3.11,<3.13"
  license = { text = "MIT" }
--- udata_hydra-2.1.3.dev7204/udata_hydra/analysis/csv.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/csv.py
@@ -41,12 +41,8 @@ from udata_hydra.utils import (
      ParseException,
      Reader,
      Timer,
-     UdataPayload,
      detect_tabular_from_headers,
-     download_resource,
      handle_parse_exception,
-     queue,
-     send,
  )
  from udata_hydra.utils.minio import MinIOClient
  from udata_hydra.utils.parquet import save_as_parquet
@@ -84,29 +80,7 @@ PYTHON_TYPE_TO_PY = {
  }
 
  RESERVED_COLS = ("__id", "cmin", "cmax", "collation", "ctid", "tableoid", "xmin", "xmax")
- minio_client = MinIOClient()
-
-
- async def notify_udata(resource: Record, check: dict) -> None:
-     """Notify udata of the result of a parsing"""
-     payload = {
-         "resource_id": check["resource_id"],
-         "dataset_id": resource["dataset_id"],
-         "document": {
-             "analysis:parsing:error": check["parsing_error"],
-             "analysis:parsing:started_at": check["parsing_started_at"].isoformat()
-             if check["parsing_started_at"]
-             else None,
-             "analysis:parsing:finished_at": check["parsing_finished_at"].isoformat()
-             if check["parsing_finished_at"]
-             else None,
-         },
-     }
-     if config.CSV_TO_PARQUET:
-         payload["document"]["analysis:parsing:parquet_url"] = check.get("parquet_url")
-         payload["document"]["analysis:parsing:parquet_size"] = check.get("parquet_size")
-     payload["document"] = UdataPayload(payload["document"])
-     queue.enqueue(send, _priority="high", **payload)
+ minio_client = MinIOClient(bucket=config.MINIO_PARQUET_BUCKET, folder=config.MINIO_PARQUET_FOLDER)
 
 
  async def analyse_csv(
@@ -137,18 +111,12 @@ async def analyse_csv(
 
      table_name, tmp_file = None, None
      try:
-         headers = json.loads(check.get("headers") or "{}")
-         _, file_format = await detect_tabular_from_headers(check)
-         tmp_file = (
-             open(file_path, "rb")
-             if file_path
-             else await download_resource(
-                 url=url,
-                 headers=headers,
-                 max_size_allowed=None
-                 if exception
-                 else int(config.MAX_FILESIZE_ALLOWED.get(file_format, "csv")),
-             )
+         _, file_format = detect_tabular_from_headers(check)
+         tmp_file = await helpers.read_or_download_file(
+             check=check,
+             file_path=file_path,
+             file_format=file_format,
+             exception=exception,
          )
          table_name = hashlib.md5(url.encode("utf-8")).hexdigest()
          timer.mark("download-file")
@@ -205,7 +173,7 @@ async def analyse_csv(
      except (ParseException, IOException) as e:
          await handle_parse_exception(e, table_name, check)
      finally:
-         await notify_udata(resource, check)
+         await helpers.notify_udata(resource, check)
          timer.stop()
          if tmp_file is not None:
              tmp_file.close()
@@ -250,7 +218,7 @@ def compute_create_table_query(
      for col_name, index_type in indexes.items():
          if index_type not in config.SQL_INDEXES_TYPES_SUPPORTED:
              log.error(
-                 f'Index type "{index_type}" is unknown or not supported yet! Index for colum {col_name} was not created.'
+                 f'Index type "{index_type}" is unknown or not supported yet! Index for column {col_name} was not created.'
              )
              continue
 
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/geojson.py
@@ -0,0 +1,130 @@
+ import logging
+ import os
+ import subprocess
+ from datetime import datetime, timezone
+
+ from asyncpg import Record
+
+ from udata_hydra import config
+ from udata_hydra.analysis import helpers
+ from udata_hydra.db.check import Check
+ from udata_hydra.db.resource import Resource
+ from udata_hydra.db.resource_exception import ResourceException
+ from udata_hydra.utils import (
+     IOException,
+     ParseException,
+     Timer,
+     handle_parse_exception,
+ )
+ from udata_hydra.utils.minio import MinIOClient
+
+ log = logging.getLogger("udata-hydra")
+ minio_client = MinIOClient(bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER)
+
+
+ async def analyse_geojson(
+     check: dict,
+     file_path: str | None = None,
+ ) -> None:
+     """Launch GeoJSON analysis from a check or an URL (debug), using previously downloaded file at file_path if any"""
+     if not config.GEOJSON_TO_PMTILES:
+         log.debug("GEOJSON_TO_PMTILES turned off, skipping.")
+         return
+
+     resource_id: str = str(check["resource_id"])
+     url = check["url"]
+
+     # Update resource status to ANALYSING_GEOJSON
+     resource: Record | None = await Resource.update(resource_id, {"status": "ANALYSING_GEOJSON"})
+
+     # Check if the resource is in the exceptions table
+     exception: Record | None = await ResourceException.get_by_resource_id(resource_id)
+
+     timer = Timer("analyse-geojson")
+     assert any(_ is not None for _ in (check["id"], url))
+
+     tmp_file = None
+     try:
+         tmp_file = await helpers.read_or_download_file(
+             check=check,
+             file_path=file_path,
+             file_format="geojson",
+             exception=exception,
+         )
+         timer.mark("download-file")
+
+         check = await Check.update(check["id"], {"parsing_started_at": datetime.now(timezone.utc)})
+
+         # Convert to PMTiles
+         try:
+             pmtiles_url, pmtiles_size = await geojson_to_pmtiles(
+                 file_path=tmp_file.name,
+                 resource_id=resource_id,
+             )
+             timer.mark("geojson-to-pmtiles")
+         except Exception as e:
+             raise ParseException(
+                 step="pmtiles_export", resource_id=resource_id, url=url, check_id=check["id"]
+             ) from e
+
+         check = await Check.update(
+             check["id"],
+             {
+                 "parsing_finished_at": datetime.now(timezone.utc),
+                 "pmtiles_url": pmtiles_url,
+                 "pmtiles_size": pmtiles_size,
+             },
+         )
+
+     except (ParseException, IOException) as e:
+         await handle_parse_exception(e, None, check)
+     finally:
+         await helpers.notify_udata(resource, check)
+         timer.stop()
+         if tmp_file is not None:
+             tmp_file.close()
+             os.remove(tmp_file.name)
+
+     # Reset resource status to None
+     await Resource.update(resource_id, {"status": None})
+
+
+ async def geojson_to_pmtiles(
+     file_path: str,
+     resource_id: str | None = None,
+ ) -> tuple[str, int]:
+     """
+     Convert a GeoJSON file to PMTiles format.
+
+     Args:
+         file_path: GeoJSON file path to convert.
+         resource_id: Optional resource ID for status updates.
+
+     Returns:
+         pmtiles_url: URL of the PMTiles file.
+         pmtiles_size: size of the PMTiles file.
+     """
+
+     log.debug(f"Converting GeoJSON to PMTiles for {file_path}")
+
+     if resource_id:
+         await Resource.update(resource_id, {"status": "CONVERTING_TO_PMTILES"})
+
+     output_pmtiles = f"{resource_id}.pmtiles"
+
+     command = [
+         "tippecanoe",
+         "--maximum-zoom=g",  # guess
+         "-o",
+         output_pmtiles,
+         "--coalesce-densest-as-needed",
+         "--extend-zooms-if-still-dropping",
+         file_path,
+     ]
+     subprocess.run(command, check=True)
+     log.debug(f"Successfully converted {file_path} to {output_pmtiles}")
+
+     pmtiles_size = os.path.getsize(output_pmtiles)
+     pmtiles_url: str = minio_client.send_file(output_pmtiles)
+
+     return pmtiles_url, pmtiles_size
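The conversion itself is a plain subprocess call to the tippecanoe binary. As a minimal standalone sketch of that step, assuming tippecanoe is installed and on PATH (the flags mirror `geojson_to_pmtiles` above; the function name and paths are ours, for illustration):

```python
# Standalone sketch of the conversion step, assuming the tippecanoe binary is
# on PATH; function name and paths are illustrative, flags mirror the diff.
import os
import subprocess


def convert_geojson_to_pmtiles(geojson_path: str, pmtiles_path: str) -> int:
    """Run tippecanoe like geojson_to_pmtiles does and return the output size in bytes."""
    subprocess.run(
        [
            "tippecanoe",
            "--maximum-zoom=g",  # "g" lets tippecanoe guess a suitable maximum zoom
            "-o",
            pmtiles_path,
            "--coalesce-densest-as-needed",
            "--extend-zooms-if-still-dropping",
            geojson_path,
        ],
        check=True,  # raise CalledProcessError on a non-zero exit code
    )
    return os.path.getsize(pmtiles_path)
```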
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/helpers.py
@@ -0,0 +1,77 @@
+ import json
+ from datetime import date, datetime
+ from typing import IO
+
+ from asyncpg import Record
+ from dateparser import parse as date_parser
+ from dateutil.parser import ParserError
+ from dateutil.parser import parse as dateutil_parser
+
+ from udata_hydra import config
+ from udata_hydra.utils import UdataPayload, download_resource, queue, send
+
+
+ def to_json(value: str) -> str:
+     """Convenience method, should be casted from string directly by postgres"""
+     return value
+
+
+ def _parse_dt(value: str) -> datetime | None:
+     """For performance reasons, we try first with dateutil and fallback on dateparser"""
+     try:
+         return dateutil_parser(value)
+     except ParserError:
+         return date_parser(value)
+
+
+ def to_date(value: str) -> date | None:
+     parsed = _parse_dt(value)
+     return parsed.date() if parsed else None
+
+
+ def to_datetime(value: str) -> datetime | None:
+     return _parse_dt(value)
+
+
+ async def read_or_download_file(
+     check: dict,
+     file_path: str,
+     file_format: str,
+     exception: Record | None,
+ ) -> IO[bytes]:
+     return (
+         open(file_path, "rb")
+         if file_path
+         else await download_resource(
+             url=check["url"],
+             headers=json.loads(check.get("headers") or "{}"),
+             max_size_allowed=None
+             if exception
+             else int(config.MAX_FILESIZE_ALLOWED.get(file_format, "csv")),
+         )
+     )
+
+
+ async def notify_udata(resource: Record, check: dict) -> None:
+     """Notify udata of the result of a parsing"""
+     payload = {
+         "resource_id": check["resource_id"],
+         "dataset_id": resource["dataset_id"],
+         "document": {
+             "analysis:parsing:error": check["parsing_error"],
+             "analysis:parsing:started_at": check["parsing_started_at"].isoformat()
+             if check["parsing_started_at"]
+             else None,
+             "analysis:parsing:finished_at": check["parsing_finished_at"].isoformat()
+             if check["parsing_finished_at"]
+             else None,
+         },
+     }
+     if config.CSV_TO_PARQUET and check.get("parquet_url"):
+         payload["document"]["analysis:parsing:parquet_url"] = check.get("parquet_url")
+         payload["document"]["analysis:parsing:parquet_size"] = check.get("parquet_size")
+     if config.GEOJSON_TO_PMTILES and check.get("pmtiles_url"):
+         payload["document"]["analysis:parsing:pmtiles_url"] = check.get("pmtiles_url")
+         payload["document"]["analysis:parsing:pmtiles_size"] = check.get("pmtiles_size")
+     payload["document"] = UdataPayload(payload["document"])
+     queue.enqueue(send, _priority="high", **payload)
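Note that `notify_udata` now gates the parquet and pmtiles keys on both the feature flag and the presence of a value on the check, so udata never receives empty keys. A hedged restatement of that enrichment with plain arguments (the `build_document` name is ours; the document keys come from the diff above):

```python
# Sketch of the flag-gated document enrichment performed by notify_udata;
# build_document is our name for illustration, the keys match the diff.
def build_document(check: dict, csv_to_parquet: bool, geojson_to_pmtiles: bool) -> dict:
    started = check.get("parsing_started_at")
    finished = check.get("parsing_finished_at")
    document = {
        "analysis:parsing:error": check.get("parsing_error"),
        "analysis:parsing:started_at": started.isoformat() if started else None,
        "analysis:parsing:finished_at": finished.isoformat() if finished else None,
    }
    # each pair of keys is added only when the feature is enabled AND the
    # check actually carries a value for it
    if csv_to_parquet and check.get("parquet_url"):
        document["analysis:parsing:parquet_url"] = check["parquet_url"]
        document["analysis:parsing:parquet_size"] = check.get("parquet_size")
    if geojson_to_pmtiles and check.get("pmtiles_url"):
        document["analysis:parsing:pmtiles_url"] = check["pmtiles_url"]
        document["analysis:parsing:pmtiles_size"] = check.get("pmtiles_size")
    return document
```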
--- udata_hydra-2.1.3.dev7204/udata_hydra/analysis/resource.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/resource.py
@@ -10,6 +10,7 @@ from dateparser import parse as date_parser
 
  from udata_hydra import config, context
  from udata_hydra.analysis.csv import analyse_csv
+ from udata_hydra.analysis.geojson import analyse_geojson
  from udata_hydra.crawl.calculate_next_check import calculate_next_check_date
  from udata_hydra.db.check import Check
  from udata_hydra.db.resource import Resource
@@ -18,6 +19,7 @@ from udata_hydra.utils import (
      IOException,
      UdataPayload,
      compute_checksum_from_file,
+     detect_geojson_from_headers,
      detect_tabular_from_headers,
      download_resource,
      queue,
@@ -69,8 +71,11 @@ async def analyse_resource(
      # let's see if we can infer a modification date on early hints based on harvest infos and headers
      change_status, change_payload = await detect_resource_change_on_early_hints(resource)
 
-     # could it be a CSV? If we get hints, we will analyse the file further depending on change status
-     is_tabular, file_format = await detect_tabular_from_headers(check)
+     # could it be a CSV or a GeoJSON? If we get hints, we will analyse the file further depending on change status
+     is_tabular, file_format = detect_tabular_from_headers(check)
+     is_geojson: bool = detect_geojson_from_headers(check)
+     if is_geojson:
+         file_format = "geojson"
      max_size_allowed = None if exception else int(config.MAX_FILESIZE_ALLOWED[file_format])
 
      # if the change status is NO_GUESS or HAS_CHANGED, let's download the file to get more infos
@@ -96,7 +101,7 @@ async def analyse_resource(
              )
              dl_analysis["analysis:mime-type"] = magic.from_file(tmp_file.name, mime=True)
          finally:
-             if tmp_file and not is_tabular:
+             if tmp_file and not (is_tabular or is_geojson):
                  os.remove(tmp_file.name)
          await Check.update(
              check["id"],
@@ -136,7 +141,14 @@ async def analyse_resource(
              file_path=tmp_file.name,
              _priority="high" if worker_priority == "high" else "default",
          )
-
+     elif is_geojson and tmp_file:
+         await Resource.update(resource_id, data={"status": "TO_ANALYSE_GEOJSON"})
+         queue.enqueue(
+             analyse_geojson,
+             check=check,
+             file_path=tmp_file.name,
+             _priority="high" if worker_priority == "high" else "default",
+         )
      else:
          await Resource.update(resource_id, data={"status": None})
 
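Stripped of queueing, priorities and status updates, the routing that `analyse_resource` now performs looks like the sketch below (assuming the udata_hydra package from this release; `choose_analysis` is our name). Note that the tabular branch wins when both guesses are positive, and the geojson guess also overrides the file format used for the download size limit:

```python
# Hedged sketch of the dispatch logic above; enqueueing and DB updates elided.
from udata_hydra.utils import detect_geojson_from_headers, detect_tabular_from_headers


def choose_analysis(check: dict) -> tuple[str | None, str]:
    is_tabular, file_format = detect_tabular_from_headers(check)
    is_geojson = detect_geojson_from_headers(check)
    if is_geojson:
        file_format = "geojson"  # switches the MAX_FILESIZE_ALLOWED entry used
    if is_tabular:
        return "analyse_csv", file_format  # enqueued with the downloaded tmp file
    if is_geojson:
        return "analyse_geojson", file_format
    return None, file_format  # no further analysis, resource status is reset
```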
--- udata_hydra-2.1.3.dev7204/udata_hydra/cli.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/cli.py
@@ -14,6 +14,7 @@ from progressist import ProgressBar
 
  from udata_hydra import config
  from udata_hydra.analysis.csv import analyse_csv
+ from udata_hydra.analysis.geojson import analyse_geojson
  from udata_hydra.crawl.check_resources import check_resource as crawl_check_resource
  from udata_hydra.db.check import Check
  from udata_hydra.db.resource import Resource
@@ -190,6 +191,37 @@ async def analyse_csv_cli(
      await analyse_csv(check=check, debug_insert=debug_insert)
 
 
+ @cli(name="analyse-geojson")
+ async def analyse_geojson_cli(
+     check_id: str | None = None,
+     url: str | None = None,
+     resource_id: str | None = None,
+ ):
+     """Trigger a GeoJSON analysis from a check_id, an url or a resource_id
+     Try to get the check from the check ID, then from the URL
+     """
+     assert check_id or url or resource_id
+     check = None
+     if check_id:
+         check: Record | None = await Check.get_by_id(int(check_id), with_deleted=True)
+     if not check and url:
+         checks: list[Record] | None = await Check.get_by_url(url)
+         if checks and len(checks) > 1:
+             log.warning(f"Multiple checks found for URL {url}, using the latest one")
+         check = checks[0] if checks else None
+     if not check and resource_id:
+         check: Record | None = await Check.get_by_resource_id(resource_id)
+     if not check:
+         if check_id:
+             log.error("Could not retrieve the specified check")
+         elif url:
+             log.error("Could not find a check linked to the specified URL")
+         elif resource_id:
+             log.error("Could not find a check linked to the specified resource ID")
+         return
+     await analyse_geojson(check=check)
+
+
  @cli
  async def csv_sample(size: int = 1000, download: bool = False, max_size: str = "100M"):
      """Get a csv sample from latest checks
--- udata_hydra-2.1.3.dev7204/udata_hydra/config_default.toml
+++ udata_hydra-2.1.3.dev7241/udata_hydra/config_default.toml
@@ -55,6 +55,7 @@ MAX_FILESIZE_ALLOWED.csvgz = 104857600
  MAX_FILESIZE_ALLOWED.xls = 52428800 # /2
  MAX_FILESIZE_ALLOWED.xlsx = 13107200 # /8
  MAX_FILESIZE_ALLOWED.ods = 10485760 # /10
+ MAX_FILESIZE_ALLOWED.geojson = 104857600
 
  # -- CSV analysis settings -- #
  SQL_INDEXES_TYPES_SUPPORTED = ["index"]
@@ -72,10 +73,17 @@ UDATA_URI = ""
  UDATA_URI_API_KEY = ""
 
  # -- Minio / datalake settings -- #
- CSV_TO_PARQUET = false
- MIN_LINES_FOR_PARQUET = 200
- MINIO_FOLDER = "" # no trailing slash
  MINIO_URL = "" # no scheme
- MINIO_BUCKET = ""
  MINIO_USER = ""
  MINIO_PWD = ""
+
+ # -- Parquet conversion settings -- #
+ CSV_TO_PARQUET = false
+ MIN_LINES_FOR_PARQUET = 200
+ MINIO_PARQUET_BUCKET = ""
+ MINIO_PARQUET_FOLDER = "" # no trailing slash
+
+ # -- PMTiles conversion settings -- #
+ GEOJSON_TO_PMTILES = false
+ MINIO_PMTILES_BUCKET = ""
+ MINIO_PMTILES_FOLDER = "" # no trailing slash
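With the old shared `MINIO_BUCKET`/`MINIO_FOLDER` pair split per output format, each pipeline builds its own client; this is exactly how csv.py and geojson.py instantiate theirs in this release:

```python
# One MinIOClient per output format, as instantiated in csv.py and geojson.py
# above; the corresponding settings must be filled in for uploads to work.
from udata_hydra import config
from udata_hydra.utils.minio import MinIOClient

parquet_client = MinIOClient(
    bucket=config.MINIO_PARQUET_BUCKET, folder=config.MINIO_PARQUET_FOLDER
)
pmtiles_client = MinIOClient(
    bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER
)
```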
--- udata_hydra-2.1.3.dev7204/udata_hydra/db/resource.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/db/resource.py
@@ -18,6 +18,9 @@ class Resource:
          "ANALYSING_CSV": "resource content currently being analysed by CSV detective",
          "INSERTING_IN_DB": "currently being inserted in DB",
          "CONVERTING_TO_PARQUET": "currently being converted to Parquet",
+         "TO_ANALYSE_GEOJSON": "geojson resource content to be analysed",
+         "ANALYSING_GEOJSON": "geojson resource content currently being analysed",
+         "CONVERTING_TO_PMTILES": "currently being converted to pmtiles",
      }
 
      @classmethod
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql
@@ -0,0 +1,5 @@
+ -- Add PMTiles fields to checks table
+
+ ALTER TABLE checks
+ ADD COLUMN pmtiles_url VARCHAR,
+ ADD COLUMN pmtiles_size BIGINT;
--- udata_hydra-2.1.3.dev7204/udata_hydra/schemas/check.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/schemas/check.py
@@ -24,6 +24,8 @@ class CheckSchema(Schema):
      parsing_table = fields.Str()
      parquet_url = fields.Str()
      parquet_size = fields.Integer()
+     pmtiles_url = fields.Str()
+     pmtiles_size = fields.Integer()
 
      def create(self, data):
          return self.load(data)
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/__init__.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/__init__.py
@@ -3,6 +3,7 @@ from .auth import token_auth_middleware
  from .csv import detect_tabular_from_headers
  from .errors import IOException, ParseException, handle_parse_exception
  from .file import compute_checksum_from_file, download_resource, read_csv_gz
+ from .geojson import detect_geojson_from_headers
  from .http import UdataPayload, get_request_params, is_valid_uri, send
  from .queue import enqueue
  from .reader import Reader, generate_dialect
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/csv.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/csv.py
@@ -1,7 +1,7 @@
  import json
 
 
- async def detect_tabular_from_headers(check: dict) -> tuple[bool, str]:
+ def detect_tabular_from_headers(check: dict) -> tuple[bool, str]:
      """
      Determine from content-type header if file looks like:
      - a csv
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/geojson.py
@@ -0,0 +1,14 @@
+ import json
+
+
+ def detect_geojson_from_headers(check: dict) -> bool:
+     headers: dict = json.loads(check["headers"] or "{}")
+     # in some cases geojson files have the content-type `application/json`
+     # but adding this in the list would not have been a restrictive enough condition
+     # so we check the URL, which is satisfactory for now
+     if any(
+         headers.get("content-type", "").lower().startswith(ct)
+         for ct in ["application/vnd.geo+json"]
+     ) or "geojson" in check.get("url", ""):
+         return True
+     return False
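A quick illustration of the heuristic, with hypothetical check dicts (headers are stored on the check as a JSON string):

```python
# Illustration of the detection heuristic above; URLs and headers are invented.
import json

from udata_hydra.utils import detect_geojson_from_headers

by_content_type = {
    "headers": json.dumps({"content-type": "application/vnd.geo+json"}),
    "url": "https://example.org/a",
}
by_url = {
    "headers": json.dumps({"content-type": "application/json"}),
    "url": "https://example.org/data.geojson",
}
plain_json = {
    "headers": json.dumps({"content-type": "application/json"}),
    "url": "https://example.org/data.json",
}

assert detect_geojson_from_headers(by_content_type) is True
assert detect_geojson_from_headers(by_url) is True  # matched on "geojson" in the URL
assert detect_geojson_from_headers(plain_json) is False
```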
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/http.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/http.py
@@ -23,7 +23,15 @@ class UdataPayload:
              "last-modified-detection",
              "mime-type",
          ],
-         "analysis:parsing": ["error", "finished_at", "parquet_size", "parquet_url", "started_at"],
+         "analysis:parsing": [
+             "error",
+             "started_at",
+             "finished_at",
+             "parquet_size",
+             "parquet_url",
+             "pmtiles_size",
+             "pmtiles_url",
+         ],
      }
 
      def __init__(self, payload: dict):
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/minio.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/minio.py
@@ -9,10 +9,11 @@ log = logging.getLogger("udata-hydra")
 
 
  class MinIOClient:
-     def __init__(self, bucket=config.MINIO_BUCKET):
+     def __init__(self, bucket: str, folder: str):
          self.user = config.MINIO_USER
          self.password = config.MINIO_PWD
          self.bucket = bucket
+         self.folder = folder
          self.client = Minio(
              config.MINIO_URL or "test",
              access_key=self.user or "test",
@@ -26,19 +27,20 @@ class MinIOClient:
 
      def send_file(
          self,
-         file_name,
-         delete_source=True,
+         file_path: str,
+         delete_source: bool = True,
      ) -> str:
          if self.bucket is None:
              raise AttributeError("A bucket has to be specified.")
-         if os.path.isfile(file_name):
+         if os.path.isfile(file_path):
+             file_name = os.path.basename(file_path)
              self.client.fput_object(
                  self.bucket,
-                 f"{config.MINIO_FOLDER}/{file_name}",
-                 file_name,
+                 f"{self.folder}/{file_name}",
+                 file_path,
              )
              if delete_source:
-                 os.remove(file_name)
-             return f"https://{config.MINIO_URL}/{self.bucket}/{config.MINIO_FOLDER}/{file_name}"
+                 os.remove(file_path)
+             return f"https://{config.MINIO_URL}/{self.bucket}/{self.folder}/{file_name}"
          else:
-             raise Exception(f"file '{file_name}' does not exists")
+             raise Exception(f"file '{file_path}' does not exist")
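Usage-wise, `send_file` now takes a full path, keeps only the basename as the object name, and returns the public URL. A sketch with hypothetical values:

```python
# Hypothetical usage of the reworked send_file; bucket, folder and path values
# are invented for the example.
from udata_hydra.utils.minio import MinIOClient

client = MinIOClient(bucket="pmtiles", folder="prod")
url = client.send_file("/tmp/1234.pmtiles", delete_source=False)
# -> "https://<MINIO_URL>/pmtiles/prod/1234.pmtiles"
```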
--- udata_hydra-2.1.3.dev7204/udata_hydra/analysis/helpers.py
+++ /dev/null
@@ -1,27 +0,0 @@
- from datetime import date, datetime
-
- from dateparser import parse as date_parser
- from dateutil.parser import ParserError
- from dateutil.parser import parse as dateutil_parser
-
-
- def to_json(value: str) -> str:
-     """Convenience method, should be casted from string directly by postgres"""
-     return value
-
-
- def _parse_dt(value: str) -> datetime | None:
-     """For performance reasons, we try first with dateutil and fallback on dateparser"""
-     try:
-         return dateutil_parser(value)
-     except ParserError:
-         return date_parser(value)
-
-
- def to_date(value: str) -> date | None:
-     parsed = _parse_dt(value)
-     return parsed.date() if parsed else None
-
-
- def to_datetime(value: str) -> datetime | None:
-     return _parse_dt(value)