udata-hydra 2.2.2.dev7633__tar.gz → 2.2.2.dev7667__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (79)
  1. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/PKG-INFO +2 -2
  2. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/pyproject.toml +2 -2
  3. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/csv.py +20 -1
  4. udata_hydra-2.2.2.dev7667/udata_hydra/analysis/geojson.py +250 -0
  5. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/config_default.toml +5 -0
  6. udata_hydra-2.2.2.dev7667/udata_hydra/migrations/main/20250615_add_geojson_fields.sql +5 -0
  7. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/status.py +5 -0
  8. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/__init__.py +3 -1
  9. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/file.py +7 -0
  10. udata_hydra-2.2.2.dev7633/udata_hydra/analysis/geojson.py +0 -131
  11. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/README.md +0 -0
  12. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/__init__.py +0 -0
  13. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/__init__.py +0 -0
  14. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/helpers.py +0 -0
  15. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/resource.py +0 -0
  16. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/app.py +0 -0
  17. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/cli.py +0 -0
  18. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/context.py +0 -0
  19. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/__init__.py +0 -0
  20. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/calculate_next_check.py +0 -0
  21. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/check_resources.py +0 -0
  22. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/helpers.py +0 -0
  23. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/preprocess_check_data.py +0 -0
  24. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/select_batch.py +0 -0
  25. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/__init__.py +0 -0
  26. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/check.py +0 -0
  27. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/resource.py +0 -0
  28. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/resource_exception.py +0 -0
  29. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/logger.py +0 -0
  30. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/__init__.py +0 -0
  31. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  32. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  33. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  34. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  35. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20250610_migrate_resources_exception.sql +0 -0
  36. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20250626_delete_datetime_iso_references.sql +0 -0
  37. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  38. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  39. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  40. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  41. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  42. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  43. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  44. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  45. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  46. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  47. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  48. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  49. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  50. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  51. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  52. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  53. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  54. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  55. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
  56. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
  57. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +0 -0
  58. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250519_add_format_column_catalog.sql +0 -0
  59. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250610_migrate_resources_exception.sql +0 -0
  60. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250611_add_status_since_catalog.sql +0 -0
  61. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/__init__.py +0 -0
  62. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/checks.py +0 -0
  63. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/resources.py +0 -0
  64. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/resources_exceptions.py +0 -0
  65. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/__init__.py +0 -0
  66. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/check.py +0 -0
  67. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/resource.py +0 -0
  68. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/resource_exception.py +0 -0
  69. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/auth.py +0 -0
  70. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/csv.py +0 -0
  71. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/db.py +0 -0
  72. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/errors.py +0 -0
  73. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/geojson.py +0 -0
  74. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/http.py +0 -0
  75. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/minio.py +0 -0
  76. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/parquet.py +0 -0
  77. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/queue.py +0 -0
  78. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/timer.py +0 -0
  79. {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: udata-hydra
3
- Version: 2.2.2.dev7633
3
+ Version: 2.2.2.dev7667
4
4
  Summary: Async crawler and parsing service for data.gouv.fr
5
5
  License: MIT
6
6
  Author: Opendata Team
@@ -38,7 +38,7 @@ Requires-Dist: python-magic (>=0.4.25)
38
38
  Requires-Dist: python-slugify (>=8.0.4)
39
39
  Requires-Dist: redis (>=4.1.4)
40
40
  Requires-Dist: rq (>=1.11.1)
41
- Requires-Dist: ruff (>=0.5.7) ; extra == "dev"
41
+ Requires-Dist: ruff (>=0.9.3) ; extra == "dev"
42
42
  Requires-Dist: sentry-sdk (>=2.10.0)
43
43
  Requires-Dist: setuptools (>=70.3.0)
44
44
  Requires-Dist: sqlalchemy (>=1.4.46)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "udata-hydra"
3
- version = "2.2.2.dev7633"
3
+ version = "2.2.2.dev7667"
4
4
  description = "Async crawler and parsing service for data.gouv.fr"
5
5
  authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
6
6
  dependencies = [
@@ -45,7 +45,7 @@ dev = [
45
45
  "pytest-asyncio>=0.18.3",
46
46
  "pytest-cov>=5.0.0",
47
47
  "pytest-mock>=3.7.0",
48
- "ruff>=0.5.7",
48
+ "ruff>=0.9.3",
49
49
  ]
50
50
 
51
51
  [tool.mypy]
@@ -5,7 +5,6 @@ import logging
5
5
  import os
6
6
  import sys
7
7
  from datetime import datetime, timezone
8
- from math import isnan
9
8
  from typing import Iterator
10
9
 
11
10
  import pandas as pd
@@ -33,6 +32,7 @@ from sqlalchemy.schema import CreateIndex, CreateTable, Index
33
32
 
34
33
  from udata_hydra import config, context
35
34
  from udata_hydra.analysis import helpers
35
+ from udata_hydra.analysis.geojson import csv_to_geojson_and_pmtiles
36
36
  from udata_hydra.db import compute_insert_query
37
37
  from udata_hydra.db.check import Check
38
38
  from udata_hydra.db.resource import Resource
@@ -43,6 +43,7 @@ from udata_hydra.utils import (
43
43
  Timer,
44
44
  detect_tabular_from_headers,
45
45
  handle_parse_exception,
46
+ remove_remainders,
46
47
  )
47
48
  from udata_hydra.utils.minio import MinIOClient
48
49
  from udata_hydra.utils.parquet import save_as_parquet
@@ -160,10 +161,24 @@ async def analyse_csv(
160
161
  )
161
162
  timer.mark("csv-to-parquet")
162
163
  except Exception as e:
164
+ remove_remainders(resource_id, ["parquet"])
163
165
  raise ParseException(
164
166
  step="parquet_export", resource_id=resource_id, url=url, check_id=check["id"]
165
167
  ) from e
166
168
 
169
+ try:
170
+ geojson_args: tuple[str, int, str, int] | None = await csv_to_geojson_and_pmtiles(
171
+ df=df,
172
+ inspection=csv_inspection,
173
+ resource_id=resource_id,
174
+ )
175
+ timer.mark("csv-to-geojson-pmtiles")
176
+ except Exception as e:
177
+ remove_remainders(resource_id, ["geojson", "pmtiles", "pmtiles-journal"])
178
+ raise ParseException(
179
+ step="geojson_export", resource_id=resource_id, url=url, check_id=check["id"]
180
+ ) from e
181
+
167
182
  check = await Check.update(
168
183
  check["id"],
169
184
  {
@@ -171,6 +186,10 @@ async def analyse_csv(
171
186
  "parsing_finished_at": datetime.now(timezone.utc),
172
187
  "parquet_url": parquet_args[0] if parquet_args else None,
173
188
  "parquet_size": parquet_args[1] if parquet_args else None,
189
+ "geojson_url": geojson_args[0] if geojson_args else None,
190
+ "geojson_size": geojson_args[1] if geojson_args else None,
191
+ "pmtiles_url": geojson_args[2] if geojson_args else None,
192
+ "pmtiles_size": geojson_args[3] if geojson_args else None,
174
193
  },
175
194
  )
176
195
  await csv_to_db_index(table_name, csv_inspection, check)
@@ -0,0 +1,250 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from datetime import datetime, timezone
5
+
6
+ import pandas as pd
7
+ import tippecanoe
8
+ from asyncpg import Record
9
+
10
+ from udata_hydra import config
11
+ from udata_hydra.analysis import helpers
12
+ from udata_hydra.db.check import Check
13
+ from udata_hydra.db.resource import Resource
14
+ from udata_hydra.db.resource_exception import ResourceException
15
+ from udata_hydra.utils import (
16
+ IOException,
17
+ ParseException,
18
+ Timer,
19
+ handle_parse_exception,
20
+ )
21
+ from udata_hydra.utils.minio import MinIOClient
22
+
23
+ log = logging.getLogger("udata-hydra")
24
+ minio_client_pmtiles = MinIOClient(
25
+ bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER
26
+ )
27
+ minio_client_geojson = MinIOClient(
28
+ bucket=config.MINIO_GEOJSON_BUCKET, folder=config.MINIO_GEOJSON_FOLDER
29
+ )
30
+
31
+
32
+ async def analyse_geojson(
33
+ check: dict,
34
+ file_path: str | None = None,
35
+ ) -> None:
36
+ """Launch GeoJSON analysis from a check or an URL (debug), using previously downloaded file at file_path if any"""
37
+ if not config.GEOJSON_TO_PMTILES:
38
+ log.debug("GEOJSON_TO_PMTILES turned off, skipping.")
39
+ return
40
+
41
+ resource_id: str = str(check["resource_id"])
42
+ url = check["url"]
43
+
44
+ # Update resource status to ANALYSING_GEOJSON
45
+ resource: Record | None = await Resource.update(resource_id, {"status": "ANALYSING_GEOJSON"})
46
+
47
+ # Check if the resource is in the exceptions table
48
+ exception: Record | None = await ResourceException.get_by_resource_id(resource_id)
49
+
50
+ timer = Timer("analyse-geojson")
51
+ assert any(_ is not None for _ in (check["id"], url))
52
+
53
+ tmp_file = None
54
+ try:
55
+ tmp_file = await helpers.read_or_download_file(
56
+ check=check,
57
+ file_path=file_path,
58
+ file_format="geojson",
59
+ exception=exception,
60
+ )
61
+ timer.mark("download-file")
62
+
63
+ check = await Check.update(check["id"], {"parsing_started_at": datetime.now(timezone.utc)})
64
+
65
+ # Convert to PMTiles
66
+ try:
67
+ pmtiles_url, pmtiles_size = await geojson_to_pmtiles(
68
+ file_path=tmp_file.name,
69
+ resource_id=resource_id,
70
+ )
71
+ timer.mark("geojson-to-pmtiles")
72
+ except Exception as e:
73
+ raise ParseException(
74
+ step="pmtiles_export", resource_id=resource_id, url=url, check_id=check["id"]
75
+ ) from e
76
+
77
+ check = await Check.update(
78
+ check["id"],
79
+ {
80
+ "parsing_finished_at": datetime.now(timezone.utc),
81
+ "pmtiles_url": pmtiles_url,
82
+ "pmtiles_size": pmtiles_size,
83
+ },
84
+ )
85
+
86
+ except (ParseException, IOException) as e:
87
+ await handle_parse_exception(e, None, check)
88
+ finally:
89
+ await helpers.notify_udata(resource, check)
90
+ timer.stop()
91
+ if tmp_file is not None:
92
+ tmp_file.close()
93
+ os.remove(tmp_file.name)
94
+
95
+ # Reset resource status to None
96
+ await Resource.update(resource_id, {"status": None})
97
+
98
+
99
+ async def geojson_to_pmtiles(
100
+ file_path: str,
101
+ resource_id: str | None = None,
102
+ ) -> tuple[str, int]:
103
+ """
104
+ Convert a GeoJSON file to PMTiles format.
105
+
106
+ Args:
107
+ file_path: GeoJSON file path to convert.
108
+ resource_id: Optional resource ID for status updates.
109
+
110
+ Returns:
111
+ pmtiles_url: URL of the PMTiles file.
112
+ pmtiles_size: size of the PMTiles file.
113
+ """
114
+
115
+ log.debug(f"Converting GeoJSON to PMTiles for {file_path}")
116
+
117
+ if resource_id:
118
+ await Resource.update(resource_id, {"status": "CONVERTING_TO_PMTILES"})
119
+
120
+ output_pmtiles = f"{resource_id}.pmtiles"
121
+
122
+ command = [
123
+ "--maximum-zoom=g", # guess
124
+ "-o",
125
+ output_pmtiles,
126
+ "--coalesce-densest-as-needed",
127
+ "--extend-zooms-if-still-dropping",
128
+ file_path,
129
+ ]
130
+ exit_code = tippecanoe._program("tippecanoe", *command)
131
+ if exit_code:
132
+ raise ValueError(f"GeoJSON to PMTiles conversion failed for {file_path}")
133
+ log.debug(f"Successfully converted {file_path} to {output_pmtiles}")
134
+
135
+ pmtiles_size = os.path.getsize(output_pmtiles)
136
+ pmtiles_url: str = minio_client_pmtiles.send_file(output_pmtiles)
137
+
138
+ return pmtiles_url, pmtiles_size
139
+
140
+
141
+ async def csv_to_geojson_and_pmtiles(
142
+ df: pd.DataFrame,
143
+ inspection: dict,
144
+ resource_id: str | None = None,
145
+ ) -> tuple[str, int, str, int] | None:
146
+ def cast_latlon(latlon: str) -> list[float, float]:
147
+ # we can safely do this as the detection was successful
148
+ lat, lon = latlon.replace(" ", "").split(",")
149
+ # using the geojson standard: longitude before latitude
150
+ return [float(lon), float(lat)]
151
+
152
+ def prevent_nan(value):
153
+ # convenience to prevent downstream crash (NaN in json or PMtiles)
154
+ if pd.isna(value):
155
+ return None
156
+ return value
157
+
158
+ if not config.CSV_TO_GEOJSON:
159
+ log.debug("CSV_TO_GEOJSON turned off, skipping geojson/PMtiles export.")
160
+ return
161
+
162
+ log.debug(
163
+ f"Converting to geojson and PMtiles if relevant for {resource_id} and sending to Minio."
164
+ )
165
+
166
+ geo = {}
167
+ for column, detection in inspection["columns"].items():
168
+ # see csv-detective's geo formats:
169
+ # https://github.com/datagouv/csv-detective/tree/master/csv_detective/detect_fields/geo
170
+ if "geojson" in detection["format"]:
171
+ geo["geometry"] = column
172
+ break
173
+ if "latlon" in detection["format"]:
174
+ geo["latlon"] = column
175
+ break
176
+ if "latitude" in detection["format"]:
177
+ geo["lat"] = column
178
+ if "longitude" in detection["format"]:
179
+ geo["lon"] = column
180
+ # a geometry or latlon column (whichever comes first in column order, as the loop breaks on it) takes precedence over latitude + longitude
181
+ if "geometry" in geo:
182
+ geo = {"geometry": geo["geometry"]}
183
+ if "latlon" in geo:
184
+ geo = {"latlon": geo["latlon"]}
185
+ if not geo or (("lat" in geo and "lon" not in geo) or ("lon" in geo and "lat" not in geo)):
186
+ log.debug("No geographical columns found, skipping")
187
+ return None
188
+
189
+ if resource_id:
190
+ await Resource.update(resource_id, {"status": "CONVERTING_TO_GEOJSON"})
191
+
192
+ template = {"type": "FeatureCollection", "features": []}
193
+ for _, row in df.iterrows():
194
+ if "geometry" in geo:
195
+ template["features"].append(
196
+ {
197
+ "type": "Feature",
198
+ # json is not pre-cast by csv-detective
199
+ "geometry": json.loads(row[geo["geometry"]]),
200
+ "properties": {
201
+ col: prevent_nan(row[col]) for col in df.columns if col != geo["geometry"]
202
+ },
203
+ }
204
+ )
205
+ elif "latlon" in geo:
206
+ # ending up here means we either have the exact lat,lon format, or NaN
207
+ # skipping row if NaN
208
+ if pd.isna(row[geo["latlon"]]):
209
+ continue
210
+ template["features"].append(
211
+ {
212
+ "type": "Feature",
213
+ "geometry": {
214
+ "type": "Point",
215
+ "coordinates": cast_latlon(row[geo["latlon"]]),
216
+ },
217
+ "properties": {
218
+ col: prevent_nan(row[col]) for col in df.columns if col != geo["latlon"]
219
+ },
220
+ }
221
+ )
222
+ else:
223
+ # skipping row if lat or lon is NaN
224
+ if any(pd.isna(coord) for coord in (row[geo["lon"]], row[geo["lat"]])):
225
+ continue
226
+ template["features"].append(
227
+ {
228
+ "type": "Feature",
229
+ "geometry": {
230
+ "type": "Point",
231
+ # these columns are precast by csv-detective
232
+ "coordinates": [row[geo["lon"]], row[geo["lat"]]],
233
+ },
234
+ "properties": {
235
+ col: prevent_nan(row[col])
236
+ for col in df.columns
237
+ if col not in [geo["lon"], geo["lat"]]
238
+ },
239
+ }
240
+ )
241
+ geojson_file = f"{resource_id}.geojson"
242
+ with open(geojson_file, "w") as f:
243
+ json.dump(template, f, indent=4, ensure_ascii=False, default=str)
244
+ geojson_size = os.path.getsize(geojson_file)
245
+
246
+ pmtiles_url, pmtiles_size = await geojson_to_pmtiles(geojson_file, resource_id)
247
+
248
+ geojson_url: str = minio_client_geojson.send_file(geojson_file)
249
+
250
+ return geojson_url, geojson_size, pmtiles_url, pmtiles_size
@@ -87,3 +87,8 @@ MINIO_PARQUET_FOLDER = "" # no trailing slash
87
87
  GEOJSON_TO_PMTILES = false
88
88
  MINIO_PMTILES_BUCKET = ""
89
89
  MINIO_PMTILES_FOLDER = "" # no trailing slash
90
+
91
+ # -- Geojson conversion settings -- #
92
+ CSV_TO_GEOJSON = false
93
+ MINIO_GEOJSON_BUCKET = ""
94
+ MINIO_GEOJSON_FOLDER = "" # no trailing slash
@@ -0,0 +1,5 @@
1
+ -- Add GeoJSON fields to checks table
2
+
3
+ ALTER TABLE checks
4
+ ADD COLUMN geojson_url VARCHAR,
5
+ ADD COLUMN geojson_size BIGINT;
@@ -147,5 +147,10 @@ async def get_health(request: web.Request) -> web.Response:
147
147
  {
148
148
  "version": config.APP_VERSION,
149
149
  "environment": config.ENVIRONMENT or "unknown",
150
+ "csv_analysis": config.CSV_ANALYSIS,
151
+ "csv_to_db": config.CSV_TO_DB,
152
+ "csv_to_parquet": config.CSV_TO_PARQUET,
153
+ "geojson_to_pmtiles": config.GEOJSON_TO_PMTILES,
154
+ "csv_to_geojson": config.CSV_TO_GEOJSON,
150
155
  }
151
156
  )
@@ -1,7 +1,7 @@
1
1
  from .auth import token_auth_middleware
2
2
  from .csv import detect_tabular_from_headers
3
3
  from .errors import IOException, ParseException, handle_parse_exception
4
- from .file import compute_checksum_from_file, download_resource
4
+ from .file import compute_checksum_from_file, download_resource, read_csv_gz, remove_remainders
5
5
  from .geojson import detect_geojson_from_headers_or_catalog
6
6
  from .http import UdataPayload, get_request_params, send
7
7
  from .queue import enqueue
@@ -15,6 +15,8 @@ __all__ = [
15
15
  "handle_parse_exception",
16
16
  "compute_checksum_from_file",
17
17
  "download_resource",
18
+ "read_csv_gz",
19
+ "remove_remainders",
18
20
  "detect_geojson_from_headers_or_catalog",
19
21
  "UdataPayload",
20
22
  "get_request_params",
@@ -80,3 +80,10 @@ async def download_resource(
80
80
  ]:
81
81
  tmp_file = read_csv_gz(tmp_file.name)
82
82
  return tmp_file
83
+
84
+
85
+ def remove_remainders(resource_id: str, extensions: list[str]) -> None:
86
+ """Delete potential remainders from process that crashed"""
87
+ for ext in extensions:
88
+ if os.path.exists(f"{resource_id}.{ext}"):
89
+ os.remove(f"{resource_id}.{ext}")
@@ -1,131 +0,0 @@
1
- import logging
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- import tippecanoe
6
- from asyncpg import Record
7
-
8
- from udata_hydra import config
9
- from udata_hydra.analysis import helpers
10
- from udata_hydra.db.check import Check
11
- from udata_hydra.db.resource import Resource
12
- from udata_hydra.db.resource_exception import ResourceException
13
- from udata_hydra.utils import (
14
- IOException,
15
- ParseException,
16
- Timer,
17
- handle_parse_exception,
18
- )
19
- from udata_hydra.utils.minio import MinIOClient
20
-
21
- log = logging.getLogger("udata-hydra")
22
- minio_client = MinIOClient(bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER)
23
-
24
-
25
- async def analyse_geojson(
26
- check: dict,
27
- file_path: str | None = None,
28
- ) -> None:
29
- """Launch GeoJSON analysis from a check or an URL (debug), using previously downloaded file at file_path if any"""
30
- if not config.GEOJSON_TO_PMTILES:
31
- log.debug("GEOJSON_TO_PMTILES turned off, skipping.")
32
- return
33
-
34
- resource_id: str = str(check["resource_id"])
35
- url = check["url"]
36
-
37
- # Update resource status to ANALYSING_GEOJSON
38
- resource: Record | None = await Resource.update(resource_id, {"status": "ANALYSING_GEOJSON"})
39
-
40
- # Check if the resource is in the exceptions table
41
- exception: Record | None = await ResourceException.get_by_resource_id(resource_id)
42
-
43
- timer = Timer("analyse-geojson")
44
- assert any(_ is not None for _ in (check["id"], url))
45
-
46
- tmp_file = None
47
- try:
48
- tmp_file = await helpers.read_or_download_file(
49
- check=check,
50
- file_path=file_path,
51
- file_format="geojson",
52
- exception=exception,
53
- )
54
- timer.mark("download-file")
55
-
56
- check = await Check.update(check["id"], {"parsing_started_at": datetime.now(timezone.utc)})
57
-
58
- # Convert to PMTiles
59
- try:
60
- pmtiles_url, pmtiles_size = await geojson_to_pmtiles(
61
- file_path=tmp_file.name,
62
- resource_id=resource_id,
63
- )
64
- timer.mark("geojson-to-pmtiles")
65
- except Exception as e:
66
- raise ParseException(
67
- step="pmtiles_export", resource_id=resource_id, url=url, check_id=check["id"]
68
- ) from e
69
-
70
- check = await Check.update(
71
- check["id"],
72
- {
73
- "parsing_finished_at": datetime.now(timezone.utc),
74
- "pmtiles_url": pmtiles_url,
75
- "pmtiles_size": pmtiles_size,
76
- },
77
- )
78
-
79
- except (ParseException, IOException) as e:
80
- await handle_parse_exception(e, None, check)
81
- finally:
82
- await helpers.notify_udata(resource, check)
83
- timer.stop()
84
- if tmp_file is not None:
85
- tmp_file.close()
86
- os.remove(tmp_file.name)
87
-
88
- # Reset resource status to None
89
- await Resource.update(resource_id, {"status": None})
90
-
91
-
92
- async def geojson_to_pmtiles(
93
- file_path: str,
94
- resource_id: str | None = None,
95
- ) -> tuple[str, int]:
96
- """
97
- Convert a GeoJSON file to PMTiles format.
98
-
99
- Args:
100
- file_path: GeoJSON file path to convert.
101
- resource_id: Optional resource ID for status updates.
102
-
103
- Returns:
104
- pmtiles_url: URL of the PMTiles file.
105
- pmtiles_size: size of the PMTiles file.
106
- """
107
-
108
- log.debug(f"Converting GeoJSON to PMTiles for {file_path}")
109
-
110
- if resource_id:
111
- await Resource.update(resource_id, {"status": "CONVERTING_TO_PMTILES"})
112
-
113
- output_pmtiles = f"{resource_id}.pmtiles"
114
-
115
- command = [
116
- "--maximum-zoom=g", # guess
117
- "-o",
118
- output_pmtiles,
119
- "--coalesce-densest-as-needed",
120
- "--extend-zooms-if-still-dropping",
121
- file_path,
122
- ]
123
- exit_code = tippecanoe._program("tippecanoe", *command)
124
- if exit_code:
125
- raise ValueError(f"GeoJSON to PMTiles conversion failed for {file_path}")
126
- log.debug(f"Successfully converted {file_path} to {output_pmtiles}")
127
-
128
- pmtiles_size = os.path.getsize(output_pmtiles)
129
- pmtiles_url: str = minio_client.send_file(output_pmtiles)
130
-
131
- return pmtiles_url, pmtiles_size