udata-hydra 2.2.2.dev7633.tar.gz → 2.2.2.dev7667.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/PKG-INFO +2 -2
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/pyproject.toml +2 -2
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/csv.py +20 -1
- udata_hydra-2.2.2.dev7667/udata_hydra/analysis/geojson.py +250 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/config_default.toml +5 -0
- udata_hydra-2.2.2.dev7667/udata_hydra/migrations/main/20250615_add_geojson_fields.sql +5 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/status.py +5 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/__init__.py +3 -1
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/file.py +7 -0
- udata_hydra-2.2.2.dev7633/udata_hydra/analysis/geojson.py +0 -131
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/README.md +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/helpers.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/analysis/resource.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/app.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/cli.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/context.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/calculate_next_check.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/check_resources.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/helpers.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/preprocess_check_data.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/select_batch.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/check.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/resource.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/resource_exception.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/logger.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20250610_migrate_resources_exception.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/csv/20250626_delete_datetime_iso_references.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250519_add_format_column_catalog.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250610_migrate_resources_exception.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/migrations/main/20250611_add_status_since_catalog.sql +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/checks.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/resources.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/resources_exceptions.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/check.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/resource.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/resource_exception.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/auth.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/csv.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/db.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/errors.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/geojson.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/http.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/minio.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/parquet.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/queue.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/utils/timer.py +0 -0
- {udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: udata-hydra
|
|
3
|
-
Version: 2.2.2.dev7633
|
|
3
|
+
Version: 2.2.2.dev7667
|
|
4
4
|
Summary: Async crawler and parsing service for data.gouv.fr
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Opendata Team
|
|
@@ -38,7 +38,7 @@ Requires-Dist: python-magic (>=0.4.25)
|
|
|
38
38
|
Requires-Dist: python-slugify (>=8.0.4)
|
|
39
39
|
Requires-Dist: redis (>=4.1.4)
|
|
40
40
|
Requires-Dist: rq (>=1.11.1)
|
|
41
|
-
Requires-Dist: ruff (>=0.
|
|
41
|
+
Requires-Dist: ruff (>=0.9.3) ; extra == "dev"
|
|
42
42
|
Requires-Dist: sentry-sdk (>=2.10.0)
|
|
43
43
|
Requires-Dist: setuptools (>=70.3.0)
|
|
44
44
|
Requires-Dist: sqlalchemy (>=1.4.46)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "udata-hydra"
|
|
3
|
-
version = "2.2.2.dev7633"
|
|
3
|
+
version = "2.2.2.dev7667"
|
|
4
4
|
description = "Async crawler and parsing service for data.gouv.fr"
|
|
5
5
|
authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
|
|
6
6
|
dependencies = [
|
|
@@ -45,7 +45,7 @@ dev = [
|
|
|
45
45
|
"pytest-asyncio>=0.18.3",
|
|
46
46
|
"pytest-cov>=5.0.0",
|
|
47
47
|
"pytest-mock>=3.7.0",
|
|
48
|
-
"ruff>=0.
|
|
48
|
+
"ruff>=0.9.3",
|
|
49
49
|
]
|
|
50
50
|
|
|
51
51
|
[tool.mypy]
|
|
@@ -5,7 +5,6 @@ import logging
|
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
7
|
from datetime import datetime, timezone
|
|
8
|
-
from math import isnan
|
|
9
8
|
from typing import Iterator
|
|
10
9
|
|
|
11
10
|
import pandas as pd
|
|
@@ -33,6 +32,7 @@ from sqlalchemy.schema import CreateIndex, CreateTable, Index
|
|
|
33
32
|
|
|
34
33
|
from udata_hydra import config, context
|
|
35
34
|
from udata_hydra.analysis import helpers
|
|
35
|
+
from udata_hydra.analysis.geojson import csv_to_geojson_and_pmtiles
|
|
36
36
|
from udata_hydra.db import compute_insert_query
|
|
37
37
|
from udata_hydra.db.check import Check
|
|
38
38
|
from udata_hydra.db.resource import Resource
|
|
@@ -43,6 +43,7 @@ from udata_hydra.utils import (
|
|
|
43
43
|
Timer,
|
|
44
44
|
detect_tabular_from_headers,
|
|
45
45
|
handle_parse_exception,
|
|
46
|
+
remove_remainders,
|
|
46
47
|
)
|
|
47
48
|
from udata_hydra.utils.minio import MinIOClient
|
|
48
49
|
from udata_hydra.utils.parquet import save_as_parquet
|
|
@@ -160,10 +161,24 @@ async def analyse_csv(
|
|
|
160
161
|
)
|
|
161
162
|
timer.mark("csv-to-parquet")
|
|
162
163
|
except Exception as e:
|
|
164
|
+
remove_remainders(resource_id, ["parquet"])
|
|
163
165
|
raise ParseException(
|
|
164
166
|
step="parquet_export", resource_id=resource_id, url=url, check_id=check["id"]
|
|
165
167
|
) from e
|
|
166
168
|
|
|
169
|
+
try:
|
|
170
|
+
geojson_args: tuple[str, int, str, int] | None = await csv_to_geojson_and_pmtiles(
|
|
171
|
+
df=df,
|
|
172
|
+
inspection=csv_inspection,
|
|
173
|
+
resource_id=resource_id,
|
|
174
|
+
)
|
|
175
|
+
timer.mark("csv-to-geojson-pmtiles")
|
|
176
|
+
except Exception as e:
|
|
177
|
+
remove_remainders(resource_id, ["geojson", "pmtiles", "pmtiles-journal"])
|
|
178
|
+
raise ParseException(
|
|
179
|
+
step="geojson_export", resource_id=resource_id, url=url, check_id=check["id"]
|
|
180
|
+
) from e
|
|
181
|
+
|
|
167
182
|
check = await Check.update(
|
|
168
183
|
check["id"],
|
|
169
184
|
{
|
|
@@ -171,6 +186,10 @@ async def analyse_csv(
|
|
|
171
186
|
"parsing_finished_at": datetime.now(timezone.utc),
|
|
172
187
|
"parquet_url": parquet_args[0] if parquet_args else None,
|
|
173
188
|
"parquet_size": parquet_args[1] if parquet_args else None,
|
|
189
|
+
"geojson_url": geojson_args[0] if geojson_args else None,
|
|
190
|
+
"geojson_size": geojson_args[1] if geojson_args else None,
|
|
191
|
+
"pmtiles_url": geojson_args[2] if geojson_args else None,
|
|
192
|
+
"pmtiles_size": geojson_args[3] if geojson_args else None,
|
|
174
193
|
},
|
|
175
194
|
)
|
|
176
195
|
await csv_to_db_index(table_name, csv_inspection, check)
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import tippecanoe
|
|
8
|
+
from asyncpg import Record
|
|
9
|
+
|
|
10
|
+
from udata_hydra import config
|
|
11
|
+
from udata_hydra.analysis import helpers
|
|
12
|
+
from udata_hydra.db.check import Check
|
|
13
|
+
from udata_hydra.db.resource import Resource
|
|
14
|
+
from udata_hydra.db.resource_exception import ResourceException
|
|
15
|
+
from udata_hydra.utils import (
|
|
16
|
+
IOException,
|
|
17
|
+
ParseException,
|
|
18
|
+
Timer,
|
|
19
|
+
handle_parse_exception,
|
|
20
|
+
)
|
|
21
|
+
from udata_hydra.utils.minio import MinIOClient
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger("udata-hydra")
|
|
24
|
+
minio_client_pmtiles = MinIOClient(
|
|
25
|
+
bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER
|
|
26
|
+
)
|
|
27
|
+
minio_client_geojson = MinIOClient(
|
|
28
|
+
bucket=config.MINIO_GEOJSON_BUCKET, folder=config.MINIO_GEOJSON_FOLDER
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
async def analyse_geojson(
|
|
33
|
+
check: dict,
|
|
34
|
+
file_path: str | None = None,
|
|
35
|
+
) -> None:
|
|
36
|
+
"""Launch GeoJSON analysis from a check or an URL (debug), using previously downloaded file at file_path if any"""
|
|
37
|
+
if not config.GEOJSON_TO_PMTILES:
|
|
38
|
+
log.debug("GEOJSON_TO_PMTILES turned off, skipping.")
|
|
39
|
+
return
|
|
40
|
+
|
|
41
|
+
resource_id: str = str(check["resource_id"])
|
|
42
|
+
url = check["url"]
|
|
43
|
+
|
|
44
|
+
# Update resource status to ANALYSING_GEOJSON
|
|
45
|
+
resource: Record | None = await Resource.update(resource_id, {"status": "ANALYSING_GEOJSON"})
|
|
46
|
+
|
|
47
|
+
# Check if the resource is in the exceptions table
|
|
48
|
+
exception: Record | None = await ResourceException.get_by_resource_id(resource_id)
|
|
49
|
+
|
|
50
|
+
timer = Timer("analyse-geojson")
|
|
51
|
+
assert any(_ is not None for _ in (check["id"], url))
|
|
52
|
+
|
|
53
|
+
tmp_file = None
|
|
54
|
+
try:
|
|
55
|
+
tmp_file = await helpers.read_or_download_file(
|
|
56
|
+
check=check,
|
|
57
|
+
file_path=file_path,
|
|
58
|
+
file_format="geojson",
|
|
59
|
+
exception=exception,
|
|
60
|
+
)
|
|
61
|
+
timer.mark("download-file")
|
|
62
|
+
|
|
63
|
+
check = await Check.update(check["id"], {"parsing_started_at": datetime.now(timezone.utc)})
|
|
64
|
+
|
|
65
|
+
# Convert to PMTiles
|
|
66
|
+
try:
|
|
67
|
+
pmtiles_url, pmtiles_size = await geojson_to_pmtiles(
|
|
68
|
+
file_path=tmp_file.name,
|
|
69
|
+
resource_id=resource_id,
|
|
70
|
+
)
|
|
71
|
+
timer.mark("geojson-to-pmtiles")
|
|
72
|
+
except Exception as e:
|
|
73
|
+
raise ParseException(
|
|
74
|
+
step="pmtiles_export", resource_id=resource_id, url=url, check_id=check["id"]
|
|
75
|
+
) from e
|
|
76
|
+
|
|
77
|
+
check = await Check.update(
|
|
78
|
+
check["id"],
|
|
79
|
+
{
|
|
80
|
+
"parsing_finished_at": datetime.now(timezone.utc),
|
|
81
|
+
"pmtiles_url": pmtiles_url,
|
|
82
|
+
"pmtiles_size": pmtiles_size,
|
|
83
|
+
},
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
except (ParseException, IOException) as e:
|
|
87
|
+
await handle_parse_exception(e, None, check)
|
|
88
|
+
finally:
|
|
89
|
+
await helpers.notify_udata(resource, check)
|
|
90
|
+
timer.stop()
|
|
91
|
+
if tmp_file is not None:
|
|
92
|
+
tmp_file.close()
|
|
93
|
+
os.remove(tmp_file.name)
|
|
94
|
+
|
|
95
|
+
# Reset resource status to None
|
|
96
|
+
await Resource.update(resource_id, {"status": None})
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def geojson_to_pmtiles(
|
|
100
|
+
file_path: str,
|
|
101
|
+
resource_id: str | None = None,
|
|
102
|
+
) -> tuple[str, int]:
|
|
103
|
+
"""
|
|
104
|
+
Convert a GeoJSON file to PMTiles format.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
file_path: GeoJSON file path to convert.
|
|
108
|
+
resource_id: Optional resource ID for status updates.
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
pmtiles_url: URL of the PMTiles file.
|
|
112
|
+
pmtiles_size: size of the PMTiles file.
|
|
113
|
+
"""
|
|
114
|
+
|
|
115
|
+
log.debug(f"Converting GeoJSON to PMTiles for {file_path}")
|
|
116
|
+
|
|
117
|
+
if resource_id:
|
|
118
|
+
await Resource.update(resource_id, {"status": "CONVERTING_TO_PMTILES"})
|
|
119
|
+
|
|
120
|
+
output_pmtiles = f"{resource_id}.pmtiles"
|
|
121
|
+
|
|
122
|
+
command = [
|
|
123
|
+
"--maximum-zoom=g", # guess
|
|
124
|
+
"-o",
|
|
125
|
+
output_pmtiles,
|
|
126
|
+
"--coalesce-densest-as-needed",
|
|
127
|
+
"--extend-zooms-if-still-dropping",
|
|
128
|
+
file_path,
|
|
129
|
+
]
|
|
130
|
+
exit_code = tippecanoe._program("tippecanoe", *command)
|
|
131
|
+
if exit_code:
|
|
132
|
+
raise ValueError(f"GeoJSON to PMTiles conversion failed for {file_path}")
|
|
133
|
+
log.debug(f"Successfully converted {file_path} to {output_pmtiles}")
|
|
134
|
+
|
|
135
|
+
pmtiles_size = os.path.getsize(output_pmtiles)
|
|
136
|
+
pmtiles_url: str = minio_client_pmtiles.send_file(output_pmtiles)
|
|
137
|
+
|
|
138
|
+
return pmtiles_url, pmtiles_size
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
async def csv_to_geojson_and_pmtiles(
|
|
142
|
+
df: pd.DataFrame,
|
|
143
|
+
inspection: dict,
|
|
144
|
+
resource_id: str | None = None,
|
|
145
|
+
) -> tuple[str, int, str, int] | None:
|
|
146
|
+
def cast_latlon(latlon: str) -> list[float, float]:
|
|
147
|
+
# we can safely do this as the detection was successful
|
|
148
|
+
lat, lon = latlon.replace(" ", "").split(",")
|
|
149
|
+
# using the geojson standard: longitude before latitude
|
|
150
|
+
return [float(lon), float(lat)]
|
|
151
|
+
|
|
152
|
+
def prevent_nan(value):
|
|
153
|
+
# convenience to prevent downstream crash (NaN in json or PMtiles)
|
|
154
|
+
if pd.isna(value):
|
|
155
|
+
return None
|
|
156
|
+
return value
|
|
157
|
+
|
|
158
|
+
if not config.CSV_TO_GEOJSON:
|
|
159
|
+
log.debug("CSV_TO_GEOJSON turned off, skipping geojson/PMtiles export.")
|
|
160
|
+
return
|
|
161
|
+
|
|
162
|
+
log.debug(
|
|
163
|
+
f"Converting to geojson and PMtiles if relevant for {resource_id} and sending to Minio."
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
geo = {}
|
|
167
|
+
for column, detection in inspection["columns"].items():
|
|
168
|
+
# see csv-detective's geo formats:
|
|
169
|
+
# https://github.com/datagouv/csv-detective/tree/master/csv_detective/detect_fields/geo
|
|
170
|
+
if "geojson" in detection["format"]:
|
|
171
|
+
geo["geometry"] = column
|
|
172
|
+
break
|
|
173
|
+
if "latlon" in detection["format"]:
|
|
174
|
+
geo["latlon"] = column
|
|
175
|
+
break
|
|
176
|
+
if "latitude" in detection["format"]:
|
|
177
|
+
geo["lat"] = column
|
|
178
|
+
if "longitude" in detection["format"]:
|
|
179
|
+
geo["lon"] = column
|
|
180
|
+
# priority is given to geometry, then latlon, then latitude + longitude
|
|
181
|
+
if "geometry" in geo:
|
|
182
|
+
geo = {"geometry": geo["geometry"]}
|
|
183
|
+
if "latlon" in geo:
|
|
184
|
+
geo = {"latlon": geo["latlon"]}
|
|
185
|
+
if not geo or (("lat" in geo and "lon" not in geo) or ("lon" in geo and "lat" not in geo)):
|
|
186
|
+
log.debug("No geographical columns found, skipping")
|
|
187
|
+
return None
|
|
188
|
+
|
|
189
|
+
if resource_id:
|
|
190
|
+
await Resource.update(resource_id, {"status": "CONVERTING_TO_GEOJSON"})
|
|
191
|
+
|
|
192
|
+
template = {"type": "FeatureCollection", "features": []}
|
|
193
|
+
for _, row in df.iterrows():
|
|
194
|
+
if "geometry" in geo:
|
|
195
|
+
template["features"].append(
|
|
196
|
+
{
|
|
197
|
+
"type": "Feature",
|
|
198
|
+
# json is not pre-cast by csv-detective
|
|
199
|
+
"geometry": json.loads(row[geo["geometry"]]),
|
|
200
|
+
"properties": {
|
|
201
|
+
col: prevent_nan(row[col]) for col in df.columns if col != geo["geometry"]
|
|
202
|
+
},
|
|
203
|
+
}
|
|
204
|
+
)
|
|
205
|
+
elif "latlon" in geo:
|
|
206
|
+
# ending up here means we either have the exact lat,lon format, or NaN
|
|
207
|
+
# skipping row if NaN
|
|
208
|
+
if pd.isna(row[geo["latlon"]]):
|
|
209
|
+
continue
|
|
210
|
+
template["features"].append(
|
|
211
|
+
{
|
|
212
|
+
"type": "Feature",
|
|
213
|
+
"geometry": {
|
|
214
|
+
"type": "Point",
|
|
215
|
+
"coordinates": cast_latlon(row[geo["latlon"]]),
|
|
216
|
+
},
|
|
217
|
+
"properties": {
|
|
218
|
+
col: prevent_nan(row[col]) for col in df.columns if col != geo["latlon"]
|
|
219
|
+
},
|
|
220
|
+
}
|
|
221
|
+
)
|
|
222
|
+
else:
|
|
223
|
+
# skipping row if lat or lon is NaN
|
|
224
|
+
if any(pd.isna(coord) for coord in (row[geo["lon"]], row[geo["lat"]])):
|
|
225
|
+
continue
|
|
226
|
+
template["features"].append(
|
|
227
|
+
{
|
|
228
|
+
"type": "Feature",
|
|
229
|
+
"geometry": {
|
|
230
|
+
"type": "Point",
|
|
231
|
+
# these columns are precast by csv-detective
|
|
232
|
+
"coordinates": [row[geo["lon"]], row[geo["lat"]]],
|
|
233
|
+
},
|
|
234
|
+
"properties": {
|
|
235
|
+
col: prevent_nan(row[col])
|
|
236
|
+
for col in df.columns
|
|
237
|
+
if col not in [geo["lon"], geo["lat"]]
|
|
238
|
+
},
|
|
239
|
+
}
|
|
240
|
+
)
|
|
241
|
+
geojson_file = f"{resource_id}.geojson"
|
|
242
|
+
with open(geojson_file, "w") as f:
|
|
243
|
+
json.dump(template, f, indent=4, ensure_ascii=False, default=str)
|
|
244
|
+
geojson_size = os.path.getsize(geojson_file)
|
|
245
|
+
|
|
246
|
+
pmtiles_url, pmtiles_size = await geojson_to_pmtiles(geojson_file, resource_id)
|
|
247
|
+
|
|
248
|
+
geojson_url: str = minio_client_geojson.send_file(geojson_file)
|
|
249
|
+
|
|
250
|
+
return geojson_url, geojson_size, pmtiles_url, pmtiles_size
|
|
@@ -87,3 +87,8 @@ MINIO_PARQUET_FOLDER = "" # no trailing slash
|
|
|
87
87
|
GEOJSON_TO_PMTILES = false
|
|
88
88
|
MINIO_PMTILES_BUCKET = ""
|
|
89
89
|
MINIO_PMTILES_FOLDER = "" # no trailing slash
|
|
90
|
+
|
|
91
|
+
# -- Geojson conversion settings -- #
|
|
92
|
+
CSV_TO_GEOJSON = false
|
|
93
|
+
MINIO_GEOJSON_BUCKET = ""
|
|
94
|
+
MINIO_GEOJSON_FOLDER = "" # no trailing slash
|
|
@@ -147,5 +147,10 @@ async def get_health(request: web.Request) -> web.Response:
|
|
|
147
147
|
{
|
|
148
148
|
"version": config.APP_VERSION,
|
|
149
149
|
"environment": config.ENVIRONMENT or "unknown",
|
|
150
|
+
"csv_analysis": config.CSV_ANALYSIS,
|
|
151
|
+
"csv_to_db": config.CSV_TO_DB,
|
|
152
|
+
"csv_to_parquet": config.CSV_TO_PARQUET,
|
|
153
|
+
"geojson_to_pmtiles": config.GEOJSON_TO_PMTILES,
|
|
154
|
+
"csv_to_geojson": config.CSV_TO_GEOJSON,
|
|
150
155
|
}
|
|
151
156
|
)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from .auth import token_auth_middleware
|
|
2
2
|
from .csv import detect_tabular_from_headers
|
|
3
3
|
from .errors import IOException, ParseException, handle_parse_exception
|
|
4
|
-
from .file import compute_checksum_from_file, download_resource
|
|
4
|
+
from .file import compute_checksum_from_file, download_resource, read_csv_gz, remove_remainders
|
|
5
5
|
from .geojson import detect_geojson_from_headers_or_catalog
|
|
6
6
|
from .http import UdataPayload, get_request_params, send
|
|
7
7
|
from .queue import enqueue
|
|
@@ -15,6 +15,8 @@ __all__ = [
|
|
|
15
15
|
"handle_parse_exception",
|
|
16
16
|
"compute_checksum_from_file",
|
|
17
17
|
"download_resource",
|
|
18
|
+
"read_csv_gz",
|
|
19
|
+
"remove_remainders",
|
|
18
20
|
"detect_geojson_from_headers_or_catalog",
|
|
19
21
|
"UdataPayload",
|
|
20
22
|
"get_request_params",
|
|
@@ -80,3 +80,10 @@ async def download_resource(
|
|
|
80
80
|
]:
|
|
81
81
|
tmp_file = read_csv_gz(tmp_file.name)
|
|
82
82
|
return tmp_file
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def remove_remainders(resource_id: str, extensions: list[str]) -> None:
|
|
86
|
+
"""Delete potential remainders from process that crashed"""
|
|
87
|
+
for ext in extensions:
|
|
88
|
+
if os.path.exists(f"{resource_id}.{ext}"):
|
|
89
|
+
os.remove(f"{resource_id}.{ext}")
|
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import os
|
|
3
|
-
from datetime import datetime, timezone
|
|
4
|
-
|
|
5
|
-
import tippecanoe
|
|
6
|
-
from asyncpg import Record
|
|
7
|
-
|
|
8
|
-
from udata_hydra import config
|
|
9
|
-
from udata_hydra.analysis import helpers
|
|
10
|
-
from udata_hydra.db.check import Check
|
|
11
|
-
from udata_hydra.db.resource import Resource
|
|
12
|
-
from udata_hydra.db.resource_exception import ResourceException
|
|
13
|
-
from udata_hydra.utils import (
|
|
14
|
-
IOException,
|
|
15
|
-
ParseException,
|
|
16
|
-
Timer,
|
|
17
|
-
handle_parse_exception,
|
|
18
|
-
)
|
|
19
|
-
from udata_hydra.utils.minio import MinIOClient
|
|
20
|
-
|
|
21
|
-
log = logging.getLogger("udata-hydra")
|
|
22
|
-
minio_client = MinIOClient(bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
async def analyse_geojson(
|
|
26
|
-
check: dict,
|
|
27
|
-
file_path: str | None = None,
|
|
28
|
-
) -> None:
|
|
29
|
-
"""Launch GeoJSON analysis from a check or an URL (debug), using previously downloaded file at file_path if any"""
|
|
30
|
-
if not config.GEOJSON_TO_PMTILES:
|
|
31
|
-
log.debug("GEOJSON_TO_PMTILES turned off, skipping.")
|
|
32
|
-
return
|
|
33
|
-
|
|
34
|
-
resource_id: str = str(check["resource_id"])
|
|
35
|
-
url = check["url"]
|
|
36
|
-
|
|
37
|
-
# Update resource status to ANALYSING_GEOJSON
|
|
38
|
-
resource: Record | None = await Resource.update(resource_id, {"status": "ANALYSING_GEOJSON"})
|
|
39
|
-
|
|
40
|
-
# Check if the resource is in the exceptions table
|
|
41
|
-
exception: Record | None = await ResourceException.get_by_resource_id(resource_id)
|
|
42
|
-
|
|
43
|
-
timer = Timer("analyse-geojson")
|
|
44
|
-
assert any(_ is not None for _ in (check["id"], url))
|
|
45
|
-
|
|
46
|
-
tmp_file = None
|
|
47
|
-
try:
|
|
48
|
-
tmp_file = await helpers.read_or_download_file(
|
|
49
|
-
check=check,
|
|
50
|
-
file_path=file_path,
|
|
51
|
-
file_format="geojson",
|
|
52
|
-
exception=exception,
|
|
53
|
-
)
|
|
54
|
-
timer.mark("download-file")
|
|
55
|
-
|
|
56
|
-
check = await Check.update(check["id"], {"parsing_started_at": datetime.now(timezone.utc)})
|
|
57
|
-
|
|
58
|
-
# Convert to PMTiles
|
|
59
|
-
try:
|
|
60
|
-
pmtiles_url, pmtiles_size = await geojson_to_pmtiles(
|
|
61
|
-
file_path=tmp_file.name,
|
|
62
|
-
resource_id=resource_id,
|
|
63
|
-
)
|
|
64
|
-
timer.mark("geojson-to-pmtiles")
|
|
65
|
-
except Exception as e:
|
|
66
|
-
raise ParseException(
|
|
67
|
-
step="pmtiles_export", resource_id=resource_id, url=url, check_id=check["id"]
|
|
68
|
-
) from e
|
|
69
|
-
|
|
70
|
-
check = await Check.update(
|
|
71
|
-
check["id"],
|
|
72
|
-
{
|
|
73
|
-
"parsing_finished_at": datetime.now(timezone.utc),
|
|
74
|
-
"pmtiles_url": pmtiles_url,
|
|
75
|
-
"pmtiles_size": pmtiles_size,
|
|
76
|
-
},
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
except (ParseException, IOException) as e:
|
|
80
|
-
await handle_parse_exception(e, None, check)
|
|
81
|
-
finally:
|
|
82
|
-
await helpers.notify_udata(resource, check)
|
|
83
|
-
timer.stop()
|
|
84
|
-
if tmp_file is not None:
|
|
85
|
-
tmp_file.close()
|
|
86
|
-
os.remove(tmp_file.name)
|
|
87
|
-
|
|
88
|
-
# Reset resource status to None
|
|
89
|
-
await Resource.update(resource_id, {"status": None})
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
async def geojson_to_pmtiles(
|
|
93
|
-
file_path: str,
|
|
94
|
-
resource_id: str | None = None,
|
|
95
|
-
) -> tuple[str, int]:
|
|
96
|
-
"""
|
|
97
|
-
Convert a GeoJSON file to PMTiles format.
|
|
98
|
-
|
|
99
|
-
Args:
|
|
100
|
-
file_path: GeoJSON file path to convert.
|
|
101
|
-
resource_id: Optional resource ID for status updates.
|
|
102
|
-
|
|
103
|
-
Returns:
|
|
104
|
-
pmtiles_url: URL of the PMTiles file.
|
|
105
|
-
pmtiles_size: size of the PMTiles file.
|
|
106
|
-
"""
|
|
107
|
-
|
|
108
|
-
log.debug(f"Converting GeoJSON to PMTiles for {file_path}")
|
|
109
|
-
|
|
110
|
-
if resource_id:
|
|
111
|
-
await Resource.update(resource_id, {"status": "CONVERTING_TO_PMTILES"})
|
|
112
|
-
|
|
113
|
-
output_pmtiles = f"{resource_id}.pmtiles"
|
|
114
|
-
|
|
115
|
-
command = [
|
|
116
|
-
"--maximum-zoom=g", # guess
|
|
117
|
-
"-o",
|
|
118
|
-
output_pmtiles,
|
|
119
|
-
"--coalesce-densest-as-needed",
|
|
120
|
-
"--extend-zooms-if-still-dropping",
|
|
121
|
-
file_path,
|
|
122
|
-
]
|
|
123
|
-
exit_code = tippecanoe._program("tippecanoe", *command)
|
|
124
|
-
if exit_code:
|
|
125
|
-
raise ValueError(f"GeoJSON to PMTiles conversion failed for {file_path}")
|
|
126
|
-
log.debug(f"Successfully converted {file_path} to {output_pmtiles}")
|
|
127
|
-
|
|
128
|
-
pmtiles_size = os.path.getsize(output_pmtiles)
|
|
129
|
-
pmtiles_url: str = minio_client.send_file(output_pmtiles)
|
|
130
|
-
|
|
131
|
-
return pmtiles_url, pmtiles_size
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/calculate_next_check.py
RENAMED
|
File without changes
|
{udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/check_resources.py
RENAMED
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/crawl/preprocess_check_data.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/db/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/routes/resources_exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7633 → udata_hydra-2.2.2.dev7667}/udata_hydra/schemas/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|