udata-hydra 2.1.3.dev7204__tar.gz → 2.1.3.dev7241__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/PKG-INFO +7 -3
  2. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/README.md +5 -2
  3. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/pyproject.toml +2 -1
  4. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/analysis/csv.py +9 -41
  5. udata_hydra-2.1.3.dev7241/udata_hydra/analysis/geojson.py +130 -0
  6. udata_hydra-2.1.3.dev7241/udata_hydra/analysis/helpers.py +77 -0
  7. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/analysis/resource.py +16 -4
  8. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/cli.py +32 -0
  9. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/config_default.toml +12 -4
  10. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/resource.py +3 -0
  11. udata_hydra-2.1.3.dev7241/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +5 -0
  12. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/check.py +2 -0
  13. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/__init__.py +1 -0
  14. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/csv.py +1 -1
  15. udata_hydra-2.1.3.dev7241/udata_hydra/utils/geojson.py +14 -0
  16. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/http.py +9 -1
  17. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/minio.py +11 -9
  18. udata_hydra-2.1.3.dev7204/udata_hydra/analysis/helpers.py +0 -27
  19. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/__init__.py +0 -0
  20. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/analysis/__init__.py +0 -0
  21. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/app.py +0 -0
  22. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/context.py +0 -0
  23. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/__init__.py +0 -0
  24. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/calculate_next_check.py +0 -0
  25. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/check_resources.py +0 -0
  26. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/helpers.py +0 -0
  27. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/preprocess_check_data.py +0 -0
  28. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/crawl/select_batch.py +0 -0
  29. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/__init__.py +0 -0
  30. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/check.py +0 -0
  31. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/db/resource_exception.py +0 -0
  32. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/logger.py +0 -0
  33. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/__init__.py +0 -0
  34. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  35. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  36. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  37. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  38. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  39. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  40. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  41. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  42. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  43. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  44. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  45. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  46. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  47. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  48. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  49. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  50. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  51. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  52. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  53. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  54. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  55. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  56. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
  57. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
  58. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/__init__.py +0 -0
  59. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/checks.py +0 -0
  60. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/resources.py +0 -0
  61. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/resources_exceptions.py +0 -0
  62. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/routes/status.py +0 -0
  63. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/__init__.py +0 -0
  64. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/resource.py +0 -0
  65. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/schemas/resource_exception.py +0 -0
  66. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/auth.py +0 -0
  67. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/db.py +0 -0
  68. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/errors.py +0 -0
  69. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/file.py +0 -0
  70. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/parquet.py +0 -0
  71. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/queue.py +0 -0
  72. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/reader.py +0 -0
  73. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/utils/timer.py +0 -0
  74. {udata_hydra-2.1.3.dev7204 → udata_hydra-2.1.3.dev7241}/udata_hydra/worker.py +0 -0
--- udata_hydra-2.1.3.dev7204/PKG-INFO
+++ udata_hydra-2.1.3.dev7241/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: udata-hydra
- Version: 2.1.3.dev7204
+ Version: 2.1.3.dev7241
  Summary: Async crawler and parsing service for data.gouv.fr
  License: MIT
  Author: Opendata Team
@@ -44,6 +44,7 @@ Requires-Dist: setuptools (>=70.3.0)
  Requires-Dist: sqlalchemy (>=1.4.46)
  Requires-Dist: str2bool (>=1.1)
  Requires-Dist: str2float (>=0.0.9)
+ Requires-Dist: tippecanoe (>=2.72.0)
  Requires-Dist: toml (>=0.10.2)
  Description-Content-Type: text/markdown
 
@@ -55,7 +56,8 @@ URLs are crawled via _aiohttp_, catalog and crawled metadata are stored in a _Po
 
  Since it's called _hydra_, it also has mythical powers embedded:
  - analyse remote resource metadata over time to detect changes in the smartest way possible
- - if the remote resource is a CSV, convert it to a PostgreSQL table, ready for APIfication
+ - if the remote resource is tabular (csv or excel-like), convert it to a PostgreSQL table, ready for APIfication, and to parquet to offer another distribution of the data
+ - if the remote resource is a geojson, convert it to PMTiles to offer another distribution of the data
  - send crawl and analysis info to a udata instance
 
  ## Architecture schema
@@ -126,6 +128,8 @@ Converted CSV tables will be stored in the database specified via `config.DATABA
 
  To run the tests, you need to launch the database, the test database, and the Redis broker with `docker compose -f docker-compose.yml -f docker-compose.test.yml -f docker-compose.broker.yml up -d`.
 
+ Make sure the dev dependencies are installed with `poetry install --extras dev`.
+
  Then you can run the tests with `poetry run pytest`.
 
  To run a specific test file, you can pass the path to the file to pytest, like this: `poetry run pytest tests/test_app.py`.
@@ -181,7 +185,7 @@ The API serves the following endpoints:
  - `PUT` on `/api/resources/{resource_id}` to update a resource in the DB "catalog" table
  - `DELETE` on `/api/resources/{resource_id}` to delete a resource in the DB "catalog" table
 
- > :warning: **Warning: the following routes are deprecated and need be removed in the future:**
+ > :warning: **Warning: the following routes are deprecated and will be removed in the future:**
  > - `POST` on `/api/resource/created` -> use `POST` on `/api/resources/` instead
  > - `POST` on `/api/resource/updated` -> use `PUT` on `/api/resources/` instead
  > - `POST` on `/api/resource/deleted` -> use `DELETE` on `/api/resources/` instead
--- udata_hydra-2.1.3.dev7204/README.md
+++ udata_hydra-2.1.3.dev7241/README.md
@@ -6,7 +6,8 @@ URLs are crawled via _aiohttp_, catalog and crawled metadata are stored in a _Po
 
  Since it's called _hydra_, it also has mythical powers embedded:
  - analyse remote resource metadata over time to detect changes in the smartest way possible
- - if the remote resource is a CSV, convert it to a PostgreSQL table, ready for APIfication
+ - if the remote resource is tabular (csv or excel-like), convert it to a PostgreSQL table, ready for APIfication, and to parquet to offer another distribution of the data
+ - if the remote resource is a geojson, convert it to PMTiles to offer another distribution of the data
  - send crawl and analysis info to a udata instance
 
  ## Architecture schema
@@ -77,6 +78,8 @@ Converted CSV tables will be stored in the database specified via `config.DATABA
 
  To run the tests, you need to launch the database, the test database, and the Redis broker with `docker compose -f docker-compose.yml -f docker-compose.test.yml -f docker-compose.broker.yml up -d`.
 
+ Make sure the dev dependencies are installed with `poetry install --extras dev`.
+
  Then you can run the tests with `poetry run pytest`.
 
  To run a specific test file, you can pass the path to the file to pytest, like this: `poetry run pytest tests/test_app.py`.
@@ -132,7 +135,7 @@ The API serves the following endpoints:
  - `PUT` on `/api/resources/{resource_id}` to update a resource in the DB "catalog" table
  - `DELETE` on `/api/resources/{resource_id}` to delete a resource in the DB "catalog" table
 
- > :warning: **Warning: the following routes are deprecated and need be removed in the future:**
+ > :warning: **Warning: the following routes are deprecated and will be removed in the future:**
  > - `POST` on `/api/resource/created` -> use `POST` on `/api/resources/` instead
  > - `POST` on `/api/resource/updated` -> use `PUT` on `/api/resources/` instead
  > - `POST` on `/api/resource/deleted` -> use `DELETE` on `/api/resources/` instead
--- udata_hydra-2.1.3.dev7204/pyproject.toml
+++ udata_hydra-2.1.3.dev7241/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "udata-hydra"
- version = "2.1.3.dev7204"
+ version = "2.1.3.dev7241"
  description = "Async crawler and parsing service for data.gouv.fr"
  authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
  dependencies = [
@@ -27,6 +27,7 @@ dependencies = [
      "str2bool>=1.1",
      "str2float>=0.0.9",
      "toml>=0.10.2",
+     "tippecanoe>=2.72.0",
  ]
  requires-python = ">=3.11,<3.13"
  license = { text = "MIT" }
--- udata_hydra-2.1.3.dev7204/udata_hydra/analysis/csv.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/csv.py
@@ -41,12 +41,8 @@ from udata_hydra.utils import (
      ParseException,
      Reader,
      Timer,
-     UdataPayload,
      detect_tabular_from_headers,
-     download_resource,
      handle_parse_exception,
-     queue,
-     send,
  )
  from udata_hydra.utils.minio import MinIOClient
  from udata_hydra.utils.parquet import save_as_parquet
@@ -84,29 +80,7 @@ PYTHON_TYPE_TO_PY = {
  }
 
  RESERVED_COLS = ("__id", "cmin", "cmax", "collation", "ctid", "tableoid", "xmin", "xmax")
- minio_client = MinIOClient()
-
-
- async def notify_udata(resource: Record, check: dict) -> None:
-     """Notify udata of the result of a parsing"""
-     payload = {
-         "resource_id": check["resource_id"],
-         "dataset_id": resource["dataset_id"],
-         "document": {
-             "analysis:parsing:error": check["parsing_error"],
-             "analysis:parsing:started_at": check["parsing_started_at"].isoformat()
-             if check["parsing_started_at"]
-             else None,
-             "analysis:parsing:finished_at": check["parsing_finished_at"].isoformat()
-             if check["parsing_finished_at"]
-             else None,
-         },
-     }
-     if config.CSV_TO_PARQUET:
-         payload["document"]["analysis:parsing:parquet_url"] = check.get("parquet_url")
-         payload["document"]["analysis:parsing:parquet_size"] = check.get("parquet_size")
-     payload["document"] = UdataPayload(payload["document"])
-     queue.enqueue(send, _priority="high", **payload)
+ minio_client = MinIOClient(bucket=config.MINIO_PARQUET_BUCKET, folder=config.MINIO_PARQUET_FOLDER)
 
 
  async def analyse_csv(
@@ -137,18 +111,12 @@ async def analyse_csv(
 
      table_name, tmp_file = None, None
      try:
-         headers = json.loads(check.get("headers") or "{}")
-         _, file_format = await detect_tabular_from_headers(check)
-         tmp_file = (
-             open(file_path, "rb")
-             if file_path
-             else await download_resource(
-                 url=url,
-                 headers=headers,
-                 max_size_allowed=None
-                 if exception
-                 else int(config.MAX_FILESIZE_ALLOWED.get(file_format, "csv")),
-             )
+         _, file_format = detect_tabular_from_headers(check)
+         tmp_file = await helpers.read_or_download_file(
+             check=check,
+             file_path=file_path,
+             file_format=file_format,
+             exception=exception,
          )
          table_name = hashlib.md5(url.encode("utf-8")).hexdigest()
          timer.mark("download-file")
@@ -205,7 +173,7 @@ async def analyse_csv(
      except (ParseException, IOException) as e:
          await handle_parse_exception(e, table_name, check)
      finally:
-         await notify_udata(resource, check)
+         await helpers.notify_udata(resource, check)
          timer.stop()
          if tmp_file is not None:
              tmp_file.close()
@@ -250,7 +218,7 @@ def compute_create_table_query(
      for col_name, index_type in indexes.items():
          if index_type not in config.SQL_INDEXES_TYPES_SUPPORTED:
              log.error(
-                 f'Index type "{index_type}" is unknown or not supported yet! Index for colum {col_name} was not created.'
+                 f'Index type "{index_type}" is unknown or not supported yet! Index for column {col_name} was not created.'
              )
              continue
 
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/geojson.py
@@ -0,0 +1,130 @@
+ import logging
+ import os
+ import subprocess
+ from datetime import datetime, timezone
+
+ from asyncpg import Record
+
+ from udata_hydra import config
+ from udata_hydra.analysis import helpers
+ from udata_hydra.db.check import Check
+ from udata_hydra.db.resource import Resource
+ from udata_hydra.db.resource_exception import ResourceException
+ from udata_hydra.utils import (
+     IOException,
+     ParseException,
+     Timer,
+     handle_parse_exception,
+ )
+ from udata_hydra.utils.minio import MinIOClient
+
+ log = logging.getLogger("udata-hydra")
+ minio_client = MinIOClient(bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER)
+
+
+ async def analyse_geojson(
+     check: dict,
+     file_path: str | None = None,
+ ) -> None:
+     """Launch GeoJSON analysis from a check or an URL (debug), using previously downloaded file at file_path if any"""
+     if not config.GEOJSON_TO_PMTILES:
+         log.debug("GEOJSON_TO_PMTILES turned off, skipping.")
+         return
+
+     resource_id: str = str(check["resource_id"])
+     url = check["url"]
+
+     # Update resource status to ANALYSING_GEOJSON
+     resource: Record | None = await Resource.update(resource_id, {"status": "ANALYSING_GEOJSON"})
+
+     # Check if the resource is in the exceptions table
+     exception: Record | None = await ResourceException.get_by_resource_id(resource_id)
+
+     timer = Timer("analyse-geojson")
+     assert any(_ is not None for _ in (check["id"], url))
+
+     tmp_file = None
+     try:
+         tmp_file = await helpers.read_or_download_file(
+             check=check,
+             file_path=file_path,
+             file_format="geojson",
+             exception=exception,
+         )
+         timer.mark("download-file")
+
+         check = await Check.update(check["id"], {"parsing_started_at": datetime.now(timezone.utc)})
+
+         # Convert to PMTiles
+         try:
+             pmtiles_url, pmtiles_size = await geojson_to_pmtiles(
+                 file_path=tmp_file.name,
+                 resource_id=resource_id,
+             )
+             timer.mark("geojson-to-pmtiles")
+         except Exception as e:
+             raise ParseException(
+                 step="pmtiles_export", resource_id=resource_id, url=url, check_id=check["id"]
+             ) from e
+
+         check = await Check.update(
+             check["id"],
+             {
+                 "parsing_finished_at": datetime.now(timezone.utc),
+                 "pmtiles_url": pmtiles_url,
+                 "pmtiles_size": pmtiles_size,
+             },
+         )
+
+     except (ParseException, IOException) as e:
+         await handle_parse_exception(e, None, check)
+     finally:
+         await helpers.notify_udata(resource, check)
+         timer.stop()
+         if tmp_file is not None:
+             tmp_file.close()
+             os.remove(tmp_file.name)
+
+     # Reset resource status to None
+     await Resource.update(resource_id, {"status": None})
+
+
+ async def geojson_to_pmtiles(
+     file_path: str,
+     resource_id: str | None = None,
+ ) -> tuple[str, int]:
+     """
+     Convert a GeoJSON file to PMTiles format.
+
+     Args:
+         file_path: GeoJSON file path to convert.
+         resource_id: Optional resource ID for status updates.
+
+     Returns:
+         pmtiles_url: URL of the PMTiles file.
+         pmtiles_size: size of the PMTiles file.
+     """
+
+     log.debug(f"Converting GeoJSON to PMTiles for {file_path}")
+
+     if resource_id:
+         await Resource.update(resource_id, {"status": "CONVERTING_TO_PMTILES"})
+
+     output_pmtiles = f"{resource_id}.pmtiles"
+
+     command = [
+         "tippecanoe",
+         "--maximum-zoom=g",  # guess
+         "-o",
+         output_pmtiles,
+         "--coalesce-densest-as-needed",
+         "--extend-zooms-if-still-dropping",
+         file_path,
+     ]
+     subprocess.run(command, check=True)
+     log.debug(f"Successfully converted {file_path} to {output_pmtiles}")
+
+     pmtiles_size = os.path.getsize(output_pmtiles)
+     pmtiles_url: str = minio_client.send_file(output_pmtiles)
+
+     return pmtiles_url, pmtiles_size
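The conversion itself is a plain subprocess call to the tippecanoe binary. As a minimal standalone sketch of that step, assuming tippecanoe is installed and on PATH (the flags mirror `geojson_to_pmtiles` above; the function name and paths are ours, for illustration):

```python
# Standalone sketch of the conversion step, assuming the tippecanoe binary is
# on PATH; function name and paths are illustrative, flags mirror the diff.
import os
import subprocess


def convert_geojson_to_pmtiles(geojson_path: str, pmtiles_path: str) -> int:
    """Run tippecanoe like geojson_to_pmtiles does and return the output size in bytes."""
    subprocess.run(
        [
            "tippecanoe",
            "--maximum-zoom=g",  # "g" lets tippecanoe guess a suitable maximum zoom
            "-o",
            pmtiles_path,
            "--coalesce-densest-as-needed",
            "--extend-zooms-if-still-dropping",
            geojson_path,
        ],
        check=True,  # raise CalledProcessError on a non-zero exit code
    )
    return os.path.getsize(pmtiles_path)
```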
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/helpers.py
@@ -0,0 +1,77 @@
+ import json
+ from datetime import date, datetime
+ from typing import IO
+
+ from asyncpg import Record
+ from dateparser import parse as date_parser
+ from dateutil.parser import ParserError
+ from dateutil.parser import parse as dateutil_parser
+
+ from udata_hydra import config
+ from udata_hydra.utils import UdataPayload, download_resource, queue, send
+
+
+ def to_json(value: str) -> str:
+     """Convenience method, should be casted from string directly by postgres"""
+     return value
+
+
+ def _parse_dt(value: str) -> datetime | None:
+     """For performance reasons, we try first with dateutil and fallback on dateparser"""
+     try:
+         return dateutil_parser(value)
+     except ParserError:
+         return date_parser(value)
+
+
+ def to_date(value: str) -> date | None:
+     parsed = _parse_dt(value)
+     return parsed.date() if parsed else None
+
+
+ def to_datetime(value: str) -> datetime | None:
+     return _parse_dt(value)
+
+
+ async def read_or_download_file(
+     check: dict,
+     file_path: str,
+     file_format: str,
+     exception: Record | None,
+ ) -> IO[bytes]:
+     return (
+         open(file_path, "rb")
+         if file_path
+         else await download_resource(
+             url=check["url"],
+             headers=json.loads(check.get("headers") or "{}"),
+             max_size_allowed=None
+             if exception
+             else int(config.MAX_FILESIZE_ALLOWED.get(file_format, "csv")),
+         )
+     )
+
+
+ async def notify_udata(resource: Record, check: dict) -> None:
+     """Notify udata of the result of a parsing"""
+     payload = {
+         "resource_id": check["resource_id"],
+         "dataset_id": resource["dataset_id"],
+         "document": {
+             "analysis:parsing:error": check["parsing_error"],
+             "analysis:parsing:started_at": check["parsing_started_at"].isoformat()
+             if check["parsing_started_at"]
+             else None,
+             "analysis:parsing:finished_at": check["parsing_finished_at"].isoformat()
+             if check["parsing_finished_at"]
+             else None,
+         },
+     }
+     if config.CSV_TO_PARQUET and check.get("parquet_url"):
+         payload["document"]["analysis:parsing:parquet_url"] = check.get("parquet_url")
+         payload["document"]["analysis:parsing:parquet_size"] = check.get("parquet_size")
+     if config.GEOJSON_TO_PMTILES and check.get("pmtiles_url"):
+         payload["document"]["analysis:parsing:pmtiles_url"] = check.get("pmtiles_url")
+         payload["document"]["analysis:parsing:pmtiles_size"] = check.get("pmtiles_size")
+     payload["document"] = UdataPayload(payload["document"])
+     queue.enqueue(send, _priority="high", **payload)
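Note that `notify_udata` now gates the parquet and pmtiles keys on both the feature flag and the presence of a value on the check, so udata never receives empty keys. A hedged restatement of that enrichment with plain arguments (the `build_document` name is ours; the document keys come from the diff above):

```python
# Sketch of the flag-gated document enrichment performed by notify_udata;
# build_document is our name for illustration, the keys match the diff.
def build_document(check: dict, csv_to_parquet: bool, geojson_to_pmtiles: bool) -> dict:
    started = check.get("parsing_started_at")
    finished = check.get("parsing_finished_at")
    document = {
        "analysis:parsing:error": check.get("parsing_error"),
        "analysis:parsing:started_at": started.isoformat() if started else None,
        "analysis:parsing:finished_at": finished.isoformat() if finished else None,
    }
    # each pair of keys is added only when the feature is enabled AND the
    # check actually carries a value for it
    if csv_to_parquet and check.get("parquet_url"):
        document["analysis:parsing:parquet_url"] = check["parquet_url"]
        document["analysis:parsing:parquet_size"] = check.get("parquet_size")
    if geojson_to_pmtiles and check.get("pmtiles_url"):
        document["analysis:parsing:pmtiles_url"] = check["pmtiles_url"]
        document["analysis:parsing:pmtiles_size"] = check.get("pmtiles_size")
    return document
```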
--- udata_hydra-2.1.3.dev7204/udata_hydra/analysis/resource.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/analysis/resource.py
@@ -10,6 +10,7 @@ from dateparser import parse as date_parser
 
  from udata_hydra import config, context
  from udata_hydra.analysis.csv import analyse_csv
+ from udata_hydra.analysis.geojson import analyse_geojson
  from udata_hydra.crawl.calculate_next_check import calculate_next_check_date
  from udata_hydra.db.check import Check
  from udata_hydra.db.resource import Resource
@@ -18,6 +19,7 @@ from udata_hydra.utils import (
      IOException,
      UdataPayload,
      compute_checksum_from_file,
+     detect_geojson_from_headers,
      detect_tabular_from_headers,
      download_resource,
      queue,
@@ -69,8 +71,11 @@ async def analyse_resource(
      # let's see if we can infer a modification date on early hints based on harvest infos and headers
      change_status, change_payload = await detect_resource_change_on_early_hints(resource)
 
-     # could it be a CSV? If we get hints, we will analyse the file further depending on change status
-     is_tabular, file_format = await detect_tabular_from_headers(check)
+     # could it be a CSV or a GeoJSON? If we get hints, we will analyse the file further depending on change status
+     is_tabular, file_format = detect_tabular_from_headers(check)
+     is_geojson: bool = detect_geojson_from_headers(check)
+     if is_geojson:
+         file_format = "geojson"
      max_size_allowed = None if exception else int(config.MAX_FILESIZE_ALLOWED[file_format])
 
      # if the change status is NO_GUESS or HAS_CHANGED, let's download the file to get more infos
@@ -96,7 +101,7 @@ async def analyse_resource(
              )
              dl_analysis["analysis:mime-type"] = magic.from_file(tmp_file.name, mime=True)
          finally:
-             if tmp_file and not is_tabular:
+             if tmp_file and not (is_tabular or is_geojson):
                  os.remove(tmp_file.name)
          await Check.update(
              check["id"],
@@ -136,7 +141,14 @@ async def analyse_resource(
              file_path=tmp_file.name,
              _priority="high" if worker_priority == "high" else "default",
          )
-
+     elif is_geojson and tmp_file:
+         await Resource.update(resource_id, data={"status": "TO_ANALYSE_GEOJSON"})
+         queue.enqueue(
+             analyse_geojson,
+             check=check,
+             file_path=tmp_file.name,
+             _priority="high" if worker_priority == "high" else "default",
+         )
      else:
          await Resource.update(resource_id, data={"status": None})
 
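Stripped of queueing, priorities and status updates, the routing that `analyse_resource` now performs looks like the sketch below (assuming the udata_hydra package from this release; `choose_analysis` is our name). Note that the tabular branch wins when both guesses are positive, and the geojson guess also overrides the file format used for the download size limit:

```python
# Hedged sketch of the dispatch logic above; enqueueing and DB updates elided.
from udata_hydra.utils import detect_geojson_from_headers, detect_tabular_from_headers


def choose_analysis(check: dict) -> tuple[str | None, str]:
    is_tabular, file_format = detect_tabular_from_headers(check)
    is_geojson = detect_geojson_from_headers(check)
    if is_geojson:
        file_format = "geojson"  # switches the MAX_FILESIZE_ALLOWED entry used
    if is_tabular:
        return "analyse_csv", file_format  # enqueued with the downloaded tmp file
    if is_geojson:
        return "analyse_geojson", file_format
    return None, file_format  # no further analysis, resource status is reset
```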
--- udata_hydra-2.1.3.dev7204/udata_hydra/cli.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/cli.py
@@ -14,6 +14,7 @@ from progressist import ProgressBar
 
  from udata_hydra import config
  from udata_hydra.analysis.csv import analyse_csv
+ from udata_hydra.analysis.geojson import analyse_geojson
  from udata_hydra.crawl.check_resources import check_resource as crawl_check_resource
  from udata_hydra.db.check import Check
  from udata_hydra.db.resource import Resource
@@ -190,6 +191,37 @@ async def analyse_csv_cli(
      await analyse_csv(check=check, debug_insert=debug_insert)
 
 
+ @cli(name="analyse-geojson")
+ async def analyse_geojson_cli(
+     check_id: str | None = None,
+     url: str | None = None,
+     resource_id: str | None = None,
+ ):
+     """Trigger a GeoJSON analysis from a check_id, an url or a resource_id
+     Try to get the check from the check ID, then from the URL
+     """
+     assert check_id or url or resource_id
+     check = None
+     if check_id:
+         check: Record | None = await Check.get_by_id(int(check_id), with_deleted=True)
+     if not check and url:
+         checks: list[Record] | None = await Check.get_by_url(url)
+         if checks and len(checks) > 1:
+             log.warning(f"Multiple checks found for URL {url}, using the latest one")
+         check = checks[0] if checks else None
+     if not check and resource_id:
+         check: Record | None = await Check.get_by_resource_id(resource_id)
+     if not check:
+         if check_id:
+             log.error("Could not retrieve the specified check")
+         elif url:
+             log.error("Could not find a check linked to the specified URL")
+         elif resource_id:
+             log.error("Could not find a check linked to the specified resource ID")
+         return
+     await analyse_geojson(check=check)
+
+
  @cli
  async def csv_sample(size: int = 1000, download: bool = False, max_size: str = "100M"):
      """Get a csv sample from latest checks
--- udata_hydra-2.1.3.dev7204/udata_hydra/config_default.toml
+++ udata_hydra-2.1.3.dev7241/udata_hydra/config_default.toml
@@ -55,6 +55,7 @@ MAX_FILESIZE_ALLOWED.csvgz = 104857600
  MAX_FILESIZE_ALLOWED.xls = 52428800 # /2
  MAX_FILESIZE_ALLOWED.xlsx = 13107200 # /8
  MAX_FILESIZE_ALLOWED.ods = 10485760 # /10
+ MAX_FILESIZE_ALLOWED.geojson = 104857600
 
  # -- CSV analysis settings -- #
  SQL_INDEXES_TYPES_SUPPORTED = ["index"]
@@ -72,10 +73,17 @@ UDATA_URI = ""
  UDATA_URI_API_KEY = ""
 
  # -- Minio / datalake settings -- #
- CSV_TO_PARQUET = false
- MIN_LINES_FOR_PARQUET = 200
- MINIO_FOLDER = "" # no trailing slash
  MINIO_URL = "" # no scheme
- MINIO_BUCKET = ""
  MINIO_USER = ""
  MINIO_PWD = ""
+
+ # -- Parquet conversion settings -- #
+ CSV_TO_PARQUET = false
+ MIN_LINES_FOR_PARQUET = 200
+ MINIO_PARQUET_BUCKET = ""
+ MINIO_PARQUET_FOLDER = "" # no trailing slash
+
+ # -- PMTiles conversion settings -- #
+ GEOJSON_TO_PMTILES = false
+ MINIO_PMTILES_BUCKET = ""
+ MINIO_PMTILES_FOLDER = "" # no trailing slash
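With the old shared `MINIO_BUCKET`/`MINIO_FOLDER` pair split per output format, each pipeline builds its own client; this is exactly how csv.py and geojson.py instantiate theirs in this release:

```python
# One MinIOClient per output format, as instantiated in csv.py and geojson.py
# above; the corresponding settings must be filled in for uploads to work.
from udata_hydra import config
from udata_hydra.utils.minio import MinIOClient

parquet_client = MinIOClient(
    bucket=config.MINIO_PARQUET_BUCKET, folder=config.MINIO_PARQUET_FOLDER
)
pmtiles_client = MinIOClient(
    bucket=config.MINIO_PMTILES_BUCKET, folder=config.MINIO_PMTILES_FOLDER
)
```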
--- udata_hydra-2.1.3.dev7204/udata_hydra/db/resource.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/db/resource.py
@@ -18,6 +18,9 @@ class Resource:
          "ANALYSING_CSV": "resource content currently being analysed by CSV detective",
          "INSERTING_IN_DB": "currently being inserted in DB",
          "CONVERTING_TO_PARQUET": "currently being converted to Parquet",
+         "TO_ANALYSE_GEOJSON": "geojson resource content to be analysed",
+         "ANALYSING_GEOJSON": "geojson resource content currently being analysed",
+         "CONVERTING_TO_PMTILES": "currently being converted to pmtiles",
      }
 
      @classmethod
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql
@@ -0,0 +1,5 @@
+ -- Add PMTiles fields to checks table
+
+ ALTER TABLE checks
+ ADD COLUMN pmtiles_url VARCHAR,
+ ADD COLUMN pmtiles_size BIGINT;
--- udata_hydra-2.1.3.dev7204/udata_hydra/schemas/check.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/schemas/check.py
@@ -24,6 +24,8 @@ class CheckSchema(Schema):
      parsing_table = fields.Str()
      parquet_url = fields.Str()
      parquet_size = fields.Integer()
+     pmtiles_url = fields.Str()
+     pmtiles_size = fields.Integer()
 
      def create(self, data):
          return self.load(data)
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/__init__.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/__init__.py
@@ -3,6 +3,7 @@ from .auth import token_auth_middleware
  from .csv import detect_tabular_from_headers
  from .errors import IOException, ParseException, handle_parse_exception
  from .file import compute_checksum_from_file, download_resource, read_csv_gz
+ from .geojson import detect_geojson_from_headers
  from .http import UdataPayload, get_request_params, is_valid_uri, send
  from .queue import enqueue
  from .reader import Reader, generate_dialect
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/csv.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/csv.py
@@ -1,7 +1,7 @@
  import json
 
 
- async def detect_tabular_from_headers(check: dict) -> tuple[bool, str]:
+ def detect_tabular_from_headers(check: dict) -> tuple[bool, str]:
      """
      Determine from content-type header if file looks like:
      - a csv
--- /dev/null
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/geojson.py
@@ -0,0 +1,14 @@
+ import json
+
+
+ def detect_geojson_from_headers(check: dict) -> bool:
+     headers: dict = json.loads(check["headers"] or "{}")
+     # in some cases geojson files have the content-type `application/json`
+     # but adding this in the list would not have been a restrictive enough condition
+     # so we check the URL, which is satisfactory for now
+     if any(
+         headers.get("content-type", "").lower().startswith(ct)
+         for ct in ["application/vnd.geo+json"]
+     ) or "geojson" in check.get("url", ""):
+         return True
+     return False
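A quick illustration of the heuristic, with hypothetical check dicts (headers are stored on the check as a JSON string):

```python
# Illustration of the detection heuristic above; URLs and headers are invented.
import json

from udata_hydra.utils import detect_geojson_from_headers

by_content_type = {
    "headers": json.dumps({"content-type": "application/vnd.geo+json"}),
    "url": "https://example.org/a",
}
by_url = {
    "headers": json.dumps({"content-type": "application/json"}),
    "url": "https://example.org/data.geojson",
}
plain_json = {
    "headers": json.dumps({"content-type": "application/json"}),
    "url": "https://example.org/data.json",
}

assert detect_geojson_from_headers(by_content_type) is True
assert detect_geojson_from_headers(by_url) is True  # matched on "geojson" in the URL
assert detect_geojson_from_headers(plain_json) is False
```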
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/http.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/http.py
@@ -23,7 +23,15 @@ class UdataPayload:
              "last-modified-detection",
              "mime-type",
          ],
-         "analysis:parsing": ["error", "finished_at", "parquet_size", "parquet_url", "started_at"],
+         "analysis:parsing": [
+             "error",
+             "started_at",
+             "finished_at",
+             "parquet_size",
+             "parquet_url",
+             "pmtiles_size",
+             "pmtiles_url",
+         ],
      }
 
      def __init__(self, payload: dict):
--- udata_hydra-2.1.3.dev7204/udata_hydra/utils/minio.py
+++ udata_hydra-2.1.3.dev7241/udata_hydra/utils/minio.py
@@ -9,10 +9,11 @@ log = logging.getLogger("udata-hydra")
 
 
  class MinIOClient:
-     def __init__(self, bucket=config.MINIO_BUCKET):
+     def __init__(self, bucket: str, folder: str):
          self.user = config.MINIO_USER
          self.password = config.MINIO_PWD
          self.bucket = bucket
+         self.folder = folder
          self.client = Minio(
              config.MINIO_URL or "test",
              access_key=self.user or "test",
@@ -26,19 +27,20 @@ class MinIOClient:
 
      def send_file(
          self,
-         file_name,
-         delete_source=True,
+         file_path: str,
+         delete_source: bool = True,
      ) -> str:
          if self.bucket is None:
              raise AttributeError("A bucket has to be specified.")
-         if os.path.isfile(file_name):
+         if os.path.isfile(file_path):
+             file_name = os.path.basename(file_path)
              self.client.fput_object(
                  self.bucket,
-                 f"{config.MINIO_FOLDER}/{file_name}",
-                 file_name,
+                 f"{self.folder}/{file_name}",
+                 file_path,
              )
              if delete_source:
-                 os.remove(file_name)
-             return f"https://{config.MINIO_URL}/{self.bucket}/{config.MINIO_FOLDER}/{file_name}"
+                 os.remove(file_path)
+             return f"https://{config.MINIO_URL}/{self.bucket}/{self.folder}/{file_name}"
          else:
-             raise Exception(f"file '{file_name}' does not exists")
+             raise Exception(f"file '{file_path}' does not exist")
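Usage-wise, `send_file` now takes a full path, keeps only the basename as the object name, and returns the public URL. A sketch with hypothetical values:

```python
# Hypothetical usage of the reworked send_file; bucket, folder and path values
# are invented for the example.
from udata_hydra.utils.minio import MinIOClient

client = MinIOClient(bucket="pmtiles", folder="prod")
url = client.send_file("/tmp/1234.pmtiles", delete_source=False)
# -> "https://<MINIO_URL>/pmtiles/prod/1234.pmtiles"
```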
--- udata_hydra-2.1.3.dev7204/udata_hydra/analysis/helpers.py
+++ /dev/null
@@ -1,27 +0,0 @@
- from datetime import date, datetime
-
- from dateparser import parse as date_parser
- from dateutil.parser import ParserError
- from dateutil.parser import parse as dateutil_parser
-
-
- def to_json(value: str) -> str:
-     """Convenience method, should be casted from string directly by postgres"""
-     return value
-
-
- def _parse_dt(value: str) -> datetime | None:
-     """For performance reasons, we try first with dateutil and fallback on dateparser"""
-     try:
-         return dateutil_parser(value)
-     except ParserError:
-         return date_parser(value)
-
-
- def to_date(value: str) -> date | None:
-     parsed = _parse_dt(value)
-     return parsed.date() if parsed else None
-
-
- def to_datetime(value: str) -> datetime | None:
-     return _parse_dt(value)