adminbounds 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adminbounds/__init__.py +7 -0
- adminbounds/_annotate.py +125 -0
- adminbounds/_diagnose.py +153 -0
- adminbounds/_gadm.py +243 -0
- adminbounds/_import.py +198 -0
- adminbounds/_upload.py +49 -0
- adminbounds/cli/__init__.py +158 -0
- adminbounds/client.py +136 -0
- adminbounds/config.py +31 -0
- adminbounds/data/china.geojson +1 -0
- adminbounds/data/china_city.geojson +1 -0
- adminbounds/data/china_district.geojson +1 -0
- adminbounds/data/china_state.geojson +1 -0
- adminbounds/db.py +21 -0
- adminbounds/sql/functions/infer_admin_semantic_relation.sql +355 -0
- adminbounds/sql/schema/01_admin_units.sql +34 -0
- adminbounds/sql/schema/02_thematic_admin_relations.sql +34 -0
- adminbounds-0.1.0.dist-info/METADATA +367 -0
- adminbounds-0.1.0.dist-info/RECORD +22 -0
- adminbounds-0.1.0.dist-info/WHEEL +4 -0
- adminbounds-0.1.0.dist-info/entry_points.txt +2 -0
- adminbounds-0.1.0.dist-info/licenses/LICENSE +21 -0
adminbounds/__init__.py
ADDED
adminbounds/_annotate.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch-annotation of geometries in a source table with admin-unit semantic relations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
FETCH_SQL = """
|
|
13
|
+
SELECT
|
|
14
|
+
src.uuid AS feature_uuid,
|
|
15
|
+
ST_AsText(src.{geom_col}) AS geom_wkt
|
|
16
|
+
FROM {schema}.{source_table} src
|
|
17
|
+
LEFT JOIN adminbounds.thematic_admin_relations tar
|
|
18
|
+
ON tar.source_table = %(source_table)s
|
|
19
|
+
AND tar.feature_uuid = src.uuid
|
|
20
|
+
WHERE tar.id IS NULL
|
|
21
|
+
AND src.{geom_col} IS NOT NULL
|
|
22
|
+
LIMIT %(batch_size)s
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
INFER_SQL = """
|
|
26
|
+
SELECT adminbounds.infer_admin_semantic_relation(ST_GeomFromText(%(wkt)s, 4326)) AS result
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
INSERT_SQL = """
|
|
30
|
+
INSERT INTO adminbounds.thematic_admin_relations
|
|
31
|
+
(source_table, feature_uuid, admin_level_match, confidence,
|
|
32
|
+
coincides_with, intersects_with, covers_children, contained_by)
|
|
33
|
+
SELECT
|
|
34
|
+
%(source_table)s,
|
|
35
|
+
%(feature_uuid)s::UUID,
|
|
36
|
+
(r->>'admin_level_match')::INTEGER,
|
|
37
|
+
(r->>'confidence')::FLOAT8,
|
|
38
|
+
r->'coincides_with',
|
|
39
|
+
r->'intersects_with',
|
|
40
|
+
r->'covers_children',
|
|
41
|
+
r->'contained_by'
|
|
42
|
+
FROM (SELECT %(relations)s::jsonb AS r) sub
|
|
43
|
+
ON CONFLICT (source_table, feature_uuid) DO NOTHING
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def annotate_batch(
    conn,
    source_table: str,
    geom_col: str,
    schema: str,
    batch_size: int,
    on_progress=None,
) -> int:
    """Batch-annotate a source table with admin-unit semantic relations.

    Args:
        conn: Open DB-API connection; autocommit is disabled here and the
            connection is closed when the function returns.
        source_table: Table to annotate; must expose a ``uuid`` column.
        geom_col: Name of the geometry column in the source table.
        schema: Schema containing the source table.
        batch_size: Number of fresh rows to process per fetch round trip.
        on_progress: Optional ``(done, total)`` callback invoked after each
            successfully annotated row.

    Returns:
        Count of newly annotated rows.
    """
    conn.autocommit = False
    total_processed = 0
    # UUIDs whose inference failed or returned NULL during this run. They are
    # not inserted, so the fetch query (WHERE tar.id IS NULL) would re-select
    # them forever — the previous implementation looped infinitely on any
    # persistent failure. Tracking them lets the loop terminate.
    skipped = set()

    try:
        with conn.cursor() as cur:
            # NOTE: schema/table/column names are interpolated directly; they
            # must come from trusted configuration, never from user input.
            cur.execute(
                f"""
                SELECT COUNT(*)
                FROM {schema}.{source_table} src
                LEFT JOIN adminbounds.thematic_admin_relations tar
                    ON tar.source_table = %s
                    AND tar.feature_uuid = src.uuid
                WHERE tar.id IS NULL AND src.{geom_col} IS NOT NULL
                """,
                (source_table,),
            )
            remaining = cur.fetchone()[0]
        log.info("Rows to annotate: %d", remaining)

        pbar = tqdm(total=remaining, unit="row")

        # Loop-invariant: format the identifier slots once, not per batch.
        fetch_sql = FETCH_SQL.format(
            geom_col=geom_col,
            schema=schema,
            source_table=source_table,
        )

        while True:
            with conn.cursor() as cur:
                # Over-fetch by len(skipped): skipped rows still satisfy the
                # WHERE clause, so widening the LIMIT guarantees that fresh
                # rows appear in the batch whenever any remain.
                cur.execute(
                    fetch_sql,
                    {
                        "source_table": source_table,
                        "batch_size": batch_size + len(skipped),
                    },
                )
                rows = [r for r in cur.fetchall() if r[0] not in skipped]

            if not rows:
                break

            for feature_uuid, geom_wkt in rows:
                try:
                    with conn.cursor() as cur:
                        cur.execute(INFER_SQL, {"wkt": geom_wkt})
                        result = cur.fetchone()[0]
                        if result is None:
                            # No relations inferred; remember the row so it is
                            # not refetched, and still advance the bar.
                            skipped.add(feature_uuid)
                            pbar.update(1)
                            continue

                        # Driver may hand back jsonb as a dict or as text.
                        relations_str = (
                            json.dumps(result) if isinstance(result, dict) else result
                        )

                        cur.execute(
                            INSERT_SQL,
                            {
                                "source_table": source_table,
                                "feature_uuid": str(feature_uuid),
                                "relations": relations_str,
                            },
                        )
                    # Commit per row so one bad geometry never rolls back
                    # previously annotated rows.
                    conn.commit()
                    total_processed += 1
                    pbar.update(1)
                    if on_progress:
                        on_progress(total_processed, remaining)

                except Exception as exc:
                    conn.rollback()
                    log.warning("Row %s failed: %s", feature_uuid, exc)
                    skipped.add(feature_uuid)
                    pbar.update(1)

        pbar.close()

    finally:
        conn.close()

    log.info("Done. Annotated %d rows.", total_processed)
    return total_processed
|
adminbounds/_diagnose.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Diagnostic checks for infer_admin_semantic_relation returning empty results.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
PASS = " [OK]"
|
|
8
|
+
FAIL = " [FAIL]"
|
|
9
|
+
WARN = " [WARN]"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def diagnose(conn, source_table: str, geom_col: str, schema: str) -> dict:
    """Run diagnostic checks and return a structured result dict.

    Prints a human-readable five-section report explaining why
    ``infer_admin_semantic_relation`` may be returning empty results
    (missing reference data, CRS mismatch, no spatial overlap, filter
    attrition).

    The cursor is now held in a context manager so it is released even when a
    check raises (previously it leaked on any mid-run exception).

    Args:
        conn: Open DB-API connection.
        source_table: Table whose geometries are being diagnosed.
        geom_col: Geometry column name in the source table.
        schema: Schema containing the source table.

    Returns:
        Dict of per-check findings keyed by check name.
    """
    results = {}
    with conn.cursor() as cur:
        # 1. Reference data present, and derived fields computed?
        cur.execute("SELECT COUNT(*) FROM adminbounds.admin_units")
        total = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM adminbounds.admin_units WHERE geom_bbox IS NULL")
        null_bbox = cur.fetchone()[0]
        results["admin_units_total"] = total
        results["admin_units_null_bbox"] = null_bbox

        print("\n=== 1. admin_units row count ===")
        print(f"{PASS if total > 0 else FAIL} Total rows: {total}")
        print(f"{PASS if null_bbox == 0 else FAIL} Rows with NULL geom_bbox (derived fields missing): {null_bbox}")
        if null_bbox > 0:
            print(" → Run import-boundaries again; compute_derived_fields() did not complete.")

        # 2. How many units exist at each admin level?
        print("\n=== 2. admin_units level distribution ===")
        cur.execute("SELECT level, COUNT(*) FROM adminbounds.admin_units GROUP BY level ORDER BY level")
        level_dist = {}
        for row in cur.fetchall():
            level_dist[row[0]] = row[1]
            print(f" Level {row[0]}: {row[1]} rows")
        results["level_distribution"] = level_dist

        # 3. Source-table sanity: non-null geometries, SRIDs, extent.
        # NOTE: identifiers interpolated via f-strings; must be trusted config.
        print(f"\n=== 3. Source table: {schema}.{source_table} ===")
        cur.execute(f"SELECT COUNT(*) FROM {schema}.{source_table} WHERE {geom_col} IS NOT NULL")
        src_count = cur.fetchone()[0]
        results["source_non_null_geoms"] = src_count
        print(f"{PASS if src_count > 0 else FAIL} Non-null geometries: {src_count}")

        cur.execute(f"SELECT DISTINCT ST_SRID({geom_col}) FROM {schema}.{source_table} WHERE {geom_col} IS NOT NULL LIMIT 5")
        srids = [r[0] for r in cur.fetchall()]
        results["source_srids"] = srids
        print(f"{PASS if srids == [4326] else FAIL} Geometry SRIDs in source table: {srids}")
        if srids and srids != [4326]:
            print(" → Geometries are NOT in EPSG:4326. The function expects 4326.")

        cur.execute(f"""
            SELECT
                ST_XMin(ST_Extent({geom_col})),
                ST_YMin(ST_Extent({geom_col})),
                ST_XMax(ST_Extent({geom_col})),
                ST_YMax(ST_Extent({geom_col}))
            FROM {schema}.{source_table}
            WHERE {geom_col} IS NOT NULL
        """)
        row = cur.fetchone()
        if row and row[0] is not None:
            xmin, ymin, xmax, ymax = row
            results["bbox"] = {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax}
            print(f" Bounding box: ({xmin:.4f}, {ymin:.4f}) → ({xmax:.4f}, {ymax:.4f})")
            # Coarse plausibility window for mainland-China lon/lat.
            in_china = (70 <= xmin <= 140) and (15 <= ymin <= 55)
            results["in_china_range"] = in_china
            print(f"{PASS if in_china else FAIL} Coordinates look like China (lon 70–140, lat 15–55): {in_china}")

        # 4. Does anything in admin_units even overlap the source extent?
        print("\n=== 4. Spatial overlap: source bbox vs admin_units bbox ===")
        cur.execute(f"""
            SELECT COUNT(*)
            FROM adminbounds.admin_units au
            WHERE au.geom_bbox && (
                SELECT ST_Extent({geom_col}) FROM {schema}.{source_table} WHERE {geom_col} IS NOT NULL
            )
        """)
        overlap_count = cur.fetchone()[0]
        results["spatial_overlap_count"] = overlap_count
        print(f"{PASS if overlap_count > 0 else FAIL} admin_units whose bbox overlaps source extent: {overlap_count}")
        if overlap_count == 0:
            print(" → No spatial overlap at all. Likely a CRS or coordinate system mismatch.")

        # 5. End-to-end probe: run the function on one real geometry, then
        # replay its three-layer candidate filter to see where rows are lost.
        print("\n=== 5. Manual function call on first source geometry ===")
        cur.execute(f"""
            SELECT
                ST_AsText({geom_col}) AS wkt,
                ST_SRID({geom_col}) AS srid,
                ST_IsValid({geom_col}) AS is_valid
            FROM {schema}.{source_table}
            WHERE {geom_col} IS NOT NULL
            LIMIT 1
        """)
        row = cur.fetchone()
        if row:
            wkt, srid, is_valid = row
            results["sample_srid"] = srid
            results["sample_is_valid"] = is_valid
            print(f" SRID: {srid}, IsValid: {is_valid}")
            print(f" WKT (first 120 chars): {wkt[:120]}...")

            cur.execute(
                "SELECT adminbounds.infer_admin_semantic_relation(ST_GeomFromText(%s, 4326))",
                (wkt,),
            )
            func_result = cur.fetchone()[0]
            results["function_result"] = func_result
            print(f"\n Function result:\n {json.dumps(func_result, ensure_ascii=False, indent=2)}")

            cur.execute("""
                WITH input AS (
                    SELECT ST_GeomFromText(%s, 4326) AS g
                ),
                layer1 AS (
                    SELECT adcode FROM adminbounds.admin_units, input
                    WHERE geom_bbox && ST_Envelope(input.g)
                ),
                layer2 AS (
                    SELECT au.adcode FROM adminbounds.admin_units au, input
                    WHERE au.geom_bbox && ST_Envelope(input.g)
                    AND ST_Intersects(au.geom_hull, input.g)
                ),
                layer3 AS (
                    SELECT au.adcode FROM adminbounds.admin_units au, input
                    WHERE au.geom_bbox && ST_Envelope(input.g)
                    AND ST_Intersects(au.geom_hull, input.g)
                    AND ST_Intersects(
                        CASE WHEN au.vertex_count > 500 THEN au.geom_simple ELSE au.geom END,
                        input.g
                    )
                )
                SELECT
                    (SELECT COUNT(*) FROM layer1) AS after_layer1_bbox,
                    (SELECT COUNT(*) FROM layer2) AS after_layer2_hull,
                    (SELECT COUNT(*) FROM layer3) AS after_layer3_geom
            """, (wkt,))
            row = cur.fetchone()
            results["filter_layers"] = {
                "after_bbox": row[0],
                "after_hull": row[1],
                "after_geom": row[2],
            }
            print(f"\n Three-layer filter candidates (first geometry):")
            print(f" After layer 1 (bbox): {row[0]}")
            print(f" After layer 2 (hull): {row[1]}")
            print(f" After layer 3 (geom): {row[2]}")
            if row[0] == 0:
                print(f" {FAIL} Nothing passes bbox filter → geom_bbox NULL or CRS mismatch")
            elif row[2] == 0:
                print(f" {WARN} Passes bbox/hull but not fine geometry → simplification or topology issue")

    return results
|
adminbounds/_gadm.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GADM 4.1 worldwide admin boundary downloader and importer.
|
|
3
|
+
|
|
4
|
+
Downloads GeoJSON zips from the GADM CDN, maps fields to the admin_units schema,
|
|
5
|
+
and upserts via the existing staging pipeline.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
import zipfile
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from urllib.request import urlopen, Request
|
|
14
|
+
from urllib.error import HTTPError
|
|
15
|
+
|
|
16
|
+
import geopandas as gpd
|
|
17
|
+
from shapely.geometry import shape
|
|
18
|
+
from tqdm import tqdm
|
|
19
|
+
|
|
20
|
+
from ._import import _upsert_staging, _compute_derived_fields
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Country name → ISO3 lookup (lowercase keys)
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
_COUNTRY_ISO3: dict[str, str] = {
    # Keys are lowercase English names; common aliases ("uk", "usa",
    # "dr congo", "uae", "turkiye", …) map to the same ISO3 code.
    "afghanistan": "AFG", "albania": "ALB", "algeria": "DZA", "andorra": "AND",
    "angola": "AGO", "argentina": "ARG", "armenia": "ARM", "australia": "AUS",
    "austria": "AUT", "azerbaijan": "AZE", "bahrain": "BHR", "bangladesh": "BGD",
    "belarus": "BLR", "belgium": "BEL", "belize": "BLZ", "benin": "BEN",
    "bhutan": "BTN", "bolivia": "BOL", "bosnia and herzegovina": "BIH",
    "botswana": "BWA", "brazil": "BRA", "brunei": "BRN", "bulgaria": "BGR",
    "burkina faso": "BFA", "burundi": "BDI", "cambodia": "KHM", "cameroon": "CMR",
    "canada": "CAN", "central african republic": "CAF", "chad": "TCD",
    "chile": "CHL", "china": "CHN", "colombia": "COL", "congo": "COG",
    "democratic republic of the congo": "COD", "dr congo": "COD",
    "costa rica": "CRI", "croatia": "HRV", "cuba": "CUB", "cyprus": "CYP",
    "czech republic": "CZE", "czechia": "CZE", "denmark": "DNK", "djibouti": "DJI",
    "dominican republic": "DOM", "ecuador": "ECU", "egypt": "EGY",
    "el salvador": "SLV", "eritrea": "ERI", "estonia": "EST", "eswatini": "SWZ",
    "ethiopia": "ETH", "finland": "FIN", "france": "FRA", "gabon": "GAB",
    "gambia": "GMB", "georgia": "GEO", "germany": "DEU", "ghana": "GHA",
    "greece": "GRC", "guatemala": "GTM", "guinea": "GIN",
    "guinea-bissau": "GNB", "guyana": "GUY", "haiti": "HTI", "honduras": "HND",
    "hungary": "HUN", "iceland": "ISL", "india": "IND", "indonesia": "IDN",
    "iran": "IRN", "iraq": "IRQ", "ireland": "IRL", "israel": "ISR",
    "italy": "ITA", "jamaica": "JAM", "japan": "JPN", "jordan": "JOR",
    "kazakhstan": "KAZ", "kenya": "KEN", "kuwait": "KWT", "kyrgyzstan": "KGZ",
    "laos": "LAO", "latvia": "LVA", "lebanon": "LBN", "lesotho": "LSO",
    "liberia": "LBR", "libya": "LBY", "liechtenstein": "LIE", "lithuania": "LTU",
    "luxembourg": "LUX", "madagascar": "MDG", "malawi": "MWI", "malaysia": "MYS",
    "mali": "MLI", "malta": "MLT", "mauritania": "MRT", "mexico": "MEX",
    "moldova": "MDA", "mongolia": "MNG", "montenegro": "MNE", "morocco": "MAR",
    "mozambique": "MOZ", "myanmar": "MMR", "namibia": "NAM", "nepal": "NPL",
    "netherlands": "NLD", "new zealand": "NZL", "nicaragua": "NIC",
    "niger": "NER", "nigeria": "NGA", "north korea": "PRK",
    "north macedonia": "MKD", "norway": "NOR", "oman": "OMN", "pakistan": "PAK",
    "panama": "PAN", "papua new guinea": "PNG", "paraguay": "PRY", "peru": "PER",
    "philippines": "PHL", "poland": "POL", "portugal": "PRT", "qatar": "QAT",
    "romania": "ROU", "russia": "RUS", "rwanda": "RWA", "saudi arabia": "SAU",
    "senegal": "SEN", "serbia": "SRB", "sierra leone": "SLE", "singapore": "SGP",
    "slovakia": "SVK", "slovenia": "SVN", "somalia": "SOM", "south africa": "ZAF",
    "south korea": "KOR", "south sudan": "SSD", "spain": "ESP", "sri lanka": "LKA",
    "sudan": "SDN", "sweden": "SWE", "switzerland": "CHE", "syria": "SYR",
    "taiwan": "TWN", "tajikistan": "TJK", "tanzania": "TZA", "thailand": "THA",
    "timor-leste": "TLS", "east timor": "TLS", "togo": "TGO", "tunisia": "TUN",
    "turkey": "TUR", "turkiye": "TUR", "turkmenistan": "TKM", "uganda": "UGA",
    "ukraine": "UKR", "united arab emirates": "ARE", "uae": "ARE",
    "united kingdom": "GBR", "uk": "GBR", "great britain": "GBR",
    "united states": "USA", "united states of america": "USA", "usa": "USA",
    "us": "USA", "uruguay": "URY", "uzbekistan": "UZB", "venezuela": "VEN",
    "vietnam": "VNM", "viet nam": "VNM", "yemen": "YEM", "zambia": "ZMB",
    "zimbabwe": "ZWE",
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _resolve_iso3(country: str) -> str:
    """Resolve a country name or ISO3 code to an uppercase ISO3 code.

    The name table is consulted *before* the raw 3-letter passthrough so that
    3-letter names that are not themselves ISO3 codes resolve correctly
    (e.g. "uae" → "ARE", not "UAE").

    Args:
        country: English country name, known alias, or ISO3 code.

    Returns:
        Uppercase ISO3 code.

    Raises:
        ValueError: If the input matches nothing, with close-match
            suggestions when any exist.
    """
    lower = country.strip().lower()
    # Known name or alias first — fixes inputs like "uae" whose ISO3 differs
    # from their simple uppercase form.
    if lower in _COUNTRY_ISO3:
        return _COUNTRY_ISO3[lower]

    # Any other 3-letter alpha token is treated as an ISO3 code.
    upper = country.strip().upper()
    if len(upper) == 3 and upper.isalpha():
        return upper

    # Unknown: offer substring-based suggestions to help the caller.
    suggestions = [k for k in _COUNTRY_ISO3 if lower in k or k in lower]
    msg = f"Country not recognised: {country!r}."
    if suggestions:
        msg += f" Did you mean: {', '.join(suggestions[:5])}?"
    raise ValueError(msg)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _gadm_url(iso3: str, level: int) -> str:
|
|
95
|
+
return f"https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_{level}.json.zip"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _download_file(url: str, dest: Path, force: bool) -> Path | None:
    """Fetch *url* into *dest*, reusing a cached copy unless *force* is set.

    Returns the destination path, or None when the server answers HTTP 404
    (GADM omits some admin levels for some countries). Any other HTTP error
    propagates.
    """
    if dest.exists() and not force:
        return dest

    print(f" Downloading {url} ...")
    request = Request(url, headers={"User-Agent": "adminbounds/1.0"})
    try:
        with urlopen(request) as response:
            size = int(response.headers.get("Content-Length", 0))
            dest.parent.mkdir(parents=True, exist_ok=True)
            # Accumulate in memory first so an interrupted transfer never
            # leaves a truncated file on disk.
            payload = io.BytesIO()
            progress = tqdm(
                total=size or None,
                unit="B",
                unit_scale=True,
                desc=dest.name,
                leave=False,
            )
            with progress:
                for chunk in iter(lambda: response.read(65536), b""):
                    payload.write(chunk)
                    progress.update(len(chunk))
            dest.write_bytes(payload.getvalue())
    except HTTPError as exc:
        if exc.code != 404:
            raise
        print(f" Level not available (HTTP 404): {url}")
        return None
    return dest
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _extract_json(zip_path: Path) -> dict:
|
|
133
|
+
"""Extract and parse the first .json file from a zip archive."""
|
|
134
|
+
import json
|
|
135
|
+
with zipfile.ZipFile(zip_path) as zf:
|
|
136
|
+
json_names = [n for n in zf.namelist() if n.endswith(".json")]
|
|
137
|
+
if not json_names:
|
|
138
|
+
raise ValueError(f"No .json file found in {zip_path}")
|
|
139
|
+
with zf.open(json_names[0]) as f:
|
|
140
|
+
return json.load(f)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _parse_gadm_features(data: dict, gadm_level: int) -> list[dict]:
|
|
144
|
+
"""Map GeoJSON features from a GADM file to admin_units row dicts."""
|
|
145
|
+
gid_key = f"GID_{gadm_level}"
|
|
146
|
+
name_key = f"NAME_{gadm_level}"
|
|
147
|
+
parent_key = f"GID_{gadm_level - 1}" if gadm_level > 0 else None
|
|
148
|
+
db_level = gadm_level + 1 # GADM 0→level 1, GADM 1→level 2, …
|
|
149
|
+
|
|
150
|
+
rows = []
|
|
151
|
+
for feature in data.get("features", []):
|
|
152
|
+
props = feature.get("properties", {}) or {}
|
|
153
|
+
geom_data = feature.get("geometry")
|
|
154
|
+
|
|
155
|
+
adcode = props.get(gid_key)
|
|
156
|
+
if not adcode or not geom_data:
|
|
157
|
+
continue
|
|
158
|
+
|
|
159
|
+
name = props.get(name_key) or adcode
|
|
160
|
+
parent_code = props.get(parent_key) if parent_key else None
|
|
161
|
+
geometry = shape(geom_data)
|
|
162
|
+
|
|
163
|
+
rows.append({
|
|
164
|
+
"adcode": adcode,
|
|
165
|
+
"name": name,
|
|
166
|
+
"level": db_level,
|
|
167
|
+
"parent_code": parent_code,
|
|
168
|
+
"geometry": geometry,
|
|
169
|
+
})
|
|
170
|
+
return rows
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
# Public API
|
|
175
|
+
# ---------------------------------------------------------------------------
|
|
176
|
+
|
|
177
|
+
def download_gadm(
    country: str,
    engine,
    levels: list[int] | None = None,
    cache_dir: Path | None = None,
    force: bool = False,
) -> int:
    """Download and import GADM 4.1 boundaries for a country.

    Args:
        country: ISO3 code (e.g. "DEU") or English name (e.g. "Germany").
        engine: SQLAlchemy engine connected to the adminbounds DB.
        levels: GADM levels to import (0=country … 3=district).
            Default: all available [0, 1, 2, 3].
        cache_dir: Directory for cached zip files.
            Default: ~/.adminbounds/gadm_cache/
        force: Re-download even if already cached.

    Returns:
        Total rows upserted into adminbounds.admin_units.

    Raises:
        ValueError: If *country* cannot be resolved to an ISO3 code.
    """
    if levels is None:
        levels = [0, 1, 2, 3]

    iso3 = _resolve_iso3(country)
    print(f"Resolved '{country}' → ISO3={iso3}")

    if cache_dir is None:
        cache_dir = Path.home() / ".adminbounds" / "gadm_cache"
    cache_dir = Path(cache_dir)  # normalize: callers may pass a plain string

    all_rows: list[dict] = []

    for lvl in levels:
        url = _gadm_url(iso3, lvl)
        zip_path = cache_dir / f"gadm41_{iso3}_{lvl}.json.zip"

        # _download_file returns None on HTTP 404 — not every country is
        # published at every admin level.
        zip_file = _download_file(url, zip_path, force)
        if zip_file is None:
            continue  # level not available

        print(f" Parsing level {lvl} ...")
        data = _extract_json(zip_file)
        rows = _parse_gadm_features(data, lvl)
        print(f" → {len(rows)} features")
        all_rows.extend(rows)

    if not all_rows:
        print("No data downloaded.")
        return 0

    # Deduplicate by adcode
    # (dict insertion order: the last occurrence of an adcode wins)
    seen: dict[str, dict] = {}
    for row in all_rows:
        seen[row["adcode"]] = row
    deduped = list(seen.values())
    print(f"Total unique units: {len(deduped)}")

    print("Upserting into adminbounds.admin_units ...")
    gdf = gpd.GeoDataFrame(deduped, crs="EPSG:4326")
    gdf = gdf.rename_geometry("geom")  # staging pipeline expects a "geom" column
    _upsert_staging(engine, gdf)
    print(" Upsert complete.")

    # Recompute bbox/hull/simplified geometries after the upsert so the
    # inference function's layered filters see fresh derived fields.
    _compute_derived_fields(engine)

    return len(deduped)
|