PyPI - osmsg - Versions diffs - 1.1.2__tar.gz → 1.2.0__tar.gz - Mend

osmsg 1.1.2__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

{osmsg-1.1.2 → osmsg-1.2.0}/PKG-INFO +64 -2
{osmsg-1.1.2 → osmsg-1.2.0}/README.md +63 -1
osmsg-1.2.0/osmsg/__version__.py +1 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/cli.py +81 -4
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/__init__.py +1 -1
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/ingest.py +1 -1
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/exceptions.py +1 -1
osmsg-1.2.0/osmsg/export/psql.py +156 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/handlers.py +1 -1
osmsg-1.2.0/osmsg/history.py +272 -0
osmsg-1.2.0/osmsg/maintain/__init__.py +7 -0
osmsg-1.2.0/osmsg/maintain/cli.py +83 -0
osmsg-1.2.0/osmsg/maintain/convert.py +314 -0
osmsg-1.2.0/osmsg/maintain/manifest.py +62 -0
osmsg-1.2.0/osmsg/maintain/month.py +120 -0
osmsg-1.2.0/osmsg/maintain/parquet.py +43 -0
osmsg-1.2.0/osmsg/maintain/pbf_split.py +79 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/pipeline.py +233 -16
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/replication.py +1 -1
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/tm.py +1 -1
{osmsg-1.1.2 → osmsg-1.2.0}/pyproject.toml +1 -1
osmsg-1.1.2/osmsg/__version__.py +0 -1
osmsg-1.1.2/osmsg/export/psql.py +0 -69
{osmsg-1.1.2 → osmsg-1.2.0}/LICENSE +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/__init__.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/_http.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/_tick.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/auth.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/boundary.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/duckdb_schema.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/queries.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/schema.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/__init__.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/csv.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/json.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/markdown.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/parquet.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/fetch.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/geofabrik.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/models.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/pg_schema.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/py.typed +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/ui.py +0 -0
{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/workers.py +0 -0

{osmsg-1.1.2 → osmsg-1.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: osmsg
-Version: 1.1.2
+Version: 1.2.0
 Summary: OpenStreetMap Stats Generator: Commandline
 Keywords: osm,stats,commandline,openstreetmap
 Author: Kshitij Raj Sharma
@@ -46,13 +46,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
 A Project of [OSGeo Nepal](https://osgeonepal.org).
-## Features
+## What does it do?
 - Per-user create/modify/delete counts over any time window.
 - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
 - Country and custom-boundary filters via Geofabrik.
 - Cron-friendly resume with `--update`.
+- One-command setup: `osmsg --insert` loads all history into your store, `osmsg --update` keeps it current.
 - Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
+- Cloud-native history: months covered by a published parquet dataset are read remotely.
 ## Install
@@ -68,6 +70,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
 `uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
 with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
+More ways to install:
+```bash
+conda install -c conda-forge osmsg                 # conda / mamba
+brew install osgeonepal/tap/osmsg          # macOS / Linux (Homebrew tap)
+```
+On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
+and run it directly, no Python required.
 ## Quick start
 ```bash
@@ -78,6 +90,38 @@ osmsg --hashtags hotosm --last day       # only changesets tagged #hotosm
 That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
+## Set up a full history store
+Two commands give you a complete, self-updating store. The first loads all of OSM history from the
+published dataset and records where to resume; the second catches up to now and runs on a schedule.
+```bash
+osmsg --insert            # load all history into stats.duckdb, then exit
+osmsg --update            # catch up to now (repeat on cron)
+```
+`osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
+current. For near-real-time, run `osmsg --update --url minute`.
+Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
+```bash
+osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
+osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
+```
+Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
+```bash
+osmsg --insert --start 2020-01-01 --end 2023-01-01
+```
+Already have the planet files? Insert from them directly:
+```bash
+osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
+```
 ## Tutorials
 ### 1. Stats for a country
@@ -185,6 +229,11 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
 | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
 | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
 | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
+| `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
+| `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
+| `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Base URL of the published history dataset. |
+| `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
+| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
 | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -192,6 +241,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
 A `.env` file at the working directory is loaded automatically.
+## Maintainers
+The `osmsg maintain` command group generates and publishes the history dataset:
+```bash
+osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history   # append one finished month
+osmsg maintain month 2026-06 --no-upload                       # generate locally, review, upload later
+osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
+osmsg maintain publish work/out --repo osgeonepal/osmsg-history
+```
+See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
 ## Documentation
 - [Installation](./docs/Installation.md)

{osmsg-1.1.2 → osmsg-1.2.0}/README.md RENAMED Viewed

@@ -14,13 +14,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
 A Project of [OSGeo Nepal](https://osgeonepal.org).
-## Features
+## What does it do?
 - Per-user create/modify/delete counts over any time window.
 - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
 - Country and custom-boundary filters via Geofabrik.
 - Cron-friendly resume with `--update`.
+- One-command setup: `osmsg --insert` loads all history into your store, `osmsg --update` keeps it current.
 - Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
+- Cloud-native history: months covered by a published parquet dataset are read remotely.
 ## Install
@@ -36,6 +38,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
 `uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
 with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
+More ways to install:
+```bash
+conda install -c conda-forge osmsg                 # conda / mamba
+brew install osgeonepal/tap/osmsg          # macOS / Linux (Homebrew tap)
+```
+On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
+and run it directly, no Python required.
 ## Quick start
 ```bash
@@ -46,6 +58,38 @@ osmsg --hashtags hotosm --last day       # only changesets tagged #hotosm
 That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
+## Set up a full history store
+Two commands give you a complete, self-updating store. The first loads all of OSM history from the
+published dataset and records where to resume; the second catches up to now and runs on a schedule.
+```bash
+osmsg --insert            # load all history into stats.duckdb, then exit
+osmsg --update            # catch up to now (repeat on cron)
+```
+`osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
+current. For near-real-time, run `osmsg --update --url minute`.
+Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
+```bash
+osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
+osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
+```
+Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
+```bash
+osmsg --insert --start 2020-01-01 --end 2023-01-01
+```
+Already have the planet files? Insert from them directly:
+```bash
+osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
+```
 ## Tutorials
 ### 1. Stats for a country
@@ -153,6 +197,11 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
 | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
 | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
 | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
+| `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
+| `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
+| `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Base URL of the published history dataset. |
+| `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
+| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
 | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -160,6 +209,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
 A `.env` file at the working directory is loaded automatically.
+## Maintainers
+The `osmsg maintain` command group generates and publishes the history dataset:
+```bash
+osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history   # append one finished month
+osmsg maintain month 2026-06 --no-upload                       # generate locally, review, upload later
+osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
+osmsg maintain publish work/out --repo osgeonepal/osmsg-history
+```
+See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
 ## Documentation
 - [Installation](./docs/Installation.md)

osmsg-1.2.0/osmsg/__version__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.2.0"

{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/cli.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """Typer-based CLI for osmsg.
-UTC throughout — no display timezone. Outputs default to parquet (queryable from
+UTC throughout, no display timezone. Outputs default to parquet (queryable from
 disk by DuckDB / polars / pandas). Other formats: csv, json, markdown, psql.
 """
@@ -24,6 +24,7 @@ from .exceptions import (
     OsmsgError,
     UnknownRegionError,
 )
+from .maintain.cli import maintain_app
 from .pipeline import RunConfig, run
 from .ui import console, error, info, render_table, warn
@@ -36,6 +37,7 @@ app = typer.Typer(
     no_args_is_help=False,
     help="OpenStreetMap stats generator. Parquet-first, OAuth 2.0, UTC-only.",
 )
+app.add_typer(maintain_app, name="maintain")
 class Period(StrEnum):
@@ -104,9 +106,10 @@ def _period_range(period: Period) -> tuple[dt.datetime, dt.datetime]:
     raise ValueError(period)
-@app.command()
+@app.callback(invoke_without_command=True)
 @use_yaml_config(param_name="config", param_help="YAML config file (CLI flags override its values).")
 def main(
+    ctx: typer.Context,
     version: Annotated[
         bool | None,
         typer.Option("--version", callback=_version_callback, is_eager=True, help="Print version and exit."),
@@ -215,6 +218,15 @@ def main(
         str | None,
         typer.Option("--psql-dsn", envvar="OSMSG_PSQL_DSN", help="libpq DSN for --format psql."),
     ] = None,
+    psql_bulk: Annotated[
+        bool,
+        typer.Option(
+            "--psql-bulk",
+            envvar="OSMSG_PSQL_BULK",
+            help="Faster one-time psql load: drop secondary indexes and foreign keys during the push "
+            "and rebuild them after. Use for a full history import, not for incremental --update.",
+        ),
+    ] = False,
     changeset_pad_hours: Annotated[
         int,
         typer.Option(
@@ -226,16 +238,68 @@ def main(
             max=48,
         ),
     ] = 1,
+    history: Annotated[
+        bool,
+        typer.Option(
+            "--history/--no-history",
+            envvar="OSMSG_HISTORY",
+            help="Serve covered months from the published parquet (HuggingFace) and only download the "
+            "recent tail. Falls back to the live diff path if unavailable. Ignored by --update.",
+        ),
+    ] = True,
+    history_url: Annotated[
+        str,
+        typer.Option(
+            "--history-url",
+            envvar="OSMSG_HISTORY_URL",
+            help="Base URL of the published history dataset.",
+        ),
+    ] = "hf://datasets/kshitijrajsharma/osmsg-history",
+    insert: Annotated[
+        bool,
+        typer.Option(
+            "--insert",
+            help="Load history into the store and seed resume state, then exit. No window loads the "
+            "whole published history; --start/--end loads a slice. Follow with --update to catch up.",
+        ),
+    ] = False,
+    osh_file: Annotated[
+        str | None,
+        typer.Option("--osh-file", help="Insert from a local .osh.pbf instead of the published dataset."),
+    ] = None,
+    changeset_file: Annotated[
+        str | None,
+        typer.Option("--changeset-file", help="Changeset dump (.osm.bz2) paired with --osh-file."),
+    ] = None,
 ) -> None:
-    """Run osmsg."""
+    """Run osmsg. With no subcommand this generates stats (or loads history with --insert)."""
+    if ctx.invoked_subcommand is not None:
+        return
     if formats is None:
         formats = [Format.parquet]
+    if psql_dsn and Format.psql not in formats:
+        formats.append(Format.psql)
     if sum(1 for x in (start, last, days) if x) > 1:
-        error("--start, --last, and --days are mutually exclusive — pick one.")
+        error("--start, --last, and --days are mutually exclusive, pick one.")
         raise typer.Exit(code=2)
     if update and any(x is not None for x in (start, end, last, days)):
         error("--update resumes from prior state and runs to head; it ignores --start/--end/--last/--days.")
         raise typer.Exit(code=2)
+    if insert and update:
+        error("--insert and --update are mutually exclusive; insert first, then update.")
+        raise typer.Exit(code=2)
+    if insert and (last is not None or days is not None):
+        error("--insert takes --start/--end (or no window), not --last/--days.")
+        raise typer.Exit(code=2)
+    if (osh_file is None) != (changeset_file is None):
+        error("--osh-file and --changeset-file must be given together.")
+        raise typer.Exit(code=2)
+    if osh_file and not insert:
+        error("--osh-file/--changeset-file are only valid with --insert.")
+        raise typer.Exit(code=2)
+    if psql_bulk and update:
+        error("--psql-bulk is for a one-time full load (drops indexes/keys); do not use it with --update.")
+        raise typer.Exit(code=2)
     if Format.psql in formats and not psql_dsn:
         error("-f psql requires --psql-dsn (libpq connection string, e.g. 'host=localhost dbname=osm user=osm').")
         raise typer.Exit(code=2)
@@ -267,7 +331,13 @@ def main(
         osm_username=username,
         osm_password=_read_password_stdin() if password_stdin else None,
         psql_dsn=psql_dsn,
+        psql_bulk=psql_bulk,
         changeset_pad_hours=changeset_pad_hours,
+        history_mode="auto" if history else "off",
+        history_url=history_url,
+        insert=insert,
+        osh_file=osh_file,
+        changeset_file=changeset_file,
     )
     if last is not None:
@@ -300,6 +370,13 @@ def main(
         error(str(exc))
         raise typer.Exit(code=2) from exc
+    if insert:
+        info(f"insert complete: {result['rows']:,} history changeset rows loaded.")
+        for label, path in (result.get("files") or {}).items():
+            console.print(f"[green]✓[/green] {label}: [bold]{path}[/bold]")
+        console.print("Next: [bold]osmsg --update[/bold] to catch up to now.")
+        return
     rows_data = result.get("rows_data") or []
     display_n = min(rows or 20, len(rows_data))
     render_table(

{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/__init__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 """DuckDB persistence: schema, ingest, queries.
 The schema is portable: identical column shape works in DuckDB, Parquet, and
-PostgreSQL — exporters re-issue the CREATE TABLE there.
+PostgreSQL, exporters re-issue the CREATE TABLE there.
 Public surface:

{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/ingest.py RENAMED Viewed

@@ -106,7 +106,7 @@ def merge_parquet_files(conn: duckdb.DuckDBPyConnection, parquet_dir: Path, *, c
     _quarantine_corrupt(parquet_dir)
     def pattern(name: str) -> str:
-        # read_parquet() takes a literal — escape so quoted paths can't break out.
+        # read_parquet() takes a literal, escape so quoted paths can't break out.
         return _sql_escape((parquet_dir / f"temp_*_{name}_*.parquet").as_posix())
     conn.execute("BEGIN")

{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/exceptions.py RENAMED Viewed

@@ -25,7 +25,7 @@ class GeofabrikAuthError(OsmsgError):
 class NoDataFoundError(Exception):
-    """Empty range — info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
+    """Empty range, info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
 __all__ = [

osmsg-1.2.0/osmsg/export/psql.py ADDED Viewed

@@ -0,0 +1,156 @@
+"""PostgreSQL exporter via DuckDB's postgres extension."""
+import duckdb
+from ..exceptions import OsmsgError
+from ..pg_schema import PG_SCHEMA
+# Secondary indexes and foreign keys that make a row-by-row insert slow. For a one-time bulk load
+# they are dropped before the COPY and rebuilt once after (one index build + one FK validation,
+# instead of maintaining them per row). Primary keys stay, because the ON CONFLICT upserts need them.
+# Indexes are (name, create-sql); foreign keys are (table, name, add-clause).
+_BULK_INDEXES = [
+    ("idx_changesets_created_at", "CREATE INDEX idx_changesets_created_at ON changesets (created_at)"),
+    ("idx_changesets_geom", "CREATE INDEX idx_changesets_geom ON changesets USING GIST (geom)"),
+    ("idx_changeset_stats_uid", "CREATE INDEX idx_changeset_stats_uid ON changeset_stats (uid)"),
+]
+_BULK_FKS = [
+    ("changesets", "changesets_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
+    (
+        "changeset_stats",
+        "changeset_stats_changeset_id_fkey",
+        "FOREIGN KEY (changeset_id) REFERENCES changesets (changeset_id)",
+    ),
+    ("changeset_stats", "changeset_stats_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
+]
+# Bulk loads push the big tables in this many changeset_id ranges, each its own statement and so its
+# own commit, so a failure costs one range instead of rolling back the whole multi-GB load.
+_BULK_COMMIT_CHUNKS = 32
+def _pg(conn: duckdb.DuckDBPyConnection, sql: str) -> None:
+    conn.execute(f"CALL postgres_execute('pg_target', $${sql}$$)")
+def _pg_has_history(conn: duckdb.DuckDBPyConnection) -> bool:
+    """True if the PG target already holds the history layer (seq_id=0); checked cheaply with LIMIT 1."""
+    probe = "SELECT count(*) FROM (SELECT 1 FROM pg_target.changeset_stats WHERE seq_id = 0 LIMIT 1) t"
+    row = conn.execute(probe).fetchone()
+    return bool(row and row[0])
+def _push_changesets(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
+    # Newer non-NULL wins, NULL never downgrades (mirrors the DuckDB-side merge).
+    conn.execute(
+        f"""
+        INSERT INTO pg_target.changesets AS c (changeset_id, uid, created_at, hashtags, editor, geom)
+        SELECT changeset_id, uid, created_at, hashtags, editor, geom FROM changesets {where}
+        ON CONFLICT (changeset_id) DO UPDATE SET
+            created_at = COALESCE(EXCLUDED.created_at, c.created_at),
+            hashtags   = COALESCE(EXCLUDED.hashtags,   c.hashtags),
+            editor     = COALESCE(EXCLUDED.editor,     c.editor),
+            geom       = COALESCE(EXCLUDED.geom,       c.geom)
+        """
+    )
+def _push_changeset_stats(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
+    conn.execute(f"INSERT INTO pg_target.changeset_stats SELECT * FROM changeset_stats {where} ON CONFLICT DO NOTHING")
+def _push_chunked(conn: duckdb.DuckDBPyConnection, source: str, push) -> None:
+    """Call push() once per changeset_id range so each range commits on its own."""
+    bounds = conn.execute(f"SELECT min(changeset_id), max(changeset_id) FROM {source}").fetchone()
+    if not bounds or bounds[0] is None:
+        return
+    lo, hi = bounds
+    step = (hi - lo) // _BULK_COMMIT_CHUNKS + 1
+    cursor = lo
+    while cursor <= hi:
+        push(conn, f"WHERE changeset_id >= {cursor} AND changeset_id < {cursor + step}")
+        cursor += step
+def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = False) -> None:
+    """Push every osmsg table to the libpq DSN target. bulk_load is for the one-time full-history
+    import (drops indexes and foreign keys, streams, rebuilds, commits per range); leave it off for
+    incremental --update pushes. The DSN is interpolated into ATTACH, so it must be trusted."""
+    conn.execute("INSTALL postgres")
+    conn.execute("LOAD postgres")
+    conn.execute("INSTALL spatial")
+    conn.execute("LOAD spatial")
+    safe_dsn = dsn.replace("'", "''")
+    conn.execute(f"ATTACH '{safe_dsn}' AS pg_target (TYPE postgres)")
+    try:
+        for stmt in PG_SCHEMA.strip().split(";"):
+            stmt = stmt.strip()
+            if stmt:
+                _pg(conn, stmt)
+        # Refuse cross-source push: would double-count via the (seq_id, changeset_id) PK.
+        local_sources = {r[0] for r in conn.execute("SELECT source_url FROM state").fetchall()}
+        existing_sources = {r[0] for r in conn.execute("SELECT source_url FROM pg_target.state").fetchall()}
+        cross_source = existing_sources - local_sources
+        if cross_source and local_sources:
+            raise OsmsgError(
+                f"PG target already has data from source(s) {sorted(cross_source)} "
+                f"but this run pushes from {sorted(local_sources)}. Mixing sources "
+                f"double-counts via the (seq_id, changeset_id) key. Use a separate "
+                f"--psql-dsn, or wipe the existing PG tables first."
+            )
+        if bulk_load:
+            # Stream rows instead of buffering them to preserve order; buffering 180M+ JSON-bearing
+            # rows is what exhausts memory in a single INSERT. Then drop the secondary indexes and
+            # foreign keys so the load does not maintain them per row.
+            conn.execute("SET preserve_insertion_order = false")
+            for table, name, _add in _BULK_FKS:
+                _pg(conn, f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {name}")
+            for name, _create in _BULK_INDEXES:
+                _pg(conn, f"DROP INDEX IF EXISTS {name}")
+            conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
+            _push_chunked(conn, "changesets", _push_changesets)
+            _push_chunked(conn, "changeset_stats", _push_changeset_stats)
+        elif _pg_has_history(conn):
+            # The history layer (seq_id=0) is already in PG from the bulk load and never changes, so an
+            # incremental --update pushes only the live layer and its parents, not the 180M history rows.
+            live_ids = "changeset_id IN (SELECT changeset_id FROM changeset_stats WHERE seq_id <> 0)"
+            conn.execute(
+                "INSERT INTO pg_target.users SELECT * FROM users "
+                "WHERE uid IN (SELECT uid FROM changeset_stats WHERE seq_id <> 0) ON CONFLICT DO NOTHING"
+            )
+            _push_changesets(conn, f"WHERE {live_ids}")
+            _push_changeset_stats(conn, "WHERE seq_id <> 0")
+        else:
+            # No history in PG (a plain live target): push everything (live rows are all seq_id<>0).
+            conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
+            _push_changesets(conn)
+            _push_changeset_stats(conn)
+        conn.execute(
+            """
+            INSERT INTO pg_target.state (source_url, last_seq, last_ts, updated_at)
+            SELECT source_url, last_seq, last_ts, updated_at FROM state
+            ON CONFLICT (source_url) DO UPDATE SET
+                last_seq   = EXCLUDED.last_seq,
+                last_ts    = EXCLUDED.last_ts,
+                updated_at = EXCLUDED.updated_at
+            """
+        )
+        if bulk_load:
+            # Rebuild once, with more memory for the sort-based index builds, then refresh planner stats.
+            for table, name, add in _BULK_FKS:
+                _pg(conn, f"ALTER TABLE {table} ADD CONSTRAINT {name} {add}")
+            for _name, create in _BULK_INDEXES:
+                _pg(conn, f"SET maintenance_work_mem = '512MB'; {create}")
+            _pg(conn, "ANALYZE users")
+            _pg(conn, "ANALYZE changesets")
+            _pg(conn, "ANALYZE changeset_stats")
+    finally:
+        conn.execute("DETACH pg_target")
+__all__ = ["PG_SCHEMA", "to_psql"]

{osmsg-1.1.2 → osmsg-1.2.0}/osmsg/handlers.py RENAMED Viewed

@@ -56,7 +56,7 @@ class ChangesetHandler(osmium.SimpleHandler):
         keep = bool(cfg["changeset_meta"] and not cfg["hashtags"])
         # Some editors only fill the `hashtags` tag (comment stays generic); checking
-        # comment alone silently drops those. Tokenize via regex on both — real data
+        # comment alone silently drops those. Tokenize via regex on both, real data
         # mixes `;`, space, and comma as separators inside `hashtags`.
         comment = c.tags.get("comment", "")
         hashtags_field = c.tags.get("hashtags", "")

osmsg 1.1.2__tar.gz → 1.2.0__tar.gz

osmsg 1.1.2__tar.gz → 1.2.0__tar.gz