PyPI - osmsg - Versions diffs - 1.2.0__tar.gz → 1.2.2__tar.gz - Mend

osmsg 1.2.0 (tar.gz) → 1.2.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of osmsg might be problematic. Click here for more details.

Files changed (44)

{osmsg-1.2.0 → osmsg-1.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: osmsg
-Version: 1.2.0
+Version: 1.2.2
 Summary: OpenStreetMap Stats Generator: Commandline
 Keywords: osm,stats,commandline,openstreetmap
 Author: Kshitij Raj Sharma
@@ -78,7 +78,8 @@ brew install osgeonepal/tap/osmsg          # macOS / Linux (Homebrew tap)
 ```
 On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
-and run it directly, no Python required.
+and double-click it to open the desktop app. Fill in the dates and options, click Run, and open the
+output folder. The CLI below is for macOS, Linux, and pip/uv users.
 ## Quick start
@@ -213,6 +214,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
 Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
 `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
+Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
+refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
 ## Configuration
 Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
@@ -228,12 +232,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
 | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
 | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
 | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
+| `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
 | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
 | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
 | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
 | `--history-url` | `OSMSG_HISTORY_URL` | `osmsg-history` | Published dataset location. |
 | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
-| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
+| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files. |
 | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |

{osmsg-1.2.0 → osmsg-1.2.2}/README.md RENAMED Viewed

@@ -46,7 +46,8 @@ brew install osgeonepal/tap/osmsg          # macOS / Linux (Homebrew tap)
 ```
 On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
-and run it directly, no Python required.
+and double-click it to open the desktop app. Fill in the dates and options, click Run, and open the
+output folder. The CLI below is for macOS, Linux, and pip/uv users.
 ## Quick start
@@ -181,6 +182,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
 Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
 `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
+Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
+refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
 ## Configuration
 Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
@@ -196,12 +200,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
 | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
 | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
 | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
+| `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
 | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
 | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
 | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
 | `--history-url` | `OSMSG_HISTORY_URL` | `osmsg-history` | Published dataset location. |
 | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
-| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
+| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files. |
 | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
 | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |

osmsg-1.2.2/osmsg/__version__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.2.2"

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/cli.py RENAMED Viewed

@@ -271,6 +271,14 @@ def main(
         str | None,
         typer.Option("--changeset-file", help="Changeset dump (.osm.bz2) paired with --osh-file."),
     ] = None,
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            "--overwrite",
+            help="Recompute even if <name>.duckdb already holds this exact query; otherwise a rerun "
+            "that only changes the output format re-exports from the existing store.",
+        ),
+    ] = False,
 ) -> None:
     """Run osmsg. With no subcommand this generates stats (or loads history with --insert)."""
     if ctx.invoked_subcommand is not None:
@@ -338,6 +346,7 @@ def main(
         insert=insert,
         osh_file=osh_file,
         changeset_file=changeset_file,
+        overwrite=overwrite,
     )
     if last is not None:

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/export/psql.py RENAMED Viewed

@@ -5,10 +5,6 @@ import duckdb
 from ..exceptions import OsmsgError
 from ..pg_schema import PG_SCHEMA
-# Secondary indexes and foreign keys that make a row-by-row insert slow. For a one-time bulk load
-# they are dropped before the COPY and rebuilt once after (one index build + one FK validation,
-# instead of maintaining them per row). Primary keys stay, because the ON CONFLICT upserts need them.
-# Indexes are (name, create-sql); foreign keys are (table, name, add-clause).
 _BULK_INDEXES = [
     ("idx_changesets_created_at", "CREATE INDEX idx_changesets_created_at ON changesets (created_at)"),
     ("idx_changesets_geom", "CREATE INDEX idx_changesets_geom ON changesets USING GIST (geom)"),
@@ -25,8 +21,6 @@ _BULK_FKS = [
 ]
-# Bulk loads push the big tables in this many changeset_id ranges, each its own statement and so its
-# own commit, so a failure costs one range instead of rolling back the whole multi-GB load.
 _BULK_COMMIT_CHUNKS = 32
@@ -102,9 +96,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
             )
         if bulk_load:
-            # Stream rows instead of buffering them to preserve order; buffering 180M+ JSON-bearing
-            # rows is what exhausts memory in a single INSERT. Then drop the secondary indexes and
-            # foreign keys so the load does not maintain them per row.
             conn.execute("SET preserve_insertion_order = false")
             for table, name, _add in _BULK_FKS:
                 _pg(conn, f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {name}")
@@ -114,8 +105,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
             _push_chunked(conn, "changesets", _push_changesets)
             _push_chunked(conn, "changeset_stats", _push_changeset_stats)
         elif _pg_has_history(conn):
-            # The history layer (seq_id=0) is already in PG from the bulk load and never changes, so an
-            # incremental --update pushes only the live layer and its parents, not the 180M history rows.
             live_ids = "changeset_id IN (SELECT changeset_id FROM changeset_stats WHERE seq_id <> 0)"
             conn.execute(
                 "INSERT INTO pg_target.users SELECT * FROM users "
@@ -124,7 +113,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
             _push_changesets(conn, f"WHERE {live_ids}")
             _push_changeset_stats(conn, "WHERE seq_id <> 0")
         else:
-            # No history in PG (a plain live target): push everything (live rows are all seq_id<>0).
             conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
             _push_changesets(conn)
             _push_changeset_stats(conn)
@@ -141,7 +129,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
         )
         if bulk_load:
-            # Rebuild once, with more memory for the sort-based index builds, then refresh planner stats.
             for table, name, add in _BULK_FKS:
                 _pg(conn, f"ALTER TABLE {table} ADD CONSTRAINT {name} {add}")
             for _name, create in _BULK_INDEXES:

osmsg-1.2.2/osmsg/gui.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""Minimal tkinter desktop UI for running osmsg and saving the output."""
+from __future__ import annotations
+import datetime as dt
+import os
+import queue
+import sys
+import threading
+from pathlib import Path
+from typing import Any
+from .exceptions import NoDataFoundError, OsmsgError
+from .pipeline import RunConfig, run
+UTC = dt.UTC
+FORMATS = ["parquet", "csv", "json", "markdown"]
+def _parse_date(value: str) -> dt.datetime | None:
+    value = value.strip()
+    if not value:
+        return None
+    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
+        try:
+            return dt.datetime.strptime(value, fmt).replace(tzinfo=UTC)
+        except ValueError:
+            continue
+    raise OsmsgError(f"Unrecognized date: {value!r}. Use YYYY-MM-DD.")
+def _split(value: str | None) -> list[str] | None:
+    items: list[str] = [part.strip() for part in (value or "").split(",") if part.strip()]
+    return items if items else None
+def build_config(form: dict[str, object], output_dir: str) -> RunConfig:
+    """Map the form fields to a RunConfig, raising OsmsgError on invalid input."""
+    formats = [name for name in FORMATS if form.get(name)]
+    if not formats:
+        raise OsmsgError("Pick at least one output format.")
+    start = _parse_date(str(form.get("start", "")))
+    if start is None:
+        raise OsmsgError("Start date is required (YYYY-MM-DD).")
+    return RunConfig(
+        name=str(form.get("name") or "stats"),
+        start_date=start,
+        end_date=_parse_date(str(form.get("end", ""))),
+        hashtags=_split(str(form.get("hashtags") or "")),
+        additional_tags=_split(str(form.get("tags") or "")),
+        tag_mode="all" if form.get("all_tags") else "none",
+        summary=bool(form.get("summary")),
+        formats=formats,
+        output_dir=Path(output_dir or "."),
+    )
+def _open_folder(path: Path) -> None:
+    if sys.platform == "win32":
+        os.startfile(path)  # noqa: S606
+    elif sys.platform == "darwin":
+        import subprocess
+        subprocess.run(["open", str(path)], check=False)
+    else:
+        import subprocess
+        subprocess.run(["xdg-open", str(path)], check=False)
+class _Redirector:
+    def __init__(self, sink: queue.Queue) -> None:
+        self.sink = sink
+    def write(self, text: str) -> None:
+        if text:
+            self.sink.put(("log", text))
+    def flush(self) -> None:
+        pass
+    def isatty(self) -> bool:
+        return False
+class App:
+    def __init__(self) -> None:
+        import tkinter as tk
+        from tkinter import filedialog, scrolledtext, ttk
+        self._tk = tk
+        self._filedialog = filedialog
+        self.events: queue.Queue = queue.Queue()
+        self.out_dir = str(Path.home() / "osmsg")
+        self.root = tk.Tk()
+        self.root.title("osmsg")
+        self.vars: dict[str, Any] = {}
+        frame = ttk.Frame(self.root, padding=12)
+        frame.grid(sticky="nsew")
+        rows = [
+            ("Name", "name", "stats"),
+            ("Start (YYYY-MM-DD)", "start", ""),
+            ("End (blank = now)", "end", ""),
+            ("Hashtags (comma-sep)", "hashtags", ""),
+            ("Tags (comma-sep)", "tags", ""),
+        ]
+        for i, (label, key, default) in enumerate(rows):
+            ttk.Label(frame, text=label).grid(row=i, column=0, sticky="w", pady=2)
+            var = tk.StringVar(value=default)
+            ttk.Entry(frame, textvariable=var, width=40).grid(row=i, column=1, columnspan=3, sticky="we", pady=2)
+            self.vars[key] = var
+        self.vars["all_tags"] = tk.BooleanVar()
+        self.vars["summary"] = tk.BooleanVar()
+        ttk.Checkbutton(frame, text="All tags", variable=self.vars["all_tags"]).grid(row=5, column=0, sticky="w")
+        ttk.Checkbutton(frame, text="Daily summary", variable=self.vars["summary"]).grid(row=5, column=1, sticky="w")
+        fmt_frame = ttk.LabelFrame(frame, text="Formats", padding=6)
+        fmt_frame.grid(row=6, column=0, columnspan=4, sticky="we", pady=6)
+        for i, name in enumerate(FORMATS):
+            var = tk.BooleanVar(value=name in ("parquet", "csv"))
+            ttk.Checkbutton(fmt_frame, text=name, variable=var).grid(row=0, column=i, padx=4)
+            self.vars[name] = var
+        self.out_label = ttk.Label(frame, text=f"Output: {self.out_dir}")
+        self.out_label.grid(row=7, column=0, columnspan=3, sticky="w")
+        ttk.Button(frame, text="Choose folder", command=self._choose_folder).grid(row=7, column=3, sticky="e")
+        self.run_btn = ttk.Button(frame, text="Run", command=self._on_run)
+        self.run_btn.grid(row=8, column=0, pady=8, sticky="w")
+        self.open_btn = ttk.Button(frame, text="Open output folder", command=lambda: _open_folder(Path(self.out_dir)))
+        self.open_btn.grid(row=8, column=1, pady=8, sticky="w")
+        self.log = scrolledtext.ScrolledText(frame, width=70, height=14, state="disabled")
+        self.log.grid(row=9, column=0, columnspan=4, sticky="nsew")
+        self.root.after(120, self._drain)
+    def _choose_folder(self) -> None:
+        chosen = self._filedialog.askdirectory(initialdir=self.out_dir)
+        if chosen:
+            self.out_dir = chosen
+            self.out_label.config(text=f"Output: {self.out_dir}")
+    def _append(self, text: str) -> None:
+        self.log.config(state="normal")
+        self.log.insert("end", text)
+        self.log.see("end")
+        self.log.config(state="disabled")
+    def _on_run(self) -> None:
+        try:
+            cfg = build_config({k: v.get() for k, v in self.vars.items()}, self.out_dir)
+        except OsmsgError as exc:
+            self._append(f"\n{exc}\n")
+            return
+        self.run_btn.config(state="disabled")
+        self._append(f"\nRunning into {self.out_dir} ...\n")
+        threading.Thread(target=self._worker, args=(cfg,), daemon=True).start()
+    def _worker(self, cfg: RunConfig) -> None:
+        saved = sys.stdout, sys.stderr
+        sys.stdout = sys.stderr = _Redirector(self.events)  # type: ignore[assignment]
+        try:
+            result = run(cfg)
+            self.events.put(("done", f"Done. {result['rows']} rows. Files in {self.out_dir}"))
+        except NoDataFoundError:
+            self.events.put(("done", "No data found for that range."))
+        except OsmsgError as exc:
+            self.events.put(("done", f"Error: {exc}"))
+        except Exception as exc:
+            self.events.put(("done", f"Unexpected error: {type(exc).__name__}: {exc}"))
+        finally:
+            sys.stdout, sys.stderr = saved
+    def _drain(self) -> None:
+        try:
+            while True:
+                kind, payload = self.events.get_nowait()
+                if kind == "log":
+                    self._append(payload)
+                else:
+                    self._append(f"\n{payload}\n")
+                    self.run_btn.config(state="normal")
+        except queue.Empty:
+            pass
+        self.root.after(120, self._drain)
+    def run(self) -> None:
+        self.root.mainloop()
+def launch() -> None:
+    App().run()

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/history.py RENAMED Viewed

@@ -10,26 +10,26 @@ from dataclasses import dataclass
 import duckdb
 import requests
-from .ui import info, warn
+from .ui import info, progress_bar, warn
 UTC = dt.UTC
 SCHEMA_VERSION = 1
 DEFAULT_HISTORY_URL = "hf://datasets/kshitijrajsharma/osmsg-history"
-HISTORY_SEQ_ID = 0  # sentinel seq_id for rows sourced from the history backfill (no replication seq)
+HISTORY_SEQ_ID = 0
 @dataclass
 class Manifest:
     schema_version: int
-    min_month: dt.datetime  # first day of the earliest covered month (UTC)
-    frontier: dt.datetime  # first day of the month AFTER the latest covered month (exclusive bound)
+    min_month: dt.datetime
+    frontier: dt.datetime
 @dataclass
 class WindowSplit:
     remote_start: dt.datetime | None
-    remote_end: dt.datetime | None  # exclusive
-    live_start: dt.datetime  # the live diff path handles [live_start, end]
+    remote_end: dt.datetime | None
+    live_start: dt.datetime
     @property
     def has_remote(self) -> bool:
@@ -51,7 +51,6 @@ class RemoteFilters:
 def _manifest_http_url(history_url: str) -> str:
-    # hf://datasets/<repo> -> https://huggingface.co/datasets/<repo>/resolve/main/manifest.json
     if history_url.startswith("hf://datasets/"):
         repo = history_url[len("hf://datasets/") :]
         return f"https://huggingface.co/datasets/{repo}/resolve/main/manifest.json"
@@ -78,7 +77,7 @@ def fetch_manifest(history_url: str, timeout: int = 15) -> Manifest | None:
                 return None
             payload = response.json()
         else:
-            with open(url) as handle:  # local path (testing / self-hosted mirror)
+            with open(url) as handle:
                 payload = json.load(handle)
     except (requests.RequestException, OSError, ValueError) as exc:
         warn(f"history: manifest unreachable ({type(exc).__name__}); using live path.")
@@ -124,9 +123,8 @@ def _months(start: dt.datetime, end: dt.datetime) -> list[tuple[int, int]]:
 def _partition_list(base: str, dataset: str, months: list[tuple[int, int]]) -> str | None:
-    """Direct read_parquet() over the dataset's month partitions, or None when none exist. A glob would
-    make DuckDB list every partition over the HF API. Local bases are filtered to files that exist,
-    since a converted slice may lack a partition (e.g. a month with metadata but no counted edits)."""
+    """Direct read_parquet() over the given month partitions (local bases filtered to existing files),
+    or None when none exist."""
     root = base.rstrip("/")
     remote = root.startswith(("hf://", "http://", "https://", "s3://"))
     files = [f"{root}/{dataset}/year={year}/month={month}/data.parquet" for (year, month) in months]
@@ -138,9 +136,7 @@ def _partition_list(base: str, dataset: str, months: list[tuple[int, int]]) -> s
 def _hashtag_predicate(hashtags: list[str], exact_lookup: bool) -> str:
-    """SQL predicate over the changesets `hashtags` list, matching the live ChangesetHandler.
-    Whole-token (case-insensitive) with exact_lookup, otherwise substring. hashtags are already
-    canonicalised to a leading '#'."""
+    """SQL predicate matching the changesets `hashtags` list: whole-token with exact_lookup, else substring."""
     needles = [h.lower() for h in hashtags]
     if exact_lookup:
         terms = ", ".join(f"'{n}'" for n in needles)
@@ -160,10 +156,6 @@ def ingest_remote(
     if split.remote_start is None or split.remote_end is None:
         return 0
     months = _months(split.remote_start, split.remote_end)
-    changesets_src = _partition_list(history_url, "changesets", months)
-    changefiles_src = _partition_list(history_url, "changefiles", months)
-    if changesets_src is None and changefiles_src is None:
-        return 0
     start_iso = split.remote_start.astimezone(UTC).isoformat()
     end_iso = split.remote_end.astimezone(UTC).isoformat()
     in_window = f"created_at >= TIMESTAMPTZ '{start_iso}' AND created_at < TIMESTAMPTZ '{end_iso}'"
@@ -172,21 +164,8 @@ def ingest_remote(
     conn.execute("INSTALL spatial; LOAD spatial;")
     if history_url.startswith(("hf://", "http://", "https://", "s3://")):
         conn.execute("INSTALL httpfs; LOAD httpfs;")
-        # Ride out HF rate-limits on multi-partition reads instead of failing the run.
         conn.execute("SET http_retries=10; SET http_retry_wait_ms=2000; SET http_retry_backoff=1.5;")
-    info(f"history: remote ingest {start_iso} -> {end_iso} ({len(months)} month partitions) from {history_url}")
-    if changesets_src is not None:
-        # Names for everyone in the window; every changeset_stats uid has a changeset row here.
-        conn.execute(
-            f"""INSERT INTO users
-                SELECT uid, any_value(username) FROM {changesets_src}
-                WHERE {in_window} AND username IS NOT NULL
-                GROUP BY uid
-                ON CONFLICT (uid) DO NOTHING"""
-        )
     changeset_preds = [in_window]
     if filters.hashtags:
         changeset_preds.append(_hashtag_predicate(filters.hashtags, filters.exact_lookup))
@@ -199,51 +178,55 @@ def ingest_remote(
         changeset_preds.append(f"uid IN (SELECT uid FROM users WHERE username IN ({names}))")
     changeset_where = " AND ".join(changeset_preds)
-    # Always populate changesets: every changeset_stats row needs a parent row (the live path keeps
-    # this invariant via stubs, and Postgres enforces it as a foreign key). A metadata filter narrows
-    # which changesets (and thus which stats) are kept; a plain run keeps all in the window.
-    if changesets_src is not None:
-        conn.execute(
-            f"""INSERT INTO changesets
-                SELECT changeset_id, uid, created_at, hashtags, editor,
-                       CASE WHEN min_lon IS NOT NULL
-                            THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat) END
-                FROM {changesets_src} WHERE {changeset_where}
-                ON CONFLICT (changeset_id) DO NOTHING"""
-        )
     stats_preds = [in_window]
     if filters.has_metadata_filter:
-        # Keep element stats only for changesets that passed the metadata filter above.
         stats_preds.append("changeset_id IN (SELECT changeset_id FROM changesets)")
     stats_where = " AND ".join(stats_preds)
-    if changefiles_src is not None:
-        conn.execute(
-            f"""INSERT INTO changeset_stats
-                SELECT changeset_id, {HISTORY_SEQ_ID} AS seq_id, uid,
-                       nodes_created, nodes_modified, nodes_deleted,
-                       ways_created, ways_modified, ways_deleted,
-                       rels_created, rels_modified, rels_deleted,
-                       poi_created, poi_modified, tag_stats
-                FROM {changefiles_src} WHERE {stats_where}
-                ON CONFLICT (seq_id, changeset_id) DO NOTHING"""
-        )
+    info(f"history: remote ingest {start_iso} -> {end_iso} ({len(months)} month partitions) from {history_url}")
+    with progress_bar(len(months), unit="months", description="Reading history") as advance:
+        for month in months:
+            changesets_src = _partition_list(history_url, "changesets", [month])
+            changefiles_src = _partition_list(history_url, "changefiles", [month])
+            if changesets_src is not None:
+                conn.execute(
+                    f"""INSERT INTO users
+                        SELECT uid, any_value(username) FROM {changesets_src}
+                        WHERE {in_window} AND username IS NOT NULL
+                        GROUP BY uid ON CONFLICT (uid) DO NOTHING"""
+                )
+                conn.execute(
+                    f"""INSERT INTO changesets
+                        SELECT changeset_id, uid, created_at, hashtags, editor,
+                               CASE WHEN min_lon IS NOT NULL
+                                    THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat) END
+                        FROM {changesets_src} WHERE {changeset_where}
+                        ON CONFLICT (changeset_id) DO NOTHING"""
+                )
+            if changefiles_src is not None:
+                conn.execute(
+                    f"""INSERT INTO changeset_stats
+                        SELECT changeset_id, {HISTORY_SEQ_ID} AS seq_id, uid,
+                               nodes_created, nodes_modified, nodes_deleted,
+                               ways_created, ways_modified, ways_deleted,
+                               rels_created, rels_modified, rels_deleted,
+                               poi_created, poi_modified, tag_stats
+                        FROM {changefiles_src} WHERE {stats_where}
+                        ON CONFLICT (seq_id, changeset_id) DO NOTHING"""
+                )
+            advance()
     row = conn.execute(f"SELECT count(*) FROM changeset_stats WHERE seq_id = {HISTORY_SEQ_ID}").fetchone()
     return row[0] if row else 0
-# Resume one day before the frontier, not at it. A changeset can stay open for up to 24h, so its
-# edits can straddle the frontier, and converting a date to a replication sequence is not exact. The
-# re-scanned day overlaps the history layer, which the seq_id=0 dedup removes, so this never misses an
-# edit and never double counts.
 RESUME_SAFETY = dt.timedelta(days=1)
 def seed_resume_at(conn: duckdb.DuckDBPyConnection, resume_at: dt.datetime, replication_url: str) -> dt.datetime | None:
-    """Seed the `state` table so `osmsg --update` resumes at `resume_at` on `replication_url`. Derives
-    the replication sequence from the timestamp, so the caller never picks a seq by hand. Returns the
-    resume timestamp, or None if no sequence resolves at that time."""
+    """Seed `state` so `osmsg --update` resumes at `resume_at` on `replication_url`. Returns resume_at,
+    or None if no sequence resolves."""
     from osmium.replication.server import ReplicationServer
     from .db.schema import upsert_state

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/convert.py RENAMED Viewed

@@ -1,7 +1,5 @@
 """Convert a planet .osh history plus a changeset dump into the changefiles/changesets parquet
-datasets, out of core via osmsg's own DuckDB tables. Streams raw per-edit rows to parquet in bounded
-batches, then aggregates and joins in DuckDB (a changeset's edits are scattered across the .osh, so an
-in-memory pass OOMs at planet scale)."""
+datasets, out of core via osmsg's own DuckDB tables."""
 import concurrent.futures as cf
 import datetime as dt
@@ -20,11 +18,8 @@ from .pbf_split import split_pbf
 BATCH = 1_000_000
 CREATE, MODIFY, DELETE = 0, 1, 2
-# Out-of-core settings for planet-scale aggregation. Leave headroom below physical RAM; spill to disk.
 DUCKDB_MEMORY_LIMIT = "40GB"
 DUCKDB_THREADS = 24
-# A global GROUP BY over all string-keyed tag rows OOMs even with spill, and json_group_object does
-# not spill. Shard raw tags to disk by changeset_id % K, then aggregate each shard independently.
 TAG_SHARDS = 64
 ELEM_SCHEMA = pa.schema(
@@ -162,9 +157,7 @@ def stream_changesets(dump: str, start: dt.datetime, end: dt.datetime, work: pat
 def build_tables(con: duckdb.DuckDBPyConnection, work: pathlib.Path) -> None:
-    """Populate osmsg's tables (users, changesets, changeset_stats) from the streamed raw rows. Globs
-    raw_elements_*/raw_tags_* so single-process and split-parallel runs both work: one global GROUP BY
-    recombines each changeset's edits across parts."""
+    """Populate osmsg's tables (users, changesets, changeset_stats) from the streamed raw rows."""
     con.execute("INSTALL json; LOAD json;")
     work = pathlib.Path(work)
     cs = (work / "raw_changesets.parquet").as_posix()
@@ -209,8 +202,6 @@ def build_tables(con: duckdb.DuckDBPyConnection, work: pathlib.Path) -> None:
               a.rels_created, a.rels_modified, a.rels_deleted,
               a.poi_created, a.poi_modified"""
     for b in range(TAG_SHARDS):
-        # Insert this shard's agg changesets; attach tag_stats only if the shard has tags (tiny inputs
-        # and edit-only changesets carry none).
         shard_dir = shards / f"shard={b}"
         if shard_dir.is_dir():
             shard_glob = (shard_dir / "*.parquet").as_posix()
@@ -244,11 +235,8 @@ def build_tables(con: duckdb.DuckDBPyConnection, work: pathlib.Path) -> None:
 def export_parquet(con: duckdb.DuckDBPyConnection, out: pathlib.Path) -> None:
-    """Materialise the two datasets as persisted tables (a view would re-run the planet-scale joins per
-    partition; a TEMP table would hold 180M JSON rows in RAM), then write Morton-sorted partitions."""
+    """Materialise the two datasets as persisted tables, then write Morton-sorted partitions."""
     con.execute(MORTON_MACROS)
-    # changefiles created_at falls back to the element edit time when the changeset predates the window,
-    # so in-window edits are never dropped.
     con.execute(
         f"""CREATE TABLE changefiles_all AS
             SELECT s.* EXCLUDE (seq_id),
@@ -292,9 +280,8 @@ def aggregate(work: pathlib.Path, out: pathlib.Path) -> pathlib.Path:
 def convert(
     osh: str, changesets: str, start: dt.datetime, end: dt.datetime, work_dir: pathlib.Path, parts: int = 1
 ) -> pathlib.Path:
-    """Convert one .osh history + changeset dump to the two parquet datasets under `work_dir/out`.
-    With parts>1 the history is split at blob boundaries and streamed concurrently. Returns the out
-    directory holding changefiles/, changesets/, and stats.duckdb."""
+    """Convert one .osh history + changeset dump to the two parquet datasets under `work_dir/out`,
+    returned as a path. With parts>1 the history is split and streamed concurrently."""
     work = pathlib.Path(work_dir)
     raw = work / "raw"
     raw.mkdir(parents=True, exist_ok=True)

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/month.py RENAMED Viewed

@@ -12,9 +12,6 @@ from ..exceptions import OsmsgError
 from .parquet import GEOM_COLS, MORTON_MACROS, write_partitions
 UTC = dt.UTC
-# Planet-wide edits are continuous, so a complete month reaches within minutes of its end. A larger
-# shortfall means the source day diffs did not cover the whole month (a mid-day snapshot or lagging
-# replication), so the partition would be published short, the exact gap the read-side backstep masks.
 COMPLETENESS_TOLERANCE = dt.timedelta(hours=1)

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/parquet.py RENAMED Viewed

@@ -6,8 +6,6 @@ import duckdb
 ROW_GROUP_SIZE = 100_000
-# Morton(centroid) as native SQL macros (vectorized): scale lon/lat to 16-bit and interleave the bits
-# so 2D locality maps to a contiguous 1D key. A Python UDF in ORDER BY is ~10x slower at planet scale.
 MORTON_MACROS = """
 CREATE OR REPLACE MACRO _s1(v) AS ((v | (v << 8)) & 16711935);
 CREATE OR REPLACE MACRO _s2(v) AS ((_s1(v) | (_s1(v) << 4)) & 252645135);
@@ -19,7 +17,6 @@ CREATE OR REPLACE MACRO morton2(lon, lat) AS (
 );
 """
-# lon/lat centroid plus bbox min/max derived from changesets.geom (osmsg stores the bbox envelope).
 GEOM_COLS = (
     "ST_X(ST_Centroid(c.geom)) AS lon, ST_Y(ST_Centroid(c.geom)) AS lat, "
     "ST_XMin(c.geom) AS min_lon, ST_YMin(c.geom) AS min_lat, "
@@ -30,9 +27,8 @@ GEOM_COLS = (
 def write_partitions(
     con: duckdb.DuckDBPyConnection, view: str, base: pathlib.Path, order_by: str = "morton2(lon, lat)"
 ) -> None:
-    """Write one parquet file per year/month partition, each sorted by `order_by`. DuckDB's
-    PARTITION_BY drops the global sort, so each partition is sorted on its own for tight row-group
-    min/max. `view` must expose integer `y`, `m` partition columns."""
+    """Write one parquet file per year/month partition, each sorted by `order_by`. `view` must expose
+    integer `y`, `m` partition columns."""
     base.mkdir(parents=True, exist_ok=True)
     for year, month in con.execute(f"SELECT DISTINCT y, m FROM {view} ORDER BY y, m").fetchall():
         out = base / f"year={year}" / f"month={month}"

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/pbf_split.py RENAMED Viewed

@@ -19,7 +19,6 @@ def read_blob(handle) -> tuple[bytes, bytes, str] | None:
 def _parse_blobheader(buf: bytes) -> tuple[str, int]:
-    # BlobHeader: field 1 = type (length-delimited string), field 3 = datasize (varint).
     blob_type, datasize, i = "", 0, 0
     while i < len(buf):
         key = buf[i]

{osmsg-1.2.0 → osmsg-1.2.2}/osmsg/pipeline.py RENAMED Viewed

@@ -5,6 +5,8 @@ from __future__ import annotations
 import concurrent.futures
 import copy
 import datetime as dt
+import hashlib
+import json
 import os
 import shutil
 from dataclasses import dataclass, field
@@ -95,6 +97,7 @@ class RunConfig:
     insert: bool = False
     osh_file: str | None = None
     changeset_file: str | None = None
+    overwrite: bool = False
 def _resolve_country_urls(countries: list[str]) -> list[str]:
@@ -232,9 +235,6 @@ def _seed_history_resume(conn, cfg: RunConfig) -> None:
             seed_resume_state(conn, cfg.history_url, url)
-# minute < hour < day. Day diffs land at 00:00 UTC, which is also an hour and minute boundary, so
-# resuming a finer source at a coarser source's last_ts is disjoint (no edit double-counted or
-# skipped). The reverse can skip the partial current period, so --update only ever auto-refines.
 _GRANULARITY_RANK = {SHORTCUTS["minute"]: 0, SHORTCUTS["hour"]: 1, SHORTCUTS["day"]: 2}
@@ -244,9 +244,7 @@ def _tracked_sources(conn) -> list[str]:
 def _switch_source(conn, from_url: str, to_url: str) -> None:
-    """Hand tracking from from_url to to_url at from_url's last_ts (a clean seq boundary) and retire
-    from_url, so the two sequence spaces never overlap (double count) or gap. Each granularity is a
-    separate sequence, so the disjoint-coverage invariant is what keeps stats correct across a switch."""
+    """Resume to_url at from_url's last_ts and retire from_url, so the granularities never overlap or gap."""
     state = get_state(conn, from_url)
     if state is None:
         return
@@ -258,12 +256,11 @@ def _switch_source(conn, from_url: str, to_url: str) -> None:
 def _select_update_source(conn, cfg: RunConfig, now: dt.datetime) -> None:
-    """Pick the source `--update` continues. Without `--url`, continue the tracked source and auto-refine
-    to a finer granularity as the backlog shrinks; with `--url`, switch to it. Switches are clean
-    handoffs and a store tracks one planet source at a time, so granularities never overlap."""
+    """Pick the source `--update` continues: without `--url`, continue the tracked source and auto-refine
+    to a finer granularity as the backlog shrinks; with `--url`, switch to it via a clean handoff."""
     tracked = _tracked_sources(conn)
     if not tracked:
-        return  # fresh store: _resolve_url_starts bootstraps cfg.urls as given
+        return
     if len(tracked) > 1:
         if not cfg.url_explicit:
             cfg.urls = tracked
@@ -286,9 +283,8 @@ def _select_update_source(conn, cfg: RunConfig, now: dt.datetime) -> None:
 def _history_live_start(split: WindowSplit, frontier: dt.datetime) -> dt.datetime:
-    """Where the live tail begins after a remote ingest. When the query reached the published frontier,
-    back up by the safety window: the dataset's final month can stop short of its nominal boundary, so
-    re-scanning lets the seq_id=0 dedup recover the shortfall (the overlap it did cover is dropped)."""
+    """Where the live tail begins after a remote ingest: back up by the safety window when the query
+    reached the frontier (the final month may be short), else the split boundary."""
     if split.remote_end == frontier:
         return frontier - RESUME_SAFETY
     return split.live_start
@@ -351,8 +347,6 @@ def _run_insert(cfg: RunConfig, conn: duckdb.DuckDBPyConnection, db_path: Path)
     if cfg.url_explicit:
         seed_urls = cfg.urls
     else:
-        # The catch-up gap (now - frontier) is usually weeks; seed the granularity --update can clear
-        # quickly instead of crawling minute diffs. --update continues this same source.
         seed_urls = [resolve_url(_pick_replication_for_span(dt.datetime.now(UTC) - split.remote_end))]
     for url in seed_urls:
         seed_resume_at(conn, resume_at, url)
@@ -367,6 +361,136 @@ def _run_insert(cfg: RunConfig, conn: duckdb.DuckDBPyConnection, db_path: Path)
     return {"rows": n, "files": written, "rows_data": [], "summary": None, "start_seq": None, "end_seq": None}
+def _query_fingerprint(cfg: RunConfig) -> str:
+    """Stable hash of the query's data-affecting params, excluding output formats."""
+    key = {
+        "start": cfg.start_date.isoformat() if cfg.start_date else None,
+        "end": cfg.end_date.isoformat() if cfg.end_date else None,
+        "urls": sorted(cfg.urls),
+        "countries": sorted(cfg.countries) if cfg.countries else None,
+        "boundary": cfg.boundary,
+        "hashtags": sorted(cfg.hashtags) if cfg.hashtags else None,
+        "exact_lookup": cfg.exact_lookup,
+        "users": sorted(cfg.users_filter) if cfg.users_filter else None,
+        "tag_mode": cfg.tag_mode,
+        "additional_tags": sorted(cfg.additional_tags) if cfg.additional_tags else None,
+        "length_tags": sorted(cfg.length_tags) if cfg.length_tags else None,
+        "changeset": cfg.changeset,
+        "summary": cfg.summary,
+        "tm_stats": cfg.tm_stats,
+        "history_mode": cfg.history_mode,
+    }
+    return hashlib.sha256(json.dumps(key, sort_keys=True).encode()).hexdigest()
+def _read_fingerprint(conn: duckdb.DuckDBPyConnection) -> str | None:
+    """The query fingerprint stamped on an existing store, or None if absent."""
+    present = conn.execute("SELECT 1 FROM information_schema.tables WHERE table_name = 'osmsg_run_meta'").fetchone()
+    if not present:
+        return None
+    row = conn.execute("SELECT fingerprint FROM osmsg_run_meta LIMIT 1").fetchone()
+    return row[0] if row else None
+def _store_fingerprint(conn: duckdb.DuckDBPyConnection, fingerprint: str) -> None:
+    conn.execute("CREATE TABLE IF NOT EXISTS osmsg_run_meta (fingerprint VARCHAR)")
+    conn.execute("DELETE FROM osmsg_run_meta")
+    conn.execute("INSERT INTO osmsg_run_meta VALUES (?)", [fingerprint])
+def _finalize(
+    cfg: RunConfig,
+    conn: duckdb.DuckDBPyConnection,
+    fingerprint: str,
+    *,
+    start_date_utc: dt.datetime,
+    end_date_utc: dt.datetime,
+    start_seq: int | None,
+    end_seq: int | None,
+) -> dict[str, Any]:
+    """Aggregate the populated tables into user stats and write the requested formats."""
+    rows = user_stats(conn, top_n=None)
+    if not rows:
+        dbmod.close(conn)
+        # Raised so the CLI can map "no new data" to exit 0.
+        raise NoDataFoundError("No stats produced for the requested time range.")
+    _store_fingerprint(conn, fingerprint)
+    if cfg.changeset or cfg.hashtags:
+        attach_metadata(conn, rows)
+    if cfg.additional_tags or cfg.tag_mode != "none" or cfg.length_tags:
+        attach_tag_stats(
+            conn,
+            rows,
+            additional_tags=cfg.additional_tags,
+            tag_mode=cfg.tag_mode,
+            length_tags=cfg.length_tags,
+        )
+    if cfg.tm_stats:
+        rows = tm.enrich(rows)
+    out = cfg.output_dir
+    written: dict[str, str] = {}
+    if "parquet" in cfg.formats:
+        written["parquet"] = str(to_parquet(rows, out / f"{cfg.name}.parquet"))
+    if "csv" in cfg.formats:
+        written["csv"] = str(to_csv(rows, out / f"{cfg.name}.csv"))
+    if "json" in cfg.formats:
+        written["json"] = str(to_json(rows, out / f"{cfg.name}.json"))
+    if "markdown" in cfg.formats:
+        md_path = out / f"{cfg.name}.md"
+        table_markdown(rows, output_path=md_path)
+        written["markdown"] = str(md_path)
+    summary_rows: list[dict[str, Any]] | None = None
+    if cfg.summary:
+        summary_rows = daily_summary(
+            conn,
+            additional_tags=cfg.additional_tags,
+            tag_mode=cfg.tag_mode,
+            length_tags=cfg.length_tags,
+        )
+    if summary_rows:
+        if "parquet" in cfg.formats:
+            written["summary_parquet"] = str(to_parquet(summary_rows, out / f"{cfg.name}_summary.parquet"))
+        if "csv" in cfg.formats:
+            written["summary_csv"] = str(to_csv(summary_rows, out / f"{cfg.name}_summary.csv"))
+        if "json" in cfg.formats:
+            written["summary_json"] = str(to_json(summary_rows, out / f"{cfg.name}_summary.json"))
+        if "markdown" in cfg.formats:
+            summary_md_path = out / f"{cfg.name}_summary.md"
+            summary_markdown(
+                rows,
+                output_path=summary_md_path,
+                start_date=start_date_utc,
+                end_date=end_date_utc,
+                additional_tags=cfg.additional_tags,
+                length_tags=cfg.length_tags,
+                tag_mode=cfg.tag_mode,
+                fname=cfg.name,
+                tm_stats=cfg.tm_stats,
+            )
+            written["summary_md"] = str(summary_md_path)
+        # psql: skipped on purpose, daily_summary is a query over the four base tables.
+    if "psql" in cfg.formats:
+        if not cfg.psql_dsn:
+            raise OsmsgError("'psql' format requires a libpq DSN (--psql-dsn / RunConfig.psql_dsn=...).")
+        info(f"Pushing to PostgreSQL: {cfg.psql_dsn.split()[0]}…")
+        to_psql(conn, cfg.psql_dsn, bulk_load=cfg.psql_bulk)
+        written["psql"] = cfg.psql_dsn
+    dbmod.close(conn)
+    return {
+        "rows": len(rows),
+        "files": written,
+        "rows_data": rows,
+        "summary": summary_rows,
+        "start_seq": start_seq,
+        "end_seq": end_seq,
+    }
 def _ensure_credentials(cfg: RunConfig) -> str | None:
     """Resolve OSM credentials and exchange them for a Geofabrik OAuth 2.0 cookie.
@@ -482,15 +606,33 @@ def run(cfg: RunConfig) -> dict[str, Any]:
     cookie = _ensure_credentials(cfg)
     db_path = cfg.output_dir / f"{cfg.name}.duckdb"
+    if cfg.end_date is None:
+        cfg.end_date = dt.datetime.now(UTC)
+    fingerprint = _query_fingerprint(cfg)
+    if not cfg.update and not cfg.insert and not cfg.overwrite and db_path.exists():
+        existing = dbmod.connect(str(db_path))
+        if _read_fingerprint(existing) == fingerprint:
+            info(f"Reusing {db_path} (same query); re-exporting. Pass --overwrite to recompute.")
+            start_utc = (cfg.start_date or cfg.end_date).astimezone(UTC)
+            return _finalize(
+                cfg,
+                existing,
+                fingerprint,
+                start_date_utc=start_utc,
+                end_date_utc=cfg.end_date.astimezone(UTC),
+                start_seq=None,
+                end_seq=None,
+            )
+        dbmod.close(existing)
     if not cfg.update and db_path.exists():
         db_path.unlink()
     conn = dbmod.connect(str(db_path))
     dbmod.create_tables(conn)
     info(f"DuckDB: {db_path}")
-    if cfg.end_date is None:
-        cfg.end_date = dt.datetime.now(UTC)
     if cfg.insert:
         return _run_insert(cfg, conn, db_path)
@@ -533,8 +675,6 @@ def run(cfg: RunConfig) -> dict[str, Any]:
     if (cfg.tm_stats or cfg.summary or cfg.tag_mode == "all") and not cfg.changeset and not cfg.hashtags:
         cfg.changeset = True
-    # Hybrid-auto history: serve the covered months from the published parquet, leaving only the
-    # uncovered recent tail to the live diff path. Falls back to full live on any problem.
     run_live = True
     if cfg.history_mode == "auto" and not cfg.update:
         if cfg.length_tags:
@@ -562,7 +702,6 @@ def run(cfg: RunConfig) -> dict[str, Any]:
                             _auto_switch_replication(cfg, cfg.end_date - cfg.start_date)
                     except duckdb.Error as exc:
                         warn(f"history: remote ingest failed ({type(exc).__name__}: {exc}); using live path.")
-                        # Discard any partial remote rows so the live path is the sole source.
                         for tbl in ("changeset_stats", "changesets", "users"):
                             conn.execute(f"DELETE FROM {tbl}")
                         run_live = True
@@ -700,9 +839,6 @@ def run(cfg: RunConfig) -> dict[str, Any]:
             if sub.exists():
                 shutil.rmtree(sub, ignore_errors=True)
-    # History rows (seq_id=0) hold a changeset's COMPLETE lifetime counts. If the live tail re-saw
-    # some of those edits for a changeset that straddles the frontier, drop the live duplicates: the
-    # history row already counts them, so this is no-loss and prevents double counting.
     history_row = conn.execute("SELECT count(*) FROM changeset_stats WHERE seq_id = 0").fetchone()
     has_history = bool(history_row and history_row[0] > 0)
     if run_live and has_history:
@@ -720,90 +856,15 @@ def run(cfg: RunConfig) -> dict[str, Any]:
     else:
         start_date_utc = cfg.start_date.astimezone(UTC)
-    rows = user_stats(conn, top_n=None)
-    if not rows:
-        dbmod.close(conn)
-        # Raised so the CLI can map "no new data" to exit 0.
-        raise NoDataFoundError("No stats produced for the requested time range.")
-    if cfg.changeset or cfg.hashtags:
-        attach_metadata(conn, rows)
-    if cfg.additional_tags or cfg.tag_mode != "none" or cfg.length_tags:
-        attach_tag_stats(
-            conn,
-            rows,
-            additional_tags=cfg.additional_tags,
-            tag_mode=cfg.tag_mode,
-            length_tags=cfg.length_tags,
-        )
-    if cfg.tm_stats:
-        rows = tm.enrich(rows)
-    out = cfg.output_dir
-    written: dict[str, str] = {}
-    if "parquet" in cfg.formats:
-        written["parquet"] = str(to_parquet(rows, out / f"{cfg.name}.parquet"))
-    if "csv" in cfg.formats:
-        written["csv"] = str(to_csv(rows, out / f"{cfg.name}.csv"))
-    if "json" in cfg.formats:
-        written["json"] = str(to_json(rows, out / f"{cfg.name}.json"))
-    if "markdown" in cfg.formats:
-        md_path = out / f"{cfg.name}.md"
-        table_markdown(
-            rows,
-            output_path=md_path,
-        )
-        written["markdown"] = str(md_path)
-    summary_rows: list[dict[str, Any]] | None = None
-    if cfg.summary:
-        summary_rows = daily_summary(
-            conn,
-            additional_tags=cfg.additional_tags,
-            tag_mode=cfg.tag_mode,
-            length_tags=cfg.length_tags,
-        )
-    if summary_rows:
-        if "parquet" in cfg.formats:
-            written["summary_parquet"] = str(to_parquet(summary_rows, out / f"{cfg.name}_summary.parquet"))
-        if "csv" in cfg.formats:
-            written["summary_csv"] = str(to_csv(summary_rows, out / f"{cfg.name}_summary.csv"))
-        if "json" in cfg.formats:
-            written["summary_json"] = str(to_json(summary_rows, out / f"{cfg.name}_summary.json"))
-        if "markdown" in cfg.formats:
-            summary_md_path = out / f"{cfg.name}_summary.md"
-            summary_markdown(
-                rows,
-                output_path=summary_md_path,
-                start_date=start_date_utc,
-                end_date=end_date_utc,
-                additional_tags=cfg.additional_tags,
-                length_tags=cfg.length_tags,
-                tag_mode=cfg.tag_mode,
-                fname=cfg.name,
-                tm_stats=cfg.tm_stats,
-            )
-            written["summary_md"] = str(summary_md_path)
-        # psql: skipped on purpose, daily_summary is a query over the four base tables.
-    if "psql" in cfg.formats:
-        if not cfg.psql_dsn:
-            raise OsmsgError("'psql' format requires a libpq DSN (--psql-dsn / RunConfig.psql_dsn=...).")
-        info(f"Pushing to PostgreSQL: {cfg.psql_dsn.split()[0]}…")
-        to_psql(conn, cfg.psql_dsn, bulk_load=cfg.psql_bulk)
-        written["psql"] = cfg.psql_dsn
-    dbmod.close(conn)
-    return {
-        "rows": len(rows),
-        "files": written,
-        "rows_data": rows,
-        "summary": summary_rows,
-        "start_seq": start_seq,
-        "end_seq": end_seq,
-    }
+    return _finalize(
+        cfg,
+        conn,
+        fingerprint,
+        start_date_utc=start_date_utc,
+        end_date_utc=end_date_utc,
+        start_seq=start_seq,
+        end_seq=end_seq,
+    )
 __all__ = ["RunConfig", "run"]

{osmsg-1.2.0 → osmsg-1.2.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "osmsg"
-version = "1.2.0"
+version = "1.2.2"
 description = "OpenStreetMap Stats Generator: Commandline"
 readme = "README.md"
 authors = [
@@ -41,6 +41,9 @@ repository = "https://github.com/osgeonepal/osmsg"
 [project.scripts]
 osmsg = "osmsg.cli:app"
+[project.gui-scripts]
+osmsg-gui = "osmsg.gui:launch"
 [build-system]
 requires = ["uv_build>=0.5.15,<0.9"]
 build-backend = "uv_build"