osmsg 1.2.0__tar.gz → 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of osmsg might be problematic. Click here for more details.

Files changed (44) hide show
  1. {osmsg-1.2.0 → osmsg-1.2.2}/PKG-INFO +8 -3
  2. {osmsg-1.2.0 → osmsg-1.2.2}/README.md +7 -2
  3. osmsg-1.2.2/osmsg/__version__.py +1 -0
  4. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/cli.py +9 -0
  5. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/export/psql.py +0 -13
  6. osmsg-1.2.2/osmsg/gui.py +195 -0
  7. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/history.py +46 -63
  8. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/convert.py +5 -18
  9. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/month.py +0 -3
  10. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/parquet.py +2 -6
  11. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/pbf_split.py +0 -1
  12. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/pipeline.py +169 -108
  13. {osmsg-1.2.0 → osmsg-1.2.2}/pyproject.toml +4 -1
  14. osmsg-1.2.0/osmsg/__version__.py +0 -1
  15. {osmsg-1.2.0 → osmsg-1.2.2}/LICENSE +0 -0
  16. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/__init__.py +0 -0
  17. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/_http.py +0 -0
  18. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/_tick.py +0 -0
  19. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/auth.py +0 -0
  20. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/boundary.py +0 -0
  21. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/db/__init__.py +0 -0
  22. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/db/duckdb_schema.py +0 -0
  23. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/db/ingest.py +0 -0
  24. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/db/queries.py +0 -0
  25. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/db/schema.py +0 -0
  26. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/exceptions.py +0 -0
  27. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/export/__init__.py +0 -0
  28. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/export/csv.py +0 -0
  29. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/export/json.py +0 -0
  30. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/export/markdown.py +0 -0
  31. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/export/parquet.py +0 -0
  32. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/fetch.py +0 -0
  33. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/geofabrik.py +0 -0
  34. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/handlers.py +0 -0
  35. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/__init__.py +0 -0
  36. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/cli.py +0 -0
  37. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/maintain/manifest.py +0 -0
  38. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/models.py +0 -0
  39. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/pg_schema.py +0 -0
  40. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/py.typed +0 -0
  41. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/replication.py +0 -0
  42. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/tm.py +0 -0
  43. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/ui.py +0 -0
  44. {osmsg-1.2.0 → osmsg-1.2.2}/osmsg/workers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osmsg
3
- Version: 1.2.0
3
+ Version: 1.2.2
4
4
  Summary: OpenStreetMap Stats Generator: Commandline
5
5
  Keywords: osm,stats,commandline,openstreetmap
6
6
  Author: Kshitij Raj Sharma
@@ -78,7 +78,8 @@ brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
78
78
  ```
79
79
 
80
80
  On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
81
- and run it directly, no Python required.
81
+ and double-click it to open the desktop app. Fill in the dates and options, click Run, and open the
82
+ output folder. The CLI below is for macOS, Linux, and pip/uv users.
82
83
 
83
84
  ## Quick start
84
85
 
@@ -213,6 +214,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
213
214
  Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
214
215
  `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
215
216
 
217
+ Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
218
+ refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
219
+
216
220
  ## Configuration
217
221
 
218
222
  Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
@@ -228,12 +232,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
228
232
  | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
229
233
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
230
234
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
235
+ | `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
231
236
  | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
232
237
  | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
233
238
  | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
234
239
  | `--history-url` | `OSMSG_HISTORY_URL` | `osmsg-history` | Published dataset location. |
235
240
  | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
236
- | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
241
+ | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files. |
237
242
  | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
238
243
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
239
244
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -46,7 +46,8 @@ brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
46
46
  ```
47
47
 
48
48
  On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
49
- and run it directly, no Python required.
49
+ and double-click it to open the desktop app. Fill in the dates and options, click Run, and open the
50
+ output folder. The CLI below is for macOS, Linux, and pip/uv users.
50
51
 
51
52
  ## Quick start
52
53
 
@@ -181,6 +182,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
181
182
  Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
182
183
  `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
183
184
 
185
+ Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
186
+ refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
187
+
184
188
  ## Configuration
185
189
 
186
190
  Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
@@ -196,12 +200,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
196
200
  | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
197
201
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
198
202
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
203
+ | `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
199
204
  | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
200
205
  | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
201
206
  | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
202
207
  | `--history-url` | `OSMSG_HISTORY_URL` | `osmsg-history` | Published dataset location. |
203
208
  | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
204
- | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
209
+ | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files. |
205
210
  | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
206
211
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
207
212
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -0,0 +1 @@
1
+ __version__ = "1.2.2"
@@ -271,6 +271,14 @@ def main(
271
271
  str | None,
272
272
  typer.Option("--changeset-file", help="Changeset dump (.osm.bz2) paired with --osh-file."),
273
273
  ] = None,
274
+ overwrite: Annotated[
275
+ bool,
276
+ typer.Option(
277
+ "--overwrite",
278
+ help="Recompute even if <name>.duckdb already holds this exact query; otherwise a rerun "
279
+ "that only changes the output format re-exports from the existing store.",
280
+ ),
281
+ ] = False,
274
282
  ) -> None:
275
283
  """Run osmsg. With no subcommand this generates stats (or loads history with --insert)."""
276
284
  if ctx.invoked_subcommand is not None:
@@ -338,6 +346,7 @@ def main(
338
346
  insert=insert,
339
347
  osh_file=osh_file,
340
348
  changeset_file=changeset_file,
349
+ overwrite=overwrite,
341
350
  )
342
351
 
343
352
  if last is not None:
@@ -5,10 +5,6 @@ import duckdb
5
5
  from ..exceptions import OsmsgError
6
6
  from ..pg_schema import PG_SCHEMA
7
7
 
8
- # Secondary indexes and foreign keys that make a row-by-row insert slow. For a one-time bulk load
9
- # they are dropped before the COPY and rebuilt once after (one index build + one FK validation,
10
- # instead of maintaining them per row). Primary keys stay, because the ON CONFLICT upserts need them.
11
- # Indexes are (name, create-sql); foreign keys are (table, name, add-clause).
12
8
  _BULK_INDEXES = [
13
9
  ("idx_changesets_created_at", "CREATE INDEX idx_changesets_created_at ON changesets (created_at)"),
14
10
  ("idx_changesets_geom", "CREATE INDEX idx_changesets_geom ON changesets USING GIST (geom)"),
@@ -25,8 +21,6 @@ _BULK_FKS = [
25
21
  ]
26
22
 
27
23
 
28
- # Bulk loads push the big tables in this many changeset_id ranges, each its own statement and so its
29
- # own commit, so a failure costs one range instead of rolling back the whole multi-GB load.
30
24
  _BULK_COMMIT_CHUNKS = 32
31
25
 
32
26
 
@@ -102,9 +96,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
102
96
  )
103
97
 
104
98
  if bulk_load:
105
- # Stream rows instead of buffering them to preserve order; buffering 180M+ JSON-bearing
106
- # rows is what exhausts memory in a single INSERT. Then drop the secondary indexes and
107
- # foreign keys so the load does not maintain them per row.
108
99
  conn.execute("SET preserve_insertion_order = false")
109
100
  for table, name, _add in _BULK_FKS:
110
101
  _pg(conn, f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {name}")
@@ -114,8 +105,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
114
105
  _push_chunked(conn, "changesets", _push_changesets)
115
106
  _push_chunked(conn, "changeset_stats", _push_changeset_stats)
116
107
  elif _pg_has_history(conn):
117
- # The history layer (seq_id=0) is already in PG from the bulk load and never changes, so an
118
- # incremental --update pushes only the live layer and its parents, not the 180M history rows.
119
108
  live_ids = "changeset_id IN (SELECT changeset_id FROM changeset_stats WHERE seq_id <> 0)"
120
109
  conn.execute(
121
110
  "INSERT INTO pg_target.users SELECT * FROM users "
@@ -124,7 +113,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
124
113
  _push_changesets(conn, f"WHERE {live_ids}")
125
114
  _push_changeset_stats(conn, "WHERE seq_id <> 0")
126
115
  else:
127
- # No history in PG (a plain live target): push everything (live rows are all seq_id<>0).
128
116
  conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
129
117
  _push_changesets(conn)
130
118
  _push_changeset_stats(conn)
@@ -141,7 +129,6 @@ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = Fals
141
129
  )
142
130
 
143
131
  if bulk_load:
144
- # Rebuild once, with more memory for the sort-based index builds, then refresh planner stats.
145
132
  for table, name, add in _BULK_FKS:
146
133
  _pg(conn, f"ALTER TABLE {table} ADD CONSTRAINT {name} {add}")
147
134
  for _name, create in _BULK_INDEXES:
@@ -0,0 +1,195 @@
1
+ """Minimal tkinter desktop UI for running osmsg and saving the output."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as dt
6
+ import os
7
+ import queue
8
+ import sys
9
+ import threading
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from .exceptions import NoDataFoundError, OsmsgError
14
+ from .pipeline import RunConfig, run
15
+
16
+ UTC = dt.UTC
17
+ FORMATS = ["parquet", "csv", "json", "markdown"]
18
+
19
+
20
+ def _parse_date(value: str) -> dt.datetime | None:
21
+ value = value.strip()
22
+ if not value:
23
+ return None
24
+ for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
25
+ try:
26
+ return dt.datetime.strptime(value, fmt).replace(tzinfo=UTC)
27
+ except ValueError:
28
+ continue
29
+ raise OsmsgError(f"Unrecognized date: {value!r}. Use YYYY-MM-DD.")
30
+
31
+
32
+ def _split(value: str | None) -> list[str] | None:
33
+ items: list[str] = [part.strip() for part in (value or "").split(",") if part.strip()]
34
+ return items if items else None
35
+
36
+
37
def build_config(form: dict[str, object], output_dir: str) -> RunConfig:
    """Map the form fields to a RunConfig, raising OsmsgError on invalid input.

    `form` holds the raw widget values: text under "name", "start", "end",
    "hashtags" and "tags", and truthy/falsy flags under "all_tags", "summary"
    and each name in FORMATS. A missing key and an explicit None are both
    treated as blank. An empty `output_dir` falls back to ".".

    Raises:
        OsmsgError: no format is selected, the start date is blank, or a date
            field cannot be parsed.
    """
    formats = [name for name in FORMATS if form.get(name)]
    if not formats:
        raise OsmsgError("Pick at least one output format.")
    # `or ""` instead of a .get() default: a key that is present but None would
    # otherwise be stringified to "None" and rejected as an unrecognized date.
    start = _parse_date(str(form.get("start") or ""))
    if start is None:
        raise OsmsgError("Start date is required (YYYY-MM-DD).")
    return RunConfig(
        # Strip first so a whitespace-only name also falls back to the default.
        name=str(form.get("name") or "").strip() or "stats",
        start_date=start,
        end_date=_parse_date(str(form.get("end") or "")),
        hashtags=_split(str(form.get("hashtags") or "")),
        additional_tags=_split(str(form.get("tags") or "")),
        tag_mode="all" if form.get("all_tags") else "none",
        summary=bool(form.get("summary")),
        formats=formats,
        output_dir=Path(output_dir or "."),
    )
56
+
57
+
58
def _open_folder(path: Path) -> None:
    """Show `path` in the platform's file manager, creating the folder if needed.

    Uses os.startfile on Windows, `open` on macOS and `xdg-open` elsewhere.
    The exit status of the external opener is not checked.
    """
    # The output folder may not exist yet (nothing has been run into it), and
    # os.startfile raises on a missing path, so make sure it is there first.
    path.mkdir(parents=True, exist_ok=True)
    if sys.platform == "win32":
        os.startfile(path)  # noqa: S606
        return

    import subprocess

    opener = "open" if sys.platform == "darwin" else "xdg-open"
    subprocess.run([opener, str(path)], check=False)
69
+
70
+
71
+ class _Redirector:
72
+ def __init__(self, sink: queue.Queue) -> None:
73
+ self.sink = sink
74
+
75
+ def write(self, text: str) -> None:
76
+ if text:
77
+ self.sink.put(("log", text))
78
+
79
+ def flush(self) -> None:
80
+ pass
81
+
82
+ def isatty(self) -> bool:
83
+ return False
84
+
85
+
86
class App:
    """Desktop window for osmsg: a form of run options, a Run button and a log pane.

    The run itself executes on a daemon thread. That thread never touches the
    widgets; it only puts ``(kind, payload)`` tuples on ``self.events``, which
    the Tk side polls every 120 ms.
    """

    def __init__(self) -> None:
        # tkinter is imported when the window is built, not when the module is imported.
        import tkinter as tk
        from tkinter import filedialog, scrolledtext, ttk

        self._tk = tk
        self._filedialog = filedialog
        self.events: queue.Queue = queue.Queue()
        self.out_dir = str(Path.home() / "osmsg")

        self.root = tk.Tk()
        self.root.title("osmsg")
        self.vars: dict[str, Any] = {}
        body = ttk.Frame(self.root, padding=12)
        body.grid(sticky="nsew")

        # One labelled text entry per grid row: (label text, form key, initial value).
        text_fields = (
            ("Name", "name", "stats"),
            ("Start (YYYY-MM-DD)", "start", ""),
            ("End (blank = now)", "end", ""),
            ("Hashtags (comma-sep)", "hashtags", ""),
            ("Tags (comma-sep)", "tags", ""),
        )
        for row_no, (caption, key, initial) in enumerate(text_fields):
            caption_widget = ttk.Label(body, text=caption)
            caption_widget.grid(row=row_no, column=0, sticky="w", pady=2)
            text_var = tk.StringVar(value=initial)
            entry = ttk.Entry(body, textvariable=text_var, width=40)
            entry.grid(row=row_no, column=1, columnspan=3, sticky="we", pady=2)
            self.vars[key] = text_var

        # Two on/off switches share the row under the text entries.
        self.vars["all_tags"] = tk.BooleanVar()
        self.vars["summary"] = tk.BooleanVar()
        all_tags_box = ttk.Checkbutton(body, text="All tags", variable=self.vars["all_tags"])
        all_tags_box.grid(row=5, column=0, sticky="w")
        summary_box = ttk.Checkbutton(body, text="Daily summary", variable=self.vars["summary"])
        summary_box.grid(row=5, column=1, sticky="w")

        # One checkbox per output format; parquet and csv start ticked.
        formats_box = ttk.LabelFrame(body, text="Formats", padding=6)
        formats_box.grid(row=6, column=0, columnspan=4, sticky="we", pady=6)
        for col_no, fmt_name in enumerate(FORMATS):
            ticked = tk.BooleanVar(value=fmt_name in ("parquet", "csv"))
            ttk.Checkbutton(formats_box, text=fmt_name, variable=ticked).grid(row=0, column=col_no, padx=4)
            self.vars[fmt_name] = ticked

        self.out_label = ttk.Label(body, text=f"Output: {self.out_dir}")
        self.out_label.grid(row=7, column=0, columnspan=3, sticky="w")
        chooser = ttk.Button(body, text="Choose folder", command=self._choose_folder)
        chooser.grid(row=7, column=3, sticky="e")

        self.run_btn = ttk.Button(body, text="Run", command=self._on_run)
        self.run_btn.grid(row=8, column=0, pady=8, sticky="w")
        self.open_btn = ttk.Button(body, text="Open output folder", command=self._open_output)
        self.open_btn.grid(row=8, column=1, pady=8, sticky="w")

        # The log pane stays read-only; _append unlocks it only while inserting.
        self.log = scrolledtext.ScrolledText(body, width=70, height=14, state="disabled")
        self.log.grid(row=9, column=0, columnspan=4, sticky="nsew")
        self.root.after(120, self._drain)

    def _open_output(self) -> None:
        """Open whichever output folder is selected at the moment of the click."""
        _open_folder(Path(self.out_dir))

    def _choose_folder(self) -> None:
        """Ask for a directory and, unless the dialog is cancelled, make it the output folder."""
        picked = self._filedialog.askdirectory(initialdir=self.out_dir)
        if not picked:
            return
        self.out_dir = picked
        self.out_label.configure(text=f"Output: {self.out_dir}")

    def _append(self, text: str) -> None:
        """Add `text` to the end of the log pane and scroll it into view."""
        pane = self.log
        pane.configure(state="normal")
        pane.insert("end", text)
        pane.see("end")
        pane.configure(state="disabled")

    def _on_run(self) -> None:
        """Validate the form; on success disable Run and start the worker thread."""
        snapshot = {}
        for key, var in self.vars.items():
            snapshot[key] = var.get()
        try:
            cfg = build_config(snapshot, self.out_dir)
        except OsmsgError as exc:
            # Invalid input is reported in the log pane; nothing is started.
            self._append(f"\n{exc}\n")
            return
        self.run_btn.configure(state="disabled")
        self._append(f"\nRunning into {self.out_dir} ...\n")
        job = threading.Thread(target=self._worker, args=(cfg,), daemon=True)
        job.start()

    def _worker(self, cfg: RunConfig) -> None:
        """Run the pipeline off the Tk thread, reporting through ``self.events``.

        stdout and stderr are swapped for a queue-backed stream for the duration
        of the run and restored afterwards. Exactly one ``("done", message)``
        event is queued, whether the run succeeds or fails.
        """
        saved = sys.stdout, sys.stderr
        capture = _Redirector(self.events)
        sys.stdout = capture  # type: ignore[assignment]
        sys.stderr = capture  # type: ignore[assignment]
        try:
            result = run(cfg)
            # NOTE(review): self.out_dir is read after the run, so it reflects any
            # folder change made while the run was in progress - confirm intent.
            message = f"Done. {result['rows']} rows. Files in {self.out_dir}"
            self.events.put(("done", message))
        except NoDataFoundError:
            self.events.put(("done", "No data found for that range."))
        except OsmsgError as exc:
            self.events.put(("done", f"Error: {exc}"))
        except Exception as exc:
            self.events.put(("done", f"Unexpected error: {type(exc).__name__}: {exc}"))
        finally:
            sys.stdout, sys.stderr = saved

    def _drain(self) -> None:
        """Move every queued event into the log pane, then reschedule itself.

        A "log" event is appended verbatim; any other kind is shown on its own
        line and re-enables the Run button.
        """
        while True:
            try:
                kind, payload = self.events.get_nowait()
            except queue.Empty:
                break
            if kind == "log":
                self._append(payload)
                continue
            self._append(f"\n{payload}\n")
            self.run_btn.configure(state="normal")
        self.root.after(120, self._drain)

    def run(self) -> None:
        """Enter the Tk main loop."""
        self.root.mainloop()
192
+
193
+
194
def launch() -> None:
    """Build the desktop window and hand control to its main loop."""
    window = App()
    window.run()
@@ -10,26 +10,26 @@ from dataclasses import dataclass
10
10
  import duckdb
11
11
  import requests
12
12
 
13
- from .ui import info, warn
13
+ from .ui import info, progress_bar, warn
14
14
 
15
15
  UTC = dt.UTC
16
16
  SCHEMA_VERSION = 1
17
17
  DEFAULT_HISTORY_URL = "hf://datasets/kshitijrajsharma/osmsg-history"
18
- HISTORY_SEQ_ID = 0 # sentinel seq_id for rows sourced from the history backfill (no replication seq)
18
+ HISTORY_SEQ_ID = 0
19
19
 
20
20
 
21
21
  @dataclass
22
22
  class Manifest:
23
23
  schema_version: int
24
- min_month: dt.datetime # first day of the earliest covered month (UTC)
25
- frontier: dt.datetime # first day of the month AFTER the latest covered month (exclusive bound)
24
+ min_month: dt.datetime
25
+ frontier: dt.datetime
26
26
 
27
27
 
28
28
  @dataclass
29
29
  class WindowSplit:
30
30
  remote_start: dt.datetime | None
31
- remote_end: dt.datetime | None # exclusive
32
- live_start: dt.datetime # the live diff path handles [live_start, end]
31
+ remote_end: dt.datetime | None
32
+ live_start: dt.datetime
33
33
 
34
34
  @property
35
35
  def has_remote(self) -> bool:
@@ -51,7 +51,6 @@ class RemoteFilters:
51
51
 
52
52
 
53
53
  def _manifest_http_url(history_url: str) -> str:
54
- # hf://datasets/<repo> -> https://huggingface.co/datasets/<repo>/resolve/main/manifest.json
55
54
  if history_url.startswith("hf://datasets/"):
56
55
  repo = history_url[len("hf://datasets/") :]
57
56
  return f"https://huggingface.co/datasets/{repo}/resolve/main/manifest.json"
@@ -78,7 +77,7 @@ def fetch_manifest(history_url: str, timeout: int = 15) -> Manifest | None:
78
77
  return None
79
78
  payload = response.json()
80
79
  else:
81
- with open(url) as handle: # local path (testing / self-hosted mirror)
80
+ with open(url) as handle:
82
81
  payload = json.load(handle)
83
82
  except (requests.RequestException, OSError, ValueError) as exc:
84
83
  warn(f"history: manifest unreachable ({type(exc).__name__}); using live path.")
@@ -124,9 +123,8 @@ def _months(start: dt.datetime, end: dt.datetime) -> list[tuple[int, int]]:
124
123
 
125
124
 
126
125
  def _partition_list(base: str, dataset: str, months: list[tuple[int, int]]) -> str | None:
127
- """Direct read_parquet() over the dataset's month partitions, or None when none exist. A glob would
128
- make DuckDB list every partition over the HF API. Local bases are filtered to files that exist,
129
- since a converted slice may lack a partition (e.g. a month with metadata but no counted edits)."""
126
+ """Direct read_parquet() over the given month partitions (local bases filtered to existing files),
127
+ or None when none exist."""
130
128
  root = base.rstrip("/")
131
129
  remote = root.startswith(("hf://", "http://", "https://", "s3://"))
132
130
  files = [f"{root}/{dataset}/year={year}/month={month}/data.parquet" for (year, month) in months]
@@ -138,9 +136,7 @@ def _partition_list(base: str, dataset: str, months: list[tuple[int, int]]) -> s
138
136
 
139
137
 
140
138
  def _hashtag_predicate(hashtags: list[str], exact_lookup: bool) -> str:
141
- """SQL predicate over the changesets `hashtags` list, matching the live ChangesetHandler.
142
- Whole-token (case-insensitive) with exact_lookup, otherwise substring. hashtags are already
143
- canonicalised to a leading '#'."""
139
+ """SQL predicate matching the changesets `hashtags` list: whole-token with exact_lookup, else substring."""
144
140
  needles = [h.lower() for h in hashtags]
145
141
  if exact_lookup:
146
142
  terms = ", ".join(f"'{n}'" for n in needles)
@@ -160,10 +156,6 @@ def ingest_remote(
160
156
  if split.remote_start is None or split.remote_end is None:
161
157
  return 0
162
158
  months = _months(split.remote_start, split.remote_end)
163
- changesets_src = _partition_list(history_url, "changesets", months)
164
- changefiles_src = _partition_list(history_url, "changefiles", months)
165
- if changesets_src is None and changefiles_src is None:
166
- return 0
167
159
  start_iso = split.remote_start.astimezone(UTC).isoformat()
168
160
  end_iso = split.remote_end.astimezone(UTC).isoformat()
169
161
  in_window = f"created_at >= TIMESTAMPTZ '{start_iso}' AND created_at < TIMESTAMPTZ '{end_iso}'"
@@ -172,21 +164,8 @@ def ingest_remote(
172
164
  conn.execute("INSTALL spatial; LOAD spatial;")
173
165
  if history_url.startswith(("hf://", "http://", "https://", "s3://")):
174
166
  conn.execute("INSTALL httpfs; LOAD httpfs;")
175
- # Ride out HF rate-limits on multi-partition reads instead of failing the run.
176
167
  conn.execute("SET http_retries=10; SET http_retry_wait_ms=2000; SET http_retry_backoff=1.5;")
177
168
 
178
- info(f"history: remote ingest {start_iso} -> {end_iso} ({len(months)} month partitions) from {history_url}")
179
-
180
- if changesets_src is not None:
181
- # Names for everyone in the window; every changeset_stats uid has a changeset row here.
182
- conn.execute(
183
- f"""INSERT INTO users
184
- SELECT uid, any_value(username) FROM {changesets_src}
185
- WHERE {in_window} AND username IS NOT NULL
186
- GROUP BY uid
187
- ON CONFLICT (uid) DO NOTHING"""
188
- )
189
-
190
169
  changeset_preds = [in_window]
191
170
  if filters.hashtags:
192
171
  changeset_preds.append(_hashtag_predicate(filters.hashtags, filters.exact_lookup))
@@ -199,51 +178,55 @@ def ingest_remote(
199
178
  changeset_preds.append(f"uid IN (SELECT uid FROM users WHERE username IN ({names}))")
200
179
  changeset_where = " AND ".join(changeset_preds)
201
180
 
202
- # Always populate changesets: every changeset_stats row needs a parent row (the live path keeps
203
- # this invariant via stubs, and Postgres enforces it as a foreign key). A metadata filter narrows
204
- # which changesets (and thus which stats) are kept; a plain run keeps all in the window.
205
- if changesets_src is not None:
206
- conn.execute(
207
- f"""INSERT INTO changesets
208
- SELECT changeset_id, uid, created_at, hashtags, editor,
209
- CASE WHEN min_lon IS NOT NULL
210
- THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat) END
211
- FROM {changesets_src} WHERE {changeset_where}
212
- ON CONFLICT (changeset_id) DO NOTHING"""
213
- )
214
-
215
181
  stats_preds = [in_window]
216
182
  if filters.has_metadata_filter:
217
- # Keep element stats only for changesets that passed the metadata filter above.
218
183
  stats_preds.append("changeset_id IN (SELECT changeset_id FROM changesets)")
219
184
  stats_where = " AND ".join(stats_preds)
220
185
 
221
- if changefiles_src is not None:
222
- conn.execute(
223
- f"""INSERT INTO changeset_stats
224
- SELECT changeset_id, {HISTORY_SEQ_ID} AS seq_id, uid,
225
- nodes_created, nodes_modified, nodes_deleted,
226
- ways_created, ways_modified, ways_deleted,
227
- rels_created, rels_modified, rels_deleted,
228
- poi_created, poi_modified, tag_stats
229
- FROM {changefiles_src} WHERE {stats_where}
230
- ON CONFLICT (seq_id, changeset_id) DO NOTHING"""
231
- )
186
+ info(f"history: remote ingest {start_iso} -> {end_iso} ({len(months)} month partitions) from {history_url}")
187
+
188
+ with progress_bar(len(months), unit="months", description="Reading history") as advance:
189
+ for month in months:
190
+ changesets_src = _partition_list(history_url, "changesets", [month])
191
+ changefiles_src = _partition_list(history_url, "changefiles", [month])
192
+ if changesets_src is not None:
193
+ conn.execute(
194
+ f"""INSERT INTO users
195
+ SELECT uid, any_value(username) FROM {changesets_src}
196
+ WHERE {in_window} AND username IS NOT NULL
197
+ GROUP BY uid ON CONFLICT (uid) DO NOTHING"""
198
+ )
199
+ conn.execute(
200
+ f"""INSERT INTO changesets
201
+ SELECT changeset_id, uid, created_at, hashtags, editor,
202
+ CASE WHEN min_lon IS NOT NULL
203
+ THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat) END
204
+ FROM {changesets_src} WHERE {changeset_where}
205
+ ON CONFLICT (changeset_id) DO NOTHING"""
206
+ )
207
+ if changefiles_src is not None:
208
+ conn.execute(
209
+ f"""INSERT INTO changeset_stats
210
+ SELECT changeset_id, {HISTORY_SEQ_ID} AS seq_id, uid,
211
+ nodes_created, nodes_modified, nodes_deleted,
212
+ ways_created, ways_modified, ways_deleted,
213
+ rels_created, rels_modified, rels_deleted,
214
+ poi_created, poi_modified, tag_stats
215
+ FROM {changefiles_src} WHERE {stats_where}
216
+ ON CONFLICT (seq_id, changeset_id) DO NOTHING"""
217
+ )
218
+ advance()
219
+
232
220
  row = conn.execute(f"SELECT count(*) FROM changeset_stats WHERE seq_id = {HISTORY_SEQ_ID}").fetchone()
233
221
  return row[0] if row else 0
234
222
 
235
223
 
236
- # Resume one day before the frontier, not at it. A changeset can stay open for up to 24h, so its
237
- # edits can straddle the frontier, and converting a date to a replication sequence is not exact. The
238
- # re-scanned day overlaps the history layer, which the seq_id=0 dedup removes, so this never misses an
239
- # edit and never double counts.
240
224
  RESUME_SAFETY = dt.timedelta(days=1)
241
225
 
242
226
 
243
227
  def seed_resume_at(conn: duckdb.DuckDBPyConnection, resume_at: dt.datetime, replication_url: str) -> dt.datetime | None:
244
- """Seed the `state` table so `osmsg --update` resumes at `resume_at` on `replication_url`. Derives
245
- the replication sequence from the timestamp, so the caller never picks a seq by hand. Returns the
246
- resume timestamp, or None if no sequence resolves at that time."""
228
+ """Seed `state` so `osmsg --update` resumes at `resume_at` on `replication_url`. Returns resume_at,
229
+ or None if no sequence resolves."""
247
230
  from osmium.replication.server import ReplicationServer
248
231
 
249
232
  from .db.schema import upsert_state
@@ -1,7 +1,5 @@
1
1
  """Convert a planet .osh history plus a changeset dump into the changefiles/changesets parquet
2
- datasets, out of core via osmsg's own DuckDB tables. Streams raw per-edit rows to parquet in bounded
3
- batches, then aggregates and joins in DuckDB (a changeset's edits are scattered across the .osh, so an
4
- in-memory pass OOMs at planet scale)."""
2
+ datasets, out of core via osmsg's own DuckDB tables."""
5
3
 
6
4
  import concurrent.futures as cf
7
5
  import datetime as dt
@@ -20,11 +18,8 @@ from .pbf_split import split_pbf
20
18
 
21
19
  BATCH = 1_000_000
22
20
  CREATE, MODIFY, DELETE = 0, 1, 2
23
- # Out-of-core settings for planet-scale aggregation. Leave headroom below physical RAM; spill to disk.
24
21
  DUCKDB_MEMORY_LIMIT = "40GB"
25
22
  DUCKDB_THREADS = 24
26
- # A global GROUP BY over all string-keyed tag rows OOMs even with spill, and json_group_object does
27
- # not spill. Shard raw tags to disk by changeset_id % K, then aggregate each shard independently.
28
23
  TAG_SHARDS = 64
29
24
 
30
25
  ELEM_SCHEMA = pa.schema(
@@ -162,9 +157,7 @@ def stream_changesets(dump: str, start: dt.datetime, end: dt.datetime, work: pat
162
157
 
163
158
 
164
159
  def build_tables(con: duckdb.DuckDBPyConnection, work: pathlib.Path) -> None:
165
- """Populate osmsg's tables (users, changesets, changeset_stats) from the streamed raw rows. Globs
166
- raw_elements_*/raw_tags_* so single-process and split-parallel runs both work: one global GROUP BY
167
- recombines each changeset's edits across parts."""
160
+ """Populate osmsg's tables (users, changesets, changeset_stats) from the streamed raw rows."""
168
161
  con.execute("INSTALL json; LOAD json;")
169
162
  work = pathlib.Path(work)
170
163
  cs = (work / "raw_changesets.parquet").as_posix()
@@ -209,8 +202,6 @@ def build_tables(con: duckdb.DuckDBPyConnection, work: pathlib.Path) -> None:
209
202
  a.rels_created, a.rels_modified, a.rels_deleted,
210
203
  a.poi_created, a.poi_modified"""
211
204
  for b in range(TAG_SHARDS):
212
- # Insert this shard's agg changesets; attach tag_stats only if the shard has tags (tiny inputs
213
- # and edit-only changesets carry none).
214
205
  shard_dir = shards / f"shard={b}"
215
206
  if shard_dir.is_dir():
216
207
  shard_glob = (shard_dir / "*.parquet").as_posix()
@@ -244,11 +235,8 @@ def build_tables(con: duckdb.DuckDBPyConnection, work: pathlib.Path) -> None:
244
235
 
245
236
 
246
237
  def export_parquet(con: duckdb.DuckDBPyConnection, out: pathlib.Path) -> None:
247
- """Materialise the two datasets as persisted tables (a view would re-run the planet-scale joins per
248
- partition; a TEMP table would hold 180M JSON rows in RAM), then write Morton-sorted partitions."""
238
+ """Materialise the two datasets as persisted tables, then write Morton-sorted partitions."""
249
239
  con.execute(MORTON_MACROS)
250
- # changefiles created_at falls back to the element edit time when the changeset predates the window,
251
- # so in-window edits are never dropped.
252
240
  con.execute(
253
241
  f"""CREATE TABLE changefiles_all AS
254
242
  SELECT s.* EXCLUDE (seq_id),
@@ -292,9 +280,8 @@ def aggregate(work: pathlib.Path, out: pathlib.Path) -> pathlib.Path:
292
280
  def convert(
293
281
  osh: str, changesets: str, start: dt.datetime, end: dt.datetime, work_dir: pathlib.Path, parts: int = 1
294
282
  ) -> pathlib.Path:
295
- """Convert one .osh history + changeset dump to the two parquet datasets under `work_dir/out`.
296
- With parts>1 the history is split at blob boundaries and streamed concurrently. Returns the out
297
- directory holding changefiles/, changesets/, and stats.duckdb."""
283
+ """Convert one .osh history + changeset dump to the two parquet datasets under `work_dir/out`,
284
+ returned as a path. With parts>1 the history is split and streamed concurrently."""
298
285
  work = pathlib.Path(work_dir)
299
286
  raw = work / "raw"
300
287
  raw.mkdir(parents=True, exist_ok=True)
@@ -12,9 +12,6 @@ from ..exceptions import OsmsgError
12
12
  from .parquet import GEOM_COLS, MORTON_MACROS, write_partitions
13
13
 
14
14
  UTC = dt.UTC
15
- # Planet-wide edits are continuous, so a complete month reaches within minutes of its end. A larger
16
- # shortfall means the source day diffs did not cover the whole month (a mid-day snapshot or lagging
17
- # replication), so the partition would be published short, the exact gap the read-side backstep masks.
18
15
  COMPLETENESS_TOLERANCE = dt.timedelta(hours=1)
19
16
 
20
17
 
@@ -6,8 +6,6 @@ import duckdb
6
6
 
7
7
  ROW_GROUP_SIZE = 100_000
8
8
 
9
- # Morton(centroid) as native SQL macros (vectorized): scale lon/lat to 16-bit and interleave the bits
10
- # so 2D locality maps to a contiguous 1D key. A Python UDF in ORDER BY is ~10x slower at planet scale.
11
9
  MORTON_MACROS = """
12
10
  CREATE OR REPLACE MACRO _s1(v) AS ((v | (v << 8)) & 16711935);
13
11
  CREATE OR REPLACE MACRO _s2(v) AS ((_s1(v) | (_s1(v) << 4)) & 252645135);
@@ -19,7 +17,6 @@ CREATE OR REPLACE MACRO morton2(lon, lat) AS (
19
17
  );
20
18
  """
21
19
 
22
- # lon/lat centroid plus bbox min/max derived from changesets.geom (osmsg stores the bbox envelope).
23
20
  GEOM_COLS = (
24
21
  "ST_X(ST_Centroid(c.geom)) AS lon, ST_Y(ST_Centroid(c.geom)) AS lat, "
25
22
  "ST_XMin(c.geom) AS min_lon, ST_YMin(c.geom) AS min_lat, "
@@ -30,9 +27,8 @@ GEOM_COLS = (
30
27
  def write_partitions(
31
28
  con: duckdb.DuckDBPyConnection, view: str, base: pathlib.Path, order_by: str = "morton2(lon, lat)"
32
29
  ) -> None:
33
- """Write one parquet file per year/month partition, each sorted by `order_by`. DuckDB's
34
- PARTITION_BY drops the global sort, so each partition is sorted on its own for tight row-group
35
- min/max. `view` must expose integer `y`, `m` partition columns."""
30
+ """Write one parquet file per year/month partition, each sorted by `order_by`. `view` must expose
31
+ integer `y`, `m` partition columns."""
36
32
  base.mkdir(parents=True, exist_ok=True)
37
33
  for year, month in con.execute(f"SELECT DISTINCT y, m FROM {view} ORDER BY y, m").fetchall():
38
34
  out = base / f"year={year}" / f"month={month}"
@@ -19,7 +19,6 @@ def read_blob(handle) -> tuple[bytes, bytes, str] | None:
19
19
 
20
20
 
21
21
  def _parse_blobheader(buf: bytes) -> tuple[str, int]:
22
- # BlobHeader: field 1 = type (length-delimited string), field 3 = datasize (varint).
23
22
  blob_type, datasize, i = "", 0, 0
24
23
  while i < len(buf):
25
24
  key = buf[i]
@@ -5,6 +5,8 @@ from __future__ import annotations
5
5
  import concurrent.futures
6
6
  import copy
7
7
  import datetime as dt
8
+ import hashlib
9
+ import json
8
10
  import os
9
11
  import shutil
10
12
  from dataclasses import dataclass, field
@@ -95,6 +97,7 @@ class RunConfig:
95
97
  insert: bool = False
96
98
  osh_file: str | None = None
97
99
  changeset_file: str | None = None
100
+ overwrite: bool = False
98
101
 
99
102
 
100
103
  def _resolve_country_urls(countries: list[str]) -> list[str]:
@@ -232,9 +235,6 @@ def _seed_history_resume(conn, cfg: RunConfig) -> None:
232
235
  seed_resume_state(conn, cfg.history_url, url)
233
236
 
234
237
 
235
- # minute < hour < day. Day diffs land at 00:00 UTC, which is also an hour and minute boundary, so
236
- # resuming a finer source at a coarser source's last_ts is disjoint (no edit double-counted or
237
- # skipped). The reverse can skip the partial current period, so --update only ever auto-refines.
238
238
  _GRANULARITY_RANK = {SHORTCUTS["minute"]: 0, SHORTCUTS["hour"]: 1, SHORTCUTS["day"]: 2}
239
239
 
240
240
 
@@ -244,9 +244,7 @@ def _tracked_sources(conn) -> list[str]:
244
244
 
245
245
 
246
246
  def _switch_source(conn, from_url: str, to_url: str) -> None:
247
- """Hand tracking from from_url to to_url at from_url's last_ts (a clean seq boundary) and retire
248
- from_url, so the two sequence spaces never overlap (double count) or gap. Each granularity is a
249
- separate sequence, so the disjoint-coverage invariant is what keeps stats correct across a switch."""
247
+ """Resume to_url at from_url's last_ts and retire from_url, so the granularities never overlap or gap."""
250
248
  state = get_state(conn, from_url)
251
249
  if state is None:
252
250
  return
@@ -258,12 +256,11 @@ def _switch_source(conn, from_url: str, to_url: str) -> None:
258
256
 
259
257
 
260
258
  def _select_update_source(conn, cfg: RunConfig, now: dt.datetime) -> None:
261
- """Pick the source `--update` continues. Without `--url`, continue the tracked source and auto-refine
262
- to a finer granularity as the backlog shrinks; with `--url`, switch to it. Switches are clean
263
- handoffs and a store tracks one planet source at a time, so granularities never overlap."""
259
+ """Pick the source `--update` continues: without `--url`, continue the tracked source and auto-refine
260
+ to a finer granularity as the backlog shrinks; with `--url`, switch to it via a clean handoff."""
264
261
  tracked = _tracked_sources(conn)
265
262
  if not tracked:
266
- return # fresh store: _resolve_url_starts bootstraps cfg.urls as given
263
+ return
267
264
  if len(tracked) > 1:
268
265
  if not cfg.url_explicit:
269
266
  cfg.urls = tracked
@@ -286,9 +283,8 @@ def _select_update_source(conn, cfg: RunConfig, now: dt.datetime) -> None:
286
283
 
287
284
 
288
285
  def _history_live_start(split: WindowSplit, frontier: dt.datetime) -> dt.datetime:
289
- """Where the live tail begins after a remote ingest. When the query reached the published frontier,
290
- back up by the safety window: the dataset's final month can stop short of its nominal boundary, so
291
- re-scanning lets the seq_id=0 dedup recover the shortfall (the overlap it did cover is dropped)."""
286
+ """Where the live tail begins after a remote ingest: back up by the safety window when the query
287
+ reached the frontier (the final month may be short), else the split boundary."""
292
288
  if split.remote_end == frontier:
293
289
  return frontier - RESUME_SAFETY
294
290
  return split.live_start
@@ -351,8 +347,6 @@ def _run_insert(cfg: RunConfig, conn: duckdb.DuckDBPyConnection, db_path: Path)
351
347
  if cfg.url_explicit:
352
348
  seed_urls = cfg.urls
353
349
  else:
354
- # The catch-up gap (now - frontier) is usually weeks; seed the granularity --update can clear
355
- # quickly instead of crawling minute diffs. --update continues this same source.
356
350
  seed_urls = [resolve_url(_pick_replication_for_span(dt.datetime.now(UTC) - split.remote_end))]
357
351
  for url in seed_urls:
358
352
  seed_resume_at(conn, resume_at, url)
@@ -367,6 +361,136 @@ def _run_insert(cfg: RunConfig, conn: duckdb.DuckDBPyConnection, db_path: Path)
367
361
  return {"rows": n, "files": written, "rows_data": [], "summary": None, "start_seq": None, "end_seq": None}
368
362
 
369
363
 
364
+ def _query_fingerprint(cfg: RunConfig) -> str:
365
+ """Stable hash of the query's data-affecting params, excluding output formats."""
366
+ key = {
367
+ "start": cfg.start_date.isoformat() if cfg.start_date else None,
368
+ "end": cfg.end_date.isoformat() if cfg.end_date else None,
369
+ "urls": sorted(cfg.urls),
370
+ "countries": sorted(cfg.countries) if cfg.countries else None,
371
+ "boundary": cfg.boundary,
372
+ "hashtags": sorted(cfg.hashtags) if cfg.hashtags else None,
373
+ "exact_lookup": cfg.exact_lookup,
374
+ "users": sorted(cfg.users_filter) if cfg.users_filter else None,
375
+ "tag_mode": cfg.tag_mode,
376
+ "additional_tags": sorted(cfg.additional_tags) if cfg.additional_tags else None,
377
+ "length_tags": sorted(cfg.length_tags) if cfg.length_tags else None,
378
+ "changeset": cfg.changeset,
379
+ "summary": cfg.summary,
380
+ "tm_stats": cfg.tm_stats,
381
+ "history_mode": cfg.history_mode,
382
+ }
383
+ return hashlib.sha256(json.dumps(key, sort_keys=True).encode()).hexdigest()
384
+
385
+
386
+ def _read_fingerprint(conn: duckdb.DuckDBPyConnection) -> str | None:
387
+ """The query fingerprint stamped on an existing store, or None if absent."""
388
+ present = conn.execute("SELECT 1 FROM information_schema.tables WHERE table_name = 'osmsg_run_meta'").fetchone()
389
+ if not present:
390
+ return None
391
+ row = conn.execute("SELECT fingerprint FROM osmsg_run_meta LIMIT 1").fetchone()
392
+ return row[0] if row else None
393
+
394
+
395
+ def _store_fingerprint(conn: duckdb.DuckDBPyConnection, fingerprint: str) -> None:
396
+ conn.execute("CREATE TABLE IF NOT EXISTS osmsg_run_meta (fingerprint VARCHAR)")
397
+ conn.execute("DELETE FROM osmsg_run_meta")
398
+ conn.execute("INSERT INTO osmsg_run_meta VALUES (?)", [fingerprint])
399
+
400
+
401
+ def _finalize(
402
+ cfg: RunConfig,
403
+ conn: duckdb.DuckDBPyConnection,
404
+ fingerprint: str,
405
+ *,
406
+ start_date_utc: dt.datetime,
407
+ end_date_utc: dt.datetime,
408
+ start_seq: int | None,
409
+ end_seq: int | None,
410
+ ) -> dict[str, Any]:
411
+ """Aggregate the populated tables into user stats and write the requested formats."""
412
+ rows = user_stats(conn, top_n=None)
413
+ if not rows:
414
+ dbmod.close(conn)
415
+ # Raised so the CLI can map "no new data" to exit 0.
416
+ raise NoDataFoundError("No stats produced for the requested time range.")
417
+ _store_fingerprint(conn, fingerprint)
418
+
419
+ if cfg.changeset or cfg.hashtags:
420
+ attach_metadata(conn, rows)
421
+ if cfg.additional_tags or cfg.tag_mode != "none" or cfg.length_tags:
422
+ attach_tag_stats(
423
+ conn,
424
+ rows,
425
+ additional_tags=cfg.additional_tags,
426
+ tag_mode=cfg.tag_mode,
427
+ length_tags=cfg.length_tags,
428
+ )
429
+ if cfg.tm_stats:
430
+ rows = tm.enrich(rows)
431
+
432
+ out = cfg.output_dir
433
+ written: dict[str, str] = {}
434
+ if "parquet" in cfg.formats:
435
+ written["parquet"] = str(to_parquet(rows, out / f"{cfg.name}.parquet"))
436
+ if "csv" in cfg.formats:
437
+ written["csv"] = str(to_csv(rows, out / f"{cfg.name}.csv"))
438
+ if "json" in cfg.formats:
439
+ written["json"] = str(to_json(rows, out / f"{cfg.name}.json"))
440
+ if "markdown" in cfg.formats:
441
+ md_path = out / f"{cfg.name}.md"
442
+ table_markdown(rows, output_path=md_path)
443
+ written["markdown"] = str(md_path)
444
+
445
+ summary_rows: list[dict[str, Any]] | None = None
446
+ if cfg.summary:
447
+ summary_rows = daily_summary(
448
+ conn,
449
+ additional_tags=cfg.additional_tags,
450
+ tag_mode=cfg.tag_mode,
451
+ length_tags=cfg.length_tags,
452
+ )
453
+ if summary_rows:
454
+ if "parquet" in cfg.formats:
455
+ written["summary_parquet"] = str(to_parquet(summary_rows, out / f"{cfg.name}_summary.parquet"))
456
+ if "csv" in cfg.formats:
457
+ written["summary_csv"] = str(to_csv(summary_rows, out / f"{cfg.name}_summary.csv"))
458
+ if "json" in cfg.formats:
459
+ written["summary_json"] = str(to_json(summary_rows, out / f"{cfg.name}_summary.json"))
460
+ if "markdown" in cfg.formats:
461
+ summary_md_path = out / f"{cfg.name}_summary.md"
462
+ summary_markdown(
463
+ rows,
464
+ output_path=summary_md_path,
465
+ start_date=start_date_utc,
466
+ end_date=end_date_utc,
467
+ additional_tags=cfg.additional_tags,
468
+ length_tags=cfg.length_tags,
469
+ tag_mode=cfg.tag_mode,
470
+ fname=cfg.name,
471
+ tm_stats=cfg.tm_stats,
472
+ )
473
+ written["summary_md"] = str(summary_md_path)
474
+ # psql: skipped on purpose, daily_summary is a query over the four base tables.
475
+
476
+ if "psql" in cfg.formats:
477
+ if not cfg.psql_dsn:
478
+ raise OsmsgError("'psql' format requires a libpq DSN (--psql-dsn / RunConfig.psql_dsn=...).")
479
+ info(f"Pushing to PostgreSQL: {cfg.psql_dsn.split()[0]}…")
480
+ to_psql(conn, cfg.psql_dsn, bulk_load=cfg.psql_bulk)
481
+ written["psql"] = cfg.psql_dsn
482
+
483
+ dbmod.close(conn)
484
+ return {
485
+ "rows": len(rows),
486
+ "files": written,
487
+ "rows_data": rows,
488
+ "summary": summary_rows,
489
+ "start_seq": start_seq,
490
+ "end_seq": end_seq,
491
+ }
492
+
493
+
370
494
  def _ensure_credentials(cfg: RunConfig) -> str | None:
371
495
  """Resolve OSM credentials and exchange them for a Geofabrik OAuth 2.0 cookie.
372
496
 
@@ -482,15 +606,33 @@ def run(cfg: RunConfig) -> dict[str, Any]:
482
606
  cookie = _ensure_credentials(cfg)
483
607
 
484
608
  db_path = cfg.output_dir / f"{cfg.name}.duckdb"
609
+
610
+ if cfg.end_date is None:
611
+ cfg.end_date = dt.datetime.now(UTC)
612
+ fingerprint = _query_fingerprint(cfg)
613
+
614
+ if not cfg.update and not cfg.insert and not cfg.overwrite and db_path.exists():
615
+ existing = dbmod.connect(str(db_path))
616
+ if _read_fingerprint(existing) == fingerprint:
617
+ info(f"Reusing {db_path} (same query); re-exporting. Pass --overwrite to recompute.")
618
+ start_utc = (cfg.start_date or cfg.end_date).astimezone(UTC)
619
+ return _finalize(
620
+ cfg,
621
+ existing,
622
+ fingerprint,
623
+ start_date_utc=start_utc,
624
+ end_date_utc=cfg.end_date.astimezone(UTC),
625
+ start_seq=None,
626
+ end_seq=None,
627
+ )
628
+ dbmod.close(existing)
629
+
485
630
  if not cfg.update and db_path.exists():
486
631
  db_path.unlink()
487
632
  conn = dbmod.connect(str(db_path))
488
633
  dbmod.create_tables(conn)
489
634
  info(f"DuckDB: {db_path}")
490
635
 
491
- if cfg.end_date is None:
492
- cfg.end_date = dt.datetime.now(UTC)
493
-
494
636
  if cfg.insert:
495
637
  return _run_insert(cfg, conn, db_path)
496
638
 
@@ -533,8 +675,6 @@ def run(cfg: RunConfig) -> dict[str, Any]:
533
675
  if (cfg.tm_stats or cfg.summary or cfg.tag_mode == "all") and not cfg.changeset and not cfg.hashtags:
534
676
  cfg.changeset = True
535
677
 
536
- # Hybrid-auto history: serve the covered months from the published parquet, leaving only the
537
- # uncovered recent tail to the live diff path. Falls back to full live on any problem.
538
678
  run_live = True
539
679
  if cfg.history_mode == "auto" and not cfg.update:
540
680
  if cfg.length_tags:
@@ -562,7 +702,6 @@ def run(cfg: RunConfig) -> dict[str, Any]:
562
702
  _auto_switch_replication(cfg, cfg.end_date - cfg.start_date)
563
703
  except duckdb.Error as exc:
564
704
  warn(f"history: remote ingest failed ({type(exc).__name__}: {exc}); using live path.")
565
- # Discard any partial remote rows so the live path is the sole source.
566
705
  for tbl in ("changeset_stats", "changesets", "users"):
567
706
  conn.execute(f"DELETE FROM {tbl}")
568
707
  run_live = True
@@ -700,9 +839,6 @@ def run(cfg: RunConfig) -> dict[str, Any]:
700
839
  if sub.exists():
701
840
  shutil.rmtree(sub, ignore_errors=True)
702
841
 
703
- # History rows (seq_id=0) hold a changeset's COMPLETE lifetime counts. If the live tail re-saw
704
- # some of those edits for a changeset that straddles the frontier, drop the live duplicates: the
705
- # history row already counts them, so this is no-loss and prevents double counting.
706
842
  history_row = conn.execute("SELECT count(*) FROM changeset_stats WHERE seq_id = 0").fetchone()
707
843
  has_history = bool(history_row and history_row[0] > 0)
708
844
  if run_live and has_history:
@@ -720,90 +856,15 @@ def run(cfg: RunConfig) -> dict[str, Any]:
720
856
  else:
721
857
  start_date_utc = cfg.start_date.astimezone(UTC)
722
858
 
723
- rows = user_stats(conn, top_n=None)
724
- if not rows:
725
- dbmod.close(conn)
726
- # Raised so the CLI can map "no new data" to exit 0.
727
- raise NoDataFoundError("No stats produced for the requested time range.")
728
-
729
- if cfg.changeset or cfg.hashtags:
730
- attach_metadata(conn, rows)
731
- if cfg.additional_tags or cfg.tag_mode != "none" or cfg.length_tags:
732
- attach_tag_stats(
733
- conn,
734
- rows,
735
- additional_tags=cfg.additional_tags,
736
- tag_mode=cfg.tag_mode,
737
- length_tags=cfg.length_tags,
738
- )
739
-
740
- if cfg.tm_stats:
741
- rows = tm.enrich(rows)
742
-
743
- out = cfg.output_dir
744
- written: dict[str, str] = {}
745
- if "parquet" in cfg.formats:
746
- written["parquet"] = str(to_parquet(rows, out / f"{cfg.name}.parquet"))
747
- if "csv" in cfg.formats:
748
- written["csv"] = str(to_csv(rows, out / f"{cfg.name}.csv"))
749
- if "json" in cfg.formats:
750
- written["json"] = str(to_json(rows, out / f"{cfg.name}.json"))
751
-
752
- if "markdown" in cfg.formats:
753
- md_path = out / f"{cfg.name}.md"
754
- table_markdown(
755
- rows,
756
- output_path=md_path,
757
- )
758
- written["markdown"] = str(md_path)
759
-
760
- summary_rows: list[dict[str, Any]] | None = None
761
- if cfg.summary:
762
- summary_rows = daily_summary(
763
- conn,
764
- additional_tags=cfg.additional_tags,
765
- tag_mode=cfg.tag_mode,
766
- length_tags=cfg.length_tags,
767
- )
768
- if summary_rows:
769
- if "parquet" in cfg.formats:
770
- written["summary_parquet"] = str(to_parquet(summary_rows, out / f"{cfg.name}_summary.parquet"))
771
- if "csv" in cfg.formats:
772
- written["summary_csv"] = str(to_csv(summary_rows, out / f"{cfg.name}_summary.csv"))
773
- if "json" in cfg.formats:
774
- written["summary_json"] = str(to_json(summary_rows, out / f"{cfg.name}_summary.json"))
775
- if "markdown" in cfg.formats:
776
- summary_md_path = out / f"{cfg.name}_summary.md"
777
- summary_markdown(
778
- rows,
779
- output_path=summary_md_path,
780
- start_date=start_date_utc,
781
- end_date=end_date_utc,
782
- additional_tags=cfg.additional_tags,
783
- length_tags=cfg.length_tags,
784
- tag_mode=cfg.tag_mode,
785
- fname=cfg.name,
786
- tm_stats=cfg.tm_stats,
787
- )
788
- written["summary_md"] = str(summary_md_path)
789
- # psql: skipped on purpose, daily_summary is a query over the four base tables.
790
-
791
- if "psql" in cfg.formats:
792
- if not cfg.psql_dsn:
793
- raise OsmsgError("'psql' format requires a libpq DSN (--psql-dsn / RunConfig.psql_dsn=...).")
794
- info(f"Pushing to PostgreSQL: {cfg.psql_dsn.split()[0]}…")
795
- to_psql(conn, cfg.psql_dsn, bulk_load=cfg.psql_bulk)
796
- written["psql"] = cfg.psql_dsn
797
-
798
- dbmod.close(conn)
799
- return {
800
- "rows": len(rows),
801
- "files": written,
802
- "rows_data": rows,
803
- "summary": summary_rows,
804
- "start_seq": start_seq,
805
- "end_seq": end_seq,
806
- }
859
+ return _finalize(
860
+ cfg,
861
+ conn,
862
+ fingerprint,
863
+ start_date_utc=start_date_utc,
864
+ end_date_utc=end_date_utc,
865
+ start_seq=start_seq,
866
+ end_seq=end_seq,
867
+ )
807
868
 
808
869
 
809
870
  __all__ = ["RunConfig", "run"]
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "osmsg"
3
- version = "1.2.0"
3
+ version = "1.2.2"
4
4
  description = "OpenStreetMap Stats Generator: Commandline"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -41,6 +41,9 @@ repository = "https://github.com/osgeonepal/osmsg"
41
41
  [project.scripts]
42
42
  osmsg = "osmsg.cli:app"
43
43
 
44
+ [project.gui-scripts]
45
+ osmsg-gui = "osmsg.gui:launch"
46
+
44
47
  [build-system]
45
48
  requires = ["uv_build>=0.5.15,<0.9"]
46
49
  build-backend = "uv_build"
@@ -1 +0,0 @@
1
- __version__ = "1.2.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes