osmsg 1.1.2__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {osmsg-1.1.2 → osmsg-1.2.0}/PKG-INFO +64 -2
  2. {osmsg-1.1.2 → osmsg-1.2.0}/README.md +63 -1
  3. osmsg-1.2.0/osmsg/__version__.py +1 -0
  4. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/cli.py +81 -4
  5. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/__init__.py +1 -1
  6. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/ingest.py +1 -1
  7. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/exceptions.py +1 -1
  8. osmsg-1.2.0/osmsg/export/psql.py +156 -0
  9. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/handlers.py +1 -1
  10. osmsg-1.2.0/osmsg/history.py +272 -0
  11. osmsg-1.2.0/osmsg/maintain/__init__.py +7 -0
  12. osmsg-1.2.0/osmsg/maintain/cli.py +83 -0
  13. osmsg-1.2.0/osmsg/maintain/convert.py +314 -0
  14. osmsg-1.2.0/osmsg/maintain/manifest.py +62 -0
  15. osmsg-1.2.0/osmsg/maintain/month.py +120 -0
  16. osmsg-1.2.0/osmsg/maintain/parquet.py +43 -0
  17. osmsg-1.2.0/osmsg/maintain/pbf_split.py +79 -0
  18. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/pipeline.py +233 -16
  19. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/replication.py +1 -1
  20. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/tm.py +1 -1
  21. {osmsg-1.1.2 → osmsg-1.2.0}/pyproject.toml +1 -1
  22. osmsg-1.1.2/osmsg/__version__.py +0 -1
  23. osmsg-1.1.2/osmsg/export/psql.py +0 -69
  24. {osmsg-1.1.2 → osmsg-1.2.0}/LICENSE +0 -0
  25. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/__init__.py +0 -0
  26. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/_http.py +0 -0
  27. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/_tick.py +0 -0
  28. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/auth.py +0 -0
  29. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/boundary.py +0 -0
  30. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/duckdb_schema.py +0 -0
  31. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/queries.py +0 -0
  32. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/schema.py +0 -0
  33. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/__init__.py +0 -0
  34. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/csv.py +0 -0
  35. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/json.py +0 -0
  36. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/markdown.py +0 -0
  37. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/parquet.py +0 -0
  38. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/fetch.py +0 -0
  39. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/geofabrik.py +0 -0
  40. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/models.py +0 -0
  41. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/pg_schema.py +0 -0
  42. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/py.typed +0 -0
  43. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/ui.py +0 -0
  44. {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/workers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osmsg
3
- Version: 1.1.2
3
+ Version: 1.2.0
4
4
  Summary: OpenStreetMap Stats Generator: Commandline
5
5
  Keywords: osm,stats,commandline,openstreetmap
6
6
  Author: Kshitij Raj Sharma
@@ -46,13 +46,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
46
46
 
47
47
  A Project of [OSGeo Nepal](https://osgeonepal.org).
48
48
 
49
- ## Features
49
+ ## What does it do?
50
50
 
51
51
  - Per-user create/modify/delete counts over any time window.
52
52
  - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
53
53
  - Country and custom-boundary filters via Geofabrik.
54
54
  - Cron-friendly resume with `--update`.
55
+ - One-command setup: `osmsg --insert` loads all history into your store, `osmsg --update` keeps it current.
55
56
  - Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
57
+ - Cloud-native history: months covered by a published parquet dataset are read remotely.
56
58
 
57
59
  ## Install
58
60
 
@@ -68,6 +70,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
68
70
  `uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
69
71
  with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
70
72
 
73
+ More ways to install:
74
+
75
+ ```bash
76
+ conda install -c conda-forge osmsg # conda / mamba
77
+ brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
78
+ ```
79
+
80
+ On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
81
+ and run it directly, no Python required.
82
+
71
83
  ## Quick start
72
84
 
73
85
  ```bash
@@ -78,6 +90,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
78
90
 
79
91
  That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
80
92
 
93
+ ## Set up a full history store
94
+
95
+ Two commands give you a complete, self-updating store. The first loads all of OSM history from the
96
+ published dataset and records where to resume; the second catches up to now and runs on a schedule.
97
+
98
+ ```bash
99
+ osmsg --insert # load all history into stats.duckdb, then exit
100
+ osmsg --update # catch up to now (repeat on cron)
101
+ ```
102
+
103
+ `osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
104
+ current. For near-real-time, run `osmsg --update --url minute`.
105
+
106
+ Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
107
+
108
+ ```bash
109
+ osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
110
+ osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
111
+ ```
112
+
113
+ Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
114
+
115
+ ```bash
116
+ osmsg --insert --start 2020-01-01 --end 2023-01-01
117
+ ```
118
+
119
+ Already have the planet files? Insert from them directly:
120
+
121
+ ```bash
122
+ osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
123
+ ```
124
+
81
125
  ## Tutorials
82
126
 
83
127
  ### 1. Stats for a country
@@ -185,6 +229,11 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
185
229
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
186
230
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
187
231
  | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
232
+ | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
233
+ | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
234
+ | `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Published dataset location. |
235
+ | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
236
+ | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
188
237
  | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
189
238
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
190
239
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -192,6 +241,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
192
241
 
193
242
  A `.env` file at the working directory is loaded automatically.
194
243
 
244
+ ## Maintainers
245
+
246
+ Generating and publishing the history dataset is the `osmsg maintain` group:
247
+
248
+ ```bash
249
+ osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
250
+ osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
251
+ osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
252
+ osmsg maintain publish work/out --repo osgeonepal/osmsg-history
253
+ ```
254
+
255
+ See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
256
+
195
257
  ## Documentation
196
258
 
197
259
  - [Installation](./docs/Installation.md)
@@ -14,13 +14,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
14
14
 
15
15
  A Project of [OSGeo Nepal](https://osgeonepal.org).
16
16
 
17
- ## Features
17
+ ## What does it do?
18
18
 
19
19
  - Per-user create/modify/delete counts over any time window.
20
20
  - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
21
21
  - Country and custom-boundary filters via Geofabrik.
22
22
  - Cron-friendly resume with `--update`.
23
+ - One-command setup: `osmsg --insert` loads all history into your store, `osmsg --update` keeps it current.
23
24
  - Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
25
+ - Cloud-native history: months covered by a published parquet dataset are read remotely.
24
26
 
25
27
  ## Install
26
28
 
@@ -36,6 +38,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
36
38
  `uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
37
39
  with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
38
40
 
41
+ More ways to install:
42
+
43
+ ```bash
44
+ conda install -c conda-forge osmsg # conda / mamba
45
+ brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
46
+ ```
47
+
48
+ On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
49
+ and run it directly, no Python required.
50
+
39
51
  ## Quick start
40
52
 
41
53
  ```bash
@@ -46,6 +58,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
46
58
 
47
59
  That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
48
60
 
61
+ ## Set up a full history store
62
+
63
+ Two commands give you a complete, self-updating store. The first loads all of OSM history from the
64
+ published dataset and records where to resume; the second catches up to now and runs on a schedule.
65
+
66
+ ```bash
67
+ osmsg --insert # load all history into stats.duckdb, then exit
68
+ osmsg --update # catch up to now (repeat on cron)
69
+ ```
70
+
71
+ `osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
72
+ current. For near-real-time, run `osmsg --update --url minute`.
73
+
74
+ Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
75
+
76
+ ```bash
77
+ osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
78
+ osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
79
+ ```
80
+
81
+ Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
82
+
83
+ ```bash
84
+ osmsg --insert --start 2020-01-01 --end 2023-01-01
85
+ ```
86
+
87
+ Already have the planet files? Insert from them directly:
88
+
89
+ ```bash
90
+ osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
91
+ ```
92
+
49
93
  ## Tutorials
50
94
 
51
95
  ### 1. Stats for a country
@@ -153,6 +197,11 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
153
197
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
154
198
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
155
199
  | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
200
+ | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
201
+ | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
202
+ | `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Published dataset location. |
203
+ | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
204
+ | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
156
205
  | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
157
206
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
158
207
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -160,6 +209,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
160
209
 
161
210
  A `.env` file at the working directory is loaded automatically.
162
211
 
212
+ ## Maintainers
213
+
214
+ Generating and publishing the history dataset is the `osmsg maintain` group:
215
+
216
+ ```bash
217
+ osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
218
+ osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
219
+ osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
220
+ osmsg maintain publish work/out --repo osgeonepal/osmsg-history
221
+ ```
222
+
223
+ See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
224
+
163
225
  ## Documentation
164
226
 
165
227
  - [Installation](./docs/Installation.md)
@@ -0,0 +1 @@
1
+ __version__ = "1.2.0"
@@ -1,6 +1,6 @@
1
1
  """Typer-based CLI for osmsg.
2
2
 
3
- UTC throughout no display timezone. Outputs default to parquet (queryable from
3
+ UTC throughout, no display timezone. Outputs default to parquet (queryable from
4
4
  disk by DuckDB / polars / pandas). Other formats: csv, json, markdown, psql.
5
5
  """
6
6
 
@@ -24,6 +24,7 @@ from .exceptions import (
24
24
  OsmsgError,
25
25
  UnknownRegionError,
26
26
  )
27
+ from .maintain.cli import maintain_app
27
28
  from .pipeline import RunConfig, run
28
29
  from .ui import console, error, info, render_table, warn
29
30
 
@@ -36,6 +37,7 @@ app = typer.Typer(
36
37
  no_args_is_help=False,
37
38
  help="OpenStreetMap stats generator. Parquet-first, OAuth 2.0, UTC-only.",
38
39
  )
40
+ app.add_typer(maintain_app, name="maintain")
39
41
 
40
42
 
41
43
  class Period(StrEnum):
@@ -104,9 +106,10 @@ def _period_range(period: Period) -> tuple[dt.datetime, dt.datetime]:
104
106
  raise ValueError(period)
105
107
 
106
108
 
107
- @app.command()
109
+ @app.callback(invoke_without_command=True)
108
110
  @use_yaml_config(param_name="config", param_help="YAML config file (CLI flags override its values).")
109
111
  def main(
112
+ ctx: typer.Context,
110
113
  version: Annotated[
111
114
  bool | None,
112
115
  typer.Option("--version", callback=_version_callback, is_eager=True, help="Print version and exit."),
@@ -215,6 +218,15 @@ def main(
215
218
  str | None,
216
219
  typer.Option("--psql-dsn", envvar="OSMSG_PSQL_DSN", help="libpq DSN for --format psql."),
217
220
  ] = None,
221
+ psql_bulk: Annotated[
222
+ bool,
223
+ typer.Option(
224
+ "--psql-bulk",
225
+ envvar="OSMSG_PSQL_BULK",
226
+ help="Faster one-time psql load: drop secondary indexes and foreign keys during the push "
227
+ "and rebuild them after. Use for a full history import, not for incremental --update.",
228
+ ),
229
+ ] = False,
218
230
  changeset_pad_hours: Annotated[
219
231
  int,
220
232
  typer.Option(
@@ -226,16 +238,68 @@ def main(
226
238
  max=48,
227
239
  ),
228
240
  ] = 1,
241
+ history: Annotated[
242
+ bool,
243
+ typer.Option(
244
+ "--history/--no-history",
245
+ envvar="OSMSG_HISTORY",
246
+ help="Serve covered months from the published parquet (HuggingFace) and only download the "
247
+ "recent tail. Falls back to the live diff path if unavailable. Ignored by --update.",
248
+ ),
249
+ ] = True,
250
+ history_url: Annotated[
251
+ str,
252
+ typer.Option(
253
+ "--history-url",
254
+ envvar="OSMSG_HISTORY_URL",
255
+ help="Base URL of the published history dataset.",
256
+ ),
257
+ ] = "hf://datasets/kshitijrajsharma/osmsg-history",
258
+ insert: Annotated[
259
+ bool,
260
+ typer.Option(
261
+ "--insert",
262
+ help="Load history into the store and seed resume state, then exit. No window loads the "
263
+ "whole published history; --start/--end loads a slice. Follow with --update to catch up.",
264
+ ),
265
+ ] = False,
266
+ osh_file: Annotated[
267
+ str | None,
268
+ typer.Option("--osh-file", help="Insert from a local .osh.pbf instead of the published dataset."),
269
+ ] = None,
270
+ changeset_file: Annotated[
271
+ str | None,
272
+ typer.Option("--changeset-file", help="Changeset dump (.osm.bz2) paired with --osh-file."),
273
+ ] = None,
229
274
  ) -> None:
230
- """Run osmsg."""
275
+ """Run osmsg. With no subcommand this generates stats (or loads history with --insert)."""
276
+ if ctx.invoked_subcommand is not None:
277
+ return
231
278
  if formats is None:
232
279
  formats = [Format.parquet]
280
+ if psql_dsn and Format.psql not in formats:
281
+ formats.append(Format.psql)
233
282
  if sum(1 for x in (start, last, days) if x) > 1:
234
- error("--start, --last, and --days are mutually exclusive pick one.")
283
+ error("--start, --last, and --days are mutually exclusive, pick one.")
235
284
  raise typer.Exit(code=2)
236
285
  if update and any(x is not None for x in (start, end, last, days)):
237
286
  error("--update resumes from prior state and runs to head; it ignores --start/--end/--last/--days.")
238
287
  raise typer.Exit(code=2)
288
+ if insert and update:
289
+ error("--insert and --update are mutually exclusive; insert first, then update.")
290
+ raise typer.Exit(code=2)
291
+ if insert and (last is not None or days is not None):
292
+ error("--insert takes --start/--end (or no window), not --last/--days.")
293
+ raise typer.Exit(code=2)
294
+ if (osh_file is None) != (changeset_file is None):
295
+ error("--osh-file and --changeset-file must be given together.")
296
+ raise typer.Exit(code=2)
297
+ if osh_file and not insert:
298
+ error("--osh-file/--changeset-file are only valid with --insert.")
299
+ raise typer.Exit(code=2)
300
+ if psql_bulk and update:
301
+ error("--psql-bulk is for a one-time full load (drops indexes/keys); do not use it with --update.")
302
+ raise typer.Exit(code=2)
239
303
  if Format.psql in formats and not psql_dsn:
240
304
  error("-f psql requires --psql-dsn (libpq connection string, e.g. 'host=localhost dbname=osm user=osm').")
241
305
  raise typer.Exit(code=2)
@@ -267,7 +331,13 @@ def main(
267
331
  osm_username=username,
268
332
  osm_password=_read_password_stdin() if password_stdin else None,
269
333
  psql_dsn=psql_dsn,
334
+ psql_bulk=psql_bulk,
270
335
  changeset_pad_hours=changeset_pad_hours,
336
+ history_mode="auto" if history else "off",
337
+ history_url=history_url,
338
+ insert=insert,
339
+ osh_file=osh_file,
340
+ changeset_file=changeset_file,
271
341
  )
272
342
 
273
343
  if last is not None:
@@ -300,6 +370,13 @@ def main(
300
370
  error(str(exc))
301
371
  raise typer.Exit(code=2) from exc
302
372
 
373
+ if insert:
374
+ info(f"insert complete: {result['rows']:,} history changeset rows loaded.")
375
+ for label, path in (result.get("files") or {}).items():
376
+ console.print(f"[green]✓[/green] {label}: [bold]{path}[/bold]")
377
+ console.print("Next: [bold]osmsg --update[/bold] to catch up to now.")
378
+ return
379
+
303
380
  rows_data = result.get("rows_data") or []
304
381
  display_n = min(rows or 20, len(rows_data))
305
382
  render_table(
@@ -1,7 +1,7 @@
1
1
  """DuckDB persistence: schema, ingest, queries.
2
2
 
3
3
  The schema is portable: identical column shape works in DuckDB, Parquet, and
4
- PostgreSQL exporters re-issue the CREATE TABLE there.
4
+ PostgreSQL; exporters re-issue the CREATE TABLE there.
5
5
 
6
6
  Public surface:
7
7
 
@@ -106,7 +106,7 @@ def merge_parquet_files(conn: duckdb.DuckDBPyConnection, parquet_dir: Path, *, c
106
106
  _quarantine_corrupt(parquet_dir)
107
107
 
108
108
  def pattern(name: str) -> str:
109
- # read_parquet() takes a literal escape so quoted paths can't break out.
109
+ # read_parquet() takes a literal; escape it so quoted paths can't break out.
110
110
  return _sql_escape((parquet_dir / f"temp_*_{name}_*.parquet").as_posix())
111
111
 
112
112
  conn.execute("BEGIN")
@@ -25,7 +25,7 @@ class GeofabrikAuthError(OsmsgError):
25
25
 
26
26
 
27
27
  class NoDataFoundError(Exception):
28
- """Empty range info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
28
+ """Empty range, info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
29
29
 
30
30
 
31
31
  __all__ = [
@@ -0,0 +1,156 @@
1
+ """PostgreSQL exporter via DuckDB's postgres extension."""
2
+
3
+ import duckdb
4
+
5
+ from ..exceptions import OsmsgError
6
+ from ..pg_schema import PG_SCHEMA
7
+
8
# Bulk-load bookkeeping.
#
# Secondary indexes and foreign keys slow down a row-by-row insert, so the one-time bulk load drops
# them up front and recreates them once at the end (a single index build and a single FK validation
# rather than per-row maintenance). Primary keys are left alone: the ON CONFLICT upserts rely on them.

# Each entry: (index name, CREATE INDEX statement used to rebuild it).
_BULK_INDEXES = [
    (
        "idx_changesets_created_at",
        "CREATE INDEX idx_changesets_created_at ON changesets (created_at)",
    ),
    (
        "idx_changesets_geom",
        "CREATE INDEX idx_changesets_geom ON changesets USING GIST (geom)",
    ),
    (
        "idx_changeset_stats_uid",
        "CREATE INDEX idx_changeset_stats_uid ON changeset_stats (uid)",
    ),
]

# Each entry: (table, constraint name, clause appended to ALTER TABLE ... ADD CONSTRAINT).
_BULK_FKS = [
    (
        "changesets",
        "changesets_uid_fkey",
        "FOREIGN KEY (uid) REFERENCES users (uid)",
    ),
    (
        "changeset_stats",
        "changeset_stats_changeset_id_fkey",
        "FOREIGN KEY (changeset_id) REFERENCES changesets (changeset_id)",
    ),
    (
        "changeset_stats",
        "changeset_stats_uid_fkey",
        "FOREIGN KEY (uid) REFERENCES users (uid)",
    ),
]


# Number of changeset_id ranges a bulk load splits the big tables into. Every range is pushed by its
# own statement (and therefore its own commit), so a failure loses one range, not the whole
# multi-GB load.
_BULK_COMMIT_CHUNKS = 32
31
+
32
+
33
def _pg(conn: duckdb.DuckDBPyConnection, sql: str) -> None:
    """Run raw SQL on the attached ``pg_target`` database through ``postgres_execute``.

    ``sql`` is embedded as a ``$$``-quoted literal, so it must not itself contain ``$$``.
    """
    call = f"CALL postgres_execute('pg_target', $${sql}$$)"
    conn.execute(call)
35
+
36
+
37
def _pg_has_history(conn: duckdb.DuckDBPyConnection) -> bool:
    """Report whether ``pg_target.changeset_stats`` holds at least one ``seq_id = 0`` (history) row.

    The inner ``LIMIT 1`` keeps the probe cheap: the count is over at most one row.
    """
    first = conn.execute(
        "SELECT count(*) FROM (SELECT 1 FROM pg_target.changeset_stats WHERE seq_id = 0 LIMIT 1) t"
    ).fetchone()
    if not first:
        return False
    return bool(first[0])
42
+
43
+
44
def _push_changesets(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
    """Upsert local ``changesets`` rows into ``pg_target.changesets``.

    ``where`` is an optional, already-formed ``WHERE ...`` clause appended to the local SELECT.
    On a changeset_id conflict each metadata column keeps the incoming value unless it is NULL,
    so a newer non-NULL wins and a NULL never downgrades (mirrors the DuckDB-side merge).
    """
    upsert = f"""
        INSERT INTO pg_target.changesets AS c (changeset_id, uid, created_at, hashtags, editor, geom)
        SELECT changeset_id, uid, created_at, hashtags, editor, geom FROM changesets {where}
        ON CONFLICT (changeset_id) DO UPDATE SET
            created_at = COALESCE(EXCLUDED.created_at, c.created_at),
            hashtags = COALESCE(EXCLUDED.hashtags, c.hashtags),
            editor = COALESCE(EXCLUDED.editor, c.editor),
            geom = COALESCE(EXCLUDED.geom, c.geom)
        """
    conn.execute(upsert)
57
+
58
+
59
def _push_changeset_stats(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
    """Insert local ``changeset_stats`` rows into ``pg_target.changeset_stats``.

    ``where`` is an optional, already-formed ``WHERE ...`` clause appended to the local SELECT.
    Rows that conflict with an existing one are skipped (``ON CONFLICT DO NOTHING``).
    """
    statement = f"INSERT INTO pg_target.changeset_stats SELECT * FROM changeset_stats {where} ON CONFLICT DO NOTHING"
    conn.execute(statement)
61
+
62
+
63
def _push_chunked(conn: duckdb.DuckDBPyConnection, source: str, push, *, chunks: int | None = None) -> None:
    """Call ``push`` once per changeset_id range so each range commits on its own.

    Args:
        conn: Connection used to read the id bounds; also handed to ``push``.
        source: Local table name to read ``min``/``max(changeset_id)`` from. It is interpolated
            into the query, so it must be a trusted identifier.
        push: Callable invoked as ``push(conn, where)`` with a half-open
            ``WHERE changeset_id >= lo AND changeset_id < hi`` clause.
        chunks: Upper bound on the number of ranges. Defaults to ``_BULK_COMMIT_CHUNKS``.

    Raises:
        ValueError: If ``chunks`` is smaller than 1 (the range width would be undefined).

    Does nothing when ``source`` is empty (no minimum id).
    """
    if chunks is None:
        chunks = _BULK_COMMIT_CHUNKS
    if chunks < 1:
        raise ValueError(f"chunks must be >= 1, got {chunks}")

    bounds = conn.execute(f"SELECT min(changeset_id), max(changeset_id) FROM {source}").fetchone()
    if not bounds or bounds[0] is None:
        return
    lo, hi = bounds
    # "+ 1" keeps the width at least 1 and guarantees the last range reaches past ``hi``.
    step = (hi - lo) // chunks + 1
    for start in range(lo, hi + 1, step):
        push(conn, f"WHERE changeset_id >= {start} AND changeset_id < {start + step}")
74
+
75
+
76
def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = False) -> None:
    """Push every osmsg table from ``conn`` to the PostgreSQL database behind the libpq ``dsn``.

    Args:
        conn: DuckDB connection holding the local ``users``, ``changesets``, ``changeset_stats``
            and ``state`` tables.
        dsn: libpq connection string. It is interpolated into ATTACH (single quotes doubled),
            so it must come from a trusted source.
        bulk_load: One-time full-history import mode: drop the secondary indexes and foreign
            keys, push the big tables in changeset_id ranges, then rebuild and ANALYZE. Leave
            it off for incremental ``--update`` pushes.

    Raises:
        OsmsgError: If the target already holds state for a source this run does not have.

    The target is attached as ``pg_target`` and always detached again, even on failure.
    """
    for extension in ("postgres", "spatial"):
        conn.execute(f"INSTALL {extension}")
        conn.execute(f"LOAD {extension}")
    quoted_dsn = dsn.replace("'", "''")
    conn.execute(f"ATTACH '{quoted_dsn}' AS pg_target (TYPE postgres)")

    def source_urls(query: str) -> set:
        # First column of every row, de-duplicated.
        return {record[0] for record in conn.execute(query).fetchall()}

    all_users_sql = "INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING"

    try:
        # Make sure the target tables exist: run the schema one statement at a time.
        for piece in PG_SCHEMA.strip().split(";"):
            if ddl := piece.strip():
                _pg(conn, ddl)

        # Refuse cross-source push: would double-count via the (seq_id, changeset_id) PK.
        local_sources = source_urls("SELECT source_url FROM state")
        existing_sources = source_urls("SELECT source_url FROM pg_target.state")
        cross_source = existing_sources - local_sources
        if cross_source and local_sources:
            raise OsmsgError(
                f"PG target already has data from source(s) {sorted(cross_source)} "
                f"but this run pushes from {sorted(local_sources)}. Mixing sources "
                f"double-counts via the (seq_id, changeset_id) key. Use a separate "
                f"--psql-dsn, or wipe the existing PG tables first."
            )

        if bulk_load:
            # Let DuckDB stream rows rather than buffer them to keep their order: buffering
            # 180M+ JSON-bearing rows is what runs a single INSERT out of memory. After that,
            # drop the foreign keys and secondary indexes so the load does not maintain them
            # row by row.
            conn.execute("SET preserve_insertion_order = false")
            for table, constraint, _ in _BULK_FKS:
                _pg(conn, f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {constraint}")
            for index, _ in _BULK_INDEXES:
                _pg(conn, f"DROP INDEX IF EXISTS {index}")
            conn.execute(all_users_sql)
            _push_chunked(conn, "changesets", _push_changesets)
            _push_chunked(conn, "changeset_stats", _push_changeset_stats)
        elif _pg_has_history(conn):
            # PG already carries the history layer (seq_id=0) from an earlier bulk load and that
            # layer never changes, so an incremental --update sends only the live layer plus the
            # users and changesets it references, not the 180M history rows.
            conn.execute(
                "INSERT INTO pg_target.users SELECT * FROM users "
                "WHERE uid IN (SELECT uid FROM changeset_stats WHERE seq_id <> 0) ON CONFLICT DO NOTHING"
            )
            live_ids = "changeset_id IN (SELECT changeset_id FROM changeset_stats WHERE seq_id <> 0)"
            _push_changesets(conn, f"WHERE {live_ids}")
            _push_changeset_stats(conn, "WHERE seq_id <> 0")
        else:
            # Plain live target with no history in PG: send everything (live rows are all
            # seq_id<>0).
            conn.execute(all_users_sql)
            _push_changesets(conn)
            _push_changeset_stats(conn)

        # Mirror the resume state so the target records the same position as the local store.
        conn.execute(
            """
            INSERT INTO pg_target.state (source_url, last_seq, last_ts, updated_at)
            SELECT source_url, last_seq, last_ts, updated_at FROM state
            ON CONFLICT (source_url) DO UPDATE SET
                last_seq = EXCLUDED.last_seq,
                last_ts = EXCLUDED.last_ts,
                updated_at = EXCLUDED.updated_at
            """
        )

        if bulk_load:
            # Rebuild once, giving the sort-based index builds more memory, then refresh the
            # planner statistics.
            for table, constraint, clause in _BULK_FKS:
                _pg(conn, f"ALTER TABLE {table} ADD CONSTRAINT {constraint} {clause}")
            for _, create_sql in _BULK_INDEXES:
                _pg(conn, f"SET maintenance_work_mem = '512MB'; {create_sql}")
            for table in ("users", "changesets", "changeset_stats"):
                _pg(conn, f"ANALYZE {table}")
    finally:
        conn.execute("DETACH pg_target")
154
+
155
+
156
+ __all__ = ["PG_SCHEMA", "to_psql"]
@@ -56,7 +56,7 @@ class ChangesetHandler(osmium.SimpleHandler):
56
56
 
57
57
  keep = bool(cfg["changeset_meta"] and not cfg["hashtags"])
58
58
  # Some editors only fill the `hashtags` tag (comment stays generic); checking
59
- # comment alone silently drops those. Tokenize via regex on both real data
59
+ # comment alone silently drops those. Tokenize via regex on both; real data
60
60
  # mixes `;`, space, and comma as separators inside `hashtags`.
61
61
  comment = c.tags.get("comment", "")
62
62
  hashtags_field = c.tags.get("hashtags", "")