osmsg 1.1.2__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {osmsg-1.1.2 → osmsg-1.2.1}/PKG-INFO +68 -2
  2. {osmsg-1.1.2 → osmsg-1.2.1}/README.md +67 -1
  3. osmsg-1.2.1/osmsg/__version__.py +1 -0
  4. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/cli.py +90 -4
  5. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/__init__.py +1 -1
  6. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/ingest.py +1 -1
  7. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/exceptions.py +1 -1
  8. osmsg-1.2.1/osmsg/export/psql.py +143 -0
  9. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/handlers.py +1 -1
  10. osmsg-1.2.1/osmsg/history.py +255 -0
  11. osmsg-1.2.1/osmsg/maintain/__init__.py +7 -0
  12. osmsg-1.2.1/osmsg/maintain/cli.py +83 -0
  13. osmsg-1.2.1/osmsg/maintain/convert.py +301 -0
  14. osmsg-1.2.1/osmsg/maintain/manifest.py +62 -0
  15. osmsg-1.2.1/osmsg/maintain/month.py +117 -0
  16. osmsg-1.2.1/osmsg/maintain/parquet.py +39 -0
  17. osmsg-1.2.1/osmsg/maintain/pbf_split.py +78 -0
  18. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/pipeline.py +378 -100
  19. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/replication.py +1 -1
  20. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/tm.py +1 -1
  21. {osmsg-1.1.2 → osmsg-1.2.1}/pyproject.toml +1 -1
  22. osmsg-1.1.2/osmsg/__version__.py +0 -1
  23. osmsg-1.1.2/osmsg/export/psql.py +0 -69
  24. {osmsg-1.1.2 → osmsg-1.2.1}/LICENSE +0 -0
  25. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/__init__.py +0 -0
  26. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/_http.py +0 -0
  27. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/_tick.py +0 -0
  28. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/auth.py +0 -0
  29. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/boundary.py +0 -0
  30. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/duckdb_schema.py +0 -0
  31. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/queries.py +0 -0
  32. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/schema.py +0 -0
  33. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/__init__.py +0 -0
  34. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/csv.py +0 -0
  35. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/json.py +0 -0
  36. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/markdown.py +0 -0
  37. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/parquet.py +0 -0
  38. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/fetch.py +0 -0
  39. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/geofabrik.py +0 -0
  40. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/models.py +0 -0
  41. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/pg_schema.py +0 -0
  42. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/py.typed +0 -0
  43. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/ui.py +0 -0
  44. {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/workers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osmsg
3
- Version: 1.1.2
3
+ Version: 1.2.1
4
4
  Summary: OpenStreetMap Stats Generator: Commandline
5
5
  Keywords: osm,stats,commandline,openstreetmap
6
6
  Author: Kshitij Raj Sharma
@@ -46,13 +46,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
46
46
 
47
47
  A Project of [OSGeo Nepal](https://osgeonepal.org).
48
48
 
49
- ## Features
49
+ ## What does it do?
50
50
 
51
51
  - Per-user create/modify/delete counts over any time window.
52
52
  - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
53
53
  - Country and custom-boundary filters via Geofabrik.
54
54
  - Cron-friendly resume with `--update`.
55
+ - One-command setup: `osmsg --insert` loads all history into your store, `osmsg --update` keeps it current.
55
56
  - Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
57
+ - Cloud-native history: months covered by a published parquet dataset are read remotely.
56
58
 
57
59
  ## Install
58
60
 
@@ -68,6 +70,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
68
70
  `uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
69
71
  with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
70
72
 
73
+ More ways to install:
74
+
75
+ ```bash
76
+ conda install -c conda-forge osmsg # conda / mamba
77
+ brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
78
+ ```
79
+
80
+ On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
81
+ and run it directly, no Python required.
82
+
71
83
  ## Quick start
72
84
 
73
85
  ```bash
@@ -78,6 +90,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
78
90
 
79
91
  That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
80
92
 
93
+ ## Set up a full history store
94
+
95
+ Two commands give you a complete, self-updating store. The first loads all of OSM history from the
96
+ published dataset and records where to resume; the second catches up to now and runs on a schedule.
97
+
98
+ ```bash
99
+ osmsg --insert # load all history into stats.duckdb, then exit
100
+ osmsg --update # catch up to now (repeat on cron)
101
+ ```
102
+
103
+ `osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
104
+ current. For near-real-time, run `osmsg --update --url minute`.
105
+
106
+ Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
107
+
108
+ ```bash
109
+ osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
110
+ osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
111
+ ```
112
+
113
+ Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
114
+
115
+ ```bash
116
+ osmsg --insert --start 2020-01-01 --end 2023-01-01
117
+ ```
118
+
119
+ Already have the planet files? Insert from them directly:
120
+
121
+ ```bash
122
+ osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
123
+ ```
124
+
81
125
  ## Tutorials
82
126
 
83
127
  ### 1. Stats for a country
@@ -169,6 +213,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
169
213
  Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
170
214
  `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
171
215
 
216
+ Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
217
+ refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
218
+
172
219
  ## Configuration
173
220
 
174
221
  Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
@@ -184,7 +231,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
184
231
  | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
185
232
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
186
233
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
234
+ | `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
187
235
  | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
236
+ | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
237
+ | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
238
+ | `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Published dataset location. |
239
+ | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
240
+ | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
188
241
  | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
189
242
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
190
243
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -192,6 +245,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
192
245
 
193
246
  A `.env` file at the working directory is loaded automatically.
194
247
 
248
+ ## Maintainers
249
+
250
+ Generating and publishing the history dataset is the `osmsg maintain` group:
251
+
252
+ ```bash
253
+ osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
254
+ osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
255
+ osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
256
+ osmsg maintain publish work/out --repo osgeonepal/osmsg-history
257
+ ```
258
+
259
+ See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
260
+
195
261
  ## Documentation
196
262
 
197
263
  - [Installation](./docs/Installation.md)
@@ -14,13 +14,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
14
14
 
15
15
  A Project of [OSGeo Nepal](https://osgeonepal.org).
16
16
 
17
- ## Features
17
+ ## What does it do?
18
18
 
19
19
  - Per-user create/modify/delete counts over any time window.
20
20
  - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
21
21
  - Country and custom-boundary filters via Geofabrik.
22
22
  - Cron-friendly resume with `--update`.
23
+ - One-command setup: `osmsg --insert` loads all history into your store, `osmsg --update` keeps it current.
23
24
  - Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
25
+ - Cloud-native history: months covered by a published parquet dataset are read remotely.
24
26
 
25
27
  ## Install
26
28
 
@@ -36,6 +38,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
36
38
  `uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
37
39
  with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
38
40
 
41
+ More ways to install:
42
+
43
+ ```bash
44
+ conda install -c conda-forge osmsg # conda / mamba
45
+ brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
46
+ ```
47
+
48
+ On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
49
+ and run it directly, no Python required.
50
+
39
51
  ## Quick start
40
52
 
41
53
  ```bash
@@ -46,6 +58,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
46
58
 
47
59
  That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
48
60
 
61
+ ## Set up a full history store
62
+
63
+ Two commands give you a complete, self-updating store. The first loads all of OSM history from the
64
+ published dataset and records where to resume; the second catches up to now and runs on a schedule.
65
+
66
+ ```bash
67
+ osmsg --insert # load all history into stats.duckdb, then exit
68
+ osmsg --update # catch up to now (repeat on cron)
69
+ ```
70
+
71
+ `osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
72
+ current. For near-real-time, run `osmsg --update --url minute`.
73
+
74
+ Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
75
+
76
+ ```bash
77
+ osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
78
+ osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
79
+ ```
80
+
81
+ Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
82
+
83
+ ```bash
84
+ osmsg --insert --start 2020-01-01 --end 2023-01-01
85
+ ```
86
+
87
+ Already have the planet files? Insert from them directly:
88
+
89
+ ```bash
90
+ osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
91
+ ```
92
+
49
93
  ## Tutorials
50
94
 
51
95
  ### 1. Stats for a country
@@ -137,6 +181,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
137
181
  Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
138
182
  `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
139
183
 
184
+ Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
185
+ refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
186
+
140
187
  ## Configuration
141
188
 
142
189
  Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
@@ -152,7 +199,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
152
199
  | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
153
200
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
154
201
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
202
+ | `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
155
203
  | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
204
+ | `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
205
+ | `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
206
+ | `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Published dataset location. |
207
+ | `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
208
+ | `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
156
209
  | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
157
210
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
158
211
  | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
@@ -160,6 +213,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
160
213
 
161
214
  A `.env` file at the working directory is loaded automatically.
162
215
 
216
+ ## Maintainers
217
+
218
+ Generating and publishing the history dataset is the `osmsg maintain` group:
219
+
220
+ ```bash
221
+ osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
222
+ osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
223
+ osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
224
+ osmsg maintain publish work/out --repo osgeonepal/osmsg-history
225
+ ```
226
+
227
+ See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
228
+
163
229
  ## Documentation
164
230
 
165
231
  - [Installation](./docs/Installation.md)
@@ -0,0 +1 @@
1
+ __version__ = "1.2.1"
@@ -1,6 +1,6 @@
1
1
  """Typer-based CLI for osmsg.
2
2
 
3
- UTC throughout no display timezone. Outputs default to parquet (queryable from
3
+ UTC throughout, no display timezone. Outputs default to parquet (queryable from
4
4
  disk by DuckDB / polars / pandas). Other formats: csv, json, markdown, psql.
5
5
  """
6
6
 
@@ -24,6 +24,7 @@ from .exceptions import (
24
24
  OsmsgError,
25
25
  UnknownRegionError,
26
26
  )
27
+ from .maintain.cli import maintain_app
27
28
  from .pipeline import RunConfig, run
28
29
  from .ui import console, error, info, render_table, warn
29
30
 
@@ -36,6 +37,7 @@ app = typer.Typer(
36
37
  no_args_is_help=False,
37
38
  help="OpenStreetMap stats generator. Parquet-first, OAuth 2.0, UTC-only.",
38
39
  )
40
+ app.add_typer(maintain_app, name="maintain")
39
41
 
40
42
 
41
43
  class Period(StrEnum):
@@ -104,9 +106,10 @@ def _period_range(period: Period) -> tuple[dt.datetime, dt.datetime]:
104
106
  raise ValueError(period)
105
107
 
106
108
 
107
- @app.command()
109
+ @app.callback(invoke_without_command=True)
108
110
  @use_yaml_config(param_name="config", param_help="YAML config file (CLI flags override its values).")
109
111
  def main(
112
+ ctx: typer.Context,
110
113
  version: Annotated[
111
114
  bool | None,
112
115
  typer.Option("--version", callback=_version_callback, is_eager=True, help="Print version and exit."),
@@ -215,6 +218,15 @@ def main(
215
218
  str | None,
216
219
  typer.Option("--psql-dsn", envvar="OSMSG_PSQL_DSN", help="libpq DSN for --format psql."),
217
220
  ] = None,
221
+ psql_bulk: Annotated[
222
+ bool,
223
+ typer.Option(
224
+ "--psql-bulk",
225
+ envvar="OSMSG_PSQL_BULK",
226
+ help="Faster one-time psql load: drop secondary indexes and foreign keys during the push "
227
+ "and rebuild them after. Use for a full history import, not for incremental --update.",
228
+ ),
229
+ ] = False,
218
230
  changeset_pad_hours: Annotated[
219
231
  int,
220
232
  typer.Option(
@@ -226,16 +238,76 @@ def main(
226
238
  max=48,
227
239
  ),
228
240
  ] = 1,
241
+ history: Annotated[
242
+ bool,
243
+ typer.Option(
244
+ "--history/--no-history",
245
+ envvar="OSMSG_HISTORY",
246
+ help="Serve covered months from the published parquet (HuggingFace) and only download the "
247
+ "recent tail. Falls back to the live diff path if unavailable. Ignored by --update.",
248
+ ),
249
+ ] = True,
250
+ history_url: Annotated[
251
+ str,
252
+ typer.Option(
253
+ "--history-url",
254
+ envvar="OSMSG_HISTORY_URL",
255
+ help="Base URL of the published history dataset.",
256
+ ),
257
+ ] = "hf://datasets/kshitijrajsharma/osmsg-history",
258
+ insert: Annotated[
259
+ bool,
260
+ typer.Option(
261
+ "--insert",
262
+ help="Load history into the store and seed resume state, then exit. No window loads the "
263
+ "whole published history; --start/--end loads a slice. Follow with --update to catch up.",
264
+ ),
265
+ ] = False,
266
+ osh_file: Annotated[
267
+ str | None,
268
+ typer.Option("--osh-file", help="Insert from a local .osh.pbf instead of the published dataset."),
269
+ ] = None,
270
+ changeset_file: Annotated[
271
+ str | None,
272
+ typer.Option("--changeset-file", help="Changeset dump (.osm.bz2) paired with --osh-file."),
273
+ ] = None,
274
+ overwrite: Annotated[
275
+ bool,
276
+ typer.Option(
277
+ "--overwrite",
278
+ help="Recompute even if <name>.duckdb already holds this exact query; otherwise a rerun "
279
+ "that only changes the output format re-exports from the existing store.",
280
+ ),
281
+ ] = False,
229
282
  ) -> None:
230
- """Run osmsg."""
283
+ """Run osmsg. With no subcommand this generates stats (or loads history with --insert)."""
284
+ if ctx.invoked_subcommand is not None:
285
+ return
231
286
  if formats is None:
232
287
  formats = [Format.parquet]
288
+ if psql_dsn and Format.psql not in formats:
289
+ formats.append(Format.psql)
233
290
  if sum(1 for x in (start, last, days) if x) > 1:
234
- error("--start, --last, and --days are mutually exclusive pick one.")
291
+ error("--start, --last, and --days are mutually exclusive, pick one.")
235
292
  raise typer.Exit(code=2)
236
293
  if update and any(x is not None for x in (start, end, last, days)):
237
294
  error("--update resumes from prior state and runs to head; it ignores --start/--end/--last/--days.")
238
295
  raise typer.Exit(code=2)
296
+ if insert and update:
297
+ error("--insert and --update are mutually exclusive; insert first, then update.")
298
+ raise typer.Exit(code=2)
299
+ if insert and (last is not None or days is not None):
300
+ error("--insert takes --start/--end (or no window), not --last/--days.")
301
+ raise typer.Exit(code=2)
302
+ if (osh_file is None) != (changeset_file is None):
303
+ error("--osh-file and --changeset-file must be given together.")
304
+ raise typer.Exit(code=2)
305
+ if osh_file and not insert:
306
+ error("--osh-file/--changeset-file are only valid with --insert.")
307
+ raise typer.Exit(code=2)
308
+ if psql_bulk and update:
309
+ error("--psql-bulk is for a one-time full load (drops indexes/keys); do not use it with --update.")
310
+ raise typer.Exit(code=2)
239
311
  if Format.psql in formats and not psql_dsn:
240
312
  error("-f psql requires --psql-dsn (libpq connection string, e.g. 'host=localhost dbname=osm user=osm').")
241
313
  raise typer.Exit(code=2)
@@ -267,7 +339,14 @@ def main(
267
339
  osm_username=username,
268
340
  osm_password=_read_password_stdin() if password_stdin else None,
269
341
  psql_dsn=psql_dsn,
342
+ psql_bulk=psql_bulk,
270
343
  changeset_pad_hours=changeset_pad_hours,
344
+ history_mode="auto" if history else "off",
345
+ history_url=history_url,
346
+ insert=insert,
347
+ osh_file=osh_file,
348
+ changeset_file=changeset_file,
349
+ overwrite=overwrite,
271
350
  )
272
351
 
273
352
  if last is not None:
@@ -300,6 +379,13 @@ def main(
300
379
  error(str(exc))
301
380
  raise typer.Exit(code=2) from exc
302
381
 
382
+ if insert:
383
+ info(f"insert complete: {result['rows']:,} history changeset rows loaded.")
384
+ for label, path in (result.get("files") or {}).items():
385
+ console.print(f"[green]✓[/green] {label}: [bold]{path}[/bold]")
386
+ console.print("Next: [bold]osmsg --update[/bold] to catch up to now.")
387
+ return
388
+
303
389
  rows_data = result.get("rows_data") or []
304
390
  display_n = min(rows or 20, len(rows_data))
305
391
  render_table(
@@ -1,7 +1,7 @@
1
1
  """DuckDB persistence: schema, ingest, queries.
2
2
 
3
3
  The schema is portable: identical column shape works in DuckDB, Parquet, and
4
- PostgreSQL exporters re-issue the CREATE TABLE there.
4
+ PostgreSQL; exporters re-issue the CREATE TABLE there.
5
5
 
6
6
  Public surface:
7
7
 
@@ -106,7 +106,7 @@ def merge_parquet_files(conn: duckdb.DuckDBPyConnection, parquet_dir: Path, *, c
106
106
  _quarantine_corrupt(parquet_dir)
107
107
 
108
108
  def pattern(name: str) -> str:
109
- # read_parquet() takes a literal escape so quoted paths can't break out.
109
+ # read_parquet() takes a literal; escape it so quoted paths can't break out.
110
110
  return _sql_escape((parquet_dir / f"temp_*_{name}_*.parquet").as_posix())
111
111
 
112
112
  conn.execute("BEGIN")
@@ -25,7 +25,7 @@ class GeofabrikAuthError(OsmsgError):
25
25
 
26
26
 
27
27
  class NoDataFoundError(Exception):
28
- """Empty range info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
28
+ """Empty range, info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
29
29
 
30
30
 
31
31
  __all__ = [
@@ -0,0 +1,143 @@
1
+ """PostgreSQL exporter via DuckDB's postgres extension."""
2
+
3
+ import duckdb
4
+
5
+ from ..exceptions import OsmsgError
6
+ from ..pg_schema import PG_SCHEMA
7
+
8
+ _BULK_INDEXES = [
9
+ ("idx_changesets_created_at", "CREATE INDEX idx_changesets_created_at ON changesets (created_at)"),
10
+ ("idx_changesets_geom", "CREATE INDEX idx_changesets_geom ON changesets USING GIST (geom)"),
11
+ ("idx_changeset_stats_uid", "CREATE INDEX idx_changeset_stats_uid ON changeset_stats (uid)"),
12
+ ]
13
+ _BULK_FKS = [
14
+ ("changesets", "changesets_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
15
+ (
16
+ "changeset_stats",
17
+ "changeset_stats_changeset_id_fkey",
18
+ "FOREIGN KEY (changeset_id) REFERENCES changesets (changeset_id)",
19
+ ),
20
+ ("changeset_stats", "changeset_stats_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
21
+ ]
22
+
23
+
24
+ _BULK_COMMIT_CHUNKS = 32
25
+
26
+
27
+ def _pg(conn: duckdb.DuckDBPyConnection, sql: str) -> None:
28
+ conn.execute(f"CALL postgres_execute('pg_target', $${sql}$$)")
29
+
30
+
31
+ def _pg_has_history(conn: duckdb.DuckDBPyConnection) -> bool:
32
+ """True if the PG target already holds the history layer (seq_id=0); checked cheaply with LIMIT 1."""
33
+ probe = "SELECT count(*) FROM (SELECT 1 FROM pg_target.changeset_stats WHERE seq_id = 0 LIMIT 1) t"
34
+ row = conn.execute(probe).fetchone()
35
+ return bool(row and row[0])
36
+
37
+
38
+ def _push_changesets(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
39
+ # Newer non-NULL wins, NULL never downgrades (mirrors the DuckDB-side merge).
40
+ conn.execute(
41
+ f"""
42
+ INSERT INTO pg_target.changesets AS c (changeset_id, uid, created_at, hashtags, editor, geom)
43
+ SELECT changeset_id, uid, created_at, hashtags, editor, geom FROM changesets {where}
44
+ ON CONFLICT (changeset_id) DO UPDATE SET
45
+ created_at = COALESCE(EXCLUDED.created_at, c.created_at),
46
+ hashtags = COALESCE(EXCLUDED.hashtags, c.hashtags),
47
+ editor = COALESCE(EXCLUDED.editor, c.editor),
48
+ geom = COALESCE(EXCLUDED.geom, c.geom)
49
+ """
50
+ )
51
+
52
+
53
+ def _push_changeset_stats(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
54
+ conn.execute(f"INSERT INTO pg_target.changeset_stats SELECT * FROM changeset_stats {where} ON CONFLICT DO NOTHING")
55
+
56
+
57
+ def _push_chunked(conn: duckdb.DuckDBPyConnection, source: str, push) -> None:
58
+ """Call push() once per changeset_id range so each range commits on its own."""
59
+ bounds = conn.execute(f"SELECT min(changeset_id), max(changeset_id) FROM {source}").fetchone()
60
+ if not bounds or bounds[0] is None:
61
+ return
62
+ lo, hi = bounds
63
+ step = (hi - lo) // _BULK_COMMIT_CHUNKS + 1
64
+ cursor = lo
65
+ while cursor <= hi:
66
+ push(conn, f"WHERE changeset_id >= {cursor} AND changeset_id < {cursor + step}")
67
+ cursor += step
68
+
69
+
70
+ def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = False) -> None:
71
+ """Push every osmsg table to the libpq DSN target. bulk_load is for the one-time full-history
72
+ import (drops indexes and foreign keys, streams, rebuilds, commits per range); leave it off for
73
+ incremental --update pushes. The DSN is interpolated into ATTACH, so it must be trusted."""
74
+ conn.execute("INSTALL postgres")
75
+ conn.execute("LOAD postgres")
76
+ conn.execute("INSTALL spatial")
77
+ conn.execute("LOAD spatial")
78
+ safe_dsn = dsn.replace("'", "''")
79
+ conn.execute(f"ATTACH '{safe_dsn}' AS pg_target (TYPE postgres)")
80
+ try:
81
+ for stmt in PG_SCHEMA.strip().split(";"):
82
+ stmt = stmt.strip()
83
+ if stmt:
84
+ _pg(conn, stmt)
85
+
86
+ # Refuse cross-source push: would double-count via the (seq_id, changeset_id) PK.
87
+ local_sources = {r[0] for r in conn.execute("SELECT source_url FROM state").fetchall()}
88
+ existing_sources = {r[0] for r in conn.execute("SELECT source_url FROM pg_target.state").fetchall()}
89
+ cross_source = existing_sources - local_sources
90
+ if cross_source and local_sources:
91
+ raise OsmsgError(
92
+ f"PG target already has data from source(s) {sorted(cross_source)} "
93
+ f"but this run pushes from {sorted(local_sources)}. Mixing sources "
94
+ f"double-counts via the (seq_id, changeset_id) key. Use a separate "
95
+ f"--psql-dsn, or wipe the existing PG tables first."
96
+ )
97
+
98
+ if bulk_load:
99
+ conn.execute("SET preserve_insertion_order = false")
100
+ for table, name, _add in _BULK_FKS:
101
+ _pg(conn, f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {name}")
102
+ for name, _create in _BULK_INDEXES:
103
+ _pg(conn, f"DROP INDEX IF EXISTS {name}")
104
+ conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
105
+ _push_chunked(conn, "changesets", _push_changesets)
106
+ _push_chunked(conn, "changeset_stats", _push_changeset_stats)
107
+ elif _pg_has_history(conn):
108
+ live_ids = "changeset_id IN (SELECT changeset_id FROM changeset_stats WHERE seq_id <> 0)"
109
+ conn.execute(
110
+ "INSERT INTO pg_target.users SELECT * FROM users "
111
+ "WHERE uid IN (SELECT uid FROM changeset_stats WHERE seq_id <> 0) ON CONFLICT DO NOTHING"
112
+ )
113
+ _push_changesets(conn, f"WHERE {live_ids}")
114
+ _push_changeset_stats(conn, "WHERE seq_id <> 0")
115
+ else:
116
+ conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
117
+ _push_changesets(conn)
118
+ _push_changeset_stats(conn)
119
+
120
+ conn.execute(
121
+ """
122
+ INSERT INTO pg_target.state (source_url, last_seq, last_ts, updated_at)
123
+ SELECT source_url, last_seq, last_ts, updated_at FROM state
124
+ ON CONFLICT (source_url) DO UPDATE SET
125
+ last_seq = EXCLUDED.last_seq,
126
+ last_ts = EXCLUDED.last_ts,
127
+ updated_at = EXCLUDED.updated_at
128
+ """
129
+ )
130
+
131
+ if bulk_load:
132
+ for table, name, add in _BULK_FKS:
133
+ _pg(conn, f"ALTER TABLE {table} ADD CONSTRAINT {name} {add}")
134
+ for _name, create in _BULK_INDEXES:
135
+ _pg(conn, f"SET maintenance_work_mem = '512MB'; {create}")
136
+ _pg(conn, "ANALYZE users")
137
+ _pg(conn, "ANALYZE changesets")
138
+ _pg(conn, "ANALYZE changeset_stats")
139
+ finally:
140
+ conn.execute("DETACH pg_target")
141
+
142
+
143
+ __all__ = ["PG_SCHEMA", "to_psql"]
@@ -56,7 +56,7 @@ class ChangesetHandler(osmium.SimpleHandler):
56
56
 
57
57
  keep = bool(cfg["changeset_meta"] and not cfg["hashtags"])
58
58
  # Some editors only fill the `hashtags` tag (comment stays generic); checking
59
- # comment alone silently drops those. Tokenize via regex on both real data
59
+ # comment alone silently drops those. Tokenize via regex on both; real data
60
60
  # mixes `;`, space, and comma as separators inside `hashtags`.
61
61
  comment = c.tags.get("comment", "")
62
62
  hashtags_field = c.tags.get("hashtags", "")