osmsg 1.1.2__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {osmsg-1.1.2 → osmsg-1.2.0}/PKG-INFO +64 -2
- {osmsg-1.1.2 → osmsg-1.2.0}/README.md +63 -1
- osmsg-1.2.0/osmsg/__version__.py +1 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/cli.py +81 -4
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/__init__.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/ingest.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/exceptions.py +1 -1
- osmsg-1.2.0/osmsg/export/psql.py +156 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/handlers.py +1 -1
- osmsg-1.2.0/osmsg/history.py +272 -0
- osmsg-1.2.0/osmsg/maintain/__init__.py +7 -0
- osmsg-1.2.0/osmsg/maintain/cli.py +83 -0
- osmsg-1.2.0/osmsg/maintain/convert.py +314 -0
- osmsg-1.2.0/osmsg/maintain/manifest.py +62 -0
- osmsg-1.2.0/osmsg/maintain/month.py +120 -0
- osmsg-1.2.0/osmsg/maintain/parquet.py +43 -0
- osmsg-1.2.0/osmsg/maintain/pbf_split.py +79 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/pipeline.py +233 -16
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/replication.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/tm.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.0}/pyproject.toml +1 -1
- osmsg-1.1.2/osmsg/__version__.py +0 -1
- osmsg-1.1.2/osmsg/export/psql.py +0 -69
- {osmsg-1.1.2 → osmsg-1.2.0}/LICENSE +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/__init__.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/_http.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/_tick.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/auth.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/boundary.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/duckdb_schema.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/queries.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/db/schema.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/__init__.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/csv.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/json.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/markdown.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/export/parquet.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/fetch.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/geofabrik.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/models.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/pg_schema.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/py.typed +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/ui.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.0}/osmsg/workers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: osmsg
|
|
3
|
-
Version: 1.1.2
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: OpenStreetMap Stats Generator: Commandline
|
|
5
5
|
Keywords: osm,stats,commandline,openstreetmap
|
|
6
6
|
Author: Kshitij Raj Sharma
|
|
@@ -46,13 +46,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
|
|
|
46
46
|
|
|
47
47
|
A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
48
48
|
|
|
49
|
-
##
|
|
49
|
+
## What does it do?
|
|
50
50
|
|
|
51
51
|
- Per-user create/modify/delete counts over any time window.
|
|
52
52
|
- Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
|
|
53
53
|
- Country and custom-boundary filters via Geofabrik.
|
|
54
54
|
- Cron-friendly resume with `--update`.
|
|
55
|
+
- Two-command setup: `osmsg --insert` loads all history into your store, then `osmsg --update` keeps it current.
|
|
55
56
|
- Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
|
|
57
|
+
- Cloud-native history: months covered by a published parquet dataset are read remotely.
|
|
56
58
|
|
|
57
59
|
## Install
|
|
58
60
|
|
|
@@ -68,6 +70,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
|
|
|
68
70
|
`uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
|
|
69
71
|
with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
|
|
70
72
|
|
|
73
|
+
More ways to install:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
conda install -c conda-forge osmsg # conda / mamba
|
|
77
|
+
brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
|
|
81
|
+
and run it directly, no Python required.
|
|
82
|
+
|
|
71
83
|
## Quick start
|
|
72
84
|
|
|
73
85
|
```bash
|
|
@@ -78,6 +90,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
|
|
|
78
90
|
|
|
79
91
|
That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
|
|
80
92
|
|
|
93
|
+
## Set up a full history store
|
|
94
|
+
|
|
95
|
+
Two commands give you a complete, self-updating store. The first loads all of OSM history from the
|
|
96
|
+
published dataset and records where to resume; the second catches up to now and runs on a schedule.
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
osmsg --insert # load all history into stats.duckdb, then exit
|
|
100
|
+
osmsg --update # catch up to now (repeat on cron)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
`osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
|
|
104
|
+
current. For near-real-time, run `osmsg --update --url minute`.
|
|
105
|
+
|
|
106
|
+
Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
110
|
+
osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
osmsg --insert --start 2020-01-01 --end 2023-01-01
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Already have the planet files? Insert from them directly:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
|
|
123
|
+
```
|
|
124
|
+
|
|
81
125
|
## Tutorials
|
|
82
126
|
|
|
83
127
|
### 1. Stats for a country
|
|
@@ -185,6 +229,11 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
185
229
|
| `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
|
|
186
230
|
| `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
|
|
187
231
|
| `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
|
|
232
|
+
| `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
|
|
233
|
+
| `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
|
|
234
|
+
| `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Published dataset location. |
|
|
235
|
+
| `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
|
|
236
|
+
| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
|
|
188
237
|
| `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
|
|
189
238
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
|
|
190
239
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
|
|
@@ -192,6 +241,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
192
241
|
|
|
193
242
|
A `.env` file at the working directory is loaded automatically.
|
|
194
243
|
|
|
244
|
+
## Maintainers
|
|
245
|
+
|
|
246
|
+
The `osmsg maintain` command group generates and publishes the history dataset:
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
|
|
250
|
+
osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
|
|
251
|
+
osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
|
|
252
|
+
osmsg maintain publish work/out --repo osgeonepal/osmsg-history
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
|
|
256
|
+
|
|
195
257
|
## Documentation
|
|
196
258
|
|
|
197
259
|
- [Installation](./docs/Installation.md)
|
|
@@ -14,13 +14,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
|
|
|
14
14
|
|
|
15
15
|
A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
16
16
|
|
|
17
|
-
##
|
|
17
|
+
## What does it do?
|
|
18
18
|
|
|
19
19
|
- Per-user create/modify/delete counts over any time window.
|
|
20
20
|
- Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
|
|
21
21
|
- Country and custom-boundary filters via Geofabrik.
|
|
22
22
|
- Cron-friendly resume with `--update`.
|
|
23
|
+
- Two-command setup: `osmsg --insert` loads all history into your store, then `osmsg --update` keeps it current.
|
|
23
24
|
- Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
|
|
25
|
+
- Cloud-native history: months covered by a published parquet dataset are read remotely.
|
|
24
26
|
|
|
25
27
|
## Install
|
|
26
28
|
|
|
@@ -36,6 +38,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
|
|
|
36
38
|
`uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
|
|
37
39
|
with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
|
|
38
40
|
|
|
41
|
+
More ways to install:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
conda install -c conda-forge osmsg # conda / mamba
|
|
45
|
+
brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
|
|
49
|
+
and run it directly, no Python required.
|
|
50
|
+
|
|
39
51
|
## Quick start
|
|
40
52
|
|
|
41
53
|
```bash
|
|
@@ -46,6 +58,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
|
|
|
46
58
|
|
|
47
59
|
That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
|
|
48
60
|
|
|
61
|
+
## Set up a full history store
|
|
62
|
+
|
|
63
|
+
Two commands give you a complete, self-updating store. The first loads all of OSM history from the
|
|
64
|
+
published dataset and records where to resume; the second catches up to now and runs on a schedule.
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
osmsg --insert # load all history into stats.duckdb, then exit
|
|
68
|
+
osmsg --update # catch up to now (repeat on cron)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
`osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
|
|
72
|
+
current. For near-real-time, run `osmsg --update --url minute`.
|
|
73
|
+
|
|
74
|
+
Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
78
|
+
osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
osmsg --insert --start 2020-01-01 --end 2023-01-01
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Already have the planet files? Insert from them directly:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
|
|
91
|
+
```
|
|
92
|
+
|
|
49
93
|
## Tutorials
|
|
50
94
|
|
|
51
95
|
### 1. Stats for a country
|
|
@@ -153,6 +197,11 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
153
197
|
| `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
|
|
154
198
|
| `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
|
|
155
199
|
| `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
|
|
200
|
+
| `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
|
|
201
|
+
| `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset. |
|
|
202
|
+
| `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Published dataset location. |
|
|
203
|
+
| `--insert` | (none) | off | Load history into the store and seed resume, then exit. No window loads all of it. |
|
|
204
|
+
| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
|
|
156
205
|
| `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
|
|
157
206
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
|
|
158
207
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
|
|
@@ -160,6 +209,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
160
209
|
|
|
161
210
|
A `.env` file at the working directory is loaded automatically.
|
|
162
211
|
|
|
212
|
+
## Maintainers
|
|
213
|
+
|
|
214
|
+
The `osmsg maintain` command group generates and publishes the history dataset:
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
|
|
218
|
+
osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
|
|
219
|
+
osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
|
|
220
|
+
osmsg maintain publish work/out --repo osgeonepal/osmsg-history
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
|
|
224
|
+
|
|
163
225
|
## Documentation
|
|
164
226
|
|
|
165
227
|
- [Installation](./docs/Installation.md)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Typer-based CLI for osmsg.
|
|
2
2
|
|
|
3
|
-
UTC throughout
|
|
3
|
+
UTC throughout, no display timezone. Outputs default to parquet (queryable from
|
|
4
4
|
disk by DuckDB / polars / pandas). Other formats: csv, json, markdown, psql.
|
|
5
5
|
"""
|
|
6
6
|
|
|
@@ -24,6 +24,7 @@ from .exceptions import (
|
|
|
24
24
|
OsmsgError,
|
|
25
25
|
UnknownRegionError,
|
|
26
26
|
)
|
|
27
|
+
from .maintain.cli import maintain_app
|
|
27
28
|
from .pipeline import RunConfig, run
|
|
28
29
|
from .ui import console, error, info, render_table, warn
|
|
29
30
|
|
|
@@ -36,6 +37,7 @@ app = typer.Typer(
|
|
|
36
37
|
no_args_is_help=False,
|
|
37
38
|
help="OpenStreetMap stats generator. Parquet-first, OAuth 2.0, UTC-only.",
|
|
38
39
|
)
|
|
40
|
+
app.add_typer(maintain_app, name="maintain")
|
|
39
41
|
|
|
40
42
|
|
|
41
43
|
class Period(StrEnum):
|
|
@@ -104,9 +106,10 @@ def _period_range(period: Period) -> tuple[dt.datetime, dt.datetime]:
|
|
|
104
106
|
raise ValueError(period)
|
|
105
107
|
|
|
106
108
|
|
|
107
|
-
@app.
|
|
109
|
+
@app.callback(invoke_without_command=True)
|
|
108
110
|
@use_yaml_config(param_name="config", param_help="YAML config file (CLI flags override its values).")
|
|
109
111
|
def main(
|
|
112
|
+
ctx: typer.Context,
|
|
110
113
|
version: Annotated[
|
|
111
114
|
bool | None,
|
|
112
115
|
typer.Option("--version", callback=_version_callback, is_eager=True, help="Print version and exit."),
|
|
@@ -215,6 +218,15 @@ def main(
|
|
|
215
218
|
str | None,
|
|
216
219
|
typer.Option("--psql-dsn", envvar="OSMSG_PSQL_DSN", help="libpq DSN for --format psql."),
|
|
217
220
|
] = None,
|
|
221
|
+
psql_bulk: Annotated[
|
|
222
|
+
bool,
|
|
223
|
+
typer.Option(
|
|
224
|
+
"--psql-bulk",
|
|
225
|
+
envvar="OSMSG_PSQL_BULK",
|
|
226
|
+
help="Faster one-time psql load: drop secondary indexes and foreign keys during the push "
|
|
227
|
+
"and rebuild them after. Use for a full history import, not for incremental --update.",
|
|
228
|
+
),
|
|
229
|
+
] = False,
|
|
218
230
|
changeset_pad_hours: Annotated[
|
|
219
231
|
int,
|
|
220
232
|
typer.Option(
|
|
@@ -226,16 +238,68 @@ def main(
|
|
|
226
238
|
max=48,
|
|
227
239
|
),
|
|
228
240
|
] = 1,
|
|
241
|
+
history: Annotated[
|
|
242
|
+
bool,
|
|
243
|
+
typer.Option(
|
|
244
|
+
"--history/--no-history",
|
|
245
|
+
envvar="OSMSG_HISTORY",
|
|
246
|
+
help="Serve covered months from the published parquet (HuggingFace) and only download the "
|
|
247
|
+
"recent tail. Falls back to the live diff path if unavailable. Ignored by --update.",
|
|
248
|
+
),
|
|
249
|
+
] = True,
|
|
250
|
+
history_url: Annotated[
|
|
251
|
+
str,
|
|
252
|
+
typer.Option(
|
|
253
|
+
"--history-url",
|
|
254
|
+
envvar="OSMSG_HISTORY_URL",
|
|
255
|
+
help="Base URL of the published history dataset.",
|
|
256
|
+
),
|
|
257
|
+
] = "hf://datasets/kshitijrajsharma/osmsg-history",
|
|
258
|
+
insert: Annotated[
|
|
259
|
+
bool,
|
|
260
|
+
typer.Option(
|
|
261
|
+
"--insert",
|
|
262
|
+
help="Load history into the store and seed resume state, then exit. No window loads the "
|
|
263
|
+
"whole published history; --start/--end loads a slice. Follow with --update to catch up.",
|
|
264
|
+
),
|
|
265
|
+
] = False,
|
|
266
|
+
osh_file: Annotated[
|
|
267
|
+
str | None,
|
|
268
|
+
typer.Option("--osh-file", help="Insert from a local .osh.pbf instead of the published dataset."),
|
|
269
|
+
] = None,
|
|
270
|
+
changeset_file: Annotated[
|
|
271
|
+
str | None,
|
|
272
|
+
typer.Option("--changeset-file", help="Changeset dump (.osm.bz2) paired with --osh-file."),
|
|
273
|
+
] = None,
|
|
229
274
|
) -> None:
|
|
230
|
-
"""Run osmsg."""
|
|
275
|
+
"""Run osmsg. With no subcommand this generates stats (or loads history with --insert)."""
|
|
276
|
+
if ctx.invoked_subcommand is not None:
|
|
277
|
+
return
|
|
231
278
|
if formats is None:
|
|
232
279
|
formats = [Format.parquet]
|
|
280
|
+
if psql_dsn and Format.psql not in formats:
|
|
281
|
+
formats.append(Format.psql)
|
|
233
282
|
if sum(1 for x in (start, last, days) if x) > 1:
|
|
234
|
-
error("--start, --last, and --days are mutually exclusive
|
|
283
|
+
error("--start, --last, and --days are mutually exclusive, pick one.")
|
|
235
284
|
raise typer.Exit(code=2)
|
|
236
285
|
if update and any(x is not None for x in (start, end, last, days)):
|
|
237
286
|
error("--update resumes from prior state and runs to head; it ignores --start/--end/--last/--days.")
|
|
238
287
|
raise typer.Exit(code=2)
|
|
288
|
+
if insert and update:
|
|
289
|
+
error("--insert and --update are mutually exclusive; insert first, then update.")
|
|
290
|
+
raise typer.Exit(code=2)
|
|
291
|
+
if insert and (last is not None or days is not None):
|
|
292
|
+
error("--insert takes --start/--end (or no window), not --last/--days.")
|
|
293
|
+
raise typer.Exit(code=2)
|
|
294
|
+
if (osh_file is None) != (changeset_file is None):
|
|
295
|
+
error("--osh-file and --changeset-file must be given together.")
|
|
296
|
+
raise typer.Exit(code=2)
|
|
297
|
+
if osh_file and not insert:
|
|
298
|
+
error("--osh-file/--changeset-file are only valid with --insert.")
|
|
299
|
+
raise typer.Exit(code=2)
|
|
300
|
+
if psql_bulk and update:
|
|
301
|
+
error("--psql-bulk is for a one-time full load (drops indexes/keys); do not use it with --update.")
|
|
302
|
+
raise typer.Exit(code=2)
|
|
239
303
|
if Format.psql in formats and not psql_dsn:
|
|
240
304
|
error("-f psql requires --psql-dsn (libpq connection string, e.g. 'host=localhost dbname=osm user=osm').")
|
|
241
305
|
raise typer.Exit(code=2)
|
|
@@ -267,7 +331,13 @@ def main(
|
|
|
267
331
|
osm_username=username,
|
|
268
332
|
osm_password=_read_password_stdin() if password_stdin else None,
|
|
269
333
|
psql_dsn=psql_dsn,
|
|
334
|
+
psql_bulk=psql_bulk,
|
|
270
335
|
changeset_pad_hours=changeset_pad_hours,
|
|
336
|
+
history_mode="auto" if history else "off",
|
|
337
|
+
history_url=history_url,
|
|
338
|
+
insert=insert,
|
|
339
|
+
osh_file=osh_file,
|
|
340
|
+
changeset_file=changeset_file,
|
|
271
341
|
)
|
|
272
342
|
|
|
273
343
|
if last is not None:
|
|
@@ -300,6 +370,13 @@ def main(
|
|
|
300
370
|
error(str(exc))
|
|
301
371
|
raise typer.Exit(code=2) from exc
|
|
302
372
|
|
|
373
|
+
if insert:
|
|
374
|
+
info(f"insert complete: {result['rows']:,} history changeset rows loaded.")
|
|
375
|
+
for label, path in (result.get("files") or {}).items():
|
|
376
|
+
console.print(f"[green]✓[/green] {label}: [bold]{path}[/bold]")
|
|
377
|
+
console.print("Next: [bold]osmsg --update[/bold] to catch up to now.")
|
|
378
|
+
return
|
|
379
|
+
|
|
303
380
|
rows_data = result.get("rows_data") or []
|
|
304
381
|
display_n = min(rows or 20, len(rows_data))
|
|
305
382
|
render_table(
|
|
@@ -106,7 +106,7 @@ def merge_parquet_files(conn: duckdb.DuckDBPyConnection, parquet_dir: Path, *, c
|
|
|
106
106
|
_quarantine_corrupt(parquet_dir)
|
|
107
107
|
|
|
108
108
|
def pattern(name: str) -> str:
|
|
109
|
-
# read_parquet() takes a literal
|
|
109
|
+
# read_parquet() takes a literal, escape so quoted paths can't break out.
|
|
110
110
|
return _sql_escape((parquet_dir / f"temp_*_{name}_*.parquet").as_posix())
|
|
111
111
|
|
|
112
112
|
conn.execute("BEGIN")
|
|
@@ -25,7 +25,7 @@ class GeofabrikAuthError(OsmsgError):
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class NoDataFoundError(Exception):
|
|
28
|
-
"""Empty range
|
|
28
|
+
"""Empty range, info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
__all__ = [
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""PostgreSQL exporter via DuckDB's postgres extension."""
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
|
|
5
|
+
from ..exceptions import OsmsgError
|
|
6
|
+
from ..pg_schema import PG_SCHEMA
|
|
7
|
+
|
|
8
|
+
# Secondary indexes and foreign keys that make a row-by-row insert slow. For a one-time bulk load
|
|
9
|
+
# they are dropped before the COPY and rebuilt once after (one index build + one FK validation,
|
|
10
|
+
# instead of maintaining them per row). Primary keys stay, because the ON CONFLICT upserts need them.
|
|
11
|
+
# Indexes are (name, create-sql); foreign keys are (table, name, add-clause).
|
|
12
|
+
_BULK_INDEXES = [
|
|
13
|
+
("idx_changesets_created_at", "CREATE INDEX idx_changesets_created_at ON changesets (created_at)"),
|
|
14
|
+
("idx_changesets_geom", "CREATE INDEX idx_changesets_geom ON changesets USING GIST (geom)"),
|
|
15
|
+
("idx_changeset_stats_uid", "CREATE INDEX idx_changeset_stats_uid ON changeset_stats (uid)"),
|
|
16
|
+
]
|
|
17
|
+
_BULK_FKS = [
|
|
18
|
+
("changesets", "changesets_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
|
|
19
|
+
(
|
|
20
|
+
"changeset_stats",
|
|
21
|
+
"changeset_stats_changeset_id_fkey",
|
|
22
|
+
"FOREIGN KEY (changeset_id) REFERENCES changesets (changeset_id)",
|
|
23
|
+
),
|
|
24
|
+
("changeset_stats", "changeset_stats_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Bulk loads push the big tables in this many changeset_id ranges, each its own statement and so its
|
|
29
|
+
# own commit, so a failure costs one range instead of rolling back the whole multi-GB load.
|
|
30
|
+
_BULK_COMMIT_CHUNKS = 32
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _pg(conn: duckdb.DuckDBPyConnection, sql: str) -> None:
|
|
34
|
+
conn.execute(f"CALL postgres_execute('pg_target', $${sql}$$)")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _pg_has_history(conn: duckdb.DuckDBPyConnection) -> bool:
|
|
38
|
+
"""True if the PG target already holds the history layer (seq_id=0); checked cheaply with LIMIT 1."""
|
|
39
|
+
probe = "SELECT count(*) FROM (SELECT 1 FROM pg_target.changeset_stats WHERE seq_id = 0 LIMIT 1) t"
|
|
40
|
+
row = conn.execute(probe).fetchone()
|
|
41
|
+
return bool(row and row[0])
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _push_changesets(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
|
|
45
|
+
# Newer non-NULL wins, NULL never downgrades (mirrors the DuckDB-side merge).
|
|
46
|
+
conn.execute(
|
|
47
|
+
f"""
|
|
48
|
+
INSERT INTO pg_target.changesets AS c (changeset_id, uid, created_at, hashtags, editor, geom)
|
|
49
|
+
SELECT changeset_id, uid, created_at, hashtags, editor, geom FROM changesets {where}
|
|
50
|
+
ON CONFLICT (changeset_id) DO UPDATE SET
|
|
51
|
+
created_at = COALESCE(EXCLUDED.created_at, c.created_at),
|
|
52
|
+
hashtags = COALESCE(EXCLUDED.hashtags, c.hashtags),
|
|
53
|
+
editor = COALESCE(EXCLUDED.editor, c.editor),
|
|
54
|
+
geom = COALESCE(EXCLUDED.geom, c.geom)
|
|
55
|
+
"""
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _push_changeset_stats(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
|
|
60
|
+
conn.execute(f"INSERT INTO pg_target.changeset_stats SELECT * FROM changeset_stats {where} ON CONFLICT DO NOTHING")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _push_chunked(conn: duckdb.DuckDBPyConnection, source: str, push) -> None:
|
|
64
|
+
"""Call push() once per changeset_id range so each range commits on its own."""
|
|
65
|
+
bounds = conn.execute(f"SELECT min(changeset_id), max(changeset_id) FROM {source}").fetchone()
|
|
66
|
+
if not bounds or bounds[0] is None:
|
|
67
|
+
return
|
|
68
|
+
lo, hi = bounds
|
|
69
|
+
step = (hi - lo) // _BULK_COMMIT_CHUNKS + 1
|
|
70
|
+
cursor = lo
|
|
71
|
+
while cursor <= hi:
|
|
72
|
+
push(conn, f"WHERE changeset_id >= {cursor} AND changeset_id < {cursor + step}")
|
|
73
|
+
cursor += step
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = False) -> None:
|
|
77
|
+
"""Push every osmsg table to the libpq DSN target. bulk_load is for the one-time full-history
|
|
78
|
+
import (drops indexes and foreign keys, streams, rebuilds, commits per range); leave it off for
|
|
79
|
+
incremental --update pushes. The DSN is interpolated into ATTACH, so it must be trusted."""
|
|
80
|
+
conn.execute("INSTALL postgres")
|
|
81
|
+
conn.execute("LOAD postgres")
|
|
82
|
+
conn.execute("INSTALL spatial")
|
|
83
|
+
conn.execute("LOAD spatial")
|
|
84
|
+
safe_dsn = dsn.replace("'", "''")
|
|
85
|
+
conn.execute(f"ATTACH '{safe_dsn}' AS pg_target (TYPE postgres)")
|
|
86
|
+
try:
|
|
87
|
+
for stmt in PG_SCHEMA.strip().split(";"):
|
|
88
|
+
stmt = stmt.strip()
|
|
89
|
+
if stmt:
|
|
90
|
+
_pg(conn, stmt)
|
|
91
|
+
|
|
92
|
+
# Refuse cross-source push: would double-count via the (seq_id, changeset_id) PK.
|
|
93
|
+
local_sources = {r[0] for r in conn.execute("SELECT source_url FROM state").fetchall()}
|
|
94
|
+
existing_sources = {r[0] for r in conn.execute("SELECT source_url FROM pg_target.state").fetchall()}
|
|
95
|
+
cross_source = existing_sources - local_sources
|
|
96
|
+
if cross_source and local_sources:
|
|
97
|
+
raise OsmsgError(
|
|
98
|
+
f"PG target already has data from source(s) {sorted(cross_source)} "
|
|
99
|
+
f"but this run pushes from {sorted(local_sources)}. Mixing sources "
|
|
100
|
+
f"double-counts via the (seq_id, changeset_id) key. Use a separate "
|
|
101
|
+
f"--psql-dsn, or wipe the existing PG tables first."
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
if bulk_load:
|
|
105
|
+
# Stream rows instead of buffering them to preserve order; buffering 180M+ JSON-bearing
|
|
106
|
+
# rows is what exhausts memory in a single INSERT. Then drop the secondary indexes and
|
|
107
|
+
# foreign keys so the load does not maintain them per row.
|
|
108
|
+
conn.execute("SET preserve_insertion_order = false")
|
|
109
|
+
for table, name, _add in _BULK_FKS:
|
|
110
|
+
_pg(conn, f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {name}")
|
|
111
|
+
for name, _create in _BULK_INDEXES:
|
|
112
|
+
_pg(conn, f"DROP INDEX IF EXISTS {name}")
|
|
113
|
+
conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
|
|
114
|
+
_push_chunked(conn, "changesets", _push_changesets)
|
|
115
|
+
_push_chunked(conn, "changeset_stats", _push_changeset_stats)
|
|
116
|
+
elif _pg_has_history(conn):
|
|
117
|
+
# The history layer (seq_id=0) is already in PG from the bulk load and never changes, so an
|
|
118
|
+
# incremental --update pushes only the live layer and its parents, not the 180M history rows.
|
|
119
|
+
live_ids = "changeset_id IN (SELECT changeset_id FROM changeset_stats WHERE seq_id <> 0)"
|
|
120
|
+
conn.execute(
|
|
121
|
+
"INSERT INTO pg_target.users SELECT * FROM users "
|
|
122
|
+
"WHERE uid IN (SELECT uid FROM changeset_stats WHERE seq_id <> 0) ON CONFLICT DO NOTHING"
|
|
123
|
+
)
|
|
124
|
+
_push_changesets(conn, f"WHERE {live_ids}")
|
|
125
|
+
_push_changeset_stats(conn, "WHERE seq_id <> 0")
|
|
126
|
+
else:
|
|
127
|
+
# No history in PG (a plain live target): push everything (live rows are all seq_id<>0).
|
|
128
|
+
conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
|
|
129
|
+
_push_changesets(conn)
|
|
130
|
+
_push_changeset_stats(conn)
|
|
131
|
+
|
|
132
|
+
conn.execute(
|
|
133
|
+
"""
|
|
134
|
+
INSERT INTO pg_target.state (source_url, last_seq, last_ts, updated_at)
|
|
135
|
+
SELECT source_url, last_seq, last_ts, updated_at FROM state
|
|
136
|
+
ON CONFLICT (source_url) DO UPDATE SET
|
|
137
|
+
last_seq = EXCLUDED.last_seq,
|
|
138
|
+
last_ts = EXCLUDED.last_ts,
|
|
139
|
+
updated_at = EXCLUDED.updated_at
|
|
140
|
+
"""
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
if bulk_load:
|
|
144
|
+
# Rebuild once, with more memory for the sort-based index builds, then refresh planner stats.
|
|
145
|
+
for table, name, add in _BULK_FKS:
|
|
146
|
+
_pg(conn, f"ALTER TABLE {table} ADD CONSTRAINT {name} {add}")
|
|
147
|
+
for _name, create in _BULK_INDEXES:
|
|
148
|
+
_pg(conn, f"SET maintenance_work_mem = '512MB'; {create}")
|
|
149
|
+
_pg(conn, "ANALYZE users")
|
|
150
|
+
_pg(conn, "ANALYZE changesets")
|
|
151
|
+
_pg(conn, "ANALYZE changeset_stats")
|
|
152
|
+
finally:
|
|
153
|
+
conn.execute("DETACH pg_target")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
__all__ = ["PG_SCHEMA", "to_psql"]
|
|
@@ -56,7 +56,7 @@ class ChangesetHandler(osmium.SimpleHandler):
|
|
|
56
56
|
|
|
57
57
|
keep = bool(cfg["changeset_meta"] and not cfg["hashtags"])
|
|
58
58
|
# Some editors only fill the `hashtags` tag (comment stays generic); checking
|
|
59
|
-
# comment alone silently drops those. Tokenize via regex on both
|
|
59
|
+
# comment alone silently drops those. Tokenize via regex on both, real data
|
|
60
60
|
# mixes `;`, space, and comma as separators inside `hashtags`.
|
|
61
61
|
comment = c.tags.get("comment", "")
|
|
62
62
|
hashtags_field = c.tags.get("hashtags", "")
|