osmsg 1.1.2__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {osmsg-1.1.2 → osmsg-1.2.1}/PKG-INFO +68 -2
- {osmsg-1.1.2 → osmsg-1.2.1}/README.md +67 -1
- osmsg-1.2.1/osmsg/__version__.py +1 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/cli.py +90 -4
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/__init__.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/ingest.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/exceptions.py +1 -1
- osmsg-1.2.1/osmsg/export/psql.py +143 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/handlers.py +1 -1
- osmsg-1.2.1/osmsg/history.py +255 -0
- osmsg-1.2.1/osmsg/maintain/__init__.py +7 -0
- osmsg-1.2.1/osmsg/maintain/cli.py +83 -0
- osmsg-1.2.1/osmsg/maintain/convert.py +301 -0
- osmsg-1.2.1/osmsg/maintain/manifest.py +62 -0
- osmsg-1.2.1/osmsg/maintain/month.py +117 -0
- osmsg-1.2.1/osmsg/maintain/parquet.py +39 -0
- osmsg-1.2.1/osmsg/maintain/pbf_split.py +78 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/pipeline.py +378 -100
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/replication.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/tm.py +1 -1
- {osmsg-1.1.2 → osmsg-1.2.1}/pyproject.toml +1 -1
- osmsg-1.1.2/osmsg/__version__.py +0 -1
- osmsg-1.1.2/osmsg/export/psql.py +0 -69
- {osmsg-1.1.2 → osmsg-1.2.1}/LICENSE +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/__init__.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/_http.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/_tick.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/auth.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/boundary.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/duckdb_schema.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/queries.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/db/schema.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/__init__.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/csv.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/json.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/markdown.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/export/parquet.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/fetch.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/geofabrik.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/models.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/pg_schema.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/py.typed +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/ui.py +0 -0
- {osmsg-1.1.2 → osmsg-1.2.1}/osmsg/workers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: osmsg
|
|
3
|
-
Version: 1.1.2
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: OpenStreetMap Stats Generator: Commandline
|
|
5
5
|
Keywords: osm,stats,commandline,openstreetmap
|
|
6
6
|
Author: Kshitij Raj Sharma
|
|
@@ -46,13 +46,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
|
|
|
46
46
|
|
|
47
47
|
A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
48
48
|
|
|
49
|
-
##
|
|
49
|
+
## What does it do?
|
|
50
50
|
|
|
51
51
|
- Per-user create/modify/delete counts over any time window.
|
|
52
52
|
- Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
|
|
53
53
|
- Country and custom-boundary filters via Geofabrik.
|
|
54
54
|
- Cron-friendly resume with `--update`.
|
|
55
|
+
- One-command setup: `osmsg --insert` loads all history into your store; `osmsg --update` then keeps it current.
|
|
55
56
|
- Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
|
|
57
|
+
- Cloud-native history: months covered by a published parquet dataset are read remotely.
|
|
56
58
|
|
|
57
59
|
## Install
|
|
58
60
|
|
|
@@ -68,6 +70,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
|
|
|
68
70
|
`uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
|
|
69
71
|
with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
|
|
70
72
|
|
|
73
|
+
More ways to install:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
conda install -c conda-forge osmsg # conda / mamba
|
|
77
|
+
brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
|
|
81
|
+
and run it directly, no Python required.
|
|
82
|
+
|
|
71
83
|
## Quick start
|
|
72
84
|
|
|
73
85
|
```bash
|
|
@@ -78,6 +90,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
|
|
|
78
90
|
|
|
79
91
|
That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
|
|
80
92
|
|
|
93
|
+
## Set up a full history store
|
|
94
|
+
|
|
95
|
+
Two commands give you a complete, self-updating store. The first loads all of OSM history from the
|
|
96
|
+
published dataset and records where to resume; the second catches up to now and runs on a schedule.
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
osmsg --insert # load all history into stats.duckdb, then exit
|
|
100
|
+
osmsg --update # catch up to now (repeat on cron)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
`osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
|
|
104
|
+
current. For near-real-time, run `osmsg --update --url minute`.
|
|
105
|
+
|
|
106
|
+
Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
110
|
+
osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
osmsg --insert --start 2020-01-01 --end 2023-01-01
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Already have the planet files? Insert from them directly:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
|
|
123
|
+
```
|
|
124
|
+
|
|
81
125
|
## Tutorials
|
|
82
126
|
|
|
83
127
|
### 1. Stats for a country
|
|
@@ -169,6 +213,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
|
|
|
169
213
|
Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
|
|
170
214
|
`-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
|
|
171
215
|
|
|
216
|
+
Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
|
|
217
|
+
refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
|
|
218
|
+
|
|
172
219
|
## Configuration
|
|
173
220
|
|
|
174
221
|
Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
|
|
@@ -184,7 +231,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
184
231
|
| `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
|
|
185
232
|
| `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
|
|
186
233
|
| `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
|
|
234
|
+
| `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
|
|
187
235
|
| `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
|
|
236
|
+
| `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
|
|
237
|
+
| `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset and only download the recent tail; falls back to live diffs if unavailable. Ignored by `--update`. |
|
|
238
|
+
| `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Base URL of the published history dataset. |
|
|
239
|
+
| `--insert` | (none) | off | Load history into the store and seed resume state, then exit. With no window it loads the whole published history; `--start`/`--end` loads a slice. |
|
|
240
|
+
| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
|
|
188
241
|
| `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
|
|
189
242
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
|
|
190
243
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
|
|
@@ -192,6 +245,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
192
245
|
|
|
193
246
|
A `.env` file at the working directory is loaded automatically.
|
|
194
247
|
|
|
248
|
+
## Maintainers
|
|
249
|
+
|
|
250
|
+
Generating and publishing the history dataset is the `osmsg maintain` group:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
|
|
254
|
+
osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
|
|
255
|
+
osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
|
|
256
|
+
osmsg maintain publish work/out --repo osgeonepal/osmsg-history
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
|
|
260
|
+
|
|
195
261
|
## Documentation
|
|
196
262
|
|
|
197
263
|
- [Installation](./docs/Installation.md)
|
|
@@ -14,13 +14,15 @@ of nodes, ways, and relations created, modified, or deleted, written to parquet,
|
|
|
14
14
|
|
|
15
15
|
A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
16
16
|
|
|
17
|
-
##
|
|
17
|
+
## What does it do?
|
|
18
18
|
|
|
19
19
|
- Per-user create/modify/delete counts over any time window.
|
|
20
20
|
- Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
|
|
21
21
|
- Country and custom-boundary filters via Geofabrik.
|
|
22
22
|
- Cron-friendly resume with `--update`.
|
|
23
|
+
- One-command setup: `osmsg --insert` loads all history into your store; `osmsg --update` then keeps it current.
|
|
23
24
|
- Outputs you can query: parquet, csv, json, markdown, DuckDB, Postgres.
|
|
25
|
+
- Cloud-native history: months covered by a published parquet dataset are read remotely.
|
|
24
26
|
|
|
25
27
|
## Install
|
|
26
28
|
|
|
@@ -36,6 +38,16 @@ docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last
|
|
|
36
38
|
`uvx` can run osmsg in a throwaway environment, no install, no virtualenv to manage. Works
|
|
37
39
|
with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
|
|
38
40
|
|
|
41
|
+
More ways to install:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
conda install -c conda-forge osmsg # conda / mamba
|
|
45
|
+
brew install osgeonepal/tap/osmsg # macOS / Linux (Homebrew tap)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
On Windows, download `osmsg.exe` from the [latest release](https://github.com/osgeonepal/osmsg/releases)
|
|
49
|
+
and run it directly, no Python required.
|
|
50
|
+
|
|
39
51
|
## Quick start
|
|
40
52
|
|
|
41
53
|
```bash
|
|
@@ -46,6 +58,38 @@ osmsg --hashtags hotosm --last day # only changesets tagged #hotosm
|
|
|
46
58
|
|
|
47
59
|
That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder.
|
|
48
60
|
|
|
61
|
+
## Set up a full history store
|
|
62
|
+
|
|
63
|
+
Two commands give you a complete, self-updating store. The first loads all of OSM history from the
|
|
64
|
+
published dataset and records where to resume; the second catches up to now and runs on a schedule.
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
osmsg --insert # load all history into stats.duckdb, then exit
|
|
68
|
+
osmsg --update # catch up to now (repeat on cron)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
`osmsg` clears the multi-week backlog on day diffs, then refines to finer diffs as the store stays
|
|
72
|
+
current. For near-real-time, run `osmsg --update --url minute`.
|
|
73
|
+
|
|
74
|
+
Pick your store with one flag. DuckDB is the default (`stats.duckdb`); add a DSN for Postgres:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
osmsg --insert --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
78
|
+
osmsg --update --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Load only a slice with `--start/--end`; `--update` then continues from the end of that slice:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
osmsg --insert --start 2020-01-01 --end 2023-01-01
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Already have the planet files? Insert from them directly:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
osmsg --insert --osh-file history-latest.osh.pbf --changeset-file changesets-latest.osm.bz2
|
|
91
|
+
```
|
|
92
|
+
|
|
49
93
|
## Tutorials
|
|
50
94
|
|
|
51
95
|
### 1. Stats for a country
|
|
@@ -137,6 +181,9 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
|
|
|
137
181
|
Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
|
|
138
182
|
`-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
|
|
139
183
|
|
|
184
|
+
Rerunning the same query with a different `-f` re-exports from the existing `<name>.duckdb` instead of
|
|
185
|
+
refetching, so adding a format is instant. Pass `--overwrite` to force a fresh recompute.
|
|
186
|
+
|
|
140
187
|
## Configuration
|
|
141
188
|
|
|
142
189
|
Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
|
|
@@ -152,7 +199,13 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
152
199
|
| `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
|
|
153
200
|
| `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
|
|
154
201
|
| `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
|
|
202
|
+
| `--overwrite` | (none) | off | Recompute even if `<name>.duckdb` already holds this exact query. |
|
|
155
203
|
| `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
|
|
204
|
+
| `--psql-bulk` | `OSMSG_PSQL_BULK` | off | Faster first full load to Postgres. |
|
|
205
|
+
| `--history` / `--no-history` | `OSMSG_HISTORY` | on | Read covered months from the published dataset and only download the recent tail; falls back to live diffs if unavailable. Ignored by `--update`. |
|
|
206
|
+
| `--history-url` | `OSMSG_HISTORY_URL` | `hf://datasets/kshitijrajsharma/osmsg-history` | Base URL of the published history dataset. |
|
|
207
|
+
| `--insert` | (none) | off | Load history into the store and seed resume state, then exit. With no window it loads the whole published history; `--start`/`--end` loads a slice. |
|
|
208
|
+
| `--osh-file` / `--changeset-file` | (none) | unset | Insert from local planet history + changeset files instead of the dataset. |
|
|
156
209
|
| `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
|
|
157
210
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
|
|
158
211
|
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
|
|
@@ -160,6 +213,19 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
|
|
|
160
213
|
|
|
161
214
|
A `.env` file at the working directory is loaded automatically.
|
|
162
215
|
|
|
216
|
+
## Maintainers
|
|
217
|
+
|
|
218
|
+
Generating and publishing the history dataset is the `osmsg maintain` group:
|
|
219
|
+
|
|
220
|
+
```bash
|
|
221
|
+
osmsg maintain month 2026-06 --repo osgeonepal/osmsg-history # append one finished month
|
|
222
|
+
osmsg maintain month 2026-06 --no-upload # generate locally, review, upload later
|
|
223
|
+
osmsg maintain convert history.osh.pbf changesets.osm.bz2 2005-01-01 2026-06-01 work --parts 24
|
|
224
|
+
osmsg maintain publish work/out --repo osgeonepal/osmsg-history
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
See [experiments/parquet-history](./experiments/parquet-history/README.md) for the full-history batch.
|
|
228
|
+
|
|
163
229
|
## Documentation
|
|
164
230
|
|
|
165
231
|
- [Installation](./docs/Installation.md)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.1"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Typer-based CLI for osmsg.
|
|
2
2
|
|
|
3
|
-
UTC throughout
|
|
3
|
+
UTC throughout, no display timezone. Outputs default to parquet (queryable from
|
|
4
4
|
disk by DuckDB / polars / pandas). Other formats: csv, json, markdown, psql.
|
|
5
5
|
"""
|
|
6
6
|
|
|
@@ -24,6 +24,7 @@ from .exceptions import (
|
|
|
24
24
|
OsmsgError,
|
|
25
25
|
UnknownRegionError,
|
|
26
26
|
)
|
|
27
|
+
from .maintain.cli import maintain_app
|
|
27
28
|
from .pipeline import RunConfig, run
|
|
28
29
|
from .ui import console, error, info, render_table, warn
|
|
29
30
|
|
|
@@ -36,6 +37,7 @@ app = typer.Typer(
|
|
|
36
37
|
no_args_is_help=False,
|
|
37
38
|
help="OpenStreetMap stats generator. Parquet-first, OAuth 2.0, UTC-only.",
|
|
38
39
|
)
|
|
40
|
+
app.add_typer(maintain_app, name="maintain")
|
|
39
41
|
|
|
40
42
|
|
|
41
43
|
class Period(StrEnum):
|
|
@@ -104,9 +106,10 @@ def _period_range(period: Period) -> tuple[dt.datetime, dt.datetime]:
|
|
|
104
106
|
raise ValueError(period)
|
|
105
107
|
|
|
106
108
|
|
|
107
|
-
@app.
|
|
109
|
+
@app.callback(invoke_without_command=True)
|
|
108
110
|
@use_yaml_config(param_name="config", param_help="YAML config file (CLI flags override its values).")
|
|
109
111
|
def main(
|
|
112
|
+
ctx: typer.Context,
|
|
110
113
|
version: Annotated[
|
|
111
114
|
bool | None,
|
|
112
115
|
typer.Option("--version", callback=_version_callback, is_eager=True, help="Print version and exit."),
|
|
@@ -215,6 +218,15 @@ def main(
|
|
|
215
218
|
str | None,
|
|
216
219
|
typer.Option("--psql-dsn", envvar="OSMSG_PSQL_DSN", help="libpq DSN for --format psql."),
|
|
217
220
|
] = None,
|
|
221
|
+
psql_bulk: Annotated[
|
|
222
|
+
bool,
|
|
223
|
+
typer.Option(
|
|
224
|
+
"--psql-bulk",
|
|
225
|
+
envvar="OSMSG_PSQL_BULK",
|
|
226
|
+
help="Faster one-time psql load: drop secondary indexes and foreign keys during the push "
|
|
227
|
+
"and rebuild them after. Use for a full history import, not for incremental --update.",
|
|
228
|
+
),
|
|
229
|
+
] = False,
|
|
218
230
|
changeset_pad_hours: Annotated[
|
|
219
231
|
int,
|
|
220
232
|
typer.Option(
|
|
@@ -226,16 +238,76 @@ def main(
|
|
|
226
238
|
max=48,
|
|
227
239
|
),
|
|
228
240
|
] = 1,
|
|
241
|
+
history: Annotated[
|
|
242
|
+
bool,
|
|
243
|
+
typer.Option(
|
|
244
|
+
"--history/--no-history",
|
|
245
|
+
envvar="OSMSG_HISTORY",
|
|
246
|
+
help="Serve covered months from the published parquet (HuggingFace) and only download the "
|
|
247
|
+
"recent tail. Falls back to the live diff path if unavailable. Ignored by --update.",
|
|
248
|
+
),
|
|
249
|
+
] = True,
|
|
250
|
+
history_url: Annotated[
|
|
251
|
+
str,
|
|
252
|
+
typer.Option(
|
|
253
|
+
"--history-url",
|
|
254
|
+
envvar="OSMSG_HISTORY_URL",
|
|
255
|
+
help="Base URL of the published history dataset.",
|
|
256
|
+
),
|
|
257
|
+
] = "hf://datasets/kshitijrajsharma/osmsg-history",
|
|
258
|
+
insert: Annotated[
|
|
259
|
+
bool,
|
|
260
|
+
typer.Option(
|
|
261
|
+
"--insert",
|
|
262
|
+
help="Load history into the store and seed resume state, then exit. No window loads the "
|
|
263
|
+
"whole published history; --start/--end loads a slice. Follow with --update to catch up.",
|
|
264
|
+
),
|
|
265
|
+
] = False,
|
|
266
|
+
osh_file: Annotated[
|
|
267
|
+
str | None,
|
|
268
|
+
typer.Option("--osh-file", help="Insert from a local .osh.pbf instead of the published dataset."),
|
|
269
|
+
] = None,
|
|
270
|
+
changeset_file: Annotated[
|
|
271
|
+
str | None,
|
|
272
|
+
typer.Option("--changeset-file", help="Changeset dump (.osm.bz2) paired with --osh-file."),
|
|
273
|
+
] = None,
|
|
274
|
+
overwrite: Annotated[
|
|
275
|
+
bool,
|
|
276
|
+
typer.Option(
|
|
277
|
+
"--overwrite",
|
|
278
|
+
help="Recompute even if <name>.duckdb already holds this exact query; otherwise a rerun "
|
|
279
|
+
"that only changes the output format re-exports from the existing store.",
|
|
280
|
+
),
|
|
281
|
+
] = False,
|
|
229
282
|
) -> None:
|
|
230
|
-
"""Run osmsg."""
|
|
283
|
+
"""Run osmsg. With no subcommand this generates stats (or loads history with --insert)."""
|
|
284
|
+
if ctx.invoked_subcommand is not None:
|
|
285
|
+
return
|
|
231
286
|
if formats is None:
|
|
232
287
|
formats = [Format.parquet]
|
|
288
|
+
if psql_dsn and Format.psql not in formats:
|
|
289
|
+
formats.append(Format.psql)
|
|
233
290
|
if sum(1 for x in (start, last, days) if x) > 1:
|
|
234
|
-
error("--start, --last, and --days are mutually exclusive
|
|
291
|
+
error("--start, --last, and --days are mutually exclusive, pick one.")
|
|
235
292
|
raise typer.Exit(code=2)
|
|
236
293
|
if update and any(x is not None for x in (start, end, last, days)):
|
|
237
294
|
error("--update resumes from prior state and runs to head; it ignores --start/--end/--last/--days.")
|
|
238
295
|
raise typer.Exit(code=2)
|
|
296
|
+
if insert and update:
|
|
297
|
+
error("--insert and --update are mutually exclusive; insert first, then update.")
|
|
298
|
+
raise typer.Exit(code=2)
|
|
299
|
+
if insert and (last is not None or days is not None):
|
|
300
|
+
error("--insert takes --start/--end (or no window), not --last/--days.")
|
|
301
|
+
raise typer.Exit(code=2)
|
|
302
|
+
if (osh_file is None) != (changeset_file is None):
|
|
303
|
+
error("--osh-file and --changeset-file must be given together.")
|
|
304
|
+
raise typer.Exit(code=2)
|
|
305
|
+
if osh_file and not insert:
|
|
306
|
+
error("--osh-file/--changeset-file are only valid with --insert.")
|
|
307
|
+
raise typer.Exit(code=2)
|
|
308
|
+
if psql_bulk and update:
|
|
309
|
+
error("--psql-bulk is for a one-time full load (drops indexes/keys); do not use it with --update.")
|
|
310
|
+
raise typer.Exit(code=2)
|
|
239
311
|
if Format.psql in formats and not psql_dsn:
|
|
240
312
|
error("-f psql requires --psql-dsn (libpq connection string, e.g. 'host=localhost dbname=osm user=osm').")
|
|
241
313
|
raise typer.Exit(code=2)
|
|
@@ -267,7 +339,14 @@ def main(
|
|
|
267
339
|
osm_username=username,
|
|
268
340
|
osm_password=_read_password_stdin() if password_stdin else None,
|
|
269
341
|
psql_dsn=psql_dsn,
|
|
342
|
+
psql_bulk=psql_bulk,
|
|
270
343
|
changeset_pad_hours=changeset_pad_hours,
|
|
344
|
+
history_mode="auto" if history else "off",
|
|
345
|
+
history_url=history_url,
|
|
346
|
+
insert=insert,
|
|
347
|
+
osh_file=osh_file,
|
|
348
|
+
changeset_file=changeset_file,
|
|
349
|
+
overwrite=overwrite,
|
|
271
350
|
)
|
|
272
351
|
|
|
273
352
|
if last is not None:
|
|
@@ -300,6 +379,13 @@ def main(
|
|
|
300
379
|
error(str(exc))
|
|
301
380
|
raise typer.Exit(code=2) from exc
|
|
302
381
|
|
|
382
|
+
if insert:
|
|
383
|
+
info(f"insert complete: {result['rows']:,} history changeset rows loaded.")
|
|
384
|
+
for label, path in (result.get("files") or {}).items():
|
|
385
|
+
console.print(f"[green]✓[/green] {label}: [bold]{path}[/bold]")
|
|
386
|
+
console.print("Next: [bold]osmsg --update[/bold] to catch up to now.")
|
|
387
|
+
return
|
|
388
|
+
|
|
303
389
|
rows_data = result.get("rows_data") or []
|
|
304
390
|
display_n = min(rows or 20, len(rows_data))
|
|
305
391
|
render_table(
|
|
@@ -106,7 +106,7 @@ def merge_parquet_files(conn: duckdb.DuckDBPyConnection, parquet_dir: Path, *, c
|
|
|
106
106
|
_quarantine_corrupt(parquet_dir)
|
|
107
107
|
|
|
108
108
|
def pattern(name: str) -> str:
|
|
109
|
-
# read_parquet() takes a literal
|
|
109
|
+
# read_parquet() takes a literal, escape so quoted paths can't break out.
|
|
110
110
|
return _sql_escape((parquet_dir / f"temp_*_{name}_*.parquet").as_posix())
|
|
111
111
|
|
|
112
112
|
conn.execute("BEGIN")
|
|
@@ -25,7 +25,7 @@ class GeofabrikAuthError(OsmsgError):
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class NoDataFoundError(Exception):
|
|
28
|
-
"""Empty range
|
|
28
|
+
"""Empty range, info condition, not a failure (CLI exits 0). Not an OsmsgError on purpose."""
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
__all__ = [
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""PostgreSQL exporter via DuckDB's postgres extension."""
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
|
|
5
|
+
from ..exceptions import OsmsgError
|
|
6
|
+
from ..pg_schema import PG_SCHEMA
|
|
7
|
+
|
|
8
|
+
_BULK_INDEXES = [
|
|
9
|
+
("idx_changesets_created_at", "CREATE INDEX idx_changesets_created_at ON changesets (created_at)"),
|
|
10
|
+
("idx_changesets_geom", "CREATE INDEX idx_changesets_geom ON changesets USING GIST (geom)"),
|
|
11
|
+
("idx_changeset_stats_uid", "CREATE INDEX idx_changeset_stats_uid ON changeset_stats (uid)"),
|
|
12
|
+
]
|
|
13
|
+
_BULK_FKS = [
|
|
14
|
+
("changesets", "changesets_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
|
|
15
|
+
(
|
|
16
|
+
"changeset_stats",
|
|
17
|
+
"changeset_stats_changeset_id_fkey",
|
|
18
|
+
"FOREIGN KEY (changeset_id) REFERENCES changesets (changeset_id)",
|
|
19
|
+
),
|
|
20
|
+
("changeset_stats", "changeset_stats_uid_fkey", "FOREIGN KEY (uid) REFERENCES users (uid)"),
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_BULK_COMMIT_CHUNKS = 32
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _pg(conn: duckdb.DuckDBPyConnection, sql: str) -> None:
|
|
28
|
+
conn.execute(f"CALL postgres_execute('pg_target', $${sql}$$)")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _pg_has_history(conn: duckdb.DuckDBPyConnection) -> bool:
|
|
32
|
+
"""True if the PG target already holds the history layer (seq_id=0); checked cheaply with LIMIT 1."""
|
|
33
|
+
probe = "SELECT count(*) FROM (SELECT 1 FROM pg_target.changeset_stats WHERE seq_id = 0 LIMIT 1) t"
|
|
34
|
+
row = conn.execute(probe).fetchone()
|
|
35
|
+
return bool(row and row[0])
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _push_changesets(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
|
|
39
|
+
# Newer non-NULL wins, NULL never downgrades (mirrors the DuckDB-side merge).
|
|
40
|
+
conn.execute(
|
|
41
|
+
f"""
|
|
42
|
+
INSERT INTO pg_target.changesets AS c (changeset_id, uid, created_at, hashtags, editor, geom)
|
|
43
|
+
SELECT changeset_id, uid, created_at, hashtags, editor, geom FROM changesets {where}
|
|
44
|
+
ON CONFLICT (changeset_id) DO UPDATE SET
|
|
45
|
+
created_at = COALESCE(EXCLUDED.created_at, c.created_at),
|
|
46
|
+
hashtags = COALESCE(EXCLUDED.hashtags, c.hashtags),
|
|
47
|
+
editor = COALESCE(EXCLUDED.editor, c.editor),
|
|
48
|
+
geom = COALESCE(EXCLUDED.geom, c.geom)
|
|
49
|
+
"""
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _push_changeset_stats(conn: duckdb.DuckDBPyConnection, where: str = "") -> None:
|
|
54
|
+
conn.execute(f"INSERT INTO pg_target.changeset_stats SELECT * FROM changeset_stats {where} ON CONFLICT DO NOTHING")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _push_chunked(conn: duckdb.DuckDBPyConnection, source: str, push) -> None:
|
|
58
|
+
"""Call push() once per changeset_id range so each range commits on its own."""
|
|
59
|
+
bounds = conn.execute(f"SELECT min(changeset_id), max(changeset_id) FROM {source}").fetchone()
|
|
60
|
+
if not bounds or bounds[0] is None:
|
|
61
|
+
return
|
|
62
|
+
lo, hi = bounds
|
|
63
|
+
step = (hi - lo) // _BULK_COMMIT_CHUNKS + 1
|
|
64
|
+
cursor = lo
|
|
65
|
+
while cursor <= hi:
|
|
66
|
+
push(conn, f"WHERE changeset_id >= {cursor} AND changeset_id < {cursor + step}")
|
|
67
|
+
cursor += step
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def to_psql(conn: duckdb.DuckDBPyConnection, dsn: str, *, bulk_load: bool = False) -> None:
|
|
71
|
+
"""Push every osmsg table to the libpq DSN target. bulk_load is for the one-time full-history
|
|
72
|
+
import (drops indexes and foreign keys, streams, rebuilds, commits per range); leave it off for
|
|
73
|
+
incremental --update pushes. The DSN is interpolated into ATTACH, so it must be trusted."""
|
|
74
|
+
conn.execute("INSTALL postgres")
|
|
75
|
+
conn.execute("LOAD postgres")
|
|
76
|
+
conn.execute("INSTALL spatial")
|
|
77
|
+
conn.execute("LOAD spatial")
|
|
78
|
+
safe_dsn = dsn.replace("'", "''")
|
|
79
|
+
conn.execute(f"ATTACH '{safe_dsn}' AS pg_target (TYPE postgres)")
|
|
80
|
+
try:
|
|
81
|
+
for stmt in PG_SCHEMA.strip().split(";"):
|
|
82
|
+
stmt = stmt.strip()
|
|
83
|
+
if stmt:
|
|
84
|
+
_pg(conn, stmt)
|
|
85
|
+
|
|
86
|
+
# Refuse cross-source push: would double-count via the (seq_id, changeset_id) PK.
|
|
87
|
+
local_sources = {r[0] for r in conn.execute("SELECT source_url FROM state").fetchall()}
|
|
88
|
+
existing_sources = {r[0] for r in conn.execute("SELECT source_url FROM pg_target.state").fetchall()}
|
|
89
|
+
cross_source = existing_sources - local_sources
|
|
90
|
+
if cross_source and local_sources:
|
|
91
|
+
raise OsmsgError(
|
|
92
|
+
f"PG target already has data from source(s) {sorted(cross_source)} "
|
|
93
|
+
f"but this run pushes from {sorted(local_sources)}. Mixing sources "
|
|
94
|
+
f"double-counts via the (seq_id, changeset_id) key. Use a separate "
|
|
95
|
+
f"--psql-dsn, or wipe the existing PG tables first."
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
if bulk_load:
|
|
99
|
+
conn.execute("SET preserve_insertion_order = false")
|
|
100
|
+
for table, name, _add in _BULK_FKS:
|
|
101
|
+
_pg(conn, f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {name}")
|
|
102
|
+
for name, _create in _BULK_INDEXES:
|
|
103
|
+
_pg(conn, f"DROP INDEX IF EXISTS {name}")
|
|
104
|
+
conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
|
|
105
|
+
_push_chunked(conn, "changesets", _push_changesets)
|
|
106
|
+
_push_chunked(conn, "changeset_stats", _push_changeset_stats)
|
|
107
|
+
elif _pg_has_history(conn):
|
|
108
|
+
live_ids = "changeset_id IN (SELECT changeset_id FROM changeset_stats WHERE seq_id <> 0)"
|
|
109
|
+
conn.execute(
|
|
110
|
+
"INSERT INTO pg_target.users SELECT * FROM users "
|
|
111
|
+
"WHERE uid IN (SELECT uid FROM changeset_stats WHERE seq_id <> 0) ON CONFLICT DO NOTHING"
|
|
112
|
+
)
|
|
113
|
+
_push_changesets(conn, f"WHERE {live_ids}")
|
|
114
|
+
_push_changeset_stats(conn, "WHERE seq_id <> 0")
|
|
115
|
+
else:
|
|
116
|
+
conn.execute("INSERT INTO pg_target.users SELECT * FROM users ON CONFLICT DO NOTHING")
|
|
117
|
+
_push_changesets(conn)
|
|
118
|
+
_push_changeset_stats(conn)
|
|
119
|
+
|
|
120
|
+
conn.execute(
|
|
121
|
+
"""
|
|
122
|
+
INSERT INTO pg_target.state (source_url, last_seq, last_ts, updated_at)
|
|
123
|
+
SELECT source_url, last_seq, last_ts, updated_at FROM state
|
|
124
|
+
ON CONFLICT (source_url) DO UPDATE SET
|
|
125
|
+
last_seq = EXCLUDED.last_seq,
|
|
126
|
+
last_ts = EXCLUDED.last_ts,
|
|
127
|
+
updated_at = EXCLUDED.updated_at
|
|
128
|
+
"""
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
if bulk_load:
|
|
132
|
+
for table, name, add in _BULK_FKS:
|
|
133
|
+
_pg(conn, f"ALTER TABLE {table} ADD CONSTRAINT {name} {add}")
|
|
134
|
+
for _name, create in _BULK_INDEXES:
|
|
135
|
+
_pg(conn, f"SET maintenance_work_mem = '512MB'; {create}")
|
|
136
|
+
_pg(conn, "ANALYZE users")
|
|
137
|
+
_pg(conn, "ANALYZE changesets")
|
|
138
|
+
_pg(conn, "ANALYZE changeset_stats")
|
|
139
|
+
finally:
|
|
140
|
+
conn.execute("DETACH pg_target")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
__all__ = ["PG_SCHEMA", "to_psql"]
|
|
@@ -56,7 +56,7 @@ class ChangesetHandler(osmium.SimpleHandler):
|
|
|
56
56
|
|
|
57
57
|
keep = bool(cfg["changeset_meta"] and not cfg["hashtags"])
|
|
58
58
|
# Some editors only fill the `hashtags` tag (comment stays generic); checking
|
|
59
|
-
# comment alone silently drops those. Tokenize via regex on both
|
|
59
|
+
# comment alone silently drops those. Tokenize via regex on both, real data
|
|
60
60
|
# mixes `;`, space, and comma as separators inside `hashtags`.
|
|
61
61
|
comment = c.tags.get("comment", "")
|
|
62
62
|
hashtags_field = c.tags.get("hashtags", "")
|