osmsg 1.0.2__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of osmsg might be problematic. Click here for more details.

Files changed (39) hide show
  1. {osmsg-1.0.2 → osmsg-1.1.0}/PKG-INFO +58 -8
  2. {osmsg-1.0.2 → osmsg-1.1.0}/README.md +57 -7
  3. osmsg-1.1.0/osmsg/__version__.py +1 -0
  4. osmsg-1.1.0/osmsg/_tick.py +71 -0
  5. osmsg-1.1.0/osmsg/boundary.py +37 -0
  6. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/cli.py +63 -12
  7. osmsg-1.1.0/osmsg/db/duckdb_schema.py +41 -0
  8. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/db/ingest.py +31 -1
  9. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/db/queries.py +13 -18
  10. osmsg-1.1.0/osmsg/db/schema.py +51 -0
  11. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/markdown.py +28 -3
  12. osmsg-1.1.0/osmsg/export/psql.py +69 -0
  13. osmsg-1.1.0/osmsg/geofabrik.py +65 -0
  14. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/handlers.py +11 -5
  15. osmsg-1.1.0/osmsg/pg_schema.py +42 -0
  16. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/pipeline.py +193 -44
  17. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/replication.py +51 -32
  18. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/ui.py +2 -2
  19. {osmsg-1.0.2 → osmsg-1.1.0}/pyproject.toml +6 -2
  20. osmsg-1.0.2/osmsg/__version__.py +0 -1
  21. osmsg-1.0.2/osmsg/boundary.py +0 -37
  22. osmsg-1.0.2/osmsg/db/schema.py +0 -111
  23. osmsg-1.0.2/osmsg/export/psql.py +0 -89
  24. osmsg-1.0.2/osmsg/geofabrik.py +0 -41
  25. {osmsg-1.0.2 → osmsg-1.1.0}/LICENSE +0 -0
  26. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/__init__.py +0 -0
  27. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/_http.py +0 -0
  28. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/auth.py +0 -0
  29. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/db/__init__.py +0 -0
  30. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/exceptions.py +0 -0
  31. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/__init__.py +0 -0
  32. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/csv.py +0 -0
  33. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/json.py +0 -0
  34. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/parquet.py +0 -0
  35. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/fetch.py +0 -0
  36. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/models.py +0 -0
  37. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/py.typed +0 -0
  38. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/tm.py +0 -0
  39. {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/workers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osmsg
3
- Version: 1.0.2
3
+ Version: 1.1.0
4
4
  Summary: OpenStreetMap Stats Generator: Commandline
5
5
  Keywords: osm,stats,commandline,openstreetmap
6
6
  Author: Kshitij Raj Sharma
@@ -41,11 +41,12 @@ Description-Content-Type: text/markdown
41
41
  [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
42
42
  [![Container](https://img.shields.io/badge/ghcr.io-osgeonepal%2Fosmsg-2496ED?logo=docker)](https://github.com/osgeonepal/osmsg/pkgs/container/osmsg)
43
43
 
44
- **OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts of nodes, ways, and relations created, modified, or deleted, written to parquet, csv, json, markdown, or Postgres.
44
+ **OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts
45
+ of nodes, ways, and relations created, modified, or deleted, written to parquet, csv, json, markdown, or Postgres.
45
46
 
46
47
  A Project of [OSGeo Nepal](https://osgeonepal.org).
47
48
 
48
- ## What you get
49
+ ## Features
49
50
 
50
51
  - Per-user create/modify/delete counts over any time window.
51
52
  - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
@@ -58,11 +59,15 @@ A Project of [OSGeo Nepal](https://osgeonepal.org).
58
59
  Pick the one that fits how you work.
59
60
 
60
61
  ```bash
62
+ uvx --from osmsg osmsg --last hour # zero-install, one-shot run
61
63
  pip install osmsg # into your project
62
64
  uv tool install osmsg # standalone CLI
63
65
  docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last hour
64
66
  ```
65
67
 
68
+ `uvx` can run osmsg in a throwaway environment: no install, no virtualenv to manage. Works
69
+ with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
70
+
66
71
  ## Quick start
67
72
 
68
73
  ```bash
@@ -81,7 +86,8 @@ That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder
81
86
  osmsg --country nepal --last day
82
87
  ```
83
88
 
84
- `--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD` in your shell or a `.env` file:
89
+ `--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD`
90
+ in your shell or a `.env` file:
85
91
 
86
92
  ```bash
87
93
  export OSM_USERNAME=you
@@ -115,7 +121,24 @@ duckdb stats.duckdb -c "SELECT username, SUM(nodes_created) AS n
115
121
 
116
122
  Same schema in DuckDB and Postgres: `users`, `changesets`, `changeset_stats`, `state`.
117
123
 
118
- ### 5. Use it as a library
124
+ ### 5. Run the API
125
+
126
+ Push stats into Postgres, then start the Litestar API:
127
+
128
+ ```bash
129
+ osmsg --last day --format psql --psql-dsn "postgresql://user:pass@localhost/osmsg"
130
+ litestar --app api.app:app run --host 0.0.0.0 --port 8000
131
+ ```
132
+
133
+ ```text
134
+ GET /health
135
+ GET /api/v1/user-stats?start=2026-05-01T00:00:00Z&end=2026-05-02T00:00:00Z
136
+ GET /docs
137
+ ```
138
+
139
+ For self-hosting with Docker Compose and systemd, see [docs/infra.md](./docs/infra.md).
140
+
141
+ ### 6. Use it as a library
119
142
 
120
143
  ```python
121
144
  from datetime import datetime, UTC
@@ -132,7 +155,8 @@ print(result["files"]["parquet"])
132
155
 
133
156
  Same pipeline as the CLI.
134
157
 
135
- ### 6. Long flag lists? Use a config
158
+ ### 7. Long flag lists? Use a config
159
+
136
160
 
137
161
  ```bash
138
162
  osmsg --config nepal.yaml
@@ -142,12 +166,37 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
142
166
 
143
167
  ## Output formats
144
168
 
145
- Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
169
+ Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
170
+ `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
171
+
172
+ ## Configuration
173
+
174
+ Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
175
+ docker-compose `environment:` block all reach the same setting. CLI flag wins over env var.
176
+
177
+ | CLI flag | Env var | Default | Notes |
178
+ | --- | --- | --- | --- |
179
+ | `--name` | `OSMSG_NAME` | `stats` | Output basename; sets `<name>.duckdb`. |
180
+ | `--country` | `OSMSG_COUNTRY` | unset | Geofabrik region id(s). Comma-separated when set via env. |
181
+ | `--boundary` | `OSMSG_BOUNDARY` | unset | GeoJSON path or inline GeoJSON. |
182
+ | `--url` | `OSMSG_URL` | `minute` | `minute`/`hour`/`day` shortcut or full URL. Comma-separated when set via env. |
183
+ | `--workers` | `OSMSG_WORKERS` | cpu count | Parallel workers. |
184
+ | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
185
+ | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
186
+ | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
187
+ | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
188
+ | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
189
+ | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
190
+ | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
191
+ | OSM credentials (Geofabrik) | `OSM_USERNAME`, `OSM_PASSWORD` | unset | Required only when a Geofabrik URL is in use. |
192
+
193
+ A `.env` file at the working directory is loaded automatically.
146
194
 
147
195
  ## Documentation
148
196
 
149
197
  - [Installation](./docs/Installation.md)
150
198
  - [Manual](./docs/Manual.md) (every flag, with examples)
199
+ - [Self-hosting / Docker Compose](./docs/infra.md)
151
200
  - [Version control / release notes](./docs/Version_control.md)
152
201
 
153
202
  ## Contributing
@@ -162,7 +211,8 @@ uv run pre-commit install
162
211
  uv run pytest -m "not network"
163
212
  ```
164
213
 
165
- Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR. Use [Conventional Commits](https://www.conventionalcommits.org/) (`cz commit`).
214
+ Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR.
215
+ Use [Conventional Commits](https://www.conventionalcommits.org/) (`cz commit`).
166
216
 
167
217
  ## License
168
218
 
@@ -9,11 +9,12 @@
9
9
  [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
10
10
  [![Container](https://img.shields.io/badge/ghcr.io-osgeonepal%2Fosmsg-2496ED?logo=docker)](https://github.com/osgeonepal/osmsg/pkgs/container/osmsg)
11
11
 
12
- **OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts of nodes, ways, and relations created, modified, or deleted, written to parquet, csv, json, markdown, or Postgres.
12
+ **OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts
13
+ of nodes, ways, and relations created, modified, or deleted, written to parquet, csv, json, markdown, or Postgres.
13
14
 
14
15
  A Project of [OSGeo Nepal](https://osgeonepal.org).
15
16
 
16
- ## What you get
17
+ ## Features
17
18
 
18
19
  - Per-user create/modify/delete counts over any time window.
19
20
  - Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
@@ -26,11 +27,15 @@ A Project of [OSGeo Nepal](https://osgeonepal.org).
26
27
  Pick the one that fits how you work.
27
28
 
28
29
  ```bash
30
+ uvx --from osmsg osmsg --last hour # zero-install, one-shot run
29
31
  pip install osmsg # into your project
30
32
  uv tool install osmsg # standalone CLI
31
33
  docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last hour
32
34
  ```
33
35
 
36
+ `uvx` can run osmsg in a throwaway environment: no install, no virtualenv to manage. Works
37
+ with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
38
+
34
39
  ## Quick start
35
40
 
36
41
  ```bash
@@ -49,7 +54,8 @@ That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder
49
54
  osmsg --country nepal --last day
50
55
  ```
51
56
 
52
- `--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD` in your shell or a `.env` file:
57
+ `--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD`
58
+ in your shell or a `.env` file:
53
59
 
54
60
  ```bash
55
61
  export OSM_USERNAME=you
@@ -83,7 +89,24 @@ duckdb stats.duckdb -c "SELECT username, SUM(nodes_created) AS n
83
89
 
84
90
  Same schema in DuckDB and Postgres: `users`, `changesets`, `changeset_stats`, `state`.
85
91
 
86
- ### 5. Use it as a library
92
+ ### 5. Run the API
93
+
94
+ Push stats into Postgres, then start the Litestar API:
95
+
96
+ ```bash
97
+ osmsg --last day --format psql --psql-dsn "postgresql://user:pass@localhost/osmsg"
98
+ litestar --app api.app:app run --host 0.0.0.0 --port 8000
99
+ ```
100
+
101
+ ```text
102
+ GET /health
103
+ GET /api/v1/user-stats?start=2026-05-01T00:00:00Z&end=2026-05-02T00:00:00Z
104
+ GET /docs
105
+ ```
106
+
107
+ For self-hosting with Docker Compose and systemd, see [docs/infra.md](./docs/infra.md).
108
+
109
+ ### 6. Use it as a library
87
110
 
88
111
  ```python
89
112
  from datetime import datetime, UTC
@@ -100,7 +123,8 @@ print(result["files"]["parquet"])
100
123
 
101
124
  Same pipeline as the CLI.
102
125
 
103
- ### 6. Long flag lists? Use a config
126
+ ### 7. Long flag lists? Use a config
127
+
104
128
 
105
129
  ```bash
106
130
  osmsg --config nepal.yaml
@@ -110,12 +134,37 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
110
134
 
111
135
  ## Output formats
112
136
 
113
- Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
137
+ Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
138
+ `-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
139
+
140
+ ## Configuration
141
+
142
+ Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
143
+ docker-compose `environment:` block all reach the same setting. CLI flag wins over env var.
144
+
145
+ | CLI flag | Env var | Default | Notes |
146
+ | --- | --- | --- | --- |
147
+ | `--name` | `OSMSG_NAME` | `stats` | Output basename; sets `<name>.duckdb`. |
148
+ | `--country` | `OSMSG_COUNTRY` | unset | Geofabrik region id(s). Comma-separated when set via env. |
149
+ | `--boundary` | `OSMSG_BOUNDARY` | unset | GeoJSON path or inline GeoJSON. |
150
+ | `--url` | `OSMSG_URL` | `minute` | `minute`/`hour`/`day` shortcut or full URL. Comma-separated when set via env. |
151
+ | `--workers` | `OSMSG_WORKERS` | cpu count | Parallel workers. |
152
+ | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
153
+ | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
154
+ | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
155
+ | `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
156
+ | `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
157
+ | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
158
+ | (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
159
+ | OSM credentials (Geofabrik) | `OSM_USERNAME`, `OSM_PASSWORD` | unset | Required only when a Geofabrik URL is in use. |
160
+
161
+ A `.env` file at the working directory is loaded automatically.
114
162
 
115
163
  ## Documentation
116
164
 
117
165
  - [Installation](./docs/Installation.md)
118
166
  - [Manual](./docs/Manual.md) (every flag, with examples)
167
+ - [Self-hosting / Docker Compose](./docs/infra.md)
119
168
  - [Version control / release notes](./docs/Version_control.md)
120
169
 
121
170
  ## Contributing
@@ -130,7 +179,8 @@ uv run pre-commit install
130
179
  uv run pytest -m "not network"
131
180
  ```
132
181
 
133
- Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR. Use [Conventional Commits](https://www.conventionalcommits.org/) (`cz commit`).
182
+ Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR.
183
+ Use [Conventional Commits](https://www.conventionalcommits.org/) (`cz commit`).
134
184
 
135
185
  ## License
136
186
 
@@ -0,0 +1 @@
1
+ __version__ = "1.1.0"
@@ -0,0 +1,71 @@
1
+ """Worker tick: bootstrap on first run, --update thereafter."""
2
+
3
+ import fcntl
4
+ import os
5
+ import shlex
6
+ import subprocess
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from .db import connect, create_tables, get_state
11
+ from .geofabrik import country_update_url
12
+ from .replication import resolve_url
13
+
14
+
15
+ def _has_state(db_path: Path, source_url: str) -> bool:
16
+ if not db_path.exists():
17
+ return False
18
+ conn = connect(str(db_path))
19
+ create_tables(conn)
20
+ result = get_state(conn, source_url) is not None
21
+ conn.close()
22
+ return result
23
+
24
+
25
+ def _parse_arg(args: list[str], flag: str) -> str | None:
26
+ for i, arg in enumerate(args):
27
+ if arg == flag and i + 1 < len(args):
28
+ return args[i + 1]
29
+ return None
30
+
31
+
32
def main() -> int:
    """Run one worker tick and return the exit status of the ``osmsg`` command.

    Settings are read from the environment: ``OSMSG_EXTRA_ARGS`` (shell-split
    and passed through to ``osmsg``), ``OSMSG_BOOTSTRAP`` (default ``hour``)
    and ``OSMSG_BOOTSTRAP_DAYS``. When the database already has state for the
    resolved source URL the command gets ``--update``; otherwise it gets
    ``--days N`` or ``--last <period>`` to bootstrap. ``--all`` is appended
    unless ``--all`` or ``--keys`` is already present.

    Returns 0 without running anything when another tick holds the lock file.
    """
    extra_args = shlex.split(os.environ.get("OSMSG_EXTRA_ARGS", ""))
    bootstrap = os.environ.get("OSMSG_BOOTSTRAP", "hour")
    bootstrap_days = os.environ.get("OSMSG_BOOTSTRAP_DAYS")
    name = _parse_arg(extra_args, "--name") or "stats"
    out = Path(_parse_arg(extra_args, "--output-dir") or "/var/lib/osmsg")
    country = _parse_arg(extra_args, "--country")
    url = _parse_arg(extra_args, "--url") or "minute"

    out.mkdir(parents=True, exist_ok=True)

    lock_path = out / f"{name}.lock"
    lock_fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR, 0o644)
    try:
        try:
            # Non-blocking exclusive lock: overlapping ticks skip instead of queueing.
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            print("[osmsg-tick] previous tick still running, skipping", flush=True)
            return 0

        source_url = country_update_url(country) if country else resolve_url(url)
        db_path = out / f"{name}.duckdb"

        extra_set = set(extra_args)
        cmd = ["osmsg"] + extra_args
        if not (extra_set & {"--all", "--keys"}):
            cmd.append("--all")

        if _has_state(db_path, source_url):
            cmd.append("--update")
        elif bootstrap_days:
            cmd.extend(["--days", bootstrap_days])
        else:
            cmd.extend(["--last", bootstrap])

        # NOTE(review): this echoes every token from OSMSG_EXTRA_ARGS, which
        # may include a --psql-dsn value; confirm the tick log is not exposed.
        print(f"[osmsg-tick] {' '.join(cmd)}", flush=True)
        return subprocess.call(cmd)
    finally:
        # Closing the descriptor drops the flock. Without this, a second
        # main() call in the same process would contend with the still-open
        # descriptor from the first call and always report "still running".
        os.close(lock_fd)
68
+
69
+
70
+ if __name__ == "__main__":
71
+ sys.exit(main())
@@ -0,0 +1,37 @@
1
+ """Boundary GeoJSON parsing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from shapely.geometry import MultiPolygon, Polygon, shape
10
+ from shapely.geometry.base import BaseGeometry
11
+
12
+ from .exceptions import UnknownRegionError
13
+ from .geofabrik import country_geometry
14
+
15
+
16
def load_boundary(input_data: str) -> BaseGeometry:
    """Resolve a ``--boundary`` value to a Polygon or MultiPolygon geometry.

    *input_data* is tried, in order, as inline JSON, as the path of an
    existing JSON file, and finally as a region name handed to
    ``country_geometry``. A JSON object with a ``geometry`` member is
    unwrapped to that member; any other object is treated as the geometry
    itself.

    Raises:
        ValueError: if *input_data* is none of the three accepted forms, if
            the JSON is not an object, or if its geometry is not a Polygon
            or MultiPolygon.
    """
    try:
        payload: Any = json.loads(input_data)
    except json.JSONDecodeError:
        path = Path(input_data)
        if path.is_file():
            # GeoJSON is UTF-8 by specification; do not depend on the locale encoding.
            payload = json.loads(path.read_text(encoding="utf-8"))
        else:
            try:
                return country_geometry(input_data)
            except UnknownRegionError:
                raise ValueError(
                    f"--boundary {input_data!r} is not valid JSON, a file path, or a known Geofabrik region name."
                ) from None

    # Valid JSON need not be an object ("[1, 2]", "123", "null"); reject those
    # with the same ValueError instead of failing on .get()/"in" below.
    if not isinstance(payload, dict):
        raise ValueError("Boundary must be a Polygon or MultiPolygon GeoJSON.")

    geometry = payload.get("geometry") if "geometry" in payload else payload
    if not isinstance(geometry, dict) or geometry.get("type") not in ("Polygon", "MultiPolygon"):
        raise ValueError("Boundary must be a Polygon or MultiPolygon GeoJSON.")
    geom = shape(geometry)
    if isinstance(geom, (Polygon, MultiPolygon)):
        return geom
    raise ValueError(f"Unexpected geometry type: {type(geom).__name__}")
@@ -111,18 +111,29 @@ def main(
111
111
  bool | None,
112
112
  typer.Option("--version", callback=_version_callback, is_eager=True, help="Print version and exit."),
113
113
  ] = None,
114
- name: Annotated[str, typer.Option(help="Output basename. Writes <name>.duckdb + selected formats.")] = "stats",
114
+ name: Annotated[
115
+ str,
116
+ typer.Option(envvar="OSMSG_NAME", help="Output basename. Writes <name>.duckdb + selected formats."),
117
+ ] = "stats",
115
118
  start: Annotated[str | None, typer.Option(help="ISO start (UTC). 'YYYY-MM-DD HH:MM:SS'.")] = None,
116
119
  end: Annotated[str | None, typer.Option(help="ISO end (UTC). Defaults to now.")] = None,
117
120
  last: Annotated[Period | None, typer.Option(help="Convenience: hour|day|week|month|year.")] = None,
118
121
  days: Annotated[int | None, typer.Option(help="Last N days (mutually exclusive with --last).")] = None,
119
122
  country: Annotated[
120
123
  list[str] | None,
121
- typer.Option("--country", help="Geofabrik region id(s); resolved live. Requires OSM credentials."),
124
+ typer.Option(
125
+ "--country",
126
+ envvar="OSMSG_COUNTRY",
127
+ help="Geofabrik region id(s); resolved live. Requires OSM credentials. Comma-separated when set via env.",
128
+ ),
122
129
  ] = None,
123
130
  url: Annotated[
124
131
  list[str] | None,
125
- typer.Option("--url", help="Replication URL(s). Shortcuts: minute, hour, day."),
132
+ typer.Option(
133
+ "--url",
134
+ envvar="OSMSG_URL",
135
+ help="Replication URL(s). Shortcuts: minute, hour, day. Comma-separated when set via env.",
136
+ ),
126
137
  ] = None,
127
138
  hashtags: Annotated[
128
139
  list[str] | None,
@@ -134,29 +145,54 @@ def main(
134
145
  list[str] | None,
135
146
  typer.Option("--users", help="Filter to OSM usernames (case-sensitive, exact match). Repeat for more."),
136
147
  ] = None,
137
- workers: Annotated[int | None, typer.Option(help="Parallel workers (default: cpu count).")] = None,
148
+ workers: Annotated[
149
+ int | None,
150
+ typer.Option(envvar="OSMSG_WORKERS", help="Parallel workers (default: cpu count)."),
151
+ ] = None,
138
152
  rows: Annotated[
139
153
  int | None,
140
154
  typer.Option(help="Cap rows shown in the console table. Files always carry the full set."),
141
155
  ] = None,
142
- boundary: Annotated[str | None, typer.Option(help="Path to GeoJSON or inline geojson string.")] = None,
143
- formats: Annotated[list[Format] | None, typer.Option("--format", "-f", help="One or more output formats.")] = None,
156
+ boundary: Annotated[
157
+ str | None,
158
+ typer.Option(
159
+ envvar="OSMSG_BOUNDARY",
160
+ help="Boundary filter: Geofabrik region name (e.g. 'nepal'), GeoJSON file path, or inline GeoJSON.",
161
+ ),
162
+ ] = None,
163
+ formats: Annotated[
164
+ list[Format] | None,
165
+ typer.Option(
166
+ "--format",
167
+ "-f",
168
+ envvar="OSMSG_FORMAT",
169
+ help="One or more output formats. Comma-separated when set via env.",
170
+ ),
171
+ ] = None,
144
172
  summary: Annotated[bool, typer.Option(help="Also write <name>_summary.parquet + summary.md.")] = False,
145
173
  changeset: Annotated[bool, typer.Option(hidden=True)] = False,
146
- all_tags: Annotated[bool, typer.Option("--all-tags", help="Track every tag key.")] = False,
147
- key_value: Annotated[bool, typer.Option("--key-value", help="Store key=value combos. Implies --all-tags.")] = False,
174
+ all_stats: Annotated[
175
+ bool,
176
+ typer.Option(
177
+ "--all",
178
+ help="Collect all tag key=value stats and changeset metadata (hashtags, editors).",
179
+ ),
180
+ ] = False,
181
+ keys_only: Annotated[bool, typer.Option("--keys", help="Collect tag key stats only (no value breakdown).")] = False,
148
182
  exact_lookup: Annotated[
149
183
  bool, typer.Option("--exact-lookup", help="Hashtag whole-word match. Only meaningful with --hashtags.")
150
184
  ] = False,
151
185
  tm_stats: Annotated[bool, typer.Option("--tm-stats", help="Attach Tasking Manager totals.")] = False,
152
186
  update: Annotated[bool, typer.Option(help="Append to existing <name>.duckdb.")] = False,
153
187
  cache_dir: Annotated[
154
- Path, typer.Option("--cache-dir", help="Cache dir for downloaded OSM files.")
188
+ Path,
189
+ typer.Option("--cache-dir", envvar="OSMSG_CACHE_DIR", help="Cache dir for downloaded OSM files."),
155
190
  ] = DEFAULT_CACHE_DIR,
156
191
  output_dir: Annotated[
157
192
  Path,
158
193
  typer.Option(
159
194
  "--output-dir",
195
+ envvar="OSMSG_OUTPUT_DIR",
160
196
  help="Where to write <name>.duckdb + selected formats. Defaults to current directory.",
161
197
  ),
162
198
  ] = Path("."),
@@ -175,7 +211,21 @@ def main(
175
211
  help="Read OSM password from stdin (one line). Else $OSM_PASSWORD, then prompt.",
176
212
  ),
177
213
  ] = False,
178
- psql_dsn: Annotated[str | None, typer.Option("--psql-dsn", help="libpq DSN for --format psql.")] = None,
214
+ psql_dsn: Annotated[
215
+ str | None,
216
+ typer.Option("--psql-dsn", envvar="OSMSG_PSQL_DSN", help="libpq DSN for --format psql."),
217
+ ] = None,
218
+ changeset_pad_hours: Annotated[
219
+ int,
220
+ typer.Option(
221
+ "--changeset-pad-hours",
222
+ envvar="OSMSG_CHANGESET_PAD_HOURS",
223
+ help="Backward pad (hours) on first runs of changeset replication. "
224
+ "Set to 24 to capture long-running open changesets. --update runs skip the pad.",
225
+ min=0,
226
+ max=48,
227
+ ),
228
+ ] = 1,
179
229
  ) -> None:
180
230
  """Run osmsg."""
181
231
  if formats is None:
@@ -194,13 +244,13 @@ def main(
194
244
  end_date=_parse_dt(end),
195
245
  countries=country,
196
246
  urls=url or ["minute"],
247
+ url_explicit=url is not None,
197
248
  workers=workers,
198
249
  additional_tags=tags,
199
250
  hashtags=hashtags,
200
251
  length_tags=length,
201
252
  users_filter=users,
202
- all_tags=all_tags or key_value,
203
- key_value=key_value,
253
+ tag_mode="all" if all_stats else ("keys" if keys_only else "none"),
204
254
  exact_lookup=exact_lookup,
205
255
  changeset=changeset,
206
256
  summary=summary,
@@ -214,6 +264,7 @@ def main(
214
264
  osm_username=username,
215
265
  osm_password=_read_password_stdin() if password_stdin else None,
216
266
  psql_dsn=psql_dsn,
267
+ changeset_pad_hours=changeset_pad_hours,
217
268
  )
218
269
 
219
270
  if last is not None:
@@ -0,0 +1,41 @@
1
+ # No FKs: DuckDB rejects UPDATE on FK-referenced LIST/GEOMETRY columns, which would block changeset upgrades.
2
+ DUCKDB_SCHEMA = """
3
+ CREATE TABLE IF NOT EXISTS users (
4
+ uid BIGINT PRIMARY KEY,
5
+ username VARCHAR NOT NULL
6
+ );
7
+ CREATE TABLE IF NOT EXISTS changesets (
8
+ changeset_id BIGINT PRIMARY KEY,
9
+ uid BIGINT NOT NULL,
10
+ created_at TIMESTAMPTZ,
11
+ hashtags VARCHAR[],
12
+ editor VARCHAR,
13
+ geom GEOMETRY
14
+ );
15
+ CREATE INDEX IF NOT EXISTS idx_changesets_created_at ON changesets(created_at);
16
+ CREATE TABLE IF NOT EXISTS changeset_stats (
17
+ changeset_id BIGINT NOT NULL,
18
+ seq_id BIGINT NOT NULL,
19
+ uid BIGINT NOT NULL,
20
+ nodes_created INTEGER DEFAULT 0,
21
+ nodes_modified INTEGER DEFAULT 0,
22
+ nodes_deleted INTEGER DEFAULT 0,
23
+ ways_created INTEGER DEFAULT 0,
24
+ ways_modified INTEGER DEFAULT 0,
25
+ ways_deleted INTEGER DEFAULT 0,
26
+ rels_created INTEGER DEFAULT 0,
27
+ rels_modified INTEGER DEFAULT 0,
28
+ rels_deleted INTEGER DEFAULT 0,
29
+ poi_created INTEGER DEFAULT 0,
30
+ poi_modified INTEGER DEFAULT 0,
31
+ tag_stats JSON,
32
+ PRIMARY KEY (seq_id, changeset_id)
33
+ );
34
+ CREATE INDEX IF NOT EXISTS idx_changeset_stats_uid ON changeset_stats(uid);
35
+ CREATE TABLE IF NOT EXISTS state (
36
+ source_url VARCHAR PRIMARY KEY,
37
+ last_seq BIGINT NOT NULL,
38
+ last_ts TIMESTAMPTZ NOT NULL,
39
+ updated_at TIMESTAMPTZ NOT NULL
40
+ );
41
+ """
@@ -114,14 +114,44 @@ def merge_parquet_files(conn: duckdb.DuckDBPyConnection, parquet_dir: Path, *, c
114
114
  if any(parquet_dir.glob("temp_*_users_*.parquet")):
115
115
  conn.execute(f"INSERT OR IGNORE INTO users SELECT uid, username FROM read_parquet('{pattern('users')}')")
116
116
  if any(parquet_dir.glob("temp_*_changesets_*.parquet")):
117
+ conn.execute("INSTALL spatial")
118
+ conn.execute("LOAD spatial")
117
119
  conn.execute(
118
120
  f"""
119
121
  INSERT OR IGNORE INTO changesets
120
122
  SELECT changeset_id, uid, created_at, hashtags, editor,
121
- min_lon, min_lat, max_lon, max_lat
123
+ CASE WHEN min_lon IS NOT NULL
124
+ THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat)
125
+ END
122
126
  FROM read_parquet('{pattern("changesets")}')
123
127
  """
124
128
  )
129
+ # Newer non-NULL wins; dedupe src so multiple emits per window don't trip the PK on UPDATE.
130
+ conn.execute(
131
+ f"""
132
+ UPDATE changesets c
133
+ SET created_at = COALESCE(src.created_at, c.created_at),
134
+ hashtags = COALESCE(src.hashtags, c.hashtags),
135
+ editor = COALESCE(src.editor, c.editor),
136
+ geom = COALESCE(src.geom, c.geom)
137
+ FROM (
138
+ SELECT DISTINCT ON (changeset_id)
139
+ changeset_id, created_at, hashtags, editor,
140
+ CASE WHEN min_lon IS NOT NULL
141
+ THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat)
142
+ END AS geom
143
+ FROM read_parquet('{pattern("changesets")}')
144
+ ORDER BY changeset_id,
145
+ (min_lon IS NOT NULL) DESC,
146
+ (editor IS NOT NULL) DESC,
147
+ (hashtags IS NOT NULL) DESC,
148
+ created_at DESC NULLS LAST
149
+ ) src
150
+ WHERE c.changeset_id = src.changeset_id
151
+ AND (src.created_at IS NOT NULL OR src.hashtags IS NOT NULL
152
+ OR src.editor IS NOT NULL OR src.geom IS NOT NULL)
153
+ """
154
+ )
125
155
  if any(parquet_dir.glob("temp_*_changeset_stats_*.parquet")):
126
156
  conn.execute(
127
157
  f"""