osmsg 1.0.2__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of osmsg might be problematic. Click here for more details.
- {osmsg-1.0.2 → osmsg-1.1.0}/PKG-INFO +58 -8
- {osmsg-1.0.2 → osmsg-1.1.0}/README.md +57 -7
- osmsg-1.1.0/osmsg/__version__.py +1 -0
- osmsg-1.1.0/osmsg/_tick.py +71 -0
- osmsg-1.1.0/osmsg/boundary.py +37 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/cli.py +63 -12
- osmsg-1.1.0/osmsg/db/duckdb_schema.py +41 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/db/ingest.py +31 -1
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/db/queries.py +13 -18
- osmsg-1.1.0/osmsg/db/schema.py +51 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/markdown.py +28 -3
- osmsg-1.1.0/osmsg/export/psql.py +69 -0
- osmsg-1.1.0/osmsg/geofabrik.py +65 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/handlers.py +11 -5
- osmsg-1.1.0/osmsg/pg_schema.py +42 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/pipeline.py +193 -44
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/replication.py +51 -32
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/ui.py +2 -2
- {osmsg-1.0.2 → osmsg-1.1.0}/pyproject.toml +6 -2
- osmsg-1.0.2/osmsg/__version__.py +0 -1
- osmsg-1.0.2/osmsg/boundary.py +0 -37
- osmsg-1.0.2/osmsg/db/schema.py +0 -111
- osmsg-1.0.2/osmsg/export/psql.py +0 -89
- osmsg-1.0.2/osmsg/geofabrik.py +0 -41
- {osmsg-1.0.2 → osmsg-1.1.0}/LICENSE +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/__init__.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/_http.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/auth.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/db/__init__.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/exceptions.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/__init__.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/csv.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/json.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/export/parquet.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/fetch.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/models.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/py.typed +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/tm.py +0 -0
- {osmsg-1.0.2 → osmsg-1.1.0}/osmsg/workers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: osmsg
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: OpenStreetMap Stats Generator: Commandline
|
|
5
5
|
Keywords: osm,stats,commandline,openstreetmap
|
|
6
6
|
Author: Kshitij Raj Sharma
|
|
@@ -41,11 +41,12 @@ Description-Content-Type: text/markdown
|
|
|
41
41
|
[](https://github.com/astral-sh/uv)
|
|
42
42
|
[](https://github.com/osgeonepal/osmsg/pkgs/container/osmsg)
|
|
43
43
|
|
|
44
|
-
**OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts
|
|
44
|
+
**OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts
|
|
45
|
+
of nodes, ways, and relations created, modified, or deleted, written to parquet, csv, json, markdown, or Postgres.
|
|
45
46
|
|
|
46
47
|
A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
47
48
|
|
|
48
|
-
##
|
|
49
|
+
## Features
|
|
49
50
|
|
|
50
51
|
- Per-user create/modify/delete counts over any time window.
|
|
51
52
|
- Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
|
|
@@ -58,11 +59,15 @@ A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
|
58
59
|
Pick the one that fits how you work.
|
|
59
60
|
|
|
60
61
|
```bash
|
|
62
|
+
uvx --from osmsg osmsg --last hour # zero-install, one-shot run
|
|
61
63
|
pip install osmsg # into your project
|
|
62
64
|
uv tool install osmsg # standalone CLI
|
|
63
65
|
docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last hour
|
|
64
66
|
```
|
|
65
67
|
|
|
68
|
+
`uvx` can run osmsg in a throwaway environment: no install, no virtualenv to manage. Works
|
|
69
|
+
with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
|
|
70
|
+
|
|
66
71
|
## Quick start
|
|
67
72
|
|
|
68
73
|
```bash
|
|
@@ -81,7 +86,8 @@ That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder
|
|
|
81
86
|
osmsg --country nepal --last day
|
|
82
87
|
```
|
|
83
88
|
|
|
84
|
-
`--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD`
|
|
89
|
+
`--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD`
|
|
90
|
+
in your shell or a `.env` file:
|
|
85
91
|
|
|
86
92
|
```bash
|
|
87
93
|
export OSM_USERNAME=you
|
|
@@ -115,7 +121,24 @@ duckdb stats.duckdb -c "SELECT username, SUM(nodes_created) AS n
|
|
|
115
121
|
|
|
116
122
|
Same schema in DuckDB and Postgres: `users`, `changesets`, `changeset_stats`, `state`.
|
|
117
123
|
|
|
118
|
-
### 5.
|
|
124
|
+
### 5. Run the API
|
|
125
|
+
|
|
126
|
+
Push stats into Postgres, then start the Litestar API:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
osmsg --last day --format psql --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
130
|
+
litestar --app api.app:app run --host 0.0.0.0 --port 8000
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
```text
|
|
134
|
+
GET /health
|
|
135
|
+
GET /api/v1/user-stats?start=2026-05-01T00:00:00Z&end=2026-05-02T00:00:00Z
|
|
136
|
+
GET /docs
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
For self-hosting with Docker Compose and systemd, see [docs/infra.md](./docs/infra.md).
|
|
140
|
+
|
|
141
|
+
### 6. Use it as a library
|
|
119
142
|
|
|
120
143
|
```python
|
|
121
144
|
from datetime import datetime, UTC
|
|
@@ -132,7 +155,8 @@ print(result["files"]["parquet"])
|
|
|
132
155
|
|
|
133
156
|
Same pipeline as the CLI.
|
|
134
157
|
|
|
135
|
-
###
|
|
158
|
+
### 7. Long flag lists? Use a config
|
|
159
|
+
|
|
136
160
|
|
|
137
161
|
```bash
|
|
138
162
|
osmsg --config nepal.yaml
|
|
@@ -142,12 +166,37 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
|
|
|
142
166
|
|
|
143
167
|
## Output formats
|
|
144
168
|
|
|
145
|
-
Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
|
|
169
|
+
Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
|
|
170
|
+
`-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
|
|
171
|
+
|
|
172
|
+
## Configuration
|
|
173
|
+
|
|
174
|
+
Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
|
|
175
|
+
docker-compose `environment:` block all reach the same setting. CLI flag wins over env var.
|
|
176
|
+
|
|
177
|
+
| CLI flag | Env var | Default | Notes |
|
|
178
|
+
| --- | --- | --- | --- |
|
|
179
|
+
| `--name` | `OSMSG_NAME` | `stats` | Output basename; sets `<name>.duckdb`. |
|
|
180
|
+
| `--country` | `OSMSG_COUNTRY` | unset | Geofabrik region id(s). Comma-separated when set via env. |
|
|
181
|
+
| `--boundary` | `OSMSG_BOUNDARY` | unset | GeoJSON path or inline GeoJSON. |
|
|
182
|
+
| `--url` | `OSMSG_URL` | `minute` | `minute`/`hour`/`day` shortcut or full URL. Comma-separated when set via env. |
|
|
183
|
+
| `--workers` | `OSMSG_WORKERS` | cpu count | Parallel workers. |
|
|
184
|
+
| `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
|
|
185
|
+
| `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
|
|
186
|
+
| `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
|
|
187
|
+
| `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
|
|
188
|
+
| `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
|
|
189
|
+
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
|
|
190
|
+
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
|
|
191
|
+
| OSM credentials (Geofabrik) | `OSM_USERNAME`, `OSM_PASSWORD` | unset | Required only when a Geofabrik URL is in use. |
|
|
192
|
+
|
|
193
|
+
A `.env` file at the working directory is loaded automatically.
|
|
146
194
|
|
|
147
195
|
## Documentation
|
|
148
196
|
|
|
149
197
|
- [Installation](./docs/Installation.md)
|
|
150
198
|
- [Manual](./docs/Manual.md) (every flag, with examples)
|
|
199
|
+
- [Self-hosting / Docker Compose](./docs/infra.md)
|
|
151
200
|
- [Version control / release notes](./docs/Version_control.md)
|
|
152
201
|
|
|
153
202
|
## Contributing
|
|
@@ -162,7 +211,8 @@ uv run pre-commit install
|
|
|
162
211
|
uv run pytest -m "not network"
|
|
163
212
|
```
|
|
164
213
|
|
|
165
|
-
Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR.
|
|
214
|
+
Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR.
|
|
215
|
+
Use [Conventional Commits](https://www.conventionalcommits.org/) (`cz commit`).
|
|
166
216
|
|
|
167
217
|
## License
|
|
168
218
|
|
|
@@ -9,11 +9,12 @@
|
|
|
9
9
|
[](https://github.com/astral-sh/uv)
|
|
10
10
|
[](https://github.com/osgeonepal/osmsg/pkgs/container/osmsg)
|
|
11
11
|
|
|
12
|
-
**OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts
|
|
12
|
+
**OpenStreetMap Stats Generator.** A tiny CLI (and Python library) that turns OSM history into per-user counts
|
|
13
|
+
of nodes, ways, and relations created, modified, or deleted, written to parquet, csv, json, markdown, or Postgres.
|
|
13
14
|
|
|
14
15
|
A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
15
16
|
|
|
16
|
-
##
|
|
17
|
+
## Features
|
|
17
18
|
|
|
18
19
|
- Per-user create/modify/delete counts over any time window.
|
|
19
20
|
- Tag and hashtag breakdowns (e.g. `building`, `#hotosm`).
|
|
@@ -26,11 +27,15 @@ A Project of [OSGeo Nepal](https://osgeonepal.org).
|
|
|
26
27
|
Pick the one that fits how you work.
|
|
27
28
|
|
|
28
29
|
```bash
|
|
30
|
+
uvx --from osmsg osmsg --last hour # zero-install, one-shot run
|
|
29
31
|
pip install osmsg # into your project
|
|
30
32
|
uv tool install osmsg # standalone CLI
|
|
31
33
|
docker run --rm -v "$PWD:/work" -w /work ghcr.io/osgeonepal/osmsg:latest --last hour
|
|
32
34
|
```
|
|
33
35
|
|
|
36
|
+
`uvx` can run osmsg in a throwaway environment: no install, no virtualenv to manage. Works
|
|
37
|
+
with any flag combination, e.g. `uvx --from osmsg osmsg --last hour --tags building --summary -f parquet -f markdown`.
|
|
38
|
+
|
|
34
39
|
## Quick start
|
|
35
40
|
|
|
36
41
|
```bash
|
|
@@ -49,7 +54,8 @@ That's it. A `stats.duckdb` and a `stats.parquet` show up in your current folder
|
|
|
49
54
|
osmsg --country nepal --last day
|
|
50
55
|
```
|
|
51
56
|
|
|
52
|
-
`--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD`
|
|
57
|
+
`--country` resolves through Geofabrik and needs an OSM account. Set `OSM_USERNAME` and `OSM_PASSWORD`
|
|
58
|
+
in your shell or a `.env` file:
|
|
53
59
|
|
|
54
60
|
```bash
|
|
55
61
|
export OSM_USERNAME=you
|
|
@@ -83,7 +89,24 @@ duckdb stats.duckdb -c "SELECT username, SUM(nodes_created) AS n
|
|
|
83
89
|
|
|
84
90
|
Same schema in DuckDB and Postgres: `users`, `changesets`, `changeset_stats`, `state`.
|
|
85
91
|
|
|
86
|
-
### 5.
|
|
92
|
+
### 5. Run the API
|
|
93
|
+
|
|
94
|
+
Push stats into Postgres, then start the Litestar API:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
osmsg --last day --format psql --psql-dsn "postgresql://user:pass@localhost/osmsg"
|
|
98
|
+
litestar --app api.app:app run --host 0.0.0.0 --port 8000
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
```text
|
|
102
|
+
GET /health
|
|
103
|
+
GET /api/v1/user-stats?start=2026-05-01T00:00:00Z&end=2026-05-02T00:00:00Z
|
|
104
|
+
GET /docs
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
For self-hosting with Docker Compose and systemd, see [docs/infra.md](./docs/infra.md).
|
|
108
|
+
|
|
109
|
+
### 6. Use it as a library
|
|
87
110
|
|
|
88
111
|
```python
|
|
89
112
|
from datetime import datetime, UTC
|
|
@@ -100,7 +123,8 @@ print(result["files"]["parquet"])
|
|
|
100
123
|
|
|
101
124
|
Same pipeline as the CLI.
|
|
102
125
|
|
|
103
|
-
###
|
|
126
|
+
### 7. Long flag lists? Use a config
|
|
127
|
+
|
|
104
128
|
|
|
105
129
|
```bash
|
|
106
130
|
osmsg --config nepal.yaml
|
|
@@ -110,12 +134,37 @@ Any flag works as a YAML key. See [docs/Manual.md](./docs/Manual.md) for the ful
|
|
|
110
134
|
|
|
111
135
|
## Output formats
|
|
112
136
|
|
|
113
|
-
Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
|
|
137
|
+
Every run writes `stats.duckdb` (or `<--name>.duckdb`) plus the formats you ask for via
|
|
138
|
+
`-f parquet|csv|json|markdown|psql`. Parquet is the default. Open it with duckdb, polars, pandas, anything.
|
|
139
|
+
|
|
140
|
+
## Configuration
|
|
141
|
+
|
|
142
|
+
Every meaningful flag has a matching `OSMSG_*` env var so the CLI, a `.env` file, and a
|
|
143
|
+
docker-compose `environment:` block all reach the same setting. CLI flag wins over env var.
|
|
144
|
+
|
|
145
|
+
| CLI flag | Env var | Default | Notes |
|
|
146
|
+
| --- | --- | --- | --- |
|
|
147
|
+
| `--name` | `OSMSG_NAME` | `stats` | Output basename; sets `<name>.duckdb`. |
|
|
148
|
+
| `--country` | `OSMSG_COUNTRY` | unset | Geofabrik region id(s). Comma-separated when set via env. |
|
|
149
|
+
| `--boundary` | `OSMSG_BOUNDARY` | unset | GeoJSON path or inline GeoJSON. |
|
|
150
|
+
| `--url` | `OSMSG_URL` | `minute` | `minute`/`hour`/`day` shortcut or full URL. Comma-separated when set via env. |
|
|
151
|
+
| `--workers` | `OSMSG_WORKERS` | cpu count | Parallel workers. |
|
|
152
|
+
| `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
|
|
153
|
+
| `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
|
|
154
|
+
| `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
|
|
155
|
+
| `--psql-dsn` | `OSMSG_PSQL_DSN` | unset | libpq DSN for `-f psql`. |
|
|
156
|
+
| `--changeset-pad-hours` | `OSMSG_CHANGESET_PAD_HOURS` | `1` | See below. |
|
|
157
|
+
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP` | `hour` | `hour`, `day`, or `week`. Used when `--update` runs against an empty DB. |
|
|
158
|
+
| (auto-bootstrap on `--update`) | `OSMSG_BOOTSTRAP_DAYS` | unset | Integer N; overrides `OSMSG_BOOTSTRAP`. |
|
|
159
|
+
| OSM credentials (Geofabrik) | `OSM_USERNAME`, `OSM_PASSWORD` | unset | Required only when a Geofabrik URL is in use. |
|
|
160
|
+
|
|
161
|
+
A `.env` file at the working directory is loaded automatically.
|
|
114
162
|
|
|
115
163
|
## Documentation
|
|
116
164
|
|
|
117
165
|
- [Installation](./docs/Installation.md)
|
|
118
166
|
- [Manual](./docs/Manual.md) (every flag, with examples)
|
|
167
|
+
- [Self-hosting / Docker Compose](./docs/infra.md)
|
|
119
168
|
- [Version control / release notes](./docs/Version_control.md)
|
|
120
169
|
|
|
121
170
|
## Contributing
|
|
@@ -130,7 +179,8 @@ uv run pre-commit install
|
|
|
130
179
|
uv run pytest -m "not network"
|
|
131
180
|
```
|
|
132
181
|
|
|
133
|
-
Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR.
|
|
182
|
+
Please read [CONTRIBUTING.md](./CONTRIBUTING.md) and the [Code of Conduct](./CODE_OF_CONDUCT.md) before opening a PR.
|
|
183
|
+
Use [Conventional Commits](https://www.conventionalcommits.org/) (`cz commit`).
|
|
134
184
|
|
|
135
185
|
## License
|
|
136
186
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.0"
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Worker tick: bootstrap on first run, --update thereafter."""
|
|
2
|
+
|
|
3
|
+
import fcntl
|
|
4
|
+
import os
|
|
5
|
+
import shlex
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from .db import connect, create_tables, get_state
|
|
11
|
+
from .geofabrik import country_update_url
|
|
12
|
+
from .replication import resolve_url
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _has_state(db_path: Path, source_url: str) -> bool:
    """Return True when ``db_path`` exists and holds a state row for ``source_url``.

    A missing database file is reported as "no state" without opening a
    connection. Otherwise the schema is ensured with ``create_tables`` and the
    ``state`` lookup is delegated to ``get_state``.

    Args:
        db_path: Path of the ``<name>.duckdb`` file to inspect.
        source_url: Replication source URL passed to ``get_state``.

    Returns:
        True if ``get_state`` returns a non-None value for ``source_url``.
    """
    if not db_path.exists():
        return False
    conn = connect(str(db_path))
    try:
        create_tables(conn)
        return get_state(conn, source_url) is not None
    finally:
        # Always release the connection, even when schema creation or the
        # state lookup raises; the spawned osmsg process opens the same file.
        conn.close()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _parse_arg(args: list[str], flag: str) -> str | None:
|
|
26
|
+
for i, arg in enumerate(args):
|
|
27
|
+
if arg == flag and i + 1 < len(args):
|
|
28
|
+
return args[i + 1]
|
|
29
|
+
return None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def main() -> int:
    """Run one worker tick: bootstrap on the first run, ``--update`` afterwards.

    Configuration is read from the environment:

    * ``OSMSG_EXTRA_ARGS``: extra CLI arguments, shell-tokenised with ``shlex``.
    * ``OSMSG_BOOTSTRAP``: period passed to ``--last`` on a first run
      (defaults to ``hour``).
    * ``OSMSG_BOOTSTRAP_DAYS``: when set, passed to ``--days`` instead.
    * ``OSMSG_NAME`` / ``OSMSG_OUTPUT_DIR``: used when ``--name`` /
      ``--output-dir`` are not present in ``OSMSG_EXTRA_ARGS``, so the tick
      inspects the same database the spawned CLI (which reads these variables
      itself) will write.

    An exclusive, non-blocking ``flock`` on ``<output-dir>/<name>.lock`` keeps
    ticks from overlapping; if the lock is held the tick is skipped.

    Returns:
        0 when the tick is skipped because another one is running, otherwise
        the exit status of the spawned ``osmsg`` process.
    """
    extra_args = shlex.split(os.environ.get("OSMSG_EXTRA_ARGS", ""))
    bootstrap = os.environ.get("OSMSG_BOOTSTRAP", "hour")
    bootstrap_days = os.environ.get("OSMSG_BOOTSTRAP_DAYS")
    name = _parse_arg(extra_args, "--name") or os.environ.get("OSMSG_NAME") or "stats"
    # NOTE(review): when neither the flag nor the env var is set the tick looks in
    # /var/lib/osmsg but does not pass --output-dir to the CLI; confirm the
    # deployment's working directory makes both agree.
    out = Path(_parse_arg(extra_args, "--output-dir") or os.environ.get("OSMSG_OUTPUT_DIR") or "/var/lib/osmsg")
    # NOTE(review): OSMSG_COUNTRY / OSMSG_URL set only via the environment are not
    # considered here; confirm whether the state lookup should honour them too.
    country = _parse_arg(extra_args, "--country")
    url = _parse_arg(extra_args, "--url") or "minute"

    out.mkdir(parents=True, exist_ok=True)

    lock_path = out / f"{name}.lock"
    lock_fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR, 0o644)
    try:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            print("[osmsg-tick] previous tick still running, skipping", flush=True)
            return 0

        source_url = country_update_url(country) if country else resolve_url(url)
        db_path = out / f"{name}.duckdb"

        cmd = ["osmsg", *extra_args]
        # Default to full tag collection unless the caller chose a tag mode.
        if not ({"--all", "--keys"} & set(extra_args)):
            cmd.append("--all")

        if _has_state(db_path, source_url):
            cmd.append("--update")
        elif bootstrap_days:
            cmd.extend(["--days", bootstrap_days])
        else:
            cmd.extend(["--last", bootstrap])

        # shlex.join keeps arguments containing spaces (e.g. inline GeoJSON)
        # unambiguous and copy-pasteable in the log line.
        print(f"[osmsg-tick] {shlex.join(cmd)}", flush=True)
        # The lock stays held for the whole child run and is released below.
        return subprocess.call(cmd)
    finally:
        # Closing the descriptor releases the flock and avoids leaking the fd
        # on the skip path or when URL resolution / the state check raises.
        os.close(lock_fd)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
if __name__ == "__main__":
|
|
71
|
+
sys.exit(main())
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Boundary GeoJSON parsing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from shapely.geometry import MultiPolygon, Polygon, shape
|
|
10
|
+
from shapely.geometry.base import BaseGeometry
|
|
11
|
+
|
|
12
|
+
from .exceptions import UnknownRegionError
|
|
13
|
+
from .geofabrik import country_geometry
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_boundary(input_data: str) -> BaseGeometry:
    """Resolve a ``--boundary`` value to a Polygon or MultiPolygon geometry.

    ``input_data`` is tried, in order, as inline GeoJSON, as a path to a
    GeoJSON file, and finally as a Geofabrik region name via
    ``country_geometry``. A GeoJSON Feature is unwrapped to its ``geometry``
    member; a bare geometry object is used as-is.

    Args:
        input_data: Inline GeoJSON text, a file path, or a region name.

    Returns:
        The parsed shapely geometry (or whatever ``country_geometry`` returns
        for a region name).

    Raises:
        ValueError: If the input is none of the accepted forms, or the GeoJSON
            is not a Polygon/MultiPolygon geometry object.
    """
    try:
        payload: Any = json.loads(input_data)
    except json.JSONDecodeError:
        path = Path(input_data)
        try:
            is_file = path.is_file()
        except OSError:
            # A long malformed inline GeoJSON string can exceed the OS
            # file-name limit; treat it as "not a file" rather than crashing.
            is_file = False
        if is_file:
            # GeoJSON is UTF-8 by specification; do not depend on the locale.
            payload = json.loads(path.read_text(encoding="utf-8"))
        else:
            try:
                return country_geometry(input_data)
            except UnknownRegionError:
                raise ValueError(
                    f"--boundary {input_data!r} is not valid JSON, a file path, or a known Geofabrik region name."
                ) from None

    # Valid JSON need not be an object (e.g. ``[1, 2]`` or ``42``); guard before
    # using mapping operations so callers always get ValueError, not TypeError.
    if isinstance(payload, dict) and "geometry" in payload:
        geometry = payload.get("geometry")
    else:
        geometry = payload
    if not isinstance(geometry, dict) or geometry.get("type") not in ("Polygon", "MultiPolygon"):
        raise ValueError("Boundary must be a Polygon or MultiPolygon GeoJSON.")
    geom = shape(geometry)
    if isinstance(geom, (Polygon, MultiPolygon)):
        return geom
    raise ValueError(f"Unexpected geometry type: {type(geom).__name__}")
|
|
@@ -111,18 +111,29 @@ def main(
|
|
|
111
111
|
bool | None,
|
|
112
112
|
typer.Option("--version", callback=_version_callback, is_eager=True, help="Print version and exit."),
|
|
113
113
|
] = None,
|
|
114
|
-
name: Annotated[
|
|
114
|
+
name: Annotated[
|
|
115
|
+
str,
|
|
116
|
+
typer.Option(envvar="OSMSG_NAME", help="Output basename. Writes <name>.duckdb + selected formats."),
|
|
117
|
+
] = "stats",
|
|
115
118
|
start: Annotated[str | None, typer.Option(help="ISO start (UTC). 'YYYY-MM-DD HH:MM:SS'.")] = None,
|
|
116
119
|
end: Annotated[str | None, typer.Option(help="ISO end (UTC). Defaults to now.")] = None,
|
|
117
120
|
last: Annotated[Period | None, typer.Option(help="Convenience: hour|day|week|month|year.")] = None,
|
|
118
121
|
days: Annotated[int | None, typer.Option(help="Last N days (mutually exclusive with --last).")] = None,
|
|
119
122
|
country: Annotated[
|
|
120
123
|
list[str] | None,
|
|
121
|
-
typer.Option(
|
|
124
|
+
typer.Option(
|
|
125
|
+
"--country",
|
|
126
|
+
envvar="OSMSG_COUNTRY",
|
|
127
|
+
help="Geofabrik region id(s); resolved live. Requires OSM credentials. Comma-separated when set via env.",
|
|
128
|
+
),
|
|
122
129
|
] = None,
|
|
123
130
|
url: Annotated[
|
|
124
131
|
list[str] | None,
|
|
125
|
-
typer.Option(
|
|
132
|
+
typer.Option(
|
|
133
|
+
"--url",
|
|
134
|
+
envvar="OSMSG_URL",
|
|
135
|
+
help="Replication URL(s). Shortcuts: minute, hour, day. Comma-separated when set via env.",
|
|
136
|
+
),
|
|
126
137
|
] = None,
|
|
127
138
|
hashtags: Annotated[
|
|
128
139
|
list[str] | None,
|
|
@@ -134,29 +145,54 @@ def main(
|
|
|
134
145
|
list[str] | None,
|
|
135
146
|
typer.Option("--users", help="Filter to OSM usernames (case-sensitive, exact match). Repeat for more."),
|
|
136
147
|
] = None,
|
|
137
|
-
workers: Annotated[
|
|
148
|
+
workers: Annotated[
|
|
149
|
+
int | None,
|
|
150
|
+
typer.Option(envvar="OSMSG_WORKERS", help="Parallel workers (default: cpu count)."),
|
|
151
|
+
] = None,
|
|
138
152
|
rows: Annotated[
|
|
139
153
|
int | None,
|
|
140
154
|
typer.Option(help="Cap rows shown in the console table. Files always carry the full set."),
|
|
141
155
|
] = None,
|
|
142
|
-
boundary: Annotated[
|
|
143
|
-
|
|
156
|
+
boundary: Annotated[
|
|
157
|
+
str | None,
|
|
158
|
+
typer.Option(
|
|
159
|
+
envvar="OSMSG_BOUNDARY",
|
|
160
|
+
help="Boundary filter: Geofabrik region name (e.g. 'nepal'), GeoJSON file path, or inline GeoJSON.",
|
|
161
|
+
),
|
|
162
|
+
] = None,
|
|
163
|
+
formats: Annotated[
|
|
164
|
+
list[Format] | None,
|
|
165
|
+
typer.Option(
|
|
166
|
+
"--format",
|
|
167
|
+
"-f",
|
|
168
|
+
envvar="OSMSG_FORMAT",
|
|
169
|
+
help="One or more output formats. Comma-separated when set via env.",
|
|
170
|
+
),
|
|
171
|
+
] = None,
|
|
144
172
|
summary: Annotated[bool, typer.Option(help="Also write <name>_summary.parquet + summary.md.")] = False,
|
|
145
173
|
changeset: Annotated[bool, typer.Option(hidden=True)] = False,
|
|
146
|
-
|
|
147
|
-
|
|
174
|
+
all_stats: Annotated[
|
|
175
|
+
bool,
|
|
176
|
+
typer.Option(
|
|
177
|
+
"--all",
|
|
178
|
+
help="Collect all tag key=value stats and changeset metadata (hashtags, editors).",
|
|
179
|
+
),
|
|
180
|
+
] = False,
|
|
181
|
+
keys_only: Annotated[bool, typer.Option("--keys", help="Collect tag key stats only (no value breakdown).")] = False,
|
|
148
182
|
exact_lookup: Annotated[
|
|
149
183
|
bool, typer.Option("--exact-lookup", help="Hashtag whole-word match. Only meaningful with --hashtags.")
|
|
150
184
|
] = False,
|
|
151
185
|
tm_stats: Annotated[bool, typer.Option("--tm-stats", help="Attach Tasking Manager totals.")] = False,
|
|
152
186
|
update: Annotated[bool, typer.Option(help="Append to existing <name>.duckdb.")] = False,
|
|
153
187
|
cache_dir: Annotated[
|
|
154
|
-
Path,
|
|
188
|
+
Path,
|
|
189
|
+
typer.Option("--cache-dir", envvar="OSMSG_CACHE_DIR", help="Cache dir for downloaded OSM files."),
|
|
155
190
|
] = DEFAULT_CACHE_DIR,
|
|
156
191
|
output_dir: Annotated[
|
|
157
192
|
Path,
|
|
158
193
|
typer.Option(
|
|
159
194
|
"--output-dir",
|
|
195
|
+
envvar="OSMSG_OUTPUT_DIR",
|
|
160
196
|
help="Where to write <name>.duckdb + selected formats. Defaults to current directory.",
|
|
161
197
|
),
|
|
162
198
|
] = Path("."),
|
|
@@ -175,7 +211,21 @@ def main(
|
|
|
175
211
|
help="Read OSM password from stdin (one line). Else $OSM_PASSWORD, then prompt.",
|
|
176
212
|
),
|
|
177
213
|
] = False,
|
|
178
|
-
psql_dsn: Annotated[
|
|
214
|
+
psql_dsn: Annotated[
|
|
215
|
+
str | None,
|
|
216
|
+
typer.Option("--psql-dsn", envvar="OSMSG_PSQL_DSN", help="libpq DSN for --format psql."),
|
|
217
|
+
] = None,
|
|
218
|
+
changeset_pad_hours: Annotated[
|
|
219
|
+
int,
|
|
220
|
+
typer.Option(
|
|
221
|
+
"--changeset-pad-hours",
|
|
222
|
+
envvar="OSMSG_CHANGESET_PAD_HOURS",
|
|
223
|
+
help="Backward pad (hours) on first runs of changeset replication. "
|
|
224
|
+
"Set to 24 to capture long-running open changesets. --update runs skip the pad.",
|
|
225
|
+
min=0,
|
|
226
|
+
max=48,
|
|
227
|
+
),
|
|
228
|
+
] = 1,
|
|
179
229
|
) -> None:
|
|
180
230
|
"""Run osmsg."""
|
|
181
231
|
if formats is None:
|
|
@@ -194,13 +244,13 @@ def main(
|
|
|
194
244
|
end_date=_parse_dt(end),
|
|
195
245
|
countries=country,
|
|
196
246
|
urls=url or ["minute"],
|
|
247
|
+
url_explicit=url is not None,
|
|
197
248
|
workers=workers,
|
|
198
249
|
additional_tags=tags,
|
|
199
250
|
hashtags=hashtags,
|
|
200
251
|
length_tags=length,
|
|
201
252
|
users_filter=users,
|
|
202
|
-
|
|
203
|
-
key_value=key_value,
|
|
253
|
+
tag_mode="all" if all_stats else ("keys" if keys_only else "none"),
|
|
204
254
|
exact_lookup=exact_lookup,
|
|
205
255
|
changeset=changeset,
|
|
206
256
|
summary=summary,
|
|
@@ -214,6 +264,7 @@ def main(
|
|
|
214
264
|
osm_username=username,
|
|
215
265
|
osm_password=_read_password_stdin() if password_stdin else None,
|
|
216
266
|
psql_dsn=psql_dsn,
|
|
267
|
+
changeset_pad_hours=changeset_pad_hours,
|
|
217
268
|
)
|
|
218
269
|
|
|
219
270
|
if last is not None:
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# No FKs: DuckDB rejects UPDATE on FK-referenced LIST/GEOMETRY columns, which would block changeset upgrades.
|
|
2
|
+
DUCKDB_SCHEMA = """
|
|
3
|
+
CREATE TABLE IF NOT EXISTS users (
|
|
4
|
+
uid BIGINT PRIMARY KEY,
|
|
5
|
+
username VARCHAR NOT NULL
|
|
6
|
+
);
|
|
7
|
+
CREATE TABLE IF NOT EXISTS changesets (
|
|
8
|
+
changeset_id BIGINT PRIMARY KEY,
|
|
9
|
+
uid BIGINT NOT NULL,
|
|
10
|
+
created_at TIMESTAMPTZ,
|
|
11
|
+
hashtags VARCHAR[],
|
|
12
|
+
editor VARCHAR,
|
|
13
|
+
geom GEOMETRY
|
|
14
|
+
);
|
|
15
|
+
CREATE INDEX IF NOT EXISTS idx_changesets_created_at ON changesets(created_at);
|
|
16
|
+
CREATE TABLE IF NOT EXISTS changeset_stats (
|
|
17
|
+
changeset_id BIGINT NOT NULL,
|
|
18
|
+
seq_id BIGINT NOT NULL,
|
|
19
|
+
uid BIGINT NOT NULL,
|
|
20
|
+
nodes_created INTEGER DEFAULT 0,
|
|
21
|
+
nodes_modified INTEGER DEFAULT 0,
|
|
22
|
+
nodes_deleted INTEGER DEFAULT 0,
|
|
23
|
+
ways_created INTEGER DEFAULT 0,
|
|
24
|
+
ways_modified INTEGER DEFAULT 0,
|
|
25
|
+
ways_deleted INTEGER DEFAULT 0,
|
|
26
|
+
rels_created INTEGER DEFAULT 0,
|
|
27
|
+
rels_modified INTEGER DEFAULT 0,
|
|
28
|
+
rels_deleted INTEGER DEFAULT 0,
|
|
29
|
+
poi_created INTEGER DEFAULT 0,
|
|
30
|
+
poi_modified INTEGER DEFAULT 0,
|
|
31
|
+
tag_stats JSON,
|
|
32
|
+
PRIMARY KEY (seq_id, changeset_id)
|
|
33
|
+
);
|
|
34
|
+
CREATE INDEX IF NOT EXISTS idx_changeset_stats_uid ON changeset_stats(uid);
|
|
35
|
+
CREATE TABLE IF NOT EXISTS state (
|
|
36
|
+
source_url VARCHAR PRIMARY KEY,
|
|
37
|
+
last_seq BIGINT NOT NULL,
|
|
38
|
+
last_ts TIMESTAMPTZ NOT NULL,
|
|
39
|
+
updated_at TIMESTAMPTZ NOT NULL
|
|
40
|
+
);
|
|
41
|
+
"""
|
|
@@ -114,14 +114,44 @@ def merge_parquet_files(conn: duckdb.DuckDBPyConnection, parquet_dir: Path, *, c
|
|
|
114
114
|
if any(parquet_dir.glob("temp_*_users_*.parquet")):
|
|
115
115
|
conn.execute(f"INSERT OR IGNORE INTO users SELECT uid, username FROM read_parquet('{pattern('users')}')")
|
|
116
116
|
if any(parquet_dir.glob("temp_*_changesets_*.parquet")):
|
|
117
|
+
conn.execute("INSTALL spatial")
|
|
118
|
+
conn.execute("LOAD spatial")
|
|
117
119
|
conn.execute(
|
|
118
120
|
f"""
|
|
119
121
|
INSERT OR IGNORE INTO changesets
|
|
120
122
|
SELECT changeset_id, uid, created_at, hashtags, editor,
|
|
121
|
-
min_lon
|
|
123
|
+
CASE WHEN min_lon IS NOT NULL
|
|
124
|
+
THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat)
|
|
125
|
+
END
|
|
122
126
|
FROM read_parquet('{pattern("changesets")}')
|
|
123
127
|
"""
|
|
124
128
|
)
|
|
129
|
+
# Newer non-NULL wins; dedupe src so multiple emits per window don't trip the PK on UPDATE.
|
|
130
|
+
conn.execute(
|
|
131
|
+
f"""
|
|
132
|
+
UPDATE changesets c
|
|
133
|
+
SET created_at = COALESCE(src.created_at, c.created_at),
|
|
134
|
+
hashtags = COALESCE(src.hashtags, c.hashtags),
|
|
135
|
+
editor = COALESCE(src.editor, c.editor),
|
|
136
|
+
geom = COALESCE(src.geom, c.geom)
|
|
137
|
+
FROM (
|
|
138
|
+
SELECT DISTINCT ON (changeset_id)
|
|
139
|
+
changeset_id, created_at, hashtags, editor,
|
|
140
|
+
CASE WHEN min_lon IS NOT NULL
|
|
141
|
+
THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat)
|
|
142
|
+
END AS geom
|
|
143
|
+
FROM read_parquet('{pattern("changesets")}')
|
|
144
|
+
ORDER BY changeset_id,
|
|
145
|
+
(min_lon IS NOT NULL) DESC,
|
|
146
|
+
(editor IS NOT NULL) DESC,
|
|
147
|
+
(hashtags IS NOT NULL) DESC,
|
|
148
|
+
created_at DESC NULLS LAST
|
|
149
|
+
) src
|
|
150
|
+
WHERE c.changeset_id = src.changeset_id
|
|
151
|
+
AND (src.created_at IS NOT NULL OR src.hashtags IS NOT NULL
|
|
152
|
+
OR src.editor IS NOT NULL OR src.geom IS NOT NULL)
|
|
153
|
+
"""
|
|
154
|
+
)
|
|
125
155
|
if any(parquet_dir.glob("temp_*_changeset_stats_*.parquet")):
|
|
126
156
|
conn.execute(
|
|
127
157
|
f"""
|