osmsg 1.2.4__tar.gz → 1.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {osmsg-1.2.4 → osmsg-1.2.5}/PKG-INFO +2 -2
  2. {osmsg-1.2.4 → osmsg-1.2.5}/README.md +1 -1
  3. osmsg-1.2.5/osmsg/__version__.py +1 -0
  4. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/cli.py +1 -1
  5. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/gui.py +27 -12
  6. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/history.py +41 -28
  7. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/pipeline.py +14 -8
  8. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/replication.py +1 -1
  9. {osmsg-1.2.4 → osmsg-1.2.5}/pyproject.toml +1 -1
  10. osmsg-1.2.4/osmsg/__version__.py +0 -1
  11. {osmsg-1.2.4 → osmsg-1.2.5}/LICENSE +0 -0
  12. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/__init__.py +0 -0
  13. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/_http.py +0 -0
  14. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/_tick.py +0 -0
  15. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/auth.py +0 -0
  16. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/boundary.py +0 -0
  17. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/db/__init__.py +0 -0
  18. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/db/duckdb_schema.py +0 -0
  19. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/db/ingest.py +0 -0
  20. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/db/queries.py +0 -0
  21. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/db/schema.py +0 -0
  22. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/exceptions.py +0 -0
  23. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/export/__init__.py +0 -0
  24. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/export/csv.py +0 -0
  25. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/export/json.py +0 -0
  26. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/export/markdown.py +0 -0
  27. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/export/parquet.py +0 -0
  28. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/export/psql.py +0 -0
  29. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/fetch.py +0 -0
  30. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/geofabrik.py +0 -0
  31. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/handlers.py +0 -0
  32. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/maintain/__init__.py +0 -0
  33. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/maintain/cli.py +0 -0
  34. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/maintain/convert.py +0 -0
  35. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/maintain/manifest.py +0 -0
  36. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/maintain/month.py +0 -0
  37. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/maintain/parquet.py +0 -0
  38. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/maintain/pbf_split.py +0 -0
  39. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/models.py +0 -0
  40. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/pg_schema.py +0 -0
  41. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/py.typed +0 -0
  42. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/tm.py +0 -0
  43. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/ui.py +0 -0
  44. {osmsg-1.2.4 → osmsg-1.2.5}/osmsg/workers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osmsg
3
- Version: 1.2.4
3
+ Version: 1.2.5
4
4
  Summary: OpenStreetMap Stats Generator: Commandline
5
5
  Keywords: osm,stats,commandline,openstreetmap
6
6
  Author: Kshitij Raj Sharma
@@ -230,7 +230,7 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
230
230
  | `--country` | `OSMSG_COUNTRY` | unset | Geofabrik region id(s). Comma-separated when set via env. |
231
231
  | `--boundary` | `OSMSG_BOUNDARY` | unset | GeoJSON path or inline GeoJSON. |
232
232
  | `--url` | `OSMSG_URL` | `minute` | `minute`/`hour`/`day` shortcut or full URL. Comma-separated when set via env. |
233
- | `--workers` | `OSMSG_WORKERS` | cpu count | Parallel workers. |
233
+ | `--workers` | `OSMSG_WORKERS` | cpu count | Parallel parse workers. |
234
234
  | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
235
235
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
236
236
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
@@ -198,7 +198,7 @@ docker-compose `environment:` block all reach the same setting. CLI flag wins ov
198
198
  | `--country` | `OSMSG_COUNTRY` | unset | Geofabrik region id(s). Comma-separated when set via env. |
199
199
  | `--boundary` | `OSMSG_BOUNDARY` | unset | GeoJSON path or inline GeoJSON. |
200
200
  | `--url` | `OSMSG_URL` | `minute` | `minute`/`hour`/`day` shortcut or full URL. Comma-separated when set via env. |
201
- | `--workers` | `OSMSG_WORKERS` | cpu count | Parallel workers. |
201
+ | `--workers` | `OSMSG_WORKERS` | cpu count | Parallel parse workers. |
202
202
  | `--cache-dir` | `OSMSG_CACHE_DIR` | platform cache | Where downloaded OSM files are kept across runs. |
203
203
  | `--output-dir` | `OSMSG_OUTPUT_DIR` | `.` | Where `<name>.duckdb` and exports are written. |
204
204
  | `--format` / `-f` | `OSMSG_FORMAT` | `parquet` | Repeat for multiple. Comma-separated when set via env. |
@@ -0,0 +1 @@
1
+ __version__ = "1.2.5"
@@ -150,7 +150,7 @@ def main(
150
150
  ] = None,
151
151
  workers: Annotated[
152
152
  int | None,
153
- typer.Option(envvar="OSMSG_WORKERS", help="Parallel workers (default: cpu count)."),
153
+ typer.Option(envvar="OSMSG_WORKERS", help="Parallel parse workers (default: cpu count)."),
154
154
  ] = None,
155
155
  rows: Annotated[
156
156
  int | None,
@@ -61,6 +61,19 @@ def _split(value: str | None) -> list[str] | None:
61
61
  return items if items else None
62
62
 
63
63
 
64
+ def _parse_int(value: object, field: str) -> int | None:
65
+ text = str(value or "").strip()
66
+ if not text:
67
+ return None
68
+ try:
69
+ number = int(text)
70
+ except ValueError as exc:
71
+ raise OsmsgError(f"{field} must be a whole number.") from exc
72
+ if number < 1:
73
+ raise OsmsgError(f"{field} must be at least 1.")
74
+ return number
75
+
76
+
64
77
  def build_config(form: dict[str, object], output_dir: str) -> RunConfig:
65
78
  """Map the form fields to a RunConfig, raising OsmsgError on invalid input."""
66
79
  formats = [name for name in FORMATS if form.get(name)]
@@ -78,6 +91,7 @@ def build_config(form: dict[str, object], output_dir: str) -> RunConfig:
78
91
  tag_mode="all" if form.get("all_tags") else "none",
79
92
  summary=bool(form.get("summary")),
80
93
  formats=formats,
94
+ workers=_parse_int(form.get("workers"), "Workers"),
81
95
  output_dir=Path(output_dir or "."),
82
96
  )
83
97
 
@@ -133,6 +147,7 @@ class App:
133
147
  ("End (blank = now)", "end", ""),
134
148
  ("Hashtags (comma-sep)", "hashtags", ""),
135
149
  ("Tags (comma-sep)", "tags", ""),
150
+ ("Workers", "workers", str(os.cpu_count() or 4)),
136
151
  ]
137
152
  for i, (label, key, default) in enumerate(rows):
138
153
  ttk.Label(frame, text=label).grid(row=i, column=0, sticky="w", pady=2)
@@ -141,7 +156,7 @@ class App:
141
156
  self.vars[key] = var
142
157
 
143
158
  preset_frame = ttk.LabelFrame(frame, text="Quick range", padding=6)
144
- preset_frame.grid(row=5, column=0, columnspan=4, sticky="we", pady=6)
159
+ preset_frame.grid(row=6, column=0, columnspan=4, sticky="we", pady=6)
145
160
  for i, name in enumerate(PRESETS):
146
161
  ttk.Button(preset_frame, text=name, width=11, command=lambda n=name: self._apply_preset(n)).grid(
147
162
  row=0, column=i, padx=2
@@ -149,32 +164,32 @@ class App:
149
164
 
150
165
  self.vars["all_tags"] = tk.BooleanVar()
151
166
  self.vars["summary"] = tk.BooleanVar()
152
- ttk.Checkbutton(frame, text="All tags", variable=self.vars["all_tags"]).grid(row=6, column=0, sticky="w")
153
- ttk.Checkbutton(frame, text="Daily summary", variable=self.vars["summary"]).grid(row=6, column=1, sticky="w")
167
+ ttk.Checkbutton(frame, text="All tags", variable=self.vars["all_tags"]).grid(row=7, column=0, sticky="w")
168
+ ttk.Checkbutton(frame, text="Daily summary", variable=self.vars["summary"]).grid(row=7, column=1, sticky="w")
154
169
 
155
170
  fmt_frame = ttk.LabelFrame(frame, text="Formats", padding=6)
156
- fmt_frame.grid(row=7, column=0, columnspan=4, sticky="we", pady=6)
171
+ fmt_frame.grid(row=8, column=0, columnspan=4, sticky="we", pady=6)
157
172
  for i, name in enumerate(FORMATS):
158
173
  var = tk.BooleanVar(value=name in ("parquet", "csv"))
159
174
  ttk.Checkbutton(fmt_frame, text=name, variable=var).grid(row=0, column=i, padx=4)
160
175
  self.vars[name] = var
161
176
 
162
177
  self.out_label = ttk.Label(frame, text=f"Output: {self.out_dir}")
163
- self.out_label.grid(row=8, column=0, columnspan=3, sticky="w")
164
- ttk.Button(frame, text="Choose folder", command=self._choose_folder).grid(row=8, column=3, sticky="e")
178
+ self.out_label.grid(row=9, column=0, columnspan=3, sticky="w")
179
+ ttk.Button(frame, text="Choose folder", command=self._choose_folder).grid(row=9, column=3, sticky="e")
165
180
 
166
181
  self.run_btn = ttk.Button(frame, text="Compute", command=self._on_run)
167
- self.run_btn.grid(row=9, column=0, pady=8, sticky="w")
182
+ self.run_btn.grid(row=10, column=0, pady=8, sticky="w")
168
183
  self.open_btn = ttk.Button(frame, text="Open output folder", command=lambda: _open_folder(Path(self.out_dir)))
169
- self.open_btn.grid(row=9, column=1, pady=8, sticky="w")
184
+ self.open_btn.grid(row=10, column=1, pady=8, sticky="w")
170
185
  self.spinner = ttk.Progressbar(frame, mode="indeterminate", length=160)
171
- self.spinner.grid(row=9, column=2, columnspan=2, pady=8, sticky="we")
186
+ self.spinner.grid(row=10, column=2, columnspan=2, pady=8, sticky="we")
172
187
 
173
188
  self.log = scrolledtext.ScrolledText(frame, width=70, height=14, state="disabled")
174
- self.log.grid(row=10, column=0, columnspan=4, sticky="nsew")
189
+ self.log.grid(row=11, column=0, columnspan=4, sticky="nsew")
175
190
 
176
- ttk.Button(frame, text="About", command=self._show_about).grid(row=11, column=0, pady=(6, 0), sticky="w")
177
- ttk.Label(frame, text="A project of OSGeo Nepal").grid(row=11, column=1, columnspan=3, pady=(6, 0), sticky="e")
191
+ ttk.Button(frame, text="About", command=self._show_about).grid(row=12, column=0, pady=(6, 0), sticky="w")
192
+ ttk.Label(frame, text="A project of OSGeo Nepal").grid(row=12, column=1, columnspan=3, pady=(6, 0), sticky="e")
178
193
  self.root.after(120, self._drain)
179
194
 
180
195
  def _show_about(self) -> None:
@@ -5,6 +5,7 @@ path (a glob would make DuckDB list every partition over the HF API)."""
5
5
  import datetime as dt
6
6
  import json
7
7
  import pathlib
8
+ import time
8
9
  from dataclasses import dataclass
9
10
 
10
11
  import duckdb
@@ -16,6 +17,7 @@ UTC = dt.UTC
16
17
  SCHEMA_VERSION = 1
17
18
  DEFAULT_HISTORY_URL = "hf://datasets/kshitijrajsharma/osmsg-history"
18
19
  HISTORY_SEQ_ID = 0
20
+ MONTH_READ_ATTEMPTS = 4
19
21
 
20
22
 
21
23
  @dataclass
@@ -185,36 +187,47 @@ def ingest_remote(
185
187
 
186
188
  info(f"history: remote ingest {start_iso} -> {end_iso} ({len(months)} month partitions) from {history_url}")
187
189
 
190
+ def ingest_month(month: tuple[int, int]) -> None:
191
+ changesets_src = _partition_list(history_url, "changesets", [month])
192
+ changefiles_src = _partition_list(history_url, "changefiles", [month])
193
+ if changesets_src is not None:
194
+ conn.execute(
195
+ f"""INSERT INTO users
196
+ SELECT uid, any_value(username) FROM {changesets_src}
197
+ WHERE {in_window} AND username IS NOT NULL
198
+ GROUP BY uid ON CONFLICT (uid) DO NOTHING"""
199
+ )
200
+ conn.execute(
201
+ f"""INSERT INTO changesets
202
+ SELECT changeset_id, uid, created_at, hashtags, editor,
203
+ CASE WHEN min_lon IS NOT NULL
204
+ THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat) END
205
+ FROM {changesets_src} WHERE {changeset_where}
206
+ ON CONFLICT (changeset_id) DO NOTHING"""
207
+ )
208
+ if changefiles_src is not None:
209
+ conn.execute(
210
+ f"""INSERT INTO changeset_stats
211
+ SELECT changeset_id, {HISTORY_SEQ_ID} AS seq_id, uid,
212
+ nodes_created, nodes_modified, nodes_deleted,
213
+ ways_created, ways_modified, ways_deleted,
214
+ rels_created, rels_modified, rels_deleted,
215
+ poi_created, poi_modified, tag_stats
216
+ FROM {changefiles_src} WHERE {stats_where}
217
+ ON CONFLICT (seq_id, changeset_id) DO NOTHING"""
218
+ )
219
+
188
220
  with progress_bar(len(months), unit="months", description="Reading history") as advance:
189
221
  for month in months:
190
- changesets_src = _partition_list(history_url, "changesets", [month])
191
- changefiles_src = _partition_list(history_url, "changefiles", [month])
192
- if changesets_src is not None:
193
- conn.execute(
194
- f"""INSERT INTO users
195
- SELECT uid, any_value(username) FROM {changesets_src}
196
- WHERE {in_window} AND username IS NOT NULL
197
- GROUP BY uid ON CONFLICT (uid) DO NOTHING"""
198
- )
199
- conn.execute(
200
- f"""INSERT INTO changesets
201
- SELECT changeset_id, uid, created_at, hashtags, editor,
202
- CASE WHEN min_lon IS NOT NULL
203
- THEN ST_MakeEnvelope(min_lon, min_lat, max_lon, max_lat) END
204
- FROM {changesets_src} WHERE {changeset_where}
205
- ON CONFLICT (changeset_id) DO NOTHING"""
206
- )
207
- if changefiles_src is not None:
208
- conn.execute(
209
- f"""INSERT INTO changeset_stats
210
- SELECT changeset_id, {HISTORY_SEQ_ID} AS seq_id, uid,
211
- nodes_created, nodes_modified, nodes_deleted,
212
- ways_created, ways_modified, ways_deleted,
213
- rels_created, rels_modified, rels_deleted,
214
- poi_created, poi_modified, tag_stats
215
- FROM {changefiles_src} WHERE {stats_where}
216
- ON CONFLICT (seq_id, changeset_id) DO NOTHING"""
217
- )
222
+ for attempt in range(MONTH_READ_ATTEMPTS):
223
+ try:
224
+ ingest_month(month)
225
+ break
226
+ except duckdb.Error as exc:
227
+ if attempt == MONTH_READ_ATTEMPTS - 1:
228
+ raise
229
+ warn(f"history: {month[0]}-{month[1]:02d} read failed ({type(exc).__name__}); retrying.")
230
+ time.sleep(2 * (attempt + 1))
218
231
  advance()
219
232
 
220
233
  row = conn.execute(f"SELECT count(*) FROM changeset_stats WHERE seq_id = {HISTORY_SEQ_ID}").fetchone()
@@ -540,21 +540,18 @@ def _processing_config(cfg: RunConfig, *, parquet_dir: Path, geom_wkt: str | Non
540
540
  }
541
541
 
542
542
 
543
- # Replication servers throttle many concurrent connections, so downloads stay polite regardless of
544
- # the worker count used for local parsing. Already-downloaded files are cached, so a rerun resumes.
545
543
  _DOWNLOAD_WORKERS = 4
546
544
 
547
545
 
548
546
  def _download_all(
549
547
  urls: list[str],
550
548
  mode: str,
551
- max_workers: int,
549
+ workers: int,
552
550
  cookie: str | None,
553
551
  cache_dir: Path,
554
552
  label: str,
555
553
  description: str = "downloading",
556
554
  ) -> None:
557
- workers = min(max_workers, _DOWNLOAD_WORKERS)
558
555
  try:
559
556
  with (
560
557
  progress_bar(len(urls), unit=label, description=description) as advance,
@@ -714,10 +711,13 @@ def run(cfg: RunConfig) -> dict[str, Any]:
714
711
  if run_live:
715
712
  _auto_switch_replication(cfg, cfg.end_date - cfg.start_date)
716
713
  except duckdb.Error as exc:
717
- warn(f"history: remote ingest failed ({type(exc).__name__}: {exc}); using live path.")
718
714
  for tbl in ("changeset_stats", "changesets", "users"):
719
715
  conn.execute(f"DELETE FROM {tbl}")
720
- run_live = True
716
+ dbmod.close(conn)
717
+ raise OsmsgError(
718
+ f"Reading the published history failed after retries ({type(exc).__name__}). "
719
+ "Re-run to try again, narrow the date range, or pass --no-history for the live path."
720
+ ) from exc
721
721
 
722
722
  max_workers = cfg.workers or _cpu_count()
723
723
  info(f"Workers: {max_workers}")
@@ -756,7 +756,13 @@ def run(cfg: RunConfig) -> dict[str, Any]:
756
756
  cs_config["window_start_utc"] = cfg.start_date.astimezone(UTC)
757
757
 
758
758
  _download_all(
759
- urls, "changeset", max_workers, None, cfg.cache_dir, "changesets", description="Downloading changesets"
759
+ urls,
760
+ "changeset",
761
+ _DOWNLOAD_WORKERS,
762
+ None,
763
+ cfg.cache_dir,
764
+ "changesets",
765
+ description="Downloading changesets",
760
766
  )
761
767
  _process_all(
762
768
  urls,
@@ -819,7 +825,7 @@ def run(cfg: RunConfig) -> dict[str, Any]:
819
825
  _download_all(
820
826
  urls,
821
827
  "changefiles",
822
- max_workers,
828
+ _DOWNLOAD_WORKERS,
823
829
  cookie,
824
830
  cfg.cache_dir,
825
831
  "changefiles",
@@ -145,7 +145,7 @@ class ChangesetReplication:
145
145
  def timestamp_to_sequence(self, ts: datetime) -> int:
146
146
  cur_seq, last_run = self._state()
147
147
  wanted = int((ts - last_run).total_seconds() / 60) + cur_seq
148
- return min(wanted, cur_seq)
148
+ return max(1, min(wanted, cur_seq))
149
149
 
150
150
  def sequence_to_timestamp(self, seq: int) -> datetime:
151
151
  txt = session.get(self.state_url(seq)).text
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "osmsg"
3
- version = "1.2.4"
3
+ version = "1.2.5"
4
4
  description = "OpenStreetMap Stats Generator: Commandline"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -1 +0,0 @@
1
- __version__ = "1.2.4"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes