osmsg 1.1.0__tar.gz → 1.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {osmsg-1.1.0 → osmsg-1.1.2}/PKG-INFO +1 -1
  2. osmsg-1.1.2/osmsg/__version__.py +1 -0
  3. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/_tick.py +25 -18
  4. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/cli.py +3 -0
  5. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/export/markdown.py +5 -4
  6. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/handlers.py +15 -19
  7. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/pipeline.py +24 -21
  8. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/replication.py +26 -13
  9. {osmsg-1.1.0 → osmsg-1.1.2}/pyproject.toml +1 -1
  10. osmsg-1.1.0/osmsg/__version__.py +0 -1
  11. {osmsg-1.1.0 → osmsg-1.1.2}/LICENSE +0 -0
  12. {osmsg-1.1.0 → osmsg-1.1.2}/README.md +0 -0
  13. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/__init__.py +0 -0
  14. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/_http.py +0 -0
  15. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/auth.py +0 -0
  16. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/boundary.py +0 -0
  17. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/db/__init__.py +0 -0
  18. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/db/duckdb_schema.py +0 -0
  19. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/db/ingest.py +0 -0
  20. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/db/queries.py +0 -0
  21. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/db/schema.py +0 -0
  22. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/exceptions.py +0 -0
  23. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/export/__init__.py +0 -0
  24. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/export/csv.py +0 -0
  25. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/export/json.py +0 -0
  26. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/export/parquet.py +0 -0
  27. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/export/psql.py +0 -0
  28. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/fetch.py +0 -0
  29. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/geofabrik.py +0 -0
  30. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/models.py +0 -0
  31. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/pg_schema.py +0 -0
  32. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/py.typed +0 -0
  33. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/tm.py +0 -0
  34. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/ui.py +0 -0
  35. {osmsg-1.1.0 → osmsg-1.1.2}/osmsg/workers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osmsg
3
- Version: 1.1.0
3
+ Version: 1.1.2
4
4
  Summary: OpenStreetMap Stats Generator: Commandline
5
5
  Keywords: osm,stats,commandline,openstreetmap
6
6
  Author: Kshitij Raj Sharma
@@ -0,0 +1 @@
1
+ __version__ = "1.1.2"
@@ -36,7 +36,8 @@ def main() -> int:
36
36
  name = _parse_arg(extra_args, "--name") or "stats"
37
37
  out = Path(_parse_arg(extra_args, "--output-dir") or "/var/lib/osmsg")
38
38
  country = _parse_arg(extra_args, "--country")
39
- url = _parse_arg(extra_args, "--url") or "minute"
39
+ explicit_url = _parse_arg(extra_args, "--url")
40
+ url = explicit_url or "minute"
40
41
 
41
42
  out.mkdir(parents=True, exist_ok=True)
42
43
 
@@ -45,26 +46,32 @@ def main() -> int:
45
46
  try:
46
47
  fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
47
48
  except BlockingIOError:
49
+ os.close(lock_fd)
48
50
  print("[osmsg-tick] previous tick still running, skipping", flush=True)
49
51
  return 0
50
52
 
51
- source_url = country_update_url(country) if country else resolve_url(url)
52
- db_path = out / f"{name}.duckdb"
53
-
54
- extra_set = set(extra_args)
55
- cmd = ["osmsg"] + extra_args
56
- if not (extra_set & {"--all", "--keys"}):
57
- cmd.append("--all")
58
-
59
- if _has_state(db_path, source_url):
60
- cmd.append("--update")
61
- elif bootstrap_days:
62
- cmd.extend(["--days", bootstrap_days])
63
- else:
64
- cmd.extend(["--last", bootstrap])
65
-
66
- print(f"[osmsg-tick] {' '.join(cmd)}", flush=True)
67
- return subprocess.call(cmd)
53
+ try:
54
+ # Mirror pipeline._normalize_urls: explicit --url wins over --country's geofabrik default,
55
+ # otherwise --update can't find the state row and the DuckDB gets wiped every tick.
56
+ source_url = country_update_url(country) if country and explicit_url is None else resolve_url(url)
57
+ db_path = out / f"{name}.duckdb"
58
+
59
+ extra_set = set(extra_args)
60
+ cmd = ["osmsg"] + extra_args
61
+ if not (extra_set & {"--all", "--keys"}):
62
+ cmd.append("--all")
63
+
64
+ if _has_state(db_path, source_url):
65
+ cmd.append("--update")
66
+ elif bootstrap_days:
67
+ cmd.extend(["--days", bootstrap_days])
68
+ else:
69
+ cmd.extend(["--last", bootstrap])
70
+
71
+ print(f"[osmsg-tick] {' '.join(cmd)}", flush=True)
72
+ return subprocess.call(cmd)
73
+ finally:
74
+ os.close(lock_fd)
68
75
 
69
76
 
70
77
  if __name__ == "__main__":
@@ -233,6 +233,9 @@ def main(
233
233
  if sum(1 for x in (start, last, days) if x) > 1:
234
234
  error("--start, --last, and --days are mutually exclusive — pick one.")
235
235
  raise typer.Exit(code=2)
236
+ if update and any(x is not None for x in (start, end, last, days)):
237
+ error("--update resumes from prior state and runs to head; it ignores --start/--end/--last/--days.")
238
+ raise typer.Exit(code=2)
236
239
  if Format.psql in formats and not psql_dsn:
237
240
  error("-f psql requires --psql-dsn (libpq connection string, e.g. 'host=localhost dbname=osm user=osm').")
238
241
  raise typer.Exit(code=2)
@@ -17,15 +17,16 @@ def _stringify(v: Any) -> str:
17
17
  return str(v)
18
18
 
19
19
 
20
- def table_markdown(rows: list[dict[str, Any]], headers: list[str] | None = None) -> str:
20
+ def table_markdown(rows: list[dict[str, Any]], output_path: Path, headers: list[str] | None = None) -> Path:
21
21
  """Return a GitHub-flavored markdown table for the given rows."""
22
- if not rows:
23
- return ""
24
22
  headers = headers or list(rows[0].keys())
25
23
  lines = ["| " + " | ".join(headers) + " |", "| " + " | ".join("---" for _ in headers) + " |"]
26
24
  for r in rows:
27
25
  lines.append("| " + " | ".join(_stringify(r.get(h)) for h in headers) + " |")
28
- return "\n".join(lines)
26
+ output_path = Path(output_path)
27
+ output_path.parent.mkdir(parents=True, exist_ok=True)
28
+ output_path.write_text("\n".join(lines), encoding="utf-8")
29
+ return output_path
29
30
 
30
31
 
31
32
  def _human(n: int) -> str:
@@ -32,23 +32,15 @@ class ChangesetHandler(osmium.SimpleHandler):
32
32
  return
33
33
  cfg = self.config
34
34
 
35
- # Drop padding-pulled changesets that don't overlap the window — otherwise
36
- # attach_metadata leaks their hashtags onto users with in-window edits.
37
- # `c.open` gate is required: osmium uses 1970 as the closed_at sentinel.
35
+ # Drop closed-before-window changesets so attach_metadata can't leak their
36
+ # hashtags onto in-window users.
38
37
  start = cfg.get("window_start_utc")
39
- end = cfg.get("window_end_utc")
40
- if start is not None and end is not None:
41
- created = c.created_at
42
- if created.tzinfo is None:
43
- created = created.replace(tzinfo=dt.UTC)
44
- if created >= end:
38
+ if start is not None and not c.open:
39
+ closed = c.closed_at
40
+ if closed.tzinfo is None:
41
+ closed = closed.replace(tzinfo=dt.UTC)
42
+ if closed <= start:
45
43
  return
46
- if not c.open:
47
- closed = c.closed_at
48
- if closed.tzinfo is None:
49
- closed = closed.replace(tzinfo=dt.UTC)
50
- if closed <= start:
51
- return
52
44
 
53
45
  if self._geom is not None:
54
46
  if not c.bounds.valid():
@@ -119,7 +111,6 @@ class ChangefileHandler(osmium.SimpleHandler):
119
111
  super().__init__()
120
112
  self.config = config
121
113
  self.start = config["start_date_utc"]
122
- self.end = config["end_date_utc"]
123
114
  self.seq_id = sequence_id
124
115
  # None == no filter; empty set == filter matched nothing (collect nothing).
125
116
  self.valid_changesets = valid_changesets
@@ -188,15 +179,20 @@ class ChangefileHandler(osmium.SimpleHandler):
188
179
  if track_length and k in length_keys:
189
180
  tv.add_length(len_m)
190
181
 
182
+ def _in_window(self, ts) -> bool:
183
+ # Lower-bound only; disjoint coverage between ticks comes from the seq boundary
184
+ # (next tick resumes at last_seq+1, with state.last_ts = state_ts(last_seq)).
185
+ return ts >= self.start
186
+
191
187
  def node(self, n) -> None:
192
- if not (self.start <= n.timestamp < self.end):
188
+ if not self._in_window(n.timestamp):
193
189
  return
194
190
  if not self._should_collect(n.user, n.changeset):
195
191
  return
196
192
  self._accumulate(n.uid, n.user, n.changeset, 0 if n.deleted else n.version, n.tags, "nodes")
197
193
 
198
194
  def way(self, w) -> None:
199
- if not (self.start <= w.timestamp < self.end):
195
+ if not self._in_window(w.timestamp):
200
196
  return
201
197
  if not self._should_collect(w.user, w.changeset):
202
198
  return
@@ -204,7 +200,7 @@ class ChangefileHandler(osmium.SimpleHandler):
204
200
  self._accumulate(w.uid, w.user, w.changeset, 0 if w.deleted else w.version, w.tags, "ways", nodes)
205
201
 
206
202
  def relation(self, r) -> None:
207
- if not (self.start <= r.timestamp < self.end):
203
+ if not self._in_window(r.timestamp):
208
204
  return
209
205
  if not self._should_collect(r.user, r.changeset):
210
206
  return
@@ -22,7 +22,7 @@ from .boundary import load_boundary
22
22
  from .db.queries import attach_metadata, attach_tag_stats, daily_summary, list_changesets, user_stats
23
23
  from .db.schema import get_state, upsert_state
24
24
  from .exceptions import CredentialsRequiredError, NoDataFoundError, OsmsgError
25
- from .export import summary_markdown, to_csv, to_json, to_parquet, to_psql
25
+ from .export import summary_markdown, table_markdown, to_csv, to_json, to_parquet, to_psql
26
26
  from .fetch import download_osm_file
27
27
  from .geofabrik import country_geometry, country_update_url
28
28
  from .replication import (
@@ -30,6 +30,7 @@ from .replication import (
30
30
  SHORTCUTS,
31
31
  ChangesetReplication,
32
32
  changefile_download_urls,
33
+ changefile_seq_timestamp,
33
34
  resolve_url,
34
35
  )
35
36
  from .ui import info, progress_bar, warn
@@ -371,6 +372,8 @@ def run(cfg: RunConfig) -> dict[str, Any]:
371
372
  valid_changesets: set[int] | None = None
372
373
  start_seq: int | None = None
373
374
  end_seq: int | None = None
375
+ # Threaded into changefile_download_urls so a tick never advances cf past cs.
376
+ cs_frontier_ts: dt.datetime | None = None
374
377
 
375
378
  if cfg.hashtags or cfg.changeset:
376
379
  cs_repl = ChangesetReplication(pad_hours=cfg.changeset_pad_hours)
@@ -384,11 +387,12 @@ def run(cfg: RunConfig) -> dict[str, Any]:
384
387
  )
385
388
  info(f"Changesets: {len(urls)} files (seq {cs_start}-{cs_end}), {pad_note}.")
386
389
 
390
+ cs_frontier_ts = cs_repl.sequence_to_timestamp(cs_end)
391
+
387
392
  if urls:
388
393
  cs_dir.mkdir(parents=True, exist_ok=True)
389
394
  cs_config = _processing_config(cfg, parquet_dir=cs_dir, geom_wkt=geom_wkt)
390
395
  cs_config["window_start_utc"] = cfg.start_date.astimezone(UTC)
391
- cs_config["window_end_utc"] = cfg.end_date.astimezone(UTC)
392
396
 
393
397
  _download_all(
394
398
  urls, "changeset", max_workers, None, cfg.cache_dir, "changesets", description="Downloading changesets"
@@ -408,7 +412,7 @@ def run(cfg: RunConfig) -> dict[str, Any]:
408
412
  conn,
409
413
  source_url=CHANGESETS_REPLICATION,
410
414
  last_seq=cs_end,
411
- last_ts=cfg.end_date.astimezone(UTC),
415
+ last_ts=cs_frontier_ts,
412
416
  updated_at=dt.datetime.now(UTC),
413
417
  )
414
418
  info("Changeset processing complete.")
@@ -421,15 +425,12 @@ def run(cfg: RunConfig) -> dict[str, Any]:
421
425
  info(f"Changefiles ← {url}")
422
426
  url_start, resume_seq = url_starts[url]
423
427
  urls, server_ts, src_start_seq, src_end_seq, _, _ = changefile_download_urls(
424
- url_start, cfg.end_date, url, resume_seq=resume_seq
428
+ url_start, cfg.end_date, url, resume_seq=resume_seq, cs_ts=cs_frontier_ts
425
429
  )
426
430
  if start_seq is None:
427
431
  start_seq = src_start_seq
428
432
  end_seq = src_end_seq
429
- # Cap per URL only — never mutate cfg.end_date or sibling URLs lose their window.
430
- url_end_date = min(cfg.end_date, server_ts)
431
433
  url_start_date_utc = url_start.astimezone(UTC)
432
- url_end_date_utc = url_end_date.astimezone(UTC)
433
434
 
434
435
  gap = server_ts - url_start_date_utc
435
436
  info(
@@ -439,12 +440,20 @@ def run(cfg: RunConfig) -> dict[str, Any]:
439
440
 
440
441
  if not urls:
441
442
  info(f" {url}: already up-to-date")
443
+ if resume_seq is not None:
444
+ # Heartbeat: bump updated_at so /health can tell "alive, idle" apart from "stuck".
445
+ upsert_state(
446
+ conn,
447
+ source_url=url,
448
+ last_seq=resume_seq - 1,
449
+ last_ts=url_start,
450
+ updated_at=dt.datetime.now(UTC),
451
+ )
442
452
  continue
443
453
 
444
454
  cf_dir.mkdir(parents=True, exist_ok=True)
445
455
  cf_config = _processing_config(cfg, parquet_dir=cf_dir, geom_wkt=None)
446
456
  cf_config["start_date_utc"] = url_start_date_utc
447
- cf_config["end_date_utc"] = url_end_date_utc
448
457
 
449
458
  _download_all(
450
459
  urls,
@@ -469,15 +478,18 @@ def run(cfg: RunConfig) -> dict[str, Any]:
469
478
  description="Processing changefiles",
470
479
  )
471
480
  dbmod.merge_parquet_files(conn, cf_dir, cleanup=True)
481
+ # state.last_ts is the seq_ts of last_seq so the next tick's lower-bound filter
482
+ # aligns with the seq boundary.
483
+ state_last_ts = changefile_seq_timestamp(url, src_end_seq)
472
484
  upsert_state(
473
485
  conn,
474
486
  source_url=url,
475
487
  last_seq=src_end_seq,
476
- last_ts=url_end_date,
488
+ last_ts=state_last_ts,
477
489
  updated_at=dt.datetime.now(UTC),
478
490
  )
479
- lag = server_ts - url_end_date_utc
480
- info(f" DB now current to: {url_end_date_utc.isoformat()} | lag from server: {lag}")
491
+ lag = server_ts - state_last_ts
492
+ info(f" DB now current to: {state_last_ts.isoformat()} | lag from server: {lag}")
481
493
  info(f"Changefile processing complete: {url}")
482
494
 
483
495
  if cfg.delete_temp:
@@ -521,19 +533,10 @@ def run(cfg: RunConfig) -> dict[str, Any]:
521
533
  written["json"] = str(to_json(rows, out / f"{cfg.name}.json"))
522
534
 
523
535
  if "markdown" in cfg.formats:
524
- from .export.markdown import summary_markdown as render_md
525
-
526
536
  md_path = out / f"{cfg.name}.md"
527
- render_md(
537
+ table_markdown(
528
538
  rows,
529
539
  output_path=md_path,
530
- start_date=start_date_utc,
531
- end_date=end_date_utc,
532
- additional_tags=cfg.additional_tags,
533
- length_tags=cfg.length_tags,
534
- tag_mode=cfg.tag_mode,
535
- fname=cfg.name,
536
- tm_stats=cfg.tm_stats,
537
540
  )
538
541
  written["markdown"] = str(md_path)
539
542
 
@@ -34,14 +34,25 @@ def seq_to_timestamp(state_url: str) -> datetime:
34
34
  return datetime.strptime(raw, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=UTC)
35
35
 
36
36
 
37
+ def changefile_seq_timestamp(base_url: str, seq: int) -> datetime:
38
+ """Timestamp of a changefile-replication seq (the diff's `state.txt` `timestamp=` line)."""
39
+ return seq_to_timestamp(ReplicationServer(base_url).get_state_url(seq))
40
+
41
+
37
42
  def changefile_download_urls(
38
43
  start_date: datetime | None,
39
44
  end_date: datetime,
40
45
  base_url: str,
41
46
  *,
42
47
  resume_seq: int | None = None,
48
+ cs_ts: datetime | None = None,
43
49
  ) -> tuple[list[str], datetime, int, int, str, str]:
44
- """resume_seq starts exactly there (skipping the timestamp lookup + backward pad used on first runs)."""
50
+ """Resolve [start_seq, last_seq] for the time range, plus the URL list to fetch.
51
+
52
+ resume_seq, when given, is used verbatim (no backward pad). cs_ts gates the
53
+ upper bound on resume ticks: last_seq stays behind cs_ts so every (seq, cs) row
54
+ written has its parent changeset in place.
55
+ """
45
56
  repl = ReplicationServer(base_url)
46
57
 
47
58
  if resume_seq is not None:
@@ -75,15 +86,19 @@ def changefile_download_urls(
75
86
  end_seq = repl.timestamp_to_sequence(end_date)
76
87
  if end_seq is None:
77
88
  raise OsmsgError(f"Could not resolve end_date {end_date}")
78
- last_seq = end_seq
79
- # Pad forwards so the last requested timestamp is fully covered by the diffs we fetch.
80
- if "minute" in base_url:
81
- adjust = int((seq_to_timestamp(repl.get_state_url(end_seq)) - end_date).total_seconds() / 60)
82
- last_seq = last_seq + adjust + 60
83
- else:
84
- last_seq += 1
85
- if last_seq >= server_seq:
86
- last_seq = server_seq
89
+ # +1 only when end_seq's state_ts is strictly before end_date, since that is the
90
+ # only diff that can contain edits in (state_ts(end_seq), end_date].
91
+ end_seq_ts = seq_to_timestamp(repl.get_state_url(end_seq))
92
+ if end_seq_ts < end_date:
93
+ end_seq += 1
94
+ last_seq = min(end_seq, server_seq)
95
+
96
+ # Hold cf one diff behind cs when cs is the slower stream, so every (seq, cs) row
97
+ # written has a parent in `changesets` already.
98
+ if resume_seq is not None and cs_ts is not None:
99
+ target_ts = end_date if end_date else server_ts
100
+ if target_ts > cs_ts:
101
+ last_seq -= 1
87
102
 
88
103
  if seq >= last_seq:
89
104
  return [], server_ts, start_seq, last_seq, start_seq_url, repl.get_state_url(last_seq)
@@ -168,10 +183,8 @@ class ChangesetReplication:
168
183
  end_seq = self.timestamp_to_sequence(end_date)
169
184
  end_ts = self.sequence_to_timestamp(end_seq)
170
185
  if end_date > end_ts:
186
+ # Step to the diff covering end_date, plus one so edits at end_date land.
171
187
  end_seq += int((end_date - end_ts).total_seconds() / 60) + 1
172
- end_ts = self.sequence_to_timestamp(end_seq)
173
- if end_ts > end_date:
174
- end_seq += int((end_ts - end_date).total_seconds() / 60) + 60
175
188
  end_seq = min(end_seq, cur_seq)
176
189
 
177
190
  if start_seq >= end_seq:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "osmsg"
3
- version = "1.1.0"
3
+ version = "1.1.2"
4
4
  description = "OpenStreetMap Stats Generator: Commandline"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -1 +0,0 @@
1
- __version__ = "1.1.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes