gtfs_digester-0.1.0.tar.gz
This diff shows the content of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- gtfs_digester-0.1.0/PKG-INFO +119 -0
- gtfs_digester-0.1.0/README.md +95 -0
- gtfs_digester-0.1.0/pyproject.toml +47 -0
- gtfs_digester-0.1.0/src/gtfs_digester/__init__.py +26 -0
- gtfs_digester-0.1.0/src/gtfs_digester/archive.py +181 -0
- gtfs_digester-0.1.0/src/gtfs_digester/diff.py +160 -0
- gtfs_digester-0.1.0/src/gtfs_digester/file.py +281 -0
- gtfs_digester-0.1.0/src/gtfs_digester/fingerprint.py +88 -0
- gtfs_digester-0.1.0/src/gtfs_digester/metadata.py +120 -0
- gtfs_digester-0.1.0/src/gtfs_digester/normalize.py +100 -0
- gtfs_digester-0.1.0/src/gtfs_digester/py.typed +0 -0
- gtfs_digester-0.1.0/src/gtfs_digester/schema.py +558 -0
- gtfs_digester-0.1.0/src/gtfs_digester/storage.py +185 -0
gtfs_digester-0.1.0/PKG-INFO
@@ -0,0 +1,119 @@
Metadata-Version: 2.4
Name: gtfs-digester
Version: 0.1.0
Summary: Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds
Author: Chris Alfano
Author-email: Chris Alfano <chris@jarv.us>
License-Expression: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Scientific/Engineering :: GIS
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Dist: blake3>=1.0.8
Requires-Dist: fsspec>=2026.3.0
Requires-Dist: gcsfs>=2026.3.0
Requires-Dist: pyarrow>=23.0.1
Requires-Python: >=3.11
Project-URL: Homepage, https://github.com/JarvusInnovations/gtfs-digester
Project-URL: Repository, https://github.com/JarvusInnovations/gtfs-digester
Project-URL: Issues, https://github.com/JarvusInnovations/gtfs-digester/issues
Description-Content-Type: text/markdown

# gtfs-digester

Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds.

## What It Does

Takes a GTFS zip and produces:

- A **content fingerprint** (BLAKE3 merkle hash) — identical for semantically identical feeds regardless of zip metadata, file ordering, CSV whitespace, or time formatting
- **Canonical Arrow tables** for each file — normalized, sorted by primary key, all values as strings
- **Per-file hashes** for efficient hierarchical change detection
- **Archive diffs** — added/removed/modified files and rows by primary key
- **Exploded parquet storage** — write/read from local or cloud (GCS, S3) via fsspec

All files and columns are preserved, including non-standard ones.

## Install

```bash
pip install gtfs-digester
# or
uv add gtfs-digester
```

## Quick Start

```python
from gtfs_digester import GTFSArchive

# Load and fingerprint
archive = GTFSArchive.from_zip("google_transit.zip")
print(archive.fingerprint.hex())  # v1:abc123...

# Access canonical tables
stops = archive.arrow_table("stops.txt")
print(stops.num_rows)

# Compare two feed versions
old = GTFSArchive.from_zip("old.zip")
new = GTFSArchive.from_zip("new.zip")
diff = old.diff(new)
print(diff.is_identical)
for f in diff.modified_files:
    fd = diff.file_diff(f)
    print(f"{f}: {fd.summary()}")

# Write as exploded parquet
from gtfs_digester import write_exploded
write_exploded(archive, "gs://bucket/schedules/feed-1", schedule_url="https://...")
```

## How Fingerprinting Works

1. Each `.txt` file is parsed to a PyArrow table (all strings)
2. Columns reordered per GTFS spec (unknown columns preserved, sorted alphabetically after spec columns)
3. Values normalized: whitespace stripped, times zero-padded (`9:05:00` → `09:05:00`)
4. Rows sorted by primary key (numeric columns sorted numerically, not lexicographically)
5. Canonical CSV serialized and BLAKE3 hashed per file
6. Archive fingerprint = BLAKE3 of sorted `filename:hash` pairs (merkle tree)

Unknown files (not in the GTFS spec) are preserved with lexicographic row sorting.
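
The combining step is small enough to restate directly. A minimal sketch of step 6 only, assuming the pairs are newline-joined `filename:hash` strings and a `v1:` prefix on the digest; the package's exact byte layout is not shown in this section:

```python
# Illustrative restatement of step 6, not the package's own code.
from blake3 import blake3

def archive_fingerprint(file_hashes: dict[str, str]) -> str:
    """Combine per-file hashes into one archive-level digest."""
    h = blake3()
    for filename in sorted(file_hashes):  # sorting makes the result order-independent
        h.update(f"{filename}:{file_hashes[filename]}\n".encode())
    return "v1:" + h.hexdigest()

# Any archive with the same per-file hashes yields the same fingerprint:
print(archive_fingerprint({"stops.txt": "ab12", "trips.txt": "cd34"}))
```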

## Storage Layout

`write_exploded()` produces a version-first directory:

```
base_path/
  _feed_digest={v1:abc...}/
    agency.parquet
    stops.parquet
    routes.parquet
    trips.parquet
    stop_times.parquet
    ...
    metadata.json   # provenance + digester info, written last (commit marker)
```
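
Because `metadata.json` lands last, its presence marks a fully committed version, which makes ingestion safe to re-run. A sketch of that pattern; `write_exploded`'s call shape comes from the Quick Start, while the `version_exists` and `read_table` signatures below are guesses from the export names rather than documented API:

```python
from gtfs_digester import GTFSArchive, read_table, version_exists, write_exploded

archive = GTFSArchive.from_zip("google_transit.zip")
digest = archive.fingerprint.hex()  # "v1:abc..." per the Quick Start

base = "gs://bucket/schedules/feed-1"
if not version_exists(base, digest):  # hypothetical signature: absent metadata.json = not committed
    write_exploded(archive, base, schedule_url="https://example.com/gtfs.zip")

stops = read_table(base, digest, "stops")  # hypothetical read-back signature
print(stops.num_rows)
```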

DuckDB reads across versions with hive partitioning:

```sql
SELECT * FROM read_parquet('base_path/_feed_digest=*/stops.parquet', hive_partitioning=true);
```

## Development

```bash
uv sync --group dev
uv run pytest tests/ -k "not Real"  # unit tests (~0.5s)
uv run pytest tests/                # includes real feed integration tests
```

## License

MIT

gtfs_digester-0.1.0/README.md
@@ -0,0 +1,95 @@
# gtfs-digester

Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds.

## What It Does

Takes a GTFS zip and produces:

- A **content fingerprint** (BLAKE3 merkle hash) — identical for semantically identical feeds regardless of zip metadata, file ordering, CSV whitespace, or time formatting
- **Canonical Arrow tables** for each file — normalized, sorted by primary key, all values as strings
- **Per-file hashes** for efficient hierarchical change detection
- **Archive diffs** — added/removed/modified files and rows by primary key
- **Exploded parquet storage** — write/read from local or cloud (GCS, S3) via fsspec

All files and columns are preserved, including non-standard ones.

## Install

```bash
pip install gtfs-digester
# or
uv add gtfs-digester
```

## Quick Start

```python
from gtfs_digester import GTFSArchive

# Load and fingerprint
archive = GTFSArchive.from_zip("google_transit.zip")
print(archive.fingerprint.hex())  # v1:abc123...

# Access canonical tables
stops = archive.arrow_table("stops.txt")
print(stops.num_rows)

# Compare two feed versions
old = GTFSArchive.from_zip("old.zip")
new = GTFSArchive.from_zip("new.zip")
diff = old.diff(new)
print(diff.is_identical)
for f in diff.modified_files:
    fd = diff.file_diff(f)
    print(f"{f}: {fd.summary()}")

# Write as exploded parquet
from gtfs_digester import write_exploded
write_exploded(archive, "gs://bucket/schedules/feed-1", schedule_url="https://...")
```

## How Fingerprinting Works

1. Each `.txt` file is parsed to a PyArrow table (all strings)
2. Columns reordered per GTFS spec (unknown columns preserved, sorted alphabetically after spec columns)
3. Values normalized: whitespace stripped, times zero-padded (`9:05:00` → `09:05:00`)
4. Rows sorted by primary key (numeric columns sorted numerically, not lexicographically)
5. Canonical CSV serialized and BLAKE3 hashed per file
6. Archive fingerprint = BLAKE3 of sorted `filename:hash` pairs (merkle tree)

Unknown files (not in the GTFS spec) are preserved with lexicographic row sorting.
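
The whitespace and time-padding rules above can be checked end to end: two zips that differ only in those details should fingerprint identically. A self-contained sketch, relying on `from_zip` accepting raw bytes:

```python
import io
import zipfile

from gtfs_digester import GTFSArchive

def make_zip(stop_times_csv: str) -> bytes:
    """Build a one-file GTFS zip in memory."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("stop_times.txt", stop_times_csv)
    return buf.getvalue()

# Same data; the second copy adds stray spaces and zero-pads the time.
a = make_zip("trip_id,stop_sequence,arrival_time\nT1,1,9:05:00\n")
b = make_zip("trip_id,stop_sequence,arrival_time\nT1, 1, 09:05:00\n")

assert GTFSArchive.from_zip(a).fingerprint.hex() == GTFSArchive.from_zip(b).fingerprint.hex()
```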

## Storage Layout

`write_exploded()` produces a version-first directory:

```
base_path/
  _feed_digest={v1:abc...}/
    agency.parquet
    stops.parquet
    routes.parquet
    trips.parquet
    stop_times.parquet
    ...
    metadata.json   # provenance + digester info, written last (commit marker)
```

DuckDB reads across versions with hive partitioning:

```sql
SELECT * FROM read_parquet('base_path/_feed_digest=*/stops.parquet', hive_partitioning=true);
```
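
The same query runs from Python through the `duckdb` package (shown against a local `base_path`; reading `gs://` URLs additionally needs DuckDB's httpfs support and credentials):

```python
import duckdb

# The _feed_digest partition key surfaces as a regular column,
# so one query can compare row counts across feed versions.
rel = duckdb.sql("""
    SELECT _feed_digest, count(*) AS n_stops
    FROM read_parquet('base_path/_feed_digest=*/stops.parquet', hive_partitioning=true)
    GROUP BY _feed_digest
""")
print(rel)
```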

## Development

```bash
uv sync --group dev
uv run pytest tests/ -k "not Real"  # unit tests (~0.5s)
uv run pytest tests/                # includes real feed integration tests
```

## License

MIT

gtfs_digester-0.1.0/pyproject.toml
@@ -0,0 +1,47 @@
[project]
name = "gtfs-digester"
version = "0.1.0"
description = "Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds"
readme = "README.md"
authors = [
    { name = "Chris Alfano", email = "chris@jarv.us" }
]
license = "MIT"
requires-python = ">=3.11"
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Scientific/Engineering :: GIS",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]

dependencies = [
    "blake3>=1.0.8",
    "fsspec>=2026.3.0",
    "gcsfs>=2026.3.0",
    "pyarrow>=23.0.1",
]

[project.urls]
Homepage = "https://github.com/JarvusInnovations/gtfs-digester"
Repository = "https://github.com/JarvusInnovations/gtfs-digester"
Issues = "https://github.com/JarvusInnovations/gtfs-digester/issues"

[build-system]
requires = ["uv_build>=0.11.3,<0.12.0"]
build-backend = "uv_build"

[tool.pytest.ini_options]
testpaths = ["tests"]
markers = [
    "slow: marks tests that process real GTFS feeds (deselect with '-k not Real')",
]

[dependency-groups]
dev = [
    "pytest>=9.0.2",
    "pytest-cov>=7.1.0",
]

gtfs_digester-0.1.0/src/gtfs_digester/__init__.py
@@ -0,0 +1,26 @@
"""gtfs-digester: Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds."""

from .archive import GTFSArchive
from .diff import ArchiveDiff, FileDiff
from .file import GTFSFile
from .fingerprint import ArchiveFingerprint
from .metadata import FeedMetadata
from .schema import GTFS_SCHEMAS, FileSchema, get_schema
from .storage import list_versions, read_metadata, read_table, version_exists, write_exploded

__all__ = [
    "GTFSArchive",
    "GTFSFile",
    "ArchiveFingerprint",
    "ArchiveDiff",
    "FileDiff",
    "FeedMetadata",
    "FileSchema",
    "get_schema",
    "GTFS_SCHEMAS",
    "write_exploded",
    "read_metadata",
    "read_table",
    "list_versions",
    "version_exists",
]

gtfs_digester-0.1.0/src/gtfs_digester/archive.py
@@ -0,0 +1,181 @@
"""GTFSArchive: load from zip or directory, access Arrow tables, fingerprint, diff."""

from __future__ import annotations

import io
import warnings
import zipfile
from pathlib import Path

import pyarrow as pa

from .diff import ArchiveDiff, FileDiff, compute_file_diff
from .file import GTFSFile
from .fingerprint import ArchiveFingerprint
from .schema import get_schema


class GTFSArchive:
    """Represents a complete GTFS feed.

    Provides access to individual files, computes fingerprints,
    and writes normalized output. Archives are immutable.

    All .txt files are preserved — both known (with schema-aware normalization)
    and unknown (with lexicographic sorting).
    """

    def __init__(self, files: dict[str, GTFSFile]) -> None:
        self._files = files
        self._fingerprint: ArchiveFingerprint | None = None

    @staticmethod
    def from_zip(
        source: str | bytes | io.BytesIO | Path,
    ) -> GTFSArchive:
        """Load a GTFS archive from a zip file.

        Args:
            source: Path to zip file, raw bytes, or BytesIO.
        """
        if isinstance(source, (str, Path)):
            with open(source, "rb") as f:
                zip_bytes = f.read()
        elif isinstance(source, bytes):
            zip_bytes = source
        else:
            zip_bytes = source.read()

        zf = zipfile.ZipFile(io.BytesIO(zip_bytes))
        files: dict[str, GTFSFile] = {}

        for name in sorted(zf.namelist()):
            if name.endswith("/") or name.startswith(".") or name.startswith("__"):
                continue

            # Strip directory prefix (some zips nest files in a subdirectory)
            basename = name.split("/")[-1]

            if not basename.endswith(".txt"):
                continue

            schema = get_schema(basename)
            data = zf.read(name)
            gtfs_file = GTFSFile.from_csv_bytes(
                filename=basename,
                data=data,
                schema=schema,
            )
            files[basename] = gtfs_file

        return GTFSArchive(files=files)

    @staticmethod
    def from_directory(
        path: str | Path,
    ) -> GTFSArchive:
        """Load a GTFS archive from a directory of .txt files.

        Args:
            path: Path to directory containing GTFS .txt files.
        """
        dirpath = Path(path)
        if not dirpath.is_dir():
            raise ValueError(f"Not a directory: {dirpath}")

        files: dict[str, GTFSFile] = {}

        for txt_file in sorted(dirpath.glob("*.txt")):
            basename = txt_file.name
            if basename.startswith(".") or basename.startswith("__"):
                continue

            schema = get_schema(basename)
            data = txt_file.read_bytes()
            gtfs_file = GTFSFile.from_csv_bytes(
                filename=basename,
                data=data,
                schema=schema,
            )
            files[basename] = gtfs_file

        return GTFSArchive(files=files)

    @property
    def filenames(self) -> set[str]:
        """Set of filenames in this archive."""
        return set(self._files.keys())

    def __contains__(self, filename: str) -> bool:
        return filename in self._files

    def __getitem__(self, filename: str) -> GTFSFile:
        return self._files[filename]

    def arrow_table(self, filename: str) -> pa.Table:
        """Get the canonical Arrow table for a file."""
        return self[filename].table

    @property
    def fingerprint(self) -> ArchiveFingerprint:
        """Compute (or return cached) archive fingerprint."""
        if self._fingerprint is None:
            file_hashes: dict[str, str] = {}
            for filename, gtfs_file in self._files.items():
                file_hashes[filename] = gtfs_file.fingerprint_hash()
            self._fingerprint = ArchiveFingerprint.compute(file_hashes)
        return self._fingerprint

    def to_normalized_zip(self, path: str | Path) -> None:
        """Write the archive as a normalized GTFS zip.

        Contains canonical CSV files and a .gtfs_digester.json metadata file.
        """
        fp = self.fingerprint

        with zipfile.ZipFile(str(path), "w", compression=zipfile.ZIP_DEFLATED) as zf:
            for filename in sorted(self._files.keys()):
                gtfs_file = self._files[filename]
                csv_bytes = gtfs_file.to_canonical_csv()
                zf.writestr(filename, csv_bytes)

            zf.writestr(".gtfs_digester.json", fp.to_json())

    def diff(self, other: GTFSArchive) -> ArchiveDiff:
        """Compare this archive with another, returning an ArchiveDiff.

        This archive is treated as "old", other as "new".
        """
        old_files = self.filenames
        new_files = other.filenames

        added = new_files - old_files
        removed = old_files - new_files
        common = old_files & new_files

        modified: set[str] = set()
        unchanged: set[str] = set()
        file_diffs: dict[str, FileDiff] = {}

        for filename in common:
            old_hash = self._files[filename].fingerprint_hash()
            new_hash = other._files[filename].fingerprint_hash()
            if old_hash == new_hash:
                unchanged.add(filename)
            else:
                modified.add(filename)
                old_table = self._files[filename].table
                new_table = other._files[filename].table
                old_schema = self._files[filename].schema
                pk = old_schema.primary_key if old_schema else []
                file_diffs[filename] = compute_file_diff(
                    filename, old_table, new_table, pk
                )

        return ArchiveDiff(
            added_files=added,
            removed_files=removed,
            modified_files=modified,
            unchanged_files=unchanged,
            _file_diffs=file_diffs,
        )

gtfs_digester-0.1.0/src/gtfs_digester/diff.py
@@ -0,0 +1,160 @@
"""ArchiveDiff and FileDiff: archive and file-level diffing by primary key.

Diffing is hierarchical:
1. Compare archive fingerprints — if equal, done.
2. Compare file fingerprints — identify added/removed/modified files.
3. For modified files, compare rows by primary key.
"""

from __future__ import annotations

from dataclasses import dataclass

import pyarrow as pa


@dataclass
class FileDiff:
    """Row-level diff for a single modified GTFS file."""

    filename: str
    added: pa.Table
    removed: pa.Table
    modified: pa.Table

    @property
    def added_count(self) -> int:
        return self.added.num_rows

    @property
    def removed_count(self) -> int:
        return self.removed.num_rows

    @property
    def modified_count(self) -> int:
        return self.modified.num_rows

    def summary(self) -> str:
        return (
            f"+{self.added_count} added, "
            f"-{self.removed_count} removed, "
            f"~{self.modified_count} modified"
        )


@dataclass
class ArchiveDiff:
    """Diff between two GTFS archives."""

    added_files: set[str]
    removed_files: set[str]
    modified_files: set[str]
    unchanged_files: set[str]
    _file_diffs: dict[str, FileDiff]

    def file_diff(self, filename: str) -> FileDiff:
        """Get the row-level diff for a modified file."""
        if filename not in self._file_diffs:
            raise KeyError(
                f"No file diff for '{filename}'. "
                f"Modified files: {sorted(self.modified_files)}"
            )
        return self._file_diffs[filename]

    @property
    def is_identical(self) -> bool:
        return (
            not self.added_files
            and not self.removed_files
            and not self.modified_files
        )


def compute_file_diff(
    filename: str,
    old_table: pa.Table,
    new_table: pa.Table,
    primary_key: list[str],
) -> FileDiff:
    """Compute row-level diff between two tables using primary key."""
    if not primary_key:
        return _diff_no_pk(filename, old_table, new_table)

    old_keys: dict[tuple[str, ...], int] = {}
    for i in range(old_table.num_rows):
        key = tuple(old_table.column(col)[i].as_py() for col in primary_key)
        old_keys[key] = i

    new_keys: dict[tuple[str, ...], int] = {}
    for i in range(new_table.num_rows):
        key = tuple(new_table.column(col)[i].as_py() for col in primary_key)
        new_keys[key] = i

    old_key_set = set(old_keys.keys())
    new_key_set = set(new_keys.keys())

    added_keys = new_key_set - old_key_set
    removed_keys = old_key_set - new_key_set
    common_keys = old_key_set & new_key_set

    modified_indices_new: list[int] = []
    all_columns = new_table.column_names

    for key in common_keys:
        old_idx = old_keys[key]
        new_idx = new_keys[key]
        for col_name in all_columns:
            if col_name in old_table.column_names:
                old_val = old_table.column(col_name)[old_idx].as_py()
                new_val = new_table.column(col_name)[new_idx].as_py()
                if old_val != new_val:
                    modified_indices_new.append(new_idx)
                    break

    added_indices = sorted(new_keys[k] for k in added_keys)
    removed_indices = sorted(old_keys[k] for k in removed_keys)

    added_table = new_table.take(added_indices) if added_indices else new_table.slice(0, 0)
    removed_table = old_table.take(removed_indices) if removed_indices else old_table.slice(0, 0)
    modified_table = (
        new_table.take(sorted(modified_indices_new))
        if modified_indices_new
        else new_table.slice(0, 0)
    )

    return FileDiff(
        filename=filename,
        added=added_table,
        removed=removed_table,
        modified=modified_table,
    )


def _diff_no_pk(filename: str, old_table: pa.Table, new_table: pa.Table) -> FileDiff:
    """Diff tables with no primary key by comparing full row content."""
    def _row_tuple(table: pa.Table, idx: int) -> tuple[str, ...]:
        return tuple(table.column(c)[idx].as_py() for c in table.column_names)

    old_rows = {_row_tuple(old_table, i) for i in range(old_table.num_rows)}
    new_rows = {_row_tuple(new_table, i) for i in range(new_table.num_rows)}

    added_rows = new_rows - old_rows
    removed_rows = old_rows - new_rows

    # Pair each row tuple with its own table's column order, since the
    # two tables' columns are not guaranteed to match.
    added_cols = new_table.column_names
    added_data: dict[str, list[str]] = {c: [] for c in added_cols}
    for row in sorted(added_rows):
        for c, v in zip(added_cols, row):
            added_data[c].append(v)

    removed_cols = old_table.column_names
    removed_data: dict[str, list[str]] = {c: [] for c in removed_cols}
    for row in sorted(removed_rows):
        for c, v in zip(removed_cols, row):
            removed_data[c].append(v)

    return FileDiff(
        filename=filename,
        added=pa.table({c: pa.array(v, type=pa.string()) for c, v in added_data.items()}),
        removed=pa.table({c: pa.array(v, type=pa.string()) for c, v in removed_data.items()}),
        modified=new_table.slice(0, 0),
    )
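
A quick end-to-end check of the primary-key path above, with made-up rows (illustrative usage, not part of the package):

```python
import pyarrow as pa

from gtfs_digester.diff import compute_file_diff

old = pa.table({"stop_id": ["1", "2"], "stop_name": ["Main St", "Oak Ave"]})
new = pa.table({"stop_id": ["2", "3"], "stop_name": ["Oak Avenue", "Elm Rd"]})

# "3" is new, "1" is gone, and "2" changed its name.
fd = compute_file_diff("stops.txt", old, new, ["stop_id"])
print(fd.summary())  # +1 added, -1 removed, ~1 modified
```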