gtfs_digester-0.1.0.tar.gz

@@ -0,0 +1,119 @@ PKG-INFO
+ Metadata-Version: 2.4
+ Name: gtfs-digester
+ Version: 0.1.0
+ Summary: Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds
+ Author: Chris Alfano
+ Author-email: Chris Alfano <chris@jarv.us>
+ License-Expression: MIT
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Scientific/Engineering :: GIS
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: blake3>=1.0.8
+ Requires-Dist: fsspec>=2026.3.0
+ Requires-Dist: gcsfs>=2026.3.0
+ Requires-Dist: pyarrow>=23.0.1
+ Requires-Python: >=3.11
+ Project-URL: Homepage, https://github.com/JarvusInnovations/gtfs-digester
+ Project-URL: Repository, https://github.com/JarvusInnovations/gtfs-digester
+ Project-URL: Issues, https://github.com/JarvusInnovations/gtfs-digester/issues
+ Description-Content-Type: text/markdown
+
+ # gtfs-digester
+
+ Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds.
+
+ ## What It Does
+
+ Takes a GTFS zip and produces:
+
+ - A **content fingerprint** (BLAKE3 Merkle hash) — identical for semantically identical feeds regardless of zip metadata, file ordering, CSV whitespace, or time formatting
+ - **Canonical Arrow tables** for each file — normalized, sorted by primary key, all values as strings
+ - **Per-file hashes** for efficient hierarchical change detection
+ - **Archive diffs** — added/removed/modified files and rows by primary key
+ - **Exploded Parquet storage** — write/read from local or cloud (GCS, S3) via fsspec
+
+ All files and columns are preserved, including non-standard ones.
+
+ ## Install
+
+ ```bash
+ pip install gtfs-digester
+ # or
+ uv add gtfs-digester
+ ```
+
+ ## Quick Start
+
+ ```python
+ from gtfs_digester import GTFSArchive
+
+ # Load and fingerprint
+ archive = GTFSArchive.from_zip("google_transit.zip")
+ print(archive.fingerprint.hex())  # v1:abc123...
+
+ # Access canonical tables
+ stops = archive.arrow_table("stops.txt")
+ print(stops.num_rows)
+
+ # Compare two feed versions
+ old = GTFSArchive.from_zip("old.zip")
+ new = GTFSArchive.from_zip("new.zip")
+ diff = old.diff(new)
+ print(diff.is_identical)
+ for f in diff.modified_files:
+     fd = diff.file_diff(f)
+     print(f"{f}: {fd.summary()}")
+
+ # Write as exploded parquet
+ from gtfs_digester import write_exploded
+ write_exploded(archive, "gs://bucket/schedules/feed-1", schedule_url="https://...")
+ ```
+
+ ## How Fingerprinting Works
+
+ 1. Each `.txt` file is parsed to a PyArrow table (all strings)
+ 2. Columns are reordered per the GTFS spec (unknown columns preserved, sorted alphabetically after spec columns)
+ 3. Values are normalized: whitespace stripped, times zero-padded (`9:05:00` → `09:05:00`)
+ 4. Rows are sorted by primary key (numeric columns sorted numerically, not lexicographically)
+ 5. A canonical CSV is serialized and BLAKE3-hashed per file
+ 6. The archive fingerprint is the BLAKE3 hash of the sorted `filename:hash` pairs (a Merkle tree; sketched below)
+
+ Unknown files (not in the GTFS spec) are preserved with lexicographic row sorting.
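+
+ A minimal sketch of the archive-level hash in step 6, using the `blake3` package. The `filename:hash` pairing is from the list above; the newline delimiter and `v1:` prefix are illustrative assumptions, not necessarily the library's exact byte layout:
+
+ ```python
+ from blake3 import blake3
+
+ def archive_fingerprint(file_hashes: dict[str, str]) -> str:
+     """Fold sorted filename:hash pairs into a single digest (Merkle-style)."""
+     hasher = blake3()
+     for filename in sorted(file_hashes):
+         # One leaf per file; sorting makes the result independent of zip order.
+         hasher.update(f"{filename}:{file_hashes[filename]}\n".encode())
+     return "v1:" + hasher.hexdigest()
+
+ # The same per-file hashes always produce the same archive fingerprint.
+ print(archive_fingerprint({"stops.txt": "9f2c...", "trips.txt": "41ab..."}))
+ ```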
+
+ ## Storage Layout
+
+ `write_exploded()` produces a version-first directory:
+
+ ```
+ base_path/
+   _feed_digest={v1:abc...}/
+     agency.parquet
+     stops.parquet
+     routes.parquet
+     trips.parquet
+     stop_times.parquet
+     ...
+     metadata.json   # provenance + digester info, written last (commit marker)
+ ```
+
+ DuckDB reads across versions with Hive partitioning:
+
+ ```sql
+ SELECT * FROM read_parquet('base_path/_feed_digest=*/stops.parquet', hive_partitioning=true);
+ ```
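+
+ The exported storage helpers can read this layout back from Python. A sketch, assuming `list_versions(base)` yields version digests and `read_table(base, digest, filename)` returns an Arrow table (the function names are real exports; the signatures here are guesses):
+
+ ```python
+ from gtfs_digester import GTFSArchive, list_versions, read_table, version_exists, write_exploded
+
+ base = "gs://bucket/schedules/feed-1"
+ archive = GTFSArchive.from_zip("google_transit.zip")
+
+ # Skip work if this exact feed version was already written (guessed signature).
+ if not version_exists(base, archive.fingerprint.hex()):
+     write_exploded(archive, base)
+
+ # Walk every stored version and read one table back (guessed signatures).
+ for digest in list_versions(base):
+     stops = read_table(base, digest, "stops.txt")
+     print(digest, stops.num_rows)
+ ```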
+
+ ## Development
+
+ ```bash
+ uv sync --group dev
+ uv run pytest tests/ -k "not Real"   # unit tests (~0.5s)
+ uv run pytest tests/                 # includes real feed integration tests
+ ```
+
+ ## License
+
+ MIT
@@ -0,0 +1,95 @@ README.md
+ # gtfs-digester
+
+ Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds.
+
+ ## What It Does
+
+ Takes a GTFS zip and produces:
+
+ - A **content fingerprint** (BLAKE3 Merkle hash) — identical for semantically identical feeds regardless of zip metadata, file ordering, CSV whitespace, or time formatting
+ - **Canonical Arrow tables** for each file — normalized, sorted by primary key, all values as strings
+ - **Per-file hashes** for efficient hierarchical change detection
+ - **Archive diffs** — added/removed/modified files and rows by primary key
+ - **Exploded Parquet storage** — write/read from local or cloud (GCS, S3) via fsspec
+
+ All files and columns are preserved, including non-standard ones.
+
+ ## Install
+
+ ```bash
+ pip install gtfs-digester
+ # or
+ uv add gtfs-digester
+ ```
+
+ ## Quick Start
+
+ ```python
+ from gtfs_digester import GTFSArchive
+
+ # Load and fingerprint
+ archive = GTFSArchive.from_zip("google_transit.zip")
+ print(archive.fingerprint.hex())  # v1:abc123...
+
+ # Access canonical tables
+ stops = archive.arrow_table("stops.txt")
+ print(stops.num_rows)
+
+ # Compare two feed versions
+ old = GTFSArchive.from_zip("old.zip")
+ new = GTFSArchive.from_zip("new.zip")
+ diff = old.diff(new)
+ print(diff.is_identical)
+ for f in diff.modified_files:
+     fd = diff.file_diff(f)
+     print(f"{f}: {fd.summary()}")
+
+ # Write as exploded parquet
+ from gtfs_digester import write_exploded
+ write_exploded(archive, "gs://bucket/schedules/feed-1", schedule_url="https://...")
+ ```
+
+ ## How Fingerprinting Works
+
+ 1. Each `.txt` file is parsed to a PyArrow table (all strings)
+ 2. Columns are reordered per the GTFS spec (unknown columns preserved, sorted alphabetically after spec columns)
+ 3. Values are normalized: whitespace stripped, times zero-padded (`9:05:00` → `09:05:00`)
+ 4. Rows are sorted by primary key (numeric columns sorted numerically, not lexicographically)
+ 5. A canonical CSV is serialized and BLAKE3-hashed per file
+ 6. The archive fingerprint is the BLAKE3 hash of the sorted `filename:hash` pairs (a Merkle tree; sketched below)
+
+ Unknown files (not in the GTFS spec) are preserved with lexicographic row sorting.
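+
+ A minimal sketch of the archive-level hash in step 6, using the `blake3` package. The `filename:hash` pairing is from the list above; the newline delimiter and `v1:` prefix are illustrative assumptions, not necessarily the library's exact byte layout:
+
+ ```python
+ from blake3 import blake3
+
+ def archive_fingerprint(file_hashes: dict[str, str]) -> str:
+     """Fold sorted filename:hash pairs into a single digest (Merkle-style)."""
+     hasher = blake3()
+     for filename in sorted(file_hashes):
+         # One leaf per file; sorting makes the result independent of zip order.
+         hasher.update(f"{filename}:{file_hashes[filename]}\n".encode())
+     return "v1:" + hasher.hexdigest()
+
+ # The same per-file hashes always produce the same archive fingerprint.
+ print(archive_fingerprint({"stops.txt": "9f2c...", "trips.txt": "41ab..."}))
+ ```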
+
+ ## Storage Layout
+
+ `write_exploded()` produces a version-first directory:
+
+ ```
+ base_path/
+   _feed_digest={v1:abc...}/
+     agency.parquet
+     stops.parquet
+     routes.parquet
+     trips.parquet
+     stop_times.parquet
+     ...
+     metadata.json   # provenance + digester info, written last (commit marker)
+ ```
+
+ DuckDB reads across versions with Hive partitioning:
+
+ ```sql
+ SELECT * FROM read_parquet('base_path/_feed_digest=*/stops.parquet', hive_partitioning=true);
+ ```
+
+ ## Development
+
+ ```bash
+ uv sync --group dev
+ uv run pytest tests/ -k "not Real"   # unit tests (~0.5s)
+ uv run pytest tests/                 # includes real feed integration tests
+ ```
+
+ ## License
+
+ MIT
@@ -0,0 +1,47 @@ pyproject.toml
+ [project]
+ name = "gtfs-digester"
+ version = "0.1.0"
+ description = "Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds"
+ readme = "README.md"
+ authors = [
+     { name = "Chris Alfano", email = "chris@jarv.us" }
+ ]
+ license = "MIT"
+ requires-python = ">=3.11"
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Topic :: Scientific/Engineering :: GIS",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+ ]
+
+ dependencies = [
+     "blake3>=1.0.8",
+     "fsspec>=2026.3.0",
+     "gcsfs>=2026.3.0",
+     "pyarrow>=23.0.1",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/JarvusInnovations/gtfs-digester"
+ Repository = "https://github.com/JarvusInnovations/gtfs-digester"
+ Issues = "https://github.com/JarvusInnovations/gtfs-digester/issues"
+
+ [build-system]
+ requires = ["uv_build>=0.11.3,<0.12.0"]
+ build-backend = "uv_build"
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ markers = [
+     "slow: marks tests that process real GTFS feeds (deselect with '-k not Real')",
+ ]
+
+ [dependency-groups]
+ dev = [
+     "pytest>=9.0.2",
+     "pytest-cov>=7.1.0",
+ ]
@@ -0,0 +1,26 @@ gtfs_digester/__init__.py
+ """gtfs-digester: Canonicalization, fingerprinting, and change detection for GTFS Schedule feeds."""
+
+ from .archive import GTFSArchive
+ from .diff import ArchiveDiff, FileDiff
+ from .file import GTFSFile
+ from .fingerprint import ArchiveFingerprint
+ from .metadata import FeedMetadata
+ from .schema import GTFS_SCHEMAS, FileSchema, get_schema
+ from .storage import list_versions, read_metadata, read_table, version_exists, write_exploded
+
+ __all__ = [
+     "GTFSArchive",
+     "GTFSFile",
+     "ArchiveFingerprint",
+     "ArchiveDiff",
+     "FileDiff",
+     "FeedMetadata",
+     "FileSchema",
+     "get_schema",
+     "GTFS_SCHEMAS",
+     "write_exploded",
+     "read_metadata",
+     "read_table",
+     "list_versions",
+     "version_exists",
+ ]
@@ -0,0 +1,181 @@ gtfs_digester/archive.py
+ """GTFSArchive: load from zip or directory, access Arrow tables, fingerprint, diff."""
+
+ from __future__ import annotations
+
+ import io
+ import zipfile
+ from pathlib import Path
+
+ import pyarrow as pa
+
+ from .diff import ArchiveDiff, FileDiff, compute_file_diff
+ from .file import GTFSFile
+ from .fingerprint import ArchiveFingerprint
+ from .schema import get_schema
+
+
+ class GTFSArchive:
+     """Represents a complete GTFS feed.
+
+     Provides access to individual files, computes fingerprints,
+     and writes normalized output. Archives are immutable.
+
+     All .txt files are preserved — both known (with schema-aware normalization)
+     and unknown (with lexicographic sorting).
+     """
+
+     def __init__(self, files: dict[str, GTFSFile]) -> None:
+         self._files = files
+         self._fingerprint: ArchiveFingerprint | None = None
+
+     @staticmethod
+     def from_zip(
+         source: str | bytes | io.BytesIO | Path,
+     ) -> GTFSArchive:
+         """Load a GTFS archive from a zip file.
+
+         Args:
+             source: Path to zip file, raw bytes, or BytesIO.
+         """
+         if isinstance(source, (str, Path)):
+             with open(source, "rb") as f:
+                 zip_bytes = f.read()
+         elif isinstance(source, bytes):
+             zip_bytes = source
+         else:
+             zip_bytes = source.read()
+
+         files: dict[str, GTFSFile] = {}
+
+         with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+             for name in sorted(zf.namelist()):
+                 # Skip directory entries and hidden/metadata entries (e.g. __MACOSX)
+                 if name.endswith("/") or name.startswith(".") or name.startswith("__"):
+                     continue
+
+                 # Strip directory prefix (some zips nest files in a subdirectory)
+                 basename = name.split("/")[-1]
+
+                 if not basename.endswith(".txt"):
+                     continue
+
+                 schema = get_schema(basename)
+                 data = zf.read(name)
+                 gtfs_file = GTFSFile.from_csv_bytes(
+                     filename=basename,
+                     data=data,
+                     schema=schema,
+                 )
+                 files[basename] = gtfs_file
+
+         return GTFSArchive(files=files)
+
+     @staticmethod
+     def from_directory(
+         path: str | Path,
+     ) -> GTFSArchive:
+         """Load a GTFS archive from a directory of .txt files.
+
+         Args:
+             path: Path to directory containing GTFS .txt files.
+         """
+         dirpath = Path(path)
+         if not dirpath.is_dir():
+             raise ValueError(f"Not a directory: {dirpath}")
+
+         files: dict[str, GTFSFile] = {}
+
+         for txt_file in sorted(dirpath.glob("*.txt")):
+             basename = txt_file.name
+             if basename.startswith(".") or basename.startswith("__"):
+                 continue
+
+             schema = get_schema(basename)
+             data = txt_file.read_bytes()
+             gtfs_file = GTFSFile.from_csv_bytes(
+                 filename=basename,
+                 data=data,
+                 schema=schema,
+             )
+             files[basename] = gtfs_file
+
+         return GTFSArchive(files=files)
+
+     @property
+     def filenames(self) -> set[str]:
+         """Set of filenames in this archive."""
+         return set(self._files.keys())
+
+     def __contains__(self, filename: str) -> bool:
+         return filename in self._files
+
+     def __getitem__(self, filename: str) -> GTFSFile:
+         return self._files[filename]
+
+     def arrow_table(self, filename: str) -> pa.Table:
+         """Get the canonical Arrow table for a file."""
+         return self[filename].table
+
+     @property
+     def fingerprint(self) -> ArchiveFingerprint:
+         """Compute (or return cached) archive fingerprint."""
+         if self._fingerprint is None:
+             file_hashes: dict[str, str] = {}
+             for filename, gtfs_file in self._files.items():
+                 file_hashes[filename] = gtfs_file.fingerprint_hash()
+             self._fingerprint = ArchiveFingerprint.compute(file_hashes)
+         return self._fingerprint
+
+     def to_normalized_zip(self, path: str | Path) -> None:
+         """Write the archive as a normalized GTFS zip.
+
+         Contains canonical CSV files and a .gtfs_digester.json metadata file.
+         """
+         fp = self.fingerprint
+
+         with zipfile.ZipFile(str(path), "w", compression=zipfile.ZIP_DEFLATED) as zf:
+             for filename in sorted(self._files.keys()):
+                 gtfs_file = self._files[filename]
+                 csv_bytes = gtfs_file.to_canonical_csv()
+                 zf.writestr(filename, csv_bytes)
+
+             zf.writestr(".gtfs_digester.json", fp.to_json())
+
+     def diff(self, other: GTFSArchive) -> ArchiveDiff:
+         """Compare this archive with another, returning an ArchiveDiff.
+
+         This archive is treated as "old", other as "new".
+         """
+         old_files = self.filenames
+         new_files = other.filenames
+
+         added = new_files - old_files
+         removed = old_files - new_files
+         common = old_files & new_files
+
+         modified: set[str] = set()
+         unchanged: set[str] = set()
+         file_diffs: dict[str, FileDiff] = {}
+
+         for filename in common:
+             old_hash = self._files[filename].fingerprint_hash()
+             new_hash = other._files[filename].fingerprint_hash()
+             if old_hash == new_hash:
+                 unchanged.add(filename)
+             else:
+                 modified.add(filename)
+                 old_table = self._files[filename].table
+                 new_table = other._files[filename].table
+                 old_schema = self._files[filename].schema
+                 # Unknown files have no schema; an empty key falls back to whole-row diffing
+                 pk = old_schema.primary_key if old_schema else []
+                 file_diffs[filename] = compute_file_diff(
+                     filename, old_table, new_table, pk
+                 )
+
+         return ArchiveDiff(
+             added_files=added,
+             removed_files=removed,
+             modified_files=modified,
+             unchanged_files=unchanged,
+             _file_diffs=file_diffs,
+         )
@@ -0,0 +1,160 @@ gtfs_digester/diff.py
+ """ArchiveDiff and FileDiff: archive and file-level diffing by primary key.
+
+ Diffing is hierarchical:
+ 1. Compare archive fingerprints — if equal, done.
+ 2. Compare file fingerprints — identify added/removed/modified files.
+ 3. For modified files, compare rows by primary key.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ import pyarrow as pa
+
+
+ @dataclass
+ class FileDiff:
+     """Row-level diff for a single modified GTFS file."""
+
+     filename: str
+     added: pa.Table
+     removed: pa.Table
+     modified: pa.Table
+
+     @property
+     def added_count(self) -> int:
+         return self.added.num_rows
+
+     @property
+     def removed_count(self) -> int:
+         return self.removed.num_rows
+
+     @property
+     def modified_count(self) -> int:
+         return self.modified.num_rows
+
+     def summary(self) -> str:
+         return (
+             f"+{self.added_count} added, "
+             f"-{self.removed_count} removed, "
+             f"~{self.modified_count} modified"
+         )
+
+
+ @dataclass
+ class ArchiveDiff:
+     """Diff between two GTFS archives."""
+
+     added_files: set[str]
+     removed_files: set[str]
+     modified_files: set[str]
+     unchanged_files: set[str]
+     _file_diffs: dict[str, FileDiff]
+
+     def file_diff(self, filename: str) -> FileDiff:
+         """Get the row-level diff for a modified file."""
+         if filename not in self._file_diffs:
+             raise KeyError(
+                 f"No file diff for '{filename}'. "
+                 f"Modified files: {sorted(self.modified_files)}"
+             )
+         return self._file_diffs[filename]
+
+     @property
+     def is_identical(self) -> bool:
+         return (
+             not self.added_files
+             and not self.removed_files
+             and not self.modified_files
+         )
+
+
+ def compute_file_diff(
+     filename: str,
+     old_table: pa.Table,
+     new_table: pa.Table,
+     primary_key: list[str],
+ ) -> FileDiff:
+     """Compute row-level diff between two tables using primary key."""
+     if not primary_key:
+         return _diff_no_pk(filename, old_table, new_table)
+
+     # Map each primary-key tuple to its row index in each table.
+     old_keys: dict[tuple[str, ...], int] = {}
+     for i in range(old_table.num_rows):
+         key = tuple(old_table.column(col)[i].as_py() for col in primary_key)
+         old_keys[key] = i
+
+     new_keys: dict[tuple[str, ...], int] = {}
+     for i in range(new_table.num_rows):
+         key = tuple(new_table.column(col)[i].as_py() for col in primary_key)
+         new_keys[key] = i
+
+     old_key_set = set(old_keys.keys())
+     new_key_set = set(new_keys.keys())
+
+     added_keys = new_key_set - old_key_set
+     removed_keys = old_key_set - new_key_set
+     common_keys = old_key_set & new_key_set
+
+     modified_indices_new: list[int] = []
+     all_columns = new_table.column_names
+
+     for key in common_keys:
+         old_idx = old_keys[key]
+         new_idx = new_keys[key]
+         for col_name in all_columns:
+             # Only columns present in both versions are compared; a column
+             # added or dropped wholesale does not by itself flag rows as modified.
+             if col_name in old_table.column_names:
+                 old_val = old_table.column(col_name)[old_idx].as_py()
+                 new_val = new_table.column(col_name)[new_idx].as_py()
+                 if old_val != new_val:
+                     modified_indices_new.append(new_idx)
+                     break
+
+     added_indices = sorted(new_keys[k] for k in added_keys)
+     removed_indices = sorted(old_keys[k] for k in removed_keys)
+
+     added_table = new_table.take(added_indices) if added_indices else new_table.slice(0, 0)
+     removed_table = old_table.take(removed_indices) if removed_indices else old_table.slice(0, 0)
+     modified_table = (
+         new_table.take(sorted(modified_indices_new))
+         if modified_indices_new
+         else new_table.slice(0, 0)
+     )
+
+     return FileDiff(
+         filename=filename,
+         added=added_table,
+         removed=removed_table,
+         modified=modified_table,
+     )
+
+
+ def _diff_no_pk(filename: str, old_table: pa.Table, new_table: pa.Table) -> FileDiff:
+     """Diff tables with no primary key by comparing full row content."""
+
+     def _row_tuple(table: pa.Table, idx: int) -> tuple[str, ...]:
+         return tuple(table.column(c)[idx].as_py() for c in table.column_names)
+
+     def _sort_key(row: tuple[str, ...]) -> tuple[str, ...]:
+         # Sort for deterministic output; map None to "" so null cells
+         # cannot break tuple comparison.
+         return tuple("" if v is None else v for v in row)
+
+     old_rows = {_row_tuple(old_table, i) for i in range(old_table.num_rows)}
+     new_rows = {_row_tuple(new_table, i) for i in range(new_table.num_rows)}
+
+     added_rows = new_rows - old_rows
+     removed_rows = old_rows - new_rows
+
+     # Added rows follow new_table's column order; removed rows must follow
+     # old_table's, since that is the order their tuples were built in.
+     new_cols = new_table.column_names
+     added_data: dict[str, list] = {c: [] for c in new_cols}
+     for row in sorted(added_rows, key=_sort_key):
+         for c, v in zip(new_cols, row):
+             added_data[c].append(v)
+
+     old_cols = old_table.column_names
+     removed_data: dict[str, list] = {c: [] for c in old_cols}
+     for row in sorted(removed_rows, key=_sort_key):
+         for c, v in zip(old_cols, row):
+             removed_data[c].append(v)
+
+     # Without a primary key, old and new rows cannot be paired, so every
+     # change surfaces as a remove plus an add and `modified` stays empty.
+     return FileDiff(
+         filename=filename,
+         added=pa.table({c: pa.array(v, type=pa.string()) for c, v in added_data.items()}),
+         removed=pa.table({c: pa.array(v, type=pa.string()) for c, v in removed_data.items()}),
+         modified=new_table.slice(0, 0),
+     )