gbfs-toolkit 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. gbfs_toolkit-1.0.0/LICENSE +32 -0
  2. gbfs_toolkit-1.0.0/PKG-INFO +329 -0
  3. gbfs_toolkit-1.0.0/README.md +282 -0
  4. gbfs_toolkit-1.0.0/pyproject.toml +83 -0
  5. gbfs_toolkit-1.0.0/setup.cfg +4 -0
  6. gbfs_toolkit-1.0.0/src/gbfs_toolkit/__init__.py +223 -0
  7. gbfs_toolkit-1.0.0/src/gbfs_toolkit/accessor.py +105 -0
  8. gbfs_toolkit-1.0.0/src/gbfs_toolkit/analysis.py +274 -0
  9. gbfs_toolkit-1.0.0/src/gbfs_toolkit/audit/__init__.py +50 -0
  10. gbfs_toolkit-1.0.0/src/gbfs_toolkit/audit/dynamic.py +94 -0
  11. gbfs_toolkit-1.0.0/src/gbfs_toolkit/audit/static.py +215 -0
  12. gbfs_toolkit-1.0.0/src/gbfs_toolkit/catalog.py +189 -0
  13. gbfs_toolkit-1.0.0/src/gbfs_toolkit/cli.py +67 -0
  14. gbfs_toolkit-1.0.0/src/gbfs_toolkit/cluster.py +348 -0
  15. gbfs_toolkit-1.0.0/src/gbfs_toolkit/datasets.py +80 -0
  16. gbfs_toolkit-1.0.0/src/gbfs_toolkit/diagnostics.py +32 -0
  17. gbfs_toolkit-1.0.0/src/gbfs_toolkit/errors.py +34 -0
  18. gbfs_toolkit-1.0.0/src/gbfs_toolkit/fetch.py +510 -0
  19. gbfs_toolkit-1.0.0/src/gbfs_toolkit/fleet.py +155 -0
  20. gbfs_toolkit-1.0.0/src/gbfs_toolkit/geo.py +269 -0
  21. gbfs_toolkit-1.0.0/src/gbfs_toolkit/geofencing.py +164 -0
  22. gbfs_toolkit-1.0.0/src/gbfs_toolkit/models.py +271 -0
  23. gbfs_toolkit-1.0.0/src/gbfs_toolkit/multimodal.py +84 -0
  24. gbfs_toolkit-1.0.0/src/gbfs_toolkit/normalize.py +362 -0
  25. gbfs_toolkit-1.0.0/src/gbfs_toolkit/osm.py +111 -0
  26. gbfs_toolkit-1.0.0/src/gbfs_toolkit/py.typed +0 -0
  27. gbfs_toolkit-1.0.0/src/gbfs_toolkit/stats.py +415 -0
  28. gbfs_toolkit-1.0.0/src/gbfs_toolkit/timeseries.py +529 -0
  29. gbfs_toolkit-1.0.0/src/gbfs_toolkit.egg-info/PKG-INFO +329 -0
  30. gbfs_toolkit-1.0.0/src/gbfs_toolkit.egg-info/SOURCES.txt +51 -0
  31. gbfs_toolkit-1.0.0/src/gbfs_toolkit.egg-info/dependency_links.txt +1 -0
  32. gbfs_toolkit-1.0.0/src/gbfs_toolkit.egg-info/entry_points.txt +2 -0
  33. gbfs_toolkit-1.0.0/src/gbfs_toolkit.egg-info/requires.txt +30 -0
  34. gbfs_toolkit-1.0.0/src/gbfs_toolkit.egg-info/top_level.txt +1 -0
  35. gbfs_toolkit-1.0.0/tests/test_analysis_geo.py +145 -0
  36. gbfs_toolkit-1.0.0/tests/test_audit_static.py +92 -0
  37. gbfs_toolkit-1.0.0/tests/test_cli.py +33 -0
  38. gbfs_toolkit-1.0.0/tests/test_cluster.py +137 -0
  39. gbfs_toolkit-1.0.0/tests/test_consolidation.py +100 -0
  40. gbfs_toolkit-1.0.0/tests/test_ergonomics.py +85 -0
  41. gbfs_toolkit-1.0.0/tests/test_fetch.py +295 -0
  42. gbfs_toolkit-1.0.0/tests/test_fleet.py +121 -0
  43. gbfs_toolkit-1.0.0/tests/test_from_projects.py +81 -0
  44. gbfs_toolkit-1.0.0/tests/test_geofencing.py +135 -0
  45. gbfs_toolkit-1.0.0/tests/test_hardening.py +157 -0
  46. gbfs_toolkit-1.0.0/tests/test_multimodal.py +51 -0
  47. gbfs_toolkit-1.0.0/tests/test_normalize_catalog.py +67 -0
  48. gbfs_toolkit-1.0.0/tests/test_osm_surroundings.py +52 -0
  49. gbfs_toolkit-1.0.0/tests/test_quality.py +324 -0
  50. gbfs_toolkit-1.0.0/tests/test_research_helpers.py +205 -0
  51. gbfs_toolkit-1.0.0/tests/test_robustness.py +225 -0
  52. gbfs_toolkit-1.0.0/tests/test_stats.py +182 -0
  53. gbfs_toolkit-1.0.0/tests/test_timeseries.py +111 -0
@@ -0,0 +1,32 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025-2026 Rohan Fossé and Gaël Pallares
4
+ CESI LINEACT (EA 7527), Montpellier, France
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+
24
+ ----------------------------------------------------------------------
25
+
26
+ This MIT licence applies to the source code in this repository
27
+ (`audit_pipeline/`, `app/`, `notebooks/`, `paper/`).
28
+
29
+ The data files under `catalogue/`, the Zenodo deposit and the Hugging
30
+ Face dataset mirror are distributed under the Open Data Commons Open
31
+ Database License (ODbL) v1.0; see LICENSE-DATA for the full text and
32
+ attribution requirements.
@@ -0,0 +1,329 @@
1
+ Metadata-Version: 2.4
2
+ Name: gbfs-toolkit
3
+ Version: 1.0.0
4
+ Summary: Research-grade ingestion and semantic quality audit (A1–A7) for GBFS bike-share feeds
5
+ Author: Gaël Pallares
6
+ Author-email: Rohan Fossé <rfosse@cesi.fr>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/cycling-data-lab/gbfs-toolkit
9
+ Project-URL: Repository, https://github.com/cycling-data-lab/gbfs-toolkit
10
+ Keywords: GBFS,bike-sharing,shared mobility,data quality,semantic validation,open data
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: GIS
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy>=1.26
24
+ Requires-Dist: scipy>=1.13
25
+ Requires-Dist: pandas>=2.2
26
+ Provides-Extra: fetch
27
+ Requires-Dist: requests>=2.31; extra == "fetch"
28
+ Provides-Extra: geo
29
+ Requires-Dist: geopandas>=0.14; extra == "geo"
30
+ Provides-Extra: osm
31
+ Requires-Dist: geopandas>=0.14; extra == "osm"
32
+ Provides-Extra: parquet
33
+ Requires-Dist: pyarrow>=15.0; extra == "parquet"
34
+ Provides-Extra: cluster
35
+ Requires-Dist: scikit-learn>=1.4; extra == "cluster"
36
+ Provides-Extra: dtw
37
+ Requires-Dist: tslearn>=0.6; extra == "dtw"
38
+ Provides-Extra: dev
39
+ Requires-Dist: pytest>=8.0; extra == "dev"
40
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
41
+ Requires-Dist: ruff>=0.5; extra == "dev"
42
+ Requires-Dist: requests>=2.31; extra == "dev"
43
+ Requires-Dist: pyarrow>=15.0; extra == "dev"
44
+ Requires-Dist: scikit-learn>=1.4; extra == "dev"
45
+ Requires-Dist: geopandas>=0.14; extra == "dev"
46
+ Dynamic: license-file
47
+
48
+ # gbfs-toolkit
49
+
50
+ [![CI](https://github.com/cycling-data-lab/gbfs-toolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/cycling-data-lab/gbfs-toolkit/actions/workflows/ci.yml)
51
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](./LICENSE)
52
+ [![Python 3.10+](https://img.shields.io/badge/Python-3.10+-blue.svg)](https://www.python.org/)
53
+
54
+ **Research-grade ingestion and *semantic* quality audit for GBFS bike-share feeds.**
55
+
56
+ MobilityData's [`gbfs-validator`](https://github.com/MobilityData/gbfs-validator) checks
57
+ that a feed is *syntactically* valid. `gbfs-toolkit` checks whether it is *semantically*
58
+ trustworthy and analysis-ready — the **A1–A7 quality taxonomy** of Fossé & Pallares
59
+ ([`gbfs-audit-catalogue`](https://github.com/cycling-data-lab/gbfs-audit-catalogue)) — and
60
+ normalises feeds into a **stable, version-independent data model** you can reuse across
61
+ studies.
62
+
63
+ ## Why
64
+
65
+ Every bike-share study re-implements the same plumbing — discover feeds, normalise GBFS
66
+ 1.x/2.x/3.x, and (the hard part) cope with the semantic defects the syntactic validator
67
+ cannot see: placeholder capacities, phantom docks, transposed coordinates, out-of-perimeter
68
+ stations. This package consolidates that into one tested interface so the audit is a verdict
69
+ per station, not a re-run of someone's notebook.
70
+
71
+ ## Install
72
+
73
+ ```bash
74
+ pip install gbfs-toolkit # from PyPI
75
+ pip install -e ".[dev]" # from a local clone
76
+ ```
77
+
78
+ Core depends only on numpy / scipy / pandas. Network discovery/fetch uses the optional
79
+ `[fetch]` extra (`requests`).
80
+
81
+ ## Quick start
82
+
83
+ ```python
84
+ import gbfs_toolkit as gb
85
+
86
+ info, status = gb.load_example() # bundled sample — no network needed
87
+ av = info.gbfs.join_status(status) # fluent .gbfs accessor (or gb.join_availability)
88
+ clean = info.gbfs.drop_flagged() # audit A1–A7 and keep the trustworthy stations
89
+ av.gbfs.occupancy() # bikes / (bikes + docks), NaN-safe
90
+ ```
91
+
92
+ From your own feed:
93
+
94
+ ```python
95
+ import json
96
+
97
+ raw = json.load(open("station_information.json"))
98
+ stations = gb.to_canonical_station_info(raw, system_id="velib") # version-independent frame
99
+ verdict = gb.audit_static(stations) # A1–A7 per station
100
+ clean = stations[~verdict["flagged"].to_numpy()] # quality filter in one line
101
+ ```
102
+
103
+ Every function is also a `.gbfs` accessor method, and pure (so `df.pipe(gb.occupancy)` works).
104
+ `gb.show_versions()` prints an environment report for bug reports.
105
+
106
+ Command line (the semantic counterpart to `gbfs-validator`):
107
+
108
+ ```bash
109
+ gbfs audit station_information.json --system-id velib --out verdict.csv
110
+ ```
111
+
112
+ ## The A1–A7 semantic taxonomy
113
+
114
+ | Flag | Rule | Signature | Level |
115
+ |---|---|---|---|
116
+ | A1 | Out-of-domain inclusion | car-sharing advertised as bike-sharing | station |
117
+ | A2 | Placeholder capacity | constant non-zero capacity across a whole system | system |
118
+ | A3 | Structural over-capacity | free-floating fleet anchors | station |
119
+ | A4 | Geospatial error | transposed coords / stations far from neighbours (3σ) | station |
120
+ | A5 | Out-of-perimeter | system bounding box > 50,000 km² | system |
121
+ | A6 | Zero-capacity dock | ≥1% of docked stations declare capacity = 0 | system |
122
+ | A7 | Null capacity field | ≥50% of stations declare capacity = NaN | system |
123
+
124
+ Thresholds match the published catalogue, so verdicts reproduce.
125
+
126
+ ## Canonical data model (the stable contract)
127
+
128
+ Ingestion is normalised **once** into version-independent frames; audit and analysis then
129
+ operate purely on these. Downstream code depends on these column names, never on raw GBFS
130
+ JSON.
131
+
132
+ - **StationInfo**: `system_id, station_id, name, lat, lon, capacity, station_type, is_virtual_station`
133
+ - **StationStatus**: `system_id, station_id, num_bikes_available, num_docks_available, is_renting, is_returning, last_reported, fetched_at, gbfs_version`
134
+ - **VehicleStatus**: `system_id, vehicle_id, vehicle_type_id, lat, lon, is_reserved, is_disabled, fetched_at, gbfs_version`
135
+ - **AuditVerdict**: `system_id, station_id, A1…A7, flagged, reason`
136
+
137
+ `last_reported` and `fetched_at` are tz-aware **UTC** timestamps (`datetime64[ns, UTC]`) so
138
+ feeds from different cities merge unambiguously.
139
+
140
+ ## Daily ergonomics
141
+
142
+ ```python
143
+ import gbfs_toolkit as gb
144
+
145
+ # discover by city (you rarely know the system_id)
146
+ cat = gb.systems_catalog()
147
+ paris = gb.filter_catalog(cat, country_code="FR", city="Paris")
148
+
149
+ feed = gb.GBFSFeed.from_url(url)
150
+ feed.summary() # one-glance card: stations, bikes, staleness, version
151
+ avail = feed.availability() # bikes/docks + name/coords/capacity, one frame
152
+ avail["state"] = gb.station_state(avail) # empty / full / disabled / normal
153
+ problems = gb.audit_dynamic(avail) # negative counts, over-capacity, stale
154
+ near = gb.find_nearest_stations(48.85, 2.35, feed.station_information(), k=3)
155
+
156
+ # many systems at once (threaded), broken feeds isolated as Exceptions
157
+ feeds = gb.fetch_multiple(["velib", "bixi", "lyon"], max_workers=5)
158
+ ```
159
+
160
+ ## Longitudinal data lake
161
+
162
+ Turn a stream of snapshots into an analysis-ready panel. The library owns the
163
+ formatting / dedup / I/O; your orchestrator (cron, Airflow…) owns the polling loop.
164
+ Requires the optional `[parquet]` extra (`pyarrow`).
165
+
166
+ ```python
167
+ import gbfs_toolkit as gb
168
+
169
+ # in your poller (every N minutes):
170
+ gb.append_to_parquet(feed.station_status(), "lake/") # Hive-partitioned by system_id/date
171
+
172
+ # in your analysis:
173
+ panel = gb.build_availability_panel("lake/", system_id="velib",
174
+ start_time="2026-06-01", resample_freq="5min")
175
+ flow = gb.calculate_net_flow(panel) # Δ bikes/station per poll (observed flow only)
176
+ ```
177
+
178
+ `build_availability_panel` filters partitions *before* loading (memory-bounded),
179
+ de-duplicates redundant polls (same `station_id` + `last_reported`), and optionally
180
+ resamples each station to a fixed cadence.
181
+
182
+ ## Station clustering (`[cluster]`)
183
+
184
+ Three lenses on "which stations belong together" — spatial, topological, behavioural:
185
+
186
+ ```python
187
+ gb.cluster_spatial(info, method="hdbscan") # density zones (projected metres)
188
+ gb.cluster_spectral(info, k=6) # network/topology groups
189
+ gb.cluster_diurnal_profiles(panel, n_clusters=4) # daily-rhythm typologies ⭐
190
+ ```
191
+
192
+ `cluster_diurnal_profiles` turns the longitudinal panel into station **typologies** —
193
+ e.g. "morning commuter origin" (full at night, empty by day) vs "recreational" — from each
194
+ station's 24-hour occupancy profile (robust to irregular sampling). Modern options:
195
+ auto-`k` by silhouette, shape clustering (`normalize="zscore"`), soft GMM, DTW
196
+ (`method="dtw"`, extra `[dtw]`), weekday/weekend split. And `label_diurnal_typology`
197
+ turns clusters into **named** types. The payoff of the data lake.
198
+
199
+ ## Multimodal — bikeshare ↔ transit
200
+
201
+ ```python
202
+ stops = pd.read_csv("gtfs/stops.txt") # bring your own GTFS stops
203
+ linked = gb.link_transit_stops(info, stops, radius_m=200)
204
+ feeders = linked[linked["is_transit_feeder"]] # first/last-mile docks near rail/bus
205
+ ```
206
+
207
+ Pure spatial proximity on `GeoKDTree` (no transit API, no schedules) — `is_transit_feeder`,
208
+ `nearest_stop_dist_m`, `n_transit_within`.
209
+
210
+ ## Station surroundings — what's around each dock (`[osm]`)
211
+
212
+ ```python
213
+ # generic "what's nearby" — works for any point dataset (POIs, shops, …)
214
+ gb.features_within(info, pois, radius_m=300, category_col="amenity") # n_within, n_cafe, …
215
+
216
+ # bring your own OSM frame (fetch it yourself, e.g. osmnx.features_from_point)
217
+ # one-shot context: transit feeders + OSM features, in one frame
218
+ ctx = gb.station_surroundings(info, transit=stops, osm=osm_gdf, radius_m=300)
219
+ ```
220
+
221
+ The radius summarisation (counts + per-category breakdown + nearest distance) is the durable,
222
+ tested core; data acquisition is **Bring Your Own GeoDataFrame** so the library never depends
223
+ on a live Overpass endpoint. Routing / isochrones stay out of scope (use OSMnx / pandana).
224
+
225
+ ## Descriptive stats — the bikeshare `describe()`
226
+
227
+ ```python
228
+ gb.system_profile(av) # stations, capacity, occupancy, % empty/full/…
229
+ gb.compare_systems({"velib": av1, "bixi": av2}) # one comparison row per city
230
+ gb.concentration_metrics(info) # capacity Gini + top-decile hub share (equity)
231
+ gb.coverage_stats(info, zones=zones) # density, nearest-neighbour, Clark–Evans dispersion
232
+ gb.availability_stats(panel) # per-station: occupancy, peak hour, volatility
233
+ ```
234
+
235
+ Standard spatial / inequality algorithms (numpy/scipy only, deterministic):
236
+
237
+ ```python
238
+ gb.morans_i(info, "occupancy") # spatial autocorrelation (+ z-score / p-value)
239
+ gb.ripley_k(info, radii=[100, 250, 500]) # multi-scale clustering: L>0 clustered, <0 dispersed
240
+ gb.lorenz_curve(info) # inequality curve to plot (Gini/Theil in concentration_metrics)
241
+ ```
242
+
243
+ Readable, comparable summaries — strictly descriptive (no OD/trip inference). `system_profile`
244
+ is a one-glance numeric card of a snapshot; `concentration_metrics` is an equity lens (kept
245
+ *outside* the published A1–A7 audit, since it's a metric not a quality verdict);
246
+ `availability_stats` turns a longitudinal panel into per-station scalars (pass a `target_tz`
247
+ panel for local-time peaks).
248
+
249
+ ## Fleet reconciliation — where are the bikes, really?
250
+
251
+ ```python
252
+ tally = gb.reconcile_fleet_state(status, vehicles) # or feed.reconcile_fleet()
253
+ tally["total_deployed"] # on the street: stations + free-floating, overlap excluded
254
+ tally["total_rentable"] # available in stations + available free-floating
255
+ tally["double_count_avoided"] # vehicles a naive sum would have counted twice
256
+ ```
257
+
258
+ GBFS reports the same fleet twice — aggregate docked counts in `station_status` and
259
+ individual units (some parked at stations) in `vehicle_status`. Naively adding them
260
+ double-counts every vehicle sitting at a dock. The reconciler excludes station-parked
261
+ vehicles from the deployed total and surfaces the overlap instead of hiding it.
262
+
263
+ ## Geofencing / service areas (`[geo]`)
264
+
265
+ ```python
266
+ zones = gb.to_canonical_geofencing(raw, system_id="lime") # GeoDataFrame of operator polygons
267
+ tagged = gb.zones_for_points(info, zones) # which zone each station sits in
268
+ density = len(info) / gb.zone_area_km2(zones).sum() # stations per km² of *real* service area
269
+ no_park = tagged[tagged["station_parking"] == False] # stations in park-restricted zones
270
+ ```
271
+
272
+ For free-floating / hybrid systems the real footprint is the operator's polygons, not a
273
+ convex hull of stations. `to_canonical_geofencing` parses `geofencing_zones.json` (v2.x
274
+ `ride_allowed` and v3.x `ride_start/ride_end_allowed` reconciled), `zones_for_points` is the
275
+ point-in-zone spatial join, and `zone_area_km2` reprojects to an equal-area CRS so density is
276
+ metric and latitude-comparable. The full per-vehicle-type `rules` list is preserved.
277
+
278
+ ## Polite scraping & provenance (research-grade)
279
+
280
+ ```python
281
+ session = gb.build_session() # pooled, retry/backoff on 429/5xx (default in fetch_multiple)
282
+ resp = gb.fetch_feed_json(url, etag=prev_etag) # conditional GET; raises GBFSNotModified on HTTP 304
283
+ ...
284
+ gb.coverage_report(panel, expected_freq="5min") # per-station uptime / longest gap (no imputation)
285
+ gb.generate_manifest("lake/") # SHA-256 per partition + summary → cite on Zenodo
286
+ ```
287
+
288
+ Built for scrapers that run for months: retries/backoff, conditional GETs (skip unchanged
289
+ snapshots), an offline catalogue cache, a `GBFSError` exception hierarchy, and provenance tools
290
+ so a dataset is **citable and verifiable**. Missing data stays missing — `coverage_report`
291
+ quantifies it rather than imputing.
292
+
293
+ ## Examples
294
+
295
+ Runnable, end-to-end scripts live in [`examples/`](./examples) — auditing an unknown feed,
296
+ cron-driven collection into a Parquet lake, longitudinal analysis (coverage, typologies,
297
+ turnover), and a network equity/coverage report.
298
+
299
+ ## Roadmap
300
+
301
+ - **v0.1** — canonical model, catalogue discovery, cross-version normalisation,
302
+ static audit (A1–A7), CLI.
303
+ - **v0.2** — fetch/scrape (`GBFSFeed`, one-liners, `fetch_multiple`), dynamic audit
304
+ (D1–D3), `station_state`, geo (`GeoKDTree`, `find_nearest_stations`), schema hardening.
305
+ - **v0.3** — longitudinal data lake: `append_to_parquet`,
306
+ `build_availability_panel`, `calculate_net_flow`.
307
+ - **v0.4** — `cluster` (spatial / spectral / **diurnal profiles** + named typologies).
308
+ - **v0.5** — `multimodal` (bikeshare ↔ transit feeders, BYOG GTFS).
309
+ - **v0.6** — `osm` / surroundings: `features_within`, `station_surroundings`,
310
+ `enrich_with_osm` (BYOG infrastructure enrichment within a radius).
311
+ - **v0.7** — hardening (nullable dtypes, dockless-aware A7, antimeridian A5,
312
+ mass-conservation net flow) + `geofencing` (service-area polygons, point-in-zone
313
+ joins, equal-area density), `fleet` reconciliation (docked ↔ free-floating dedup),
314
+ and parquet column/predicate pushdown for large panels.
315
+
316
+ ## Methodology & limitations
317
+
318
+ [`METHODOLOGY.md`](./METHODOLOGY.md) documents the A1–A7 thresholds, the dynamic checks, the
319
+ polling/aliasing limit on flows, and what the spatial statistics can and cannot claim — read it
320
+ before building a study on the toolkit.
321
+
322
+ ## How to cite
323
+
324
+ See [`CITATION.cff`](./CITATION.cff). The semantic taxonomy is from the
325
+ `gbfs-audit-catalogue` dataset paper (Fossé & Pallares, 2026).
326
+
327
+ ## License
328
+
329
+ [MIT](./LICENSE). Affiliated with [CESI LINEACT (EA 7527)](https://lineact.cesi.fr), Montpellier, France.
@@ -0,0 +1,282 @@
1
+ # gbfs-toolkit
2
+
3
+ [![CI](https://github.com/cycling-data-lab/gbfs-toolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/cycling-data-lab/gbfs-toolkit/actions/workflows/ci.yml)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](./LICENSE)
5
+ [![Python 3.10+](https://img.shields.io/badge/Python-3.10+-blue.svg)](https://www.python.org/)
6
+
7
+ **Research-grade ingestion and *semantic* quality audit for GBFS bike-share feeds.**
8
+
9
+ MobilityData's [`gbfs-validator`](https://github.com/MobilityData/gbfs-validator) checks
10
+ that a feed is *syntactically* valid. `gbfs-toolkit` checks whether it is *semantically*
11
+ trustworthy and analysis-ready — the **A1–A7 quality taxonomy** of Fossé & Pallares
12
+ ([`gbfs-audit-catalogue`](https://github.com/cycling-data-lab/gbfs-audit-catalogue)) — and
13
+ normalises feeds into a **stable, version-independent data model** you can reuse across
14
+ studies.
15
+
16
+ ## Why
17
+
18
+ Every bike-share study re-implements the same plumbing — discover feeds, normalise GBFS
19
+ 1.x/2.x/3.x, and (the hard part) cope with the semantic defects the syntactic validator
20
+ cannot see: placeholder capacities, phantom docks, transposed coordinates, out-of-perimeter
21
+ stations. This package consolidates that into one tested interface so the audit is a verdict
22
+ per station, not a re-run of someone's notebook.
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install gbfs-toolkit # from PyPI
28
+ pip install -e ".[dev]" # from a local clone
29
+ ```
30
+
31
+ Core depends only on numpy / scipy / pandas. Network discovery/fetch uses the optional
32
+ `[fetch]` extra (`requests`).
33
+
34
+ ## Quick start
35
+
36
+ ```python
37
+ import gbfs_toolkit as gb
38
+
39
+ info, status = gb.load_example() # bundled sample — no network needed
40
+ av = info.gbfs.join_status(status) # fluent .gbfs accessor (or gb.join_availability)
41
+ clean = info.gbfs.drop_flagged() # audit A1–A7 and keep the trustworthy stations
42
+ av.gbfs.occupancy() # bikes / (bikes + docks), NaN-safe
43
+ ```
44
+
45
+ From your own feed:
46
+
47
+ ```python
48
+ import json
49
+
50
+ raw = json.load(open("station_information.json"))
51
+ stations = gb.to_canonical_station_info(raw, system_id="velib") # version-independent frame
52
+ verdict = gb.audit_static(stations) # A1–A7 per station
53
+ clean = stations[~verdict["flagged"].to_numpy()] # quality filter in one line
54
+ ```
55
+
56
+ Every function is also a `.gbfs` accessor method, and pure (so `df.pipe(gb.occupancy)` works).
57
+ `gb.show_versions()` prints an environment report for bug reports.
58
+
59
+ Command line (the semantic counterpart to `gbfs-validator`):
60
+
61
+ ```bash
62
+ gbfs audit station_information.json --system-id velib --out verdict.csv
63
+ ```
64
+
65
+ ## The A1–A7 semantic taxonomy
66
+
67
+ | Flag | Rule | Signature | Level |
68
+ |---|---|---|---|
69
+ | A1 | Out-of-domain inclusion | car-sharing advertised as bike-sharing | station |
70
+ | A2 | Placeholder capacity | constant non-zero capacity across a whole system | system |
71
+ | A3 | Structural over-capacity | free-floating fleet anchors | station |
72
+ | A4 | Geospatial error | transposed coords / stations far from neighbours (3σ) | station |
73
+ | A5 | Out-of-perimeter | system bounding box > 50,000 km² | system |
74
+ | A6 | Zero-capacity dock | ≥1% of docked stations declare capacity = 0 | system |
75
+ | A7 | Null capacity field | ≥50% of stations declare capacity = NaN | system |
76
+
77
+ Thresholds match the published catalogue, so verdicts reproduce.
78
+
79
+ ## Canonical data model (the stable contract)
80
+
81
+ Ingestion is normalised **once** into version-independent frames; audit and analysis then
82
+ operate purely on these. Downstream code depends on these column names, never on raw GBFS
83
+ JSON.
84
+
85
+ - **StationInfo**: `system_id, station_id, name, lat, lon, capacity, station_type, is_virtual_station`
86
+ - **StationStatus**: `system_id, station_id, num_bikes_available, num_docks_available, is_renting, is_returning, last_reported, fetched_at, gbfs_version`
87
+ - **VehicleStatus**: `system_id, vehicle_id, vehicle_type_id, lat, lon, is_reserved, is_disabled, fetched_at, gbfs_version`
88
+ - **AuditVerdict**: `system_id, station_id, A1…A7, flagged, reason`
89
+
90
+ `last_reported` and `fetched_at` are tz-aware **UTC** timestamps (`datetime64[ns, UTC]`) so
91
+ feeds from different cities merge unambiguously.
92
+
93
+ ## Daily ergonomics
94
+
95
+ ```python
96
+ import gbfs_toolkit as gb
97
+
98
+ # discover by city (you rarely know the system_id)
99
+ cat = gb.systems_catalog()
100
+ paris = gb.filter_catalog(cat, country_code="FR", city="Paris")
101
+
102
+ feed = gb.GBFSFeed.from_url(url)
103
+ feed.summary() # one-glance card: stations, bikes, staleness, version
104
+ avail = feed.availability() # bikes/docks + name/coords/capacity, one frame
105
+ avail["state"] = gb.station_state(avail) # empty / full / disabled / normal
106
+ problems = gb.audit_dynamic(avail) # negative counts, over-capacity, stale
107
+ near = gb.find_nearest_stations(48.85, 2.35, feed.station_information(), k=3)
108
+
109
+ # many systems at once (threaded), broken feeds isolated as Exceptions
110
+ feeds = gb.fetch_multiple(["velib", "bixi", "lyon"], max_workers=5)
111
+ ```
112
+
113
+ ## Longitudinal data lake
114
+
115
+ Turn a stream of snapshots into an analysis-ready panel. The library owns the
116
+ formatting / dedup / I/O; your orchestrator (cron, Airflow…) owns the polling loop.
117
+ Requires the optional `[parquet]` extra (`pyarrow`).
118
+
119
+ ```python
120
+ import gbfs_toolkit as gb
121
+
122
+ # in your poller (every N minutes):
123
+ gb.append_to_parquet(feed.station_status(), "lake/") # Hive-partitioned by system_id/date
124
+
125
+ # in your analysis:
126
+ panel = gb.build_availability_panel("lake/", system_id="velib",
127
+ start_time="2026-06-01", resample_freq="5min")
128
+ flow = gb.calculate_net_flow(panel) # Δ bikes/station per poll (observed flow only)
129
+ ```
130
+
131
+ `build_availability_panel` filters partitions *before* loading (memory-bounded),
132
+ de-duplicates redundant polls (same `station_id` + `last_reported`), and optionally
133
+ resamples each station to a fixed cadence.
134
+
135
+ ## Station clustering (`[cluster]`)
136
+
137
+ Three lenses on "which stations belong together" — spatial, topological, behavioural:
138
+
139
+ ```python
140
+ gb.cluster_spatial(info, method="hdbscan") # density zones (projected metres)
141
+ gb.cluster_spectral(info, k=6) # network/topology groups
142
+ gb.cluster_diurnal_profiles(panel, n_clusters=4) # daily-rhythm typologies ⭐
143
+ ```
144
+
145
+ `cluster_diurnal_profiles` turns the longitudinal panel into station **typologies** —
146
+ e.g. "morning commuter origin" (full at night, empty by day) vs "recreational" — from each
147
+ station's 24-hour occupancy profile (robust to irregular sampling). Modern options:
148
+ auto-`k` by silhouette, shape clustering (`normalize="zscore"`), soft GMM, DTW
149
+ (`method="dtw"`, extra `[dtw]`), weekday/weekend split. And `label_diurnal_typology`
150
+ turns clusters into **named** types. The payoff of the data lake.
151
+
152
+ ## Multimodal — bikeshare ↔ transit
153
+
154
+ ```python
155
+ stops = pd.read_csv("gtfs/stops.txt") # bring your own GTFS stops
156
+ linked = gb.link_transit_stops(info, stops, radius_m=200)
157
+ feeders = linked[linked["is_transit_feeder"]] # first/last-mile docks near rail/bus
158
+ ```
159
+
160
+ Pure spatial proximity on `GeoKDTree` (no transit API, no schedules) — `is_transit_feeder`,
161
+ `nearest_stop_dist_m`, `n_transit_within`.
162
+
163
+ ## Station surroundings — what's around each dock (`[osm]`)
164
+
165
+ ```python
166
+ # generic "what's nearby" — works for any point dataset (POIs, shops, …)
167
+ gb.features_within(info, pois, radius_m=300, category_col="amenity") # n_within, n_cafe, …
168
+
169
+ # bring your own OSM frame (fetch it yourself, e.g. osmnx.features_from_point)
170
+ # one-shot context: transit feeders + OSM features, in one frame
171
+ ctx = gb.station_surroundings(info, transit=stops, osm=osm_gdf, radius_m=300)
172
+ ```
173
+
174
+ The radius summarisation (counts + per-category breakdown + nearest distance) is the durable,
175
+ tested core; data acquisition is **Bring Your Own GeoDataFrame** so the library never depends
176
+ on a live Overpass endpoint. Routing / isochrones stay out of scope (use OSMnx / pandana).
177
+
178
+ ## Descriptive stats — the bikeshare `describe()`
179
+
180
+ ```python
181
+ gb.system_profile(av) # stations, capacity, occupancy, % empty/full/…
182
+ gb.compare_systems({"velib": av1, "bixi": av2}) # one comparison row per city
183
+ gb.concentration_metrics(info) # capacity Gini + top-decile hub share (equity)
184
+ gb.coverage_stats(info, zones=zones) # density, nearest-neighbour, Clark–Evans dispersion
185
+ gb.availability_stats(panel) # per-station: occupancy, peak hour, volatility
186
+ ```
187
+
188
+ Standard spatial / inequality algorithms (numpy/scipy only, deterministic):
189
+
190
+ ```python
191
+ gb.morans_i(info, "occupancy") # spatial autocorrelation (+ z-score / p-value)
192
+ gb.ripley_k(info, radii=[100, 250, 500]) # multi-scale clustering: L>0 clustered, <0 dispersed
193
+ gb.lorenz_curve(info) # inequality curve to plot (Gini/Theil in concentration_metrics)
194
+ ```
195
+
196
+ Readable, comparable summaries — strictly descriptive (no OD/trip inference). `system_profile`
197
+ is a one-glance numeric card of a snapshot; `concentration_metrics` is an equity lens (kept
198
+ *outside* the published A1–A7 audit, since it's a metric not a quality verdict);
199
+ `availability_stats` turns a longitudinal panel into per-station scalars (pass a `target_tz`
200
+ panel for local-time peaks).
201
+
202
+ ## Fleet reconciliation — where are the bikes, really?
203
+
204
+ ```python
205
+ tally = gb.reconcile_fleet_state(status, vehicles) # or feed.reconcile_fleet()
206
+ tally["total_deployed"] # on the street: stations + free-floating, overlap excluded
207
+ tally["total_rentable"] # available in stations + available free-floating
208
+ tally["double_count_avoided"] # vehicles a naive sum would have counted twice
209
+ ```
210
+
211
+ GBFS reports the same fleet twice — aggregate docked counts in `station_status` and
212
+ individual units (some parked at stations) in `vehicle_status`. Naively adding them
213
+ double-counts every vehicle sitting at a dock. The reconciler excludes station-parked
214
+ vehicles from the deployed total and surfaces the overlap instead of hiding it.
215
+
216
+ ## Geofencing / service areas (`[geo]`)
217
+
218
+ ```python
219
+ zones = gb.to_canonical_geofencing(raw, system_id="lime") # GeoDataFrame of operator polygons
220
+ tagged = gb.zones_for_points(info, zones) # which zone each station sits in
221
+ density = len(info) / gb.zone_area_km2(zones).sum() # stations per km² of *real* service area
222
+ no_park = tagged[tagged["station_parking"] == False] # stations in park-restricted zones
223
+ ```
224
+
225
+ For free-floating / hybrid systems the real footprint is the operator's polygons, not a
226
+ convex hull of stations. `to_canonical_geofencing` parses `geofencing_zones.json` (v2.x
227
+ `ride_allowed` and v3.x `ride_start_allowed` / `ride_end_allowed` reconciled), `zones_for_points` is the
228
+ point-in-zone spatial join, and `zone_area_km2` reprojects to an equal-area CRS so density is
229
+ metric and latitude-comparable. The full per-vehicle-type `rules` list is preserved.
230
+
231
+ ## Polite scraping & provenance (research-grade)
232
+
233
+ ```python
234
+ session = gb.build_session() # pooled, retry/backoff on 429/5xx (default in fetch_multiple)
235
+ resp = gb.fetch_feed_json(url, etag=prev_etag) # conditional GET; raises GBFSNotModified on HTTP 304
236
+ ...
237
+ gb.coverage_report(panel, expected_freq="5min") # per-station uptime / longest gap (no imputation)
238
+ gb.generate_manifest("lake/") # SHA-256 per partition + summary → cite on Zenodo
239
+ ```
240
+
241
+ Built for scrapers that run for months: retries/backoff, conditional GETs (skip unchanged
242
+ snapshots), an offline catalogue cache, a `GBFSError` exception hierarchy, and provenance tools
243
+ so a dataset is **citable and verifiable**. Missing data stays missing — `coverage_report`
244
+ quantifies it rather than imputing.
245
+
246
+ ## Examples
247
+
248
+ Runnable, end-to-end scripts live in [`examples/`](./examples) — auditing an unknown feed,
249
+ cron-driven collection into a Parquet lake, longitudinal analysis (coverage, typologies,
250
+ turnover), and a network equity/coverage report.
251
+
252
+ ## Roadmap
253
+
254
+ - **v0.1** — canonical model, catalogue discovery, cross-version normalisation,
255
+ static audit (A1–A7), CLI.
256
+ - **v0.2** — fetch/scrape (`GBFSFeed`, one-liners, `fetch_multiple`), dynamic audit
257
+ (D1–D3), `station_state`, geo (`GeoKDTree`, `find_nearest_stations`), schema hardening.
258
+ - **v0.3** — longitudinal data lake: `append_to_parquet`,
259
+ `build_availability_panel`, `calculate_net_flow`.
260
+ - **v0.4** — `cluster` (spatial / spectral / **diurnal profiles** + named typologies).
261
+ - **v0.5** — `multimodal` (bikeshare ↔ transit feeders, BYOG GTFS).
262
+ - **v0.6** — `osm` / surroundings: `features_within`, `station_surroundings`,
263
+ `enrich_with_osm` (BYOG infrastructure enrichment within a radius).
264
+ - **v0.7** — hardening (nullable dtypes, dockless-aware A7, antimeridian A5,
265
+ mass-conservation net flow) + `geofencing` (service-area polygons, point-in-zone
266
+ joins, equal-area density), `fleet` reconciliation (docked ↔ free-floating dedup),
267
+ and parquet column/predicate pushdown for large panels.
268
+
269
+ ## Methodology & limitations
270
+
271
+ [`METHODOLOGY.md`](./METHODOLOGY.md) documents the A1–A7 thresholds, the dynamic checks, the
272
+ polling/aliasing limit on flows, and what the spatial statistics can and cannot claim — read it
273
+ before building a study on the toolkit.
274
+
275
+ ## How to cite
276
+
277
+ See [`CITATION.cff`](./CITATION.cff). The semantic taxonomy is from the
278
+ `gbfs-audit-catalogue` dataset paper (Fossé & Pallares, 2026).
279
+
280
+ ## License
281
+
282
+ [MIT](./LICENSE). Affiliated with [CESI LINEACT (EA 7527)](https://lineact.cesi.fr), Montpellier, France.