gbfs-toolkit 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ """gbfs-toolkit — research-grade ingestion + semantic quality audit for GBFS feeds.
2
+
3
+ The community's :mod:`gbfs-validator` checks that a feed is *syntactically* valid;
4
+ this package checks whether it is *semantically* trustworthy and analysis-ready —
5
+ the A1–A7 taxonomy of Fossé & Pallares — and normalises feeds into a stable,
6
+ version-independent data model you can reuse across studies.
7
+
8
+ Quick start
9
+ -----------
10
+
11
+ >>> import json, gbfs_toolkit as gb
12
+ >>> raw = json.load(open("station_information.json"))
13
+ >>> stations = gb.to_canonical_station_info(raw, system_id="velib")
14
+ >>> verdict = gb.audit_static(stations)
15
+ >>> clean = stations[~verdict["flagged"].to_numpy()]
16
+ """
17
+
18
+ from gbfs_toolkit import (
19
+ accessor, # noqa: F401 — registers the `.gbfs` DataFrame accessor
20
+ models,
21
+ )
22
+ from gbfs_toolkit.analysis import (
23
+ cyclical_time_features,
24
+ ebikes,
25
+ filter_vehicles,
26
+ join_availability,
27
+ join_pricing,
28
+ join_vehicle_types,
29
+ network_changes,
30
+ occupancy,
31
+ station_state,
32
+ )
33
+ from gbfs_toolkit.audit import audit_dynamic, audit_frames, audit_static, drop_flagged
34
+ from gbfs_toolkit.catalog import filter_catalog, normalize_operator, resolve, systems_catalog
35
+ from gbfs_toolkit.cluster import (
36
+ cluster_diurnal_profiles,
37
+ cluster_spatial,
38
+ cluster_spectral,
39
+ diurnal_profiles,
40
+ label_diurnal_typology,
41
+ )
42
+ from gbfs_toolkit.datasets import load_example
43
+ from gbfs_toolkit.diagnostics import show_versions
44
+ from gbfs_toolkit.errors import (
45
+ GBFSDiscoveryError,
46
+ GBFSError,
47
+ GBFSFetchError,
48
+ GBFSNotModified,
49
+ GBFSValidationError,
50
+ )
51
+ from gbfs_toolkit.fetch import (
52
+ FeedResponse,
53
+ GBFSFeed,
54
+ audit_feed,
55
+ availability,
56
+ build_session,
57
+ fetch_feed_json,
58
+ fetch_multiple,
59
+ parse_discovery,
60
+ )
61
+ from gbfs_toolkit.fleet import detect_ghost_vehicles, reconcile_fleet_state
62
+ from gbfs_toolkit.geo import (
63
+ GeoKDTree,
64
+ features_within,
65
+ find_nearest_stations,
66
+ haversine_m,
67
+ stations_near,
68
+ to_gdf,
69
+ to_geojson,
70
+ )
71
+ from gbfs_toolkit.geofencing import (
72
+ to_canonical_geofencing,
73
+ zone_area_km2,
74
+ zones_for_points,
75
+ )
76
+ from gbfs_toolkit.models import (
77
+ AUDIT_FLAGS,
78
+ RULES,
79
+ SCHEMAS,
80
+ SchemaError,
81
+ coerce_schema,
82
+ validate_schema,
83
+ )
84
+ from gbfs_toolkit.multimodal import link_transit_stops
85
+ from gbfs_toolkit.normalize import (
86
+ to_canonical_alerts,
87
+ to_canonical_pricing_plans,
88
+ to_canonical_station_info,
89
+ to_canonical_station_status,
90
+ to_canonical_station_vehicle_counts,
91
+ to_canonical_system_information,
92
+ to_canonical_system_regions,
93
+ to_canonical_vehicle_types,
94
+ to_canonical_vehicles,
95
+ )
96
+ from gbfs_toolkit.osm import enrich_with_osm, station_surroundings
97
+ from gbfs_toolkit.stats import (
98
+ availability_stats,
99
+ compare_systems,
100
+ concentration_metrics,
101
+ coverage_stats,
102
+ lorenz_curve,
103
+ morans_i,
104
+ ripley_k,
105
+ system_profile,
106
+ )
107
+ from gbfs_toolkit.timeseries import (
108
+ append_to_parquet,
109
+ build_availability_panel,
110
+ calculate_net_flow,
111
+ coverage_report,
112
+ detect_frozen_stations,
113
+ flow_balance,
114
+ generate_manifest,
115
+ stockout_episodes,
116
+ turnover,
117
+ )
118
+
119
#: Version string exposed as ``gbfs_toolkit.__version__``.
__version__ = "1.0.0"

# Names exported by ``from gbfs_toolkit import *``, grouped by theme.
# Every entry is bound by one of the imports at the top of this module (or is
# ``__version__`` itself). The ``accessor`` module is deliberately not listed:
# it is imported only so that the ``.gbfs`` DataFrame accessor gets registered.
__all__ = [
    # audit (the flagship)
    "audit_static",
    "audit_dynamic",
    "audit_frames",
    "audit_feed",
    "drop_flagged",
    # fetch / scrape (daily drivers)
    "GBFSFeed",
    "availability",
    "join_availability",
    "fetch_multiple",
    "fetch_feed_json",
    "build_session",
    "FeedResponse",
    "parse_discovery",
    # normalise
    "to_canonical_station_info",
    "to_canonical_station_status",
    "to_canonical_station_vehicle_counts",
    "to_canonical_vehicles",
    "to_canonical_vehicle_types",
    "to_canonical_pricing_plans",
    "to_canonical_system_information",
    "to_canonical_system_regions",
    "to_canonical_alerts",
    # catalogue
    "systems_catalog",
    "filter_catalog",
    "resolve",
    "normalize_operator",
    # longitudinal (data lake)
    "append_to_parquet",
    "build_availability_panel",
    "calculate_net_flow",
    "coverage_report",
    "generate_manifest",
    "stockout_episodes",
    "turnover",
    "flow_balance",
    "detect_frozen_stations",
    # clustering ([cluster])
    "cluster_spatial",
    "cluster_spectral",
    "cluster_diurnal_profiles",
    "diurnal_profiles",
    "label_diurnal_typology",
    # multimodal & surroundings
    "link_transit_stops",
    "station_surroundings",
    "enrich_with_osm",
    # geofencing / service areas ([geo])
    "to_canonical_geofencing",
    "zones_for_points",
    "zone_area_km2",
    # fleet reconciliation
    "reconcile_fleet_state",
    "detect_ghost_vehicles",
    # network evolution & joins
    "network_changes",
    "join_vehicle_types",
    "join_pricing",
    "filter_vehicles",
    "ebikes",
    # descriptive stats
    "system_profile",
    "compare_systems",
    "concentration_metrics",
    "lorenz_curve",
    "coverage_stats",
    "availability_stats",
    "morans_i",
    "ripley_k",
    # analysis & geo
    "station_state",
    "occupancy",
    "cyclical_time_features",
    "find_nearest_stations",
    "features_within",
    "stations_near",
    "haversine_m",
    "GeoKDTree",
    "to_gdf",
    "to_geojson",
    # errors
    "GBFSError",
    "GBFSFetchError",
    "GBFSDiscoveryError",
    "GBFSValidationError",
    "GBFSNotModified",
    # schema / library ergonomics
    "validate_schema",
    "coerce_schema",
    "SCHEMAS",
    "load_example",
    "show_versions",
    # meta
    "models",
    "RULES",
    "AUDIT_FLAGS",
    "SchemaError",
    "__version__",
]
@@ -0,0 +1,105 @@
1
+ """A ``.gbfs`` pandas DataFrame accessor for fluent method chaining.
2
+
3
+ The library's functions stay pure (``f(df, ...)``); this registers a thin namespace so the
4
+ same operations also read as ``df.gbfs.audit()``. Single-frame operations map directly;
5
+ operations that need a *second* frame (join info+status, reconcile against vehicles, …) take
6
+ it as an argument — so ``info.gbfs.join_status(status)`` reads left-to-right.
7
+
8
+ Importing :mod:`gbfs_toolkit` registers the accessor as a side effect.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import pandas as pd
14
+
15
+ from gbfs_toolkit import analysis, audit, geo, models, stats, timeseries
16
+
17
+
18
@pd.api.extensions.register_dataframe_accessor("gbfs")
class GBFSAccessor:
    """Fluent access to gbfs-toolkit operations — e.g. ``df.gbfs.occupancy()``.

    Registered under the ``gbfs`` namespace on every :class:`pandas.DataFrame`.
    Each method is a one-line delegation: the wrapped frame is passed as the
    first positional argument of the corresponding library function, and any
    extra arguments are forwarded unchanged. No validation or caching is done
    here; whatever the underlying function returns or raises is propagated.
    """

    def __init__(self, pandas_obj: pd.DataFrame) -> None:
        """Store the DataFrame the accessor was reached from."""
        self._df = pandas_obj

    # -- single-frame operations (map directly) -----------------------------
    def audit(self) -> pd.DataFrame:
        """Run ``audit.audit_static`` on the wrapped frame."""
        return audit.audit_static(self._df)

    def audit_dynamic(self, **kw) -> pd.DataFrame:
        """Run ``audit.audit_dynamic`` on the wrapped frame, forwarding ``**kw``."""
        return audit.audit_dynamic(self._df, **kw)

    def drop_flagged(self) -> pd.DataFrame:
        """Run ``audit.drop_flagged`` on the wrapped frame."""
        return audit.drop_flagged(self._df)

    def occupancy(self) -> pd.Series:
        """Run ``analysis.occupancy`` on the wrapped frame."""
        return analysis.occupancy(self._df)

    def station_state(self) -> pd.Series:
        """Run ``analysis.station_state`` on the wrapped frame."""
        return analysis.station_state(self._df)

    def net_flow(self) -> pd.DataFrame:
        """Run ``timeseries.calculate_net_flow`` on the wrapped frame."""
        return timeseries.calculate_net_flow(self._df)

    def turnover(self, **kw) -> pd.DataFrame:
        """Run ``timeseries.turnover`` on the wrapped frame, forwarding ``**kw``."""
        return timeseries.turnover(self._df, **kw)

    def flow_balance(self) -> pd.DataFrame:
        """Run ``timeseries.flow_balance`` on the wrapped frame."""
        return timeseries.flow_balance(self._df)

    def stockout_episodes(self, **kw) -> pd.DataFrame:
        """Run ``timeseries.stockout_episodes`` on the wrapped frame, forwarding ``**kw``."""
        return timeseries.stockout_episodes(self._df, **kw)

    def coverage_report(self, **kw) -> pd.DataFrame:
        """Run ``timeseries.coverage_report`` on the wrapped frame, forwarding ``**kw``."""
        return timeseries.coverage_report(self._df, **kw)

    def detect_frozen_stations(self, **kw) -> pd.DataFrame:
        """Run ``timeseries.detect_frozen_stations`` on the wrapped frame, forwarding ``**kw``."""
        return timeseries.detect_frozen_stations(self._df, **kw)

    def system_profile(self) -> pd.Series:
        """Run ``stats.system_profile`` on the wrapped frame."""
        return stats.system_profile(self._df)

    def concentration_metrics(self, **kw) -> pd.Series:
        """Run ``stats.concentration_metrics`` on the wrapped frame, forwarding ``**kw``."""
        return stats.concentration_metrics(self._df, **kw)

    def coverage_stats(self, **kw) -> pd.Series:
        """Run ``stats.coverage_stats`` on the wrapped frame, forwarding ``**kw``."""
        return stats.coverage_stats(self._df, **kw)

    def availability_stats(self, **kw) -> pd.DataFrame:
        """Run ``stats.availability_stats`` on the wrapped frame, forwarding ``**kw``."""
        return stats.availability_stats(self._df, **kw)

    def morans_i(self, value_col: str, **kw) -> pd.Series:
        """Run ``stats.morans_i`` on the wrapped frame for ``value_col``, forwarding ``**kw``."""
        return stats.morans_i(self._df, value_col, **kw)

    def to_gdf(self, **kw):
        """Run ``geo.to_gdf`` on the wrapped frame, forwarding ``**kw``."""
        return geo.to_gdf(self._df, **kw)

    def to_geojson(self, **kw):
        """Run ``geo.to_geojson`` on the wrapped frame, forwarding ``**kw``."""
        return geo.to_geojson(self._df, **kw)

    def validate(self, schema: str) -> pd.DataFrame:
        """Run ``models.validate_schema`` on the wrapped frame against ``schema``."""
        return models.validate_schema(self._df, schema)

    def coerce(self, schema: str) -> pd.DataFrame:
        """Run ``models.coerce_schema`` on the wrapped frame against ``schema``."""
        return models.coerce_schema(self._df, schema)

    # -- operations needing a second frame (passed as the argument) ---------
    def join_status(self, status: pd.DataFrame) -> pd.DataFrame:
        """``info.gbfs.join_status(status)`` → analysis-ready availability frame."""
        # The wrapped frame plays the role of ``info``; ``status`` is the second frame.
        return analysis.join_availability(self._df, status)

    def audit_frames(self, status: pd.DataFrame | None = None, **kw) -> pd.DataFrame:
        """Run ``audit.audit_frames`` with the wrapped frame as ``info``, forwarding ``**kw``."""
        return audit.audit_frames(self._df, status, **kw)

    def join_vehicle_types(self, vehicle_types: pd.DataFrame) -> pd.DataFrame:
        """Run ``analysis.join_vehicle_types`` with the wrapped frame as the vehicles frame."""
        return analysis.join_vehicle_types(self._df, vehicle_types)

    def join_pricing(self, plans: pd.DataFrame) -> pd.DataFrame:
        """Run ``analysis.join_pricing`` with the wrapped frame as the vehicles frame."""
        return analysis.join_pricing(self._df, plans)

    def ebikes(self, vehicle_types: pd.DataFrame) -> pd.DataFrame:
        """Run ``analysis.ebikes`` with the wrapped frame as the vehicles frame."""
        return analysis.ebikes(self._df, vehicle_types)

    def network_changes(self, new: pd.DataFrame, **kw) -> pd.DataFrame:
        """``old.gbfs.network_changes(new)`` → added/removed/recapacitated/moved."""
        # The wrapped frame is the *old* inventory; ``new`` is compared against it.
        return analysis.network_changes(self._df, new, **kw)
@@ -0,0 +1,274 @@
1
+ """Derived, ready-to-use metrics on canonical availability frames.
2
+
3
+ Small, safe, broadly-applicable transforms that every analysis re-implements —
4
+ deliberately *not* trip/OD inference (left to dedicated research code).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+ from gbfs_toolkit.geo import haversine_m
13
+ from gbfs_toolkit.models import require_columns
14
+
15
#: Ordered categories returned by :func:`station_state`.
STATION_STATES = ("disabled", "virtual", "empty", "full", "normal")

#: Ordered categories of the ``presence`` indicator from :func:`join_availability`.
PRESENCE_STATES = ("both", "info_only", "status_only")


def join_availability(info: pd.DataFrame, status: pd.DataFrame) -> pd.DataFrame:
    """Outer-join a status snapshot with the station inventory on ``station_id``.

    ``status`` is the left side of the merge, so its columns keep their names;
    any other column that also exists in ``info`` receives an ``_info`` suffix.
    The inventory's ``system_id`` column, when present, is dropped before the
    merge. Rows found in only one of the two frames are kept and labelled via a
    ``presence`` column.

    Parameters
    ----------
    info : pandas.DataFrame
        Station inventory; must contain ``station_id``.
    status : pandas.DataFrame
        Station status snapshot; must contain ``station_id``.

    Returns
    -------
    pandas.DataFrame
        The merged frame plus a Categorical ``presence`` column whose
        categories are :data:`PRESENCE_STATES` (``both`` / ``info_only`` /
        ``status_only``).
    """
    checks = ((info, "join_availability(info)"), (status, "join_availability(status)"))
    for frame, label in checks:
        require_columns(frame, ["station_id"], what=label)

    inventory = info.drop(columns=["system_id"], errors="ignore")
    joined = pd.merge(
        status,
        inventory,
        how="outer",
        on="station_id",
        suffixes=("", "_info"),
        indicator="presence",
    )
    # pandas labels the sides left/right; translate to status/info wording.
    relabel = {"both": "both", "left_only": "status_only", "right_only": "info_only"}
    joined["presence"] = pd.Categorical(
        joined["presence"].map(relabel), categories=list(PRESENCE_STATES)
    )
    return joined
49
+
50
+
51
#: Period of each cyclic calendar field, for sin/cos encoding.
_CYCLE_PERIODS = {
    "minute": 60,
    "hour": 24,
    "dayofweek": 7,
    "day": 31,
    "month": 12,
    "dayofyear": 366,
}


def cyclical_time_features(
    timestamps: object, *, fields: tuple[str, ...] = ("hour", "dayofweek", "month")
) -> pd.DataFrame:
    """Project calendar fields onto the unit circle as ``sin`` / ``cos`` columns.

    Each requested field is read from the parsed timestamps, scaled by its
    period in :data:`_CYCLE_PERIODS` and turned into the angle
    ``2π · value / period``; the sine and cosine of that angle are returned.

    Parameters
    ----------
    timestamps : datetime-like
        A Series exposing ``.dt`` is used as-is; anything else is materialised
        with ``list(...)`` first. The values are parsed with
        :func:`pandas.to_datetime`.
    fields : tuple of str
        Any of ``minute, hour, dayofweek, day, month, dayofyear``.

    Returns
    -------
    pandas.DataFrame
        ``{field}_sin`` then ``{field}_cos`` for every requested field, in the
        order given, on a fresh 0..n-1 index (the input index is discarded).

    Raises
    ------
    ValueError
        If a requested field is not a key of :data:`_CYCLE_PERIODS`.
    """
    raw = timestamps if hasattr(timestamps, "dt") else list(timestamps)
    stamps = pd.to_datetime(pd.Series(raw)).reset_index(drop=True)

    encoded: dict[str, np.ndarray] = {}
    for name in fields:
        period = _CYCLE_PERIODS.get(name)
        if period is None:
            raise ValueError(f"unknown field {name!r}; choose from {sorted(_CYCLE_PERIODS)}")
        theta = 2 * np.pi * getattr(stamps.dt, name).to_numpy() / period
        encoded[f"{name}_sin"] = np.sin(theta)
        encoded[f"{name}_cos"] = np.cos(theta)
    return pd.DataFrame(encoded)
93
+
94
+
95
def occupancy(availability: pd.DataFrame) -> pd.Series:
    """Share of a station's reported slots that hold a bike.

    Computed as ``num_bikes_available / (num_bikes_available +
    num_docks_available)``. Both columns are coerced to numeric first, with
    unparseable values becoming ``NaN``.

    Parameters
    ----------
    availability : pandas.DataFrame
        Must contain ``num_bikes_available`` and ``num_docks_available``.

    Returns
    -------
    pandas.Series
        Named ``occupancy``. ``NaN`` wherever bikes + docks is not strictly
        positive (including when either count is missing), so no division by
        zero leaks into the result.
    """
    require_columns(availability, ["num_bikes_available", "num_docks_available"], what="occupancy")
    n_bikes = pd.to_numeric(availability["num_bikes_available"], errors="coerce")
    n_docks = pd.to_numeric(availability["num_docks_available"], errors="coerce")
    total = n_bikes + n_docks
    ratio = n_bikes / total
    # Blank out rows with no positive total instead of keeping inf / 0-division output.
    ratio = ratio.where(total > 0)
    return ratio.rename("occupancy")
106
+
107
+
108
def filter_vehicles(
    vehicles: pd.DataFrame,
    vehicle_types: pd.DataFrame,
    *,
    form_factor: str | None = None,
    propulsion: str | None = None,
) -> pd.DataFrame:
    """Resolve vehicle types then keep only matching vehicles — "where are the X?" in one call.

    The vehicles frame is first enriched with :func:`join_vehicle_types`; the
    requested filters are then combined with a logical AND.

    Parameters
    ----------
    vehicles : pandas.DataFrame
        Vehicles frame carrying ``vehicle_type_id``.
    vehicle_types : pandas.DataFrame
        Vehicle-type catalogue joined onto ``vehicles``.
    form_factor : str, optional
        Case-insensitive *exact* match on ``form_factor`` (e.g. ``"bicycle"``,
        ``"scooter"``).
    propulsion : str, optional
        Case-insensitive *literal substring* match on ``propulsion_type`` (so
        ``"electric"`` catches both ``electric`` and ``electric_assist``). The
        text is never interpreted as a regular expression.

    Returns
    -------
    pandas.DataFrame
        Matching rows of the joined frame, re-indexed from 0. Vehicles whose
        type attribute is missing never match a filter on that attribute.
    """
    out = join_vehicle_types(vehicles, vehicle_types)
    keep = np.ones(len(out), dtype=bool)

    if form_factor is not None:
        factor = out["form_factor"].astype("string").str.lower()
        # Comparison on the nullable string dtype yields <NA> for missing values;
        # treat those explicitly as "no match".
        keep &= factor.eq(form_factor.lower()).fillna(False).to_numpy(dtype=bool)

    if propulsion is not None:
        kind = out["propulsion_type"].astype("string")
        # regex=False: the contract is substring matching. With the pandas default
        # (regex=True) a value such as "." would match every row and an unbalanced
        # "(" would raise re.error.
        hits = kind.str.contains(propulsion, case=False, na=False, regex=False)
        keep &= hits.to_numpy(dtype=bool)

    return out[keep].reset_index(drop=True)
129
+
130
+
131
def ebikes(vehicles: pd.DataFrame, vehicle_types: pd.DataFrame) -> pd.DataFrame:
    """Keep only vehicles whose propulsion contains ``"electric"``.

    Thin shortcut for :func:`filter_vehicles` with ``propulsion="electric"``;
    the returned frame therefore also carries the joined type attributes.
    """
    electric_only = filter_vehicles(vehicles, vehicle_types, propulsion="electric")
    return electric_only
134
+
135
+
136
def station_state(availability: pd.DataFrame) -> pd.Series:
    """Classify each station as ``disabled`` / ``virtual`` / ``empty`` / ``full`` / ``normal``.

    The rules are applied in this order, first match wins:

    1. ``disabled`` — neither renting nor returning.
    2. ``virtual`` — flagged ``is_virtual_station``, or ``capacity`` is present
       and not strictly positive (0, negative or missing).
    3. ``empty`` — no bikes available.
    4. ``full`` — no docks available.
    5. ``normal`` — everything else.

    Missing or unparseable bike/dock counts are treated as ``-1`` and therefore
    fall into the ``empty`` / ``full`` branches.

    Parameters
    ----------
    availability : pandas.DataFrame
        Needs ``num_bikes_available`` and ``num_docks_available``; uses
        ``is_renting`` / ``is_returning`` / ``is_virtual_station`` / ``capacity`` when present.
        Absent or missing flags default to renting=True, returning=True,
        virtual=False.

    Returns
    -------
    pandas.Series
        Categorical (categories = :data:`STATION_STATES`), aligned to the input index,
        named ``station_state``.

    Raises
    ------
    KeyError
        If ``num_bikes_available`` or ``num_docks_available`` is absent.
    """
    n = len(availability)

    def _count(col: str) -> np.ndarray:
        # Force a float ndarray so nullable integer columns compare like plain ones.
        values = pd.to_numeric(availability[col], errors="coerce").fillna(-1)
        return values.to_numpy(dtype="float64")

    def _bool(col: str, default: bool) -> np.ndarray:
        if col in availability:
            flags = availability[col].astype("boolean").fillna(default)
            # dtype=bool is essential: on pandas < 2.2 a bare to_numpy() on the
            # nullable boolean dtype returns an *object* array, and `~` on Python
            # bools is integer bitwise-not (~True == -2), which is always truthy.
            return flags.to_numpy(dtype=bool)
        return np.full(n, default, dtype=bool)

    bikes = _count("num_bikes_available")
    docks = _count("num_docks_available")
    renting = _bool("is_renting", True)
    returning = _bool("is_returning", True)
    is_virtual = _bool("is_virtual_station", False)

    if "capacity" in availability:
        cap = pd.to_numeric(availability["capacity"], errors="coerce").to_numpy(
            dtype="float64", na_value=np.nan
        )
        # NaN > 0 is False, so an unknown capacity counts as "no physical docks".
        is_virtual = is_virtual | ~(cap > 0)  # no physical docks ⇒ treat as virtual

    state = np.where(
        ~renting & ~returning,
        "disabled",
        np.where(
            is_virtual,
            "virtual",
            np.where(bikes <= 0, "empty", np.where(docks <= 0, "full", "normal")),
        ),
    )
    return pd.Series(
        pd.Categorical(state, categories=list(STATION_STATES)),
        index=availability.index,
        name="station_state",
    )
189
+
190
+
191
#: Column order of the frame returned by :func:`network_changes`.
_CHANGE_COLUMNS = ["system_id", "station_id", "change", "old_value", "new_value", "distance_m"]


def network_changes(
    old: pd.DataFrame, new: pd.DataFrame, *, move_threshold_m: float = 50.0
) -> pd.DataFrame:
    """Diff two station inventories — how the network itself changed between two dates.

    Compares two ``station_information`` frames keyed on ``station_id`` (only
    the first row per id is used in each) and reports stations **added**,
    **removed**, **recapacitated** (capacity changed) and **moved** (relocated
    beyond ``move_threshold_m``). A station can appear twice (e.g.
    recapacitated *and* moved).

    Capacity going from a value to missing (or the reverse) counts as a change;
    missing on both sides does not. Stations whose move distance cannot be
    computed are not reported as moved.

    Parameters
    ----------
    old, new : pandas.DataFrame
        Inventories to compare; each must contain ``station_id``. ``capacity``
        and ``lat`` / ``lon`` are only compared when present in *both* frames.
    move_threshold_m : float, default 50.0
        Minimum distance, in the unit returned by ``haversine_m``, for a
        station to count as moved.

    Returns
    -------
    pandas.DataFrame
        ``system_id, station_id, change, old_value, new_value, distance_m`` — ``old/new_value``
        carry the capacity for recapacitations; ``distance_m`` the move distance for moves
        (rounded to one decimal). Empty, with the same columns, when nothing changed.
    """
    require_columns(old, ["station_id"], what="network_changes(old)")
    require_columns(new, ["station_id"], what="network_changes(new)")
    o = old.drop_duplicates("station_id").set_index("station_id")
    n = new.drop_duplicates("station_id").set_index("station_id")
    # Fallback system id for rows whose source frame has no ``system_id`` column.
    sys_new = new["system_id"].iloc[0] if "system_id" in new.columns and len(new) else None

    rows: list[dict] = []

    def _row(sid, change, **kw):
        # Prefer the newer inventory as the source of the station's system id.
        src = n if sid in n.index else o
        system = src.loc[sid, "system_id"] if "system_id" in src.columns else sys_new
        rows.append({"system_id": system, "station_id": sid, "change": change, **kw})

    for sid in n.index.difference(o.index):
        _row(sid, "added")
    for sid in o.index.difference(n.index):
        _row(sid, "removed")

    common = o.index.intersection(n.index)
    if len(common):
        oc, nc = o.loc[common], n.loc[common]
        if "capacity" in oc.columns and "capacity" in nc.columns:
            old_cap, new_cap = oc["capacity"], nc["capacity"]
            # With nullable dtypes ``ne`` yields <NA> when either side is missing;
            # na_value=True makes that behave like the float case (NaN != x) and
            # keeps the mask a plain bool array that can index ``common``.
            differs = old_cap.ne(new_cap).to_numpy(dtype=bool, na_value=True)
            both_missing = (old_cap.isna() & new_cap.isna()).to_numpy(dtype=bool)
            changed = differs & ~both_missing
            for sid in common[changed]:
                _row(
                    sid,
                    "recapacitated",
                    old_value=oc.at[sid, "capacity"],
                    new_value=nc.at[sid, "capacity"],
                )
        if {"lat", "lon"} <= set(oc.columns) & set(nc.columns):
            dist = pd.Series(haversine_m(oc["lat"], oc["lon"], nc["lat"], nc["lon"]), index=common)
            # An undefined distance (missing coordinate) is never treated as a move.
            moved = (dist > move_threshold_m).to_numpy(dtype=bool, na_value=False)
            for sid in common[moved]:
                _row(sid, "moved", distance_m=round(float(dist.at[sid]), 1))

    return pd.DataFrame(rows, columns=_CHANGE_COLUMNS).reset_index(drop=True)
248
+
249
+
250
def join_vehicle_types(vehicles: pd.DataFrame, vehicle_types: pd.DataFrame) -> pd.DataFrame:
    """Attach the vehicle-type catalogue's columns to a vehicles frame.

    Performs a left join on ``vehicle_type_id``, so every vehicle row is kept
    and vehicles with an unknown type get missing values in the catalogue
    columns. The catalogue's ``system_id`` column (if any) is removed before
    the join so it cannot clash with the vehicles' own ``system_id``; the
    caller's ``vehicle_types`` frame is not modified.
    """
    catalogue = vehicle_types.drop(columns=["system_id"], errors="ignore")
    joined = pd.merge(vehicles, catalogue, how="left", on="vehicle_type_id")
    return joined
258
+
259
+
260
def join_pricing(vehicles: pd.DataFrame, plans: pd.DataFrame) -> pd.DataFrame:
    """Attach pricing-plan columns to a vehicles frame.

    The plans frame is prepared by dropping its ``system_id`` (if present) and
    renaming ``plan_id`` → ``pricing_plan_id``, ``name`` → ``plan_name`` and
    ``description`` → ``plan_description``. It is then left-joined onto
    ``vehicles`` via ``pricing_plan_id``, so every vehicle row is kept and
    vehicles without a matching plan get missing values in the plan columns.
    """
    renames = {
        "plan_id": "pricing_plan_id",
        "name": "plan_name",
        "description": "plan_description",
    }
    catalogue = plans.drop(columns=["system_id"], errors="ignore")
    catalogue = catalogue.rename(columns=renames)
    return pd.merge(vehicles, catalogue, how="left", on="pricing_plan_id")
@@ -0,0 +1,50 @@
1
+ """Semantic audit of GBFS feeds (the toolkit's flagship)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ from gbfs_toolkit.audit.dynamic import audit_dynamic
8
+ from gbfs_toolkit.audit.static import audit_static
9
+
10
#: Stacked-audit columns shared by static and dynamic verdicts.
AUDIT_RESULT_COLUMNS = ["system_id", "station_id", "audit_type", "flagged", "reason"]


def audit_frames(
    info: pd.DataFrame,
    status: pd.DataFrame | None = None,
    *,
    ttl_seconds: int | None = None,
    system_id: str = "system",
) -> pd.DataFrame:
    """Run the static audit and, when a status frame is supplied, the dynamic one too.

    The static verdict of ``info`` is always produced. If ``status`` is given
    and non-empty, it is joined with ``info`` via ``join_availability`` and the
    dynamic audit is run on the result. Both verdicts are reduced to
    :data:`AUDIT_RESULT_COLUMNS` and stacked, static rows first.

    Parameters
    ----------
    info : pandas.DataFrame
        Station inventory passed to :func:`audit_static`.
    status : pandas.DataFrame, optional
        Status snapshot; ``None`` or an empty frame skips the dynamic audit.
    ttl_seconds : int, optional
        Forwarded to :func:`audit_dynamic`.
    system_id : str, default ``"system"``
        Value written into the ``system_id`` column of the *dynamic* rows.

    Returns
    -------
    pandas.DataFrame
        One row per verdict with a fresh 0..n-1 index; ``audit_type`` is
        ``"static"`` or ``"dynamic"``.
    """
    static_verdict = audit_static(info).assign(audit_type="static")
    stacked = [static_verdict[AUDIT_RESULT_COLUMNS]]

    if status is None or not len(status):
        return pd.concat(stacked, ignore_index=True)

    # Imported lazily — presumably to sidestep an import cycle; TODO confirm.
    from gbfs_toolkit.analysis import join_availability

    joined = join_availability(info, status)
    dynamic_verdict = audit_dynamic(joined, ttl_seconds=ttl_seconds)
    dynamic_verdict = dynamic_verdict.assign(audit_type="dynamic", system_id=system_id)
    stacked.append(dynamic_verdict[AUDIT_RESULT_COLUMNS])
    return pd.concat(stacked, ignore_index=True)
38
+
39
+
40
def drop_flagged(stations: pd.DataFrame) -> pd.DataFrame:
    """Return only the stations that the static audit does not flag.

    Runs :func:`audit_static` on ``stations``, negates its ``flagged`` column
    and uses it as a positional row mask, then renumbers the surviving rows
    from 0.
    """
    flagged = audit_static(stations)["flagged"].to_numpy()
    passing = stations[~flagged]
    return passing.reset_index(drop=True)
48
+
49
+
50
# Names exported by ``from gbfs_toolkit.audit import *``: the two per-kind audits
# imported above, the helpers defined in this module, and the shared column list.
__all__ = ["audit_static", "audit_dynamic", "audit_frames", "drop_flagged", "AUDIT_RESULT_COLUMNS"]