gbfs-toolkit 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbfs_toolkit/__init__.py +223 -0
- gbfs_toolkit/accessor.py +105 -0
- gbfs_toolkit/analysis.py +274 -0
- gbfs_toolkit/audit/__init__.py +50 -0
- gbfs_toolkit/audit/dynamic.py +94 -0
- gbfs_toolkit/audit/static.py +215 -0
- gbfs_toolkit/catalog.py +189 -0
- gbfs_toolkit/cli.py +67 -0
- gbfs_toolkit/cluster.py +348 -0
- gbfs_toolkit/datasets.py +80 -0
- gbfs_toolkit/diagnostics.py +32 -0
- gbfs_toolkit/errors.py +34 -0
- gbfs_toolkit/fetch.py +510 -0
- gbfs_toolkit/fleet.py +155 -0
- gbfs_toolkit/geo.py +269 -0
- gbfs_toolkit/geofencing.py +164 -0
- gbfs_toolkit/models.py +271 -0
- gbfs_toolkit/multimodal.py +84 -0
- gbfs_toolkit/normalize.py +362 -0
- gbfs_toolkit/osm.py +111 -0
- gbfs_toolkit/py.typed +0 -0
- gbfs_toolkit/stats.py +415 -0
- gbfs_toolkit/timeseries.py +529 -0
- gbfs_toolkit-1.0.0.dist-info/METADATA +329 -0
- gbfs_toolkit-1.0.0.dist-info/RECORD +29 -0
- gbfs_toolkit-1.0.0.dist-info/WHEEL +5 -0
- gbfs_toolkit-1.0.0.dist-info/entry_points.txt +2 -0
- gbfs_toolkit-1.0.0.dist-info/licenses/LICENSE +32 -0
- gbfs_toolkit-1.0.0.dist-info/top_level.txt +1 -0
gbfs_toolkit/__init__.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""gbfs-toolkit — research-grade ingestion + semantic quality audit for GBFS feeds.
|
|
2
|
+
|
|
3
|
+
The community's :mod:`gbfs-validator` checks that a feed is *syntactically* valid;
|
|
4
|
+
this package checks whether it is *semantically* trustworthy and analysis-ready —
|
|
5
|
+
the A1–A7 taxonomy of Fossé & Pallares — and normalises feeds into a stable,
|
|
6
|
+
version-independent data model you can reuse across studies.
|
|
7
|
+
|
|
8
|
+
Quick start
|
|
9
|
+
-----------
|
|
10
|
+
|
|
11
|
+
>>> import json, gbfs_toolkit as gb
|
|
12
|
+
>>> raw = json.load(open("station_information.json"))
|
|
13
|
+
>>> stations = gb.to_canonical_station_info(raw, system_id="velib")
|
|
14
|
+
>>> verdict = gb.audit_static(stations)
|
|
15
|
+
>>> clean = stations[~verdict["flagged"].to_numpy()]
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from gbfs_toolkit import (
|
|
19
|
+
accessor, # noqa: F401 — registers the `.gbfs` DataFrame accessor
|
|
20
|
+
models,
|
|
21
|
+
)
|
|
22
|
+
from gbfs_toolkit.analysis import (
|
|
23
|
+
cyclical_time_features,
|
|
24
|
+
ebikes,
|
|
25
|
+
filter_vehicles,
|
|
26
|
+
join_availability,
|
|
27
|
+
join_pricing,
|
|
28
|
+
join_vehicle_types,
|
|
29
|
+
network_changes,
|
|
30
|
+
occupancy,
|
|
31
|
+
station_state,
|
|
32
|
+
)
|
|
33
|
+
from gbfs_toolkit.audit import audit_dynamic, audit_frames, audit_static, drop_flagged
|
|
34
|
+
from gbfs_toolkit.catalog import filter_catalog, normalize_operator, resolve, systems_catalog
|
|
35
|
+
from gbfs_toolkit.cluster import (
|
|
36
|
+
cluster_diurnal_profiles,
|
|
37
|
+
cluster_spatial,
|
|
38
|
+
cluster_spectral,
|
|
39
|
+
diurnal_profiles,
|
|
40
|
+
label_diurnal_typology,
|
|
41
|
+
)
|
|
42
|
+
from gbfs_toolkit.datasets import load_example
|
|
43
|
+
from gbfs_toolkit.diagnostics import show_versions
|
|
44
|
+
from gbfs_toolkit.errors import (
|
|
45
|
+
GBFSDiscoveryError,
|
|
46
|
+
GBFSError,
|
|
47
|
+
GBFSFetchError,
|
|
48
|
+
GBFSNotModified,
|
|
49
|
+
GBFSValidationError,
|
|
50
|
+
)
|
|
51
|
+
from gbfs_toolkit.fetch import (
|
|
52
|
+
FeedResponse,
|
|
53
|
+
GBFSFeed,
|
|
54
|
+
audit_feed,
|
|
55
|
+
availability,
|
|
56
|
+
build_session,
|
|
57
|
+
fetch_feed_json,
|
|
58
|
+
fetch_multiple,
|
|
59
|
+
parse_discovery,
|
|
60
|
+
)
|
|
61
|
+
from gbfs_toolkit.fleet import detect_ghost_vehicles, reconcile_fleet_state
|
|
62
|
+
from gbfs_toolkit.geo import (
|
|
63
|
+
GeoKDTree,
|
|
64
|
+
features_within,
|
|
65
|
+
find_nearest_stations,
|
|
66
|
+
haversine_m,
|
|
67
|
+
stations_near,
|
|
68
|
+
to_gdf,
|
|
69
|
+
to_geojson,
|
|
70
|
+
)
|
|
71
|
+
from gbfs_toolkit.geofencing import (
|
|
72
|
+
to_canonical_geofencing,
|
|
73
|
+
zone_area_km2,
|
|
74
|
+
zones_for_points,
|
|
75
|
+
)
|
|
76
|
+
from gbfs_toolkit.models import (
|
|
77
|
+
AUDIT_FLAGS,
|
|
78
|
+
RULES,
|
|
79
|
+
SCHEMAS,
|
|
80
|
+
SchemaError,
|
|
81
|
+
coerce_schema,
|
|
82
|
+
validate_schema,
|
|
83
|
+
)
|
|
84
|
+
from gbfs_toolkit.multimodal import link_transit_stops
|
|
85
|
+
from gbfs_toolkit.normalize import (
|
|
86
|
+
to_canonical_alerts,
|
|
87
|
+
to_canonical_pricing_plans,
|
|
88
|
+
to_canonical_station_info,
|
|
89
|
+
to_canonical_station_status,
|
|
90
|
+
to_canonical_station_vehicle_counts,
|
|
91
|
+
to_canonical_system_information,
|
|
92
|
+
to_canonical_system_regions,
|
|
93
|
+
to_canonical_vehicle_types,
|
|
94
|
+
to_canonical_vehicles,
|
|
95
|
+
)
|
|
96
|
+
from gbfs_toolkit.osm import enrich_with_osm, station_surroundings
|
|
97
|
+
from gbfs_toolkit.stats import (
|
|
98
|
+
availability_stats,
|
|
99
|
+
compare_systems,
|
|
100
|
+
concentration_metrics,
|
|
101
|
+
coverage_stats,
|
|
102
|
+
lorenz_curve,
|
|
103
|
+
morans_i,
|
|
104
|
+
ripley_k,
|
|
105
|
+
system_profile,
|
|
106
|
+
)
|
|
107
|
+
from gbfs_toolkit.timeseries import (
|
|
108
|
+
append_to_parquet,
|
|
109
|
+
build_availability_panel,
|
|
110
|
+
calculate_net_flow,
|
|
111
|
+
coverage_report,
|
|
112
|
+
detect_frozen_stations,
|
|
113
|
+
flow_balance,
|
|
114
|
+
generate_manifest,
|
|
115
|
+
stockout_episodes,
|
|
116
|
+
turnover,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
__version__ = "1.0.0"
|
|
120
|
+
|
|
121
|
+
__all__ = [
|
|
122
|
+
# audit (the flagship)
|
|
123
|
+
"audit_static",
|
|
124
|
+
"audit_dynamic",
|
|
125
|
+
"audit_frames",
|
|
126
|
+
"audit_feed",
|
|
127
|
+
"drop_flagged",
|
|
128
|
+
# fetch / scrape (daily drivers)
|
|
129
|
+
"GBFSFeed",
|
|
130
|
+
"availability",
|
|
131
|
+
"join_availability",
|
|
132
|
+
"fetch_multiple",
|
|
133
|
+
"fetch_feed_json",
|
|
134
|
+
"build_session",
|
|
135
|
+
"FeedResponse",
|
|
136
|
+
"parse_discovery",
|
|
137
|
+
# normalise
|
|
138
|
+
"to_canonical_station_info",
|
|
139
|
+
"to_canonical_station_status",
|
|
140
|
+
"to_canonical_station_vehicle_counts",
|
|
141
|
+
"to_canonical_vehicles",
|
|
142
|
+
"to_canonical_vehicle_types",
|
|
143
|
+
"to_canonical_pricing_plans",
|
|
144
|
+
"to_canonical_system_information",
|
|
145
|
+
"to_canonical_system_regions",
|
|
146
|
+
"to_canonical_alerts",
|
|
147
|
+
# catalogue
|
|
148
|
+
"systems_catalog",
|
|
149
|
+
"filter_catalog",
|
|
150
|
+
"resolve",
|
|
151
|
+
"normalize_operator",
|
|
152
|
+
# longitudinal (data lake)
|
|
153
|
+
"append_to_parquet",
|
|
154
|
+
"build_availability_panel",
|
|
155
|
+
"calculate_net_flow",
|
|
156
|
+
"coverage_report",
|
|
157
|
+
"generate_manifest",
|
|
158
|
+
"stockout_episodes",
|
|
159
|
+
"turnover",
|
|
160
|
+
"flow_balance",
|
|
161
|
+
"detect_frozen_stations",
|
|
162
|
+
# clustering ([cluster])
|
|
163
|
+
"cluster_spatial",
|
|
164
|
+
"cluster_spectral",
|
|
165
|
+
"cluster_diurnal_profiles",
|
|
166
|
+
"diurnal_profiles",
|
|
167
|
+
"label_diurnal_typology",
|
|
168
|
+
# multimodal & surroundings
|
|
169
|
+
"link_transit_stops",
|
|
170
|
+
"station_surroundings",
|
|
171
|
+
"enrich_with_osm",
|
|
172
|
+
# geofencing / service areas ([geo])
|
|
173
|
+
"to_canonical_geofencing",
|
|
174
|
+
"zones_for_points",
|
|
175
|
+
"zone_area_km2",
|
|
176
|
+
# fleet reconciliation
|
|
177
|
+
"reconcile_fleet_state",
|
|
178
|
+
"detect_ghost_vehicles",
|
|
179
|
+
# network evolution & joins
|
|
180
|
+
"network_changes",
|
|
181
|
+
"join_vehicle_types",
|
|
182
|
+
"join_pricing",
|
|
183
|
+
"filter_vehicles",
|
|
184
|
+
"ebikes",
|
|
185
|
+
# descriptive stats
|
|
186
|
+
"system_profile",
|
|
187
|
+
"compare_systems",
|
|
188
|
+
"concentration_metrics",
|
|
189
|
+
"lorenz_curve",
|
|
190
|
+
"coverage_stats",
|
|
191
|
+
"availability_stats",
|
|
192
|
+
"morans_i",
|
|
193
|
+
"ripley_k",
|
|
194
|
+
# analysis & geo
|
|
195
|
+
"station_state",
|
|
196
|
+
"occupancy",
|
|
197
|
+
"cyclical_time_features",
|
|
198
|
+
"find_nearest_stations",
|
|
199
|
+
"features_within",
|
|
200
|
+
"stations_near",
|
|
201
|
+
"haversine_m",
|
|
202
|
+
"GeoKDTree",
|
|
203
|
+
"to_gdf",
|
|
204
|
+
"to_geojson",
|
|
205
|
+
# errors
|
|
206
|
+
"GBFSError",
|
|
207
|
+
"GBFSFetchError",
|
|
208
|
+
"GBFSDiscoveryError",
|
|
209
|
+
"GBFSValidationError",
|
|
210
|
+
"GBFSNotModified",
|
|
211
|
+
# schema / library ergonomics
|
|
212
|
+
"validate_schema",
|
|
213
|
+
"coerce_schema",
|
|
214
|
+
"SCHEMAS",
|
|
215
|
+
"load_example",
|
|
216
|
+
"show_versions",
|
|
217
|
+
# meta
|
|
218
|
+
"models",
|
|
219
|
+
"RULES",
|
|
220
|
+
"AUDIT_FLAGS",
|
|
221
|
+
"SchemaError",
|
|
222
|
+
"__version__",
|
|
223
|
+
]
|
gbfs_toolkit/accessor.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""A ``.gbfs`` pandas DataFrame accessor for fluent method chaining.
|
|
2
|
+
|
|
3
|
+
The library's functions stay pure (``f(df, ...)``); this registers a thin namespace so the
|
|
4
|
+
same operations also read as ``df.gbfs.audit()``. Single-frame operations map directly;
|
|
5
|
+
operations that need a *second* frame (join info+status, reconcile against vehicles, …) take
|
|
6
|
+
it as an argument — so ``info.gbfs.join_status(status)`` reads left-to-right.
|
|
7
|
+
|
|
8
|
+
Importing :mod:`gbfs_toolkit` registers the accessor as a side effect.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from gbfs_toolkit import analysis, audit, geo, models, stats, timeseries
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pd.api.extensions.register_dataframe_accessor("gbfs")
class GBFSAccessor:
    """Expose gbfs-toolkit functions as ``df.gbfs.<operation>()``.

    Each method forwards the wrapped frame as the first positional argument of
    the matching module-level function and returns that function's result
    unchanged; no computation happens in the accessor itself.
    """

    def __init__(self, pandas_obj: pd.DataFrame) -> None:
        # The frame on which ``.gbfs`` was accessed.
        self._df = pandas_obj

    # ------------------------------------------------------------------
    # Operations on the wrapped frame alone
    # ------------------------------------------------------------------
    def audit(self) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.audit.audit_static`."""
        frame = self._df
        return audit.audit_static(frame)

    def audit_dynamic(self, **kwargs) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.audit.audit_dynamic`."""
        frame = self._df
        return audit.audit_dynamic(frame, **kwargs)

    def drop_flagged(self) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.audit.drop_flagged`."""
        frame = self._df
        return audit.drop_flagged(frame)

    def occupancy(self) -> pd.Series:
        """Forward to :func:`gbfs_toolkit.analysis.occupancy`."""
        frame = self._df
        return analysis.occupancy(frame)

    def station_state(self) -> pd.Series:
        """Forward to :func:`gbfs_toolkit.analysis.station_state`."""
        frame = self._df
        return analysis.station_state(frame)

    def net_flow(self) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.timeseries.calculate_net_flow`."""
        frame = self._df
        return timeseries.calculate_net_flow(frame)

    def turnover(self, **kwargs) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.timeseries.turnover`."""
        frame = self._df
        return timeseries.turnover(frame, **kwargs)

    def flow_balance(self) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.timeseries.flow_balance`."""
        frame = self._df
        return timeseries.flow_balance(frame)

    def stockout_episodes(self, **kwargs) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.timeseries.stockout_episodes`."""
        frame = self._df
        return timeseries.stockout_episodes(frame, **kwargs)

    def coverage_report(self, **kwargs) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.timeseries.coverage_report`."""
        frame = self._df
        return timeseries.coverage_report(frame, **kwargs)

    def detect_frozen_stations(self, **kwargs) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.timeseries.detect_frozen_stations`."""
        frame = self._df
        return timeseries.detect_frozen_stations(frame, **kwargs)

    def system_profile(self) -> pd.Series:
        """Forward to :func:`gbfs_toolkit.stats.system_profile`."""
        frame = self._df
        return stats.system_profile(frame)

    def concentration_metrics(self, **kwargs) -> pd.Series:
        """Forward to :func:`gbfs_toolkit.stats.concentration_metrics`."""
        frame = self._df
        return stats.concentration_metrics(frame, **kwargs)

    def coverage_stats(self, **kwargs) -> pd.Series:
        """Forward to :func:`gbfs_toolkit.stats.coverage_stats`."""
        frame = self._df
        return stats.coverage_stats(frame, **kwargs)

    def availability_stats(self, **kwargs) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.stats.availability_stats`."""
        frame = self._df
        return stats.availability_stats(frame, **kwargs)

    def morans_i(self, value_col: str, **kwargs) -> pd.Series:
        """Forward to :func:`gbfs_toolkit.stats.morans_i` for column ``value_col``."""
        frame = self._df
        return stats.morans_i(frame, value_col, **kwargs)

    def to_gdf(self, **kwargs):
        """Forward to :func:`gbfs_toolkit.geo.to_gdf`."""
        frame = self._df
        return geo.to_gdf(frame, **kwargs)

    def to_geojson(self, **kwargs):
        """Forward to :func:`gbfs_toolkit.geo.to_geojson`."""
        frame = self._df
        return geo.to_geojson(frame, **kwargs)

    def validate(self, schema: str) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.models.validate_schema` with ``schema``."""
        frame = self._df
        return models.validate_schema(frame, schema)

    def coerce(self, schema: str) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.models.coerce_schema` with ``schema``."""
        frame = self._df
        return models.coerce_schema(frame, schema)

    # ------------------------------------------------------------------
    # Operations that combine the wrapped frame with a second frame,
    # which the caller supplies as the argument
    # ------------------------------------------------------------------
    def join_status(self, status: pd.DataFrame) -> pd.DataFrame:
        """``info.gbfs.join_status(status)`` → analysis-ready availability frame."""
        info = self._df
        return analysis.join_availability(info, status)

    def audit_frames(self, status: pd.DataFrame | None = None, **kwargs) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.audit.audit_frames`; ``status`` is optional."""
        info = self._df
        return audit.audit_frames(info, status, **kwargs)

    def join_vehicle_types(self, vehicle_types: pd.DataFrame) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.analysis.join_vehicle_types`."""
        vehicles = self._df
        return analysis.join_vehicle_types(vehicles, vehicle_types)

    def join_pricing(self, plans: pd.DataFrame) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.analysis.join_pricing`."""
        vehicles = self._df
        return analysis.join_pricing(vehicles, plans)

    def ebikes(self, vehicle_types: pd.DataFrame) -> pd.DataFrame:
        """Forward to :func:`gbfs_toolkit.analysis.ebikes`."""
        vehicles = self._df
        return analysis.ebikes(vehicles, vehicle_types)

    def network_changes(self, new: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """``old.gbfs.network_changes(new)`` → added/removed/recapacitated/moved."""
        old = self._df
        return analysis.network_changes(old, new, **kwargs)
|
gbfs_toolkit/analysis.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Derived, ready-to-use metrics on canonical availability frames.
|
|
2
|
+
|
|
3
|
+
Small, safe, broadly-applicable transforms that every analysis re-implements —
|
|
4
|
+
deliberately *not* trip/OD inference (left to dedicated research code).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from gbfs_toolkit.geo import haversine_m
|
|
13
|
+
from gbfs_toolkit.models import require_columns
|
|
14
|
+
|
|
15
|
+
#: Ordered categories returned by :func:`station_state`.
STATION_STATES = ("disabled", "virtual", "empty", "full", "normal")

#: Ordered categories of the ``presence`` indicator from :func:`join_availability`.
PRESENCE_STATES = ("both", "info_only", "status_only")


def join_availability(info: pd.DataFrame, status: pd.DataFrame) -> pd.DataFrame:
    """Join a status snapshot onto the station inventory — the analysis-ready availability frame.

    A pure function on canonical frames (no feed object needed), so it works equally on live
    data and on frames read back from a Parquet lake. Uses an **outer** join — operators
    routinely add/drop a station from one endpoint mid-sync — with a ``presence`` indicator
    (Categorical ``both`` / ``info_only`` / ``status_only``) so orphaned rows stay visible
    instead of being silently dropped.

    ``status`` is the left side of the merge, so its columns come first and keep their names;
    clashing ``info`` columns get an ``_info`` suffix. Only one ``system_id`` column is
    returned: the one from ``status``. When both frames carry ``system_id``, rows that exist
    only in ``info`` have theirs backfilled from ``info`` rather than being left missing.

    Parameters
    ----------
    info : pandas.DataFrame
        Canonical station information (:data:`~gbfs_toolkit.models.STATION_INFO_COLUMNS`).
    status : pandas.DataFrame
        Canonical station status (:data:`~gbfs_toolkit.models.STATION_STATUS_COLUMNS`).

    Returns
    -------
    pandas.DataFrame
        One row per ``station_id`` match (outer join) plus the ``presence`` column.
    """
    require_columns(info, ["station_id"], what="join_availability(info)")
    require_columns(status, ["station_id"], what="join_availability(status)")

    # Private carrier name for info's ``system_id`` while it travels through the merge.
    carrier = "__info_system_id__"
    info_has_system = "system_id" in info.columns
    backfill = info_has_system and "system_id" in status.columns

    if backfill:
        info_cols = info.rename(columns={"system_id": carrier})
    elif info_has_system:
        info_cols = info.drop(columns=["system_id"])
    else:
        info_cols = info

    merged = status.merge(
        info_cols, on="station_id", how="outer", suffixes=("", "_info"), indicator="presence"
    )

    if backfill:
        # Removing the carrier restores exactly the column layout of a plain drop.
        fallback = merged.pop(carrier)
        gaps = (merged["system_id"].isna() & fallback.notna()).to_numpy(dtype=bool)
        if gaps.any():
            values = fallback[gaps].to_numpy(dtype=object)
            target = merged["system_id"].copy()
            if isinstance(target.dtype, pd.CategoricalDtype):
                # A categorical column only accepts values it already lists as categories.
                unseen = [v for v in pd.unique(values) if v not in target.cat.categories]
                target = target.cat.add_categories(unseen)
            target[gaps] = values
            merged["system_id"] = target

    # pandas labels the sides left/right; translate to the frame names callers know.
    mapped = merged["presence"].map(
        {"both": "both", "left_only": "status_only", "right_only": "info_only"}
    )
    merged["presence"] = pd.Categorical(mapped, categories=list(PRESENCE_STATES))
    return merged
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
#: Period of each cyclic calendar field, for sin/cos encoding.
_CYCLE_PERIODS = {
    "minute": 60,
    "hour": 24,
    "dayofweek": 7,
    "day": 31,
    "month": 12,
    "dayofyear": 366,
}


def cyclical_time_features(
    timestamps: object, *, fields: tuple[str, ...] | str = ("hour", "dayofweek", "month")
) -> pd.DataFrame:
    """Encode calendar fields as (sin, cos) pairs — the one everyone re-implements.

    Periodic time variables (hour-of-day, day-of-week, month) are discontinuous as raw integers
    (23:00 is adjacent to 00:00 but ``23`` is far from ``0``); sin/cos on the circle fixes that.
    Pass any datetime-like (Series / Index / array); returns two columns per field.

    The angle is ``2π · value / period`` using the raw calendar value, so fields that start
    at 1 (``day``, ``month``, ``dayofyear``) are not shifted to begin at angle 0.

    Parameters
    ----------
    timestamps : datetime-like iterable
        Anything :func:`pandas.to_datetime` can parse once wrapped in a Series.
    fields : tuple of str, or a single str
        Any of ``minute, hour, dayofweek, day, month, dayofyear``. A bare string is
        treated as one field name.

    Returns
    -------
    pandas.DataFrame
        ``{field}_sin`` / ``{field}_cos`` per requested field, aligned to the input order
        on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a requested field is not one of the supported names.
    """
    # A bare string would otherwise be iterated character by character ("h", "o", ...).
    if isinstance(fields, str):
        fields = (fields,)

    # Reject bad field names before spending time on timestamp conversion.
    for name in fields:
        if name not in _CYCLE_PERIODS:
            raise ValueError(f"unknown field {name!r}; choose from {sorted(_CYCLE_PERIODS)}")

    # Objects that already expose ``.dt`` are used as they are; everything else
    # is materialised into a Series first.
    raw = timestamps if hasattr(timestamps, "dt") else pd.Series(list(timestamps))
    ts = pd.to_datetime(pd.Series(raw)).reset_index(drop=True)

    out: dict[str, np.ndarray] = {}
    for name in fields:
        angle = 2 * np.pi * getattr(ts.dt, name).to_numpy() / _CYCLE_PERIODS[name]
        out[f"{name}_sin"] = np.sin(angle)
        out[f"{name}_cos"] = np.cos(angle)
    return pd.DataFrame(out)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def occupancy(availability: pd.DataFrame) -> pd.Series:
    """Return bikes / (bikes + docks) for every row, as a Series named ``occupancy``.

    Both counts are coerced to numbers (unparseable values become missing). Rows whose
    bikes-plus-docks total is not strictly positive — e.g. a station reporting neither
    bikes nor docks — yield ``NaN`` rather than a division-by-zero artefact.
    """
    require_columns(availability, ["num_bikes_available", "num_docks_available"], what="occupancy")
    counts = {
        column: pd.to_numeric(availability[column], errors="coerce")
        for column in ("num_bikes_available", "num_docks_available")
    }
    total = counts["num_bikes_available"] + counts["num_docks_available"]
    ratio = counts["num_bikes_available"] / total
    # Blank out every row without a positive total.
    return ratio.where(total > 0).rename("occupancy")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def filter_vehicles(
    vehicles: pd.DataFrame,
    vehicle_types: pd.DataFrame,
    *,
    form_factor: str | None = None,
    propulsion: str | None = None,
) -> pd.DataFrame:
    """Join vehicle-type attributes onto ``vehicles`` and keep the rows that match.

    Parameters
    ----------
    vehicles, vehicle_types : pandas.DataFrame
        Passed to :func:`join_vehicle_types` to resolve the type attributes.
    form_factor : str, optional
        Case-insensitive exact match on ``form_factor`` (e.g. ``"bicycle"``, ``"scooter"``).
    propulsion : str, optional
        Case-insensitive match *within* ``propulsion_type`` (so ``"electric"`` catches both
        ``electric`` and ``electric_assist``).

    Returns
    -------
    pandas.DataFrame
        The joined frame restricted to matching rows, re-indexed from 0. Rows whose
        attribute is missing never match an active filter.
    """
    joined = join_vehicle_types(vehicles, vehicle_types)
    keep = np.ones(len(joined), dtype=bool)

    if form_factor is not None:
        wanted = form_factor.lower()
        factors = joined["form_factor"].astype("string").str.lower()
        # Missing form factors compare as NA; count them as "no match".
        keep &= (factors == wanted).fillna(False).to_numpy(dtype=bool)

    if propulsion is not None:
        hits = (
            joined["propulsion_type"].astype("string").str.contains(propulsion, case=False, na=False)
        )
        keep &= hits.to_numpy(dtype=bool)

    return joined[keep].reset_index(drop=True)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def ebikes(vehicles: pd.DataFrame, vehicle_types: pd.DataFrame) -> pd.DataFrame:
    """Return only vehicles whose propulsion contains ``electric``, type attributes joined.

    Shorthand for :func:`filter_vehicles` with ``propulsion="electric"`` and no
    form-factor restriction.
    """
    electric_only = filter_vehicles(vehicles, vehicle_types, propulsion="electric")
    return electric_only
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def station_state(availability: pd.DataFrame) -> pd.Series:
    """Classify each station as ``disabled`` / ``virtual`` / ``empty`` / ``full`` / ``normal``.

    Resolves two edge cases researchers re-derive constantly:
    an ``is_renting=False`` (and not returning) station is *disabled*, not merely empty;
    a *virtual* station (painted box, capacity 0/NA) must not be read as "full" just
    because it reports zero docks.

    The checks are applied in that order of precedence: disabled, then virtual, then
    empty (no bikes), then full (no docks), otherwise normal. A missing or unparseable
    bike/dock count is treated like zero, so such rows land in ``empty`` / ``full``.

    Parameters
    ----------
    availability : pandas.DataFrame
        Needs ``num_bikes_available`` and ``num_docks_available``; uses
        ``is_renting`` / ``is_returning`` / ``is_virtual_station`` / ``capacity`` when present.
        Absent flag columns default to renting, returning and not virtual.

    Returns
    -------
    pandas.Series
        Categorical (categories = :data:`STATION_STATES`), aligned to the input index.
    """
    n_rows = len(availability)

    def _count(col: str) -> np.ndarray:
        # -1 stands in for "unknown" so the ``<= 0`` tests below catch it.
        # An explicit float dtype keeps nullable integer columns from turning
        # into object arrays.
        numeric = pd.to_numeric(availability[col], errors="coerce")
        return numeric.fillna(-1).to_numpy(dtype=float)

    def _bool(col: str, default: bool) -> np.ndarray:
        if col in availability:
            # Ask for a real bool array: depending on the pandas version, a nullable
            # "boolean" column converted without a dtype comes back as an object
            # array, on which ``~`` is Python integer inversion (``~True == -2``,
            # which is truthy) instead of logical negation.
            return availability[col].astype("boolean").fillna(default).to_numpy(dtype=bool)
        return np.full(n_rows, default, dtype=bool)

    bikes = _count("num_bikes_available")
    docks = _count("num_docks_available")
    renting = _bool("is_renting", True)
    returning = _bool("is_returning", True)
    is_virtual = _bool("is_virtual_station", False)

    if "capacity" in availability:
        # Missing capacity becomes NaN (never ``pd.NA``), so the comparison below
        # always yields a plain boolean array.
        cap = pd.to_numeric(availability["capacity"], errors="coerce").to_numpy(
            dtype=float, na_value=np.nan
        )
        is_virtual = is_virtual | ~(cap > 0)  # no physical docks ⇒ treat as virtual

    state = np.where(
        ~renting & ~returning,
        "disabled",
        np.where(
            is_virtual,
            "virtual",
            np.where(bikes <= 0, "empty", np.where(docks <= 0, "full", "normal")),
        ),
    )
    return pd.Series(
        pd.Categorical(state, categories=list(STATION_STATES)),
        index=availability.index,
        name="station_state",
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
_CHANGE_COLUMNS = ["system_id", "station_id", "change", "old_value", "new_value", "distance_m"]


def network_changes(
    old: pd.DataFrame, new: pd.DataFrame, *, move_threshold_m: float = 50.0
) -> pd.DataFrame:
    """Diff two station inventories — how the network itself changed between two dates.

    A multi-month study spans network growth, not a fixed graph. This compares two canonical
    ``station_information`` frames and reports stations **added**, **removed**,
    **recapacitated** (capacity changed) and **moved** (relocated beyond ``move_threshold_m``).
    A station can appear twice (e.g. recapacitated *and* moved).

    Only the first row per ``station_id`` is considered in each frame. A capacity that
    goes from missing to a value (or back) counts as a recapacitation; two missing
    capacities do not. Stations with a missing coordinate on either side are never
    reported as moved.

    Parameters
    ----------
    old, new : pandas.DataFrame
        Station inventories; ``station_id`` is required, ``capacity`` and ``lat``/``lon``
        are compared only when present in both frames.
    move_threshold_m : float
        Minimum displacement, in metres, for a station to be reported as moved.

    Returns
    -------
    pandas.DataFrame
        ``system_id, station_id, change, old_value, new_value, distance_m`` — ``old/new_value``
        carry the capacity for recapacitations; ``distance_m`` the move distance for moves.
    """
    require_columns(old, ["station_id"], what="network_changes(old)")
    require_columns(new, ["station_id"], what="network_changes(new)")
    o = old.drop_duplicates("station_id").set_index("station_id")
    n = new.drop_duplicates("station_id").set_index("station_id")
    # Fallback label for rows whose source frame has no ``system_id`` column.
    sys_new = new["system_id"].iloc[0] if "system_id" in new.columns and len(new) else None

    rows = []

    def _row(sid, change, **kw):
        # Prefer the newer inventory as the source of the station's system label.
        src = n if sid in n.index else o
        system = src.loc[sid, "system_id"] if "system_id" in src.columns else sys_new
        rows.append({"system_id": system, "station_id": sid, "change": change, **kw})

    for sid in n.index.difference(o.index):
        _row(sid, "added")
    for sid in o.index.difference(n.index):
        _row(sid, "removed")

    common = o.index.intersection(n.index)
    if len(common):
        oc, nc = o.loc[common], n.loc[common]

        if "capacity" in oc.columns and "capacity" in nc.columns:
            old_cap, new_cap = oc["capacity"], nc["capacity"]
            both_missing = (old_cap.isna() & new_cap.isna()).to_numpy(dtype=bool)
            # With nullable dtypes, comparing against a missing value yields NA.
            # Treat it as "different" so the mask stays strictly boolean; the
            # both-missing case is excluded right after.
            differs = old_cap.ne(new_cap).fillna(True).to_numpy(dtype=bool)
            for sid in common[differs & ~both_missing]:
                _row(
                    sid,
                    "recapacitated",
                    old_value=oc.at[sid, "capacity"],
                    new_value=nc.at[sid, "capacity"],
                )

        if {"lat", "lon"} <= set(oc.columns) & set(nc.columns):
            dist = pd.Series(haversine_m(oc["lat"], oc["lon"], nc["lat"], nc["lon"]), index=common)
            # An unknown distance (missing coordinate) is not evidence of a move.
            moved = dist.gt(move_threshold_m).fillna(False).to_numpy(dtype=bool)
            for sid in common[moved]:
                _row(sid, "moved", distance_m=round(float(dist.at[sid]), 1))

    return pd.DataFrame(rows, columns=_CHANGE_COLUMNS).reset_index(drop=True)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def join_vehicle_types(vehicles: pd.DataFrame, vehicle_types: pd.DataFrame) -> pd.DataFrame:
    """Resolve ``vehicle_type_id`` → form factor / propulsion / range onto a vehicles frame.

    Turns "where are the e-bikes?" into a filter: ``out[out.form_factor == "bicycle"]`` etc.
    Left join on ``vehicle_type_id``; the catalogue's ``system_id`` is dropped to avoid a clash.

    The catalogue is used as a lookup table: if it lists the same ``vehicle_type_id``
    more than once, only the first entry is used, so the result always has exactly one
    row per input vehicle. Vehicles whose type is not in the catalogue keep their row
    with missing type attributes.

    Parameters
    ----------
    vehicles : pandas.DataFrame
        Vehicles frame with a ``vehicle_type_id`` column.
    vehicle_types : pandas.DataFrame
        Vehicle-type catalogue with a ``vehicle_type_id`` column.

    Returns
    -------
    pandas.DataFrame
        ``vehicles`` with the catalogue's remaining columns appended.
    """
    cat = vehicle_types.drop(columns=["system_id"], errors="ignore")
    # Duplicate keys on the lookup side would multiply vehicle rows in the merge.
    cat = cat.drop_duplicates(subset="vehicle_type_id")
    return vehicles.merge(cat, on="vehicle_type_id", how="left")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def join_pricing(vehicles: pd.DataFrame, plans: pd.DataFrame) -> pd.DataFrame:
    """Resolve ``pricing_plan_id`` → plan name / price / currency onto a vehicles frame.

    Left join of :func:`~gbfs_toolkit.to_canonical_pricing_plans` (its ``plan_id`` matches the
    vehicle's ``pricing_plan_id``); plan ``name``/``description`` are prefixed ``plan_`` to
    avoid clashes.

    The plans frame is used as a lookup table: if it lists the same ``plan_id`` more than
    once, only the first entry is used, so the result always has exactly one row per
    input vehicle. Vehicles without a matching plan keep their row with missing plan
    attributes. The plans' ``system_id`` column, if any, is not carried over.

    Parameters
    ----------
    vehicles : pandas.DataFrame
        Vehicles frame with a ``pricing_plan_id`` column.
    plans : pandas.DataFrame
        Pricing plans with a ``plan_id`` column.

    Returns
    -------
    pandas.DataFrame
        ``vehicles`` with the plan columns appended.
    """
    p = plans.drop(columns=["system_id"], errors="ignore").rename(
        columns={
            "plan_id": "pricing_plan_id",
            "name": "plan_name",
            "description": "plan_description",
        }
    )
    # Duplicate keys on the lookup side would multiply vehicle rows in the merge.
    p = p.drop_duplicates(subset="pricing_plan_id")
    return vehicles.merge(p, on="pricing_plan_id", how="left")
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Semantic audit of GBFS feeds (the toolkit's flagship)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from gbfs_toolkit.audit.dynamic import audit_dynamic
|
|
8
|
+
from gbfs_toolkit.audit.static import audit_static
|
|
9
|
+
|
|
10
|
+
#: Stacked-audit columns shared by static and dynamic verdicts.
AUDIT_RESULT_COLUMNS = ["system_id", "station_id", "audit_type", "flagged", "reason"]


def audit_frames(
    info: pd.DataFrame,
    status: pd.DataFrame | None = None,
    *,
    ttl_seconds: int | None = None,
    system_id: str = "system",
) -> pd.DataFrame:
    """Run the static audit, plus the dynamic one when a status frame is supplied.

    Works on plain canonical frames (no feed object), so it applies equally to freshly
    fetched data and to frames read back from storage. The verdicts are reduced to
    :data:`AUDIT_RESULT_COLUMNS` and stacked, with ``audit_type`` telling the two apart.
    Call :func:`audit_static` / :func:`audit_dynamic` directly for the per-rule boolean
    columns.

    Parameters
    ----------
    info : pandas.DataFrame
        Station inventory, audited statically.
    status : pandas.DataFrame, optional
        Status snapshot. When ``None`` or empty, only the static verdict is returned.
    ttl_seconds : int, optional
        Passed through to :func:`audit_dynamic`.
    system_id : str
        Value written to the ``system_id`` column of every dynamic row.

    Returns
    -------
    pandas.DataFrame
        Static rows first, then dynamic rows, on a fresh 0..n-1 index.
    """
    static_verdict = audit_static(info).assign(audit_type="static")
    stacked = [static_verdict[AUDIT_RESULT_COLUMNS]]

    if status is None or not len(status):
        return pd.concat(stacked, ignore_index=True)

    # Imported at call time rather than at module top — presumably to avoid an
    # import cycle; confirm before hoisting.
    from gbfs_toolkit.analysis import join_availability

    joined = join_availability(info, status)
    dynamic_verdict = audit_dynamic(joined, ttl_seconds=ttl_seconds)
    dynamic_verdict = dynamic_verdict.assign(audit_type="dynamic", system_id=system_id)
    stacked.append(dynamic_verdict[AUDIT_RESULT_COLUMNS])
    return pd.concat(stacked, ignore_index=True)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def drop_flagged(stations: pd.DataFrame) -> pd.DataFrame:
    """Return the stations that the static audit does not flag, re-indexed from 0.

    One-call shorthand for running :func:`audit_static` and discarding every row whose
    ``flagged`` value is set. The verdict is applied by position, so it is expected to
    have one row per input row, in the same order.
    """
    flagged = audit_static(stations)["flagged"].to_numpy()
    passed = stations[~flagged]
    return passed.reset_index(drop=True)


__all__ = ["audit_static", "audit_dynamic", "audit_frames", "drop_flagged", "AUDIT_RESULT_COLUMNS"]
|