gtfs-segmenter 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gtfs_segmenter-0.1.0.dist-info/METADATA +136 -0
- gtfs_segmenter-0.1.0.dist-info/RECORD +9 -0
- gtfs_segmenter-0.1.0.dist-info/WHEEL +4 -0
- gtfs_segmenter-0.1.0.dist-info/licenses/LICENSE +21 -0
- gtfs_segments/__init__.py +35 -0
- gtfs_segments/feed.py +190 -0
- gtfs_segments/projection.py +84 -0
- gtfs_segments/ruler.py +165 -0
- gtfs_segments/segmentation.py +56 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gtfs-segmenter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Segment GTFS shapes into stop-to-stop route segments with loop-aware projection
|
|
5
|
+
Project-URL: Homepage, https://github.com/lizobst/gtfs-segments
|
|
6
|
+
Project-URL: Issues, https://github.com/lizobst/gtfs-segments/issues
|
|
7
|
+
Author: Liz Obst
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: geopandas,geospatial,gtfs,transit,transportation
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: GIS
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Requires-Dist: geopandas>=0.12
|
|
19
|
+
Requires-Dist: numpy>=1.23
|
|
20
|
+
Requires-Dist: pandas>=1.5
|
|
21
|
+
Requires-Dist: shapely>=2.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: folium>=0.14; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
25
|
+
Provides-Extra: viz
|
|
26
|
+
Requires-Dist: folium>=0.14; extra == 'viz'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# gtfs-segments
|
|
30
|
+
|
|
31
|
+
Segment GTFS shapes into stop-to-stop route segments that follow the original road geometry.
|
|
32
|
+
|
|
33
|
+
Most GTFS tools use Shapely's `.project()` to snap stops to shapes, which breaks on routes with loops, lollipop turnarounds, and out-and-back corridors. This library uses sequential projection with candidate-aware selection to handle complex route geometries correctly.
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install gtfs-segmenter
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
For interactive map visualization:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install gtfs-segmenter[viz]
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
### Segment an entire feed
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from gtfs_segments import segment_feed
|
|
53
|
+
|
|
54
|
+
result = segment_feed("path/to/gtfs")
|
|
55
|
+
|
|
56
|
+
segments = result['segments'] # GeoDataFrame of all stop-to-stop segments
|
|
57
|
+
diagnostics = result['diagnostics'] # per-shape stats (offsets, degenerates)
|
|
58
|
+
failures = result['failures'] # any shapes that failed
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Segment a single shape
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import pandas as pd
|
|
65
|
+
from gtfs_segments import segment_shape
|
|
66
|
+
|
|
67
|
+
shapes = pd.read_csv("gtfs/shapes.txt")
|
|
68
|
+
trips = pd.read_csv("gtfs/trips.txt")
|
|
69
|
+
stops = pd.read_csv("gtfs/stops.txt")
|
|
70
|
+
stop_times = pd.read_csv("gtfs/stop_times.txt")
|
|
71
|
+
|
|
72
|
+
result = segment_shape(shapes, trips, stops, stop_times, shape_id=232864)
|
|
73
|
+
|
|
74
|
+
segments = result['segments']
|
|
75
|
+
projected = result['projected_stops']
|
|
76
|
+
ruler = result['ruler']
|
|
77
|
+
diag = result['diagnostics']
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Low-level control
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from gtfs_segments import ShapeRuler, project_stops_sequential, segment_route
|
|
84
|
+
|
|
85
|
+
ruler = ShapeRuler(shapes, shape_id=232864)
|
|
86
|
+
|
|
87
|
+
# Get stop visit order from stop_times
|
|
88
|
+
trip_id = trips[trips['shape_id'] == 232864]['trip_id'].iloc[0]
|
|
89
|
+
st = stop_times[stop_times['trip_id'] == trip_id].sort_values('stop_sequence')
|
|
90
|
+
stop_sequence = st['stop_id'].tolist()
|
|
91
|
+
route_stops = stops[stops['stop_id'].isin(stop_sequence)].copy()
|
|
92
|
+
|
|
93
|
+
# Project stops onto shape and build segments
|
|
94
|
+
projected = project_stops_sequential(ruler, route_stops, stop_sequence)
|
|
95
|
+
segments = segment_route(ruler, projected)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## How It Works
|
|
99
|
+
|
|
100
|
+
### The Problem
|
|
101
|
+
|
|
102
|
+
Shapely's `.project()` finds the globally nearest point on a LineString. On a route that doubles back on itself, a stop at the base of a loop has two valid projection points — the entry and the exit. `.project()` picks whichever is geometrically closer, often the exit, which causes every stop inside the loop to collapse to the same distance.
|
|
103
|
+
|
|
104
|
+
### The Solution
|
|
105
|
+
|
|
106
|
+
1. **ShapeRuler** converts the shape to a metric CRS, builds a cumulative distance ruler, and indexes every segment for fast searching.
|
|
107
|
+
|
|
108
|
+
2. **find_candidates** scans every segment and collects all positions where a stop projects within tolerance (default 50m). A stop at a loop base gets candidates at both entry and exit.
|
|
109
|
+
|
|
110
|
+
3. **project_stops_sequential** processes stops in visit order (from `stop_times`), enforcing monotonic distances. For each stop, it picks the **earliest** candidate with a reasonable offset — biasing toward forward progress along the route rather than jumping ahead to a later occurrence.
|
|
111
|
+
|
|
112
|
+
4. **segment_route** slices the shape between consecutive stop distances, including all original vertices so segments follow the actual road geometry.
|
|
113
|
+
|
|
114
|
+
### CRS Auto-Detection
|
|
115
|
+
|
|
116
|
+
If no CRS is provided, the library auto-detects the UTM zone from the median longitude of the shape coordinates. Override with `metric_crs="EPSG:32614"` if needed.
|
|
117
|
+
|
|
118
|
+
## Diagnostics
|
|
119
|
+
|
|
120
|
+
The `diagnostics` DataFrame flags potential issues:
|
|
121
|
+
|
|
122
|
+
- **max_offset_m**: Largest perpendicular distance between a stop and the shape. Over 30m usually means the stop is set back from the road (transit centers, park-and-rides) or assigned to the wrong shape.
|
|
123
|
+
- **n_high_offset**: Count of stops with offset > 30m.
|
|
124
|
+
- **n_degenerate**: Count of segments shorter than 5m. Usually near-side/far-side stop pairs at intersections.
|
|
125
|
+
|
|
126
|
+
## Requirements
|
|
127
|
+
|
|
128
|
+
- Python ≥ 3.9
|
|
129
|
+
- geopandas ≥ 0.12
|
|
130
|
+
- shapely ≥ 2.0
|
|
131
|
+
- pandas ≥ 1.5
|
|
132
|
+
- numpy ≥ 1.23
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
MIT
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
gtfs_segments/__init__.py,sha256=z8rs-uL9_np4NOjEnn68H00M_t10jF_fIwLojH8I7is,1005
|
|
2
|
+
gtfs_segments/feed.py,sha256=w5jVqdyxPQvZ091yjhCzcSYfRbEmJ6pa0CCVq2mICjw,6241
|
|
3
|
+
gtfs_segments/projection.py,sha256=90Hk08BvgV26gUTh-DDIcQapuMVo3J7M8DfAuPazjhs,2714
|
|
4
|
+
gtfs_segments/ruler.py,sha256=hnUEA_Q3InUSDSh9Q6QN91PFainZL9OrkyHG0Pz_HpY,5829
|
|
5
|
+
gtfs_segments/segmentation.py,sha256=pFq76rmEpNyKUMyWeDP1hsnWKt_fCZsG5b_Ik8C1dTo,1643
|
|
6
|
+
gtfs_segmenter-0.1.0.dist-info/METADATA,sha256=JZKkFdJgJRods4ZLWtWaOL1QQ_q88xX0ZKgIigvxtZQ,4893
|
|
7
|
+
gtfs_segmenter-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
8
|
+
gtfs_segmenter-0.1.0.dist-info/licenses/LICENSE,sha256=ESYyLizI0WWtxMeS7rGVcX3ivMezm-HOd5WdeOh-9oU,1056
|
|
9
|
+
gtfs_segmenter-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
gtfs-segments: Segment GTFS shapes into stop-to-stop route segments.
|
|
3
|
+
|
|
4
|
+
Handles complex route geometries including loops, lollipop turnarounds,
|
|
5
|
+
and out-and-back corridors using sequential projection with
|
|
6
|
+
candidate-aware selection.
|
|
7
|
+
|
|
8
|
+
Quick start:
|
|
9
|
+
from gtfs_segments import segment_feed
|
|
10
|
+
result = segment_feed("path/to/gtfs")
|
|
11
|
+
segments = result['segments'] # GeoDataFrame of all segments
|
|
12
|
+
|
|
13
|
+
For single shapes:
|
|
14
|
+
from gtfs_segments import segment_shape
|
|
15
|
+
result = segment_shape(shapes_df, trips_df, stops_df, stop_times_df,
|
|
16
|
+
shape_id=232864)
|
|
17
|
+
|
|
18
|
+
For low-level control:
|
|
19
|
+
from gtfs_segments import ShapeRuler, project_stops_sequential, segment_route
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from .ruler import ShapeRuler
|
|
23
|
+
from .projection import project_stops_sequential
|
|
24
|
+
from .segmentation import segment_route
|
|
25
|
+
from .feed import segment_feed, segment_shape
|
|
26
|
+
|
|
27
|
+
__version__ = "0.1.0"
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"ShapeRuler",
|
|
31
|
+
"project_stops_sequential",
|
|
32
|
+
"segment_route",
|
|
33
|
+
"segment_feed",
|
|
34
|
+
"segment_shape",
|
|
35
|
+
]
|
gtfs_segments/feed.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Feed-level processing: load a GTFS directory and segment all shapes.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import geopandas as gpd
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .ruler import ShapeRuler
|
|
10
|
+
from .projection import project_stops_sequential
|
|
11
|
+
from .segmentation import segment_route
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _load_gtfs(gtfs_path):
|
|
15
|
+
"""Load required GTFS tables from a directory."""
|
|
16
|
+
p = Path(gtfs_path)
|
|
17
|
+
if not p.is_dir():
|
|
18
|
+
raise FileNotFoundError(f"GTFS directory not found: {gtfs_path}")
|
|
19
|
+
|
|
20
|
+
required = ['shapes.txt', 'trips.txt', 'stops.txt', 'stop_times.txt']
|
|
21
|
+
for f in required:
|
|
22
|
+
if not (p / f).exists():
|
|
23
|
+
raise FileNotFoundError(f"Missing {f} in {gtfs_path}")
|
|
24
|
+
|
|
25
|
+
return {
|
|
26
|
+
'shapes': pd.read_csv(p / 'shapes.txt'),
|
|
27
|
+
'trips': pd.read_csv(p / 'trips.txt'),
|
|
28
|
+
'stops': pd.read_csv(p / 'stops.txt'),
|
|
29
|
+
'stop_times': pd.read_csv(p / 'stop_times.txt'),
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def segment_shape(shapes_df, trips_df, stops_df, stop_times_df,
                  shape_id, metric_crs=None, tolerance_m=50):
    """
    Cut one GTFS shape into stop-to-stop segments.

    The stop visit order is taken from the first trip in trips_df that
    references the shape.

    Parameters
    ----------
    shapes_df, trips_df, stops_df, stop_times_df : DataFrame
        GTFS tables
    shape_id : int or str
        Target shape ID
    metric_crs : str or None
        Projected CRS passed to ShapeRuler. Auto-detected when None.
    tolerance_m : float
        Max perpendicular distance for candidate projection (default 50m)

    Returns
    -------
    dict with:
        - segments : output of segment_route
        - projected_stops : output of project_stops_sequential
        - ruler : ShapeRuler instance
        - diagnostics : dict with summary stats

    Raises
    ------
    ValueError
        If no trip uses the shape, or the chosen trip has no stop_times rows
    """
    ruler = ShapeRuler(shapes_df, shape_id, metric_crs=metric_crs)

    # Any trip on the shape will do; the first one supplies the visit order.
    uses_shape = trips_df['shape_id'] == shape_id
    trips_on_shape = trips_df[uses_shape]
    if trips_on_shape.empty:
        raise ValueError(f"No trips found for shape {shape_id}")
    trip_id = trips_on_shape['trip_id'].iloc[0]

    trip_stop_times = stop_times_df[stop_times_df['trip_id'] == trip_id]
    ordered = trip_stop_times.sort_values('stop_sequence')
    stop_sequence = ordered['stop_id'].tolist()
    if not stop_sequence:
        raise ValueError(
            f"No stops found for trip {trip_id} (shape {shape_id})")

    # Only the stops visited by this trip are projected.
    visited = stops_df['stop_id'].isin(stop_sequence)
    route_stops = stops_df[visited].copy()

    projected = project_stops_sequential(
        ruler, route_stops, stop_sequence, tolerance_m=tolerance_m)
    segments = segment_route(ruler, projected)

    # Empty frames may lack the stat columns, so guard each lookup.
    if len(projected) > 0:
        offsets = projected['offset_m']
        max_offset = offsets.max()
        n_high_offset = int((offsets > 30).sum())
    else:
        max_offset = 0
        n_high_offset = 0

    if len(segments) > 0:
        n_degenerate = int((segments['segment_dist_m'] < 5).sum())
    else:
        n_degenerate = 0

    diagnostics = {
        'shape_id': shape_id,
        'length_m': ruler.length,
        'n_stops': len(projected),
        'n_segments': len(segments),
        'max_offset_m': max_offset,
        'n_high_offset': n_high_offset,
        'n_degenerate': n_degenerate,
    }

    return {
        'segments': segments,
        'projected_stops': projected,
        'ruler': ruler,
        'diagnostics': diagnostics,
    }
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def segment_feed(gtfs_path, shape_ids=None, metric_crs=None,
                 tolerance_m=50, verbose=True):
    """
    Segment an entire GTFS feed into stop-to-stop segments.

    Each shape is processed independently through segment_shape; a shape
    that raises is recorded in ``failures`` instead of aborting the run.

    Parameters
    ----------
    gtfs_path : str or Path
        Path to directory containing GTFS .txt files
    shape_ids : list or None
        Specific shape IDs to process. If None, processes every shape in
        shapes.txt that is referenced by at least one trip.
    metric_crs : str or None
        Projected CRS. Auto-detects UTM zone if None.
    tolerance_m : float
        Max perpendicular distance for candidate projection (default 50m)
    verbose : bool
        Print progress (default True)

    Returns
    -------
    dict with:
        - segments : GeoDataFrame of all segments across all shapes
          (an empty GeoDataFrame when nothing was produced)
        - diagnostics : DataFrame with per-shape summary stats
        - failures : DataFrame with columns shape_id and error; the
          columns are present even when no shape failed

    Raises
    ------
    FileNotFoundError
        If the directory or one of the required GTFS files is missing
    """
    gtfs = _load_gtfs(gtfs_path)
    shapes_df = gtfs['shapes']
    trips_df = gtfs['trips']
    stops_df = gtfs['stops']
    stop_times_df = gtfs['stop_times']

    # Determine which shapes to process
    if shape_ids is None:
        # Keep only shapes that have at least one trip
        known_shapes = shapes_df['shape_id'].unique()
        used = trips_df['shape_id'].isin(known_shapes)
        shape_ids = trips_df[used]['shape_id'].unique().tolist()

    all_segments = []
    all_diagnostics = []
    failures = []

    for i, shape_id in enumerate(shape_ids):
        if verbose and (i + 1) % 50 == 0:
            print(f" Processing shape {i + 1}/{len(shape_ids)}...")

        # Deliberately broad: one bad shape must not stop the whole feed,
        # and the message is kept so the caller can inspect it afterwards.
        try:
            result = segment_shape(
                shapes_df, trips_df, stops_df, stop_times_df,
                shape_id, metric_crs=metric_crs, tolerance_m=tolerance_m)
        except Exception as e:
            failures.append({'shape_id': shape_id, 'error': str(e)})
            continue

        if len(result['segments']) > 0:
            all_segments.append(result['segments'])
        all_diagnostics.append(result['diagnostics'])

    # Combine results
    if all_segments:
        combined_segments = gpd.GeoDataFrame(
            pd.concat(all_segments, ignore_index=True),
            crs="EPSG:4326"
        )
    else:
        combined_segments = gpd.GeoDataFrame()

    diagnostics_df = pd.DataFrame(all_diagnostics)
    # Pin the columns so result['failures']['shape_id'] works on a clean run
    # instead of raising KeyError on a column-less frame.
    failures_df = pd.DataFrame(failures, columns=['shape_id', 'error'])

    if verbose:
        n = len(diagnostics_df)
        n_fail = len(failures_df)
        n_high = (diagnostics_df['n_high_offset'] > 0).sum() if n > 0 else 0
        n_degen = (diagnostics_df['n_degenerate'] > 0).sum() if n > 0 else 0
        n_segs = len(combined_segments)
        print(f" Processed {n} shapes → {n_segs} segments")
        if n_fail > 0:
            print(f" Failed: {n_fail}")
        if n_high > 0:
            print(f" High offset (>30m): {n_high} shapes")
        if n_degen > 0:
            print(f" Degenerate (<5m): {n_degen} shapes")

    return {
        'segments': combined_segments,
        'diagnostics': diagnostics_df,
        'failures': failures_df,
    }
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sequential stop-to-shape projection with candidate-aware loop handling.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import geopandas as gpd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def project_stops_sequential(ruler, stops_df, stop_sequence, tolerance_m=50):
    """
    Project stops onto a shape in visit order with monotonic enforcement.

    For each stop, all candidate projection points within tolerance are
    collected from the ruler, searching only at or beyond the distance
    chosen for the previous stop. The earliest candidate with a reasonable
    offset wins, which handles loops where a stop could match the entry or
    the exit. When no candidate is within tolerance the stop falls back to
    ``ruler.project``.

    Parameters
    ----------
    ruler : ShapeRuler
        Built for the target shape
    stops_df : DataFrame
        Stop locations with columns: stop_id, stop_lat, stop_lon and
        optionally stop_name
    stop_sequence : list
        stop_ids in visit order (from stop_times sorted by stop_sequence).
        IDs that are absent from stops_df are skipped.
    tolerance_m : float
        Max perpendicular distance for candidate detection (default 50m)

    Returns
    -------
    GeoDataFrame in metric CRS with columns:
        stop_id, stop_name, shape_dist_traveled, offset_m,
        snapped_geom, original_geom, n_candidates
    The frame has these columns and zero rows when no stop could be matched.
    """
    columns = ['stop_id', 'stop_name', 'shape_dist_traveled', 'offset_m',
               'snapped_geom', 'original_geom', 'n_candidates']

    gdf = gpd.GeoDataFrame(
        stops_df.copy(),
        geometry=gpd.points_from_xy(stops_df.stop_lon, stops_df.stop_lat),
        crs="EPSG:4326"
    ).to_crs(ruler.metric_crs)

    results = []
    min_dist = 0.0  # distance of the previous stop; later stops may not precede it

    for stop_id in stop_sequence:
        match = gdf[gdf['stop_id'] == stop_id]
        if match.empty:
            continue
        row = match.iloc[0]

        candidates = ruler.find_candidates(
            row.geometry, after=min_dist, tolerance_m=tolerance_m)

        if candidates:
            best_offset = min(c['offset'] for c in candidates)

            # Accept earliest candidate whose offset is within 2x of best
            # or within 15m absolute, whichever is more generous.
            threshold = max(best_offset * 2, 15)

            # candidates arrive sorted by distance, so the first hit is the earliest
            chosen = next(
                (c for c in candidates if c['offset'] <= threshold), None)
            if chosen is None:
                chosen = min(candidates, key=lambda c: c['offset'])
        else:
            chosen = ruler.project(row.geometry, after=min_dist)

        results.append({
            'stop_id': stop_id,
            'stop_name': row.get('stop_name', ''),
            'shape_dist_traveled': chosen['dist'],
            'offset_m': chosen['offset'],
            'snapped_geom': ruler.line.interpolate(chosen['dist']),
            'original_geom': row.geometry,
            'n_candidates': len(candidates),
        })

        min_dist = chosen['dist']

    if not results:
        # An empty list has no 'snapped_geom' column to promote to geometry,
        # so build the empty frame from explicit column names instead.
        return gpd.GeoDataFrame(
            columns=columns, geometry='snapped_geom', crs=ruler.metric_crs)

    return gpd.GeoDataFrame(
        results, geometry='snapped_geom', crs=ruler.metric_crs)
|
gtfs_segments/ruler.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shape ruler: wraps a GTFS shape as a metric LineString with cumulative
|
|
3
|
+
distance, providing sequential projection and segment slicing.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import geopandas as gpd
|
|
8
|
+
from shapely.geometry import LineString
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _utm_crs_from_lon(lon):
|
|
12
|
+
"""Derive UTM EPSG code from a longitude value."""
|
|
13
|
+
zone = int((lon + 180) / 6) + 1
|
|
14
|
+
return f"EPSG:326{zone:02d}" if zone <= 60 else "EPSG:32660"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ShapeRuler:
    """
    A GTFS shape held as a metric LineString with per-vertex cumulative
    distance, supporting forward-only projection and slicing.

    Parameters
    ----------
    shapes_df : DataFrame
        Full shapes.txt with columns: shape_id, shape_pt_sequence,
        shape_pt_lat, shape_pt_lon
    shape_id : int or str
        Target shape to build the ruler for
    metric_crs : str or None
        Projected CRS for distance calculations. If None, a UTM zone is
        derived from the shape's median longitude.
    """

    def __init__(self, shapes_df, shape_id, metric_crs=None):
        shape_rows = shapes_df[shapes_df['shape_id'] == shape_id]
        shape_rows = shape_rows.sort_values('shape_pt_sequence').copy()
        if shape_rows.empty:
            raise ValueError(f"Shape {shape_id} not found in shapes_df")

        self.shape_id = shape_id

        # Fall back to a UTM zone picked from the median longitude
        if metric_crs is None:
            metric_crs = _utm_crs_from_lon(shape_rows['shape_pt_lon'].median())
        self.metric_crs = metric_crs

        # Shape vertices reprojected from WGS84 into the metric CRS
        wgs84_points = gpd.points_from_xy(
            shape_rows.shape_pt_lon, shape_rows.shape_pt_lat)
        vertices = gpd.GeoDataFrame(
            shape_rows, geometry=wgs84_points, crs="EPSG:4326")
        self.vertices = vertices.to_crs(metric_crs).reset_index(drop=True)

        # Distance from each vertex to its predecessor (0 for the first),
        # accumulated into a running total
        step = self.vertices.geometry.distance(self.vertices.geometry.shift())
        self.vertices['dist_cum'] = step.fillna(0).cumsum()

        # Full line in metric CRS
        points = self.vertices.geometry.tolist()
        self.line = LineString(points)
        self.length = self.line.length

        # One two-point line per consecutive vertex pair, alongside the
        # cumulative distance at which that piece begins
        self._seg_lines = [
            LineString([a, b]) for a, b in zip(points[:-1], points[1:])]
        self._seg_starts = np.array(
            self.vertices['dist_cum'].iloc[:-1].tolist())

    def _scan(self, point_geom, after):
        """
        Yield (distance_along_shape, offset) for the projection of
        point_geom onto every segment that ends at or beyond ``after``.
        Distances are never smaller than ``after``.
        """
        for seg_start, seg_line in zip(self._seg_starts, self._seg_lines):
            if seg_start + seg_line.length < after:
                continue
            along = max(seg_start + seg_line.project(point_geom), after)
            yield along, point_geom.distance(self.line.interpolate(along))

    def project(self, point_geom, after=0.0):
        """
        Project a point onto the shape, considering only positions at or
        beyond ``after``. Returns dict with dist, offset, snapped; when no
        segment qualifies the end of the shape is used.
        """
        best_dist, best_offset = None, float('inf')

        for along, offset in self._scan(point_geom, after):
            # strict comparison: the earliest position wins a tie
            if offset < best_offset:
                best_dist, best_offset = along, offset

        if best_dist is None:
            best_dist = self.length
            best_offset = point_geom.distance(
                self.line.interpolate(self.length))

        return {
            'dist': best_dist,
            'offset': best_offset,
            'snapped': self.line.interpolate(best_dist),
        }

    def find_candidates(self, point_geom, after=0.0, tolerance_m=50):
        """
        Find all distinct positions where a point projects within tolerance.

        A stop at the base of a loop will have candidates at both the entry
        and exit. A candidate within 30m (along the shape) of the previously
        kept one is merged into it, keeping whichever has the smaller offset.

        Returns list of dict {dist, offset}, sorted by distance.
        """
        hits = sorted(
            ({'dist': along, 'offset': offset}
             for along, offset in self._scan(point_geom, after)
             if offset <= tolerance_m),
            key=lambda hit: hit['dist'])

        if not hits:
            return []

        merged = hits[:1]
        for hit in hits[1:]:
            kept = merged[-1]
            if hit['dist'] - kept['dist'] > 30:
                merged.append(hit)
            elif hit['offset'] < kept['offset']:
                merged[-1] = hit
        return merged

    def slice(self, d_start, d_end):
        """
        Extract the shape geometry between two distances.
        Returns LineString in metric CRS, or None if d_end <= d_start.
        """
        if d_end <= d_start:
            return None

        start_pt = self.line.interpolate(d_start)
        end_pt = self.line.interpolate(d_end)

        # Original vertices strictly between the two cut points
        cum = self.vertices['dist_cum']
        between = self.vertices[(cum > d_start) & (cum < d_end)]

        coords = [(start_pt.x, start_pt.y)]
        coords.extend((pt.x, pt.y) for pt in between.geometry.tolist())
        coords.append((end_pt.x, end_pt.y))

        # coords always holds both cut points, so it never has fewer than two
        return LineString(coords)

    def __repr__(self):
        return (f"ShapeRuler(shape_id={self.shape_id}, "
                f"length={self.length:.0f}m, "
                f"vertices={len(self.vertices)}, "
                f"crs={self.metric_crs})")
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Segment builder: cuts a shape into stop-to-stop segments following
|
|
3
|
+
the original road geometry.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import geopandas as gpd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def segment_route(ruler, projected_stops, output_crs="EPSG:4326"):
    """
    Build stop-to-stop segments following the shape geometry.

    Consecutive stops are paired after ordering by shape_dist_traveled and
    the shape is sliced between each pair. Pairs for which ``ruler.slice``
    returns None are skipped.

    Parameters
    ----------
    ruler : ShapeRuler
    projected_stops : GeoDataFrame
        Output of project_stops_sequential
    output_crs : str
        CRS for output (default WGS84)

    Returns
    -------
    GeoDataFrame with columns:
        segment_idx, shape_id, from_stop_id, to_stop_id, from_stop_name,
        to_stop_name, segment_dist_m, geometry
    An empty GeoDataFrame (no columns) is returned when no segment can be
    built, including when fewer than two stops are supplied.
    """
    # Fewer than two stops cannot form a pair; returning here also avoids a
    # KeyError from sorting a frame that has no columns at all.
    if len(projected_stops) < 2:
        return gpd.GeoDataFrame()

    # A stable sort keeps stops that share a distance in their incoming
    # order, so from/to labels are not swapped on ties.
    sorted_stops = (projected_stops
                    .sort_values('shape_dist_traveled', kind='stable')
                    .reset_index(drop=True))
    segments = []

    for i in range(len(sorted_stops) - 1):
        sa = sorted_stops.iloc[i]
        sb = sorted_stops.iloc[i + 1]

        geom = ruler.slice(sa['shape_dist_traveled'],
                           sb['shape_dist_traveled'])
        if geom is None:
            continue

        segments.append({
            'segment_idx': i,
            'shape_id': ruler.shape_id,
            'from_stop_id': sa['stop_id'],
            'to_stop_id': sb['stop_id'],
            'from_stop_name': sa.get('stop_name', ''),
            'to_stop_name': sb.get('stop_name', ''),
            'segment_dist_m': (sb['shape_dist_traveled'] -
                               sa['shape_dist_traveled']),
            'geometry': geom,
        })

    if not segments:
        return gpd.GeoDataFrame()

    # Slices are built in the ruler's metric CRS; reproject for output
    return gpd.GeoDataFrame(segments, crs=ruler.metric_crs).to_crs(output_crs)
|