gtfs_df 0.4.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/lib/gtfs_df/feed.rb +86 -64
- data/lib/gtfs_df/graph.rb +17 -16
- data/lib/gtfs_df/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c7c1d87e57bbb44ceb4ce8112da7172c5a8c76f8e88d3b1e8fbb610aad850cf3
|
|
4
|
+
data.tar.gz: 6a1b68dfc723d3c70b779687a70100b735c9a57e91603b302685318660473c66
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 85684d79eac1479bac56995cd00a4d22106e5247979e2b54b07dee0d2b1948c4e86bf005be674b57656c0a9c14acf731a11d17163edde72d0302561a2d489159
|
|
7
|
+
data.tar.gz: 7033b279730614870ff8710e51b29ff20f2b399aff9cd1b13a8a2a3202940a88221d9f0edc5dc82a5c3593c9f2314fdc6082fd8c89673f8ec956ba9eada11e2a
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,18 @@
|
|
|
1
|
+
## [0.6.0] - 2025-12-09
|
|
2
|
+
### Fixed
|
|
3
|
+
|
|
4
|
+
- visit nodes multiple times
|
|
5
|
+
## [0.5.0] - 2025-12-08
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
- add Feed#filter filter_only_children param
|
|
10
|
+
|
|
11
|
+
### Maintenance
|
|
12
|
+
|
|
13
|
+
- arrange edges so parent is always first
|
|
14
|
+
- build directed graph
|
|
15
|
+
- allow ! in commit messages
|
|
1
16
|
## [0.4.1] - 2025-12-05
|
|
2
17
|
|
|
3
18
|
### Added
|
data/lib/gtfs_df/feed.rb
CHANGED
|
@@ -75,13 +75,17 @@ module GtfsDf
|
|
|
75
75
|
#
|
|
76
76
|
# @param view [Hash] The view used to filter the feed, with format { file => filters }.
|
|
77
77
|
# Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
|
|
78
|
-
# @param
|
|
79
|
-
#
|
|
80
|
-
#
|
|
81
|
-
#
|
|
82
|
-
#
|
|
83
|
-
#
|
|
84
|
-
|
|
78
|
+
# @param filter_only_children [Boolean] Whether only child dependencies should be pruned.
|
|
79
|
+
# When false, we:
|
|
80
|
+
# - Treat trips as the atomic unit of GTFS. Therefore, if we filter to one stop
|
|
81
|
+
# referenced by TripA, we will preserve _all stops_ referenced by TripA.
|
|
82
|
+
# - Prune unreferenced parent objects (e.g. route is a parent of trip. Unreferenced routes
|
|
83
|
+
# will be pruned.)
|
|
84
|
+
# When true we:
|
|
85
|
+
# - Do not treat trips as atomic. I can filter stopA without maintaining other stops for
|
|
86
|
+
# trips that reference it.
|
|
87
|
+
# - Only filter child objects
|
|
88
|
+
def filter(view, filter_only_children: false)
|
|
85
89
|
filtered = {}
|
|
86
90
|
|
|
87
91
|
GTFS_FILES.each do |file|
|
|
@@ -91,7 +95,11 @@ module GtfsDf
|
|
|
91
95
|
filtered[file] = df
|
|
92
96
|
end
|
|
93
97
|
|
|
94
|
-
if
|
|
98
|
+
if filter_only_children
|
|
99
|
+
view.each do |file, filters|
|
|
100
|
+
filtered = filter!(file, filters, filtered.dup, filter_only_children: true)
|
|
101
|
+
end
|
|
102
|
+
else
|
|
95
103
|
# Trips are the atomic unit of GTFS, we will generate a new view
|
|
96
104
|
# based on the set of trips that would be included for each individual filter
|
|
97
105
|
# and cascade changes from this view in order to retain referential integrity
|
|
@@ -107,11 +115,7 @@ module GtfsDf
|
|
|
107
115
|
end
|
|
108
116
|
|
|
109
117
|
if trip_ids
|
|
110
|
-
filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
|
|
111
|
-
end
|
|
112
|
-
else
|
|
113
|
-
view.each do |file, filters|
|
|
114
|
-
filtered = filter!(file, filters, filtered.dup)
|
|
118
|
+
filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered.dup)
|
|
115
119
|
end
|
|
116
120
|
end
|
|
117
121
|
|
|
@@ -128,7 +132,7 @@ module GtfsDf
|
|
|
128
132
|
|
|
129
133
|
private
|
|
130
134
|
|
|
131
|
-
def filter!(file, filters, filtered)
|
|
135
|
+
def filter!(file, filters, filtered, filter_only_children: false)
|
|
132
136
|
unless filters.empty?
|
|
133
137
|
df = filtered[file]
|
|
134
138
|
|
|
@@ -144,7 +148,7 @@ module GtfsDf
|
|
|
144
148
|
|
|
145
149
|
filtered[file] = df
|
|
146
150
|
|
|
147
|
-
prune!(file, filtered)
|
|
151
|
+
prune!(file, filtered, filter_only_children:)
|
|
148
152
|
end
|
|
149
153
|
|
|
150
154
|
filtered
|
|
@@ -152,59 +156,77 @@ module GtfsDf
|
|
|
152
156
|
|
|
153
157
|
# Traverses the graph to prune unreferenced entities from child dataframes
|
|
154
158
|
# based on parent relationships. See GtfsDf::Graph::STOP_NODES
|
|
155
|
-
def prune!(root, filtered)
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
# Get valid values from parent
|
|
187
|
-
valid_values = parent_df[parent_col].to_a.uniq.compact
|
|
188
|
-
|
|
189
|
-
# Filter child to only include rows that reference valid parent values
|
|
190
|
-
before = child_df.height
|
|
191
|
-
filter = Polars.col(child_col).is_in(valid_values)
|
|
192
|
-
if allow_null
|
|
193
|
-
filter = (filter | Polars.col(child_col).is_null)
|
|
194
|
-
end
|
|
195
|
-
child_df = child_df.filter(filter)
|
|
196
|
-
changed = child_df.height < before
|
|
197
|
-
|
|
198
|
-
# If we removed a part of the child_df earlier, concat it back on
|
|
199
|
-
if saved_vals
|
|
200
|
-
child_df = Polars.concat([child_df, saved_vals], how: "vertical")
|
|
159
|
+
def prune!(root, filtered, filter_only_children: false)
|
|
160
|
+
seen_edges = Set.new
|
|
161
|
+
maybe_digraph = filter_only_children ? graph : graph.to_undirected
|
|
162
|
+
|
|
163
|
+
queue = [root]
|
|
164
|
+
|
|
165
|
+
while queue.length > 0
|
|
166
|
+
parent_node_id = queue.shift
|
|
167
|
+
maybe_digraph.adj[parent_node_id].each do |child_node_id, attrs|
|
|
168
|
+
edge = edge_id(parent_node_id, child_node_id)
|
|
169
|
+
|
|
170
|
+
next if seen_edges.include?(edge)
|
|
171
|
+
seen_edges.add(edge)
|
|
172
|
+
|
|
173
|
+
parent_node = Graph::NODES[parent_node_id]
|
|
174
|
+
child_node = Graph::NODES[child_node_id]
|
|
175
|
+
parent_df = filtered[parent_node.fetch(:file)]
|
|
176
|
+
next unless parent_df
|
|
177
|
+
|
|
178
|
+
child_df = filtered[child_node.fetch(:file)]
|
|
179
|
+
# Certain nodes are pre-filtered because they reference only
|
|
180
|
+
# a piece of the dataframe
|
|
181
|
+
filter_attrs = child_node[:filter_attrs]
|
|
182
|
+
if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
|
|
183
|
+
filter = filter_attrs.fetch(:filter)
|
|
184
|
+
# Temporarily remove rows that do not match node filter criteria to process them
|
|
185
|
+
# separately (e.g., when filtering stops, parent stations that should be preserved
|
|
186
|
+
# regardless of direct references)
|
|
187
|
+
saved_vals = child_df.filter(filter.is_not)
|
|
188
|
+
child_df = child_df.filter(filter)
|
|
201
189
|
end
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
190
|
+
next unless child_df && child_df.height > 0
|
|
191
|
+
|
|
192
|
+
queue << child_node_id
|
|
193
|
+
|
|
194
|
+
attrs[:dependencies].each do |dep|
|
|
195
|
+
parent_col = dep[parent_node_id]
|
|
196
|
+
child_col = dep[child_node_id]
|
|
197
|
+
allow_null = !!dep[:allow_null]
|
|
198
|
+
|
|
199
|
+
next unless parent_col && child_col &&
|
|
200
|
+
parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
|
|
201
|
+
|
|
202
|
+
# Get valid values from parent
|
|
203
|
+
valid_values = parent_df[parent_col].to_a.uniq.compact
|
|
204
|
+
|
|
205
|
+
# Filter child to only include rows that reference valid parent values
|
|
206
|
+
before = child_df.height
|
|
207
|
+
filter = Polars.col(child_col).is_in(valid_values)
|
|
208
|
+
if allow_null
|
|
209
|
+
filter = (filter | Polars.col(child_col).is_null)
|
|
210
|
+
end
|
|
211
|
+
child_df = child_df.filter(filter)
|
|
212
|
+
changed = child_df.height < before
|
|
213
|
+
|
|
214
|
+
# If we removed a part of the child_df earlier, concat it back on
|
|
215
|
+
if saved_vals
|
|
216
|
+
child_df = Polars.concat([child_df, saved_vals], how: "vertical")
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
if changed
|
|
220
|
+
filtered[child_node.fetch(:file)] = child_df
|
|
221
|
+
end
|
|
205
222
|
end
|
|
206
223
|
end
|
|
207
224
|
end
|
|
208
225
|
end
|
|
226
|
+
|
|
227
|
+
def edge_id(parent, child)
|
|
228
|
+
# Alphabetize to make sure this works with undirected graph
|
|
229
|
+
[parent, child].sort.join("-")
|
|
230
|
+
end
|
|
209
231
|
end
|
|
210
232
|
end
|
data/lib/gtfs_df/graph.rb
CHANGED
|
@@ -42,15 +42,16 @@ module GtfsDf
|
|
|
42
42
|
|
|
43
43
|
# Returns a directed graph of GTFS file dependencies
|
|
44
44
|
def self.build
|
|
45
|
-
g = NetworkX::
|
|
45
|
+
g = NetworkX::DiGraph.new
|
|
46
46
|
NODES.keys.each { |node| g.add_node(node) }
|
|
47
47
|
|
|
48
|
+
# Edges should be parent, child
|
|
48
49
|
# TODO: Add fare_rules -> stops + test
|
|
49
50
|
edges = [
|
|
50
51
|
["agency", "routes", {dependencies: [
|
|
51
52
|
{"agency" => "agency_id", "routes" => "agency_id"}
|
|
52
53
|
]}],
|
|
53
|
-
["
|
|
54
|
+
["agency", "fare_attributes", {dependencies: [
|
|
54
55
|
{"fare_attributes" => "agency_id",
|
|
55
56
|
"agency" => "agency_id"}
|
|
56
57
|
]}],
|
|
@@ -58,7 +59,7 @@ module GtfsDf
|
|
|
58
59
|
{"fare_attributes" => "fare_id",
|
|
59
60
|
"fare_rules" => "fare_id"}
|
|
60
61
|
]}],
|
|
61
|
-
["
|
|
62
|
+
["routes", "fare_rules", {dependencies: [
|
|
62
63
|
{"fare_rules" => "route_id", "routes" => "route_id", :allow_null => true}
|
|
63
64
|
]}],
|
|
64
65
|
["routes", "trips", {dependencies: [
|
|
@@ -67,24 +68,24 @@ module GtfsDf
|
|
|
67
68
|
["trips", "stop_times", {dependencies: [
|
|
68
69
|
{"trips" => "trip_id", "stop_times" => "trip_id"}
|
|
69
70
|
]}],
|
|
70
|
-
["
|
|
71
|
+
["stops", "stop_times", {dependencies: [
|
|
71
72
|
{"stop_times" => "stop_id", "stops" => "stop_id"}
|
|
72
73
|
]}],
|
|
73
74
|
# Self-referential edge: stops can reference parent stations (location_type=1)
|
|
74
|
-
["
|
|
75
|
+
["parent_stations", "stops", {dependencies: [
|
|
75
76
|
{"stops" => "parent_station", "parent_stations" => "stop_id"}
|
|
76
77
|
]}],
|
|
77
78
|
["stops", "transfers", {dependencies: [
|
|
78
79
|
{"stops" => "stop_id", "transfers" => "from_stop_id"},
|
|
79
80
|
{"stops" => "stop_id", "transfers" => "to_stop_id"}
|
|
80
81
|
]}],
|
|
81
|
-
["
|
|
82
|
+
["calendar", "trips", {dependencies: [
|
|
82
83
|
{"trips" => "service_id", "calendar" => "service_id"}
|
|
83
84
|
]}],
|
|
84
|
-
["
|
|
85
|
+
["calendar_dates", "trips", {dependencies: [
|
|
85
86
|
{"trips" => "service_id", "calendar_dates" => "service_id"}
|
|
86
87
|
]}],
|
|
87
|
-
["
|
|
88
|
+
["shapes", "trips", {dependencies: [
|
|
88
89
|
{"trips" => "shape_id", "shapes" => "shape_id"}
|
|
89
90
|
]}],
|
|
90
91
|
["trips", "frequencies", {dependencies: [
|
|
@@ -97,11 +98,11 @@ module GtfsDf
|
|
|
97
98
|
{"stops" => "stop_id", "fare_leg_join_rules" => "from_stop_id"},
|
|
98
99
|
{"stops" => "stop_id", "fare_leg_join_rules" => "to_stop_id"}
|
|
99
100
|
]}],
|
|
100
|
-
["
|
|
101
|
+
["networks", "fare_leg_join_rules", {dependencies: [
|
|
101
102
|
{"fare_leg_join_rules" => "from_network_id", "networks" => "network_id"},
|
|
102
103
|
{"fare_leg_join_rules" => "to_network_id", "networks" => "network_id"}
|
|
103
104
|
]}],
|
|
104
|
-
["
|
|
105
|
+
["fare_leg_rules", "fare_leg_join_rules",
|
|
105
106
|
{dependencies: [
|
|
106
107
|
{"fare_leg_join_rules" => "fare_leg_rule_id", "fare_leg_rules" => "fare_leg_rule_id"}
|
|
107
108
|
]}],
|
|
@@ -110,14 +111,14 @@ module GtfsDf
|
|
|
110
111
|
{"fare_transfer_rules" => "from_leg_group_id", "fare_leg_rules" => "leg_group_id"},
|
|
111
112
|
{"fare_transfer_rules" => "to_leg_group_id", "fare_leg_rules" => "leg_group_id"}
|
|
112
113
|
]}],
|
|
113
|
-
["
|
|
114
|
+
["fare_products", "fare_transfer_rules",
|
|
114
115
|
{dependencies: [
|
|
115
116
|
{"fare_transfer_rules" => "fare_product_id", "fare_products" => "fare_product_id"}
|
|
116
117
|
]}],
|
|
117
118
|
["areas", "stop_areas", {dependencies: [
|
|
118
119
|
{"areas" => "area_id", "stop_areas" => "area_id"}
|
|
119
120
|
]}],
|
|
120
|
-
["
|
|
121
|
+
["areas", "stops", {dependencies: [
|
|
121
122
|
{"stops" => "area_id", "areas" => "area_id"}
|
|
122
123
|
]}],
|
|
123
124
|
["areas", "fare_leg_rules", {dependencies: [
|
|
@@ -133,10 +134,10 @@ module GtfsDf
|
|
|
133
134
|
["networks", "fare_leg_rules", {dependencies: [
|
|
134
135
|
{"networks" => "network_id", "fare_leg_rules" => "network_id"}
|
|
135
136
|
]}],
|
|
136
|
-
["
|
|
137
|
+
["routes", "route_networks", {dependencies: [
|
|
137
138
|
{"route_networks" => "route_id", "routes" => "route_id"}
|
|
138
139
|
]}],
|
|
139
|
-
["
|
|
140
|
+
["networks", "route_networks", {dependencies: [
|
|
140
141
|
{"route_networks" => "network_id", "networks" => "network_id"}
|
|
141
142
|
]}],
|
|
142
143
|
["location_groups", "location_group_stops", {dependencies: [
|
|
@@ -145,13 +146,13 @@ module GtfsDf
|
|
|
145
146
|
["location_groups", "stops", {dependencies: [
|
|
146
147
|
{"location_groups" => "location_group_id", "stops" => "location_group_id"}
|
|
147
148
|
]}],
|
|
148
|
-
["
|
|
149
|
+
["stops", "location_group_stops", {dependencies: [
|
|
149
150
|
{"location_group_stops" => "stop_id", "stops" => "stop_id"}
|
|
150
151
|
]}],
|
|
151
152
|
["stops", "location_group_stops", {dependencies: [
|
|
152
153
|
{"stops" => "stop_id", "location_group_stops" => "stop_id"}
|
|
153
154
|
]}],
|
|
154
|
-
["
|
|
155
|
+
["location_groups", "location_group_stops", {dependencies: [
|
|
155
156
|
{"location_group_stops" => "location_group_id", "location_groups" => "location_group_id"}
|
|
156
157
|
]}],
|
|
157
158
|
["booking_rules", "stop_times", {dependencies: [
|
data/lib/gtfs_df/version.rb
CHANGED