gtfs_df 0.4.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 205ec058b41c5bd1d2b01ff3950d4cd2ebb20f304d02d5bcd3dd0c447b4e0a6e
4
- data.tar.gz: 54cca637de421c26d2144df100f8430f5dddb639b3a37d1160dfbd9704630e33
3
+ metadata.gz: c7c1d87e57bbb44ceb4ce8112da7172c5a8c76f8e88d3b1e8fbb610aad850cf3
4
+ data.tar.gz: 6a1b68dfc723d3c70b779687a70100b735c9a57e91603b302685318660473c66
5
5
  SHA512:
6
- metadata.gz: ef853b504ee701e77911259352d0057f6e327db8022370a1fa0dbaa597bfadcac7ac1c8fbe746c04ec4669d86384279a259ac35d4126f1483286937812cb7cce
7
- data.tar.gz: 728d684dd02b653cc779aa5e8152766f5dccafcb097e8fca5810b90f994c442645119f32efb62c84d8fbf1645109f579a3c74f6eb88c9e21dd03faa72193cdd0
6
+ metadata.gz: 85684d79eac1479bac56995cd00a4d22106e5247979e2b54b07dee0d2b1948c4e86bf005be674b57656c0a9c14acf731a11d17163edde72d0302561a2d489159
7
+ data.tar.gz: 7033b279730614870ff8710e51b29ff20f2b399aff9cd1b13a8a2a3202940a88221d9f0edc5dc82a5c3593c9f2314fdc6082fd8c89673f8ec956ba9eada11e2a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,18 @@
1
+ ## [0.6.0] - 2025-12-09
2
+ ### Fixed
3
+
4
+ - visit nodes multiple times
5
+ ## [0.5.0] - 2025-12-08
6
+
7
+ ### Added
8
+
9
+ - add Feed#filter filter_only_children param
10
+
11
+ ### Maintenance
12
+
13
+ - arrange edges so parent is always first
14
+ - build directed graph
15
+ - allow ! in commit messages
1
16
  ## [0.4.1] - 2025-12-05
2
17
 
3
18
  ### Added
data/lib/gtfs_df/feed.rb CHANGED
@@ -75,13 +75,17 @@ module GtfsDf
75
75
  #
76
76
  # @param view [Hash] The view used to filter the feed, with format { file => filters }.
77
77
  # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
78
- # @param maintain_trip_dependencies [Boolean] Whether trip dependencies should be preserved.
79
- # By default, we treat trips as the atomic unit of GTFS. Therefore, if we filter to one stop
80
- # referenced by TripA, we will preserve _all stops_ referenced by TripA. However, it is
81
- # occasionally useful to prune bad data and _not_ maintain all trip dependencies.
82
- # For example, if StopA contains invalid coordinates, we may wish to filter it out but keep
83
- # the other stops for TripA. In this case, `maintain_trip_dependencies` should be set to false.
84
- def filter(view, maintain_trip_dependencies = true)
78
+ # @param filter_only_children [Boolean] Whether only child dependencies should be pruned.
79
+ # When false, we:
80
+ # - Treat trips as the atomic unit of GTFS. Therefore, if we filter to one stop
81
+ # referenced by TripA, we will preserve _all stops_ referenced by TripA.
82
+ # - Prune unreferenced parent objects (e.g. route is a parent of trip. Unreferenced routes
83
+ # will be pruned.)
84
+ # When true, we:
85
+ # - Do not treat trips as atomic. I can filter stopA without maintaining other stops for
86
+ # trips that reference it.
87
+ # - Only filter child objects
88
+ def filter(view, filter_only_children: false)
85
89
  filtered = {}
86
90
 
87
91
  GTFS_FILES.each do |file|
@@ -91,7 +95,11 @@ module GtfsDf
91
95
  filtered[file] = df
92
96
  end
93
97
 
94
- if maintain_trip_dependencies
98
+ if filter_only_children
99
+ view.each do |file, filters|
100
+ filtered = filter!(file, filters, filtered.dup, filter_only_children: true)
101
+ end
102
+ else
95
103
  # Trips are the atomic unit of GTFS, we will generate a new view
96
104
  # based on the set of trips that would be included for each individual filter
97
105
  # and cascade changes from this view in order to retain referential integrity
@@ -107,11 +115,7 @@ module GtfsDf
107
115
  end
108
116
 
109
117
  if trip_ids
110
- filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
111
- end
112
- else
113
- view.each do |file, filters|
114
- filtered = filter!(file, filters, filtered.dup)
118
+ filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered.dup)
115
119
  end
116
120
  end
117
121
 
@@ -128,7 +132,7 @@ module GtfsDf
128
132
 
129
133
  private
130
134
 
131
- def filter!(file, filters, filtered)
135
+ def filter!(file, filters, filtered, filter_only_children: false)
132
136
  unless filters.empty?
133
137
  df = filtered[file]
134
138
 
@@ -144,7 +148,7 @@ module GtfsDf
144
148
 
145
149
  filtered[file] = df
146
150
 
147
- prune!(file, filtered)
151
+ prune!(file, filtered, filter_only_children:)
148
152
  end
149
153
 
150
154
  filtered
@@ -152,59 +156,77 @@ module GtfsDf
152
156
 
153
157
  # Traverses the graph to prune unreferenced entities from child dataframes
154
158
  # based on parent relationships. See GtfsDf::Graph::STOP_NODES
155
- def prune!(root, filtered)
156
- graph.each_bfs_edge(root) do |parent_node_id, child_node_id|
157
- parent_node = Graph::NODES[parent_node_id]
158
- child_node = Graph::NODES[child_node_id]
159
- parent_df = filtered[parent_node.fetch(:file)]
160
- next unless parent_df
161
-
162
- child_df = filtered[child_node.fetch(:file)]
163
- # Certain nodes are pre-filtered because they reference only
164
- # a piece of the dataframe
165
- filter_attrs = child_node[:filter_attrs]
166
- if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
167
- filter = filter_attrs.fetch(:filter)
168
- # Temporarily remove rows that do not match node filter criteria to process them
169
- # separately (e.g., when filtering stops, parent stations that should be preserved
170
- # regardless of direct references)
171
- saved_vals = child_df.filter(filter.is_not)
172
- child_df = child_df.filter(filter)
173
- end
174
- next unless child_df && child_df.height > 0
175
-
176
- attrs = graph.get_edge_data(parent_node_id, child_node_id)
177
-
178
- attrs[:dependencies].each do |dep|
179
- parent_col = dep[parent_node_id]
180
- child_col = dep[child_node_id]
181
- allow_null = !!dep[:allow_null]
182
-
183
- next unless parent_col && child_col &&
184
- parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
185
-
186
- # Get valid values from parent
187
- valid_values = parent_df[parent_col].to_a.uniq.compact
188
-
189
- # Filter child to only include rows that reference valid parent values
190
- before = child_df.height
191
- filter = Polars.col(child_col).is_in(valid_values)
192
- if allow_null
193
- filter = (filter | Polars.col(child_col).is_null)
194
- end
195
- child_df = child_df.filter(filter)
196
- changed = child_df.height < before
197
-
198
- # If we removed a part of the child_df earlier, concat it back on
199
- if saved_vals
200
- child_df = Polars.concat([child_df, saved_vals], how: "vertical")
159
+ def prune!(root, filtered, filter_only_children: false)
160
+ seen_edges = Set.new
161
+ maybe_digraph = filter_only_children ? graph : graph.to_undirected
162
+
163
+ queue = [root]
164
+
165
+ while queue.length > 0
166
+ parent_node_id = queue.shift
167
+ maybe_digraph.adj[parent_node_id].each do |child_node_id, attrs|
168
+ edge = edge_id(parent_node_id, child_node_id)
169
+
170
+ next if seen_edges.include?(edge)
171
+ seen_edges.add(edge)
172
+
173
+ parent_node = Graph::NODES[parent_node_id]
174
+ child_node = Graph::NODES[child_node_id]
175
+ parent_df = filtered[parent_node.fetch(:file)]
176
+ next unless parent_df
177
+
178
+ child_df = filtered[child_node.fetch(:file)]
179
+ # Certain nodes are pre-filtered because they reference only
180
+ # a piece of the dataframe
181
+ filter_attrs = child_node[:filter_attrs]
182
+ if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
183
+ filter = filter_attrs.fetch(:filter)
184
+ # Temporarily remove rows that do not match node filter criteria to process them
185
+ # separately (e.g., when filtering stops, parent stations that should be preserved
186
+ # regardless of direct references)
187
+ saved_vals = child_df.filter(filter.is_not)
188
+ child_df = child_df.filter(filter)
201
189
  end
202
-
203
- if changed
204
- filtered[child_node.fetch(:file)] = child_df
190
+ next unless child_df && child_df.height > 0
191
+
192
+ queue << child_node_id
193
+
194
+ attrs[:dependencies].each do |dep|
195
+ parent_col = dep[parent_node_id]
196
+ child_col = dep[child_node_id]
197
+ allow_null = !!dep[:allow_null]
198
+
199
+ next unless parent_col && child_col &&
200
+ parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
201
+
202
+ # Get valid values from parent
203
+ valid_values = parent_df[parent_col].to_a.uniq.compact
204
+
205
+ # Filter child to only include rows that reference valid parent values
206
+ before = child_df.height
207
+ filter = Polars.col(child_col).is_in(valid_values)
208
+ if allow_null
209
+ filter = (filter | Polars.col(child_col).is_null)
210
+ end
211
+ child_df = child_df.filter(filter)
212
+ changed = child_df.height < before
213
+
214
+ # If we removed a part of the child_df earlier, concat it back on
215
+ if saved_vals
216
+ child_df = Polars.concat([child_df, saved_vals], how: "vertical")
217
+ end
218
+
219
+ if changed
220
+ filtered[child_node.fetch(:file)] = child_df
221
+ end
205
222
  end
206
223
  end
207
224
  end
208
225
  end
226
+
227
+ def edge_id(parent, child)
228
+ # Alphabetize to make sure this works with undirected graph
229
+ [parent, child].sort.join("-")
230
+ end
209
231
  end
210
232
  end
data/lib/gtfs_df/graph.rb CHANGED
@@ -42,15 +42,16 @@ module GtfsDf
42
42
 
43
43
  # Returns a directed graph of GTFS file dependencies
44
44
  def self.build
45
- g = NetworkX::Graph.new
45
+ g = NetworkX::DiGraph.new
46
46
  NODES.keys.each { |node| g.add_node(node) }
47
47
 
48
+ # Edges should be parent, child
48
49
  # TODO: Add fare_rules -> stops + test
49
50
  edges = [
50
51
  ["agency", "routes", {dependencies: [
51
52
  {"agency" => "agency_id", "routes" => "agency_id"}
52
53
  ]}],
53
- ["fare_attributes", "agency", {dependencies: [
54
+ ["agency", "fare_attributes", {dependencies: [
54
55
  {"fare_attributes" => "agency_id",
55
56
  "agency" => "agency_id"}
56
57
  ]}],
@@ -58,7 +59,7 @@ module GtfsDf
58
59
  {"fare_attributes" => "fare_id",
59
60
  "fare_rules" => "fare_id"}
60
61
  ]}],
61
- ["fare_rules", "routes", {dependencies: [
62
+ ["routes", "fare_rules", {dependencies: [
62
63
  {"fare_rules" => "route_id", "routes" => "route_id", :allow_null => true}
63
64
  ]}],
64
65
  ["routes", "trips", {dependencies: [
@@ -67,24 +68,24 @@ module GtfsDf
67
68
  ["trips", "stop_times", {dependencies: [
68
69
  {"trips" => "trip_id", "stop_times" => "trip_id"}
69
70
  ]}],
70
- ["stop_times", "stops", {dependencies: [
71
+ ["stops", "stop_times", {dependencies: [
71
72
  {"stop_times" => "stop_id", "stops" => "stop_id"}
72
73
  ]}],
73
74
  # Self-referential edge: stops can reference parent stations (location_type=1)
74
- ["stops", "parent_stations", {dependencies: [
75
+ ["parent_stations", "stops", {dependencies: [
75
76
  {"stops" => "parent_station", "parent_stations" => "stop_id"}
76
77
  ]}],
77
78
  ["stops", "transfers", {dependencies: [
78
79
  {"stops" => "stop_id", "transfers" => "from_stop_id"},
79
80
  {"stops" => "stop_id", "transfers" => "to_stop_id"}
80
81
  ]}],
81
- ["trips", "calendar", {dependencies: [
82
+ ["calendar", "trips", {dependencies: [
82
83
  {"trips" => "service_id", "calendar" => "service_id"}
83
84
  ]}],
84
- ["trips", "calendar_dates", {dependencies: [
85
+ ["calendar_dates", "trips", {dependencies: [
85
86
  {"trips" => "service_id", "calendar_dates" => "service_id"}
86
87
  ]}],
87
- ["trips", "shapes", {dependencies: [
88
+ ["shapes", "trips", {dependencies: [
88
89
  {"trips" => "shape_id", "shapes" => "shape_id"}
89
90
  ]}],
90
91
  ["trips", "frequencies", {dependencies: [
@@ -97,11 +98,11 @@ module GtfsDf
97
98
  {"stops" => "stop_id", "fare_leg_join_rules" => "from_stop_id"},
98
99
  {"stops" => "stop_id", "fare_leg_join_rules" => "to_stop_id"}
99
100
  ]}],
100
- ["fare_leg_join_rules", "networks", {dependencies: [
101
+ ["networks", "fare_leg_join_rules", {dependencies: [
101
102
  {"fare_leg_join_rules" => "from_network_id", "networks" => "network_id"},
102
103
  {"fare_leg_join_rules" => "to_network_id", "networks" => "network_id"}
103
104
  ]}],
104
- ["fare_leg_join_rules", "fare_leg_rules",
105
+ ["fare_leg_rules", "fare_leg_join_rules",
105
106
  {dependencies: [
106
107
  {"fare_leg_join_rules" => "fare_leg_rule_id", "fare_leg_rules" => "fare_leg_rule_id"}
107
108
  ]}],
@@ -110,14 +111,14 @@ module GtfsDf
110
111
  {"fare_transfer_rules" => "from_leg_group_id", "fare_leg_rules" => "leg_group_id"},
111
112
  {"fare_transfer_rules" => "to_leg_group_id", "fare_leg_rules" => "leg_group_id"}
112
113
  ]}],
113
- ["fare_transfer_rules", "fare_products",
114
+ ["fare_products", "fare_transfer_rules",
114
115
  {dependencies: [
115
116
  {"fare_transfer_rules" => "fare_product_id", "fare_products" => "fare_product_id"}
116
117
  ]}],
117
118
  ["areas", "stop_areas", {dependencies: [
118
119
  {"areas" => "area_id", "stop_areas" => "area_id"}
119
120
  ]}],
120
- ["stops", "areas", {dependencies: [
121
+ ["areas", "stops", {dependencies: [
121
122
  {"stops" => "area_id", "areas" => "area_id"}
122
123
  ]}],
123
124
  ["areas", "fare_leg_rules", {dependencies: [
@@ -133,10 +134,10 @@ module GtfsDf
133
134
  ["networks", "fare_leg_rules", {dependencies: [
134
135
  {"networks" => "network_id", "fare_leg_rules" => "network_id"}
135
136
  ]}],
136
- ["route_networks", "routes", {dependencies: [
137
+ ["routes", "route_networks", {dependencies: [
137
138
  {"route_networks" => "route_id", "routes" => "route_id"}
138
139
  ]}],
139
- ["route_networks", "networks", {dependencies: [
140
+ ["networks", "route_networks", {dependencies: [
140
141
  {"route_networks" => "network_id", "networks" => "network_id"}
141
142
  ]}],
142
143
  ["location_groups", "location_group_stops", {dependencies: [
@@ -145,13 +146,13 @@ module GtfsDf
145
146
  ["location_groups", "stops", {dependencies: [
146
147
  {"location_groups" => "location_group_id", "stops" => "location_group_id"}
147
148
  ]}],
148
- ["location_group_stops", "stops", {dependencies: [
149
+ ["stops", "location_group_stops", {dependencies: [
149
150
  {"location_group_stops" => "stop_id", "stops" => "stop_id"}
150
151
  ]}],
151
152
  ["stops", "location_group_stops", {dependencies: [
152
153
  {"stops" => "stop_id", "location_group_stops" => "stop_id"}
153
154
  ]}],
154
- ["location_group_stops", "location_groups", {dependencies: [
155
+ ["location_groups", "location_group_stops", {dependencies: [
155
156
  {"location_group_stops" => "location_group_id", "location_groups" => "location_group_id"}
156
157
  ]}],
157
158
  ["booking_rules", "stop_times", {dependencies: [
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.4.1"
4
+ VERSION = "0.6.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado