gtfs_df 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b90534f9b41229026b91e4632c12dac2235ff615ac8138c301702e51bef3dfa
4
- data.tar.gz: 95ac49f7dcdecea5f08c2fe51f8f50b0466bcb75a1d8ae67720b464c2983ed88
3
+ metadata.gz: 127f50187dd26c4824abd94a6a44f64242715bf9092072276cdf6c811e0c28a6
4
+ data.tar.gz: d5a6bd9b25830d0574c6c315712d1ec4e0b14959d25f6a1cf12f2e13eeb738a2
5
5
  SHA512:
6
- metadata.gz: 2fc195cb47d81dd4799d99d8182f8c5ff7b56cdfdee08281ea8a129568b87f38d719956fd98532604dc26bf97dee344bbdd65943cc1e292248a2375b76290eda
7
- data.tar.gz: 0d4b2e22968e5ed37349c82ff84c9ba8a6f4d12c143a8709caa40998e16afe4a2178f820f68bb384dc4cf12de445a9f0d265d163d48071621e69f64501db76d5
6
+ metadata.gz: 9312f173960b069a31bb2bb37368bd65ad5345d527d5a29818a7485c731bcb6f6c5fb74639403a3bcbd5d5704ae2bbf629ed5cdfe3f5f629f36ea719663356da
7
+ data.tar.gz: 19563c256e1cbe52a34eef47c31f6b50806a0cb07b773bacb015ad40730c6d804941aef1c936aaeb6c17ea49719ab7db64d1144eeb05c7c4bd1d83ebcd87e12f
data/CHANGELOG.md CHANGED
@@ -1,3 +1,35 @@
1
+ ## [0.4.0] - 2025-12-04
2
+
3
+ ### Added
4
+
5
+ - allow setting maintain_trip_dependencies=false
6
+
7
+ ### Fixed
8
+
9
+ - parse stop_lat as float
10
+ - add missing agency -> fare_attributes edge
11
+ - allow null for fare_rules
12
+
13
+ ### Maintenance
14
+
15
+ - provide accessor for gtfs_files (utility)
16
+ - add yard docs
17
+
18
+ ## [0.3.0] - 2025-12-04
19
+
20
+ ### Added
21
+
22
+ - keep parent stations linked to used stops
23
+
24
+ ### Fixed
25
+
26
+ - handle null values
27
+ - update lock on version bump
28
+
29
+ ### Maintenance
30
+
31
+ - reuse load_from_dir logic in reader
32
+ - clean up unused method + better comments
1
33
  ## [0.1.0] - 2025-11-10
2
34
 
3
35
  - Initial release
data/README.md CHANGED
@@ -86,11 +86,25 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
86
86
 
87
87
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
88
88
 
89
+ ## Release process
90
+
91
+ 1. `bin/bump-version`
92
+
93
+ - Bump the version in `lib/gtfs_df/version.rb`
94
+ - Update the `CHANGELOG.md` using the git log since the last version
95
+ - Create and push a new release branch with those changes
96
+ - Create a PR for that release
97
+
98
+ 2. `bin/create-tag`
99
+
100
+ Creates and pushes the git tag for the release. That will trigger the GitHub action: `.github/workflows/publish.yml` to publish to RubyGems.
101
+
89
102
  ## TODO
90
103
 
91
104
  - [ ] Time parsing
92
- Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
93
- I haven't figured out how to properly implement with Polars.
105
+
106
+ Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
107
+ I haven't figured out how to properly implement that with Polars.
94
108
 
95
109
  ## Contributing
96
110
 
data/lib/gtfs_df/feed.rb CHANGED
@@ -36,7 +36,8 @@ module GtfsDf
36
36
  booking_rules
37
37
  ].freeze
38
38
 
39
- attr_reader(*GTFS_FILES, :graph)
39
+ attr_accessor(*GTFS_FILES)
40
+ attr_reader(:graph)
40
41
 
41
42
  # Initialize with a hash of DataFrames
42
43
  REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
@@ -71,8 +72,16 @@ module GtfsDf
71
72
  end
72
73
 
73
74
  # Filter the feed using a view hash
74
- # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
75
- def filter(view)
75
+ #
76
+ # @param view [Hash] The view used to filter the feed, with format { file => filters }.
77
+ # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
78
+ # @param maintain_trip_dependencies [Boolean] Whether trip dependencies should be preserved.
79
+ # By default, we treat trips as the atomic unit of GTFS. Therefore, if we filter to one stop
80
+ # referenced by TripA, we will preserve _all stops_ referenced by TripA. However, it is
81
+ # occasionally useful to prune bad data and _not_ maintain all trip dependencies.
82
+ # For example, if StopA contains invalid coordinates, we may wish to filter it out but keep
83
+ # the other stops for TripA. In this case, `maintain_trip_dependencies` should be set to false.
84
+ def filter(view, maintain_trip_dependencies = true)
76
85
  filtered = {}
77
86
 
78
87
  GTFS_FILES.each do |file|
@@ -82,22 +91,28 @@ module GtfsDf
82
91
  filtered[file] = df
83
92
  end
84
93
 
85
- # Trips are the atomic unit of GTFS, we will generate a new view
86
- # based on the set of trips that would be included for each individual filter
87
- # and cascade changes from this view in order to retain referential integrity
88
- trip_ids = nil
89
-
90
- view.each do |file, filters|
91
- new_filtered = filter!(file, filters, filtered.dup)
92
- trip_ids = if trip_ids.nil?
93
- new_filtered["trips"]["trip_id"]
94
- else
95
- trip_ids & new_filtered["trips"]["trip_id"]
94
+ if maintain_trip_dependencies
95
+ # Trips are the atomic unit of GTFS, we will generate a new view
96
+ # based on the set of trips that would be included for each individual filter
97
+ # and cascade changes from this view in order to retain referential integrity
98
+ trip_ids = nil
99
+
100
+ view.each do |file, filters|
101
+ new_filtered = filter!(file, filters, filtered.dup)
102
+ trip_ids = if trip_ids.nil?
103
+ new_filtered["trips"]["trip_id"]
104
+ else
105
+ trip_ids & new_filtered["trips"]["trip_id"]
106
+ end
96
107
  end
97
- end
98
108
 
99
- if trip_ids
100
- filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
109
+ if trip_ids
110
+ filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
111
+ end
112
+ else
113
+ view.each do |file, filters|
114
+ filtered = filter!(file, filters, filtered.dup)
115
+ end
101
116
  end
102
117
 
103
118
  # Remove files that are empty, but keep required files even if empty
@@ -135,19 +150,35 @@ module GtfsDf
135
150
  filtered
136
151
  end
137
152
 
153
+ # Traverses the graph to prune unreferenced entities from child dataframes
154
+ # based on parent relationships. See GtfsDf::Graph::STOP_NODES
138
155
  def prune!(root, filtered)
139
- graph.each_bfs_edge(root) do |parent_file, child_file|
140
- parent_df = filtered[parent_file]
156
+ graph.each_bfs_edge(root) do |parent_node_id, child_node_id|
157
+ parent_node = Graph::NODES[parent_node_id]
158
+ child_node = Graph::NODES[child_node_id]
159
+ parent_df = filtered[parent_node.fetch(:file)]
141
160
  next unless parent_df
142
161
 
143
- child_df = filtered[child_file]
162
+ child_df = filtered[child_node.fetch(:file)]
163
+ # Certain nodes are pre-filtered because they reference only
164
+ # a piece of the dataframe
165
+ filter_attrs = child_node[:filter_attrs]
166
+ if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
167
+ filter = filter_attrs.fetch(:filter)
168
+ # Temporarily remove rows that do not match node filter criteria to process them
169
+ # separately (e.g., when filtering stops, parent stations that should be preserved
170
+ # regardless of direct references)
171
+ saved_vals = child_df.filter(filter.is_not)
172
+ child_df = child_df.filter(filter)
173
+ end
144
174
  next unless child_df && child_df.height > 0
145
175
 
146
- attrs = graph.get_edge_data(parent_file, child_file)
176
+ attrs = graph.get_edge_data(parent_node_id, child_node_id)
147
177
 
148
178
  attrs[:dependencies].each do |dep|
149
- parent_col = dep[parent_file]
150
- child_col = dep[child_file]
179
+ parent_col = dep[parent_node_id]
180
+ child_col = dep[child_node_id]
181
+ allow_null = !!dep[:allow_null]
151
182
 
152
183
  next unless parent_col && child_col &&
153
184
  parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
@@ -157,10 +188,20 @@ module GtfsDf
157
188
 
158
189
  # Filter child to only include rows that reference valid parent values
159
190
  before = child_df.height
160
- child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
191
+ filter = Polars.col(child_col).is_in(valid_values)
192
+ if allow_null
193
+ filter = (filter | Polars.col(child_col).is_null)
194
+ end
195
+ child_df = child_df.filter(filter)
196
+ changed = child_df.height < before
197
+
198
+ # If we removed a part of the child_df earlier, concat it back on
199
+ if saved_vals
200
+ child_df = Polars.concat([child_df, saved_vals], how: "vertical")
201
+ end
161
202
 
162
- if child_df.height < before
163
- filtered[child_file] = child_df
203
+ if changed
204
+ filtered[child_node.fetch(:file)] = child_df
164
205
  end
165
206
  end
166
207
  end
data/lib/gtfs_df/graph.rb CHANGED
@@ -2,27 +2,64 @@
2
2
 
3
3
  module GtfsDf
4
4
  class Graph
5
+ FILES = %w[
6
+ agency routes trips stop_times calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
7
+ fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
8
+ stop_areas fare_leg_rules
9
+ ]
10
+
11
+ STANDARD_FILE_NODES = FILES.map do |file|
12
+ [file, {id: file, file: file, filter: nil}]
13
+ end.to_h.freeze
14
+
15
+ # Separate node definitions for stops and parent stations to handle the self-referential
16
+ # relationship in stops.txt where stops reference parent stations via parent_station column.
17
+ # This allows filtering to preserve parent stations when their child stops are referenced.
18
+ STOP_NODES = {
19
+ "stops" => {
20
+ id: "stops",
21
+ file: "stops",
22
+ filter_attrs: {
23
+ filter_col: "location_type",
24
+ filter: Polars.col("location_type").is_in(
25
+ Schema::EnumValues::STOP_LOCATION_TYPES.map(&:first)
26
+ ) | Polars.col("location_type").is_null
27
+ }
28
+ },
29
+ "parent_stations" => {
30
+ id: "parent_stations",
31
+ file: "stops",
32
+ filter_attrs: {
33
+ filter_col: "location_type",
34
+ filter: Polars.col("location_type").is_in(
35
+ Schema::EnumValues::STATION_LOCATION_TYPES.map(&:first)
36
+ ) & Polars.col("location_type").is_not_null
37
+ }
38
+ }
39
+ }.freeze
40
+
41
+ NODES = STANDARD_FILE_NODES.merge(STOP_NODES).freeze
42
+
5
43
  # Returns a directed graph of GTFS file dependencies
6
44
  def self.build
7
45
  g = NetworkX::Graph.new
8
- # Nodes: GTFS files
9
- files = %w[
10
- agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
11
- fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
12
- ]
13
- files.each { |f| g.add_node(f) }
46
+ NODES.keys.each { |node| g.add_node(node) }
14
47
 
15
48
  # TODO: Add fare_rules -> stops + test
16
49
  edges = [
17
50
  ["agency", "routes", {dependencies: [
18
51
  {"agency" => "agency_id", "routes" => "agency_id"}
19
52
  ]}],
53
+ ["fare_attributes", "agency", {dependencies: [
54
+ {"fare_attributes" => "agency_id",
55
+ "agency" => "agency_id"}
56
+ ]}],
20
57
  ["fare_attributes", "fare_rules", {dependencies: [
21
58
  {"fare_attributes" => "fare_id",
22
59
  "fare_rules" => "fare_id"}
23
60
  ]}],
24
61
  ["fare_rules", "routes", {dependencies: [
25
- {"fare_rules" => "route_id", "routes" => "route_id"}
62
+ {"fare_rules" => "route_id", "routes" => "route_id", :allow_null => true}
26
63
  ]}],
27
64
  ["routes", "trips", {dependencies: [
28
65
  {"routes" => "route_id", "trips" => "route_id"}
@@ -33,6 +70,10 @@ module GtfsDf
33
70
  ["stop_times", "stops", {dependencies: [
34
71
  {"stop_times" => "stop_id", "stops" => "stop_id"}
35
72
  ]}],
73
+ # Self-referential edge: stops can reference parent stations (location_type=1)
74
+ ["stops", "parent_stations", {dependencies: [
75
+ {"stops" => "parent_station", "parent_stations" => "stop_id"}
76
+ ]}],
36
77
  ["stops", "transfers", {dependencies: [
37
78
  {"stops" => "stop_id", "transfers" => "from_stop_id"},
38
79
  {"stops" => "stop_id", "transfers" => "to_stop_id"}
@@ -4,24 +4,21 @@ module GtfsDf
4
4
  class Reader
5
5
  # Loads a GTFS zip file and returns a Feed
6
6
  def self.load_from_zip(zip_path)
7
- data = {}
7
+ data = nil
8
+
8
9
  Dir.mktmpdir do |tmpdir|
9
10
  Zip::File.open(zip_path) do |zip_file|
10
11
  zip_file.each do |entry|
11
12
  next unless entry.file?
12
-
13
- GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
14
- next unless entry.name == "#{gtfs_file}.txt"
15
-
16
- out_path = File.join(tmpdir, entry.name)
17
- entry.extract(out_path)
18
-
19
- data[gtfs_file] = data_frame(gtfs_file, out_path)
20
- end
13
+ out_path = File.join(tmpdir, entry.name)
14
+ entry.extract(out_path)
21
15
  end
22
16
  end
17
+
18
+ data = load_from_dir(tmpdir)
23
19
  end
24
- GtfsDf::Feed.new(data)
20
+
21
+ data
25
22
  end
26
23
 
27
24
  # Loads a GTFS dir and returns a Feed
@@ -82,13 +82,16 @@ module GtfsDf
82
82
 
83
83
  # stops.txt
84
84
  # location_type: Type of location
85
- LOCATION_TYPE = [
85
+ STOP_LOCATION_TYPES = [
86
86
  ["0", "Stop or platform"],
87
- ["1", "Station"],
88
87
  ["2", "Entrance/Exit"],
89
88
  ["3", "Generic Node"],
90
89
  ["4", "Boarding Area"]
91
90
  ]
91
+ STATION_LOCATION_TYPES = [
92
+ ["1", "Station"]
93
+ ]
94
+ LOCATION_TYPE = STOP_LOCATION_TYPES + STATION_LOCATION_TYPES
92
95
 
93
96
  # wheelchair_boarding: Indicates wheelchair boarding possibility
94
97
  WHEELCHAIR_BOARDING = [
@@ -9,7 +9,7 @@ module GtfsDf
9
9
  "stop_name" => Polars::String,
10
10
  "tts_stop_name" => Polars::String,
11
11
  "stop_desc" => Polars::String,
12
- "stop_lat" => Polars::String,
12
+ "stop_lat" => Polars::Float64,
13
13
  "stop_lon" => Polars::Float64,
14
14
  "zone_id" => Polars::String,
15
15
  "stop_url" => Polars::String,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.2.0"
4
+ VERSION = "0.4.0"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 1980-01-01 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: networkx