gtfs_df 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b90534f9b41229026b91e4632c12dac2235ff615ac8138c301702e51bef3dfa
4
- data.tar.gz: 95ac49f7dcdecea5f08c2fe51f8f50b0466bcb75a1d8ae67720b464c2983ed88
3
+ metadata.gz: 128899724d4613f0170fa601cb0c4cdc9f88a51b2fcd6c06108e7cc9acf9201c
4
+ data.tar.gz: 14c30a678bc9a623233c6631be1c7d497fff6bae4b30abd88406c297bc76190b
5
5
  SHA512:
6
- metadata.gz: 2fc195cb47d81dd4799d99d8182f8c5ff7b56cdfdee08281ea8a129568b87f38d719956fd98532604dc26bf97dee344bbdd65943cc1e292248a2375b76290eda
7
- data.tar.gz: 0d4b2e22968e5ed37349c82ff84c9ba8a6f4d12c143a8709caa40998e16afe4a2178f820f68bb384dc4cf12de445a9f0d265d163d48071621e69f64501db76d5
6
+ metadata.gz: b7e3d1ac85953b995b82d0abbb9f872cf5307564ba67949320d607190ef5a7ff7f8e34df241cc584d7b36fc8db9c4159a7bf74ef1ce6d6868e4a3155784a9ec9
7
+ data.tar.gz: a1067ff3a0912b3eb56e3348e707c777563b50f440084a5b8c83b2d3da6ebf8a8a7ed49093a756d8871f1d3d849906daa844a0a66c04e56f97b03bfe2a106d79
data/CHANGELOG.md CHANGED
@@ -1,3 +1,20 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.3.0] - 2025-12-04
4
+
5
+ ### Added
6
+
7
+ - keep parent stations linked to used stops
8
+
9
+ ### Fixed
10
+
11
+ - handle null values
12
+ - update lock on version bump
13
+
14
+ ### Maintenance
15
+
16
+ - reuse load_from_dir logic in reader
17
+ - clean up unused method + better comments
1
18
  ## [0.1.0] - 2025-11-10
2
19
 
3
20
  - Initial release
data/README.md CHANGED
@@ -86,11 +86,25 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
86
86
 
87
87
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
88
88
 
89
+ ## Release process
90
+
91
+ 1. `bin/bump-version`
92
+
93
+ - Bump the version in `lib/gtfs_df/version.rb`
94
+ - Update the `CHANGELOG.md` using the git log since the last version
95
+ - Create and push a new release branch with those changes
96
+ - Create a PR for that release
97
+
98
+ 2. `bin/create-tag`
99
+
100
+ Creates and pushes the git tag for the release. Pushing the tag triggers the GitHub Actions workflow `.github/workflows/publish.yml`, which publishes the gem to RubyGems.
101
+
89
102
  ## TODO
90
103
 
91
104
  - [ ] Time parsing
92
- Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
93
- I haven't figured out how to properly implement with Polars.
105
+
106
+ Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
107
+ I haven't figured out how to properly implement that with Polars.
94
108
 
95
109
  ## Contributing
96
110
 
data/lib/gtfs_df/feed.rb CHANGED
@@ -135,19 +135,34 @@ module GtfsDf
135
135
  filtered
136
136
  end
137
137
 
138
+ # Traverses the graph to prune unreferenced entities from child dataframes
139
+ # based on parent relationships. See GtfsDf::Graph::STOP_NODES
138
140
  def prune!(root, filtered)
139
- graph.each_bfs_edge(root) do |parent_file, child_file|
140
- parent_df = filtered[parent_file]
141
+ graph.each_bfs_edge(root) do |parent_node_id, child_node_id|
142
+ parent_node = Graph::NODES[parent_node_id]
143
+ child_node = Graph::NODES[child_node_id]
144
+ parent_df = filtered[parent_node.fetch(:file)]
141
145
  next unless parent_df
142
146
 
143
- child_df = filtered[child_file]
147
+ child_df = filtered[child_node.fetch(:file)]
148
+ # Certain nodes are pre-filtered because they reference only
149
+ # a piece of the dataframe
150
+ filter_attrs = child_node[:filter_attrs]
151
+ if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
152
+ filter = filter_attrs.fetch(:filter)
153
+ # Temporarily remove rows that do not match node filter criteria to process them
154
+ # separately (e.g., when filtering stops, parent stations that should be preserved
155
+ # regardless of direct references)
156
+ saved_vals = child_df.filter(filter.is_not)
157
+ child_df = child_df.filter(filter)
158
+ end
144
159
  next unless child_df && child_df.height > 0
145
160
 
146
- attrs = graph.get_edge_data(parent_file, child_file)
161
+ attrs = graph.get_edge_data(parent_node_id, child_node_id)
147
162
 
148
163
  attrs[:dependencies].each do |dep|
149
- parent_col = dep[parent_file]
150
- child_col = dep[child_file]
164
+ parent_col = dep[parent_node_id]
165
+ child_col = dep[child_node_id]
151
166
 
152
167
  next unless parent_col && child_col &&
153
168
  parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
@@ -157,10 +172,18 @@ module GtfsDf
157
172
 
158
173
  # Filter child to only include rows that reference valid parent values
159
174
  before = child_df.height
160
- child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
175
+ child_df = child_df.filter(
176
+ Polars.col(child_col).is_in(valid_values)
177
+ )
178
+ changed = child_df.height < before
179
+
180
+ # If we removed a part of the child_df earlier, concat it back on
181
+ if saved_vals
182
+ child_df = Polars.concat([child_df, saved_vals], how: "vertical")
183
+ end
161
184
 
162
- if child_df.height < before
163
- filtered[child_file] = child_df
185
+ if changed
186
+ filtered[child_node.fetch(:file)] = child_df
164
187
  end
165
188
  end
166
189
  end
data/lib/gtfs_df/graph.rb CHANGED
@@ -2,15 +2,48 @@
2
2
 
3
3
  module GtfsDf
4
4
  class Graph
5
+ FILES = %w[
6
+ agency routes trips stop_times calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
7
+ fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
8
+ stop_areas fare_leg_rules
9
+ ]
10
+
11
+ STANDARD_FILE_NODES = FILES.map do |file|
12
+ [file, {id: file, file: file, filter: nil}]
13
+ end.to_h.freeze
14
+
15
+ # Separate node definitions for stops and parent stations to handle the self-referential
16
+ # relationship in stops.txt where stops reference parent stations via parent_station column.
17
+ # This allows filtering to preserve parent stations when their child stops are referenced.
18
+ STOP_NODES = {
19
+ "stops" => {
20
+ id: "stops",
21
+ file: "stops",
22
+ filter_attrs: {
23
+ filter_col: "location_type",
24
+ filter: Polars.col("location_type").is_in(
25
+ Schema::EnumValues::STOP_LOCATION_TYPES.map(&:first)
26
+ ) | Polars.col("location_type").is_null
27
+ }
28
+ },
29
+ "parent_stations" => {
30
+ id: "parent_stations",
31
+ file: "stops",
32
+ filter_attrs: {
33
+ filter_col: "location_type",
34
+ filter: Polars.col("location_type").is_in(
35
+ Schema::EnumValues::STATION_LOCATION_TYPES.map(&:first)
36
+ ) & Polars.col("location_type").is_not_null
37
+ }
38
+ }
39
+ }.freeze
40
+
41
+ NODES = STANDARD_FILE_NODES.merge(STOP_NODES).freeze
42
+
5
43
  # Returns a directed graph of GTFS file dependencies
6
44
  def self.build
7
45
  g = NetworkX::Graph.new
8
- # Nodes: GTFS files
9
- files = %w[
10
- agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
11
- fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
12
- ]
13
- files.each { |f| g.add_node(f) }
46
+ NODES.keys.each { |node| g.add_node(node) }
14
47
 
15
48
  # TODO: Add fare_rules -> stops + test
16
49
  edges = [
@@ -33,6 +66,10 @@ module GtfsDf
33
66
  ["stop_times", "stops", {dependencies: [
34
67
  {"stop_times" => "stop_id", "stops" => "stop_id"}
35
68
  ]}],
69
+ # Self-referential edge: stops can reference parent stations (location_type=1)
70
+ ["stops", "parent_stations", {dependencies: [
71
+ {"stops" => "parent_station", "parent_stations" => "stop_id"}
72
+ ]}],
36
73
  ["stops", "transfers", {dependencies: [
37
74
  {"stops" => "stop_id", "transfers" => "from_stop_id"},
38
75
  {"stops" => "stop_id", "transfers" => "to_stop_id"}
@@ -4,24 +4,21 @@ module GtfsDf
4
4
  class Reader
5
5
  # Loads a GTFS zip file and returns a Feed
6
6
  def self.load_from_zip(zip_path)
7
- data = {}
7
+ data = nil
8
+
8
9
  Dir.mktmpdir do |tmpdir|
9
10
  Zip::File.open(zip_path) do |zip_file|
10
11
  zip_file.each do |entry|
11
12
  next unless entry.file?
12
-
13
- GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
14
- next unless entry.name == "#{gtfs_file}.txt"
15
-
16
- out_path = File.join(tmpdir, entry.name)
17
- entry.extract(out_path)
18
-
19
- data[gtfs_file] = data_frame(gtfs_file, out_path)
20
- end
13
+ out_path = File.join(tmpdir, entry.name)
14
+ entry.extract(out_path)
21
15
  end
22
16
  end
17
+
18
+ data = load_from_dir(tmpdir)
23
19
  end
24
- GtfsDf::Feed.new(data)
20
+
21
+ data
25
22
  end
26
23
 
27
24
  # Loads a GTFS dir and returns a Feed
@@ -82,13 +82,16 @@ module GtfsDf
82
82
 
83
83
  # stops.txt
84
84
  # location_type: Type of location
85
- LOCATION_TYPE = [
85
+ STOP_LOCATION_TYPES = [
86
86
  ["0", "Stop or platform"],
87
- ["1", "Station"],
88
87
  ["2", "Entrance/Exit"],
89
88
  ["3", "Generic Node"],
90
89
  ["4", "Boarding Area"]
91
90
  ]
91
+ STATION_LOCATION_TYPES = [
92
+ ["1", "Station"]
93
+ ]
94
+ LOCATION_TYPE = STOP_LOCATION_TYPES + STATION_LOCATION_TYPES
92
95
 
93
96
  # wheelchair_boarding: Indicates wheelchair boarding possibility
94
97
  WHEELCHAIR_BOARDING = [
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 1980-01-01 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: networkx