gtfs_df 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d799dcd02b51045c948123e3b6bc64c41b5505279a583d98350377e6d06b4205
-  data.tar.gz: 57ff89fd953842d68a5d4c7424a91b54484415a0b093bbe0d0b0411b025c6ef4
+  metadata.gz: 128899724d4613f0170fa601cb0c4cdc9f88a51b2fcd6c06108e7cc9acf9201c
+  data.tar.gz: 14c30a678bc9a623233c6631be1c7d497fff6bae4b30abd88406c297bc76190b
 SHA512:
-  metadata.gz: 13b027c13cfec0a3493662cb048dd20f1b44aac778eb0e2d124ad873ae3649b1622758550399d9e457065401bfa04dbbb71e5e36da467599f8aac9675b2431ad
-  data.tar.gz: 9d248e0fe06e69af6a5d86114e77e10583dcbf9956858411cc6ed2cc2d03ebf4ce2047d3e8bbc1eb5eb5530a2e544bf8cc21c4d0184fe331558008fce64a055a
+  metadata.gz: b7e3d1ac85953b995b82d0abbb9f872cf5307564ba67949320d607190ef5a7ff7f8e34df241cc584d7b36fc8db9c4159a7bf74ef1ce6d6868e4a3155784a9ec9
+  data.tar.gz: a1067ff3a0912b3eb56e3348e707c777563b50f440084a5b8c83b2d3da6ebf8a8a7ed49093a756d8871f1d3d849906daa844a0a66c04e56f97b03bfe2a106d79
data/.conform.yaml CHANGED
@@ -7,7 +7,7 @@ policies:
   case: lower
   invalidLastCharacters: .
   body:
-    required: true
+    required: false
   dco: false
   spellcheck:
     locale: US
data/CHANGELOG.md CHANGED
@@ -1,3 +1,20 @@
+## [Unreleased]
+
+## [0.3.0] - 2025-12-04
+
+### Added
+
+- keep parent stations linked to used stops
+
+### Fixed
+
+- handle null values
+- update lock on version bump
+
+### Maintenance
+
+- reuse load_from_dir logic in reader
+- clean up unused method + better comments
 ## [0.1.0] - 2025-11-10
 
 - Initial release
data/README.md CHANGED
@@ -8,18 +8,16 @@ This project was created to bring the power of [partridge] to ruby.
 
 ## Installation
 
-TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
-
 Install the gem and add to the application's Gemfile by executing:
 
 ```bash
-bundle add UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
+bundle add gtfs_df
 ```
 
 If bundler is not being used to manage dependencies, install the gem by executing:
 
 ```bash
-gem install UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
+gem install gtfs_df
 ```
 
 ## Usage
@@ -32,6 +30,9 @@ require 'gtfs_df'
 # Load from a zip file
 feed = GtfsDf::Reader.load_from_zip('path/to/gtfs.zip')
 
+# Or, load from a directory
+feed = GtfsDf::Reader.load_from_dir('path/to/gtfs_dir')
+
 # Access dataframes for each GTFS file
 puts feed.agency.head
 puts feed.routes.head
@@ -85,11 +86,25 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
 
+## Release process
+
+1. `bin/bump-version`
+
+   - Bump the version in `lib/gtfs_df/version.rb`
+   - Update the `CHANGELOG.md` using the git log since the last version
+   - Create and push a new release branch with those changes
+   - Create a PR for that release
+
+2. `bin/create-tag`
+
+   Creates and pushes the git tag for the release. That triggers the GitHub Action `.github/workflows/publish.yml` to publish to RubyGems.
+
 ## TODO
 
 - [ ] Time parsing
-  Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
-  I haven't figured out how to properly implement with Polars.
+
+  Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
+  I haven't figured out how to properly implement that with Polars.
 
 ## Contributing
 
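`GtfsDf::Feed#filter` (shown later in the `feed.rb` hunk) takes a view hash mapping file names to column filters, where each filter value is an array (membership), a callable (predicate), or a scalar (equality). A minimal plain-Ruby sketch of that three-way dispatch, using hypothetical hash rows instead of Polars dataframes:

```ruby
# Hypothetical in-memory rows standing in for a Polars DataFrame
rows = [
  {"route_id" => "1", "agency_id" => "A"},
  {"route_id" => "2", "agency_id" => "B"},
  {"route_id" => "3", "agency_id" => "A"}
]

# Mirrors the value dispatch used by Feed#filter:
# Array => membership, callable => predicate, anything else => equality
def apply_filter(rows, col, val)
  rows.select do |row|
    if val.is_a?(Array)
      val.include?(row[col])
    elsif val.respond_to?(:call)
      val.call(row[col])
    else
      row[col] == val
    end
  end
end

by_equality = apply_filter(rows, "agency_id", "A")
by_membership = apply_filter(rows, "route_id", %w[1 2])
by_predicate = apply_filter(rows, "route_id", ->(v) { v > "1" })
```

In the gem itself, each branch wraps the value in a `Polars.col(col)` expression; the control flow is the same.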
@@ -1,7 +1,7 @@
 PATH
   remote: ../..
   specs:
-    gtfs_df (0.1.0)
+    gtfs_df (0.1.1)
       networkx (~> 0.4)
       polars-df (~> 0.22)
       rubyzip (~> 2.3)
@@ -40,12 +40,19 @@ end
 
 agency_ids.each do |agency_id|
   Whirly.start do
-    Whirly.status = "-> #{agency_id} filtering..."
     output_path = File.join(output_dir, "#{agency_id}.zip")
+
+    start_time = Time.now
+
+    Whirly.status = "-> #{agency_id} filtering..."
     filtered_feed = feed.filter("agency" => {"agency_id" => agency_id})
+
     Whirly.status = "-> #{agency_id} writing..."
     GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
-    Whirly.status = "-> #{agency_id}"
+
+    elapsed = Time.now - start_time
+
+    Whirly.status = "-> #{agency_id}.zip (#{elapsed.round(2)}s)"
   end
 end
 
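The script change above adds per-agency wall-clock timing to the status line. The timing pattern, stripped of Whirly and GTFS specifics (method name and label are illustrative):

```ruby
# Generic sketch: time a block and format an elapsed-seconds status string,
# as the script does with Time.now and elapsed.round(2)
def timed_status(label)
  start_time = Time.now
  yield
  elapsed = Time.now - start_time
  "-> #{label} (#{elapsed.round(2)}s)"
end

status = timed_status("SFMTA.zip") { sleep 0.01 }
```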
@@ -10,7 +10,11 @@ module GtfsDf
       if input.is_a?(Polars::DataFrame)
         input
       elsif input.is_a?(String)
-        Polars.read_csv(input, dtypes: self.class::SCHEMA)
+        # We need to account for extra columns due to: https://github.com/ankane/ruby-polars/issues/125
+        all_columns = Polars.scan_csv(input).columns
+        default_schema = all_columns.map { |c| [c, Polars::String] }.to_h
+        dtypes = default_schema.merge(self.class::SCHEMA)
+        Polars.read_csv(input, null_values: [""], dtypes:)
       elsif input.is_a?(Array)
         head, *body = input
         df_input = body.each_with_object({}) do |row, acc|
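The hunk above scans the CSV header, defaults every discovered column to String, and overlays the known schema, so feeds with unexpected extra columns no longer break `read_csv`. The hash logic in isolation, with hypothetical column names and dtype symbols standing in for Polars types:

```ruby
# Hypothetical columns scanned from a CSV header; "vendor_note" is an
# extra column absent from the known schema
all_columns = %w[stop_id stop_name vendor_note]

# Stand-in for self.class::SCHEMA
schema = {"stop_id" => :str, "stop_name" => :str}

# Default everything to a string dtype, then overlay known dtypes
default_schema = all_columns.map { |c| [c, :str] }.to_h
dtypes = default_schema.merge(schema)
```

Because `merge` keeps every key from the scanned header, the extra column gets a dtype instead of causing a schema mismatch.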
data/lib/gtfs_df/feed.rb CHANGED
@@ -36,7 +36,7 @@ module GtfsDf
       booking_rules
     ].freeze
 
-    attr_reader(*GTFS_FILES)
+    attr_reader(*GTFS_FILES, :graph)
 
     # Initialize with a hash of DataFrames
     REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
@@ -53,6 +53,8 @@ module GtfsDf
         end.join(", ")}"
       end
 
+      @graph = GtfsDf::Graph.build
+
       GTFS_FILES.each do |file|
         df = data[file]
         schema_class_name = file.split("_").map(&:capitalize).join
@@ -68,85 +70,123 @@ module GtfsDf
       end
     end
 
-    # Load from a directory of GTFS CSV files
-    def self.load_from_dir(dir)
-      data = {}
-      GTFS_FILES.each do |file|
-        path = File.join(dir, "#{file}.txt")
-        next unless File.exist?(path)
-
-        schema_class_name = file.split("_").map(&:capitalize).join
-
-        data[file] = GtfsDf::Schema.const_get(schema_class_name).new(path)
-      end
-      new(data)
-    end
-
     # Filter the feed using a view hash
     # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
     def filter(view)
       filtered = {}
-      graph = GtfsDf::Graph.build
-      # Step 1: Apply view filters
+
       GTFS_FILES.each do |file|
         df = send(file)
         next unless df
 
-        filters = view[file]
-        if filters && !filters.empty?
-          filters.each do |col, val|
-            df = if val.is_a?(Array)
-              df.filter(Polars.col(col).is_in(val))
-            elsif val.respond_to?(:call)
-              df.filter(val.call(Polars.col(col)))
-            else
-              df.filter(Polars.col(col).eq(val))
-            end
-          end
-        end
         filtered[file] = df
       end
-      # Step 2: Cascade filters following the directed edges
-      # An edge from parent->child means: filter child based on valid parent IDs
-      changed = true
-      while changed
-        changed = false
-        GTFS_FILES.each do |parent_file|
-          parent_df = filtered[parent_file]
-          next unless parent_df && parent_df.height > 0
-
-          # For each outgoing edge from parent_file to child_file
-          graph.adj[parent_file]&.each do |child_file, attrs|
-            child_df = filtered[child_file]
-            next unless child_df && child_df.height > 0
-
-            attrs[:dependencies].each do |dep|
-              parent_col = dep[parent_file]
-              child_col = dep[child_file]
-
-              next unless parent_col && child_col &&
-                parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
-
-              # Get valid values from parent
-              valid_values = parent_df[parent_col].to_a.uniq.compact
-              next if valid_values.empty?
-
-              # Filter child to only include rows that reference valid parent values
-              before = child_df.height
-              child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
-
-              if child_df.height < before
-                filtered[child_file] = child_df
-                changed = true
-              end
-            end
-          end
+
+      # Trips are the atomic unit of GTFS; we generate a new view based on the
+      # set of trips that would be included by each individual filter, then
+      # cascade changes from that view in order to retain referential integrity
+      trip_ids = nil
+
+      view.each do |file, filters|
+        new_filtered = filter!(file, filters, filtered.dup)
+        trip_ids = if trip_ids.nil?
+          new_filtered["trips"]["trip_id"]
+        else
+          trip_ids & new_filtered["trips"]["trip_id"]
         end
       end
 
+      if trip_ids
+        filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
+      end
+
       # Remove files that are empty, but keep required files even if empty
-      filtered.delete_if { |file, df| (!df || df.height == 0) && !REQUIRED_GTFS_FILES.include?(file) }
+      filtered.delete_if do |file, df|
+        is_required_file = REQUIRED_GTFS_FILES.include?(file) ||
+          file == "calendar" && !filtered["calendar_dates"] ||
+          file == "calendar_dates" && !filtered["calendar"]
+
+        (!df || df.height == 0) && !is_required_file
+      end
       self.class.new(filtered)
     end
+
+    private
+
+    def filter!(file, filters, filtered)
+      unless filters.empty?
+        df = filtered[file]
+
+        filters.each do |col, val|
+          df = if val.is_a?(Array)
+            df.filter(Polars.col(col).is_in(val))
+          elsif val.respond_to?(:call)
+            df.filter(val.call(Polars.col(col)))
+          else
+            df.filter(Polars.col(col).eq(val))
+          end
+        end
+
+        filtered[file] = df
+
+        prune!(file, filtered)
+      end
+
+      filtered
+    end
+
+    # Traverses the graph to prune unreferenced entities from child dataframes
+    # based on parent relationships. See GtfsDf::Graph::STOP_NODES
+    def prune!(root, filtered)
+      graph.each_bfs_edge(root) do |parent_node_id, child_node_id|
+        parent_node = Graph::NODES[parent_node_id]
+        child_node = Graph::NODES[child_node_id]
+        parent_df = filtered[parent_node.fetch(:file)]
+        next unless parent_df
+
+        child_df = filtered[child_node.fetch(:file)]
+        # Certain nodes are pre-filtered because they reference only
+        # a piece of the dataframe
+        filter_attrs = child_node[:filter_attrs]
+        if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
+          filter = filter_attrs.fetch(:filter)
+          # Temporarily remove rows that do not match node filter criteria to process them
+          # separately (e.g., when filtering stops, parent stations that should be preserved
+          # regardless of direct references)
+          saved_vals = child_df.filter(filter.is_not)
+          child_df = child_df.filter(filter)
+        end
+        next unless child_df && child_df.height > 0
+
+        attrs = graph.get_edge_data(parent_node_id, child_node_id)
+
+        attrs[:dependencies].each do |dep|
+          parent_col = dep[parent_node_id]
+          child_col = dep[child_node_id]
+
+          next unless parent_col && child_col &&
+            parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
+
+          # Get valid values from the parent
+          valid_values = parent_df[parent_col].to_a.uniq.compact
+
+          # Filter the child to only include rows that reference valid parent values
+          before = child_df.height
+          child_df = child_df.filter(
+            Polars.col(child_col).is_in(valid_values)
+          )
+          changed = child_df.height < before
+
+          # If we removed a part of the child_df earlier, concat it back on
+          if saved_vals
+            child_df = Polars.concat([child_df, saved_vals], how: "vertical")
+          end
+
+          if changed
+            filtered[child_node.fetch(:file)] = child_df
+          end
+        end
+      end
+    end
   end
 end
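The rewritten `filter` applies each view entry independently against a copy of the feed and intersects the resulting `trip_id` sets, so the final feed keeps only trips that survive every filter. The intersection logic with plain arrays (trip IDs are illustrative):

```ruby
# trip_id sets produced by applying each view filter independently (hypothetical)
per_filter_trip_ids = [
  %w[t1 t2 t3],  # e.g. trips surviving a routes filter
  %w[t2 t3 t4]   # e.g. trips surviving a calendar filter
]

# Seed with the first set, then intersect with each subsequent one,
# mirroring the trip_ids accumulation in Feed#filter
trip_ids = nil
per_filter_trip_ids.each do |ids|
  trip_ids = trip_ids.nil? ? ids : trip_ids & ids
end
```

The gem then re-filters the whole feed once by that final `trip_id` set, which is what lets the pruning cascade run from a single, consistent root.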
data/lib/gtfs_df/graph.rb CHANGED
@@ -2,17 +2,50 @@
 
 module GtfsDf
   class Graph
+    FILES = %w[
+      agency routes trips stop_times calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
+      fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
+      stop_areas fare_leg_rules
+    ]
+
+    STANDARD_FILE_NODES = FILES.map do |file|
+      [file, {id: file, file: file, filter: nil}]
+    end.to_h.freeze
+
+    # Separate node definitions for stops and parent stations to handle the self-referential
+    # relationship in stops.txt where stops reference parent stations via the parent_station column.
+    # This allows filtering to preserve parent stations when their child stops are referenced.
+    STOP_NODES = {
+      "stops" => {
+        id: "stops",
+        file: "stops",
+        filter_attrs: {
+          filter_col: "location_type",
+          filter: Polars.col("location_type").is_in(
+            Schema::EnumValues::STOP_LOCATION_TYPES.map(&:first)
+          ) | Polars.col("location_type").is_null
+        }
+      },
+      "parent_stations" => {
+        id: "parent_stations",
+        file: "stops",
+        filter_attrs: {
+          filter_col: "location_type",
+          filter: Polars.col("location_type").is_in(
+            Schema::EnumValues::STATION_LOCATION_TYPES.map(&:first)
+          ) & Polars.col("location_type").is_not_null
+        }
+      }
+    }.freeze
+
+    NODES = STANDARD_FILE_NODES.merge(STOP_NODES).freeze
+
     # Returns a directed graph of GTFS file dependencies
     def self.build
-      g = NetworkX::DiGraph.new
-      # Nodes: GTFS files
-      files = %w[
-        agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
-        fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
-      ]
-      files.each { |f| g.add_node(f) }
+      g = NetworkX::Graph.new
+      NODES.keys.each { |node| g.add_node(node) }
 
-      # Edges: dependencies
+      # TODO: Add fare_rules -> stops + test
       edges = [
         ["agency", "routes", {dependencies: [
           {"agency" => "agency_id", "routes" => "agency_id"}
@@ -33,6 +66,10 @@ module GtfsDf
         ["stop_times", "stops", {dependencies: [
           {"stop_times" => "stop_id", "stops" => "stop_id"}
         ]}],
+        # Self-referential edge: stops can reference parent stations (location_type=1)
+        ["stops", "parent_stations", {dependencies: [
+          {"stops" => "parent_station", "parent_stations" => "stop_id"}
+        ]}],
         ["stops", "transfers", {dependencies: [
           {"stops" => "stop_id", "transfers" => "from_stop_id"},
           {"stops" => "stop_id", "transfers" => "to_stop_id"}
@@ -116,9 +153,6 @@ module GtfsDf
         ["booking_rules", "stop_times", {dependencies: [
           {"booking_rules" => "booking_rule_id", "stop_times" => "pickup_booking_rule_id"},
           {"booking_rules" => "booking_rule_id", "stop_times" => "drop_off_booking_rule_id"}
-        ]}],
-        ["stops", "booking_rules", {dependencies: [
-          {"stops" => "stop_id", "booking_rules" => "stop_id"}
         ]}]
       ]
 
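`Feed#prune!` walks this dependency graph via `graph.each_bfs_edge(root)`, visiting edges breadth-first from the filtered file so each parent's surviving IDs constrain its children before grandchildren are touched. A minimal sketch of such a traversal over a directed adjacency hash (node names illustrative, and this is not the networkx gem's API):

```ruby
require "set"

# Tiny adjacency map standing in for the dependency graph
ADJ = {
  "trips" => %w[stop_times],
  "stop_times" => %w[stops],
  "stops" => %w[parent_stations]
}.freeze

# Yield each (parent, child) edge in breadth-first order from root
def each_bfs_edge(adj, root)
  visited = Set[root]
  queue = [root]
  until queue.empty?
    parent = queue.shift
    (adj[parent] || []).each do |child|
      yield parent, child
      next if visited.include?(child)
      visited << child
      queue << child
    end
  end
end

edges = []
each_bfs_edge(ADJ, "trips") { |p, c| edges << [p, c] }
```

The BFS order is what makes a single pass sufficient here, replacing the old `while changed` fixed-point loop.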
@@ -4,25 +4,39 @@ module GtfsDf
   class Reader
     # Loads a GTFS zip file and returns a Feed
     def self.load_from_zip(zip_path)
-      data = {}
+      data = nil
+
       Dir.mktmpdir do |tmpdir|
         Zip::File.open(zip_path) do |zip_file|
           zip_file.each do |entry|
             next unless entry.file?
+            out_path = File.join(tmpdir, entry.name)
+            entry.extract(out_path)
+          end
+        end
 
-            GtfsDf::Feed::GTFS_FILES.each do |file|
-              next unless entry.name == "#{file}.txt"
+        data = load_from_dir(tmpdir)
+      end
 
-              out_path = File.join(tmpdir, entry.name)
-              entry.extract(out_path)
-              schema_class_name = file.split("_").map(&:capitalize).join
+      data
+    end
 
-              data[file] = GtfsDf::Schema.const_get(schema_class_name).new(out_path).df
-            end
-          end
-        end
+    # Loads a GTFS dir and returns a Feed
+    def self.load_from_dir(dir_path)
+      data = {}
+      GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
+        path = File.join(dir_path, "#{gtfs_file}.txt")
+        next unless File.exist?(path)
+
+        data[gtfs_file] = data_frame(gtfs_file, path)
       end
+
       GtfsDf::Feed.new(data)
     end
+
+    private_class_method def self.data_frame(gtfs_file, path)
+      schema_class_name = gtfs_file.split("_").map(&:capitalize).join
+      GtfsDf::Schema.const_get(schema_class_name).new(path).df
+    end
   end
 end
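The extracted `data_frame` helper derives the schema class constant from a GTFS file name by camel-casing its underscore-separated parts. That mapping in isolation:

```ruby
# "stop_times" -> "StopTimes", which the reader then resolves via
# GtfsDf::Schema.const_get
def schema_class_name(gtfs_file)
  gtfs_file.split("_").map(&:capitalize).join
end
```

This is the same derivation `Feed#initialize` uses, so sharing it through `load_from_dir` keeps zip and directory loading consistent.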
@@ -82,13 +82,16 @@ module GtfsDf
 
       # stops.txt
       # location_type: Type of location
-      LOCATION_TYPE = [
+      STOP_LOCATION_TYPES = [
         ["0", "Stop or platform"],
-        ["1", "Station"],
         ["2", "Entrance/Exit"],
         ["3", "Generic Node"],
         ["4", "Boarding Area"]
       ]
+      STATION_LOCATION_TYPES = [
+        ["1", "Station"]
+      ]
+      LOCATION_TYPE = STOP_LOCATION_TYPES + STATION_LOCATION_TYPES
 
       # wheelchair_boarding: Indicates wheelchair boarding possibility
       WHEELCHAIR_BOARDING = [
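Splitting `LOCATION_TYPE` lets the graph treat stations (`location_type=1`) as a node distinct from platform-level stops. A plain-Ruby sketch of partitioning stops.txt rows the way the `stops`/`parent_stations` node filters do, where a nil `location_type` counts as a stop (rows are hypothetical):

```ruby
STOP_LOCATION_TYPES = [
  ["0", "Stop or platform"],
  ["2", "Entrance/Exit"],
  ["3", "Generic Node"],
  ["4", "Boarding Area"]
]
STATION_LOCATION_TYPES = [["1", "Station"]]

stop_codes = STOP_LOCATION_TYPES.map(&:first)

# Hypothetical stops.txt rows
rows = [
  {"stop_id" => "p1", "location_type" => "0"},
  {"stop_id" => "st1", "location_type" => "1"},
  {"stop_id" => "p2", "location_type" => nil}
]

# Stops: code in STOP_LOCATION_TYPES or nil; stations: everything else
stops, stations = rows.partition do |r|
  r["location_type"].nil? || stop_codes.include?(r["location_type"])
end
```

In the gem the same split is expressed as Polars expressions (`is_in` combined with `is_null`) in `Graph::STOP_NODES`.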
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module GtfsDf
-  VERSION = "0.1.1"
+  VERSION = "0.3.0"
 end
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: gtfs_df
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.3.0
 platform: ruby
 authors:
 - David Mejorado
 bindir: exe
 cert_chain: []
-date: 1980-01-01 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: networkx