gtfs_df 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.conform.yaml +1 -1
- data/README.md +5 -4
- data/examples/split-by-agency/Gemfile.lock +1 -1
- data/examples/split-by-agency/split_by_agency.rb +9 -2
- data/lib/gtfs_df/base_gtfs_table.rb +5 -1
- data/lib/gtfs_df/feed.rb +82 -65
- data/lib/gtfs_df/graph.rb +2 -5
- data/lib/gtfs_df/reader.rb +21 -4
- data/lib/gtfs_df/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5b90534f9b41229026b91e4632c12dac2235ff615ac8138c301702e51bef3dfa
+  data.tar.gz: 95ac49f7dcdecea5f08c2fe51f8f50b0466bcb75a1d8ae67720b464c2983ed88
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2fc195cb47d81dd4799d99d8182f8c5ff7b56cdfdee08281ea8a129568b87f38d719956fd98532604dc26bf97dee344bbdd65943cc1e292248a2375b76290eda
+  data.tar.gz: 0d4b2e22968e5ed37349c82ff84c9ba8a6f4d12c143a8709caa40998e16afe4a2178f820f68bb384dc4cf12de445a9f0d265d163d48071621e69f64501db76d5
data/.conform.yaml
CHANGED
data/README.md
CHANGED
@@ -8,18 +8,16 @@ This project was created to bring the power of [partridge] to ruby.
 
 ## Installation
 
-TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
-
 Install the gem and add to the application's Gemfile by executing:
 
 ```bash
-bundle add
+bundle add gtfs_df
 ```
 
 If bundler is not being used to manage dependencies, install the gem by executing:
 
 ```bash
-gem install
+gem install gtfs_df
 ```
 
 ## Usage
@@ -32,6 +30,9 @@ require 'gtfs_df'
 # Load from a zip file
 feed = GtfsDf::Reader.load_from_zip('path/to/gtfs.zip')
 
+# Or, load from a directory
+feed = GtfsDf::Reader.load_from_dir('path/to/gtfs_dir')
+
 # Access dataframes for each GTFS file
 puts feed.agency.head
 puts feed.routes.head
data/examples/split-by-agency/Gemfile.lock
CHANGED
data/examples/split-by-agency/split_by_agency.rb
CHANGED
@@ -40,12 +40,19 @@ end
 
 agency_ids.each do |agency_id|
   Whirly.start do
-    Whirly.status = "-> #{agency_id} filtering..."
     output_path = File.join(output_dir, "#{agency_id}.zip")
+
+    start_time = Time.now
+
+    Whirly.status = "-> #{agency_id} filtering..."
     filtered_feed = feed.filter("agency" => {"agency_id" => agency_id})
+
     Whirly.status = "-> #{agency_id} writing..."
     GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
-
+
+    elapsed = Time.now - start_time
+
+    Whirly.status = "-> #{agency_id}.zip (#{elapsed.round(2)}s)"
   end
 end
 
data/lib/gtfs_df/base_gtfs_table.rb
CHANGED
@@ -10,7 +10,11 @@ module GtfsDf
       if input.is_a?(Polars::DataFrame)
         input
       elsif input.is_a?(String)
-
+        # We need to account for extra columns due to: https://github.com/ankane/ruby-polars/issues/125
+        all_columns = Polars.scan_csv(input).columns
+        default_schema = all_columns.map { |c| [c, Polars::String] }.to_h
+        dtypes = default_schema.merge(self.class::SCHEMA)
+        Polars.read_csv(input, null_values: [""], dtypes:)
      elsif input.is_a?(Array)
        head, *body = input
        df_input = body.each_with_object({}) do |row, acc|
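The new String branch builds a complete dtypes map before calling read_csv: every column that scan_csv finds in the file is first defaulted to Polars::String, then the table's declared SCHEMA is merged on top, so columns not covered by SCHEMA (the "extra columns" the linked ruby-polars issue refers to) still receive an explicit dtype. A minimal standalone sketch of the same technique; the file path and the declared_schema hash below are made-up examples, not the gem's real SCHEMA constants:

```ruby
require "polars-df" # ruby-polars

# Hypothetical declared dtypes for a stops table; in the gem this would be
# the table class's SCHEMA constant.
declared_schema = {
  "stop_lat" => Polars::Float64,
  "stop_lon" => Polars::Float64
}

path = "stops.txt" # made-up path

# Default every column present in the CSV to String, then overlay the
# declared dtypes, so read_csv receives a dtype for every column it will see.
all_columns = Polars.scan_csv(path).columns
dtypes = all_columns.to_h { |c| [c, Polars::String] }.merge(declared_schema)

df = Polars.read_csv(path, null_values: [""], dtypes: dtypes)
puts df.schema
```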
data/lib/gtfs_df/feed.rb
CHANGED
@@ -36,7 +36,7 @@ module GtfsDf
       booking_rules
     ].freeze
 
-    attr_reader(*GTFS_FILES)
+    attr_reader(*GTFS_FILES, :graph)
 
     # Initialize with a hash of DataFrames
     REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
@@ -53,6 +53,8 @@ module GtfsDf
         end.join(", ")}"
       end
 
+      @graph = GtfsDf::Graph.build
+
       GTFS_FILES.each do |file|
         df = data[file]
         schema_class_name = file.split("_").map(&:capitalize).join
@@ -68,85 +70,100 @@ module GtfsDf
       end
     end
 
-    # Load from a directory of GTFS CSV files
-    def self.load_from_dir(dir)
-      data = {}
-      GTFS_FILES.each do |file|
-        path = File.join(dir, "#{file}.txt")
-        next unless File.exist?(path)
-
-        schema_class_name = file.split("_").map(&:capitalize).join
-
-        data[file] = GtfsDf::Schema.const_get(schema_class_name).new(path)
-      end
-      new(data)
-    end
-
     # Filter the feed using a view hash
     # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
     def filter(view)
       filtered = {}
-
-      # Step 1: Apply view filters
+
       GTFS_FILES.each do |file|
         df = send(file)
         next unless df
 
-        filters = view[file]
-        if filters && !filters.empty?
-          filters.each do |col, val|
-            df = if val.is_a?(Array)
-              df.filter(Polars.col(col).is_in(val))
-            elsif val.respond_to?(:call)
-              df.filter(val.call(Polars.col(col)))
-            else
-              df.filter(Polars.col(col).eq(val))
-            end
-          end
-        end
         filtered[file] = df
       end
-
-      #
-
-
-
-
-
-
-
-
-
-
-        next unless child_df && child_df.height > 0
-
-        attrs[:dependencies].each do |dep|
-          parent_col = dep[parent_file]
-          child_col = dep[child_file]
-
-          next unless parent_col && child_col &&
-            parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
-
-          # Get valid values from parent
-          valid_values = parent_df[parent_col].to_a.uniq.compact
-          next if valid_values.empty?
-
-          # Filter child to only include rows that reference valid parent values
-          before = child_df.height
-          child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
-
-          if child_df.height < before
-            filtered[child_file] = child_df
-            changed = true
-          end
-        end
-      end
+
+      # Trips are the atomic unit of GTFS, we will generate a new view
+      # based on the set of trips that would be included for each individual filter
+      # and cascade changes from this view in order to retain referential integrity
+      trip_ids = nil
+
+      view.each do |file, filters|
+        new_filtered = filter!(file, filters, filtered.dup)
+        trip_ids = if trip_ids.nil?
+          new_filtered["trips"]["trip_id"]
+        else
+          trip_ids & new_filtered["trips"]["trip_id"]
         end
       end
 
+      if trip_ids
+        filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
+      end
+
       # Remove files that are empty, but keep required files even if empty
-      filtered.delete_if
+      filtered.delete_if do |file, df|
+        is_required_file = REQUIRED_GTFS_FILES.include?(file) ||
+          file == "calendar" && !filtered["calendar_dates"] ||
+          file == "calendar_dates" && !filtered["calendar"]
+
+        (!df || df.height == 0) && !is_required_file
+      end
       self.class.new(filtered)
     end
+
+    private
+
+    def filter!(file, filters, filtered)
+      unless filters.empty?
+        df = filtered[file]
+
+        filters.each do |col, val|
+          df = if val.is_a?(Array)
+            df.filter(Polars.col(col).is_in(val))
+          elsif val.respond_to?(:call)
+            df.filter(val.call(Polars.col(col)))
+          else
+            df.filter(Polars.col(col).eq(val))
+          end
+        end
+
+        filtered[file] = df
+
+        prune!(file, filtered)
+      end
+
+      filtered
+    end
+
+    def prune!(root, filtered)
+      graph.each_bfs_edge(root) do |parent_file, child_file|
+        parent_df = filtered[parent_file]
+        next unless parent_df
+
+        child_df = filtered[child_file]
+        next unless child_df && child_df.height > 0
+
+        attrs = graph.get_edge_data(parent_file, child_file)
+
+        attrs[:dependencies].each do |dep|
+          parent_col = dep[parent_file]
+          child_col = dep[child_file]
+
+          next unless parent_col && child_col &&
+            parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
+
+          # Get valid values from parent
+          valid_values = parent_df[parent_col].to_a.uniq.compact
+
+          # Filter child to only include rows that reference valid parent values
+          before = child_df.height
+          child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
+
+          if child_df.height < before
+            filtered[child_file] = child_df
+          end
+        end
+      end
+    end
   end
 end
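The rewritten Feed#filter applies each entry of the view independently, intersects the resulting trip_id sets (trips being the atomic unit), re-filters from "trips", and lets prune! cascade the reduction along the dependency graph. A hedged usage sketch; the feed path and the ID values below are invented for illustration:

```ruby
require "gtfs_df"

feed = GtfsDf::Reader.load_from_zip("path/to/gtfs.zip") # made-up path

# Keep only trips that satisfy BOTH constraints: they belong to route "R1"
# AND run under service "WEEKDAY". Each view entry is filtered on its own,
# the resulting trip_id sets are intersected, and prune! then drops orphaned
# rows in stop_times, stops, shapes, etc. via the dependency graph.
filtered = feed.filter(
  "routes" => {"route_id" => "R1"},
  "trips" => {"service_id" => "WEEKDAY"}
)

puts filtered.trips.height
GtfsDf::Writer.write_to_zip(filtered, "r1_weekday.zip")
```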
data/lib/gtfs_df/graph.rb
CHANGED
@@ -4,7 +4,7 @@ module GtfsDf
   class Graph
     # Returns a directed graph of GTFS file dependencies
     def self.build
-      g = NetworkX::
+      g = NetworkX::Graph.new
       # Nodes: GTFS files
       files = %w[
         agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
@@ -12,7 +12,7 @@ module GtfsDf
       ]
       files.each { |f| g.add_node(f) }
 
-      #
+      # TODO: Add fare_rules -> stops + test
      edges = [
        ["agency", "routes", {dependencies: [
          {"agency" => "agency_id", "routes" => "agency_id"}
@@ -116,9 +116,6 @@ module GtfsDf
        ["booking_rules", "stop_times", {dependencies: [
          {"booking_rules" => "booking_rule_id", "stop_times" => "pickup_booking_rule_id"},
          {"booking_rules" => "booking_rule_id", "stop_times" => "drop_off_booking_rule_id"}
-        ]}],
-        ["stops", "booking_rules", {dependencies: [
-          {"stops" => "stop_id", "booking_rules" => "stop_id"}
        ]}]
      ]
 
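Each edge in the graph carries a :dependencies list that maps a parent file's column to the child column referencing it; Feed#prune! walks these edges breadth-first from the file that was filtered. A small hedged sketch of inspecting one edge, reusing the same calls feed.rb makes on the built graph:

```ruby
require "gtfs_df"

graph = GtfsDf::Graph.build

# The agency -> routes edge records that routes.agency_id must reference an
# agency_id that still exists in agency.txt after filtering.
edge = graph.get_edge_data("agency", "routes")
edge[:dependencies].each do |dep|
  puts "agency.#{dep["agency"]} is referenced by routes.#{dep["routes"]}"
end
```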
data/lib/gtfs_df/reader.rb
CHANGED
@@ -10,19 +10,36 @@ module GtfsDf
           zip_file.each do |entry|
             next unless entry.file?
 
-            GtfsDf::Feed::GTFS_FILES.each do |
-              next unless entry.name == "#{
+            GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
+              next unless entry.name == "#{gtfs_file}.txt"
 
               out_path = File.join(tmpdir, entry.name)
               entry.extract(out_path)
-              schema_class_name = file.split("_").map(&:capitalize).join
 
-              data[
+              data[gtfs_file] = data_frame(gtfs_file, out_path)
             end
           end
         end
       end
       GtfsDf::Feed.new(data)
     end
+
+    # Loads a GTFS dir and returns a Feed
+    def self.load_from_dir(dir_path)
+      data = {}
+      GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
+        path = File.join(dir_path, "#{gtfs_file}.txt")
+        next unless File.exist?(path)
+
+        data[gtfs_file] = data_frame(gtfs_file, path)
+      end
+
+      GtfsDf::Feed.new(data)
+    end
+
+    private_class_method def self.data_frame(gtfs_file, path)
+      schema_class_name = gtfs_file.split("_").map(&:capitalize).join
+      GtfsDf::Schema.const_get(schema_class_name).new(path).df
+    end
   end
 end
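Both loaders now funnel file reading through the private data_frame helper, so a feed built from an unzipped directory behaves the same as one built from the archive. A hedged sketch; the fixture paths are invented:

```ruby
require "gtfs_df"

# Hypothetical paths to the same feed in unzipped and zipped form.
from_dir = GtfsDf::Reader.load_from_dir("spec/fixtures/sample_feed")
from_zip = GtfsDf::Reader.load_from_zip("spec/fixtures/sample_feed.zip")

# Identical data should yield identical frames regardless of the loader used.
puts from_dir.stops.columns == from_zip.stops.columns
puts from_dir.trips.height == from_zip.trips.height
```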
data/lib/gtfs_df/version.rb
CHANGED