RubyGems - gtfs_df - Versions diffs - 0.9.3 → 0.10.1 - Mend

gtfs_df 0.9.3 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +26 -0
data/lib/gtfs_df/base_gtfs_table.rb +7 -0
data/lib/gtfs_df/feed.rb +165 -0
data/lib/gtfs_df/reader.rb +15 -1
data/lib/gtfs_df/utils.rb +3 -57
data/lib/gtfs_df/version.rb +1 -1
metadata +1 -6
data/examples/split-by-agency/.gitignore +0 -1
data/examples/split-by-agency/Gemfile +0 -5
data/examples/split-by-agency/Gemfile.lock +0 -54
data/examples/split-by-agency/README.md +0 -26
data/examples/split-by-agency/split_by_agency.rb +0 -63

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9c0746b1937afcfb7000425a59976b3eba8662778437ddf3adc02ec7729c3a49
-  data.tar.gz: e3a553aef868b4c29e06731f0a5e3984efc067a096001e90150743a80f3d45bc
+  metadata.gz: 5200ee072c53d95cdb96af3fc8e0f15f9a15e2cc22684b5c9b2b2eeb4c103c8a
+  data.tar.gz: 843492a5f86c595d48431edcf2decd800debc1ead5dd2df0e77d89bf920c01ce
 SHA512:
-  metadata.gz: 0fccae16bb46da6db651da04ea2591fa17e991219d4d8eb9be3a9444ed0b1b190b315f9223b738f399f3b6fade41c2c2e1081d1aad921a39483426d5b86b1aca
-  data.tar.gz: b98eccb0861c46d8510eaaea6a97c48d6b1fa3687d1c35d445edbf19c349a6c03d45c3a2654da20ab577aa083003eff5335a8fcad6ee0d9f6f7f86a78c2ed33b
+  metadata.gz: be450515810d21760afbff3800887e3e5605052bd52eaf7df81951d6193c141dc9487d6b566b9caae5bd44a990051f54907ded2fa619406018afb32df8f3209a
+  data.tar.gz: b6efebb9c5190777a3698a2b747b8528a7a72598c96254fbc7c9ead11aceaa72de75d2a95fb5a4c60745a0aacb0234108e4e5c663650bd5e0d71b858881325ca

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,28 @@
+## [0.10.1] - 2026-03-18
+### 🐛 Bug Fixes
+- Sanitize feed input
+### ⚙️ Miscellaneous Tasks
+- Normalize platforms
+- Ignore the examples folder on publishing
+## [0.10.0] - 2026-03-06
+### 🚀 Features
+- Date parsing utility
+- Calendar-based utilities
+- Consider frequencies when counting trips
+### 🐛 Bug Fixes
+- Service dates and busiest week fixes
+### ⚙️ Miscellaneous Tasks
+- Bump version to 0.10.0
 ## [0.9.3] - 2026-02-27
 ### 🐛 Bug Fixes
@@ -18,6 +43,7 @@
 - Bump version to 0.9.2
 - Avoid converting series into arrays
 - Simplify trip pool reduction
+- Bump version to 0.9.3
 ## [0.9.1] - 2026-02-17
 ### 🐛 Bug Fixes

data/lib/gtfs_df/base_gtfs_table.rb CHANGED Viewed

@@ -55,5 +55,12 @@ module GtfsDf
     # Returns the value held in @df.
     # NOTE(review): presumably the table's Polars::DataFrame; the assignment
     # to @df is outside this hunk, so confirm against the initializer.
     def dataframe
       @df
     end
+    def self.empty_dataframe
+      Polars::DataFrame.new(
+        const_get(:REQUIRED_FIELDS).map { |field| [field, []] }.to_h,
+        schema_overrides: const_get(:SCHEMA)
+      )
+    end
   end
 end

data/lib/gtfs_df/feed.rb CHANGED Viewed

@@ -174,6 +174,171 @@ module GtfsDf
       send("#{file_name}=", value)
     end
+    # Returns a DataFrame of all service_id/date pairs active in the feed, built from calendar and calendar_dates.
+    # Columns: [date, service_id]
+    #
+    # @return [Polars::DataFrame, nil] nil when the feed has neither a calendar nor a calendar_dates table
+    def service_dates
+      start_date_col = Polars.col("start_date")
+      end_date_col = Polars.col("end_date")
+      date_col = Polars.col("date")
+      calendar_df = @calendar&.with_columns(
+        GtfsDf::Utils.parse_date(start_date_col),
+        GtfsDf::Utils.parse_date(end_date_col)
+      )
+      calendar_dates_df = @calendar_dates&.with_columns(
+        GtfsDf::Utils.parse_date(date_col)
+      )
+      # Expand each calendar row into one row per day from start_date to end_date (1-day steps)
+      services_by_date = nil
+      if calendar_df
+        expanded = calendar_df.with_columns(
+          Polars.date_ranges(start_date_col, end_date_col, "1d").alias("date")
+        ).explode("date")
+        dow_col_names = [
+          "monday",
+          "tuesday",
+          "wednesday",
+          "thursday",
+          "friday",
+          "saturday",
+          "sunday"
+        ]
+        # Each day-of-week column in the calendar table defines whether that day has service:
+        # 1 - Service is available for all Mondays in the date range.
+        # 0 - Service is not available for Mondays in the date range.
+        # https://gtfs.org/documentation/schedule/reference/#calendartxt
+        #
+        # This filter is applied to the expanded calendar dates, where the
+        # ranges have become rows of individual dates; we need to ensure that each
+        # individual date matches the day of the week (DOW) before we check if
+        # that DOW column is enabled.
+        filter_expr = dow_col_names.each_with_index.reduce(Polars.lit(false)) do |expr, (dow_col_name, idx)|
+          # Polars weekday: Monday=1, Sunday=7, hence idx + 1 for the position in dow_col_names
+          expr | ((Polars.col("date").dt.weekday == (idx + 1)) & (Polars.col(dow_col_name) == "1"))
+        end
+        services_by_date = expanded.filter(filter_expr).select("date", "service_id")
+      end
+      # Apply calendar_dates exceptions (exception_type "1" rows are additions, "2" rows are subtractions)
+      if calendar_dates_df
+        exception_type_col = Polars.col("exception_type")
+        additions = calendar_dates_df
+          .filter(exception_type_col == "1")
+          .select("date", "service_id")
+        subtractions = calendar_dates_df
+          .filter(exception_type_col == "2")
+          .select("date", "service_id")
+        services_by_date = if services_by_date
+          # If we found service dates from the calendar table, we first
+          # add the additions, then remove the subtractions coming from calendar_dates
+          services_by_date
+            .vstack(additions).unique
+            .join(subtractions, on: ["service_id", "date"], how: "anti")
+        else
+          # Otherwise there is nothing to subtract from, so the additions become services_by_date
+          additions.unique
+        end
+      end
+      services_by_date
+    end
+    # Returns a DataFrame with the total number of trips per service date.
+    # Columns: [date, count]
+    #
+    # @return [Polars::DataFrame, nil] nil when service_dates is nil or has no rows
+    def trip_count_dates
+      cached_service_dates = service_dates
+      return nil if cached_service_dates.nil? || cached_service_dates.height == 0
+      # This expression builds on the dataframe returned by the frequency-based
+      # trip counts, defaulting to 1 for the trips that don't have an entry in
+      # the frequencies table. We're defining the expression here just to
+      # remove some noise from the join below.
+      trip_size = Polars.coalesce("freq_count", Polars.lit(1)).alias("trip_size")
+      # Count trips per service_id, considering the possible size they may have
+      # from the frequencies table.
+      trip_counts = @trips
+        .join(frequency_based_trip_counts, on: "trip_id", how: "left")
+        .group_by("service_id")
+        .agg(trip_size.sum.alias("trip_count"))
+      # Join to service dates to get trips per (date, service_id); services without trips count as 0
+      daily_trips = cached_service_dates
+        .join(trip_counts, on: "service_id", how: "left")
+        .with_columns(Polars.col("trip_count").fill_null(0))
+      # Sum trips per date across all services
+      daily_trips.group_by("date").agg(Polars.col("trip_count").sum.alias("count"))
+    end
+    # Returns a DataFrame with the number of trips each trip_id produces according to the frequencies table
+    # Columns: [trip_id, freq_count]; has no rows when the feed has no frequencies table
+    #
+    # @return [Polars::DataFrame]
+    def frequency_based_trip_counts
+      # If the feed was initialized with the parse_times flag, we already have
+      # seconds since midnight in these columns; otherwise we need to convert
+      # them first, so we can get the duration in seconds.
+      end_time_seconds_col, start_time_seconds_col = if @parse_times
+        [Polars.col("end_time"), Polars.col("start_time")]
+      else
+        [
+          GtfsDf::Utils.as_seconds_since_midnight("end_time"),
+          GtfsDf::Utils.as_seconds_since_midnight("start_time")
+        ]
+      end
+      duration_seconds = (end_time_seconds_col - start_time_seconds_col).alias("duration_seconds")
+      count = (duration_seconds / Polars.col("headway_secs")).floor.sum.alias("freq_count")
+      # The frequencies table is optional; we default to an empty dataframe to
+      # remove friction in the join with trips.
+      if @frequencies
+        @frequencies.group_by("trip_id").agg(count).select("trip_id", "freq_count")
+      else
+        Polars::DataFrame.new(
+          {"trip_id" => [], "freq_count" => []},
+          schema: {"trip_id" => Polars::String, "freq_count" => Polars::Float64}
+        )
+      end
+    end
+    # Identifies the start date of the busiest week in the feed by total trip count.
+    #
+    # @return [Date, nil] The Monday of the busiest week, or nil when no trip counts are available
+    def busiest_week
+      daily_total = trip_count_dates
+      return nil if daily_total.nil? || daily_total.height == 0
+      # Group by week by truncating each date to "1w" (ISO week, starting Monday)
+      weekly_agg = daily_total
+        .with_columns(Polars.col("date").dt.truncate("1w").alias("week_start"))
+        .group_by("week_start")
+        .agg(Polars.col("count").sum.alias("total_trips"))
+      # Get the week with max trips
+      # Sort by total_trips descending, then week_start ascending to pick the earliest week in case of a tie
+      sorted_weeks = weekly_agg.sort(["total_trips", "week_start"], reverse: [true, false])
+      best_week = sorted_weeks.head(1)
+      return nil if best_week.height == 0
+      # Return the start date of the busiest week (first value of the week_start column)
+      best_week["week_start"][0]
+    end
     private
     def filter!(file, filters, filtered, filter_only_children: false)

data/lib/gtfs_df/reader.rb CHANGED Viewed

@@ -10,10 +10,18 @@ module GtfsDf
     def self.load_from_zip(zip_path, parse_times: false)
       data = nil
+      relevant_files = GtfsDf::Feed::GTFS_FILES
+        .map { |name| "#{name}.txt" }
+        .to_set
       Dir.mktmpdir do |tmpdir|
         Zip::File.open(zip_path) do |zip_file|
           zip_file.each do |entry|
-            next unless entry.file?
+            # We're skipping:
+            # - directories
+            # - unrelated files
+            # - empty feed files
+            next unless entry.file? && relevant_files.include?(entry.name) && has_header?(entry)
             entry.extract(destination_directory: tmpdir)
           end
         end
@@ -45,5 +53,11 @@ module GtfsDf
       schema_class_name = gtfs_file.split("_").map(&:capitalize).join
       GtfsDf::Schema.const_get(schema_class_name).new(path).df
     end
+    private_class_method def self.has_header?(zip_entry)
+      zip_entry.get_input_stream.readline.strip != ""
+    rescue
+      false
+    end
   end
 end

data/lib/gtfs_df/utils.rb CHANGED Viewed

@@ -6,53 +6,6 @@ module GtfsDf
     SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
     SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
-    # Parses a GTFS time string to seconds since midnight
-    #
-    # The input string is expected to be in the HH:MM:SS format (H:MM:SS is
-    # also accepted).
-    #
-    # The time is measured from "noon minus 12h" of the service day
-    # (effectively midnight except for days on which daylight savings time
-    # changes occur). For times occurring after midnight on the service day,
-    # enter the time as a value greater than 24:00:00 in HH:MM:SS.
-    #
-    # @example 14:30:00 for 2:30PM or
-    # 25:35:00 for 1:35AM on the next day.
-    #
-    # @param str String|Integer
-    # @return Integer|nil seconds since midnight, or nil if invalid
-    def parse_time(str)
-      return str if str.is_a?(Integer)
-      return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
-      parts = str.to_s.split(":")
-      return nil unless parts.size == 3 && parts.all? { |p| p.match?(/^\d+$/) }
-      hours, mins, secs = parts.map(&:to_i)
-      hours * 3600 + mins * 60 + secs
-    rescue
-      nil
-    end
-    # Formats seconds since midnight as a GTFS time string (HH:MM:SS)
-    #
-    # Handles times greater than 24 hours for times that span past midnight.
-    #
-    # @param seconds Integer seconds since midnight
-    # @return String|nil time in HH:MM:SS format, or nil if invalid
-    def format_time(seconds)
-      return nil if seconds.nil?
-      return seconds if seconds.is_a?(String)
-      hours = seconds / SECONDS_IN_HOUR
-      minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
-      secs = seconds % SECONDS_IN_MINUTE
-      format("%02d:%02d:%02d", hours, minutes, secs)
-    rescue
-      nil
-    end
     # Converts a GTFS time string column to seconds since midnight
     #
     # Use this method with Polars DataFrames to convert time columns.
@@ -118,16 +71,9 @@ module GtfsDf
     #
     # @example 20180913 for September 13th, 2018.
     #
-    # @param str String
-    def parse_date(str)
-      return nil if str.nil? || str.strip.empty?
-      return nil unless str.match?(/^\d{8}$/)
-      begin
-        Date.strptime(str, "%Y%m%d")
-      rescue ArgumentError
-        nil
-      end
+    # @param col [Polars::Expr] string expression parsed as a "%Y%m%d" date (strptime is called with strict: false)
+    def parse_date(col)
+      col.str.strptime(Polars::Date, "%Y%m%d", strict: false)
     end
   end
 end

data/lib/gtfs_df/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module GtfsDf
-  VERSION = "0.9.3"
+  VERSION = "0.10.1"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: gtfs_df
 version: !ruby/object:Gem::Version
-  version: 0.9.3
+  version: 0.10.1
 platform: ruby
 authors:
 - David Mejorado
@@ -90,11 +90,6 @@ files:
 - devenv.lock
 - devenv.nix
 - devenv.yaml
-- examples/split-by-agency/.gitignore
-- examples/split-by-agency/Gemfile
-- examples/split-by-agency/Gemfile.lock
-- examples/split-by-agency/README.md
-- examples/split-by-agency/split_by_agency.rb
 - lib/gtfs_df.rb
 - lib/gtfs_df/base_gtfs_table.rb
 - lib/gtfs_df/feed.rb

data/examples/split-by-agency/.gitignore DELETED Viewed

	@@ -1 +0,0 @@
1	- output/

data/examples/split-by-agency/Gemfile DELETED Viewed

@@ -1,5 +0,0 @@
-source "https://gem.coop"
-gem "gtfs_df", path: "../.."
-gem "optparse"
-gem "whirly"

data/examples/split-by-agency/Gemfile.lock DELETED Viewed

@@ -1,54 +0,0 @@
-PATH
-  remote: ../..
-  specs:
-    gtfs_df (0.9.3)
-      networkx (~> 0.4)
-      polars-df (~> 0.22, < 0.24)
-      rubyzip (>= 3.0, < 4.0)
-GEM
-  remote: https://gem.coop/
-  specs:
-    bigdecimal (4.0.1)
-    json (2.18.0)
-    matrix (0.4.3)
-    networkx (0.4.0)
-      matrix (~> 0.4)
-      rb_heap (~> 1.0)
-    optparse (0.8.1)
-    polars-df (0.23.0-aarch64-linux)
-      bigdecimal
-    polars-df (0.23.0-aarch64-linux-musl)
-      bigdecimal
-    polars-df (0.23.0-arm64-darwin)
-      bigdecimal
-    polars-df (0.23.0-x86_64-darwin)
-      bigdecimal
-    polars-df (0.23.0-x86_64-linux)
-      bigdecimal
-    polars-df (0.23.0-x86_64-linux-musl)
-      bigdecimal
-    rb_heap (1.1.0)
-    rubyzip (3.2.2)
-    unicode-display_width (3.2.0)
-      unicode-emoji (~> 4.1)
-    unicode-emoji (4.2.0)
-    whirly (0.4.0)
-      json
-      unicode-display_width (>= 1.1)
-PLATFORMS
-  aarch64-linux
-  aarch64-linux-musl
-  arm64-darwin
-  x86_64-darwin
-  x86_64-linux
-  x86_64-linux-musl
-DEPENDENCIES
-  gtfs_df!
-  optparse
-  whirly
-BUNDLED WITH
-   2.6.9

data/examples/split-by-agency/README.md DELETED Viewed

@@ -1,26 +0,0 @@
-# Split GTFS by Agency Example
-This example demonstrates how to split a GTFS zip file into multiple files, one for each specified `agency_id`, using the `gtfs_df` Ruby gem.
-## Usage
-```
-bundle install
-ruby split_by_agency.rb -i <input-gtfs.zip> --ids agency1,agency2
-```
-- The output files will be written to the `output/` directory, named `<agency_id>.zip`.
-## Options
-- `-i`, `--input PATH` — Path to the input GTFS zip file
-- `--ids IDS` — Comma-separated list of agency IDs to extract
-## Example
-```
-ruby split_by_agency.rb -i ../../spec/fixtures/sample_gtfs.zip --ids DTA,OTA
-```
----
-This is a port of the [original Python script](https://gist.github.com/davidmh/f51e5d93a9213e0e606a43167ff87403) using Partridge.

data/examples/split-by-agency/split_by_agency.rb DELETED Viewed

@@ -1,63 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-require "optparse"
-require "fileutils"
-require "gtfs_df"
-require "whirly"
-options = {}
-OptionParser.new do |opts|
-  opts.banner = "Usage: split_by_agency.rb -i <input-gtfs.zip> --ids NUMBERS"
-  opts.on("-i", "--input PATH", "Path to the input GTFS file") do |v|
-    options[:input] = v
-  end
-  opts.on("--ids IDS", "Comma-separated list of agency_ids") do |v|
-    options[:ids] = v
-  end
-end.parse!
-unless options[:input] && options[:ids]
-  warn "Both --input and --ids are required."
-  exit 1
-end
-input_path = File.expand_path(options[:input])
-agency_ids = options[:ids].split(",")
-output_dir = File.expand_path("./output", __dir__)
-FileUtils.mkdir_p(output_dir)
-feed = nil
-Whirly.configure spinner: "dots", stop: "✓"
-Whirly.start do
-  Whirly.status = "Loading"
-  start_time = Time.now
-  feed = GtfsDf::Reader.load_from_zip(input_path)
-  elapsed = Time.now - start_time
-  Whirly.status = "Loaded (#{elapsed.round(2)}s)"
-end
-agency_ids.each do |agency_id|
-  Whirly.start do
-    output_path = File.join(output_dir, "#{agency_id}.zip")
-    start_time = Time.now
-    Whirly.status = "-> #{agency_id} filtering..."
-    filtered_feed = feed.filter({"agency" => {"agency_id" => agency_id}})
-    Whirly.status = "-> #{agency_id} writing..."
-    GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
-    elapsed = Time.now - start_time
-    Whirly.status = "-> #{agency_id}.zip (#{elapsed.round(2)}s)"
-  end
-end
-puts "✓  Done, all files are stored in the output/ directory"