gtfs_df 0.9.2 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 01e5ed5d1ce0ac7df81fe984b7bc8ca7716f0010672b7c3738d336ed1fbeaee0
4
- data.tar.gz: 6f3a21037faaa76aadf9747bba7df94d7fdac9bb57e0b458f5f17696a7158d67
3
+ metadata.gz: 0be3962480f99e4a2194d6e7da248d5ef82dc5479491ac807211b3dc4d69415a
4
+ data.tar.gz: ceeaf189058006c0db3b26e6438b8c7fc7a0a077a48f192b588b53e74de695e8
5
5
  SHA512:
6
- metadata.gz: 1a50d3af43551dda4a03a829ca1d4fd21e323c771181ee2238f9bcea41f4cb0e883d1d543bc26024d2c31d39456212e85bcd6beb6c2935ae98993f34cfd8816c
7
- data.tar.gz: 7e1ef2009d75b6dc3f0b82d90183cb048a6a15db74ac7c4076e9acd264fed013a6920e65c95021b59bdfb263fbd4c014b9764b22f396f4957fcec0c7820cea87
6
+ metadata.gz: 1f27c1b9493aaf2220dfe9369a06a9c5cc26bcae2b9777b0d33bb061a47e1c176eea309580f893fad53dd80b3e7148e84a65ed6f3e0752ac6203c961c991d4ec
7
+ data.tar.gz: ad94ff3fa5b2fb69f119fdcac7b8c14df0c2ad7863e42b692d10f5eb0bda151f204618ae8fd7209e9de5ebe0684187530e0e4f789aefaaabf340c8890e9d5df3
data/.conform.yaml CHANGED
@@ -24,3 +24,4 @@ policies:
24
24
  - docs
25
25
  - ci
26
26
  - qol
27
+ - test
data/CHANGELOG.md CHANGED
@@ -1,8 +1,35 @@
1
- ## [0.9.2] - 2026-02-21
1
+ ## [0.10.0] - 2026-03-06
2
+
3
+ ### 🚀 Features
4
+
5
+ - Date parsing utility
6
+ - Calendar-based utilities
7
+ - Consider frequencies when counting trips
8
+
9
+ ### ๐Ÿ› Bug Fixes
10
+
11
+ - Service dates and busiest week fixes
12
+ ## [0.9.3] - 2026-02-27
13
+
14
+ ### ๐Ÿ› Bug Fixes
15
+
16
+ - Allow multiple filters
17
+ - Refactor prune to keep calendar_dates-only dependencies
2
18
 
3
19
  ### 📚 Documentation
4
20
 
5
21
  - Add Brooke to the list of authors
22
+
23
+ ### 🧪 Testing
24
+
25
+ - Ensure we don't drop trips and routes
26
+
27
+ ### โš™๏ธ Miscellaneous Tasks
28
+
29
+ - Bump version to 0.9.2
30
+ - Avoid converting series into arrays
31
+ - Simplify trip pool reduction
32
+ - Bump version to 0.9.3
6
33
  ## [0.9.1] - 2026-02-17
7
34
 
8
35
  ### ๐Ÿ› Bug Fixes
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: ../..
3
3
  specs:
4
- gtfs_df (0.9.2)
4
+ gtfs_df (0.10.0)
5
5
  networkx (~> 0.4)
6
6
  polars-df (~> 0.22, < 0.24)
7
7
  rubyzip (>= 3.0, < 4.0)
@@ -55,5 +55,12 @@ module GtfsDf
55
55
  def dataframe
56
56
  @df
57
57
  end
58
+
59
+ def self.empty_dataframe
60
+ Polars::DataFrame.new(
61
+ const_get(:REQUIRED_FIELDS).map { |field| [field, []] }.to_h,
62
+ schema_overrides: const_get(:SCHEMA)
63
+ )
64
+ end
58
65
  end
59
66
  end
data/lib/gtfs_df/feed.rb CHANGED
@@ -121,19 +121,19 @@ module GtfsDf
121
121
  # Trips are the atomic unit of GTFS, we will generate a new view
122
122
  # based on the set of trips that would be included for each individual filter
123
123
  # and cascade changes from this view in order to retain referential integrity
124
- trip_ids = nil
124
+ trip_ids = Polars::Series.new.alias("trip_id")
125
125
 
126
126
  view.each do |file, filters|
127
127
  new_filtered = filter!(file, filters, filtered.dup)
128
- trip_ids = if trip_ids.nil?
128
+ trip_ids = if trip_ids.empty?
129
129
  new_filtered["trips"]["trip_id"]
130
130
  else
131
- trip_ids & new_filtered["trips"]["trip_id"]
131
+ trip_ids.filter(trip_ids.is_in(new_filtered["trips"]["trip_id"]))
132
132
  end
133
133
  end
134
134
 
135
135
  if trip_ids
136
- filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered.dup)
136
+ filtered = filter!("trips", {"trip_id" => trip_ids.implode}, filtered.dup)
137
137
  end
138
138
  end
139
139
 
@@ -174,6 +174,171 @@ module GtfsDf
174
174
  send("#{file_name}=", value)
175
175
  end
176
176
 
177
+ # Returns a DataFrame of all service_id/date pairs active in the feed.
178
+ # Columns: [date, service_id]
179
+ #
180
+ # @return [Polars::DataFrame]
181
+ def service_dates
182
+ start_date_col = Polars.col("start_date")
183
+ end_date_col = Polars.col("end_date")
184
+ date_col = Polars.col("date")
185
+
186
+ calendar_df = @calendar&.with_columns(
187
+ GtfsDf::Utils.parse_date(start_date_col),
188
+ GtfsDf::Utils.parse_date(end_date_col)
189
+ )
190
+
191
+ calendar_dates_df = @calendar_dates&.with_columns(
192
+ GtfsDf::Utils.parse_date(date_col)
193
+ )
194
+
195
+ # Expand calendar to a range of (service_id, date)
196
+ services_by_date = nil
197
+ if calendar_df
198
+ expanded = calendar_df.with_columns(
199
+ Polars.date_ranges(start_date_col, end_date_col, "1d").alias("date")
200
+ ).explode("date")
201
+
202
+ dow_col_names = [
203
+ "monday",
204
+ "tuesday",
205
+ "wednesday",
206
+ "thursday",
207
+ "friday",
208
+ "saturday",
209
+ "sunday"
210
+ ]
211
+
212
+ # Each day in the calendar table defines if a day of the week has service or not
213
+ # 1 - Service is available for all Mondays in the date range.
214
+ # 0 - Service is not available for Mondays in the date range.
215
+ # https://gtfs.org/documentation/schedule/reference/#calendartxt
216
+ #
217
+ # This filter will be applied to the expanded calendar dates, where the
218
+ # ranges become rows of individual dates, we need to ensure that each
219
+ # individual date matches the day of the week (DOW) before we check if
220
+ # it's enabled.
221
+ filter_expr = dow_col_names.each_with_index.reduce(Polars.lit(false)) do |expr, (dow_col_name, idx)|
222
+ # Polars weekday: Monday=1, Sunday=7
223
+ expr | ((Polars.col("date").dt.weekday == (idx + 1)) & (Polars.col(dow_col_name) == "1"))
224
+ end
225
+
226
+ services_by_date = expanded.filter(filter_expr).select("date", "service_id")
227
+ end
228
+
229
+ # Apply calendar_dates exceptions
230
+ if calendar_dates_df
231
+ exception_type_col = Polars.col("exception_type")
232
+
233
+ additions = calendar_dates_df
234
+ .filter(exception_type_col == "1")
235
+ .select("date", "service_id")
236
+
237
+ subtractions = calendar_dates_df
238
+ .filter(exception_type_col == "2")
239
+ .select("date", "service_id")
240
+
241
+ services_by_date = if services_by_date
242
+ # If we found service dates from the calendar table, we need to first
243
+ # add the inclusions, then remove the exceptions coming from the calendar_dates
244
+ services_by_date
245
+ .vstack(additions).unique
246
+ .join(subtractions, on: ["service_id", "date"], how: "anti")
247
+ else
248
+ # Otherwise, we can just use the additions as the new services_by_date
249
+ additions.unique
250
+ end
251
+ end
252
+
253
+ services_by_date
254
+ end
255
+
256
+ # Returns a DataFrame of trip counts per date.
257
+ # Columns: [date, count]
258
+ #
259
+ # @return [Polars::DataFrame]
260
+ def trip_count_dates
261
+ cached_service_dates = service_dates
262
+ return nil if cached_service_dates.nil? || cached_service_dates.height == 0
263
+
264
+ # This expression builds from the dataframe returned by frequency based
265
+ # trip counts, defaulting to 1 for the trips that don't have an entry in
266
+ # the frequencies table. We're defining the expression here just to
267
+ # remove some noise from the join below.
268
+ trip_size = Polars.coalesce("freq_count", Polars.lit(1)).alias("trip_size")
269
+
270
+ # Count trips per service_id, considering the possible size they may have
271
+ # from the frequencies table.
272
+ trip_counts = @trips
273
+ .join(frequency_based_trip_counts, on: "trip_id", how: "left")
274
+ .group_by("service_id")
275
+ .agg(trip_size.sum.alias("trip_count"))
276
+
277
+ # Join to services to get trips per date
278
+ daily_trips = cached_service_dates
279
+ .join(trip_counts, on: "service_id", how: "left")
280
+ .with_columns(Polars.col("trip_count").fill_null(0))
281
+
282
+ # Sum trips per date
283
+ daily_trips.group_by("date").agg(Polars.col("trip_count").sum.alias("count"))
284
+ end
285
+
286
+ # Returns a DataFrame of trip counts from the frequencies table
287
+ # Columns: [trip_id, freq_count]
288
+ #
289
+ # @return [Polars::DataFrame]
290
+ def frequency_based_trip_counts
291
+ # If the feed was initialized with the parse_times flag, we already have
292
+ # seconds since midnight in these columns, otherwise we need to convert
293
+ # them first, so we can get the duration in seconds
294
+ end_time_seconds_col, start_time_seconds_col = if @parse_times
295
+ [Polars.col("end_time"), Polars.col("start_time")]
296
+ else
297
+ [
298
+ GtfsDf::Utils.as_seconds_since_midnight("end_time"),
299
+ GtfsDf::Utils.as_seconds_since_midnight("start_time")
300
+ ]
301
+ end
302
+
303
+ duration_seconds = (end_time_seconds_col - start_time_seconds_col).alias("duration_seconds")
304
+ count = (duration_seconds / Polars.col("headway_secs")).floor.sum.alias("freq_count")
305
+
306
+ # The frequencies table is optional, we default to an empty dataframe to
307
+ # remove friction in the join with trips.
308
+ if @frequencies
309
+ @frequencies.group_by("trip_id").agg(count).select("trip_id", "freq_count")
310
+ else
311
+ Polars::DataFrame.new(
312
+ {"trip_id" => [], "freq_count" => []},
313
+ schema: {"trip_id" => Polars::String, "freq_count" => Polars::Float64}
314
+ )
315
+ end
316
+ end
317
+
318
+ # Identifies the start date of the busiest week in the feed by trip count.
319
+ #
320
+ # @return [Date] The Monday of the busiest week
321
+ def busiest_week
322
+ daily_total = trip_count_dates
323
+ return nil if daily_total.nil? || daily_total.height == 0
324
+
325
+ # Group by week (ISO week, starting Monday)
326
+ weekly_agg = daily_total
327
+ .with_columns(Polars.col("date").dt.truncate("1w").alias("week_start"))
328
+ .group_by("week_start")
329
+ .agg(Polars.col("count").sum.alias("total_trips"))
330
+
331
+ # Get the week with max trips
332
+ # Sort by total_trips descending, then date ascending to pick the earliest date in case of a tie
333
+ sorted_weeks = weekly_agg.sort(["total_trips", "week_start"], reverse: [true, false])
334
+ best_week = sorted_weeks.head(1)
335
+
336
+ return nil if best_week.height == 0
337
+
338
+ # Return the start date of the busiest week
339
+ best_week["week_start"][0]
340
+ end
341
+
177
342
  private
178
343
 
179
344
  def filter!(file, filters, filtered, filter_only_children: false)
@@ -181,7 +346,7 @@ module GtfsDf
181
346
  df = filtered[file]
182
347
 
183
348
  filters.each do |col, val|
184
- df = if val.is_a?(Array)
349
+ df = if val.is_a?(Polars::Series) || val.is_a?(Array)
185
350
  df.filter(Polars.col(col).is_in(val))
186
351
  elsif val.respond_to?(:call)
187
352
  df.filter(val.call(Polars.col(col)))
@@ -200,9 +365,16 @@ module GtfsDf
200
365
 
201
366
  # Traverses the graph to prune unreferenced entities from child dataframes
202
367
  # based on parent relationships. See GtfsDf::Graph::STOP_NODES
368
+ #
369
+ # The trips table has multiple parents (calendar, calendar_dates, routes,
370
+ # stop_times). We accumulate valid values from all of them and keep rows
371
+ # that match any parent, so trips referenced only via calendar_dates are
372
+ # not dropped when another edge is processed first.
203
373
  def prune!(root, filtered, filter_only_children: false)
204
374
  seen_edges = Set.new
205
375
  rerooted_graph = Graph.build(bidirectional: !filter_only_children)
376
+ accumulated_service_ids = Polars::Series.new("service_id", dtype: Polars::String)
377
+ trips_base_df = nil
206
378
 
207
379
  queue = [root]
208
380
 
@@ -245,37 +417,46 @@ module GtfsDf
245
417
  attrs[:dependencies].each do |dep|
246
418
  parent_col = dep[parent_node_id]
247
419
  child_col = dep[child_node_id]
248
- allow_null = !!dep[:allow_null]
420
+ allow_null_flag = !!dep[:allow_null]
249
421
 
250
422
  next unless parent_col && child_col &&
251
423
  parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
252
424
 
253
425
  # Get valid values from parent
254
- valid_values = parent_df[parent_col].to_a.uniq.compact
255
-
256
- # Annoying special case to make sure that if we have a calendar with exceptions,
257
- # the calendar_dates file doesn't end up pruning other files
258
- if parent_node_id == "calendar_dates" && parent_col == "service_id" &&
259
- filtered["calendar"]
260
- valid_values = (valid_values + calendar["service_id"].to_a).uniq
261
- end
262
-
263
- # Filter child to only include rows that reference valid parent values
264
- before = child_df.height
265
- filter = Polars.col(child_col).is_in(valid_values)
266
- if allow_null
267
- filter = (filter | Polars.col(child_col).is_null)
268
- end
269
- child_df = child_df.filter(filter)
270
- changed = child_df.height < before
271
-
272
- # If we removed a part of the child_df earlier, concat it back on
273
- if saved_vals
274
- child_df = Polars.concat([child_df, saved_vals], how: "vertical")
275
- end
276
-
277
- if changed
278
- filtered[child_node.fetch(:file)] = child_df
426
+ valid_values = parent_df[parent_col].drop_nulls.unique
427
+
428
+ if child_node_id == "trips" && (parent_node_id == "calendar" || parent_node_id == "calendar_dates")
429
+ # Calendar + calendar_dates both define service for the same trips, so we want
430
+ # union semantics across those two parents (a trip is valid if it appears in
431
+ # either).
432
+ #
433
+ # Here we accumulate valid service_ids across calendar/calendar_dates, but only
434
+ # within the pool of trips that are already reachable from structural parents.
435
+ accumulated_service_ids = Polars.concat([accumulated_service_ids, valid_values]).unique
436
+
437
+ # Determine the base pool of trips:
438
+ # - If we've already restricted trips via structural parents (routes,
439
+ # stop_times, shapes, etc), use that as the base.
440
+ # - Otherwise, like when filtering directly on trips, use the current
441
+ # trips dataframe.
442
+ trips_base_df ||= filtered[child_node.fetch(:file)]
443
+ next unless trips_base_df && trips_base_df.height > 0
444
+
445
+ filtered[child_node.fetch(:file)] = trips_base_df.filter(
446
+ Polars.col("service_id").is_in(accumulated_service_ids.implode)
447
+ )
448
+ else
449
+ # Original single-edge logic for all other nodes
450
+ before = child_df.height
451
+
452
+ cond = Polars.col(child_col).is_in(valid_values.implode)
453
+ cond = (cond | Polars.col(child_col).is_null) if allow_null_flag
454
+ child_df = child_df.filter(cond)
455
+
456
+ if child_df.height < before
457
+ child_df = Polars.concat([child_df, saved_vals], how: "vertical") if saved_vals
458
+ filtered[child_node.fetch(:file)] = child_df
459
+ end
279
460
  end
280
461
  end
281
462
  end
data/lib/gtfs_df/utils.rb CHANGED
@@ -6,53 +6,6 @@ module GtfsDf
6
6
  SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
7
7
  SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
8
8
 
9
- # Parses a GTFS time string to seconds since midnight
10
- #
11
- # The input string is expected to be in the HH:MM:SS format (H:MM:SS is
12
- # also accepted).
13
- #
14
- # The time is measured from "noon minus 12h" of the service day
15
- # (effectively midnight except for days on which daylight savings time
16
- # changes occur). For times occurring after midnight on the service day,
17
- # enter the time as a value greater than 24:00:00 in HH:MM:SS.
18
- #
19
- # @example 14:30:00 for 2:30PM or
20
- # 25:35:00 for 1:35AM on the next day.
21
- #
22
- # @param str String|Integer
23
- # @return Integer|nil seconds since midnight, or nil if invalid
24
- def parse_time(str)
25
- return str if str.is_a?(Integer)
26
- return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
27
-
28
- parts = str.to_s.split(":")
29
- return nil unless parts.size == 3 && parts.all? { |p| p.match?(/^\d+$/) }
30
-
31
- hours, mins, secs = parts.map(&:to_i)
32
- hours * 3600 + mins * 60 + secs
33
- rescue
34
- nil
35
- end
36
-
37
- # Formats seconds since midnight as a GTFS time string (HH:MM:SS)
38
- #
39
- # Handles times greater than 24 hours for times that span past midnight.
40
- #
41
- # @param seconds Integer seconds since midnight
42
- # @return String|nil time in HH:MM:SS format, or nil if invalid
43
- def format_time(seconds)
44
- return nil if seconds.nil?
45
- return seconds if seconds.is_a?(String)
46
-
47
- hours = seconds / SECONDS_IN_HOUR
48
- minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
49
- secs = seconds % SECONDS_IN_MINUTE
50
-
51
- format("%02d:%02d:%02d", hours, minutes, secs)
52
- rescue
53
- nil
54
- end
55
-
56
9
  # Converts a GTFS time string column to seconds since midnight
57
10
  #
58
11
  # Use this method with Polars DataFrames to convert time columns.
@@ -118,16 +71,9 @@ module GtfsDf
118
71
  #
119
72
  # @example 20180913 for September 13th, 2018.
120
73
  #
121
- # @param str String
122
- def parse_date(str)
123
- return nil if str.nil? || str.strip.empty?
124
- return nil unless str.match?(/^\d{8}$/)
125
-
126
- begin
127
- Date.strptime(str, "%Y%m%d")
128
- rescue ArgumentError
129
- nil
130
- end
74
+ # @param col Polars::Expr
75
+ def parse_date(col)
76
+ col.str.strptime(Polars::Date, "%Y%m%d", strict: false)
131
77
  end
132
78
  end
133
79
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.9.2"
4
+ VERSION = "0.10.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.2
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado