gtfs_df 0.9.3 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9c0746b1937afcfb7000425a59976b3eba8662778437ddf3adc02ec7729c3a49
4
- data.tar.gz: e3a553aef868b4c29e06731f0a5e3984efc067a096001e90150743a80f3d45bc
3
+ metadata.gz: 0be3962480f99e4a2194d6e7da248d5ef82dc5479491ac807211b3dc4d69415a
4
+ data.tar.gz: ceeaf189058006c0db3b26e6438b8c7fc7a0a077a48f192b588b53e74de695e8
5
5
  SHA512:
6
- metadata.gz: 0fccae16bb46da6db651da04ea2591fa17e991219d4d8eb9be3a9444ed0b1b190b315f9223b738f399f3b6fade41c2c2e1081d1aad921a39483426d5b86b1aca
7
- data.tar.gz: b98eccb0861c46d8510eaaea6a97c48d6b1fa3687d1c35d445edbf19c349a6c03d45c3a2654da20ab577aa083003eff5335a8fcad6ee0d9f6f7f86a78c2ed33b
6
+ metadata.gz: 1f27c1b9493aaf2220dfe9369a06a9c5cc26bcae2b9777b0d33bb061a47e1c176eea309580f893fad53dd80b3e7148e84a65ed6f3e0752ac6203c961c991d4ec
7
+ data.tar.gz: ad94ff3fa5b2fb69f119fdcac7b8c14df0c2ad7863e42b692d10f5eb0bda151f204618ae8fd7209e9de5ebe0684187530e0e4f789aefaaabf340c8890e9d5df3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## [0.10.0] - 2026-03-06
2
+
3
+ ### 🚀 Features
4
+
5
+ - Date parsing utility
6
+ - Calendar-based utilities
7
+ - Consider frequencies when counting trips
8
+
9
+ ### 🐛 Bug Fixes
10
+
11
+ - Service dates and busiest week fixes
1
12
  ## [0.9.3] - 2026-02-27
2
13
 
3
14
  ### 🐛 Bug Fixes
@@ -18,6 +29,7 @@
18
29
  - Bump version to 0.9.2
19
30
  - Avoid converting series into arrays
20
31
  - Simplify trip pool reduction
32
+ - Bump version to 0.9.3
21
33
  ## [0.9.1] - 2026-02-17
22
34
 
23
35
  ### 🐛 Bug Fixes
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: ../..
3
3
  specs:
4
- gtfs_df (0.9.3)
4
+ gtfs_df (0.10.0)
5
5
  networkx (~> 0.4)
6
6
  polars-df (~> 0.22, < 0.24)
7
7
  rubyzip (>= 3.0, < 4.0)
@@ -55,5 +55,12 @@ module GtfsDf
55
55
  def dataframe
56
56
  @df
57
57
  end
58
+
59
+ def self.empty_dataframe
60
+ Polars::DataFrame.new(
61
+ const_get(:REQUIRED_FIELDS).map { |field| [field, []] }.to_h,
62
+ schema_overrides: const_get(:SCHEMA)
63
+ )
64
+ end
58
65
  end
59
66
  end
data/lib/gtfs_df/feed.rb CHANGED
@@ -174,6 +174,171 @@ module GtfsDf
174
174
  send("#{file_name}=", value)
175
175
  end
176
176
 
177
+ # Returns a DataFrame of all service_id/date pairs active in the feed.
178
+ # Columns: [date, service_id]
179
+ #
180
+ # @return [Polars::DataFrame, nil] nil when the feed has neither calendar nor calendar_dates
181
+ def service_dates
182
+ start_date_col = Polars.col("start_date")
183
+ end_date_col = Polars.col("end_date")
184
+ date_col = Polars.col("date")
185
+
186
+ calendar_df = @calendar&.with_columns(
187
+ GtfsDf::Utils.parse_date(start_date_col),
188
+ GtfsDf::Utils.parse_date(end_date_col)
189
+ )
190
+
191
+ calendar_dates_df = @calendar_dates&.with_columns(
192
+ GtfsDf::Utils.parse_date(date_col)
193
+ )
194
+
195
+ # Expand calendar to a range of (service_id, date)
196
+ services_by_date = nil
197
+ if calendar_df
198
+ expanded = calendar_df.with_columns(
199
+ Polars.date_ranges(start_date_col, end_date_col, "1d").alias("date")
200
+ ).explode("date")
201
+
202
+ dow_col_names = [
203
+ "monday",
204
+ "tuesday",
205
+ "wednesday",
206
+ "thursday",
207
+ "friday",
208
+ "saturday",
209
+ "sunday"
210
+ ]
211
+
212
+ # Each day in the calendar table defines if a day of the week has service or not
213
+ # 1 - Service is available for all Mondays in the date range.
214
+ # 0 - Service is not available for Mondays in the date range.
215
+ # https://gtfs.org/documentation/schedule/reference/#calendartxt
216
+ #
217
+ # This filter will be applied to the expanded calendar dates, where the
218
+ # ranges become rows of individual dates; we need to ensure that each
219
+ # individual date matches the day of the week (DOW) before we check if
220
+ # it's enabled.
221
+ filter_expr = dow_col_names.each_with_index.reduce(Polars.lit(false)) do |expr, (dow_col_name, idx)|
222
+ # Polars weekday: Monday=1, Sunday=7
223
+ expr | ((Polars.col("date").dt.weekday == (idx + 1)) & (Polars.col(dow_col_name) == "1"))
224
+ end
225
+
226
+ services_by_date = expanded.filter(filter_expr).select("date", "service_id")
227
+ end
228
+
229
+ # Apply calendar_dates exceptions
230
+ if calendar_dates_df
231
+ exception_type_col = Polars.col("exception_type")
232
+
233
+ additions = calendar_dates_df
234
+ .filter(exception_type_col == "1")
235
+ .select("date", "service_id")
236
+
237
+ subtractions = calendar_dates_df
238
+ .filter(exception_type_col == "2")
239
+ .select("date", "service_id")
240
+
241
+ services_by_date = if services_by_date
242
+ # If we found service dates from the calendar table, we need to first
243
+ # add the inclusions, then remove the exceptions coming from the calendar_dates
244
+ services_by_date
245
+ .vstack(additions).unique
246
+ .join(subtractions, on: ["service_id", "date"], how: "anti")
247
+ else
248
+ # Otherwise, we can just use the additions as the new services_by_date
249
+ additions.unique
250
+ end
251
+ end
252
+
253
+ services_by_date
254
+ end
255
+
256
+ # Returns a DataFrame of trip counts per date.
257
+ # Columns: [date, count]
258
+ #
259
+ # @return [Polars::DataFrame, nil] nil when the feed has no service dates
260
+ def trip_count_dates
261
+ cached_service_dates = service_dates
262
+ return nil if cached_service_dates.nil? || cached_service_dates.height == 0
263
+
264
+ # This expression builds from the dataframe returned by frequency based
265
+ # trip counts, defaulting to 1 for the trips that don't have an entry in
266
+ # the frequencies table. We're defining the expression here just to
267
+ # remove some noise from the join below.
268
+ trip_size = Polars.coalesce("freq_count", Polars.lit(1)).alias("trip_size")
269
+
270
+ # Count trips per service_id, considering the possible size they may have
271
+ # from the frequencies table.
272
+ trip_counts = @trips
273
+ .join(frequency_based_trip_counts, on: "trip_id", how: "left")
274
+ .group_by("service_id")
275
+ .agg(trip_size.sum.alias("trip_count"))
276
+
277
+ # Join to services to get trips per date
278
+ daily_trips = cached_service_dates
279
+ .join(trip_counts, on: "service_id", how: "left")
280
+ .with_columns(Polars.col("trip_count").fill_null(0))
281
+
282
+ # Sum trips per date
283
+ daily_trips.group_by("date").agg(Polars.col("trip_count").sum.alias("count"))
284
+ end
285
+
286
+ # Returns a DataFrame of trip counts from the frequencies table
287
+ # Columns: [trip_id, freq_count]
288
+ #
289
+ # @return [Polars::DataFrame]
290
+ def frequency_based_trip_counts
291
+ # If the feed was initialized with the parse_times flag, we already have
292
+ # seconds since midnight in these columns; otherwise we need to convert
293
+ # them first, so we can get the duration in seconds
294
+ end_time_seconds_col, start_time_seconds_col = if @parse_times
295
+ [Polars.col("end_time"), Polars.col("start_time")]
296
+ else
297
+ [
298
+ GtfsDf::Utils.as_seconds_since_midnight("end_time"),
299
+ GtfsDf::Utils.as_seconds_since_midnight("start_time")
300
+ ]
301
+ end
302
+
303
+ duration_seconds = (end_time_seconds_col - start_time_seconds_col).alias("duration_seconds")
304
+ count = (duration_seconds / Polars.col("headway_secs")).floor.sum.alias("freq_count")
305
+
306
+ # The frequencies table is optional; we default to an empty dataframe to
307
+ # remove friction in the join with trips.
308
+ if @frequencies
309
+ @frequencies.group_by("trip_id").agg(count).select("trip_id", "freq_count")
310
+ else
311
+ Polars::DataFrame.new(
312
+ {"trip_id" => [], "freq_count" => []},
313
+ schema: {"trip_id" => Polars::String, "freq_count" => Polars::Float64}
314
+ )
315
+ end
316
+ end
317
+
318
+ # Identifies the start date of the busiest week in the feed by trip count.
319
+ #
320
+ # @return [Date, nil] The Monday of the busiest week, or nil when there are no trip counts
321
+ def busiest_week
322
+ daily_total = trip_count_dates
323
+ return nil if daily_total.nil? || daily_total.height == 0
324
+
325
+ # Group by week (ISO week, starting Monday)
326
+ weekly_agg = daily_total
327
+ .with_columns(Polars.col("date").dt.truncate("1w").alias("week_start"))
328
+ .group_by("week_start")
329
+ .agg(Polars.col("count").sum.alias("total_trips"))
330
+
331
+ # Get the week with max trips
332
+ # Sort by total_trips descending, then week_start ascending to pick the earliest week in case of a tie
333
+ sorted_weeks = weekly_agg.sort(["total_trips", "week_start"], reverse: [true, false])
334
+ best_week = sorted_weeks.head(1)
335
+
336
+ return nil if best_week.height == 0
337
+
338
+ # Return the start date of the busiest week
339
+ best_week["week_start"][0]
340
+ end
341
+
177
342
  private
178
343
 
179
344
  def filter!(file, filters, filtered, filter_only_children: false)
data/lib/gtfs_df/utils.rb CHANGED
@@ -6,53 +6,6 @@ module GtfsDf
6
6
  SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
7
7
  SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
8
8
 
9
- # Parses a GTFS time string to seconds since midnight
10
- #
11
- # The input string is expected to be in the HH:MM:SS format (H:MM:SS is
12
- # also accepted).
13
- #
14
- # The time is measured from "noon minus 12h" of the service day
15
- # (effectively midnight except for days on which daylight savings time
16
- # changes occur). For times occurring after midnight on the service day,
17
- # enter the time as a value greater than 24:00:00 in HH:MM:SS.
18
- #
19
- # @example 14:30:00 for 2:30PM or
20
- # 25:35:00 for 1:35AM on the next day.
21
- #
22
- # @param str String|Integer
23
- # @return Integer|nil seconds since midnight, or nil if invalid
24
- def parse_time(str)
25
- return str if str.is_a?(Integer)
26
- return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
27
-
28
- parts = str.to_s.split(":")
29
- return nil unless parts.size == 3 && parts.all? { |p| p.match?(/^\d+$/) }
30
-
31
- hours, mins, secs = parts.map(&:to_i)
32
- hours * 3600 + mins * 60 + secs
33
- rescue
34
- nil
35
- end
36
-
37
- # Formats seconds since midnight as a GTFS time string (HH:MM:SS)
38
- #
39
- # Handles times greater than 24 hours for times that span past midnight.
40
- #
41
- # @param seconds Integer seconds since midnight
42
- # @return String|nil time in HH:MM:SS format, or nil if invalid
43
- def format_time(seconds)
44
- return nil if seconds.nil?
45
- return seconds if seconds.is_a?(String)
46
-
47
- hours = seconds / SECONDS_IN_HOUR
48
- minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
49
- secs = seconds % SECONDS_IN_MINUTE
50
-
51
- format("%02d:%02d:%02d", hours, minutes, secs)
52
- rescue
53
- nil
54
- end
55
-
56
9
  # Converts a GTFS time string column to seconds since midnight
57
10
  #
58
11
  # Use this method with Polars DataFrames to convert time columns.
@@ -118,16 +71,9 @@ module GtfsDf
118
71
  #
119
72
  # @example 20180913 for September 13th, 2018.
120
73
  #
121
- # @param str String
122
- def parse_date(str)
123
- return nil if str.nil? || str.strip.empty?
124
- return nil unless str.match?(/^\d{8}$/)
125
-
126
- begin
127
- Date.strptime(str, "%Y%m%d")
128
- rescue ArgumentError
129
- nil
130
- end
74
+ # @param col Polars::Expr
75
+ def parse_date(col)
76
+ col.str.strptime(Polars::Date, "%Y%m%d", strict: false)
131
77
  end
132
78
  end
133
79
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.9.3"
4
+ VERSION = "0.10.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.3
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado