gtfs_df 0.9.2 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 01e5ed5d1ce0ac7df81fe984b7bc8ca7716f0010672b7c3738d336ed1fbeaee0
4
- data.tar.gz: 6f3a21037faaa76aadf9747bba7df94d7fdac9bb57e0b458f5f17696a7158d67
3
+ metadata.gz: 0be3962480f99e4a2194d6e7da248d5ef82dc5479491ac807211b3dc4d69415a
4
+ data.tar.gz: ceeaf189058006c0db3b26e6438b8c7fc7a0a077a48f192b588b53e74de695e8
5
5
  SHA512:
6
- metadata.gz: 1a50d3af43551dda4a03a829ca1d4fd21e323c771181ee2238f9bcea41f4cb0e883d1d543bc26024d2c31d39456212e85bcd6beb6c2935ae98993f34cfd8816c
7
- data.tar.gz: 7e1ef2009d75b6dc3f0b82d90183cb048a6a15db74ac7c4076e9acd264fed013a6920e65c95021b59bdfb263fbd4c014b9764b22f396f4957fcec0c7820cea87
6
+ metadata.gz: 1f27c1b9493aaf2220dfe9369a06a9c5cc26bcae2b9777b0d33bb061a47e1c176eea309580f893fad53dd80b3e7148e84a65ed6f3e0752ac6203c961c991d4ec
7
+ data.tar.gz: ad94ff3fa5b2fb69f119fdcac7b8c14df0c2ad7863e42b692d10f5eb0bda151f204618ae8fd7209e9de5ebe0684187530e0e4f789aefaaabf340c8890e9d5df3
data/.conform.yaml CHANGED
@@ -24,3 +24,4 @@ policies:
24
24
  - docs
25
25
  - ci
26
26
  - qol
27
+ - test
data/CHANGELOG.md CHANGED
@@ -1,8 +1,35 @@
1
- ## [0.9.2] - 2026-02-21
1
+ ## [0.10.0] - 2026-03-06
2
+
3
+ ### 🚀 Features
4
+
5
+ - Date parsing utility
6
+ - Calendar-based utilities
7
+ - Consider frequencies when counting trips
8
+
9
+ ### ๐Ÿ› Bug Fixes
10
+
11
+ - Service dates and busiest week fixes
12
+ ## [0.9.3] - 2026-02-27
13
+
14
+ ### ๐Ÿ› Bug Fixes
15
+
16
+ - Allow multiple filters
17
+ - Refactor prune to keep calendar_dates-only dependencies
2
18
 
3
19
  ### 📚 Documentation
4
20
 
5
21
  - Add Brooke to the list of authors
22
+
23
+ ### 🧪 Testing
24
+
25
+ - Ensure we don't drop trips and routes
26
+
27
+ ### โš™๏ธ Miscellaneous Tasks
28
+
29
+ - Bump version to 0.9.2
30
+ - Avoid converting series into arrays
31
+ - Simplify trip pool reduction
32
+ - Bump version to 0.9.3
6
33
  ## [0.9.1] - 2026-02-17
7
34
 
8
35
  ### ๐Ÿ› Bug Fixes
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: ../..
3
3
  specs:
4
- gtfs_df (0.9.2)
4
+ gtfs_df (0.10.0)
5
5
  networkx (~> 0.4)
6
6
  polars-df (~> 0.22, < 0.24)
7
7
  rubyzip (>= 3.0, < 4.0)
@@ -55,5 +55,12 @@ module GtfsDf
55
55
  def dataframe
56
56
  @df
57
57
  end
58
+
59
+ def self.empty_dataframe
60
+ Polars::DataFrame.new(
61
+ const_get(:REQUIRED_FIELDS).map { |field| [field, []] }.to_h,
62
+ schema_overrides: const_get(:SCHEMA)
63
+ )
64
+ end
58
65
  end
59
66
  end
data/lib/gtfs_df/feed.rb CHANGED
@@ -121,19 +121,19 @@ module GtfsDf
121
121
  # Trips are the atomic unit of GTFS, we will generate a new view
122
122
  # based on the set of trips that would be included for each individual filter
123
123
  # and cascade changes from this view in order to retain referential integrity
124
- trip_ids = nil
124
+ trip_ids = Polars::Series.new.alias("trip_id")
125
125
 
126
126
  view.each do |file, filters|
127
127
  new_filtered = filter!(file, filters, filtered.dup)
128
- trip_ids = if trip_ids.nil?
128
+ trip_ids = if trip_ids.empty?
129
129
  new_filtered["trips"]["trip_id"]
130
130
  else
131
- trip_ids & new_filtered["trips"]["trip_id"]
131
+ trip_ids.filter(trip_ids.is_in(new_filtered["trips"]["trip_id"]))
132
132
  end
133
133
  end
134
134
 
135
135
  if trip_ids
136
- filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered.dup)
136
+ filtered = filter!("trips", {"trip_id" => trip_ids.implode}, filtered.dup)
137
137
  end
138
138
  end
139
139
 
@@ -174,6 +174,171 @@ module GtfsDf
174
174
  send("#{file_name}=", value)
175
175
  end
176
176
 
177
+ # Returns a DataFrame of all service_id/date pairs active in the feed.
178
+ # Columns: [date, service_id]
179
+ #
180
+ # @return [Polars::DataFrame]
181
+ def service_dates
182
+ start_date_col = Polars.col("start_date")
183
+ end_date_col = Polars.col("end_date")
184
+ date_col = Polars.col("date")
185
+
186
+ calendar_df = @calendar&.with_columns(
187
+ GtfsDf::Utils.parse_date(start_date_col),
188
+ GtfsDf::Utils.parse_date(end_date_col)
189
+ )
190
+
191
+ calendar_dates_df = @calendar_dates&.with_columns(
192
+ GtfsDf::Utils.parse_date(date_col)
193
+ )
194
+
195
+ # Expand calendar to a range of (service_id, date)
196
+ services_by_date = nil
197
+ if calendar_df
198
+ expanded = calendar_df.with_columns(
199
+ Polars.date_ranges(start_date_col, end_date_col, "1d").alias("date")
200
+ ).explode("date")
201
+
202
+ dow_col_names = [
203
+ "monday",
204
+ "tuesday",
205
+ "wednesday",
206
+ "thursday",
207
+ "friday",
208
+ "saturday",
209
+ "sunday"
210
+ ]
211
+
212
+ # Each day in the calendar table defines if a day of the week has service or not
213
+ # 1 - Service is available for all Mondays in the date range.
214
+ # 0 - Service is not available for Mondays in the date range.
215
+ # https://gtfs.org/documentation/schedule/reference/#calendartxt
216
+ #
217
+ # This filter will be applied to the expanded calendar dates, where the
218
+ # ranges become rows of individual dates, we need to ensure that each
219
+ # individual date matches the day of the week (DOW) before we check if
220
+ # it's enabled.
221
+ filter_expr = dow_col_names.each_with_index.reduce(Polars.lit(false)) do |expr, (dow_col_name, idx)|
222
+ # Polars weekday: Monday=1, Sunday=7
223
+ expr | ((Polars.col("date").dt.weekday == (idx + 1)) & (Polars.col(dow_col_name) == "1"))
224
+ end
225
+
226
+ services_by_date = expanded.filter(filter_expr).select("date", "service_id")
227
+ end
228
+
229
+ # Apply calendar_dates exceptions
230
+ if calendar_dates_df
231
+ exception_type_col = Polars.col("exception_type")
232
+
233
+ additions = calendar_dates_df
234
+ .filter(exception_type_col == "1")
235
+ .select("date", "service_id")
236
+
237
+ subtractions = calendar_dates_df
238
+ .filter(exception_type_col == "2")
239
+ .select("date", "service_id")
240
+
241
+ services_by_date = if services_by_date
242
+ # If we found service dates from the calendar table, we need to first
243
+ # add the inclusions, then remove the exceptions coming from the calendar_dates
244
+ services_by_date
245
+ .vstack(additions).unique
246
+ .join(subtractions, on: ["service_id", "date"], how: "anti")
247
+ else
248
+ # Otherwise, we can just use the additions as the new services_by_date
249
+ additions.unique
250
+ end
251
+ end
252
+
253
+ services_by_date
254
+ end
255
+
256
+ # Returns a DataFrame of trip counts per date.
257
+ # Columns: [date, count]
258
+ #
259
+ # @return [Polars::DataFrame]
260
+ def trip_count_dates
261
+ cached_service_dates = service_dates
262
+ return nil if cached_service_dates.nil? || cached_service_dates.height == 0
263
+
264
+ # This expression builds from the dataframe returned by frequency based
265
+ # trip counts, defaulting to 1 for the trips that don't have an entry in
266
+ # the frequencies table. We're defining the expression here just to
267
+ # remove some noise from the join below.
268
+ trip_size = Polars.coalesce("freq_count", Polars.lit(1)).alias("trip_size")
269
+
270
+ # Count trips per service_id, considering the possible size they may have
271
+ # from the frequencies table.
272
+ trip_counts = @trips
273
+ .join(frequency_based_trip_counts, on: "trip_id", how: "left")
274
+ .group_by("service_id")
275
+ .agg(trip_size.sum.alias("trip_count"))
276
+
277
+ # Join to services to get trips per date
278
+ daily_trips = cached_service_dates
279
+ .join(trip_counts, on: "service_id", how: "left")
280
+ .with_columns(Polars.col("trip_count").fill_null(0))
281
+
282
+ # Sum trips per date
283
+ daily_trips.group_by("date").agg(Polars.col("trip_count").sum.alias("count"))
284
+ end
285
+
286
+ # Returns a DataFrame of trip counts from the frequencies table
287
+ # Columns: [trip_id, freq_count]
288
+ #
289
+ # @return [Polars::DataFrame]
290
+ def frequency_based_trip_counts
291
+ # If the feed was initialized with the parse_times flag, we already have
292
+ # seconds since midnight in these columns, otherwise we need to convert
293
+ # them first, so we can get the duration in seconds
294
+ end_time_seconds_col, start_time_seconds_col = if @parse_times
295
+ [Polars.col("end_time"), Polars.col("start_time")]
296
+ else
297
+ [
298
+ GtfsDf::Utils.as_seconds_since_midnight("end_time"),
299
+ GtfsDf::Utils.as_seconds_since_midnight("start_time")
300
+ ]
301
+ end
302
+
303
+ duration_seconds = (end_time_seconds_col - start_time_seconds_col).alias("duration_seconds")
304
+ count = (duration_seconds / Polars.col("headway_secs")).floor.sum.alias("freq_count")
305
+
306
+ # The frequencies table is optional, we default to an empty dataframe to
307
+ # remove friction in the join with trips.
308
+ if @frequencies
309
+ @frequencies.group_by("trip_id").agg(count).select("trip_id", "freq_count")
310
+ else
311
+ Polars::DataFrame.new(
312
+ {"trip_id" => [], "freq_count" => []},
313
+ schema: {"trip_id" => Polars::String, "freq_count" => Polars::Float64}
314
+ )
315
+ end
316
+ end
317
+
318
+ # Identifies the start date of the busiest week in the feed by trip count.
319
+ #
320
+ # @return [Date] The Monday of the busiest week
321
+ def busiest_week
322
+ daily_total = trip_count_dates
323
+ return nil if daily_total.nil? || daily_total.height == 0
324
+
325
+ # Group by week (ISO week, starting Monday)
326
+ weekly_agg = daily_total
327
+ .with_columns(Polars.col("date").dt.truncate("1w").alias("week_start"))
328
+ .group_by("week_start")
329
+ .agg(Polars.col("count").sum.alias("total_trips"))
330
+
331
+ # Get the week with max trips
332
+ # Sort by total_trips descending, then date ascending to pick the earliest date in case of a tie
333
+ sorted_weeks = weekly_agg.sort(["total_trips", "week_start"], reverse: [true, false])
334
+ best_week = sorted_weeks.head(1)
335
+
336
+ return nil if best_week.height == 0
337
+
338
+ # Return the start date of the busiest week
339
+ best_week["week_start"][0]
340
+ end
341
+
177
342
  private
178
343
 
179
344
  def filter!(file, filters, filtered, filter_only_children: false)
@@ -181,7 +346,7 @@ module GtfsDf
181
346
  df = filtered[file]
182
347
 
183
348
  filters.each do |col, val|
184
- df = if val.is_a?(Array)
349
+ df = if val.is_a?(Polars::Series) || val.is_a?(Array)
185
350
  df.filter(Polars.col(col).is_in(val))
186
351
  elsif val.respond_to?(:call)
187
352
  df.filter(val.call(Polars.col(col)))
@@ -200,9 +365,16 @@ module GtfsDf
200
365
 
201
366
  # Traverses the graph to prune unreferenced entities from child dataframes
202
367
  # based on parent relationships. See GtfsDf::Graph::STOP_NODES
368
+ #
369
+ # The trips table has multiple parents (calendar, calendar_dates, routes,
370
+ # stop_times). We accumulate valid values from all of them and keep rows
371
+ # that match any parent, so trips referenced only via calendar_dates are
372
+ # not dropped when another edge is processed first.
203
373
  def prune!(root, filtered, filter_only_children: false)
204
374
  seen_edges = Set.new
205
375
  rerooted_graph = Graph.build(bidirectional: !filter_only_children)
376
+ accumulated_service_ids = Polars::Series.new("service_id", dtype: Polars::String)
377
+ trips_base_df = nil
206
378
 
207
379
  queue = [root]
208
380
 
@@ -245,37 +417,46 @@ module GtfsDf
245
417
  attrs[:dependencies].each do |dep|
246
418
  parent_col = dep[parent_node_id]
247
419
  child_col = dep[child_node_id]
248
- allow_null = !!dep[:allow_null]
420
+ allow_null_flag = !!dep[:allow_null]
249
421
 
250
422
  next unless parent_col && child_col &&
251
423
  parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
252
424
 
253
425
  # Get valid values from parent
254
- valid_values = parent_df[parent_col].to_a.uniq.compact
255
-
256
- # Annoying special case to make sure that if we have a calendar with exceptions,
257
- # the calendar_dates file doesn't end up pruning other files
258
- if parent_node_id == "calendar_dates" && parent_col == "service_id" &&
259
- filtered["calendar"]
260
- valid_values = (valid_values + calendar["service_id"].to_a).uniq
261
- end
262
-
263
- # Filter child to only include rows that reference valid parent values
264
- before = child_df.height
265
- filter = Polars.col(child_col).is_in(valid_values)
266
- if allow_null
267
- filter = (filter | Polars.col(child_col).is_null)
268
- end
269
- child_df = child_df.filter(filter)
270
- changed = child_df.height < before
271
-
272
- # If we removed a part of the child_df earlier, concat it back on
273
- if saved_vals
274
- child_df = Polars.concat([child_df, saved_vals], how: "vertical")
275
- end
276
-
277
- if changed
278
- filtered[child_node.fetch(:file)] = child_df
426
+ valid_values = parent_df[parent_col].drop_nulls.unique
427
+
428
+ if child_node_id == "trips" && (parent_node_id == "calendar" || parent_node_id == "calendar_dates")
429
+ # Calendar + calendar_dates both define service for the same trips, so we want
430
+ # union semantics across those two parents (a trip is valid if it appears in
431
+ # either).
432
+ #
433
+ # Here we accumulate valid service_ids across calendar/calendar_dates, but only
434
+ # within the pool of trips that are already reachable from structural parents.
435
+ accumulated_service_ids = Polars.concat([accumulated_service_ids, valid_values]).unique
436
+
437
+ # Determine the base pool of trips:
438
+ # - If we've already restricted trips via structural parents (routes,
439
+ # stop_times, shapes, etc), use that as the base.
440
+ # - Otherwise, like when filtering directly on trips, use the current
441
+ # trips dataframe.
442
+ trips_base_df ||= filtered[child_node.fetch(:file)]
443
+ next unless trips_base_df && trips_base_df.height > 0
444
+
445
+ filtered[child_node.fetch(:file)] = trips_base_df.filter(
446
+ Polars.col("service_id").is_in(accumulated_service_ids.implode)
447
+ )
448
+ else
449
+ # Original single-edge logic for all other nodes
450
+ before = child_df.height
451
+
452
+ cond = Polars.col(child_col).is_in(valid_values.implode)
453
+ cond = (cond | Polars.col(child_col).is_null) if allow_null_flag
454
+ child_df = child_df.filter(cond)
455
+
456
+ if child_df.height < before
457
+ child_df = Polars.concat([child_df, saved_vals], how: "vertical") if saved_vals
458
+ filtered[child_node.fetch(:file)] = child_df
459
+ end
279
460
  end
280
461
  end
281
462
  end
data/lib/gtfs_df/utils.rb CHANGED
@@ -6,53 +6,6 @@ module GtfsDf
6
6
  SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
7
7
  SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
8
8
 
9
- # Parses a GTFS time string to seconds since midnight
10
- #
11
- # The input string is expected to be in the HH:MM:SS format (H:MM:SS is
12
- # also accepted).
13
- #
14
- # The time is measured from "noon minus 12h" of the service day
15
- # (effectively midnight except for days on which daylight savings time
16
- # changes occur). For times occurring after midnight on the service day,
17
- # enter the time as a value greater than 24:00:00 in HH:MM:SS.
18
- #
19
- # @example 14:30:00 for 2:30PM or
20
- # 25:35:00 for 1:35AM on the next day.
21
- #
22
- # @param str String|Integer
23
- # @return Integer|nil seconds since midnight, or nil if invalid
24
- def parse_time(str)
25
- return str if str.is_a?(Integer)
26
- return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
27
-
28
- parts = str.to_s.split(":")
29
- return nil unless parts.size == 3 && parts.all? { |p| p.match?(/^\d+$/) }
30
-
31
- hours, mins, secs = parts.map(&:to_i)
32
- hours * 3600 + mins * 60 + secs
33
- rescue
34
- nil
35
- end
36
-
37
- # Formats seconds since midnight as a GTFS time string (HH:MM:SS)
38
- #
39
- # Handles times greater than 24 hours for times that span past midnight.
40
- #
41
- # @param seconds Integer seconds since midnight
42
- # @return String|nil time in HH:MM:SS format, or nil if invalid
43
- def format_time(seconds)
44
- return nil if seconds.nil?
45
- return seconds if seconds.is_a?(String)
46
-
47
- hours = seconds / SECONDS_IN_HOUR
48
- minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
49
- secs = seconds % SECONDS_IN_MINUTE
50
-
51
- format("%02d:%02d:%02d", hours, minutes, secs)
52
- rescue
53
- nil
54
- end
55
-
56
9
  # Converts a GTFS time string column to seconds since midnight
57
10
  #
58
11
  # Use this method with Polars DataFrames to convert time columns.
@@ -118,16 +71,9 @@ module GtfsDf
118
71
  #
119
72
  # @example 20180913 for September 13th, 2018.
120
73
  #
121
- # @param str String
122
- def parse_date(str)
123
- return nil if str.nil? || str.strip.empty?
124
- return nil unless str.match?(/^\d{8}$/)
125
-
126
- begin
127
- Date.strptime(str, "%Y%m%d")
128
- rescue ArgumentError
129
- nil
130
- end
74
+ # @param col Polars::Expr
75
+ def parse_date(col)
76
+ col.str.strptime(Polars::Date, "%Y%m%d", strict: false)
131
77
  end
132
78
  end
133
79
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.9.2"
4
+ VERSION = "0.10.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.2
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado