gtfs_df 0.9.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.conform.yaml +1 -0
- data/CHANGELOG.md +28 -1
- data/examples/split-by-agency/Gemfile.lock +1 -1
- data/lib/gtfs_df/base_gtfs_table.rb +7 -0
- data/lib/gtfs_df/feed.rb +212 -31
- data/lib/gtfs_df/utils.rb +3 -57
- data/lib/gtfs_df/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0be3962480f99e4a2194d6e7da248d5ef82dc5479491ac807211b3dc4d69415a
|
|
4
|
+
data.tar.gz: ceeaf189058006c0db3b26e6438b8c7fc7a0a077a48f192b588b53e74de695e8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1f27c1b9493aaf2220dfe9369a06a9c5cc26bcae2b9777b0d33bb061a47e1c176eea309580f893fad53dd80b3e7148e84a65ed6f3e0752ac6203c961c991d4ec
|
|
7
|
+
data.tar.gz: ad94ff3fa5b2fb69f119fdcac7b8c14df0c2ad7863e42b692d10f5eb0bda151f204618ae8fd7209e9de5ebe0684187530e0e4f789aefaaabf340c8890e9d5df3
|
data/.conform.yaml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,8 +1,35 @@
|
|
|
1
|
-
## [0.
|
|
1
|
+
## [0.10.0] - 2026-03-06
|
|
2
|
+
|
|
3
|
+
### 🚀 Features
|
|
4
|
+
|
|
5
|
+
- Date parsing utility
|
|
6
|
+
- Calendar-based utilities
|
|
7
|
+
- Consider frequencies when counting trips
|
|
8
|
+
|
|
9
|
+
### 🐛 Bug Fixes
|
|
10
|
+
|
|
11
|
+
- Service dates and busiest week fixes
|
|
12
|
+
## [0.9.3] - 2026-02-27
|
|
13
|
+
|
|
14
|
+
### 🐛 Bug Fixes
|
|
15
|
+
|
|
16
|
+
- Allow multiple filters
|
|
17
|
+
- Refactor prune to keep calendar_dates-only dependencies
|
|
2
18
|
|
|
3
19
|
### 📚 Documentation
|
|
4
20
|
|
|
5
21
|
- Add Brooke to the list of authors
|
|
22
|
+
|
|
23
|
+
### 🧪 Testing
|
|
24
|
+
|
|
25
|
+
- Ensure we don't drop trips and routes
|
|
26
|
+
|
|
27
|
+
### ⚙️ Miscellaneous Tasks
|
|
28
|
+
|
|
29
|
+
- Bump version to 0.9.2
|
|
30
|
+
- Avoid converting series into arrays
|
|
31
|
+
- Simplify trip pool reduction
|
|
32
|
+
- Bump version to 0.9.3
|
|
6
33
|
## [0.9.1] - 2026-02-17
|
|
7
34
|
|
|
8
35
|
### 🐛 Bug Fixes
|
data/lib/gtfs_df/feed.rb
CHANGED
|
@@ -121,19 +121,19 @@ module GtfsDf
|
|
|
121
121
|
# Trips are the atomic unit of GTFS, we will generate a new view
|
|
122
122
|
# based on the set of trips that would be included for each individual filter
|
|
123
123
|
# and cascade changes from this view in order to retain referential integrity
|
|
124
|
-
trip_ids =
|
|
124
|
+
trip_ids = Polars::Series.new.alias("trip_id")
|
|
125
125
|
|
|
126
126
|
view.each do |file, filters|
|
|
127
127
|
new_filtered = filter!(file, filters, filtered.dup)
|
|
128
|
-
trip_ids = if trip_ids.
|
|
128
|
+
trip_ids = if trip_ids.empty?
|
|
129
129
|
new_filtered["trips"]["trip_id"]
|
|
130
130
|
else
|
|
131
|
-
trip_ids
|
|
131
|
+
trip_ids.filter(trip_ids.is_in(new_filtered["trips"]["trip_id"]))
|
|
132
132
|
end
|
|
133
133
|
end
|
|
134
134
|
|
|
135
135
|
if trip_ids
|
|
136
|
-
filtered = filter!("trips", {"trip_id" => trip_ids.
|
|
136
|
+
filtered = filter!("trips", {"trip_id" => trip_ids.implode}, filtered.dup)
|
|
137
137
|
end
|
|
138
138
|
end
|
|
139
139
|
|
|
@@ -174,6 +174,171 @@ module GtfsDf
|
|
|
174
174
|
send("#{file_name}=", value)
|
|
175
175
|
end
|
|
176
176
|
|
|
177
|
+
# Returns a DataFrame of all service_id/date pairs active in the feed.
|
|
178
|
+
# Columns: [date, service_id]
|
|
179
|
+
#
|
|
180
|
+
# @return [Polars::DataFrame]
|
|
181
|
+
def service_dates
|
|
182
|
+
start_date_col = Polars.col("start_date")
|
|
183
|
+
end_date_col = Polars.col("end_date")
|
|
184
|
+
date_col = Polars.col("date")
|
|
185
|
+
|
|
186
|
+
calendar_df = @calendar&.with_columns(
|
|
187
|
+
GtfsDf::Utils.parse_date(start_date_col),
|
|
188
|
+
GtfsDf::Utils.parse_date(end_date_col)
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
calendar_dates_df = @calendar_dates&.with_columns(
|
|
192
|
+
GtfsDf::Utils.parse_date(date_col)
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# Expand calendar to a range of (service_id, date)
|
|
196
|
+
services_by_date = nil
|
|
197
|
+
if calendar_df
|
|
198
|
+
expanded = calendar_df.with_columns(
|
|
199
|
+
Polars.date_ranges(start_date_col, end_date_col, "1d").alias("date")
|
|
200
|
+
).explode("date")
|
|
201
|
+
|
|
202
|
+
dow_col_names = [
|
|
203
|
+
"monday",
|
|
204
|
+
"tuesday",
|
|
205
|
+
"wednesday",
|
|
206
|
+
"thursday",
|
|
207
|
+
"friday",
|
|
208
|
+
"saturday",
|
|
209
|
+
"sunday"
|
|
210
|
+
]
|
|
211
|
+
|
|
212
|
+
# Each day in the calendar table defines if a day of the week has service or not
|
|
213
|
+
# 1 - Service is available for all Mondays in the date range.
|
|
214
|
+
# 0 - Service is not available for Mondays in the date range.
|
|
215
|
+
# https://gtfs.org/documentation/schedule/reference/#calendartxt
|
|
216
|
+
#
|
|
217
|
+
# This filter will be applied to the expanded calendar dates, where the
|
|
218
|
+
# ranges become rows of individual dates, we need to ensure that each
|
|
219
|
+
# individual date matches the day of the week (DOW) before we check if
|
|
220
|
+
# it's enabled.
|
|
221
|
+
filter_expr = dow_col_names.each_with_index.reduce(Polars.lit(false)) do |expr, (dow_col_name, idx)|
|
|
222
|
+
# Polars weekday: Monday=1, Sunday=7
|
|
223
|
+
expr | ((Polars.col("date").dt.weekday == (idx + 1)) & (Polars.col(dow_col_name) == "1"))
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
services_by_date = expanded.filter(filter_expr).select("date", "service_id")
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
# Apply calendar_dates exceptions
|
|
230
|
+
if calendar_dates_df
|
|
231
|
+
exception_type_col = Polars.col("exception_type")
|
|
232
|
+
|
|
233
|
+
additions = calendar_dates_df
|
|
234
|
+
.filter(exception_type_col == "1")
|
|
235
|
+
.select("date", "service_id")
|
|
236
|
+
|
|
237
|
+
subtractions = calendar_dates_df
|
|
238
|
+
.filter(exception_type_col == "2")
|
|
239
|
+
.select("date", "service_id")
|
|
240
|
+
|
|
241
|
+
services_by_date = if services_by_date
|
|
242
|
+
# If we found service dates from the calendar table, we need to first
|
|
243
|
+
# add the inclusions, then remove the exceptions coming from the calendar_dates
|
|
244
|
+
services_by_date
|
|
245
|
+
.vstack(additions).unique
|
|
246
|
+
.join(subtractions, on: ["service_id", "date"], how: "anti")
|
|
247
|
+
else
|
|
248
|
+
# Otherwise, we can just use the additions as the new services_by_date
|
|
249
|
+
additions.unique
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
services_by_date
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
# Returns a DataFrame of trip counts per date.
|
|
257
|
+
# Columns: [date, count]
|
|
258
|
+
#
|
|
259
|
+
# @return [Polars::DataFrame]
|
|
260
|
+
def trip_count_dates
|
|
261
|
+
cached_service_dates = service_dates
|
|
262
|
+
return nil if cached_service_dates.nil? || cached_service_dates.height == 0
|
|
263
|
+
|
|
264
|
+
# This expression builds from the dataframe returned by frequency based
|
|
265
|
+
# trip counts, defaulting to 1 for the trips that don't have an entry in
|
|
266
|
+
# the frequencies table. We're defining the expression here just to
|
|
267
|
+
# remove some noise from the join below.
|
|
268
|
+
trip_size = Polars.coalesce("freq_count", Polars.lit(1)).alias("trip_size")
|
|
269
|
+
|
|
270
|
+
# Count trips per service_id, considering the possible size they may have
|
|
271
|
+
# from the frequencies table.
|
|
272
|
+
trip_counts = @trips
|
|
273
|
+
.join(frequency_based_trip_counts, on: "trip_id", how: "left")
|
|
274
|
+
.group_by("service_id")
|
|
275
|
+
.agg(trip_size.sum.alias("trip_count"))
|
|
276
|
+
|
|
277
|
+
# Join to services to get trips per date
|
|
278
|
+
daily_trips = cached_service_dates
|
|
279
|
+
.join(trip_counts, on: "service_id", how: "left")
|
|
280
|
+
.with_columns(Polars.col("trip_count").fill_null(0))
|
|
281
|
+
|
|
282
|
+
# Sum trips per date
|
|
283
|
+
daily_trips.group_by("date").agg(Polars.col("trip_count").sum.alias("count"))
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
# Returns a DataFrame of trip counts from the frequencies table
|
|
287
|
+
# Columns: [trip_id, freq_count]
|
|
288
|
+
#
|
|
289
|
+
# @return [Polars::DataFrame]
|
|
290
|
+
def frequency_based_trip_counts
|
|
291
|
+
# If the feed was initialized with the parse_times flag, we already have
|
|
292
|
+
# seconds since midnight in these columns, otherwise we need to convert
|
|
293
|
+
# them first, so we can get the duration in seconds
|
|
294
|
+
end_time_seconds_col, start_time_seconds_col = if @parse_times
|
|
295
|
+
[Polars.col("end_time"), Polars.col("start_time")]
|
|
296
|
+
else
|
|
297
|
+
[
|
|
298
|
+
GtfsDf::Utils.as_seconds_since_midnight("end_time"),
|
|
299
|
+
GtfsDf::Utils.as_seconds_since_midnight("start_time")
|
|
300
|
+
]
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
duration_seconds = (end_time_seconds_col - start_time_seconds_col).alias("duration_seconds")
|
|
304
|
+
count = (duration_seconds / Polars.col("headway_secs")).floor.sum.alias("freq_count")
|
|
305
|
+
|
|
306
|
+
# The frequencies table is optional, we default to an empty dataframe to
|
|
307
|
+
# remove friction in the join with trips.
|
|
308
|
+
if @frequencies
|
|
309
|
+
@frequencies.group_by("trip_id").agg(count).select("trip_id", "freq_count")
|
|
310
|
+
else
|
|
311
|
+
Polars::DataFrame.new(
|
|
312
|
+
{"trip_id" => [], "freq_count" => []},
|
|
313
|
+
schema: {"trip_id" => Polars::String, "freq_count" => Polars::Float64}
|
|
314
|
+
)
|
|
315
|
+
end
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
# Identifies the start date of the busiest week in the feed by trip count.
|
|
319
|
+
#
|
|
320
|
+
# @return [Date] The Monday of the busiest week
|
|
321
|
+
def busiest_week
|
|
322
|
+
daily_total = trip_count_dates
|
|
323
|
+
return nil if daily_total.nil? || daily_total.height == 0
|
|
324
|
+
|
|
325
|
+
# Group by week (ISO week, starting Monday)
|
|
326
|
+
weekly_agg = daily_total
|
|
327
|
+
.with_columns(Polars.col("date").dt.truncate("1w").alias("week_start"))
|
|
328
|
+
.group_by("week_start")
|
|
329
|
+
.agg(Polars.col("count").sum.alias("total_trips"))
|
|
330
|
+
|
|
331
|
+
# Get the week with max trips
|
|
332
|
+
# Sort by total_trips descending, then date ascending to pick the earliest date in case of a tie
|
|
333
|
+
sorted_weeks = weekly_agg.sort(["total_trips", "week_start"], reverse: [true, false])
|
|
334
|
+
best_week = sorted_weeks.head(1)
|
|
335
|
+
|
|
336
|
+
return nil if best_week.height == 0
|
|
337
|
+
|
|
338
|
+
# Return the start date of the busiest week
|
|
339
|
+
best_week["week_start"][0]
|
|
340
|
+
end
|
|
341
|
+
|
|
177
342
|
private
|
|
178
343
|
|
|
179
344
|
def filter!(file, filters, filtered, filter_only_children: false)
|
|
@@ -181,7 +346,7 @@ module GtfsDf
|
|
|
181
346
|
df = filtered[file]
|
|
182
347
|
|
|
183
348
|
filters.each do |col, val|
|
|
184
|
-
df = if val.is_a?(Array)
|
|
349
|
+
df = if val.is_a?(Polars::Series) || val.is_a?(Array)
|
|
185
350
|
df.filter(Polars.col(col).is_in(val))
|
|
186
351
|
elsif val.respond_to?(:call)
|
|
187
352
|
df.filter(val.call(Polars.col(col)))
|
|
@@ -200,9 +365,16 @@ module GtfsDf
|
|
|
200
365
|
|
|
201
366
|
# Traverses the graph to prune unreferenced entities from child dataframes
|
|
202
367
|
# based on parent relationships. See GtfsDf::Graph::STOP_NODES
|
|
368
|
+
#
|
|
369
|
+
# The trips table has multiple parents (calendar, calendar_dates, routes,
|
|
370
|
+
# stop_times). We accumulate valid values from all of them and keep rows
|
|
371
|
+
# that match any parent, so trips referenced only via calendar_dates are
|
|
372
|
+
# not dropped when another edge is processed first.
|
|
203
373
|
def prune!(root, filtered, filter_only_children: false)
|
|
204
374
|
seen_edges = Set.new
|
|
205
375
|
rerooted_graph = Graph.build(bidirectional: !filter_only_children)
|
|
376
|
+
accumulated_service_ids = Polars::Series.new("service_id", dtype: Polars::String)
|
|
377
|
+
trips_base_df = nil
|
|
206
378
|
|
|
207
379
|
queue = [root]
|
|
208
380
|
|
|
@@ -245,37 +417,46 @@ module GtfsDf
|
|
|
245
417
|
attrs[:dependencies].each do |dep|
|
|
246
418
|
parent_col = dep[parent_node_id]
|
|
247
419
|
child_col = dep[child_node_id]
|
|
248
|
-
|
|
420
|
+
allow_null_flag = !!dep[:allow_null]
|
|
249
421
|
|
|
250
422
|
next unless parent_col && child_col &&
|
|
251
423
|
parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
|
|
252
424
|
|
|
253
425
|
# Get valid values from parent
|
|
254
|
-
valid_values = parent_df[parent_col].
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
426
|
+
valid_values = parent_df[parent_col].drop_nulls.unique
|
|
427
|
+
|
|
428
|
+
if child_node_id == "trips" && (parent_node_id == "calendar" || parent_node_id == "calendar_dates")
|
|
429
|
+
# Calendar + calendar_dates both define service for the same trips, so we want
|
|
430
|
+
# union semantics across those two parents (a trip is valid if it appears in
|
|
431
|
+
# either).
|
|
432
|
+
#
|
|
433
|
+
# Here we accumulate valid service_ids across calendar/calendar_dates, but only
|
|
434
|
+
# within the pool of trips that are already reachable from structural parents.
|
|
435
|
+
accumulated_service_ids = Polars.concat([accumulated_service_ids, valid_values]).unique
|
|
436
|
+
|
|
437
|
+
# Determine the base pool of trips:
|
|
438
|
+
# - If we've already restricted trips via structural parents (routes,
|
|
439
|
+
# stop_times, shapes, etc), use that as the base.
|
|
440
|
+
# - Otherwise, like when filtering directly on trips, use the current
|
|
441
|
+
# trips dataframe.
|
|
442
|
+
trips_base_df ||= filtered[child_node.fetch(:file)]
|
|
443
|
+
next unless trips_base_df && trips_base_df.height > 0
|
|
444
|
+
|
|
445
|
+
filtered[child_node.fetch(:file)] = trips_base_df.filter(
|
|
446
|
+
Polars.col("service_id").is_in(accumulated_service_ids.implode)
|
|
447
|
+
)
|
|
448
|
+
else
|
|
449
|
+
# Original single-edge logic for all other nodes
|
|
450
|
+
before = child_df.height
|
|
451
|
+
|
|
452
|
+
cond = Polars.col(child_col).is_in(valid_values.implode)
|
|
453
|
+
cond = (cond | Polars.col(child_col).is_null) if allow_null_flag
|
|
454
|
+
child_df = child_df.filter(cond)
|
|
455
|
+
|
|
456
|
+
if child_df.height < before
|
|
457
|
+
child_df = Polars.concat([child_df, saved_vals], how: "vertical") if saved_vals
|
|
458
|
+
filtered[child_node.fetch(:file)] = child_df
|
|
459
|
+
end
|
|
279
460
|
end
|
|
280
461
|
end
|
|
281
462
|
end
|
data/lib/gtfs_df/utils.rb
CHANGED
|
@@ -6,53 +6,6 @@ module GtfsDf
|
|
|
6
6
|
SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
|
|
7
7
|
SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
|
|
8
8
|
|
|
9
|
-
# Parses a GTFS time string to seconds since midnight
|
|
10
|
-
#
|
|
11
|
-
# The input string is expected to be in the HH:MM:SS format (H:MM:SS is
|
|
12
|
-
# also accepted).
|
|
13
|
-
#
|
|
14
|
-
# The time is measured from "noon minus 12h" of the service day
|
|
15
|
-
# (effectively midnight except for days on which daylight savings time
|
|
16
|
-
# changes occur). For times occurring after midnight on the service day,
|
|
17
|
-
# enter the time as a value greater than 24:00:00 in HH:MM:SS.
|
|
18
|
-
#
|
|
19
|
-
# @example 14:30:00 for 2:30PM or
|
|
20
|
-
# 25:35:00 for 1:35AM on the next day.
|
|
21
|
-
#
|
|
22
|
-
# @param str String|Integer
|
|
23
|
-
# @return Integer|nil seconds since midnight, or nil if invalid
|
|
24
|
-
def parse_time(str)
|
|
25
|
-
return str if str.is_a?(Integer)
|
|
26
|
-
return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
|
|
27
|
-
|
|
28
|
-
parts = str.to_s.split(":")
|
|
29
|
-
return nil unless parts.size == 3 && parts.all? { |p| p.match?(/^\d+$/) }
|
|
30
|
-
|
|
31
|
-
hours, mins, secs = parts.map(&:to_i)
|
|
32
|
-
hours * 3600 + mins * 60 + secs
|
|
33
|
-
rescue
|
|
34
|
-
nil
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# Formats seconds since midnight as a GTFS time string (HH:MM:SS)
|
|
38
|
-
#
|
|
39
|
-
# Handles times greater than 24 hours for times that span past midnight.
|
|
40
|
-
#
|
|
41
|
-
# @param seconds Integer seconds since midnight
|
|
42
|
-
# @return String|nil time in HH:MM:SS format, or nil if invalid
|
|
43
|
-
def format_time(seconds)
|
|
44
|
-
return nil if seconds.nil?
|
|
45
|
-
return seconds if seconds.is_a?(String)
|
|
46
|
-
|
|
47
|
-
hours = seconds / SECONDS_IN_HOUR
|
|
48
|
-
minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
|
|
49
|
-
secs = seconds % SECONDS_IN_MINUTE
|
|
50
|
-
|
|
51
|
-
format("%02d:%02d:%02d", hours, minutes, secs)
|
|
52
|
-
rescue
|
|
53
|
-
nil
|
|
54
|
-
end
|
|
55
|
-
|
|
56
9
|
# Converts a GTFS time string column to seconds since midnight
|
|
57
10
|
#
|
|
58
11
|
# Use this method with Polars DataFrames to convert time columns.
|
|
@@ -118,16 +71,9 @@ module GtfsDf
|
|
|
118
71
|
#
|
|
119
72
|
# @example 20180913 for September 13th, 2018.
|
|
120
73
|
#
|
|
121
|
-
# @param
|
|
122
|
-
def parse_date(
|
|
123
|
-
|
|
124
|
-
return nil unless str.match?(/^\d{8}$/)
|
|
125
|
-
|
|
126
|
-
begin
|
|
127
|
-
Date.strptime(str, "%Y%m%d")
|
|
128
|
-
rescue ArgumentError
|
|
129
|
-
nil
|
|
130
|
-
end
|
|
74
|
+
# @param col Polars::Expr
|
|
75
|
+
def parse_date(col)
|
|
76
|
+
col.str.strptime(Polars::Date, "%Y%m%d", strict: false)
|
|
131
77
|
end
|
|
132
78
|
end
|
|
133
79
|
end
|
data/lib/gtfs_df/version.rb
CHANGED