gtfs_df 0.9.3 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/examples/split-by-agency/Gemfile.lock +1 -1
- data/lib/gtfs_df/base_gtfs_table.rb +7 -0
- data/lib/gtfs_df/feed.rb +165 -0
- data/lib/gtfs_df/utils.rb +3 -57
- data/lib/gtfs_df/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0be3962480f99e4a2194d6e7da248d5ef82dc5479491ac807211b3dc4d69415a
|
|
4
|
+
data.tar.gz: ceeaf189058006c0db3b26e6438b8c7fc7a0a077a48f192b588b53e74de695e8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1f27c1b9493aaf2220dfe9369a06a9c5cc26bcae2b9777b0d33bb061a47e1c176eea309580f893fad53dd80b3e7148e84a65ed6f3e0752ac6203c961c991d4ec
|
|
7
|
+
data.tar.gz: ad94ff3fa5b2fb69f119fdcac7b8c14df0c2ad7863e42b692d10f5eb0bda151f204618ae8fd7209e9de5ebe0684187530e0e4f789aefaaabf340c8890e9d5df3
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
## [0.10.0] - 2026-03-06
|
|
2
|
+
|
|
3
|
+
### 🚀 Features
|
|
4
|
+
|
|
5
|
+
- Date parsing utility
|
|
6
|
+
- Calendar-based utilities
|
|
7
|
+
- Consider frequencies when counting trips
|
|
8
|
+
|
|
9
|
+
### 🐛 Bug Fixes
|
|
10
|
+
|
|
11
|
+
- Service dates and busiest week fixes
|
|
1
12
|
## [0.9.3] - 2026-02-27
|
|
2
13
|
|
|
3
14
|
### 🐛 Bug Fixes
|
|
@@ -18,6 +29,7 @@
|
|
|
18
29
|
- Bump version to 0.9.2
|
|
19
30
|
- Avoid converting series into arrays
|
|
20
31
|
- Simplify trip pool reduction
|
|
32
|
+
- Bump version to 0.9.3
|
|
21
33
|
## [0.9.1] - 2026-02-17
|
|
22
34
|
|
|
23
35
|
### 🐛 Bug Fixes
|
data/lib/gtfs_df/feed.rb
CHANGED
|
@@ -174,6 +174,171 @@ module GtfsDf
|
|
|
174
174
|
send("#{file_name}=", value)
|
|
175
175
|
end
|
|
176
176
|
|
|
177
|
+
# Returns a DataFrame of all service_id/date pairs active in the feed.
|
|
178
|
+
# Columns: [date, service_id]
|
|
179
|
+
#
|
|
180
|
+
# @return [Polars::DataFrame]
|
|
181
|
+
def service_dates
|
|
182
|
+
start_date_col = Polars.col("start_date")
|
|
183
|
+
end_date_col = Polars.col("end_date")
|
|
184
|
+
date_col = Polars.col("date")
|
|
185
|
+
|
|
186
|
+
calendar_df = @calendar&.with_columns(
|
|
187
|
+
GtfsDf::Utils.parse_date(start_date_col),
|
|
188
|
+
GtfsDf::Utils.parse_date(end_date_col)
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
calendar_dates_df = @calendar_dates&.with_columns(
|
|
192
|
+
GtfsDf::Utils.parse_date(date_col)
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# Expand calendar to a range of (service_id, date)
|
|
196
|
+
services_by_date = nil
|
|
197
|
+
if calendar_df
|
|
198
|
+
expanded = calendar_df.with_columns(
|
|
199
|
+
Polars.date_ranges(start_date_col, end_date_col, "1d").alias("date")
|
|
200
|
+
).explode("date")
|
|
201
|
+
|
|
202
|
+
dow_col_names = [
|
|
203
|
+
"monday",
|
|
204
|
+
"tuesday",
|
|
205
|
+
"wednesday",
|
|
206
|
+
"thursday",
|
|
207
|
+
"friday",
|
|
208
|
+
"saturday",
|
|
209
|
+
"sunday"
|
|
210
|
+
]
|
|
211
|
+
|
|
212
|
+
# Each day in the calendar table defines if a day of the week has service or not
|
|
213
|
+
# 1 - Service is available for all Mondays in the date range.
|
|
214
|
+
# 0 - Service is not available for Mondays in the date range.
|
|
215
|
+
# https://gtfs.org/documentation/schedule/reference/#calendartxt
|
|
216
|
+
#
|
|
217
|
+
# This filter will be applied to the expanded calendar dates, where the
|
|
218
|
+
# ranges become rows of individual dates, we need to ensure that each
|
|
219
|
+
# individual date matches the day of the week (DOW) before we check if
|
|
220
|
+
# it's enabled.
|
|
221
|
+
filter_expr = dow_col_names.each_with_index.reduce(Polars.lit(false)) do |expr, (dow_col_name, idx)|
|
|
222
|
+
# Polars weekday: Monday=1, Sunday=7
|
|
223
|
+
expr | ((Polars.col("date").dt.weekday == (idx + 1)) & (Polars.col(dow_col_name) == "1"))
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
services_by_date = expanded.filter(filter_expr).select("date", "service_id")
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
# Apply calendar_dates exceptions
|
|
230
|
+
if calendar_dates_df
|
|
231
|
+
exception_type_col = Polars.col("exception_type")
|
|
232
|
+
|
|
233
|
+
additions = calendar_dates_df
|
|
234
|
+
.filter(exception_type_col == "1")
|
|
235
|
+
.select("date", "service_id")
|
|
236
|
+
|
|
237
|
+
subtractions = calendar_dates_df
|
|
238
|
+
.filter(exception_type_col == "2")
|
|
239
|
+
.select("date", "service_id")
|
|
240
|
+
|
|
241
|
+
services_by_date = if services_by_date
|
|
242
|
+
# If we found service dates from the calendar table, we need to first
|
|
243
|
+
# add the inclusions, then remove the exceptions coming from the calendar_dates
|
|
244
|
+
services_by_date
|
|
245
|
+
.vstack(additions).unique
|
|
246
|
+
.join(subtractions, on: ["service_id", "date"], how: "anti")
|
|
247
|
+
else
|
|
248
|
+
# Otherwise, we can just use the additions as the new services_by_date
|
|
249
|
+
additions.unique
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
services_by_date
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
# Returns a DataFrame of trip counts per date.
|
|
257
|
+
# Columns: [date, count]
|
|
258
|
+
#
|
|
259
|
+
# @return [Polars::DataFrame]
|
|
260
|
+
def trip_count_dates
|
|
261
|
+
cached_service_dates = service_dates
|
|
262
|
+
return nil if cached_service_dates.nil? || cached_service_dates.height == 0
|
|
263
|
+
|
|
264
|
+
# This expression builds from the dataframe returned by frequency based
|
|
265
|
+
# trip counts, defaulting to 1 for the trips that don't have an entry in
|
|
266
|
+
# the frequencies table. We're defining the expression here just to
|
|
267
|
+
# remove some noise from the join below.
|
|
268
|
+
trip_size = Polars.coalesce("freq_count", Polars.lit(1)).alias("trip_size")
|
|
269
|
+
|
|
270
|
+
# Count trips per service_id, considering the possible size they may have
|
|
271
|
+
# from the frequencies table.
|
|
272
|
+
trip_counts = @trips
|
|
273
|
+
.join(frequency_based_trip_counts, on: "trip_id", how: "left")
|
|
274
|
+
.group_by("service_id")
|
|
275
|
+
.agg(trip_size.sum.alias("trip_count"))
|
|
276
|
+
|
|
277
|
+
# Join to services to get trips per date
|
|
278
|
+
daily_trips = cached_service_dates
|
|
279
|
+
.join(trip_counts, on: "service_id", how: "left")
|
|
280
|
+
.with_columns(Polars.col("trip_count").fill_null(0))
|
|
281
|
+
|
|
282
|
+
# Sum trips per date
|
|
283
|
+
daily_trips.group_by("date").agg(Polars.col("trip_count").sum.alias("count"))
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
# Returns a DataFrame of trip counts from the frequencies table
|
|
287
|
+
# Columns: [trip_id, freq_count]
|
|
288
|
+
#
|
|
289
|
+
# @return [Polars::DataFrame]
|
|
290
|
+
def frequency_based_trip_counts
|
|
291
|
+
# If the feed was initialized with the parse_times flag, we already have
|
|
292
|
+
# seconds since midnight in these columns, otherwise we need to convert
|
|
293
|
+
# them first, so we can get the duration in seconds
|
|
294
|
+
end_time_seconds_col, start_time_seconds_col = if @parse_times
|
|
295
|
+
[Polars.col("end_time"), Polars.col("start_time")]
|
|
296
|
+
else
|
|
297
|
+
[
|
|
298
|
+
GtfsDf::Utils.as_seconds_since_midnight("end_time"),
|
|
299
|
+
GtfsDf::Utils.as_seconds_since_midnight("start_time")
|
|
300
|
+
]
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
duration_seconds = (end_time_seconds_col - start_time_seconds_col).alias("duration_seconds")
|
|
304
|
+
count = (duration_seconds / Polars.col("headway_secs")).floor.sum.alias("freq_count")
|
|
305
|
+
|
|
306
|
+
# The frequencies table is optional, we default to an empty dataframe to
|
|
307
|
+
# remove friction in the join with trips.
|
|
308
|
+
if @frequencies
|
|
309
|
+
@frequencies.group_by("trip_id").agg(count).select("trip_id", "freq_count")
|
|
310
|
+
else
|
|
311
|
+
Polars::DataFrame.new(
|
|
312
|
+
{"trip_id" => [], "freq_count" => []},
|
|
313
|
+
schema: {"trip_id" => Polars::String, "freq_count" => Polars::Float64}
|
|
314
|
+
)
|
|
315
|
+
end
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
# Identifies the start date of the busiest week in the feed by trip count.
|
|
319
|
+
#
|
|
320
|
+
# @return [Date] The Monday of the busiest week
|
|
321
|
+
def busiest_week
|
|
322
|
+
daily_total = trip_count_dates
|
|
323
|
+
return nil if daily_total.nil? || daily_total.height == 0
|
|
324
|
+
|
|
325
|
+
# Group by week (ISO week, starting Monday)
|
|
326
|
+
weekly_agg = daily_total
|
|
327
|
+
.with_columns(Polars.col("date").dt.truncate("1w").alias("week_start"))
|
|
328
|
+
.group_by("week_start")
|
|
329
|
+
.agg(Polars.col("count").sum.alias("total_trips"))
|
|
330
|
+
|
|
331
|
+
# Get the week with max trips
|
|
332
|
+
# Sort by total_trips descending, then date ascending to pick the earliest date in case of a tie
|
|
333
|
+
sorted_weeks = weekly_agg.sort(["total_trips", "week_start"], reverse: [true, false])
|
|
334
|
+
best_week = sorted_weeks.head(1)
|
|
335
|
+
|
|
336
|
+
return nil if best_week.height == 0
|
|
337
|
+
|
|
338
|
+
# Return the start date of the busiest week
|
|
339
|
+
best_week["week_start"][0]
|
|
340
|
+
end
|
|
341
|
+
|
|
177
342
|
private
|
|
178
343
|
|
|
179
344
|
def filter!(file, filters, filtered, filter_only_children: false)
|
data/lib/gtfs_df/utils.rb
CHANGED
|
@@ -6,53 +6,6 @@ module GtfsDf
|
|
|
6
6
|
SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
|
|
7
7
|
SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
|
|
8
8
|
|
|
9
|
-
# Parses a GTFS time string to seconds since midnight
|
|
10
|
-
#
|
|
11
|
-
# The input string is expected to be in the HH:MM:SS format (H:MM:SS is
|
|
12
|
-
# also accepted).
|
|
13
|
-
#
|
|
14
|
-
# The time is measured from "noon minus 12h" of the service day
|
|
15
|
-
# (effectively midnight except for days on which daylight savings time
|
|
16
|
-
# changes occur). For times occurring after midnight on the service day,
|
|
17
|
-
# enter the time as a value greater than 24:00:00 in HH:MM:SS.
|
|
18
|
-
#
|
|
19
|
-
# @example 14:30:00 for 2:30PM or
|
|
20
|
-
# 25:35:00 for 1:35AM on the next day.
|
|
21
|
-
#
|
|
22
|
-
# @param str String|Integer
|
|
23
|
-
# @return Integer|nil seconds since midnight, or nil if invalid
|
|
24
|
-
def parse_time(str)
|
|
25
|
-
return str if str.is_a?(Integer)
|
|
26
|
-
return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
|
|
27
|
-
|
|
28
|
-
parts = str.to_s.split(":")
|
|
29
|
-
return nil unless parts.size == 3 && parts.all? { |p| p.match?(/^\d+$/) }
|
|
30
|
-
|
|
31
|
-
hours, mins, secs = parts.map(&:to_i)
|
|
32
|
-
hours * 3600 + mins * 60 + secs
|
|
33
|
-
rescue
|
|
34
|
-
nil
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# Formats seconds since midnight as a GTFS time string (HH:MM:SS)
|
|
38
|
-
#
|
|
39
|
-
# Handles times greater than 24 hours for times that span past midnight.
|
|
40
|
-
#
|
|
41
|
-
# @param seconds Integer seconds since midnight
|
|
42
|
-
# @return String|nil time in HH:MM:SS format, or nil if invalid
|
|
43
|
-
def format_time(seconds)
|
|
44
|
-
return nil if seconds.nil?
|
|
45
|
-
return seconds if seconds.is_a?(String)
|
|
46
|
-
|
|
47
|
-
hours = seconds / SECONDS_IN_HOUR
|
|
48
|
-
minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
|
|
49
|
-
secs = seconds % SECONDS_IN_MINUTE
|
|
50
|
-
|
|
51
|
-
format("%02d:%02d:%02d", hours, minutes, secs)
|
|
52
|
-
rescue
|
|
53
|
-
nil
|
|
54
|
-
end
|
|
55
|
-
|
|
56
9
|
# Converts a GTFS time string column to seconds since midnight
|
|
57
10
|
#
|
|
58
11
|
# Use this method with Polars DataFrames to convert time columns.
|
|
@@ -118,16 +71,9 @@ module GtfsDf
|
|
|
118
71
|
#
|
|
119
72
|
# @example 20180913 for September 13th, 2018.
|
|
120
73
|
#
|
|
121
|
-
# @param str String
|
|
122
|
-
def parse_date(str)
|
|
123
|
-
|
|
124
|
-
return nil unless str.match?(/^\d{8}$/)
|
|
125
|
-
|
|
126
|
-
begin
|
|
127
|
-
Date.strptime(str, "%Y%m%d")
|
|
128
|
-
rescue ArgumentError
|
|
129
|
-
nil
|
|
130
|
-
end
|
|
74
|
+
# @param col Polars::Expr
|
|
75
|
+
def parse_date(col)
|
|
76
|
+
col.str.strptime(Polars::Date, "%Y%m%d", strict: false)
|
|
131
77
|
end
|
|
132
78
|
end
|
|
133
79
|
end
|
data/lib/gtfs_df/version.rb
CHANGED