gtfs_df 0.9.3 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9c0746b1937afcfb7000425a59976b3eba8662778437ddf3adc02ec7729c3a49
4
- data.tar.gz: e3a553aef868b4c29e06731f0a5e3984efc067a096001e90150743a80f3d45bc
3
+ metadata.gz: 5200ee072c53d95cdb96af3fc8e0f15f9a15e2cc22684b5c9b2b2eeb4c103c8a
4
+ data.tar.gz: 843492a5f86c595d48431edcf2decd800debc1ead5dd2df0e77d89bf920c01ce
5
5
  SHA512:
6
- metadata.gz: 0fccae16bb46da6db651da04ea2591fa17e991219d4d8eb9be3a9444ed0b1b190b315f9223b738f399f3b6fade41c2c2e1081d1aad921a39483426d5b86b1aca
7
- data.tar.gz: b98eccb0861c46d8510eaaea6a97c48d6b1fa3687d1c35d445edbf19c349a6c03d45c3a2654da20ab577aa083003eff5335a8fcad6ee0d9f6f7f86a78c2ed33b
6
+ metadata.gz: be450515810d21760afbff3800887e3e5605052bd52eaf7df81951d6193c141dc9487d6b566b9caae5bd44a990051f54907ded2fa619406018afb32df8f3209a
7
+ data.tar.gz: b6efebb9c5190777a3698a2b747b8528a7a72598c96254fbc7c9ead11aceaa72de75d2a95fb5a4c60745a0aacb0234108e4e5c663650bd5e0d71b858881325ca
data/CHANGELOG.md CHANGED
@@ -1,3 +1,28 @@
1
+ ## [0.10.1] - 2026-03-18
2
+
3
+ ### 🐛 Bug Fixes
4
+
5
+ - Sanitize feed input
6
+
7
+ ### ⚙️ Miscellaneous Tasks
8
+
9
+ - Normalize platforms
10
+ - Ignore the examples folder on publishing
11
+ ## [0.10.0] - 2026-03-06
12
+
13
+ ### 🚀 Features
14
+
15
+ - Date parsing utility
16
+ - Calendar-based utilities
17
+ - Consider frequencies when counting trips
18
+
19
+ ### 🐛 Bug Fixes
20
+
21
+ - Service dates and busiest week fixes
22
+
23
+ ### ⚙️ Miscellaneous Tasks
24
+
25
+ - Bump version to 0.10.0
1
26
  ## [0.9.3] - 2026-02-27
2
27
 
3
28
  ### 🐛 Bug Fixes
@@ -18,6 +43,7 @@
18
43
  - Bump version to 0.9.2
19
44
  - Avoid converting series into arrays
20
45
  - Simplify trip pool reduction
46
+ - Bump version to 0.9.3
21
47
  ## [0.9.1] - 2026-02-17
22
48
 
23
49
  ### 🐛 Bug Fixes
@@ -55,5 +55,12 @@ module GtfsDf
55
55
  def dataframe
56
56
  @df
57
57
  end
58
+
59
+ def self.empty_dataframe
60
+ Polars::DataFrame.new(
61
+ const_get(:REQUIRED_FIELDS).map { |field| [field, []] }.to_h,
62
+ schema_overrides: const_get(:SCHEMA)
63
+ )
64
+ end
58
65
  end
59
66
  end
data/lib/gtfs_df/feed.rb CHANGED
@@ -174,6 +174,171 @@ module GtfsDf
174
174
  send("#{file_name}=", value)
175
175
  end
176
176
 
177
+ # Returns a DataFrame of all service_id/date pairs active in the feed.
178
+ # Columns: [date, service_id]
179
+ #
180
+ # @return [Polars::DataFrame]
181
+ def service_dates
182
+ start_date_col = Polars.col("start_date")
183
+ end_date_col = Polars.col("end_date")
184
+ date_col = Polars.col("date")
185
+
186
+ calendar_df = @calendar&.with_columns(
187
+ GtfsDf::Utils.parse_date(start_date_col),
188
+ GtfsDf::Utils.parse_date(end_date_col)
189
+ )
190
+
191
+ calendar_dates_df = @calendar_dates&.with_columns(
192
+ GtfsDf::Utils.parse_date(date_col)
193
+ )
194
+
195
+ # Expand calendar to a range of (service_id, date)
196
+ services_by_date = nil
197
+ if calendar_df
198
+ expanded = calendar_df.with_columns(
199
+ Polars.date_ranges(start_date_col, end_date_col, "1d").alias("date")
200
+ ).explode("date")
201
+
202
+ dow_col_names = [
203
+ "monday",
204
+ "tuesday",
205
+ "wednesday",
206
+ "thursday",
207
+ "friday",
208
+ "saturday",
209
+ "sunday"
210
+ ]
211
+
212
+ # Each day in the calendar table defines if a day of the week has service or not
213
+ # 1 - Service is available for all Mondays in the date range.
214
+ # 0 - Service is not available for Mondays in the date range.
215
+ # https://gtfs.org/documentation/schedule/reference/#calendartxt
216
+ #
217
+ # This filter will be applied to the expanded calendar dates, where the
218
+ # ranges become rows of individual dates, we need to ensure that each
219
+ # individual date matches the day of the week (DOW) before we check if
220
+ # it's enabled.
221
+ filter_expr = dow_col_names.each_with_index.reduce(Polars.lit(false)) do |expr, (dow_col_name, idx)|
222
+ # Polars weekday: Monday=1, Sunday=7
223
+ expr | ((Polars.col("date").dt.weekday == (idx + 1)) & (Polars.col(dow_col_name) == "1"))
224
+ end
225
+
226
+ services_by_date = expanded.filter(filter_expr).select("date", "service_id")
227
+ end
228
+
229
+ # Apply calendar_dates exceptions
230
+ if calendar_dates_df
231
+ exception_type_col = Polars.col("exception_type")
232
+
233
+ additions = calendar_dates_df
234
+ .filter(exception_type_col == "1")
235
+ .select("date", "service_id")
236
+
237
+ subtractions = calendar_dates_df
238
+ .filter(exception_type_col == "2")
239
+ .select("date", "service_id")
240
+
241
+ services_by_date = if services_by_date
242
+ # If we found service dates from the calendar table, we need to first
243
+ # add the inclusions, then remove the exceptions coming from the calendar_dates
244
+ services_by_date
245
+ .vstack(additions).unique
246
+ .join(subtractions, on: ["service_id", "date"], how: "anti")
247
+ else
248
+ # Otherwise, we can just use the additions as the new services_by_date
249
+ additions.unique
250
+ end
251
+ end
252
+
253
+ services_by_date
254
+ end
255
+
256
+ # Returns a DataFrame of trip counts per date.
257
+ # Columns: [date, count]
258
+ #
259
+ # @return [Polars::DataFrame]
260
+ def trip_count_dates
261
+ cached_service_dates = service_dates
262
+ return nil if cached_service_dates.nil? || cached_service_dates.height == 0
263
+
264
+ # This expression builds from the dataframe returned by frequency based
265
+ # trip counts, defaulting to 1 for the trips that don't have an entry in
266
+ # the frequencies table. We're defining the expression here just to
267
+ # remove some noise from the join below.
268
+ trip_size = Polars.coalesce("freq_count", Polars.lit(1)).alias("trip_size")
269
+
270
+ # Count trips per service_id, considering the possible size they may have
271
+ # from the frequencies table.
272
+ trip_counts = @trips
273
+ .join(frequency_based_trip_counts, on: "trip_id", how: "left")
274
+ .group_by("service_id")
275
+ .agg(trip_size.sum.alias("trip_count"))
276
+
277
+ # Join to services to get trips per date
278
+ daily_trips = cached_service_dates
279
+ .join(trip_counts, on: "service_id", how: "left")
280
+ .with_columns(Polars.col("trip_count").fill_null(0))
281
+
282
+ # Sum trips per date
283
+ daily_trips.group_by("date").agg(Polars.col("trip_count").sum.alias("count"))
284
+ end
285
+
286
+ # Returns a DataFrame of trip counts from the frequencies table
287
+ # Columns: [trip_id, freq_count]
288
+ #
289
+ # @return [Polars::DataFrame]
290
+ def frequency_based_trip_counts
291
+ # If the feed was initialized with the parse_times flag, we already have
292
+ # seconds since midnight in these columns, otherwise we need to convert
293
+ # them first, so we can get the duration in seconds
294
+ end_time_seconds_col, start_time_seconds_col = if @parse_times
295
+ [Polars.col("end_time"), Polars.col("start_time")]
296
+ else
297
+ [
298
+ GtfsDf::Utils.as_seconds_since_midnight("end_time"),
299
+ GtfsDf::Utils.as_seconds_since_midnight("start_time")
300
+ ]
301
+ end
302
+
303
+ duration_seconds = (end_time_seconds_col - start_time_seconds_col).alias("duration_seconds")
304
+ count = (duration_seconds / Polars.col("headway_secs")).floor.sum.alias("freq_count")
305
+
306
+ # The frequencies table is optional, we default to an empty dataframe to
307
+ # remove friction in the join with trips.
308
+ if @frequencies
309
+ @frequencies.group_by("trip_id").agg(count).select("trip_id", "freq_count")
310
+ else
311
+ Polars::DataFrame.new(
312
+ {"trip_id" => [], "freq_count" => []},
313
+ schema: {"trip_id" => Polars::String, "freq_count" => Polars::Float64}
314
+ )
315
+ end
316
+ end
317
+
318
+ # Identifies the start date of the busiest week in the feed by trip count.
319
+ #
320
+ # @return [Date] The Monday of the busiest week
321
+ def busiest_week
322
+ daily_total = trip_count_dates
323
+ return nil if daily_total.nil? || daily_total.height == 0
324
+
325
+ # Group by week (ISO week, starting Monday)
326
+ weekly_agg = daily_total
327
+ .with_columns(Polars.col("date").dt.truncate("1w").alias("week_start"))
328
+ .group_by("week_start")
329
+ .agg(Polars.col("count").sum.alias("total_trips"))
330
+
331
+ # Get the week with max trips
332
+ # Sort by total_trips descending, then date ascending to pick the earliest date in case of a tie
333
+ sorted_weeks = weekly_agg.sort(["total_trips", "week_start"], reverse: [true, false])
334
+ best_week = sorted_weeks.head(1)
335
+
336
+ return nil if best_week.height == 0
337
+
338
+ # Return the start date of the busiest week
339
+ best_week["week_start"][0]
340
+ end
341
+
177
342
  private
178
343
 
179
344
  def filter!(file, filters, filtered, filter_only_children: false)
@@ -10,10 +10,18 @@ module GtfsDf
10
10
  def self.load_from_zip(zip_path, parse_times: false)
11
11
  data = nil
12
12
 
13
+ relevant_files = GtfsDf::Feed::GTFS_FILES
14
+ .map { |name| "#{name}.txt" }
15
+ .to_set
16
+
13
17
  Dir.mktmpdir do |tmpdir|
14
18
  Zip::File.open(zip_path) do |zip_file|
15
19
  zip_file.each do |entry|
16
- next unless entry.file?
20
+ # We're skipping:
21
+ # - directories
22
+ # - unrelated files
23
+ # - empty feed files
24
+ next unless entry.file? && relevant_files.include?(entry.name) && has_header?(entry)
17
25
  entry.extract(destination_directory: tmpdir)
18
26
  end
19
27
  end
@@ -45,5 +53,11 @@ module GtfsDf
45
53
  schema_class_name = gtfs_file.split("_").map(&:capitalize).join
46
54
  GtfsDf::Schema.const_get(schema_class_name).new(path).df
47
55
  end
56
+
57
+ private_class_method def self.has_header?(zip_entry)
58
+ zip_entry.get_input_stream.readline.strip != ""
59
+ rescue
60
+ false
61
+ end
48
62
  end
49
63
  end
data/lib/gtfs_df/utils.rb CHANGED
@@ -6,53 +6,6 @@ module GtfsDf
6
6
  SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
7
7
  SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
8
8
 
9
- # Parses a GTFS time string to seconds since midnight
10
- #
11
- # The input string is expected to be in the HH:MM:SS format (H:MM:SS is
12
- # also accepted).
13
- #
14
- # The time is measured from "noon minus 12h" of the service day
15
- # (effectively midnight except for days on which daylight savings time
16
- # changes occur). For times occurring after midnight on the service day,
17
- # enter the time as a value greater than 24:00:00 in HH:MM:SS.
18
- #
19
- # @example 14:30:00 for 2:30PM or
20
- # 25:35:00 for 1:35AM on the next day.
21
- #
22
- # @param str String|Integer
23
- # @return Integer|nil seconds since midnight, or nil if invalid
24
- def parse_time(str)
25
- return str if str.is_a?(Integer)
26
- return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
27
-
28
- parts = str.to_s.split(":")
29
- return nil unless parts.size == 3 && parts.all? { |p| p.match?(/^\d+$/) }
30
-
31
- hours, mins, secs = parts.map(&:to_i)
32
- hours * 3600 + mins * 60 + secs
33
- rescue
34
- nil
35
- end
36
-
37
- # Formats seconds since midnight as a GTFS time string (HH:MM:SS)
38
- #
39
- # Handles times greater than 24 hours for times that span past midnight.
40
- #
41
- # @param seconds Integer seconds since midnight
42
- # @return String|nil time in HH:MM:SS format, or nil if invalid
43
- def format_time(seconds)
44
- return nil if seconds.nil?
45
- return seconds if seconds.is_a?(String)
46
-
47
- hours = seconds / SECONDS_IN_HOUR
48
- minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
49
- secs = seconds % SECONDS_IN_MINUTE
50
-
51
- format("%02d:%02d:%02d", hours, minutes, secs)
52
- rescue
53
- nil
54
- end
55
-
56
9
  # Converts a GTFS time string column to seconds since midnight
57
10
  #
58
11
  # Use this method with Polars DataFrames to convert time columns.
@@ -118,16 +71,9 @@ module GtfsDf
118
71
  #
119
72
  # @example 20180913 for September 13th, 2018.
120
73
  #
121
- # @param str String
122
- def parse_date(str)
123
- return nil if str.nil? || str.strip.empty?
124
- return nil unless str.match?(/^\d{8}$/)
125
-
126
- begin
127
- Date.strptime(str, "%Y%m%d")
128
- rescue ArgumentError
129
- nil
130
- end
74
+ # @param col Polars::Expr
75
+ def parse_date(col)
76
+ col.str.strptime(Polars::Date, "%Y%m%d", strict: false)
131
77
  end
132
78
  end
133
79
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.9.3"
4
+ VERSION = "0.10.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.3
4
+ version: 0.10.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado
@@ -90,11 +90,6 @@ files:
90
90
  - devenv.lock
91
91
  - devenv.nix
92
92
  - devenv.yaml
93
- - examples/split-by-agency/.gitignore
94
- - examples/split-by-agency/Gemfile
95
- - examples/split-by-agency/Gemfile.lock
96
- - examples/split-by-agency/README.md
97
- - examples/split-by-agency/split_by_agency.rb
98
93
  - lib/gtfs_df.rb
99
94
  - lib/gtfs_df/base_gtfs_table.rb
100
95
  - lib/gtfs_df/feed.rb
@@ -1 +0,0 @@
1
- output/
@@ -1,5 +0,0 @@
1
- source "https://gem.coop"
2
-
3
- gem "gtfs_df", path: "../.."
4
- gem "optparse"
5
- gem "whirly"
@@ -1,54 +0,0 @@
1
- PATH
2
- remote: ../..
3
- specs:
4
- gtfs_df (0.9.3)
5
- networkx (~> 0.4)
6
- polars-df (~> 0.22, < 0.24)
7
- rubyzip (>= 3.0, < 4.0)
8
-
9
- GEM
10
- remote: https://gem.coop/
11
- specs:
12
- bigdecimal (4.0.1)
13
- json (2.18.0)
14
- matrix (0.4.3)
15
- networkx (0.4.0)
16
- matrix (~> 0.4)
17
- rb_heap (~> 1.0)
18
- optparse (0.8.1)
19
- polars-df (0.23.0-aarch64-linux)
20
- bigdecimal
21
- polars-df (0.23.0-aarch64-linux-musl)
22
- bigdecimal
23
- polars-df (0.23.0-arm64-darwin)
24
- bigdecimal
25
- polars-df (0.23.0-x86_64-darwin)
26
- bigdecimal
27
- polars-df (0.23.0-x86_64-linux)
28
- bigdecimal
29
- polars-df (0.23.0-x86_64-linux-musl)
30
- bigdecimal
31
- rb_heap (1.1.0)
32
- rubyzip (3.2.2)
33
- unicode-display_width (3.2.0)
34
- unicode-emoji (~> 4.1)
35
- unicode-emoji (4.2.0)
36
- whirly (0.4.0)
37
- json
38
- unicode-display_width (>= 1.1)
39
-
40
- PLATFORMS
41
- aarch64-linux
42
- aarch64-linux-musl
43
- arm64-darwin
44
- x86_64-darwin
45
- x86_64-linux
46
- x86_64-linux-musl
47
-
48
- DEPENDENCIES
49
- gtfs_df!
50
- optparse
51
- whirly
52
-
53
- BUNDLED WITH
54
- 2.6.9
@@ -1,26 +0,0 @@
1
- # Split GTFS by Agency Example
2
-
3
- This example demonstrates how to split a GTFS zip file into multiple files, one for each specified `agency_id`, using the `gtfs_df` Ruby gem.
4
-
5
- ## Usage
6
-
7
- ```
8
- bundle install
9
- ruby split_by_agency.rb -i <input-gtfs.zip> --ids agency1,agency2
10
- ```
11
-
12
- - The output files will be written to the `output/` directory, named `<agency_id>.zip`.
13
-
14
- ## Options
15
- - `-i`, `--input PATH` — Path to the input GTFS zip file
16
- - `--ids IDS` — Comma-separated list of agency IDs to extract
17
-
18
- ## Example
19
-
20
- ```
21
- ruby split_by_agency.rb -i ../../spec/fixtures/sample_gtfs.zip --ids DTA,OTA
22
- ```
23
-
24
- ---
25
-
26
- This is a port of the [original Python script](https://gist.github.com/davidmh/f51e5d93a9213e0e606a43167ff87403) using Partridge.
@@ -1,63 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require "optparse"
5
- require "fileutils"
6
- require "gtfs_df"
7
- require "whirly"
8
-
9
- options = {}
10
- OptionParser.new do |opts|
11
- opts.banner = "Usage: split_by_agency.rb -i <input-gtfs.zip> --ids NUMBERS"
12
-
13
- opts.on("-i", "--input PATH", "Path to the input GTFS file") do |v|
14
- options[:input] = v
15
- end
16
- opts.on("--ids IDS", "Comma-separated list of agency_ids") do |v|
17
- options[:ids] = v
18
- end
19
- end.parse!
20
-
21
- unless options[:input] && options[:ids]
22
- warn "Both --input and --ids are required."
23
- exit 1
24
- end
25
-
26
- input_path = File.expand_path(options[:input])
27
- agency_ids = options[:ids].split(",")
28
- output_dir = File.expand_path("./output", __dir__)
29
- FileUtils.mkdir_p(output_dir)
30
-
31
- feed = nil
32
-
33
- Whirly.configure spinner: "dots", stop: "✓"
34
-
35
- Whirly.start do
36
- Whirly.status = "Loading"
37
-
38
- start_time = Time.now
39
- feed = GtfsDf::Reader.load_from_zip(input_path)
40
- elapsed = Time.now - start_time
41
-
42
- Whirly.status = "Loaded (#{elapsed.round(2)}s)"
43
- end
44
-
45
- agency_ids.each do |agency_id|
46
- Whirly.start do
47
- output_path = File.join(output_dir, "#{agency_id}.zip")
48
-
49
- start_time = Time.now
50
-
51
- Whirly.status = "-> #{agency_id} filtering..."
52
- filtered_feed = feed.filter({"agency" => {"agency_id" => agency_id}})
53
-
54
- Whirly.status = "-> #{agency_id} writing..."
55
- GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
56
-
57
- elapsed = Time.now - start_time
58
-
59
- Whirly.status = "-> #{agency_id}.zip (#{elapsed.round(2)}s)"
60
- end
61
- end
62
-
63
- puts "✓ Done, all files are stored in the output/ directory"