gtfs_df 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7fd255d96a4b0990822cfe16cbbbeaa958b6b05f0df9fe149442ace3731034b
4
- data.tar.gz: 98d5d680a0c2dd8bcec10168d8cb08639c25b90a830bfb65f597aa229bd2fd5c
3
+ metadata.gz: 0b218f60e33e005576eb278a74ba89e6164fddccfe99e9f8e5665d105d9fa471
4
+ data.tar.gz: cb4023192418d5ba89f08b605b2ec8ac79073b28d29f6161f252d97df3291dff
5
5
  SHA512:
6
- metadata.gz: 72f6480afb171feffe387f02d069ee935e2373068099598b54789c6fe60982021a34d929b53e9509e0a478335f0a76fa2678cd2d71247e96a2c8394587f9f925
7
- data.tar.gz: 4153e5271961917a7cff004795c239daea3c9b31069500b3c376295821d250ea5dd85fa848276a06a9881e544d14ffcd798371304eeb576a5c49042d0f615fc3
6
+ metadata.gz: cefa279582b0579a637e4ebc39fe4c3d90d56582aa1cd0d00ffda041b92f681831334d46aeae95dc1c291c1295758c500a0c7bfe32876f226605f5b150fc0a9c
7
+ data.tar.gz: e4b5bbe129674304121c61cb732a8261421158f206370289e67d3e6d514bbb089bc0c35f88fde6e46012a18afd3700b86eb4c2a2b457b9880270ef3a4f74cf79
data/.conform.yaml CHANGED
@@ -23,3 +23,4 @@ policies:
23
23
  - deps
24
24
  - docs
25
25
  - ci
26
+ - qol
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 3.4.7
data/CHANGELOG.md CHANGED
@@ -1,8 +1,36 @@
1
+ ## [0.7.0] - 2025-12-30
2
+
3
+ ### 🚀 Features
4
+
5
+ - Add GTFS time parsing and formatting utilities
6
+ - Option to parse time fields as seconds since midnight
7
+ - Thread time columns through the input/output cycle
8
+ - Allow modifying the parse_time flag after extraction
9
+ - Provide a method to write to a directory
10
+
11
+ ### 💼 Other
12
+
13
+ - Silence devenv version warning
14
+ - Update example on release
15
+ - Replace byebug with pry-byebug
16
+ - Enable dependabot updates
17
+
18
+ ### 📚 Documentation
19
+
20
+ - Update filter examples to use explicit hash syntax
21
+
22
+ ### ⚙️ Miscellaneous Tasks
23
+
24
+ - Include the util helpers in the console and the test spec
1
25
  ## [0.6.2] - 2025-12-15
2
26
 
3
27
  ### 🐛 Bug Fixes
4
28
 
5
29
  - Permit non UTF-8 characters
30
+
31
+ ### ⚙️ Miscellaneous Tasks
32
+
33
+ - Bump version to 0.6.2
6
34
  ## [0.6.1] - 2025-12-12
7
35
 
8
36
  ### 🐛 Bug Fixes
data/README.md CHANGED
@@ -49,19 +49,19 @@ The library supports filtering feeds by any field in any table. The filter autom
49
49
 
50
50
  ```ruby
51
51
  # Filter by agency
52
- filtered_feed = feed.filter('agency' => { 'agency_id' => 'MTA' })
52
+ filtered_feed = feed.filter({ 'agency' => { 'agency_id' => 'MTA' } })
53
53
 
54
54
  # Filter by route
55
- filtered_feed = feed.filter('routes' => { 'route_id' => ['1', '2', '3'] })
55
+ filtered_feed = feed.filter({ 'routes' => { 'route_id' => ['1', '2', '3'] } })
56
56
 
57
57
  # Filter by a service
58
- filtered_feed = feed.filter('calendar' => { 'service_id' => 'WEEKDAY' })
58
+ filtered_feed = feed.filter({ 'calendar' => { 'service_id' => 'WEEKDAY' } })
59
59
 
60
60
  # Multiple filters
61
- filtered_feed = feed.filter(
61
+ filtered_feed = feed.filter({
62
62
  'agency' => { 'agency_id' => 'MTA' },
63
63
  'routes' => { 'route_type' => 1 } # Filter to subway routes
64
- )
64
+ })
65
65
  ```
66
66
 
67
67
  When you filter by a field, the library automatically:
data/devenv.nix CHANGED
@@ -1,9 +1,11 @@
1
- { pkgs, lib, config, inputs, ... }:
1
+ { pkgs, ... }:
2
2
 
3
3
  {
4
+ devenv.warnOnNewVersion = false;
5
+
4
6
  languages.ruby = {
5
7
  enable = true;
6
- version = "3.4.7";
8
+ versionFile = ./.ruby-version;
7
9
  bundler.enable = false;
8
10
  };
9
11
 
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: ../..
3
3
  specs:
4
- gtfs_df (0.1.1)
4
+ gtfs_df (0.7.0)
5
5
  networkx (~> 0.4)
6
6
  polars-df (~> 0.22)
7
7
  rubyzip (~> 2.3)
@@ -34,8 +34,12 @@ Whirly.configure spinner: "dots", stop: "✓"
34
34
 
35
35
  Whirly.start do
36
36
  Whirly.status = "Loading"
37
+
38
+ start_time = Time.now
37
39
  feed = GtfsDf::Reader.load_from_zip(input_path)
38
- Whirly.status = "Loaded"
40
+ elapsed = Time.now - start_time
41
+
42
+ Whirly.status = "Loaded (#{elapsed.round(2)}s)"
39
43
  end
40
44
 
41
45
  agency_ids.each do |agency_id|
@@ -45,7 +49,7 @@ agency_ids.each do |agency_id|
45
49
  start_time = Time.now
46
50
 
47
51
  Whirly.status = "-> #{agency_id} filtering..."
48
- filtered_feed = feed.filter("agency" => {"agency_id" => agency_id})
52
+ filtered_feed = feed.filter({"agency" => {"agency_id" => agency_id}})
49
53
 
50
54
  Whirly.status = "-> #{agency_id} writing..."
51
55
  GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
@@ -36,6 +36,10 @@ module GtfsDf
36
36
  self.class::SCHEMA.keys
37
37
  end
38
38
 
39
+ def self.time_fields
40
+ const_defined?(:TIME_FIELDS) ? const_get(:TIME_FIELDS) : []
41
+ end
42
+
39
43
  def valid?
40
44
  @validator.valid?
41
45
  end
data/lib/gtfs_df/feed.rb CHANGED
@@ -37,12 +37,17 @@ module GtfsDf
37
37
  ].freeze
38
38
 
39
39
  attr_accessor(*GTFS_FILES)
40
+ attr_accessor(:parse_times)
40
41
  attr_reader(:graph)
41
42
 
42
43
  # Initialize with a hash of DataFrames
43
44
  REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
44
45
 
45
- def initialize(data = {})
46
+ # @param data [Hash] Hash of DataFrames for each GTFS file
47
+ # @param parse_times [Boolean] Whether to parse time fields to seconds since midnight (default: false)
48
+ def initialize(data = {}, parse_times: false)
49
+ @parse_times = parse_times
50
+
46
51
  missing = REQUIRED_GTFS_FILES.reject { |file| data[file].is_a?(Polars::DataFrame) }
47
52
  # At least one of calendar or calendar_dates must be present
48
53
  unless data["calendar"].is_a?(Polars::DataFrame) || data["calendar_dates"].is_a?(Polars::DataFrame)
@@ -66,6 +71,19 @@ module GtfsDf
66
71
  end
67
72
  if df.is_a?(Polars::DataFrame) && schema_class && schema_class.const_defined?(:SCHEMA)
68
73
  df = schema_class.new(df).df
74
+ # Parse time fields if enabled and they're still strings
75
+ if @parse_times && schema_class.respond_to?(:time_fields)
76
+ time_fields = schema_class.time_fields
77
+ time_fields.each do |field|
78
+ next unless df.columns.include?(field)
79
+ # Only parse if the field is still a string (not already parsed)
80
+ if df[field].dtype == Polars::String
81
+ df = df.with_columns(
82
+ GtfsDf::Utils.as_seconds_since_midnight(field)
83
+ )
84
+ end
85
+ end
86
+ end
69
87
  end
70
88
  instance_variable_set("@#{file}", df.is_a?(Polars::DataFrame) ? df : nil)
71
89
  end
@@ -127,7 +145,7 @@ module GtfsDf
127
145
 
128
146
  (!df || df.height == 0) && !is_required_file
129
147
  end
130
- self.class.new(filtered)
148
+ self.class.new(filtered, parse_times: @parse_times)
131
149
  end
132
150
 
133
151
  private
@@ -3,7 +3,11 @@
3
3
  module GtfsDf
4
4
  class Reader
5
5
  # Loads a GTFS zip file and returns a Feed
6
- def self.load_from_zip(zip_path)
6
+ #
7
+ # @param zip_path [String] Path to the GTFS zip file
8
+ # @param parse_times [Boolean] Whether to parse time fields to seconds since midnight (default: false)
9
+ # @return [Feed] The loaded GTFS feed
10
+ def self.load_from_zip(zip_path, parse_times: false)
7
11
  data = nil
8
12
 
9
13
  Dir.mktmpdir do |tmpdir|
@@ -15,14 +19,18 @@ module GtfsDf
15
19
  end
16
20
  end
17
21
 
18
- data = load_from_dir(tmpdir)
22
+ data = load_from_dir(tmpdir, parse_times: parse_times)
19
23
  end
20
24
 
21
25
  data
22
26
  end
23
27
 
24
28
  # Loads a GTFS dir and returns a Feed
25
- def self.load_from_dir(dir_path)
29
+ #
30
+ # @param dir_path [String] Path to the GTFS directory
31
+ # @param parse_times [Boolean] Whether to parse time fields to seconds since midnight (default: false)
32
+ # @return [Feed] The loaded GTFS feed
33
+ def self.load_from_dir(dir_path, parse_times: false)
26
34
  data = {}
27
35
  GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
28
36
  path = File.join(dir_path, "#{gtfs_file}.txt")
@@ -31,7 +39,7 @@ module GtfsDf
31
39
  data[gtfs_file] = data_frame(gtfs_file, path)
32
40
  end
33
41
 
34
- GtfsDf::Feed.new(data)
42
+ GtfsDf::Feed.new(data, parse_times: parse_times)
35
43
  end
36
44
 
37
45
  private_class_method def self.data_frame(gtfs_file, path)
@@ -17,6 +17,11 @@ module GtfsDf
17
17
  end_time
18
18
  headway_secs
19
19
  ].freeze
20
+
21
+ TIME_FIELDS = %w[
22
+ start_time
23
+ end_time
24
+ ].freeze
20
25
  end
21
26
  end
22
27
  end
@@ -26,6 +26,13 @@ module GtfsDf
26
26
 
27
27
  REQUIRED_FIELDS = %w[trip_id stop_sequence stop_id]
28
28
 
29
+ TIME_FIELDS = %w[
30
+ arrival_time
31
+ departure_time
32
+ start_pickup_drop_off_window
33
+ end_pickup_drop_off_window
34
+ ].freeze
35
+
29
36
  ENUM_VALUE_MAP = {
30
37
  "pickup_type" => :PICKUP_TYPE,
31
38
  "drop_off_type" => :DROP_OFF_TYPE,
data/lib/gtfs_df/utils.rb CHANGED
@@ -1,6 +1,12 @@
1
1
  module GtfsDf
2
2
  module Utils
3
- # Parses a GTFS time string
3
+ extend self
4
+
5
+ SECONDS_IN_MINUTE = 60
6
+ SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
7
+ SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
8
+
9
+ # Parses a GTFS time string to seconds since midnight
4
10
  #
5
11
  # The input string is expected to be in the HH:MM:SS format (H:MM:SS is
6
12
  # also accepted).
@@ -13,10 +19,9 @@ module GtfsDf
13
19
  # @example 14:30:00 for 2:30PM or
14
20
  # 25:35:00 for 1:35AM on the next day.
15
21
  #
16
- # @param str String
17
- # Parses a GTFS time string or returns integer seconds if already provided.
18
- # Accepts Integer (returns as-is), or HH:MM:SS string (possibly >24h).
19
- def self.parse_time(str)
22
+ # @param str String|Integer
23
+ # @return Integer|nil seconds since midnight, or nil if invalid
24
+ def parse_time(str)
20
25
  return str if str.is_a?(Integer)
21
26
  return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
22
27
 
@@ -29,6 +34,82 @@ module GtfsDf
29
34
  nil
30
35
  end
31
36
 
37
+ # Formats seconds since midnight as a GTFS time string (HH:MM:SS)
38
+ #
39
+ # Handles times greater than 24 hours for times that span past midnight.
40
+ #
41
+ # @param seconds Integer seconds since midnight
42
+ # @return String|nil time in HH:MM:SS format, or nil if invalid
43
+ def format_time(seconds)
44
+ return nil if seconds.nil?
45
+ return seconds if seconds.is_a?(String)
46
+
47
+ hours = seconds / SECONDS_IN_HOUR
48
+ minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
49
+ secs = seconds % SECONDS_IN_MINUTE
50
+
51
+ format("%02d:%02d:%02d", hours, minutes, secs)
52
+ rescue
53
+ nil
54
+ end
55
+
56
+ # Converts a GTFS time string column to seconds since midnight
57
+ #
58
+ # Use this method with Polars DataFrames to convert time columns.
59
+ #
60
+ # @example dataframe.with_columns(GtfsDf::Utils.as_seconds_since_midnight('start_time'))
61
+ #
62
+ # @param col_name String The column to convert
63
+ # @return Polars::Expr
64
+ def as_seconds_since_midnight(col_name)
65
+ parts = Polars.col(col_name).str.split(":")
66
+
67
+ hours = parts.list.get(0).cast(:i64)
68
+ minutes = parts.list.get(1).cast(:i64)
69
+ seconds = parts.list.get(2).cast(:i64)
70
+
71
+ (hours * SECONDS_IN_HOUR) +
72
+ (minutes * SECONDS_IN_MINUTE) +
73
+ seconds
74
+ end
75
+
76
+ # Converts a seconds since midnight column to GTFS time string (HH:MM:SS)
77
+ #
78
+ # Use this method with Polars DataFrames to convert time columns back to strings.
79
+ #
80
+ # @example dataframe.with_columns(GtfsDf::Utils.as_time_string('start_time'))
81
+ #
82
+ # @param col_name String The column to convert
83
+ # @return Polars::Expr
84
+ def as_time_string(col_name)
85
+ total_seconds = Polars.col(col_name)
86
+ hours = total_seconds.floordiv(SECONDS_IN_HOUR)
87
+ minutes = (total_seconds % SECONDS_IN_HOUR).floordiv(SECONDS_IN_MINUTE)
88
+ seconds = total_seconds % SECONDS_IN_MINUTE
89
+
90
+ Polars.format(
91
+ "{}:{}:{}",
92
+ hours.cast(:str).str.zfill(2),
93
+ minutes.cast(:str).str.zfill(2),
94
+ seconds.cast(:str).str.zfill(2)
95
+ )
96
+ end
97
+
98
+ # Converts a seconds since midnight Series to GTFS time strings for inspection
99
+ #
100
+ # Use this method to get a readable view of time columns during debugging.
101
+ # It's not meant to be performant.
102
+ #
103
+ # @example GtfsDf::Utils.inspect_time(feed.stop_times["arrival_time"])
104
+ #
105
+ # @param series Polars::Series The series to convert
106
+ # @return Polars::Series A series with time strings
107
+ def inspect_time(series)
108
+ series.to_frame.with_columns(
109
+ as_time_string(series.name)
110
+ )[series.name]
111
+ end
112
+
32
113
  # Parses a GTFS date string
33
114
  #
34
115
  # The input string is expected to be a service day in the YYYYMMDD format.
@@ -38,7 +119,7 @@ module GtfsDf
38
119
  # @example 20180913 for September 13th, 2018.
39
120
  #
40
121
  # @param str String
41
- def self.parse_date(str)
122
+ def parse_date(str)
42
123
  return nil if str.nil? || str.strip.empty?
43
124
  return nil unless str.match?(/^\d{8}$/)
44
125
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.6.2"
4
+ VERSION = "0.7.0"
5
5
  end
@@ -3,6 +3,9 @@
3
3
  module GtfsDf
4
4
  class Writer
5
5
  # Exports a Feed to a GTFS zip file
6
+ #
7
+ # @param feed [Feed] The GTFS feed to export
8
+ # @param zip_path [String] The path where the zip file will be created
6
9
  def self.write_to_zip(feed, zip_path)
7
10
  require "stringio"
8
11
  require "zlib"
@@ -12,6 +15,11 @@ module GtfsDf
12
15
  df = feed.send(file)
13
16
  next unless df.is_a?(Polars::DataFrame)
14
17
 
18
+ # Convert time fields back to strings if parse_times was enabled
19
+ if feed.parse_times
20
+ df = format_time_fields(file, df)
21
+ end
22
+
15
23
  # Write CSV to StringIO
16
24
  csv_io = StringIO.new
17
25
  df.write_csv(csv_io)
@@ -22,5 +30,47 @@ module GtfsDf
22
30
  end
23
31
  end
24
32
  end
33
+
34
+ # Exports a Feed to a directory as individual text files
35
+ #
36
+ # @param feed [Feed] The GTFS feed to export
37
+ # @param dir_path [String] The path where the directory will be created
38
+ def self.write_to_dir(feed, dir_path)
39
+ FileUtils.mkdir_p(dir_path)
40
+ GtfsDf::Feed::GTFS_FILES.each do |file|
41
+ df = feed.send(file)
42
+ next unless df.is_a?(Polars::DataFrame)
43
+
44
+ # Convert time fields back to strings if parse_times was enabled
45
+ df = format_time_fields(file, df) if feed.parse_times
46
+
47
+ # Write CSV directly to file
48
+ df.write_csv(File.join(dir_path, "#{file}.txt"))
49
+ end
50
+ end
51
+
52
+ # Formats time fields back to HH:MM:SS strings for a given GTFS file
53
+ #
54
+ # @param file [String] The GTFS file name (e.g., "stop_times")
55
+ # @param df [Polars::DataFrame] The DataFrame to format
56
+ # @return [Polars::DataFrame] DataFrame with time fields formatted as strings
57
+ def self.format_time_fields(file, df)
58
+ schema_class_name = file.split("_").map(&:capitalize).join
59
+ schema_class = begin
60
+ GtfsDf::Schema.const_get(schema_class_name)
61
+ rescue
62
+ nil
63
+ end
64
+
65
+ return df unless schema_class&.respond_to?(:time_fields)
66
+
67
+ time_fields = schema_class.time_fields
68
+ time_fields.each do |field|
69
+ next unless df.columns.include?(field)
70
+ df = df.with_columns(GtfsDf::Utils.as_time_string(field))
71
+ end
72
+
73
+ df
74
+ end
25
75
  end
26
76
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado
@@ -65,6 +65,7 @@ files:
65
65
  - ".envrc"
66
66
  - ".rspec"
67
67
  - ".rubocop.yml"
68
+ - ".ruby-version"
68
69
  - ".solargraph.yml"
69
70
  - ".standard.yml"
70
71
  - CHANGELOG.md