gtfs_df 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.conform.yaml +1 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +28 -0
- data/README.md +5 -5
- data/devenv.nix +4 -2
- data/examples/split-by-agency/Gemfile.lock +1 -1
- data/examples/split-by-agency/split_by_agency.rb +6 -2
- data/lib/gtfs_df/base_gtfs_table.rb +4 -0
- data/lib/gtfs_df/feed.rb +20 -2
- data/lib/gtfs_df/reader.rb +12 -4
- data/lib/gtfs_df/schema/frequencies.rb +5 -0
- data/lib/gtfs_df/schema/stop_times.rb +7 -0
- data/lib/gtfs_df/utils.rb +87 -6
- data/lib/gtfs_df/version.rb +1 -1
- data/lib/gtfs_df/writer.rb +50 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0b218f60e33e005576eb278a74ba89e6164fddccfe99e9f8e5665d105d9fa471
|
|
4
|
+
data.tar.gz: cb4023192418d5ba89f08b605b2ec8ac79073b28d29f6161f252d97df3291dff
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cefa279582b0579a637e4ebc39fe4c3d90d56582aa1cd0d00ffda041b92f681831334d46aeae95dc1c291c1295758c500a0c7bfe32876f226605f5b150fc0a9c
|
|
7
|
+
data.tar.gz: e4b5bbe129674304121c61cb732a8261421158f206370289e67d3e6d514bbb089bc0c35f88fde6e46012a18afd3700b86eb4c2a2b457b9880270ef3a4f74cf79
|
data/.conform.yaml
CHANGED
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.4.7
|
data/CHANGELOG.md
CHANGED
|
@@ -1,8 +1,36 @@
|
|
|
1
|
+
## [0.7.0] - 2025-12-30
|
|
2
|
+
|
|
3
|
+
### 🚀 Features
|
|
4
|
+
|
|
5
|
+
- Add GTFS time parsing and formatting utilities
|
|
6
|
+
- Option to parse time fields as seconds since midnight
|
|
7
|
+
- Thread time columns through the input/output cycle
|
|
8
|
+
- Allow modifying the parse_time flag after extraction
|
|
9
|
+
- Provide a method to write to a directory
|
|
10
|
+
|
|
11
|
+
### 💼 Other
|
|
12
|
+
|
|
13
|
+
- Silence devenv version warning
|
|
14
|
+
- Update example on release
|
|
15
|
+
- Replace byebug with pry-byebug
|
|
16
|
+
- Enable dependabot updates
|
|
17
|
+
|
|
18
|
+
### 📚 Documentation
|
|
19
|
+
|
|
20
|
+
- Update filter examples to use explicit hash syntax
|
|
21
|
+
|
|
22
|
+
### ⚙️ Miscellaneous Tasks
|
|
23
|
+
|
|
24
|
+
- Include the util helpers in the console and the test spec
|
|
1
25
|
## [0.6.2] - 2025-12-15
|
|
2
26
|
|
|
3
27
|
### 🐛 Bug Fixes
|
|
4
28
|
|
|
5
29
|
- Permit non UTF-8 characters
|
|
30
|
+
|
|
31
|
+
### ⚙️ Miscellaneous Tasks
|
|
32
|
+
|
|
33
|
+
- Bump version to 0.6.2
|
|
6
34
|
## [0.6.1] - 2025-12-12
|
|
7
35
|
|
|
8
36
|
### 🐛 Bug Fixes
|
data/README.md
CHANGED
|
@@ -49,19 +49,19 @@ The library supports filtering feeds by any field in any table. The filter autom
|
|
|
49
49
|
|
|
50
50
|
```ruby
|
|
51
51
|
# Filter by agency
|
|
52
|
-
filtered_feed = feed.filter('agency' => { 'agency_id' => 'MTA' })
|
|
52
|
+
filtered_feed = feed.filter({ 'agency' => { 'agency_id' => 'MTA' } })
|
|
53
53
|
|
|
54
54
|
# Filter by route
|
|
55
|
-
filtered_feed = feed.filter('routes' => { 'route_id' => ['1', '2', '3'] })
|
|
55
|
+
filtered_feed = feed.filter({ 'routes' => { 'route_id' => ['1', '2', '3'] } })
|
|
56
56
|
|
|
57
57
|
# Filter by a service
|
|
58
|
-
filtered_feed = feed.filter('calendar' => { 'service_id' => 'WEEKDAY' })
|
|
58
|
+
filtered_feed = feed.filter({ 'calendar' => { 'service_id' => 'WEEKDAY' } })
|
|
59
59
|
|
|
60
60
|
# Multiple filters
|
|
61
|
-
filtered_feed = feed.filter(
|
|
61
|
+
filtered_feed = feed.filter({
|
|
62
62
|
'agency' => { 'agency_id' => 'MTA' },
|
|
63
63
|
'routes' => { 'route_type' => 1 } # Filter to subway routes
|
|
64
|
-
)
|
|
64
|
+
})
|
|
65
65
|
```
|
|
66
66
|
|
|
67
67
|
When you filter by a field, the library automatically:
|
data/devenv.nix
CHANGED
data/examples/split-by-agency/split_by_agency.rb
CHANGED
|
@@ -34,8 +34,12 @@ Whirly.configure spinner: "dots", stop: "✓"
|
|
|
34
34
|
|
|
35
35
|
Whirly.start do
|
|
36
36
|
Whirly.status = "Loading"
|
|
37
|
+
|
|
38
|
+
start_time = Time.now
|
|
37
39
|
feed = GtfsDf::Reader.load_from_zip(input_path)
|
|
38
|
-
|
|
40
|
+
elapsed = Time.now - start_time
|
|
41
|
+
|
|
42
|
+
Whirly.status = "Loaded (#{elapsed.round(2)}s)"
|
|
39
43
|
end
|
|
40
44
|
|
|
41
45
|
agency_ids.each do |agency_id|
|
|
@@ -45,7 +49,7 @@ agency_ids.each do |agency_id|
|
|
|
45
49
|
start_time = Time.now
|
|
46
50
|
|
|
47
51
|
Whirly.status = "-> #{agency_id} filtering..."
|
|
48
|
-
filtered_feed = feed.filter("agency" => {"agency_id" => agency_id})
|
|
52
|
+
filtered_feed = feed.filter({"agency" => {"agency_id" => agency_id}})
|
|
49
53
|
|
|
50
54
|
Whirly.status = "-> #{agency_id} writing..."
|
|
51
55
|
GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
|
data/lib/gtfs_df/feed.rb
CHANGED
|
@@ -37,12 +37,17 @@ module GtfsDf
|
|
|
37
37
|
].freeze
|
|
38
38
|
|
|
39
39
|
attr_accessor(*GTFS_FILES)
|
|
40
|
+
attr_accessor(:parse_times)
|
|
40
41
|
attr_reader(:graph)
|
|
41
42
|
|
|
42
43
|
# Initialize with a hash of DataFrames
|
|
43
44
|
REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
|
|
44
45
|
|
|
45
|
-
|
|
46
|
+
# @param data [Hash] Hash of DataFrames for each GTFS file
|
|
47
|
+
# @param parse_times [Boolean] Whether to parse time fields to seconds since midnight (default: false)
|
|
48
|
+
def initialize(data = {}, parse_times: false)
|
|
49
|
+
@parse_times = parse_times
|
|
50
|
+
|
|
46
51
|
missing = REQUIRED_GTFS_FILES.reject { |file| data[file].is_a?(Polars::DataFrame) }
|
|
47
52
|
# At least one of calendar or calendar_dates must be present
|
|
48
53
|
unless data["calendar"].is_a?(Polars::DataFrame) || data["calendar_dates"].is_a?(Polars::DataFrame)
|
|
@@ -66,6 +71,19 @@ module GtfsDf
|
|
|
66
71
|
end
|
|
67
72
|
if df.is_a?(Polars::DataFrame) && schema_class && schema_class.const_defined?(:SCHEMA)
|
|
68
73
|
df = schema_class.new(df).df
|
|
74
|
+
# Parse time fields if enabled and they're still strings
|
|
75
|
+
if @parse_times && schema_class.respond_to?(:time_fields)
|
|
76
|
+
time_fields = schema_class.time_fields
|
|
77
|
+
time_fields.each do |field|
|
|
78
|
+
next unless df.columns.include?(field)
|
|
79
|
+
# Only parse if the field is still a string (not already parsed)
|
|
80
|
+
if df[field].dtype == Polars::String
|
|
81
|
+
df = df.with_columns(
|
|
82
|
+
GtfsDf::Utils.as_seconds_since_midnight(field)
|
|
83
|
+
)
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
69
87
|
end
|
|
70
88
|
instance_variable_set("@#{file}", df.is_a?(Polars::DataFrame) ? df : nil)
|
|
71
89
|
end
|
|
@@ -127,7 +145,7 @@ module GtfsDf
|
|
|
127
145
|
|
|
128
146
|
(!df || df.height == 0) && !is_required_file
|
|
129
147
|
end
|
|
130
|
-
self.class.new(filtered)
|
|
148
|
+
self.class.new(filtered, parse_times: @parse_times)
|
|
131
149
|
end
|
|
132
150
|
|
|
133
151
|
private
|
data/lib/gtfs_df/reader.rb
CHANGED
|
@@ -3,7 +3,11 @@
|
|
|
3
3
|
module GtfsDf
|
|
4
4
|
class Reader
|
|
5
5
|
# Loads a GTFS zip file and returns a Feed
|
|
6
|
-
|
|
6
|
+
#
|
|
7
|
+
# @param zip_path [String] Path to the GTFS zip file
|
|
8
|
+
# @param parse_times [Boolean] Whether to parse time fields to seconds since midnight (default: false)
|
|
9
|
+
# @return [Feed] The loaded GTFS feed
|
|
10
|
+
def self.load_from_zip(zip_path, parse_times: false)
|
|
7
11
|
data = nil
|
|
8
12
|
|
|
9
13
|
Dir.mktmpdir do |tmpdir|
|
|
@@ -15,14 +19,18 @@ module GtfsDf
|
|
|
15
19
|
end
|
|
16
20
|
end
|
|
17
21
|
|
|
18
|
-
data = load_from_dir(tmpdir)
|
|
22
|
+
data = load_from_dir(tmpdir, parse_times: parse_times)
|
|
19
23
|
end
|
|
20
24
|
|
|
21
25
|
data
|
|
22
26
|
end
|
|
23
27
|
|
|
24
28
|
# Loads a GTFS dir and returns a Feed
|
|
25
|
-
|
|
29
|
+
#
|
|
30
|
+
# @param dir_path [String] Path to the GTFS directory
|
|
31
|
+
# @param parse_times [Boolean] Whether to parse time fields to seconds since midnight (default: false)
|
|
32
|
+
# @return [Feed] The loaded GTFS feed
|
|
33
|
+
def self.load_from_dir(dir_path, parse_times: false)
|
|
26
34
|
data = {}
|
|
27
35
|
GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
|
|
28
36
|
path = File.join(dir_path, "#{gtfs_file}.txt")
|
|
@@ -31,7 +39,7 @@ module GtfsDf
|
|
|
31
39
|
data[gtfs_file] = data_frame(gtfs_file, path)
|
|
32
40
|
end
|
|
33
41
|
|
|
34
|
-
GtfsDf::Feed.new(data)
|
|
42
|
+
GtfsDf::Feed.new(data, parse_times: parse_times)
|
|
35
43
|
end
|
|
36
44
|
|
|
37
45
|
private_class_method def self.data_frame(gtfs_file, path)
|
|
data/lib/gtfs_df/schema/stop_times.rb
CHANGED
|
@@ -26,6 +26,13 @@ module GtfsDf
|
|
|
26
26
|
|
|
27
27
|
REQUIRED_FIELDS = %w[trip_id stop_sequence stop_id]
|
|
28
28
|
|
|
29
|
+
TIME_FIELDS = %w[
|
|
30
|
+
arrival_time
|
|
31
|
+
departure_time
|
|
32
|
+
start_pickup_drop_off_window
|
|
33
|
+
end_pickup_drop_off_window
|
|
34
|
+
].freeze
|
|
35
|
+
|
|
29
36
|
ENUM_VALUE_MAP = {
|
|
30
37
|
"pickup_type" => :PICKUP_TYPE,
|
|
31
38
|
"drop_off_type" => :DROP_OFF_TYPE,
|
data/lib/gtfs_df/utils.rb
CHANGED
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
module GtfsDf
|
|
2
2
|
module Utils
|
|
3
|
-
|
|
3
|
+
extend self
|
|
4
|
+
|
|
5
|
+
SECONDS_IN_MINUTE = 60
|
|
6
|
+
SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
|
|
7
|
+
SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
|
|
8
|
+
|
|
9
|
+
# Parses a GTFS time string to seconds since midnight
|
|
4
10
|
#
|
|
5
11
|
# The input string is expected to be in the HH:MM:SS format (H:MM:SS is
|
|
6
12
|
# also accepted).
|
|
@@ -13,10 +19,9 @@ module GtfsDf
|
|
|
13
19
|
# @example 14:30:00 for 2:30PM or
|
|
14
20
|
# 25:35:00 for 1:35AM on the next day.
|
|
15
21
|
#
|
|
16
|
-
# @param str String
|
|
17
|
-
#
|
|
18
|
-
|
|
19
|
-
def self.parse_time(str)
|
|
22
|
+
# @param str String|Integer
|
|
23
|
+
# @return Integer|nil seconds since midnight, or nil if invalid
|
|
24
|
+
def parse_time(str)
|
|
20
25
|
return str if str.is_a?(Integer)
|
|
21
26
|
return nil if str.nil? || (str.respond_to?(:strip) && str.strip.empty?)
|
|
22
27
|
|
|
@@ -29,6 +34,82 @@ module GtfsDf
|
|
|
29
34
|
nil
|
|
30
35
|
end
|
|
31
36
|
|
|
37
|
+
# Formats seconds since midnight as a GTFS time string (HH:MM:SS)
|
|
38
|
+
#
|
|
39
|
+
# Handles times greater than 24 hours for times that span past midnight.
|
|
40
|
+
#
|
|
41
|
+
# @param seconds Integer seconds since midnight
|
|
42
|
+
# @return String|nil time in HH:MM:SS format, or nil if invalid
|
|
43
|
+
def format_time(seconds)
|
|
44
|
+
return nil if seconds.nil?
|
|
45
|
+
return seconds if seconds.is_a?(String)
|
|
46
|
+
|
|
47
|
+
hours = seconds / SECONDS_IN_HOUR
|
|
48
|
+
minutes = (seconds % SECONDS_IN_HOUR) / SECONDS_IN_MINUTE
|
|
49
|
+
secs = seconds % SECONDS_IN_MINUTE
|
|
50
|
+
|
|
51
|
+
format("%02d:%02d:%02d", hours, minutes, secs)
|
|
52
|
+
rescue
|
|
53
|
+
nil
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Converts a GTFS time string column to seconds since midnight
|
|
57
|
+
#
|
|
58
|
+
# Use this method with Polars DataFrames to convert time columns.
|
|
59
|
+
#
|
|
60
|
+
# @example dataframe.with_columns(GtfsDf::Utils.as_seconds_since_midnight('start_time'))
|
|
61
|
+
#
|
|
62
|
+
# @param col_name String The column to convert
|
|
63
|
+
# @return Polars::Expr
|
|
64
|
+
def as_seconds_since_midnight(col_name)
|
|
65
|
+
parts = Polars.col(col_name).str.split(":")
|
|
66
|
+
|
|
67
|
+
hours = parts.list.get(0).cast(:i64)
|
|
68
|
+
minutes = parts.list.get(1).cast(:i64)
|
|
69
|
+
seconds = parts.list.get(2).cast(:i64)
|
|
70
|
+
|
|
71
|
+
(hours * SECONDS_IN_HOUR) +
|
|
72
|
+
(minutes * SECONDS_IN_MINUTE) +
|
|
73
|
+
seconds
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Converts a seconds since midnight column to GTFS time string (HH:MM:SS)
|
|
77
|
+
#
|
|
78
|
+
# Use this method with Polars DataFrames to convert time columns back to strings.
|
|
79
|
+
#
|
|
80
|
+
# @example dataframe.with_columns(GtfsDf::Utils.as_time_string('start_time'))
|
|
81
|
+
#
|
|
82
|
+
# @param col_name String The column to convert
|
|
83
|
+
# @return Polars::Expr
|
|
84
|
+
def as_time_string(col_name)
|
|
85
|
+
total_seconds = Polars.col(col_name)
|
|
86
|
+
hours = total_seconds.floordiv(SECONDS_IN_HOUR)
|
|
87
|
+
minutes = (total_seconds % SECONDS_IN_HOUR).floordiv(SECONDS_IN_MINUTE)
|
|
88
|
+
seconds = total_seconds % SECONDS_IN_MINUTE
|
|
89
|
+
|
|
90
|
+
Polars.format(
|
|
91
|
+
"{}:{}:{}",
|
|
92
|
+
hours.cast(:str).str.zfill(2),
|
|
93
|
+
minutes.cast(:str).str.zfill(2),
|
|
94
|
+
seconds.cast(:str).str.zfill(2)
|
|
95
|
+
)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
# Converts a seconds since midnight Series to GTFS time strings for inspection
|
|
99
|
+
#
|
|
100
|
+
# Use this method to get a readable view of time columns during debugging.
|
|
101
|
+
# It's not meant to be performant.
|
|
102
|
+
#
|
|
103
|
+
# @example GtfsDf::Utils.inspect_time(feed.stop_times["arrival_time"])
|
|
104
|
+
#
|
|
105
|
+
# @param series Polars::Series The series to convert
|
|
106
|
+
# @return Polars::Series A series with time strings
|
|
107
|
+
def inspect_time(series)
|
|
108
|
+
series.to_frame.with_columns(
|
|
109
|
+
as_time_string(series.name)
|
|
110
|
+
)[series.name]
|
|
111
|
+
end
|
|
112
|
+
|
|
32
113
|
# Parses a GTFS date string
|
|
33
114
|
#
|
|
34
115
|
# The input string is expected to be a service day in the YYYYMMDD format.
|
|
@@ -38,7 +119,7 @@ module GtfsDf
|
|
|
38
119
|
# @example 20180913 for September 13th, 2018.
|
|
39
120
|
#
|
|
40
121
|
# @param str String
|
|
41
|
-
def self.parse_date(str)
|
|
122
|
+
def parse_date(str)
|
|
42
123
|
return nil if str.nil? || str.strip.empty?
|
|
43
124
|
return nil unless str.match?(/^\d{8}$/)
|
|
44
125
|
|
data/lib/gtfs_df/version.rb
CHANGED
data/lib/gtfs_df/writer.rb
CHANGED
|
@@ -3,6 +3,9 @@
|
|
|
3
3
|
module GtfsDf
|
|
4
4
|
class Writer
|
|
5
5
|
# Exports a Feed to a GTFS zip file
|
|
6
|
+
#
|
|
7
|
+
# @param feed [Feed] The GTFS feed to export
|
|
8
|
+
# @param zip_path [String] The path where the zip file will be created
|
|
6
9
|
def self.write_to_zip(feed, zip_path)
|
|
7
10
|
require "stringio"
|
|
8
11
|
require "zlib"
|
|
@@ -12,6 +15,11 @@ module GtfsDf
|
|
|
12
15
|
df = feed.send(file)
|
|
13
16
|
next unless df.is_a?(Polars::DataFrame)
|
|
14
17
|
|
|
18
|
+
# Convert time fields back to strings if parse_times was enabled
|
|
19
|
+
if feed.parse_times
|
|
20
|
+
df = format_time_fields(file, df)
|
|
21
|
+
end
|
|
22
|
+
|
|
15
23
|
# Write CSV to StringIO
|
|
16
24
|
csv_io = StringIO.new
|
|
17
25
|
df.write_csv(csv_io)
|
|
@@ -22,5 +30,47 @@ module GtfsDf
|
|
|
22
30
|
end
|
|
23
31
|
end
|
|
24
32
|
end
|
|
33
|
+
|
|
34
|
+
# Exports a Feed to a directory as individual text files
|
|
35
|
+
#
|
|
36
|
+
# @param feed [Feed] The GTFS feed to export
|
|
37
|
+
# @param dir_path [String] The path where the directory will be created
|
|
38
|
+
def self.write_to_dir(feed, dir_path)
|
|
39
|
+
FileUtils.mkdir_p(dir_path)
|
|
40
|
+
GtfsDf::Feed::GTFS_FILES.each do |file|
|
|
41
|
+
df = feed.send(file)
|
|
42
|
+
next unless df.is_a?(Polars::DataFrame)
|
|
43
|
+
|
|
44
|
+
# Convert time fields back to strings if parse_times was enabled
|
|
45
|
+
df = format_time_fields(file, df) if feed.parse_times
|
|
46
|
+
|
|
47
|
+
# Write CSV directly to file
|
|
48
|
+
df.write_csv(File.join(dir_path, "#{file}.txt"))
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Formats time fields back to HH:MM:SS strings for a given GTFS file
|
|
53
|
+
#
|
|
54
|
+
# @param file [String] The GTFS file name (e.g., "stop_times")
|
|
55
|
+
# @param df [Polars::DataFrame] The DataFrame to format
|
|
56
|
+
# @return [Polars::DataFrame] DataFrame with time fields formatted as strings
|
|
57
|
+
def self.format_time_fields(file, df)
|
|
58
|
+
schema_class_name = file.split("_").map(&:capitalize).join
|
|
59
|
+
schema_class = begin
|
|
60
|
+
GtfsDf::Schema.const_get(schema_class_name)
|
|
61
|
+
rescue
|
|
62
|
+
nil
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
return df unless schema_class&.respond_to?(:time_fields)
|
|
66
|
+
|
|
67
|
+
time_fields = schema_class.time_fields
|
|
68
|
+
time_fields.each do |field|
|
|
69
|
+
next unless df.columns.include?(field)
|
|
70
|
+
df = df.with_columns(GtfsDf::Utils.as_time_string(field))
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
df
|
|
74
|
+
end
|
|
25
75
|
end
|
|
26
76
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gtfs_df
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.2
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Mejorado
|
|
@@ -65,6 +65,7 @@ files:
|
|
|
65
65
|
- ".envrc"
|
|
66
66
|
- ".rspec"
|
|
67
67
|
- ".rubocop.yml"
|
|
68
|
+
- ".ruby-version"
|
|
68
69
|
- ".solargraph.yml"
|
|
69
70
|
- ".standard.yml"
|
|
70
71
|
- CHANGELOG.md
|