gtfs_df 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -1
- data/lib/gtfs_df/base_gtfs_table.rb +15 -14
- data/lib/gtfs_df/feed.rb +38 -20
- data/lib/gtfs_df/graph.rb +5 -1
- data/lib/gtfs_df/schema/stops.rb +1 -1
- data/lib/gtfs_df/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 205ec058b41c5bd1d2b01ff3950d4cd2ebb20f304d02d5bcd3dd0c447b4e0a6e
|
|
4
|
+
data.tar.gz: 54cca637de421c26d2144df100f8430f5dddb639b3a37d1160dfbd9704630e33
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ef853b504ee701e77911259352d0057f6e327db8022370a1fa0dbaa597bfadcac7ac1c8fbe746c04ec4669d86384279a259ac35d4126f1483286937812cb7cce
|
|
7
|
+
data.tar.gz: 728d684dd02b653cc779aa5e8152766f5dccafcb097e8fca5810b90f994c442645119f32efb62c84d8fbf1645109f579a3c74f6eb88c9e21dd03faa72193cdd0
|
data/CHANGELOG.md
CHANGED
|
@@ -1,4 +1,28 @@
|
|
|
1
|
-
## [
|
|
1
|
+
## [0.4.1] - 2025-12-05
|
|
2
|
+
|
|
3
|
+
### Added
|
|
4
|
+
|
|
5
|
+
- handle extra whitespace in csvs
|
|
6
|
+
|
|
7
|
+
### Maintenance
|
|
8
|
+
|
|
9
|
+
- remove unused initializer format
|
|
10
|
+
## [0.4.0] - 2025-12-04
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- allow setting maintain_trip_dependencies=false
|
|
15
|
+
|
|
16
|
+
### Fixed
|
|
17
|
+
|
|
18
|
+
- parse stop_lat as float
|
|
19
|
+
- add missing agency -> fare_attributes edge
|
|
20
|
+
- allow null for fare_rules
|
|
21
|
+
|
|
22
|
+
### Maintenance
|
|
23
|
+
|
|
24
|
+
- provide accessor for gtfs_files (utility)
|
|
25
|
+
- add yard docs
|
|
2
26
|
|
|
3
27
|
## [0.3.0] - 2025-12-04
|
|
4
28
|
|
data/lib/gtfs_df/base_gtfs_table.rb
CHANGED
|
@@ -10,20 +10,21 @@ module GtfsDf
|
|
|
10
10
|
if input.is_a?(Polars::DataFrame)
|
|
11
11
|
input
|
|
12
12
|
elsif input.is_a?(String)
|
|
13
|
-
#
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
dtypes =
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
|
|
13
|
+
# TODO: use `infer_schema: false` instead of `infer_schema_length` after polars release:
|
|
14
|
+
# https://github.com/ankane/ruby-polars/blob/master/CHANGELOG.md#100-unreleased
|
|
15
|
+
df = Polars.read_csv(input, infer_schema_length: 0)
|
|
16
|
+
dtypes = self.class::SCHEMA.slice(*df.columns)
|
|
17
|
+
|
|
18
|
+
df
|
|
19
|
+
.with_columns(dtypes.keys.map do |col|
|
|
20
|
+
stripped = Polars.col(col).str.strip
|
|
21
|
+
Polars.when(stripped.str.len_chars.gt(0))
|
|
22
|
+
.then(stripped)
|
|
23
|
+
.otherwise(Polars.lit(nil))
|
|
24
|
+
end)
|
|
25
|
+
.with_columns(dtypes.map do |name, type|
|
|
26
|
+
Polars.col(name).cast(type)
|
|
27
|
+
end)
|
|
27
28
|
else
|
|
28
29
|
throw GtfsDf::Error, "Unrecognized input"
|
|
29
30
|
end
|
data/lib/gtfs_df/feed.rb
CHANGED
|
@@ -36,7 +36,8 @@ module GtfsDf
|
|
|
36
36
|
booking_rules
|
|
37
37
|
].freeze
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
attr_accessor(*GTFS_FILES)
|
|
40
|
+
attr_reader(:graph)
|
|
40
41
|
|
|
41
42
|
# Initialize with a hash of DataFrames
|
|
42
43
|
REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
|
|
@@ -71,8 +72,16 @@ module GtfsDf
|
|
|
71
72
|
end
|
|
72
73
|
|
|
73
74
|
# Filter the feed using a view hash
|
|
74
|
-
#
|
|
75
|
-
|
|
75
|
+
#
|
|
76
|
+
# @param view [Hash] The view used to filter the feed, with format { file => filters }.
|
|
77
|
+
# Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
|
|
78
|
+
# @param maintain_trip_dependencies [Boolean] Whether trip dependencies should be preserved.
|
|
79
|
+
# By default, we treat trips as the atomic unit of GTFS. Therefore, if we filter to one stop
|
|
80
|
+
# referenced by TripA, we will preserve _all stops_ referenced by TripA. However, it is
|
|
81
|
+
# occasionally useful to prune bad data and _not_ maintain all trip dependencies.
|
|
82
|
+
# For example, if StopA contains invalid coordinates, we may wish to filter it out but keep
|
|
83
|
+
# the other stops for TripA. In this case, `maintain_trip_dependencies` should be set to false.
|
|
84
|
+
def filter(view, maintain_trip_dependencies = true)
|
|
76
85
|
filtered = {}
|
|
77
86
|
|
|
78
87
|
GTFS_FILES.each do |file|
|
|
@@ -82,22 +91,28 @@ module GtfsDf
|
|
|
82
91
|
filtered[file] = df
|
|
83
92
|
end
|
|
84
93
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
94
|
+
if maintain_trip_dependencies
|
|
95
|
+
# Trips are the atomic unit of GTFS, we will generate a new view
|
|
96
|
+
# based on the set of trips that would be included for each individual filter
|
|
97
|
+
# and cascade changes from this view in order to retain referential integrity
|
|
98
|
+
trip_ids = nil
|
|
99
|
+
|
|
100
|
+
view.each do |file, filters|
|
|
101
|
+
new_filtered = filter!(file, filters, filtered.dup)
|
|
102
|
+
trip_ids = if trip_ids.nil?
|
|
103
|
+
new_filtered["trips"]["trip_id"]
|
|
104
|
+
else
|
|
105
|
+
trip_ids & new_filtered["trips"]["trip_id"]
|
|
106
|
+
end
|
|
96
107
|
end
|
|
97
|
-
end
|
|
98
108
|
|
|
99
|
-
|
|
100
|
-
|
|
109
|
+
if trip_ids
|
|
110
|
+
filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
|
|
111
|
+
end
|
|
112
|
+
else
|
|
113
|
+
view.each do |file, filters|
|
|
114
|
+
filtered = filter!(file, filters, filtered.dup)
|
|
115
|
+
end
|
|
101
116
|
end
|
|
102
117
|
|
|
103
118
|
# Remove files that are empty, but keep required files even if empty
|
|
@@ -163,6 +178,7 @@ module GtfsDf
|
|
|
163
178
|
attrs[:dependencies].each do |dep|
|
|
164
179
|
parent_col = dep[parent_node_id]
|
|
165
180
|
child_col = dep[child_node_id]
|
|
181
|
+
allow_null = !!dep[:allow_null]
|
|
166
182
|
|
|
167
183
|
next unless parent_col && child_col &&
|
|
168
184
|
parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
|
|
@@ -172,9 +188,11 @@ module GtfsDf
|
|
|
172
188
|
|
|
173
189
|
# Filter child to only include rows that reference valid parent values
|
|
174
190
|
before = child_df.height
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
191
|
+
filter = Polars.col(child_col).is_in(valid_values)
|
|
192
|
+
if allow_null
|
|
193
|
+
filter = (filter | Polars.col(child_col).is_null)
|
|
194
|
+
end
|
|
195
|
+
child_df = child_df.filter(filter)
|
|
178
196
|
changed = child_df.height < before
|
|
179
197
|
|
|
180
198
|
# If we removed a part of the child_df earlier, concat it back on
|
data/lib/gtfs_df/graph.rb
CHANGED
|
@@ -50,12 +50,16 @@ module GtfsDf
|
|
|
50
50
|
["agency", "routes", {dependencies: [
|
|
51
51
|
{"agency" => "agency_id", "routes" => "agency_id"}
|
|
52
52
|
]}],
|
|
53
|
+
["fare_attributes", "agency", {dependencies: [
|
|
54
|
+
{"fare_attributes" => "agency_id",
|
|
55
|
+
"agency" => "agency_id"}
|
|
56
|
+
]}],
|
|
53
57
|
["fare_attributes", "fare_rules", {dependencies: [
|
|
54
58
|
{"fare_attributes" => "fare_id",
|
|
55
59
|
"fare_rules" => "fare_id"}
|
|
56
60
|
]}],
|
|
57
61
|
["fare_rules", "routes", {dependencies: [
|
|
58
|
-
{"fare_rules" => "route_id", "routes" => "route_id"}
|
|
62
|
+
{"fare_rules" => "route_id", "routes" => "route_id", :allow_null => true}
|
|
59
63
|
]}],
|
|
60
64
|
["routes", "trips", {dependencies: [
|
|
61
65
|
{"routes" => "route_id", "trips" => "route_id"}
|
data/lib/gtfs_df/schema/stops.rb
CHANGED
|
@@ -9,7 +9,7 @@ module GtfsDf
|
|
|
9
9
|
"stop_name" => Polars::String,
|
|
10
10
|
"tts_stop_name" => Polars::String,
|
|
11
11
|
"stop_desc" => Polars::String,
|
|
12
|
-
"stop_lat" => Polars::
|
|
12
|
+
"stop_lat" => Polars::Float64,
|
|
13
13
|
"stop_lon" => Polars::Float64,
|
|
14
14
|
"zone_id" => Polars::String,
|
|
15
15
|
"stop_url" => Polars::String,
|
data/lib/gtfs_df/version.rb
CHANGED