gtfs_df 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +16 -2
- data/lib/gtfs_df/feed.rb +67 -26
- data/lib/gtfs_df/graph.rb +48 -7
- data/lib/gtfs_df/reader.rb +8 -11
- data/lib/gtfs_df/schema/enum_values.rb +5 -2
- data/lib/gtfs_df/schema/stops.rb +1 -1
- data/lib/gtfs_df/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 127f50187dd26c4824abd94a6a44f64242715bf9092072276cdf6c811e0c28a6
|
|
4
|
+
data.tar.gz: d5a6bd9b25830d0574c6c315712d1ec4e0b14959d25f6a1cf12f2e13eeb738a2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9312f173960b069a31bb2bb37368bd65ad5345d527d5a29818a7485c731bcb6f6c5fb74639403a3bcbd5d5704ae2bbf629ed5cdfe3f5f629f36ea719663356da
|
|
7
|
+
data.tar.gz: 19563c256e1cbe52a34eef47c31f6b50806a0cb07b773bacb015ad40730c6d804941aef1c936aaeb6c17ea49719ab7db64d1144eeb05c7c4bd1d83ebcd87e12f
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,35 @@
|
|
|
1
|
+
## [0.4.0] - 2025-12-04
|
|
2
|
+
|
|
3
|
+
### Added
|
|
4
|
+
|
|
5
|
+
- allow setting maintain_trip_dependencies=false
|
|
6
|
+
|
|
7
|
+
### Fixed
|
|
8
|
+
|
|
9
|
+
- parse stop_lat as float
|
|
10
|
+
- add missing agency -> fare_attributes edge
|
|
11
|
+
- allow null for fare_rules
|
|
12
|
+
|
|
13
|
+
### Maintenance
|
|
14
|
+
|
|
15
|
+
- provide accessor for gtfs_files (utility)
|
|
16
|
+
- add yard docs
|
|
17
|
+
|
|
18
|
+
## [0.3.0] - 2025-12-04
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- keep parent stations linked to used stops
|
|
23
|
+
|
|
24
|
+
### Fixed
|
|
25
|
+
|
|
26
|
+
- handle null values
|
|
27
|
+
- update lock on version bump
|
|
28
|
+
|
|
29
|
+
### Maintenance
|
|
30
|
+
|
|
31
|
+
- reuse load_from_dir logic in reader
|
|
32
|
+
- clean up unused method + better comments
|
|
1
33
|
## [0.1.0] - 2025-11-10
|
|
2
34
|
|
|
3
35
|
- Initial release
|
data/README.md
CHANGED
|
@@ -86,11 +86,25 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
|
86
86
|
|
|
87
87
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
88
88
|
|
|
89
|
+
## Release process
|
|
90
|
+
|
|
91
|
+
1. `bin/bump-version`
|
|
92
|
+
|
|
93
|
+
- Bump the version in `lib/gtfs_df/version.rb`
|
|
94
|
+
- Update the `CHANGELOG.md` using the git log since the last version
|
|
95
|
+
- Create and push a new release branch with those changes
|
|
96
|
+
- Create a PR for that release
|
|
97
|
+
|
|
98
|
+
2. `bin/create-tag`
|
|
99
|
+
|
|
100
|
+
Creates and pushes the git tag for the release. That will trigger the GitHub action: `.github/workflows/publish.yml` to publish to RubyGems.
|
|
101
|
+
|
|
89
102
|
## TODO
|
|
90
103
|
|
|
91
104
|
- [ ] Time parsing
|
|
92
|
-
|
|
93
|
-
|
|
105
|
+
|
|
106
|
+
Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
|
|
107
|
+
I haven't figured out how to properly implement that with Polars.
|
|
94
108
|
|
|
95
109
|
## Contributing
|
|
96
110
|
|
data/lib/gtfs_df/feed.rb
CHANGED
|
@@ -36,7 +36,8 @@ module GtfsDf
|
|
|
36
36
|
booking_rules
|
|
37
37
|
].freeze
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
attr_accessor(*GTFS_FILES)
|
|
40
|
+
attr_reader(:graph)
|
|
40
41
|
|
|
41
42
|
# Initialize with a hash of DataFrames
|
|
42
43
|
REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
|
|
@@ -71,8 +72,16 @@ module GtfsDf
|
|
|
71
72
|
end
|
|
72
73
|
|
|
73
74
|
# Filter the feed using a view hash
|
|
74
|
-
#
|
|
75
|
-
|
|
75
|
+
#
|
|
76
|
+
# @param view [Hash] The view used to filter the feed, with format { file => filters }.
|
|
77
|
+
# Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
|
|
78
|
+
# @param maintain_trip_dependencies [Boolean] Whether trip dependencies should be preserved.
|
|
79
|
+
# By default, we treat trips as the atomic unit of GTFS. Therefore, if we filter to one stop
|
|
80
|
+
# referenced by TripA, we will preserve _all stops_ referenced by TripA. However, it is
|
|
81
|
+
# occasionally useful to prune bad data and _not_ maintain all trip dependencies.
|
|
82
|
+
# For example, if StopA contains invalid coordinates, we may wish to filter it out but keep
|
|
83
|
+
# the other stops for TripA. In this case, `maintain_trip_dependencies` should be set to false.
|
|
84
|
+
def filter(view, maintain_trip_dependencies = true)
|
|
76
85
|
filtered = {}
|
|
77
86
|
|
|
78
87
|
GTFS_FILES.each do |file|
|
|
@@ -82,22 +91,28 @@ module GtfsDf
|
|
|
82
91
|
filtered[file] = df
|
|
83
92
|
end
|
|
84
93
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
94
|
+
if maintain_trip_dependencies
|
|
95
|
+
# Trips are the atomic unit of GTFS, we will generate a new view
|
|
96
|
+
# based on the set of trips that would be included for each individual filter
|
|
97
|
+
# and cascade changes from this view in order to retain referential integrity
|
|
98
|
+
trip_ids = nil
|
|
99
|
+
|
|
100
|
+
view.each do |file, filters|
|
|
101
|
+
new_filtered = filter!(file, filters, filtered.dup)
|
|
102
|
+
trip_ids = if trip_ids.nil?
|
|
103
|
+
new_filtered["trips"]["trip_id"]
|
|
104
|
+
else
|
|
105
|
+
trip_ids & new_filtered["trips"]["trip_id"]
|
|
106
|
+
end
|
|
96
107
|
end
|
|
97
|
-
end
|
|
98
108
|
|
|
99
|
-
|
|
100
|
-
|
|
109
|
+
if trip_ids
|
|
110
|
+
filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
|
|
111
|
+
end
|
|
112
|
+
else
|
|
113
|
+
view.each do |file, filters|
|
|
114
|
+
filtered = filter!(file, filters, filtered.dup)
|
|
115
|
+
end
|
|
101
116
|
end
|
|
102
117
|
|
|
103
118
|
# Remove files that are empty, but keep required files even if empty
|
|
@@ -135,19 +150,35 @@ module GtfsDf
|
|
|
135
150
|
filtered
|
|
136
151
|
end
|
|
137
152
|
|
|
153
|
+
# Traverses the graph to prune unreferenced entities from child dataframes
|
|
154
|
+
# based on parent relationships. See GtfsDf::Graph::STOP_NODES
|
|
138
155
|
def prune!(root, filtered)
|
|
139
|
-
graph.each_bfs_edge(root) do |
|
|
140
|
-
|
|
156
|
+
graph.each_bfs_edge(root) do |parent_node_id, child_node_id|
|
|
157
|
+
parent_node = Graph::NODES[parent_node_id]
|
|
158
|
+
child_node = Graph::NODES[child_node_id]
|
|
159
|
+
parent_df = filtered[parent_node.fetch(:file)]
|
|
141
160
|
next unless parent_df
|
|
142
161
|
|
|
143
|
-
child_df = filtered[
|
|
162
|
+
child_df = filtered[child_node.fetch(:file)]
|
|
163
|
+
# Certain nodes are pre-filtered because they reference only
|
|
164
|
+
# a piece of the dataframe
|
|
165
|
+
filter_attrs = child_node[:filter_attrs]
|
|
166
|
+
if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
|
|
167
|
+
filter = filter_attrs.fetch(:filter)
|
|
168
|
+
# Temporarily remove rows that do not match node filter criteria to process them
|
|
169
|
+
# separately (e.g., when filtering stops, parent stations that should be preserved
|
|
170
|
+
# regardless of direct references)
|
|
171
|
+
saved_vals = child_df.filter(filter.is_not)
|
|
172
|
+
child_df = child_df.filter(filter)
|
|
173
|
+
end
|
|
144
174
|
next unless child_df && child_df.height > 0
|
|
145
175
|
|
|
146
|
-
attrs = graph.get_edge_data(
|
|
176
|
+
attrs = graph.get_edge_data(parent_node_id, child_node_id)
|
|
147
177
|
|
|
148
178
|
attrs[:dependencies].each do |dep|
|
|
149
|
-
parent_col = dep[
|
|
150
|
-
child_col = dep[
|
|
179
|
+
parent_col = dep[parent_node_id]
|
|
180
|
+
child_col = dep[child_node_id]
|
|
181
|
+
allow_null = !!dep[:allow_null]
|
|
151
182
|
|
|
152
183
|
next unless parent_col && child_col &&
|
|
153
184
|
parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
|
|
@@ -157,10 +188,20 @@ module GtfsDf
|
|
|
157
188
|
|
|
158
189
|
# Filter child to only include rows that reference valid parent values
|
|
159
190
|
before = child_df.height
|
|
160
|
-
|
|
191
|
+
filter = Polars.col(child_col).is_in(valid_values)
|
|
192
|
+
if allow_null
|
|
193
|
+
filter = (filter | Polars.col(child_col).is_null)
|
|
194
|
+
end
|
|
195
|
+
child_df = child_df.filter(filter)
|
|
196
|
+
changed = child_df.height < before
|
|
197
|
+
|
|
198
|
+
# If we removed a part of the child_df earlier, concat it back on
|
|
199
|
+
if saved_vals
|
|
200
|
+
child_df = Polars.concat([child_df, saved_vals], how: "vertical")
|
|
201
|
+
end
|
|
161
202
|
|
|
162
|
-
if
|
|
163
|
-
filtered[
|
|
203
|
+
if changed
|
|
204
|
+
filtered[child_node.fetch(:file)] = child_df
|
|
164
205
|
end
|
|
165
206
|
end
|
|
166
207
|
end
|
data/lib/gtfs_df/graph.rb
CHANGED
|
@@ -2,27 +2,64 @@
|
|
|
2
2
|
|
|
3
3
|
module GtfsDf
|
|
4
4
|
class Graph
|
|
5
|
+
FILES = %w[
|
|
6
|
+
agency routes trips stop_times calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
|
|
7
|
+
fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
|
|
8
|
+
stop_areas fare_leg_rules
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
STANDARD_FILE_NODES = FILES.map do |file|
|
|
12
|
+
[file, {id: file, file: file, filter: nil}]
|
|
13
|
+
end.to_h.freeze
|
|
14
|
+
|
|
15
|
+
# Separate node definitions for stops and parent stations to handle the self-referential
|
|
16
|
+
# relationship in stops.txt where stops reference parent stations via parent_station column.
|
|
17
|
+
# This allows filtering to preserve parent stations when their child stops are referenced.
|
|
18
|
+
STOP_NODES = {
|
|
19
|
+
"stops" => {
|
|
20
|
+
id: "stops",
|
|
21
|
+
file: "stops",
|
|
22
|
+
filter_attrs: {
|
|
23
|
+
filter_col: "location_type",
|
|
24
|
+
filter: Polars.col("location_type").is_in(
|
|
25
|
+
Schema::EnumValues::STOP_LOCATION_TYPES.map(&:first)
|
|
26
|
+
) | Polars.col("location_type").is_null
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"parent_stations" => {
|
|
30
|
+
id: "parent_stations",
|
|
31
|
+
file: "stops",
|
|
32
|
+
filter_attrs: {
|
|
33
|
+
filter_col: "location_type",
|
|
34
|
+
filter: Polars.col("location_type").is_in(
|
|
35
|
+
Schema::EnumValues::STATION_LOCATION_TYPES.map(&:first)
|
|
36
|
+
) & Polars.col("location_type").is_not_null
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}.freeze
|
|
40
|
+
|
|
41
|
+
NODES = STANDARD_FILE_NODES.merge(STOP_NODES).freeze
|
|
42
|
+
|
|
5
43
|
# Returns a directed graph of GTFS file dependencies
|
|
6
44
|
def self.build
|
|
7
45
|
g = NetworkX::Graph.new
|
|
8
|
-
|
|
9
|
-
files = %w[
|
|
10
|
-
agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
|
|
11
|
-
fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
|
|
12
|
-
]
|
|
13
|
-
files.each { |f| g.add_node(f) }
|
|
46
|
+
NODES.keys.each { |node| g.add_node(node) }
|
|
14
47
|
|
|
15
48
|
# TODO: Add fare_rules -> stops + test
|
|
16
49
|
edges = [
|
|
17
50
|
["agency", "routes", {dependencies: [
|
|
18
51
|
{"agency" => "agency_id", "routes" => "agency_id"}
|
|
19
52
|
]}],
|
|
53
|
+
["fare_attributes", "agency", {dependencies: [
|
|
54
|
+
{"fare_attributes" => "agency_id",
|
|
55
|
+
"agency" => "agency_id"}
|
|
56
|
+
]}],
|
|
20
57
|
["fare_attributes", "fare_rules", {dependencies: [
|
|
21
58
|
{"fare_attributes" => "fare_id",
|
|
22
59
|
"fare_rules" => "fare_id"}
|
|
23
60
|
]}],
|
|
24
61
|
["fare_rules", "routes", {dependencies: [
|
|
25
|
-
{"fare_rules" => "route_id", "routes" => "route_id"}
|
|
62
|
+
{"fare_rules" => "route_id", "routes" => "route_id", :allow_null => true}
|
|
26
63
|
]}],
|
|
27
64
|
["routes", "trips", {dependencies: [
|
|
28
65
|
{"routes" => "route_id", "trips" => "route_id"}
|
|
@@ -33,6 +70,10 @@ module GtfsDf
|
|
|
33
70
|
["stop_times", "stops", {dependencies: [
|
|
34
71
|
{"stop_times" => "stop_id", "stops" => "stop_id"}
|
|
35
72
|
]}],
|
|
73
|
+
# Self-referential edge: stops can reference parent stations (location_type=1)
|
|
74
|
+
["stops", "parent_stations", {dependencies: [
|
|
75
|
+
{"stops" => "parent_station", "parent_stations" => "stop_id"}
|
|
76
|
+
]}],
|
|
36
77
|
["stops", "transfers", {dependencies: [
|
|
37
78
|
{"stops" => "stop_id", "transfers" => "from_stop_id"},
|
|
38
79
|
{"stops" => "stop_id", "transfers" => "to_stop_id"}
|
data/lib/gtfs_df/reader.rb
CHANGED
|
@@ -4,24 +4,21 @@ module GtfsDf
|
|
|
4
4
|
class Reader
|
|
5
5
|
# Loads a GTFS zip file and returns a Feed
|
|
6
6
|
def self.load_from_zip(zip_path)
|
|
7
|
-
data =
|
|
7
|
+
data = nil
|
|
8
|
+
|
|
8
9
|
Dir.mktmpdir do |tmpdir|
|
|
9
10
|
Zip::File.open(zip_path) do |zip_file|
|
|
10
11
|
zip_file.each do |entry|
|
|
11
12
|
next unless entry.file?
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
next unless entry.name == "#{gtfs_file}.txt"
|
|
15
|
-
|
|
16
|
-
out_path = File.join(tmpdir, entry.name)
|
|
17
|
-
entry.extract(out_path)
|
|
18
|
-
|
|
19
|
-
data[gtfs_file] = data_frame(gtfs_file, out_path)
|
|
20
|
-
end
|
|
13
|
+
out_path = File.join(tmpdir, entry.name)
|
|
14
|
+
entry.extract(out_path)
|
|
21
15
|
end
|
|
22
16
|
end
|
|
17
|
+
|
|
18
|
+
data = load_from_dir(tmpdir)
|
|
23
19
|
end
|
|
24
|
-
|
|
20
|
+
|
|
21
|
+
data
|
|
25
22
|
end
|
|
26
23
|
|
|
27
24
|
# Loads a GTFS dir and returns a Feed
|
|
@@ -82,13 +82,16 @@ module GtfsDf
|
|
|
82
82
|
|
|
83
83
|
# stops.txt
|
|
84
84
|
# location_type: Type of location
|
|
85
|
-
|
|
85
|
+
STOP_LOCATION_TYPES = [
|
|
86
86
|
["0", "Stop or platform"],
|
|
87
|
-
["1", "Station"],
|
|
88
87
|
["2", "Entrance/Exit"],
|
|
89
88
|
["3", "Generic Node"],
|
|
90
89
|
["4", "Boarding Area"]
|
|
91
90
|
]
|
|
91
|
+
STATION_LOCATION_TYPES = [
|
|
92
|
+
["1", "Station"]
|
|
93
|
+
]
|
|
94
|
+
LOCATION_TYPE = STOP_LOCATION_TYPES + STATION_LOCATION_TYPES
|
|
92
95
|
|
|
93
96
|
# wheelchair_boarding: Indicates wheelchair boarding possibility
|
|
94
97
|
WHEELCHAIR_BOARDING = [
|
data/lib/gtfs_df/schema/stops.rb
CHANGED
|
@@ -9,7 +9,7 @@ module GtfsDf
|
|
|
9
9
|
"stop_name" => Polars::String,
|
|
10
10
|
"tts_stop_name" => Polars::String,
|
|
11
11
|
"stop_desc" => Polars::String,
|
|
12
|
-
"stop_lat" => Polars::
|
|
12
|
+
"stop_lat" => Polars::Float64,
|
|
13
13
|
"stop_lon" => Polars::Float64,
|
|
14
14
|
"zone_id" => Polars::String,
|
|
15
15
|
"stop_url" => Polars::String,
|
data/lib/gtfs_df/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gtfs_df
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Mejorado
|
|
8
8
|
bindir: exe
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 1980-01-
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: networkx
|