gtfs_df 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +16 -2
- data/lib/gtfs_df/feed.rb +32 -9
- data/lib/gtfs_df/graph.rb +43 -6
- data/lib/gtfs_df/reader.rb +8 -11
- data/lib/gtfs_df/schema/enum_values.rb +5 -2
- data/lib/gtfs_df/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 128899724d4613f0170fa601cb0c4cdc9f88a51b2fcd6c06108e7cc9acf9201c
|
|
4
|
+
data.tar.gz: 14c30a678bc9a623233c6631be1c7d497fff6bae4b30abd88406c297bc76190b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b7e3d1ac85953b995b82d0abbb9f872cf5307564ba67949320d607190ef5a7ff7f8e34df241cc584d7b36fc8db9c4159a7bf74ef1ce6d6868e4a3155784a9ec9
|
|
7
|
+
data.tar.gz: a1067ff3a0912b3eb56e3348e707c777563b50f440084a5b8c83b2d3da6ebf8a8a7ed49093a756d8871f1d3d849906daa844a0a66c04e56f97b03bfe2a106d79
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,20 @@
|
|
|
1
|
+
## [Unreleased]
|
|
2
|
+
|
|
3
|
+
## [0.3.0] - 2025-12-04
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- keep parent stations linked to used stops
|
|
8
|
+
|
|
9
|
+
### Fixed
|
|
10
|
+
|
|
11
|
+
- handle null values
|
|
12
|
+
- update lock on version bump
|
|
13
|
+
|
|
14
|
+
### Maintenance
|
|
15
|
+
|
|
16
|
+
- reuse load_from_dir logic in reader
|
|
17
|
+
- clean up unused method + better comments
|
|
1
18
|
## [0.1.0] - 2025-11-10
|
|
2
19
|
|
|
3
20
|
- Initial release
|
data/README.md
CHANGED
|
@@ -86,11 +86,25 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
|
86
86
|
|
|
87
87
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
88
88
|
|
|
89
|
+
## Release process
|
|
90
|
+
|
|
91
|
+
1. `bin/bump-version`
|
|
92
|
+
|
|
93
|
+
- Bump the version in `lib/gtfs_df/version.rb`
|
|
94
|
+
- Update the `CHANGELOG.md` using the git log since the last version
|
|
95
|
+
- Create and push a new release branch with those changes
|
|
96
|
+
- Create a PR for that release
|
|
97
|
+
|
|
98
|
+
2. `bin/create-tag`
|
|
99
|
+
|
|
100
|
+
Creates and pushes the git tag for the release. Pushing the tag triggers the GitHub Actions workflow `.github/workflows/publish.yml`, which publishes the gem to RubyGems.
|
|
101
|
+
|
|
89
102
|
## TODO
|
|
90
103
|
|
|
91
104
|
- [ ] Time parsing
|
|
92
|
-
|
|
93
|
-
|
|
105
|
+
|
|
106
|
+
Just like partridge, we should parse Time as seconds since midnight. There's a draft in `lib/gtfs_df/utils.rb` but it's not used anywhere.
|
|
107
|
+
I haven't figured out how to properly implement that with Polars.
|
|
94
108
|
|
|
95
109
|
## Contributing
|
|
96
110
|
|
data/lib/gtfs_df/feed.rb
CHANGED
|
@@ -135,19 +135,34 @@ module GtfsDf
|
|
|
135
135
|
filtered
|
|
136
136
|
end
|
|
137
137
|
|
|
138
|
+
# Traverses the graph to prune unreferenced entities from child dataframes
|
|
139
|
+
# based on parent relationships. See GtfsDf::Graph::STOP_NODES
|
|
138
140
|
def prune!(root, filtered)
|
|
139
|
-
graph.each_bfs_edge(root) do |
|
|
140
|
-
|
|
141
|
+
graph.each_bfs_edge(root) do |parent_node_id, child_node_id|
|
|
142
|
+
parent_node = Graph::NODES[parent_node_id]
|
|
143
|
+
child_node = Graph::NODES[child_node_id]
|
|
144
|
+
parent_df = filtered[parent_node.fetch(:file)]
|
|
141
145
|
next unless parent_df
|
|
142
146
|
|
|
143
|
-
child_df = filtered[
|
|
147
|
+
child_df = filtered[child_node.fetch(:file)]
|
|
148
|
+
# Certain nodes are pre-filtered because they reference only
|
|
149
|
+
# a piece of the dataframe
|
|
150
|
+
filter_attrs = child_node[:filter_attrs]
|
|
151
|
+
if filter_attrs && child_df.columns.include?(filter_attrs.fetch(:filter_col))
|
|
152
|
+
filter = filter_attrs.fetch(:filter)
|
|
153
|
+
# Temporarily remove rows that do not match node filter criteria to process them
|
|
154
|
+
# separately (e.g., when filtering stops, parent stations that should be preserved
|
|
155
|
+
# regardless of direct references)
|
|
156
|
+
saved_vals = child_df.filter(filter.is_not)
|
|
157
|
+
child_df = child_df.filter(filter)
|
|
158
|
+
end
|
|
144
159
|
next unless child_df && child_df.height > 0
|
|
145
160
|
|
|
146
|
-
attrs = graph.get_edge_data(
|
|
161
|
+
attrs = graph.get_edge_data(parent_node_id, child_node_id)
|
|
147
162
|
|
|
148
163
|
attrs[:dependencies].each do |dep|
|
|
149
|
-
parent_col = dep[
|
|
150
|
-
child_col = dep[
|
|
164
|
+
parent_col = dep[parent_node_id]
|
|
165
|
+
child_col = dep[child_node_id]
|
|
151
166
|
|
|
152
167
|
next unless parent_col && child_col &&
|
|
153
168
|
parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
|
|
@@ -157,10 +172,18 @@ module GtfsDf
|
|
|
157
172
|
|
|
158
173
|
# Filter child to only include rows that reference valid parent values
|
|
159
174
|
before = child_df.height
|
|
160
|
-
child_df = child_df.filter(
|
|
175
|
+
child_df = child_df.filter(
|
|
176
|
+
Polars.col(child_col).is_in(valid_values)
|
|
177
|
+
)
|
|
178
|
+
changed = child_df.height < before
|
|
179
|
+
|
|
180
|
+
# If we removed a part of the child_df earlier, concat it back on
|
|
181
|
+
if saved_vals
|
|
182
|
+
child_df = Polars.concat([child_df, saved_vals], how: "vertical")
|
|
183
|
+
end
|
|
161
184
|
|
|
162
|
-
if
|
|
163
|
-
filtered[
|
|
185
|
+
if changed
|
|
186
|
+
filtered[child_node.fetch(:file)] = child_df
|
|
164
187
|
end
|
|
165
188
|
end
|
|
166
189
|
end
|
data/lib/gtfs_df/graph.rb
CHANGED
|
@@ -2,15 +2,48 @@
|
|
|
2
2
|
|
|
3
3
|
module GtfsDf
|
|
4
4
|
class Graph
|
|
5
|
+
FILES = %w[
|
|
6
|
+
agency routes trips stop_times calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
|
|
7
|
+
fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
|
|
8
|
+
stop_areas fare_leg_rules
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
STANDARD_FILE_NODES = FILES.map do |file|
|
|
12
|
+
[file, {id: file, file: file, filter: nil}]
|
|
13
|
+
end.to_h.freeze
|
|
14
|
+
|
|
15
|
+
# Separate node definitions for stops and parent stations to handle the self-referential
|
|
16
|
+
# relationship in stops.txt where stops reference parent stations via parent_station column.
|
|
17
|
+
# This allows filtering to preserve parent stations when their child stops are referenced.
|
|
18
|
+
STOP_NODES = {
|
|
19
|
+
"stops" => {
|
|
20
|
+
id: "stops",
|
|
21
|
+
file: "stops",
|
|
22
|
+
filter_attrs: {
|
|
23
|
+
filter_col: "location_type",
|
|
24
|
+
filter: Polars.col("location_type").is_in(
|
|
25
|
+
Schema::EnumValues::STOP_LOCATION_TYPES.map(&:first)
|
|
26
|
+
) | Polars.col("location_type").is_null
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"parent_stations" => {
|
|
30
|
+
id: "parent_stations",
|
|
31
|
+
file: "stops",
|
|
32
|
+
filter_attrs: {
|
|
33
|
+
filter_col: "location_type",
|
|
34
|
+
filter: Polars.col("location_type").is_in(
|
|
35
|
+
Schema::EnumValues::STATION_LOCATION_TYPES.map(&:first)
|
|
36
|
+
) & Polars.col("location_type").is_not_null
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}.freeze
|
|
40
|
+
|
|
41
|
+
NODES = STANDARD_FILE_NODES.merge(STOP_NODES).freeze
|
|
42
|
+
|
|
5
43
|
# Returns a directed graph of GTFS file dependencies
|
|
6
44
|
def self.build
|
|
7
45
|
g = NetworkX::Graph.new
|
|
8
|
-
|
|
9
|
-
files = %w[
|
|
10
|
-
agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
|
|
11
|
-
fare_leg_join_rules fare_transfer_rules areas networks route_networks location_groups location_group_stops booking_rules
|
|
12
|
-
]
|
|
13
|
-
files.each { |f| g.add_node(f) }
|
|
46
|
+
NODES.keys.each { |node| g.add_node(node) }
|
|
14
47
|
|
|
15
48
|
# TODO: Add fare_rules -> stops + test
|
|
16
49
|
edges = [
|
|
@@ -33,6 +66,10 @@ module GtfsDf
|
|
|
33
66
|
["stop_times", "stops", {dependencies: [
|
|
34
67
|
{"stop_times" => "stop_id", "stops" => "stop_id"}
|
|
35
68
|
]}],
|
|
69
|
+
# Self-referential edge: stops can reference parent stations (location_type=1)
|
|
70
|
+
["stops", "parent_stations", {dependencies: [
|
|
71
|
+
{"stops" => "parent_station", "parent_stations" => "stop_id"}
|
|
72
|
+
]}],
|
|
36
73
|
["stops", "transfers", {dependencies: [
|
|
37
74
|
{"stops" => "stop_id", "transfers" => "from_stop_id"},
|
|
38
75
|
{"stops" => "stop_id", "transfers" => "to_stop_id"}
|
data/lib/gtfs_df/reader.rb
CHANGED
|
@@ -4,24 +4,21 @@ module GtfsDf
|
|
|
4
4
|
class Reader
|
|
5
5
|
# Loads a GTFS zip file and returns a Feed
|
|
6
6
|
def self.load_from_zip(zip_path)
|
|
7
|
-
data =
|
|
7
|
+
data = nil
|
|
8
|
+
|
|
8
9
|
Dir.mktmpdir do |tmpdir|
|
|
9
10
|
Zip::File.open(zip_path) do |zip_file|
|
|
10
11
|
zip_file.each do |entry|
|
|
11
12
|
next unless entry.file?
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
next unless entry.name == "#{gtfs_file}.txt"
|
|
15
|
-
|
|
16
|
-
out_path = File.join(tmpdir, entry.name)
|
|
17
|
-
entry.extract(out_path)
|
|
18
|
-
|
|
19
|
-
data[gtfs_file] = data_frame(gtfs_file, out_path)
|
|
20
|
-
end
|
|
13
|
+
out_path = File.join(tmpdir, entry.name)
|
|
14
|
+
entry.extract(out_path)
|
|
21
15
|
end
|
|
22
16
|
end
|
|
17
|
+
|
|
18
|
+
data = load_from_dir(tmpdir)
|
|
23
19
|
end
|
|
24
|
-
|
|
20
|
+
|
|
21
|
+
data
|
|
25
22
|
end
|
|
26
23
|
|
|
27
24
|
# Loads a GTFS dir and returns a Feed
|
|
@@ -82,13 +82,16 @@ module GtfsDf
|
|
|
82
82
|
|
|
83
83
|
# stops.txt
|
|
84
84
|
# location_type: Type of location
|
|
85
|
-
|
|
85
|
+
STOP_LOCATION_TYPES = [
|
|
86
86
|
["0", "Stop or platform"],
|
|
87
|
-
["1", "Station"],
|
|
88
87
|
["2", "Entrance/Exit"],
|
|
89
88
|
["3", "Generic Node"],
|
|
90
89
|
["4", "Boarding Area"]
|
|
91
90
|
]
|
|
91
|
+
STATION_LOCATION_TYPES = [
|
|
92
|
+
["1", "Station"]
|
|
93
|
+
]
|
|
94
|
+
LOCATION_TYPE = STOP_LOCATION_TYPES + STATION_LOCATION_TYPES
|
|
92
95
|
|
|
93
96
|
# wheelchair_boarding: Indicates wheelchair boarding possibility
|
|
94
97
|
WHEELCHAIR_BOARDING = [
|
data/lib/gtfs_df/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gtfs_df
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Mejorado
|
|
8
8
|
bindir: exe
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 1980-01-
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: networkx
|