gtfs_df 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.conform.yaml +1 -1
- data/README.md +5 -4
- data/examples/split-by-agency/Gemfile.lock +1 -1
- data/examples/split-by-agency/split_by_agency.rb +9 -2
- data/lib/gtfs_df/base_gtfs_table.rb +5 -1
- data/lib/gtfs_df/feed.rb +82 -65
- data/lib/gtfs_df/graph.rb +2 -5
- data/lib/gtfs_df/reader.rb +21 -4
- data/lib/gtfs_df/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5b90534f9b41229026b91e4632c12dac2235ff615ac8138c301702e51bef3dfa
+  data.tar.gz: 95ac49f7dcdecea5f08c2fe51f8f50b0466bcb75a1d8ae67720b464c2983ed88
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2fc195cb47d81dd4799d99d8182f8c5ff7b56cdfdee08281ea8a129568b87f38d719956fd98532604dc26bf97dee344bbdd65943cc1e292248a2375b76290eda
+  data.tar.gz: 0d4b2e22968e5ed37349c82ff84c9ba8a6f4d12c143a8709caa40998e16afe4a2178f820f68bb384dc4cf12de445a9f0d265d163d48071621e69f64501db76d5
data/.conform.yaml
CHANGED
data/README.md
CHANGED
@@ -8,18 +8,16 @@ This project was created to bring the power of [partridge] to ruby.
 
 ## Installation
 
-TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
-
 Install the gem and add to the application's Gemfile by executing:
 
 ```bash
-bundle add
+bundle add gtfs_df
 ```
 
 If bundler is not being used to manage dependencies, install the gem by executing:
 
 ```bash
-gem install
+gem install gtfs_df
 ```
 
 ## Usage
@@ -32,6 +30,9 @@ require 'gtfs_df'
 # Load from a zip file
 feed = GtfsDf::Reader.load_from_zip('path/to/gtfs.zip')
 
+# Or, load from a directory
+feed = GtfsDf::Reader.load_from_dir('path/to/gtfs_dir')
+
 # Access dataframes for each GTFS file
 puts feed.agency.head
 puts feed.routes.head
data/examples/split-by-agency/Gemfile.lock
CHANGED
data/examples/split-by-agency/split_by_agency.rb
CHANGED
@@ -40,12 +40,19 @@ end
 
 agency_ids.each do |agency_id|
   Whirly.start do
-    Whirly.status = "-> #{agency_id} filtering..."
     output_path = File.join(output_dir, "#{agency_id}.zip")
+
+    start_time = Time.now
+
+    Whirly.status = "-> #{agency_id} filtering..."
     filtered_feed = feed.filter("agency" => {"agency_id" => agency_id})
+
     Whirly.status = "-> #{agency_id} writing..."
     GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
-
+
+    elapsed = Time.now - start_time
+
+    Whirly.status = "-> #{agency_id}.zip (#{elapsed.round(2)}s)"
   end
 end
 
data/lib/gtfs_df/base_gtfs_table.rb
CHANGED
@@ -10,7 +10,11 @@ module GtfsDf
       if input.is_a?(Polars::DataFrame)
         input
       elsif input.is_a?(String)
-
+        # We need to account for extra columns due to: https://github.com/ankane/ruby-polars/issues/125
+        all_columns = Polars.scan_csv(input).columns
+        default_schema = all_columns.map { |c| [c, Polars::String] }.to_h
+        dtypes = default_schema.merge(self.class::SCHEMA)
+        Polars.read_csv(input, null_values: [""], dtypes:)
      elsif input.is_a?(Array)
        head, *body = input
        df_input = body.each_with_object({}) do |row, acc|
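The new String branch builds a complete dtypes map before calling read_csv: every column that scan_csv finds in the file is first defaulted to Polars::String, then the table's declared SCHEMA is merged on top, so columns not covered by SCHEMA (the "extra columns" the linked ruby-polars issue refers to) still receive an explicit dtype. A minimal standalone sketch of the same technique; the file path and the declared_schema hash below are made-up examples, not the gem's real SCHEMA constants:

```ruby
require "polars-df" # ruby-polars

# Hypothetical declared dtypes for a stops table; in the gem this would be
# the table class's SCHEMA constant.
declared_schema = {
  "stop_lat" => Polars::Float64,
  "stop_lon" => Polars::Float64
}

path = "stops.txt" # made-up path

# Default every column present in the CSV to String, then overlay the
# declared dtypes, so read_csv receives a dtype for every column it will see.
all_columns = Polars.scan_csv(path).columns
dtypes = all_columns.to_h { |c| [c, Polars::String] }.merge(declared_schema)

df = Polars.read_csv(path, null_values: [""], dtypes: dtypes)
puts df.schema
```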
data/lib/gtfs_df/feed.rb
CHANGED
@@ -36,7 +36,7 @@ module GtfsDf
       booking_rules
     ].freeze
 
-    attr_reader(*GTFS_FILES)
+    attr_reader(*GTFS_FILES, :graph)
 
     # Initialize with a hash of DataFrames
     REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
@@ -53,6 +53,8 @@ module GtfsDf
         end.join(", ")}"
       end
 
+      @graph = GtfsDf::Graph.build
+
       GTFS_FILES.each do |file|
         df = data[file]
         schema_class_name = file.split("_").map(&:capitalize).join
@@ -68,85 +70,100 @@ module GtfsDf
       end
     end
 
-    # Load from a directory of GTFS CSV files
-    def self.load_from_dir(dir)
-      data = {}
-      GTFS_FILES.each do |file|
-        path = File.join(dir, "#{file}.txt")
-        next unless File.exist?(path)
-
-        schema_class_name = file.split("_").map(&:capitalize).join
-
-        data[file] = GtfsDf::Schema.const_get(schema_class_name).new(path)
-      end
-      new(data)
-    end
-
     # Filter the feed using a view hash
     # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
     def filter(view)
       filtered = {}
-
-      # Step 1: Apply view filters
+
       GTFS_FILES.each do |file|
         df = send(file)
         next unless df
 
-        filters = view[file]
-        if filters && !filters.empty?
-          filters.each do |col, val|
-            df = if val.is_a?(Array)
-              df.filter(Polars.col(col).is_in(val))
-            elsif val.respond_to?(:call)
-              df.filter(val.call(Polars.col(col)))
-            else
-              df.filter(Polars.col(col).eq(val))
-            end
-          end
-        end
         filtered[file] = df
       end
-
-      #
-
-
-
-
-
-
-
-
-
-
-        next unless child_df && child_df.height > 0
-
-        attrs[:dependencies].each do |dep|
-          parent_col = dep[parent_file]
-          child_col = dep[child_file]
-
-          next unless parent_col && child_col &&
-            parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
-
-          # Get valid values from parent
-          valid_values = parent_df[parent_col].to_a.uniq.compact
-          next if valid_values.empty?
-
-          # Filter child to only include rows that reference valid parent values
-          before = child_df.height
-          child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
-
-          if child_df.height < before
-            filtered[child_file] = child_df
-            changed = true
-          end
-        end
-      end
+
+      # Trips are the atomic unit of GTFS, we will generate a new view
+      # based on the set of trips that would be included for each individual filter
+      # and cascade changes from this view in order to retain referential integrity
+      trip_ids = nil
+
+      view.each do |file, filters|
+        new_filtered = filter!(file, filters, filtered.dup)
+        trip_ids = if trip_ids.nil?
+          new_filtered["trips"]["trip_id"]
+        else
+          trip_ids & new_filtered["trips"]["trip_id"]
         end
       end
 
+      if trip_ids
+        filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
+      end
+
       # Remove files that are empty, but keep required files even if empty
-      filtered.delete_if
+      filtered.delete_if do |file, df|
+        is_required_file = REQUIRED_GTFS_FILES.include?(file) ||
+          file == "calendar" && !filtered["calendar_dates"] ||
+          file == "calendar_dates" && !filtered["calendar"]
+
+        (!df || df.height == 0) && !is_required_file
+      end
       self.class.new(filtered)
     end
+
+    private
+
+    def filter!(file, filters, filtered)
+      unless filters.empty?
+        df = filtered[file]
+
+        filters.each do |col, val|
+          df = if val.is_a?(Array)
+            df.filter(Polars.col(col).is_in(val))
+          elsif val.respond_to?(:call)
+            df.filter(val.call(Polars.col(col)))
+          else
+            df.filter(Polars.col(col).eq(val))
+          end
+        end
+
+        filtered[file] = df
+
+        prune!(file, filtered)
+      end
+
+      filtered
+    end
+
+    def prune!(root, filtered)
+      graph.each_bfs_edge(root) do |parent_file, child_file|
+        parent_df = filtered[parent_file]
+        next unless parent_df
+
+        child_df = filtered[child_file]
+        next unless child_df && child_df.height > 0
+
+        attrs = graph.get_edge_data(parent_file, child_file)
+
+        attrs[:dependencies].each do |dep|
+          parent_col = dep[parent_file]
+          child_col = dep[child_file]
+
+          next unless parent_col && child_col &&
+            parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
+
+          # Get valid values from parent
+          valid_values = parent_df[parent_col].to_a.uniq.compact
+
+          # Filter child to only include rows that reference valid parent values
+          before = child_df.height
+          child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
+
+          if child_df.height < before
+            filtered[child_file] = child_df
+          end
+        end
+      end
+    end
   end
 end
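The rewritten Feed#filter applies each entry of the view independently, intersects the resulting trip_id sets (trips being the atomic unit), re-filters from "trips", and lets prune! cascade the reduction along the dependency graph. A hedged usage sketch; the feed path and the ID values below are invented for illustration:

```ruby
require "gtfs_df"

feed = GtfsDf::Reader.load_from_zip("path/to/gtfs.zip") # made-up path

# Keep only trips that satisfy BOTH constraints: they belong to route "R1"
# AND run under service "WEEKDAY". Each view entry is filtered on its own,
# the resulting trip_id sets are intersected, and prune! then drops orphaned
# rows in stop_times, stops, shapes, etc. via the dependency graph.
filtered = feed.filter(
  "routes" => {"route_id" => "R1"},
  "trips" => {"service_id" => "WEEKDAY"}
)

puts filtered.trips.height
GtfsDf::Writer.write_to_zip(filtered, "r1_weekday.zip")
```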
data/lib/gtfs_df/graph.rb
CHANGED
@@ -4,7 +4,7 @@ module GtfsDf
   class Graph
     # Returns a directed graph of GTFS file dependencies
     def self.build
-      g = NetworkX::
+      g = NetworkX::Graph.new
       # Nodes: GTFS files
       files = %w[
         agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
@@ -12,7 +12,7 @@ module GtfsDf
       ]
       files.each { |f| g.add_node(f) }
 
-      #
+      # TODO: Add fare_rules -> stops + test
      edges = [
        ["agency", "routes", {dependencies: [
          {"agency" => "agency_id", "routes" => "agency_id"}
@@ -116,9 +116,6 @@ module GtfsDf
        ["booking_rules", "stop_times", {dependencies: [
          {"booking_rules" => "booking_rule_id", "stop_times" => "pickup_booking_rule_id"},
          {"booking_rules" => "booking_rule_id", "stop_times" => "drop_off_booking_rule_id"}
-        ]}],
-        ["stops", "booking_rules", {dependencies: [
-          {"stops" => "stop_id", "booking_rules" => "stop_id"}
        ]}]
      ]
 
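Each edge in the graph carries a :dependencies list that maps a parent file's column to the child column referencing it; Feed#prune! walks these edges breadth-first from the file that was filtered. A small hedged sketch of inspecting one edge, reusing the same calls feed.rb makes on the built graph:

```ruby
require "gtfs_df"

graph = GtfsDf::Graph.build

# The agency -> routes edge records that routes.agency_id must reference an
# agency_id that still exists in agency.txt after filtering.
edge = graph.get_edge_data("agency", "routes")
edge[:dependencies].each do |dep|
  puts "agency.#{dep["agency"]} is referenced by routes.#{dep["routes"]}"
end
```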
data/lib/gtfs_df/reader.rb
CHANGED
@@ -10,19 +10,36 @@ module GtfsDf
           zip_file.each do |entry|
             next unless entry.file?
 
-            GtfsDf::Feed::GTFS_FILES.each do |
-              next unless entry.name == "#{
+            GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
+              next unless entry.name == "#{gtfs_file}.txt"
 
               out_path = File.join(tmpdir, entry.name)
               entry.extract(out_path)
-              schema_class_name = file.split("_").map(&:capitalize).join
 
-              data[
+              data[gtfs_file] = data_frame(gtfs_file, out_path)
             end
           end
         end
       end
       GtfsDf::Feed.new(data)
     end
+
+    # Loads a GTFS dir and returns a Feed
+    def self.load_from_dir(dir_path)
+      data = {}
+      GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
+        path = File.join(dir_path, "#{gtfs_file}.txt")
+        next unless File.exist?(path)
+
+        data[gtfs_file] = data_frame(gtfs_file, path)
+      end
+
+      GtfsDf::Feed.new(data)
+    end
+
+    private_class_method def self.data_frame(gtfs_file, path)
+      schema_class_name = gtfs_file.split("_").map(&:capitalize).join
+      GtfsDf::Schema.const_get(schema_class_name).new(path).df
+    end
   end
 end
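Both loaders now funnel file reading through the private data_frame helper, so a feed built from an unzipped directory behaves the same as one built from the archive. A hedged sketch; the fixture paths are invented:

```ruby
require "gtfs_df"

# Hypothetical paths to the same feed in unzipped and zipped form.
from_dir = GtfsDf::Reader.load_from_dir("spec/fixtures/sample_feed")
from_zip = GtfsDf::Reader.load_from_zip("spec/fixtures/sample_feed.zip")

# Identical data should yield identical frames regardless of the loader used.
puts from_dir.stops.columns == from_zip.stops.columns
puts from_dir.trips.height == from_zip.trips.height
```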
data/lib/gtfs_df/version.rb
CHANGED