gtfs_df 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d799dcd02b51045c948123e3b6bc64c41b5505279a583d98350377e6d06b4205
4
- data.tar.gz: 57ff89fd953842d68a5d4c7424a91b54484415a0b093bbe0d0b0411b025c6ef4
3
+ metadata.gz: 5b90534f9b41229026b91e4632c12dac2235ff615ac8138c301702e51bef3dfa
4
+ data.tar.gz: 95ac49f7dcdecea5f08c2fe51f8f50b0466bcb75a1d8ae67720b464c2983ed88
5
5
  SHA512:
6
- metadata.gz: 13b027c13cfec0a3493662cb048dd20f1b44aac778eb0e2d124ad873ae3649b1622758550399d9e457065401bfa04dbbb71e5e36da467599f8aac9675b2431ad
7
- data.tar.gz: 9d248e0fe06e69af6a5d86114e77e10583dcbf9956858411cc6ed2cc2d03ebf4ce2047d3e8bbc1eb5eb5530a2e544bf8cc21c4d0184fe331558008fce64a055a
6
+ metadata.gz: 2fc195cb47d81dd4799d99d8182f8c5ff7b56cdfdee08281ea8a129568b87f38d719956fd98532604dc26bf97dee344bbdd65943cc1e292248a2375b76290eda
7
+ data.tar.gz: 0d4b2e22968e5ed37349c82ff84c9ba8a6f4d12c143a8709caa40998e16afe4a2178f820f68bb384dc4cf12de445a9f0d265d163d48071621e69f64501db76d5
data/.conform.yaml CHANGED
@@ -7,7 +7,7 @@ policies:
7
7
  case: lower
8
8
  invalidLastCharacters: .
9
9
  body:
10
- required: true
10
+ required: false
11
11
  dco: false
12
12
  spellcheck:
13
13
  locale: US
data/README.md CHANGED
@@ -8,18 +8,16 @@ This project was created to bring the power of [partridge] to ruby.
8
8
 
9
9
  ## Installation
10
10
 
11
- TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
12
-
13
11
  Install the gem and add to the application's Gemfile by executing:
14
12
 
15
13
  ```bash
16
- bundle add UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
14
+ bundle add gtfs_df
17
15
  ```
18
16
 
19
17
  If bundler is not being used to manage dependencies, install the gem by executing:
20
18
 
21
19
  ```bash
22
- gem install UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
20
+ gem install gtfs_df
23
21
  ```
24
22
 
25
23
  ## Usage
@@ -32,6 +30,9 @@ require 'gtfs_df'
32
30
  # Load from a zip file
33
31
  feed = GtfsDf::Reader.load_from_zip('path/to/gtfs.zip')
34
32
 
33
+ # Or, load from a directory
34
+ feed = GtfsDf::Reader.load_from_dir('path/to/gtfs_dir')
35
+
35
36
  # Access dataframes for each GTFS file
36
37
  puts feed.agency.head
37
38
  puts feed.routes.head
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: ../..
3
3
  specs:
4
- gtfs_df (0.1.0)
4
+ gtfs_df (0.1.1)
5
5
  networkx (~> 0.4)
6
6
  polars-df (~> 0.22)
7
7
  rubyzip (~> 2.3)
@@ -40,12 +40,19 @@ end
40
40
 
41
41
  agency_ids.each do |agency_id|
42
42
  Whirly.start do
43
- Whirly.status = "-> #{agency_id} filtering..."
44
43
  output_path = File.join(output_dir, "#{agency_id}.zip")
44
+
45
+ start_time = Time.now
46
+
47
+ Whirly.status = "-> #{agency_id} filtering..."
45
48
  filtered_feed = feed.filter("agency" => {"agency_id" => agency_id})
49
+
46
50
  Whirly.status = "-> #{agency_id} writing..."
47
51
  GtfsDf::Writer.write_to_zip(filtered_feed, output_path)
48
- Whirly.status = "-> #{agency_id}"
52
+
53
+ elapsed = Time.now - start_time
54
+
55
+ Whirly.status = "-> #{agency_id}.zip (#{elapsed.round(2)}s)"
49
56
  end
50
57
  end
51
58
 
@@ -10,7 +10,11 @@ module GtfsDf
10
10
  if input.is_a?(Polars::DataFrame)
11
11
  input
12
12
  elsif input.is_a?(String)
13
- Polars.read_csv(input, dtypes: self.class::SCHEMA)
13
+ # We need to account for extra columns due to: https://github.com/ankane/ruby-polars/issues/125
14
+ all_columns = Polars.scan_csv(input).columns
15
+ default_schema = all_columns.map { |c| [c, Polars::String] }.to_h
16
+ dtypes = default_schema.merge(self.class::SCHEMA)
17
+ Polars.read_csv(input, null_values: [""], dtypes:)
14
18
  elsif input.is_a?(Array)
15
19
  head, *body = input
16
20
  df_input = body.each_with_object({}) do |row, acc|
data/lib/gtfs_df/feed.rb CHANGED
@@ -36,7 +36,7 @@ module GtfsDf
36
36
  booking_rules
37
37
  ].freeze
38
38
 
39
- attr_reader(*GTFS_FILES)
39
+ attr_reader(*GTFS_FILES, :graph)
40
40
 
41
41
  # Initialize with a hash of DataFrames
42
42
  REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
@@ -53,6 +53,8 @@ module GtfsDf
53
53
  end.join(", ")}"
54
54
  end
55
55
 
56
+ @graph = GtfsDf::Graph.build
57
+
56
58
  GTFS_FILES.each do |file|
57
59
  df = data[file]
58
60
  schema_class_name = file.split("_").map(&:capitalize).join
@@ -68,85 +70,100 @@ module GtfsDf
68
70
  end
69
71
  end
70
72
 
71
- # Load from a directory of GTFS CSV files
72
- def self.load_from_dir(dir)
73
- data = {}
74
- GTFS_FILES.each do |file|
75
- path = File.join(dir, "#{file}.txt")
76
- next unless File.exist?(path)
77
-
78
- schema_class_name = file.split("_").map(&:capitalize).join
79
-
80
- data[file] = GtfsDf::Schema.const_get(schema_class_name).new(path)
81
- end
82
- new(data)
83
- end
84
-
85
73
  # Filter the feed using a view hash
86
74
  # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
87
75
  def filter(view)
88
76
  filtered = {}
89
- graph = GtfsDf::Graph.build
90
- # Step 1: Apply view filters
77
+
91
78
  GTFS_FILES.each do |file|
92
79
  df = send(file)
93
80
  next unless df
94
81
 
95
- filters = view[file]
96
- if filters && !filters.empty?
97
- filters.each do |col, val|
98
- df = if val.is_a?(Array)
99
- df.filter(Polars.col(col).is_in(val))
100
- elsif val.respond_to?(:call)
101
- df.filter(val.call(Polars.col(col)))
102
- else
103
- df.filter(Polars.col(col).eq(val))
104
- end
105
- end
106
- end
107
82
  filtered[file] = df
108
83
  end
109
- # Step 2: Cascade filters following the directed edges
110
- # An edge from parent->child means: filter child based on valid parent IDs
111
- changed = true
112
- while changed
113
- changed = false
114
- GTFS_FILES.each do |parent_file|
115
- parent_df = filtered[parent_file]
116
- next unless parent_df && parent_df.height > 0
117
-
118
- # For each outgoing edge from parent_file to child_file
119
- graph.adj[parent_file]&.each do |child_file, attrs|
120
- child_df = filtered[child_file]
121
- next unless child_df && child_df.height > 0
122
-
123
- attrs[:dependencies].each do |dep|
124
- parent_col = dep[parent_file]
125
- child_col = dep[child_file]
126
-
127
- next unless parent_col && child_col &&
128
- parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
129
-
130
- # Get valid values from parent
131
- valid_values = parent_df[parent_col].to_a.uniq.compact
132
- next if valid_values.empty?
133
-
134
- # Filter child to only include rows that reference valid parent values
135
- before = child_df.height
136
- child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
137
-
138
- if child_df.height < before
139
- filtered[child_file] = child_df
140
- changed = true
141
- end
142
- end
143
- end
84
+
85
+ # Trips are the atomic unit of GTFS, so we will generate a new view
86
+ # based on the set of trips that would be included for each individual filter
87
+ # and cascade changes from this view in order to retain referential integrity
88
+ trip_ids = nil
89
+
90
+ view.each do |file, filters|
91
+ new_filtered = filter!(file, filters, filtered.dup)
92
+ trip_ids = if trip_ids.nil?
93
+ new_filtered["trips"]["trip_id"]
94
+ else
95
+ trip_ids & new_filtered["trips"]["trip_id"]
144
96
  end
145
97
  end
146
98
 
99
+ if trip_ids
100
+ filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
101
+ end
102
+
147
103
  # Remove files that are empty, but keep required files even if empty
148
- filtered.delete_if { |file, df| (!df || df.height == 0) && !REQUIRED_GTFS_FILES.include?(file) }
104
+ filtered.delete_if do |file, df|
105
+ is_required_file = REQUIRED_GTFS_FILES.include?(file) ||
106
+ file == "calendar" && !filtered["calendar_dates"] ||
107
+ file == "calendar_dates" && !filtered["calendar"]
108
+
109
+ (!df || df.height == 0) && !is_required_file
110
+ end
149
111
  self.class.new(filtered)
150
112
  end
113
+
114
+ private
115
+
116
+ def filter!(file, filters, filtered)
117
+ unless filters.empty?
118
+ df = filtered[file]
119
+
120
+ filters.each do |col, val|
121
+ df = if val.is_a?(Array)
122
+ df.filter(Polars.col(col).is_in(val))
123
+ elsif val.respond_to?(:call)
124
+ df.filter(val.call(Polars.col(col)))
125
+ else
126
+ df.filter(Polars.col(col).eq(val))
127
+ end
128
+ end
129
+
130
+ filtered[file] = df
131
+
132
+ prune!(file, filtered)
133
+ end
134
+
135
+ filtered
136
+ end
137
+
138
+ def prune!(root, filtered)
139
+ graph.each_bfs_edge(root) do |parent_file, child_file|
140
+ parent_df = filtered[parent_file]
141
+ next unless parent_df
142
+
143
+ child_df = filtered[child_file]
144
+ next unless child_df && child_df.height > 0
145
+
146
+ attrs = graph.get_edge_data(parent_file, child_file)
147
+
148
+ attrs[:dependencies].each do |dep|
149
+ parent_col = dep[parent_file]
150
+ child_col = dep[child_file]
151
+
152
+ next unless parent_col && child_col &&
153
+ parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
154
+
155
+ # Get valid values from parent
156
+ valid_values = parent_df[parent_col].to_a.uniq.compact
157
+
158
+ # Filter child to only include rows that reference valid parent values
159
+ before = child_df.height
160
+ child_df = child_df.filter(Polars.col(child_col).is_in(valid_values))
161
+
162
+ if child_df.height < before
163
+ filtered[child_file] = child_df
164
+ end
165
+ end
166
+ end
167
+ end
151
168
  end
152
169
  end
data/lib/gtfs_df/graph.rb CHANGED
@@ -4,7 +4,7 @@ module GtfsDf
4
4
  class Graph
5
5
  # Returns an undirected graph of GTFS file dependencies
6
6
  def self.build
7
- g = NetworkX::DiGraph.new
7
+ g = NetworkX::Graph.new
8
8
  # Nodes: GTFS files
9
9
  files = %w[
10
10
  agency routes trips stop_times stops calendar calendar_dates shapes transfers frequencies fare_attributes fare_rules
@@ -12,7 +12,7 @@ module GtfsDf
12
12
  ]
13
13
  files.each { |f| g.add_node(f) }
14
14
 
15
- # Edges: dependencies
15
+ # TODO: Add fare_rules -> stops + test
16
16
  edges = [
17
17
  ["agency", "routes", {dependencies: [
18
18
  {"agency" => "agency_id", "routes" => "agency_id"}
@@ -116,9 +116,6 @@ module GtfsDf
116
116
  ["booking_rules", "stop_times", {dependencies: [
117
117
  {"booking_rules" => "booking_rule_id", "stop_times" => "pickup_booking_rule_id"},
118
118
  {"booking_rules" => "booking_rule_id", "stop_times" => "drop_off_booking_rule_id"}
119
- ]}],
120
- ["stops", "booking_rules", {dependencies: [
121
- {"stops" => "stop_id", "booking_rules" => "stop_id"}
122
119
  ]}]
123
120
  ]
124
121
 
@@ -10,19 +10,36 @@ module GtfsDf
10
10
  zip_file.each do |entry|
11
11
  next unless entry.file?
12
12
 
13
- GtfsDf::Feed::GTFS_FILES.each do |file|
14
- next unless entry.name == "#{file}.txt"
13
+ GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
14
+ next unless entry.name == "#{gtfs_file}.txt"
15
15
 
16
16
  out_path = File.join(tmpdir, entry.name)
17
17
  entry.extract(out_path)
18
- schema_class_name = file.split("_").map(&:capitalize).join
19
18
 
20
- data[file] = GtfsDf::Schema.const_get(schema_class_name).new(out_path).df
19
+ data[gtfs_file] = data_frame(gtfs_file, out_path)
21
20
  end
22
21
  end
23
22
  end
24
23
  end
25
24
  GtfsDf::Feed.new(data)
26
25
  end
26
+
27
+ # Loads a GTFS dir and returns a Feed
28
+ def self.load_from_dir(dir_path)
29
+ data = {}
30
+ GtfsDf::Feed::GTFS_FILES.each do |gtfs_file|
31
+ path = File.join(dir_path, "#{gtfs_file}.txt")
32
+ next unless File.exist?(path)
33
+
34
+ data[gtfs_file] = data_frame(gtfs_file, path)
35
+ end
36
+
37
+ GtfsDf::Feed.new(data)
38
+ end
39
+
40
+ private_class_method def self.data_frame(gtfs_file, path)
41
+ schema_class_name = gtfs_file.split("_").map(&:capitalize).join
42
+ GtfsDf::Schema.const_get(schema_class_name).new(path).df
43
+ end
27
44
  end
28
45
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado