gtfs_df 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 128899724d4613f0170fa601cb0c4cdc9f88a51b2fcd6c06108e7cc9acf9201c
4
- data.tar.gz: 14c30a678bc9a623233c6631be1c7d497fff6bae4b30abd88406c297bc76190b
3
+ metadata.gz: 205ec058b41c5bd1d2b01ff3950d4cd2ebb20f304d02d5bcd3dd0c447b4e0a6e
4
+ data.tar.gz: 54cca637de421c26d2144df100f8430f5dddb639b3a37d1160dfbd9704630e33
5
5
  SHA512:
6
- metadata.gz: b7e3d1ac85953b995b82d0abbb9f872cf5307564ba67949320d607190ef5a7ff7f8e34df241cc584d7b36fc8db9c4159a7bf74ef1ce6d6868e4a3155784a9ec9
7
- data.tar.gz: a1067ff3a0912b3eb56e3348e707c777563b50f440084a5b8c83b2d3da6ebf8a8a7ed49093a756d8871f1d3d849906daa844a0a66c04e56f97b03bfe2a106d79
6
+ metadata.gz: ef853b504ee701e77911259352d0057f6e327db8022370a1fa0dbaa597bfadcac7ac1c8fbe746c04ec4669d86384279a259ac35d4126f1483286937812cb7cce
7
+ data.tar.gz: 728d684dd02b653cc779aa5e8152766f5dccafcb097e8fca5810b90f994c442645119f32efb62c84d8fbf1645109f579a3c74f6eb88c9e21dd03faa72193cdd0
data/CHANGELOG.md CHANGED
@@ -1,4 +1,28 @@
1
- ## [Unreleased]
1
+ ## [0.4.1] - 2025-12-05
2
+
3
+ ### Added
4
+
5
+ - handle extra whitespace in csvs
6
+
7
+ ### Maintenance
8
+
9
+ - remove unused initializer format
10
+ ## [0.4.0] - 2025-12-04
11
+
12
+ ### Added
13
+
14
+ - allow setting maintain_trip_dependencies=false
15
+
16
+ ### Fixed
17
+
18
+ - parse stop_lat as float
19
+ - add missing agency -> fare_attributes edge
20
+ - allow null for fare_rules
21
+
22
+ ### Maintenance
23
+
24
+ - provide accessor for gtfs_files (utility)
25
+ - add yard docs
2
26
 
3
27
  ## [0.3.0] - 2025-12-04
4
28
 
@@ -10,20 +10,21 @@ module GtfsDf
10
10
  if input.is_a?(Polars::DataFrame)
11
11
  input
12
12
  elsif input.is_a?(String)
13
- # We need to account for extra columns due to: https://github.com/ankane/ruby-polars/issues/125
14
- all_columns = Polars.scan_csv(input).columns
15
- default_schema = all_columns.map { |c| [c, Polars::String] }.to_h
16
- dtypes = default_schema.merge(self.class::SCHEMA)
17
- Polars.read_csv(input, null_values: [""], dtypes:)
18
- elsif input.is_a?(Array)
19
- head, *body = input
20
- df_input = body.each_with_object({}) do |row, acc|
21
- head.each_with_index do |column, index|
22
- acc[column] ||= []
23
- acc[column] << row[index]
24
- end
25
- end
26
- Polars::DataFrame.new(df_input, schema_overrides: self.class::SCHEMA, strict: false)
13
+ # TODO: use `infer_schema: false` instead of `infer_schema_length` after polars release:
14
+ # https://github.com/ankane/ruby-polars/blob/master/CHANGELOG.md#100-unreleased
15
+ df = Polars.read_csv(input, infer_schema_length: 0)
16
+ dtypes = self.class::SCHEMA.slice(*df.columns)
17
+
18
+ df
19
+ .with_columns(dtypes.keys.map do |col|
20
+ stripped = Polars.col(col).str.strip
21
+ Polars.when(stripped.str.len_chars.gt(0))
22
+ .then(stripped)
23
+ .otherwise(Polars.lit(nil))
24
+ end)
25
+ .with_columns(dtypes.map do |name, type|
26
+ Polars.col(name).cast(type)
27
+ end)
27
28
  else
28
29
  throw GtfsDf::Error, "Unrecognized input"
29
30
  end
data/lib/gtfs_df/feed.rb CHANGED
@@ -36,7 +36,8 @@ module GtfsDf
36
36
  booking_rules
37
37
  ].freeze
38
38
 
39
- attr_reader(*GTFS_FILES, :graph)
39
+ attr_accessor(*GTFS_FILES)
40
+ attr_reader(:graph)
40
41
 
41
42
  # Initialize with a hash of DataFrames
42
43
  REQUIRED_GTFS_FILES = %w[agency stops routes trips stop_times].freeze
@@ -71,8 +72,16 @@ module GtfsDf
71
72
  end
72
73
 
73
74
  # Filter the feed using a view hash
74
- # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
75
- def filter(view)
75
+ #
76
+ # @param view [Hash] The view used to filter the feed, with format { file => filters }.
77
+ # Example view: { 'routes' => { 'route_id' => '123' }, 'trips' => { 'service_id' => 'A' } }
78
+ # @param maintain_trip_dependencies [Boolean] Whether trip dependencies should be preserved.
79
+ # By default, we treat trips as the atomic unit of GTFS. Therefore, if we filter to one stop
80
+ # referenced by TripA, we will preserve _all stops_ referenced by TripA. However, it is
81
+ # occasionally useful to prune bad data and _not_ maintain all trip dependencies.
82
+ # For example, if StopA contains invalid coordinates, we may wish to filter it out but keep
83
+ # the other stops for TripA. In this case, `maintain_trip_dependencies` should be set to false.
84
+ def filter(view, maintain_trip_dependencies = true)
76
85
  filtered = {}
77
86
 
78
87
  GTFS_FILES.each do |file|
@@ -82,22 +91,28 @@ module GtfsDf
82
91
  filtered[file] = df
83
92
  end
84
93
 
85
- # Trips are the atomic unit of GTFS, we will generate a new view
86
- # based on the set of trips that would be included for each invidual filter
87
- # and cascade changes from this view in order to retain referential integrity
88
- trip_ids = nil
89
-
90
- view.each do |file, filters|
91
- new_filtered = filter!(file, filters, filtered.dup)
92
- trip_ids = if trip_ids.nil?
93
- new_filtered["trips"]["trip_id"]
94
- else
95
- trip_ids & new_filtered["trips"]["trip_id"]
94
+ if maintain_trip_dependencies
95
+ # Trips are the atomic unit of GTFS, we will generate a new view
96
+ # based on the set of trips that would be included for each individual filter
97
+ # and cascade changes from this view in order to retain referential integrity
98
+ trip_ids = nil
99
+
100
+ view.each do |file, filters|
101
+ new_filtered = filter!(file, filters, filtered.dup)
102
+ trip_ids = if trip_ids.nil?
103
+ new_filtered["trips"]["trip_id"]
104
+ else
105
+ trip_ids & new_filtered["trips"]["trip_id"]
106
+ end
96
107
  end
97
- end
98
108
 
99
- if trip_ids
100
- filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
109
+ if trip_ids
110
+ filtered = filter!("trips", {"trip_id" => trip_ids.to_a}, filtered)
111
+ end
112
+ else
113
+ view.each do |file, filters|
114
+ filtered = filter!(file, filters, filtered.dup)
115
+ end
101
116
  end
102
117
 
103
118
  # Remove files that are empty, but keep required files even if empty
@@ -163,6 +178,7 @@ module GtfsDf
163
178
  attrs[:dependencies].each do |dep|
164
179
  parent_col = dep[parent_node_id]
165
180
  child_col = dep[child_node_id]
181
+ allow_null = !!dep[:allow_null]
166
182
 
167
183
  next unless parent_col && child_col &&
168
184
  parent_df.columns.include?(parent_col) && child_df.columns.include?(child_col)
@@ -172,9 +188,11 @@ module GtfsDf
172
188
 
173
189
  # Filter child to only include rows that reference valid parent values
174
190
  before = child_df.height
175
- child_df = child_df.filter(
176
- Polars.col(child_col).is_in(valid_values)
177
- )
191
+ filter = Polars.col(child_col).is_in(valid_values)
192
+ if allow_null
193
+ filter = (filter | Polars.col(child_col).is_null)
194
+ end
195
+ child_df = child_df.filter(filter)
178
196
  changed = child_df.height < before
179
197
 
180
198
  # If we removed a part of the child_df earlier, concat it back on
data/lib/gtfs_df/graph.rb CHANGED
@@ -50,12 +50,16 @@ module GtfsDf
50
50
  ["agency", "routes", {dependencies: [
51
51
  {"agency" => "agency_id", "routes" => "agency_id"}
52
52
  ]}],
53
+ ["fare_attributes", "agency", {dependencies: [
54
+ {"fare_attributes" => "agency_id",
55
+ "agency" => "agency_id"}
56
+ ]}],
53
57
  ["fare_attributes", "fare_rules", {dependencies: [
54
58
  {"fare_attributes" => "fare_id",
55
59
  "fare_rules" => "fare_id"}
56
60
  ]}],
57
61
  ["fare_rules", "routes", {dependencies: [
58
- {"fare_rules" => "route_id", "routes" => "route_id"}
62
+ {"fare_rules" => "route_id", "routes" => "route_id", :allow_null => true}
59
63
  ]}],
60
64
  ["routes", "trips", {dependencies: [
61
65
  {"routes" => "route_id", "trips" => "route_id"}
@@ -9,7 +9,7 @@ module GtfsDf
9
9
  "stop_name" => Polars::String,
10
10
  "tts_stop_name" => Polars::String,
11
11
  "stop_desc" => Polars::String,
12
- "stop_lat" => Polars::String,
12
+ "stop_lat" => Polars::Float64,
13
13
  "stop_lon" => Polars::Float64,
14
14
  "zone_id" => Polars::String,
15
15
  "stop_url" => Polars::String,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module GtfsDf
4
- VERSION = "0.3.0"
4
+ VERSION = "0.4.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gtfs_df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Mejorado