data_miner 0.3.13 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,19 +30,15 @@ module DataMiner
30
30
  options = args.last
31
31
 
32
32
  self.runnable_counter += 1
33
- runnables << DataMiner::Import.new(self, runnable_counter, description, options, &block)
34
- end
35
-
36
- def after_invoke
37
- if unique_indices.empty?
38
- raise(MissingHashColumn, "No unique_index defined for #{resource.name}, so you need a row_hash:string column.") unless resource.column_names.include?('row_hash')
39
- unique_indices.add 'row_hash'
40
- end
41
- runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
33
+ runnable = DataMiner::Import.new self, runnable_counter, description, options
34
+ Blockenspiel.invoke block, runnable
35
+ runnables << runnable
42
36
  end
43
37
 
44
38
  # Mine data for this class.
45
39
  def run(options = {})
40
+ options.symbolize_keys!
41
+
46
42
  finished = false
47
43
  run = DataMiner::Run.create! :started_at => Time.now, :resource_name => resource.name
48
44
  resource.delete_all if options[:from_scratch]
@@ -63,6 +59,8 @@ module DataMiner
63
59
  # Options
64
60
  # * <tt>:resource_names</tt>: array of resource (class) names to mine
65
61
  def run(options = {})
62
+ options.symbolize_keys!
63
+
66
64
  resource_names.each do |resource_name|
67
65
  if options[:resource_names].blank? or options[:resource_names].include?(resource_name)
68
66
  resource_name.constantize.data_miner_config.run options
@@ -15,7 +15,7 @@ module DataMiner
15
15
 
16
16
  def find(key_name, key, value_name, options = {})
17
17
  if match = table.rows.detect { |row| normalize_for_comparison(key, options) == normalize_for_comparison(row[key_name], options) }
18
- match[value_name].to_s.split(/\s*;\s/)
18
+ match[value_name].to_s
19
19
  end
20
20
  end
21
21
 
@@ -1,16 +1,20 @@
1
1
  module DataMiner
2
2
  class Import
3
+ include Blockenspiel::DSL
4
+
5
+ attr_reader :attributes
3
6
  attr_accessor :configuration, :position_in_run, :options, :table, :errata
4
7
  attr_accessor :description
5
8
  delegate :resource, :to => :configuration
6
- delegate :unique_indices, :to => :configuration
7
9
 
8
- def initialize(configuration, position_in_run, description, options = {}, &block)
10
+ def initialize(configuration, position_in_run, description, options = {})
11
+ options.symbolize_keys!
12
+ @options = options
13
+
14
+ @attributes = ActiveSupport::OrderedHash.new
9
15
  @configuration = configuration
10
16
  @position_in_run = position_in_run
11
17
  @description = description
12
- @options = options
13
- yield self if block_given? # pull in attributes
14
18
  @errata = Errata.new(:url => options[:errata], :klass => resource) if options[:errata]
15
19
  @table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
16
20
  end
@@ -19,17 +23,18 @@ module DataMiner
19
23
  "Import(#{resource}) position #{position_in_run} (#{description})"
20
24
  end
21
25
 
22
- def attributes
23
- configuration.attributes.reject { |k, v| !v.stored_by? self }
24
- end
25
-
26
26
  def stores?(attr_name)
27
- configuration.attributes[attr_name].andand.stored_by? self
27
+ attributes.has_key? attr_name
28
28
  end
29
-
29
+
30
30
  def store(attr_name, attr_options = {})
31
- configuration.attributes[attr_name] ||= Attribute.new(resource, attr_name)
32
- configuration.attributes[attr_name].options_for_import[self] = attr_options
31
+ raise "[data_miner gem] Column #{attr_name} doesn't exist on table #{resource.table_name}" unless resource.column_names.include?(attr_name)
32
+ attributes[attr_name] = Attribute.new self, attr_name, attr_options
33
+ end
34
+
35
+ def key(attr_name, attr_options = {})
36
+ @key = attr_name
37
+ store attr_name, attr_options
33
38
  end
34
39
 
35
40
  def run(run)
@@ -38,25 +43,15 @@ module DataMiner
38
43
  next if errata.rejects?(row)
39
44
  errata.correct!(row)
40
45
  end
41
-
42
- unifying_values = unique_indices.map do |attr_name|
43
- [ attributes[attr_name].value_from_row(self, row) ]
44
- end
45
46
 
46
- record_set = WilliamJamesCartesianProduct.cart_prod(*unifying_values).map do |combination|
47
- next if combination.include?(nil)
48
- resource.send "find_or_initialize_by_#{unique_indices.to_a.join('_and_')}", *combination
49
- end.flatten
50
-
51
- Array.wrap(record_set).each do |record|
52
- changes = attributes.values.map { |attr| attr.set_record_from_row self, record, row }
53
- record.data_miner_touch_count ||= 0
54
- if changes.any?
55
- record.data_miner_touch_count += 1
56
- record.data_miner_last_run = run
57
- end
58
- record.save!
47
+ record = resource.send "find_or_initialize_by_#{@key}", attributes[@key].value_from_row(row)
48
+ changes = attributes.map { |_, attr| attr.set_record_from_row record, row }
49
+ record.data_miner_touch_count ||= 0
50
+ if changes.any?
51
+ record.data_miner_touch_count += 1
52
+ record.data_miner_last_run = run
59
53
  end
54
+ record.save!
60
55
  end
61
56
  DataMiner.logger.info "performed #{inspect}"
62
57
  end
@@ -1,284 +1,9 @@
1
1
  require 'test_helper'
2
2
 
3
- module FuelEconomyGuide
4
- TRANSMISSIONS = {
5
- 'A' => 'automatic',
6
- 'M' => 'manual',
7
- 'L' => 'automatic', # Lockup/automatic
8
- 'S' => 'semiautomatic', # Semiautomatic
9
- 'C' => 'manual' # TODO verify for VW Syncro
10
- }
11
-
12
- ENGINE_TYPES = {
13
- '(GUZZLER)' => nil, # "gas guzzler"
14
- '(POLICE)' => nil, # police automobile_variant
15
- '(MPFI)' => 'injection',
16
- '(MPI*)' => 'injection',
17
- '(SPFI)' => 'injection',
18
- '(FFS)' => 'injection',
19
- '(TURBO)' => 'turbo',
20
- '(TRBO)' => 'turbo',
21
- '(TC*)' => 'turbo',
22
- '(FFS,TRBO)' => %w(injection turbo),
23
- '(S-CHARGE)' => 'supercharger',
24
- '(SC*)' => 'supercharger',
25
- '(DIESEL)' => nil, # diesel
26
- '(DSL)' => nil, # diesel
27
- '(ROTARY)' => nil, # rotary
28
- '(VARIABLE)' => nil, # variable displacement
29
- '(NO-CAT)' => nil, # no catalytic converter
30
- '(OHC)' => nil, # overhead camshaft
31
- '(OHV)' => nil, # overhead valves
32
- '(16-VALVE)' => nil, # 16V
33
- '(305)' => nil, # 305 cubic inch displacement
34
- '(307)' => nil, # 307 cubic inch displacement
35
- '(M-ENG)' => nil,
36
- '(W-ENG)' => nil,
37
- '(GM-BUICK)' => nil,
38
- '(GM-CHEV)' => nil,
39
- '(GM-OLDS)' => nil,
40
- '(GM-PONT)' => nil,
41
- }
42
-
43
- class ParserB
44
- attr_accessor :year
45
- def initialize(options = {})
46
- @year = options[:year]
47
- end
48
-
49
- def apply(row)
50
- row.merge!({
51
- 'make' => row['carline_mfr_name'], # make it line up with the errata
52
- 'model' => row['carline_name'], # ditto
53
- 'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
54
- 'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
55
- 'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
56
- 'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
57
- 'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
58
- 'displacement' => _displacement(row['opt_disp']),
59
- 'year' => year
60
- })
61
- row
62
- end
63
-
64
- def _displacement(str)
65
- str = str.gsub(/[\(\)]/, '').strip
66
- if str =~ /^(.+)L$/
67
- $1.to_f
68
- elsif str =~ /^(.+)CC$/
69
- $1.to_f / 1000
70
- end
71
- end
72
-
73
- def add_hints!(bus)
74
- bus[:format] = :fixed_width
75
- bus[:cut] = '13-' if year == 1995
76
- bus[:schema_name] = :fuel_economy_guide_b
77
- bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
78
- Slither.define :fuel_economy_guide_b do |d|
79
- d.rows do |row|
80
- row.trap { true } # there's only one section
81
- row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
82
- row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
83
- row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
84
- row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
85
- row.column 'carline_name' , 28, :type => :string # CARLINE NAME
86
- row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
87
- row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
88
- row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
89
- row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
90
- row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
91
- row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
92
- row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
93
- row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
94
- row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
95
- row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
96
- row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
97
- row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
98
- row.spacer 2
99
- row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
100
- row.spacer 2
101
- row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
102
- row.spacer 2
103
- row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
104
- row.spacer 2
105
- row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
106
- row.spacer 2
107
- row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
108
- row.spacer 2
109
- row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
110
- row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
111
- row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
112
- row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
113
- row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
114
- row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
115
- row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
116
- row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
117
- row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
118
- row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
119
- row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
120
- row.column 'filler' , 1, :type => :string # NOT USED
121
- row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
122
- row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
123
- end
124
- end
125
- end
126
- end
127
- class ParserC
128
- attr_accessor :year
129
- def initialize(options = {})
130
- @year = options[:year]
131
- end
132
-
133
- def add_hints!(bus)
134
- # File will decide format based on filename
135
- end
136
-
137
- def apply(row)
138
- row.merge!({
139
- 'make' => row['Manufacturer'], # make it line up with the errata
140
- 'model' => row['carline name'], # ditto
141
- 'drive' => row['drv'] + 'WD',
142
- 'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
143
- 'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
144
- 'turbo' => row['T'] == 'T',
145
- 'supercharger' => row['S'] == 'S',
146
- 'injection' => true,
147
- 'year' => year
148
- })
149
- row
150
- end
151
- end
152
- class ParserD
153
- attr_accessor :year
154
- def initialize(options = {})
155
- @year = options[:year]
156
- end
157
-
158
- def add_hints!(bus)
159
- bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
160
- end
161
-
162
- def apply(row)
163
- row.merge!({
164
- 'make' => row['MFR'], # make it line up with the errata
165
- 'model' => row['CAR LINE'], # ditto
166
- 'drive' => row['DRIVE SYS'] + 'WD',
167
- 'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
168
- 'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
169
- 'turbo' => row['TURBO'] == 'T',
170
- 'supercharger' => row['SPCHGR'] == 'S',
171
- 'injection' => true,
172
- 'year' => year
173
- })
174
- row
175
- end
176
- end
177
- end
178
-
179
- class AutomobileMakeYear < ActiveRecord::Base
180
- set_primary_key :row_hash
181
-
182
- belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
183
- belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
184
- has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
185
-
186
- data_miner do
187
- process :derive_from_make_fleet_years
188
- process :derive_association_to_make_fleet_years
189
- process :derive_fuel_efficiency
190
- process :derive_volume
191
- end
192
-
193
- # validates_numericality_of :fuel_efficiency, :greater_than => 0, :allow_nil => true
194
-
195
- class << self
196
- def derive_from_make_fleet_years
197
- AutomobileMakeFleetYear.find_in_batches do |batch|
198
- batch.each do |record|
199
- #puts " * Considering AMFY #{record.inspect}"
200
- if record.make and record.model_year
201
- find_or_create_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
202
- end
203
- end
204
- end
205
- end
206
-
207
- def derive_association_to_make_fleet_years
208
- AutomobileMakeFleetYear.find_in_batches do |batch|
209
- batch.each do |record|
210
- if record.make and record.model_year
211
- record.make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
212
- record.save! if record.changed?
213
- end
214
- end
215
- end
216
- end
217
-
218
- def derive_fuel_efficiency
219
- AutomobileMakeFleetYear.find_in_batches do |batch|
220
- batch.each do |record|
221
- if record.make and record.model_year
222
- make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
223
- # make_year.fuel_efficiency = make_year.fleet_years.weighted_average :fuel_efficiency, :by => :volume
224
- make_year.save!
225
- end
226
- end
227
- end
228
- end
229
-
230
- def derive_volume
231
- find_in_batches do |batch|
232
- batch.each do |record|
233
- record.volume = record.fleet_years.collect(&:volume).sum
234
- record.save!
235
- end
236
- end
237
- end
238
- end
239
- end
240
-
241
- class AutomobileMakeFleetYear < ActiveRecord::Base
242
- set_primary_key :row_hash
243
- belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
244
- belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
245
- belongs_to :make_year, :class_name => 'AutomobileMakeYear', :foreign_key => 'automobile_make_year_id'
246
-
247
- data_miner do
248
- # CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
249
- import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
250
- :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
251
- :select => lambda { |row| row['volume'].to_i > 0 } do |attr|
252
- attr.store 'make_name', :field_name => 'manufacturer_name' # prefix
253
- attr.store 'year', :field_name => 'year_content'
254
- attr.store 'fleet', :chars => 2..3
255
- attr.store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
256
- attr.store 'volume'
257
- end
258
- end
259
- end
260
-
261
- class AutomobileModelYear < ActiveRecord::Base
262
- set_primary_key :year
263
-
264
- has_many :make_years, :class_name => 'AutomobileMakeYear'
265
- has_many :variants, :class_name => 'AutomobileVariant'
266
-
267
- data_miner do
268
- unique_index 'year'
269
-
270
- # await :other_class => AutomobileMakeYear do |deferred|
271
- # # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => :volume
272
- # end
273
- end
274
- end
275
-
276
3
  class AutomobileFuelType < ActiveRecord::Base
277
4
  set_primary_key :code
278
5
 
279
6
  data_miner do
280
- unique_index 'code'
281
-
282
7
  import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
283
8
  :filename => 'Gd6-dsc.txt',
284
9
  :format => :fixed_width,
@@ -287,21 +12,24 @@ class AutomobileFuelType < ActiveRecord::Base
287
12
  :select => lambda { |row| /\A[A-Z]/.match row[:code] },
288
13
  :schema => [[ 'code', 2, { :type => :string } ],
289
14
  [ 'spacer', 2 ],
290
- [ 'name', 52, { :type => :string } ]]) do |attr|
291
- attr.store 'name'
15
+ [ 'name', 52, { :type => :string } ]]) do
16
+ key 'code'
17
+ store 'name'
292
18
  end
293
19
 
294
- import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do |attr|
295
- attr.store 'name'
296
- attr.store 'annual_distance'
297
- attr.store 'emission_factor'
20
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do
21
+ key 'code'
22
+ store 'name'
23
+ store 'annual_distance'
24
+ store 'emission_factor'
298
25
  end
299
26
 
300
27
  # pull electricity emission factor from residential electricity
301
28
  import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
302
- :select => lambda { |row| row['code'] == 'El' }) do |attr|
303
- attr.store 'name'
304
- attr.store 'emission_factor'
29
+ :select => lambda { |row| row['code'] == 'El' }) do
30
+ key 'code'
31
+ store 'name'
32
+ store 'emission_factor'
305
33
  end
306
34
 
307
35
  # still need distance estimate for electric cars
@@ -313,71 +41,212 @@ class AutomobileFuelType < ActiveRecord::Base
313
41
  }
314
42
  end
315
43
 
316
- class AutomobileModel < ActiveRecord::Base
44
+ class AutomobileVariant < ActiveRecord::Base
317
45
  set_primary_key :row_hash
318
-
319
- has_many :variants, :class_name => 'AutomobileVariant'
320
- belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
321
-
322
- data_miner do
323
- # derived from FEG automobile variants
324
- end
325
- end
326
46
 
327
- class AutomobileMake < ActiveRecord::Base
328
- set_primary_key :name
47
+ module FuelEconomyGuide
48
+ TRANSMISSIONS = {
49
+ 'A' => 'automatic',
50
+ 'M' => 'manual',
51
+ 'L' => 'automatic', # Lockup/automatic
52
+ 'S' => 'semiautomatic', # Semiautomatic
53
+ 'C' => 'manual' # TODO verify for VW Syncro
54
+ }
55
+
56
+ ENGINE_TYPES = {
57
+ '(GUZZLER)' => nil, # "gas guzzler"
58
+ '(POLICE)' => nil, # police automobile_variant
59
+ '(MPFI)' => 'injection',
60
+ '(MPI*)' => 'injection',
61
+ '(SPFI)' => 'injection',
62
+ '(FFS)' => 'injection',
63
+ '(TURBO)' => 'turbo',
64
+ '(TRBO)' => 'turbo',
65
+ '(TC*)' => 'turbo',
66
+ '(FFS,TRBO)' => %w(injection turbo),
67
+ '(S-CHARGE)' => 'supercharger',
68
+ '(SC*)' => 'supercharger',
69
+ '(DIESEL)' => nil, # diesel
70
+ '(DSL)' => nil, # diesel
71
+ '(ROTARY)' => nil, # rotary
72
+ '(VARIABLE)' => nil, # variable displacement
73
+ '(NO-CAT)' => nil, # no catalytic converter
74
+ '(OHC)' => nil, # overhead camshaft
75
+ '(OHV)' => nil, # overhead valves
76
+ '(16-VALVE)' => nil, # 16V
77
+ '(305)' => nil, # 305 cubic inch displacement
78
+ '(307)' => nil, # 307 cubic inch displacement
79
+ '(M-ENG)' => nil,
80
+ '(W-ENG)' => nil,
81
+ '(GM-BUICK)' => nil,
82
+ '(GM-CHEV)' => nil,
83
+ '(GM-OLDS)' => nil,
84
+ '(GM-PONT)' => nil,
85
+ }
86
+
87
+ class ParserB
88
+ attr_accessor :year
89
+ def initialize(options = {})
90
+ @year = options[:year]
91
+ end
329
92
 
330
- has_many :make_years, :class_name => 'AutomobileMakeYear'
331
- has_many :models, :class_name => 'AutomobileModel'
332
- has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
333
- has_many :variants, :class_name => 'AutomobileVariant'
93
+ def apply(row)
94
+ row.merge!({
95
+ 'make' => row['carline_mfr_name'], # make it line up with the errata
96
+ 'model' => row['carline_name'], # ditto
97
+ 'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
98
+ 'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
99
+ 'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
100
+ 'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
101
+ 'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
102
+ 'displacement' => _displacement(row['opt_disp']),
103
+ 'year' => year
104
+ })
105
+ row
106
+ end
334
107
 
335
- data_miner do
336
- unique_index 'name'
337
-
338
- import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/makes/make_importance.csv' do |attr|
339
- attr.store 'major'
108
+ def _displacement(str)
109
+ str = str.gsub(/[\(\)]/, '').strip
110
+ if str =~ /^(.+)L$/
111
+ $1.to_f
112
+ elsif str =~ /^(.+)CC$/
113
+ $1.to_f / 1000
114
+ end
115
+ end
116
+
117
+ def add_hints!(bus)
118
+ bus[:format] = :fixed_width
119
+ bus[:cut] = '13-' if year == 1995
120
+ bus[:schema_name] = :fuel_economy_guide_b
121
+ bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
122
+ Slither.define :fuel_economy_guide_b do |d|
123
+ d.rows do |row|
124
+ row.trap { true } # there's only one section
125
+ row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
126
+ row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
127
+ row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
128
+ row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
129
+ row.column 'carline_name' , 28, :type => :string # CARLINE NAME
130
+ row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
131
+ row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
132
+ row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
133
+ row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
134
+ row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
135
+ row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
136
+ row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
137
+ row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
138
+ row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
139
+ row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
140
+ row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
141
+ row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
142
+ row.spacer 2
143
+ row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
144
+ row.spacer 2
145
+ row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
146
+ row.spacer 2
147
+ row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
148
+ row.spacer 2
149
+ row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
150
+ row.spacer 2
151
+ row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
152
+ row.spacer 2
153
+ row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
154
+ row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
155
+ row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
156
+ row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
157
+ row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
158
+ row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
159
+ row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
160
+ row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
161
+ row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
162
+ row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
163
+ row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
164
+ row.column 'filler' , 1, :type => :string # NOT USED
165
+ row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
166
+ row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
167
+ end
168
+ end
169
+ end
340
170
  end
341
- # await :other_class => AutomobileMakeYear do |deferred|
342
- # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => 'volume'
343
- # end
344
- end
345
- end
171
+ class ParserC
172
+ attr_accessor :year
173
+ def initialize(options = {})
174
+ @year = options[:year]
175
+ end
346
176
 
347
- class AutomobileVariant < ActiveRecord::Base
348
- set_primary_key :row_hash
349
-
350
- belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
351
- belongs_to :model, :class_name => 'AutomobileModel', :foreign_key => 'automobile_model_id'
352
- belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
353
- belongs_to :fuel_type, :class_name => 'AutomobileFuelType', :foreign_key => 'automobile_fuel_type_id'
177
+ def add_hints!(bus)
178
+ # File will decide format based on filename
179
+ end
180
+
181
+ def apply(row)
182
+ row.merge!({
183
+ 'make' => row['Manufacturer'], # make it line up with the errata
184
+ 'model' => row['carline name'], # ditto
185
+ 'drive' => row['drv'] + 'WD',
186
+ 'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
187
+ 'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
188
+ 'turbo' => row['T'] == 'T',
189
+ 'supercharger' => row['S'] == 'S',
190
+ 'injection' => true,
191
+ 'year' => year
192
+ })
193
+ row
194
+ end
195
+ end
196
+ class ParserD
197
+ attr_accessor :year
198
+ def initialize(options = {})
199
+ @year = options[:year]
200
+ end
354
201
 
202
+ def add_hints!(bus)
203
+ bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
204
+ end
205
+
206
+ def apply(row)
207
+ row.merge!({
208
+ 'make' => row['MFR'], # make it line up with the errata
209
+ 'model' => row['CAR LINE'], # ditto
210
+ 'drive' => row['DRIVE SYS'] + 'WD',
211
+ 'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
212
+ 'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
213
+ 'turbo' => row['TURBO'] == 'T',
214
+ 'supercharger' => row['SPCHGR'] == 'S',
215
+ 'injection' => true,
216
+ 'year' => year
217
+ })
218
+ row
219
+ end
220
+ end
221
+ end
222
+
355
223
  data_miner do
356
224
  # 1985---1997
357
225
  (85..97).each do |yy|
358
226
  filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
359
227
  import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
360
- :filename => filename,
361
- :transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
362
- :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
363
- attr.store 'make_name', :field_name => 'make'
364
- attr.store 'model_name', :field_name => 'model'
365
- attr.store 'year'
366
- attr.store 'fuel_type_code', :field_name => 'fuel_type'
367
- attr.store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
368
- attr.store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
369
- attr.store 'cylinders', :field_name => 'no_cyc'
370
- attr.store 'drive', :field_name => 'drive_system'
371
- attr.store 'carline_mfr_code'
372
- attr.store 'vi_mfr_code'
373
- attr.store 'carline_code'
374
- attr.store 'carline_class_code', :field_name => 'carline_clss'
375
- attr.store 'transmission'
376
- attr.store 'speeds'
377
- attr.store 'turbo'
378
- attr.store 'supercharger'
379
- attr.store 'injection'
380
- attr.store 'displacement'
228
+ :filename => filename,
229
+ :transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
230
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
231
+ key 'row_hash'
232
+ store 'make_name', :field_name => 'make'
233
+ store 'model_name', :field_name => 'model'
234
+ store 'year'
235
+ store 'fuel_type_code', :field_name => 'fuel_type'
236
+ store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
237
+ store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
238
+ store 'cylinders', :field_name => 'no_cyc'
239
+ store 'drive', :field_name => 'drive_system'
240
+ store 'carline_mfr_code'
241
+ store 'vi_mfr_code'
242
+ store 'carline_code'
243
+ store 'carline_class_code', :field_name => 'carline_clss'
244
+ store 'transmission'
245
+ store 'speeds'
246
+ store 'turbo'
247
+ store 'supercharger'
248
+ store 'injection'
249
+ store 'displacement'
381
250
  end
382
251
  end
383
252
 
@@ -393,23 +262,24 @@ class AutomobileVariant < ActiveRecord::Base
393
262
  2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
394
263
  }.sort { |a, b| a.first <=> b.first }.each do |year, options|
395
264
  import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
396
- :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
397
- attr.store 'make_name', :field_name => 'make'
398
- attr.store 'model_name', :field_name => 'model'
399
- attr.store 'fuel_type_code', :field_name => 'fl'
400
- attr.store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
401
- attr.store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
402
- attr.store 'cylinders', :field_name => 'cyl'
403
- attr.store 'displacement', :field_name => 'displ'
404
- attr.store 'carline_class_code', :field_name => 'cls' if year >= 2000
405
- attr.store 'carline_class_name', :field_name => 'Class'
406
- attr.store 'year'
407
- attr.store 'transmission'
408
- attr.store 'speeds'
409
- attr.store 'turbo'
410
- attr.store 'supercharger'
411
- attr.store 'injection'
412
- attr.store 'drive'
265
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
266
+ key 'row_hash'
267
+ store 'make_name', :field_name => 'make'
268
+ store 'model_name', :field_name => 'model'
269
+ store 'fuel_type_code', :field_name => 'fl'
270
+ store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
271
+ store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
272
+ store 'cylinders', :field_name => 'cyl'
273
+ store 'displacement', :field_name => 'displ'
274
+ store 'carline_class_code', :field_name => 'cls' if year >= 2000
275
+ store 'carline_class_name', :field_name => 'Class'
276
+ store 'year'
277
+ store 'transmission'
278
+ store 'speeds'
279
+ store 'turbo'
280
+ store 'supercharger'
281
+ store 'injection'
282
+ store 'drive'
413
283
  end
414
284
  end
415
285
 
@@ -422,29 +292,29 @@ class AutomobileVariant < ActiveRecord::Base
422
292
  # 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
423
293
  }.sort { |a, b| a.first <=> b.first }.each do |year, options|
424
294
  import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
425
- :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
426
- attr.store 'make_name', :field_name => 'make'
427
- attr.store 'model_name', :field_name => 'model'
428
- attr.store 'fuel_type_code', :field_name => 'FUEL TYPE'
429
- attr.store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
430
- attr.store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
431
- attr.store 'cylinders', :field_name => 'NUMB CYL'
432
- attr.store 'displacement', :field_name => 'DISPLACEMENT'
433
- attr.store 'carline_class_code', :field_name => 'CLS'
434
- attr.store 'carline_class_name', :field_name => 'CLASS'
435
- attr.store 'year'
436
- attr.store 'transmission'
437
- attr.store 'speeds'
438
- attr.store 'turbo'
439
- attr.store 'supercharger'
440
- attr.store 'injection'
441
- attr.store 'drive'
295
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
296
+ key 'row_hash'
297
+ store 'make_name', :field_name => 'make'
298
+ store 'model_name', :field_name => 'model'
299
+ store 'fuel_type_code', :field_name => 'FUEL TYPE'
300
+ store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
301
+ store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
302
+ store 'cylinders', :field_name => 'NUMB CYL'
303
+ store 'displacement', :field_name => 'DISPLACEMENT'
304
+ store 'carline_class_code', :field_name => 'CLS'
305
+ store 'carline_class_name', :field_name => 'CLASS'
306
+ store 'year'
307
+ store 'transmission'
308
+ store 'speeds'
309
+ store 'turbo'
310
+ store 'supercharger'
311
+ store 'injection'
312
+ store 'drive'
442
313
  end
443
314
  end
444
315
 
445
316
  # associate :make, :key => :original_automobile_make_name, :foreign_key => :name
446
317
  # derive :automobile_model_id # creates models by name
447
- # associate :model_year, :key => :original_automobile_model_year_year, :foreign_key => :year
448
318
  # associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
449
319
 
450
320
  process 'Set adjusted fuel economy' do
@@ -535,34 +405,32 @@ class Country < ActiveRecord::Base
535
405
  set_primary_key :iso_3166
536
406
 
537
407
  data_miner do
538
- unique_index 'iso_3166'
539
-
540
- import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
541
- attr.store 'iso_3166', :field_number => 1
542
- attr.store 'name', :field_number => 0
408
+ import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
409
+ key 'iso_3166'
410
+ store 'iso_3166', :field_number => 1
411
+ store 'name', :field_number => 0
543
412
  end
544
413
 
545
- import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
546
- attr.store 'iso_3166', :field_name => 'country code'
547
- attr.store 'name', :field_name => 'country'
414
+ import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
415
+ key 'iso_3166'
416
+ store 'iso_3166', :field_name => 'country code'
417
+ store 'name', :field_name => 'country'
548
418
  end
549
419
  end
550
420
  end
551
421
 
552
422
  class Airport < ActiveRecord::Base
553
423
  set_primary_key :iata_code
554
- # belongs_to :country
555
424
 
556
425
  data_miner do
557
- unique_index 'iata_code'
558
-
559
- import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do |attr|
560
- attr.store 'name', :field_number => 1
561
- attr.store 'city', :field_number => 2
562
- attr.store 'country_name', :field_number => 3
563
- attr.store 'iata_code', :field_number => 4
564
- attr.store 'latitude', :field_number => 6
565
- attr.store 'longitude', :field_number => 7
426
+ import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
427
+ key 'iata_code'
428
+ store 'name', :field_number => 1
429
+ store 'city', :field_number => 2
430
+ store 'country_name', :field_number => 3
431
+ store 'iata_code', :field_number => 4
432
+ store 'latitude', :field_number => 6
433
+ store 'longitude', :field_number => 7
566
434
  end
567
435
  end
568
436
  end
@@ -571,18 +439,18 @@ class CensusRegion < ActiveRecord::Base
571
439
  set_primary_key :number
572
440
 
573
441
  data_miner do
574
- unique_index 'number'
575
-
576
- import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do |attr|
577
- attr.store 'name', :field_name => 'Name'
578
- attr.store 'number', :field_name => 'Region'
442
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
443
+ key 'number'
444
+ store 'name', :field_name => 'Name'
445
+ store 'number', :field_name => 'Region'
579
446
  end
580
447
 
581
448
  # pretend this is a different data source
582
449
  # fake! just for testing purposes
583
- import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do |attr|
584
- attr.store 'name', :field_name => 'Name'
585
- attr.store 'number', :field_name => 'Region'
450
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
451
+ key 'number'
452
+ store 'name', :field_name => 'Name'
453
+ store 'number', :field_name => 'Region'
586
454
  end
587
455
  end
588
456
  end
@@ -590,20 +458,14 @@ end
590
458
  # smaller than a region
591
459
  class CensusDivision < ActiveRecord::Base
592
460
  set_primary_key :number
593
- # belongs_to :census_region
594
- # has_many :states
595
- # has_many :zip_codes, :through => :states
596
- # has_many :climate_divisions, :through => :states
597
- # has_many :residence_survey_responses
598
461
 
599
462
  data_miner do
600
- unique_index 'number'
601
-
602
- import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do |attr|
603
- attr.store 'name', :field_name => 'Name'
604
- attr.store 'number', :field_name => 'Division'
605
- attr.store 'census_region_number', :field_name => 'Region'
606
- attr.store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
463
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
464
+ key 'number'
465
+ store 'name', :field_name => 'Name'
466
+ store 'number', :field_name => 'Division'
467
+ store 'census_region_number', :field_name => 'Region'
468
+ store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
607
469
  end
608
470
  end
609
471
  end
@@ -612,8 +474,6 @@ class ResidentialEnergyConsumptionSurveyResponse < ActiveRecord::Base
612
474
  set_primary_key :department_of_energy_identifier
613
475
 
614
476
  data_miner do
615
- unique_index 'department_of_energy_identifier'
616
-
617
477
  process 'Define some unit conversions' do
618
478
  Conversions.register :kbtus, :joules, 1_000.0 * 1_055.05585
619
479
  Conversions.register :square_feet, :square_metres, 0.09290304
@@ -621,70 +481,71 @@ class ResidentialEnergyConsumptionSurveyResponse < ActiveRecord::Base
621
481
 
622
482
  # conversions are NOT performed here, since we first have to zero out legitimate skips
623
483
  # otherwise you will get values like "999 pounds = 453.138778 kilograms" (where 999 is really a legit skip)
624
- import 'RECs 2005 (but not converting units to metric just yet)', :url => 'http://www.eia.doe.gov/emeu/recs/recspubuse05/datafiles/RECS05alldata.csv', :headers => :upcase do |attr|
625
- attr.store 'department_of_energy_identifier', :field_name => 'DOEID'
484
+ import 'RECs 2005 (but not converting units to metric just yet)', :url => 'http://www.eia.doe.gov/emeu/recs/recspubuse05/datafiles/RECS05alldata.csv', :headers => :upcase do
485
+ key 'department_of_energy_identifier'
486
+ store 'department_of_energy_identifier', :field_name => 'DOEID'
626
487
 
627
- attr.store 'residence_class', :field_name => 'TYPEHUQ', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/typehuq/typehuq.csv' }
628
- attr.store 'construction_year', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Date in the middle (synthetic)', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
629
- attr.store 'construction_period', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
630
- attr.store 'urbanity', :field_name => 'URBRUR', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/urbrur/urbrur.csv' }
631
- attr.store 'dishwasher_use', :field_name => 'DWASHUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dwashuse/dwashuse.csv' }
632
- attr.store 'central_ac_use', :field_name => 'USECENAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usecenac/usecenac.csv' }
633
- attr.store 'window_ac_use', :field_name => 'USEWWAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usewwac/usewwac.csv' }
634
- attr.store 'clothes_washer_use', :field_name => 'WASHLOAD', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/washload/washload.csv' }
635
- attr.store 'clothes_dryer_use', :field_name => 'DRYRUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dryruse/dryruse.csv' }
488
+ store 'residence_class', :field_name => 'TYPEHUQ', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/typehuq/typehuq.csv' }
489
+ store 'construction_year', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Date in the middle (synthetic)', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
490
+ store 'construction_period', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
491
+ store 'urbanity', :field_name => 'URBRUR', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/urbrur/urbrur.csv' }
492
+ store 'dishwasher_use', :field_name => 'DWASHUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dwashuse/dwashuse.csv' }
493
+ store 'central_ac_use', :field_name => 'USECENAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usecenac/usecenac.csv' }
494
+ store 'window_ac_use', :field_name => 'USEWWAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usewwac/usewwac.csv' }
495
+ store 'clothes_washer_use', :field_name => 'WASHLOAD', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/washload/washload.csv' }
496
+ store 'clothes_dryer_use', :field_name => 'DRYRUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dryruse/dryruse.csv' }
636
497
 
637
- attr.store 'census_division_number', :field_name => 'DIVISION'
638
- attr.store 'census_division_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
639
- attr.store 'census_region_number', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_number', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
640
- attr.store 'census_region_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
498
+ store 'census_division_number', :field_name => 'DIVISION'
499
+ store 'census_division_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
500
+ store 'census_region_number', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_number', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
501
+ store 'census_region_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
641
502
 
642
- attr.store 'floorspace', :field_name => 'TOTSQFT'
643
- attr.store 'residents', :field_name => 'NHSLDMEM'
644
- attr.store 'ownership', :field_name => 'KOWNRENT'
645
- attr.store 'thermostat_programmability', :field_name => 'PROTHERM'
646
- attr.store 'refrigerator_count', :field_name => 'NUMFRIG'
647
- attr.store 'freezer_count', :field_name => 'NUMFREEZ'
648
- attr.store 'heating_degree_days', :field_name => 'HD65'
649
- attr.store 'cooling_degree_days', :field_name => 'CD65'
650
- attr.store 'annual_energy_from_fuel_oil_for_heating_space', :field_name => 'BTUFOSPH'
651
- attr.store 'annual_energy_from_fuel_oil_for_heating_water', :field_name => 'BTUFOWTH'
652
- attr.store 'annual_energy_from_fuel_oil_for_appliances', :field_name => 'BTUFOAPL'
653
- attr.store 'annual_energy_from_natural_gas_for_heating_space', :field_name => 'BTUNGSPH'
654
- attr.store 'annual_energy_from_natural_gas_for_heating_water', :field_name => 'BTUNGWTH'
655
- attr.store 'annual_energy_from_natural_gas_for_appliances', :field_name => 'BTUNGAPL'
656
- attr.store 'annual_energy_from_propane_for_heating_space', :field_name => 'BTULPSPH'
657
- attr.store 'annual_energy_from_propane_for_heating_water', :field_name => 'BTULPWTH'
658
- attr.store 'annual_energy_from_propane_for_appliances', :field_name => 'BTULPAPL'
659
- attr.store 'annual_energy_from_wood', :field_name => 'BTUWOOD'
660
- attr.store 'annual_energy_from_kerosene', :field_name => 'BTUKER'
661
- attr.store 'annual_energy_from_electricity_for_clothes_driers', :field_name => 'BTUELCDR'
662
- attr.store 'annual_energy_from_electricity_for_dishwashers', :field_name => 'BTUELDWH'
663
- attr.store 'annual_energy_from_electricity_for_freezers', :field_name => 'BTUELFZZ'
664
- attr.store 'annual_energy_from_electricity_for_refrigerators', :field_name => 'BTUELRFG'
665
- attr.store 'annual_energy_from_electricity_for_air_conditioners', :field_name => 'BTUELCOL'
666
- attr.store 'annual_energy_from_electricity_for_heating_space', :field_name => 'BTUELSPH'
667
- attr.store 'annual_energy_from_electricity_for_heating_water', :field_name => 'BTUELWTH'
668
- attr.store 'annual_energy_from_electricity_for_other_appliances', :field_name => 'BTUELAPL'
669
- attr.store 'weighting', :field_name => 'NWEIGHT'
670
- attr.store 'total_rooms', :field_name => 'TOTROOMS'
671
- attr.store 'bathrooms', :field_name => 'NCOMBATH'
672
- attr.store 'halfbaths', :field_name => 'NHAFBATH'
673
- attr.store 'heated_garage', :field_name => 'GARGHEAT'
674
- attr.store 'attached_1car_garage', :field_name => 'GARAGE1C'
675
- attr.store 'detached_1car_garage', :field_name => 'DGARG1C'
676
- attr.store 'attached_2car_garage', :field_name => 'GARAGE2C'
677
- attr.store 'detached_2car_garage', :field_name => 'DGARG2C'
678
- attr.store 'attached_3car_garage', :field_name => 'GARAGE3C'
679
- attr.store 'detached_3car_garage', :field_name => 'DGARG3C'
680
- attr.store 'lights_on_1_to_4_hours', :field_name => 'LGT1'
681
- attr.store 'efficient_lights_on_1_to_4_hours', :field_name => 'LGT1EE'
682
- attr.store 'lights_on_4_to_12_hours', :field_name => 'LGT4'
683
- attr.store 'efficient_lights_on_4_to_12_hours', :field_name => 'LGT4EE'
684
- attr.store 'lights_on_over_12_hours', :field_name => 'LGT12'
685
- attr.store 'efficient_lights_on_over_12_hours', :field_name => 'LGT12EE'
686
- attr.store 'outdoor_all_night_lights', :field_name => 'NOUTLGTNT'
687
- attr.store 'outdoor_all_night_gas_lights', :field_name => 'NGASLIGHT'
503
+ store 'floorspace', :field_name => 'TOTSQFT'
504
+ store 'residents', :field_name => 'NHSLDMEM'
505
+ store 'ownership', :field_name => 'KOWNRENT'
506
+ store 'thermostat_programmability', :field_name => 'PROTHERM'
507
+ store 'refrigerator_count', :field_name => 'NUMFRIG'
508
+ store 'freezer_count', :field_name => 'NUMFREEZ'
509
+ store 'heating_degree_days', :field_name => 'HD65'
510
+ store 'cooling_degree_days', :field_name => 'CD65'
511
+ store 'annual_energy_from_fuel_oil_for_heating_space', :field_name => 'BTUFOSPH'
512
+ store 'annual_energy_from_fuel_oil_for_heating_water', :field_name => 'BTUFOWTH'
513
+ store 'annual_energy_from_fuel_oil_for_appliances', :field_name => 'BTUFOAPL'
514
+ store 'annual_energy_from_natural_gas_for_heating_space', :field_name => 'BTUNGSPH'
515
+ store 'annual_energy_from_natural_gas_for_heating_water', :field_name => 'BTUNGWTH'
516
+ store 'annual_energy_from_natural_gas_for_appliances', :field_name => 'BTUNGAPL'
517
+ store 'annual_energy_from_propane_for_heating_space', :field_name => 'BTULPSPH'
518
+ store 'annual_energy_from_propane_for_heating_water', :field_name => 'BTULPWTH'
519
+ store 'annual_energy_from_propane_for_appliances', :field_name => 'BTULPAPL'
520
+ store 'annual_energy_from_wood', :field_name => 'BTUWOOD'
521
+ store 'annual_energy_from_kerosene', :field_name => 'BTUKER'
522
+ store 'annual_energy_from_electricity_for_clothes_driers', :field_name => 'BTUELCDR'
523
+ store 'annual_energy_from_electricity_for_dishwashers', :field_name => 'BTUELDWH'
524
+ store 'annual_energy_from_electricity_for_freezers', :field_name => 'BTUELFZZ'
525
+ store 'annual_energy_from_electricity_for_refrigerators', :field_name => 'BTUELRFG'
526
+ store 'annual_energy_from_electricity_for_air_conditioners', :field_name => 'BTUELCOL'
527
+ store 'annual_energy_from_electricity_for_heating_space', :field_name => 'BTUELSPH'
528
+ store 'annual_energy_from_electricity_for_heating_water', :field_name => 'BTUELWTH'
529
+ store 'annual_energy_from_electricity_for_other_appliances', :field_name => 'BTUELAPL'
530
+ store 'weighting', :field_name => 'NWEIGHT'
531
+ store 'total_rooms', :field_name => 'TOTROOMS'
532
+ store 'bathrooms', :field_name => 'NCOMBATH'
533
+ store 'halfbaths', :field_name => 'NHAFBATH'
534
+ store 'heated_garage', :field_name => 'GARGHEAT'
535
+ store 'attached_1car_garage', :field_name => 'GARAGE1C'
536
+ store 'detached_1car_garage', :field_name => 'DGARG1C'
537
+ store 'attached_2car_garage', :field_name => 'GARAGE2C'
538
+ store 'detached_2car_garage', :field_name => 'DGARG2C'
539
+ store 'attached_3car_garage', :field_name => 'GARAGE3C'
540
+ store 'detached_3car_garage', :field_name => 'DGARG3C'
541
+ store 'lights_on_1_to_4_hours', :field_name => 'LGT1'
542
+ store 'efficient_lights_on_1_to_4_hours', :field_name => 'LGT1EE'
543
+ store 'lights_on_4_to_12_hours', :field_name => 'LGT4'
544
+ store 'efficient_lights_on_4_to_12_hours', :field_name => 'LGT4EE'
545
+ store 'lights_on_over_12_hours', :field_name => 'LGT12'
546
+ store 'efficient_lights_on_over_12_hours', :field_name => 'LGT12EE'
547
+ store 'outdoor_all_night_lights', :field_name => 'NOUTLGTNT'
548
+ store 'outdoor_all_night_gas_lights', :field_name => 'NGASLIGHT'
688
549
  end
689
550
 
690
551
  # Rather than nullify the continuous variables that EIA identifies as LEGITIMATE SKIPS, we convert them to zero
@@ -806,31 +667,7 @@ class DataMinerTest < Test::Unit::TestCase
806
667
  b = CensusRegion.count
807
668
  assert_equal a, b
808
669
  end
809
-
810
- should "assume that no unique indices means it wants a big hash" do
811
- assert_raises DataMiner::MissingHashColumn do
812
- class IncompleteCountry < ActiveRecord::Base
813
- set_table_name 'countries'
814
-
815
- data_miner do
816
- # no unique index
817
-
818
- # get a complete list
819
- import :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
820
- attr.store 'iso_3166', :field_number => 1
821
- attr.store 'name', :field_number => 0
822
- end
823
-
824
- # get nicer names
825
- import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
826
- attr.store 'iso_3166', :field_name => 'country code'
827
- attr.store 'name', :field_name => 'country'
828
- end
829
- end
830
- end
831
- end
832
- end
833
-
670
+
834
671
  should "hash things if no unique index is listed" do
835
672
  AutomobileVariant.data_miner_config.runnables[0].run(nil)
836
673
  assert AutomobileVariant.first.row_hash.present?
@@ -892,15 +729,14 @@ class DataMinerTest < Test::Unit::TestCase
892
729
  end
893
730
 
894
731
  if ENV['SLOW'] == 'true'
895
- should "import using a dictionary" do
896
- DataMiner.run :resource_names => %w{ ResidentialEnergyConsumptionSurveyResponse }
897
- assert ResidentialEnergyConsumptionSurveyResponse.find(6).residence_class.starts_with?('Single-family detached house')
732
+ should "mine automobile variants" do
733
+ AutomobileVariant.run_data_miner!
734
+ assert AutomobileVariant.count('make_name LIKE "%tesla"') > 0
898
735
  end
899
736
 
900
- should "mine multiple classes in the correct order" do
901
- DataMiner.run
902
- uy = Country.find_by_iso_3166('UY')
903
- assert_equal 'Uruguay', uy.name
737
+ should "mine residence survey day" do
738
+ ResidentialEnergyConsumptionSurveyResponse.run_data_miner!
739
+ assert ResidentialEnergyConsumptionSurveyResponse.find(6).residence_class.starts_with?('Single-family detached house')
904
740
  end
905
741
  end
906
742
  end