data_miner-ruby19 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+
2
+ module DataMiner
3
+ class Tap
4
+ attr_reader :base
5
+ attr_reader :position_in_run
6
+ attr_reader :description
7
+ attr_reader :source
8
+ attr_reader :options
9
+ delegate :resource, :to => :base
10
+
11
+ def initialize(base, position_in_run, description, source, options = {}) # builds a tap step; each argument is kept in the reader of the same name
12
+ options.symbolize_keys! # bang method: the hash handed in by the caller is modified in place (presumably ActiveSupport's Hash#symbolize_keys! - confirm)
13
+ DataMiner.log_or_raise "Tap has to be the first step." unless position_in_run == 0 # only position 0 is accepted; whether this logs or raises is decided by DataMiner.log_or_raise, not shown here
14
+ @base = base
15
+ @position_in_run = position_in_run
16
+ @description = description
17
+ @source = source
18
+ @options = options
19
+ end
20
+
21
+ def inspect
22
+ "Tap(#{resource}): #{description} (#{source})"
23
+ end
24
+
25
+ def run(run) # executes the tap; the `run` argument is not referenced anywhere in this body
26
+ [ source_table_name, resource.table_name ].each do |possible_obstacle| # both names are the same string when no :source_table_name option was given
27
+ if connection.table_exists?(possible_obstacle)
28
+ connection.drop_table possible_obstacle # existing tables are dropped before the pull
29
+ end
30
+ end
31
+ DataMiner.backtick_with_reporting taps_pull_cmd # hands the command string built by taps_pull_cmd to a helper defined elsewhere
32
+ if needs_table_rename?
33
+ connection.rename_table source_table_name, resource.table_name # pulled table arrives under the source name; move it to the model's table name
34
+ end
35
+ DataMiner.log_info "ran #{inspect}"
36
+ end
37
+
38
+ private
39
+
40
+ def connection
41
+ ActiveRecord::Base.connection
42
+ end
43
+
44
+ def db_config # connection settings used to build the local database URL
45
+ @_db_config ||= connection.instance_variable_get(:@config).dup.merge(options.except(:source_table_name)) # memoized; copies the connection's @config ivar and overlays every tap option except :source_table_name
46
+ end
47
+
48
+ def source_table_name
49
+ options[:source_table_name] || resource.table_name
50
+ end
51
+
52
+ def needs_table_rename?
53
+ source_table_name != resource.table_name
54
+ end
55
+
56
+ def adapter # short adapter name used as the URL scheme in taps_pull_cmd; the case has no else, so the result is nil when no pattern matches
57
+ case connection.adapter_name
58
+ when /mysql/i # case-insensitive substring match on the adapter name
59
+ 'mysql'
60
+ when /postgres/i
61
+ 'postgres'
62
+ when /sqlite/i
63
+ 'sqlite'
64
+ end
65
+ end
66
+
67
+ # never optional
68
+ def database
69
+ db_config[:database]
70
+ end
71
+
72
+ DEFAULT_PORTS = {
73
+ 'mysql' => 3306,
74
+ 'postgres' => 5432
75
+ }
76
+
77
+ DEFAULT_USERNAMES = {
78
+ 'mysql' => 'root',
79
+ 'postgres' => ''
80
+ }
81
+
82
+ DEFAULT_PASSWORDS = {}
83
+ DEFAULT_PASSWORDS.default = ''
84
+
85
+ DEFAULT_HOSTS = {}
86
+ DEFAULT_HOSTS.default = 'localhost'
87
+
88
+ %w{ username password port host }.each do |x| # generates four readers: username, password, port and host
89
+ module_eval %{
90
+ def #{x}
91
+ db_config[:#{x}] || DEFAULT_#{x.upcase}S[adapter]
92
+ end
93
+ }
94
+ end # each generated reader prefers the db_config entry and otherwise looks up DEFAULT_<NAME>S keyed by adapter
95
+
96
+ def db_locator # the part of the database URL that follows "<adapter>://"; nil when adapter matches neither branch
97
+ case adapter
98
+ when 'mysql', 'postgres'
99
+ "#{username}:#{password}@#{host}:#{port}/#{database}" # NOTE(review): values are interpolated as-is; no URL-encoding is applied here
100
+ when 'sqlite'
101
+ database # for sqlite only the database value is used
102
+ end
103
+ end
104
+
105
+ # taps pull mysql://root:password@localhost/taps_test http://foo:bar@data.brighterplanet.com:5000 --tables aircraft
106
+ def taps_pull_cmd # builds the `taps pull` command for a single table
107
+ Escape.shell_command [ # presumably quotes each element for the shell - Escape is defined outside this file, confirm
108
+ 'taps',
109
+ 'pull',
110
+ "#{adapter}://#{db_locator}", # local database URL
111
+ source, # remote URL given to the tap step
112
+ '--indexes-first',
113
+ '--tables',
114
+ source_table_name
115
+ ]
116
+ # "taps pull #{source} --indexes-first --tables #{source_table_name}"
117
+ end
118
+
119
+ # 2.3.5 mysql
120
+ # * <tt>:host</tt> - Defaults to "localhost".
121
+ # * <tt>:port</tt> - Defaults to 3306.
122
+ # * <tt>:socket</tt> - Defaults to "/tmp/mysql.sock".
123
+ # * <tt>:username</tt> - Defaults to "root"
124
+ # * <tt>:password</tt> - Defaults to nothing.
125
+ # * <tt>:database</tt> - The name of the database. No default, must be provided.
126
+ # * <tt>:encoding</tt> - (Optional) Sets the client encoding by executing "SET NAMES <encoding>" after connection.
127
+ # * <tt>:reconnect</tt> - Defaults to false (See MySQL documentation: http://dev.mysql.com/doc/refman/5.0/en/auto-reconnect.html).
128
+ # * <tt>:sslca</tt> - Necessary to use MySQL with an SSL connection.
129
+ # * <tt>:sslkey</tt> - Necessary to use MySQL with an SSL connection.
130
+ # * <tt>:sslcert</tt> - Necessary to use MySQL with an SSL connection.
131
+ # * <tt>:sslcapath</tt> - Necessary to use MySQL with an SSL connection.
132
+ # * <tt>:sslcipher</tt> - Necessary to use MySQL with an SSL connection.
133
+ # 2.3.5 postgresql
134
+ # * <tt>:host</tt> - Defaults to "localhost".
135
+ # * <tt>:port</tt> - Defaults to 5432.
136
+ # * <tt>:username</tt> - Defaults to nothing.
137
+ # * <tt>:password</tt> - Defaults to nothing.
138
+ # * <tt>:database</tt> - The name of the database. No default, must be provided.
139
+ # * <tt>:schema_search_path</tt> - An optional schema search path for the connection given as a string of comma-separated schema names. This is backward-compatible with the <tt>:schema_order</tt> option.
140
+ # * <tt>:encoding</tt> - An optional client encoding that is used in a <tt>SET client_encoding TO <encoding></tt> call on the connection.
141
+ # * <tt>:min_messages</tt> - An optional client min messages that is used in a <tt>SET client_min_messages TO <min_messages></tt> call on the connection.
142
+ # * <tt>:allow_concurrency</tt> - If true, use async query methods so Ruby threads don't deadlock; otherwise, use blocking query methods.
143
+ # 2.3.5 sqlite[3]
144
+ # * <tt>:database</tt> - Path to the database file.
145
+ end
146
+ end
@@ -0,0 +1,1399 @@
1
+ require 'test_helper'
2
+
3
+ class AutomobileFuelType < ActiveRecord::Base
4
+ set_primary_key :code
5
+
6
+ data_miner do
7
+ import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
8
+ :filename => 'Gd6-dsc.txt',
9
+ :format => :fixed_width,
10
+ :crop => 21..26, # inclusive
11
+ :cut => '2-',
12
+ :select => lambda { |row| /\A[A-Z]/.match row[:code] },
13
+ :schema => [[ 'code', 2, { :type => :string } ],
14
+ [ 'spacer', 2 ],
15
+ [ 'name', 52, { :type => :string } ]]) do
16
+ key 'code'
17
+ store 'name'
18
+ end
19
+
20
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do
21
+ key 'code'
22
+ store 'name'
23
+ store 'annual_distance'
24
+ store 'emission_factor'
25
+ end
26
+
27
+ # pull electricity emission factor from residential electricity
28
+ import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
29
+ :select => lambda { |row| row['code'] == 'El' }) do
30
+ key 'code'
31
+ store 'name'
32
+ store 'emission_factor'
33
+ end
34
+
35
+ # still need distance estimate for electric cars
36
+ end
37
+
38
+ CODES = {
39
+ :electricity => 'El',
40
+ :diesel => 'D'
41
+ }
42
+ end
43
+
44
+ class AutomobileVariant < ActiveRecord::Base
45
+ set_primary_key :row_hash
46
+
47
+ module FuelEconomyGuide
48
+ TRANSMISSIONS = {
49
+ 'A' => 'automatic',
50
+ 'M' => 'manual',
51
+ 'L' => 'automatic', # Lockup/automatic
52
+ 'S' => 'semiautomatic', # Semiautomatic
53
+ 'C' => 'manual' # TODO verify for VW Syncro
54
+ }
55
+
56
+ ENGINE_TYPES = {
57
+ '(GUZZLER)' => nil, # "gas guzzler"
58
+ '(POLICE)' => nil, # police automobile_variant
59
+ '(MPFI)' => 'injection',
60
+ '(MPI*)' => 'injection',
61
+ '(SPFI)' => 'injection',
62
+ '(FFS)' => 'injection',
63
+ '(TURBO)' => 'turbo',
64
+ '(TRBO)' => 'turbo',
65
+ '(TC*)' => 'turbo',
66
+ '(FFS,TRBO)' => %w(injection turbo),
67
+ '(S-CHARGE)' => 'supercharger',
68
+ '(SC*)' => 'supercharger',
69
+ '(DIESEL)' => nil, # diesel
70
+ '(DSL)' => nil, # diesel
71
+ '(ROTARY)' => nil, # rotary
72
+ '(VARIABLE)' => nil, # variable displacement
73
+ '(NO-CAT)' => nil, # no catalytic converter
74
+ '(OHC)' => nil, # overhead camshaft
75
+ '(OHV)' => nil, # overhead valves
76
+ '(16-VALVE)' => nil, # 16V
77
+ '(305)' => nil, # 305 cubic inch displacement
78
+ '(307)' => nil, # 307 cubic inch displacement
79
+ '(M-ENG)' => nil,
80
+ '(W-ENG)' => nil,
81
+ '(GM-BUICK)' => nil,
82
+ '(GM-CHEV)' => nil,
83
+ '(GM-OLDS)' => nil,
84
+ '(GM-PONT)' => nil,
85
+ }
86
+
87
+ class ParserB
88
+ attr_accessor :year
89
+ def initialize(options = {})
90
+ @year = options[:year]
91
+ end
92
+
93
+ def apply(row)
94
+ row.merge!({
95
+ 'make' => row['carline_mfr_name'], # make it line up with the errata
96
+ 'model' => row['carline_name'], # ditto
97
+ 'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
98
+ 'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
99
+ 'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
100
+ 'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
101
+ 'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
102
+ 'displacement' => _displacement(row['opt_disp']),
103
+ 'year' => year
104
+ })
105
+ row
106
+ end
107
+
108
+ def _displacement(str) # converts a displacement string to a Float; returns nil when neither suffix below matches
108
+ str = str.gsub(/[\(\)]/, '').strip # remove parentheses, then surrounding whitespace
109
+ if str =~ /^(.+)L$/
110
+ $1.to_f # text before a trailing "L" is used unchanged
111
+ elsif str =~ /^(.+)CC$/
112
+ $1.to_f / 1000 # text before a trailing "CC" is divided by 1000
113
+ end
114
+ end
116
+
117
+ def add_hints!(bus) # writes import hints into the bus hash and registers the Slither layout named :fuel_economy_guide_b on every call
117
+ bus[:format] = :fixed_width
118
+ bus[:cut] = '13-' if year == 1995
119
+ bus[:schema_name] = :fuel_economy_guide_b
120
+ bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' } # NOTE(review): key is spelled 'supress_code' but the column declared below is 'suppress_code', so this lookup presumably always yields nil; that column is typed :integer, so confirm what a blank field parses to before renaming the key
121
+ Slither.define :fuel_economy_guide_b do |d|
122
+ d.rows do |row|
123
+ row.trap { true } # there's only one section
124
+ row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
125
+ row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
126
+ row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
127
+ row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
128
+ row.column 'carline_name' , 28, :type => :string # CARLINE NAME
129
+ row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
130
+ row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
131
+ row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
132
+ row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
133
+ row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
134
+ row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
135
+ row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
136
+ row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
137
+ row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
138
+ row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
139
+ row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
140
+ row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
141
+ row.spacer 2
142
+ row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
143
+ row.spacer 2
144
+ row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
145
+ row.spacer 2
146
+ row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
147
+ row.spacer 2
148
+ row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
149
+ row.spacer 2
150
+ row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
151
+ row.spacer 2
152
+ row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
153
+ row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
154
+ row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
155
+ row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
156
+ row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
157
+ row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
158
+ row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
159
+ row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
160
+ row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
161
+ row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
162
+ row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
163
+ row.column 'filler' , 1, :type => :string # NOT USED
164
+ row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
165
+ row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
166
+ end
167
+ end
168
+ end
170
+ end
171
+ class ParserC
172
+ attr_accessor :year
173
+ def initialize(options = {})
174
+ @year = options[:year]
175
+ end
176
+
177
+ def add_hints!(bus)
178
+ # File will decide format based on filename
179
+ end
180
+
181
+ def apply(row)
182
+ row.merge!({
183
+ 'make' => row['Manufacturer'], # make it line up with the errata
184
+ 'model' => row['carline name'], # ditto
185
+ 'drive' => row['drv'] + 'WD',
186
+ 'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
187
+ 'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
188
+ 'turbo' => row['T'] == 'T',
189
+ 'supercharger' => row['S'] == 'S',
190
+ 'injection' => true,
191
+ 'year' => year
192
+ })
193
+ row
194
+ end
195
+ end
196
+ class ParserD
197
+ attr_accessor :year
198
+ def initialize(options = {})
199
+ @year = options[:year]
200
+ end
201
+
202
+ def add_hints!(bus)
203
+ bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
204
+ end
205
+
206
+ def apply(row)
207
+ row.merge!({
208
+ 'make' => row['MFR'], # make it line up with the errata
209
+ 'model' => row['CAR LINE'], # ditto
210
+ 'drive' => row['DRIVE SYS'] + 'WD',
211
+ 'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
212
+ 'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
213
+ 'turbo' => row['TURBO'] == 'T',
214
+ 'supercharger' => row['SPCHGR'] == 'S',
215
+ 'injection' => true,
216
+ 'year' => year
217
+ })
218
+ row
219
+ end
220
+ end
221
+ end
222
+
223
+ class Guru
224
+ # the following matching methods are needed by the errata
225
+ # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
226
+
227
+ def transmission_is_blank?(row)
228
+ row['transmission'].blank?
229
+ end
230
+
231
+ def is_a_2007_gmc_or_chevrolet?(row) # errata matcher: true for 2007 rows whose manufacturer is GMC or CHEVROLET
232
+ row['year'] == 2007 and %w(GMC CHEVROLET).include? row['MFR'].upcase # NOTE(review): reads the raw 'MFR' key, unlike the other matchers in this class which read 'make'; .upcase raises NoMethodError if row['MFR'] is nil
233
+ end
234
+
235
+ def is_a_porsche?(row)
236
+ row['make'].upcase == 'PORSCHE'
237
+ end
238
+
239
+ def is_not_a_porsche?(row)
240
+ !is_a_porsche? row
241
+ end
242
+
243
+ def is_a_mercedes_benz?(row)
244
+ row['make'] =~ /MERCEDES/i
245
+ end
246
+
247
+ def is_a_lexus?(row)
248
+ row['make'].upcase == 'LEXUS'
249
+ end
250
+
251
+ def is_a_bmw?(row)
252
+ row['make'].upcase == 'BMW'
253
+ end
254
+
255
+ def is_a_ford?(row)
256
+ row['make'].upcase == 'FORD'
257
+ end
258
+
259
+ def is_a_rolls_royce_and_model_contains_bentley?(row)
260
+ is_a_rolls_royce?(row) and model_contains_bentley?(row)
261
+ end
262
+
263
+ def is_a_bentley?(row)
264
+ row['make'].upcase == 'BENTLEY'
265
+ end
266
+
267
+ def is_a_rolls_royce?(row)
268
+ row['make'] =~ /ROLLS/i
269
+ end
270
+
271
+ def is_a_turbo_brooklands?(row)
272
+ row['model'] =~ /TURBO R\/RL BKLDS/i
273
+ end
274
+
275
+ def model_contains_maybach?(row)
276
+ row['model'] =~ /MAYBACH/i
277
+ end
278
+
279
+ def model_contains_bentley?(row)
280
+ row['model'] =~ /BENTLEY/i
281
+ end
282
+ end
283
+
284
+ errata = Errata.new :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv',
285
+ :responder => AutomobileVariant::Guru.new
286
+
287
+ data_miner do
288
+ # 1985---1997
289
+ (85..97).each do |yy|
290
+ filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
291
+ import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
292
+ :filename => filename,
293
+ :transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
294
+ :errata => errata) do
295
+ key 'row_hash'
296
+ store 'make_name', :field_name => 'make'
297
+ store 'model_name', :field_name => 'model'
298
+ store 'year'
299
+ store 'fuel_type_code', :field_name => 'fuel_type'
300
+ store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
301
+ store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
302
+ store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
303
+ store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
304
+ store 'cylinders', :field_name => 'no_cyc'
305
+ store 'drive', :field_name => 'drive_system'
306
+ store 'carline_mfr_code'
307
+ store 'vi_mfr_code'
308
+ store 'carline_code'
309
+ store 'carline_class_code', :field_name => 'carline_clss'
310
+ store 'transmission'
311
+ store 'speeds'
312
+ store 'turbo'
313
+ store 'supercharger'
314
+ store 'injection'
315
+ store 'displacement'
316
+ end
317
+ end
318
+
319
+ # 1998--2005
320
+ {
321
+ 1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
322
+ 1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
323
+ 2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
324
+ 2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
325
+ 2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
326
+ 2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
327
+ 2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
328
+ 2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
329
+ }.sort { |a, b| a.first <=> b.first }.each do |year, options|
330
+ import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
331
+ :errata => errata) do
332
+ key 'row_hash'
333
+ store 'make_name', :field_name => 'make'
334
+ store 'model_name', :field_name => 'model'
335
+ store 'fuel_type_code', :field_name => 'fl'
336
+ store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
337
+ store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
338
+ store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
339
+ store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
340
+ store 'cylinders', :field_name => 'cyl'
341
+ store 'displacement', :field_name => 'displ'
342
+ store 'carline_class_code', :field_name => 'cls' if year >= 2000
343
+ store 'carline_class_name', :field_name => 'Class'
344
+ store 'year'
345
+ store 'transmission'
346
+ store 'speeds'
347
+ store 'turbo'
348
+ store 'supercharger'
349
+ store 'injection'
350
+ store 'drive'
351
+ end
352
+ end
353
+
354
+ # 2006--2010
355
+ {
356
+ 2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
357
+ 2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
358
+ 2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
359
+ 2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
360
+ # 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
361
+ }.sort { |a, b| a.first <=> b.first }.each do |year, options|
362
+ import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
363
+ :errata => errata) do
364
+ key 'row_hash'
365
+ store 'make_name', :field_name => 'make'
366
+ store 'model_name', :field_name => 'model'
367
+ store 'fuel_type_code', :field_name => 'FUEL TYPE'
368
+ store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
369
+ store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
370
+ store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
371
+ store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
372
+ store 'cylinders', :field_name => 'NUMB CYL'
373
+ store 'displacement', :field_name => 'DISPLACEMENT'
374
+ store 'carline_class_code', :field_name => 'CLS'
375
+ store 'carline_class_name', :field_name => 'CLASS'
376
+ store 'year'
377
+ store 'transmission'
378
+ store 'speeds'
379
+ store 'turbo'
380
+ store 'supercharger'
381
+ store 'injection'
382
+ store 'drive'
383
+ end
384
+ end
385
+
386
+ # associate :make, :key => :original_automobile_make_name, :foreign_key => :name
387
+ # derive :automobile_model_id # creates models by name
388
+ # associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
389
+
390
+ process 'Set adjusted fuel economy' do
391
+ update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
392
+ update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
393
+ end
394
+ end
395
+
396
+ def name
397
+ extra = []
398
+ extra << "V#{cylinders}" if cylinders
399
+ extra << "#{displacement}L" if displacement
400
+ extra << "turbo" if turbo
401
+ extra << "FI" if injection
402
+ extra << "#{speeds}spd" if speeds.present?
403
+ extra << transmission if transmission.present?
404
+ extra << "(#{fuel_type.name})" if fuel_type
405
+ extra.join(' ')
406
+ end
407
+
408
+ def fuel_economy_description
409
+ [ fuel_efficiency_city, fuel_efficiency_highway ].map { |f| f.kilometres_per_litre.to(:miles_per_gallon).round }.join('/')
410
+ end
411
+ end
412
+
413
+ class Country < ActiveRecord::Base
414
+ set_primary_key :iso_3166
415
+
416
+ data_miner do
417
+ import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
418
+ key 'iso_3166', :field_number => 1
419
+ store 'name', :field_number => 0
420
+ end
421
+
422
+ import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
423
+ key 'iso_3166', :field_name => 'country code'
424
+ store 'name', :field_name => 'country'
425
+ end
426
+ end
427
+ end
428
+
429
+ class Airport < ActiveRecord::Base
430
+ set_primary_key :iata_code
431
+
432
+ data_miner do
433
+ import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
434
+ key 'iata_code', :field_number => 4
435
+ store 'name', :field_number => 1
436
+ store 'city', :field_number => 2
437
+ store 'country_name', :field_number => 3
438
+ store 'latitude', :field_number => 6
439
+ store 'longitude', :field_number => 7
440
+ end
441
+ end
442
+ end
443
+
444
+ class TappedAirport < ActiveRecord::Base
445
+ set_primary_key :iata_code
446
+
447
+ data_miner do
448
+ tap "Brighter Planet's sanitized airports table", "http://carbon:neutral@data.brighterplanet.com:5001", :source_table_name => 'airports'
449
+ # tap "Brighter Planet's sanitized airports table", "http://carbon:neutral@localhost:5000", :source_table_name => 'airports'
450
+ end
451
+ end
452
+
453
+ class CensusRegion < ActiveRecord::Base
454
+ set_primary_key :number
455
+
456
+ data_miner do
457
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
458
+ key 'number', :field_name => 'Region'
459
+ store 'name', :field_name => 'Name'
460
+ end
461
+
462
+ # pretend this is a different data source
463
+ # fake! just for testing purposes
464
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
465
+ key 'number', :field_name => 'Region'
466
+ store 'name', :field_name => 'Name'
467
+ end
468
+ end
469
+ end
470
+
471
+ # smaller than a region
472
+ class CensusDivision < ActiveRecord::Base
473
+ set_primary_key :number
474
+
475
+ data_miner do
476
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
477
+ key 'number', :field_name => 'Division'
478
+ store 'name', :field_name => 'Name'
479
+ store 'census_region_number', :field_name => 'Region'
480
+ store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
481
+ end
482
+ end
483
+ end
484
+
485
+ class CensusDivisionDeux < ActiveRecord::Base
486
+ set_primary_key :number
487
+
488
+ data_miner do
489
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
490
+ key 'number', :field_name => 'Division'
491
+ store 'name', :field_name => 'Name'
492
+ store 'census_region_number', :field_name => 'Region'
493
+ store 'census_region_name', :field_name => 'Region', :dictionary => DataMiner::Dictionary.new(:input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv')
494
+ end
495
+ end
496
+ end
497
+
498
+ class CrosscallingCensusRegion < ActiveRecord::Base
499
+ set_primary_key :number
500
+
501
+ has_many :crosscalling_census_divisions
502
+
503
+ data_miner do
504
+ process "derive ourselves from the census divisions table (i.e., cross call census divisions)" do
505
+ CrosscallingCensusDivision.run_data_miner!
506
+ connection.create_table :crosscalling_census_regions, :options => 'ENGINE=InnoDB default charset=utf8', :id => false, :force => true do |t|
507
+ t.column :number, :integer
508
+ t.column :name, :string
509
+ end
510
+ connection.execute 'ALTER TABLE crosscalling_census_regions ADD PRIMARY KEY (number);'
511
+ connection.execute %{
512
+ INSERT IGNORE INTO crosscalling_census_regions(number, name)
513
+ SELECT crosscalling_census_divisions.census_region_number, crosscalling_census_divisions.census_region_name FROM crosscalling_census_divisions
514
+ }
515
+ end
516
+ end
517
+ end
518
+
519
+ class CrosscallingCensusDivision < ActiveRecord::Base
520
+ set_primary_key :number
521
+
522
+ belongs_to :crosscalling_census_regions, :foreign_key => 'census_region_number'
523
+
524
+ data_miner do
525
+ import "get a list of census divisions and their regions", :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
526
+ key 'number', :field_name => 'Division'
527
+ store 'name', :field_name => 'Name'
528
+ store 'census_region_number', :field_name => 'Region'
529
+ store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
530
+ end
531
+
532
+ process "make sure my parent object is set up (i.e., cross-call it)" do
533
+ CrosscallingCensusRegion.run_data_miner!
534
+ end
535
+ end
536
+ end
537
+
538
+ class ResidentialEnergyConsumptionSurveyResponse < ActiveRecord::Base
539
+ set_primary_key :department_of_energy_identifier
540
+
541
+ data_miner do
542
+ process 'Define some unit conversions' do
543
+ Conversions.register :kbtus, :joules, 1_000.0 * 1_055.05585
544
+ Conversions.register :square_feet, :square_metres, 0.09290304
545
+ end
546
+
547
+ # conversions are NOT performed here, since we first have to zero out legitimate skips
548
+ # otherwise you will get values like "999 pounds = 453.138778 kilograms" (where 999 is really a legit skip)
549
+ import 'RECs 2005 (but not converting units to metric just yet)', :url => 'http://www.eia.doe.gov/emeu/recs/recspubuse05/datafiles/RECS05alldata.csv' do
550
+ key 'department_of_energy_identifier', :field_name => 'DOEID'
551
+
552
+ store 'residence_class', :field_name => 'TYPEHUQ', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/typehuq/typehuq.csv' }
553
+ store 'construction_year', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Date in the middle (synthetic)', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
554
+ store 'construction_period', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
555
+ store 'urbanity', :field_name => 'URBRUR', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/urbrur/urbrur.csv' }
556
+ store 'dishwasher_use', :field_name => 'DWASHUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dwashuse/dwashuse.csv' }
557
+ store 'central_ac_use', :field_name => 'USECENAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usecenac/usecenac.csv' }
558
+ store 'window_ac_use', :field_name => 'USEWWAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usewwac/usewwac.csv' }
559
+ store 'clothes_washer_use', :field_name => 'WASHLOAD', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/washload/washload.csv' }
560
+ store 'clothes_dryer_use', :field_name => 'DRYRUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dryruse/dryruse.csv' }
561
+
562
+ store 'census_division_number', :field_name => 'DIVISION'
563
+ store 'census_division_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
564
+ store 'census_region_number', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_number', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
565
+ store 'census_region_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
566
+
567
+ store 'floorspace', :field_name => 'TOTSQFT'
568
+ store 'residents', :field_name => 'NHSLDMEM'
569
+ store 'ownership', :field_name => 'KOWNRENT'
570
+ store 'thermostat_programmability', :field_name => 'PROTHERM'
571
+ store 'refrigerator_count', :field_name => 'NUMFRIG'
572
+ store 'freezer_count', :field_name => 'NUMFREEZ'
573
+ store 'heating_degree_days', :field_name => 'HD65'
574
+ store 'cooling_degree_days', :field_name => 'CD65'
575
+ store 'annual_energy_from_fuel_oil_for_heating_space', :field_name => 'BTUFOSPH'
576
+ store 'annual_energy_from_fuel_oil_for_heating_water', :field_name => 'BTUFOWTH'
577
+ store 'annual_energy_from_fuel_oil_for_appliances', :field_name => 'BTUFOAPL'
578
+ store 'annual_energy_from_natural_gas_for_heating_space', :field_name => 'BTUNGSPH'
579
+ store 'annual_energy_from_natural_gas_for_heating_water', :field_name => 'BTUNGWTH'
580
+ store 'annual_energy_from_natural_gas_for_appliances', :field_name => 'BTUNGAPL'
581
+ store 'annual_energy_from_propane_for_heating_space', :field_name => 'BTULPSPH'
582
+ store 'annual_energy_from_propane_for_heating_water', :field_name => 'BTULPWTH'
583
+ store 'annual_energy_from_propane_for_appliances', :field_name => 'BTULPAPL'
584
+ store 'annual_energy_from_wood', :field_name => 'BTUWOOD'
585
+ store 'annual_energy_from_kerosene', :field_name => 'BTUKER'
586
+ store 'annual_energy_from_electricity_for_clothes_driers', :field_name => 'BTUELCDR'
587
+ store 'annual_energy_from_electricity_for_dishwashers', :field_name => 'BTUELDWH'
588
+ store 'annual_energy_from_electricity_for_freezers', :field_name => 'BTUELFZZ'
589
+ store 'annual_energy_from_electricity_for_refrigerators', :field_name => 'BTUELRFG'
590
+ store 'annual_energy_from_electricity_for_air_conditioners', :field_name => 'BTUELCOL'
591
+ store 'annual_energy_from_electricity_for_heating_space', :field_name => 'BTUELSPH'
592
+ store 'annual_energy_from_electricity_for_heating_water', :field_name => 'BTUELWTH'
593
+ store 'annual_energy_from_electricity_for_other_appliances', :field_name => 'BTUELAPL'
594
+ store 'weighting', :field_name => 'NWEIGHT'
595
+ store 'total_rooms', :field_name => 'TOTROOMS'
596
+ store 'bathrooms', :field_name => 'NCOMBATH'
597
+ store 'halfbaths', :field_name => 'NHAFBATH'
598
+ store 'heated_garage', :field_name => 'GARGHEAT'
599
+ store 'attached_1car_garage', :field_name => 'GARAGE1C'
600
+ store 'detached_1car_garage', :field_name => 'DGARG1C'
601
+ store 'attached_2car_garage', :field_name => 'GARAGE2C'
602
+ store 'detached_2car_garage', :field_name => 'DGARG2C'
603
+ store 'attached_3car_garage', :field_name => 'GARAGE3C'
604
+ store 'detached_3car_garage', :field_name => 'DGARG3C'
605
+ store 'lights_on_1_to_4_hours', :field_name => 'LGT1'
606
+ store 'efficient_lights_on_1_to_4_hours', :field_name => 'LGT1EE'
607
+ store 'lights_on_4_to_12_hours', :field_name => 'LGT4'
608
+ store 'efficient_lights_on_4_to_12_hours', :field_name => 'LGT4EE'
609
+ store 'lights_on_over_12_hours', :field_name => 'LGT12'
610
+ store 'efficient_lights_on_over_12_hours', :field_name => 'LGT12EE'
611
+ store 'outdoor_all_night_lights', :field_name => 'NOUTLGTNT'
612
+ store 'outdoor_all_night_gas_lights', :field_name => 'NGASLIGHT'
613
+ end
614
+
615
+ # Rather than nullify the continuous variables that EIA identifies as LEGITIMATE SKIPS, we convert them to zero
616
+ # This makes it easier to derive useful information like "how many rooms does the house have?"
617
+ process 'Zero out what the EIA calls "LEGITIMATE SKIPS"' do # for every column named below: read its largest value and, if that value is written with nothing but 9s, set the rows holding it to 0
618
+ %w{
619
+ annual_energy_from_electricity_for_air_conditioners
620
+ annual_energy_from_electricity_for_clothes_driers
621
+ annual_energy_from_electricity_for_dishwashers
622
+ annual_energy_from_electricity_for_freezers
623
+ annual_energy_from_electricity_for_heating_space
624
+ annual_energy_from_electricity_for_heating_water
625
+ annual_energy_from_electricity_for_other_appliances
626
+ annual_energy_from_electricity_for_refrigerators
627
+ annual_energy_from_fuel_oil_for_appliances
628
+ annual_energy_from_fuel_oil_for_heating_space
629
+ annual_energy_from_fuel_oil_for_heating_water
630
+ annual_energy_from_kerosene
631
+ annual_energy_from_propane_for_appliances
632
+ annual_energy_from_propane_for_heating_space
633
+ annual_energy_from_propane_for_heating_water
634
+ annual_energy_from_natural_gas_for_appliances
635
+ annual_energy_from_natural_gas_for_heating_space
636
+ annual_energy_from_natural_gas_for_heating_water
637
+ annual_energy_from_wood
638
+ lights_on_1_to_4_hours
639
+ lights_on_over_12_hours
640
+ efficient_lights_on_over_12_hours
641
+ efficient_lights_on_1_to_4_hours
642
+ lights_on_4_to_12_hours
643
+ efficient_lights_on_4_to_12_hours
644
+ outdoor_all_night_gas_lights
645
+ outdoor_all_night_lights
646
+ thermostat_programmability
647
+ detached_1car_garage
648
+ detached_2car_garage
649
+ detached_3car_garage
650
+ attached_1car_garage
651
+ attached_2car_garage
652
+ attached_3car_garage
653
+ heated_garage
654
+ }.each do |attr_name| # attr_name is one column name from the list above
655
+ max = maximum attr_name, :select => "CONVERT(#{attr_name}, UNSIGNED INTEGER)"
656
+ # if the maximum value of a column is all 9's (e.g. 999), that value is the LEGITIMATE SKIP code, so rows holding it are set to zero
657
+ if /^9+$/.match(max.to_i.to_s) # the integer form of max must consist of 9s only
658
+ update_all "#{attr_name} = 0", "#{attr_name} = #{max}"
659
+ end
660
+ end
661
+ end
662
+
663
+ process 'Convert units to metric after zeroing out LEGITIMATE SKIPS' do # multiplies each listed column in place by a unit exchange rate
664
+ [ # each entry is [column name, unit to convert from, unit to convert to]
665
+ [ 'floorspace', :square_feet, :square_metres ],
666
+ [ 'annual_energy_from_fuel_oil_for_heating_space', :kbtus, :joules ],
667
+ [ 'annual_energy_from_fuel_oil_for_heating_water', :kbtus, :joules ],
668
+ [ 'annual_energy_from_fuel_oil_for_appliances', :kbtus, :joules ],
669
+ [ 'annual_energy_from_natural_gas_for_heating_space', :kbtus, :joules ],
670
+ [ 'annual_energy_from_natural_gas_for_heating_water', :kbtus, :joules ],
671
+ [ 'annual_energy_from_natural_gas_for_appliances', :kbtus, :joules ],
672
+ [ 'annual_energy_from_propane_for_heating_space', :kbtus, :joules ],
673
+ [ 'annual_energy_from_propane_for_heating_water', :kbtus, :joules ],
674
+ [ 'annual_energy_from_propane_for_appliances', :kbtus, :joules ],
675
+ [ 'annual_energy_from_wood', :kbtus, :joules ],
676
+ [ 'annual_energy_from_kerosene', :kbtus, :joules ],
677
+ [ 'annual_energy_from_electricity_for_clothes_driers', :kbtus, :joules ],
678
+ [ 'annual_energy_from_electricity_for_dishwashers', :kbtus, :joules ],
679
+ [ 'annual_energy_from_electricity_for_freezers', :kbtus, :joules ],
680
+ [ 'annual_energy_from_electricity_for_refrigerators', :kbtus, :joules ],
681
+ [ 'annual_energy_from_electricity_for_air_conditioners', :kbtus, :joules ],
682
+ [ 'annual_energy_from_electricity_for_heating_space', :kbtus, :joules ],
683
+ [ 'annual_energy_from_electricity_for_heating_water', :kbtus, :joules ],
684
+ [ 'annual_energy_from_electricity_for_other_appliances', :kbtus, :joules ],
685
+ ].each do |attr_name, from_units, to_units| # the rate comes from Conversions::Unit.exchange_rate(from_units, to_units)
686
+ update_all "#{attr_name} = #{attr_name} * #{Conversions::Unit.exchange_rate from_units, to_units}"
687
+ end
688
+ end
689
+
690
+ process 'Add a new field "rooms" that estimates how many rooms are in the house' do # single SQL update over all rows
691
+ update_all 'rooms = total_rooms + bathrooms/2 + halfbaths/4 + heated_garage*(attached_1car_garage + detached_1car_garage + 2*(attached_2car_garage + detached_2car_garage) + 3*(attached_3car_garage + detached_3car_garage))' # full baths weigh 1/2, half baths 1/4; garage columns are weighted 1, 2 and 3 and the garage total is multiplied by heated_garage
692
+ end
693
+
694
+ process 'Add a new field "lighting_use" that estimates how many hours light bulbs are turned on in the house' do # single SQL update over all rows
695
+ update_all 'lighting_use = 2*(lights_on_1_to_4_hours + efficient_lights_on_1_to_4_hours) + 8*(lights_on_4_to_12_hours + efficient_lights_on_4_to_12_hours) + 16*(lights_on_over_12_hours + efficient_lights_on_over_12_hours) + 12*(outdoor_all_night_lights + outdoor_all_night_gas_lights)' # weights: 2 for the 1-4 hour columns, 8 for 4-12, 16 for over 12, 12 for outdoor all-night lights (presumably representative hours per band)
696
+ end
697
+
698
+ process 'Add a new field "lighting_efficiency" that estimates what percentage of light bulbs in a house are energy-efficient' do
699
+ update_all 'lighting_efficiency = (2*efficient_lights_on_1_to_4_hours + 8*efficient_lights_on_4_to_12_hours + 16*efficient_lights_on_over_12_hours) / lighting_use'
700
+ end
701
+ end
702
+ end
703
+
704
+ # T-100 Segment (All Carriers): http://www.transtats.bts.gov/Fields.asp?Table_ID=293
705
+ class T100FlightSegment < ActiveRecord::Base
706
+ set_primary_key :row_hash
707
+ URL = 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0'
708
+ FORM_DATA = %{
709
+ UserTableName=T_100_Segment__All_Carriers&
710
+ DBShortName=Air_Carriers&
711
+ RawDataTable=T_T100_SEGMENT_ALL_CARRIER&
712
+ sqlstr=+SELECT+DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE+FROM++T_T100_SEGMENT_ALL_CARRIER+WHERE+Month+%3D__MONTH_NUMBER__+AND+YEAR%3D__YEAR__&
713
+ varlist=DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE&
714
+ grouplist=&
715
+ suml=&
716
+ sumRegion=&
717
+ filter1=title%3D&
718
+ filter2=title%3D&
719
+ geo=All%A0&
720
+ time=__MONTH_NAME__&
721
+ timename=Month&
722
+ GEOGRAPHY=All&
723
+ XYEAR=__YEAR__&
724
+ FREQUENCY=__MONTH_NUMBER__&
725
+ AllVars=All&
726
+ VarName=DEPARTURES_SCHEDULED&
727
+ VarDesc=DepScheduled&
728
+ VarType=Num&
729
+ VarName=DEPARTURES_PERFORMED&
730
+ VarDesc=DepPerformed&
731
+ VarType=Num&
732
+ VarName=PAYLOAD&
733
+ VarDesc=Payload&
734
+ VarType=Num&
735
+ VarName=SEATS&
736
+ VarDesc=Seats&
737
+ VarType=Num&
738
+ VarName=PASSENGERS&
739
+ VarDesc=Passengers&
740
+ VarType=Num&
741
+ VarName=FREIGHT&
742
+ VarDesc=Freight&
743
+ VarType=Num&
744
+ VarName=MAIL&
745
+ VarDesc=Mail&
746
+ VarType=Num&
747
+ VarName=DISTANCE&
748
+ VarDesc=Distance&
749
+ VarType=Num&
750
+ VarName=RAMP_TO_RAMP&
751
+ VarDesc=RampToRamp&
752
+ VarType=Num&
753
+ VarName=AIR_TIME&
754
+ VarDesc=AirTime&
755
+ VarType=Num&
756
+ VarName=UNIQUE_CARRIER&
757
+ VarDesc=UniqueCarrier&
758
+ VarType=Char&
759
+ VarName=AIRLINE_ID&
760
+ VarDesc=AirlineID&
761
+ VarType=Num&
762
+ VarName=UNIQUE_CARRIER_NAME&
763
+ VarDesc=UniqueCarrierName&
764
+ VarType=Char&
765
+ VarName=UNIQUE_CARRIER_ENTITY&
766
+ VarDesc=UniqCarrierEntity&
767
+ VarType=Char&
768
+ VarName=REGION&
769
+ VarDesc=CarrierRegion&
770
+ VarType=Char&
771
+ VarName=CARRIER&
772
+ VarDesc=Carrier&
773
+ VarType=Char&
774
+ VarName=CARRIER_NAME&
775
+ VarDesc=CarrierName&
776
+ VarType=Char&
777
+ VarName=CARRIER_GROUP&
778
+ VarDesc=CarrierGroup&
779
+ VarType=Num&
780
+ VarName=CARRIER_GROUP_NEW&
781
+ VarDesc=CarrierGroupNew&
782
+ VarType=Num&
783
+ VarName=ORIGIN&
784
+ VarDesc=Origin&
785
+ VarType=Char&
786
+ VarName=ORIGIN_CITY_NAME&
787
+ VarDesc=OriginCityName&
788
+ VarType=Char&
789
+ VarName=ORIGIN_CITY_NUM&
790
+ VarDesc=OriginCityNum&
791
+ VarType=Num&
792
+ VarName=ORIGIN_STATE_ABR&
793
+ VarDesc=OriginState&
794
+ VarType=Char&
795
+ VarName=ORIGIN_STATE_FIPS&
796
+ VarDesc=OriginStateFips&
797
+ VarType=Char&
798
+ VarName=ORIGIN_STATE_NM&
799
+ VarDesc=OriginStateName&
800
+ VarType=Char&
801
+ VarName=ORIGIN_COUNTRY&
802
+ VarDesc=OriginCountry&
803
+ VarType=Char&
804
+ VarName=ORIGIN_COUNTRY_NAME&
805
+ VarDesc=OriginCountryName&
806
+ VarType=Char&
807
+ VarName=ORIGIN_WAC&
808
+ VarDesc=OriginWac&
809
+ VarType=Num&
810
+ VarName=DEST&
811
+ VarDesc=Dest&
812
+ VarType=Char&
813
+ VarName=DEST_CITY_NAME&
814
+ VarDesc=DestCityName&
815
+ VarType=Char&
816
+ VarName=DEST_CITY_NUM&
817
+ VarDesc=DestCityNum&
818
+ VarType=Num&
819
+ VarName=DEST_STATE_ABR&
820
+ VarDesc=DestState&
821
+ VarType=Char&
822
+ VarName=DEST_STATE_FIPS&
823
+ VarDesc=DestStateFips&
824
+ VarType=Char&
825
+ VarName=DEST_STATE_NM&
826
+ VarDesc=DestStateName&
827
+ VarType=Char&
828
+ VarName=DEST_COUNTRY&
829
+ VarDesc=DestCountry&
830
+ VarType=Char&
831
+ VarName=DEST_COUNTRY_NAME&
832
+ VarDesc=DestCountryName&
833
+ VarType=Char&
834
+ VarName=DEST_WAC&
835
+ VarDesc=DestWac&
836
+ VarType=Num&
837
+ VarName=AIRCRAFT_GROUP&
838
+ VarDesc=AircraftGroup&
839
+ VarType=Num&
840
+ VarName=AIRCRAFT_TYPE&
841
+ VarDesc=AircraftType&
842
+ VarType=Char&
843
+ VarName=AIRCRAFT_CONFIG&
844
+ VarDesc=AircraftConfig&
845
+ VarType=Num&
846
+ VarName=YEAR&
847
+ VarDesc=Year&
848
+ VarType=Num&
849
+ VarName=QUARTER&
850
+ VarDesc=Quarter&
851
+ VarType=Num&
852
+ VarName=MONTH&
853
+ VarDesc=Month&
854
+ VarType=Num&
855
+ VarName=DISTANCE_GROUP&
856
+ VarDesc=DistanceGroup&
857
+ VarType=Num&
858
+ VarName=CLASS&
859
+ VarDesc=Class&
860
+ VarType=Char&
861
+ VarName=DATA_SOURCE&
862
+ VarDesc=DataSource&
863
+ VarType=Char
864
+ }.gsub /[\s]+/,''
865
+
866
+ data_miner do
867
+ months = Hash.new # maps the Time of each month to the form body that requests that month
868
+ # (2008..2009).each do |year|
869
+ (2008..2008).each do |year|
870
+ # (1..12).each do |month|
871
+ (1..1).each do |month|
872
+ time = Time.gm year, month
873
+ form_data = FORM_DATA.dup # copy, so the in-place gsub! calls below leave the FORM_DATA constant untouched
874
+ form_data.gsub! '__YEAR__', time.year.to_s
875
+ form_data.gsub! '__MONTH_NUMBER__', time.month.to_s
876
+ form_data.gsub! '__MONTH_NAME__', time.strftime('%B') # %B is the full month name
877
+ months[time] = form_data
878
+ end
879
+ end
880
+ months.each do |month, form_data|
881
+ import "T100 data from #{month.strftime('%B %Y')}",
882
+ :url => URL,
883
+ :form_data => form_data,
884
+ :compression => :zip,
885
+ :glob => '/*.csv' do
886
+ key 'row_hash'
887
+ store 'departures_scheduled', :field_name => 'DEPARTURES_SCHEDULED'
888
+ store 'departures_performed', :field_name => 'DEPARTURES_PERFORMED'
889
+ store 'payload', :field_name => 'PAYLOAD', :from_units => :pounds, :to_units => :kilograms
890
+ store 'seats', :field_name => 'SEATS'
891
+ store 'passengers', :field_name => 'PASSENGERS'
892
+ store 'freight', :field_name => 'FREIGHT', :from_units => :pounds, :to_units => :kilograms
893
+ store 'mail', :field_name => 'MAIL', :from_units => :pounds, :to_units => :kilograms
894
+ store 'distance', :field_name => 'DISTANCE', :from_units => :miles, :to_units => :kilometres
895
+ store 'ramp_to_ramp', :field_name => 'RAMP_TO_RAMP'
896
+ store 'air_time', :field_name => 'AIR_TIME'
897
+ store 'unique_carrier', :field_name => 'UNIQUE_CARRIER'
898
+ store 'dot_airline_id', :field_name => 'AIRLINE_ID'
899
+ store 'unique_carrier_name', :field_name => 'UNIQUE_CARRIER_NAME'
900
+ store 'unique_carrier_entity', :field_name => 'UNIQUE_CARRIER_ENTITY'
901
+ store 'region', :field_name => 'REGION'
902
+ store 'carrier', :field_name => 'CARRIER'
903
+ store 'carrier_name', :field_name => 'CARRIER_NAME'
904
+ store 'carrier_group', :field_name => 'CARRIER_GROUP'
905
+ store 'carrier_group_new', :field_name => 'CARRIER_GROUP_NEW'
906
+ store 'origin_airport_iata', :field_name => 'ORIGIN'
907
+ store 'origin_city_name', :field_name => 'ORIGIN_CITY_NAME'
908
+ store 'origin_city_num', :field_name => 'ORIGIN_CITY_NUM'
909
+ store 'origin_state_abr', :field_name => 'ORIGIN_STATE_ABR'
910
+ store 'origin_state_fips', :field_name => 'ORIGIN_STATE_FIPS'
911
+ store 'origin_state_nm', :field_name => 'ORIGIN_STATE_NM'
912
+ store 'origin_country_iso_3166', :field_name => 'ORIGIN_COUNTRY'
913
+ store 'origin_country_name', :field_name => 'ORIGIN_COUNTRY_NAME'
914
+ store 'origin_wac', :field_name => 'ORIGIN_WAC'
915
+ store 'dest_airport_iata', :field_name => 'DEST'
916
+ store 'dest_city_name', :field_name => 'DEST_CITY_NAME'
917
+ store 'dest_city_num', :field_name => 'DEST_CITY_NUM'
918
+ store 'dest_state_abr', :field_name => 'DEST_STATE_ABR'
919
+ store 'dest_state_fips', :field_name => 'DEST_STATE_FIPS'
920
+ store 'dest_state_nm', :field_name => 'DEST_STATE_NM'
921
+ store 'dest_country_iso_3166', :field_name => 'DEST_COUNTRY'
922
+ store 'dest_country_name', :field_name => 'DEST_COUNTRY_NAME'
923
+ store 'dest_wac', :field_name => 'DEST_WAC'
924
+ store 'bts_aircraft_group', :field_name => 'AIRCRAFT_GROUP'
925
+ store 'bts_aircraft_type', :field_name => 'AIRCRAFT_TYPE'
926
+ store 'bts_aircraft_config', :field_name => 'AIRCRAFT_CONFIG'
927
+ store 'year', :field_name => 'YEAR'
928
+ store 'quarter', :field_name => 'QUARTER'
929
+ store 'month', :field_name => 'MONTH'
930
+ store 'bts_distance_group', :field_name => 'DISTANCE_GROUP'
931
+ store 'bts_service_class', :field_name => 'CLASS'
932
+ store 'data_source', :field_name => 'DATA_SOURCE'
933
+ end
934
+ end
935
+
936
+ process 'Derive freight share as a fraction of payload' do # single SQL update restricted by the condition below
937
+ update_all 'freight_share = (freight + mail) / payload', 'payload > 0' # second argument is the row condition; it keeps zero-payload rows out of the division
938
+ end
939
+
940
+ process 'Derive load factor, which is passengers divided by the total seats available' do
941
+ update_all 'load_factor = passengers / seats', 'passengers <= seats'
942
+ end
943
+
944
+ process 'Derive average seats per departure' do # single SQL update restricted by the condition below
945
+ update_all 'seats_per_departure = seats / departures_performed', 'departures_performed > 0' # rows with no performed departures are skipped, so the denominator is never zero
946
+ end
947
+ end
948
+ end
949
+
950
+ require 'loose_tight_dictionary'
951
+ class Aircraft < ActiveRecord::Base
952
+ set_primary_key :icao_code
953
+
954
+ def self.bts_dictionary # memoised LooseTightDictionary over the BTS number_260.csv table, keeping rows whose 'Aircraft Type' is between 1 and 998 and whose 'Manufacturer' is present
955
+ @_dictionary ||= LooseTightDictionary.new RemoteTable.new(:url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv', :select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) and record['Manufacturer'].present? }),
956
+ :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
957
+ :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
958
+ :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
959
+ :left_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Model'] }, # string read from a left-hand record: Manufacturer and Model joined by a space
960
+ :right_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] } # string read from a right-hand record: Manufacturer and Long Name joined by a space
961
+ end
962
+
963
+ class BtsAircraftTypeCodeMatcher # passed as a :matcher to store 'bts_aircraft_type_code' in the data_miner import
964
+ def match(left_record) # returns the matched record's 'Aircraft Type', or nil when the dictionary finds no match
965
+ right_record = Aircraft.bts_dictionary.left_to_right left_record
966
+ right_record['Aircraft Type'] if right_record
967
+ end
968
+ end
969
+
970
+ class BtsNameMatcher # passed as a :matcher to store 'bts_name' in the data_miner import
971
+ def match(left_record) # returns "<Manufacturer> <Long Name>" of the matched record, or nil when the dictionary finds no match
972
+ right_record = Aircraft.bts_dictionary.left_to_right left_record
973
+ right_record['Manufacturer'] + ' ' + right_record['Long Name'] if right_record
974
+ end
975
+ end
976
+
977
+ class Guru # row predicates; an instance is handed to Errata.new as :responder in this model's data_miner block
978
+ # for errata
979
+ def is_attributed_to_boeing?(row) # truthy when row['Manufacturer'] matches BOEING, case-insensitively
980
+ row['Manufacturer'] =~ /BOEING/i
981
+ end
982
+
983
+ def is_attributed_to_cessna?(row) # truthy when row['Manufacturer'] matches CESSNA, case-insensitively
984
+ row['Manufacturer'] =~ /CESSNA/i
985
+ end
986
+
987
+ def is_attributed_to_fokker?(row) # truthy when row['Manufacturer'] matches FOKKER, case-insensitively
988
+ row['Manufacturer'] =~ /FOKKER/i
989
+ end
990
+
991
+ def is_not_attributed_to_aerospatiale?(row) # true when row['Manufacturer'] does not match (a nil value also counts as no match)
992
+ not row['Manufacturer'] =~ /AEROSPATIALE/i
993
+ end
994
+
995
+ def is_not_attributed_to_cessna?(row) # negation of the CESSNA match
996
+ not row['Manufacturer'] =~ /CESSNA/i
997
+ end
998
+
999
+ def is_not_attributed_to_learjet?(row) # the pattern is just LEAR, so any manufacturer containing it counts as a match
1000
+ not row['Manufacturer'] =~ /LEAR/i
1001
+ end
1002
+
1003
+ def is_not_attributed_to_dehavilland?(row) # pattern allows an optional space: DEHAVILLAND or DE HAVILLAND
1004
+ not row['Manufacturer'] =~ /DE ?HAVILLAND/i
1005
+ end
1006
+
1007
+ def is_not_attributed_to_mcdonnell_douglas?(row) # negation of the MCDONNELL DOUGLAS match
1008
+ not row['Manufacturer'] =~ /MCDONNELL DOUGLAS/i
1009
+ end
1010
+
1011
+ def is_not_a_dc_plane?(row) # checks row['Model'] rather than the manufacturer
1012
+ not row['Model'] =~ /DC/i
1013
+ end
1014
+
1015
+ def is_a_crj_900?(row) # exact, case-insensitive comparison of the designator
1016
+ row['Designator'].downcase == 'crj9' # NOTE(review): raises NoMethodError when 'Designator' is nil — confirm rows always carry it
1017
+ end
1018
+ end
1019
+
1020
+ data_miner do
1021
+ # ('A'..'Z').each do |letter|
1022
+ # Note: for the purposes of testing, only importing "D"
1023
+ %w{ D }.each do |letter|
1024
+ import("ICAO codes starting with letter #{letter} used by the FAA",
1025
+ :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
1026
+ :encoding => 'US-ASCII',
1027
+ :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
1028
+ :responder => Aircraft::Guru.new),
1029
+ :row_xpath => '//table/tr[2]/td/table/tr',
1030
+ :column_xpath => 'td') do
1031
+ key 'icao_code', :field_name => 'Designator'
1032
+ store 'bts_name', :matcher => Aircraft::BtsNameMatcher.new
1033
+ store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher.new
1034
+ store 'manufacturer_name', :field_name => 'Manufacturer'
1035
+ store 'name', :field_name => 'Model'
1036
+ end
1037
+
1038
+ import 'Brighter Planet aircraft class codes',
1039
+ :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
1040
+ key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
1041
+ store 'brighter_planet_aircraft_class_code'
1042
+ end
1043
+ end
1044
+ end
1045
+ end
1046
+
1047
+ # note that this depends on stuff in Aircraft
1048
+ class AircraftDeux < ActiveRecord::Base
1049
+ set_primary_key :icao_code
1050
+
1051
+ # defined on the class because we defined the errata with a shorthand
1052
+ class << self
1053
+ def is_not_attributed_to_aerospatiale?(row)
1054
+ not row['Manufacturer'] =~ /AEROSPATIALE/i
1055
+ end
1056
+
1057
+ def is_not_attributed_to_cessna?(row)
1058
+ not row['Manufacturer'] =~ /CESSNA/i
1059
+ end
1060
+
1061
+ def is_not_attributed_to_learjet?(row)
1062
+ not row['Manufacturer'] =~ /LEAR/i
1063
+ end
1064
+
1065
+ def is_not_attributed_to_dehavilland?(row)
1066
+ not row['Manufacturer'] =~ /DE ?HAVILLAND/i
1067
+ end
1068
+
1069
+ def is_not_attributed_to_mcdonnell_douglas?(row)
1070
+ not row['Manufacturer'] =~ /MCDONNELL DOUGLAS/i
1071
+ end
1072
+
1073
+ def is_not_a_dc_plane?(row)
1074
+ not row['Model'] =~ /DC/i
1075
+ end
1076
+
1077
+ def is_a_crj_900?(row)
1078
+ row['Designator'].downcase == 'crj9'
1079
+ end
1080
+ end
1081
+
1082
+ data_miner do
1083
+ # ('A'..'Z').each do |letter|
1084
+ # Note: for the purposes of testing, only importing "D"
1085
+ %w{ D }.each do |letter|
1086
+ import("ICAO codes starting with letter #{letter} used by the FAA",
1087
+ :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
1088
+ :encoding => 'windows-1252',
1089
+ :errata => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
1090
+ :row_xpath => '//table/tr[2]/td/table/tr',
1091
+ :column_xpath => 'td') do
1092
+ key 'icao_code', :field_name => 'Designator'
1093
+ store 'bts_name', :matcher => Aircraft::BtsNameMatcher.new
1094
+ store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher.new
1095
+ store 'manufacturer_name', :field_name => 'Manufacturer'
1096
+ store 'name', :field_name => 'Model'
1097
+ end
1098
+ end
1099
+ end
1100
+ end
1101
+
1102
+ class AutomobileMakeFleetYear < ActiveRecord::Base # test model whose primary key is a synthesized name string
1103
+ set_primary_key :name
1104
+
1105
+ data_miner do # steps in order: schema, two flag-controlled process steps, then the CSV import
1106
+ schema :id => false do # presumably :id => false suppresses the default id column, since name is the primary key — confirm
1107
+ string "name"
1108
+ string "make_name"
1109
+ string "fleet"
1110
+ integer "year"
1111
+ float "fuel_efficiency"
1112
+ string "fuel_efficiency_units"
1113
+ integer "volume"
1114
+ string "make_year_name"
1115
+ datetime "created_at"
1116
+ datetime "updated_at"
1117
+ end
1118
+
1119
+ process "finish if i tell you to" do # $force_finish is a global the tests toggle to exercise DataMiner::Finish
1120
+ raise DataMiner::Finish if $force_finish
1121
+ end
1122
+
1123
+ process "skip if i tell you to" do # $force_skip is a global the tests toggle to exercise DataMiner::Skip
1124
+ raise DataMiner::Skip if $force_skip
1125
+ end
1126
+
1127
+ # CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
1128
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
1129
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
1130
+ :select => lambda { |row| row['volume'].to_i > 0 } do # keeps only rows whose volume converts to a positive integer
1131
+ key 'name', :synthesize => lambda { |row| [ row['manufacturer_name'], row['fleet'][2,2], row['year_content'] ].join ' ' } # manufacturer, two characters of fleet starting at index 2, and year, space-joined (the tests look for 'Alfa Romeo IP 1978')
1132
+ store 'make_name', :field_name => 'manufacturer_name'
1133
+ store 'year', :field_name => 'year_content'
1134
+ store 'fleet', :chars => 2..3 # zero-based
1135
+ store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
1136
+ store 'volume'
1137
+ end
1138
+ end
1139
+ end
1140
+
1141
+ class CensusDivisionTrois < ActiveRecord::Base # schema-only model: its data_miner block declares columns and indexes but no import steps
1142
+ set_primary_key :number_code
1143
+ data_miner do
1144
+ schema :options => 'ENGINE=InnoDB default charset=utf8' do
1145
+ string 'number_code'
1146
+ string 'name'
1147
+ string 'census_region_name'
1148
+ integer 'census_region_number'
1149
+ index 'census_region_name', :name => 'homefry' # the tests look this index up by its name
1150
+ index ['number_code', 'name', 'census_region_name', 'census_region_number', 'updated_at', 'created_at'] # NOTE(review): updated_at/created_at are not declared above; the tests expect them to exist after a run, so they are presumably added automatically — confirm
1151
+ end
1152
+ end
1153
+ end
1154
+
1155
+ class CensusDivisionFour < ActiveRecord::Base # schema-only model with no set_primary_key call; the tests assert its primary key column is an integer
1156
+ data_miner do
1157
+ schema do
1158
+ string 'number_code'
1159
+ string 'name'
1160
+ string 'census_region_name'
1161
+ integer 'census_region_number'
1162
+ index 'census_region_name', :name => 'homefry' # the tests look this index up by its name
1163
+ end
1164
+ end
1165
+ end
1166
+
1167
+ # todo: have somebody properly organize these
1168
+ class DataMinerTest < Test::Unit::TestCase
1169
+ if ENV['ALL'] == 'true' or ENV['NEW'] == 'true'
1170
+ should 'directly create a table for the model' do
1171
+ if AutomobileMakeFleetYear.table_exists?
1172
+ ActiveRecord::Base.connection.execute 'DROP TABLE automobile_make_fleet_years;'
1173
+ end
1174
+ AutomobileMakeFleetYear.execute_schema
1175
+ assert AutomobileMakeFleetYear.table_exists?
1176
+ end
1177
+ end
1178
+
1179
+ if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
1180
+ should 'override an existing data_miner configuration' do
1181
+ AutomobileFuelType.class_eval do
1182
+ data_miner do
1183
+ import 'example', :url => 'http://example.com' do
1184
+ key 'code'
1185
+ store 'name'
1186
+ end
1187
+ end
1188
+ end
1189
+ assert_kind_of DataMiner::Import, AutomobileFuelType.data_miner_base.steps.first
1190
+ assert_equal 'http://example.com', AutomobileFuelType.data_miner_base.steps.first.table.package.url
1191
+ assert_equal 1, AutomobileFuelType.data_miner_base.step_counter
1192
+ end
1193
+ should "stop and finish if it gets a DataMiner::Finish" do
1194
+ AutomobileMakeFleetYear.delete_all
1195
+ AutomobileMakeFleetYear.data_miner_runs.delete_all
1196
+ $force_finish = true
1197
+ AutomobileMakeFleetYear.run_data_miner!
1198
+ assert_equal 0, AutomobileMakeFleetYear.count
1199
+ assert_equal true, (AutomobileMakeFleetYear.data_miner_runs.count > 0)
1200
+ assert_equal true, AutomobileMakeFleetYear.data_miner_runs.all? { |run| run.finished? and not run.skipped and not run.killed? }
1201
+ $force_finish = false
1202
+ AutomobileMakeFleetYear.run_data_miner!
1203
+ assert AutomobileMakeFleetYear.exists?(:name => 'Alfa Romeo IP 1978')
1204
+ end
1205
+
1206
+ should "stop and register skipped if it gets a DataMiner::Skip" do # first run is cut short by the flag, second run imports normally
1207
+ AutomobileMakeFleetYear.delete_all
1208
+ AutomobileMakeFleetYear.data_miner_runs.delete_all
1209
+ $force_skip = true # makes the model's "skip if i tell you to" step raise DataMiner::Skip
1210
+ AutomobileMakeFleetYear.run_data_miner!
1211
+ assert_equal 0, AutomobileMakeFleetYear.count # nothing was imported by the skipped run
1212
+ assert_equal true, (AutomobileMakeFleetYear.data_miner_runs.count > 0) # the run was still recorded
1213
+ assert_equal true, AutomobileMakeFleetYear.data_miner_runs.all? { |run| run.skipped? and not run.finished? and not run.killed? }
1214
+ $force_skip = false
1215
+ AutomobileMakeFleetYear.run_data_miner!
1216
+ assert AutomobileMakeFleetYear.exists?(:name => 'Alfa Romeo IP 1978')
1217
+ end
1218
+
1219
+ should "eagerly enforce a schema" do
1220
+ ActiveRecord::Base.connection.create_table 'census_division_trois', :force => true, :options => 'ENGINE=InnoDB default charset=utf8' do |t|
1221
+ t.string 'name'
1222
+ # t.datetime 'updated_at'
1223
+ # t.datetime 'created_at'
1224
+ t.string 'census_region_name'
1225
+ # t.integer 'census_region_number'
1226
+ end
1227
+ ActiveRecord::Base.connection.execute 'ALTER TABLE census_division_trois ADD INDEX (census_region_name)'
1228
+ CensusDivisionTrois.reset_column_information
1229
+ missing_columns = %w{ updated_at created_at census_region_number }
1230
+
1231
+ # sanity check
1232
+ missing_columns.each do |column|
1233
+ assert_equal false, CensusDivisionTrois.column_names.include?(column)
1234
+ end
1235
+ assert_equal false, ActiveRecord::Base.connection.indexes(CensusDivisionTrois.table_name).any? { |index| index.name == 'homefry' }
1236
+
1237
+ 3.times do
1238
+ CensusDivisionTrois.run_data_miner!
1239
+ missing_columns.each do |column|
1240
+ assert_equal true, CensusDivisionTrois.column_names.include?(column)
1241
+ end
1242
+ assert_equal true, ActiveRecord::Base.connection.indexes(CensusDivisionTrois.table_name).any? { |index| index.name == 'homefry' }
1243
+ assert_equal :string, CensusDivisionTrois.columns_hash[CensusDivisionTrois.primary_key].type
1244
+ end
1245
+ end
1246
+
1247
+ should "let schemas work with default id primary keys" do
1248
+ ActiveRecord::Base.connection.create_table 'census_division_fours', :force => true, :options => 'ENGINE=InnoDB default charset=utf8' do |t|
1249
+ t.string 'name'
1250
+ # t.datetime 'updated_at'
1251
+ # t.datetime 'created_at'
1252
+ t.string 'census_region_name'
1253
+ # t.integer 'census_region_number'
1254
+ end
1255
+ ActiveRecord::Base.connection.execute 'ALTER TABLE census_division_fours ADD INDEX (census_region_name)'
1256
+ CensusDivisionFour.reset_column_information
1257
+ missing_columns = %w{ updated_at created_at census_region_number }
1258
+
1259
+ # sanity check
1260
+ missing_columns.each do |column|
1261
+ assert_equal false, CensusDivisionFour.column_names.include?(column)
1262
+ end
1263
+ assert_equal false, ActiveRecord::Base.connection.indexes(CensusDivisionFour.table_name).any? { |index| index.name == 'homefry' }
1264
+
1265
+ 3.times do
1266
+ CensusDivisionFour.run_data_miner!
1267
+ missing_columns.each do |column|
1268
+ assert_equal true, CensusDivisionFour.column_names.include?(column)
1269
+ end
1270
+ assert_equal true, ActiveRecord::Base.connection.indexes(CensusDivisionFour.table_name).any? { |index| index.name == 'homefry' }
1271
+ assert_equal :integer, CensusDivisionFour.columns_hash[CensusDivisionFour.primary_key].type
1272
+ end
1273
+ end
1274
+
1275
+ should "allow specifying dictionaries explicitly" do
1276
+ CensusDivisionDeux.run_data_miner!
1277
+ assert_equal 'South Region', CensusDivisionDeux.find(5).census_region_name
1278
+ end
1279
+
1280
+ should "be able to key on things other than the primary key" do
1281
+ Aircraft.run_data_miner!
1282
+ assert_equal 'SP', Aircraft.find('DHC6').brighter_planet_aircraft_class_code
1283
+ end
1284
+
1285
+ should "be able to synthesize rows without using a full parser class" do # runs the AutomobileMakeFleetYear miner, then checks one expected row
1286
+ AutomobileMakeFleetYear.run_data_miner!
1287
+ assert AutomobileMakeFleetYear.exists?(:name => 'Alfa Romeo IP 1978') # presumably a row synthesized during import; how it is built is not visible here
1288
+ end
1289
+
1290
+ should "keep a call stack so that you can call run_data_miner! on a child" do # entry point is the division; both tables are checked afterwards
1291
+ CrosscallingCensusDivision.run_data_miner!
1292
+ assert CrosscallingCensusDivision.exists? :name => 'Mountain Division', :number => 8, :census_region_number => 4, :census_region_name => 'West Region'
1293
+ assert CrosscallingCensusRegion.exists? :name => 'West Region', :number => 4 # the region table is populated even though only the division miner was invoked here
1294
+ end
1295
+
1296
+ should "keep a call stack so that you can call run_data_miner! on a parent" do # entry point is the region; both tables are checked afterwards
1297
+ CrosscallingCensusRegion.run_data_miner!
1298
+ assert CrosscallingCensusDivision.exists? :name => 'Mountain Division', :number => 8, :census_region_number => 4, :census_region_name => 'West Region' # the division table is populated even though only the region miner was invoked here
1299
+ assert CrosscallingCensusRegion.exists? :name => 'West Region', :number => 4
1300
+ end
1301
+
1302
+ should "import airports" do
1303
+ Airport.run_data_miner!
1304
+ assert Airport.count > 0
1305
+ end
1306
+
1307
+ should "tap airports" do
1308
+ TappedAirport.run_data_miner!
1309
+ assert TappedAirport.count > 0
1310
+ end
1311
+
1312
+ should "pull in census divisions using a data.brighterplanet.com dictionary" do
1313
+ CensusDivision.run_data_miner!
1314
+ assert CensusDivision.count > 0
1315
+ end
1316
+
1317
+ should "have a way to queue up runs that works with delated_job's send_later" do
1318
+ assert AutomobileVariant.respond_to?(:run_data_miner!)
1319
+ end
1320
+
1321
+ should "be idempotent" do
1322
+ Country.data_miner_base.run
1323
+ a = Country.count
1324
+ Country.data_miner_base.run
1325
+ b = Country.count
1326
+ assert_equal a, b
1327
+
1328
+ CensusRegion.data_miner_base.run
1329
+ a = CensusRegion.count
1330
+ CensusRegion.data_miner_base.run
1331
+ b = CensusRegion.count
1332
+ assert_equal a, b
1333
+ end
1334
+
1335
+ should "hash things" do
1336
+ AutomobileVariant.data_miner_base.steps[0].run(nil)
1337
+ assert AutomobileVariant.first.row_hash.present?
1338
+ end
1339
+
1340
+ should "process a callback block instead of a method" do
1341
+ AutomobileVariant.delete_all
1342
+ AutomobileVariant.data_miner_base.steps[0].run(nil)
1343
+ assert !AutomobileVariant.first.fuel_efficiency_city.present?
1344
+ AutomobileVariant.data_miner_base.steps.last.run(nil)
1345
+ assert AutomobileVariant.first.fuel_efficiency_city.present?
1346
+ end
1347
+
1348
+ should "keep a log when it does a run" do
1349
+ approx_started_at = Time.now
1350
+ DataMiner.run :resource_names => %w{ Country }
1351
+ approx_terminated_at = Time.now
1352
+ last_run = DataMiner::Run.first(:conditions => { :resource_name => 'Country' }, :order => 'id DESC')
1353
+ assert (last_run.started_at - approx_started_at).abs < 5 # seconds
1354
+ assert (last_run.terminated_at - approx_terminated_at).abs < 5 # seconds
1355
+ end
1356
+
1357
+ should "request a re-import from scratch" do
1358
+ c = Country.new
1359
+ c.iso_3166 = 'JUNK'
1360
+ c.save!
1361
+ assert Country.exists?(:iso_3166 => 'JUNK')
1362
+ DataMiner.run :resource_names => %w{ Country }, :from_scratch => true
1363
+ assert !Country.exists?(:iso_3166 => 'JUNK')
1364
+ end
1365
+
1366
+ should "know what runs were on a resource" do
1367
+ DataMiner.run :resource_names => %w{ Country }
1368
+ DataMiner.run :resource_names => %w{ Country }
1369
+ assert Country.data_miner_runs.count > 0
1370
+ end
1371
+ end
1372
+
1373
+ if ENV['ALL'] == 'true' or ENV['SLOW'] == 'true'
1374
+ should "allow errata to be specified with a shorthand, assuming the responder is the resource class itself" do # slow test: only defined when ENV['ALL'] or ENV['SLOW'] is 'true'
1375
+ AircraftDeux.run_data_miner!
1376
+ assert AircraftDeux.exists? :icao_code => 'DC91', :bts_aircraft_type_code => '630' # presumably a row that depends on the errata being applied; the errata declaration is not visible here
1377
+ end
1378
+
1379
+ should "mine aircraft" do # slow test: full Aircraft import, then a spot check on one row
1380
+ Aircraft.run_data_miner!
1381
+ assert Aircraft.exists? :icao_code => 'DC91', :bts_aircraft_type_code => '630' # same ICAO / BTS code pair that the AircraftDeux test checks
1382
+ end
1383
+
1384
+ should "mine automobile variants" do
1385
+ AutomobileVariant.run_data_miner!
1386
+ assert AutomobileVariant.count('make_name LIKE "%tesla"') > 0
1387
+ end
1388
+
1389
+ should "mine T100 flight segments" do
1390
+ T100FlightSegment.run_data_miner!
1391
+ assert T100FlightSegment.count('dest_country_name LIKE "%United States"') > 0
1392
+ end
1393
+
1394
+ should "mine residence survey responses" do # slow test: full survey import, then a spot check on the row with id 6
1395
+ ResidentialEnergyConsumptionSurveyResponse.run_data_miner!
1396
+ assert ResidentialEnergyConsumptionSurveyResponse.find(6).residence_class.start_with?('Single-family detached house') # prefix match only; the stored value may carry further text
1397
+ end
1398
+ end
1399
+ end