data_miner-ruby19 0.5.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,146 @@
1
+
2
module DataMiner
  # A data-miner step that clears the way for, and then fetches, one table by
  # shelling out to <tt>taps pull</tt>. The local side of the pull is described
  # by the current ActiveRecord connection (see #db_config).
  class Tap
    attr_reader :base
    attr_reader :position_in_run
    attr_reader :description
    attr_reader :source
    attr_reader :options
    delegate :resource, :to => :base

    # base::            object that responds to +resource+ (delegated above)
    # position_in_run:: index of this step in the run; any value other than 0
    #                   is reported through DataMiner.log_or_raise
    # description::     free-text label, only used by #inspect
    # source::          handed to <tt>taps pull</tt> unchanged as its second URL
    # options::         may carry :source_table_name; every other key is merged
    #                   over the connection config in #db_config
    def initialize(base, position_in_run, description, source, options = {})
      # Work on a symbolized copy so the caller's hash is left untouched.
      options = options.symbolize_keys
      DataMiner.log_or_raise "Tap has to be the first step." unless position_in_run == 0
      @base = base
      @position_in_run = position_in_run
      @description = description
      @source = source
      @options = options
    end

    def inspect
      "Tap(#{resource}): #{description} (#{source})"
    end

    # Drops the source-named and resource-named tables if they exist locally,
    # runs the taps command, and renames the pulled table to the resource's
    # table name when the two names differ. The +run+ argument is not used.
    def run(run)
      [ source_table_name, resource.table_name ].each do |possible_obstacle|
        if connection.table_exists?(possible_obstacle)
          connection.drop_table possible_obstacle
        end
      end
      DataMiner.backtick_with_reporting taps_pull_cmd
      if needs_table_rename?
        connection.rename_table source_table_name, resource.table_name
      end
      DataMiner.log_info "ran #{inspect}"
    end

    private

    def connection
      ActiveRecord::Base.connection
    end

    # Connection settings with the tap's own options layered on top (memoized).
    # Reads the adapter's @config instance variable directly.
    # Hash#merge already returns a new hash, so no explicit copy is needed.
    def db_config
      @_db_config ||= connection.instance_variable_get(:@config).merge(options.except(:source_table_name))
    end

    # Table name on the remote side; falls back to the resource's table name.
    def source_table_name
      options[:source_table_name] || resource.table_name
    end

    def needs_table_rename?
      source_table_name != resource.table_name
    end

    # URL scheme for the local database, derived from the adapter name.
    # Returns nil when the adapter name matches none of the patterns.
    def adapter
      case connection.adapter_name
      when /mysql/i
        'mysql'
      when /postgres/i
        'postgres'
      when /sqlite/i
        'sqlite'
      end
    end

    # never optional
    def database
      db_config[:database]
    end

    DEFAULT_PORTS = {
      'mysql' => 3306,
      'postgres' => 5432
    }

    DEFAULT_USERNAMES = {
      'mysql' => 'root',
      'postgres' => ''
    }

    DEFAULT_PASSWORDS = {}
    DEFAULT_PASSWORDS.default = ''

    DEFAULT_HOSTS = {}
    DEFAULT_HOSTS.default = 'localhost'

    # Generates #username, #password, #port and #host: each returns the value
    # from #db_config or, failing that, the per-adapter default from the
    # matching DEFAULT_* table above. File and line are passed so that
    # backtraces point at this file instead of "(eval)".
    %w{ username password port host }.each do |x|
      module_eval <<-RUBY, __FILE__, __LINE__ + 1
        def #{x}
          db_config[:#{x}] || DEFAULT_#{x.upcase}S[adapter]
        end
      RUBY
    end

    # The part of the local database URL after "<adapter>://".
    # Returns nil when #adapter is not one of the three known values.
    def db_locator
      case adapter
      when 'mysql', 'postgres'
        "#{username}:#{password}@#{host}:#{port}/#{database}"
      when 'sqlite'
        database
      end
    end

    # taps pull mysql://root:password@localhost/taps_test http://foo:bar@data.brighterplanet.com:5000 --tables aircraft
    def taps_pull_cmd
      Escape.shell_command [
        'taps',
        'pull',
        "#{adapter}://#{db_locator}",
        source,
        '--indexes-first',
        '--tables',
        source_table_name
      ]
      # "taps pull #{source} --indexes-first --tables #{source_table_name}"
    end

    # 2.3.5 mysql
    # * <tt>:host</tt> - Defaults to "localhost".
    # * <tt>:port</tt> - Defaults to 3306.
    # * <tt>:socket</tt> - Defaults to "/tmp/mysql.sock".
    # * <tt>:username</tt> - Defaults to "root"
    # * <tt>:password</tt> - Defaults to nothing.
    # * <tt>:database</tt> - The name of the database. No default, must be provided.
    # * <tt>:encoding</tt> - (Optional) Sets the client encoding by executing "SET NAMES <encoding>" after connection.
    # * <tt>:reconnect</tt> - Defaults to false (See MySQL documentation: http://dev.mysql.com/doc/refman/5.0/en/auto-reconnect.html).
    # * <tt>:sslca</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslkey</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslcert</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslcapath</tt> - Necessary to use MySQL with an SSL connection.
    # * <tt>:sslcipher</tt> - Necessary to use MySQL with an SSL connection.
    # 2.3.5 postgresql
    # * <tt>:host</tt> - Defaults to "localhost".
    # * <tt>:port</tt> - Defaults to 5432.
    # * <tt>:username</tt> - Defaults to nothing.
    # * <tt>:password</tt> - Defaults to nothing.
    # * <tt>:database</tt> - The name of the database. No default, must be provided.
    # * <tt>:schema_search_path</tt> - An optional schema search path for the connection given as a string of comma-separated schema names. This is backward-compatible with the <tt>:schema_order</tt> option.
    # * <tt>:encoding</tt> - An optional client encoding that is used in a <tt>SET client_encoding TO <encoding></tt> call on the connection.
    # * <tt>:min_messages</tt> - An optional client min messages that is used in a <tt>SET client_min_messages TO <min_messages></tt> call on the connection.
    # * <tt>:allow_concurrency</tt> - If true, use async query methods so Ruby threads don't deadlock; otherwise, use blocking query methods.
    # 2.3.5 sqlite[3]
    # * <tt>:database</tt> - Path to the database file.
  end
end
@@ -0,0 +1,1399 @@
1
+ require 'test_helper'
2
+
3
# Test fixture model for automobile fuel types, keyed by the +code+ column.
# Its data_miner block declares three import steps, in this order.
class AutomobileFuelType < ActiveRecord::Base
  set_primary_key :code

  data_miner do
    # Step 1: a fixed-width text file inside the EPA zip archive.
    # NOTE(review): :select reads row[:code] with a symbol key while the schema
    # and the other steps use string keys - confirm which key type the
    # fixed-width parser yields.
    import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
           :filename => 'Gd6-dsc.txt',
           :format => :fixed_width,
           :crop => 21..26, # inclusive
           :cut => '2-',
           :select => lambda { |row| /\A[A-Z]/.match row[:code] },
           :schema => [[ 'code', 2, { :type => :string } ],
                       [ 'spacer', 2 ],
                       [ 'name', 52, { :type => :string } ]]) do
      key 'code'
      store 'name'
    end

    # Step 2: a CSV export that also stores annual_distance and emission_factor.
    import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do
      key 'code'
      store 'name'
      store 'annual_distance'
      store 'emission_factor'
    end

    # pull electricity emission factor from residential electricity
    # (only rows whose 'code' is 'El' are selected)
    import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
           :select => lambda { |row| row['code'] == 'El' }) do
      key 'code'
      store 'name'
      store 'emission_factor'
    end

    # still need distance estimate for electric cars
  end

  # Symbolic names for two of the fuel type codes.
  CODES = {
    :electricity => 'El',
    :diesel => 'D'
  }
end
43
+
44
+ class AutomobileVariant < ActiveRecord::Base
45
+ set_primary_key :row_hash
46
+
47
+ module FuelEconomyGuide
48
# Maps a one-letter transmission code to a transmission name. The parsers
# below look it up with a single character taken from the transmission field.
TRANSMISSIONS = {
  'A' => 'automatic',
  'M' => 'manual',
  'L' => 'automatic', # Lockup/automatic
  'S' => 'semiautomatic', # Semiautomatic
  'C' => 'manual' # TODO verify for VW Syncro
}

# Maps an engine description token to the feature(s) it implies: nil when the
# token carries no feature, a string for one feature, an array for several.
# ParserB#apply flattens the looked-up values and checks for 'turbo',
# 'supercharger' and 'injection'.
ENGINE_TYPES = {
  '(GUZZLER)' => nil, # "gas guzzler"
  '(POLICE)' => nil, # police automobile_variant
  '(MPFI)' => 'injection',
  '(MPI*)' => 'injection',
  '(SPFI)' => 'injection',
  '(FFS)' => 'injection',
  '(TURBO)' => 'turbo',
  '(TRBO)' => 'turbo',
  '(TC*)' => 'turbo',
  '(FFS,TRBO)' => %w(injection turbo),
  '(S-CHARGE)' => 'supercharger',
  '(SC*)' => 'supercharger',
  '(DIESEL)' => nil, # diesel
  '(DSL)' => nil, # diesel
  '(ROTARY)' => nil, # rotary
  '(VARIABLE)' => nil, # variable displacement
  '(NO-CAT)' => nil, # no catalytic converter
  '(OHC)' => nil, # overhead camshaft
  '(OHV)' => nil, # overhead valves
  '(16-VALVE)' => nil, # 16V
  '(305)' => nil, # 305 cubic inch displacement
  '(307)' => nil, # 307 cubic inch displacement
  '(M-ENG)' => nil,
  '(W-ENG)' => nil,
  '(GM-BUICK)' => nil,
  '(GM-CHEV)' => nil,
  '(GM-OLDS)' => nil,
  '(GM-PONT)' => nil,
}
86
+
87
# Transform for the fixed-width fuel economy guide files (used for model
# years 1985-1997 by the data_miner block of AutomobileVariant).
class ParserB
  attr_accessor :year

  # options:: hash; only :year is read.
  def initialize(options = {})
    @year = options[:year]
  end

  # Adds the normalized columns ('make', 'model', 'transmission', 'speeds',
  # 'turbo', 'supercharger', 'injection', 'displacement', 'year') to +row+
  # in place and returns it.
  def apply(row)
    # Look the two engine descriptions up once; values may be nil, a string
    # or an array, hence the flatten.
    engine_features = [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten
    row.merge!({
      'make' => row['carline_mfr_name'], # make it line up with the errata
      'model' => row['carline_name'], # ditto
      'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
      'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
      'turbo' => engine_features.include?('turbo'),
      'supercharger' => engine_features.include?('supercharger'),
      'injection' => engine_features.include?('injection'),
      'displacement' => _displacement(row['opt_disp']),
      'year' => year
    })
    row
  end

  # Parses a displacement such as "(2.5L)" or "1500CC" into litres (Float).
  # Returns nil when the value is nil or has neither suffix.
  def _displacement(str)
    # to_s keeps a missing (nil) field from raising NoMethodError.
    str = str.to_s.gsub(/[\(\)]/, '').strip
    if str =~ /^(.+)L$/
      $1.to_f
    elsif str =~ /^(.+)CC$/
      $1.to_f / 1000
    end
  end

  # Sets the parsing hints on +bus+ and (re)defines the Slither schema
  # :fuel_economy_guide_b describing the fixed-width columns.
  def add_hints!(bus)
    bus[:format] = :fixed_width
    bus[:cut] = '13-' if year == 1995
    bus[:schema_name] = :fuel_economy_guide_b
    # NOTE(review): this reads 'supress_code' but the column defined below is
    # 'suppress_code', so the first condition looks like it is always blank.
    # Left unchanged: the column is typed :integer, so switching the key could
    # change which rows are selected - confirm the intended filter first.
    bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
    Slither.define :fuel_economy_guide_b do |d|
      d.rows do |row|
        row.trap { true } # there's only one section
        row.column 'active_year', 4, :type => :integer # ACTIVE YEAR
        row.column 'state_code', 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
        row.column 'carline_clss', 2, :type => :integer # CARLINE CLASS CODE
        row.column 'carline_mfr_code', 3, :type => :integer # CARLINE MANUFACTURER CODE
        row.column 'carline_name', 28, :type => :string # CARLINE NAME
        row.column 'disp_cub_in', 4, :type => :integer # DISP CUBIC INCHES
        row.column 'fuel_system', 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
        row.column 'model_trans', 6, :type => :string # TRANSMISSION TYPE
        row.column 'no_cyc', 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
        row.column 'date_time', 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
        row.column 'release_date', 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
        row.column 'vi_mfr_code', 3, :type => :integer # VI MANUFACTURER CODE
        row.column 'carline_code', 5, :type => :integer # CARLINE CODE
        row.column 'basic_eng_id', 5, :type => :integer # BASIC ENGINE INDEX
        row.column 'carline_mfr_name', 32, :type => :string # CARLINE MANUFACTURER NAME
        row.column 'suppress_code', 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
        row.column 'est_city_mpg', 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
        row.spacer 2
        row.column 'highway_mpg', 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
        row.spacer 2
        row.column 'combined_mpg', 3, :type => :integer # COMBINED MILES PER GALLON
        row.spacer 2
        row.column 'unadj_city_mpg', 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
        row.spacer 2
        row.column 'unadj_hwy_mpg', 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
        row.spacer 2
        row.column 'unadj_comb_mpg', 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
        row.spacer 2
        row.column 'ave_anl_fuel', 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
        row.column 'opt_disp', 8, :type => :string # OPTIONAL DISPLACEMENT
        row.column 'engine_desc1', 10, :type => :string # ENGINE DESCRIPTION 1
        row.column 'engine_desc2', 10, :type => :string # ENGINE DESCRIPTION 2
        row.column 'engine_desc3', 10, :type => :string # ENGINE DESCRIPTION 3
        row.column 'body_type_2d', 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
        row.column 'body_type_4d', 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
        row.column 'body_type_hbk', 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
        row.column 'puerto_rico', 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
        row.column 'overdrive', 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
        row.column 'drive_system', 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
        row.column 'filler', 1, :type => :string # NOT USED
        row.column 'fuel_type', 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
        row.column 'trans_desc', 15, :type => :string # TRANSMISSION DESCRIPTORS
      end
    end
  end
end
171
# Transform for the guide files whose columns are named 'Manufacturer',
# 'carline name', 'drv', 'trans', 'T' and 'S'.
class ParserC
  attr_accessor :year

  # options:: hash; only :year is read.
  def initialize(options = {})
    @year = options[:year]
  end

  # Nothing to hint: File will decide format based on filename.
  def add_hints!(bus)
  end

  # Adds the normalized columns to +row+ in place and returns it.
  def apply(row)
    normalized = {
      'make' => row['Manufacturer'], # make it line up with the errata
      'model' => row['carline name'], # ditto
      'drive' => row['drv'] + 'WD'
    }
    # The transmission letter and speed count are read from fixed offsets
    # counted from the end of the 'trans' field.
    gearbox = row['trans']
    normalized['transmission'] = TRANSMISSIONS[gearbox[-3, 1]]
    speed_code = gearbox[-2, 1]
    normalized['speeds'] = if speed_code == 'V'
      'variable'
    else
      speed_code
    end
    normalized['turbo'] = row['T'] == 'T'
    normalized['supercharger'] = row['S'] == 'S'
    normalized['injection'] = true
    normalized['year'] = year
    row.merge!(normalized)
    row
  end
end
196
# Transform for the guide files whose columns are named 'MFR', 'CAR LINE',
# 'DRIVE SYS', 'TRANS', 'TURBO' and 'SPCHGR'.
class ParserD
  attr_accessor :year

  # options:: hash; only :year is read.
  def initialize(options = {})
    @year = options[:year]
  end

  # For year 2007 only, installs a :reject filter that drops rows whose
  # first value is blank.
  def add_hints!(bus)
    return unless year == 2007
    bus[:reject] = lambda { |row| row.values.first.blank? }
  end

  # Adds the normalized columns to +row+ in place and returns it.
  def apply(row)
    normalized = {
      'make' => row['MFR'], # make it line up with the errata
      'model' => row['CAR LINE'], # ditto
      'drive' => row['DRIVE SYS'] + 'WD'
    }
    # The transmission letter and speed count are read from fixed offsets
    # counted from the end of the 'TRANS' field.
    gearbox = row['TRANS']
    normalized['transmission'] = TRANSMISSIONS[gearbox[-3, 1]]
    speed_code = gearbox[-2, 1]
    normalized['speeds'] = if speed_code == 'V'
      'variable'
    else
      speed_code
    end
    normalized['turbo'] = row['TURBO'] == 'T'
    normalized['supercharger'] = row['SPCHGR'] == 'S'
    normalized['injection'] = true
    normalized['year'] = year
    row.merge!(normalized)
    row
  end
end
221
+ end
222
+
223
# Responder object handed to Errata (see the Errata.new call that follows
# this class). Every public method takes a row and answers one question
# about it.
class Guru
  # the following matching methods are needed by the errata
  # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments

  def transmission_is_blank?(row)
    row['transmission'].blank?
  end

  # Note: this one looks at the raw 'MFR' column, not at 'make'.
  def is_a_2007_gmc_or_chevrolet?(row)
    row['year'] == 2007 && %w(GMC CHEVROLET).include?(row['MFR'].upcase)
  end

  def is_a_porsche?(row)
    make_is?(row, 'PORSCHE')
  end

  def is_not_a_porsche?(row)
    !is_a_porsche?(row)
  end

  def is_a_mercedes_benz?(row)
    field_matches(row, 'make', /MERCEDES/i)
  end

  def is_a_lexus?(row)
    make_is?(row, 'LEXUS')
  end

  def is_a_bmw?(row)
    make_is?(row, 'BMW')
  end

  def is_a_ford?(row)
    make_is?(row, 'FORD')
  end

  def is_a_rolls_royce_and_model_contains_bentley?(row)
    is_a_rolls_royce?(row) && model_contains_bentley?(row)
  end

  def is_a_bentley?(row)
    make_is?(row, 'BENTLEY')
  end

  def is_a_rolls_royce?(row)
    field_matches(row, 'make', /ROLLS/i)
  end

  def is_a_turbo_brooklands?(row)
    field_matches(row, 'model', /TURBO R\/RL BKLDS/i)
  end

  def model_contains_maybach?(row)
    field_matches(row, 'model', /MAYBACH/i)
  end

  def model_contains_bentley?(row)
    field_matches(row, 'model', /BENTLEY/i)
  end

  private

  # True when the upcased 'make' equals +expected+ exactly.
  def make_is?(row, expected)
    row['make'].upcase == expected
  end

  # Result of =~ on the given field: match offset, or nil when no match.
  def field_matches(row, field, pattern)
    row[field] =~ pattern
  end
end
283
+
284
# One Errata instance, loaded from the errata CSV and answered by a Guru
# responder, is shared by every import step below.
errata = Errata.new :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv',
                    :responder => AutomobileVariant::Guru.new

# Declares one import step per model year (three file layouts, one parser
# class each), followed by a process step that fills in the adjusted fuel
# efficiency columns.
data_miner do
  # 1985---1997
  # Fixed-width files handled by ParserB; the 1996 archive uses a different file extension.
  (85..97).each do |yy|
    filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
    import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
           :filename => filename,
           :transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
           :errata => errata) do
      key 'row_hash'
      store 'make_name', :field_name => 'make'
      store 'model_name', :field_name => 'model'
      store 'year'
      store 'fuel_type_code', :field_name => 'fuel_type'
      store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
      store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
      store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
      store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
      store 'cylinders', :field_name => 'no_cyc'
      store 'drive', :field_name => 'drive_system'
      store 'carline_mfr_code'
      store 'vi_mfr_code'
      store 'carline_code'
      store 'carline_class_code', :field_name => 'carline_clss'
      store 'transmission'
      store 'speeds'
      store 'turbo'
      store 'supercharger'
      store 'injection'
      store 'displacement'
    end
  end

  # 1998--2005
  # Handled by ParserC. The hash is sorted by year so the steps are declared in ascending order.
  {
    1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
    1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
    2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
    2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
    2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
    2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
    2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
    2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
  }.sort { |a, b| a.first <=> b.first }.each do |year, options|
    import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
                         :errata => errata) do
      key 'row_hash'
      store 'make_name', :field_name => 'make'
      store 'model_name', :field_name => 'model'
      store 'fuel_type_code', :field_name => 'fl'
      store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
      store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
      store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
      store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
      store 'cylinders', :field_name => 'cyl'
      store 'displacement', :field_name => 'displ'
      # The class code column is only stored from 2000 onwards.
      store 'carline_class_code', :field_name => 'cls' if year >= 2000
      store 'carline_class_name', :field_name => 'Class'
      store 'year'
      store 'transmission'
      store 'speeds'
      store 'turbo'
      store 'supercharger'
      store 'injection'
      store 'drive'
    end
  end

  # 2006--2010
  # Handled by ParserD. The 2010 entry is commented out, so only 2006-2009 are imported.
  {
    2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
    2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
    2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
    2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
    # 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
  }.sort { |a, b| a.first <=> b.first }.each do |year, options|
    import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
                         :errata => errata) do
      key 'row_hash'
      store 'make_name', :field_name => 'make'
      store 'model_name', :field_name => 'model'
      store 'fuel_type_code', :field_name => 'FUEL TYPE'
      store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
      store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
      store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
      store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
      store 'cylinders', :field_name => 'NUMB CYL'
      store 'displacement', :field_name => 'DISPLACEMENT'
      store 'carline_class_code', :field_name => 'CLS'
      store 'carline_class_name', :field_name => 'CLASS'
      store 'year'
      store 'transmission'
      store 'speeds'
      store 'turbo'
      store 'supercharger'
      store 'injection'
      store 'drive'
    end
  end

  # associate :make, :key => :original_automobile_make_name, :foreign_key => :name
  # derive :automobile_model_id # creates models by name
  # associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code

  # Derives the adjusted columns from the raw ones with one SQL UPDATE each.
  # NOTE(review): both expressions divide by the raw value - confirm how the
  # database treats rows where it is zero or NULL.
  process 'Set adjusted fuel economy' do
    update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
    update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
  end
end
395
+
396
# Space-separated summary of whichever attributes are set, e.g. cylinder
# count, displacement, turbo, fuel injection, speeds, transmission and the
# fuel type name in parentheses.
def name
  [
    ("V#{cylinders}" if cylinders),
    ("#{displacement}L" if displacement),
    ("turbo" if turbo),
    ("FI" if injection),
    ("#{speeds}spd" if speeds.present?),
    (transmission if transmission.present?),
    ("(#{fuel_type.name})" if fuel_type)
  ].compact.join(' ')
end
407
+
408
# "city/highway" string: each efficiency is treated as kilometres per litre,
# converted to miles per gallon and rounded.
def fuel_economy_description
  mpg_figures = [ fuel_efficiency_city, fuel_efficiency_highway ].map do |efficiency|
    efficiency.kilometres_per_litre.to(:miles_per_gallon).round
  end
  mpg_figures.join('/')
end
411
+ end
412
+
413
# Test fixture model for countries, keyed by the +iso_3166+ column.
# Two import steps write the same key and 'name' columns, in this order.
class Country < ActiveRecord::Base
  set_primary_key :iso_3166

  data_miner do
    # Semicolon-delimited file without a header row, so columns are addressed by position.
    import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
      key 'iso_3166', :field_number => 1
      store 'name', :field_number => 0
    end

    # CSV with headers, so columns are addressed by name.
    import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
      key 'iso_3166', :field_name => 'country code'
      store 'name', :field_name => 'country'
    end
  end
end
428
+
429
# Test fixture model for airports, keyed by the +iata_code+ column.
class Airport < ActiveRecord::Base
  set_primary_key :iata_code

  data_miner do
    # Headerless file addressed by column position; only rows with a value in
    # column 4 (the one used as the key) are selected.
    import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
      key 'iata_code', :field_number => 4
      store 'name', :field_number => 1
      store 'city', :field_number => 2
      store 'country_name', :field_number => 3
      store 'latitude', :field_number => 6
      store 'longitude', :field_number => 7
    end
  end
end
443
+
444
# Test fixture model whose only data_miner step is a tap, keyed by +iata_code+.
class TappedAirport < ActiveRecord::Base
  set_primary_key :iata_code

  data_miner do
    # Arguments are a description, the source URL (credentials embedded) and
    # the name of the table on the source side.
    tap "Brighter Planet's sanitized airports table", "http://carbon:neutral@data.brighterplanet.com:5001", :source_table_name => 'airports'
    # tap "Brighter Planet's sanitized airports table", "http://carbon:neutral@localhost:5000", :source_table_name => 'airports'
  end
end
452
+
453
# Test fixture model for census regions, keyed by the +number+ column.
class CensusRegion < ActiveRecord::Base
  set_primary_key :number

  data_miner do
    # Selects rows with a positive 'Region' whose 'Division' is the placeholder 'X'.
    import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
      key 'number', :field_name => 'Region'
      store 'name', :field_name => 'Name'
    end

    # pretend this is a different data source
    # fake! just for testing purposes
    import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
      key 'number', :field_name => 'Region'
      store 'name', :field_name => 'Name'
    end
  end
end
470
+
471
+ # smaller than a region
472
# Test fixture model for census divisions, keyed by the +number+ column.
# The region name is resolved through a dictionary given as an options hash.
class CensusDivision < ActiveRecord::Base
  set_primary_key :number

  data_miner do
    # Selects rows whose 'Division' is not 'X' and whose 'FIPS CODE STATE' is 'X'.
    import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
      key 'number', :field_name => 'Division'
      store 'name', :field_name => 'Name'
      store 'census_region_number', :field_name => 'Region'
      # Same source field as above, mapped through the dictionary CSV (number -> name).
      store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
    end
  end
end
484
+
485
# Same import as CensusDivision, except that the dictionary is passed as a
# DataMiner::Dictionary instance instead of an options hash.
class CensusDivisionDeux < ActiveRecord::Base
  set_primary_key :number

  data_miner do
    # Selects rows whose 'Division' is not 'X' and whose 'FIPS CODE STATE' is 'X'.
    import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
      key 'number', :field_name => 'Division'
      store 'name', :field_name => 'Name'
      store 'census_region_number', :field_name => 'Region'
      store 'census_region_name', :field_name => 'Region', :dictionary => DataMiner::Dictionary.new(:input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv')
    end
  end
end
497
+
498
# Test fixture model whose table is derived from the crosscalling census
# divisions table rather than imported from a file.
class CrosscallingCensusRegion < ActiveRecord::Base
  set_primary_key :number

  has_many :crosscalling_census_divisions

  data_miner do
    process "derive ourselves from the census divisions table (i.e., cross call census divisions)" do
      # NOTE(review): CrosscallingCensusDivision's own data_miner calls back
      # into this class - presumably DataMiner guards against the resulting
      # recursion; confirm.
      CrosscallingCensusDivision.run_data_miner!
      # :force => true recreates the table on every run.
      connection.create_table :crosscalling_census_regions, :options => 'ENGINE=InnoDB default charset=utf8', :id => false, :force => true do |t|
        t.column :number, :integer
        t.column :name, :string
      end
      connection.execute 'ALTER TABLE crosscalling_census_regions ADD PRIMARY KEY (number);'
      # Copies the region number/name pairs out of the divisions table;
      # INSERT IGNORE skips rows that collide with the primary key.
      connection.execute %{
        INSERT IGNORE INTO crosscalling_census_regions(number, name)
        SELECT crosscalling_census_divisions.census_region_number, crosscalling_census_divisions.census_region_name FROM crosscalling_census_divisions
      }
    end
  end
end
518
+
519
# Test fixture model for census divisions that, after importing itself, runs
# the data miner of CrosscallingCensusRegion.
class CrosscallingCensusDivision < ActiveRecord::Base
  set_primary_key :number

  # NOTE(review): belongs_to is given a plural name here; confirm that the
  # association resolves to CrosscallingCensusRegion as intended.
  belongs_to :crosscalling_census_regions, :foreign_key => 'census_region_number'

  data_miner do
    # Selects rows whose 'Division' is not 'X' and whose 'FIPS CODE STATE' is 'X'.
    import "get a list of census divisions and their regions", :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
      key 'number', :field_name => 'Division'
      store 'name', :field_name => 'Name'
      store 'census_region_number', :field_name => 'Region'
      store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
    end

    process "make sure my parent object is set up (i.e., cross-call it)" do
      CrosscallingCensusRegion.run_data_miner!
    end
  end
end
537
+
538
+ class ResidentialEnergyConsumptionSurveyResponse < ActiveRecord::Base
539
+ set_primary_key :department_of_energy_identifier
540
+
541
+ data_miner do
542
+ process 'Define some unit conversions' do
543
+ Conversions.register :kbtus, :joules, 1_000.0 * 1_055.05585
544
+ Conversions.register :square_feet, :square_metres, 0.09290304
545
+ end
546
+
547
+ # conversions are NOT performed here, since we first have to zero out legitimate skips
548
+ # otherwise you will get values like "999 pounds = 453.138778 kilograms" (where 999 is really a legit skip)
549
+ import 'RECs 2005 (but not converting units to metric just yet)', :url => 'http://www.eia.doe.gov/emeu/recs/recspubuse05/datafiles/RECS05alldata.csv' do
550
+ key 'department_of_energy_identifier', :field_name => 'DOEID'
551
+
552
+ store 'residence_class', :field_name => 'TYPEHUQ', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/typehuq/typehuq.csv' }
553
+ store 'construction_year', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Date in the middle (synthetic)', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
554
+ store 'construction_period', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
555
+ store 'urbanity', :field_name => 'URBRUR', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/urbrur/urbrur.csv' }
556
+ store 'dishwasher_use', :field_name => 'DWASHUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dwashuse/dwashuse.csv' }
557
+ store 'central_ac_use', :field_name => 'USECENAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usecenac/usecenac.csv' }
558
+ store 'window_ac_use', :field_name => 'USEWWAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usewwac/usewwac.csv' }
559
+ store 'clothes_washer_use', :field_name => 'WASHLOAD', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/washload/washload.csv' }
560
+ store 'clothes_dryer_use', :field_name => 'DRYRUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dryruse/dryruse.csv' }
561
+
562
+ store 'census_division_number', :field_name => 'DIVISION'
563
+ store 'census_division_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
564
+ store 'census_region_number', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_number', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
565
+ store 'census_region_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
566
+
567
+ store 'floorspace', :field_name => 'TOTSQFT'
568
+ store 'residents', :field_name => 'NHSLDMEM'
569
+ store 'ownership', :field_name => 'KOWNRENT'
570
+ store 'thermostat_programmability', :field_name => 'PROTHERM'
571
+ store 'refrigerator_count', :field_name => 'NUMFRIG'
572
+ store 'freezer_count', :field_name => 'NUMFREEZ'
573
+ store 'heating_degree_days', :field_name => 'HD65'
574
+ store 'cooling_degree_days', :field_name => 'CD65'
575
+ store 'annual_energy_from_fuel_oil_for_heating_space', :field_name => 'BTUFOSPH'
576
+ store 'annual_energy_from_fuel_oil_for_heating_water', :field_name => 'BTUFOWTH'
577
+ store 'annual_energy_from_fuel_oil_for_appliances', :field_name => 'BTUFOAPL'
578
+ store 'annual_energy_from_natural_gas_for_heating_space', :field_name => 'BTUNGSPH'
579
+ store 'annual_energy_from_natural_gas_for_heating_water', :field_name => 'BTUNGWTH'
580
+ store 'annual_energy_from_natural_gas_for_appliances', :field_name => 'BTUNGAPL'
581
+ store 'annual_energy_from_propane_for_heating_space', :field_name => 'BTULPSPH'
582
+ store 'annual_energy_from_propane_for_heating_water', :field_name => 'BTULPWTH'
583
+ store 'annual_energy_from_propane_for_appliances', :field_name => 'BTULPAPL'
584
+ store 'annual_energy_from_wood', :field_name => 'BTUWOOD'
585
+ store 'annual_energy_from_kerosene', :field_name => 'BTUKER'
586
+ store 'annual_energy_from_electricity_for_clothes_driers', :field_name => 'BTUELCDR'
587
+ store 'annual_energy_from_electricity_for_dishwashers', :field_name => 'BTUELDWH'
588
+ store 'annual_energy_from_electricity_for_freezers', :field_name => 'BTUELFZZ'
589
+ store 'annual_energy_from_electricity_for_refrigerators', :field_name => 'BTUELRFG'
590
+ store 'annual_energy_from_electricity_for_air_conditioners', :field_name => 'BTUELCOL'
591
+ store 'annual_energy_from_electricity_for_heating_space', :field_name => 'BTUELSPH'
592
+ store 'annual_energy_from_electricity_for_heating_water', :field_name => 'BTUELWTH'
593
+ store 'annual_energy_from_electricity_for_other_appliances', :field_name => 'BTUELAPL'
594
+ store 'weighting', :field_name => 'NWEIGHT'
595
+ store 'total_rooms', :field_name => 'TOTROOMS'
596
+ store 'bathrooms', :field_name => 'NCOMBATH'
597
+ store 'halfbaths', :field_name => 'NHAFBATH'
598
+ store 'heated_garage', :field_name => 'GARGHEAT'
599
+ store 'attached_1car_garage', :field_name => 'GARAGE1C'
600
+ store 'detached_1car_garage', :field_name => 'DGARG1C'
601
+ store 'attached_2car_garage', :field_name => 'GARAGE2C'
602
+ store 'detached_2car_garage', :field_name => 'DGARG2C'
603
+ store 'attached_3car_garage', :field_name => 'GARAGE3C'
604
+ store 'detached_3car_garage', :field_name => 'DGARG3C'
605
+ store 'lights_on_1_to_4_hours', :field_name => 'LGT1'
606
+ store 'efficient_lights_on_1_to_4_hours', :field_name => 'LGT1EE'
607
+ store 'lights_on_4_to_12_hours', :field_name => 'LGT4'
608
+ store 'efficient_lights_on_4_to_12_hours', :field_name => 'LGT4EE'
609
+ store 'lights_on_over_12_hours', :field_name => 'LGT12'
610
+ store 'efficient_lights_on_over_12_hours', :field_name => 'LGT12EE'
611
+ store 'outdoor_all_night_lights', :field_name => 'NOUTLGTNT'
612
+ store 'outdoor_all_night_gas_lights', :field_name => 'NGASLIGHT'
613
+ end
614
+
615
+ # Rather than nullify the continuous variables that EIA identifies as LEGITIMATE SKIPS, we convert them to zero
616
+ # This makes it easier to derive useful information like "how many rooms does the house have?"
617
+ process 'Zero out what the EIA calls "LEGITIMATE SKIPS"' do
618
+ %w{
619
+ annual_energy_from_electricity_for_air_conditioners
620
+ annual_energy_from_electricity_for_clothes_driers
621
+ annual_energy_from_electricity_for_dishwashers
622
+ annual_energy_from_electricity_for_freezers
623
+ annual_energy_from_electricity_for_heating_space
624
+ annual_energy_from_electricity_for_heating_water
625
+ annual_energy_from_electricity_for_other_appliances
626
+ annual_energy_from_electricity_for_refrigerators
627
+ annual_energy_from_fuel_oil_for_appliances
628
+ annual_energy_from_fuel_oil_for_heating_space
629
+ annual_energy_from_fuel_oil_for_heating_water
630
+ annual_energy_from_kerosene
631
+ annual_energy_from_propane_for_appliances
632
+ annual_energy_from_propane_for_heating_space
633
+ annual_energy_from_propane_for_heating_water
634
+ annual_energy_from_natural_gas_for_appliances
635
+ annual_energy_from_natural_gas_for_heating_space
636
+ annual_energy_from_natural_gas_for_heating_water
637
+ annual_energy_from_wood
638
+ lights_on_1_to_4_hours
639
+ lights_on_over_12_hours
640
+ efficient_lights_on_over_12_hours
641
+ efficient_lights_on_1_to_4_hours
642
+ lights_on_4_to_12_hours
643
+ efficient_lights_on_4_to_12_hours
644
+ outdoor_all_night_gas_lights
645
+ outdoor_all_night_lights
646
+ thermostat_programmability
647
+ detached_1car_garage
648
+ detached_2car_garage
649
+ detached_3car_garage
650
+ attached_1car_garage
651
+ attached_2car_garage
652
+ attached_3car_garage
653
+ heated_garage
654
+ }.each do |attr_name|
655
+ max = maximum attr_name, :select => "CONVERT(#{attr_name}, UNSIGNED INTEGER)"
656
+ # if the maximum value of a row is all 999's, then it's a LEGITIMATE SKIP and we should set it to zero
657
+ if /^9+$/.match(max.to_i.to_s)
658
+ update_all "#{attr_name} = 0", "#{attr_name} = #{max}"
659
+ end
660
+ end
661
+ end
662
+
663
+ process 'Convert units to metric after zeroing out LEGITIMATE SKIPS' do
664
+ [
665
+ [ 'floorspace', :square_feet, :square_metres ],
666
+ [ 'annual_energy_from_fuel_oil_for_heating_space', :kbtus, :joules ],
667
+ [ 'annual_energy_from_fuel_oil_for_heating_water', :kbtus, :joules ],
668
+ [ 'annual_energy_from_fuel_oil_for_appliances', :kbtus, :joules ],
669
+ [ 'annual_energy_from_natural_gas_for_heating_space', :kbtus, :joules ],
670
+ [ 'annual_energy_from_natural_gas_for_heating_water', :kbtus, :joules ],
671
+ [ 'annual_energy_from_natural_gas_for_appliances', :kbtus, :joules ],
672
+ [ 'annual_energy_from_propane_for_heating_space', :kbtus, :joules ],
673
+ [ 'annual_energy_from_propane_for_heating_water', :kbtus, :joules ],
674
+ [ 'annual_energy_from_propane_for_appliances', :kbtus, :joules ],
675
+ [ 'annual_energy_from_wood', :kbtus, :joules ],
676
+ [ 'annual_energy_from_kerosene', :kbtus, :joules ],
677
+ [ 'annual_energy_from_electricity_for_clothes_driers', :kbtus, :joules ],
678
+ [ 'annual_energy_from_electricity_for_dishwashers', :kbtus, :joules ],
679
+ [ 'annual_energy_from_electricity_for_freezers', :kbtus, :joules ],
680
+ [ 'annual_energy_from_electricity_for_refrigerators', :kbtus, :joules ],
681
+ [ 'annual_energy_from_electricity_for_air_conditioners', :kbtus, :joules ],
682
+ [ 'annual_energy_from_electricity_for_heating_space', :kbtus, :joules ],
683
+ [ 'annual_energy_from_electricity_for_heating_water', :kbtus, :joules ],
684
+ [ 'annual_energy_from_electricity_for_other_appliances', :kbtus, :joules ],
685
+ ].each do |attr_name, from_units, to_units|
686
+ update_all "#{attr_name} = #{attr_name} * #{Conversions::Unit.exchange_rate from_units, to_units}"
687
+ end
688
+ end
689
+
690
+ process 'Add a new field "rooms" that estimates how many rooms are in the house' do
691
+ update_all 'rooms = total_rooms + bathrooms/2 + halfbaths/4 + heated_garage*(attached_1car_garage + detached_1car_garage + 2*(attached_2car_garage + detached_2car_garage) + 3*(attached_3car_garage + detached_3car_garage))'
692
+ end
693
+
694
+ process 'Add a new field "lighting_use" that estimates how many hours light bulbs are turned on in the house' do
695
+ update_all 'lighting_use = 2*(lights_on_1_to_4_hours + efficient_lights_on_1_to_4_hours) + 8*(lights_on_4_to_12_hours + efficient_lights_on_4_to_12_hours) + 16*(lights_on_over_12_hours + efficient_lights_on_over_12_hours) + 12*(outdoor_all_night_lights + outdoor_all_night_gas_lights)'
696
+ end
697
+
698
+ process 'Add a new field "lighting_efficiency" that estimates what percentage of light bulbs in a house are energy-efficient' do
699
+ update_all 'lighting_efficiency = (2*efficient_lights_on_1_to_4_hours + 8*efficient_lights_on_4_to_12_hours + 16*efficient_lights_on_over_12_hours) / lighting_use'
700
+ end
701
+ end
702
+ end
703
+
704
# T-100 Segment (All Carriers): http://www.transtats.bts.gov/Fields.asp?Table_ID=293
#
# Model keyed on row_hash. Its data_miner block registers one import per
# requested month (each posts FORM_DATA to URL) followed by three SQL-level
# derivations: freight_share, load_factor and seats_per_departure.
class T100FlightSegment < ActiveRecord::Base
  set_primary_key :row_hash

  # Address the form data is submitted to.
  URL = 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0'

  # Template for the form body. It is written one parameter per line for
  # readability; all whitespace is stripped below so the result is a single
  # line. The __YEAR__, __MONTH_NUMBER__ and __MONTH_NAME__ placeholders are
  # substituted per month inside the data_miner block.
  FORM_DATA = %{
    UserTableName=T_100_Segment__All_Carriers&
    DBShortName=Air_Carriers&
    RawDataTable=T_T100_SEGMENT_ALL_CARRIER&
    sqlstr=+SELECT+DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE+FROM++T_T100_SEGMENT_ALL_CARRIER+WHERE+Month+%3D__MONTH_NUMBER__+AND+YEAR%3D__YEAR__&
    varlist=DEPARTURES_SCHEDULED%2CDEPARTURES_PERFORMED%2CPAYLOAD%2CSEATS%2CPASSENGERS%2CFREIGHT%2CMAIL%2CDISTANCE%2CRAMP_TO_RAMP%2CAIR_TIME%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CUNIQUE_CARRIER_NAME%2CUNIQUE_CARRIER_ENTITY%2CREGION%2CCARRIER%2CCARRIER_NAME%2CCARRIER_GROUP%2CCARRIER_GROUP_NEW%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_CITY_NUM%2CORIGIN_STATE_ABR%2CORIGIN_STATE_FIPS%2CORIGIN_STATE_NM%2CORIGIN_COUNTRY%2CORIGIN_COUNTRY_NAME%2CORIGIN_WAC%2CDEST%2CDEST_CITY_NAME%2CDEST_CITY_NUM%2CDEST_STATE_ABR%2CDEST_STATE_FIPS%2CDEST_STATE_NM%2CDEST_COUNTRY%2CDEST_COUNTRY_NAME%2CDEST_WAC%2CAIRCRAFT_GROUP%2CAIRCRAFT_TYPE%2CAIRCRAFT_CONFIG%2CYEAR%2CQUARTER%2CMONTH%2CDISTANCE_GROUP%2CCLASS%2CDATA_SOURCE&
    grouplist=&
    suml=&
    sumRegion=&
    filter1=title%3D&
    filter2=title%3D&
    geo=All%A0&
    time=__MONTH_NAME__&
    timename=Month&
    GEOGRAPHY=All&
    XYEAR=__YEAR__&
    FREQUENCY=__MONTH_NUMBER__&
    AllVars=All&
    VarName=DEPARTURES_SCHEDULED&
    VarDesc=DepScheduled&
    VarType=Num&
    VarName=DEPARTURES_PERFORMED&
    VarDesc=DepPerformed&
    VarType=Num&
    VarName=PAYLOAD&
    VarDesc=Payload&
    VarType=Num&
    VarName=SEATS&
    VarDesc=Seats&
    VarType=Num&
    VarName=PASSENGERS&
    VarDesc=Passengers&
    VarType=Num&
    VarName=FREIGHT&
    VarDesc=Freight&
    VarType=Num&
    VarName=MAIL&
    VarDesc=Mail&
    VarType=Num&
    VarName=DISTANCE&
    VarDesc=Distance&
    VarType=Num&
    VarName=RAMP_TO_RAMP&
    VarDesc=RampToRamp&
    VarType=Num&
    VarName=AIR_TIME&
    VarDesc=AirTime&
    VarType=Num&
    VarName=UNIQUE_CARRIER&
    VarDesc=UniqueCarrier&
    VarType=Char&
    VarName=AIRLINE_ID&
    VarDesc=AirlineID&
    VarType=Num&
    VarName=UNIQUE_CARRIER_NAME&
    VarDesc=UniqueCarrierName&
    VarType=Char&
    VarName=UNIQUE_CARRIER_ENTITY&
    VarDesc=UniqCarrierEntity&
    VarType=Char&
    VarName=REGION&
    VarDesc=CarrierRegion&
    VarType=Char&
    VarName=CARRIER&
    VarDesc=Carrier&
    VarType=Char&
    VarName=CARRIER_NAME&
    VarDesc=CarrierName&
    VarType=Char&
    VarName=CARRIER_GROUP&
    VarDesc=CarrierGroup&
    VarType=Num&
    VarName=CARRIER_GROUP_NEW&
    VarDesc=CarrierGroupNew&
    VarType=Num&
    VarName=ORIGIN&
    VarDesc=Origin&
    VarType=Char&
    VarName=ORIGIN_CITY_NAME&
    VarDesc=OriginCityName&
    VarType=Char&
    VarName=ORIGIN_CITY_NUM&
    VarDesc=OriginCityNum&
    VarType=Num&
    VarName=ORIGIN_STATE_ABR&
    VarDesc=OriginState&
    VarType=Char&
    VarName=ORIGIN_STATE_FIPS&
    VarDesc=OriginStateFips&
    VarType=Char&
    VarName=ORIGIN_STATE_NM&
    VarDesc=OriginStateName&
    VarType=Char&
    VarName=ORIGIN_COUNTRY&
    VarDesc=OriginCountry&
    VarType=Char&
    VarName=ORIGIN_COUNTRY_NAME&
    VarDesc=OriginCountryName&
    VarType=Char&
    VarName=ORIGIN_WAC&
    VarDesc=OriginWac&
    VarType=Num&
    VarName=DEST&
    VarDesc=Dest&
    VarType=Char&
    VarName=DEST_CITY_NAME&
    VarDesc=DestCityName&
    VarType=Char&
    VarName=DEST_CITY_NUM&
    VarDesc=DestCityNum&
    VarType=Num&
    VarName=DEST_STATE_ABR&
    VarDesc=DestState&
    VarType=Char&
    VarName=DEST_STATE_FIPS&
    VarDesc=DestStateFips&
    VarType=Char&
    VarName=DEST_STATE_NM&
    VarDesc=DestStateName&
    VarType=Char&
    VarName=DEST_COUNTRY&
    VarDesc=DestCountry&
    VarType=Char&
    VarName=DEST_COUNTRY_NAME&
    VarDesc=DestCountryName&
    VarType=Char&
    VarName=DEST_WAC&
    VarDesc=DestWac&
    VarType=Num&
    VarName=AIRCRAFT_GROUP&
    VarDesc=AircraftGroup&
    VarType=Num&
    VarName=AIRCRAFT_TYPE&
    VarDesc=AircraftType&
    VarType=Char&
    VarName=AIRCRAFT_CONFIG&
    VarDesc=AircraftConfig&
    VarType=Num&
    VarName=YEAR&
    VarDesc=Year&
    VarType=Num&
    VarName=QUARTER&
    VarDesc=Quarter&
    VarType=Num&
    VarName=MONTH&
    VarDesc=Month&
    VarType=Num&
    VarName=DISTANCE_GROUP&
    VarDesc=DistanceGroup&
    VarType=Num&
    VarName=CLASS&
    VarDesc=Class&
    VarType=Char&
    VarName=DATA_SOURCE&
    VarDesc=DataSource&
    VarType=Char
  }.gsub(/\s+/, '')

  data_miner do
    # Maps the first instant of each requested month (UTC) to the form body
    # with that month's placeholders filled in.
    months = {}
    # (2008..2009).each do |year|
    (2008..2008).each do |year|
      # (1..12).each do |month|
      (1..1).each do |month|
        time = Time.gm year, month
        # String#gsub returns a new string, so the FORM_DATA template itself
        # is never modified.
        months[time] = FORM_DATA.
          gsub('__YEAR__', time.year.to_s).
          gsub('__MONTH_NUMBER__', time.month.to_s).
          gsub('__MONTH_NAME__', time.strftime('%B'))
      end
    end

    months.each do |month, form_data|
      import "T100 data from #{month.strftime('%B %Y')}",
             :url => URL,
             :form_data => form_data,
             :compression => :zip,
             :glob => '/*.csv' do
        key 'row_hash'
        store 'departures_scheduled', :field_name => 'DEPARTURES_SCHEDULED'
        store 'departures_performed', :field_name => 'DEPARTURES_PERFORMED'
        store 'payload', :field_name => 'PAYLOAD', :from_units => :pounds, :to_units => :kilograms
        store 'seats', :field_name => 'SEATS'
        store 'passengers', :field_name => 'PASSENGERS'
        store 'freight', :field_name => 'FREIGHT', :from_units => :pounds, :to_units => :kilograms
        store 'mail', :field_name => 'MAIL', :from_units => :pounds, :to_units => :kilograms
        store 'distance', :field_name => 'DISTANCE', :from_units => :miles, :to_units => :kilometres
        store 'ramp_to_ramp', :field_name => 'RAMP_TO_RAMP'
        store 'air_time', :field_name => 'AIR_TIME'
        store 'unique_carrier', :field_name => 'UNIQUE_CARRIER'
        store 'dot_airline_id', :field_name => 'AIRLINE_ID'
        store 'unique_carrier_name', :field_name => 'UNIQUE_CARRIER_NAME'
        store 'unique_carrier_entity', :field_name => 'UNIQUE_CARRIER_ENTITY'
        store 'region', :field_name => 'REGION'
        store 'carrier', :field_name => 'CARRIER'
        store 'carrier_name', :field_name => 'CARRIER_NAME'
        store 'carrier_group', :field_name => 'CARRIER_GROUP'
        store 'carrier_group_new', :field_name => 'CARRIER_GROUP_NEW'
        store 'origin_airport_iata', :field_name => 'ORIGIN'
        store 'origin_city_name', :field_name => 'ORIGIN_CITY_NAME'
        store 'origin_city_num', :field_name => 'ORIGIN_CITY_NUM'
        store 'origin_state_abr', :field_name => 'ORIGIN_STATE_ABR'
        store 'origin_state_fips', :field_name => 'ORIGIN_STATE_FIPS'
        store 'origin_state_nm', :field_name => 'ORIGIN_STATE_NM'
        store 'origin_country_iso_3166', :field_name => 'ORIGIN_COUNTRY'
        store 'origin_country_name', :field_name => 'ORIGIN_COUNTRY_NAME'
        store 'origin_wac', :field_name => 'ORIGIN_WAC'
        store 'dest_airport_iata', :field_name => 'DEST'
        store 'dest_city_name', :field_name => 'DEST_CITY_NAME'
        store 'dest_city_num', :field_name => 'DEST_CITY_NUM'
        store 'dest_state_abr', :field_name => 'DEST_STATE_ABR'
        store 'dest_state_fips', :field_name => 'DEST_STATE_FIPS'
        store 'dest_state_nm', :field_name => 'DEST_STATE_NM'
        store 'dest_country_iso_3166', :field_name => 'DEST_COUNTRY'
        store 'dest_country_name', :field_name => 'DEST_COUNTRY_NAME'
        store 'dest_wac', :field_name => 'DEST_WAC'
        store 'bts_aircraft_group', :field_name => 'AIRCRAFT_GROUP'
        store 'bts_aircraft_type', :field_name => 'AIRCRAFT_TYPE'
        store 'bts_aircraft_config', :field_name => 'AIRCRAFT_CONFIG'
        store 'year', :field_name => 'YEAR'
        store 'quarter', :field_name => 'QUARTER'
        store 'month', :field_name => 'MONTH'
        store 'bts_distance_group', :field_name => 'DISTANCE_GROUP'
        store 'bts_service_class', :field_name => 'CLASS'
        store 'data_source', :field_name => 'DATA_SOURCE'
      end
    end

    process 'Derive freight share as a fraction of payload' do
      update_all 'freight_share = (freight + mail) / payload', 'payload > 0'
    end

    process 'Derive load factor, which is passengers divided by the total seats available' do
      # "passengers <= seats" alone also matches rows where seats is 0, which
      # would divide by zero; guard the divisor the same way the other two
      # derivations do.
      update_all 'load_factor = passengers / seats', 'seats > 0 AND passengers <= seats'
    end

    process 'Derive average seats per departure' do
      update_all 'seats_per_departure = seats / departures_performed', 'departures_performed > 0'
    end
  end
end
949
+
950
+ require 'loose_tight_dictionary'
951
# Aircraft keyed on ICAO code. Rows are imported from FAA designator pages;
# the BTS name and aircraft type code are filled in through matchers backed
# by a LooseTightDictionary, and a second import attaches Brighter Planet
# aircraft class codes keyed on the BTS aircraft type code.
class Aircraft < ActiveRecord::Base
  set_primary_key :icao_code

  # Memoized LooseTightDictionary built from remote tables.
  #
  # The left reader joins 'Manufacturer' and 'Model'; the right reader joins
  # 'Manufacturer' and 'Long Name'. Right-hand records are limited to those
  # whose 'Aircraft Type' is between 1 and 998 and whose 'Manufacturer' is
  # present.
  def self.bts_dictionary
    @_dictionary ||= begin
      bts_records = RemoteTable.new(
        :url => 'http://www.bts.gov/programs/airline_information/accounting_and_reporting_directives/csv/number_260.csv',
        :select => lambda { |record| record['Aircraft Type'].to_i.between?(1, 998) && record['Manufacturer'].present? }
      )
      LooseTightDictionary.new(
        bts_records,
        :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
        :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
        :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
        :left_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Model'] },
        :right_reader => lambda { |record| record['Manufacturer'] + ' ' + record['Long Name'] }
      )
    end
  end

  # Matcher returning the 'Aircraft Type' of the dictionary record matched to
  # the given left-hand record, or nil when nothing matches.
  class BtsAircraftTypeCodeMatcher
    def match(left_record)
      right_record = Aircraft.bts_dictionary.left_to_right left_record
      right_record['Aircraft Type'] if right_record
    end
  end

  # Matcher returning "<Manufacturer> <Long Name>" of the dictionary record
  # matched to the given left-hand record, or nil when nothing matches.
  class BtsNameMatcher
    def match(left_record)
      right_record = Aircraft.bts_dictionary.left_to_right left_record
      right_record['Manufacturer'] + ' ' + right_record['Long Name'] if right_record
    end
  end

  # Row predicates for the errata; an instance is passed to Errata.new as
  # :responder in the import below. Each takes a row indexed by column name.
  class Guru
    # for errata
    def is_attributed_to_boeing?(row)
      row['Manufacturer'] =~ /BOEING/i
    end

    def is_attributed_to_cessna?(row)
      row['Manufacturer'] =~ /CESSNA/i
    end

    def is_attributed_to_fokker?(row)
      row['Manufacturer'] =~ /FOKKER/i
    end

    def is_not_attributed_to_aerospatiale?(row)
      row['Manufacturer'] !~ /AEROSPATIALE/i
    end

    def is_not_attributed_to_cessna?(row)
      row['Manufacturer'] !~ /CESSNA/i
    end

    def is_not_attributed_to_learjet?(row)
      row['Manufacturer'] !~ /LEAR/i
    end

    def is_not_attributed_to_dehavilland?(row)
      row['Manufacturer'] !~ /DE ?HAVILLAND/i
    end

    def is_not_attributed_to_mcdonnell_douglas?(row)
      row['Manufacturer'] !~ /MCDONNELL DOUGLAS/i
    end

    def is_not_a_dc_plane?(row)
      row['Model'] !~ /DC/i
    end

    # to_s keeps a row with no 'Designator' from raising NoMethodError; such
    # a row is simply not a CRJ-900.
    def is_a_crj_900?(row)
      row['Designator'].to_s.downcase == 'crj9'
    end
  end

  data_miner do
    # ('A'..'Z').each do |letter|
    # Note: for the purposes of testing, only importing "D"
    %w{ D }.each do |letter|
      import("ICAO codes starting with letter #{letter} used by the FAA",
             :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
             :encoding => 'US-ASCII',
             :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
                                   :responder => Aircraft::Guru.new),
             :row_xpath => '//table/tr[2]/td/table/tr',
             :column_xpath => 'td') do
        key 'icao_code', :field_name => 'Designator'
        store 'bts_name', :matcher => Aircraft::BtsNameMatcher.new
        store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher.new
        store 'manufacturer_name', :field_name => 'Manufacturer'
        store 'name', :field_name => 'Model'
      end
    end

    # Registered once, after the per-letter imports, because it does not
    # depend on the letter; inside the loop it would be repeated for every
    # letter once the full range is restored.
    import 'Brighter Planet aircraft class codes',
           :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
      key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
      store 'brighter_planet_aircraft_class_code'
    end
  end
end
1046
+
1047
# note that this depends on stuff in Aircraft
#
# Variant of Aircraft whose errata is given as a bare URL. It reuses
# Aircraft::BtsNameMatcher and Aircraft::BtsAircraftTypeCodeMatcher and reads
# the FAA page as windows-1252.
class AircraftDeux < ActiveRecord::Base
  set_primary_key :icao_code

  # defined on the class because we defined the errata with a shorthand
  class << self
    def is_not_attributed_to_aerospatiale?(row)
      row['Manufacturer'] !~ /AEROSPATIALE/i
    end

    def is_not_attributed_to_cessna?(row)
      row['Manufacturer'] !~ /CESSNA/i
    end

    def is_not_attributed_to_learjet?(row)
      row['Manufacturer'] !~ /LEAR/i
    end

    def is_not_attributed_to_dehavilland?(row)
      row['Manufacturer'] !~ /DE ?HAVILLAND/i
    end

    def is_not_attributed_to_mcdonnell_douglas?(row)
      row['Manufacturer'] !~ /MCDONNELL DOUGLAS/i
    end

    def is_not_a_dc_plane?(row)
      row['Model'] !~ /DC/i
    end

    # to_s keeps a row with no 'Designator' from raising NoMethodError; such
    # a row is simply not a CRJ-900.
    def is_a_crj_900?(row)
      row['Designator'].to_s.downcase == 'crj9'
    end
  end

  data_miner do
    # ('A'..'Z').each do |letter|
    # Note: for the purposes of testing, only importing "D"
    %w{ D }.each do |letter|
      import("ICAO codes starting with letter #{letter} used by the FAA",
             :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
             :encoding => 'windows-1252',
             :errata => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
             :row_xpath => '//table/tr[2]/td/table/tr',
             :column_xpath => 'td') do
        key 'icao_code', :field_name => 'Designator'
        store 'bts_name', :matcher => Aircraft::BtsNameMatcher.new
        store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher.new
        store 'manufacturer_name', :field_name => 'Manufacturer'
        store 'name', :field_name => 'Model'
      end
    end
  end
end
1101
+
1102
# Make/fleet/year records keyed on a synthesized name. The data_miner block
# declares the schema, two steps that raise DataMiner::Finish or
# DataMiner::Skip when the matching global flag is set, and one CSV import.
class AutomobileMakeFleetYear < ActiveRecord::Base
  set_primary_key :name

  data_miner do
    schema :id => false do
      string   'name'
      string   'make_name'
      string   'fleet'
      integer  'year'
      float    'fuel_efficiency'
      string   'fuel_efficiency_units'
      integer  'volume'
      string   'make_year_name'
      datetime 'created_at'
      datetime 'updated_at'
    end

    # Raises only while $force_finish is set (the tests toggle it).
    process 'finish if i tell you to' do
      if $force_finish
        raise DataMiner::Finish
      end
    end

    # Raises only while $force_skip is set (the tests toggle it).
    process 'skip if i tell you to' do
      if $force_skip
        raise DataMiner::Skip
      end
    end

    # Keeps only rows whose volume parses to a positive integer.
    has_volume = lambda { |row| row['volume'].to_i > 0 }

    # Builds the key: manufacturer name, characters 2-3 of the fleet field
    # and the year, separated by single spaces.
    synthesized_name = lambda do |row|
      parts = [ row['manufacturer_name'], row['fleet'][2,2], row['year_content'] ]
      parts.join ' '
    end

    # CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
    import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
           :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
           :select => has_volume do
      key 'name', :synthesize => synthesized_name
      store 'make_name', :field_name => 'manufacturer_name'
      store 'year', :field_name => 'year_content'
      store 'fleet', :chars => 2..3 # zero-based
      store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
      store 'volume'
    end
  end
end
1140
+
1141
# Census division keyed on the string column number_code. It declares only a
# schema: three string columns, one integer column, a named index and a
# composite index.
class CensusDivisionTrois < ActiveRecord::Base
  set_primary_key :number_code

  data_miner do
    schema :options => 'ENGINE=InnoDB default charset=utf8' do
      %w{ number_code name census_region_name }.each do |column_name|
        string column_name
      end
      integer 'census_region_number'

      index 'census_region_name', :name => 'homefry'
      # The composite index also names updated_at and created_at, which are
      # not declared above (presumably the schema supplies them; confirm).
      index %w{ number_code name census_region_name census_region_number updated_at created_at }
    end
  end
end
1154
+
1155
# Census division that sets no custom primary key. It declares only a
# schema: three string columns, one integer column and a named index.
class CensusDivisionFour < ActiveRecord::Base
  data_miner do
    schema do
      %w{ number_code name census_region_name }.each do |column_name|
        string column_name
      end
      integer 'census_region_number'

      index 'census_region_name', :name => 'homefry'
    end
  end
end
1166
+
1167
+ # todo: have somebody properly organize these
1168
+ class DataMinerTest < Test::Unit::TestCase
1169
+ if ENV['ALL'] == 'true' or ENV['NEW'] == 'true'
1170
+ should 'directly create a table for the model' do
1171
+ if AutomobileMakeFleetYear.table_exists?
1172
+ ActiveRecord::Base.connection.execute 'DROP TABLE automobile_make_fleet_years;'
1173
+ end
1174
+ AutomobileMakeFleetYear.execute_schema
1175
+ assert AutomobileMakeFleetYear.table_exists?
1176
+ end
1177
+ end
1178
+
1179
+ if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
1180
+ should 'override an existing data_miner configuration' do
1181
+ AutomobileFuelType.class_eval do
1182
+ data_miner do
1183
+ import 'example', :url => 'http://example.com' do
1184
+ key 'code'
1185
+ store 'name'
1186
+ end
1187
+ end
1188
+ end
1189
+ assert_kind_of DataMiner::Import, AutomobileFuelType.data_miner_base.steps.first
1190
+ assert_equal 'http://example.com', AutomobileFuelType.data_miner_base.steps.first.table.package.url
1191
+ assert_equal 1, AutomobileFuelType.data_miner_base.step_counter
1192
+ end
1193
+ should "stop and finish if it gets a DataMiner::Finish" do
1194
+ AutomobileMakeFleetYear.delete_all
1195
+ AutomobileMakeFleetYear.data_miner_runs.delete_all
1196
+ $force_finish = true
1197
+ AutomobileMakeFleetYear.run_data_miner!
1198
+ assert_equal 0, AutomobileMakeFleetYear.count
1199
+ assert_equal true, (AutomobileMakeFleetYear.data_miner_runs.count > 0)
1200
+ assert_equal true, AutomobileMakeFleetYear.data_miner_runs.all? { |run| run.finished? and not run.skipped and not run.killed? }
1201
+ $force_finish = false
1202
+ AutomobileMakeFleetYear.run_data_miner!
1203
+ assert AutomobileMakeFleetYear.exists?(:name => 'Alfa Romeo IP 1978')
1204
+ end
1205
+
1206
+ should "stop and register skipped if it gets a DataMiner::Skip" do
1207
+ AutomobileMakeFleetYear.delete_all
1208
+ AutomobileMakeFleetYear.data_miner_runs.delete_all
1209
+ $force_skip = true
1210
+ AutomobileMakeFleetYear.run_data_miner!
1211
+ assert_equal 0, AutomobileMakeFleetYear.count
1212
+ assert_equal true, (AutomobileMakeFleetYear.data_miner_runs.count > 0)
1213
+ assert_equal true, AutomobileMakeFleetYear.data_miner_runs.all? { |run| run.skipped? and not run.finished? and not run.killed? }
1214
+ $force_skip = false
1215
+ AutomobileMakeFleetYear.run_data_miner!
1216
+ assert AutomobileMakeFleetYear.exists?(:name => 'Alfa Romeo IP 1978')
1217
+ end
1218
+
1219
+ should "eagerly enforce a schema" do
1220
+ ActiveRecord::Base.connection.create_table 'census_division_trois', :force => true, :options => 'ENGINE=InnoDB default charset=utf8' do |t|
1221
+ t.string 'name'
1222
+ # t.datetime 'updated_at'
1223
+ # t.datetime 'created_at'
1224
+ t.string 'census_region_name'
1225
+ # t.integer 'census_region_number'
1226
+ end
1227
+ ActiveRecord::Base.connection.execute 'ALTER TABLE census_division_trois ADD INDEX (census_region_name)'
1228
+ CensusDivisionTrois.reset_column_information
1229
+ missing_columns = %w{ updated_at created_at census_region_number }
1230
+
1231
+ # sanity check
1232
+ missing_columns.each do |column|
1233
+ assert_equal false, CensusDivisionTrois.column_names.include?(column)
1234
+ end
1235
+ assert_equal false, ActiveRecord::Base.connection.indexes(CensusDivisionTrois.table_name).any? { |index| index.name == 'homefry' }
1236
+
1237
+ 3.times do
1238
+ CensusDivisionTrois.run_data_miner!
1239
+ missing_columns.each do |column|
1240
+ assert_equal true, CensusDivisionTrois.column_names.include?(column)
1241
+ end
1242
+ assert_equal true, ActiveRecord::Base.connection.indexes(CensusDivisionTrois.table_name).any? { |index| index.name == 'homefry' }
1243
+ assert_equal :string, CensusDivisionTrois.columns_hash[CensusDivisionTrois.primary_key].type
1244
+ end
1245
+ end
1246
+
1247
+ should "let schemas work with default id primary keys" do
1248
+ ActiveRecord::Base.connection.create_table 'census_division_fours', :force => true, :options => 'ENGINE=InnoDB default charset=utf8' do |t|
1249
+ t.string 'name'
1250
+ # t.datetime 'updated_at'
1251
+ # t.datetime 'created_at'
1252
+ t.string 'census_region_name'
1253
+ # t.integer 'census_region_number'
1254
+ end
1255
+ ActiveRecord::Base.connection.execute 'ALTER TABLE census_division_fours ADD INDEX (census_region_name)'
1256
+ CensusDivisionFour.reset_column_information
1257
+ missing_columns = %w{ updated_at created_at census_region_number }
1258
+
1259
+ # sanity check
1260
+ missing_columns.each do |column|
1261
+ assert_equal false, CensusDivisionFour.column_names.include?(column)
1262
+ end
1263
+ assert_equal false, ActiveRecord::Base.connection.indexes(CensusDivisionFour.table_name).any? { |index| index.name == 'homefry' }
1264
+
1265
+ 3.times do
1266
+ CensusDivisionFour.run_data_miner!
1267
+ missing_columns.each do |column|
1268
+ assert_equal true, CensusDivisionFour.column_names.include?(column)
1269
+ end
1270
+ assert_equal true, ActiveRecord::Base.connection.indexes(CensusDivisionFour.table_name).any? { |index| index.name == 'homefry' }
1271
+ assert_equal :integer, CensusDivisionFour.columns_hash[CensusDivisionFour.primary_key].type
1272
+ end
1273
+ end
1274
+
1275
+ should "allow specifying dictionaries explicitly" do
1276
+ CensusDivisionDeux.run_data_miner!
1277
+ assert_equal 'South Region', CensusDivisionDeux.find(5).census_region_name
1278
+ end
1279
+
1280
+ should "be able to key on things other than the primary key" do
1281
+ Aircraft.run_data_miner!
1282
+ assert_equal 'SP', Aircraft.find('DHC6').brighter_planet_aircraft_class_code
1283
+ end
1284
+
1285
+ should "be able to synthesize rows without using a full parser class" do
1286
+ AutomobileMakeFleetYear.run_data_miner!
1287
+ assert AutomobileMakeFleetYear.exists?(:name => 'Alfa Romeo IP 1978')
1288
+ end
1289
+
1290
# Starts the run from CrosscallingCensusDivision and expects both the
# division row and the region row to exist afterwards.
should "keep a call stack so that you can call run_data_miner! on a child" do
  CrosscallingCensusDivision.run_data_miner!

  mountain_division = {
    :name => 'Mountain Division',
    :number => 8,
    :census_region_number => 4,
    :census_region_name => 'West Region'
  }
  west_region = { :name => 'West Region', :number => 4 }

  assert CrosscallingCensusDivision.exists?(mountain_division)
  assert CrosscallingCensusRegion.exists?(west_region)
end
1295
+
1296
# Starts the run from CrosscallingCensusRegion and expects both the
# division row and the region row to exist afterwards.
should "keep a call stack so that you can call run_data_miner! on a parent" do
  CrosscallingCensusRegion.run_data_miner!

  mountain_division = {
    :name => 'Mountain Division',
    :number => 8,
    :census_region_number => 4,
    :census_region_name => 'West Region'
  }
  west_region = { :name => 'West Region', :number => 4 }

  assert CrosscallingCensusDivision.exists?(mountain_division)
  assert CrosscallingCensusRegion.exists?(west_region)
end
1301
+
1302
# Mining Airport must leave at least one row in the table.
should "import airports" do
  Airport.run_data_miner!
  airport_total = Airport.count
  assert airport_total > 0
end
1306
+
1307
# Mining TappedAirport must leave at least one row in the table.
should "tap airports" do
  TappedAirport.run_data_miner!
  tapped_total = TappedAirport.count
  assert tapped_total > 0
end
1311
+
1312
# Mining CensusDivision must leave at least one row in the table.
should "pull in census divisions using a data.brighterplanet.com dictionary" do
  CensusDivision.run_data_miner!
  division_total = CensusDivision.count
  assert division_total > 0
end
1316
+
1317
# Only verifies that the resource class responds to run_data_miner!;
# nothing is actually enqueued here.
# NOTE(review): presumably this is the method one would hand to
# delayed_job's send_later -- confirm against the library's usage docs.
should "have a way to queue up runs that works with delayed_job's send_later" do
  assert AutomobileVariant.respond_to?(:run_data_miner!)
end
1320
+
1321
# For each resource, runs its data miner base twice in a row and requires
# the row count to be the same after the second run as after the first.
should "be idempotent" do
  [Country, CensusRegion].each do |resource|
    resource.data_miner_base.run
    count_after_first_run = resource.count
    resource.data_miner_base.run
    count_after_second_run = resource.count
    assert_equal count_after_first_run, count_after_second_run
  end
end
1334
+
1335
# Runs only the first data miner step (with no run object) and expects the
# first AutomobileVariant row to have a non-blank row_hash.
should "hash things" do
  first_step = AutomobileVariant.data_miner_base.steps[0]
  first_step.run(nil)
  first_variant = AutomobileVariant.first
  assert first_variant.row_hash.present?
end
1339
+
1340
# Starting from an empty table, fuel_efficiency_city must be blank after the
# first step alone and present once the last step has also run.
should "process a callback block instead of a method" do
  AutomobileVariant.delete_all

  AutomobileVariant.data_miner_base.steps[0].run(nil)
  # blank? is the negation of present?, so this matches `assert !...present?`
  assert AutomobileVariant.first.fuel_efficiency_city.blank?

  AutomobileVariant.data_miner_base.steps.last.run(nil)
  assert AutomobileVariant.first.fuel_efficiency_city.present?
end
1347
+
1348
# Brackets a Country run with wall-clock timestamps and checks that the most
# recent DataMiner::Run record for 'Country' started and terminated within
# five seconds of those timestamps.
should "keep a log when it does a run" do
  approx_started_at = Time.now
  DataMiner.run :resource_names => %w{ Country }
  approx_terminated_at = Time.now

  # Highest id first, i.e. the most recently created run record.
  last_run = DataMiner::Run.first(:conditions => { :resource_name => 'Country' }, :order => 'id DESC')

  start_skew = (last_run.started_at - approx_started_at).abs
  assert(start_skew < 5) # seconds

  finish_skew = (last_run.terminated_at - approx_terminated_at).abs
  assert(finish_skew < 5) # seconds
end
1356
+
1357
# Saves a Country with the bogus iso_3166 code 'JUNK', then runs the miner
# with :from_scratch => true and expects that row to be gone afterwards.
should "request a re-import from scratch" do
  junk_country = Country.new
  junk_country.iso_3166 = 'JUNK'
  junk_country.save!
  assert Country.exists?(:iso_3166 => 'JUNK')

  DataMiner.run :resource_names => %w{ Country }, :from_scratch => true

  assert !Country.exists?(:iso_3166 => 'JUNK')
end
1365
+
1366
# After two Country runs, Country.data_miner_runs must report at least one
# run.
should "know what runs were on a resource" do
  2.times { DataMiner.run :resource_names => %w{ Country } }
  recorded_runs = Country.data_miner_runs.count
  assert recorded_runs > 0
end
1371
+ end
1372
+
1373
# These tests are only defined when the ALL or SLOW environment variable is
# set to the string 'true'.
if ENV['ALL'] == 'true' || ENV['SLOW'] == 'true'
  # Mines AircraftDeux and expects the DC91 / 630 row to exist.
  should "allow errata to be specified with a shorthand, assuming the responder is the resource class itself" do
    AircraftDeux.run_data_miner!
    assert AircraftDeux.exists?(:icao_code => 'DC91', :bts_aircraft_type_code => '630')
  end

  # Mines Aircraft and expects the DC91 / 630 row to exist.
  should "mine aircraft" do
    Aircraft.run_data_miner!
    assert Aircraft.exists?(:icao_code => 'DC91', :bts_aircraft_type_code => '630')
  end

  # Mines AutomobileVariant and expects at least one make ending in "tesla".
  should "mine automobile variants" do
    AutomobileVariant.run_data_miner!
    # The filter must go through :conditions. A bare string argument to
    # count is used as the column expression, so COUNT(...) would tally
    # every non-NULL row instead of only the matching ones.
    assert AutomobileVariant.count(:conditions => ['make_name LIKE ?', '%tesla']) > 0
  end

  # Mines T100FlightSegment and expects at least one segment whose
  # destination country name ends in "United States".
  should "mine T100 flight segments" do
    T100FlightSegment.run_data_miner!
    # Same reasoning as above: filter via :conditions, not the column slot.
    assert T100FlightSegment.count(:conditions => ['dest_country_name LIKE ?', '%United States']) > 0
  end

  # Mines the survey responses and checks the residence_class of record 6.
  should "mine residence survey responses" do
    ResidentialEnergyConsumptionSurveyResponse.run_data_miner!
    response = ResidentialEnergyConsumptionSurveyResponse.find(6)
    assert response.residence_class.start_with?('Single-family detached house')
  end
end
1399
+ end