data_miner 1.3.8 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +42 -0
- data/Gemfile +19 -3
- data/README.rdoc +3 -3
- data/Rakefile +13 -15
- data/data_miner.gemspec +4 -15
- data/lib/data_miner.rb +69 -70
- data/lib/data_miner/active_record_extensions.rb +17 -22
- data/lib/data_miner/attribute.rb +176 -179
- data/lib/data_miner/dictionary.rb +38 -31
- data/lib/data_miner/run.rb +49 -18
- data/lib/data_miner/script.rb +116 -0
- data/lib/data_miner/step.rb +5 -0
- data/lib/data_miner/step/import.rb +74 -0
- data/lib/data_miner/step/process.rb +34 -0
- data/lib/data_miner/step/tap.rb +134 -0
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +26 -24
- data/test/support/breeds.xls +0 -0
- data/test/support/pet_color_dictionary.en.csv +5 -0
- data/test/support/pet_color_dictionary.es.csv +5 -0
- data/test/support/pets.csv +5 -0
- data/test/support/pets_funny.csv +4 -0
- data/test/test_data_miner.rb +103 -0
- data/test/test_earth_import.rb +25 -0
- data/test/test_earth_tap.rb +25 -0
- data/test/test_safety.rb +43 -0
- metadata +72 -78
- data/.document +0 -5
- data/lib/data_miner/config.rb +0 -124
- data/lib/data_miner/import.rb +0 -93
- data/lib/data_miner/process.rb +0 -38
- data/lib/data_miner/tap.rb +0 -143
- data/test/support/aircraft.rb +0 -102
- data/test/support/airport.rb +0 -16
- data/test/support/automobile_fuel_type.rb +0 -40
- data/test/support/automobile_variant.rb +0 -362
- data/test/support/country.rb +0 -15
- data/test/support/test_database.rb +0 -311
- data/test/test_data_miner_attribute.rb +0 -111
- data/test/test_data_miner_process.rb +0 -18
- data/test/test_old_syntax.rb +0 -825
- data/test/test_tap.rb +0 -21
data/test/support/airport.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
class Airport < ActiveRecord::Base
|
2
|
-
set_primary_key :iata_code
|
3
|
-
|
4
|
-
data_miner do
|
5
|
-
import :url => 'https://openflights.svn.sourceforge.net/svnroot/openflights/openflights/data/airports.dat',
|
6
|
-
:headers => false,
|
7
|
-
:select => lambda { |row| row[4].present? } do
|
8
|
-
key 'iata_code', :field_number => 4
|
9
|
-
store 'name', :field_number => 1
|
10
|
-
store 'city', :field_number => 2
|
11
|
-
store 'country_name', :field_number => 3
|
12
|
-
store 'latitude', :field_number => 6, :nullify => true
|
13
|
-
store 'longitude', :field_number => 7, :nullify => true
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
class AutomobileFuelType < ActiveRecord::Base
|
2
|
-
set_primary_key :code
|
3
|
-
|
4
|
-
data_miner do
|
5
|
-
import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
6
|
-
:filename => 'Gd6-dsc.txt',
|
7
|
-
:format => :fixed_width,
|
8
|
-
:crop => 21..26, # inclusive
|
9
|
-
:cut => '2-',
|
10
|
-
:select => lambda { |row| /\A[A-Z]/.match row[:code] },
|
11
|
-
:schema => [[ 'code', 2, { :type => :string } ],
|
12
|
-
[ 'spacer', 2 ],
|
13
|
-
[ 'name', 52, { :type => :string } ]]) do
|
14
|
-
key 'code'
|
15
|
-
store 'name'
|
16
|
-
end
|
17
|
-
|
18
|
-
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do
|
19
|
-
key 'code'
|
20
|
-
store 'name'
|
21
|
-
store 'annual_distance'
|
22
|
-
store 'emission_factor'
|
23
|
-
end
|
24
|
-
|
25
|
-
# pull electricity emission factor from residential electricity
|
26
|
-
import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
|
27
|
-
:select => lambda { |row| row['code'] == 'El' }) do
|
28
|
-
key 'code'
|
29
|
-
store 'name'
|
30
|
-
store 'emission_factor'
|
31
|
-
end
|
32
|
-
|
33
|
-
# still need distance estimate for electric cars
|
34
|
-
end
|
35
|
-
|
36
|
-
CODES = {
|
37
|
-
:electricity => 'El',
|
38
|
-
:diesel => 'D'
|
39
|
-
}
|
40
|
-
end
|
@@ -1,362 +0,0 @@
|
|
1
|
-
class AutomobileVariant < ActiveRecord::Base
|
2
|
-
set_primary_key :row_hash
|
3
|
-
|
4
|
-
module FuelEconomyGuide
|
5
|
-
TRANSMISSIONS = {
|
6
|
-
'A' => 'automatic',
|
7
|
-
'M' => 'manual',
|
8
|
-
'L' => 'automatic', # Lockup/automatic
|
9
|
-
'S' => 'semiautomatic', # Semiautomatic
|
10
|
-
'C' => 'manual' # TODO verify for VW Syncro
|
11
|
-
}
|
12
|
-
|
13
|
-
ENGINE_TYPES = {
|
14
|
-
'(GUZZLER)' => nil, # "gas guzzler"
|
15
|
-
'(POLICE)' => nil, # police automobile_variant
|
16
|
-
'(MPFI)' => 'injection',
|
17
|
-
'(MPI*)' => 'injection',
|
18
|
-
'(SPFI)' => 'injection',
|
19
|
-
'(FFS)' => 'injection',
|
20
|
-
'(TURBO)' => 'turbo',
|
21
|
-
'(TRBO)' => 'turbo',
|
22
|
-
'(TC*)' => 'turbo',
|
23
|
-
'(FFS,TRBO)' => %w(injection turbo),
|
24
|
-
'(S-CHARGE)' => 'supercharger',
|
25
|
-
'(SC*)' => 'supercharger',
|
26
|
-
'(DIESEL)' => nil, # diesel
|
27
|
-
'(DSL)' => nil, # diesel
|
28
|
-
'(ROTARY)' => nil, # rotary
|
29
|
-
'(VARIABLE)' => nil, # variable displacement
|
30
|
-
'(NO-CAT)' => nil, # no catalytic converter
|
31
|
-
'(OHC)' => nil, # overhead camshaft
|
32
|
-
'(OHV)' => nil, # overhead valves
|
33
|
-
'(16-VALVE)' => nil, # 16V
|
34
|
-
'(305)' => nil, # 305 cubic inch displacement
|
35
|
-
'(307)' => nil, # 307 cubic inch displacement
|
36
|
-
'(M-ENG)' => nil,
|
37
|
-
'(W-ENG)' => nil,
|
38
|
-
'(GM-BUICK)' => nil,
|
39
|
-
'(GM-CHEV)' => nil,
|
40
|
-
'(GM-OLDS)' => nil,
|
41
|
-
'(GM-PONT)' => nil,
|
42
|
-
}
|
43
|
-
|
44
|
-
class ParserB
|
45
|
-
require 'fixed_width'
|
46
|
-
::FixedWidth.define :fuel_economy_guide_b do |d|
|
47
|
-
d.rows do |row|
|
48
|
-
row.trap { true } # there's only one section
|
49
|
-
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
50
|
-
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
51
|
-
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
52
|
-
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
53
|
-
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
54
|
-
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
55
|
-
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
56
|
-
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
57
|
-
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
58
|
-
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
59
|
-
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
60
|
-
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
61
|
-
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
62
|
-
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
63
|
-
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
64
|
-
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
65
|
-
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
66
|
-
row.spacer 2
|
67
|
-
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
68
|
-
row.spacer 2
|
69
|
-
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
70
|
-
row.spacer 2
|
71
|
-
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
72
|
-
row.spacer 2
|
73
|
-
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
74
|
-
row.spacer 2
|
75
|
-
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
76
|
-
row.spacer 2
|
77
|
-
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
78
|
-
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
79
|
-
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
80
|
-
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
81
|
-
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
82
|
-
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
83
|
-
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
84
|
-
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
85
|
-
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
86
|
-
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
87
|
-
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
88
|
-
row.column 'filler' , 1, :type => :string # NOT USED
|
89
|
-
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
90
|
-
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
91
|
-
end
|
92
|
-
end
|
93
|
-
attr_accessor :year
|
94
|
-
def initialize(options = {})
|
95
|
-
options = options.stringify_keys
|
96
|
-
@year = options['year']
|
97
|
-
end
|
98
|
-
|
99
|
-
def apply(row)
|
100
|
-
row.merge!({
|
101
|
-
'make' => row['carline_mfr_name'], # make it line up with the errata
|
102
|
-
'model' => row['carline_name'], # ditto
|
103
|
-
'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
|
104
|
-
'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
|
105
|
-
'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
|
106
|
-
'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
|
107
|
-
'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
|
108
|
-
'displacement' => _displacement(row['opt_disp']),
|
109
|
-
'year' => year
|
110
|
-
})
|
111
|
-
row
|
112
|
-
end
|
113
|
-
|
114
|
-
def _displacement(str)
|
115
|
-
str = str.gsub(/[\(\)]/, '').strip
|
116
|
-
if str =~ /^(.+)L$/
|
117
|
-
$1.to_f
|
118
|
-
elsif str =~ /^(.+)CC$/
|
119
|
-
$1.to_f / 1000
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
end
|
124
|
-
class ParserC
|
125
|
-
attr_accessor :year
|
126
|
-
def initialize(options = {})
|
127
|
-
options = options.stringify_keys
|
128
|
-
@year = options['year']
|
129
|
-
end
|
130
|
-
|
131
|
-
def apply(row)
|
132
|
-
row.merge!({
|
133
|
-
'make' => row['Manufacturer'], # make it line up with the errata
|
134
|
-
'model' => row['carline name'], # ditto
|
135
|
-
'drive' => row['drv'] + 'WD',
|
136
|
-
'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
|
137
|
-
'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
|
138
|
-
'turbo' => row['T'] == 'T',
|
139
|
-
'supercharger' => row['S'] == 'S',
|
140
|
-
'injection' => true,
|
141
|
-
'year' => year
|
142
|
-
})
|
143
|
-
row
|
144
|
-
end
|
145
|
-
end
|
146
|
-
class ParserD
|
147
|
-
attr_accessor :year
|
148
|
-
def initialize(options = {})
|
149
|
-
options = options.stringify_keys
|
150
|
-
@year = options['year']
|
151
|
-
end
|
152
|
-
|
153
|
-
def apply(row)
|
154
|
-
row.merge!({
|
155
|
-
'make' => row['MFR'], # make it line up with the errata
|
156
|
-
'model' => row['CAR LINE'], # ditto
|
157
|
-
'drive' => row['DRIVE SYS'] + 'WD',
|
158
|
-
'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
|
159
|
-
'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
|
160
|
-
'turbo' => row['TURBO'] == 'T',
|
161
|
-
'supercharger' => row['SPCHGR'] == 'S',
|
162
|
-
'injection' => true,
|
163
|
-
'year' => year
|
164
|
-
})
|
165
|
-
row
|
166
|
-
end
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
class Guru
|
171
|
-
# the following matching methods are needed by the errata
|
172
|
-
# per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
|
173
|
-
|
174
|
-
def transmission_is_blank?(row)
|
175
|
-
row['transmission'].blank?
|
176
|
-
end
|
177
|
-
|
178
|
-
def is_a_2007_gmc_or_chevrolet?(row)
|
179
|
-
row['year'] == 2007 and %w(GMC CHEVROLET).include? row['MFR'].upcase
|
180
|
-
end
|
181
|
-
|
182
|
-
def is_a_porsche?(row)
|
183
|
-
row['make'].upcase == 'PORSCHE'
|
184
|
-
end
|
185
|
-
|
186
|
-
def is_not_a_porsche?(row)
|
187
|
-
!is_a_porsche? row
|
188
|
-
end
|
189
|
-
|
190
|
-
def is_a_mercedes_benz?(row)
|
191
|
-
row['make'] =~ /MERCEDES/i
|
192
|
-
end
|
193
|
-
|
194
|
-
def is_a_lexus?(row)
|
195
|
-
row['make'].upcase == 'LEXUS'
|
196
|
-
end
|
197
|
-
|
198
|
-
def is_a_bmw?(row)
|
199
|
-
row['make'].upcase == 'BMW'
|
200
|
-
end
|
201
|
-
|
202
|
-
def is_a_ford?(row)
|
203
|
-
row['make'].upcase == 'FORD'
|
204
|
-
end
|
205
|
-
|
206
|
-
def is_a_rolls_royce_and_model_contains_bentley?(row)
|
207
|
-
is_a_rolls_royce?(row) and model_contains_bentley?(row)
|
208
|
-
end
|
209
|
-
|
210
|
-
def is_a_bentley?(row)
|
211
|
-
row['make'].upcase == 'BENTLEY'
|
212
|
-
end
|
213
|
-
|
214
|
-
def is_a_rolls_royce?(row)
|
215
|
-
row['make'] =~ /ROLLS/i
|
216
|
-
end
|
217
|
-
|
218
|
-
def is_a_turbo_brooklands?(row)
|
219
|
-
row['model'] =~ /TURBO R\/RL BKLDS/i
|
220
|
-
end
|
221
|
-
|
222
|
-
def model_contains_maybach?(row)
|
223
|
-
row['model'] =~ /MAYBACH/i
|
224
|
-
end
|
225
|
-
|
226
|
-
def model_contains_bentley?(row)
|
227
|
-
row['model'] =~ /BENTLEY/i
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
errata = { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv', :responder => 'AutomobileVariant::Guru' }
|
232
|
-
|
233
|
-
data_miner do
|
234
|
-
# 1985---1997
|
235
|
-
(85..97).each do |yy|
|
236
|
-
filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
|
237
|
-
import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
|
238
|
-
:filename => filename,
|
239
|
-
:transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
|
240
|
-
:format => :fixed_width,
|
241
|
-
:cut => (yy == 95) ? '13-' : nil,
|
242
|
-
:schema_name => :fuel_economy_guide_b,
|
243
|
-
:select => lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' },
|
244
|
-
:errata => errata) do
|
245
|
-
key 'row_hash'
|
246
|
-
store 'make_name', :field_name => 'make'
|
247
|
-
store 'model_name', :field_name => 'model'
|
248
|
-
store 'year'
|
249
|
-
store 'fuel_type_code', :field_name => 'fuel_type'
|
250
|
-
store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
|
251
|
-
store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
|
252
|
-
store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
253
|
-
store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
254
|
-
store 'cylinders', :field_name => 'no_cyc'
|
255
|
-
store 'drive', :field_name => 'drive_system'
|
256
|
-
store 'carline_mfr_code'
|
257
|
-
store 'vi_mfr_code'
|
258
|
-
store 'carline_code'
|
259
|
-
store 'carline_class_code', :field_name => 'carline_clss'
|
260
|
-
store 'transmission'
|
261
|
-
store 'speeds'
|
262
|
-
store 'turbo'
|
263
|
-
store 'supercharger'
|
264
|
-
store 'injection'
|
265
|
-
store 'displacement'
|
266
|
-
end
|
267
|
-
end
|
268
|
-
|
269
|
-
# 1998--2005
|
270
|
-
{
|
271
|
-
1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
|
272
|
-
1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
|
273
|
-
2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
|
274
|
-
2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
|
275
|
-
2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
|
276
|
-
2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
|
277
|
-
2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
|
278
|
-
2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
|
279
|
-
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
280
|
-
import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
|
281
|
-
:errata => errata) do
|
282
|
-
key 'row_hash'
|
283
|
-
store 'make_name', :field_name => 'make'
|
284
|
-
store 'model_name', :field_name => 'model'
|
285
|
-
store 'fuel_type_code', :field_name => 'fl'
|
286
|
-
store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
|
287
|
-
store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
|
288
|
-
store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
289
|
-
store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
290
|
-
store 'cylinders', :field_name => 'cyl'
|
291
|
-
store 'displacement', :field_name => 'displ'
|
292
|
-
store 'carline_class_code', :field_name => 'cls' if year >= 2000
|
293
|
-
store 'carline_class_name', :field_name => 'Class'
|
294
|
-
store 'year'
|
295
|
-
store 'transmission'
|
296
|
-
store 'speeds'
|
297
|
-
store 'turbo'
|
298
|
-
store 'supercharger'
|
299
|
-
store 'injection'
|
300
|
-
store 'drive'
|
301
|
-
end
|
302
|
-
end
|
303
|
-
|
304
|
-
# 2006--2010
|
305
|
-
{
|
306
|
-
2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
|
307
|
-
2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
|
308
|
-
2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
|
309
|
-
2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
|
310
|
-
# 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
|
311
|
-
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
312
|
-
import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
|
313
|
-
:reject => (year == 2007) ? lambda { |row| row.values.first.blank? } : nil,
|
314
|
-
:errata => errata) do
|
315
|
-
key 'row_hash'
|
316
|
-
store 'make_name', :field_name => 'make'
|
317
|
-
store 'model_name', :field_name => 'model'
|
318
|
-
store 'fuel_type_code', :field_name => 'FUEL TYPE'
|
319
|
-
store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
|
320
|
-
store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
|
321
|
-
store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
322
|
-
store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
323
|
-
store 'cylinders', :field_name => 'NUMB CYL'
|
324
|
-
store 'displacement', :field_name => 'DISPLACEMENT'
|
325
|
-
store 'carline_class_code', :field_name => 'CLS'
|
326
|
-
store 'carline_class_name', :field_name => 'CLASS'
|
327
|
-
store 'year'
|
328
|
-
store 'transmission'
|
329
|
-
store 'speeds'
|
330
|
-
store 'turbo'
|
331
|
-
store 'supercharger'
|
332
|
-
store 'injection'
|
333
|
-
store 'drive'
|
334
|
-
end
|
335
|
-
end
|
336
|
-
|
337
|
-
# associate :make, :key => :original_automobile_make_name, :foreign_key => :name
|
338
|
-
# derive :automobile_model_id # creates models by name
|
339
|
-
# associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
|
340
|
-
|
341
|
-
process 'Set adjusted fuel economy' do
|
342
|
-
update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
|
343
|
-
update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
|
344
|
-
end
|
345
|
-
end
|
346
|
-
|
347
|
-
def name
|
348
|
-
extra = []
|
349
|
-
extra << "V#{cylinders}" if cylinders
|
350
|
-
extra << "#{displacement}L" if displacement
|
351
|
-
extra << "turbo" if turbo
|
352
|
-
extra << "FI" if injection
|
353
|
-
extra << "#{speeds}spd" if speeds.present?
|
354
|
-
extra << transmission if transmission.present?
|
355
|
-
extra << "(#{fuel_type.name})" if fuel_type
|
356
|
-
extra.join(' ')
|
357
|
-
end
|
358
|
-
|
359
|
-
def fuel_economy_description
|
360
|
-
[ fuel_efficiency_city, fuel_efficiency_highway ].map { |f| f.kilometres_per_litre.to(:miles_per_gallon).round }.join('/')
|
361
|
-
end
|
362
|
-
end
|
data/test/support/country.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
class Country < ActiveRecord::Base
|
2
|
-
set_primary_key :iso_3166
|
3
|
-
|
4
|
-
data_miner do
|
5
|
-
import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :encoding => 'ISO-8859-1', :skip => 2, :headers => false, :delimiter => ';' do
|
6
|
-
key 'iso_3166', :field_number => 1
|
7
|
-
store 'name', :field_number => 0
|
8
|
-
end
|
9
|
-
|
10
|
-
import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
|
11
|
-
key 'iso_3166', :field_name => 'country code'
|
12
|
-
store 'name', :field_name => 'country'
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
@@ -1,311 +0,0 @@
|
|
1
|
-
module TestDatabase
|
2
|
-
extend self
|
3
|
-
|
4
|
-
def connect
|
5
|
-
@connection ||= ActiveRecord::Base.establish_connection(
|
6
|
-
'adapter' => 'mysql',
|
7
|
-
'database' => 'data_miner_test',
|
8
|
-
'username' => 'root',
|
9
|
-
'password' => 'password'
|
10
|
-
)
|
11
|
-
end
|
12
|
-
|
13
|
-
def load_schema
|
14
|
-
connect
|
15
|
-
|
16
|
-
ActiveRecord::Schema.define(:version => 20090819143429) do
|
17
|
-
create_table "t100_flight_segments", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
18
|
-
t.integer "departures_performed"
|
19
|
-
t.integer "payload"
|
20
|
-
t.integer "seats"
|
21
|
-
t.integer "passengers"
|
22
|
-
t.integer "freight"
|
23
|
-
t.integer "mail"
|
24
|
-
t.integer "ramp_to_ramp"
|
25
|
-
t.integer "air_time"
|
26
|
-
t.float "load_factor"
|
27
|
-
t.float "freight_share"
|
28
|
-
t.integer "distance"
|
29
|
-
t.integer "departures_scheduled"
|
30
|
-
t.string "unique_carrier"
|
31
|
-
t.integer "dot_airline_id"
|
32
|
-
t.string "unique_carrier_name"
|
33
|
-
t.string "unique_carrier_entity"
|
34
|
-
t.string "region"
|
35
|
-
t.string "carrier"
|
36
|
-
t.string "carrier_name"
|
37
|
-
t.integer "carrier_group"
|
38
|
-
t.integer "carrier_group_new"
|
39
|
-
t.string "origin_airport_iata"
|
40
|
-
t.string "origin_city_name"
|
41
|
-
t.integer "origin_city_num"
|
42
|
-
t.string "origin_state_abr"
|
43
|
-
t.string "origin_state_fips"
|
44
|
-
t.string "origin_state_nm"
|
45
|
-
t.string "origin_country_iso_3166"
|
46
|
-
t.string "origin_country_name"
|
47
|
-
t.integer "origin_wac"
|
48
|
-
t.string "dest_airport_iata"
|
49
|
-
t.string "dest_city_name"
|
50
|
-
t.integer "dest_city_num"
|
51
|
-
t.string "dest_state_abr"
|
52
|
-
t.string "dest_state_fips"
|
53
|
-
t.string "dest_state_nm"
|
54
|
-
t.string "dest_country_iso_3166"
|
55
|
-
t.string "dest_country_name"
|
56
|
-
t.integer "dest_wac"
|
57
|
-
t.integer "bts_aircraft_group"
|
58
|
-
t.integer "bts_aircraft_type"
|
59
|
-
t.integer "bts_aircraft_config"
|
60
|
-
t.integer "year"
|
61
|
-
t.integer "quarter"
|
62
|
-
t.integer "month"
|
63
|
-
t.integer "bts_distance_group"
|
64
|
-
t.string "bts_service_class"
|
65
|
-
t.string "data_source"
|
66
|
-
t.float "seats_per_departure"
|
67
|
-
|
68
|
-
t.string 'payload_units'
|
69
|
-
t.string 'freight_units'
|
70
|
-
t.string 'mail_units'
|
71
|
-
t.string 'distance_units'
|
72
|
-
|
73
|
-
t.datetime "created_at"
|
74
|
-
t.datetime "updated_at"
|
75
|
-
|
76
|
-
t.string "row_hash"
|
77
|
-
end
|
78
|
-
execute 'ALTER TABLE t100_flight_segments ADD PRIMARY KEY (row_hash);'
|
79
|
-
|
80
|
-
create_table 'tapped_airports', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
81
|
-
t.string 'i_am_just_here_to_get_in_the_way'
|
82
|
-
end
|
83
|
-
|
84
|
-
create_table 'airports', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
85
|
-
t.string 'iata_code'
|
86
|
-
t.string 'name'
|
87
|
-
t.string 'city'
|
88
|
-
t.string 'country_name'
|
89
|
-
t.float 'latitude'
|
90
|
-
t.float 'longitude'
|
91
|
-
t.datetime 'created_at'
|
92
|
-
t.datetime 'updated_at'
|
93
|
-
end
|
94
|
-
execute 'ALTER TABLE airports ADD PRIMARY KEY (iata_code);'
|
95
|
-
|
96
|
-
create_table "countries", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
97
|
-
t.string "iso_3166"
|
98
|
-
t.string "name"
|
99
|
-
t.datetime "created_at"
|
100
|
-
t.datetime "updated_at"
|
101
|
-
end
|
102
|
-
execute "ALTER TABLE countries ADD PRIMARY KEY (iso_3166);"
|
103
|
-
|
104
|
-
create_table "census_regions", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
105
|
-
t.integer "number"
|
106
|
-
t.string "name"
|
107
|
-
t.datetime "updated_at"
|
108
|
-
t.datetime "created_at"
|
109
|
-
end
|
110
|
-
execute "ALTER TABLE census_regions ADD PRIMARY KEY (number);"
|
111
|
-
|
112
|
-
create_table 'census_divisions', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
113
|
-
t.integer 'number'
|
114
|
-
t.string 'name'
|
115
|
-
t.datetime 'updated_at'
|
116
|
-
t.datetime 'created_at'
|
117
|
-
t.string 'census_region_name'
|
118
|
-
t.integer 'census_region_number'
|
119
|
-
|
120
|
-
end
|
121
|
-
execute 'ALTER TABLE census_divisions ADD PRIMARY KEY (number);'
|
122
|
-
|
123
|
-
create_table 'census_division_deux', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
124
|
-
t.integer 'number'
|
125
|
-
t.string 'name'
|
126
|
-
t.datetime 'updated_at'
|
127
|
-
t.datetime 'created_at'
|
128
|
-
t.string 'census_region_name'
|
129
|
-
t.integer 'census_region_number'
|
130
|
-
|
131
|
-
end
|
132
|
-
execute 'ALTER TABLE census_division_deux ADD PRIMARY KEY (number);'
|
133
|
-
|
134
|
-
create_table 'crosscalling_census_divisions', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
135
|
-
t.integer 'number'
|
136
|
-
t.string 'name'
|
137
|
-
t.datetime 'updated_at'
|
138
|
-
t.datetime 'created_at'
|
139
|
-
t.string 'census_region_name'
|
140
|
-
t.integer 'census_region_number'
|
141
|
-
|
142
|
-
end
|
143
|
-
execute 'ALTER TABLE crosscalling_census_divisions ADD PRIMARY KEY (number);'
|
144
|
-
|
145
|
-
create_table "automobile_variants", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
146
|
-
t.float "fuel_efficiency_city"
|
147
|
-
t.float "fuel_efficiency_highway"
|
148
|
-
t.string "make_name"
|
149
|
-
t.string "model_name"
|
150
|
-
t.string "year"
|
151
|
-
t.string "fuel_type_code"
|
152
|
-
t.datetime "updated_at"
|
153
|
-
t.datetime "created_at"
|
154
|
-
t.string "transmission"
|
155
|
-
t.string "drive"
|
156
|
-
t.boolean "turbo"
|
157
|
-
t.boolean "supercharger"
|
158
|
-
t.integer "cylinders"
|
159
|
-
t.float "displacement"
|
160
|
-
t.float "raw_fuel_efficiency_city"
|
161
|
-
t.float "raw_fuel_efficiency_highway"
|
162
|
-
t.integer "carline_mfr_code"
|
163
|
-
t.integer "vi_mfr_code"
|
164
|
-
t.integer "carline_code"
|
165
|
-
t.integer "carline_class_code"
|
166
|
-
t.boolean "injection"
|
167
|
-
t.string "carline_class_name"
|
168
|
-
t.string "speeds"
|
169
|
-
|
170
|
-
t.string 'raw_fuel_efficiency_highway_units'
|
171
|
-
t.string 'raw_fuel_efficiency_city_units'
|
172
|
-
t.string 'fuel_efficiency_highway_units'
|
173
|
-
t.string 'fuel_efficiency_city_units'
|
174
|
-
|
175
|
-
t.string "row_hash"
|
176
|
-
end
|
177
|
-
execute "ALTER TABLE automobile_variants ADD PRIMARY KEY (row_hash);"
|
178
|
-
|
179
|
-
create_table "automobile_fuel_types", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
180
|
-
t.string "name"
|
181
|
-
t.datetime "created_at"
|
182
|
-
t.datetime "updated_at"
|
183
|
-
t.float "emission_factor"
|
184
|
-
t.string "emission_factor_units"
|
185
|
-
t.float "annual_distance"
|
186
|
-
t.string "annual_distance_units"
|
187
|
-
t.string "code"
|
188
|
-
end
|
189
|
-
execute "ALTER TABLE automobile_fuel_types ADD PRIMARY KEY (code);"
|
190
|
-
|
191
|
-
create_table "residential_energy_consumption_survey_responses", :options => 'ENGINE=InnoDB default charset=utf8', :id => false, :force => true do |t|
|
192
|
-
t.integer "department_of_energy_identifier"
|
193
|
-
|
194
|
-
t.string "residence_class"
|
195
|
-
t.date "construction_year"
|
196
|
-
t.string "construction_period"
|
197
|
-
t.string "urbanity"
|
198
|
-
t.string "dishwasher_use"
|
199
|
-
t.string "central_ac_use"
|
200
|
-
t.string "window_ac_use"
|
201
|
-
t.string "clothes_washer_use"
|
202
|
-
t.string "clothes_dryer_use"
|
203
|
-
|
204
|
-
t.integer "census_division_number"
|
205
|
-
t.string "census_division_name"
|
206
|
-
t.integer "census_region_number"
|
207
|
-
t.string "census_region_name"
|
208
|
-
|
209
|
-
t.float "rooms"
|
210
|
-
t.float "floorspace"
|
211
|
-
t.integer "residents"
|
212
|
-
t.boolean "ownership"
|
213
|
-
t.boolean "thermostat_programmability"
|
214
|
-
t.integer "refrigerator_count"
|
215
|
-
t.integer "freezer_count"
|
216
|
-
t.float "annual_energy_from_fuel_oil_for_heating_space"
|
217
|
-
t.float "annual_energy_from_fuel_oil_for_heating_water"
|
218
|
-
t.float "annual_energy_from_fuel_oil_for_appliances"
|
219
|
-
t.float "annual_energy_from_natural_gas_for_heating_space"
|
220
|
-
t.float "annual_energy_from_natural_gas_for_heating_water"
|
221
|
-
t.float "annual_energy_from_natural_gas_for_appliances"
|
222
|
-
t.float "annual_energy_from_propane_for_heating_space"
|
223
|
-
t.float "annual_energy_from_propane_for_heating_water"
|
224
|
-
t.float "annual_energy_from_propane_for_appliances"
|
225
|
-
t.float "annual_energy_from_wood"
|
226
|
-
t.float "annual_energy_from_kerosene"
|
227
|
-
t.float "annual_energy_from_electricity_for_clothes_driers"
|
228
|
-
t.float "annual_energy_from_electricity_for_dishwashers"
|
229
|
-
t.float "annual_energy_from_electricity_for_freezers"
|
230
|
-
t.float "annual_energy_from_electricity_for_refrigerators"
|
231
|
-
t.float "annual_energy_from_electricity_for_air_conditioners"
|
232
|
-
t.float "annual_energy_from_electricity_for_heating_space"
|
233
|
-
t.float "annual_energy_from_electricity_for_heating_water"
|
234
|
-
t.float "annual_energy_from_electricity_for_other_appliances"
|
235
|
-
t.float "weighting"
|
236
|
-
t.float "lighting_use"
|
237
|
-
t.float "lighting_efficiency"
|
238
|
-
t.integer "heating_degree_days"
|
239
|
-
t.integer "cooling_degree_days"
|
240
|
-
t.integer "total_rooms"
|
241
|
-
t.integer "bathrooms"
|
242
|
-
t.integer "halfbaths"
|
243
|
-
t.integer "heated_garage"
|
244
|
-
t.integer "attached_1car_garage"
|
245
|
-
t.integer "detached_1car_garage"
|
246
|
-
t.integer "attached_2car_garage"
|
247
|
-
t.integer "detached_2car_garage"
|
248
|
-
t.integer "attached_3car_garage"
|
249
|
-
t.integer "detached_3car_garage"
|
250
|
-
t.integer "lights_on_1_to_4_hours"
|
251
|
-
t.integer "efficient_lights_on_1_to_4_hours"
|
252
|
-
t.integer "lights_on_4_to_12_hours"
|
253
|
-
t.integer "efficient_lights_on_4_to_12_hours"
|
254
|
-
t.integer "lights_on_over_12_hours"
|
255
|
-
t.integer "efficient_lights_on_over_12_hours"
|
256
|
-
t.integer "outdoor_all_night_lights"
|
257
|
-
t.integer "outdoor_all_night_gas_lights"
|
258
|
-
|
259
|
-
t.datetime "created_at"
|
260
|
-
t.datetime "updated_at"
|
261
|
-
end
|
262
|
-
execute "ALTER TABLE residential_energy_consumption_survey_responses ADD PRIMARY KEY (department_of_energy_identifier);"
|
263
|
-
|
264
|
-
create_table 'aircraft', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
265
|
-
t.string 'icao_code'
|
266
|
-
t.string 'manufacturer_name'
|
267
|
-
t.string 'name'
|
268
|
-
|
269
|
-
t.string "bts_name"
|
270
|
-
t.string "bts_aircraft_type_code"
|
271
|
-
|
272
|
-
t.string 'brighter_planet_aircraft_class_code'
|
273
|
-
# t.float 'm3'
|
274
|
-
# t.float 'm2'
|
275
|
-
# t.float 'm1'
|
276
|
-
# t.float 'endpoint_fuel'
|
277
|
-
t.datetime 'updated_at'
|
278
|
-
t.datetime 'created_at'
|
279
|
-
end
|
280
|
-
execute 'ALTER TABLE aircraft ADD PRIMARY KEY (icao_code);'
|
281
|
-
|
282
|
-
create_table 'aircraft_deux', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
283
|
-
t.string 'icao_code'
|
284
|
-
t.string 'manufacturer_name'
|
285
|
-
t.string 'name'
|
286
|
-
|
287
|
-
t.string "bts_name"
|
288
|
-
t.string "bts_aircraft_type_code"
|
289
|
-
|
290
|
-
# t.string 'brighter_planet_aircraft_class_code'
|
291
|
-
# t.float 'm3'
|
292
|
-
# t.float 'm2'
|
293
|
-
# t.float 'm1'
|
294
|
-
# t.float 'endpoint_fuel'
|
295
|
-
t.datetime 'updated_at'
|
296
|
-
t.datetime 'created_at'
|
297
|
-
end
|
298
|
-
execute 'ALTER TABLE aircraft_deux ADD PRIMARY KEY (icao_code);'
|
299
|
-
end
|
300
|
-
|
301
|
-
DataMiner::Run.create_tables
|
302
|
-
end
|
303
|
-
|
304
|
-
def load_models
|
305
|
-
load_schema
|
306
|
-
|
307
|
-
Dir.glob(File.expand_path('*.rb', File.dirname(__FILE__))).each do |lib|
|
308
|
-
require lib
|
309
|
-
end
|
310
|
-
end
|
311
|
-
end
|