data_miner 1.3.8 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +42 -0
- data/Gemfile +19 -3
- data/README.rdoc +3 -3
- data/Rakefile +13 -15
- data/data_miner.gemspec +4 -15
- data/lib/data_miner.rb +69 -70
- data/lib/data_miner/active_record_extensions.rb +17 -22
- data/lib/data_miner/attribute.rb +176 -179
- data/lib/data_miner/dictionary.rb +38 -31
- data/lib/data_miner/run.rb +49 -18
- data/lib/data_miner/script.rb +116 -0
- data/lib/data_miner/step.rb +5 -0
- data/lib/data_miner/step/import.rb +74 -0
- data/lib/data_miner/step/process.rb +34 -0
- data/lib/data_miner/step/tap.rb +134 -0
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +26 -24
- data/test/support/breeds.xls +0 -0
- data/test/support/pet_color_dictionary.en.csv +5 -0
- data/test/support/pet_color_dictionary.es.csv +5 -0
- data/test/support/pets.csv +5 -0
- data/test/support/pets_funny.csv +4 -0
- data/test/test_data_miner.rb +103 -0
- data/test/test_earth_import.rb +25 -0
- data/test/test_earth_tap.rb +25 -0
- data/test/test_safety.rb +43 -0
- metadata +72 -78
- data/.document +0 -5
- data/lib/data_miner/config.rb +0 -124
- data/lib/data_miner/import.rb +0 -93
- data/lib/data_miner/process.rb +0 -38
- data/lib/data_miner/tap.rb +0 -143
- data/test/support/aircraft.rb +0 -102
- data/test/support/airport.rb +0 -16
- data/test/support/automobile_fuel_type.rb +0 -40
- data/test/support/automobile_variant.rb +0 -362
- data/test/support/country.rb +0 -15
- data/test/support/test_database.rb +0 -311
- data/test/test_data_miner_attribute.rb +0 -111
- data/test/test_data_miner_process.rb +0 -18
- data/test/test_old_syntax.rb +0 -825
- data/test/test_tap.rb +0 -21
data/test/support/airport.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
class Airport < ActiveRecord::Base
|
2
|
-
set_primary_key :iata_code
|
3
|
-
|
4
|
-
data_miner do
|
5
|
-
import :url => 'https://openflights.svn.sourceforge.net/svnroot/openflights/openflights/data/airports.dat',
|
6
|
-
:headers => false,
|
7
|
-
:select => lambda { |row| row[4].present? } do
|
8
|
-
key 'iata_code', :field_number => 4
|
9
|
-
store 'name', :field_number => 1
|
10
|
-
store 'city', :field_number => 2
|
11
|
-
store 'country_name', :field_number => 3
|
12
|
-
store 'latitude', :field_number => 6, :nullify => true
|
13
|
-
store 'longitude', :field_number => 7, :nullify => true
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
class AutomobileFuelType < ActiveRecord::Base
|
2
|
-
set_primary_key :code
|
3
|
-
|
4
|
-
data_miner do
|
5
|
-
import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
6
|
-
:filename => 'Gd6-dsc.txt',
|
7
|
-
:format => :fixed_width,
|
8
|
-
:crop => 21..26, # inclusive
|
9
|
-
:cut => '2-',
|
10
|
-
:select => lambda { |row| /\A[A-Z]/.match row[:code] },
|
11
|
-
:schema => [[ 'code', 2, { :type => :string } ],
|
12
|
-
[ 'spacer', 2 ],
|
13
|
-
[ 'name', 52, { :type => :string } ]]) do
|
14
|
-
key 'code'
|
15
|
-
store 'name'
|
16
|
-
end
|
17
|
-
|
18
|
-
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do
|
19
|
-
key 'code'
|
20
|
-
store 'name'
|
21
|
-
store 'annual_distance'
|
22
|
-
store 'emission_factor'
|
23
|
-
end
|
24
|
-
|
25
|
-
# pull electricity emission factor from residential electricity
|
26
|
-
import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
|
27
|
-
:select => lambda { |row| row['code'] == 'El' }) do
|
28
|
-
key 'code'
|
29
|
-
store 'name'
|
30
|
-
store 'emission_factor'
|
31
|
-
end
|
32
|
-
|
33
|
-
# still need distance estimate for electric cars
|
34
|
-
end
|
35
|
-
|
36
|
-
CODES = {
|
37
|
-
:electricity => 'El',
|
38
|
-
:diesel => 'D'
|
39
|
-
}
|
40
|
-
end
|
@@ -1,362 +0,0 @@
|
|
1
|
-
class AutomobileVariant < ActiveRecord::Base
|
2
|
-
set_primary_key :row_hash
|
3
|
-
|
4
|
-
module FuelEconomyGuide
|
5
|
-
TRANSMISSIONS = {
|
6
|
-
'A' => 'automatic',
|
7
|
-
'M' => 'manual',
|
8
|
-
'L' => 'automatic', # Lockup/automatic
|
9
|
-
'S' => 'semiautomatic', # Semiautomatic
|
10
|
-
'C' => 'manual' # TODO verify for VW Syncro
|
11
|
-
}
|
12
|
-
|
13
|
-
ENGINE_TYPES = {
|
14
|
-
'(GUZZLER)' => nil, # "gas guzzler"
|
15
|
-
'(POLICE)' => nil, # police automobile_variant
|
16
|
-
'(MPFI)' => 'injection',
|
17
|
-
'(MPI*)' => 'injection',
|
18
|
-
'(SPFI)' => 'injection',
|
19
|
-
'(FFS)' => 'injection',
|
20
|
-
'(TURBO)' => 'turbo',
|
21
|
-
'(TRBO)' => 'turbo',
|
22
|
-
'(TC*)' => 'turbo',
|
23
|
-
'(FFS,TRBO)' => %w(injection turbo),
|
24
|
-
'(S-CHARGE)' => 'supercharger',
|
25
|
-
'(SC*)' => 'supercharger',
|
26
|
-
'(DIESEL)' => nil, # diesel
|
27
|
-
'(DSL)' => nil, # diesel
|
28
|
-
'(ROTARY)' => nil, # rotary
|
29
|
-
'(VARIABLE)' => nil, # variable displacement
|
30
|
-
'(NO-CAT)' => nil, # no catalytic converter
|
31
|
-
'(OHC)' => nil, # overhead camshaft
|
32
|
-
'(OHV)' => nil, # overhead valves
|
33
|
-
'(16-VALVE)' => nil, # 16V
|
34
|
-
'(305)' => nil, # 305 cubic inch displacement
|
35
|
-
'(307)' => nil, # 307 cubic inch displacement
|
36
|
-
'(M-ENG)' => nil,
|
37
|
-
'(W-ENG)' => nil,
|
38
|
-
'(GM-BUICK)' => nil,
|
39
|
-
'(GM-CHEV)' => nil,
|
40
|
-
'(GM-OLDS)' => nil,
|
41
|
-
'(GM-PONT)' => nil,
|
42
|
-
}
|
43
|
-
|
44
|
-
class ParserB
|
45
|
-
require 'fixed_width'
|
46
|
-
::FixedWidth.define :fuel_economy_guide_b do |d|
|
47
|
-
d.rows do |row|
|
48
|
-
row.trap { true } # there's only one section
|
49
|
-
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
50
|
-
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
51
|
-
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
52
|
-
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
53
|
-
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
54
|
-
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
55
|
-
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
56
|
-
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
57
|
-
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
58
|
-
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
59
|
-
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
60
|
-
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
61
|
-
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
62
|
-
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
63
|
-
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
64
|
-
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
65
|
-
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
66
|
-
row.spacer 2
|
67
|
-
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
68
|
-
row.spacer 2
|
69
|
-
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
70
|
-
row.spacer 2
|
71
|
-
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
72
|
-
row.spacer 2
|
73
|
-
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
74
|
-
row.spacer 2
|
75
|
-
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
76
|
-
row.spacer 2
|
77
|
-
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
78
|
-
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
79
|
-
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
80
|
-
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
81
|
-
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
82
|
-
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
83
|
-
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
84
|
-
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
85
|
-
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
86
|
-
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
87
|
-
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
88
|
-
row.column 'filler' , 1, :type => :string # NOT USED
|
89
|
-
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
90
|
-
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
91
|
-
end
|
92
|
-
end
|
93
|
-
attr_accessor :year
|
94
|
-
def initialize(options = {})
|
95
|
-
options = options.stringify_keys
|
96
|
-
@year = options['year']
|
97
|
-
end
|
98
|
-
|
99
|
-
def apply(row)
|
100
|
-
row.merge!({
|
101
|
-
'make' => row['carline_mfr_name'], # make it line up with the errata
|
102
|
-
'model' => row['carline_name'], # ditto
|
103
|
-
'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
|
104
|
-
'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
|
105
|
-
'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
|
106
|
-
'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
|
107
|
-
'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
|
108
|
-
'displacement' => _displacement(row['opt_disp']),
|
109
|
-
'year' => year
|
110
|
-
})
|
111
|
-
row
|
112
|
-
end
|
113
|
-
|
114
|
-
def _displacement(str)
|
115
|
-
str = str.gsub(/[\(\)]/, '').strip
|
116
|
-
if str =~ /^(.+)L$/
|
117
|
-
$1.to_f
|
118
|
-
elsif str =~ /^(.+)CC$/
|
119
|
-
$1.to_f / 1000
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
end
|
124
|
-
class ParserC
|
125
|
-
attr_accessor :year
|
126
|
-
def initialize(options = {})
|
127
|
-
options = options.stringify_keys
|
128
|
-
@year = options['year']
|
129
|
-
end
|
130
|
-
|
131
|
-
def apply(row)
|
132
|
-
row.merge!({
|
133
|
-
'make' => row['Manufacturer'], # make it line up with the errata
|
134
|
-
'model' => row['carline name'], # ditto
|
135
|
-
'drive' => row['drv'] + 'WD',
|
136
|
-
'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
|
137
|
-
'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
|
138
|
-
'turbo' => row['T'] == 'T',
|
139
|
-
'supercharger' => row['S'] == 'S',
|
140
|
-
'injection' => true,
|
141
|
-
'year' => year
|
142
|
-
})
|
143
|
-
row
|
144
|
-
end
|
145
|
-
end
|
146
|
-
class ParserD
|
147
|
-
attr_accessor :year
|
148
|
-
def initialize(options = {})
|
149
|
-
options = options.stringify_keys
|
150
|
-
@year = options['year']
|
151
|
-
end
|
152
|
-
|
153
|
-
def apply(row)
|
154
|
-
row.merge!({
|
155
|
-
'make' => row['MFR'], # make it line up with the errata
|
156
|
-
'model' => row['CAR LINE'], # ditto
|
157
|
-
'drive' => row['DRIVE SYS'] + 'WD',
|
158
|
-
'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
|
159
|
-
'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
|
160
|
-
'turbo' => row['TURBO'] == 'T',
|
161
|
-
'supercharger' => row['SPCHGR'] == 'S',
|
162
|
-
'injection' => true,
|
163
|
-
'year' => year
|
164
|
-
})
|
165
|
-
row
|
166
|
-
end
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
class Guru
|
171
|
-
# the following matching methods are needed by the errata
|
172
|
-
# per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
|
173
|
-
|
174
|
-
def transmission_is_blank?(row)
|
175
|
-
row['transmission'].blank?
|
176
|
-
end
|
177
|
-
|
178
|
-
def is_a_2007_gmc_or_chevrolet?(row)
|
179
|
-
row['year'] == 2007 and %w(GMC CHEVROLET).include? row['MFR'].upcase
|
180
|
-
end
|
181
|
-
|
182
|
-
def is_a_porsche?(row)
|
183
|
-
row['make'].upcase == 'PORSCHE'
|
184
|
-
end
|
185
|
-
|
186
|
-
def is_not_a_porsche?(row)
|
187
|
-
!is_a_porsche? row
|
188
|
-
end
|
189
|
-
|
190
|
-
def is_a_mercedes_benz?(row)
|
191
|
-
row['make'] =~ /MERCEDES/i
|
192
|
-
end
|
193
|
-
|
194
|
-
def is_a_lexus?(row)
|
195
|
-
row['make'].upcase == 'LEXUS'
|
196
|
-
end
|
197
|
-
|
198
|
-
def is_a_bmw?(row)
|
199
|
-
row['make'].upcase == 'BMW'
|
200
|
-
end
|
201
|
-
|
202
|
-
def is_a_ford?(row)
|
203
|
-
row['make'].upcase == 'FORD'
|
204
|
-
end
|
205
|
-
|
206
|
-
def is_a_rolls_royce_and_model_contains_bentley?(row)
|
207
|
-
is_a_rolls_royce?(row) and model_contains_bentley?(row)
|
208
|
-
end
|
209
|
-
|
210
|
-
def is_a_bentley?(row)
|
211
|
-
row['make'].upcase == 'BENTLEY'
|
212
|
-
end
|
213
|
-
|
214
|
-
def is_a_rolls_royce?(row)
|
215
|
-
row['make'] =~ /ROLLS/i
|
216
|
-
end
|
217
|
-
|
218
|
-
def is_a_turbo_brooklands?(row)
|
219
|
-
row['model'] =~ /TURBO R\/RL BKLDS/i
|
220
|
-
end
|
221
|
-
|
222
|
-
def model_contains_maybach?(row)
|
223
|
-
row['model'] =~ /MAYBACH/i
|
224
|
-
end
|
225
|
-
|
226
|
-
def model_contains_bentley?(row)
|
227
|
-
row['model'] =~ /BENTLEY/i
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
errata = { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv', :responder => 'AutomobileVariant::Guru' }
|
232
|
-
|
233
|
-
data_miner do
|
234
|
-
# 1985---1997
|
235
|
-
(85..97).each do |yy|
|
236
|
-
filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
|
237
|
-
import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
|
238
|
-
:filename => filename,
|
239
|
-
:transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
|
240
|
-
:format => :fixed_width,
|
241
|
-
:cut => (yy == 95) ? '13-' : nil,
|
242
|
-
:schema_name => :fuel_economy_guide_b,
|
243
|
-
:select => lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' },
|
244
|
-
:errata => errata) do
|
245
|
-
key 'row_hash'
|
246
|
-
store 'make_name', :field_name => 'make'
|
247
|
-
store 'model_name', :field_name => 'model'
|
248
|
-
store 'year'
|
249
|
-
store 'fuel_type_code', :field_name => 'fuel_type'
|
250
|
-
store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
|
251
|
-
store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
|
252
|
-
store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
253
|
-
store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
254
|
-
store 'cylinders', :field_name => 'no_cyc'
|
255
|
-
store 'drive', :field_name => 'drive_system'
|
256
|
-
store 'carline_mfr_code'
|
257
|
-
store 'vi_mfr_code'
|
258
|
-
store 'carline_code'
|
259
|
-
store 'carline_class_code', :field_name => 'carline_clss'
|
260
|
-
store 'transmission'
|
261
|
-
store 'speeds'
|
262
|
-
store 'turbo'
|
263
|
-
store 'supercharger'
|
264
|
-
store 'injection'
|
265
|
-
store 'displacement'
|
266
|
-
end
|
267
|
-
end
|
268
|
-
|
269
|
-
# 1998--2005
|
270
|
-
{
|
271
|
-
1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
|
272
|
-
1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
|
273
|
-
2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
|
274
|
-
2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
|
275
|
-
2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
|
276
|
-
2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
|
277
|
-
2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
|
278
|
-
2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
|
279
|
-
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
280
|
-
import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
|
281
|
-
:errata => errata) do
|
282
|
-
key 'row_hash'
|
283
|
-
store 'make_name', :field_name => 'make'
|
284
|
-
store 'model_name', :field_name => 'model'
|
285
|
-
store 'fuel_type_code', :field_name => 'fl'
|
286
|
-
store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
|
287
|
-
store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
|
288
|
-
store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
289
|
-
store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
290
|
-
store 'cylinders', :field_name => 'cyl'
|
291
|
-
store 'displacement', :field_name => 'displ'
|
292
|
-
store 'carline_class_code', :field_name => 'cls' if year >= 2000
|
293
|
-
store 'carline_class_name', :field_name => 'Class'
|
294
|
-
store 'year'
|
295
|
-
store 'transmission'
|
296
|
-
store 'speeds'
|
297
|
-
store 'turbo'
|
298
|
-
store 'supercharger'
|
299
|
-
store 'injection'
|
300
|
-
store 'drive'
|
301
|
-
end
|
302
|
-
end
|
303
|
-
|
304
|
-
# 2006--2010
|
305
|
-
{
|
306
|
-
2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
|
307
|
-
2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
|
308
|
-
2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
|
309
|
-
2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
|
310
|
-
# 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
|
311
|
-
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
312
|
-
import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
|
313
|
-
:reject => (year == 2007) ? lambda { |row| row.values.first.blank? } : nil,
|
314
|
-
:errata => errata) do
|
315
|
-
key 'row_hash'
|
316
|
-
store 'make_name', :field_name => 'make'
|
317
|
-
store 'model_name', :field_name => 'model'
|
318
|
-
store 'fuel_type_code', :field_name => 'FUEL TYPE'
|
319
|
-
store 'fuel_efficiency_highway', :static => nil, :units => :kilometres_per_litre # we'll convert these in a later step, just setting the stage
|
320
|
-
store 'fuel_efficiency_city', :static => nil, :units => :kilometres_per_litre # ditto
|
321
|
-
store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
322
|
-
store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
323
|
-
store 'cylinders', :field_name => 'NUMB CYL'
|
324
|
-
store 'displacement', :field_name => 'DISPLACEMENT'
|
325
|
-
store 'carline_class_code', :field_name => 'CLS'
|
326
|
-
store 'carline_class_name', :field_name => 'CLASS'
|
327
|
-
store 'year'
|
328
|
-
store 'transmission'
|
329
|
-
store 'speeds'
|
330
|
-
store 'turbo'
|
331
|
-
store 'supercharger'
|
332
|
-
store 'injection'
|
333
|
-
store 'drive'
|
334
|
-
end
|
335
|
-
end
|
336
|
-
|
337
|
-
# associate :make, :key => :original_automobile_make_name, :foreign_key => :name
|
338
|
-
# derive :automobile_model_id # creates models by name
|
339
|
-
# associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
|
340
|
-
|
341
|
-
process 'Set adjusted fuel economy' do
|
342
|
-
update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
|
343
|
-
update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
|
344
|
-
end
|
345
|
-
end
|
346
|
-
|
347
|
-
def name
|
348
|
-
extra = []
|
349
|
-
extra << "V#{cylinders}" if cylinders
|
350
|
-
extra << "#{displacement}L" if displacement
|
351
|
-
extra << "turbo" if turbo
|
352
|
-
extra << "FI" if injection
|
353
|
-
extra << "#{speeds}spd" if speeds.present?
|
354
|
-
extra << transmission if transmission.present?
|
355
|
-
extra << "(#{fuel_type.name})" if fuel_type
|
356
|
-
extra.join(' ')
|
357
|
-
end
|
358
|
-
|
359
|
-
def fuel_economy_description
|
360
|
-
[ fuel_efficiency_city, fuel_efficiency_highway ].map { |f| f.kilometres_per_litre.to(:miles_per_gallon).round }.join('/')
|
361
|
-
end
|
362
|
-
end
|
data/test/support/country.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
class Country < ActiveRecord::Base
|
2
|
-
set_primary_key :iso_3166
|
3
|
-
|
4
|
-
data_miner do
|
5
|
-
import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :encoding => 'ISO-8859-1', :skip => 2, :headers => false, :delimiter => ';' do
|
6
|
-
key 'iso_3166', :field_number => 1
|
7
|
-
store 'name', :field_number => 0
|
8
|
-
end
|
9
|
-
|
10
|
-
import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
|
11
|
-
key 'iso_3166', :field_name => 'country code'
|
12
|
-
store 'name', :field_name => 'country'
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
@@ -1,311 +0,0 @@
|
|
1
|
-
module TestDatabase
|
2
|
-
extend self
|
3
|
-
|
4
|
-
def connect
|
5
|
-
@connection ||= ActiveRecord::Base.establish_connection(
|
6
|
-
'adapter' => 'mysql',
|
7
|
-
'database' => 'data_miner_test',
|
8
|
-
'username' => 'root',
|
9
|
-
'password' => 'password'
|
10
|
-
)
|
11
|
-
end
|
12
|
-
|
13
|
-
def load_schema
|
14
|
-
connect
|
15
|
-
|
16
|
-
ActiveRecord::Schema.define(:version => 20090819143429) do
|
17
|
-
create_table "t100_flight_segments", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
18
|
-
t.integer "departures_performed"
|
19
|
-
t.integer "payload"
|
20
|
-
t.integer "seats"
|
21
|
-
t.integer "passengers"
|
22
|
-
t.integer "freight"
|
23
|
-
t.integer "mail"
|
24
|
-
t.integer "ramp_to_ramp"
|
25
|
-
t.integer "air_time"
|
26
|
-
t.float "load_factor"
|
27
|
-
t.float "freight_share"
|
28
|
-
t.integer "distance"
|
29
|
-
t.integer "departures_scheduled"
|
30
|
-
t.string "unique_carrier"
|
31
|
-
t.integer "dot_airline_id"
|
32
|
-
t.string "unique_carrier_name"
|
33
|
-
t.string "unique_carrier_entity"
|
34
|
-
t.string "region"
|
35
|
-
t.string "carrier"
|
36
|
-
t.string "carrier_name"
|
37
|
-
t.integer "carrier_group"
|
38
|
-
t.integer "carrier_group_new"
|
39
|
-
t.string "origin_airport_iata"
|
40
|
-
t.string "origin_city_name"
|
41
|
-
t.integer "origin_city_num"
|
42
|
-
t.string "origin_state_abr"
|
43
|
-
t.string "origin_state_fips"
|
44
|
-
t.string "origin_state_nm"
|
45
|
-
t.string "origin_country_iso_3166"
|
46
|
-
t.string "origin_country_name"
|
47
|
-
t.integer "origin_wac"
|
48
|
-
t.string "dest_airport_iata"
|
49
|
-
t.string "dest_city_name"
|
50
|
-
t.integer "dest_city_num"
|
51
|
-
t.string "dest_state_abr"
|
52
|
-
t.string "dest_state_fips"
|
53
|
-
t.string "dest_state_nm"
|
54
|
-
t.string "dest_country_iso_3166"
|
55
|
-
t.string "dest_country_name"
|
56
|
-
t.integer "dest_wac"
|
57
|
-
t.integer "bts_aircraft_group"
|
58
|
-
t.integer "bts_aircraft_type"
|
59
|
-
t.integer "bts_aircraft_config"
|
60
|
-
t.integer "year"
|
61
|
-
t.integer "quarter"
|
62
|
-
t.integer "month"
|
63
|
-
t.integer "bts_distance_group"
|
64
|
-
t.string "bts_service_class"
|
65
|
-
t.string "data_source"
|
66
|
-
t.float "seats_per_departure"
|
67
|
-
|
68
|
-
t.string 'payload_units'
|
69
|
-
t.string 'freight_units'
|
70
|
-
t.string 'mail_units'
|
71
|
-
t.string 'distance_units'
|
72
|
-
|
73
|
-
t.datetime "created_at"
|
74
|
-
t.datetime "updated_at"
|
75
|
-
|
76
|
-
t.string "row_hash"
|
77
|
-
end
|
78
|
-
execute 'ALTER TABLE t100_flight_segments ADD PRIMARY KEY (row_hash);'
|
79
|
-
|
80
|
-
create_table 'tapped_airports', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
81
|
-
t.string 'i_am_just_here_to_get_in_the_way'
|
82
|
-
end
|
83
|
-
|
84
|
-
create_table 'airports', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
85
|
-
t.string 'iata_code'
|
86
|
-
t.string 'name'
|
87
|
-
t.string 'city'
|
88
|
-
t.string 'country_name'
|
89
|
-
t.float 'latitude'
|
90
|
-
t.float 'longitude'
|
91
|
-
t.datetime 'created_at'
|
92
|
-
t.datetime 'updated_at'
|
93
|
-
end
|
94
|
-
execute 'ALTER TABLE airports ADD PRIMARY KEY (iata_code);'
|
95
|
-
|
96
|
-
create_table "countries", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
97
|
-
t.string "iso_3166"
|
98
|
-
t.string "name"
|
99
|
-
t.datetime "created_at"
|
100
|
-
t.datetime "updated_at"
|
101
|
-
end
|
102
|
-
execute "ALTER TABLE countries ADD PRIMARY KEY (iso_3166);"
|
103
|
-
|
104
|
-
create_table "census_regions", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
105
|
-
t.integer "number"
|
106
|
-
t.string "name"
|
107
|
-
t.datetime "updated_at"
|
108
|
-
t.datetime "created_at"
|
109
|
-
end
|
110
|
-
execute "ALTER TABLE census_regions ADD PRIMARY KEY (number);"
|
111
|
-
|
112
|
-
create_table 'census_divisions', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
113
|
-
t.integer 'number'
|
114
|
-
t.string 'name'
|
115
|
-
t.datetime 'updated_at'
|
116
|
-
t.datetime 'created_at'
|
117
|
-
t.string 'census_region_name'
|
118
|
-
t.integer 'census_region_number'
|
119
|
-
|
120
|
-
end
|
121
|
-
execute 'ALTER TABLE census_divisions ADD PRIMARY KEY (number);'
|
122
|
-
|
123
|
-
create_table 'census_division_deux', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
124
|
-
t.integer 'number'
|
125
|
-
t.string 'name'
|
126
|
-
t.datetime 'updated_at'
|
127
|
-
t.datetime 'created_at'
|
128
|
-
t.string 'census_region_name'
|
129
|
-
t.integer 'census_region_number'
|
130
|
-
|
131
|
-
end
|
132
|
-
execute 'ALTER TABLE census_division_deux ADD PRIMARY KEY (number);'
|
133
|
-
|
134
|
-
create_table 'crosscalling_census_divisions', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
135
|
-
t.integer 'number'
|
136
|
-
t.string 'name'
|
137
|
-
t.datetime 'updated_at'
|
138
|
-
t.datetime 'created_at'
|
139
|
-
t.string 'census_region_name'
|
140
|
-
t.integer 'census_region_number'
|
141
|
-
|
142
|
-
end
|
143
|
-
execute 'ALTER TABLE crosscalling_census_divisions ADD PRIMARY KEY (number);'
|
144
|
-
|
145
|
-
create_table "automobile_variants", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
146
|
-
t.float "fuel_efficiency_city"
|
147
|
-
t.float "fuel_efficiency_highway"
|
148
|
-
t.string "make_name"
|
149
|
-
t.string "model_name"
|
150
|
-
t.string "year"
|
151
|
-
t.string "fuel_type_code"
|
152
|
-
t.datetime "updated_at"
|
153
|
-
t.datetime "created_at"
|
154
|
-
t.string "transmission"
|
155
|
-
t.string "drive"
|
156
|
-
t.boolean "turbo"
|
157
|
-
t.boolean "supercharger"
|
158
|
-
t.integer "cylinders"
|
159
|
-
t.float "displacement"
|
160
|
-
t.float "raw_fuel_efficiency_city"
|
161
|
-
t.float "raw_fuel_efficiency_highway"
|
162
|
-
t.integer "carline_mfr_code"
|
163
|
-
t.integer "vi_mfr_code"
|
164
|
-
t.integer "carline_code"
|
165
|
-
t.integer "carline_class_code"
|
166
|
-
t.boolean "injection"
|
167
|
-
t.string "carline_class_name"
|
168
|
-
t.string "speeds"
|
169
|
-
|
170
|
-
t.string 'raw_fuel_efficiency_highway_units'
|
171
|
-
t.string 'raw_fuel_efficiency_city_units'
|
172
|
-
t.string 'fuel_efficiency_highway_units'
|
173
|
-
t.string 'fuel_efficiency_city_units'
|
174
|
-
|
175
|
-
t.string "row_hash"
|
176
|
-
end
|
177
|
-
execute "ALTER TABLE automobile_variants ADD PRIMARY KEY (row_hash);"
|
178
|
-
|
179
|
-
create_table "automobile_fuel_types", :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
180
|
-
t.string "name"
|
181
|
-
t.datetime "created_at"
|
182
|
-
t.datetime "updated_at"
|
183
|
-
t.float "emission_factor"
|
184
|
-
t.string "emission_factor_units"
|
185
|
-
t.float "annual_distance"
|
186
|
-
t.string "annual_distance_units"
|
187
|
-
t.string "code"
|
188
|
-
end
|
189
|
-
execute "ALTER TABLE automobile_fuel_types ADD PRIMARY KEY (code);"
|
190
|
-
|
191
|
-
create_table "residential_energy_consumption_survey_responses", :options => 'ENGINE=InnoDB default charset=utf8', :id => false, :force => true do |t|
|
192
|
-
t.integer "department_of_energy_identifier"
|
193
|
-
|
194
|
-
t.string "residence_class"
|
195
|
-
t.date "construction_year"
|
196
|
-
t.string "construction_period"
|
197
|
-
t.string "urbanity"
|
198
|
-
t.string "dishwasher_use"
|
199
|
-
t.string "central_ac_use"
|
200
|
-
t.string "window_ac_use"
|
201
|
-
t.string "clothes_washer_use"
|
202
|
-
t.string "clothes_dryer_use"
|
203
|
-
|
204
|
-
t.integer "census_division_number"
|
205
|
-
t.string "census_division_name"
|
206
|
-
t.integer "census_region_number"
|
207
|
-
t.string "census_region_name"
|
208
|
-
|
209
|
-
t.float "rooms"
|
210
|
-
t.float "floorspace"
|
211
|
-
t.integer "residents"
|
212
|
-
t.boolean "ownership"
|
213
|
-
t.boolean "thermostat_programmability"
|
214
|
-
t.integer "refrigerator_count"
|
215
|
-
t.integer "freezer_count"
|
216
|
-
t.float "annual_energy_from_fuel_oil_for_heating_space"
|
217
|
-
t.float "annual_energy_from_fuel_oil_for_heating_water"
|
218
|
-
t.float "annual_energy_from_fuel_oil_for_appliances"
|
219
|
-
t.float "annual_energy_from_natural_gas_for_heating_space"
|
220
|
-
t.float "annual_energy_from_natural_gas_for_heating_water"
|
221
|
-
t.float "annual_energy_from_natural_gas_for_appliances"
|
222
|
-
t.float "annual_energy_from_propane_for_heating_space"
|
223
|
-
t.float "annual_energy_from_propane_for_heating_water"
|
224
|
-
t.float "annual_energy_from_propane_for_appliances"
|
225
|
-
t.float "annual_energy_from_wood"
|
226
|
-
t.float "annual_energy_from_kerosene"
|
227
|
-
t.float "annual_energy_from_electricity_for_clothes_driers"
|
228
|
-
t.float "annual_energy_from_electricity_for_dishwashers"
|
229
|
-
t.float "annual_energy_from_electricity_for_freezers"
|
230
|
-
t.float "annual_energy_from_electricity_for_refrigerators"
|
231
|
-
t.float "annual_energy_from_electricity_for_air_conditioners"
|
232
|
-
t.float "annual_energy_from_electricity_for_heating_space"
|
233
|
-
t.float "annual_energy_from_electricity_for_heating_water"
|
234
|
-
t.float "annual_energy_from_electricity_for_other_appliances"
|
235
|
-
t.float "weighting"
|
236
|
-
t.float "lighting_use"
|
237
|
-
t.float "lighting_efficiency"
|
238
|
-
t.integer "heating_degree_days"
|
239
|
-
t.integer "cooling_degree_days"
|
240
|
-
t.integer "total_rooms"
|
241
|
-
t.integer "bathrooms"
|
242
|
-
t.integer "halfbaths"
|
243
|
-
t.integer "heated_garage"
|
244
|
-
t.integer "attached_1car_garage"
|
245
|
-
t.integer "detached_1car_garage"
|
246
|
-
t.integer "attached_2car_garage"
|
247
|
-
t.integer "detached_2car_garage"
|
248
|
-
t.integer "attached_3car_garage"
|
249
|
-
t.integer "detached_3car_garage"
|
250
|
-
t.integer "lights_on_1_to_4_hours"
|
251
|
-
t.integer "efficient_lights_on_1_to_4_hours"
|
252
|
-
t.integer "lights_on_4_to_12_hours"
|
253
|
-
t.integer "efficient_lights_on_4_to_12_hours"
|
254
|
-
t.integer "lights_on_over_12_hours"
|
255
|
-
t.integer "efficient_lights_on_over_12_hours"
|
256
|
-
t.integer "outdoor_all_night_lights"
|
257
|
-
t.integer "outdoor_all_night_gas_lights"
|
258
|
-
|
259
|
-
t.datetime "created_at"
|
260
|
-
t.datetime "updated_at"
|
261
|
-
end
|
262
|
-
execute "ALTER TABLE residential_energy_consumption_survey_responses ADD PRIMARY KEY (department_of_energy_identifier);"
|
263
|
-
|
264
|
-
create_table 'aircraft', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
265
|
-
t.string 'icao_code'
|
266
|
-
t.string 'manufacturer_name'
|
267
|
-
t.string 'name'
|
268
|
-
|
269
|
-
t.string "bts_name"
|
270
|
-
t.string "bts_aircraft_type_code"
|
271
|
-
|
272
|
-
t.string 'brighter_planet_aircraft_class_code'
|
273
|
-
# t.float 'm3'
|
274
|
-
# t.float 'm2'
|
275
|
-
# t.float 'm1'
|
276
|
-
# t.float 'endpoint_fuel'
|
277
|
-
t.datetime 'updated_at'
|
278
|
-
t.datetime 'created_at'
|
279
|
-
end
|
280
|
-
execute 'ALTER TABLE aircraft ADD PRIMARY KEY (icao_code);'
|
281
|
-
|
282
|
-
create_table 'aircraft_deux', :force => true, :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
283
|
-
t.string 'icao_code'
|
284
|
-
t.string 'manufacturer_name'
|
285
|
-
t.string 'name'
|
286
|
-
|
287
|
-
t.string "bts_name"
|
288
|
-
t.string "bts_aircraft_type_code"
|
289
|
-
|
290
|
-
# t.string 'brighter_planet_aircraft_class_code'
|
291
|
-
# t.float 'm3'
|
292
|
-
# t.float 'm2'
|
293
|
-
# t.float 'm1'
|
294
|
-
# t.float 'endpoint_fuel'
|
295
|
-
t.datetime 'updated_at'
|
296
|
-
t.datetime 'created_at'
|
297
|
-
end
|
298
|
-
execute 'ALTER TABLE aircraft_deux ADD PRIMARY KEY (icao_code);'
|
299
|
-
end
|
300
|
-
|
301
|
-
DataMiner::Run.create_tables
|
302
|
-
end
|
303
|
-
|
304
|
-
def load_models
|
305
|
-
load_schema
|
306
|
-
|
307
|
-
Dir.glob(File.expand_path('*.rb', File.dirname(__FILE__))).each do |lib|
|
308
|
-
require lib
|
309
|
-
end
|
310
|
-
end
|
311
|
-
end
|