data_miner 0.3.13 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +30 -32
- data/VERSION +1 -1
- data/data_miner.gemspec +2 -3
- data/lib/data_miner.rb +0 -4
- data/lib/data_miner/attribute.rb +103 -137
- data/lib/data_miner/configuration.rb +7 -9
- data/lib/data_miner/dictionary.rb +1 -1
- data/lib/data_miner/import.rb +24 -29
- data/test/data_miner_test.rb +341 -505
- data/test/test_helper.rb +0 -99
- metadata +2 -3
- data/lib/data_miner/william_james_cartesian_product.rb +0 -11
@@ -30,19 +30,15 @@ module DataMiner
|
|
30
30
|
options = args.last
|
31
31
|
|
32
32
|
self.runnable_counter += 1
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
def after_invoke
|
37
|
-
if unique_indices.empty?
|
38
|
-
raise(MissingHashColumn, "No unique_index defined for #{resource.name}, so you need a row_hash:string column.") unless resource.column_names.include?('row_hash')
|
39
|
-
unique_indices.add 'row_hash'
|
40
|
-
end
|
41
|
-
runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
|
33
|
+
runnable = DataMiner::Import.new self, runnable_counter, description, options
|
34
|
+
Blockenspiel.invoke block, runnable
|
35
|
+
runnables << runnable
|
42
36
|
end
|
43
37
|
|
44
38
|
# Mine data for this class.
|
45
39
|
def run(options = {})
|
40
|
+
options.symbolize_keys!
|
41
|
+
|
46
42
|
finished = false
|
47
43
|
run = DataMiner::Run.create! :started_at => Time.now, :resource_name => resource.name
|
48
44
|
resource.delete_all if options[:from_scratch]
|
@@ -63,6 +59,8 @@ module DataMiner
|
|
63
59
|
# Options
|
64
60
|
# * <tt>:resource_names</tt>: array of resource (class) names to mine
|
65
61
|
def run(options = {})
|
62
|
+
options.symbolize_keys!
|
63
|
+
|
66
64
|
resource_names.each do |resource_name|
|
67
65
|
if options[:resource_names].blank? or options[:resource_names].include?(resource_name)
|
68
66
|
resource_name.constantize.data_miner_config.run options
|
@@ -15,7 +15,7 @@ module DataMiner
|
|
15
15
|
|
16
16
|
def find(key_name, key, value_name, options = {})
|
17
17
|
if match = table.rows.detect { |row| normalize_for_comparison(key, options) == normalize_for_comparison(row[key_name], options) }
|
18
|
-
match[value_name].to_s
|
18
|
+
match[value_name].to_s
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
data/lib/data_miner/import.rb
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
module DataMiner
|
2
2
|
class Import
|
3
|
+
include Blockenspiel::DSL
|
4
|
+
|
5
|
+
attr_reader :attributes
|
3
6
|
attr_accessor :configuration, :position_in_run, :options, :table, :errata
|
4
7
|
attr_accessor :description
|
5
8
|
delegate :resource, :to => :configuration
|
6
|
-
delegate :unique_indices, :to => :configuration
|
7
9
|
|
8
|
-
def initialize(configuration, position_in_run, description, options = {}
|
10
|
+
def initialize(configuration, position_in_run, description, options = {})
|
11
|
+
options.symbolize_keys!
|
12
|
+
@options = options
|
13
|
+
|
14
|
+
@attributes = ActiveSupport::OrderedHash.new
|
9
15
|
@configuration = configuration
|
10
16
|
@position_in_run = position_in_run
|
11
17
|
@description = description
|
12
|
-
@options = options
|
13
|
-
yield self if block_given? # pull in attributes
|
14
18
|
@errata = Errata.new(:url => options[:errata], :klass => resource) if options[:errata]
|
15
19
|
@table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
|
16
20
|
end
|
@@ -19,17 +23,18 @@ module DataMiner
|
|
19
23
|
"Import(#{resource}) position #{position_in_run} (#{description})"
|
20
24
|
end
|
21
25
|
|
22
|
-
def attributes
|
23
|
-
configuration.attributes.reject { |k, v| !v.stored_by? self }
|
24
|
-
end
|
25
|
-
|
26
26
|
def stores?(attr_name)
|
27
|
-
|
27
|
+
attributes.has_key? attr_name
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
def store(attr_name, attr_options = {})
|
31
|
-
|
32
|
-
|
31
|
+
raise "[data_miner gem] Column #{attr_name} doesn't exist on table #{resource.table_name}" unless resource.column_names.include?(attr_name)
|
32
|
+
attributes[attr_name] = Attribute.new self, attr_name, attr_options
|
33
|
+
end
|
34
|
+
|
35
|
+
def key(attr_name, attr_options = {})
|
36
|
+
@key = attr_name
|
37
|
+
store attr_name, attr_options
|
33
38
|
end
|
34
39
|
|
35
40
|
def run(run)
|
@@ -38,25 +43,15 @@ module DataMiner
|
|
38
43
|
next if errata.rejects?(row)
|
39
44
|
errata.correct!(row)
|
40
45
|
end
|
41
|
-
|
42
|
-
unifying_values = unique_indices.map do |attr_name|
|
43
|
-
[ attributes[attr_name].value_from_row(self, row) ]
|
44
|
-
end
|
45
46
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
changes = attributes.values.map { |attr| attr.set_record_from_row self, record, row }
|
53
|
-
record.data_miner_touch_count ||= 0
|
54
|
-
if changes.any?
|
55
|
-
record.data_miner_touch_count += 1
|
56
|
-
record.data_miner_last_run = run
|
57
|
-
end
|
58
|
-
record.save!
|
47
|
+
record = resource.send "find_or_initialize_by_#{@key}", attributes[@key].value_from_row(row)
|
48
|
+
changes = attributes.map { |_, attr| attr.set_record_from_row record, row }
|
49
|
+
record.data_miner_touch_count ||= 0
|
50
|
+
if changes.any?
|
51
|
+
record.data_miner_touch_count += 1
|
52
|
+
record.data_miner_last_run = run
|
59
53
|
end
|
54
|
+
record.save!
|
60
55
|
end
|
61
56
|
DataMiner.logger.info "performed #{inspect}"
|
62
57
|
end
|
data/test/data_miner_test.rb
CHANGED
@@ -1,284 +1,9 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
|
-
module FuelEconomyGuide
|
4
|
-
TRANSMISSIONS = {
|
5
|
-
'A' => 'automatic',
|
6
|
-
'M' => 'manual',
|
7
|
-
'L' => 'automatic', # Lockup/automatic
|
8
|
-
'S' => 'semiautomatic', # Semiautomatic
|
9
|
-
'C' => 'manual' # TODO verify for VW Syncro
|
10
|
-
}
|
11
|
-
|
12
|
-
ENGINE_TYPES = {
|
13
|
-
'(GUZZLER)' => nil, # "gas guzzler"
|
14
|
-
'(POLICE)' => nil, # police automobile_variant
|
15
|
-
'(MPFI)' => 'injection',
|
16
|
-
'(MPI*)' => 'injection',
|
17
|
-
'(SPFI)' => 'injection',
|
18
|
-
'(FFS)' => 'injection',
|
19
|
-
'(TURBO)' => 'turbo',
|
20
|
-
'(TRBO)' => 'turbo',
|
21
|
-
'(TC*)' => 'turbo',
|
22
|
-
'(FFS,TRBO)' => %w(injection turbo),
|
23
|
-
'(S-CHARGE)' => 'supercharger',
|
24
|
-
'(SC*)' => 'supercharger',
|
25
|
-
'(DIESEL)' => nil, # diesel
|
26
|
-
'(DSL)' => nil, # diesel
|
27
|
-
'(ROTARY)' => nil, # rotary
|
28
|
-
'(VARIABLE)' => nil, # variable displacement
|
29
|
-
'(NO-CAT)' => nil, # no catalytic converter
|
30
|
-
'(OHC)' => nil, # overhead camshaft
|
31
|
-
'(OHV)' => nil, # overhead valves
|
32
|
-
'(16-VALVE)' => nil, # 16V
|
33
|
-
'(305)' => nil, # 305 cubic inch displacement
|
34
|
-
'(307)' => nil, # 307 cubic inch displacement
|
35
|
-
'(M-ENG)' => nil,
|
36
|
-
'(W-ENG)' => nil,
|
37
|
-
'(GM-BUICK)' => nil,
|
38
|
-
'(GM-CHEV)' => nil,
|
39
|
-
'(GM-OLDS)' => nil,
|
40
|
-
'(GM-PONT)' => nil,
|
41
|
-
}
|
42
|
-
|
43
|
-
class ParserB
|
44
|
-
attr_accessor :year
|
45
|
-
def initialize(options = {})
|
46
|
-
@year = options[:year]
|
47
|
-
end
|
48
|
-
|
49
|
-
def apply(row)
|
50
|
-
row.merge!({
|
51
|
-
'make' => row['carline_mfr_name'], # make it line up with the errata
|
52
|
-
'model' => row['carline_name'], # ditto
|
53
|
-
'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
|
54
|
-
'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
|
55
|
-
'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
|
56
|
-
'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
|
57
|
-
'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
|
58
|
-
'displacement' => _displacement(row['opt_disp']),
|
59
|
-
'year' => year
|
60
|
-
})
|
61
|
-
row
|
62
|
-
end
|
63
|
-
|
64
|
-
def _displacement(str)
|
65
|
-
str = str.gsub(/[\(\)]/, '').strip
|
66
|
-
if str =~ /^(.+)L$/
|
67
|
-
$1.to_f
|
68
|
-
elsif str =~ /^(.+)CC$/
|
69
|
-
$1.to_f / 1000
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
def add_hints!(bus)
|
74
|
-
bus[:format] = :fixed_width
|
75
|
-
bus[:cut] = '13-' if year == 1995
|
76
|
-
bus[:schema_name] = :fuel_economy_guide_b
|
77
|
-
bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
|
78
|
-
Slither.define :fuel_economy_guide_b do |d|
|
79
|
-
d.rows do |row|
|
80
|
-
row.trap { true } # there's only one section
|
81
|
-
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
82
|
-
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
83
|
-
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
84
|
-
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
85
|
-
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
86
|
-
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
87
|
-
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
88
|
-
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
89
|
-
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
90
|
-
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
91
|
-
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
92
|
-
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
93
|
-
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
94
|
-
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
95
|
-
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
96
|
-
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
97
|
-
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
98
|
-
row.spacer 2
|
99
|
-
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
100
|
-
row.spacer 2
|
101
|
-
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
102
|
-
row.spacer 2
|
103
|
-
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
104
|
-
row.spacer 2
|
105
|
-
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
106
|
-
row.spacer 2
|
107
|
-
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
108
|
-
row.spacer 2
|
109
|
-
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
110
|
-
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
111
|
-
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
112
|
-
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
113
|
-
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
114
|
-
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
115
|
-
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
116
|
-
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
117
|
-
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
118
|
-
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
119
|
-
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
120
|
-
row.column 'filler' , 1, :type => :string # NOT USED
|
121
|
-
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
122
|
-
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
127
|
-
class ParserC
|
128
|
-
attr_accessor :year
|
129
|
-
def initialize(options = {})
|
130
|
-
@year = options[:year]
|
131
|
-
end
|
132
|
-
|
133
|
-
def add_hints!(bus)
|
134
|
-
# File will decide format based on filename
|
135
|
-
end
|
136
|
-
|
137
|
-
def apply(row)
|
138
|
-
row.merge!({
|
139
|
-
'make' => row['Manufacturer'], # make it line up with the errata
|
140
|
-
'model' => row['carline name'], # ditto
|
141
|
-
'drive' => row['drv'] + 'WD',
|
142
|
-
'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
|
143
|
-
'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
|
144
|
-
'turbo' => row['T'] == 'T',
|
145
|
-
'supercharger' => row['S'] == 'S',
|
146
|
-
'injection' => true,
|
147
|
-
'year' => year
|
148
|
-
})
|
149
|
-
row
|
150
|
-
end
|
151
|
-
end
|
152
|
-
class ParserD
|
153
|
-
attr_accessor :year
|
154
|
-
def initialize(options = {})
|
155
|
-
@year = options[:year]
|
156
|
-
end
|
157
|
-
|
158
|
-
def add_hints!(bus)
|
159
|
-
bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
|
160
|
-
end
|
161
|
-
|
162
|
-
def apply(row)
|
163
|
-
row.merge!({
|
164
|
-
'make' => row['MFR'], # make it line up with the errata
|
165
|
-
'model' => row['CAR LINE'], # ditto
|
166
|
-
'drive' => row['DRIVE SYS'] + 'WD',
|
167
|
-
'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
|
168
|
-
'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
|
169
|
-
'turbo' => row['TURBO'] == 'T',
|
170
|
-
'supercharger' => row['SPCHGR'] == 'S',
|
171
|
-
'injection' => true,
|
172
|
-
'year' => year
|
173
|
-
})
|
174
|
-
row
|
175
|
-
end
|
176
|
-
end
|
177
|
-
end
|
178
|
-
|
179
|
-
class AutomobileMakeYear < ActiveRecord::Base
|
180
|
-
set_primary_key :row_hash
|
181
|
-
|
182
|
-
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
183
|
-
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
184
|
-
has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
|
185
|
-
|
186
|
-
data_miner do
|
187
|
-
process :derive_from_make_fleet_years
|
188
|
-
process :derive_association_to_make_fleet_years
|
189
|
-
process :derive_fuel_efficiency
|
190
|
-
process :derive_volume
|
191
|
-
end
|
192
|
-
|
193
|
-
# validates_numericality_of :fuel_efficiency, :greater_than => 0, :allow_nil => true
|
194
|
-
|
195
|
-
class << self
|
196
|
-
def derive_from_make_fleet_years
|
197
|
-
AutomobileMakeFleetYear.find_in_batches do |batch|
|
198
|
-
batch.each do |record|
|
199
|
-
#puts " * Considering AMFY #{record.inspect}"
|
200
|
-
if record.make and record.model_year
|
201
|
-
find_or_create_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
202
|
-
end
|
203
|
-
end
|
204
|
-
end
|
205
|
-
end
|
206
|
-
|
207
|
-
def derive_association_to_make_fleet_years
|
208
|
-
AutomobileMakeFleetYear.find_in_batches do |batch|
|
209
|
-
batch.each do |record|
|
210
|
-
if record.make and record.model_year
|
211
|
-
record.make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
212
|
-
record.save! if record.changed?
|
213
|
-
end
|
214
|
-
end
|
215
|
-
end
|
216
|
-
end
|
217
|
-
|
218
|
-
def derive_fuel_efficiency
|
219
|
-
AutomobileMakeFleetYear.find_in_batches do |batch|
|
220
|
-
batch.each do |record|
|
221
|
-
if record.make and record.model_year
|
222
|
-
make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
223
|
-
# make_year.fuel_efficiency = make_year.fleet_years.weighted_average :fuel_efficiency, :by => :volume
|
224
|
-
make_year.save!
|
225
|
-
end
|
226
|
-
end
|
227
|
-
end
|
228
|
-
end
|
229
|
-
|
230
|
-
def derive_volume
|
231
|
-
find_in_batches do |batch|
|
232
|
-
batch.each do |record|
|
233
|
-
record.volume = record.fleet_years.collect(&:volume).sum
|
234
|
-
record.save!
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
238
|
-
end
|
239
|
-
end
|
240
|
-
|
241
|
-
class AutomobileMakeFleetYear < ActiveRecord::Base
|
242
|
-
set_primary_key :row_hash
|
243
|
-
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
244
|
-
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
245
|
-
belongs_to :make_year, :class_name => 'AutomobileMakeYear', :foreign_key => 'automobile_make_year_id'
|
246
|
-
|
247
|
-
data_miner do
|
248
|
-
# CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
|
249
|
-
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
|
250
|
-
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
|
251
|
-
:select => lambda { |row| row['volume'].to_i > 0 } do |attr|
|
252
|
-
attr.store 'make_name', :field_name => 'manufacturer_name' # prefix
|
253
|
-
attr.store 'year', :field_name => 'year_content'
|
254
|
-
attr.store 'fleet', :chars => 2..3
|
255
|
-
attr.store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
256
|
-
attr.store 'volume'
|
257
|
-
end
|
258
|
-
end
|
259
|
-
end
|
260
|
-
|
261
|
-
class AutomobileModelYear < ActiveRecord::Base
|
262
|
-
set_primary_key :year
|
263
|
-
|
264
|
-
has_many :make_years, :class_name => 'AutomobileMakeYear'
|
265
|
-
has_many :variants, :class_name => 'AutomobileVariant'
|
266
|
-
|
267
|
-
data_miner do
|
268
|
-
unique_index 'year'
|
269
|
-
|
270
|
-
# await :other_class => AutomobileMakeYear do |deferred|
|
271
|
-
# # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => :volume
|
272
|
-
# end
|
273
|
-
end
|
274
|
-
end
|
275
|
-
|
276
3
|
class AutomobileFuelType < ActiveRecord::Base
|
277
4
|
set_primary_key :code
|
278
5
|
|
279
6
|
data_miner do
|
280
|
-
unique_index 'code'
|
281
|
-
|
282
7
|
import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
283
8
|
:filename => 'Gd6-dsc.txt',
|
284
9
|
:format => :fixed_width,
|
@@ -287,21 +12,24 @@ class AutomobileFuelType < ActiveRecord::Base
|
|
287
12
|
:select => lambda { |row| /\A[A-Z]/.match row[:code] },
|
288
13
|
:schema => [[ 'code', 2, { :type => :string } ],
|
289
14
|
[ 'spacer', 2 ],
|
290
|
-
[ 'name', 52, { :type => :string } ]]) do
|
291
|
-
|
15
|
+
[ 'name', 52, { :type => :string } ]]) do
|
16
|
+
key 'code'
|
17
|
+
store 'name'
|
292
18
|
end
|
293
19
|
|
294
|
-
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do
|
295
|
-
|
296
|
-
|
297
|
-
|
20
|
+
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do
|
21
|
+
key 'code'
|
22
|
+
store 'name'
|
23
|
+
store 'annual_distance'
|
24
|
+
store 'emission_factor'
|
298
25
|
end
|
299
26
|
|
300
27
|
# pull electricity emission factor from residential electricity
|
301
28
|
import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
|
302
|
-
:select => lambda { |row| row['code'] == 'El' }) do
|
303
|
-
|
304
|
-
|
29
|
+
:select => lambda { |row| row['code'] == 'El' }) do
|
30
|
+
key 'code'
|
31
|
+
store 'name'
|
32
|
+
store 'emission_factor'
|
305
33
|
end
|
306
34
|
|
307
35
|
# still need distance estimate for electric cars
|
@@ -313,71 +41,212 @@ class AutomobileFuelType < ActiveRecord::Base
|
|
313
41
|
}
|
314
42
|
end
|
315
43
|
|
316
|
-
class
|
44
|
+
class AutomobileVariant < ActiveRecord::Base
|
317
45
|
set_primary_key :row_hash
|
318
|
-
|
319
|
-
has_many :variants, :class_name => 'AutomobileVariant'
|
320
|
-
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
321
|
-
|
322
|
-
data_miner do
|
323
|
-
# derived from FEG automobile variants
|
324
|
-
end
|
325
|
-
end
|
326
46
|
|
327
|
-
|
328
|
-
|
47
|
+
module FuelEconomyGuide
|
48
|
+
TRANSMISSIONS = {
|
49
|
+
'A' => 'automatic',
|
50
|
+
'M' => 'manual',
|
51
|
+
'L' => 'automatic', # Lockup/automatic
|
52
|
+
'S' => 'semiautomatic', # Semiautomatic
|
53
|
+
'C' => 'manual' # TODO verify for VW Syncro
|
54
|
+
}
|
55
|
+
|
56
|
+
ENGINE_TYPES = {
|
57
|
+
'(GUZZLER)' => nil, # "gas guzzler"
|
58
|
+
'(POLICE)' => nil, # police automobile_variant
|
59
|
+
'(MPFI)' => 'injection',
|
60
|
+
'(MPI*)' => 'injection',
|
61
|
+
'(SPFI)' => 'injection',
|
62
|
+
'(FFS)' => 'injection',
|
63
|
+
'(TURBO)' => 'turbo',
|
64
|
+
'(TRBO)' => 'turbo',
|
65
|
+
'(TC*)' => 'turbo',
|
66
|
+
'(FFS,TRBO)' => %w(injection turbo),
|
67
|
+
'(S-CHARGE)' => 'supercharger',
|
68
|
+
'(SC*)' => 'supercharger',
|
69
|
+
'(DIESEL)' => nil, # diesel
|
70
|
+
'(DSL)' => nil, # diesel
|
71
|
+
'(ROTARY)' => nil, # rotary
|
72
|
+
'(VARIABLE)' => nil, # variable displacement
|
73
|
+
'(NO-CAT)' => nil, # no catalytic converter
|
74
|
+
'(OHC)' => nil, # overhead camshaft
|
75
|
+
'(OHV)' => nil, # overhead valves
|
76
|
+
'(16-VALVE)' => nil, # 16V
|
77
|
+
'(305)' => nil, # 305 cubic inch displacement
|
78
|
+
'(307)' => nil, # 307 cubic inch displacement
|
79
|
+
'(M-ENG)' => nil,
|
80
|
+
'(W-ENG)' => nil,
|
81
|
+
'(GM-BUICK)' => nil,
|
82
|
+
'(GM-CHEV)' => nil,
|
83
|
+
'(GM-OLDS)' => nil,
|
84
|
+
'(GM-PONT)' => nil,
|
85
|
+
}
|
86
|
+
|
87
|
+
class ParserB
|
88
|
+
attr_accessor :year
|
89
|
+
def initialize(options = {})
|
90
|
+
@year = options[:year]
|
91
|
+
end
|
329
92
|
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
93
|
+
def apply(row)
|
94
|
+
row.merge!({
|
95
|
+
'make' => row['carline_mfr_name'], # make it line up with the errata
|
96
|
+
'model' => row['carline_name'], # ditto
|
97
|
+
'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
|
98
|
+
'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
|
99
|
+
'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
|
100
|
+
'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
|
101
|
+
'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
|
102
|
+
'displacement' => _displacement(row['opt_disp']),
|
103
|
+
'year' => year
|
104
|
+
})
|
105
|
+
row
|
106
|
+
end
|
334
107
|
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
108
|
+
def _displacement(str)
|
109
|
+
str = str.gsub(/[\(\)]/, '').strip
|
110
|
+
if str =~ /^(.+)L$/
|
111
|
+
$1.to_f
|
112
|
+
elsif str =~ /^(.+)CC$/
|
113
|
+
$1.to_f / 1000
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
def add_hints!(bus)
|
118
|
+
bus[:format] = :fixed_width
|
119
|
+
bus[:cut] = '13-' if year == 1995
|
120
|
+
bus[:schema_name] = :fuel_economy_guide_b
|
121
|
+
bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
|
122
|
+
Slither.define :fuel_economy_guide_b do |d|
|
123
|
+
d.rows do |row|
|
124
|
+
row.trap { true } # there's only one section
|
125
|
+
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
126
|
+
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
127
|
+
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
128
|
+
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
129
|
+
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
130
|
+
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
131
|
+
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
132
|
+
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
133
|
+
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
134
|
+
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
135
|
+
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
136
|
+
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
137
|
+
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
138
|
+
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
139
|
+
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
140
|
+
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
141
|
+
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
142
|
+
row.spacer 2
|
143
|
+
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
144
|
+
row.spacer 2
|
145
|
+
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
146
|
+
row.spacer 2
|
147
|
+
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
148
|
+
row.spacer 2
|
149
|
+
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
150
|
+
row.spacer 2
|
151
|
+
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
152
|
+
row.spacer 2
|
153
|
+
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
154
|
+
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
155
|
+
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
156
|
+
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
157
|
+
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
158
|
+
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
159
|
+
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
160
|
+
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
161
|
+
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
162
|
+
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
163
|
+
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
164
|
+
row.column 'filler' , 1, :type => :string # NOT USED
|
165
|
+
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
166
|
+
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
340
170
|
end
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
end
|
171
|
+
class ParserC
|
172
|
+
attr_accessor :year
|
173
|
+
def initialize(options = {})
|
174
|
+
@year = options[:year]
|
175
|
+
end
|
346
176
|
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
177
|
+
def add_hints!(bus)
|
178
|
+
# File will decide format based on filename
|
179
|
+
end
|
180
|
+
|
181
|
+
def apply(row)
|
182
|
+
row.merge!({
|
183
|
+
'make' => row['Manufacturer'], # make it line up with the errata
|
184
|
+
'model' => row['carline name'], # ditto
|
185
|
+
'drive' => row['drv'] + 'WD',
|
186
|
+
'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
|
187
|
+
'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
|
188
|
+
'turbo' => row['T'] == 'T',
|
189
|
+
'supercharger' => row['S'] == 'S',
|
190
|
+
'injection' => true,
|
191
|
+
'year' => year
|
192
|
+
})
|
193
|
+
row
|
194
|
+
end
|
195
|
+
end
|
196
|
+
class ParserD
|
197
|
+
attr_accessor :year
|
198
|
+
def initialize(options = {})
|
199
|
+
@year = options[:year]
|
200
|
+
end
|
354
201
|
|
202
|
+
def add_hints!(bus)
|
203
|
+
bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
|
204
|
+
end
|
205
|
+
|
206
|
+
def apply(row)
|
207
|
+
row.merge!({
|
208
|
+
'make' => row['MFR'], # make it line up with the errata
|
209
|
+
'model' => row['CAR LINE'], # ditto
|
210
|
+
'drive' => row['DRIVE SYS'] + 'WD',
|
211
|
+
'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
|
212
|
+
'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
|
213
|
+
'turbo' => row['TURBO'] == 'T',
|
214
|
+
'supercharger' => row['SPCHGR'] == 'S',
|
215
|
+
'injection' => true,
|
216
|
+
'year' => year
|
217
|
+
})
|
218
|
+
row
|
219
|
+
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
|
355
223
|
data_miner do
|
356
224
|
# 1985---1997
|
357
225
|
(85..97).each do |yy|
|
358
226
|
filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
|
359
227
|
import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
228
|
+
:filename => filename,
|
229
|
+
:transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
|
230
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
|
231
|
+
key 'row_hash'
|
232
|
+
store 'make_name', :field_name => 'make'
|
233
|
+
store 'model_name', :field_name => 'model'
|
234
|
+
store 'year'
|
235
|
+
store 'fuel_type_code', :field_name => 'fuel_type'
|
236
|
+
store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
237
|
+
store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
238
|
+
store 'cylinders', :field_name => 'no_cyc'
|
239
|
+
store 'drive', :field_name => 'drive_system'
|
240
|
+
store 'carline_mfr_code'
|
241
|
+
store 'vi_mfr_code'
|
242
|
+
store 'carline_code'
|
243
|
+
store 'carline_class_code', :field_name => 'carline_clss'
|
244
|
+
store 'transmission'
|
245
|
+
store 'speeds'
|
246
|
+
store 'turbo'
|
247
|
+
store 'supercharger'
|
248
|
+
store 'injection'
|
249
|
+
store 'displacement'
|
381
250
|
end
|
382
251
|
end
|
383
252
|
|
@@ -393,23 +262,24 @@ class AutomobileVariant < ActiveRecord::Base
|
|
393
262
|
2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
|
394
263
|
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
395
264
|
import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
265
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
|
266
|
+
key 'row_hash'
|
267
|
+
store 'make_name', :field_name => 'make'
|
268
|
+
store 'model_name', :field_name => 'model'
|
269
|
+
store 'fuel_type_code', :field_name => 'fl'
|
270
|
+
store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
271
|
+
store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
272
|
+
store 'cylinders', :field_name => 'cyl'
|
273
|
+
store 'displacement', :field_name => 'displ'
|
274
|
+
store 'carline_class_code', :field_name => 'cls' if year >= 2000
|
275
|
+
store 'carline_class_name', :field_name => 'Class'
|
276
|
+
store 'year'
|
277
|
+
store 'transmission'
|
278
|
+
store 'speeds'
|
279
|
+
store 'turbo'
|
280
|
+
store 'supercharger'
|
281
|
+
store 'injection'
|
282
|
+
store 'drive'
|
413
283
|
end
|
414
284
|
end
|
415
285
|
|
@@ -422,29 +292,29 @@ class AutomobileVariant < ActiveRecord::Base
|
|
422
292
|
# 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
|
423
293
|
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
424
294
|
import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
295
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
|
296
|
+
key 'row_hash'
|
297
|
+
store 'make_name', :field_name => 'make'
|
298
|
+
store 'model_name', :field_name => 'model'
|
299
|
+
store 'fuel_type_code', :field_name => 'FUEL TYPE'
|
300
|
+
store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
301
|
+
store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
302
|
+
store 'cylinders', :field_name => 'NUMB CYL'
|
303
|
+
store 'displacement', :field_name => 'DISPLACEMENT'
|
304
|
+
store 'carline_class_code', :field_name => 'CLS'
|
305
|
+
store 'carline_class_name', :field_name => 'CLASS'
|
306
|
+
store 'year'
|
307
|
+
store 'transmission'
|
308
|
+
store 'speeds'
|
309
|
+
store 'turbo'
|
310
|
+
store 'supercharger'
|
311
|
+
store 'injection'
|
312
|
+
store 'drive'
|
442
313
|
end
|
443
314
|
end
|
444
315
|
|
445
316
|
# associate :make, :key => :original_automobile_make_name, :foreign_key => :name
|
446
317
|
# derive :automobile_model_id # creates models by name
|
447
|
-
# associate :model_year, :key => :original_automobile_model_year_year, :foreign_key => :year
|
448
318
|
# associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
|
449
319
|
|
450
320
|
process 'Set adjusted fuel economy' do
|
@@ -535,34 +405,32 @@ class Country < ActiveRecord::Base
|
|
535
405
|
set_primary_key :iso_3166
|
536
406
|
|
537
407
|
data_miner do
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
attr.store 'name', :field_number => 0
|
408
|
+
import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
|
409
|
+
key 'iso_3166'
|
410
|
+
store 'iso_3166', :field_number => 1
|
411
|
+
store 'name', :field_number => 0
|
543
412
|
end
|
544
413
|
|
545
|
-
import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
|
546
|
-
|
547
|
-
|
414
|
+
import 'A Princeton dataset with better capitalization', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
|
415
|
+
key 'iso_3166'
|
416
|
+
store 'iso_3166', :field_name => 'country code'
|
417
|
+
store 'name', :field_name => 'country'
|
548
418
|
end
|
549
419
|
end
|
550
420
|
end
|
551
421
|
|
552
422
|
class Airport < ActiveRecord::Base
|
553
423
|
set_primary_key :iata_code
|
554
|
-
# belongs_to :country
|
555
424
|
|
556
425
|
data_miner do
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
attr.store 'longitude', :field_number => 7
|
426
|
+
import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
|
427
|
+
key 'iata_code'
|
428
|
+
store 'name', :field_number => 1
|
429
|
+
store 'city', :field_number => 2
|
430
|
+
store 'country_name', :field_number => 3
|
431
|
+
store 'iata_code', :field_number => 4
|
432
|
+
store 'latitude', :field_number => 6
|
433
|
+
store 'longitude', :field_number => 7
|
566
434
|
end
|
567
435
|
end
|
568
436
|
end
|
@@ -571,18 +439,18 @@ class CensusRegion < ActiveRecord::Base
|
|
571
439
|
set_primary_key :number
|
572
440
|
|
573
441
|
data_miner do
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
attr.store 'number', :field_name => 'Region'
|
442
|
+
import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
|
443
|
+
key 'number'
|
444
|
+
store 'name', :field_name => 'Name'
|
445
|
+
store 'number', :field_name => 'Region'
|
579
446
|
end
|
580
447
|
|
581
448
|
# pretend this is a different data source
|
582
449
|
# fake! just for testing purposes
|
583
|
-
import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
|
584
|
-
|
585
|
-
|
450
|
+
import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do
|
451
|
+
key 'number'
|
452
|
+
store 'name', :field_name => 'Name'
|
453
|
+
store 'number', :field_name => 'Region'
|
586
454
|
end
|
587
455
|
end
|
588
456
|
end
|
@@ -590,20 +458,14 @@ end
|
|
590
458
|
# smaller than a region
|
591
459
|
class CensusDivision < ActiveRecord::Base
|
592
460
|
set_primary_key :number
|
593
|
-
# belongs_to :census_region
|
594
|
-
# has_many :states
|
595
|
-
# has_many :zip_codes, :through => :states
|
596
|
-
# has_many :climate_divisions, :through => :states
|
597
|
-
# has_many :residence_survey_responses
|
598
461
|
|
599
462
|
data_miner do
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
attr.store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
|
463
|
+
import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Division'].to_s.strip != 'X' and row['FIPS CODE STATE'].to_s.strip == 'X'} do
|
464
|
+
key 'number'
|
465
|
+
store 'name', :field_name => 'Name'
|
466
|
+
store 'number', :field_name => 'Division'
|
467
|
+
store 'census_region_number', :field_name => 'Region'
|
468
|
+
store 'census_region_name', :field_name => 'Region', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_regions.csv' }
|
607
469
|
end
|
608
470
|
end
|
609
471
|
end
|
@@ -612,8 +474,6 @@ class ResidentialEnergyConsumptionSurveyResponse < ActiveRecord::Base
|
|
612
474
|
set_primary_key :department_of_energy_identifier
|
613
475
|
|
614
476
|
data_miner do
|
615
|
-
unique_index 'department_of_energy_identifier'
|
616
|
-
|
617
477
|
process 'Define some unit conversions' do
|
618
478
|
Conversions.register :kbtus, :joules, 1_000.0 * 1_055.05585
|
619
479
|
Conversions.register :square_feet, :square_metres, 0.09290304
|
@@ -621,70 +481,71 @@ class ResidentialEnergyConsumptionSurveyResponse < ActiveRecord::Base
|
|
621
481
|
|
622
482
|
# conversions are NOT performed here, since we first have to zero out legitimate skips
|
623
483
|
# otherwise you will get values like "999 pounds = 453.138778 kilograms" (where 999 is really a legit skip)
|
624
|
-
import 'RECs 2005 (but not converting units to metric just yet)', :url => 'http://www.eia.doe.gov/emeu/recs/recspubuse05/datafiles/RECS05alldata.csv', :headers => :upcase do
|
625
|
-
|
484
|
+
import 'RECs 2005 (but not converting units to metric just yet)', :url => 'http://www.eia.doe.gov/emeu/recs/recspubuse05/datafiles/RECS05alldata.csv', :headers => :upcase do
|
485
|
+
key 'department_of_energy_identifier'
|
486
|
+
store 'department_of_energy_identifier', :field_name => 'DOEID'
|
626
487
|
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
488
|
+
store 'residence_class', :field_name => 'TYPEHUQ', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/typehuq/typehuq.csv' }
|
489
|
+
store 'construction_year', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Date in the middle (synthetic)', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
|
490
|
+
store 'construction_period', :field_name => 'YEARMADE', :dictionary => { :input => 'Code', :sprintf => '%02d', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/yearmade/yearmade.csv' }
|
491
|
+
store 'urbanity', :field_name => 'URBRUR', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/urbrur/urbrur.csv' }
|
492
|
+
store 'dishwasher_use', :field_name => 'DWASHUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dwashuse/dwashuse.csv' }
|
493
|
+
store 'central_ac_use', :field_name => 'USECENAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usecenac/usecenac.csv' }
|
494
|
+
store 'window_ac_use', :field_name => 'USEWWAC', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/usewwac/usewwac.csv' }
|
495
|
+
store 'clothes_washer_use', :field_name => 'WASHLOAD', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/washload/washload.csv' }
|
496
|
+
store 'clothes_dryer_use', :field_name => 'DRYRUSE', :dictionary => { :input => 'Code', :output => 'Description', :url => 'http://github.com/brighterplanet/manually_curated_data/raw/master/dryruse/dryruse.csv' }
|
636
497
|
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
498
|
+
store 'census_division_number', :field_name => 'DIVISION'
|
499
|
+
store 'census_division_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
|
500
|
+
store 'census_region_number', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_number', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
|
501
|
+
store 'census_region_name', :field_name => 'DIVISION', :dictionary => { :input => 'number', :output => 'census_region_name', :url => 'http://data.brighterplanet.com/census_divisions.csv' }
|
641
502
|
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
503
|
+
store 'floorspace', :field_name => 'TOTSQFT'
|
504
|
+
store 'residents', :field_name => 'NHSLDMEM'
|
505
|
+
store 'ownership', :field_name => 'KOWNRENT'
|
506
|
+
store 'thermostat_programmability', :field_name => 'PROTHERM'
|
507
|
+
store 'refrigerator_count', :field_name => 'NUMFRIG'
|
508
|
+
store 'freezer_count', :field_name => 'NUMFREEZ'
|
509
|
+
store 'heating_degree_days', :field_name => 'HD65'
|
510
|
+
store 'cooling_degree_days', :field_name => 'CD65'
|
511
|
+
store 'annual_energy_from_fuel_oil_for_heating_space', :field_name => 'BTUFOSPH'
|
512
|
+
store 'annual_energy_from_fuel_oil_for_heating_water', :field_name => 'BTUFOWTH'
|
513
|
+
store 'annual_energy_from_fuel_oil_for_appliances', :field_name => 'BTUFOAPL'
|
514
|
+
store 'annual_energy_from_natural_gas_for_heating_space', :field_name => 'BTUNGSPH'
|
515
|
+
store 'annual_energy_from_natural_gas_for_heating_water', :field_name => 'BTUNGWTH'
|
516
|
+
store 'annual_energy_from_natural_gas_for_appliances', :field_name => 'BTUNGAPL'
|
517
|
+
store 'annual_energy_from_propane_for_heating_space', :field_name => 'BTULPSPH'
|
518
|
+
store 'annual_energy_from_propane_for_heating_water', :field_name => 'BTULPWTH'
|
519
|
+
store 'annual_energy_from_propane_for_appliances', :field_name => 'BTULPAPL'
|
520
|
+
store 'annual_energy_from_wood', :field_name => 'BTUWOOD'
|
521
|
+
store 'annual_energy_from_kerosene', :field_name => 'BTUKER'
|
522
|
+
store 'annual_energy_from_electricity_for_clothes_driers', :field_name => 'BTUELCDR'
|
523
|
+
store 'annual_energy_from_electricity_for_dishwashers', :field_name => 'BTUELDWH'
|
524
|
+
store 'annual_energy_from_electricity_for_freezers', :field_name => 'BTUELFZZ'
|
525
|
+
store 'annual_energy_from_electricity_for_refrigerators', :field_name => 'BTUELRFG'
|
526
|
+
store 'annual_energy_from_electricity_for_air_conditioners', :field_name => 'BTUELCOL'
|
527
|
+
store 'annual_energy_from_electricity_for_heating_space', :field_name => 'BTUELSPH'
|
528
|
+
store 'annual_energy_from_electricity_for_heating_water', :field_name => 'BTUELWTH'
|
529
|
+
store 'annual_energy_from_electricity_for_other_appliances', :field_name => 'BTUELAPL'
|
530
|
+
store 'weighting', :field_name => 'NWEIGHT'
|
531
|
+
store 'total_rooms', :field_name => 'TOTROOMS'
|
532
|
+
store 'bathrooms', :field_name => 'NCOMBATH'
|
533
|
+
store 'halfbaths', :field_name => 'NHAFBATH'
|
534
|
+
store 'heated_garage', :field_name => 'GARGHEAT'
|
535
|
+
store 'attached_1car_garage', :field_name => 'GARAGE1C'
|
536
|
+
store 'detached_1car_garage', :field_name => 'DGARG1C'
|
537
|
+
store 'attached_2car_garage', :field_name => 'GARAGE2C'
|
538
|
+
store 'detached_2car_garage', :field_name => 'DGARG2C'
|
539
|
+
store 'attached_3car_garage', :field_name => 'GARAGE3C'
|
540
|
+
store 'detached_3car_garage', :field_name => 'DGARG3C'
|
541
|
+
store 'lights_on_1_to_4_hours', :field_name => 'LGT1'
|
542
|
+
store 'efficient_lights_on_1_to_4_hours', :field_name => 'LGT1EE'
|
543
|
+
store 'lights_on_4_to_12_hours', :field_name => 'LGT4'
|
544
|
+
store 'efficient_lights_on_4_to_12_hours', :field_name => 'LGT4EE'
|
545
|
+
store 'lights_on_over_12_hours', :field_name => 'LGT12'
|
546
|
+
store 'efficient_lights_on_over_12_hours', :field_name => 'LGT12EE'
|
547
|
+
store 'outdoor_all_night_lights', :field_name => 'NOUTLGTNT'
|
548
|
+
store 'outdoor_all_night_gas_lights', :field_name => 'NGASLIGHT'
|
688
549
|
end
|
689
550
|
|
690
551
|
# Rather than nullify the continuous variables that EIA identifies as LEGITIMATE SKIPS, we convert them to zero
|
@@ -806,31 +667,7 @@ class DataMinerTest < Test::Unit::TestCase
|
|
806
667
|
b = CensusRegion.count
|
807
668
|
assert_equal a, b
|
808
669
|
end
|
809
|
-
|
810
|
-
should "assume that no unique indices means it wants a big hash" do
|
811
|
-
assert_raises DataMiner::MissingHashColumn do
|
812
|
-
class IncompleteCountry < ActiveRecord::Base
|
813
|
-
set_table_name 'countries'
|
814
|
-
|
815
|
-
data_miner do
|
816
|
-
# no unique index
|
817
|
-
|
818
|
-
# get a complete list
|
819
|
-
import :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
|
820
|
-
attr.store 'iso_3166', :field_number => 1
|
821
|
-
attr.store 'name', :field_number => 0
|
822
|
-
end
|
823
|
-
|
824
|
-
# get nicer names
|
825
|
-
import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
|
826
|
-
attr.store 'iso_3166', :field_name => 'country code'
|
827
|
-
attr.store 'name', :field_name => 'country'
|
828
|
-
end
|
829
|
-
end
|
830
|
-
end
|
831
|
-
end
|
832
|
-
end
|
833
|
-
|
670
|
+
|
834
671
|
should "hash things if no unique index is listed" do
|
835
672
|
AutomobileVariant.data_miner_config.runnables[0].run(nil)
|
836
673
|
assert AutomobileVariant.first.row_hash.present?
|
@@ -892,15 +729,14 @@ class DataMinerTest < Test::Unit::TestCase
|
|
892
729
|
end
|
893
730
|
|
894
731
|
if ENV['SLOW'] == 'true'
|
895
|
-
should "
|
896
|
-
|
897
|
-
assert
|
732
|
+
should "mine automobile variants" do
|
733
|
+
AutomobileVariant.run_data_miner!
|
734
|
+
assert AutomobileVariant.count('make_name LIKE "%tesla"') > 0
|
898
735
|
end
|
899
736
|
|
900
|
-
should "mine
|
901
|
-
|
902
|
-
|
903
|
-
assert_equal 'Uruguay', uy.name
|
737
|
+
should "mine residence survey day" do
|
738
|
+
ResidentialEnergyConsumptionSurveyResponse.run_data_miner!
|
739
|
+
assert ResidentialEnergyConsumptionSurveyResponse.find(6).residence_class.starts_with?('Single-family detached house')
|
904
740
|
end
|
905
741
|
end
|
906
742
|
end
|