data_miner 0.2.6 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,61 +1,55 @@
1
1
  module DataMiner
2
2
  class Configuration
3
- attr_accessor :steps, :klass, :counter, :attributes, :awaiting
3
+ include Blockenspiel::DSL
4
+
5
+ attr_accessor :klass, :runnables, :runnable_counter, :attributes, :unique_indices
4
6
 
5
7
  def initialize(klass)
6
- @steps = []
8
+ @runnables = Array.new
9
+ @unique_indices = Set.new
7
10
  @klass = klass
8
- @counter = 0
9
- @attributes = AttributeCollection.new(klass)
11
+ @runnable_counter = 0
12
+ @attributes = HashWithIndifferentAccess.new
10
13
  end
11
14
 
12
- %w(import associate derive await).each do |method|
13
- eval <<-EOS
14
- def #{method}(*args, &block)
15
- self.counter += 1
16
- if block_given? # FORM C
17
- step_options = args[0] || {}
18
- set_awaiting!(step_options)
19
- self.steps << Step::#{method.camelcase}.new(self, counter, step_options, &block)
20
- elsif args[0].is_a?(Hash) # FORM A
21
- step_options = args[0]
22
- set_awaiting!(step_options)
23
- self.steps << Step::#{method.camelcase}.new(self, counter, step_options)
24
- else # FORM B
25
- attr_name = args[0]
26
- attr_options = args[1] || {}
27
- step_options = {}
28
- set_awaiting!(step_options)
29
- self.steps << Step::#{method.camelcase}.new(self, counter, step_options) do |attr|
30
- attr.affect attr_name, attr_options
31
- end
32
- end
33
- end
34
- EOS
15
+ def unique_index(*args)
16
+ args.each { |arg| unique_indices.add arg }
35
17
  end
36
-
37
- def set_awaiting!(step_options)
38
- step_options.merge!(:awaiting => awaiting) if !awaiting.nil?
18
+
19
+ def process(callback)
20
+ self.runnable_counter += 1
21
+ runnables << DataMiner::Process.new(self, runnable_counter, callback)
39
22
  end
40
23
 
41
- def awaiting!(step)
42
- self.awaiting = step
24
+ def import(options = {}, &block)
25
+ self.runnable_counter += 1
26
+ runnables << DataMiner::Import.new(self, runnable_counter, options, &block)
27
+ end
28
+
29
+ def before_invoke
30
+ self.class.create_tables
43
31
  end
44
32
 
45
- def stop_awaiting!
46
- self.awaiting = nil
33
+ def after_invoke
34
+ if unique_indices.empty?
35
+ raise(MissingHashColumn, "No unique_index defined for #{klass.name}, so you need a row_hash:string column.") unless klass.column_names.include?('row_hash')
36
+ unique_indices.add 'row_hash'
37
+ end
38
+ runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
47
39
  end
48
40
 
49
41
  # Mine data for this class.
50
- def mine(options = {})
51
- steps.each { |step| step.perform options }
42
+ def run
43
+ target = DataMiner::Target.find_or_create_by_name klass.name
44
+ run = target.runs.create! :started_at => Time.now
45
+ begin
46
+ runnables.each(&:run)
47
+ ensure
48
+ run.update_attributes! :ended_at => Time.now
49
+ end
50
+ nil
52
51
  end
53
52
 
54
- # Map <tt>method</tt> to attributes
55
- def map_to_attrs(method)
56
- steps.map { |step| step.map_to_attrs(method) }.compact
57
- end
58
-
59
53
  cattr_accessor :classes
60
54
  self.classes = []
61
55
  class << self
@@ -63,32 +57,41 @@ module DataMiner
63
57
  #
64
58
  # Options
65
59
  # * <tt>:class_names</tt>: provide an array of class names to mine
66
- def mine(options = {})
60
+ def run(options = {})
67
61
  classes.each do |klass|
68
62
  if options[:class_names].blank? or options[:class_names].include?(klass.name)
69
- klass.data_mine.mine options
63
+ klass.data_miner_config.run
70
64
  end
71
65
  end
72
66
  end
73
67
 
74
- # Map a <tt>method</tt> to attrs. Defaults to all classes touched by DataMiner.
75
- #
76
- # Options
77
- # * <tt>:class_names</tt>: provide an array class names to mine
78
- def map_to_attrs(method, options = {})
79
- classes.map do |klass|
80
- if options[:class_names].blank? or options[:class_names].include?(klass.name)
81
- klass.data_mine.map_to_attrs method
82
- end
83
- end.flatten.compact
84
- end
85
-
86
68
  # Queue up all the ActiveRecord classes that DataMiner should touch.
87
69
  #
88
70
  # Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
89
71
  def enqueue(&block)
90
72
  yield self.classes
91
73
  end
74
+
75
+ def create_tables
76
+ c = ActiveRecord::Base.connection
77
+ unless c.table_exists?('data_miner_targets')
78
+ c.create_table 'data_miner_targets', :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
79
+ t.string 'name'
80
+ t.datetime 'created_at'
81
+ t.datetime 'updated_at'
82
+ end
83
+ c.execute 'ALTER TABLE data_miner_targets ADD PRIMARY KEY (name);'
84
+ end
85
+ unless c.table_exists?('data_miner_runs')
86
+ c.create_table 'data_miner_runs', :options => 'ENGINE=InnoDB default charset=utf8' do |t|
87
+ t.string 'data_miner_target_id'
88
+ t.datetime 'started_at'
89
+ t.datetime 'ended_at'
90
+ t.datetime 'created_at'
91
+ t.datetime 'updated_at'
92
+ end
93
+ end
94
+ end
92
95
  end
93
96
  end
94
97
  end
@@ -0,0 +1,57 @@
1
+ module DataMiner
2
+ class Import
3
+ attr_accessor :configuration, :position_in_run, :options, :table, :errata
4
+ delegate :klass, :to => :configuration
5
+ delegate :unique_indices, :to => :configuration
6
+
7
+ def initialize(configuration, position_in_run, options = {}, &block)
8
+ @configuration = configuration
9
+ @position_in_run = position_in_run
10
+ @options = options
11
+ yield self if block_given? # pull in attributes
12
+ @errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
13
+ @table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
14
+ end
15
+
16
+ def inspect
17
+ "Import(#{klass}) position #{position_in_run}"
18
+ end
19
+
20
+ def attributes
21
+ configuration.attributes.reject { |k, v| !v.stored_by? self }
22
+ end
23
+
24
+ def stores?(attr_name)
25
+ configuration.attributes[attr_name].andand.stored_by? self
26
+ end
27
+
28
+ def store(attr_name, attr_options = {})
29
+ configuration.attributes[attr_name] ||= Attribute.new(klass, attr_name)
30
+ configuration.attributes[attr_name].options_for_import[self] = attr_options
31
+ end
32
+
33
+ def run
34
+ table.each_row do |row|
35
+ if errata
36
+ next if errata.rejects?(row)
37
+ errata.correct!(row)
38
+ end
39
+
40
+ unifying_values = unique_indices.map do |attr_name|
41
+ [ attributes[attr_name].value_from_row(self, row) ]
42
+ end
43
+
44
+ record_set = WilliamJamesCartesianProduct.cart_prod(*unifying_values).map do |combination|
45
+ next if combination.include?(nil)
46
+ klass.send "find_or_initialize_by_#{unique_indices.to_a.join('_and_')}", *combination
47
+ end.flatten
48
+
49
+ Array.wrap(record_set).each do |record|
50
+ attributes.values.each { |attr| attr.set_record_from_row(self, record, row) }
51
+ record.save!
52
+ end
53
+ end
54
+ DataMiner.logger.info "performed #{inspect}"
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,21 @@
1
+ module DataMiner
2
+ class Process
3
+ attr_accessor :configuration, :position_in_run, :callback
4
+ delegate :klass, :to => :configuration
5
+
6
+ def initialize(configuration, position_in_run, callback)
7
+ @configuration = configuration
8
+ @position_in_run = position_in_run
9
+ @callback = callback
10
+ end
11
+
12
+ def inspect
13
+ "Process(#{klass}) position #{position_in_run}"
14
+ end
15
+
16
+ def run
17
+ klass.send callback
18
+ DataMiner.logger.info "ran #{inspect}"
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,7 @@
1
+ module DataMiner
2
+ class Run < ActiveRecord::Base
3
+ set_table_name 'data_miner_runs'
4
+ default_scope :order => 'id ASC'
5
+ belongs_to :target
6
+ end
7
+ end
@@ -0,0 +1,7 @@
1
+ module DataMiner
2
+ class Target < ActiveRecord::Base
3
+ set_table_name 'data_miner_targets'
4
+ set_primary_key :name
5
+ has_many :runs, :foreign_key => 'data_miner_target_id'
6
+ end
7
+ end
@@ -1,47 +1,591 @@
1
1
  require 'test_helper'
2
2
 
3
- ActiveRecord::Schema.define(:version => 20090819143429) do
4
- create_table "airports", :force => true do |t|
5
- t.string "iata_code"
6
- t.string "name"
7
- t.string "city"
8
- t.integer "country_id"
9
- t.float "latitude"
10
- t.float "longitude"
11
- t.datetime "created_at"
12
- t.datetime "updated_at"
13
- end
14
- create_table "countries", :force => true do |t|
15
- t.string "iso_3166"
16
- t.string "name"
17
- t.datetime "created_at"
18
- t.datetime "updated_at"
3
+ module FuelEconomyGuide
4
+ TRANSMISSIONS = {
5
+ 'A' => 'automatic',
6
+ 'M' => 'manual',
7
+ 'L' => 'automatic', # Lockup/automatic
8
+ 'S' => 'semiautomatic', # Semiautomatic
9
+ 'C' => 'manual' # TODO verify for VW Syncro
10
+ }
11
+
12
+ ENGINE_TYPES = {
13
+ '(GUZZLER)' => nil, # "gas guzzler"
14
+ '(POLICE)' => nil, # police automobile_variant
15
+ '(MPFI)' => 'injection',
16
+ '(MPI*)' => 'injection',
17
+ '(SPFI)' => 'injection',
18
+ '(FFS)' => 'injection',
19
+ '(TURBO)' => 'turbo',
20
+ '(TRBO)' => 'turbo',
21
+ '(TC*)' => 'turbo',
22
+ '(FFS,TRBO)' => %w(injection turbo),
23
+ '(S-CHARGE)' => 'supercharger',
24
+ '(SC*)' => 'supercharger',
25
+ '(DIESEL)' => nil, # diesel
26
+ '(DSL)' => nil, # diesel
27
+ '(ROTARY)' => nil, # rotary
28
+ '(VARIABLE)' => nil, # variable displacement
29
+ '(NO-CAT)' => nil, # no catalytic converter
30
+ '(OHC)' => nil, # overhead camshaft
31
+ '(OHV)' => nil, # overhead valves
32
+ '(16-VALVE)' => nil, # 16V
33
+ '(305)' => nil, # 305 cubic inch displacement
34
+ '(307)' => nil, # 307 cubic inch displacement
35
+ '(M-ENG)' => nil,
36
+ '(W-ENG)' => nil,
37
+ '(GM-BUICK)' => nil,
38
+ '(GM-CHEV)' => nil,
39
+ '(GM-OLDS)' => nil,
40
+ '(GM-PONT)' => nil,
41
+ }
42
+
43
+ class ParserB
44
+ attr_accessor :year
45
+ def initialize(options = {})
46
+ @year = options[:year]
47
+ end
48
+
49
+ def apply(row)
50
+ row.merge!({
51
+ 'make' => row['carline_mfr_name'], # make it line up with the errata
52
+ 'model' => row['carline_name'], # ditto
53
+ 'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
54
+ 'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
55
+ 'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
56
+ 'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
57
+ 'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
58
+ 'displacement' => _displacement(row['opt_disp']),
59
+ 'year' => year
60
+ })
61
+ row
62
+ end
63
+
64
+ def _displacement(str)
65
+ str = str.gsub(/[\(\)]/, '').strip
66
+ if str =~ /^(.+)L$/
67
+ $1.to_f
68
+ elsif str =~ /^(.+)CC$/
69
+ $1.to_f / 1000
70
+ end
71
+ end
72
+
73
+ def add_hints!(bus)
74
+ bus[:format] = :fixed_width
75
+ bus[:cut] = '13-' if year == 1995
76
+ bus[:schema_name] = :fuel_economy_guide_b
77
+ bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
78
+ Slither.define :fuel_economy_guide_b do |d|
79
+ d.rows do |row|
80
+ row.trap { true } # there's only one section
81
+ row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
82
+ row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
83
+ row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
84
+ row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
85
+ row.column 'carline_name' , 28, :type => :string # CARLINE NAME
86
+ row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
87
+ row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
88
+ row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
89
+ row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
90
+ row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
91
+ row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
92
+ row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
93
+ row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
94
+ row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
95
+ row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
96
+ row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
97
+ row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
98
+ row.spacer 2
99
+ row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
100
+ row.spacer 2
101
+ row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
102
+ row.spacer 2
103
+ row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
104
+ row.spacer 2
105
+ row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
106
+ row.spacer 2
107
+ row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
108
+ row.spacer 2
109
+ row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
110
+ row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
111
+ row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
112
+ row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
113
+ row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
114
+ row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
115
+ row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
116
+ row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
117
+ row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
118
+ row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
119
+ row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
120
+ row.column 'filler' , 1, :type => :string # NOT USED
121
+ row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
122
+ row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
123
+ end
124
+ end
125
+ end
126
+ end
127
+ class ParserC
128
+ attr_accessor :year
129
+ def initialize(options = {})
130
+ @year = options[:year]
131
+ end
132
+
133
+ def add_hints!(bus)
134
+ # File will decide format based on filename
135
+ end
136
+
137
+ def apply(row)
138
+ row.merge!({
139
+ 'make' => row['Manufacturer'], # make it line up with the errata
140
+ 'model' => row['carline name'], # ditto
141
+ 'drive' => row['drv'] + 'WD',
142
+ 'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
143
+ 'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
144
+ 'turbo' => row['T'] == 'T',
145
+ 'supercharger' => row['S'] == 'S',
146
+ 'injection' => true,
147
+ 'year' => year
148
+ })
149
+ row
150
+ end
151
+ end
152
+ class ParserD
153
+ attr_accessor :year
154
+ def initialize(options = {})
155
+ @year = options[:year]
156
+ end
157
+
158
+ def add_hints!(bus)
159
+ bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
160
+ end
161
+
162
+ def apply(row)
163
+ row.merge!({
164
+ 'make' => row['MFR'], # make it line up with the errata
165
+ 'model' => row['CAR LINE'], # ditto
166
+ 'drive' => row['DRIVE SYS'] + 'WD',
167
+ 'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
168
+ 'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
169
+ 'turbo' => row['TURBO'] == 'T',
170
+ 'supercharger' => row['SPCHGR'] == 'S',
171
+ 'injection' => true,
172
+ 'year' => year
173
+ })
174
+ row
175
+ end
176
+ end
177
+ end
178
+
179
+ class AutomobileMakeYear < ActiveRecord::Base
180
+ set_primary_key :row_hash
181
+
182
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
183
+ belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
184
+ has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
185
+
186
+ data_miner do
187
+ process :derive_from_make_fleet_years
188
+ process :derive_association_to_make_fleet_years
189
+ process :derive_fuel_efficiency
190
+ process :derive_volume
191
+ end
192
+
193
+ # validates_numericality_of :fuel_efficiency, :greater_than => 0, :allow_nil => true
194
+
195
+ class << self
196
+ def derive_from_make_fleet_years
197
+ AutomobileMakeFleetYear.find_in_batches do |batch|
198
+ batch.each do |record|
199
+ #puts " * Considering AMFY #{record.inspect}"
200
+ if record.make and record.model_year
201
+ find_or_create_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
202
+ end
203
+ end
204
+ end
205
+ end
206
+
207
+ def derive_association_to_make_fleet_years
208
+ AutomobileMakeFleetYear.find_in_batches do |batch|
209
+ batch.each do |record|
210
+ if record.make and record.model_year
211
+ record.make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
212
+ record.save! if record.changed?
213
+ end
214
+ end
215
+ end
216
+ end
217
+
218
+ def derive_fuel_efficiency
219
+ AutomobileMakeFleetYear.find_in_batches do |batch|
220
+ batch.each do |record|
221
+ if record.make and record.model_year
222
+ make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
223
+ # make_year.fuel_efficiency = make_year.fleet_years.weighted_average :fuel_efficiency, :by => :volume
224
+ make_year.save!
225
+ end
226
+ end
227
+ end
228
+ end
229
+
230
+ def derive_volume
231
+ find_in_batches do |batch|
232
+ batch.each do |record|
233
+ record.volume = record.fleet_years.collect(&:volume).sum
234
+ record.save!
235
+ end
236
+ end
237
+ end
238
+ end
239
+ end
240
+
241
+ class AutomobileMakeFleetYear < ActiveRecord::Base
242
+ set_primary_key :row_hash
243
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
244
+ belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
245
+ belongs_to :make_year, :class_name => 'AutomobileMakeYear', :foreign_key => 'automobile_make_year_id'
246
+
247
+ data_miner do
248
+ # CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
249
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
250
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
251
+ :select => lambda { |row| row['volume'].to_i > 0 } do |attr|
252
+ attr.store 'make_name', :field_name => 'manufacturer_name' # prefix
253
+ attr.store 'year', :field_name => 'year_content'
254
+ attr.store 'fleet', :chars => 2..3
255
+ attr.store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
256
+ attr.store 'volume'
257
+ end
258
+ end
259
+ end
260
+
261
+ class AutomobileModelYear < ActiveRecord::Base
262
+ set_primary_key :year
263
+
264
+ has_many :make_years, :class_name => 'AutomobileMakeYear'
265
+ has_many :variants, :class_name => 'AutomobileVariant'
266
+
267
+ data_miner do
268
+ unique_index 'year'
269
+
270
+ # await :other_class => AutomobileMakeYear do |deferred|
271
+ # # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => :volume
272
+ # end
273
+ end
274
+ end
275
+
276
+ class AutomobileFuelType < ActiveRecord::Base
277
+ set_primary_key :code
278
+
279
+ data_miner do
280
+ unique_index 'code'
281
+
282
+ import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
283
+ :filename => 'Gd6-dsc.txt',
284
+ :format => :fixed_width,
285
+ :crop => 21..26, # inclusive
286
+ :cut => '2-',
287
+ :select => lambda { |row| /\A[A-Z]/.match row[:code] },
288
+ :schema => [[ 'code', 2, { :type => :string } ],
289
+ [ 'spacer', 2 ],
290
+ [ 'name', 52, { :type => :string } ]]) do |attr|
291
+ attr.store 'name'
292
+ end
293
+
294
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do |attr|
295
+ attr.store 'name'
296
+ attr.store 'annual_distance'
297
+ attr.store 'emission_factor'
298
+ end
299
+
300
+ # pull electricity emission factor from residential electricity
301
+ import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
302
+ :select => lambda { |row| row['code'] == 'El' }) do |attr|
303
+ attr.store 'name'
304
+ attr.store 'emission_factor'
305
+ end
306
+
307
+ # still need distance estimate for electric cars
308
+ end
309
+
310
+ CODES = {
311
+ :electricity => 'El',
312
+ :diesel => 'D'
313
+ }
314
+ end
315
+
316
+ class AutomobileModel < ActiveRecord::Base
317
+ set_primary_key :row_hash
318
+
319
+ has_many :variants, :class_name => 'AutomobileVariant'
320
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
321
+
322
+ data_miner do
323
+ # derived from FEG automobile variants
324
+ end
325
+ end
326
+
327
+ class AutomobileMake < ActiveRecord::Base
328
+ set_primary_key :name
329
+
330
+ has_many :make_years, :class_name => 'AutomobileMakeYear'
331
+ has_many :models, :class_name => 'AutomobileModel'
332
+ has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
333
+ has_many :variants, :class_name => 'AutomobileVariant'
334
+
335
+ data_miner do
336
+ unique_index 'name'
337
+
338
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/makes/make_importance.csv' do |attr|
339
+ attr.store 'major'
340
+ end
341
+ # await :other_class => AutomobileMakeYear do |deferred|
342
+ # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => 'volume'
343
+ # end
344
+ end
345
+ end
346
+
347
+ class AutomobileVariant < ActiveRecord::Base
348
+ set_primary_key :row_hash
349
+
350
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
351
+ belongs_to :model, :class_name => 'AutomobileModel', :foreign_key => 'automobile_model_id'
352
+ belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
353
+ belongs_to :fuel_type, :class_name => 'AutomobileFuelType', :foreign_key => 'automobile_fuel_type_id'
354
+
355
+ data_miner do
356
+ # 1985---1997
357
+ (85..97).each do |yy|
358
+ filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
359
+ import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
360
+ :filename => filename,
361
+ :transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
362
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
363
+ attr.store 'make_name', :field_name => 'make'
364
+ attr.store 'model_name', :field_name => 'model'
365
+ attr.store 'year'
366
+ attr.store 'fuel_type_code', :field_name => 'fuel_type'
367
+ attr.store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
368
+ attr.store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
369
+ attr.store 'cylinders', :field_name => 'no_cyc'
370
+ attr.store 'drive', :field_name => 'drive_system'
371
+ attr.store 'carline_mfr_code'
372
+ attr.store 'vi_mfr_code'
373
+ attr.store 'carline_code'
374
+ attr.store 'carline_class_code', :field_name => 'carline_clss'
375
+ attr.store 'transmission'
376
+ attr.store 'speeds'
377
+ attr.store 'turbo'
378
+ attr.store 'supercharger'
379
+ attr.store 'injection'
380
+ attr.store 'displacement'
381
+ end
382
+ end
383
+
384
+ # 1998--2005
385
+ {
386
+ 1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
387
+ 1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
388
+ 2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
389
+ 2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
390
+ 2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
391
+ 2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
392
+ 2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
393
+ 2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
394
+ }.sort { |a, b| a.first <=> b.first }.each do |year, options|
395
+ import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
396
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
397
+ attr.store 'make_name', :field_name => 'make'
398
+ attr.store 'model_name', :field_name => 'model'
399
+ attr.store 'fuel_type_code', :field_name => 'fl'
400
+ attr.store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
401
+ attr.store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
402
+ attr.store 'cylinders', :field_name => 'cyl'
403
+ attr.store 'displacement', :field_name => 'displ'
404
+ attr.store 'carline_class_code', :field_name => 'cls' if year >= 2000
405
+ attr.store 'carline_class_name', :field_name => 'Class'
406
+ attr.store 'year'
407
+ attr.store 'transmission'
408
+ attr.store 'speeds'
409
+ attr.store 'turbo'
410
+ attr.store 'supercharger'
411
+ attr.store 'injection'
412
+ attr.store 'drive'
413
+ end
414
+ end
415
+
416
+ # 2006--2010
417
+ {
418
+ 2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
419
+ 2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
420
+ 2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
421
+ 2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
422
+ # 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
423
+ }.sort { |a, b| a.first <=> b.first }.each do |year, options|
424
+ import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
425
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
426
+ attr.store 'make_name', :field_name => 'make'
427
+ attr.store 'model_name', :field_name => 'model'
428
+ attr.store 'fuel_type_code', :field_name => 'FUEL TYPE'
429
+ attr.store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
430
+ attr.store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
431
+ attr.store 'cylinders', :field_name => 'NUMB CYL'
432
+ attr.store 'displacement', :field_name => 'DISPLACEMENT'
433
+ attr.store 'carline_class_code', :field_name => 'CLS'
434
+ attr.store 'carline_class_name', :field_name => 'CLASS'
435
+ attr.store 'year'
436
+ attr.store 'transmission'
437
+ attr.store 'speeds'
438
+ attr.store 'turbo'
439
+ attr.store 'supercharger'
440
+ attr.store 'injection'
441
+ attr.store 'drive'
442
+ end
443
+ end
444
+
445
+ # associate :make, :key => :original_automobile_make_name, :foreign_key => :name
446
+ # derive :automobile_model_id # creates models by name
447
+ # associate :model_year, :key => :original_automobile_model_year_year, :foreign_key => :year
448
+ # associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
449
+ process :set_adjusted_fuel_economy
450
+ end
451
+
452
+ def name
453
+ extra = []
454
+ extra << "V#{cylinders}" if cylinders
455
+ extra << "#{displacement}L" if displacement
456
+ extra << "turbo" if turbo
457
+ extra << "FI" if injection
458
+ extra << "#{speeds}spd" if speeds.present?
459
+ extra << transmission if transmission.present?
460
+ extra << "(#{fuel_type.name})" if fuel_type
461
+ extra.join(' ')
462
+ end
463
+
464
+ def fuel_economy_description
465
+ [ fuel_efficiency_city, fuel_efficiency_highway ].map { |f| f.kilometres_per_litre.to(:miles_per_gallon).round }.join('/')
466
+ end
467
+
468
+ class << self
469
+ def set_adjusted_fuel_economy
470
+ update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
471
+ update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
472
+ end
473
+
474
+ # the following matching methods are needed by the errata
475
+ # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
476
+
477
+ def transmission_is_blank?(row)
478
+ row['transmission'].blank?
479
+ end
480
+
481
+ def is_a_2007_gmc_or_chevrolet?(row)
482
+ row['year'] == 2007 and %w(GMC CHEVROLET).include? row['MFR'].upcase
483
+ end
484
+
485
+ def is_a_porsche?(row)
486
+ row['make'].upcase == 'PORSCHE'
487
+ end
488
+
489
+ def is_not_a_porsche?(row)
490
+ !is_a_porsche? row
491
+ end
492
+
493
+ def is_a_mercedes_benz?(row)
494
+ row['make'] =~ /MERCEDES/i
495
+ end
496
+
497
+ def is_a_lexus?(row)
498
+ row['make'].upcase == 'LEXUS'
499
+ end
500
+
501
+ def is_a_bmw?(row)
502
+ row['make'].upcase == 'BMW'
503
+ end
504
+
505
+ def is_a_ford?(row)
506
+ row['make'].upcase == 'FORD'
507
+ end
508
+
509
+ def is_a_rolls_royce_and_model_contains_bentley?(row)
510
+ is_a_rolls_royce?(row) and model_contains_bentley?(row)
511
+ end
512
+
513
+ def is_a_bentley?(row)
514
+ row['make'].upcase == 'BENTLEY'
515
+ end
516
+
517
+ def is_a_rolls_royce?(row)
518
+ row['make'] =~ /ROLLS/i
519
+ end
520
+
521
+ def is_a_turbo_brooklands?(row)
522
+ row['model'] =~ /TURBO R\/RL BKLDS/i
523
+ end
524
+
525
+ def model_contains_maybach?(row)
526
+ row['model'] =~ /MAYBACH/i
527
+ end
528
+
529
+ def model_contains_bentley?(row)
530
+ row['model'] =~ /BENTLEY/i
531
+ end
19
532
  end
20
533
  end
21
534
 
22
535
  class Country < ActiveRecord::Base
23
- mine_data do |step|
24
- # import country names and country codes
25
- step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
26
- attr.key :iso_3166, :name_in_source => 'country code'
27
- attr.store :iso_3166, :name_in_source => 'country code'
28
- attr.store :name, :name_in_source => 'country'
536
+ set_primary_key :iso_3166
537
+
538
+ data_miner do
539
+ unique_index 'iso_3166'
540
+
541
+ # get a complete list
542
+ import :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
543
+ attr.store 'iso_3166', :field_number => 1
544
+ attr.store 'name', :field_number => 0
545
+ end
546
+
547
+ # get nicer names
548
+ import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
549
+ attr.store 'iso_3166', :field_name => 'country code'
550
+ attr.store 'name', :field_name => 'country'
29
551
  end
30
552
  end
31
553
  end
32
554
 
33
555
  class Airport < ActiveRecord::Base
556
+ set_primary_key :iata_code
34
557
  belongs_to :country
35
- mine_data do |step|
558
+
559
+ data_miner do
560
+ unique_index 'iata_code'
561
+
36
562
  # import airport iata_code, name, etc.
37
- step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
38
- attr.key :iata_code, :field_number => 3
39
- attr.store :name, :field_number => 0
40
- attr.store :city, :field_number => 1
41
- attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
42
- attr.store :iata_code, :field_number => 3
43
- attr.store :latitude, :field_number => 5
44
- attr.store :longitude, :field_number => 6
563
+ import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? }) do |attr|
564
+ attr.store 'name', :field_number => 1
565
+ attr.store 'city', :field_number => 2
566
+ attr.store 'country_name', :field_number => 3
567
+ attr.store 'iata_code', :field_number => 4
568
+ attr.store 'latitude', :field_number => 6
569
+ attr.store 'longitude', :field_number => 7
570
+ end
571
+ end
572
+ end
573
+
574
+ class CensusRegion < ActiveRecord::Base
575
+ set_primary_key :number
576
+
577
+ data_miner do
578
+ unique_index 'number'
579
+
580
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do |attr|
581
+ attr.store 'name', :field_name => 'Name'
582
+ attr.store 'number', :field_name => 'Region'
583
+ end
584
+
585
+ # pretend this is a different data source
586
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do |attr|
587
+ attr.store 'name', :field_name => 'Name'
588
+ attr.store 'number', :field_name => 'Region'
45
589
  end
46
590
  end
47
591
  end
@@ -49,30 +593,82 @@ end
49
593
  DataMiner.enqueue do |queue|
50
594
  queue << Country
51
595
  queue << Airport
596
+ queue << CensusRegion
597
+ queue << AutomobileFuelType # OK
598
+ queue << AutomobileModel # OK
599
+ queue << AutomobileMake # OK
600
+ queue << AutomobileModelYear # OK
601
+ queue << AutomobileVariant # OK
602
+ queue << AutomobileMakeFleetYear # OK; third-party data not yet hosted on third-party site
603
+ queue << AutomobileMakeYear # OK
52
604
  end
53
605
 
54
- class DataMinerTest < Test::Unit::TestCase
55
- def teardown
56
- Airport.delete_all
57
- Country.delete_all
606
+ class DataMinerTest < Test::Unit::TestCase
607
+ should "be idempotent" do
608
+ Country.data_miner_config.run
609
+ a = Country.count
610
+ Country.data_miner_config.run
611
+ b = Country.count
612
+ assert_equal a, b
613
+
614
+ CensusRegion.data_miner_config.run
615
+ a = CensusRegion.count
616
+ CensusRegion.data_miner_config.run
617
+ b = CensusRegion.count
618
+ assert_equal a, b
58
619
  end
620
+
621
+ should "assume that no unique indices means it wants a big hash" do
622
+ assert_raises DataMiner::MissingHashColumn do
623
+ class IncompleteCountry < ActiveRecord::Base
624
+ set_table_name 'countries'
625
+
626
+ data_miner do
627
+ # no unique index
628
+
629
+ # get a complete list
630
+ import :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
631
+ attr.store 'iso_3166', :field_number => 1
632
+ attr.store 'name', :field_number => 0
633
+ end
634
+
635
+ # get nicer names
636
+ import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
637
+ attr.store 'iso_3166', :field_name => 'country code'
638
+ attr.store 'name', :field_name => 'country'
639
+ end
640
+ end
641
+ end
642
+ end
643
+ end
644
+
645
+ should "hash things if no unique index is listed" do
646
+ AutomobileVariant.data_miner_config.runnables[0].run
647
+ assert AutomobileVariant.first.row_hash.present?
648
+ end
649
+
650
+ # should "mine multiple classes in the correct order" do
651
+ # DataMiner.run :class_names => DataMiner.classes.map(&:class_name)
652
+ # uy = Country.find_by_iso_3166('UY')
653
+ # assert_equal 'Uruguay', uy.name
654
+ # end
59
655
 
60
- should "mine a single class" do
61
- Country.data_mine.mine
62
- assert_equal 'Uruguay', Country.find_by_iso_3166('UY').name
63
- assert_equal 0, Airport.count
656
+ should "have a target record for every class that is mined" do
657
+ DataMiner.run :class_names => %w{ Country }
658
+ assert DataMiner::Target.exists?(:name => 'Country')
659
+ assert_equal 1, DataMiner::Target.count(:conditions => {:name => 'country'})
64
660
  end
65
661
 
66
- should "mine a single class using the API" do
67
- DataMiner.mine :class_names => ['Country']
68
- assert_equal 'Uruguay', Country.find_by_iso_3166('UY').name
69
- assert_equal 0, Airport.count
662
+ should "keep a log when it does a run" do
663
+ approx_started_at = Time.now
664
+ DataMiner.run :class_names => %w{ Country }
665
+ approx_ended_at = Time.now
666
+ target = DataMiner::Target.find_by_name('Country')
667
+ assert (target.runs.last.started_at - approx_started_at).abs < 5 # seconds
668
+ assert (target.runs.last.ended_at - approx_ended_at).abs < 5 # seconds
70
669
  end
71
670
 
72
- should "mine all classes" do
73
- DataMiner.mine
74
- uy = Country.find_by_iso_3166('UY')
75
- assert_equal 'Uruguay', uy.name
76
- assert_equal uy, Airport.find_by_iata_code('MVD').country
671
+ should "remove rows that have disappeared from the external data source" do
672
+ flunk "not implemented yet"
77
673
  end
78
674
  end