data_miner 0.2.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,61 +1,55 @@
1
1
  module DataMiner
2
2
  class Configuration
3
- attr_accessor :steps, :klass, :counter, :attributes, :awaiting
3
+ include Blockenspiel::DSL
4
+
5
+ attr_accessor :klass, :runnables, :runnable_counter, :attributes, :unique_indices
4
6
 
5
7
  def initialize(klass)
6
- @steps = []
8
+ @runnables = Array.new
9
+ @unique_indices = Set.new
7
10
  @klass = klass
8
- @counter = 0
9
- @attributes = AttributeCollection.new(klass)
11
+ @runnable_counter = 0
12
+ @attributes = HashWithIndifferentAccess.new
10
13
  end
11
14
 
12
- %w(import associate derive await).each do |method|
13
- eval <<-EOS
14
- def #{method}(*args, &block)
15
- self.counter += 1
16
- if block_given? # FORM C
17
- step_options = args[0] || {}
18
- set_awaiting!(step_options)
19
- self.steps << Step::#{method.camelcase}.new(self, counter, step_options, &block)
20
- elsif args[0].is_a?(Hash) # FORM A
21
- step_options = args[0]
22
- set_awaiting!(step_options)
23
- self.steps << Step::#{method.camelcase}.new(self, counter, step_options)
24
- else # FORM B
25
- attr_name = args[0]
26
- attr_options = args[1] || {}
27
- step_options = {}
28
- set_awaiting!(step_options)
29
- self.steps << Step::#{method.camelcase}.new(self, counter, step_options) do |attr|
30
- attr.affect attr_name, attr_options
31
- end
32
- end
33
- end
34
- EOS
15
+ def unique_index(*args)
16
+ args.each { |arg| unique_indices.add arg }
35
17
  end
36
-
37
- def set_awaiting!(step_options)
38
- step_options.merge!(:awaiting => awaiting) if !awaiting.nil?
18
+
19
+ def process(callback)
20
+ self.runnable_counter += 1
21
+ runnables << DataMiner::Process.new(self, runnable_counter, callback)
39
22
  end
40
23
 
41
- def awaiting!(step)
42
- self.awaiting = step
24
+ def import(options = {}, &block)
25
+ self.runnable_counter += 1
26
+ runnables << DataMiner::Import.new(self, runnable_counter, options, &block)
27
+ end
28
+
29
+ def before_invoke
30
+ self.class.create_tables
43
31
  end
44
32
 
45
- def stop_awaiting!
46
- self.awaiting = nil
33
+ def after_invoke
34
+ if unique_indices.empty?
35
+ raise(MissingHashColumn, "No unique_index defined for #{klass.name}, so you need a row_hash:string column.") unless klass.column_names.include?('row_hash')
36
+ unique_indices.add 'row_hash'
37
+ end
38
+ runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
47
39
  end
48
40
 
49
41
  # Mine data for this class.
50
- def mine(options = {})
51
- steps.each { |step| step.perform options }
42
+ def run
43
+ target = DataMiner::Target.find_or_create_by_name klass.name
44
+ run = target.runs.create! :started_at => Time.now
45
+ begin
46
+ runnables.each(&:run)
47
+ ensure
48
+ run.update_attributes! :ended_at => Time.now
49
+ end
50
+ nil
52
51
  end
53
52
 
54
- # Map <tt>method</tt> to attributes
55
- def map_to_attrs(method)
56
- steps.map { |step| step.map_to_attrs(method) }.compact
57
- end
58
-
59
53
  cattr_accessor :classes
60
54
  self.classes = []
61
55
  class << self
@@ -63,32 +57,41 @@ module DataMiner
63
57
  #
64
58
  # Options
65
59
  # * <tt>:class_names</tt>: provide an array class names to mine
66
- def mine(options = {})
60
+ def run(options = {})
67
61
  classes.each do |klass|
68
62
  if options[:class_names].blank? or options[:class_names].include?(klass.name)
69
- klass.data_mine.mine options
63
+ klass.data_miner_config.run
70
64
  end
71
65
  end
72
66
  end
73
67
 
74
- # Map a <tt>method</tt> to attrs. Defaults to all classes touched by DataMiner.
75
- #
76
- # Options
77
- # * <tt>:class_names</tt>: provide an array class names to mine
78
- def map_to_attrs(method, options = {})
79
- classes.map do |klass|
80
- if options[:class_names].blank? or options[:class_names].include?(klass.name)
81
- klass.data_mine.map_to_attrs method
82
- end
83
- end.flatten.compact
84
- end
85
-
86
68
  # Queue up all the ActiveRecord classes that DataMiner should touch.
87
69
  #
88
70
  # Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
89
71
  def enqueue(&block)
90
72
  yield self.classes
91
73
  end
74
+
75
+ def create_tables
76
+ c = ActiveRecord::Base.connection
77
+ unless c.table_exists?('data_miner_targets')
78
+ c.create_table 'data_miner_targets', :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
79
+ t.string 'name'
80
+ t.datetime 'created_at'
81
+ t.datetime 'updated_at'
82
+ end
83
+ c.execute 'ALTER TABLE data_miner_targets ADD PRIMARY KEY (name);'
84
+ end
85
+ unless c.table_exists?('data_miner_runs')
86
+ c.create_table 'data_miner_runs', :options => 'ENGINE=InnoDB default charset=utf8' do |t|
87
+ t.string 'data_miner_target_id'
88
+ t.datetime 'started_at'
89
+ t.datetime 'ended_at'
90
+ t.datetime 'created_at'
91
+ t.datetime 'updated_at'
92
+ end
93
+ end
94
+ end
92
95
  end
93
96
  end
94
97
  end
@@ -0,0 +1,57 @@
1
+ module DataMiner
2
+ class Import
3
+ attr_accessor :configuration, :position_in_run, :options, :table, :errata
4
+ delegate :klass, :to => :configuration
5
+ delegate :unique_indices, :to => :configuration
6
+
7
+ def initialize(configuration, position_in_run, options = {}, &block)
8
+ @configuration = configuration
9
+ @position_in_run = position_in_run
10
+ @options = options
11
+ yield self if block_given? # pull in attributes
12
+ @errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
13
+ @table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
14
+ end
15
+
16
+ def inspect
17
+ "Import(#{klass}) position #{position_in_run}"
18
+ end
19
+
20
+ def attributes
21
+ configuration.attributes.reject { |k, v| !v.stored_by? self }
22
+ end
23
+
24
+ def stores?(attr_name)
25
+ configuration.attributes[attr_name].andand.stored_by? self
26
+ end
27
+
28
+ def store(attr_name, attr_options = {})
29
+ configuration.attributes[attr_name] ||= Attribute.new(klass, attr_name)
30
+ configuration.attributes[attr_name].options_for_import[self] = attr_options
31
+ end
32
+
33
+ def run
34
+ table.each_row do |row|
35
+ if errata
36
+ next if errata.rejects?(row)
37
+ errata.correct!(row)
38
+ end
39
+
40
+ unifying_values = unique_indices.map do |attr_name|
41
+ [ attributes[attr_name].value_from_row(self, row) ]
42
+ end
43
+
44
+ record_set = WilliamJamesCartesianProduct.cart_prod(*unifying_values).map do |combination|
45
+ next if combination.include?(nil)
46
+ klass.send "find_or_initialize_by_#{unique_indices.to_a.join('_and_')}", *combination
47
+ end.flatten
48
+
49
+ Array.wrap(record_set).each do |record|
50
+ attributes.values.each { |attr| attr.set_record_from_row(self, record, row) }
51
+ record.save!
52
+ end
53
+ end
54
+ DataMiner.logger.info "performed #{inspect}"
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,21 @@
1
+ module DataMiner
2
+ class Process
3
+ attr_accessor :configuration, :position_in_run, :callback
4
+ delegate :klass, :to => :configuration
5
+
6
+ def initialize(configuration, position_in_run, callback)
7
+ @configuration = configuration
8
+ @position_in_run = position_in_run
9
+ @callback = callback
10
+ end
11
+
12
+ def inspect
13
+ "Process(#{klass}) position #{position_in_run}"
14
+ end
15
+
16
+ def run
17
+ klass.send callback
18
+ DataMiner.logger.info "ran #{inspect}"
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,7 @@
1
+ module DataMiner
2
+ class Run < ActiveRecord::Base
3
+ set_table_name 'data_miner_runs'
4
+ default_scope :order => 'id ASC'
5
+ belongs_to :target
6
+ end
7
+ end
@@ -0,0 +1,7 @@
1
+ module DataMiner
2
+ class Target < ActiveRecord::Base
3
+ set_table_name 'data_miner_targets'
4
+ set_primary_key :name
5
+ has_many :runs, :foreign_key => 'data_miner_target_id'
6
+ end
7
+ end
@@ -1,47 +1,591 @@
1
1
  require 'test_helper'
2
2
 
3
- ActiveRecord::Schema.define(:version => 20090819143429) do
4
- create_table "airports", :force => true do |t|
5
- t.string "iata_code"
6
- t.string "name"
7
- t.string "city"
8
- t.integer "country_id"
9
- t.float "latitude"
10
- t.float "longitude"
11
- t.datetime "created_at"
12
- t.datetime "updated_at"
13
- end
14
- create_table "countries", :force => true do |t|
15
- t.string "iso_3166"
16
- t.string "name"
17
- t.datetime "created_at"
18
- t.datetime "updated_at"
3
+ module FuelEconomyGuide
4
+ TRANSMISSIONS = {
5
+ 'A' => 'automatic',
6
+ 'M' => 'manual',
7
+ 'L' => 'automatic', # Lockup/automatic
8
+ 'S' => 'semiautomatic', # Semiautomatic
9
+ 'C' => 'manual' # TODO verify for VW Syncro
10
+ }
11
+
12
+ ENGINE_TYPES = {
13
+ '(GUZZLER)' => nil, # "gas guzzler"
14
+ '(POLICE)' => nil, # police automobile_variant
15
+ '(MPFI)' => 'injection',
16
+ '(MPI*)' => 'injection',
17
+ '(SPFI)' => 'injection',
18
+ '(FFS)' => 'injection',
19
+ '(TURBO)' => 'turbo',
20
+ '(TRBO)' => 'turbo',
21
+ '(TC*)' => 'turbo',
22
+ '(FFS,TRBO)' => %w(injection turbo),
23
+ '(S-CHARGE)' => 'supercharger',
24
+ '(SC*)' => 'supercharger',
25
+ '(DIESEL)' => nil, # diesel
26
+ '(DSL)' => nil, # diesel
27
+ '(ROTARY)' => nil, # rotary
28
+ '(VARIABLE)' => nil, # variable displacement
29
+ '(NO-CAT)' => nil, # no catalytic converter
30
+ '(OHC)' => nil, # overhead camshaft
31
+ '(OHV)' => nil, # overhead valves
32
+ '(16-VALVE)' => nil, # 16V
33
+ '(305)' => nil, # 305 cubic inch displacement
34
+ '(307)' => nil, # 307 cubic inch displacement
35
+ '(M-ENG)' => nil,
36
+ '(W-ENG)' => nil,
37
+ '(GM-BUICK)' => nil,
38
+ '(GM-CHEV)' => nil,
39
+ '(GM-OLDS)' => nil,
40
+ '(GM-PONT)' => nil,
41
+ }
42
+
43
+ class ParserB
44
+ attr_accessor :year
45
+ def initialize(options = {})
46
+ @year = options[:year]
47
+ end
48
+
49
+ def apply(row)
50
+ row.merge!({
51
+ 'make' => row['carline_mfr_name'], # make it line up with the errata
52
+ 'model' => row['carline_name'], # ditto
53
+ 'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
54
+ 'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
55
+ 'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
56
+ 'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
57
+ 'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
58
+ 'displacement' => _displacement(row['opt_disp']),
59
+ 'year' => year
60
+ })
61
+ row
62
+ end
63
+
64
+ def _displacement(str)
65
+ str = str.gsub(/[\(\)]/, '').strip
66
+ if str =~ /^(.+)L$/
67
+ $1.to_f
68
+ elsif str =~ /^(.+)CC$/
69
+ $1.to_f / 1000
70
+ end
71
+ end
72
+
73
+ def add_hints!(bus)
74
+ bus[:format] = :fixed_width
75
+ bus[:cut] = '13-' if year == 1995
76
+ bus[:schema_name] = :fuel_economy_guide_b
77
+ bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
78
+ Slither.define :fuel_economy_guide_b do |d|
79
+ d.rows do |row|
80
+ row.trap { true } # there's only one section
81
+ row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
82
+ row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
83
+ row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
84
+ row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
85
+ row.column 'carline_name' , 28, :type => :string # CARLINE NAME
86
+ row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
87
+ row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
88
+ row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
89
+ row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
90
+ row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
91
+ row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
92
+ row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
93
+ row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
94
+ row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
95
+ row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
96
+ row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
97
+ row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
98
+ row.spacer 2
99
+ row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
100
+ row.spacer 2
101
+ row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
102
+ row.spacer 2
103
+ row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
104
+ row.spacer 2
105
+ row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
106
+ row.spacer 2
107
+ row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
108
+ row.spacer 2
109
+ row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
110
+ row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
111
+ row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
112
+ row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
113
+ row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
114
+ row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
115
+ row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
116
+ row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
117
+ row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
118
+ row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
119
+ row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
120
+ row.column 'filler' , 1, :type => :string # NOT USED
121
+ row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
122
+ row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
123
+ end
124
+ end
125
+ end
126
+ end
127
+ class ParserC
128
+ attr_accessor :year
129
+ def initialize(options = {})
130
+ @year = options[:year]
131
+ end
132
+
133
+ def add_hints!(bus)
134
+ # File will decide format based on filename
135
+ end
136
+
137
+ def apply(row)
138
+ row.merge!({
139
+ 'make' => row['Manufacturer'], # make it line up with the errata
140
+ 'model' => row['carline name'], # ditto
141
+ 'drive' => row['drv'] + 'WD',
142
+ 'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
143
+ 'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
144
+ 'turbo' => row['T'] == 'T',
145
+ 'supercharger' => row['S'] == 'S',
146
+ 'injection' => true,
147
+ 'year' => year
148
+ })
149
+ row
150
+ end
151
+ end
152
+ class ParserD
153
+ attr_accessor :year
154
+ def initialize(options = {})
155
+ @year = options[:year]
156
+ end
157
+
158
+ def add_hints!(bus)
159
+ bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
160
+ end
161
+
162
+ def apply(row)
163
+ row.merge!({
164
+ 'make' => row['MFR'], # make it line up with the errata
165
+ 'model' => row['CAR LINE'], # ditto
166
+ 'drive' => row['DRIVE SYS'] + 'WD',
167
+ 'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
168
+ 'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
169
+ 'turbo' => row['TURBO'] == 'T',
170
+ 'supercharger' => row['SPCHGR'] == 'S',
171
+ 'injection' => true,
172
+ 'year' => year
173
+ })
174
+ row
175
+ end
176
+ end
177
+ end
178
+
179
+ class AutomobileMakeYear < ActiveRecord::Base
180
+ set_primary_key :row_hash
181
+
182
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
183
+ belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
184
+ has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
185
+
186
+ data_miner do
187
+ process :derive_from_make_fleet_years
188
+ process :derive_association_to_make_fleet_years
189
+ process :derive_fuel_efficiency
190
+ process :derive_volume
191
+ end
192
+
193
+ # validates_numericality_of :fuel_efficiency, :greater_than => 0, :allow_nil => true
194
+
195
+ class << self
196
+ def derive_from_make_fleet_years
197
+ AutomobileMakeFleetYear.find_in_batches do |batch|
198
+ batch.each do |record|
199
+ #puts " * Considering AMFY #{record.inspect}"
200
+ if record.make and record.model_year
201
+ find_or_create_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
202
+ end
203
+ end
204
+ end
205
+ end
206
+
207
+ def derive_association_to_make_fleet_years
208
+ AutomobileMakeFleetYear.find_in_batches do |batch|
209
+ batch.each do |record|
210
+ if record.make and record.model_year
211
+ record.make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
212
+ record.save! if record.changed?
213
+ end
214
+ end
215
+ end
216
+ end
217
+
218
+ def derive_fuel_efficiency
219
+ AutomobileMakeFleetYear.find_in_batches do |batch|
220
+ batch.each do |record|
221
+ if record.make and record.model_year
222
+ make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
223
+ # make_year.fuel_efficiency = make_year.fleet_years.weighted_average :fuel_efficiency, :by => :volume
224
+ make_year.save!
225
+ end
226
+ end
227
+ end
228
+ end
229
+
230
+ def derive_volume
231
+ find_in_batches do |batch|
232
+ batch.each do |record|
233
+ record.volume = record.fleet_years.collect(&:volume).sum
234
+ record.save!
235
+ end
236
+ end
237
+ end
238
+ end
239
+ end
240
+
241
+ class AutomobileMakeFleetYear < ActiveRecord::Base
242
+ set_primary_key :row_hash
243
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
244
+ belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
245
+ belongs_to :make_year, :class_name => 'AutomobileMakeYear', :foreign_key => 'automobile_make_year_id'
246
+
247
+ data_miner do
248
+ # CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
249
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
250
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
251
+ :select => lambda { |row| row['volume'].to_i > 0 } do |attr|
252
+ attr.store 'make_name', :field_name => 'manufacturer_name' # prefix
253
+ attr.store 'year', :field_name => 'year_content'
254
+ attr.store 'fleet', :chars => 2..3
255
+ attr.store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
256
+ attr.store 'volume'
257
+ end
258
+ end
259
+ end
260
+
261
+ class AutomobileModelYear < ActiveRecord::Base
262
+ set_primary_key :year
263
+
264
+ has_many :make_years, :class_name => 'AutomobileMakeYear'
265
+ has_many :variants, :class_name => 'AutomobileVariant'
266
+
267
+ data_miner do
268
+ unique_index 'year'
269
+
270
+ # await :other_class => AutomobileMakeYear do |deferred|
271
+ # # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => :volume
272
+ # end
273
+ end
274
+ end
275
+
276
+ class AutomobileFuelType < ActiveRecord::Base
277
+ set_primary_key :code
278
+
279
+ data_miner do
280
+ unique_index 'code'
281
+
282
+ import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
283
+ :filename => 'Gd6-dsc.txt',
284
+ :format => :fixed_width,
285
+ :crop => 21..26, # inclusive
286
+ :cut => '2-',
287
+ :select => lambda { |row| /\A[A-Z]/.match row[:code] },
288
+ :schema => [[ 'code', 2, { :type => :string } ],
289
+ [ 'spacer', 2 ],
290
+ [ 'name', 52, { :type => :string } ]]) do |attr|
291
+ attr.store 'name'
292
+ end
293
+
294
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do |attr|
295
+ attr.store 'name'
296
+ attr.store 'annual_distance'
297
+ attr.store 'emission_factor'
298
+ end
299
+
300
+ # pull electricity emission factor from residential electricity
301
+ import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
302
+ :select => lambda { |row| row['code'] == 'El' }) do |attr|
303
+ attr.store 'name'
304
+ attr.store 'emission_factor'
305
+ end
306
+
307
+ # still need distance estimate for electric cars
308
+ end
309
+
310
+ CODES = {
311
+ :electricity => 'El',
312
+ :diesel => 'D'
313
+ }
314
+ end
315
+
316
+ class AutomobileModel < ActiveRecord::Base
317
+ set_primary_key :row_hash
318
+
319
+ has_many :variants, :class_name => 'AutomobileVariant'
320
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
321
+
322
+ data_miner do
323
+ # derived from FEG automobile variants
324
+ end
325
+ end
326
+
327
+ class AutomobileMake < ActiveRecord::Base
328
+ set_primary_key :name
329
+
330
+ has_many :make_years, :class_name => 'AutomobileMakeYear'
331
+ has_many :models, :class_name => 'AutomobileModel'
332
+ has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
333
+ has_many :variants, :class_name => 'AutomobileVariant'
334
+
335
+ data_miner do
336
+ unique_index 'name'
337
+
338
+ import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/makes/make_importance.csv' do |attr|
339
+ attr.store 'major'
340
+ end
341
+ # await :other_class => AutomobileMakeYear do |deferred|
342
+ # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => 'volume'
343
+ # end
344
+ end
345
+ end
346
+
347
+ class AutomobileVariant < ActiveRecord::Base
348
+ set_primary_key :row_hash
349
+
350
+ belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
351
+ belongs_to :model, :class_name => 'AutomobileModel', :foreign_key => 'automobile_model_id'
352
+ belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
353
+ belongs_to :fuel_type, :class_name => 'AutomobileFuelType', :foreign_key => 'automobile_fuel_type_id'
354
+
355
+ data_miner do
356
+ # 1985---1997
357
+ (85..97).each do |yy|
358
+ filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
359
+ import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
360
+ :filename => filename,
361
+ :transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
362
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
363
+ attr.store 'make_name', :field_name => 'make'
364
+ attr.store 'model_name', :field_name => 'model'
365
+ attr.store 'year'
366
+ attr.store 'fuel_type_code', :field_name => 'fuel_type'
367
+ attr.store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
368
+ attr.store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
369
+ attr.store 'cylinders', :field_name => 'no_cyc'
370
+ attr.store 'drive', :field_name => 'drive_system'
371
+ attr.store 'carline_mfr_code'
372
+ attr.store 'vi_mfr_code'
373
+ attr.store 'carline_code'
374
+ attr.store 'carline_class_code', :field_name => 'carline_clss'
375
+ attr.store 'transmission'
376
+ attr.store 'speeds'
377
+ attr.store 'turbo'
378
+ attr.store 'supercharger'
379
+ attr.store 'injection'
380
+ attr.store 'displacement'
381
+ end
382
+ end
383
+
384
+ # 1998--2005
385
+ {
386
+ 1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
387
+ 1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
388
+ 2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
389
+ 2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
390
+ 2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
391
+ 2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
392
+ 2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
393
+ 2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
394
+ }.sort { |a, b| a.first <=> b.first }.each do |year, options|
395
+ import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
396
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
397
+ attr.store 'make_name', :field_name => 'make'
398
+ attr.store 'model_name', :field_name => 'model'
399
+ attr.store 'fuel_type_code', :field_name => 'fl'
400
+ attr.store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
401
+ attr.store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
402
+ attr.store 'cylinders', :field_name => 'cyl'
403
+ attr.store 'displacement', :field_name => 'displ'
404
+ attr.store 'carline_class_code', :field_name => 'cls' if year >= 2000
405
+ attr.store 'carline_class_name', :field_name => 'Class'
406
+ attr.store 'year'
407
+ attr.store 'transmission'
408
+ attr.store 'speeds'
409
+ attr.store 'turbo'
410
+ attr.store 'supercharger'
411
+ attr.store 'injection'
412
+ attr.store 'drive'
413
+ end
414
+ end
415
+
416
+ # 2006--2010
417
+ {
418
+ 2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
419
+ 2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
420
+ 2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
421
+ 2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
422
+ # 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
423
+ }.sort { |a, b| a.first <=> b.first }.each do |year, options|
424
+ import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
425
+ :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
426
+ attr.store 'make_name', :field_name => 'make'
427
+ attr.store 'model_name', :field_name => 'model'
428
+ attr.store 'fuel_type_code', :field_name => 'FUEL TYPE'
429
+ attr.store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
430
+ attr.store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
431
+ attr.store 'cylinders', :field_name => 'NUMB CYL'
432
+ attr.store 'displacement', :field_name => 'DISPLACEMENT'
433
+ attr.store 'carline_class_code', :field_name => 'CLS'
434
+ attr.store 'carline_class_name', :field_name => 'CLASS'
435
+ attr.store 'year'
436
+ attr.store 'transmission'
437
+ attr.store 'speeds'
438
+ attr.store 'turbo'
439
+ attr.store 'supercharger'
440
+ attr.store 'injection'
441
+ attr.store 'drive'
442
+ end
443
+ end
444
+
445
+ # associate :make, :key => :original_automobile_make_name, :foreign_key => :name
446
+ # derive :automobile_model_id # creates models by name
447
+ # associate :model_year, :key => :original_automobile_model_year_year, :foreign_key => :year
448
+ # associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
449
+ process :set_adjusted_fuel_economy
450
+ end
451
+
452
+ def name
453
+ extra = []
454
+ extra << "V#{cylinders}" if cylinders
455
+ extra << "#{displacement}L" if displacement
456
+ extra << "turbo" if turbo
457
+ extra << "FI" if injection
458
+ extra << "#{speeds}spd" if speeds.present?
459
+ extra << transmission if transmission.present?
460
+ extra << "(#{fuel_type.name})" if fuel_type
461
+ extra.join(' ')
462
+ end
463
+
464
+ def fuel_economy_description
465
+ [ fuel_efficiency_city, fuel_efficiency_highway ].map { |f| f.kilometres_per_litre.to(:miles_per_gallon).round }.join('/')
466
+ end
467
+
468
+ class << self
469
+ def set_adjusted_fuel_economy
470
+ update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
471
+ update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
472
+ end
473
+
474
+ # the following matching methods are needed by the errata
475
+ # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
476
+
477
+ def transmission_is_blank?(row)
478
+ row['transmission'].blank?
479
+ end
480
+
481
+ def is_a_2007_gmc_or_chevrolet?(row)
482
+ row['year'] == 2007 and %w(GMC CHEVROLET).include? row['MFR'].upcase
483
+ end
484
+
485
+ def is_a_porsche?(row)
486
+ row['make'].upcase == 'PORSCHE'
487
+ end
488
+
489
+ def is_not_a_porsche?(row)
490
+ !is_a_porsche? row
491
+ end
492
+
493
+ def is_a_mercedes_benz?(row)
494
+ row['make'] =~ /MERCEDES/i
495
+ end
496
+
497
+ def is_a_lexus?(row)
498
+ row['make'].upcase == 'LEXUS'
499
+ end
500
+
501
+ def is_a_bmw?(row)
502
+ row['make'].upcase == 'BMW'
503
+ end
504
+
505
+ def is_a_ford?(row)
506
+ row['make'].upcase == 'FORD'
507
+ end
508
+
509
+ def is_a_rolls_royce_and_model_contains_bentley?(row)
510
+ is_a_rolls_royce?(row) and model_contains_bentley?(row)
511
+ end
512
+
513
+ def is_a_bentley?(row)
514
+ row['make'].upcase == 'BENTLEY'
515
+ end
516
+
517
+ def is_a_rolls_royce?(row)
518
+ row['make'] =~ /ROLLS/i
519
+ end
520
+
521
+ def is_a_turbo_brooklands?(row)
522
+ row['model'] =~ /TURBO R\/RL BKLDS/i
523
+ end
524
+
525
+ def model_contains_maybach?(row)
526
+ row['model'] =~ /MAYBACH/i
527
+ end
528
+
529
+ def model_contains_bentley?(row)
530
+ row['model'] =~ /BENTLEY/i
531
+ end
19
532
  end
20
533
  end
21
534
 
22
535
  class Country < ActiveRecord::Base
23
- mine_data do |step|
24
- # import country names and country codes
25
- step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
26
- attr.key :iso_3166, :name_in_source => 'country code'
27
- attr.store :iso_3166, :name_in_source => 'country code'
28
- attr.store :name, :name_in_source => 'country'
536
+ set_primary_key :iso_3166
537
+
538
+ data_miner do
539
+ unique_index 'iso_3166'
540
+
541
+ # get a complete list
542
+ import :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
543
+ attr.store 'iso_3166', :field_number => 1
544
+ attr.store 'name', :field_number => 0
545
+ end
546
+
547
+ # get nicer names
548
+ import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
549
+ attr.store 'iso_3166', :field_name => 'country code'
550
+ attr.store 'name', :field_name => 'country'
29
551
  end
30
552
  end
31
553
  end
32
554
 
33
555
  class Airport < ActiveRecord::Base
556
+ set_primary_key :iata_code
34
557
  belongs_to :country
35
- mine_data do |step|
558
+
559
+ data_miner do
560
+ unique_index 'iata_code'
561
+
36
562
  # import airport iata_code, name, etc.
37
- step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
38
- attr.key :iata_code, :field_number => 3
39
- attr.store :name, :field_number => 0
40
- attr.store :city, :field_number => 1
41
- attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
42
- attr.store :iata_code, :field_number => 3
43
- attr.store :latitude, :field_number => 5
44
- attr.store :longitude, :field_number => 6
563
+ import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? }) do |attr|
564
+ attr.store 'name', :field_number => 1
565
+ attr.store 'city', :field_number => 2
566
+ attr.store 'country_name', :field_number => 3
567
+ attr.store 'iata_code', :field_number => 4
568
+ attr.store 'latitude', :field_number => 6
569
+ attr.store 'longitude', :field_number => 7
570
+ end
571
+ end
572
+ end
573
+
574
+ class CensusRegion < ActiveRecord::Base
575
+ set_primary_key :number
576
+
577
+ data_miner do
578
+ unique_index 'number'
579
+
580
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do |attr|
581
+ attr.store 'name', :field_name => 'Name'
582
+ attr.store 'number', :field_name => 'Region'
583
+ end
584
+
585
+ # pretend this is a different data source
586
+ import :url => 'http://www.census.gov/popest/geographic/codes02.csv', :skip => 9, :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'} do |attr|
587
+ attr.store 'name', :field_name => 'Name'
588
+ attr.store 'number', :field_name => 'Region'
45
589
  end
46
590
  end
47
591
  end
@@ -49,30 +593,82 @@ end
49
593
  DataMiner.enqueue do |queue|
50
594
  queue << Country
51
595
  queue << Airport
596
+ queue << CensusRegion
597
+ queue << AutomobileFuelType # OK
598
+ queue << AutomobileModel # OK
599
+ queue << AutomobileMake # OK
600
+ queue << AutomobileModelYear # OK
601
+ queue << AutomobileVariant # OK
602
+ queue << AutomobileMakeFleetYear # OK; third-party data not yet hosted on third-party site
603
+ queue << AutomobileMakeYear # OK
52
604
  end
53
605
 
54
- class DataMinerTest < Test::Unit::TestCase
55
- def teardown
56
- Airport.delete_all
57
- Country.delete_all
606
+ class DataMinerTest < Test::Unit::TestCase
607
+ should "be idempotent" do
608
+ Country.data_miner_config.run
609
+ a = Country.count
610
+ Country.data_miner_config.run
611
+ b = Country.count
612
+ assert_equal a, b
613
+
614
+ CensusRegion.data_miner_config.run
615
+ a = CensusRegion.count
616
+ CensusRegion.data_miner_config.run
617
+ b = CensusRegion.count
618
+ assert_equal a, b
58
619
  end
620
+
621
+ should "assume that no unique indices means it wants a big hash" do
622
+ assert_raises DataMiner::MissingHashColumn do
623
+ class IncompleteCountry < ActiveRecord::Base
624
+ set_table_name 'countries'
625
+
626
+ data_miner do
627
+ # no unique index
628
+
629
+ # get a complete list
630
+ import :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
631
+ attr.store 'iso_3166', :field_number => 1
632
+ attr.store 'name', :field_number => 0
633
+ end
634
+
635
+ # get nicer names
636
+ import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
637
+ attr.store 'iso_3166', :field_name => 'country code'
638
+ attr.store 'name', :field_name => 'country'
639
+ end
640
+ end
641
+ end
642
+ end
643
+ end
644
+
645
+ should "hash things if no unique index is listed" do
646
+ AutomobileVariant.data_miner_config.runnables[0].run
647
+ assert AutomobileVariant.first.row_hash.present?
648
+ end
649
+
650
+ # should "mine multiple classes in the correct order" do
651
+ # DataMiner.run :class_names => DataMiner.classes.map(&:class_name)
652
+ # uy = Country.find_by_iso_3166('UY')
653
+ # assert_equal 'Uruguay', uy.name
654
+ # end
59
655
 
60
- should "mine a single class" do
61
- Country.data_mine.mine
62
- assert_equal 'Uruguay', Country.find_by_iso_3166('UY').name
63
- assert_equal 0, Airport.count
656
+ should "have a target record for every class that is mined" do
657
+ DataMiner.run :class_names => %w{ Country }
658
+ assert DataMiner::Target.exists?(:name => 'Country')
659
+ assert_equal 1, DataMiner::Target.count(:conditions => {:name => 'country'})
64
660
  end
65
661
 
66
- should "mine a single class using the API" do
67
- DataMiner.mine :class_names => ['Country']
68
- assert_equal 'Uruguay', Country.find_by_iso_3166('UY').name
69
- assert_equal 0, Airport.count
662
+ should "keep a log when it does a run" do
663
+ approx_started_at = Time.now
664
+ DataMiner.run :class_names => %w{ Country }
665
+ approx_ended_at = Time.now
666
+ target = DataMiner::Target.find_by_name('Country')
667
+ assert (target.runs.last.started_at - approx_started_at).abs < 5 # seconds
668
+ assert (target.runs.last.ended_at - approx_ended_at).abs < 5 # seconds
70
669
  end
71
670
 
72
- should "mine all classes" do
73
- DataMiner.mine
74
- uy = Country.find_by_iso_3166('UY')
75
- assert_equal 'Uruguay', uy.name
76
- assert_equal uy, Airport.find_by_iata_code('MVD').country
671
+ should "remove rows that have disappeared from the external data source" do
672
+ flunk "not implemented yet"
77
673
  end
78
674
  end