data_miner 0.2.6 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/CHANGELOG +5 -0
- data/README.rdoc +11 -15
- data/Rakefile +7 -2
- data/VERSION +1 -1
- data/data_miner.gemspec +27 -28
- data/lib/data_miner.rb +50 -27
- data/lib/data_miner/attribute.rb +157 -240
- data/lib/data_miner/configuration.rb +58 -55
- data/lib/data_miner/import.rb +57 -0
- data/lib/data_miner/process.rb +21 -0
- data/lib/data_miner/run.rb +7 -0
- data/lib/data_miner/target.rb +7 -0
- data/test/data_miner_test.rb +644 -48
- data/test/test_helper.rb +134 -3
- metadata +29 -23
- data/lib/data_miner/active_record_ext.rb +0 -25
- data/lib/data_miner/attribute_collection.rb +0 -51
- data/lib/data_miner/step.rb +0 -64
- data/lib/data_miner/step/associate.rb +0 -9
- data/lib/data_miner/step/await.rb +0 -35
- data/lib/data_miner/step/callback.rb +0 -22
- data/lib/data_miner/step/derive.rb +0 -9
- data/lib/data_miner/step/import.rb +0 -57
@@ -1,61 +1,55 @@
|
|
1
1
|
module DataMiner
|
2
2
|
class Configuration
|
3
|
-
|
3
|
+
include Blockenspiel::DSL
|
4
|
+
|
5
|
+
attr_accessor :klass, :runnables, :runnable_counter, :attributes, :unique_indices
|
4
6
|
|
5
7
|
def initialize(klass)
|
6
|
-
@
|
8
|
+
@runnables = Array.new
|
9
|
+
@unique_indices = Set.new
|
7
10
|
@klass = klass
|
8
|
-
@
|
9
|
-
@attributes =
|
11
|
+
@runnable_counter = 0
|
12
|
+
@attributes = HashWithIndifferentAccess.new
|
10
13
|
end
|
11
14
|
|
12
|
-
|
13
|
-
|
14
|
-
def #{method}(*args, &block)
|
15
|
-
self.counter += 1
|
16
|
-
if block_given? # FORM C
|
17
|
-
step_options = args[0] || {}
|
18
|
-
set_awaiting!(step_options)
|
19
|
-
self.steps << Step::#{method.camelcase}.new(self, counter, step_options, &block)
|
20
|
-
elsif args[0].is_a?(Hash) # FORM A
|
21
|
-
step_options = args[0]
|
22
|
-
set_awaiting!(step_options)
|
23
|
-
self.steps << Step::#{method.camelcase}.new(self, counter, step_options)
|
24
|
-
else # FORM B
|
25
|
-
attr_name = args[0]
|
26
|
-
attr_options = args[1] || {}
|
27
|
-
step_options = {}
|
28
|
-
set_awaiting!(step_options)
|
29
|
-
self.steps << Step::#{method.camelcase}.new(self, counter, step_options) do |attr|
|
30
|
-
attr.affect attr_name, attr_options
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
EOS
|
15
|
+
def unique_index(*args)
|
16
|
+
args.each { |arg| unique_indices.add arg }
|
35
17
|
end
|
36
|
-
|
37
|
-
def
|
38
|
-
|
18
|
+
|
19
|
+
def process(callback)
|
20
|
+
self.runnable_counter += 1
|
21
|
+
runnables << DataMiner::Process.new(self, runnable_counter, callback)
|
39
22
|
end
|
40
23
|
|
41
|
-
def
|
42
|
-
self.
|
24
|
+
def import(options = {}, &block)
|
25
|
+
self.runnable_counter += 1
|
26
|
+
runnables << DataMiner::Import.new(self, runnable_counter, options, &block)
|
27
|
+
end
|
28
|
+
|
29
|
+
def before_invoke
|
30
|
+
self.class.create_tables
|
43
31
|
end
|
44
32
|
|
45
|
-
def
|
46
|
-
|
33
|
+
def after_invoke
|
34
|
+
if unique_indices.empty?
|
35
|
+
raise(MissingHashColumn, "No unique_index defined for #{klass.name}, so you need a row_hash:string column.") unless klass.column_names.include?('row_hash')
|
36
|
+
unique_indices.add 'row_hash'
|
37
|
+
end
|
38
|
+
runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
|
47
39
|
end
|
48
40
|
|
49
41
|
# Mine data for this class.
|
50
|
-
def
|
51
|
-
|
42
|
+
def run
|
43
|
+
target = DataMiner::Target.find_or_create_by_name klass.name
|
44
|
+
run = target.runs.create! :started_at => Time.now
|
45
|
+
begin
|
46
|
+
runnables.each(&:run)
|
47
|
+
ensure
|
48
|
+
run.update_attributes! :ended_at => Time.now
|
49
|
+
end
|
50
|
+
nil
|
52
51
|
end
|
53
52
|
|
54
|
-
# Map <tt>method</tt> to attributes
|
55
|
-
def map_to_attrs(method)
|
56
|
-
steps.map { |step| step.map_to_attrs(method) }.compact
|
57
|
-
end
|
58
|
-
|
59
53
|
cattr_accessor :classes
|
60
54
|
self.classes = []
|
61
55
|
class << self
|
@@ -63,32 +57,41 @@ module DataMiner
|
|
63
57
|
#
|
64
58
|
# Options
|
65
59
|
# * <tt>:class_names</tt>: provide an array class names to mine
|
66
|
-
def
|
60
|
+
def run(options = {})
|
67
61
|
classes.each do |klass|
|
68
62
|
if options[:class_names].blank? or options[:class_names].include?(klass.name)
|
69
|
-
klass.
|
63
|
+
klass.data_miner_config.run
|
70
64
|
end
|
71
65
|
end
|
72
66
|
end
|
73
67
|
|
74
|
-
# Map a <tt>method</tt> to attrs. Defaults to all classes touched by DataMiner.
|
75
|
-
#
|
76
|
-
# Options
|
77
|
-
# * <tt>:class_names</tt>: provide an array class names to mine
|
78
|
-
def map_to_attrs(method, options = {})
|
79
|
-
classes.map do |klass|
|
80
|
-
if options[:class_names].blank? or options[:class_names].include?(klass.name)
|
81
|
-
klass.data_mine.map_to_attrs method
|
82
|
-
end
|
83
|
-
end.flatten.compact
|
84
|
-
end
|
85
|
-
|
86
68
|
# Queue up all the ActiveRecord classes that DataMiner should touch.
|
87
69
|
#
|
88
70
|
# Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
|
89
71
|
def enqueue(&block)
|
90
72
|
yield self.classes
|
91
73
|
end
|
74
|
+
|
75
|
+
def create_tables
|
76
|
+
c = ActiveRecord::Base.connection
|
77
|
+
unless c.table_exists?('data_miner_targets')
|
78
|
+
c.create_table 'data_miner_targets', :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
79
|
+
t.string 'name'
|
80
|
+
t.datetime 'created_at'
|
81
|
+
t.datetime 'updated_at'
|
82
|
+
end
|
83
|
+
c.execute 'ALTER TABLE data_miner_targets ADD PRIMARY KEY (name);'
|
84
|
+
end
|
85
|
+
unless c.table_exists?('data_miner_runs')
|
86
|
+
c.create_table 'data_miner_runs', :options => 'ENGINE=InnoDB default charset=utf8' do |t|
|
87
|
+
t.string 'data_miner_target_id'
|
88
|
+
t.datetime 'started_at'
|
89
|
+
t.datetime 'ended_at'
|
90
|
+
t.datetime 'created_at'
|
91
|
+
t.datetime 'updated_at'
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
92
95
|
end
|
93
96
|
end
|
94
97
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module DataMiner
  # One import step of a Configuration. It reads rows from a RemoteTable and
  # writes them onto records of the configured class, locating each record by
  # the configuration's unique indices.
  class Import
    attr_accessor :configuration, :position_in_run, :options, :table, :errata
    delegate :klass, :to => :configuration
    delegate :unique_indices, :to => :configuration

    # configuration   - the owning Configuration (supplies klass, unique_indices, attributes).
    # position_in_run - the counter value handed out by the configuration.
    # options         - Hash; :errata (if present) is passed to Errata as :url, and the
    #                   keys sliced below are forwarded to RemoteTable.
    # block           - optional; receives this Import so attributes can be declared with #store.
    def initialize(configuration, position_in_run, options = {}, &block)
      @configuration = configuration
      @position_in_run = position_in_run
      @options = options
      yield self if block_given? # pull in attributes
      @errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
      @table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
    end

    def inspect
      "Import(#{klass}) position #{position_in_run}"
    end

    # The subset of the configuration's attributes that report being stored by this import.
    def attributes
      configuration.attributes.reject { |k, v| !v.stored_by? self }
    end

    # Truthy when an attribute named +attr_name+ exists and reports being stored by this import.
    def stores?(attr_name)
      configuration.attributes[attr_name].andand.stored_by? self
    end

    # Registers +attr_name+ on the configuration (creating the Attribute on first use)
    # and records this import's options for it.
    def store(attr_name, attr_options = {})
      configuration.attributes[attr_name] ||= Attribute.new(klass, attr_name)
      configuration.attributes[attr_name].options_for_import[self] = attr_options
    end

    # Walks every row of the table, finds or initializes the matching record(s)
    # and saves them after every stored attribute has been applied.
    def run
      # The dynamic finder name depends only on the unique indices, so build it once.
      finder = "find_or_initialize_by_#{unique_indices.to_a.join('_and_')}"

      table.each_row do |row|
        if errata
          next if errata.rejects?(row)
          errata.correct!(row)
        end

        unifying_values = unique_indices.map do |attr_name|
          [ attributes[attr_name].value_from_row(self, row) ]
        end

        record_set = WilliamJamesCartesianProduct.cart_prod(*unifying_values).map do |combination|
          # `next` makes the block yield nil for this combination ...
          next if combination.include?(nil)
          klass.send finder, *combination
        end.flatten.compact # ... so drop those nils instead of calling save! on them below

        Array.wrap(record_set).each do |record|
          attributes.values.each { |attr| attr.set_record_from_row(self, record, row) }
          record.save!
        end
      end
      DataMiner.logger.info "performed #{inspect}"
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module DataMiner
  # A run step that sends a named callback to the configured class.
  class Process
    attr_accessor :configuration, :position_in_run, :callback
    delegate :klass, :to => :configuration

    # configuration   - the owning Configuration (supplies klass).
    # position_in_run - the counter value handed out by the configuration.
    # callback        - the method name sent to klass when the step runs.
    def initialize(configuration, position_in_run, callback)
      @configuration, @position_in_run, @callback = configuration, position_in_run, callback
    end

    # Short label used in the log line written by #run.
    def inspect
      target = klass
      slot = position_in_run
      "Process(#{target}) position #{slot}"
    end

    # Invokes the callback on klass, then logs that this step ran.
    def run
      klass.send(callback)
      message = "ran #{inspect}"
      DataMiner.logger.info(message)
    end
  end
end
|
data/test/data_miner_test.rb
CHANGED
@@ -1,47 +1,591 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
3
|
+
module FuelEconomyGuide
|
4
|
+
TRANSMISSIONS = {
|
5
|
+
'A' => 'automatic',
|
6
|
+
'M' => 'manual',
|
7
|
+
'L' => 'automatic', # Lockup/automatic
|
8
|
+
'S' => 'semiautomatic', # Semiautomatic
|
9
|
+
'C' => 'manual' # TODO verify for VW Syncro
|
10
|
+
}
|
11
|
+
|
12
|
+
ENGINE_TYPES = {
|
13
|
+
'(GUZZLER)' => nil, # "gas guzzler"
|
14
|
+
'(POLICE)' => nil, # police automobile_variant
|
15
|
+
'(MPFI)' => 'injection',
|
16
|
+
'(MPI*)' => 'injection',
|
17
|
+
'(SPFI)' => 'injection',
|
18
|
+
'(FFS)' => 'injection',
|
19
|
+
'(TURBO)' => 'turbo',
|
20
|
+
'(TRBO)' => 'turbo',
|
21
|
+
'(TC*)' => 'turbo',
|
22
|
+
'(FFS,TRBO)' => %w(injection turbo),
|
23
|
+
'(S-CHARGE)' => 'supercharger',
|
24
|
+
'(SC*)' => 'supercharger',
|
25
|
+
'(DIESEL)' => nil, # diesel
|
26
|
+
'(DSL)' => nil, # diesel
|
27
|
+
'(ROTARY)' => nil, # rotary
|
28
|
+
'(VARIABLE)' => nil, # variable displacement
|
29
|
+
'(NO-CAT)' => nil, # no catalytic converter
|
30
|
+
'(OHC)' => nil, # overhead camshaft
|
31
|
+
'(OHV)' => nil, # overhead valves
|
32
|
+
'(16-VALVE)' => nil, # 16V
|
33
|
+
'(305)' => nil, # 305 cubic inch displacement
|
34
|
+
'(307)' => nil, # 307 cubic inch displacement
|
35
|
+
'(M-ENG)' => nil,
|
36
|
+
'(W-ENG)' => nil,
|
37
|
+
'(GM-BUICK)' => nil,
|
38
|
+
'(GM-CHEV)' => nil,
|
39
|
+
'(GM-OLDS)' => nil,
|
40
|
+
'(GM-PONT)' => nil,
|
41
|
+
}
|
42
|
+
|
43
|
+
# Row transformer and table hints for the fixed-width Fuel Economy Guide
# files (the ones AutomobileVariant imports with this class as :transform).
class ParserB
  attr_accessor :year

  # options - Hash; only :year is read. It is copied into every row by #apply.
  def initialize(options = {})
    @year = options[:year]
  end

  # Adds normalized keys ('make', 'model', 'transmission', 'speeds', 'turbo',
  # 'supercharger', 'injection', 'displacement', 'year') to +row+ in place.
  #
  # Returns the same (mutated) row.
  def apply(row)
    gearbox = row['model_trans']
    gear_count = gearbox[1, 1]
    # Look both engine descriptors up once. An ENGINE_TYPES value may be nil,
    # a single tag or an array of tags, hence the flatten.
    engine_tags = [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten
    row.merge!({
      'make' => row['carline_mfr_name'], # make it line up with the errata
      'model' => row['carline_name'], # ditto
      'transmission' => TRANSMISSIONS[gearbox[0, 1]],
      'speeds' => (gear_count == 'V') ? 'variable' : gear_count,
      'turbo' => engine_tags.include?('turbo'),
      'supercharger' => engine_tags.include?('supercharger'),
      'injection' => engine_tags.include?('injection'),
      'displacement' => _displacement(row['opt_disp']),
      'year' => year
    })
    row
  end

  # Parses a displacement such as "(2.5L)" or "2500CC" into litres.
  #
  # Returns a Float, or nil when +str+ is nil or matches neither form.
  def _displacement(str)
    # to_s keeps a missing field from raising; it simply matches neither pattern.
    str = str.to_s.gsub(/[\(\)]/, '').strip
    if str =~ /^(.+)L$/
      $1.to_f
    elsif str =~ /^(.+)CC$/
      $1.to_f / 1000
    end
  end

  # Writes the table-reading hints into +bus+ and defines the fixed-width schema.
  def add_hints!(bus)
    bus[:format] = :fixed_width
    bus[:cut] = '13-' if year == 1995
    bus[:schema_name] = :fuel_economy_guide_b
    # NOTE(review): 'supress_code' does not match the 'suppress_code' column declared
    # below, so this lookup presumably never finds a value. Confirm how the :integer
    # column parses blank fields before renaming the key, since that changes which rows pass.
    bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
    Slither.define :fuel_economy_guide_b do |d|
      d.rows do |row|
        row.trap { true } # there's only one section
        row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
        row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
        row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
        row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
        row.column 'carline_name' , 28, :type => :string # CARLINE NAME
        row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
        row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
        row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
        row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
        row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
        row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
        row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
        row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
        row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
        row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
        row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
        row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
        row.spacer 2
        row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
        row.spacer 2
        row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
        row.spacer 2
        row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
        row.spacer 2
        row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
        row.spacer 2
        row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
        row.spacer 2
        row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
        row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
        row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
        row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
        row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
        row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
        row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
        row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
        row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
        row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
        row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
        row.column 'filler' , 1, :type => :string # NOT USED
        row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
        row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
      end
    end
  end
end
|
127
|
+
class ParserC
|
128
|
+
attr_accessor :year
|
129
|
+
def initialize(options = {})
|
130
|
+
@year = options[:year]
|
131
|
+
end
|
132
|
+
|
133
|
+
def add_hints!(bus)
|
134
|
+
# File will decide format based on filename
|
135
|
+
end
|
136
|
+
|
137
|
+
def apply(row)
|
138
|
+
row.merge!({
|
139
|
+
'make' => row['Manufacturer'], # make it line up with the errata
|
140
|
+
'model' => row['carline name'], # ditto
|
141
|
+
'drive' => row['drv'] + 'WD',
|
142
|
+
'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
|
143
|
+
'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
|
144
|
+
'turbo' => row['T'] == 'T',
|
145
|
+
'supercharger' => row['S'] == 'S',
|
146
|
+
'injection' => true,
|
147
|
+
'year' => year
|
148
|
+
})
|
149
|
+
row
|
150
|
+
end
|
151
|
+
end
|
152
|
+
class ParserD
|
153
|
+
attr_accessor :year
|
154
|
+
def initialize(options = {})
|
155
|
+
@year = options[:year]
|
156
|
+
end
|
157
|
+
|
158
|
+
def add_hints!(bus)
|
159
|
+
bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
|
160
|
+
end
|
161
|
+
|
162
|
+
def apply(row)
|
163
|
+
row.merge!({
|
164
|
+
'make' => row['MFR'], # make it line up with the errata
|
165
|
+
'model' => row['CAR LINE'], # ditto
|
166
|
+
'drive' => row['DRIVE SYS'] + 'WD',
|
167
|
+
'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
|
168
|
+
'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
|
169
|
+
'turbo' => row['TURBO'] == 'T',
|
170
|
+
'supercharger' => row['SPCHGR'] == 'S',
|
171
|
+
'injection' => true,
|
172
|
+
'year' => year
|
173
|
+
})
|
174
|
+
row
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
class AutomobileMakeYear < ActiveRecord::Base
|
180
|
+
set_primary_key :row_hash
|
181
|
+
|
182
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
183
|
+
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
184
|
+
has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
|
185
|
+
|
186
|
+
data_miner do
|
187
|
+
process :derive_from_make_fleet_years
|
188
|
+
process :derive_association_to_make_fleet_years
|
189
|
+
process :derive_fuel_efficiency
|
190
|
+
process :derive_volume
|
191
|
+
end
|
192
|
+
|
193
|
+
# validates_numericality_of :fuel_efficiency, :greater_than => 0, :allow_nil => true
|
194
|
+
|
195
|
+
class << self
|
196
|
+
def derive_from_make_fleet_years
|
197
|
+
AutomobileMakeFleetYear.find_in_batches do |batch|
|
198
|
+
batch.each do |record|
|
199
|
+
#puts " * Considering AMFY #{record.inspect}"
|
200
|
+
if record.make and record.model_year
|
201
|
+
find_or_create_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
def derive_association_to_make_fleet_years
|
208
|
+
AutomobileMakeFleetYear.find_in_batches do |batch|
|
209
|
+
batch.each do |record|
|
210
|
+
if record.make and record.model_year
|
211
|
+
record.make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
212
|
+
record.save! if record.changed?
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
def derive_fuel_efficiency
|
219
|
+
AutomobileMakeFleetYear.find_in_batches do |batch|
|
220
|
+
batch.each do |record|
|
221
|
+
if record.make and record.model_year
|
222
|
+
make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
223
|
+
# make_year.fuel_efficiency = make_year.fleet_years.weighted_average :fuel_efficiency, :by => :volume
|
224
|
+
make_year.save!
|
225
|
+
end
|
226
|
+
end
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
def derive_volume
|
231
|
+
find_in_batches do |batch|
|
232
|
+
batch.each do |record|
|
233
|
+
record.volume = record.fleet_years.collect(&:volume).sum
|
234
|
+
record.save!
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
class AutomobileMakeFleetYear < ActiveRecord::Base
|
242
|
+
set_primary_key :row_hash
|
243
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
244
|
+
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
245
|
+
belongs_to :make_year, :class_name => 'AutomobileMakeYear', :foreign_key => 'automobile_make_year_id'
|
246
|
+
|
247
|
+
data_miner do
|
248
|
+
# CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
|
249
|
+
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
|
250
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
|
251
|
+
:select => lambda { |row| row['volume'].to_i > 0 } do |attr|
|
252
|
+
attr.store 'make_name', :field_name => 'manufacturer_name' # prefix
|
253
|
+
attr.store 'year', :field_name => 'year_content'
|
254
|
+
attr.store 'fleet', :chars => 2..3
|
255
|
+
attr.store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
256
|
+
attr.store 'volume'
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
260
|
+
|
261
|
+
class AutomobileModelYear < ActiveRecord::Base
|
262
|
+
set_primary_key :year
|
263
|
+
|
264
|
+
has_many :make_years, :class_name => 'AutomobileMakeYear'
|
265
|
+
has_many :variants, :class_name => 'AutomobileVariant'
|
266
|
+
|
267
|
+
data_miner do
|
268
|
+
unique_index 'year'
|
269
|
+
|
270
|
+
# await :other_class => AutomobileMakeYear do |deferred|
|
271
|
+
# # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => :volume
|
272
|
+
# end
|
273
|
+
end
|
274
|
+
end
|
275
|
+
|
276
|
+
class AutomobileFuelType < ActiveRecord::Base
|
277
|
+
set_primary_key :code
|
278
|
+
|
279
|
+
data_miner do
|
280
|
+
unique_index 'code'
|
281
|
+
|
282
|
+
import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
283
|
+
:filename => 'Gd6-dsc.txt',
|
284
|
+
:format => :fixed_width,
|
285
|
+
:crop => 21..26, # inclusive
|
286
|
+
:cut => '2-',
|
287
|
+
:select => lambda { |row| /\A[A-Z]/.match row[:code] },
|
288
|
+
:schema => [[ 'code', 2, { :type => :string } ],
|
289
|
+
[ 'spacer', 2 ],
|
290
|
+
[ 'name', 52, { :type => :string } ]]) do |attr|
|
291
|
+
attr.store 'name'
|
292
|
+
end
|
293
|
+
|
294
|
+
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do |attr|
|
295
|
+
attr.store 'name'
|
296
|
+
attr.store 'annual_distance'
|
297
|
+
attr.store 'emission_factor'
|
298
|
+
end
|
299
|
+
|
300
|
+
# pull electricity emission factor from residential electricity
|
301
|
+
import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
|
302
|
+
:select => lambda { |row| row['code'] == 'El' }) do |attr|
|
303
|
+
attr.store 'name'
|
304
|
+
attr.store 'emission_factor'
|
305
|
+
end
|
306
|
+
|
307
|
+
# still need distance estimate for electric cars
|
308
|
+
end
|
309
|
+
|
310
|
+
CODES = {
|
311
|
+
:electricity => 'El',
|
312
|
+
:diesel => 'D'
|
313
|
+
}
|
314
|
+
end
|
315
|
+
|
316
|
+
class AutomobileModel < ActiveRecord::Base
|
317
|
+
set_primary_key :row_hash
|
318
|
+
|
319
|
+
has_many :variants, :class_name => 'AutomobileVariant'
|
320
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
321
|
+
|
322
|
+
data_miner do
|
323
|
+
# derived from FEG automobile variants
|
324
|
+
end
|
325
|
+
end
|
326
|
+
|
327
|
+
class AutomobileMake < ActiveRecord::Base
|
328
|
+
set_primary_key :name
|
329
|
+
|
330
|
+
has_many :make_years, :class_name => 'AutomobileMakeYear'
|
331
|
+
has_many :models, :class_name => 'AutomobileModel'
|
332
|
+
has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
|
333
|
+
has_many :variants, :class_name => 'AutomobileVariant'
|
334
|
+
|
335
|
+
data_miner do
|
336
|
+
unique_index 'name'
|
337
|
+
|
338
|
+
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/makes/make_importance.csv' do |attr|
|
339
|
+
attr.store 'major'
|
340
|
+
end
|
341
|
+
# await :other_class => AutomobileMakeYear do |deferred|
|
342
|
+
# deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => 'volume'
|
343
|
+
# end
|
344
|
+
end
|
345
|
+
end
|
346
|
+
|
347
|
+
class AutomobileVariant < ActiveRecord::Base
|
348
|
+
set_primary_key :row_hash
|
349
|
+
|
350
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
351
|
+
belongs_to :model, :class_name => 'AutomobileModel', :foreign_key => 'automobile_model_id'
|
352
|
+
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
353
|
+
belongs_to :fuel_type, :class_name => 'AutomobileFuelType', :foreign_key => 'automobile_fuel_type_id'
|
354
|
+
|
355
|
+
data_miner do
|
356
|
+
# 1985---1997
|
357
|
+
(85..97).each do |yy|
|
358
|
+
filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
|
359
|
+
import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
|
360
|
+
:filename => filename,
|
361
|
+
:transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
|
362
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
|
363
|
+
attr.store 'make_name', :field_name => 'make'
|
364
|
+
attr.store 'model_name', :field_name => 'model'
|
365
|
+
attr.store 'year'
|
366
|
+
attr.store 'fuel_type_code', :field_name => 'fuel_type'
|
367
|
+
attr.store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
368
|
+
attr.store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
369
|
+
attr.store 'cylinders', :field_name => 'no_cyc'
|
370
|
+
attr.store 'drive', :field_name => 'drive_system'
|
371
|
+
attr.store 'carline_mfr_code'
|
372
|
+
attr.store 'vi_mfr_code'
|
373
|
+
attr.store 'carline_code'
|
374
|
+
attr.store 'carline_class_code', :field_name => 'carline_clss'
|
375
|
+
attr.store 'transmission'
|
376
|
+
attr.store 'speeds'
|
377
|
+
attr.store 'turbo'
|
378
|
+
attr.store 'supercharger'
|
379
|
+
attr.store 'injection'
|
380
|
+
attr.store 'displacement'
|
381
|
+
end
|
382
|
+
end
|
383
|
+
|
384
|
+
# 1998--2005
|
385
|
+
{
|
386
|
+
1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
|
387
|
+
1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
|
388
|
+
2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
|
389
|
+
2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
|
390
|
+
2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
|
391
|
+
2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
|
392
|
+
2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
|
393
|
+
2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
|
394
|
+
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
395
|
+
import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
|
396
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
|
397
|
+
attr.store 'make_name', :field_name => 'make'
|
398
|
+
attr.store 'model_name', :field_name => 'model'
|
399
|
+
attr.store 'fuel_type_code', :field_name => 'fl'
|
400
|
+
attr.store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
401
|
+
attr.store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
402
|
+
attr.store 'cylinders', :field_name => 'cyl'
|
403
|
+
attr.store 'displacement', :field_name => 'displ'
|
404
|
+
attr.store 'carline_class_code', :field_name => 'cls' if year >= 2000
|
405
|
+
attr.store 'carline_class_name', :field_name => 'Class'
|
406
|
+
attr.store 'year'
|
407
|
+
attr.store 'transmission'
|
408
|
+
attr.store 'speeds'
|
409
|
+
attr.store 'turbo'
|
410
|
+
attr.store 'supercharger'
|
411
|
+
attr.store 'injection'
|
412
|
+
attr.store 'drive'
|
413
|
+
end
|
414
|
+
end
|
415
|
+
|
416
|
+
# 2006--2010
|
417
|
+
{
|
418
|
+
2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
|
419
|
+
2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
|
420
|
+
2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
|
421
|
+
2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
|
422
|
+
# 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
|
423
|
+
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
424
|
+
import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
|
425
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
|
426
|
+
attr.store 'make_name', :field_name => 'make'
|
427
|
+
attr.store 'model_name', :field_name => 'model'
|
428
|
+
attr.store 'fuel_type_code', :field_name => 'FUEL TYPE'
|
429
|
+
attr.store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
430
|
+
attr.store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
431
|
+
attr.store 'cylinders', :field_name => 'NUMB CYL'
|
432
|
+
attr.store 'displacement', :field_name => 'DISPLACEMENT'
|
433
|
+
attr.store 'carline_class_code', :field_name => 'CLS'
|
434
|
+
attr.store 'carline_class_name', :field_name => 'CLASS'
|
435
|
+
attr.store 'year'
|
436
|
+
attr.store 'transmission'
|
437
|
+
attr.store 'speeds'
|
438
|
+
attr.store 'turbo'
|
439
|
+
attr.store 'supercharger'
|
440
|
+
attr.store 'injection'
|
441
|
+
attr.store 'drive'
|
442
|
+
end
|
443
|
+
end
|
444
|
+
|
445
|
+
# associate :make, :key => :original_automobile_make_name, :foreign_key => :name
|
446
|
+
# derive :automobile_model_id # creates models by name
|
447
|
+
# associate :model_year, :key => :original_automobile_model_year_year, :foreign_key => :year
|
448
|
+
# associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
|
449
|
+
process :set_adjusted_fuel_economy
|
450
|
+
end
|
451
|
+
|
452
|
+
def name
|
453
|
+
extra = []
|
454
|
+
extra << "V#{cylinders}" if cylinders
|
455
|
+
extra << "#{displacement}L" if displacement
|
456
|
+
extra << "turbo" if turbo
|
457
|
+
extra << "FI" if injection
|
458
|
+
extra << "#{speeds}spd" if speeds.present?
|
459
|
+
extra << transmission if transmission.present?
|
460
|
+
extra << "(#{fuel_type.name})" if fuel_type
|
461
|
+
extra.join(' ')
|
462
|
+
end
|
463
|
+
|
464
|
+
# Returns the city and highway fuel efficiencies as a "city/highway" string.
# Each value is treated as kilometres per litre, converted to miles per gallon
# and rounded to a whole number.
def fuel_economy_description
  mpg_figures = [fuel_efficiency_city, fuel_efficiency_highway].map do |efficiency|
    efficiency.kilometres_per_litre.to(:miles_per_gallon).round
  end
  mpg_figures.join('/')
end
|
467
|
+
|
468
|
+
class << self
  # Recomputes fuel_efficiency_city and fuel_efficiency_highway for every row
  # from the corresponding raw_* columns, using one SQL UPDATE per column.
  # NOTE(review): 0.425143707 looks like the km/l-per-mpg conversion factor - confirm.
  def set_adjusted_fuel_economy
    update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
    update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
  end

  # the following matching methods are needed by the errata
  # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
  #
  # Each matcher takes a row (a Hash-like object keyed by column name) and
  # returns a truthy/falsy value. Exact-match matchers call to_s before upcase
  # so that a missing make/MFR yields false instead of raising NoMethodError,
  # in line with the regexp-based matchers, which already tolerate nil.

  # True when the row's 'transmission' value is blank.
  def transmission_is_blank?(row)
    row['transmission'].blank?
  end

  # True when the row's year is 2007 and its manufacturer is GMC or Chevrolet.
  # NOTE(review): compares against the Integer 2007, so this assumes
  # row['year'] is already an Integer - confirm against the parser.
  def is_a_2007_gmc_or_chevrolet?(row)
    row['year'] == 2007 && %w(GMC CHEVROLET).include?(row['MFR'].to_s.upcase)
  end

  # True when the row's make is Porsche (case-insensitive).
  def is_a_porsche?(row)
    row['make'].to_s.upcase == 'PORSCHE'
  end

  # Negation of is_a_porsche?; a row without a make counts as "not a Porsche".
  def is_not_a_porsche?(row)
    !is_a_porsche?(row)
  end

  # Truthy when the row's make contains "mercedes" (case-insensitive).
  def is_a_mercedes_benz?(row)
    row['make'] =~ /MERCEDES/i
  end

  # True when the row's make is Lexus (case-insensitive).
  def is_a_lexus?(row)
    row['make'].to_s.upcase == 'LEXUS'
  end

  # True when the row's make is BMW (case-insensitive).
  def is_a_bmw?(row)
    row['make'].to_s.upcase == 'BMW'
  end

  # True when the row's make is Ford (case-insensitive).
  def is_a_ford?(row)
    row['make'].to_s.upcase == 'FORD'
  end

  # Truthy when the make looks like Rolls-Royce and the model mentions Bentley.
  def is_a_rolls_royce_and_model_contains_bentley?(row)
    is_a_rolls_royce?(row) && model_contains_bentley?(row)
  end

  # True when the row's make is Bentley (case-insensitive).
  def is_a_bentley?(row)
    row['make'].to_s.upcase == 'BENTLEY'
  end

  # Truthy when the row's make contains "rolls" (case-insensitive).
  def is_a_rolls_royce?(row)
    row['make'] =~ /ROLLS/i
  end

  # Truthy when the row's model contains "TURBO R/RL BKLDS" (case-insensitive).
  def is_a_turbo_brooklands?(row)
    row['model'] =~ /TURBO R\/RL BKLDS/i
  end

  # Truthy when the row's model contains "maybach" (case-insensitive).
  def model_contains_maybach?(row)
    row['model'] =~ /MAYBACH/i
  end

  # Truthy when the row's model contains "bentley" (case-insensitive).
  def model_contains_bentley?(row)
    row['model'] =~ /BENTLEY/i
  end
end
|
20
533
|
end
|
21
534
|
|
22
535
|
# Test model for countries, keyed on the iso_3166 column.
class Country < ActiveRecord::Base
  set_primary_key :iso_3166

  data_miner do
    unique_index 'iso_3166'

    # get a complete list (semicolon-delimited file read without headers)
    import(:url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';') do |column|
      column.store 'iso_3166', :field_number => 1
      column.store 'name', :field_number => 0
    end

    # get nicer names (same two attributes, stored from a second CSV source)
    import(:url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv') do |column|
      column.store 'iso_3166', :field_name => 'country code'
      column.store 'name', :field_name => 'country'
    end
  end
end
|
32
554
|
|
33
555
|
# Test model for airports, keyed on the iata_code column.
class Airport < ActiveRecord::Base
  set_primary_key :iata_code
  belongs_to :country

  data_miner do
    unique_index 'iata_code'

    # import airport iata_code, name, etc.
    # Only rows whose fifth field (index 4, stored below as iata_code) is present are selected.
    has_iata_code = lambda { |row| row[4].present? }
    import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => has_iata_code) do |column|
      column.store 'name', :field_number => 1
      column.store 'city', :field_number => 2
      column.store 'country_name', :field_number => 3
      column.store 'iata_code', :field_number => 4
      column.store 'latitude', :field_number => 6
      column.store 'longitude', :field_number => 7
    end
  end
end
|
573
|
+
|
574
|
+
# Test model for census regions, keyed on the number column. The same source
# is imported twice on purpose so the test suite can check idempotency.
class CensusRegion < ActiveRecord::Base
  set_primary_key :number

  data_miner do
    unique_index 'number'

    codes_url = 'http://www.census.gov/popest/geographic/codes02.csv'
    # Keep rows with a positive Region number whose Division is the literal 'X'.
    region_rows = lambda { |row| row['Region'].to_i > 0 && row['Division'].to_s.strip == 'X' }

    import(:url => codes_url, :skip => 9, :select => region_rows) do |column|
      column.store 'name', :field_name => 'Name'
      column.store 'number', :field_name => 'Region'
    end

    # pretend this is a different data source
    import(:url => codes_url, :skip => 9, :select => region_rows) do |column|
      column.store 'name', :field_name => 'Name'
      column.store 'number', :field_name => 'Region'
    end
  end
end
|
@@ -49,30 +593,82 @@ end
|
|
49
593
|
# Registers every test model with DataMiner, in this order.
DataMiner.enqueue do |pending|
  pending << Country
  pending << Airport
  pending << CensusRegion
  # The automobile models below were all marked "OK" by the author.
  pending << AutomobileFuelType
  pending << AutomobileModel
  pending << AutomobileMake
  pending << AutomobileModelYear
  pending << AutomobileVariant
  pending << AutomobileMakeFleetYear # third-party data not yet hosted on third-party site
  pending << AutomobileMakeYear
end
|
53
605
|
|
54
|
-
class DataMinerTest < Test::Unit::TestCase
|
55
|
-
|
56
|
-
|
57
|
-
Country.
|
606
|
+
class DataMinerTest < Test::Unit::TestCase
|
607
|
+
should "be idempotent" do
  # Mining a class a second time must leave its row count unchanged.
  [Country, CensusRegion].each do |mined_class|
    mined_class.data_miner_config.run
    count_after_first_run = mined_class.count
    mined_class.data_miner_config.run
    count_after_second_run = mined_class.count
    assert_equal count_after_first_run, count_after_second_run
  end
end
|
620
|
+
|
621
|
+
should "assume that no unique indices means it wants a big hash" do
  # Declaring imports without any unique_index is expected to raise
  # DataMiner::MissingHashColumn while the class body is being evaluated.
  assert_raises(DataMiner::MissingHashColumn) do
    class IncompleteCountry < ActiveRecord::Base
      set_table_name 'countries'

      data_miner do
        # no unique index

        # get a complete list
        import(:url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';') do |column|
          column.store 'iso_3166', :field_number => 1
          column.store 'name', :field_number => 0
        end

        # get nicer names
        import(:url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv') do |column|
          column.store 'iso_3166', :field_name => 'country code'
          column.store 'name', :field_name => 'country'
        end
      end
    end
  end
end
|
644
|
+
|
645
|
+
should "hash things if no unique index is listed" do
  # Run only the first configured runnable, then check that the first stored
  # row has a non-blank row_hash.
  first_runnable = AutomobileVariant.data_miner_config.runnables.first
  first_runnable.run
  assert AutomobileVariant.first.row_hash.present?
end
|
649
|
+
|
650
|
+
# should "mine multiple classes in the correct order" do
|
651
|
+
# DataMiner.run :class_names => DataMiner.classes.map(&:class_name)
|
652
|
+
# uy = Country.find_by_iso_3166('UY')
|
653
|
+
# assert_equal 'Uruguay', uy.name
|
654
|
+
# end
|
59
655
|
|
60
|
-
should "
|
61
|
-
Country
|
62
|
-
|
63
|
-
assert_equal
|
656
|
+
should "have a target record for every class that is mined" do
  DataMiner.run :class_names => %w{ Country }
  assert DataMiner::Target.exists?(:name => 'Country')
  # Count by the same capitalised name as the exists? check: a lowercase
  # 'country' would only match where the database compares strings
  # case-insensitively, making the assertion depend on the adapter/collation.
  assert_equal 1, DataMiner::Target.count(:conditions => { :name => 'Country' })
end
|
65
661
|
|
66
|
-
should "
|
67
|
-
|
68
|
-
|
69
|
-
|
662
|
+
should "keep a log when it does a run" do
  # The latest run's timestamps should fall within this many seconds of the
  # wall-clock times sampled just before and just after the run.
  tolerance = 5 # seconds
  approx_started_at = Time.now
  DataMiner.run :class_names => %w{ Country }
  approx_ended_at = Time.now
  target = DataMiner::Target.find_by_name('Country')
  assert((target.runs.last.started_at - approx_started_at).abs < tolerance)
  assert((target.runs.last.ended_at - approx_ended_at).abs < tolerance)
end
|
71
670
|
|
72
|
-
should "
|
73
|
-
|
74
|
-
uy = Country.find_by_iso_3166('UY')
|
75
|
-
assert_equal 'Uruguay', uy.name
|
76
|
-
assert_equal uy, Airport.find_by_iata_code('MVD').country
|
671
|
+
should "remove rows that have disappeared from the external data source" do
  # Placeholder: fails unconditionally until this test is written.
  flunk("not implemented yet")
end
|
78
674
|
end
|