data_miner 0.2.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/CHANGELOG +5 -0
- data/README.rdoc +11 -15
- data/Rakefile +7 -2
- data/VERSION +1 -1
- data/data_miner.gemspec +27 -28
- data/lib/data_miner.rb +50 -27
- data/lib/data_miner/attribute.rb +157 -240
- data/lib/data_miner/configuration.rb +58 -55
- data/lib/data_miner/import.rb +57 -0
- data/lib/data_miner/process.rb +21 -0
- data/lib/data_miner/run.rb +7 -0
- data/lib/data_miner/target.rb +7 -0
- data/test/data_miner_test.rb +644 -48
- data/test/test_helper.rb +134 -3
- metadata +29 -23
- data/lib/data_miner/active_record_ext.rb +0 -25
- data/lib/data_miner/attribute_collection.rb +0 -51
- data/lib/data_miner/step.rb +0 -64
- data/lib/data_miner/step/associate.rb +0 -9
- data/lib/data_miner/step/await.rb +0 -35
- data/lib/data_miner/step/callback.rb +0 -22
- data/lib/data_miner/step/derive.rb +0 -9
- data/lib/data_miner/step/import.rb +0 -57
@@ -1,61 +1,55 @@
|
|
1
1
|
module DataMiner
|
2
2
|
class Configuration
|
3
|
-
|
3
|
+
include Blockenspiel::DSL
|
4
|
+
|
5
|
+
attr_accessor :klass, :runnables, :runnable_counter, :attributes, :unique_indices
|
4
6
|
|
5
7
|
def initialize(klass)
|
6
|
-
@
|
8
|
+
@runnables = Array.new
|
9
|
+
@unique_indices = Set.new
|
7
10
|
@klass = klass
|
8
|
-
@
|
9
|
-
@attributes =
|
11
|
+
@runnable_counter = 0
|
12
|
+
@attributes = HashWithIndifferentAccess.new
|
10
13
|
end
|
11
14
|
|
12
|
-
|
13
|
-
|
14
|
-
def #{method}(*args, &block)
|
15
|
-
self.counter += 1
|
16
|
-
if block_given? # FORM C
|
17
|
-
step_options = args[0] || {}
|
18
|
-
set_awaiting!(step_options)
|
19
|
-
self.steps << Step::#{method.camelcase}.new(self, counter, step_options, &block)
|
20
|
-
elsif args[0].is_a?(Hash) # FORM A
|
21
|
-
step_options = args[0]
|
22
|
-
set_awaiting!(step_options)
|
23
|
-
self.steps << Step::#{method.camelcase}.new(self, counter, step_options)
|
24
|
-
else # FORM B
|
25
|
-
attr_name = args[0]
|
26
|
-
attr_options = args[1] || {}
|
27
|
-
step_options = {}
|
28
|
-
set_awaiting!(step_options)
|
29
|
-
self.steps << Step::#{method.camelcase}.new(self, counter, step_options) do |attr|
|
30
|
-
attr.affect attr_name, attr_options
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
EOS
|
15
|
+
def unique_index(*args)
|
16
|
+
args.each { |arg| unique_indices.add arg }
|
35
17
|
end
|
36
|
-
|
37
|
-
def
|
38
|
-
|
18
|
+
|
19
|
+
def process(callback)
|
20
|
+
self.runnable_counter += 1
|
21
|
+
runnables << DataMiner::Process.new(self, runnable_counter, callback)
|
39
22
|
end
|
40
23
|
|
41
|
-
def
|
42
|
-
self.
|
24
|
+
def import(options = {}, &block)
|
25
|
+
self.runnable_counter += 1
|
26
|
+
runnables << DataMiner::Import.new(self, runnable_counter, options, &block)
|
27
|
+
end
|
28
|
+
|
29
|
+
def before_invoke
|
30
|
+
self.class.create_tables
|
43
31
|
end
|
44
32
|
|
45
|
-
def
|
46
|
-
|
33
|
+
def after_invoke
|
34
|
+
if unique_indices.empty?
|
35
|
+
raise(MissingHashColumn, "No unique_index defined for #{klass.name}, so you need a row_hash:string column.") unless klass.column_names.include?('row_hash')
|
36
|
+
unique_indices.add 'row_hash'
|
37
|
+
end
|
38
|
+
runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
|
47
39
|
end
|
48
40
|
|
49
41
|
# Mine data for this class.
|
50
|
-
def
|
51
|
-
|
42
|
+
def run
|
43
|
+
target = DataMiner::Target.find_or_create_by_name klass.name
|
44
|
+
run = target.runs.create! :started_at => Time.now
|
45
|
+
begin
|
46
|
+
runnables.each(&:run)
|
47
|
+
ensure
|
48
|
+
run.update_attributes! :ended_at => Time.now
|
49
|
+
end
|
50
|
+
nil
|
52
51
|
end
|
53
52
|
|
54
|
-
# Map <tt>method</tt> to attributes
|
55
|
-
def map_to_attrs(method)
|
56
|
-
steps.map { |step| step.map_to_attrs(method) }.compact
|
57
|
-
end
|
58
|
-
|
59
53
|
cattr_accessor :classes
|
60
54
|
self.classes = []
|
61
55
|
class << self
|
@@ -63,32 +57,41 @@ module DataMiner
|
|
63
57
|
#
|
64
58
|
# Options
|
65
59
|
# * <tt>:class_names</tt>: provide an array of class names to mine
|
66
|
-
def
|
60
|
+
def run(options = {})
|
67
61
|
classes.each do |klass|
|
68
62
|
if options[:class_names].blank? or options[:class_names].include?(klass.name)
|
69
|
-
klass.
|
63
|
+
klass.data_miner_config.run
|
70
64
|
end
|
71
65
|
end
|
72
66
|
end
|
73
67
|
|
74
|
-
# Map a <tt>method</tt> to attrs. Defaults to all classes touched by DataMiner.
|
75
|
-
#
|
76
|
-
# Options
|
77
|
-
# * <tt>:class_names</tt>: provide an array class names to mine
|
78
|
-
def map_to_attrs(method, options = {})
|
79
|
-
classes.map do |klass|
|
80
|
-
if options[:class_names].blank? or options[:class_names].include?(klass.name)
|
81
|
-
klass.data_mine.map_to_attrs method
|
82
|
-
end
|
83
|
-
end.flatten.compact
|
84
|
-
end
|
85
|
-
|
86
68
|
# Queue up all the ActiveRecord classes that DataMiner should touch.
|
87
69
|
#
|
88
70
|
# Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
|
89
71
|
def enqueue(&block)
|
90
72
|
yield self.classes
|
91
73
|
end
|
74
|
+
|
75
|
+
def create_tables
|
76
|
+
c = ActiveRecord::Base.connection
|
77
|
+
unless c.table_exists?('data_miner_targets')
|
78
|
+
c.create_table 'data_miner_targets', :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
79
|
+
t.string 'name'
|
80
|
+
t.datetime 'created_at'
|
81
|
+
t.datetime 'updated_at'
|
82
|
+
end
|
83
|
+
c.execute 'ALTER TABLE data_miner_targets ADD PRIMARY KEY (name);'
|
84
|
+
end
|
85
|
+
unless c.table_exists?('data_miner_runs')
|
86
|
+
c.create_table 'data_miner_runs', :options => 'ENGINE=InnoDB default charset=utf8' do |t|
|
87
|
+
t.string 'data_miner_target_id'
|
88
|
+
t.datetime 'started_at'
|
89
|
+
t.datetime 'ended_at'
|
90
|
+
t.datetime 'created_at'
|
91
|
+
t.datetime 'updated_at'
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
92
95
|
end
|
93
96
|
end
|
94
97
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module DataMiner
|
2
|
+
class Import
|
3
|
+
attr_accessor :configuration, :position_in_run, :options, :table, :errata
|
4
|
+
delegate :klass, :to => :configuration
|
5
|
+
delegate :unique_indices, :to => :configuration
|
6
|
+
|
7
|
+
def initialize(configuration, position_in_run, options = {}, &block)
|
8
|
+
@configuration = configuration
|
9
|
+
@position_in_run = position_in_run
|
10
|
+
@options = options
|
11
|
+
yield self if block_given? # pull in attributes
|
12
|
+
@errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
|
13
|
+
@table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
|
14
|
+
end
|
15
|
+
|
16
|
+
def inspect
|
17
|
+
"Import(#{klass}) position #{position_in_run}"
|
18
|
+
end
|
19
|
+
|
20
|
+
def attributes
|
21
|
+
configuration.attributes.reject { |k, v| !v.stored_by? self }
|
22
|
+
end
|
23
|
+
|
24
|
+
def stores?(attr_name)
|
25
|
+
configuration.attributes[attr_name].andand.stored_by? self
|
26
|
+
end
|
27
|
+
|
28
|
+
def store(attr_name, attr_options = {})
|
29
|
+
configuration.attributes[attr_name] ||= Attribute.new(klass, attr_name)
|
30
|
+
configuration.attributes[attr_name].options_for_import[self] = attr_options
|
31
|
+
end
|
32
|
+
|
33
|
+
def run
|
34
|
+
table.each_row do |row|
|
35
|
+
if errata
|
36
|
+
next if errata.rejects?(row)
|
37
|
+
errata.correct!(row)
|
38
|
+
end
|
39
|
+
|
40
|
+
unifying_values = unique_indices.map do |attr_name|
|
41
|
+
[ attributes[attr_name].value_from_row(self, row) ]
|
42
|
+
end
|
43
|
+
|
44
|
+
record_set = WilliamJamesCartesianProduct.cart_prod(*unifying_values).map do |combination|
|
45
|
+
next if combination.include?(nil)
|
46
|
+
klass.send "find_or_initialize_by_#{unique_indices.to_a.join('_and_')}", *combination
|
47
|
+
end.flatten
|
48
|
+
|
49
|
+
Array.wrap(record_set).each do |record|
|
50
|
+
attributes.values.each { |attr| attr.set_record_from_row(self, record, row) }
|
51
|
+
record.save!
|
52
|
+
end
|
53
|
+
end
|
54
|
+
DataMiner.logger.info "performed #{inspect}"
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module DataMiner
|
2
|
+
class Process
|
3
|
+
attr_accessor :configuration, :position_in_run, :callback
|
4
|
+
delegate :klass, :to => :configuration
|
5
|
+
|
6
|
+
def initialize(configuration, position_in_run, callback)
|
7
|
+
@configuration = configuration
|
8
|
+
@position_in_run = position_in_run
|
9
|
+
@callback = callback
|
10
|
+
end
|
11
|
+
|
12
|
+
def inspect
|
13
|
+
"Process(#{klass}) position #{position_in_run}"
|
14
|
+
end
|
15
|
+
|
16
|
+
def run
|
17
|
+
klass.send callback
|
18
|
+
DataMiner.logger.info "ran #{inspect}"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/test/data_miner_test.rb
CHANGED
@@ -1,47 +1,591 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
3
|
+
module FuelEconomyGuide
|
4
|
+
TRANSMISSIONS = {
|
5
|
+
'A' => 'automatic',
|
6
|
+
'M' => 'manual',
|
7
|
+
'L' => 'automatic', # Lockup/automatic
|
8
|
+
'S' => 'semiautomatic', # Semiautomatic
|
9
|
+
'C' => 'manual' # TODO verify for VW Syncro
|
10
|
+
}
|
11
|
+
|
12
|
+
ENGINE_TYPES = {
|
13
|
+
'(GUZZLER)' => nil, # "gas guzzler"
|
14
|
+
'(POLICE)' => nil, # police automobile_variant
|
15
|
+
'(MPFI)' => 'injection',
|
16
|
+
'(MPI*)' => 'injection',
|
17
|
+
'(SPFI)' => 'injection',
|
18
|
+
'(FFS)' => 'injection',
|
19
|
+
'(TURBO)' => 'turbo',
|
20
|
+
'(TRBO)' => 'turbo',
|
21
|
+
'(TC*)' => 'turbo',
|
22
|
+
'(FFS,TRBO)' => %w(injection turbo),
|
23
|
+
'(S-CHARGE)' => 'supercharger',
|
24
|
+
'(SC*)' => 'supercharger',
|
25
|
+
'(DIESEL)' => nil, # diesel
|
26
|
+
'(DSL)' => nil, # diesel
|
27
|
+
'(ROTARY)' => nil, # rotary
|
28
|
+
'(VARIABLE)' => nil, # variable displacement
|
29
|
+
'(NO-CAT)' => nil, # no catalytic converter
|
30
|
+
'(OHC)' => nil, # overhead camshaft
|
31
|
+
'(OHV)' => nil, # overhead valves
|
32
|
+
'(16-VALVE)' => nil, # 16V
|
33
|
+
'(305)' => nil, # 305 cubic inch displacement
|
34
|
+
'(307)' => nil, # 307 cubic inch displacement
|
35
|
+
'(M-ENG)' => nil,
|
36
|
+
'(W-ENG)' => nil,
|
37
|
+
'(GM-BUICK)' => nil,
|
38
|
+
'(GM-CHEV)' => nil,
|
39
|
+
'(GM-OLDS)' => nil,
|
40
|
+
'(GM-PONT)' => nil,
|
41
|
+
}
|
42
|
+
|
43
|
+
class ParserB
|
44
|
+
attr_accessor :year
|
45
|
+
def initialize(options = {})
|
46
|
+
@year = options[:year]
|
47
|
+
end
|
48
|
+
|
49
|
+
def apply(row)
|
50
|
+
row.merge!({
|
51
|
+
'make' => row['carline_mfr_name'], # make it line up with the errata
|
52
|
+
'model' => row['carline_name'], # ditto
|
53
|
+
'transmission' => TRANSMISSIONS[row['model_trans'][0, 1]],
|
54
|
+
'speeds' => (row['model_trans'][1, 1] == 'V') ? 'variable' : row['model_trans'][1, 1],
|
55
|
+
'turbo' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('turbo'),
|
56
|
+
'supercharger' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('supercharger'),
|
57
|
+
'injection' => [ENGINE_TYPES[row['engine_desc1']], ENGINE_TYPES[row['engine_desc2']]].flatten.include?('injection'),
|
58
|
+
'displacement' => _displacement(row['opt_disp']),
|
59
|
+
'year' => year
|
60
|
+
})
|
61
|
+
row
|
62
|
+
end
|
63
|
+
|
64
|
+
def _displacement(str)
|
65
|
+
str = str.gsub(/[\(\)]/, '').strip
|
66
|
+
if str =~ /^(.+)L$/
|
67
|
+
$1.to_f
|
68
|
+
elsif str =~ /^(.+)CC$/
|
69
|
+
$1.to_f / 1000
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
def add_hints!(bus)
|
74
|
+
bus[:format] = :fixed_width
|
75
|
+
bus[:cut] = '13-' if year == 1995
|
76
|
+
bus[:schema_name] = :fuel_economy_guide_b
|
77
|
+
bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
|
78
|
+
Slither.define :fuel_economy_guide_b do |d|
|
79
|
+
d.rows do |row|
|
80
|
+
row.trap { true } # there's only one section
|
81
|
+
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
82
|
+
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
83
|
+
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
84
|
+
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
85
|
+
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
86
|
+
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
87
|
+
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
88
|
+
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
89
|
+
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
90
|
+
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
91
|
+
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
92
|
+
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
93
|
+
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
94
|
+
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
95
|
+
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
96
|
+
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
97
|
+
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
98
|
+
row.spacer 2
|
99
|
+
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
100
|
+
row.spacer 2
|
101
|
+
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
102
|
+
row.spacer 2
|
103
|
+
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
104
|
+
row.spacer 2
|
105
|
+
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
106
|
+
row.spacer 2
|
107
|
+
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
108
|
+
row.spacer 2
|
109
|
+
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
110
|
+
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
111
|
+
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
112
|
+
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
113
|
+
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
114
|
+
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
115
|
+
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
116
|
+
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
117
|
+
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
118
|
+
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
119
|
+
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
120
|
+
row.column 'filler' , 1, :type => :string # NOT USED
|
121
|
+
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
122
|
+
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
class ParserC
|
128
|
+
attr_accessor :year
|
129
|
+
def initialize(options = {})
|
130
|
+
@year = options[:year]
|
131
|
+
end
|
132
|
+
|
133
|
+
def add_hints!(bus)
|
134
|
+
# File will decide format based on filename
|
135
|
+
end
|
136
|
+
|
137
|
+
def apply(row)
|
138
|
+
row.merge!({
|
139
|
+
'make' => row['Manufacturer'], # make it line up with the errata
|
140
|
+
'model' => row['carline name'], # ditto
|
141
|
+
'drive' => row['drv'] + 'WD',
|
142
|
+
'transmission' => TRANSMISSIONS[row['trans'][-3, 1]],
|
143
|
+
'speeds' => (row['trans'][-2, 1] == 'V') ? 'variable' : row['trans'][-2, 1],
|
144
|
+
'turbo' => row['T'] == 'T',
|
145
|
+
'supercharger' => row['S'] == 'S',
|
146
|
+
'injection' => true,
|
147
|
+
'year' => year
|
148
|
+
})
|
149
|
+
row
|
150
|
+
end
|
151
|
+
end
|
152
|
+
class ParserD
|
153
|
+
attr_accessor :year
|
154
|
+
def initialize(options = {})
|
155
|
+
@year = options[:year]
|
156
|
+
end
|
157
|
+
|
158
|
+
def add_hints!(bus)
|
159
|
+
bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
|
160
|
+
end
|
161
|
+
|
162
|
+
def apply(row)
|
163
|
+
row.merge!({
|
164
|
+
'make' => row['MFR'], # make it line up with the errata
|
165
|
+
'model' => row['CAR LINE'], # ditto
|
166
|
+
'drive' => row['DRIVE SYS'] + 'WD',
|
167
|
+
'transmission' => TRANSMISSIONS[row['TRANS'][-3, 1]],
|
168
|
+
'speeds' => (row['TRANS'][-2, 1] == 'V') ? 'variable' : row['TRANS'][-2, 1],
|
169
|
+
'turbo' => row['TURBO'] == 'T',
|
170
|
+
'supercharger' => row['SPCHGR'] == 'S',
|
171
|
+
'injection' => true,
|
172
|
+
'year' => year
|
173
|
+
})
|
174
|
+
row
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
class AutomobileMakeYear < ActiveRecord::Base
|
180
|
+
set_primary_key :row_hash
|
181
|
+
|
182
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
183
|
+
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
184
|
+
has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
|
185
|
+
|
186
|
+
data_miner do
|
187
|
+
process :derive_from_make_fleet_years
|
188
|
+
process :derive_association_to_make_fleet_years
|
189
|
+
process :derive_fuel_efficiency
|
190
|
+
process :derive_volume
|
191
|
+
end
|
192
|
+
|
193
|
+
# validates_numericality_of :fuel_efficiency, :greater_than => 0, :allow_nil => true
|
194
|
+
|
195
|
+
class << self
|
196
|
+
def derive_from_make_fleet_years
|
197
|
+
AutomobileMakeFleetYear.find_in_batches do |batch|
|
198
|
+
batch.each do |record|
|
199
|
+
#puts " * Considering AMFY #{record.inspect}"
|
200
|
+
if record.make and record.model_year
|
201
|
+
find_or_create_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
def derive_association_to_make_fleet_years
|
208
|
+
AutomobileMakeFleetYear.find_in_batches do |batch|
|
209
|
+
batch.each do |record|
|
210
|
+
if record.make and record.model_year
|
211
|
+
record.make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
212
|
+
record.save! if record.changed?
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
def derive_fuel_efficiency
|
219
|
+
AutomobileMakeFleetYear.find_in_batches do |batch|
|
220
|
+
batch.each do |record|
|
221
|
+
if record.make and record.model_year
|
222
|
+
make_year = find_by_automobile_make_id_and_automobile_model_year_id record.make.id, record.model_year.id
|
223
|
+
# make_year.fuel_efficiency = make_year.fleet_years.weighted_average :fuel_efficiency, :by => :volume
|
224
|
+
make_year.save!
|
225
|
+
end
|
226
|
+
end
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
def derive_volume
|
231
|
+
find_in_batches do |batch|
|
232
|
+
batch.each do |record|
|
233
|
+
record.volume = record.fleet_years.collect(&:volume).sum
|
234
|
+
record.save!
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
class AutomobileMakeFleetYear < ActiveRecord::Base
|
242
|
+
set_primary_key :row_hash
|
243
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
244
|
+
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
245
|
+
belongs_to :make_year, :class_name => 'AutomobileMakeYear', :foreign_key => 'automobile_make_year_id'
|
246
|
+
|
247
|
+
data_miner do
|
248
|
+
# CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
|
249
|
+
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
|
250
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
|
251
|
+
:select => lambda { |row| row['volume'].to_i > 0 } do |attr|
|
252
|
+
attr.store 'make_name', :field_name => 'manufacturer_name' # prefix
|
253
|
+
attr.store 'year', :field_name => 'year_content'
|
254
|
+
attr.store 'fleet', :chars => 2..3
|
255
|
+
attr.store 'fuel_efficiency', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
256
|
+
attr.store 'volume'
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
260
|
+
|
261
|
+
class AutomobileModelYear < ActiveRecord::Base
|
262
|
+
set_primary_key :year
|
263
|
+
|
264
|
+
has_many :make_years, :class_name => 'AutomobileMakeYear'
|
265
|
+
has_many :variants, :class_name => 'AutomobileVariant'
|
266
|
+
|
267
|
+
data_miner do
|
268
|
+
unique_index 'year'
|
269
|
+
|
270
|
+
# await :other_class => AutomobileMakeYear do |deferred|
|
271
|
+
# # deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => :volume
|
272
|
+
# end
|
273
|
+
end
|
274
|
+
end
|
275
|
+
|
276
|
+
class AutomobileFuelType < ActiveRecord::Base
|
277
|
+
set_primary_key :code
|
278
|
+
|
279
|
+
data_miner do
|
280
|
+
unique_index 'code'
|
281
|
+
|
282
|
+
import(:url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip',
|
283
|
+
:filename => 'Gd6-dsc.txt',
|
284
|
+
:format => :fixed_width,
|
285
|
+
:crop => 21..26, # inclusive
|
286
|
+
:cut => '2-',
|
287
|
+
:select => lambda { |row| /\A[A-Z]/.match row[:code] },
|
288
|
+
:schema => [[ 'code', 2, { :type => :string } ],
|
289
|
+
[ 'spacer', 2 ],
|
290
|
+
[ 'name', 52, { :type => :string } ]]) do |attr|
|
291
|
+
attr.store 'name'
|
292
|
+
end
|
293
|
+
|
294
|
+
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/models_export/automobile_fuel_type.csv' do |attr|
|
295
|
+
attr.store 'name'
|
296
|
+
attr.store 'annual_distance'
|
297
|
+
attr.store 'emission_factor'
|
298
|
+
end
|
299
|
+
|
300
|
+
# pull electricity emission factor from residential electricity
|
301
|
+
import(:url => 'http://spreadsheets.google.com/pub?key=rukxnmuhhsOsrztTrUaFCXQ',
|
302
|
+
:select => lambda { |row| row['code'] == 'El' }) do |attr|
|
303
|
+
attr.store 'name'
|
304
|
+
attr.store 'emission_factor'
|
305
|
+
end
|
306
|
+
|
307
|
+
# still need distance estimate for electric cars
|
308
|
+
end
|
309
|
+
|
310
|
+
CODES = {
|
311
|
+
:electricity => 'El',
|
312
|
+
:diesel => 'D'
|
313
|
+
}
|
314
|
+
end
|
315
|
+
|
316
|
+
class AutomobileModel < ActiveRecord::Base
|
317
|
+
set_primary_key :row_hash
|
318
|
+
|
319
|
+
has_many :variants, :class_name => 'AutomobileVariant'
|
320
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
321
|
+
|
322
|
+
data_miner do
|
323
|
+
# derived from FEG automobile variants
|
324
|
+
end
|
325
|
+
end
|
326
|
+
|
327
|
+
class AutomobileMake < ActiveRecord::Base
|
328
|
+
set_primary_key :name
|
329
|
+
|
330
|
+
has_many :make_years, :class_name => 'AutomobileMakeYear'
|
331
|
+
has_many :models, :class_name => 'AutomobileModel'
|
332
|
+
has_many :fleet_years, :class_name => 'AutomobileMakeFleetYear'
|
333
|
+
has_many :variants, :class_name => 'AutomobileVariant'
|
334
|
+
|
335
|
+
data_miner do
|
336
|
+
unique_index 'name'
|
337
|
+
|
338
|
+
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/makes/make_importance.csv' do |attr|
|
339
|
+
attr.store 'major'
|
340
|
+
end
|
341
|
+
# await :other_class => AutomobileMakeYear do |deferred|
|
342
|
+
# deferred.derive :fuel_efficiency, :weighting_association => :make_years, :weighting_column => 'volume'
|
343
|
+
# end
|
344
|
+
end
|
345
|
+
end
|
346
|
+
|
347
|
+
class AutomobileVariant < ActiveRecord::Base
|
348
|
+
set_primary_key :row_hash
|
349
|
+
|
350
|
+
belongs_to :make, :class_name => 'AutomobileMake', :foreign_key => 'automobile_make_id'
|
351
|
+
belongs_to :model, :class_name => 'AutomobileModel', :foreign_key => 'automobile_model_id'
|
352
|
+
belongs_to :model_year, :class_name => 'AutomobileModelYear', :foreign_key => 'automobile_model_year_id'
|
353
|
+
belongs_to :fuel_type, :class_name => 'AutomobileFuelType', :foreign_key => 'automobile_fuel_type_id'
|
354
|
+
|
355
|
+
data_miner do
|
356
|
+
# 1985---1997
|
357
|
+
(85..97).each do |yy|
|
358
|
+
filename = (yy == 96) ? "#{yy}MFGUI.ASC" : "#{yy}MFGUI.DAT"
|
359
|
+
import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
|
360
|
+
:filename => filename,
|
361
|
+
:transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
|
362
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
|
363
|
+
attr.store 'make_name', :field_name => 'make'
|
364
|
+
attr.store 'model_name', :field_name => 'model'
|
365
|
+
attr.store 'year'
|
366
|
+
attr.store 'fuel_type_code', :field_name => 'fuel_type'
|
367
|
+
attr.store 'raw_fuel_efficiency_highway', :field_name => 'unadj_hwy_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
368
|
+
attr.store 'raw_fuel_efficiency_city', :field_name => 'unadj_city_mpg', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
369
|
+
attr.store 'cylinders', :field_name => 'no_cyc'
|
370
|
+
attr.store 'drive', :field_name => 'drive_system'
|
371
|
+
attr.store 'carline_mfr_code'
|
372
|
+
attr.store 'vi_mfr_code'
|
373
|
+
attr.store 'carline_code'
|
374
|
+
attr.store 'carline_class_code', :field_name => 'carline_clss'
|
375
|
+
attr.store 'transmission'
|
376
|
+
attr.store 'speeds'
|
377
|
+
attr.store 'turbo'
|
378
|
+
attr.store 'supercharger'
|
379
|
+
attr.store 'injection'
|
380
|
+
attr.store 'displacement'
|
381
|
+
end
|
382
|
+
end
|
383
|
+
|
384
|
+
# 1998--2005
|
385
|
+
{
|
386
|
+
1998 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv' },
|
387
|
+
1999 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/99guide.zip', :filename => '99guide6.csv' },
|
388
|
+
2000 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/00data.zip', :filename => 'G6080900.xls' },
|
389
|
+
2001 => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/01guide0918.csv' }, # parseexcel 0.5.2 can't read Excel 5.0 { :url => 'http://www.fueleconomy.gov/FEG/epadata/01data.zip', :filename => '01guide0918.xls' }
|
390
|
+
2002 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/02data.zip', :filename => 'guide_jan28.xls' },
|
391
|
+
2003 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/03data.zip', :filename => 'guide_2003_feb04-03b.csv' },
|
392
|
+
2004 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/04data.zip', :filename => 'gd04-Feb1804-RelDtFeb20.csv' },
|
393
|
+
2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
|
394
|
+
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
395
|
+
import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
|
396
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
|
397
|
+
attr.store 'make_name', :field_name => 'make'
|
398
|
+
attr.store 'model_name', :field_name => 'model'
|
399
|
+
attr.store 'fuel_type_code', :field_name => 'fl'
|
400
|
+
attr.store 'raw_fuel_efficiency_highway', :field_name => 'uhwy', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
401
|
+
attr.store 'raw_fuel_efficiency_city', :field_name => 'ucty', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
402
|
+
attr.store 'cylinders', :field_name => 'cyl'
|
403
|
+
attr.store 'displacement', :field_name => 'displ'
|
404
|
+
attr.store 'carline_class_code', :field_name => 'cls' if year >= 2000
|
405
|
+
attr.store 'carline_class_name', :field_name => 'Class'
|
406
|
+
attr.store 'year'
|
407
|
+
attr.store 'transmission'
|
408
|
+
attr.store 'speeds'
|
409
|
+
attr.store 'turbo'
|
410
|
+
attr.store 'supercharger'
|
411
|
+
attr.store 'injection'
|
412
|
+
attr.store 'drive'
|
413
|
+
end
|
414
|
+
end
|
415
|
+
|
416
|
+
# 2006--2010
|
417
|
+
{
|
418
|
+
2006 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/06data.zip', :filename => '2006_FE_Guide_14-Nov-2005_download.csv' },
|
419
|
+
2007 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/07data.zip', :filename => '2007_FE_guide_ALL_no_sales_May_01_2007.xls' },
|
420
|
+
2008 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv' },
|
421
|
+
2009 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/09data.zip', :filename => '2009_FE_guide for DOE_ALL-rel dates-no-sales-8-28-08download.csv' },
|
422
|
+
# 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
|
423
|
+
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
424
|
+
import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
|
425
|
+
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do |attr|
|
426
|
+
attr.store 'make_name', :field_name => 'make'
|
427
|
+
attr.store 'model_name', :field_name => 'model'
|
428
|
+
attr.store 'fuel_type_code', :field_name => 'FUEL TYPE'
|
429
|
+
attr.store 'raw_fuel_efficiency_highway', :field_name => 'UNRND HWY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
430
|
+
attr.store 'raw_fuel_efficiency_city', :field_name => 'UNRND CITY (EPA)', :from_units => :miles_per_gallon, :to_units => :kilometres_per_litre
|
431
|
+
attr.store 'cylinders', :field_name => 'NUMB CYL'
|
432
|
+
attr.store 'displacement', :field_name => 'DISPLACEMENT'
|
433
|
+
attr.store 'carline_class_code', :field_name => 'CLS'
|
434
|
+
attr.store 'carline_class_name', :field_name => 'CLASS'
|
435
|
+
attr.store 'year'
|
436
|
+
attr.store 'transmission'
|
437
|
+
attr.store 'speeds'
|
438
|
+
attr.store 'turbo'
|
439
|
+
attr.store 'supercharger'
|
440
|
+
attr.store 'injection'
|
441
|
+
attr.store 'drive'
|
442
|
+
end
|
443
|
+
end
|
444
|
+
|
445
|
+
# associate :make, :key => :original_automobile_make_name, :foreign_key => :name
|
446
|
+
# derive :automobile_model_id # creates models by name
|
447
|
+
# associate :model_year, :key => :original_automobile_model_year_year, :foreign_key => :year
|
448
|
+
# associate :fuel_type, :key => :original_automobile_fuel_type_code, :foreign_key => :code
|
449
|
+
process :set_adjusted_fuel_economy
|
450
|
+
end
|
451
|
+
|
452
|
+
# Builds a short, space-separated description of this variant from whichever
# of its attributes are set: cylinder count, displacement, turbo, fuel
# injection, number of speeds, transmission and fuel type name.
def name
  [
    ("V#{cylinders}" if cylinders),
    ("#{displacement}L" if displacement),
    ("turbo" if turbo),
    ("FI" if injection),
    ("#{speeds}spd" if speeds.present?),
    (transmission if transmission.present?),
    ("(#{fuel_type.name})" if fuel_type)
  ].compact.join(' ') # unset parts evaluate to nil and are dropped here
end
|
463
|
+
|
464
|
+
# Renders the city and highway fuel efficiency as a "city/highway" string,
# each value converted from kilometres per litre to miles per gallon and
# rounded to a whole number.
def fuel_economy_description
  city_and_highway = [ fuel_efficiency_city, fuel_efficiency_highway ]
  rounded_mpg = city_and_highway.map do |efficiency|
    efficiency.kilometres_per_litre.to(:miles_per_gallon).round
  end
  rounded_mpg.join('/')
end
|
467
|
+
|
468
|
+
class << self
  # Recomputes fuel_efficiency_city and fuel_efficiency_highway for every row
  # from the raw_* columns, using one SQL UPDATE per column.
  # NOTE(review): 0.425143707 looks like the miles-per-gallon to
  # kilometres-per-litre factor and the other constants like adjustment
  # coefficients -- confirm against the data source before changing them.
  def set_adjusted_fuel_economy
    update_all 'fuel_efficiency_city = 1 / ((0.003259 / 0.425143707) + (1.1805 / raw_fuel_efficiency_city))'
    update_all 'fuel_efficiency_highway = 1 / ((0.001376 / 0.425143707) + (1.3466 / raw_fuel_efficiency_highway))'
  end

  # the following matching methods are needed by the errata
  # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
  #
  # Each matcher takes one row (anything that answers [] with a column name)
  # and returns a truthy/falsy value. Missing (nil) cells never raise: the
  # equality-based matchers coerce the cell with to_s first, so they behave
  # like the regexp-based ones, which simply fail to match on nil.

  # True when the row's 'transmission' cell is nil or blank.
  def transmission_is_blank?(row)
    row['transmission'].blank?
  end

  # True when the row is for year 2007 and its 'MFR' cell is GMC or CHEVROLET
  # (case-insensitive).
  def is_a_2007_gmc_or_chevrolet?(row)
    row['year'] == 2007 && %w(GMC CHEVROLET).include?(row['MFR'].to_s.upcase)
  end

  # True when the row's make is Porsche (case-insensitive).
  def is_a_porsche?(row)
    row['make'].to_s.upcase == 'PORSCHE'
  end

  # Negation of is_a_porsche?.
  def is_not_a_porsche?(row)
    !is_a_porsche?(row)
  end

  # Truthy when the row's make contains "MERCEDES" (case-insensitive).
  def is_a_mercedes_benz?(row)
    row['make'] =~ /MERCEDES/i
  end

  # True when the row's make is Lexus (case-insensitive).
  def is_a_lexus?(row)
    row['make'].to_s.upcase == 'LEXUS'
  end

  # True when the row's make is BMW (case-insensitive).
  def is_a_bmw?(row)
    row['make'].to_s.upcase == 'BMW'
  end

  # True when the row's make is Ford (case-insensitive).
  def is_a_ford?(row)
    row['make'].to_s.upcase == 'FORD'
  end

  # Truthy when the make matches Rolls-Royce and the model mentions Bentley.
  def is_a_rolls_royce_and_model_contains_bentley?(row)
    is_a_rolls_royce?(row) && model_contains_bentley?(row)
  end

  # True when the row's make is Bentley (case-insensitive).
  def is_a_bentley?(row)
    row['make'].to_s.upcase == 'BENTLEY'
  end

  # Truthy when the row's make contains "ROLLS" (case-insensitive).
  def is_a_rolls_royce?(row)
    row['make'] =~ /ROLLS/i
  end

  # Truthy when the row's model contains "TURBO R/RL BKLDS" (case-insensitive).
  def is_a_turbo_brooklands?(row)
    row['model'] =~ /TURBO R\/RL BKLDS/i
  end

  # Truthy when the row's model contains "MAYBACH" (case-insensitive).
  def model_contains_maybach?(row)
    row['model'] =~ /MAYBACH/i
  end

  # Truthy when the row's model contains "BENTLEY" (case-insensitive).
  def model_contains_bentley?(row)
    row['model'] =~ /BENTLEY/i
  end
end
|
20
533
|
end
|
21
534
|
|
22
535
|
# Test model for countries; rows are identified by their iso_3166 code.
class Country < ActiveRecord::Base
  set_primary_key :iso_3166

  data_miner do
    unique_index 'iso_3166'

    # get a complete list (semicolon-delimited, no header row, read by
    # column position)
    import(:url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
           :skip => 2,
           :headers => false,
           :delimiter => ';') do |column|
      column.store 'iso_3166', :field_number => 1
      column.store 'name', :field_number => 0
    end

    # get nicer names (CSV with a header row, read by column name)
    import(:url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv') do |column|
      column.store 'iso_3166', :field_name => 'country code'
      column.store 'name', :field_name => 'country'
    end
  end
end
|
32
554
|
|
33
555
|
# Test model for airports; rows are identified by their iata_code.
class Airport < ActiveRecord::Base
  set_primary_key :iata_code
  belongs_to :country

  data_miner do
    unique_index 'iata_code'

    # Only keep rows whose column 4 (stored below as iata_code) is filled in.
    has_iata_code = lambda { |row| row[4].present? }

    # import airport iata_code, name, etc.
    import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat',
           :headers => false,
           :select => has_iata_code) do |column|
      column.store 'name', :field_number => 1
      column.store 'city', :field_number => 2
      column.store 'country_name', :field_number => 3
      column.store 'iata_code', :field_number => 4
      column.store 'latitude', :field_number => 6
      column.store 'longitude', :field_number => 7
    end
  end
end
|
573
|
+
|
574
|
+
# Test model for census regions; rows are identified by their region number.
# The same file is imported twice on purpose so a run exercises two import
# steps writing to the same rows.
class CensusRegion < ActiveRecord::Base
  set_primary_key :number

  data_miner do
    unique_index 'number'

    # Keep only rows with a positive 'Region' and an 'X' in 'Division'.
    import(:url => 'http://www.census.gov/popest/geographic/codes02.csv',
           :skip => 9,
           :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'}) do |column|
      column.store 'name', :field_name => 'Name'
      column.store 'number', :field_name => 'Region'
    end

    # pretend this is a different data source
    import(:url => 'http://www.census.gov/popest/geographic/codes02.csv',
           :skip => 9,
           :select => lambda { |row| row['Region'].to_i > 0 and row['Division'].to_s.strip == 'X'}) do |column|
      column.store 'name', :field_name => 'Name'
      column.store 'number', :field_name => 'Region'
    end
  end
end
|
@@ -49,30 +593,82 @@ end
|
|
49
593
|
# Registers every mined test model with DataMiner, in this order.
DataMiner.enqueue do |queue|
  [
    Country,
    Airport,
    CensusRegion,
    AutomobileFuelType,      # OK
    AutomobileModel,         # OK
    AutomobileMake,          # OK
    AutomobileModelYear,     # OK
    AutomobileVariant,       # OK
    AutomobileMakeFleetYear, # OK; third-party data not yet hosted on third-party site
    AutomobileMakeYear       # OK
  ].each { |mined_class| queue << mined_class }
end
|
53
605
|
|
54
|
-
class DataMinerTest < Test::Unit::TestCase
|
55
|
-
|
56
|
-
|
57
|
-
Country.
|
606
|
+
# Integration tests for data_miner. They run the configurations declared on
# the models in this file, so they need the test database and network access
# to the configured source URLs.
class DataMinerTest < Test::Unit::TestCase
  should "be idempotent" do
    # Running the same configuration twice must not change the row count.
    [Country, CensusRegion].each do |mined_class|
      mined_class.data_miner_config.run
      count_after_first_run = mined_class.count
      mined_class.data_miner_config.run
      count_after_second_run = mined_class.count
      assert_equal count_after_first_run, count_after_second_run
    end
  end

  should "assume that no unique indices means it wants a big hash" do
    # Declaring the configuration is itself expected to raise, so the class
    # is defined inside the assert_raises block.
    assert_raises DataMiner::MissingHashColumn do
      class IncompleteCountry < ActiveRecord::Base
        set_table_name 'countries'

        data_miner do
          # no unique index

          # get a complete list
          import :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do |attr|
            attr.store 'iso_3166', :field_number => 1
            attr.store 'name', :field_number => 0
          end

          # get nicer names
          import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
            attr.store 'iso_3166', :field_name => 'country code'
            attr.store 'name', :field_name => 'country'
          end
        end
      end
    end
  end

  should "hash things if no unique index is listed" do
    # Only the first runnable is executed here.
    AutomobileVariant.data_miner_config.runnables[0].run
    assert AutomobileVariant.first.row_hash.present?
  end

  # should "mine multiple classes in the correct order" do
  #   DataMiner.run :class_names => DataMiner.classes.map(&:class_name)
  #   uy = Country.find_by_iso_3166('UY')
  #   assert_equal 'Uruguay', uy.name
  # end

  should "have a target record for every class that is mined" do
    DataMiner.run :class_names => %w{ Country }
    assert DataMiner::Target.exists?(:name => 'Country')
    # Same casing as the line above: a lowercase 'country' only matches on
    # databases whose string comparison is case-insensitive.
    assert_equal 1, DataMiner::Target.count(:conditions => {:name => 'Country'})
  end

  should "keep a log when it does a run" do
    approx_started_at = Time.now
    DataMiner.run :class_names => %w{ Country }
    approx_ended_at = Time.now
    target = DataMiner::Target.find_by_name('Country')
    last_run = target.runs.last
    # Explicit parentheses avoid Ruby's "interpreted as grouped expression"
    # warning for `assert (...)`.
    assert((last_run.started_at - approx_started_at).abs < 5) # seconds
    assert((last_run.ended_at - approx_ended_at).abs < 5) # seconds
  end

  should "remove rows that have disappeared from the external data source" do
    # Placeholder: fails on purpose until the feature exists.
    flunk "not implemented yet"
  end
end
|