data_miner 0.4.24 → 0.4.25

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -10,15 +10,15 @@ begin
10
10
  gem.email = "seamus@abshere.net"
11
11
  gem.homepage = "http://github.com/seamusabshere/data_miner"
12
12
  gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
13
- gem.add_dependency 'remote_table', '>=0.2.19'
13
+ gem.add_dependency 'remote_table', '>=0.2.20'
14
14
  gem.add_dependency 'activerecord', '>=2.3.4'
15
15
  gem.add_dependency 'activesupport', '>=2.3.4'
16
16
  gem.add_dependency 'andand', '>=1.3.1'
17
- gem.add_dependency 'errata', '>=0.1.7'
18
17
  gem.add_dependency 'conversions', '>=1.4.4'
19
18
  gem.add_dependency 'blockenspiel', '>=0.3.2'
20
19
  gem.add_dependency 'log4r', '>=1.1.7'
21
- gem.add_development_dependency "loose_tight_dictionary", ">=0.0.3"
20
+ gem.add_development_dependency 'errata', '>=0.2.1'
21
+ gem.add_development_dependency "loose_tight_dictionary", ">=0.0.5"
22
22
  gem.require_path = "lib"
23
23
  gem.files.include %w(lib/data_miner) unless gem.files.empty? # seems to fail once it's in the wild
24
24
  gem.rdoc_options << '--line-numbers' << '--inline-source'
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.24
1
+ 0.4.25
data/data_miner.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{data_miner}
8
- s.version = "0.4.24"
8
+ s.version = "0.4.25"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
- s.date = %q{2010-04-29}
12
+ s.date = %q{2010-05-06}
13
13
  s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
14
14
  s.email = %q{seamus@abshere.net}
15
15
  s.extra_rdoc_files = [
@@ -52,36 +52,36 @@ Gem::Specification.new do |s|
52
52
  s.specification_version = 3
53
53
 
54
54
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
55
- s.add_runtime_dependency(%q<remote_table>, [">= 0.2.19"])
55
+ s.add_runtime_dependency(%q<remote_table>, [">= 0.2.20"])
56
56
  s.add_runtime_dependency(%q<activerecord>, [">= 2.3.4"])
57
57
  s.add_runtime_dependency(%q<activesupport>, [">= 2.3.4"])
58
58
  s.add_runtime_dependency(%q<andand>, [">= 1.3.1"])
59
- s.add_runtime_dependency(%q<errata>, [">= 0.1.7"])
60
59
  s.add_runtime_dependency(%q<conversions>, [">= 1.4.4"])
61
60
  s.add_runtime_dependency(%q<blockenspiel>, [">= 0.3.2"])
62
61
  s.add_runtime_dependency(%q<log4r>, [">= 1.1.7"])
63
- s.add_development_dependency(%q<loose_tight_dictionary>, [">= 0.0.3"])
62
+ s.add_development_dependency(%q<errata>, [">= 0.2.1"])
63
+ s.add_development_dependency(%q<loose_tight_dictionary>, [">= 0.0.5"])
64
64
  else
65
- s.add_dependency(%q<remote_table>, [">= 0.2.19"])
65
+ s.add_dependency(%q<remote_table>, [">= 0.2.20"])
66
66
  s.add_dependency(%q<activerecord>, [">= 2.3.4"])
67
67
  s.add_dependency(%q<activesupport>, [">= 2.3.4"])
68
68
  s.add_dependency(%q<andand>, [">= 1.3.1"])
69
- s.add_dependency(%q<errata>, [">= 0.1.7"])
70
69
  s.add_dependency(%q<conversions>, [">= 1.4.4"])
71
70
  s.add_dependency(%q<blockenspiel>, [">= 0.3.2"])
72
71
  s.add_dependency(%q<log4r>, [">= 1.1.7"])
73
- s.add_dependency(%q<loose_tight_dictionary>, [">= 0.0.3"])
72
+ s.add_dependency(%q<errata>, [">= 0.2.1"])
73
+ s.add_dependency(%q<loose_tight_dictionary>, [">= 0.0.5"])
74
74
  end
75
75
  else
76
- s.add_dependency(%q<remote_table>, [">= 0.2.19"])
76
+ s.add_dependency(%q<remote_table>, [">= 0.2.20"])
77
77
  s.add_dependency(%q<activerecord>, [">= 2.3.4"])
78
78
  s.add_dependency(%q<activesupport>, [">= 2.3.4"])
79
79
  s.add_dependency(%q<andand>, [">= 1.3.1"])
80
- s.add_dependency(%q<errata>, [">= 0.1.7"])
81
80
  s.add_dependency(%q<conversions>, [">= 1.4.4"])
82
81
  s.add_dependency(%q<blockenspiel>, [">= 0.3.2"])
83
82
  s.add_dependency(%q<log4r>, [">= 1.1.7"])
84
- s.add_dependency(%q<loose_tight_dictionary>, [">= 0.0.3"])
83
+ s.add_dependency(%q<errata>, [">= 0.2.1"])
84
+ s.add_dependency(%q<loose_tight_dictionary>, [">= 0.0.5"])
85
85
  end
86
86
  end
87
87
 
data/lib/data_miner.rb CHANGED
@@ -12,7 +12,6 @@ require 'active_record'
12
12
  require 'blockenspiel'
13
13
  require 'conversions'
14
14
  require 'remote_table'
15
- require 'errata'
16
15
  require 'andand'
17
16
  require 'log4r'
18
17
  require 'fileutils'
@@ -74,7 +74,7 @@ module DataMiner
74
74
  end
75
75
 
76
76
  def match_row(row)
77
- matcher.lookup row
77
+ matcher.match row
78
78
  end
79
79
 
80
80
  def value_from_row(row)
@@ -219,7 +219,7 @@ module DataMiner
219
219
  @_dictionary ||= Dictionary.new options[:dictionary]
220
220
  end
221
221
  def matcher
222
- @_matcher ||= options[:matcher].new
222
+ @_matcher ||= (options[:matcher].is_a?(String) ? options[:matcher].constantize.new : options[:matcher])
223
223
  end
224
224
  end
225
225
  end
@@ -3,20 +3,23 @@ module DataMiner
3
3
  include Blockenspiel::DSL
4
4
 
5
5
  attr_reader :attributes
6
- attr_accessor :configuration, :position_in_run, :options, :table, :errata
6
+ attr_accessor :configuration, :position_in_run, :table
7
7
  attr_accessor :description
8
8
  delegate :resource, :to => :configuration
9
9
 
10
- def initialize(configuration, position_in_run, description, options = {})
11
- options.symbolize_keys!
12
- @options = options
10
+ def initialize(configuration, position_in_run, description, table_options = {})
11
+ table_options.symbolize_keys!
13
12
 
14
13
  @attributes = ActiveSupport::OrderedHash.new
15
14
  @configuration = configuration
16
15
  @position_in_run = position_in_run
17
16
  @description = description
18
- @errata = Errata.new(:url => options[:errata], :klass => resource) if options[:errata]
19
- @table = RemoteTable.new options
17
+ if table_options[:table].present?
18
+ DataMiner.log_or_raise "You should specify :table or :url, but not both" if table_options[:url].present?
19
+ @table = table_options[:table]
20
+ else
21
+ @table = RemoteTable.new table_options
22
+ end
20
23
  end
21
24
 
22
25
  def inspect
@@ -46,11 +49,6 @@ module DataMiner
46
49
  test_counter = 0
47
50
 
48
51
  table.each_row do |row|
49
- if errata
50
- next if errata.rejects?(row)
51
- errata.correct!(row)
52
- end
53
-
54
52
  if ENV['DUMP'] == 'true'
55
53
  raise "[data_miner gem] Stopping after 5 rows because TEST=true" if test_counter > 5
56
54
  test_counter += 1
@@ -220,6 +220,70 @@ class AutomobileVariant < ActiveRecord::Base
220
220
  end
221
221
  end
222
222
 
223
+ class Guru
224
+ # the following matching methods are needed by the errata
225
+ # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
226
+
227
+ def transmission_is_blank?(row)
228
+ row['transmission'].blank?
229
+ end
230
+
231
+ def is_a_2007_gmc_or_chevrolet?(row)
232
+ row['year'] == 2007 and %w(GMC CHEVROLET).include? row['MFR'].upcase
233
+ end
234
+
235
+ def is_a_porsche?(row)
236
+ row['make'].upcase == 'PORSCHE'
237
+ end
238
+
239
+ def is_not_a_porsche?(row)
240
+ !is_a_porsche? row
241
+ end
242
+
243
+ def is_a_mercedes_benz?(row)
244
+ row['make'] =~ /MERCEDES/i
245
+ end
246
+
247
+ def is_a_lexus?(row)
248
+ row['make'].upcase == 'LEXUS'
249
+ end
250
+
251
+ def is_a_bmw?(row)
252
+ row['make'].upcase == 'BMW'
253
+ end
254
+
255
+ def is_a_ford?(row)
256
+ row['make'].upcase == 'FORD'
257
+ end
258
+
259
+ def is_a_rolls_royce_and_model_contains_bentley?(row)
260
+ is_a_rolls_royce?(row) and model_contains_bentley?(row)
261
+ end
262
+
263
+ def is_a_bentley?(row)
264
+ row['make'].upcase == 'BENTLEY'
265
+ end
266
+
267
+ def is_a_rolls_royce?(row)
268
+ row['make'] =~ /ROLLS/i
269
+ end
270
+
271
+ def is_a_turbo_brooklands?(row)
272
+ row['model'] =~ /TURBO R\/RL BKLDS/i
273
+ end
274
+
275
+ def model_contains_maybach?(row)
276
+ row['model'] =~ /MAYBACH/i
277
+ end
278
+
279
+ def model_contains_bentley?(row)
280
+ row['model'] =~ /BENTLEY/i
281
+ end
282
+ end
283
+
284
+ errata = Errata.new :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv',
285
+ :responder => AutomobileVariant::Guru.new
286
+
223
287
  data_miner do
224
288
  # 1985---1997
225
289
  (85..97).each do |yy|
@@ -227,7 +291,7 @@ class AutomobileVariant < ActiveRecord::Base
227
291
  import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
228
292
  :filename => filename,
229
293
  :transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
230
- :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
294
+ :errata => errata) do
231
295
  key 'row_hash'
232
296
  store 'make_name', :field_name => 'make'
233
297
  store 'model_name', :field_name => 'model'
@@ -264,7 +328,7 @@ class AutomobileVariant < ActiveRecord::Base
264
328
  2005 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/05data.zip', :filename => 'guide2005-2004oct15.csv' }
265
329
  }.sort { |a, b| a.first <=> b.first }.each do |year, options|
266
330
  import options.merge(:transform => { :class => FuelEconomyGuide::ParserC, :year => year },
267
- :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
331
+ :errata => errata) do
268
332
  key 'row_hash'
269
333
  store 'make_name', :field_name => 'make'
270
334
  store 'model_name', :field_name => 'model'
@@ -296,7 +360,7 @@ class AutomobileVariant < ActiveRecord::Base
296
360
  # 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
297
361
  }.sort { |a, b| a.first <=> b.first }.each do |year, options|
298
362
  import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
299
- :errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv') do
363
+ :errata => errata) do
300
364
  key 'row_hash'
301
365
  store 'make_name', :field_name => 'make'
302
366
  store 'model_name', :field_name => 'model'
@@ -344,67 +408,6 @@ class AutomobileVariant < ActiveRecord::Base
344
408
  def fuel_economy_description
345
409
  [ fuel_efficiency_city, fuel_efficiency_highway ].map { |f| f.kilometres_per_litre.to(:miles_per_gallon).round }.join('/')
346
410
  end
347
-
348
- class << self
349
- # the following matching methods are needed by the errata
350
- # per https://brighterplanet.sifterapp.com/projects/30/issues/750/comments
351
-
352
- def transmission_is_blank?(row)
353
- row['transmission'].blank?
354
- end
355
-
356
- def is_a_2007_gmc_or_chevrolet?(row)
357
- row['year'] == 2007 and %w(GMC CHEVROLET).include? row['MFR'].upcase
358
- end
359
-
360
- def is_a_porsche?(row)
361
- row['make'].upcase == 'PORSCHE'
362
- end
363
-
364
- def is_not_a_porsche?(row)
365
- !is_a_porsche? row
366
- end
367
-
368
- def is_a_mercedes_benz?(row)
369
- row['make'] =~ /MERCEDES/i
370
- end
371
-
372
- def is_a_lexus?(row)
373
- row['make'].upcase == 'LEXUS'
374
- end
375
-
376
- def is_a_bmw?(row)
377
- row['make'].upcase == 'BMW'
378
- end
379
-
380
- def is_a_ford?(row)
381
- row['make'].upcase == 'FORD'
382
- end
383
-
384
- def is_a_rolls_royce_and_model_contains_bentley?(row)
385
- is_a_rolls_royce?(row) and model_contains_bentley?(row)
386
- end
387
-
388
- def is_a_bentley?(row)
389
- row['make'].upcase == 'BENTLEY'
390
- end
391
-
392
- def is_a_rolls_royce?(row)
393
- row['make'] =~ /ROLLS/i
394
- end
395
-
396
- def is_a_turbo_brooklands?(row)
397
- row['model'] =~ /TURBO R\/RL BKLDS/i
398
- end
399
-
400
- def model_contains_maybach?(row)
401
- row['model'] =~ /MAYBACH/i
402
- end
403
-
404
- def model_contains_bentley?(row)
405
- row['model'] =~ /BENTLEY/i
406
- end
407
- end
408
411
  end
409
412
 
410
413
  class Country < ActiveRecord::Base
@@ -954,20 +957,20 @@ class Aircraft < ActiveRecord::Base
954
957
  end
955
958
 
956
959
  class BtsAircraftTypeCodeMatcher
957
- def lookup(left_record)
960
+ def match(left_record)
958
961
  right_record = Aircraft.bts_dictionary.left_to_right left_record
959
962
  right_record['Aircraft Type'] if right_record
960
963
  end
961
964
  end
962
965
 
963
966
  class BtsNameMatcher
964
- def lookup(left_record)
967
+ def match(left_record)
965
968
  right_record = Aircraft.bts_dictionary.left_to_right left_record
966
969
  right_record['Manufacturer'] + ' ' + right_record['Long Name'] if right_record
967
970
  end
968
971
  end
969
972
 
970
- class << self
973
+ class Guru
971
974
  # for errata
972
975
  def is_not_attributed_to_aerospatiale?(row)
973
976
  not row['Manufacturer'] =~ /AEROSPATIALE/i
@@ -1005,12 +1008,13 @@ class Aircraft < ActiveRecord::Base
1005
1008
  import("ICAO codes starting with letter #{letter} used by the FAA",
1006
1009
  :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
1007
1010
  :encoding => 'US-ASCII',
1011
+ :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
1012
+ :responder => Aircraft::Guru.new),
1008
1013
  :row_xpath => '//table/tr[2]/td/table/tr',
1009
- :errata => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
1010
1014
  :column_xpath => 'td') do
1011
1015
  key 'icao_code', :field_name => 'Designator'
1012
- store 'bts_name', :matcher => Aircraft::BtsNameMatcher
1013
- store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher
1016
+ store 'bts_name', :matcher => Aircraft::BtsNameMatcher.new
1017
+ store 'bts_aircraft_type_code', :matcher => Aircraft::BtsAircraftTypeCodeMatcher.new
1014
1018
  store 'manufacturer_name', :field_name => 'Manufacturer'
1015
1019
  store 'name', :field_name => 'Model'
1016
1020
  end
@@ -1020,14 +1024,10 @@ end
1020
1024
 
1021
1025
  # todo: have somebody properly organize these
1022
1026
  class DataMinerTest < Test::Unit::TestCase
1023
- if ENV['NEW'] == 'true'
1024
- should "mine aircraft" do
1025
- Aircraft.run_data_miner!
1026
- assert Aircraft.exists? :icao_code => 'DC91', :bts_aircraft_type_code => '630'
1027
- end
1027
+ if ENV['ALL'] == 'true' or ENV['NEW'] == 'true'
1028
1028
  end
1029
1029
 
1030
- if ENV['FAST'] == 'true'
1030
+ if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
1031
1031
  should "keep a call stack so that you can call run_data_miner! on a child" do
1032
1032
  CrosscallingCensusDivision.run_data_miner!
1033
1033
  assert CrosscallingCensusDivision.exists? :name => 'Mountain Division', :number => 8, :census_region_number => 4, :census_region_name => 'West Region'
@@ -1139,7 +1139,12 @@ class DataMinerTest < Test::Unit::TestCase
1139
1139
  end
1140
1140
  end
1141
1141
 
1142
- if ENV['SLOW'] == 'true'
1142
+ if ENV['ALL'] == 'true' or ENV['SLOW'] == 'true'
1143
+ should "mine aircraft" do
1144
+ Aircraft.run_data_miner!
1145
+ assert Aircraft.exists? :icao_code => 'DC91', :bts_aircraft_type_code => '630'
1146
+ end
1147
+
1143
1148
  should "mine automobile variants" do
1144
1149
  AutomobileVariant.run_data_miner!
1145
1150
  assert AutomobileVariant.count('make_name LIKE "%tesla"') > 0
@@ -1150,7 +1155,7 @@ class DataMinerTest < Test::Unit::TestCase
1150
1155
  assert T100FlightSegment.count('dest_country_name LIKE "%United States"') > 0
1151
1156
  end
1152
1157
 
1153
- should "mine residence survey day" do
1158
+ should "mine residence survey responses" do
1154
1159
  ResidentialEnergyConsumptionSurveyResponse.run_data_miner!
1155
1160
  assert ResidentialEnergyConsumptionSurveyResponse.find(6).residence_class.starts_with?('Single-family detached house')
1156
1161
  end
data/test/test_helper.rb CHANGED
@@ -3,6 +3,8 @@ require 'test/unit'
3
3
  require 'shoulda'
4
4
  require 'ruby-debug'
5
5
 
6
+ require 'errata'
7
+
6
8
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
7
9
  $LOAD_PATH.unshift(File.dirname(__FILE__))
8
10
  require 'data_miner'
@@ -11,7 +13,7 @@ ActiveRecord::Base.establish_connection(
11
13
  'adapter' => 'mysql',
12
14
  'database' => 'data_miner_test',
13
15
  'username' => 'root',
14
- 'password' => ''
16
+ 'password' => 'password'
15
17
  )
16
18
 
17
19
  ActiveSupport::Inflector.inflections do |inflect|
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 4
8
- - 24
9
- version: 0.4.24
8
+ - 25
9
+ version: 0.4.25
10
10
  platform: ruby
11
11
  authors:
12
12
  - Seamus Abshere
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-04-29 00:00:00 -04:00
18
+ date: 2010-05-06 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -28,8 +28,8 @@ dependencies:
28
28
  segments:
29
29
  - 0
30
30
  - 2
31
- - 19
32
- version: 0.2.19
31
+ - 20
32
+ version: 0.2.20
33
33
  type: :runtime
34
34
  version_requirements: *id001
35
35
  - !ruby/object:Gem::Dependency
@@ -74,24 +74,10 @@ dependencies:
74
74
  version: 1.3.1
75
75
  type: :runtime
76
76
  version_requirements: *id004
77
- - !ruby/object:Gem::Dependency
78
- name: errata
79
- prerelease: false
80
- requirement: &id005 !ruby/object:Gem::Requirement
81
- requirements:
82
- - - ">="
83
- - !ruby/object:Gem::Version
84
- segments:
85
- - 0
86
- - 1
87
- - 7
88
- version: 0.1.7
89
- type: :runtime
90
- version_requirements: *id005
91
77
  - !ruby/object:Gem::Dependency
92
78
  name: conversions
93
79
  prerelease: false
94
- requirement: &id006 !ruby/object:Gem::Requirement
80
+ requirement: &id005 !ruby/object:Gem::Requirement
95
81
  requirements:
96
82
  - - ">="
97
83
  - !ruby/object:Gem::Version
@@ -101,11 +87,11 @@ dependencies:
101
87
  - 4
102
88
  version: 1.4.4
103
89
  type: :runtime
104
- version_requirements: *id006
90
+ version_requirements: *id005
105
91
  - !ruby/object:Gem::Dependency
106
92
  name: blockenspiel
107
93
  prerelease: false
108
- requirement: &id007 !ruby/object:Gem::Requirement
94
+ requirement: &id006 !ruby/object:Gem::Requirement
109
95
  requirements:
110
96
  - - ">="
111
97
  - !ruby/object:Gem::Version
@@ -115,11 +101,11 @@ dependencies:
115
101
  - 2
116
102
  version: 0.3.2
117
103
  type: :runtime
118
- version_requirements: *id007
104
+ version_requirements: *id006
119
105
  - !ruby/object:Gem::Dependency
120
106
  name: log4r
121
107
  prerelease: false
122
- requirement: &id008 !ruby/object:Gem::Requirement
108
+ requirement: &id007 !ruby/object:Gem::Requirement
123
109
  requirements:
124
110
  - - ">="
125
111
  - !ruby/object:Gem::Version
@@ -129,6 +115,20 @@ dependencies:
129
115
  - 7
130
116
  version: 1.1.7
131
117
  type: :runtime
118
+ version_requirements: *id007
119
+ - !ruby/object:Gem::Dependency
120
+ name: errata
121
+ prerelease: false
122
+ requirement: &id008 !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ segments:
127
+ - 0
128
+ - 2
129
+ - 1
130
+ version: 0.2.1
131
+ type: :development
132
132
  version_requirements: *id008
133
133
  - !ruby/object:Gem::Dependency
134
134
  name: loose_tight_dictionary
@@ -140,8 +140,8 @@ dependencies:
140
140
  segments:
141
141
  - 0
142
142
  - 0
143
- - 3
144
- version: 0.0.3
143
+ - 5
144
+ version: 0.0.5
145
145
  type: :development
146
146
  version_requirements: *id009
147
147
  description: Mine remote data into your ActiveRecord models. You can also perform associations and convert units.