data_miner 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/data_miner.gemspec +6 -2
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +1 -7
- data/test/support/aircraft.rb +5 -2
- data/test/support/automobile_variant.rb +60 -66
- data/test/test_old_syntax.rb +3 -3
- metadata +6 -6
data/CHANGELOG
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
1.1.0
|
2
|
+
* fixed dependency issues
|
3
|
+
1.0.0
|
4
|
+
* bundler and gemspec instead of jeweler
|
5
|
+
* clear up memory leaks and destructive argument borking
|
6
|
+
* mostly backwards compatible (but no add_hints! in remote_table transforms, for example)
|
1
7
|
0.2.6
|
2
8
|
* Upgrade to remote_table 0.1.6 to handle UTF-8 CSVs and long urls.
|
3
9
|
0.3.0
|
data/data_miner.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
|
22
|
-
s.add_dependency 'remote_table', '>=1.0.3'
|
22
|
+
s.add_dependency 'remote_table', '>=1.1.0'
|
23
23
|
s.add_dependency 'escape', '>=0.0.4'
|
24
24
|
s.add_dependency 'activerecord', '>=2.3.4'
|
25
25
|
s.add_dependency 'activesupport', '>=2.3.4'
|
@@ -31,5 +31,9 @@ Gem::Specification.new do |s|
|
|
31
31
|
s.add_development_dependency 'test-unit'
|
32
32
|
s.add_development_dependency 'shoulda'
|
33
33
|
s.add_development_dependency 'mysql'
|
34
|
-
|
34
|
+
if RUBY_VERSION >= '1.9'
|
35
|
+
s.add_development_dependency 'ruby-debug19'
|
36
|
+
else
|
37
|
+
s.add_development_dependency 'ruby-debug'
|
38
|
+
end
|
35
39
|
end
|
data/lib/data_miner/version.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -1,15 +1,9 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler'
|
3
|
-
unless RUBY_VERSION >= '1.9'
|
4
|
-
gem 'fastercsv'
|
5
|
-
require 'fastercsv'
|
6
|
-
end
|
7
3
|
Bundler.setup
|
8
4
|
require 'test/unit'
|
9
5
|
require 'shoulda'
|
10
|
-
|
11
|
-
require 'ruby-debug'
|
12
|
-
end
|
6
|
+
require 'ruby-debug'
|
13
7
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
14
8
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
15
9
|
require 'data_miner'
|
data/test/support/aircraft.rb
CHANGED
@@ -33,6 +33,10 @@ class Aircraft < ActiveRecord::Base
|
|
33
33
|
row['Manufacturer'] =~ /BOEING/i
|
34
34
|
end
|
35
35
|
|
36
|
+
def is_not_attributed_to_airbus?(row)
|
37
|
+
row['Manufacturer'] =~ /AIRBUS/i
|
38
|
+
end
|
39
|
+
|
36
40
|
def is_attributed_to_cessna?(row)
|
37
41
|
row['Manufacturer'] =~ /CESSNA/i
|
38
42
|
end
|
@@ -77,8 +81,7 @@ class Aircraft < ActiveRecord::Base
|
|
77
81
|
import("ICAO codes starting with letter #{letter} used by the FAA",
|
78
82
|
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
79
83
|
:encoding => 'US-ASCII',
|
80
|
-
:errata =>
|
81
|
-
:responder => Aircraft::Guru.new),
|
84
|
+
:errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => 'Aircraft::Guru' },
|
82
85
|
:row_xpath => '//table/tr[2]/td/table/tr',
|
83
86
|
:column_xpath => 'td') do
|
84
87
|
key 'icao_code', :field_name => 'Designator'
|
@@ -42,9 +42,58 @@ class AutomobileVariant < ActiveRecord::Base
|
|
42
42
|
}
|
43
43
|
|
44
44
|
class ParserB
|
45
|
+
require 'slither'
|
46
|
+
::Slither.define :fuel_economy_guide_b do |d|
|
47
|
+
d.rows do |row|
|
48
|
+
row.trap { true } # there's only one section
|
49
|
+
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
50
|
+
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
51
|
+
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
52
|
+
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
53
|
+
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
54
|
+
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
55
|
+
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
56
|
+
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
57
|
+
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
58
|
+
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
59
|
+
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
60
|
+
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
61
|
+
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
62
|
+
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
63
|
+
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
64
|
+
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
65
|
+
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
66
|
+
row.spacer 2
|
67
|
+
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
68
|
+
row.spacer 2
|
69
|
+
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
70
|
+
row.spacer 2
|
71
|
+
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
72
|
+
row.spacer 2
|
73
|
+
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
74
|
+
row.spacer 2
|
75
|
+
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
76
|
+
row.spacer 2
|
77
|
+
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
78
|
+
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
79
|
+
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
80
|
+
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
81
|
+
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
82
|
+
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
83
|
+
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
84
|
+
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
85
|
+
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
86
|
+
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
87
|
+
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
88
|
+
row.column 'filler' , 1, :type => :string # NOT USED
|
89
|
+
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
90
|
+
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
91
|
+
end
|
92
|
+
end
|
45
93
|
attr_accessor :year
|
46
94
|
def initialize(options = {})
|
47
|
-
|
95
|
+
options = options.stringify_keys
|
96
|
+
@year = options['year']
|
48
97
|
end
|
49
98
|
|
50
99
|
def apply(row)
|
@@ -71,68 +120,12 @@ class AutomobileVariant < ActiveRecord::Base
|
|
71
120
|
end
|
72
121
|
end
|
73
122
|
|
74
|
-
def add_hints!(bus)
|
75
|
-
bus[:format] = :fixed_width
|
76
|
-
bus[:cut] = '13-' if year == 1995
|
77
|
-
bus[:schema_name] = :fuel_economy_guide_b
|
78
|
-
bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
|
79
|
-
Slither.define :fuel_economy_guide_b do |d|
|
80
|
-
d.rows do |row|
|
81
|
-
row.trap { true } # there's only one section
|
82
|
-
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
83
|
-
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
84
|
-
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
85
|
-
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
86
|
-
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
87
|
-
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
88
|
-
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
89
|
-
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
90
|
-
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
91
|
-
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
92
|
-
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
93
|
-
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
94
|
-
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
95
|
-
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
96
|
-
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
97
|
-
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
98
|
-
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
99
|
-
row.spacer 2
|
100
|
-
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
101
|
-
row.spacer 2
|
102
|
-
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
103
|
-
row.spacer 2
|
104
|
-
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
105
|
-
row.spacer 2
|
106
|
-
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
107
|
-
row.spacer 2
|
108
|
-
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
109
|
-
row.spacer 2
|
110
|
-
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
111
|
-
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
112
|
-
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
113
|
-
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
114
|
-
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
115
|
-
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
116
|
-
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
117
|
-
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
118
|
-
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
119
|
-
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
120
|
-
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
121
|
-
row.column 'filler' , 1, :type => :string # NOT USED
|
122
|
-
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
123
|
-
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
127
123
|
end
|
128
124
|
class ParserC
|
129
125
|
attr_accessor :year
|
130
126
|
def initialize(options = {})
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
def add_hints!(bus)
|
135
|
-
# File will decide format based on filename
|
127
|
+
options = options.stringify_keys
|
128
|
+
@year = options['year']
|
136
129
|
end
|
137
130
|
|
138
131
|
def apply(row)
|
@@ -153,11 +146,8 @@ class AutomobileVariant < ActiveRecord::Base
|
|
153
146
|
class ParserD
|
154
147
|
attr_accessor :year
|
155
148
|
def initialize(options = {})
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
def add_hints!(bus)
|
160
|
-
bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
|
149
|
+
options = options.stringify_keys
|
150
|
+
@year = options['year']
|
161
151
|
end
|
162
152
|
|
163
153
|
def apply(row)
|
@@ -238,8 +228,7 @@ class AutomobileVariant < ActiveRecord::Base
|
|
238
228
|
end
|
239
229
|
end
|
240
230
|
|
241
|
-
errata =
|
242
|
-
:responder => AutomobileVariant::Guru.new
|
231
|
+
errata = { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv', :responder => 'AutomobileVariant::Guru' }
|
243
232
|
|
244
233
|
data_miner do
|
245
234
|
# 1985---1997
|
@@ -248,6 +237,10 @@ class AutomobileVariant < ActiveRecord::Base
|
|
248
237
|
import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
|
249
238
|
:filename => filename,
|
250
239
|
:transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
|
240
|
+
:format => :fixed_width,
|
241
|
+
:cut => (yy == 95) ? '13-' : nil,
|
242
|
+
:schema_name => :fuel_economy_guide_b,
|
243
|
+
:select => lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' },
|
251
244
|
:errata => errata) do
|
252
245
|
key 'row_hash'
|
253
246
|
store 'make_name', :field_name => 'make'
|
@@ -317,6 +310,7 @@ class AutomobileVariant < ActiveRecord::Base
|
|
317
310
|
# 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
|
318
311
|
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
319
312
|
import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
|
313
|
+
:reject => (year == 2007) ? lambda { |row| row.values.first.blank? } : nil,
|
320
314
|
:errata => errata) do
|
321
315
|
key 'row_hash'
|
322
316
|
store 'make_name', :field_name => 'make'
|
data/test/test_old_syntax.rb
CHANGED
@@ -548,7 +548,7 @@ class AircraftDeux < ActiveRecord::Base
|
|
548
548
|
import("ICAO codes starting with letter #{letter} used by the FAA",
|
549
549
|
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
550
550
|
:encoding => 'windows-1252',
|
551
|
-
:errata => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
|
551
|
+
:errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw' },
|
552
552
|
:row_xpath => '//table/tr[2]/td/table/tr',
|
553
553
|
:column_xpath => 'td') do
|
554
554
|
key 'icao_code', :field_name => 'Designator'
|
@@ -588,7 +588,7 @@ class AutomobileMakeFleetYear < ActiveRecord::Base
|
|
588
588
|
|
589
589
|
# CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
|
590
590
|
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
|
591
|
-
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
|
591
|
+
:errata => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv' },
|
592
592
|
:select => lambda { |row| row['volume'].to_i > 0 } do
|
593
593
|
key 'name', :synthesize => lambda { |row| [ row['manufacturer_name'], row['fleet'][2,2], row['year_content'] ].join ' ' }
|
594
594
|
store 'make_name', :field_name => 'manufacturer_name'
|
@@ -661,7 +661,7 @@ class TestOldSyntax < Test::Unit::TestCase
|
|
661
661
|
end
|
662
662
|
end
|
663
663
|
assert_kind_of DataMiner::Import, AutomobileFuelType.data_miner_config.steps.first
|
664
|
-
assert_equal 'http://example.com', AutomobileFuelType.data_miner_config.steps.first.table.
|
664
|
+
assert_equal 'http://example.com', AutomobileFuelType.data_miner_config.steps.first.table.url
|
665
665
|
assert_equal 1, AutomobileFuelType.data_miner_config.step_counter
|
666
666
|
end
|
667
667
|
should "stop and finish if it gets a DataMiner::Finish" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 0
|
9
8
|
- 1
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 1.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Seamus Abshere
|
@@ -28,12 +28,12 @@ dependencies:
|
|
28
28
|
requirements:
|
29
29
|
- - ">="
|
30
30
|
- !ruby/object:Gem::Version
|
31
|
-
hash:
|
31
|
+
hash: 19
|
32
32
|
segments:
|
33
33
|
- 1
|
34
|
+
- 1
|
34
35
|
- 0
|
35
|
-
|
36
|
-
version: 1.0.3
|
36
|
+
version: 1.1.0
|
37
37
|
type: :runtime
|
38
38
|
version_requirements: *id001
|
39
39
|
- !ruby/object:Gem::Dependency
|