data_miner 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -0
- data/data_miner.gemspec +6 -2
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +1 -7
- data/test/support/aircraft.rb +5 -2
- data/test/support/automobile_variant.rb +60 -66
- data/test/test_old_syntax.rb +3 -3
- metadata +6 -6
data/CHANGELOG
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
1.1.0
|
2
|
+
* fixed dependency issues
|
3
|
+
1.0.0
|
4
|
+
* bundler and gemspec instead of jeweler
|
5
|
+
* clear up memory leaks and destructive argument borking
|
6
|
+
* mostly backwards compatible (but no add_hints! in remote_table transforms, for example)
|
1
7
|
0.2.6
|
2
8
|
* Upgrade to remote_table 0.1.6 to handle UTF-8 CSVs and long urls.
|
3
9
|
0.3.0
|
data/data_miner.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
|
22
|
-
s.add_dependency 'remote_table', '>=1.0.3'
|
22
|
+
s.add_dependency 'remote_table', '>=1.1.0'
|
23
23
|
s.add_dependency 'escape', '>=0.0.4'
|
24
24
|
s.add_dependency 'activerecord', '>=2.3.4'
|
25
25
|
s.add_dependency 'activesupport', '>=2.3.4'
|
@@ -31,5 +31,9 @@ Gem::Specification.new do |s|
|
|
31
31
|
s.add_development_dependency 'test-unit'
|
32
32
|
s.add_development_dependency 'shoulda'
|
33
33
|
s.add_development_dependency 'mysql'
|
34
|
-
|
34
|
+
if RUBY_VERSION >= '1.9'
|
35
|
+
s.add_development_dependency 'ruby-debug19'
|
36
|
+
else
|
37
|
+
s.add_development_dependency 'ruby-debug'
|
38
|
+
end
|
35
39
|
end
|
data/lib/data_miner/version.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -1,15 +1,9 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler'
|
3
|
-
unless RUBY_VERSION >= '1.9'
|
4
|
-
gem 'fastercsv'
|
5
|
-
require 'fastercsv'
|
6
|
-
end
|
7
3
|
Bundler.setup
|
8
4
|
require 'test/unit'
|
9
5
|
require 'shoulda'
|
10
|
-
|
11
|
-
require 'ruby-debug'
|
12
|
-
end
|
6
|
+
require 'ruby-debug'
|
13
7
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
14
8
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
15
9
|
require 'data_miner'
|
data/test/support/aircraft.rb
CHANGED
@@ -33,6 +33,10 @@ class Aircraft < ActiveRecord::Base
|
|
33
33
|
row['Manufacturer'] =~ /BOEING/i
|
34
34
|
end
|
35
35
|
|
36
|
+
def is_not_attributed_to_airbus?(row)
|
37
|
+
row['Manufacturer'] =~ /AIRBUS/i
|
38
|
+
end
|
39
|
+
|
36
40
|
def is_attributed_to_cessna?(row)
|
37
41
|
row['Manufacturer'] =~ /CESSNA/i
|
38
42
|
end
|
@@ -77,8 +81,7 @@ class Aircraft < ActiveRecord::Base
|
|
77
81
|
import("ICAO codes starting with letter #{letter} used by the FAA",
|
78
82
|
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
79
83
|
:encoding => 'US-ASCII',
|
80
|
-
:errata =>
|
81
|
-
:responder => Aircraft::Guru.new),
|
84
|
+
:errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => 'Aircraft::Guru' },
|
82
85
|
:row_xpath => '//table/tr[2]/td/table/tr',
|
83
86
|
:column_xpath => 'td') do
|
84
87
|
key 'icao_code', :field_name => 'Designator'
|
@@ -42,9 +42,58 @@ class AutomobileVariant < ActiveRecord::Base
|
|
42
42
|
}
|
43
43
|
|
44
44
|
class ParserB
|
45
|
+
require 'slither'
|
46
|
+
::Slither.define :fuel_economy_guide_b do |d|
|
47
|
+
d.rows do |row|
|
48
|
+
row.trap { true } # there's only one section
|
49
|
+
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
50
|
+
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
51
|
+
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
52
|
+
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
53
|
+
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
54
|
+
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
55
|
+
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
56
|
+
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
57
|
+
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
58
|
+
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
59
|
+
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
60
|
+
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
61
|
+
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
62
|
+
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
63
|
+
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
64
|
+
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
65
|
+
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
66
|
+
row.spacer 2
|
67
|
+
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
68
|
+
row.spacer 2
|
69
|
+
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
70
|
+
row.spacer 2
|
71
|
+
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
72
|
+
row.spacer 2
|
73
|
+
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
74
|
+
row.spacer 2
|
75
|
+
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
76
|
+
row.spacer 2
|
77
|
+
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
78
|
+
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
79
|
+
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
80
|
+
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
81
|
+
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
82
|
+
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
83
|
+
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
84
|
+
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
85
|
+
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
86
|
+
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
87
|
+
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
88
|
+
row.column 'filler' , 1, :type => :string # NOT USED
|
89
|
+
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
90
|
+
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
91
|
+
end
|
92
|
+
end
|
45
93
|
attr_accessor :year
|
46
94
|
def initialize(options = {})
|
47
|
-
|
95
|
+
options = options.stringify_keys
|
96
|
+
@year = options['year']
|
48
97
|
end
|
49
98
|
|
50
99
|
def apply(row)
|
@@ -71,68 +120,12 @@ class AutomobileVariant < ActiveRecord::Base
|
|
71
120
|
end
|
72
121
|
end
|
73
122
|
|
74
|
-
def add_hints!(bus)
|
75
|
-
bus[:format] = :fixed_width
|
76
|
-
bus[:cut] = '13-' if year == 1995
|
77
|
-
bus[:schema_name] = :fuel_economy_guide_b
|
78
|
-
bus[:select] = lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' }
|
79
|
-
Slither.define :fuel_economy_guide_b do |d|
|
80
|
-
d.rows do |row|
|
81
|
-
row.trap { true } # there's only one section
|
82
|
-
row.column 'active_year' , 4, :type => :integer # ACTIVE YEAR
|
83
|
-
row.column 'state_code' , 1, :type => :string # STATE CODE: F=49-STATE,C=CALIFORNIA
|
84
|
-
row.column 'carline_clss' , 2, :type => :integer # CARLINE CLASS CODE
|
85
|
-
row.column 'carline_mfr_code' , 3, :type => :integer # CARLINE MANUFACTURER CODE
|
86
|
-
row.column 'carline_name' , 28, :type => :string # CARLINE NAME
|
87
|
-
row.column 'disp_cub_in' , 4, :type => :integer # DISP CUBIC INCHES
|
88
|
-
row.column 'fuel_system' , 2, :type => :string # FUEL SYSTEM: 'FI' FOR FUEL INJECTION, 2-DIGIT INTEGER VALUE FOR #OF VENTURIES IF CARBURETOR SYSTEM.
|
89
|
-
row.column 'model_trans' , 6, :type => :string # TRANSMISSION TYPE
|
90
|
-
row.column 'no_cyc' , 2, :type => :integer # NUMBER OF ENGINE CYLINDERS
|
91
|
-
row.column 'date_time' , 12, :type => :string # DATE AND TIME RECORD ENTERED -YYMMDDHHMMSS (YEAR, MONTH, DAY, HOUR, MINUTE, SECOND)
|
92
|
-
row.column 'release_date' , 6, :type => :string # RELEASE DATE - YYMMDD (YEAR, MONTH, DAY)
|
93
|
-
row.column 'vi_mfr_code' , 3, :type => :integer # VI MANUFACTURER CODE
|
94
|
-
row.column 'carline_code' , 5, :type => :integer # CARLINE CODE
|
95
|
-
row.column 'basic_eng_id' , 5, :type => :integer # BASIC ENGINE INDEX
|
96
|
-
row.column 'carline_mfr_name' , 32, :type => :string # CARLINE MANUFACTURER NAME
|
97
|
-
row.column 'suppress_code' , 1, :type => :integer # SUPPRESSION CODE (NO SUPPRESSED RECORD IF FOR PUBLIC ACCESS)
|
98
|
-
row.column 'est_city_mpg' , 3, :type => :integer # ESTIMATED (CITY) MILES PER GALLON - 90% OF UNADJUSTED VALUE
|
99
|
-
row.spacer 2
|
100
|
-
row.column 'highway_mpg' , 3, :type => :integer # ESTIMATED (HWY) MILES PER GALLON - 78% OF UNADJUSTED VALUE
|
101
|
-
row.spacer 2
|
102
|
-
row.column 'combined_mpg' , 3, :type => :integer # COMBINED MILES PER GALLON
|
103
|
-
row.spacer 2
|
104
|
-
row.column 'unadj_city_mpg' , 3, :type => :integer # UNADJUSTED CITY MILES PER GALLON
|
105
|
-
row.spacer 2
|
106
|
-
row.column 'unadj_hwy_mpg' , 3, :type => :integer # UNADJUSTED HIGHWAY MILES PER GALLON
|
107
|
-
row.spacer 2
|
108
|
-
row.column 'unadj_comb_mpg' , 3, :type => :integer # UNADJUSTED COMBINED MILES PER GALLON
|
109
|
-
row.spacer 2
|
110
|
-
row.column 'ave_anl_fuel' , 6, :type => :integer # "$" in col 147, Annual Fuel Cost starting col 148 in I5
|
111
|
-
row.column 'opt_disp' , 8, :type => :string # OPTIONAL DISPLACEMENT
|
112
|
-
row.column 'engine_desc1' , 10, :type => :string # ENGINE DESCRIPTION 1
|
113
|
-
row.column 'engine_desc2' , 10, :type => :string # ENGINE DESCRIPTION 2
|
114
|
-
row.column 'engine_desc3' , 10, :type => :string # ENGINE DESCRIPTION 3
|
115
|
-
row.column 'body_type_2d' , 10, :type => :string # BODY TYPE 2 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '2DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
116
|
-
row.column 'body_type_4d' , 10, :type => :string # BODY TYPE 4 DOOR - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM '4DR-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
117
|
-
row.column 'body_type_hbk' , 10, :type => :string # BODY TYPE HBK - IF THE BODY TYPE APPLIES IT WILL TAKE THE FORM 'HBK-PPP/LL' WHERE PPP=PASSENGER INTERIOR VOLUME AND LL=LUGGAGE INTERIOR VOLUME.
|
118
|
-
row.column 'puerto_rico' , 1, :type => :string # '*' IF FOR PUERTO RICO SALES ONLY
|
119
|
-
row.column 'overdrive' , 4, :type => :string # OVERDRIVE: ' OD ' FOR OVERDRIVE, 'EOD ' FOR ELECTRICALLY OPERATED OVERDRIVE AND 'AEOD' FOR AUTOMATIC OVERDRIVE
|
120
|
-
row.column 'drive_system' , 3, :type => :string # FWD=FRONT WHEEL DRIVE, RWD=REAR, 4WD=4-WHEEL
|
121
|
-
row.column 'filler' , 1, :type => :string # NOT USED
|
122
|
-
row.column 'fuel_type' , 1, :type => :string # R=REGULAR(UNLEADED), P=PREMIUM, D=DIESEL
|
123
|
-
row.column 'trans_desc' , 15, :type => :string # TRANSMISSION DESCRIPTORS
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
127
123
|
end
|
128
124
|
class ParserC
|
129
125
|
attr_accessor :year
|
130
126
|
def initialize(options = {})
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
def add_hints!(bus)
|
135
|
-
# File will decide format based on filename
|
127
|
+
options = options.stringify_keys
|
128
|
+
@year = options['year']
|
136
129
|
end
|
137
130
|
|
138
131
|
def apply(row)
|
@@ -153,11 +146,8 @@ class AutomobileVariant < ActiveRecord::Base
|
|
153
146
|
class ParserD
|
154
147
|
attr_accessor :year
|
155
148
|
def initialize(options = {})
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
def add_hints!(bus)
|
160
|
-
bus[:reject] = lambda { |row| row.values.first.blank? } if year == 2007
|
149
|
+
options = options.stringify_keys
|
150
|
+
@year = options['year']
|
161
151
|
end
|
162
152
|
|
163
153
|
def apply(row)
|
@@ -238,8 +228,7 @@ class AutomobileVariant < ActiveRecord::Base
|
|
238
228
|
end
|
239
229
|
end
|
240
230
|
|
241
|
-
errata =
|
242
|
-
:responder => AutomobileVariant::Guru.new
|
231
|
+
errata = { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/fuel_economy_guide/errata.csv', :responder => 'AutomobileVariant::Guru' }
|
243
232
|
|
244
233
|
data_miner do
|
245
234
|
# 1985---1997
|
@@ -248,6 +237,10 @@ class AutomobileVariant < ActiveRecord::Base
|
|
248
237
|
import(:url => "http://www.fueleconomy.gov/FEG/epadata/#{yy}mfgui.zip",
|
249
238
|
:filename => filename,
|
250
239
|
:transform => { :class => FuelEconomyGuide::ParserB, :year => "19#{yy}".to_i },
|
240
|
+
:format => :fixed_width,
|
241
|
+
:cut => (yy == 95) ? '13-' : nil,
|
242
|
+
:schema_name => :fuel_economy_guide_b,
|
243
|
+
:select => lambda { |row| row['supress_code'].blank? and row['state_code'] == 'F' },
|
251
244
|
:errata => errata) do
|
252
245
|
key 'row_hash'
|
253
246
|
store 'make_name', :field_name => 'make'
|
@@ -317,6 +310,7 @@ class AutomobileVariant < ActiveRecord::Base
|
|
317
310
|
# 2010 => { :url => 'http://www.fueleconomy.gov/FEG/epadata/10data.zip', :filename => '2010FEguide-for DOE-rel dates before 10-16-09-no-sales10-8-09public.xls' }
|
318
311
|
}.sort { |a, b| a.first <=> b.first }.each do |year, options|
|
319
312
|
import options.merge(:transform => { :class => FuelEconomyGuide::ParserD, :year => year },
|
313
|
+
:reject => (year == 2007) ? lambda { |row| row.values.first.blank? } : nil,
|
320
314
|
:errata => errata) do
|
321
315
|
key 'row_hash'
|
322
316
|
store 'make_name', :field_name => 'make'
|
data/test/test_old_syntax.rb
CHANGED
@@ -548,7 +548,7 @@ class AircraftDeux < ActiveRecord::Base
|
|
548
548
|
import("ICAO codes starting with letter #{letter} used by the FAA",
|
549
549
|
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
550
550
|
:encoding => 'windows-1252',
|
551
|
-
:errata => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw',
|
551
|
+
:errata => { :url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw' },
|
552
552
|
:row_xpath => '//table/tr[2]/td/table/tr',
|
553
553
|
:column_xpath => 'td') do
|
554
554
|
key 'icao_code', :field_name => 'Designator'
|
@@ -588,7 +588,7 @@ class AutomobileMakeFleetYear < ActiveRecord::Base
|
|
588
588
|
|
589
589
|
# CAFE data privately emailed to Andy from Terry Anderson at the DOT/NHTSA
|
590
590
|
import :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/make_fleet_years.csv',
|
591
|
-
:errata => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv',
|
591
|
+
:errata => { :url => 'http://static.brighterplanet.com/science/data/transport/automobiles/make_fleet_years/errata.csv' },
|
592
592
|
:select => lambda { |row| row['volume'].to_i > 0 } do
|
593
593
|
key 'name', :synthesize => lambda { |row| [ row['manufacturer_name'], row['fleet'][2,2], row['year_content'] ].join ' ' }
|
594
594
|
store 'make_name', :field_name => 'manufacturer_name'
|
@@ -661,7 +661,7 @@ class TestOldSyntax < Test::Unit::TestCase
|
|
661
661
|
end
|
662
662
|
end
|
663
663
|
assert_kind_of DataMiner::Import, AutomobileFuelType.data_miner_config.steps.first
|
664
|
-
assert_equal 'http://example.com', AutomobileFuelType.data_miner_config.steps.first.table.
|
664
|
+
assert_equal 'http://example.com', AutomobileFuelType.data_miner_config.steps.first.table.url
|
665
665
|
assert_equal 1, AutomobileFuelType.data_miner_config.step_counter
|
666
666
|
end
|
667
667
|
should "stop and finish if it gets a DataMiner::Finish" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 0
|
9
8
|
- 1
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 1.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Seamus Abshere
|
@@ -28,12 +28,12 @@ dependencies:
|
|
28
28
|
requirements:
|
29
29
|
- - ">="
|
30
30
|
- !ruby/object:Gem::Version
|
31
|
-
hash:
|
31
|
+
hash: 19
|
32
32
|
segments:
|
33
33
|
- 1
|
34
|
+
- 1
|
34
35
|
- 0
|
35
|
-
|
36
|
-
version: 1.0.3
|
36
|
+
version: 1.1.0
|
37
37
|
type: :runtime
|
38
38
|
version_requirements: *id001
|
39
39
|
- !ruby/object:Gem::Dependency
|