data_miner 2.0.1 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -7
- data/CHANGELOG +13 -0
- data/LICENSE +1 -1
- data/README.markdown +112 -0
- data/data_miner.gemspec +2 -2
- data/lib/data_miner.rb +26 -12
- data/lib/data_miner/active_record_class_methods.rb +108 -0
- data/lib/data_miner/attribute.rb +150 -76
- data/lib/data_miner/dictionary.rb +40 -18
- data/lib/data_miner/run.rb +35 -0
- data/lib/data_miner/script.rb +123 -2
- data/lib/data_miner/step.rb +11 -3
- data/lib/data_miner/step/import.rb +100 -64
- data/lib/data_miner/step/process.rb +46 -28
- data/lib/data_miner/step/tap.rb +156 -123
- data/lib/data_miner/version.rb +1 -1
- data/test/test_safety.rb +61 -25
- metadata +8 -6
- data/README.rdoc +0 -289
- data/lib/data_miner/active_record_extensions.rb +0 -38
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.1
|
4
|
+
version: 2.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-04
|
14
|
+
date: 2012-05-04 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: remote_table
|
@@ -141,7 +141,8 @@ dependencies:
|
|
141
141
|
- - ! '>='
|
142
142
|
- !ruby/object:Gem::Version
|
143
143
|
version: 0.5.1
|
144
|
-
description:
|
144
|
+
description: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and import
|
145
|
+
XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models. You can also convert
|
145
146
|
units.
|
146
147
|
email:
|
147
148
|
- seamus@abshere.net
|
@@ -153,11 +154,11 @@ files:
|
|
153
154
|
- CHANGELOG
|
154
155
|
- Gemfile
|
155
156
|
- LICENSE
|
156
|
-
- README.rdoc
|
157
|
+
- README.markdown
|
157
158
|
- Rakefile
|
158
159
|
- data_miner.gemspec
|
159
160
|
- lib/data_miner.rb
|
160
|
-
- lib/data_miner/active_record_extensions.rb
|
161
|
+
- lib/data_miner/active_record_class_methods.rb
|
161
162
|
- lib/data_miner/attribute.rb
|
162
163
|
- lib/data_miner/dictionary.rb
|
163
164
|
- lib/data_miner/run.rb
|
@@ -200,7 +201,8 @@ rubyforge_project: data_miner
|
|
200
201
|
rubygems_version: 1.8.21
|
201
202
|
signing_key:
|
202
203
|
specification_version: 3
|
203
|
-
summary:
|
204
|
+
summary: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and import
|
205
|
+
XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models.
|
204
206
|
test_files:
|
205
207
|
- test/helper.rb
|
206
208
|
- test/support/breeds.xls
|
data/README.rdoc
DELETED
@@ -1,289 +0,0 @@
|
|
1
|
-
=data_miner
|
2
|
-
|
3
|
-
Programmatically import useful data into your ActiveRecord models.
|
4
|
-
|
5
|
-
(see http://wiki.github.com/seamusabshere/data_miner for more examples)
|
6
|
-
|
7
|
-
==Quick start
|
8
|
-
|
9
|
-
You define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
|
10
|
-
|
11
|
-
class Country < ActiveRecord::Base
|
12
|
-
self.primary_key = :iso_3166_code
|
13
|
-
|
14
|
-
data_miner do
|
15
|
-
import 'the official ISO country list',
|
16
|
-
:url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
|
17
|
-
:skip => 2,
|
18
|
-
:headers => false,
|
19
|
-
:delimiter => ';',
|
20
|
-
:encoding => 'ISO-8859-1' do
|
21
|
-
key :iso_3166_code, :field_number => 1
|
22
|
-
store :name, :field_number => 0
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
Now you can run:
|
28
|
-
|
29
|
-
irb(main):001:0> Country.run_data_miner!
|
30
|
-
=> nil
|
31
|
-
|
32
|
-
== Creating tables from scratch (changed in 1.2)
|
33
|
-
|
34
|
-
We recommend using the <tt>mini_record-compat</tt> gem (https://github.com/seamusabshere/mini_record)
|
35
|
-
|
36
|
-
This replaces the <tt>schema</tt> method that was available before. It didn't make sense for <tt>data_miner</tt> to provide this natively.
|
37
|
-
|
38
|
-
class Car < ActiveRecord::Base
|
39
|
-
# the mini_record way
|
40
|
-
col :make
|
41
|
-
col :model
|
42
|
-
|
43
|
-
data_miner do
|
44
|
-
# DEPRECATED - see above
|
45
|
-
# schema do
|
46
|
-
# string :make
|
47
|
-
# string :model
|
48
|
-
# end
|
49
|
-
|
50
|
-
# the mini_record way
|
51
|
-
process :auto_upgrade!
|
52
|
-
|
53
|
-
# [... other data mining steps]
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
==Advanced usage
|
58
|
-
|
59
|
-
This is how we linked together (http://data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportations list of aircraft:
|
60
|
-
|
61
|
-
class Aircraft < ActiveRecord::Base
|
62
|
-
# Tell ActiveRecord that we want to use a string primary key.
|
63
|
-
# This makes it easier to repeatedly truncate and re-import this
|
64
|
-
# table without breaking associations.
|
65
|
-
self.primary_key = :icao_code
|
66
|
-
|
67
|
-
# Use the mini_record-compat gem to define the database schema in-line.
|
68
|
-
# It will destructively and automatically add/remove columns.
|
69
|
-
# This is "OK" because you can always just re-run the import script to get the data back.
|
70
|
-
# PS. If you're using DataMapper, you don't need this
|
71
|
-
col :icao_code
|
72
|
-
col :manufacturer_name
|
73
|
-
col :name
|
74
|
-
col :bts_name
|
75
|
-
col :bts_aircraft_type_code
|
76
|
-
col :brighter_planet_aircraft_class_code
|
77
|
-
col :fuel_use_aircraft_name
|
78
|
-
col :m3, :type => :float
|
79
|
-
col :m3_units
|
80
|
-
col :m2, :type => :float
|
81
|
-
col :m2_units
|
82
|
-
col :m1, :type => :float
|
83
|
-
col :m1_units
|
84
|
-
col :endpoint_fuel, :type => :float
|
85
|
-
col :endpoint_fuel_units
|
86
|
-
col :seats, :type => :float
|
87
|
-
col :distance, :type => :float
|
88
|
-
col :distance_units
|
89
|
-
col :load_factor, :type => :float
|
90
|
-
col :freight_share, :type => :float
|
91
|
-
col :payload, :type => :float
|
92
|
-
col :weighting, :type => :float
|
93
|
-
col :bts_aircraft_type_code, :type => :index
|
94
|
-
|
95
|
-
# A dictionary between BTS aircraft type codes and ICAO aircraft
|
96
|
-
# codes that uses string similarity instead of exact matching.
|
97
|
-
# This is preferable to typing everything out.
|
98
|
-
def self.bts_name_dictionary
|
99
|
-
# Sorry for documenting the LooseTightDictionary gem here, but it's useful
|
100
|
-
@_bts_dictionary ||= LooseTightDictionary.new(
|
101
|
-
# The first argument is the source... the possible matches. Most Enumerables will do.
|
102
|
-
RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }),
|
103
|
-
# Tightenings optionally pull out what is important on both sides of a potential match
|
104
|
-
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
|
105
|
-
# Identities optionally require a particular capture from both sides of a match to be equal
|
106
|
-
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
|
107
|
-
# Blockings restrict comparisons to a subset where everything matches the blocking
|
108
|
-
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
|
109
|
-
# This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking.
|
110
|
-
# This is useful because we say /boeing/ and only boeings are matched against other boeings.
|
111
|
-
:blocking_only => true,
|
112
|
-
# Tell the dictionary how read things from the source.
|
113
|
-
:right_reader => lambda { |record| record['Description'] }
|
114
|
-
)
|
115
|
-
end
|
116
|
-
|
117
|
-
# A dictionary between what appear to be ICAO aircraft names and
|
118
|
-
# objects of this class itself.
|
119
|
-
# Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step.
|
120
|
-
def self.icao_name_dictionary
|
121
|
-
@_icao_dictionary ||= LooseTightDictionary.new Aircraft.all,
|
122
|
-
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
|
123
|
-
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
|
124
|
-
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
|
125
|
-
:right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s }
|
126
|
-
end
|
127
|
-
|
128
|
-
# This responds to the "Matcher" interface as defined by DataMiner.
|
129
|
-
# In other words, it takes Matcher#match(*args) and returns something.
|
130
|
-
class BtsMatcher
|
131
|
-
attr_reader :wants
|
132
|
-
def initialize(wants)
|
133
|
-
@wants = wants
|
134
|
-
end
|
135
|
-
def match(raw_faa_icao_record)
|
136
|
-
@_match ||= Hash.new
|
137
|
-
return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record)
|
138
|
-
faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ]
|
139
|
-
bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record
|
140
|
-
retval = case wants
|
141
|
-
when :bts_aircraft_type_code
|
142
|
-
bts_record['Code']
|
143
|
-
when :bts_name
|
144
|
-
bts_record['Description']
|
145
|
-
end if bts_record
|
146
|
-
@_match[raw_faa_icao_record] = retval
|
147
|
-
end
|
148
|
-
end
|
149
|
-
|
150
|
-
# Another class that implements the "Matcher" interface as expected by DataMiner.
|
151
|
-
class FuelUseMatcher
|
152
|
-
def match(raw_fuel_use_record)
|
153
|
-
@_match ||= Hash.new
|
154
|
-
return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record)
|
155
|
-
# First try assuming we have an ICAO code
|
156
|
-
aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/
|
157
|
-
Aircraft.find_by_icao_code raw_fuel_use_record['ICAO']
|
158
|
-
end
|
159
|
-
# No luck? then try a fuzzy match
|
160
|
-
aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present?
|
161
|
-
Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ]
|
162
|
-
end
|
163
|
-
if aircraft_record
|
164
|
-
@_match[raw_fuel_use_record] = aircraft_record.icao_code
|
165
|
-
else
|
166
|
-
# While we're developing the dictionary, we want it to blow up until we have 100% matchability
|
167
|
-
raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet"
|
168
|
-
end
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
# This responds to the "Responder" interface as expected by Errata.
|
173
|
-
# Basically it lets you say "Is a DC plane" in the errata file and
|
174
|
-
# have it map to a Ruby method.
|
175
|
-
class Guru
|
176
|
-
def is_a_dc_plane?(row)
|
177
|
-
row['Designator'] =~ /^DC\d/i
|
178
|
-
end
|
179
|
-
def is_a_g159?(row)
|
180
|
-
row['Designator'] =~ /^G159$/
|
181
|
-
end
|
182
|
-
def is_a_galx?(row)
|
183
|
-
row['Designator'] =~ /^GALX$/
|
184
|
-
end
|
185
|
-
def method_missing(method_id, *args, &block)
|
186
|
-
if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
|
187
|
-
manufacturer_name = $1
|
188
|
-
manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
|
189
|
-
matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
|
190
|
-
method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
|
191
|
-
else
|
192
|
-
super
|
193
|
-
end
|
194
|
-
end
|
195
|
-
end
|
196
|
-
|
197
|
-
data_miner do
|
198
|
-
# In our app, we defined DataMiner::Run.allowed? to return false if a run
|
199
|
-
# has taken place in the last hour (among other things).
|
200
|
-
# By raising DataMiner::Skip, we skip this run but call it a success.
|
201
|
-
process "Don't re-import too often" do
|
202
|
-
raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft
|
203
|
-
end
|
204
|
-
|
205
|
-
# The FAA publishes a document to help people identify aircraft by different names.
|
206
|
-
('A'..'Z').each do |letter|
|
207
|
-
import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA",
|
208
|
-
# The master URL of the source file (one for every letter)
|
209
|
-
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
210
|
-
# The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details.
|
211
|
-
:errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new),
|
212
|
-
# If it's not UTF-8, you should say what it is so that we can iconv it!
|
213
|
-
:encoding => 'windows-1252',
|
214
|
-
# Nokogiri is being used to grab each row starting from the second
|
215
|
-
:row_xpath => '//table/tr[2]/td/table/tr',
|
216
|
-
# ditto... XPath for Nokogiri
|
217
|
-
:column_xpath => 'td' ) do
|
218
|
-
# The code that they use is in fact the ICAO code!
|
219
|
-
key 'icao_code', :field_name => 'Designator'
|
220
|
-
# We get this for free
|
221
|
-
store 'manufacturer_name', :field_name => 'Manufacturer'
|
222
|
-
# ditto
|
223
|
-
store 'name', :field_name => 'Model'
|
224
|
-
# Use the loose-tight dictionary.
|
225
|
-
# It gets the entire input row to play with before deciding on an output.
|
226
|
-
store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code)
|
227
|
-
store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name)
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
# Pull in some data that might only be important to Brighter Planet
|
232
|
-
import "Brighter Planet's aircraft class codes",
|
233
|
-
:url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
|
234
|
-
key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
|
235
|
-
store 'brighter_planet_aircraft_class_code'
|
236
|
-
end
|
237
|
-
|
238
|
-
# Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel).
|
239
|
-
# This data comes from the EEA.
|
240
|
-
import "pre-calculated fuel use equation coefficients",
|
241
|
-
:url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods',
|
242
|
-
:select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do
|
243
|
-
# We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above.
|
244
|
-
key 'icao_code', :matcher => Aircraft::FuelUseMatcher.new
|
245
|
-
# Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft.
|
246
|
-
store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name'
|
247
|
-
store 'm3'
|
248
|
-
store 'm2'
|
249
|
-
store 'm1'
|
250
|
-
store 'endpoint_fuel', :field_name => 'b'
|
251
|
-
end
|
252
|
-
|
253
|
-
# Use arel and the weighted_average gem to do some crazy averaging.
|
254
|
-
# This assumes that you're dealing with the BTS T-100 flight segment data.
|
255
|
-
# See http://data.brighterplanet.com/flight_segments for a pre-sanitized version.
|
256
|
-
process "Derive some average flight characteristics from flight segments" do
|
257
|
-
FlightSegment.run_data_miner!
|
258
|
-
aircraft = Aircraft.arel_table
|
259
|
-
segments = FlightSegment.arel_table
|
260
|
-
|
261
|
-
conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])
|
262
|
-
update_all "seats = (#{FlightSegment.weighted_average_relation(:seats, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
263
|
-
update_all "distance = (#{FlightSegment.weighted_average_relation(:distance, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
264
|
-
update_all "load_factor = (#{FlightSegment.weighted_average_relation(:load_factor, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
265
|
-
update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
266
|
-
update_all "payload = (#{FlightSegment.weighted_average_relation(:payload, :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})"
|
267
|
-
|
268
|
-
update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})"
|
269
|
-
end
|
270
|
-
|
271
|
-
# And finally re-run the import of resources that depend on this model.
|
272
|
-
# Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block;
|
273
|
-
# that's the right way to do dependencies. It won't get called twice in the same run.
|
274
|
-
[ AircraftManufacturer ].each do |synthetic_resource|
|
275
|
-
process "Synthesize #{synthetic_resource}" do
|
276
|
-
synthetic_resource.run_data_miner!
|
277
|
-
end
|
278
|
-
end
|
279
|
-
end
|
280
|
-
end
|
281
|
-
|
282
|
-
==Authors
|
283
|
-
|
284
|
-
* Seamus Abshere <seamus@abshere.net>
|
285
|
-
* Andy Rossmeissl <andy@rossmeissl.net>
|
286
|
-
|
287
|
-
==Copyright
|
288
|
-
|
289
|
-
Copyright (c) 2010 Brighter Planet. See LICENSE for details.
|
data/lib/data_miner/active_record_extensions.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
require 'active_record'
|
2
|
-
require 'lock_method'
|
3
|
-
|
4
|
-
class DataMiner
|
5
|
-
module ActiveRecordExtensions
|
6
|
-
MUTEX = ::Mutex.new
|
7
|
-
|
8
|
-
def data_miner_script
|
9
|
-
@data_miner_script || MUTEX.synchronize do
|
10
|
-
@data_miner_script ||= DataMiner::Script.new(self)
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
def data_miner_runs
|
15
|
-
DataMiner::Run.scoped :conditions => { :model_name => name }
|
16
|
-
end
|
17
|
-
|
18
|
-
def run_data_miner!
|
19
|
-
data_miner_script.perform
|
20
|
-
end
|
21
|
-
|
22
|
-
def run_data_miner_on_parent_associations!
|
23
|
-
reflect_on_all_associations(:belongs_to).reject do |assoc|
|
24
|
-
assoc.options[:polymorphic]
|
25
|
-
end.each do |non_polymorphic_belongs_to_assoc|
|
26
|
-
non_polymorphic_belongs_to_assoc.klass.run_data_miner!
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def data_miner(options = {}, &blk)
|
31
|
-
DataMiner.model_names.add name
|
32
|
-
unless options[:append]
|
33
|
-
@data_miner_script = nil
|
34
|
-
end
|
35
|
-
data_miner_script.append_block blk
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|