data_miner-ruby19 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.rdoc +271 -0
- data/lib/data_miner.rb +136 -0
- data/lib/data_miner/attribute.rb +233 -0
- data/lib/data_miner/base.rb +194 -0
- data/lib/data_miner/dictionary.rb +36 -0
- data/lib/data_miner/import.rb +70 -0
- data/lib/data_miner/process.rb +37 -0
- data/lib/data_miner/run.rb +26 -0
- data/lib/data_miner/schema.rb +244 -0
- data/lib/data_miner/tap.rb +146 -0
- data/test/data_miner_test.rb +1399 -0
- data/test/test_helper.rb +307 -0
- metadata +330 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Brighter Planet
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,271 @@
|
|
1
|
+
=data_miner
|
2
|
+
|
3
|
+
Programmatically import useful data into your ActiveRecord models.
|
4
|
+
|
5
|
+
(see http://wiki.github.com/seamusabshere/data_miner for more examples)
|
6
|
+
|
7
|
+
==Quick start
|
8
|
+
|
9
|
+
You define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
|
10
|
+
|
11
|
+
class Country < ActiveRecord::Base
|
12
|
+
set_primary_key :iso_3166_code
|
13
|
+
|
14
|
+
data_miner do
|
15
|
+
schema do
|
16
|
+
string 'iso_3166_code'
|
17
|
+
string 'name'
|
18
|
+
end
|
19
|
+
|
20
|
+
import 'the official ISO country list',
|
21
|
+
:url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
|
22
|
+
:skip => 2,
|
23
|
+
:headers => false,
|
24
|
+
:delimiter => ';',
|
25
|
+
:encoding => 'ISO-8859-1' do
|
26
|
+
key 'iso_3166_code', :field_number => 1
|
27
|
+
store 'name', :field_number => 0
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
Now you can run:
|
33
|
+
|
34
|
+
irb(main):001:0> Country.run_data_miner!
|
35
|
+
=> nil
|
36
|
+
|
37
|
+
==Advanced usage
|
38
|
+
|
39
|
+
This is how we linked together (http://data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportation's list of aircraft:
|
40
|
+
|
41
|
+
class Aircraft < ActiveRecord::Base
|
42
|
+
# Tell ActiveRecord that we want to use a string primary key.
|
43
|
+
# This makes it easier to repeatedly truncate and re-import this
|
44
|
+
# table without breaking associations.
|
45
|
+
set_primary_key :icao_code
|
46
|
+
|
47
|
+
# A dictionary between BTS aircraft type codes and ICAO aircraft
|
48
|
+
# codes that uses string similarity instead of exact matching.
|
49
|
+
# This is preferable to typing everything out.
|
50
|
+
def self.bts_name_dictionary
|
51
|
+
# Sorry for documenting the LooseTightDictionary gem here, but it's useful
|
52
|
+
@_bts_dictionary ||= LooseTightDictionary.new(
|
53
|
+
# The first argument is the source... the possible matches. Most Enumerables will do.
|
54
|
+
RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }),
|
55
|
+
# Tightenings optionally pull out what is important on both sides of a potential match
|
56
|
+
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
|
57
|
+
# Identities optionally require a particular capture from both sides of a match to be equal
|
58
|
+
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
|
59
|
+
# Blockings restrict comparisons to a subset where everything matches the blocking
|
60
|
+
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
|
61
|
+
# This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking.
|
62
|
+
# This is useful because we say /boeing/ and only boeings are matched against other boeings.
|
63
|
+
:blocking_only => true,
|
64
|
+
# Tell the dictionary how to read things from the source.
|
65
|
+
:right_reader => lambda { |record| record['Description'] }
|
66
|
+
)
|
67
|
+
end
|
68
|
+
|
69
|
+
# A dictionary between what appear to be ICAO aircraft names and
|
70
|
+
# objects of this class itself.
|
71
|
+
# Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step.
|
72
|
+
def self.icao_name_dictionary
|
73
|
+
@_icao_dictionary ||= LooseTightDictionary.new Aircraft.all,
|
74
|
+
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
|
75
|
+
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
|
76
|
+
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
|
77
|
+
:right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s }
|
78
|
+
end
|
79
|
+
|
80
|
+
# This responds to the "Matcher" interface as defined by DataMiner.
|
81
|
+
# In other words, it takes Matcher#match(*args) and returns something.
|
82
|
+
class BtsMatcher
|
83
|
+
attr_reader :wants
|
84
|
+
def initialize(wants)
|
85
|
+
@wants = wants
|
86
|
+
end
|
87
|
+
def match(raw_faa_icao_record)
|
88
|
+
@_match ||= Hash.new
|
89
|
+
return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record)
|
90
|
+
faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ]
|
91
|
+
bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record
|
92
|
+
retval = case wants
|
93
|
+
when :bts_aircraft_type_code
|
94
|
+
bts_record['Code']
|
95
|
+
when :bts_name
|
96
|
+
bts_record['Description']
|
97
|
+
end if bts_record
|
98
|
+
@_match[raw_faa_icao_record] = retval
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
# Another class that implements the "Matcher" interface as expected by DataMiner.
|
103
|
+
class FuelUseMatcher
|
104
|
+
def match(raw_fuel_use_record)
|
105
|
+
@_match ||= Hash.new
|
106
|
+
return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record)
|
107
|
+
# First try assuming we have an ICAO code
|
108
|
+
aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/
|
109
|
+
Aircraft.find_by_icao_code raw_fuel_use_record['ICAO']
|
110
|
+
end
|
111
|
+
# No luck? then try a fuzzy match
|
112
|
+
aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present?
|
113
|
+
Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ]
|
114
|
+
end
|
115
|
+
if aircraft_record
|
116
|
+
@_match[raw_fuel_use_record] = aircraft_record.icao_code
|
117
|
+
else
|
118
|
+
# While we're developing the dictionary, we want it to blow up until we have 100% matchability
|
119
|
+
raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet"
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
# This responds to the "Responder" interface as expected by Errata.
|
125
|
+
# Basically it lets you say "Is a DC plane" in the errata file and
|
126
|
+
# have it map to a Ruby method.
|
127
|
+
class Guru
|
128
|
+
def is_a_dc_plane?(row)
|
129
|
+
row['Designator'] =~ /^DC\d/i
|
130
|
+
end
|
131
|
+
def is_a_g159?(row)
|
132
|
+
row['Designator'] =~ /^G159$/
|
133
|
+
end
|
134
|
+
def is_a_galx?(row)
|
135
|
+
row['Designator'] =~ /^GALX$/
|
136
|
+
end
|
137
|
+
def method_missing(method_id, *args, &block)
|
138
|
+
if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
|
139
|
+
manufacturer_name = $1
|
140
|
+
manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
|
141
|
+
matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
|
142
|
+
method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
|
143
|
+
else
|
144
|
+
super
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
data_miner do
|
150
|
+
# In our app, we defined DataMiner::Run.allowed? to return false if a run
|
151
|
+
# has taken place in the last hour (among other things).
|
152
|
+
# By raising DataMiner::Skip, we skip this run but call it a success.
|
153
|
+
process "Don't re-import too often" do
|
154
|
+
raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft
|
155
|
+
end
|
156
|
+
|
157
|
+
# Define the database schema in-line.
|
158
|
+
# It will destructively and automatically add/remove columns.
|
159
|
+
# This is "OK" because you can always just re-run the import script to get the data back.
|
160
|
+
# PS. if we were using DataMapper, we wouldn't need this.
|
161
|
+
schema :options => 'ENGINE=InnoDB default charset=utf8' do
|
162
|
+
string 'icao_code'
|
163
|
+
string 'manufacturer_name'
|
164
|
+
string 'name'
|
165
|
+
string 'bts_name'
|
166
|
+
string 'bts_aircraft_type_code'
|
167
|
+
string 'brighter_planet_aircraft_class_code'
|
168
|
+
string 'fuel_use_aircraft_name'
|
169
|
+
float 'm3'
|
170
|
+
string 'm3_units'
|
171
|
+
float 'm2'
|
172
|
+
string 'm2_units'
|
173
|
+
float 'm1'
|
174
|
+
string 'm1_units'
|
175
|
+
float 'endpoint_fuel'
|
176
|
+
string 'endpoint_fuel_units'
|
177
|
+
float 'seats'
|
178
|
+
float 'distance'
|
179
|
+
string 'distance_units'
|
180
|
+
float 'load_factor'
|
181
|
+
float 'freight_share'
|
182
|
+
float 'payload'
|
183
|
+
float 'weighting'
|
184
|
+
index 'bts_aircraft_type_code'
|
185
|
+
end
|
186
|
+
|
187
|
+
# The FAA publishes a document to help people identify aircraft by different names.
|
188
|
+
('A'..'Z').each do |letter|
|
189
|
+
import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA",
|
190
|
+
# The master URL of the source file (one for every letter)
|
191
|
+
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
192
|
+
# The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details.
|
193
|
+
:errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new),
|
194
|
+
# If it's not UTF-8, you should say what it is so that we can iconv it!
|
195
|
+
:encoding => 'windows-1252',
|
196
|
+
# Nokogiri is being used to grab each row starting from the second
|
197
|
+
:row_xpath => '//table/tr[2]/td/table/tr',
|
198
|
+
# ditto... XPath for Nokogiri
|
199
|
+
:column_xpath => 'td' ) do
|
200
|
+
# The code that they use is in fact the ICAO code!
|
201
|
+
key 'icao_code', :field_name => 'Designator'
|
202
|
+
# We get this for free
|
203
|
+
store 'manufacturer_name', :field_name => 'Manufacturer'
|
204
|
+
# ditto
|
205
|
+
store 'name', :field_name => 'Model'
|
206
|
+
# Use the loose-tight dictionary.
|
207
|
+
# It gets the entire input row to play with before deciding on an output.
|
208
|
+
store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code)
|
209
|
+
store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
213
|
+
# Pull in some data that might only be important to Brighter Planet
|
214
|
+
import "Brighter Planet's aircraft class codes",
|
215
|
+
:url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
|
216
|
+
key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
|
217
|
+
store 'brighter_planet_aircraft_class_code'
|
218
|
+
end
|
219
|
+
|
220
|
+
# Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel).
|
221
|
+
# This data comes from the EEA.
|
222
|
+
import "pre-calculated fuel use equation coefficients",
|
223
|
+
:url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods',
|
224
|
+
:select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do
|
225
|
+
# We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above.
|
226
|
+
key 'icao_code', :matcher => Aircraft::FuelUseMatcher.new
|
227
|
+
# Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft.
|
228
|
+
store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name'
|
229
|
+
store 'm3'
|
230
|
+
store 'm2'
|
231
|
+
store 'm1'
|
232
|
+
store 'endpoint_fuel', :field_name => 'b'
|
233
|
+
end
|
234
|
+
|
235
|
+
# Use arel and the weighted_average gem to do some crazy averaging.
|
236
|
+
# This assumes that you're dealing with the BTS T-100 flight segment data.
|
237
|
+
# See http://data.brighterplanet.com/flight_segments for a pre-sanitized version.
|
238
|
+
process "Derive some average flight characteristics from flight segments" do
|
239
|
+
FlightSegment.run_data_miner!
|
240
|
+
aircraft = Aircraft.arel_table
|
241
|
+
segments = FlightSegment.arel_table
|
242
|
+
|
243
|
+
conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])
|
244
|
+
update_all "seats = (#{FlightSegment.weighted_average_relation(:seats, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
245
|
+
update_all "distance = (#{FlightSegment.weighted_average_relation(:distance, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
246
|
+
update_all "load_factor = (#{FlightSegment.weighted_average_relation(:load_factor, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
247
|
+
update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
248
|
+
update_all "payload = (#{FlightSegment.weighted_average_relation(:payload, :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})"
|
249
|
+
|
250
|
+
update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})"
|
251
|
+
end
|
252
|
+
|
253
|
+
# And finally re-run the import of resources that depend on this resource.
|
254
|
+
# Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block;
|
255
|
+
# that's the right way to do dependencies. It won't get called twice in the same run.
|
256
|
+
[ AircraftManufacturer ].each do |synthetic_resource|
|
257
|
+
process "Synthesize #{synthetic_resource}" do
|
258
|
+
synthetic_resource.run_data_miner!
|
259
|
+
end
|
260
|
+
end
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
==Authors
|
265
|
+
|
266
|
+
* Seamus Abshere <seamus@abshere.net>
|
267
|
+
* Andy Rossmeissl <andy@rossmeissl.net>
|
268
|
+
|
269
|
+
==Copyright
|
270
|
+
|
271
|
+
Copyright (c) 2010 Brighter Planet. See LICENSE for details.
|
data/lib/data_miner.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/version'
|
3
|
+
%w{
|
4
|
+
active_support/core_ext/array/conversions
|
5
|
+
active_support/core_ext/string/access
|
6
|
+
active_support/core_ext/string/multibyte
|
7
|
+
}.each do |active_support_3_requirement|
|
8
|
+
require active_support_3_requirement
|
9
|
+
end if ActiveSupport::VERSION::MAJOR == 3
|
10
|
+
|
11
|
+
require 'active_record'
|
12
|
+
require 'blockenspiel'
|
13
|
+
require 'conversions'
|
14
|
+
require 'errata'
|
15
|
+
require 'remote_table'
|
16
|
+
require 'escape'
|
17
|
+
require 'andand'
|
18
|
+
require 'log4r'
|
19
|
+
require 'fileutils'
|
20
|
+
require 'tmpdir'
|
21
|
+
require 'zlib'
|
22
|
+
|
23
|
+
require 'data_miner/attribute'
|
24
|
+
require 'data_miner/base'
|
25
|
+
require 'data_miner/dictionary'
|
26
|
+
require 'data_miner/import'
|
27
|
+
require 'data_miner/tap'
|
28
|
+
require 'data_miner/process'
|
29
|
+
require 'data_miner/run'
|
30
|
+
require 'data_miner/schema'
|
31
|
+
|
32
|
+
module DataMiner
  # Gem-specific exception classes. The README shows Skip being raised inside
  # a process block to skip a run while still counting it as a success; how
  # MissingHashColumn and Finish are consumed is not visible from this file.
  class MissingHashColumn < StandardError; end
  class Finish < StandardError; end
  class Skip < StandardError; end

  # Module-wide logger; nil until start_logging assigns it.
  mattr_accessor :logger

  # Assigns a logger exactly once. Under Rails the Rails logger is reused;
  # otherwise a Log4r logger is built that writes DEBUG/INFO to
  # 'data_miner.log' and WARN/ERROR/FATAL to stderr, and that logger is also
  # handed to ActiveRecord.
  def self.start_logging
    return if logger
    return (self.logger = Rails.logger) if defined?(Rails)

    # Mixing Log4r into this module is what lets the bare constants below
    # (FileOutputter, Outputter, Logger, DEBUG, ...) resolve.
    include Log4r
    file_log = FileOutputter.new('f1', :filename => 'data_miner.log')
    stderr_log = Outputter.stderr
    file_log.only_at(DEBUG, INFO)
    stderr_log.only_at(WARN, ERROR, FATAL)

    self.logger = Logger.new('data_miner')
    logger.add(file_log, stderr_log)
    ActiveRecord::Base.logger = logger
  end

  # Raises +message+ (prefixed with the gem tag) unless RAILS_ENV is
  # 'production' or DONT_RAISE is 'true', in which case the message is only
  # logged at error level.
  def self.log_or_raise(message)
    tagged = "[data_miner gem] #{message}"
    log_only = ENV['RAILS_ENV'] == 'production' || ENV['DONT_RAISE'] == 'true'
    raise tagged unless log_only
    logger.error(tagged)
  end

  # Logs +message+ at info level with the gem tag prepended.
  def self.log_info(message)
    logger.info("[data_miner gem] #{message}")
  end

  # Logs +message+ at debug level with the gem tag prepended.
  def self.log_debug(message)
    logger.debug("[data_miner gem] #{message}")
  end

  # Runs DataMiner::Base with +options+ plus
  # :preserve_call_stack_between_runs => true, then empties the call stack.
  def self.run(options = {})
    run_options = options.merge(:preserve_call_stack_between_runs => true)
    DataMiner::Base.run(run_options)
    DataMiner::Base.call_stack.clear
  end

  # Resource names as tracked by DataMiner::Base.
  def self.resource_names
    DataMiner::Base.resource_names
  end

  # TODO this should probably live somewhere else
  #
  # Collapses +cmd+ onto one line (each newline and the spaces around it
  # become a single space), runs it in a subshell, and raises with the command
  # and its captured output if the exit status is not successful. Returns nil
  # on success.
  def self.backtick_with_reporting(cmd)
    one_line_cmd = cmd.gsub(/[ ]*\n[ ]*/m, ' ')
    output = `#{one_line_cmd}`
    return if $?.success?

    # The message literal is kept exactly as it appears in the source.
    raise %{
From the data_miner gem...

Command failed:
#{one_line_cmd}

Output:
#{output}
}
  end

end
|
101
|
+
|
102
|
+
ActiveRecord::Base.class_eval do
  # Disabled twin of data_miner: accepts the same block but never evaluates
  # it; it only makes sure logging is set up and records that the block was
  # skipped.
  def self.x_data_miner(&block)
    DataMiner.start_logging

    DataMiner.log_debug("Skipping data_miner block in #{name} because called as x_data_miner")
  end

  # Registers the calling model with DataMiner and evaluates +block+ against
  # a fresh DataMiner::Base built for that model.
  def self.data_miner(&block)
    DataMiner.start_logging

    unless table_exists?
      DataMiner.log_debug("Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema.")
    end

    # Register this model's name once.
    known_names = DataMiner.resource_names
    known_names.push(name) unless known_names.include?(name)

    # this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
    class_eval do
      cattr_accessor :data_miner_base

      # DataMiner::Run records scoped to this resource's name.
      def self.data_miner_runs
        DataMiner::Run.scoped(:conditions => { :resource_name => name })
      end

      # Runs this model's data_miner_base with +options+.
      def self.run_data_miner!(options = {})
        data_miner_base.run(options)
      end

      # Runs the first step that is exactly a DataMiner::Schema, if any.
      def self.execute_schema
        schema_step = data_miner_base.steps.detect { |step| step.instance_of?(DataMiner::Schema) }
        schema_step.run(nil) if schema_step
      end
    end
    self.data_miner_base = DataMiner::Base.new(self)

    Blockenspiel.invoke(block, data_miner_base)

    data_miner_base.after_invoke
  end
end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
module DataMiner
  # Describes how one attribute (named +name+) of a step's resource gets its
  # value from a row of source data: where to read it (static value, field
  # number, field name, matcher, dictionary, synthesize proc) and how to
  # post-process it (chars, split, whitespace cleanup, upcase, unit
  # conversion, sprintf).
  class Attribute
    attr_accessor :step
    attr_accessor :name
    attr_accessor :options

    delegate :resource, :to => :step

    # Every option key accepted by #initialize.
    VALID_OPTIONS = [
      :from_units,
      :to_units,
      :static,
      :dictionary,
      :matcher,
      :field_name,
      :delimiter,
      :split,
      :units,
      :sprintf,
      :nullify,
      :overwrite,
      :upcase,
      :units_field_name,
      :units_field_number,
      :field_number,
      :chars,
      :synthesize
    ]

    # step::    the step this attribute belongs to (it supplies +resource+)
    # name::    name of the attribute on the resource
    # options:: hash whose keys must come from VALID_OPTIONS (strings or
    #           symbols); unknown keys are reported via DataMiner.log_or_raise
    def initialize(step, name, options = {})
      # Symbolize a copy so the caller's hash is not modified.
      options = options.symbolize_keys

      @step = step
      @name = name

      invalid_option_keys = options.keys.reject { |k| VALID_OPTIONS.include? k }
      DataMiner.log_or_raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
      @options = options
    end

    def inspect
      "Attribute(#{resource}##{name})"
    end

    # Looks +str+ up in the configured dictionary.
    def value_in_dictionary(str)
      dictionary.lookup str
    end

    # Reads this attribute's raw value out of +row+ and cleans it up.
    #
    # Source precedence: the :static option, then :field_number (a Range is
    # joined with +delimiter+), then +field_name+.
    #
    # Returns nil when the source value is nil, returns ActiveRecord objects
    # untouched, and otherwise returns a cleaned String (or whatever
    # do_convert / do_sprintf produce when those options are set).
    #
    # The source value itself is never modified: all cleanup is done with
    # non-destructive String methods, so the row (or the :static option) can
    # safely be read again by other attributes.
    def value_in_source(row)
      value = if wants_static?
        static
      elsif field_number
        if field_number.is_a?(Range)
          field_number.map { |n| row[n] }.join(delimiter)
        else
          row[field_number]
        end
      else
        row[field_name]
      end
      return nil if value.nil?
      return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
      value = value.to_s
      # An out-of-range slice yields nil; coerce so the cleanup below still works.
      value = value[chars].to_s if wants_chars?
      value = do_split(value) if wants_split?
      # taken from old errata... maybe we want to do this here
      value = value.gsub(/[ ]+/, ' ')       # collapse runs of spaces
      # text.gsub!('- ', '-')
      value = value.gsub(/([^\\])~/, '\1 ') # a ~ not preceded by a backslash becomes a space
      value = value.strip
      value = value.upcase if wants_upcase?
      value = do_convert row, value if wants_conversion?
      value = do_sprintf value if wants_sprintf?
      value
    end

    # Hands the whole +row+ to the configured matcher and returns its answer.
    def match_row(row)
      matcher.match row
    end

    # Computes the final value for this attribute from +row+.
    # A matcher, when configured, takes over completely; otherwise the value
    # read by value_in_source is optionally passed through the dictionary and
    # then optionally replaced by the result of the :synthesize proc.
    def value_from_row(row)
      return match_row row if wants_matcher?
      value = value_in_source row
      return value if value.is_a? ActiveRecord::Base # carry through trapdoor
      value = value_in_dictionary value if wants_dictionary?
      value = synthesize.call(row) if wants_synthesize?
      value
    end

    # Assigns the value derived from +row+ to +record+.
    #
    # this will overwrite nils, even if wants_overwriting? is false
    #
    # Returns true if the attribute changed, false if it did not (or if
    # overwriting is disabled and a value is already present), and nil when
    # a non-nil value was assigned but the record reads back nil.
    def set_record_from_row(record, row)
      return false if !wants_overwriting? && !record.send(name).nil?
      what_it_was = record.send name
      what_it_should_be = value_from_row row

      record.send "#{name}=", what_it_should_be
      record.send "#{name}_units=", (to_units || unit_from_source(row)).to_s if wants_units?

      what_it_is = record.send name
      if what_it_is.nil? && !what_it_should_be.nil?
        DataMiner.log_debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
        nil
      elsif what_it_is == what_it_was
        false
      else
        true
      end
    end

    # Reads the unit out of +row+ (by units field name, else units field
    # number) and normalizes it to an underscored symbol.
    def unit_from_source(row)
      row[units_field_name || units_field_number].to_s.strip.underscore.to_sym
    end

    # Converts +value+ (as a float) from the source units to the target units.
    # Either side falls back to the unit found in +row+.
    # NOTE(review): Float#convert is presumably provided by the 'conversions'
    # gem required elsewhere in the project - confirm.
    def do_convert(row, value)
      DataMiner.log_or_raise "If you use :from_units, you need to set :to_units (#{inspect})" unless wants_units?
      value.to_f.convert((from_units || unit_from_source(row)), (to_units || unit_from_source(row)))
    end

    # Formats +value+ with the :sprintf template, first coercing it to a
    # float or integer when the template contains a %f or %d directive.
    def do_sprintf(value)
      template = sprintf
      if /\%[0-9\.]*f/.match template
        value = value.to_f
      elsif /\%[0-9\.]*d/.match template
        value = value.to_i
      end
      template % value
    end

    # Splits +value+ and keeps one piece, as configured by the :split option.
    def do_split(value)
      pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
      keep = split_options[:keep] || 0           # default is keep first element
      value.to_s.split(pattern)[keep].to_s
    end

    # Column type of this attribute as reported by the resource's columns_hash.
    def column_type
      resource.columns_hash[name.to_s].type
    end

    # Our wants and needs :)
    def wants_split?
      split_options.present?
    end
    def wants_sprintf?
      sprintf.present?
    end
    def wants_upcase?
      upcase.present?
    end
    # True whenever the :static key was given, even with a nil or false value.
    def wants_static?
      options.has_key? :static
    end
    # True unless :nullify was explicitly set to false.
    def wants_nullification?
      nullify != false
    end
    def wants_chars?
      chars.present?
    end
    def wants_synthesize?
      synthesize.is_a?(Proc)
    end
    # True unless :overwrite was explicitly set to false.
    def wants_overwriting?
      overwrite != false
    end
    def wants_conversion?
      from_units.present? || units_field_name.present? || units_field_number.present?
    end
    def wants_units?
      to_units.present? || units_field_name.present? || units_field_number.present?
    end
    def wants_dictionary?
      options[:dictionary].present?
    end
    def wants_matcher?
      options[:matcher].present?
    end

    # Options that always have values
    def field_name
      (options[:field_name] || name).to_s
    end
    def delimiter
      (options[:delimiter] || ', ')
    end

    # Options that can't be referred to by their names
    def split_options
      options[:split]
    end

    def from_units
      options[:from_units]
    end
    # :units is accepted as a fallback for :to_units.
    def to_units
      options[:to_units] || options[:units]
    end
    # Note: shadows Kernel#sprintf inside this class; returns the template.
    def sprintf
      options[:sprintf]
    end
    def nullify
      options[:nullify]
    end
    def overwrite
      options[:overwrite]
    end
    def upcase
      options[:upcase]
    end
    def units_field_name
      options[:units_field_name]
    end
    def units_field_number
      options[:units_field_number]
    end
    def field_number
      options[:field_number]
    end
    def chars
      options[:chars]
    end
    def synthesize
      options[:synthesize]
    end
    def static
      options[:static]
    end
    # Memoized dictionary: the :dictionary option itself when it already is a
    # Dictionary, otherwise a Dictionary built from it.
    def dictionary
      @_dictionary ||= (options[:dictionary].is_a?(Dictionary) ? options[:dictionary] : Dictionary.new(options[:dictionary]))
    end
    # Memoized matcher: a String option is treated as a class name that is
    # constantized and instantiated without arguments; anything else is used
    # as the matcher object directly.
    def matcher
      @_matcher ||= (options[:matcher].is_a?(String) ? options[:matcher].constantize.new : options[:matcher])
    end
  end
end
|