data_miner-ruby19 0.5.4
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.rdoc +271 -0
- data/lib/data_miner.rb +136 -0
- data/lib/data_miner/attribute.rb +233 -0
- data/lib/data_miner/base.rb +194 -0
- data/lib/data_miner/dictionary.rb +36 -0
- data/lib/data_miner/import.rb +70 -0
- data/lib/data_miner/process.rb +37 -0
- data/lib/data_miner/run.rb +26 -0
- data/lib/data_miner/schema.rb +244 -0
- data/lib/data_miner/tap.rb +146 -0
- data/test/data_miner_test.rb +1399 -0
- data/test/test_helper.rb +307 -0
- metadata +330 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Brighter Planet
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,271 @@
|
|
1
|
+
=data_miner
|
2
|
+
|
3
|
+
Programmatically import useful data into your ActiveRecord models.
|
4
|
+
|
5
|
+
(see http://wiki.github.com/seamusabshere/data_miner for more examples)
|
6
|
+
|
7
|
+
==Quick start
|
8
|
+
|
9
|
+
You define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
|
10
|
+
|
11
|
+
class Country < ActiveRecord::Base
|
12
|
+
set_primary_key :iso_3166_code
|
13
|
+
|
14
|
+
data_miner do
|
15
|
+
schema do
|
16
|
+
string 'iso_3166_code'
|
17
|
+
string 'name'
|
18
|
+
end
|
19
|
+
|
20
|
+
import 'the official ISO country list',
|
21
|
+
:url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
|
22
|
+
:skip => 2,
|
23
|
+
:headers => false,
|
24
|
+
:delimiter => ';',
|
25
|
+
:encoding => 'ISO-8859-1' do
|
26
|
+
key 'iso_3166_code', :field_number => 1
|
27
|
+
store 'name', :field_number => 0
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
Now you can run:
|
33
|
+
|
34
|
+
irb(main):001:0> Country.run_data_miner!
|
35
|
+
=> nil
|
36
|
+
|
37
|
+
==Advanced usage
|
38
|
+
|
39
|
+
This is how we linked together (http://data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportation's list of aircraft:
|
40
|
+
|
41
|
+
class Aircraft < ActiveRecord::Base
|
42
|
+
# Tell ActiveRecord that we want to use a string primary key.
|
43
|
+
# This makes it easier to repeatedly truncate and re-import this
|
44
|
+
# table without breaking associations.
|
45
|
+
set_primary_key :icao_code
|
46
|
+
|
47
|
+
# A dictionary between BTS aircraft type codes and ICAO aircraft
|
48
|
+
# codes that uses string similarity instead of exact matching.
|
49
|
+
# This is preferable to typing everything out.
|
50
|
+
def self.bts_name_dictionary
|
51
|
+
# Sorry for documenting the LooseTightDictionary gem here, but it's useful
|
52
|
+
@_bts_dictionary ||= LooseTightDictionary.new(
|
53
|
+
# The first argument is the source... the possible matches. Most Enumerables will do.
|
54
|
+
RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }),
|
55
|
+
# Tightenings optionally pull out what is important on both sides of a potential match
|
56
|
+
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
|
57
|
+
# Identities optionally require a particular capture from both sides of a match to be equal
|
58
|
+
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
|
59
|
+
# Blockings restrict comparisons to a subset where everything matches the blocking
|
60
|
+
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
|
61
|
+
# This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking.
|
62
|
+
# This is useful because we say /boeing/ and only boeings are matched against other boeings.
|
63
|
+
:blocking_only => true,
|
64
|
+
# Tell the dictionary how to read things from the source.
|
65
|
+
:right_reader => lambda { |record| record['Description'] }
|
66
|
+
)
|
67
|
+
end
|
68
|
+
|
69
|
+
# A dictionary between what appear to be ICAO aircraft names and
|
70
|
+
# objects of this class itself.
|
71
|
+
# Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step.
|
72
|
+
def self.icao_name_dictionary
|
73
|
+
@_icao_dictionary ||= LooseTightDictionary.new Aircraft.all,
|
74
|
+
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
|
75
|
+
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
|
76
|
+
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
|
77
|
+
:right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s }
|
78
|
+
end
|
79
|
+
|
80
|
+
# This responds to the "Matcher" interface as defined by DataMiner.
|
81
|
+
# In other words, it takes Matcher#match(*args) and returns something.
|
82
|
+
class BtsMatcher
|
83
|
+
attr_reader :wants
|
84
|
+
def initialize(wants)
|
85
|
+
@wants = wants
|
86
|
+
end
|
87
|
+
def match(raw_faa_icao_record)
|
88
|
+
@_match ||= Hash.new
|
89
|
+
return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record)
|
90
|
+
faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ]
|
91
|
+
bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record
|
92
|
+
retval = case wants
|
93
|
+
when :bts_aircraft_type_code
|
94
|
+
bts_record['Code']
|
95
|
+
when :bts_name
|
96
|
+
bts_record['Description']
|
97
|
+
end if bts_record
|
98
|
+
@_match[raw_faa_icao_record] = retval
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
# Another class that implements the "Matcher" interface as expected by DataMiner.
|
103
|
+
class FuelUseMatcher
|
104
|
+
def match(raw_fuel_use_record)
|
105
|
+
@_match ||= Hash.new
|
106
|
+
return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record)
|
107
|
+
# First try assuming we have an ICAO code
|
108
|
+
aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/
|
109
|
+
Aircraft.find_by_icao_code raw_fuel_use_record['ICAO']
|
110
|
+
end
|
111
|
+
# No luck? then try a fuzzy match
|
112
|
+
aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present?
|
113
|
+
Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ]
|
114
|
+
end
|
115
|
+
if aircraft_record
|
116
|
+
@_match[raw_fuel_use_record] = aircraft_record.icao_code
|
117
|
+
else
|
118
|
+
# While we're developing the dictionary, we want it to blow up until we have 100% matchability
|
119
|
+
raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet"
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
# This responds to the "Responder" interface as expected by Errata.
|
125
|
+
# Basically it lets you say "Is a DC plane" in the errata file and
|
126
|
+
# have it map to a Ruby method.
|
127
|
+
class Guru
|
128
|
+
def is_a_dc_plane?(row)
|
129
|
+
row['Designator'] =~ /^DC\d/i
|
130
|
+
end
|
131
|
+
def is_a_g159?(row)
|
132
|
+
row['Designator'] =~ /^G159$/
|
133
|
+
end
|
134
|
+
def is_a_galx?(row)
|
135
|
+
row['Designator'] =~ /^GALX$/
|
136
|
+
end
|
137
|
+
def method_missing(method_id, *args, &block)
|
138
|
+
if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
|
139
|
+
manufacturer_name = $1
|
140
|
+
manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
|
141
|
+
matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
|
142
|
+
method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
|
143
|
+
else
|
144
|
+
super
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
data_miner do
|
150
|
+
# In our app, we defined DataMiner::Run.allowed? to return false if a run
|
151
|
+
# has taken place in the last hour (among other things).
|
152
|
+
# By raising DataMiner::Skip, we skip this run but call it a success.
|
153
|
+
process "Don't re-import too often" do
|
154
|
+
raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft
|
155
|
+
end
|
156
|
+
|
157
|
+
# Define the database schema in-line.
|
158
|
+
# It will destructively and automatically add/remove columns.
|
159
|
+
# This is "OK" because you can always just re-run the import script to get the data back.
|
160
|
+
# PS. if we were using DataMapper, we wouldn't need this.
|
161
|
+
schema :options => 'ENGINE=InnoDB default charset=utf8' do
|
162
|
+
string 'icao_code'
|
163
|
+
string 'manufacturer_name'
|
164
|
+
string 'name'
|
165
|
+
string 'bts_name'
|
166
|
+
string 'bts_aircraft_type_code'
|
167
|
+
string 'brighter_planet_aircraft_class_code'
|
168
|
+
string 'fuel_use_aircraft_name'
|
169
|
+
float 'm3'
|
170
|
+
string 'm3_units'
|
171
|
+
float 'm2'
|
172
|
+
string 'm2_units'
|
173
|
+
float 'm1'
|
174
|
+
string 'm1_units'
|
175
|
+
float 'endpoint_fuel'
|
176
|
+
string 'endpoint_fuel_units'
|
177
|
+
float 'seats'
|
178
|
+
float 'distance'
|
179
|
+
string 'distance_units'
|
180
|
+
float 'load_factor'
|
181
|
+
float 'freight_share'
|
182
|
+
float 'payload'
|
183
|
+
float 'weighting'
|
184
|
+
index 'bts_aircraft_type_code'
|
185
|
+
end
|
186
|
+
|
187
|
+
# The FAA publishes a document to help people identify aircraft by different names.
|
188
|
+
('A'..'Z').each do |letter|
|
189
|
+
import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA",
|
190
|
+
# The master URL of the source file (one for every letter)
|
191
|
+
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
|
192
|
+
# The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details.
|
193
|
+
:errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new),
|
194
|
+
# If it's not UTF-8, you should say what it is so that we can iconv it!
|
195
|
+
:encoding => 'windows-1252',
|
196
|
+
# Nokogiri is being used to grab each row starting from the second
|
197
|
+
:row_xpath => '//table/tr[2]/td/table/tr',
|
198
|
+
# ditto... XPath for Nokogiri
|
199
|
+
:column_xpath => 'td' ) do
|
200
|
+
# The code that they use is in fact the ICAO code!
|
201
|
+
key 'icao_code', :field_name => 'Designator'
|
202
|
+
# We get this for free
|
203
|
+
store 'manufacturer_name', :field_name => 'Manufacturer'
|
204
|
+
# ditto
|
205
|
+
store 'name', :field_name => 'Model'
|
206
|
+
# Use the loose-tight dictionary.
|
207
|
+
# It gets the entire input row to play with before deciding on an output.
|
208
|
+
store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code)
|
209
|
+
store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
213
|
+
# Pull in some data that might only be important to Brighter Planet
|
214
|
+
import "Brighter Planet's aircraft class codes",
|
215
|
+
:url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
|
216
|
+
key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
|
217
|
+
store 'brighter_planet_aircraft_class_code'
|
218
|
+
end
|
219
|
+
|
220
|
+
# Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel).
|
221
|
+
# This data comes from the EEA.
|
222
|
+
import "pre-calculated fuel use equation coefficients",
|
223
|
+
:url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods',
|
224
|
+
:select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do
|
225
|
+
# We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above.
|
226
|
+
key 'icao_code', :matcher => Aircraft::FuelUseMatcher.new
|
227
|
+
# Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft.
|
228
|
+
store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name'
|
229
|
+
store 'm3'
|
230
|
+
store 'm2'
|
231
|
+
store 'm1'
|
232
|
+
store 'endpoint_fuel', :field_name => 'b'
|
233
|
+
end
|
234
|
+
|
235
|
+
# Use arel and the weighted_average gem to do some crazy averaging.
|
236
|
+
# This assumes that you're dealing with the BTS T-100 flight segment data.
|
237
|
+
# See http://data.brighterplanet.com/flight_segments for a pre-sanitized version.
|
238
|
+
process "Derive some average flight characteristics from flight segments" do
|
239
|
+
FlightSegment.run_data_miner!
|
240
|
+
aircraft = Aircraft.arel_table
|
241
|
+
segments = FlightSegment.arel_table
|
242
|
+
|
243
|
+
conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])
|
244
|
+
update_all "seats = (#{FlightSegment.weighted_average_relation(:seats, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
245
|
+
update_all "distance = (#{FlightSegment.weighted_average_relation(:distance, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
246
|
+
update_all "load_factor = (#{FlightSegment.weighted_average_relation(:load_factor, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
247
|
+
update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
|
248
|
+
update_all "payload = (#{FlightSegment.weighted_average_relation(:payload, :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})"
|
249
|
+
|
250
|
+
update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})"
|
251
|
+
end
|
252
|
+
|
253
|
+
# And finally re-run the import of resources that depend on this resource.
|
254
|
+
# Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block;
|
255
|
+
# that's the right way to do dependencies. It won't get called twice in the same run.
|
256
|
+
[ AircraftManufacturer ].each do |synthetic_resource|
|
257
|
+
process "Synthesize #{synthetic_resource}" do
|
258
|
+
synthetic_resource.run_data_miner!
|
259
|
+
end
|
260
|
+
end
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
==Authors
|
265
|
+
|
266
|
+
* Seamus Abshere <seamus@abshere.net>
|
267
|
+
* Andy Rossmeissl <andy@rossmeissl.net>
|
268
|
+
|
269
|
+
==Copyright
|
270
|
+
|
271
|
+
Copyright (c) 2010 Brighter Planet. See LICENSE for details.
|
data/lib/data_miner.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/version'
|
3
|
+
%w{
|
4
|
+
active_support/core_ext/array/conversions
|
5
|
+
active_support/core_ext/string/access
|
6
|
+
active_support/core_ext/string/multibyte
|
7
|
+
}.each do |active_support_3_requirement|
|
8
|
+
require active_support_3_requirement
|
9
|
+
end if ActiveSupport::VERSION::MAJOR == 3
|
10
|
+
|
11
|
+
require 'active_record'
|
12
|
+
require 'blockenspiel'
|
13
|
+
require 'conversions'
|
14
|
+
require 'errata'
|
15
|
+
require 'remote_table'
|
16
|
+
require 'escape'
|
17
|
+
require 'andand'
|
18
|
+
require 'log4r'
|
19
|
+
require 'fileutils'
|
20
|
+
require 'tmpdir'
|
21
|
+
require 'zlib'
|
22
|
+
|
23
|
+
require 'data_miner/attribute'
|
24
|
+
require 'data_miner/base'
|
25
|
+
require 'data_miner/dictionary'
|
26
|
+
require 'data_miner/import'
|
27
|
+
require 'data_miner/tap'
|
28
|
+
require 'data_miner/process'
|
29
|
+
require 'data_miner/run'
|
30
|
+
require 'data_miner/schema'
|
31
|
+
|
32
|
+
module DataMiner
|
33
|
+
class MissingHashColumn < StandardError; end
|
34
|
+
class Finish < StandardError; end
|
35
|
+
class Skip < StandardError; end
|
36
|
+
|
37
|
+
mattr_accessor :logger
|
38
|
+
|
39
|
+
def self.start_logging
|
40
|
+
return if logger
|
41
|
+
|
42
|
+
if defined? Rails
|
43
|
+
self.logger = Rails.logger
|
44
|
+
else
|
45
|
+
class_eval { include Log4r }
|
46
|
+
info_outputter = FileOutputter.new 'f1', :filename => 'data_miner.log'
|
47
|
+
error_outputter = Outputter.stderr
|
48
|
+
info_outputter.only_at DEBUG, INFO
|
49
|
+
error_outputter.only_at WARN, ERROR, FATAL
|
50
|
+
|
51
|
+
self.logger = Logger.new 'data_miner'
|
52
|
+
logger.add info_outputter, error_outputter
|
53
|
+
ActiveRecord::Base.logger = logger
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.log_or_raise(message)
|
58
|
+
message = "[data_miner gem] #{message}"
|
59
|
+
if ENV['RAILS_ENV'] == 'production' or ENV['DONT_RAISE'] == 'true'
|
60
|
+
logger.error message
|
61
|
+
else
|
62
|
+
raise message
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def self.log_info(message)
|
67
|
+
logger.info "[data_miner gem] #{message}"
|
68
|
+
end
|
69
|
+
|
70
|
+
def self.log_debug(message)
|
71
|
+
logger.debug "[data_miner gem] #{message}"
|
72
|
+
end
|
73
|
+
|
74
|
+
def self.run(options = {})
|
75
|
+
DataMiner::Base.run options.merge(:preserve_call_stack_between_runs => true)
|
76
|
+
DataMiner::Base.call_stack.clear
|
77
|
+
end
|
78
|
+
|
79
|
+
def self.resource_names
|
80
|
+
DataMiner::Base.resource_names
|
81
|
+
end
|
82
|
+
|
83
|
+
# TODO this should probably live somewhere else
|
84
|
+
# Runs +cmd+ in a subshell and returns what it wrote to standard output.
#
# Newlines inside +cmd+ (together with any spaces around them) are collapsed
# into a single space first, so a command may be written across several lines.
#
# cmd - String shell command. It is interpolated into backticks verbatim, so
#       the caller is responsible for escaping any arguments.
#
# Returns the command's standard output as a String.
# Raises RuntimeError, quoting the command and its output, when the command
# does not exit successfully.
def self.backtick_with_reporting(cmd)
  cmd = cmd.gsub(/[ ]*\n[ ]*/m, ' ')
  output = `#{cmd}`
  unless $?.success?
    raise %{
From the data_miner gem...

Command failed:
#{cmd}

Output:
#{output}
}
  end
  # Hand the captured output back instead of discarding it on success.
  output
end
|
99
|
+
|
100
|
+
end
|
101
|
+
|
102
|
+
ActiveRecord::Base.class_eval do
|
103
|
+
def self.x_data_miner(&block)
|
104
|
+
DataMiner.start_logging
|
105
|
+
|
106
|
+
DataMiner.log_debug "Skipping data_miner block in #{self.name} because called as x_data_miner"
|
107
|
+
end
|
108
|
+
|
109
|
+
def self.data_miner(&block)
|
110
|
+
DataMiner.start_logging
|
111
|
+
|
112
|
+
DataMiner.log_debug "Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema." unless table_exists?
|
113
|
+
|
114
|
+
DataMiner.resource_names.push self.name unless DataMiner.resource_names.include? self.name
|
115
|
+
|
116
|
+
# this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
|
117
|
+
class_eval do
|
118
|
+
cattr_accessor :data_miner_base
|
119
|
+
def self.data_miner_runs
|
120
|
+
DataMiner::Run.scoped :conditions => { :resource_name => name }
|
121
|
+
end
|
122
|
+
def self.run_data_miner!(options = {})
|
123
|
+
data_miner_base.run options
|
124
|
+
end
|
125
|
+
def self.execute_schema
|
126
|
+
schema = data_miner_base.steps.find { |s| s.instance_of?(DataMiner::Schema) }
|
127
|
+
schema.run(nil) if schema
|
128
|
+
end
|
129
|
+
end
|
130
|
+
self.data_miner_base = DataMiner::Base.new self
|
131
|
+
|
132
|
+
Blockenspiel.invoke block, data_miner_base
|
133
|
+
|
134
|
+
data_miner_base.after_invoke
|
135
|
+
end
|
136
|
+
end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
module DataMiner
|
2
|
+
class Attribute
|
3
|
+
attr_accessor :step
|
4
|
+
attr_accessor :name
|
5
|
+
attr_accessor :options
|
6
|
+
|
7
|
+
delegate :resource, :to => :step
|
8
|
+
|
9
|
+
VALID_OPTIONS = [
|
10
|
+
:from_units,
|
11
|
+
:to_units,
|
12
|
+
:static,
|
13
|
+
:dictionary,
|
14
|
+
:matcher,
|
15
|
+
:field_name,
|
16
|
+
:delimiter,
|
17
|
+
:split,
|
18
|
+
:units,
|
19
|
+
:sprintf,
|
20
|
+
:nullify,
|
21
|
+
:overwrite,
|
22
|
+
:upcase,
|
23
|
+
:units_field_name,
|
24
|
+
:units_field_number,
|
25
|
+
:field_number,
|
26
|
+
:chars,
|
27
|
+
:synthesize
|
28
|
+
]
|
29
|
+
|
30
|
+
def initialize(step, name, options = {})
|
31
|
+
options.symbolize_keys!
|
32
|
+
|
33
|
+
@step = step
|
34
|
+
@name = name
|
35
|
+
|
36
|
+
invalid_option_keys = options.keys.select { |k| not VALID_OPTIONS.include? k }
|
37
|
+
DataMiner.log_or_raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
|
38
|
+
@options = options
|
39
|
+
end
|
40
|
+
|
41
|
+
def inspect
|
42
|
+
"Attribute(#{resource}##{name})"
|
43
|
+
end
|
44
|
+
|
45
|
+
def value_in_dictionary(str)
|
46
|
+
dictionary.lookup str
|
47
|
+
end
|
48
|
+
|
49
|
+
# Pulls this attribute's raw value out of a source row and normalizes it.
#
# The value is taken from the :static option when one was given; otherwise
# from the column(s) named by :field_number (a Range of columns is joined
# with the delimiter); otherwise from the column named by field_name.
#
# row - the source row; it is only ever indexed with [] here.
#
# Returns nil when the source has no value, returns ActiveRecord objects
# untouched, and otherwise returns the cleaned-up value: runs of spaces
# squeezed, unescaped "~" turned into a space, surrounding whitespace
# stripped, then the optional upcase / unit conversion / sprintf steps.
#
# All cleanup is done on copies, so the string held by +row+ (or by the
# :static option) is never modified and may safely be frozen.
def value_in_source(row)
  value = if wants_static?
    static
  elsif field_number
    if field_number.is_a?(Range)
      field_number.map { |n| row[n] }.join(delimiter)
    else
      row[field_number]
    end
  else
    row[field_name]
  end
  return nil if value.nil?
  return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
  value = value.to_s
  value = value[chars] if wants_chars?
  value = do_split(value) if wants_split?
  # taken from old errata... maybe we want to do this here
  # Non-bang forms on purpose: String#to_s returns the receiver itself, so the
  # bang variants would rewrite the caller's data in place.
  value = value.gsub(/[ ]+/, ' ')
  # text.gsub!('- ', '-')
  value = value.gsub(/([^\\])~/, '\1 ')
  value = value.strip
  value = value.upcase if wants_upcase?
  value = do_convert row, value if wants_conversion?
  value = do_sprintf value if wants_sprintf?
  value
end
|
76
|
+
|
77
|
+
def match_row(row)
|
78
|
+
matcher.match row
|
79
|
+
end
|
80
|
+
|
81
|
+
def value_from_row(row)
|
82
|
+
return match_row row if wants_matcher?
|
83
|
+
value = value_in_source row
|
84
|
+
return value if value.is_a? ActiveRecord::Base # carry through trapdoor
|
85
|
+
value = value_in_dictionary value if wants_dictionary?
|
86
|
+
value = synthesize.call(row) if wants_synthesize?
|
87
|
+
value
|
88
|
+
end
|
89
|
+
|
90
|
+
# Copies this attribute's value from +row+ onto +record+, and also sets the
# "<name>_units" attribute when units are wanted.
#
# This will overwrite nils, even if wants_overwriting? is false; a non-nil
# existing value is only replaced when overwriting is wanted.
#
# Returns true if the attribute reads back differently than before, false if
# it was skipped or reads back unchanged, and nil if a non-nil value was
# assigned but the attribute still reads back as nil (this case is logged).
def set_record_from_row(record, row)
  # Read the current value once; it serves both the guard and the comparison.
  what_it_was = record.send name
  return false if !wants_overwriting? && !what_it_was.nil?
  what_it_should_be = value_from_row row

  record.send "#{name}=", what_it_should_be
  record.send "#{name}_units=", (to_units || unit_from_source(row)).to_s if wants_units?

  what_it_is = record.send name
  if what_it_is.nil? && !what_it_should_be.nil?
    DataMiner.log_debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
    nil
  elsif what_it_is == what_it_was
    false
  else
    true
  end
end
|
110
|
+
|
111
|
+
def unit_from_source(row)
|
112
|
+
row[units_field_name || units_field_number].to_s.strip.underscore.to_sym
|
113
|
+
end
|
114
|
+
|
115
|
+
def do_convert(row, value)
|
116
|
+
DataMiner.log_or_raise "If you use :from_units, you need to set :to_units (#{inspect})" unless wants_units?
|
117
|
+
value.to_f.convert((from_units || unit_from_source(row)), (to_units || unit_from_source(row)))
|
118
|
+
end
|
119
|
+
|
120
|
+
def do_sprintf(value)
|
121
|
+
if /\%[0-9\.]*f/.match sprintf
|
122
|
+
value = value.to_f
|
123
|
+
elsif /\%[0-9\.]*d/.match sprintf
|
124
|
+
value = value.to_i
|
125
|
+
end
|
126
|
+
sprintf % value
|
127
|
+
end
|
128
|
+
|
129
|
+
def do_split(value)
|
130
|
+
pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
|
131
|
+
keep = split_options[:keep] || 0 # default is keep first element
|
132
|
+
value.to_s.split(pattern)[keep].to_s
|
133
|
+
end
|
134
|
+
|
135
|
+
def column_type
|
136
|
+
resource.columns_hash[name.to_s].type
|
137
|
+
end
|
138
|
+
|
139
|
+
# Our wants and needs :)
|
140
|
+
def wants_split?
|
141
|
+
split_options.present?
|
142
|
+
end
|
143
|
+
def wants_sprintf?
|
144
|
+
sprintf.present?
|
145
|
+
end
|
146
|
+
def wants_upcase?
|
147
|
+
upcase.present?
|
148
|
+
end
|
149
|
+
def wants_static?
|
150
|
+
options.has_key? :static
|
151
|
+
end
|
152
|
+
def wants_nullification?
|
153
|
+
nullify != false
|
154
|
+
end
|
155
|
+
def wants_chars?
|
156
|
+
chars.present?
|
157
|
+
end
|
158
|
+
def wants_synthesize?
|
159
|
+
synthesize.is_a?(Proc)
|
160
|
+
end
|
161
|
+
def wants_overwriting?
|
162
|
+
overwrite != false
|
163
|
+
end
|
164
|
+
def wants_conversion?
|
165
|
+
from_units.present? or units_field_name.present? or units_field_number.present?
|
166
|
+
end
|
167
|
+
def wants_units?
|
168
|
+
to_units.present? or units_field_name.present? or units_field_number.present?
|
169
|
+
end
|
170
|
+
def wants_dictionary?
|
171
|
+
options[:dictionary].present?
|
172
|
+
end
|
173
|
+
def wants_matcher?
|
174
|
+
options[:matcher].present?
|
175
|
+
end
|
176
|
+
|
177
|
+
# Options that always have values
|
178
|
+
def field_name
|
179
|
+
(options[:field_name] || name).to_s
|
180
|
+
end
|
181
|
+
def delimiter
|
182
|
+
(options[:delimiter] || ', ')
|
183
|
+
end
|
184
|
+
|
185
|
+
# Options that can't be referred to by their names
|
186
|
+
def split_options
|
187
|
+
options[:split]
|
188
|
+
end
|
189
|
+
|
190
|
+
def from_units
|
191
|
+
options[:from_units]
|
192
|
+
end
|
193
|
+
def to_units
|
194
|
+
options[:to_units] || options[:units]
|
195
|
+
end
|
196
|
+
def sprintf
|
197
|
+
options[:sprintf]
|
198
|
+
end
|
199
|
+
def nullify
|
200
|
+
options[:nullify]
|
201
|
+
end
|
202
|
+
def overwrite
|
203
|
+
options[:overwrite]
|
204
|
+
end
|
205
|
+
def upcase
|
206
|
+
options[:upcase]
|
207
|
+
end
|
208
|
+
def units_field_name
|
209
|
+
options[:units_field_name]
|
210
|
+
end
|
211
|
+
def units_field_number
|
212
|
+
options[:units_field_number]
|
213
|
+
end
|
214
|
+
def field_number
|
215
|
+
options[:field_number]
|
216
|
+
end
|
217
|
+
def chars
|
218
|
+
options[:chars]
|
219
|
+
end
|
220
|
+
def synthesize
|
221
|
+
options[:synthesize]
|
222
|
+
end
|
223
|
+
def static
|
224
|
+
options[:static]
|
225
|
+
end
|
226
|
+
def dictionary
|
227
|
+
@_dictionary ||= (options[:dictionary].is_a?(Dictionary) ? options[:dictionary] : Dictionary.new(options[:dictionary]))
|
228
|
+
end
|
229
|
+
def matcher
|
230
|
+
@_matcher ||= (options[:matcher].is_a?(String) ? options[:matcher].constantize.new : options[:matcher])
|
231
|
+
end
|
232
|
+
end
|
233
|
+
end
|