data_miner 0.5.7 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +8 -0
- data/CHANGELOG +7 -0
- data/Gemfile +4 -0
- data/LICENSE +1 -1
- data/Rakefile +23 -0
- data/data_miner.gemspec +35 -0
- data/lib/data_miner.rb +55 -96
- data/lib/data_miner/active_record_extensions.rb +38 -0
- data/lib/data_miner/attribute.rb +63 -58
- data/lib/data_miner/config.rb +184 -0
- data/lib/data_miner/dictionary.rb +25 -12
- data/lib/data_miner/import.rb +59 -50
- data/lib/data_miner/process.rb +24 -19
- data/lib/data_miner/run.rb +3 -3
- data/lib/data_miner/schema.rb +50 -53
- data/lib/data_miner/tap.rb +24 -24
- data/lib/data_miner/verify.rb +17 -24
- data/lib/data_miner/version.rb +3 -0
- data/test/{test_helper.rb → helper.rb} +20 -3
- data/test/{data_miner/attribute_test.rb → test_attribute.rb} +2 -2
- data/test/{data_miner_test.rb → test_old_syntax.rb} +28 -32
- data/test/{data_miner/verify_test.rb → test_verify.rb} +4 -4
- metadata +80 -101
- data/lib/data_miner/base.rb +0 -204
data/.document
ADDED
data/.gitignore
ADDED
data/CHANGELOG
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
0.2.6
|
2
|
+
* Upgrade to remote_table 0.1.6 to handle UTF-8 CSVs and long urls.
|
3
|
+
0.3.0
|
4
|
+
* Removed association code... now data_miner focuses on just importing.
|
5
|
+
* New, simpler DSL
|
6
|
+
* Upgrade to remote_table 0.2.1 for row_hashes and better blank row handling
|
7
|
+
* Remove all association-related code
|
data/Gemfile
ADDED
data/LICENSE
CHANGED
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
3
|
+
|
4
|
+
require 'rake'
|
5
|
+
require 'rake/testtask'
|
6
|
+
Rake::TestTask.new(:test) do |test|
|
7
|
+
test.libs << 'lib' << 'test'
|
8
|
+
test.pattern = 'test/**/test_*.rb'
|
9
|
+
test.verbose = true
|
10
|
+
end
|
11
|
+
|
12
|
+
begin
|
13
|
+
require 'rake/rdoctask'
|
14
|
+
Rake::RDocTask.new do |rdoc|
|
15
|
+
rdoc.rdoc_dir = 'rdoc'
|
16
|
+
rdoc.title = 'data_miner'
|
17
|
+
rdoc.options << '--line-numbers' << '--inline-source'
|
18
|
+
rdoc.rdoc_files.include('README*')
|
19
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
20
|
+
end
|
21
|
+
rescue LoadError
|
22
|
+
puts "Rdoc is not available"
|
23
|
+
end
|
data/data_miner.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "data_miner/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "data_miner"
|
7
|
+
s.version = DataMiner::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Seamus Abshere", "Andy Rossmeissl", "Derek Kastner"]
|
10
|
+
s.email = ["seamus@abshere.net"]
|
11
|
+
s.homepage = "https://github.com/seamusabshere/data_miner"
|
12
|
+
s.summary = %{Mine remote data into your ActiveRecord models.}
|
13
|
+
s.description = %q{Mine remote data into your ActiveRecord models. You can also convert units.}
|
14
|
+
|
15
|
+
s.rubyforge_project = "data_miner"
|
16
|
+
|
17
|
+
s.files = `git ls-files`.split("\n")
|
18
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
19
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
|
+
s.require_paths = ["lib"]
|
21
|
+
|
22
|
+
s.add_dependency 'remote_table', '>=1.0.2'
|
23
|
+
s.add_dependency 'escape', '>=0.0.4'
|
24
|
+
s.add_dependency 'activerecord', '>=2.3.4'
|
25
|
+
s.add_dependency 'activesupport', '>=2.3.4'
|
26
|
+
s.add_dependency 'conversions', '>=1.4.4'
|
27
|
+
s.add_dependency 'blockenspiel', '>=0.3.2'
|
28
|
+
s.add_dependency 'taps', '>=0.3.11'
|
29
|
+
s.add_development_dependency 'loose_tight_dictionary', ">=0.0.5"
|
30
|
+
s.add_development_dependency 'errata', '>=1.0.1'
|
31
|
+
s.add_development_dependency 'test-unit'
|
32
|
+
s.add_development_dependency 'shoulda'
|
33
|
+
s.add_development_dependency 'mysql'
|
34
|
+
s.add_development_dependency 'ruby-debug'
|
35
|
+
end
|
data/lib/data_miner.rb
CHANGED
@@ -6,82 +6,37 @@ require 'active_support/version'
|
|
6
6
|
active_support/core_ext/string/multibyte
|
7
7
|
}.each do |active_support_3_requirement|
|
8
8
|
require active_support_3_requirement
|
9
|
-
end if ActiveSupport::VERSION::MAJOR == 3
|
9
|
+
end if ::ActiveSupport::VERSION::MAJOR == 3
|
10
10
|
|
11
|
-
require '
|
12
|
-
require 'blockenspiel'
|
13
|
-
require 'conversions'
|
14
|
-
require 'errata'
|
15
|
-
require 'remote_table'
|
16
|
-
require 'escape'
|
17
|
-
require 'andand'
|
18
|
-
require 'log4r'
|
19
|
-
require 'fileutils'
|
20
|
-
require 'tmpdir'
|
21
|
-
require 'zlib'
|
22
|
-
|
23
|
-
require 'data_miner/attribute'
|
24
|
-
require 'data_miner/base'
|
25
|
-
require 'data_miner/dictionary'
|
26
|
-
require 'data_miner/import'
|
27
|
-
require 'data_miner/tap'
|
28
|
-
require 'data_miner/process'
|
29
|
-
require 'data_miner/run'
|
30
|
-
require 'data_miner/schema'
|
31
|
-
require 'data_miner/verify'
|
11
|
+
require 'singleton'
|
32
12
|
|
33
|
-
|
13
|
+
class DataMiner
|
14
|
+
include ::Singleton
|
15
|
+
|
34
16
|
class MissingHashColumn < StandardError; end
|
35
17
|
class Finish < StandardError; end
|
36
18
|
class Skip < StandardError; end
|
19
|
+
class VerificationFailed < StandardError; end
|
37
20
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
error_outputter = Outputter.stderr
|
49
|
-
info_outputter.only_at DEBUG, INFO
|
50
|
-
error_outputter.only_at WARN, ERROR, FATAL
|
51
|
-
|
52
|
-
self.logger = Logger.new 'data_miner'
|
53
|
-
logger.add info_outputter, error_outputter
|
54
|
-
ActiveRecord::Base.logger = logger
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def self.log_or_raise(message)
|
59
|
-
message = "[data_miner gem] #{message}"
|
60
|
-
if ENV['RAILS_ENV'] == 'production' or ENV['DONT_RAISE'] == 'true'
|
61
|
-
logger.error message
|
62
|
-
else
|
63
|
-
raise message
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
def self.log_info(message)
|
68
|
-
logger.info "[data_miner gem] #{message}"
|
69
|
-
end
|
70
|
-
|
71
|
-
def self.log_debug(message)
|
72
|
-
logger.debug "[data_miner gem] #{message}"
|
73
|
-
end
|
74
|
-
|
75
|
-
def self.run(options = {})
|
76
|
-
DataMiner::Base.run options.merge(:preserve_call_stack_between_runs => true)
|
77
|
-
DataMiner::Base.call_stack.clear
|
78
|
-
end
|
21
|
+
autoload :ActiveRecordExtensions, 'data_miner/active_record_extensions'
|
22
|
+
autoload :Attribute, 'data_miner/attribute'
|
23
|
+
autoload :Config, 'data_miner/config'
|
24
|
+
autoload :Dictionary, 'data_miner/dictionary'
|
25
|
+
autoload :Import, 'data_miner/import'
|
26
|
+
autoload :Tap, 'data_miner/tap'
|
27
|
+
autoload :Process, 'data_miner/process'
|
28
|
+
autoload :Run, 'data_miner/run'
|
29
|
+
autoload :Schema, 'data_miner/schema'
|
30
|
+
autoload :Verify, 'data_miner/verify'
|
79
31
|
|
80
|
-
|
81
|
-
|
32
|
+
class << self
|
33
|
+
delegate :logger, :to => :instance
|
34
|
+
delegate :logger=, :to => :instance
|
35
|
+
delegate :run, :to => :instance
|
36
|
+
delegate :resource_names, :to => :instance
|
82
37
|
end
|
83
38
|
|
84
|
-
|
39
|
+
# TODO this should probably live somewhere else
|
85
40
|
def self.backtick_with_reporting(cmd)
|
86
41
|
cmd = cmd.gsub /[ ]*\n[ ]*/m, ' '
|
87
42
|
output = `#{cmd}`
|
@@ -97,41 +52,45 @@ Output:
|
|
97
52
|
}
|
98
53
|
end
|
99
54
|
end
|
55
|
+
|
56
|
+
attr_accessor :logger
|
100
57
|
|
101
|
-
|
58
|
+
def resource_names
|
59
|
+
@resource_names ||= []
|
60
|
+
end
|
102
61
|
|
103
|
-
|
104
|
-
|
105
|
-
DataMiner.start_logging
|
106
|
-
|
107
|
-
DataMiner.log_debug "Skipping data_miner block in #{self.name} because called as x_data_miner"
|
62
|
+
def call_stack
|
63
|
+
@call_stack ||= []
|
108
64
|
end
|
109
65
|
|
110
|
-
def
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
# this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
|
118
|
-
class_eval do
|
119
|
-
cattr_accessor :data_miner_base
|
120
|
-
def self.data_miner_runs
|
121
|
-
DataMiner::Run.scoped :conditions => { :resource_name => name }
|
122
|
-
end
|
123
|
-
def self.run_data_miner!(options = {})
|
124
|
-
data_miner_base.run options
|
125
|
-
end
|
126
|
-
def self.execute_schema
|
127
|
-
schema = data_miner_base.steps.find { |s| s.instance_of?(DataMiner::Schema) }
|
128
|
-
schema.run(nil) if schema
|
66
|
+
def start_logging
|
67
|
+
if logger.nil?
|
68
|
+
if defined? ::Rails
|
69
|
+
@logger = ::Rails.logger
|
70
|
+
else
|
71
|
+
@logger = ::Logger.new $stdout
|
129
72
|
end
|
130
73
|
end
|
131
|
-
|
132
|
-
|
133
|
-
Blockenspiel.invoke block, data_miner_base
|
74
|
+
::ActiveRecord::Base.logger = logger
|
75
|
+
end
|
134
76
|
|
135
|
-
|
77
|
+
# Mine data. Defaults to all resource_names touched by DataMiner.
|
78
|
+
#
|
79
|
+
# Options
|
80
|
+
# * <tt>:resource_names</tt>: array of resource (class) names to mine
|
81
|
+
def run(options = {})
|
82
|
+
options = options.dup
|
83
|
+
options.stringify_keys!
|
84
|
+
options['preserve_call_stack_between_runs'] = true
|
85
|
+
resource_names.each do |resource_name|
|
86
|
+
if options['resource_names'].blank? or options['resource_names'].include?(resource_name)
|
87
|
+
resource_name.constantize.data_miner_config.run options
|
88
|
+
end
|
89
|
+
end
|
90
|
+
call_stack.clear
|
91
|
+
# RemoteTable.cleanup
|
136
92
|
end
|
137
93
|
end
|
94
|
+
|
95
|
+
require 'active_record'
|
96
|
+
::ActiveRecord::Base.extend ::DataMiner::ActiveRecordExtensions
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
require 'blockenspiel'
|
3
|
+
|
4
|
+
class DataMiner
|
5
|
+
module ActiveRecordExtensions
|
6
|
+
def data_miner(&blk)
|
7
|
+
::DataMiner.instance.start_logging
|
8
|
+
|
9
|
+
::DataMiner.logger.debug "Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema." unless table_exists?
|
10
|
+
|
11
|
+
::DataMiner.instance.resource_names.push self.name unless ::DataMiner.instance.resource_names.include? self.name
|
12
|
+
|
13
|
+
# this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
|
14
|
+
class_eval do
|
15
|
+
cattr_accessor :data_miner_config
|
16
|
+
def self.data_miner_runs
|
17
|
+
::DataMiner::Run.scoped :conditions => { :resource_name => name }
|
18
|
+
end
|
19
|
+
def self.run_data_miner!(options = {})
|
20
|
+
data_miner_config.run options
|
21
|
+
end
|
22
|
+
def self.execute_schema
|
23
|
+
if schema = data_miner_config.steps.detect { |s| s.instance_of?(::DataMiner::Schema) }
|
24
|
+
schema.run
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
self.data_miner_config = ::DataMiner::Config.new self
|
30
|
+
|
31
|
+
::Blockenspiel.invoke blk, data_miner_config
|
32
|
+
|
33
|
+
data_miner_config.after_invoke
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -1,45 +1,49 @@
|
|
1
|
-
|
1
|
+
require 'conversions'
|
2
|
+
|
3
|
+
class DataMiner
|
2
4
|
class Attribute
|
3
|
-
|
4
|
-
|
5
|
-
|
5
|
+
attr_reader :step
|
6
|
+
attr_reader :name
|
7
|
+
attr_reader :options
|
6
8
|
|
7
|
-
|
9
|
+
def resource
|
10
|
+
step.resource
|
11
|
+
end
|
8
12
|
|
9
|
-
VALID_OPTIONS =
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
13
|
+
VALID_OPTIONS = %w{
|
14
|
+
from_units
|
15
|
+
to_units
|
16
|
+
static
|
17
|
+
dictionary
|
18
|
+
matcher
|
19
|
+
field_name
|
20
|
+
delimiter
|
21
|
+
split
|
22
|
+
units
|
23
|
+
sprintf
|
24
|
+
nullify
|
25
|
+
overwrite
|
26
|
+
upcase
|
27
|
+
units_field_name
|
28
|
+
units_field_number
|
29
|
+
field_number
|
30
|
+
chars
|
31
|
+
synthesize
|
32
|
+
}
|
29
33
|
|
30
34
|
def initialize(step, name, options = {})
|
31
|
-
options.
|
35
|
+
@options = options.dup
|
36
|
+
@options.stringify_keys!
|
32
37
|
|
33
38
|
@step = step
|
34
39
|
@name = name
|
35
40
|
|
36
|
-
invalid_option_keys = options.keys.select { |k| not VALID_OPTIONS.include? k }
|
37
|
-
|
38
|
-
@options = options
|
41
|
+
invalid_option_keys = @options.keys.select { |k| not VALID_OPTIONS.include? k }
|
42
|
+
raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
|
39
43
|
end
|
40
44
|
|
41
45
|
def inspect
|
42
|
-
|
46
|
+
%{#<DataMiner::Attribute(#{resource}##{name})>}
|
43
47
|
end
|
44
48
|
|
45
49
|
def value_in_dictionary(str)
|
@@ -50,7 +54,7 @@ module DataMiner
|
|
50
54
|
if wants_static?
|
51
55
|
value = static
|
52
56
|
elsif field_number
|
53
|
-
if field_number.is_a?(Range)
|
57
|
+
if field_number.is_a?(::Range)
|
54
58
|
value = field_number.map { |n| row[n] }.join(delimiter)
|
55
59
|
else
|
56
60
|
value = row[field_number]
|
@@ -59,7 +63,7 @@ module DataMiner
|
|
59
63
|
value = row[field_name]
|
60
64
|
end
|
61
65
|
return nil if value.nil?
|
62
|
-
return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
|
66
|
+
return value if value.is_a?(::ActiveRecord::Base) # escape valve for parsers that look up associations directly
|
63
67
|
value = value.to_s
|
64
68
|
value = value[chars] if wants_chars?
|
65
69
|
value = do_split(value) if wants_split?
|
@@ -81,7 +85,7 @@ module DataMiner
|
|
81
85
|
def value_from_row(row)
|
82
86
|
return match_row row if wants_matcher?
|
83
87
|
value = value_in_source row
|
84
|
-
return value if value.is_a? ActiveRecord::Base # carry through trapdoor
|
88
|
+
return value if value.is_a? ::ActiveRecord::Base # carry through trapdoor
|
85
89
|
value = value_in_dictionary value if wants_dictionary?
|
86
90
|
value = synthesize.call(row) if wants_synthesize?
|
87
91
|
value = nil if value.blank? and wants_nullification?
|
@@ -100,7 +104,7 @@ module DataMiner
|
|
100
104
|
|
101
105
|
what_it_is = record.send name
|
102
106
|
if what_it_is.nil? and !what_it_should_be.nil?
|
103
|
-
DataMiner.
|
107
|
+
::DataMiner.logger.debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
|
104
108
|
nil
|
105
109
|
elsif what_it_is == what_it_was
|
106
110
|
false
|
@@ -114,7 +118,7 @@ module DataMiner
|
|
114
118
|
end
|
115
119
|
|
116
120
|
def do_convert(row, value)
|
117
|
-
|
121
|
+
raise "If you use 'from_units', you need to set 'to_units' (#{inspect})" unless wants_units?
|
118
122
|
value.to_f.convert((from_units || unit_from_source(row)), (to_units || unit_from_source(row)))
|
119
123
|
end
|
120
124
|
|
@@ -128,8 +132,8 @@ module DataMiner
|
|
128
132
|
end
|
129
133
|
|
130
134
|
def do_split(value)
|
131
|
-
pattern = split_options[
|
132
|
-
keep = split_options[
|
135
|
+
pattern = split_options['pattern'] || /\s+/ # default is split on whitespace
|
136
|
+
keep = split_options['keep'] || 0 # default is keep first element
|
133
137
|
value.to_s.split(pattern)[keep].to_s
|
134
138
|
end
|
135
139
|
|
@@ -148,7 +152,7 @@ module DataMiner
|
|
148
152
|
upcase.present?
|
149
153
|
end
|
150
154
|
def wants_static?
|
151
|
-
options.has_key?
|
155
|
+
options.has_key? 'static'
|
152
156
|
end
|
153
157
|
def wants_nullification?
|
154
158
|
nullify == true
|
@@ -157,7 +161,7 @@ module DataMiner
|
|
157
161
|
chars.present?
|
158
162
|
end
|
159
163
|
def wants_synthesize?
|
160
|
-
synthesize.is_a?(Proc)
|
164
|
+
synthesize.is_a?(::Proc)
|
161
165
|
end
|
162
166
|
def wants_overwriting?
|
163
167
|
overwrite != false
|
@@ -169,66 +173,67 @@ module DataMiner
|
|
169
173
|
to_units.present? or units_field_name.present? or units_field_number.present?
|
170
174
|
end
|
171
175
|
def wants_dictionary?
|
172
|
-
options[
|
176
|
+
options['dictionary'].present?
|
173
177
|
end
|
174
178
|
def wants_matcher?
|
175
|
-
options[
|
179
|
+
options['matcher'].present?
|
176
180
|
end
|
177
181
|
|
178
182
|
# Options that always have values
|
179
183
|
def field_name
|
180
|
-
(options[
|
184
|
+
(options['field_name'] || name).to_s
|
181
185
|
end
|
182
186
|
def delimiter
|
183
|
-
(options[
|
187
|
+
(options['delimiter'] || ', ')
|
184
188
|
end
|
185
189
|
|
186
190
|
# Options that can't be referred to by their names
|
187
191
|
def split_options
|
188
|
-
options[
|
192
|
+
options['split']
|
189
193
|
end
|
190
194
|
|
191
195
|
def from_units
|
192
|
-
options[
|
196
|
+
options['from_units']
|
193
197
|
end
|
194
198
|
def to_units
|
195
|
-
options[
|
199
|
+
options['to_units'] || options['units']
|
196
200
|
end
|
197
201
|
def sprintf
|
198
|
-
options[
|
202
|
+
options['sprintf']
|
199
203
|
end
|
200
204
|
def nullify
|
201
|
-
options[
|
205
|
+
options['nullify']
|
202
206
|
end
|
203
207
|
def overwrite
|
204
|
-
options[
|
208
|
+
options['overwrite']
|
205
209
|
end
|
206
210
|
def upcase
|
207
|
-
options[
|
211
|
+
options['upcase']
|
208
212
|
end
|
209
213
|
def units_field_name
|
210
|
-
options[
|
214
|
+
options['units_field_name']
|
211
215
|
end
|
212
216
|
def units_field_number
|
213
|
-
options[
|
217
|
+
options['units_field_number']
|
214
218
|
end
|
215
219
|
def field_number
|
216
|
-
options[
|
220
|
+
options['field_number']
|
217
221
|
end
|
218
222
|
def chars
|
219
|
-
options[
|
223
|
+
options['chars']
|
220
224
|
end
|
221
225
|
def synthesize
|
222
|
-
options[
|
226
|
+
options['synthesize']
|
223
227
|
end
|
224
228
|
def static
|
225
|
-
options[
|
229
|
+
options['static']
|
226
230
|
end
|
231
|
+
# must be cleared before every run! (because it relies on remote data)
|
227
232
|
def dictionary
|
228
|
-
@
|
233
|
+
@dictionary ||= (options['dictionary'].is_a?(Dictionary) ? options['dictionary'] : Dictionary.new(options['dictionary']))
|
229
234
|
end
|
230
235
|
def matcher
|
231
|
-
@
|
236
|
+
@matcher ||= (options['matcher'].is_a?(::String) ? options['matcher'].constantize.new : options['matcher'])
|
232
237
|
end
|
233
238
|
end
|
234
239
|
end
|