data_miner 0.5.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +8 -0
- data/CHANGELOG +7 -0
- data/Gemfile +4 -0
- data/LICENSE +1 -1
- data/Rakefile +23 -0
- data/data_miner.gemspec +35 -0
- data/lib/data_miner.rb +55 -96
- data/lib/data_miner/active_record_extensions.rb +38 -0
- data/lib/data_miner/attribute.rb +63 -58
- data/lib/data_miner/config.rb +184 -0
- data/lib/data_miner/dictionary.rb +25 -12
- data/lib/data_miner/import.rb +59 -50
- data/lib/data_miner/process.rb +24 -19
- data/lib/data_miner/run.rb +3 -3
- data/lib/data_miner/schema.rb +50 -53
- data/lib/data_miner/tap.rb +24 -24
- data/lib/data_miner/verify.rb +17 -24
- data/lib/data_miner/version.rb +3 -0
- data/test/{test_helper.rb → helper.rb} +20 -3
- data/test/{data_miner/attribute_test.rb → test_attribute.rb} +2 -2
- data/test/{data_miner_test.rb → test_old_syntax.rb} +28 -32
- data/test/{data_miner/verify_test.rb → test_verify.rb} +4 -4
- metadata +80 -101
- data/lib/data_miner/base.rb +0 -204
data/.document
ADDED
data/.gitignore
ADDED
data/CHANGELOG
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
0.2.6
|
2
|
+
* Upgrade to remote_table 0.1.6 to handle UTF-8 CSVs and long urls.
|
3
|
+
0.3.0
|
4
|
+
* Removed association code... now data_miner focuses on just importing.
|
5
|
+
* New, simpler DSL
|
6
|
+
* Upgrade to remote_table 0.2.1 for row_hashes and better blank row handling
|
7
|
+
* Remove all association-related code
|
data/Gemfile
ADDED
data/LICENSE
CHANGED
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
3
|
+
|
4
|
+
require 'rake'
|
5
|
+
require 'rake/testtask'
|
6
|
+
Rake::TestTask.new(:test) do |test|
|
7
|
+
test.libs << 'lib' << 'test'
|
8
|
+
test.pattern = 'test/**/test_*.rb'
|
9
|
+
test.verbose = true
|
10
|
+
end
|
11
|
+
|
12
|
+
begin
|
13
|
+
require 'rake/rdoctask'
|
14
|
+
Rake::RDocTask.new do |rdoc|
|
15
|
+
rdoc.rdoc_dir = 'rdoc'
|
16
|
+
rdoc.title = 'data_miner'
|
17
|
+
rdoc.options << '--line-numbers' << '--inline-source'
|
18
|
+
rdoc.rdoc_files.include('README*')
|
19
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
20
|
+
end
|
21
|
+
rescue LoadError
|
22
|
+
puts "Rdoc is not available"
|
23
|
+
end
|
data/data_miner.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "data_miner/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "data_miner"
|
7
|
+
s.version = DataMiner::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Seamus Abshere", "Andy Rossmeissl", "Derek Kastner"]
|
10
|
+
s.email = ["seamus@abshere.net"]
|
11
|
+
s.homepage = "https://github.com/seamusabshere/data_miner"
|
12
|
+
s.summary = %{Mine remote data into your ActiveRecord models.}
|
13
|
+
s.description = %q{Mine remote data into your ActiveRecord models. You can also convert units.}
|
14
|
+
|
15
|
+
s.rubyforge_project = "data_miner"
|
16
|
+
|
17
|
+
s.files = `git ls-files`.split("\n")
|
18
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
19
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
|
+
s.require_paths = ["lib"]
|
21
|
+
|
22
|
+
s.add_dependency 'remote_table', '>=1.0.2'
|
23
|
+
s.add_dependency 'escape', '>=0.0.4'
|
24
|
+
s.add_dependency 'activerecord', '>=2.3.4'
|
25
|
+
s.add_dependency 'activesupport', '>=2.3.4'
|
26
|
+
s.add_dependency 'conversions', '>=1.4.4'
|
27
|
+
s.add_dependency 'blockenspiel', '>=0.3.2'
|
28
|
+
s.add_dependency 'taps', '>=0.3.11'
|
29
|
+
s.add_development_dependency 'loose_tight_dictionary', ">=0.0.5"
|
30
|
+
s.add_development_dependency 'errata', '>=1.0.1'
|
31
|
+
s.add_development_dependency 'test-unit'
|
32
|
+
s.add_development_dependency 'shoulda'
|
33
|
+
s.add_development_dependency 'mysql'
|
34
|
+
s.add_development_dependency 'ruby-debug'
|
35
|
+
end
|
data/lib/data_miner.rb
CHANGED
@@ -6,82 +6,37 @@ require 'active_support/version'
|
|
6
6
|
active_support/core_ext/string/multibyte
|
7
7
|
}.each do |active_support_3_requirement|
|
8
8
|
require active_support_3_requirement
|
9
|
-
end if ActiveSupport::VERSION::MAJOR == 3
|
9
|
+
end if ::ActiveSupport::VERSION::MAJOR == 3
|
10
10
|
|
11
|
-
require '
|
12
|
-
require 'blockenspiel'
|
13
|
-
require 'conversions'
|
14
|
-
require 'errata'
|
15
|
-
require 'remote_table'
|
16
|
-
require 'escape'
|
17
|
-
require 'andand'
|
18
|
-
require 'log4r'
|
19
|
-
require 'fileutils'
|
20
|
-
require 'tmpdir'
|
21
|
-
require 'zlib'
|
22
|
-
|
23
|
-
require 'data_miner/attribute'
|
24
|
-
require 'data_miner/base'
|
25
|
-
require 'data_miner/dictionary'
|
26
|
-
require 'data_miner/import'
|
27
|
-
require 'data_miner/tap'
|
28
|
-
require 'data_miner/process'
|
29
|
-
require 'data_miner/run'
|
30
|
-
require 'data_miner/schema'
|
31
|
-
require 'data_miner/verify'
|
11
|
+
require 'singleton'
|
32
12
|
|
33
|
-
|
13
|
+
class DataMiner
|
14
|
+
include ::Singleton
|
15
|
+
|
34
16
|
class MissingHashColumn < StandardError; end
|
35
17
|
class Finish < StandardError; end
|
36
18
|
class Skip < StandardError; end
|
19
|
+
class VerificationFailed < StandardError; end
|
37
20
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
error_outputter = Outputter.stderr
|
49
|
-
info_outputter.only_at DEBUG, INFO
|
50
|
-
error_outputter.only_at WARN, ERROR, FATAL
|
51
|
-
|
52
|
-
self.logger = Logger.new 'data_miner'
|
53
|
-
logger.add info_outputter, error_outputter
|
54
|
-
ActiveRecord::Base.logger = logger
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def self.log_or_raise(message)
|
59
|
-
message = "[data_miner gem] #{message}"
|
60
|
-
if ENV['RAILS_ENV'] == 'production' or ENV['DONT_RAISE'] == 'true'
|
61
|
-
logger.error message
|
62
|
-
else
|
63
|
-
raise message
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
def self.log_info(message)
|
68
|
-
logger.info "[data_miner gem] #{message}"
|
69
|
-
end
|
70
|
-
|
71
|
-
def self.log_debug(message)
|
72
|
-
logger.debug "[data_miner gem] #{message}"
|
73
|
-
end
|
74
|
-
|
75
|
-
def self.run(options = {})
|
76
|
-
DataMiner::Base.run options.merge(:preserve_call_stack_between_runs => true)
|
77
|
-
DataMiner::Base.call_stack.clear
|
78
|
-
end
|
21
|
+
autoload :ActiveRecordExtensions, 'data_miner/active_record_extensions'
|
22
|
+
autoload :Attribute, 'data_miner/attribute'
|
23
|
+
autoload :Config, 'data_miner/config'
|
24
|
+
autoload :Dictionary, 'data_miner/dictionary'
|
25
|
+
autoload :Import, 'data_miner/import'
|
26
|
+
autoload :Tap, 'data_miner/tap'
|
27
|
+
autoload :Process, 'data_miner/process'
|
28
|
+
autoload :Run, 'data_miner/run'
|
29
|
+
autoload :Schema, 'data_miner/schema'
|
30
|
+
autoload :Verify, 'data_miner/verify'
|
79
31
|
|
80
|
-
|
81
|
-
|
32
|
+
class << self
|
33
|
+
delegate :logger, :to => :instance
|
34
|
+
delegate :logger=, :to => :instance
|
35
|
+
delegate :run, :to => :instance
|
36
|
+
delegate :resource_names, :to => :instance
|
82
37
|
end
|
83
38
|
|
84
|
-
|
39
|
+
# TODO this should probably live somewhere else
|
85
40
|
def self.backtick_with_reporting(cmd)
|
86
41
|
cmd = cmd.gsub /[ ]*\n[ ]*/m, ' '
|
87
42
|
output = `#{cmd}`
|
@@ -97,41 +52,45 @@ Output:
|
|
97
52
|
}
|
98
53
|
end
|
99
54
|
end
|
55
|
+
|
56
|
+
attr_accessor :logger
|
100
57
|
|
101
|
-
|
58
|
+
def resource_names
|
59
|
+
@resource_names ||= []
|
60
|
+
end
|
102
61
|
|
103
|
-
|
104
|
-
|
105
|
-
DataMiner.start_logging
|
106
|
-
|
107
|
-
DataMiner.log_debug "Skipping data_miner block in #{self.name} because called as x_data_miner"
|
62
|
+
def call_stack
|
63
|
+
@call_stack ||= []
|
108
64
|
end
|
109
65
|
|
110
|
-
def
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
# this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
|
118
|
-
class_eval do
|
119
|
-
cattr_accessor :data_miner_base
|
120
|
-
def self.data_miner_runs
|
121
|
-
DataMiner::Run.scoped :conditions => { :resource_name => name }
|
122
|
-
end
|
123
|
-
def self.run_data_miner!(options = {})
|
124
|
-
data_miner_base.run options
|
125
|
-
end
|
126
|
-
def self.execute_schema
|
127
|
-
schema = data_miner_base.steps.find { |s| s.instance_of?(DataMiner::Schema) }
|
128
|
-
schema.run(nil) if schema
|
66
|
+
def start_logging
|
67
|
+
if logger.nil?
|
68
|
+
if defined? ::Rails
|
69
|
+
@logger = ::Rails.logger
|
70
|
+
else
|
71
|
+
@logger = ::Logger.new $stdout
|
129
72
|
end
|
130
73
|
end
|
131
|
-
|
132
|
-
|
133
|
-
Blockenspiel.invoke block, data_miner_base
|
74
|
+
::ActiveRecord::Base.logger = logger
|
75
|
+
end
|
134
76
|
|
135
|
-
|
77
|
+
# Mine data. Defaults to all resource_names touched by DataMiner.
|
78
|
+
#
|
79
|
+
# Options
|
80
|
+
# * <tt>:resource_names</tt>: array of resource (class) names to mine
|
81
|
+
def run(options = {})
|
82
|
+
options = options.dup
|
83
|
+
options.stringify_keys!
|
84
|
+
options['preserve_call_stack_between_runs'] = true
|
85
|
+
resource_names.each do |resource_name|
|
86
|
+
if options['resource_names'].blank? or options['resource_names'].include?(resource_name)
|
87
|
+
resource_name.constantize.data_miner_config.run options
|
88
|
+
end
|
89
|
+
end
|
90
|
+
call_stack.clear
|
91
|
+
# RemoteTable.cleanup
|
136
92
|
end
|
137
93
|
end
|
94
|
+
|
95
|
+
require 'active_record'
|
96
|
+
::ActiveRecord::Base.extend ::DataMiner::ActiveRecordExtensions
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
require 'blockenspiel'
|
3
|
+
|
4
|
+
class DataMiner
|
5
|
+
module ActiveRecordExtensions
|
6
|
+
def data_miner(&blk)
|
7
|
+
::DataMiner.instance.start_logging
|
8
|
+
|
9
|
+
::DataMiner.logger.debug "Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema." unless table_exists?
|
10
|
+
|
11
|
+
::DataMiner.instance.resource_names.push self.name unless ::DataMiner.instance.resource_names.include? self.name
|
12
|
+
|
13
|
+
# this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
|
14
|
+
class_eval do
|
15
|
+
cattr_accessor :data_miner_config
|
16
|
+
def self.data_miner_runs
|
17
|
+
::DataMiner::Run.scoped :conditions => { :resource_name => name }
|
18
|
+
end
|
19
|
+
def self.run_data_miner!(options = {})
|
20
|
+
data_miner_config.run options
|
21
|
+
end
|
22
|
+
def self.execute_schema
|
23
|
+
if schema = data_miner_config.steps.detect { |s| s.instance_of?(::DataMiner::Schema) }
|
24
|
+
schema.run
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
self.data_miner_config = ::DataMiner::Config.new self
|
30
|
+
|
31
|
+
::Blockenspiel.invoke blk, data_miner_config
|
32
|
+
|
33
|
+
data_miner_config.after_invoke
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -1,45 +1,49 @@
|
|
1
|
-
|
1
|
+
require 'conversions'
|
2
|
+
|
3
|
+
class DataMiner
|
2
4
|
class Attribute
|
3
|
-
|
4
|
-
|
5
|
-
|
5
|
+
attr_reader :step
|
6
|
+
attr_reader :name
|
7
|
+
attr_reader :options
|
6
8
|
|
7
|
-
|
9
|
+
def resource
|
10
|
+
step.resource
|
11
|
+
end
|
8
12
|
|
9
|
-
VALID_OPTIONS =
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
13
|
+
VALID_OPTIONS = %w{
|
14
|
+
from_units
|
15
|
+
to_units
|
16
|
+
static
|
17
|
+
dictionary
|
18
|
+
matcher
|
19
|
+
field_name
|
20
|
+
delimiter
|
21
|
+
split
|
22
|
+
units
|
23
|
+
sprintf
|
24
|
+
nullify
|
25
|
+
overwrite
|
26
|
+
upcase
|
27
|
+
units_field_name
|
28
|
+
units_field_number
|
29
|
+
field_number
|
30
|
+
chars
|
31
|
+
synthesize
|
32
|
+
}
|
29
33
|
|
30
34
|
def initialize(step, name, options = {})
|
31
|
-
options.
|
35
|
+
@options = options.dup
|
36
|
+
@options.stringify_keys!
|
32
37
|
|
33
38
|
@step = step
|
34
39
|
@name = name
|
35
40
|
|
36
|
-
invalid_option_keys = options.keys.select { |k| not VALID_OPTIONS.include? k }
|
37
|
-
|
38
|
-
@options = options
|
41
|
+
invalid_option_keys = @options.keys.select { |k| not VALID_OPTIONS.include? k }
|
42
|
+
raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
|
39
43
|
end
|
40
44
|
|
41
45
|
def inspect
|
42
|
-
|
46
|
+
%{#<DataMiner::Attribute(#{resource}##{name})>}
|
43
47
|
end
|
44
48
|
|
45
49
|
def value_in_dictionary(str)
|
@@ -50,7 +54,7 @@ module DataMiner
|
|
50
54
|
if wants_static?
|
51
55
|
value = static
|
52
56
|
elsif field_number
|
53
|
-
if field_number.is_a?(Range)
|
57
|
+
if field_number.is_a?(::Range)
|
54
58
|
value = field_number.map { |n| row[n] }.join(delimiter)
|
55
59
|
else
|
56
60
|
value = row[field_number]
|
@@ -59,7 +63,7 @@ module DataMiner
|
|
59
63
|
value = row[field_name]
|
60
64
|
end
|
61
65
|
return nil if value.nil?
|
62
|
-
return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
|
66
|
+
return value if value.is_a?(::ActiveRecord::Base) # escape valve for parsers that look up associations directly
|
63
67
|
value = value.to_s
|
64
68
|
value = value[chars] if wants_chars?
|
65
69
|
value = do_split(value) if wants_split?
|
@@ -81,7 +85,7 @@ module DataMiner
|
|
81
85
|
def value_from_row(row)
|
82
86
|
return match_row row if wants_matcher?
|
83
87
|
value = value_in_source row
|
84
|
-
return value if value.is_a? ActiveRecord::Base # carry through trapdoor
|
88
|
+
return value if value.is_a? ::ActiveRecord::Base # carry through trapdoor
|
85
89
|
value = value_in_dictionary value if wants_dictionary?
|
86
90
|
value = synthesize.call(row) if wants_synthesize?
|
87
91
|
value = nil if value.blank? and wants_nullification?
|
@@ -100,7 +104,7 @@ module DataMiner
|
|
100
104
|
|
101
105
|
what_it_is = record.send name
|
102
106
|
if what_it_is.nil? and !what_it_should_be.nil?
|
103
|
-
DataMiner.
|
107
|
+
::DataMiner.logger.debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
|
104
108
|
nil
|
105
109
|
elsif what_it_is == what_it_was
|
106
110
|
false
|
@@ -114,7 +118,7 @@ module DataMiner
|
|
114
118
|
end
|
115
119
|
|
116
120
|
def do_convert(row, value)
|
117
|
-
|
121
|
+
raise "If you use 'from_units', you need to set 'to_units' (#{inspect})" unless wants_units?
|
118
122
|
value.to_f.convert((from_units || unit_from_source(row)), (to_units || unit_from_source(row)))
|
119
123
|
end
|
120
124
|
|
@@ -128,8 +132,8 @@ module DataMiner
|
|
128
132
|
end
|
129
133
|
|
130
134
|
def do_split(value)
|
131
|
-
pattern = split_options[
|
132
|
-
keep = split_options[
|
135
|
+
pattern = split_options['pattern'] || /\s+/ # default is split on whitespace
|
136
|
+
keep = split_options['keep'] || 0 # default is keep first element
|
133
137
|
value.to_s.split(pattern)[keep].to_s
|
134
138
|
end
|
135
139
|
|
@@ -148,7 +152,7 @@ module DataMiner
|
|
148
152
|
upcase.present?
|
149
153
|
end
|
150
154
|
def wants_static?
|
151
|
-
options.has_key?
|
155
|
+
options.has_key? 'static'
|
152
156
|
end
|
153
157
|
def wants_nullification?
|
154
158
|
nullify == true
|
@@ -157,7 +161,7 @@ module DataMiner
|
|
157
161
|
chars.present?
|
158
162
|
end
|
159
163
|
def wants_synthesize?
|
160
|
-
synthesize.is_a?(Proc)
|
164
|
+
synthesize.is_a?(::Proc)
|
161
165
|
end
|
162
166
|
def wants_overwriting?
|
163
167
|
overwrite != false
|
@@ -169,66 +173,67 @@ module DataMiner
|
|
169
173
|
to_units.present? or units_field_name.present? or units_field_number.present?
|
170
174
|
end
|
171
175
|
def wants_dictionary?
|
172
|
-
options[
|
176
|
+
options['dictionary'].present?
|
173
177
|
end
|
174
178
|
def wants_matcher?
|
175
|
-
options[
|
179
|
+
options['matcher'].present?
|
176
180
|
end
|
177
181
|
|
178
182
|
# Options that always have values
|
179
183
|
def field_name
|
180
|
-
(options[
|
184
|
+
(options['field_name'] || name).to_s
|
181
185
|
end
|
182
186
|
def delimiter
|
183
|
-
(options[
|
187
|
+
(options['delimiter'] || ', ')
|
184
188
|
end
|
185
189
|
|
186
190
|
# Options that can't be referred to by their names
|
187
191
|
def split_options
|
188
|
-
options[
|
192
|
+
options['split']
|
189
193
|
end
|
190
194
|
|
191
195
|
def from_units
|
192
|
-
options[
|
196
|
+
options['from_units']
|
193
197
|
end
|
194
198
|
def to_units
|
195
|
-
options[
|
199
|
+
options['to_units'] || options['units']
|
196
200
|
end
|
197
201
|
def sprintf
|
198
|
-
options[
|
202
|
+
options['sprintf']
|
199
203
|
end
|
200
204
|
def nullify
|
201
|
-
options[
|
205
|
+
options['nullify']
|
202
206
|
end
|
203
207
|
def overwrite
|
204
|
-
options[
|
208
|
+
options['overwrite']
|
205
209
|
end
|
206
210
|
def upcase
|
207
|
-
options[
|
211
|
+
options['upcase']
|
208
212
|
end
|
209
213
|
def units_field_name
|
210
|
-
options[
|
214
|
+
options['units_field_name']
|
211
215
|
end
|
212
216
|
def units_field_number
|
213
|
-
options[
|
217
|
+
options['units_field_number']
|
214
218
|
end
|
215
219
|
def field_number
|
216
|
-
options[
|
220
|
+
options['field_number']
|
217
221
|
end
|
218
222
|
def chars
|
219
|
-
options[
|
223
|
+
options['chars']
|
220
224
|
end
|
221
225
|
def synthesize
|
222
|
-
options[
|
226
|
+
options['synthesize']
|
223
227
|
end
|
224
228
|
def static
|
225
|
-
options[
|
229
|
+
options['static']
|
226
230
|
end
|
231
|
+
# must be cleared before every run! (because it relies on remote data)
|
227
232
|
def dictionary
|
228
|
-
@
|
233
|
+
@dictionary ||= (options['dictionary'].is_a?(Dictionary) ? options['dictionary'] : Dictionary.new(options['dictionary']))
|
229
234
|
end
|
230
235
|
def matcher
|
231
|
-
@
|
236
|
+
@matcher ||= (options['matcher'].is_a?(::String) ? options['matcher'].constantize.new : options['matcher'])
|
232
237
|
end
|
233
238
|
end
|
234
239
|
end
|