data_miner 1.3.8 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +42 -0
- data/Gemfile +19 -3
- data/README.rdoc +3 -3
- data/Rakefile +13 -15
- data/data_miner.gemspec +4 -15
- data/lib/data_miner.rb +69 -70
- data/lib/data_miner/active_record_extensions.rb +17 -22
- data/lib/data_miner/attribute.rb +176 -179
- data/lib/data_miner/dictionary.rb +38 -31
- data/lib/data_miner/run.rb +49 -18
- data/lib/data_miner/script.rb +116 -0
- data/lib/data_miner/step.rb +5 -0
- data/lib/data_miner/step/import.rb +74 -0
- data/lib/data_miner/step/process.rb +34 -0
- data/lib/data_miner/step/tap.rb +134 -0
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +26 -24
- data/test/support/breeds.xls +0 -0
- data/test/support/pet_color_dictionary.en.csv +5 -0
- data/test/support/pet_color_dictionary.es.csv +5 -0
- data/test/support/pets.csv +5 -0
- data/test/support/pets_funny.csv +4 -0
- data/test/test_data_miner.rb +103 -0
- data/test/test_earth_import.rb +25 -0
- data/test/test_earth_tap.rb +25 -0
- data/test/test_safety.rb +43 -0
- metadata +72 -78
- data/.document +0 -5
- data/lib/data_miner/config.rb +0 -124
- data/lib/data_miner/import.rb +0 -93
- data/lib/data_miner/process.rb +0 -38
- data/lib/data_miner/tap.rb +0 -143
- data/test/support/aircraft.rb +0 -102
- data/test/support/airport.rb +0 -16
- data/test/support/automobile_fuel_type.rb +0 -40
- data/test/support/automobile_variant.rb +0 -362
- data/test/support/country.rb +0 -15
- data/test/support/test_database.rb +0 -311
- data/test/test_data_miner_attribute.rb +0 -111
- data/test/test_data_miner_process.rb +0 -18
- data/test/test_old_syntax.rb +0 -825
- data/test/test_tap.rb +0 -21
@@ -1,55 +1,62 @@
|
|
1
1
|
require 'remote_table'
|
2
|
+
|
2
3
|
class DataMiner
  # A lookup table, loaded through RemoteTable, that translates a value found
  # in one column (the "input") into the value of another column (the
  # "output") of the same row.
  #
  # The rows are fetched on first use and kept in memory until #refresh.
  class Dictionary
    # Value used for #case_sensitive when the :case_sensitive option is absent.
    DEFAULT_CASE_SENSITIVE = true

    # Column searched by #lookup (the :input option).
    attr_reader :key_name
    # Column returned by #lookup (the :output option).
    attr_reader :value_name
    # Optional format string applied to both sides of a comparison.
    attr_reader :sprintf
    # Location handed to RemoteTable.
    attr_reader :url
    # Whether #lookup compares with case sensitivity.
    attr_reader :case_sensitive

    # @param options [Hash] string or symbol keys:
    #   :url, :input, :output, :sprintf, :case_sensitive
    def initialize(options = {})
      options = options.symbolize_keys
      @url = options[:url]
      @key_name = options[:input]
      @value_name = options[:output]
      @sprintf = options[:sprintf]
      @case_sensitive = options.fetch :case_sensitive, DEFAULT_CASE_SENSITIVE
      @table_mutex = ::Mutex.new
    end

    # The dictionary rows as an Array, loaded at most once per refresh.
    #
    # @return [Array]
    def table
      @table || @table_mutex.synchronize do
        @table ||= ::RemoteTable.new(url).to_a # make sure it's fully cached
      end
    end

    # Forget the cached rows so the next #table call loads them again.
    def refresh
      @table = nil
    end

    # Translate +key+ using this dictionary's own columns and settings.
    #
    # @return [String, nil] the output value, or nil when no row matches
    def lookup(key)
      find key_name, key, value_name, {:sprintf => sprintf, :case_sensitive => case_sensitive}
    end

    # Find the first row whose +key_name+ column matches +key+ after
    # normalization and return that row's +value_name+ column as a String.
    #
    # @param options [Hash] :sprintf overrides the dictionary's format string
    #   for this call; :case_sensitive defaults to false when omitted here
    # @return [String, nil] nil when no row matches
    def find(key_name, key, value_name, options = {})
      column = key_name.to_s
      normalized_key = normalize_for_comparison(key, options)
      match = table.detect do |row|
        normalized_key == normalize_for_comparison(row[column], options)
      end
      match[value_name.to_s].to_s if match
    end

    private

    # Bring a value into the canonical form used for comparisons.
    #
    # The format string comes from options[:sprintf] when given, otherwise
    # from the dictionary itself. Previously the option was ignored, so a
    # format passed to #find had no effect.
    def normalize_for_comparison(str, options = {})
      pattern = options[:sprintf] || sprintf
      if pattern
        # Coerce first so that e.g. "7.0" and "7" format identically.
        if pattern.end_with?('f')
          str = str.to_f
        elsif pattern.end_with?('d')
          str = str.to_i
        end
        str = pattern % str
      end
      str = DataMiner.compress_whitespace str
      str = DataMiner.downcase str unless options[:case_sensitive]
      str
    end
  end
end
|
data/lib/data_miner/run.rb
CHANGED
@@ -1,26 +1,57 @@
|
|
1
|
+
require 'aasm'
|
2
|
+
require 'active_record_inline_schema'
|
3
|
+
|
1
4
|
class DataMiner
  # A persisted record of one execution of a model's data miner script.
  #
  # Each run starts in the :limbo state and ends as :succeeded, :skipped or
  # :failed; a failure also stores the exception message and backtrace.
  class Run < ::ActiveRecord::Base
    # Rescued by #perform to mark the run as skipped instead of failed.
    # NOTE(review): inherits from Exception, so a plain `rescue` inside the
    # yielded block will not swallow it - presumably intentional; confirm.
    class Skip < ::Exception; end

    INITIAL_STATE = :limbo

    self.table_name = 'data_miner_runs'

    # Inline schema (the `col` macro presumably comes from
    # active_record_inline_schema, required at the top of this file).
    col :model_name
    col :aasm_state
    col :created_at, :type => :datetime
    col :stopped_at, :type => :datetime
    col :updated_at, :type => :datetime
    col :error, :type => :text

    include ::AASM
    aasm_initial_state INITIAL_STATE
    aasm_state :limbo
    aasm_state :skipped
    aasm_state :succeeded
    aasm_state :failed
    aasm_event :succeed do
      transitions :from => :limbo, :to => :succeeded
    end
    aasm_event :skip do
      transitions :from => :limbo, :to => :skipped
    end
    aasm_event :fail do
      transitions :from => :limbo, :to => :failed
    end

    validates_presence_of :model_name

    # Save the run, execute the given block and record the outcome.
    #
    # The block may `throw :data_miner_succeed` to finish early as a success.
    # Skip marks the run as skipped. Any other StandardError is stored in
    # #error, marks the run as failed and is re-raised. In every case
    # stopped_at is set, the record is saved and a summary line is logged.
    def perform
      save!
      begin
        catch(:data_miner_succeed) { yield }
        succeed!
      rescue Skip
        skip!
      rescue => failure
        self.error = "#{failure.message}\n#{failure.backtrace.join("\n")}"
        fail!
        raise failure
      ensure
        self.stopped_at = ::Time.now
        save!
        DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
      end
    end
    # NOTE(review): lock_method is not defined in this file - presumably the
    # lock_method gem, which would use #as_lock as the lock identity; confirm.
    lock_method :perform

    # Identity of this run for locking: the current database plus the model.
    def as_lock
      [Run.connection.current_database, model_name]
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
class DataMiner
  # The ordered list of steps (import, process, tap) that populates one model.
  #
  # Steps are added through the DSL methods below, usually from a block that
  # is evaluated in the context of the script (see #append_block).
  class Script
    class << self
      # @private
      # activerecord-3.2.3/lib/active_record/scoping.rb
      #
      # Run the block with the per-thread "uniq" flag set, restoring the
      # previous value afterwards. While the flag is set, Script#perform
      # skips models already present in the current stack.
      def uniq
        previous_uniq = current_uniq
        Script.current_uniq = true
        begin
          yield
        ensure
          Script.current_uniq = previous_uniq
        end
      end

      # Names of the models performed so far on this thread.
      def current_stack
        ::Thread.current[STACK_THREAD_VAR] ||= []
      end

      def current_stack=(stack)
        ::Thread.current[STACK_THREAD_VAR] = stack
      end

      # Whether this thread is currently inside a Script.uniq block.
      def current_uniq
        ::Thread.current[UNIQ_THREAD_VAR]
      end

      def current_uniq=(uniq)
        ::Thread.current[UNIQ_THREAD_VAR] = uniq
      end
    end

    UNIQ_THREAD_VAR = 'DataMiner::Script.current_uniq'
    STACK_THREAD_VAR = 'DataMiner::Script.current_stack'

    # The model class this script populates.
    attr_reader :model
    # The step objects, in execution order.
    attr_reader :steps

    def initialize(model)
      @model = model
      @steps = []
    end

    # Evaluate a DSL block (calls to #process, #tap, #import...) on this script.
    def append_block(blk)
      instance_eval(&blk)
    end

    # Append a Process step: either a method name to call on the model, or a
    # description together with a block.
    def process(method_id_or_description, &blk)
      append(:process, method_id_or_description, &blk)
    end

    # Append a Tap step. Note that this replaces Object#tap on scripts.
    def tap(description, source, options = {})
      append :tap, description, source, options
    end

    # Append an Import step configured by the given block.
    def import(description = nil, options = {}, &blk)
      append(:import, description, options, &blk)
    end

    # Put a step at the front unless an equivalent step is already present.
    def prepend_once(*args, &blk)
      step = make(*args, &blk)
      steps.unshift step unless step_present?(step)
    end

    # Put a step at the front of the script.
    def prepend(*args, &blk)
      steps.unshift make(*args, &blk)
    end

    # Put a step at the end unless an equivalent step is already present.
    def append_once(*args, &blk)
      step = make(*args, &blk)
      steps << step unless step_present?(step)
    end

    # Put a step at the end of the script. The first argument names the step
    # class (:import, :process, :tap); the rest go to its initializer.
    def append(*args, &blk)
      steps << make(*args, &blk)
    end

    # Perform every step inside a Run, resetting the model's column
    # information after each one.
    #
    # Inside a Script.uniq block, a model already on the current stack is
    # skipped (returns nil). Outside one, the stack is cleared first.
    def perform
      model_name = model.name
      if Script.current_uniq
        # we've already done this in the current stack, so skip it
        return if Script.current_stack.include?(model_name)
      else
        # since we're not trying to uniq, ignore the current contents of the stack
        Script.current_stack.clear
      end
      Script.current_stack << model_name
      Run.new(:model_name => model_name).perform do
        steps.each do |step|
          step.perform
          model.reset_column_information
        end
      end
    end

    private

    # Whether the script already holds +step+ or one that is equivalent.
    #
    # #make always builds a new object, and the step classes do not define
    # their own ==, so plain Array#include? could never find a duplicate.
    # A step of the same class and description is treated as the same step.
    def step_present?(step)
      steps.any? do |existing|
        existing == step ||
          (existing.class == step.class && existing.description == step.description)
      end
    end

    # Build a step object. +args+ is [step_type, *initializer_args, options?];
    # a placeholder description is supplied when none is given.
    def make(*args, &blk)
      klass = Step.const_get(args.shift.to_s.camelcase)
      options = args.extract_options!
      if args.empty?
        args = ["#{klass.name.demodulize} step with no description"]
      end
      initializer = [self] + args + [options]
      klass.new(*initializer, &blk)
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'errata'
|
2
|
+
require 'remote_table'
|
3
|
+
|
4
|
+
# A script step that reads rows from a remote table and saves one model
# record per row.
#
# The block given to the constructor is evaluated on the step itself and
# declares the columns with #key (exactly one, used to find or initialize
# the record) and #store.
class DataMiner::Step::Import
  # The Script this step belongs to.
  attr_reader :script
  attr_reader :description
  # Attribute objects keyed by attribute name (Symbol), in declaration order.
  attr_reader :attributes

  # @param options [Hash] passed on to RemoteTable, except that :errata is
  #   normalized (symbol keys, :responder defaulting to the model) and
  #   :streaming is always set to true
  # @raise [ArgumentError] if :table is given, if :errata is not a Hash, or
  #   if no block is given
  def initialize(script, description, options = {}, &blk)
    options = options.symbolize_keys
    if options.has_key?(:table)
      raise ::ArgumentError, %{[data_miner] :table is no longer an allowed option.}
    end
    if (errata_options = options[:errata]) and not errata_options.is_a?(::Hash)
      raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization options to Errata}
    end
    unless blk
      # instance_eval without a block raised an unhelpful ArgumentError here.
      raise ::ArgumentError, %{[data_miner] import requires a block that declares key and store attributes}
    end
    @script = script
    @mutex = ::Mutex.new
    @attributes = ::ActiveSupport::OrderedHash.new
    @description = description
    # Test the value rather than has_key?, so `:errata => nil` is tolerated
    # instead of failing on nil.symbolize_keys.
    if errata_options
      errata_options = errata_options.symbolize_keys
      errata_options[:responder] ||= model
      options[:errata] = errata_options
    end
    @table_options = options.dup
    @table_options[:streaming] = true
    instance_eval(&blk)
  end

  # The model class being populated.
  def model
    script.model
  end

  # Declare an attribute to be set from each row.
  #
  # @raise [RuntimeError] if the attribute was already declared
  def store(attr_name, attr_options = {})
    attr_name = attr_name.to_sym
    if attributes.has_key? attr_name
      raise "You should only call store or key once for #{model.name}##{attr_name}"
    end
    attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
  end

  # Declare the attribute that identifies a record; it is also stored.
  #
  # @raise [RuntimeError] if the attribute was already declared
  def key(attr_name, attr_options = {})
    attr_name = attr_name.to_sym
    # Checked before assigning @key so a rejected call leaves the key as is.
    if attributes.has_key? attr_name
      raise "You should only call store or key once for #{model.name}##{attr_name}"
    end
    @key = attr_name
    store attr_name, attr_options
  end

  # The RemoteTable for this import, created at most once per refresh.
  def table
    @table || @mutex.synchronize do
      @table ||= ::RemoteTable.new(@table_options)
    end
  end

  # Drop the cached table and refresh every attribute.
  #
  # @return [nil]
  def refresh
    @table = nil
    attributes.each { |_, attr| attr.refresh }
    nil
  end

  # Find or initialize one record per row by the key attribute, set every
  # declared attribute from the row, and save it. Refreshes afterwards.
  #
  # @raise [RuntimeError] if no key attribute was declared
  # @return [nil]
  def perform
    unless @key
      raise "[data_miner] import into #{model.name} needs a key attribute; call key in the import block"
    end
    finder = "find_or_initialize_by_#{@key}"
    key_attribute = attributes[@key]
    table.each do |row|
      record = model.send finder, key_attribute.read(row)
      attributes.each { |_, attr| attr.set_from_row record, row }
      record.save!
    end
    refresh
    nil
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# A script step that runs arbitrary code against the model: either a block
# evaluated on the model, or a method called on the model by name.
class DataMiner::Step::Process
  # The Script this step belongs to.
  attr_reader :script
  # Name of the model method to call; nil when the step was given a block.
  attr_reader :method_id
  # The first constructor argument, whichever form was used.
  attr_reader :description
  # The block to evaluate on the model; nil when a method name was given.
  attr_reader :blk

  alias :block_description :description

  # @param method_id_or_description with a block this is only a description;
  #   without one it is also the name of the method to call on the model
  # @param ignored_options accepted for a uniform step signature; unused
  def initialize(script, method_id_or_description, ignored_options = {}, &blk)
    @script = script
    @description = method_id_or_description
    if blk
      @blk = blk
    else
      @method_id = method_id_or_description
    end
  end

  # The model class the script operates on.
  def model
    script.model
  end

  # Run the block or method inside DataMiner::Script.uniq.
  #
  # @return [nil]
  def perform
    DataMiner::Script.uniq do
      blk ? model.instance_eval(&blk) : model.send(method_id)
    end
    nil
  end
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'uri'
|
2
|
+
# Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
#
# This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
#
# A script step that replaces the model's table with one pulled from a taps
# server by running `taps pull` against the current database.
class DataMiner::Step::Tap
  DEFAULT_PORTS = {
    :mysql => 3306,
    :mysql2 => 3306,
    :postgres => 5432
  }

  DEFAULT_USERNAMES = {
    :mysql => 'root',
    :mysql2 => 'root',
    :postgres => ''
  }

  DEFAULT_PASSWORDS = {}
  DEFAULT_PASSWORDS.default = ''

  DEFAULT_HOSTS = {}
  DEFAULT_HOSTS.default = '127.0.0.1'

  # The Script this step belongs to.
  attr_reader :script
  attr_reader :description
  # The taps source passed to `taps pull`.
  attr_reader :source
  # Connection settings: the given options (minus :source_table_name) on top
  # of the current ActiveRecord connection's configuration.
  attr_reader :database_options
  # Table name on the source side; defaults to the model's table name.
  attr_reader :source_table_name

  def initialize(script, description, source, options = {})
    options = options.symbolize_keys
    @script = script
    @description = description
    @source = source
    @database_options = options.except(:source_table_name).reverse_merge(active_record_config)
    @source_table_name = options.fetch :source_table_name, model.table_name
  end

  # The model class the script operates on.
  def model
    script.model
  end

  # Drop the source-named and model-named tables if they exist, pull the
  # source table with taps, then rename it to the model's table name when
  # the two names differ.
  #
  # NOTE(review): the result of #taps_pull is not checked, so the tables
  # stay dropped if the pull fails - confirm whether that should raise.
  #
  # @return [nil]
  def perform
    [ source_table_name, model.table_name ].each do |possible_obstacle|
      if connection.table_exists? possible_obstacle
        connection.drop_table possible_obstacle
      end
    end
    taps_pull
    if needs_table_rename?
      connection.rename_table source_table_name, model.table_name
    end
    nil
  end

  # sabshere 1/25/11 what if there were multiple connections
  # blockenspiel doesn't like to delegate this to #model
  def connection
    ::ActiveRecord::Base.connection
  end

  def needs_table_rename?
    source_table_name != model.table_name
  end

  # Short adapter name derived from the connection's adapter_name.
  #
  # @return [String, nil] nil when the adapter is none of mysql2, mysql,
  #   postgres or sqlite
  def adapter
    case connection.adapter_name
    when /mysql2/i
      'mysql2'
    when /mysql/i
      'mysql'
    when /postgres/i
      'postgres'
    when /sqlite/i
      'sqlite'
    end
  end

  # never optional
  def database
    database_options[:database]
  end

  # Defines #username, #password, #port and #host: the configured value if
  # there is one, otherwise the per-adapter default from DEFAULT_*S.
  %w{ username password port host }.each do |x|
    defaults = const_get("DEFAULT_#{x.upcase}S")
    define_method(x) do
      database_options[x.to_sym] || defaults[adapter.to_sym]
    end
  end

  # "user:pass"
  # "user"
  # nil
  def userinfo
    if username.present?
      [username, password].select(&:present?).join(':')
    end
  end

  # URL of the local database in the form taps expects.
  def db_url
    case adapter
    when 'sqlite'
      "sqlite://#{database}"
    else
      ::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
    end
  end

  # Configuration hash of the current connection, with symbol keys.
  def active_record_config
    connection.instance_variable_get(:@config).symbolize_keys
  end

  # Run `taps pull` for the source table.
  #
  # The command is passed to Kernel.system as separate arguments, so no
  # shell is involved and characters such as `&`, `;` or spaces in the
  # database URL, password, source or table name cannot change the command.
  #
  # @return [true, false, nil] the result of Kernel.system
  def taps_pull
    args = [
      'taps',
      'pull',
      db_url,
      source,
      '--indexes-first',
      '--tables',
      source_table_name
    ].map { |arg| arg.to_s } # table names may be given as symbols

    # https://github.com/carlhuda/bundler/issues/1579
    if defined?(::Bundler)
      # with_clean_env was renamed with_unbundled_env in newer Bundler.
      clean_env = ::Bundler.respond_to?(:with_unbundled_env) ? :with_unbundled_env : :with_clean_env
      ::Bundler.send(clean_env) do
        ::Kernel.system(*args)
      end
    else
      ::Kernel.system(*args)
    end
  end
end
|