data_miner 1.3.8 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +42 -0
- data/Gemfile +19 -3
- data/README.rdoc +3 -3
- data/Rakefile +13 -15
- data/data_miner.gemspec +4 -15
- data/lib/data_miner.rb +69 -70
- data/lib/data_miner/active_record_extensions.rb +17 -22
- data/lib/data_miner/attribute.rb +176 -179
- data/lib/data_miner/dictionary.rb +38 -31
- data/lib/data_miner/run.rb +49 -18
- data/lib/data_miner/script.rb +116 -0
- data/lib/data_miner/step.rb +5 -0
- data/lib/data_miner/step/import.rb +74 -0
- data/lib/data_miner/step/process.rb +34 -0
- data/lib/data_miner/step/tap.rb +134 -0
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +26 -24
- data/test/support/breeds.xls +0 -0
- data/test/support/pet_color_dictionary.en.csv +5 -0
- data/test/support/pet_color_dictionary.es.csv +5 -0
- data/test/support/pets.csv +5 -0
- data/test/support/pets_funny.csv +4 -0
- data/test/test_data_miner.rb +103 -0
- data/test/test_earth_import.rb +25 -0
- data/test/test_earth_tap.rb +25 -0
- data/test/test_safety.rb +43 -0
- metadata +72 -78
- data/.document +0 -5
- data/lib/data_miner/config.rb +0 -124
- data/lib/data_miner/import.rb +0 -93
- data/lib/data_miner/process.rb +0 -38
- data/lib/data_miner/tap.rb +0 -143
- data/test/support/aircraft.rb +0 -102
- data/test/support/airport.rb +0 -16
- data/test/support/automobile_fuel_type.rb +0 -40
- data/test/support/automobile_variant.rb +0 -362
- data/test/support/country.rb +0 -15
- data/test/support/test_database.rb +0 -311
- data/test/test_data_miner_attribute.rb +0 -111
- data/test/test_data_miner_process.rb +0 -18
- data/test/test_old_syntax.rb +0 -825
- data/test/test_tap.rb +0 -21
@@ -1,55 +1,62 @@
|
|
1
1
|
require 'remote_table'
|
2
|
+
|
2
3
|
class DataMiner
|
3
4
|
class Dictionary
|
4
|
-
|
5
|
+
DEFAULT_CASE_SENSITIVE = true
|
6
|
+
|
7
|
+
attr_reader :key_name
|
8
|
+
attr_reader :value_name
|
9
|
+
attr_reader :sprintf
|
10
|
+
attr_reader :url
|
11
|
+
attr_reader :case_sensitive
|
12
|
+
|
5
13
|
def initialize(options = {})
|
6
|
-
|
7
|
-
@options
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
options
|
12
|
-
|
13
|
-
|
14
|
-
def value_name
|
15
|
-
options['output']
|
16
|
-
end
|
17
|
-
|
18
|
-
def sprintf
|
19
|
-
options['sprintf'] || '%s'
|
14
|
+
options = options.symbolize_keys
|
15
|
+
@url = options[:url]
|
16
|
+
@key_name = options[:input]
|
17
|
+
@value_name = options[:output]
|
18
|
+
@sprintf = options[:sprintf]
|
19
|
+
@case_sensitive = options.fetch :case_sensitive, DEFAULT_CASE_SENSITIVE
|
20
|
+
@table_mutex = ::Mutex.new
|
20
21
|
end
|
21
22
|
|
22
23
|
def table
|
23
|
-
@table
|
24
|
+
@table || @table_mutex.synchronize do
|
25
|
+
@table ||= ::RemoteTable.new(url).to_a # make sure it's fully cached
|
26
|
+
end
|
24
27
|
end
|
25
28
|
|
26
|
-
def
|
27
|
-
@table.free if @table.is_a?(::RemoteTable)
|
29
|
+
def refresh
|
28
30
|
@table = nil
|
29
31
|
end
|
30
|
-
|
32
|
+
|
31
33
|
def lookup(key)
|
32
|
-
find key_name, key, value_name,
|
34
|
+
find key_name, key, value_name, {:sprintf => sprintf, :case_sensitive => case_sensitive}
|
33
35
|
end
|
34
36
|
|
35
37
|
def find(key_name, key, value_name, options = {})
|
36
|
-
|
37
|
-
|
38
|
+
normalized_key = normalize_for_comparison(key, options)
|
39
|
+
if match = table.detect { |row| normalized_key == normalize_for_comparison(row[key_name.to_s], options) }
|
40
|
+
match[value_name.to_s].to_s
|
38
41
|
end
|
39
42
|
end
|
40
|
-
|
43
|
+
|
41
44
|
private
|
42
45
|
|
43
|
-
def normalize_for_comparison(
|
44
|
-
if
|
45
|
-
if
|
46
|
-
|
47
|
-
elsif
|
48
|
-
|
46
|
+
def normalize_for_comparison(str, options = {})
|
47
|
+
if sprintf
|
48
|
+
if sprintf.end_with?('f')
|
49
|
+
str = str.to_f
|
50
|
+
elsif sprintf.end_with?('d')
|
51
|
+
str = str.to_i
|
49
52
|
end
|
50
|
-
|
53
|
+
str = sprintf % str
|
54
|
+
end
|
55
|
+
str = DataMiner.compress_whitespace str
|
56
|
+
unless options[:case_sensitive]
|
57
|
+
str = DataMiner.downcase str
|
51
58
|
end
|
52
|
-
|
59
|
+
str
|
53
60
|
end
|
54
61
|
end
|
55
62
|
end
|
data/lib/data_miner/run.rb
CHANGED
@@ -1,26 +1,57 @@
|
|
1
|
+
require 'aasm'
|
2
|
+
require 'active_record_inline_schema'
|
3
|
+
|
1
4
|
class DataMiner
|
2
5
|
class Run < ::ActiveRecord::Base
|
3
|
-
|
4
|
-
|
5
|
-
def resource
|
6
|
-
resource_name.constantize
|
6
|
+
class Skip < ::Exception
|
7
7
|
end
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
8
|
+
|
9
|
+
INITIAL_STATE = :limbo
|
10
|
+
|
11
|
+
self.table_name = 'data_miner_runs'
|
12
|
+
|
13
|
+
col :model_name
|
14
|
+
col :aasm_state
|
15
|
+
col :created_at, :type => :datetime
|
16
|
+
col :stopped_at, :type => :datetime
|
17
|
+
col :updated_at, :type => :datetime
|
18
|
+
col :error, :type => :text
|
19
|
+
|
20
|
+
include ::AASM
|
21
|
+
aasm_initial_state INITIAL_STATE
|
22
|
+
aasm_state :limbo
|
23
|
+
aasm_state :skipped
|
24
|
+
aasm_state :succeeded
|
25
|
+
aasm_state :failed
|
26
|
+
aasm_event(:succeed) { transitions :from => :limbo, :to => :succeeded }
|
27
|
+
aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
|
28
|
+
aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
|
29
|
+
|
30
|
+
validates_presence_of :model_name
|
31
|
+
|
32
|
+
def perform
|
33
|
+
save!
|
34
|
+
begin
|
35
|
+
catch :data_miner_succeed do
|
36
|
+
yield
|
21
37
|
end
|
22
|
-
|
38
|
+
succeed!
|
39
|
+
rescue Skip
|
40
|
+
skip!
|
41
|
+
rescue
|
42
|
+
self.error = "#{$!.message}\n#{$!.backtrace.join("\n")}"
|
43
|
+
fail!
|
44
|
+
raise $!
|
45
|
+
ensure
|
46
|
+
self.stopped_at = ::Time.now
|
47
|
+
save!
|
48
|
+
DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
|
23
49
|
end
|
24
50
|
end
|
51
|
+
lock_method :perform
|
52
|
+
|
53
|
+
def as_lock
|
54
|
+
[Run.connection.current_database, model_name]
|
55
|
+
end
|
25
56
|
end
|
26
57
|
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
class DataMiner
  # A data-mining script attached to one model: an ordered list of steps
  # (process / tap / import) that #perform runs one after another inside a
  # DataMiner::Run.
  #
  # Two values are kept per thread in ::Thread.current:
  # * the "uniq" flag, switched on for the duration of a Script.uniq block;
  # * the stack of model names that have been performed.
  class Script
    UNIQ_THREAD_VAR = 'DataMiner::Script.current_uniq'
    STACK_THREAD_VAR = 'DataMiner::Script.current_stack'

    class << self
      # @private
      # activerecord-3.2.3/lib/active_record/scoping.rb
      #
      # Runs the given block with the uniq flag set to true, then restores
      # whatever value the flag had before - even if the block raises.
      def uniq
        previous_uniq = current_uniq
        Script.current_uniq = true
        begin
          yield
        ensure
          Script.current_uniq = previous_uniq
        end
      end

      # The current thread's stack of model names (created on first access).
      def current_stack
        ::Thread.current[STACK_THREAD_VAR] ||= []
      end

      # Replaces the current thread's stack.
      def current_stack=(stack)
        ::Thread.current[STACK_THREAD_VAR] = stack
      end

      # The current thread's uniq flag (nil until first set).
      def current_uniq
        ::Thread.current[UNIQ_THREAD_VAR]
      end

      # Sets the current thread's uniq flag.
      def current_uniq=(uniq)
        ::Thread.current[UNIQ_THREAD_VAR] = uniq
      end
    end

    attr_reader :model
    attr_reader :steps

    # @param model the class this script belongs to; #perform calls
    #   +name+ and +reset_column_information+ on it.
    def initialize(model)
      @model = model
      @steps = []
    end

    # Evaluates +blk+ in the context of this script, so that bare calls to
    # process / tap / import inside the block add steps here.
    def append_block(blk)
      instance_eval(&blk)
    end

    # Appends a Process step.
    def process(method_id_or_description, &blk)
      append(:process, method_id_or_description, &blk)
    end

    # Appends a Tap step. Note that this shadows Object#tap on Script
    # instances.
    def tap(description, source, options = {})
      append :tap, description, source, options
    end

    # Appends an Import step.
    def import(description = nil, options = {}, &blk)
      append(:import, description, options, &blk)
    end

    # Builds a step and puts it at the front unless +steps+ already includes it.
    # NOTE(review): membership is tested with Array#include?, i.e. with ==;
    # a freshly built step only matches an existing one if the step class
    # defines value equality - confirm that it does.
    def prepend_once(*args, &blk)
      step = make(*args, &blk)
      steps.unshift step unless steps.include? step
    end

    # Builds a step and puts it at the front of the list.
    def prepend(*args, &blk)
      steps.unshift make(*args, &blk)
    end

    # Builds a step and puts it at the end unless +steps+ already includes it.
    # NOTE(review): same equality caveat as #prepend_once.
    def append_once(*args, &blk)
      step = make(*args, &blk)
      steps << step unless steps.include? step
    end

    # Builds a step and puts it at the end of the list.
    def append(*args, &blk)
      steps << make(*args, &blk)
    end

    # Runs every step, in order, inside a new Run for this model.
    #
    # While the uniq flag is on, a model whose name is already on the stack
    # is skipped (returns nil without creating a Run). While it is off, the
    # stack is emptied first, so earlier contents are ignored.
    def perform
      model_name = model.name
      if Script.current_uniq
        # we've already done this in the current stack, so skip it
        return if Script.current_stack.include?(model_name)
      else
        # since we're not trying to uniq, ignore the current contents of the stack
        Script.current_stack.clear
      end
      Script.current_stack << model_name
      Run.new(:model_name => model_name).perform do
        steps.each do |step|
          step.perform
          model.reset_column_information
        end
      end
    end

    private

    # Instantiates a step. The first argument names the step class under
    # Step (:import => Step::Import); a trailing hash is taken as options;
    # whatever remains is passed through positionally. When nothing remains,
    # a placeholder description is supplied.
    def make(*args, &blk)
      klass = Step.const_get(args.shift.to_s.camelcase)
      options = args.extract_options!
      if args.empty?
        args = ["#{klass.name.demodulize} step with no description"]
      end
      initializer = [self] + args + [options]
      klass.new(*initializer, &blk)
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'errata'
|
2
|
+
require 'remote_table'
|
3
|
+
|
4
|
+
# An import step: reads rows from a RemoteTable and stores each one on the
# script's model, looking records up by the attribute declared with #key.
class DataMiner::Step::Import
  attr_reader :script
  attr_reader :description
  attr_reader :attributes

  # @param script the owning script; its +model+ receives the rows.
  # @param description free-form description of this step.
  # @param options options handed to ::RemoteTable.new (with :streaming
  #   forced to true). :table is rejected; :errata, when given, must be a
  #   Hash and gets the model as its default :responder.
  # @yield evaluated in the context of the new step, so the block can call
  #   #key and #store.
  # @raise [::ArgumentError] if :table is passed or :errata is not a Hash.
  def initialize(script, description, options = {}, &blk)
    options = options.symbolize_keys
    if options.has_key?(:table)
      raise ::ArgumentError, %{[data_miner] :table is no longer an allowed option.}
    end
    errata_options = options[:errata]
    if errata_options && !errata_options.is_a?(::Hash)
      raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization options to Errata}
    end
    @script = script
    @mutex = ::Mutex.new
    @attributes = ::ActiveSupport::OrderedHash.new
    @description = description
    # Test the value, not just the key's presence: `:errata => nil` used to
    # reach nil.symbolize_keys and blow up with NoMethodError.
    if errata_options
      errata_options = errata_options.symbolize_keys
      errata_options[:responder] ||= model
      options[:errata] = errata_options
    end
    @table_options = options.dup
    @table_options[:streaming] = true
    instance_eval(&blk)
  end

  # The model class of the owning script.
  def model
    script.model
  end

  # Declares an attribute to be set from each row.
  #
  # @raise [RuntimeError] if the attribute was already declared.
  def store(attr_name, attr_options = {})
    attr_name = attr_name.to_sym
    if attributes.has_key? attr_name
      raise "You should only call store or key once for #{model.name}##{attr_name}"
    end
    attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
  end

  # Declares the attribute used to find-or-initialize records, and stores it
  # like any other attribute.
  #
  # @raise [RuntimeError] if the attribute was already declared.
  def key(attr_name, attr_options = {})
    attr_name = attr_name.to_sym
    # Checked here as well as in #store so that @key is left alone when the
    # declaration is rejected.
    if attributes.has_key? attr_name
      raise "You should only call store or key once for #{model.name}##{attr_name}"
    end
    @key = attr_name
    store attr_name, attr_options
  end

  # The source table, built on first use. The unsynchronized read is the
  # fast path; creation happens under the mutex.
  def table
    @table || @mutex.synchronize do
      @table ||= ::RemoteTable.new(@table_options)
    end
  end

  # Forgets the cached table and asks every attribute to refresh itself.
  def refresh
    @table = nil
    attributes.each { |_, attr| attr.refresh }
    nil
  end

  # Imports every row: finds or initializes the record by the key attribute,
  # lets each attribute set itself from the row, saves, and finally calls
  # #refresh.
  def perform
    table.each do |row|
      record = model.send "find_or_initialize_by_#{@key}", attributes[@key].read(row)
      attributes.each { |_, attr| attr.set_from_row record, row }
      record.save!
    end
    refresh
    nil
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# A process step: either a block evaluated against the script's model, or
# the name of a method to call on it.
class DataMiner::Step::Process
  attr_reader :script
  attr_reader :method_id
  attr_reader :description
  attr_reader :blk

  alias :block_description :description

  # With a block, the second argument is only a description and the block is
  # what gets run. Without one, the second argument doubles as description
  # and as the method to send to the model. The options hash is accepted
  # but not used.
  def initialize(script, method_id_or_description, ignored_options = {}, &blk)
    @script = script
    @description = method_id_or_description
    if blk
      @blk = blk
    else
      @method_id = method_id_or_description
    end
  end

  # The model class of the owning script.
  def model
    script.model
  end

  # Runs the block (instance_eval'd on the model) or sends the method to the
  # model, inside DataMiner::Script.uniq. Always returns nil.
  def perform
    DataMiner::Script.uniq do
      blk ? model.instance_eval(&blk) : model.send(method_id)
    end
    nil
  end
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'uri'
|
2
|
+
# Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
|
3
|
+
#
|
4
|
+
# This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
|
5
|
+
# A tap step: pulls one table from a taps server into the local database by
# shelling out to the `taps` binary, then renames it to the model's table
# name if the two differ.
class DataMiner::Step::Tap
  DEFAULT_PORTS = {
    :mysql => 3306,
    :mysql2 => 3306,
    :postgres => 5432
  }

  DEFAULT_USERNAMES = {
    :mysql => 'root',
    :mysql2 => 'root',
    :postgres => ''
  }

  DEFAULT_PASSWORDS = {}
  DEFAULT_PASSWORDS.default = ''

  DEFAULT_HOSTS = {}
  DEFAULT_HOSTS.default = '127.0.0.1'

  attr_reader :script
  attr_reader :description
  attr_reader :source
  attr_reader :database_options
  attr_reader :source_table_name

  # @param script the owning script.
  # @param description free-form description of this step.
  # @param source passed to `taps pull` as the remote URL argument.
  # @param options database settings (:database, :username, :password,
  #   :port, :host) that override the current ActiveRecord connection's
  #   config, plus :source_table_name (defaults to the model's table name).
  def initialize(script, description, source, options = {})
    options = options.symbolize_keys
    @script = script
    @description = description
    @source = source
    @database_options = options.except(:source_table_name).reverse_merge(active_record_config)
    @source_table_name = options.fetch :source_table_name, model.table_name
  end

  # The model class of the owning script.
  def model
    script.model
  end

  # Drops the source-named and model-named tables if they exist, pulls the
  # source table, and renames it when needed.
  #
  # @raise [RuntimeError] if the taps command could not be run or failed.
  def perform
    [ source_table_name, model.table_name ].each do |possible_obstacle|
      if connection.table_exists? possible_obstacle
        connection.drop_table possible_obstacle
      end
    end
    taps_pull
    if needs_table_rename?
      connection.rename_table source_table_name, model.table_name
    end
    nil
  end

  # sabshere 1/25/11 what if there were multiple connections
  # blockenspiel doesn't like to delegate this to #model
  def connection
    ::ActiveRecord::Base.connection
  end

  # True when the pulled table must be renamed to match the model.
  def needs_table_rename?
    source_table_name != model.table_name
  end

  # Adapter name as used in the database URL, derived from the connection's
  # adapter_name. Returns nil when none of the patterns match. mysql2 is
  # tested before mysql because /mysql/i would match it too.
  def adapter
    case connection.adapter_name
    when /mysql2/i
      'mysql2'
    when /mysql/i
      'mysql'
    when /postgres/i
      'postgres'
    when /sqlite/i
      'sqlite'
    end
  end

  # never optional
  def database
    database_options[:database]
  end

  # Defines #username, #password, #port and #host: the explicit database
  # option if set, otherwise the adapter's entry in the matching DEFAULT_*S
  # constant.
  %w{ username password port host }.each do |x|
    defaults = const_get("DEFAULT_#{x.upcase}S")
    define_method(x) do
      database_options[x.to_sym] || defaults[adapter.to_sym]
    end
  end

  # "user:pass"
  # "user"
  # nil
  def userinfo
    if username.present?
      [username, password].select(&:present?).join(':')
    end
  end

  # URL of the local database, in the form handed to `taps pull`.
  def db_url
    case adapter
    when 'sqlite'
      "sqlite://#{database}"
    else
      ::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
    end
  end

  # The current connection's configuration hash, with symbol keys.
  def active_record_config
    connection.instance_variable_get(:@config).symbolize_keys
  end

  # Runs `taps pull` for the source table.
  #
  # The command is passed to Kernel.system as separate arguments, so no
  # shell is involved and characters in the password or source URL cannot be
  # interpreted as shell syntax.
  #
  # @return [true] when the command exits successfully.
  # @raise [RuntimeError] when the command cannot be executed or exits with
  #   a failure status. By this point #perform has already dropped the
  #   destination tables, so carrying on silently would leave the model
  #   without data.
  def taps_pull
    args = [
      'taps',
      'pull',
      db_url,
      source,
      '--indexes-first',
      '--tables',
      source_table_name
    ].map { |arg| arg.to_s }

    succeeded = nil
    # https://github.com/carlhuda/bundler/issues/1579
    if defined?(::Bundler)
      ::Bundler.with_clean_env do
        succeeded = ::Kernel.system(*args)
      end
    else
      succeeded = ::Kernel.system(*args)
    end

    unless succeeded
      # The database URL is left out of the message: it may contain a password.
      raise %{[data_miner] taps pull failed for table #{source_table_name}}
    end
    succeeded
  end
end
|