data_miner 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -7
- data/CHANGELOG +13 -0
- data/LICENSE +1 -1
- data/README.markdown +112 -0
- data/data_miner.gemspec +2 -2
- data/lib/data_miner.rb +26 -12
- data/lib/data_miner/active_record_class_methods.rb +108 -0
- data/lib/data_miner/attribute.rb +150 -76
- data/lib/data_miner/dictionary.rb +40 -18
- data/lib/data_miner/run.rb +35 -0
- data/lib/data_miner/script.rb +123 -2
- data/lib/data_miner/step.rb +11 -3
- data/lib/data_miner/step/import.rb +100 -64
- data/lib/data_miner/step/process.rb +46 -28
- data/lib/data_miner/step/tap.rb +156 -123
- data/lib/data_miner/version.rb +1 -1
- data/test/test_safety.rb +61 -25
- metadata +8 -6
- data/README.rdoc +0 -289
- data/lib/data_miner/active_record_extensions.rb +0 -38
@@ -1,34 +1,52 @@
|
|
1
|
-
class DataMiner
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
1
|
+
class DataMiner
|
2
|
+
class Step
|
3
|
+
# A step that executes a single class method on the model or an arbitrary code block.
|
4
|
+
#
|
5
|
+
# Create these by calling +process+ inside a +data_miner+ block.
|
6
|
+
#
|
7
|
+
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
8
|
+
# @see DataMiner::Script#process
|
9
|
+
class Process < Step
|
10
|
+
# @private
|
11
|
+
attr_reader :script
|
6
12
|
|
7
|
-
|
13
|
+
# The method to be called on the model class.
|
14
|
+
# @return [Symbol]
|
15
|
+
attr_reader :method_id
|
8
16
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
17
|
+
# A description of what the block does. Doesn't exist when a single class method is specified using a Symbol.
|
18
|
+
# @return [String]
|
19
|
+
attr_reader :description
|
20
|
+
|
21
|
+
# The block of arbitrary code to be run.
|
22
|
+
# @return [Proc]
|
23
|
+
attr_reader :blk
|
24
|
+
|
25
|
+
alias :block_description :description
|
26
|
+
|
27
|
+
# @private
|
28
|
+
def initialize(script, method_id_or_description, ignored_options = {}, &blk)
|
29
|
+
@script = script
|
30
|
+
if block_given?
|
31
|
+
@description = method_id_or_description
|
32
|
+
@blk = blk
|
33
|
+
else
|
34
|
+
@description = method_id_or_description
|
35
|
+
@method_id = method_id_or_description
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# @private
|
40
|
+
def perform
|
41
|
+
DataMiner::Script.uniq do
|
42
|
+
if blk
|
43
|
+
model.instance_eval(&blk)
|
44
|
+
else
|
45
|
+
model.send method_id
|
46
|
+
end
|
47
|
+
end
|
48
|
+
nil
|
30
49
|
end
|
31
50
|
end
|
32
|
-
nil
|
33
51
|
end
|
34
52
|
end
|
data/lib/data_miner/step/tap.rb
CHANGED
@@ -1,134 +1,167 @@
|
|
1
1
|
require 'uri'
|
2
|
-
# Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
|
3
|
-
#
|
4
|
-
# This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
|
5
|
-
class DataMiner::Step::Tap
|
6
|
-
DEFAULT_PORTS = {
|
7
|
-
:mysql => 3306,
|
8
|
-
:mysql2 => 3306,
|
9
|
-
:postgres => 5432
|
10
|
-
}
|
11
|
-
|
12
|
-
DEFAULT_USERNAMES = {
|
13
|
-
:mysql => 'root',
|
14
|
-
:mysql2 => 'root',
|
15
|
-
:postgres => ''
|
16
|
-
}
|
17
|
-
|
18
|
-
DEFAULT_PASSWORDS = {}
|
19
|
-
DEFAULT_PASSWORDS.default = ''
|
20
|
-
|
21
|
-
DEFAULT_HOSTS = {}
|
22
|
-
DEFAULT_HOSTS.default = '127.0.0.1'
|
23
2
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
3
|
+
class DataMiner
|
4
|
+
class Step
|
5
|
+
# A step that uses https://github.com/ricardochimal/taps to import table structure and data.
|
6
|
+
#
|
7
|
+
# Create these by calling +tap+ inside a +data_miner+ block.
|
8
|
+
#
|
9
|
+
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
10
|
+
# @see DataMiner::Script#tap
|
11
|
+
class Tap < Step
|
12
|
+
DEFAULT_PORTS = {
|
13
|
+
:mysql => 3306,
|
14
|
+
:mysql2 => 3306,
|
15
|
+
:postgres => 5432
|
16
|
+
}
|
17
|
+
|
18
|
+
DEFAULT_USERNAMES = {
|
19
|
+
:mysql => 'root',
|
20
|
+
:mysql2 => 'root',
|
21
|
+
:postgres => ''
|
22
|
+
}
|
23
|
+
|
24
|
+
DEFAULT_PASSWORDS = {}
|
25
|
+
DEFAULT_PASSWORDS.default = ''
|
26
|
+
|
27
|
+
DEFAULT_HOSTS = {}
|
28
|
+
DEFAULT_HOSTS.default = '127.0.0.1'
|
29
29
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
30
|
+
# @private
|
31
|
+
attr_reader :script
|
32
|
+
|
33
|
+
# A description of the tapped data source.
|
34
|
+
# @return [String]
|
35
|
+
attr_reader :description
|
36
|
+
|
37
|
+
# The URL of the tapped data source, including username, password, domain, and port number.
|
38
|
+
# @return [String]
|
39
|
+
attr_reader :source
|
40
|
+
|
41
|
+
# Connection options that will be passed to the +taps pull command+. Defaults to the ActiveRecord connection config, if available.
|
42
|
+
# @return [Hash]
|
43
|
+
attr_reader :database_options
|
44
|
+
|
45
|
+
# Source table name. Defaults to the table name of the model.
|
46
|
+
# @return [String]
|
47
|
+
attr_reader :source_table_name
|
48
|
+
|
49
|
+
# @private
|
50
|
+
def initialize(script, description, source, options = {})
|
51
|
+
options = options.symbolize_keys
|
52
|
+
@script = script
|
53
|
+
@description = description
|
54
|
+
@source = source
|
55
|
+
@source_table_name = options.delete(:source_table_name) || model.table_name
|
56
|
+
@database_options = options.reverse_merge script.model.connection.instance_variable_get(:@config).symbolize_keys
|
57
|
+
end
|
58
|
+
|
59
|
+
# @private
|
60
|
+
def perform
|
61
|
+
[ source_table_name, model.table_name ].each do |possible_obstacle|
|
62
|
+
if connection.table_exists? possible_obstacle
|
63
|
+
connection.drop_table possible_obstacle
|
64
|
+
end
|
65
|
+
end
|
66
|
+
taps_pull
|
67
|
+
if needs_table_rename?
|
68
|
+
connection.rename_table source_table_name, model.table_name
|
69
|
+
end
|
70
|
+
nil
|
71
|
+
end
|
72
|
+
|
73
|
+
# @return [String] The name of the current database.
|
74
|
+
def database
|
75
|
+
unless database = database_options[:database]
|
76
|
+
raise ::ArgumentError, %{[data_miner] Can't infer database name from options or ActiveRecord config.}
|
77
|
+
end
|
78
|
+
database
|
79
|
+
end
|
80
|
+
|
81
|
+
# @return [String] The database username.
|
82
|
+
def username
|
83
|
+
database_options[:username] || DEFAULT_USERNAMES[adapter.to_sym]
|
47
84
|
end
|
48
|
-
end
|
49
|
-
taps_pull
|
50
|
-
if needs_table_rename?
|
51
|
-
connection.rename_table source_table_name, model.table_name
|
52
|
-
end
|
53
|
-
nil
|
54
|
-
end
|
55
|
-
|
56
|
-
# sabshere 1/25/11 what if there were multiple connections
|
57
|
-
# blockenspiel doesn't like to delegate this to #model
|
58
|
-
def connection
|
59
|
-
::ActiveRecord::Base.connection
|
60
|
-
end
|
61
|
-
|
62
|
-
def needs_table_rename?
|
63
|
-
source_table_name != model.table_name
|
64
|
-
end
|
65
|
-
|
66
|
-
def adapter
|
67
|
-
case connection.adapter_name
|
68
|
-
when /mysql2/i
|
69
|
-
'mysql2'
|
70
|
-
when /mysql/i
|
71
|
-
'mysql'
|
72
|
-
when /postgres/i
|
73
|
-
'postgres'
|
74
|
-
when /sqlite/i
|
75
|
-
'sqlite'
|
76
|
-
end
|
77
|
-
end
|
78
85
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
end
|
83
|
-
|
84
|
-
%w{ username password port host }.each do |x|
|
85
|
-
module_eval %{
|
86
|
-
def #{x}
|
87
|
-
database_options[:#{x}] || DEFAULT_#{x.upcase}S[adapter.to_sym]
|
86
|
+
# @return [String] The database password.
|
87
|
+
def password
|
88
|
+
database_options[:password] || DEFAULT_PASSWORDS[adapter.to_sym]
|
88
89
|
end
|
89
|
-
}
|
90
|
-
end
|
91
|
-
|
92
|
-
# "user:pass"
|
93
|
-
# "user"
|
94
|
-
# nil
|
95
|
-
def userinfo
|
96
|
-
if username.present?
|
97
|
-
[username, password].select(&:present?).join(':')
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
def db_url
|
102
|
-
case adapter
|
103
|
-
when 'sqlite'
|
104
|
-
"sqlite://#{database}"
|
105
|
-
else
|
106
|
-
::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
|
107
|
-
end
|
108
|
-
end
|
109
90
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
91
|
+
# @return [String] The database port number.
|
92
|
+
def port
|
93
|
+
database_options[:port] || DEFAULT_PORTS[adapter.to_sym]
|
94
|
+
end
|
95
|
+
|
96
|
+
# @return [String] The database hostname.
|
97
|
+
def host
|
98
|
+
database_options[:host] || DEFAULT_HOSTS[adapter.to_sym]
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
def connection
|
104
|
+
model.connection
|
105
|
+
end
|
106
|
+
|
107
|
+
def needs_table_rename?
|
108
|
+
source_table_name != model.table_name
|
109
|
+
end
|
110
|
+
|
111
|
+
def adapter
|
112
|
+
case connection.adapter_name
|
113
|
+
when /mysql2/i
|
114
|
+
'mysql2'
|
115
|
+
when /mysql/i
|
116
|
+
'mysql'
|
117
|
+
when /postgres/i
|
118
|
+
'postgres'
|
119
|
+
when /sqlite/i
|
120
|
+
'sqlite'
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
# "user:pass"
|
125
|
+
# "user"
|
126
|
+
# nil
|
127
|
+
def userinfo
|
128
|
+
if username.present?
|
129
|
+
[username, password].select(&:present?).join(':')
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
def db_url
|
134
|
+
case adapter
|
135
|
+
when 'sqlite'
|
136
|
+
"sqlite://#{database}"
|
137
|
+
else
|
138
|
+
::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
# Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
|
143
|
+
#
|
144
|
+
# This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
|
145
|
+
def taps_pull
|
146
|
+
args = [
|
147
|
+
'taps',
|
148
|
+
'pull',
|
149
|
+
db_url,
|
150
|
+
source,
|
151
|
+
'--indexes-first',
|
152
|
+
'--tables',
|
153
|
+
source_table_name
|
154
|
+
]
|
155
|
+
|
156
|
+
# https://github.com/carlhuda/bundler/issues/1579
|
157
|
+
if defined?(::Bundler)
|
158
|
+
::Bundler.with_clean_env do
|
159
|
+
::Kernel.system args.join(' ')
|
160
|
+
end
|
161
|
+
else
|
162
|
+
::Kernel.system args.join(' ')
|
163
|
+
end
|
129
164
|
end
|
130
|
-
else
|
131
|
-
::Kernel.system args.join(' ')
|
132
165
|
end
|
133
166
|
end
|
134
167
|
end
|
data/lib/data_miner/version.rb
CHANGED
data/test/test_safety.rb
CHANGED
@@ -7,36 +7,72 @@ Earth.init :locality, :pet, :load_data_miner => true, :apply_schemas => true
|
|
7
7
|
|
8
8
|
describe DataMiner do
|
9
9
|
describe "when being run in a multi-threaded environment" do
|
10
|
+
before do
|
11
|
+
@old_thread_abort_on_exception = Thread.abort_on_exception
|
12
|
+
Thread.abort_on_exception = false
|
13
|
+
end
|
14
|
+
|
15
|
+
after do
|
16
|
+
Thread.abort_on_exception = @old_thread_abort_on_exception
|
17
|
+
end
|
18
|
+
|
10
19
|
it "tries not to duplicate data" do
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
Breed.delete_all
|
21
|
+
Breed.run_data_miner!
|
22
|
+
reference_count = Breed.count
|
23
|
+
Breed.delete_all
|
24
|
+
threads = (0..2).map do |i|
|
25
|
+
Thread.new do
|
26
|
+
# $stderr.write "Thread #{i} starting\n"
|
27
|
+
Breed.run_data_miner!
|
28
|
+
# $stderr.write "Thread #{i} done\n"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
exceptions = []
|
32
|
+
threads.each do |t|
|
33
|
+
begin
|
34
|
+
t.join
|
35
|
+
rescue
|
36
|
+
exceptions << $!
|
24
37
|
end
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
38
|
+
end
|
39
|
+
exceptions.length.must_equal 2
|
40
|
+
exceptions.each do |exception|
|
41
|
+
exception.must_be_kind_of LockMethod::Locked
|
42
|
+
end
|
43
|
+
Breed.count.must_equal reference_count
|
44
|
+
end
|
45
|
+
|
46
|
+
it "allows you to clear locks if necessary" do
|
47
|
+
threads = (0..2).map do |i|
|
48
|
+
Thread.new do
|
49
|
+
# $stderr.write "Thread #{i} starting\n"
|
50
|
+
case i
|
51
|
+
when 0
|
52
|
+
Breed.run_data_miner!
|
53
|
+
when 1
|
54
|
+
sleep 0.3
|
55
|
+
DataMiner::Run.clear_locks
|
56
|
+
Breed.run_data_miner!
|
57
|
+
when 2
|
58
|
+
# i will hit a lock!
|
59
|
+
sleep 0.6
|
60
|
+
Breed.run_data_miner!
|
31
61
|
end
|
62
|
+
# $stderr.write "Thread #{i} done\n"
|
32
63
|
end
|
33
|
-
|
34
|
-
|
35
|
-
|
64
|
+
end
|
65
|
+
exceptions = []
|
66
|
+
threads.each do |t|
|
67
|
+
begin
|
68
|
+
t.join
|
69
|
+
rescue
|
70
|
+
exceptions << $!
|
36
71
|
end
|
37
|
-
|
38
|
-
|
39
|
-
|
72
|
+
end
|
73
|
+
exceptions.length.must_equal 1
|
74
|
+
exceptions.each do |exception|
|
75
|
+
exception.must_be_kind_of LockMethod::Locked
|
40
76
|
end
|
41
77
|
end
|
42
78
|
end
|