data_miner 2.5.2 → 3.0.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +18 -0
- data/Gemfile +0 -2
- data/data_miner.gemspec +3 -7
- data/lib/data_miner.rb +2 -31
- data/lib/data_miner/active_record_class_methods.rb +5 -11
- data/lib/data_miner/attribute.rb +100 -198
- data/lib/data_miner/script.rb +5 -11
- data/lib/data_miner/step/import.rb +41 -27
- data/lib/data_miner/step/sql.rb +10 -10
- data/lib/data_miner/version.rb +1 -1
- data/test/data_miner/step/test_sql.rb +14 -18
- data/test/data_miner/test_attribute.rb +0 -32
- data/test/helper.rb +4 -9
- data/test/support/data_miner_with_alchemist.rb +1 -5
- data/test/support/pet.rb +10 -9
- data/test/support/pet2.rb +1 -1
- data/test/support/pets.csv +2 -2
- data/test/test_data_miner.rb +6 -40
- metadata +9 -97
- data/lib/data_miner/dictionary.rb +0 -84
- data/lib/data_miner/run.rb +0 -144
- data/lib/data_miner/run/column_statistic.rb +0 -78
- data/lib/data_miner/unit_converter.rb +0 -12
- data/lib/data_miner/unit_converter/alchemist.rb +0 -11
- data/lib/data_miner/unit_converter/conversions.rb +0 -11
- data/test/data_miner/step/test_import.rb +0 -35
- data/test/data_miner/unit_converter/test_alchemist.rb +0 -20
- data/test/data_miner/unit_converter/test_conversions.rb +0 -20
- data/test/support/data_miner_with_conversions.rb +0 -16
- data/test/support/data_miner_without_unit_converter.rb +0 -51
- data/test/test_data_miner_run_column_statistic.rb +0 -52
- data/test/test_earth_import.rb +0 -26
- data/test/test_safety.rb +0 -84
- data/test/test_unit_conversion.rb +0 -16
data/lib/data_miner/script.rb
CHANGED
@@ -202,7 +202,7 @@ class DataMiner
|
|
202
202
|
# @note Normally you should use +Country.run_data_miner!+
|
203
203
|
# @note A primitive "call stack" is kept that will prevent infinite loops. So, if Country's data miner script calls Province's AND vice-versa, each one will only be run once.
|
204
204
|
#
|
205
|
-
# @return
|
205
|
+
# @return nil
|
206
206
|
def start
|
207
207
|
model_name = model.name
|
208
208
|
# $stderr.write "0 - #{model_name}\n"
|
@@ -217,17 +217,11 @@ class DataMiner
|
|
217
217
|
Script.current_stack.clear
|
218
218
|
end
|
219
219
|
Script.current_stack << model_name
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
run = Run.new
|
224
|
-
run.model_name = model_name
|
225
|
-
run.start do
|
226
|
-
steps.each do |step|
|
227
|
-
step.start
|
228
|
-
model.reset_column_information
|
229
|
-
end
|
220
|
+
steps.each do |step|
|
221
|
+
step.start
|
222
|
+
model.reset_column_information
|
230
223
|
end
|
224
|
+
nil
|
231
225
|
end
|
232
226
|
|
233
227
|
private
|
@@ -22,24 +22,24 @@ class DataMiner
|
|
22
22
|
|
23
23
|
# @private
|
24
24
|
def initialize(script, description, settings, &blk)
|
25
|
-
settings = settings.
|
26
|
-
if settings.has_key?(
|
25
|
+
settings = settings.stringify_keys
|
26
|
+
if settings.has_key?('table')
|
27
27
|
raise ::ArgumentError, %{[data_miner] :table is no longer an allowed setting.}
|
28
28
|
end
|
29
|
-
if (errata_settings = settings[
|
29
|
+
if (errata_settings = settings['errata']) and not errata_settings.is_a?(::Hash)
|
30
30
|
raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization settings to Errata}
|
31
31
|
end
|
32
32
|
@script = script
|
33
33
|
@attributes = ::ActiveSupport::OrderedHash.new
|
34
|
-
@validate_query = !!settings[
|
34
|
+
@validate_query = !!settings['validate']
|
35
35
|
@description = description
|
36
|
-
if settings.has_key?
|
37
|
-
errata_settings = settings[
|
38
|
-
errata_settings[
|
39
|
-
settings[
|
36
|
+
if settings.has_key? 'errata'
|
37
|
+
errata_settings = settings['errata'].stringify_keys
|
38
|
+
errata_settings['responder'] ||= model
|
39
|
+
settings['errata'] = errata_settings
|
40
40
|
end
|
41
41
|
@table_settings = settings.dup
|
42
|
-
@table_settings[
|
42
|
+
@table_settings['streaming'] = true
|
43
43
|
@table_mutex = ::Mutex.new
|
44
44
|
instance_eval(&blk)
|
45
45
|
end
|
@@ -48,17 +48,17 @@ class DataMiner
|
|
48
48
|
#
|
49
49
|
# @see DataMiner::Attribute The actual Attribute class.
|
50
50
|
#
|
51
|
-
# @param [
|
51
|
+
# @param [String] attr_name The name of the local model column.
|
52
52
|
# @param [optional, Hash] attr_options Options that will be passed to +DataMiner::Attribute.new+
|
53
53
|
# @option attr_options [*] anything Any option for +DataMiner::Attribute+.
|
54
54
|
#
|
55
55
|
# @return [nil]
|
56
|
-
def store(attr_name, attr_options = {})
|
57
|
-
attr_name = attr_name.
|
56
|
+
def store(attr_name, attr_options = {}, &blk)
|
57
|
+
attr_name = attr_name.to_s
|
58
58
|
if attributes.has_key? attr_name
|
59
59
|
raise "You should only call store or key once for #{model.name}##{attr_name}"
|
60
60
|
end
|
61
|
-
attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
|
61
|
+
attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options, &blk
|
62
62
|
end
|
63
63
|
|
64
64
|
# Store data into a model column AND use it as the key.
|
@@ -67,13 +67,13 @@ class DataMiner
|
|
67
67
|
#
|
68
68
|
# Enables idempotency. In other words, you can run the data miner script multiple times, get updated data, and not get duplicate rows.
|
69
69
|
#
|
70
|
-
# @param [
|
70
|
+
# @param [String] attr_name The name of the local model column.
|
71
71
|
# @param [optional, Hash] attr_options Options that will be passed to +DataMiner::Attribute.new+
|
72
72
|
# @option attr_options [*] anything Any option for +DataMiner::Attribute+.
|
73
73
|
#
|
74
74
|
# @return [nil]
|
75
75
|
def key(attr_name, attr_options = {})
|
76
|
-
attr_name = attr_name.
|
76
|
+
attr_name = attr_name.to_s
|
77
77
|
if attributes.has_key? attr_name
|
78
78
|
raise "You should only call store or key once for #{model.name}##{attr_name}"
|
79
79
|
end
|
@@ -83,9 +83,8 @@ class DataMiner
|
|
83
83
|
|
84
84
|
# @private
|
85
85
|
def start
|
86
|
-
upsert_enabled? ? save_with_upsert :
|
86
|
+
upsert_enabled? ? save_with_upsert : save_with_find_or_initialize
|
87
87
|
refresh
|
88
|
-
|
89
88
|
nil
|
90
89
|
end
|
91
90
|
|
@@ -101,13 +100,28 @@ class DataMiner
|
|
101
100
|
(not validate?) and (storing_primary_key? or table_has_autoincrementing_primary_key?)
|
102
101
|
end
|
103
102
|
|
103
|
+
def count_every
|
104
|
+
@count_every ||= ENV.fetch('DATA_MINER_COUNT_EVERY', -1).to_i
|
105
|
+
end
|
106
|
+
|
104
107
|
def save_with_upsert
|
105
108
|
c = model.connection_pool.checkout
|
109
|
+
attrs_except_key = attributes.except(@key).values
|
110
|
+
count = 0
|
106
111
|
Upsert.stream(c, model.table_name) do |upsert|
|
107
112
|
table.each do |row|
|
113
|
+
$stderr.puts "#{count}..." if count_every > 0 and count % count_every == 0
|
114
|
+
count += 1
|
108
115
|
selector = @key ? { @key => attributes[@key].read(row) } : { model.primary_key => nil }
|
109
|
-
document =
|
110
|
-
|
116
|
+
document = attrs_except_key.inject({}) do |memo, attr|
|
117
|
+
attr.updates(row).each do |k, v|
|
118
|
+
case memo[k]
|
119
|
+
when ::Hash
|
120
|
+
memo[k] = memo[k].merge v
|
121
|
+
else
|
122
|
+
memo[k] = v
|
123
|
+
end
|
124
|
+
end
|
111
125
|
memo
|
112
126
|
end
|
113
127
|
upsert.row selector, document
|
@@ -116,8 +130,11 @@ class DataMiner
|
|
116
130
|
model.connection_pool.checkin c
|
117
131
|
end
|
118
132
|
|
119
|
-
def
|
133
|
+
def save_with_find_or_initialize
|
134
|
+
count = 0
|
120
135
|
table.each do |row|
|
136
|
+
$stderr.puts "#{count}..." if count_every > 0 and count % count_every == 0
|
137
|
+
count += 1
|
121
138
|
record = @key ? model.send("find_or_initialize_by_#{@key}", attributes[@key].read(row)) : model.new
|
122
139
|
attributes.each { |_, attr| attr.set_from_row record, row }
|
123
140
|
record.save!
|
@@ -125,7 +142,7 @@ class DataMiner
|
|
125
142
|
end
|
126
143
|
|
127
144
|
def table_has_autoincrementing_primary_key?
|
128
|
-
return @table_has_autoincrementing_primary_key_query
|
145
|
+
return @table_has_autoincrementing_primary_key_query if defined?(@table_has_autoincrementing_primary_key_query)
|
129
146
|
c = model.connection_pool.checkout
|
130
147
|
answer = if (pk = model.primary_key) and model.columns_hash[pk].type == :integer
|
131
148
|
case c.adapter_name
|
@@ -143,14 +160,12 @@ class DataMiner
|
|
143
160
|
end
|
144
161
|
end
|
145
162
|
model.connection_pool.checkin c
|
146
|
-
@table_has_autoincrementing_primary_key_query =
|
147
|
-
answer
|
163
|
+
@table_has_autoincrementing_primary_key_query = answer
|
148
164
|
end
|
149
165
|
|
150
166
|
def storing_primary_key?
|
151
|
-
return @storing_primary_key_query
|
152
|
-
@storing_primary_key_query =
|
153
|
-
@storing_primary_key_query.first
|
167
|
+
return @storing_primary_key_query if defined?(@storing_primary_key_query)
|
168
|
+
@storing_primary_key_query = model.primary_key && attributes.has_key?(model.primary_key)
|
154
169
|
end
|
155
170
|
|
156
171
|
def table
|
@@ -161,7 +176,6 @@ class DataMiner
|
|
161
176
|
|
162
177
|
def refresh
|
163
178
|
@table = nil
|
164
|
-
attributes.each { |_, attr| attr.refresh }
|
165
179
|
nil
|
166
180
|
end
|
167
181
|
end
|
data/lib/data_miner/step/sql.rb
CHANGED
@@ -43,7 +43,7 @@ class DataMiner
|
|
43
43
|
ActiveRecord::Base.connection.execute statement
|
44
44
|
else
|
45
45
|
tmp_path = UnixUtils.curl url
|
46
|
-
send config[
|
46
|
+
send config['adapter'], tmp_path
|
47
47
|
File.unlink tmp_path
|
48
48
|
end
|
49
49
|
end
|
@@ -55,24 +55,24 @@ class DataMiner
|
|
55
55
|
ActiveRecord::Base.connection_config
|
56
56
|
else
|
57
57
|
ActiveRecord::Base.connection_pool.spec.config
|
58
|
-
end
|
58
|
+
end.stringify_keys
|
59
59
|
end
|
60
60
|
|
61
61
|
def mysql(path)
|
62
|
-
connect = if config[
|
63
|
-
[ '--socket', config[
|
62
|
+
connect = if config['socket']
|
63
|
+
[ '--socket', config['socket'] ]
|
64
64
|
else
|
65
|
-
[ '--host', config.fetch(
|
65
|
+
[ '--host', config.fetch('host', '127.0.0.1'), '--port', config.fetch('port', 3306).to_s ]
|
66
66
|
end
|
67
67
|
|
68
68
|
argv = [
|
69
69
|
'mysql',
|
70
70
|
'--compress',
|
71
|
-
'--user', config[
|
72
|
-
"-p#{config[
|
71
|
+
'--user', config['username'],
|
72
|
+
"-p#{config['password']}",
|
73
73
|
connect,
|
74
74
|
'--default-character-set', 'utf8',
|
75
|
-
config[
|
75
|
+
config['database']
|
76
76
|
].flatten
|
77
77
|
|
78
78
|
File.open(path) do |f|
|
@@ -97,7 +97,7 @@ class DataMiner
|
|
97
97
|
argv = [
|
98
98
|
'psql',
|
99
99
|
'--quiet',
|
100
|
-
'--dbname', config[
|
100
|
+
'--dbname', config['database'],
|
101
101
|
'--file', path
|
102
102
|
].flatten
|
103
103
|
|
@@ -113,7 +113,7 @@ class DataMiner
|
|
113
113
|
def sqlite3(path)
|
114
114
|
argv = [
|
115
115
|
'sqlite3',
|
116
|
-
config[
|
116
|
+
config['database']
|
117
117
|
]
|
118
118
|
File.open(path) do |f|
|
119
119
|
pid = POSIX::Spawn.spawn(*(argv+[{:in => f}]))
|
data/lib/data_miner/version.rb
CHANGED
@@ -2,37 +2,33 @@
|
|
2
2
|
require 'helper'
|
3
3
|
init_database
|
4
4
|
|
5
|
-
class
|
6
|
-
self.table_name = '
|
7
|
-
self.primary_key = '
|
5
|
+
class StateBlue < ActiveRecord::Base
|
6
|
+
self.table_name = 'states'
|
7
|
+
self.primary_key = 'postal_abbreviation'
|
8
8
|
data_miner do
|
9
|
-
sql "Brighter Planet's list of
|
9
|
+
sql "Brighter Planet's list of states (as a URL)", 'http://data.brighterplanet.com/states.sql'
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
13
|
-
class
|
14
|
-
self.table_name = '
|
15
|
-
self.primary_key = '
|
13
|
+
class StateRed < ActiveRecord::Base
|
14
|
+
self.table_name = 'states'
|
15
|
+
self.primary_key = 'postal_abbreviation'
|
16
16
|
data_miner do
|
17
|
-
sql "Brighter Planet's list of
|
18
|
-
sql "Mess up weights", %{UPDATE
|
17
|
+
sql "Brighter Planet's list of states (as a URL)", 'http://data.brighterplanet.com/states.sql'
|
18
|
+
sql "Mess up weights", %{UPDATE states SET name = 'Foobar'}
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
22
|
describe DataMiner::Step::Sql do
|
23
23
|
before do
|
24
|
-
|
24
|
+
StateBlue.delete_all rescue nil
|
25
25
|
end
|
26
26
|
it "can be provided as a URL" do
|
27
|
-
|
28
|
-
|
29
|
-
BreedBlue.where(:name => 'Württemberger').count.must_equal 1
|
30
|
-
BreedBlue.find('Afghan Hound').weight.must_be_close_to 24.9476
|
27
|
+
StateBlue.run_data_miner!
|
28
|
+
StateBlue.where(:name => 'Wisconsin').count.must_equal 1
|
31
29
|
end
|
32
30
|
it "can be provided as a string" do
|
33
|
-
|
34
|
-
|
35
|
-
BreedRed.where(:name => 'Württemberger').count.must_equal 1
|
36
|
-
BreedRed.find('Afghan Hound').weight.must_be_close_to 999
|
31
|
+
StateRed.run_data_miner!
|
32
|
+
StateRed.find('NJ').name.must_equal 'Foobar'
|
37
33
|
end
|
38
34
|
end
|
@@ -1,36 +1,4 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
3
|
describe DataMiner::Attribute do
|
4
|
-
before do
|
5
|
-
DataMiner.unit_converter = :alchemist
|
6
|
-
end
|
7
|
-
|
8
|
-
describe '#convert?' do
|
9
|
-
it 'returns true if from_units is set' do
|
10
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :from_units => :pounds, :to_units => :kilograms
|
11
|
-
assert attribute.send(:convert?)
|
12
|
-
end
|
13
|
-
it 'returns true if to_units and units_field_name are set' do
|
14
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_name => 'bar', :to_units => :kilograms
|
15
|
-
assert attribute.send(:convert?)
|
16
|
-
end
|
17
|
-
it 'returns true if to_units and units_field_number are set' do
|
18
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_number => 3, :to_units => :kilograms
|
19
|
-
assert attribute.send(:convert?)
|
20
|
-
end
|
21
|
-
it 'returns false if units_field_name only is set' do
|
22
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_name => 'bar'
|
23
|
-
refute attribute.send(:convert?)
|
24
|
-
end
|
25
|
-
it 'returns false if units_field_number only is set' do
|
26
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_number => 'bar'
|
27
|
-
refute attribute.send(:convert?)
|
28
|
-
end
|
29
|
-
it 'raises if no converter and units are used' do
|
30
|
-
DataMiner.unit_converter = nil
|
31
|
-
lambda {
|
32
|
-
DataMiner::Attribute.new :foo, 'bar', :from_units => :pounds, :to_units => :kilograms
|
33
|
-
}.must_raise ArgumentError, /unit_converter/
|
34
|
-
end
|
35
|
-
end
|
36
4
|
end
|
data/test/helper.rb
CHANGED
@@ -15,14 +15,15 @@ MiniTest::Reporters.use!
|
|
15
15
|
require 'active_record'
|
16
16
|
require 'logger'
|
17
17
|
ActiveRecord::Base.logger = Logger.new $stderr
|
18
|
-
ActiveRecord::Base.logger.level = Logger::INFO
|
19
|
-
# ActiveRecord::Base.logger.level = Logger::DEBUG
|
18
|
+
ActiveRecord::Base.logger.level = (ENV['VERBOSE'] == 'true') ? Logger::DEBUG : Logger::INFO
|
20
19
|
|
21
20
|
ActiveRecord::Base.mass_assignment_sanitizer = :strict
|
22
21
|
|
22
|
+
require 'active_record_inline_schema'
|
23
|
+
|
23
24
|
require 'data_miner'
|
24
25
|
|
25
|
-
def init_database
|
26
|
+
def init_database
|
26
27
|
case ENV['DATABASE']
|
27
28
|
when /postgr/i
|
28
29
|
system %{dropdb test_data_miner}
|
@@ -46,12 +47,6 @@ def init_database(unit_converter = :conversions)
|
|
46
47
|
'password' => 'password'
|
47
48
|
)
|
48
49
|
end
|
49
|
-
|
50
|
-
DataMiner::Run.auto_upgrade!
|
51
|
-
DataMiner::Run::ColumnStatistic.auto_upgrade!
|
52
|
-
DataMiner::Run.clear_locks
|
53
|
-
|
54
|
-
DataMiner.unit_converter = unit_converter
|
55
50
|
end
|
56
51
|
|
57
52
|
def init_models
|
@@ -2,12 +2,8 @@ require 'helper'
|
|
2
2
|
|
3
3
|
describe 'DataMiner with Alchemist' do
|
4
4
|
before do
|
5
|
-
init_database
|
5
|
+
init_database
|
6
6
|
init_models
|
7
7
|
Pet.run_data_miner!
|
8
8
|
end
|
9
|
-
|
10
|
-
it 'converts convertible units' do
|
11
|
-
Pet.find('Pierre').weight.must_be_close_to 4.4.pounds.to.kilograms.to_f
|
12
|
-
end
|
13
9
|
end
|
data/test/support/pet.rb
CHANGED
@@ -9,26 +9,27 @@ class Pet < ActiveRecord::Base
|
|
9
9
|
col :breed_id
|
10
10
|
col :color_id
|
11
11
|
col :age, :type => :integer
|
12
|
-
col :age_units
|
13
12
|
col :weight, :type => :float
|
14
|
-
col :weight_units
|
15
13
|
col :height, :type => :float
|
16
|
-
col :height_units
|
17
14
|
col :favorite_food
|
18
15
|
col :command_phrase
|
16
|
+
col :emphatic_command_phrase
|
19
17
|
belongs_to :breed
|
20
18
|
data_miner do
|
21
19
|
process :auto_upgrade!
|
22
20
|
process :run_data_miner_on_parent_associations!
|
23
21
|
import("A list of pets", :url => "file://#{PETS}") do
|
24
22
|
key :name
|
25
|
-
store :age
|
26
|
-
store :breed_id, :field_name => :breed
|
27
|
-
store :color_id, :field_name => :color, :dictionary =>
|
28
|
-
store :weight
|
29
|
-
store :favorite_food
|
23
|
+
store :age
|
24
|
+
store :breed_id, :field_name => :breed
|
25
|
+
store :color_id, :field_name => :color, :dictionary => RemoteTable.new("file://#{COLOR_DICTIONARY_ENGLISH}").inject({}) { |memo, row| memo[row['input']] = row['output']; memo }
|
26
|
+
store :weight
|
27
|
+
store :favorite_food
|
30
28
|
store :command_phrase
|
31
|
-
store :height
|
29
|
+
store :height
|
30
|
+
store :emphatic_command_phrase do |row|
|
31
|
+
(row['command_phrase'] + "!!!!!") if row['command_phrase']
|
32
|
+
end
|
32
33
|
end
|
33
34
|
end
|
34
35
|
end
|
data/test/support/pet2.rb
CHANGED
@@ -15,7 +15,7 @@ class Pet2 < ActiveRecord::Base
|
|
15
15
|
end
|
16
16
|
import("Breed numbers based on license number", :url => "file://#{BREED_BY_LICENSE_NUMBER}") do
|
17
17
|
key :license_number
|
18
|
-
store :breed_id, :field_name => :breed
|
18
|
+
store :breed_id, :field_name => :breed
|
19
19
|
end
|
20
20
|
end
|
21
21
|
end
|
data/test/support/pets.csv
CHANGED
@@ -2,5 +2,5 @@ license_number,name,breed,color,age,age_units,weight,height,favorite_food,comman
|
|
2
2
|
111,Pierre,Tabby,GO,4,years,4.4,"3.000,5",tomato,"eh"
|
3
3
|
222,Jerry,Beagle,BR/BL,5,years,10,"3,000.0",cheese,"che"
|
4
4
|
333,Amigo,Spanish Lizarto,GR/BU,17,years," ","300,5",crickets," "
|
5
|
-
444,Johnny,Beagle,BR/BL,2,years,20,"4,000"," ",
|
6
|
-
555,Nemo,,,,,,,,
|
5
|
+
444,Johnny,Beagle,BR/BL,2,years,20,"4,000"," "," oh ok "
|
6
|
+
555,Nemo,,,,,,,,
|