data_miner 2.5.2 → 3.0.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +18 -0
- data/Gemfile +0 -2
- data/data_miner.gemspec +3 -7
- data/lib/data_miner.rb +2 -31
- data/lib/data_miner/active_record_class_methods.rb +5 -11
- data/lib/data_miner/attribute.rb +100 -198
- data/lib/data_miner/script.rb +5 -11
- data/lib/data_miner/step/import.rb +41 -27
- data/lib/data_miner/step/sql.rb +10 -10
- data/lib/data_miner/version.rb +1 -1
- data/test/data_miner/step/test_sql.rb +14 -18
- data/test/data_miner/test_attribute.rb +0 -32
- data/test/helper.rb +4 -9
- data/test/support/data_miner_with_alchemist.rb +1 -5
- data/test/support/pet.rb +10 -9
- data/test/support/pet2.rb +1 -1
- data/test/support/pets.csv +2 -2
- data/test/test_data_miner.rb +6 -40
- metadata +9 -97
- data/lib/data_miner/dictionary.rb +0 -84
- data/lib/data_miner/run.rb +0 -144
- data/lib/data_miner/run/column_statistic.rb +0 -78
- data/lib/data_miner/unit_converter.rb +0 -12
- data/lib/data_miner/unit_converter/alchemist.rb +0 -11
- data/lib/data_miner/unit_converter/conversions.rb +0 -11
- data/test/data_miner/step/test_import.rb +0 -35
- data/test/data_miner/unit_converter/test_alchemist.rb +0 -20
- data/test/data_miner/unit_converter/test_conversions.rb +0 -20
- data/test/support/data_miner_with_conversions.rb +0 -16
- data/test/support/data_miner_without_unit_converter.rb +0 -51
- data/test/test_data_miner_run_column_statistic.rb +0 -52
- data/test/test_earth_import.rb +0 -26
- data/test/test_safety.rb +0 -84
- data/test/test_unit_conversion.rb +0 -16
data/lib/data_miner/script.rb
CHANGED
@@ -202,7 +202,7 @@ class DataMiner
|
|
202
202
|
# @note Normally you should use +Country.run_data_miner!+
|
203
203
|
# @note A primitive "call stack" is kept that will prevent infinite loops. So, if Country's data miner script calls Province's AND vice-versa, each one will only be run once.
|
204
204
|
#
|
205
|
-
# @return
|
205
|
+
# @return nil
|
206
206
|
def start
|
207
207
|
model_name = model.name
|
208
208
|
# $stderr.write "0 - #{model_name}\n"
|
@@ -217,17 +217,11 @@ class DataMiner
|
|
217
217
|
Script.current_stack.clear
|
218
218
|
end
|
219
219
|
Script.current_stack << model_name
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
run = Run.new
|
224
|
-
run.model_name = model_name
|
225
|
-
run.start do
|
226
|
-
steps.each do |step|
|
227
|
-
step.start
|
228
|
-
model.reset_column_information
|
229
|
-
end
|
220
|
+
steps.each do |step|
|
221
|
+
step.start
|
222
|
+
model.reset_column_information
|
230
223
|
end
|
224
|
+
nil
|
231
225
|
end
|
232
226
|
|
233
227
|
private
|
data/lib/data_miner/step/import.rb
CHANGED
@@ -22,24 +22,24 @@ class DataMiner
|
|
22
22
|
|
23
23
|
# @private
|
24
24
|
def initialize(script, description, settings, &blk)
|
25
|
-
settings = settings.
|
26
|
-
if settings.has_key?(
|
25
|
+
settings = settings.stringify_keys
|
26
|
+
if settings.has_key?('table')
|
27
27
|
raise ::ArgumentError, %{[data_miner] :table is no longer an allowed setting.}
|
28
28
|
end
|
29
|
-
if (errata_settings = settings[
|
29
|
+
if (errata_settings = settings['errata']) and not errata_settings.is_a?(::Hash)
|
30
30
|
raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization settings to Errata}
|
31
31
|
end
|
32
32
|
@script = script
|
33
33
|
@attributes = ::ActiveSupport::OrderedHash.new
|
34
|
-
@validate_query = !!settings[
|
34
|
+
@validate_query = !!settings['validate']
|
35
35
|
@description = description
|
36
|
-
if settings.has_key?
|
37
|
-
errata_settings = settings[
|
38
|
-
errata_settings[
|
39
|
-
settings[
|
36
|
+
if settings.has_key? 'errata'
|
37
|
+
errata_settings = settings['errata'].stringify_keys
|
38
|
+
errata_settings['responder'] ||= model
|
39
|
+
settings['errata'] = errata_settings
|
40
40
|
end
|
41
41
|
@table_settings = settings.dup
|
42
|
-
@table_settings[
|
42
|
+
@table_settings['streaming'] = true
|
43
43
|
@table_mutex = ::Mutex.new
|
44
44
|
instance_eval(&blk)
|
45
45
|
end
|
@@ -48,17 +48,17 @@ class DataMiner
|
|
48
48
|
#
|
49
49
|
# @see DataMiner::Attribute The actual Attribute class.
|
50
50
|
#
|
51
|
-
# @param [
|
51
|
+
# @param [String] attr_name The name of the local model column.
|
52
52
|
# @param [optional, Hash] attr_options Options that will be passed to +DataMiner::Attribute.new+
|
53
53
|
# @option attr_options [*] anything Any option for +DataMiner::Attribute+.
|
54
54
|
#
|
55
55
|
# @return [nil]
|
56
|
-
def store(attr_name, attr_options = {})
|
57
|
-
attr_name = attr_name.
|
56
|
+
def store(attr_name, attr_options = {}, &blk)
|
57
|
+
attr_name = attr_name.to_s
|
58
58
|
if attributes.has_key? attr_name
|
59
59
|
raise "You should only call store or key once for #{model.name}##{attr_name}"
|
60
60
|
end
|
61
|
-
attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
|
61
|
+
attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options, &blk
|
62
62
|
end
|
63
63
|
|
64
64
|
# Store data into a model column AND use it as the key.
|
@@ -67,13 +67,13 @@ class DataMiner
|
|
67
67
|
#
|
68
68
|
# Enables idempotency. In other words, you can run the data miner script multiple times, get updated data, and not get duplicate rows.
|
69
69
|
#
|
70
|
-
# @param [
|
70
|
+
# @param [String] attr_name The name of the local model column.
|
71
71
|
# @param [optional, Hash] attr_options Options that will be passed to +DataMiner::Attribute.new+
|
72
72
|
# @option attr_options [*] anything Any option for +DataMiner::Attribute+.
|
73
73
|
#
|
74
74
|
# @return [nil]
|
75
75
|
def key(attr_name, attr_options = {})
|
76
|
-
attr_name = attr_name.
|
76
|
+
attr_name = attr_name.to_s
|
77
77
|
if attributes.has_key? attr_name
|
78
78
|
raise "You should only call store or key once for #{model.name}##{attr_name}"
|
79
79
|
end
|
@@ -83,9 +83,8 @@ class DataMiner
|
|
83
83
|
|
84
84
|
# @private
|
85
85
|
def start
|
86
|
-
upsert_enabled? ? save_with_upsert :
|
86
|
+
upsert_enabled? ? save_with_upsert : save_with_find_or_initialize
|
87
87
|
refresh
|
88
|
-
|
89
88
|
nil
|
90
89
|
end
|
91
90
|
|
@@ -101,13 +100,28 @@ class DataMiner
|
|
101
100
|
(not validate?) and (storing_primary_key? or table_has_autoincrementing_primary_key?)
|
102
101
|
end
|
103
102
|
|
103
|
+
def count_every
|
104
|
+
@count_every ||= ENV.fetch('DATA_MINER_COUNT_EVERY', -1).to_i
|
105
|
+
end
|
106
|
+
|
104
107
|
def save_with_upsert
|
105
108
|
c = model.connection_pool.checkout
|
109
|
+
attrs_except_key = attributes.except(@key).values
|
110
|
+
count = 0
|
106
111
|
Upsert.stream(c, model.table_name) do |upsert|
|
107
112
|
table.each do |row|
|
113
|
+
$stderr.puts "#{count}..." if count_every > 0 and count % count_every == 0
|
114
|
+
count += 1
|
108
115
|
selector = @key ? { @key => attributes[@key].read(row) } : { model.primary_key => nil }
|
109
|
-
document =
|
110
|
-
|
116
|
+
document = attrs_except_key.inject({}) do |memo, attr|
|
117
|
+
attr.updates(row).each do |k, v|
|
118
|
+
case memo[k]
|
119
|
+
when ::Hash
|
120
|
+
memo[k] = memo[k].merge v
|
121
|
+
else
|
122
|
+
memo[k] = v
|
123
|
+
end
|
124
|
+
end
|
111
125
|
memo
|
112
126
|
end
|
113
127
|
upsert.row selector, document
|
@@ -116,8 +130,11 @@ class DataMiner
|
|
116
130
|
model.connection_pool.checkin c
|
117
131
|
end
|
118
132
|
|
119
|
-
def
|
133
|
+
def save_with_find_or_initialize
|
134
|
+
count = 0
|
120
135
|
table.each do |row|
|
136
|
+
$stderr.puts "#{count}..." if count_every > 0 and count % count_every == 0
|
137
|
+
count += 1
|
121
138
|
record = @key ? model.send("find_or_initialize_by_#{@key}", attributes[@key].read(row)) : model.new
|
122
139
|
attributes.each { |_, attr| attr.set_from_row record, row }
|
123
140
|
record.save!
|
@@ -125,7 +142,7 @@ class DataMiner
|
|
125
142
|
end
|
126
143
|
|
127
144
|
def table_has_autoincrementing_primary_key?
|
128
|
-
return @table_has_autoincrementing_primary_key_query
|
145
|
+
return @table_has_autoincrementing_primary_key_query if defined?(@table_has_autoincrementing_primary_key_query)
|
129
146
|
c = model.connection_pool.checkout
|
130
147
|
answer = if (pk = model.primary_key) and model.columns_hash[pk].type == :integer
|
131
148
|
case c.adapter_name
|
@@ -143,14 +160,12 @@ class DataMiner
|
|
143
160
|
end
|
144
161
|
end
|
145
162
|
model.connection_pool.checkin c
|
146
|
-
@table_has_autoincrementing_primary_key_query =
|
147
|
-
answer
|
163
|
+
@table_has_autoincrementing_primary_key_query = answer
|
148
164
|
end
|
149
165
|
|
150
166
|
def storing_primary_key?
|
151
|
-
return @storing_primary_key_query
|
152
|
-
@storing_primary_key_query =
|
153
|
-
@storing_primary_key_query.first
|
167
|
+
return @storing_primary_key_query if defined?(@storing_primary_key_query)
|
168
|
+
@storing_primary_key_query = model.primary_key && attributes.has_key?(model.primary_key)
|
154
169
|
end
|
155
170
|
|
156
171
|
def table
|
@@ -161,7 +176,6 @@ class DataMiner
|
|
161
176
|
|
162
177
|
def refresh
|
163
178
|
@table = nil
|
164
|
-
attributes.each { |_, attr| attr.refresh }
|
165
179
|
nil
|
166
180
|
end
|
167
181
|
end
|
data/lib/data_miner/step/sql.rb
CHANGED
@@ -43,7 +43,7 @@ class DataMiner
|
|
43
43
|
ActiveRecord::Base.connection.execute statement
|
44
44
|
else
|
45
45
|
tmp_path = UnixUtils.curl url
|
46
|
-
send config[
|
46
|
+
send config['adapter'], tmp_path
|
47
47
|
File.unlink tmp_path
|
48
48
|
end
|
49
49
|
end
|
@@ -55,24 +55,24 @@ class DataMiner
|
|
55
55
|
ActiveRecord::Base.connection_config
|
56
56
|
else
|
57
57
|
ActiveRecord::Base.connection_pool.spec.config
|
58
|
-
end
|
58
|
+
end.stringify_keys
|
59
59
|
end
|
60
60
|
|
61
61
|
def mysql(path)
|
62
|
-
connect = if config[
|
63
|
-
[ '--socket', config[
|
62
|
+
connect = if config['socket']
|
63
|
+
[ '--socket', config['socket'] ]
|
64
64
|
else
|
65
|
-
[ '--host', config.fetch(
|
65
|
+
[ '--host', config.fetch('host', '127.0.0.1'), '--port', config.fetch('port', 3306).to_s ]
|
66
66
|
end
|
67
67
|
|
68
68
|
argv = [
|
69
69
|
'mysql',
|
70
70
|
'--compress',
|
71
|
-
'--user', config[
|
72
|
-
"-p#{config[
|
71
|
+
'--user', config['username'],
|
72
|
+
"-p#{config['password']}",
|
73
73
|
connect,
|
74
74
|
'--default-character-set', 'utf8',
|
75
|
-
config[
|
75
|
+
config['database']
|
76
76
|
].flatten
|
77
77
|
|
78
78
|
File.open(path) do |f|
|
@@ -97,7 +97,7 @@ class DataMiner
|
|
97
97
|
argv = [
|
98
98
|
'psql',
|
99
99
|
'--quiet',
|
100
|
-
'--dbname', config[
|
100
|
+
'--dbname', config['database'],
|
101
101
|
'--file', path
|
102
102
|
].flatten
|
103
103
|
|
@@ -113,7 +113,7 @@ class DataMiner
|
|
113
113
|
def sqlite3(path)
|
114
114
|
argv = [
|
115
115
|
'sqlite3',
|
116
|
-
config[
|
116
|
+
config['database']
|
117
117
|
]
|
118
118
|
File.open(path) do |f|
|
119
119
|
pid = POSIX::Spawn.spawn(*(argv+[{:in => f}]))
|
data/lib/data_miner/version.rb
CHANGED
data/test/data_miner/step/test_sql.rb
CHANGED
@@ -2,37 +2,33 @@
|
|
2
2
|
require 'helper'
|
3
3
|
init_database
|
4
4
|
|
5
|
-
class
|
6
|
-
self.table_name = '
|
7
|
-
self.primary_key = '
|
5
|
+
class StateBlue < ActiveRecord::Base
|
6
|
+
self.table_name = 'states'
|
7
|
+
self.primary_key = 'postal_abbreviation'
|
8
8
|
data_miner do
|
9
|
-
sql "Brighter Planet's list of
|
9
|
+
sql "Brighter Planet's list of states (as a URL)", 'http://data.brighterplanet.com/states.sql'
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
13
|
-
class
|
14
|
-
self.table_name = '
|
15
|
-
self.primary_key = '
|
13
|
+
class StateRed < ActiveRecord::Base
|
14
|
+
self.table_name = 'states'
|
15
|
+
self.primary_key = 'postal_abbreviation'
|
16
16
|
data_miner do
|
17
|
-
sql "Brighter Planet's list of
|
18
|
-
sql "Mess up weights", %{UPDATE
|
17
|
+
sql "Brighter Planet's list of states (as a URL)", 'http://data.brighterplanet.com/states.sql'
|
18
|
+
sql "Mess up weights", %{UPDATE states SET name = 'Foobar'}
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
22
|
describe DataMiner::Step::Sql do
|
23
23
|
before do
|
24
|
-
|
24
|
+
StateBlue.delete_all rescue nil
|
25
25
|
end
|
26
26
|
it "can be provided as a URL" do
|
27
|
-
|
28
|
-
|
29
|
-
BreedBlue.where(:name => 'Württemberger').count.must_equal 1
|
30
|
-
BreedBlue.find('Afghan Hound').weight.must_be_close_to 24.9476
|
27
|
+
StateBlue.run_data_miner!
|
28
|
+
StateBlue.where(:name => 'Wisconsin').count.must_equal 1
|
31
29
|
end
|
32
30
|
it "can be provided as a string" do
|
33
|
-
|
34
|
-
|
35
|
-
BreedRed.where(:name => 'Württemberger').count.must_equal 1
|
36
|
-
BreedRed.find('Afghan Hound').weight.must_be_close_to 999
|
31
|
+
StateRed.run_data_miner!
|
32
|
+
StateRed.find('NJ').name.must_equal 'Foobar'
|
37
33
|
end
|
38
34
|
end
|
data/test/data_miner/test_attribute.rb
CHANGED
@@ -1,36 +1,4 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
3
|
describe DataMiner::Attribute do
|
4
|
-
before do
|
5
|
-
DataMiner.unit_converter = :alchemist
|
6
|
-
end
|
7
|
-
|
8
|
-
describe '#convert?' do
|
9
|
-
it 'returns true if from_units is set' do
|
10
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :from_units => :pounds, :to_units => :kilograms
|
11
|
-
assert attribute.send(:convert?)
|
12
|
-
end
|
13
|
-
it 'returns true if to_units and units_field_name are set' do
|
14
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_name => 'bar', :to_units => :kilograms
|
15
|
-
assert attribute.send(:convert?)
|
16
|
-
end
|
17
|
-
it 'returns true if to_units and units_field_number are set' do
|
18
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_number => 3, :to_units => :kilograms
|
19
|
-
assert attribute.send(:convert?)
|
20
|
-
end
|
21
|
-
it 'returns false if units_field_name only is set' do
|
22
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_name => 'bar'
|
23
|
-
refute attribute.send(:convert?)
|
24
|
-
end
|
25
|
-
it 'returns false if units_field_number only is set' do
|
26
|
-
attribute = DataMiner::Attribute.new :foo, 'bar', :units_field_number => 'bar'
|
27
|
-
refute attribute.send(:convert?)
|
28
|
-
end
|
29
|
-
it 'raises if no converter and units are used' do
|
30
|
-
DataMiner.unit_converter = nil
|
31
|
-
lambda {
|
32
|
-
DataMiner::Attribute.new :foo, 'bar', :from_units => :pounds, :to_units => :kilograms
|
33
|
-
}.must_raise ArgumentError, /unit_converter/
|
34
|
-
end
|
35
|
-
end
|
36
4
|
end
|
data/test/helper.rb
CHANGED
@@ -15,14 +15,15 @@ MiniTest::Reporters.use!
|
|
15
15
|
require 'active_record'
|
16
16
|
require 'logger'
|
17
17
|
ActiveRecord::Base.logger = Logger.new $stderr
|
18
|
-
ActiveRecord::Base.logger.level = Logger::INFO
|
19
|
-
# ActiveRecord::Base.logger.level = Logger::DEBUG
|
18
|
+
ActiveRecord::Base.logger.level = (ENV['VERBOSE'] == 'true') ? Logger::DEBUG : Logger::INFO
|
20
19
|
|
21
20
|
ActiveRecord::Base.mass_assignment_sanitizer = :strict
|
22
21
|
|
22
|
+
require 'active_record_inline_schema'
|
23
|
+
|
23
24
|
require 'data_miner'
|
24
25
|
|
25
|
-
def init_database(unit_converter = :conversions)
|
26
|
+
def init_database
|
26
27
|
case ENV['DATABASE']
|
27
28
|
when /postgr/i
|
28
29
|
system %{dropdb test_data_miner}
|
@@ -46,12 +47,6 @@ def init_database(unit_converter = :conversions)
|
|
46
47
|
'password' => 'password'
|
47
48
|
)
|
48
49
|
end
|
49
|
-
|
50
|
-
DataMiner::Run.auto_upgrade!
|
51
|
-
DataMiner::Run::ColumnStatistic.auto_upgrade!
|
52
|
-
DataMiner::Run.clear_locks
|
53
|
-
|
54
|
-
DataMiner.unit_converter = unit_converter
|
55
50
|
end
|
56
51
|
|
57
52
|
def init_models
|
data/test/support/data_miner_with_alchemist.rb
CHANGED
@@ -2,12 +2,8 @@ require 'helper'
|
|
2
2
|
|
3
3
|
describe 'DataMiner with Alchemist' do
|
4
4
|
before do
|
5
|
-
init_database
|
5
|
+
init_database
|
6
6
|
init_models
|
7
7
|
Pet.run_data_miner!
|
8
8
|
end
|
9
|
-
|
10
|
-
it 'converts convertible units' do
|
11
|
-
Pet.find('Pierre').weight.must_be_close_to 4.4.pounds.to.kilograms.to_f
|
12
|
-
end
|
13
9
|
end
|
data/test/support/pet.rb
CHANGED
@@ -9,26 +9,27 @@ class Pet < ActiveRecord::Base
|
|
9
9
|
col :breed_id
|
10
10
|
col :color_id
|
11
11
|
col :age, :type => :integer
|
12
|
-
col :age_units
|
13
12
|
col :weight, :type => :float
|
14
|
-
col :weight_units
|
15
13
|
col :height, :type => :float
|
16
|
-
col :height_units
|
17
14
|
col :favorite_food
|
18
15
|
col :command_phrase
|
16
|
+
col :emphatic_command_phrase
|
19
17
|
belongs_to :breed
|
20
18
|
data_miner do
|
21
19
|
process :auto_upgrade!
|
22
20
|
process :run_data_miner_on_parent_associations!
|
23
21
|
import("A list of pets", :url => "file://#{PETS}") do
|
24
22
|
key :name
|
25
|
-
store :age
|
26
|
-
store :breed_id, :field_name => :breed
|
27
|
-
store :color_id, :field_name => :color, :dictionary =>
|
28
|
-
store :weight
|
29
|
-
store :favorite_food
|
23
|
+
store :age
|
24
|
+
store :breed_id, :field_name => :breed
|
25
|
+
store :color_id, :field_name => :color, :dictionary => RemoteTable.new("file://#{COLOR_DICTIONARY_ENGLISH}").inject({}) { |memo, row| memo[row['input']] = row['output']; memo }
|
26
|
+
store :weight
|
27
|
+
store :favorite_food
|
30
28
|
store :command_phrase
|
31
|
-
store :height
|
29
|
+
store :height
|
30
|
+
store :emphatic_command_phrase do |row|
|
31
|
+
(row['command_phrase'] + "!!!!!") if row['command_phrase']
|
32
|
+
end
|
32
33
|
end
|
33
34
|
end
|
34
35
|
end
|
data/test/support/pet2.rb
CHANGED
@@ -15,7 +15,7 @@ class Pet2 < ActiveRecord::Base
|
|
15
15
|
end
|
16
16
|
import("Breed numbers based on license number", :url => "file://#{BREED_BY_LICENSE_NUMBER}") do
|
17
17
|
key :license_number
|
18
|
-
store :breed_id, :field_name => :breed
|
18
|
+
store :breed_id, :field_name => :breed
|
19
19
|
end
|
20
20
|
end
|
21
21
|
end
|
data/test/support/pets.csv
CHANGED
@@ -2,5 +2,5 @@ license_number,name,breed,color,age,age_units,weight,height,favorite_food,comman
|
|
2
2
|
111,Pierre,Tabby,GO,4,years,4.4,"3.000,5",tomato,"eh"
|
3
3
|
222,Jerry,Beagle,BR/BL,5,years,10,"3,000.0",cheese,"che"
|
4
4
|
333,Amigo,Spanish Lizarto,GR/BU,17,years," ","300,5",crickets," "
|
5
|
-
444,Johnny,Beagle,BR/BL,2,years,20,"4,000"," ",
|
6
|
-
555,Nemo,,,,,,,,
|
5
|
+
444,Johnny,Beagle,BR/BL,2,years,20,"4,000"," "," oh ok "
|
6
|
+
555,Nemo,,,,,,,,
|