data_miner 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -0
- data/lib/data_miner.rb +10 -0
- data/lib/data_miner/run.rb +46 -2
- data/lib/data_miner/run/column_statistic.rb +87 -0
- data/lib/data_miner/script.rb +3 -0
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +66 -1
- data/test/support/pets.csv +1 -0
- data/test/test_data_miner.rb +7 -63
- data/test/test_data_miner_run_column_statistic.rb +43 -0
- metadata +5 -2
data/CHANGELOG
CHANGED
data/lib/data_miner.rb
CHANGED
@@ -96,6 +96,16 @@ class DataMiner
|
|
96
96
|
end
|
97
97
|
end
|
98
98
|
|
99
|
+
# Whether per-column stats (max, min, average, standard deviation, etc.) are enabled.
|
100
|
+
def per_column_statistics?
|
101
|
+
@per_column_statistics == true
|
102
|
+
end
|
103
|
+
|
104
|
+
# Turn on or off per-column stats.
|
105
|
+
def per_column_statistics=(boolean)
|
106
|
+
@per_column_statistics = boolean
|
107
|
+
end
|
108
|
+
|
99
109
|
class << self
|
100
110
|
delegate(*DataMiner.instance_methods(false), :to => :instance)
|
101
111
|
end
|
data/lib/data_miner/run.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'aasm'
|
2
2
|
require 'active_record_inline_schema'
|
3
3
|
|
4
|
+
require 'data_miner/run/column_statistic'
|
5
|
+
|
4
6
|
class DataMiner
|
5
7
|
# A record of what happened when you ran a data miner script.
|
6
8
|
#
|
@@ -57,6 +59,14 @@ class DataMiner
|
|
57
59
|
col :stopped_at, :type => :datetime
|
58
60
|
col :updated_at, :type => :datetime
|
59
61
|
col :error, :type => :text
|
62
|
+
col :row_count_before, :type => :integer
|
63
|
+
col :row_count_after, :type => :integer
|
64
|
+
add_index :model_name
|
65
|
+
add_index :aasm_state
|
66
|
+
|
67
|
+
validates_presence_of :model_name
|
68
|
+
|
69
|
+
has_many :column_statistics, :class_name => 'DataMiner::Run::ColumnStatistic'
|
60
70
|
|
61
71
|
include ::AASM
|
62
72
|
aasm_initial_state INITIAL_STATE
|
@@ -68,11 +78,16 @@ class DataMiner
|
|
68
78
|
aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
|
69
79
|
aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
|
70
80
|
|
71
|
-
validates_presence_of :model_name
|
72
|
-
|
73
81
|
# @private
|
74
82
|
def start
|
83
|
+
model = model_name.constantize
|
84
|
+
if model.table_exists?
|
85
|
+
self.row_count_before = model.count
|
86
|
+
end
|
75
87
|
save!
|
88
|
+
if DataMiner.per_column_statistics?
|
89
|
+
ColumnStatistic.before self
|
90
|
+
end
|
76
91
|
begin
|
77
92
|
catch :data_miner_succeed do
|
78
93
|
yield
|
@@ -85,6 +100,10 @@ class DataMiner
|
|
85
100
|
fail!
|
86
101
|
raise $!
|
87
102
|
ensure
|
103
|
+
self.row_count_after = model.count
|
104
|
+
if DataMiner.per_column_statistics?
|
105
|
+
ColumnStatistic.after self
|
106
|
+
end
|
88
107
|
self.stopped_at = ::Time.now
|
89
108
|
save!
|
90
109
|
DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
|
@@ -92,6 +111,31 @@ class DataMiner
|
|
92
111
|
self
|
93
112
|
end
|
94
113
|
|
114
|
+
# Get the column statistics for a particular column before or after this run.
|
115
|
+
#
|
116
|
+
# @param [String] column_name The column you want to know about.
|
117
|
+
# @param ["before","after"] period Whether you want to know about before or after the run.
|
118
|
+
#
|
119
|
+
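# @return [ColumnStatistic, nil] the stored statistic if one exists, otherwise a new unsaved one; nil if the model's table does not exist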
|
120
|
+
def column_statistics_for(column_name, period)
|
121
|
+
column_name = column_name.to_s
|
122
|
+
period = period.to_s
|
123
|
+
model = model_name.constantize
|
124
|
+
if existing = column_statistics.where(:column_name => column_name, :period => period).first
|
125
|
+
existing
|
126
|
+
elsif model.table_exists?
|
127
|
+
unless model.column_names.include?(column_name)
|
128
|
+
raise ::ArgumentError, %{[data_miner] Nonexistent column #{column_name.inspect} on #{model_name}}
|
129
|
+
end
|
130
|
+
blank = ColumnStatistic.new
|
131
|
+
blank.run = self
|
132
|
+
blank.model_name = model_name
|
133
|
+
blank.period = period
|
134
|
+
blank.column_name = column_name
|
135
|
+
blank
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
95
139
|
# @private
|
96
140
|
def as_lock
|
97
141
|
database_name = Run.connection.instance_variable_get(:@config).try(:[], :database)
|
@@ -0,0 +1,87 @@
|
|
1
|
+
class DataMiner
|
2
|
+
class Run < ::ActiveRecord::Base
|
3
|
+
# If +DataMiner.per_column_statistics?+, this model keeps per-column stats (max, min, average, standard deviation, etc.).
|
4
|
+
#
|
5
|
+
# Each +DataMiner::Run+ will have two of these for every column: a "before" and an "after".
|
6
|
+
class ColumnStatistic < ::ActiveRecord::Base
|
7
|
+
class << self
|
8
|
+
# @private
|
9
|
+
def before(run)
|
10
|
+
period run, 'before'
|
11
|
+
end
|
12
|
+
|
13
|
+
# @private
|
14
|
+
def after(run)
|
15
|
+
period run, 'after'
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def period(run, period)
|
21
|
+
unless table_exists?
|
22
|
+
auto_upgrade!
|
23
|
+
end
|
24
|
+
model = run.model_name.constantize
|
25
|
+
return unless model.table_exists?
|
26
|
+
model.column_names.each do |column_name|
|
27
|
+
column_statistic = new
|
28
|
+
column_statistic.run = run
|
29
|
+
column_statistic.model_name = run.model_name
|
30
|
+
column_statistic.period = period
|
31
|
+
column_statistic.column_name = column_name
|
32
|
+
column_statistic.perform_calculations
|
33
|
+
column_statistic.save!
|
34
|
+
end
|
35
|
+
nil
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
NUMERIC = [
|
41
|
+
:integer,
|
42
|
+
:float,
|
43
|
+
:decimal,
|
44
|
+
]
|
45
|
+
|
46
|
+
self.table_name = 'data_miner_run_column_statistics'
|
47
|
+
|
48
|
+
belongs_to :run, :class_name => 'DataMiner::Run'
|
49
|
+
|
50
|
+
col :run_id, :type => :integer
|
51
|
+
col :model_name
|
52
|
+
col :period
|
53
|
+
col :column_name
|
54
|
+
col :null_count, :type => :integer
|
55
|
+
col :max
|
56
|
+
col :min
|
57
|
+
col :average, :type => :float
|
58
|
+
col :standard_deviation, :type => :float
|
59
|
+
col :sum, :type => :float
|
60
|
+
add_index :run_id
|
61
|
+
add_index :model_name
|
62
|
+
|
63
|
+
# @private
|
64
|
+
def perform_calculations
|
65
|
+
model = run.model_name.constantize
|
66
|
+
|
67
|
+
self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
|
68
|
+
self.max = calculate(:MAX).inspect
|
69
|
+
self.min = calculate(:MIN).inspect
|
70
|
+
|
71
|
+
column = model.columns_hash[column_name]
|
72
|
+
if NUMERIC.include?(column.type)
|
73
|
+
self.average = calculate :AVG
|
74
|
+
self.standard_deviation = calculate :STDDEV
|
75
|
+
self.sum = calculate :SUM
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
private
|
80
|
+
|
81
|
+
def calculate(operation)
|
82
|
+
model = run.model_name.constantize
|
83
|
+
model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/data_miner/script.rb
CHANGED
data/lib/data_miner/version.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -11,7 +11,7 @@ require 'minitest/reporters'
|
|
11
11
|
MiniTest::Unit.runner = MiniTest::SuiteRunner.new
|
12
12
|
MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
|
13
13
|
|
14
|
-
cmd = %{mysql -u root -ppassword -e "
|
14
|
+
cmd = %{mysql -u root -ppassword -e "DROP DATABASE data_miner_test; CREATE DATABASE data_miner_test CHARSET utf8"}
|
15
15
|
$stderr.puts "Running `#{cmd}`..."
|
16
16
|
system cmd
|
17
17
|
$stderr.puts "Done."
|
@@ -28,6 +28,71 @@ ActiveRecord::Base.establish_connection(
|
|
28
28
|
'password' => 'password'
|
29
29
|
)
|
30
30
|
|
31
|
+
ActiveRecord::Base.mass_assignment_sanitizer = :strict
|
32
|
+
|
31
33
|
require 'data_miner'
|
32
34
|
DataMiner::Run.auto_upgrade!
|
35
|
+
DataMiner::Run::ColumnStatistic.auto_upgrade!
|
33
36
|
DataMiner::Run.clear_locks
|
37
|
+
|
38
|
+
PETS = File.expand_path('../support/pets.csv', __FILE__)
|
39
|
+
PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
|
40
|
+
COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
|
41
|
+
COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
|
42
|
+
BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
|
43
|
+
|
44
|
+
class Pet < ActiveRecord::Base
|
45
|
+
self.primary_key = "name"
|
46
|
+
col :name
|
47
|
+
col :breed_id
|
48
|
+
col :color_id
|
49
|
+
col :age, :type => :integer
|
50
|
+
col :age_units
|
51
|
+
col :weight, :type => :float
|
52
|
+
col :weight_units
|
53
|
+
col :height, :type => :integer
|
54
|
+
col :height_units
|
55
|
+
col :favorite_food
|
56
|
+
col :command_phrase
|
57
|
+
belongs_to :breed
|
58
|
+
data_miner do
|
59
|
+
process :auto_upgrade!
|
60
|
+
process :run_data_miner_on_parent_associations!
|
61
|
+
import("A list of pets", :url => "file://#{PETS}") do
|
62
|
+
key :name
|
63
|
+
store :age, :units_field_name => 'age_units'
|
64
|
+
store :breed_id, :field_name => :breed, :nullify_blank_strings => true
|
65
|
+
store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
|
66
|
+
store :weight, :from_units => :pounds, :to_units => :kilograms
|
67
|
+
store :favorite_food, :nullify_blank_strings => true
|
68
|
+
store :command_phrase
|
69
|
+
store :height, :units => :centimetres
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
class Breed < ActiveRecord::Base
|
75
|
+
class << self
|
76
|
+
def update_average_age!
|
77
|
+
# make sure pet is populated
|
78
|
+
Pet.run_data_miner!
|
79
|
+
update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
|
80
|
+
end
|
81
|
+
end
|
82
|
+
self.primary_key = "name"
|
83
|
+
col :name
|
84
|
+
col :average_age, :type => :float
|
85
|
+
data_miner do
|
86
|
+
process :auto_upgrade!
|
87
|
+
import("A list of breeds", :url => "file://#{BREEDS}") do
|
88
|
+
key :name, :field_name => 'Breed name'
|
89
|
+
end
|
90
|
+
process :update_average_age!
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
ActiveRecord::Base.descendants.each do |model|
|
95
|
+
model.attr_accessible nil
|
96
|
+
end
|
97
|
+
|
98
|
+
Pet.auto_upgrade!
|
data/test/support/pets.csv
CHANGED
data/test/test_data_miner.rb
CHANGED
@@ -1,73 +1,12 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
require 'helper'
|
3
3
|
|
4
|
-
PETS = File.expand_path('../support/pets.csv', __FILE__)
|
5
|
-
PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
|
6
|
-
COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
|
7
|
-
COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
|
8
|
-
BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
|
9
|
-
|
10
|
-
class Pet < ActiveRecord::Base
|
11
|
-
self.primary_key = "name"
|
12
|
-
col :name
|
13
|
-
col :breed_id
|
14
|
-
col :color_id
|
15
|
-
col :age, :type => :integer
|
16
|
-
col :age_units
|
17
|
-
col :weight, :type => :float
|
18
|
-
col :weight_units
|
19
|
-
col :height, :type => :integer
|
20
|
-
col :height_units
|
21
|
-
col :favorite_food
|
22
|
-
col :command_phrase
|
23
|
-
belongs_to :breed
|
24
|
-
data_miner do
|
25
|
-
process :auto_upgrade!
|
26
|
-
process :run_data_miner_on_parent_associations!
|
27
|
-
import("A list of pets", :url => "file://#{PETS}") do
|
28
|
-
key :name
|
29
|
-
store :age, :units_field_name => 'age_units'
|
30
|
-
store :breed_id, :field_name => :breed
|
31
|
-
store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
|
32
|
-
store :weight, :from_units => :pounds, :to_units => :kilograms
|
33
|
-
store :favorite_food, :nullify_blank_strings => true
|
34
|
-
store :command_phrase
|
35
|
-
store :height, :units => :centimetres
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
class Breed < ActiveRecord::Base
|
41
|
-
class << self
|
42
|
-
def update_average_age!
|
43
|
-
# make sure pet is populated
|
44
|
-
Pet.run_data_miner!
|
45
|
-
update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
|
46
|
-
end
|
47
|
-
end
|
48
|
-
self.primary_key = "name"
|
49
|
-
col :name
|
50
|
-
col :average_age, :type => :float
|
51
|
-
data_miner do
|
52
|
-
process :auto_upgrade!
|
53
|
-
import("A list of breeds", :url => "file://#{BREEDS}") do
|
54
|
-
key :name, :field_name => 'Breed name'
|
55
|
-
end
|
56
|
-
process :update_average_age!
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
ActiveRecord::Base.mass_assignment_sanitizer = :strict
|
61
|
-
ActiveRecord::Base.descendants.each do |model|
|
62
|
-
model.attr_accessible nil
|
63
|
-
end
|
64
|
-
|
65
|
-
Pet.auto_upgrade!
|
66
|
-
|
67
4
|
describe DataMiner do
|
68
5
|
describe "when used to import example data about pets" do
|
69
6
|
before do
|
70
7
|
Pet.delete_all
|
8
|
+
DataMiner::Run.delete_all
|
9
|
+
DataMiner::Run::ColumnStatistic.delete_all
|
71
10
|
end
|
72
11
|
it "it does not depend on mass-assignment" do
|
73
12
|
lambda do
|
@@ -155,5 +94,10 @@ describe DataMiner do
|
|
155
94
|
Pet.find('Amigo').weight.must_be_nil
|
156
95
|
Pet.find('Amigo').weight_units.must_be_nil
|
157
96
|
end
|
97
|
+
it "keeps a row count before and after" do
|
98
|
+
Pet.run_data_miner!
|
99
|
+
Pet.data_miner_runs.first.row_count_before.must_equal 0
|
100
|
+
Pet.data_miner_runs.first.row_count_after.must_equal 5
|
101
|
+
end
|
158
102
|
end
|
159
103
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'helper'
|
3
|
+
|
4
|
+
describe DataMiner::Run::ColumnStatistic do
|
5
|
+
describe "when advanced statistics are enabled" do
|
6
|
+
before do
|
7
|
+
DataMiner.per_column_statistics = true
|
8
|
+
Pet.delete_all
|
9
|
+
DataMiner::Run.delete_all
|
10
|
+
DataMiner::Run::ColumnStatistic.delete_all
|
11
|
+
end
|
12
|
+
|
13
|
+
after do
|
14
|
+
DataMiner.per_column_statistics = false
|
15
|
+
end
|
16
|
+
|
17
|
+
it "keeps null count" do
|
18
|
+
Pet.run_data_miner!
|
19
|
+
|
20
|
+
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :before).null_count.must_equal 0
|
21
|
+
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :after).null_count.must_equal 1
|
22
|
+
|
23
|
+
Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :before).null_count.must_equal 0
|
24
|
+
Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :after).null_count.must_equal 0
|
25
|
+
end
|
26
|
+
|
27
|
+
it "keeps max and min (as strings)" do
|
28
|
+
Pet.run_data_miner!
|
29
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :before).max.must_equal 'nil'
|
30
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :after).max.must_equal '17'
|
31
|
+
end
|
32
|
+
|
33
|
+
it "keeps average and stddev" do
|
34
|
+
Pet.run_data_miner!
|
35
|
+
|
36
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :before).average.must_be_nil
|
37
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :after).average.must_equal 7.0
|
38
|
+
|
39
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :before).standard_deviation.must_be_nil
|
40
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :after).standard_deviation.must_equal 5.8737
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-05-
|
14
|
+
date: 2012-05-16 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: remote_table
|
@@ -147,6 +147,7 @@ files:
|
|
147
147
|
- lib/data_miner/attribute.rb
|
148
148
|
- lib/data_miner/dictionary.rb
|
149
149
|
- lib/data_miner/run.rb
|
150
|
+
- lib/data_miner/run/column_statistic.rb
|
150
151
|
- lib/data_miner/script.rb
|
151
152
|
- lib/data_miner/step.rb
|
152
153
|
- lib/data_miner/step/import.rb
|
@@ -160,6 +161,7 @@ files:
|
|
160
161
|
- test/support/pets.csv
|
161
162
|
- test/support/pets_funny.csv
|
162
163
|
- test/test_data_miner.rb
|
164
|
+
- test/test_data_miner_run_column_statistic.rb
|
163
165
|
- test/test_earth_import.rb
|
164
166
|
- test/test_earth_tap.rb
|
165
167
|
- test/test_safety.rb
|
@@ -196,6 +198,7 @@ test_files:
|
|
196
198
|
- test/support/pets.csv
|
197
199
|
- test/support/pets_funny.csv
|
198
200
|
- test/test_data_miner.rb
|
201
|
+
- test/test_data_miner_run_column_statistic.rb
|
199
202
|
- test/test_earth_import.rb
|
200
203
|
- test/test_earth_tap.rb
|
201
204
|
- test/test_safety.rb
|