data_miner 2.1.0 → 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -0
- data/lib/data_miner.rb +10 -0
- data/lib/data_miner/run.rb +46 -2
- data/lib/data_miner/run/column_statistic.rb +87 -0
- data/lib/data_miner/script.rb +3 -0
- data/lib/data_miner/version.rb +1 -1
- data/test/helper.rb +66 -1
- data/test/support/pets.csv +1 -0
- data/test/test_data_miner.rb +7 -63
- data/test/test_data_miner_run_column_statistic.rb +43 -0
- metadata +5 -2
data/CHANGELOG
CHANGED
data/lib/data_miner.rb
CHANGED
@@ -96,6 +96,16 @@ class DataMiner
|
|
96
96
|
end
|
97
97
|
end
|
98
98
|
|
99
|
+
# Whether per-column stats (max, min, average, standard deviation, etc.) are enabled.
|
100
|
+
def per_column_statistics?
|
101
|
+
@per_column_statistics == true
|
102
|
+
end
|
103
|
+
|
104
|
+
# Turn on or off per-column stats.
|
105
|
+
def per_column_statistics=(boolean)
|
106
|
+
@per_column_statistics = boolean
|
107
|
+
end
|
108
|
+
|
99
109
|
class << self
|
100
110
|
delegate(*DataMiner.instance_methods(false), :to => :instance)
|
101
111
|
end
|
data/lib/data_miner/run.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'aasm'
|
2
2
|
require 'active_record_inline_schema'
|
3
3
|
|
4
|
+
require 'data_miner/run/column_statistic'
|
5
|
+
|
4
6
|
class DataMiner
|
5
7
|
# A record of what happened when you ran a data miner script.
|
6
8
|
#
|
@@ -57,6 +59,14 @@ class DataMiner
|
|
57
59
|
col :stopped_at, :type => :datetime
|
58
60
|
col :updated_at, :type => :datetime
|
59
61
|
col :error, :type => :text
|
62
|
+
col :row_count_before, :type => :integer
|
63
|
+
col :row_count_after, :type => :integer
|
64
|
+
add_index :model_name
|
65
|
+
add_index :aasm_state
|
66
|
+
|
67
|
+
validates_presence_of :model_name
|
68
|
+
|
69
|
+
has_many :column_statistics, :class_name => 'DataMiner::Run::ColumnStatistic'
|
60
70
|
|
61
71
|
include ::AASM
|
62
72
|
aasm_initial_state INITIAL_STATE
|
@@ -68,11 +78,16 @@ class DataMiner
|
|
68
78
|
aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
|
69
79
|
aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
|
70
80
|
|
71
|
-
validates_presence_of :model_name
|
72
|
-
|
73
81
|
# @private
|
74
82
|
def start
|
83
|
+
model = model_name.constantize
|
84
|
+
if model.table_exists?
|
85
|
+
self.row_count_before = model.count
|
86
|
+
end
|
75
87
|
save!
|
88
|
+
if DataMiner.per_column_statistics?
|
89
|
+
ColumnStatistic.before self
|
90
|
+
end
|
76
91
|
begin
|
77
92
|
catch :data_miner_succeed do
|
78
93
|
yield
|
@@ -85,6 +100,10 @@ class DataMiner
|
|
85
100
|
fail!
|
86
101
|
raise $!
|
87
102
|
ensure
|
103
|
+
self.row_count_after = model.count
|
104
|
+
if DataMiner.per_column_statistics?
|
105
|
+
ColumnStatistic.after self
|
106
|
+
end
|
88
107
|
self.stopped_at = ::Time.now
|
89
108
|
save!
|
90
109
|
DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
|
@@ -92,6 +111,31 @@ class DataMiner
|
|
92
111
|
self
|
93
112
|
end
|
94
113
|
|
114
|
+
# Get the column statistics for a particular column before or after this run.
|
115
|
+
#
|
116
|
+
# @param [String] column_name The column you want to know about.
|
117
|
+
# @param ["before","after"] period Whether you want to know about before or after the run.
|
118
|
+
#
|
119
|
+
# @return [ColumnStatistic]
|
120
|
+
def column_statistics_for(column_name, period)
|
121
|
+
column_name = column_name.to_s
|
122
|
+
period = period.to_s
|
123
|
+
model = model_name.constantize
|
124
|
+
if existing = column_statistics.where(:column_name => column_name, :period => period).first
|
125
|
+
existing
|
126
|
+
elsif model.table_exists?
|
127
|
+
unless model.column_names.include?(column_name)
|
128
|
+
raise ::ArgumentError, %{[data_miner] Nonexistent column #{column_name.inspect} on #{model_name}}
|
129
|
+
end
|
130
|
+
blank = ColumnStatistic.new
|
131
|
+
blank.run = self
|
132
|
+
blank.model_name = model_name
|
133
|
+
blank.period = period
|
134
|
+
blank.column_name = column_name
|
135
|
+
blank
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
95
139
|
# @private
|
96
140
|
def as_lock
|
97
141
|
database_name = Run.connection.instance_variable_get(:@config).try(:[], :database)
|
@@ -0,0 +1,87 @@
|
|
1
|
+
class DataMiner
|
2
|
+
class Run < ::ActiveRecord::Base
|
3
|
+
# If +DataMiner.per_column_statistics?+, this model keeps per-column stats like max, min, average, standard deviation, etc.
|
4
|
+
#
|
5
|
+
# Each +DataMiner::Run+ will have two of these for every column: a "before" and an "after".
|
6
|
+
class ColumnStatistic < ::ActiveRecord::Base
|
7
|
+
class << self
|
8
|
+
# @private
|
9
|
+
def before(run)
|
10
|
+
period run, 'before'
|
11
|
+
end
|
12
|
+
|
13
|
+
# @private
|
14
|
+
def after(run)
|
15
|
+
period run, 'after'
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def period(run, period)
|
21
|
+
unless table_exists?
|
22
|
+
auto_upgrade!
|
23
|
+
end
|
24
|
+
model = run.model_name.constantize
|
25
|
+
return unless model.table_exists?
|
26
|
+
model.column_names.each do |column_name|
|
27
|
+
column_statistic = new
|
28
|
+
column_statistic.run = run
|
29
|
+
column_statistic.model_name = run.model_name
|
30
|
+
column_statistic.period = period
|
31
|
+
column_statistic.column_name = column_name
|
32
|
+
column_statistic.perform_calculations
|
33
|
+
column_statistic.save!
|
34
|
+
end
|
35
|
+
nil
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
NUMERIC = [
|
41
|
+
:integer,
|
42
|
+
:float,
|
43
|
+
:decimal,
|
44
|
+
]
|
45
|
+
|
46
|
+
self.table_name = 'data_miner_run_column_statistics'
|
47
|
+
|
48
|
+
belongs_to :run, :class_name => 'DataMiner::Run'
|
49
|
+
|
50
|
+
col :run_id, :type => :integer
|
51
|
+
col :model_name
|
52
|
+
col :period
|
53
|
+
col :column_name
|
54
|
+
col :null_count, :type => :integer
|
55
|
+
col :max
|
56
|
+
col :min
|
57
|
+
col :average, :type => :float
|
58
|
+
col :standard_deviation, :type => :float
|
59
|
+
col :sum, :type => :float
|
60
|
+
add_index :run_id
|
61
|
+
add_index :model_name
|
62
|
+
|
63
|
+
# @private
|
64
|
+
def perform_calculations
|
65
|
+
model = run.model_name.constantize
|
66
|
+
|
67
|
+
self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
|
68
|
+
self.max = calculate(:MAX).inspect
|
69
|
+
self.min = calculate(:MIN).inspect
|
70
|
+
|
71
|
+
column = model.columns_hash[column_name]
|
72
|
+
if NUMERIC.include?(column.type)
|
73
|
+
self.average = calculate :AVG
|
74
|
+
self.standard_deviation = calculate :STDDEV
|
75
|
+
self.sum = calculate :SUM
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
private
|
80
|
+
|
81
|
+
def calculate(operation)
|
82
|
+
model = run.model_name.constantize
|
83
|
+
model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/data_miner/script.rb
CHANGED
data/lib/data_miner/version.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -11,7 +11,7 @@ require 'minitest/reporters'
|
|
11
11
|
MiniTest::Unit.runner = MiniTest::SuiteRunner.new
|
12
12
|
MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
|
13
13
|
|
14
|
-
cmd = %{mysql -u root -ppassword -e "
|
14
|
+
cmd = %{mysql -u root -ppassword -e "DROP DATABASE data_miner_test; CREATE DATABASE data_miner_test CHARSET utf8"}
|
15
15
|
$stderr.puts "Running `#{cmd}`..."
|
16
16
|
system cmd
|
17
17
|
$stderr.puts "Done."
|
@@ -28,6 +28,71 @@ ActiveRecord::Base.establish_connection(
|
|
28
28
|
'password' => 'password'
|
29
29
|
)
|
30
30
|
|
31
|
+
ActiveRecord::Base.mass_assignment_sanitizer = :strict
|
32
|
+
|
31
33
|
require 'data_miner'
|
32
34
|
DataMiner::Run.auto_upgrade!
|
35
|
+
DataMiner::Run::ColumnStatistic.auto_upgrade!
|
33
36
|
DataMiner::Run.clear_locks
|
37
|
+
|
38
|
+
PETS = File.expand_path('../support/pets.csv', __FILE__)
|
39
|
+
PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
|
40
|
+
COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
|
41
|
+
COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
|
42
|
+
BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
|
43
|
+
|
44
|
+
class Pet < ActiveRecord::Base
|
45
|
+
self.primary_key = "name"
|
46
|
+
col :name
|
47
|
+
col :breed_id
|
48
|
+
col :color_id
|
49
|
+
col :age, :type => :integer
|
50
|
+
col :age_units
|
51
|
+
col :weight, :type => :float
|
52
|
+
col :weight_units
|
53
|
+
col :height, :type => :integer
|
54
|
+
col :height_units
|
55
|
+
col :favorite_food
|
56
|
+
col :command_phrase
|
57
|
+
belongs_to :breed
|
58
|
+
data_miner do
|
59
|
+
process :auto_upgrade!
|
60
|
+
process :run_data_miner_on_parent_associations!
|
61
|
+
import("A list of pets", :url => "file://#{PETS}") do
|
62
|
+
key :name
|
63
|
+
store :age, :units_field_name => 'age_units'
|
64
|
+
store :breed_id, :field_name => :breed, :nullify_blank_strings => true
|
65
|
+
store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
|
66
|
+
store :weight, :from_units => :pounds, :to_units => :kilograms
|
67
|
+
store :favorite_food, :nullify_blank_strings => true
|
68
|
+
store :command_phrase
|
69
|
+
store :height, :units => :centimetres
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
class Breed < ActiveRecord::Base
|
75
|
+
class << self
|
76
|
+
def update_average_age!
|
77
|
+
# make sure pet is populated
|
78
|
+
Pet.run_data_miner!
|
79
|
+
update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
|
80
|
+
end
|
81
|
+
end
|
82
|
+
self.primary_key = "name"
|
83
|
+
col :name
|
84
|
+
col :average_age, :type => :float
|
85
|
+
data_miner do
|
86
|
+
process :auto_upgrade!
|
87
|
+
import("A list of breeds", :url => "file://#{BREEDS}") do
|
88
|
+
key :name, :field_name => 'Breed name'
|
89
|
+
end
|
90
|
+
process :update_average_age!
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
ActiveRecord::Base.descendants.each do |model|
|
95
|
+
model.attr_accessible nil
|
96
|
+
end
|
97
|
+
|
98
|
+
Pet.auto_upgrade!
|
data/test/support/pets.csv
CHANGED
data/test/test_data_miner.rb
CHANGED
@@ -1,73 +1,12 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
require 'helper'
|
3
3
|
|
4
|
-
PETS = File.expand_path('../support/pets.csv', __FILE__)
|
5
|
-
PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
|
6
|
-
COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
|
7
|
-
COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
|
8
|
-
BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
|
9
|
-
|
10
|
-
class Pet < ActiveRecord::Base
|
11
|
-
self.primary_key = "name"
|
12
|
-
col :name
|
13
|
-
col :breed_id
|
14
|
-
col :color_id
|
15
|
-
col :age, :type => :integer
|
16
|
-
col :age_units
|
17
|
-
col :weight, :type => :float
|
18
|
-
col :weight_units
|
19
|
-
col :height, :type => :integer
|
20
|
-
col :height_units
|
21
|
-
col :favorite_food
|
22
|
-
col :command_phrase
|
23
|
-
belongs_to :breed
|
24
|
-
data_miner do
|
25
|
-
process :auto_upgrade!
|
26
|
-
process :run_data_miner_on_parent_associations!
|
27
|
-
import("A list of pets", :url => "file://#{PETS}") do
|
28
|
-
key :name
|
29
|
-
store :age, :units_field_name => 'age_units'
|
30
|
-
store :breed_id, :field_name => :breed
|
31
|
-
store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
|
32
|
-
store :weight, :from_units => :pounds, :to_units => :kilograms
|
33
|
-
store :favorite_food, :nullify_blank_strings => true
|
34
|
-
store :command_phrase
|
35
|
-
store :height, :units => :centimetres
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
class Breed < ActiveRecord::Base
|
41
|
-
class << self
|
42
|
-
def update_average_age!
|
43
|
-
# make sure pet is populated
|
44
|
-
Pet.run_data_miner!
|
45
|
-
update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
|
46
|
-
end
|
47
|
-
end
|
48
|
-
self.primary_key = "name"
|
49
|
-
col :name
|
50
|
-
col :average_age, :type => :float
|
51
|
-
data_miner do
|
52
|
-
process :auto_upgrade!
|
53
|
-
import("A list of breeds", :url => "file://#{BREEDS}") do
|
54
|
-
key :name, :field_name => 'Breed name'
|
55
|
-
end
|
56
|
-
process :update_average_age!
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
ActiveRecord::Base.mass_assignment_sanitizer = :strict
|
61
|
-
ActiveRecord::Base.descendants.each do |model|
|
62
|
-
model.attr_accessible nil
|
63
|
-
end
|
64
|
-
|
65
|
-
Pet.auto_upgrade!
|
66
|
-
|
67
4
|
describe DataMiner do
|
68
5
|
describe "when used to import example data about pets" do
|
69
6
|
before do
|
70
7
|
Pet.delete_all
|
8
|
+
DataMiner::Run.delete_all
|
9
|
+
DataMiner::Run::ColumnStatistic.delete_all
|
71
10
|
end
|
72
11
|
it "it does not depend on mass-assignment" do
|
73
12
|
lambda do
|
@@ -155,5 +94,10 @@ describe DataMiner do
|
|
155
94
|
Pet.find('Amigo').weight.must_be_nil
|
156
95
|
Pet.find('Amigo').weight_units.must_be_nil
|
157
96
|
end
|
97
|
+
it "keeps a row count before and after" do
|
98
|
+
Pet.run_data_miner!
|
99
|
+
Pet.data_miner_runs.first.row_count_before.must_equal 0
|
100
|
+
Pet.data_miner_runs.first.row_count_after.must_equal 5
|
101
|
+
end
|
158
102
|
end
|
159
103
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'helper'
|
3
|
+
|
4
|
+
describe DataMiner::Run::ColumnStatistic do
|
5
|
+
describe "when advanced statistics are enabled" do
|
6
|
+
before do
|
7
|
+
DataMiner.per_column_statistics = true
|
8
|
+
Pet.delete_all
|
9
|
+
DataMiner::Run.delete_all
|
10
|
+
DataMiner::Run::ColumnStatistic.delete_all
|
11
|
+
end
|
12
|
+
|
13
|
+
after do
|
14
|
+
DataMiner.per_column_statistics = false
|
15
|
+
end
|
16
|
+
|
17
|
+
it "keeps null count" do
|
18
|
+
Pet.run_data_miner!
|
19
|
+
|
20
|
+
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :before).null_count.must_equal 0
|
21
|
+
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :after).null_count.must_equal 1
|
22
|
+
|
23
|
+
Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :before).null_count.must_equal 0
|
24
|
+
Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :after).null_count.must_equal 0
|
25
|
+
end
|
26
|
+
|
27
|
+
it "keeps max and min (as strings)" do
|
28
|
+
Pet.run_data_miner!
|
29
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :before).max.must_equal 'nil'
|
30
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :after).max.must_equal '17'
|
31
|
+
end
|
32
|
+
|
33
|
+
it "keeps average and stddev" do
|
34
|
+
Pet.run_data_miner!
|
35
|
+
|
36
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :before).average.must_be_nil
|
37
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :after).average.must_equal 7.0
|
38
|
+
|
39
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :before).standard_deviation.must_be_nil
|
40
|
+
Pet.data_miner_runs.first.column_statistics_for(:age, :after).standard_deviation.must_equal 5.8737
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-05-
|
14
|
+
date: 2012-05-16 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: remote_table
|
@@ -147,6 +147,7 @@ files:
|
|
147
147
|
- lib/data_miner/attribute.rb
|
148
148
|
- lib/data_miner/dictionary.rb
|
149
149
|
- lib/data_miner/run.rb
|
150
|
+
- lib/data_miner/run/column_statistic.rb
|
150
151
|
- lib/data_miner/script.rb
|
151
152
|
- lib/data_miner/step.rb
|
152
153
|
- lib/data_miner/step/import.rb
|
@@ -160,6 +161,7 @@ files:
|
|
160
161
|
- test/support/pets.csv
|
161
162
|
- test/support/pets_funny.csv
|
162
163
|
- test/test_data_miner.rb
|
164
|
+
- test/test_data_miner_run_column_statistic.rb
|
163
165
|
- test/test_earth_import.rb
|
164
166
|
- test/test_earth_tap.rb
|
165
167
|
- test/test_safety.rb
|
@@ -196,6 +198,7 @@ test_files:
|
|
196
198
|
- test/support/pets.csv
|
197
199
|
- test/support/pets_funny.csv
|
198
200
|
- test/test_data_miner.rb
|
201
|
+
- test/test_data_miner_run_column_statistic.rb
|
199
202
|
- test/test_earth_import.rb
|
200
203
|
- test/test_earth_tap.rb
|
201
204
|
- test/test_safety.rb
|