data_miner 2.1.1 → 2.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +11 -0
- data/lib/data_miner/run/column_statistic.rb +16 -25
- data/lib/data_miner/run.rb +15 -22
- data/lib/data_miner/version.rb +1 -1
- data/test/test_data_miner_run_column_statistic.rb +22 -15
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
2.1.2 / 2012-05-22
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* DataMiner::Run#column_statistics_for(column_name, period) -> {initial,final}_column_statistics(column_name)
|
6
|
+
* Remove std dev from statistics kept
|
7
|
+
|
8
|
+
* Enhancements
|
9
|
+
|
10
|
+
* Added zero count and null count
|
11
|
+
|
1
12
|
2.1.1 / 2012-05-16
|
2
13
|
|
3
14
|
* Enhancements
|
@@ -2,22 +2,10 @@ class DataMiner
|
|
2
2
|
class Run < ::ActiveRecord::Base
|
3
3
|
# If +DataMiner.per_column_statistics?+, this model keeps per-column stats like max, min, average, standard deviation, etc.
|
4
4
|
#
|
5
|
-
# Each +DataMiner::Run+ will have two of these for every column;
|
5
|
+
# Each +DataMiner::Run+ will have two of these for every column; an "initial" and a "final"
|
6
6
|
class ColumnStatistic < ::ActiveRecord::Base
|
7
7
|
class << self
|
8
|
-
|
9
|
-
def before(run)
|
10
|
-
period run, 'before'
|
11
|
-
end
|
12
|
-
|
13
|
-
# @private
|
14
|
-
def after(run)
|
15
|
-
period run, 'after'
|
16
|
-
end
|
17
|
-
|
18
|
-
private
|
19
|
-
|
20
|
-
def period(run, period)
|
8
|
+
def take(run)
|
21
9
|
unless table_exists?
|
22
10
|
auto_upgrade!
|
23
11
|
end
|
@@ -27,9 +15,8 @@ class DataMiner
|
|
27
15
|
column_statistic = new
|
28
16
|
column_statistic.run = run
|
29
17
|
column_statistic.model_name = run.model_name
|
30
|
-
column_statistic.period = period
|
31
18
|
column_statistic.column_name = column_name
|
32
|
-
column_statistic.
|
19
|
+
column_statistic.take_statistics
|
33
20
|
column_statistic.save!
|
34
21
|
end
|
35
22
|
nil
|
@@ -49,36 +36,40 @@ class DataMiner
|
|
49
36
|
|
50
37
|
col :run_id, :type => :integer
|
51
38
|
col :model_name
|
52
|
-
col :period
|
53
39
|
col :column_name
|
54
40
|
col :null_count, :type => :integer
|
41
|
+
col :zero_count, :type => :integer
|
42
|
+
col :blank_count, :type => :integer
|
55
43
|
col :max
|
56
44
|
col :min
|
57
45
|
col :average, :type => :float
|
58
|
-
col :standard_deviation, :type => :float
|
59
46
|
col :sum, :type => :float
|
47
|
+
col :created_at, :type => :datetime
|
60
48
|
add_index :run_id
|
61
49
|
add_index :model_name
|
62
50
|
|
63
51
|
# @private
|
64
|
-
def
|
52
|
+
def take_statistics
|
65
53
|
model = run.model_name.constantize
|
66
54
|
|
67
55
|
self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
|
68
|
-
|
69
|
-
self.
|
56
|
+
|
57
|
+
self.max = calc(:MAX).inspect
|
58
|
+
self.min = calc(:MIN).inspect
|
70
59
|
|
71
60
|
column = model.columns_hash[column_name]
|
72
61
|
if NUMERIC.include?(column.type)
|
73
|
-
self.
|
74
|
-
self.
|
75
|
-
self.sum =
|
62
|
+
self.zero_count = model.where(column_name => 0).count
|
63
|
+
self.average = calc :AVG
|
64
|
+
self.sum = calc :SUM
|
65
|
+
elsif column.type == :string
|
66
|
+
self.blank_count = model.where("LENGTH(TRIM(#{model.connection.quote_column_name(column_name)})) = 0").count
|
76
67
|
end
|
77
68
|
end
|
78
69
|
|
79
70
|
private
|
80
71
|
|
81
|
-
def
|
72
|
+
def calc(operation)
|
82
73
|
model = run.model_name.constantize
|
83
74
|
model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
|
84
75
|
end
|
data/lib/data_miner/run.rb
CHANGED
@@ -86,7 +86,7 @@ class DataMiner
|
|
86
86
|
end
|
87
87
|
save!
|
88
88
|
if DataMiner.per_column_statistics?
|
89
|
-
ColumnStatistic.before self
|
89
|
+
ColumnStatistic.take self
|
90
90
|
end
|
91
91
|
begin
|
92
92
|
catch :data_miner_succeed do
|
@@ -102,38 +102,31 @@ class DataMiner
|
|
102
102
|
ensure
|
103
103
|
self.row_count_after = model.count
|
104
104
|
if DataMiner.per_column_statistics?
|
105
|
-
ColumnStatistic.after self
|
105
|
+
ColumnStatistic.take self
|
106
106
|
end
|
107
|
-
self.stopped_at = ::Time.now
|
107
|
+
self.stopped_at = ::Time.now.utc
|
108
108
|
save!
|
109
109
|
DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
|
110
110
|
end
|
111
111
|
self
|
112
112
|
end
|
113
113
|
|
114
|
-
# Get the column statistics for a particular column before
|
114
|
+
# Get the column statistics for a particular column before this run started.
|
115
115
|
#
|
116
116
|
# @param [String] column_name The column you want to know about.
|
117
|
-
# @param ["before","after"] period Whether you want to know about before or after the run.
|
118
117
|
#
|
119
118
|
# @return [ColumnStatistic]
|
120
|
-
def
|
121
|
-
column_name
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
blank.run = self
|
132
|
-
blank.model_name = model_name
|
133
|
-
blank.period = period
|
134
|
-
blank.column_name = column_name
|
135
|
-
blank
|
136
|
-
end
|
119
|
+
def initial_column_statistics(column_name)
|
120
|
+
column_statistics.where(:column_name => column_name.to_s).first
|
121
|
+
end
|
122
|
+
|
123
|
+
# Get the column statistics for a particular column after this run finished.
|
124
|
+
#
|
125
|
+
# @param [String] column_name The column you want to know about.
|
126
|
+
#
|
127
|
+
# @return [ColumnStatistic]
|
128
|
+
def final_column_statistics(column_name)
|
129
|
+
column_statistics.where(:column_name => column_name.to_s).last
|
137
130
|
end
|
138
131
|
|
139
132
|
# @private
|
data/lib/data_miner/version.rb
CHANGED
data/test/test_data_miner_run_column_statistic.rb
CHANGED
@@ -8,6 +8,7 @@ describe DataMiner::Run::ColumnStatistic do
|
|
8
8
|
Pet.delete_all
|
9
9
|
DataMiner::Run.delete_all
|
10
10
|
DataMiner::Run::ColumnStatistic.delete_all
|
11
|
+
Pet.run_data_miner!
|
11
12
|
end
|
12
13
|
|
13
14
|
after do
|
@@ -15,29 +16,35 @@ describe DataMiner::Run::ColumnStatistic do
|
|
15
16
|
end
|
16
17
|
|
17
18
|
it "keeps null count" do
|
18
|
-
Pet.
|
19
|
-
|
20
|
-
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :before).null_count.must_equal 0
|
21
|
-
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :after).null_count.must_equal 1
|
19
|
+
Pet.data_miner_runs.first.initial_column_statistics(:breed_id).null_count.must_equal 0
|
20
|
+
Pet.data_miner_runs.first.final_column_statistics(:breed_id).null_count.must_equal 1
|
22
21
|
|
23
|
-
Pet.data_miner_runs.first.
|
24
|
-
Pet.data_miner_runs.first.
|
22
|
+
Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).null_count.must_equal 0
|
23
|
+
Pet.data_miner_runs.first.final_column_statistics(:command_phrase).null_count.must_equal 0
|
25
24
|
end
|
26
25
|
|
27
26
|
it "keeps max and min (as strings)" do
|
28
|
-
Pet.
|
29
|
-
Pet.data_miner_runs.first.
|
30
|
-
Pet.data_miner_runs.first.column_statistics_for(:age, :after).max.must_equal '17'
|
27
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).max.must_equal 'nil'
|
28
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).max.must_equal '17'
|
31
29
|
end
|
32
30
|
|
33
|
-
it "keeps average and
|
34
|
-
Pet.
|
31
|
+
it "keeps average and sum" do
|
32
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).average.must_be_nil
|
33
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).average.must_equal 7.0
|
35
34
|
|
36
|
-
Pet.data_miner_runs.first.
|
37
|
-
Pet.data_miner_runs.first.
|
35
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).sum.must_be_nil
|
36
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).sum.must_equal 28.0
|
37
|
+
end
|
38
|
+
|
39
|
+
it "keeps blank (empty string) count" do
|
40
|
+
Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).blank_count.must_equal 0
|
41
|
+
Pet.data_miner_runs.first.final_column_statistics(:command_phrase).blank_count.must_equal 3
|
42
|
+
end
|
38
43
|
|
39
|
-
|
40
|
-
Pet.data_miner_runs.first.
|
44
|
+
it "keeps zero count" do
|
45
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).zero_count.must_equal 0
|
46
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).zero_count.must_equal 0
|
41
47
|
end
|
48
|
+
|
42
49
|
end
|
43
50
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.1
|
4
|
+
version: 2.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-05-
|
14
|
+
date: 2012-05-22 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: remote_table
|