data_miner 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +11 -0
- data/lib/data_miner/run/column_statistic.rb +16 -25
- data/lib/data_miner/run.rb +15 -22
- data/lib/data_miner/version.rb +1 -1
- data/test/test_data_miner_run_column_statistic.rb +22 -15
- metadata +2 -2
data/CHANGELOG
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
2.1.2 / 2012-05-22
|
|
2
|
+
|
|
3
|
+
* Breaking changes
|
|
4
|
+
|
|
5
|
+
* DataMiner::Run#column_statistics_for(column_name, period) -> {initial,final}_column_statistics(column_name)
|
|
6
|
+
* Remove std dev from statistics kept
|
|
7
|
+
|
|
8
|
+
* Enhancements
|
|
9
|
+
|
|
10
|
+
* Added zero count and blank count
|
|
11
|
+
|
|
1
12
|
2.1.1 / 2012-05-16
|
|
2
13
|
|
|
3
14
|
* Enhancements
|
|
data/lib/data_miner/run/column_statistic.rb
CHANGED
|
@@ -2,22 +2,10 @@ class DataMiner
|
|
|
2
2
|
class Run < ::ActiveRecord::Base
|
|
3
3
|
# If +DataMiner.per_column_statistics?+, this model keeps per-column stats like max, min, average, sum, null count, etc.
|
|
4
4
|
#
|
|
5
|
-
# Each +DataMiner::Run+ will have two of these for every column;
|
|
5
|
+
# Each +DataMiner::Run+ will have two of these for every column; an "initial" and a "final"
|
|
6
6
|
class ColumnStatistic < ::ActiveRecord::Base
|
|
7
7
|
class << self
|
|
8
|
-
|
|
9
|
-
def before(run)
|
|
10
|
-
period run, 'before'
|
|
11
|
-
end
|
|
12
|
-
|
|
13
|
-
# @private
|
|
14
|
-
def after(run)
|
|
15
|
-
period run, 'after'
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
private
|
|
19
|
-
|
|
20
|
-
def period(run, period)
|
|
8
|
+
def take(run)
|
|
21
9
|
unless table_exists?
|
|
22
10
|
auto_upgrade!
|
|
23
11
|
end
|
|
@@ -27,9 +15,8 @@ class DataMiner
|
|
|
27
15
|
column_statistic = new
|
|
28
16
|
column_statistic.run = run
|
|
29
17
|
column_statistic.model_name = run.model_name
|
|
30
|
-
column_statistic.period = period
|
|
31
18
|
column_statistic.column_name = column_name
|
|
32
|
-
column_statistic.
|
|
19
|
+
column_statistic.take_statistics
|
|
33
20
|
column_statistic.save!
|
|
34
21
|
end
|
|
35
22
|
nil
|
|
@@ -49,36 +36,40 @@ class DataMiner
|
|
|
49
36
|
|
|
50
37
|
col :run_id, :type => :integer
|
|
51
38
|
col :model_name
|
|
52
|
-
col :period
|
|
53
39
|
col :column_name
|
|
54
40
|
col :null_count, :type => :integer
|
|
41
|
+
col :zero_count, :type => :integer
|
|
42
|
+
col :blank_count, :type => :integer
|
|
55
43
|
col :max
|
|
56
44
|
col :min
|
|
57
45
|
col :average, :type => :float
|
|
58
|
-
col :standard_deviation, :type => :float
|
|
59
46
|
col :sum, :type => :float
|
|
47
|
+
col :created_at, :type => :datetime
|
|
60
48
|
add_index :run_id
|
|
61
49
|
add_index :model_name
|
|
62
50
|
|
|
63
51
|
# @private
|
|
64
|
-
def
|
|
52
|
+
def take_statistics
|
|
65
53
|
model = run.model_name.constantize
|
|
66
54
|
|
|
67
55
|
self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
|
|
68
|
-
|
|
69
|
-
self.
|
|
56
|
+
|
|
57
|
+
self.max = calc(:MAX).inspect
|
|
58
|
+
self.min = calc(:MIN).inspect
|
|
70
59
|
|
|
71
60
|
column = model.columns_hash[column_name]
|
|
72
61
|
if NUMERIC.include?(column.type)
|
|
73
|
-
self.
|
|
74
|
-
self.
|
|
75
|
-
self.sum =
|
|
62
|
+
self.zero_count = model.where(column_name => 0).count
|
|
63
|
+
self.average = calc :AVG
|
|
64
|
+
self.sum = calc :SUM
|
|
65
|
+
elsif column.type == :string
|
|
66
|
+
self.blank_count = model.where("LENGTH(TRIM(#{model.connection.quote_column_name(column_name)})) = 0").count
|
|
76
67
|
end
|
|
77
68
|
end
|
|
78
69
|
|
|
79
70
|
private
|
|
80
71
|
|
|
81
|
-
def
|
|
72
|
+
def calc(operation)
|
|
82
73
|
model = run.model_name.constantize
|
|
83
74
|
model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
|
|
84
75
|
end
|
data/lib/data_miner/run.rb
CHANGED
|
@@ -86,7 +86,7 @@ class DataMiner
|
|
|
86
86
|
end
|
|
87
87
|
save!
|
|
88
88
|
if DataMiner.per_column_statistics?
|
|
89
|
-
ColumnStatistic.
|
|
89
|
+
ColumnStatistic.take self
|
|
90
90
|
end
|
|
91
91
|
begin
|
|
92
92
|
catch :data_miner_succeed do
|
|
@@ -102,38 +102,31 @@ class DataMiner
|
|
|
102
102
|
ensure
|
|
103
103
|
self.row_count_after = model.count
|
|
104
104
|
if DataMiner.per_column_statistics?
|
|
105
|
-
ColumnStatistic.
|
|
105
|
+
ColumnStatistic.take self
|
|
106
106
|
end
|
|
107
|
-
self.stopped_at = ::Time.now
|
|
107
|
+
self.stopped_at = ::Time.now.utc
|
|
108
108
|
save!
|
|
109
109
|
DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
|
|
110
110
|
end
|
|
111
111
|
self
|
|
112
112
|
end
|
|
113
113
|
|
|
114
|
-
# Get the column statistics for a particular column before
|
|
114
|
+
# Get the column statistics for a particular column before this run started.
|
|
115
115
|
#
|
|
116
116
|
# @param [String] column_name The column you want to know about.
|
|
117
|
-
# @param ["before","after"] period Whether you want to know about before or after the run.
|
|
118
117
|
#
|
|
119
118
|
# @return [ColumnStatistic]
|
|
120
|
-
def
|
|
121
|
-
column_name
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
blank.run = self
|
|
132
|
-
blank.model_name = model_name
|
|
133
|
-
blank.period = period
|
|
134
|
-
blank.column_name = column_name
|
|
135
|
-
blank
|
|
136
|
-
end
|
|
119
|
+
def initial_column_statistics(column_name)
|
|
120
|
+
column_statistics.where(:column_name => column_name.to_s).first
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Get the column statistics for a particular column after this run finished.
|
|
124
|
+
#
|
|
125
|
+
# @param [String] column_name The column you want to know about.
|
|
126
|
+
#
|
|
127
|
+
# @return [ColumnStatistic]
|
|
128
|
+
def final_column_statistics(column_name)
|
|
129
|
+
column_statistics.where(:column_name => column_name.to_s).last
|
|
137
130
|
end
|
|
138
131
|
|
|
139
132
|
# @private
|
data/lib/data_miner/version.rb
CHANGED
|
data/test/test_data_miner_run_column_statistic.rb
CHANGED
|
@@ -8,6 +8,7 @@ describe DataMiner::Run::ColumnStatistic do
|
|
|
8
8
|
Pet.delete_all
|
|
9
9
|
DataMiner::Run.delete_all
|
|
10
10
|
DataMiner::Run::ColumnStatistic.delete_all
|
|
11
|
+
Pet.run_data_miner!
|
|
11
12
|
end
|
|
12
13
|
|
|
13
14
|
after do
|
|
@@ -15,29 +16,35 @@ describe DataMiner::Run::ColumnStatistic do
|
|
|
15
16
|
end
|
|
16
17
|
|
|
17
18
|
it "keeps null count" do
|
|
18
|
-
Pet.
|
|
19
|
-
|
|
20
|
-
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :before).null_count.must_equal 0
|
|
21
|
-
Pet.data_miner_runs.first.column_statistics_for(:breed_id, :after).null_count.must_equal 1
|
|
19
|
+
Pet.data_miner_runs.first.initial_column_statistics(:breed_id).null_count.must_equal 0
|
|
20
|
+
Pet.data_miner_runs.first.final_column_statistics(:breed_id).null_count.must_equal 1
|
|
22
21
|
|
|
23
|
-
Pet.data_miner_runs.first.
|
|
24
|
-
Pet.data_miner_runs.first.
|
|
22
|
+
Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).null_count.must_equal 0
|
|
23
|
+
Pet.data_miner_runs.first.final_column_statistics(:command_phrase).null_count.must_equal 0
|
|
25
24
|
end
|
|
26
25
|
|
|
27
26
|
it "keeps max and min (as strings)" do
|
|
28
|
-
Pet.
|
|
29
|
-
Pet.data_miner_runs.first.
|
|
30
|
-
Pet.data_miner_runs.first.column_statistics_for(:age, :after).max.must_equal '17'
|
|
27
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).max.must_equal 'nil'
|
|
28
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).max.must_equal '17'
|
|
31
29
|
end
|
|
32
30
|
|
|
33
|
-
it "keeps average and
|
|
34
|
-
Pet.
|
|
31
|
+
it "keeps average and sum" do
|
|
32
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).average.must_be_nil
|
|
33
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).average.must_equal 7.0
|
|
35
34
|
|
|
36
|
-
Pet.data_miner_runs.first.
|
|
37
|
-
Pet.data_miner_runs.first.
|
|
35
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).sum.must_be_nil
|
|
36
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).sum.must_equal 28.0
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
it "keeps blank (empty string) count" do
|
|
40
|
+
Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).blank_count.must_equal 0
|
|
41
|
+
Pet.data_miner_runs.first.final_column_statistics(:command_phrase).blank_count.must_equal 3
|
|
42
|
+
end
|
|
38
43
|
|
|
39
|
-
|
|
40
|
-
Pet.data_miner_runs.first.
|
|
44
|
+
it "keeps zero count" do
|
|
45
|
+
Pet.data_miner_runs.first.initial_column_statistics(:age).zero_count.must_equal 0
|
|
46
|
+
Pet.data_miner_runs.first.final_column_statistics(:age).zero_count.must_equal 0
|
|
41
47
|
end
|
|
48
|
+
|
|
42
49
|
end
|
|
43
50
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_miner
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.1
|
|
4
|
+
version: 2.1.2
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -11,7 +11,7 @@ authors:
|
|
|
11
11
|
autorequire:
|
|
12
12
|
bindir: bin
|
|
13
13
|
cert_chain: []
|
|
14
|
-
date: 2012-05-
|
|
14
|
+
date: 2012-05-22 00:00:00.000000000 Z
|
|
15
15
|
dependencies:
|
|
16
16
|
- !ruby/object:Gem::Dependency
|
|
17
17
|
name: remote_table
|