data_miner 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,14 @@
1
+ 2.1.2 / 2012-05-22
2
+
3
+ * Breaking changes
4
+
5
+ * DataMiner::Run#column_statistics_for(column_name, period) -> {initial,final}_column_statistics(column_name)
6
+ * Remove std dev from statistics kept
7
+
8
+ * Enhancements
9
+
10
+ * Added zero count and blank count
11
+
1
12
  2.1.1 / 2012-05-16
2
13
 
3
14
  * Enhancements
@@ -2,22 +2,10 @@ class DataMiner
2
2
  class Run < ::ActiveRecord::Base
3
3
  # If +DataMiner.per_column_statistics?+, this model keeps per-column stats like max, min, average, sum, etc.
4
4
  #
5
- # Each +DataMiner::Run+ will have two of these for every column; a "before" and an "after".
5
+ # Each +DataMiner::Run+ will have two of these for every column: an "initial" and a "final".
6
6
  class ColumnStatistic < ::ActiveRecord::Base
7
7
  class << self
8
- # @private
9
- def before(run)
10
- period run, 'before'
11
- end
12
-
13
- # @private
14
- def after(run)
15
- period run, 'after'
16
- end
17
-
18
- private
19
-
20
- def period(run, period)
8
+ def take(run)
21
9
  unless table_exists?
22
10
  auto_upgrade!
23
11
  end
@@ -27,9 +15,8 @@ class DataMiner
27
15
  column_statistic = new
28
16
  column_statistic.run = run
29
17
  column_statistic.model_name = run.model_name
30
- column_statistic.period = period
31
18
  column_statistic.column_name = column_name
32
- column_statistic.perform_calculations
19
+ column_statistic.take_statistics
33
20
  column_statistic.save!
34
21
  end
35
22
  nil
@@ -49,36 +36,40 @@ class DataMiner
49
36
 
50
37
  col :run_id, :type => :integer
51
38
  col :model_name
52
- col :period
53
39
  col :column_name
54
40
  col :null_count, :type => :integer
41
+ col :zero_count, :type => :integer
42
+ col :blank_count, :type => :integer
55
43
  col :max
56
44
  col :min
57
45
  col :average, :type => :float
58
- col :standard_deviation, :type => :float
59
46
  col :sum, :type => :float
47
+ col :created_at, :type => :datetime
60
48
  add_index :run_id
61
49
  add_index :model_name
62
50
 
63
51
  # @private
64
- def perform_calculations
52
+ def take_statistics
65
53
  model = run.model_name.constantize
66
54
 
67
55
  self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
68
- self.max = calculate(:MAX).inspect
69
- self.min = calculate(:MIN).inspect
56
+
57
+ self.max = calc(:MAX).inspect
58
+ self.min = calc(:MIN).inspect
70
59
 
71
60
  column = model.columns_hash[column_name]
72
61
  if NUMERIC.include?(column.type)
73
- self.average = calculate :AVG
74
- self.standard_deviation = calculate :STDDEV
75
- self.sum = calculate :SUM
62
+ self.zero_count = model.where(column_name => 0).count
63
+ self.average = calc :AVG
64
+ self.sum = calc :SUM
65
+ elsif column.type == :string
66
+ self.blank_count = model.where("LENGTH(TRIM(#{model.connection.quote_column_name(column_name)})) = 0").count
76
67
  end
77
68
  end
78
69
 
79
70
  private
80
71
 
81
- def calculate(operation)
72
+ def calc(operation)
82
73
  model = run.model_name.constantize
83
74
  model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
84
75
  end
@@ -86,7 +86,7 @@ class DataMiner
86
86
  end
87
87
  save!
88
88
  if DataMiner.per_column_statistics?
89
- ColumnStatistic.before self
89
+ ColumnStatistic.take self
90
90
  end
91
91
  begin
92
92
  catch :data_miner_succeed do
@@ -102,38 +102,31 @@ class DataMiner
102
102
  ensure
103
103
  self.row_count_after = model.count
104
104
  if DataMiner.per_column_statistics?
105
- ColumnStatistic.after self
105
+ ColumnStatistic.take self
106
106
  end
107
- self.stopped_at = ::Time.now
107
+ self.stopped_at = ::Time.now.utc
108
108
  save!
109
109
  DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
110
110
  end
111
111
  self
112
112
  end
113
113
 
114
- # Get the column statistics for a particular column before or after this run.
114
+ # Get the column statistics for a particular column before this run started.
115
115
  #
116
116
  # @param [String] column_name The column you want to know about.
117
- # @param ["before","after"] period Whether you want to know about before or after the run.
118
117
  #
119
118
  # @return [ColumnStatistic]
120
- def column_statistics_for(column_name, period)
121
- column_name = column_name.to_s
122
- period = period.to_s
123
- model = model_name.constantize
124
- if existing = column_statistics.where(:column_name => column_name, :period => period).first
125
- existing
126
- elsif model.table_exists?
127
- unless model.column_names.include?(column_name)
128
- raise ::ArgumentError, %{[data_miner] Nonexistent column #{column_name.inspect} on #{model_name}}
129
- end
130
- blank = ColumnStatistic.new
131
- blank.run = self
132
- blank.model_name = model_name
133
- blank.period = period
134
- blank.column_name = column_name
135
- blank
136
- end
119
+ def initial_column_statistics(column_name)
120
+ column_statistics.where(:column_name => column_name.to_s).first
121
+ end
122
+
123
+ # Get the column statistics for a particular column after this run finished.
124
+ #
125
+ # @param [String] column_name The column you want to know about.
126
+ #
127
+ # @return [ColumnStatistic]
128
+ def final_column_statistics(column_name)
129
+ column_statistics.where(:column_name => column_name.to_s).last
137
130
  end
138
131
 
139
132
  # @private
@@ -1,3 +1,3 @@
1
1
  class DataMiner
2
- VERSION = '2.1.1'
2
+ VERSION = '2.1.2'
3
3
  end
@@ -8,6 +8,7 @@ describe DataMiner::Run::ColumnStatistic do
8
8
  Pet.delete_all
9
9
  DataMiner::Run.delete_all
10
10
  DataMiner::Run::ColumnStatistic.delete_all
11
+ Pet.run_data_miner!
11
12
  end
12
13
 
13
14
  after do
@@ -15,29 +16,35 @@ describe DataMiner::Run::ColumnStatistic do
15
16
  end
16
17
 
17
18
  it "keeps null count" do
18
- Pet.run_data_miner!
19
-
20
- Pet.data_miner_runs.first.column_statistics_for(:breed_id, :before).null_count.must_equal 0
21
- Pet.data_miner_runs.first.column_statistics_for(:breed_id, :after).null_count.must_equal 1
19
+ Pet.data_miner_runs.first.initial_column_statistics(:breed_id).null_count.must_equal 0
20
+ Pet.data_miner_runs.first.final_column_statistics(:breed_id).null_count.must_equal 1
22
21
 
23
- Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :before).null_count.must_equal 0
24
- Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :after).null_count.must_equal 0
22
+ Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).null_count.must_equal 0
23
+ Pet.data_miner_runs.first.final_column_statistics(:command_phrase).null_count.must_equal 0
25
24
  end
26
25
 
27
26
  it "keeps max and min (as strings)" do
28
- Pet.run_data_miner!
29
- Pet.data_miner_runs.first.column_statistics_for(:age, :before).max.must_equal 'nil'
30
- Pet.data_miner_runs.first.column_statistics_for(:age, :after).max.must_equal '17'
27
+ Pet.data_miner_runs.first.initial_column_statistics(:age).max.must_equal 'nil'
28
+ Pet.data_miner_runs.first.final_column_statistics(:age).max.must_equal '17'
31
29
  end
32
30
 
33
- it "keeps average and stddev" do
34
- Pet.run_data_miner!
31
+ it "keeps average and sum" do
32
+ Pet.data_miner_runs.first.initial_column_statistics(:age).average.must_be_nil
33
+ Pet.data_miner_runs.first.final_column_statistics(:age).average.must_equal 7.0
35
34
 
36
- Pet.data_miner_runs.first.column_statistics_for(:age, :before).average.must_be_nil
37
- Pet.data_miner_runs.first.column_statistics_for(:age, :after).average.must_equal 7.0
35
+ Pet.data_miner_runs.first.initial_column_statistics(:age).sum.must_be_nil
36
+ Pet.data_miner_runs.first.final_column_statistics(:age).sum.must_equal 28.0
37
+ end
38
+
39
+ it "keeps blank (empty string) count" do
40
+ Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).blank_count.must_equal 0
41
+ Pet.data_miner_runs.first.final_column_statistics(:command_phrase).blank_count.must_equal 3
42
+ end
38
43
 
39
- Pet.data_miner_runs.first.column_statistics_for(:age, :before).standard_deviation.must_be_nil
40
- Pet.data_miner_runs.first.column_statistics_for(:age, :after).standard_deviation.must_equal 5.8737
44
+ it "keeps zero count" do
45
+ Pet.data_miner_runs.first.initial_column_statistics(:age).zero_count.must_equal 0
46
+ Pet.data_miner_runs.first.final_column_statistics(:age).zero_count.must_equal 0
41
47
  end
48
+
42
49
  end
43
50
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_miner
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-05-16 00:00:00.000000000 Z
14
+ date: 2012-05-22 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: remote_table