data_miner 2.1.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ 2.1.1 / 2012-05-16
2
+
3
+ * Enhancements
4
+
5
+ * Keep row counts from before and after runs.
6
+ * If DataMiner.per_column_statistics?, keep per-column statistics from before and after runs.
7
+
1
8
  2.1.0 / 2012-05-10
2
9
 
3
10
  * Enhancements
data/lib/data_miner.rb CHANGED
@@ -96,6 +96,16 @@ class DataMiner
96
96
  end
97
97
  end
98
98
 
99
+ # Whether per-column stats like max, min, average, standard deviation, etc are enabled.
100
+ def per_column_statistics?
101
+ @per_column_statistics == true
102
+ end
103
+
104
+ # Turn on or off per-column stats.
105
+ def per_column_statistics=(boolean)
106
+ @per_column_statistics = boolean
107
+ end
108
+
99
109
  class << self
100
110
  delegate(*DataMiner.instance_methods(false), :to => :instance)
101
111
  end
@@ -1,6 +1,8 @@
1
1
  require 'aasm'
2
2
  require 'active_record_inline_schema'
3
3
 
4
+ require 'data_miner/run/column_statistic'
5
+
4
6
  class DataMiner
5
7
  # A record of what happened when you ran a data miner script.
6
8
  #
@@ -57,6 +59,14 @@ class DataMiner
57
59
  col :stopped_at, :type => :datetime
58
60
  col :updated_at, :type => :datetime
59
61
  col :error, :type => :text
62
+ col :row_count_before, :type => :integer
63
+ col :row_count_after, :type => :integer
64
+ add_index :model_name
65
+ add_index :aasm_state
66
+
67
+ validates_presence_of :model_name
68
+
69
+ has_many :column_statistics, :class_name => 'DataMiner::Run::ColumnStatistic'
60
70
 
61
71
  include ::AASM
62
72
  aasm_initial_state INITIAL_STATE
@@ -68,11 +78,16 @@ class DataMiner
68
78
  aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
69
79
  aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
70
80
 
71
- validates_presence_of :model_name
72
-
73
81
  # @private
74
82
  def start
83
+ model = model_name.constantize
84
+ if model.table_exists?
85
+ self.row_count_before = model.count
86
+ end
75
87
  save!
88
+ if DataMiner.per_column_statistics?
89
+ ColumnStatistic.before self
90
+ end
76
91
  begin
77
92
  catch :data_miner_succeed do
78
93
  yield
@@ -85,6 +100,10 @@ class DataMiner
85
100
  fail!
86
101
  raise $!
87
102
  ensure
103
+ self.row_count_after = model.count
104
+ if DataMiner.per_column_statistics?
105
+ ColumnStatistic.after self
106
+ end
88
107
  self.stopped_at = ::Time.now
89
108
  save!
90
109
  DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
@@ -92,6 +111,31 @@ class DataMiner
92
111
  self
93
112
  end
94
113
 
114
+ # Get the column statistics for a particular column before or after this run.
115
+ #
116
+ # @param [String] column_name The column you want to know about.
117
+ # @param ["before","after"] period Whether you want to know about before or after the run.
118
+ #
119
+ # @return [ColumnStatistic]
120
+ def column_statistics_for(column_name, period)
121
+ column_name = column_name.to_s
122
+ period = period.to_s
123
+ model = model_name.constantize
124
+ if existing = column_statistics.where(:column_name => column_name, :period => period).first
125
+ existing
126
+ elsif model.table_exists?
127
+ unless model.column_names.include?(column_name)
128
+ raise ::ArgumentError, %{[data_miner] Nonexistent column #{column_name.inspect} on #{model_name}}
129
+ end
130
+ blank = ColumnStatistic.new
131
+ blank.run = self
132
+ blank.model_name = model_name
133
+ blank.period = period
134
+ blank.column_name = column_name
135
+ blank
136
+ end
137
+ end
138
+
95
139
  # @private
96
140
  def as_lock
97
141
  database_name = Run.connection.instance_variable_get(:@config).try(:[], :database)
@@ -0,0 +1,87 @@
1
+ class DataMiner
2
+ class Run < ::ActiveRecord::Base
3
+ # If +DataMiner.per_column_statistics?+, this model keeps per-column stats like max, min, average, standard deviation, etc.
4
+ #
5
+ # Each +DataMiner::Run+ will have two of these for every column; a "before" and an "after".
6
+ class ColumnStatistic < ::ActiveRecord::Base
7
+ class << self
8
+ # @private
9
+ def before(run)
10
+ period run, 'before'
11
+ end
12
+
13
+ # @private
14
+ def after(run)
15
+ period run, 'after'
16
+ end
17
+
18
+ private
19
+
20
+ def period(run, period)
21
+ unless table_exists?
22
+ auto_upgrade!
23
+ end
24
+ model = run.model_name.constantize
25
+ return unless model.table_exists?
26
+ model.column_names.each do |column_name|
27
+ column_statistic = new
28
+ column_statistic.run = run
29
+ column_statistic.model_name = run.model_name
30
+ column_statistic.period = period
31
+ column_statistic.column_name = column_name
32
+ column_statistic.perform_calculations
33
+ column_statistic.save!
34
+ end
35
+ nil
36
+ end
37
+
38
+ end
39
+
40
+ NUMERIC = [
41
+ :integer,
42
+ :float,
43
+ :decimal,
44
+ ]
45
+
46
+ self.table_name = 'data_miner_run_column_statistics'
47
+
48
+ belongs_to :run, :class_name => 'DataMiner::Run'
49
+
50
+ col :run_id, :type => :integer
51
+ col :model_name
52
+ col :period
53
+ col :column_name
54
+ col :null_count, :type => :integer
55
+ col :max
56
+ col :min
57
+ col :average, :type => :float
58
+ col :standard_deviation, :type => :float
59
+ col :sum, :type => :float
60
+ add_index :run_id
61
+ add_index :model_name
62
+
63
+ # @private
64
+ def perform_calculations
65
+ model = run.model_name.constantize
66
+
67
+ self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
68
+ self.max = calculate(:MAX).inspect
69
+ self.min = calculate(:MIN).inspect
70
+
71
+ column = model.columns_hash[column_name]
72
+ if NUMERIC.include?(column.type)
73
+ self.average = calculate :AVG
74
+ self.standard_deviation = calculate :STDDEV
75
+ self.sum = calculate :SUM
76
+ end
77
+ end
78
+
79
+ private
80
+
81
+ def calculate(operation)
82
+ model = run.model_name.constantize
83
+ model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
84
+ end
85
+ end
86
+ end
87
+ end
@@ -213,6 +213,9 @@ class DataMiner
213
213
  Script.current_stack.clear
214
214
  end
215
215
  Script.current_stack << model_name
216
+ unless Run.table_exists?
217
+ Run.auto_upgrade!
218
+ end
216
219
  run = Run.new
217
220
  run.model_name = model_name
218
221
  run.start do
@@ -1,3 +1,3 @@
1
1
  class DataMiner
2
- VERSION = '2.1.0'
2
+ VERSION = '2.1.1'
3
3
  end
data/test/helper.rb CHANGED
@@ -11,7 +11,7 @@ require 'minitest/reporters'
11
11
  MiniTest::Unit.runner = MiniTest::SuiteRunner.new
12
12
  MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
13
13
 
14
- cmd = %{mysql -u root -ppassword -e "drop database data_miner_test; create database data_miner_test charset utf8"}
14
+ cmd = %{mysql -u root -ppassword -e "DROP DATABASE data_miner_test; CREATE DATABASE data_miner_test CHARSET utf8"}
15
15
  $stderr.puts "Running `#{cmd}`..."
16
16
  system cmd
17
17
  $stderr.puts "Done."
@@ -28,6 +28,71 @@ ActiveRecord::Base.establish_connection(
28
28
  'password' => 'password'
29
29
  )
30
30
 
31
+ ActiveRecord::Base.mass_assignment_sanitizer = :strict
32
+
31
33
  require 'data_miner'
32
34
  DataMiner::Run.auto_upgrade!
35
+ DataMiner::Run::ColumnStatistic.auto_upgrade!
33
36
  DataMiner::Run.clear_locks
37
+
38
+ PETS = File.expand_path('../support/pets.csv', __FILE__)
39
+ PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
40
+ COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
41
+ COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
42
+ BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
43
+
44
+ class Pet < ActiveRecord::Base
45
+ self.primary_key = "name"
46
+ col :name
47
+ col :breed_id
48
+ col :color_id
49
+ col :age, :type => :integer
50
+ col :age_units
51
+ col :weight, :type => :float
52
+ col :weight_units
53
+ col :height, :type => :integer
54
+ col :height_units
55
+ col :favorite_food
56
+ col :command_phrase
57
+ belongs_to :breed
58
+ data_miner do
59
+ process :auto_upgrade!
60
+ process :run_data_miner_on_parent_associations!
61
+ import("A list of pets", :url => "file://#{PETS}") do
62
+ key :name
63
+ store :age, :units_field_name => 'age_units'
64
+ store :breed_id, :field_name => :breed, :nullify_blank_strings => true
65
+ store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
66
+ store :weight, :from_units => :pounds, :to_units => :kilograms
67
+ store :favorite_food, :nullify_blank_strings => true
68
+ store :command_phrase
69
+ store :height, :units => :centimetres
70
+ end
71
+ end
72
+ end
73
+
74
+ class Breed < ActiveRecord::Base
75
+ class << self
76
+ def update_average_age!
77
+ # make sure pet is populated
78
+ Pet.run_data_miner!
79
+ update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
80
+ end
81
+ end
82
+ self.primary_key = "name"
83
+ col :name
84
+ col :average_age, :type => :float
85
+ data_miner do
86
+ process :auto_upgrade!
87
+ import("A list of breeds", :url => "file://#{BREEDS}") do
88
+ key :name, :field_name => 'Breed name'
89
+ end
90
+ process :update_average_age!
91
+ end
92
+ end
93
+
94
+ ActiveRecord::Base.descendants.each do |model|
95
+ model.attr_accessible nil
96
+ end
97
+
98
+ Pet.auto_upgrade!
@@ -3,3 +3,4 @@ Pierre,Tabby,GO,4,years,4.4,30,tomato,"eh"
3
3
  Jerry,Beagle,BR/BL,5,years,10,30,cheese,"che"
4
4
  Amigo,Spanish Lizarto,GR/BU,17,years," ",3,crickets," "
5
5
  Johnny,Beagle,BR/BL,2,years,20,45," ",
6
+ Nemo,,,,,,,,
@@ -1,73 +1,12 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
  require 'helper'
3
3
 
4
- PETS = File.expand_path('../support/pets.csv', __FILE__)
5
- PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
6
- COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
7
- COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
8
- BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
9
-
10
- class Pet < ActiveRecord::Base
11
- self.primary_key = "name"
12
- col :name
13
- col :breed_id
14
- col :color_id
15
- col :age, :type => :integer
16
- col :age_units
17
- col :weight, :type => :float
18
- col :weight_units
19
- col :height, :type => :integer
20
- col :height_units
21
- col :favorite_food
22
- col :command_phrase
23
- belongs_to :breed
24
- data_miner do
25
- process :auto_upgrade!
26
- process :run_data_miner_on_parent_associations!
27
- import("A list of pets", :url => "file://#{PETS}") do
28
- key :name
29
- store :age, :units_field_name => 'age_units'
30
- store :breed_id, :field_name => :breed
31
- store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
32
- store :weight, :from_units => :pounds, :to_units => :kilograms
33
- store :favorite_food, :nullify_blank_strings => true
34
- store :command_phrase
35
- store :height, :units => :centimetres
36
- end
37
- end
38
- end
39
-
40
- class Breed < ActiveRecord::Base
41
- class << self
42
- def update_average_age!
43
- # make sure pet is populated
44
- Pet.run_data_miner!
45
- update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
46
- end
47
- end
48
- self.primary_key = "name"
49
- col :name
50
- col :average_age, :type => :float
51
- data_miner do
52
- process :auto_upgrade!
53
- import("A list of breeds", :url => "file://#{BREEDS}") do
54
- key :name, :field_name => 'Breed name'
55
- end
56
- process :update_average_age!
57
- end
58
- end
59
-
60
- ActiveRecord::Base.mass_assignment_sanitizer = :strict
61
- ActiveRecord::Base.descendants.each do |model|
62
- model.attr_accessible nil
63
- end
64
-
65
- Pet.auto_upgrade!
66
-
67
4
  describe DataMiner do
68
5
  describe "when used to import example data about pets" do
69
6
  before do
70
7
  Pet.delete_all
8
+ DataMiner::Run.delete_all
9
+ DataMiner::Run::ColumnStatistic.delete_all
71
10
  end
72
11
  it "it does not depend on mass-assignment" do
73
12
  lambda do
@@ -155,5 +94,10 @@ describe DataMiner do
155
94
  Pet.find('Amigo').weight.must_be_nil
156
95
  Pet.find('Amigo').weight_units.must_be_nil
157
96
  end
97
+ it "keeps a row count before and after" do
98
+ Pet.run_data_miner!
99
+ Pet.data_miner_runs.first.row_count_before.must_equal 0
100
+ Pet.data_miner_runs.first.row_count_after.must_equal 5
101
+ end
158
102
  end
159
103
  end
@@ -0,0 +1,43 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'helper'
3
+
4
+ describe DataMiner::Run::ColumnStatistic do
5
+ describe "when advanced statistics are enabled" do
6
+ before do
7
+ DataMiner.per_column_statistics = true
8
+ Pet.delete_all
9
+ DataMiner::Run.delete_all
10
+ DataMiner::Run::ColumnStatistic.delete_all
11
+ end
12
+
13
+ after do
14
+ DataMiner.per_column_statistics = false
15
+ end
16
+
17
+ it "keeps null count" do
18
+ Pet.run_data_miner!
19
+
20
+ Pet.data_miner_runs.first.column_statistics_for(:breed_id, :before).null_count.must_equal 0
21
+ Pet.data_miner_runs.first.column_statistics_for(:breed_id, :after).null_count.must_equal 1
22
+
23
+ Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :before).null_count.must_equal 0
24
+ Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :after).null_count.must_equal 0
25
+ end
26
+
27
+ it "keeps max and min (as strings)" do
28
+ Pet.run_data_miner!
29
+ Pet.data_miner_runs.first.column_statistics_for(:age, :before).max.must_equal 'nil'
30
+ Pet.data_miner_runs.first.column_statistics_for(:age, :after).max.must_equal '17'
31
+ end
32
+
33
+ it "keeps average and stddev" do
34
+ Pet.run_data_miner!
35
+
36
+ Pet.data_miner_runs.first.column_statistics_for(:age, :before).average.must_be_nil
37
+ Pet.data_miner_runs.first.column_statistics_for(:age, :after).average.must_equal 7.0
38
+
39
+ Pet.data_miner_runs.first.column_statistics_for(:age, :before).standard_deviation.must_be_nil
40
+ Pet.data_miner_runs.first.column_statistics_for(:age, :after).standard_deviation.must_equal 5.8737
41
+ end
42
+ end
43
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_miner
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-05-10 00:00:00.000000000 Z
14
+ date: 2012-05-16 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: remote_table
@@ -147,6 +147,7 @@ files:
147
147
  - lib/data_miner/attribute.rb
148
148
  - lib/data_miner/dictionary.rb
149
149
  - lib/data_miner/run.rb
150
+ - lib/data_miner/run/column_statistic.rb
150
151
  - lib/data_miner/script.rb
151
152
  - lib/data_miner/step.rb
152
153
  - lib/data_miner/step/import.rb
@@ -160,6 +161,7 @@ files:
160
161
  - test/support/pets.csv
161
162
  - test/support/pets_funny.csv
162
163
  - test/test_data_miner.rb
164
+ - test/test_data_miner_run_column_statistic.rb
163
165
  - test/test_earth_import.rb
164
166
  - test/test_earth_tap.rb
165
167
  - test/test_safety.rb
@@ -196,6 +198,7 @@ test_files:
196
198
  - test/support/pets.csv
197
199
  - test/support/pets_funny.csv
198
200
  - test/test_data_miner.rb
201
+ - test/test_data_miner_run_column_statistic.rb
199
202
  - test/test_earth_import.rb
200
203
  - test/test_earth_tap.rb
201
204
  - test/test_safety.rb