data_miner 2.1.0 → 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ 2.1.1 / 2012-05-16
2
+
3
+ * Enhancements
4
+
5
+ * Keep row counts from before and after runs.
6
+ * If DataMiner.per_column_statistics?, keep per-column statistics from before and after runs.
7
+
1
8
  2.1.0 / 2012-05-10
2
9
 
3
10
  * Enhancements
data/lib/data_miner.rb CHANGED
@@ -96,6 +96,16 @@ class DataMiner
96
96
  end
97
97
  end
98
98
 
99
+ # Whether per-column stats like max, min, average, standard deviation, etc. are enabled.
100
+ def per_column_statistics?
101
+ @per_column_statistics == true
102
+ end
103
+
104
+ # Turn on or off per-column stats.
105
+ def per_column_statistics=(boolean)
106
+ @per_column_statistics = boolean
107
+ end
108
+
99
109
  class << self
100
110
  delegate(*DataMiner.instance_methods(false), :to => :instance)
101
111
  end
@@ -1,6 +1,8 @@
1
1
  require 'aasm'
2
2
  require 'active_record_inline_schema'
3
3
 
4
+ require 'data_miner/run/column_statistic'
5
+
4
6
  class DataMiner
5
7
  # A record of what happened when you ran a data miner script.
6
8
  #
@@ -57,6 +59,14 @@ class DataMiner
57
59
  col :stopped_at, :type => :datetime
58
60
  col :updated_at, :type => :datetime
59
61
  col :error, :type => :text
62
+ col :row_count_before, :type => :integer
63
+ col :row_count_after, :type => :integer
64
+ add_index :model_name
65
+ add_index :aasm_state
66
+
67
+ validates_presence_of :model_name
68
+
69
+ has_many :column_statistics, :class_name => 'DataMiner::Run::ColumnStatistic'
60
70
 
61
71
  include ::AASM
62
72
  aasm_initial_state INITIAL_STATE
@@ -68,11 +78,16 @@ class DataMiner
68
78
  aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
69
79
  aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
70
80
 
71
- validates_presence_of :model_name
72
-
73
81
  # @private
74
82
  def start
83
+ model = model_name.constantize
84
+ if model.table_exists?
85
+ self.row_count_before = model.count
86
+ end
75
87
  save!
88
+ if DataMiner.per_column_statistics?
89
+ ColumnStatistic.before self
90
+ end
76
91
  begin
77
92
  catch :data_miner_succeed do
78
93
  yield
@@ -85,6 +100,10 @@ class DataMiner
85
100
  fail!
86
101
  raise $!
87
102
  ensure
103
+ self.row_count_after = model.count
104
+ if DataMiner.per_column_statistics?
105
+ ColumnStatistic.after self
106
+ end
88
107
  self.stopped_at = ::Time.now
89
108
  save!
90
109
  DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
@@ -92,6 +111,31 @@ class DataMiner
92
111
  self
93
112
  end
94
113
 
114
+ # Get the column statistics for a particular column before or after this run.
115
+ #
116
+ # @param [String] column_name The column you want to know about.
117
+ # @param ["before","after"] period Whether you want to know about before or after the run.
118
+ #
119
+ # @return [ColumnStatistic]
120
+ def column_statistics_for(column_name, period)
121
+ column_name = column_name.to_s
122
+ period = period.to_s
123
+ model = model_name.constantize
124
+ if existing = column_statistics.where(:column_name => column_name, :period => period).first
125
+ existing
126
+ elsif model.table_exists?
127
+ unless model.column_names.include?(column_name)
128
+ raise ::ArgumentError, %{[data_miner] Nonexistent column #{column_name.inspect} on #{model_name}}
129
+ end
130
+ blank = ColumnStatistic.new
131
+ blank.run = self
132
+ blank.model_name = model_name
133
+ blank.period = period
134
+ blank.column_name = column_name
135
+ blank
136
+ end
137
+ end
138
+
95
139
  # @private
96
140
  def as_lock
97
141
  database_name = Run.connection.instance_variable_get(:@config).try(:[], :database)
@@ -0,0 +1,87 @@
1
+ class DataMiner
2
+ class Run < ::ActiveRecord::Base
3
+ # If +DataMiner.per_column_statistics?+, this model keeps per-column stats like max, min, average, standard deviation, etc.
4
+ #
5
+ # Each +DataMiner::Run+ will have two of these for every column: a "before" and an "after".
6
+ class ColumnStatistic < ::ActiveRecord::Base
7
+ class << self
8
+ # @private
9
+ def before(run)
10
+ period run, 'before'
11
+ end
12
+
13
+ # @private
14
+ def after(run)
15
+ period run, 'after'
16
+ end
17
+
18
+ private
19
+
20
+ def period(run, period)
21
+ unless table_exists?
22
+ auto_upgrade!
23
+ end
24
+ model = run.model_name.constantize
25
+ return unless model.table_exists?
26
+ model.column_names.each do |column_name|
27
+ column_statistic = new
28
+ column_statistic.run = run
29
+ column_statistic.model_name = run.model_name
30
+ column_statistic.period = period
31
+ column_statistic.column_name = column_name
32
+ column_statistic.perform_calculations
33
+ column_statistic.save!
34
+ end
35
+ nil
36
+ end
37
+
38
+ end
39
+
40
+ NUMERIC = [
41
+ :integer,
42
+ :float,
43
+ :decimal,
44
+ ]
45
+
46
+ self.table_name = 'data_miner_run_column_statistics'
47
+
48
+ belongs_to :run, :class_name => 'DataMiner::Run'
49
+
50
+ col :run_id, :type => :integer
51
+ col :model_name
52
+ col :period
53
+ col :column_name
54
+ col :null_count, :type => :integer
55
+ col :max
56
+ col :min
57
+ col :average, :type => :float
58
+ col :standard_deviation, :type => :float
59
+ col :sum, :type => :float
60
+ add_index :run_id
61
+ add_index :model_name
62
+
63
+ # @private
64
+ def perform_calculations
65
+ model = run.model_name.constantize
66
+
67
+ self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
68
+ self.max = calculate(:MAX).inspect
69
+ self.min = calculate(:MIN).inspect
70
+
71
+ column = model.columns_hash[column_name]
72
+ if NUMERIC.include?(column.type)
73
+ self.average = calculate :AVG
74
+ self.standard_deviation = calculate :STDDEV
75
+ self.sum = calculate :SUM
76
+ end
77
+ end
78
+
79
+ private
80
+
81
+ def calculate(operation)
82
+ model = run.model_name.constantize
83
+ model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
84
+ end
85
+ end
86
+ end
87
+ end
@@ -213,6 +213,9 @@ class DataMiner
213
213
  Script.current_stack.clear
214
214
  end
215
215
  Script.current_stack << model_name
216
+ unless Run.table_exists?
217
+ Run.auto_upgrade!
218
+ end
216
219
  run = Run.new
217
220
  run.model_name = model_name
218
221
  run.start do
@@ -1,3 +1,3 @@
1
1
  class DataMiner
2
- VERSION = '2.1.0'
2
+ VERSION = '2.1.1'
3
3
  end
data/test/helper.rb CHANGED
@@ -11,7 +11,7 @@ require 'minitest/reporters'
11
11
  MiniTest::Unit.runner = MiniTest::SuiteRunner.new
12
12
  MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
13
13
 
14
- cmd = %{mysql -u root -ppassword -e "drop database data_miner_test; create database data_miner_test charset utf8"}
14
+ cmd = %{mysql -u root -ppassword -e "DROP DATABASE data_miner_test; CREATE DATABASE data_miner_test CHARSET utf8"}
15
15
  $stderr.puts "Running `#{cmd}`..."
16
16
  system cmd
17
17
  $stderr.puts "Done."
@@ -28,6 +28,71 @@ ActiveRecord::Base.establish_connection(
28
28
  'password' => 'password'
29
29
  )
30
30
 
31
+ ActiveRecord::Base.mass_assignment_sanitizer = :strict
32
+
31
33
  require 'data_miner'
32
34
  DataMiner::Run.auto_upgrade!
35
+ DataMiner::Run::ColumnStatistic.auto_upgrade!
33
36
  DataMiner::Run.clear_locks
37
+
38
+ PETS = File.expand_path('../support/pets.csv', __FILE__)
39
+ PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
40
+ COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
41
+ COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
42
+ BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
43
+
44
+ class Pet < ActiveRecord::Base
45
+ self.primary_key = "name"
46
+ col :name
47
+ col :breed_id
48
+ col :color_id
49
+ col :age, :type => :integer
50
+ col :age_units
51
+ col :weight, :type => :float
52
+ col :weight_units
53
+ col :height, :type => :integer
54
+ col :height_units
55
+ col :favorite_food
56
+ col :command_phrase
57
+ belongs_to :breed
58
+ data_miner do
59
+ process :auto_upgrade!
60
+ process :run_data_miner_on_parent_associations!
61
+ import("A list of pets", :url => "file://#{PETS}") do
62
+ key :name
63
+ store :age, :units_field_name => 'age_units'
64
+ store :breed_id, :field_name => :breed, :nullify_blank_strings => true
65
+ store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
66
+ store :weight, :from_units => :pounds, :to_units => :kilograms
67
+ store :favorite_food, :nullify_blank_strings => true
68
+ store :command_phrase
69
+ store :height, :units => :centimetres
70
+ end
71
+ end
72
+ end
73
+
74
+ class Breed < ActiveRecord::Base
75
+ class << self
76
+ def update_average_age!
77
+ # make sure pet is populated
78
+ Pet.run_data_miner!
79
+ update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
80
+ end
81
+ end
82
+ self.primary_key = "name"
83
+ col :name
84
+ col :average_age, :type => :float
85
+ data_miner do
86
+ process :auto_upgrade!
87
+ import("A list of breeds", :url => "file://#{BREEDS}") do
88
+ key :name, :field_name => 'Breed name'
89
+ end
90
+ process :update_average_age!
91
+ end
92
+ end
93
+
94
+ ActiveRecord::Base.descendants.each do |model|
95
+ model.attr_accessible nil
96
+ end
97
+
98
+ Pet.auto_upgrade!
@@ -3,3 +3,4 @@ Pierre,Tabby,GO,4,years,4.4,30,tomato,"eh"
3
3
  Jerry,Beagle,BR/BL,5,years,10,30,cheese,"che"
4
4
  Amigo,Spanish Lizarto,GR/BU,17,years," ",3,crickets," "
5
5
  Johnny,Beagle,BR/BL,2,years,20,45," ",
6
+ Nemo,,,,,,,,
@@ -1,73 +1,12 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
  require 'helper'
3
3
 
4
- PETS = File.expand_path('../support/pets.csv', __FILE__)
5
- PETS_FUNNY = File.expand_path('../support/pets_funny.csv', __FILE__)
6
- COLOR_DICTIONARY_ENGLISH = File.expand_path('../support/pet_color_dictionary.en.csv', __FILE__)
7
- COLOR_DICTIONARY_SPANISH = File.expand_path('../support/pet_color_dictionary.es.csv', __FILE__)
8
- BREEDS = File.expand_path('../support/breeds.xls', __FILE__)
9
-
10
- class Pet < ActiveRecord::Base
11
- self.primary_key = "name"
12
- col :name
13
- col :breed_id
14
- col :color_id
15
- col :age, :type => :integer
16
- col :age_units
17
- col :weight, :type => :float
18
- col :weight_units
19
- col :height, :type => :integer
20
- col :height_units
21
- col :favorite_food
22
- col :command_phrase
23
- belongs_to :breed
24
- data_miner do
25
- process :auto_upgrade!
26
- process :run_data_miner_on_parent_associations!
27
- import("A list of pets", :url => "file://#{PETS}") do
28
- key :name
29
- store :age, :units_field_name => 'age_units'
30
- store :breed_id, :field_name => :breed
31
- store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
32
- store :weight, :from_units => :pounds, :to_units => :kilograms
33
- store :favorite_food, :nullify_blank_strings => true
34
- store :command_phrase
35
- store :height, :units => :centimetres
36
- end
37
- end
38
- end
39
-
40
- class Breed < ActiveRecord::Base
41
- class << self
42
- def update_average_age!
43
- # make sure pet is populated
44
- Pet.run_data_miner!
45
- update_all %{breeds.average_age = (SELECT AVG(pets.age) FROM pets WHERE pets.breed_id = breeds.name)}
46
- end
47
- end
48
- self.primary_key = "name"
49
- col :name
50
- col :average_age, :type => :float
51
- data_miner do
52
- process :auto_upgrade!
53
- import("A list of breeds", :url => "file://#{BREEDS}") do
54
- key :name, :field_name => 'Breed name'
55
- end
56
- process :update_average_age!
57
- end
58
- end
59
-
60
- ActiveRecord::Base.mass_assignment_sanitizer = :strict
61
- ActiveRecord::Base.descendants.each do |model|
62
- model.attr_accessible nil
63
- end
64
-
65
- Pet.auto_upgrade!
66
-
67
4
  describe DataMiner do
68
5
  describe "when used to import example data about pets" do
69
6
  before do
70
7
  Pet.delete_all
8
+ DataMiner::Run.delete_all
9
+ DataMiner::Run::ColumnStatistic.delete_all
71
10
  end
72
11
  it "it does not depend on mass-assignment" do
73
12
  lambda do
@@ -155,5 +94,10 @@ describe DataMiner do
155
94
  Pet.find('Amigo').weight.must_be_nil
156
95
  Pet.find('Amigo').weight_units.must_be_nil
157
96
  end
97
+ it "keeps a row count before and after" do
98
+ Pet.run_data_miner!
99
+ Pet.data_miner_runs.first.row_count_before.must_equal 0
100
+ Pet.data_miner_runs.first.row_count_after.must_equal 5
101
+ end
158
102
  end
159
103
  end
@@ -0,0 +1,43 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'helper'
3
+
4
+ describe DataMiner::Run::ColumnStatistic do
5
+ describe "when advanced statistics are enabled" do
6
+ before do
7
+ DataMiner.per_column_statistics = true
8
+ Pet.delete_all
9
+ DataMiner::Run.delete_all
10
+ DataMiner::Run::ColumnStatistic.delete_all
11
+ end
12
+
13
+ after do
14
+ DataMiner.per_column_statistics = false
15
+ end
16
+
17
+ it "keeps null count" do
18
+ Pet.run_data_miner!
19
+
20
+ Pet.data_miner_runs.first.column_statistics_for(:breed_id, :before).null_count.must_equal 0
21
+ Pet.data_miner_runs.first.column_statistics_for(:breed_id, :after).null_count.must_equal 1
22
+
23
+ Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :before).null_count.must_equal 0
24
+ Pet.data_miner_runs.first.column_statistics_for(:command_phrase, :after).null_count.must_equal 0
25
+ end
26
+
27
+ it "keeps max and min (as strings)" do
28
+ Pet.run_data_miner!
29
+ Pet.data_miner_runs.first.column_statistics_for(:age, :before).max.must_equal 'nil'
30
+ Pet.data_miner_runs.first.column_statistics_for(:age, :after).max.must_equal '17'
31
+ end
32
+
33
+ it "keeps average and stddev" do
34
+ Pet.run_data_miner!
35
+
36
+ Pet.data_miner_runs.first.column_statistics_for(:age, :before).average.must_be_nil
37
+ Pet.data_miner_runs.first.column_statistics_for(:age, :after).average.must_equal 7.0
38
+
39
+ Pet.data_miner_runs.first.column_statistics_for(:age, :before).standard_deviation.must_be_nil
40
+ Pet.data_miner_runs.first.column_statistics_for(:age, :after).standard_deviation.must_equal 5.8737
41
+ end
42
+ end
43
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_miner
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-05-10 00:00:00.000000000 Z
14
+ date: 2012-05-16 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: remote_table
@@ -147,6 +147,7 @@ files:
147
147
  - lib/data_miner/attribute.rb
148
148
  - lib/data_miner/dictionary.rb
149
149
  - lib/data_miner/run.rb
150
+ - lib/data_miner/run/column_statistic.rb
150
151
  - lib/data_miner/script.rb
151
152
  - lib/data_miner/step.rb
152
153
  - lib/data_miner/step/import.rb
@@ -160,6 +161,7 @@ files:
160
161
  - test/support/pets.csv
161
162
  - test/support/pets_funny.csv
162
163
  - test/test_data_miner.rb
164
+ - test/test_data_miner_run_column_statistic.rb
163
165
  - test/test_earth_import.rb
164
166
  - test/test_earth_tap.rb
165
167
  - test/test_safety.rb
@@ -196,6 +198,7 @@ test_files:
196
198
  - test/support/pets.csv
197
199
  - test/support/pets_funny.csv
198
200
  - test/test_data_miner.rb
201
+ - test/test_data_miner_run_column_statistic.rb
199
202
  - test/test_earth_import.rb
200
203
  - test/test_earth_tap.rb
201
204
  - test/test_safety.rb