data_miner 2.5.2 → 3.0.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +18 -0
- data/Gemfile +0 -2
- data/data_miner.gemspec +3 -7
- data/lib/data_miner.rb +2 -31
- data/lib/data_miner/active_record_class_methods.rb +5 -11
- data/lib/data_miner/attribute.rb +100 -198
- data/lib/data_miner/script.rb +5 -11
- data/lib/data_miner/step/import.rb +41 -27
- data/lib/data_miner/step/sql.rb +10 -10
- data/lib/data_miner/version.rb +1 -1
- data/test/data_miner/step/test_sql.rb +14 -18
- data/test/data_miner/test_attribute.rb +0 -32
- data/test/helper.rb +4 -9
- data/test/support/data_miner_with_alchemist.rb +1 -5
- data/test/support/pet.rb +10 -9
- data/test/support/pet2.rb +1 -1
- data/test/support/pets.csv +2 -2
- data/test/test_data_miner.rb +6 -40
- metadata +9 -97
- data/lib/data_miner/dictionary.rb +0 -84
- data/lib/data_miner/run.rb +0 -144
- data/lib/data_miner/run/column_statistic.rb +0 -78
- data/lib/data_miner/unit_converter.rb +0 -12
- data/lib/data_miner/unit_converter/alchemist.rb +0 -11
- data/lib/data_miner/unit_converter/conversions.rb +0 -11
- data/test/data_miner/step/test_import.rb +0 -35
- data/test/data_miner/unit_converter/test_alchemist.rb +0 -20
- data/test/data_miner/unit_converter/test_conversions.rb +0 -20
- data/test/support/data_miner_with_conversions.rb +0 -16
- data/test/support/data_miner_without_unit_converter.rb +0 -51
- data/test/test_data_miner_run_column_statistic.rb +0 -52
- data/test/test_earth_import.rb +0 -26
- data/test/test_safety.rb +0 -84
- data/test/test_unit_conversion.rb +0 -16
@@ -1,78 +0,0 @@
|
|
1
|
-
class DataMiner
  class Run < ::ActiveRecord::Base
    # Per-column summary figures (null/zero/blank counts, max, min, average,
    # sum) for a model's table, tied to one DataMiner::Run. Meant to be used
    # when +DataMiner.per_column_statistics?+ is on.
    #
    # Each +DataMiner::Run+ is meant to have two of these per column: an
    # "initial" and a "final" snapshot.
    class ColumnStatistic < ::ActiveRecord::Base
      # Column types that additionally get zero_count, average and sum.
      NUMERIC = [:integer, :float, :decimal]

      # Records one statistics row per column of the run's model.
      #
      # Creates this model's own table first if it is missing, and does
      # nothing when the target model has no table.
      #
      # @param run [DataMiner::Run] the run the statistics belong to
      # @return [nil]
      def self.take(run)
        auto_upgrade! unless table_exists?

        target = run.model_name.constantize
        if target.table_exists?
          target.column_names.each do |name|
            new.tap do |statistic|
              # run and column_name must be set before take_statistics reads them
              statistic.run = run
              statistic.model_name = run.model_name
              statistic.column_name = name
              statistic.take_statistics
              statistic.save!
            end
          end
        end

        nil
      end

      self.table_name = 'data_miner_run_column_statistics'

      belongs_to :run, :class_name => 'DataMiner::Run'

      col :run_id, :type => :integer
      col :model_name
      col :column_name
      col :null_count, :type => :integer
      col :zero_count, :type => :integer
      col :blank_count, :type => :integer
      col :max
      col :min
      col :average, :type => :float
      col :sum, :type => :float
      col :created_at, :type => :datetime
      add_index :run_id
      add_index :model_name

      # Fills in the statistic attributes for +column_name+ by querying the
      # run's model. max and min are stored as +inspect+ strings; zero_count,
      # average and sum are only set for NUMERIC columns, blank_count only for
      # :string columns.
      #
      # @private
      def take_statistics
        target = run.model_name.constantize
        quoted_column = target.connection.quote_column_name(column_name)

        self.null_count = target.where("#{quoted_column} IS NULL").count

        self.max = calc(:MAX).inspect
        self.min = calc(:MIN).inspect

        case target.columns_hash[column_name].type
        when *NUMERIC
          self.zero_count = target.where(column_name => 0).count
          self.average = calc(:AVG)
          self.sum = calc(:SUM)
        when :string
          self.blank_count = target.where("LENGTH(TRIM(#{quoted_column})) = 0").count
        end
      end

      private

      # Runs the SQL aggregate +operation+ (e.g. :MAX) over +column_name+ on
      # the run's model table and returns the single resulting value.
      def calc(operation)
        target = run.model_name.constantize
        quoted_column = target.connection.quote_column_name(column_name)
        target.connection.select_value("SELECT #{operation}(#{quoted_column}) FROM #{target.quoted_table_name}")
      end
    end
  end
end
|
@@ -1,35 +0,0 @@
|
|
1
|
-
require 'helper'
init_database
require 'earth'

require 'earth/residence'
require 'earth/electricity'
require 'earth/hospitality'

# Minimal model with a single placeholder import step; the spec below only
# inspects how that step sees the table's primary key.
class PetBlue < ActiveRecord::Base
  data_miner do
    import('fake', :url => 'fake') { key :id }
  end
end
PetBlue.auto_upgrade!

describe DataMiner::Step::Import do
  describe '#table_has_autoincrementing_primary_key?' do
    # Invoked through +send+ (presumably because the predicate is not public).
    def autoincrementing?(step)
      step.send(:table_has_autoincrementing_primary_key?)
    end

    it "recognizes auto-increment primary keys" do
      autoincrementing?(PetBlue.data_miner_script.steps.first).must_equal true
    end

    it "recognizes that not all integer primary keys are auto-increment" do
      earth_models = [
        ElectricUtility,
        ResidentialEnergyConsumptionSurveyResponse,
        CommercialBuildingEnergyConsumptionSurveyResponse,
      ]

      earth_models.each do |model|
        import_steps = model.data_miner_script.steps.grep(DataMiner::Step::Import)
        import_steps.each { |step| autoincrementing?(step).must_equal false }
      end
    end
  end
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'helper'

describe 'DataMiner::UnitConverter::Alchemist' do
  # Select the :alchemist converter for each example, remembering the
  # previous setting so it can be restored afterwards.
  before do
    @original_converter = DataMiner.unit_converter
    DataMiner.unit_converter = :alchemist
  end

  after { DataMiner.unit_converter = @original_converter }

  describe '#convert' do
    it 'converts a value from one unit to another' do
      pounds = DataMiner.unit_converter.convert(3.5, :kilograms, :pounds)
      assert pounds.kind_of?(Float)
      pounds.must_be_close_to 7.71617918
    end
  end
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'helper'

describe 'DataMiner::UnitConverter::Conversions' do
  # Select the :conversions converter for each example, remembering the
  # previous setting so it can be restored afterwards.
  before do
    @original_converter = DataMiner.unit_converter
    DataMiner.unit_converter = :conversions
  end

  after { DataMiner.unit_converter = @original_converter }

  describe '#convert' do
    it 'converts a value from one unit to another' do
      pounds = DataMiner.unit_converter.convert(3.5, :kilograms, :pounds)
      assert pounds.kind_of?(Float)
      pounds.must_be_close_to 7.71617918
    end
  end
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'helper'

require 'conversions'
# Register an identity conversion for years.
Conversions.register(:years, :years, 1)

describe 'DataMiner with Conversions' do
  # Set up the database with the :conversions converter, load the models and
  # import the pets before every example.
  before do
    init_database(:conversions)
    init_models
    Pet.run_data_miner!
  end

  it 'converts convertible units' do
    stored_weight = Pet.find('Pierre').weight
    expected_weight = 4.4.pounds.to(:kilograms)
    stored_weight.must_be_close_to expected_weight
  end
end
|
@@ -1,51 +0,0 @@
|
|
1
|
-
require 'helper'

# Pet-like model declared locally for this script; it imports from the
# pets.csv that sits next to this file.
class MyPet < ActiveRecord::Base
  PETS = File.expand_path('../pets.csv', __FILE__)
  COLOR_DICTIONARY_ENGLISH = File.expand_path('../pet_color_dictionary.en.csv', __FILE__)

  self.primary_key = "name"

  col :name
  col :color_id
  col :age, :type => :integer
  col :age_units
  col :weight, :type => :float
  col :weight_units
  col :height, :type => :integer
  col :height_units
  col :favorite_food
  col :command_phrase

  data_miner do
    process :auto_upgrade!

    import("A list of pets", :url => "file://#{PETS}") do
      key :name
      store :age
      # The dictionary hash is built inline so every evaluation gets a fresh one.
      store :color_id,
            :field_name => :color,
            :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
      store :weight
      store :favorite_food, :nullify_blank_strings => true
      store :command_phrase
      store :height, :units => :centimetres
    end
  end
end

# NOTE(review): the description string says "with Conversions", yet the
# examples below assert that no unit converter is configured - looks like a
# copy-paste leftover; confirm before renaming.
describe 'DataMiner with Conversions' do
  it 'happens when DataMiner.unit_converter is nil' do
    DataMiner.unit_converter.must_be_nil
  end

  it 'converts convertible units' do
    init_database(nil)
    MyPet.run_data_miner!
    pierre = MyPet.find('Pierre')
    pierre.weight.must_equal 4.4
  end

  it 'raises an error if conversions are attempted' do
    init_database(nil)
    assert_raises(DataMiner::Attribute::NoConverterSet) do
      init_models
      Pet.run_data_miner!
    end
  end
end
|
@@ -1,52 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
require 'helper'
init_database
init_models

describe DataMiner::Run::ColumnStatistic do
  describe "when advanced statistics are enabled" do
    # "Initial" statistics of +column+ for the first Pet run. Looked up afresh
    # on every call, nothing is cached.
    def initial_stats(column)
      Pet.data_miner_runs.first.initial_column_statistics(column)
    end

    # "Final" statistics of +column+ for the first Pet run.
    def final_stats(column)
      Pet.data_miner_runs.first.final_column_statistics(column)
    end

    # Start every example from empty tables with per-column statistics on,
    # then perform one Pet run.
    before do
      DataMiner.per_column_statistics = true
      [Pet, DataMiner::Run, DataMiner::Run::ColumnStatistic].each { |model| model.delete_all }
      Pet.run_data_miner!
    end

    after { DataMiner.per_column_statistics = false }

    it "keeps null count" do
      initial_stats(:breed_id).null_count.must_equal 0
      final_stats(:breed_id).null_count.must_equal 1

      initial_stats(:command_phrase).null_count.must_equal 0
      final_stats(:command_phrase).null_count.must_equal 0
    end

    it "keeps max and min (as strings)" do
      initial_stats(:age).max.must_equal 'nil'
      final_stats(:age).max.must_include '17'
    end

    it "keeps average and sum" do
      initial_stats(:age).average.must_be_nil
      final_stats(:age).average.must_equal 7.0

      initial_stats(:age).sum.must_be_nil
      final_stats(:age).sum.must_equal 28.0
    end

    it "keeps blank (empty string) count" do
      initial_stats(:command_phrase).blank_count.must_equal 0
      final_stats(:command_phrase).blank_count.must_equal 3
    end

    it "keeps zero count" do
      initial_stats(:age).zero_count.must_equal 0
      final_stats(:age).zero_count.must_equal 0
    end
  end
end
|
data/test/test_earth_import.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
require 'helper'
init_database
require 'earth'

# Earth ships a plethora of real-world data_miner blocks, so it serves as the fixture here.
Earth.init(:locality, :pet, :load_data_miner => true, :apply_schemas => true)

describe DataMiner do
  describe "being used by the Earth library's import steps" do
    describe "for pets" do
      it "can pull breed and species" do
        Breed.run_data_miner!
        golden_retriever = Breed.find('Golden Retriever')
        golden_retriever.species.must_equal Species.find('dog')
      end
    end

    describe "for localities" do
      it "can handle non-latin characters" do
        Country.run_data_miner!

        # [primary key, expected name], checked in this order
        [
          ['DE', 'Germany'],
          ['AX', 'Åland Islands'],
          ['CI', "Côte d'Ivoire"],
        ].each do |code, expected_name|
          Country.find(code).name.must_equal expected_name
        end
      end
    end
  end
end
|
data/test/test_safety.rb
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
require 'helper'
init_database
init_models
require 'earth'

require 'lock_method'
DataMiner::Run.lock_method :start

# Earth ships a plethora of real-world data_miner blocks, so it serves as the fixture here.
Earth.init(:locality, :pet, :load_data_miner => true, :apply_schemas => true)

describe DataMiner do
  describe "when being run in a multi-threaded environment" do
    # Joins every thread in order and returns the errors that surfaced while
    # joining (same bare-rescue scope as a plain +rescue+).
    def errors_raised_by(threads)
      threads.each_with_object([]) do |thread, errors|
        begin
          thread.join
        rescue => error
          errors << error
        end
      end
    end

    # Errors must stay inside their thread until it is joined, so switch off
    # abort_on_exception for the duration of each example.
    before do
      @old_thread_abort_on_exception = Thread.abort_on_exception
      Thread.abort_on_exception = false
    end

    after { Thread.abort_on_exception = @old_thread_abort_on_exception }

    it "tries not to duplicate data" do
      Breed.delete_all
      Breed.run_data_miner!
      reference_count = Breed.count
      Breed.delete_all

      threads = Array.new(3) { Thread.new { Breed.run_data_miner! } }
      exceptions = errors_raised_by(threads)

      exceptions.length.must_equal 2
      exceptions.each { |exception| exception.must_be_kind_of LockMethod::Locked }
      Breed.count.must_equal reference_count
    end

    it "allows you to clear locks if necessary" do
      jobs = [
        lambda { Breed.run_data_miner! },
        lambda do
          sleep 0.3
          DataMiner::Run.clear_locks
          Breed.run_data_miner!
        end,
        lambda do
          # this one is expected to hit a lock
          sleep 0.6
          Breed.run_data_miner!
        end,
      ]

      threads = jobs.map { |job| Thread.new { job.call } }
      exceptions = errors_raised_by(threads)

      exceptions.length.must_equal 1
      exceptions.each { |exception| exception.must_be_kind_of LockMethod::Locked }
    end
  end
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'helper'

describe 'DataMiner unit conversion' do
  # Runs a script from the support/ directory next to this file in a separate
  # ruby process (with this file's directory on the load path) and returns
  # what it printed to standard output. The exit status stays in $?.
  def run_support_script(basename)
    script = File.expand_path("../support/#{basename}", __FILE__)
    `ruby -I#{File.dirname(__FILE__)} #{script}`
  end

  it "blows up if you don't specify a converter" do
    output = run_support_script('data_miner_without_unit_converter.rb')
    refute $?.success?, output
  end

  it 'can convert with alchemist' do
    output = run_support_script('data_miner_with_alchemist.rb')
    assert $?.success?, output
  end

  it 'can convert with conversions' do
    output = run_support_script('data_miner_with_conversions.rb')
    assert $?.success?, output
  end
end
|