data_miner 2.5.2 → 3.0.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +18 -0
- data/Gemfile +0 -2
- data/data_miner.gemspec +3 -7
- data/lib/data_miner.rb +2 -31
- data/lib/data_miner/active_record_class_methods.rb +5 -11
- data/lib/data_miner/attribute.rb +100 -198
- data/lib/data_miner/script.rb +5 -11
- data/lib/data_miner/step/import.rb +41 -27
- data/lib/data_miner/step/sql.rb +10 -10
- data/lib/data_miner/version.rb +1 -1
- data/test/data_miner/step/test_sql.rb +14 -18
- data/test/data_miner/test_attribute.rb +0 -32
- data/test/helper.rb +4 -9
- data/test/support/data_miner_with_alchemist.rb +1 -5
- data/test/support/pet.rb +10 -9
- data/test/support/pet2.rb +1 -1
- data/test/support/pets.csv +2 -2
- data/test/test_data_miner.rb +6 -40
- metadata +9 -97
- data/lib/data_miner/dictionary.rb +0 -84
- data/lib/data_miner/run.rb +0 -144
- data/lib/data_miner/run/column_statistic.rb +0 -78
- data/lib/data_miner/unit_converter.rb +0 -12
- data/lib/data_miner/unit_converter/alchemist.rb +0 -11
- data/lib/data_miner/unit_converter/conversions.rb +0 -11
- data/test/data_miner/step/test_import.rb +0 -35
- data/test/data_miner/unit_converter/test_alchemist.rb +0 -20
- data/test/data_miner/unit_converter/test_conversions.rb +0 -20
- data/test/support/data_miner_with_conversions.rb +0 -16
- data/test/support/data_miner_without_unit_converter.rb +0 -51
- data/test/test_data_miner_run_column_statistic.rb +0 -52
- data/test/test_earth_import.rb +0 -26
- data/test/test_safety.rb +0 -84
- data/test/test_unit_conversion.rb +0 -16
@@ -1,78 +0,0 @@
|
|
1
|
-
class DataMiner
|
2
|
-
class Run < ::ActiveRecord::Base
|
3
|
-
# If +DataMiner.per_column_statistics?+, this model keeps per-column stats like max, min, average, standard deviation, etc.
|
4
|
-
#
|
5
|
-
# Each +DataMiner::Run+ will have two of these for every column; an "initial" and a "final"
|
6
|
-
class ColumnStatistic < ::ActiveRecord::Base
|
7
|
-
class << self
|
8
|
-
def take(run)
|
9
|
-
unless table_exists?
|
10
|
-
auto_upgrade!
|
11
|
-
end
|
12
|
-
model = run.model_name.constantize
|
13
|
-
return unless model.table_exists?
|
14
|
-
model.column_names.each do |column_name|
|
15
|
-
column_statistic = new
|
16
|
-
column_statistic.run = run
|
17
|
-
column_statistic.model_name = run.model_name
|
18
|
-
column_statistic.column_name = column_name
|
19
|
-
column_statistic.take_statistics
|
20
|
-
column_statistic.save!
|
21
|
-
end
|
22
|
-
nil
|
23
|
-
end
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
NUMERIC = [
|
28
|
-
:integer,
|
29
|
-
:float,
|
30
|
-
:decimal,
|
31
|
-
]
|
32
|
-
|
33
|
-
self.table_name = 'data_miner_run_column_statistics'
|
34
|
-
|
35
|
-
belongs_to :run, :class_name => 'DataMiner::Run'
|
36
|
-
|
37
|
-
col :run_id, :type => :integer
|
38
|
-
col :model_name
|
39
|
-
col :column_name
|
40
|
-
col :null_count, :type => :integer
|
41
|
-
col :zero_count, :type => :integer
|
42
|
-
col :blank_count, :type => :integer
|
43
|
-
col :max
|
44
|
-
col :min
|
45
|
-
col :average, :type => :float
|
46
|
-
col :sum, :type => :float
|
47
|
-
col :created_at, :type => :datetime
|
48
|
-
add_index :run_id
|
49
|
-
add_index :model_name
|
50
|
-
|
51
|
-
# @private
|
52
|
-
def take_statistics
|
53
|
-
model = run.model_name.constantize
|
54
|
-
|
55
|
-
self.null_count = model.where("#{model.connection.quote_column_name(column_name)} IS NULL").count
|
56
|
-
|
57
|
-
self.max = calc(:MAX).inspect
|
58
|
-
self.min = calc(:MIN).inspect
|
59
|
-
|
60
|
-
column = model.columns_hash[column_name]
|
61
|
-
if NUMERIC.include?(column.type)
|
62
|
-
self.zero_count = model.where(column_name => 0).count
|
63
|
-
self.average = calc :AVG
|
64
|
-
self.sum = calc :SUM
|
65
|
-
elsif column.type == :string
|
66
|
-
self.blank_count = model.where("LENGTH(TRIM(#{model.connection.quote_column_name(column_name)})) = 0").count
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
def calc(operation)
|
73
|
-
model = run.model_name.constantize
|
74
|
-
model.connection.select_value "SELECT #{operation}(#{model.connection.quote_column_name(column_name)}) FROM #{model.quoted_table_name}"
|
75
|
-
end
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
@@ -1,35 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
init_database
|
3
|
-
require 'earth'
|
4
|
-
|
5
|
-
require 'earth/residence'
|
6
|
-
require 'earth/electricity'
|
7
|
-
require 'earth/hospitality'
|
8
|
-
|
9
|
-
class PetBlue < ActiveRecord::Base
|
10
|
-
data_miner do
|
11
|
-
import 'fake', :url => 'fake' do
|
12
|
-
key :id
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
PetBlue.auto_upgrade!
|
17
|
-
|
18
|
-
describe DataMiner::Step::Import do
|
19
|
-
describe '#table_has_autoincrementing_primary_key?' do
|
20
|
-
it "recognizes auto-increment primary keys" do
|
21
|
-
PetBlue.data_miner_script.steps.first.send(:table_has_autoincrementing_primary_key?).must_equal true
|
22
|
-
end
|
23
|
-
it "recognizes that not all integer primary keys are auto-increment" do
|
24
|
-
[
|
25
|
-
ElectricUtility,
|
26
|
-
ResidentialEnergyConsumptionSurveyResponse,
|
27
|
-
CommercialBuildingEnergyConsumptionSurveyResponse,
|
28
|
-
].each do |model|
|
29
|
-
model.data_miner_script.steps.select { |s| s.is_a?(DataMiner::Step::Import) }.each do |import_step|
|
30
|
-
import_step.send(:table_has_autoincrementing_primary_key?).must_equal false
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
describe 'DataMiner::UnitConverter::Alchemist' do
|
4
|
-
before do
|
5
|
-
@original_converter = DataMiner.unit_converter
|
6
|
-
DataMiner.unit_converter = :alchemist
|
7
|
-
end
|
8
|
-
|
9
|
-
after do
|
10
|
-
DataMiner.unit_converter = @original_converter
|
11
|
-
end
|
12
|
-
|
13
|
-
describe '#convert' do
|
14
|
-
it 'converts a value from one unit to another' do
|
15
|
-
value = DataMiner.unit_converter.convert 3.5, :kilograms, :pounds
|
16
|
-
assert value.is_a?(Float)
|
17
|
-
value.must_be_close_to 7.71617918
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
describe 'DataMiner::UnitConverter::Conversions' do
|
4
|
-
before do
|
5
|
-
@original_converter = DataMiner.unit_converter
|
6
|
-
DataMiner.unit_converter = :conversions
|
7
|
-
end
|
8
|
-
|
9
|
-
after do
|
10
|
-
DataMiner.unit_converter = @original_converter
|
11
|
-
end
|
12
|
-
|
13
|
-
describe '#convert' do
|
14
|
-
it 'converts a value from one unit to another' do
|
15
|
-
value = DataMiner.unit_converter.convert 3.5, :kilograms, :pounds
|
16
|
-
assert value.is_a?(Float)
|
17
|
-
value.must_be_close_to 7.71617918
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
require 'conversions'
|
4
|
-
Conversions.register :years, :years, 1
|
5
|
-
|
6
|
-
describe 'DataMiner with Conversions' do
|
7
|
-
before do
|
8
|
-
init_database(:conversions)
|
9
|
-
init_models
|
10
|
-
Pet.run_data_miner!
|
11
|
-
end
|
12
|
-
|
13
|
-
it 'converts convertible units' do
|
14
|
-
Pet.find('Pierre').weight.must_be_close_to 4.4.pounds.to(:kilograms)
|
15
|
-
end
|
16
|
-
end
|
@@ -1,51 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
class MyPet < ActiveRecord::Base
|
4
|
-
PETS = File.expand_path('../pets.csv', __FILE__)
|
5
|
-
COLOR_DICTIONARY_ENGLISH = File.expand_path('../pet_color_dictionary.en.csv', __FILE__)
|
6
|
-
|
7
|
-
self.primary_key = "name"
|
8
|
-
col :name
|
9
|
-
col :color_id
|
10
|
-
col :age, :type => :integer
|
11
|
-
col :age_units
|
12
|
-
col :weight, :type => :float
|
13
|
-
col :weight_units
|
14
|
-
col :height, :type => :integer
|
15
|
-
col :height_units
|
16
|
-
col :favorite_food
|
17
|
-
col :command_phrase
|
18
|
-
|
19
|
-
data_miner do
|
20
|
-
process :auto_upgrade!
|
21
|
-
import("A list of pets", :url => "file://#{PETS}") do
|
22
|
-
key :name
|
23
|
-
store :age
|
24
|
-
store :color_id, :field_name => :color, :dictionary => { :url => "file://#{COLOR_DICTIONARY_ENGLISH}", :input => :input, :output => :output }
|
25
|
-
store :weight
|
26
|
-
store :favorite_food, :nullify_blank_strings => true
|
27
|
-
store :command_phrase
|
28
|
-
store :height, :units => :centimetres
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
describe 'DataMiner with Conversions' do
|
34
|
-
it 'happens when DataMiner.unit_converter is nil' do
|
35
|
-
DataMiner.unit_converter.must_be_nil
|
36
|
-
end
|
37
|
-
|
38
|
-
it 'converts convertible units' do
|
39
|
-
init_database(nil)
|
40
|
-
MyPet.run_data_miner!
|
41
|
-
MyPet.find('Pierre').weight.must_equal 4.4
|
42
|
-
end
|
43
|
-
|
44
|
-
it 'raises an error if conversions are attempted' do
|
45
|
-
init_database(nil)
|
46
|
-
lambda do
|
47
|
-
init_models
|
48
|
-
Pet.run_data_miner!
|
49
|
-
end.must_raise DataMiner::Attribute::NoConverterSet
|
50
|
-
end
|
51
|
-
end
|
@@ -1,52 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
require 'helper'
|
3
|
-
init_database
|
4
|
-
init_models
|
5
|
-
|
6
|
-
describe DataMiner::Run::ColumnStatistic do
|
7
|
-
describe "when advanced statistics are enabled" do
|
8
|
-
before do
|
9
|
-
DataMiner.per_column_statistics = true
|
10
|
-
Pet.delete_all
|
11
|
-
DataMiner::Run.delete_all
|
12
|
-
DataMiner::Run::ColumnStatistic.delete_all
|
13
|
-
Pet.run_data_miner!
|
14
|
-
end
|
15
|
-
|
16
|
-
after do
|
17
|
-
DataMiner.per_column_statistics = false
|
18
|
-
end
|
19
|
-
|
20
|
-
it "keeps null count" do
|
21
|
-
Pet.data_miner_runs.first.initial_column_statistics(:breed_id).null_count.must_equal 0
|
22
|
-
Pet.data_miner_runs.first.final_column_statistics(:breed_id).null_count.must_equal 1
|
23
|
-
|
24
|
-
Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).null_count.must_equal 0
|
25
|
-
Pet.data_miner_runs.first.final_column_statistics(:command_phrase).null_count.must_equal 0
|
26
|
-
end
|
27
|
-
|
28
|
-
it "keeps max and min (as strings)" do
|
29
|
-
Pet.data_miner_runs.first.initial_column_statistics(:age).max.must_equal 'nil'
|
30
|
-
Pet.data_miner_runs.first.final_column_statistics(:age).max.must_include '17'
|
31
|
-
end
|
32
|
-
|
33
|
-
it "keeps average and sum" do
|
34
|
-
Pet.data_miner_runs.first.initial_column_statistics(:age).average.must_be_nil
|
35
|
-
Pet.data_miner_runs.first.final_column_statistics(:age).average.must_equal 7.0
|
36
|
-
|
37
|
-
Pet.data_miner_runs.first.initial_column_statistics(:age).sum.must_be_nil
|
38
|
-
Pet.data_miner_runs.first.final_column_statistics(:age).sum.must_equal 28.0
|
39
|
-
end
|
40
|
-
|
41
|
-
it "keeps blank (empty string) count" do
|
42
|
-
Pet.data_miner_runs.first.initial_column_statistics(:command_phrase).blank_count.must_equal 0
|
43
|
-
Pet.data_miner_runs.first.final_column_statistics(:command_phrase).blank_count.must_equal 3
|
44
|
-
end
|
45
|
-
|
46
|
-
it "keeps zero count" do
|
47
|
-
Pet.data_miner_runs.first.initial_column_statistics(:age).zero_count.must_equal 0
|
48
|
-
Pet.data_miner_runs.first.final_column_statistics(:age).zero_count.must_equal 0
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
end
|
data/test/test_earth_import.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
require 'helper'
|
3
|
-
init_database
|
4
|
-
require 'earth'
|
5
|
-
|
6
|
-
# use earth, which has a plethora of real-world data_miner blocks
|
7
|
-
Earth.init :locality, :pet, :load_data_miner => true, :apply_schemas => true
|
8
|
-
|
9
|
-
describe DataMiner do
|
10
|
-
describe "being used by the Earth library's import steps" do
|
11
|
-
describe "for pets" do
|
12
|
-
it "can pull breed and species" do
|
13
|
-
Breed.run_data_miner!
|
14
|
-
Breed.find('Golden Retriever').species.must_equal Species.find('dog')
|
15
|
-
end
|
16
|
-
end
|
17
|
-
describe "for localities" do
|
18
|
-
it "can handle non-latin characters" do
|
19
|
-
Country.run_data_miner!
|
20
|
-
Country.find('DE').name.must_equal 'Germany'
|
21
|
-
Country.find('AX').name.must_equal 'Åland Islands'
|
22
|
-
Country.find('CI').name.must_equal "Côte d'Ivoire"
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
data/test/test_safety.rb
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
require 'helper'
|
3
|
-
init_database
|
4
|
-
init_models
|
5
|
-
require 'earth'
|
6
|
-
|
7
|
-
require 'lock_method'
|
8
|
-
DataMiner::Run.lock_method :start
|
9
|
-
|
10
|
-
# use earth, which has a plethora of real-world data_miner blocks
|
11
|
-
Earth.init :locality, :pet, :load_data_miner => true, :apply_schemas => true
|
12
|
-
|
13
|
-
describe DataMiner do
|
14
|
-
describe "when being run in a multi-threaded environment" do
|
15
|
-
before do
|
16
|
-
@old_thread_abort_on_exception = Thread.abort_on_exception
|
17
|
-
Thread.abort_on_exception = false
|
18
|
-
end
|
19
|
-
|
20
|
-
after do
|
21
|
-
Thread.abort_on_exception = @old_thread_abort_on_exception
|
22
|
-
end
|
23
|
-
|
24
|
-
it "tries not to duplicate data" do
|
25
|
-
Breed.delete_all
|
26
|
-
Breed.run_data_miner!
|
27
|
-
reference_count = Breed.count
|
28
|
-
Breed.delete_all
|
29
|
-
threads = (0..2).map do |i|
|
30
|
-
Thread.new do
|
31
|
-
# $stderr.write "Thread #{i} starting\n"
|
32
|
-
Breed.run_data_miner!
|
33
|
-
# $stderr.write "Thread #{i} done\n"
|
34
|
-
end
|
35
|
-
end
|
36
|
-
exceptions = []
|
37
|
-
threads.each do |t|
|
38
|
-
begin
|
39
|
-
t.join
|
40
|
-
rescue
|
41
|
-
exceptions << $!
|
42
|
-
end
|
43
|
-
end
|
44
|
-
exceptions.length.must_equal 2
|
45
|
-
exceptions.each do |exception|
|
46
|
-
exception.must_be_kind_of LockMethod::Locked
|
47
|
-
end
|
48
|
-
Breed.count.must_equal reference_count
|
49
|
-
end
|
50
|
-
|
51
|
-
it "allows you to clear locks if necessary" do
|
52
|
-
threads = (0..2).map do |i|
|
53
|
-
Thread.new do
|
54
|
-
# $stderr.write "Thread #{i} starting\n"
|
55
|
-
case i
|
56
|
-
when 0
|
57
|
-
Breed.run_data_miner!
|
58
|
-
when 1
|
59
|
-
sleep 0.3
|
60
|
-
DataMiner::Run.clear_locks
|
61
|
-
Breed.run_data_miner!
|
62
|
-
when 2
|
63
|
-
# i will hit a lock!
|
64
|
-
sleep 0.6
|
65
|
-
Breed.run_data_miner!
|
66
|
-
end
|
67
|
-
# $stderr.write "Thread #{i} done\n"
|
68
|
-
end
|
69
|
-
end
|
70
|
-
exceptions = []
|
71
|
-
threads.each do |t|
|
72
|
-
begin
|
73
|
-
t.join
|
74
|
-
rescue
|
75
|
-
exceptions << $!
|
76
|
-
end
|
77
|
-
end
|
78
|
-
exceptions.length.must_equal 1
|
79
|
-
exceptions.each do |exception|
|
80
|
-
exception.must_be_kind_of LockMethod::Locked
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
describe 'DataMiner unit conversion' do
|
4
|
-
it "blows up if you don't specify a converter" do
|
5
|
-
output = `ruby -I#{File.dirname(__FILE__)} #{File.expand_path('../support/data_miner_without_unit_converter.rb', __FILE__)}`
|
6
|
-
refute $?.success?, output
|
7
|
-
end
|
8
|
-
it 'can convert with alchemist' do
|
9
|
-
output = `ruby -I#{File.dirname(__FILE__)} #{File.expand_path('../support/data_miner_with_alchemist.rb', __FILE__)}`
|
10
|
-
assert $?.success?, output
|
11
|
-
end
|
12
|
-
it 'can convert with conversions' do
|
13
|
-
output = `ruby -I#{File.dirname(__FILE__)} #{File.expand_path('../support/data_miner_with_conversions.rb', __FILE__)}`
|
14
|
-
assert $?.success?, output
|
15
|
-
end
|
16
|
-
end
|