data_miner 0.3.7 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -1
- data/VERSION +1 -1
- data/data_miner.gemspec +1 -2
- data/lib/data_miner.rb +4 -6
- data/lib/data_miner/attribute.rb +19 -9
- data/lib/data_miner/configuration.rb +14 -23
- data/lib/data_miner/import.rb +10 -8
- data/lib/data_miner/process.rb +6 -6
- data/lib/data_miner/run.rb +0 -1
- data/test/data_miner_test.rb +18 -24
- metadata +1 -2
- data/lib/data_miner/target.rb +0 -26
data/README.rdoc
CHANGED
@@ -44,7 +44,7 @@ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't kno
|
|
44
44
|
|
45
45
|
namespace :data_miner do
|
46
46
|
task :run => :environment do
|
47
|
-
DataMiner.run :
|
47
|
+
DataMiner.run :resource_names => ENV['RESOURCES'].to_s.split(/\s*,\s*/).flatten.compact
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.7
|
1
|
+
0.3.8
|
data/data_miner.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{data_miner}
|
8
|
-
s.version = "0.3.7"
|
8
|
+
s.version = "0.3.8"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
@@ -32,7 +32,6 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/data_miner/import.rb",
|
33
33
|
"lib/data_miner/process.rb",
|
34
34
|
"lib/data_miner/run.rb",
|
35
|
-
"lib/data_miner/target.rb",
|
36
35
|
"lib/data_miner/william_james_cartesian_product.rb",
|
37
36
|
"test/data_miner_test.rb",
|
38
37
|
"test/test_helper.rb"
|
data/lib/data_miner.rb
CHANGED
@@ -12,7 +12,6 @@ require 'data_miner/configuration'
|
|
12
12
|
require 'data_miner/dictionary'
|
13
13
|
require 'data_miner/import'
|
14
14
|
require 'data_miner/process'
|
15
|
-
require 'data_miner/target'
|
16
15
|
require 'data_miner/run'
|
17
16
|
|
18
17
|
# TODO: move to gem
|
@@ -38,8 +37,8 @@ module DataMiner
|
|
38
37
|
DataMiner::Configuration.run options
|
39
38
|
end
|
40
39
|
|
41
|
-
def self.classes
|
42
|
-
DataMiner::Configuration.classes
|
40
|
+
def self.resource_names
|
41
|
+
DataMiner::Configuration.resource_names
|
43
42
|
end
|
44
43
|
|
45
44
|
def self.create_tables
|
@@ -53,10 +52,9 @@ ActiveRecord::Base.class_eval do
|
|
53
52
|
logger.error "[DataMiner gem] Database table `#{table_name}` doesn't exist. DataMiner probably won't work properly until you run a migration or otherwise fix the schema."
|
54
53
|
return
|
55
54
|
end
|
56
|
-
|
57
|
-
DataMiner.
|
55
|
+
|
56
|
+
DataMiner.resource_names.add self.name
|
58
57
|
DataMiner.create_tables
|
59
|
-
DataMiner::Target.find_or_create_by_name name
|
60
58
|
|
61
59
|
belongs_to :data_miner_last_run, :class_name => 'DataMiner::Run'
|
62
60
|
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
module DataMiner
|
2
2
|
class Attribute
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :resource, :name, :options_for_import
|
4
4
|
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(resource, name)
|
6
|
+
@resource = resource
|
7
7
|
@name = name
|
8
8
|
@options_for_import = {}
|
9
9
|
end
|
10
10
|
|
11
11
|
def inspect
|
12
|
-
"Attribute(#{
|
12
|
+
"Attribute(#{resource}##{name})"
|
13
13
|
end
|
14
14
|
|
15
15
|
def stored_by?(import)
|
@@ -56,11 +56,21 @@ module DataMiner
|
|
56
56
|
end
|
57
57
|
|
58
58
|
# this will overwrite nils, even if wants_overwriting?(import) is false
|
59
|
+
# returns true if an attr was changed, otherwise false
|
59
60
|
def set_record_from_row(import, record, row)
|
60
|
-
return if !wants_overwriting?(import) and !record.send(name).nil?
|
61
|
-
|
62
|
-
|
63
|
-
|
61
|
+
return false if !wants_overwriting?(import) and !record.send(name).nil?
|
62
|
+
what_it_was = record.send name
|
63
|
+
what_it_should_be = value_from_row import, row
|
64
|
+
record.send "#{name}=", what_it_should_be
|
65
|
+
what_it_is = record.send name
|
66
|
+
if what_it_is.nil? and !what_it_should_be.nil?
|
67
|
+
DataMiner.logger.info "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
|
68
|
+
nil
|
69
|
+
elsif what_it_is == what_it_was
|
70
|
+
false
|
71
|
+
else
|
72
|
+
true
|
73
|
+
end
|
64
74
|
end
|
65
75
|
|
66
76
|
def unit_from_source(import, row)
|
@@ -87,7 +97,7 @@ module DataMiner
|
|
87
97
|
end
|
88
98
|
|
89
99
|
def column_type
|
90
|
-
|
100
|
+
resource.columns_hash[name.to_s].type
|
91
101
|
end
|
92
102
|
|
93
103
|
def dictionary(import)
|
@@ -2,12 +2,12 @@ module DataMiner
|
|
2
2
|
class Configuration
|
3
3
|
include Blockenspiel::DSL
|
4
4
|
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :resource, :runnables, :runnable_counter, :attributes, :unique_indices
|
6
6
|
|
7
|
-
def initialize(
|
7
|
+
def initialize(resource)
|
8
8
|
@runnables = Array.new
|
9
9
|
@unique_indices = Set.new
|
10
|
-
@
|
10
|
+
@resource = resource
|
11
11
|
@runnable_counter = 0
|
12
12
|
@attributes = HashWithIndifferentAccess.new
|
13
13
|
end
|
@@ -35,7 +35,7 @@ module DataMiner
|
|
35
35
|
|
36
36
|
def after_invoke
|
37
37
|
if unique_indices.empty?
|
38
|
-
raise(MissingHashColumn, "No unique_index defined for #{
|
38
|
+
raise(MissingHashColumn, "No unique_index defined for #{resource.name}, so you need a row_hash:string column.") unless resource.column_names.include?('row_hash')
|
39
39
|
unique_indices.add 'row_hash'
|
40
40
|
end
|
41
41
|
runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
|
@@ -43,10 +43,9 @@ module DataMiner
|
|
43
43
|
|
44
44
|
# Mine data for this class.
|
45
45
|
def run(options = {})
|
46
|
-
target = DataMiner::Target.find(klass.name)
|
47
46
|
finished = false
|
48
|
-
run =
|
49
|
-
|
47
|
+
run = DataMiner::Run.create! :started_at => Time.now, :resource_name => resource.name
|
48
|
+
resource.delete_all if options[:from_scratch]
|
50
49
|
begin
|
51
50
|
runnables.each { |runnable| runnable.run(run) }
|
52
51
|
finished = true
|
@@ -56,34 +55,26 @@ module DataMiner
|
|
56
55
|
nil
|
57
56
|
end
|
58
57
|
|
59
|
-
cattr_accessor :
|
60
|
-
self.
|
58
|
+
cattr_accessor :resource_names
|
59
|
+
self.resource_names = Set.new
|
61
60
|
class << self
|
62
|
-
# Mine data. Defaults to all
|
61
|
+
# Mine data. Defaults to all resource_names touched by DataMiner.
|
63
62
|
#
|
64
63
|
# Options
|
65
|
-
# * <tt>:
|
64
|
+
# * <tt>:resource_names</tt>: array of resource (class) names to mine
|
66
65
|
def run(options = {})
|
67
|
-
|
68
|
-
if options[:
|
69
|
-
|
66
|
+
resource_names.each do |resource_name|
|
67
|
+
if options[:resource_names].blank? or options[:resource_names].include?(resource_name)
|
68
|
+
resource_name.constantize.data_miner_config.run options
|
70
69
|
end
|
71
70
|
end
|
72
71
|
end
|
73
72
|
|
74
73
|
def create_tables
|
75
74
|
c = ActiveRecord::Base.connection
|
76
|
-
unless c.table_exists?('data_miner_targets')
|
77
|
-
c.create_table 'data_miner_targets', :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
78
|
-
t.string 'name'
|
79
|
-
t.datetime 'created_at'
|
80
|
-
t.datetime 'updated_at'
|
81
|
-
end
|
82
|
-
c.execute 'ALTER TABLE data_miner_targets ADD PRIMARY KEY (name);'
|
83
|
-
end
|
84
75
|
unless c.table_exists?('data_miner_runs')
|
85
76
|
c.create_table 'data_miner_runs', :options => 'ENGINE=InnoDB default charset=utf8' do |t|
|
86
|
-
t.string '
|
77
|
+
t.string 'resource_name'
|
87
78
|
t.boolean 'finished'
|
88
79
|
t.datetime 'started_at'
|
89
80
|
t.datetime 'ended_at'
|
data/lib/data_miner/import.rb
CHANGED
@@ -2,7 +2,7 @@ module DataMiner
|
|
2
2
|
class Import
|
3
3
|
attr_accessor :configuration, :position_in_run, :options, :table, :errata
|
4
4
|
attr_accessor :description
|
5
|
-
delegate :
|
5
|
+
delegate :resource, :to => :configuration
|
6
6
|
delegate :unique_indices, :to => :configuration
|
7
7
|
|
8
8
|
def initialize(configuration, position_in_run, description, options = {}, &block)
|
@@ -11,12 +11,12 @@ module DataMiner
|
|
11
11
|
@description = description
|
12
12
|
@options = options
|
13
13
|
yield self if block_given? # pull in attributes
|
14
|
-
@errata = Errata.new(:url => options[:errata], :klass =>
|
14
|
+
@errata = Errata.new(:url => options[:errata], :klass => resource) if options[:errata]
|
15
15
|
@table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
|
16
16
|
end
|
17
17
|
|
18
18
|
def inspect
|
19
|
-
"Import(#{
|
19
|
+
"Import(#{resource}) position #{position_in_run} (#{description})"
|
20
20
|
end
|
21
21
|
|
22
22
|
def attributes
|
@@ -28,7 +28,7 @@ module DataMiner
|
|
28
28
|
end
|
29
29
|
|
30
30
|
def store(attr_name, attr_options = {})
|
31
|
-
configuration.attributes[attr_name] ||= Attribute.new(
|
31
|
+
configuration.attributes[attr_name] ||= Attribute.new(resource, attr_name)
|
32
32
|
configuration.attributes[attr_name].options_for_import[self] = attr_options
|
33
33
|
end
|
34
34
|
|
@@ -45,14 +45,16 @@ module DataMiner
|
|
45
45
|
|
46
46
|
record_set = WilliamJamesCartesianProduct.cart_prod(*unifying_values).map do |combination|
|
47
47
|
next if combination.include?(nil)
|
48
|
-
|
48
|
+
resource.send "find_or_initialize_by_#{unique_indices.to_a.join('_and_')}", *combination
|
49
49
|
end.flatten
|
50
50
|
|
51
51
|
Array.wrap(record_set).each do |record|
|
52
|
-
attributes.values.
|
52
|
+
hits = attributes.values.map { |attr| attr.set_record_from_row self, record, row }
|
53
53
|
record.data_miner_touch_count ||= 0
|
54
|
-
|
55
|
-
|
54
|
+
if hits.any?
|
55
|
+
record.data_miner_touch_count += 1
|
56
|
+
record.data_miner_last_run = run
|
57
|
+
end
|
56
58
|
record.save!
|
57
59
|
end
|
58
60
|
end
|
data/lib/data_miner/process.rb
CHANGED
@@ -3,7 +3,7 @@ module DataMiner
|
|
3
3
|
attr_accessor :configuration, :position_in_run
|
4
4
|
attr_accessor :method_name
|
5
5
|
attr_accessor :block_description, :block
|
6
|
-
delegate :
|
6
|
+
delegate :resource, :to => :configuration
|
7
7
|
|
8
8
|
def initialize(configuration, position_in_run, method_name_or_block_description, &block)
|
9
9
|
@configuration = configuration
|
@@ -12,16 +12,16 @@ module DataMiner
|
|
12
12
|
@block_description = method_name_or_block_description
|
13
13
|
@block = block
|
14
14
|
else
|
15
|
-
@method_name =
|
15
|
+
@method_name = method_name_or_block_description
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
19
|
def inspect
|
20
|
-
str = "Process(#{
|
20
|
+
str = "Process(#{resource}) position #{position_in_run}"
|
21
21
|
if block
|
22
|
-
str << " called :#{method_name}"
|
23
|
-
else
|
24
22
|
str << " ran block (#{block_description})"
|
23
|
+
else
|
24
|
+
str << " called :#{method_name}"
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
@@ -29,7 +29,7 @@ module DataMiner
|
|
29
29
|
if block
|
30
30
|
block.call
|
31
31
|
else
|
32
|
-
|
32
|
+
resource.send method_name
|
33
33
|
end
|
34
34
|
DataMiner.logger.info "ran #{inspect}"
|
35
35
|
end
|
data/lib/data_miner/run.rb
CHANGED
data/test/data_miner_test.rb
CHANGED
@@ -876,25 +876,13 @@ class DataMinerTest < Test::Unit::TestCase
|
|
876
876
|
assert AutomobileVariant.first.fuel_efficiency_city.present?
|
877
877
|
end
|
878
878
|
|
879
|
-
# should "mine multiple classes in the correct order" do
|
880
|
-
# DataMiner.run
|
881
|
-
# uy = Country.find_by_iso_3166('UY')
|
882
|
-
# assert_equal 'Uruguay', uy.name
|
883
|
-
# end
|
884
|
-
|
885
|
-
should "have a target record for every class that is mined" do
|
886
|
-
DataMiner.run :class_names => %w{ Country }
|
887
|
-
assert DataMiner::Target.exists?(:name => 'Country')
|
888
|
-
assert_equal 1, DataMiner::Target.count(:conditions => {:name => 'country'})
|
889
|
-
end
|
890
|
-
|
891
879
|
should "keep a log when it does a run" do
|
892
880
|
approx_started_at = Time.now
|
893
|
-
DataMiner.run :class_names => %w{ Country }
|
881
|
+
DataMiner.run :resource_names => %w{ Country }
|
894
882
|
approx_ended_at = Time.now
|
895
|
-
|
896
|
-
assert (
|
897
|
-
assert (
|
883
|
+
last_run = DataMiner::Run.first(:conditions => { :resource_name => 'Country' }, :order => 'id DESC')
|
884
|
+
assert (last_run.started_at - approx_started_at).abs < 5 # seconds
|
885
|
+
assert (last_run.ended_at - approx_ended_at).abs < 5 # seconds
|
898
886
|
end
|
899
887
|
|
900
888
|
should "request a re-import from scratch" do
|
@@ -902,31 +890,37 @@ class DataMinerTest < Test::Unit::TestCase
|
|
902
890
|
c.iso_3166 = 'JUNK'
|
903
891
|
c.save!
|
904
892
|
assert Country.exists?(:iso_3166 => 'JUNK')
|
905
|
-
DataMiner.run :
|
893
|
+
DataMiner.run :resource_names => %w{ Country }, :from_scratch => true
|
906
894
|
assert !Country.exists?(:iso_3166 => 'JUNK')
|
907
895
|
end
|
908
896
|
|
909
897
|
should "track how many times a row was touched" do
|
910
|
-
DataMiner.run :
|
898
|
+
DataMiner.run :resource_names => %w{ Country }, :from_scratch => true
|
899
|
+
assert_equal 1, Country.first.data_miner_touch_count
|
900
|
+
DataMiner.run :resource_names => %w{ Country }
|
911
901
|
assert_equal 1, Country.first.data_miner_touch_count
|
912
|
-
DataMiner.run :class_names => %w{ Country }
|
913
|
-
assert_equal 2, Country.first.data_miner_touch_count
|
914
902
|
end
|
915
903
|
|
916
904
|
should "keep track of what the last import run that touched a row was" do
|
917
|
-
DataMiner.run :
|
905
|
+
DataMiner.run :resource_names => %w{ Country }, :from_scratch => true
|
918
906
|
a = DataMiner::Run.last
|
919
907
|
assert_equal a, Country.first.data_miner_last_run
|
920
|
-
DataMiner.run :
|
908
|
+
DataMiner.run :resource_names => %w{ Country }
|
921
909
|
b = DataMiner::Run.last
|
922
910
|
assert a != b
|
923
|
-
assert_equal
|
911
|
+
assert_equal a, Country.first.data_miner_last_run
|
924
912
|
end
|
925
913
|
|
926
914
|
unless ENV['FAST'] == 'true'
|
927
915
|
should "import using a dictionary" do
|
928
|
-
DataMiner.run :
|
916
|
+
DataMiner.run :resource_names => %w{ ResidentialEnergyConsumptionSurveyResponse }
|
929
917
|
assert ResidentialEnergyConsumptionSurveyResponse.find(6).residence_class.starts_with?('Single-family detached house')
|
930
918
|
end
|
919
|
+
|
920
|
+
should "mine multiple classes in the correct order" do
|
921
|
+
DataMiner.run
|
922
|
+
uy = Country.find_by_iso_3166('UY')
|
923
|
+
assert_equal 'Uruguay', uy.name
|
924
|
+
end
|
931
925
|
end
|
932
926
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.7
|
4
|
+
version: 0.3.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seamus Abshere
|
@@ -108,7 +108,6 @@ files:
|
|
108
108
|
- lib/data_miner/import.rb
|
109
109
|
- lib/data_miner/process.rb
|
110
110
|
- lib/data_miner/run.rb
|
111
|
-
- lib/data_miner/target.rb
|
112
111
|
- lib/data_miner/william_james_cartesian_product.rb
|
113
112
|
- test/data_miner_test.rb
|
114
113
|
- test/test_helper.rb
|
data/lib/data_miner/target.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module DataMiner
|
2
|
-
class Target < ActiveRecord::Base
|
3
|
-
set_table_name 'data_miner_targets'
|
4
|
-
set_primary_key :name
|
5
|
-
has_many :runs, :class_name => '::DataMiner::Run', :foreign_key => 'data_miner_target_id'
|
6
|
-
|
7
|
-
def klass
|
8
|
-
name.constantize
|
9
|
-
end
|
10
|
-
|
11
|
-
def run(options = {})
|
12
|
-
klass.data_miner_config.run options
|
13
|
-
end
|
14
|
-
|
15
|
-
def included_in_list_of_targets
|
16
|
-
msg = "must have a data_miner block"
|
17
|
-
unless DataMiner.classes.include?(name.constantize)
|
18
|
-
errors.add :name, msg
|
19
|
-
end
|
20
|
-
rescue NameError
|
21
|
-
errors.add :name, msg
|
22
|
-
end
|
23
|
-
|
24
|
-
validate :included_in_list_of_targets
|
25
|
-
end
|
26
|
-
end
|