data_miner 0.3.7 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -1
- data/VERSION +1 -1
- data/data_miner.gemspec +1 -2
- data/lib/data_miner.rb +4 -6
- data/lib/data_miner/attribute.rb +19 -9
- data/lib/data_miner/configuration.rb +14 -23
- data/lib/data_miner/import.rb +10 -8
- data/lib/data_miner/process.rb +6 -6
- data/lib/data_miner/run.rb +0 -1
- data/test/data_miner_test.rb +18 -24
- metadata +1 -2
- data/lib/data_miner/target.rb +0 -26
data/README.rdoc
CHANGED
@@ -44,7 +44,7 @@ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't kno
|
|
44
44
|
|
45
45
|
namespace :data_miner do
|
46
46
|
task :run => :environment do
|
47
|
-
DataMiner.run :
|
47
|
+
DataMiner.run :resource_names => ENV['RESOURCES'].to_s.split(/\s*,\s*/).flatten.compact
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.7
|
1
|
+
0.3.8
|
data/data_miner.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{data_miner}
|
8
|
-
s.version = "0.3.7"
|
8
|
+
s.version = "0.3.8"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
@@ -32,7 +32,6 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/data_miner/import.rb",
|
33
33
|
"lib/data_miner/process.rb",
|
34
34
|
"lib/data_miner/run.rb",
|
35
|
-
"lib/data_miner/target.rb",
|
36
35
|
"lib/data_miner/william_james_cartesian_product.rb",
|
37
36
|
"test/data_miner_test.rb",
|
38
37
|
"test/test_helper.rb"
|
data/lib/data_miner.rb
CHANGED
@@ -12,7 +12,6 @@ require 'data_miner/configuration'
|
|
12
12
|
require 'data_miner/dictionary'
|
13
13
|
require 'data_miner/import'
|
14
14
|
require 'data_miner/process'
|
15
|
-
require 'data_miner/target'
|
16
15
|
require 'data_miner/run'
|
17
16
|
|
18
17
|
# TODO: move to gem
|
@@ -38,8 +37,8 @@ module DataMiner
|
|
38
37
|
DataMiner::Configuration.run options
|
39
38
|
end
|
40
39
|
|
41
|
-
def self.
|
42
|
-
DataMiner::Configuration.
|
40
|
+
def self.resource_names
|
41
|
+
DataMiner::Configuration.resource_names
|
43
42
|
end
|
44
43
|
|
45
44
|
def self.create_tables
|
@@ -53,10 +52,9 @@ ActiveRecord::Base.class_eval do
|
|
53
52
|
logger.error "[DataMiner gem] Database table `#{table_name}` doesn't exist. DataMiner probably won't work properly until you run a migration or otherwise fix the schema."
|
54
53
|
return
|
55
54
|
end
|
56
|
-
|
57
|
-
DataMiner.
|
55
|
+
|
56
|
+
DataMiner.resource_names.add self.name
|
58
57
|
DataMiner.create_tables
|
59
|
-
DataMiner::Target.find_or_create_by_name name
|
60
58
|
|
61
59
|
belongs_to :data_miner_last_run, :class_name => 'DataMiner::Run'
|
62
60
|
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
module DataMiner
|
2
2
|
class Attribute
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :resource, :name, :options_for_import
|
4
4
|
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(resource, name)
|
6
|
+
@resource = resource
|
7
7
|
@name = name
|
8
8
|
@options_for_import = {}
|
9
9
|
end
|
10
10
|
|
11
11
|
def inspect
|
12
|
-
"Attribute(#{
|
12
|
+
"Attribute(#{resource}##{name})"
|
13
13
|
end
|
14
14
|
|
15
15
|
def stored_by?(import)
|
@@ -56,11 +56,21 @@ module DataMiner
|
|
56
56
|
end
|
57
57
|
|
58
58
|
# this will overwrite nils, even if wants_overwriting?(import) is false
|
59
|
+
# returns true if an attr was changed, otherwise false
|
59
60
|
def set_record_from_row(import, record, row)
|
60
|
-
return if !wants_overwriting?(import) and !record.send(name).nil?
|
61
|
-
|
62
|
-
|
63
|
-
|
61
|
+
return false if !wants_overwriting?(import) and !record.send(name).nil?
|
62
|
+
what_it_was = record.send name
|
63
|
+
what_it_should_be = value_from_row import, row
|
64
|
+
record.send "#{name}=", what_it_should_be
|
65
|
+
what_it_is = record.send name
|
66
|
+
if what_it_is.nil? and !what_it_should_be.nil?
|
67
|
+
DataMiner.logger.info "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
|
68
|
+
nil
|
69
|
+
elsif what_it_is == what_it_was
|
70
|
+
false
|
71
|
+
else
|
72
|
+
true
|
73
|
+
end
|
64
74
|
end
|
65
75
|
|
66
76
|
def unit_from_source(import, row)
|
@@ -87,7 +97,7 @@ module DataMiner
|
|
87
97
|
end
|
88
98
|
|
89
99
|
def column_type
|
90
|
-
|
100
|
+
resource.columns_hash[name.to_s].type
|
91
101
|
end
|
92
102
|
|
93
103
|
def dictionary(import)
|
@@ -2,12 +2,12 @@ module DataMiner
|
|
2
2
|
class Configuration
|
3
3
|
include Blockenspiel::DSL
|
4
4
|
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :resource, :runnables, :runnable_counter, :attributes, :unique_indices
|
6
6
|
|
7
|
-
def initialize(
|
7
|
+
def initialize(resource)
|
8
8
|
@runnables = Array.new
|
9
9
|
@unique_indices = Set.new
|
10
|
-
@
|
10
|
+
@resource = resource
|
11
11
|
@runnable_counter = 0
|
12
12
|
@attributes = HashWithIndifferentAccess.new
|
13
13
|
end
|
@@ -35,7 +35,7 @@ module DataMiner
|
|
35
35
|
|
36
36
|
def after_invoke
|
37
37
|
if unique_indices.empty?
|
38
|
-
raise(MissingHashColumn, "No unique_index defined for #{
|
38
|
+
raise(MissingHashColumn, "No unique_index defined for #{resource.name}, so you need a row_hash:string column.") unless resource.column_names.include?('row_hash')
|
39
39
|
unique_indices.add 'row_hash'
|
40
40
|
end
|
41
41
|
runnables.select { |runnable| runnable.is_a?(Import) }.each { |runnable| unique_indices.each { |unique_index| runnable.store(unique_index) unless runnable.stores?(unique_index) } }
|
@@ -43,10 +43,9 @@ module DataMiner
|
|
43
43
|
|
44
44
|
# Mine data for this class.
|
45
45
|
def run(options = {})
|
46
|
-
target = DataMiner::Target.find(klass.name)
|
47
46
|
finished = false
|
48
|
-
run =
|
49
|
-
|
47
|
+
run = DataMiner::Run.create! :started_at => Time.now, :resource_name => resource.name
|
48
|
+
resource.delete_all if options[:from_scratch]
|
50
49
|
begin
|
51
50
|
runnables.each { |runnable| runnable.run(run) }
|
52
51
|
finished = true
|
@@ -56,34 +55,26 @@ module DataMiner
|
|
56
55
|
nil
|
57
56
|
end
|
58
57
|
|
59
|
-
cattr_accessor :
|
60
|
-
self.
|
58
|
+
cattr_accessor :resource_names
|
59
|
+
self.resource_names = Set.new
|
61
60
|
class << self
|
62
|
-
# Mine data. Defaults to all
|
61
|
+
# Mine data. Defaults to all resource_names touched by DataMiner.
|
63
62
|
#
|
64
63
|
# Options
|
65
|
-
# * <tt>:
|
64
|
+
# * <tt>:resource_names</tt>: array of resource (class) names to mine
|
66
65
|
def run(options = {})
|
67
|
-
|
68
|
-
if options[:
|
69
|
-
|
66
|
+
resource_names.each do |resource_name|
|
67
|
+
if options[:resource_names].blank? or options[:resource_names].include?(resource_name)
|
68
|
+
resource_name.constantize.data_miner_config.run options
|
70
69
|
end
|
71
70
|
end
|
72
71
|
end
|
73
72
|
|
74
73
|
def create_tables
|
75
74
|
c = ActiveRecord::Base.connection
|
76
|
-
unless c.table_exists?('data_miner_targets')
|
77
|
-
c.create_table 'data_miner_targets', :options => 'ENGINE=InnoDB default charset=utf8', :id => false do |t|
|
78
|
-
t.string 'name'
|
79
|
-
t.datetime 'created_at'
|
80
|
-
t.datetime 'updated_at'
|
81
|
-
end
|
82
|
-
c.execute 'ALTER TABLE data_miner_targets ADD PRIMARY KEY (name);'
|
83
|
-
end
|
84
75
|
unless c.table_exists?('data_miner_runs')
|
85
76
|
c.create_table 'data_miner_runs', :options => 'ENGINE=InnoDB default charset=utf8' do |t|
|
86
|
-
t.string '
|
77
|
+
t.string 'resource_name'
|
87
78
|
t.boolean 'finished'
|
88
79
|
t.datetime 'started_at'
|
89
80
|
t.datetime 'ended_at'
|
data/lib/data_miner/import.rb
CHANGED
@@ -2,7 +2,7 @@ module DataMiner
|
|
2
2
|
class Import
|
3
3
|
attr_accessor :configuration, :position_in_run, :options, :table, :errata
|
4
4
|
attr_accessor :description
|
5
|
-
delegate :
|
5
|
+
delegate :resource, :to => :configuration
|
6
6
|
delegate :unique_indices, :to => :configuration
|
7
7
|
|
8
8
|
def initialize(configuration, position_in_run, description, options = {}, &block)
|
@@ -11,12 +11,12 @@ module DataMiner
|
|
11
11
|
@description = description
|
12
12
|
@options = options
|
13
13
|
yield self if block_given? # pull in attributes
|
14
|
-
@errata = Errata.new(:url => options[:errata], :klass =>
|
14
|
+
@errata = Errata.new(:url => options[:errata], :klass => resource) if options[:errata]
|
15
15
|
@table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
|
16
16
|
end
|
17
17
|
|
18
18
|
def inspect
|
19
|
-
"Import(#{
|
19
|
+
"Import(#{resource}) position #{position_in_run} (#{description})"
|
20
20
|
end
|
21
21
|
|
22
22
|
def attributes
|
@@ -28,7 +28,7 @@ module DataMiner
|
|
28
28
|
end
|
29
29
|
|
30
30
|
def store(attr_name, attr_options = {})
|
31
|
-
configuration.attributes[attr_name] ||= Attribute.new(
|
31
|
+
configuration.attributes[attr_name] ||= Attribute.new(resource, attr_name)
|
32
32
|
configuration.attributes[attr_name].options_for_import[self] = attr_options
|
33
33
|
end
|
34
34
|
|
@@ -45,14 +45,16 @@ module DataMiner
|
|
45
45
|
|
46
46
|
record_set = WilliamJamesCartesianProduct.cart_prod(*unifying_values).map do |combination|
|
47
47
|
next if combination.include?(nil)
|
48
|
-
|
48
|
+
resource.send "find_or_initialize_by_#{unique_indices.to_a.join('_and_')}", *combination
|
49
49
|
end.flatten
|
50
50
|
|
51
51
|
Array.wrap(record_set).each do |record|
|
52
|
-
attributes.values.
|
52
|
+
hits = attributes.values.map { |attr| attr.set_record_from_row self, record, row }
|
53
53
|
record.data_miner_touch_count ||= 0
|
54
|
-
|
55
|
-
|
54
|
+
if hits.any?
|
55
|
+
record.data_miner_touch_count += 1
|
56
|
+
record.data_miner_last_run = run
|
57
|
+
end
|
56
58
|
record.save!
|
57
59
|
end
|
58
60
|
end
|
data/lib/data_miner/process.rb
CHANGED
@@ -3,7 +3,7 @@ module DataMiner
|
|
3
3
|
attr_accessor :configuration, :position_in_run
|
4
4
|
attr_accessor :method_name
|
5
5
|
attr_accessor :block_description, :block
|
6
|
-
delegate :
|
6
|
+
delegate :resource, :to => :configuration
|
7
7
|
|
8
8
|
def initialize(configuration, position_in_run, method_name_or_block_description, &block)
|
9
9
|
@configuration = configuration
|
@@ -12,16 +12,16 @@ module DataMiner
|
|
12
12
|
@block_description = method_name_or_block_description
|
13
13
|
@block = block
|
14
14
|
else
|
15
|
-
@method_name =
|
15
|
+
@method_name = method_name_or_block_description
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
19
|
def inspect
|
20
|
-
str = "Process(#{
|
20
|
+
str = "Process(#{resource}) position #{position_in_run}"
|
21
21
|
if block
|
22
|
-
str << " called :#{method_name}"
|
23
|
-
else
|
24
22
|
str << " ran block (#{block_description})"
|
23
|
+
else
|
24
|
+
str << " called :#{method_name}"
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
@@ -29,7 +29,7 @@ module DataMiner
|
|
29
29
|
if block
|
30
30
|
block.call
|
31
31
|
else
|
32
|
-
|
32
|
+
resource.send method_name
|
33
33
|
end
|
34
34
|
DataMiner.logger.info "ran #{inspect}"
|
35
35
|
end
|
data/lib/data_miner/run.rb
CHANGED
data/test/data_miner_test.rb
CHANGED
@@ -876,25 +876,13 @@ class DataMinerTest < Test::Unit::TestCase
|
|
876
876
|
assert AutomobileVariant.first.fuel_efficiency_city.present?
|
877
877
|
end
|
878
878
|
|
879
|
-
# should "mine multiple classes in the correct order" do
|
880
|
-
# DataMiner.run
|
881
|
-
# uy = Country.find_by_iso_3166('UY')
|
882
|
-
# assert_equal 'Uruguay', uy.name
|
883
|
-
# end
|
884
|
-
|
885
|
-
should "have a target record for every class that is mined" do
|
886
|
-
DataMiner.run :class_names => %w{ Country }
|
887
|
-
assert DataMiner::Target.exists?(:name => 'Country')
|
888
|
-
assert_equal 1, DataMiner::Target.count(:conditions => {:name => 'country'})
|
889
|
-
end
|
890
|
-
|
891
879
|
should "keep a log when it does a run" do
|
892
880
|
approx_started_at = Time.now
|
893
|
-
DataMiner.run :
|
881
|
+
DataMiner.run :resource_names => %w{ Country }
|
894
882
|
approx_ended_at = Time.now
|
895
|
-
|
896
|
-
assert (
|
897
|
-
assert (
|
883
|
+
last_run = DataMiner::Run.first(:conditions => { :resource_name => 'Country' }, :order => 'id DESC')
|
884
|
+
assert (last_run.started_at - approx_started_at).abs < 5 # seconds
|
885
|
+
assert (last_run.ended_at - approx_ended_at).abs < 5 # seconds
|
898
886
|
end
|
899
887
|
|
900
888
|
should "request a re-import from scratch" do
|
@@ -902,31 +890,37 @@ class DataMinerTest < Test::Unit::TestCase
|
|
902
890
|
c.iso_3166 = 'JUNK'
|
903
891
|
c.save!
|
904
892
|
assert Country.exists?(:iso_3166 => 'JUNK')
|
905
|
-
DataMiner.run :
|
893
|
+
DataMiner.run :resource_names => %w{ Country }, :from_scratch => true
|
906
894
|
assert !Country.exists?(:iso_3166 => 'JUNK')
|
907
895
|
end
|
908
896
|
|
909
897
|
should "track how many times a row was touched" do
|
910
|
-
DataMiner.run :
|
898
|
+
DataMiner.run :resource_names => %w{ Country }, :from_scratch => true
|
899
|
+
assert_equal 1, Country.first.data_miner_touch_count
|
900
|
+
DataMiner.run :resource_names => %w{ Country }
|
911
901
|
assert_equal 1, Country.first.data_miner_touch_count
|
912
|
-
DataMiner.run :class_names => %w{ Country }
|
913
|
-
assert_equal 2, Country.first.data_miner_touch_count
|
914
902
|
end
|
915
903
|
|
916
904
|
should "keep track of what the last import run that touched a row was" do
|
917
|
-
DataMiner.run :
|
905
|
+
DataMiner.run :resource_names => %w{ Country }, :from_scratch => true
|
918
906
|
a = DataMiner::Run.last
|
919
907
|
assert_equal a, Country.first.data_miner_last_run
|
920
|
-
DataMiner.run :
|
908
|
+
DataMiner.run :resource_names => %w{ Country }
|
921
909
|
b = DataMiner::Run.last
|
922
910
|
assert a != b
|
923
|
-
assert_equal
|
911
|
+
assert_equal a, Country.first.data_miner_last_run
|
924
912
|
end
|
925
913
|
|
926
914
|
unless ENV['FAST'] == 'true'
|
927
915
|
should "import using a dictionary" do
|
928
|
-
DataMiner.run :
|
916
|
+
DataMiner.run :resource_names => %w{ ResidentialEnergyConsumptionSurveyResponse }
|
929
917
|
assert ResidentialEnergyConsumptionSurveyResponse.find(6).residence_class.starts_with?('Single-family detached house')
|
930
918
|
end
|
919
|
+
|
920
|
+
should "mine multiple classes in the correct order" do
|
921
|
+
DataMiner.run
|
922
|
+
uy = Country.find_by_iso_3166('UY')
|
923
|
+
assert_equal 'Uruguay', uy.name
|
924
|
+
end
|
931
925
|
end
|
932
926
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.7
|
4
|
+
version: 0.3.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seamus Abshere
|
@@ -108,7 +108,6 @@ files:
|
|
108
108
|
- lib/data_miner/import.rb
|
109
109
|
- lib/data_miner/process.rb
|
110
110
|
- lib/data_miner/run.rb
|
111
|
-
- lib/data_miner/target.rb
|
112
111
|
- lib/data_miner/william_james_cartesian_product.rb
|
113
112
|
- test/data_miner_test.rb
|
114
113
|
- test/test_helper.rb
|
data/lib/data_miner/target.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module DataMiner
|
2
|
-
class Target < ActiveRecord::Base
|
3
|
-
set_table_name 'data_miner_targets'
|
4
|
-
set_primary_key :name
|
5
|
-
has_many :runs, :class_name => '::DataMiner::Run', :foreign_key => 'data_miner_target_id'
|
6
|
-
|
7
|
-
def klass
|
8
|
-
name.constantize
|
9
|
-
end
|
10
|
-
|
11
|
-
def run(options = {})
|
12
|
-
klass.data_miner_config.run options
|
13
|
-
end
|
14
|
-
|
15
|
-
def included_in_list_of_targets
|
16
|
-
msg = "must have a data_miner block"
|
17
|
-
unless DataMiner.classes.include?(name.constantize)
|
18
|
-
errors.add :name, msg
|
19
|
-
end
|
20
|
-
rescue NameError
|
21
|
-
errors.add :name, msg
|
22
|
-
end
|
23
|
-
|
24
|
-
validate :included_in_list_of_targets
|
25
|
-
end
|
26
|
-
end
|