data_miner 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ test/test.sqlite3
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Brighter Planet
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,96 @@
1
+ =data_miner
2
+
3
+ Mine remote data into your ActiveRecord models.
4
+
5
+ ==Quick start
6
+
7
+ Put this in <tt>config/environment.rb</tt>:
8
+
9
+ config.gem 'seamusabshere-data_miner', :lib => 'data_miner', :source => 'http://gems.github.com'
10
+
11
+ You need to define <tt>mine_data</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
12
+
13
+ class Country < ActiveRecord::Base
14
+ mine_data do |step|
15
+ # import country names and country codes
16
+ step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
17
+ attr.key :iso_3166, :name_in_source => 'country code'
18
+ attr.store :iso_3166, :name_in_source => 'country code'
19
+ attr.store :name, :name_in_source => 'country'
20
+ end
21
+ end
22
+ end
23
+
24
+ ...and in <tt>app/models/airport.rb</tt>:
25
+
26
+ class Airport < ActiveRecord::Base
27
+ belongs_to :country
28
+
29
+ mine_data do |step|
30
+ # import airport iata_code, name, etc.
31
+ step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
32
+ attr.key :iata_code, :field_number => 3
33
+ attr.store :name, :field_number => 0
34
+ attr.store :city, :field_number => 1
35
+ attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
36
+ attr.store :iata_code, :field_number => 3
37
+ attr.store :latitude, :field_number => 5
38
+ attr.store :longitude, :field_number => 6
39
+ end
40
+ end
41
+ end
42
+
43
+ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't know a way to automatically include gem tasks, so you have to do this manually for now)
44
+
45
+ namespace :data_miner do
46
+ task :mine => :environment do
47
+ DataMiner.mine :class_names => ENV['CLASSES'].to_s.split(/\s*,\s*/).flatten.compact
48
+ end
49
+
50
+ task :map_to_attrs => :environment do
51
+ DataMiner.map_to_attrs ENV['METHOD'], :class_names => ENV['CLASSES'].to_s.split(/\s*,\s*/).flatten.compact
52
+ end
53
+ end
54
+
55
+ You need to specify the order in which data is mined. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
56
+
57
+ DataMiner.enqueue do |queue|
58
+ queue << Country # class whose data should be mined 1st
59
+ queue << Airport # class whose data should be mined 2nd
60
+ # etc
61
+ end
62
+
63
+ Once you have (1) set up the order of data mining and (2) defined <tt>mine_data</tt> blocks in your classes, you can:
64
+
65
+ $ rake data_miner:mine
66
+
67
+ ==Complete example
68
+
69
+ ~ $ rails testapp
70
+ ~ $ cd testapp/
71
+ ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_id:integer latitude:float longitude:float
72
+ ~/testapp $ ./script/generate model Country iso_3166:string name:string
73
+ ~/testapp $ rake db:migrate
74
+ ~/testapp $ touch lib/tasks/data_miner_tasks.rake
75
+ [...edit per quick start...]
76
+ ~/testapp $ touch config/initializers/data_miner_config.rb
77
+ [...edit per quick start...]
78
+ ~/testapp $ rake data_miner:mine
79
+
80
+ Now you should have
81
+
82
+ ~/testapp $ ./script/console
83
+ Loading development environment (Rails 2.3.3)
84
+ >> Airport.first.iata_code
85
+ => "GKA"
86
+ >> Airport.first.country.name
87
+ => "Papua New Guinea"
88
+
89
+ ==Authors
90
+
91
+ * Seamus Abshere <seamus@abshere.net>
92
+ * Andy Rossmeissl <andy@rossmeissl.net>
93
+
94
+ ==Copyright
95
+
96
+ Copyright (c) 2009 Brighter Planet. See LICENSE for details.
@@ -0,0 +1,65 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "data_miner"
8
+ gem.summary = %Q{Mine remote data into your ActiveRecord models.}
9
+ gem.description = %Q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
10
+ gem.email = "seamus@abshere.net"
11
+ gem.homepage = "http://github.com/seamusabshere/data_miner"
12
+ gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
13
+ %w{ activerecord activesupport andand remote_table seamusabshere-errata seamusabshere-conversions }.each { |name| gem.add_dependency name }
14
+ gem.require_path = "lib"
15
+ gem.files.include %w(lib/data_miner) unless gem.files.empty? # seems to fail once it's in the wild
16
+ gem.rdoc_options << '--line-numbers' << '--inline-source'
17
+ # gem.rubyforge_project = "dataminer"
18
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
19
+ end
20
+ Jeweler::GemcutterTasks.new
21
+ Jeweler::RubyforgeTasks.new do |rubyforge|
22
+ rubyforge.doc_task = "rdoc"
23
+ end
24
+ rescue LoadError
25
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
26
+ end
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new(:test) do |test|
30
+ test.libs << 'lib' << 'test'
31
+ test.pattern = 'test/**/*_test.rb'
32
+ test.verbose = true
33
+ end
34
+
35
+ begin
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/*_test.rb'
40
+ test.verbose = true
41
+ end
42
+ rescue LoadError
43
+ task :rcov do
44
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
45
+ end
46
+ end
47
+
48
+
49
+
50
+
51
+ task :default => :test
52
+
53
+ require 'rake/rdoctask'
54
+ Rake::RDocTask.new do |rdoc|
55
+ if File.exist?('VERSION')
56
+ version = File.read('VERSION')
57
+ else
58
+ version = ""
59
+ end
60
+
61
+ rdoc.rdoc_dir = 'rdoc'
62
+ rdoc.title = "data_miner #{version}"
63
+ rdoc.rdoc_files.include('README*')
64
+ rdoc.rdoc_files.include('lib/**/*.rb')
65
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.2
@@ -0,0 +1,81 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{data_miner}
8
+ s.version = "0.2.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
+ s.date = %q{2009-10-30}
13
+ s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
14
+ s.email = %q{seamus@abshere.net}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "data_miner.gemspec",
27
+ "lib/data_miner.rb",
28
+ "lib/data_miner/active_record_ext.rb",
29
+ "lib/data_miner/attribute.rb",
30
+ "lib/data_miner/attribute_collection.rb",
31
+ "lib/data_miner/configuration.rb",
32
+ "lib/data_miner/dictionary.rb",
33
+ "lib/data_miner/step.rb",
34
+ "lib/data_miner/step/associate.rb",
35
+ "lib/data_miner/step/await.rb",
36
+ "lib/data_miner/step/callback.rb",
37
+ "lib/data_miner/step/derive.rb",
38
+ "lib/data_miner/step/import.rb",
39
+ "lib/data_miner/william_james_cartesian_product.rb",
40
+ "test/data_miner_test.rb",
41
+ "test/test_helper.rb"
42
+ ]
43
+ s.homepage = %q{http://github.com/seamusabshere/data_miner}
44
+ s.rdoc_options = ["--charset=UTF-8", "--line-numbers", "--inline-source"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = %q{1.3.5}
47
+ s.summary = %q{Mine remote data into your ActiveRecord models.}
48
+ s.test_files = [
49
+ "test/data_miner_test.rb",
50
+ "test/test_helper.rb"
51
+ ]
52
+
53
+ if s.respond_to? :specification_version then
54
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
55
+ s.specification_version = 3
56
+
57
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
58
+ s.add_runtime_dependency(%q<activerecord>, [">= 0"])
59
+ s.add_runtime_dependency(%q<activesupport>, [">= 0"])
60
+ s.add_runtime_dependency(%q<andand>, [">= 0"])
61
+ s.add_runtime_dependency(%q<remote_table>, [">= 0"])
62
+ s.add_runtime_dependency(%q<seamusabshere-errata>, [">= 0"])
63
+ s.add_runtime_dependency(%q<seamusabshere-conversions>, [">= 0"])
64
+ else
65
+ s.add_dependency(%q<activerecord>, [">= 0"])
66
+ s.add_dependency(%q<activesupport>, [">= 0"])
67
+ s.add_dependency(%q<andand>, [">= 0"])
68
+ s.add_dependency(%q<remote_table>, [">= 0"])
69
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
70
+ s.add_dependency(%q<seamusabshere-conversions>, [">= 0"])
71
+ end
72
+ else
73
+ s.add_dependency(%q<activerecord>, [">= 0"])
74
+ s.add_dependency(%q<activesupport>, [">= 0"])
75
+ s.add_dependency(%q<andand>, [">= 0"])
76
+ s.add_dependency(%q<remote_table>, [">= 0"])
77
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
78
+ s.add_dependency(%q<seamusabshere-conversions>, [">= 0"])
79
+ end
80
+ end
81
+
@@ -0,0 +1,43 @@
1
+ require 'rubygems'
2
+ require 'activesupport'
3
+ require 'activerecord'
4
+ require 'conversions'
5
+ require 'remote_table'
6
+ require 'errata'
7
+
8
+ require 'data_miner/active_record_ext'
9
+ require 'data_miner/attribute'
10
+ require 'data_miner/attribute_collection'
11
+ require 'data_miner/configuration'
12
+ require 'data_miner/dictionary'
13
+ require 'data_miner/step'
14
+ require 'data_miner/step/associate'
15
+ require 'data_miner/step/await'
16
+ require 'data_miner/step/callback'
17
+ require 'data_miner/step/derive'
18
+ require 'data_miner/step/import'
19
+ require 'data_miner/william_james_cartesian_product' # TODO: move to gem
20
+
21
+ module DataMiner
22
+ class << self
23
+ def mine(options = {})
24
+ DataMiner::Configuration.mine options
25
+ end
26
+
27
+ def map_to_attrs(method, options = {})
28
+ puts DataMiner::Configuration.map_to_attrs(method, options)
29
+ end
30
+
31
+ def enqueue(&block)
32
+ DataMiner::Configuration.enqueue &block
33
+ end
34
+
35
+ def classes
36
+ DataMiner::Configuration.classes
37
+ end
38
+ end
39
+ end
40
+
41
+ ActiveRecord::Base.class_eval do
42
+ include DataMiner::ActiveRecordExt
43
+ end
@@ -0,0 +1,25 @@
1
+ module DataMiner
2
+ module ActiveRecordExt
3
+ def self.included(klass)
4
+ klass.extend(ClassMethods)
5
+ end
6
+
7
+ module ClassMethods
8
+ def mine_data(options = {}, &block)
9
+ if defined?(NO_DATA_MINER) and NO_DATA_MINER == true
10
+ class_eval do
11
+ class << self
12
+ def data_mine
13
+ raise "NO_DATA_MINER is set to true, so data_mine is not available"
14
+ end
15
+ end
16
+ end
17
+ else
18
+ class_eval { cattr_accessor :data_mine }
19
+ self.data_mine = Configuration.new(self)
20
+ yield data_mine
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,299 @@
1
+ module DataMiner
2
+ class Attribute
3
+ attr_accessor :klass, :name, :options_for_step, :affected_by_steps, :key_for_steps
4
+
5
+ def initialize(klass, name)
6
+ @klass = klass
7
+ @name = name.to_sym
8
+ @options_for_step = {}
9
+ @affected_by_steps = []
10
+ @key_for_steps = []
11
+ end
12
+
13
+ # polling questions
14
+ def report_find_or_create(step)
15
+ "Creates parents: #{klass}##{name} is set with #{reflection_klass(step)}.find_or_create_by_#{foreign_key(step)}" if wants_create?(step)
16
+ end
17
+
18
+ def report_unnatural_order(step)
19
+ if (
20
+ (rk = klass.reflect_on_association(weighting_association(step)).andand.klass) or
21
+ (wants_inline_association? and rk = reflection_klass(step))
22
+ ) and
23
+ step.configuration.classes.index(rk) > step.configuration.classes.index(klass) and
24
+ step.options[:awaiting].andand.klass != klass
25
+ "Unnatural order: #{klass} comes before #{rk}"
26
+ end
27
+ end
28
+
29
+ def inspect
30
+ "Attribute(#{klass}.#{name})"
31
+ end
32
+
33
+ def affected_by!(step, options = {})
34
+ self.options_for_step[step] = options
35
+ self.affected_by_steps << step
36
+ end
37
+
38
+ def affected_by?(step)
39
+ affected_by_steps.include?(step)
40
+ end
41
+
42
+ def key_for!(step, options = {})
43
+ self.options_for_step[step] = options
44
+ self.key_for_steps << step
45
+ end
46
+
47
+ def key_for?(step)
48
+ key_for_steps.include?(step)
49
+ end
50
+
51
+ def value_in_dictionary(step, key)
52
+ return *dictionary(step).lookup(key) # strip the array wrapper if there's only one element
53
+ end
54
+
# Reads this attribute's raw value for +step+ out of +row+ and cleans it up.
#
# The raw value is chosen in this order:
# 1. the :static option, when one was given for the step;
# 2. the cell(s) at :field_number (a Range of cells is joined with the
#    step's delimiter);
# 3. the cell named by name_in_source(step).
#
# Returns nil when the raw value is nil, and returns ActiveRecord::Base
# instances untouched. Anything else is stringified, optionally sliced
# (:keep) and split (:split), has runs of spaces collapsed, unescaped "~"
# turned into spaces and surrounding whitespace stripped, and is then
# optionally upcased, unit-converted and sprintf-formatted.
#
# The cleanup deliberately uses non-mutating String methods: +to_s+ returns
# the very same object for a String cell, so in-place edits would rewrite the
# source row (and any :static option value) and raise on frozen strings.
def value_in_source(step, row)
  if wants_static?(step)
    value = static(step)
  elsif (position = field_number(step))
    if position.is_a?(Range)
      value = position.map { |n| row[n] }.join(delimiter(step))
    else
      value = row[position]
    end
  else
    value = row[name_in_source(step)]
  end
  return nil if value.nil?
  return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly

  value = value.to_s
  value = value[keep(step)] if wants_keep?(step)
  value = do_split(step, value) if wants_split?(step)
  # taken from old errata... maybe we want to do this here
  value = value.gsub(/[ ]+/, ' ')
  value = value.gsub(/([^\\])~/, '\1 ')
  value = value.strip
  value = value.upcase if wants_upcase?(step)
  value = do_convert(step, row, value) if wants_conversion?(step)
  value = do_sprintf(step, value) if wants_sprintf?(step)
  value
end
82
+
83
+ def value_from_row(step, row)
84
+ value = value_in_source(step, row)
85
+ return value if value.is_a?(ActiveRecord::Base) # carry through trapdoor
86
+ value = value_in_dictionary(step, value) if wants_dictionary?(step)
87
+ value = value_as_association(step, value) if wants_inline_association?
88
+ value
89
+ end
90
+
91
+ def value_as_association(step, value)
92
+ @_value_as_association ||= {}
93
+ @_value_as_association[step] ||= {}
94
+ if !@_value_as_association[step].has_key?(value)
95
+ dynamic_matcher = wants_create?(step) ? "find_or_create_by_#{foreign_key(step)}" : "find_by_#{foreign_key(step)}"
96
+ @_value_as_association[step][value] = reflection_klass(step).send(dynamic_matcher, value)
97
+ end
98
+ @_value_as_association[step][value]
99
+ end
100
+
101
+ # this will overwrite nils, even if wants_overwriting?(step) is false
102
+ def set_record_from_row(step, record, row)
103
+ return if !wants_overwriting?(step) and !record.send(name).nil?
104
+ value = value_from_row(step, row)
105
+ record.send "#{name}=", value
106
+ $stderr.puts("ActiveRecord didn't like trying to set #{klass}.#{name} = #{value}") if !value.nil? and record.send(name).nil?
107
+ end
108
+
109
+ def perform(step)
110
+ case step.variant
111
+ when :associate
112
+ perform_association(step)
113
+ when :derive
114
+ if wants_update_all?(step)
115
+ perform_update_all(step)
116
+ elsif wants_weighted_average?(step)
117
+ perform_weighted_average(step)
118
+ else
119
+ perform_callback(step)
120
+ end
121
+ when :import
122
+ raise "This shouldn't be called, the import step is special"
123
+ end
124
+ end
125
+
126
+ def perform_association(step)
127
+ raise "dictionary and prefix don't mix" if wants_dictionary?(step) and wants_prefix?(step)
128
+ klass.update_all("#{reflection.primary_key_name} = NULL") if wants_nullification?(step)
129
+ if wants_create?(step)
130
+ klass.find_in_batches do |batch|
131
+ batch.each do |record|
132
+ if wants_prefix?(step)
133
+ sql = "SELECT reflection_table.id FROM #{reflection_klass(step).quoted_table_name} AS reflection_table INNER JOIN #{klass.quoted_table_name} AS klass_table ON LEFT(klass_table.#{key(step)}, LENGTH(reflection_table.#{foreign_key(step)})) = reflection_table.#{foreign_key(step)} WHERE klass_table.id = #{record.id} ORDER BY LENGTH(reflection_table.#{foreign_key(step)}) DESC"
134
+ associated_id = ActiveRecord::Base.connection.select_value(sql)
135
+ next if associated_id.blank?
136
+ record.send("#{reflection.primary_key_name}=", associated_id)
137
+ else
138
+ dynamic_finder_value = record.send(key(step))
139
+ dynamic_finder_value = value_in_dictionary(step, dynamic_finder_value) if wants_dictionary?(step)
140
+ next if dynamic_finder_value.blank?
141
+ associated = reflection_klass(step).send("find_or_create_by_#{foreign_key(step)}", dynamic_finder_value) # TODO cache results
142
+ record.send("#{name}=", associated)
143
+ end
144
+ record.save
145
+ end
146
+ end
147
+ else
148
+ reflection_klass(step).find_in_batches do |batch|
149
+ batch.each do |reflection_record|
150
+ klass.update_all ["#{reflection.primary_key_name} = ?", reflection_record.id], ["#{key(step)} = ?", reflection_record.send(foreign_key(step))]
151
+ end
152
+ end
153
+ end
154
+ end
155
+
156
+ def perform_update_all(step)
157
+ klass.update_all("#{name} = #{set(step)}", conditions(step))
158
+ end
159
+
160
+ def perform_weighted_average(step)
161
+ # handle weighting by scopes instead of associations
162
+ if weighting_association(step) and !klass.reflect_on_association(weighting_association(step))
163
+ klass.find_in_batches do |batch|
164
+ batch.each do |record|
165
+ record.send "#{name}=", record.send(weighting_association(step)).weighted_average(name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step))
166
+ record.save
167
+ end
168
+ end
169
+ else # there's no weighting association OR there is one and it's a valid association
170
+ klass.update_all_weighted_averages name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step), :association => weighting_association(step)
171
+ end
172
+ end
173
+
# Runs the class-level callback configured for this attribute on +klass+.
#
# The callback name comes from callback(step). The arguments passed depend on
# the arity of the target method:
# * arity 0 - called with no arguments
# * arity 1 - called with the attribute name
# * arity 2 - called with the attribute name and the options stored for +step+
#
# Any other arity matches no branch, so nothing is called and nil is
# returned. That includes the negative arities Ruby reports for methods with
# optional or splat parameters.
def perform_callback(step)
  callback_name = callback(step)
  case klass.method(callback_name).arity
  when 0
    klass.send(callback_name)
  when 1
    klass.send(callback_name, name)
  when 2
    klass.send(callback_name, name, options_for_step[step])
  end
end
184
+
185
+ def unit_from_source(step, row)
186
+ row[unit_in_source(step)].to_s.strip.underscore.to_sym
187
+ end
188
+
189
+ def do_convert(step, row, value)
190
+ from_unit = from(step) || unit_from_source(step, row)
191
+ value.to_f.convert(from_unit, to(step))
192
+ end
193
+
194
+ def do_sprintf(step, value)
195
+ if /\%[0-9\.]*f/.match(sprintf(step))
196
+ value = value.to_f
197
+ elsif /\%[0-9\.]*d/.match(sprintf(step))
198
+ value = value.to_i
199
+ end
200
+ sprintf(step) % value
201
+ end
202
+
203
+ def do_split(step, value)
204
+ pattern = split_options(step)[:pattern] || /\s+/ # default is split on whitespace
205
+ keep = split_options(step)[:keep] || 0 # default is keep first element
206
+ value.to_s.split(pattern)[keep].to_s
207
+ end
208
+
209
+ def column_type
210
+ @column_type ||= klass.columns_hash[name.to_s].type
211
+ end
212
+
213
+ {
214
+ :static => 'options_for_step[step].has_key?(:static)',
215
+ :prefix => :prefix,
216
+ :create => :create,
217
+ :keep => :keep,
218
+ :upcase => :upcase,
219
+ :conversion => '!from(step).nil? or !unit_in_source(step).nil?',
220
+ :sprintf => :sprintf,
221
+ :dictionary => :dictionary_options,
222
+ :split => :split_options,
223
+ :update_all => :set,
224
+ :nullification => 'nullify(step) != false',
225
+ :overwriting => 'overwrite(step) != false',
226
+ :weighted_average => '!weighting_association(step).nil? or !weighting_column(step).nil?'
227
+ }.each do |name, condition|
228
+ condition = "!#{condition}(step).nil?" if condition.is_a?(Symbol)
229
+ eval <<-EOS
230
+ def wants_#{name}?(step)
231
+ #{condition}
232
+ end
233
+ EOS
234
+ end
235
+
236
+ {
237
+ :name_in_source => { :default => :name, :stringify => true },
238
+ :key => { :default => :name, :stringify => true },
239
+ :foreign_key => { :default => 'key(step)', :stringify => true },
240
+ :delimiter => { :default => '", "' }
241
+ }.each do |name, options|
242
+ eval <<-EOS
243
+ def #{name}(step)
244
+ (options_for_step[step][:#{name}] || #{options[:default]})#{'.to_s' if options[:stringify]}
245
+ end
246
+ EOS
247
+ end
248
+
249
+ def reflection
250
+ if @_reflection.nil?
251
+ @_reflection = klass.reflect_on_association(name) || :missing
252
+ reflection
253
+ elsif @_reflection == :missing
254
+ nil
255
+ else
256
+ @_reflection
257
+ end
258
+ end
259
+
260
+ def reflection_klass(step)
261
+ return nil unless reflection
262
+ if reflection.options[:polymorphic]
263
+ polymorphic_type(step).andand.constantize
264
+ else
265
+ reflection.klass
266
+ end
267
+ end
268
+
269
+ def wants_inline_association?
270
+ reflection.present?
271
+ end
272
+
273
+ def callback(step)
274
+ (options_for_step[step][:callback] || "derive_#{name}").to_sym
275
+ end
276
+
277
+ def dictionary(step)
278
+ raise "shouldn't ask for this" unless wants_dictionary?(step) # don't try to initialize if there are no dictionary options
279
+ @dictionaries ||= {}
280
+ @dictionaries[step] ||= Dictionary.new(dictionary_options(step))
281
+ end
282
+
283
+ %w(dictionary split).each do |name|
284
+ eval <<-EOS
285
+ def #{name}_options(step)
286
+ options_for_step[step][:#{name}]
287
+ end
288
+ EOS
289
+ end
290
+
291
+ %w(from to set conditions weighting_association weighting_column weighting_disaggregator sprintf nullify overwrite upcase prefix unit_in_source field_number keep create static polymorphic_type).each do |name|
292
+ eval <<-EOS
293
+ def #{name}(step)
294
+ options_for_step[step][:#{name}]
295
+ end
296
+ EOS
297
+ end
298
+ end
299
+ end
@@ -0,0 +1,51 @@
1
+ module DataMiner
2
+ class AttributeCollection
3
+ attr_accessor :klass, :attributes
4
+
5
+ def initialize(klass)
6
+ @klass = klass
7
+ @attributes = {}
8
+ end
9
+
10
+ def key!(step, attr_name, attr_options = {})
11
+ find_or_initialize(attr_name).key_for!(step, attr_options)
12
+ end
13
+
14
+ def affect!(step, attr_name, attr_options = {})
15
+ find_or_initialize(attr_name).affected_by!(step, attr_options)
16
+ end
17
+
18
+ def affect_all_content_columns!(step, options = {})
19
+ except = Array.wrap(options[:except]).map(&:to_sym)
20
+ step.klass.content_columns.map(&:name).reject { |content_column| except.include?(content_column.to_sym) }.each do |content_column|
21
+ find_or_initialize(content_column).affected_by!(step)
22
+ end
23
+ end
24
+
25
+ def all_affected_by(step)
26
+ attributes.values.select { |attr| attr.affected_by?(step) }
27
+ end
28
+
29
+ def all_keys_for(step)
30
+ attributes.values.select { |attr| attr.key_for?(step) }
31
+ end
32
+
33
+ def all_for(step)
34
+ (all_affected_by(step) + all_keys_for(step)).uniq
35
+ end
36
+
37
+ def has_keys_for?(step)
38
+ attributes.values.any? { |attr| attr.key_for?(step) }
39
+ end
40
+
41
+ def has_conditional_writes_for?(step)
42
+ all_affected_by(step).any? { |attr| !attr.wants_overwriting?(step) }
43
+ end
44
+
45
+ private
46
+
47
+ def find_or_initialize(attr_name)
48
+ self.attributes[attr_name] ||= Attribute.new(klass, attr_name)
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,94 @@
1
+ module DataMiner
2
+ class Configuration
3
+ attr_accessor :steps, :klass, :counter, :attributes, :awaiting
4
+
5
+ def initialize(klass)
6
+ @steps = []
7
+ @klass = klass
8
+ @counter = 0
9
+ @attributes = AttributeCollection.new(klass)
10
+ end
11
+
12
+ %w(import associate derive await).each do |method|
13
+ eval <<-EOS
14
+ def #{method}(*args, &block)
15
+ self.counter += 1
16
+ if block_given? # FORM C
17
+ step_options = args[0] || {}
18
+ set_awaiting!(step_options)
19
+ self.steps << Step::#{method.camelcase}.new(self, counter, step_options, &block)
20
+ elsif args[0].is_a?(Hash) # FORM A
21
+ step_options = args[0]
22
+ set_awaiting!(step_options)
23
+ self.steps << Step::#{method.camelcase}.new(self, counter, step_options)
24
+ else # FORM B
25
+ attr_name = args[0]
26
+ attr_options = args[1] || {}
27
+ step_options = {}
28
+ set_awaiting!(step_options)
29
+ self.steps << Step::#{method.camelcase}.new(self, counter, step_options) do |attr|
30
+ attr.affect attr_name, attr_options
31
+ end
32
+ end
33
+ end
34
+ EOS
35
+ end
36
+
37
+ def set_awaiting!(step_options)
38
+ step_options.merge!(:awaiting => awaiting) if !awaiting.nil?
39
+ end
40
+
41
+ def awaiting!(step)
42
+ self.awaiting = step
43
+ end
44
+
45
+ def stop_awaiting!
46
+ self.awaiting = nil
47
+ end
48
+
49
+ # Mine data for this class.
50
+ def mine(options = {})
51
+ steps.each { |step| step.perform options }
52
+ end
53
+
54
+ # Map <tt>method</tt> to attributes
55
+ def map_to_attrs(method)
56
+ steps.map { |step| step.map_to_attrs(method) }.compact
57
+ end
58
+
59
+ cattr_accessor :classes
60
+ self.classes = []
61
+ class << self
62
+ # Mine data. Defaults to all classes touched by DataMiner.
63
+ #
64
+ # Options
65
+ # * <tt>:class_names</tt>: provide an array of class names to mine
66
+ def mine(options = {})
67
+ classes.each do |klass|
68
+ if options[:class_names].blank? or options[:class_names].include?(klass.name)
69
+ klass.data_mine.mine options
70
+ end
71
+ end
72
+ end
73
+
74
+ # Map a <tt>method</tt> to attrs. Defaults to all classes touched by DataMiner.
75
+ #
76
+ # Options
77
+ # * <tt>:class_names</tt>: provide an array of class names to map
78
+ def map_to_attrs(method, options = {})
79
+ classes.map do |klass|
80
+ if options[:class_names].blank? or options[:class_names].include?(klass.name)
81
+ klass.data_mine.map_to_attrs method
82
+ end
83
+ end.flatten.compact
84
+ end
85
+
86
+ # Queue up all the ActiveRecord classes that DataMiner should touch.
87
+ #
88
+ # Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
89
+ def enqueue(&block)
90
+ yield self.classes
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,36 @@
module DataMiner
  # Translates a key into one or more values by scanning the rows of a
  # remote table.
  #
  # Options accepted by +new+:
  # * <tt>:key</tt>     - name of the column to match against
  # * <tt>:returns</tt> - name of the column whose value is returned
  # * <tt>:sprintf</tt> - format applied to both sides before comparing (default '%s')
  # * <tt>:url</tt>     - passed to RemoteTable to load the table
  class Dictionary
    attr_accessor :key_name, :value_name, :sprintf, :table

    def initialize(options = {})
      @key_name = options[:key]
      @value_name = options[:returns]
      @sprintf = options[:sprintf] || '%s'
      @table = RemoteTable.new(:url => options[:url])
    end

    # Looks +key+ up using the column names and format given at construction.
    # Returns an array of values, or nil when no row matches.
    def lookup(key)
      find(self.key_name, key, self.value_name, :sprintf => self.sprintf)
    end

    # Finds the first row whose +key_name+ cell equals +key+ after both have
    # been normalized, and returns that row's +value_name+ cell split on ";"
    # followed by one whitespace character. Returns nil when nothing matches.
    #
    # Options
    # * <tt>:sprintf</tt>: format applied to both sides before comparing
    def find(key_name, key, value_name, options = {})
      # The key's normalized form is the same for every row, so compute it once.
      wanted = normalize_for_comparison(key, options)
      match = table.rows.detect { |row| normalize_for_comparison(row[key_name], options) == wanted }
      # NOTE(review): the pattern needs whitespace after ";" so "a;b" stays whole - confirm intended.
      match[value_name].to_s.split(/\s*;\s/) if match
    end

    private

    # Formats +string+ with options[:sprintf] (coercing to Float or Integer
    # first when the format contains a %f or %d directive) and strips
    # surrounding whitespace. Without a :sprintf option only the
    # stringify-and-strip step is applied.
    def normalize_for_comparison(string, options = {})
      format = options[:sprintf]
      if format
        if /\%[0-9\.]*f/.match(format)
          string = string.to_f
        elsif /\%[0-9\.]*d/.match(format)
          string = string.to_i
        end
        # Use the format that was passed in, not the instance default, so the
        # coercion above and the formatting here always agree.
        string = format % string
      end
      string.to_s.strip
    end
  end
end
@@ -0,0 +1,64 @@
1
+ module DataMiner
2
+ class Step
3
+ attr_accessor :configuration, :number, :options
4
+ delegate :klass, :to => :configuration
5
+ delegate :attributes, :to => :configuration
6
+
7
+ def initialize(configuration, number, options = {}, &block)
8
+ @configuration = configuration
9
+ @number = number
10
+ @options = options
11
+ yield self if block_given? # pull in attributes
12
+ attributes.affect_all_content_columns!(self, :except => options[:except]) if options[:affect_all] == :content_columns
13
+ affected_attributes.each { |attr| attr.options_for_step[self][:callback] = options[:callback] } if options[:callback]
14
+ all_attributes.each { |attr| attr.options_for_step[self][:name_in_source] = attr.name_in_source(self).upcase } if options[:headers] == :upcase # TODO remove
15
+ end
16
+
17
+ def variant
18
+ self.class.name.demodulize.underscore.to_sym
19
+ end
20
+
21
+ def awaiting?
22
+ !options[:awaiting].nil?
23
+ end
24
+
25
+ def inspect
26
+ "Step(#{klass} #{variant.to_s.camelcase} #{number})"
27
+ end
28
+
29
+ def signature
30
+ "#{klass} step #{number}: #{variant}"
31
+ end
32
+
33
+ def perform(options = {})
34
+ return if awaiting? and !options[:force]
35
+ affected_attributes.each { |attr| attr.perform self }
36
+ $stderr.puts "performed #{signature}"
37
+ end
38
+
39
+ def affected_attributes
40
+ @affected_attributes ||= attributes.all_affected_by self
41
+ end
42
+
43
+ def key_attributes
44
+ @key_attributes ||= attributes.all_keys_for self
45
+ end
46
+
47
+ def all_attributes
48
+ @all_attributes ||= attributes.all_for self
49
+ end
50
+
51
+ def key(attr_name, attr_options = {})
52
+ attributes.key! self, attr_name, attr_options
53
+ end
54
+
55
+ def affect(attr_name, attr_options = {})
56
+ attributes.affect! self, attr_name, attr_options
57
+ end
58
+ alias_method :store, :affect
59
+
60
+ def map_to_attrs(method)
61
+ affected_attributes.map { |attr| attr.send method, self }.compact
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,9 @@
1
+ module DataMiner
2
+ class Step
3
+ class Associate < Step
4
+ def signature
5
+ "#{super} #{affected_attributes.first.name}"
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,35 @@
1
+ module DataMiner
2
+ class Step
3
+ class Await < Step
4
+ attr_accessor :other_class
5
+
6
+ def initialize(configuration, number, options = {}, &block)
7
+ # doesn't call super
8
+ @configuration = configuration
9
+ @number = number
10
+ @options = options
11
+ @other_class = options.delete :other_class
12
+ configuration.awaiting! self
13
+ yield configuration # pull in steps
14
+ configuration.stop_awaiting!
15
+ end
16
+
17
+ def perform(*args)
18
+ other_class.data_mine.steps << Step::Callback.new(other_class.data_mine, self)
19
+ $stderr.puts "added #{signature} to callbacks after #{other_class}"
20
+ end
21
+
22
+ def callback
23
+ $stderr.puts "starting to perform deferred steps in #{signature}..."
24
+ all_awaiting.each { |step| step.perform :force => true }
25
+ $stderr.puts "...done"
26
+ end
27
+
28
+ private
29
+
30
+ def all_awaiting
31
+ configuration.steps.select { |step| step.options and step.options[:awaiting] == self }
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,22 @@
1
+ module DataMiner
2
+ class Step
3
+ class Callback < Step
4
+ attr_accessor :foreign_step
5
+
6
+ def initialize(configuration, foreign_step)
7
+ @configuration = configuration
8
+ @foreign_step = foreign_step
9
+ @number = "(last)"
10
+ end
11
+
12
+ def perform(*args)
13
+ foreign_step.callback
14
+ $stderr.puts "performed #{signature}"
15
+ end
16
+
17
+ def signature
18
+ "#{super} (on behalf of #{foreign_step.signature})"
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,9 @@
1
+ module DataMiner
2
+ class Step
3
+ class Derive < Step
4
+ def signature
5
+ "#{super} #{affected_attributes.first.name}"
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,57 @@
1
module DataMiner
  class Step
    # Step that reads rows from a RemoteTable and writes them into records of
    # the step's model class (+klass+).
    class Import < Step
      attr_accessor :table, :errata

      # Calls the parent initializer, then builds:
      # * an Errata (only when options[:errata] is given), and
      # * the RemoteTable, from the table-related subset of +options+.
      def initialize(configuration, number, options = {}, &block)
        super
        @errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
        @table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
      end

      # The parent signature followed by the source URL.
      def signature
        "#{super} #{options[:url]}"
      end

      # Runs the import:
      # 1. truncates the model's table when #wants_truncate? says so;
      # 2. for every table row, skips it if the errata rejects it, otherwise
      #    lets the errata correct it in place;
      # 3. applies every affected attribute to each target record and saves.
      # Logs to $stderr when finished.
      def perform(*args)
        # Use the model's own connection: the table name is quoted by, and the
        # table lives on, that connection rather than necessarily the base one.
        klass.connection.execute("TRUNCATE #{klass.quoted_table_name}") if wants_truncate?
        table.each_row do |row|
          if errata
            next if errata.rejects?(row)
            errata.correct!(row)
          end
          records_for(row).each do |record|
            affected_attributes.each { |attr| attr.set_record_from_row(self, record, row) }
            # NOTE(review): the result of save is ignored, so records that
            # fail to save are dropped silently.
            record.save
          end
        end
        $stderr.puts "performed #{signature}"
      end

      # True when options[:truncate] is exactly true, or when it is not
      # exactly false and the step does not use existing data.
      def wants_truncate?
        options[:truncate] == true || (options[:truncate] != false && !uses_existing_data?)
      end

      # True only when options[:allow_nil_keys] is exactly true.
      def wants_nil_keys?
        options[:allow_nil_keys] == true
      end

      # Whether this step has key attributes or conditional writes. A truthy
      # answer is memoized. The parentheses matter: without them the
      # low-precedence `or` left the second check outside the memoized
      # expression, so it was re-evaluated on every call.
      def uses_existing_data?
        @uses_existing_data ||= (attributes.has_keys_for?(self) || attributes.has_conditional_writes_for?(self))
      end

      # Name of the dynamic finder built from the key attribute names, e.g.
      # :find_or_initialize_by_a_and_b for keys a and b.
      def dynamic_finder_name
        "find_or_initialize_by_#{key_attributes.map(&:name).join('_and_')}".to_sym
      end

      private

      # Returns the array of records that +row+ should be written to: a single
      # new record when the step does not use existing data, otherwise the
      # records returned by the dynamic finder for the row's key values.
      # Key combinations containing nil are skipped unless #wants_nil_keys?;
      # the skipped (nil) entries are compacted away so callers never receive
      # nil in place of a record.
      def records_for(row)
        return [klass.new] unless uses_existing_data?

        key_values = key_attributes.map { |key_attr| [key_attr.value_from_row(self, row)] }
        found = WilliamJamesCartesianProduct.cart_prod(*key_values).map do |combination|
          next if combination.include?(nil) && !wants_nil_keys?
          klass.send(dynamic_finder_name, *combination)
        end
        found.flatten.compact
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # http://www.ruby-forum.com/topic/95519#200484
2
+
3
module WilliamJamesCartesianProduct
  # Cartesian product of the given lists. Each result is an array holding one
  # element per input list, in argument order. Combinations are ordered so
  # that earlier lists vary fastest. With no arguments the result is [[]].
  def self.cart_prod(*args)
    args.inject([[]]) do |partials, list|
      extended = []
      list.each do |element|
        partials.each { |partial| extended << (partial + [element]) }
      end
      extended
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'test_helper'
2
+
3
# Test schema: (re)creates the airports and countries tables.
ActiveRecord::Schema.define(:version => 20090819143429) do
  create_table("airports", :force => true) do |airports|
    airports.string   "iata_code", "name", "city"
    airports.integer  "country_id"
    airports.float    "latitude", "longitude"
    airports.datetime "created_at", "updated_at"
  end

  create_table("countries", :force => true) do |countries|
    countries.string   "iso_3166", "name"
    countries.datetime "created_at", "updated_at"
  end
end
21
+
22
# Test model mined from a remote CSV of ISO 3166 country codes.
class Country < ActiveRecord::Base
  mine_data do |miner|
    # import country names and country codes
    miner.import(:url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv') do |column|
      column.key(:iso_3166, :name_in_source => 'country code')
      column.store(:iso_3166, :name_in_source => 'country code')
      column.store(:name, :name_in_source => 'country')
    end
  end
end
32
+
33
# Test model mined from a remote, header-less airport listing; columns are
# addressed by field number.
class Airport < ActiveRecord::Base
  belongs_to :country

  mine_data do |miner|
    # import airport iata_code, name, etc.
    source = 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat'
    miner.import(:url => source, :headers => false) do |column|
      column.key(:iata_code, :field_number => 3)
      column.store(:name, :field_number => 0)
      column.store(:city, :field_number => 1)
      # will use Country.find_by_name(X)
      column.store(:country, :field_number => 2, :foreign_key => :name)
      column.store(:iata_code, :field_number => 3)
      column.store(:latitude, :field_number => 5)
      column.store(:longitude, :field_number => 6)
    end
  end
end
48
+
49
# Queue the models for mining, Country first and Airport second.
DataMiner.enqueue do |queue|
  [Country, Airport].each { |model| queue << model }
end
53
+
54
# End-to-end tests that mine the Country and Airport models from their
# remote sources.
class DataMinerTest < Test::Unit::TestCase
  # Remove all airports, then all countries, after each test.
  def teardown
    [Airport, Country].each { |model| model.delete_all }
  end

  should "mine a single class" do
    Country.data_mine.mine
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal('Uruguay', uruguay.name)
    assert_equal(0, Airport.count)
  end

  should "mine a single class using the API" do
    DataMiner.mine(:class_names => ['Country'])
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal('Uruguay', uruguay.name)
    assert_equal(0, Airport.count)
  end

  should "mine all classes" do
    DataMiner.mine
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal('Uruguay', uruguay.name)
    montevideo = Airport.find_by_iata_code('MVD')
    assert_equal(uruguay, montevideo.country)
  end
end
@@ -0,0 +1,16 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'sqlite3'
5
+
6
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
7
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
+ require 'data_miner'
9
+
10
# Connect to a SQLite database stored next to this helper file. The path is
# resolved relative to this file rather than the current working directory so
# the tests use the same database no matter where they are launched from.
ActiveRecord::Base.establish_connection(
  'adapter' => 'sqlite3',
  'database' => File.expand_path('test.sqlite3', File.dirname(__FILE__))
)

# Reopened as a place for shared test helpers; currently adds nothing.
class Test::Unit::TestCase
end
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_miner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.2
5
+ platform: ruby
6
+ authors:
7
+ - Seamus Abshere
8
+ - Andy Rossmeissl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-10-30 00:00:00 -04:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: activerecord
18
+ type: :runtime
19
+ version_requirement:
20
+ version_requirements: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ version:
26
+ - !ruby/object:Gem::Dependency
27
+ name: activesupport
28
+ type: :runtime
29
+ version_requirement:
30
+ version_requirements: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ version:
36
+ - !ruby/object:Gem::Dependency
37
+ name: andand
38
+ type: :runtime
39
+ version_requirement:
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ - !ruby/object:Gem::Dependency
47
+ name: remote_table
48
+ type: :runtime
49
+ version_requirement:
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ - !ruby/object:Gem::Dependency
57
+ name: seamusabshere-errata
58
+ type: :runtime
59
+ version_requirement:
60
+ version_requirements: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ - !ruby/object:Gem::Dependency
67
+ name: seamusabshere-conversions
68
+ type: :runtime
69
+ version_requirement:
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: "0"
75
+ version:
76
+ description: Mine remote data into your ActiveRecord models. You can also perform associations and convert units.
77
+ email: seamus@abshere.net
78
+ executables: []
79
+
80
+ extensions: []
81
+
82
+ extra_rdoc_files:
83
+ - LICENSE
84
+ - README.rdoc
85
+ files:
86
+ - .document
87
+ - .gitignore
88
+ - LICENSE
89
+ - README.rdoc
90
+ - Rakefile
91
+ - VERSION
92
+ - data_miner.gemspec
93
+ - lib/data_miner.rb
94
+ - lib/data_miner/active_record_ext.rb
95
+ - lib/data_miner/attribute.rb
96
+ - lib/data_miner/attribute_collection.rb
97
+ - lib/data_miner/configuration.rb
98
+ - lib/data_miner/dictionary.rb
99
+ - lib/data_miner/step.rb
100
+ - lib/data_miner/step/associate.rb
101
+ - lib/data_miner/step/await.rb
102
+ - lib/data_miner/step/callback.rb
103
+ - lib/data_miner/step/derive.rb
104
+ - lib/data_miner/step/import.rb
105
+ - lib/data_miner/william_james_cartesian_product.rb
106
+ - test/data_miner_test.rb
107
+ - test/test_helper.rb
108
+ has_rdoc: true
109
+ homepage: http://github.com/seamusabshere/data_miner
110
+ licenses: []
111
+
112
+ post_install_message:
113
+ rdoc_options:
114
+ - --charset=UTF-8
115
+ - --line-numbers
116
+ - --inline-source
117
+ require_paths:
118
+ - lib
119
+ required_ruby_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: "0"
124
+ version:
125
+ required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: "0"
130
+ version:
131
+ requirements: []
132
+
133
+ rubyforge_project:
134
+ rubygems_version: 1.3.5
135
+ signing_key:
136
+ specification_version: 3
137
+ summary: Mine remote data into your ActiveRecord models.
138
+ test_files:
139
+ - test/data_miner_test.rb
140
+ - test/test_helper.rb