seamusabshere-data_miner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ test/test.sqlite3
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Brighter Planet
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,87 @@
1
+ =data_miner
2
+
3
+ Mine remote data into your ActiveRecord models.
4
+
5
+ ==Quick start
6
+
7
+ Put this in <tt>config/environment.rb</tt>:
8
+
9
+ config.gem 'seamusabshere-data_miner', :lib => 'data_miner', :source => 'http://gems.github.com'
10
+
11
+ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't know a way to automatically include gem tasks, so you have to do this manually for now)
12
+
13
+ namespace :data_miner do
14
+ task :mine => :environment do
15
+ DataMiner.mine :class_names => ENV['CLASSES'].to_s.split(/\s*,\s*/).flatten.compact
16
+ end
17
+ end
18
+
19
+ You need to specify what order to mine data. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
20
+
21
+ DataMiner.enqueue do |queue|
22
+ queue << Country # class whose data should be mined 1st
23
+ queue << Airport # class whose data should be mined 2nd
24
+ # etc
25
+ end
26
+
27
+ You need to define <tt>mine_data</tt> blocks. For example, in <tt>app/models/country.rb</tt>:
28
+
29
+ class Country < ActiveRecord::Base
30
+ mine_data do |step|
31
+ # import country names and country codes
32
+ step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
33
+ attr.key :iso_3166, :name_in_source => 'country code'
34
+ attr.store :iso_3166, :name_in_source => 'country code'
35
+ attr.store :name, :name_in_source => 'country'
36
+ end
37
+ end
38
+ end
39
+
40
+ To complete the example, in <tt>app/models/airport.rb</tt>:
41
+
42
+ class Airport < ActiveRecord::Base
43
+ belongs_to :country
44
+
45
+ mine_data do |step|
46
+ # import airport iata_code, name, etc.
47
+ step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
48
+ attr.key :iata_code, :field_number => 3
49
+ attr.store :name, :field_number => 0
50
+ attr.store :city, :field_number => 1
51
+ attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
52
+ attr.store :iata_code, :field_number => 3
53
+ attr.store :latitude, :field_number => 5
54
+ attr.store :longitude, :field_number => 6
55
+ end
56
+ end
57
+ end
58
+
59
+ Once you have (1) set up the order of data mining and (2) defined <tt>mine_data</tt> blocks in your classes, you can:
60
+
61
+ $ rake data_miner:mine
62
+
63
+ ==Complete example
64
+
65
+ ~ $ rails testapp
66
+ ~ $ cd testapp/
67
+ ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_id:integer latitude:float longitude:float
68
+ ~/testapp $ ./script/generate model Country iso_3166:string name:string
69
+ ~/testapp $ rake db:migrate
70
+ ~/testapp $ touch lib/tasks/data_miner_tasks.rake
71
+ [...edit per quick start...]
72
+ ~/testapp $ touch config/initializers/data_miner_config.rb
73
+ [...edit per quick start...]
74
+ ~/testapp $ rake data_miner:mine
75
+
76
+ Now you should have
77
+
78
+ ~/testapp $ ./script/console
79
+ Loading development environment (Rails 2.3.3)
80
+ >> Airport.first.iata_code
81
+ => "GKA"
82
+ >> Airport.first.country.name
83
+ => "Papua New Guinea"
84
+
85
+ ==Copyright
86
+
87
+ Copyright (c) 2009 Brighter Planet. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,65 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "data_miner"
8
+ gem.summary = %Q{Mine remote data into your ActiveRecord models.}
9
+ gem.description = %Q{Mine remote data into your ActiveRecord models.}
10
+ gem.email = "seamus@abshere.net"
11
+ gem.homepage = "http://github.com/seamusabshere/data_miner"
12
+ gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
13
+ %w{ activerecord activesupport seamusabshere-remote_table seamusabshere-errata }.each { |name| gem.add_dependency name }
14
+ gem.require_path = "lib"
15
+ gem.files.include %w(lib/data_miner) unless gem.files.empty? # seems to fail once it's in the wild
16
+ gem.rdoc_options << '--line-numbers' << '--inline-source'
17
+ # gem.rubyforge_project = "dataminer"
18
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
19
+ end
20
+
21
+ Jeweler::RubyforgeTasks.new do |rubyforge|
22
+ rubyforge.doc_task = "rdoc"
23
+ end
24
+ rescue LoadError
25
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
26
+ end
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new(:test) do |test|
30
+ test.libs << 'lib' << 'test'
31
+ test.pattern = 'test/**/*_test.rb'
32
+ test.verbose = true
33
+ end
34
+
35
+ begin
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/*_test.rb'
40
+ test.verbose = true
41
+ end
42
+ rescue LoadError
43
+ task :rcov do
44
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
45
+ end
46
+ end
47
+
48
+
49
+
50
+
51
+ task :default => :test
52
+
53
+ require 'rake/rdoctask'
54
+ Rake::RDocTask.new do |rdoc|
55
+ if File.exist?('VERSION')
56
+ version = File.read('VERSION')
57
+ else
58
+ version = ""
59
+ end
60
+
61
+ rdoc.rdoc_dir = 'rdoc'
62
+ rdoc.title = "data_miner #{version}"
63
+ rdoc.rdoc_files.include('README*')
64
+ rdoc.rdoc_files.include('lib/**/*.rb')
65
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,74 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{data_miner}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
+ s.date = %q{2009-08-19}
13
+ s.description = %q{Mine remote data into your ActiveRecord models.}
14
+ s.email = %q{seamus@abshere.net}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "data_miner.gemspec",
27
+ "lib/data_miner.rb",
28
+ "lib/data_miner/active_record_ext.rb",
29
+ "lib/data_miner/attribute.rb",
30
+ "lib/data_miner/attribute_collection.rb",
31
+ "lib/data_miner/configuration.rb",
32
+ "lib/data_miner/dictionary.rb",
33
+ "lib/data_miner/step.rb",
34
+ "lib/data_miner/step/associate.rb",
35
+ "lib/data_miner/step/await.rb",
36
+ "lib/data_miner/step/callback.rb",
37
+ "lib/data_miner/step/derive.rb",
38
+ "lib/data_miner/step/import.rb",
39
+ "lib/data_miner/william_james_cartesian_product.rb",
40
+ "test/data_miner_test.rb",
41
+ "test/test_helper.rb"
42
+ ]
43
+ s.homepage = %q{http://github.com/seamusabshere/data_miner}
44
+ s.rdoc_options = ["--charset=UTF-8", "--line-numbers", "--inline-source"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = %q{1.3.5}
47
+ s.summary = %q{Mine remote data into your ActiveRecord models.}
48
+ s.test_files = [
49
+ "test/data_miner_test.rb",
50
+ "test/test_helper.rb"
51
+ ]
52
+
53
+ if s.respond_to? :specification_version then
54
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
55
+ s.specification_version = 3
56
+
57
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
58
+ s.add_runtime_dependency(%q<activerecord>, [">= 0"])
59
+ s.add_runtime_dependency(%q<activesupport>, [">= 0"])
60
+ s.add_runtime_dependency(%q<seamusabshere-remote_table>, [">= 0"])
61
+ s.add_runtime_dependency(%q<seamusabshere-errata>, [">= 0"])
62
+ else
63
+ s.add_dependency(%q<activerecord>, [">= 0"])
64
+ s.add_dependency(%q<activesupport>, [">= 0"])
65
+ s.add_dependency(%q<seamusabshere-remote_table>, [">= 0"])
66
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
67
+ end
68
+ else
69
+ s.add_dependency(%q<activerecord>, [">= 0"])
70
+ s.add_dependency(%q<activesupport>, [">= 0"])
71
+ s.add_dependency(%q<seamusabshere-remote_table>, [">= 0"])
72
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
73
+ end
74
+ end
data/lib/data_miner.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'rubygems'
2
+ require 'activesupport'
3
+ require 'activerecord'
4
+ require 'remote_table'
5
+ require 'errata'
6
+
7
+ require 'data_miner/active_record_ext'
8
+ require 'data_miner/attribute'
9
+ require 'data_miner/attribute_collection'
10
+ require 'data_miner/configuration'
11
+ require 'data_miner/dictionary'
12
+ require 'data_miner/step'
13
+ require 'data_miner/step/associate'
14
+ require 'data_miner/step/await'
15
+ require 'data_miner/step/callback'
16
+ require 'data_miner/step/derive'
17
+ require 'data_miner/step/import'
18
+ require 'data_miner/william_james_cartesian_product' # TODO: move to gem
19
+
20
# Public facade of the library.  Every method here simply forwards to the
# class-level API of DataMiner::Configuration.
module DataMiner
  # Mine data for the queued classes; +options+ is passed through untouched.
  def self.mine(options = {})
    Configuration.mine(options)
  end

  # Hands the given block on to Configuration.enqueue.
  def self.enqueue(&block)
    Configuration.enqueue(&block)
  end

  # The list of classes held by Configuration.
  def self.classes
    Configuration.classes
  end
end
35
+
36
+ ActiveRecord::Base.class_eval do
37
+ include DataMiner::ActiveRecordExt
38
+ end
@@ -0,0 +1,15 @@
1
module DataMiner
  # Mixin that adds the +mine_data+ class macro to whatever includes it.
  module ActiveRecordExt
    module ClassMethods
      # Creates a Configuration for the calling class, stores it in the
      # class-level +data_mine+ accessor and yields it to the block so that
      # steps can be declared on it.
      def mine_data(options = {}, &block)
        cattr_accessor :data_mine
        configuration = Configuration.new(self)
        self.data_mine = configuration
        yield data_mine
      end
    end

    # Hook run on +include+: exposes ClassMethods on the including class.
    def self.included(base)
      base.extend ClassMethods
    end
  end
end
@@ -0,0 +1,279 @@
1
+ module DataMiner
2
+ class Attribute
3
+ attr_accessor :klass, :name, :options_for_step, :affected_by_steps, :key_for_steps
4
+
5
+ def initialize(klass, name)
6
+ @klass = klass
7
+ @name = name.to_sym
8
+ @options_for_step = {}
9
+ @affected_by_steps = []
10
+ @key_for_steps = []
11
+ end
12
+
13
+ def inspect
14
+ "Attribute(#{klass}.#{name})"
15
+ end
16
+
17
+ def affected_by!(step, options = {})
18
+ self.options_for_step[step] = options
19
+ self.affected_by_steps << step
20
+ end
21
+
22
+ def affected_by?(step)
23
+ affected_by_steps.include?(step)
24
+ end
25
+
26
+ def key_for!(step, options = {})
27
+ self.options_for_step[step] = options
28
+ self.key_for_steps << step
29
+ end
30
+
31
+ def key_for?(step)
32
+ key_for_steps.include?(step)
33
+ end
34
+
35
# Looks +key+ up in the dictionary configured for +step+.
#
# Returns nil when there is no match (or the matched cell is empty), the bare
# value when exactly one value was found, and an Array when several were.
#
# The previous `return *array` form only stripped the array wrapper on
# Ruby 1.8; on later Rubies a splatted return always yields an Array, so the
# unwrapping is spelled out here to behave the same everywhere.
def value_in_dictionary(step, key)
  values = dictionary(step).lookup(key)
  return nil if values.nil? || values.empty?
  values.size == 1 ? values.first : values
end
38
+
39
+ def value_in_source(step, row)
40
+ if wants_static?(step)
41
+ value = static(step)
42
+ elsif field_number(step)
43
+ if field_number(step).is_a?(Range)
44
+ value = field_number(step).map { |n| row[n] }.join(delimiter(step))
45
+ else
46
+ value = row[field_number(step)]
47
+ end
48
+ else
49
+ value = row[name_in_source(step)]
50
+ end
51
+ return nil if value.nil?
52
+ return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
53
+ value = value.to_s
54
+ value = value[keep(step)] if wants_keep?(step)
55
+ value = do_split(step, value) if wants_split?(step)
56
+ # taken from old errata... maybe we want to do this here
57
+ value.gsub!(/[ ]+/, ' ')
58
+ # text.gsub!('- ', '-')
59
+ value.gsub!(/([^\\])~/, '\1 ')
60
+ value.strip!
61
+ value.upcase! if wants_upcase?(step)
62
+ value = do_convert(step, row, value) if wants_conversion?(step)
63
+ value = do_sprintf(step, value) if wants_sprintf?(step)
64
+ value
65
+ end
66
+
67
+ def value_from_row(step, row)
68
+ value = value_in_source(step, row)
69
+ return value if value.is_a?(ActiveRecord::Base) # carry through trapdoor
70
+ value = value_in_dictionary(step, value) if wants_dictionary?(step)
71
+ value = value_as_association(step, value) if wants_inline_association?
72
+ value
73
+ end
74
+
75
+ def value_as_association(step, value)
76
+ @_value_as_association ||= {}
77
+ @_value_as_association[step] ||= {}
78
+ @_value_as_association[step][value] ||= reflection_klass(step).send("find_by_#{foreign_key(step)}", value)
79
+ end
80
+
81
+ # this will overwrite nils, even if wants_overwriting?(step) is false
82
+ def set_record_from_row(step, record, row)
83
+ return if !wants_overwriting?(step) and !record.send(name).nil?
84
+ value = value_from_row(step, row)
85
+ record.send "#{name}=", value
86
+ $stderr.puts("ActiveRecord didn't like trying to set #{klass}.#{name} = #{value}") if !value.nil? and record.send(name).nil?
87
+ end
88
+
89
+ def perform(step)
90
+ case step.variant
91
+ when :associate
92
+ perform_association(step)
93
+ when :derive
94
+ if wants_update_all?(step)
95
+ perform_update_all(step)
96
+ elsif wants_weighted_average?(step)
97
+ perform_weighted_average(step)
98
+ else
99
+ perform_callback(step)
100
+ end
101
+ when :import
102
+ raise "This shouldn't be called, the import step is special"
103
+ end
104
+ end
105
+
106
+ def perform_association(step)
107
+ raise "dictionary and prefix don't mix" if wants_dictionary?(step) and wants_prefix?(step)
108
+ klass.update_all("#{reflection.primary_key_name} = NULL") if wants_nullification?(step)
109
+ if wants_create?(step)
110
+ klass.find_in_batches do |batch|
111
+ batch.each do |record|
112
+ if wants_prefix?(step)
113
+ sql = "SELECT reflection_table.id FROM #{reflection_klass(step).quoted_table_name} AS reflection_table INNER JOIN #{klass.quoted_table_name} AS klass_table ON LEFT(klass_table.#{key(step)}, LENGTH(reflection_table.#{foreign_key(step)})) = reflection_table.#{foreign_key(step)} WHERE klass_table.id = #{record.id} ORDER BY LENGTH(reflection_table.#{foreign_key(step)}) DESC"
114
+ associated_id = ActiveRecord::Base.connection.select_value(sql)
115
+ next if associated_id.blank?
116
+ record.send("#{reflection.primary_key_name}=", associated_id)
117
+ else
118
+ dynamic_finder_value = record.send(key(step))
119
+ dynamic_finder_value = value_in_dictionary(step, dynamic_finder_value) if wants_dictionary?(step)
120
+ next if dynamic_finder_value.blank?
121
+ associated = reflection_klass(step).send("find_or_create_by_#{foreign_key(step)}", dynamic_finder_value) # TODO cache results
122
+ record.send("#{name}=", associated)
123
+ end
124
+ record.save
125
+ end
126
+ end
127
+ else
128
+ reflection_klass(step).find_in_batches do |batch|
129
+ batch.each do |reflection_record|
130
+ klass.update_all ["#{reflection.primary_key_name} = ?", reflection_record.id], ["#{key(step)} = ?", reflection_record.send(foreign_key(step))]
131
+ end
132
+ end
133
+ end
134
+ end
135
+
136
+ def perform_update_all(step)
137
+ klass.update_all("#{name} = #{set(step)}", conditions(step))
138
+ end
139
+
140
+ def perform_weighted_average(step)
141
+ # handle weighting by scopes instead of associations
142
+ if weighting_association(step) and !klass.reflect_on_association(weighting_association(step))
143
+ klass.find_in_batches do |batch|
144
+ batch.each do |record|
145
+ record.send "#{name}=", record.send(weighting_association(step)).weighted_average(name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step))
146
+ record.save
147
+ end
148
+ end
149
+ else # there's no weighting association OR there is one and it's a valid association
150
+ klass.update_all_weighted_averages name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step), :association => weighting_association(step)
151
+ end
152
+ end
153
+
154
# Invokes the derive callback for this attribute on +klass+.
#
# The callback's arity decides what it is given:
#   0 - nothing
#   1 - the attribute name
#   2 - the attribute name and the options recorded for +step+
# A callback with any other arity (for example one taking optional or splat
# arguments, which reports a negative arity) is not called and nil is returned.
#
# Uses `when ... then` because the `when 0:` colon form is a syntax error on
# Ruby 1.9 and later.
def perform_callback(step)
  callback_name = callback(step)
  case klass.method(callback_name).arity
  when 0 then klass.send(callback_name)
  when 1 then klass.send(callback_name, name)
  when 2 then klass.send(callback_name, name, options_for_step[step])
  end
end
164
+
165
+ def unit_from_source(step, row)
166
+ row[unit_in_source(step)].to_s.strip.underscore.to_sym
167
+ end
168
+
169
+ def do_convert(step, row, value)
170
+ from_unit = from(step) || unit_from_source(step, row)
171
+ value.to_f.convert(from_unit, to(step))
172
+ end
173
+
174
+ def do_sprintf(step, value)
175
+ if /\%[0-9\.]*f/.match(sprintf(step))
176
+ value = value.to_f
177
+ elsif /\%[0-9\.]*d/.match(sprintf(step))
178
+ value = value.to_i
179
+ end
180
+ sprintf(step) % value
181
+ end
182
+
183
+ def do_split(step, value)
184
+ pattern = split_options(step)[:pattern] || /\s+/ # default is split on whitespace
185
+ keep = split_options(step)[:keep] || 0 # default is keep first element
186
+ value.to_s.split(pattern)[keep].to_s
187
+ end
188
+
189
+ def column_type
190
+ @column_type ||= klass.columns_hash[name.to_s].type
191
+ end
192
+
193
+ {
194
+ :static => 'options_for_step[step].has_key?(:static)',
195
+ :prefix => :prefix,
196
+ :create => 'create(step) != false',
197
+ :keep => :keep,
198
+ :upcase => :upcase,
199
+ :conversion => '!from(step).nil? or !unit_in_source(step).nil?',
200
+ :sprintf => :sprintf,
201
+ :dictionary => :dictionary_options,
202
+ :split => :split_options,
203
+ :update_all => :set,
204
+ :nullification => 'nullify(step) != false',
205
+ :overwriting => 'overwrite(step) != false',
206
+ :weighted_average => '!weighting_association(step).nil? or !weighting_column(step).nil?'
207
+ }.each do |name, condition|
208
+ condition = "!#{condition}(step).nil?" if condition.is_a?(Symbol)
209
+ eval <<-EOS
210
+ def wants_#{name}?(step)
211
+ #{condition}
212
+ end
213
+ EOS
214
+ end
215
+
216
+ {
217
+ :name_in_source => { :default => :name, :stringify => true },
218
+ :key => { :default => :name, :stringify => true },
219
+ :foreign_key => { :default => 'key(step)', :stringify => true },
220
+ :delimiter => { :default => '", "' }
221
+ }.each do |name, options|
222
+ eval <<-EOS
223
+ def #{name}(step)
224
+ (options_for_step[step][:#{name}] || #{options[:default]})#{'.to_s' if options[:stringify]}
225
+ end
226
+ EOS
227
+ end
228
+
229
+ def reflection
230
+ if @_reflection.nil?
231
+ @_reflection = klass.reflect_on_association(name) || :missing
232
+ reflection
233
+ elsif @_reflection == :missing
234
+ nil
235
+ else
236
+ @_reflection
237
+ end
238
+ end
239
+
240
+ def reflection_klass(step)
241
+ return nil unless reflection
242
+ if reflection.options[:polymorphic]
243
+ polymorphic_type(step).constantize
244
+ else
245
+ reflection.klass
246
+ end
247
+ end
248
+
249
+ def wants_inline_association?
250
+ !reflection.nil?
251
+ end
252
+
253
+ def callback(step)
254
+ (options_for_step[step][:callback] || "derive_#{name}").to_sym
255
+ end
256
+
257
+ def dictionary(step)
258
+ raise "shouldn't ask for this" unless wants_dictionary?(step) # don't try to initialize if there are no dictionary options
259
+ @dictionaries ||= {}
260
+ @dictionaries[step] ||= Dictionary.new(dictionary_options(step))
261
+ end
262
+
263
+ %w(dictionary split).each do |name|
264
+ eval <<-EOS
265
+ def #{name}_options(step)
266
+ options_for_step[step][:#{name}]
267
+ end
268
+ EOS
269
+ end
270
+
271
+ %w(from to set conditions weighting_association weighting_column weighting_disaggregator sprintf nullify overwrite upcase prefix unit_in_source field_number keep create static polymorphic_type).each do |name|
272
+ eval <<-EOS
273
+ def #{name}(step)
274
+ options_for_step[step][:#{name}]
275
+ end
276
+ EOS
277
+ end
278
+ end
279
+ end
@@ -0,0 +1,51 @@
1
module DataMiner
  # Registry of the Attribute objects declared for one model class, with
  # queries for the attributes that a given step keys on or writes to.
  class AttributeCollection
    attr_accessor :klass, :attributes

    def initialize(klass)
      @klass = klass
      @attributes = {}
    end

    # Marks +attr_name+ as a key attribute for +step+.
    def key!(step, attr_name, attr_options = {})
      find_or_initialize(attr_name).key_for!(step, attr_options)
    end

    # Marks +attr_name+ as written by +step+.
    def affect!(step, attr_name, attr_options = {})
      find_or_initialize(attr_name).affected_by!(step, attr_options)
    end

    # Marks every content column of the step's class as written by +step+,
    # skipping the names given in <tt>options[:except]</tt>.
    def affect_all_content_columns!(step, options = {})
      except = Array.wrap(options[:except]).map(&:to_sym)
      column_names = step.klass.content_columns.map(&:name)
      column_names.reject { |column_name| except.include?(column_name.to_sym) }.each do |column_name|
        find_or_initialize(column_name).affected_by!(step)
      end
    end

    def all_affected_by(step)
      attributes.values.select { |attr| attr.affected_by?(step) }
    end

    def all_keys_for(step)
      attributes.values.select { |attr| attr.key_for?(step) }
    end

    # Attributes that +step+ either writes or keys on, without duplicates.
    def all_for(step)
      (all_affected_by(step) + all_keys_for(step)).uniq
    end

    def has_keys_for?(step)
      attributes.values.any? { |attr| attr.key_for?(step) }
    end

    # True when at least one attribute written by +step+ does not want overwriting.
    def has_conditional_writes_for?(step)
      all_affected_by(step).any? { |attr| !attr.wants_overwriting?(step) }
    end

    private

    # Attribute normalises its own name with +to_sym+, so the registry key is
    # normalised the same way.  Otherwise 'name' (as produced from
    # content_columns) and :name (as written in the DSL) would create two
    # separate Attribute objects for one column.
    def find_or_initialize(attr_name)
      attributes[attr_name.to_sym] ||= Attribute.new(klass, attr_name)
    end
  end
end
@@ -0,0 +1,77 @@
1
module DataMiner
  # Holds the ordered steps declared in one class's +mine_data+ block and,
  # at class level, the queue of classes that should be mined.
  class Configuration
    attr_accessor :steps, :klass, :counter, :attributes, :awaiting
    cattr_accessor :classes
    self.classes = []

    def initialize(klass)
      @steps = []
      @klass = klass
      @counter = 0
      @attributes = AttributeCollection.new(klass)
    end

    # Generates the DSL methods +import+, +associate+, +derive+ and +await+.
    # Each accepts one of three call shapes:
    #   FORM A: (options_hash)
    #   FORM B: (attr_name, attr_options = {})
    #   FORM C: (options_hash = {}) { |step| ... }
    %w(import associate derive await).each do |variant|
      step_class = "Step::#{variant.camelcase}"
      eval <<-EOS
        def #{variant}(*args, &block)
          self.counter += 1
          first = args[0]
          if block_given?             # FORM C
            step_options = first || {}
          elsif first.is_a?(Hash)     # FORM A
            step_options = first
          else                        # FORM B
            attr_name = first
            attr_options = args[1] || {}
            step_options = {}
            block = Proc.new { |attr| attr.affect attr_name, attr_options }
          end
          set_awaiting!(step_options)
          self.steps << #{step_class}.new(self, counter, step_options, &block)
        end
      EOS
    end

    # Tags +step_options+ with the step currently being awaited, if any.
    def set_awaiting!(step_options)
      step_options.merge!(:awaiting => awaiting) unless awaiting.nil?
    end

    # Starts tagging newly declared steps as awaiting +step+.
    def awaiting!(step)
      self.awaiting = step
    end

    # Stops tagging newly declared steps.
    def stop_awaiting!
      self.awaiting = nil
    end

    # Mine data for this class by performing every step in order.
    def mine(options = {})
      steps.each { |step| step.perform(options) }
    end

    # Mine data. Defaults to all queued classes.
    #
    # Options
    # * <tt>:class_names</tt>: an array of class names to restrict mining to
    def self.mine(options = {})
      wanted = options[:class_names]
      classes.each do |klass|
        next unless wanted.blank? || wanted.include?(klass.name)
        klass.data_mine.mine(options)
      end
    end

    # Queue up all the ActiveRecord classes that DataMiner should touch.
    #
    # Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
    def self.enqueue(&block)
      yield classes
    end
  end
end
@@ -0,0 +1,36 @@
1
module DataMiner
  # Translates values through a remote lookup table: finds the row whose
  # +key_name+ cell matches a key and returns that row's +value_name+ cell,
  # split on semicolons.
  class Dictionary
    attr_accessor :key_name, :value_name, :sprintf, :table

    # Options read here: <tt>:key</tt>, <tt>:returns</tt>, <tt>:sprintf</tt>
    # (defaults to '%s') and <tt>:url</tt> (handed to RemoteTable).
    def initialize(options = {})
      @key_name = options[:key]
      @value_name = options[:returns]
      @sprintf = options[:sprintf] || '%s'
      @table = RemoteTable.new(:url => options[:url])
    end

    # Looks +key+ up using the column names and format given at construction.
    def lookup(key)
      find(key_name, key, value_name, :sprintf => self.sprintf)
    end

    # Returns the values found in the first matching row as an Array of
    # strings, or nil when no row matches.
    def find(key_name, key, value_name, options = {})
      # The search key does not change per row, so normalise it once.
      wanted = normalize_for_comparison(key, options)
      match = table.rows.detect { |row| normalize_for_comparison(row[key_name], options) == wanted }
      # Whitespace on either side of the semicolon is optional ("a;b" and
      # "a ; b" both split); the old pattern required whitespace after it.
      match[value_name].to_s.split(/\s*;\s*/) if match
    end

    private

    # Formats +string+ with <tt>options[:sprintf]</tt> (coercing to a number
    # first when the format is a float or integer one) and strips whitespace,
    # so both sides of a comparison share one canonical form.
    def normalize_for_comparison(string, options = {})
      format = options[:sprintf]
      if format
        if /\%[0-9\.]*f/.match(format)
          string = string.to_f
        elsif /\%[0-9\.]*d/.match(format)
          string = string.to_i
        end
        # Use the format that was just inspected, not the instance default.
        string = format % string
      end
      string.to_s.strip
    end
  end
end
@@ -0,0 +1,60 @@
1
module DataMiner
  # Base class for one numbered unit of work inside a Configuration.
  # Subclasses are named after their variant (Import, Derive, ...).
  class Step
    attr_accessor :configuration, :number, :options
    delegate :klass, :attributes, :to => :configuration

    # Records the step, lets the block declare attributes on it and then
    # applies the step-wide options (:affect_all, :callback, :headers).
    def initialize(configuration, number, options = {}, &block)
      @configuration, @number, @options = configuration, number, options
      yield self if block_given? # pull in attributes
      if options[:affect_all] == :content_columns
        attributes.affect_all_content_columns!(self, :except => options[:except])
      end
      if options[:callback]
        affected_attributes.each { |attr| attr.options_for_step[self][:callback] = options[:callback] }
      end
      if options[:headers] == :upcase # TODO remove
        all_attributes.each do |attr|
          attr.options_for_step[self][:name_in_source] = attr.name_in_source(self).upcase
        end
      end
    end

    # The step kind as a symbol, derived from the class name.
    def variant
      short_name = self.class.name.demodulize
      short_name.underscore.to_sym
    end

    # True when this step was declared while another step was being awaited.
    def awaiting?
      !options[:awaiting].nil?
    end

    def inspect
      "Step(#{klass} #{variant.to_s.camelcase} #{number})"
    end

    def signature
      "#{klass} step #{number}: #{variant}"
    end

    # Performs every affected attribute, unless the step is awaiting and
    # <tt>:force</tt> was not given.
    def perform(options = {})
      return if awaiting? && !options[:force]
      affected_attributes.each { |attr| attr.perform(self) }
      $stderr.puts "performed #{signature}"
    end

    def affected_attributes
      @affected_attributes ||= attributes.all_affected_by(self)
    end

    def key_attributes
      @key_attributes ||= attributes.all_keys_for(self)
    end

    def all_attributes
      @all_attributes ||= attributes.all_for(self)
    end

    # DSL: declare +attr_name+ as a key for this step.
    def key(attr_name, attr_options = {})
      attributes.key!(self, attr_name, attr_options)
    end

    # DSL: declare +attr_name+ as written by this step (also available as +store+).
    def affect(attr_name, attr_options = {})
      attributes.affect!(self, attr_name, attr_options)
    end
    alias store affect
  end
end
@@ -0,0 +1,9 @@
1
module DataMiner
  class Step
    class Associate < Step
      # The base signature followed by the name of the first affected attribute.
      def signature
        [super, affected_attributes.first.name].join(' ')
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module DataMiner
  class Step
    # A step that defers the steps declared inside its block until another
    # class has been mined.
    class Await < Step
      attr_accessor :other_class

      # Does not call super.  Puts +configuration+ into awaiting mode while the
      # block declares steps, so those steps get tagged as awaiting this one.
      # <tt>:other_class</tt> is removed from +options+ and kept separately.
      def initialize(configuration, number, options = {}, &block)
        @configuration = configuration
        @number = number
        @options = options
        @other_class = options.delete :other_class
        configuration.awaiting! self
        begin
          yield configuration # pull in steps
        ensure
          # Always leave awaiting mode, even when the block raises; otherwise
          # every step declared afterwards would be tagged as deferred too.
          configuration.stop_awaiting!
        end
      end

      # Appends a Callback step to the other class's configuration instead of
      # doing any work now.
      def perform(*args)
        other_class.data_mine.steps << Step::Callback.new(other_class.data_mine, self)
        $stderr.puts "added #{signature} to callbacks after #{other_class}"
      end

      # Force-performs every step that was deferred on behalf of this one.
      def callback
        $stderr.puts "starting to perform deferred steps in #{signature}..."
        all_awaiting.each { |step| step.perform :force => true }
        $stderr.puts "...done"
      end

      private

      # Steps in this configuration whose :awaiting option is this step.
      # Steps without options (such as Callback) are skipped.
      def all_awaiting
        configuration.steps.select { |step| step.options and step.options[:awaiting] == self }
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module DataMiner
  class Step
    # A step that, when performed, triggers the +callback+ of a step that
    # belongs to some other configuration.
    class Callback < Step
      attr_accessor :foreign_step

      # Does not call super; no options are recorded for this step.
      def initialize(configuration, foreign_step)
        @configuration, @foreign_step = configuration, foreign_step
        @number = "(last)"
      end

      # Runs the foreign step's callback, then logs to stderr.
      def perform(*args)
        foreign_step.callback
        $stderr.puts "performed #{signature}"
      end

      # The base signature plus the signature of the step being served.
      def signature
        own = super
        "#{own} (on behalf of #{foreign_step.signature})"
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module DataMiner
  class Step
    class Derive < Step
      # The base signature followed by the name of the first affected attribute.
      def signature
        attribute_name = affected_attributes.first.name
        "#{super} #{attribute_name}"
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
module DataMiner
  class Step
    # A step that reads rows from a RemoteTable and writes them into records
    # of the configured class.
    class Import < Step
      attr_accessor :table, :errata

      # Besides the base setup, builds the optional Errata (when
      # <tt>options[:errata]</tt> is given) and the RemoteTable, which only
      # receives the table-related options.
      def initialize(configuration, number, options = {}, &block)
        super
        @errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
        @table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
      end

      def signature
        "#{super} #{options[:url]}"
      end

      # Imports every row of the table.  With existing data the records are
      # found or initialised through a dynamic finder on the key attributes;
      # otherwise a fresh record is built per row.
      def perform(*args)
        ActiveRecord::Base.connection.execute("TRUNCATE #{klass.quoted_table_name}") if wants_truncate?
        table.each_row do |row|
          if errata
            next if errata.rejects?(row)
            errata.correct!(row)
          end
          if uses_existing_data?
            key_values = key_attributes.map { |key_attr| [ key_attr.value_from_row(self, row) ] }
            record_set = WilliamJamesCartesianProduct.cart_prod(*key_values).map do |combination|
              next if combination.include?(nil) and !wants_nil_keys?
              klass.send(dynamic_finder_name, *combination)
            end.flatten.compact # `next` leaves nil in the mapped array; drop it so skipped combinations are really skipped
          else
            record_set = klass.new
          end
          Array.wrap(record_set).each do |record|
            affected_attributes.each { |attr| attr.set_record_from_row(self, record, row) }
            record.save
          end
        end
        $stderr.puts "performed #{signature}"
      end

      # Truncate when asked to explicitly, or by default when the import does
      # not build on existing rows and <tt>:truncate</tt> is not false.
      def wants_truncate?
        options[:truncate] == true || (options[:truncate] != false && !uses_existing_data?)
      end

      def wants_nil_keys?
        options[:allow_nil_keys] == true
      end

      # True when the step has key attributes or conditional writes.
      #
      # The old `@x ||= a or b` parsed as `(@x ||= a) or b`, so the second
      # check was never stored and a false result was recomputed on every
      # call.  `defined?` memoises false answers as well.
      def uses_existing_data?
        unless defined?(@uses_existing_data)
          @uses_existing_data = attributes.has_keys_for?(self) || attributes.has_conditional_writes_for?(self)
        end
        @uses_existing_data
      end

      def dynamic_finder_name
        "find_or_initialize_by_#{key_attributes.map(&:name).join('_and_')}".to_sym
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # http://www.ruby-forum.com/topic/95519#200484
2
+
3
module WilliamJamesCartesianProduct
  # Cartesian product of any number of lists.
  #
  # Every combination holds one element per list, in argument order, and the
  # first list varies fastest.  With no lists the result is [[]]; if any list
  # is empty the result is [].
  def self.cart_prod(*args)
    args.inject([[]]) do |partials, list|
      list.inject([]) do |grown, element|
        grown + partials.map { |partial| partial + [element] }
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'test_helper'
2
+
3
# Test schema: (re)creates the two tables the test models below are backed by.
# :force => true is passed for both tables.
ActiveRecord::Schema.define(:version => 20090819143429) do
  create_table('airports', :force => true) do |table|
    table.string('iata_code')
    table.string('name')
    table.string('city')
    table.integer('country_id')
    table.float('latitude')
    table.float('longitude')
    table.datetime('created_at')
    table.datetime('updated_at')
  end

  create_table('countries', :force => true) do |table|
    table.string('iso_3166')
    table.string('name')
    table.datetime('created_at')
    table.datetime('updated_at')
  end
end
21
+
22
# Test model: countries mined from a remote CSV of ISO 3166 codes.
class Country < ActiveRecord::Base
  mine_data do |step|
    # Import country names and country codes. The 'country code' column is
    # declared both as the key and as a stored attribute.
    step.import(:url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv') do |attr|
      attr.key(:iso_3166, :name_in_source => 'country code')
      attr.store(:iso_3166, :name_in_source => 'country code')
      attr.store(:name, :name_in_source => 'country')
    end
  end
end
32
+
33
# Test model: airports mined from the OpenFlights data file, which is read
# without a header row, so columns are addressed by position.
class Airport < ActiveRecord::Base
  belongs_to :country

  mine_data do |step|
    # Import airport iata_code, name, etc.
    source = {
      :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat',
      :headers => false
    }
    step.import(source) do |attr|
      attr.key(:iata_code, :field_number => 3)
      attr.store(:name, :field_number => 0)
      attr.store(:city, :field_number => 1)
      # will use Country.find_by_name(X)
      attr.store(:country, :field_number => 2, :foreign_key => :name)
      attr.store(:iata_code, :field_number => 3)
      attr.store(:latitude, :field_number => 5)
      attr.store(:longitude, :field_number => 6)
    end
  end
end
48
+
49
# Register both test models with DataMiner, Country first and Airport second.
DataMiner.enqueue do |queue|
  [Country, Airport].each { |model| queue << model }
end
53
+
54
# Exercises mining of a single model, both through the model itself and
# through the DataMiner.mine entry point. The imports defined on the models
# point at remote URLs.
class DataMinerTest < Test::Unit::TestCase
  # Empty both tables after every test, airports first.
  def teardown
    [Airport, Country].each { |model| model.delete_all }
  end

  should "mine a single class" do
    Country.data_mine.mine

    uruguay = Country.find_by_iso_3166('UY')
    assert_equal 'Uruguay', uruguay.name
    # Only Country was mined, so no airports may have been created.
    assert_equal 0, Airport.count
  end

  should "mine a single class using the API" do
    DataMiner.mine(:class_names => ['Country'])

    uruguay = Country.find_by_iso_3166('UY')
    assert_equal 'Uruguay', uruguay.name
    # Only Country was requested, so no airports may have been created.
    assert_equal 0, Airport.count
  end

  # should "mine all classes" do
  #   DataMiner.mine
  #   uy = Country.find_by_iso_3166('UY')
  #   assert_equal 'Uruguay', uy.name
  #   assert_equal uy, Airport.find_by_iata_code('MVD').country
  # end
end
@@ -0,0 +1,16 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'sqlite3'
5
+
6
# Make the gem's lib/ directory and this test directory loadable, then load
# the library under test.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'data_miner'

# The SQLite database lives next to this helper (test/test.sqlite3). The path
# is anchored to this file's directory rather than the current working
# directory, so the same database is used wherever the tests are started from.
ActiveRecord::Base.establish_connection(
  'adapter' => 'sqlite3',
  'database' => File.expand_path('test.sqlite3', File.dirname(__FILE__))
)

# Reopened as a place for shared test helpers; currently empty.
class Test::Unit::TestCase
end
metadata ADDED
@@ -0,0 +1,119 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: seamusabshere-data_miner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Seamus Abshere
8
+ - Andy Rossmeissl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-08-19 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: activerecord
18
+ type: :runtime
19
+ version_requirement:
20
+ version_requirements: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ version:
26
+ - !ruby/object:Gem::Dependency
27
+ name: activesupport
28
+ type: :runtime
29
+ version_requirement:
30
+ version_requirements: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ version:
36
+ - !ruby/object:Gem::Dependency
37
+ name: seamusabshere-remote_table
38
+ type: :runtime
39
+ version_requirement:
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ - !ruby/object:Gem::Dependency
47
+ name: seamusabshere-errata
48
+ type: :runtime
49
+ version_requirement:
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ description: Mine remote data into your ActiveRecord models.
57
+ email: seamus@abshere.net
58
+ executables: []
59
+
60
+ extensions: []
61
+
62
+ extra_rdoc_files:
63
+ - LICENSE
64
+ - README.rdoc
65
+ files:
66
+ - .document
67
+ - .gitignore
68
+ - LICENSE
69
+ - README.rdoc
70
+ - Rakefile
71
+ - VERSION
72
+ - data_miner.gemspec
73
+ - lib/data_miner.rb
74
+ - lib/data_miner/active_record_ext.rb
75
+ - lib/data_miner/attribute.rb
76
+ - lib/data_miner/attribute_collection.rb
77
+ - lib/data_miner/configuration.rb
78
+ - lib/data_miner/dictionary.rb
79
+ - lib/data_miner/step.rb
80
+ - lib/data_miner/step/associate.rb
81
+ - lib/data_miner/step/await.rb
82
+ - lib/data_miner/step/callback.rb
83
+ - lib/data_miner/step/derive.rb
84
+ - lib/data_miner/step/import.rb
85
+ - lib/data_miner/william_james_cartesian_product.rb
86
+ - test/data_miner_test.rb
87
+ - test/test_helper.rb
88
+ has_rdoc: false
89
+ homepage: http://github.com/seamusabshere/data_miner
90
+ licenses:
91
+ post_install_message:
92
+ rdoc_options:
93
+ - --charset=UTF-8
94
+ - --line-numbers
95
+ - --inline-source
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: "0"
103
+ version:
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: "0"
109
+ version:
110
+ requirements: []
111
+
112
+ rubyforge_project:
113
+ rubygems_version: 1.3.5
114
+ signing_key:
115
+ specification_version: 3
116
+ summary: Mine remote data into your ActiveRecord models.
117
+ test_files:
118
+ - test/data_miner_test.rb
119
+ - test/test_helper.rb