seamusabshere-data_miner 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ test/test.sqlite3
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Brighter Planet
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,87 @@
1
+ =data_miner
2
+
3
+ Mine remote data into your ActiveRecord models.
4
+
5
+ ==Quick start
6
+
7
+ Put this in <tt>config/environment.rb</tt>:
8
+
9
+ config.gem 'seamusabshere-data_miner', :lib => 'data_miner', :source => 'http://gems.github.com'
10
+
11
+ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't know a way to automatically include gem tasks, so you have to do this manually for now)
12
+
13
+ namespace :data_miner do
14
+ task :mine => :environment do
15
+ DataMiner.mine :class_names => ENV['CLASSES'].to_s.split(/\s*,\s*/).flatten.compact
16
+ end
17
+ end
18
+
19
+ You need to specify the order in which data should be mined. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
20
+
21
+ DataMiner.enqueue do |queue|
22
+ queue << Country # class whose data should be mined 1st
23
+ queue << Airport # class whose data should be mined 2nd
24
+ # etc
25
+ end
26
+
27
+ You need to define <tt>mine_data</tt> blocks. For example, in <tt>app/models/country.rb</tt>:
28
+
29
+ class Country < ActiveRecord::Base
30
+ mine_data do |step|
31
+ # import country names and country codes
32
+ step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
33
+ attr.key :iso_3166, :name_in_source => 'country code'
34
+ attr.store :iso_3166, :name_in_source => 'country code'
35
+ attr.store :name, :name_in_source => 'country'
36
+ end
37
+ end
38
+ end
39
+
40
+ To complete the example, in <tt>app/models/airport.rb</tt>:
41
+
42
+ class Airport < ActiveRecord::Base
43
+ belongs_to :country
44
+
45
+ mine_data do |step|
46
+ # import airport iata_code, name, etc.
47
+ step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
48
+ attr.key :iata_code, :field_number => 3
49
+ attr.store :name, :field_number => 0
50
+ attr.store :city, :field_number => 1
51
+ attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
52
+ attr.store :iata_code, :field_number => 3
53
+ attr.store :latitude, :field_number => 5
54
+ attr.store :longitude, :field_number => 6
55
+ end
56
+ end
57
+ end
58
+
59
+ Once you have (1) set up the order of data mining and (2) defined <tt>mine_data</tt> blocks in your classes, you can:
60
+
61
+ $ rake data_miner:mine
62
+
63
+ ==Complete example
64
+
65
+ ~ $ rails testapp
66
+ ~ $ cd testapp/
67
+ ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_id:integer latitude:float longitude:float
68
+ ~/testapp $ ./script/generate model Country iso_3166:string name:string
69
+ ~/testapp $ rake db:migrate
70
+ ~/testapp $ touch lib/tasks/data_miner_tasks.rake
71
+ [...edit per quick start...]
72
+ ~/testapp $ touch config/initializers/data_miner_config.rb
73
+ [...edit per quick start...]
74
+ ~/testapp $ rake data_miner:mine
75
+
76
+ Now you should have
77
+
78
+ ~/testapp $ ./script/console
79
+ Loading development environment (Rails 2.3.3)
80
+ >> Airport.first.iata_code
81
+ => "GKA"
82
+ >> Airport.first.country.name
83
+ => "Papua New Guinea"
84
+
85
+ ==Copyright
86
+
87
+ Copyright (c) 2009 Brighter Planet. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,65 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "data_miner"
8
+ gem.summary = %Q{Mine remote data into your ActiveRecord models.}
9
+ gem.description = %Q{Mine remote data into your ActiveRecord models.}
10
+ gem.email = "seamus@abshere.net"
11
+ gem.homepage = "http://github.com/seamusabshere/data_miner"
12
+ gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
13
+ %w{ activerecord activesupport seamusabshere-remote_table seamusabshere-errata }.each { |name| gem.add_dependency name }
14
+ gem.require_path = "lib"
15
+ gem.files.include %w(lib/data_miner) unless gem.files.empty? # seems to fail once it's in the wild
16
+ gem.rdoc_options << '--line-numbers' << '--inline-source'
17
+ # gem.rubyforge_project = "dataminer"
18
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
19
+ end
20
+
21
+ Jeweler::RubyforgeTasks.new do |rubyforge|
22
+ rubyforge.doc_task = "rdoc"
23
+ end
24
+ rescue LoadError
25
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
26
+ end
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new(:test) do |test|
30
+ test.libs << 'lib' << 'test'
31
+ test.pattern = 'test/**/*_test.rb'
32
+ test.verbose = true
33
+ end
34
+
35
+ begin
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/*_test.rb'
40
+ test.verbose = true
41
+ end
42
+ rescue LoadError
43
+ task :rcov do
44
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
45
+ end
46
+ end
47
+
48
+
49
+
50
+
51
+ task :default => :test
52
+
53
+ require 'rake/rdoctask'
54
+ Rake::RDocTask.new do |rdoc|
55
+ if File.exist?('VERSION')
56
+ version = File.read('VERSION')
57
+ else
58
+ version = ""
59
+ end
60
+
61
+ rdoc.rdoc_dir = 'rdoc'
62
+ rdoc.title = "data_miner #{version}"
63
+ rdoc.rdoc_files.include('README*')
64
+ rdoc.rdoc_files.include('lib/**/*.rb')
65
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,74 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{data_miner}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
+ s.date = %q{2009-08-19}
13
+ s.description = %q{Mine remote data into your ActiveRecord models.}
14
+ s.email = %q{seamus@abshere.net}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "data_miner.gemspec",
27
+ "lib/data_miner.rb",
28
+ "lib/data_miner/active_record_ext.rb",
29
+ "lib/data_miner/attribute.rb",
30
+ "lib/data_miner/attribute_collection.rb",
31
+ "lib/data_miner/configuration.rb",
32
+ "lib/data_miner/dictionary.rb",
33
+ "lib/data_miner/step.rb",
34
+ "lib/data_miner/step/associate.rb",
35
+ "lib/data_miner/step/await.rb",
36
+ "lib/data_miner/step/callback.rb",
37
+ "lib/data_miner/step/derive.rb",
38
+ "lib/data_miner/step/import.rb",
39
+ "lib/data_miner/william_james_cartesian_product.rb",
40
+ "test/data_miner_test.rb",
41
+ "test/test_helper.rb"
42
+ ]
43
+ s.homepage = %q{http://github.com/seamusabshere/data_miner}
44
+ s.rdoc_options = ["--charset=UTF-8", "--line-numbers", "--inline-source"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = %q{1.3.5}
47
+ s.summary = %q{Mine remote data into your ActiveRecord models.}
48
+ s.test_files = [
49
+ "test/data_miner_test.rb",
50
+ "test/test_helper.rb"
51
+ ]
52
+
53
+ if s.respond_to? :specification_version then
54
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
55
+ s.specification_version = 3
56
+
57
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
58
+ s.add_runtime_dependency(%q<activerecord>, [">= 0"])
59
+ s.add_runtime_dependency(%q<activesupport>, [">= 0"])
60
+ s.add_runtime_dependency(%q<seamusabshere-remote_table>, [">= 0"])
61
+ s.add_runtime_dependency(%q<seamusabshere-errata>, [">= 0"])
62
+ else
63
+ s.add_dependency(%q<activerecord>, [">= 0"])
64
+ s.add_dependency(%q<activesupport>, [">= 0"])
65
+ s.add_dependency(%q<seamusabshere-remote_table>, [">= 0"])
66
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
67
+ end
68
+ else
69
+ s.add_dependency(%q<activerecord>, [">= 0"])
70
+ s.add_dependency(%q<activesupport>, [">= 0"])
71
+ s.add_dependency(%q<seamusabshere-remote_table>, [">= 0"])
72
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
73
+ end
74
+ end
data/lib/data_miner.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'rubygems'
2
+ require 'activesupport'
3
+ require 'activerecord'
4
+ require 'remote_table'
5
+ require 'errata'
6
+
7
+ require 'data_miner/active_record_ext'
8
+ require 'data_miner/attribute'
9
+ require 'data_miner/attribute_collection'
10
+ require 'data_miner/configuration'
11
+ require 'data_miner/dictionary'
12
+ require 'data_miner/step'
13
+ require 'data_miner/step/associate'
14
+ require 'data_miner/step/await'
15
+ require 'data_miner/step/callback'
16
+ require 'data_miner/step/derive'
17
+ require 'data_miner/step/import'
18
+ require 'data_miner/william_james_cartesian_product' # TODO: move to gem
19
+
20
# Top-level entry points of the gem. Each method is a thin facade that
# forwards to DataMiner::Configuration.
module DataMiner
  class << self
    # Forwards +options+ to DataMiner::Configuration.mine and returns
    # whatever that call returns.
    def mine(options = {})
      DataMiner::Configuration.mine(options)
    end

    # Forwards the given block to DataMiner::Configuration.enqueue.
    # The call is parenthesized so Ruby does not warn about an ambiguous
    # <tt>&</tt> argument prefix.
    def enqueue(&block)
      DataMiner::Configuration.enqueue(&block)
    end

    # Returns DataMiner::Configuration.classes.
    def classes
      DataMiner::Configuration.classes
    end
  end
end
35
+
36
+ ActiveRecord::Base.class_eval do
37
+ include DataMiner::ActiveRecordExt
38
+ end
@@ -0,0 +1,15 @@
1
module DataMiner
  # Mixin that gives the including class a +mine_data+ class-level macro.
  module ActiveRecordExt
    # Class-level DSL added to whatever class includes ActiveRecordExt.
    module ClassMethods
      # Declares a +data_mine+ class accessor on the receiver, stores a new
      # Configuration built for the receiver in it, and yields that
      # configuration to the caller's block.
      #
      # +options+ is accepted but not used by this method.
      def mine_data(options = {}, &block)
        class_eval do
          cattr_accessor :data_mine
        end
        self.data_mine = Configuration.new(self)
        yield data_mine
      end
    end

    # +include+ hook: extends the including class with ClassMethods.
    def self.included(klass)
      klass.extend ClassMethods
    end
  end
end
@@ -0,0 +1,279 @@
1
module DataMiner
  # Describes how one attribute (a column or an association) of +klass+ is
  # filled in by the steps that touch it. Options are stored per step in
  # +options_for_step+, keyed by the step object itself.
  class Attribute
    attr_accessor :klass, :name, :options_for_step, :affected_by_steps, :key_for_steps

    # +klass+ is the class owning the attribute; +name+ is stored as a Symbol.
    def initialize(klass, name)
      @klass = klass
      @name = name.to_sym
      @options_for_step = {}
      @affected_by_steps = []
      @key_for_steps = []
    end

    def inspect
      "Attribute(#{klass}.#{name})"
    end

    # Registers +step+ as one that writes this attribute, with +options+.
    def affected_by!(step, options = {})
      self.options_for_step[step] = options
      self.affected_by_steps << step
    end

    def affected_by?(step)
      affected_by_steps.include?(step)
    end

    # Registers this attribute as a lookup key for +step+, with +options+.
    def key_for!(step, options = {})
      self.options_for_step[step] = options
      self.key_for_steps << step
    end

    def key_for?(step)
      key_for_steps.include?(step)
    end

    # Looks +key+ up in the step's dictionary. Returns nil for no match, the
    # bare value when exactly one value was found, and an Array otherwise.
    # Unwrapping is done explicitly because <tt>return *array</tt> only strips
    # a single-element array on Ruby 1.8.
    def value_in_dictionary(step, key)
      matches = Array(dictionary(step).lookup(key))
      matches.size > 1 ? matches : matches.first
    end

    # Extracts this attribute's raw value from +row+ and cleans it up
    # according to the step's options (keep, split, upcase, unit conversion,
    # sprintf). Returns nil when the source has no value, and returns
    # ActiveRecord objects untouched.
    #
    # The cleanup never mutates the string taken from +row+ (or from the
    # <tt>:static</tt> option), so other attributes reading the same field see
    # the original data.
    def value_in_source(step, row)
      value =
        if wants_static?(step)
          static(step)
        elsif field_number(step)
          if field_number(step).is_a?(Range)
            field_number(step).map { |n| row[n] }.join(delimiter(step))
          else
            row[field_number(step)]
          end
        else
          row[name_in_source(step)]
        end
      return nil if value.nil?
      return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
      value = value.to_s
      value = value[keep(step)] if wants_keep?(step)
      value = do_split(step, value) if wants_split?(step)
      # taken from old errata: collapse runs of spaces, then turn every
      # unescaped "~" into a space, then trim
      value = value.gsub(/[ ]+/, ' ').gsub(/([^\\])~/, '\1 ').strip
      value = value.upcase if wants_upcase?(step)
      value = do_convert(step, row, value) if wants_conversion?(step)
      value = do_sprintf(step, value) if wants_sprintf?(step)
      value
    end

    # Value from +row+ after the optional dictionary lookup and, when this
    # attribute names an association, after resolving it to a record.
    def value_from_row(step, row)
      value = value_in_source(step, row)
      return value if value.is_a?(ActiveRecord::Base) # carry through trapdoor
      value = value_in_dictionary(step, value) if wants_dictionary?(step)
      value = value_as_association(step, value) if wants_inline_association?
      value
    end

    # Finds the associated record via the dynamic finder
    # <tt>find_by_<foreign_key></tt>; non-nil results are cached per step and value.
    def value_as_association(step, value)
      @_value_as_association ||= {}
      @_value_as_association[step] ||= {}
      @_value_as_association[step][value] ||= reflection_klass(step).send("find_by_#{foreign_key(step)}", value)
    end

    # Writes the value derived from +row+ into +record+.
    # This will overwrite nils, even if wants_overwriting?(step) is false.
    def set_record_from_row(step, record, row)
      return if !wants_overwriting?(step) and !record.send(name).nil?
      value = value_from_row(step, row)
      record.send "#{name}=", value
      $stderr.puts("ActiveRecord didn't like trying to set #{klass}.#{name} = #{value}") if !value.nil? and record.send(name).nil?
    end

    # Dispatches on <tt>step.variant</tt>. Import steps are handled by the
    # step itself and must never reach this method.
    def perform(step)
      case step.variant
      when :associate
        perform_association(step)
      when :derive
        if wants_update_all?(step)
          perform_update_all(step)
        elsif wants_weighted_average?(step)
          perform_weighted_average(step)
        else
          perform_callback(step)
        end
      when :import
        raise "This shouldn't be called, the import step is special"
      end
    end

    # Fills in the association's foreign key column on +klass+ rows, either
    # record by record (when creation is wanted) or with one UPDATE per
    # record of the associated class.
    def perform_association(step)
      raise "dictionary and prefix don't mix" if wants_dictionary?(step) and wants_prefix?(step)
      klass.update_all("#{reflection.primary_key_name} = NULL") if wants_nullification?(step)
      if wants_create?(step)
        klass.find_in_batches do |batch|
          batch.each do |record|
            if wants_prefix?(step)
              sql = "SELECT reflection_table.id FROM #{reflection_klass(step).quoted_table_name} AS reflection_table INNER JOIN #{klass.quoted_table_name} AS klass_table ON LEFT(klass_table.#{key(step)}, LENGTH(reflection_table.#{foreign_key(step)})) = reflection_table.#{foreign_key(step)} WHERE klass_table.id = #{record.id} ORDER BY LENGTH(reflection_table.#{foreign_key(step)}) DESC"
              associated_id = ActiveRecord::Base.connection.select_value(sql)
              next if associated_id.blank?
              record.send("#{reflection.primary_key_name}=", associated_id)
            else
              dynamic_finder_value = record.send(key(step))
              dynamic_finder_value = value_in_dictionary(step, dynamic_finder_value) if wants_dictionary?(step)
              next if dynamic_finder_value.blank?
              associated = reflection_klass(step).send("find_or_create_by_#{foreign_key(step)}", dynamic_finder_value) # TODO cache results
              record.send("#{name}=", associated)
            end
            record.save
          end
        end
      else
        reflection_klass(step).find_in_batches do |batch|
          batch.each do |reflection_record|
            klass.update_all ["#{reflection.primary_key_name} = ?", reflection_record.id], ["#{key(step)} = ?", reflection_record.send(foreign_key(step))]
          end
        end
      end
    end

    # Runs a single UPDATE built from the step's <tt>:set</tt> and
    # <tt>:conditions</tt> options.
    def perform_update_all(step)
      klass.update_all("#{name} = #{set(step)}", conditions(step))
    end

    def perform_weighted_average(step)
      # handle weighting by scopes instead of associations
      if weighting_association(step) and !klass.reflect_on_association(weighting_association(step))
        klass.find_in_batches do |batch|
          batch.each do |record|
            record.send "#{name}=", record.send(weighting_association(step)).weighted_average(name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step))
            record.save
          end
        end
      else # there's no weighting association OR there is one and it's a valid association
        klass.update_all_weighted_averages name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step), :association => weighting_association(step)
      end
    end

    # Calls the class-level callback for this attribute, passing as many
    # arguments as its arity asks for (none, the name, or name plus options).
    # Callbacks with any other arity are not invoked.
    def perform_callback(step)
      case klass.method(callback(step)).arity
      when 0
        klass.send(callback(step))
      when 1
        klass.send(callback(step), name)
      when 2
        klass.send(callback(step), name, options_for_step[step])
      end
    end

    # Reads the unit name from the row column named by <tt>:unit_in_source</tt>.
    def unit_from_source(step, row)
      row[unit_in_source(step)].to_s.strip.underscore.to_sym
    end

    # Converts +value+ (as a float) from the <tt>:from</tt> unit, or the unit
    # found in the row, to the <tt>:to</tt> unit.
    def do_convert(step, row, value)
      from_unit = from(step) || unit_from_source(step, row)
      value.to_f.convert(from_unit, to(step))
    end

    # Formats +value+ with the step's <tt>:sprintf</tt> pattern, coercing it to
    # a float or integer first when the pattern contains a %f or %d directive.
    def do_sprintf(step, value)
      if /\%[0-9\.]*f/.match(sprintf(step))
        value = value.to_f
      elsif /\%[0-9\.]*d/.match(sprintf(step))
        value = value.to_i
      end
      sprintf(step) % value
    end

    # Splits +value+ and keeps one piece, per the step's <tt>:split</tt> options.
    def do_split(step, value)
      pattern = split_options(step)[:pattern] || /\s+/ # default is split on whitespace
      keep = split_options(step)[:keep] || 0 # default is keep first element
      value.to_s.split(pattern)[keep].to_s
    end

    def column_type
      @column_type ||= klass.columns_hash[name.to_s].type
    end

    # Generates the <tt>wants_<feature>?(step)</tt> predicates. A Symbol
    # condition means "the option reader of that name returns non-nil"; a
    # String condition is used verbatim as the method body.
    {
      :static => 'options_for_step[step].has_key?(:static)',
      :prefix => :prefix,
      :create => 'create(step) != false',
      :keep => :keep,
      :upcase => :upcase,
      :conversion => '!from(step).nil? or !unit_in_source(step).nil?',
      :sprintf => :sprintf,
      :dictionary => :dictionary_options,
      :split => :split_options,
      :update_all => :set,
      :nullification => 'nullify(step) != false',
      :overwriting => 'overwrite(step) != false',
      :weighted_average => '!weighting_association(step).nil? or !weighting_column(step).nil?'
    }.each do |feature, condition|
      condition = "!#{condition}(step).nil?" if condition.is_a?(Symbol)
      class_eval <<-EOS, __FILE__, __LINE__ + 1
        def wants_#{feature}?(step)
          #{condition}
        end
      EOS
    end

    # Generates option readers that fall back to a default expression when
    # the step did not set the option.
    {
      :name_in_source => { :default => :name, :stringify => true },
      :key => { :default => :name, :stringify => true },
      :foreign_key => { :default => 'key(step)', :stringify => true },
      :delimiter => { :default => '", "' }
    }.each do |option, settings|
      class_eval <<-EOS, __FILE__, __LINE__ + 1
        def #{option}(step)
          (options_for_step[step][:#{option}] || #{settings[:default]})#{'.to_s' if settings[:stringify]}
        end
      EOS
    end

    # The association reflection named like this attribute, or nil when
    # +klass+ has none. A miss is remembered as :missing so the lookup is
    # made only once.
    def reflection
      @_reflection = klass.reflect_on_association(name) || :missing if @_reflection.nil?
      @_reflection == :missing ? nil : @_reflection
    end

    # Class on the other side of the association; for polymorphic
    # associations it is taken from the step's <tt>:polymorphic_type</tt> option.
    def reflection_klass(step)
      return nil unless reflection
      if reflection.options[:polymorphic]
        polymorphic_type(step).constantize
      else
        reflection.klass
      end
    end

    def wants_inline_association?
      !reflection.nil?
    end

    # Name of the class-level callback: the <tt>:callback</tt> option, or
    # <tt>derive_<name></tt> by default.
    def callback(step)
      (options_for_step[step][:callback] || "derive_#{name}").to_sym
    end

    # Dictionary for +step+, built on first use and cached per step.
    def dictionary(step)
      raise "shouldn't ask for this" unless wants_dictionary?(step) # don't try to initialize if there are no dictionary options
      @dictionaries ||= {}
      @dictionaries[step] ||= Dictionary.new(dictionary_options(step))
    end

    # <tt>dictionary_options(step)</tt> and <tt>split_options(step)</tt>.
    %w(dictionary split).each do |option|
      class_eval <<-EOS, __FILE__, __LINE__ + 1
        def #{option}_options(step)
          options_for_step[step][:#{option}]
        end
      EOS
    end

    # Plain option readers: each returns the step's option of the same name.
    # Note that this defines an instance-level <tt>sprintf(step)</tt>.
    %w(from to set conditions weighting_association weighting_column weighting_disaggregator sprintf nullify overwrite upcase prefix unit_in_source field_number keep create static polymorphic_type).each do |option|
      class_eval <<-EOS, __FILE__, __LINE__ + 1
        def #{option}(step)
          options_for_step[step][:#{option}]
        end
      EOS
    end
  end
end
@@ -0,0 +1,51 @@
1
module DataMiner
  # Registry of the Attribute objects belonging to one class, keyed by
  # attribute name (always stored as a Symbol).
  class AttributeCollection
    attr_accessor :klass, :attributes

    def initialize(klass)
      @klass = klass
      @attributes = {}
    end

    # Marks +attr_name+ as a key for +step+, creating the Attribute if needed.
    def key!(step, attr_name, attr_options = {})
      find_or_initialize(attr_name).key_for!(step, attr_options)
    end

    # Marks +attr_name+ as written by +step+, creating the Attribute if needed.
    def affect!(step, attr_name, attr_options = {})
      find_or_initialize(attr_name).affected_by!(step, attr_options)
    end

    # Marks every content column of the step's class as written by +step+,
    # skipping any names listed under <tt>options[:except]</tt>.
    def affect_all_content_columns!(step, options = {})
      except = Array.wrap(options[:except]).map(&:to_sym)
      step.klass.content_columns.map(&:name).reject { |content_column| except.include?(content_column.to_sym) }.each do |content_column|
        find_or_initialize(content_column).affected_by!(step)
      end
    end

    def all_affected_by(step)
      attributes.values.select { |attr| attr.affected_by?(step) }
    end

    def all_keys_for(step)
      attributes.values.select { |attr| attr.key_for?(step) }
    end

    # Attributes that +step+ writes or uses as keys, without duplicates.
    def all_for(step)
      (all_affected_by(step) + all_keys_for(step)).uniq
    end

    def has_keys_for?(step)
      attributes.values.any? { |attr| attr.key_for?(step) }
    end

    # True when at least one attribute written by +step+ does not want
    # overwriting.
    def has_conditional_writes_for?(step)
      all_affected_by(step).any? { |attr| !attr.wants_overwriting?(step) }
    end

    private

    # Names are normalized to Symbols so that 'name' and :name resolve to the
    # same Attribute (column names arrive as Strings, DSL names as Symbols).
    def find_or_initialize(attr_name)
      self.attributes[attr_name.to_sym] ||= Attribute.new(klass, attr_name)
    end
  end
end
@@ -0,0 +1,77 @@
1
module DataMiner
  # Holds the ordered mining steps declared for one class and, at the class
  # level, the queue of classes to mine.
  class Configuration
    attr_accessor :steps, :klass, :counter, :attributes, :awaiting

    def initialize(klass)
      @klass = klass
      @attributes = AttributeCollection.new(klass)
      @steps = []
      @counter = 0
    end

    # Defines the step-declaring DSL methods +import+, +associate+, +derive+
    # and +await+. Each bumps the step counter and appends a new step of the
    # matching Step subclass. Three call forms are accepted:
    #
    # * FORM C: a block (optionally preceded by a step options hash)
    # * FORM A: just a step options hash
    # * FORM B: an attribute name plus optional attribute options, which is
    #   turned into a block that affects that one attribute
    %w(import associate derive await).each do |method|
      step_class_name = method.camelcase
      class_eval <<-EOS, __FILE__, __LINE__ + 1
        def #{method}(*args, &block)
          self.counter += 1
          if block_given? # FORM C
            step_options = args[0] || {}
            builder = block
          elsif args[0].is_a?(Hash) # FORM A
            step_options = args[0]
            builder = nil
          else # FORM B
            attr_name = args[0]
            attr_options = args[1] || {}
            step_options = {}
            builder = Proc.new { |attr| attr.affect attr_name, attr_options }
          end
          set_awaiting!(step_options)
          self.steps << Step::#{step_class_name}.new(self, counter, step_options, &builder)
        end
      EOS
    end

    # Tags +step_options+ with the step currently being awaited, if any.
    def set_awaiting!(step_options)
      step_options.merge!(:awaiting => awaiting) unless awaiting.nil?
    end

    # Starts tagging newly declared steps as awaiting +step+.
    def awaiting!(step)
      self.awaiting = step
    end

    # Stops tagging newly declared steps.
    def stop_awaiting!
      self.awaiting = nil
    end

    # Mine data for this class by performing every step in order.
    def mine(options = {})
      steps.each { |step| step.perform(options) }
    end

    cattr_accessor :classes
    self.classes = []

    class << self
      # Mine data. Defaults to all classes touched by DataMiner.
      #
      # Options
      # * <tt>:class_names</tt>: provide an array of class names to mine
      def mine(options = {})
        wanted = options[:class_names]
        classes.each do |klass|
          next unless wanted.blank? || wanted.include?(klass.name)
          klass.data_mine.mine(options)
        end
      end

      # Queue up all the ActiveRecord classes that DataMiner should touch.
      #
      # Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
      def enqueue(&block)
        yield classes
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module DataMiner
  # Translates values through a remote lookup table: finds the row whose
  # +key_name+ column matches a key and returns that row's +value_name+ column.
  class Dictionary
    attr_accessor :key_name, :value_name, :sprintf, :table

    # Options read here: <tt>:key</tt>, <tt>:returns</tt>, <tt>:sprintf</tt>
    # (defaults to '%s') and <tt>:url</tt> (handed to RemoteTable).
    def initialize(options = {})
      @key_name = options[:key]
      @value_name = options[:returns]
      @sprintf = options[:sprintf] || '%s'
      @table = RemoteTable.new(:url => options[:url])
    end

    # Looks +key+ up using this dictionary's configured columns and format.
    # Returns an Array of values, or nil when no row matches.
    def lookup(key)
      find(key_name, key, value_name, :sprintf => self.sprintf)
    end

    # Returns the +value_name+ column of the first row whose +key_name+ column
    # equals +key+ after normalization, split on semicolons; nil if no row
    # matches. <tt>options[:sprintf]</tt>, when given, is the format applied
    # to both sides before comparing.
    def find(key_name, key, value_name, options = {})
      wanted = normalize_for_comparison(key, options) # computed once, not per row
      match = table.rows.detect { |row| normalize_for_comparison(row[key_name], options) == wanted }
      # NOTE(review): the separator requires one whitespace character after
      # the semicolon, so "a;b" is not split -- confirm this is intended.
      match[value_name].to_s.split(/\s*;\s/) if match
    end

    private

    # Formats +string+ with <tt>options[:sprintf]</tt> (coercing to float or
    # integer when the format contains a %f or %d directive) and strips
    # surrounding whitespace. The format comes from +options+, not from the
    # instance, so +find+ honours a format passed by its caller.
    def normalize_for_comparison(string, options = {})
      pattern = options[:sprintf]
      if pattern
        if /\%[0-9\.]*f/.match(pattern)
          string = string.to_f
        elsif /\%[0-9\.]*d/.match(pattern)
          string = string.to_i
        end
        string = pattern % string
      end
      string.to_s.strip
    end
  end
end
@@ -0,0 +1,60 @@
1
module DataMiner
  # Base class for one numbered unit of work in a class's mining
  # configuration. +klass+ and +attributes+ are read from the configuration.
  class Step
    attr_accessor :configuration, :number, :options
    delegate :klass, :attributes, :to => :configuration

    # Stores the arguments, yields the step to the block (which registers
    # attributes through +key+ / +affect+ / +store+), and then applies the
    # step-wide options <tt>:affect_all</tt>, <tt>:callback</tt> and
    # <tt>:headers</tt> to the registered attributes.
    def initialize(configuration, number, options = {}, &block)
      @configuration, @number, @options = configuration, number, options
      yield self if block_given? # pull in attributes
      if options[:affect_all] == :content_columns
        attributes.affect_all_content_columns!(self, :except => options[:except])
      end
      if options[:callback]
        affected_attributes.each { |attr| attr.options_for_step[self][:callback] = options[:callback] }
      end
      if options[:headers] == :upcase # TODO remove
        all_attributes.each { |attr| attr.options_for_step[self][:name_in_source] = attr.name_in_source(self).upcase }
      end
    end

    # The step kind as a Symbol, derived from the class name.
    def variant
      short_name = self.class.name.demodulize
      short_name.underscore.to_sym
    end

    # True when this step was declared with an <tt>:awaiting</tt> option.
    def awaiting?
      !options[:awaiting].nil?
    end

    def inspect
      "Step(#{klass} #{variant.to_s.camelcase} #{number})"
    end

    def signature
      "#{klass} step #{number}: #{variant}"
    end

    # Performs the step on each affected attribute. Steps that are awaiting
    # another step do nothing unless <tt>options[:force]</tt> is set.
    def perform(options = {})
      deferred = awaiting? && !options[:force]
      return if deferred
      affected_attributes.each { |attr| attr.perform(self) }
      $stderr.puts "performed #{signature}"
    end

    # Attributes written by this step (memoized on first call).
    def affected_attributes
      @affected_attributes ||= attributes.all_affected_by(self)
    end

    # Attributes used as keys by this step (memoized on first call).
    def key_attributes
      @key_attributes ||= attributes.all_keys_for(self)
    end

    # Affected and key attributes together (memoized on first call).
    def all_attributes
      @all_attributes ||= attributes.all_for(self)
    end

    # DSL: registers +attr_name+ as a key for this step.
    def key(attr_name, attr_options = {})
      attributes.key!(self, attr_name, attr_options)
    end

    # DSL: registers +attr_name+ as written by this step. Also available
    # as +store+.
    def affect(attr_name, attr_options = {})
      attributes.affect!(self, attr_name, attr_options)
    end
    alias_method :store, :affect
  end
end
@@ -0,0 +1,9 @@
1
module DataMiner
  class Step
    # Step variant for filling in an association.
    class Associate < Step
      # The base signature followed by the name of the first affected attribute.
      def signature
        inherited = super
        subject = affected_attributes.first.name
        "#{inherited} #{subject}"
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module DataMiner
  class Step
    # Groups steps that must wait until another class has been mined. Steps
    # declared inside the block are tagged as awaiting this step; performing
    # this step appends a Callback to the other class's steps, and that
    # callback later forces the tagged steps to run.
    class Await < Step
      attr_accessor :other_class

      # <tt>options[:other_class]</tt> (removed from +options+) is the class
      # to wait for. The block receives the configuration so it can declare
      # the deferred steps.
      def initialize(configuration, number, options = {}, &block)
        # doesn't call super
        @configuration = configuration
        @number = number
        @options = options
        @other_class = options.delete :other_class
        configuration.awaiting! self
        begin
          yield configuration # pull in steps
        ensure
          # always leave awaiting mode, even if the block raised, so later
          # steps are not tagged as awaiting this step by mistake
          configuration.stop_awaiting!
        end
      end

      # Registers a Callback for this step at the end of the other class's steps.
      def perform(*args)
        other_class.data_mine.steps << Step::Callback.new(other_class.data_mine, self)
        $stderr.puts "added #{signature} to callbacks after #{other_class}"
      end

      # Forces every step that is awaiting this one to run.
      def callback
        $stderr.puts "starting to perform deferred steps in #{signature}..."
        all_awaiting.each { |step| step.perform :force => true }
        $stderr.puts "...done"
      end

      private

      # Steps of this configuration whose <tt>:awaiting</tt> option is this step.
      def all_awaiting
        configuration.steps.select { |step| step.options and step.options[:awaiting] == self }
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module DataMiner
  class Step
    # Step appended to a configuration on behalf of a step in another
    # configuration; performing it invokes that foreign step's +callback+.
    class Callback < Step
      attr_accessor :foreign_step

      # Does not call super; the step number is the fixed label "(last)".
      def initialize(configuration, foreign_step)
        @number = "(last)"
        @foreign_step = foreign_step
        @configuration = configuration
      end

      # Runs the foreign step's callback and logs to stderr. Arguments are ignored.
      def perform(*args)
        foreign_step.callback
        $stderr.puts("performed " + signature)
      end

      # The base signature plus the signature of the step being served.
      def signature
        own = super
        served = foreign_step.signature
        "#{own} (on behalf of #{served})"
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module DataMiner
  class Step
    # Step variant for deriving an attribute's value.
    class Derive < Step
      # The base signature followed by the name of the first affected attribute.
      def signature
        inherited = super
        subject = affected_attributes.first.name
        "#{inherited} #{subject}"
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
module DataMiner
  class Step
    # Step that reads rows from a RemoteTable and writes them into records of
    # +klass+, optionally filtering and correcting each row through an Errata.
    class Import < Step
      attr_accessor :table, :errata

      # Besides the base Step setup, builds the Errata (only when
      # <tt>options[:errata]</tt> is given) and the RemoteTable from the
      # table-related options.
      def initialize(configuration, number, options = {}, &block)
        super
        @errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
        @table = RemoteTable.new(options.slice(:url, :filename, :post_data, :format, :skip, :cut, :schema, :schema_name, :trap, :select, :reject, :sheet, :delimiter, :headers, :transform, :crop))
      end

      def signature
        "#{super} #{options[:url]}"
      end

      # Imports every row of the table: rows rejected by the errata are
      # skipped, the rest are corrected and written to the matching records,
      # which are then saved. Arguments are ignored.
      def perform(*args)
        ActiveRecord::Base.connection.execute("TRUNCATE #{klass.quoted_table_name}") if wants_truncate?
        table.each_row do |row|
          if errata
            next if errata.rejects?(row)
            errata.correct!(row)
          end
          records_for(row).each do |record|
            affected_attributes.each { |attr| attr.set_record_from_row(self, record, row) }
            record.save
          end
        end
        $stderr.puts "performed #{signature}"
      end

      # Truncate when explicitly asked to, or by default whenever the import
      # does not build on existing data (unless <tt>:truncate</tt> is false).
      def wants_truncate?
        options[:truncate] == true or (!(options[:truncate] == false) and !uses_existing_data?)
      end

      def wants_nil_keys?
        options[:allow_nil_keys] == true
      end

      # True when the step has key attributes or conditional writes, i.e. it
      # must look up existing records instead of creating blank ones. The
      # whole answer is memoized; the previous <tt>@x ||= a or b</tt> form
      # parsed as <tt>(@x ||= a) or b</tt> and cached only the first half.
      def uses_existing_data?
        return @uses_existing_data if defined?(@uses_existing_data)
        @uses_existing_data = attributes.has_keys_for?(self) || attributes.has_conditional_writes_for?(self)
      end

      # Name of the dynamic finder built from the key attribute names.
      def dynamic_finder_name
        "find_or_initialize_by_#{key_attributes.map(&:name).join('_and_')}".to_sym
      end

      private

      # Records that +row+ should be written to: one new record when the step
      # does not use existing data, otherwise one found-or-initialized record
      # per combination of key values. Combinations containing a nil key are
      # dropped (unless nil keys are allowed) rather than left as nil entries,
      # which used to crash when attributes were assigned.
      def records_for(row)
        return [klass.new] unless uses_existing_data?
        key_values = key_attributes.map { |key_attr| [key_attr.value_from_row(self, row)] }
        WilliamJamesCartesianProduct.cart_prod(*key_values).map do |combination|
          next if combination.include?(nil) and !wants_nil_keys?
          klass.send(dynamic_finder_name, *combination)
        end.flatten.compact
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # http://www.ruby-forum.com/topic/95519#200484
2
+
3
# Cartesian product helper.
module WilliamJamesCartesianProduct
  # Returns the Cartesian product of the given lists as an Array of Arrays.
  # Combinations are ordered so that the first list varies fastest. With no
  # arguments the result is <tt>[[]]</tt>; if any list is empty it is <tt>[]</tt>.
  #
  #   cart_prod([1, 2], [3, 4]) # => [[1, 3], [2, 3], [1, 4], [2, 4]]
  def self.cart_prod(*args)
    args.inject([[]]) do |combinations, list|
      extended = []
      # append in place instead of rebuilding the accumulator per element
      list.each do |element|
        extended.concat(combinations.map { |combination| combination.dup << element })
      end
      extended
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'test_helper'
2
+
3
# Schema for the two test models. :force => true recreates each table
# every time this file is loaded.
ActiveRecord::Schema.define(:version => 20090819143429) do
  create_table("airports", :force => true) do |airports|
    airports.string("iata_code")
    airports.string("name")
    airports.string("city")
    airports.integer("country_id")
    airports.float("latitude")
    airports.float("longitude")
    airports.datetime("created_at")
    airports.datetime("updated_at")
  end

  create_table("countries", :force => true) do |countries|
    countries.string("iso_3166")
    countries.string("name")
    countries.datetime("created_at")
    countries.datetime("updated_at")
  end
end
21
+
22
# Test model whose rows are mined from a remote ISO 3166 CSV file.
class Country < ActiveRecord::Base
  mine_data do |miner|
    # Import country codes and names. The 'country code' column is declared
    # both as the key and as a stored attribute.
    miner.import(:url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv') do |column|
      column.key(:iso_3166, :name_in_source => 'country code')
      column.store(:iso_3166, :name_in_source => 'country code')
      column.store(:name, :name_in_source => 'country')
    end
  end
end
32
+
33
# Test model mined from a header-less airports data file, so columns are
# addressed by position (:field_number) rather than by name.
class Airport < ActiveRecord::Base
  belongs_to :country

  mine_data do |miner|
    # Import airport iata_code, name, etc.
    airports_url = 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat'
    miner.import(:url => airports_url, :headers => false) do |column|
      column.key(:iata_code, :field_number => 3)
      column.store(:name, :field_number => 0)
      column.store(:city, :field_number => 1)
      column.store(:country, :field_number => 2, :foreign_key => :name) # will use Country.find_by_name(X)
      column.store(:iata_code, :field_number => 3)
      column.store(:latitude, :field_number => 5)
      column.store(:longitude, :field_number => 6)
    end
  end
end
48
+
49
# Add both models to the queue yielded by DataMiner.enqueue, Country first.
DataMiner.enqueue do |mining_queue|
  mining_queue << Country
  mining_queue << Airport
end
53
+
54
# Tests that mining Country alone fills the countries table and leaves the
# airports table empty.
class DataMinerTest < Test::Unit::TestCase
  # Empties both tables after every test.
  def teardown
    [Airport, Country].each { |model| model.delete_all }
  end

  should "mine a single class" do
    Country.data_mine.mine
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal 'Uruguay', uruguay.name
    assert_equal 0, Airport.count
  end

  should "mine a single class using the API" do
    DataMiner.mine(:class_names => ['Country'])
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal 'Uruguay', uruguay.name
    assert_equal 0, Airport.count
  end

  # should "mine all classes" do
  #   DataMiner.mine
  #   uy = Country.find_by_iso_3166('UY')
  #   assert_equal 'Uruguay', uy.name
  #   assert_equal uy, Airport.find_by_iata_code('MVD').country
  # end
end
@@ -0,0 +1,16 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'sqlite3'
5
+
6
# Put test/ and lib/ at the front of the load path (test/ first), then load
# the library under test.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(test_dir, File.join(test_dir, '..', 'lib'))
require 'data_miner'

# The tests use an SQLite database file; the path is relative to the
# working directory.
ActiveRecord::Base.establish_connection('adapter' => 'sqlite3', 'database' => 'test/test.sqlite3')

# Empty reopening of the base test case; presumably a placeholder for shared helpers.
class Test::Unit::TestCase
end
metadata ADDED
@@ -0,0 +1,119 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: seamusabshere-data_miner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Seamus Abshere
8
+ - Andy Rossmeissl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-08-19 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: activerecord
18
+ type: :runtime
19
+ version_requirement:
20
+ version_requirements: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ version:
26
+ - !ruby/object:Gem::Dependency
27
+ name: activesupport
28
+ type: :runtime
29
+ version_requirement:
30
+ version_requirements: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ version:
36
+ - !ruby/object:Gem::Dependency
37
+ name: seamusabshere-remote_table
38
+ type: :runtime
39
+ version_requirement:
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ - !ruby/object:Gem::Dependency
47
+ name: seamusabshere-errata
48
+ type: :runtime
49
+ version_requirement:
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ description: Mine remote data into your ActiveRecord models.
57
+ email: seamus@abshere.net
58
+ executables: []
59
+
60
+ extensions: []
61
+
62
+ extra_rdoc_files:
63
+ - LICENSE
64
+ - README.rdoc
65
+ files:
66
+ - .document
67
+ - .gitignore
68
+ - LICENSE
69
+ - README.rdoc
70
+ - Rakefile
71
+ - VERSION
72
+ - data_miner.gemspec
73
+ - lib/data_miner.rb
74
+ - lib/data_miner/active_record_ext.rb
75
+ - lib/data_miner/attribute.rb
76
+ - lib/data_miner/attribute_collection.rb
77
+ - lib/data_miner/configuration.rb
78
+ - lib/data_miner/dictionary.rb
79
+ - lib/data_miner/step.rb
80
+ - lib/data_miner/step/associate.rb
81
+ - lib/data_miner/step/await.rb
82
+ - lib/data_miner/step/callback.rb
83
+ - lib/data_miner/step/derive.rb
84
+ - lib/data_miner/step/import.rb
85
+ - lib/data_miner/william_james_cartesian_product.rb
86
+ - test/data_miner_test.rb
87
+ - test/test_helper.rb
88
+ has_rdoc: false
89
+ homepage: http://github.com/seamusabshere/data_miner
90
+ licenses:
91
+ post_install_message:
92
+ rdoc_options:
93
+ - --charset=UTF-8
94
+ - --line-numbers
95
+ - --inline-source
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: "0"
103
+ version:
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: "0"
109
+ version:
110
+ requirements: []
111
+
112
+ rubyforge_project:
113
+ rubygems_version: 1.3.5
114
+ signing_key:
115
+ specification_version: 3
116
+ summary: Mine remote data into your ActiveRecord models.
117
+ test_files:
118
+ - test/data_miner_test.rb
119
+ - test/test_helper.rb