data_miner 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ test/test.sqlite3
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Brighter Planet
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,96 @@
1
+ =data_miner
2
+
3
+ Mine remote data into your ActiveRecord models.
4
+
5
+ ==Quick start
6
+
7
+ Put this in <tt>config/environment.rb</tt>:
8
+
9
+ config.gem 'seamusabshere-data_miner', :lib => 'data_miner', :source => 'http://gems.github.com'
10
+
11
+ You need to define <tt>mine_data</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
12
+
13
+ class Country < ActiveRecord::Base
14
+ mine_data do |step|
15
+ # import country names and country codes
16
+ step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
17
+ attr.key :iso_3166, :name_in_source => 'country code'
18
+ attr.store :iso_3166, :name_in_source => 'country code'
19
+ attr.store :name, :name_in_source => 'country'
20
+ end
21
+ end
22
+ end
23
+
24
+ ...and in <tt>app/models/airport.rb</tt>:
25
+
26
+ class Airport < ActiveRecord::Base
27
+ belongs_to :country
28
+
29
+ mine_data do |step|
30
+ # import airport iata_code, name, etc.
31
+ step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
32
+ attr.key :iata_code, :field_number => 3
33
+ attr.store :name, :field_number => 0
34
+ attr.store :city, :field_number => 1
35
+ attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
36
+ attr.store :iata_code, :field_number => 3
37
+ attr.store :latitude, :field_number => 5
38
+ attr.store :longitude, :field_number => 6
39
+ end
40
+ end
41
+ end
42
+
43
+ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't know a way to automatically include gem tasks, so you have to do this manually for now)
44
+
45
+ namespace :data_miner do
46
+ task :mine => :environment do
47
+ DataMiner.mine :class_names => ENV['CLASSES'].to_s.split(/\s*,\s*/).flatten.compact
48
+ end
49
+
50
+ task :map_to_attrs => :environment do
51
+ DataMiner.map_to_attrs ENV['METHOD'], :class_names => ENV['CLASSES'].to_s.split(/\s*,\s*/).flatten.compact
52
+ end
53
+ end
54
+
55
+ You need to specify what order to mine data. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
56
+
57
+ DataMiner.enqueue do |queue|
58
+ queue << Country # class whose data should be mined 1st
59
+ queue << Airport # class whose data should be mined 2nd
60
+ # etc
61
+ end
62
+
63
+ Once you have (1) set up the order of data mining and (2) defined <tt>mine_data</tt> blocks in your classes, you can:
64
+
65
+ $ rake data_miner:mine
66
+
67
+ ==Complete example
68
+
69
+ ~ $ rails testapp
70
+ ~ $ cd testapp/
71
+ ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_id:integer latitude:float longitude:float
72
+ ~/testapp $ ./script/generate model Country iso_3166:string name:string
73
+ ~/testapp $ rake db:migrate
74
+ ~/testapp $ touch lib/tasks/data_miner_tasks.rake
75
+ [...edit per quick start...]
76
+ ~/testapp $ touch config/initializers/data_miner_config.rb
77
+ [...edit per quick start...]
78
+ ~/testapp $ rake data_miner:mine
79
+
80
+ Now you should have
81
+
82
+ ~/testapp $ ./script/console
83
+ Loading development environment (Rails 2.3.3)
84
+ >> Airport.first.iata_code
85
+ => "GKA"
86
+ >> Airport.first.country.name
87
+ => "Papua New Guinea"
88
+
89
+ ==Authors
90
+
91
+ * Seamus Abshere <seamus@abshere.net>
92
+ * Andy Rossmeissl <andy@rossmeissl.net>
93
+
94
+ ==Copyright
95
+
96
+ Copyright (c) 2009 Brighter Planet. See LICENSE for details.
@@ -0,0 +1,65 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "data_miner"
8
+ gem.summary = %Q{Mine remote data into your ActiveRecord models.}
9
+ gem.description = %Q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
10
+ gem.email = "seamus@abshere.net"
11
+ gem.homepage = "http://github.com/seamusabshere/data_miner"
12
+ gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
13
+ %w{ activerecord activesupport andand remote_table seamusabshere-errata seamusabshere-conversions }.each { |name| gem.add_dependency name }
14
+ gem.require_path = "lib"
15
+ gem.files.include %w(lib/data_miner) unless gem.files.empty? # seems to fail once it's in the wild
16
+ gem.rdoc_options << '--line-numbers' << '--inline-source'
17
+ # gem.rubyforge_project = "dataminer"
18
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
19
+ end
20
+ Jeweler::GemcutterTasks.new
21
+ Jeweler::RubyforgeTasks.new do |rubyforge|
22
+ rubyforge.doc_task = "rdoc"
23
+ end
24
+ rescue LoadError
25
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
26
+ end
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new(:test) do |test|
30
+ test.libs << 'lib' << 'test'
31
+ test.pattern = 'test/**/*_test.rb'
32
+ test.verbose = true
33
+ end
34
+
35
+ begin
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/*_test.rb'
40
+ test.verbose = true
41
+ end
42
+ rescue LoadError
43
+ task :rcov do
44
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
45
+ end
46
+ end
47
+
48
+
49
+
50
+
51
+ task :default => :test
52
+
53
+ require 'rake/rdoctask'
54
+ Rake::RDocTask.new do |rdoc|
55
+ if File.exist?('VERSION')
56
+ version = File.read('VERSION')
57
+ else
58
+ version = ""
59
+ end
60
+
61
+ rdoc.rdoc_dir = 'rdoc'
62
+ rdoc.title = "data_miner #{version}"
63
+ rdoc.rdoc_files.include('README*')
64
+ rdoc.rdoc_files.include('lib/**/*.rb')
65
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.2
@@ -0,0 +1,81 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{data_miner}
8
+ s.version = "0.2.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
+ s.date = %q{2009-10-30}
13
+ s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
14
+ s.email = %q{seamus@abshere.net}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "data_miner.gemspec",
27
+ "lib/data_miner.rb",
28
+ "lib/data_miner/active_record_ext.rb",
29
+ "lib/data_miner/attribute.rb",
30
+ "lib/data_miner/attribute_collection.rb",
31
+ "lib/data_miner/configuration.rb",
32
+ "lib/data_miner/dictionary.rb",
33
+ "lib/data_miner/step.rb",
34
+ "lib/data_miner/step/associate.rb",
35
+ "lib/data_miner/step/await.rb",
36
+ "lib/data_miner/step/callback.rb",
37
+ "lib/data_miner/step/derive.rb",
38
+ "lib/data_miner/step/import.rb",
39
+ "lib/data_miner/william_james_cartesian_product.rb",
40
+ "test/data_miner_test.rb",
41
+ "test/test_helper.rb"
42
+ ]
43
+ s.homepage = %q{http://github.com/seamusabshere/data_miner}
44
+ s.rdoc_options = ["--charset=UTF-8", "--line-numbers", "--inline-source"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = %q{1.3.5}
47
+ s.summary = %q{Mine remote data into your ActiveRecord models.}
48
+ s.test_files = [
49
+ "test/data_miner_test.rb",
50
+ "test/test_helper.rb"
51
+ ]
52
+
53
+ if s.respond_to? :specification_version then
54
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
55
+ s.specification_version = 3
56
+
57
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
58
+ s.add_runtime_dependency(%q<activerecord>, [">= 0"])
59
+ s.add_runtime_dependency(%q<activesupport>, [">= 0"])
60
+ s.add_runtime_dependency(%q<andand>, [">= 0"])
61
+ s.add_runtime_dependency(%q<remote_table>, [">= 0"])
62
+ s.add_runtime_dependency(%q<seamusabshere-errata>, [">= 0"])
63
+ s.add_runtime_dependency(%q<seamusabshere-conversions>, [">= 0"])
64
+ else
65
+ s.add_dependency(%q<activerecord>, [">= 0"])
66
+ s.add_dependency(%q<activesupport>, [">= 0"])
67
+ s.add_dependency(%q<andand>, [">= 0"])
68
+ s.add_dependency(%q<remote_table>, [">= 0"])
69
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
70
+ s.add_dependency(%q<seamusabshere-conversions>, [">= 0"])
71
+ end
72
+ else
73
+ s.add_dependency(%q<activerecord>, [">= 0"])
74
+ s.add_dependency(%q<activesupport>, [">= 0"])
75
+ s.add_dependency(%q<andand>, [">= 0"])
76
+ s.add_dependency(%q<remote_table>, [">= 0"])
77
+ s.add_dependency(%q<seamusabshere-errata>, [">= 0"])
78
+ s.add_dependency(%q<seamusabshere-conversions>, [">= 0"])
79
+ end
80
+ end
81
+
@@ -0,0 +1,43 @@
1
+ require 'rubygems'
2
+ require 'activesupport'
3
+ require 'activerecord'
4
+ require 'conversions'
5
+ require 'remote_table'
6
+ require 'errata'
7
+
8
+ require 'data_miner/active_record_ext'
9
+ require 'data_miner/attribute'
10
+ require 'data_miner/attribute_collection'
11
+ require 'data_miner/configuration'
12
+ require 'data_miner/dictionary'
13
+ require 'data_miner/step'
14
+ require 'data_miner/step/associate'
15
+ require 'data_miner/step/await'
16
+ require 'data_miner/step/callback'
17
+ require 'data_miner/step/derive'
18
+ require 'data_miner/step/import'
19
+ require 'data_miner/william_james_cartesian_product' # TODO: move to gem
20
+
21
+ module DataMiner
22
+ class << self
23
+ def mine(options = {})
24
+ DataMiner::Configuration.mine options
25
+ end
26
+
27
+ def map_to_attrs(method, options = {})
28
+ puts DataMiner::Configuration.map_to_attrs(method, options)
29
+ end
30
+
31
+ def enqueue(&block)
32
+ DataMiner::Configuration.enqueue &block
33
+ end
34
+
35
+ def classes
36
+ DataMiner::Configuration.classes
37
+ end
38
+ end
39
+ end
40
+
41
+ ActiveRecord::Base.class_eval do
42
+ include DataMiner::ActiveRecordExt
43
+ end
@@ -0,0 +1,25 @@
1
+ module DataMiner
2
+ module ActiveRecordExt
3
+ def self.included(klass)
4
+ klass.extend(ClassMethods)
5
+ end
6
+
7
+ module ClassMethods
8
+ def mine_data(options = {}, &block)
9
+ if defined?(NO_DATA_MINER) and NO_DATA_MINER == true
10
+ class_eval do
11
+ class << self
12
+ def data_mine
13
+ raise "NO_DATA_MINER is set to true, so data_mine is not available"
14
+ end
15
+ end
16
+ end
17
+ else
18
+ class_eval { cattr_accessor :data_mine }
19
+ self.data_mine = Configuration.new(self)
20
+ yield data_mine
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,299 @@
1
+ module DataMiner
2
+ class Attribute
3
+ attr_accessor :klass, :name, :options_for_step, :affected_by_steps, :key_for_steps
4
+
5
+ def initialize(klass, name)
6
+ @klass = klass
7
+ @name = name.to_sym
8
+ @options_for_step = {}
9
+ @affected_by_steps = []
10
+ @key_for_steps = []
11
+ end
12
+
13
+ # polling questions
14
+ def report_find_or_create(step)
15
+ "Creates parents: #{klass}##{name} is set with #{reflection_klass(step)}.find_or_create_by_#{foreign_key(step)}" if wants_create?(step)
16
+ end
17
+
18
+ def report_unnatural_order(step)
19
+ if (
20
+ (rk = klass.reflect_on_association(weighting_association(step)).andand.klass) or
21
+ (wants_inline_association? and rk = reflection_klass(step))
22
+ ) and
23
+ step.configuration.classes.index(rk) > step.configuration.classes.index(klass) and
24
+ step.options[:awaiting].andand.klass != klass
25
+ "Unnatural order: #{klass} comes before #{rk}"
26
+ end
27
+ end
28
+
29
+ def inspect
30
+ "Attribute(#{klass}.#{name})"
31
+ end
32
+
33
+ def affected_by!(step, options = {})
34
+ self.options_for_step[step] = options
35
+ self.affected_by_steps << step
36
+ end
37
+
38
+ def affected_by?(step)
39
+ affected_by_steps.include?(step)
40
+ end
41
+
42
+ def key_for!(step, options = {})
43
+ self.options_for_step[step] = options
44
+ self.key_for_steps << step
45
+ end
46
+
47
+ def key_for?(step)
48
+ key_for_steps.include?(step)
49
+ end
50
+
# Looks up +key+ in the dictionary configured for +step+.
#
# Returns nil when the lookup yields nothing, the bare value when it yields
# exactly one element, and the whole array when it yields several.
#
# The unwrapping is done explicitly: <tt>return *array</tt> only stripped the
# array wrapper from a single-element array on Ruby 1.8; on later Rubies it
# returns the array itself.
def value_in_dictionary(step, key)
  values = Array(dictionary(step).lookup(key))
  values.size > 1 ? values : values.first
end
54
+
# Reads this attribute's raw value for +step+ out of +row+ and cleans it up.
#
# The value comes from, in order of precedence: the step's static value, the
# configured field number (a Range of field numbers is joined with the
# delimiter), or the column named by name_in_source.
#
# Returns nil if the source value is nil, and returns ActiveRecord objects
# untouched. Anything else is stringified, optionally sliced and split,
# whitespace-normalized, and then optionally upcased, unit-converted and
# formatted.
#
# The clean-up uses non-destructive string methods on purpose: +to_s+ on a
# String returns the very same object, so the destructive variants would
# modify the caller's row (or the static option value) in place. That would
# leak one attribute's upcasing into other attributes reading the same column
# and would raise on frozen strings.
def value_in_source(step, row)
  value =
    if wants_static?(step)
      static(step)
    elsif field_number(step)
      if field_number(step).is_a?(Range)
        field_number(step).map { |n| row[n] }.join(delimiter(step))
      else
        row[field_number(step)]
      end
    else
      row[name_in_source(step)]
    end
  return nil if value.nil?
  return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
  value = value.to_s
  value = value[keep(step)] if wants_keep?(step)
  value = do_split(step, value) if wants_split?(step)
  # taken from old errata: collapse runs of spaces, turn unescaped "~" into a space, trim
  value = value.gsub(/[ ]+/, ' ').gsub(/([^\\])~/, '\1 ').strip
  value = value.upcase if wants_upcase?(step)
  value = do_convert(step, row, value) if wants_conversion?(step)
  value = do_sprintf(step, value) if wants_sprintf?(step)
  value
end
82
+
83
+ def value_from_row(step, row)
84
+ value = value_in_source(step, row)
85
+ return value if value.is_a?(ActiveRecord::Base) # carry through trapdoor
86
+ value = value_in_dictionary(step, value) if wants_dictionary?(step)
87
+ value = value_as_association(step, value) if wants_inline_association?
88
+ value
89
+ end
90
+
91
+ def value_as_association(step, value)
92
+ @_value_as_association ||= {}
93
+ @_value_as_association[step] ||= {}
94
+ if !@_value_as_association[step].has_key?(value)
95
+ dynamic_matcher = wants_create?(step) ? "find_or_create_by_#{foreign_key(step)}" : "find_by_#{foreign_key(step)}"
96
+ @_value_as_association[step][value] = reflection_klass(step).send(dynamic_matcher, value)
97
+ end
98
+ @_value_as_association[step][value]
99
+ end
100
+
101
+ # this will overwrite nils, even if wants_overwriting?(step) is false
102
+ def set_record_from_row(step, record, row)
103
+ return if !wants_overwriting?(step) and !record.send(name).nil?
104
+ value = value_from_row(step, row)
105
+ record.send "#{name}=", value
106
+ $stderr.puts("ActiveRecord didn't like trying to set #{klass}.#{name} = #{value}") if !value.nil? and record.send(name).nil?
107
+ end
108
+
109
+ def perform(step)
110
+ case step.variant
111
+ when :associate
112
+ perform_association(step)
113
+ when :derive
114
+ if wants_update_all?(step)
115
+ perform_update_all(step)
116
+ elsif wants_weighted_average?(step)
117
+ perform_weighted_average(step)
118
+ else
119
+ perform_callback(step)
120
+ end
121
+ when :import
122
+ raise "This shouldn't be called, the import step is special"
123
+ end
124
+ end
125
+
126
+ def perform_association(step)
127
+ raise "dictionary and prefix don't mix" if wants_dictionary?(step) and wants_prefix?(step)
128
+ klass.update_all("#{reflection.primary_key_name} = NULL") if wants_nullification?(step)
129
+ if wants_create?(step)
130
+ klass.find_in_batches do |batch|
131
+ batch.each do |record|
132
+ if wants_prefix?(step)
133
+ sql = "SELECT reflection_table.id FROM #{reflection_klass(step).quoted_table_name} AS reflection_table INNER JOIN #{klass.quoted_table_name} AS klass_table ON LEFT(klass_table.#{key(step)}, LENGTH(reflection_table.#{foreign_key(step)})) = reflection_table.#{foreign_key(step)} WHERE klass_table.id = #{record.id} ORDER BY LENGTH(reflection_table.#{foreign_key(step)}) DESC"
134
+ associated_id = ActiveRecord::Base.connection.select_value(sql)
135
+ next if associated_id.blank?
136
+ record.send("#{reflection.primary_key_name}=", associated_id)
137
+ else
138
+ dynamic_finder_value = record.send(key(step))
139
+ dynamic_finder_value = value_in_dictionary(step, dynamic_finder_value) if wants_dictionary?(step)
140
+ next if dynamic_finder_value.blank?
141
+ associated = reflection_klass(step).send("find_or_create_by_#{foreign_key(step)}", dynamic_finder_value) # TODO cache results
142
+ record.send("#{name}=", associated)
143
+ end
144
+ record.save
145
+ end
146
+ end
147
+ else
148
+ reflection_klass(step).find_in_batches do |batch|
149
+ batch.each do |reflection_record|
150
+ klass.update_all ["#{reflection.primary_key_name} = ?", reflection_record.id], ["#{key(step)} = ?", reflection_record.send(foreign_key(step))]
151
+ end
152
+ end
153
+ end
154
+ end
155
+
156
+ def perform_update_all(step)
157
+ klass.update_all("#{name} = #{set(step)}", conditions(step))
158
+ end
159
+
160
+ def perform_weighted_average(step)
161
+ # handle weighting by scopes instead of associations
162
+ if weighting_association(step) and !klass.reflect_on_association(weighting_association(step))
163
+ klass.find_in_batches do |batch|
164
+ batch.each do |record|
165
+ record.send "#{name}=", record.send(weighting_association(step)).weighted_average(name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step))
166
+ record.save
167
+ end
168
+ end
169
+ else # there's no weighting association OR there is one and it's a valid association
170
+ klass.update_all_weighted_averages name, :by => weighting_column(step), :disaggregator => weighting_disaggregator(step), :association => weighting_association(step)
171
+ end
172
+ end
173
+
# Invokes the class-level callback configured for +step+, passing as many
# arguments as the callback declares:
#
# * arity 0: no arguments
# * arity 1: this attribute's name
# * arity 2: this attribute's name and the options recorded for +step+
#
# Callbacks reporting any other arity (for example the negative arity of a
# method with optional or splat parameters) are not invoked.
#
# Uses newline-terminated +when+ clauses; the <tt>when 0:</tt> colon form is
# only accepted by Ruby 1.8.
def perform_callback(step)
  callback_name = callback(step)
  case klass.method(callback_name).arity
  when 0
    klass.send(callback_name)
  when 1
    klass.send(callback_name, name)
  when 2
    klass.send(callback_name, name, options_for_step[step])
  end
end
184
+
185
+ def unit_from_source(step, row)
186
+ row[unit_in_source(step)].to_s.strip.underscore.to_sym
187
+ end
188
+
189
+ def do_convert(step, row, value)
190
+ from_unit = from(step) || unit_from_source(step, row)
191
+ value.to_f.convert(from_unit, to(step))
192
+ end
193
+
194
+ def do_sprintf(step, value)
195
+ if /\%[0-9\.]*f/.match(sprintf(step))
196
+ value = value.to_f
197
+ elsif /\%[0-9\.]*d/.match(sprintf(step))
198
+ value = value.to_i
199
+ end
200
+ sprintf(step) % value
201
+ end
202
+
203
+ def do_split(step, value)
204
+ pattern = split_options(step)[:pattern] || /\s+/ # default is split on whitespace
205
+ keep = split_options(step)[:keep] || 0 # default is keep first element
206
+ value.to_s.split(pattern)[keep].to_s
207
+ end
208
+
209
+ def column_type
210
+ @column_type ||= klass.columns_hash[name.to_s].type
211
+ end
212
+
213
+ {
214
+ :static => 'options_for_step[step].has_key?(:static)',
215
+ :prefix => :prefix,
216
+ :create => :create,
217
+ :keep => :keep,
218
+ :upcase => :upcase,
219
+ :conversion => '!from(step).nil? or !unit_in_source(step).nil?',
220
+ :sprintf => :sprintf,
221
+ :dictionary => :dictionary_options,
222
+ :split => :split_options,
223
+ :update_all => :set,
224
+ :nullification => 'nullify(step) != false',
225
+ :overwriting => 'overwrite(step) != false',
226
+ :weighted_average => '!weighting_association(step).nil? or !weighting_column(step).nil?'
227
+ }.each do |name, condition|
228
+ condition = "!#{condition}(step).nil?" if condition.is_a?(Symbol)
229
+ eval <<-EOS
230
+ def wants_#{name}?(step)
231
+ #{condition}
232
+ end
233
+ EOS
234
+ end
235
+
236
+ {
237
+ :name_in_source => { :default => :name, :stringify => true },
238
+ :key => { :default => :name, :stringify => true },
239
+ :foreign_key => { :default => 'key(step)', :stringify => true },
240
+ :delimiter => { :default => '", "' }
241
+ }.each do |name, options|
242
+ eval <<-EOS
243
+ def #{name}(step)
244
+ (options_for_step[step][:#{name}] || #{options[:default]})#{'.to_s' if options[:stringify]}
245
+ end
246
+ EOS
247
+ end
248
+
249
+ def reflection
250
+ if @_reflection.nil?
251
+ @_reflection = klass.reflect_on_association(name) || :missing
252
+ reflection
253
+ elsif @_reflection == :missing
254
+ nil
255
+ else
256
+ @_reflection
257
+ end
258
+ end
259
+
260
+ def reflection_klass(step)
261
+ return nil unless reflection
262
+ if reflection.options[:polymorphic]
263
+ polymorphic_type(step).andand.constantize
264
+ else
265
+ reflection.klass
266
+ end
267
+ end
268
+
269
+ def wants_inline_association?
270
+ reflection.present?
271
+ end
272
+
273
+ def callback(step)
274
+ (options_for_step[step][:callback] || "derive_#{name}").to_sym
275
+ end
276
+
277
+ def dictionary(step)
278
+ raise "shouldn't ask for this" unless wants_dictionary?(step) # don't try to initialize if there are no dictionary options
279
+ @dictionaries ||= {}
280
+ @dictionaries[step] ||= Dictionary.new(dictionary_options(step))
281
+ end
282
+
283
+ %w(dictionary split).each do |name|
284
+ eval <<-EOS
285
+ def #{name}_options(step)
286
+ options_for_step[step][:#{name}]
287
+ end
288
+ EOS
289
+ end
290
+
291
+ %w(from to set conditions weighting_association weighting_column weighting_disaggregator sprintf nullify overwrite upcase prefix unit_in_source field_number keep create static polymorphic_type).each do |name|
292
+ eval <<-EOS
293
+ def #{name}(step)
294
+ options_for_step[step][:#{name}]
295
+ end
296
+ EOS
297
+ end
298
+ end
299
+ end
@@ -0,0 +1,51 @@
1
+ module DataMiner
2
+ class AttributeCollection
3
+ attr_accessor :klass, :attributes
4
+
5
+ def initialize(klass)
6
+ @klass = klass
7
+ @attributes = {}
8
+ end
9
+
10
+ def key!(step, attr_name, attr_options = {})
11
+ find_or_initialize(attr_name).key_for!(step, attr_options)
12
+ end
13
+
14
+ def affect!(step, attr_name, attr_options = {})
15
+ find_or_initialize(attr_name).affected_by!(step, attr_options)
16
+ end
17
+
18
+ def affect_all_content_columns!(step, options = {})
19
+ except = Array.wrap(options[:except]).map(&:to_sym)
20
+ step.klass.content_columns.map(&:name).reject { |content_column| except.include?(content_column.to_sym) }.each do |content_column|
21
+ find_or_initialize(content_column).affected_by!(step)
22
+ end
23
+ end
24
+
25
+ def all_affected_by(step)
26
+ attributes.values.select { |attr| attr.affected_by?(step) }
27
+ end
28
+
29
+ def all_keys_for(step)
30
+ attributes.values.select { |attr| attr.key_for?(step) }
31
+ end
32
+
33
+ def all_for(step)
34
+ (all_affected_by(step) + all_keys_for(step)).uniq
35
+ end
36
+
37
+ def has_keys_for?(step)
38
+ attributes.values.any? { |attr| attr.key_for?(step) }
39
+ end
40
+
41
+ def has_conditional_writes_for?(step)
42
+ all_affected_by(step).any? { |attr| !attr.wants_overwriting?(step) }
43
+ end
44
+
45
+ private
46
+
47
+ def find_or_initialize(attr_name)
48
+ self.attributes[attr_name] ||= Attribute.new(klass, attr_name)
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,94 @@
1
+ module DataMiner
2
+ class Configuration
3
+ attr_accessor :steps, :klass, :counter, :attributes, :awaiting
4
+
5
+ def initialize(klass)
6
+ @steps = []
7
+ @klass = klass
8
+ @counter = 0
9
+ @attributes = AttributeCollection.new(klass)
10
+ end
11
+
12
+ %w(import associate derive await).each do |method|
13
+ eval <<-EOS
14
+ def #{method}(*args, &block)
15
+ self.counter += 1
16
+ if block_given? # FORM C
17
+ step_options = args[0] || {}
18
+ set_awaiting!(step_options)
19
+ self.steps << Step::#{method.camelcase}.new(self, counter, step_options, &block)
20
+ elsif args[0].is_a?(Hash) # FORM A
21
+ step_options = args[0]
22
+ set_awaiting!(step_options)
23
+ self.steps << Step::#{method.camelcase}.new(self, counter, step_options)
24
+ else # FORM B
25
+ attr_name = args[0]
26
+ attr_options = args[1] || {}
27
+ step_options = {}
28
+ set_awaiting!(step_options)
29
+ self.steps << Step::#{method.camelcase}.new(self, counter, step_options) do |attr|
30
+ attr.affect attr_name, attr_options
31
+ end
32
+ end
33
+ end
34
+ EOS
35
+ end
36
+
37
+ def set_awaiting!(step_options)
38
+ step_options.merge!(:awaiting => awaiting) if !awaiting.nil?
39
+ end
40
+
41
+ def awaiting!(step)
42
+ self.awaiting = step
43
+ end
44
+
45
+ def stop_awaiting!
46
+ self.awaiting = nil
47
+ end
48
+
49
+ # Mine data for this class.
50
+ def mine(options = {})
51
+ steps.each { |step| step.perform options }
52
+ end
53
+
54
+ # Map <tt>method</tt> to attributes
55
+ def map_to_attrs(method)
56
+ steps.map { |step| step.map_to_attrs(method) }.compact
57
+ end
58
+
59
+ cattr_accessor :classes
60
+ self.classes = []
61
+ class << self
62
+ # Mine data. Defaults to all classes touched by DataMiner.
63
+ #
64
+ # Options
65
+ # * <tt>:class_names</tt>: provide an array of class names to mine
66
+ def mine(options = {})
67
+ classes.each do |klass|
68
+ if options[:class_names].blank? or options[:class_names].include?(klass.name)
69
+ klass.data_mine.mine options
70
+ end
71
+ end
72
+ end
73
+
74
+ # Map a <tt>method</tt> to attrs. Defaults to all classes touched by DataMiner.
75
+ #
76
+ # Options
77
+ # * <tt>:class_names</tt>: provide an array of class names to map
78
+ def map_to_attrs(method, options = {})
79
+ classes.map do |klass|
80
+ if options[:class_names].blank? or options[:class_names].include?(klass.name)
81
+ klass.data_mine.map_to_attrs method
82
+ end
83
+ end.flatten.compact
84
+ end
85
+
86
+ # Queue up all the ActiveRecord classes that DataMiner should touch.
87
+ #
88
+ # Generally done in <tt>config/initializers/data_miner_config.rb</tt>.
89
+ def enqueue(&block)
90
+ yield self.classes
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,36 @@
module DataMiner
  # Translates a key into one or more values by scanning the rows of a remote
  # table.
  class Dictionary
    attr_accessor :key_name, :value_name, :sprintf, :table

    # Options
    # * <tt>:key</tt>: name of the column to match keys against
    # * <tt>:returns</tt>: name of the column whose value is returned
    # * <tt>:sprintf</tt>: format applied to both sides before comparing (default <tt>'%s'</tt>)
    # * <tt>:url</tt>: passed to RemoteTable.new
    def initialize(options = {})
      @key_name = options[:key]
      @value_name = options[:returns]
      @sprintf = options[:sprintf] || '%s'
      @table = RemoteTable.new(:url => options[:url])
    end

    # Finds +key+ using the key column, value column and format this
    # dictionary was configured with. See #find for the return value.
    def lookup(key)
      find(self.key_name, key, self.value_name, :sprintf => self.sprintf)
    end

    # Returns the +value_name+ entry of the first row whose +key_name+ entry
    # matches +key+ after normalization, split on semicolons (with optional
    # surrounding whitespace) into an array. Returns nil when no row matches.
    #
    # Options
    # * <tt>:sprintf</tt>: format applied to both sides before comparing
    def find(key_name, key, value_name, options = {})
      wanted = normalize_for_comparison(key, options) # same for every row, so compute it once
      match = table.rows.detect { |row| normalize_for_comparison(row[key_name], options) == wanted }
      match[value_name].to_s.split(/\s*;\s*/) if match
    end

    private

    # Renders +string+ through <tt>options[:sprintf]</tt> (if given) and strips
    # it. The value is coerced with +to_f+ / +to_i+ first when the format
    # contains a float / integer directive.
    def normalize_for_comparison(string, options = {})
      pattern = options[:sprintf]
      if pattern
        if /\%[0-9\.]*f/.match(pattern)
          string = string.to_f
        elsif /\%[0-9\.]*d/.match(pattern)
          string = string.to_i
        end
        string = pattern % string
      end
      string.to_s.strip
    end
  end
end
@@ -0,0 +1,64 @@
1
+ module DataMiner
2
+ class Step
3
+ attr_accessor :configuration, :number, :options
4
+ delegate :klass, :to => :configuration
5
+ delegate :attributes, :to => :configuration
6
+
7
+ def initialize(configuration, number, options = {}, &block)
8
+ @configuration = configuration
9
+ @number = number
10
+ @options = options
11
+ yield self if block_given? # pull in attributes
12
+ attributes.affect_all_content_columns!(self, :except => options[:except]) if options[:affect_all] == :content_columns
13
+ affected_attributes.each { |attr| attr.options_for_step[self][:callback] = options[:callback] } if options[:callback]
14
+ all_attributes.each { |attr| attr.options_for_step[self][:name_in_source] = attr.name_in_source(self).upcase } if options[:headers] == :upcase # TODO remove
15
+ end
16
+
17
+ def variant
18
+ self.class.name.demodulize.underscore.to_sym
19
+ end
20
+
21
+ def awaiting?
22
+ !options[:awaiting].nil?
23
+ end
24
+
25
+ def inspect
26
+ "Step(#{klass} #{variant.to_s.camelcase} #{number})"
27
+ end
28
+
29
+ def signature
30
+ "#{klass} step #{number}: #{variant}"
31
+ end
32
+
33
+ def perform(options = {})
34
+ return if awaiting? and !options[:force]
35
+ affected_attributes.each { |attr| attr.perform self }
36
+ $stderr.puts "performed #{signature}"
37
+ end
38
+
39
+ def affected_attributes
40
+ @affected_attributes ||= attributes.all_affected_by self
41
+ end
42
+
43
+ def key_attributes
44
+ @key_attributes ||= attributes.all_keys_for self
45
+ end
46
+
47
+ def all_attributes
48
+ @all_attributes ||= attributes.all_for self
49
+ end
50
+
51
+ def key(attr_name, attr_options = {})
52
+ attributes.key! self, attr_name, attr_options
53
+ end
54
+
55
+ def affect(attr_name, attr_options = {})
56
+ attributes.affect! self, attr_name, attr_options
57
+ end
58
+ alias_method :store, :affect
59
+
60
+ def map_to_attrs(method)
61
+ affected_attributes.map { |attr| attr.send method, self }.compact
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,9 @@
1
+ module DataMiner
2
+ class Step
3
+ class Associate < Step
4
+ def signature
5
+ "#{super} #{affected_attributes.first.name}"
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,35 @@
1
+ module DataMiner
2
+ class Step
3
+ class Await < Step
4
+ attr_accessor :other_class
5
+
6
+ def initialize(configuration, number, options = {}, &block)
7
+ # doesn't call super
8
+ @configuration = configuration
9
+ @number = number
10
+ @options = options
11
+ @other_class = options.delete :other_class
12
+ configuration.awaiting! self
13
+ yield configuration # pull in steps
14
+ configuration.stop_awaiting!
15
+ end
16
+
17
+ def perform(*args)
18
+ other_class.data_mine.steps << Step::Callback.new(other_class.data_mine, self)
19
+ $stderr.puts "added #{signature} to callbacks after #{other_class}"
20
+ end
21
+
22
+ def callback
23
+ $stderr.puts "starting to perform deferred steps in #{signature}..."
24
+ all_awaiting.each { |step| step.perform :force => true }
25
+ $stderr.puts "...done"
26
+ end
27
+
28
+ private
29
+
30
+ def all_awaiting
31
+ configuration.steps.select { |step| step.options and step.options[:awaiting] == self }
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,22 @@
1
+ module DataMiner
2
+ class Step
3
+ class Callback < Step
4
+ attr_accessor :foreign_step
5
+
6
+ def initialize(configuration, foreign_step)
7
+ @configuration = configuration
8
+ @foreign_step = foreign_step
9
+ @number = "(last)"
10
+ end
11
+
12
+ def perform(*args)
13
+ foreign_step.callback
14
+ $stderr.puts "performed #{signature}"
15
+ end
16
+
17
+ def signature
18
+ "#{super} (on behalf of #{foreign_step.signature})"
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,9 @@
1
+ module DataMiner
2
+ class Step
3
+ class Derive < Step
4
+ def signature
5
+ "#{super} #{affected_attributes.first.name}"
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,57 @@
1
module DataMiner
  class Step
    # Step that iterates over the rows of a RemoteTable and writes them into
    # records of +klass+, optionally filtering/correcting rows through an
    # Errata object first.
    class Import < Step
      # Option keys that are forwarded to RemoteTable.new.
      TABLE_OPTION_KEYS = [
        :url, :filename, :post_data, :format, :skip, :cut, :schema,
        :schema_name, :trap, :select, :reject, :sheet, :delimiter,
        :headers, :transform, :crop
      ].freeze

      attr_accessor :table, :errata

      # Delegates to Step#initialize with the same arguments, then builds
      # the Errata (only when options[:errata] is given) and the RemoteTable.
      def initialize(configuration, number, options = {}, &block)
        super
        @errata = Errata.new(:url => options[:errata], :klass => klass) if options[:errata]
        @table = RemoteTable.new(options.slice(*TABLE_OPTION_KEYS))
      end

      # Inherited signature followed by the source URL.
      def signature
        "#{super} #{options[:url]}"
      end

      # Truncates the table when #wants_truncate?, then for every row of the
      # remote table: skips rows the errata rejects, applies errata
      # corrections, resolves the target records, sets every affected
      # attribute and saves. Arguments are accepted and ignored.
      #
      # NOTE(review): the return value of record.save is ignored, so records
      # that fail to save are silently dropped.
      def perform(*args)
        ActiveRecord::Base.connection.execute("TRUNCATE #{klass.quoted_table_name}") if wants_truncate?
        table.each_row do |row|
          if errata
            next if errata.rejects?(row)
            errata.correct!(row)
          end
          records_for(row).each do |record|
            affected_attributes.each { |attr| attr.set_record_from_row(self, record, row) }
            record.save
          end
        end
        $stderr.puts "performed #{signature}"
      end

      # options[:truncate] is tri-state: true forces truncation, false
      # forbids it, anything else truncates only when the import does not
      # build on existing data.
      def wants_truncate?
        setting = options[:truncate]
        return true if setting == true
        return false if setting == false
        !uses_existing_data?
      end

      # True only when options[:allow_nil_keys] is exactly true.
      def wants_nil_keys?
        options[:allow_nil_keys] == true
      end

      # Whether this import has key attributes or conditional writes.
      # A truthy answer is cached. The parentheses matter: without them
      # `x ||= a or b` parses as `(x ||= a) or b` and the second operand is
      # never stored in the cache.
      def uses_existing_data?
        @uses_existing_data ||= (attributes.has_keys_for?(self) || attributes.has_conditional_writes_for?(self))
      end

      # Name of the dynamic finder built from the key attribute names,
      # e.g. :find_or_initialize_by_a_and_b.
      def dynamic_finder_name
        :"find_or_initialize_by_#{key_attributes.map(&:name).join('_and_')}"
      end

      private

      # Returns the array of records that +row+ should be written to.
      #
      # Without existing data this is a single fresh klass.new. Otherwise the
      # key values are read from the row and looked up through the dynamic
      # finder. Combinations containing a nil key are skipped unless
      # #wants_nil_keys?; the nil that `next` leaves in the mapped array is
      # removed with compact so callers never receive a nil record.
      #
      # NOTE(review): if uses_existing_data? is true only because of
      # conditional writes and there are no key attributes, the finder name
      # has no attribute part — confirm that case cannot occur.
      def records_for(row)
        return [klass.new] unless uses_existing_data?

        key_values = key_attributes.map { |key_attr| [key_attr.value_from_row(self, row)] }
        found = WilliamJamesCartesianProduct.cart_prod(*key_values).map do |combination|
          next if combination.include?(nil) && !wants_nil_keys?
          klass.send(dynamic_finder_name, *combination)
        end
        found.flatten.compact
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # http://www.ruby-forum.com/topic/95519#200484
2
+
3
module WilliamJamesCartesianProduct
  # Returns the cartesian product of the given lists as an array of arrays.
  # Elements of the first list vary fastest in the result. With no
  # arguments the result is [[]]; if any list is empty the result is [].
  def self.cart_prod(*args)
    args.inject([[]]) do |partials, list|
      extended = []
      list.each do |element|
        partials.each { |partial| extended << (partial.dup << element) }
      end
      extended
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'test_helper'
2
+
3
# Schema for the two tables used by the tests below; :force => true is
# passed so each create_table call recreates its table.
ActiveRecord::Schema.define(:version => 20090819143429) do
  create_table("airports", :force => true) do |airports|
    airports.string   "iata_code"
    airports.string   "name"
    airports.string   "city"
    airports.integer  "country_id"
    airports.float    "latitude"
    airports.float    "longitude"
    airports.datetime "created_at"
    airports.datetime "updated_at"
  end

  create_table("countries", :force => true) do |countries|
    countries.string   "iso_3166"
    countries.string   "name"
    countries.datetime "created_at"
    countries.datetime "updated_at"
  end
end
21
+
22
# Test model: countries keyed by ISO 3166 code, mined from a remote CSV.
class Country < ActiveRecord::Base
  mine_data do |step|
    # import country names and country codes
    step.import(:url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv') do |column|
      column.key(:iso_3166, :name_in_source => 'country code')
      column.store(:iso_3166, :name_in_source => 'country code')
      column.store(:name, :name_in_source => 'country')
    end
  end
end
32
+
33
# Test model: airports keyed by IATA code, mined from a header-less remote
# file whose columns are addressed by field number.
class Airport < ActiveRecord::Base
  belongs_to :country

  mine_data do |step|
    # import airport iata_code, name, etc.
    source = 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat'
    step.import(:url => source, :headers => false) do |column|
      column.key(:iata_code, :field_number => 3)
      column.store(:name, :field_number => 0)
      column.store(:city, :field_number => 1)
      # will use Country.find_by_name(X)
      column.store(:country, :field_number => 2, :foreign_key => :name)
      column.store(:iata_code, :field_number => 3)
      column.store(:latitude, :field_number => 5)
      column.store(:longitude, :field_number => 6)
    end
  end
end
48
+
49
# Queue the models in order: Country first, then Airport.
DataMiner.enqueue do |queue|
  [Country, Airport].each { |model| queue << model }
end
53
+
54
# Tests that mine the Country and Airport models from their remote sources.
class DataMinerTest < Test::Unit::TestCase
  # Empty both tables after every test.
  def teardown
    [Airport, Country].each { |model| model.delete_all }
  end

  should "mine a single class" do
    Country.data_mine.mine
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal 'Uruguay', uruguay.name
    assert_equal 0, Airport.count
  end

  should "mine a single class using the API" do
    DataMiner.mine(:class_names => ['Country'])
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal 'Uruguay', uruguay.name
    assert_equal 0, Airport.count
  end

  should "mine all classes" do
    DataMiner.mine
    uruguay = Country.find_by_iso_3166('UY')
    assert_equal 'Uruguay', uruguay.name
    montevideo = Airport.find_by_iata_code('MVD')
    assert_equal uruguay, montevideo.country
  end
end
@@ -0,0 +1,16 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'sqlite3'
5
+
6
# Put the gem's lib directory and this test directory on the load path,
# then load the library under test.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'data_miner'

# Connect to a sqlite3 database file that sits next to this helper. The path
# is resolved against this file's directory rather than the current working
# directory, so the tests use the same database wherever they are started.
ActiveRecord::Base.establish_connection(
  'adapter' => 'sqlite3',
  'database' => File.expand_path('test.sqlite3', File.dirname(__FILE__))
)

# Reopened without changes; a place for shared test helpers.
class Test::Unit::TestCase
end
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_miner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.2
5
+ platform: ruby
6
+ authors:
7
+ - Seamus Abshere
8
+ - Andy Rossmeissl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-10-30 00:00:00 -04:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: activerecord
18
+ type: :runtime
19
+ version_requirement:
20
+ version_requirements: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ version:
26
+ - !ruby/object:Gem::Dependency
27
+ name: activesupport
28
+ type: :runtime
29
+ version_requirement:
30
+ version_requirements: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ version:
36
+ - !ruby/object:Gem::Dependency
37
+ name: andand
38
+ type: :runtime
39
+ version_requirement:
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ - !ruby/object:Gem::Dependency
47
+ name: remote_table
48
+ type: :runtime
49
+ version_requirement:
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ - !ruby/object:Gem::Dependency
57
+ name: seamusabshere-errata
58
+ type: :runtime
59
+ version_requirement:
60
+ version_requirements: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ - !ruby/object:Gem::Dependency
67
+ name: seamusabshere-conversions
68
+ type: :runtime
69
+ version_requirement:
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: "0"
75
+ version:
76
+ description: Mine remote data into your ActiveRecord models. You can also perform associations and convert units.
77
+ email: seamus@abshere.net
78
+ executables: []
79
+
80
+ extensions: []
81
+
82
+ extra_rdoc_files:
83
+ - LICENSE
84
+ - README.rdoc
85
+ files:
86
+ - .document
87
+ - .gitignore
88
+ - LICENSE
89
+ - README.rdoc
90
+ - Rakefile
91
+ - VERSION
92
+ - data_miner.gemspec
93
+ - lib/data_miner.rb
94
+ - lib/data_miner/active_record_ext.rb
95
+ - lib/data_miner/attribute.rb
96
+ - lib/data_miner/attribute_collection.rb
97
+ - lib/data_miner/configuration.rb
98
+ - lib/data_miner/dictionary.rb
99
+ - lib/data_miner/step.rb
100
+ - lib/data_miner/step/associate.rb
101
+ - lib/data_miner/step/await.rb
102
+ - lib/data_miner/step/callback.rb
103
+ - lib/data_miner/step/derive.rb
104
+ - lib/data_miner/step/import.rb
105
+ - lib/data_miner/william_james_cartesian_product.rb
106
+ - test/data_miner_test.rb
107
+ - test/test_helper.rb
108
+ has_rdoc: true
109
+ homepage: http://github.com/seamusabshere/data_miner
110
+ licenses: []
111
+
112
+ post_install_message:
113
+ rdoc_options:
114
+ - --charset=UTF-8
115
+ - --line-numbers
116
+ - --inline-source
117
+ require_paths:
118
+ - lib
119
+ required_ruby_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: "0"
124
+ version:
125
+ required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: "0"
130
+ version:
131
+ requirements: []
132
+
133
+ rubyforge_project:
134
+ rubygems_version: 1.3.5
135
+ signing_key:
136
+ specification_version: 3
137
+ summary: Mine remote data into your ActiveRecord models.
138
+ test_files:
139
+ - test/data_miner_test.rb
140
+ - test/test_helper.rb