data_miner 0.3.13 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -11,12 +11,19 @@ Put this in <tt>config/environment.rb</tt>:
11
11
  You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
12
12
 
13
13
  class Country < ActiveRecord::Base
14
- data_miner do |step|
15
- # import country names and country codes
16
- step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
17
- attr.key :iso_3166, :field_name => 'country code'
18
- attr.store :iso_3166, :field_name => 'country code'
19
- attr.store :name, :field_name => 'country'
14
+ set_primary_key :iso_3166
15
+
16
+ data_miner do
17
+ import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
18
+ key 'iso_3166'
19
+ store 'iso_3166', :field_number => 1
20
+ store 'name', :field_number => 0
21
+ end
22
+
23
+ import 'A Princeton dataset with better capitalization for some countries', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
24
+ key 'iso_3166'
25
+ store 'iso_3166', :field_name => 'country code'
26
+ store 'name', :field_name => 'country'
20
27
  end
21
28
  end
22
29
  end
@@ -24,18 +31,17 @@ You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For e
24
31
  ...and in <tt>app/models/airport.rb</tt>:
25
32
 
26
33
  class Airport < ActiveRecord::Base
27
- belongs_to :country
34
+ set_primary_key :iata_code
28
35
 
29
- data_miner do |step|
30
- # import airport iata_code, name, etc.
31
- step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
32
- attr.key :iata_code, :field_number => 3
33
- attr.store :name, :field_number => 0
34
- attr.store :city, :field_number => 1
35
- attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
36
- attr.store :iata_code, :field_number => 3
37
- attr.store :latitude, :field_number => 5
38
- attr.store :longitude, :field_number => 6
36
+ data_miner do
37
+ import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
38
+ key 'iata_code'
39
+ store 'name', :field_number => 1
40
+ store 'city', :field_number => 2
41
+ store 'country_name', :field_number => 3
42
+ store 'iata_code', :field_number => 4
43
+ store 'latitude', :field_number => 6
44
+ store 'longitude', :field_number => 7
39
45
  end
40
46
  end
41
47
  end
@@ -48,30 +54,22 @@ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't kno
48
54
  end
49
55
  end
50
56
 
51
- You need to specify what order to mine data. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
52
-
53
- DataMiner.enqueue do |queue|
54
- queue << Country # class whose data should be mined 1st
55
- queue << Airport # class whose data should be mined 2nd
56
- # etc
57
- end
58
-
59
57
  Once you have (1) set up the order of data mining and (2) defined <tt>data_miner</tt> blocks in your classes, you can:
60
58
 
61
- $ rake data_miner:run
59
+ $ rake data_miner:run RESOURCES=Airport,Country
62
60
 
63
61
  ==Complete example
64
62
 
65
63
  ~ $ rails testapp
66
64
  ~ $ cd testapp/
67
- ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_id:integer latitude:float longitude:float
65
+ ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_name:string latitude:float longitude:float
66
+ [...edit migration to make iata_code the primary key...]
68
67
  ~/testapp $ ./script/generate model Country iso_3166:string name:string
68
+ [...edit migration to make iso_3166 the primary key...]
69
69
  ~/testapp $ rake db:migrate
70
70
  ~/testapp $ touch lib/tasks/data_miner_tasks.rake
71
71
  [...edit per quick start...]
72
- ~/testapp $ touch config/initializers/data_miner_config.rake
73
- [...edit per quick start...]
74
- ~/testapp $ rake data_miner:run
72
+ ~/testapp $ rake data_miner:run RESOURCES=Airport,Country
75
73
 
76
74
  Now you should have
77
75
 
@@ -79,7 +77,7 @@ Now you should have
79
77
  Loading development environment (Rails 2.3.3)
80
78
  >> Airport.first.iata_code
81
79
  => "GKA"
82
- >> Airport.first.country.name
80
+ >> Airport.first.country_name
83
81
  => "Papua New Guinea"
84
82
 
85
83
  ==Authors
@@ -89,4 +87,4 @@ Now you should have
89
87
 
90
88
  ==Copyright
91
89
 
92
- Copyright (c) 2009 Brighter Planet. See LICENSE for details.
90
+ Copyright (c) 2010 Brighter Planet. See LICENSE for details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.13
1
+ 0.4.0
data/data_miner.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{data_miner}
8
- s.version = "0.3.13"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
- s.date = %q{2010-03-18}
12
+ s.date = %q{2010-03-19}
13
13
  s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
14
14
  s.email = %q{seamus@abshere.net}
15
15
  s.extra_rdoc_files = [
@@ -32,7 +32,6 @@ Gem::Specification.new do |s|
32
32
  "lib/data_miner/import.rb",
33
33
  "lib/data_miner/process.rb",
34
34
  "lib/data_miner/run.rb",
35
- "lib/data_miner/william_james_cartesian_product.rb",
36
35
  "test/data_miner_test.rb",
37
36
  "test/test_helper.rb"
38
37
  ]
data/lib/data_miner.rb CHANGED
@@ -14,9 +14,6 @@ require 'data_miner/import'
14
14
  require 'data_miner/process'
15
15
  require 'data_miner/run'
16
16
 
17
- # TODO: move to gem
18
- require 'data_miner/william_james_cartesian_product'
19
-
20
17
  module DataMiner
21
18
  class MissingHashColumn < RuntimeError; end
22
19
 
@@ -71,7 +68,6 @@ ActiveRecord::Base.class_eval do
71
68
  self.data_miner_config = DataMiner::Configuration.new self
72
69
 
73
70
  Blockenspiel.invoke block, data_miner_config
74
- data_miner_config.after_invoke
75
71
  end
76
72
  end
77
73
 
@@ -1,66 +1,68 @@
1
1
  module DataMiner
2
2
  class Attribute
3
- attr_accessor :resource, :name, :options_for_import
3
+ attr_accessor :runnable
4
+ attr_accessor :name
5
+ attr_accessor :options
4
6
 
5
- def initialize(resource, name)
6
- @resource = resource
7
+ delegate :resource, :to => :runnable
8
+
9
+ def initialize(runnable, name, options = {})
10
+ options.symbolize_keys!
11
+ @options = options
12
+
13
+ @runnable = runnable
7
14
  @name = name
8
- @options_for_import = {}
9
15
  end
10
16
 
11
17
  def inspect
12
18
  "Attribute(#{resource}##{name})"
13
19
  end
14
20
 
15
- def stored_by?(import)
16
- options_for_import.has_key?(import)
17
- end
18
-
19
- def value_in_dictionary(import, key)
20
- return *dictionary(import).lookup(key) # strip the array wrapper if there's only one element
21
+ def value_in_dictionary(str)
22
+ dictionary.lookup str
21
23
  end
22
24
 
23
- def value_in_source(import, row)
24
- if wants_static?(import)
25
- value = static(import)
26
- elsif field_number(import)
27
- if field_number(import).is_a?(Range)
28
- value = field_number(import).map { |n| row[n] }.join(delimiter(import))
25
+ def value_in_source(row)
26
+ if wants_static?
27
+ value = static
28
+ elsif field_number
29
+ if field_number.is_a?(Range)
30
+ value = field_number.map { |n| row[n] }.join(delimiter)
29
31
  else
30
- value = row[field_number(import)]
32
+ value = row[field_number]
31
33
  end
32
34
  else
33
- value = row[field_name(import)]
35
+ value = row[field_name]
34
36
  end
35
37
  return nil if value.nil?
36
38
  return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
37
39
  value = value.to_s
38
- value = value[chars(import)] if wants_chars?(import)
39
- value = do_split(import, value) if wants_split?(import)
40
+ value = value[chars] if wants_chars?
41
+ value = do_split(value) if wants_split?
40
42
  # taken from old errata... maybe we want to do this here
41
- value.gsub!(/[ ]+/, ' ')
43
+ value.gsub! /[ ]+/, ' '
42
44
  # text.gsub!('- ', '-')
43
- value.gsub!(/([^\\])~/, '\1 ')
45
+ value.gsub! /([^\\])~/, '\1 '
44
46
  value.strip!
45
- value.upcase! if wants_upcase?(import)
46
- value = do_convert(import, row, value) if wants_conversion?(import)
47
- value = do_sprintf(import, value) if wants_sprintf?(import)
47
+ value.upcase! if wants_upcase?
48
+ value = do_convert row, value if wants_conversion?
49
+ value = do_sprintf value if wants_sprintf?
48
50
  value
49
51
  end
50
52
 
51
- def value_from_row(import, row)
52
- value = value_in_source(import, row)
53
- return value if value.is_a?(ActiveRecord::Base) # carry through trapdoor
54
- value = value_in_dictionary(import, value) if wants_dictionary?(import)
53
+ def value_from_row(row)
54
+ value = value_in_source row
55
+ return value if value.is_a? ActiveRecord::Base # carry through trapdoor
56
+ value = value_in_dictionary value if wants_dictionary?
55
57
  value
56
58
  end
57
59
 
58
- # this will overwrite nils, even if wants_overwriting?(import) is false
60
+ # this will overwrite nils, even if wants_overwriting? is false
59
61
  # returns true if an attr was changed, otherwise false
60
- def set_record_from_row(import, record, row)
61
- return false if !wants_overwriting?(import) and !record.send(name).nil?
62
+ def set_record_from_row(record, row)
63
+ return false if !wants_overwriting? and !record.send(name).nil?
62
64
  what_it_was = record.send name
63
- what_it_should_be = value_from_row import, row
65
+ what_it_should_be = value_from_row row
64
66
  record.send "#{name}=", what_it_should_be
65
67
  what_it_is = record.send name
66
68
  if what_it_is.nil? and !what_it_should_be.nil?
@@ -73,26 +75,26 @@ module DataMiner
73
75
  end
74
76
  end
75
77
 
76
- def unit_from_source(import, row)
77
- row[units_field_name(import)].to_s.strip.underscore.to_sym
78
+ def unit_from_source(row)
79
+ row[units_field_name].to_s.strip.underscore.to_sym
78
80
  end
79
81
 
80
- def do_convert(import, row, value)
81
- value.to_f.convert((from_units(import) || unit_from_source(import, row)), to_units(import))
82
+ def do_convert(row, value)
83
+ value.to_f.convert((from_units || unit_from_source(row)), to_units)
82
84
  end
83
85
 
84
- def do_sprintf(import, value)
85
- if /\%[0-9\.]*f/.match(sprintf(import))
86
+ def do_sprintf(value)
87
+ if /\%[0-9\.]*f/.match sprintf
86
88
  value = value.to_f
87
- elsif /\%[0-9\.]*d/.match(sprintf(import))
89
+ elsif /\%[0-9\.]*d/.match sprintf
88
90
  value = value.to_i
89
91
  end
90
- sprintf(import) % value
92
+ sprintf % value
91
93
  end
92
94
 
93
- def do_split(import, value)
94
- pattern = split_options(import)[:pattern] || /\s+/ # default is split on whitespace
95
- keep = split_options(import)[:keep] || 0 # default is keep first element
95
+ def do_split(value)
96
+ pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
97
+ keep = split_options[:keep] || 0 # default is keep first element
96
98
  value.to_s.split(pattern)[keep].to_s
97
99
  end
98
100
 
@@ -100,127 +102,91 @@ module DataMiner
100
102
  resource.columns_hash[name.to_s].type
101
103
  end
102
104
 
103
- def dictionary(import)
104
- raise "shouldn't ask for this" unless wants_dictionary?(import) # don't try to initialize if there are no dictionary options
105
- @_dictionary ||= Dictionary.new dictionary_options(import)
106
- end
107
-
108
- # {
109
- # :static => 'options_for_import[import].has_key?(:static)',
110
- # :chars => :chars,
111
- # :upcase => :upcase,
112
- # :conversion => '!from_units(import).nil? or !units_field_name(import).nil?',
113
- # :sprintf => :sprintf,
114
- # :dictionary => :dictionary_options,
115
- # :split => :split_options,
116
- # :nullification => 'nullify(import) != false',
117
- # :overwriting => 'overwrite(import) != false',
118
- # }.each do |name, condition|
119
- # condition = "!#{condition}(import).nil?" if condition.is_a?(Symbol)
120
- # puts <<-EOS
121
- # def wants_#{name}?(import)
122
- # #{condition}
123
- # end
124
- # EOS
125
- # end
126
- def wants_split?(import)
127
- !split_options(import).nil?
105
+ # Our wants and needs :)
106
+ def wants_split?
107
+ split_options.present?
128
108
  end
129
- def wants_sprintf?(import)
130
- !sprintf(import).nil?
109
+ def wants_sprintf?
110
+ sprintf.present?
131
111
  end
132
- def wants_upcase?(import)
133
- !upcase(import).nil?
112
+ def wants_upcase?
113
+ upcase.present?
134
114
  end
135
- def wants_static?(import)
136
- options_for_import[import].has_key?(:static)
115
+ def wants_static?
116
+ options.has_key? :static
137
117
  end
138
- def wants_nullification?(import)
139
- nullify(import) != false
118
+ def wants_nullification?
119
+ nullify != false
140
120
  end
141
- def wants_chars?(import)
142
- !chars(import).nil?
121
+ def wants_chars?
122
+ chars.present?
143
123
  end
144
- def wants_overwriting?(import)
145
- overwrite(import) != false
124
+ def wants_overwriting?
125
+ overwrite != false
146
126
  end
147
- def wants_conversion?(import)
148
- !from_units(import).nil? or !units_field_name(import).nil?
127
+ def wants_conversion?
128
+ from_units.present? or units_field_name.present?
149
129
  end
150
- def wants_dictionary?(import)
151
- !dictionary_options(import).nil?
130
+ def wants_dictionary?
131
+ options[:dictionary].present?
152
132
  end
153
-
154
- # {
155
- # :field_name => { :default => :name, :stringify => true },
156
- # :delimiter => { :default => '", "' }
157
- # }.each do |name, options|
158
- # puts <<-EOS
159
- # def #{name}(import)
160
- # (options_for_import[import][:#{name}] || #{options[:default]})#{'.to_s' if options[:stringify]}
161
- # end
162
- # EOS
163
- # end
164
- def field_name(import)
165
- (options_for_import[import][:field_name] || name).to_s
133
+
134
+ # Options that always have values
135
+ def field_name
136
+ (options[:field_name] || name).to_s
166
137
  end
167
- def delimiter(import)
168
- (options_for_import[import][:delimiter] || ", ")
138
+ def delimiter
139
+ (options[:delimiter] || ', ')
169
140
  end
170
141
 
171
- # %w(dictionary split).each do |name|
172
- # puts <<-EOS
173
- # def #{name}_options(import)
174
- # options_for_import[import][:#{name}]
175
- # end
176
- # EOS
177
- # end
178
- def dictionary_options(import)
179
- options_for_import[import][:dictionary]
180
- end
181
- def split_options(import)
182
- options_for_import[import][:split]
142
+ # Options that can't be referred to by their names
143
+ def split_options
144
+ options[:split]
183
145
  end
184
-
146
+
147
+ # Normal options
185
148
  # %w(from_units to_units conditions sprintf nullify overwrite upcase units_field_name field_number chars static).each do |name|
186
149
  # puts <<-EOS
187
- # def #{name}(import)
188
- # options_for_import[import][:#{name}]
150
+ # def #{name}
151
+ # options[:#{name}]
189
152
  # end
190
153
  # EOS
191
154
  # end
192
- def from_units(import)
193
- options_for_import[import][:from_units]
155
+ def from_units
156
+ options[:from_units]
157
+ end
158
+ def to_units
159
+ options[:to_units]
194
160
  end
195
- def to_units(import)
196
- options_for_import[import][:to_units]
161
+ def conditions
162
+ options[:conditions]
197
163
  end
198
- def conditions(import)
199
- options_for_import[import][:conditions]
164
+ def sprintf
165
+ options[:sprintf]
200
166
  end
201
- def sprintf(import)
202
- options_for_import[import][:sprintf]
167
+ def nullify
168
+ options[:nullify]
203
169
  end
204
- def nullify(import)
205
- options_for_import[import][:nullify]
170
+ def overwrite
171
+ options[:overwrite]
206
172
  end
207
- def overwrite(import)
208
- options_for_import[import][:overwrite]
173
+ def upcase
174
+ options[:upcase]
209
175
  end
210
- def upcase(import)
211
- options_for_import[import][:upcase]
176
+ def units_field_name
177
+ options[:units_field_name]
212
178
  end
213
- def units_field_name(import)
214
- options_for_import[import][:units_field_name]
179
+ def field_number
180
+ options[:field_number]
215
181
  end
216
- def field_number(import)
217
- options_for_import[import][:field_number]
182
+ def chars
183
+ options[:chars]
218
184
  end
219
- def chars(import)
220
- options_for_import[import][:chars]
185
+ def static
186
+ options[:static]
221
187
  end
222
- def static(import)
223
- options_for_import[import][:static]
188
+ def dictionary
189
+ @_dictionary ||= Dictionary.new options[:dictionary]
224
190
  end
225
191
  end
226
192
  end