data_miner 0.3.13 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -11,12 +11,19 @@ Put this in <tt>config/environment.rb</tt>:
11
11
  You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
12
12
 
13
13
  class Country < ActiveRecord::Base
14
- data_miner do |step|
15
- # import country names and country codes
16
- step.import :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do |attr|
17
- attr.key :iso_3166, :field_name => 'country code'
18
- attr.store :iso_3166, :field_name => 'country code'
19
- attr.store :name, :field_name => 'country'
14
+ set_primary_key :iso_3166
15
+
16
+ data_miner do
17
+ import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
18
+ key 'iso_3166'
19
+ store 'iso_3166', :field_number => 1
20
+ store 'name', :field_number => 0
21
+ end
22
+
23
+ import 'A Princeton dataset with better capitalization for some countries', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
24
+ key 'iso_3166'
25
+ store 'iso_3166', :field_name => 'country code'
26
+ store 'name', :field_name => 'country'
20
27
  end
21
28
  end
22
29
  end
@@ -24,18 +31,17 @@ You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For e
24
31
  ...and in <tt>app/models/airport.rb</tt>:
25
32
 
26
33
  class Airport < ActiveRecord::Base
27
- belongs_to :country
34
+ set_primary_key :iata_code
28
35
 
29
- data_miner do |step|
30
- # import airport iata_code, name, etc.
31
- step.import(:url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false) do |attr|
32
- attr.key :iata_code, :field_number => 3
33
- attr.store :name, :field_number => 0
34
- attr.store :city, :field_number => 1
35
- attr.store :country, :field_number => 2, :foreign_key => :name # will use Country.find_by_name(X)
36
- attr.store :iata_code, :field_number => 3
37
- attr.store :latitude, :field_number => 5
38
- attr.store :longitude, :field_number => 6
36
+ data_miner do
37
+ import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
38
+ key 'iata_code'
39
+ store 'name', :field_number => 1
40
+ store 'city', :field_number => 2
41
+ store 'country_name', :field_number => 3
42
+ store 'iata_code', :field_number => 4
43
+ store 'latitude', :field_number => 6
44
+ store 'longitude', :field_number => 7
39
45
  end
40
46
  end
41
47
  end
@@ -48,30 +54,22 @@ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't kno
48
54
  end
49
55
  end
50
56
 
51
- You need to specify what order to mine data. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
52
-
53
- DataMiner.enqueue do |queue|
54
- queue << Country # class whose data should be mined 1st
55
- queue << Airport # class whose data should be mined 2nd
56
- # etc
57
- end
58
-
59
57
  Once you have defined <tt>data_miner</tt> blocks in your classes, you can run them by naming the classes to mine in <tt>RESOURCES</tt>:
60
58
 
61
- $ rake data_miner:run
59
+ $ rake data_miner:run RESOURCES=Airport,Country
62
60
 
63
61
  ==Complete example
64
62
 
65
63
  ~ $ rails testapp
66
64
  ~ $ cd testapp/
67
- ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_id:integer latitude:float longitude:float
65
+ ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_name:string latitude:float longitude:float
66
+ [...edit migration to make iata_code the primary key...]
68
67
  ~/testapp $ ./script/generate model Country iso_3166:string name:string
68
+ [...edit migration to make iso_3166 the primary key...]
69
69
  ~/testapp $ rake db:migrate
70
70
  ~/testapp $ touch lib/tasks/data_miner_tasks.rake
71
71
  [...edit per quick start...]
72
- ~/testapp $ touch config/initializers/data_miner_config.rake
73
- [...edit per quick start...]
74
- ~/testapp $ rake data_miner:run
72
+ ~/testapp $ rake data_miner:run RESOURCES=Airport,Country
75
73
 
76
74
  Now you should have
77
75
 
@@ -79,7 +77,7 @@ Now you should have
79
77
  Loading development environment (Rails 2.3.3)
80
78
  >> Airport.first.iata_code
81
79
  => "GKA"
82
- >> Airport.first.country.name
80
+ >> Airport.first.country_name
83
81
  => "Papua New Guinea"
84
82
 
85
83
  ==Authors
@@ -89,4 +87,4 @@ Now you should have
89
87
 
90
88
  ==Copyright
91
89
 
92
- Copyright (c) 2009 Brighter Planet. See LICENSE for details.
90
+ Copyright (c) 2010 Brighter Planet. See LICENSE for details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.13
1
+ 0.4.0
data/data_miner.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{data_miner}
8
- s.version = "0.3.13"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
- s.date = %q{2010-03-18}
12
+ s.date = %q{2010-03-19}
13
13
  s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
14
14
  s.email = %q{seamus@abshere.net}
15
15
  s.extra_rdoc_files = [
@@ -32,7 +32,6 @@ Gem::Specification.new do |s|
32
32
  "lib/data_miner/import.rb",
33
33
  "lib/data_miner/process.rb",
34
34
  "lib/data_miner/run.rb",
35
- "lib/data_miner/william_james_cartesian_product.rb",
36
35
  "test/data_miner_test.rb",
37
36
  "test/test_helper.rb"
38
37
  ]
data/lib/data_miner.rb CHANGED
@@ -14,9 +14,6 @@ require 'data_miner/import'
14
14
  require 'data_miner/process'
15
15
  require 'data_miner/run'
16
16
 
17
- # TODO: move to gem
18
- require 'data_miner/william_james_cartesian_product'
19
-
20
17
  module DataMiner
21
18
  class MissingHashColumn < RuntimeError; end
22
19
 
@@ -71,7 +68,6 @@ ActiveRecord::Base.class_eval do
71
68
  self.data_miner_config = DataMiner::Configuration.new self
72
69
 
73
70
  Blockenspiel.invoke block, data_miner_config
74
- data_miner_config.after_invoke
75
71
  end
76
72
  end
77
73
 
@@ -1,66 +1,68 @@
1
1
  module DataMiner
2
2
  class Attribute
3
- attr_accessor :resource, :name, :options_for_import
3
+ attr_accessor :runnable
4
+ attr_accessor :name
5
+ attr_accessor :options
4
6
 
5
- def initialize(resource, name)
6
- @resource = resource
7
+ delegate :resource, :to => :runnable
8
+
9
+ def initialize(runnable, name, options = {})
10
+ options.symbolize_keys!
11
+ @options = options
12
+
13
+ @runnable = runnable
7
14
  @name = name
8
- @options_for_import = {}
9
15
  end
10
16
 
11
17
  def inspect
12
18
  "Attribute(#{resource}##{name})"
13
19
  end
14
20
 
15
- def stored_by?(import)
16
- options_for_import.has_key?(import)
17
- end
18
-
19
- def value_in_dictionary(import, key)
20
- return *dictionary(import).lookup(key) # strip the array wrapper if there's only one element
21
+ def value_in_dictionary(str)
22
+ dictionary.lookup str
21
23
  end
22
24
 
23
- def value_in_source(import, row)
24
- if wants_static?(import)
25
- value = static(import)
26
- elsif field_number(import)
27
- if field_number(import).is_a?(Range)
28
- value = field_number(import).map { |n| row[n] }.join(delimiter(import))
25
+ def value_in_source(row)
26
+ if wants_static?
27
+ value = static
28
+ elsif field_number
29
+ if field_number.is_a?(Range)
30
+ value = field_number.map { |n| row[n] }.join(delimiter)
29
31
  else
30
- value = row[field_number(import)]
32
+ value = row[field_number]
31
33
  end
32
34
  else
33
- value = row[field_name(import)]
35
+ value = row[field_name]
34
36
  end
35
37
  return nil if value.nil?
36
38
  return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
37
39
  value = value.to_s
38
- value = value[chars(import)] if wants_chars?(import)
39
- value = do_split(import, value) if wants_split?(import)
40
+ value = value[chars] if wants_chars?
41
+ value = do_split(value) if wants_split?
40
42
  # taken from old errata... maybe we want to do this here
41
- value.gsub!(/[ ]+/, ' ')
43
+ value.gsub! /[ ]+/, ' '
42
44
  # text.gsub!('- ', '-')
43
- value.gsub!(/([^\\])~/, '\1 ')
45
+ value.gsub! /([^\\])~/, '\1 '
44
46
  value.strip!
45
- value.upcase! if wants_upcase?(import)
46
- value = do_convert(import, row, value) if wants_conversion?(import)
47
- value = do_sprintf(import, value) if wants_sprintf?(import)
47
+ value.upcase! if wants_upcase?
48
+ value = do_convert row, value if wants_conversion?
49
+ value = do_sprintf value if wants_sprintf?
48
50
  value
49
51
  end
50
52
 
51
- def value_from_row(import, row)
52
- value = value_in_source(import, row)
53
- return value if value.is_a?(ActiveRecord::Base) # carry through trapdoor
54
- value = value_in_dictionary(import, value) if wants_dictionary?(import)
53
+ def value_from_row(row)
54
+ value = value_in_source row
55
+ return value if value.is_a? ActiveRecord::Base # carry through trapdoor
56
+ value = value_in_dictionary value if wants_dictionary?
55
57
  value
56
58
  end
57
59
 
58
- # this will overwrite nils, even if wants_overwriting?(import) is false
60
+ # this will overwrite nils, even if wants_overwriting? is false
59
61
  # returns true if an attr was changed, otherwise false
60
- def set_record_from_row(import, record, row)
61
- return false if !wants_overwriting?(import) and !record.send(name).nil?
62
+ def set_record_from_row(record, row)
63
+ return false if !wants_overwriting? and !record.send(name).nil?
62
64
  what_it_was = record.send name
63
- what_it_should_be = value_from_row import, row
65
+ what_it_should_be = value_from_row row
64
66
  record.send "#{name}=", what_it_should_be
65
67
  what_it_is = record.send name
66
68
  if what_it_is.nil? and !what_it_should_be.nil?
@@ -73,26 +75,26 @@ module DataMiner
73
75
  end
74
76
  end
75
77
 
76
- def unit_from_source(import, row)
77
- row[units_field_name(import)].to_s.strip.underscore.to_sym
78
+ def unit_from_source(row)
79
+ row[units_field_name].to_s.strip.underscore.to_sym
78
80
  end
79
81
 
80
- def do_convert(import, row, value)
81
- value.to_f.convert((from_units(import) || unit_from_source(import, row)), to_units(import))
82
+ def do_convert(row, value)
83
+ value.to_f.convert((from_units || unit_from_source(row)), to_units)
82
84
  end
83
85
 
84
- def do_sprintf(import, value)
85
- if /\%[0-9\.]*f/.match(sprintf(import))
86
+ def do_sprintf(value)
87
+ if /\%[0-9\.]*f/.match sprintf
86
88
  value = value.to_f
87
- elsif /\%[0-9\.]*d/.match(sprintf(import))
89
+ elsif /\%[0-9\.]*d/.match sprintf
88
90
  value = value.to_i
89
91
  end
90
- sprintf(import) % value
92
+ sprintf % value
91
93
  end
92
94
 
93
- def do_split(import, value)
94
- pattern = split_options(import)[:pattern] || /\s+/ # default is split on whitespace
95
- keep = split_options(import)[:keep] || 0 # default is keep first element
95
+ def do_split(value)
96
+ pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
97
+ keep = split_options[:keep] || 0 # default is keep first element
96
98
  value.to_s.split(pattern)[keep].to_s
97
99
  end
98
100
 
@@ -100,127 +102,91 @@ module DataMiner
100
102
  resource.columns_hash[name.to_s].type
101
103
  end
102
104
 
103
- def dictionary(import)
104
- raise "shouldn't ask for this" unless wants_dictionary?(import) # don't try to initialize if there are no dictionary options
105
- @_dictionary ||= Dictionary.new dictionary_options(import)
106
- end
107
-
108
- # {
109
- # :static => 'options_for_import[import].has_key?(:static)',
110
- # :chars => :chars,
111
- # :upcase => :upcase,
112
- # :conversion => '!from_units(import).nil? or !units_field_name(import).nil?',
113
- # :sprintf => :sprintf,
114
- # :dictionary => :dictionary_options,
115
- # :split => :split_options,
116
- # :nullification => 'nullify(import) != false',
117
- # :overwriting => 'overwrite(import) != false',
118
- # }.each do |name, condition|
119
- # condition = "!#{condition}(import).nil?" if condition.is_a?(Symbol)
120
- # puts <<-EOS
121
- # def wants_#{name}?(import)
122
- # #{condition}
123
- # end
124
- # EOS
125
- # end
126
- def wants_split?(import)
127
- !split_options(import).nil?
105
+ # Our wants and needs :)
106
+ def wants_split?
107
+ split_options.present?
128
108
  end
129
- def wants_sprintf?(import)
130
- !sprintf(import).nil?
109
+ def wants_sprintf?
110
+ sprintf.present?
131
111
  end
132
- def wants_upcase?(import)
133
- !upcase(import).nil?
112
+ def wants_upcase?
113
+ upcase.present?
134
114
  end
135
- def wants_static?(import)
136
- options_for_import[import].has_key?(:static)
115
+ def wants_static?
116
+ options.has_key? :static
137
117
  end
138
- def wants_nullification?(import)
139
- nullify(import) != false
118
+ def wants_nullification?
119
+ nullify != false
140
120
  end
141
- def wants_chars?(import)
142
- !chars(import).nil?
121
+ def wants_chars?
122
+ chars.present?
143
123
  end
144
- def wants_overwriting?(import)
145
- overwrite(import) != false
124
+ def wants_overwriting?
125
+ overwrite != false
146
126
  end
147
- def wants_conversion?(import)
148
- !from_units(import).nil? or !units_field_name(import).nil?
127
+ def wants_conversion?
128
+ from_units.present? or units_field_name.present?
149
129
  end
150
- def wants_dictionary?(import)
151
- !dictionary_options(import).nil?
130
+ def wants_dictionary?
131
+ options[:dictionary].present?
152
132
  end
153
-
154
- # {
155
- # :field_name => { :default => :name, :stringify => true },
156
- # :delimiter => { :default => '", "' }
157
- # }.each do |name, options|
158
- # puts <<-EOS
159
- # def #{name}(import)
160
- # (options_for_import[import][:#{name}] || #{options[:default]})#{'.to_s' if options[:stringify]}
161
- # end
162
- # EOS
163
- # end
164
- def field_name(import)
165
- (options_for_import[import][:field_name] || name).to_s
133
+
134
+ # Options that always have values
135
+ def field_name
136
+ (options[:field_name] || name).to_s
166
137
  end
167
- def delimiter(import)
168
- (options_for_import[import][:delimiter] || ", ")
138
+ def delimiter
139
+ (options[:delimiter] || ', ')
169
140
  end
170
141
 
171
- # %w(dictionary split).each do |name|
172
- # puts <<-EOS
173
- # def #{name}_options(import)
174
- # options_for_import[import][:#{name}]
175
- # end
176
- # EOS
177
- # end
178
- def dictionary_options(import)
179
- options_for_import[import][:dictionary]
180
- end
181
- def split_options(import)
182
- options_for_import[import][:split]
142
+ # Options that can't be referred to by their names
143
+ def split_options
144
+ options[:split]
183
145
  end
184
-
146
+
147
+ # Normal options
185
148
  # %w(from_units to_units conditions sprintf nullify overwrite upcase units_field_name field_number chars static).each do |name|
186
149
  # puts <<-EOS
187
- # def #{name}(import)
188
- # options_for_import[import][:#{name}]
150
+ # def #{name}
151
+ # options[:#{name}]
189
152
  # end
190
153
  # EOS
191
154
  # end
192
- def from_units(import)
193
- options_for_import[import][:from_units]
155
+ def from_units
156
+ options[:from_units]
157
+ end
158
+ def to_units
159
+ options[:to_units]
194
160
  end
195
- def to_units(import)
196
- options_for_import[import][:to_units]
161
+ def conditions
162
+ options[:conditions]
197
163
  end
198
- def conditions(import)
199
- options_for_import[import][:conditions]
164
+ def sprintf
165
+ options[:sprintf]
200
166
  end
201
- def sprintf(import)
202
- options_for_import[import][:sprintf]
167
+ def nullify
168
+ options[:nullify]
203
169
  end
204
- def nullify(import)
205
- options_for_import[import][:nullify]
170
+ def overwrite
171
+ options[:overwrite]
206
172
  end
207
- def overwrite(import)
208
- options_for_import[import][:overwrite]
173
+ def upcase
174
+ options[:upcase]
209
175
  end
210
- def upcase(import)
211
- options_for_import[import][:upcase]
176
+ def units_field_name
177
+ options[:units_field_name]
212
178
  end
213
- def units_field_name(import)
214
- options_for_import[import][:units_field_name]
179
+ def field_number
180
+ options[:field_number]
215
181
  end
216
- def field_number(import)
217
- options_for_import[import][:field_number]
182
+ def chars
183
+ options[:chars]
218
184
  end
219
- def chars(import)
220
- options_for_import[import][:chars]
185
+ def static
186
+ options[:static]
221
187
  end
222
- def static(import)
223
- options_for_import[import][:static]
188
+ def dictionary
189
+ @_dictionary ||= Dictionary.new options[:dictionary]
224
190
  end
225
191
  end
226
192
  end