data_miner 0.3.13 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +30 -32
- data/VERSION +1 -1
- data/data_miner.gemspec +2 -3
- data/lib/data_miner.rb +0 -4
- data/lib/data_miner/attribute.rb +103 -137
- data/lib/data_miner/configuration.rb +7 -9
- data/lib/data_miner/dictionary.rb +1 -1
- data/lib/data_miner/import.rb +24 -29
- data/test/data_miner_test.rb +341 -505
- data/test/test_helper.rb +0 -99
- metadata +2 -3
- data/lib/data_miner/william_james_cartesian_product.rb +0 -11
data/README.rdoc
CHANGED
@@ -11,12 +11,19 @@ Put this in <tt>config/environment.rb</tt>:
|
|
11
11
|
You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
|
12
12
|
|
13
13
|
class Country < ActiveRecord::Base
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
14
|
+
set_primary_key :iso_3166
|
15
|
+
|
16
|
+
data_miner do
|
17
|
+
import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
|
18
|
+
key 'iso_3166'
|
19
|
+
store 'iso_3166', :field_number => 1
|
20
|
+
store 'name', :field_number => 0
|
21
|
+
end
|
22
|
+
|
23
|
+
import 'A Princeton dataset with better capitalization for some countries', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
|
24
|
+
key 'iso_3166'
|
25
|
+
store 'iso_3166', :field_name => 'country code'
|
26
|
+
store 'name', :field_name => 'country'
|
20
27
|
end
|
21
28
|
end
|
22
29
|
end
|
@@ -24,18 +31,17 @@ You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For e
|
|
24
31
|
...and in <tt>app/models/airport.rb</tt>:
|
25
32
|
|
26
33
|
class Airport < ActiveRecord::Base
|
27
|
-
|
34
|
+
set_primary_key :iata_code
|
28
35
|
|
29
|
-
data_miner do
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
attr.store :longitude, :field_number => 6
|
36
|
+
data_miner do
|
37
|
+
import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
|
38
|
+
key 'iata_code'
|
39
|
+
store 'name', :field_number => 1
|
40
|
+
store 'city', :field_number => 2
|
41
|
+
store 'country_name', :field_number => 3
|
42
|
+
store 'iata_code', :field_number => 4
|
43
|
+
store 'latitude', :field_number => 6
|
44
|
+
store 'longitude', :field_number => 7
|
39
45
|
end
|
40
46
|
end
|
41
47
|
end
|
@@ -48,30 +54,22 @@ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't kno
|
|
48
54
|
end
|
49
55
|
end
|
50
56
|
|
51
|
-
You need to specify what order to mine data. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
|
52
|
-
|
53
|
-
DataMiner.enqueue do |queue|
|
54
|
-
queue << Country # class whose data should be mined 1st
|
55
|
-
queue << Airport # class whose data should be mined 2nd
|
56
|
-
# etc
|
57
|
-
end
|
58
|
-
|
59
57
|
Once you have defined <tt>data_miner</tt> blocks in your classes, you can:
|
60
58
|
|
61
|
-
$ rake data_miner:run
|
59
|
+
$ rake data_miner:run RESOURCES=Airport,Country
|
62
60
|
|
63
61
|
==Complete example
|
64
62
|
|
65
63
|
~ $ rails testapp
|
66
64
|
~ $ cd testapp/
|
67
|
-
~/testapp $ ./script/generate model Airport iata_code:string name:string city:string
|
65
|
+
~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_name:string latitude:float longitude:float
|
66
|
+
[...edit migration to make iata_code the primary key...]
|
68
67
|
~/testapp $ ./script/generate model Country iso_3166:string name:string
|
68
|
+
[...edit migration to make iso_3166 the primary key...]
|
69
69
|
~/testapp $ rake db:migrate
|
70
70
|
~/testapp $ touch lib/tasks/data_miner_tasks.rake
|
71
71
|
[...edit per quick start...]
|
72
|
-
~/testapp $
|
73
|
-
[...edit per quick start...]
|
74
|
-
~/testapp $ rake data_miner:run
|
72
|
+
~/testapp $ rake data_miner:run RESOURCES=Airport,Country
|
75
73
|
|
76
74
|
Now you should have
|
77
75
|
|
@@ -79,7 +77,7 @@ Now you should have
|
|
79
77
|
Loading development environment (Rails 2.3.3)
|
80
78
|
>> Airport.first.iata_code
|
81
79
|
=> "GKA"
|
82
|
-
>> Airport.first.
|
80
|
+
>> Airport.first.country_name
|
83
81
|
=> "Papua New Guinea"
|
84
82
|
|
85
83
|
==Authors
|
@@ -89,4 +87,4 @@ Now you should have
|
|
89
87
|
|
90
88
|
==Copyright
|
91
89
|
|
92
|
-
Copyright (c)
|
90
|
+
Copyright (c) 2010 Brighter Planet. See LICENSE for details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.4.0
|
data/data_miner.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{data_miner}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
12
|
-
s.date = %q{2010-03-
|
12
|
+
s.date = %q{2010-03-19}
|
13
13
|
s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
|
14
14
|
s.email = %q{seamus@abshere.net}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -32,7 +32,6 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/data_miner/import.rb",
|
33
33
|
"lib/data_miner/process.rb",
|
34
34
|
"lib/data_miner/run.rb",
|
35
|
-
"lib/data_miner/william_james_cartesian_product.rb",
|
36
35
|
"test/data_miner_test.rb",
|
37
36
|
"test/test_helper.rb"
|
38
37
|
]
|
data/lib/data_miner.rb
CHANGED
@@ -14,9 +14,6 @@ require 'data_miner/import'
|
|
14
14
|
require 'data_miner/process'
|
15
15
|
require 'data_miner/run'
|
16
16
|
|
17
|
-
# TODO: move to gem
|
18
|
-
require 'data_miner/william_james_cartesian_product'
|
19
|
-
|
20
17
|
module DataMiner
|
21
18
|
class MissingHashColumn < RuntimeError; end
|
22
19
|
|
@@ -71,7 +68,6 @@ ActiveRecord::Base.class_eval do
|
|
71
68
|
self.data_miner_config = DataMiner::Configuration.new self
|
72
69
|
|
73
70
|
Blockenspiel.invoke block, data_miner_config
|
74
|
-
data_miner_config.after_invoke
|
75
71
|
end
|
76
72
|
end
|
77
73
|
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -1,66 +1,68 @@
|
|
1
1
|
module DataMiner
|
2
2
|
class Attribute
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :runnable
|
4
|
+
attr_accessor :name
|
5
|
+
attr_accessor :options
|
4
6
|
|
5
|
-
|
6
|
-
|
7
|
+
delegate :resource, :to => :runnable
|
8
|
+
|
9
|
+
def initialize(runnable, name, options = {})
|
10
|
+
options.symbolize_keys!
|
11
|
+
@options = options
|
12
|
+
|
13
|
+
@runnable = runnable
|
7
14
|
@name = name
|
8
|
-
@options_for_import = {}
|
9
15
|
end
|
10
16
|
|
11
17
|
def inspect
|
12
18
|
"Attribute(#{resource}##{name})"
|
13
19
|
end
|
14
20
|
|
15
|
-
def
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
def value_in_dictionary(import, key)
|
20
|
-
return *dictionary(import).lookup(key) # strip the array wrapper if there's only one element
|
21
|
+
def value_in_dictionary(str)
|
22
|
+
dictionary.lookup str
|
21
23
|
end
|
22
24
|
|
23
|
-
def value_in_source(
|
24
|
-
if wants_static?
|
25
|
-
value = static
|
26
|
-
elsif field_number
|
27
|
-
if field_number
|
28
|
-
value = field_number
|
25
|
+
def value_in_source(row)
|
26
|
+
if wants_static?
|
27
|
+
value = static
|
28
|
+
elsif field_number
|
29
|
+
if field_number.is_a?(Range)
|
30
|
+
value = field_number.map { |n| row[n] }.join(delimiter)
|
29
31
|
else
|
30
|
-
value = row[field_number
|
32
|
+
value = row[field_number]
|
31
33
|
end
|
32
34
|
else
|
33
|
-
value = row[field_name
|
35
|
+
value = row[field_name]
|
34
36
|
end
|
35
37
|
return nil if value.nil?
|
36
38
|
return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
|
37
39
|
value = value.to_s
|
38
|
-
value = value[chars
|
39
|
-
value = do_split(
|
40
|
+
value = value[chars] if wants_chars?
|
41
|
+
value = do_split(value) if wants_split?
|
40
42
|
# taken from old errata... maybe we want to do this here
|
41
|
-
value.gsub!
|
43
|
+
value.gsub! /[ ]+/, ' '
|
42
44
|
# text.gsub!('- ', '-')
|
43
|
-
value.gsub!
|
45
|
+
value.gsub! /([^\\])~/, '\1 '
|
44
46
|
value.strip!
|
45
|
-
value.upcase! if wants_upcase?
|
46
|
-
value = do_convert
|
47
|
-
value = do_sprintf
|
47
|
+
value.upcase! if wants_upcase?
|
48
|
+
value = do_convert row, value if wants_conversion?
|
49
|
+
value = do_sprintf value if wants_sprintf?
|
48
50
|
value
|
49
51
|
end
|
50
52
|
|
51
|
-
def value_from_row(
|
52
|
-
value = value_in_source
|
53
|
-
return value if value.is_a?
|
54
|
-
value = value_in_dictionary
|
53
|
+
def value_from_row(row)
|
54
|
+
value = value_in_source row
|
55
|
+
return value if value.is_a? ActiveRecord::Base # carry through trapdoor
|
56
|
+
value = value_in_dictionary value if wants_dictionary?
|
55
57
|
value
|
56
58
|
end
|
57
59
|
|
58
|
-
# this will overwrite nils, even if wants_overwriting?
|
60
|
+
# this will overwrite nils, even if wants_overwriting? is false
|
59
61
|
# returns true if an attr was changed, otherwise false
|
60
|
-
def set_record_from_row(
|
61
|
-
return false if !wants_overwriting?
|
62
|
+
def set_record_from_row(record, row)
|
63
|
+
return false if !wants_overwriting? and !record.send(name).nil?
|
62
64
|
what_it_was = record.send name
|
63
|
-
what_it_should_be = value_from_row
|
65
|
+
what_it_should_be = value_from_row row
|
64
66
|
record.send "#{name}=", what_it_should_be
|
65
67
|
what_it_is = record.send name
|
66
68
|
if what_it_is.nil? and !what_it_should_be.nil?
|
@@ -73,26 +75,26 @@ module DataMiner
|
|
73
75
|
end
|
74
76
|
end
|
75
77
|
|
76
|
-
def unit_from_source(
|
77
|
-
row[units_field_name
|
78
|
+
def unit_from_source(row)
|
79
|
+
row[units_field_name].to_s.strip.underscore.to_sym
|
78
80
|
end
|
79
81
|
|
80
|
-
def do_convert(
|
81
|
-
value.to_f.convert((from_units
|
82
|
+
def do_convert(row, value)
|
83
|
+
value.to_f.convert((from_units || unit_from_source(row)), to_units)
|
82
84
|
end
|
83
85
|
|
84
|
-
def do_sprintf(
|
85
|
-
if /\%[0-9\.]*f/.match
|
86
|
+
def do_sprintf(value)
|
87
|
+
if /\%[0-9\.]*f/.match sprintf
|
86
88
|
value = value.to_f
|
87
|
-
elsif /\%[0-9\.]*d/.match
|
89
|
+
elsif /\%[0-9\.]*d/.match sprintf
|
88
90
|
value = value.to_i
|
89
91
|
end
|
90
|
-
sprintf
|
92
|
+
sprintf % value
|
91
93
|
end
|
92
94
|
|
93
|
-
def do_split(
|
94
|
-
pattern = split_options
|
95
|
-
keep = split_options
|
95
|
+
def do_split(value)
|
96
|
+
pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
|
97
|
+
keep = split_options[:keep] || 0 # default is keep first element
|
96
98
|
value.to_s.split(pattern)[keep].to_s
|
97
99
|
end
|
98
100
|
|
@@ -100,127 +102,91 @@ module DataMiner
|
|
100
102
|
resource.columns_hash[name.to_s].type
|
101
103
|
end
|
102
104
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
end
|
107
|
-
|
108
|
-
# {
|
109
|
-
# :static => 'options_for_import[import].has_key?(:static)',
|
110
|
-
# :chars => :chars,
|
111
|
-
# :upcase => :upcase,
|
112
|
-
# :conversion => '!from_units(import).nil? or !units_field_name(import).nil?',
|
113
|
-
# :sprintf => :sprintf,
|
114
|
-
# :dictionary => :dictionary_options,
|
115
|
-
# :split => :split_options,
|
116
|
-
# :nullification => 'nullify(import) != false',
|
117
|
-
# :overwriting => 'overwrite(import) != false',
|
118
|
-
# }.each do |name, condition|
|
119
|
-
# condition = "!#{condition}(import).nil?" if condition.is_a?(Symbol)
|
120
|
-
# puts <<-EOS
|
121
|
-
# def wants_#{name}?(import)
|
122
|
-
# #{condition}
|
123
|
-
# end
|
124
|
-
# EOS
|
125
|
-
# end
|
126
|
-
def wants_split?(import)
|
127
|
-
!split_options(import).nil?
|
105
|
+
# Our wants and needs :)
|
106
|
+
def wants_split?
|
107
|
+
split_options.present?
|
128
108
|
end
|
129
|
-
def wants_sprintf?
|
130
|
-
|
109
|
+
def wants_sprintf?
|
110
|
+
sprintf.present?
|
131
111
|
end
|
132
|
-
def wants_upcase?
|
133
|
-
|
112
|
+
def wants_upcase?
|
113
|
+
upcase.present?
|
134
114
|
end
|
135
|
-
def wants_static?
|
136
|
-
|
115
|
+
def wants_static?
|
116
|
+
options.has_key? :static
|
137
117
|
end
|
138
|
-
def wants_nullification?
|
139
|
-
nullify
|
118
|
+
def wants_nullification?
|
119
|
+
nullify != false
|
140
120
|
end
|
141
|
-
def wants_chars?
|
142
|
-
|
121
|
+
def wants_chars?
|
122
|
+
chars.present?
|
143
123
|
end
|
144
|
-
def wants_overwriting?
|
145
|
-
overwrite
|
124
|
+
def wants_overwriting?
|
125
|
+
overwrite != false
|
146
126
|
end
|
147
|
-
def wants_conversion?
|
148
|
-
|
127
|
+
def wants_conversion?
|
128
|
+
from_units.present? or units_field_name.present?
|
149
129
|
end
|
150
|
-
def wants_dictionary?
|
151
|
-
|
130
|
+
def wants_dictionary?
|
131
|
+
options[:dictionary].present?
|
152
132
|
end
|
153
|
-
|
154
|
-
#
|
155
|
-
|
156
|
-
|
157
|
-
# }.each do |name, options|
|
158
|
-
# puts <<-EOS
|
159
|
-
# def #{name}(import)
|
160
|
-
# (options_for_import[import][:#{name}] || #{options[:default]})#{'.to_s' if options[:stringify]}
|
161
|
-
# end
|
162
|
-
# EOS
|
163
|
-
# end
|
164
|
-
def field_name(import)
|
165
|
-
(options_for_import[import][:field_name] || name).to_s
|
133
|
+
|
134
|
+
# Options that always have values
|
135
|
+
def field_name
|
136
|
+
(options[:field_name] || name).to_s
|
166
137
|
end
|
167
|
-
def delimiter
|
168
|
-
(
|
138
|
+
def delimiter
|
139
|
+
(options[:delimiter] || ', ')
|
169
140
|
end
|
170
141
|
|
171
|
-
#
|
172
|
-
|
173
|
-
|
174
|
-
# options_for_import[import][:#{name}]
|
175
|
-
# end
|
176
|
-
# EOS
|
177
|
-
# end
|
178
|
-
def dictionary_options(import)
|
179
|
-
options_for_import[import][:dictionary]
|
180
|
-
end
|
181
|
-
def split_options(import)
|
182
|
-
options_for_import[import][:split]
|
142
|
+
# Options that can't be referred to by their names
|
143
|
+
def split_options
|
144
|
+
options[:split]
|
183
145
|
end
|
184
|
-
|
146
|
+
|
147
|
+
# Normal options
|
185
148
|
# %w(from_units to_units conditions sprintf nullify overwrite upcase units_field_name field_number chars static).each do |name|
|
186
149
|
# puts <<-EOS
|
187
|
-
# def #{name}
|
188
|
-
#
|
150
|
+
# def #{name}
|
151
|
+
# options[:#{name}]
|
189
152
|
# end
|
190
153
|
# EOS
|
191
154
|
# end
|
192
|
-
def from_units
|
193
|
-
|
155
|
+
def from_units
|
156
|
+
options[:from_units]
|
157
|
+
end
|
158
|
+
def to_units
|
159
|
+
options[:to_units]
|
194
160
|
end
|
195
|
-
def
|
196
|
-
|
161
|
+
def conditions
|
162
|
+
options[:conditions]
|
197
163
|
end
|
198
|
-
def
|
199
|
-
|
164
|
+
def sprintf
|
165
|
+
options[:sprintf]
|
200
166
|
end
|
201
|
-
def
|
202
|
-
|
167
|
+
def nullify
|
168
|
+
options[:nullify]
|
203
169
|
end
|
204
|
-
def
|
205
|
-
|
170
|
+
def overwrite
|
171
|
+
options[:overwrite]
|
206
172
|
end
|
207
|
-
def
|
208
|
-
|
173
|
+
def upcase
|
174
|
+
options[:upcase]
|
209
175
|
end
|
210
|
-
def
|
211
|
-
|
176
|
+
def units_field_name
|
177
|
+
options[:units_field_name]
|
212
178
|
end
|
213
|
-
def
|
214
|
-
|
179
|
+
def field_number
|
180
|
+
options[:field_number]
|
215
181
|
end
|
216
|
-
def
|
217
|
-
|
182
|
+
def chars
|
183
|
+
options[:chars]
|
218
184
|
end
|
219
|
-
def
|
220
|
-
|
185
|
+
def static
|
186
|
+
options[:static]
|
221
187
|
end
|
222
|
-
def
|
223
|
-
|
188
|
+
def dictionary
|
189
|
+
@_dictionary ||= Dictionary.new options[:dictionary]
|
224
190
|
end
|
225
191
|
end
|
226
192
|
end
|