data_miner 0.3.13 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +30 -32
- data/VERSION +1 -1
- data/data_miner.gemspec +2 -3
- data/lib/data_miner.rb +0 -4
- data/lib/data_miner/attribute.rb +103 -137
- data/lib/data_miner/configuration.rb +7 -9
- data/lib/data_miner/dictionary.rb +1 -1
- data/lib/data_miner/import.rb +24 -29
- data/test/data_miner_test.rb +341 -505
- data/test/test_helper.rb +0 -99
- metadata +2 -3
- data/lib/data_miner/william_james_cartesian_product.rb +0 -11
data/README.rdoc
CHANGED
@@ -11,12 +11,19 @@ Put this in <tt>config/environment.rb</tt>:
|
|
11
11
|
You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>:
|
12
12
|
|
13
13
|
class Country < ActiveRecord::Base
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
14
|
+
set_primary_key :iso_3166
|
15
|
+
|
16
|
+
data_miner do
|
17
|
+
import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do
|
18
|
+
key 'iso_3166'
|
19
|
+
store 'iso_3166', :field_number => 1
|
20
|
+
store 'name', :field_number => 0
|
21
|
+
end
|
22
|
+
|
23
|
+
import 'A Princeton dataset with better capitalization for some countries', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do
|
24
|
+
key 'iso_3166'
|
25
|
+
store 'iso_3166', :field_name => 'country code'
|
26
|
+
store 'name', :field_name => 'country'
|
20
27
|
end
|
21
28
|
end
|
22
29
|
end
|
@@ -24,18 +31,17 @@ You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For e
|
|
24
31
|
...and in <tt>app/models/airport.rb</tt>:
|
25
32
|
|
26
33
|
class Airport < ActiveRecord::Base
|
27
|
-
|
34
|
+
set_primary_key :iata_code
|
28
35
|
|
29
|
-
data_miner do
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
attr.store :longitude, :field_number => 6
|
36
|
+
data_miner do
|
37
|
+
import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do
|
38
|
+
key 'iata_code'
|
39
|
+
store 'name', :field_number => 1
|
40
|
+
store 'city', :field_number => 2
|
41
|
+
store 'country_name', :field_number => 3
|
42
|
+
store 'iata_code', :field_number => 4
|
43
|
+
store 'latitude', :field_number => 6
|
44
|
+
store 'longitude', :field_number => 7
|
39
45
|
end
|
40
46
|
end
|
41
47
|
end
|
@@ -48,30 +54,22 @@ Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't kno
|
|
48
54
|
end
|
49
55
|
end
|
50
56
|
|
51
|
-
You need to specify what order to mine data. For example, in <tt>config/initializers/data_miner_config.rb</tt>:
|
52
|
-
|
53
|
-
DataMiner.enqueue do |queue|
|
54
|
-
queue << Country # class whose data should be mined 1st
|
55
|
-
queue << Airport # class whose data should be mined 2nd
|
56
|
-
# etc
|
57
|
-
end
|
58
|
-
|
59
57
|
Once you have (1) set up the order of data mining and (2) defined <tt>data_miner</tt> blocks in your classes, you can:
|
60
58
|
|
61
|
-
$ rake data_miner:run
|
59
|
+
$ rake data_miner:run RESOURCES=Airport,Country
|
62
60
|
|
63
61
|
==Complete example
|
64
62
|
|
65
63
|
~ $ rails testapp
|
66
64
|
~ $ cd testapp/
|
67
|
-
~/testapp $ ./script/generate model Airport iata_code:string name:string city:string
|
65
|
+
~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_name:string latitude:float longitude:float
|
66
|
+
[...edit migration to make iata_code the primary key...]
|
68
67
|
~/testapp $ ./script/generate model Country iso_3166:string name:string
|
68
|
+
[...edit migration to make iso_3166 the primary key...]
|
69
69
|
~/testapp $ rake db:migrate
|
70
70
|
~/testapp $ touch lib/tasks/data_miner_tasks.rb
|
71
71
|
[...edit per quick start...]
|
72
|
-
~/testapp $
|
73
|
-
[...edit per quick start...]
|
74
|
-
~/testapp $ rake data_miner:run
|
72
|
+
~/testapp $ rake data_miner:run RESOURCES=Airport,Country
|
75
73
|
|
76
74
|
Now you should have
|
77
75
|
|
@@ -79,7 +77,7 @@ Now you should have
|
|
79
77
|
Loading development environment (Rails 2.3.3)
|
80
78
|
>> Airport.first.iata_code
|
81
79
|
=> "GKA"
|
82
|
-
>> Airport.first.
|
80
|
+
>> Airport.first.country_name
|
83
81
|
=> "Papua New Guinea"
|
84
82
|
|
85
83
|
==Authors
|
@@ -89,4 +87,4 @@ Now you should have
|
|
89
87
|
|
90
88
|
==Copyright
|
91
89
|
|
92
|
-
Copyright (c)
|
90
|
+
Copyright (c) 2010 Brighter Planet. See LICENSE for details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.13
|
1
|
+
0.4.0
|
data/data_miner.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{data_miner}
|
8
|
-
s.version = "0.3.13"
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
12
|
-
s.date = %q{2010-03-
|
12
|
+
s.date = %q{2010-03-19}
|
13
13
|
s.description = %q{Mine remote data into your ActiveRecord models. You can also perform associations and convert units.}
|
14
14
|
s.email = %q{seamus@abshere.net}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -32,7 +32,6 @@ Gem::Specification.new do |s|
|
|
32
32
|
"lib/data_miner/import.rb",
|
33
33
|
"lib/data_miner/process.rb",
|
34
34
|
"lib/data_miner/run.rb",
|
35
|
-
"lib/data_miner/william_james_cartesian_product.rb",
|
36
35
|
"test/data_miner_test.rb",
|
37
36
|
"test/test_helper.rb"
|
38
37
|
]
|
data/lib/data_miner.rb
CHANGED
@@ -14,9 +14,6 @@ require 'data_miner/import'
|
|
14
14
|
require 'data_miner/process'
|
15
15
|
require 'data_miner/run'
|
16
16
|
|
17
|
-
# TODO: move to gem
|
18
|
-
require 'data_miner/william_james_cartesian_product'
|
19
|
-
|
20
17
|
module DataMiner
|
21
18
|
class MissingHashColumn < RuntimeError; end
|
22
19
|
|
@@ -71,7 +68,6 @@ ActiveRecord::Base.class_eval do
|
|
71
68
|
self.data_miner_config = DataMiner::Configuration.new self
|
72
69
|
|
73
70
|
Blockenspiel.invoke block, data_miner_config
|
74
|
-
data_miner_config.after_invoke
|
75
71
|
end
|
76
72
|
end
|
77
73
|
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -1,66 +1,68 @@
|
|
1
1
|
module DataMiner
|
2
2
|
class Attribute
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :runnable
|
4
|
+
attr_accessor :name
|
5
|
+
attr_accessor :options
|
4
6
|
|
5
|
-
|
6
|
-
|
7
|
+
delegate :resource, :to => :runnable
|
8
|
+
|
9
|
+
def initialize(runnable, name, options = {})
|
10
|
+
options.symbolize_keys!
|
11
|
+
@options = options
|
12
|
+
|
13
|
+
@runnable = runnable
|
7
14
|
@name = name
|
8
|
-
@options_for_import = {}
|
9
15
|
end
|
10
16
|
|
11
17
|
def inspect
|
12
18
|
"Attribute(#{resource}##{name})"
|
13
19
|
end
|
14
20
|
|
15
|
-
def
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
def value_in_dictionary(import, key)
|
20
|
-
return *dictionary(import).lookup(key) # strip the array wrapper if there's only one element
|
21
|
+
def value_in_dictionary(str)
|
22
|
+
dictionary.lookup str
|
21
23
|
end
|
22
24
|
|
23
|
-
def value_in_source(
|
24
|
-
if wants_static?
|
25
|
-
value = static
|
26
|
-
elsif field_number
|
27
|
-
if field_number
|
28
|
-
value = field_number
|
25
|
+
def value_in_source(row)
|
26
|
+
if wants_static?
|
27
|
+
value = static
|
28
|
+
elsif field_number
|
29
|
+
if field_number.is_a?(Range)
|
30
|
+
value = field_number.map { |n| row[n] }.join(delimiter)
|
29
31
|
else
|
30
|
-
value = row[field_number
|
32
|
+
value = row[field_number]
|
31
33
|
end
|
32
34
|
else
|
33
|
-
value = row[field_name
|
35
|
+
value = row[field_name]
|
34
36
|
end
|
35
37
|
return nil if value.nil?
|
36
38
|
return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
|
37
39
|
value = value.to_s
|
38
|
-
value = value[chars
|
39
|
-
value = do_split(
|
40
|
+
value = value[chars] if wants_chars?
|
41
|
+
value = do_split(value) if wants_split?
|
40
42
|
# taken from old errata... maybe we want to do this here
|
41
|
-
value.gsub!
|
43
|
+
value.gsub! /[ ]+/, ' '
|
42
44
|
# text.gsub!('- ', '-')
|
43
|
-
value.gsub!
|
45
|
+
value.gsub! /([^\\])~/, '\1 '
|
44
46
|
value.strip!
|
45
|
-
value.upcase! if wants_upcase?
|
46
|
-
value = do_convert
|
47
|
-
value = do_sprintf
|
47
|
+
value.upcase! if wants_upcase?
|
48
|
+
value = do_convert row, value if wants_conversion?
|
49
|
+
value = do_sprintf value if wants_sprintf?
|
48
50
|
value
|
49
51
|
end
|
50
52
|
|
51
|
-
def value_from_row(
|
52
|
-
value = value_in_source
|
53
|
-
return value if value.is_a?
|
54
|
-
value = value_in_dictionary
|
53
|
+
def value_from_row(row)
|
54
|
+
value = value_in_source row
|
55
|
+
return value if value.is_a? ActiveRecord::Base # carry through trapdoor
|
56
|
+
value = value_in_dictionary value if wants_dictionary?
|
55
57
|
value
|
56
58
|
end
|
57
59
|
|
58
|
-
# this will overwrite nils, even if wants_overwriting?
|
60
|
+
# this will overwrite nils, even if wants_overwriting? is false
|
59
61
|
# returns true if an attr was changed, otherwise false
|
60
|
-
def set_record_from_row(
|
61
|
-
return false if !wants_overwriting?
|
62
|
+
def set_record_from_row(record, row)
|
63
|
+
return false if !wants_overwriting? and !record.send(name).nil?
|
62
64
|
what_it_was = record.send name
|
63
|
-
what_it_should_be = value_from_row
|
65
|
+
what_it_should_be = value_from_row row
|
64
66
|
record.send "#{name}=", what_it_should_be
|
65
67
|
what_it_is = record.send name
|
66
68
|
if what_it_is.nil? and !what_it_should_be.nil?
|
@@ -73,26 +75,26 @@ module DataMiner
|
|
73
75
|
end
|
74
76
|
end
|
75
77
|
|
76
|
-
def unit_from_source(
|
77
|
-
row[units_field_name
|
78
|
+
def unit_from_source(row)
|
79
|
+
row[units_field_name].to_s.strip.underscore.to_sym
|
78
80
|
end
|
79
81
|
|
80
|
-
def do_convert(
|
81
|
-
value.to_f.convert((from_units
|
82
|
+
def do_convert(row, value)
|
83
|
+
value.to_f.convert((from_units || unit_from_source(row)), to_units)
|
82
84
|
end
|
83
85
|
|
84
|
-
def do_sprintf(
|
85
|
-
if /\%[0-9\.]*f/.match
|
86
|
+
def do_sprintf(value)
|
87
|
+
if /\%[0-9\.]*f/.match sprintf
|
86
88
|
value = value.to_f
|
87
|
-
elsif /\%[0-9\.]*d/.match
|
89
|
+
elsif /\%[0-9\.]*d/.match sprintf
|
88
90
|
value = value.to_i
|
89
91
|
end
|
90
|
-
sprintf
|
92
|
+
sprintf % value
|
91
93
|
end
|
92
94
|
|
93
|
-
def do_split(
|
94
|
-
pattern = split_options
|
95
|
-
keep = split_options
|
95
|
+
def do_split(value)
|
96
|
+
pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
|
97
|
+
keep = split_options[:keep] || 0 # default is keep first element
|
96
98
|
value.to_s.split(pattern)[keep].to_s
|
97
99
|
end
|
98
100
|
|
@@ -100,127 +102,91 @@ module DataMiner
|
|
100
102
|
resource.columns_hash[name.to_s].type
|
101
103
|
end
|
102
104
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
end
|
107
|
-
|
108
|
-
# {
|
109
|
-
# :static => 'options_for_import[import].has_key?(:static)',
|
110
|
-
# :chars => :chars,
|
111
|
-
# :upcase => :upcase,
|
112
|
-
# :conversion => '!from_units(import).nil? or !units_field_name(import).nil?',
|
113
|
-
# :sprintf => :sprintf,
|
114
|
-
# :dictionary => :dictionary_options,
|
115
|
-
# :split => :split_options,
|
116
|
-
# :nullification => 'nullify(import) != false',
|
117
|
-
# :overwriting => 'overwrite(import) != false',
|
118
|
-
# }.each do |name, condition|
|
119
|
-
# condition = "!#{condition}(import).nil?" if condition.is_a?(Symbol)
|
120
|
-
# puts <<-EOS
|
121
|
-
# def wants_#{name}?(import)
|
122
|
-
# #{condition}
|
123
|
-
# end
|
124
|
-
# EOS
|
125
|
-
# end
|
126
|
-
def wants_split?(import)
|
127
|
-
!split_options(import).nil?
|
105
|
+
# Our wants and needs :)
|
106
|
+
def wants_split?
|
107
|
+
split_options.present?
|
128
108
|
end
|
129
|
-
def wants_sprintf?
|
130
|
-
|
109
|
+
def wants_sprintf?
|
110
|
+
sprintf.present?
|
131
111
|
end
|
132
|
-
def wants_upcase?
|
133
|
-
|
112
|
+
def wants_upcase?
|
113
|
+
upcase.present?
|
134
114
|
end
|
135
|
-
def wants_static?
|
136
|
-
|
115
|
+
def wants_static?
|
116
|
+
options.has_key? :static
|
137
117
|
end
|
138
|
-
def wants_nullification?
|
139
|
-
nullify
|
118
|
+
def wants_nullification?
|
119
|
+
nullify != false
|
140
120
|
end
|
141
|
-
def wants_chars?
|
142
|
-
|
121
|
+
def wants_chars?
|
122
|
+
chars.present?
|
143
123
|
end
|
144
|
-
def wants_overwriting?
|
145
|
-
overwrite
|
124
|
+
def wants_overwriting?
|
125
|
+
overwrite != false
|
146
126
|
end
|
147
|
-
def wants_conversion?
|
148
|
-
|
127
|
+
def wants_conversion?
|
128
|
+
from_units.present? or units_field_name.present?
|
149
129
|
end
|
150
|
-
def wants_dictionary?
|
151
|
-
|
130
|
+
def wants_dictionary?
|
131
|
+
options[:dictionary].present?
|
152
132
|
end
|
153
|
-
|
154
|
-
#
|
155
|
-
|
156
|
-
|
157
|
-
# }.each do |name, options|
|
158
|
-
# puts <<-EOS
|
159
|
-
# def #{name}(import)
|
160
|
-
# (options_for_import[import][:#{name}] || #{options[:default]})#{'.to_s' if options[:stringify]}
|
161
|
-
# end
|
162
|
-
# EOS
|
163
|
-
# end
|
164
|
-
def field_name(import)
|
165
|
-
(options_for_import[import][:field_name] || name).to_s
|
133
|
+
|
134
|
+
# Options that always have values
|
135
|
+
def field_name
|
136
|
+
(options[:field_name] || name).to_s
|
166
137
|
end
|
167
|
-
def delimiter
|
168
|
-
(
|
138
|
+
def delimiter
|
139
|
+
(options[:delimiter] || ', ')
|
169
140
|
end
|
170
141
|
|
171
|
-
#
|
172
|
-
|
173
|
-
|
174
|
-
# options_for_import[import][:#{name}]
|
175
|
-
# end
|
176
|
-
# EOS
|
177
|
-
# end
|
178
|
-
def dictionary_options(import)
|
179
|
-
options_for_import[import][:dictionary]
|
180
|
-
end
|
181
|
-
def split_options(import)
|
182
|
-
options_for_import[import][:split]
|
142
|
+
# Options that can't be referred to by their names
|
143
|
+
def split_options
|
144
|
+
options[:split]
|
183
145
|
end
|
184
|
-
|
146
|
+
|
147
|
+
# Normal options
|
185
148
|
# %w(from_units to_units conditions sprintf nullify overwrite upcase units_field_name field_number chars static).each do |name|
|
186
149
|
# puts <<-EOS
|
187
|
-
# def #{name}
|
188
|
-
#
|
150
|
+
# def #{name}
|
151
|
+
# options[:#{name}]
|
189
152
|
# end
|
190
153
|
# EOS
|
191
154
|
# end
|
192
|
-
def from_units
|
193
|
-
|
155
|
+
def from_units
|
156
|
+
options[:from_units]
|
157
|
+
end
|
158
|
+
def to_units
|
159
|
+
options[:to_units]
|
194
160
|
end
|
195
|
-
def
|
196
|
-
|
161
|
+
def conditions
|
162
|
+
options[:conditions]
|
197
163
|
end
|
198
|
-
def
|
199
|
-
|
164
|
+
def sprintf
|
165
|
+
options[:sprintf]
|
200
166
|
end
|
201
|
-
def
|
202
|
-
|
167
|
+
def nullify
|
168
|
+
options[:nullify]
|
203
169
|
end
|
204
|
-
def
|
205
|
-
|
170
|
+
def overwrite
|
171
|
+
options[:overwrite]
|
206
172
|
end
|
207
|
-
def
|
208
|
-
|
173
|
+
def upcase
|
174
|
+
options[:upcase]
|
209
175
|
end
|
210
|
-
def
|
211
|
-
|
176
|
+
def units_field_name
|
177
|
+
options[:units_field_name]
|
212
178
|
end
|
213
|
-
def
|
214
|
-
|
179
|
+
def field_number
|
180
|
+
options[:field_number]
|
215
181
|
end
|
216
|
-
def
|
217
|
-
|
182
|
+
def chars
|
183
|
+
options[:chars]
|
218
184
|
end
|
219
|
-
def
|
220
|
-
|
185
|
+
def static
|
186
|
+
options[:static]
|
221
187
|
end
|
222
|
-
def
|
223
|
-
|
188
|
+
def dictionary
|
189
|
+
@_dictionary ||= Dictionary.new options[:dictionary]
|
224
190
|
end
|
225
191
|
end
|
226
192
|
end
|