data_miner 2.3.4 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -0
- data/README.markdown +13 -0
- data/data_miner.gemspec +3 -0
- data/lib/data_miner/active_record_class_methods.rb +2 -1
- data/lib/data_miner/attribute.rb +8 -0
- data/lib/data_miner/run.rb +5 -3
- data/lib/data_miner/script.rb +39 -32
- data/lib/data_miner/step/import.rb +16 -12
- data/lib/data_miner/step/process.rb +1 -4
- data/lib/data_miner/step/sql.rb +117 -0
- data/lib/data_miner/step.rb +3 -0
- data/lib/data_miner/version.rb +1 -1
- data/lib/data_miner.rb +3 -2
- data/test/data_miner/step/test_sql.rb +38 -0
- data/test/helper.rb +2 -0
- data/test/support/pet3.rb +9 -0
- data/test/test_data_miner.rb +8 -0
- metadata +55 -5
- data/lib/data_miner/step/tap.rb +0 -167
- data/test/test_earth_tap.rb +0 -26
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
2.4.0 / 2012-07-26
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Entirely remove taps support - it doesn't preserve foreign key constraints and adds a lot of complexity
|
6
|
+
|
7
|
+
* Enhancements
|
8
|
+
|
9
|
+
* New "sql" step - executes SQL provided as a string OR a local/remote SQL file provided as a URL
|
10
|
+
* Die with a useful error message if a column specified in an import step doesn't exist - thanks @chrisle https://github.com/seamusabshere/data_miner/issues/17
|
11
|
+
* Allow setting :validate => true on import steps - thanks @chrisle https://github.com/seamusabshere/data_miner/issues/18
|
12
|
+
|
1
13
|
2.3.4 / 2012-07-06
|
2
14
|
|
3
15
|
* Bug fixes
|
data/README.markdown
CHANGED
@@ -30,8 +30,17 @@ You define <code>data_miner</code> blocks in your ActiveRecord models. For examp
|
|
30
30
|
|
31
31
|
class Country < ActiveRecord::Base
|
32
32
|
self.primary_key = 'iso_3166_code'
|
33
|
+
|
34
|
+
# the "col" class method is provided by a different library - active_record_inline_schema
|
35
|
+
col :iso_3166_code # alpha-2 2-letter like GB
|
36
|
+
col :iso_3166_numeric_code, :type => :integer # numeric like 826; aka UN M49 code
|
37
|
+
col :iso_3166_alpha_3_code # 3-letter like GBR
|
38
|
+
col :name
|
33
39
|
|
34
40
|
data_miner do
|
41
|
+
# auto_upgrade! is provided by active_record_inline_schema
|
42
|
+
process :auto_upgrade!
|
43
|
+
|
35
44
|
import("OpenGeoCode.org's Country Codes to Country Names list",
|
36
45
|
:url => 'http://opengeocode.org/download/countrynames.txt',
|
37
46
|
:format => :delimited,
|
@@ -107,6 +116,10 @@ And many more - look for the `data_miner.rb` file that corresponds to each model
|
|
107
116
|
* Derek Kastner <dkastner@gmail.com>
|
108
117
|
* Ian Hough <ijhough@gmail.com>
|
109
118
|
|
119
|
+
## Wishlist
|
120
|
+
|
121
|
+
* Make the tests real unit tests
|
122
|
+
|
110
123
|
## Copyright
|
111
124
|
|
112
125
|
Copyright (c) 2012 Brighter Planet. See LICENSE for details.
|
data/data_miner.gemspec
CHANGED
@@ -24,6 +24,8 @@ Gem::Specification.new do |s|
|
|
24
24
|
s.add_runtime_dependency 'errata', '>=1.0.1'
|
25
25
|
s.add_runtime_dependency 'remote_table', '>=2.0.2'
|
26
26
|
s.add_runtime_dependency 'upsert', '>=0.3.1'
|
27
|
+
s.add_runtime_dependency 'posix-spawn'
|
28
|
+
s.add_runtime_dependency 'unix_utils'
|
27
29
|
|
28
30
|
s.add_development_dependency 'dkastner-alchemist'
|
29
31
|
s.add_development_dependency 'conversions'
|
@@ -34,6 +36,7 @@ Gem::Specification.new do |s|
|
|
34
36
|
s.add_development_dependency 'minitest-reporters'
|
35
37
|
s.add_development_dependency 'rake'
|
36
38
|
s.add_development_dependency 'yard'
|
39
|
+
s.add_development_dependency 'rdiscount'
|
37
40
|
if RUBY_PLATFORM == 'java'
|
38
41
|
s.add_development_dependency 'jruby-openssl'
|
39
42
|
s.add_development_dependency 'activerecord-jdbcsqlite3-adapter'
|
@@ -63,7 +63,7 @@ class DataMiner
|
|
63
63
|
#
|
64
64
|
# @see DataMiner::Script#import Creating an import step by calling DataMiner::Script#import from inside a data miner script
|
65
65
|
# @see DataMiner::Script#process Creating a process step by calling DataMiner::Script#process from inside a data miner script
|
66
|
-
# @see DataMiner::Script#
|
66
|
+
# @see DataMiner::Script#sql Creating a sql step by calling DataMiner::Script#sql from inside a data miner script
|
67
67
|
#
|
68
68
|
# @example Creating steps
|
69
69
|
# class MyModel < ActiveRecord::Base
|
@@ -71,6 +71,7 @@ class DataMiner
|
|
71
71
|
# process [...]
|
72
72
|
# import [...]
|
73
73
|
# import [...yes, it's ok to have more than one import step...]
|
74
|
+
# sql [...]
|
74
75
|
# process [...]
|
75
76
|
# [...etc...]
|
76
77
|
# end
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -239,6 +239,9 @@ class DataMiner
|
|
239
239
|
|
240
240
|
# @private
|
241
241
|
def read(row)
|
242
|
+
unless column_exists?
|
243
|
+
raise RuntimeError, "[data_miner] Table #{model.table_name} does not have column #{name.inspect}"
|
244
|
+
end
|
242
245
|
if matcher and matcher_output = matcher.match(row)
|
243
246
|
return matcher_output
|
244
247
|
end
|
@@ -342,6 +345,11 @@ class DataMiner
|
|
342
345
|
step.model
|
343
346
|
end
|
344
347
|
|
348
|
+
def column_exists?
|
349
|
+
return @column_exists_boolean if defined?(@column_exists_boolean)
|
350
|
+
@column_exists_boolean = model.column_names.include? name.to_s
|
351
|
+
end
|
352
|
+
|
345
353
|
def text_column?
|
346
354
|
return @text_column_boolean if defined?(@text_column_boolean)
|
347
355
|
@text_column_boolean = model.columns_hash[name.to_s].text?
|
data/lib/data_miner/run.rb
CHANGED
@@ -100,9 +100,11 @@ class DataMiner
|
|
100
100
|
fail!
|
101
101
|
raise $!
|
102
102
|
ensure
|
103
|
-
|
104
|
-
|
105
|
-
|
103
|
+
if model.table_exists?
|
104
|
+
self.row_count_after = model.count
|
105
|
+
if DataMiner.per_column_statistics?
|
106
|
+
ColumnStatistic.take self
|
107
|
+
end
|
106
108
|
end
|
107
109
|
self.stopped_at = ::Time.now.utc
|
108
110
|
save!
|
data/lib/data_miner/script.rb
CHANGED
@@ -92,47 +92,32 @@ class DataMiner
|
|
92
92
|
append(:process, method_id_or_description, &blk)
|
93
93
|
end
|
94
94
|
|
95
|
-
#
|
96
|
-
#
|
97
|
-
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
98
|
-
# @see DataMiner::Step::Tap The actual Tap class.
|
99
|
-
#
|
100
|
-
# @param [String] description A description of the taps source.
|
101
|
-
# @param [String] source The taps URL, including username, password, domain, and port.
|
102
|
-
# @param [optional, Hash] options
|
103
|
-
# @option options [String] :source_table_name (model.table_name) The source table name, if different.
|
95
|
+
# Import rows into your model.
|
104
96
|
#
|
105
|
-
#
|
106
|
-
#
|
97
|
+
# As long as...
|
98
|
+
# 1. you +key+ on the primary key, or
|
99
|
+
# 2. the table has an auto-increment primary key, or
|
100
|
+
# 3. you DON'T enable +:validate+
|
101
|
+
# ... then things will be sped up using the {https://github.com/seamusabshere/upsert upsert library} in streaming mode.
|
107
102
|
#
|
108
|
-
#
|
109
|
-
# data_miner do
|
110
|
-
# [...]
|
111
|
-
# tap "Brighter Planet's reference data", "http://carbon:neutral@data.brighterplanet.com:5000"
|
112
|
-
# [...]
|
113
|
-
# end
|
114
|
-
#
|
115
|
-
# @return [nil]
|
116
|
-
def tap(description, source, options = {})
|
117
|
-
append :tap, description, source, options
|
118
|
-
end
|
119
|
-
|
120
|
-
# Import rows into your model.
|
103
|
+
# Otherwise, native +ActiveRecord+ constructors and validations will be used.
|
121
104
|
#
|
122
105
|
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
123
106
|
# @see DataMiner::Step::Import The actual Import class.
|
124
107
|
#
|
125
108
|
# @param [String] description A description of the data source.
|
126
|
-
# @param [Hash]
|
127
|
-
# @option
|
128
|
-
# @option
|
129
|
-
# @option
|
109
|
+
# @param [Hash] settings Settings, including URL of the data source, that are used to download/parse (using RemoteTable) and (sometimes) correct (using Errata) the data.
|
110
|
+
# @option settings [String] :url The URL of the data source. Passed directly to +RemoteTable.new+.
|
111
|
+
# @option settings [Hash] :errata The +:responder+ and +:url+ settings that will be passed to +Errata.new+.
|
112
|
+
# @option settings [TrueClass,FalseClass] :validate Whether to always run +ActiveRecord+ validations.
|
113
|
+
# @option settings [*] anything Any other setting will be passed to +RemoteTable.new+.
|
130
114
|
#
|
131
115
|
# @yield [] A block defining how to +key+ the import (to make it idempotent) and which columns to +store+.
|
132
116
|
#
|
133
|
-
# @note Be sure to check out https://github.com/seamusabshere/remote_table and https://github.com/seamusabshere/errata for available +
|
117
|
+
# @note Be sure to check out https://github.com/seamusabshere/remote_table and https://github.com/seamusabshere/errata for available +settings+.
|
134
118
|
# @note There are hundreds of +import+ examples in https://github.com/brighterplanet/earth. The {file:README.markdown README} points to a few (at the bottom.)
|
135
119
|
# @note We often use string primary keys to make idempotency easier. https://github.com/seamusabshere/active_record_inline_schema supports defining these inline.
|
120
|
+
# @note Enabling +:validate+ may slow down importing large files because it precludes bulk loading using https://github.com/seamusabshere/upsert.
|
136
121
|
#
|
137
122
|
# @example From the README
|
138
123
|
# data_miner do
|
@@ -152,8 +137,26 @@ class DataMiner
|
|
152
137
|
# end
|
153
138
|
#
|
154
139
|
# @return [nil]
|
155
|
-
def import(description,
|
156
|
-
append(:import, description,
|
140
|
+
def import(description, settings, &blk)
|
141
|
+
append(:import, description, settings, &blk)
|
142
|
+
end
|
143
|
+
|
144
|
+
# Execute SQL, provided either as a string or a URL.
|
145
|
+
#
|
146
|
+
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
147
|
+
# @see DataMiner::Step::Sql The actual Sql class.
|
148
|
+
#
|
149
|
+
# @note +url_or_statement+ is auto-detected by looking for +%r{^[^\s]*/[^\*]}+ (non-spaces followed by a slash followed by non-asterisk). Therefore if you're passing a local file path and want it to be treated like a URL, make it absolute.
|
150
|
+
#
|
151
|
+
# @param [String] description What this step does.
|
152
|
+
# @param [String] url_or_statement SQL statement as a String or location of the SQL file as a URL.
|
153
|
+
#
|
154
|
+
# @example Rapidly get a list of countries from Brighter Planet's Reference Data web service
|
155
|
+
# data_miner do
|
156
|
+
# sql "Brighter Planet's countries", 'http://data.brighterplanet.com/countries.sql'
|
157
|
+
# end
|
158
|
+
def sql(description, url_or_statement)
|
159
|
+
append(:sql, description, url_or_statement)
|
157
160
|
end
|
158
161
|
|
159
162
|
# Prepend a step to a script unless it's already there. Mostly for internal use.
|
@@ -237,7 +240,11 @@ class DataMiner
|
|
237
240
|
args = ["#{klass.name.demodulize} step with no description"]
|
238
241
|
end
|
239
242
|
initializer = [self] + args + [options]
|
240
|
-
|
243
|
+
if block_given?
|
244
|
+
klass.new(*initializer, &blk)
|
245
|
+
else
|
246
|
+
klass.new(*initializer)
|
247
|
+
end
|
241
248
|
end
|
242
249
|
end
|
243
250
|
end
|
@@ -16,31 +16,29 @@ class DataMiner
|
|
16
16
|
# @return [Array<DataMiner::Attribute>]
|
17
17
|
attr_reader :attributes
|
18
18
|
|
19
|
-
# @private
|
20
|
-
attr_reader :script
|
21
|
-
|
22
19
|
# Description of what this step does.
|
23
20
|
# @return [String]
|
24
21
|
attr_reader :description
|
25
22
|
|
26
23
|
# @private
|
27
|
-
def initialize(script, description,
|
28
|
-
|
29
|
-
if
|
24
|
+
def initialize(script, description, settings, &blk)
|
25
|
+
settings = settings.symbolize_keys
|
26
|
+
if settings.has_key?(:table)
|
30
27
|
raise ::ArgumentError, %{[data_miner] :table is no longer an allowed setting.}
|
31
28
|
end
|
32
|
-
if (errata_settings =
|
29
|
+
if (errata_settings = settings[:errata]) and not errata_settings.is_a?(::Hash)
|
33
30
|
raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization settings to Errata}
|
34
31
|
end
|
35
32
|
@script = script
|
36
33
|
@attributes = ::ActiveSupport::OrderedHash.new
|
34
|
+
@validate_query = !!settings[:validate]
|
37
35
|
@description = description
|
38
|
-
if
|
39
|
-
errata_settings =
|
36
|
+
if settings.has_key? :errata
|
37
|
+
errata_settings = settings[:errata].symbolize_keys
|
40
38
|
errata_settings[:responder] ||= model
|
41
|
-
|
39
|
+
settings[:errata] = errata_settings
|
42
40
|
end
|
43
|
-
@table_settings =
|
41
|
+
@table_settings = settings.dup
|
44
42
|
@table_settings[:streaming] = true
|
45
43
|
@table_mutex = ::Mutex.new
|
46
44
|
instance_eval(&blk)
|
@@ -85,7 +83,7 @@ class DataMiner
|
|
85
83
|
|
86
84
|
# @private
|
87
85
|
def start
|
88
|
-
if storing_primary_key? or table_has_autoincrementing_primary_key?
|
86
|
+
if not validate? and (storing_primary_key? or table_has_autoincrementing_primary_key?)
|
89
87
|
c = ActiveRecord::Base.connection_pool.checkout
|
90
88
|
Upsert.stream(c, model.table_name) do |upsert|
|
91
89
|
table.each do |row|
|
@@ -109,6 +107,12 @@ class DataMiner
|
|
109
107
|
nil
|
110
108
|
end
|
111
109
|
|
110
|
+
# @private
|
111
|
+
# Whether to run ActiveRecord validations. Slows things down because Upsert isn't used.
|
112
|
+
def validate?
|
113
|
+
@validate_query == true
|
114
|
+
end
|
115
|
+
|
112
116
|
private
|
113
117
|
|
114
118
|
def table_has_autoincrementing_primary_key?
|
@@ -7,9 +7,6 @@ class DataMiner
|
|
7
7
|
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
8
8
|
# @see DataMiner::Script#process Creating a process step by calling DataMiner::Script#process from inside a data miner script
|
9
9
|
class Process < Step
|
10
|
-
# @private
|
11
|
-
attr_reader :script
|
12
|
-
|
13
10
|
# The method to be called on the model class.
|
14
11
|
# @return [Symbol]
|
15
12
|
attr_reader :method_id
|
@@ -25,7 +22,7 @@ class DataMiner
|
|
25
22
|
alias :block_description :description
|
26
23
|
|
27
24
|
# @private
|
28
|
-
def initialize(script, method_id_or_description, ignored_options =
|
25
|
+
def initialize(script, method_id_or_description, ignored_options = nil, &blk)
|
29
26
|
@script = script
|
30
27
|
if block_given?
|
31
28
|
@description = method_id_or_description
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'tmpdir'
|
3
|
+
require 'posix/spawn'
|
4
|
+
require 'unix_utils'
|
5
|
+
|
6
|
+
class DataMiner
|
7
|
+
class Step
|
8
|
+
# A step that executes SQL, either provided as a string or retrieved from a URL.
|
9
|
+
#
|
10
|
+
# Create these by calling +sql+ inside a +data_miner+ block.
|
11
|
+
#
|
12
|
+
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
13
|
+
# @see DataMiner::Script#sql Creating a sql step by calling DataMiner::Script#sql from inside a data miner script
|
14
|
+
class Sql < Step
|
15
|
+
URL_DETECTOR = %r{^[^\s]*/[^\*]}
|
16
|
+
|
17
|
+
# Description of what this step does.
|
18
|
+
# @return [String]
|
19
|
+
attr_reader :description
|
20
|
+
|
21
|
+
# Location of the SQL file.
|
22
|
+
# @return [String]
|
23
|
+
attr_reader :url
|
24
|
+
|
25
|
+
# String containing the SQL.
|
26
|
+
# @return [String]
|
27
|
+
attr_reader :statement
|
28
|
+
|
29
|
+
# @private
|
30
|
+
def initialize(script, description, url_or_statement, ignored_options = nil)
|
31
|
+
@script = script
|
32
|
+
@description = description
|
33
|
+
if url_or_statement =~ URL_DETECTOR
|
34
|
+
@url = url_or_statement
|
35
|
+
else
|
36
|
+
@statement = url_or_statement
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
# @private
|
41
|
+
def start
|
42
|
+
if statement
|
43
|
+
c = ActiveRecord::Base.connection_pool.checkout
|
44
|
+
c.execute statement
|
45
|
+
ActiveRecord::Base.connection_pool.checkin c
|
46
|
+
else
|
47
|
+
tmp_path = UnixUtils.curl url
|
48
|
+
send config[:adapter], tmp_path
|
49
|
+
File.unlink tmp_path
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def config
|
56
|
+
@config ||= if ActiveRecord::Base.respond_to?(:connection_config)
|
57
|
+
ActiveRecord::Base.connection_config
|
58
|
+
else
|
59
|
+
ActiveRecord::Base.connection_pool.spec.config
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def mysql(path)
|
64
|
+
connect = if config[:socket]
|
65
|
+
[ '--socket', config[:socket] ]
|
66
|
+
else
|
67
|
+
[ '--host', config.fetch(:host, '127.0.0.1'), '--port', config.fetch(:port, 3306).to_s ]
|
68
|
+
end
|
69
|
+
|
70
|
+
argv = [
|
71
|
+
'mysql',
|
72
|
+
'--compress',
|
73
|
+
'--user', config[:username],
|
74
|
+
"-p#{config[:password]}",
|
75
|
+
connect,
|
76
|
+
'--default-character-set', 'utf8',
|
77
|
+
config[:database]
|
78
|
+
].flatten
|
79
|
+
|
80
|
+
File.open(path) do |f|
|
81
|
+
pid = POSIX::Spawn.spawn(*(argv+[{:in => f}]))
|
82
|
+
::Process.waitpid pid
|
83
|
+
end
|
84
|
+
unless $?.success?
|
85
|
+
raise RuntimeError, "[data_miner] Failed: #{argv.join(' ').inspect}"
|
86
|
+
end
|
87
|
+
nil
|
88
|
+
end
|
89
|
+
|
90
|
+
alias :mysql2 :mysql
|
91
|
+
|
92
|
+
def postgresql(path)
|
93
|
+
connect = []
|
94
|
+
connect << ['--username', config[:username]] if config[:username]
|
95
|
+
connect << ['--password', config[:password]] if config[:password]
|
96
|
+
connect << ['--host', config[:host]] if config[:host]
|
97
|
+
connect << ['--port', config[:port]] if config[:port]
|
98
|
+
|
99
|
+
argv = [
|
100
|
+
'psql',
|
101
|
+
connect,
|
102
|
+
'--quiet',
|
103
|
+
'--dbname', config[:database],
|
104
|
+
'--file', path
|
105
|
+
].flatten
|
106
|
+
|
107
|
+
child = POSIX::Spawn::Child.new(*argv)
|
108
|
+
$stderr.puts child.out
|
109
|
+
$stderr.puts child.err
|
110
|
+
unless child.success?
|
111
|
+
raise RuntimeError, "[data_miner] Failed: #{argv.join(' ').inspect} (#{child.err.inspect})"
|
112
|
+
end
|
113
|
+
nil
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
data/lib/data_miner/step.rb
CHANGED
data/lib/data_miner/version.rb
CHANGED
data/lib/data_miner.rb
CHANGED
@@ -20,8 +20,8 @@ require 'data_miner/script'
|
|
20
20
|
require 'data_miner/dictionary'
|
21
21
|
require 'data_miner/step'
|
22
22
|
require 'data_miner/step/import'
|
23
|
-
require 'data_miner/step/tap'
|
24
23
|
require 'data_miner/step/process'
|
24
|
+
require 'data_miner/step/sql'
|
25
25
|
require 'data_miner/run'
|
26
26
|
require 'data_miner/unit_converter'
|
27
27
|
|
@@ -44,7 +44,7 @@ class DataMiner
|
|
44
44
|
|
45
45
|
# @private
|
46
46
|
def compress_whitespace(str)
|
47
|
-
str.gsub(INNER_SPACE,
|
47
|
+
str.gsub(INNER_SPACE, ONE_SPACE).strip
|
48
48
|
end
|
49
49
|
|
50
50
|
# Set the unit converter.
|
@@ -66,6 +66,7 @@ class DataMiner
|
|
66
66
|
end
|
67
67
|
|
68
68
|
INNER_SPACE = /[ ]+/
|
69
|
+
ONE_SPACE = ' '
|
69
70
|
|
70
71
|
include ::Singleton
|
71
72
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'helper'
|
3
|
+
init_database
|
4
|
+
|
5
|
+
class BreedBlue < ActiveRecord::Base
|
6
|
+
self.table_name = 'breeds'
|
7
|
+
self.primary_key = 'name'
|
8
|
+
data_miner do
|
9
|
+
sql "Brighter Planet's list of breeds (as a URL)", 'http://data.brighterplanet.com/breeds.sql'
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class BreedRed < ActiveRecord::Base
|
14
|
+
self.table_name = 'breeds'
|
15
|
+
self.primary_key = 'name'
|
16
|
+
data_miner do
|
17
|
+
sql "Brighter Planet's list of breeds (as a URL)", 'http://data.brighterplanet.com/breeds.sql'
|
18
|
+
sql "Mess up weights", %{UPDATE breeds SET weight = 999}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe DataMiner::Step::Sql do
|
23
|
+
before do
|
24
|
+
BreedBlue.delete_all rescue nil
|
25
|
+
end
|
26
|
+
it "can be provided as a URL" do
|
27
|
+
BreedBlue.run_data_miner!
|
28
|
+
BreedBlue.where(:name => 'Affenpinscher').count.must_equal 1
|
29
|
+
BreedBlue.where(:name => 'Württemberger').count.must_equal 1
|
30
|
+
BreedBlue.find('Afghan Hound').weight.must_be_close_to 24.9476
|
31
|
+
end
|
32
|
+
it "can be provided as a string" do
|
33
|
+
BreedRed.run_data_miner!
|
34
|
+
BreedRed.where(:name => 'Affenpinscher').count.must_equal 1
|
35
|
+
BreedRed.where(:name => 'Württemberger').count.must_equal 1
|
36
|
+
BreedRed.find('Afghan Hound').weight.must_be_close_to 999
|
37
|
+
end
|
38
|
+
end
|
data/test/helper.rb
CHANGED
@@ -59,8 +59,10 @@ def init_models
|
|
59
59
|
require 'support/breed'
|
60
60
|
require 'support/pet'
|
61
61
|
require 'support/pet2'
|
62
|
+
require 'support/pet3'
|
62
63
|
Pet.auto_upgrade!
|
63
64
|
Pet2.auto_upgrade!
|
65
|
+
Pet3.auto_upgrade!
|
64
66
|
|
65
67
|
ActiveRecord::Base.descendants.each do |model|
|
66
68
|
model.attr_accessible nil
|
data/test/test_data_miner.rb
CHANGED
@@ -7,6 +7,8 @@ describe DataMiner do
|
|
7
7
|
describe "when used to import example data about pets" do
|
8
8
|
before do
|
9
9
|
Pet.delete_all
|
10
|
+
Pet2.delete_all
|
11
|
+
Pet3.delete_all
|
10
12
|
DataMiner::Run.delete_all
|
11
13
|
DataMiner::Run::ColumnStatistic.delete_all
|
12
14
|
end
|
@@ -112,5 +114,11 @@ describe DataMiner do
|
|
112
114
|
Pet2.run_data_miner!
|
113
115
|
Pet2.find('Jerry').breed_id.must_equal 'Beagle-Basset'
|
114
116
|
end
|
117
|
+
it "dies if a column specified in an import step doesn't exist" do
|
118
|
+
lambda do
|
119
|
+
Pet3.run_data_miner!
|
120
|
+
end.must_raise RuntimeError, /exist/i
|
121
|
+
end
|
122
|
+
|
115
123
|
end
|
116
124
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-07-
|
14
|
+
date: 2012-07-26 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: aasm
|
@@ -125,6 +125,38 @@ dependencies:
|
|
125
125
|
- - ! '>='
|
126
126
|
- !ruby/object:Gem::Version
|
127
127
|
version: 0.3.1
|
128
|
+
- !ruby/object:Gem::Dependency
|
129
|
+
name: posix-spawn
|
130
|
+
requirement: !ruby/object:Gem::Requirement
|
131
|
+
none: false
|
132
|
+
requirements:
|
133
|
+
- - ! '>='
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
type: :runtime
|
137
|
+
prerelease: false
|
138
|
+
version_requirements: !ruby/object:Gem::Requirement
|
139
|
+
none: false
|
140
|
+
requirements:
|
141
|
+
- - ! '>='
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
- !ruby/object:Gem::Dependency
|
145
|
+
name: unix_utils
|
146
|
+
requirement: !ruby/object:Gem::Requirement
|
147
|
+
none: false
|
148
|
+
requirements:
|
149
|
+
- - ! '>='
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
type: :runtime
|
153
|
+
prerelease: false
|
154
|
+
version_requirements: !ruby/object:Gem::Requirement
|
155
|
+
none: false
|
156
|
+
requirements:
|
157
|
+
- - ! '>='
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
128
160
|
- !ruby/object:Gem::Dependency
|
129
161
|
name: dkastner-alchemist
|
130
162
|
requirement: !ruby/object:Gem::Requirement
|
@@ -269,6 +301,22 @@ dependencies:
|
|
269
301
|
- - ! '>='
|
270
302
|
- !ruby/object:Gem::Version
|
271
303
|
version: '0'
|
304
|
+
- !ruby/object:Gem::Dependency
|
305
|
+
name: rdiscount
|
306
|
+
requirement: !ruby/object:Gem::Requirement
|
307
|
+
none: false
|
308
|
+
requirements:
|
309
|
+
- - ! '>='
|
310
|
+
- !ruby/object:Gem::Version
|
311
|
+
version: '0'
|
312
|
+
type: :development
|
313
|
+
prerelease: false
|
314
|
+
version_requirements: !ruby/object:Gem::Requirement
|
315
|
+
none: false
|
316
|
+
requirements:
|
317
|
+
- - ! '>='
|
318
|
+
- !ruby/object:Gem::Version
|
319
|
+
version: '0'
|
272
320
|
- !ruby/object:Gem::Dependency
|
273
321
|
name: sqlite3
|
274
322
|
requirement: !ruby/object:Gem::Requirement
|
@@ -344,12 +392,13 @@ files:
|
|
344
392
|
- lib/data_miner/step.rb
|
345
393
|
- lib/data_miner/step/import.rb
|
346
394
|
- lib/data_miner/step/process.rb
|
347
|
-
- lib/data_miner/step/
|
395
|
+
- lib/data_miner/step/sql.rb
|
348
396
|
- lib/data_miner/unit_converter.rb
|
349
397
|
- lib/data_miner/unit_converter/alchemist.rb
|
350
398
|
- lib/data_miner/unit_converter/conversions.rb
|
351
399
|
- lib/data_miner/version.rb
|
352
400
|
- test/data_miner/step/test_import.rb
|
401
|
+
- test/data_miner/step/test_sql.rb
|
353
402
|
- test/data_miner/test_attribute.rb
|
354
403
|
- test/data_miner/unit_converter/test_alchemist.rb
|
355
404
|
- test/data_miner/unit_converter/test_conversions.rb
|
@@ -362,6 +411,7 @@ files:
|
|
362
411
|
- test/support/data_miner_without_unit_converter.rb
|
363
412
|
- test/support/pet.rb
|
364
413
|
- test/support/pet2.rb
|
414
|
+
- test/support/pet3.rb
|
365
415
|
- test/support/pet_color_dictionary.en.csv
|
366
416
|
- test/support/pet_color_dictionary.es.csv
|
367
417
|
- test/support/pets.csv
|
@@ -369,7 +419,6 @@ files:
|
|
369
419
|
- test/test_data_miner.rb
|
370
420
|
- test/test_data_miner_run_column_statistic.rb
|
371
421
|
- test/test_earth_import.rb
|
372
|
-
- test/test_earth_tap.rb
|
373
422
|
- test/test_safety.rb
|
374
423
|
- test/test_unit_conversion.rb
|
375
424
|
homepage: https://github.com/seamusabshere/data_miner
|
@@ -399,6 +448,7 @@ summary: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and imp
|
|
399
448
|
XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models.
|
400
449
|
test_files:
|
401
450
|
- test/data_miner/step/test_import.rb
|
451
|
+
- test/data_miner/step/test_sql.rb
|
402
452
|
- test/data_miner/test_attribute.rb
|
403
453
|
- test/data_miner/unit_converter/test_alchemist.rb
|
404
454
|
- test/data_miner/unit_converter/test_conversions.rb
|
@@ -411,6 +461,7 @@ test_files:
|
|
411
461
|
- test/support/data_miner_without_unit_converter.rb
|
412
462
|
- test/support/pet.rb
|
413
463
|
- test/support/pet2.rb
|
464
|
+
- test/support/pet3.rb
|
414
465
|
- test/support/pet_color_dictionary.en.csv
|
415
466
|
- test/support/pet_color_dictionary.es.csv
|
416
467
|
- test/support/pets.csv
|
@@ -418,7 +469,6 @@ test_files:
|
|
418
469
|
- test/test_data_miner.rb
|
419
470
|
- test/test_data_miner_run_column_statistic.rb
|
420
471
|
- test/test_earth_import.rb
|
421
|
-
- test/test_earth_tap.rb
|
422
472
|
- test/test_safety.rb
|
423
473
|
- test/test_unit_conversion.rb
|
424
474
|
has_rdoc:
|
data/lib/data_miner/step/tap.rb
DELETED
@@ -1,167 +0,0 @@
|
|
1
|
-
require 'uri'
|
2
|
-
|
3
|
-
class DataMiner
|
4
|
-
class Step
|
5
|
-
# A step that uses https://github.com/ricardochimal/taps to import table structure and data.
|
6
|
-
#
|
7
|
-
# Create these by calling +tap+ inside a +data_miner+ block.
|
8
|
-
#
|
9
|
-
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
10
|
-
# @see DataMiner::Script#tap Creating a tap step by calling DataMiner::Script#tap from inside a data miner script
|
11
|
-
class Tap < Step
|
12
|
-
DEFAULT_PORTS = {
|
13
|
-
:mysql => 3306,
|
14
|
-
:mysql2 => 3306,
|
15
|
-
:postgres => 5432
|
16
|
-
}
|
17
|
-
|
18
|
-
DEFAULT_USERNAMES = {
|
19
|
-
:mysql => 'root',
|
20
|
-
:mysql2 => 'root',
|
21
|
-
:postgres => ''
|
22
|
-
}
|
23
|
-
|
24
|
-
DEFAULT_PASSWORDS = {}
|
25
|
-
DEFAULT_PASSWORDS.default = ''
|
26
|
-
|
27
|
-
DEFAULT_HOSTS = {}
|
28
|
-
DEFAULT_HOSTS.default = '127.0.0.1'
|
29
|
-
|
30
|
-
# @private
|
31
|
-
attr_reader :script
|
32
|
-
|
33
|
-
# A description of the tapped data source.
|
34
|
-
# @return [String]
|
35
|
-
attr_reader :description
|
36
|
-
|
37
|
-
# The URL of the tapped data source, including username, password, domain, and port number.
|
38
|
-
# @return [String]
|
39
|
-
attr_reader :source
|
40
|
-
|
41
|
-
# Connection options that will be passed to the +taps pull command+. Defaults to the ActiveRecord connection config, if available.
|
42
|
-
# @return [Hash]
|
43
|
-
attr_reader :database_options
|
44
|
-
|
45
|
-
# Source table name. Defaults to the table name of the model.
|
46
|
-
# @return [String]
|
47
|
-
attr_reader :source_table_name
|
48
|
-
|
49
|
-
# @private
|
50
|
-
def initialize(script, description, source, options = {})
|
51
|
-
options = options.symbolize_keys
|
52
|
-
@script = script
|
53
|
-
@description = description
|
54
|
-
@source = source
|
55
|
-
@source_table_name = options.delete(:source_table_name) || model.table_name
|
56
|
-
@database_options = options.reverse_merge script.model.connection.instance_variable_get(:@config).symbolize_keys
|
57
|
-
end
|
58
|
-
|
59
|
-
# @private
|
60
|
-
def start
|
61
|
-
[ source_table_name, model.table_name ].each do |possible_obstacle|
|
62
|
-
if connection.table_exists? possible_obstacle
|
63
|
-
connection.drop_table possible_obstacle
|
64
|
-
end
|
65
|
-
end
|
66
|
-
taps_pull
|
67
|
-
if needs_table_rename?
|
68
|
-
connection.rename_table source_table_name, model.table_name
|
69
|
-
end
|
70
|
-
nil
|
71
|
-
end
|
72
|
-
|
73
|
-
# @return [String] The name of the current database.
|
74
|
-
def database
|
75
|
-
unless database = database_options[:database]
|
76
|
-
raise ::ArgumentError, %{[data_miner] Can't infer database name from options or ActiveRecord config.}
|
77
|
-
end
|
78
|
-
database
|
79
|
-
end
|
80
|
-
|
81
|
-
# @return [String] The database username.
|
82
|
-
def username
|
83
|
-
database_options[:username] || DEFAULT_USERNAMES[adapter.to_sym]
|
84
|
-
end
|
85
|
-
|
86
|
-
# @return [String] The database password.
|
87
|
-
def password
|
88
|
-
database_options[:password] || DEFAULT_PASSWORDS[adapter.to_sym]
|
89
|
-
end
|
90
|
-
|
91
|
-
# @return [String] The database port number.
|
92
|
-
def port
|
93
|
-
database_options[:port] || DEFAULT_PORTS[adapter.to_sym]
|
94
|
-
end
|
95
|
-
|
96
|
-
# @return [String] The database hostname.
|
97
|
-
def host
|
98
|
-
database_options[:host] || DEFAULT_HOSTS[adapter.to_sym]
|
99
|
-
end
|
100
|
-
|
101
|
-
private
|
102
|
-
|
103
|
-
def connection
|
104
|
-
model.connection
|
105
|
-
end
|
106
|
-
|
107
|
-
def needs_table_rename?
|
108
|
-
source_table_name != model.table_name
|
109
|
-
end
|
110
|
-
|
111
|
-
def adapter
|
112
|
-
case connection.adapter_name
|
113
|
-
when /mysql2/i
|
114
|
-
'mysql2'
|
115
|
-
when /mysql/i
|
116
|
-
'mysql'
|
117
|
-
when /postgres/i
|
118
|
-
'postgres'
|
119
|
-
when /sqlite/i
|
120
|
-
'sqlite'
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
# "user:pass"
|
125
|
-
# "user"
|
126
|
-
# nil
|
127
|
-
def userinfo
|
128
|
-
if username.present?
|
129
|
-
[username, password].select(&:present?).join(':')
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
def db_url
|
134
|
-
case adapter
|
135
|
-
when 'sqlite'
|
136
|
-
"sqlite://#{database}"
|
137
|
-
else
|
138
|
-
::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
# Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
|
143
|
-
#
|
144
|
-
# This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
|
145
|
-
def taps_pull
|
146
|
-
args = [
|
147
|
-
'taps',
|
148
|
-
'pull',
|
149
|
-
db_url,
|
150
|
-
source,
|
151
|
-
'--indexes-first',
|
152
|
-
'--tables',
|
153
|
-
source_table_name
|
154
|
-
]
|
155
|
-
|
156
|
-
# https://github.com/carlhuda/bundler/issues/1579
|
157
|
-
if defined?(::Bundler)
|
158
|
-
::Bundler.with_clean_env do
|
159
|
-
::Kernel.system args.join(' ')
|
160
|
-
end
|
161
|
-
else
|
162
|
-
::Kernel.system args.join(' ')
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end
|
166
|
-
end
|
167
|
-
end
|
data/test/test_earth_tap.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
require 'helper'
|
3
|
-
init_database
|
4
|
-
require 'earth'
|
5
|
-
|
6
|
-
# use earth, which has a plethora of real-world data_miner blocks
|
7
|
-
Earth.init :locality, :pet, :load_data_miner => false, :apply_schemas => true
|
8
|
-
|
9
|
-
DataMiner.run %w{Country Breed}
|
10
|
-
|
11
|
-
describe DataMiner do
|
12
|
-
describe "being used by the Earth library's tap steps" do
|
13
|
-
describe "for pets" do
|
14
|
-
it "can pull breed and species" do
|
15
|
-
Breed.find('Golden Retriever').species.must_equal Species.find('dog')
|
16
|
-
end
|
17
|
-
end
|
18
|
-
describe "for localities" do
|
19
|
-
it "can handle non-latin characters" do
|
20
|
-
Country.find('DE').name.must_equal 'Germany'
|
21
|
-
Country.find('AX').name.must_equal 'Åland Islands'
|
22
|
-
Country.find('CI').name.must_equal "Côte d'Ivoire"
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|