data_miner 2.3.4 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -0
- data/README.markdown +13 -0
- data/data_miner.gemspec +3 -0
- data/lib/data_miner/active_record_class_methods.rb +2 -1
- data/lib/data_miner/attribute.rb +8 -0
- data/lib/data_miner/run.rb +5 -3
- data/lib/data_miner/script.rb +39 -32
- data/lib/data_miner/step/import.rb +16 -12
- data/lib/data_miner/step/process.rb +1 -4
- data/lib/data_miner/step/sql.rb +117 -0
- data/lib/data_miner/step.rb +3 -0
- data/lib/data_miner/version.rb +1 -1
- data/lib/data_miner.rb +3 -2
- data/test/data_miner/step/test_sql.rb +38 -0
- data/test/helper.rb +2 -0
- data/test/support/pet3.rb +9 -0
- data/test/test_data_miner.rb +8 -0
- metadata +55 -5
- data/lib/data_miner/step/tap.rb +0 -167
- data/test/test_earth_tap.rb +0 -26
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
2.4.0 / 2012-07-26
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Entirely remove taps support - it doesn't preserve foreign key constraints and adds a lot of complexity
|
6
|
+
|
7
|
+
* Enhancements
|
8
|
+
|
9
|
+
* New "sql" step - executes SQL provided as a string OR a local/remote SQL file provided as a URL
|
10
|
+
* Die with a useful error message if a column specified in an import step doesn't exist - thanks @chrisle https://github.com/seamusabshere/data_miner/issues/17
|
11
|
+
* Allow setting :validate => true on import steps - thanks @chrisle https://github.com/seamusabshere/data_miner/issues/18
|
12
|
+
|
1
13
|
2.3.4 / 2012-07-06
|
2
14
|
|
3
15
|
* Bug fixes
|
data/README.markdown
CHANGED
@@ -30,8 +30,17 @@ You define <code>data_miner</code> blocks in your ActiveRecord models. For examp
|
|
30
30
|
|
31
31
|
class Country < ActiveRecord::Base
|
32
32
|
self.primary_key = 'iso_3166_code'
|
33
|
+
|
34
|
+
# the "col" class method is provided by a different library - active_record_inline_schema
|
35
|
+
col :iso_3166_code # alpha-2 2-letter like GB
|
36
|
+
col :iso_3166_numeric_code, :type => :integer # numeric like 826; aka UN M49 code
|
37
|
+
col :iso_3166_alpha_3_code # 3-letter like GBR
|
38
|
+
col :name
|
33
39
|
|
34
40
|
data_miner do
|
41
|
+
# auto_upgrade! is provided by active_record_inline_schema
|
42
|
+
process :auto_upgrade!
|
43
|
+
|
35
44
|
import("OpenGeoCode.org's Country Codes to Country Names list",
|
36
45
|
:url => 'http://opengeocode.org/download/countrynames.txt',
|
37
46
|
:format => :delimited,
|
@@ -107,6 +116,10 @@ And many more - look for the `data_miner.rb` file that corresponds to each model
|
|
107
116
|
* Derek Kastner <dkastner@gmail.com>
|
108
117
|
* Ian Hough <ijhough@gmail.com>
|
109
118
|
|
119
|
+
## Wishlist
|
120
|
+
|
121
|
+
* Make the tests real unit tests
|
122
|
+
|
110
123
|
## Copyright
|
111
124
|
|
112
125
|
Copyright (c) 2012 Brighter Planet. See LICENSE for details.
|
data/data_miner.gemspec
CHANGED
@@ -24,6 +24,8 @@ Gem::Specification.new do |s|
|
|
24
24
|
s.add_runtime_dependency 'errata', '>=1.0.1'
|
25
25
|
s.add_runtime_dependency 'remote_table', '>=2.0.2'
|
26
26
|
s.add_runtime_dependency 'upsert', '>=0.3.1'
|
27
|
+
s.add_runtime_dependency 'posix-spawn'
|
28
|
+
s.add_runtime_dependency 'unix_utils'
|
27
29
|
|
28
30
|
s.add_development_dependency 'dkastner-alchemist'
|
29
31
|
s.add_development_dependency 'conversions'
|
@@ -34,6 +36,7 @@ Gem::Specification.new do |s|
|
|
34
36
|
s.add_development_dependency 'minitest-reporters'
|
35
37
|
s.add_development_dependency 'rake'
|
36
38
|
s.add_development_dependency 'yard'
|
39
|
+
s.add_development_dependency 'rdiscount'
|
37
40
|
if RUBY_PLATFORM == 'java'
|
38
41
|
s.add_development_dependency 'jruby-openssl'
|
39
42
|
s.add_development_dependency 'activerecord-jdbcsqlite3-adapter'
|
@@ -63,7 +63,7 @@ class DataMiner
|
|
63
63
|
#
|
64
64
|
# @see DataMiner::Script#import Creating an import step by calling DataMiner::Script#import from inside a data miner script
|
65
65
|
# @see DataMiner::Script#process Creating a process step by calling DataMiner::Script#process from inside a data miner script
|
66
|
-
# @see DataMiner::Script#
|
66
|
+
# @see DataMiner::Script#sql Creating a sql step by calling DataMiner::Script#sql from inside a data miner script
|
67
67
|
#
|
68
68
|
# @example Creating steps
|
69
69
|
# class MyModel < ActiveRecord::Base
|
@@ -71,6 +71,7 @@ class DataMiner
|
|
71
71
|
# process [...]
|
72
72
|
# import [...]
|
73
73
|
# import [...yes, it's ok to have more than one import step...]
|
74
|
+
# sql [...]
|
74
75
|
# process [...]
|
75
76
|
# [...etc...]
|
76
77
|
# end
|
data/lib/data_miner/attribute.rb
CHANGED
@@ -239,6 +239,9 @@ class DataMiner
|
|
239
239
|
|
240
240
|
# @private
|
241
241
|
def read(row)
|
242
|
+
unless column_exists?
|
243
|
+
raise RuntimeError, "[data_miner] Table #{model.table_name} does not have column #{name.inspect}"
|
244
|
+
end
|
242
245
|
if matcher and matcher_output = matcher.match(row)
|
243
246
|
return matcher_output
|
244
247
|
end
|
@@ -342,6 +345,11 @@ class DataMiner
|
|
342
345
|
step.model
|
343
346
|
end
|
344
347
|
|
348
|
+
def column_exists?
|
349
|
+
return @column_exists_boolean if defined?(@column_exists_boolean)
|
350
|
+
@column_exists_boolean = model.column_names.include? name.to_s
|
351
|
+
end
|
352
|
+
|
345
353
|
def text_column?
|
346
354
|
return @text_column_boolean if defined?(@text_column_boolean)
|
347
355
|
@text_column_boolean = model.columns_hash[name.to_s].text?
|
data/lib/data_miner/run.rb
CHANGED
@@ -100,9 +100,11 @@ class DataMiner
|
|
100
100
|
fail!
|
101
101
|
raise $!
|
102
102
|
ensure
|
103
|
-
|
104
|
-
|
105
|
-
|
103
|
+
if model.table_exists?
|
104
|
+
self.row_count_after = model.count
|
105
|
+
if DataMiner.per_column_statistics?
|
106
|
+
ColumnStatistic.take self
|
107
|
+
end
|
106
108
|
end
|
107
109
|
self.stopped_at = ::Time.now.utc
|
108
110
|
save!
|
data/lib/data_miner/script.rb
CHANGED
@@ -92,47 +92,32 @@ class DataMiner
|
|
92
92
|
append(:process, method_id_or_description, &blk)
|
93
93
|
end
|
94
94
|
|
95
|
-
#
|
96
|
-
#
|
97
|
-
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
98
|
-
# @see DataMiner::Step::Tap The actual Tap class.
|
99
|
-
#
|
100
|
-
# @param [String] description A description of the taps source.
|
101
|
-
# @param [String] source The taps URL, including username, password, domain, and port.
|
102
|
-
# @param [optional, Hash] options
|
103
|
-
# @option options [String] :source_table_name (model.table_name) The source table name, if different.
|
95
|
+
# Import rows into your model.
|
104
96
|
#
|
105
|
-
#
|
106
|
-
#
|
97
|
+
# As long as...
|
98
|
+
# 1. you +key+ on the primary key, or
|
99
|
+
# 2. the table has an auto-increment primary key, or
|
100
|
+
# 3. you DON'T enable +:validate+
|
101
|
+
# ... then things will be sped up using the {https://github.com/seamusabshere/upsert upsert library} in streaming mode.
|
107
102
|
#
|
108
|
-
#
|
109
|
-
# data_miner do
|
110
|
-
# [...]
|
111
|
-
# tap "Brighter Planet's reference data", "http://carbon:neutral@data.brighterplanet.com:5000"
|
112
|
-
# [...]
|
113
|
-
# end
|
114
|
-
#
|
115
|
-
# @return [nil]
|
116
|
-
def tap(description, source, options = {})
|
117
|
-
append :tap, description, source, options
|
118
|
-
end
|
119
|
-
|
120
|
-
# Import rows into your model.
|
103
|
+
# Otherwise, native +ActiveRecord+ constuctors and validations will be used.
|
121
104
|
#
|
122
105
|
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
123
106
|
# @see DataMiner::Step::Import The actual Import class.
|
124
107
|
#
|
125
108
|
# @param [String] description A description of the data source.
|
126
|
-
# @param [Hash]
|
127
|
-
# @option
|
128
|
-
# @option
|
129
|
-
# @option
|
109
|
+
# @param [Hash] settings Settings, including URL of the data source, that are used to download/parse (using RemoteTable) and (sometimes) correct (using Errata) the data.
|
110
|
+
# @option settings [String] :url The URL of the data source. Passed directly to +RemoteTable.new+.
|
111
|
+
# @option settings [Hash] :errata The +:responder+ and +:url+ settings that will be passed to +Errata.new+.
|
112
|
+
# @option settings [TrueClass,FalseClass] :validate Whether to always run +ActiveRecord+ validations.
|
113
|
+
# @option settings [*] anything Any other setting will be passed to +RemoteTable.new+.
|
130
114
|
#
|
131
115
|
# @yield [] A block defining how to +key+ the import (to make it idempotent) and which columns to +store+.
|
132
116
|
#
|
133
|
-
# @note Be sure to check out https://github.com/seamusabshere/remote_table and https://github.com/seamusabshere/errata for available +
|
117
|
+
# @note Be sure to check out https://github.com/seamusabshere/remote_table and https://github.com/seamusabshere/errata for available +settings+.
|
134
118
|
# @note There are hundreds of +import+ examples in https://github.com/brighterplanet/earth. The {file:README.markdown README} points to a few (at the bottom).
|
135
119
|
# @note We often use string primary keys to make idempotency easier. https://github.com/seamusabshere/active_record_inline_schema supports defining these inline.
|
120
|
+
# @note Enabling +:validate+ may slow down importing large files because it precludes bulk loading using https://github.com/seamusabshere/upsert.
|
136
121
|
#
|
137
122
|
# @example From the README
|
138
123
|
# data_miner do
|
@@ -152,8 +137,26 @@ class DataMiner
|
|
152
137
|
# end
|
153
138
|
#
|
154
139
|
# @return [nil]
|
155
|
-
def import(description,
|
156
|
-
append(:import, description,
|
140
|
+
def import(description, settings, &blk)
|
141
|
+
append(:import, description, settings, &blk)
|
142
|
+
end
|
143
|
+
|
144
|
+
# Execute SQL, provided either as a string or a URL.
|
145
|
+
#
|
146
|
+
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
147
|
+
# @see DataMiner::Step::Sql The actual Sql class.
|
148
|
+
#
|
149
|
+
# @note +url_or_statement+ is auto-detected by looking for +%r{^[^\s]*/[^\*]}+ (non-spaces followed by a slash followed by non-asterisk). Therefore if you're passing a local file path and want it to be treated like a URL, make it absolute.
|
150
|
+
#
|
151
|
+
# @param [String] description What this step does.
|
152
|
+
# @param [String] url_or_statement SQL statement as a String or location of the SQL file as a URL.
|
153
|
+
#
|
154
|
+
# @example Rapidly get a list of countries from Brighter Planet's Reference Data web service
|
155
|
+
# data_miner do
|
156
|
+
# sql "Brighter Planet's countries", 'http://data.brighterplanet.com/countries.sql'
|
157
|
+
# end
|
158
|
+
def sql(description, url_or_statement)
|
159
|
+
append(:sql, description, url_or_statement)
|
157
160
|
end
|
158
161
|
|
159
162
|
# Prepend a step to a script unless it's already there. Mostly for internal use.
|
@@ -237,7 +240,11 @@ class DataMiner
|
|
237
240
|
args = ["#{klass.name.demodulize} step with no description"]
|
238
241
|
end
|
239
242
|
initializer = [self] + args + [options]
|
240
|
-
|
243
|
+
if block_given?
|
244
|
+
klass.new(*initializer, &blk)
|
245
|
+
else
|
246
|
+
klass.new(*initializer)
|
247
|
+
end
|
241
248
|
end
|
242
249
|
end
|
243
250
|
end
|
@@ -16,31 +16,29 @@ class DataMiner
|
|
16
16
|
# @return [Array<DataMiner::Attribute>]
|
17
17
|
attr_reader :attributes
|
18
18
|
|
19
|
-
# @private
|
20
|
-
attr_reader :script
|
21
|
-
|
22
19
|
# Description of what this step does.
|
23
20
|
# @return [String]
|
24
21
|
attr_reader :description
|
25
22
|
|
26
23
|
# @private
|
27
|
-
def initialize(script, description,
|
28
|
-
|
29
|
-
if
|
24
|
+
def initialize(script, description, settings, &blk)
|
25
|
+
settings = settings.symbolize_keys
|
26
|
+
if settings.has_key?(:table)
|
30
27
|
raise ::ArgumentError, %{[data_miner] :table is no longer an allowed setting.}
|
31
28
|
end
|
32
|
-
if (errata_settings =
|
29
|
+
if (errata_settings = settings[:errata]) and not errata_settings.is_a?(::Hash)
|
33
30
|
raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization settings to Errata}
|
34
31
|
end
|
35
32
|
@script = script
|
36
33
|
@attributes = ::ActiveSupport::OrderedHash.new
|
34
|
+
@validate_query = !!settings[:validate]
|
37
35
|
@description = description
|
38
|
-
if
|
39
|
-
errata_settings =
|
36
|
+
if settings.has_key? :errata
|
37
|
+
errata_settings = settings[:errata].symbolize_keys
|
40
38
|
errata_settings[:responder] ||= model
|
41
|
-
|
39
|
+
settings[:errata] = errata_settings
|
42
40
|
end
|
43
|
-
@table_settings =
|
41
|
+
@table_settings = settings.dup
|
44
42
|
@table_settings[:streaming] = true
|
45
43
|
@table_mutex = ::Mutex.new
|
46
44
|
instance_eval(&blk)
|
@@ -85,7 +83,7 @@ class DataMiner
|
|
85
83
|
|
86
84
|
# @private
|
87
85
|
def start
|
88
|
-
if storing_primary_key? or table_has_autoincrementing_primary_key?
|
86
|
+
if not validate? and (storing_primary_key? or table_has_autoincrementing_primary_key?)
|
89
87
|
c = ActiveRecord::Base.connection_pool.checkout
|
90
88
|
Upsert.stream(c, model.table_name) do |upsert|
|
91
89
|
table.each do |row|
|
@@ -109,6 +107,12 @@ class DataMiner
|
|
109
107
|
nil
|
110
108
|
end
|
111
109
|
|
110
|
+
# @private
|
111
|
+
# Whether to run ActiveRecord validations. Slows things down because Upsert isn't used.
|
112
|
+
def validate?
|
113
|
+
@validate_query == true
|
114
|
+
end
|
115
|
+
|
112
116
|
private
|
113
117
|
|
114
118
|
def table_has_autoincrementing_primary_key?
|
@@ -7,9 +7,6 @@ class DataMiner
|
|
7
7
|
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
8
8
|
# @see DataMiner::Script#process Creating a process step by calling DataMiner::Script#process from inside a data miner script
|
9
9
|
class Process < Step
|
10
|
-
# @private
|
11
|
-
attr_reader :script
|
12
|
-
|
13
10
|
# The method to be called on the model class.
|
14
11
|
# @return [Symbol]
|
15
12
|
attr_reader :method_id
|
@@ -25,7 +22,7 @@ class DataMiner
|
|
25
22
|
alias :block_description :description
|
26
23
|
|
27
24
|
# @private
|
28
|
-
def initialize(script, method_id_or_description, ignored_options =
|
25
|
+
def initialize(script, method_id_or_description, ignored_options = nil, &blk)
|
29
26
|
@script = script
|
30
27
|
if block_given?
|
31
28
|
@description = method_id_or_description
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'tmpdir'
|
3
|
+
require 'posix/spawn'
|
4
|
+
require 'unix_utils'
|
5
|
+
|
6
|
+
class DataMiner
|
7
|
+
class Step
|
8
|
+
# A step that executes SQL, either provided as a string or retrieved from a URL.
|
9
|
+
#
|
10
|
+
# Create these by calling +sql+ inside a +data_miner+ block.
|
11
|
+
#
|
12
|
+
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
13
|
+
# @see DataMiner::Script#sql Creating a sql step by calling DataMiner::Script#sql from inside a data miner script
|
14
|
+
class Sql < Step
|
15
|
+
URL_DETECTOR = %r{^[^\s]*/[^\*]}
|
16
|
+
|
17
|
+
# Description of what this step does.
|
18
|
+
# @return [String]
|
19
|
+
attr_reader :description
|
20
|
+
|
21
|
+
# Location of the SQL file.
|
22
|
+
# @return [String]
|
23
|
+
attr_reader :url
|
24
|
+
|
25
|
+
# String containing the SQL.
|
26
|
+
# @return [String]
|
27
|
+
attr_reader :statement
|
28
|
+
|
29
|
+
# @private
|
30
|
+
def initialize(script, description, url_or_statement, ignored_options = nil)
|
31
|
+
@script = script
|
32
|
+
@description = description
|
33
|
+
if url_or_statement =~ URL_DETECTOR
|
34
|
+
@url = url_or_statement
|
35
|
+
else
|
36
|
+
@statement = url_or_statement
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
# @private
|
41
|
+
def start
|
42
|
+
if statement
|
43
|
+
c = ActiveRecord::Base.connection_pool.checkout
|
44
|
+
c.execute statement
|
45
|
+
ActiveRecord::Base.connection_pool.checkin c
|
46
|
+
else
|
47
|
+
tmp_path = UnixUtils.curl url
|
48
|
+
send config[:adapter], tmp_path
|
49
|
+
File.unlink tmp_path
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def config
|
56
|
+
@config ||= if ActiveRecord::Base.respond_to?(:connection_config)
|
57
|
+
ActiveRecord::Base.connection_config
|
58
|
+
else
|
59
|
+
ActiveRecord::Base.connection_pool.spec.config
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def mysql(path)
|
64
|
+
connect = if config[:socket]
|
65
|
+
[ '--socket', config[:socket] ]
|
66
|
+
else
|
67
|
+
[ '--host', config.fetch(:host, '127.0.0.1'), '--port', config.fetch(:port, 3306).to_s ]
|
68
|
+
end
|
69
|
+
|
70
|
+
argv = [
|
71
|
+
'mysql',
|
72
|
+
'--compress',
|
73
|
+
'--user', config[:username],
|
74
|
+
"-p#{config[:password]}",
|
75
|
+
connect,
|
76
|
+
'--default-character-set', 'utf8',
|
77
|
+
config[:database]
|
78
|
+
].flatten
|
79
|
+
|
80
|
+
File.open(path) do |f|
|
81
|
+
pid = POSIX::Spawn.spawn(*(argv+[{:in => f}]))
|
82
|
+
::Process.waitpid pid
|
83
|
+
end
|
84
|
+
unless $?.success?
|
85
|
+
raise RuntimeError, "[data_miner] Failed: #{argv.join(' ').inspect}"
|
86
|
+
end
|
87
|
+
nil
|
88
|
+
end
|
89
|
+
|
90
|
+
alias :mysql2 :mysql
|
91
|
+
|
92
|
+
def postgresql(path)
|
93
|
+
connect = []
|
94
|
+
connect << ['--username', config[:username]] if config[:username]
|
95
|
+
connect << ['--password', config[:password]] if config[:password]
|
96
|
+
connect << ['--host', config[:host]] if config[:host]
|
97
|
+
connect << ['--port', config[:port]] if config[:port]
|
98
|
+
|
99
|
+
argv = [
|
100
|
+
'psql',
|
101
|
+
connect,
|
102
|
+
'--quiet',
|
103
|
+
'--dbname', config[:database],
|
104
|
+
'--file', path
|
105
|
+
].flatten
|
106
|
+
|
107
|
+
child = POSIX::Spawn::Child.new(*argv)
|
108
|
+
$stderr.puts child.out
|
109
|
+
$stderr.puts child.err
|
110
|
+
unless child.success?
|
111
|
+
raise RuntimeError, "[data_miner] Failed: #{argv.join(' ').inspect} (#{child.err.inspect})"
|
112
|
+
end
|
113
|
+
nil
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
data/lib/data_miner/step.rb
CHANGED
data/lib/data_miner/version.rb
CHANGED
data/lib/data_miner.rb
CHANGED
@@ -20,8 +20,8 @@ require 'data_miner/script'
|
|
20
20
|
require 'data_miner/dictionary'
|
21
21
|
require 'data_miner/step'
|
22
22
|
require 'data_miner/step/import'
|
23
|
-
require 'data_miner/step/tap'
|
24
23
|
require 'data_miner/step/process'
|
24
|
+
require 'data_miner/step/sql'
|
25
25
|
require 'data_miner/run'
|
26
26
|
require 'data_miner/unit_converter'
|
27
27
|
|
@@ -44,7 +44,7 @@ class DataMiner
|
|
44
44
|
|
45
45
|
# @private
|
46
46
|
def compress_whitespace(str)
|
47
|
-
str.gsub(INNER_SPACE,
|
47
|
+
str.gsub(INNER_SPACE, ONE_SPACE).strip
|
48
48
|
end
|
49
49
|
|
50
50
|
# Set the unit converter.
|
@@ -66,6 +66,7 @@ class DataMiner
|
|
66
66
|
end
|
67
67
|
|
68
68
|
INNER_SPACE = /[ ]+/
|
69
|
+
ONE_SPACE = ' '
|
69
70
|
|
70
71
|
include ::Singleton
|
71
72
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'helper'
|
3
|
+
init_database
|
4
|
+
|
5
|
+
class BreedBlue < ActiveRecord::Base
|
6
|
+
self.table_name = 'breeds'
|
7
|
+
self.primary_key = 'name'
|
8
|
+
data_miner do
|
9
|
+
sql "Brighter Planet's list of breeds (as a URL)", 'http://data.brighterplanet.com/breeds.sql'
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class BreedRed < ActiveRecord::Base
|
14
|
+
self.table_name = 'breeds'
|
15
|
+
self.primary_key = 'name'
|
16
|
+
data_miner do
|
17
|
+
sql "Brighter Planet's list of breeds (as a URL)", 'http://data.brighterplanet.com/breeds.sql'
|
18
|
+
sql "Mess up weights", %{UPDATE breeds SET weight = 999}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe DataMiner::Step::Sql do
|
23
|
+
before do
|
24
|
+
BreedBlue.delete_all rescue nil
|
25
|
+
end
|
26
|
+
it "can be provided as a URL" do
|
27
|
+
BreedBlue.run_data_miner!
|
28
|
+
BreedBlue.where(:name => 'Affenpinscher').count.must_equal 1
|
29
|
+
BreedBlue.where(:name => 'Württemberger').count.must_equal 1
|
30
|
+
BreedBlue.find('Afghan Hound').weight.must_be_close_to 24.9476
|
31
|
+
end
|
32
|
+
it "can be provided as a string" do
|
33
|
+
BreedRed.run_data_miner!
|
34
|
+
BreedRed.where(:name => 'Affenpinscher').count.must_equal 1
|
35
|
+
BreedRed.where(:name => 'Württemberger').count.must_equal 1
|
36
|
+
BreedRed.find('Afghan Hound').weight.must_be_close_to 999
|
37
|
+
end
|
38
|
+
end
|
data/test/helper.rb
CHANGED
@@ -59,8 +59,10 @@ def init_models
|
|
59
59
|
require 'support/breed'
|
60
60
|
require 'support/pet'
|
61
61
|
require 'support/pet2'
|
62
|
+
require 'support/pet3'
|
62
63
|
Pet.auto_upgrade!
|
63
64
|
Pet2.auto_upgrade!
|
65
|
+
Pet3.auto_upgrade!
|
64
66
|
|
65
67
|
ActiveRecord::Base.descendants.each do |model|
|
66
68
|
model.attr_accessible nil
|
data/test/test_data_miner.rb
CHANGED
@@ -7,6 +7,8 @@ describe DataMiner do
|
|
7
7
|
describe "when used to import example data about pets" do
|
8
8
|
before do
|
9
9
|
Pet.delete_all
|
10
|
+
Pet2.delete_all
|
11
|
+
Pet3.delete_all
|
10
12
|
DataMiner::Run.delete_all
|
11
13
|
DataMiner::Run::ColumnStatistic.delete_all
|
12
14
|
end
|
@@ -112,5 +114,11 @@ describe DataMiner do
|
|
112
114
|
Pet2.run_data_miner!
|
113
115
|
Pet2.find('Jerry').breed_id.must_equal 'Beagle-Basset'
|
114
116
|
end
|
117
|
+
it "dies if a column specified in an import step doesn't exist" do
|
118
|
+
lambda do
|
119
|
+
Pet3.run_data_miner!
|
120
|
+
end.must_raise RuntimeError, /exist/i
|
121
|
+
end
|
122
|
+
|
115
123
|
end
|
116
124
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_miner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-07-
|
14
|
+
date: 2012-07-26 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: aasm
|
@@ -125,6 +125,38 @@ dependencies:
|
|
125
125
|
- - ! '>='
|
126
126
|
- !ruby/object:Gem::Version
|
127
127
|
version: 0.3.1
|
128
|
+
- !ruby/object:Gem::Dependency
|
129
|
+
name: posix-spawn
|
130
|
+
requirement: !ruby/object:Gem::Requirement
|
131
|
+
none: false
|
132
|
+
requirements:
|
133
|
+
- - ! '>='
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
type: :runtime
|
137
|
+
prerelease: false
|
138
|
+
version_requirements: !ruby/object:Gem::Requirement
|
139
|
+
none: false
|
140
|
+
requirements:
|
141
|
+
- - ! '>='
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
- !ruby/object:Gem::Dependency
|
145
|
+
name: unix_utils
|
146
|
+
requirement: !ruby/object:Gem::Requirement
|
147
|
+
none: false
|
148
|
+
requirements:
|
149
|
+
- - ! '>='
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
type: :runtime
|
153
|
+
prerelease: false
|
154
|
+
version_requirements: !ruby/object:Gem::Requirement
|
155
|
+
none: false
|
156
|
+
requirements:
|
157
|
+
- - ! '>='
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
128
160
|
- !ruby/object:Gem::Dependency
|
129
161
|
name: dkastner-alchemist
|
130
162
|
requirement: !ruby/object:Gem::Requirement
|
@@ -269,6 +301,22 @@ dependencies:
|
|
269
301
|
- - ! '>='
|
270
302
|
- !ruby/object:Gem::Version
|
271
303
|
version: '0'
|
304
|
+
- !ruby/object:Gem::Dependency
|
305
|
+
name: rdiscount
|
306
|
+
requirement: !ruby/object:Gem::Requirement
|
307
|
+
none: false
|
308
|
+
requirements:
|
309
|
+
- - ! '>='
|
310
|
+
- !ruby/object:Gem::Version
|
311
|
+
version: '0'
|
312
|
+
type: :development
|
313
|
+
prerelease: false
|
314
|
+
version_requirements: !ruby/object:Gem::Requirement
|
315
|
+
none: false
|
316
|
+
requirements:
|
317
|
+
- - ! '>='
|
318
|
+
- !ruby/object:Gem::Version
|
319
|
+
version: '0'
|
272
320
|
- !ruby/object:Gem::Dependency
|
273
321
|
name: sqlite3
|
274
322
|
requirement: !ruby/object:Gem::Requirement
|
@@ -344,12 +392,13 @@ files:
|
|
344
392
|
- lib/data_miner/step.rb
|
345
393
|
- lib/data_miner/step/import.rb
|
346
394
|
- lib/data_miner/step/process.rb
|
347
|
-
- lib/data_miner/step/
|
395
|
+
- lib/data_miner/step/sql.rb
|
348
396
|
- lib/data_miner/unit_converter.rb
|
349
397
|
- lib/data_miner/unit_converter/alchemist.rb
|
350
398
|
- lib/data_miner/unit_converter/conversions.rb
|
351
399
|
- lib/data_miner/version.rb
|
352
400
|
- test/data_miner/step/test_import.rb
|
401
|
+
- test/data_miner/step/test_sql.rb
|
353
402
|
- test/data_miner/test_attribute.rb
|
354
403
|
- test/data_miner/unit_converter/test_alchemist.rb
|
355
404
|
- test/data_miner/unit_converter/test_conversions.rb
|
@@ -362,6 +411,7 @@ files:
|
|
362
411
|
- test/support/data_miner_without_unit_converter.rb
|
363
412
|
- test/support/pet.rb
|
364
413
|
- test/support/pet2.rb
|
414
|
+
- test/support/pet3.rb
|
365
415
|
- test/support/pet_color_dictionary.en.csv
|
366
416
|
- test/support/pet_color_dictionary.es.csv
|
367
417
|
- test/support/pets.csv
|
@@ -369,7 +419,6 @@ files:
|
|
369
419
|
- test/test_data_miner.rb
|
370
420
|
- test/test_data_miner_run_column_statistic.rb
|
371
421
|
- test/test_earth_import.rb
|
372
|
-
- test/test_earth_tap.rb
|
373
422
|
- test/test_safety.rb
|
374
423
|
- test/test_unit_conversion.rb
|
375
424
|
homepage: https://github.com/seamusabshere/data_miner
|
@@ -399,6 +448,7 @@ summary: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and imp
|
|
399
448
|
XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models.
|
400
449
|
test_files:
|
401
450
|
- test/data_miner/step/test_import.rb
|
451
|
+
- test/data_miner/step/test_sql.rb
|
402
452
|
- test/data_miner/test_attribute.rb
|
403
453
|
- test/data_miner/unit_converter/test_alchemist.rb
|
404
454
|
- test/data_miner/unit_converter/test_conversions.rb
|
@@ -411,6 +461,7 @@ test_files:
|
|
411
461
|
- test/support/data_miner_without_unit_converter.rb
|
412
462
|
- test/support/pet.rb
|
413
463
|
- test/support/pet2.rb
|
464
|
+
- test/support/pet3.rb
|
414
465
|
- test/support/pet_color_dictionary.en.csv
|
415
466
|
- test/support/pet_color_dictionary.es.csv
|
416
467
|
- test/support/pets.csv
|
@@ -418,7 +469,6 @@ test_files:
|
|
418
469
|
- test/test_data_miner.rb
|
419
470
|
- test/test_data_miner_run_column_statistic.rb
|
420
471
|
- test/test_earth_import.rb
|
421
|
-
- test/test_earth_tap.rb
|
422
472
|
- test/test_safety.rb
|
423
473
|
- test/test_unit_conversion.rb
|
424
474
|
has_rdoc:
|
data/lib/data_miner/step/tap.rb
DELETED
@@ -1,167 +0,0 @@
|
|
1
|
-
require 'uri'
|
2
|
-
|
3
|
-
class DataMiner
|
4
|
-
class Step
|
5
|
-
# A step that uses https://github.com/ricardochimal/taps to import table structure and data.
|
6
|
-
#
|
7
|
-
# Create these by calling +tap+ inside a +data_miner+ block.
|
8
|
-
#
|
9
|
-
# @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
|
10
|
-
# @see DataMiner::Script#tap Creating a tap step by calling DataMiner::Script#tap from inside a data miner script
|
11
|
-
class Tap < Step
|
12
|
-
DEFAULT_PORTS = {
|
13
|
-
:mysql => 3306,
|
14
|
-
:mysql2 => 3306,
|
15
|
-
:postgres => 5432
|
16
|
-
}
|
17
|
-
|
18
|
-
DEFAULT_USERNAMES = {
|
19
|
-
:mysql => 'root',
|
20
|
-
:mysql2 => 'root',
|
21
|
-
:postgres => ''
|
22
|
-
}
|
23
|
-
|
24
|
-
DEFAULT_PASSWORDS = {}
|
25
|
-
DEFAULT_PASSWORDS.default = ''
|
26
|
-
|
27
|
-
DEFAULT_HOSTS = {}
|
28
|
-
DEFAULT_HOSTS.default = '127.0.0.1'
|
29
|
-
|
30
|
-
# @private
|
31
|
-
attr_reader :script
|
32
|
-
|
33
|
-
# A description of the tapped data source.
|
34
|
-
# @return [String]
|
35
|
-
attr_reader :description
|
36
|
-
|
37
|
-
# The URL of the tapped data source, including username, password, domain, and port number.
|
38
|
-
# @return [String]
|
39
|
-
attr_reader :source
|
40
|
-
|
41
|
-
# Connection options that will be passed to the +taps pull command+. Defaults to the ActiveRecord connection config, if available.
|
42
|
-
# @return [Hash]
|
43
|
-
attr_reader :database_options
|
44
|
-
|
45
|
-
# Source table name. Defaults to the table name of the model.
|
46
|
-
# @return [String]
|
47
|
-
attr_reader :source_table_name
|
48
|
-
|
49
|
-
# @private
|
50
|
-
def initialize(script, description, source, options = {})
|
51
|
-
options = options.symbolize_keys
|
52
|
-
@script = script
|
53
|
-
@description = description
|
54
|
-
@source = source
|
55
|
-
@source_table_name = options.delete(:source_table_name) || model.table_name
|
56
|
-
@database_options = options.reverse_merge script.model.connection.instance_variable_get(:@config).symbolize_keys
|
57
|
-
end
|
58
|
-
|
59
|
-
# @private
|
60
|
-
def start
|
61
|
-
[ source_table_name, model.table_name ].each do |possible_obstacle|
|
62
|
-
if connection.table_exists? possible_obstacle
|
63
|
-
connection.drop_table possible_obstacle
|
64
|
-
end
|
65
|
-
end
|
66
|
-
taps_pull
|
67
|
-
if needs_table_rename?
|
68
|
-
connection.rename_table source_table_name, model.table_name
|
69
|
-
end
|
70
|
-
nil
|
71
|
-
end
|
72
|
-
|
73
|
-
# @return [String] The name of the current database.
|
74
|
-
def database
|
75
|
-
unless database = database_options[:database]
|
76
|
-
raise ::ArgumentError, %{[data_miner] Can't infer database name from options or ActiveRecord config.}
|
77
|
-
end
|
78
|
-
database
|
79
|
-
end
|
80
|
-
|
81
|
-
# @return [String] The database username.
|
82
|
-
def username
|
83
|
-
database_options[:username] || DEFAULT_USERNAMES[adapter.to_sym]
|
84
|
-
end
|
85
|
-
|
86
|
-
# @return [String] The database password.
|
87
|
-
def password
|
88
|
-
database_options[:password] || DEFAULT_PASSWORDS[adapter.to_sym]
|
89
|
-
end
|
90
|
-
|
91
|
-
# @return [String] The database port number.
|
92
|
-
def port
|
93
|
-
database_options[:port] || DEFAULT_PORTS[adapter.to_sym]
|
94
|
-
end
|
95
|
-
|
96
|
-
# @return [String] The database hostname.
|
97
|
-
def host
|
98
|
-
database_options[:host] || DEFAULT_HOSTS[adapter.to_sym]
|
99
|
-
end
|
100
|
-
|
101
|
-
private
|
102
|
-
|
103
|
-
def connection
|
104
|
-
model.connection
|
105
|
-
end
|
106
|
-
|
107
|
-
def needs_table_rename?
|
108
|
-
source_table_name != model.table_name
|
109
|
-
end
|
110
|
-
|
111
|
-
def adapter
|
112
|
-
case connection.adapter_name
|
113
|
-
when /mysql2/i
|
114
|
-
'mysql2'
|
115
|
-
when /mysql/i
|
116
|
-
'mysql'
|
117
|
-
when /postgres/i
|
118
|
-
'postgres'
|
119
|
-
when /sqlite/i
|
120
|
-
'sqlite'
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
# "user:pass"
|
125
|
-
# "user"
|
126
|
-
# nil
|
127
|
-
def userinfo
|
128
|
-
if username.present?
|
129
|
-
[username, password].select(&:present?).join(':')
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
def db_url
|
134
|
-
case adapter
|
135
|
-
when 'sqlite'
|
136
|
-
"sqlite://#{database}"
|
137
|
-
else
|
138
|
-
::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
# Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
|
143
|
-
#
|
144
|
-
# This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
|
145
|
-
def taps_pull
|
146
|
-
args = [
|
147
|
-
'taps',
|
148
|
-
'pull',
|
149
|
-
db_url,
|
150
|
-
source,
|
151
|
-
'--indexes-first',
|
152
|
-
'--tables',
|
153
|
-
source_table_name
|
154
|
-
]
|
155
|
-
|
156
|
-
# https://github.com/carlhuda/bundler/issues/1579
|
157
|
-
if defined?(::Bundler)
|
158
|
-
::Bundler.with_clean_env do
|
159
|
-
::Kernel.system args.join(' ')
|
160
|
-
end
|
161
|
-
else
|
162
|
-
::Kernel.system args.join(' ')
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end
|
166
|
-
end
|
167
|
-
end
|
data/test/test_earth_tap.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
require 'helper'
|
3
|
-
init_database
|
4
|
-
require 'earth'
|
5
|
-
|
6
|
-
# use earth, which has a plethora of real-world data_miner blocks
|
7
|
-
Earth.init :locality, :pet, :load_data_miner => false, :apply_schemas => true
|
8
|
-
|
9
|
-
DataMiner.run %w{Country Breed}
|
10
|
-
|
11
|
-
describe DataMiner do
|
12
|
-
describe "being used by the Earth library's tap steps" do
|
13
|
-
describe "for pets" do
|
14
|
-
it "can pull breed and species" do
|
15
|
-
Breed.find('Golden Retriever').species.must_equal Species.find('dog')
|
16
|
-
end
|
17
|
-
end
|
18
|
-
describe "for localities" do
|
19
|
-
it "can handle non-latin characters" do
|
20
|
-
Country.find('DE').name.must_equal 'Germany'
|
21
|
-
Country.find('AX').name.must_equal 'Åland Islands'
|
22
|
-
Country.find('CI').name.must_equal "Côte d'Ivoire"
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|