data_miner 2.3.4 → 2.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,15 @@
1
+ 2.4.0 / 2012-07-26
2
+
3
+ * Breaking changes
4
+
5
+ * Entirely remove taps support - it doesn't preserve foreign key constraints and adds a lot of complexity
6
+
7
+ * Enhancements
8
+
9
+ * New "sql" step - executes SQL provided as a string OR a local/remote SQL file provided as a URL
10
+ * Die with a useful error message if a column specified in an import step doesn't exist - thanks @chrisle https://github.com/seamusabshere/data_miner/issues/17
11
+ * Allow setting :validate => true on import steps - thanks @chrisle https://github.com/seamusabshere/data_miner/issues/18
12
+
1
13
  2.3.4 / 2012-07-06
2
14
 
3
15
  * Bug fixes
data/README.markdown CHANGED
@@ -30,8 +30,17 @@ You define <code>data_miner</code> blocks in your ActiveRecord models. For examp
30
30
 
31
31
  class Country < ActiveRecord::Base
32
32
  self.primary_key = 'iso_3166_code'
33
+
34
+ # the "col" class method is provided by a different library - active_record_inline_schema
35
+ col :iso_3166_code # alpha-2 2-letter like GB
36
+ col :iso_3166_numeric_code, :type => :integer # numeric like 826; aka UN M49 code
37
+ col :iso_3166_alpha_3_code # 3-letter like GBR
38
+ col :name
33
39
 
34
40
  data_miner do
41
+ # auto_upgrade! is provided by active_record_inline_schema
42
+ process :auto_upgrade!
43
+
35
44
  import("OpenGeoCode.org's Country Codes to Country Names list",
36
45
  :url => 'http://opengeocode.org/download/countrynames.txt',
37
46
  :format => :delimited,
@@ -107,6 +116,10 @@ And many more - look for the `data_miner.rb` file that corresponds to each model
107
116
  * Derek Kastner <dkastner@gmail.com>
108
117
  * Ian Hough <ijhough@gmail.com>
109
118
 
119
+ ## Wishlist
120
+
121
+ * Make the tests real unit tests
122
+
110
123
  ## Copyright
111
124
 
112
125
  Copyright (c) 2012 Brighter Planet. See LICENSE for details.
data/data_miner.gemspec CHANGED
@@ -24,6 +24,8 @@ Gem::Specification.new do |s|
24
24
  s.add_runtime_dependency 'errata', '>=1.0.1'
25
25
  s.add_runtime_dependency 'remote_table', '>=2.0.2'
26
26
  s.add_runtime_dependency 'upsert', '>=0.3.1'
27
+ s.add_runtime_dependency 'posix-spawn'
28
+ s.add_runtime_dependency 'unix_utils'
27
29
 
28
30
  s.add_development_dependency 'dkastner-alchemist'
29
31
  s.add_development_dependency 'conversions'
@@ -34,6 +36,7 @@ Gem::Specification.new do |s|
34
36
  s.add_development_dependency 'minitest-reporters'
35
37
  s.add_development_dependency 'rake'
36
38
  s.add_development_dependency 'yard'
39
+ s.add_development_dependency 'rdiscount'
37
40
  if RUBY_PLATFORM == 'java'
38
41
  s.add_development_dependency 'jruby-openssl'
39
42
  s.add_development_dependency 'activerecord-jdbcsqlite3-adapter'
@@ -63,7 +63,7 @@ class DataMiner
63
63
  #
64
64
  # @see DataMiner::Script#import Creating an import step by calling DataMiner::Script#import from inside a data miner script
65
65
  # @see DataMiner::Script#process Creating a process step by calling DataMiner::Script#process from inside a data miner script
66
- # @see DataMiner::Script#tap Creating a tap step by calling DataMiner::Script#tap from inside a data miner script
66
+ # @see DataMiner::Script#sql Creating a sql step by calling DataMiner::Script#sql from inside a data miner script
67
67
  #
68
68
  # @example Creating steps
69
69
  # class MyModel < ActiveRecord::Base
@@ -71,6 +71,7 @@ class DataMiner
71
71
  # process [...]
72
72
  # import [...]
73
73
  # import [...yes, it's ok to have more than one import step...]
74
+ # sql [...]
74
75
  # process [...]
75
76
  # [...etc...]
76
77
  # end
@@ -239,6 +239,9 @@ class DataMiner
239
239
 
240
240
  # @private
241
241
  def read(row)
242
+ unless column_exists?
243
+ raise RuntimeError, "[data_miner] Table #{model.table_name} does not have column #{name.inspect}"
244
+ end
242
245
  if matcher and matcher_output = matcher.match(row)
243
246
  return matcher_output
244
247
  end
@@ -342,6 +345,11 @@ class DataMiner
342
345
  step.model
343
346
  end
344
347
 
348
+ def column_exists?
349
+ return @column_exists_boolean if defined?(@column_exists_boolean)
350
+ @column_exists_boolean = model.column_names.include? name.to_s
351
+ end
352
+
345
353
  def text_column?
346
354
  return @text_column_boolean if defined?(@text_column_boolean)
347
355
  @text_column_boolean = model.columns_hash[name.to_s].text?
@@ -100,9 +100,11 @@ class DataMiner
100
100
  fail!
101
101
  raise $!
102
102
  ensure
103
- self.row_count_after = model.count
104
- if DataMiner.per_column_statistics?
105
- ColumnStatistic.take self
103
+ if model.table_exists?
104
+ self.row_count_after = model.count
105
+ if DataMiner.per_column_statistics?
106
+ ColumnStatistic.take self
107
+ end
106
108
  end
107
109
  self.stopped_at = ::Time.now.utc
108
110
  save!
@@ -92,47 +92,32 @@ class DataMiner
92
92
  append(:process, method_id_or_description, &blk)
93
93
  end
94
94
 
95
- # Use https://github.com/ricardochimal/taps to pull table structure and data.
96
- #
97
- # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
98
- # @see DataMiner::Step::Tap The actual Tap class.
99
- #
100
- # @param [String] description A description of the taps source.
101
- # @param [String] source The taps URL, including username, password, domain, and port.
102
- # @param [optional, Hash] options
103
- # @option options [String] :source_table_name (model.table_name) The source table name, if different.
95
+ # Import rows into your model.
104
96
  #
105
- # @note The source table name will default to the model's table name. If it's different, use the +:source_table_name+ option.
106
- # @note +taps+ needs to be installed on your system and in your PATH, but it doesn't have to be in your Gemfile. Sometimes having it in your Gemfile will cause Heroku deploys (etc.) to fail because it requires +sqlite3+.
97
+ # As long as...
98
+ # 1. you +key+ on the primary key, or
99
+ # 2. the table has an auto-increment primary key, or
100
+ # 3. you DON'T enable +:validate+
101
+ # ... then things will be sped up using the {https://github.com/seamusabshere/upsert upsert library} in streaming mode.
107
102
  #
108
- # @example Tapping Brighter Planet's reference data web service
109
- # data_miner do
110
- # [...]
111
- # tap "Brighter Planet's reference data", "http://carbon:neutral@data.brighterplanet.com:5000"
112
- # [...]
113
- # end
114
- #
115
- # @return [nil]
116
- def tap(description, source, options = {})
117
- append :tap, description, source, options
118
- end
119
-
120
- # Import rows into your model.
103
+ # Otherwise, native +ActiveRecord+ constructors and validations will be used.
121
104
  #
122
105
  # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
123
106
  # @see DataMiner::Step::Import The actual Import class.
124
107
  #
125
108
  # @param [String] description A description of the data source.
126
- # @param [Hash] table_and_errata_settings Settings, including URL of the data source, that are used to download/parse (using RemoteTable) and (sometimes) correct (using Errata) the data.
127
- # @option table_and_errata_settings [String] :url The URL of the data source. Passed directly to +RemoteTable.new+.
128
- # @option table_and_errata_settings [Hash] :errata The +:responder+ and +:url+ settings that will be passed to +Errata.new+.
129
- # @option table_and_errata_settings [*] anything Any other setting will be passed to +RemoteTable.new+.
109
+ # @param [Hash] settings Settings, including URL of the data source, that are used to download/parse (using RemoteTable) and (sometimes) correct (using Errata) the data.
110
+ # @option settings [String] :url The URL of the data source. Passed directly to +RemoteTable.new+.
111
+ # @option settings [Hash] :errata The +:responder+ and +:url+ settings that will be passed to +Errata.new+.
112
+ # @option settings [TrueClass,FalseClass] :validate Whether to always run +ActiveRecord+ validations.
113
+ # @option settings [*] anything Any other setting will be passed to +RemoteTable.new+.
130
114
  #
131
115
  # @yield [] A block defining how to +key+ the import (to make it idempotent) and which columns to +store+.
132
116
  #
133
- # @note Be sure to check out https://github.com/seamusabshere/remote_table and https://github.com/seamusabshere/errata for available +table_and_errata_settings+.
117
+ # @note Be sure to check out https://github.com/seamusabshere/remote_table and https://github.com/seamusabshere/errata for available +settings+.
134
118
  # @note There are hundreds of +import+ examples in https://github.com/brighterplanet/earth. The {file:README.markdown README} points to a few (at the bottom.)
135
119
  # @note We often use string primary keys to make idempotency easier. https://github.com/seamusabshere/active_record_inline_schema supports defining these inline.
120
+ # @note Enabling +:validate+ may slow down importing large files because it precludes bulk loading using https://github.com/seamusabshere/upsert.
136
121
  #
137
122
  # @example From the README
138
123
  # data_miner do
@@ -152,8 +137,26 @@ class DataMiner
152
137
  # end
153
138
  #
154
139
  # @return [nil]
155
- def import(description, table_and_errata_settings, &blk)
156
- append(:import, description, table_and_errata_settings, &blk)
140
+ def import(description, settings, &blk)
141
+ append(:import, description, settings, &blk)
142
+ end
143
+
144
+ # Execute SQL, provided either as a string or a URL.
145
+ #
146
+ # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
147
+ # @see DataMiner::Step::Sql The actual Sql class.
148
+ #
149
+ # @note +url_or_statement+ is auto-detected by looking for +%r{^[^\s]*/[^\*]}+ (non-spaces followed by a slash followed by non-asterisk). Therefore if you're passing a local file path and want it to be treated like a URL, make it absolute.
150
+ #
151
+ # @param [String] description What this step does.
152
+ # @param [String] url_or_statement SQL statement as a String or location of the SQL file as a URL.
153
+ #
154
+ # @example Rapidly get a list of countries from Brighter Planet's Reference Data web service
155
+ # data_miner do
156
+ # sql "Brighter Planet's countries", 'http://data.brighterplanet.com/countries.sql'
157
+ # end
158
+ def sql(description, url_or_statement)
159
+ append(:sql, description, url_or_statement)
157
160
  end
158
161
 
159
162
  # Prepend a step to a script unless it's already there. Mostly for internal use.
@@ -237,7 +240,11 @@ class DataMiner
237
240
  args = ["#{klass.name.demodulize} step with no description"]
238
241
  end
239
242
  initializer = [self] + args + [options]
240
- klass.new(*initializer, &blk)
243
+ if block_given?
244
+ klass.new(*initializer, &blk)
245
+ else
246
+ klass.new(*initializer)
247
+ end
241
248
  end
242
249
  end
243
250
  end
@@ -16,31 +16,29 @@ class DataMiner
16
16
  # @return [Array<DataMiner::Attribute>]
17
17
  attr_reader :attributes
18
18
 
19
- # @private
20
- attr_reader :script
21
-
22
19
  # Description of what this step does.
23
20
  # @return [String]
24
21
  attr_reader :description
25
22
 
26
23
  # @private
27
- def initialize(script, description, table_and_errata_settings, &blk)
28
- table_and_errata_settings = table_and_errata_settings.symbolize_keys
29
- if table_and_errata_settings.has_key?(:table)
24
+ def initialize(script, description, settings, &blk)
25
+ settings = settings.symbolize_keys
26
+ if settings.has_key?(:table)
30
27
  raise ::ArgumentError, %{[data_miner] :table is no longer an allowed setting.}
31
28
  end
32
- if (errata_settings = table_and_errata_settings[:errata]) and not errata_settings.is_a?(::Hash)
29
+ if (errata_settings = settings[:errata]) and not errata_settings.is_a?(::Hash)
33
30
  raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization settings to Errata}
34
31
  end
35
32
  @script = script
36
33
  @attributes = ::ActiveSupport::OrderedHash.new
34
+ @validate_query = !!settings[:validate]
37
35
  @description = description
38
- if table_and_errata_settings.has_key? :errata
39
- errata_settings = table_and_errata_settings[:errata].symbolize_keys
36
+ if settings.has_key? :errata
37
+ errata_settings = settings[:errata].symbolize_keys
40
38
  errata_settings[:responder] ||= model
41
- table_and_errata_settings[:errata] = errata_settings
39
+ settings[:errata] = errata_settings
42
40
  end
43
- @table_settings = table_and_errata_settings.dup
41
+ @table_settings = settings.dup
44
42
  @table_settings[:streaming] = true
45
43
  @table_mutex = ::Mutex.new
46
44
  instance_eval(&blk)
@@ -85,7 +83,7 @@ class DataMiner
85
83
 
86
84
  # @private
87
85
  def start
88
- if storing_primary_key? or table_has_autoincrementing_primary_key?
86
+ if not validate? and (storing_primary_key? or table_has_autoincrementing_primary_key?)
89
87
  c = ActiveRecord::Base.connection_pool.checkout
90
88
  Upsert.stream(c, model.table_name) do |upsert|
91
89
  table.each do |row|
@@ -109,6 +107,12 @@ class DataMiner
109
107
  nil
110
108
  end
111
109
 
110
+ # @private
111
+ # Whether to run ActiveRecord validations. Slows things down because Upsert isn't used.
112
+ def validate?
113
+ @validate_query == true
114
+ end
115
+
112
116
  private
113
117
 
114
118
  def table_has_autoincrementing_primary_key?
@@ -7,9 +7,6 @@ class DataMiner
7
7
  # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
8
8
  # @see DataMiner::Script#process Creating a process step by calling DataMiner::Script#process from inside a data miner script
9
9
  class Process < Step
10
- # @private
11
- attr_reader :script
12
-
13
10
  # The method to be called on the model class.
14
11
  # @return [Symbol]
15
12
  attr_reader :method_id
@@ -25,7 +22,7 @@ class DataMiner
25
22
  alias :block_description :description
26
23
 
27
24
  # @private
28
- def initialize(script, method_id_or_description, ignored_options = {}, &blk)
25
+ def initialize(script, method_id_or_description, ignored_options = nil, &blk)
29
26
  @script = script
30
27
  if block_given?
31
28
  @description = method_id_or_description
@@ -0,0 +1,117 @@
1
+ require 'csv'
2
+ require 'tmpdir'
3
+ require 'posix/spawn'
4
+ require 'unix_utils'
5
+
6
+ class DataMiner
7
+ class Step
8
+ # A step that executes SQL, either provided as a string or retrieved from a URL.
9
+ #
10
+ # Create these by calling +sql+ inside a +data_miner+ block.
11
+ #
12
+ # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
13
+ # @see DataMiner::Script#sql Creating a sql step by calling DataMiner::Script#sql from inside a data miner script
14
+ class Sql < Step
15
+ URL_DETECTOR = %r{^[^\s]*/[^\*]}
16
+
17
+ # Description of what this step does.
18
+ # @return [String]
19
+ attr_reader :description
20
+
21
+ # Location of the SQL file.
22
+ # @return [String]
23
+ attr_reader :url
24
+
25
+ # String containing the SQL.
26
+ # @return [String]
27
+ attr_reader :statement
28
+
29
+ # @private
30
+ def initialize(script, description, url_or_statement, ignored_options = nil)
31
+ @script = script
32
+ @description = description
33
+ if url_or_statement =~ URL_DETECTOR
34
+ @url = url_or_statement
35
+ else
36
+ @statement = url_or_statement
37
+ end
38
+ end
39
+
40
+ # @private
41
+ def start
42
+ if statement
43
+ c = ActiveRecord::Base.connection_pool.checkout
44
+ c.execute statement
45
+ ActiveRecord::Base.connection_pool.checkin c
46
+ else
47
+ tmp_path = UnixUtils.curl url
48
+ send config[:adapter], tmp_path
49
+ File.unlink tmp_path
50
+ end
51
+ end
52
+
53
+ private
54
+
55
+ def config
56
+ @config ||= if ActiveRecord::Base.respond_to?(:connection_config)
57
+ ActiveRecord::Base.connection_config
58
+ else
59
+ ActiveRecord::Base.connection_pool.spec.config
60
+ end
61
+ end
62
+
63
+ def mysql(path)
64
+ connect = if config[:socket]
65
+ [ '--socket', config[:socket] ]
66
+ else
67
+ [ '--host', config.fetch(:host, '127.0.0.1'), '--port', config.fetch(:port, 3306).to_s ]
68
+ end
69
+
70
+ argv = [
71
+ 'mysql',
72
+ '--compress',
73
+ '--user', config[:username],
74
+ "-p#{config[:password]}",
75
+ connect,
76
+ '--default-character-set', 'utf8',
77
+ config[:database]
78
+ ].flatten
79
+
80
+ File.open(path) do |f|
81
+ pid = POSIX::Spawn.spawn(*(argv+[{:in => f}]))
82
+ ::Process.waitpid pid
83
+ end
84
+ unless $?.success?
85
+ raise RuntimeError, "[data_miner] Failed: #{argv.join(' ').inspect}"
86
+ end
87
+ nil
88
+ end
89
+
90
+ alias :mysql2 :mysql
91
+
92
+ def postgresql(path)
93
+ connect = []
94
+ connect << ['--username', config[:username]] if config[:username]
95
+ connect << ['--password', config[:password]] if config[:password]
96
+ connect << ['--host', config[:host]] if config[:host]
97
+ connect << ['--port', config[:port]] if config[:port]
98
+
99
+ argv = [
100
+ 'psql',
101
+ connect,
102
+ '--quiet',
103
+ '--dbname', config[:database],
104
+ '--file', path
105
+ ].flatten
106
+
107
+ child = POSIX::Spawn::Child.new(*argv)
108
+ $stderr.puts child.out
109
+ $stderr.puts child.err
110
+ unless child.success?
111
+ raise RuntimeError, "[data_miner] Failed: #{argv.join(' ').inspect} (#{child.err.inspect})"
112
+ end
113
+ nil
114
+ end
115
+ end
116
+ end
117
+ end
@@ -1,5 +1,8 @@
1
1
  class DataMiner
2
2
  class Step
3
+ # @private
4
+ attr_reader :script
5
+
3
6
  # @private
4
7
  def ==(other)
5
8
  other.class == self.class and other.description == description
@@ -1,3 +1,3 @@
1
1
  class DataMiner
2
- VERSION = '2.3.4'
2
+ VERSION = '2.4.0'
3
3
  end
data/lib/data_miner.rb CHANGED
@@ -20,8 +20,8 @@ require 'data_miner/script'
20
20
  require 'data_miner/dictionary'
21
21
  require 'data_miner/step'
22
22
  require 'data_miner/step/import'
23
- require 'data_miner/step/tap'
24
23
  require 'data_miner/step/process'
24
+ require 'data_miner/step/sql'
25
25
  require 'data_miner/run'
26
26
  require 'data_miner/unit_converter'
27
27
 
@@ -44,7 +44,7 @@ class DataMiner
44
44
 
45
45
  # @private
46
46
  def compress_whitespace(str)
47
- str.gsub(INNER_SPACE, ' ').strip
47
+ str.gsub(INNER_SPACE, ONE_SPACE).strip
48
48
  end
49
49
 
50
50
  # Set the unit converter.
@@ -66,6 +66,7 @@ class DataMiner
66
66
  end
67
67
 
68
68
  INNER_SPACE = /[ ]+/
69
+ ONE_SPACE = ' '
69
70
 
70
71
  include ::Singleton
71
72
 
@@ -0,0 +1,38 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'helper'
3
+ init_database
4
+
5
+ class BreedBlue < ActiveRecord::Base
6
+ self.table_name = 'breeds'
7
+ self.primary_key = 'name'
8
+ data_miner do
9
+ sql "Brighter Planet's list of breeds (as a URL)", 'http://data.brighterplanet.com/breeds.sql'
10
+ end
11
+ end
12
+
13
+ class BreedRed < ActiveRecord::Base
14
+ self.table_name = 'breeds'
15
+ self.primary_key = 'name'
16
+ data_miner do
17
+ sql "Brighter Planet's list of breeds (as a URL)", 'http://data.brighterplanet.com/breeds.sql'
18
+ sql "Mess up weights", %{UPDATE breeds SET weight = 999}
19
+ end
20
+ end
21
+
22
+ describe DataMiner::Step::Sql do
23
+ before do
24
+ BreedBlue.delete_all rescue nil
25
+ end
26
+ it "can be provided as a URL" do
27
+ BreedBlue.run_data_miner!
28
+ BreedBlue.where(:name => 'Affenpinscher').count.must_equal 1
29
+ BreedBlue.where(:name => 'Württemberger').count.must_equal 1
30
+ BreedBlue.find('Afghan Hound').weight.must_be_close_to 24.9476
31
+ end
32
+ it "can be provided as a string" do
33
+ BreedRed.run_data_miner!
34
+ BreedRed.where(:name => 'Affenpinscher').count.must_equal 1
35
+ BreedRed.where(:name => 'Württemberger').count.must_equal 1
36
+ BreedRed.find('Afghan Hound').weight.must_be_close_to 999
37
+ end
38
+ end
data/test/helper.rb CHANGED
@@ -59,8 +59,10 @@ def init_models
59
59
  require 'support/breed'
60
60
  require 'support/pet'
61
61
  require 'support/pet2'
62
+ require 'support/pet3'
62
63
  Pet.auto_upgrade!
63
64
  Pet2.auto_upgrade!
65
+ Pet3.auto_upgrade!
64
66
 
65
67
  ActiveRecord::Base.descendants.each do |model|
66
68
  model.attr_accessible nil
@@ -0,0 +1,9 @@
1
+ class Pet3 < ActiveRecord::Base
2
+ col :a
3
+ data_miner do
4
+ process :auto_upgrade!
5
+ import("A list of pets", :url => "file://#{PETS}") do
6
+ key :b, :field_name => 'name'
7
+ end
8
+ end
9
+ end
@@ -7,6 +7,8 @@ describe DataMiner do
7
7
  describe "when used to import example data about pets" do
8
8
  before do
9
9
  Pet.delete_all
10
+ Pet2.delete_all
11
+ Pet3.delete_all
10
12
  DataMiner::Run.delete_all
11
13
  DataMiner::Run::ColumnStatistic.delete_all
12
14
  end
@@ -112,5 +114,11 @@ describe DataMiner do
112
114
  Pet2.run_data_miner!
113
115
  Pet2.find('Jerry').breed_id.must_equal 'Beagle-Basset'
114
116
  end
117
+ it "dies if a column specified in an import step doesn't exist" do
118
+ lambda do
119
+ Pet3.run_data_miner!
120
+ end.must_raise RuntimeError, /exist/i
121
+ end
122
+
115
123
  end
116
124
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_miner
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.4
4
+ version: 2.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-07-07 00:00:00.000000000 Z
14
+ date: 2012-07-26 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: aasm
@@ -125,6 +125,38 @@ dependencies:
125
125
  - - ! '>='
126
126
  - !ruby/object:Gem::Version
127
127
  version: 0.3.1
128
+ - !ruby/object:Gem::Dependency
129
+ name: posix-spawn
130
+ requirement: !ruby/object:Gem::Requirement
131
+ none: false
132
+ requirements:
133
+ - - ! '>='
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ type: :runtime
137
+ prerelease: false
138
+ version_requirements: !ruby/object:Gem::Requirement
139
+ none: false
140
+ requirements:
141
+ - - ! '>='
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ - !ruby/object:Gem::Dependency
145
+ name: unix_utils
146
+ requirement: !ruby/object:Gem::Requirement
147
+ none: false
148
+ requirements:
149
+ - - ! '>='
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ type: :runtime
153
+ prerelease: false
154
+ version_requirements: !ruby/object:Gem::Requirement
155
+ none: false
156
+ requirements:
157
+ - - ! '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
128
160
  - !ruby/object:Gem::Dependency
129
161
  name: dkastner-alchemist
130
162
  requirement: !ruby/object:Gem::Requirement
@@ -269,6 +301,22 @@ dependencies:
269
301
  - - ! '>='
270
302
  - !ruby/object:Gem::Version
271
303
  version: '0'
304
+ - !ruby/object:Gem::Dependency
305
+ name: rdiscount
306
+ requirement: !ruby/object:Gem::Requirement
307
+ none: false
308
+ requirements:
309
+ - - ! '>='
310
+ - !ruby/object:Gem::Version
311
+ version: '0'
312
+ type: :development
313
+ prerelease: false
314
+ version_requirements: !ruby/object:Gem::Requirement
315
+ none: false
316
+ requirements:
317
+ - - ! '>='
318
+ - !ruby/object:Gem::Version
319
+ version: '0'
272
320
  - !ruby/object:Gem::Dependency
273
321
  name: sqlite3
274
322
  requirement: !ruby/object:Gem::Requirement
@@ -344,12 +392,13 @@ files:
344
392
  - lib/data_miner/step.rb
345
393
  - lib/data_miner/step/import.rb
346
394
  - lib/data_miner/step/process.rb
347
- - lib/data_miner/step/tap.rb
395
+ - lib/data_miner/step/sql.rb
348
396
  - lib/data_miner/unit_converter.rb
349
397
  - lib/data_miner/unit_converter/alchemist.rb
350
398
  - lib/data_miner/unit_converter/conversions.rb
351
399
  - lib/data_miner/version.rb
352
400
  - test/data_miner/step/test_import.rb
401
+ - test/data_miner/step/test_sql.rb
353
402
  - test/data_miner/test_attribute.rb
354
403
  - test/data_miner/unit_converter/test_alchemist.rb
355
404
  - test/data_miner/unit_converter/test_conversions.rb
@@ -362,6 +411,7 @@ files:
362
411
  - test/support/data_miner_without_unit_converter.rb
363
412
  - test/support/pet.rb
364
413
  - test/support/pet2.rb
414
+ - test/support/pet3.rb
365
415
  - test/support/pet_color_dictionary.en.csv
366
416
  - test/support/pet_color_dictionary.es.csv
367
417
  - test/support/pets.csv
@@ -369,7 +419,6 @@ files:
369
419
  - test/test_data_miner.rb
370
420
  - test/test_data_miner_run_column_statistic.rb
371
421
  - test/test_earth_import.rb
372
- - test/test_earth_tap.rb
373
422
  - test/test_safety.rb
374
423
  - test/test_unit_conversion.rb
375
424
  homepage: https://github.com/seamusabshere/data_miner
@@ -399,6 +448,7 @@ summary: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and imp
399
448
  XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models.
400
449
  test_files:
401
450
  - test/data_miner/step/test_import.rb
451
+ - test/data_miner/step/test_sql.rb
402
452
  - test/data_miner/test_attribute.rb
403
453
  - test/data_miner/unit_converter/test_alchemist.rb
404
454
  - test/data_miner/unit_converter/test_conversions.rb
@@ -411,6 +461,7 @@ test_files:
411
461
  - test/support/data_miner_without_unit_converter.rb
412
462
  - test/support/pet.rb
413
463
  - test/support/pet2.rb
464
+ - test/support/pet3.rb
414
465
  - test/support/pet_color_dictionary.en.csv
415
466
  - test/support/pet_color_dictionary.es.csv
416
467
  - test/support/pets.csv
@@ -418,7 +469,6 @@ test_files:
418
469
  - test/test_data_miner.rb
419
470
  - test/test_data_miner_run_column_statistic.rb
420
471
  - test/test_earth_import.rb
421
- - test/test_earth_tap.rb
422
472
  - test/test_safety.rb
423
473
  - test/test_unit_conversion.rb
424
474
  has_rdoc:
@@ -1,167 +0,0 @@
1
- require 'uri'
2
-
3
- class DataMiner
4
- class Step
5
- # A step that uses https://github.com/ricardochimal/taps to import table structure and data.
6
- #
7
- # Create these by calling +tap+ inside a +data_miner+ block.
8
- #
9
- # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
10
- # @see DataMiner::Script#tap Creating a tap step by calling DataMiner::Script#tap from inside a data miner script
11
- class Tap < Step
12
- DEFAULT_PORTS = {
13
- :mysql => 3306,
14
- :mysql2 => 3306,
15
- :postgres => 5432
16
- }
17
-
18
- DEFAULT_USERNAMES = {
19
- :mysql => 'root',
20
- :mysql2 => 'root',
21
- :postgres => ''
22
- }
23
-
24
- DEFAULT_PASSWORDS = {}
25
- DEFAULT_PASSWORDS.default = ''
26
-
27
- DEFAULT_HOSTS = {}
28
- DEFAULT_HOSTS.default = '127.0.0.1'
29
-
30
- # @private
31
- attr_reader :script
32
-
33
- # A description of the tapped data source.
34
- # @return [String]
35
- attr_reader :description
36
-
37
- # The URL of the tapped data source, including username, password, domain, and port number.
38
- # @return [String]
39
- attr_reader :source
40
-
41
- # Connection options that will be passed to the +taps pull command+. Defaults to the ActiveRecord connection config, if available.
42
- # @return [Hash]
43
- attr_reader :database_options
44
-
45
- # Source table name. Defaults to the table name of the model.
46
- # @return [String]
47
- attr_reader :source_table_name
48
-
49
- # @private
50
- def initialize(script, description, source, options = {})
51
- options = options.symbolize_keys
52
- @script = script
53
- @description = description
54
- @source = source
55
- @source_table_name = options.delete(:source_table_name) || model.table_name
56
- @database_options = options.reverse_merge script.model.connection.instance_variable_get(:@config).symbolize_keys
57
- end
58
-
59
- # @private
60
- def start
61
- [ source_table_name, model.table_name ].each do |possible_obstacle|
62
- if connection.table_exists? possible_obstacle
63
- connection.drop_table possible_obstacle
64
- end
65
- end
66
- taps_pull
67
- if needs_table_rename?
68
- connection.rename_table source_table_name, model.table_name
69
- end
70
- nil
71
- end
72
-
73
- # @return [String] The name of the current database.
74
- def database
75
- unless database = database_options[:database]
76
- raise ::ArgumentError, %{[data_miner] Can't infer database name from options or ActiveRecord config.}
77
- end
78
- database
79
- end
80
-
81
- # @return [String] The database username.
82
- def username
83
- database_options[:username] || DEFAULT_USERNAMES[adapter.to_sym]
84
- end
85
-
86
- # @return [String] The database password.
87
- def password
88
- database_options[:password] || DEFAULT_PASSWORDS[adapter.to_sym]
89
- end
90
-
91
- # @return [String] The database port number.
92
- def port
93
- database_options[:port] || DEFAULT_PORTS[adapter.to_sym]
94
- end
95
-
96
- # @return [String] The database hostname.
97
- def host
98
- database_options[:host] || DEFAULT_HOSTS[adapter.to_sym]
99
- end
100
-
101
- private
102
-
103
- def connection
104
- model.connection
105
- end
106
-
107
- def needs_table_rename?
108
- source_table_name != model.table_name
109
- end
110
-
111
- def adapter
112
- case connection.adapter_name
113
- when /mysql2/i
114
- 'mysql2'
115
- when /mysql/i
116
- 'mysql'
117
- when /postgres/i
118
- 'postgres'
119
- when /sqlite/i
120
- 'sqlite'
121
- end
122
- end
123
-
124
- # "user:pass"
125
- # "user"
126
- # nil
127
- def userinfo
128
- if username.present?
129
- [username, password].select(&:present?).join(':')
130
- end
131
- end
132
-
133
- def db_url
134
- case adapter
135
- when 'sqlite'
136
- "sqlite://#{database}"
137
- else
138
- ::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
139
- end
140
- end
141
-
142
- # Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
143
- #
144
- # This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
145
- def taps_pull
146
- args = [
147
- 'taps',
148
- 'pull',
149
- db_url,
150
- source,
151
- '--indexes-first',
152
- '--tables',
153
- source_table_name
154
- ]
155
-
156
- # https://github.com/carlhuda/bundler/issues/1579
157
- if defined?(::Bundler)
158
- ::Bundler.with_clean_env do
159
- ::Kernel.system args.join(' ')
160
- end
161
- else
162
- ::Kernel.system args.join(' ')
163
- end
164
- end
165
- end
166
- end
167
- end
@@ -1,26 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
- require 'helper'
3
- init_database
4
- require 'earth'
5
-
6
- # use earth, which has a plethora of real-world data_miner blocks
7
- Earth.init :locality, :pet, :load_data_miner => false, :apply_schemas => true
8
-
9
- DataMiner.run %w{Country Breed}
10
-
11
- describe DataMiner do
12
- describe "being used by the Earth library's tap steps" do
13
- describe "for pets" do
14
- it "can pull breed and species" do
15
- Breed.find('Golden Retriever').species.must_equal Species.find('dog')
16
- end
17
- end
18
- describe "for localities" do
19
- it "can handle non-latin characters" do
20
- Country.find('DE').name.must_equal 'Germany'
21
- Country.find('AX').name.must_equal 'Åland Islands'
22
- Country.find('CI').name.must_equal "Côte d'Ivoire"
23
- end
24
- end
25
- end
26
- end