data_miner 0.5.7 → 1.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ test/test.sqlite3
7
+ data_miner.log
8
+ Gemfile.lock
data/CHANGELOG ADDED
@@ -0,0 +1,7 @@
1
+ 0.2.6
2
+ * Upgrade to remote_table 0.1.6 to handle UTF-8 CSVs and long urls.
3
+ 0.3.0
4
+ * Removed association code... now data_miner focuses on just importing.
5
+ * New, simpler DSL
6
+ * Upgrade to remote_table 0.2.1 for row_hashes and better blank row handling
7
+ * Remove all association-related code
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in data_miner.gemspec
4
+ gemspec :path => '.'
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2009 Brighter Planet
1
+ Copyright (c) 2011 Brighter Planet
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/Rakefile ADDED
@@ -0,0 +1,23 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require 'rake'
5
+ require 'rake/testtask'
6
+ Rake::TestTask.new(:test) do |test|
7
+ test.libs << 'lib' << 'test'
8
+ test.pattern = 'test/**/test_*.rb'
9
+ test.verbose = true
10
+ end
11
+
12
+ begin
13
+ require 'rake/rdoctask'
14
+ Rake::RDocTask.new do |rdoc|
15
+ rdoc.rdoc_dir = 'rdoc'
16
+ rdoc.title = 'data_miner'
17
+ rdoc.options << '--line-numbers' << '--inline-source'
18
+ rdoc.rdoc_files.include('README*')
19
+ rdoc.rdoc_files.include('lib/**/*.rb')
20
+ end
21
+ rescue LoadError
22
+ puts "Rdoc is not available"
23
+ end
data/data_miner.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "data_miner/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "data_miner"
7
+ s.version = DataMiner::VERSION
8
+ s.platform = Gem::Platform::RUBY
9
+ s.authors = ["Seamus Abshere", "Andy Rossmeissl", "Derek Kastner"]
10
+ s.email = ["seamus@abshere.net"]
11
+ s.homepage = "https://github.com/seamusabshere/data_miner"
12
+ s.summary = %{Mine remote data into your ActiveRecord models.}
13
+ s.description = %q{Mine remote data into your ActiveRecord models. You can also convert units.}
14
+
15
+ s.rubyforge_project = "data_miner"
16
+
17
+ s.files = `git ls-files`.split("\n")
18
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
+ s.require_paths = ["lib"]
21
+
22
+ s.add_dependency 'remote_table', '>=1.0.2'
23
+ s.add_dependency 'escape', '>=0.0.4'
24
+ s.add_dependency 'activerecord', '>=2.3.4'
25
+ s.add_dependency 'activesupport', '>=2.3.4'
26
+ s.add_dependency 'conversions', '>=1.4.4'
27
+ s.add_dependency 'blockenspiel', '>=0.3.2'
28
+ s.add_dependency 'taps', '>=0.3.11'
29
+ s.add_development_dependency 'loose_tight_dictionary', ">=0.0.5"
30
+ s.add_development_dependency 'errata', '>=1.0.1'
31
+ s.add_development_dependency 'test-unit'
32
+ s.add_development_dependency 'shoulda'
33
+ s.add_development_dependency 'mysql'
34
+ s.add_development_dependency 'ruby-debug'
35
+ end
data/lib/data_miner.rb CHANGED
@@ -6,82 +6,37 @@ require 'active_support/version'
6
6
  active_support/core_ext/string/multibyte
7
7
  }.each do |active_support_3_requirement|
8
8
  require active_support_3_requirement
9
- end if ActiveSupport::VERSION::MAJOR == 3
9
+ end if ::ActiveSupport::VERSION::MAJOR == 3
10
10
 
11
- require 'active_record'
12
- require 'blockenspiel'
13
- require 'conversions'
14
- require 'errata'
15
- require 'remote_table'
16
- require 'escape'
17
- require 'andand'
18
- require 'log4r'
19
- require 'fileutils'
20
- require 'tmpdir'
21
- require 'zlib'
22
-
23
- require 'data_miner/attribute'
24
- require 'data_miner/base'
25
- require 'data_miner/dictionary'
26
- require 'data_miner/import'
27
- require 'data_miner/tap'
28
- require 'data_miner/process'
29
- require 'data_miner/run'
30
- require 'data_miner/schema'
31
- require 'data_miner/verify'
11
+ require 'singleton'
32
12
 
33
- module DataMiner
13
+ class DataMiner
14
+ include ::Singleton
15
+
34
16
  class MissingHashColumn < StandardError; end
35
17
  class Finish < StandardError; end
36
18
  class Skip < StandardError; end
19
+ class VerificationFailed < StandardError; end
37
20
 
38
- mattr_accessor :logger
39
-
40
- def self.start_logging
41
- return if logger
42
-
43
- if defined? Rails
44
- self.logger = Rails.logger
45
- else
46
- class_eval { include Log4r }
47
- info_outputter = FileOutputter.new 'f1', :filename => 'data_miner.log'
48
- error_outputter = Outputter.stderr
49
- info_outputter.only_at DEBUG, INFO
50
- error_outputter.only_at WARN, ERROR, FATAL
51
-
52
- self.logger = Logger.new 'data_miner'
53
- logger.add info_outputter, error_outputter
54
- ActiveRecord::Base.logger = logger
55
- end
56
- end
57
-
58
- def self.log_or_raise(message)
59
- message = "[data_miner gem] #{message}"
60
- if ENV['RAILS_ENV'] == 'production' or ENV['DONT_RAISE'] == 'true'
61
- logger.error message
62
- else
63
- raise message
64
- end
65
- end
66
-
67
- def self.log_info(message)
68
- logger.info "[data_miner gem] #{message}"
69
- end
70
-
71
- def self.log_debug(message)
72
- logger.debug "[data_miner gem] #{message}"
73
- end
74
-
75
- def self.run(options = {})
76
- DataMiner::Base.run options.merge(:preserve_call_stack_between_runs => true)
77
- DataMiner::Base.call_stack.clear
78
- end
21
+ autoload :ActiveRecordExtensions, 'data_miner/active_record_extensions'
22
+ autoload :Attribute, 'data_miner/attribute'
23
+ autoload :Config, 'data_miner/config'
24
+ autoload :Dictionary, 'data_miner/dictionary'
25
+ autoload :Import, 'data_miner/import'
26
+ autoload :Tap, 'data_miner/tap'
27
+ autoload :Process, 'data_miner/process'
28
+ autoload :Run, 'data_miner/run'
29
+ autoload :Schema, 'data_miner/schema'
30
+ autoload :Verify, 'data_miner/verify'
79
31
 
80
- def self.resource_names
81
- DataMiner::Base.resource_names
32
+ class << self
33
+ delegate :logger, :to => :instance
34
+ delegate :logger=, :to => :instance
35
+ delegate :run, :to => :instance
36
+ delegate :resource_names, :to => :instance
82
37
  end
83
38
 
84
- # TODO this should probably live somewhere else
39
+ # TODO this should probably live somewhere else
85
40
  def self.backtick_with_reporting(cmd)
86
41
  cmd = cmd.gsub /[ ]*\n[ ]*/m, ' '
87
42
  output = `#{cmd}`
@@ -97,41 +52,45 @@ Output:
97
52
  }
98
53
  end
99
54
  end
55
+
56
+ attr_accessor :logger
100
57
 
101
- end
58
+ def resource_names
59
+ @resource_names ||= []
60
+ end
102
61
 
103
- ActiveRecord::Base.class_eval do
104
- def self.x_data_miner(&block)
105
- DataMiner.start_logging
106
-
107
- DataMiner.log_debug "Skipping data_miner block in #{self.name} because called as x_data_miner"
62
+ def call_stack
63
+ @call_stack ||= []
108
64
  end
109
65
 
110
- def self.data_miner(&block)
111
- DataMiner.start_logging
112
-
113
- DataMiner.log_debug "Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema." unless table_exists?
114
-
115
- DataMiner.resource_names.push self.name unless DataMiner.resource_names.include? self.name
116
-
117
- # this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
118
- class_eval do
119
- cattr_accessor :data_miner_base
120
- def self.data_miner_runs
121
- DataMiner::Run.scoped :conditions => { :resource_name => name }
122
- end
123
- def self.run_data_miner!(options = {})
124
- data_miner_base.run options
125
- end
126
- def self.execute_schema
127
- schema = data_miner_base.steps.find { |s| s.instance_of?(DataMiner::Schema) }
128
- schema.run(nil) if schema
66
+ def start_logging
67
+ if logger.nil?
68
+ if defined? ::Rails
69
+ @logger = ::Rails.logger
70
+ else
71
+ @logger = ::Logger.new $stdout
129
72
  end
130
73
  end
131
- self.data_miner_base = DataMiner::Base.new self
132
-
133
- Blockenspiel.invoke block, data_miner_base
74
+ ::ActiveRecord::Base.logger = logger
75
+ end
134
76
 
135
- data_miner_base.after_invoke
77
+ # Mine data. Defaults to all resource_names touched by DataMiner.
78
+ #
79
+ # Options
80
+ # * <tt>:resource_names</tt>: array of resource (class) names to mine
81
+ def run(options = {})
82
+ options = options.dup
83
+ options.stringify_keys!
84
+ options['preserve_call_stack_between_runs'] = true
85
+ resource_names.each do |resource_name|
86
+ if options['resource_names'].blank? or options['resource_names'].include?(resource_name)
87
+ resource_name.constantize.data_miner_config.run options
88
+ end
89
+ end
90
+ call_stack.clear
91
+ # RemoteTable.cleanup
136
92
  end
137
93
  end
94
+
95
+ require 'active_record'
96
+ ::ActiveRecord::Base.extend ::DataMiner::ActiveRecordExtensions
data/lib/data_miner/active_record_extensions.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'active_record'
2
+ require 'blockenspiel'
3
+
4
+ class DataMiner
5
+ module ActiveRecordExtensions
6
+ def data_miner(&blk)
7
+ ::DataMiner.instance.start_logging
8
+
9
+ ::DataMiner.logger.debug "Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema." unless table_exists?
10
+
11
+ ::DataMiner.instance.resource_names.push self.name unless ::DataMiner.instance.resource_names.include? self.name
12
+
13
+ # this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
14
+ class_eval do
15
+ cattr_accessor :data_miner_config
16
+ def self.data_miner_runs
17
+ ::DataMiner::Run.scoped :conditions => { :resource_name => name }
18
+ end
19
+ def self.run_data_miner!(options = {})
20
+ data_miner_config.run options
21
+ end
22
+ def self.execute_schema
23
+ if schema = data_miner_config.steps.detect { |s| s.instance_of?(::DataMiner::Schema) }
24
+ schema.run
25
+ end
26
+ end
27
+ end
28
+
29
+ self.data_miner_config = ::DataMiner::Config.new self
30
+
31
+ ::Blockenspiel.invoke blk, data_miner_config
32
+
33
+ data_miner_config.after_invoke
34
+ end
35
+ end
36
+ end
37
+
38
+
data/lib/data_miner/attribute.rb CHANGED
@@ -1,45 +1,49 @@
1
- module DataMiner
1
+ require 'conversions'
2
+
3
+ class DataMiner
2
4
  class Attribute
3
- attr_accessor :step
4
- attr_accessor :name
5
- attr_accessor :options
5
+ attr_reader :step
6
+ attr_reader :name
7
+ attr_reader :options
6
8
 
7
- delegate :resource, :to => :step
9
+ def resource
10
+ step.resource
11
+ end
8
12
 
9
- VALID_OPTIONS = [
10
- :from_units,
11
- :to_units,
12
- :static,
13
- :dictionary,
14
- :matcher,
15
- :field_name,
16
- :delimiter,
17
- :split,
18
- :units,
19
- :sprintf,
20
- :nullify,
21
- :overwrite,
22
- :upcase,
23
- :units_field_name,
24
- :units_field_number,
25
- :field_number,
26
- :chars,
27
- :synthesize
28
- ]
13
+ VALID_OPTIONS = %w{
14
+ from_units
15
+ to_units
16
+ static
17
+ dictionary
18
+ matcher
19
+ field_name
20
+ delimiter
21
+ split
22
+ units
23
+ sprintf
24
+ nullify
25
+ overwrite
26
+ upcase
27
+ units_field_name
28
+ units_field_number
29
+ field_number
30
+ chars
31
+ synthesize
32
+ }
29
33
 
30
34
  def initialize(step, name, options = {})
31
- options.symbolize_keys!
35
+ @options = options.dup
36
+ @options.stringify_keys!
32
37
 
33
38
  @step = step
34
39
  @name = name
35
40
 
36
- invalid_option_keys = options.keys.select { |k| not VALID_OPTIONS.include? k }
37
- DataMiner.log_or_raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
38
- @options = options
41
+ invalid_option_keys = @options.keys.select { |k| not VALID_OPTIONS.include? k }
42
+ raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
39
43
  end
40
44
 
41
45
  def inspect
42
- "Attribute(#{resource}##{name})"
46
+ %{#<DataMiner::Attribute(#{resource}##{name})>}
43
47
  end
44
48
 
45
49
  def value_in_dictionary(str)
@@ -50,7 +54,7 @@ module DataMiner
50
54
  if wants_static?
51
55
  value = static
52
56
  elsif field_number
53
- if field_number.is_a?(Range)
57
+ if field_number.is_a?(::Range)
54
58
  value = field_number.map { |n| row[n] }.join(delimiter)
55
59
  else
56
60
  value = row[field_number]
@@ -59,7 +63,7 @@ module DataMiner
59
63
  value = row[field_name]
60
64
  end
61
65
  return nil if value.nil?
62
- return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
66
+ return value if value.is_a?(::ActiveRecord::Base) # escape valve for parsers that look up associations directly
63
67
  value = value.to_s
64
68
  value = value[chars] if wants_chars?
65
69
  value = do_split(value) if wants_split?
@@ -81,7 +85,7 @@ module DataMiner
81
85
  def value_from_row(row)
82
86
  return match_row row if wants_matcher?
83
87
  value = value_in_source row
84
- return value if value.is_a? ActiveRecord::Base # carry through trapdoor
88
+ return value if value.is_a? ::ActiveRecord::Base # carry through trapdoor
85
89
  value = value_in_dictionary value if wants_dictionary?
86
90
  value = synthesize.call(row) if wants_synthesize?
87
91
  value = nil if value.blank? and wants_nullification?
@@ -100,7 +104,7 @@ module DataMiner
100
104
 
101
105
  what_it_is = record.send name
102
106
  if what_it_is.nil? and !what_it_should_be.nil?
103
- DataMiner.log_debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
107
+ ::DataMiner.logger.debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
104
108
  nil
105
109
  elsif what_it_is == what_it_was
106
110
  false
@@ -114,7 +118,7 @@ module DataMiner
114
118
  end
115
119
 
116
120
  def do_convert(row, value)
117
- DataMiner.log_or_raise "If you use :from_units, you need to set :to_units (#{inspect})" unless wants_units?
121
+ raise "If you use 'from_units', you need to set 'to_units' (#{inspect})" unless wants_units?
118
122
  value.to_f.convert((from_units || unit_from_source(row)), (to_units || unit_from_source(row)))
119
123
  end
120
124
 
@@ -128,8 +132,8 @@ module DataMiner
128
132
  end
129
133
 
130
134
  def do_split(value)
131
- pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
132
- keep = split_options[:keep] || 0 # default is keep first element
135
+ pattern = split_options['pattern'] || /\s+/ # default is split on whitespace
136
+ keep = split_options['keep'] || 0 # default is keep first element
133
137
  value.to_s.split(pattern)[keep].to_s
134
138
  end
135
139
 
@@ -148,7 +152,7 @@ module DataMiner
148
152
  upcase.present?
149
153
  end
150
154
  def wants_static?
151
- options.has_key? :static
155
+ options.has_key? 'static'
152
156
  end
153
157
  def wants_nullification?
154
158
  nullify == true
@@ -157,7 +161,7 @@ module DataMiner
157
161
  chars.present?
158
162
  end
159
163
  def wants_synthesize?
160
- synthesize.is_a?(Proc)
164
+ synthesize.is_a?(::Proc)
161
165
  end
162
166
  def wants_overwriting?
163
167
  overwrite != false
@@ -169,66 +173,67 @@ module DataMiner
169
173
  to_units.present? or units_field_name.present? or units_field_number.present?
170
174
  end
171
175
  def wants_dictionary?
172
- options[:dictionary].present?
176
+ options['dictionary'].present?
173
177
  end
174
178
  def wants_matcher?
175
- options[:matcher].present?
179
+ options['matcher'].present?
176
180
  end
177
181
 
178
182
  # Options that always have values
179
183
  def field_name
180
- (options[:field_name] || name).to_s
184
+ (options['field_name'] || name).to_s
181
185
  end
182
186
  def delimiter
183
- (options[:delimiter] || ', ')
187
+ (options['delimiter'] || ', ')
184
188
  end
185
189
 
186
190
  # Options that can't be referred to by their names
187
191
  def split_options
188
- options[:split]
192
+ options['split']
189
193
  end
190
194
 
191
195
  def from_units
192
- options[:from_units]
196
+ options['from_units']
193
197
  end
194
198
  def to_units
195
- options[:to_units] || options[:units]
199
+ options['to_units'] || options['units']
196
200
  end
197
201
  def sprintf
198
- options[:sprintf]
202
+ options['sprintf']
199
203
  end
200
204
  def nullify
201
- options[:nullify]
205
+ options['nullify']
202
206
  end
203
207
  def overwrite
204
- options[:overwrite]
208
+ options['overwrite']
205
209
  end
206
210
  def upcase
207
- options[:upcase]
211
+ options['upcase']
208
212
  end
209
213
  def units_field_name
210
- options[:units_field_name]
214
+ options['units_field_name']
211
215
  end
212
216
  def units_field_number
213
- options[:units_field_number]
217
+ options['units_field_number']
214
218
  end
215
219
  def field_number
216
- options[:field_number]
220
+ options['field_number']
217
221
  end
218
222
  def chars
219
- options[:chars]
223
+ options['chars']
220
224
  end
221
225
  def synthesize
222
- options[:synthesize]
226
+ options['synthesize']
223
227
  end
224
228
  def static
225
- options[:static]
229
+ options['static']
226
230
  end
231
+ # must be cleared before every run! (because it relies on remote data)
227
232
  def dictionary
228
- @_dictionary ||= (options[:dictionary].is_a?(Dictionary) ? options[:dictionary] : Dictionary.new(options[:dictionary]))
233
+ @dictionary ||= (options['dictionary'].is_a?(Dictionary) ? options['dictionary'] : Dictionary.new(options['dictionary']))
229
234
  end
230
235
  def matcher
231
- @_matcher ||= (options[:matcher].is_a?(String) ? options[:matcher].constantize.new : options[:matcher])
236
+ @matcher ||= (options['matcher'].is_a?(::String) ? options['matcher'].constantize.new : options['matcher'])
232
237
  end
233
238
  end
234
239
  end