data_miner 0.5.7 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ test/test.sqlite3
7
+ data_miner.log
8
+ Gemfile.lock
data/CHANGELOG ADDED
@@ -0,0 +1,7 @@
1
+ 0.2.6
2
+ * Upgrade to remote_table 0.1.6 to handle UTF-8 CSVs and long urls.
3
+ 0.3.0
4
+ * Removed association code... now data_miner focuses on just importing.
5
+ * New, simpler DSL
6
+ * Upgrade to remote_table 0.2.1 for row_hashes and better blank row handling
7
+ * Remove all association-related code
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in data_miner.gemspec
4
+ gemspec :path => '.'
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2009 Brighter Planet
1
+ Copyright (c) 2011 Brighter Planet
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/Rakefile ADDED
@@ -0,0 +1,23 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require 'rake'
5
+ require 'rake/testtask'
6
+ Rake::TestTask.new(:test) do |test|
7
+ test.libs << 'lib' << 'test'
8
+ test.pattern = 'test/**/test_*.rb'
9
+ test.verbose = true
10
+ end
11
+
12
+ begin
13
+ require 'rake/rdoctask'
14
+ Rake::RDocTask.new do |rdoc|
15
+ rdoc.rdoc_dir = 'rdoc'
16
+ rdoc.title = 'data_miner'
17
+ rdoc.options << '--line-numbers' << '--inline-source'
18
+ rdoc.rdoc_files.include('README*')
19
+ rdoc.rdoc_files.include('lib/**/*.rb')
20
+ end
21
+ rescue LoadError
22
+ puts "Rdoc is not available"
23
+ end
data/data_miner.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "data_miner/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "data_miner"
7
+ s.version = DataMiner::VERSION
8
+ s.platform = Gem::Platform::RUBY
9
+ s.authors = ["Seamus Abshere", "Andy Rossmeissl", "Derek Kastner"]
10
+ s.email = ["seamus@abshere.net"]
11
+ s.homepage = "https://github.com/seamusabshere/data_miner"
12
+ s.summary = %{Mine remote data into your ActiveRecord models.}
13
+ s.description = %q{Mine remote data into your ActiveRecord models. You can also convert units.}
14
+
15
+ s.rubyforge_project = "data_miner"
16
+
17
+ s.files = `git ls-files`.split("\n")
18
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
+ s.require_paths = ["lib"]
21
+
22
+ s.add_dependency 'remote_table', '>=1.0.2'
23
+ s.add_dependency 'escape', '>=0.0.4'
24
+ s.add_dependency 'activerecord', '>=2.3.4'
25
+ s.add_dependency 'activesupport', '>=2.3.4'
26
+ s.add_dependency 'conversions', '>=1.4.4'
27
+ s.add_dependency 'blockenspiel', '>=0.3.2'
28
+ s.add_dependency 'taps', '>=0.3.11'
29
+ s.add_development_dependency 'loose_tight_dictionary', ">=0.0.5"
30
+ s.add_development_dependency 'errata', '>=1.0.1'
31
+ s.add_development_dependency 'test-unit'
32
+ s.add_development_dependency 'shoulda'
33
+ s.add_development_dependency 'mysql'
34
+ s.add_development_dependency 'ruby-debug'
35
+ end
data/lib/data_miner.rb CHANGED
@@ -6,82 +6,37 @@ require 'active_support/version'
6
6
  active_support/core_ext/string/multibyte
7
7
  }.each do |active_support_3_requirement|
8
8
  require active_support_3_requirement
9
- end if ActiveSupport::VERSION::MAJOR == 3
9
+ end if ::ActiveSupport::VERSION::MAJOR == 3
10
10
 
11
- require 'active_record'
12
- require 'blockenspiel'
13
- require 'conversions'
14
- require 'errata'
15
- require 'remote_table'
16
- require 'escape'
17
- require 'andand'
18
- require 'log4r'
19
- require 'fileutils'
20
- require 'tmpdir'
21
- require 'zlib'
22
-
23
- require 'data_miner/attribute'
24
- require 'data_miner/base'
25
- require 'data_miner/dictionary'
26
- require 'data_miner/import'
27
- require 'data_miner/tap'
28
- require 'data_miner/process'
29
- require 'data_miner/run'
30
- require 'data_miner/schema'
31
- require 'data_miner/verify'
11
+ require 'singleton'
32
12
 
33
- module DataMiner
13
+ class DataMiner
14
+ include ::Singleton
15
+
34
16
  class MissingHashColumn < StandardError; end
35
17
  class Finish < StandardError; end
36
18
  class Skip < StandardError; end
19
+ class VerificationFailed < StandardError; end
37
20
 
38
- mattr_accessor :logger
39
-
40
- def self.start_logging
41
- return if logger
42
-
43
- if defined? Rails
44
- self.logger = Rails.logger
45
- else
46
- class_eval { include Log4r }
47
- info_outputter = FileOutputter.new 'f1', :filename => 'data_miner.log'
48
- error_outputter = Outputter.stderr
49
- info_outputter.only_at DEBUG, INFO
50
- error_outputter.only_at WARN, ERROR, FATAL
51
-
52
- self.logger = Logger.new 'data_miner'
53
- logger.add info_outputter, error_outputter
54
- ActiveRecord::Base.logger = logger
55
- end
56
- end
57
-
58
- def self.log_or_raise(message)
59
- message = "[data_miner gem] #{message}"
60
- if ENV['RAILS_ENV'] == 'production' or ENV['DONT_RAISE'] == 'true'
61
- logger.error message
62
- else
63
- raise message
64
- end
65
- end
66
-
67
- def self.log_info(message)
68
- logger.info "[data_miner gem] #{message}"
69
- end
70
-
71
- def self.log_debug(message)
72
- logger.debug "[data_miner gem] #{message}"
73
- end
74
-
75
- def self.run(options = {})
76
- DataMiner::Base.run options.merge(:preserve_call_stack_between_runs => true)
77
- DataMiner::Base.call_stack.clear
78
- end
21
+ autoload :ActiveRecordExtensions, 'data_miner/active_record_extensions'
22
+ autoload :Attribute, 'data_miner/attribute'
23
+ autoload :Config, 'data_miner/config'
24
+ autoload :Dictionary, 'data_miner/dictionary'
25
+ autoload :Import, 'data_miner/import'
26
+ autoload :Tap, 'data_miner/tap'
27
+ autoload :Process, 'data_miner/process'
28
+ autoload :Run, 'data_miner/run'
29
+ autoload :Schema, 'data_miner/schema'
30
+ autoload :Verify, 'data_miner/verify'
79
31
 
80
- def self.resource_names
81
- DataMiner::Base.resource_names
32
+ class << self
33
+ delegate :logger, :to => :instance
34
+ delegate :logger=, :to => :instance
35
+ delegate :run, :to => :instance
36
+ delegate :resource_names, :to => :instance
82
37
  end
83
38
 
84
- # TODO this should probably live somewhere else
39
+ # TODO this should probably live somewhere else
85
40
  def self.backtick_with_reporting(cmd)
86
41
  cmd = cmd.gsub /[ ]*\n[ ]*/m, ' '
87
42
  output = `#{cmd}`
@@ -97,41 +52,45 @@ Output:
97
52
  }
98
53
  end
99
54
  end
55
+
56
+ attr_accessor :logger
100
57
 
101
- end
58
+ def resource_names
59
+ @resource_names ||= []
60
+ end
102
61
 
103
- ActiveRecord::Base.class_eval do
104
- def self.x_data_miner(&block)
105
- DataMiner.start_logging
106
-
107
- DataMiner.log_debug "Skipping data_miner block in #{self.name} because called as x_data_miner"
62
+ def call_stack
63
+ @call_stack ||= []
108
64
  end
109
65
 
110
- def self.data_miner(&block)
111
- DataMiner.start_logging
112
-
113
- DataMiner.log_debug "Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema." unless table_exists?
114
-
115
- DataMiner.resource_names.push self.name unless DataMiner.resource_names.include? self.name
116
-
117
- # this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
118
- class_eval do
119
- cattr_accessor :data_miner_base
120
- def self.data_miner_runs
121
- DataMiner::Run.scoped :conditions => { :resource_name => name }
122
- end
123
- def self.run_data_miner!(options = {})
124
- data_miner_base.run options
125
- end
126
- def self.execute_schema
127
- schema = data_miner_base.steps.find { |s| s.instance_of?(DataMiner::Schema) }
128
- schema.run(nil) if schema
66
+ def start_logging
67
+ if logger.nil?
68
+ if defined? ::Rails
69
+ @logger = ::Rails.logger
70
+ else
71
+ @logger = ::Logger.new $stdout
129
72
  end
130
73
  end
131
- self.data_miner_base = DataMiner::Base.new self
132
-
133
- Blockenspiel.invoke block, data_miner_base
74
+ ::ActiveRecord::Base.logger = logger
75
+ end
134
76
 
135
- data_miner_base.after_invoke
77
+ # Mine data. Defaults to all resource_names touched by DataMiner.
78
+ #
79
+ # Options
80
+ # * <tt>:resource_names</tt>: array of resource (class) names to mine
81
+ def run(options = {})
82
+ options = options.dup
83
+ options.stringify_keys!
84
+ options['preserve_call_stack_between_runs'] = true
85
+ resource_names.each do |resource_name|
86
+ if options['resource_names'].blank? or options['resource_names'].include?(resource_name)
87
+ resource_name.constantize.data_miner_config.run options
88
+ end
89
+ end
90
+ call_stack.clear
91
+ # RemoteTable.cleanup
136
92
  end
137
93
  end
94
+
95
+ require 'active_record'
96
+ ::ActiveRecord::Base.extend ::DataMiner::ActiveRecordExtensions
data/lib/data_miner/active_record_extensions.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'active_record'
2
+ require 'blockenspiel'
3
+
4
+ class DataMiner
5
+ module ActiveRecordExtensions
6
+ def data_miner(&blk)
7
+ ::DataMiner.instance.start_logging
8
+
9
+ ::DataMiner.logger.debug "Database table `#{table_name}` doesn't exist. It might be created in the data_miner block, but if it's not, DataMiner probably won't work properly until you run a migration or otherwise fix the schema." unless table_exists?
10
+
11
+ ::DataMiner.instance.resource_names.push self.name unless ::DataMiner.instance.resource_names.include? self.name
12
+
13
+ # this is class_eval'ed here so that each ActiveRecord descendant has its own copy, or none at all
14
+ class_eval do
15
+ cattr_accessor :data_miner_config
16
+ def self.data_miner_runs
17
+ ::DataMiner::Run.scoped :conditions => { :resource_name => name }
18
+ end
19
+ def self.run_data_miner!(options = {})
20
+ data_miner_config.run options
21
+ end
22
+ def self.execute_schema
23
+ if schema = data_miner_config.steps.detect { |s| s.instance_of?(::DataMiner::Schema) }
24
+ schema.run
25
+ end
26
+ end
27
+ end
28
+
29
+ self.data_miner_config = ::DataMiner::Config.new self
30
+
31
+ ::Blockenspiel.invoke blk, data_miner_config
32
+
33
+ data_miner_config.after_invoke
34
+ end
35
+ end
36
+ end
37
+
38
+
data/lib/data_miner/attribute.rb CHANGED
@@ -1,45 +1,49 @@
1
- module DataMiner
1
+ require 'conversions'
2
+
3
+ class DataMiner
2
4
  class Attribute
3
- attr_accessor :step
4
- attr_accessor :name
5
- attr_accessor :options
5
+ attr_reader :step
6
+ attr_reader :name
7
+ attr_reader :options
6
8
 
7
- delegate :resource, :to => :step
9
+ def resource
10
+ step.resource
11
+ end
8
12
 
9
- VALID_OPTIONS = [
10
- :from_units,
11
- :to_units,
12
- :static,
13
- :dictionary,
14
- :matcher,
15
- :field_name,
16
- :delimiter,
17
- :split,
18
- :units,
19
- :sprintf,
20
- :nullify,
21
- :overwrite,
22
- :upcase,
23
- :units_field_name,
24
- :units_field_number,
25
- :field_number,
26
- :chars,
27
- :synthesize
28
- ]
13
+ VALID_OPTIONS = %w{
14
+ from_units
15
+ to_units
16
+ static
17
+ dictionary
18
+ matcher
19
+ field_name
20
+ delimiter
21
+ split
22
+ units
23
+ sprintf
24
+ nullify
25
+ overwrite
26
+ upcase
27
+ units_field_name
28
+ units_field_number
29
+ field_number
30
+ chars
31
+ synthesize
32
+ }
29
33
 
30
34
  def initialize(step, name, options = {})
31
- options.symbolize_keys!
35
+ @options = options.dup
36
+ @options.stringify_keys!
32
37
 
33
38
  @step = step
34
39
  @name = name
35
40
 
36
- invalid_option_keys = options.keys.select { |k| not VALID_OPTIONS.include? k }
37
- DataMiner.log_or_raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
38
- @options = options
41
+ invalid_option_keys = @options.keys.select { |k| not VALID_OPTIONS.include? k }
42
+ raise "Invalid options: #{invalid_option_keys.map(&:inspect).to_sentence} (#{inspect})" if invalid_option_keys.any?
39
43
  end
40
44
 
41
45
  def inspect
42
- "Attribute(#{resource}##{name})"
46
+ %{#<DataMiner::Attribute(#{resource}##{name})>}
43
47
  end
44
48
 
45
49
  def value_in_dictionary(str)
@@ -50,7 +54,7 @@ module DataMiner
50
54
  if wants_static?
51
55
  value = static
52
56
  elsif field_number
53
- if field_number.is_a?(Range)
57
+ if field_number.is_a?(::Range)
54
58
  value = field_number.map { |n| row[n] }.join(delimiter)
55
59
  else
56
60
  value = row[field_number]
@@ -59,7 +63,7 @@ module DataMiner
59
63
  value = row[field_name]
60
64
  end
61
65
  return nil if value.nil?
62
- return value if value.is_a?(ActiveRecord::Base) # escape valve for parsers that look up associations directly
66
+ return value if value.is_a?(::ActiveRecord::Base) # escape valve for parsers that look up associations directly
63
67
  value = value.to_s
64
68
  value = value[chars] if wants_chars?
65
69
  value = do_split(value) if wants_split?
@@ -81,7 +85,7 @@ module DataMiner
81
85
  def value_from_row(row)
82
86
  return match_row row if wants_matcher?
83
87
  value = value_in_source row
84
- return value if value.is_a? ActiveRecord::Base # carry through trapdoor
88
+ return value if value.is_a? ::ActiveRecord::Base # carry through trapdoor
85
89
  value = value_in_dictionary value if wants_dictionary?
86
90
  value = synthesize.call(row) if wants_synthesize?
87
91
  value = nil if value.blank? and wants_nullification?
@@ -100,7 +104,7 @@ module DataMiner
100
104
 
101
105
  what_it_is = record.send name
102
106
  if what_it_is.nil? and !what_it_should_be.nil?
103
- DataMiner.log_debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
107
+ ::DataMiner.logger.debug "ActiveRecord didn't like trying to set #{resource}.#{name} = #{what_it_should_be} (it came out as nil)"
104
108
  nil
105
109
  elsif what_it_is == what_it_was
106
110
  false
@@ -114,7 +118,7 @@ module DataMiner
114
118
  end
115
119
 
116
120
  def do_convert(row, value)
117
- DataMiner.log_or_raise "If you use :from_units, you need to set :to_units (#{inspect})" unless wants_units?
121
+ raise "If you use 'from_units', you need to set 'to_units' (#{inspect})" unless wants_units?
118
122
  value.to_f.convert((from_units || unit_from_source(row)), (to_units || unit_from_source(row)))
119
123
  end
120
124
 
@@ -128,8 +132,8 @@ module DataMiner
128
132
  end
129
133
 
130
134
  def do_split(value)
131
- pattern = split_options[:pattern] || /\s+/ # default is split on whitespace
132
- keep = split_options[:keep] || 0 # default is keep first element
135
+ pattern = split_options['pattern'] || /\s+/ # default is split on whitespace
136
+ keep = split_options['keep'] || 0 # default is keep first element
133
137
  value.to_s.split(pattern)[keep].to_s
134
138
  end
135
139
 
@@ -148,7 +152,7 @@ module DataMiner
148
152
  upcase.present?
149
153
  end
150
154
  def wants_static?
151
- options.has_key? :static
155
+ options.has_key? 'static'
152
156
  end
153
157
  def wants_nullification?
154
158
  nullify == true
@@ -157,7 +161,7 @@ module DataMiner
157
161
  chars.present?
158
162
  end
159
163
  def wants_synthesize?
160
- synthesize.is_a?(Proc)
164
+ synthesize.is_a?(::Proc)
161
165
  end
162
166
  def wants_overwriting?
163
167
  overwrite != false
@@ -169,66 +173,67 @@ module DataMiner
169
173
  to_units.present? or units_field_name.present? or units_field_number.present?
170
174
  end
171
175
  def wants_dictionary?
172
- options[:dictionary].present?
176
+ options['dictionary'].present?
173
177
  end
174
178
  def wants_matcher?
175
- options[:matcher].present?
179
+ options['matcher'].present?
176
180
  end
177
181
 
178
182
  # Options that always have values
179
183
  def field_name
180
- (options[:field_name] || name).to_s
184
+ (options['field_name'] || name).to_s
181
185
  end
182
186
  def delimiter
183
- (options[:delimiter] || ', ')
187
+ (options['delimiter'] || ', ')
184
188
  end
185
189
 
186
190
  # Options that can't be referred to by their names
187
191
  def split_options
188
- options[:split]
192
+ options['split']
189
193
  end
190
194
 
191
195
  def from_units
192
- options[:from_units]
196
+ options['from_units']
193
197
  end
194
198
  def to_units
195
- options[:to_units] || options[:units]
199
+ options['to_units'] || options['units']
196
200
  end
197
201
  def sprintf
198
- options[:sprintf]
202
+ options['sprintf']
199
203
  end
200
204
  def nullify
201
- options[:nullify]
205
+ options['nullify']
202
206
  end
203
207
  def overwrite
204
- options[:overwrite]
208
+ options['overwrite']
205
209
  end
206
210
  def upcase
207
- options[:upcase]
211
+ options['upcase']
208
212
  end
209
213
  def units_field_name
210
- options[:units_field_name]
214
+ options['units_field_name']
211
215
  end
212
216
  def units_field_number
213
- options[:units_field_number]
217
+ options['units_field_number']
214
218
  end
215
219
  def field_number
216
- options[:field_number]
220
+ options['field_number']
217
221
  end
218
222
  def chars
219
- options[:chars]
223
+ options['chars']
220
224
  end
221
225
  def synthesize
222
- options[:synthesize]
226
+ options['synthesize']
223
227
  end
224
228
  def static
225
- options[:static]
229
+ options['static']
226
230
  end
231
+ # must be cleared before every run! (because it relies on remote data)
227
232
  def dictionary
228
- @_dictionary ||= (options[:dictionary].is_a?(Dictionary) ? options[:dictionary] : Dictionary.new(options[:dictionary]))
233
+ @dictionary ||= (options['dictionary'].is_a?(Dictionary) ? options['dictionary'] : Dictionary.new(options['dictionary']))
229
234
  end
230
235
  def matcher
231
- @_matcher ||= (options[:matcher].is_a?(String) ? options[:matcher].constantize.new : options[:matcher])
236
+ @matcher ||= (options['matcher'].is_a?(::String) ? options['matcher'].constantize.new : options['matcher'])
232
237
  end
233
238
  end
234
239
  end