data_miner 1.3.8 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. data/CHANGELOG +42 -0
  2. data/Gemfile +19 -3
  3. data/README.rdoc +3 -3
  4. data/Rakefile +13 -15
  5. data/data_miner.gemspec +4 -15
  6. data/lib/data_miner.rb +69 -70
  7. data/lib/data_miner/active_record_extensions.rb +17 -22
  8. data/lib/data_miner/attribute.rb +176 -179
  9. data/lib/data_miner/dictionary.rb +38 -31
  10. data/lib/data_miner/run.rb +49 -18
  11. data/lib/data_miner/script.rb +116 -0
  12. data/lib/data_miner/step.rb +5 -0
  13. data/lib/data_miner/step/import.rb +74 -0
  14. data/lib/data_miner/step/process.rb +34 -0
  15. data/lib/data_miner/step/tap.rb +134 -0
  16. data/lib/data_miner/version.rb +1 -1
  17. data/test/helper.rb +26 -24
  18. data/test/support/breeds.xls +0 -0
  19. data/test/support/pet_color_dictionary.en.csv +5 -0
  20. data/test/support/pet_color_dictionary.es.csv +5 -0
  21. data/test/support/pets.csv +5 -0
  22. data/test/support/pets_funny.csv +4 -0
  23. data/test/test_data_miner.rb +103 -0
  24. data/test/test_earth_import.rb +25 -0
  25. data/test/test_earth_tap.rb +25 -0
  26. data/test/test_safety.rb +43 -0
  27. metadata +72 -78
  28. data/.document +0 -5
  29. data/lib/data_miner/config.rb +0 -124
  30. data/lib/data_miner/import.rb +0 -93
  31. data/lib/data_miner/process.rb +0 -38
  32. data/lib/data_miner/tap.rb +0 -143
  33. data/test/support/aircraft.rb +0 -102
  34. data/test/support/airport.rb +0 -16
  35. data/test/support/automobile_fuel_type.rb +0 -40
  36. data/test/support/automobile_variant.rb +0 -362
  37. data/test/support/country.rb +0 -15
  38. data/test/support/test_database.rb +0 -311
  39. data/test/test_data_miner_attribute.rb +0 -111
  40. data/test/test_data_miner_process.rb +0 -18
  41. data/test/test_old_syntax.rb +0 -825
  42. data/test/test_tap.rb +0 -21
@@ -1,55 +1,62 @@
1
1
  require 'remote_table'
2
+
2
3
  class DataMiner
3
4
  class Dictionary
4
- attr_reader :options
5
+ DEFAULT_CASE_SENSITIVE = true
6
+
7
+ attr_reader :key_name
8
+ attr_reader :value_name
9
+ attr_reader :sprintf
10
+ attr_reader :url
11
+ attr_reader :case_sensitive
12
+
5
13
  def initialize(options = {})
6
- @options = options.dup
7
- @options.stringify_keys!
8
- end
9
-
10
- def key_name
11
- options['input']
12
- end
13
-
14
- def value_name
15
- options['output']
16
- end
17
-
18
- def sprintf
19
- options['sprintf'] || '%s'
14
+ options = options.symbolize_keys
15
+ @url = options[:url]
16
+ @key_name = options[:input]
17
+ @value_name = options[:output]
18
+ @sprintf = options[:sprintf]
19
+ @case_sensitive = options.fetch :case_sensitive, DEFAULT_CASE_SENSITIVE
20
+ @table_mutex = ::Mutex.new
20
21
  end
21
22
 
22
23
  def table
23
- @table ||= ::RemoteTable.new(options['url']).to_a # convert to Array immediately
24
+ @table || @table_mutex.synchronize do
25
+ @table ||= ::RemoteTable.new(url).to_a # make sure it's fully cached
26
+ end
24
27
  end
25
28
 
26
- def free
27
- @table.free if @table.is_a?(::RemoteTable)
29
+ def refresh
28
30
  @table = nil
29
31
  end
30
-
32
+
31
33
  def lookup(key)
32
- find key_name, key, value_name, 'sprintf' => sprintf
34
+ find key_name, key, value_name, {:sprintf => sprintf, :case_sensitive => case_sensitive}
33
35
  end
34
36
 
35
37
  def find(key_name, key, value_name, options = {})
36
- if match = table.detect { |row| normalize_for_comparison(key, options) == normalize_for_comparison(row[key_name], options) }
37
- match[value_name].to_s
38
+ normalized_key = normalize_for_comparison(key, options)
39
+ if match = table.detect { |row| normalized_key == normalize_for_comparison(row[key_name.to_s], options) }
40
+ match[value_name.to_s].to_s
38
41
  end
39
42
  end
40
-
43
+
41
44
  private
42
45
 
43
- def normalize_for_comparison(string, options = {})
44
- if options['sprintf']
45
- if /\%[0-9\.]*f/.match options['sprintf']
46
- string = string.to_f
47
- elsif /\%[0-9\.]*d/.match options['sprintf']
48
- string = string.to_i
46
+ def normalize_for_comparison(str, options = {})
47
+ if sprintf
48
+ if sprintf.end_with?('f')
49
+ str = str.to_f
50
+ elsif sprintf.end_with?('d')
51
+ str = str.to_i
49
52
  end
50
- string = sprintf % string
53
+ str = sprintf % str
54
+ end
55
+ str = DataMiner.compress_whitespace str
56
+ unless options[:case_sensitive]
57
+ str = DataMiner.downcase str
51
58
  end
52
- string.to_s.strip
59
+ str
53
60
  end
54
61
  end
55
62
  end
@@ -1,26 +1,57 @@
1
+ require 'aasm'
2
+ require 'active_record_inline_schema'
3
+
1
4
  class DataMiner
2
5
  class Run < ::ActiveRecord::Base
3
- set_table_name 'data_miner_runs'
4
-
5
- def resource
6
- resource_name.constantize
6
+ class Skip < ::Exception
7
7
  end
8
-
9
- class << self
10
- def create_tables
11
- return if table_exists?
12
- connection.create_table 'data_miner_runs', :force => true do |t|
13
- t.string 'resource_name'
14
- t.boolean 'killed'
15
- t.boolean 'skipped'
16
- t.boolean 'finished'
17
- t.datetime 'started_at'
18
- t.datetime 'terminated_at'
19
- t.datetime 'created_at'
20
- t.datetime 'updated_at'
8
+
9
+ INITIAL_STATE = :limbo
10
+
11
+ self.table_name = 'data_miner_runs'
12
+
13
+ col :model_name
14
+ col :aasm_state
15
+ col :created_at, :type => :datetime
16
+ col :stopped_at, :type => :datetime
17
+ col :updated_at, :type => :datetime
18
+ col :error, :type => :text
19
+
20
+ include ::AASM
21
+ aasm_initial_state INITIAL_STATE
22
+ aasm_state :limbo
23
+ aasm_state :skipped
24
+ aasm_state :succeeded
25
+ aasm_state :failed
26
+ aasm_event(:succeed) { transitions :from => :limbo, :to => :succeeded }
27
+ aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
28
+ aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
29
+
30
+ validates_presence_of :model_name
31
+
32
+ def perform
33
+ save!
34
+ begin
35
+ catch :data_miner_succeed do
36
+ yield
21
37
  end
22
- reset_column_information
38
+ succeed!
39
+ rescue Skip
40
+ skip!
41
+ rescue
42
+ self.error = "#{$!.message}\n#{$!.backtrace.join("\n")}"
43
+ fail!
44
+ raise $!
45
+ ensure
46
+ self.stopped_at = ::Time.now
47
+ save!
48
+ DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
23
49
  end
24
50
  end
51
+ lock_method :perform
52
+
53
+ def as_lock
54
+ [Run.connection.current_database, model_name]
55
+ end
25
56
  end
26
57
  end
@@ -0,0 +1,116 @@
1
class DataMiner
  # An ordered list of steps (process, tap, import) attached to a model,
  # together with the logic to run them inside a DataMiner::Run.
  class Script
    # Keys under which per-thread state is kept in ::Thread.current.
    UNIQ_THREAD_VAR = 'DataMiner::Script.current_uniq'.freeze
    STACK_THREAD_VAR = 'DataMiner::Script.current_stack'.freeze

    class << self
      # @private
      # Cf. activerecord-3.2.3/lib/active_record/scoping.rb
      #
      # Runs the block with the per-thread "uniq" flag set, restoring the
      # previous value afterwards (even if the block raises). While the flag
      # is set, #perform skips models already present in the current stack.
      def uniq
        previous_uniq = current_uniq
        Script.current_uniq = true
        begin
          yield
        ensure
          Script.current_uniq = previous_uniq
        end
      end

      # @return [Array<String>] model names performed so far on this thread
      def current_stack
        ::Thread.current[STACK_THREAD_VAR] ||= []
      end

      def current_stack=(stack)
        ::Thread.current[STACK_THREAD_VAR] = stack
      end

      # @return [true, false, nil] whether this thread is inside a .uniq block
      def current_uniq
        ::Thread.current[UNIQ_THREAD_VAR]
      end

      def current_uniq=(uniq)
        ::Thread.current[UNIQ_THREAD_VAR] = uniq
      end
    end

    # The model this script belongs to.
    attr_reader :model
    # The steps in the order they will be performed.
    attr_reader :steps

    def initialize(model)
      @model = model
      @steps = []
    end

    # Evaluates +blk+ in the context of this script, so that calls such as
    # +process+, +tap+ and +import+ inside it add steps here.
    def append_block(blk)
      instance_eval(&blk)
    end

    # Appends a Process step.
    def process(method_id_or_description, &blk)
      append(:process, method_id_or_description, &blk)
    end

    # Appends a Tap step. Note that this replaces Object#tap on scripts.
    def tap(description, source, options = {})
      append :tap, description, source, options
    end

    # Appends an Import step.
    def import(description = nil, options = {}, &blk)
      append(:import, description, options, &blk)
    end

    # Puts a new step at the front unless an equal step is already present.
    #
    # @return [Array, nil] the steps when something was added, otherwise nil
    def prepend_once(*args, &blk)
      step = make(*args, &blk)
      steps.unshift(step) unless steps.include?(step)
    end

    # Puts a new step at the front.
    def prepend(*args, &blk)
      steps.unshift make(*args, &blk)
    end

    # Puts a new step at the end unless an equal step is already present.
    #
    # @return [Array, nil] the steps when something was added, otherwise nil
    def append_once(*args, &blk)
      step = make(*args, &blk)
      steps << step unless steps.include?(step)
    end

    # Puts a new step at the end.
    def append(*args, &blk)
      steps << make(*args, &blk)
    end

    # Performs every step in order inside a new Run, resetting the model's
    # column information after each one.
    #
    # Inside a Script.uniq block a model that is already on the current stack
    # is skipped (returns nil without creating a Run).
    def perform
      model_name = model.name
      # we've already done this in the current stack, so skip it
      return if Script.current_uniq && Script.current_stack.include?(model_name)
      # since we're not trying to uniq, ignore the current contents of the stack
      Script.current_stack.clear unless Script.current_uniq
      Script.current_stack << model_name
      Run.new(:model_name => model_name).perform do
        steps.each do |step|
          step.perform
          model.reset_column_information
        end
      end
    end

    private

    # Builds a step. The first argument names the class under Step (e.g.
    # :import => Step::Import); the rest is passed to its constructor after
    # this script, with a trailing options hash always supplied.
    def make(*args, &blk)
      klass = Step.const_get(args.shift.to_s.camelcase)
      options = args.extract_options!
      if args.empty?
        args = ["#{klass.name.demodulize} step with no description"]
      end
      initializer = [self] + args + [options]
      klass.new(*initializer, &blk)
    end
  end
end
@@ -0,0 +1,5 @@
1
class DataMiner
  # Namespace for the step types (Step::Import, Step::Process, Step::Tap).
  class Step
    # Two steps are equal when they have exactly the same class and the same
    # description.
    #
    # NOTE(review): only classes that inherit from Step pick this method up;
    # verify that each step class does.
    #
    # @param other [Object] expected to respond to #description when its class
    #   matches
    # @return [Boolean]
    def ==(other)
      other.class == self.class && other.description == description
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'errata'
2
+ require 'remote_table'
3
+
4
class DataMiner
  class Step
    # A step that reads rows from a ::RemoteTable and saves one model record
    # per row.
    #
    # The block given to the constructor is evaluated against the step itself;
    # it is expected to call #key once and #store for each further attribute.
    #
    # Inherits from Step so that step equality (Step#==) applies to imports.
    class Import < Step
      # The Script this step belongs to.
      attr_reader :script
      # Human-readable description of this step.
      attr_reader :description
      # Ordered hash of attribute name (Symbol) => DataMiner::Attribute.
      attr_reader :attributes

      # @param script [DataMiner::Script]
      # @param description [Object] description of the import
      # @param options [Hash] passed on to ::RemoteTable.new (with :streaming
      #   forced to true); :errata, when given, must be a Hash of Errata options
      # @raise [ArgumentError] if :table is given or :errata is not a Hash
      def initialize(script, description, options = {}, &blk)
        options = options.symbolize_keys
        if options.has_key?(:table)
          raise ::ArgumentError, %{[data_miner] :table is no longer an allowed option.}
        end
        errata_options = options[:errata]
        if errata_options && !errata_options.is_a?(::Hash)
          raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization options to Errata}
        end
        @script = script
        @mutex = ::Mutex.new
        @attributes = ::ActiveSupport::OrderedHash.new
        @description = description
        # Only normalise errata options that are actually present; an explicit
        # :errata => nil is passed through untouched.
        if errata_options
          errata_options = errata_options.symbolize_keys
          errata_options[:responder] ||= model
          options[:errata] = errata_options
        end
        @table_options = options.dup
        @table_options[:streaming] = true
        instance_eval(&blk)
      end

      # @return the model of the owning script
      def model
        script.model
      end

      # Declares an attribute to be set on each record.
      #
      # @raise [RuntimeError] if the attribute was already declared
      # @return [DataMiner::Attribute]
      def store(attr_name, attr_options = {})
        attr_name = attr_name.to_sym
        assert_not_yet_declared attr_name
        attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
      end

      # Declares the attribute used to find or initialize each record, and
      # stores it like any other attribute.
      #
      # @raise [RuntimeError] if the attribute was already declared
      # @return [DataMiner::Attribute]
      def key(attr_name, attr_options = {})
        attr_name = attr_name.to_sym
        assert_not_yet_declared attr_name
        @key = attr_name
        store attr_name, attr_options
      end

      # @return [::RemoteTable] the source table, built on first use
      def table
        # Unlocked fast path first; the mutex only guards construction.
        @table || @mutex.synchronize do
          @table ||= ::RemoteTable.new(@table_options)
        end
      end

      # Forgets the cached table and refreshes every attribute.
      #
      # @return [nil]
      def refresh
        @table = nil
        attributes.each { |_, attr| attr.refresh }
        nil
      end

      # Finds or initializes a record per row (by the key attribute), sets all
      # declared attributes from the row and saves it; then calls #refresh.
      #
      # @return [nil]
      def perform
        table.each do |row|
          record = model.send "find_or_initialize_by_#{@key}", attributes[@key].read(row)
          attributes.each { |_, attr| attr.set_from_row record, row }
          record.save!
        end
        refresh
        nil
      end

      private

      # Shared guard for #store and #key: an attribute may be declared once.
      def assert_not_yet_declared(attr_name)
        if attributes.has_key? attr_name
          raise "You should only call store or key once for #{model.name}##{attr_name}"
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
class DataMiner
  class Step
    # A step that either calls a class method on the model or evaluates a
    # block against the model.
    #
    # Inherits from Step so that step equality (Step#==) applies to processes.
    class Process < Step
      # The Script this step belongs to.
      attr_reader :script
      # Name of the model method to call; nil when a block was given.
      attr_reader :method_id
      # The method name, or the description that came with the block.
      attr_reader :description
      # The block to evaluate; nil when a method name was given.
      attr_reader :blk

      alias :block_description :description

      # @param script [DataMiner::Script]
      # @param method_id_or_description [Object] a description when a block is
      #   given, otherwise the name of the model method to call
      # @param ignored_options [Hash] accepted for a uniform constructor
      #   signature; not used
      def initialize(script, method_id_or_description, ignored_options = {}, &blk)
        @script = script
        # The first argument always doubles as the description.
        @description = method_id_or_description
        if block_given?
          @blk = blk
        else
          @method_id = method_id_or_description
        end
      end

      # @return the model of the owning script
      def model
        script.model
      end

      # Runs the block (with the model as self) or sends the method to the
      # model, wrapped in DataMiner::Script.uniq.
      #
      # @return [nil]
      def perform
        DataMiner::Script.uniq do
          if blk
            model.instance_eval(&blk)
          else
            model.send method_id
          end
        end
        nil
      end
    end
  end
end
@@ -0,0 +1,134 @@
1
+ require 'uri'
2
+ # Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
3
+ #
4
+ # This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
5
class DataMiner
  class Step
    # A step that replaces the model's table with one pulled from a remote
    # source by running the `taps pull` command.
    #
    # Inherits from Step so that step equality (Step#==) applies to taps.
    class Tap < Step
      DEFAULT_PORTS = {
        :mysql => 3306,
        :mysql2 => 3306,
        :postgres => 5432
      }

      DEFAULT_USERNAMES = {
        :mysql => 'root',
        :mysql2 => 'root',
        :postgres => ''
      }

      DEFAULT_PASSWORDS = {}
      DEFAULT_PASSWORDS.default = ''

      DEFAULT_HOSTS = {}
      DEFAULT_HOSTS.default = '127.0.0.1'

      # The Script this step belongs to.
      attr_reader :script
      # Human-readable description of this step.
      attr_reader :description
      # The source handed to `taps pull` as the remote location.
      attr_reader :source
      # Given options (minus :source_table_name) on top of the connection config.
      attr_reader :database_options
      # Table name at the source; defaults to the model's table name.
      attr_reader :source_table_name

      # @param script [DataMiner::Script]
      # @param description [Object]
      # @param source [Object] remote location passed to taps
      # @param options [Hash] :source_table_name plus any database settings
      #   (:database, :username, :password, :port, :host) that should override
      #   the ActiveRecord connection's configuration
      def initialize(script, description, source, options = {})
        options = options.symbolize_keys
        @script = script
        @description = description
        @source = source
        @database_options = options.except(:source_table_name).reverse_merge(active_record_config)
        @source_table_name = options.fetch :source_table_name, model.table_name
      end

      # @return the model of the owning script
      def model
        script.model
      end

      # Drops the source-named and model-named tables if they exist, pulls the
      # source table with taps, and renames it to the model's table name when
      # the two names differ.
      #
      # @return [nil]
      def perform
        [ source_table_name, model.table_name ].each do |possible_obstacle|
          if connection.table_exists? possible_obstacle
            connection.drop_table possible_obstacle
          end
        end
        taps_pull
        if needs_table_rename?
          connection.rename_table source_table_name, model.table_name
        end
        nil
      end

      # sabshere 1/25/11 what if there were multiple connections
      # blockenspiel doesn't like to delegate this to #model
      def connection
        ::ActiveRecord::Base.connection
      end

      # @return [Boolean] whether the pulled table must be renamed afterwards
      def needs_table_rename?
        source_table_name != model.table_name
      end

      # @return [String, nil] the URL scheme matching the connection's adapter,
      #   or nil when the adapter name is not recognised
      def adapter
        case connection.adapter_name
        when /mysql2/i
          'mysql2'
        when /mysql/i
          'mysql'
        when /postgres/i
          'postgres'
        when /sqlite/i
          'sqlite'
        end
      end

      # never optional
      def database
        database_options[:database]
      end

      # Defines #username, #password, #port and #host: each returns the
      # configured value or, failing that, the adapter's entry in the matching
      # DEFAULT_* constant (looked up at call time).
      %w{ username password port host }.each do |x|
        defaults_const = "DEFAULT_#{x.upcase}S"
        define_method(x) do
          database_options[x.to_sym] || Tap.const_get(defaults_const)[adapter.to_sym]
        end
      end

      # "user:pass"
      # "user"
      # nil
      def userinfo
        if username.present?
          [username, password].select(&:present?).join(':')
        end
      end

      # @return [String] the local database URL given to taps
      def db_url
        case adapter
        when 'sqlite'
          "sqlite://#{database}"
        else
          ::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
        end
      end

      # @return [Hash] the connection's configuration with symbol keys
      def active_record_config
        connection.instance_variable_get(:@config).symbolize_keys
      end

      # Runs `taps pull` for the source table.
      #
      # The command is executed with the multi-argument form of Kernel.system,
      # so no shell is involved and characters in the database URL (which may
      # contain credentials) or the source are never interpreted by one.
      #
      # NOTE(review): the command's exit status is returned but not checked by
      # #perform.
      def taps_pull
        args = [
          'taps',
          'pull',
          db_url,
          source,
          '--indexes-first',
          '--tables',
          source_table_name
        ].map(&:to_s)

        # https://github.com/carlhuda/bundler/issues/1579
        if defined?(::Bundler)
          # with_clean_env is deprecated in newer Bundler releases.
          unbundled = ::Bundler.respond_to?(:with_unbundled_env) ? :with_unbundled_env : :with_clean_env
          ::Bundler.send(unbundled) do
            ::Kernel.system(*args)
          end
        else
          ::Kernel.system(*args)
        end
      end
    end
  end
end