data_miner 1.3.8 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/CHANGELOG +42 -0
  2. data/Gemfile +19 -3
  3. data/README.rdoc +3 -3
  4. data/Rakefile +13 -15
  5. data/data_miner.gemspec +4 -15
  6. data/lib/data_miner.rb +69 -70
  7. data/lib/data_miner/active_record_extensions.rb +17 -22
  8. data/lib/data_miner/attribute.rb +176 -179
  9. data/lib/data_miner/dictionary.rb +38 -31
  10. data/lib/data_miner/run.rb +49 -18
  11. data/lib/data_miner/script.rb +116 -0
  12. data/lib/data_miner/step.rb +5 -0
  13. data/lib/data_miner/step/import.rb +74 -0
  14. data/lib/data_miner/step/process.rb +34 -0
  15. data/lib/data_miner/step/tap.rb +134 -0
  16. data/lib/data_miner/version.rb +1 -1
  17. data/test/helper.rb +26 -24
  18. data/test/support/breeds.xls +0 -0
  19. data/test/support/pet_color_dictionary.en.csv +5 -0
  20. data/test/support/pet_color_dictionary.es.csv +5 -0
  21. data/test/support/pets.csv +5 -0
  22. data/test/support/pets_funny.csv +4 -0
  23. data/test/test_data_miner.rb +103 -0
  24. data/test/test_earth_import.rb +25 -0
  25. data/test/test_earth_tap.rb +25 -0
  26. data/test/test_safety.rb +43 -0
  27. metadata +72 -78
  28. data/.document +0 -5
  29. data/lib/data_miner/config.rb +0 -124
  30. data/lib/data_miner/import.rb +0 -93
  31. data/lib/data_miner/process.rb +0 -38
  32. data/lib/data_miner/tap.rb +0 -143
  33. data/test/support/aircraft.rb +0 -102
  34. data/test/support/airport.rb +0 -16
  35. data/test/support/automobile_fuel_type.rb +0 -40
  36. data/test/support/automobile_variant.rb +0 -362
  37. data/test/support/country.rb +0 -15
  38. data/test/support/test_database.rb +0 -311
  39. data/test/test_data_miner_attribute.rb +0 -111
  40. data/test/test_data_miner_process.rb +0 -18
  41. data/test/test_old_syntax.rb +0 -825
  42. data/test/test_tap.rb +0 -21
@@ -1,55 +1,62 @@
1
1
  require 'remote_table'
2
+
2
3
  class DataMiner
3
4
  class Dictionary
4
- attr_reader :options
5
+ DEFAULT_CASE_SENSITIVE = true
6
+
7
+ attr_reader :key_name
8
+ attr_reader :value_name
9
+ attr_reader :sprintf
10
+ attr_reader :url
11
+ attr_reader :case_sensitive
12
+
5
13
  def initialize(options = {})
6
- @options = options.dup
7
- @options.stringify_keys!
8
- end
9
-
10
- def key_name
11
- options['input']
12
- end
13
-
14
- def value_name
15
- options['output']
16
- end
17
-
18
- def sprintf
19
- options['sprintf'] || '%s'
14
+ options = options.symbolize_keys
15
+ @url = options[:url]
16
+ @key_name = options[:input]
17
+ @value_name = options[:output]
18
+ @sprintf = options[:sprintf]
19
+ @case_sensitive = options.fetch :case_sensitive, DEFAULT_CASE_SENSITIVE
20
+ @table_mutex = ::Mutex.new
20
21
  end
21
22
 
22
23
  def table
23
- @table ||= ::RemoteTable.new(options['url']).to_a # convert to Array immediately
24
+ @table || @table_mutex.synchronize do
25
+ @table ||= ::RemoteTable.new(url).to_a # make sure it's fully cached
26
+ end
24
27
  end
25
28
 
26
- def free
27
- @table.free if @table.is_a?(::RemoteTable)
29
+ def refresh
28
30
  @table = nil
29
31
  end
30
-
32
+
31
33
  def lookup(key)
32
- find key_name, key, value_name, 'sprintf' => sprintf
34
+ find key_name, key, value_name, {:sprintf => sprintf, :case_sensitive => case_sensitive}
33
35
  end
34
36
 
35
37
  def find(key_name, key, value_name, options = {})
36
- if match = table.detect { |row| normalize_for_comparison(key, options) == normalize_for_comparison(row[key_name], options) }
37
- match[value_name].to_s
38
+ normalized_key = normalize_for_comparison(key, options)
39
+ if match = table.detect { |row| normalized_key == normalize_for_comparison(row[key_name.to_s], options) }
40
+ match[value_name.to_s].to_s
38
41
  end
39
42
  end
40
-
43
+
41
44
  private
42
45
 
43
- def normalize_for_comparison(string, options = {})
44
- if options['sprintf']
45
- if /\%[0-9\.]*f/.match options['sprintf']
46
- string = string.to_f
47
- elsif /\%[0-9\.]*d/.match options['sprintf']
48
- string = string.to_i
46
+ def normalize_for_comparison(str, options = {})
47
+ if sprintf
48
+ if sprintf.end_with?('f')
49
+ str = str.to_f
50
+ elsif sprintf.end_with?('d')
51
+ str = str.to_i
49
52
  end
50
- string = sprintf % string
53
+ str = sprintf % str
54
+ end
55
+ str = DataMiner.compress_whitespace str
56
+ unless options[:case_sensitive]
57
+ str = DataMiner.downcase str
51
58
  end
52
- string.to_s.strip
59
+ str
53
60
  end
54
61
  end
55
62
  end
@@ -1,26 +1,57 @@
1
+ require 'aasm'
2
+ require 'active_record_inline_schema'
3
+
1
4
  class DataMiner
2
5
  class Run < ::ActiveRecord::Base
3
- set_table_name 'data_miner_runs'
4
-
5
- def resource
6
- resource_name.constantize
6
+ class Skip < ::Exception
7
7
  end
8
-
9
- class << self
10
- def create_tables
11
- return if table_exists?
12
- connection.create_table 'data_miner_runs', :force => true do |t|
13
- t.string 'resource_name'
14
- t.boolean 'killed'
15
- t.boolean 'skipped'
16
- t.boolean 'finished'
17
- t.datetime 'started_at'
18
- t.datetime 'terminated_at'
19
- t.datetime 'created_at'
20
- t.datetime 'updated_at'
8
+
9
+ INITIAL_STATE = :limbo
10
+
11
+ self.table_name = 'data_miner_runs'
12
+
13
+ col :model_name
14
+ col :aasm_state
15
+ col :created_at, :type => :datetime
16
+ col :stopped_at, :type => :datetime
17
+ col :updated_at, :type => :datetime
18
+ col :error, :type => :text
19
+
20
+ include ::AASM
21
+ aasm_initial_state INITIAL_STATE
22
+ aasm_state :limbo
23
+ aasm_state :skipped
24
+ aasm_state :succeeded
25
+ aasm_state :failed
26
+ aasm_event(:succeed) { transitions :from => :limbo, :to => :succeeded }
27
+ aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
28
+ aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
29
+
30
+ validates_presence_of :model_name
31
+
32
+ def perform
33
+ save!
34
+ begin
35
+ catch :data_miner_succeed do
36
+ yield
21
37
  end
22
- reset_column_information
38
+ succeed!
39
+ rescue Skip
40
+ skip!
41
+ rescue
42
+ self.error = "#{$!.message}\n#{$!.backtrace.join("\n")}"
43
+ fail!
44
+ raise $!
45
+ ensure
46
+ self.stopped_at = ::Time.now
47
+ save!
48
+ DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
23
49
  end
24
50
  end
51
+ lock_method :perform
52
+
53
+ def as_lock
54
+ [Run.connection.current_database, model_name]
55
+ end
25
56
  end
26
57
  end
@@ -0,0 +1,116 @@
1
class DataMiner
  # An ordered list of steps (import / process / tap) declared for one model.
  #
  # Two pieces of per-thread state are kept in ::Thread.current:
  # * a "uniq" flag (see Script.uniq), and
  # * a stack of model names that have been performed (see #perform).
  class Script
    UNIQ_THREAD_VAR = 'DataMiner::Script.current_uniq'
    STACK_THREAD_VAR = 'DataMiner::Script.current_stack'

    class << self
      # @private
      # activerecord-3.2.3/lib/active_record/scoping.rb
      #
      # Runs the given block with the thread-local uniq flag set to true and
      # restores the previous value afterwards, even if the block raises.
      def uniq
        previous_uniq = current_uniq
        Script.current_uniq = true
        begin
          yield
        ensure
          Script.current_uniq = previous_uniq
        end
      end

      # @return [Array] the thread-local stack of model names (created on first use)
      def current_stack
        ::Thread.current[STACK_THREAD_VAR] ||= []
      end

      def current_stack=(stack)
        ::Thread.current[STACK_THREAD_VAR] = stack
      end

      # @return [true, nil] the thread-local uniq flag
      def current_uniq
        ::Thread.current[UNIQ_THREAD_VAR]
      end

      def current_uniq=(uniq)
        ::Thread.current[UNIQ_THREAD_VAR] = uniq
      end
    end

    attr_reader :model
    attr_reader :steps

    # @param model the class this script belongs to (must respond to #name;
    #   #perform also calls #reset_column_information on it)
    def initialize(model)
      @model = model
      @steps = []
    end

    # Evaluates +blk+ in the context of this script, so that the block can
    # call #process, #tap, #import, etc. directly.
    def append_block(blk)
      instance_eval(&blk)
    end

    # Appends a Process step.
    def process(method_id_or_description, &blk)
      append(:process, method_id_or_description, &blk)
    end

    # Appends a Tap step.
    def tap(description, source, options = {})
      append :tap, description, source, options
    end

    # Appends an Import step.
    def import(description = nil, options = {}, &blk)
      append(:import, description, options, &blk)
    end

    # Builds a step and puts it at the front unless an equal step is already present.
    def prepend_once(*args, &blk)
      step = make(*args, &blk)
      steps.unshift step unless steps.include? step
    end

    # Builds a step and puts it at the front.
    def prepend(*args, &blk)
      steps.unshift make(*args, &blk)
    end

    # Builds a step and puts it at the end unless an equal step is already present.
    def append_once(*args, &blk)
      step = make(*args, &blk)
      steps << step unless steps.include? step
    end

    # Builds a step and puts it at the end.
    def append(*args, &blk)
      steps << make(*args, &blk)
    end

    # Performs every step in order inside a Run.
    #
    # If the uniq flag is set and this model's name is already on the
    # thread-local stack, nothing is done and nil is returned.
    def perform
      model_name = model.name
      if Script.current_uniq && Script.current_stack.include?(model_name)
        # we've already done this in the current stack, so skip it
        return
      end
      unless Script.current_uniq
        # since we're not trying to uniq, ignore the current contents of the stack
        Script.current_stack.clear
      end
      Script.current_stack << model_name
      Run.new(:model_name => model_name).perform do
        steps.each do |step|
          step.perform
          model.reset_column_information
        end
      end
    end

    private

    # Instantiates the step class named by the first argument (e.g. :import
    # becomes Step::Import), passing this script, the remaining positional
    # arguments and a trailing options hash.
    def make(*args, &blk)
      klass = Step.const_get(args.shift.to_s.camelcase)
      options = args.extract_options!
      # #import passes its default description of nil straight through, so a
      # lone nil has to be treated the same as "no description given";
      # otherwise the fallback below is never reached.
      if args.empty? || args == [nil]
        args = ["#{klass.name.demodulize} step with no description"]
      end
      klass.new(self, *args, options, &blk)
    end
  end
end
@@ -0,0 +1,5 @@
1
class DataMiner
  # Common behaviour for script steps.
  #
  # Equality is based on the concrete class plus #description. This class
  # does not define #description itself; it is expected to be supplied by
  # whatever class the comparison is called on.
  class Step
    # Two steps are equal when they are of exactly the same class and have
    # equal descriptions.
    def ==(other)
      other.class == self.class && other.description == description
    end

    # Keep #eql? and #hash in agreement with #== so that Array#uniq and
    # Hash/Set membership treat equal steps as the same element.
    alias_method :eql?, :==

    def hash
      [self.class, description].hash
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'errata'
2
+ require 'remote_table'
3
+
4
class DataMiner
  class Step
    # A step that reads rows from a RemoteTable and saves one model record
    # per row.
    #
    # The block given to #initialize is instance_eval'd, so it can call #key
    # and #store to declare which attributes are read from each row.
    #
    # Inherits from Step so that Step#== (class + description) applies to
    # import steps.
    class Import < Step
      attr_reader :script
      attr_reader :description
      attr_reader :attributes

      # @param script the owning script (must respond to #model)
      # @param description identifies this step
      # @param options passed on to RemoteTable.new, with :streaming forced
      #   to true. :errata, if given, must be a Hash; :table is rejected.
      # @raise [ArgumentError] if :table is given or :errata is not a Hash
      def initialize(script, description, options = {}, &blk)
        options = options.symbolize_keys
        if options.has_key?(:table)
          raise ::ArgumentError, %{[data_miner] :table is no longer an allowed option.}
        end
        errata_options = options[:errata]
        if errata_options && !errata_options.is_a?(::Hash)
          raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization options to Errata}
        end
        @script = script
        @mutex = ::Mutex.new
        @attributes = ::ActiveSupport::OrderedHash.new
        @description = description
        if errata_options
          errata_options = errata_options.symbolize_keys
          errata_options[:responder] ||= model
          options[:errata] = errata_options
        else
          # An explicit :errata => nil is treated as "no errata" instead of
          # crashing on nil.symbolize_keys.
          options.delete(:errata)
        end
        @table_options = options.dup
        @table_options[:streaming] = true
        instance_eval(&blk) if blk
      end

      # @return the model class of the owning script
      def model
        script.model
      end

      # Declares an attribute to be set from each row.
      #
      # @raise [RuntimeError] if the attribute was already declared
      def store(attr_name, attr_options = {})
        attr_name = attr_name.to_sym
        if attributes.has_key? attr_name
          raise "You should only call store or key once for #{model.name}##{attr_name}"
        end
        attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
      end

      # Declares the attribute used to find (or initialize) the record for
      # each row, and stores it like any other attribute.
      #
      # @raise [RuntimeError] if the attribute was already declared
      def key(attr_name, attr_options = {})
        attr_name = attr_name.to_sym
        # store performs the duplicate check; @key is only set once it passed
        attribute = store attr_name, attr_options
        @key = attr_name
        attribute
      end

      # The RemoteTable for this step. @table is checked first without the
      # lock, then re-checked and built while holding @mutex.
      def table
        @table || @mutex.synchronize { @table ||= ::RemoteTable.new(@table_options) }
      end

      # Forgets the table and calls #refresh on every attribute.
      def refresh
        @table = nil
        attributes.each { |_, attr| attr.refresh }
        nil
      end

      # For each row: finds or initializes the record through the dynamic
      # finder for the key attribute, sets every declared attribute from the
      # row and saves the record. Calls #refresh afterwards.
      #
      # @raise [RuntimeError] if no key attribute was declared
      # @return [nil]
      def perform
        unless @key
          raise "[data_miner] No key declared for #{model.name} import (#{description.inspect})"
        end
        finder = "find_or_initialize_by_#{@key}"
        key_attribute = attributes[@key]
        table.each do |row|
          record = model.send finder, key_attribute.read(row)
          attributes.each { |_, attr| attr.set_from_row record, row }
          record.save!
        end
        refresh
        nil
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
class DataMiner
  class Step
    # A step that either calls a named method on the model or instance_eval's
    # a block against the model.
    #
    # Inherits from Step so that Step#== (class + description) applies to
    # process steps.
    class Process < Step
      attr_reader :script
      attr_reader :method_id
      attr_reader :description
      attr_reader :blk

      alias :block_description :description

      # @param script the owning script (must respond to #model)
      # @param method_id_or_description with a block, only a description;
      #   without a block, both the description and the name of the method
      #   that #perform sends to the model
      # @param ignored_options accepted but not used
      def initialize(script, method_id_or_description, ignored_options = {}, &blk)
        @script = script
        @description = method_id_or_description
        @blk = blk
        @method_id = blk ? nil : method_id_or_description
      end

      # @return the model class of the owning script
      def model
        script.model
      end

      # Runs the block (evaluated against the model) or sends the named
      # method to the model, inside DataMiner::Script.uniq.
      #
      # @return [nil]
      def perform
        DataMiner::Script.uniq do
          if blk
            model.instance_eval(&blk)
          else
            model.send method_id
          end
        end
        nil
      end
    end
  end
end
@@ -0,0 +1,134 @@
1
+ require 'uri'
2
class DataMiner
  class Step
    # A step that replaces the model's table by pulling it from a remote
    # source with the `taps` command-line tool.
    #
    # Note that you probably shouldn't put taps into your Gemfile, because it depends on sequel and other gems that may not compile on Heroku (etc.)
    #
    # This class automatically detects if you have Bundler installed, and if so, executes the `taps` binary with a "clean" environment (i.e. one that will not pay attention to the fact that taps is not in your Gemfile)
    #
    # Inherits from Step so that Step#== (class + description) applies to
    # tap steps.
    class Tap < Step
      DEFAULT_PORTS = {
        :mysql => 3306,
        :mysql2 => 3306,
        :postgres => 5432
      }

      DEFAULT_USERNAMES = {
        :mysql => 'root',
        :mysql2 => 'root',
        :postgres => ''
      }

      DEFAULT_PASSWORDS = {}
      DEFAULT_PASSWORDS.default = ''

      DEFAULT_HOSTS = {}
      DEFAULT_HOSTS.default = '127.0.0.1'

      attr_reader :script
      attr_reader :description
      attr_reader :source
      attr_reader :database_options
      attr_reader :source_table_name

      # @param script the owning script (must respond to #model)
      # @param description identifies this step
      # @param source passed to `taps pull` as the remote source
      # @param options :source_table_name (defaults to the model's table
      #   name); every other key overrides the corresponding entry of the
      #   current ActiveRecord connection config
      def initialize(script, description, source, options = {})
        options = options.symbolize_keys
        @script = script
        @description = description
        @source = source
        @database_options = options.except(:source_table_name).reverse_merge(active_record_config)
        @source_table_name = options.fetch :source_table_name, model.table_name
      end

      # @return the model class of the owning script
      def model
        script.model
      end

      # Drops the source table and the model's table if they exist locally,
      # pulls the source table with taps, then renames it to the model's
      # table name when the two names differ.
      #
      # NOTE(review): the result of #taps_pull is not checked here, so a
      # failed pull is only noticed if the subsequent rename fails.
      #
      # @return [nil]
      def perform
        [ source_table_name, model.table_name ].each do |possible_obstacle|
          if connection.table_exists? possible_obstacle
            connection.drop_table possible_obstacle
          end
        end
        taps_pull
        if needs_table_rename?
          connection.rename_table source_table_name, model.table_name
        end
        nil
      end

      # sabshere 1/25/11 what if there were multiple connections
      # blockenspiel doesn't like to delegate this to #model
      def connection
        ::ActiveRecord::Base.connection
      end

      def needs_table_rename?
        source_table_name != model.table_name
      end

      # @return [String, nil] the taps adapter name derived from the
      #   connection's adapter_name, or nil when none of the patterns match
      def adapter
        case connection.adapter_name
        when /mysql2/i
          'mysql2'
        when /mysql/i
          'mysql'
        when /postgres/i
          'postgres'
        when /sqlite/i
          'sqlite'
        end
      end

      # never optional
      def database
        database_options[:database]
      end

      # Defines #username, #password, #port and #host: each returns the value
      # from database_options, falling back to the per-adapter default.
      {
        :username => DEFAULT_USERNAMES,
        :password => DEFAULT_PASSWORDS,
        :port => DEFAULT_PORTS,
        :host => DEFAULT_HOSTS
      }.each do |option_name, defaults|
        define_method(option_name) do
          database_options[option_name] || defaults[adapter.to_sym]
        end
      end

      # "user:pass"
      # "user"
      # nil
      def userinfo
        if username.present?
          [username, password].select(&:present?).join(':')
        end
      end

      # @return [String] the local database URL handed to taps
      def db_url
        case adapter
        when 'sqlite'
          "sqlite://#{database}"
        else
          ::URI::Generic.new(adapter, userinfo, host, port, nil, "/#{database}", nil, nil, nil).to_s
        end
      end

      def active_record_config
        connection.instance_variable_get(:@config).symbolize_keys
      end

      # Runs `taps pull` for the source table.
      #
      # The command is passed to Kernel.system as an argument list rather
      # than a single joined string, so that passwords, URLs or table names
      # containing spaces or shell metacharacters are not interpreted by a
      # shell.
      #
      # @return whatever Kernel.system returns
      def taps_pull
        args = [
          'taps',
          'pull',
          db_url,
          source,
          '--indexes-first',
          '--tables',
          source_table_name
        ].map(&:to_s)
        run_taps = proc { ::Kernel.system(*args) }

        # https://github.com/carlhuda/bundler/issues/1579
        return run_taps.call unless defined?(::Bundler)

        if ::Bundler.respond_to?(:with_unbundled_env)
          # with_clean_env is deprecated in newer Bundler versions
          ::Bundler.with_unbundled_env(&run_taps)
        else
          ::Bundler.with_clean_env(&run_taps)
        end
      end
    end
  end
end