data_miner 0.5.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,184 @@
1
+ require 'blockenspiel'
2
+
3
class DataMiner
  # Holds the ordered list of data-mining steps declared for one resource
  # class and knows how to run them.
  #
  # The class mixes in Blockenspiel::DSL, so the step-declaring methods
  # (+schema+, +process+, +tap+, +import+ and +verify+) can be called from a
  # DSL block. Each of them builds a step object, appends it to +steps+ and
  # returns the +steps+ array.
  class Config
    include ::Blockenspiel::DSL

    # Each entry is a set of option names that, when every one of them is
    # present on an attribute, counts as a complete unit definition.
    COMPLETE_UNIT_DEFINITIONS = [
      %w{units},
      %w{from_units to_units},
      %w{units_field_name},
      %w{units_field_name to_units},
      %w{units_field_number},
      %w{units_field_number to_units}
    ]

    # The class whose data is being mined (must respond to +name+,
    # +table_name+, +column_names+, +delete_all+, ... as used below).
    attr_reader :resource

    def initialize(resource)
      @resource = resource
    end

    # The steps declared so far, in declaration order.
    def steps
      @steps ||= []
    end

    # Declares a Schema step; +blk+ is evaluated by Blockenspiel with the new
    # step as the DSL target.
    def schema(create_table_options = {}, &blk)
      step = Schema.new self, create_table_options
      ::Blockenspiel.invoke blk, step
      steps.push step
    end

    # Declares a Process step. The first argument is a method id when no
    # block is given, or a description of the block when one is.
    def process(method_id_or_block_description, &blk)
      step = Process.new self, method_id_or_block_description, &blk
      steps.push step
    end

    # Declares a Tap step.
    # NOTE: inside this class this deliberately takes the place of Ruby's
    # Object#tap.
    def tap(description, source, options = {})
      step = Tap.new self, description, source, options
      steps.push step
    end

    # Declares an Import step.
    #
    # Accepted call shapes:
    #   import options_hash
    #   import description, options_hash
    #   import description            # options default to {}
    #
    # A trailing Hash is always taken as the options; the first remaining
    # argument (if any) is the description. Previously a lone description
    # was mistaken for the options hash and a call without arguments handed
    # nil to Import.new.
    def import(*args, &blk)
      options = args.last.is_a?(::Hash) ? args.pop : {}
      description = args.first || '(no description)'

      step = Import.new self, description, options
      ::Blockenspiel.invoke blk, step
      steps.push step
    end

    # Declares a Verify step.
    def verify(description = '(no description)', &blk)
      step = Verify.new self, description, &blk
      steps.push step
    end

    # Mine data for this class.
    #
    # Recognised options (string or symbol keys):
    #   'from_scratch'                     - delete all existing records first
    #   'preserve_call_stack_between_runs' - don't clear the shared call stack
    #                                        when the outermost run ends
    #
    # Returns nil. Does nothing if this resource is already on the call stack.
    # Finish and Skip raised by a step end the run quietly; any other
    # exception propagates after the bookkeeping in +ensure+ has run.
    def run(options = {})
      options = options.dup
      options.stringify_keys!

      return if ::DataMiner.instance.call_stack.include? resource.name
      ::DataMiner.instance.call_stack.push resource.name

      finished = false
      skipped = false
      run_record = nil
      begin
        # Everything that can raise lives inside this begin so that the
        # call stack is always cleaned up afterwards.
        if Run.table_exists?
          run_record = Run.create! :started_at => ::Time.now, :resource_name => resource.name, :killed => true
        else
          ::DataMiner.logger.info "Not logging individual runs. Please run DataMiner::Run.create_tables if you want to enable this."
        end
        resource.delete_all if options['from_scratch']
        steps.each do |step|
          step.run
          # a step may have changed the table, so drop the cached columns
          resource.reset_column_information
        end
        finished = true
      rescue Finish
        finished = true
      rescue Skip
        skipped = true
      ensure
        begin
          # Only touch the record if one was actually created: the runs table
          # may not have existed when the run started.
          if run_record && Run.table_exists?
            run_record.update_attributes! :terminated_at => ::Time.now, :finished => finished, :skipped => skipped, :killed => false
          end
        ensure
          if ::DataMiner.instance.call_stack.first == resource.name && !options['preserve_call_stack_between_runs']
            ::DataMiner.instance.call_stack.clear
          end
        end
      end
      nil
    end

    # The subset of +steps+ that are Import steps.
    def import_steps
      steps.select { |step| step.is_a? Import }
    end

    # Hook with nothing to do at present.
    # NOTE(review): presumably called before the configuration block is
    # evaluated - the caller is not visible here.
    def before_invoke

    end

    # Sanity-checks the declared import attributes once the resource's table
    # exists; a no-op otherwise.
    def after_invoke
      return unless resource.table_exists?
      make_sure_unit_definitions_make_sense
      suggest_missing_column_migrations
    end

    # Raises if any import attribute has unit-related options that do not
    # add up to one of COMPLETE_UNIT_DEFINITIONS.
    def make_sure_unit_definitions_make_sense
      import_steps.each do |step|
        step.attributes.each do |_, attribute|
          mentions_units = attribute.options.any? { |k, _| k.to_s =~ /unit/ }
          next unless mentions_units
          complete = COMPLETE_UNIT_DEFINITIONS.any? do |definition|
            definition.all? { |required_option| attribute.options[required_option].present? }
          end
          next if complete
          supplied = attribute.options.keys.select { |k| k.to_s =~ /unit/ }
          # The message text is flush-left because it is part of the string.
          raise %{

================================

You don't have a valid unit definition for #{resource.name}##{attribute.name}.

You supplied #{supplied.inspect}.

You need to supply one of #{COMPLETE_UNIT_DEFINITIONS.map(&:inspect).to_sentence}.

================================
}
        end
      end
    end

    # Logs (at debug level) a ready-made migration for every import attribute
    # - and every wanted "<name>_units" companion column - that has no column
    # on the resource's table. Raises if an attribute itself is named
    # "..._units", because that suffix is reserved.
    def suggest_missing_column_migrations
      missing_columns = []

      import_steps.each do |step|
        step.attributes.each do |_, attribute|
          raise "You can't have an attribute column that ends in _units (reserved): #{resource.table_name}.#{attribute.name}" if attribute.name.end_with? '_units'
          unless resource.column_names.include? attribute.name
            missing_columns << attribute.name
          end
          units_column = "#{attribute.name}_units"
          if attribute.wants_units? && !resource.column_names.include?(units_column)
            missing_columns << units_column
          end
        end
      end
      missing_columns.uniq!
      return unless missing_columns.any?

      add_lines = missing_columns.map { |column_name| " add_column :#{resource.table_name}, :#{column_name}, :#{column_name.end_with?('_units') ? 'string' : 'FIXME_WHAT_COLUMN_TYPE_AM_I' }" }.join("\n")
      remove_lines = missing_columns.map { |column_name| " remove_column :#{resource.table_name}, :#{column_name}" }.join("\n")
      create_table_lines = missing_columns.map { |column_name| "t.#{column_name.end_with?('_units') ? 'string' : 'FIXME_WHAT_COLUMN_TYPE_AM_I' } '#{column_name}'" }.join("\n")

      # The message text is flush-left because it is part of the string.
      ::DataMiner.logger.debug %{

================================

On #{resource}, it looks like you're missing some columns...

Please run this...

./script/generate migration AddMissingColumnsTo#{resource.name}

and **replace** the resulting file with this:

class AddMissingColumnsTo#{resource.name} < ActiveRecord::Migration
def self.up
#{add_lines}
end

def self.down
#{remove_lines}
end
end

On the other hand, if you're working directly with create_table, this might be helpful:

#{create_table_lines}

================================
}
    end
  end
end
@@ -1,20 +1,33 @@
1
- module DataMiner
1
+ class DataMiner
2
2
  class Dictionary
3
- attr_accessor :key_name, :value_name, :sprintf, :table
4
-
3
+ attr_reader :options
5
4
  def initialize(options = {})
6
- @key_name = options[:input]
7
- @value_name = options[:output]
8
- @sprintf = options[:sprintf] || '%s'
9
- @table = RemoteTable.new(:url => options[:url])
5
+ @options = options.dup
6
+ @options.stringify_keys!
7
+ end
8
+
9
+ def key_name
10
+ options['input']
11
+ end
12
+
13
+ def value_name
14
+ options['output']
15
+ end
16
+
17
+ def sprintf
18
+ options['sprintf'] || '%s'
19
+ end
20
+
21
+ def table
22
+ @table ||= ::RemoteTable.new options['url']
10
23
  end
11
24
 
12
25
  def lookup(key)
13
- find(self.key_name, key, self.value_name, :sprintf => self.sprintf)
26
+ find key_name, key, value_name, 'sprintf' => sprintf
14
27
  end
15
28
 
16
29
  def find(key_name, key, value_name, options = {})
17
- if match = table.rows.detect { |row| normalize_for_comparison(key, options) == normalize_for_comparison(row[key_name], options) }
30
+ if match = table.detect { |row| normalize_for_comparison(key, options) == normalize_for_comparison(row[key_name], options) }
18
31
  match[value_name].to_s
19
32
  end
20
33
  end
@@ -22,10 +35,10 @@ module DataMiner
22
35
  private
23
36
 
24
37
  def normalize_for_comparison(string, options = {})
25
- if options[:sprintf]
26
- if /\%[0-9\.]*f/.match(options[:sprintf])
38
+ if options['sprintf']
39
+ if /\%[0-9\.]*f/.match options['sprintf']
27
40
  string = string.to_f
28
- elsif /\%[0-9\.]*d/.match(options[:sprintf])
41
+ elsif /\%[0-9\.]*d/.match options['sprintf']
29
42
  string = string.to_i
30
43
  end
31
44
  string = sprintf % string
@@ -1,79 +1,88 @@
1
- module DataMiner
1
+ require 'blockenspiel'
2
+ class DataMiner
2
3
  class Import
3
- include Blockenspiel::DSL
4
+ include ::Blockenspiel::DSL
4
5
 
5
6
  attr_reader :attributes
6
- attr_accessor :base
7
- attr_accessor :position_in_run
8
- attr_accessor :table_options
9
- attr_accessor :description
10
- delegate :resource, :to => :base
7
+ attr_reader :config
8
+ attr_reader :options
9
+ attr_reader :description
11
10
 
12
- def initialize(base, position_in_run, description, table_options = {})
13
- @table_options = table_options
14
- @table_options.symbolize_keys!
15
-
16
- @attributes = ActiveSupport::OrderedHash.new
17
- @base = base
18
- @position_in_run = position_in_run
11
+ def initialize(config, description, options = {})
12
+ @config = config
19
13
  @description = description
20
-
21
- if @table_options[:errata].is_a?(String)
22
- @table_options[:errata] = Errata.new :url => @table_options[:errata], :responder => resource
14
+ @options = options.dup
15
+ @options.stringify_keys!
16
+ # legacy
17
+ if @options.has_key? 'table'
18
+ ::DataMiner.logger.info "Warning: 'table' is no longer an allowed option, taking the url from it and ignoring the rest"
19
+ table_instance = @options.delete 'table'
20
+ @options['url'] = table_instance.url
23
21
  end
24
-
25
- if @table_options[:table] and @table_options[:url].present?
26
- DataMiner.log_or_raise "You should specify :table or :url, but not both"
22
+ # legacy
23
+ if @options.has_key?('errata') and not @options['errata'].is_a?(::Hash)
24
+ ::DataMiner.logger.info "Warning: 'errata' must be a hash of Errata options. taking the URL from the Errata instance you provided and ignoring everything else"
25
+ errata_instance = @options.delete 'errata'
26
+ @options['errata'] = { 'url' => errata_instance.options['url'] }
27
27
  end
28
28
  end
29
-
30
- def table
31
- @table ||= (table_options[:table] || RemoteTable.new(table_options))
29
+
30
+ def attributes
31
+ @attributes ||= ::ActiveSupport::OrderedHash.new
32
32
  end
33
33
 
34
- def clear_table
35
- @table = nil
34
+ def resource
35
+ config.resource
36
36
  end
37
37
 
38
38
  def inspect
39
- "Import(#{resource}) position #{position_in_run} (#{description})"
40
- end
41
-
42
- def stores?(attr_name)
43
- attributes.has_key? attr_name
39
+ %{#<DataMiner::Import(#{resource}) (#{description})>}
44
40
  end
45
41
 
46
42
  def store(attr_name, attr_options = {})
47
- DataMiner.log_or_raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
43
+ raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
48
44
  attributes[attr_name] = Attribute.new self, attr_name, attr_options
49
45
  end
50
46
 
51
47
  def key(attr_name, attr_options = {})
52
- DataMiner.log_or_raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
53
- @key = attr_name
48
+ raise "You should only call store or key once for #{resource.name}##{attr_name}" if attributes.has_key? attr_name
49
+ @_key = attr_name
54
50
  store attr_name, attr_options
55
51
  end
56
52
 
57
- def run(run)
58
- primary_key = resource.primary_key
59
- test_counter = 0
53
+ def primary_key
54
+ resource.primary_key
55
+ end
56
+
57
+ def table
58
+ return @table if @table.is_a? ::RemoteTable
59
+ # don't mess with the originals
60
+ options = @options.dup
61
+ if options['errata']
62
+ errata_options = options['errata'].dup
63
+ errata_options.stringify_keys!
64
+ errata_options['responder'] ||= resource
65
+ options['errata'] = errata_options
66
+ end
67
+ @table = ::RemoteTable.new options
68
+ end
60
69
 
61
- table.each_row do |row|
62
- if ENV['DUMP'] == 'true'
63
- raise "[data_miner gem] Stopping after 5 rows because TEST=true" if test_counter > 5
64
- test_counter += 1
65
- DataMiner.log_info %{Row #{test_counter}
66
- IN: #{row.inspect}
67
- OUT: #{attributes.inject(Hash.new) { |memo, v| attr_name, attr = v; memo[attr_name] = attr.value_from_row(row); memo }.inspect}
68
- }
69
- end
70
-
71
- record = resource.send "find_or_initialize_by_#{@key}", attributes[@key].value_from_row(row)
70
+ def expire_remote_data
71
+ @table = nil
72
+ attributes.each { |_, attr| attr.instance_variable_set :@dictionary, nil }
73
+ end
74
+
75
+ def run
76
+ expire_remote_data
77
+ table.each do |row|
78
+ record = resource.send "find_or_initialize_by_#{@_key}", attributes[@_key].value_from_row(row)
72
79
  attributes.each { |_, attr| attr.set_record_from_row record, row }
73
- record.save! if record.send(primary_key).present?
80
+ if record.send(primary_key).present?
81
+ record.save!
82
+ else
83
+ ::DataMiner.logger.debug "Skipping #{row} because there's no primary key"
84
+ end
74
85
  end
75
- DataMiner.log_info "performed #{inspect}"
76
- clear_table
77
86
  nil
78
87
  end
79
88
  end
@@ -1,37 +1,42 @@
1
- module DataMiner
1
class DataMiner
  # One processing step: either a block to call, or the id of a method to
  # send to the configured resource.
  class Process
    attr_reader :config
    attr_reader :method_id
    attr_reader :block_description
    attr_reader :blk

    # config - the owning configuration; must respond to +resource+
    # method_id_or_block_description - when a block is given this is a
    #   description of the block; otherwise it is the method id to send to
    #   the resource
    def initialize(config, method_id_or_block_description, &blk)
      @config = config
      if block_given?
        @block_description = method_id_or_block_description
        @blk = blk
      else
        @method_id = method_id_or_block_description
      end
    end

    def resource
      config.resource
    end

    # Human-readable summary of the step.
    #
    # This used to test an undefined +block+ (the reader is +blk+), which
    # raised NameError on every call, and opened with "<#" instead of "#<".
    def inspect
      action = blk ? %{called block "#{block_description}"} : %{called method :#{method_id}}
      %{#<DataMiner::Process(#{resource}) #{action}>}
    end

    # Calls the block if one was given, otherwise sends +method_id+ to the
    # resource. Always returns nil.
    def run
      if blk
        blk.call
      else
        resource.send method_id
      end
      nil
    end
  end
end
@@ -1,5 +1,5 @@
1
- module DataMiner
2
- class Run < ActiveRecord::Base
1
+ class DataMiner
2
+ class Run < ::ActiveRecord::Base
3
3
  set_table_name 'data_miner_runs'
4
4
 
5
5
  def resource
@@ -8,7 +8,7 @@ module DataMiner
8
8
 
9
9
  class << self
10
10
  def create_tables
11
- return if table_exists? and column_names.include?('skipped') # force a drop
11
+ return if table_exists?
12
12
  connection.create_table 'data_miner_runs', :force => true do |t|
13
13
  t.string 'resource_name'
14
14
  t.boolean 'killed'