data_miner 2.0.1 → 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,49 +1,71 @@
1
1
  require 'remote_table'
2
2
 
3
3
  class DataMiner
4
+ # An easy way to translate data before importing it using an intermediate table.
4
5
  class Dictionary
5
6
  DEFAULT_CASE_SENSITIVE = true
6
7
 
8
+ # What field in the dictionary holds the lookup key.
9
+ #
10
+ # In other words, the column we scan down to find an entry.
11
+ #
12
+ # @return [String]
7
13
  attr_reader :key_name
14
+
15
+ # What field in the dictionary holds the final value.
16
+ #
17
+ # @return [String]
8
18
  attr_reader :value_name
19
+
20
+ # A +sprintf+-style format to be applied to lookup values and dictionary keys before they are compared.
21
+ # @return [String]
9
22
  attr_reader :sprintf
23
+
24
+ # The URL of the dictionary. It must be a CSV.
25
+ # @return [String]
10
26
  attr_reader :url
27
+
28
+ # Whether to be case-sensitive with lookups. Defaults to true.
29
+ # @return [TrueClass, FalseClass]
11
30
  attr_reader :case_sensitive
12
31
 
32
+ # @private
13
33
  def initialize(options = {})
14
34
  options = options.symbolize_keys
15
35
  @url = options[:url]
16
- @key_name = options[:input]
17
- @value_name = options[:output]
36
+ @key_name = options[:input].to_s
37
+ @value_name = options[:output].to_s
18
38
  @sprintf = options[:sprintf]
19
39
  @case_sensitive = options.fetch :case_sensitive, DEFAULT_CASE_SENSITIVE
20
40
  @table_mutex = ::Mutex.new
21
41
  end
22
42
 
43
+ # Look up a translation for a value.
44
+ #
45
+ # @return [nil, String]
46
+ def lookup(value)
47
+ normalized_value = normalize_for_comparison value
48
+ if match = table.detect { |entry| entry[key_name] == normalized_value }
49
+ match[value_name].to_s
50
+ end
51
+ end
52
+
53
+ private
54
+
23
55
  def table
24
56
  @table || @table_mutex.synchronize do
25
- @table ||= ::RemoteTable.new(url).to_a # make sure it's fully cached
57
+ @table ||= ::RemoteTable.new(url).map do |entry|
58
+ entry[key_name] = normalize_for_comparison entry[key_name]
59
+ entry
60
+ end
26
61
  end
27
62
  end
28
63
 
29
64
  def refresh
30
65
  @table = nil
31
66
  end
32
-
33
- def lookup(key)
34
- find key_name, key, value_name, {:sprintf => sprintf, :case_sensitive => case_sensitive}
35
- end
36
-
37
- def find(key_name, key, value_name, options = {})
38
- normalized_key = normalize_for_comparison(key, options)
39
- if match = table.detect { |row| normalized_key == normalize_for_comparison(row[key_name.to_s], options) }
40
- match[value_name.to_s].to_s
41
- end
42
- end
43
-
44
- private
45
67
 
46
- def normalize_for_comparison(str, options = {})
68
+ def normalize_for_comparison(str)
47
69
  if sprintf
48
70
  if sprintf.end_with?('f')
49
71
  str = str.to_f
@@ -53,7 +75,7 @@ class DataMiner
53
75
  str = sprintf % str
54
76
  end
55
77
  str = DataMiner.compress_whitespace str
56
- unless options[:case_sensitive]
78
+ unless case_sensitive
57
79
  str = DataMiner.downcase str
58
80
  end
59
81
  str
@@ -2,7 +2,39 @@ require 'aasm'
2
2
  require 'active_record_inline_schema'
3
3
 
4
4
  class DataMiner
5
+ # A record of what happened when you ran a data miner script.
6
+ #
7
+ # To create the table, use +DataMiner::Run.auto_upgrade!+, possibly in +db/seeds.rb+ or a database migration.
5
8
  class Run < ::ActiveRecord::Base
9
+ class << self
10
+ # Clear the +perform+ locks for the given models. If a previous run died, you may find yourself getting +LockMethod::Locked+ exceptions.
11
+ #
12
+ # @param [Array<String>] model_names Names of the models whose locks should be cleared.
13
+ #
14
+ # @return [nil]
15
+ def clear_locks(model_names = DataMiner.model_names)
16
+ model_names.each do |model_name|
17
+ dummy = new
18
+ dummy.model_name = model_name
19
+ dummy.lock_method_clear :perform
20
+ end
21
+ nil
22
+ end
23
+ end
24
+ # Raise this exception to skip the current run without causing it to fail.
25
+ #
26
+ # @example Avoid running certain data miner scripts too often (because they take too long).
27
+ # class FlightSegment < ActiveRecord::Base
28
+ # data_miner do
29
+ # [...]
30
+ # process "don't run this more than once an hour" do
31
+ # if (last_ran_at = data_miner_runs.first(:order => 'created_at DESC').try(:created_at)) and (Time.now.utc - last_ran_at) < 3600
32
+ # raise DataMiner::Run::Skip
33
+ # end
34
+ # end
35
+ # [...]
36
+ # end
37
+ # end
6
38
  class Skip < ::Exception
7
39
  end
8
40
 
@@ -29,6 +61,7 @@ class DataMiner
29
61
 
30
62
  validates_presence_of :model_name
31
63
 
64
+ # @private
32
65
  def perform
33
66
  save!
34
67
  begin
@@ -47,9 +80,11 @@ class DataMiner
47
80
  save!
48
81
  DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
49
82
  end
83
+ self
50
84
  end
51
85
  lock_method :perform
52
86
 
87
+ # @private
53
88
  def as_lock
54
89
  [Run.connection.current_database, model_name]
55
90
  end
@@ -1,4 +1,5 @@
1
1
  class DataMiner
2
+ # The container that holds each step in the script.
2
3
  class Script
3
4
  class << self
4
5
  # @private
@@ -13,18 +14,22 @@ class DataMiner
13
14
  end
14
15
  end
15
16
 
17
+ # @private
16
18
  def current_stack
17
19
  ::Thread.current[STACK_THREAD_VAR] ||= []
18
20
  end
19
21
 
22
+ # @private
20
23
  def current_stack=(stack)
21
24
  ::Thread.current[STACK_THREAD_VAR] = stack
22
25
  end
23
26
 
27
+ # @private
24
28
  def current_uniq
25
29
  ::Thread.current[UNIQ_THREAD_VAR]
26
30
  end
27
31
 
32
+ # @private
28
33
  def current_uniq=(uniq)
29
34
  ::Thread.current[UNIQ_THREAD_VAR] = uniq
30
35
  end
@@ -33,52 +38,167 @@ class DataMiner
33
38
  UNIQ_THREAD_VAR = 'DataMiner::Script.current_uniq'
34
39
  STACK_THREAD_VAR = 'DataMiner::Script.current_stack'
35
40
 
41
+ # @private
36
42
  attr_reader :model
43
+
44
+ # The steps in the script.
45
+ # @return [Array<DataMiner::Step>]
37
46
  attr_reader :steps
38
47
 
48
+ # @private
39
49
  def initialize(model)
40
50
  @model = model
41
51
  @steps = []
42
52
  end
43
53
 
54
+ # @private
44
55
  def append_block(blk)
45
56
  instance_eval(&blk)
46
57
  end
47
58
 
59
+ # Identify a single method or define a block of arbitrary code to be executed.
60
+ #
61
+ # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
62
+ # @see DataMiner::Step::Process The actual Process class.
63
+ #
64
+ # @overload process(method_id)
65
+ # Run a class method on the model.
66
+ # @param [Symbol] method_id The class method to be run on the model.
67
+ #
68
+ # @overload process(description, &blk)
69
+ # Run a block of code.
70
+ # @param [String] description A description of what the block does.
71
+ # @yield [] The block to be evaluated in the context of the model (it's instance_eval'ed on the model class)
72
+ #
73
+ # @example Single class method
74
+ # data_miner do
75
+ # [...]
76
+ # process :update_averages!
77
+ # [...]
78
+ # end
79
+ #
80
+ # @example Arbitrary code
81
+ # data_miner do
82
+ # [...]
83
+ # process "do some arbitrary stuff" do
84
+ # [...]
85
+ # end
86
+ # [...]
87
+ # end
88
+ #
89
+ # @return [nil]
48
90
  def process(method_id_or_description, &blk)
49
91
  append(:process, method_id_or_description, &blk)
50
92
  end
51
93
 
94
+ # Use https://github.com/ricardochimal/taps to pull table structure and data.
95
+ #
96
+ # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
97
+ # @see DataMiner::Step::Tap The actual Tap class.
98
+ #
99
+ # @param [String] description A description of the taps source.
100
+ # @param [String] source The taps URL, including username, password, domain, and port.
101
+ # @param [optional, Hash] options
102
+ # @option options [String] :source_table_name (model.table_name) The source table name, if different.
103
+ #
104
+ # @note The source table name will default to the model's table name. If it's different, use the +:source_table_name+ option.
105
+ # @note +taps+ needs to be installed on your system and in your PATH, but it doesn't have to be in your Gemfile. Sometimes having it in your Gemfile will cause Heroku deploys (etc.) to fail because it requires +sqlite3+.
106
+ #
107
+ # @example Tapping Brighter Planet's reference data web service
108
+ # data_miner do
109
+ # [...]
110
+ # tap "Brighter Planet's reference data", "http://carbon:neutral@data.brighterplanet.com:5000"
111
+ # [...]
112
+ # end
113
+ #
114
+ # @return [nil]
52
115
  def tap(description, source, options = {})
53
116
  append :tap, description, source, options
54
117
  end
55
118
 
56
- def import(description = nil, options = {}, &blk)
57
- append(:import, description, options, &blk)
119
+ # Import rows into your model.
120
+ #
121
+ # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
122
+ # @see DataMiner::Step::Import The actual Import class.
123
+ #
124
+ # @param [String] description A description of the data source.
125
+ # @param [Hash] table_and_errata_settings Settings, including URL of the data source, that are used to download/parse (using RemoteTable) and (sometimes) correct (using Errata) the data.
126
+ # @option table_and_errata_settings [String] :url The URL of the data source. Passed directly to +RemoteTable.new+.
127
+ # @option table_and_errata_settings [Hash] :errata The +:responder+ and +:url+ settings that will be passed to +Errata.new+.
128
+ # @option table_and_errata_settings [*] anything Any other setting will be passed to +RemoteTable.new+.
129
+ #
130
+ # @yield [] A block defining how to +key+ the import (to make it idempotent) and which columns to +store+.
131
+ #
132
+ # @note Be sure to check out https://github.com/seamusabshere/remote_table and https://github.com/seamusabshere/errata for available +table_and_errata_settings+.
133
+ # @note There are hundreds of +import+ examples in https://github.com/brighterplanet/earth
134
+ # @note We often use string primary keys to make idempotency easier. https://github.com/seamusabshere/active_record_inline_schema supports defining these inline.
135
+ #
136
+ # @example From the README
137
+ # data_miner do
138
+ # [...]
139
+ # import("OpenGeoCode.org's Country Codes to Country Names list",
140
+ # :url => 'http://opengeocode.org/download/countrynames.txt',
141
+ # :format => :delimited,
142
+ # :delimiter => '; ',
143
+ # :headers => false,
144
+ # :skip => 22) do
145
+ # key :iso_3166_code, :field_number => 0
146
+ # store :iso_3166_alpha_3_code, :field_number => 1
147
+ # store :iso_3166_numeric_code, :field_number => 2
148
+ # store :name, :field_number => 5
149
+ # end
150
+ # [...]
151
+ # end
152
+ #
153
+ # @return [nil]
154
+ def import(description, table_and_errata_settings, &blk)
155
+ append(:import, description, table_and_errata_settings, &blk)
58
156
  end
59
157
 
158
+ # Prepend a step to a script unless it's already there. Mostly for internal use.
159
+ #
160
+ # @return [nil]
60
161
  def prepend_once(*args, &blk)
61
162
  step = make(*args, &blk)
62
163
  unless steps.include? step
63
164
  steps.unshift step
64
165
  end
166
+ nil
65
167
  end
66
168
 
169
+ # Prepend a step to a script. Mostly for internal use.
170
+ #
171
+ # @return [nil]
67
172
  def prepend(*args, &blk)
68
173
  steps.unshift make(*args, &blk)
174
+ nil
69
175
  end
70
176
 
177
+ # Append a step to a script unless it's already there. Mostly for internal use.
178
+ #
179
+ # @return [nil]
71
180
  def append_once(*args, &blk)
72
181
  step = make(*args, &blk)
73
182
  unless steps.include? step
74
183
  steps << step
75
184
  end
185
+ nil
76
186
  end
77
187
 
188
+ # Append a step to a script. Mostly for internal use.
189
+ #
190
+ # @return [nil]
78
191
  def append(*args, &blk)
79
192
  steps << make(*args, &blk)
193
+ nil
80
194
  end
81
195
 
196
+ # Run the script for this model. Mostly for internal use.
197
+ #
198
+ # @note Normally you should use +Country.run_data_miner!+
199
+ # @note A primitive "call stack" is kept that will prevent infinite loops. So, if Country's data miner script calls Province's AND vice-versa, each one will only be run once.
200
+ #
201
+ # @return [DataMiner::Run]
82
202
  def perform
83
203
  model_name = model.name
84
204
  # $stderr.write "0 - #{model_name}\n"
@@ -103,6 +223,7 @@ class DataMiner
103
223
 
104
224
  private
105
225
 
226
+ # @return [DataMiner::Step]
106
227
  def make(*args, &blk)
107
228
  klass = Step.const_get(args.shift.to_s.camelcase)
108
229
  options = args.extract_options!
@@ -1,5 +1,13 @@
1
- class DataMiner::Step
2
- def ==(other)
3
- other.class == self.class and other.description == description
1
+ class DataMiner
2
+ class Step
3
+ # @private
4
+ def ==(other)
5
+ other.class == self.class and other.description == description
6
+ end
7
+
8
+ # @private
9
+ def model
10
+ script.model
11
+ end
4
12
  end
5
13
  end
@@ -1,74 +1,110 @@
1
1
  require 'errata'
2
2
  require 'remote_table'
3
3
 
4
- class DataMiner::Step::Import
5
- attr_reader :attributes
6
- attr_reader :script
7
- attr_reader :description
8
- attr_reader :attributes
9
-
10
- def initialize(script, description, options = {}, &blk)
11
- options = options.symbolize_keys
12
- if options.has_key?(:table)
13
- raise ::ArgumentError, %{[data_miner] :table is no longer an allowed option.}
14
- end
15
- if (errata_options = options[:errata]) and not errata_options.is_a?(::Hash)
16
- raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization options to Errata}
17
- end
18
- @script = script
19
- @mutex = ::Mutex.new
20
- @attributes = ::ActiveSupport::OrderedHash.new
21
- @description = description
22
- if options.has_key? :errata
23
- errata_options = options[:errata].symbolize_keys
24
- errata_options[:responder] ||= model
25
- options[:errata] = errata_options
26
- end
27
- @table_options = options.dup
28
- @table_options[:streaming] = true
29
- instance_eval(&blk)
30
- end
4
+ class DataMiner
5
+ class Step
6
+ # A step that imports data from a remote source.
7
+ #
8
+ # Create these by calling +import+ inside a +data_miner+ block.
9
+ #
10
+ # @see DataMiner::ActiveRecordClassMethods#data_miner Overview of how to define data miner scripts inside of ActiveRecord models.
11
+ # @see DataMiner::Script#import
12
+ class Import < Step
13
+ # The mappings of local columns to remote data source fields.
14
+ # @return [Array<DataMiner::Attribute>]
15
+ attr_reader :attributes
31
16
 
32
- def model
33
- script.model
34
- end
17
+ # @private
18
+ attr_reader :script
35
19
 
36
- def store(attr_name, attr_options = {})
37
- attr_name = attr_name.to_sym
38
- if attributes.has_key? attr_name
39
- raise "You should only call store or key once for #{model.name}##{attr_name}"
40
- end
41
- attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
42
- end
43
-
44
- def key(attr_name, attr_options = {})
45
- attr_name = attr_name.to_sym
46
- if attributes.has_key? attr_name
47
- raise "You should only call store or key once for #{model.name}##{attr_name}"
48
- end
49
- @key = attr_name
50
- store attr_name, attr_options
51
- end
20
+ # Description of what this step does.
21
+ # @return [String]
22
+ attr_reader :description
23
+
24
+ # @private
25
+ def initialize(script, description, table_and_errata_settings, &blk)
26
+ table_and_errata_settings = table_and_errata_settings.symbolize_keys
27
+ if table_and_errata_settings.has_key?(:table)
28
+ raise ::ArgumentError, %{[data_miner] :table is no longer an allowed setting.}
29
+ end
30
+ if (errata_settings = table_and_errata_settings[:errata]) and not errata_settings.is_a?(::Hash)
31
+ raise ::ArgumentError, %{[data_miner] :errata must be a hash of initialization settings to Errata}
32
+ end
33
+ @script = script
34
+ @attributes = ::ActiveSupport::OrderedHash.new
35
+ @description = description
36
+ if table_and_errata_settings.has_key? :errata
37
+ errata_settings = table_and_errata_settings[:errata].symbolize_keys
38
+ errata_settings[:responder] ||= model
39
+ table_and_errata_settings[:errata] = errata_settings
40
+ end
41
+ @table_settings = table_and_errata_settings.dup
42
+ @table_settings[:streaming] = true
43
+ @table_mutex = ::Mutex.new
44
+ instance_eval(&blk)
45
+ end
52
46
 
53
- def table
54
- @table || @mutex.synchronize do
55
- @table ||= ::RemoteTable.new(@table_options)
56
- end
57
- end
47
+ # Store data into a model column.
48
+ #
49
+ # @see DataMiner::Attribute The actual Attribute class.
50
+ #
51
+ # @param [Symbol] attr_name The name of the local model column.
52
+ # @param [optional, Hash] attr_options Options that will be passed to +DataMiner::Attribute.new+
53
+ # @option attr_options [*] anything Any option for +DataMiner::Attribute+.
54
+ #
55
+ # @return [nil]
56
+ def store(attr_name, attr_options = {})
57
+ attr_name = attr_name.to_sym
58
+ if attributes.has_key? attr_name
59
+ raise "You should only call store or key once for #{model.name}##{attr_name}"
60
+ end
61
+ attributes[attr_name] = DataMiner::Attribute.new self, attr_name, attr_options
62
+ end
58
63
 
59
- def refresh
60
- @table = nil
61
- attributes.each { |_, attr| attr.refresh }
62
- nil
63
- end
64
-
65
- def perform
66
- table.each do |row|
67
- record = model.send "find_or_initialize_by_#{@key}", attributes[@key].read(row)
68
- attributes.each { |_, attr| attr.set_from_row record, row }
69
- record.save!
64
+ # Store data into a model column AND use it as the key.
65
+ #
66
+ # @see DataMiner::Attribute The actual Attribute class.
67
+ #
68
+ # Enables idempotency. In other words, you can run the data miner script multiple times, get updated data, and not get duplicate rows.
69
+ #
70
+ # @param [Symbol] attr_name The name of the local model column.
71
+ # @param [optional, Hash] attr_options Options that will be passed to +DataMiner::Attribute.new+
72
+ # @option attr_options [*] anything Any option for +DataMiner::Attribute+.
73
+ #
74
+ # @return [nil]
75
+ def key(attr_name, attr_options = {})
76
+ attr_name = attr_name.to_sym
77
+ if attributes.has_key? attr_name
78
+ raise "You should only call store or key once for #{model.name}##{attr_name}"
79
+ end
80
+ @key = attr_name
81
+ store attr_name, attr_options
82
+ end
83
+
84
+ # @private
85
+ def perform
86
+ table.each do |row|
87
+ record = model.send "find_or_initialize_by_#{@key}", attributes[@key].read(row)
88
+ attributes.each { |_, attr| attr.set_from_row record, row }
89
+ record.save!
90
+ end
91
+ refresh
92
+ nil
93
+ end
94
+
95
+ private
96
+
97
+ def table
98
+ @table || @table_mutex.synchronize do
99
+ @table ||= ::RemoteTable.new(@table_settings)
100
+ end
101
+ end
102
+
103
+ def refresh
104
+ @table = nil
105
+ attributes.each { |_, attr| attr.refresh }
106
+ nil
107
+ end
70
108
  end
71
- refresh
72
- nil
73
109
  end
74
110
  end