data_miner 2.5.2 → 3.0.0.alpha

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -9,8 +9,6 @@ describe DataMiner do
9
9
  Pet.delete_all
10
10
  Pet2.delete_all
11
11
  Pet3.delete_all
12
- DataMiner::Run.delete_all
13
- DataMiner::Run::ColumnStatistic.delete_all
14
12
  end
15
13
  it "it does not depend on mass-assignment" do
16
14
  lambda do
@@ -37,18 +35,6 @@ describe DataMiner do
37
35
  Pet.run_data_miner!
38
36
  Pet.find('Jerry').color_id.must_equal 'brown/black'
39
37
  end
40
- it "refreshes the dictionary for every run" do
41
- Pet.run_data_miner!
42
- Pet.find('Jerry').color_id.must_equal 'brown/black'
43
- begin
44
- FileUtils.mv COLOR_DICTIONARY_ENGLISH, "#{COLOR_DICTIONARY_ENGLISH}.bak"
45
- FileUtils.cp COLOR_DICTIONARY_SPANISH, COLOR_DICTIONARY_ENGLISH # oops! somebody swapped in a spanish dictionary
46
- Pet.run_data_miner!
47
- Pet.find('Jerry').color_id.must_equal 'morron/negro'
48
- ensure
49
- FileUtils.mv "#{COLOR_DICTIONARY_ENGLISH}.bak", COLOR_DICTIONARY_ENGLISH
50
- end
51
- end
52
38
  it "refreshes the data source for every run" do
53
39
  Pet.run_data_miner!
54
40
  Pet.find('Jerry').breed_id.must_equal 'Beagle'
@@ -76,44 +62,24 @@ describe DataMiner do
76
62
  Pet.find('Amigo').height.must_equal 300.5
77
63
  Pet.find('Johnny').height.must_equal 4000.0
78
64
  end
79
- it "performs unit conversions" do
80
- Pet.run_data_miner!
81
- Pet.find('Pierre').weight.must_be_close_to 1.9958 # 4.4 pounds in kilograms
82
- end
83
- it "doesn't convert nil to 0 when converting units" do
65
+ it "uses blocks to synthesize values" do
84
66
  Pet.run_data_miner!
85
- Pet.find('Nemo').age.must_be_nil
67
+ Pet.find('Jerry').emphatic_command_phrase.must_equal 'che!!!!!'
86
68
  end
87
- it "sets units" do
69
+ it "runs the result of synthesize through the standard cleaners" do
88
70
  Pet.run_data_miner!
89
- Pet.find('Pierre').age_units.must_equal 'years'
90
- Pet.find('Pierre').weight_units.must_equal 'kilograms'
91
- Pet.find('Pierre').height_units.must_equal 'millimetres'
71
+ Pet.find('Johnny').emphatic_command_phrase.must_equal 'oh ok !!!!!'
92
72
  end
93
73
  it "always nullifies numeric columns when blank/nil is the input" do
94
74
  Pet.run_data_miner!
95
75
  Pet.find('Amigo').weight.must_be_nil
96
76
  end
97
- it "doesn't nullify string columns by default" do
98
- Pet.run_data_miner!
99
- Pet.find('Amigo').command_phrase.must_equal ''
100
- Pet.find('Johnny').command_phrase.must_equal ''
101
- end
102
- it "nullifies string columns on demand" do
77
+ it "does nullify blank string columns by default" do
103
78
  Pet.run_data_miner!
79
+ Pet.find('Amigo').command_phrase.must_be_nil
104
80
  Pet.find('Jerry').favorite_food.must_equal 'cheese'
105
81
  Pet.find('Johnny').favorite_food.must_be_nil
106
82
  end
107
- it "doesn't set units if the input was blank/null" do
108
- Pet.run_data_miner!
109
- Pet.find('Amigo').weight.must_be_nil
110
- Pet.find('Amigo').weight_units.must_be_nil
111
- end
112
- it "keeps a row count before and after" do
113
- Pet.run_data_miner!
114
- Pet.data_miner_runs.first.row_count_before.must_equal 0
115
- Pet.data_miner_runs.first.row_count_after.must_equal 5
116
- end
117
83
  it "can import based on keys other than the primary key" do
118
84
  Pet2.run_data_miner!
119
85
  Pet2.find('Jerry').breed_id.must_equal 'Beagle-Basset'
metadata CHANGED
@@ -1,8 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_miner
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.2
5
- prerelease:
4
+ version: 3.0.0.alpha
5
+ prerelease: 6
6
6
  platform: ruby
7
7
  authors:
8
8
  - Seamus Abshere
@@ -13,40 +13,8 @@ authors:
13
13
  autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2013-07-05 00:00:00.000000000 Z
16
+ date: 2013-07-25 00:00:00.000000000 Z
17
17
  dependencies:
18
- - !ruby/object:Gem::Dependency
19
- name: aasm
20
- requirement: !ruby/object:Gem::Requirement
21
- none: false
22
- requirements:
23
- - - ! '>='
24
- - !ruby/object:Gem::Version
25
- version: '0'
26
- type: :runtime
27
- prerelease: false
28
- version_requirements: !ruby/object:Gem::Requirement
29
- none: false
30
- requirements:
31
- - - ! '>='
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- - !ruby/object:Gem::Dependency
35
- name: active_record_inline_schema
36
- requirement: !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ! '>='
40
- - !ruby/object:Gem::Version
41
- version: 0.6.1
42
- type: :runtime
43
- prerelease: false
44
- version_requirements: !ruby/object:Gem::Requirement
45
- none: false
46
- requirements:
47
- - - ! '>='
48
- - !ruby/object:Gem::Version
49
- version: 0.6.1
50
18
  - !ruby/object:Gem::Dependency
51
19
  name: activerecord
52
20
  requirement: !ruby/object:Gem::Requirement
@@ -176,23 +144,7 @@ dependencies:
176
144
  - !ruby/object:Gem::Version
177
145
  version: 1.10.3
178
146
  - !ruby/object:Gem::Dependency
179
- name: dkastner-alchemist
180
- requirement: !ruby/object:Gem::Requirement
181
- none: false
182
- requirements:
183
- - - ! '>='
184
- - !ruby/object:Gem::Version
185
- version: '0'
186
- type: :development
187
- prerelease: false
188
- version_requirements: !ruby/object:Gem::Requirement
189
- none: false
190
- requirements:
191
- - - ! '>='
192
- - !ruby/object:Gem::Version
193
- version: '0'
194
- - !ruby/object:Gem::Dependency
195
- name: conversions
147
+ name: pry
196
148
  requirement: !ruby/object:Gem::Requirement
197
149
  none: false
198
150
  requirements:
@@ -208,7 +160,7 @@ dependencies:
208
160
  - !ruby/object:Gem::Version
209
161
  version: '0'
210
162
  - !ruby/object:Gem::Dependency
211
- name: earth
163
+ name: active_record_inline_schema
212
164
  requirement: !ruby/object:Gem::Requirement
213
165
  none: false
214
166
  requirements:
@@ -239,22 +191,6 @@ dependencies:
239
191
  - - ! '>='
240
192
  - !ruby/object:Gem::Version
241
193
  version: '0'
242
- - !ruby/object:Gem::Dependency
243
- name: lock_method
244
- requirement: !ruby/object:Gem::Requirement
245
- none: false
246
- requirements:
247
- - - ! '>='
248
- - !ruby/object:Gem::Version
249
- version: '0'
250
- type: :development
251
- prerelease: false
252
- version_requirements: !ruby/object:Gem::Requirement
253
- none: false
254
- requirements:
255
- - - ! '>='
256
- - !ruby/object:Gem::Version
257
- version: '0'
258
194
  - !ruby/object:Gem::Dependency
259
195
  name: minitest
260
196
  requirement: !ruby/object:Gem::Requirement
@@ -384,8 +320,8 @@ dependencies:
384
320
  - !ruby/object:Gem::Version
385
321
  version: '0'
386
322
  description: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and import
387
- XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models. You can also convert
388
- units.
323
+ XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models. Uses Upsert internally
324
+ for speed.
389
325
  email:
390
326
  - seamus@abshere.net
391
327
  - rossmeissl@gmail.com
@@ -407,30 +343,19 @@ files:
407
343
  - lib/data_miner.rb
408
344
  - lib/data_miner/active_record_class_methods.rb
409
345
  - lib/data_miner/attribute.rb
410
- - lib/data_miner/dictionary.rb
411
- - lib/data_miner/run.rb
412
- - lib/data_miner/run/column_statistic.rb
413
346
  - lib/data_miner/script.rb
414
347
  - lib/data_miner/step.rb
415
348
  - lib/data_miner/step/import.rb
416
349
  - lib/data_miner/step/process.rb
417
350
  - lib/data_miner/step/sql.rb
418
- - lib/data_miner/unit_converter.rb
419
- - lib/data_miner/unit_converter/alchemist.rb
420
- - lib/data_miner/unit_converter/conversions.rb
421
351
  - lib/data_miner/version.rb
422
- - test/data_miner/step/test_import.rb
423
352
  - test/data_miner/step/test_sql.rb
424
353
  - test/data_miner/test_attribute.rb
425
- - test/data_miner/unit_converter/test_alchemist.rb
426
- - test/data_miner/unit_converter/test_conversions.rb
427
354
  - test/helper.rb
428
355
  - test/support/breed.rb
429
356
  - test/support/breed_by_license_number.csv
430
357
  - test/support/breeds.xls
431
358
  - test/support/data_miner_with_alchemist.rb
432
- - test/support/data_miner_with_conversions.rb
433
- - test/support/data_miner_without_unit_converter.rb
434
359
  - test/support/pet.rb
435
360
  - test/support/pet2.rb
436
361
  - test/support/pet3.rb
@@ -439,10 +364,6 @@ files:
439
364
  - test/support/pets.csv
440
365
  - test/support/pets_funny.csv
441
366
  - test/test_data_miner.rb
442
- - test/test_data_miner_run_column_statistic.rb
443
- - test/test_earth_import.rb
444
- - test/test_safety.rb
445
- - test/test_unit_conversion.rb
446
367
  homepage: https://github.com/seamusabshere/data_miner
447
368
  licenses: []
448
369
  post_install_message:
@@ -458,9 +379,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
458
379
  required_rubygems_version: !ruby/object:Gem::Requirement
459
380
  none: false
460
381
  requirements:
461
- - - ! '>='
382
+ - - ! '>'
462
383
  - !ruby/object:Gem::Version
463
- version: '0'
384
+ version: 1.3.1
464
385
  requirements: []
465
386
  rubyforge_project: data_miner
466
387
  rubygems_version: 1.8.25
@@ -469,18 +390,13 @@ specification_version: 3
469
390
  summary: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and import
470
391
  XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models.
471
392
  test_files:
472
- - test/data_miner/step/test_import.rb
473
393
  - test/data_miner/step/test_sql.rb
474
394
  - test/data_miner/test_attribute.rb
475
- - test/data_miner/unit_converter/test_alchemist.rb
476
- - test/data_miner/unit_converter/test_conversions.rb
477
395
  - test/helper.rb
478
396
  - test/support/breed.rb
479
397
  - test/support/breed_by_license_number.csv
480
398
  - test/support/breeds.xls
481
399
  - test/support/data_miner_with_alchemist.rb
482
- - test/support/data_miner_with_conversions.rb
483
- - test/support/data_miner_without_unit_converter.rb
484
400
  - test/support/pet.rb
485
401
  - test/support/pet2.rb
486
402
  - test/support/pet3.rb
@@ -489,8 +405,4 @@ test_files:
489
405
  - test/support/pets.csv
490
406
  - test/support/pets_funny.csv
491
407
  - test/test_data_miner.rb
492
- - test/test_data_miner_run_column_statistic.rb
493
- - test/test_earth_import.rb
494
- - test/test_safety.rb
495
- - test/test_unit_conversion.rb
496
408
  has_rdoc:
@@ -1,84 +0,0 @@
1
- require 'remote_table'
2
-
3
- class DataMiner
4
- # An easy way to translate data before importing it using an intermediate table.
5
- class Dictionary
6
- DEFAULT_CASE_SENSITIVE = true
7
-
8
- # What field in the dictionary holds the lookup key.
9
- #
10
- # In other words, the column we scan down to find an entry.
11
- #
12
- # @return [String]
13
- attr_reader :key_name
14
-
15
- # What field in the dictionary holds the final value.
16
- #
17
- # @return [String]
18
- attr_reader :value_name
19
-
20
- # A +sprintf+-style format to be applied.
21
- # @return [String]
22
- attr_reader :sprintf
23
-
24
- # The URL of the dictionary. It must be a CSV.
25
- # @return [String]
26
- attr_reader :url
27
-
28
- # Whether to be case-sensitive with lookups. Defaults to true.
29
- # @return [TrueClass, FalseClass]
30
- attr_reader :case_sensitive
31
-
32
- # @private
33
- def initialize(options = {})
34
- options = options.symbolize_keys
35
- @url = options[:url]
36
- @key_name = options[:input].to_s
37
- @value_name = options[:output].to_s
38
- @sprintf = options[:sprintf]
39
- @case_sensitive = options.fetch :case_sensitive, DEFAULT_CASE_SENSITIVE
40
- @table_mutex = ::Mutex.new
41
- end
42
-
43
- # Look up a translation for a value.
44
- #
45
- # @return [nil, String]
46
- def lookup(value)
47
- normalized_value = normalize_for_comparison value
48
- if match = table.detect { |entry| entry[key_name] == normalized_value }
49
- match[value_name].to_s
50
- end
51
- end
52
-
53
- private
54
-
55
- def table
56
- @table || @table_mutex.synchronize do
57
- @table ||= ::RemoteTable.new(url).map do |entry|
58
- entry[key_name] = normalize_for_comparison entry[key_name]
59
- entry
60
- end
61
- end
62
- end
63
-
64
- def refresh
65
- @table = nil
66
- end
67
-
68
- def normalize_for_comparison(str)
69
- if sprintf
70
- if sprintf.end_with?('f')
71
- str = str.to_f
72
- elsif sprintf.end_with?('d')
73
- str = str.to_i
74
- end
75
- str = sprintf % str
76
- end
77
- str = DataMiner.compress_whitespace str
78
- unless case_sensitive
79
- str = DataMiner.downcase str
80
- end
81
- str
82
- end
83
- end
84
- end
@@ -1,144 +0,0 @@
1
- require 'aasm'
2
- require 'active_record_inline_schema'
3
-
4
- require 'data_miner/run/column_statistic'
5
-
6
- class DataMiner
7
- # A record of what happened when you ran a data miner script.
8
- #
9
- # To create the table, use +DataMiner::Run.auto_upgrade!+, possibly in +db/seeds.rb+ or a database migration.
10
- class Run < ::ActiveRecord::Base
11
- class << self
12
- # If a previous run died and you have manually enabled locking, you may find yourself getting +LockMethod::Locked+ exceptions.
13
- #
14
- # @note Starting in 2.1.0, runs are no longer locked by default. This method remains in case you want to re-apply locking.
15
- #
16
- # @param [Enumerable<String>] model_names Names of the models whose locks should be cleared.
17
- #
18
- # @return [nil]
19
- #
20
- # @example Re-enable locking (since it was turned off by default in 2.1.0)
21
- # require 'data_miner'
22
- # require 'lock_method'
23
- # DataMiner::Run.lock_method :start
24
- def clear_locks(model_names = DataMiner.model_names)
25
- return unless defined?(::LockMethod)
26
- model_names.each do |model_name|
27
- dummy = new
28
- dummy.model_name = model_name
29
- dummy.lock_method_clear :start
30
- end
31
- nil
32
- end
33
- end
34
-
35
- # Raise this exception to skip the current run without causing it to fail.
36
- #
37
- # @example Avoid running certain data miner scripts too often (because they take too long).
38
- # class FlightSegment < ActiveRecord::Base
39
- # data_miner do
40
- # [...]
41
- # process "don't run this more than once an hour" do
42
- # if (last_ran_at = data_miner_runs.first(:order => 'created_at DESC').try(:created_at)) and (Time.now.utc - last_ran_at) < 3600
43
- # raise DataMiner::Run::Skip
44
- # end
45
- # end
46
- # [...]
47
- # end
48
- # end
49
- class Skip < ::Exception
50
- end
51
-
52
- INITIAL_STATE = :limbo
53
-
54
- self.table_name = 'data_miner_runs'
55
-
56
- col :model_name
57
- col :aasm_state
58
- col :created_at, :type => :datetime
59
- col :stopped_at, :type => :datetime
60
- col :updated_at, :type => :datetime
61
- col :error, :type => :text
62
- col :row_count_before, :type => :integer
63
- col :row_count_after, :type => :integer
64
- add_index :model_name
65
- add_index :aasm_state
66
-
67
- validates_presence_of :model_name
68
-
69
- has_many :column_statistics, :class_name => 'DataMiner::Run::ColumnStatistic', :order => 'id ASC'
70
-
71
- include ::AASM
72
- aasm_initial_state INITIAL_STATE
73
- aasm_state :limbo
74
- aasm_state :skipped
75
- aasm_state :succeeded
76
- aasm_state :failed
77
- aasm_event(:succeed) { transitions :from => :limbo, :to => :succeeded }
78
- aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
79
- aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
80
-
81
- # @private
82
- def start
83
- model = model_name.constantize
84
- if model.table_exists?
85
- self.row_count_before = model.count
86
- end
87
- save!
88
- if DataMiner.per_column_statistics?
89
- ColumnStatistic.take self
90
- end
91
- begin
92
- catch :data_miner_succeed do
93
- yield
94
- end
95
- succeed!
96
- rescue Skip
97
- skip!
98
- rescue
99
- self.error = "#{$!.message}\n#{$!.backtrace.join("\n")}"
100
- fail!
101
- raise $!
102
- ensure
103
- if model.connection.respond_to?(:schema_cache)
104
- model.connection.schema_cache.clear!
105
- end
106
- model.reset_column_information
107
- if model.table_exists?
108
- self.row_count_after = model.count
109
- if DataMiner.per_column_statistics?
110
- ColumnStatistic.take self
111
- end
112
- end
113
- self.stopped_at = ::Time.now.utc
114
- save!
115
- DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
116
- end
117
- self
118
- end
119
-
120
- # Get the column statistics for a particular column before this run started.
121
- #
122
- # @param [String] column_name The column you want to know about.
123
- #
124
- # @return [ColumnStatistic]
125
- def initial_column_statistics(column_name)
126
- column_statistics.where(:column_name => column_name.to_s).first
127
- end
128
-
129
- # Get the column statistics for a particular column after this run finished.
130
- #
131
- # @param [String] column_name The column you want to know about.
132
- #
133
- # @return [ColumnStatistic]
134
- def final_column_statistics(column_name)
135
- column_statistics.where(:column_name => column_name.to_s).last
136
- end
137
-
138
- # @private
139
- def as_lock
140
- database_name = Run.connection.instance_variable_get(:@config).try(:[], :database)
141
- [database_name, model_name]
142
- end
143
- end
144
- end