data_miner 2.5.2 → 3.0.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,8 +9,6 @@ describe DataMiner do
9
9
  Pet.delete_all
10
10
  Pet2.delete_all
11
11
  Pet3.delete_all
12
- DataMiner::Run.delete_all
13
- DataMiner::Run::ColumnStatistic.delete_all
14
12
  end
15
13
  it "it does not depend on mass-assignment" do
16
14
  lambda do
@@ -37,18 +35,6 @@ describe DataMiner do
37
35
  Pet.run_data_miner!
38
36
  Pet.find('Jerry').color_id.must_equal 'brown/black'
39
37
  end
40
- it "refreshes the dictionary for every run" do
41
- Pet.run_data_miner!
42
- Pet.find('Jerry').color_id.must_equal 'brown/black'
43
- begin
44
- FileUtils.mv COLOR_DICTIONARY_ENGLISH, "#{COLOR_DICTIONARY_ENGLISH}.bak"
45
- FileUtils.cp COLOR_DICTIONARY_SPANISH, COLOR_DICTIONARY_ENGLISH # oops! somebody swapped in a spanish dictionary
46
- Pet.run_data_miner!
47
- Pet.find('Jerry').color_id.must_equal 'morron/negro'
48
- ensure
49
- FileUtils.mv "#{COLOR_DICTIONARY_ENGLISH}.bak", COLOR_DICTIONARY_ENGLISH
50
- end
51
- end
52
38
  it "refreshes the data source for every run" do
53
39
  Pet.run_data_miner!
54
40
  Pet.find('Jerry').breed_id.must_equal 'Beagle'
@@ -76,44 +62,24 @@ describe DataMiner do
76
62
  Pet.find('Amigo').height.must_equal 300.5
77
63
  Pet.find('Johnny').height.must_equal 4000.0
78
64
  end
79
- it "performs unit conversions" do
80
- Pet.run_data_miner!
81
- Pet.find('Pierre').weight.must_be_close_to 1.9958 # 4.4 pounds in kilograms
82
- end
83
- it "doesn't convert nil to 0 when converting units" do
65
+ it "uses blocks to synthesize values" do
84
66
  Pet.run_data_miner!
85
- Pet.find('Nemo').age.must_be_nil
67
+ Pet.find('Jerry').emphatic_command_phrase.must_equal 'che!!!!!'
86
68
  end
87
- it "sets units" do
69
+ it "runs the result of synthesize through the standard cleaners" do
88
70
  Pet.run_data_miner!
89
- Pet.find('Pierre').age_units.must_equal 'years'
90
- Pet.find('Pierre').weight_units.must_equal 'kilograms'
91
- Pet.find('Pierre').height_units.must_equal 'millimetres'
71
+ Pet.find('Johnny').emphatic_command_phrase.must_equal 'oh ok !!!!!'
92
72
  end
93
73
  it "always nullifies numeric columns when blank/nil is the input" do
94
74
  Pet.run_data_miner!
95
75
  Pet.find('Amigo').weight.must_be_nil
96
76
  end
97
- it "doesn't nullify string columns by default" do
98
- Pet.run_data_miner!
99
- Pet.find('Amigo').command_phrase.must_equal ''
100
- Pet.find('Johnny').command_phrase.must_equal ''
101
- end
102
- it "nullifies string columns on demand" do
77
+ it "does nullify blank string columns by default" do
103
78
  Pet.run_data_miner!
79
+ Pet.find('Amigo').command_phrase.must_be_nil
104
80
  Pet.find('Jerry').favorite_food.must_equal 'cheese'
105
81
  Pet.find('Johnny').favorite_food.must_be_nil
106
82
  end
107
- it "doesn't set units if the input was blank/null" do
108
- Pet.run_data_miner!
109
- Pet.find('Amigo').weight.must_be_nil
110
- Pet.find('Amigo').weight_units.must_be_nil
111
- end
112
- it "keeps a row count before and after" do
113
- Pet.run_data_miner!
114
- Pet.data_miner_runs.first.row_count_before.must_equal 0
115
- Pet.data_miner_runs.first.row_count_after.must_equal 5
116
- end
117
83
  it "can import based on keys other than the primary key" do
118
84
  Pet2.run_data_miner!
119
85
  Pet2.find('Jerry').breed_id.must_equal 'Beagle-Basset'
metadata CHANGED
@@ -1,8 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_miner
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.2
5
- prerelease:
4
+ version: 3.0.0.alpha
5
+ prerelease: 6
6
6
  platform: ruby
7
7
  authors:
8
8
  - Seamus Abshere
@@ -13,40 +13,8 @@ authors:
13
13
  autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2013-07-05 00:00:00.000000000 Z
16
+ date: 2013-07-25 00:00:00.000000000 Z
17
17
  dependencies:
18
- - !ruby/object:Gem::Dependency
19
- name: aasm
20
- requirement: !ruby/object:Gem::Requirement
21
- none: false
22
- requirements:
23
- - - ! '>='
24
- - !ruby/object:Gem::Version
25
- version: '0'
26
- type: :runtime
27
- prerelease: false
28
- version_requirements: !ruby/object:Gem::Requirement
29
- none: false
30
- requirements:
31
- - - ! '>='
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- - !ruby/object:Gem::Dependency
35
- name: active_record_inline_schema
36
- requirement: !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ! '>='
40
- - !ruby/object:Gem::Version
41
- version: 0.6.1
42
- type: :runtime
43
- prerelease: false
44
- version_requirements: !ruby/object:Gem::Requirement
45
- none: false
46
- requirements:
47
- - - ! '>='
48
- - !ruby/object:Gem::Version
49
- version: 0.6.1
50
18
  - !ruby/object:Gem::Dependency
51
19
  name: activerecord
52
20
  requirement: !ruby/object:Gem::Requirement
@@ -176,23 +144,7 @@ dependencies:
176
144
  - !ruby/object:Gem::Version
177
145
  version: 1.10.3
178
146
  - !ruby/object:Gem::Dependency
179
- name: dkastner-alchemist
180
- requirement: !ruby/object:Gem::Requirement
181
- none: false
182
- requirements:
183
- - - ! '>='
184
- - !ruby/object:Gem::Version
185
- version: '0'
186
- type: :development
187
- prerelease: false
188
- version_requirements: !ruby/object:Gem::Requirement
189
- none: false
190
- requirements:
191
- - - ! '>='
192
- - !ruby/object:Gem::Version
193
- version: '0'
194
- - !ruby/object:Gem::Dependency
195
- name: conversions
147
+ name: pry
196
148
  requirement: !ruby/object:Gem::Requirement
197
149
  none: false
198
150
  requirements:
@@ -208,7 +160,7 @@ dependencies:
208
160
  - !ruby/object:Gem::Version
209
161
  version: '0'
210
162
  - !ruby/object:Gem::Dependency
211
- name: earth
163
+ name: active_record_inline_schema
212
164
  requirement: !ruby/object:Gem::Requirement
213
165
  none: false
214
166
  requirements:
@@ -239,22 +191,6 @@ dependencies:
239
191
  - - ! '>='
240
192
  - !ruby/object:Gem::Version
241
193
  version: '0'
242
- - !ruby/object:Gem::Dependency
243
- name: lock_method
244
- requirement: !ruby/object:Gem::Requirement
245
- none: false
246
- requirements:
247
- - - ! '>='
248
- - !ruby/object:Gem::Version
249
- version: '0'
250
- type: :development
251
- prerelease: false
252
- version_requirements: !ruby/object:Gem::Requirement
253
- none: false
254
- requirements:
255
- - - ! '>='
256
- - !ruby/object:Gem::Version
257
- version: '0'
258
194
  - !ruby/object:Gem::Dependency
259
195
  name: minitest
260
196
  requirement: !ruby/object:Gem::Requirement
@@ -384,8 +320,8 @@ dependencies:
384
320
  - !ruby/object:Gem::Version
385
321
  version: '0'
386
322
  description: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and import
387
- XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models. You can also convert
388
- units.
323
+ XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models. Uses Upsert internally
324
+ for speed.
389
325
  email:
390
326
  - seamus@abshere.net
391
327
  - rossmeissl@gmail.com
@@ -407,30 +343,19 @@ files:
407
343
  - lib/data_miner.rb
408
344
  - lib/data_miner/active_record_class_methods.rb
409
345
  - lib/data_miner/attribute.rb
410
- - lib/data_miner/dictionary.rb
411
- - lib/data_miner/run.rb
412
- - lib/data_miner/run/column_statistic.rb
413
346
  - lib/data_miner/script.rb
414
347
  - lib/data_miner/step.rb
415
348
  - lib/data_miner/step/import.rb
416
349
  - lib/data_miner/step/process.rb
417
350
  - lib/data_miner/step/sql.rb
418
- - lib/data_miner/unit_converter.rb
419
- - lib/data_miner/unit_converter/alchemist.rb
420
- - lib/data_miner/unit_converter/conversions.rb
421
351
  - lib/data_miner/version.rb
422
- - test/data_miner/step/test_import.rb
423
352
  - test/data_miner/step/test_sql.rb
424
353
  - test/data_miner/test_attribute.rb
425
- - test/data_miner/unit_converter/test_alchemist.rb
426
- - test/data_miner/unit_converter/test_conversions.rb
427
354
  - test/helper.rb
428
355
  - test/support/breed.rb
429
356
  - test/support/breed_by_license_number.csv
430
357
  - test/support/breeds.xls
431
358
  - test/support/data_miner_with_alchemist.rb
432
- - test/support/data_miner_with_conversions.rb
433
- - test/support/data_miner_without_unit_converter.rb
434
359
  - test/support/pet.rb
435
360
  - test/support/pet2.rb
436
361
  - test/support/pet3.rb
@@ -439,10 +364,6 @@ files:
439
364
  - test/support/pets.csv
440
365
  - test/support/pets_funny.csv
441
366
  - test/test_data_miner.rb
442
- - test/test_data_miner_run_column_statistic.rb
443
- - test/test_earth_import.rb
444
- - test/test_safety.rb
445
- - test/test_unit_conversion.rb
446
367
  homepage: https://github.com/seamusabshere/data_miner
447
368
  licenses: []
448
369
  post_install_message:
@@ -458,9 +379,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
458
379
  required_rubygems_version: !ruby/object:Gem::Requirement
459
380
  none: false
460
381
  requirements:
461
- - - ! '>='
382
+ - - ! '>'
462
383
  - !ruby/object:Gem::Version
463
- version: '0'
384
+ version: 1.3.1
464
385
  requirements: []
465
386
  rubyforge_project: data_miner
466
387
  rubygems_version: 1.8.25
@@ -469,18 +390,13 @@ specification_version: 3
469
390
  summary: Download, pull out of a ZIP/TAR/GZ/BZ2 archive, parse, correct, and import
470
391
  XLS, ODS, XML, CSV, HTML, etc. into your ActiveRecord models.
471
392
  test_files:
472
- - test/data_miner/step/test_import.rb
473
393
  - test/data_miner/step/test_sql.rb
474
394
  - test/data_miner/test_attribute.rb
475
- - test/data_miner/unit_converter/test_alchemist.rb
476
- - test/data_miner/unit_converter/test_conversions.rb
477
395
  - test/helper.rb
478
396
  - test/support/breed.rb
479
397
  - test/support/breed_by_license_number.csv
480
398
  - test/support/breeds.xls
481
399
  - test/support/data_miner_with_alchemist.rb
482
- - test/support/data_miner_with_conversions.rb
483
- - test/support/data_miner_without_unit_converter.rb
484
400
  - test/support/pet.rb
485
401
  - test/support/pet2.rb
486
402
  - test/support/pet3.rb
@@ -489,8 +405,4 @@ test_files:
489
405
  - test/support/pets.csv
490
406
  - test/support/pets_funny.csv
491
407
  - test/test_data_miner.rb
492
- - test/test_data_miner_run_column_statistic.rb
493
- - test/test_earth_import.rb
494
- - test/test_safety.rb
495
- - test/test_unit_conversion.rb
496
408
  has_rdoc:
@@ -1,84 +0,0 @@
1
- require 'remote_table'
2
-
3
- class DataMiner
4
- # An easy way to translate data before importing it using an intermediate table.
5
- class Dictionary
6
- DEFAULT_CASE_SENSITIVE = true
7
-
8
- # What field in the dictionary holds the lookup key.
9
- #
10
- # In other words, the column we scan down to find an entry.
11
- #
12
- # @return [String]
13
- attr_reader :key_name
14
-
15
- # What field in the dictionary holds the final value.
16
- #
17
- # @return [String]
18
- attr_reader :value_name
19
-
20
- # A +sprintf+-style format to be applied.
21
- # @return [String]
22
- attr_reader :sprintf
23
-
24
- # The URL of the dictionary. It must be a CSV.
25
- # @return [String]
26
- attr_reader :url
27
-
28
- # Whether to be case-sensitive with lookups. Defaults to true.
29
- # @return [TrueClass, FalseClass]
30
- attr_reader :case_sensitive
31
-
32
- # @private
33
- def initialize(options = {})
34
- options = options.symbolize_keys
35
- @url = options[:url]
36
- @key_name = options[:input].to_s
37
- @value_name = options[:output].to_s
38
- @sprintf = options[:sprintf]
39
- @case_sensitive = options.fetch :case_sensitive, DEFAULT_CASE_SENSITIVE
40
- @table_mutex = ::Mutex.new
41
- end
42
-
43
- # Look up a translation for a value.
44
- #
45
- # @return [nil, String]
46
- def lookup(value)
47
- normalized_value = normalize_for_comparison value
48
- if match = table.detect { |entry| entry[key_name] == normalized_value }
49
- match[value_name].to_s
50
- end
51
- end
52
-
53
- private
54
-
55
- def table
56
- @table || @table_mutex.synchronize do
57
- @table ||= ::RemoteTable.new(url).map do |entry|
58
- entry[key_name] = normalize_for_comparison entry[key_name]
59
- entry
60
- end
61
- end
62
- end
63
-
64
- def refresh
65
- @table = nil
66
- end
67
-
68
- def normalize_for_comparison(str)
69
- if sprintf
70
- if sprintf.end_with?('f')
71
- str = str.to_f
72
- elsif sprintf.end_with?('d')
73
- str = str.to_i
74
- end
75
- str = sprintf % str
76
- end
77
- str = DataMiner.compress_whitespace str
78
- unless case_sensitive
79
- str = DataMiner.downcase str
80
- end
81
- str
82
- end
83
- end
84
- end
@@ -1,144 +0,0 @@
1
- require 'aasm'
2
- require 'active_record_inline_schema'
3
-
4
- require 'data_miner/run/column_statistic'
5
-
6
- class DataMiner
7
- # A record of what happened when you ran a data miner script.
8
- #
9
- # To create the table, use +DataMiner::Run.auto_upgrade!+, possibly in +db/seeds.rb+ or a database migration.
10
- class Run < ::ActiveRecord::Base
11
- class << self
12
- # If a previous run died and you have manually enabled locking, you may find yourself getting +LockMethod::Locked+ exceptions.
13
- #
14
- # @note Starting in 2.1.0, runs are no longer locked by default. This method remains in case you want to re-apply locking.
15
- #
16
- # @param [String] model_names What locks to clear.
17
- #
18
- # @return [nil]
19
- #
20
- # @example Re-enable locking (since it was turned off by default in 2.1.0)
21
- # require 'data_miner'
22
- # require 'lock_method'
23
- # DataMiner::Run.lock_method :start
24
- def clear_locks(model_names = DataMiner.model_names)
25
- return unless defined?(::LockMethod)
26
- model_names.each do |model_name|
27
- dummy = new
28
- dummy.model_name = model_name
29
- dummy.lock_method_clear :start
30
- end
31
- nil
32
- end
33
- end
34
-
35
- # Raise this exception to skip the current run without causing it to fail.
36
- #
37
- # @example Avoid running certain data miner scripts too often (because they take too long).
38
- # class FlightSegment < ActiveRecord::Base
39
- # data_miner do
40
- # [...]
41
- # process "don't run this more than once an hour" do
42
- # if (last_ran_at = data_miner_runs.first(:order => 'created_at DESC').try(:created_at)) and (Time.now.utc - last_ran_at) < 3600
43
- # raise DataMiner::Run::Skip
44
- # end
45
- # end
46
- # [...]
47
- # end
48
- # end
49
- class Skip < ::Exception
50
- end
51
-
52
- INITIAL_STATE = :limbo
53
-
54
- self.table_name = 'data_miner_runs'
55
-
56
- col :model_name
57
- col :aasm_state
58
- col :created_at, :type => :datetime
59
- col :stopped_at, :type => :datetime
60
- col :updated_at, :type => :datetime
61
- col :error, :type => :text
62
- col :row_count_before, :type => :integer
63
- col :row_count_after, :type => :integer
64
- add_index :model_name
65
- add_index :aasm_state
66
-
67
- validates_presence_of :model_name
68
-
69
- has_many :column_statistics, :class_name => 'DataMiner::Run::ColumnStatistic', :order => 'id ASC'
70
-
71
- include ::AASM
72
- aasm_initial_state INITIAL_STATE
73
- aasm_state :limbo
74
- aasm_state :skipped
75
- aasm_state :succeeded
76
- aasm_state :failed
77
- aasm_event(:succeed) { transitions :from => :limbo, :to => :succeeded }
78
- aasm_event(:skip) { transitions :from => :limbo, :to => :skipped }
79
- aasm_event(:fail) { transitions :from => :limbo, :to => :failed }
80
-
81
- # @private
82
- def start
83
- model = model_name.constantize
84
- if model.table_exists?
85
- self.row_count_before = model.count
86
- end
87
- save!
88
- if DataMiner.per_column_statistics?
89
- ColumnStatistic.take self
90
- end
91
- begin
92
- catch :data_miner_succeed do
93
- yield
94
- end
95
- succeed!
96
- rescue Skip
97
- skip!
98
- rescue
99
- self.error = "#{$!.message}\n#{$!.backtrace.join("\n")}"
100
- fail!
101
- raise $!
102
- ensure
103
- if model.connection.respond_to?(:schema_cache)
104
- model.connection.schema_cache.clear!
105
- end
106
- model.reset_column_information
107
- if model.table_exists?
108
- self.row_count_after = model.count
109
- if DataMiner.per_column_statistics?
110
- ColumnStatistic.take self
111
- end
112
- end
113
- self.stopped_at = ::Time.now.utc
114
- save!
115
- DataMiner.logger.info %{[data_miner] #{model_name} #{aasm_current_state.to_s.upcase} (#{(stopped_at-created_at).round(2)}s)}
116
- end
117
- self
118
- end
119
-
120
- # Get the column statistics for a particular column before this run started.
121
- #
122
- # @param [String] column_name The column you want to know about.
123
- #
124
- # @return [ColumnStatistic]
125
- def initial_column_statistics(column_name)
126
- column_statistics.where(:column_name => column_name.to_s).first
127
- end
128
-
129
- # Get the column statistics for a particular column after this run finished.
130
- #
131
- # @param [String] column_name The column you want to know about.
132
- #
133
- # @return [ColumnStatistic]
134
- def final_column_statistics(column_name)
135
- column_statistics.where(:column_name => column_name.to_s).last
136
- end
137
-
138
- # @private
139
- def as_lock
140
- database_name = Run.connection.instance_variable_get(:@config).try(:[], :database)
141
- [database_name, model_name]
142
- end
143
- end
144
- end