csv_migration 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/csv_migration.rb +473 -0
  3. metadata +58 -0
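The package ships a single class, CsvMigration (data/lib/csv_migration.rb, the second hunk below). It is clearly meant to be subclassed: the subclass fills in @ref_csv_head_from_file (the CSV-column-to-DB-field mapping), @duplicates_dict and @replace_dict inside initialize, and overrides add_record_to_db, which the base class leaves raising an error. The sketch below is an editor's illustration of that wiring and is not code from the package; the users.csv columns, the User model and the field names are hypothetical. Note that besides the keys documented in the source comments, find_data_from_csv also honours an undocumented set_is_empty key, used here for the phone column.

require 'csv_migration'

class UserMigration < CsvMigration
  def initialize(file_name:, delimiter: ';')
    super

    # Hypothetical mapping for a users.csv with the header: email;old_email;name;surname;phone
    @ref_csv_head_from_file = {
      'email' => { field: :email, require: true, is_empty: ['old_email'],
                   validate: :email_validate, callback: :email_lowercase },
      'name'  => { field: :full_name, require: true, prefix: 'surname' },
      'phone' => { field: :phone, require: false, replace: true, set_is_empty: 'NULL' }
    }

    # Fields (by :field name) to scan for duplicate values
    @duplicates_dict = %i[email phone]

    # Case-insensitive replacements applied to fields with replace: true
    @replace_dict = { 'n/a' => 'NULL' }
  end

  private

  # Called once per parsed row; the base class raises unless this is overridden.
  # User is a hypothetical ActiveRecord-style model.
  def add_record_to_db(record)
    user = User.new(email: record[:email], full_name: record[:full_name], phone: record[:phone])

    save_error(record, "User #{record[:email]} could not be saved") unless user.save
  end
end

UserMigration.new(file_name: 'users.csv').call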
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: c5f74597cba0b6c9d98490bcd353432e9586e67a6ff1a01d6a7d51e821c20ddb
+   data.tar.gz: c1b2f539cb7b58f8ccd98ef187212f9c21eb15e24da129b4407dbfb69aacdbd1
+ SHA512:
+   metadata.gz: 81fcc7ae7e50b3f1dfcc4f3f12f399720a1d7e8f87f17fe3b16f04217e4ee78c3abb792dbdcc41e143fb35e7c33ab1b8ae1c0d505de8c503ac293024b018e813
+   data.tar.gz: cbc31f85f5ecbfc7d5dd063b035601fd00b12330dfd1e285a38de51a19c544b40bed433849eb035e25275f09d60733bc50d691bcfdf9e6061567271c69ba524f
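The hunk above is checksums.yaml, which records SHA256 and SHA512 digests of the metadata.gz and data.tar.gz entries packed inside the .gem archive. A rough way to cross-check the SHA256 values against a locally fetched copy might look like this (a sketch only: the gem file is assumed to sit in the current directory, e.g. after `gem fetch csv_migration -v 0.0.1`):

require 'digest'
require 'rubygems/package'

# Recompute the SHA256 digests that checksums.yaml records for the two
# archive members of csv_migration-0.0.1.gem (local path is an assumption).
File.open('csv_migration-0.0.1.gem', 'rb') do |io|
  reader = Gem::Package::TarReader.new(io)
  reader.each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)

    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
  reader.close
end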
@@ -0,0 +1,473 @@
+ # frozen_string_literal: true
+
+ # Description: Parse and test data from a csv file.
+ class CsvMigration
+   # @param [String] file_name with extension (my_file.csv)
+   # @param [String] delimiter for parsing, by default = ';'
+   def initialize(file_name:, delimiter: ';')
+     # File name for parsing, in csv format
+     @file_name_csv = file_name
+     @delimiter = delimiter
+
+     @file_name = @file_name_csv.split('.csv').first
+
+     # File for exporting the correct data from the base file
+     @correct_file_data_csv = File.expand_path("v_parser_correct_#{@file_name}.csv")
+     @errors_log = File.expand_path("v_parser_errors_#{@file_name}.log")
+     @duplicates_log = File.expand_path("v_parser_duplicates_#{@file_name}.log")
+     @not_saved_file_data_errors = File.expand_path("v_parser_not_saved_#{@file_name}.log")
+
+     # File to parse
+     @file_for_parsing = File.expand_path(@file_name_csv)
+
+     # Remove old files
+     remove_old_files
+
+     # Count rows in the file, excluding the header
+     @count_file_lines = `wc -l #{@file_for_parsing}`.split[0].to_i - 1
+
+     @line_num = 0
+     @counter_good_records = 0
+     @counter_duplicates = 0
+
+     # Raw data from the file, without the header
+     @file_raw_data = []
+
+     # Data after parsing
+     @parsed_data = []
+
+     # Header fields from the csv file
+     @parsing_file_header = []
+
+     # Error statuses
+     @errors = {}
+
+     # Errors data
+     @errors_data = {}
+
+     # Duplicate records
+     @duplicates = {}
+
+     # Records that could not be created from the file
+     @not_saved_records = []
+
+     # Maps a header name from the file to a specific field name of a table
+     #
+     # Key: column name in the csv file
+     # Value:
+     #   field: a field name of a table in a DB (symbol)
+     #   require: the field must not be empty (true/false)
+     #   replace: apply @replace_dict ( @replace_dict = { 'what to replace' => 'replace with this' } ) (true/false)
+     #   prefix: header name of a CSV column whose value is prepended as a prefix (string)
+     #   validate: callback method to call to validate a specific format (symbol)
+     #   is_empty: fields to search for a value in when this one is empty (field names from the CSV file header) (array of strings)
+     #   default: a value to set by default in any case (any type)
+     #   callback: callback method to call to build a value in a specific format (symbol)
+     @ref_csv_head_from_file = {}
+
+     # Field names (the :field values from @ref_csv_head_from_file) to check for duplicates
+     @duplicates_dict = %i[]
+
+     # Dictionary for replacing a key word with a value word: 'hallo' => 'Hello'
+     @replace_dict = {}
+   end
+
+   # Start parsing
+   def call
+     puts "Start parse file #{@file_for_parsing}"
+
+     # Read line from csv file
+     File.foreach(@file_for_parsing) do |line|
+       data = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').chomp.split(@delimiter).map(&:strip)
+
+       if @line_num.zero?
+         @parsing_file_header = data.map(&:downcase)
+         @line_num += 1
+         next
+       end
+
+       @file_raw_data << line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').chomp
+
+       check = check_require_fields(data)
+
+       unless check[:status]
+         @line_num += 1
+         puts "Incorrect data! Required field: #{check[:error]} is empty!"
+         next
+       end
+
+       records = find_data_from_csv(data, @ref_csv_head_from_file)
+
+       @parsed_data << { id: @line_num - 1 }.merge(records)
+
+       puts "Left parse #{@count_file_lines - @line_num} lines"
+       @line_num += 1
+       @counter_good_records += 1
+     end
+
+     duplicates_id_list = check_duplicates
+     remove_duplicates(duplicates_id_list) if duplicates_id_list.any?
+
+     save_errors
+
+     create_file_without_errors
+
+     double_duplicates = @counter_good_records + @errors.values.sum + @counter_duplicates - @line_num - 1
+
+     puts
+     puts "Testing data was finished. All records in the file (without header): #{@line_num - 1}"
+     puts "Good records: #{@counter_good_records}"
+     puts "Bad records: #{@errors.values.sum}"
+     puts "Duplicate records: #{@counter_duplicates}"
+     puts "Duplicates more than one field: #{double_duplicates}" if double_duplicates.positive?
+     puts "Success parsed records: #{@parsed_data.size}"
+
+     error_actions if !@errors.values.sum.zero? || !@counter_duplicates.zero?
+
+     create_data_to_db
+
+     save_record_errors_to_file if @not_saved_records.any?
+   end
+
+   # Remove old files
+   def remove_old_files
+     File.delete(@errors_log) if File.exist?(@errors_log)
+     File.delete(@duplicates_log) if File.exist?(@duplicates_log)
+     File.delete(@correct_file_data_csv) if File.exist?(@correct_file_data_csv)
+     File.delete(@not_saved_file_data_errors) if File.exist?(@not_saved_file_data_errors)
+   end
+
+   private
+
+   # Check whether a variable is nil or empty
+   #
+   # @param [Any] var variable to check
+   def blank?(var)
+     var.nil? || var.empty?
+   end
+
+   # Check whether a variable is present (not blank)
+   #
+   # @param [Any] var variable to check
+   def present?(var)
+     !blank?(var)
+   end
+
+   # Ask what to do before saving data when errors exist
+   def error_actions
+     print 'This file has errors. Do you want to save data without errors Y/n: '
+     respond = STDIN.gets.chomp
+
+     error_actions unless respond.casecmp('y').zero? || respond.casecmp('n').zero? || blank?(respond)
+     exit if respond.casecmp('n').zero?
+   end
+
+   # Callback for lowercasing data
+   #
+   # @param [String] value Data from the CSV file after all manipulations (replace, prefix, etc.)
+   # @param [String] header_name Header name of the CSV file column
+   # @param [String] prev_value Data from the CSV file before all manipulations (replace, prefix, etc.)
+   # @param [Hash] field_data Settings for a specific field from @ref_csv_head_from_file
+   def email_lowercase(value:, header_name:, prev_value:, field_data:)
+     value.downcase
+   end
+
+   # Callback for validating a specific data format
+   #
+   # @param [String] value data to validate
+   def email_validate(value)
+     check = value.match?(/\A[\w+\-.]+@[a-z\d\-.]+\.[a-z]+\z/i)
+
+     puts "Email Error #{value}" unless check
+
+     check
+   end
+
+   # Check fields which should be present
+   #
+   # @param [Array] data list of values
+   def check_require_fields(data)
+     @ref_csv_head_from_file.each do |key, value|
+       return { error: value[:field], status: false } unless check_field(data, key.downcase, value)
+     end
+
+     { status: true }
+   end
+
+   # Check a specific field which should be present
+   #
+   # @param [Array] data list of values from a file
+   # @param [String] key header field name in a file
+   # @param [Object] value hash data from dict
+   def check_field(data, key, value)
+     if @parsing_file_header.find_index(key).nil?
+       puts "Please, correct settings in the @ref_csv_head_from_file hash. Key #{key} was not found in the header of the #{@file_name_csv} file!"
+       exit
+     end
+
+     if value[:require] && blank?(data[@parsing_file_header.find_index(key)])
+       check = check_is_empty_field(key, data, value)
+
+       return true if check && validate_field(data_value: check, value: value)
+
+       @errors[value[:field]] = @errors[value[:field]].nil? ? 1 : @errors[value[:field]] + 1
+       @errors_data[value[:field]] = [] unless present?(@errors_data[value[:field]])
+       @errors_data[value[:field]] << [data.join(';')]
+
+       return false
+     end
+
+     unless validate_field(data_value: data[@parsing_file_header.find_index(key)], value: value)
+       @errors[value[:field]] = @errors[value[:field]].nil? ? 1 : @errors[value[:field]] + 1
+       @errors_data[value[:field]] = [] unless present?(@errors_data[value[:field]])
+       @errors_data[value[:field]] << [data.join(';')]
+
+       return false
+     end
+
+     true
+   end
+
+   # Validate a field if a validation callback exists
+   #
+   # @param [String] data_value value from file
+   # @param [Hash] value hash data from dict
+   def validate_field(data_value:, value:)
+
+     return true unless present?(value[:validate])
+
+     return method(value[:validate].to_sym).call(data_value) if respond_to?(value[:validate], true)
+
+     true
+   end
+
+   # Check the fields listed in :is_empty when a value is empty
+   #
+   # @param [String] key searched key
+   # @param [Array] data list with data from file
+   # @param [Hash] field Hash data from dict
+   def check_is_empty_field(key, data, field)
+     return false unless present?(field[:is_empty])
+
+     find_value_in_other_fields(key, data, field)
+   end
+
+   # Find a value in the other fields which were set for searching
+   #
+   # @param [String] key searched key
+   # @param [Array] data list with data from file
+   # @param [Hash] field Hash data from dict
+   def find_value_in_other_fields(key, data, field)
+     return data[@parsing_file_header.find_index(key)] unless blank?(data[@parsing_file_header.find_index(key)])
+
+     return false unless field[:is_empty]
+
+     field[:is_empty].each do |value|
+       return data[@parsing_file_header.find_index(value.downcase)] unless blank?(data[@parsing_file_header.find_index(value.downcase)])
+     end
+
+     false
+   end
+
+   # Build a record from one CSV row
+   #
+   # @param [Array] data from file (one read line)
+   # @param [Hash] object_dict hash dict for creating data in a specific format
+   def find_data_from_csv(data, object_dict)
+     new_data = {}
+     object_dict.each do |key, value|
+       field_name = value[:field]
+
+       prev_field_data = present?(new_data[field_name]) ? new_data[field_name] : nil
+
+       new_data[field_name] = find_value_in_other_fields(key.downcase, data, value) if value[:require]
+       new_data[field_name] = data[@parsing_file_header.find_index(key.downcase)]&.strip unless value[:require]
+
+       if value[:prefix]
+         prefix_value = data[@parsing_file_header.find_index(value[:prefix])]&.strip
+         new_data[field_name] = prefix_value + ' ' + new_data[field_name] unless blank?(prefix_value)
+       end
+
+       new_data[field_name] = value[:default] if value[:default]
+       new_data[field_name] = value[:set_is_empty] if blank?(new_data[field_name]) && value.key?(:set_is_empty)
+       new_data[field_name] = replace_by_dict(new_data[field_name]) if value[:replace]
+
+       if value[:callback] && respond_to?(value[:callback], true)
+         new_data[field_name] = method(value[:callback].to_sym)
+                                .call(value: new_data[field_name], header_name: key.downcase, prev_value: prev_field_data, field_data: value)
+       end
+     end
+
+     new_data
+   end
+
+   # Replace text using the @replace_dict dictionary
+   def replace_by_dict(string)
+     @replace_dict.each do |key, value|
+       next if blank?(string)
+       return value if key.casecmp(string).zero?
+     end
+
+     string
+   end
+
+   # Search for duplicate records and save them to a log file
+   def check_duplicates
+     return [] if @parsed_data.size.zero? || @duplicates_dict.size.zero?
+
+     id_list = []
+
+     puts 'Start search duplicates...'
+
+     @parsed_data.each do |row|
+       id = row[:id]
+       line = row.clone
+
+       @duplicates_dict.each do |duplicate|
+         next unless present?(line[duplicate])
+
+         unless @duplicates.key?(line[duplicate])
+           @duplicates = @duplicates.deep_merge(line[duplicate] => { id: [], field: duplicate, value: line[duplicate], data: [] })
+         end
+
+         @duplicates[line[duplicate]][:id] << id
+         @duplicates[line[duplicate]][:data] << @file_raw_data[id]
+
+         puts "Check line ##{id}"
+       end
+     end
+
+     @duplicates = @duplicates.select { |_k, v| v[:data].size > 1 && (v[:value] != 'NULL' || blank?(v[:value])) }
+
+     if @duplicates.any?
+       file_duplicate = File.open(@duplicates_log, 'w')
+       file_duplicate.puts @parsing_file_header.join(';')
+
+       @duplicates.each do |_key, value|
+         @counter_duplicates += value[:data].size
+
+         file_duplicate.puts
+         file_duplicate.puts "Duplicate field: #{value[:field]}, value: #{value[:value]}"
+         file_duplicate.puts
+         value[:data].each do |record|
+           file_duplicate.puts record
+         end
+
+         id_list << value[:id]
+       end
+
+       file_duplicate.close
+     end
+
+     id_list.flatten.uniq
+   end
+
+   # Remove duplicate records from parsed data
+   #
+   # @param [Array] id_list list of duplicate ids
+   def remove_duplicates(id_list)
+     @counter_good_records -= id_list.size
+
+     @parsed_data = @parsed_data.reject { |value| id_list.include?(value[:id]) }
+   end
+
+   # Save errors data to a log file
+   def save_errors
+     errors = lambda do |errors_data|
+       file_error = File.open(@errors_log, 'w')
+       file_error.puts @parsing_file_header.join(';') unless errors_data.size.zero?
+
+       errors_data.each do |key, value|
+         file_error.puts
+         file_error.puts ' ' * 10 + "#{key.capitalize}:"
+         file_error.puts
+         value.each do |data|
+           file_error.puts data
+         end
+       end
+
+       file_error.close
+     end
+
+     errors.call(@errors_data) if @errors_data.any?
+
+     puts
+     puts "Errors: #{@errors}" if @errors.any?
+     puts
+   end
+
+   # Create a new csv export file without errors and duplicates
+   def create_file_without_errors
+     file_export = File.open(@correct_file_data_csv, 'w')
+     file_export.puts @parsing_file_header.join(';')
+
+     @parsed_data.each do |value|
+       file_export.puts @file_raw_data[value[:id]]
+     end
+
+     file_export.close
+   end
+
+   # Add a found error to the errors data
+   #
+   # @param [Hash] record parsed data
+   # @param [String] error_text message for the error
+   def save_error(record, error_text)
+     @not_saved_records << {
+       raw: @file_raw_data[record[:id]],
+       data: record,
+       error: error_text
+     }
+
+     puts error_text
+   end
+
+   # Create new data in the DB
+   # Iterates over @parsed_data and calls add_record_to_db for each record
+   def create_data_to_db
+     @parsed_data.each do |record|
+       add_record_to_db(record)
+     end
+
+     show_finished_test
+   end
+
+   # Show text in the console after the migration
+   def show_finished_test
+     puts
+     puts 'Migration was finished.'
+     puts "Total records for insert: #{@parsed_data.size}"
+     puts "Saved records: #{@parsed_data.size - @not_saved_records.size}"
+     puts "Not saved records: #{@not_saved_records.size}. See log with errors: #{@not_saved_file_data_errors}"
+   end
+
+   # Create a new record in the DB
+   #
+   # @param [Hash] _record in a specific hash format
+   def add_record_to_db(_record)
+     raise 'You should implement the callback method add_record_to_db(record)'
+
+     # Look up a model record if necessary
+     # user = User.find_by(email: record[:email].downcase)
+     #
+     # if user.nil?
+     #   save_error(record, "User wasn't found in the DB by email")
+     #   return
+     # end
+   end
+
+   # Save all records with errors to a file
+   def save_record_errors_to_file
+     errors = lambda do |errors_data|
+       file_error = File.open(@not_saved_file_data_errors, 'w')
+
+       errors_data.each do |value|
+         file_error.puts value
+       end
+
+       file_error.close
+     end
+
+     errors.call(@not_saved_records) if @not_saved_records.any?
+   end
+ end
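Two editorial notes on the class above. First, check_duplicates relies on Hash#deep_merge, which comes from ActiveSupport rather than core Ruby, so the duplicate check appears to assume ActiveSupport is loaded (a plain merge would behave the same at that call site, since the key is known to be absent). Second, custom per-field hooks are dispatched by method name: a :callback method must accept the keyword arguments value:, header_name:, prev_value: and field_data: and return the new value, while a :validate method receives the raw value and returns true or false, exactly as email_lowercase and email_validate do. A hypothetical pair for a phone column, written as an illustration rather than code from the gem:

class UserMigration < CsvMigration
  private

  # :callback hook — same keyword signature as email_lowercase above.
  def phone_normalize(value:, header_name:, prev_value:, field_data:)
    value.to_s.gsub(/[^\d+]/, '')
  end

  # :validate hook — same positional signature as email_validate above.
  def phone_validate(value)
    value.match?(/\A\+?\d{7,15}\z/)
  end
end

# Referenced from the mapping as, e.g.:
#   'phone' => { field: :phone, validate: :phone_validate, callback: :phone_normalize }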
metadata ADDED
@@ -0,0 +1,58 @@
+ --- !ruby/object:Gem::Specification
+ name: csv_migration
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Danilevsky Kirill (Syndicode.com)
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2019-11-10 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 5.13.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 5.13.0
+ description: You can make parsing CSV file, generate from it hash data and then save
+   to DB
+ email: k.danilevsky@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/csv_migration.rb
+ homepage: https://github.com/kirill-dan/csv_migration
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 2.5.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.0.3
+ signing_key:
+ specification_version: 4
+ summary: Migration system from a csv file
+ test_files: []
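To pin this exact release in a project (the gemspec above requires Ruby >= 2.5.0), the usual Bundler entry is:

# Gemfile
gem 'csv_migration', '0.0.1'

# after which `bundle install` makes `require 'csv_migration'` available.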