csv_migration 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/csv_migration.rb +473 -0
  3. metadata +58 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c5f74597cba0b6c9d98490bcd353432e9586e67a6ff1a01d6a7d51e821c20ddb
4
+ data.tar.gz: c1b2f539cb7b58f8ccd98ef187212f9c21eb15e24da129b4407dbfb69aacdbd1
5
+ SHA512:
6
+ metadata.gz: 81fcc7ae7e50b3f1dfcc4f3f12f399720a1d7e8f87f17fe3b16f04217e4ee78c3abb792dbdcc41e143fb35e7c33ab1b8ae1c0d505de8c503ac293024b018e813
7
+ data.tar.gz: cbc31f85f5ecbfc7d5dd063b035601fd00b12330dfd1e285a38de51a19c544b40bed433849eb035e25275f09d60733bc50d691bcfdf9e6061567271c69ba524f
@@ -0,0 +1,473 @@
1
# frozen_string_literal: true

# Description: Parse and test data from a csv file.
#
# The class reads a delimited text file line by line, checks every row against
# the rules stored in @ref_csv_head_from_file, drops rows that are invalid or
# duplicated, writes several report files into the current working directory
# and finally hands every surviving row to #add_record_to_db.
#
# @ref_csv_head_from_file, @duplicates_dict and @replace_dict are initialised
# empty and #add_record_to_db raises by default, so a caller has to provide
# them (for example from a subclass) before #call does useful work.
class CsvMigration
  # @param [String] file_name with extension (my_file.csv)
  # @param [String] delimiter for parsing, by default = ';'
  def initialize(file_name:, delimiter: ';')
    # File name for parsing in csv format
    @file_name_csv = file_name
    @delimiter = delimiter

    # Base name without directories and without the .csv suffix; it is used to
    # build the names of the report files.
    @file_name = File.basename(@file_name_csv, '.csv')

    # File for export correct data from base file
    @correct_file_data_csv = File.expand_path("v_parser_correct_#{@file_name}.csv")
    @errors_log = File.expand_path("v_parser_errors_#{@file_name}.log")
    @duplicates_log = File.expand_path("v_parser_duplicates_#{@file_name}.log")
    @not_saved_file_data_errors = File.expand_path("v_parser_not_saved_#{@file_name}.log")

    # Parsing file
    @file_for_parsing = File.expand_path(@file_name_csv)

    # Remove old files
    remove_old_files

    # Count rows in the file without header
    @count_file_lines = count_data_rows

    @line_num = 0
    @counter_good_records = 0
    @counter_duplicates = 0

    # Raw data from a file without header
    @file_raw_data = []

    # Data after parsing
    @parsed_data = []

    # Header fields from csv file
    @parsing_file_header = []

    # Error statuses
    @errors = {}

    # Errors data
    @errors_data = {}

    # Duplicates records, keyed by [field, value]
    @duplicates = {}

    # Errors creating records from the file
    @not_saved_records = []

    # Relation of header name from the file with a specific field name of a table
    #
    # Key: column name in the csv file
    # Value:
    #   field: a field name of a table in a DB (symbol)
    #   require: a field should be not empty (true/false)
    #   replace: need to use @replace_dict ( @replace_dict = { 'what need replace' => 'replace to this' } ) (true/false)
    #   prefix: need to add value as a prefix from a field header name (header name from CSV file) (string)
    #   validate: callback method which necessary call for validating a specific format (symbol)
    #   is_empty: array with fields where need to search data if a value is empty (field name from CSV file header) (array of strings)
    #   default: a value which need set by default in any case (any type except nil)
    #   set_is_empty: a value which is used when the resulting value is blank (any type)
    #   callback: callback method which necessary call for creating a specific format (symbol)
    @ref_csv_head_from_file = {}

    # Dictionary with fields names from the @ref_csv_head_from_file where need to search duplicates
    @duplicates_dict = %i[]

    # Dictionary for replace a key word to a value word: 'hallo' => 'Hello'
    @replace_dict = {}
  end

  # Start parsing
  #
  # Steps: parse and validate every row, detect and drop duplicates, write the
  # error log and the "correct" csv, print a summary, ask for confirmation when
  # problems were found, pass every remaining row to #add_record_to_db and
  # finally log the rows that were reported through #save_error.
  def call
    puts "Start parse file #{@file_for_parsing}"

    parse_file

    duplicates_id_list = check_duplicates
    remove_duplicates(duplicates_id_list) if duplicates_id_list.any?

    save_errors

    create_file_without_errors

    print_summary

    error_actions if @errors.values.sum.positive? || @counter_duplicates.positive?

    create_data_to_db

    save_record_errors_to_file if @not_saved_records.any?
  end

  # Remove old files
  def remove_old_files
    [@errors_log, @duplicates_log, @correct_file_data_csv, @not_saved_file_data_errors].each do |path|
      File.delete(path) if File.exist?(path)
    end
  end

  private

  # Count rows in the parsed file without the header line.
  # Done in Ruby (no shell call), so any path is safe.
  #
  # @return [Integer] number of data rows, 0 when the file is missing or empty
  def count_data_rows
    return 0 unless File.file?(@file_for_parsing)

    [File.foreach(@file_for_parsing).count - 1, 0].max
  end

  # Read the file line by line, remember the header and validate every row
  def parse_file
    # 'BOM|UTF-8' drops a leading byte order mark; scrub removes only bytes
    # that are not valid UTF-8, so valid non-ASCII text is kept.
    File.foreach(@file_for_parsing, encoding: 'BOM|UTF-8') do |line|
      raw_line = line.scrub('').chomp
      data = raw_line.split(@delimiter).map(&:strip)

      if @line_num.zero?
        @parsing_file_header = data.map(&:downcase)
        @line_num += 1
        next
      end

      # Every data row is stored, so the index in @file_raw_data equals the row id
      @file_raw_data << raw_line

      check = check_require_fields(data)

      unless check[:status]
        @line_num += 1
        puts "Incorrect data! Required field: #{check[:error]} is empty!"
        next
      end

      records = find_data_from_csv(data, @ref_csv_head_from_file)

      # NOTE: a dict entry with field: :id would overwrite the row id here
      @parsed_data << { id: @line_num - 1 }.merge(records)

      puts "Left parse #{@count_file_lines - @line_num} lines"
      @line_num += 1
      @counter_good_records += 1
    end
  end

  # Print statistics about parsed rows to the console
  def print_summary
    total_records = [@line_num - 1, 0].max
    bad_records = @errors.values.sum

    # Rows which were counted as a duplicate in more than one field
    double_duplicates = @counter_good_records + bad_records + @counter_duplicates - total_records

    puts
    puts "Testing data was finished. All records in the file (without header): #{total_records}"
    puts "Good records: #{@counter_good_records}"
    puts "Bad records: #{bad_records}"
    puts "Duplicate records: #{@counter_duplicates}"
    puts "Duplicates more than one field: #{double_duplicates}" if double_duplicates.positive?
    puts "Success parsed records: #{@parsed_data.size}"
  end

  # Checking variable on is nil, false or on is empty
  #
  # Values without an #empty? method (numbers, true, ...) are never blank.
  #
  # @param [Any] var variable for check
  def blank?(var)
    return true if var.nil? || var == false

    var.respond_to?(:empty?) && var.empty?
  end

  # Checking variable on if exist
  #
  # @param [Any] var variable for check
  def present?(var)
    !blank?(var)
  end

  # Position of a column in the header of the parsed file
  #
  # @param [String] header_name column name, case insensitive
  # @return [Integer, nil] nil when the file has no such column
  def column_index(header_name)
    @parsing_file_header.find_index(header_name.to_s.downcase)
  end

  # Value of a column in one row
  #
  # @param [Array] data list of values from a file (one row)
  # @param [String] header_name column name, case insensitive
  # @return [String, nil] nil when the column or the cell does not exist
  def cell(data, header_name)
    index = column_index(header_name)
    index.nil? ? nil : data[index]
  end

  # Question action before saving data if exist errors
  #
  # Asks until the answer is 'y', 'n' or empty. 'n' and end of input stop the
  # program, 'y' and an empty answer continue.
  def error_actions
    loop do
      print 'This file has errors. Do you want to save data without errors Y/n: '
      answer = $stdin.gets

      # End of input: nobody can confirm, so nothing is saved
      exit if answer.nil?

      answer = answer.chomp
      return if blank?(answer) || answer.casecmp('y')&.zero?

      exit if answer.casecmp('n')&.zero?
    end
  end

  # Callback for lowercase data
  #
  # @param [String] value Data from the CSV file after all manipulation (replace, prefix, etc)
  # @param [String] header_name Header name a the CSV file
  # @param [String] prev_value Data from the CSV file before all manipulation (replace, prefix, etc)
  # @param [Hash] field_data Settings for a specific field from @ref_csv_head_from_file
  def email_lowercase(value:, header_name:, prev_value:, field_data:)
    value&.downcase
  end

  # Callback for validating specific format data
  #
  # @param [String] value data for validate, nil is handled like an empty string
  # @return [Boolean] true when the value looks like an email
  def email_validate(value)
    check = value.to_s.match?(/\A[\w+\-.]+@[a-z\d\-.]+\.[a-z]+\z/i)

    puts "Email Error #{value}" unless check

    check
  end

  # Check fields which should be present
  #
  # @param [Array] data list of values
  # @return [Hash] { status: true } or { error: field_name, status: false } for the first failed field
  def check_require_fields(data)
    @ref_csv_head_from_file.each do |key, value|
      return { error: value[:field], status: false } unless check_field(data, key.downcase, value)
    end

    { status: true }
  end

  # Checking specific field which should be present
  #
  # Stops the program when the column is missing in the header of the file.
  #
  # @param [Array] data list of values from a file
  # @param [String] key header field name in a file
  # @param [Object] value hash data from dict
  # @return [Boolean] false when the row was registered as an error
  def check_field(data, key, value)
    index = column_index(key)

    if index.nil?
      puts "Please, correct settings in the @ref_csv_head_from_file hash. Key #{key} doesn't found in the header of #{@file_name_csv} file!"
      exit
    end

    if value[:require] && blank?(data[index])
      # The required cell is empty: try the fallback columns from :is_empty
      fallback = check_is_empty_field(key, data, value)

      return true if fallback && validate_field(data_value: fallback, value: value)

      register_field_error(value[:field], data)
      return false
    end

    return true if validate_field(data_value: data[index], value: value)

    register_field_error(value[:field], data)
    false
  end

  # Count an error for a field and remember the row for the errors log
  #
  # @param [Symbol] field field name from dict
  # @param [Array] data list of values from a file (one row)
  def register_field_error(field, data)
    @errors[field] = @errors.fetch(field, 0) + 1
    @errors_data[field] ||= []
    @errors_data[field] << [data.join(@delimiter)]
  end

  # Validate field if exist validation callback
  #
  # @param [String] data_value value from file
  # @param [Hash] value hash data from dict
  # @return [Boolean] result of the callback, true when there is no usable callback
  def validate_field(data_value:, value:)
    validator = value[:validate]

    return true unless present?(validator)

    return send(validator, data_value) if respond_to?(validator, true)

    true
  end

  # Check all fields on is empty
  #
  # @param [String] key searched key
  # @param [Array] data list with data from file
  # @param [Hash] field Hash data from dict
  # @return [String, false] found value or false
  def check_is_empty_field(key, data, field)
    return false unless present?(field[:is_empty])

    find_value_in_other_fields(key, data, field)
  end

  # Find value in other fields which was set for search
  #
  # @param [String] key searched key
  # @param [Array] data list with data from file
  # @param [Hash] field Hash data from dict
  # @return [String, false] own value, first not empty fallback value or false
  def find_value_in_other_fields(key, data, field)
    own_value = cell(data, key)
    return own_value unless blank?(own_value)

    return false unless field[:is_empty]

    field[:is_empty].each do |fallback_header|
      candidate = cell(data, fallback_header)
      return candidate unless blank?(candidate)
    end

    false
  end

  # Find data from a CSV file
  #
  # Order of manipulation for every dict entry: read value, add prefix, apply
  # :default, apply :set_is_empty, replace by @replace_dict, call :callback.
  #
  # @param [Array] data from file (read one line)
  # @param [Hash] object_dict hash dict for creating data in specific format
  # @return [Hash] field name => value
  def find_data_from_csv(data, object_dict)
    object_dict.each_with_object({}) do |(key, value), new_data|
      field_name = value[:field]
      header_name = key.downcase

      # Value which an earlier dict entry stored for the same field
      prev_field_data = present?(new_data[field_name]) ? new_data[field_name] : nil

      current = if value[:require]
                  find_value_in_other_fields(header_name, data, value)
                else
                  cell(data, header_name)&.strip
                end

      if value[:prefix]
        prefix_value = cell(data, value[:prefix])&.strip
        current = "#{prefix_value} #{current}" unless blank?(prefix_value)
      end

      # nil means "no default"; false is a legal default
      current = value[:default] unless value[:default].nil?
      current = value[:set_is_empty] if blank?(current) && value.key?(:set_is_empty)
      current = replace_by_dict(current) if value[:replace]

      if value[:callback] && respond_to?(value[:callback], true)
        current = send(value[:callback],
                       value: current, header_name: header_name, prev_value: prev_field_data, field_data: value)
      end

      new_data[field_name] = current
    end
  end

  # Replace text by dict @replace_dict
  #
  # @param [String] string value for replace, compared without case sensitivity
  # @return [Any] value from the dict or the given value when nothing fits
  def replace_by_dict(string)
    return string if blank?(string)

    # casecmp returns nil for values which can't be compared, hence &.
    found = @replace_dict.find { |key, _value| key.casecmp(string)&.zero? }

    found ? found.last : string
  end

  # Search all duplicate records and saving it to a log file
  #
  # Rows are grouped by [field, value], so equal values in different fields of
  # one row are not a duplicate. Groups with the value 'NULL' are ignored.
  #
  # @return [Array] ids of all rows which belong to a duplicate group
  def check_duplicates
    return [] if @parsed_data.empty? || @duplicates_dict.empty?

    puts 'Start search duplicates...'

    @parsed_data.each do |row|
      id = row[:id]

      @duplicates_dict.each do |field|
        value = row[field]
        next unless present?(value)

        group = (@duplicates[[field, value]] ||= { id: [], field: field, value: value, data: [] })
        group[:id] << id
        group[:data] << @file_raw_data[id]

        puts "Check line ##{id}"
      end
    end

    @duplicates = @duplicates.select { |_key, group| group[:data].size > 1 && group[:value] != 'NULL' }

    return [] if @duplicates.empty?

    write_duplicates_log

    @duplicates.values.flat_map { |group| group[:id] }.uniq
  end

  # Save found duplicates to a log file and count them
  def write_duplicates_log
    File.open(@duplicates_log, 'w') do |file_duplicate|
      file_duplicate.puts @parsing_file_header.join(@delimiter)

      @duplicates.each_value do |group|
        @counter_duplicates += group[:data].size

        file_duplicate.puts
        file_duplicate.puts "Duplicate field: #{group[:field]}, value: #{group[:value]}"
        file_duplicate.puts
        group[:data].each { |record| file_duplicate.puts record }
      end
    end
  end

  # Remove duplicate records from parsed data
  #
  # @param [Array] id_list list duplicates
  def remove_duplicates(id_list)
    @counter_good_records -= id_list.size

    @parsed_data = @parsed_data.reject { |value| id_list.include?(value[:id]) }
  end

  # Save errors data to a log file
  def save_errors
    write_errors_log if @errors_data.any?

    puts
    puts "Errors: #{@errors}" if @errors.any?
    puts
  end

  # Write rows with errors to the errors log, grouped by field name
  def write_errors_log
    File.open(@errors_log, 'w') do |file_error|
      file_error.puts @parsing_file_header.join(@delimiter)

      @errors_data.each do |key, rows|
        file_error.puts
        file_error.puts "#{' ' * 10}#{key.capitalize}:"
        file_error.puts
        rows.each { |row| file_error.puts row }
      end
    end
  end

  # Create new csv export file without errors and duplicates
  #
  # The header is written in lowercase with the same delimiter as the rows.
  def create_file_without_errors
    File.open(@correct_file_data_csv, 'w') do |file_export|
      file_export.puts @parsing_file_header.join(@delimiter)

      @parsed_data.each do |value|
        file_export.puts @file_raw_data[value[:id]]
      end
    end
  end

  # Add found error to errors data
  #
  # @param [Hash] record parsed data
  # @param [String] error_text message for error
  def save_error(record, error_text)
    @not_saved_records << {
      raw: @file_raw_data[record[:id]],
      data: record,
      error: error_text
    }

    puts error_text
  end

  # Create new data in the DB
  # This method get @parsed_data and call in loop add_record_to_db method
  def create_data_to_db
    @parsed_data.each do |record|
      add_record_to_db(record)
    end

    show_finished_test
  end

  # Show text in the console after migration
  def show_finished_test
    puts
    puts 'Migration was finished.'
    puts "Total records for insert: #{@parsed_data.size}"
    puts "Saved records: #{@parsed_data.size - @not_saved_records.size}"
    puts "Not saved records: #{@not_saved_records.size}. See log with errors: #{@not_saved_file_data_errors}"
  end

  # Create new record in the DB
  #
  # Has to be replaced with a real implementation; the default one only raises.
  #
  # @param [Hash] _record in specific hash format
  # @raise [RuntimeError] always
  def add_record_to_db(_record)
    raise 'You should make realization callback method add_record_to_db(record)'

    # Search data of model if it necessary
    # user = User.find_by(email: record[:email].downcase)
    #
    # if user.nil?
    #   save_error(record, "User doesn't found in the DB by email")
    #   return
    # end
  end

  # Save all records with errors to a file
  def save_record_errors_to_file
    return unless @not_saved_records.any?

    File.open(@not_saved_file_data_errors, 'w') do |file_error|
      @not_saved_records.each { |value| file_error.puts value }
    end
  end
end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: csv_migration
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Danilevsky Kirill (Syndicode.com)
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-11-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: minitest
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 5.13.0
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 5.13.0
27
+ description: You can make parsing CSV file, generate from it hash data and then save
28
+ to DB
29
+ email: k.danilevsky@gmail.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - lib/csv_migration.rb
35
+ homepage: https://github.com/kirill-dan/csv_migration
36
+ licenses:
37
+ - MIT
38
+ metadata: {}
39
+ post_install_message:
40
+ rdoc_options: []
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 2.5.0
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubygems_version: 3.0.3
55
+ signing_key:
56
+ specification_version: 4
57
+ summary: Migration system from a csv file
58
+ test_files: []