csvops 0.2.0.alpha → 0.4.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +56 -108
- data/docs/architecture.md +266 -0
- data/docs/release-v0.3.0-alpha.md +74 -0
- data/docs/release-v0.4.0-alpha.md +87 -0
- data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +93 -0
- data/lib/csvtool/application/use_cases/run_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_randomization.rb +105 -0
- data/lib/csvtool/cli.rb +9 -1
- data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_options.rb +17 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_session.rb +25 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +24 -0
- data/lib/csvtool/domain/row_session/row_source.rb +3 -0
- data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
- data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
- data/lib/csvtool/infrastructure/csv/row_randomizer.rb +83 -0
- data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
- data/lib/csvtool/interface/cli/errors/presenter.rb +4 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
- data/lib/csvtool/interface/cli/prompts/headers_present_prompt.rb +22 -0
- data/lib/csvtool/interface/cli/prompts/seed_prompt.rb +29 -0
- data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +163 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +113 -0
- data/test/csvtool/application/use_cases/run_row_randomization_test.rb +124 -0
- data/test/csvtool/cli_test.rb +231 -12
- data/test/csvtool/cli_unit_test.rb +27 -2
- data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
- data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
- data/test/csvtool/domain/row_randomization_session/randomization_options_test.rb +20 -0
- data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +26 -0
- data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +42 -0
- data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
- data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
- data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
- data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
- data/test/csvtool/infrastructure/csv/row_randomizer_test.rb +37 -0
- data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
- data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +78 -10
- data/test/csvtool/interface/cli/prompts/headers_present_prompt_test.rb +14 -0
- data/test/csvtool/interface/cli/prompts/seed_prompt_test.rb +39 -0
- data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
- data/test/fixtures/dedupe_reference.csv +3 -0
- data/test/fixtures/dedupe_reference.tsv +3 -0
- data/test/fixtures/dedupe_reference_all.csv +5 -0
- data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
- data/test/fixtures/dedupe_reference_none.csv +2 -0
- data/test/fixtures/dedupe_reference_normalization.csv +3 -0
- data/test/fixtures/dedupe_source.csv +6 -0
- data/test/fixtures/dedupe_source.tsv +6 -0
- data/test/fixtures/dedupe_source_no_headers.csv +5 -0
- data/test/fixtures/dedupe_source_normalization.csv +4 -0
- data/test/fixtures/sample_people_no_headers.csv +3 -0
- metadata +50 -6
- data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
- data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
- data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
5
|
+
require "csvtool/infrastructure/csv/cross_csv_deduper"
|
|
6
|
+
require "csvtool/infrastructure/csv/selector_validator"
|
|
7
|
+
|
|
8
|
+
module Csvtool
|
|
9
|
+
module Application
|
|
10
|
+
module UseCases
|
|
11
|
+
# Use case that drops rows from a source CSV according to a reference CSV,
# delegating the actual comparison to the injected deduper. Retained rows go
# either to an output file or to the +on_header+ / +on_row+ callbacks.
#
# Outcomes are reported as Result values: CSV parse errors and permission
# errors are rescued and converted into failure results.
class RunCrossCsvDedupe
  # +ok+ is the success flag, +error+ a failure code symbol (nil on success)
  # and +data+ a hash with details about the outcome.
  Result = Struct.new(:ok, :error, :data, keyword_init: true) do
    def ok?
      ok
    end
  end

  # All collaborators are injectable; the default selector validator shares
  # the header reader passed (or defaulted) here.
  def initialize(
    header_reader: Infrastructure::CSV::HeaderReader.new,
    deduper: Infrastructure::CSV::CrossCsvDeduper.new,
    selector_validator: Infrastructure::CSV::SelectorValidator.new(header_reader: header_reader)
  )
    @header_reader = header_reader
    @deduper = deduper
    @selector_validator = selector_validator
  end

  # Runs the dedupe described by +session+.
  #
  # session   - exposes source, reference, key_mapping, match_options and
  #             output_destination.
  # on_header - optional callable receiving the source headers (console mode,
  #             only when the source has headers).
  # on_row    - optional callable receiving each retained row (console mode).
  #
  # Returns a Result. Failure codes produced here: :column_not_found,
  # :could_not_parse_csv, :cannot_read_file (plus whatever write_file reports).
  def call(session:, on_header: nil, on_row: nil)
    # current_read_path always names the file about to be read, so that a
    # permission error can be attributed to the right path in the rescue below.
    current_read_path = session.source.path
    unless @selector_validator.valid?(profile: session.source, selector: session.key_mapping.source_selector)
      return failure(:column_not_found)
    end

    current_read_path = session.reference.path
    unless @selector_validator.valid?(profile: session.reference, selector: session.key_mapping.reference_selector)
      return failure(:column_not_found)
    end

    # Switch back to the source BEFORE reading its headers; doing it afterwards
    # would blame the reference file for a failed source read.
    current_read_path = session.source.path
    source_headers =
      if session.source.headers_present?
        @header_reader.call(file_path: session.source.path, col_sep: session.source.separator)
      end

    if session.output_destination.file?
      write_file(session: session, source_headers: source_headers)
    else
      on_header.call(source_headers) if on_header && source_headers
      stats = @deduper.each_retained(**dedupe_options(session)) do |fields|
        on_row.call(fields) if on_row
      end
      success(stats: stats)
    end
  rescue CSV::MalformedCSVError
    failure(:could_not_parse_csv)
  rescue Errno::EACCES
    failure(:cannot_read_file, path: current_read_path || session.source.path)
  end

  private

  # Streams retained rows into the session's output file, writing the source
  # headers first when there are any.
  #
  # NOTE(review): the rescue below also covers the deduper's reads inside the
  # block, so an Errno::EACCES/ENOENT raised while reading an input is reported
  # as :cannot_write_output_file - confirm that this is intended.
  def write_file(session:, source_headers:)
    stats = nil
    ::CSV.open(
      session.output_destination.path,
      "w",
      write_headers: !source_headers.nil?,
      headers: source_headers,
      col_sep: session.source.separator
    ) do |csv|
      stats = @deduper.each_retained(**dedupe_options(session)) { |fields| csv << fields }
    end
    success(stats: stats, output_path: session.output_destination.path)
  rescue Errno::EACCES, Errno::ENOENT => e
    failure(:cannot_write_output_file, path: session.output_destination.path, error_class: e.class)
  end

  # Keyword arguments handed to the deduper, derived from the session.
  def dedupe_options(session)
    {
      source_path: session.source.path,
      reference_path: session.reference.path,
      source_selector: session.key_mapping.source_selector,
      reference_selector: session.key_mapping.reference_selector,
      source_col_sep: session.source.separator,
      reference_col_sep: session.reference.separator,
      match_options: session.match_options
    }
  end

  # Wraps +data+ (a hash) in a successful Result.
  def success(data)
    Result.new(ok: true, error: nil, data: data)
  end

  # Wraps a failure +code+ and optional details hash in a failed Result.
  def failure(code, data = {})
    Result.new(ok: false, error: code, data: data)
  end
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
@@ -19,8 +19,8 @@ require "csvtool/domain/column_session/column_selection"
|
|
|
19
19
|
require "csvtool/domain/column_session/extraction_options"
|
|
20
20
|
require "csvtool/domain/column_session/extraction_value"
|
|
21
21
|
require "csvtool/domain/column_session/preview"
|
|
22
|
-
require "csvtool/domain/column_session/output_destination"
|
|
23
22
|
require "csvtool/domain/column_session/column_session"
|
|
23
|
+
require "csvtool/domain/shared/output_destination"
|
|
24
24
|
|
|
25
25
|
module Csvtool
|
|
26
26
|
module Application
|
|
@@ -79,9 +79,9 @@ module Csvtool
|
|
|
79
79
|
return if output_destination.nil?
|
|
80
80
|
domain_destination =
|
|
81
81
|
if output_destination[:mode] == :file
|
|
82
|
-
Domain::
|
|
82
|
+
Domain::Shared::OutputDestination.file(path: output_destination[:path])
|
|
83
83
|
else
|
|
84
|
-
Domain::
|
|
84
|
+
Domain::Shared::OutputDestination.console
|
|
85
85
|
end
|
|
86
86
|
session = session.with_output_destination(domain_destination)
|
|
87
87
|
|
|
@@ -11,8 +11,8 @@ require "csvtool/infrastructure/output/csv_row_console_writer"
|
|
|
11
11
|
require "csvtool/infrastructure/output/csv_row_file_writer"
|
|
12
12
|
require "csvtool/domain/row_session/row_range"
|
|
13
13
|
require "csvtool/domain/row_session/row_source"
|
|
14
|
-
require "csvtool/domain/row_session/row_output_destination"
|
|
15
14
|
require "csvtool/domain/row_session/row_session"
|
|
15
|
+
require "csvtool/domain/shared/output_destination"
|
|
16
16
|
|
|
17
17
|
module Csvtool
|
|
18
18
|
module Application
|
|
@@ -56,9 +56,9 @@ module Csvtool
|
|
|
56
56
|
return if output_destination.nil?
|
|
57
57
|
destination =
|
|
58
58
|
if output_destination[:mode] == :file
|
|
59
|
-
Domain::
|
|
59
|
+
Domain::Shared::OutputDestination.file(path: output_destination[:path])
|
|
60
60
|
else
|
|
61
|
-
Domain::
|
|
61
|
+
Domain::Shared::OutputDestination.console
|
|
62
62
|
end
|
|
63
63
|
session = session.with_output_destination(destination)
|
|
64
64
|
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/interface/cli/errors/presenter"
|
|
5
|
+
require "csvtool/interface/cli/prompts/file_path_prompt"
|
|
6
|
+
require "csvtool/interface/cli/prompts/separator_prompt"
|
|
7
|
+
require "csvtool/interface/cli/prompts/headers_present_prompt"
|
|
8
|
+
require "csvtool/interface/cli/prompts/seed_prompt"
|
|
9
|
+
require "csvtool/interface/cli/prompts/output_destination_prompt"
|
|
10
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
11
|
+
require "csvtool/infrastructure/csv/row_randomizer"
|
|
12
|
+
require "csvtool/domain/row_randomization_session/randomization_source"
|
|
13
|
+
require "csvtool/domain/row_randomization_session/randomization_options"
|
|
14
|
+
require "csvtool/domain/row_randomization_session/randomization_session"
|
|
15
|
+
require "csvtool/domain/shared/output_destination"
|
|
16
|
+
|
|
17
|
+
module Csvtool
|
|
18
|
+
module Application
|
|
19
|
+
module UseCases
|
|
20
|
+
# Interactive use case: prompts for a CSV file, its separator, whether it has
# headers, a seed and an output destination, then emits the rows produced by
# the row randomizer to the console or to a file.
class RunRowRandomization
  def initialize(stdin:, stdout:)
    @stdin = stdin
    @stdout = stdout
    @errors = Interface::CLI::Errors::Presenter.new(stdout: stdout)
    @header_reader = Infrastructure::CSV::HeaderReader.new
    @row_randomizer = Infrastructure::CSV::RowRandomizer.new
  end

  # Walks through the prompts in order and stops early (after reporting through
  # the error presenter where applicable) as soon as one answer is unusable.
  def call
    file_path = Interface::CLI::Prompts::FilePathPrompt.new(stdin: @stdin, stdout: @stdout).call
    unless File.file?(file_path)
      return @errors.file_not_found(file_path)
    end

    separator = Interface::CLI::Prompts::SeparatorPrompt.new(stdin: @stdin, stdout: @stdout, errors: @errors).call
    return if separator.nil?

    has_headers = Interface::CLI::Prompts::HeadersPresentPrompt.new(stdin: @stdin, stdout: @stdout).call
    source = Domain::RowRandomizationSession::RandomizationSource.new(
      path: file_path,
      separator: separator,
      headers_present: has_headers
    )

    headers = nil
    if source.headers_present?
      headers = @header_reader.call(file_path: source.path, col_sep: source.separator)
      return @errors.no_headers if headers.empty?
    end

    seed = Interface::CLI::Prompts::SeedPrompt.new(stdin: @stdin, stdout: @stdout, errors: @errors).call
    return if seed == Interface::CLI::Prompts::SeedPrompt::INVALID

    session = Domain::RowRandomizationSession::RandomizationSession.start(
      source: source,
      options: Domain::RowRandomizationSession::RandomizationOptions.new(seed: seed)
    )

    choice = Interface::CLI::Prompts::OutputDestinationPrompt.new(
      stdin: @stdin,
      stdout: @stdout,
      errors: @errors
    ).call
    return if choice.nil?

    session = session.with_output_destination(destination_for(choice))

    shuffled = @row_randomizer.each(
      file_path: session.source.path,
      col_sep: session.source.separator,
      headers: session.source.headers_present?,
      seed: session.options.seed
    )

    target = session.output_destination
    if target.file?
      write_output_file(target.path, headers, shuffled, col_sep: session.source.separator)
    else
      print_to_console(headers, shuffled, col_sep: session.source.separator)
    end
  rescue CSV::MalformedCSVError
    @errors.could_not_parse_csv
  rescue ArgumentError => e
    # Only the empty-output-path error is translated; anything else propagates.
    raise e unless e.message == "file output path cannot be empty"

    @errors.empty_output_path
  rescue Errno::EACCES
    @errors.cannot_read_file(file_path)
  end

  private

  # Maps the prompt's answer hash onto a shared output destination.
  def destination_for(choice)
    return Domain::Shared::OutputDestination.console unless choice[:mode] == :file

    Domain::Shared::OutputDestination.file(path: choice[:path])
  end

  # Prints a blank line, then the header line (if any) and every row as
  # separator-joined CSV text.
  def print_to_console(headers, rows, col_sep:)
    as_line = ->(fields) { ::CSV.generate_line(fields, row_sep: "", col_sep: col_sep).chomp }

    @stdout.puts
    @stdout.puts as_line.call(headers) if headers
    rows.each { |fields| @stdout.puts as_line.call(fields) }
  end

  # Writes headers (when given) and rows to +path+, then confirms on stdout.
  # Permission and missing-path errors are reported through the presenter.
  def write_output_file(path, headers, rows, col_sep:)
    writer_options = { write_headers: !headers.nil?, headers: headers, col_sep: col_sep }
    ::CSV.open(path, "w", **writer_options) do |out|
      rows.each { |fields| out << fields }
    end
    @stdout.puts "Wrote output to #{path}"
  rescue Errno::EACCES, Errno::ENOENT => e
    @errors.cannot_write_output_file(path, e.class)
  end
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
data/lib/csvtool/cli.rb
CHANGED
|
@@ -4,6 +4,8 @@ require "csv"
|
|
|
4
4
|
require "csvtool/interface/cli/menu_loop"
|
|
5
5
|
require "csvtool/application/use_cases/run_extraction"
|
|
6
6
|
require "csvtool/application/use_cases/run_row_extraction"
|
|
7
|
+
require "csvtool/application/use_cases/run_row_randomization"
|
|
8
|
+
require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
|
|
7
9
|
require "csvtool/interface/cli/errors/presenter"
|
|
8
10
|
require "csvtool/infrastructure/csv/header_reader"
|
|
9
11
|
require "csvtool/infrastructure/csv/value_streamer"
|
|
@@ -14,6 +16,8 @@ module Csvtool
|
|
|
14
16
|
MENU_OPTIONS = [
|
|
15
17
|
"Extract column",
|
|
16
18
|
"Extract rows (range)",
|
|
19
|
+
"Randomize rows",
|
|
20
|
+
"Dedupe using another CSV",
|
|
17
21
|
"Exit"
|
|
18
22
|
].freeze
|
|
19
23
|
|
|
@@ -45,12 +49,16 @@ module Csvtool
|
|
|
45
49
|
def run_menu_loop
|
|
46
50
|
extract_column_action = -> { Application::UseCases::RunExtraction.new(stdin: @stdin, stdout: @stdout).call }
|
|
47
51
|
extract_rows_action = -> { Application::UseCases::RunRowExtraction.new(stdin: @stdin, stdout: @stdout).call }
|
|
52
|
+
randomize_rows_action = -> { Application::UseCases::RunRowRandomization.new(stdin: @stdin, stdout: @stdout).call }
|
|
53
|
+
dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
48
54
|
Interface::CLI::MenuLoop.new(
|
|
49
55
|
stdin: @stdin,
|
|
50
56
|
stdout: @stdout,
|
|
51
57
|
menu_options: MENU_OPTIONS,
|
|
52
58
|
extract_column_action: extract_column_action,
|
|
53
|
-
extract_rows_action: extract_rows_action
|
|
59
|
+
extract_rows_action: extract_rows_action,
|
|
60
|
+
randomize_rows_action: randomize_rows_action,
|
|
61
|
+
dedupe_action: dedupe_action
|
|
54
62
|
).run
|
|
55
63
|
end
|
|
56
64
|
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Identifies the key column of a CSV row: by header name when the file has
# headers, otherwise by 1-based position.
class ColumnSelector
  attr_reader :value

  # Builds a selector from raw user input.
  #
  # Raises ArgumentError when a header name is empty or when a positional
  # index is not a positive integer written in plain digits.
  def self.from_input(headers_present:, input:)
    text = input.to_s

    unless headers_present
      raise ArgumentError, "column index must be a positive integer" unless text.match?(/\A[1-9]\d*\z/)

      return new(value: input.to_i, headers_present: false)
    end

    raise ArgumentError, "column name cannot be empty" if text.empty?

    new(value: text, headers_present: true)
  end

  def initialize(value:, headers_present:)
    @value = value
    @headers_present = headers_present ? true : false
  end

  def headers_present?
    @headers_present
  end

  # True when the selector addresses the column by position.
  def index?
    !headers_present?
  end

  # Returns the selected cell as a String ("" when the cell is missing/nil).
  # Positional selectors are 1-based, hence the shift to a 0-based index.
  def extract_from(row)
    lookup = headers_present? ? @value : @value - 1
    row[lookup].to_s
  end
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
|
|
4
|
+
require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
|
|
5
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
6
|
+
require "csvtool/domain/shared/output_destination"
|
|
7
|
+
|
|
8
|
+
module Csvtool
|
|
9
|
+
module Domain
|
|
10
|
+
module CrossCsvDedupeSession
|
|
11
|
+
# Immutable bundle of everything one cross-CSV dedupe run needs: the source
# and reference profiles, the key mapping, the match options and (optionally)
# where the output goes.
class CrossCsvDedupeSession
  attr_reader :source, :reference, :key_mapping, :match_options, :output_destination

  # Starts a session without an output destination.
  def self.start(source:, reference:, key_mapping:, match_options:)
    new(source: source, reference: reference, key_mapping: key_mapping, match_options: match_options)
  end

  # Raises ArgumentError when any argument is not of the expected type.
  # Checks run in declaration order, so the first offending argument wins.
  def initialize(source:, reference:, key_mapping:, match_options:, output_destination: nil)
    require_kind(source, CsvProfile, "source must be CsvProfile")
    require_kind(reference, CsvProfile, "reference must be CsvProfile")
    require_kind(key_mapping, KeyMapping, "key_mapping must be KeyMapping")
    require_kind(match_options, MatchOptions, "match_options must be MatchOptions")
    # The destination is optional: only a non-nil value is type-checked.
    if !output_destination.nil? && !output_destination.is_a?(Domain::Shared::OutputDestination)
      raise ArgumentError, "output_destination must be OutputDestination or nil"
    end

    @source = source
    @reference = reference
    @key_mapping = key_mapping
    @match_options = match_options
    @output_destination = output_destination
  end

  # Returns a new session identical to this one except for its destination.
  def with_output_destination(destination)
    self.class.new(
      source: @source,
      reference: @reference,
      key_mapping: @key_mapping,
      match_options: @match_options,
      output_destination: destination
    )
  end

  private

  # Raises ArgumentError with +message+ unless +candidate+ is a +type+.
  def require_kind(candidate, type, message)
    raise ArgumentError, message unless candidate.is_a?(type)
  end
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Describes one CSV file taking part in a dedupe: where it lives, which
# separator it uses and whether its first row holds headers.
class CsvProfile
  attr_reader :path, :separator

  # Raises ArgumentError when path or separator is nil or an empty string
  # (both are checked through their String form).
  def initialize(path:, separator:, headers_present:)
    path_text = path.to_s
    raise ArgumentError, "path cannot be empty" if path_text.empty?

    separator_text = separator.to_s
    raise ArgumentError, "separator cannot be empty" if separator_text.empty?

    @path = path
    @separator = separator
    # Stored as a strict boolean regardless of what was passed in.
    @headers_present = headers_present ? true : false
  end

  def headers_present?
    @headers_present
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/cross_csv_dedupe_session/column_selector"
|
|
4
|
+
|
|
5
|
+
module Csvtool
|
|
6
|
+
module Domain
|
|
7
|
+
module CrossCsvDedupeSession
|
|
8
|
+
# Pairs the key-column selector of the source file with the key-column
# selector of the reference file.
class KeyMapping
  attr_reader :source_selector, :reference_selector

  # Raises ArgumentError unless both arguments are ColumnSelector instances.
  def initialize(source_selector:, reference_selector:)
    candidates = [source_selector, reference_selector]
    raise ArgumentError, "selectors must be ColumnSelector" unless candidates.all? { |candidate| candidate.is_a?(ColumnSelector) }

    @source_selector, @reference_selector = candidates
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Controls how key values are normalized before they are compared:
# optional whitespace trimming and optional case folding via downcase.
class MatchOptions
  attr_reader :trim_whitespace, :case_insensitive

  # Both flags are stored as strict booleans.
  def initialize(trim_whitespace:, case_insensitive:)
    @trim_whitespace = trim_whitespace ? true : false
    @case_insensitive = case_insensitive ? true : false
  end

  def trim_whitespace?
    @trim_whitespace
  end

  def case_insensitive?
    @case_insensitive
  end

  # Returns +value+ as a String with the enabled normalizations applied:
  # surrounding whitespace stripped first, then lower-cased.
  def normalize(value)
    text = value.to_s
    text = text.strip if trim_whitespace?
    text = text.downcase if case_insensitive?
    text
  end
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module RowRandomizationSession
|
|
6
|
+
# Options for a row randomization run; currently just the optional seed.
class RandomizationOptions
  attr_reader :seed

  # Accepts an Integer seed or nil; anything else raises ArgumentError.
  def initialize(seed:)
    case seed
    when nil, Integer
      @seed = seed
    else
      raise ArgumentError, "seed must be an integer or nil"
    end
  end
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module RowRandomizationSession
|
|
6
|
+
# Value object tying together the source, the options and (once chosen) the
# output destination of one row randomization run.
class RandomizationSession
  attr_reader :source, :options, :output_destination

  # Starts a session that has no output destination yet.
  def self.start(source:, options:)
    new(source: source, options: options, output_destination: nil)
  end

  def initialize(source:, options:, output_destination: nil)
    @source = source
    @options = options
    @output_destination = output_destination
  end

  # Returns a copy of this session pointing at +destination+; the receiver
  # itself is left unchanged.
  def with_output_destination(destination)
    copy_arguments = { source: @source, options: @options, output_destination: destination }
    self.class.new(**copy_arguments)
  end
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module RowRandomizationSession
|
|
6
|
+
# Describes the CSV file whose rows are to be randomized: its path, its
# column separator and whether its first row holds headers.
class RandomizationSource
  attr_reader :path, :separator

  # Raises ArgumentError when path or separator is nil or an empty string
  # (both are checked through their String form).
  def initialize(path:, separator:, headers_present:)
    raise ArgumentError, "path cannot be empty" if path.to_s.empty?
    raise ArgumentError, "separator cannot be empty" if separator.to_s.empty?

    @path = path
    @separator = separator
    # Coerce to a strict boolean so the predicate below never leaks nil or an
    # arbitrary truthy object (matches CsvProfile's handling of the same flag).
    @headers_present = !!headers_present
  end

  # Always true or false.
  def headers_present?
    @headers_present
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -7,6 +7,9 @@ module Csvtool
|
|
|
7
7
|
attr_reader :path, :separator
|
|
8
8
|
|
|
9
9
|
def initialize(path:, separator:)
|
|
10
|
+
raise ArgumentError, "path cannot be empty" if path.to_s.empty?
|
|
11
|
+
raise ArgumentError, "separator cannot be empty" if separator.to_s.empty?
|
|
12
|
+
|
|
10
13
|
@path = path
|
|
11
14
|
@separator = separator
|
|
12
15
|
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "set"
|
|
5
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
6
|
+
|
|
7
|
+
module Csvtool
|
|
8
|
+
module Infrastructure
|
|
9
|
+
module CSV
|
|
10
|
+
# Drops rows from a source CSV whose normalized key value also occurs in the
# key column of a reference CSV.
#
# Selectors must respond to +headers_present?+ and +extract_from(row)+;
# match options must respond to +normalize(value)+.
class CrossCsvDeduper
  # Runs the dedupe and collects the retained rows in memory.
  #
  # Returns the stats hash of #each_retained merged with +kept_rows+, an
  # array holding the field arrays of every retained row.
  def call(
    source_path:,
    reference_path:,
    source_selector:,
    reference_selector:,
    source_col_sep: ",",
    reference_col_sep: ",",
    match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
  )
    kept_rows = []
    stats = each_retained(
      source_path: source_path,
      reference_path: reference_path,
      source_selector: source_selector,
      reference_selector: reference_selector,
      source_col_sep: source_col_sep,
      reference_col_sep: reference_col_sep,
      match_options: match_options
    ) { |fields| kept_rows << fields }

    stats.merge(kept_rows: kept_rows)
  end

  # Streams the source file and yields the fields of each row whose key is
  # NOT present in the reference file. The block is optional.
  #
  # The reference file is read completely first (its keys are held in a Set);
  # the source file is then processed row by row.
  #
  # Returns a hash with:
  #   headers         - the source headers (nil when the source has none,
  #                     [] when the source file is completely empty)
  #   source_rows     - number of source data rows seen
  #   removed_rows    - rows dropped because their key matched
  #   kept_rows_count - rows retained
  def each_retained(
    source_path:,
    reference_path:,
    source_selector:,
    reference_selector:,
    source_col_sep: ",",
    reference_col_sep: ",",
    match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
  )
    source_has_headers = source_selector.headers_present?
    reference_has_headers = reference_selector.headers_present?

    reference_keys = Set.new
    ::CSV.foreach(reference_path, headers: reference_has_headers, col_sep: reference_col_sep) do |row|
      reference_keys << extract_key(row, selector: reference_selector, match_options: match_options)
    end

    source_header_row = nil
    source_rows = 0
    removed_rows = 0
    kept_rows_count = 0

    ::CSV.open(source_path, "r", headers: source_has_headers, col_sep: source_col_sep) do |csv|
      csv.each do |row|
        source_rows += 1
        key = extract_key(row, selector: source_selector, match_options: match_options)
        if reference_keys.include?(key)
          removed_rows += 1
        else
          kept_rows_count += 1
          yield(source_has_headers ? row.fields : row) if block_given?
        end
      end

      if source_has_headers
        # Ask the reader itself for the headers so that a file consisting of
        # only a header row still reports them. CSV#headers returns true
        # (not an Array) when no header row was ever read, e.g. an empty file.
        parsed_headers = csv.headers
        source_header_row = parsed_headers if parsed_headers.is_a?(Array)
      end
    end

    {
      headers: source_has_headers ? (source_header_row || []) : nil,
      source_rows: source_rows,
      removed_rows: removed_rows,
      kept_rows_count: kept_rows_count
    }
  end

  private

  # Pulls the key cell out of +row+ and normalizes it for comparison.
  def extract_key(row, selector:, match_options:)
    match_options.normalize(selector.extract_from(row))
  end
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|