csvops 0.3.0.alpha → 0.5.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +69 -149
- data/docs/architecture.md +396 -0
- data/docs/release-v0.4.0-alpha.md +87 -0
- data/docs/release-v0.5.0-alpha.md +89 -0
- data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +96 -0
- data/lib/csvtool/application/use_cases/run_extraction.rb +63 -88
- data/lib/csvtool/application/use_cases/run_row_extraction.rb +45 -73
- data/lib/csvtool/application/use_cases/run_row_randomization.rb +56 -73
- data/lib/csvtool/cli.rb +11 -7
- data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +1 -0
- data/lib/csvtool/domain/row_session/row_source.rb +3 -0
- data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
- data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
- data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
- data/lib/csvtool/infrastructure/output/csv_cross_csv_dedupe_file_writer.rb +23 -0
- data/lib/csvtool/infrastructure/output/csv_file_writer.rb +1 -7
- data/lib/csvtool/infrastructure/output/csv_randomized_row_file_writer.rb +23 -0
- data/lib/csvtool/infrastructure/output/csv_row_file_writer.rb +2 -9
- data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
- data/lib/csvtool/interface/cli/prompts/dedupe_key_selector_prompt.rb +30 -0
- data/lib/csvtool/interface/cli/prompts/file_path_prompt.rb +4 -2
- data/lib/csvtool/interface/cli/prompts/headers_present_prompt.rb +4 -2
- data/lib/csvtool/interface/cli/prompts/separator_prompt.rb +4 -2
- data/lib/csvtool/interface/cli/prompts/yes_no_prompt.rb +26 -0
- data/lib/csvtool/interface/cli/workflows/builders/column_session_builder.rb +32 -0
- data/lib/csvtool/interface/cli/workflows/builders/cross_csv_dedupe_session_builder.rb +35 -0
- data/lib/csvtool/interface/cli/workflows/builders/row_extraction_session_builder.rb +22 -0
- data/lib/csvtool/interface/cli/workflows/builders/row_randomization_session_builder.rb +28 -0
- data/lib/csvtool/interface/cli/workflows/presenters/column_extraction_presenter.rb +25 -0
- data/lib/csvtool/interface/cli/workflows/presenters/cross_csv_dedupe_presenter.rb +39 -0
- data/lib/csvtool/interface/cli/workflows/presenters/row_extraction_presenter.rb +34 -0
- data/lib/csvtool/interface/cli/workflows/presenters/row_randomization_presenter.rb +34 -0
- data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +86 -0
- data/lib/csvtool/interface/cli/workflows/run_extraction_workflow.rb +88 -0
- data/lib/csvtool/interface/cli/workflows/run_row_extraction_workflow.rb +86 -0
- data/lib/csvtool/interface/cli/workflows/run_row_randomization_workflow.rb +80 -0
- data/lib/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/collect_options_step.rb +55 -0
- data/lib/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/collect_profiles_step.rb +52 -0
- data/lib/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/execute_step.rb +34 -0
- data/lib/csvtool/interface/cli/workflows/steps/extraction/build_preview_step.rb +40 -0
- data/lib/csvtool/interface/cli/workflows/steps/extraction/collect_destination_step.rb +28 -0
- data/lib/csvtool/interface/cli/workflows/steps/extraction/collect_inputs_step.rb +47 -0
- data/lib/csvtool/interface/cli/workflows/steps/extraction/execute_step.rb +32 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_extraction/collect_destination_step.rb +33 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_extraction/collect_range_step.rb +35 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_extraction/collect_source_step.rb +32 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_extraction/execute_step.rb +43 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_extraction/read_headers_step.rb +29 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_randomization/collect_destination_step.rb +34 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_randomization/collect_inputs_step.rb +49 -0
- data/lib/csvtool/interface/cli/workflows/steps/row_randomization/execute_step.rb +37 -0
- data/lib/csvtool/interface/cli/workflows/steps/workflow_step_pipeline.rb +25 -0
- data/lib/csvtool/interface/cli/workflows/support/output_destination_mapper.rb +23 -0
- data/lib/csvtool/interface/cli/workflows/support/result_error_handler.rb +22 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/io_boundary_test.rb +26 -0
- data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +141 -0
- data/test/csvtool/application/use_cases/run_extraction_test.rb +72 -16
- data/test/csvtool/application/use_cases/run_row_extraction_test.rb +82 -102
- data/test/csvtool/application/use_cases/run_row_randomization_test.rb +96 -86
- data/test/csvtool/cli_test.rb +130 -16
- data/test/csvtool/cli_unit_test.rb +16 -3
- data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
- data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
- data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +2 -2
- data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +15 -1
- data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
- data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
- data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
- data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
- data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
- data/test/csvtool/infrastructure/output/csv_cross_csv_dedupe_file_writer_test.rb +32 -0
- data/test/csvtool/infrastructure/output/csv_file_writer_test.rb +0 -4
- data/test/csvtool/infrastructure/output/csv_randomized_row_file_writer_test.rb +32 -0
- data/test/csvtool/infrastructure/output/csv_row_file_writer_test.rb +1 -4
- data/test/csvtool/interface/cli/menu_loop_test.rb +50 -13
- data/test/csvtool/interface/cli/prompts/dedupe_key_selector_prompt_test.rb +30 -0
- data/test/csvtool/interface/cli/prompts/file_path_prompt_test.rb +9 -0
- data/test/csvtool/interface/cli/prompts/headers_present_prompt_test.rb +10 -0
- data/test/csvtool/interface/cli/prompts/separator_prompt_test.rb +10 -0
- data/test/csvtool/interface/cli/prompts/yes_no_prompt_test.rb +22 -0
- data/test/csvtool/interface/cli/workflows/builders/column_session_builder_test.rb +17 -0
- data/test/csvtool/interface/cli/workflows/builders/cross_csv_dedupe_session_builder_test.rb +36 -0
- data/test/csvtool/interface/cli/workflows/builders/row_extraction_session_builder_test.rb +21 -0
- data/test/csvtool/interface/cli/workflows/builders/row_randomization_session_builder_test.rb +26 -0
- data/test/csvtool/interface/cli/workflows/presenters/column_extraction_presenter_test.rb +24 -0
- data/test/csvtool/interface/cli/workflows/presenters/cross_csv_dedupe_presenter_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/presenters/row_extraction_presenter_test.rb +33 -0
- data/test/csvtool/interface/cli/workflows/presenters/row_randomization_presenter_test.rb +33 -0
- data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
- data/test/csvtool/interface/cli/workflows/run_extraction_workflow_test.rb +56 -0
- data/test/csvtool/interface/cli/workflows/run_row_extraction_workflow_test.rb +83 -0
- data/test/csvtool/interface/cli/workflows/run_row_randomization_workflow_test.rb +69 -0
- data/test/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/collect_options_step_test.rb +41 -0
- data/test/csvtool/interface/cli/workflows/steps/extraction/collect_inputs_step_test.rb +66 -0
- data/test/csvtool/interface/cli/workflows/steps/row_extraction/collect_source_step_test.rb +39 -0
- data/test/csvtool/interface/cli/workflows/steps/row_extraction/execute_step_test.rb +91 -0
- data/test/csvtool/interface/cli/workflows/steps/row_extraction/read_headers_step_test.rb +57 -0
- data/test/csvtool/interface/cli/workflows/steps/row_randomization/collect_inputs_step_test.rb +37 -0
- data/test/csvtool/interface/cli/workflows/steps/workflow_step_pipeline_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/support/output_destination_mapper_test.rb +23 -0
- data/test/csvtool/interface/cli/workflows/support/result_error_handler_test.rb +34 -0
- data/test/fixtures/dedupe_reference.csv +3 -0
- data/test/fixtures/dedupe_reference.tsv +3 -0
- data/test/fixtures/dedupe_reference_all.csv +5 -0
- data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
- data/test/fixtures/dedupe_reference_none.csv +2 -0
- data/test/fixtures/dedupe_reference_normalization.csv +3 -0
- data/test/fixtures/dedupe_source.csv +6 -0
- data/test/fixtures/dedupe_source.tsv +6 -0
- data/test/fixtures/dedupe_source_no_headers.csv +5 -0
- data/test/fixtures/dedupe_source_normalization.csv +4 -0
- metadata +93 -8
- data/lib/csvtool/domain/row_randomization_session/randomization_output_destination.rb +0 -31
- data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
- data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
- data/test/csvtool/domain/row_randomization_session/randomization_output_destination_test.rb +0 -21
- data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
|
@@ -1,103 +1,86 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "csv"
|
|
4
|
-
require "csvtool/interface/cli/errors/presenter"
|
|
5
|
-
require "csvtool/interface/cli/prompts/file_path_prompt"
|
|
6
|
-
require "csvtool/interface/cli/prompts/separator_prompt"
|
|
7
|
-
require "csvtool/interface/cli/prompts/headers_present_prompt"
|
|
8
|
-
require "csvtool/interface/cli/prompts/seed_prompt"
|
|
9
|
-
require "csvtool/interface/cli/prompts/output_destination_prompt"
|
|
10
4
|
require "csvtool/infrastructure/csv/header_reader"
|
|
11
5
|
require "csvtool/infrastructure/csv/row_randomizer"
|
|
12
|
-
require "csvtool/
|
|
13
|
-
require "csvtool/domain/row_randomization_session/randomization_options"
|
|
14
|
-
require "csvtool/domain/row_randomization_session/randomization_output_destination"
|
|
15
|
-
require "csvtool/domain/row_randomization_session/randomization_session"
|
|
6
|
+
require "csvtool/infrastructure/output/csv_randomized_row_file_writer"
|
|
16
7
|
|
|
17
8
|
module Csvtool
|
|
18
9
|
module Application
|
|
19
10
|
module UseCases
|
|
20
11
|
# Use case for shuffling the rows of a CSV file.
#
# The public methods perform no console I/O and do not raise for the file
# and parse errors they rescue; they report the outcome as a Result whose
# +error+ is a symbol code and whose +data+ is a hash of details.
class RunRowRandomization
  # Outcome of a call: +ok+ (boolean), +error+ (symbol code or nil) and
  # +data+ (hash of details).
  Result = Struct.new(:ok, :error, :data, keyword_init: true) do
    def ok?
      ok
    end
  end

  # All collaborators are injectable. The file writer defaults to one that
  # shares this instance's row randomizer.
  def initialize(
    header_reader: Infrastructure::CSV::HeaderReader.new,
    row_randomizer: Infrastructure::CSV::RowRandomizer.new,
    csv_randomized_row_file_writer: nil
  )
    @header_reader = header_reader
    @row_randomizer = row_randomizer
    @csv_randomized_row_file_writer = csv_randomized_row_file_writer ||
      Infrastructure::Output::CsvRandomizedRowFileWriter.new(row_randomizer: @row_randomizer)
  end

  # Reads the header row of +file_path+ when +headers_present+ is truthy.
  #
  # Returns success with data {headers: <array or nil>}, or a failure with
  # one of: :file_not_found, :no_headers, :could_not_parse_csv,
  # :cannot_read_file.
  def read_headers(file_path:, col_sep:, headers_present:)
    return failure(:file_not_found, path: file_path) unless File.file?(file_path)

    headers = nil
    if headers_present
      headers = @header_reader.call(file_path: file_path, col_sep: col_sep)
      return failure(:no_headers) if headers.empty?
    end

    success(headers: headers)
  rescue CSV::MalformedCSVError
    failure(:could_not_parse_csv)
  rescue Errno::ENOENT
    # The file disappeared between the File.file? check and the read.
    failure(:file_not_found, path: file_path)
  rescue Errno::EACCES
    failure(:cannot_read_file, path: file_path)
  end

  # Randomizes the rows described by +session+.
  #
  # When the session's output destination is a file, rows are written through
  # the injected file writer and data is {output_path: <path>}. Otherwise each
  # row is passed to +on_row+ (if given) and data is {}.
  #
  # Failures: :could_not_parse_csv, :cannot_write_output_file,
  # :cannot_read_file.
  def randomize(session:, headers:, on_row: nil)
    if session.output_destination.file?
      @csv_randomized_row_file_writer.call(
        path: session.output_destination.path,
        headers: headers,
        file_path: session.source.path,
        col_sep: session.source.separator,
        headers_present: session.source.headers_present?,
        seed: session.options.seed
      )
      success(output_path: session.output_destination.path)
    else
      @row_randomizer.each(
        file_path: session.source.path,
        col_sep: session.source.separator,
        headers: session.source.headers_present?,
        seed: session.options.seed
      ) { |fields| on_row.call(fields) if on_row }
      success({})
    end
  rescue CSV::MalformedCSVError
    failure(:could_not_parse_csv)
  rescue Errno::EACCES, Errno::ENOENT => e
    source_path = session.source.path
    # In file mode the error can come from either side: the writer opens the
    # destination and also reads the source. Only blame the destination when
    # the source is still a readable regular file.
    source_readable = File.file?(source_path) && File.readable?(source_path)
    if session.output_destination.file? && source_readable
      failure(:cannot_write_output_file, path: session.output_destination.path, error_class: e.class)
    else
      failure(:cannot_read_file, path: source_path)
    end
  end

  private

  # Builds a successful Result carrying +data+.
  def success(data)
    Result.new(ok: true, error: nil, data: data)
  end

  # Builds a failed Result with error +code+ and optional detail +data+.
  def failure(code, data = {})
    Result.new(ok: false, error: code, data: data)
  end
end
|
|
103
86
|
end
|
data/lib/csvtool/cli.rb
CHANGED
|
@@ -2,9 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
require "csv"
|
|
4
4
|
require "csvtool/interface/cli/menu_loop"
|
|
5
|
-
require "csvtool/
|
|
6
|
-
require "csvtool/
|
|
7
|
-
require "csvtool/
|
|
5
|
+
require "csvtool/interface/cli/workflows/run_extraction_workflow"
|
|
6
|
+
require "csvtool/interface/cli/workflows/run_row_extraction_workflow"
|
|
7
|
+
require "csvtool/interface/cli/workflows/run_row_randomization_workflow"
|
|
8
|
+
require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
|
|
8
9
|
require "csvtool/interface/cli/errors/presenter"
|
|
9
10
|
require "csvtool/infrastructure/csv/header_reader"
|
|
10
11
|
require "csvtool/infrastructure/csv/value_streamer"
|
|
@@ -16,6 +17,7 @@ module Csvtool
|
|
|
16
17
|
"Extract column",
|
|
17
18
|
"Extract rows (range)",
|
|
18
19
|
"Randomize rows",
|
|
20
|
+
"Dedupe using another CSV",
|
|
19
21
|
"Exit"
|
|
20
22
|
].freeze
|
|
21
23
|
|
|
@@ -45,16 +47,18 @@ module Csvtool
|
|
|
45
47
|
private
|
|
46
48
|
|
|
47
49
|
# Builds one lambda per menu entry and runs the menu loop with them.
# Each lambda instantiates its workflow with this CLI's stdin/stdout at the
# moment the entry is chosen and invokes it.
def run_menu_loop
  launch = ->(workflow_class) { workflow_class.new(stdin: @stdin, stdout: @stdout).call }
  actions = {
    extract_column_action: -> { launch.call(Interface::CLI::Workflows::RunExtractionWorkflow) },
    extract_rows_action: -> { launch.call(Interface::CLI::Workflows::RunRowExtractionWorkflow) },
    randomize_rows_action: -> { launch.call(Interface::CLI::Workflows::RunRowRandomizationWorkflow) },
    dedupe_action: -> { launch.call(Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow) }
  }

  Interface::CLI::MenuLoop.new(
    stdin: @stdin,
    stdout: @stdout,
    menu_options: MENU_OPTIONS,
    **actions
  ).run
end
|
|
60
64
|
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Identifies the key column of a CSV: by header name when the file has
# headers, otherwise by 1-based position.
class ColumnSelector
  attr_reader :value

  # Builds a selector from raw user input.
  #
  # With headers the input is kept as a string and must not be empty; without
  # headers it must be a positive integer literal and is stored as an Integer.
  # Raises ArgumentError when the input does not satisfy the active mode.
  def self.from_input(headers_present:, input:)
    text = input.to_s

    unless headers_present
      raise ArgumentError, "column index must be a positive integer" unless /\A[1-9]\d*\z/.match?(text)

      return new(value: input.to_i, headers_present: false)
    end

    raise ArgumentError, "column name cannot be empty" if text.empty?

    new(value: text, headers_present: true)
  end

  def initialize(value:, headers_present:)
    @value = value
    @headers_present = headers_present ? true : false
  end

  # True when the selector addresses a column by header name.
  def headers_present?
    @headers_present
  end

  # True when the selector addresses a column by 1-based position.
  def index?
    !headers_present?
  end

  # Returns the selected cell of +row+ as a string ("" when the cell is nil).
  def extract_from(row)
    # Positional selectors are 1-based; rows are indexed from 0.
    key = index? ? @value - 1 : @value
    row[key].to_s
  end
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
|
|
4
|
+
require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
|
|
5
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
6
|
+
require "csvtool/domain/shared/output_destination"
|
|
7
|
+
|
|
8
|
+
module Csvtool
|
|
9
|
+
module Domain
|
|
10
|
+
module CrossCsvDedupeSession
|
|
11
|
+
# Holds the validated inputs of one cross-CSV dedupe run: the source and
# reference profiles, the key mapping, the match options and, once chosen,
# the output destination.
class CrossCsvDedupeSession
  attr_reader :source, :reference, :key_mapping, :match_options, :output_destination

  # Starts a session that has no output destination yet.
  def self.start(source:, reference:, key_mapping:, match_options:)
    new(source: source, reference: reference, key_mapping: key_mapping, match_options: match_options)
  end

  # Raises ArgumentError when any argument is not of the expected type.
  def initialize(source:, reference:, key_mapping:, match_options:, output_destination: nil)
    [
      [source, CsvProfile, "source must be CsvProfile"],
      [reference, CsvProfile, "reference must be CsvProfile"],
      [key_mapping, KeyMapping, "key_mapping must be KeyMapping"],
      [match_options, MatchOptions, "match_options must be MatchOptions"]
    ].each do |candidate, expected_type, message|
      raise ArgumentError, message unless candidate.is_a?(expected_type)
    end

    if output_destination && !output_destination.is_a?(Domain::Shared::OutputDestination)
      raise ArgumentError, "output_destination must be OutputDestination or nil"
    end

    @source = source
    @reference = reference
    @key_mapping = key_mapping
    @match_options = match_options
    @output_destination = output_destination
  end

  # Returns a new session identical to this one except for its destination;
  # the receiver is left untouched.
  def with_output_destination(destination)
    self.class.new(
      source: source,
      reference: reference,
      key_mapping: key_mapping,
      match_options: match_options,
      output_destination: destination
    )
  end
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Describes how to read one CSV file: its path, its column separator and
# whether its first row is a header row.
class CsvProfile
  attr_reader :path, :separator

  # Raises ArgumentError when +path+ or +separator+ is nil or empty.
  def initialize(path:, separator:, headers_present:)
    if path.to_s.empty?
      raise ArgumentError, "path cannot be empty"
    elsif separator.to_s.empty?
      raise ArgumentError, "separator cannot be empty"
    end

    @path, @separator = path, separator
    # Stored as a strict boolean regardless of what the caller passed.
    @headers_present = headers_present ? true : false
  end

  def headers_present?
    @headers_present
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/cross_csv_dedupe_session/column_selector"
|
|
4
|
+
|
|
5
|
+
module Csvtool
|
|
6
|
+
module Domain
|
|
7
|
+
module CrossCsvDedupeSession
|
|
8
|
+
# Pairs the selector of the key column in the source CSV with the selector
# of the key column in the reference CSV.
class KeyMapping
  attr_reader :source_selector, :reference_selector

  # Raises ArgumentError unless both arguments are ColumnSelector instances.
  def initialize(source_selector:, reference_selector:)
    both_valid = [source_selector, reference_selector].all? { |selector| selector.is_a?(ColumnSelector) }
    raise ArgumentError, "selectors must be ColumnSelector" unless both_valid

    @source_selector, @reference_selector = source_selector, reference_selector
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Controls how key values are normalized before two CSVs are compared.
class MatchOptions
  attr_reader :trim_whitespace, :case_insensitive

  # Both flags are stored as strict booleans.
  def initialize(trim_whitespace:, case_insensitive:)
    @trim_whitespace = !!trim_whitespace
    @case_insensitive = !!case_insensitive
  end

  def trim_whitespace?
    @trim_whitespace
  end

  def case_insensitive?
    @case_insensitive
  end

  # Returns the comparison form of +value+: converted to a string, stripped
  # of surrounding whitespace when trimming is enabled, and case-folded when
  # matching is case-insensitive.
  def normalize(value)
    normalized = trim_whitespace? ? value.to_s.strip : value.to_s
    # Unicode case folding (not plain lowercasing) so that case-equivalent
    # spellings such as "Straße" and "STRASSE" compare equal.
    case_insensitive? ? normalized.downcase(:fold) : normalized
  end
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -7,6 +7,9 @@ module Csvtool
|
|
|
7
7
|
attr_reader :path, :separator
|
|
8
8
|
|
|
9
9
|
# Stores the CSV path and column separator.
# Raises ArgumentError when either is nil or empty.
def initialize(path:, separator:)
  if path.to_s.empty?
    raise ArgumentError, "path cannot be empty"
  elsif separator.to_s.empty?
    raise ArgumentError, "separator cannot be empty"
  end

  @path, @separator = path, separator
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "set"
|
|
5
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
6
|
+
|
|
7
|
+
module Csvtool
|
|
8
|
+
module Infrastructure
|
|
9
|
+
module CSV
|
|
10
|
+
# Removes from a source CSV every row whose key also appears in a reference
# CSV. Keys are read through the given selectors and normalized with
# +match_options+ before they are compared.
class CrossCsvDeduper
  # Runs the dedupe and collects the retained rows in memory.
  #
  # Returns the statistics hash of #each_retained with an extra :kept_rows
  # entry holding the retained rows as arrays of fields.
  def call(
    source_path:,
    reference_path:,
    source_selector:,
    reference_selector:,
    source_col_sep: ",",
    reference_col_sep: ",",
    match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
  )
    kept_rows = []
    stats = each_retained(
      source_path: source_path,
      reference_path: reference_path,
      source_selector: source_selector,
      reference_selector: reference_selector,
      source_col_sep: source_col_sep,
      reference_col_sep: reference_col_sep,
      match_options: match_options
    ) { |fields| kept_rows << fields }

    stats.merge(kept_rows: kept_rows)
  end

  # Streams the source file and yields the fields of every retained row.
  #
  # Returns a hash with:
  #   :headers         - source header names; [] when the source has a header
  #                      flag but no header row could be read; nil when the
  #                      source selector is positional
  #   :source_rows     - number of data rows read from the source
  #   :removed_rows    - rows dropped because their key is in the reference
  #   :kept_rows_count - rows retained
  #
  # Whether each file has a header row is taken from its selector.
  def each_retained(
    source_path:,
    reference_path:,
    source_selector:,
    reference_selector:,
    source_col_sep: ",",
    reference_col_sep: ",",
    match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
  )
    source_has_headers = source_selector.headers_present?
    reference_keys = reference_key_set(
      reference_path,
      selector: reference_selector,
      col_sep: reference_col_sep,
      match_options: match_options
    )

    source_header_row = nil
    source_rows = 0
    removed_rows = 0
    kept_rows_count = 0

    ::CSV.open(source_path, "r", headers: source_has_headers, col_sep: source_col_sep) do |csv|
      csv.each do |row|
        source_rows += 1
        key = extract_key(row, selector: source_selector, match_options: match_options)
        if reference_keys.include?(key)
          removed_rows += 1
        else
          kept_rows_count += 1
          yield(source_has_headers ? row.fields : row) if block_given?
        end
      end

      # Ask the reader rather than a yielded row, so a source that has a
      # header row but no data rows still reports its headers. The reader
      # returns a non-array placeholder when no header row was ever read.
      if source_has_headers
        parsed = csv.headers
        source_header_row = parsed.is_a?(Array) ? parsed : []
      end
    end

    {
      headers: source_header_row,
      source_rows: source_rows,
      removed_rows: removed_rows,
      kept_rows_count: kept_rows_count
    }
  end

  private

  # Reads the whole reference file and returns the set of its normalized keys.
  def reference_key_set(path, selector:, col_sep:, match_options:)
    keys = Set.new
    ::CSV.foreach(path, headers: selector.headers_present?, col_sep: col_sep) do |row|
      keys << extract_key(row, selector: selector, match_options: match_options)
    end
    keys
  end

  # Extracts the key cell of +row+ and normalizes it for comparison.
  def extract_key(row, selector:, match_options:)
    match_options.normalize(selector.extract_from(row))
  end
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
5
|
+
|
|
6
|
+
module Csvtool
|
|
7
|
+
module Infrastructure
|
|
8
|
+
module CSV
|
|
9
|
+
# Checks that a column selector actually refers to a column of the CSV
# described by a profile.
class SelectorValidator
  def initialize(header_reader: HeaderReader.new)
    @header_reader = header_reader
  end

  # Returns true when +selector+ names an existing header (header mode) or
  # when its 1-based index fits within the first row (positional mode).
  def valid?(profile:, selector:)
    return known_header?(profile, selector) if selector.headers_present?

    index_within_first_row?(profile, selector)
  end

  private

  # True when the file has headers and one of them equals the selector value.
  def known_header?(profile, selector)
    header_names = @header_reader.call(file_path: profile.path, col_sep: profile.separator)
    !header_names.empty? && header_names.include?(selector.value)
  end

  # True when the file has a first row with at least selector.value fields.
  def index_within_first_row?(profile, selector)
    leading_row = ::CSV.open(profile.path, "r", headers: false, col_sep: profile.separator) { |csv| csv.first }
    !leading_row.nil? && selector.value <= leading_row.length
  end
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module Csvtool
|
|
6
|
+
module Infrastructure
|
|
7
|
+
module Output
|
|
8
|
+
# Writes the rows retained by a cross-CSV dedupe straight to a CSV file.
class CsvCrossCsvDedupeFileWriter
  def initialize(deduper:)
    @deduper = deduper
  end

  # Opens +path+ for writing and appends every row the deduper yields.
  # A header row is configured only when +headers+ is not nil.
  #
  # Returns whatever the deduper's each_retained returns.
  def call(path:, headers:, col_sep:, dedupe_options:)
    writer_options = { write_headers: !headers.nil?, headers: headers, col_sep: col_sep }

    # The block form of CSV.open returns the block's value.
    ::CSV.open(path, "w", **writer_options) do |output|
      @deduper.each_retained(**dedupe_options) { |fields| output << fields }
    end
  end
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -6,9 +6,7 @@ module Csvtool
|
|
|
6
6
|
module Infrastructure
|
|
7
7
|
module Output
|
|
8
8
|
class CsvFileWriter
|
|
9
|
-
# Keeps the value streamer this writer uses; nothing else is stored.
def initialize(value_streamer:)
  @value_streamer = value_streamer
end
|
|
14
12
|
|
|
@@ -19,10 +17,6 @@ module Csvtool
|
|
|
19
17
|
csv << [value]
|
|
20
18
|
end
|
|
21
19
|
end
|
|
22
|
-
|
|
23
|
-
@stdout.puts "Wrote output to #{output_path}"
|
|
24
|
-
rescue Errno::EACCES, Errno::ENOENT => e
|
|
25
|
-
@errors.cannot_write_output_file(output_path, e.class)
|
|
26
20
|
end
|
|
27
21
|
end
|
|
28
22
|
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module Csvtool
|
|
6
|
+
module Infrastructure
|
|
7
|
+
module Output
|
|
8
|
+
# Writes the rows produced by a row randomizer to a CSV file.
class CsvRandomizedRowFileWriter
  def initialize(row_randomizer:)
    @row_randomizer = row_randomizer
  end

  # Opens +path+ for writing and appends every row the randomizer yields for
  # the input at +file_path+. A header row is configured only when +headers+
  # is not nil. The same +col_sep+ is used for reading and for writing.
  def call(path:, headers:, file_path:, col_sep:, headers_present:, seed:)
    writer_options = { write_headers: !headers.nil?, headers: headers, col_sep: col_sep }
    reader_options = { file_path: file_path, col_sep: col_sep, headers: headers_present, seed: seed }

    ::CSV.open(path, "w", **writer_options) do |output|
      @row_randomizer.each(**reader_options) { |fields| output << fields }
    end
  end
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -6,9 +6,7 @@ module Csvtool
|
|
|
6
6
|
module Infrastructure
|
|
7
7
|
module Output
|
|
8
8
|
class CsvRowFileWriter
|
|
9
|
-
# Keeps the row streamer this writer uses; nothing else is stored.
def initialize(row_streamer:)
  @row_streamer = row_streamer
end
|
|
14
12
|
|
|
@@ -30,12 +28,7 @@ module Csvtool
|
|
|
30
28
|
csv << fields
|
|
31
29
|
end
|
|
32
30
|
|
|
33
|
-
|
|
34
|
-
@stdout.puts "Wrote output to #{output_path}" if wrote_rows
|
|
35
|
-
stats
|
|
36
|
-
rescue Errno::EACCES, Errno::ENOENT => e
|
|
37
|
-
@errors.cannot_write_output_file(output_path, e.class)
|
|
38
|
-
nil
|
|
31
|
+
stats.merge(wrote_rows: wrote_rows)
|
|
39
32
|
ensure
|
|
40
33
|
csv&.close unless csv&.closed?
|
|
41
34
|
end
|
|
@@ -4,13 +4,14 @@ module Csvtool
|
|
|
4
4
|
module Interface
|
|
5
5
|
module CLI
|
|
6
6
|
class MenuLoop
|
|
7
|
-
# Stores the I/O streams, the menu labels and one callable per menu action.
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:)
  @stdin, @stdout = stdin, stdout
  @menu_options = menu_options
  @extract_column_action, @extract_rows_action = extract_column_action, extract_rows_action
  @randomize_rows_action, @dedupe_action = randomize_rows_action, dedupe_action
end
|
|
15
16
|
|
|
16
17
|
def run
|
|
@@ -28,9 +29,11 @@ module Csvtool
|
|
|
28
29
|
when "3"
|
|
29
30
|
@randomize_rows_action.call
|
|
30
31
|
when "4"
|
|
32
|
+
@dedupe_action.call
|
|
33
|
+
when "5"
|
|
31
34
|
return 0
|
|
32
35
|
else
|
|
33
|
-
@stdout.puts "Please choose 1, 2, 3, or
|
|
36
|
+
@stdout.puts "Please choose 1, 2, 3, 4, or 5."
|
|
34
37
|
end
|
|
35
38
|
end
|
|
36
39
|
end
|