csvops 0.3.0.alpha → 0.4.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +56 -142
- data/docs/architecture.md +266 -0
- data/docs/release-v0.4.0-alpha.md +87 -0
- data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +93 -0
- data/lib/csvtool/application/use_cases/run_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_randomization.rb +3 -3
- data/lib/csvtool/cli.rb +5 -1
- data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +1 -0
- data/lib/csvtool/domain/row_session/row_source.rb +3 -0
- data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
- data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
- data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
- data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +163 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +113 -0
- data/test/csvtool/cli_test.rb +130 -16
- data/test/csvtool/cli_unit_test.rb +16 -3
- data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
- data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
- data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +2 -2
- data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +15 -1
- data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
- data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
- data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
- data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
- data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +50 -13
- data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
- data/test/fixtures/dedupe_reference.csv +3 -0
- data/test/fixtures/dedupe_reference.tsv +3 -0
- data/test/fixtures/dedupe_reference_all.csv +5 -0
- data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
- data/test/fixtures/dedupe_reference_none.csv +2 -0
- data/test/fixtures/dedupe_reference_normalization.csv +3 -0
- data/test/fixtures/dedupe_source.csv +6 -0
- data/test/fixtures/dedupe_source.tsv +6 -0
- data/test/fixtures/dedupe_source_no_headers.csv +5 -0
- data/test/fixtures/dedupe_source_normalization.csv +4 -0
- metadata +34 -8
- data/lib/csvtool/domain/row_randomization_session/randomization_output_destination.rb +0 -31
- data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
- data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
- data/test/csvtool/domain/row_randomization_session/randomization_output_destination_test.rb +0 -21
- data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
|
@@ -19,8 +19,8 @@ require "csvtool/domain/column_session/column_selection"
|
|
|
19
19
|
require "csvtool/domain/column_session/extraction_options"
|
|
20
20
|
require "csvtool/domain/column_session/extraction_value"
|
|
21
21
|
require "csvtool/domain/column_session/preview"
|
|
22
|
-
require "csvtool/domain/column_session/output_destination"
|
|
23
22
|
require "csvtool/domain/column_session/column_session"
|
|
23
|
+
require "csvtool/domain/shared/output_destination"
|
|
24
24
|
|
|
25
25
|
module Csvtool
|
|
26
26
|
module Application
|
|
@@ -79,9 +79,9 @@ module Csvtool
|
|
|
79
79
|
return if output_destination.nil?
|
|
80
80
|
domain_destination =
|
|
81
81
|
if output_destination[:mode] == :file
|
|
82
|
-
Domain::
|
|
82
|
+
Domain::Shared::OutputDestination.file(path: output_destination[:path])
|
|
83
83
|
else
|
|
84
|
-
Domain::
|
|
84
|
+
Domain::Shared::OutputDestination.console
|
|
85
85
|
end
|
|
86
86
|
session = session.with_output_destination(domain_destination)
|
|
87
87
|
|
|
@@ -11,8 +11,8 @@ require "csvtool/infrastructure/output/csv_row_console_writer"
|
|
|
11
11
|
require "csvtool/infrastructure/output/csv_row_file_writer"
|
|
12
12
|
require "csvtool/domain/row_session/row_range"
|
|
13
13
|
require "csvtool/domain/row_session/row_source"
|
|
14
|
-
require "csvtool/domain/row_session/row_output_destination"
|
|
15
14
|
require "csvtool/domain/row_session/row_session"
|
|
15
|
+
require "csvtool/domain/shared/output_destination"
|
|
16
16
|
|
|
17
17
|
module Csvtool
|
|
18
18
|
module Application
|
|
@@ -56,9 +56,9 @@ module Csvtool
|
|
|
56
56
|
return if output_destination.nil?
|
|
57
57
|
destination =
|
|
58
58
|
if output_destination[:mode] == :file
|
|
59
|
-
Domain::
|
|
59
|
+
Domain::Shared::OutputDestination.file(path: output_destination[:path])
|
|
60
60
|
else
|
|
61
|
-
Domain::
|
|
61
|
+
Domain::Shared::OutputDestination.console
|
|
62
62
|
end
|
|
63
63
|
session = session.with_output_destination(destination)
|
|
64
64
|
|
|
@@ -11,8 +11,8 @@ require "csvtool/infrastructure/csv/header_reader"
|
|
|
11
11
|
require "csvtool/infrastructure/csv/row_randomizer"
|
|
12
12
|
require "csvtool/domain/row_randomization_session/randomization_source"
|
|
13
13
|
require "csvtool/domain/row_randomization_session/randomization_options"
|
|
14
|
-
require "csvtool/domain/row_randomization_session/randomization_output_destination"
|
|
15
14
|
require "csvtool/domain/row_randomization_session/randomization_session"
|
|
15
|
+
require "csvtool/domain/shared/output_destination"
|
|
16
16
|
|
|
17
17
|
module Csvtool
|
|
18
18
|
module Application
|
|
@@ -55,9 +55,9 @@ module Csvtool
|
|
|
55
55
|
return if output_destination.nil?
|
|
56
56
|
destination =
|
|
57
57
|
if output_destination[:mode] == :file
|
|
58
|
-
Domain::
|
|
58
|
+
Domain::Shared::OutputDestination.file(path: output_destination[:path])
|
|
59
59
|
else
|
|
60
|
-
Domain::
|
|
60
|
+
Domain::Shared::OutputDestination.console
|
|
61
61
|
end
|
|
62
62
|
session = session.with_output_destination(destination)
|
|
63
63
|
|
data/lib/csvtool/cli.rb
CHANGED
|
@@ -5,6 +5,7 @@ require "csvtool/interface/cli/menu_loop"
|
|
|
5
5
|
require "csvtool/application/use_cases/run_extraction"
|
|
6
6
|
require "csvtool/application/use_cases/run_row_extraction"
|
|
7
7
|
require "csvtool/application/use_cases/run_row_randomization"
|
|
8
|
+
require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
|
|
8
9
|
require "csvtool/interface/cli/errors/presenter"
|
|
9
10
|
require "csvtool/infrastructure/csv/header_reader"
|
|
10
11
|
require "csvtool/infrastructure/csv/value_streamer"
|
|
@@ -16,6 +17,7 @@ module Csvtool
|
|
|
16
17
|
"Extract column",
|
|
17
18
|
"Extract rows (range)",
|
|
18
19
|
"Randomize rows",
|
|
20
|
+
"Dedupe using another CSV",
|
|
19
21
|
"Exit"
|
|
20
22
|
].freeze
|
|
21
23
|
|
|
@@ -48,13 +50,15 @@ module Csvtool
|
|
|
48
50
|
extract_column_action = -> { Application::UseCases::RunExtraction.new(stdin: @stdin, stdout: @stdout).call }
|
|
49
51
|
extract_rows_action = -> { Application::UseCases::RunRowExtraction.new(stdin: @stdin, stdout: @stdout).call }
|
|
50
52
|
randomize_rows_action = -> { Application::UseCases::RunRowRandomization.new(stdin: @stdin, stdout: @stdout).call }
|
|
53
|
+
dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
51
54
|
Interface::CLI::MenuLoop.new(
|
|
52
55
|
stdin: @stdin,
|
|
53
56
|
stdout: @stdout,
|
|
54
57
|
menu_options: MENU_OPTIONS,
|
|
55
58
|
extract_column_action: extract_column_action,
|
|
56
59
|
extract_rows_action: extract_rows_action,
|
|
57
|
-
randomize_rows_action: randomize_rows_action
|
|
60
|
+
randomize_rows_action: randomize_rows_action,
|
|
61
|
+
dedupe_action: dedupe_action
|
|
58
62
|
).run
|
|
59
63
|
end
|
|
60
64
|
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Value object that identifies one column of a CSV row, either by header
# name (file has headers) or by 1-based position (file has no headers).
class ColumnSelector
  attr_reader :value

  # Builds a selector from raw input.
  #
  # With headers, the input is kept as the column name and must not be empty.
  # Without headers, the input must be a positive integer written without a
  # leading zero; it is stored as an Integer.
  #
  # Raises ArgumentError when the input breaks the applicable rule.
  def self.from_input(headers_present:, input:)
    text = input.to_s

    unless headers_present
      raise ArgumentError, "column index must be a positive integer" unless /\A[1-9]\d*\z/.match?(text)

      return new(value: input.to_i, headers_present: false)
    end

    raise ArgumentError, "column name cannot be empty" if text.empty?

    new(value: text, headers_present: true)
  end

  # value is the header name (String) or the 1-based index (Integer);
  # headers_present is coerced to a strict boolean.
  def initialize(value:, headers_present:)
    @value = value
    @headers_present = headers_present ? true : false
  end

  # True when the selector addresses a column by header name.
  def headers_present?
    @headers_present
  end

  # True when the selector addresses a column by position.
  def index?
    !headers_present?
  end

  # Returns the selected cell of +row+ as a String ("" when the cell is nil).
  # Positional selectors translate the 1-based index to a 0-based lookup.
  def extract_from(row)
    lookup = headers_present? ? @value : @value - 1
    row[lookup].to_s
  end
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
|
|
4
|
+
require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
|
|
5
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
6
|
+
require "csvtool/domain/shared/output_destination"
|
|
7
|
+
|
|
8
|
+
module Csvtool
|
|
9
|
+
module Domain
|
|
10
|
+
module CrossCsvDedupeSession
|
|
11
|
+
# Bundles everything one cross-CSV dedupe run needs: the source and reference
# profiles, the key mapping, the match options and, once chosen, the output
# destination. Only readers are exposed; changing the destination produces a
# new session object.
class CrossCsvDedupeSession
  attr_reader :source, :reference, :key_mapping, :match_options, :output_destination

  class << self
    # Starts a session that has no output destination yet.
    def start(source:, reference:, key_mapping:, match_options:)
      new(source: source, reference: reference, key_mapping: key_mapping, match_options: match_options)
    end
  end

  # Raises ArgumentError when any collaborator has the wrong type.
  # output_destination may be nil (not chosen yet).
  def initialize(source:, reference:, key_mapping:, match_options:, output_destination: nil)
    raise ArgumentError, "source must be CsvProfile" unless source.is_a?(CsvProfile)
    raise ArgumentError, "reference must be CsvProfile" unless reference.is_a?(CsvProfile)
    raise ArgumentError, "key_mapping must be KeyMapping" unless key_mapping.is_a?(KeyMapping)
    raise ArgumentError, "match_options must be MatchOptions" unless match_options.is_a?(MatchOptions)

    if !output_destination.nil? && !output_destination.is_a?(Domain::Shared::OutputDestination)
      raise ArgumentError, "output_destination must be OutputDestination or nil"
    end

    @source = source
    @reference = reference
    @key_mapping = key_mapping
    @match_options = match_options
    @output_destination = output_destination
  end

  # Returns a copy of this session that writes to +destination+.
  def with_output_destination(destination)
    self.class.new(
      source: source,
      reference: reference,
      key_mapping: key_mapping,
      match_options: match_options,
      output_destination: destination
    )
  end
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Describes how one CSV file should be read: where it lives, which separator
# it uses and whether its first line is a header row.
class CsvProfile
  attr_reader :path, :separator

  # Raises ArgumentError when path or separator stringifies to "".
  # headers_present is coerced to a strict boolean.
  def initialize(path:, separator:, headers_present:)
    if path.to_s.empty?
      raise ArgumentError, "path cannot be empty"
    elsif separator.to_s.empty?
      raise ArgumentError, "separator cannot be empty"
    end

    @path = path
    @separator = separator
    @headers_present = headers_present ? true : false
  end

  # True when the file's first line is a header row.
  def headers_present?
    @headers_present
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/cross_csv_dedupe_session/column_selector"
|
|
4
|
+
|
|
5
|
+
module Csvtool
|
|
6
|
+
module Domain
|
|
7
|
+
module CrossCsvDedupeSession
|
|
8
|
+
# Pairs the key column of the source CSV with the key column of the
# reference CSV.
class KeyMapping
  attr_reader :source_selector, :reference_selector

  # Raises ArgumentError unless both arguments are ColumnSelector instances.
  def initialize(source_selector:, reference_selector:)
    selectors = [source_selector, reference_selector]
    raise ArgumentError, "selectors must be ColumnSelector" unless selectors.all? { |selector| selector.is_a?(ColumnSelector) }

    @source_selector, @reference_selector = selectors
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CrossCsvDedupeSession
|
|
6
|
+
# Controls how key values are normalized before they are compared.
class MatchOptions
  attr_reader :trim_whitespace, :case_insensitive

  # Predicate spellings of the two readers above.
  alias_method :trim_whitespace?, :trim_whitespace
  alias_method :case_insensitive?, :case_insensitive

  # Both flags are coerced to strict booleans.
  def initialize(trim_whitespace:, case_insensitive:)
    @trim_whitespace = trim_whitespace ? true : false
    @case_insensitive = case_insensitive ? true : false
  end

  # Returns +value+ as a String, stripped of surrounding whitespace when
  # trimming is enabled and downcased when matching is case-insensitive.
  def normalize(value)
    text = value.to_s
    text = text.strip if trim_whitespace?
    text = text.downcase if case_insensitive?
    text
  end
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -7,6 +7,9 @@ module Csvtool
|
|
|
7
7
|
attr_reader :path, :separator
|
|
8
8
|
|
|
9
9
|
def initialize(path:, separator:)
|
|
10
|
+
raise ArgumentError, "path cannot be empty" if path.to_s.empty?
|
|
11
|
+
raise ArgumentError, "separator cannot be empty" if separator.to_s.empty?
|
|
12
|
+
|
|
10
13
|
@path = path
|
|
11
14
|
@separator = separator
|
|
12
15
|
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "set"
|
|
5
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
6
|
+
|
|
7
|
+
module Csvtool
|
|
8
|
+
module Infrastructure
|
|
9
|
+
module CSV
|
|
10
|
+
# Streams a source CSV and drops every row whose key also occurs in a
# reference CSV.
#
# Selectors must respond to +headers_present?+ and +extract_from(row)+;
# match options must respond to +normalize(value)+.
class CrossCsvDeduper
  # Runs the dedupe and collects the retained rows in memory.
  #
  # Returns the stats Hash produced by #each_retained with an extra
  # +:kept_rows+ entry holding the retained rows as arrays of fields.
  def call(
    source_path:,
    reference_path:,
    source_selector:,
    reference_selector:,
    source_col_sep: ",",
    reference_col_sep: ",",
    match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
  )
    kept_rows = []
    stats = each_retained(
      source_path: source_path,
      reference_path: reference_path,
      source_selector: source_selector,
      reference_selector: reference_selector,
      source_col_sep: source_col_sep,
      reference_col_sep: reference_col_sep,
      match_options: match_options
    ) { |fields| kept_rows << fields }

    stats.merge(kept_rows: kept_rows)
  end

  # Yields the fields of every source row whose normalized key is absent from
  # the reference file (the block is optional).
  #
  # Returns a Hash with:
  #   :headers         - the source header names, [] when a headers-mode file
  #                      is completely empty, nil when the source has no headers
  #   :source_rows     - number of data rows read from the source
  #   :removed_rows    - number of rows dropped because their key matched
  #   :kept_rows_count - number of rows retained
  def each_retained(
    source_path:,
    reference_path:,
    source_selector:,
    reference_selector:,
    source_col_sep: ",",
    reference_col_sep: ",",
    match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
  )
    source_has_headers = source_selector.headers_present?
    reference_keys = load_reference_keys(
      reference_path,
      selector: reference_selector,
      col_sep: reference_col_sep,
      match_options: match_options
    )

    source_header_row = nil
    source_rows = 0
    removed_rows = 0
    kept_rows_count = 0

    ::CSV.open(source_path, "r", headers: source_has_headers, col_sep: source_col_sep) do |csv|
      csv.each do |row|
        source_rows += 1
        key = extract_key(row, selector: source_selector, match_options: match_options)
        if reference_keys.include?(key)
          removed_rows += 1
        else
          kept_rows_count += 1
          yield(source_has_headers ? row.fields : row) if block_given?
        end
      end

      # Take the headers from the reader rather than from the first data row,
      # so a file that holds only a header line still reports its headers.
      # CSV#headers is an Array only once a header line has been parsed; it is
      # true or nil otherwise.
      parsed_headers = csv.headers
      source_header_row = parsed_headers if parsed_headers.is_a?(Array)
    end

    {
      headers: source_has_headers ? (source_header_row || []) : nil,
      source_rows: source_rows,
      removed_rows: removed_rows,
      kept_rows_count: kept_rows_count
    }
  end

  private

  # Reads the reference file once and returns the Set of its normalized keys.
  def load_reference_keys(path, selector:, col_sep:, match_options:)
    keys = Set.new
    ::CSV.foreach(path, headers: selector.headers_present?, col_sep: col_sep) do |row|
      keys << extract_key(row, selector: selector, match_options: match_options)
    end
    keys
  end

  # Normalized key of +row+ as chosen by +selector+.
  def extract_key(row, selector:, match_options:)
    match_options.normalize(selector.extract_from(row))
  end
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
5
|
+
|
|
6
|
+
module Csvtool
|
|
7
|
+
module Infrastructure
|
|
8
|
+
module CSV
|
|
9
|
+
# Checks that a column selector actually points at a column of the CSV file
# described by a profile.
class SelectorValidator
  # header_reader must respond to call(file_path:, col_sep:) and return the
  # header names of the file.
  def initialize(header_reader: HeaderReader.new)
    @header_reader = header_reader
  end

  # Name selectors are valid when the file's headers include the name.
  # Index selectors are valid when the index does not exceed the width of the
  # file's first row. An empty header list or an empty file is never valid.
  def valid?(profile:, selector:)
    return header_known?(profile, selector) if selector.headers_present?

    index_within_first_row?(profile, selector)
  end

  private

  # True when the header list is non-empty and contains the selector's name.
  def header_known?(profile, selector)
    headers = @header_reader.call(file_path: profile.path, col_sep: profile.separator)
    !headers.empty? && headers.include?(selector.value)
  end

  # True when the file has a first row at least as wide as the 1-based index.
  def index_within_first_row?(profile, selector)
    first_row = ::CSV.open(profile.path, "r", headers: false, col_sep: profile.separator) { |csv| csv.first }
    !first_row.nil? && selector.value <= first_row.length
  end
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -4,13 +4,14 @@ module Csvtool
|
|
|
4
4
|
module Interface
|
|
5
5
|
module CLI
|
|
6
6
|
class MenuLoop
|
|
7
|
-
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:)
|
|
7
|
+
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:)
|
|
8
8
|
@stdin = stdin
|
|
9
9
|
@stdout = stdout
|
|
10
10
|
@menu_options = menu_options
|
|
11
11
|
@extract_column_action = extract_column_action
|
|
12
12
|
@extract_rows_action = extract_rows_action
|
|
13
13
|
@randomize_rows_action = randomize_rows_action
|
|
14
|
+
@dedupe_action = dedupe_action
|
|
14
15
|
end
|
|
15
16
|
|
|
16
17
|
def run
|
|
@@ -28,9 +29,11 @@ module Csvtool
|
|
|
28
29
|
when "3"
|
|
29
30
|
@randomize_rows_action.call
|
|
30
31
|
when "4"
|
|
32
|
+
@dedupe_action.call
|
|
33
|
+
when "5"
|
|
31
34
|
return 0
|
|
32
35
|
else
|
|
33
|
-
@stdout.puts "Please choose 1, 2, 3, or
|
|
36
|
+
@stdout.puts "Please choose 1, 2, 3, 4, or 5."
|
|
34
37
|
end
|
|
35
38
|
end
|
|
36
39
|
end
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/application/use_cases/run_cross_csv_dedupe"
|
|
5
|
+
require "csvtool/interface/cli/errors/presenter"
|
|
6
|
+
require "csvtool/interface/cli/prompts/file_path_prompt"
|
|
7
|
+
require "csvtool/interface/cli/prompts/separator_prompt"
|
|
8
|
+
require "csvtool/interface/cli/prompts/output_destination_prompt"
|
|
9
|
+
require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
|
|
10
|
+
require "csvtool/domain/cross_csv_dedupe_session/column_selector"
|
|
11
|
+
require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
|
|
12
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
13
|
+
require "csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session"
|
|
14
|
+
require "csvtool/domain/shared/output_destination"
|
|
15
|
+
|
|
16
|
+
module Csvtool
|
|
17
|
+
module Interface
|
|
18
|
+
module CLI
|
|
19
|
+
module Workflows
|
|
20
|
+
# Interactive CLI workflow that collects the inputs for a cross-CSV dedupe
# (source file, reference file, key columns, match options, output target),
# hands the resulting session to the use case and prints a summary.
class RunCrossCsvDedupeWorkflow
  def initialize(stdin:, stdout:, use_case: Application::UseCases::RunCrossCsvDedupe.new)
    @stdin = stdin
    @stdout = stdout
    @use_case = use_case
    @errors = Interface::CLI::Errors::Presenter.new(stdout: stdout)
  end

  # Runs the prompts in a fixed order and stops at the first invalid answer,
  # reporting it through the error presenter where one applies.
  def call
    source_path = Interface::CLI::Prompts::FilePathPrompt.new(stdin: @stdin, stdout: @stdout).call
    return @errors.file_not_found(source_path) unless File.file?(source_path)

    @stdout.puts "Source CSV separator:"
    source_col_sep = separator_prompt.call
    return if source_col_sep.nil?

    @stdout.print "Source headers present? [Y/n]: "
    source = Domain::CrossCsvDedupeSession::CsvProfile.new(
      path: source_path,
      separator: source_col_sep,
      headers_present: read_yes_no(default: true)
    )

    @stdout.print "Reference CSV file path: "
    reference_path = read_line
    return @errors.file_not_found(reference_path) unless File.file?(reference_path)

    @stdout.puts "Reference CSV separator:"
    reference_col_sep = separator_prompt.call
    return if reference_col_sep.nil?

    @stdout.print "Reference headers present? [Y/n]: "
    reference = Domain::CrossCsvDedupeSession::CsvProfile.new(
      path: reference_path,
      separator: reference_col_sep,
      headers_present: read_yes_no(default: true)
    )

    source_selector = prompt_selector("Source", source.headers_present?)
    return @errors.column_not_found if source_selector.nil?

    reference_selector = prompt_selector("Reference", reference.headers_present?)
    return @errors.column_not_found if reference_selector.nil?

    @stdout.print "Trim whitespace before matching? [Y/n]: "
    trim_whitespace = read_yes_no(default: true)
    @stdout.print "Case-insensitive matching? [y/N]: "
    case_insensitive = read_yes_no(default: false)

    session = Domain::CrossCsvDedupeSession::CrossCsvDedupeSession.start(
      source: source,
      reference: reference,
      key_mapping: Domain::CrossCsvDedupeSession::KeyMapping.new(
        source_selector: source_selector,
        reference_selector: reference_selector
      ),
      match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(
        trim_whitespace: trim_whitespace,
        case_insensitive: case_insensitive
      )
    )

    output_destination = Interface::CLI::Prompts::OutputDestinationPrompt.new(
      stdin: @stdin,
      stdout: @stdout,
      errors: @errors
    ).call
    return if output_destination.nil?

    session = session.with_output_destination(domain_destination(output_destination))

    result = @use_case.call(
      session: session,
      on_header: ->(headers) { print_header(headers, col_sep: session.source.separator) },
      on_row: ->(fields) { print_row(fields, col_sep: session.source.separator) }
    )
    return handle_error(result) unless result.ok?

    report(result, session)
  rescue ArgumentError => e
    # Only the empty-output-path failure is presented to the user; any other
    # ArgumentError is passed on unchanged.
    raise e unless e.message == "file output path cannot be empty"

    @errors.empty_output_path
  end

  private

  # Fresh separator prompt wired to this workflow's streams and presenter.
  def separator_prompt
    Interface::CLI::Prompts::SeparatorPrompt.new(stdin: @stdin, stdout: @stdout, errors: @errors)
  end

  # Next input line without surrounding whitespace; "" at end of input.
  def read_line
    @stdin.gets&.strip.to_s
  end

  # Converts the prompt's answer hash into a domain output destination.
  def domain_destination(output_destination)
    if output_destination[:mode] == :file
      Domain::Shared::OutputDestination.file(path: output_destination[:path])
    else
      Domain::Shared::OutputDestination.console
    end
  end

  # Prints the output location (file mode only) and the run summary.
  def report(result, session)
    @stdout.puts "Wrote output to #{result.data[:output_path]}" if session.output_destination.file?
    stats = result.data[:stats]
    @stdout.puts "Summary: source_rows=#{stats[:source_rows]} removed_rows=#{stats[:removed_rows]} kept_rows=#{stats[:kept_rows_count]}"
    @stdout.puts "No rows removed; no matching keys found." if stats[:removed_rows].zero?
    @stdout.puts "All source rows were removed by dedupe." if stats[:source_rows].positive? && stats[:kept_rows_count].zero?
  end

  # Asks for a key column (name or 1-based index, depending on headers) and
  # returns the selector, or nil when the answer is rejected.
  def prompt_selector(label, headers_present)
    prompt = headers_present ? "#{label} key column name: " : "#{label} key column index (1-based): "
    @stdout.print prompt
    Domain::CrossCsvDedupeSession::ColumnSelector.from_input(headers_present: headers_present, input: read_line)
  rescue ArgumentError
    nil
  end

  # Prints a blank line followed by the header row.
  def print_header(headers, col_sep:)
    @stdout.puts
    print_row(headers, col_sep: col_sep)
  end

  # Prints one CSV-formatted row using the given separator.
  def print_row(fields, col_sep:)
    @stdout.puts ::CSV.generate_line(fields, row_sep: "", col_sep: col_sep).chomp
  end

  # Routes a failed result to the matching presenter message; error codes not
  # listed here produce no output.
  def handle_error(result)
    case result.error
    when :column_not_found then @errors.column_not_found
    when :could_not_parse_csv then @errors.could_not_parse_csv
    when :cannot_read_file then @errors.cannot_read_file(result.data[:path])
    when :cannot_write_output_file
      @errors.cannot_write_output_file(result.data[:path], result.data[:error_class])
    end
  end

  # Reads a yes/no answer; blank or unrecognized input yields +default+.
  def read_yes_no(default:)
    answer = read_line.downcase
    return true if %w[y yes].include?(answer)
    return false if %w[n no].include?(answer)

    default
  end
end
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
end
|
data/lib/csvtool/version.rb
CHANGED