csvops 0.2.0.alpha → 0.4.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +56 -108
- data/docs/architecture.md +266 -0
- data/docs/release-v0.3.0-alpha.md +74 -0
- data/docs/release-v0.4.0-alpha.md +87 -0
- data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +93 -0
- data/lib/csvtool/application/use_cases/run_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_randomization.rb +105 -0
- data/lib/csvtool/cli.rb +9 -1
- data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_options.rb +17 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_session.rb +25 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +24 -0
- data/lib/csvtool/domain/row_session/row_source.rb +3 -0
- data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
- data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
- data/lib/csvtool/infrastructure/csv/row_randomizer.rb +83 -0
- data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
- data/lib/csvtool/interface/cli/errors/presenter.rb +4 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
- data/lib/csvtool/interface/cli/prompts/headers_present_prompt.rb +22 -0
- data/lib/csvtool/interface/cli/prompts/seed_prompt.rb +29 -0
- data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +163 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +113 -0
- data/test/csvtool/application/use_cases/run_row_randomization_test.rb +124 -0
- data/test/csvtool/cli_test.rb +231 -12
- data/test/csvtool/cli_unit_test.rb +27 -2
- data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
- data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
- data/test/csvtool/domain/row_randomization_session/randomization_options_test.rb +20 -0
- data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +26 -0
- data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +42 -0
- data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
- data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
- data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
- data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
- data/test/csvtool/infrastructure/csv/row_randomizer_test.rb +37 -0
- data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
- data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +78 -10
- data/test/csvtool/interface/cli/prompts/headers_present_prompt_test.rb +14 -0
- data/test/csvtool/interface/cli/prompts/seed_prompt_test.rb +39 -0
- data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
- data/test/fixtures/dedupe_reference.csv +3 -0
- data/test/fixtures/dedupe_reference.tsv +3 -0
- data/test/fixtures/dedupe_reference_all.csv +5 -0
- data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
- data/test/fixtures/dedupe_reference_none.csv +2 -0
- data/test/fixtures/dedupe_reference_normalization.csv +3 -0
- data/test/fixtures/dedupe_source.csv +6 -0
- data/test/fixtures/dedupe_source.tsv +6 -0
- data/test/fixtures/dedupe_source_no_headers.csv +5 -0
- data/test/fixtures/dedupe_source_normalization.csv +4 -0
- data/test/fixtures/sample_people_no_headers.csv +3 -0
- metadata +50 -6
- data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
- data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
- data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "tempfile"
|
|
5
|
+
|
|
6
|
+
module Csvtool
  module Infrastructure
    module CSV
      # Streams the rows of a CSV file in random order using an on-disk
      # (external) sort, so the whole file never has to be held in memory.
      #
      # Every row is tagged with a random key drawn from the RNG plus its input
      # sequence number. Rows are buffered in chunks of at most +chunk_size+
      # entries; each chunk is sorted by [key, sequence] and spilled to a
      # temporary file, and the sorted chunks are then merged so rows are
      # yielded in global [key, sequence] order. Keys are drawn once per row in
      # input order, so a given seed produces the same output order regardless
      # of +chunk_size+.
      class RowRandomizer
        DEFAULT_CHUNK_SIZE = 10_000

        # Returns every row of the file, shuffled, as an Array of field Arrays.
        #
        # @param file_path [String] path of the CSV file to read
        # @param col_sep [String] column separator passed to ::CSV
        # @param headers [Boolean] when truthy the first row is treated as a
        #   header and is not part of the result
        # @param seed [Integer, nil] RNG seed; nil picks a fresh random seed
        # @return [Array<Array>] shuffled rows
        def call(file_path:, col_sep:, headers:, seed: nil)
          each(file_path: file_path, col_sep: col_sep, headers: headers, seed: seed).to_a
        end

        # Yields each row's fields in shuffled order.
        #
        # Without a block an Enumerator over the same rows is returned. Errors
        # raised by ::CSV while reading propagate to the caller; chunk files
        # written so far are removed either way.
        #
        # @param chunk_size [Integer] maximum number of rows buffered in memory
        #   before a sorted chunk is spilled to disk
        # @yieldparam fields [Array] the fields of one row
        def each(file_path:, col_sep:, headers:, seed: nil, chunk_size: DEFAULT_CHUNK_SIZE)
          chunk_paths = []
          unless block_given?
            return enum_for(:each, file_path: file_path, col_sep: col_sep, headers: headers, seed: seed,
                                   chunk_size: chunk_size)
          end

          rng = seed.nil? ? Random.new : Random.new(seed)
          sequence = 0
          chunk_entries = []

          ::CSV.foreach(file_path, headers: headers, col_sep: col_sep) do |row|
            # With headers enabled ::CSV yields CSV::Row objects; store plain arrays.
            fields = headers ? row.fields : row
            chunk_entries << [rng.rand, sequence, fields]
            sequence += 1
            flush_chunk(chunk_entries, chunk_paths) if chunk_entries.length >= chunk_size
          end

          flush_chunk(chunk_entries, chunk_paths) unless chunk_entries.empty?
          merge_chunks(chunk_paths) { |fields| yield fields }
        ensure
          cleanup_chunks(chunk_paths)
        end

        private

        # Sorts the buffered entries, writes them to a new chunk file as a
        # sequence of Marshal records, records the file's path and empties the
        # buffer.
        def flush_chunk(entries, chunk_paths)
          entries.sort_by! { |rand_key, seq, _fields| [rand_key, seq] }

          # Tempfile.create (unlike Tempfile.new) returns a plain File with no
          # finalizer, so the chunk cannot be unlinked by the garbage collector
          # between this flush and the merge. cleanup_chunks deletes it.
          file = Tempfile.create("csvtool-row-randomizer-chunk")
          # Register the path before writing so it is cleaned up even if a dump fails.
          chunk_paths << file.path
          begin
            file.binmode
            entries.each { |entry| Marshal.dump(entry, file) }
          ensure
            file.close
          end

          entries.clear
        end

        # K-way merge of the sorted chunk files: repeatedly yields the fields of
        # the smallest [key, sequence] entry among the current head of each chunk.
        def merge_chunks(chunk_paths)
          readers = []
          # Opened one by one so the ensure below can close whatever was opened
          # if a later File.open raises.
          chunk_paths.each { |path| readers << File.open(path, "rb") }
          heads = readers.map { |reader| next_entry(reader) }

          loop do
            indexed = heads.each_with_index.reject { |entry, _i| entry.nil? }
            break if indexed.empty?

            min_entry, min_index = indexed.min_by { |entry, _i| [entry[0], entry[1]] }
            yield min_entry[2]
            heads[min_index] = next_entry(readers[min_index])
          end
        ensure
          readers&.each(&:close)
        end

        # Reads the next Marshal record from a chunk file, or nil once the file
        # is exhausted. The data is only ever what flush_chunk wrote.
        def next_entry(reader)
          Marshal.load(reader)
        rescue EOFError
          nil
        end

        # Best-effort removal of all chunk files; a file that is already gone or
        # cannot be removed is ignored.
        def cleanup_chunks(chunk_paths)
          return if chunk_paths.nil?

          chunk_paths.each do |path|
            File.delete(path) if File.exist?(path)
          rescue Errno::EACCES, Errno::ENOENT
            nil
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
5
|
+
|
|
6
|
+
module Csvtool
  module Infrastructure
    module CSV
      # Checks whether a column selector actually refers to a column of a CSV file.
      class SelectorValidator
        # @param header_reader [#call] object answering
        #   call(file_path:, col_sep:) with the file's header names
        def initialize(header_reader: HeaderReader.new)
          @header_reader = header_reader
        end

        # @param profile [#path, #separator] describes the CSV file to inspect
        # @param selector [#headers_present?, #value] a header name when
        #   headers are present, otherwise a 1-based column index
        # @return [Boolean] true when the selected column exists in the file
        def valid?(profile:, selector:)
          if selector.headers_present?
            headers = @header_reader.call(file_path: profile.path, col_sep: profile.separator)
            return false if headers.empty?

            headers.include?(selector.value)
          else
            # Only the first row is read; it determines the column count.
            first_row = ::CSV.open(profile.path, "r", headers: false, col_sep: profile.separator, &:first)
            return false if first_row.nil?

            # The index is 1-based, so 0 and negative values are rejected as
            # well as indexes beyond the last column.
            selector.value.between?(1, first_row.length)
          end
        end
      end
    end
  end
end
|
|
@@ -4,12 +4,14 @@ module Csvtool
|
|
|
4
4
|
module Interface
|
|
5
5
|
module CLI
|
|
6
6
|
class MenuLoop
|
|
7
|
-
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:)
|
|
7
|
+
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:)
|
|
8
8
|
@stdin = stdin
|
|
9
9
|
@stdout = stdout
|
|
10
10
|
@menu_options = menu_options
|
|
11
11
|
@extract_column_action = extract_column_action
|
|
12
12
|
@extract_rows_action = extract_rows_action
|
|
13
|
+
@randomize_rows_action = randomize_rows_action
|
|
14
|
+
@dedupe_action = dedupe_action
|
|
13
15
|
end
|
|
14
16
|
|
|
15
17
|
def run
|
|
@@ -25,9 +27,13 @@ module Csvtool
|
|
|
25
27
|
when "2"
|
|
26
28
|
@extract_rows_action.call
|
|
27
29
|
when "3"
|
|
30
|
+
@randomize_rows_action.call
|
|
31
|
+
when "4"
|
|
32
|
+
@dedupe_action.call
|
|
33
|
+
when "5"
|
|
28
34
|
return 0
|
|
29
35
|
else
|
|
30
|
-
@stdout.puts "Please choose 1, 2, or 3."
|
|
36
|
+
@stdout.puts "Please choose 1, 2, 3, 4, or 5."
|
|
31
37
|
end
|
|
32
38
|
end
|
|
33
39
|
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
  module Interface
    module CLI
      module Prompts
        # Asks whether the CSV file has a header row. Anything other than
        # "n" / "no" (case-insensitive, surrounding whitespace ignored),
        # including an empty answer or end of input, counts as yes.
        class HeadersPresentPrompt
          def initialize(stdin:, stdout:)
            @stdin = stdin
            @stdout = stdout
          end

          # @return [Boolean] false only when the user answered "n" or "no"
          def call
            @stdout.print "Headers present? [Y/n]: "
            reply = @stdin.gets
            normalized = reply.nil? ? "" : reply.strip.downcase
            normalized != "n" && normalized != "no"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
  module Interface
    module CLI
      module Prompts
        # Asks for an optional integer random seed.
        class SeedPrompt
          # Sentinel returned when the answer is neither blank nor an integer.
          INVALID = :invalid

          def initialize(stdin:, stdout:, errors:)
            @stdin = stdin
            @stdout = stdout
            @errors = errors
          end

          # @return [Integer, nil, Symbol] the parsed seed, nil for a blank
          #   answer (or end of input), or INVALID after reporting the problem
          #   through the errors presenter
          def call
            @stdout.print "Random seed (optional integer): "
            text = (@stdin.gets || "").strip

            if text.empty?
              nil
            elsif text.match?(/\A-?\d+\z/)
              text.to_i
            else
              @errors.invalid_seed
              INVALID
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/application/use_cases/run_cross_csv_dedupe"
|
|
5
|
+
require "csvtool/interface/cli/errors/presenter"
|
|
6
|
+
require "csvtool/interface/cli/prompts/file_path_prompt"
|
|
7
|
+
require "csvtool/interface/cli/prompts/separator_prompt"
|
|
8
|
+
require "csvtool/interface/cli/prompts/output_destination_prompt"
|
|
9
|
+
require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
|
|
10
|
+
require "csvtool/domain/cross_csv_dedupe_session/column_selector"
|
|
11
|
+
require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
|
|
12
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
13
|
+
require "csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session"
|
|
14
|
+
require "csvtool/domain/shared/output_destination"
|
|
15
|
+
|
|
16
|
+
module Csvtool
  module Interface
    module CLI
      module Workflows
        # Interactive workflow for the cross-CSV dedupe action: collects the
        # source and reference file descriptions, the key columns, the match
        # options and the output destination from the user, runs the use case
        # and prints the result summary (or a friendly error).
        class RunCrossCsvDedupeWorkflow
          def initialize(stdin:, stdout:, use_case: Application::UseCases::RunCrossCsvDedupe.new)
            @stdin = stdin
            @stdout = stdout
            @use_case = use_case
            @errors = Interface::CLI::Errors::Presenter.new(stdout: stdout)
          end

          # Runs the whole prompt sequence. Any invalid answer reports an error
          # through the presenter (where one applies) and aborts the workflow.
          def call
            source_path = Interface::CLI::Prompts::FilePathPrompt.new(stdin: @stdin, stdout: @stdout).call
            return @errors.file_not_found(source_path) unless File.file?(source_path)

            @stdout.puts "Source CSV separator:"
            source_col_sep = prompt_separator
            return if source_col_sep.nil?

            source = build_profile(source_path, source_col_sep, prompt_headers_present("Source"))

            @stdout.print "Reference CSV file path: "
            reference_path = @stdin.gets&.strip.to_s
            return @errors.file_not_found(reference_path) unless File.file?(reference_path)

            @stdout.puts "Reference CSV separator:"
            reference_col_sep = prompt_separator
            return if reference_col_sep.nil?

            reference = build_profile(reference_path, reference_col_sep, prompt_headers_present("Reference"))

            source_selector = prompt_selector("Source", source.headers_present?)
            return @errors.column_not_found if source_selector.nil?

            reference_selector = prompt_selector("Reference", reference.headers_present?)
            return @errors.column_not_found if reference_selector.nil?

            trim_whitespace = ask_yes_no("Trim whitespace before matching? [Y/n]: ", default: true)
            case_insensitive = ask_yes_no("Case-insensitive matching? [y/N]: ", default: false)

            key_mapping = Domain::CrossCsvDedupeSession::KeyMapping.new(
              source_selector: source_selector,
              reference_selector: reference_selector
            )
            match_options = Domain::CrossCsvDedupeSession::MatchOptions.new(
              trim_whitespace: trim_whitespace,
              case_insensitive: case_insensitive
            )
            session = Domain::CrossCsvDedupeSession::CrossCsvDedupeSession.start(
              source: source,
              reference: reference,
              key_mapping: key_mapping,
              match_options: match_options
            )

            output_choice = Interface::CLI::Prompts::OutputDestinationPrompt.new(
              stdin: @stdin,
              stdout: @stdout,
              errors: @errors
            ).call
            return if output_choice.nil?

            session = session.with_output_destination(build_output_destination(output_choice))

            # Rows are echoed using the source file's separator.
            result = @use_case.call(
              session: session,
              on_header: ->(headers) { print_header(headers, col_sep: session.source.separator) },
              on_row: ->(fields) { print_row(fields, col_sep: session.source.separator) }
            )
            return handle_error(result) unless result.ok?

            print_summary(session, result)
          rescue ArgumentError => e
            # Only the empty-output-path case is turned into a friendly message;
            # any other ArgumentError is re-raised unchanged.
            return @errors.empty_output_path if e.message == "file output path cannot be empty"

            raise
          end

          private

          # Asks for a column separator; returns nil when the prompt gave up.
          def prompt_separator
            Interface::CLI::Prompts::SeparatorPrompt.new(stdin: @stdin, stdout: @stdout, errors: @errors).call
          end

          # Asks whether the labelled file has a header row (default: yes).
          def prompt_headers_present(label)
            ask_yes_no("#{label} headers present? [Y/n]: ", default: true)
          end

          # Prints the question without a newline and reads a yes/no answer.
          def ask_yes_no(question, default:)
            @stdout.print question
            read_yes_no(default: default)
          end

          # Wraps the collected answers for one file in a CsvProfile.
          def build_profile(path, separator, headers_present)
            Domain::CrossCsvDedupeSession::CsvProfile.new(
              path: path,
              separator: separator,
              headers_present: headers_present
            )
          end

          # Maps the output prompt's answer hash to an OutputDestination.
          def build_output_destination(choice)
            if choice[:mode] == :file
              Domain::Shared::OutputDestination.file(path: choice[:path])
            else
              Domain::Shared::OutputDestination.console
            end
          end

          # Asks for the key column (a name when headers are present, otherwise
          # a 1-based index). Returns nil when the selector rejects the input.
          def prompt_selector(label, headers_present)
            if headers_present
              @stdout.print "#{label} key column name: "
            else
              @stdout.print "#{label} key column index (1-based): "
            end
            input = @stdin.gets&.strip.to_s
            Domain::CrossCsvDedupeSession::ColumnSelector.from_input(headers_present: headers_present, input: input)
          rescue ArgumentError
            nil
          end

          # Prints a blank line followed by the header row.
          def print_header(headers, col_sep:)
            @stdout.puts
            @stdout.puts ::CSV.generate_line(headers, row_sep: "", col_sep: col_sep).chomp
          end

          def print_row(fields, col_sep:)
            @stdout.puts ::CSV.generate_line(fields, row_sep: "", col_sep: col_sep).chomp
          end

          # Prints where the output went (file destinations only), the row
          # counts, and a hint when nothing or everything was removed.
          def print_summary(session, result)
            @stdout.puts "Wrote output to #{result.data[:output_path]}" if session.output_destination.file?
            stats = result.data[:stats]
            @stdout.puts "Summary: source_rows=#{stats[:source_rows]} removed_rows=#{stats[:removed_rows]} kept_rows=#{stats[:kept_rows_count]}"
            @stdout.puts "No rows removed; no matching keys found." if stats[:removed_rows].zero?
            @stdout.puts "All source rows were removed by dedupe." if stats[:source_rows].positive? && stats[:kept_rows_count].zero?
          end

          # Routes a failed use-case result to the matching presenter message.
          # Error codes not listed here produce no output.
          def handle_error(result)
            case result.error
            when :column_not_found
              @errors.column_not_found
            when :could_not_parse_csv
              @errors.could_not_parse_csv
            when :cannot_read_file
              @errors.cannot_read_file(result.data[:path])
            when :cannot_write_output_file
              @errors.cannot_write_output_file(result.data[:path], result.data[:error_class])
            end
          end

          # Reads one line and interprets it as yes/no; a blank, missing or
          # unrecognised answer yields +default+.
          def read_yes_no(default:)
            answer = @stdin.gets&.strip.to_s.downcase
            return default if answer.empty?
            return true if %w[y yes].include?(answer)
            return false if %w[n no].include?(answer)

            default
          end
        end
      end
    end
  end
end
|
data/lib/csvtool/version.rb
CHANGED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../../../test_helper"
|
|
4
|
+
require "csvtool/application/use_cases/run_cross_csv_dedupe"
|
|
5
|
+
require "csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session"
|
|
6
|
+
require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
|
|
7
|
+
require "csvtool/domain/cross_csv_dedupe_session/column_selector"
|
|
8
|
+
require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
|
|
9
|
+
require "csvtool/domain/cross_csv_dedupe_session/match_options"
|
|
10
|
+
require "csvtool/domain/shared/output_destination"
|
|
11
|
+
require "tmpdir"
|
|
12
|
+
|
|
13
|
+
# Exercises the RunCrossCsvDedupe use case against the dedupe fixtures:
# streaming to callbacks, writing to a file, and rejecting an unknown key column.
class RunCrossCsvDedupeTest < Minitest::Test
  def fixture_path(name)
    File.expand_path("../../../fixtures/#{name}", __dir__)
  end

  def test_streams_retained_rows_to_callbacks
    seen_headers = nil
    seen_rows = []

    result = run_dedupe(
      source_selector_input: "customer_id",
      output_destination: Csvtool::Domain::Shared::OutputDestination.console,
      on_header: ->(value) { seen_headers = value },
      on_row: ->(fields) { seen_rows << fields }
    )
    stats = result.data[:stats]

    assert_equal true, result.ok?
    assert_equal ["customer_id", "name"], seen_headers
    assert_equal [%w[1 Alice], %w[3 Cara]], seen_rows
    assert_equal 5, stats[:source_rows]
    assert_equal 3, stats[:removed_rows]
    assert_equal 2, stats[:kept_rows_count]
  end

  def test_writes_to_file_output_destination
    Dir.mktmpdir do |dir|
      output_path = File.join(dir, "deduped.csv")

      result = run_dedupe(
        source_selector_input: "customer_id",
        output_destination: Csvtool::Domain::Shared::OutputDestination.file(path: output_path)
      )

      assert_equal true, result.ok?
      assert_equal output_path, result.data[:output_path]
      assert_equal "customer_id,name\n1,Alice\n3,Cara\n", File.read(output_path)
    end
  end

  def test_returns_column_not_found_when_selector_invalid
    result = run_dedupe(
      source_selector_input: "missing",
      output_destination: Csvtool::Domain::Shared::OutputDestination.console
    )

    assert_equal false, result.ok?
    assert_equal :column_not_found, result.error
  end

  private

  # Runs the use case on the standard source/reference fixtures, keyed on the
  # reference's external_id column; extra keywords are passed on as callbacks.
  def run_dedupe(source_selector_input:, output_destination:, **callbacks)
    use_case = Csvtool::Application::UseCases::RunCrossCsvDedupe.new
    session = build_session(
      source_path: fixture_path("dedupe_source.csv"),
      reference_path: fixture_path("dedupe_reference.csv"),
      source_selector_input: source_selector_input,
      reference_selector_input: "external_id",
      output_destination: output_destination
    )

    use_case.call(session: session, **callbacks)
  end

  def build_session(source_path:, reference_path:, source_selector_input:, reference_selector_input:, output_destination:)
    namespace = Csvtool::Domain::CrossCsvDedupeSession

    source = namespace::CsvProfile.new(path: source_path, separator: ",", headers_present: true)
    reference = namespace::CsvProfile.new(path: reference_path, separator: ",", headers_present: true)
    key_mapping = namespace::KeyMapping.new(
      source_selector: namespace::ColumnSelector.from_input(headers_present: true, input: source_selector_input),
      reference_selector: namespace::ColumnSelector.from_input(headers_present: true, input: reference_selector_input)
    )
    match_options = namespace::MatchOptions.new(trim_whitespace: true, case_insensitive: false)

    namespace::CrossCsvDedupeSession
      .start(source: source, reference: reference, key_mapping: key_mapping, match_options: match_options)
      .with_output_destination(output_destination)
  end
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../../../test_helper"
|
|
4
|
+
require "csvtool/application/use_cases/run_row_randomization"
|
|
5
|
+
require "tmpdir"
|
|
6
|
+
|
|
7
|
+
# End-to-end tests of the RunRowRandomization use case, driven through scripted
# stdin. Each input script is one answer per line, in the order the use case
# prompts for them.
class RunRowRandomizationTest < Minitest::Test
  def test_prints_header_then_all_randomized_rows
    printed = run_use_case("#{fixture("sample_people.csv")}\n\n\n\n\n")

    assert_includes printed, "CSV file path:"
    header_at = printed.index("name,city")
    assert header_at
    ["Alice,London", "Bob,Paris", "Cara,Berlin"].each do |row|
      row_at = printed.index(row)
      assert row_at
      assert_operator header_at, :<, row_at
    end
  end

  def test_missing_file_shows_friendly_error
    printed = run_use_case("/tmp/does-not-exist.csv\n")

    assert_includes printed, "File not found: /tmp/does-not-exist.csv"
  end

  def test_can_write_randomized_rows_to_file
    Dir.mktmpdir do |dir|
      output_path = File.join(dir, "randomized.csv")

      printed = run_use_case("#{fixture("sample_people.csv")}\n\n\n\n2\n#{output_path}\n")

      written = File.read(output_path).lines.map(&:strip)
      assert_equal "name,city", written.first
      assert_equal ["Alice,London", "Bob,Paris", "Cara,Berlin"].sort, written[1..].sort
      assert_includes printed, "Wrote output to #{output_path}"
    end
  end

  def test_supports_tsv_separator
    printed = run_use_case("#{fixture("sample_people.tsv")}\n2\n\n\n\n")

    ["name\tcity", "Alice\tLondon", "Bob\tParis", "Cara\tBerlin"].each do |line|
      assert_includes printed, line
    end
  end

  def test_supports_custom_separator
    printed = run_use_case("#{fixture("sample_people_colon.txt")}\n5\n:\n\n\n\n")

    ["name:city", "Alice:London", "Bob:Paris", "Cara:Berlin"].each do |line|
      assert_includes printed, line
    end
  end

  def test_headerless_mode_randomizes_all_rows
    printed = run_use_case("#{fixture("sample_people_no_headers.csv")}\n\nn\n\n\n")

    refute_includes printed, "name,city"
    ["Alice,London", "Bob,Paris", "Cara,Berlin"].each do |line|
      assert_includes printed, line
    end
  end

  def test_same_seed_produces_same_output_order
    script = "#{fixture("sample_people_many.csv")}\n\n\n123\n\n"

    first_run = data_rows(run_use_case(script))
    second_run = data_rows(run_use_case(script))

    assert_equal first_run, second_run
  end

  def test_invalid_seed_shows_friendly_error
    printed = run_use_case("#{fixture("sample_people.csv")}\n\n\nabc\n")

    assert_includes printed, "Seed must be an integer."
  end

  def test_malformed_csv_shows_friendly_error
    printed = run_use_case("#{fixture("sample_people_bad_tail.csv")}\n\n\n\n\n")

    assert_includes printed, "Could not parse CSV file."
  end

  private

  # Absolute path of a file in the shared fixtures directory.
  def fixture(name)
    File.expand_path("../../../fixtures/#{name}", __dir__)
  end

  # Runs the use case with the given scripted stdin and returns everything it printed.
  def run_use_case(input_text)
    output = StringIO.new
    Csvtool::Application::UseCases::RunRowRandomization.new(stdin: StringIO.new(input_text), stdout: output).call
    output.string
  end

  # Picks the printed data rows: lines containing a comma, minus the header line.
  def data_rows(printed)
    printed.lines.map(&:strip).select { |line| line.include?(",") && !line.start_with?("name,city") }
  end
end
|