csvops 0.2.0.alpha → 0.4.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/README.md +56 -108
  3. data/docs/architecture.md +266 -0
  4. data/docs/release-v0.3.0-alpha.md +74 -0
  5. data/docs/release-v0.4.0-alpha.md +87 -0
  6. data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +93 -0
  7. data/lib/csvtool/application/use_cases/run_extraction.rb +3 -3
  8. data/lib/csvtool/application/use_cases/run_row_extraction.rb +3 -3
  9. data/lib/csvtool/application/use_cases/run_row_randomization.rb +105 -0
  10. data/lib/csvtool/cli.rb +9 -1
  11. data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
  12. data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
  13. data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
  14. data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
  15. data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
  16. data/lib/csvtool/domain/row_randomization_session/randomization_options.rb +17 -0
  17. data/lib/csvtool/domain/row_randomization_session/randomization_session.rb +25 -0
  18. data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +24 -0
  19. data/lib/csvtool/domain/row_session/row_source.rb +3 -0
  20. data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
  21. data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
  22. data/lib/csvtool/infrastructure/csv/row_randomizer.rb +83 -0
  23. data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
  24. data/lib/csvtool/interface/cli/errors/presenter.rb +4 -0
  25. data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
  26. data/lib/csvtool/interface/cli/prompts/headers_present_prompt.rb +22 -0
  27. data/lib/csvtool/interface/cli/prompts/seed_prompt.rb +29 -0
  28. data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +163 -0
  29. data/lib/csvtool/version.rb +1 -1
  30. data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +113 -0
  31. data/test/csvtool/application/use_cases/run_row_randomization_test.rb +124 -0
  32. data/test/csvtool/cli_test.rb +231 -12
  33. data/test/csvtool/cli_unit_test.rb +27 -2
  34. data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
  35. data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
  36. data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
  37. data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
  38. data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
  39. data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
  40. data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
  41. data/test/csvtool/domain/row_randomization_session/randomization_options_test.rb +20 -0
  42. data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +26 -0
  43. data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +42 -0
  44. data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
  45. data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
  46. data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
  47. data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
  48. data/test/csvtool/infrastructure/csv/row_randomizer_test.rb +37 -0
  49. data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
  50. data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
  51. data/test/csvtool/interface/cli/menu_loop_test.rb +78 -10
  52. data/test/csvtool/interface/cli/prompts/headers_present_prompt_test.rb +14 -0
  53. data/test/csvtool/interface/cli/prompts/seed_prompt_test.rb +39 -0
  54. data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
  55. data/test/fixtures/dedupe_reference.csv +3 -0
  56. data/test/fixtures/dedupe_reference.tsv +3 -0
  57. data/test/fixtures/dedupe_reference_all.csv +5 -0
  58. data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
  59. data/test/fixtures/dedupe_reference_none.csv +2 -0
  60. data/test/fixtures/dedupe_reference_normalization.csv +3 -0
  61. data/test/fixtures/dedupe_source.csv +6 -0
  62. data/test/fixtures/dedupe_source.tsv +6 -0
  63. data/test/fixtures/dedupe_source_no_headers.csv +5 -0
  64. data/test/fixtures/dedupe_source_normalization.csv +4 -0
  65. data/test/fixtures/sample_people_no_headers.csv +3 -0
  66. metadata +50 -6
  67. data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
  68. data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
  69. data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "tempfile"
5
+
6
+ module Csvtool
7
+ module Infrastructure
8
+ module CSV
9
+ class RowRandomizer
10
+ DEFAULT_CHUNK_SIZE = 10_000
11
+
12
+ def call(file_path:, col_sep:, headers:, seed: nil)
13
+ each(file_path: file_path, col_sep: col_sep, headers: headers, seed: seed).to_a
14
+ end
15
+
16
+ def each(file_path:, col_sep:, headers:, seed: nil, chunk_size: DEFAULT_CHUNK_SIZE)
17
+ chunk_paths = []
18
+ return enum_for(:each, file_path: file_path, col_sep: col_sep, headers: headers, seed: seed, chunk_size: chunk_size) unless block_given?
19
+
20
+ rng = seed.nil? ? Random.new : Random.new(seed)
21
+ sequence = 0
22
+ chunk_entries = []
23
+
24
+ ::CSV.foreach(file_path, headers: headers, col_sep: col_sep) do |row|
25
+ fields = headers ? row.fields : row
26
+ chunk_entries << [rng.rand, sequence, fields]
27
+ sequence += 1
28
+ flush_chunk(chunk_entries, chunk_paths) if chunk_entries.length >= chunk_size
29
+ end
30
+
31
+ flush_chunk(chunk_entries, chunk_paths) unless chunk_entries.empty?
32
+ merge_chunks(chunk_paths) { |fields| yield fields }
33
+ ensure
34
+ cleanup_chunks(chunk_paths)
35
+ end
36
+
37
+ private
38
+
39
+ def flush_chunk(entries, chunk_paths)
40
+ entries.sort_by! { |rand_key, seq, _fields| [rand_key, seq] }
41
+ file = Tempfile.new("csvtool-row-randomizer-chunk")
42
+ file.binmode
43
+ entries.each { |entry| Marshal.dump(entry, file) }
44
+ file.close
45
+ chunk_paths << file.path
46
+ entries.clear
47
+ end
48
+
49
+ def merge_chunks(chunk_paths)
50
+ readers = chunk_paths.map { |path| File.open(path, "rb") }
51
+ heads = readers.map { |reader| next_entry(reader) }
52
+
53
+ loop do
54
+ indexed = heads.each_with_index.select { |entry, _i| !entry.nil? }
55
+ break if indexed.empty?
56
+
57
+ min_entry, min_index = indexed.min_by { |entry, _i| [entry[0], entry[1]] }
58
+ yield min_entry[2]
59
+ heads[min_index] = next_entry(readers[min_index])
60
+ end
61
+ ensure
62
+ readers&.each(&:close)
63
+ end
64
+
65
+ def next_entry(reader)
66
+ Marshal.load(reader)
67
+ rescue EOFError
68
+ nil
69
+ end
70
+
71
+ def cleanup_chunks(chunk_paths)
72
+ return if chunk_paths.nil?
73
+
74
+ chunk_paths.each do |path|
75
+ File.delete(path) if File.exist?(path)
76
+ rescue Errno::EACCES, Errno::ENOENT
77
+ nil
78
+ end
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/infrastructure/csv/header_reader"
5
+
6
+ module Csvtool
7
+ module Infrastructure
8
+ module CSV
9
+ class SelectorValidator
10
+ def initialize(header_reader: HeaderReader.new)
11
+ @header_reader = header_reader
12
+ end
13
+
14
+ def valid?(profile:, selector:)
15
+ if selector.headers_present?
16
+ headers = @header_reader.call(file_path: profile.path, col_sep: profile.separator)
17
+ return false if headers.empty?
18
+
19
+ headers.include?(selector.value)
20
+ else
21
+ first_row = ::CSV.open(profile.path, "r", headers: false, col_sep: profile.separator, &:first)
22
+ return false if first_row.nil?
23
+
24
+ selector.value <= first_row.length
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -49,6 +49,10 @@ module Csvtool
49
49
  @stdout.puts "Invalid separator choice."
50
50
  end
51
51
 
52
+ def invalid_seed
53
+ @stdout.puts "Seed must be an integer."
54
+ end
55
+
52
56
  def canceled
53
57
  @stdout.puts "Canceled."
54
58
  end
@@ -4,12 +4,14 @@ module Csvtool
4
4
  module Interface
5
5
  module CLI
6
6
  class MenuLoop
7
- def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:)
7
+ def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:)
8
8
  @stdin = stdin
9
9
  @stdout = stdout
10
10
  @menu_options = menu_options
11
11
  @extract_column_action = extract_column_action
12
12
  @extract_rows_action = extract_rows_action
13
+ @randomize_rows_action = randomize_rows_action
14
+ @dedupe_action = dedupe_action
13
15
  end
14
16
 
15
17
  def run
@@ -25,9 +27,13 @@ module Csvtool
25
27
  when "2"
26
28
  @extract_rows_action.call
27
29
  when "3"
30
+ @randomize_rows_action.call
31
+ when "4"
32
+ @dedupe_action.call
33
+ when "5"
28
34
  return 0
29
35
  else
30
- @stdout.puts "Please choose 1, 2, or 3."
36
+ @stdout.puts "Please choose 1, 2, 3, 4, or 5."
31
37
  end
32
38
  end
33
39
  end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Csvtool
4
+ module Interface
5
+ module CLI
6
+ module Prompts
7
+ class HeadersPresentPrompt
8
+ def initialize(stdin:, stdout:)
9
+ @stdin = stdin
10
+ @stdout = stdout
11
+ end
12
+
13
+ def call
14
+ @stdout.print "Headers present? [Y/n]: "
15
+ answer = @stdin.gets&.strip.to_s.downcase
16
+ !%w[n no].include?(answer)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Csvtool
4
+ module Interface
5
+ module CLI
6
+ module Prompts
7
+ class SeedPrompt
8
+ INVALID = :invalid
9
+
10
+ def initialize(stdin:, stdout:, errors:)
11
+ @stdin = stdin
12
+ @stdout = stdout
13
+ @errors = errors
14
+ end
15
+
16
+ def call
17
+ @stdout.print "Random seed (optional integer): "
18
+ raw = @stdin.gets&.strip.to_s
19
+ return nil if raw.empty?
20
+ return raw.to_i if /\A-?\d+\z/.match?(raw)
21
+
22
+ @errors.invalid_seed
23
+ INVALID
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/application/use_cases/run_cross_csv_dedupe"
5
+ require "csvtool/interface/cli/errors/presenter"
6
+ require "csvtool/interface/cli/prompts/file_path_prompt"
7
+ require "csvtool/interface/cli/prompts/separator_prompt"
8
+ require "csvtool/interface/cli/prompts/output_destination_prompt"
9
+ require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
10
+ require "csvtool/domain/cross_csv_dedupe_session/column_selector"
11
+ require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
12
+ require "csvtool/domain/cross_csv_dedupe_session/match_options"
13
+ require "csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session"
14
+ require "csvtool/domain/shared/output_destination"
15
+
16
+ module Csvtool
17
+ module Interface
18
+ module CLI
19
+ module Workflows
20
+ class RunCrossCsvDedupeWorkflow
21
+ def initialize(stdin:, stdout:, use_case: Application::UseCases::RunCrossCsvDedupe.new)
22
+ @stdin = stdin
23
+ @stdout = stdout
24
+ @use_case = use_case
25
+ @errors = Interface::CLI::Errors::Presenter.new(stdout: stdout)
26
+ end
27
+
28
+ def call
29
+ source_path = Interface::CLI::Prompts::FilePathPrompt.new(stdin: @stdin, stdout: @stdout).call
30
+ return @errors.file_not_found(source_path) unless File.file?(source_path)
31
+
32
+ @stdout.puts "Source CSV separator:"
33
+ source_col_sep = Interface::CLI::Prompts::SeparatorPrompt.new(stdin: @stdin, stdout: @stdout, errors: @errors).call
34
+ return if source_col_sep.nil?
35
+ @stdout.print "Source headers present? [Y/n]: "
36
+ source_headers_present = !%w[n no].include?(@stdin.gets&.strip.to_s.downcase)
37
+ source = Domain::CrossCsvDedupeSession::CsvProfile.new(
38
+ path: source_path,
39
+ separator: source_col_sep,
40
+ headers_present: source_headers_present
41
+ )
42
+
43
+ @stdout.print "Reference CSV file path: "
44
+ reference_path = @stdin.gets&.strip.to_s
45
+ return @errors.file_not_found(reference_path) unless File.file?(reference_path)
46
+
47
+ @stdout.puts "Reference CSV separator:"
48
+ reference_col_sep = Interface::CLI::Prompts::SeparatorPrompt.new(stdin: @stdin, stdout: @stdout, errors: @errors).call
49
+ return if reference_col_sep.nil?
50
+ @stdout.print "Reference headers present? [Y/n]: "
51
+ reference_headers_present = !%w[n no].include?(@stdin.gets&.strip.to_s.downcase)
52
+ reference = Domain::CrossCsvDedupeSession::CsvProfile.new(
53
+ path: reference_path,
54
+ separator: reference_col_sep,
55
+ headers_present: reference_headers_present
56
+ )
57
+
58
+ source_selector = prompt_selector("Source", source.headers_present?)
59
+ return @errors.column_not_found if source_selector.nil?
60
+ reference_selector = prompt_selector("Reference", reference.headers_present?)
61
+ return @errors.column_not_found if reference_selector.nil?
62
+
63
+ @stdout.print "Trim whitespace before matching? [Y/n]: "
64
+ trim_whitespace = read_yes_no(default: true)
65
+ @stdout.print "Case-insensitive matching? [y/N]: "
66
+ case_insensitive = read_yes_no(default: false)
67
+
68
+ key_mapping = Domain::CrossCsvDedupeSession::KeyMapping.new(
69
+ source_selector: source_selector,
70
+ reference_selector: reference_selector
71
+ )
72
+ match_options = Domain::CrossCsvDedupeSession::MatchOptions.new(
73
+ trim_whitespace: trim_whitespace,
74
+ case_insensitive: case_insensitive
75
+ )
76
+ session = Domain::CrossCsvDedupeSession::CrossCsvDedupeSession.start(
77
+ source: source,
78
+ reference: reference,
79
+ key_mapping: key_mapping,
80
+ match_options: match_options
81
+ )
82
+
83
+ output_destination = Interface::CLI::Prompts::OutputDestinationPrompt.new(
84
+ stdin: @stdin,
85
+ stdout: @stdout,
86
+ errors: @errors
87
+ ).call
88
+ return if output_destination.nil?
89
+ session = session.with_output_destination(
90
+ if output_destination[:mode] == :file
91
+ Domain::Shared::OutputDestination.file(path: output_destination[:path])
92
+ else
93
+ Domain::Shared::OutputDestination.console
94
+ end
95
+ )
96
+
97
+ result = @use_case.call(
98
+ session: session,
99
+ on_header: ->(headers) { print_header(headers, col_sep: session.source.separator) },
100
+ on_row: ->(fields) { print_row(fields, col_sep: session.source.separator) }
101
+ )
102
+ return handle_error(result) unless result.ok?
103
+
104
+ @stdout.puts "Wrote output to #{result.data[:output_path]}" if session.output_destination.file?
105
+ stats = result.data[:stats]
106
+ @stdout.puts "Summary: source_rows=#{stats[:source_rows]} removed_rows=#{stats[:removed_rows]} kept_rows=#{stats[:kept_rows_count]}"
107
+ @stdout.puts "No rows removed; no matching keys found." if stats[:removed_rows].zero?
108
+ @stdout.puts "All source rows were removed by dedupe." if stats[:source_rows].positive? && stats[:kept_rows_count].zero?
109
+ rescue ArgumentError => e
110
+ return @errors.empty_output_path if e.message == "file output path cannot be empty"
111
+
112
+ raise e
113
+ end
114
+
115
+ private
116
+
117
+ def prompt_selector(label, headers_present)
118
+ if headers_present
119
+ @stdout.print "#{label} key column name: "
120
+ else
121
+ @stdout.print "#{label} key column index (1-based): "
122
+ end
123
+ input = @stdin.gets&.strip.to_s
124
+ Domain::CrossCsvDedupeSession::ColumnSelector.from_input(headers_present: headers_present, input: input)
125
+ rescue ArgumentError
126
+ nil
127
+ end
128
+
129
+ def print_header(headers, col_sep:)
130
+ @stdout.puts
131
+ @stdout.puts ::CSV.generate_line(headers, row_sep: "", col_sep: col_sep).chomp
132
+ end
133
+
134
+ def print_row(fields, col_sep:)
135
+ @stdout.puts ::CSV.generate_line(fields, row_sep: "", col_sep: col_sep).chomp
136
+ end
137
+
138
+ def handle_error(result)
139
+ case result.error
140
+ when :column_not_found
141
+ @errors.column_not_found
142
+ when :could_not_parse_csv
143
+ @errors.could_not_parse_csv
144
+ when :cannot_read_file
145
+ @errors.cannot_read_file(result.data[:path])
146
+ when :cannot_write_output_file
147
+ @errors.cannot_write_output_file(result.data[:path], result.data[:error_class])
148
+ end
149
+ end
150
+
151
+ def read_yes_no(default:)
152
+ answer = @stdin.gets&.strip.to_s.downcase
153
+ return default if answer.empty?
154
+ return true if %w[y yes].include?(answer)
155
+ return false if %w[n no].include?(answer)
156
+
157
+ default
158
+ end
159
+ end
160
+ end
161
+ end
162
+ end
163
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Csvtool
4
- VERSION = "0.2.0.alpha"
4
+ VERSION = "0.4.0.alpha"
5
5
  end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../../../test_helper"
4
+ require "csvtool/application/use_cases/run_cross_csv_dedupe"
5
+ require "csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session"
6
+ require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
7
+ require "csvtool/domain/cross_csv_dedupe_session/column_selector"
8
+ require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
9
+ require "csvtool/domain/cross_csv_dedupe_session/match_options"
10
+ require "csvtool/domain/shared/output_destination"
11
+ require "tmpdir"
12
+
13
+ class RunCrossCsvDedupeTest < Minitest::Test
14
+ def fixture_path(name)
15
+ File.expand_path("../../../fixtures/#{name}", __dir__)
16
+ end
17
+
18
+ def test_streams_retained_rows_to_callbacks
19
+ use_case = Csvtool::Application::UseCases::RunCrossCsvDedupe.new
20
+ headers = nil
21
+ rows = []
22
+
23
+ result = use_case.call(
24
+ session: build_session(
25
+ source_path: fixture_path("dedupe_source.csv"),
26
+ reference_path: fixture_path("dedupe_reference.csv"),
27
+ source_selector_input: "customer_id",
28
+ reference_selector_input: "external_id",
29
+ output_destination: Csvtool::Domain::Shared::OutputDestination.console
30
+ ),
31
+ on_header: ->(value) { headers = value },
32
+ on_row: ->(fields) { rows << fields }
33
+ )
34
+
35
+ assert_equal true, result.ok?
36
+ assert_equal ["customer_id", "name"], headers
37
+ assert_equal [%w[1 Alice], %w[3 Cara]], rows
38
+ assert_equal 5, result.data[:stats][:source_rows]
39
+ assert_equal 3, result.data[:stats][:removed_rows]
40
+ assert_equal 2, result.data[:stats][:kept_rows_count]
41
+ end
42
+
43
+ def test_writes_to_file_output_destination
44
+ use_case = Csvtool::Application::UseCases::RunCrossCsvDedupe.new
45
+
46
+ Dir.mktmpdir do |dir|
47
+ output_path = File.join(dir, "deduped.csv")
48
+ result = use_case.call(
49
+ session: build_session(
50
+ source_path: fixture_path("dedupe_source.csv"),
51
+ reference_path: fixture_path("dedupe_reference.csv"),
52
+ source_selector_input: "customer_id",
53
+ reference_selector_input: "external_id",
54
+ output_destination: Csvtool::Domain::Shared::OutputDestination.file(path: output_path)
55
+ )
56
+ )
57
+
58
+ assert_equal true, result.ok?
59
+ assert_equal output_path, result.data[:output_path]
60
+ assert_equal "customer_id,name\n1,Alice\n3,Cara\n", File.read(output_path)
61
+ end
62
+ end
63
+
64
+ def test_returns_column_not_found_when_selector_invalid
65
+ use_case = Csvtool::Application::UseCases::RunCrossCsvDedupe.new
66
+
67
+ result = use_case.call(
68
+ session: build_session(
69
+ source_path: fixture_path("dedupe_source.csv"),
70
+ reference_path: fixture_path("dedupe_reference.csv"),
71
+ source_selector_input: "missing",
72
+ reference_selector_input: "external_id",
73
+ output_destination: Csvtool::Domain::Shared::OutputDestination.console
74
+ )
75
+ )
76
+
77
+ assert_equal false, result.ok?
78
+ assert_equal :column_not_found, result.error
79
+ end
80
+
81
+ private
82
+
83
+ def build_session(source_path:, reference_path:, source_selector_input:, reference_selector_input:, output_destination:)
84
+ source = Csvtool::Domain::CrossCsvDedupeSession::CsvProfile.new(
85
+ path: source_path,
86
+ separator: ",",
87
+ headers_present: true
88
+ )
89
+ reference = Csvtool::Domain::CrossCsvDedupeSession::CsvProfile.new(
90
+ path: reference_path,
91
+ separator: ",",
92
+ headers_present: true
93
+ )
94
+ key_mapping = Csvtool::Domain::CrossCsvDedupeSession::KeyMapping.new(
95
+ source_selector: Csvtool::Domain::CrossCsvDedupeSession::ColumnSelector.from_input(
96
+ headers_present: true,
97
+ input: source_selector_input
98
+ ),
99
+ reference_selector: Csvtool::Domain::CrossCsvDedupeSession::ColumnSelector.from_input(
100
+ headers_present: true,
101
+ input: reference_selector_input
102
+ )
103
+ )
104
+ match_options = Csvtool::Domain::CrossCsvDedupeSession::MatchOptions.new(
105
+ trim_whitespace: true,
106
+ case_insensitive: false
107
+ )
108
+
109
+ Csvtool::Domain::CrossCsvDedupeSession::CrossCsvDedupeSession
110
+ .start(source: source, reference: reference, key_mapping: key_mapping, match_options: match_options)
111
+ .with_output_destination(output_destination)
112
+ end
113
+ end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../../../test_helper"
4
+ require "csvtool/application/use_cases/run_row_randomization"
5
+ require "tmpdir"
6
+
7
+ class RunRowRandomizationTest < Minitest::Test
8
+ def test_prints_header_then_all_randomized_rows
9
+ fixture = File.expand_path("../../../fixtures/sample_people.csv", __dir__)
10
+ output = StringIO.new
11
+ input = StringIO.new("#{fixture}\n\n\n\n\n")
12
+
13
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
14
+
15
+ assert_includes output.string, "CSV file path:"
16
+ header_index = output.string.index("name,city")
17
+ assert header_index
18
+ %w[Alice,London Bob,Paris Cara,Berlin].each do |row|
19
+ row_index = output.string.index(row)
20
+ assert row_index
21
+ assert_operator header_index, :<, row_index
22
+ end
23
+ end
24
+
25
+ def test_missing_file_shows_friendly_error
26
+ output = StringIO.new
27
+ input = StringIO.new("/tmp/does-not-exist.csv\n")
28
+
29
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
30
+
31
+ assert_includes output.string, "File not found: /tmp/does-not-exist.csv"
32
+ end
33
+
34
+ def test_can_write_randomized_rows_to_file
35
+ fixture = File.expand_path("../../../fixtures/sample_people.csv", __dir__)
36
+ output = StringIO.new
37
+
38
+ Dir.mktmpdir do |dir|
39
+ output_path = File.join(dir, "randomized.csv")
40
+ input = StringIO.new("#{fixture}\n\n\n\n2\n#{output_path}\n")
41
+
42
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
43
+
44
+ written = File.read(output_path).lines.map(&:strip)
45
+ assert_equal "name,city", written.first
46
+ assert_equal ["Alice,London", "Bob,Paris", "Cara,Berlin"].sort, written[1..].sort
47
+ assert_includes output.string, "Wrote output to #{output_path}"
48
+ end
49
+ end
50
+
51
+ def test_supports_tsv_separator
52
+ fixture = File.expand_path("../../../fixtures/sample_people.tsv", __dir__)
53
+ output = StringIO.new
54
+ input = StringIO.new("#{fixture}\n2\n\n\n\n")
55
+
56
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
57
+
58
+ assert_includes output.string, "name\tcity"
59
+ assert_includes output.string, "Alice\tLondon"
60
+ assert_includes output.string, "Bob\tParis"
61
+ assert_includes output.string, "Cara\tBerlin"
62
+ end
63
+
64
+ def test_supports_custom_separator
65
+ fixture = File.expand_path("../../../fixtures/sample_people_colon.txt", __dir__)
66
+ output = StringIO.new
67
+ input = StringIO.new("#{fixture}\n5\n:\n\n\n\n")
68
+
69
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
70
+
71
+ assert_includes output.string, "name:city"
72
+ assert_includes output.string, "Alice:London"
73
+ assert_includes output.string, "Bob:Paris"
74
+ assert_includes output.string, "Cara:Berlin"
75
+ end
76
+
77
+ def test_headerless_mode_randomizes_all_rows
78
+ fixture = File.expand_path("../../../fixtures/sample_people_no_headers.csv", __dir__)
79
+ output = StringIO.new
80
+ input = StringIO.new("#{fixture}\n\nn\n\n\n")
81
+
82
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
83
+
84
+ refute_includes output.string, "name,city"
85
+ assert_includes output.string, "Alice,London"
86
+ assert_includes output.string, "Bob,Paris"
87
+ assert_includes output.string, "Cara,Berlin"
88
+ end
89
+
90
+ def test_same_seed_produces_same_output_order
91
+ fixture = File.expand_path("../../../fixtures/sample_people_many.csv", __dir__)
92
+ input_data = "#{fixture}\n\n\n123\n\n"
93
+
94
+ out1 = StringIO.new
95
+ out2 = StringIO.new
96
+
97
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: StringIO.new(input_data), stdout: out1).call
98
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: StringIO.new(input_data), stdout: out2).call
99
+
100
+ rows1 = out1.string.lines.map(&:strip).select { |line| line.include?(",") && !line.start_with?("name,city") }
101
+ rows2 = out2.string.lines.map(&:strip).select { |line| line.include?(",") && !line.start_with?("name,city") }
102
+ assert_equal rows1, rows2
103
+ end
104
+
105
+ def test_invalid_seed_shows_friendly_error
106
+ fixture = File.expand_path("../../../fixtures/sample_people.csv", __dir__)
107
+ output = StringIO.new
108
+ input = StringIO.new("#{fixture}\n\n\nabc\n")
109
+
110
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
111
+
112
+ assert_includes output.string, "Seed must be an integer."
113
+ end
114
+
115
+ def test_malformed_csv_shows_friendly_error
116
+ fixture = File.expand_path("../../../fixtures/sample_people_bad_tail.csv", __dir__)
117
+ output = StringIO.new
118
+ input = StringIO.new("#{fixture}\n\n\n\n\n")
119
+
120
+ Csvtool::Application::UseCases::RunRowRandomization.new(stdin: input, stdout: output).call
121
+
122
+ assert_includes output.string, "Could not parse CSV file."
123
+ end
124
+ end