csvops 0.2.0.alpha → 0.4.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/README.md +56 -108
  3. data/docs/architecture.md +266 -0
  4. data/docs/release-v0.3.0-alpha.md +74 -0
  5. data/docs/release-v0.4.0-alpha.md +87 -0
  6. data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +93 -0
  7. data/lib/csvtool/application/use_cases/run_extraction.rb +3 -3
  8. data/lib/csvtool/application/use_cases/run_row_extraction.rb +3 -3
  9. data/lib/csvtool/application/use_cases/run_row_randomization.rb +105 -0
  10. data/lib/csvtool/cli.rb +9 -1
  11. data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
  12. data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
  13. data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
  14. data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
  15. data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
  16. data/lib/csvtool/domain/row_randomization_session/randomization_options.rb +17 -0
  17. data/lib/csvtool/domain/row_randomization_session/randomization_session.rb +25 -0
  18. data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +24 -0
  19. data/lib/csvtool/domain/row_session/row_source.rb +3 -0
  20. data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
  21. data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
  22. data/lib/csvtool/infrastructure/csv/row_randomizer.rb +83 -0
  23. data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
  24. data/lib/csvtool/interface/cli/errors/presenter.rb +4 -0
  25. data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
  26. data/lib/csvtool/interface/cli/prompts/headers_present_prompt.rb +22 -0
  27. data/lib/csvtool/interface/cli/prompts/seed_prompt.rb +29 -0
  28. data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +163 -0
  29. data/lib/csvtool/version.rb +1 -1
  30. data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +113 -0
  31. data/test/csvtool/application/use_cases/run_row_randomization_test.rb +124 -0
  32. data/test/csvtool/cli_test.rb +231 -12
  33. data/test/csvtool/cli_unit_test.rb +27 -2
  34. data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
  35. data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
  36. data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
  37. data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
  38. data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
  39. data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
  40. data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
  41. data/test/csvtool/domain/row_randomization_session/randomization_options_test.rb +20 -0
  42. data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +26 -0
  43. data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +42 -0
  44. data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
  45. data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
  46. data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
  47. data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
  48. data/test/csvtool/infrastructure/csv/row_randomizer_test.rb +37 -0
  49. data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
  50. data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
  51. data/test/csvtool/interface/cli/menu_loop_test.rb +78 -10
  52. data/test/csvtool/interface/cli/prompts/headers_present_prompt_test.rb +14 -0
  53. data/test/csvtool/interface/cli/prompts/seed_prompt_test.rb +39 -0
  54. data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
  55. data/test/fixtures/dedupe_reference.csv +3 -0
  56. data/test/fixtures/dedupe_reference.tsv +3 -0
  57. data/test/fixtures/dedupe_reference_all.csv +5 -0
  58. data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
  59. data/test/fixtures/dedupe_reference_none.csv +2 -0
  60. data/test/fixtures/dedupe_reference_normalization.csv +3 -0
  61. data/test/fixtures/dedupe_source.csv +6 -0
  62. data/test/fixtures/dedupe_source.tsv +6 -0
  63. data/test/fixtures/dedupe_source_no_headers.csv +5 -0
  64. data/test/fixtures/dedupe_source_normalization.csv +4 -0
  65. data/test/fixtures/sample_people_no_headers.csv +3 -0
  66. metadata +50 -6
  67. data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
  68. data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
  69. data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/infrastructure/csv/header_reader"
5
+ require "csvtool/infrastructure/csv/cross_csv_deduper"
6
+ require "csvtool/infrastructure/csv/selector_validator"
7
+
8
module Csvtool
  module Application
    module UseCases
      # Streams the rows of a source CSV that the deduper retains (rows whose
      # key is not matched in the reference CSV) either into an output file or
      # to caller-supplied callbacks.
      #
      # All collaborators are injected so the use case can be exercised
      # without the real CSV infrastructure.
      class RunCrossCsvDedupe
        # Outcome of a run: +ok+ flag, +error+ code (nil on success) and a
        # +data+ hash with details.
        Result = Struct.new(:ok, :error, :data, keyword_init: true) do
          def ok?
            ok
          end
        end

        # header_reader::      responds to +call(file_path:, col_sep:)+ and returns the header row.
        # deduper::            responds to +each_retained(**options)+, yields retained rows, returns stats.
        # selector_validator:: responds to +valid?(profile:, selector:)+.
        def initialize(
          header_reader: Infrastructure::CSV::HeaderReader.new,
          deduper: Infrastructure::CSV::CrossCsvDeduper.new,
          selector_validator: Infrastructure::CSV::SelectorValidator.new(header_reader: header_reader)
        )
          @header_reader = header_reader
          @deduper = deduper
          @selector_validator = selector_validator
        end

        # Runs the dedupe described by +session+.
        #
        # session::   exposes +source+, +reference+, +key_mapping+,
        #             +match_options+ and +output_destination+.
        # on_header:: optional callable; receives the source header row in
        #             console mode when the source has headers.
        # on_row::    optional callable; receives each retained row in console mode.
        #
        # Returns a Result. Failure codes: +:column_not_found+,
        # +:could_not_parse_csv+, +:cannot_read_file+ (data: +path+) and
        # +:cannot_write_output_file+ (data: +path+, +error_class+).
        def call(session:, on_header: nil, on_row: nil)
          # current_read_path tracks which input is being read so that a read
          # error can be reported against the right file.
          current_read_path = session.source.path
          return failure(:column_not_found) unless @selector_validator.valid?(profile: session.source, selector: session.key_mapping.source_selector)

          current_read_path = session.reference.path
          return failure(:column_not_found) unless @selector_validator.valid?(profile: session.reference, selector: session.key_mapping.reference_selector)

          # Switch back to the source BEFORE reading its headers; otherwise a
          # failure of that read would be blamed on the reference file.
          current_read_path = session.source.path
          source_headers = session.source.headers_present? ? @header_reader.call(file_path: session.source.path, col_sep: session.source.separator) : nil

          if session.output_destination.file?
            write_file(session: session, source_headers: source_headers)
          else
            on_header.call(source_headers) if on_header && source_headers
            stats = @deduper.each_retained(**dedupe_options(session)) do |fields|
              on_row.call(fields) if on_row
            end
            success(stats: stats)
          end
        rescue CSV::MalformedCSVError
          failure(:could_not_parse_csv)
        rescue Errno::EACCES, Errno::ENOENT
          # ENOENT is included so that an input vanishing mid-run yields a
          # failure Result instead of an unhandled exception.
          failure(:cannot_read_file, path: current_read_path || session.source.path)
        end

        private

        # Writes the retained rows (preceded by the source headers, when
        # present) to the session's output path.
        #
        # Only errors raised while OPENING the output are reported as
        # +:cannot_write_output_file+. Errors raised while streaming the inputs
        # propagate to #call, which reports them as read/parse failures.
        def write_file(session:, source_headers:)
          output_path = session.output_destination.path

          begin
            csv = ::CSV.open(
              output_path,
              "w",
              write_headers: !source_headers.nil?,
              headers: source_headers,
              col_sep: session.source.separator
            )
          rescue Errno::EACCES, Errno::ENOENT => e
            return failure(:cannot_write_output_file, path: output_path, error_class: e.class)
          end

          begin
            stats = @deduper.each_retained(**dedupe_options(session)) { |fields| csv << fields }
          ensure
            csv.close
          end

          success(stats: stats, output_path: output_path)
        end

        # Keyword arguments forwarded to the deduper for this session.
        def dedupe_options(session)
          {
            source_path: session.source.path,
            reference_path: session.reference.path,
            source_selector: session.key_mapping.source_selector,
            reference_selector: session.key_mapping.reference_selector,
            source_col_sep: session.source.separator,
            reference_col_sep: session.reference.separator,
            match_options: session.match_options
          }
        end

        def success(data)
          Result.new(ok: true, error: nil, data: data)
        end

        def failure(code, data = {})
          Result.new(ok: false, error: code, data: data)
        end
      end
    end
  end
end
@@ -19,8 +19,8 @@ require "csvtool/domain/column_session/column_selection"
19
19
  require "csvtool/domain/column_session/extraction_options"
20
20
  require "csvtool/domain/column_session/extraction_value"
21
21
  require "csvtool/domain/column_session/preview"
22
- require "csvtool/domain/column_session/output_destination"
23
22
  require "csvtool/domain/column_session/column_session"
23
+ require "csvtool/domain/shared/output_destination"
24
24
 
25
25
  module Csvtool
26
26
  module Application
@@ -79,9 +79,9 @@ module Csvtool
79
79
  return if output_destination.nil?
80
80
  domain_destination =
81
81
  if output_destination[:mode] == :file
82
- Domain::ColumnSession::OutputDestination.file(path: output_destination[:path])
82
+ Domain::Shared::OutputDestination.file(path: output_destination[:path])
83
83
  else
84
- Domain::ColumnSession::OutputDestination.console
84
+ Domain::Shared::OutputDestination.console
85
85
  end
86
86
  session = session.with_output_destination(domain_destination)
87
87
 
@@ -11,8 +11,8 @@ require "csvtool/infrastructure/output/csv_row_console_writer"
11
11
  require "csvtool/infrastructure/output/csv_row_file_writer"
12
12
  require "csvtool/domain/row_session/row_range"
13
13
  require "csvtool/domain/row_session/row_source"
14
- require "csvtool/domain/row_session/row_output_destination"
15
14
  require "csvtool/domain/row_session/row_session"
15
+ require "csvtool/domain/shared/output_destination"
16
16
 
17
17
  module Csvtool
18
18
  module Application
@@ -56,9 +56,9 @@ module Csvtool
56
56
  return if output_destination.nil?
57
57
  destination =
58
58
  if output_destination[:mode] == :file
59
- Domain::RowSession::RowOutputDestination.file(path: output_destination[:path])
59
+ Domain::Shared::OutputDestination.file(path: output_destination[:path])
60
60
  else
61
- Domain::RowSession::RowOutputDestination.console
61
+ Domain::Shared::OutputDestination.console
62
62
  end
63
63
  session = session.with_output_destination(destination)
64
64
 
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/interface/cli/errors/presenter"
5
+ require "csvtool/interface/cli/prompts/file_path_prompt"
6
+ require "csvtool/interface/cli/prompts/separator_prompt"
7
+ require "csvtool/interface/cli/prompts/headers_present_prompt"
8
+ require "csvtool/interface/cli/prompts/seed_prompt"
9
+ require "csvtool/interface/cli/prompts/output_destination_prompt"
10
+ require "csvtool/infrastructure/csv/header_reader"
11
+ require "csvtool/infrastructure/csv/row_randomizer"
12
+ require "csvtool/domain/row_randomization_session/randomization_source"
13
+ require "csvtool/domain/row_randomization_session/randomization_options"
14
+ require "csvtool/domain/row_randomization_session/randomization_session"
15
+ require "csvtool/domain/shared/output_destination"
16
+
17
module Csvtool
  module Application
    module UseCases
      # Interactive use case: prompts for a CSV file, separator, header flag,
      # seed and output destination, passes the file through the row
      # randomizer, and sends the resulting rows to the console or a file.
      class RunRowRandomization
        def initialize(stdin:, stdout:)
          @stdin = stdin
          @stdout = stdout
          @errors = Interface::CLI::Errors::Presenter.new(stdout: stdout)
          @header_reader = Infrastructure::CSV::HeaderReader.new
          @row_randomizer = Infrastructure::CSV::RowRandomizer.new
        end

        # Walks through the prompts in order and stops early (after the error
        # presenter or the prompt itself has reported the problem) when any
        # answer is unusable.
        def call
          prompts = Interface::CLI::Prompts
          io = { stdin: @stdin, stdout: @stdout }

          file_path = prompts::FilePathPrompt.new(**io).call
          return @errors.file_not_found(file_path) unless File.file?(file_path)

          col_sep = prompts::SeparatorPrompt.new(**io, errors: @errors).call
          return if col_sep.nil?

          source = Domain::RowRandomizationSession::RandomizationSource.new(
            path: file_path,
            separator: col_sep,
            headers_present: prompts::HeadersPresentPrompt.new(**io).call
          )

          headers = nil
          if source.headers_present?
            headers = @header_reader.call(file_path: source.path, col_sep: source.separator)
            return @errors.no_headers if headers.empty?
          end

          seed = prompts::SeedPrompt.new(**io, errors: @errors).call
          return if seed == prompts::SeedPrompt::INVALID

          session = Domain::RowRandomizationSession::RandomizationSession.start(
            source: source,
            options: Domain::RowRandomizationSession::RandomizationOptions.new(seed: seed)
          )

          choice = prompts::OutputDestinationPrompt.new(**io, errors: @errors).call
          return if choice.nil?

          session = session.with_output_destination(destination_for(choice))

          rows = @row_randomizer.each(
            file_path: session.source.path,
            col_sep: session.source.separator,
            headers: session.source.headers_present?,
            seed: session.options.seed
          )

          deliver(session, headers, rows)
        rescue CSV::MalformedCSVError
          @errors.could_not_parse_csv
        rescue ArgumentError => e
          # Only the empty-output-path error is presented to the user; any
          # other ArgumentError is re-raised untouched.
          return @errors.empty_output_path if e.message == "file output path cannot be empty"

          raise e
        rescue Errno::EACCES
          @errors.cannot_read_file(file_path)
        end

        private

        # Converts the prompt's answer hash into a domain output destination.
        def destination_for(choice)
          return Domain::Shared::OutputDestination.console unless choice[:mode] == :file

          Domain::Shared::OutputDestination.file(path: choice[:path])
        end

        # Routes the rows to the destination recorded on the session.
        def deliver(session, headers, rows)
          separator = session.source.separator
          destination = session.output_destination
          return write_output_file(destination.path, headers, rows, col_sep: separator) if destination.file?

          print_to_console(headers, rows, col_sep: separator)
        end

        # Prints a blank line, the header line (if any) and every row as
        # separator-joined CSV text.
        def print_to_console(headers, rows, col_sep:)
          render = ->(fields) { ::CSV.generate_line(fields, row_sep: "", col_sep: col_sep).chomp }

          @stdout.puts
          @stdout.puts render.call(headers) if headers
          rows.each { |fields| @stdout.puts render.call(fields) }
        end

        # Writes headers (if any) and rows to +path+, then confirms on stdout.
        # Permission and missing-path errors are handed to the error presenter.
        def write_output_file(path, headers, rows, col_sep:)
          writer_options = { write_headers: !headers.nil?, headers: headers, col_sep: col_sep }
          ::CSV.open(path, "w", **writer_options) do |csv|
            rows.each { |fields| csv << fields }
          end
          @stdout.puts "Wrote output to #{path}"
        rescue Errno::EACCES, Errno::ENOENT => e
          @errors.cannot_write_output_file(path, e.class)
        end
      end
    end
  end
end
data/lib/csvtool/cli.rb CHANGED
@@ -4,6 +4,8 @@ require "csv"
4
4
  require "csvtool/interface/cli/menu_loop"
5
5
  require "csvtool/application/use_cases/run_extraction"
6
6
  require "csvtool/application/use_cases/run_row_extraction"
7
+ require "csvtool/application/use_cases/run_row_randomization"
8
+ require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
7
9
  require "csvtool/interface/cli/errors/presenter"
8
10
  require "csvtool/infrastructure/csv/header_reader"
9
11
  require "csvtool/infrastructure/csv/value_streamer"
@@ -14,6 +16,8 @@ module Csvtool
14
16
  MENU_OPTIONS = [
15
17
  "Extract column",
16
18
  "Extract rows (range)",
19
+ "Randomize rows",
20
+ "Dedupe using another CSV",
17
21
  "Exit"
18
22
  ].freeze
19
23
 
@@ -45,12 +49,16 @@ module Csvtool
45
49
  def run_menu_loop
46
50
  extract_column_action = -> { Application::UseCases::RunExtraction.new(stdin: @stdin, stdout: @stdout).call }
47
51
  extract_rows_action = -> { Application::UseCases::RunRowExtraction.new(stdin: @stdin, stdout: @stdout).call }
52
+ randomize_rows_action = -> { Application::UseCases::RunRowRandomization.new(stdin: @stdin, stdout: @stdout).call }
53
+ dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
48
54
  Interface::CLI::MenuLoop.new(
49
55
  stdin: @stdin,
50
56
  stdout: @stdout,
51
57
  menu_options: MENU_OPTIONS,
52
58
  extract_column_action: extract_column_action,
53
- extract_rows_action: extract_rows_action
59
+ extract_rows_action: extract_rows_action,
60
+ randomize_rows_action: randomize_rows_action,
61
+ dedupe_action: dedupe_action
54
62
  ).run
55
63
  end
56
64
 
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module CrossCsvDedupeSession
      # Selects one column of a CSV row: by header name when the file has
      # headers, otherwise by 1-based position.
      class ColumnSelector
        attr_reader :value

        # Builds a selector from raw user input.
        #
        # Raises ArgumentError when a header name is empty, or when a
        # positional index is not a positive integer written in digits.
        def self.from_input(headers_present:, input:)
          text = input.to_s

          unless headers_present
            raise ArgumentError, "column index must be a positive integer" unless text.match?(/\A[1-9]\d*\z/)

            return new(value: input.to_i, headers_present: false)
          end

          raise ArgumentError, "column name cannot be empty" if text.empty?

          new(value: text, headers_present: true)
        end

        def initialize(value:, headers_present:)
          @value = value
          @headers_present = headers_present ? true : false
        end

        def headers_present?
          @headers_present
        end

        # True when the selector addresses the column by position.
        def index?
          !headers_present?
        end

        # Returns the selected cell of +row+ as a String (empty when missing).
        # Positional selectors are 1-based, hence the shift by one.
        def extract_from(row)
          key = headers_present? ? @value : @value - 1
          row[key].to_s
        end
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csvtool/domain/cross_csv_dedupe_session/csv_profile"
4
+ require "csvtool/domain/cross_csv_dedupe_session/key_mapping"
5
+ require "csvtool/domain/cross_csv_dedupe_session/match_options"
6
+ require "csvtool/domain/shared/output_destination"
7
+
8
module Csvtool
  module Domain
    module CrossCsvDedupeSession
      # Value object bundling everything one cross-CSV dedupe run needs: the
      # two CSV profiles, the key mapping, the match options and (optionally)
      # where the output goes.
      class CrossCsvDedupeSession
        attr_reader :source, :reference, :key_mapping, :match_options, :output_destination

        # Starts a session with no output destination chosen yet.
        def self.start(source:, reference:, key_mapping:, match_options:)
          new(source: source, reference: reference, key_mapping: key_mapping, match_options: match_options)
        end

        # Raises ArgumentError when any argument is not of the expected
        # domain type; +output_destination+ may also be nil.
        def initialize(source:, reference:, key_mapping:, match_options:, output_destination: nil)
          require_kind!(source, CsvProfile, "source must be CsvProfile")
          require_kind!(reference, CsvProfile, "reference must be CsvProfile")
          require_kind!(key_mapping, KeyMapping, "key_mapping must be KeyMapping")
          require_kind!(match_options, MatchOptions, "match_options must be MatchOptions")
          if !output_destination.nil? && !output_destination.is_a?(Domain::Shared::OutputDestination)
            raise ArgumentError, "output_destination must be OutputDestination or nil"
          end

          @source = source
          @reference = reference
          @key_mapping = key_mapping
          @match_options = match_options
          @output_destination = output_destination
        end

        # Returns a new session identical to this one except for its
        # output destination; the receiver is left unchanged.
        def with_output_destination(destination)
          self.class.new(
            source: source,
            reference: reference,
            key_mapping: key_mapping,
            match_options: match_options,
            output_destination: destination
          )
        end

        private

        # Raises ArgumentError with +message+ unless +object+ is a +type+.
        def require_kind!(object, type, message)
          raise ArgumentError, message unless object.is_a?(type)
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module CrossCsvDedupeSession
      # Describes one CSV input: where it lives, its column separator and
      # whether its first row holds headers.
      class CsvProfile
        attr_reader :path, :separator

        # Raises ArgumentError when +path+ or +separator+ is nil or empty.
        # +headers_present+ is stored as a strict boolean.
        def initialize(path:, separator:, headers_present:)
          path_missing = path.to_s.empty?
          raise ArgumentError, "path cannot be empty" if path_missing

          separator_missing = separator.to_s.empty?
          raise ArgumentError, "separator cannot be empty" if separator_missing

          @path = path
          @separator = separator
          @headers_present = headers_present ? true : false
        end

        def headers_present?
          @headers_present
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csvtool/domain/cross_csv_dedupe_session/column_selector"
4
+
5
module Csvtool
  module Domain
    module CrossCsvDedupeSession
      # Pairs the key-column selector of the source CSV with the one of the
      # reference CSV.
      class KeyMapping
        attr_reader :source_selector, :reference_selector

        # Raises ArgumentError unless both selectors are ColumnSelector instances.
        def initialize(source_selector:, reference_selector:)
          [source_selector, reference_selector].each do |selector|
            raise ArgumentError, "selectors must be ColumnSelector" unless selector.is_a?(ColumnSelector)
          end

          @source_selector = source_selector
          @reference_selector = reference_selector
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module CrossCsvDedupeSession
      # Controls how key values are normalized before they are compared.
      class MatchOptions
        attr_reader :trim_whitespace, :case_insensitive

        # Both flags are stored as strict booleans.
        def initialize(trim_whitespace:, case_insensitive:)
          @trim_whitespace = trim_whitespace ? true : false
          @case_insensitive = case_insensitive ? true : false
        end

        # Predicate spellings of the two readers.
        alias_method :trim_whitespace?, :trim_whitespace
        alias_method :case_insensitive?, :case_insensitive

        # Returns +value+ as a String, stripped of surrounding whitespace
        # and/or downcased according to the configured flags.
        def normalize(value)
          text = value.to_s
          text = text.strip if trim_whitespace?
          text = text.downcase if case_insensitive?
          text
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module RowRandomizationSession
      # Options for a row randomization run; currently only the optional seed.
      class RandomizationOptions
        attr_reader :seed

        # Raises ArgumentError unless +seed+ is nil or an Integer.
        def initialize(seed:)
          acceptable = seed.nil? || seed.is_a?(Integer)
          raise ArgumentError, "seed must be an integer or nil" unless acceptable

          @seed = seed
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module RowRandomizationSession
      # Value object holding the source, the options and (once chosen) the
      # output destination of a row randomization run.
      class RandomizationSession
        attr_reader :source, :options, :output_destination

        class << self
          # Starts a session with no output destination chosen yet.
          def start(source:, options:)
            new(source: source, options: options)
          end
        end

        def initialize(source:, options:, output_destination: nil)
          @source = source
          @options = options
          @output_destination = output_destination
        end

        # Returns a new session with the same source and options but the
        # given destination; the receiver is left unchanged.
        def with_output_destination(destination)
          self.class.new(source: source, options: options, output_destination: destination)
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module RowRandomizationSession
      # Describes the CSV file whose rows are to be randomized: its path, its
      # column separator and whether its first row holds headers.
      class RandomizationSource
        attr_reader :path, :separator

        # Raises ArgumentError when +path+ or +separator+ is nil or empty.
        #
        # +headers_present+ is coerced to a strict boolean so that
        # #headers_present? never leaks nil or an arbitrary truthy object
        # (for example a String) to its callers.
        def initialize(path:, separator:, headers_present:)
          raise ArgumentError, "path cannot be empty" if path.to_s.empty?
          raise ArgumentError, "separator cannot be empty" if separator.to_s.empty?

          @path = path
          @separator = separator
          @headers_present = !!headers_present
        end

        # Always +true+ or +false+.
        def headers_present?
          @headers_present
        end
      end
    end
  end
end
@@ -7,6 +7,9 @@ module Csvtool
7
7
  attr_reader :path, :separator
8
8
 
9
9
  def initialize(path:, separator:)
10
+ raise ArgumentError, "path cannot be empty" if path.to_s.empty?
11
+ raise ArgumentError, "separator cannot be empty" if separator.to_s.empty?
12
+
10
13
  @path = path
11
14
  @separator = separator
12
15
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Csvtool
4
4
  module Domain
5
- module ColumnSession
5
+ module Shared
6
6
  class OutputDestination
7
7
  attr_reader :mode, :path
8
8
 
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "set"
5
+ require "csvtool/domain/cross_csv_dedupe_session/match_options"
6
+
7
module Csvtool
  module Infrastructure
    module CSV
      # Filters a source CSV against a reference CSV: a source row is dropped
      # when its normalized key is among the normalized keys of the reference.
      class CrossCsvDeduper
        # Collects the retained rows in memory.
        #
        # Returns the stats hash of #each_retained with an extra +:kept_rows+
        # entry holding the retained rows.
        def call(
          source_path:,
          reference_path:,
          source_selector:,
          reference_selector:,
          source_col_sep: ",",
          reference_col_sep: ",",
          match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
        )
          retained = []
          summary = each_retained(
            source_path: source_path,
            reference_path: reference_path,
            source_selector: source_selector,
            reference_selector: reference_selector,
            source_col_sep: source_col_sep,
            reference_col_sep: reference_col_sep,
            match_options: match_options
          ) { |fields| retained.push(fields) }

          summary.merge(kept_rows: retained)
        end

        # Reads the reference file fully to build the key set, then streams
        # the source file, yielding the fields of every row whose key is not
        # in that set (the block is optional).
        #
        # Returns a hash with +:headers+ (source headers; an empty array when
        # the source has headers but no data rows; nil when it has no
        # headers), +:source_rows+, +:removed_rows+ and +:kept_rows_count+.
        def each_retained(
          source_path:,
          reference_path:,
          source_selector:,
          reference_selector:,
          source_col_sep: ",",
          reference_col_sep: ",",
          match_options: Domain::CrossCsvDedupeSession::MatchOptions.new(trim_whitespace: true, case_insensitive: false)
        )
          with_headers = source_selector.headers_present?
          known_keys = reference_key_set(reference_path, reference_selector, reference_col_sep, match_options)

          header_row = nil
          total = 0
          removed = 0

          ::CSV.foreach(source_path, headers: with_headers, col_sep: source_col_sep) do |row|
            header_row ||= row.headers if with_headers
            total += 1

            if known_keys.include?(extract_key(row, selector: source_selector, match_options: match_options))
              removed += 1
              next
            end

            # Header-mode rows are CSV::Row objects; hand out plain field arrays.
            yield(with_headers ? row.fields : row) if block_given?
          end

          {
            headers: with_headers ? (header_row || []) : nil,
            source_rows: total,
            removed_rows: removed,
            kept_rows_count: total - removed
          }
        end

        private

        # Builds the set of normalized keys found in the reference file.
        def reference_key_set(path, selector, col_sep, match_options)
          keys = Set.new
          ::CSV.foreach(path, headers: selector.headers_present?, col_sep: col_sep) do |row|
            keys << extract_key(row, selector: selector, match_options: match_options)
          end
          keys
        end

        # Normalized key of +row+ as chosen by +selector+.
        def extract_key(row, selector:, match_options:)
          match_options.normalize(selector.extract_from(row))
        end
      end
    end
  end
end