csvops 0.5.0.alpha → 0.6.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/README.md +45 -3
  3. data/docs/architecture.md +61 -4
  4. data/docs/release-v0.6.0-alpha.md +84 -0
  5. data/lib/csvtool/application/use_cases/run_csv_parity.rb +70 -0
  6. data/lib/csvtool/cli.rb +5 -1
  7. data/lib/csvtool/domain/csv_parity_session/parity_options.rb +22 -0
  8. data/lib/csvtool/domain/csv_parity_session/parity_session.rb +20 -0
  9. data/lib/csvtool/domain/csv_parity_session/source_pair.rb +19 -0
  10. data/lib/csvtool/infrastructure/csv/csv_parity_comparator.rb +71 -0
  11. data/lib/csvtool/interface/cli/errors/presenter.rb +4 -0
  12. data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
  13. data/lib/csvtool/interface/cli/workflows/builders/csv_parity_session_builder.rb +33 -0
  14. data/lib/csvtool/interface/cli/workflows/presenters/csv_parity_presenter.rb +38 -0
  15. data/lib/csvtool/interface/cli/workflows/run_csv_parity_workflow.rb +66 -0
  16. data/lib/csvtool/interface/cli/workflows/steps/parity/build_session_step.rb +25 -0
  17. data/lib/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step.rb +32 -0
  18. data/lib/csvtool/interface/cli/workflows/steps/parity/execute_step.rb +26 -0
  19. data/lib/csvtool/version.rb +1 -1
  20. data/test/csvtool/application/use_cases/run_csv_parity_test.rb +160 -0
  21. data/test/csvtool/cli_test.rb +175 -21
  22. data/test/csvtool/cli_unit_test.rb +4 -4
  23. data/test/csvtool/domain/csv_parity_session/parity_options_test.rb +17 -0
  24. data/test/csvtool/domain/csv_parity_session/parity_session_test.rb +18 -0
  25. data/test/csvtool/domain/csv_parity_session/source_pair_test.rb +11 -0
  26. data/test/csvtool/infrastructure/csv/csv_parity_comparator_test.rb +78 -0
  27. data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
  28. data/test/csvtool/interface/cli/menu_loop_test.rb +59 -16
  29. data/test/csvtool/interface/cli/workflows/builders/csv_parity_session_builder_test.rb +20 -0
  30. data/test/csvtool/interface/cli/workflows/presenters/csv_parity_presenter_test.rb +43 -0
  31. data/test/csvtool/interface/cli/workflows/run_csv_parity_workflow_test.rb +94 -0
  32. data/test/csvtool/interface/cli/workflows/steps/parity/build_session_step_test.rb +41 -0
  33. data/test/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step_test.rb +30 -0
  34. data/test/csvtool/interface/cli/workflows/steps/parity/execute_step_test.rb +40 -0
  35. data/test/fixtures/parity_duplicates_left.csv +4 -0
  36. data/test/fixtures/parity_duplicates_right.csv +3 -0
  37. data/test/fixtures/parity_people_header_mismatch.csv +4 -0
  38. data/test/fixtures/parity_people_many_reordered.csv +13 -0
  39. data/test/fixtures/parity_people_mismatch.csv +4 -0
  40. data/test/fixtures/parity_people_reordered.csv +4 -0
  41. data/test/fixtures/parity_people_reordered.tsv +4 -0
  42. metadata +31 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b96fb7e03fa0629d3412a97d3abff5414492ac46ad08ede2c872e2176fcbfc62
4
- data.tar.gz: 856b7735a472b5810d5f19dff6371a565a7fcc538ce5b6eba52260fff0028760
3
+ metadata.gz: f7db22cb84c1d08c58b473368f9ad37575a217d6293539309277ed2b032a2852
4
+ data.tar.gz: 124bebc822fefa5d1f71286701959876260c82164067c36ff94b712a0b4cc1b3
5
5
  SHA512:
6
- metadata.gz: 5f643d331c6b54cb5feb0fe5db4ff7f8f7bc5c28461f74e3bfca5cf93d25703b84f497e72377302874b2b6302ef0fb542995c72d2d21798e3a998f6d5b294704
7
- data.tar.gz: 0e254fa75780ce0605054c24b28301d8786535a0f2bbff7adfb45a75f09e60e5315e950648208fa5772d08cdd6abce95ea382838f568947af05ceaa77ba1888f
6
+ metadata.gz: a8b8dbcfb66073f46f0ecc625267081fbe730e69ef9295f5d2303af6b831a9d71ef564f78f5b44212eb33c4ad7a5fdb78b54fa98e21dd58669e9494a5d3325fb
7
+ data.tar.gz: 05cbcaa2ca3116ad463413e53600d32a53df0941ceb8873ed22c2ef2d4cfe1afc8f90e44c7ff4400212ebbd5083a2ceb6a983281291436e9478d5087cc98b9ad
data/README.md CHANGED
@@ -37,11 +37,12 @@ CSV Tool Menu
37
37
  2. Extract rows (range)
38
38
  3. Randomize rows
39
39
  4. Dedupe using another CSV
40
- 5. Exit
40
+ 5. Validate parity
41
+ 6. Exit
41
42
  >
42
43
  ```
43
44
 
44
- Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, or `4` for cross-CSV dedupe.
45
+ Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, or `5` for parity validation.
45
46
 
46
47
  ### 3. Follow prompts
47
48
 
@@ -59,6 +60,7 @@ Prompt flow by action:
59
60
  - `Extract rows (range)`: file path, separator, start row, end row, output destination.
60
61
  - `Randomize rows`: file path, separator, headers present, optional seed, output destination.
61
62
  - `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
63
+ - `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
62
64
 
63
65
  ### 4. Example interaction (console output)
64
66
 
@@ -129,7 +131,8 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
129
131
  2. Extract rows (range)
130
132
  3. Randomize rows
131
133
  4. Dedupe using another CSV
132
- 5. Exit
134
+ 5. Validate parity
135
+ 6. Exit
133
136
  +> 4
134
137
  CSV file path: /tmp/source.csv
135
138
  Source CSV separator:
@@ -166,6 +169,45 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
166
169
  -Summary: source_rows=5 removed_rows=3 kept_rows=2
167
170
  ```
168
171
 
172
+ ### 8. Parity interaction example
173
+
174
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
175
+
176
+ ```diff
177
+ CSV Tool Menu
178
+ 1. Extract column
179
+ 2. Extract rows (range)
180
+ 3. Randomize rows
181
+ 4. Dedupe using another CSV
182
+ 5. Validate parity
183
+ 6. Exit
184
+ +> 5
185
+ Left CSV file path: /tmp/left.csv
186
+ Right CSV file path: /tmp/right.csv
187
+ Choose separator:
188
+ 1. comma (,)
189
+ 2. tab (\t)
190
+ 3. semicolon (;)
191
+ 4. pipe (|)
192
+ 5. custom
193
+ +Separator choice [1]: 1
194
+ Headers present? [Y/n]:
195
+ -MISMATCH
196
+ -Summary: left_rows=10 right_rows=10 left_only=2 right_only=2
197
+ -Left-only examples:
198
+ - 4,Dina (count +1)
199
+ -Right-only examples:
200
+ - 4,Dina-Updated (count +1)
201
+ ```
202
+
203
+ ### 9. Parity large-file behavior
204
+
205
+ - Parity uses a streaming count-delta strategy:
206
+ - Stream left rows and increment row-key counts.
207
+ - Stream right rows and decrement row-key counts.
208
+ - Exact duplicate semantics are preserved by count deltas per normalized row value.
209
+ - Memory scales with the number of distinct row keys in the parity map, not the total input row count.
210
+
169
211
  ## Testing
170
212
 
171
213
  Run tests:
data/docs/architecture.md CHANGED
@@ -3,14 +3,14 @@
3
3
  The codebase follows a DDD-lite layered structure:
4
4
 
5
5
  - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
6
- - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`).
7
- - `infrastructure/`: CSV reading/streaming and output adapters (console/file), plus cross-CSV dedupe adapter.
6
+ - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`).
7
+ - `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
8
8
  - `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
9
9
  - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
10
10
 
11
11
  ## Workflow boundary (standardized)
12
12
 
13
- For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`), the boundary is:
13
+ For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`), the boundary is:
14
14
 
15
15
  - `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
16
16
  - `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
@@ -32,6 +32,7 @@ Current usage:
32
32
  - `RunRowExtractionWorkflow` uses `WorkflowStepPipeline` + `Steps::RowExtraction::*`.
33
33
  - `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
34
34
  - `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
35
+ - `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
35
36
 
36
37
  ## Adding New Concepts
37
38
 
@@ -107,7 +108,7 @@ For a new function type, prefer one of these patterns:
107
108
 
108
109
  ## Domain model
109
110
 
110
- Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, and `Cross-CSV Dedupe`.
111
+ Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, and `CSV Parity`.
111
112
 
112
113
  ### Cross-CSV Dedupe (Large-file behavior)
113
114
 
@@ -366,6 +367,60 @@ classDiagram
366
367
  RunCrossCsvDedupe --> CsvCrossCsvDedupeFileWriter
367
368
  ```
368
369
 
370
+ ### CSV Parity
371
+
372
+ Core DDD structure:
373
+
374
+ - Aggregate root: `ParitySession`
375
+ - Captures one parity check request.
376
+ - Holds left/right source paths and parity options.
377
+ - Entities:
378
+ - `SourcePair` (left and right file paths)
379
+ - Value objects:
380
+ - `ParityOptions` (separator + header mode)
381
+ - Application service:
382
+ - `Application::UseCases::RunCsvParity` orchestrates parity validation and returns request/result style payloads.
383
+ - Infrastructure adapters:
384
+ - `Infrastructure::CSV::HeaderReader`
385
+ - `Infrastructure::CSV::CsvParityComparator` (streaming count-delta strategy with duplicate-aware semantics)
386
+ - Interface adapters:
387
+ - `Interface::CLI::MenuLoop`
388
+ - `Interface::CLI::Workflows::RunCsvParityWorkflow`
389
+ - `Interface::CLI::Workflows::Builders::CsvParitySessionBuilder`
390
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
391
+ - `Interface::CLI::Workflows::Steps::Parity::*`
392
+ - `Interface::CLI::Workflows::Presenters::CsvParityPresenter`
393
+ - `Interface::CLI::Workflows::Support::ResultErrorHandler`
394
+ - `Interface::CLI::Prompts::*`
395
+ - `Interface::CLI::Errors::Presenter`
396
+
397
+ ```mermaid
398
+ classDiagram
399
+ direction LR
400
+ class MenuLoop
401
+ class RunCsvParityWorkflow
402
+ class Prompts
403
+ class Errors
404
+ class RunCsvParity
405
+ class ParitySession
406
+ class SourcePair
407
+ class ParityOptions
408
+ class HeaderReader
409
+ class CsvParityComparator
410
+ class CsvParityPresenter
411
+
412
+ MenuLoop --> RunCsvParityWorkflow : invokes
413
+ RunCsvParityWorkflow --> Prompts : uses
414
+ RunCsvParityWorkflow --> Errors : reports failures
415
+ RunCsvParityWorkflow --> CsvParityPresenter : renders
416
+ RunCsvParityWorkflow --> RunCsvParity : calls
417
+ RunCsvParity --> ParitySession : orchestrates
418
+ ParitySession o-- SourcePair
419
+ ParitySession o-- ParityOptions
420
+ RunCsvParity --> HeaderReader
421
+ RunCsvParity --> CsvParityComparator
422
+ ```
423
+
369
424
  ## Project layout
370
425
 
371
426
  ```text
@@ -375,11 +430,13 @@ lib/csvtool/domain/column_session/*
375
430
  lib/csvtool/domain/row_session/*
376
431
  lib/csvtool/domain/row_randomization_session/*
377
432
  lib/csvtool/domain/cross_csv_dedupe_session/*
433
+ lib/csvtool/domain/csv_parity_session/*
378
434
  lib/csvtool/domain/shared/output_destination.rb
379
435
  lib/csvtool/application/use_cases/run_extraction.rb
380
436
  lib/csvtool/application/use_cases/run_row_extraction.rb
381
437
  lib/csvtool/application/use_cases/run_row_randomization.rb
382
438
  lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
439
+ lib/csvtool/application/use_cases/run_csv_parity.rb
383
440
  lib/csvtool/infrastructure/csv/*
384
441
  lib/csvtool/infrastructure/output/*
385
442
  lib/csvtool/interface/cli/menu_loop.rb
@@ -0,0 +1,84 @@
1
+ # Release Checklist: v0.6.0-alpha
2
+
3
+ ## 1. Verify environment
4
+
5
+ ```bash
6
+ ruby -v
7
+ bundle -v
8
+ ```
9
+
10
+ Expected:
11
+ - Ruby `3.3.x`
12
+
13
+ ## 2. Install dependencies
14
+
15
+ ```bash
16
+ bundle install
17
+ ```
18
+
19
+ ## 3. Run quality checks
20
+
21
+ ```bash
22
+ bundle exec rake test
23
+ ```
24
+
25
+ ## 4. Smoke test CLI commands
26
+
27
+ ```bash
28
+ bundle exec csvtool menu
29
+ bundle exec csvtool column test/fixtures/sample_people.csv name
30
+ ```
31
+
32
+ ## 5. Smoke test workflows
33
+
34
+ ### CSV parity workflow
35
+
36
+ Use menu option `5` (`Validate parity`) and verify:
37
+ - matching files with reordered rows return parity success
38
+ - mismatch files return friendly mismatch summary with sample deltas
39
+ - separator and header-mode selections are respected
40
+
41
+ ### Existing workflows regression pass
42
+
43
+ Run quick checks for menu options `1-4` and confirm:
44
+ - column extraction still works
45
+ - row-range extraction still works
46
+ - row randomization still works
47
+ - cross-CSV dedupe still works
48
+
49
+ ## 6. Build and validate gem package
50
+
51
+ ```bash
52
+ gem build csvops.gemspec
53
+ gem install ./csvops-0.6.0.alpha.gem
54
+ csvtool menu
55
+ ```
56
+
57
+ ## 7. Commit release prep
58
+
59
+ ```bash
60
+ git add -A
61
+ git commit -m "chore(release): prepare v0.6.0-alpha"
62
+ ```
63
+
64
+ ## 8. Tag release
65
+
66
+ ```bash
67
+ git tag -a v0.6.0-alpha -m "v0.6.0-alpha"
68
+ git push origin main --tags
69
+ ```
70
+
71
+ ## 9. Publish gem
72
+
73
+ ```bash
74
+ gem push csvops-0.6.0.alpha.gem
75
+ ```
76
+
77
+ ## 10. Create GitHub release
78
+
79
+ Create release `v0.6.0-alpha` with:
80
+ - Dedicated CSV parity validation workflow
81
+ - Header/separator parity options
82
+ - Friendly parity mismatch reporting
83
+ - Streaming count-delta parity comparator
84
+ - Parity architecture convergence (session model, workflow steps, presenter, docs)
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/infrastructure/csv/csv_parity_comparator"
5
+ require "csvtool/infrastructure/csv/header_reader"
6
+
7
module Csvtool
  module Application
    module UseCases
      # Application service that validates parity between two CSV files.
      #
      # It checks that both paths are regular files, optionally verifies that
      # both header rows exist and are identical, then delegates the row
      # comparison to the comparator and wraps the outcome in a Result.
      class RunCsvParity
        # Outcome envelope: +ok+ flag, +error+ code (Symbol or nil) and +data+
        # (comparator stats on success, error details Hash on failure).
        Result = Struct.new(:ok, :error, :data, keyword_init: true) do
          def ok?
            ok
          end
        end

        # @param comparator [#call] called with left_path:, right_path:,
        #   col_sep: and headers_present:; its return value becomes Result#data.
        # @param header_reader [#call] called with file_path: and col_sep:;
        #   must return something responding to #empty? and #==.
        def initialize(
          comparator: Infrastructure::CSV::CsvParityComparator.new,
          header_reader: Infrastructure::CSV::HeaderReader.new
        )
          @comparator = comparator
          @header_reader = header_reader
        end

        # Runs the parity check described by +session+.
        #
        # @param session [#source_pair, #options] source_pair exposes
        #   left_path/right_path; options exposes separator/headers_present?.
        # @return [Result] ok with comparator stats, or a failure carrying one
        #   of :file_not_found, :no_headers, :header_mismatch,
        #   :could_not_parse_csv or :cannot_read_file.
        def call(session:)
          left_path = session.source_pair.left_path
          right_path = session.source_pair.right_path
          col_sep = session.options.separator
          headers_present = session.options.headers_present?
          # Path currently being read, so a permission error can be attributed
          # to the correct file instead of always blaming the left one.
          reading = nil

          return failure(:file_not_found, path: left_path) unless File.file?(left_path)
          return failure(:file_not_found, path: right_path) unless File.file?(right_path)

          if headers_present
            reading = left_path
            left_headers = @header_reader.call(file_path: left_path, col_sep: col_sep)
            return failure(:no_headers, path: left_path) if left_headers.empty?

            reading = right_path
            right_headers = @header_reader.call(file_path: right_path, col_sep: col_sep)
            return failure(:no_headers, path: right_path) if right_headers.empty?

            unless left_headers == right_headers
              return failure(:header_mismatch, left_headers: left_headers, right_headers: right_headers)
            end
          end

          # The comparator reads both files; which one failed is resolved in
          # the rescue below by probing readability.
          reading = nil
          stats = @comparator.call(
            left_path: left_path,
            right_path: right_path,
            col_sep: col_sep,
            headers_present: headers_present
          )

          success(stats)
        rescue CSV::MalformedCSVError
          failure(:could_not_parse_csv)
        rescue Errno::EACCES => e
          failure(:cannot_read_file, path: denied_path(e, reading, left_path, right_path))
        end

        private

        def success(data)
          Result.new(ok: true, error: nil, data: data)
        end

        def failure(code, data = {})
          Result.new(ok: false, error: code, data: data)
        end

        # Picks the path to report for a permission error: the error's own
        # path if it exposes one, else the file being read when it was raised,
        # else the first unreadable candidate, else the left path.
        def denied_path(error, reading, left_path, right_path)
          return error.path if error.respond_to?(:path)

          reading ||
            [left_path, right_path].compact.find { |path| !File.readable?(path) } ||
            left_path
        end
      end
    end
  end
end
data/lib/csvtool/cli.rb CHANGED
@@ -6,6 +6,7 @@ require "csvtool/interface/cli/workflows/run_extraction_workflow"
6
6
  require "csvtool/interface/cli/workflows/run_row_extraction_workflow"
7
7
  require "csvtool/interface/cli/workflows/run_row_randomization_workflow"
8
8
  require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
9
+ require "csvtool/interface/cli/workflows/run_csv_parity_workflow"
9
10
  require "csvtool/interface/cli/errors/presenter"
10
11
  require "csvtool/infrastructure/csv/header_reader"
11
12
  require "csvtool/infrastructure/csv/value_streamer"
@@ -18,6 +19,7 @@ module Csvtool
18
19
  "Extract rows (range)",
19
20
  "Randomize rows",
20
21
  "Dedupe using another CSV",
22
+ "Validate parity",
21
23
  "Exit"
22
24
  ].freeze
23
25
 
@@ -51,6 +53,7 @@ module Csvtool
51
53
  extract_rows_action = -> { Interface::CLI::Workflows::RunRowExtractionWorkflow.new(stdin: @stdin, stdout: @stdout).call }
52
54
  randomize_rows_action = -> { Interface::CLI::Workflows::RunRowRandomizationWorkflow.new(stdin: @stdin, stdout: @stdout).call }
53
55
  dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
56
+ parity_action = -> { Interface::CLI::Workflows::RunCsvParityWorkflow.new(stdin: @stdin, stdout: @stdout).call }
54
57
  Interface::CLI::MenuLoop.new(
55
58
  stdin: @stdin,
56
59
  stdout: @stdout,
@@ -58,7 +61,8 @@ module Csvtool
58
61
  extract_column_action: extract_column_action,
59
62
  extract_rows_action: extract_rows_action,
60
63
  randomize_rows_action: randomize_rows_action,
61
- dedupe_action: dedupe_action
64
+ dedupe_action: dedupe_action,
65
+ parity_action: parity_action
62
66
  ).run
63
67
  end
64
68
 
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module CsvParitySession
      # Options for a parity check: the column separator and whether the
      # files are treated as having a header row.
      class ParityOptions
        attr_reader :separator

        # @param separator [#to_s] column separator; its string form must not be empty
        # @param headers_present [Object] stored as given and returned by #headers_present?
        # @raise [ArgumentError] when the separator's string form is empty
        def initialize(separator:, headers_present:)
          separator_text = separator.to_s
          raise ArgumentError, "separator cannot be empty" if separator_text.empty?

          @headers_present = headers_present
          @separator = separator
        end

        # @return [Object] the headers_present value supplied at construction
        def headers_present?
          @headers_present
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module CsvParitySession
      # Holds the source pair and the options for one parity check.
      class ParitySession
        class << self
          # Named constructor; builds a session from the given parts.
          def start(source_pair:, options:)
            new(source_pair: source_pair, options: options)
          end
        end

        attr_reader :source_pair, :options

        # @param source_pair [Object] stored and exposed via #source_pair
        # @param options [Object] stored and exposed via #options
        def initialize(source_pair:, options:)
          @source_pair, @options = source_pair, options
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Domain
    module CsvParitySession
      # The two file paths (left and right) compared by a parity check.
      class SourcePair
        attr_reader :left_path, :right_path

        # @param left_path [#to_s] left file path; its string form must not be empty
        # @param right_path [#to_s] right file path; its string form must not be empty
        # @raise [ArgumentError] when either path's string form is empty
        def initialize(left_path:, right_path:)
          @left_path = require_path(left_path, "left_path cannot be empty")
          @right_path = require_path(right_path, "right_path cannot be empty")
        end

        private

        # Returns +candidate+ unchanged, or raises with +message+ when blank.
        def require_path(candidate, message)
          raise ArgumentError, message if candidate.to_s.empty?

          candidate
        end
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
module Csvtool
  module Infrastructure
    module CSV
      # Compares two CSV files as multisets of rows.
      #
      # Each left row adds one to a per-row counter and each right row
      # subtracts one; any non-zero counter left over is a mismatch.
      class CsvParityComparator
        # @param left_path [String] path of the left CSV file
        # @param right_path [String] path of the right CSV file
        # @param col_sep [String] column separator used for parsing and for row keys
        # @param headers_present [Boolean] when truthy the first row of each file is a header
        # @param sample_limit [Integer] max example rows collected per side
        # @return [Hash] :match, :left_rows, :right_rows, :left_only_count,
        #   :right_only_count, :left_only_examples, :right_only_examples
        def call(left_path:, right_path:, col_sep:, headers_present:, sample_limit: 5)
          balance = Hash.new(0)
          left_total = each_row_key(left_path, col_sep, headers_present) { |row_key| balance[row_key] += 1 }
          right_total = each_row_key(right_path, col_sep, headers_present) { |row_key| balance[row_key] -= 1 }

          totals, samples = summarize(balance, sample_limit)

          {
            match: totals[:left].zero? && totals[:right].zero?,
            left_rows: left_total,
            right_rows: right_total,
            left_only_count: totals[:left],
            right_only_count: totals[:right],
            left_only_examples: samples[:left],
            right_only_examples: samples[:right]
          }
        end

        private

        # Yields the serialized key of every data row in +path+ and returns
        # how many rows were read.
        def each_row_key(path, col_sep, headers_present)
          total = 0

          ::CSV.foreach(path, headers: headers_present, col_sep: col_sep) do |record|
            values = headers_present ? record.fields : record
            # Re-serializing the fields gives a canonical, quoted row key.
            yield ::CSV.generate_line(values, row_sep: "", col_sep: col_sep).chomp
            total += 1
          end

          total
        end

        # Splits non-zero counters into left-only (positive) and right-only
        # (negative) totals, keeping up to +sample_limit+ examples per side.
        def summarize(balance, sample_limit)
          totals = { left: 0, right: 0 }
          samples = { left: [], right: [] }

          balance.each do |row_key, delta|
            next if delta.zero?

            side = delta.positive? ? :left : :right
            magnitude = delta.abs
            totals[side] += magnitude
            next unless samples[side].length < sample_limit

            samples[side] << { row: row_key, count_delta: magnitude }
          end

          [totals, samples]
        end
      end
    end
  end
end
@@ -72,6 +72,10 @@ module Csvtool
72
72
  def row_range_out_of_bounds(total_rows)
73
73
  @stdout.puts "Row range is out of bounds. File has #{total_rows} data rows."
74
74
  end
75
+
76
# Prints the message shown when the two CSV header rows differ.
def header_mismatch
  @stdout.puts("CSV headers do not match.")
end
75
79
  end
76
80
  end
77
81
  end
@@ -4,7 +4,7 @@ module Csvtool
4
4
  module Interface
5
5
  module CLI
6
6
  class MenuLoop
7
- def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:)
7
+ def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:, parity_action:)
8
8
  @stdin = stdin
9
9
  @stdout = stdout
10
10
  @menu_options = menu_options
@@ -12,6 +12,7 @@ module Csvtool
12
12
  @extract_rows_action = extract_rows_action
13
13
  @randomize_rows_action = randomize_rows_action
14
14
  @dedupe_action = dedupe_action
15
+ @parity_action = parity_action
15
16
  end
16
17
 
17
18
  def run
@@ -31,9 +32,11 @@ module Csvtool
31
32
  when "4"
32
33
  @dedupe_action.call
33
34
  when "5"
35
+ @parity_action.call
36
+ when "6"
34
37
  return 0
35
38
  else
36
- @stdout.puts "Please choose 1, 2, 3, 4, or 5."
39
+ @stdout.puts "Please choose 1, 2, 3, 4, 5, or 6."
37
40
  end
38
41
  end
39
42
  end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csvtool/domain/csv_parity_session/source_pair"
4
+ require "csvtool/domain/csv_parity_session/parity_options"
5
+ require "csvtool/domain/csv_parity_session/parity_session"
6
+
7
module Csvtool
  module Interface
    module CLI
      module Workflows
        module Builders
          # Builds a ParitySession from the raw values collected by the workflow.
          class CsvParitySessionBuilder
            # @param left_path [String] left CSV path
            # @param right_path [String] right CSV path
            # @param col_sep [String] column separator
            # @param headers_present [Boolean] whether the files have header rows
            # @return [Domain::CsvParitySession::ParitySession]
            def call(left_path:, right_path:, col_sep:, headers_present:)
              pair = Domain::CsvParitySession::SourcePair.new(left_path: left_path, right_path: right_path)
              parity_options = Domain::CsvParitySession::ParityOptions.new(
                separator: col_sep,
                headers_present: headers_present
              )

              Domain::CsvParitySession::ParitySession.start(source_pair: pair, options: parity_options)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module Csvtool
  module Interface
    module CLI
      module Workflows
        module Presenters
          # Renders a parity result hash to the given output stream.
          class CsvParityPresenter
            # @param stdout [#puts] stream that receives the report lines
            def initialize(stdout:)
              @stdout = stdout
            end

            # Prints the verdict line and the summary line; on a mismatch it
            # also prints the example rows for each side.
            #
            # @param data [Hash] read keys: :match, :left_rows, :right_rows,
            #   :left_only_count, :right_only_count, :left_only_examples,
            #   :right_only_examples
            def print_summary(data)
              verdict = data[:match] ? "MATCH" : "MISMATCH"
              row_counts = "left_rows=#{data[:left_rows]} right_rows=#{data[:right_rows]}"
              only_counts = "left_only=#{data[:left_only_count]} right_only=#{data[:right_only_count]}"

              @stdout.puts(verdict)
              @stdout.puts "Summary: #{row_counts} #{only_counts}"

              unless data[:match]
                print_examples("Left-only examples", data[:left_only_examples])
                print_examples("Right-only examples", data[:right_only_examples])
              end
            end

            private

            # Prints a labelled list of example rows; prints nothing when the
            # list is nil or empty.
            def print_examples(label, examples)
              return if examples.nil?
              return if examples.empty?

              @stdout.puts "#{label}:"
              examples.each do |sample|
                row_text = sample[:row]
                surplus = sample[:count_delta]
                @stdout.puts " #{row_text} (count +#{surplus})"
              end
            end
          end
        end
      end
    end
  end
end