csvops 0.5.0.alpha → 0.7.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/README.md +88 -7
  3. data/docs/architecture.md +119 -5
  4. data/docs/release-v0.6.0-alpha.md +84 -0
  5. data/docs/release-v0.7.0-alpha.md +87 -0
  6. data/lib/csvtool/application/use_cases/run_csv_parity.rb +70 -0
  7. data/lib/csvtool/application/use_cases/run_csv_split.rb +97 -0
  8. data/lib/csvtool/cli.rb +9 -1
  9. data/lib/csvtool/domain/csv_parity_session/parity_options.rb +22 -0
  10. data/lib/csvtool/domain/csv_parity_session/parity_session.rb +20 -0
  11. data/lib/csvtool/domain/csv_parity_session/source_pair.rb +19 -0
  12. data/lib/csvtool/domain/csv_split_session/split_options.rb +27 -0
  13. data/lib/csvtool/domain/csv_split_session/split_session.rb +20 -0
  14. data/lib/csvtool/domain/csv_split_session/split_source.rb +17 -0
  15. data/lib/csvtool/infrastructure/csv/csv_parity_comparator.rb +71 -0
  16. data/lib/csvtool/infrastructure/csv/csv_splitter.rb +64 -0
  17. data/lib/csvtool/infrastructure/output/csv_split_manifest_writer.rb +20 -0
  18. data/lib/csvtool/interface/cli/errors/presenter.rb +12 -0
  19. data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
  20. data/lib/csvtool/interface/cli/prompts/chunk_size_prompt.rb +21 -0
  21. data/lib/csvtool/interface/cli/prompts/split_manifest_prompt.rb +30 -0
  22. data/lib/csvtool/interface/cli/prompts/split_output_prompt.rb +38 -0
  23. data/lib/csvtool/interface/cli/workflows/builders/csv_parity_session_builder.rb +33 -0
  24. data/lib/csvtool/interface/cli/workflows/builders/csv_split_session_builder.rb +44 -0
  25. data/lib/csvtool/interface/cli/workflows/presenters/csv_parity_presenter.rb +38 -0
  26. data/lib/csvtool/interface/cli/workflows/presenters/csv_split_presenter.rb +26 -0
  27. data/lib/csvtool/interface/cli/workflows/run_csv_parity_workflow.rb +66 -0
  28. data/lib/csvtool/interface/cli/workflows/run_csv_split_workflow.rb +89 -0
  29. data/lib/csvtool/interface/cli/workflows/steps/csv_split/build_session_step.rb +30 -0
  30. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step.rb +43 -0
  31. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step.rb +30 -0
  32. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step.rb +31 -0
  33. data/lib/csvtool/interface/cli/workflows/steps/csv_split/execute_step.rb +36 -0
  34. data/lib/csvtool/interface/cli/workflows/steps/parity/build_session_step.rb +25 -0
  35. data/lib/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step.rb +32 -0
  36. data/lib/csvtool/interface/cli/workflows/steps/parity/execute_step.rb +26 -0
  37. data/lib/csvtool/version.rb +1 -1
  38. data/test/csvtool/application/use_cases/run_csv_parity_test.rb +160 -0
  39. data/test/csvtool/application/use_cases/run_csv_split_test.rb +124 -0
  40. data/test/csvtool/cli_test.rb +222 -21
  41. data/test/csvtool/cli_unit_test.rb +4 -4
  42. data/test/csvtool/domain/csv_parity_session/parity_options_test.rb +17 -0
  43. data/test/csvtool/domain/csv_parity_session/parity_session_test.rb +18 -0
  44. data/test/csvtool/domain/csv_parity_session/source_pair_test.rb +11 -0
  45. data/test/csvtool/infrastructure/csv/csv_parity_comparator_test.rb +78 -0
  46. data/test/csvtool/infrastructure/csv/csv_splitter_test.rb +68 -0
  47. data/test/csvtool/infrastructure/output/csv_split_manifest_writer_test.rb +25 -0
  48. data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
  49. data/test/csvtool/interface/cli/menu_loop_test.rb +87 -93
  50. data/test/csvtool/interface/cli/prompts/chunk_size_prompt_test.rb +17 -0
  51. data/test/csvtool/interface/cli/prompts/split_manifest_prompt_test.rb +42 -0
  52. data/test/csvtool/interface/cli/prompts/split_output_prompt_test.rb +22 -0
  53. data/test/csvtool/interface/cli/workflows/builders/csv_parity_session_builder_test.rb +20 -0
  54. data/test/csvtool/interface/cli/workflows/builders/csv_split_session_builder_test.rb +30 -0
  55. data/test/csvtool/interface/cli/workflows/presenters/csv_parity_presenter_test.rb +43 -0
  56. data/test/csvtool/interface/cli/workflows/presenters/csv_split_presenter_test.rb +26 -0
  57. data/test/csvtool/interface/cli/workflows/run_csv_parity_workflow_test.rb +94 -0
  58. data/test/csvtool/interface/cli/workflows/run_csv_split_workflow_test.rb +200 -0
  59. data/test/csvtool/interface/cli/workflows/steps/csv_split/build_session_step_test.rb +40 -0
  60. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step_test.rb +64 -0
  61. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step_test.rb +30 -0
  62. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step_test.rb +32 -0
  63. data/test/csvtool/interface/cli/workflows/steps/csv_split/execute_step_test.rb +83 -0
  64. data/test/csvtool/interface/cli/workflows/steps/parity/build_session_step_test.rb +41 -0
  65. data/test/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step_test.rb +30 -0
  66. data/test/csvtool/interface/cli/workflows/steps/parity/execute_step_test.rb +40 -0
  67. data/test/fixtures/parity_duplicates_left.csv +4 -0
  68. data/test/fixtures/parity_duplicates_right.csv +3 -0
  69. data/test/fixtures/parity_people_header_mismatch.csv +4 -0
  70. data/test/fixtures/parity_people_many_reordered.csv +13 -0
  71. data/test/fixtures/parity_people_mismatch.csv +4 -0
  72. data/test/fixtures/parity_people_reordered.csv +4 -0
  73. data/test/fixtures/parity_people_reordered.tsv +4 -0
  74. data/test/fixtures/split_people_25.csv +26 -0
  75. metadata +64 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b96fb7e03fa0629d3412a97d3abff5414492ac46ad08ede2c872e2176fcbfc62
4
- data.tar.gz: 856b7735a472b5810d5f19dff6371a565a7fcc538ce5b6eba52260fff0028760
3
+ metadata.gz: 803fa825ef1f50edcd7c0bc032a86926d356cb3ba6d943c460d59759a953fdcd
4
+ data.tar.gz: 2ba2afc9951aa96e777cbf3ea81dc77a41c88d2546505c885302607432461633
5
5
  SHA512:
6
- metadata.gz: 5f643d331c6b54cb5feb0fe5db4ff7f8f7bc5c28461f74e3bfca5cf93d25703b84f497e72377302874b2b6302ef0fb542995c72d2d21798e3a998f6d5b294704
7
- data.tar.gz: 0e254fa75780ce0605054c24b28301d8786535a0f2bbff7adfb45a75f09e60e5315e950648208fa5772d08cdd6abce95ea382838f568947af05ceaa77ba1888f
6
+ metadata.gz: 4f82dd7e9d3ac5ff53f8aaf40a0e5500e9b074aa052a031f6de4f5a2cc1ab711a5c375d5c203bdfaae802d36a02ecf14c4f73231a9f14e31d2f042ffeecd9a08
7
+ data.tar.gz: f9428d2ef29d257c99b484c7277dcff566dd5cf09ec06b78b4514c410b7858ffd6854f8aafd39a727c9c3d1e44e6940bc15456f3b11fdcac4a5b879bee9cc826
data/README.md CHANGED
@@ -37,11 +37,13 @@ CSV Tool Menu
37
37
  2. Extract rows (range)
38
38
  3. Randomize rows
39
39
  4. Dedupe using another CSV
40
- 5. Exit
40
+ 5. Validate parity
41
+ 6. Split CSV into chunks
42
+ 7. Exit
41
43
  >
42
44
  ```
43
45
 
44
- Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, or `4` for cross-CSV dedupe.
46
+ Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, `5` for parity validation, or `6` for CSV splitting.
45
47
 
46
48
  ### 3. Follow prompts
47
49
 
@@ -59,6 +61,8 @@ Prompt flow by action:
59
61
  - `Extract rows (range)`: file path, separator, start row, end row, output destination.
60
62
  - `Randomize rows`: file path, separator, headers present, optional seed, output destination.
61
63
  - `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
64
+ - `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
65
+ - `Split CSV into chunks`: source file, separator, header mode, chunk size, output directory/prefix, overwrite policy, optional manifest.
62
66
 
63
67
  ### 4. Example interaction (console output)
64
68
 
@@ -127,9 +131,11 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
127
131
  CSV Tool Menu
128
132
  1. Extract column
129
133
  2. Extract rows (range)
130
- 3. Randomize rows
131
- 4. Dedupe using another CSV
132
- 5. Exit
134
+ 3. Randomize rows
135
+ 4. Dedupe using another CSV
136
+ 5. Validate parity
137
+ 6. Split CSV into chunks
138
+ 7. Exit
133
139
  +> 4
134
140
  CSV file path: /tmp/source.csv
135
141
  Source CSV separator:
@@ -166,6 +172,81 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
166
172
  -Summary: source_rows=5 removed_rows=3 kept_rows=2
167
173
  ```
168
174
 
175
+ ### 8. Parity interaction example
176
+
177
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
178
+
179
+ ```diff
180
+ CSV Tool Menu
181
+ 1. Extract column
182
+ 2. Extract rows (range)
183
+ 3. Randomize rows
184
+ 4. Dedupe using another CSV
185
+ 5. Validate parity
186
+ 6. Split CSV into chunks
187
+ 7. Exit
188
+ +> 5
189
+ Left CSV file path: /tmp/left.csv
190
+ Right CSV file path: /tmp/right.csv
191
+ Choose separator:
192
+ 1. comma (,)
193
+ 2. tab (\t)
194
+ 3. semicolon (;)
195
+ 4. pipe (|)
196
+ 5. custom
197
+ +Separator choice [1]: 1
198
+ Headers present? [Y/n]:
199
+ -MISMATCH
200
+ -Summary: left_rows=10 right_rows=10 left_only=2 right_only=2
201
+ -Left-only examples:
202
+ - 4,Dina (count +1)
203
+ -Right-only examples:
204
+ - 4,Dina-Updated (count +1)
205
+ ```
206
+
207
+ ### 9. Parity large-file behavior
208
+
209
+ - Parity uses a streaming count-delta strategy:
210
+ - Stream left rows and increment row-key counts.
211
+ - Stream right rows and decrement row-key counts.
212
+ - Exact duplicate semantics are preserved by count deltas per normalized row value.
213
+ - Memory scales with the number of distinct row keys in the parity map, not the total input row count.
214
+
215
+ ### 10. Split interaction example
216
+
217
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
218
+
219
+ ```diff
220
+ CSV Tool Menu
221
+ 1. Extract column
222
+ 2. Extract rows (range)
223
+ 3. Randomize rows
224
+ 4. Dedupe using another CSV
225
+ 5. Validate parity
226
+ 6. Split CSV into chunks
227
+ 7. Exit
228
+ +> 6
229
+ Source CSV file path: /tmp/people.csv
230
+ Choose separator:
231
+ 1. comma (,)
232
+ 2. tab (\t)
233
+ 3. semicolon (;)
234
+ 4. pipe (|)
235
+ 5. custom
236
+ +Separator choice [1]: 1
237
+ Headers present? [Y/n]:
238
+ +Rows per chunk: 1000
239
+ Output directory [/tmp]:
240
+ Output file prefix [people]:
241
+ Overwrite existing chunk files? [y/N]:
242
+ Write manifest file? [y/N]:
243
+ -Split complete.
244
+ -Chunk size: 1000
245
+ -Data rows: 25000
246
+ -Chunks written: 25
247
+ -/tmp/people_part_001.csv
248
+ ```
249
+
169
250
  ## Testing
170
251
 
171
252
  Run tests:
@@ -182,7 +263,7 @@ bundle exec rake test
182
263
 
183
264
  ## Alpha release
184
265
 
185
- Current prerelease version: `0.5.0.alpha`
266
+ Current prerelease version: `0.7.0.alpha`
186
267
 
187
268
  Install prerelease from RubyGems:
188
269
 
@@ -192,7 +273,7 @@ gem install csvops --pre
192
273
 
193
274
  Release runbook:
194
275
 
195
- - `docs/release-v0.5.0-alpha.md`
276
+ - `docs/release-v0.7.0-alpha.md`
196
277
 
197
278
 
198
279
  ## Architecture
data/docs/architecture.md CHANGED
@@ -2,15 +2,15 @@
2
2
 
3
3
  The codebase follows a DDD-lite layered structure:
4
4
 
5
- - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
6
- - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`).
7
- - `infrastructure/`: CSV reading/streaming and output adapters (console/file), plus cross-CSV dedupe adapter.
5
+ - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, `CrossCsvDedupeSession`, and `CsvSplitSession` aggregates + supporting entities/value objects).
6
+ - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`, `RunCsvSplit`).
7
+ - `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
8
8
  - `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
9
9
  - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
10
10
 
11
11
  ## Workflow boundary (standardized)
12
12
 
13
- For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`), the boundary is:
13
+ For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`), the boundary is:
14
14
 
15
15
  - `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
16
16
  - `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
@@ -32,6 +32,8 @@ Current usage:
32
32
  - `RunRowExtractionWorkflow` uses `WorkflowStepPipeline` + `Steps::RowExtraction::*`.
33
33
  - `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
34
34
  - `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
35
+ - `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
36
+ - `RunCsvSplitWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvSplit::*`.
35
37
 
36
38
  ## Adding New Concepts
37
39
 
@@ -107,7 +109,7 @@ For a new function type, prefer one of these patterns:
107
109
 
108
110
  ## Domain model
109
111
 
110
- Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, and `Cross-CSV Dedupe`.
112
+ Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, and `CSV Split`.
111
113
 
112
114
  ### Cross-CSV Dedupe (Large-file behavior)
113
115
 
@@ -366,6 +368,114 @@ classDiagram
366
368
  RunCrossCsvDedupe --> CsvCrossCsvDedupeFileWriter
367
369
  ```
368
370
 
371
+ ### CSV Parity
372
+
373
+ Core DDD structure:
374
+
375
+ - Aggregate root: `ParitySession`
376
+ - Captures one parity check request.
377
+ - Holds left/right source paths and parity options.
378
+ - Entities:
379
+ - `SourcePair` (left and right file paths)
380
+ - Value objects:
381
+ - `ParityOptions` (separator + header mode)
382
+ - Application service:
383
+ - `Application::UseCases::RunCsvParity` orchestrates parity validation and returns request/result style payloads.
384
+ - Infrastructure adapters:
385
+ - `Infrastructure::CSV::HeaderReader`
386
+ - `Infrastructure::CSV::CsvParityComparator` (streaming count-delta strategy with duplicate-aware semantics)
387
+ - Interface adapters:
388
+ - `Interface::CLI::MenuLoop`
389
+ - `Interface::CLI::Workflows::RunCsvParityWorkflow`
390
+ - `Interface::CLI::Workflows::Builders::CsvParitySessionBuilder`
391
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
392
+ - `Interface::CLI::Workflows::Steps::Parity::*`
393
+ - `Interface::CLI::Workflows::Presenters::CsvParityPresenter`
394
+ - `Interface::CLI::Workflows::Support::ResultErrorHandler`
395
+ - `Interface::CLI::Prompts::*`
396
+ - `Interface::CLI::Errors::Presenter`
397
+
398
+ ```mermaid
399
+ classDiagram
400
+ direction LR
401
+ class MenuLoop
402
+ class RunCsvParityWorkflow
403
+ class Prompts
404
+ class Errors
405
+ class RunCsvParity
406
+ class ParitySession
407
+ class SourcePair
408
+ class ParityOptions
409
+ class HeaderReader
410
+ class CsvParityComparator
411
+ class CsvParityPresenter
412
+
413
+ MenuLoop --> RunCsvParityWorkflow : invokes
414
+ RunCsvParityWorkflow --> Prompts : uses
415
+ RunCsvParityWorkflow --> Errors : reports failures
416
+ RunCsvParityWorkflow --> CsvParityPresenter : renders
417
+ RunCsvParityWorkflow --> RunCsvParity : calls
418
+ RunCsvParity --> ParitySession : orchestrates
419
+ ParitySession o-- SourcePair
420
+ ParitySession o-- ParityOptions
421
+ RunCsvParity --> HeaderReader
422
+ RunCsvParity --> CsvParityComparator
423
+ ```
424
+
425
+ ### CSV Split
426
+
427
+ Core DDD structure:
428
+
429
+ - Aggregate root: `SplitSession`
430
+ - Captures one CSV split request.
431
+ - Holds split source and split options.
432
+ - Entities:
433
+ - `SplitSource` (path + separator + header mode)
434
+ - Value objects:
435
+ - `SplitOptions` (chunk size, output directory, file prefix, overwrite policy, optional manifest configuration)
436
+ - Application service:
437
+ - `Application::UseCases::RunCsvSplit` orchestrates split execution and returns request/result style payloads.
438
+ - Infrastructure adapters:
439
+ - `Infrastructure::CSV::CsvSplitter` (streaming row-by-row chunk writer)
440
+ - `Infrastructure::Output::CsvSplitManifestWriter` (optional manifest output)
441
+ - Interface adapters:
442
+ - `Interface::CLI::MenuLoop`
443
+ - `Interface::CLI::Workflows::RunCsvSplitWorkflow`
444
+ - `Interface::CLI::Workflows::Builders::CsvSplitSessionBuilder`
445
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
446
+ - `Interface::CLI::Workflows::Steps::CsvSplit::*`
447
+ - `Interface::CLI::Workflows::Presenters::CsvSplitPresenter`
448
+ - `Interface::CLI::Workflows::Support::ResultErrorHandler`
449
+ - `Interface::CLI::Prompts::*`
450
+ - `Interface::CLI::Errors::Presenter`
451
+
452
+ ```mermaid
453
+ classDiagram
454
+ direction LR
455
+ class MenuLoop
456
+ class RunCsvSplitWorkflow
457
+ class Prompts
458
+ class Errors
459
+ class RunCsvSplit
460
+ class SplitSession
461
+ class SplitSource
462
+ class SplitOptions
463
+ class CsvSplitter
464
+ class CsvSplitManifestWriter
465
+ class CsvSplitPresenter
466
+
467
+ MenuLoop --> RunCsvSplitWorkflow : invokes
468
+ RunCsvSplitWorkflow --> Prompts : uses
469
+ RunCsvSplitWorkflow --> Errors : reports failures
470
+ RunCsvSplitWorkflow --> CsvSplitPresenter : renders
471
+ RunCsvSplitWorkflow --> RunCsvSplit : calls
472
+ RunCsvSplit --> SplitSession : orchestrates
473
+ SplitSession o-- SplitSource
474
+ SplitSession o-- SplitOptions
475
+ RunCsvSplit --> CsvSplitter
476
+ RunCsvSplit --> CsvSplitManifestWriter
477
+ ```
478
+
369
479
  ## Project layout
370
480
 
371
481
  ```text
@@ -375,11 +485,15 @@ lib/csvtool/domain/column_session/*
375
485
  lib/csvtool/domain/row_session/*
376
486
  lib/csvtool/domain/row_randomization_session/*
377
487
  lib/csvtool/domain/cross_csv_dedupe_session/*
488
+ lib/csvtool/domain/csv_parity_session/*
489
+ lib/csvtool/domain/csv_split_session/*
378
490
  lib/csvtool/domain/shared/output_destination.rb
379
491
  lib/csvtool/application/use_cases/run_extraction.rb
380
492
  lib/csvtool/application/use_cases/run_row_extraction.rb
381
493
  lib/csvtool/application/use_cases/run_row_randomization.rb
382
494
  lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
495
+ lib/csvtool/application/use_cases/run_csv_parity.rb
496
+ lib/csvtool/application/use_cases/run_csv_split.rb
383
497
  lib/csvtool/infrastructure/csv/*
384
498
  lib/csvtool/infrastructure/output/*
385
499
  lib/csvtool/interface/cli/menu_loop.rb
data/docs/release-v0.6.0-alpha.md ADDED
@@ -0,0 +1,84 @@
1
+ # Release Checklist: v0.6.0-alpha
2
+
3
+ ## 1. Verify environment
4
+
5
+ ```bash
6
+ ruby -v
7
+ bundle -v
8
+ ```
9
+
10
+ Expected:
11
+ - Ruby `3.3.x`
12
+
13
+ ## 2. Install dependencies
14
+
15
+ ```bash
16
+ bundle install
17
+ ```
18
+
19
+ ## 3. Run quality checks
20
+
21
+ ```bash
22
+ bundle exec rake test
23
+ ```
24
+
25
+ ## 4. Smoke test CLI commands
26
+
27
+ ```bash
28
+ bundle exec csvtool menu
29
+ bundle exec csvtool column test/fixtures/sample_people.csv name
30
+ ```
31
+
32
+ ## 5. Smoke test workflows
33
+
34
+ ### CSV parity workflow
35
+
36
+ Use menu option `5` (`Validate parity`) and verify:
37
+ - matching files with reordered rows return parity success
38
+ - mismatch files return friendly mismatch summary with sample deltas
39
+ - separator and header-mode selections are respected
40
+
41
+ ### Existing workflows regression pass
42
+
43
+ Run quick checks for menu options `1-4` and confirm:
44
+ - column extraction still works
45
+ - row-range extraction still works
46
+ - row randomization still works
47
+ - cross-CSV dedupe still works
48
+
49
+ ## 6. Build and validate gem package
50
+
51
+ ```bash
52
+ gem build csvops.gemspec
53
+ gem install ./csvops-0.6.0.alpha.gem
54
+ csvtool menu
55
+ ```
56
+
57
+ ## 7. Commit release prep
58
+
59
+ ```bash
60
+ git add -A
61
+ git commit -m "chore(release): prepare v0.6.0-alpha"
62
+ ```
63
+
64
+ ## 8. Tag release
65
+
66
+ ```bash
67
+ git tag -a v0.6.0-alpha -m "v0.6.0-alpha"
68
+ git push origin main --tags
69
+ ```
70
+
71
+ ## 9. Publish gem
72
+
73
+ ```bash
74
+ gem push csvops-0.6.0.alpha.gem
75
+ ```
76
+
77
+ ## 10. Create GitHub release
78
+
79
+ Create release `v0.6.0-alpha` with:
80
+ - Dedicated CSV parity validation workflow
81
+ - Header/separator parity options
82
+ - Friendly parity mismatch reporting
83
+ - Streaming delta-count parity comparator
84
+ - Parity architecture convergence (session model, workflow steps, presenter, docs)
data/docs/release-v0.7.0-alpha.md ADDED
@@ -0,0 +1,87 @@
1
+ # Release Checklist: v0.7.0-alpha
2
+
3
+ ## 1. Verify environment
4
+
5
+ ```bash
6
+ ruby -v
7
+ bundle -v
8
+ ```
9
+
10
+ Expected:
11
+ - Ruby `3.3.x`
12
+
13
+ ## 2. Install dependencies
14
+
15
+ ```bash
16
+ bundle install
17
+ ```
18
+
19
+ ## 3. Run quality checks
20
+
21
+ ```bash
22
+ bundle exec rake test
23
+ ```
24
+
25
+ ## 4. Smoke test CLI commands
26
+
27
+ ```bash
28
+ bundle exec csvtool menu
29
+ bundle exec csvtool column test/fixtures/sample_people.csv name
30
+ ```
31
+
32
+ ## 5. Smoke test workflows
33
+
34
+ ### CSV split workflow (new in this release)
35
+
36
+ Use menu option `6` (`Split CSV into chunks`) and verify:
37
+ - happy path split (`N=10`) writes expected chunk files and counts
38
+ - separator and header mode options work (CSV/TSV/headerless/custom)
39
+ - output directory + file prefix options produce expected paths
40
+ - overwrite protection blocks existing chunk paths unless allowed
41
+ - optional manifest output writes valid CSV metadata
42
+
43
+ ### Existing workflows regression pass
44
+
45
+ Use menu options `1-5` and verify:
46
+ - column extraction still works
47
+ - row-range extraction still works
48
+ - row randomization still works
49
+ - cross-CSV dedupe still works
50
+ - parity validation still works
51
+
52
+ ## 6. Build and validate gem package
53
+
54
+ ```bash
55
+ gem build csvops.gemspec
56
+ gem install ./csvops-0.7.0.alpha.gem
57
+ csvtool menu
58
+ ```
59
+
60
+ ## 7. Commit release prep
61
+
62
+ ```bash
63
+ git add -A
64
+ git commit -m "chore(release): prepare v0.7.0-alpha"
65
+ ```
66
+
67
+ ## 8. Tag release
68
+
69
+ ```bash
70
+ git tag -a v0.7.0-alpha -m "v0.7.0-alpha"
71
+ git push origin main --tags
72
+ ```
73
+
74
+ ## 9. Publish gem
75
+
76
+ ```bash
77
+ gem push csvops-0.7.0.alpha.gem
78
+ ```
79
+
80
+ ## 10. Create GitHub release
81
+
82
+ Create release `v0.7.0-alpha` with:
83
+ - New `Split CSV into chunks` workflow
84
+ - Split-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
85
+ - Output strategy improvements (directory/prefix/overwrite controls)
86
+ - Optional split manifest output
87
+ - Large-file streaming split coverage and docs updates
data/lib/csvtool/application/use_cases/run_csv_parity.rb ADDED
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/infrastructure/csv/csv_parity_comparator"
5
+ require "csvtool/infrastructure/csv/header_reader"
6
+
7
+ module Csvtool
8
+ module Application
9
+ module UseCases
10
+ class RunCsvParity
11
+ Result = Struct.new(:ok, :error, :data, keyword_init: true) do
12
+ def ok?
13
+ ok
14
+ end
15
+ end
16
+
17
+ def initialize(
18
+ comparator: Infrastructure::CSV::CsvParityComparator.new,
19
+ header_reader: Infrastructure::CSV::HeaderReader.new
20
+ )
21
+ @comparator = comparator
22
+ @header_reader = header_reader
23
+ end
24
+
25
+ def call(session:)
26
+ left_path = session.source_pair.left_path
27
+ right_path = session.source_pair.right_path
28
+ col_sep = session.options.separator
29
+ headers_present = session.options.headers_present?
30
+
31
+ return failure(:file_not_found, path: left_path) unless File.file?(left_path)
32
+ return failure(:file_not_found, path: right_path) unless File.file?(right_path)
33
+
34
+ if headers_present
35
+ left_headers = @header_reader.call(file_path: left_path, col_sep: col_sep)
36
+ return failure(:no_headers, path: left_path) if left_headers.empty?
37
+
38
+ right_headers = @header_reader.call(file_path: right_path, col_sep: col_sep)
39
+ return failure(:no_headers, path: right_path) if right_headers.empty?
40
+
41
+ return failure(:header_mismatch, left_headers: left_headers, right_headers: right_headers) unless left_headers == right_headers
42
+ end
43
+
44
+ stats = @comparator.call(
45
+ left_path: left_path,
46
+ right_path: right_path,
47
+ col_sep: col_sep,
48
+ headers_present: headers_present
49
+ )
50
+
51
+ success(stats)
52
+ rescue CSV::MalformedCSVError
53
+ failure(:could_not_parse_csv)
54
+ rescue Errno::EACCES => e
55
+ failure(:cannot_read_file, path: e.respond_to?(:path) ? e.path : left_path)
56
+ end
57
+
58
+ private
59
+
60
+ def success(data)
61
+ Result.new(ok: true, error: nil, data: data)
62
+ end
63
+
64
+ def failure(code, data = {})
65
+ Result.new(ok: false, error: code, data: data)
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
data/lib/csvtool/application/use_cases/run_csv_split.rb ADDED
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "fileutils"
5
+ require "csvtool/infrastructure/csv/header_reader"
6
+ require "csvtool/infrastructure/csv/csv_splitter"
7
+ require "csvtool/infrastructure/output/csv_split_manifest_writer"
8
+
9
+ module Csvtool
10
+ module Application
11
+ module UseCases
12
+ class RunCsvSplit
13
+ Result = Struct.new(:ok, :error, :data, keyword_init: true) do
14
+ def ok?
15
+ ok
16
+ end
17
+ end
18
+
19
+ def initialize(
20
+ header_reader: Infrastructure::CSV::HeaderReader.new,
21
+ csv_splitter: Infrastructure::CSV::CsvSplitter.new,
22
+ csv_split_manifest_writer: Infrastructure::Output::CsvSplitManifestWriter.new
23
+ )
24
+ @header_reader = header_reader
25
+ @csv_splitter = csv_splitter
26
+ @csv_split_manifest_writer = csv_split_manifest_writer
27
+ end
28
+
29
+ def read_headers(file_path:, col_sep:, headers_present:)
30
+ return failure(:file_not_found, path: file_path) unless File.file?(file_path)
31
+ return success(headers: nil) unless headers_present
32
+
33
+ headers = @header_reader.call(file_path: file_path, col_sep: col_sep)
34
+ return failure(:no_headers) if headers.empty?
35
+
36
+ success(headers: headers)
37
+ rescue CSV::MalformedCSVError
38
+ failure(:could_not_parse_csv)
39
+ rescue Errno::EACCES
40
+ failure(:cannot_read_file, path: file_path)
41
+ end
42
+
43
+ def call(session:)
44
+ source = session.source
45
+ output_directory = session.options.output_directory || File.dirname(source.path)
46
+ file_prefix = session.options.file_prefix || File.basename(source.path, ".*")
47
+ FileUtils.mkdir_p(output_directory)
48
+
49
+ stats = @csv_splitter.call(
50
+ file_path: source.path,
51
+ col_sep: source.separator,
52
+ headers_present: source.headers_present,
53
+ chunk_size: session.options.chunk_size,
54
+ output_directory: output_directory,
55
+ file_prefix: file_prefix,
56
+ overwrite_existing: session.options.overwrite_existing
57
+ )
58
+ manifest_path = maybe_write_manifest(
59
+ session: session,
60
+ output_directory: output_directory,
61
+ file_prefix: file_prefix,
62
+ stats: stats
63
+ )
64
+ success(stats.merge(output_directory: output_directory, file_prefix: file_prefix, manifest_path: manifest_path))
65
+ rescue Infrastructure::CSV::CsvSplitter::OutputFileExistsError => e
66
+ failure(:output_file_exists, path: e.path)
67
+ rescue CSV::MalformedCSVError
68
+ failure(:could_not_parse_csv)
69
+ rescue Errno::EACCES, Errno::ENOENT => e
70
+ failure(:cannot_write_output_file, path: output_directory, error_class: e.class)
71
+ end
72
+
73
+ private
74
+
75
+ def success(data)
76
+ Result.new(ok: true, error: nil, data: data)
77
+ end
78
+
79
+ def failure(code, data = {})
80
+ Result.new(ok: false, error: code, data: data)
81
+ end
82
+
83
+ def maybe_write_manifest(session:, output_directory:, file_prefix:, stats:)
84
+ return nil unless session.options.write_manifest
85
+
86
+ manifest_path = session.options.manifest_path || File.join(output_directory, "#{file_prefix}_manifest.csv")
87
+ @csv_split_manifest_writer.call(
88
+ path: manifest_path,
89
+ chunk_paths: stats[:chunk_paths],
90
+ chunk_row_counts: stats[:chunk_row_counts]
91
+ )
92
+ manifest_path
93
+ end
94
+ end
95
+ end
96
+ end
97
+ end
data/lib/csvtool/cli.rb CHANGED
@@ -6,6 +6,8 @@ require "csvtool/interface/cli/workflows/run_extraction_workflow"
6
6
  require "csvtool/interface/cli/workflows/run_row_extraction_workflow"
7
7
  require "csvtool/interface/cli/workflows/run_row_randomization_workflow"
8
8
  require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
9
+ require "csvtool/interface/cli/workflows/run_csv_parity_workflow"
10
+ require "csvtool/interface/cli/workflows/run_csv_split_workflow"
9
11
  require "csvtool/interface/cli/errors/presenter"
10
12
  require "csvtool/infrastructure/csv/header_reader"
11
13
  require "csvtool/infrastructure/csv/value_streamer"
@@ -18,6 +20,8 @@ module Csvtool
18
20
  "Extract rows (range)",
19
21
  "Randomize rows",
20
22
  "Dedupe using another CSV",
23
+ "Validate parity",
24
+ "Split CSV into chunks",
21
25
  "Exit"
22
26
  ].freeze
23
27
 
@@ -51,6 +55,8 @@ module Csvtool
51
55
  extract_rows_action = -> { Interface::CLI::Workflows::RunRowExtractionWorkflow.new(stdin: @stdin, stdout: @stdout).call }
52
56
  randomize_rows_action = -> { Interface::CLI::Workflows::RunRowRandomizationWorkflow.new(stdin: @stdin, stdout: @stdout).call }
53
57
  dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
58
+ parity_action = -> { Interface::CLI::Workflows::RunCsvParityWorkflow.new(stdin: @stdin, stdout: @stdout).call }
59
+ split_action = -> { Interface::CLI::Workflows::RunCsvSplitWorkflow.new(stdin: @stdin, stdout: @stdout).call }
54
60
  Interface::CLI::MenuLoop.new(
55
61
  stdin: @stdin,
56
62
  stdout: @stdout,
@@ -58,7 +64,9 @@ module Csvtool
58
64
  extract_column_action: extract_column_action,
59
65
  extract_rows_action: extract_rows_action,
60
66
  randomize_rows_action: randomize_rows_action,
61
- dedupe_action: dedupe_action
67
+ dedupe_action: dedupe_action,
68
+ parity_action: parity_action,
69
+ split_action: split_action
62
70
  ).run
63
71
  end
64
72