csvops 0.6.0.alpha → 0.8.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +103 -24
  3. data/docs/architecture.md +121 -4
  4. data/docs/release-v0.7.0-alpha.md +87 -0
  5. data/docs/release-v0.8.0-alpha.md +88 -0
  6. data/lib/csvtool/application/use_cases/run_csv_split.rb +97 -0
  7. data/lib/csvtool/application/use_cases/run_csv_stats.rb +64 -0
  8. data/lib/csvtool/cli.rb +9 -1
  9. data/lib/csvtool/domain/csv_split_session/split_options.rb +27 -0
  10. data/lib/csvtool/domain/csv_split_session/split_session.rb +20 -0
  11. data/lib/csvtool/domain/csv_split_session/split_source.rb +17 -0
  12. data/lib/csvtool/domain/csv_stats_session/stats_options.rb +11 -0
  13. data/lib/csvtool/domain/csv_stats_session/stats_session.rb +25 -0
  14. data/lib/csvtool/domain/csv_stats_session/stats_source.rb +17 -0
  15. data/lib/csvtool/infrastructure/csv/csv_splitter.rb +64 -0
  16. data/lib/csvtool/infrastructure/csv/csv_stats_scanner.rb +67 -0
  17. data/lib/csvtool/infrastructure/output/csv_split_manifest_writer.rb +20 -0
  18. data/lib/csvtool/infrastructure/output/csv_stats_file_writer.rb +26 -0
  19. data/lib/csvtool/interface/cli/errors/presenter.rb +8 -0
  20. data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
  21. data/lib/csvtool/interface/cli/prompts/chunk_size_prompt.rb +21 -0
  22. data/lib/csvtool/interface/cli/prompts/split_manifest_prompt.rb +30 -0
  23. data/lib/csvtool/interface/cli/prompts/split_output_prompt.rb +38 -0
  24. data/lib/csvtool/interface/cli/workflows/builders/csv_split_session_builder.rb +44 -0
  25. data/lib/csvtool/interface/cli/workflows/builders/csv_stats_session_builder.rb +28 -0
  26. data/lib/csvtool/interface/cli/workflows/presenters/csv_split_presenter.rb +26 -0
  27. data/lib/csvtool/interface/cli/workflows/presenters/csv_stats_presenter.rb +34 -0
  28. data/lib/csvtool/interface/cli/workflows/run_csv_split_workflow.rb +89 -0
  29. data/lib/csvtool/interface/cli/workflows/run_csv_stats_workflow.rb +77 -0
  30. data/lib/csvtool/interface/cli/workflows/steps/csv_split/build_session_step.rb +30 -0
  31. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step.rb +43 -0
  32. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step.rb +30 -0
  33. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step.rb +31 -0
  34. data/lib/csvtool/interface/cli/workflows/steps/csv_split/execute_step.rb +36 -0
  35. data/lib/csvtool/interface/cli/workflows/steps/csv_stats/build_session_step.rb +25 -0
  36. data/lib/csvtool/interface/cli/workflows/steps/csv_stats/collect_destination_step.rb +27 -0
  37. data/lib/csvtool/interface/cli/workflows/steps/csv_stats/collect_inputs_step.rb +31 -0
  38. data/lib/csvtool/interface/cli/workflows/steps/csv_stats/execute_step.rb +27 -0
  39. data/lib/csvtool/version.rb +1 -1
  40. data/test/csvtool/application/use_cases/run_csv_split_test.rb +124 -0
  41. data/test/csvtool/application/use_cases/run_csv_stats_test.rb +165 -0
  42. data/test/csvtool/cli_test.rb +139 -29
  43. data/test/csvtool/infrastructure/csv/csv_splitter_test.rb +68 -0
  44. data/test/csvtool/infrastructure/csv/csv_stats_scanner_test.rb +68 -0
  45. data/test/csvtool/infrastructure/output/csv_split_manifest_writer_test.rb +25 -0
  46. data/test/csvtool/infrastructure/output/csv_stats_file_writer_test.rb +38 -0
  47. data/test/csvtool/interface/cli/menu_loop_test.rb +104 -130
  48. data/test/csvtool/interface/cli/prompts/chunk_size_prompt_test.rb +17 -0
  49. data/test/csvtool/interface/cli/prompts/split_manifest_prompt_test.rb +42 -0
  50. data/test/csvtool/interface/cli/prompts/split_output_prompt_test.rb +22 -0
  51. data/test/csvtool/interface/cli/workflows/builders/csv_split_session_builder_test.rb +30 -0
  52. data/test/csvtool/interface/cli/workflows/builders/csv_stats_session_builder_test.rb +19 -0
  53. data/test/csvtool/interface/cli/workflows/presenters/csv_split_presenter_test.rb +26 -0
  54. data/test/csvtool/interface/cli/workflows/presenters/csv_stats_presenter_test.rb +37 -0
  55. data/test/csvtool/interface/cli/workflows/run_csv_split_workflow_test.rb +200 -0
  56. data/test/csvtool/interface/cli/workflows/run_csv_stats_workflow_test.rb +146 -0
  57. data/test/csvtool/interface/cli/workflows/steps/csv_split/build_session_step_test.rb +40 -0
  58. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step_test.rb +64 -0
  59. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step_test.rb +30 -0
  60. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step_test.rb +32 -0
  61. data/test/csvtool/interface/cli/workflows/steps/csv_split/execute_step_test.rb +83 -0
  62. data/test/csvtool/interface/cli/workflows/steps/csv_stats/build_session_step_test.rb +36 -0
  63. data/test/csvtool/interface/cli/workflows/steps/csv_stats/collect_destination_step_test.rb +49 -0
  64. data/test/csvtool/interface/cli/workflows/steps/csv_stats/collect_inputs_step_test.rb +61 -0
  65. data/test/csvtool/interface/cli/workflows/steps/csv_stats/execute_step_test.rb +65 -0
  66. data/test/fixtures/split_people_25.csv +26 -0
  67. metadata +58 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f7db22cb84c1d08c58b473368f9ad37575a217d6293539309277ed2b032a2852
4
- data.tar.gz: 124bebc822fefa5d1f71286701959876260c82164067c36ff94b712a0b4cc1b3
3
+ metadata.gz: 7c09a1df68b5bbb8885b254bd7ea1260617495fc042cae43ef7251d9eb66836e
4
+ data.tar.gz: a5fbf5098df8007e844c83134a5474f92eafe4866b4d9910519d9a1517675af9
5
5
  SHA512:
6
- metadata.gz: a8b8dbcfb66073f46f0ecc625267081fbe730e69ef9295f5d2303af6b831a9d71ef564f78f5b44212eb33c4ad7a5fdb78b54fa98e21dd58669e9494a5d3325fb
7
- data.tar.gz: 05cbcaa2ca3116ad463413e53600d32a53df0941ceb8873ed22c2ef2d4cfe1afc8f90e44c7ff4400212ebbd5083a2ceb6a983281291436e9478d5087cc98b9ad
6
+ metadata.gz: 3695b9d49a638138d03d69122267a2889ecb6bd33605e9a256a20480ccab869c858f8f08996a213a885c2fb9a08e740712bb1dcc38fef6734d10129b20b3d611
7
+ data.tar.gz: df4cf19d31ac3c317ae69552cd3181c48984295268880810adf0d7b46bb606a2fa92646c35c4b1cd3343c04c5d08c734efb6b53b08f44fb3259b3dc3cbaf4509
data/README.md CHANGED
@@ -38,29 +38,26 @@ CSV Tool Menu
38
38
  3. Randomize rows
39
39
  4. Dedupe using another CSV
40
40
  5. Validate parity
41
- 6. Exit
41
+ 6. Split CSV into chunks
42
+ 7. CSV stats summary
43
+ 8. Exit
42
44
  >
43
45
  ```
44
46
 
45
- Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, or `5` for parity validation.
47
+ Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, `5` for parity validation, `6` for CSV splitting, or `7` for CSV stats.
46
48
 
47
49
  ### 3. Follow prompts
48
50
 
49
- Each menu action runs through a dedicated CLI workflow (`interface/cli/workflows/*`) that handles prompts/output and delegates execution to an interface-agnostic application use case.
51
+ Each action asks only for what it needs (file path, separator, and any action-specific options), then prints results to the console or writes to a file when selected.
50
52
 
51
- Workflow internals are split into small composable parts:
53
+ Typical prompt pattern:
52
54
 
53
- - `workflows/builders/*` for session construction
54
- - `workflows/support/*` for shared mapping/dispatch utilities
55
- - `workflows/presenters/*` for output formatting and summaries
55
+ - choose source file(s)
56
+ - choose separator/header options when relevant
57
+ - choose action-specific options
58
+ - choose output destination (console or file)
56
59
 
57
- Prompt flow by action:
58
-
59
- - `Extract column`: file path, separator, optional header filter + column select, skip blanks, preview/confirm, output destination.
60
- - `Extract rows (range)`: file path, separator, start row, end row, output destination.
61
- - `Randomize rows`: file path, separator, headers present, optional seed, output destination.
62
- - `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
63
- - `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
60
+ For architecture and internal design details, see [`docs/architecture.md`](docs/architecture.md).
64
61
 
65
62
  ### 4. Example interaction (console output)
66
63
 
@@ -129,10 +126,12 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
129
126
  CSV Tool Menu
130
127
  1. Extract column
131
128
  2. Extract rows (range)
132
- 3. Randomize rows
133
- 4. Dedupe using another CSV
134
- 5. Validate parity
135
- 6. Exit
129
+ 3. Randomize rows
130
+ 4. Dedupe using another CSV
131
+ 5. Validate parity
132
+ 6. Split CSV into chunks
133
+ 7. CSV stats summary
134
+ 8. Exit
136
135
  +> 4
137
136
  CSV file path: /tmp/source.csv
138
137
  Source CSV separator:
@@ -177,10 +176,12 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
177
176
  CSV Tool Menu
178
177
  1. Extract column
179
178
  2. Extract rows (range)
180
- 3. Randomize rows
181
- 4. Dedupe using another CSV
182
- 5. Validate parity
183
- 6. Exit
179
+ 3. Randomize rows
180
+ 4. Dedupe using another CSV
181
+ 5. Validate parity
182
+ 6. Split CSV into chunks
183
+ 7. CSV stats summary
184
+ 8. Exit
184
185
  +> 5
185
186
  Left CSV file path: /tmp/left.csv
186
187
  Right CSV file path: /tmp/right.csv
@@ -208,6 +209,84 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
208
209
  - Exact duplicate semantics are preserved by count deltas per normalized row value.
209
210
  - Memory scales with the number of distinct row keys in the parity map, not the total input row count.
210
211
 
212
+ ### 10. Split interaction example
213
+
214
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
215
+
216
+ ```diff
217
+ CSV Tool Menu
218
+ 1. Extract column
219
+ 2. Extract rows (range)
220
+ 3. Randomize rows
221
+ 4. Dedupe using another CSV
222
+ 5. Validate parity
223
+ 6. Split CSV into chunks
224
+ 7. CSV stats summary
225
+ 8. Exit
226
+ +> 6
227
+ Source CSV file path: /tmp/people.csv
228
+ Choose separator:
229
+ 1. comma (,)
230
+ 2. tab (\t)
231
+ 3. semicolon (;)
232
+ 4. pipe (|)
233
+ 5. custom
234
+ +Separator choice [1]: 1
235
+ Headers present? [Y/n]:
236
+ +Rows per chunk: 1000
237
+ Output directory [/tmp]:
238
+ Output file prefix [people]:
239
+ Overwrite existing chunk files? [y/N]:
240
+ Write manifest file? [y/N]:
241
+ -Split complete.
242
+ -Chunk size: 1000
243
+ -Data rows: 25000
244
+ -Chunks written: 25
245
+ -/tmp/people_part_001.csv
246
+ ```
247
+
248
+ ### 11. CSV stats interaction example
249
+
250
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
251
+
252
+ ```diff
253
+ CSV Tool Menu
254
+ 1. Extract column
255
+ 2. Extract rows (range)
256
+ 3. Randomize rows
257
+ 4. Dedupe using another CSV
258
+ 5. Validate parity
259
+ 6. Split CSV into chunks
260
+ 7. CSV stats summary
261
+ 8. Exit
262
+ +> 7
263
+ CSV file path: /tmp/people.csv
264
+ Choose separator:
265
+ 1. comma (,)
266
+ 2. tab (\t)
267
+ 3. semicolon (;)
268
+ 4. pipe (|)
269
+ 5. custom
270
+ +Separator choice [1]: 1
271
+ Headers present? [Y/n]:
272
+ Output destination:
273
+ 1. console
274
+ 2. file
275
+ +Output destination [1]: 1
276
+ -CSV Stats Summary
277
+ -Rows: 3
278
+ -Columns: 2
279
+ -Headers: name, city
280
+ -Column completeness:
281
+ - name: non_blank=3 blank=0
282
+ - city: non_blank=3 blank=0
283
+ ```
284
+
285
+ ### 12. CSV stats large-file behavior
286
+
287
+ - Stats scanning is streaming (`CSV.foreach`), processed in one pass.
288
+ - Memory grows with per-column aggregates (`column_stats`), not with total row count.
289
+
211
290
  ## Testing
212
291
 
213
292
  Run tests:
@@ -224,7 +303,7 @@ bundle exec rake test
224
303
 
225
304
  ## Alpha release
226
305
 
227
- Current prerelease version: `0.5.0.alpha`
306
+ Current prerelease version: `0.8.0.alpha`
228
307
 
229
308
  Install prerelease from RubyGems:
230
309
 
@@ -234,7 +313,7 @@ gem install csvops --pre
234
313
 
235
314
  Release runbook:
236
315
 
237
- - `docs/release-v0.5.0-alpha.md`
316
+ - `docs/release-v0.8.0-alpha.md`
238
317
 
239
318
 
240
319
  ## Architecture
data/docs/architecture.md CHANGED
@@ -2,15 +2,15 @@
2
2
 
3
3
  The codebase follows a DDD-lite layered structure:
4
4
 
5
- - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
6
- - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`).
5
+ - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, `CrossCsvDedupeSession`, `ParitySession`, `SplitSession`, and `CsvStatsSession` aggregates + supporting entities/value objects).
6
+ - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`, `RunCsvSplit`, `RunCsvStats`).
7
7
  - `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
8
8
  - `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
9
9
  - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
10
10
 
11
11
  ## Workflow boundary (standardized)
12
12
 
13
- For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`), the boundary is:
13
+ For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`, `CSV Stats`), the boundary is:
14
14
 
15
15
  - `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
16
16
  - `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
@@ -33,6 +33,8 @@ Current usage:
33
33
  - `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
34
34
  - `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
35
35
  - `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
36
+ - `RunCsvSplitWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvSplit::*`.
37
+ - `RunCsvStatsWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvStats::*`.
36
38
 
37
39
  ## Adding New Concepts
38
40
 
@@ -108,7 +110,7 @@ For a new function type, prefer one of these patterns:
108
110
 
109
111
  ## Domain model
110
112
 
111
- Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, and `CSV Parity`.
113
+ Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`, and `CSV Stats`.
112
114
 
113
115
  ### Cross-CSV Dedupe (Large-file behavior)
114
116
 
@@ -421,6 +423,117 @@ classDiagram
421
423
  RunCsvParity --> CsvParityComparator
422
424
  ```
423
425
 
426
+ ### CSV Split
427
+
428
+ Core DDD structure:
429
+
430
+ - Aggregate root: `SplitSession`
431
+ - Captures one CSV split request.
432
+ - Holds split source and split options.
433
+ - Entities:
434
+ - `SplitSource` (path + separator + header mode)
435
+ - Value objects:
436
+ - `SplitOptions` (chunk size, output directory, file prefix, overwrite policy, optional manifest configuration)
437
+ - Application service:
438
+ - `Application::UseCases::RunCsvSplit` orchestrates split execution and returns request/result style payloads.
439
+ - Infrastructure adapters:
440
+ - `Infrastructure::CSV::CsvSplitter` (streaming row-by-row chunk writer)
441
+ - `Infrastructure::Output::CsvSplitManifestWriter` (optional manifest output)
442
+ - Interface adapters:
443
+ - `Interface::CLI::MenuLoop`
444
+ - `Interface::CLI::Workflows::RunCsvSplitWorkflow`
445
+ - `Interface::CLI::Workflows::Builders::CsvSplitSessionBuilder`
446
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
447
+ - `Interface::CLI::Workflows::Steps::CsvSplit::*`
448
+ - `Interface::CLI::Workflows::Presenters::CsvSplitPresenter`
449
+ - `Interface::CLI::Workflows::Support::ResultErrorHandler`
450
+ - `Interface::CLI::Prompts::*`
451
+ - `Interface::CLI::Errors::Presenter`
452
+
453
+ ```mermaid
454
+ classDiagram
455
+ direction LR
456
+ class MenuLoop
457
+ class RunCsvSplitWorkflow
458
+ class Prompts
459
+ class Errors
460
+ class RunCsvSplit
461
+ class SplitSession
462
+ class SplitSource
463
+ class SplitOptions
464
+ class CsvSplitter
465
+ class CsvSplitManifestWriter
466
+ class CsvSplitPresenter
467
+
468
+ MenuLoop --> RunCsvSplitWorkflow : invokes
469
+ RunCsvSplitWorkflow --> Prompts : uses
470
+ RunCsvSplitWorkflow --> Errors : reports failures
471
+ RunCsvSplitWorkflow --> CsvSplitPresenter : renders
472
+ RunCsvSplitWorkflow --> RunCsvSplit : calls
473
+ RunCsvSplit --> SplitSession : orchestrates
474
+ SplitSession o-- SplitSource
475
+ SplitSession o-- SplitOptions
476
+ RunCsvSplit --> CsvSplitter
477
+ RunCsvSplit --> CsvSplitManifestWriter
478
+ ```
479
+
480
+ ### CSV Stats
481
+
482
+ Core DDD structure:
483
+
484
+ - Aggregate root: `StatsSession`
485
+ - Captures one stats summary request.
486
+ - Holds source profile and output destination.
487
+ - Entity:
488
+ - `StatsSource` (path + separator + header mode)
489
+ - Value objects:
490
+ - `StatsOptions` (currently lightweight; keeps option growth explicit)
491
+ - Shared `OutputDestination` (`console` or `file(path)`)
492
+ - Application service:
493
+ - `Application::UseCases::RunCsvStats` orchestrates stats scanning and output routing.
494
+ - Infrastructure adapters:
495
+ - `Infrastructure::CSV::CsvStatsScanner` (streaming one-pass row aggregation)
496
+ - `Infrastructure::Output::CsvStatsFileWriter` (metric/value artifact writer)
497
+ - Interface adapters:
498
+ - `Interface::CLI::MenuLoop`
499
+ - `Interface::CLI::Workflows::RunCsvStatsWorkflow`
500
+ - `Interface::CLI::Workflows::Builders::CsvStatsSessionBuilder`
501
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
502
+ - `Interface::CLI::Workflows::Steps::CsvStats::*`
503
+ - `Interface::CLI::Workflows::Presenters::CsvStatsPresenter`
504
+ - `Interface::CLI::Workflows::Support::{OutputDestinationMapper,ResultErrorHandler}`
505
+ - `Interface::CLI::Prompts::*`
506
+ - `Interface::CLI::Errors::Presenter`
507
+
508
+ ```mermaid
509
+ classDiagram
510
+ direction LR
511
+ class MenuLoop
512
+ class RunCsvStatsWorkflow
513
+ class Prompts
514
+ class Errors
515
+ class RunCsvStats
516
+ class StatsSession
517
+ class StatsSource
518
+ class StatsOptions
519
+ class OutputDestination
520
+ class CsvStatsScanner
521
+ class CsvStatsFileWriter
522
+ class CsvStatsPresenter
523
+
524
+ MenuLoop --> RunCsvStatsWorkflow : invokes
525
+ RunCsvStatsWorkflow --> Prompts : uses
526
+ RunCsvStatsWorkflow --> Errors : reports failures
527
+ RunCsvStatsWorkflow --> CsvStatsPresenter : renders
528
+ RunCsvStatsWorkflow --> RunCsvStats : calls
529
+ RunCsvStats --> StatsSession : orchestrates
530
+ StatsSession o-- StatsSource
531
+ StatsSession o-- StatsOptions
532
+ StatsSession o-- OutputDestination
533
+ RunCsvStats --> CsvStatsScanner
534
+ RunCsvStats --> CsvStatsFileWriter
535
+ ```
536
+
424
537
  ## Project layout
425
538
 
426
539
  ```text
@@ -431,12 +544,16 @@ lib/csvtool/domain/row_session/*
431
544
  lib/csvtool/domain/row_randomization_session/*
432
545
  lib/csvtool/domain/cross_csv_dedupe_session/*
433
546
  lib/csvtool/domain/csv_parity_session/*
547
+ lib/csvtool/domain/csv_split_session/*
548
+ lib/csvtool/domain/csv_stats_session/*
434
549
  lib/csvtool/domain/shared/output_destination.rb
435
550
  lib/csvtool/application/use_cases/run_extraction.rb
436
551
  lib/csvtool/application/use_cases/run_row_extraction.rb
437
552
  lib/csvtool/application/use_cases/run_row_randomization.rb
438
553
  lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
439
554
  lib/csvtool/application/use_cases/run_csv_parity.rb
555
+ lib/csvtool/application/use_cases/run_csv_split.rb
556
+ lib/csvtool/application/use_cases/run_csv_stats.rb
440
557
  lib/csvtool/infrastructure/csv/*
441
558
  lib/csvtool/infrastructure/output/*
442
559
  lib/csvtool/interface/cli/menu_loop.rb
@@ -0,0 +1,87 @@
1
+ # Release Checklist: v0.7.0-alpha
2
+
3
+ ## 1. Verify environment
4
+
5
+ ```bash
6
+ ruby -v
7
+ bundle -v
8
+ ```
9
+
10
+ Expected:
11
+ - Ruby `3.3.x`
12
+
13
+ ## 2. Install dependencies
14
+
15
+ ```bash
16
+ bundle install
17
+ ```
18
+
19
+ ## 3. Run quality checks
20
+
21
+ ```bash
22
+ bundle exec rake test
23
+ ```
24
+
25
+ ## 4. Smoke test CLI commands
26
+
27
+ ```bash
28
+ bundle exec csvtool menu
29
+ bundle exec csvtool column test/fixtures/sample_people.csv name
30
+ ```
31
+
32
+ ## 5. Smoke test workflows
33
+
34
+ ### CSV split workflow (new in this release)
35
+
36
+ Use menu option `6` (`Split CSV into chunks`) and verify:
37
+ - happy path split (`N=10`) writes expected chunk files and counts
38
+ - separator and header mode options work (CSV/TSV/headerless/custom)
39
+ - output directory + file prefix options produce expected paths
40
+ - overwrite protection blocks existing chunk paths unless allowed
41
+ - optional manifest output writes valid CSV metadata
42
+
43
+ ### Existing workflows regression pass
44
+
45
+ Use menu options `1-5` and verify:
46
+ - column extraction still works
47
+ - row-range extraction still works
48
+ - row randomization still works
49
+ - cross-CSV dedupe still works
50
+ - parity validation still works
51
+
52
+ ## 6. Build and validate gem package
53
+
54
+ ```bash
55
+ gem build csvops.gemspec
56
+ gem install ./csvops-0.7.0.alpha.gem
57
+ csvtool menu
58
+ ```
59
+
60
+ ## 7. Commit release prep
61
+
62
+ ```bash
63
+ git add -A
64
+ git commit -m "chore(release): prepare v0.7.0-alpha"
65
+ ```
66
+
67
+ ## 8. Tag release
68
+
69
+ ```bash
70
+ git tag -a v0.7.0-alpha -m "v0.7.0-alpha"
71
+ git push origin main --tags
72
+ ```
73
+
74
+ ## 9. Publish gem
75
+
76
+ ```bash
77
+ gem push csvops-0.7.0.alpha.gem
78
+ ```
79
+
80
+ ## 10. Create GitHub release
81
+
82
+ Create release `v0.7.0-alpha` with:
83
+ - New `Split CSV into chunks` workflow
84
+ - Split-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
85
+ - Output strategy improvements (directory/prefix/overwrite controls)
86
+ - Optional split manifest output
87
+ - Large-file streaming split coverage and docs updates
@@ -0,0 +1,88 @@
1
+ # Release Checklist: v0.8.0-alpha
2
+
3
+ ## 1. Verify environment
4
+
5
+ ```bash
6
+ ruby -v
7
+ bundle -v
8
+ ```
9
+
10
+ Expected:
11
+ - Ruby `3.3.x`
12
+
13
+ ## 2. Install dependencies
14
+
15
+ ```bash
16
+ bundle install
17
+ ```
18
+
19
+ ## 3. Run quality checks
20
+
21
+ ```bash
22
+ bundle exec rake test
23
+ ```
24
+
25
+ ## 4. Smoke test CLI commands
26
+
27
+ ```bash
28
+ bundle exec csvtool menu
29
+ bundle exec csvtool column test/fixtures/sample_people.csv name
30
+ ```
31
+
32
+ ## 5. Smoke test workflows
33
+
34
+ ### CSV stats workflow (new in this release)
35
+
36
+ Use menu option `7` (`CSV stats summary`) and verify:
37
+ - happy path summary prints rows/columns/headers
38
+ - separator and header mode options work (CSV/TSV/headerless/custom)
39
+ - column completeness output is correct for blanks
40
+ - output destination supports console and file
41
+ - invalid output path returns friendly error
42
+
43
+ ### Existing workflows regression pass
44
+
45
+ Use menu options `1-6` and verify:
46
+ - column extraction still works
47
+ - row-range extraction still works
48
+ - row randomization still works
49
+ - cross-CSV dedupe still works
50
+ - parity validation still works
51
+ - CSV split still works
52
+
53
+ ## 6. Build and validate gem package
54
+
55
+ ```bash
56
+ gem build csvops.gemspec
57
+ gem install ./csvops-0.8.0.alpha.gem
58
+ csvtool menu
59
+ ```
60
+
61
+ ## 7. Commit release prep
62
+
63
+ ```bash
64
+ git add -A
65
+ git commit -m "chore(release): prepare v0.8.0-alpha"
66
+ ```
67
+
68
+ ## 8. Tag release
69
+
70
+ ```bash
71
+ git tag -a v0.8.0-alpha -m "v0.8.0-alpha"
72
+ git push origin main --tags
73
+ ```
74
+
75
+ ## 9. Publish gem
76
+
77
+ ```bash
78
+ gem push csvops-0.8.0.alpha.gem
79
+ ```
80
+
81
+ ## 10. Create GitHub release
82
+
83
+ Create release `v0.8.0-alpha` with:
84
+ - New `CSV stats summary` workflow
85
+ - Stats-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
86
+ - Console/file output destination support for stats summary artifacts
87
+ - Streaming stats scanner coverage for large files
88
+ - Stats documentation updates in README + architecture guide
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "fileutils"
5
+ require "csvtool/infrastructure/csv/header_reader"
6
+ require "csvtool/infrastructure/csv/csv_splitter"
7
+ require "csvtool/infrastructure/output/csv_split_manifest_writer"
8
+
9
module Csvtool
  module Application
    module UseCases
      # Application service that splits one CSV source into chunk files and,
      # when requested, writes a manifest listing the chunks.
      #
      # Public methods never raise for the I/O and parse errors they rescue;
      # they return a Result whose +error+ holds a symbolic code and whose
      # +data+ holds the payload (or failure details).
      class RunCsvSplit
        # Outcome of a use-case call: +ok+ flag, symbolic +error+ code (nil on
        # success) and a +data+ hash.
        Result = Struct.new(:ok, :error, :data, keyword_init: true) do
          def ok?
            ok
          end
        end

        # Collaborators are injectable so the use case can be exercised
        # without touching the real CSV/output adapters.
        def initialize(
          header_reader: Infrastructure::CSV::HeaderReader.new,
          csv_splitter: Infrastructure::CSV::CsvSplitter.new,
          csv_split_manifest_writer: Infrastructure::Output::CsvSplitManifestWriter.new
        )
          @header_reader = header_reader
          @csv_splitter = csv_splitter
          @csv_split_manifest_writer = csv_split_manifest_writer
        end

        # Reads the header row of +file_path+.
        #
        # Returns a successful Result with +headers: nil+ when the caller says
        # the file has no headers, or with the headers returned by the header
        # reader otherwise. Failure codes: +:file_not_found+, +:no_headers+,
        # +:could_not_parse_csv+, +:cannot_read_file+.
        def read_headers(file_path:, col_sep:, headers_present:)
          return failure(:file_not_found, path: file_path) unless File.file?(file_path)
          return success(headers: nil) unless headers_present

          headers = @header_reader.call(file_path: file_path, col_sep: col_sep)
          return failure(:no_headers) if headers.empty?

          success(headers: headers)
        rescue CSV::MalformedCSVError
          failure(:could_not_parse_csv)
        rescue Errno::EACCES
          failure(:cannot_read_file, path: file_path)
        end

        # Executes the split described by +session+ (expects +source+ and
        # +options+ readers).
        #
        # The output directory defaults to the source file's directory and the
        # file prefix to the source file's basename without extension.
        #
        # On success the Result data is the splitter's stats merged with
        # +:output_directory+, +:file_prefix+ and +:manifest_path+ (nil when no
        # manifest was requested). Failure codes: +:file_not_found+,
        # +:output_file_exists+, +:could_not_parse_csv+,
        # +:cannot_write_output_file+.
        def call(session:)
          source = session.source
          # Guard first, mirroring read_headers: a missing source must be
          # reported as such, and must not leave a freshly created output
          # directory behind.
          return failure(:file_not_found, path: source.path) unless File.file?(source.path)

          output_directory = session.options.output_directory || File.dirname(source.path)
          file_prefix = session.options.file_prefix || File.basename(source.path, ".*")
          FileUtils.mkdir_p(output_directory)

          stats = @csv_splitter.call(
            file_path: source.path,
            col_sep: source.separator,
            headers_present: source.headers_present,
            chunk_size: session.options.chunk_size,
            output_directory: output_directory,
            file_prefix: file_prefix,
            overwrite_existing: session.options.overwrite_existing
          )
          manifest_path = maybe_write_manifest(
            session: session,
            output_directory: output_directory,
            file_prefix: file_prefix,
            stats: stats
          )
          success(stats.merge(output_directory: output_directory, file_prefix: file_prefix, manifest_path: manifest_path))
        rescue Infrastructure::CSV::CsvSplitter::OutputFileExistsError => e
          failure(:output_file_exists, path: e.path)
        rescue CSV::MalformedCSVError
          failure(:could_not_parse_csv)
        rescue Errno::EACCES, Errno::ENOENT => e
          failure(:cannot_write_output_file, path: output_directory, error_class: e.class)
        end

        private

        # Builds a successful Result carrying +data+.
        def success(data)
          Result.new(ok: true, error: nil, data: data)
        end

        # Builds a failed Result with the symbolic +code+ and optional details.
        def failure(code, data = {})
          Result.new(ok: false, error: code, data: data)
        end

        # Writes the manifest when the session options ask for one and returns
        # its path; returns nil otherwise. The path defaults to
        # "<file_prefix>_manifest.csv" inside the output directory.
        def maybe_write_manifest(session:, output_directory:, file_prefix:, stats:)
          return nil unless session.options.write_manifest

          manifest_path = session.options.manifest_path || File.join(output_directory, "#{file_prefix}_manifest.csv")
          @csv_split_manifest_writer.call(
            path: manifest_path,
            chunk_paths: stats[:chunk_paths],
            chunk_row_counts: stats[:chunk_row_counts]
          )
          manifest_path
        end
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/infrastructure/csv/csv_stats_scanner"
5
+ require "csvtool/infrastructure/output/csv_stats_file_writer"
6
+
7
module Csvtool
  module Application
    module UseCases
      # Application service that scans one CSV source for summary statistics
      # and, when the session's output destination is a file, writes them out.
      #
      # The rescued I/O and parse errors are converted into a failed Result
      # instead of propagating to the caller.
      class RunCsvStats
        # Outcome of a use-case call: +ok+ flag, symbolic +error+ code (nil on
        # success) and a +data+ hash.
        Result = Struct.new(:ok, :error, :data, keyword_init: true) do
          def ok?
            ok
          end
        end

        # Collaborators are injectable so the use case can be exercised
        # without the real scanner/writer adapters.
        def initialize(
          scanner: Infrastructure::CSV::CsvStatsScanner.new,
          csv_stats_file_writer: Infrastructure::Output::CsvStatsFileWriter.new
        )
          @scanner = scanner
          @csv_stats_file_writer = csv_stats_file_writer
        end

        # Runs the stats scan described by +session+ (expects +source+ and
        # +output_destination+ readers).
        #
        # On success the Result data is the scanner's stats, plus
        # +:output_path+ when they were written to a file. Failure codes:
        # +:file_not_found+, +:could_not_parse_csv+, +:cannot_read_file+
        # (scan phase) and +:cannot_write_output_file+ (write phase).
        def call(session:)
          source = session.source
          path = source.path
          return failure(:file_not_found, path: path) unless File.file?(path)

          stats = @scanner.call(
            file_path: path,
            col_sep: source.separator,
            headers_present: source.headers_present
          )
          destination = session.output_destination
          return success(stats) unless destination&.file?

          write_stats_file(destination: destination, stats: stats)
        rescue CSV::MalformedCSVError
          failure(:could_not_parse_csv)
        rescue Errno::EACCES, Errno::ENOENT
          # Write-phase errors are rescued inside write_stats_file, so anything
          # arriving here came from reading the source, whatever the
          # destination type is.
          failure(:cannot_read_file, path: path)
        end

        private

        # Writes +stats+ to the file destination and returns the final Result.
        # Permission and missing-path errors raised by the writer are reported
        # against the output path.
        def write_stats_file(destination:, stats:)
          @csv_stats_file_writer.call(path: destination.path, data: stats)
          success(stats.merge(output_path: destination.path))
        rescue Errno::EACCES, Errno::ENOENT => e
          failure(:cannot_write_output_file, path: destination.path, error_class: e.class)
        end

        # Builds a successful Result carrying +data+.
        def success(data)
          Result.new(ok: true, error: nil, data: data)
        end

        # Builds a failed Result with the symbolic +code+ and optional details.
        def failure(code, data = {})
          Result.new(ok: false, error: code, data: data)
        end
      end
    end
  end
end