csvops 0.6.0.alpha → 0.7.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/README.md +51 -12
  3. data/docs/architecture.md +61 -4
  4. data/docs/release-v0.7.0-alpha.md +87 -0
  5. data/lib/csvtool/application/use_cases/run_csv_split.rb +97 -0
  6. data/lib/csvtool/cli.rb +5 -1
  7. data/lib/csvtool/domain/csv_split_session/split_options.rb +27 -0
  8. data/lib/csvtool/domain/csv_split_session/split_session.rb +20 -0
  9. data/lib/csvtool/domain/csv_split_session/split_source.rb +17 -0
  10. data/lib/csvtool/infrastructure/csv/csv_splitter.rb +64 -0
  11. data/lib/csvtool/infrastructure/output/csv_split_manifest_writer.rb +20 -0
  12. data/lib/csvtool/interface/cli/errors/presenter.rb +8 -0
  13. data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
  14. data/lib/csvtool/interface/cli/prompts/chunk_size_prompt.rb +21 -0
  15. data/lib/csvtool/interface/cli/prompts/split_manifest_prompt.rb +30 -0
  16. data/lib/csvtool/interface/cli/prompts/split_output_prompt.rb +38 -0
  17. data/lib/csvtool/interface/cli/workflows/builders/csv_split_session_builder.rb +44 -0
  18. data/lib/csvtool/interface/cli/workflows/presenters/csv_split_presenter.rb +26 -0
  19. data/lib/csvtool/interface/cli/workflows/run_csv_split_workflow.rb +89 -0
  20. data/lib/csvtool/interface/cli/workflows/steps/csv_split/build_session_step.rb +30 -0
  21. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step.rb +43 -0
  22. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step.rb +30 -0
  23. data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step.rb +31 -0
  24. data/lib/csvtool/interface/cli/workflows/steps/csv_split/execute_step.rb +36 -0
  25. data/lib/csvtool/version.rb +1 -1
  26. data/test/csvtool/application/use_cases/run_csv_split_test.rb +124 -0
  27. data/test/csvtool/cli_test.rb +76 -29
  28. data/test/csvtool/infrastructure/csv/csv_splitter_test.rb +68 -0
  29. data/test/csvtool/infrastructure/output/csv_split_manifest_writer_test.rb +25 -0
  30. data/test/csvtool/interface/cli/menu_loop_test.rb +81 -130
  31. data/test/csvtool/interface/cli/prompts/chunk_size_prompt_test.rb +17 -0
  32. data/test/csvtool/interface/cli/prompts/split_manifest_prompt_test.rb +42 -0
  33. data/test/csvtool/interface/cli/prompts/split_output_prompt_test.rb +22 -0
  34. data/test/csvtool/interface/cli/workflows/builders/csv_split_session_builder_test.rb +30 -0
  35. data/test/csvtool/interface/cli/workflows/presenters/csv_split_presenter_test.rb +26 -0
  36. data/test/csvtool/interface/cli/workflows/run_csv_split_workflow_test.rb +200 -0
  37. data/test/csvtool/interface/cli/workflows/steps/csv_split/build_session_step_test.rb +40 -0
  38. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step_test.rb +64 -0
  39. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step_test.rb +30 -0
  40. data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step_test.rb +32 -0
  41. data/test/csvtool/interface/cli/workflows/steps/csv_split/execute_step_test.rb +83 -0
  42. data/test/fixtures/split_people_25.csv +26 -0
  43. metadata +34 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f7db22cb84c1d08c58b473368f9ad37575a217d6293539309277ed2b032a2852
4
- data.tar.gz: 124bebc822fefa5d1f71286701959876260c82164067c36ff94b712a0b4cc1b3
3
+ metadata.gz: 803fa825ef1f50edcd7c0bc032a86926d356cb3ba6d943c460d59759a953fdcd
4
+ data.tar.gz: 2ba2afc9951aa96e777cbf3ea81dc77a41c88d2546505c885302607432461633
5
5
  SHA512:
6
- metadata.gz: a8b8dbcfb66073f46f0ecc625267081fbe730e69ef9295f5d2303af6b831a9d71ef564f78f5b44212eb33c4ad7a5fdb78b54fa98e21dd58669e9494a5d3325fb
7
- data.tar.gz: 05cbcaa2ca3116ad463413e53600d32a53df0941ceb8873ed22c2ef2d4cfe1afc8f90e44c7ff4400212ebbd5083a2ceb6a983281291436e9478d5087cc98b9ad
6
+ metadata.gz: 4f82dd7e9d3ac5ff53f8aaf40a0e5500e9b074aa052a031f6de4f5a2cc1ab711a5c375d5c203bdfaae802d36a02ecf14c4f73231a9f14e31d2f042ffeecd9a08
7
+ data.tar.gz: f9428d2ef29d257c99b484c7277dcff566dd5cf09ec06b78b4514c410b7858ffd6854f8aafd39a727c9c3d1e44e6940bc15456f3b11fdcac4a5b879bee9cc826
data/README.md CHANGED
@@ -38,11 +38,12 @@ CSV Tool Menu
38
38
  3. Randomize rows
39
39
  4. Dedupe using another CSV
40
40
  5. Validate parity
41
- 6. Exit
41
+ 6. Split CSV into chunks
42
+ 7. Exit
42
43
  >
43
44
  ```
44
45
 
45
- Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, or `5` for parity validation.
46
+ Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, `5` for parity validation, or `6` for CSV splitting.
46
47
 
47
48
  ### 3. Follow prompts
48
49
 
@@ -61,6 +62,7 @@ Prompt flow by action:
61
62
  - `Randomize rows`: file path, separator, headers present, optional seed, output destination.
62
63
  - `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
63
64
  - `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
65
+ - `Split CSV into chunks`: source file, separator, header mode, chunk size, output directory/prefix, overwrite policy, optional manifest.
64
66
 
65
67
  ### 4. Example interaction (console output)
66
68
 
@@ -129,10 +131,11 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
129
131
  CSV Tool Menu
130
132
  1. Extract column
131
133
  2. Extract rows (range)
132
- 3. Randomize rows
133
- 4. Dedupe using another CSV
134
- 5. Validate parity
135
- 6. Exit
134
+ 3. Randomize rows
135
+ 4. Dedupe using another CSV
136
+ 5. Validate parity
137
+ 6. Split CSV into chunks
138
+ 7. Exit
136
139
  +> 4
137
140
  CSV file path: /tmp/source.csv
138
141
  Source CSV separator:
@@ -177,10 +180,11 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
177
180
  CSV Tool Menu
178
181
  1. Extract column
179
182
  2. Extract rows (range)
180
- 3. Randomize rows
181
- 4. Dedupe using another CSV
182
- 5. Validate parity
183
- 6. Exit
183
+ 3. Randomize rows
184
+ 4. Dedupe using another CSV
185
+ 5. Validate parity
186
+ 6. Split CSV into chunks
187
+ 7. Exit
184
188
  +> 5
185
189
  Left CSV file path: /tmp/left.csv
186
190
  Right CSV file path: /tmp/right.csv
@@ -208,6 +212,41 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
208
212
  - Exact duplicate semantics are preserved by count deltas per normalized row value.
209
213
  - Memory scales with the number of distinct row keys in the parity map, not the total input row count.
210
214
 
215
+ ### 10. Split interaction example
216
+
217
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
218
+
219
+ ```diff
220
+ CSV Tool Menu
221
+ 1. Extract column
222
+ 2. Extract rows (range)
223
+ 3. Randomize rows
224
+ 4. Dedupe using another CSV
225
+ 5. Validate parity
226
+ 6. Split CSV into chunks
227
+ 7. Exit
228
+ +> 6
229
+ Source CSV file path: /tmp/people.csv
230
+ Choose separator:
231
+ 1. comma (,)
232
+ 2. tab (\t)
233
+ 3. semicolon (;)
234
+ 4. pipe (|)
235
+ 5. custom
236
+ +Separator choice [1]: 1
237
+ Headers present? [Y/n]:
238
+ +Rows per chunk: 1000
239
+ Output directory [/tmp]:
240
+ Output file prefix [people]:
241
+ Overwrite existing chunk files? [y/N]:
242
+ Write manifest file? [y/N]:
243
+ -Split complete.
244
+ -Chunk size: 1000
245
+ -Data rows: 25000
246
+ -Chunks written: 25
247
+ -/tmp/people_part_001.csv
248
+ ```
249
+
211
250
  ## Testing
212
251
 
213
252
  Run tests:
@@ -224,7 +263,7 @@ bundle exec rake test
224
263
 
225
264
  ## Alpha release
226
265
 
227
- Current prerelease version: `0.5.0.alpha`
266
+ Current prerelease version: `0.7.0.alpha`
228
267
 
229
268
  Install prerelease from RubyGems:
230
269
 
@@ -234,7 +273,7 @@ gem install csvops --pre
234
273
 
235
274
  Release runbook:
236
275
 
237
- - `docs/release-v0.5.0-alpha.md`
276
+ - `docs/release-v0.7.0-alpha.md`
238
277
 
239
278
 
240
279
  ## Architecture
data/docs/architecture.md CHANGED
@@ -2,15 +2,15 @@
2
2
 
3
3
  The codebase follows a DDD-lite layered structure:
4
4
 
5
- - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
6
- - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`).
5
+ - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, `CrossCsvDedupeSession`, and `CsvSplitSession` aggregates + supporting entities/value objects).
6
+ - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`, `RunCsvSplit`).
7
7
  - `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
8
8
  - `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
9
9
  - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
10
10
 
11
11
  ## Workflow boundary (standardized)
12
12
 
13
- For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`), the boundary is:
13
+ For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`), the boundary is:
14
14
 
15
15
  - `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
16
16
  - `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
@@ -33,6 +33,7 @@ Current usage:
33
33
  - `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
34
34
  - `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
35
35
  - `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
36
+ - `RunCsvSplitWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvSplit::*`.
36
37
 
37
38
  ## Adding New Concepts
38
39
 
@@ -108,7 +109,7 @@ For a new function type, prefer one of these patterns:
108
109
 
109
110
  ## Domain model
110
111
 
111
- Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, and `CSV Parity`.
112
+ Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, and `CSV Split`.
112
113
 
113
114
  ### Cross-CSV Dedupe (Large-file behavior)
114
115
 
@@ -421,6 +422,60 @@ classDiagram
421
422
  RunCsvParity --> CsvParityComparator
422
423
  ```
423
424
 
425
+ ### CSV Split
426
+
427
+ Core DDD structure:
428
+
429
+ - Aggregate root: `SplitSession`
430
+ - Captures one CSV split request.
431
+ - Holds split source and split options.
432
+ - Entities:
433
+ - `SplitSource` (path + separator + header mode)
434
+ - Value objects:
435
+ - `SplitOptions` (chunk size, output directory, file prefix, overwrite policy, optional manifest configuration)
436
+ - Application service:
437
+ - `Application::UseCases::RunCsvSplit` orchestrates split execution and returns request/result style payloads.
438
+ - Infrastructure adapters:
439
+ - `Infrastructure::CSV::CsvSplitter` (streaming row-by-row chunk writer)
440
+ - `Infrastructure::Output::CsvSplitManifestWriter` (optional manifest output)
441
+ - Interface adapters:
442
+ - `Interface::CLI::MenuLoop`
443
+ - `Interface::CLI::Workflows::RunCsvSplitWorkflow`
444
+ - `Interface::CLI::Workflows::Builders::CsvSplitSessionBuilder`
445
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
446
+ - `Interface::CLI::Workflows::Steps::CsvSplit::*`
447
+ - `Interface::CLI::Workflows::Presenters::CsvSplitPresenter`
448
+ - `Interface::CLI::Workflows::Support::ResultErrorHandler`
449
+ - `Interface::CLI::Prompts::*`
450
+ - `Interface::CLI::Errors::Presenter`
451
+
452
+ ```mermaid
453
+ classDiagram
454
+ direction LR
455
+ class MenuLoop
456
+ class RunCsvSplitWorkflow
457
+ class Prompts
458
+ class Errors
459
+ class RunCsvSplit
460
+ class SplitSession
461
+ class SplitSource
462
+ class SplitOptions
463
+ class CsvSplitter
464
+ class CsvSplitManifestWriter
465
+ class CsvSplitPresenter
466
+
467
+ MenuLoop --> RunCsvSplitWorkflow : invokes
468
+ RunCsvSplitWorkflow --> Prompts : uses
469
+ RunCsvSplitWorkflow --> Errors : reports failures
470
+ RunCsvSplitWorkflow --> CsvSplitPresenter : renders
471
+ RunCsvSplitWorkflow --> RunCsvSplit : calls
472
+ RunCsvSplit --> SplitSession : orchestrates
473
+ SplitSession o-- SplitSource
474
+ SplitSession o-- SplitOptions
475
+ RunCsvSplit --> CsvSplitter
476
+ RunCsvSplit --> CsvSplitManifestWriter
477
+ ```
478
+
424
479
  ## Project layout
425
480
 
426
481
  ```text
@@ -431,12 +486,14 @@ lib/csvtool/domain/row_session/*
431
486
  lib/csvtool/domain/row_randomization_session/*
432
487
  lib/csvtool/domain/cross_csv_dedupe_session/*
433
488
  lib/csvtool/domain/csv_parity_session/*
489
+ lib/csvtool/domain/csv_split_session/*
434
490
  lib/csvtool/domain/shared/output_destination.rb
435
491
  lib/csvtool/application/use_cases/run_extraction.rb
436
492
  lib/csvtool/application/use_cases/run_row_extraction.rb
437
493
  lib/csvtool/application/use_cases/run_row_randomization.rb
438
494
  lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
439
495
  lib/csvtool/application/use_cases/run_csv_parity.rb
496
+ lib/csvtool/application/use_cases/run_csv_split.rb
440
497
  lib/csvtool/infrastructure/csv/*
441
498
  lib/csvtool/infrastructure/output/*
442
499
  lib/csvtool/interface/cli/menu_loop.rb
data/docs/release-v0.7.0-alpha.md ADDED
@@ -0,0 +1,87 @@
1
+ # Release Checklist: v0.7.0-alpha
2
+
3
+ ## 1. Verify environment
4
+
5
+ ```bash
6
+ ruby -v
7
+ bundle -v
8
+ ```
9
+
10
+ Expected:
11
+ - Ruby `3.3.x`
12
+
13
+ ## 2. Install dependencies
14
+
15
+ ```bash
16
+ bundle install
17
+ ```
18
+
19
+ ## 3. Run quality checks
20
+
21
+ ```bash
22
+ bundle exec rake test
23
+ ```
24
+
25
+ ## 4. Smoke test CLI commands
26
+
27
+ ```bash
28
+ bundle exec csvtool menu
29
+ bundle exec csvtool column test/fixtures/sample_people.csv name
30
+ ```
31
+
32
+ ## 5. Smoke test workflows
33
+
34
+ ### CSV split workflow (new in this release)
35
+
36
+ Use menu option `6` (`Split CSV into chunks`) and verify:
37
+ - happy path split (`N=10`) writes expected chunk files and counts
38
+ - separator and header mode options work (CSV/TSV/headerless/custom)
39
+ - output directory + file prefix options produce expected paths
40
+ - overwrite protection blocks existing chunk paths unless allowed
41
+ - optional manifest output writes valid CSV metadata
42
+
43
+ ### Existing workflows regression pass
44
+
45
+ Use menu options `1-5` and verify:
46
+ - column extraction still works
47
+ - row-range extraction still works
48
+ - row randomization still works
49
+ - cross-CSV dedupe still works
50
+ - parity validation still works
51
+
52
+ ## 6. Build and validate gem package
53
+
54
+ ```bash
55
+ gem build csvops.gemspec
56
+ gem install ./csvops-0.7.0.alpha.gem
57
+ csvtool menu
58
+ ```
59
+
60
+ ## 7. Commit release prep
61
+
62
+ ```bash
63
+ git add -A
64
+ git commit -m "chore(release): prepare v0.7.0-alpha"
65
+ ```
66
+
67
+ ## 8. Tag release
68
+
69
+ ```bash
70
+ git tag -a v0.7.0-alpha -m "v0.7.0-alpha"
71
+ git push origin main --tags
72
+ ```
73
+
74
+ ## 9. Publish gem
75
+
76
+ ```bash
77
+ gem push csvops-0.7.0.alpha.gem
78
+ ```
79
+
80
+ ## 10. Create GitHub release
81
+
82
+ Create release `v0.7.0-alpha` with:
83
+ - New `Split CSV into chunks` workflow
84
+ - Split-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
85
+ - Output strategy improvements (directory/prefix/overwrite controls)
86
+ - Optional split manifest output
87
+ - Large-file streaming split coverage and docs updates
data/lib/csvtool/application/use_cases/run_csv_split.rb ADDED
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "fileutils"
5
+ require "csvtool/infrastructure/csv/header_reader"
6
+ require "csvtool/infrastructure/csv/csv_splitter"
7
+ require "csvtool/infrastructure/output/csv_split_manifest_writer"
8
+
9
+ module Csvtool
10
+ module Application
11
+ module UseCases
12
+ class RunCsvSplit
13
+ Result = Struct.new(:ok, :error, :data, keyword_init: true) do
14
+ def ok?
15
+ ok
16
+ end
17
+ end
18
+
19
+ def initialize(
20
+ header_reader: Infrastructure::CSV::HeaderReader.new,
21
+ csv_splitter: Infrastructure::CSV::CsvSplitter.new,
22
+ csv_split_manifest_writer: Infrastructure::Output::CsvSplitManifestWriter.new
23
+ )
24
+ @header_reader = header_reader
25
+ @csv_splitter = csv_splitter
26
+ @csv_split_manifest_writer = csv_split_manifest_writer
27
+ end
28
+
29
+ def read_headers(file_path:, col_sep:, headers_present:)
30
+ return failure(:file_not_found, path: file_path) unless File.file?(file_path)
31
+ return success(headers: nil) unless headers_present
32
+
33
+ headers = @header_reader.call(file_path: file_path, col_sep: col_sep)
34
+ return failure(:no_headers) if headers.empty?
35
+
36
+ success(headers: headers)
37
+ rescue CSV::MalformedCSVError
38
+ failure(:could_not_parse_csv)
39
+ rescue Errno::EACCES
40
+ failure(:cannot_read_file, path: file_path)
41
+ end
42
+
43
+ def call(session:)
44
+ source = session.source
45
+ output_directory = session.options.output_directory || File.dirname(source.path)
46
+ file_prefix = session.options.file_prefix || File.basename(source.path, ".*")
47
+ FileUtils.mkdir_p(output_directory)
48
+
49
+ stats = @csv_splitter.call(
50
+ file_path: source.path,
51
+ col_sep: source.separator,
52
+ headers_present: source.headers_present,
53
+ chunk_size: session.options.chunk_size,
54
+ output_directory: output_directory,
55
+ file_prefix: file_prefix,
56
+ overwrite_existing: session.options.overwrite_existing
57
+ )
58
+ manifest_path = maybe_write_manifest(
59
+ session: session,
60
+ output_directory: output_directory,
61
+ file_prefix: file_prefix,
62
+ stats: stats
63
+ )
64
+ success(stats.merge(output_directory: output_directory, file_prefix: file_prefix, manifest_path: manifest_path))
65
+ rescue Infrastructure::CSV::CsvSplitter::OutputFileExistsError => e
66
+ failure(:output_file_exists, path: e.path)
67
+ rescue CSV::MalformedCSVError
68
+ failure(:could_not_parse_csv)
69
+ rescue Errno::EACCES, Errno::ENOENT => e
70
+ failure(:cannot_write_output_file, path: output_directory, error_class: e.class)
71
+ end
72
+
73
+ private
74
+
75
+ def success(data)
76
+ Result.new(ok: true, error: nil, data: data)
77
+ end
78
+
79
+ def failure(code, data = {})
80
+ Result.new(ok: false, error: code, data: data)
81
+ end
82
+
83
+ def maybe_write_manifest(session:, output_directory:, file_prefix:, stats:)
84
+ return nil unless session.options.write_manifest
85
+
86
+ manifest_path = session.options.manifest_path || File.join(output_directory, "#{file_prefix}_manifest.csv")
87
+ @csv_split_manifest_writer.call(
88
+ path: manifest_path,
89
+ chunk_paths: stats[:chunk_paths],
90
+ chunk_row_counts: stats[:chunk_row_counts]
91
+ )
92
+ manifest_path
93
+ end
94
+ end
95
+ end
96
+ end
97
+ end
data/lib/csvtool/cli.rb CHANGED
@@ -7,6 +7,7 @@ require "csvtool/interface/cli/workflows/run_row_extraction_workflow"
7
7
  require "csvtool/interface/cli/workflows/run_row_randomization_workflow"
8
8
  require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
9
9
  require "csvtool/interface/cli/workflows/run_csv_parity_workflow"
10
+ require "csvtool/interface/cli/workflows/run_csv_split_workflow"
10
11
  require "csvtool/interface/cli/errors/presenter"
11
12
  require "csvtool/infrastructure/csv/header_reader"
12
13
  require "csvtool/infrastructure/csv/value_streamer"
@@ -20,6 +21,7 @@ module Csvtool
20
21
  "Randomize rows",
21
22
  "Dedupe using another CSV",
22
23
  "Validate parity",
24
+ "Split CSV into chunks",
23
25
  "Exit"
24
26
  ].freeze
25
27
 
@@ -54,6 +56,7 @@ module Csvtool
54
56
  randomize_rows_action = -> { Interface::CLI::Workflows::RunRowRandomizationWorkflow.new(stdin: @stdin, stdout: @stdout).call }
55
57
  dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
56
58
  parity_action = -> { Interface::CLI::Workflows::RunCsvParityWorkflow.new(stdin: @stdin, stdout: @stdout).call }
59
+ split_action = -> { Interface::CLI::Workflows::RunCsvSplitWorkflow.new(stdin: @stdin, stdout: @stdout).call }
57
60
  Interface::CLI::MenuLoop.new(
58
61
  stdin: @stdin,
59
62
  stdout: @stdout,
@@ -62,7 +65,8 @@ module Csvtool
62
65
  extract_rows_action: extract_rows_action,
63
66
  randomize_rows_action: randomize_rows_action,
64
67
  dedupe_action: dedupe_action,
65
- parity_action: parity_action
68
+ parity_action: parity_action,
69
+ split_action: split_action
66
70
  ).run
67
71
  end
68
72
 
data/lib/csvtool/domain/csv_split_session/split_options.rb ADDED
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Csvtool
4
+ module Domain
5
+ module CsvSplitSession
6
+ class SplitOptions
7
+ attr_reader :chunk_size, :output_directory, :file_prefix, :overwrite_existing, :write_manifest, :manifest_path
8
+
9
+ def initialize(
10
+ chunk_size:,
11
+ output_directory: nil,
12
+ file_prefix: nil,
13
+ overwrite_existing: false,
14
+ write_manifest: false,
15
+ manifest_path: nil
16
+ )
17
+ @chunk_size = Integer(chunk_size)
18
+ @output_directory = output_directory
19
+ @file_prefix = file_prefix
20
+ @overwrite_existing = overwrite_existing
21
+ @write_manifest = write_manifest
22
+ @manifest_path = manifest_path
23
+ end
24
+ end
25
+ end
26
+ end
27
+ end
data/lib/csvtool/domain/csv_split_session/split_session.rb ADDED
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Csvtool
4
+ module Domain
5
+ module CsvSplitSession
6
+ class SplitSession
7
+ attr_reader :source, :options
8
+
9
+ def self.start(source:, options:)
10
+ new(source: source, options: options)
11
+ end
12
+
13
+ def initialize(source:, options:)
14
+ @source = source
15
+ @options = options
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
data/lib/csvtool/domain/csv_split_session/split_source.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Csvtool
4
+ module Domain
5
+ module CsvSplitSession
6
+ class SplitSource
7
+ attr_reader :path, :separator, :headers_present
8
+
9
+ def initialize(path:, separator:, headers_present:)
10
+ @path = path
11
+ @separator = separator
12
+ @headers_present = headers_present
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
data/lib/csvtool/infrastructure/csv/csv_splitter.rb ADDED
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module Csvtool
6
+ module Infrastructure
7
+ module CSV
8
+ class CsvSplitter
9
+ class OutputFileExistsError < StandardError
10
+ attr_reader :path
11
+
12
+ def initialize(path)
13
+ super("output file exists: #{path}")
14
+ @path = path
15
+ end
16
+ end
17
+
18
+ def call(file_path:, col_sep:, headers_present:, chunk_size:, output_directory:, file_prefix:, overwrite_existing:)
19
+ ext = File.extname(file_path)
20
+ ext = ".csv" if ext.empty?
21
+ sequence = 0
22
+ data_rows = 0
23
+ chunk_paths = []
24
+ chunk_row_counts = []
25
+ rows_in_chunk = 0
26
+ current_csv = nil
27
+
28
+ write_mode_headers = nil
29
+ write_headers = headers_present
30
+
31
+ ::CSV.foreach(file_path, headers: headers_present, col_sep: col_sep) do |row|
32
+ if current_csv.nil? || rows_in_chunk >= chunk_size
33
+ current_csv&.close
34
+ sequence += 1
35
+ rows_in_chunk = 0
36
+ path = File.join(output_directory, format("%<prefix>s_part_%<num>03d%<ext>s", prefix: file_prefix, num: sequence, ext: ext))
37
+ raise OutputFileExistsError.new(path) if File.exist?(path) && !overwrite_existing
38
+
39
+ chunk_paths << path
40
+ chunk_row_counts << 0
41
+ write_mode_headers = headers_present ? row.headers : nil
42
+ current_csv = ::CSV.open(path, "w", write_headers: write_headers, headers: write_mode_headers, col_sep: col_sep)
43
+ end
44
+
45
+ fields = headers_present ? row.fields : row
46
+ current_csv << fields
47
+ rows_in_chunk += 1
48
+ chunk_row_counts[-1] += 1
49
+ data_rows += 1
50
+ end
51
+
52
+ {
53
+ chunk_paths: chunk_paths,
54
+ chunk_count: chunk_paths.length,
55
+ data_rows: data_rows,
56
+ chunk_row_counts: chunk_row_counts
57
+ }
58
+ ensure
59
+ current_csv&.close unless current_csv&.closed?
60
+ end
61
+ end
62
+ end
63
+ end
64
+ end
data/lib/csvtool/infrastructure/output/csv_split_manifest_writer.rb ADDED
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module Csvtool
6
+ module Infrastructure
7
+ module Output
8
+ class CsvSplitManifestWriter
9
+ def call(path:, chunk_paths:, chunk_row_counts:)
10
+ ::CSV.open(path, "w") do |csv|
11
+ csv << %w[chunk_index chunk_path row_count]
12
+ chunk_paths.each_with_index do |chunk_path, index|
13
+ csv << [index + 1, chunk_path, chunk_row_counts[index]]
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
data/lib/csvtool/interface/cli/errors/presenter.rb CHANGED
@@ -33,6 +33,10 @@ module Csvtool
33
33
  @stdout.puts "Cannot write output file: #{path} (#{error_class})"
34
34
  end
35
35
 
36
+ def output_file_exists(path)
37
+ @stdout.puts "Output file already exists: #{path}"
38
+ end
39
+
36
40
  def empty_output_path
37
41
  @stdout.puts "Output file path cannot be empty."
38
42
  end
@@ -53,6 +57,10 @@ module Csvtool
53
57
  @stdout.puts "Seed must be an integer."
54
58
  end
55
59
 
60
+ def invalid_chunk_size
61
+ @stdout.puts "Chunk size must be a positive integer."
62
+ end
63
+
56
64
  def canceled
57
65
  @stdout.puts "Canceled."
58
66
  end
data/lib/csvtool/interface/cli/menu_loop.rb CHANGED
@@ -4,7 +4,7 @@ module Csvtool
4
4
  module Interface
5
5
  module CLI
6
6
  class MenuLoop
7
- def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:, parity_action:)
7
+ def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:, parity_action:, split_action:)
8
8
  @stdin = stdin
9
9
  @stdout = stdout
10
10
  @menu_options = menu_options
@@ -13,6 +13,7 @@ module Csvtool
13
13
  @randomize_rows_action = randomize_rows_action
14
14
  @dedupe_action = dedupe_action
15
15
  @parity_action = parity_action
16
+ @split_action = split_action
16
17
  end
17
18
 
18
19
  def run
@@ -34,9 +35,11 @@ module Csvtool
34
35
  when "5"
35
36
  @parity_action.call
36
37
  when "6"
38
+ @split_action.call
39
+ when "7"
37
40
  return 0
38
41
  else
39
- @stdout.puts "Please choose 1, 2, 3, 4, 5, or 6."
42
+ @stdout.puts "Please choose 1, 2, 3, 4, 5, 6, or 7."
40
43
  end
41
44
  end
42
45
  end
data/lib/csvtool/interface/cli/prompts/chunk_size_prompt.rb ADDED
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Csvtool
4
+ module Interface
5
+ module CLI
6
+ module Prompts
7
+ class ChunkSizePrompt
8
+ def initialize(stdin:, stdout:)
9
+ @stdin = stdin
10
+ @stdout = stdout
11
+ end
12
+
13
+ def call
14
+ @stdout.print "Rows per chunk: "
15
+ @stdin.gets&.strip.to_s
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end