csvops 0.5.0.alpha → 0.7.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +88 -7
- data/docs/architecture.md +119 -5
- data/docs/release-v0.6.0-alpha.md +84 -0
- data/docs/release-v0.7.0-alpha.md +87 -0
- data/lib/csvtool/application/use_cases/run_csv_parity.rb +70 -0
- data/lib/csvtool/application/use_cases/run_csv_split.rb +97 -0
- data/lib/csvtool/cli.rb +9 -1
- data/lib/csvtool/domain/csv_parity_session/parity_options.rb +22 -0
- data/lib/csvtool/domain/csv_parity_session/parity_session.rb +20 -0
- data/lib/csvtool/domain/csv_parity_session/source_pair.rb +19 -0
- data/lib/csvtool/domain/csv_split_session/split_options.rb +27 -0
- data/lib/csvtool/domain/csv_split_session/split_session.rb +20 -0
- data/lib/csvtool/domain/csv_split_session/split_source.rb +17 -0
- data/lib/csvtool/infrastructure/csv/csv_parity_comparator.rb +71 -0
- data/lib/csvtool/infrastructure/csv/csv_splitter.rb +64 -0
- data/lib/csvtool/infrastructure/output/csv_split_manifest_writer.rb +20 -0
- data/lib/csvtool/interface/cli/errors/presenter.rb +12 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
- data/lib/csvtool/interface/cli/prompts/chunk_size_prompt.rb +21 -0
- data/lib/csvtool/interface/cli/prompts/split_manifest_prompt.rb +30 -0
- data/lib/csvtool/interface/cli/prompts/split_output_prompt.rb +38 -0
- data/lib/csvtool/interface/cli/workflows/builders/csv_parity_session_builder.rb +33 -0
- data/lib/csvtool/interface/cli/workflows/builders/csv_split_session_builder.rb +44 -0
- data/lib/csvtool/interface/cli/workflows/presenters/csv_parity_presenter.rb +38 -0
- data/lib/csvtool/interface/cli/workflows/presenters/csv_split_presenter.rb +26 -0
- data/lib/csvtool/interface/cli/workflows/run_csv_parity_workflow.rb +66 -0
- data/lib/csvtool/interface/cli/workflows/run_csv_split_workflow.rb +89 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/build_session_step.rb +30 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step.rb +43 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step.rb +30 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step.rb +31 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/execute_step.rb +36 -0
- data/lib/csvtool/interface/cli/workflows/steps/parity/build_session_step.rb +25 -0
- data/lib/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step.rb +32 -0
- data/lib/csvtool/interface/cli/workflows/steps/parity/execute_step.rb +26 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_csv_parity_test.rb +160 -0
- data/test/csvtool/application/use_cases/run_csv_split_test.rb +124 -0
- data/test/csvtool/cli_test.rb +222 -21
- data/test/csvtool/cli_unit_test.rb +4 -4
- data/test/csvtool/domain/csv_parity_session/parity_options_test.rb +17 -0
- data/test/csvtool/domain/csv_parity_session/parity_session_test.rb +18 -0
- data/test/csvtool/domain/csv_parity_session/source_pair_test.rb +11 -0
- data/test/csvtool/infrastructure/csv/csv_parity_comparator_test.rb +78 -0
- data/test/csvtool/infrastructure/csv/csv_splitter_test.rb +68 -0
- data/test/csvtool/infrastructure/output/csv_split_manifest_writer_test.rb +25 -0
- data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +87 -93
- data/test/csvtool/interface/cli/prompts/chunk_size_prompt_test.rb +17 -0
- data/test/csvtool/interface/cli/prompts/split_manifest_prompt_test.rb +42 -0
- data/test/csvtool/interface/cli/prompts/split_output_prompt_test.rb +22 -0
- data/test/csvtool/interface/cli/workflows/builders/csv_parity_session_builder_test.rb +20 -0
- data/test/csvtool/interface/cli/workflows/builders/csv_split_session_builder_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/presenters/csv_parity_presenter_test.rb +43 -0
- data/test/csvtool/interface/cli/workflows/presenters/csv_split_presenter_test.rb +26 -0
- data/test/csvtool/interface/cli/workflows/run_csv_parity_workflow_test.rb +94 -0
- data/test/csvtool/interface/cli/workflows/run_csv_split_workflow_test.rb +200 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/build_session_step_test.rb +40 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step_test.rb +64 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step_test.rb +32 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/execute_step_test.rb +83 -0
- data/test/csvtool/interface/cli/workflows/steps/parity/build_session_step_test.rb +41 -0
- data/test/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/steps/parity/execute_step_test.rb +40 -0
- data/test/fixtures/parity_duplicates_left.csv +4 -0
- data/test/fixtures/parity_duplicates_right.csv +3 -0
- data/test/fixtures/parity_people_header_mismatch.csv +4 -0
- data/test/fixtures/parity_people_many_reordered.csv +13 -0
- data/test/fixtures/parity_people_mismatch.csv +4 -0
- data/test/fixtures/parity_people_reordered.csv +4 -0
- data/test/fixtures/parity_people_reordered.tsv +4 -0
- data/test/fixtures/split_people_25.csv +26 -0
- metadata +64 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 803fa825ef1f50edcd7c0bc032a86926d356cb3ba6d943c460d59759a953fdcd
|
|
4
|
+
data.tar.gz: 2ba2afc9951aa96e777cbf3ea81dc77a41c88d2546505c885302607432461633
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4f82dd7e9d3ac5ff53f8aaf40a0e5500e9b074aa052a031f6de4f5a2cc1ab711a5c375d5c203bdfaae802d36a02ecf14c4f73231a9f14e31d2f042ffeecd9a08
|
|
7
|
+
data.tar.gz: f9428d2ef29d257c99b484c7277dcff566dd5cf09ec06b78b4514c410b7858ffd6854f8aafd39a727c9c3d1e44e6940bc15456f3b11fdcac4a5b879bee9cc826
|
data/README.md
CHANGED
|
@@ -37,11 +37,13 @@ CSV Tool Menu
|
|
|
37
37
|
2. Extract rows (range)
|
|
38
38
|
3. Randomize rows
|
|
39
39
|
4. Dedupe using another CSV
|
|
40
|
-
5. Exit
|
|
40
|
+
5. Validate parity
|
|
41
|
+
6. Split CSV into chunks
|
|
42
|
+
7. Exit
|
|
41
43
|
>
|
|
42
44
|
```
|
|
43
45
|
|
|
44
|
-
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, or `4` for cross-CSV dedupe.
|
|
46
|
+
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, `5` for parity validation, or `6` for CSV splitting.
|
|
45
47
|
|
|
46
48
|
### 3. Follow prompts
|
|
47
49
|
|
|
@@ -59,6 +61,8 @@ Prompt flow by action:
|
|
|
59
61
|
- `Extract rows (range)`: file path, separator, start row, end row, output destination.
|
|
60
62
|
- `Randomize rows`: file path, separator, headers present, optional seed, output destination.
|
|
61
63
|
- `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
|
|
64
|
+
- `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
|
|
65
|
+
- `Split CSV into chunks`: source file, separator, header mode, chunk size, output directory/prefix, overwrite policy, optional manifest.
|
|
62
66
|
|
|
63
67
|
### 4. Example interaction (console output)
|
|
64
68
|
|
|
@@ -127,9 +131,11 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
127
131
|
CSV Tool Menu
|
|
128
132
|
1. Extract column
|
|
129
133
|
2. Extract rows (range)
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
134
|
+
3. Randomize rows
|
|
135
|
+
4. Dedupe using another CSV
|
|
136
|
+
5. Validate parity
|
|
137
|
+
6. Split CSV into chunks
|
|
138
|
+
7. Exit
|
|
133
139
|
+> 4
|
|
134
140
|
CSV file path: /tmp/source.csv
|
|
135
141
|
Source CSV separator:
|
|
@@ -166,6 +172,81 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
166
172
|
-Summary: source_rows=5 removed_rows=3 kept_rows=2
|
|
167
173
|
```
|
|
168
174
|
|
|
175
|
+
### 8. Parity interaction example
|
|
176
|
+
|
|
177
|
+
Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
178
|
+
|
|
179
|
+
```diff
|
|
180
|
+
CSV Tool Menu
|
|
181
|
+
1. Extract column
|
|
182
|
+
2. Extract rows (range)
|
|
183
|
+
3. Randomize rows
|
|
184
|
+
4. Dedupe using another CSV
|
|
185
|
+
5. Validate parity
|
|
186
|
+
6. Split CSV into chunks
|
|
187
|
+
7. Exit
|
|
188
|
+
+> 5
|
|
189
|
+
Left CSV file path: /tmp/left.csv
|
|
190
|
+
Right CSV file path: /tmp/right.csv
|
|
191
|
+
Choose separator:
|
|
192
|
+
1. comma (,)
|
|
193
|
+
2. tab (\t)
|
|
194
|
+
3. semicolon (;)
|
|
195
|
+
4. pipe (|)
|
|
196
|
+
5. custom
|
|
197
|
+
+Separator choice [1]: 1
|
|
198
|
+
Headers present? [Y/n]:
|
|
199
|
+
-MISMATCH
|
|
200
|
+
-Summary: left_rows=10 right_rows=10 left_only=2 right_only=2
|
|
201
|
+
-Left-only examples:
|
|
202
|
+
- 4,Dina (count +1)
|
|
203
|
+
-Right-only examples:
|
|
204
|
+
- 4,Dina-Updated (count +1)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### 9. Parity large-file behavior
|
|
208
|
+
|
|
209
|
+
- Parity uses a streaming count-delta strategy:
|
|
210
|
+
- Stream left rows and increment row-key counts.
|
|
211
|
+
- Stream right rows and decrement row-key counts.
|
|
212
|
+
- Exact duplicate semantics are preserved by count deltas per normalized row value.
|
|
213
|
+
- Memory scales with the number of distinct row keys in the parity map, not the total input row count.
|
|
214
|
+
|
|
215
|
+
### 10. Split interaction example
|
|
216
|
+
|
|
217
|
+
Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
218
|
+
|
|
219
|
+
```diff
|
|
220
|
+
CSV Tool Menu
|
|
221
|
+
1. Extract column
|
|
222
|
+
2. Extract rows (range)
|
|
223
|
+
3. Randomize rows
|
|
224
|
+
4. Dedupe using another CSV
|
|
225
|
+
5. Validate parity
|
|
226
|
+
6. Split CSV into chunks
|
|
227
|
+
7. Exit
|
|
228
|
+
+> 6
|
|
229
|
+
Source CSV file path: /tmp/people.csv
|
|
230
|
+
Choose separator:
|
|
231
|
+
1. comma (,)
|
|
232
|
+
2. tab (\t)
|
|
233
|
+
3. semicolon (;)
|
|
234
|
+
4. pipe (|)
|
|
235
|
+
5. custom
|
|
236
|
+
+Separator choice [1]: 1
|
|
237
|
+
Headers present? [Y/n]:
|
|
238
|
+
+Rows per chunk: 1000
|
|
239
|
+
Output directory [/tmp]:
|
|
240
|
+
Output file prefix [people]:
|
|
241
|
+
Overwrite existing chunk files? [y/N]:
|
|
242
|
+
Write manifest file? [y/N]:
|
|
243
|
+
-Split complete.
|
|
244
|
+
-Chunk size: 1000
|
|
245
|
+
-Data rows: 25000
|
|
246
|
+
-Chunks written: 25
|
|
247
|
+
-/tmp/people_part_001.csv
|
|
248
|
+
```
|
|
249
|
+
|
|
169
250
|
## Testing
|
|
170
251
|
|
|
171
252
|
Run tests:
|
|
@@ -182,7 +263,7 @@ bundle exec rake test
|
|
|
182
263
|
|
|
183
264
|
## Alpha release
|
|
184
265
|
|
|
185
|
-
Current prerelease version: `0.5.0.alpha`
|
|
266
|
+
Current prerelease version: `0.7.0.alpha`
|
|
186
267
|
|
|
187
268
|
Install prerelease from RubyGems:
|
|
188
269
|
|
|
@@ -192,7 +273,7 @@ gem install csvops --pre
|
|
|
192
273
|
|
|
193
274
|
Release runbook:
|
|
194
275
|
|
|
195
|
-
- `docs/release-v0.5.0-alpha.md`
|
|
276
|
+
- `docs/release-v0.7.0-alpha.md`
|
|
196
277
|
|
|
197
278
|
|
|
198
279
|
## Architecture
|
data/docs/architecture.md
CHANGED
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
The codebase follows a DDD-lite layered structure:
|
|
4
4
|
|
|
5
|
-
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
|
|
6
|
-
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`).
|
|
7
|
-
- `infrastructure/`: CSV reading/streaming and output adapters (console/file)
|
|
5
|
+
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, `CrossCsvDedupeSession`, and `CsvSplitSession` aggregates + supporting entities/value objects).
|
|
6
|
+
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`, `RunCsvSplit`).
|
|
7
|
+
- `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
|
|
8
8
|
- `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
|
|
9
9
|
- `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
|
|
10
10
|
|
|
11
11
|
## Workflow boundary (standardized)
|
|
12
12
|
|
|
13
|
-
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`), the boundary is:
|
|
13
|
+
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`), the boundary is:
|
|
14
14
|
|
|
15
15
|
- `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
|
|
16
16
|
- `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
|
|
@@ -32,6 +32,8 @@ Current usage:
|
|
|
32
32
|
- `RunRowExtractionWorkflow` uses `WorkflowStepPipeline` + `Steps::RowExtraction::*`.
|
|
33
33
|
- `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
|
|
34
34
|
- `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
|
|
35
|
+
- `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
|
|
36
|
+
- `RunCsvSplitWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvSplit::*`.
|
|
35
37
|
|
|
36
38
|
## Adding New Concepts
|
|
37
39
|
|
|
@@ -107,7 +109,7 @@ For a new function type, prefer one of these patterns:
|
|
|
107
109
|
|
|
108
110
|
## Domain model
|
|
109
111
|
|
|
110
|
-
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, and `Cross-CSV Dedupe`.
|
|
112
|
+
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, and `CSV Split`.
|
|
111
113
|
|
|
112
114
|
### Cross-CSV Dedupe (Large-file behavior)
|
|
113
115
|
|
|
@@ -366,6 +368,114 @@ classDiagram
|
|
|
366
368
|
RunCrossCsvDedupe --> CsvCrossCsvDedupeFileWriter
|
|
367
369
|
```
|
|
368
370
|
|
|
371
|
+
### CSV Parity
|
|
372
|
+
|
|
373
|
+
Core DDD structure:
|
|
374
|
+
|
|
375
|
+
- Aggregate root: `ParitySession`
|
|
376
|
+
- Captures one parity check request.
|
|
377
|
+
- Holds left/right source paths and parity options.
|
|
378
|
+
- Entities:
|
|
379
|
+
- `SourcePair` (left and right file paths)
|
|
380
|
+
- Value objects:
|
|
381
|
+
- `ParityOptions` (separator + header mode)
|
|
382
|
+
- Application service:
|
|
383
|
+
- `Application::UseCases::RunCsvParity` orchestrates parity validation and returns request/result style payloads.
|
|
384
|
+
- Infrastructure adapters:
|
|
385
|
+
- `Infrastructure::CSV::HeaderReader`
|
|
386
|
+
- `Infrastructure::CSV::CsvParityComparator` (streaming count-delta strategy with duplicate-aware semantics)
|
|
387
|
+
- Interface adapters:
|
|
388
|
+
- `Interface::CLI::MenuLoop`
|
|
389
|
+
- `Interface::CLI::Workflows::RunCsvParityWorkflow`
|
|
390
|
+
- `Interface::CLI::Workflows::Builders::CsvParitySessionBuilder`
|
|
391
|
+
- `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
|
|
392
|
+
- `Interface::CLI::Workflows::Steps::Parity::*`
|
|
393
|
+
- `Interface::CLI::Workflows::Presenters::CsvParityPresenter`
|
|
394
|
+
- `Interface::CLI::Workflows::Support::ResultErrorHandler`
|
|
395
|
+
- `Interface::CLI::Prompts::*`
|
|
396
|
+
- `Interface::CLI::Errors::Presenter`
|
|
397
|
+
|
|
398
|
+
```mermaid
|
|
399
|
+
classDiagram
|
|
400
|
+
direction LR
|
|
401
|
+
class MenuLoop
|
|
402
|
+
class RunCsvParityWorkflow
|
|
403
|
+
class Prompts
|
|
404
|
+
class Errors
|
|
405
|
+
class RunCsvParity
|
|
406
|
+
class ParitySession
|
|
407
|
+
class SourcePair
|
|
408
|
+
class ParityOptions
|
|
409
|
+
class HeaderReader
|
|
410
|
+
class CsvParityComparator
|
|
411
|
+
class CsvParityPresenter
|
|
412
|
+
|
|
413
|
+
MenuLoop --> RunCsvParityWorkflow : invokes
|
|
414
|
+
RunCsvParityWorkflow --> Prompts : uses
|
|
415
|
+
RunCsvParityWorkflow --> Errors : reports failures
|
|
416
|
+
RunCsvParityWorkflow --> CsvParityPresenter : renders
|
|
417
|
+
RunCsvParityWorkflow --> RunCsvParity : calls
|
|
418
|
+
RunCsvParity --> ParitySession : orchestrates
|
|
419
|
+
ParitySession o-- SourcePair
|
|
420
|
+
ParitySession o-- ParityOptions
|
|
421
|
+
RunCsvParity --> HeaderReader
|
|
422
|
+
RunCsvParity --> CsvParityComparator
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
### CSV Split
|
|
426
|
+
|
|
427
|
+
Core DDD structure:
|
|
428
|
+
|
|
429
|
+
- Aggregate root: `SplitSession`
|
|
430
|
+
- Captures one CSV split request.
|
|
431
|
+
- Holds split source and split options.
|
|
432
|
+
- Entities:
|
|
433
|
+
- `SplitSource` (path + separator + header mode)
|
|
434
|
+
- Value objects:
|
|
435
|
+
- `SplitOptions` (chunk size, output directory, file prefix, overwrite policy, optional manifest configuration)
|
|
436
|
+
- Application service:
|
|
437
|
+
- `Application::UseCases::RunCsvSplit` orchestrates split execution and returns request/result style payloads.
|
|
438
|
+
- Infrastructure adapters:
|
|
439
|
+
- `Infrastructure::CSV::CsvSplitter` (streaming row-by-row chunk writer)
|
|
440
|
+
- `Infrastructure::Output::CsvSplitManifestWriter` (optional manifest output)
|
|
441
|
+
- Interface adapters:
|
|
442
|
+
- `Interface::CLI::MenuLoop`
|
|
443
|
+
- `Interface::CLI::Workflows::RunCsvSplitWorkflow`
|
|
444
|
+
- `Interface::CLI::Workflows::Builders::CsvSplitSessionBuilder`
|
|
445
|
+
- `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
|
|
446
|
+
- `Interface::CLI::Workflows::Steps::CsvSplit::*`
|
|
447
|
+
- `Interface::CLI::Workflows::Presenters::CsvSplitPresenter`
|
|
448
|
+
- `Interface::CLI::Workflows::Support::ResultErrorHandler`
|
|
449
|
+
- `Interface::CLI::Prompts::*`
|
|
450
|
+
- `Interface::CLI::Errors::Presenter`
|
|
451
|
+
|
|
452
|
+
```mermaid
|
|
453
|
+
classDiagram
|
|
454
|
+
direction LR
|
|
455
|
+
class MenuLoop
|
|
456
|
+
class RunCsvSplitWorkflow
|
|
457
|
+
class Prompts
|
|
458
|
+
class Errors
|
|
459
|
+
class RunCsvSplit
|
|
460
|
+
class SplitSession
|
|
461
|
+
class SplitSource
|
|
462
|
+
class SplitOptions
|
|
463
|
+
class CsvSplitter
|
|
464
|
+
class CsvSplitManifestWriter
|
|
465
|
+
class CsvSplitPresenter
|
|
466
|
+
|
|
467
|
+
MenuLoop --> RunCsvSplitWorkflow : invokes
|
|
468
|
+
RunCsvSplitWorkflow --> Prompts : uses
|
|
469
|
+
RunCsvSplitWorkflow --> Errors : reports failures
|
|
470
|
+
RunCsvSplitWorkflow --> CsvSplitPresenter : renders
|
|
471
|
+
RunCsvSplitWorkflow --> RunCsvSplit : calls
|
|
472
|
+
RunCsvSplit --> SplitSession : orchestrates
|
|
473
|
+
SplitSession o-- SplitSource
|
|
474
|
+
SplitSession o-- SplitOptions
|
|
475
|
+
RunCsvSplit --> CsvSplitter
|
|
476
|
+
RunCsvSplit --> CsvSplitManifestWriter
|
|
477
|
+
```
|
|
478
|
+
|
|
369
479
|
## Project layout
|
|
370
480
|
|
|
371
481
|
```text
|
|
@@ -375,11 +485,15 @@ lib/csvtool/domain/column_session/*
|
|
|
375
485
|
lib/csvtool/domain/row_session/*
|
|
376
486
|
lib/csvtool/domain/row_randomization_session/*
|
|
377
487
|
lib/csvtool/domain/cross_csv_dedupe_session/*
|
|
488
|
+
lib/csvtool/domain/csv_parity_session/*
|
|
489
|
+
lib/csvtool/domain/csv_split_session/*
|
|
378
490
|
lib/csvtool/domain/shared/output_destination.rb
|
|
379
491
|
lib/csvtool/application/use_cases/run_extraction.rb
|
|
380
492
|
lib/csvtool/application/use_cases/run_row_extraction.rb
|
|
381
493
|
lib/csvtool/application/use_cases/run_row_randomization.rb
|
|
382
494
|
lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
|
|
495
|
+
lib/csvtool/application/use_cases/run_csv_parity.rb
|
|
496
|
+
lib/csvtool/application/use_cases/run_csv_split.rb
|
|
383
497
|
lib/csvtool/infrastructure/csv/*
|
|
384
498
|
lib/csvtool/infrastructure/output/*
|
|
385
499
|
lib/csvtool/interface/cli/menu_loop.rb
|
|
data/docs/release-v0.6.0-alpha.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Release Checklist: v0.6.0-alpha
|
|
2
|
+
|
|
3
|
+
## 1. Verify environment
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
ruby -v
|
|
7
|
+
bundle -v
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Expected:
|
|
11
|
+
- Ruby `3.3.x`
|
|
12
|
+
|
|
13
|
+
## 2. Install dependencies
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 3. Run quality checks
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
bundle exec rake test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 4. Smoke test CLI commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
bundle exec csvtool menu
|
|
29
|
+
bundle exec csvtool column test/fixtures/sample_people.csv name
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 5. Smoke test workflows
|
|
33
|
+
|
|
34
|
+
### CSV parity workflow
|
|
35
|
+
|
|
36
|
+
Use menu option `5` (`Validate parity`) and verify:
|
|
37
|
+
- matching files with reordered rows return parity success
|
|
38
|
+
- mismatch files return friendly mismatch summary with sample deltas
|
|
39
|
+
- separator and header-mode selections are respected
|
|
40
|
+
|
|
41
|
+
### Existing workflows regression pass
|
|
42
|
+
|
|
43
|
+
Run quick checks for menu options `1-4` and confirm:
|
|
44
|
+
- column extraction still works
|
|
45
|
+
- row-range extraction still works
|
|
46
|
+
- row randomization still works
|
|
47
|
+
- cross-CSV dedupe still works
|
|
48
|
+
|
|
49
|
+
## 6. Build and validate gem package
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
gem build csvops.gemspec
|
|
53
|
+
gem install ./csvops-0.6.0.alpha.gem
|
|
54
|
+
csvtool menu
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## 7. Commit release prep
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git add -A
|
|
61
|
+
git commit -m "chore(release): prepare v0.6.0-alpha"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## 8. Tag release
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
git tag -a v0.6.0-alpha -m "v0.6.0-alpha"
|
|
68
|
+
git push origin main --tags
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## 9. Publish gem
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
gem push csvops-0.6.0.alpha.gem
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## 10. Create GitHub release
|
|
78
|
+
|
|
79
|
+
Create release `v0.6.0-alpha` with:
|
|
80
|
+
- Dedicated CSV parity validation workflow
|
|
81
|
+
- Header/separator parity options
|
|
82
|
+
- Friendly parity mismatch reporting
|
|
83
|
+
- Streaming delta-count parity comparator
|
|
84
|
+
- Parity architecture convergence (session model, workflow steps, presenter, docs)
|
|
data/docs/release-v0.7.0-alpha.md
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Release Checklist: v0.7.0-alpha
|
|
2
|
+
|
|
3
|
+
## 1. Verify environment
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
ruby -v
|
|
7
|
+
bundle -v
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Expected:
|
|
11
|
+
- Ruby `3.3.x`
|
|
12
|
+
|
|
13
|
+
## 2. Install dependencies
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 3. Run quality checks
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
bundle exec rake test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 4. Smoke test CLI commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
bundle exec csvtool menu
|
|
29
|
+
bundle exec csvtool column test/fixtures/sample_people.csv name
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 5. Smoke test workflows
|
|
33
|
+
|
|
34
|
+
### CSV split workflow (new in this release)
|
|
35
|
+
|
|
36
|
+
Use menu option `6` (`Split CSV into chunks`) and verify:
|
|
37
|
+
- happy path split (`N=10`) writes expected chunk files and counts
|
|
38
|
+
- separator and header mode options work (CSV/TSV/headerless/custom)
|
|
39
|
+
- output directory + file prefix options produce expected paths
|
|
40
|
+
- overwrite protection blocks existing chunk paths unless allowed
|
|
41
|
+
- optional manifest output writes valid CSV metadata
|
|
42
|
+
|
|
43
|
+
### Existing workflows regression pass
|
|
44
|
+
|
|
45
|
+
Use menu options `1-5` and verify:
|
|
46
|
+
- column extraction still works
|
|
47
|
+
- row-range extraction still works
|
|
48
|
+
- row randomization still works
|
|
49
|
+
- cross-CSV dedupe still works
|
|
50
|
+
- parity validation still works
|
|
51
|
+
|
|
52
|
+
## 6. Build and validate gem package
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
gem build csvops.gemspec
|
|
56
|
+
gem install ./csvops-0.7.0.alpha.gem
|
|
57
|
+
csvtool menu
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## 7. Commit release prep
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git add -A
|
|
64
|
+
git commit -m "chore(release): prepare v0.7.0-alpha"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## 8. Tag release
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
git tag -a v0.7.0-alpha -m "v0.7.0-alpha"
|
|
71
|
+
git push origin main --tags
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 9. Publish gem
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
gem push csvops-0.7.0.alpha.gem
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## 10. Create GitHub release
|
|
81
|
+
|
|
82
|
+
Create release `v0.7.0-alpha` with:
|
|
83
|
+
- New `Split CSV into chunks` workflow
|
|
84
|
+
- Split-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
|
|
85
|
+
- Output strategy improvements (directory/prefix/overwrite controls)
|
|
86
|
+
- Optional split manifest output
|
|
87
|
+
- Large-file streaming split coverage and docs updates
|
|
data/lib/csvtool/application/use_cases/run_csv_parity.rb
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/csv_parity_comparator"
|
|
5
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
6
|
+
|
|
7
|
+
module Csvtool
|
|
8
|
+
module Application
|
|
9
|
+
module UseCases
|
|
10
|
+
class RunCsvParity
|
|
11
|
+
Result = Struct.new(:ok, :error, :data, keyword_init: true) do
|
|
12
|
+
def ok?
|
|
13
|
+
ok
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def initialize(
|
|
18
|
+
comparator: Infrastructure::CSV::CsvParityComparator.new,
|
|
19
|
+
header_reader: Infrastructure::CSV::HeaderReader.new
|
|
20
|
+
)
|
|
21
|
+
@comparator = comparator
|
|
22
|
+
@header_reader = header_reader
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def call(session:)
|
|
26
|
+
left_path = session.source_pair.left_path
|
|
27
|
+
right_path = session.source_pair.right_path
|
|
28
|
+
col_sep = session.options.separator
|
|
29
|
+
headers_present = session.options.headers_present?
|
|
30
|
+
|
|
31
|
+
return failure(:file_not_found, path: left_path) unless File.file?(left_path)
|
|
32
|
+
return failure(:file_not_found, path: right_path) unless File.file?(right_path)
|
|
33
|
+
|
|
34
|
+
if headers_present
|
|
35
|
+
left_headers = @header_reader.call(file_path: left_path, col_sep: col_sep)
|
|
36
|
+
return failure(:no_headers, path: left_path) if left_headers.empty?
|
|
37
|
+
|
|
38
|
+
right_headers = @header_reader.call(file_path: right_path, col_sep: col_sep)
|
|
39
|
+
return failure(:no_headers, path: right_path) if right_headers.empty?
|
|
40
|
+
|
|
41
|
+
return failure(:header_mismatch, left_headers: left_headers, right_headers: right_headers) unless left_headers == right_headers
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
stats = @comparator.call(
|
|
45
|
+
left_path: left_path,
|
|
46
|
+
right_path: right_path,
|
|
47
|
+
col_sep: col_sep,
|
|
48
|
+
headers_present: headers_present
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
success(stats)
|
|
52
|
+
rescue CSV::MalformedCSVError
|
|
53
|
+
failure(:could_not_parse_csv)
|
|
54
|
+
rescue Errno::EACCES => e
|
|
55
|
+
failure(:cannot_read_file, path: e.respond_to?(:path) ? e.path : left_path)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
private
|
|
59
|
+
|
|
60
|
+
def success(data)
|
|
61
|
+
Result.new(ok: true, error: nil, data: data)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def failure(code, data = {})
|
|
65
|
+
Result.new(ok: false, error: code, data: data)
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
data/lib/csvtool/application/use_cases/run_csv_split.rb
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
6
|
+
require "csvtool/infrastructure/csv/csv_splitter"
|
|
7
|
+
require "csvtool/infrastructure/output/csv_split_manifest_writer"
|
|
8
|
+
|
|
9
|
+
module Csvtool
|
|
10
|
+
module Application
|
|
11
|
+
module UseCases
|
|
12
|
+
class RunCsvSplit
|
|
13
|
+
Result = Struct.new(:ok, :error, :data, keyword_init: true) do
|
|
14
|
+
def ok?
|
|
15
|
+
ok
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def initialize(
|
|
20
|
+
header_reader: Infrastructure::CSV::HeaderReader.new,
|
|
21
|
+
csv_splitter: Infrastructure::CSV::CsvSplitter.new,
|
|
22
|
+
csv_split_manifest_writer: Infrastructure::Output::CsvSplitManifestWriter.new
|
|
23
|
+
)
|
|
24
|
+
@header_reader = header_reader
|
|
25
|
+
@csv_splitter = csv_splitter
|
|
26
|
+
@csv_split_manifest_writer = csv_split_manifest_writer
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def read_headers(file_path:, col_sep:, headers_present:)
|
|
30
|
+
return failure(:file_not_found, path: file_path) unless File.file?(file_path)
|
|
31
|
+
return success(headers: nil) unless headers_present
|
|
32
|
+
|
|
33
|
+
headers = @header_reader.call(file_path: file_path, col_sep: col_sep)
|
|
34
|
+
return failure(:no_headers) if headers.empty?
|
|
35
|
+
|
|
36
|
+
success(headers: headers)
|
|
37
|
+
rescue CSV::MalformedCSVError
|
|
38
|
+
failure(:could_not_parse_csv)
|
|
39
|
+
rescue Errno::EACCES
|
|
40
|
+
failure(:cannot_read_file, path: file_path)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def call(session:)
|
|
44
|
+
source = session.source
|
|
45
|
+
output_directory = session.options.output_directory || File.dirname(source.path)
|
|
46
|
+
file_prefix = session.options.file_prefix || File.basename(source.path, ".*")
|
|
47
|
+
FileUtils.mkdir_p(output_directory)
|
|
48
|
+
|
|
49
|
+
stats = @csv_splitter.call(
|
|
50
|
+
file_path: source.path,
|
|
51
|
+
col_sep: source.separator,
|
|
52
|
+
headers_present: source.headers_present,
|
|
53
|
+
chunk_size: session.options.chunk_size,
|
|
54
|
+
output_directory: output_directory,
|
|
55
|
+
file_prefix: file_prefix,
|
|
56
|
+
overwrite_existing: session.options.overwrite_existing
|
|
57
|
+
)
|
|
58
|
+
manifest_path = maybe_write_manifest(
|
|
59
|
+
session: session,
|
|
60
|
+
output_directory: output_directory,
|
|
61
|
+
file_prefix: file_prefix,
|
|
62
|
+
stats: stats
|
|
63
|
+
)
|
|
64
|
+
success(stats.merge(output_directory: output_directory, file_prefix: file_prefix, manifest_path: manifest_path))
|
|
65
|
+
rescue Infrastructure::CSV::CsvSplitter::OutputFileExistsError => e
|
|
66
|
+
failure(:output_file_exists, path: e.path)
|
|
67
|
+
rescue CSV::MalformedCSVError
|
|
68
|
+
failure(:could_not_parse_csv)
|
|
69
|
+
rescue Errno::EACCES, Errno::ENOENT => e
|
|
70
|
+
failure(:cannot_write_output_file, path: output_directory, error_class: e.class)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
private
|
|
74
|
+
|
|
75
|
+
def success(data)
|
|
76
|
+
Result.new(ok: true, error: nil, data: data)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def failure(code, data = {})
|
|
80
|
+
Result.new(ok: false, error: code, data: data)
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def maybe_write_manifest(session:, output_directory:, file_prefix:, stats:)
|
|
84
|
+
return nil unless session.options.write_manifest
|
|
85
|
+
|
|
86
|
+
manifest_path = session.options.manifest_path || File.join(output_directory, "#{file_prefix}_manifest.csv")
|
|
87
|
+
@csv_split_manifest_writer.call(
|
|
88
|
+
path: manifest_path,
|
|
89
|
+
chunk_paths: stats[:chunk_paths],
|
|
90
|
+
chunk_row_counts: stats[:chunk_row_counts]
|
|
91
|
+
)
|
|
92
|
+
manifest_path
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
data/lib/csvtool/cli.rb
CHANGED
|
@@ -6,6 +6,8 @@ require "csvtool/interface/cli/workflows/run_extraction_workflow"
|
|
|
6
6
|
require "csvtool/interface/cli/workflows/run_row_extraction_workflow"
|
|
7
7
|
require "csvtool/interface/cli/workflows/run_row_randomization_workflow"
|
|
8
8
|
require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
|
|
9
|
+
require "csvtool/interface/cli/workflows/run_csv_parity_workflow"
|
|
10
|
+
require "csvtool/interface/cli/workflows/run_csv_split_workflow"
|
|
9
11
|
require "csvtool/interface/cli/errors/presenter"
|
|
10
12
|
require "csvtool/infrastructure/csv/header_reader"
|
|
11
13
|
require "csvtool/infrastructure/csv/value_streamer"
|
|
@@ -18,6 +20,8 @@ module Csvtool
|
|
|
18
20
|
"Extract rows (range)",
|
|
19
21
|
"Randomize rows",
|
|
20
22
|
"Dedupe using another CSV",
|
|
23
|
+
"Validate parity",
|
|
24
|
+
"Split CSV into chunks",
|
|
21
25
|
"Exit"
|
|
22
26
|
].freeze
|
|
23
27
|
|
|
@@ -51,6 +55,8 @@ module Csvtool
|
|
|
51
55
|
extract_rows_action = -> { Interface::CLI::Workflows::RunRowExtractionWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
52
56
|
randomize_rows_action = -> { Interface::CLI::Workflows::RunRowRandomizationWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
53
57
|
dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
58
|
+
parity_action = -> { Interface::CLI::Workflows::RunCsvParityWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
59
|
+
split_action = -> { Interface::CLI::Workflows::RunCsvSplitWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
54
60
|
Interface::CLI::MenuLoop.new(
|
|
55
61
|
stdin: @stdin,
|
|
56
62
|
stdout: @stdout,
|
|
@@ -58,7 +64,9 @@ module Csvtool
|
|
|
58
64
|
extract_column_action: extract_column_action,
|
|
59
65
|
extract_rows_action: extract_rows_action,
|
|
60
66
|
randomize_rows_action: randomize_rows_action,
|
|
61
|
-
dedupe_action: dedupe_action
|
|
67
|
+
dedupe_action: dedupe_action,
|
|
68
|
+
parity_action: parity_action,
|
|
69
|
+
split_action: split_action
|
|
62
70
|
).run
|
|
63
71
|
end
|
|
64
72
|
|