csvops 0.6.0.alpha → 0.8.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +103 -24
- data/docs/architecture.md +121 -4
- data/docs/release-v0.7.0-alpha.md +87 -0
- data/docs/release-v0.8.0-alpha.md +88 -0
- data/lib/csvtool/application/use_cases/run_csv_split.rb +97 -0
- data/lib/csvtool/application/use_cases/run_csv_stats.rb +64 -0
- data/lib/csvtool/cli.rb +9 -1
- data/lib/csvtool/domain/csv_split_session/split_options.rb +27 -0
- data/lib/csvtool/domain/csv_split_session/split_session.rb +20 -0
- data/lib/csvtool/domain/csv_split_session/split_source.rb +17 -0
- data/lib/csvtool/domain/csv_stats_session/stats_options.rb +11 -0
- data/lib/csvtool/domain/csv_stats_session/stats_session.rb +25 -0
- data/lib/csvtool/domain/csv_stats_session/stats_source.rb +17 -0
- data/lib/csvtool/infrastructure/csv/csv_splitter.rb +64 -0
- data/lib/csvtool/infrastructure/csv/csv_stats_scanner.rb +67 -0
- data/lib/csvtool/infrastructure/output/csv_split_manifest_writer.rb +20 -0
- data/lib/csvtool/infrastructure/output/csv_stats_file_writer.rb +26 -0
- data/lib/csvtool/interface/cli/errors/presenter.rb +8 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +8 -2
- data/lib/csvtool/interface/cli/prompts/chunk_size_prompt.rb +21 -0
- data/lib/csvtool/interface/cli/prompts/split_manifest_prompt.rb +30 -0
- data/lib/csvtool/interface/cli/prompts/split_output_prompt.rb +38 -0
- data/lib/csvtool/interface/cli/workflows/builders/csv_split_session_builder.rb +44 -0
- data/lib/csvtool/interface/cli/workflows/builders/csv_stats_session_builder.rb +28 -0
- data/lib/csvtool/interface/cli/workflows/presenters/csv_split_presenter.rb +26 -0
- data/lib/csvtool/interface/cli/workflows/presenters/csv_stats_presenter.rb +34 -0
- data/lib/csvtool/interface/cli/workflows/run_csv_split_workflow.rb +89 -0
- data/lib/csvtool/interface/cli/workflows/run_csv_stats_workflow.rb +77 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/build_session_step.rb +30 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step.rb +43 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step.rb +30 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step.rb +31 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_split/execute_step.rb +36 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/build_session_step.rb +25 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/collect_destination_step.rb +27 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/collect_inputs_step.rb +31 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/execute_step.rb +27 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_csv_split_test.rb +124 -0
- data/test/csvtool/application/use_cases/run_csv_stats_test.rb +165 -0
- data/test/csvtool/cli_test.rb +139 -29
- data/test/csvtool/infrastructure/csv/csv_splitter_test.rb +68 -0
- data/test/csvtool/infrastructure/csv/csv_stats_scanner_test.rb +68 -0
- data/test/csvtool/infrastructure/output/csv_split_manifest_writer_test.rb +25 -0
- data/test/csvtool/infrastructure/output/csv_stats_file_writer_test.rb +38 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +104 -130
- data/test/csvtool/interface/cli/prompts/chunk_size_prompt_test.rb +17 -0
- data/test/csvtool/interface/cli/prompts/split_manifest_prompt_test.rb +42 -0
- data/test/csvtool/interface/cli/prompts/split_output_prompt_test.rb +22 -0
- data/test/csvtool/interface/cli/workflows/builders/csv_split_session_builder_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/builders/csv_stats_session_builder_test.rb +19 -0
- data/test/csvtool/interface/cli/workflows/presenters/csv_split_presenter_test.rb +26 -0
- data/test/csvtool/interface/cli/workflows/presenters/csv_stats_presenter_test.rb +37 -0
- data/test/csvtool/interface/cli/workflows/run_csv_split_workflow_test.rb +200 -0
- data/test/csvtool/interface/cli/workflows/run_csv_stats_workflow_test.rb +146 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/build_session_step_test.rb +40 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_inputs_step_test.rb +64 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_manifest_step_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/collect_output_step_test.rb +32 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_split/execute_step_test.rb +83 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/build_session_step_test.rb +36 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/collect_destination_step_test.rb +49 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/collect_inputs_step_test.rb +61 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/execute_step_test.rb +65 -0
- data/test/fixtures/split_people_25.csv +26 -0
- metadata +58 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7c09a1df68b5bbb8885b254bd7ea1260617495fc042cae43ef7251d9eb66836e
|
|
4
|
+
data.tar.gz: a5fbf5098df8007e844c83134a5474f92eafe4866b4d9910519d9a1517675af9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3695b9d49a638138d03d69122267a2889ecb6bd33605e9a256a20480ccab869c858f8f08996a213a885c2fb9a08e740712bb1dcc38fef6734d10129b20b3d611
|
|
7
|
+
data.tar.gz: df4cf19d31ac3c317ae69552cd3181c48984295268880810adf0d7b46bb606a2fa92646c35c4b1cd3343c04c5d08c734efb6b53b08f44fb3259b3dc3cbaf4509
|
data/README.md
CHANGED
|
@@ -38,29 +38,26 @@ CSV Tool Menu
|
|
|
38
38
|
3. Randomize rows
|
|
39
39
|
4. Dedupe using another CSV
|
|
40
40
|
5. Validate parity
|
|
41
|
-
6. Exit
|
|
41
|
+
6. Split CSV into chunks
|
|
42
|
+
7. CSV stats summary
|
|
43
|
+
8. Exit
|
|
42
44
|
>
|
|
43
45
|
```
|
|
44
46
|
|
|
45
|
-
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe,
|
|
47
|
+
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, `5` for parity validation, `6` for CSV splitting, or `7` for CSV stats.
|
|
46
48
|
|
|
47
49
|
### 3. Follow prompts
|
|
48
50
|
|
|
49
|
-
Each
|
|
51
|
+
Each action asks only for what it needs (file path, separator, and any action-specific options), then prints results to the console or writes to a file when selected.
|
|
50
52
|
|
|
51
|
-
|
|
53
|
+
Typical prompt pattern:
|
|
52
54
|
|
|
53
|
-
-
|
|
54
|
-
-
|
|
55
|
-
-
|
|
55
|
+
- choose source file(s)
|
|
56
|
+
- choose separator/header options when relevant
|
|
57
|
+
- choose action-specific options
|
|
58
|
+
- choose output destination (console or file)
|
|
56
59
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
- `Extract column`: file path, separator, optional header filter + column select, skip blanks, preview/confirm, output destination.
|
|
60
|
-
- `Extract rows (range)`: file path, separator, start row, end row, output destination.
|
|
61
|
-
- `Randomize rows`: file path, separator, headers present, optional seed, output destination.
|
|
62
|
-
- `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
|
|
63
|
-
- `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
|
|
60
|
+
For architecture and internal design details, see [`docs/architecture.md`](docs/architecture.md).
|
|
64
61
|
|
|
65
62
|
### 4. Example interaction (console output)
|
|
66
63
|
|
|
@@ -129,10 +126,12 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
129
126
|
CSV Tool Menu
|
|
130
127
|
1. Extract column
|
|
131
128
|
2. Extract rows (range)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
129
|
+
3. Randomize rows
|
|
130
|
+
4. Dedupe using another CSV
|
|
131
|
+
5. Validate parity
|
|
132
|
+
6. Split CSV into chunks
|
|
133
|
+
7. CSV stats summary
|
|
134
|
+
8. Exit
|
|
136
135
|
+> 4
|
|
137
136
|
CSV file path: /tmp/source.csv
|
|
138
137
|
Source CSV separator:
|
|
@@ -177,10 +176,12 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
177
176
|
CSV Tool Menu
|
|
178
177
|
1. Extract column
|
|
179
178
|
2. Extract rows (range)
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
6. Exit
|
|
179
|
+
3. Randomize rows
|
|
180
|
+
4. Dedupe using another CSV
|
|
181
|
+
5. Validate parity
|
|
182
|
+
6. Split CSV into chunks
|
|
183
|
+
7. CSV stats summary
|
|
184
|
+
8. Exit
|
|
184
185
|
+> 5
|
|
185
186
|
Left CSV file path: /tmp/left.csv
|
|
186
187
|
Right CSV file path: /tmp/right.csv
|
|
@@ -208,6 +209,84 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
208
209
|
- Exact duplicate semantics are preserved by count deltas per normalized row value.
|
|
209
210
|
- Memory scales with the number of distinct row keys in the parity map, not the total input row count.
|
|
210
211
|
|
|
212
|
+
### 10. Split interaction example
|
|
213
|
+
|
|
214
|
+
Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
215
|
+
|
|
216
|
+
```diff
|
|
217
|
+
CSV Tool Menu
|
|
218
|
+
1. Extract column
|
|
219
|
+
2. Extract rows (range)
|
|
220
|
+
3. Randomize rows
|
|
221
|
+
4. Dedupe using another CSV
|
|
222
|
+
5. Validate parity
|
|
223
|
+
6. Split CSV into chunks
|
|
224
|
+
7. CSV stats summary
|
|
225
|
+
8. Exit
|
|
226
|
+
+> 6
|
|
227
|
+
Source CSV file path: /tmp/people.csv
|
|
228
|
+
Choose separator:
|
|
229
|
+
1. comma (,)
|
|
230
|
+
2. tab (\t)
|
|
231
|
+
3. semicolon (;)
|
|
232
|
+
4. pipe (|)
|
|
233
|
+
5. custom
|
|
234
|
+
+Separator choice [1]: 1
|
|
235
|
+
Headers present? [Y/n]:
|
|
236
|
+
+Rows per chunk: 1000
|
|
237
|
+
Output directory [/tmp]:
|
|
238
|
+
Output file prefix [people]:
|
|
239
|
+
Overwrite existing chunk files? [y/N]:
|
|
240
|
+
Write manifest file? [y/N]:
|
|
241
|
+
-Split complete.
|
|
242
|
+
-Chunk size: 1000
|
|
243
|
+
-Data rows: 25000
|
|
244
|
+
-Chunks written: 25
|
|
245
|
+
-/tmp/people_part_001.csv
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### 11. CSV stats interaction example
|
|
249
|
+
|
|
250
|
+
Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
251
|
+
|
|
252
|
+
```diff
|
|
253
|
+
CSV Tool Menu
|
|
254
|
+
1. Extract column
|
|
255
|
+
2. Extract rows (range)
|
|
256
|
+
3. Randomize rows
|
|
257
|
+
4. Dedupe using another CSV
|
|
258
|
+
5. Validate parity
|
|
259
|
+
6. Split CSV into chunks
|
|
260
|
+
7. CSV stats summary
|
|
261
|
+
8. Exit
|
|
262
|
+
+> 7
|
|
263
|
+
CSV file path: /tmp/people.csv
|
|
264
|
+
Choose separator:
|
|
265
|
+
1. comma (,)
|
|
266
|
+
2. tab (\t)
|
|
267
|
+
3. semicolon (;)
|
|
268
|
+
4. pipe (|)
|
|
269
|
+
5. custom
|
|
270
|
+
+Separator choice [1]: 1
|
|
271
|
+
Headers present? [Y/n]:
|
|
272
|
+
Output destination:
|
|
273
|
+
1. console
|
|
274
|
+
2. file
|
|
275
|
+
+Output destination [1]: 1
|
|
276
|
+
-CSV Stats Summary
|
|
277
|
+
-Rows: 3
|
|
278
|
+
-Columns: 2
|
|
279
|
+
-Headers: name, city
|
|
280
|
+
-Column completeness:
|
|
281
|
+
- name: non_blank=3 blank=0
|
|
282
|
+
- city: non_blank=3 blank=0
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### 12. CSV stats large-file behavior
|
|
286
|
+
|
|
287
|
+
- Stats scanning is streaming (`CSV.foreach`), processed in one pass.
|
|
288
|
+
- Memory grows with per-column aggregates (`column_stats`), not with total row count.
|
|
289
|
+
|
|
211
290
|
## Testing
|
|
212
291
|
|
|
213
292
|
Run tests:
|
|
@@ -224,7 +303,7 @@ bundle exec rake test
|
|
|
224
303
|
|
|
225
304
|
## Alpha release
|
|
226
305
|
|
|
227
|
-
Current prerelease version: `0.6.0.alpha`
|
|
306
|
+
Current prerelease version: `0.8.0.alpha`
|
|
228
307
|
|
|
229
308
|
Install prerelease from RubyGems:
|
|
230
309
|
|
|
@@ -234,7 +313,7 @@ gem install csvops --pre
|
|
|
234
313
|
|
|
235
314
|
Release runbook:
|
|
236
315
|
|
|
237
|
-
- `docs/release-v0.
|
|
316
|
+
- `docs/release-v0.8.0-alpha.md`
|
|
238
317
|
|
|
239
318
|
|
|
240
319
|
## Architecture
|
data/docs/architecture.md
CHANGED
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
The codebase follows a DDD-lite layered structure:
|
|
4
4
|
|
|
5
|
-
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `
|
|
6
|
-
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`).
|
|
5
|
+
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, `CrossCsvDedupeSession`, `ParitySession`, `SplitSession`, and `CsvStatsSession` aggregates + supporting entities/value objects).
|
|
6
|
+
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`, `RunCsvSplit`, `RunCsvStats`).
|
|
7
7
|
- `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
|
|
8
8
|
- `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
|
|
9
9
|
- `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
|
|
10
10
|
|
|
11
11
|
## Workflow boundary (standardized)
|
|
12
12
|
|
|
13
|
-
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`), the boundary is:
|
|
13
|
+
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`, `CSV Stats`), the boundary is:
|
|
14
14
|
|
|
15
15
|
- `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
|
|
16
16
|
- `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
|
|
@@ -33,6 +33,8 @@ Current usage:
|
|
|
33
33
|
- `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
|
|
34
34
|
- `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
|
|
35
35
|
- `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
|
|
36
|
+
- `RunCsvSplitWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvSplit::*`.
|
|
37
|
+
- `RunCsvStatsWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvStats::*`.
|
|
36
38
|
|
|
37
39
|
## Adding New Concepts
|
|
38
40
|
|
|
@@ -108,7 +110,7 @@ For a new function type, prefer one of these patterns:
|
|
|
108
110
|
|
|
109
111
|
## Domain model
|
|
110
112
|
|
|
111
|
-
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, and `CSV Parity`.
|
|
113
|
+
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`, and `CSV Stats`.
|
|
112
114
|
|
|
113
115
|
### Cross-CSV Dedupe (Large-file behavior)
|
|
114
116
|
|
|
@@ -421,6 +423,117 @@ classDiagram
|
|
|
421
423
|
RunCsvParity --> CsvParityComparator
|
|
422
424
|
```
|
|
423
425
|
|
|
426
|
+
### CSV Split
|
|
427
|
+
|
|
428
|
+
Core DDD structure:
|
|
429
|
+
|
|
430
|
+
- Aggregate root: `SplitSession`
|
|
431
|
+
- Captures one CSV split request.
|
|
432
|
+
- Holds split source and split options.
|
|
433
|
+
- Entities:
|
|
434
|
+
- `SplitSource` (path + separator + header mode)
|
|
435
|
+
- Value objects:
|
|
436
|
+
- `SplitOptions` (chunk size, output directory, file prefix, overwrite policy, optional manifest configuration)
|
|
437
|
+
- Application service:
|
|
438
|
+
- `Application::UseCases::RunCsvSplit` orchestrates split execution and returns request/result style payloads.
|
|
439
|
+
- Infrastructure adapters:
|
|
440
|
+
- `Infrastructure::CSV::CsvSplitter` (streaming row-by-row chunk writer)
|
|
441
|
+
- `Infrastructure::Output::CsvSplitManifestWriter` (optional manifest output)
|
|
442
|
+
- Interface adapters:
|
|
443
|
+
- `Interface::CLI::MenuLoop`
|
|
444
|
+
- `Interface::CLI::Workflows::RunCsvSplitWorkflow`
|
|
445
|
+
- `Interface::CLI::Workflows::Builders::CsvSplitSessionBuilder`
|
|
446
|
+
- `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
|
|
447
|
+
- `Interface::CLI::Workflows::Steps::CsvSplit::*`
|
|
448
|
+
- `Interface::CLI::Workflows::Presenters::CsvSplitPresenter`
|
|
449
|
+
- `Interface::CLI::Workflows::Support::ResultErrorHandler`
|
|
450
|
+
- `Interface::CLI::Prompts::*`
|
|
451
|
+
- `Interface::CLI::Errors::Presenter`
|
|
452
|
+
|
|
453
|
+
```mermaid
|
|
454
|
+
classDiagram
|
|
455
|
+
direction LR
|
|
456
|
+
class MenuLoop
|
|
457
|
+
class RunCsvSplitWorkflow
|
|
458
|
+
class Prompts
|
|
459
|
+
class Errors
|
|
460
|
+
class RunCsvSplit
|
|
461
|
+
class SplitSession
|
|
462
|
+
class SplitSource
|
|
463
|
+
class SplitOptions
|
|
464
|
+
class CsvSplitter
|
|
465
|
+
class CsvSplitManifestWriter
|
|
466
|
+
class CsvSplitPresenter
|
|
467
|
+
|
|
468
|
+
MenuLoop --> RunCsvSplitWorkflow : invokes
|
|
469
|
+
RunCsvSplitWorkflow --> Prompts : uses
|
|
470
|
+
RunCsvSplitWorkflow --> Errors : reports failures
|
|
471
|
+
RunCsvSplitWorkflow --> CsvSplitPresenter : renders
|
|
472
|
+
RunCsvSplitWorkflow --> RunCsvSplit : calls
|
|
473
|
+
RunCsvSplit --> SplitSession : orchestrates
|
|
474
|
+
SplitSession o-- SplitSource
|
|
475
|
+
SplitSession o-- SplitOptions
|
|
476
|
+
RunCsvSplit --> CsvSplitter
|
|
477
|
+
RunCsvSplit --> CsvSplitManifestWriter
|
|
478
|
+
```
|
|
479
|
+
|
|
480
|
+
### CSV Stats
|
|
481
|
+
|
|
482
|
+
Core DDD structure:
|
|
483
|
+
|
|
484
|
+
- Aggregate root: `StatsSession`
|
|
485
|
+
- Captures one stats summary request.
|
|
486
|
+
- Holds source profile and output destination.
|
|
487
|
+
- Entity:
|
|
488
|
+
- `StatsSource` (path + separator + header mode)
|
|
489
|
+
- Value objects:
|
|
490
|
+
- `StatsOptions` (currently lightweight; keeps option growth explicit)
|
|
491
|
+
- Shared `OutputDestination` (`console` or `file(path)`)
|
|
492
|
+
- Application service:
|
|
493
|
+
- `Application::UseCases::RunCsvStats` orchestrates stats scanning and output routing.
|
|
494
|
+
- Infrastructure adapters:
|
|
495
|
+
- `Infrastructure::CSV::CsvStatsScanner` (streaming one-pass row aggregation)
|
|
496
|
+
- `Infrastructure::Output::CsvStatsFileWriter` (metric/value artifact writer)
|
|
497
|
+
- Interface adapters:
|
|
498
|
+
- `Interface::CLI::MenuLoop`
|
|
499
|
+
- `Interface::CLI::Workflows::RunCsvStatsWorkflow`
|
|
500
|
+
- `Interface::CLI::Workflows::Builders::CsvStatsSessionBuilder`
|
|
501
|
+
- `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
|
|
502
|
+
- `Interface::CLI::Workflows::Steps::CsvStats::*`
|
|
503
|
+
- `Interface::CLI::Workflows::Presenters::CsvStatsPresenter`
|
|
504
|
+
- `Interface::CLI::Workflows::Support::{OutputDestinationMapper,ResultErrorHandler}`
|
|
505
|
+
- `Interface::CLI::Prompts::*`
|
|
506
|
+
- `Interface::CLI::Errors::Presenter`
|
|
507
|
+
|
|
508
|
+
```mermaid
|
|
509
|
+
classDiagram
|
|
510
|
+
direction LR
|
|
511
|
+
class MenuLoop
|
|
512
|
+
class RunCsvStatsWorkflow
|
|
513
|
+
class Prompts
|
|
514
|
+
class Errors
|
|
515
|
+
class RunCsvStats
|
|
516
|
+
class StatsSession
|
|
517
|
+
class StatsSource
|
|
518
|
+
class StatsOptions
|
|
519
|
+
class OutputDestination
|
|
520
|
+
class CsvStatsScanner
|
|
521
|
+
class CsvStatsFileWriter
|
|
522
|
+
class CsvStatsPresenter
|
|
523
|
+
|
|
524
|
+
MenuLoop --> RunCsvStatsWorkflow : invokes
|
|
525
|
+
RunCsvStatsWorkflow --> Prompts : uses
|
|
526
|
+
RunCsvStatsWorkflow --> Errors : reports failures
|
|
527
|
+
RunCsvStatsWorkflow --> CsvStatsPresenter : renders
|
|
528
|
+
RunCsvStatsWorkflow --> RunCsvStats : calls
|
|
529
|
+
RunCsvStats --> StatsSession : orchestrates
|
|
530
|
+
StatsSession o-- StatsSource
|
|
531
|
+
StatsSession o-- StatsOptions
|
|
532
|
+
StatsSession o-- OutputDestination
|
|
533
|
+
RunCsvStats --> CsvStatsScanner
|
|
534
|
+
RunCsvStats --> CsvStatsFileWriter
|
|
535
|
+
```
|
|
536
|
+
|
|
424
537
|
## Project layout
|
|
425
538
|
|
|
426
539
|
```text
|
|
@@ -431,12 +544,16 @@ lib/csvtool/domain/row_session/*
|
|
|
431
544
|
lib/csvtool/domain/row_randomization_session/*
|
|
432
545
|
lib/csvtool/domain/cross_csv_dedupe_session/*
|
|
433
546
|
lib/csvtool/domain/csv_parity_session/*
|
|
547
|
+
lib/csvtool/domain/csv_split_session/*
|
|
548
|
+
lib/csvtool/domain/csv_stats_session/*
|
|
434
549
|
lib/csvtool/domain/shared/output_destination.rb
|
|
435
550
|
lib/csvtool/application/use_cases/run_extraction.rb
|
|
436
551
|
lib/csvtool/application/use_cases/run_row_extraction.rb
|
|
437
552
|
lib/csvtool/application/use_cases/run_row_randomization.rb
|
|
438
553
|
lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
|
|
439
554
|
lib/csvtool/application/use_cases/run_csv_parity.rb
|
|
555
|
+
lib/csvtool/application/use_cases/run_csv_split.rb
|
|
556
|
+
lib/csvtool/application/use_cases/run_csv_stats.rb
|
|
440
557
|
lib/csvtool/infrastructure/csv/*
|
|
441
558
|
lib/csvtool/infrastructure/output/*
|
|
442
559
|
lib/csvtool/interface/cli/menu_loop.rb
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Release Checklist: v0.7.0-alpha
|
|
2
|
+
|
|
3
|
+
## 1. Verify environment
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
ruby -v
|
|
7
|
+
bundle -v
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Expected:
|
|
11
|
+
- Ruby `3.3.x`
|
|
12
|
+
|
|
13
|
+
## 2. Install dependencies
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 3. Run quality checks
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
bundle exec rake test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 4. Smoke test CLI commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
bundle exec csvtool menu
|
|
29
|
+
bundle exec csvtool column test/fixtures/sample_people.csv name
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 5. Smoke test workflows
|
|
33
|
+
|
|
34
|
+
### CSV split workflow (new in this release)
|
|
35
|
+
|
|
36
|
+
Use menu option `6` (`Split CSV into chunks`) and verify:
|
|
37
|
+
- happy path split (`N=10`) writes expected chunk files and counts
|
|
38
|
+
- separator and header mode options work (CSV/TSV/headerless/custom)
|
|
39
|
+
- output directory + file prefix options produce expected paths
|
|
40
|
+
- overwrite protection blocks existing chunk paths unless allowed
|
|
41
|
+
- optional manifest output writes valid CSV metadata
|
|
42
|
+
|
|
43
|
+
### Existing workflows regression pass
|
|
44
|
+
|
|
45
|
+
Use menu options `1-5` and verify:
|
|
46
|
+
- column extraction still works
|
|
47
|
+
- row-range extraction still works
|
|
48
|
+
- row randomization still works
|
|
49
|
+
- cross-CSV dedupe still works
|
|
50
|
+
- parity validation still works
|
|
51
|
+
|
|
52
|
+
## 6. Build and validate gem package
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
gem build csvops.gemspec
|
|
56
|
+
gem install ./csvops-0.7.0.alpha.gem
|
|
57
|
+
csvtool menu
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## 7. Commit release prep
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git add -A
|
|
64
|
+
git commit -m "chore(release): prepare v0.7.0-alpha"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## 8. Tag release
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
git tag -a v0.7.0-alpha -m "v0.7.0-alpha"
|
|
71
|
+
git push origin main --tags
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 9. Publish gem
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
gem push csvops-0.7.0.alpha.gem
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## 10. Create GitHub release
|
|
81
|
+
|
|
82
|
+
Create release `v0.7.0-alpha` with:
|
|
83
|
+
- New `Split CSV into chunks` workflow
|
|
84
|
+
- Split-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
|
|
85
|
+
- Output strategy improvements (directory/prefix/overwrite controls)
|
|
86
|
+
- Optional split manifest output
|
|
87
|
+
- Large-file streaming split coverage and docs updates
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# Release Checklist: v0.8.0-alpha
|
|
2
|
+
|
|
3
|
+
## 1. Verify environment
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
ruby -v
|
|
7
|
+
bundle -v
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Expected:
|
|
11
|
+
- Ruby `3.3.x`
|
|
12
|
+
|
|
13
|
+
## 2. Install dependencies
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 3. Run quality checks
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
bundle exec rake test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 4. Smoke test CLI commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
bundle exec csvtool menu
|
|
29
|
+
bundle exec csvtool column test/fixtures/sample_people.csv name
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 5. Smoke test workflows
|
|
33
|
+
|
|
34
|
+
### CSV stats workflow (new in this release)
|
|
35
|
+
|
|
36
|
+
Use menu option `7` (`CSV stats summary`) and verify:
|
|
37
|
+
- happy path summary prints rows/columns/headers
|
|
38
|
+
- separator and header mode options work (CSV/TSV/headerless/custom)
|
|
39
|
+
- column completeness output is correct for blanks
|
|
40
|
+
- output destination supports console and file
|
|
41
|
+
- invalid output path returns friendly error
|
|
42
|
+
|
|
43
|
+
### Existing workflows regression pass
|
|
44
|
+
|
|
45
|
+
Use menu options `1-6` and verify:
|
|
46
|
+
- column extraction still works
|
|
47
|
+
- row-range extraction still works
|
|
48
|
+
- row randomization still works
|
|
49
|
+
- cross-CSV dedupe still works
|
|
50
|
+
- parity validation still works
|
|
51
|
+
- CSV split still works
|
|
52
|
+
|
|
53
|
+
## 6. Build and validate gem package
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
gem build csvops.gemspec
|
|
57
|
+
gem install ./csvops-0.8.0.alpha.gem
|
|
58
|
+
csvtool menu
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## 7. Commit release prep
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
git add -A
|
|
65
|
+
git commit -m "chore(release): prepare v0.8.0-alpha"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## 8. Tag release
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
git tag -a v0.8.0-alpha -m "v0.8.0-alpha"
|
|
72
|
+
git push origin main --tags
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## 9. Publish gem
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
gem push csvops-0.8.0.alpha.gem
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## 10. Create GitHub release
|
|
82
|
+
|
|
83
|
+
Create release `v0.8.0-alpha` with:
|
|
84
|
+
- New `CSV stats summary` workflow
|
|
85
|
+
- Stats-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
|
|
86
|
+
- Console/file output destination support for stats summary artifacts
|
|
87
|
+
- Streaming stats scanner coverage for large files
|
|
88
|
+
- Stats documentation updates in README + architecture guide
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
6
|
+
require "csvtool/infrastructure/csv/csv_splitter"
|
|
7
|
+
require "csvtool/infrastructure/output/csv_split_manifest_writer"
|
|
8
|
+
|
|
9
|
+
module Csvtool
|
|
10
|
+
module Application
|
|
11
|
+
module UseCases
|
|
12
|
+
# Use case: split one CSV file into chunk files of a fixed row count and,
# when the session asks for it, write a manifest listing those chunks.
#
# Anticipated failures (missing source, malformed CSV, output path problems)
# are reported through Result rather than raised to the caller.
class RunCsvSplit
  # Outcome of a use-case call: +ok+ flag, symbolic +error+ code (nil on
  # success) and a +data+ hash carrying the payload or the failure details.
  Result = Struct.new(:ok, :error, :data, keyword_init: true) do
    def ok?
      ok
    end
  end

  # Collaborators are injectable; the defaults are the infrastructure adapters.
  def initialize(
    header_reader: Infrastructure::CSV::HeaderReader.new,
    csv_splitter: Infrastructure::CSV::CsvSplitter.new,
    csv_split_manifest_writer: Infrastructure::Output::CsvSplitManifestWriter.new
  )
    @header_reader = header_reader
    @csv_splitter = csv_splitter
    @csv_split_manifest_writer = csv_split_manifest_writer
  end

  # Reads the header row of +file_path+.
  #
  # @return [Result] success with +headers: nil+ when +headers_present+ is
  #   falsy, otherwise success with the headers that were read; failure with
  #   :file_not_found, :no_headers, :could_not_parse_csv or :cannot_read_file.
  def read_headers(file_path:, col_sep:, headers_present:)
    return failure(:file_not_found, path: file_path) unless File.file?(file_path)
    return success(headers: nil) unless headers_present

    headers = @header_reader.call(file_path: file_path, col_sep: col_sep)
    # Treat a nil result like an empty header row instead of crashing on nil.empty?.
    return failure(:no_headers) if headers.nil? || headers.empty?

    success(headers: headers)
  rescue CSV::MalformedCSVError
    failure(:could_not_parse_csv)
  rescue Errno::EACCES
    failure(:cannot_read_file, path: file_path)
  end

  # Runs the split described by +session+.
  #
  # The output directory defaults to the source file's directory and the file
  # prefix to the source file's base name without its extension.
  #
  # @param session [#source, #options] split request (source + options)
  # @return [Result] on success, the splitter's result merged with
  #   :output_directory, :file_prefix and :manifest_path (nil when no manifest
  #   was requested); on failure one of :file_not_found, :output_file_exists,
  #   :could_not_parse_csv or :cannot_write_output_file.
  def call(session:)
    source = session.source
    # Check the source before touching the filesystem: otherwise a missing
    # source would still create the output directory and then be reported as
    # an output-path failure.
    return failure(:file_not_found, path: source.path) unless File.file?(source.path)

    output_directory = session.options.output_directory || File.dirname(source.path)
    file_prefix = session.options.file_prefix || File.basename(source.path, ".*")
    FileUtils.mkdir_p(output_directory)

    stats = @csv_splitter.call(
      file_path: source.path,
      col_sep: source.separator,
      headers_present: source.headers_present,
      chunk_size: session.options.chunk_size,
      output_directory: output_directory,
      file_prefix: file_prefix,
      overwrite_existing: session.options.overwrite_existing
    )
    manifest_path = maybe_write_manifest(
      session: session,
      output_directory: output_directory,
      file_prefix: file_prefix,
      stats: stats
    )
    success(stats.merge(output_directory: output_directory, file_prefix: file_prefix, manifest_path: manifest_path))
  rescue Infrastructure::CSV::CsvSplitter::OutputFileExistsError => e
    failure(:output_file_exists, path: e.path)
  rescue CSV::MalformedCSVError
    failure(:could_not_parse_csv)
  rescue Errno::EACCES, Errno::ENOENT, Errno::EEXIST, Errno::ENOTDIR => e
    # EEXIST/ENOTDIR: the output directory path (or one of its parents) is an
    # existing regular file, so it cannot be created or written into.
    failure(:cannot_write_output_file, path: output_directory, error_class: e.class)
  end

  private

  def success(data)
    Result.new(ok: true, error: nil, data: data)
  end

  def failure(code, data = {})
    Result.new(ok: false, error: code, data: data)
  end

  # Writes the manifest when the session requests one and returns its path;
  # returns nil otherwise. The path defaults to
  # "<output_directory>/<file_prefix>_manifest.csv".
  def maybe_write_manifest(session:, output_directory:, file_prefix:, stats:)
    return nil unless session.options.write_manifest

    manifest_path = session.options.manifest_path || File.join(output_directory, "#{file_prefix}_manifest.csv")
    @csv_split_manifest_writer.call(
      path: manifest_path,
      chunk_paths: stats[:chunk_paths],
      chunk_row_counts: stats[:chunk_row_counts]
    )
    manifest_path
  end
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/csv_stats_scanner"
|
|
5
|
+
require "csvtool/infrastructure/output/csv_stats_file_writer"
|
|
6
|
+
|
|
7
|
+
module Csvtool
|
|
8
|
+
module Application
|
|
9
|
+
module UseCases
|
|
10
|
+
# Use case: scan one CSV file for summary statistics and, when the session's
# output destination is a file, hand the result to the stats file writer.
#
# Anticipated failures are reported through Result rather than raised.
class RunCsvStats
  # Outcome of a use-case call: +ok+ flag, symbolic +error+ code (nil on
  # success) and a +data+ hash carrying the payload or the failure details.
  Result = Struct.new(:ok, :error, :data, keyword_init: true) do
    def ok?
      ok
    end
  end

  # Collaborators are injectable; the defaults are the infrastructure adapters.
  def initialize(
    scanner: Infrastructure::CSV::CsvStatsScanner.new,
    csv_stats_file_writer: Infrastructure::Output::CsvStatsFileWriter.new
  )
    @scanner = scanner
    @csv_stats_file_writer = csv_stats_file_writer
  end

  # Runs the stats scan described by +session+.
  #
  # @param session [#source, #output_destination] stats request
  # @return [Result] on success, the scanner's result (plus :output_path when
  #   it was written to a file); on failure one of :file_not_found,
  #   :cannot_read_file, :cannot_write_output_file or :could_not_parse_csv.
  def call(session:)
    source = session.source
    path = source.path
    return failure(:file_not_found, path: path) unless File.file?(path)

    # Errors are rescued per phase so a read failure is never reported as a
    # write failure merely because the destination happens to be a file.
    stats = begin
      @scanner.call(
        file_path: path,
        col_sep: source.separator,
        headers_present: source.headers_present
      )
    rescue Errno::EACCES, Errno::ENOENT
      return failure(:cannot_read_file, path: path)
    end

    destination = session.output_destination
    return success(stats) unless destination&.file?

    begin
      @csv_stats_file_writer.call(path: destination.path, data: stats)
    rescue Errno::EACCES, Errno::ENOENT => e
      return failure(:cannot_write_output_file, path: destination.path, error_class: e.class)
    end
    success(stats.merge(output_path: destination.path))
  rescue CSV::MalformedCSVError
    failure(:could_not_parse_csv)
  end

  private

  def success(data)
    Result.new(ok: true, error: nil, data: data)
  end

  def failure(code, data = {})
    Result.new(ok: false, error: code, data: data)
  end
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|