csvops 0.7.0.alpha → 0.8.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +61 -21
- data/docs/architecture.md +64 -4
- data/docs/release-v0.8.0-alpha.md +88 -0
- data/lib/csvtool/application/use_cases/run_csv_stats.rb +64 -0
- data/lib/csvtool/cli.rb +5 -1
- data/lib/csvtool/domain/csv_stats_session/stats_options.rb +11 -0
- data/lib/csvtool/domain/csv_stats_session/stats_session.rb +25 -0
- data/lib/csvtool/domain/csv_stats_session/stats_source.rb +17 -0
- data/lib/csvtool/infrastructure/csv/csv_stats_scanner.rb +67 -0
- data/lib/csvtool/infrastructure/output/csv_stats_file_writer.rb +26 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
- data/lib/csvtool/interface/cli/workflows/builders/csv_stats_session_builder.rb +28 -0
- data/lib/csvtool/interface/cli/workflows/presenters/csv_stats_presenter.rb +34 -0
- data/lib/csvtool/interface/cli/workflows/run_csv_stats_workflow.rb +77 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/build_session_step.rb +25 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/collect_destination_step.rb +27 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/collect_inputs_step.rb +31 -0
- data/lib/csvtool/interface/cli/workflows/steps/csv_stats/execute_step.rb +27 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_csv_stats_test.rb +165 -0
- data/test/csvtool/cli_test.rb +93 -30
- data/test/csvtool/infrastructure/csv/csv_stats_scanner_test.rb +68 -0
- data/test/csvtool/infrastructure/output/csv_stats_file_writer_test.rb +38 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +34 -11
- data/test/csvtool/interface/cli/workflows/builders/csv_stats_session_builder_test.rb +19 -0
- data/test/csvtool/interface/cli/workflows/presenters/csv_stats_presenter_test.rb +37 -0
- data/test/csvtool/interface/cli/workflows/run_csv_stats_workflow_test.rb +146 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/build_session_step_test.rb +36 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/collect_destination_step_test.rb +49 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/collect_inputs_step_test.rb +61 -0
- data/test/csvtool/interface/cli/workflows/steps/csv_stats/execute_step_test.rb +65 -0
- metadata +25 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7c09a1df68b5bbb8885b254bd7ea1260617495fc042cae43ef7251d9eb66836e
|
|
4
|
+
data.tar.gz: a5fbf5098df8007e844c83134a5474f92eafe4866b4d9910519d9a1517675af9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3695b9d49a638138d03d69122267a2889ecb6bd33605e9a256a20480ccab869c858f8f08996a213a885c2fb9a08e740712bb1dcc38fef6734d10129b20b3d611
|
|
7
|
+
data.tar.gz: df4cf19d31ac3c317ae69552cd3181c48984295268880810adf0d7b46bb606a2fa92646c35c4b1cd3343c04c5d08c734efb6b53b08f44fb3259b3dc3cbaf4509
|
data/README.md
CHANGED
|
@@ -39,30 +39,25 @@ CSV Tool Menu
|
|
|
39
39
|
4. Dedupe using another CSV
|
|
40
40
|
5. Validate parity
|
|
41
41
|
6. Split CSV into chunks
|
|
42
|
-
7. Exit
|
|
42
|
+
7. CSV stats summary
|
|
43
|
+
8. Exit
|
|
43
44
|
>
|
|
44
45
|
```
|
|
45
46
|
|
|
46
|
-
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, `5` for parity validation, or `6` for CSV splitting.
|
|
47
|
+
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, `5` for parity validation, `6` for CSV splitting, or `7` for CSV stats.
|
|
47
48
|
|
|
48
49
|
### 3. Follow prompts
|
|
49
50
|
|
|
50
|
-
Each
|
|
51
|
+
Each action asks only for what it needs (file path, separator, and any action-specific options), then prints results to the console or writes to a file when selected.
|
|
51
52
|
|
|
52
|
-
|
|
53
|
+
Typical prompt pattern:
|
|
53
54
|
|
|
54
|
-
-
|
|
55
|
-
-
|
|
56
|
-
-
|
|
55
|
+
- choose source file(s)
|
|
56
|
+
- choose separator/header options when relevant
|
|
57
|
+
- choose action-specific options
|
|
58
|
+
- choose output destination (console or file)
|
|
57
59
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
- `Extract column`: file path, separator, optional header filter + column select, skip blanks, preview/confirm, output destination.
|
|
61
|
-
- `Extract rows (range)`: file path, separator, start row, end row, output destination.
|
|
62
|
-
- `Randomize rows`: file path, separator, headers present, optional seed, output destination.
|
|
63
|
-
- `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
|
|
64
|
-
- `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
|
|
65
|
-
- `Split CSV into chunks`: source file, separator, header mode, chunk size, output directory/prefix, overwrite policy, optional manifest.
|
|
60
|
+
For architecture and internal design details, see [`docs/architecture.md`](docs/architecture.md).
|
|
66
61
|
|
|
67
62
|
### 4. Example interaction (console output)
|
|
68
63
|
|
|
@@ -134,8 +129,9 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
134
129
|
3. Randomize rows
|
|
135
130
|
4. Dedupe using another CSV
|
|
136
131
|
5. Validate parity
|
|
137
|
-
|
|
138
|
-
7. Exit
|
|
132
|
+
6. Split CSV into chunks
|
|
133
|
+
7. CSV stats summary
|
|
134
|
+
8. Exit
|
|
139
135
|
+> 4
|
|
140
136
|
CSV file path: /tmp/source.csv
|
|
141
137
|
Source CSV separator:
|
|
@@ -184,7 +180,8 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
184
180
|
4. Dedupe using another CSV
|
|
185
181
|
5. Validate parity
|
|
186
182
|
6. Split CSV into chunks
|
|
187
|
-
7. Exit
|
|
183
|
+
7. CSV stats summary
|
|
184
|
+
8. Exit
|
|
188
185
|
+> 5
|
|
189
186
|
Left CSV file path: /tmp/left.csv
|
|
190
187
|
Right CSV file path: /tmp/right.csv
|
|
@@ -224,7 +221,8 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
224
221
|
4. Dedupe using another CSV
|
|
225
222
|
5. Validate parity
|
|
226
223
|
6. Split CSV into chunks
|
|
227
|
-
7. Exit
|
|
224
|
+
7. CSV stats summary
|
|
225
|
+
8. Exit
|
|
228
226
|
+> 6
|
|
229
227
|
Source CSV file path: /tmp/people.csv
|
|
230
228
|
Choose separator:
|
|
@@ -247,6 +245,48 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
247
245
|
-/tmp/people_part_001.csv
|
|
248
246
|
```
|
|
249
247
|
|
|
248
|
+
### 11. CSV stats interaction example
|
|
249
|
+
|
|
250
|
+
Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
251
|
+
|
|
252
|
+
```diff
|
|
253
|
+
CSV Tool Menu
|
|
254
|
+
1. Extract column
|
|
255
|
+
2. Extract rows (range)
|
|
256
|
+
3. Randomize rows
|
|
257
|
+
4. Dedupe using another CSV
|
|
258
|
+
5. Validate parity
|
|
259
|
+
6. Split CSV into chunks
|
|
260
|
+
7. CSV stats summary
|
|
261
|
+
8. Exit
|
|
262
|
+
+> 7
|
|
263
|
+
CSV file path: /tmp/people.csv
|
|
264
|
+
Choose separator:
|
|
265
|
+
1. comma (,)
|
|
266
|
+
2. tab (\t)
|
|
267
|
+
3. semicolon (;)
|
|
268
|
+
4. pipe (|)
|
|
269
|
+
5. custom
|
|
270
|
+
+Separator choice [1]: 1
|
|
271
|
+
Headers present? [Y/n]:
|
|
272
|
+
Output destination:
|
|
273
|
+
1. console
|
|
274
|
+
2. file
|
|
275
|
+
+Output destination [1]: 1
|
|
276
|
+
-CSV Stats Summary
|
|
277
|
+
-Rows: 3
|
|
278
|
+
-Columns: 2
|
|
279
|
+
-Headers: name, city
|
|
280
|
+
-Column completeness:
|
|
281
|
+
- name: non_blank=3 blank=0
|
|
282
|
+
- city: non_blank=3 blank=0
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### 12. CSV stats large-file behavior
|
|
286
|
+
|
|
287
|
+
- Stats scanning is streaming (`CSV.foreach`), processed in one pass.
|
|
288
|
+
- Memory grows with per-column aggregates (`column_stats`), not with total row count.
|
|
289
|
+
|
|
250
290
|
## Testing
|
|
251
291
|
|
|
252
292
|
Run tests:
|
|
@@ -263,7 +303,7 @@ bundle exec rake test
|
|
|
263
303
|
|
|
264
304
|
## Alpha release
|
|
265
305
|
|
|
266
|
-
Current prerelease version: `0.7.0.alpha`
|
|
306
|
+
Current prerelease version: `0.8.0.alpha`
|
|
267
307
|
|
|
268
308
|
Install prerelease from RubyGems:
|
|
269
309
|
|
|
@@ -273,7 +313,7 @@ gem install csvops --pre
|
|
|
273
313
|
|
|
274
314
|
Release runbook:
|
|
275
315
|
|
|
276
|
-
- `docs/release-v0.7.0-alpha.md`
|
|
316
|
+
- `docs/release-v0.8.0-alpha.md`
|
|
277
317
|
|
|
278
318
|
|
|
279
319
|
## Architecture
|
data/docs/architecture.md
CHANGED
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
The codebase follows a DDD-lite layered structure:
|
|
4
4
|
|
|
5
|
-
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, `CrossCsvDedupeSession`, and `
|
|
6
|
-
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`, `RunCsvSplit`).
|
|
5
|
+
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, `CrossCsvDedupeSession`, `ParitySession`, `SplitSession`, and `CsvStatsSession` aggregates + supporting entities/value objects).
|
|
6
|
+
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`, `RunCsvSplit`, `RunCsvStats`).
|
|
7
7
|
- `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
|
|
8
8
|
- `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
|
|
9
9
|
- `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
|
|
10
10
|
|
|
11
11
|
## Workflow boundary (standardized)
|
|
12
12
|
|
|
13
|
-
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`), the boundary is:
|
|
13
|
+
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`, `CSV Stats`), the boundary is:
|
|
14
14
|
|
|
15
15
|
- `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
|
|
16
16
|
- `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
|
|
@@ -34,6 +34,7 @@ Current usage:
|
|
|
34
34
|
- `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
|
|
35
35
|
- `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
|
|
36
36
|
- `RunCsvSplitWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvSplit::*`.
|
|
37
|
+
- `RunCsvStatsWorkflow` uses `WorkflowStepPipeline` + `Steps::CsvStats::*`.
|
|
37
38
|
|
|
38
39
|
## Adding New Concepts
|
|
39
40
|
|
|
@@ -109,7 +110,7 @@ For a new function type, prefer one of these patterns:
|
|
|
109
110
|
|
|
110
111
|
## Domain model
|
|
111
112
|
|
|
112
|
-
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, and `CSV Split`.
|
|
113
|
+
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`, `CSV Split`, and `CSV Stats`.
|
|
113
114
|
|
|
114
115
|
### Cross-CSV Dedupe (Large-file behavior)
|
|
115
116
|
|
|
@@ -476,6 +477,63 @@ classDiagram
|
|
|
476
477
|
RunCsvSplit --> CsvSplitManifestWriter
|
|
477
478
|
```
|
|
478
479
|
|
|
480
|
+
### CSV Stats
|
|
481
|
+
|
|
482
|
+
Core DDD structure:
|
|
483
|
+
|
|
484
|
+
- Aggregate root: `StatsSession`
|
|
485
|
+
- Captures one stats summary request.
|
|
486
|
+
- Holds source profile and output destination.
|
|
487
|
+
- Entity:
|
|
488
|
+
- `StatsSource` (path + separator + header mode)
|
|
489
|
+
- Value objects:
|
|
490
|
+
- `StatsOptions` (currently lightweight; keeps option growth explicit)
|
|
491
|
+
- Shared `OutputDestination` (`console` or `file(path)`)
|
|
492
|
+
- Application service:
|
|
493
|
+
- `Application::UseCases::RunCsvStats` orchestrates stats scanning and output routing.
|
|
494
|
+
- Infrastructure adapters:
|
|
495
|
+
- `Infrastructure::CSV::CsvStatsScanner` (streaming one-pass row aggregation)
|
|
496
|
+
- `Infrastructure::Output::CsvStatsFileWriter` (metric/value artifact writer)
|
|
497
|
+
- Interface adapters:
|
|
498
|
+
- `Interface::CLI::MenuLoop`
|
|
499
|
+
- `Interface::CLI::Workflows::RunCsvStatsWorkflow`
|
|
500
|
+
- `Interface::CLI::Workflows::Builders::CsvStatsSessionBuilder`
|
|
501
|
+
- `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
|
|
502
|
+
- `Interface::CLI::Workflows::Steps::CsvStats::*`
|
|
503
|
+
- `Interface::CLI::Workflows::Presenters::CsvStatsPresenter`
|
|
504
|
+
- `Interface::CLI::Workflows::Support::{OutputDestinationMapper,ResultErrorHandler}`
|
|
505
|
+
- `Interface::CLI::Prompts::*`
|
|
506
|
+
- `Interface::CLI::Errors::Presenter`
|
|
507
|
+
|
|
508
|
+
```mermaid
|
|
509
|
+
classDiagram
|
|
510
|
+
direction LR
|
|
511
|
+
class MenuLoop
|
|
512
|
+
class RunCsvStatsWorkflow
|
|
513
|
+
class Prompts
|
|
514
|
+
class Errors
|
|
515
|
+
class RunCsvStats
|
|
516
|
+
class StatsSession
|
|
517
|
+
class StatsSource
|
|
518
|
+
class StatsOptions
|
|
519
|
+
class OutputDestination
|
|
520
|
+
class CsvStatsScanner
|
|
521
|
+
class CsvStatsFileWriter
|
|
522
|
+
class CsvStatsPresenter
|
|
523
|
+
|
|
524
|
+
MenuLoop --> RunCsvStatsWorkflow : invokes
|
|
525
|
+
RunCsvStatsWorkflow --> Prompts : uses
|
|
526
|
+
RunCsvStatsWorkflow --> Errors : reports failures
|
|
527
|
+
RunCsvStatsWorkflow --> CsvStatsPresenter : renders
|
|
528
|
+
RunCsvStatsWorkflow --> RunCsvStats : calls
|
|
529
|
+
RunCsvStats --> StatsSession : orchestrates
|
|
530
|
+
StatsSession o-- StatsSource
|
|
531
|
+
StatsSession o-- StatsOptions
|
|
532
|
+
StatsSession o-- OutputDestination
|
|
533
|
+
RunCsvStats --> CsvStatsScanner
|
|
534
|
+
RunCsvStats --> CsvStatsFileWriter
|
|
535
|
+
```
|
|
536
|
+
|
|
479
537
|
## Project layout
|
|
480
538
|
|
|
481
539
|
```text
|
|
@@ -487,6 +545,7 @@ lib/csvtool/domain/row_randomization_session/*
|
|
|
487
545
|
lib/csvtool/domain/cross_csv_dedupe_session/*
|
|
488
546
|
lib/csvtool/domain/csv_parity_session/*
|
|
489
547
|
lib/csvtool/domain/csv_split_session/*
|
|
548
|
+
lib/csvtool/domain/csv_stats_session/*
|
|
490
549
|
lib/csvtool/domain/shared/output_destination.rb
|
|
491
550
|
lib/csvtool/application/use_cases/run_extraction.rb
|
|
492
551
|
lib/csvtool/application/use_cases/run_row_extraction.rb
|
|
@@ -494,6 +553,7 @@ lib/csvtool/application/use_cases/run_row_randomization.rb
|
|
|
494
553
|
lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
|
|
495
554
|
lib/csvtool/application/use_cases/run_csv_parity.rb
|
|
496
555
|
lib/csvtool/application/use_cases/run_csv_split.rb
|
|
556
|
+
lib/csvtool/application/use_cases/run_csv_stats.rb
|
|
497
557
|
lib/csvtool/infrastructure/csv/*
|
|
498
558
|
lib/csvtool/infrastructure/output/*
|
|
499
559
|
lib/csvtool/interface/cli/menu_loop.rb
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# Release Checklist: v0.8.0-alpha
|
|
2
|
+
|
|
3
|
+
## 1. Verify environment
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
ruby -v
|
|
7
|
+
bundle -v
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Expected:
|
|
11
|
+
- Ruby `3.3.x`
|
|
12
|
+
|
|
13
|
+
## 2. Install dependencies
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 3. Run quality checks
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
bundle exec rake test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 4. Smoke test CLI commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
bundle exec csvtool menu
|
|
29
|
+
bundle exec csvtool column test/fixtures/sample_people.csv name
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 5. Smoke test workflows
|
|
33
|
+
|
|
34
|
+
### CSV stats workflow (new in this release)
|
|
35
|
+
|
|
36
|
+
Use menu option `7` (`CSV stats summary`) and verify:
|
|
37
|
+
- happy path summary prints rows/columns/headers
|
|
38
|
+
- separator and header mode options work (CSV/TSV/headerless/custom)
|
|
39
|
+
- column completeness output is correct for blanks
|
|
40
|
+
- output destination supports console and file
|
|
41
|
+
- invalid output path returns friendly error
|
|
42
|
+
|
|
43
|
+
### Existing workflows regression pass
|
|
44
|
+
|
|
45
|
+
Use menu options `1-6` and verify:
|
|
46
|
+
- column extraction still works
|
|
47
|
+
- row-range extraction still works
|
|
48
|
+
- row randomization still works
|
|
49
|
+
- cross-CSV dedupe still works
|
|
50
|
+
- parity validation still works
|
|
51
|
+
- CSV split still works
|
|
52
|
+
|
|
53
|
+
## 6. Build and validate gem package
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
gem build csvops.gemspec
|
|
57
|
+
gem install ./csvops-0.8.0.alpha.gem
|
|
58
|
+
csvtool menu
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## 7. Commit release prep
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
git add -A
|
|
65
|
+
git commit -m "chore(release): prepare v0.8.0-alpha"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## 8. Tag release
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
git tag -a v0.8.0-alpha -m "v0.8.0-alpha"
|
|
72
|
+
git push origin main --tags
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## 9. Publish gem
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
gem push csvops-0.8.0.alpha.gem
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## 10. Create GitHub release
|
|
82
|
+
|
|
83
|
+
Create release `v0.8.0-alpha` with:
|
|
84
|
+
- New `CSV stats summary` workflow
|
|
85
|
+
- Stats-domain architecture (workflow steps, builder, presenter, use case, infrastructure adapters)
|
|
86
|
+
- Console/file output destination support for stats summary artifacts
|
|
87
|
+
- Streaming stats scanner coverage for large files
|
|
88
|
+
- Stats documentation updates in README + architecture guide
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/csv_stats_scanner"
|
|
5
|
+
require "csvtool/infrastructure/output/csv_stats_file_writer"
|
|
6
|
+
|
|
7
|
+
module Csvtool
|
|
8
|
+
module Application
|
|
9
|
+
module UseCases
|
|
10
|
+
class RunCsvStats
|
|
11
|
+
Result = Struct.new(:ok, :error, :data, keyword_init: true) do
|
|
12
|
+
def ok?
|
|
13
|
+
ok
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def initialize(
|
|
18
|
+
scanner: Infrastructure::CSV::CsvStatsScanner.new,
|
|
19
|
+
csv_stats_file_writer: Infrastructure::Output::CsvStatsFileWriter.new
|
|
20
|
+
)
|
|
21
|
+
@scanner = scanner
|
|
22
|
+
@csv_stats_file_writer = csv_stats_file_writer
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def call(session:)
|
|
26
|
+
path = session.source.path
|
|
27
|
+
return failure(:file_not_found, path: path) unless File.file?(path)
|
|
28
|
+
|
|
29
|
+
stats = @scanner.call(
|
|
30
|
+
file_path: path,
|
|
31
|
+
col_sep: session.source.separator,
|
|
32
|
+
headers_present: session.source.headers_present
|
|
33
|
+
)
|
|
34
|
+
if session.output_destination&.file?
|
|
35
|
+
@csv_stats_file_writer.call(path: session.output_destination.path, data: stats)
|
|
36
|
+
return success(stats.merge(output_path: session.output_destination.path))
|
|
37
|
+
end
|
|
38
|
+
success(stats)
|
|
39
|
+
rescue CSV::MalformedCSVError
|
|
40
|
+
failure(:could_not_parse_csv)
|
|
41
|
+
rescue Errno::EACCES => e
|
|
42
|
+
if session.output_destination&.file?
|
|
43
|
+
return failure(:cannot_write_output_file, path: session.output_destination.path, error_class: e.class)
|
|
44
|
+
end
|
|
45
|
+
failure(:cannot_read_file, path: path)
|
|
46
|
+
rescue Errno::ENOENT => e
|
|
47
|
+
return failure(:cannot_write_output_file, path: session.output_destination.path, error_class: e.class) if session.output_destination&.file?
|
|
48
|
+
|
|
49
|
+
failure(:cannot_read_file, path: path)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
def success(data)
|
|
55
|
+
Result.new(ok: true, error: nil, data: data)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def failure(code, data = {})
|
|
59
|
+
Result.new(ok: false, error: code, data: data)
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
data/lib/csvtool/cli.rb
CHANGED
|
@@ -8,6 +8,7 @@ require "csvtool/interface/cli/workflows/run_row_randomization_workflow"
|
|
|
8
8
|
require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
|
|
9
9
|
require "csvtool/interface/cli/workflows/run_csv_parity_workflow"
|
|
10
10
|
require "csvtool/interface/cli/workflows/run_csv_split_workflow"
|
|
11
|
+
require "csvtool/interface/cli/workflows/run_csv_stats_workflow"
|
|
11
12
|
require "csvtool/interface/cli/errors/presenter"
|
|
12
13
|
require "csvtool/infrastructure/csv/header_reader"
|
|
13
14
|
require "csvtool/infrastructure/csv/value_streamer"
|
|
@@ -22,6 +23,7 @@ module Csvtool
|
|
|
22
23
|
"Dedupe using another CSV",
|
|
23
24
|
"Validate parity",
|
|
24
25
|
"Split CSV into chunks",
|
|
26
|
+
"CSV stats summary",
|
|
25
27
|
"Exit"
|
|
26
28
|
].freeze
|
|
27
29
|
|
|
@@ -57,6 +59,7 @@ module Csvtool
|
|
|
57
59
|
dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
58
60
|
parity_action = -> { Interface::CLI::Workflows::RunCsvParityWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
59
61
|
split_action = -> { Interface::CLI::Workflows::RunCsvSplitWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
62
|
+
stats_action = -> { Interface::CLI::Workflows::RunCsvStatsWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
60
63
|
Interface::CLI::MenuLoop.new(
|
|
61
64
|
stdin: @stdin,
|
|
62
65
|
stdout: @stdout,
|
|
@@ -66,7 +69,8 @@ module Csvtool
|
|
|
66
69
|
randomize_rows_action: randomize_rows_action,
|
|
67
70
|
dedupe_action: dedupe_action,
|
|
68
71
|
parity_action: parity_action,
|
|
69
|
-
split_action: split_action
|
|
72
|
+
split_action: split_action,
|
|
73
|
+
stats_action: stats_action
|
|
70
74
|
).run
|
|
71
75
|
end
|
|
72
76
|
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CsvStatsSession
|
|
6
|
+
class StatsSession
|
|
7
|
+
attr_reader :source, :options, :output_destination
|
|
8
|
+
|
|
9
|
+
def self.start(source:, options:)
|
|
10
|
+
new(source: source, options: options)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def initialize(source:, options:, output_destination: nil)
|
|
14
|
+
@source = source
|
|
15
|
+
@options = options
|
|
16
|
+
@output_destination = output_destination
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def with_output_destination(output_destination)
|
|
20
|
+
self.class.new(source: source, options: options, output_destination: output_destination)
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
|
|
4
|
+
module Domain
|
|
5
|
+
module CsvStatsSession
|
|
6
|
+
class StatsSource
|
|
7
|
+
attr_reader :path, :separator, :headers_present
|
|
8
|
+
|
|
9
|
+
def initialize(path:, separator:, headers_present:)
|
|
10
|
+
@path = path
|
|
11
|
+
@separator = separator
|
|
12
|
+
@headers_present = headers_present
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module Csvtool
|
|
6
|
+
module Infrastructure
|
|
7
|
+
module CSV
|
|
8
|
+
class CsvStatsScanner
|
|
9
|
+
def initialize(csv: ::CSV)
|
|
10
|
+
@csv = csv
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def call(file_path:, col_sep:, headers_present:)
|
|
14
|
+
data_row_count = 0
|
|
15
|
+
headers = nil
|
|
16
|
+
column_count = 0
|
|
17
|
+
column_stats = []
|
|
18
|
+
|
|
19
|
+
# Streaming scan: memory grows with per-column metrics, not row count.
|
|
20
|
+
@csv.foreach(file_path, headers: headers_present, col_sep: col_sep) do |row|
|
|
21
|
+
if headers_present
|
|
22
|
+
headers ||= row.headers
|
|
23
|
+
column_count = headers.length
|
|
24
|
+
if column_stats.empty?
|
|
25
|
+
column_stats = headers.map { |name| { name: name, blank_count: 0, non_blank_count: 0 } }
|
|
26
|
+
end
|
|
27
|
+
fields = row.fields
|
|
28
|
+
fields.fill(nil, fields.length...column_count)
|
|
29
|
+
fields.each_with_index { |value, index| apply_value(column_stats[index], value) }
|
|
30
|
+
data_row_count += 1
|
|
31
|
+
else
|
|
32
|
+
fields = row.is_a?(::CSV::Row) ? row.fields : row
|
|
33
|
+
column_count = [column_count, fields.length].max
|
|
34
|
+
while column_stats.length < column_count
|
|
35
|
+
column_stats << {
|
|
36
|
+
name: "column_#{column_stats.length + 1}",
|
|
37
|
+
blank_count: 0,
|
|
38
|
+
non_blank_count: 0
|
|
39
|
+
}
|
|
40
|
+
end
|
|
41
|
+
fields.fill(nil, fields.length...column_count)
|
|
42
|
+
fields.each_with_index { |value, index| apply_value(column_stats[index], value) }
|
|
43
|
+
data_row_count += 1
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
{
|
|
48
|
+
row_count: data_row_count,
|
|
49
|
+
column_count: column_count,
|
|
50
|
+
headers: headers,
|
|
51
|
+
column_stats: column_stats
|
|
52
|
+
}
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
def apply_value(stats, value)
|
|
58
|
+
if value.nil? || value.strip.empty?
|
|
59
|
+
stats[:blank_count] += 1
|
|
60
|
+
else
|
|
61
|
+
stats[:non_blank_count] += 1
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module Csvtool
|
|
6
|
+
module Infrastructure
|
|
7
|
+
module Output
|
|
8
|
+
class CsvStatsFileWriter
|
|
9
|
+
def call(path:, data:)
|
|
10
|
+
::CSV.open(path, "w") do |csv|
|
|
11
|
+
csv << %w[metric value]
|
|
12
|
+
csv << ["row_count", data[:row_count]]
|
|
13
|
+
csv << ["column_count", data[:column_count]]
|
|
14
|
+
unless data[:headers].nil? || data[:headers].empty?
|
|
15
|
+
csv << ["headers", data[:headers].join("|")]
|
|
16
|
+
end
|
|
17
|
+
data.fetch(:column_stats, []).each do |stats|
|
|
18
|
+
csv << ["column.#{stats[:name]}.non_blank", stats[:non_blank_count]]
|
|
19
|
+
csv << ["column.#{stats[:name]}.blank", stats[:blank_count]]
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -4,7 +4,7 @@ module Csvtool
|
|
|
4
4
|
module Interface
|
|
5
5
|
module CLI
|
|
6
6
|
class MenuLoop
|
|
7
|
-
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:, parity_action:, split_action:)
|
|
7
|
+
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:, parity_action:, split_action:, stats_action:)
|
|
8
8
|
@stdin = stdin
|
|
9
9
|
@stdout = stdout
|
|
10
10
|
@menu_options = menu_options
|
|
@@ -14,6 +14,7 @@ module Csvtool
|
|
|
14
14
|
@dedupe_action = dedupe_action
|
|
15
15
|
@parity_action = parity_action
|
|
16
16
|
@split_action = split_action
|
|
17
|
+
@stats_action = stats_action
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
def run
|
|
@@ -37,9 +38,11 @@ module Csvtool
|
|
|
37
38
|
when "6"
|
|
38
39
|
@split_action.call
|
|
39
40
|
when "7"
|
|
41
|
+
@stats_action.call
|
|
42
|
+
when "8"
|
|
40
43
|
return 0
|
|
41
44
|
else
|
|
42
|
-
@stdout.puts "Please choose 1, 2, 3, 4, 5, 6, or
|
|
45
|
+
@stdout.puts "Please choose 1, 2, 3, 4, 5, 6, 7, or 8."
|
|
43
46
|
end
|
|
44
47
|
end
|
|
45
48
|
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/csv_stats_session/stats_source"
|
|
4
|
+
require "csvtool/domain/csv_stats_session/stats_options"
|
|
5
|
+
require "csvtool/domain/csv_stats_session/stats_session"
|
|
6
|
+
|
|
7
|
+
module Csvtool
|
|
8
|
+
module Interface
|
|
9
|
+
module CLI
|
|
10
|
+
module Workflows
|
|
11
|
+
module Builders
|
|
12
|
+
class CsvStatsSessionBuilder
|
|
13
|
+
def call(file_path:, col_sep:, headers_present:, destination:)
|
|
14
|
+
source = Domain::CsvStatsSession::StatsSource.new(
|
|
15
|
+
path: file_path,
|
|
16
|
+
separator: col_sep,
|
|
17
|
+
headers_present: headers_present
|
|
18
|
+
)
|
|
19
|
+
options = Domain::CsvStatsSession::StatsOptions.new
|
|
20
|
+
session = Domain::CsvStatsSession::StatsSession.start(source: source, options: options)
|
|
21
|
+
session.with_output_destination(destination)
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|