csvops 0.5.0.alpha → 0.6.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +45 -3
- data/docs/architecture.md +61 -4
- data/docs/release-v0.6.0-alpha.md +84 -0
- data/lib/csvtool/application/use_cases/run_csv_parity.rb +70 -0
- data/lib/csvtool/cli.rb +5 -1
- data/lib/csvtool/domain/csv_parity_session/parity_options.rb +22 -0
- data/lib/csvtool/domain/csv_parity_session/parity_session.rb +20 -0
- data/lib/csvtool/domain/csv_parity_session/source_pair.rb +19 -0
- data/lib/csvtool/infrastructure/csv/csv_parity_comparator.rb +71 -0
- data/lib/csvtool/interface/cli/errors/presenter.rb +4 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
- data/lib/csvtool/interface/cli/workflows/builders/csv_parity_session_builder.rb +33 -0
- data/lib/csvtool/interface/cli/workflows/presenters/csv_parity_presenter.rb +38 -0
- data/lib/csvtool/interface/cli/workflows/run_csv_parity_workflow.rb +66 -0
- data/lib/csvtool/interface/cli/workflows/steps/parity/build_session_step.rb +25 -0
- data/lib/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step.rb +32 -0
- data/lib/csvtool/interface/cli/workflows/steps/parity/execute_step.rb +26 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_csv_parity_test.rb +160 -0
- data/test/csvtool/cli_test.rb +175 -21
- data/test/csvtool/cli_unit_test.rb +4 -4
- data/test/csvtool/domain/csv_parity_session/parity_options_test.rb +17 -0
- data/test/csvtool/domain/csv_parity_session/parity_session_test.rb +18 -0
- data/test/csvtool/domain/csv_parity_session/source_pair_test.rb +11 -0
- data/test/csvtool/infrastructure/csv/csv_parity_comparator_test.rb +78 -0
- data/test/csvtool/interface/cli/errors/presenter_test.rb +2 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +59 -16
- data/test/csvtool/interface/cli/workflows/builders/csv_parity_session_builder_test.rb +20 -0
- data/test/csvtool/interface/cli/workflows/presenters/csv_parity_presenter_test.rb +43 -0
- data/test/csvtool/interface/cli/workflows/run_csv_parity_workflow_test.rb +94 -0
- data/test/csvtool/interface/cli/workflows/steps/parity/build_session_step_test.rb +41 -0
- data/test/csvtool/interface/cli/workflows/steps/parity/collect_inputs_step_test.rb +30 -0
- data/test/csvtool/interface/cli/workflows/steps/parity/execute_step_test.rb +40 -0
- data/test/fixtures/parity_duplicates_left.csv +4 -0
- data/test/fixtures/parity_duplicates_right.csv +3 -0
- data/test/fixtures/parity_people_header_mismatch.csv +4 -0
- data/test/fixtures/parity_people_many_reordered.csv +13 -0
- data/test/fixtures/parity_people_mismatch.csv +4 -0
- data/test/fixtures/parity_people_reordered.csv +4 -0
- data/test/fixtures/parity_people_reordered.tsv +4 -0
- metadata +31 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f7db22cb84c1d08c58b473368f9ad37575a217d6293539309277ed2b032a2852
|
|
4
|
+
data.tar.gz: 124bebc822fefa5d1f71286701959876260c82164067c36ff94b712a0b4cc1b3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a8b8dbcfb66073f46f0ecc625267081fbe730e69ef9295f5d2303af6b831a9d71ef564f78f5b44212eb33c4ad7a5fdb78b54fa98e21dd58669e9494a5d3325fb
|
|
7
|
+
data.tar.gz: 05cbcaa2ca3116ad463413e53600d32a53df0941ceb8873ed22c2ef2d4cfe1afc8f90e44c7ff4400212ebbd5083a2ceb6a983281291436e9478d5087cc98b9ad
|
data/README.md
CHANGED
|
@@ -37,11 +37,12 @@ CSV Tool Menu
|
|
|
37
37
|
2. Extract rows (range)
|
|
38
38
|
3. Randomize rows
|
|
39
39
|
4. Dedupe using another CSV
|
|
40
|
-
5. Exit
|
|
40
|
+
5. Validate parity
|
|
41
|
+
6. Exit
|
|
41
42
|
>
|
|
42
43
|
```
|
|
43
44
|
|
|
44
|
-
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, or `4` for cross-CSV dedupe.
|
|
45
|
+
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, `4` for cross-CSV dedupe, or `5` for parity validation.
|
|
45
46
|
|
|
46
47
|
### 3. Follow prompts
|
|
47
48
|
|
|
@@ -59,6 +60,7 @@ Prompt flow by action:
|
|
|
59
60
|
- `Extract rows (range)`: file path, separator, start row, end row, output destination.
|
|
60
61
|
- `Randomize rows`: file path, separator, headers present, optional seed, output destination.
|
|
61
62
|
- `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
|
|
63
|
+
- `Validate parity`: left/right files, separator, header mode, parity summary, mismatch samples.
|
|
62
64
|
|
|
63
65
|
### 4. Example interaction (console output)
|
|
64
66
|
|
|
@@ -129,7 +131,8 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
129
131
|
2. Extract rows (range)
|
|
130
132
|
3. Randomize rows
|
|
131
133
|
4. Dedupe using another CSV
|
|
132
|
-
5. Exit
|
|
134
|
+
5. Validate parity
|
|
135
|
+
6. Exit
|
|
133
136
|
+> 4
|
|
134
137
|
CSV file path: /tmp/source.csv
|
|
135
138
|
Source CSV separator:
|
|
@@ -166,6 +169,45 @@ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
|
166
169
|
-Summary: source_rows=5 removed_rows=3 kept_rows=2
|
|
167
170
|
```
|
|
168
171
|
|
|
172
|
+
### 8. Parity interaction example
|
|
173
|
+
|
|
174
|
+
Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
175
|
+
|
|
176
|
+
```diff
|
|
177
|
+
CSV Tool Menu
|
|
178
|
+
1. Extract column
|
|
179
|
+
2. Extract rows (range)
|
|
180
|
+
3. Randomize rows
|
|
181
|
+
4. Dedupe using another CSV
|
|
182
|
+
5. Validate parity
|
|
183
|
+
6. Exit
|
|
184
|
+
+> 5
|
|
185
|
+
Left CSV file path: /tmp/left.csv
|
|
186
|
+
Right CSV file path: /tmp/right.csv
|
|
187
|
+
Choose separator:
|
|
188
|
+
1. comma (,)
|
|
189
|
+
2. tab (\t)
|
|
190
|
+
3. semicolon (;)
|
|
191
|
+
4. pipe (|)
|
|
192
|
+
5. custom
|
|
193
|
+
+Separator choice [1]: 1
|
|
194
|
+
Headers present? [Y/n]:
|
|
195
|
+
-MISMATCH
|
|
196
|
+
-Summary: left_rows=10 right_rows=10 left_only=2 right_only=2
|
|
197
|
+
-Left-only examples:
|
|
198
|
+
- 4,Dina (count +1)
|
|
199
|
+
-Right-only examples:
|
|
200
|
+
- 4,Dina-Updated (count +1)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### 9. Parity large-file behavior
|
|
204
|
+
|
|
205
|
+
- Parity uses a streaming count-delta strategy:
|
|
206
|
+
- Stream left rows and increment row-key counts.
|
|
207
|
+
- Stream right rows and decrement row-key counts.
|
|
208
|
+
- Exact duplicate semantics are preserved by count deltas per normalized row value.
|
|
209
|
+
- Memory scales with the number of distinct row keys in the parity map, not the total input row count.
|
|
210
|
+
|
|
169
211
|
## Testing
|
|
170
212
|
|
|
171
213
|
Run tests:
|
data/docs/architecture.md
CHANGED
|
@@ -3,14 +3,14 @@
|
|
|
3
3
|
The codebase follows a DDD-lite layered structure:
|
|
4
4
|
|
|
5
5
|
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
|
|
6
|
-
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`).
|
|
7
|
-
- `infrastructure/`: CSV reading/streaming and output adapters (console/file)
|
|
6
|
+
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`, `RunCsvParity`).
|
|
7
|
+
- `infrastructure/`: CSV reading/streaming/comparison and output adapters (console/file).
|
|
8
8
|
- `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
|
|
9
9
|
- `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
|
|
10
10
|
|
|
11
11
|
## Workflow boundary (standardized)
|
|
12
12
|
|
|
13
|
-
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`), the boundary is:
|
|
13
|
+
For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, `CSV Parity`), the boundary is:
|
|
14
14
|
|
|
15
15
|
- `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
|
|
16
16
|
- `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
|
|
@@ -32,6 +32,7 @@ Current usage:
|
|
|
32
32
|
- `RunRowExtractionWorkflow` uses `WorkflowStepPipeline` + `Steps::RowExtraction::*`.
|
|
33
33
|
- `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
|
|
34
34
|
- `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
|
|
35
|
+
- `RunCsvParityWorkflow` uses `WorkflowStepPipeline` + `Steps::Parity::*`.
|
|
35
36
|
|
|
36
37
|
## Adding New Concepts
|
|
37
38
|
|
|
@@ -107,7 +108,7 @@ For a new function type, prefer one of these patterns:
|
|
|
107
108
|
|
|
108
109
|
## Domain model
|
|
109
110
|
|
|
110
|
-
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, and `Cross-CSV Dedupe`.
|
|
111
|
+
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`, and `CSV Parity`.
|
|
111
112
|
|
|
112
113
|
### Cross-CSV Dedupe (Large-file behavior)
|
|
113
114
|
|
|
@@ -366,6 +367,60 @@ classDiagram
|
|
|
366
367
|
RunCrossCsvDedupe --> CsvCrossCsvDedupeFileWriter
|
|
367
368
|
```
|
|
368
369
|
|
|
370
|
+
### CSV Parity
|
|
371
|
+
|
|
372
|
+
Core DDD structure:
|
|
373
|
+
|
|
374
|
+
- Aggregate root: `ParitySession`
|
|
375
|
+
- Captures one parity check request.
|
|
376
|
+
- Holds left/right source paths and parity options.
|
|
377
|
+
- Entities:
|
|
378
|
+
- `SourcePair` (left and right file paths)
|
|
379
|
+
- Value objects:
|
|
380
|
+
- `ParityOptions` (separator + header mode)
|
|
381
|
+
- Application service:
|
|
382
|
+
- `Application::UseCases::RunCsvParity` orchestrates parity validation and returns request/result style payloads.
|
|
383
|
+
- Infrastructure adapters:
|
|
384
|
+
- `Infrastructure::CSV::HeaderReader`
|
|
385
|
+
- `Infrastructure::CSV::CsvParityComparator` (streaming count-delta strategy with duplicate-aware semantics)
|
|
386
|
+
- Interface adapters:
|
|
387
|
+
- `Interface::CLI::MenuLoop`
|
|
388
|
+
- `Interface::CLI::Workflows::RunCsvParityWorkflow`
|
|
389
|
+
- `Interface::CLI::Workflows::Builders::CsvParitySessionBuilder`
|
|
390
|
+
- `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
|
|
391
|
+
- `Interface::CLI::Workflows::Steps::Parity::*`
|
|
392
|
+
- `Interface::CLI::Workflows::Presenters::CsvParityPresenter`
|
|
393
|
+
- `Interface::CLI::Workflows::Support::ResultErrorHandler`
|
|
394
|
+
- `Interface::CLI::Prompts::*`
|
|
395
|
+
- `Interface::CLI::Errors::Presenter`
|
|
396
|
+
|
|
397
|
+
```mermaid
|
|
398
|
+
classDiagram
|
|
399
|
+
direction LR
|
|
400
|
+
class MenuLoop
|
|
401
|
+
class RunCsvParityWorkflow
|
|
402
|
+
class Prompts
|
|
403
|
+
class Errors
|
|
404
|
+
class RunCsvParity
|
|
405
|
+
class ParitySession
|
|
406
|
+
class SourcePair
|
|
407
|
+
class ParityOptions
|
|
408
|
+
class HeaderReader
|
|
409
|
+
class CsvParityComparator
|
|
410
|
+
class CsvParityPresenter
|
|
411
|
+
|
|
412
|
+
MenuLoop --> RunCsvParityWorkflow : invokes
|
|
413
|
+
RunCsvParityWorkflow --> Prompts : uses
|
|
414
|
+
RunCsvParityWorkflow --> Errors : reports failures
|
|
415
|
+
RunCsvParityWorkflow --> CsvParityPresenter : renders
|
|
416
|
+
RunCsvParityWorkflow --> RunCsvParity : calls
|
|
417
|
+
RunCsvParity --> ParitySession : orchestrates
|
|
418
|
+
ParitySession o-- SourcePair
|
|
419
|
+
ParitySession o-- ParityOptions
|
|
420
|
+
RunCsvParity --> HeaderReader
|
|
421
|
+
RunCsvParity --> CsvParityComparator
|
|
422
|
+
```
|
|
423
|
+
|
|
369
424
|
## Project layout
|
|
370
425
|
|
|
371
426
|
```text
|
|
@@ -375,11 +430,13 @@ lib/csvtool/domain/column_session/*
|
|
|
375
430
|
lib/csvtool/domain/row_session/*
|
|
376
431
|
lib/csvtool/domain/row_randomization_session/*
|
|
377
432
|
lib/csvtool/domain/cross_csv_dedupe_session/*
|
|
433
|
+
lib/csvtool/domain/csv_parity_session/*
|
|
378
434
|
lib/csvtool/domain/shared/output_destination.rb
|
|
379
435
|
lib/csvtool/application/use_cases/run_extraction.rb
|
|
380
436
|
lib/csvtool/application/use_cases/run_row_extraction.rb
|
|
381
437
|
lib/csvtool/application/use_cases/run_row_randomization.rb
|
|
382
438
|
lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
|
|
439
|
+
lib/csvtool/application/use_cases/run_csv_parity.rb
|
|
383
440
|
lib/csvtool/infrastructure/csv/*
|
|
384
441
|
lib/csvtool/infrastructure/output/*
|
|
385
442
|
lib/csvtool/interface/cli/menu_loop.rb
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Release Checklist: v0.6.0-alpha
|
|
2
|
+
|
|
3
|
+
## 1. Verify environment
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
ruby -v
|
|
7
|
+
bundle -v
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Expected:
|
|
11
|
+
- Ruby `3.3.x`
|
|
12
|
+
|
|
13
|
+
## 2. Install dependencies
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 3. Run quality checks
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
bundle exec rake test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 4. Smoke test CLI commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
bundle exec csvtool menu
|
|
29
|
+
bundle exec csvtool column test/fixtures/sample_people.csv name
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 5. Smoke test workflows
|
|
33
|
+
|
|
34
|
+
### CSV parity workflow
|
|
35
|
+
|
|
36
|
+
Use menu option `5` (`Validate parity`) and verify:
|
|
37
|
+
- matching files with reordered rows return parity success
|
|
38
|
+
- mismatch files return friendly mismatch summary with sample deltas
|
|
39
|
+
- separator and header-mode selections are respected
|
|
40
|
+
|
|
41
|
+
### Existing workflows regression pass
|
|
42
|
+
|
|
43
|
+
Run quick checks for menu options `1-4` and confirm:
|
|
44
|
+
- column extraction still works
|
|
45
|
+
- row-range extraction still works
|
|
46
|
+
- row randomization still works
|
|
47
|
+
- cross-CSV dedupe still works
|
|
48
|
+
|
|
49
|
+
## 6. Build and validate gem package
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
gem build csvops.gemspec
|
|
53
|
+
gem install ./csvops-0.6.0.alpha.gem
|
|
54
|
+
csvtool menu
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## 7. Commit release prep
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git add -A
|
|
61
|
+
git commit -m "chore(release): prepare v0.6.0-alpha"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## 8. Tag release
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
git tag -a v0.6.0-alpha -m "v0.6.0-alpha"
|
|
68
|
+
git push origin main --tags
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## 9. Publish gem
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
gem push csvops-0.6.0.alpha.gem
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## 10. Create GitHub release
|
|
78
|
+
|
|
79
|
+
Create release `v0.6.0-alpha` with:
|
|
80
|
+
- Dedicated CSV parity validation workflow
|
|
81
|
+
- Header/separator parity options
|
|
82
|
+
- Friendly parity mismatch reporting
|
|
83
|
+
- Streaming delta-count parity comparator
|
|
84
|
+
- Parity architecture convergence (session model, workflow steps, presenter, docs)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/csv_parity_comparator"
|
|
5
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
6
|
+
|
|
7
|
+
module Csvtool
  module Application
    module UseCases
      # Use case that checks whether two CSV files hold the same rows.
      #
      # It validates that both files exist, optionally checks that their
      # headers are equal, and then delegates the row comparison to the
      # injected comparator. Every outcome is reported as a Result rather
      # than raised, except for errors this class does not rescue.
      class RunCsvParity
        # Outcome of a parity run.
        #
        # ok    - true on success, false on failure.
        # error - Symbol failure code (nil on success).
        # data  - comparator stats on success, failure details otherwise.
        Result = Struct.new(:ok, :error, :data, keyword_init: true) do
          def ok?
            ok
          end
        end

        # comparator    - object responding to
        #                 call(left_path:, right_path:, col_sep:, headers_present:).
        # header_reader - object responding to call(file_path:, col_sep:) whose
        #                 result responds to empty? and ==.
        def initialize(
          comparator: Infrastructure::CSV::CsvParityComparator.new,
          header_reader: Infrastructure::CSV::HeaderReader.new
        )
          @comparator = comparator
          @header_reader = header_reader
        end

        # Runs the parity check described by +session+.
        #
        # session - object exposing source_pair (left_path/right_path) and
        #           options (separator/headers_present?).
        #
        # Returns a Result. Failure codes produced here: :file_not_found,
        # :no_headers, :header_mismatch, :could_not_parse_csv and
        # :cannot_read_file.
        def call(session:)
          left_path = session.source_pair.left_path
          right_path = session.source_pair.right_path
          col_sep = session.options.separator
          headers_present = session.options.headers_present?

          return failure(:file_not_found, path: left_path) unless File.file?(left_path)
          return failure(:file_not_found, path: right_path) unless File.file?(right_path)

          if headers_present
            left_headers = @header_reader.call(file_path: left_path, col_sep: col_sep)
            return failure(:no_headers, path: left_path) if left_headers.empty?

            right_headers = @header_reader.call(file_path: right_path, col_sep: col_sep)
            return failure(:no_headers, path: right_path) if right_headers.empty?

            unless left_headers == right_headers
              return failure(:header_mismatch, left_headers: left_headers, right_headers: right_headers)
            end
          end

          stats = @comparator.call(
            left_path: left_path,
            right_path: right_path,
            col_sep: col_sep,
            headers_present: headers_present
          )

          success(stats)
        rescue ::CSV::MalformedCSVError
          failure(:could_not_parse_csv)
        rescue Errno::EACCES => e
          failure(:cannot_read_file, path: denied_path(e, [left_path, right_path]))
        end

        private

        def success(data)
          Result.new(ok: true, error: nil, data: data)
        end

        def failure(code, data = {})
          Result.new(ok: false, error: code, data: data)
        end

        # Picks the file a permission error refers to.
        #
        # SystemCallError does not expose a #path reader, so always falling
        # back to the left file would blame it even when the right file is the
        # unreadable one. Resolution order: an explicit #path on the error (if
        # some custom error provides one), then the path embedded at the end
        # of the errno message, then the first candidate that is not readable,
        # and finally the first candidate.
        def denied_path(error, candidates)
          return error.path if error.respond_to?(:path)

          paths = candidates.compact
          paths.find { |path| error.message.end_with?(" - #{path}") } ||
            paths.find { |path| !File.readable?(path) } ||
            paths.first
        end
      end
    end
  end
end
|
data/lib/csvtool/cli.rb
CHANGED
|
@@ -6,6 +6,7 @@ require "csvtool/interface/cli/workflows/run_extraction_workflow"
|
|
|
6
6
|
require "csvtool/interface/cli/workflows/run_row_extraction_workflow"
|
|
7
7
|
require "csvtool/interface/cli/workflows/run_row_randomization_workflow"
|
|
8
8
|
require "csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow"
|
|
9
|
+
require "csvtool/interface/cli/workflows/run_csv_parity_workflow"
|
|
9
10
|
require "csvtool/interface/cli/errors/presenter"
|
|
10
11
|
require "csvtool/infrastructure/csv/header_reader"
|
|
11
12
|
require "csvtool/infrastructure/csv/value_streamer"
|
|
@@ -18,6 +19,7 @@ module Csvtool
|
|
|
18
19
|
"Extract rows (range)",
|
|
19
20
|
"Randomize rows",
|
|
20
21
|
"Dedupe using another CSV",
|
|
22
|
+
"Validate parity",
|
|
21
23
|
"Exit"
|
|
22
24
|
].freeze
|
|
23
25
|
|
|
@@ -51,6 +53,7 @@ module Csvtool
|
|
|
51
53
|
extract_rows_action = -> { Interface::CLI::Workflows::RunRowExtractionWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
52
54
|
randomize_rows_action = -> { Interface::CLI::Workflows::RunRowRandomizationWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
53
55
|
dedupe_action = -> { Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
56
|
+
parity_action = -> { Interface::CLI::Workflows::RunCsvParityWorkflow.new(stdin: @stdin, stdout: @stdout).call }
|
|
54
57
|
Interface::CLI::MenuLoop.new(
|
|
55
58
|
stdin: @stdin,
|
|
56
59
|
stdout: @stdout,
|
|
@@ -58,7 +61,8 @@ module Csvtool
|
|
|
58
61
|
extract_column_action: extract_column_action,
|
|
59
62
|
extract_rows_action: extract_rows_action,
|
|
60
63
|
randomize_rows_action: randomize_rows_action,
|
|
61
|
-
dedupe_action: dedupe_action
|
|
64
|
+
dedupe_action: dedupe_action,
|
|
65
|
+
parity_action: parity_action
|
|
62
66
|
).run
|
|
63
67
|
end
|
|
64
68
|
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
  module Domain
    module CsvParitySession
      # Options for one parity check: the column separator and whether the
      # files are treated as having a header row.
      class ParityOptions
        attr_reader :separator

        # separator       - column separator; rejected when its string form
        #                   is empty (this includes nil).
        # headers_present - stored as given and returned by #headers_present?.
        #
        # Raises ArgumentError when the separator is empty.
        def initialize(separator:, headers_present:)
          @separator = require_separator(separator)
          @headers_present = headers_present
        end

        # Returns the headers_present value supplied at construction.
        def headers_present?
          @headers_present
        end

        private

        # Hands the separator back untouched unless it is empty.
        def require_separator(candidate)
          return candidate unless candidate.to_s.empty?

          raise ArgumentError, "separator cannot be empty"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
  module Domain
    module CsvParitySession
      # One parity-check request: the pair of sources to compare together
      # with the options to compare them under.
      class ParitySession
        class << self
          # Named constructor; builds a session from the same keywords as .new.
          def start(source_pair:, options:)
            new(source_pair: source_pair, options: options)
          end
        end

        attr_reader :source_pair, :options

        # source_pair - the left/right sources to compare.
        # options     - the parity options for this comparison.
        def initialize(source_pair:, options:)
          @source_pair, @options = source_pair, options
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
  module Domain
    module CsvParitySession
      # The two file paths that take part in a parity check.
      class SourcePair
        attr_reader :left_path, :right_path

        # left_path  - path of the left file.
        # right_path - path of the right file.
        #
        # Raises ArgumentError when either path has an empty string form
        # (nil included); the left path is checked first.
        def initialize(left_path:, right_path:)
          { "left_path" => left_path, "right_path" => right_path }.each do |label, value|
            raise ArgumentError, "#{label} cannot be empty" if value.to_s.empty?
          end

          @left_path, @right_path = left_path, right_path
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module Csvtool
  module Infrastructure
    module CSV
      # Compares two CSV files row by row, ignoring row order but respecting
      # how many times each row occurs.
      #
      # Each row is serialized back to a single CSV line and used as a key.
      # Rows from the left file add one to that key's counter, rows from the
      # right file subtract one. A counter left above zero means the row
      # occurs more often on the left; below zero, more often on the right.
      class CsvParityComparator
        # left_path/right_path - files to compare.
        # col_sep              - column separator for reading and for building keys.
        # headers_present      - when truthy the first line is read as headers
        #                        and is not counted as a row.
        # sample_limit         - maximum number of examples kept per side.
        #
        # Returns a Hash with :match, :left_rows, :right_rows,
        # :left_only_count, :right_only_count, :left_only_examples and
        # :right_only_examples. Each example is { row:, count_delta: }.
        def call(left_path:, right_path:, col_sep:, headers_present:, sample_limit: 5)
          balance = Hash.new(0)
          reader_options = { col_sep: col_sep, headers_present: headers_present }

          left_total = each_row_key(left_path, **reader_options) { |row_key| balance[row_key] += 1 }
          right_total = each_row_key(right_path, **reader_options) { |row_key| balance[row_key] -= 1 }

          left_side, right_side = tally_differences(balance, sample_limit)

          {
            match: left_side[:total].zero? && right_side[:total].zero?,
            left_rows: left_total,
            right_rows: right_total,
            left_only_count: left_side[:total],
            right_only_count: right_side[:total],
            left_only_examples: left_side[:samples],
            right_only_examples: right_side[:samples]
          }
        end

        private

        # Yields the serialized key of every data row in +path+ and returns
        # how many rows were yielded.
        def each_row_key(path, col_sep:, headers_present:)
          seen = 0

          ::CSV.foreach(path, headers: headers_present, col_sep: col_sep) do |record|
            # With headers enabled the parser yields row objects; take their
            # field values so keys have the same shape in both modes.
            values = record
            values = record.fields if headers_present
            yield ::CSV.generate_line(values, row_sep: "", col_sep: col_sep).chomp
            seen += 1
          end

          seen
        end

        # Splits the non-zero counters into a left-surplus and a
        # right-surplus summary, each with a running total and up to
        # +sample_limit+ examples in counter insertion order.
        def tally_differences(balance, sample_limit)
          left_side = { total: 0, samples: [] }
          right_side = { total: 0, samples: [] }

          balance.each do |row_key, delta|
            next if delta.zero?

            side = delta.positive? ? left_side : right_side
            magnitude = delta.abs
            side[:total] += magnitude
            next unless side[:samples].length < sample_limit

            side[:samples] << { row: row_key, count_delta: magnitude }
          end

          [left_side, right_side]
        end
      end
    end
  end
end
|
|
@@ -4,7 +4,7 @@ module Csvtool
|
|
|
4
4
|
module Interface
|
|
5
5
|
module CLI
|
|
6
6
|
class MenuLoop
|
|
7
|
-
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:)
|
|
7
|
+
def initialize(stdin:, stdout:, menu_options:, extract_column_action:, extract_rows_action:, randomize_rows_action:, dedupe_action:, parity_action:)
|
|
8
8
|
@stdin = stdin
|
|
9
9
|
@stdout = stdout
|
|
10
10
|
@menu_options = menu_options
|
|
@@ -12,6 +12,7 @@ module Csvtool
|
|
|
12
12
|
@extract_rows_action = extract_rows_action
|
|
13
13
|
@randomize_rows_action = randomize_rows_action
|
|
14
14
|
@dedupe_action = dedupe_action
|
|
15
|
+
@parity_action = parity_action
|
|
15
16
|
end
|
|
16
17
|
|
|
17
18
|
def run
|
|
@@ -31,9 +32,11 @@ module Csvtool
|
|
|
31
32
|
when "4"
|
|
32
33
|
@dedupe_action.call
|
|
33
34
|
when "5"
|
|
35
|
+
@parity_action.call
|
|
36
|
+
when "6"
|
|
34
37
|
return 0
|
|
35
38
|
else
|
|
36
|
-
@stdout.puts "Please choose 1, 2, 3, 4, or
|
|
39
|
+
@stdout.puts "Please choose 1, 2, 3, 4, 5, or 6."
|
|
37
40
|
end
|
|
38
41
|
end
|
|
39
42
|
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csvtool/domain/csv_parity_session/source_pair"
|
|
4
|
+
require "csvtool/domain/csv_parity_session/parity_options"
|
|
5
|
+
require "csvtool/domain/csv_parity_session/parity_session"
|
|
6
|
+
|
|
7
|
+
module Csvtool
  module Interface
    module CLI
      module Workflows
        module Builders
          # Turns the raw values collected by the CLI into a parity session.
          class CsvParitySessionBuilder
            # left_path/right_path - files to compare.
            # col_sep              - separator handed to ParityOptions.
            # headers_present      - header flag handed to ParityOptions.
            #
            # Returns whatever ParitySession.start returns. Errors raised by
            # the SourcePair or ParityOptions constructors are not rescued here.
            def call(left_path:, right_path:, col_sep:, headers_present:)
              parity = Domain::CsvParitySession

              parity::ParitySession.start(
                source_pair: parity::SourcePair.new(left_path: left_path, right_path: right_path),
                options: parity::ParityOptions.new(separator: col_sep, headers_present: headers_present)
              )
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Csvtool
  module Interface
    module CLI
      module Workflows
        module Presenters
          # Writes the outcome of a parity check to the given output stream.
          class CsvParityPresenter
            # stdout - object responding to puts.
            def initialize(stdout:)
              @stdout = stdout
            end

            # Prints the verdict line and the summary line; on a mismatch it
            # also prints the example rows for each side.
            #
            # data - Hash read via :match, :left_rows, :right_rows,
            #        :left_only_count, :right_only_count, :left_only_examples
            #        and :right_only_examples.
            def print_summary(data)
              matched = data[:match]
              verdict = matched ? "MATCH" : "MISMATCH"
              counters = [
                "left_rows=#{data[:left_rows]}",
                "right_rows=#{data[:right_rows]}",
                "left_only=#{data[:left_only_count]}",
                "right_only=#{data[:right_only_count]}"
              ]

              @stdout.puts(verdict)
              @stdout.puts "Summary: #{counters.join(' ')}"

              # Examples are only shown when the files differ.
              unless matched
                print_examples("Left-only examples", data[:left_only_examples])
                print_examples("Right-only examples", data[:right_only_examples])
              end
            end

            private

            # Prints a labelled, indented list of examples; prints nothing
            # when the list is nil or empty.
            def print_examples(label, examples)
              return if examples.nil?
              return if examples.empty?

              @stdout.puts "#{label}:"
              examples.each do |entry|
                row = entry[:row]
                delta = entry[:count_delta]
                @stdout.puts "  #{row} (count +#{delta})"
              end
            end
          end
        end
      end
    end
  end
end
|