csvops 0.3.0.alpha → 0.4.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +56 -142
- data/docs/architecture.md +266 -0
- data/docs/release-v0.4.0-alpha.md +87 -0
- data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +93 -0
- data/lib/csvtool/application/use_cases/run_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_extraction.rb +3 -3
- data/lib/csvtool/application/use_cases/run_row_randomization.rb +3 -3
- data/lib/csvtool/cli.rb +5 -1
- data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
- data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
- data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +1 -0
- data/lib/csvtool/domain/row_session/row_source.rb +3 -0
- data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
- data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
- data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
- data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
- data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +163 -0
- data/lib/csvtool/version.rb +1 -1
- data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +113 -0
- data/test/csvtool/cli_test.rb +130 -16
- data/test/csvtool/cli_unit_test.rb +16 -3
- data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
- data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
- data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
- data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +2 -2
- data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +15 -1
- data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
- data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
- data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
- data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
- data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
- data/test/csvtool/interface/cli/menu_loop_test.rb +50 -13
- data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
- data/test/fixtures/dedupe_reference.csv +3 -0
- data/test/fixtures/dedupe_reference.tsv +3 -0
- data/test/fixtures/dedupe_reference_all.csv +5 -0
- data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
- data/test/fixtures/dedupe_reference_none.csv +2 -0
- data/test/fixtures/dedupe_reference_normalization.csv +3 -0
- data/test/fixtures/dedupe_source.csv +6 -0
- data/test/fixtures/dedupe_source.tsv +6 -0
- data/test/fixtures/dedupe_source_no_headers.csv +5 -0
- data/test/fixtures/dedupe_source_normalization.csv +4 -0
- metadata +34 -8
- data/lib/csvtool/domain/row_randomization_session/randomization_output_destination.rb +0 -31
- data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
- data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
- data/test/csvtool/domain/row_randomization_session/randomization_output_destination_test.rb +0 -21
- data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9c5a0c00272c2d10751b234384ac50ee8caa90681860906419ccdec7a6e3c110
|
|
4
|
+
data.tar.gz: 849d377bec9acd507c0fd37a75e823bb9458295e12a31a5000b9ba599084092d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ba96ce18b4e6d2fd8eb018f406c17e7b810010a788a6be1acb51a714b87dad614d822edb97f780e2c745e257bbc68c89266427876fcc4b3fee57fadb29232630
|
|
7
|
+
data.tar.gz: 378c4a47b96cf210b28f689d9ef0aa1056c95777d3128d4044d8462cf802eeb53ca148062f4244182105e4b760bbf7dcf48d19705f69f18dfb78e3fc2e935413
|
data/README.md
CHANGED
|
@@ -35,11 +35,13 @@ bundle exec csvtool menu
|
|
|
35
35
|
CSV Tool Menu
|
|
36
36
|
1. Extract column
|
|
37
37
|
2. Extract rows (range)
|
|
38
|
-
3.
|
|
38
|
+
3. Randomize rows
|
|
39
|
+
4. Dedupe using another CSV
|
|
40
|
+
5. Exit
|
|
39
41
|
>
|
|
40
42
|
```
|
|
41
43
|
|
|
42
|
-
Select `1`
|
|
44
|
+
Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, or `4` for cross-CSV dedupe.
|
|
43
45
|
|
|
44
46
|
### 3. Follow prompts
|
|
45
47
|
|
|
@@ -111,6 +113,53 @@ With Bundler:
|
|
|
111
113
|
bundle exec csvtool column /path/to/file.csv column_name
|
|
112
114
|
```
|
|
113
115
|
|
|
116
|
+
### 7. Dedupe interaction example
|
|
117
|
+
|
|
118
|
+
Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
|
|
119
|
+
|
|
120
|
+
```diff
|
|
121
|
+
CSV Tool Menu
|
|
122
|
+
1. Extract column
|
|
123
|
+
2. Extract rows (range)
|
|
124
|
+
3. Randomize rows
|
|
125
|
+
4. Dedupe using another CSV
|
|
126
|
+
5. Exit
|
|
127
|
+
+> 4
|
|
128
|
+
CSV file path: /tmp/source.csv
|
|
129
|
+
Source CSV separator:
|
|
130
|
+
Choose separator:
|
|
131
|
+
1. comma (,)
|
|
132
|
+
2. tab (\t)
|
|
133
|
+
3. semicolon (;)
|
|
134
|
+
4. pipe (|)
|
|
135
|
+
5. custom
|
|
136
|
+
+Separator choice [1]: 1
|
|
137
|
+
Source headers present? [Y/n]:
|
|
138
|
+
Reference CSV file path: /tmp/reference.csv
|
|
139
|
+
Reference CSV separator:
|
|
140
|
+
Choose separator:
|
|
141
|
+
1. comma (,)
|
|
142
|
+
2. tab (\t)
|
|
143
|
+
3. semicolon (;)
|
|
144
|
+
4. pipe (|)
|
|
145
|
+
5. custom
|
|
146
|
+
+Separator choice [1]: 1
|
|
147
|
+
Reference headers present? [Y/n]:
|
|
148
|
+
Source key column name: customer_id
|
|
149
|
+
Reference key column name: external_id
|
|
150
|
+
Trim whitespace before matching? [Y/n]:
|
|
151
|
+
Case-insensitive matching? [y/N]:
|
|
152
|
+
Output destination:
|
|
153
|
+
1. console
|
|
154
|
+
2. file
|
|
155
|
+
+Output destination [1]: 1
|
|
156
|
+
-
|
|
157
|
+
-customer_id,name
|
|
158
|
+
-1,Alice
|
|
159
|
+
-3,Cara
|
|
160
|
+
-Summary: source_rows=5 removed_rows=3 kept_rows=2
|
|
161
|
+
```
|
|
162
|
+
|
|
114
163
|
## Testing
|
|
115
164
|
|
|
116
165
|
Run tests:
|
|
@@ -127,7 +176,7 @@ bundle exec rake test
|
|
|
127
176
|
|
|
128
177
|
## Alpha release
|
|
129
178
|
|
|
130
|
-
Current prerelease version: `0.3.0.alpha`
|
|
179
|
+
Current prerelease version: `0.4.0.alpha`
|
|
131
180
|
|
|
132
181
|
Install prerelease from RubyGems:
|
|
133
182
|
|
|
@@ -137,146 +186,11 @@ gem install csvops --pre
|
|
|
137
186
|
|
|
138
187
|
Release runbook:
|
|
139
188
|
|
|
140
|
-
- `docs/release-v0.3.0-alpha.md`
|
|
141
|
-
|
|
142
|
-
## Architecture
|
|
143
|
-
|
|
144
|
-
The codebase follows a DDD-lite layered structure:
|
|
145
|
-
|
|
146
|
-
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, and `RandomizationSession` aggregates + supporting entities/value objects).
|
|
147
|
-
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`).
|
|
148
|
-
- `infrastructure/`: CSV reading/streaming and output adapters (console/file).
|
|
149
|
-
- `interface/cli/`: menu, prompts, and user-facing error presentation.
|
|
150
|
-
- `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
|
|
151
|
-
|
|
152
|
-
## Domain model
|
|
153
|
-
|
|
154
|
-
Bounded contexts: `Column Extraction`, `Row Extraction`, and `Row Randomization`.
|
|
155
|
-
|
|
156
|
-
### Column Extraction
|
|
157
|
-
|
|
158
|
-
- Aggregate root: `ColumnSession`
|
|
159
|
-
- Controls extraction state transitions (`start`, `with_preview`, `confirm!`, `with_output_destination`).
|
|
160
|
-
- Enforces session-level invariants.
|
|
161
|
-
- Entities:
|
|
162
|
-
- `CsvSource` (file path + `Separator`)
|
|
163
|
-
- `ColumnSelection` (chosen header)
|
|
164
|
-
- Value objects:
|
|
165
|
-
- `Separator`
|
|
166
|
-
- `ExtractionOptions` (`skip_blanks`, `preview_limit`)
|
|
167
|
-
- `Preview` (list of `ExtractionValue`)
|
|
168
|
-
- `ExtractionValue`
|
|
169
|
-
- `OutputDestination` (`console` or `file(path)`)
|
|
170
|
-
- Application service:
|
|
171
|
-
- `Application::UseCases::RunExtraction` orchestrates one extraction request.
|
|
172
|
-
- Infrastructure adapters:
|
|
173
|
-
- `Infrastructure::CSV::HeaderReader`
|
|
174
|
-
- `Infrastructure::CSV::ValueStreamer`
|
|
175
|
-
- `Infrastructure::Output::ConsoleWriter`
|
|
176
|
-
- `Infrastructure::Output::CsvFileWriter`
|
|
177
|
-
- Interface adapters:
|
|
178
|
-
- `Interface::CLI::MenuLoop`
|
|
179
|
-
- `Interface::CLI::Prompts::*`
|
|
180
|
-
- `Interface::CLI::Errors::Presenter`
|
|
181
|
-
|
|
182
|
-
```mermaid
|
|
183
|
-
flowchart LR
|
|
184
|
-
UI["Interface CLI\n(Menu + Prompts + Errors)"] --> APP["Application Use Case\nRunExtraction"]
|
|
185
|
-
APP --> AGG["Domain Aggregate\nColumnSession"]
|
|
186
|
-
|
|
187
|
-
AGG --> E1["Entity\nCsvSource"]
|
|
188
|
-
AGG --> E2["Entity\nColumnSelection"]
|
|
189
|
-
AGG --> V1["Value Objects\nSeparator / ExtractionOptions / Preview / OutputDestination / ExtractionValue"]
|
|
190
|
-
|
|
191
|
-
APP --> INFCSV["Infrastructure CSV\nHeaderReader + ValueStreamer"]
|
|
192
|
-
APP --> INFOUT["Infrastructure Output\nConsoleWriter + CsvFileWriter"]
|
|
193
|
-
```
|
|
189
|
+
- `docs/release-v0.4.0-alpha.md`
|
|
194
190
|
|
|
195
|
-
### Row Extraction
|
|
196
|
-
|
|
197
|
-
Core DDD structure:
|
|
198
|
-
|
|
199
|
-
- Aggregate root: `RowSession`
|
|
200
|
-
- Captures one row-range extraction request.
|
|
201
|
-
- Holds selected source, requested range, and output destination.
|
|
202
|
-
- Entity:
|
|
203
|
-
- `RowSource` (file path + separator)
|
|
204
|
-
- Value objects:
|
|
205
|
-
- `RowRange` (`start_row`, `end_row`) plus row-range validation errors
|
|
206
|
-
- `RowOutputDestination` (`console` or `file(path)`)
|
|
207
|
-
- Application service:
|
|
208
|
-
- `Application::UseCases::RunRowExtraction` orchestrates row-range extraction.
|
|
209
|
-
- Infrastructure adapters:
|
|
210
|
-
- `Infrastructure::CSV::HeaderReader`
|
|
211
|
-
- `Infrastructure::CSV::RowStreamer`
|
|
212
|
-
- `Infrastructure::Output::CsvRowConsoleWriter`
|
|
213
|
-
- `Infrastructure::Output::CsvRowFileWriter`
|
|
214
|
-
- Interface adapters:
|
|
215
|
-
- `Interface::CLI::MenuLoop`
|
|
216
|
-
- `Interface::CLI::Prompts::*`
|
|
217
|
-
- `Interface::CLI::Errors::Presenter`
|
|
218
|
-
|
|
219
|
-
```mermaid
|
|
220
|
-
flowchart LR
|
|
221
|
-
UI2["Interface CLI\n(Menu + Prompts + Errors)"] --> APP2["Application Use Case\nRunRowExtraction"]
|
|
222
|
-
APP2 --> AGG2["Domain Aggregate\nRowSession"]
|
|
223
|
-
|
|
224
|
-
AGG2 --> E3["Entity\nRowSource"]
|
|
225
|
-
AGG2 --> V2["Value Objects\nRowRange / RowOutputDestination"]
|
|
226
|
-
|
|
227
|
-
APP2 --> INFCSV2["Infrastructure CSV\nHeaderReader + RowStreamer"]
|
|
228
|
-
APP2 --> INFOUT2["Infrastructure Output\nCsvRowConsoleWriter + CsvRowFileWriter"]
|
|
229
|
-
```
|
|
230
191
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
Core DDD structure:
|
|
234
|
-
|
|
235
|
-
- Aggregate root: `RandomizationSession`
|
|
236
|
-
- Captures one randomization request from source + options + output destination.
|
|
237
|
-
- Entity:
|
|
238
|
-
- `RandomizationSource` (file path + separator + header mode)
|
|
239
|
-
- Value objects:
|
|
240
|
-
- `RandomizationOptions` (optional deterministic `seed`)
|
|
241
|
-
- `RandomizationOutputDestination` (`console` or `file(path)`)
|
|
242
|
-
- Application service:
|
|
243
|
-
- `Application::UseCases::RunRowRandomization` orchestrates row randomization.
|
|
244
|
-
- Infrastructure adapters:
|
|
245
|
-
- `Infrastructure::CSV::HeaderReader`
|
|
246
|
-
- `Infrastructure::CSV::RowRandomizer` (external chunked `RAND + sort` + merge)
|
|
247
|
-
- Interface adapters:
|
|
248
|
-
- `Interface::CLI::MenuLoop`
|
|
249
|
-
- `Interface::CLI::Prompts::*`
|
|
250
|
-
- `Interface::CLI::Errors::Presenter`
|
|
251
|
-
|
|
252
|
-
```mermaid
|
|
253
|
-
flowchart LR
|
|
254
|
-
UI3["Interface CLI\n(Menu + Prompts + Errors)"] --> APP3["Application Use Case\nRunRowRandomization"]
|
|
255
|
-
APP3 --> AGG3["Domain Aggregate\nRandomizationSession"]
|
|
256
|
-
|
|
257
|
-
AGG3 --> E4["Entity\nRandomizationSource"]
|
|
258
|
-
AGG3 --> V3["Value Objects\nRandomizationOptions / RandomizationOutputDestination"]
|
|
259
|
-
|
|
260
|
-
APP3 --> INFCSV3["Infrastructure CSV\nHeaderReader + RowRandomizer"]
|
|
261
|
-
```
|
|
192
|
+
## Architecture
|
|
262
193
|
|
|
263
|
-
|
|
194
|
+
Full architecture and domain documentation lives in:
|
|
264
195
|
|
|
265
|
-
|
|
266
|
-
bin/tool # CLI entrypoint
|
|
267
|
-
lib/csvtool/cli.rb
|
|
268
|
-
lib/csvtool/domain/column_session/*
|
|
269
|
-
lib/csvtool/domain/row_session/*
|
|
270
|
-
lib/csvtool/domain/row_randomization_session/*
|
|
271
|
-
lib/csvtool/application/use_cases/run_extraction.rb
|
|
272
|
-
lib/csvtool/application/use_cases/run_row_extraction.rb
|
|
273
|
-
lib/csvtool/application/use_cases/run_row_randomization.rb
|
|
274
|
-
lib/csvtool/infrastructure/csv/*
|
|
275
|
-
lib/csvtool/infrastructure/output/*
|
|
276
|
-
lib/csvtool/interface/cli/menu_loop.rb
|
|
277
|
-
lib/csvtool/interface/cli/prompts/*
|
|
278
|
-
lib/csvtool/interface/cli/errors/presenter.rb
|
|
279
|
-
test/csvtool/cli_test.rb # end-to-end workflow tests
|
|
280
|
-
test/csvtool/**/*_test.rb # focused unit tests by component folder
|
|
281
|
-
test/test_helper.rb
|
|
282
|
-
```
|
|
196
|
+
- [`docs/architecture.md`](docs/architecture.md)
|
|
data/docs/architecture.md
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# Architecture
|
|
2
|
+
|
|
3
|
+
The codebase follows a DDD-lite layered structure:
|
|
4
|
+
|
|
5
|
+
- `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
|
|
6
|
+
- `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`).
|
|
7
|
+
- `infrastructure/`: CSV reading/streaming and output adapters (console/file), plus cross-CSV dedupe adapter.
|
|
8
|
+
- `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
|
|
9
|
+
- `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
|
|
10
|
+
|
|
11
|
+
## Domain model
|
|
12
|
+
|
|
13
|
+
Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, and `Cross-CSV Dedupe`.
|
|
14
|
+
|
|
15
|
+
### Cross-CSV Dedupe (Large-file behavior)
|
|
16
|
+
|
|
17
|
+
- Workflow: remove rows from a source CSV when source key matches a key from a reference CSV.
|
|
18
|
+
- Scaling strategy:
|
|
19
|
+
- Reference CSV keys are loaded into a `Set` for fast membership checks.
|
|
20
|
+
- Source CSV rows are streamed directly to the selected output destination (console or file).
|
|
21
|
+
- Memory tradeoff:
|
|
22
|
+
- Memory is dominated by the number of unique keys in the reference CSV.
|
|
23
|
+
- Source-row memory stays bounded because retained rows are not accumulated in memory before writing.
|
|
24
|
+
|
|
25
|
+
### Column Extraction
|
|
26
|
+
|
|
27
|
+
- Aggregate root: `ColumnSession`
|
|
28
|
+
- Controls extraction state transitions (`start`, `with_preview`, `confirm!`, `with_output_destination`).
|
|
29
|
+
- Enforces session-level invariants.
|
|
30
|
+
- Entities:
|
|
31
|
+
- `CsvSource` (file path + `Separator`)
|
|
32
|
+
- `ColumnSelection` (chosen header)
|
|
33
|
+
- Value objects:
|
|
34
|
+
- `Separator`
|
|
35
|
+
- `ExtractionOptions` (`skip_blanks`, `preview_limit`)
|
|
36
|
+
- `Preview` (list of `ExtractionValue`)
|
|
37
|
+
- `ExtractionValue`
|
|
38
|
+
- Shared `OutputDestination` (`console` or `file(path)`)
|
|
39
|
+
- Application service:
|
|
40
|
+
- `Application::UseCases::RunExtraction` orchestrates one extraction request.
|
|
41
|
+
- Infrastructure adapters:
|
|
42
|
+
- `Infrastructure::CSV::HeaderReader`
|
|
43
|
+
- `Infrastructure::CSV::ValueStreamer`
|
|
44
|
+
- `Infrastructure::Output::ConsoleWriter`
|
|
45
|
+
- `Infrastructure::Output::CsvFileWriter`
|
|
46
|
+
- Interface adapters:
|
|
47
|
+
- `Interface::CLI::MenuLoop`
|
|
48
|
+
- `Interface::CLI::Prompts::*`
|
|
49
|
+
- `Interface::CLI::Errors::Presenter`
|
|
50
|
+
|
|
51
|
+
```mermaid
|
|
52
|
+
classDiagram
|
|
53
|
+
direction LR
|
|
54
|
+
class MenuLoop
|
|
55
|
+
class Prompts
|
|
56
|
+
class Errors
|
|
57
|
+
class RunExtraction
|
|
58
|
+
class ColumnSession
|
|
59
|
+
class CsvSource
|
|
60
|
+
class ColumnSelection
|
|
61
|
+
class ExtractionOptions
|
|
62
|
+
class Preview
|
|
63
|
+
class ExtractionValue
|
|
64
|
+
class OutputDestination
|
|
65
|
+
class HeaderReader
|
|
66
|
+
class ValueStreamer
|
|
67
|
+
class ConsoleWriter
|
|
68
|
+
class CsvFileWriter
|
|
69
|
+
|
|
70
|
+
MenuLoop --> RunExtraction : invokes
|
|
71
|
+
Prompts --> RunExtraction : provides input
|
|
72
|
+
RunExtraction --> Errors : reports failures
|
|
73
|
+
RunExtraction --> ColumnSession : orchestrates
|
|
74
|
+
ColumnSession o-- CsvSource
|
|
75
|
+
ColumnSession o-- ColumnSelection
|
|
76
|
+
ColumnSession o-- ExtractionOptions
|
|
77
|
+
ColumnSession o-- Preview
|
|
78
|
+
Preview o-- ExtractionValue
|
|
79
|
+
ColumnSession o-- OutputDestination
|
|
80
|
+
RunExtraction --> HeaderReader
|
|
81
|
+
RunExtraction --> ValueStreamer
|
|
82
|
+
RunExtraction --> ConsoleWriter
|
|
83
|
+
RunExtraction --> CsvFileWriter
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Row Extraction
|
|
87
|
+
|
|
88
|
+
Core DDD structure:
|
|
89
|
+
|
|
90
|
+
- Aggregate root: `RowSession`
|
|
91
|
+
- Captures one row-range extraction request.
|
|
92
|
+
- Holds selected source, requested range, and output destination.
|
|
93
|
+
- Entity:
|
|
94
|
+
- `RowSource` (file path + separator)
|
|
95
|
+
- Value objects:
|
|
96
|
+
- `RowRange` (`start_row`, `end_row`) plus row-range validation errors
|
|
97
|
+
- Shared `OutputDestination` (`console` or `file(path)`)
|
|
98
|
+
- Application service:
|
|
99
|
+
- `Application::UseCases::RunRowExtraction` orchestrates row-range extraction.
|
|
100
|
+
- Infrastructure adapters:
|
|
101
|
+
- `Infrastructure::CSV::HeaderReader`
|
|
102
|
+
- `Infrastructure::CSV::RowStreamer`
|
|
103
|
+
- `Infrastructure::Output::CsvRowConsoleWriter`
|
|
104
|
+
- `Infrastructure::Output::CsvRowFileWriter`
|
|
105
|
+
- Interface adapters:
|
|
106
|
+
- `Interface::CLI::MenuLoop`
|
|
107
|
+
- `Interface::CLI::Prompts::*`
|
|
108
|
+
- `Interface::CLI::Errors::Presenter`
|
|
109
|
+
|
|
110
|
+
```mermaid
|
|
111
|
+
classDiagram
|
|
112
|
+
direction LR
|
|
113
|
+
class MenuLoop
|
|
114
|
+
class Prompts
|
|
115
|
+
class Errors
|
|
116
|
+
class RunRowExtraction
|
|
117
|
+
class RowSession
|
|
118
|
+
class RowSource
|
|
119
|
+
class RowRange
|
|
120
|
+
class OutputDestination
|
|
121
|
+
class HeaderReader
|
|
122
|
+
class RowStreamer
|
|
123
|
+
class CsvRowConsoleWriter
|
|
124
|
+
class CsvRowFileWriter
|
|
125
|
+
|
|
126
|
+
MenuLoop --> RunRowExtraction : invokes
|
|
127
|
+
Prompts --> RunRowExtraction : provides input
|
|
128
|
+
RunRowExtraction --> Errors : reports failures
|
|
129
|
+
RunRowExtraction --> RowSession : orchestrates
|
|
130
|
+
RowSession o-- RowSource
|
|
131
|
+
RowSession o-- RowRange
|
|
132
|
+
RowSession o-- OutputDestination
|
|
133
|
+
RunRowExtraction --> HeaderReader
|
|
134
|
+
RunRowExtraction --> RowStreamer
|
|
135
|
+
RunRowExtraction --> CsvRowConsoleWriter
|
|
136
|
+
RunRowExtraction --> CsvRowFileWriter
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Row Randomization
|
|
140
|
+
|
|
141
|
+
Core DDD structure:
|
|
142
|
+
|
|
143
|
+
- Aggregate root: `RandomizationSession`
|
|
144
|
+
- Captures one randomization request from source + options + output destination.
|
|
145
|
+
- Entity:
|
|
146
|
+
- `RandomizationSource` (file path + separator + header mode)
|
|
147
|
+
- Value objects:
|
|
148
|
+
- `RandomizationOptions` (optional deterministic `seed`)
|
|
149
|
+
- Shared `OutputDestination` (`console` or `file(path)`)
|
|
150
|
+
- Application service:
|
|
151
|
+
- `Application::UseCases::RunRowRandomization` orchestrates row randomization.
|
|
152
|
+
- Infrastructure adapters:
|
|
153
|
+
- `Infrastructure::CSV::HeaderReader`
|
|
154
|
+
- `Infrastructure::CSV::RowRandomizer` (external chunked `RAND + sort` + merge)
|
|
155
|
+
- Interface adapters:
|
|
156
|
+
- `Interface::CLI::MenuLoop`
|
|
157
|
+
- `Interface::CLI::Prompts::*`
|
|
158
|
+
- `Interface::CLI::Errors::Presenter`
|
|
159
|
+
|
|
160
|
+
```mermaid
|
|
161
|
+
classDiagram
|
|
162
|
+
direction LR
|
|
163
|
+
class MenuLoop
|
|
164
|
+
class Prompts
|
|
165
|
+
class Errors
|
|
166
|
+
class RunRowRandomization
|
|
167
|
+
class RandomizationSession
|
|
168
|
+
class RandomizationSource
|
|
169
|
+
class RandomizationOptions
|
|
170
|
+
class OutputDestination
|
|
171
|
+
class HeaderReader
|
|
172
|
+
class RowRandomizer
|
|
173
|
+
|
|
174
|
+
MenuLoop --> RunRowRandomization : invokes
|
|
175
|
+
Prompts --> RunRowRandomization : provides input
|
|
176
|
+
RunRowRandomization --> Errors : reports failures
|
|
177
|
+
RunRowRandomization --> RandomizationSession : orchestrates
|
|
178
|
+
RandomizationSession o-- RandomizationSource
|
|
179
|
+
RandomizationSession o-- RandomizationOptions
|
|
180
|
+
RandomizationSession o-- OutputDestination
|
|
181
|
+
RunRowRandomization --> HeaderReader
|
|
182
|
+
RunRowRandomization --> RowRandomizer
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Cross-CSV Dedupe
|
|
186
|
+
|
|
187
|
+
Core DDD structure:
|
|
188
|
+
|
|
189
|
+
- Aggregate root: `CrossCsvDedupeSession`
|
|
190
|
+
- Captures one dedupe request with source/reference profiles, key mapping, match options, and output destination.
|
|
191
|
+
- Entities:
|
|
192
|
+
- `CsvProfile` (path + separator + header mode) for source and reference CSVs.
|
|
193
|
+
- `KeyMapping` (source selector + reference selector).
|
|
194
|
+
- Value objects:
|
|
195
|
+
- `ColumnSelector` (header name or 1-based index mode)
|
|
196
|
+
- `MatchOptions` (`trim_whitespace`, `case_insensitive`, plus normalization behavior)
|
|
197
|
+
- Shared `OutputDestination` (`console` or `file(path)`)
|
|
198
|
+
- Application service:
|
|
199
|
+
- `Application::UseCases::RunCrossCsvDedupe` orchestrates dedupe workflow.
|
|
200
|
+
- Infrastructure adapters:
|
|
201
|
+
- `Infrastructure::CSV::HeaderReader`
|
|
202
|
+
- `Infrastructure::CSV::SelectorValidator`
|
|
203
|
+
- `Infrastructure::CSV::CrossCsvDeduper` (streams source rows while checking membership against reference key set)
|
|
204
|
+
- Interface adapters:
|
|
205
|
+
- `Interface::CLI::MenuLoop`
|
|
206
|
+
- `Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow`
|
|
207
|
+
- `Interface::CLI::Prompts::*`
|
|
208
|
+
- `Interface::CLI::Errors::Presenter`
|
|
209
|
+
|
|
210
|
+
```mermaid
|
|
211
|
+
classDiagram
|
|
212
|
+
direction LR
|
|
213
|
+
class MenuLoop
|
|
214
|
+
class RunCrossCsvDedupeWorkflow
|
|
215
|
+
class Prompts
|
|
216
|
+
class Errors
|
|
217
|
+
class RunCrossCsvDedupe
|
|
218
|
+
class CrossCsvDedupeSession
|
|
219
|
+
class CsvProfile
|
|
220
|
+
class KeyMapping
|
|
221
|
+
class ColumnSelector
|
|
222
|
+
class MatchOptions
|
|
223
|
+
class OutputDestination
|
|
224
|
+
class HeaderReader
|
|
225
|
+
class SelectorValidator
|
|
226
|
+
class CrossCsvDeduper
|
|
227
|
+
|
|
228
|
+
MenuLoop --> RunCrossCsvDedupeWorkflow : invokes
|
|
229
|
+
Prompts --> RunCrossCsvDedupeWorkflow : provides input
|
|
230
|
+
RunCrossCsvDedupeWorkflow --> Errors : reports failures
|
|
231
|
+
RunCrossCsvDedupeWorkflow --> RunCrossCsvDedupe : calls
|
|
232
|
+
RunCrossCsvDedupe --> CrossCsvDedupeSession : orchestrates
|
|
233
|
+
CrossCsvDedupeSession o-- CsvProfile
|
|
234
|
+
CrossCsvDedupeSession o-- KeyMapping
|
|
235
|
+
KeyMapping o-- ColumnSelector
|
|
236
|
+
CrossCsvDedupeSession o-- MatchOptions
|
|
237
|
+
CrossCsvDedupeSession o-- OutputDestination
|
|
238
|
+
RunCrossCsvDedupe --> HeaderReader
|
|
239
|
+
RunCrossCsvDedupe --> SelectorValidator
|
|
240
|
+
RunCrossCsvDedupe --> CrossCsvDeduper
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## Project layout
|
|
244
|
+
|
|
245
|
+
```text
|
|
246
|
+
bin/tool # CLI entrypoint
|
|
247
|
+
lib/csvtool/cli.rb
|
|
248
|
+
lib/csvtool/domain/column_session/*
|
|
249
|
+
lib/csvtool/domain/row_session/*
|
|
250
|
+
lib/csvtool/domain/row_randomization_session/*
|
|
251
|
+
lib/csvtool/domain/cross_csv_dedupe_session/*
|
|
252
|
+
lib/csvtool/domain/shared/output_destination.rb
|
|
253
|
+
lib/csvtool/application/use_cases/run_extraction.rb
|
|
254
|
+
lib/csvtool/application/use_cases/run_row_extraction.rb
|
|
255
|
+
lib/csvtool/application/use_cases/run_row_randomization.rb
|
|
256
|
+
lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
|
|
257
|
+
lib/csvtool/infrastructure/csv/*
|
|
258
|
+
lib/csvtool/infrastructure/output/*
|
|
259
|
+
lib/csvtool/interface/cli/menu_loop.rb
|
|
260
|
+
lib/csvtool/interface/cli/workflows/*
|
|
261
|
+
lib/csvtool/interface/cli/prompts/*
|
|
262
|
+
lib/csvtool/interface/cli/errors/presenter.rb
|
|
263
|
+
test/csvtool/cli_test.rb # end-to-end workflow tests
|
|
264
|
+
test/csvtool/**/*_test.rb # focused unit tests by component folder
|
|
265
|
+
test/test_helper.rb
|
|
266
|
+
```
|
|
data/docs/release-v0.4.0-alpha.md
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Release Checklist: v0.4.0-alpha
|
|
2
|
+
|
|
3
|
+
## 1. Verify environment
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
ruby -v
|
|
7
|
+
bundle -v
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Expected:
|
|
11
|
+
- Ruby `3.3.0`
|
|
12
|
+
|
|
13
|
+
## 2. Install dependencies
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 3. Run quality checks
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
bundle exec rake test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 4. Smoke test CLI commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
bundle exec csvtool menu
|
|
29
|
+
bundle exec csvtool column test/fixtures/sample_people.csv name
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 5. Smoke test workflows
|
|
33
|
+
|
|
34
|
+
### Row randomization workflow
|
|
35
|
+
|
|
36
|
+
Use menu option `3` (`Randomize rows`) and verify:
|
|
37
|
+
- headered CSV output keeps header in first row
|
|
38
|
+
- seeded mode is reproducible
|
|
39
|
+
- file output path writes valid CSV
|
|
40
|
+
- headerless mode randomizes all rows
|
|
41
|
+
|
|
42
|
+
### Cross-CSV dedupe workflow
|
|
43
|
+
|
|
44
|
+
Use menu option `4` (`Dedupe using another CSV`) and verify:
|
|
45
|
+
- headered + comma happy path produces expected retained rows
|
|
46
|
+
- headerless + index selectors work
|
|
47
|
+
- TSV separators work
|
|
48
|
+
- normalization toggles (`trim`, `case-insensitive`) behave as expected
|
|
49
|
+
- diagnostics render for `no matches` and `all removed`
|
|
50
|
+
- file output mode writes expected CSV
|
|
51
|
+
|
|
52
|
+
## 6. Build and validate gem package
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
gem build csvops.gemspec
|
|
56
|
+
gem install ./csvops-0.4.0.alpha.gem
|
|
57
|
+
csvtool menu
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## 7. Commit release prep
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git add -A
|
|
64
|
+
git commit -m "chore(release): prepare v0.4.0-alpha"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## 8. Tag release
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
git tag -a v0.4.0-alpha -m "v0.4.0-alpha"
|
|
71
|
+
git push origin main --tags
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 9. Publish gem (optional for alpha)
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
gem push csvops-0.4.0.alpha.gem
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## 10. Create GitHub release
|
|
81
|
+
|
|
82
|
+
Create release `v0.4.0-alpha` with:
|
|
83
|
+
- Cross-CSV dedupe workflow with normalization options and large-file streaming behavior
|
|
84
|
+
- Dedupe domain model (`CrossCsvDedupeSession`) with stronger invariants
|
|
85
|
+
- Shared-kernel `OutputDestination` value object across workflows
|
|
86
|
+
- Architecture/docs split (`README` + `docs/architecture.md`) with UML diagrams
|
|
87
|
+
- Dedupe boundary cleanup: CLI workflow (`RunCrossCsvDedupeWorkflow`) and application use-case separation
|
|
data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "csvtool/infrastructure/csv/header_reader"
|
|
5
|
+
require "csvtool/infrastructure/csv/cross_csv_deduper"
|
|
6
|
+
require "csvtool/infrastructure/csv/selector_validator"
|
|
7
|
+
|
|
8
|
+
module Csvtool
  module Application
    module UseCases
      # Application use case for the cross-CSV dedupe workflow.
      #
      # Given a session describing a source CSV, a reference CSV, a key
      # mapping, match options and an output destination, it validates both
      # key selectors, then asks the deduper for the retained source rows and
      # forwards them either to caller-supplied callbacks or to an output
      # CSV file.
      class RunCrossCsvDedupe
        # Outcome of a run. +ok+ is the success flag, +error+ is a symbolic
        # failure code (nil on success) and +data+ is a Hash payload.
        Result = Struct.new(:ok, :error, :data, keyword_init: true) do
          def ok?
            ok
          end
        end

        # @param header_reader [#call] responds to
        #   +call(file_path:, col_sep:)+ and returns the header row.
        # @param deduper [#each_retained] yields each retained row's fields
        #   and returns the run statistics.
        # @param selector_validator [#valid?] responds to
        #   +valid?(profile:, selector:)+.
        def initialize(
          header_reader: Infrastructure::CSV::HeaderReader.new,
          deduper: Infrastructure::CSV::CrossCsvDeduper.new,
          selector_validator: Infrastructure::CSV::SelectorValidator.new(header_reader: header_reader)
        )
          @header_reader = header_reader
          @deduper = deduper
          @selector_validator = selector_validator
        end

        # Runs one dedupe request.
        #
        # @param session the dedupe session; must expose +source+,
        #   +reference+, +key_mapping+, +match_options+ and
        #   +output_destination+.
        # @param on_header [#call, nil] called once with the source headers
        #   (console mode only, and only when the source has headers).
        # @param on_row [#call, nil] called with the fields of every retained
        #   row (console mode only).
        # @return [Result] success carries +:stats+ (plus +:output_path+ in
        #   file mode); failure carries one of +:column_not_found+,
        #   +:could_not_parse_csv+, +:cannot_read_file+ or
        #   +:cannot_write_output_file+.
        def call(session:, on_header: nil, on_row: nil)
          # current_read_path tracks which input is being read so that a
          # permission error can be reported against the right file.
          current_read_path = session.source.path
          unless @selector_validator.valid?(profile: session.source, selector: session.key_mapping.source_selector)
            return failure(:column_not_found)
          end

          current_read_path = session.reference.path
          unless @selector_validator.valid?(profile: session.reference, selector: session.key_mapping.reference_selector)
            return failure(:column_not_found)
          end

          # Switch back to the source path BEFORE reading its headers;
          # otherwise an EACCES raised by the header read would be blamed on
          # the reference file.
          current_read_path = session.source.path
          source_headers = read_source_headers(session)

          return write_file(session: session, source_headers: source_headers) if session.output_destination.file?

          on_header.call(source_headers) if on_header && source_headers
          stats = @deduper.each_retained(**dedupe_options(session)) do |fields|
            on_row.call(fields) if on_row
          end
          success(stats: stats)
        rescue CSV::MalformedCSVError
          failure(:could_not_parse_csv)
        rescue Errno::EACCES
          failure(:cannot_read_file, path: current_read_path || session.source.path)
        end

        private

        # Returns the source header row, or nil when the source profile
        # declares that it has no headers.
        def read_source_headers(session)
          return nil unless session.source.headers_present?

          @header_reader.call(file_path: session.source.path, col_sep: session.source.separator)
        end

        # Streams retained rows into the output CSV file, using the source
        # separator and writing the header row when one is available.
        #
        # NOTE(review): the rescue below also catches EACCES/ENOENT raised
        # while the deduper reads its inputs inside the block, so a read-side
        # failure is reported as :cannot_write_output_file. Left unchanged
        # because callers may rely on these errors not propagating - confirm.
        def write_file(session:, source_headers:)
          output_path = session.output_destination.path
          stats = nil
          ::CSV.open(
            output_path,
            "w",
            write_headers: !source_headers.nil?,
            headers: source_headers,
            col_sep: session.source.separator
          ) do |csv|
            stats = @deduper.each_retained(**dedupe_options(session)) { |fields| csv << fields }
          end
          success(stats: stats, output_path: output_path)
        rescue Errno::EACCES, Errno::ENOENT => e
          failure(:cannot_write_output_file, path: output_path, error_class: e.class)
        end

        # Keyword arguments handed to the deduper for this session.
        def dedupe_options(session)
          {
            source_path: session.source.path,
            reference_path: session.reference.path,
            source_selector: session.key_mapping.source_selector,
            reference_selector: session.key_mapping.reference_selector,
            source_col_sep: session.source.separator,
            reference_col_sep: session.reference.separator,
            match_options: session.match_options
          }
        end

        # Builds a successful Result carrying +data+.
        def success(data)
          Result.new(ok: true, error: nil, data: data)
        end

        # Builds a failed Result with the given error code and payload.
        def failure(code, data = {})
          Result.new(ok: false, error: code, data: data)
        end
      end
    end
  end
end
|