csvops 0.3.0.alpha → 0.4.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +56 -142
  3. data/docs/architecture.md +266 -0
  4. data/docs/release-v0.4.0-alpha.md +87 -0
  5. data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +93 -0
  6. data/lib/csvtool/application/use_cases/run_extraction.rb +3 -3
  7. data/lib/csvtool/application/use_cases/run_row_extraction.rb +3 -3
  8. data/lib/csvtool/application/use_cases/run_row_randomization.rb +3 -3
  9. data/lib/csvtool/cli.rb +5 -1
  10. data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
  11. data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
  12. data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
  13. data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
  14. data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
  15. data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +1 -0
  16. data/lib/csvtool/domain/row_session/row_source.rb +3 -0
  17. data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
  18. data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
  19. data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
  20. data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
  21. data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +163 -0
  22. data/lib/csvtool/version.rb +1 -1
  23. data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +113 -0
  24. data/test/csvtool/cli_test.rb +130 -16
  25. data/test/csvtool/cli_unit_test.rb +16 -3
  26. data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
  27. data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
  28. data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
  29. data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
  30. data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
  31. data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
  32. data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
  33. data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +2 -2
  34. data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +15 -1
  35. data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
  36. data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
  37. data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
  38. data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
  39. data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
  40. data/test/csvtool/interface/cli/menu_loop_test.rb +50 -13
  41. data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
  42. data/test/fixtures/dedupe_reference.csv +3 -0
  43. data/test/fixtures/dedupe_reference.tsv +3 -0
  44. data/test/fixtures/dedupe_reference_all.csv +5 -0
  45. data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
  46. data/test/fixtures/dedupe_reference_none.csv +2 -0
  47. data/test/fixtures/dedupe_reference_normalization.csv +3 -0
  48. data/test/fixtures/dedupe_source.csv +6 -0
  49. data/test/fixtures/dedupe_source.tsv +6 -0
  50. data/test/fixtures/dedupe_source_no_headers.csv +5 -0
  51. data/test/fixtures/dedupe_source_normalization.csv +4 -0
  52. metadata +34 -8
  53. data/lib/csvtool/domain/row_randomization_session/randomization_output_destination.rb +0 -31
  54. data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
  55. data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
  56. data/test/csvtool/domain/row_randomization_session/randomization_output_destination_test.rb +0 -21
  57. data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9663c50901b31a8073c4a5a0524e9e30c81c20bbe1b736af71649e60a7150a0e
4
- data.tar.gz: a622dad35eb52afeded279726d2575db0cd210c8ed4aa07650506bfd2e2b6de5
3
+ metadata.gz: 9c5a0c00272c2d10751b234384ac50ee8caa90681860906419ccdec7a6e3c110
4
+ data.tar.gz: 849d377bec9acd507c0fd37a75e823bb9458295e12a31a5000b9ba599084092d
5
5
  SHA512:
6
- metadata.gz: 28726bb66d05881caead074ce529d79db5424b85a7552f8b56cca44e891b3bb0c34cd850ff22351d6d93a5ef725e3891ccc8b7ac7e1e62d24d4e4c4d7d9b344d
7
- data.tar.gz: 0fb96011d8737fb757b30e6226b9086453aa3ef6e5def1e95b77bb8bdbd414e25b72adfe6a8e65ee0498bbb88f2a809777a0d2405057689d00b3858c73014b93
6
+ metadata.gz: ba96ce18b4e6d2fd8eb018f406c17e7b810010a788a6be1acb51a714b87dad614d822edb97f780e2c745e257bbc68c89266427876fcc4b3fee57fadb29232630
7
+ data.tar.gz: 378c4a47b96cf210b28f689d9ef0aa1056c95777d3128d4044d8462cf802eeb53ca148062f4244182105e4b760bbf7dcf48d19705f69f18dfb78e3fc2e935413
data/README.md CHANGED
@@ -35,11 +35,13 @@ bundle exec csvtool menu
35
35
  CSV Tool Menu
36
36
  1. Extract column
37
37
  2. Extract rows (range)
38
- 3. Exit
38
+ 3. Randomize rows
39
+ 4. Dedupe using another CSV
40
+ 5. Exit
39
41
  >
40
42
  ```
41
43
 
42
- Select `1` to run extraction.
44
+ Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, or `4` for cross-CSV dedupe.
43
45
 
44
46
  ### 3. Follow prompts
45
47
 
@@ -111,6 +113,53 @@ With Bundler:
111
113
  bundle exec csvtool column /path/to/file.csv column_name
112
114
  ```
113
115
 
116
+ ### 7. Dedupe interaction example
117
+
118
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
119
+
120
+ ```diff
121
+ CSV Tool Menu
122
+ 1. Extract column
123
+ 2. Extract rows (range)
124
+ 3. Randomize rows
125
+ 4. Dedupe using another CSV
126
+ 5. Exit
127
+ +> 4
128
+ CSV file path: /tmp/source.csv
129
+ Source CSV separator:
130
+ Choose separator:
131
+ 1. comma (,)
132
+ 2. tab (\t)
133
+ 3. semicolon (;)
134
+ 4. pipe (|)
135
+ 5. custom
136
+ +Separator choice [1]: 1
137
+ Source headers present? [Y/n]:
138
+ Reference CSV file path: /tmp/reference.csv
139
+ Reference CSV separator:
140
+ Choose separator:
141
+ 1. comma (,)
142
+ 2. tab (\t)
143
+ 3. semicolon (;)
144
+ 4. pipe (|)
145
+ 5. custom
146
+ +Separator choice [1]: 1
147
+ Reference headers present? [Y/n]:
148
+ Source key column name: customer_id
149
+ Reference key column name: external_id
150
+ Trim whitespace before matching? [Y/n]:
151
+ Case-insensitive matching? [y/N]:
152
+ Output destination:
153
+ 1. console
154
+ 2. file
155
+ +Output destination [1]: 1
156
+ -
157
+ -customer_id,name
158
+ -1,Alice
159
+ -3,Cara
160
+ -Summary: source_rows=5 removed_rows=3 kept_rows=2
161
+ ```
162
+
114
163
  ## Testing
115
164
 
116
165
  Run tests:
@@ -127,7 +176,7 @@ bundle exec rake test
127
176
 
128
177
  ## Alpha release
129
178
 
130
- Current prerelease version: `0.3.0.alpha`
179
+ Current prerelease version: `0.4.0.alpha`
131
180
 
132
181
  Install prerelease from RubyGems:
133
182
 
@@ -137,146 +186,11 @@ gem install csvops --pre
137
186
 
138
187
  Release runbook:
139
188
 
140
- - `docs/release-v0.3.0-alpha.md`
141
-
142
- ## Architecture
143
-
144
- The codebase follows a DDD-lite layered structure:
145
-
146
- - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, and `RandomizationSession` aggregates + supporting entities/value objects).
147
- - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`).
148
- - `infrastructure/`: CSV reading/streaming and output adapters (console/file).
149
- - `interface/cli/`: menu, prompts, and user-facing error presentation.
150
- - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
151
-
152
- ## Domain model
153
-
154
- Bounded contexts: `Column Extraction`, `Row Extraction`, and `Row Randomization`.
155
-
156
- ### Column Extraction
157
-
158
- - Aggregate root: `ColumnSession`
159
- - Controls extraction state transitions (`start`, `with_preview`, `confirm!`, `with_output_destination`).
160
- - Enforces session-level invariants.
161
- - Entities:
162
- - `CsvSource` (file path + `Separator`)
163
- - `ColumnSelection` (chosen header)
164
- - Value objects:
165
- - `Separator`
166
- - `ExtractionOptions` (`skip_blanks`, `preview_limit`)
167
- - `Preview` (list of `ExtractionValue`)
168
- - `ExtractionValue`
169
- - `OutputDestination` (`console` or `file(path)`)
170
- - Application service:
171
- - `Application::UseCases::RunExtraction` orchestrates one extraction request.
172
- - Infrastructure adapters:
173
- - `Infrastructure::CSV::HeaderReader`
174
- - `Infrastructure::CSV::ValueStreamer`
175
- - `Infrastructure::Output::ConsoleWriter`
176
- - `Infrastructure::Output::CsvFileWriter`
177
- - Interface adapters:
178
- - `Interface::CLI::MenuLoop`
179
- - `Interface::CLI::Prompts::*`
180
- - `Interface::CLI::Errors::Presenter`
181
-
182
- ```mermaid
183
- flowchart LR
184
- UI["Interface CLI\n(Menu + Prompts + Errors)"] --> APP["Application Use Case\nRunExtraction"]
185
- APP --> AGG["Domain Aggregate\nColumnSession"]
186
-
187
- AGG --> E1["Entity\nCsvSource"]
188
- AGG --> E2["Entity\nColumnSelection"]
189
- AGG --> V1["Value Objects\nSeparator / ExtractionOptions / Preview / OutputDestination / ExtractionValue"]
190
-
191
- APP --> INFCSV["Infrastructure CSV\nHeaderReader + ValueStreamer"]
192
- APP --> INFOUT["Infrastructure Output\nConsoleWriter + CsvFileWriter"]
193
- ```
189
+ - `docs/release-v0.4.0-alpha.md`
194
190
 
195
- ### Row Extraction
196
-
197
- Core DDD structure:
198
-
199
- - Aggregate root: `RowSession`
200
- - Captures one row-range extraction request.
201
- - Holds selected source, requested range, and output destination.
202
- - Entity:
203
- - `RowSource` (file path + separator)
204
- - Value objects:
205
- - `RowRange` (`start_row`, `end_row`) plus row-range validation errors
206
- - `RowOutputDestination` (`console` or `file(path)`)
207
- - Application service:
208
- - `Application::UseCases::RunRowExtraction` orchestrates row-range extraction.
209
- - Infrastructure adapters:
210
- - `Infrastructure::CSV::HeaderReader`
211
- - `Infrastructure::CSV::RowStreamer`
212
- - `Infrastructure::Output::CsvRowConsoleWriter`
213
- - `Infrastructure::Output::CsvRowFileWriter`
214
- - Interface adapters:
215
- - `Interface::CLI::MenuLoop`
216
- - `Interface::CLI::Prompts::*`
217
- - `Interface::CLI::Errors::Presenter`
218
-
219
- ```mermaid
220
- flowchart LR
221
- UI2["Interface CLI\n(Menu + Prompts + Errors)"] --> APP2["Application Use Case\nRunRowExtraction"]
222
- APP2 --> AGG2["Domain Aggregate\nRowSession"]
223
-
224
- AGG2 --> E3["Entity\nRowSource"]
225
- AGG2 --> V2["Value Objects\nRowRange / RowOutputDestination"]
226
-
227
- APP2 --> INFCSV2["Infrastructure CSV\nHeaderReader + RowStreamer"]
228
- APP2 --> INFOUT2["Infrastructure Output\nCsvRowConsoleWriter + CsvRowFileWriter"]
229
- ```
230
191
 
231
- ### Row Randomization
232
-
233
- Core DDD structure:
234
-
235
- - Aggregate root: `RandomizationSession`
236
- - Captures one randomization request from source + options + output destination.
237
- - Entity:
238
- - `RandomizationSource` (file path + separator + header mode)
239
- - Value objects:
240
- - `RandomizationOptions` (optional deterministic `seed`)
241
- - `RandomizationOutputDestination` (`console` or `file(path)`)
242
- - Application service:
243
- - `Application::UseCases::RunRowRandomization` orchestrates row randomization.
244
- - Infrastructure adapters:
245
- - `Infrastructure::CSV::HeaderReader`
246
- - `Infrastructure::CSV::RowRandomizer` (external chunked `RAND + sort` + merge)
247
- - Interface adapters:
248
- - `Interface::CLI::MenuLoop`
249
- - `Interface::CLI::Prompts::*`
250
- - `Interface::CLI::Errors::Presenter`
251
-
252
- ```mermaid
253
- flowchart LR
254
- UI3["Interface CLI\n(Menu + Prompts + Errors)"] --> APP3["Application Use Case\nRunRowRandomization"]
255
- APP3 --> AGG3["Domain Aggregate\nRandomizationSession"]
256
-
257
- AGG3 --> E4["Entity\nRandomizationSource"]
258
- AGG3 --> V3["Value Objects\nRandomizationOptions / RandomizationOutputDestination"]
259
-
260
- APP3 --> INFCSV3["Infrastructure CSV\nHeaderReader + RowRandomizer"]
261
- ```
192
+ ## Architecture
262
193
 
263
- ## Project layout
194
+ Full architecture and domain documentation lives in:
264
195
 
265
- ```text
266
- bin/tool # CLI entrypoint
267
- lib/csvtool/cli.rb
268
- lib/csvtool/domain/column_session/*
269
- lib/csvtool/domain/row_session/*
270
- lib/csvtool/domain/row_randomization_session/*
271
- lib/csvtool/application/use_cases/run_extraction.rb
272
- lib/csvtool/application/use_cases/run_row_extraction.rb
273
- lib/csvtool/application/use_cases/run_row_randomization.rb
274
- lib/csvtool/infrastructure/csv/*
275
- lib/csvtool/infrastructure/output/*
276
- lib/csvtool/interface/cli/menu_loop.rb
277
- lib/csvtool/interface/cli/prompts/*
278
- lib/csvtool/interface/cli/errors/presenter.rb
279
- test/csvtool/cli_test.rb # end-to-end workflow tests
280
- test/csvtool/**/*_test.rb # focused unit tests by component folder
281
- test/test_helper.rb
282
- ```
196
+ - [`docs/architecture.md`](docs/architecture.md)
@@ -0,0 +1,266 @@
1
+ # Architecture
2
+
3
+ The codebase follows a DDD-lite layered structure:
4
+
5
+ - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
6
+ - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`).
7
+ - `infrastructure/`: CSV reading/streaming and output adapters (console/file), plus cross-CSV dedupe adapter.
8
+ - `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
9
+ - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
10
+
11
+ ## Domain model
12
+
13
+ Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, and `Cross-CSV Dedupe`.
14
+
15
+ ### Cross-CSV Dedupe (Large-file behavior)
16
+
17
+ - Workflow: remove rows from a source CSV when source key matches a key from a reference CSV.
18
+ - Scaling strategy:
19
+ - Reference CSV keys are loaded into a `Set` for fast membership checks.
20
+ - Source CSV rows are streamed directly to the selected output destination (console or file).
21
+ - Memory tradeoff:
22
+ - Memory is dominated by the number of unique keys in the reference CSV.
23
+ - Source-row memory stays bounded because retained rows are not accumulated in memory before writing.
24
+
25
+ ### Column Extraction
26
+
27
+ - Aggregate root: `ColumnSession`
28
+ - Controls extraction state transitions (`start`, `with_preview`, `confirm!`, `with_output_destination`).
29
+ - Enforces session-level invariants.
30
+ - Entities:
31
+ - `CsvSource` (file path + `Separator`)
32
+ - `ColumnSelection` (chosen header)
33
+ - Value objects:
34
+ - `Separator`
35
+ - `ExtractionOptions` (`skip_blanks`, `preview_limit`)
36
+ - `Preview` (list of `ExtractionValue`)
37
+ - `ExtractionValue`
38
+ - Shared `OutputDestination` (`console` or `file(path)`)
39
+ - Application service:
40
+ - `Application::UseCases::RunExtraction` orchestrates one extraction request.
41
+ - Infrastructure adapters:
42
+ - `Infrastructure::CSV::HeaderReader`
43
+ - `Infrastructure::CSV::ValueStreamer`
44
+ - `Infrastructure::Output::ConsoleWriter`
45
+ - `Infrastructure::Output::CsvFileWriter`
46
+ - Interface adapters:
47
+ - `Interface::CLI::MenuLoop`
48
+ - `Interface::CLI::Prompts::*`
49
+ - `Interface::CLI::Errors::Presenter`
50
+
51
+ ```mermaid
52
+ classDiagram
53
+ direction LR
54
+ class MenuLoop
55
+ class Prompts
56
+ class Errors
57
+ class RunExtraction
58
+ class ColumnSession
59
+ class CsvSource
60
+ class ColumnSelection
61
+ class ExtractionOptions
62
+ class Preview
63
+ class ExtractionValue
64
+ class OutputDestination
65
+ class HeaderReader
66
+ class ValueStreamer
67
+ class ConsoleWriter
68
+ class CsvFileWriter
69
+
70
+ MenuLoop --> RunExtraction : invokes
71
+ Prompts --> RunExtraction : provides input
72
+ RunExtraction --> Errors : reports failures
73
+ RunExtraction --> ColumnSession : orchestrates
74
+ ColumnSession o-- CsvSource
75
+ ColumnSession o-- ColumnSelection
76
+ ColumnSession o-- ExtractionOptions
77
+ ColumnSession o-- Preview
78
+ Preview o-- ExtractionValue
79
+ ColumnSession o-- OutputDestination
80
+ RunExtraction --> HeaderReader
81
+ RunExtraction --> ValueStreamer
82
+ RunExtraction --> ConsoleWriter
83
+ RunExtraction --> CsvFileWriter
84
+ ```
85
+
86
+ ### Row Extraction
87
+
88
+ Core DDD structure:
89
+
90
+ - Aggregate root: `RowSession`
91
+ - Captures one row-range extraction request.
92
+ - Holds selected source, requested range, and output destination.
93
+ - Entity:
94
+ - `RowSource` (file path + separator)
95
+ - Value objects:
96
+ - `RowRange` (`start_row`, `end_row`) plus row-range validation errors
97
+ - Shared `OutputDestination` (`console` or `file(path)`)
98
+ - Application service:
99
+ - `Application::UseCases::RunRowExtraction` orchestrates row-range extraction.
100
+ - Infrastructure adapters:
101
+ - `Infrastructure::CSV::HeaderReader`
102
+ - `Infrastructure::CSV::RowStreamer`
103
+ - `Infrastructure::Output::CsvRowConsoleWriter`
104
+ - `Infrastructure::Output::CsvRowFileWriter`
105
+ - Interface adapters:
106
+ - `Interface::CLI::MenuLoop`
107
+ - `Interface::CLI::Prompts::*`
108
+ - `Interface::CLI::Errors::Presenter`
109
+
110
+ ```mermaid
111
+ classDiagram
112
+ direction LR
113
+ class MenuLoop
114
+ class Prompts
115
+ class Errors
116
+ class RunRowExtraction
117
+ class RowSession
118
+ class RowSource
119
+ class RowRange
120
+ class OutputDestination
121
+ class HeaderReader
122
+ class RowStreamer
123
+ class CsvRowConsoleWriter
124
+ class CsvRowFileWriter
125
+
126
+ MenuLoop --> RunRowExtraction : invokes
127
+ Prompts --> RunRowExtraction : provides input
128
+ RunRowExtraction --> Errors : reports failures
129
+ RunRowExtraction --> RowSession : orchestrates
130
+ RowSession o-- RowSource
131
+ RowSession o-- RowRange
132
+ RowSession o-- OutputDestination
133
+ RunRowExtraction --> HeaderReader
134
+ RunRowExtraction --> RowStreamer
135
+ RunRowExtraction --> CsvRowConsoleWriter
136
+ RunRowExtraction --> CsvRowFileWriter
137
+ ```
138
+
139
+ ### Row Randomization
140
+
141
+ Core DDD structure:
142
+
143
+ - Aggregate root: `RandomizationSession`
144
+ - Captures one randomization request from source + options + output destination.
145
+ - Entity:
146
+ - `RandomizationSource` (file path + separator + header mode)
147
+ - Value objects:
148
+ - `RandomizationOptions` (optional deterministic `seed`)
149
+ - Shared `OutputDestination` (`console` or `file(path)`)
150
+ - Application service:
151
+ - `Application::UseCases::RunRowRandomization` orchestrates row randomization.
152
+ - Infrastructure adapters:
153
+ - `Infrastructure::CSV::HeaderReader`
154
+ - `Infrastructure::CSV::RowRandomizer` (external chunked `RAND + sort` + merge)
155
+ - Interface adapters:
156
+ - `Interface::CLI::MenuLoop`
157
+ - `Interface::CLI::Prompts::*`
158
+ - `Interface::CLI::Errors::Presenter`
159
+
160
+ ```mermaid
161
+ classDiagram
162
+ direction LR
163
+ class MenuLoop
164
+ class Prompts
165
+ class Errors
166
+ class RunRowRandomization
167
+ class RandomizationSession
168
+ class RandomizationSource
169
+ class RandomizationOptions
170
+ class OutputDestination
171
+ class HeaderReader
172
+ class RowRandomizer
173
+
174
+ MenuLoop --> RunRowRandomization : invokes
175
+ Prompts --> RunRowRandomization : provides input
176
+ RunRowRandomization --> Errors : reports failures
177
+ RunRowRandomization --> RandomizationSession : orchestrates
178
+ RandomizationSession o-- RandomizationSource
179
+ RandomizationSession o-- RandomizationOptions
180
+ RandomizationSession o-- OutputDestination
181
+ RunRowRandomization --> HeaderReader
182
+ RunRowRandomization --> RowRandomizer
183
+ ```
184
+
185
+ ### Cross-CSV Dedupe
186
+
187
+ Core DDD structure:
188
+
189
+ - Aggregate root: `CrossCsvDedupeSession`
190
+ - Captures one dedupe request with source/reference profiles, key mapping, match options, and output destination.
191
+ - Entities:
192
+ - `CsvProfile` (path + separator + header mode) for source and reference CSVs.
193
+ - `KeyMapping` (source selector + reference selector).
194
+ - Value objects:
195
+ - `ColumnSelector` (header name or 1-based index mode)
196
+ - `MatchOptions` (`trim_whitespace`, `case_insensitive`, plus normalization behavior)
197
+ - Shared `OutputDestination` (`console` or `file(path)`)
198
+ - Application service:
199
+ - `Application::UseCases::RunCrossCsvDedupe` orchestrates dedupe workflow.
200
+ - Infrastructure adapters:
201
+ - `Infrastructure::CSV::HeaderReader`
202
+ - `Infrastructure::CSV::SelectorValidator`
203
+ - `Infrastructure::CSV::CrossCsvDeduper` (streams source rows while checking membership against reference key set)
204
+ - Interface adapters:
205
+ - `Interface::CLI::MenuLoop`
206
+ - `Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow`
207
+ - `Interface::CLI::Prompts::*`
208
+ - `Interface::CLI::Errors::Presenter`
209
+
210
+ ```mermaid
211
+ classDiagram
212
+ direction LR
213
+ class MenuLoop
214
+ class RunCrossCsvDedupeWorkflow
215
+ class Prompts
216
+ class Errors
217
+ class RunCrossCsvDedupe
218
+ class CrossCsvDedupeSession
219
+ class CsvProfile
220
+ class KeyMapping
221
+ class ColumnSelector
222
+ class MatchOptions
223
+ class OutputDestination
224
+ class HeaderReader
225
+ class SelectorValidator
226
+ class CrossCsvDeduper
227
+
228
+ MenuLoop --> RunCrossCsvDedupeWorkflow : invokes
229
+ Prompts --> RunCrossCsvDedupeWorkflow : provides input
230
+ RunCrossCsvDedupeWorkflow --> Errors : reports failures
231
+ RunCrossCsvDedupeWorkflow --> RunCrossCsvDedupe : calls
232
+ RunCrossCsvDedupe --> CrossCsvDedupeSession : orchestrates
233
+ CrossCsvDedupeSession o-- CsvProfile
234
+ CrossCsvDedupeSession o-- KeyMapping
235
+ KeyMapping o-- ColumnSelector
236
+ CrossCsvDedupeSession o-- MatchOptions
237
+ CrossCsvDedupeSession o-- OutputDestination
238
+ RunCrossCsvDedupe --> HeaderReader
239
+ RunCrossCsvDedupe --> SelectorValidator
240
+ RunCrossCsvDedupe --> CrossCsvDeduper
241
+ ```
242
+
243
+ ## Project layout
244
+
245
+ ```text
246
+ bin/tool # CLI entrypoint
247
+ lib/csvtool/cli.rb
248
+ lib/csvtool/domain/column_session/*
249
+ lib/csvtool/domain/row_session/*
250
+ lib/csvtool/domain/row_randomization_session/*
251
+ lib/csvtool/domain/cross_csv_dedupe_session/*
252
+ lib/csvtool/domain/shared/output_destination.rb
253
+ lib/csvtool/application/use_cases/run_extraction.rb
254
+ lib/csvtool/application/use_cases/run_row_extraction.rb
255
+ lib/csvtool/application/use_cases/run_row_randomization.rb
256
+ lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
257
+ lib/csvtool/infrastructure/csv/*
258
+ lib/csvtool/infrastructure/output/*
259
+ lib/csvtool/interface/cli/menu_loop.rb
260
+ lib/csvtool/interface/cli/workflows/*
261
+ lib/csvtool/interface/cli/prompts/*
262
+ lib/csvtool/interface/cli/errors/presenter.rb
263
+ test/csvtool/cli_test.rb # end-to-end workflow tests
264
+ test/csvtool/**/*_test.rb # focused unit tests by component folder
265
+ test/test_helper.rb
266
+ ```
@@ -0,0 +1,87 @@
1
+ # Release Checklist: v0.4.0-alpha
2
+
3
+ ## 1. Verify environment
4
+
5
+ ```bash
6
+ ruby -v
7
+ bundle -v
8
+ ```
9
+
10
+ Expected:
11
+ - Ruby `3.3.0`
12
+
13
+ ## 2. Install dependencies
14
+
15
+ ```bash
16
+ bundle install
17
+ ```
18
+
19
+ ## 3. Run quality checks
20
+
21
+ ```bash
22
+ bundle exec rake test
23
+ ```
24
+
25
+ ## 4. Smoke test CLI commands
26
+
27
+ ```bash
28
+ bundle exec csvtool menu
29
+ bundle exec csvtool column test/fixtures/sample_people.csv name
30
+ ```
31
+
32
+ ## 5. Smoke test workflows
33
+
34
+ ### Row randomization workflow
35
+
36
+ Use menu option `3` (`Randomize rows`) and verify:
37
+ - headered CSV output keeps header in first row
38
+ - seeded mode is reproducible
39
+ - file output path writes valid CSV
40
+ - headerless mode randomizes all rows
41
+
42
+ ### Cross-CSV dedupe workflow
43
+
44
+ Use menu option `4` (`Dedupe using another CSV`) and verify:
45
+ - headered + comma happy path produces expected retained rows
46
+ - headerless + index selectors work
47
+ - TSV separators work
48
+ - normalization toggles (`trim`, `case-insensitive`) behave as expected
49
+ - diagnostics render for `no matches` and `all removed`
50
+ - file output mode writes expected CSV
51
+
52
+ ## 6. Build and validate gem package
53
+
54
+ ```bash
55
+ gem build csvops.gemspec
56
+ gem install ./csvops-0.4.0.alpha.gem
57
+ csvtool menu
58
+ ```
59
+
60
+ ## 7. Commit release prep
61
+
62
+ ```bash
63
+ git add -A
64
+ git commit -m "chore(release): prepare v0.4.0-alpha"
65
+ ```
66
+
67
+ ## 8. Tag release
68
+
69
+ ```bash
70
+ git tag -a v0.4.0-alpha -m "v0.4.0-alpha"
71
+ git push origin main --tags
72
+ ```
73
+
74
+ ## 9. Publish gem (optional for alpha)
75
+
76
+ ```bash
77
+ gem push csvops-0.4.0.alpha.gem
78
+ ```
79
+
80
+ ## 10. Create GitHub release
81
+
82
+ Create release `v0.4.0-alpha` with:
83
+ - Cross-CSV dedupe workflow with normalization options and large-file streaming behavior
84
+ - Dedupe domain model (`CrossCsvDedupeSession`) with stronger invariants
85
+ - Shared-kernel `OutputDestination` value object across workflows
86
+ - Architecture/docs split (`README` + `docs/architecture.md`) with UML diagrams
87
+ - Dedupe boundary cleanup: CLI workflow (`RunCrossCsvDedupeWorkflow`) and application use-case separation
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "csvtool/infrastructure/csv/header_reader"
5
+ require "csvtool/infrastructure/csv/cross_csv_deduper"
6
+ require "csvtool/infrastructure/csv/selector_validator"
7
+
8
+ module Csvtool
9
+ module Application
10
+ module UseCases
11
# Application use case for the cross-CSV dedupe workflow.
#
# Validates the source and reference key selectors, optionally reads the
# source headers, then asks the deduper for the retained source rows and
# sends them either to caller-supplied callbacks (console mode) or to an
# output CSV file (file mode).
class RunCrossCsvDedupe
  # Outcome of one #call: +ok+ flag, +error+ code (nil on success) and a
  # +data+ hash carrying stats / paths / error details.
  Result = Struct.new(:ok, :error, :data, keyword_init: true) do
    def ok?
      ok
    end
  end

  # @param header_reader [#call] reads a CSV header row (file_path:, col_sep:)
  # @param deduper [#each_retained] yields retained source rows and returns stats
  # @param selector_validator [#valid?] checks a selector against a CSV profile
  def initialize(
    header_reader: Infrastructure::CSV::HeaderReader.new,
    deduper: Infrastructure::CSV::CrossCsvDeduper.new,
    selector_validator: Infrastructure::CSV::SelectorValidator.new(header_reader: header_reader)
  )
    @header_reader = header_reader
    @deduper = deduper
    @selector_validator = selector_validator
  end

  # Runs one dedupe request.
  #
  # @param session [Object] dedupe session exposing +source+, +reference+,
  #   +key_mapping+, +match_options+ and +output_destination+
  # @param on_header [#call, nil] console mode only: receives the source headers
  #   (skipped when the source has no headers)
  # @param on_row [#call, nil] console mode only: receives each retained row
  # @return [Result] success with +:stats+ (plus +:output_path+ in file mode), or
  #   failure with +:column_not_found+, +:could_not_parse_csv+,
  #   +:cannot_read_file+ or +:cannot_write_output_file+
  def call(session:, on_header: nil, on_row: nil)
    # current_read_path tracks which input is being touched so that a
    # permission error can be attributed to the right file in the rescue below.
    current_read_path = session.source.path
    return failure(:column_not_found) unless @selector_validator.valid?(profile: session.source, selector: session.key_mapping.source_selector)

    current_read_path = session.reference.path
    return failure(:column_not_found) unless @selector_validator.valid?(profile: session.reference, selector: session.key_mapping.reference_selector)

    # Switch back to the source path BEFORE reading its headers; otherwise an
    # EACCES raised by the header read would be reported against the reference file.
    current_read_path = session.source.path
    source_headers = session.source.headers_present? ? @header_reader.call(file_path: session.source.path, col_sep: session.source.separator) : nil

    if session.output_destination.file?
      write_file(session: session, source_headers: source_headers)
    else
      on_header.call(source_headers) if on_header && source_headers
      stats = @deduper.each_retained(**dedupe_options(session)) do |fields|
        on_row.call(fields) if on_row
      end
      success(stats: stats)
    end
  rescue CSV::MalformedCSVError
    failure(:could_not_parse_csv)
  rescue Errno::EACCES
    failure(:cannot_read_file, path: current_read_path || session.source.path)
  end

  private

  # Streams retained rows into the output file, using the source separator and
  # writing the source headers first when they are present.
  #
  # NOTE(review): the rescue below also covers the deduper's reads, so an
  # EACCES/ENOENT raised while reading the source or reference CSV is reported
  # as :cannot_write_output_file. Confirm with callers before narrowing it.
  def write_file(session:, source_headers:)
    stats = nil
    ::CSV.open(
      session.output_destination.path,
      "w",
      write_headers: !source_headers.nil?,
      headers: source_headers,
      col_sep: session.source.separator
    ) do |csv|
      stats = @deduper.each_retained(**dedupe_options(session)) { |fields| csv << fields }
    end
    success(stats: stats, output_path: session.output_destination.path)
  rescue Errno::EACCES, Errno::ENOENT => e
    failure(:cannot_write_output_file, path: session.output_destination.path, error_class: e.class)
  end

  # Keyword arguments handed to the deduper for this session.
  def dedupe_options(session)
    {
      source_path: session.source.path,
      reference_path: session.reference.path,
      source_selector: session.key_mapping.source_selector,
      reference_selector: session.key_mapping.reference_selector,
      source_col_sep: session.source.separator,
      reference_col_sep: session.reference.separator,
      match_options: session.match_options
    }
  end

  # Builds a successful Result wrapping +data+.
  def success(data)
    Result.new(ok: true, error: nil, data: data)
  end

  # Builds a failed Result with error +code+ and optional detail +data+.
  def failure(code, data = {})
    Result.new(ok: false, error: code, data: data)
  end
end
91
+ end
92
+ end
93
+ end