csvops 0.3.0.alpha → 0.5.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +69 -149
  3. data/docs/architecture.md +396 -0
  4. data/docs/release-v0.4.0-alpha.md +87 -0
  5. data/docs/release-v0.5.0-alpha.md +89 -0
  6. data/lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb +96 -0
  7. data/lib/csvtool/application/use_cases/run_extraction.rb +63 -88
  8. data/lib/csvtool/application/use_cases/run_row_extraction.rb +45 -73
  9. data/lib/csvtool/application/use_cases/run_row_randomization.rb +56 -73
  10. data/lib/csvtool/cli.rb +11 -7
  11. data/lib/csvtool/domain/cross_csv_dedupe_session/column_selector.rb +44 -0
  12. data/lib/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session.rb +46 -0
  13. data/lib/csvtool/domain/cross_csv_dedupe_session/csv_profile.rb +24 -0
  14. data/lib/csvtool/domain/cross_csv_dedupe_session/key_mapping.rb +22 -0
  15. data/lib/csvtool/domain/cross_csv_dedupe_session/match_options.rb +29 -0
  16. data/lib/csvtool/domain/row_randomization_session/randomization_source.rb +1 -0
  17. data/lib/csvtool/domain/row_session/row_source.rb +3 -0
  18. data/lib/csvtool/domain/{column_session → shared}/output_destination.rb +1 -1
  19. data/lib/csvtool/infrastructure/csv/cross_csv_deduper.rb +85 -0
  20. data/lib/csvtool/infrastructure/csv/selector_validator.rb +30 -0
  21. data/lib/csvtool/infrastructure/output/csv_cross_csv_dedupe_file_writer.rb +23 -0
  22. data/lib/csvtool/infrastructure/output/csv_file_writer.rb +1 -7
  23. data/lib/csvtool/infrastructure/output/csv_randomized_row_file_writer.rb +23 -0
  24. data/lib/csvtool/infrastructure/output/csv_row_file_writer.rb +2 -9
  25. data/lib/csvtool/interface/cli/menu_loop.rb +5 -2
  26. data/lib/csvtool/interface/cli/prompts/dedupe_key_selector_prompt.rb +30 -0
  27. data/lib/csvtool/interface/cli/prompts/file_path_prompt.rb +4 -2
  28. data/lib/csvtool/interface/cli/prompts/headers_present_prompt.rb +4 -2
  29. data/lib/csvtool/interface/cli/prompts/separator_prompt.rb +4 -2
  30. data/lib/csvtool/interface/cli/prompts/yes_no_prompt.rb +26 -0
  31. data/lib/csvtool/interface/cli/workflows/builders/column_session_builder.rb +32 -0
  32. data/lib/csvtool/interface/cli/workflows/builders/cross_csv_dedupe_session_builder.rb +35 -0
  33. data/lib/csvtool/interface/cli/workflows/builders/row_extraction_session_builder.rb +22 -0
  34. data/lib/csvtool/interface/cli/workflows/builders/row_randomization_session_builder.rb +28 -0
  35. data/lib/csvtool/interface/cli/workflows/presenters/column_extraction_presenter.rb +25 -0
  36. data/lib/csvtool/interface/cli/workflows/presenters/cross_csv_dedupe_presenter.rb +39 -0
  37. data/lib/csvtool/interface/cli/workflows/presenters/row_extraction_presenter.rb +34 -0
  38. data/lib/csvtool/interface/cli/workflows/presenters/row_randomization_presenter.rb +34 -0
  39. data/lib/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow.rb +86 -0
  40. data/lib/csvtool/interface/cli/workflows/run_extraction_workflow.rb +88 -0
  41. data/lib/csvtool/interface/cli/workflows/run_row_extraction_workflow.rb +86 -0
  42. data/lib/csvtool/interface/cli/workflows/run_row_randomization_workflow.rb +80 -0
  43. data/lib/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/collect_options_step.rb +55 -0
  44. data/lib/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/collect_profiles_step.rb +52 -0
  45. data/lib/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/execute_step.rb +34 -0
  46. data/lib/csvtool/interface/cli/workflows/steps/extraction/build_preview_step.rb +40 -0
  47. data/lib/csvtool/interface/cli/workflows/steps/extraction/collect_destination_step.rb +28 -0
  48. data/lib/csvtool/interface/cli/workflows/steps/extraction/collect_inputs_step.rb +47 -0
  49. data/lib/csvtool/interface/cli/workflows/steps/extraction/execute_step.rb +32 -0
  50. data/lib/csvtool/interface/cli/workflows/steps/row_extraction/collect_destination_step.rb +33 -0
  51. data/lib/csvtool/interface/cli/workflows/steps/row_extraction/collect_range_step.rb +35 -0
  52. data/lib/csvtool/interface/cli/workflows/steps/row_extraction/collect_source_step.rb +32 -0
  53. data/lib/csvtool/interface/cli/workflows/steps/row_extraction/execute_step.rb +43 -0
  54. data/lib/csvtool/interface/cli/workflows/steps/row_extraction/read_headers_step.rb +29 -0
  55. data/lib/csvtool/interface/cli/workflows/steps/row_randomization/collect_destination_step.rb +34 -0
  56. data/lib/csvtool/interface/cli/workflows/steps/row_randomization/collect_inputs_step.rb +49 -0
  57. data/lib/csvtool/interface/cli/workflows/steps/row_randomization/execute_step.rb +37 -0
  58. data/lib/csvtool/interface/cli/workflows/steps/workflow_step_pipeline.rb +25 -0
  59. data/lib/csvtool/interface/cli/workflows/support/output_destination_mapper.rb +23 -0
  60. data/lib/csvtool/interface/cli/workflows/support/result_error_handler.rb +22 -0
  61. data/lib/csvtool/version.rb +1 -1
  62. data/test/csvtool/application/use_cases/io_boundary_test.rb +26 -0
  63. data/test/csvtool/application/use_cases/run_cross_csv_dedupe_test.rb +141 -0
  64. data/test/csvtool/application/use_cases/run_extraction_test.rb +72 -16
  65. data/test/csvtool/application/use_cases/run_row_extraction_test.rb +82 -102
  66. data/test/csvtool/application/use_cases/run_row_randomization_test.rb +96 -86
  67. data/test/csvtool/cli_test.rb +130 -16
  68. data/test/csvtool/cli_unit_test.rb +16 -3
  69. data/test/csvtool/domain/column_session/column_session_test.rb +2 -2
  70. data/test/csvtool/domain/column_session/csv_source_test.rb +10 -0
  71. data/test/csvtool/domain/cross_csv_dedupe_session/column_selector_test.rb +42 -0
  72. data/test/csvtool/domain/cross_csv_dedupe_session/cross_csv_dedupe_session_test.rb +75 -0
  73. data/test/csvtool/domain/cross_csv_dedupe_session/csv_profile_test.rb +26 -0
  74. data/test/csvtool/domain/cross_csv_dedupe_session/key_mapping_test.rb +31 -0
  75. data/test/csvtool/domain/cross_csv_dedupe_session/match_options_test.rb +52 -0
  76. data/test/csvtool/domain/row_randomization_session/randomization_session_test.rb +2 -2
  77. data/test/csvtool/domain/row_randomization_session/randomization_source_test.rb +15 -1
  78. data/test/csvtool/domain/row_session/row_session_test.rb +2 -2
  79. data/test/csvtool/domain/row_session/row_source_test.rb +16 -0
  80. data/test/csvtool/domain/shared/output_destination_test.rb +24 -0
  81. data/test/csvtool/infrastructure/csv/cross_csv_deduper_test.rb +155 -0
  82. data/test/csvtool/infrastructure/csv/selector_validator_test.rb +72 -0
  83. data/test/csvtool/infrastructure/output/csv_cross_csv_dedupe_file_writer_test.rb +32 -0
  84. data/test/csvtool/infrastructure/output/csv_file_writer_test.rb +0 -4
  85. data/test/csvtool/infrastructure/output/csv_randomized_row_file_writer_test.rb +32 -0
  86. data/test/csvtool/infrastructure/output/csv_row_file_writer_test.rb +1 -4
  87. data/test/csvtool/interface/cli/menu_loop_test.rb +50 -13
  88. data/test/csvtool/interface/cli/prompts/dedupe_key_selector_prompt_test.rb +30 -0
  89. data/test/csvtool/interface/cli/prompts/file_path_prompt_test.rb +9 -0
  90. data/test/csvtool/interface/cli/prompts/headers_present_prompt_test.rb +10 -0
  91. data/test/csvtool/interface/cli/prompts/separator_prompt_test.rb +10 -0
  92. data/test/csvtool/interface/cli/prompts/yes_no_prompt_test.rb +22 -0
  93. data/test/csvtool/interface/cli/workflows/builders/column_session_builder_test.rb +17 -0
  94. data/test/csvtool/interface/cli/workflows/builders/cross_csv_dedupe_session_builder_test.rb +36 -0
  95. data/test/csvtool/interface/cli/workflows/builders/row_extraction_session_builder_test.rb +21 -0
  96. data/test/csvtool/interface/cli/workflows/builders/row_randomization_session_builder_test.rb +26 -0
  97. data/test/csvtool/interface/cli/workflows/presenters/column_extraction_presenter_test.rb +24 -0
  98. data/test/csvtool/interface/cli/workflows/presenters/cross_csv_dedupe_presenter_test.rb +30 -0
  99. data/test/csvtool/interface/cli/workflows/presenters/row_extraction_presenter_test.rb +33 -0
  100. data/test/csvtool/interface/cli/workflows/presenters/row_randomization_presenter_test.rb +33 -0
  101. data/test/csvtool/interface/cli/workflows/run_cross_csv_dedupe_workflow_test.rb +246 -0
  102. data/test/csvtool/interface/cli/workflows/run_extraction_workflow_test.rb +56 -0
  103. data/test/csvtool/interface/cli/workflows/run_row_extraction_workflow_test.rb +83 -0
  104. data/test/csvtool/interface/cli/workflows/run_row_randomization_workflow_test.rb +69 -0
  105. data/test/csvtool/interface/cli/workflows/steps/cross_csv_dedupe/collect_options_step_test.rb +41 -0
  106. data/test/csvtool/interface/cli/workflows/steps/extraction/collect_inputs_step_test.rb +66 -0
  107. data/test/csvtool/interface/cli/workflows/steps/row_extraction/collect_source_step_test.rb +39 -0
  108. data/test/csvtool/interface/cli/workflows/steps/row_extraction/execute_step_test.rb +91 -0
  109. data/test/csvtool/interface/cli/workflows/steps/row_extraction/read_headers_step_test.rb +57 -0
  110. data/test/csvtool/interface/cli/workflows/steps/row_randomization/collect_inputs_step_test.rb +37 -0
  111. data/test/csvtool/interface/cli/workflows/steps/workflow_step_pipeline_test.rb +30 -0
  112. data/test/csvtool/interface/cli/workflows/support/output_destination_mapper_test.rb +23 -0
  113. data/test/csvtool/interface/cli/workflows/support/result_error_handler_test.rb +34 -0
  114. data/test/fixtures/dedupe_reference.csv +3 -0
  115. data/test/fixtures/dedupe_reference.tsv +3 -0
  116. data/test/fixtures/dedupe_reference_all.csv +5 -0
  117. data/test/fixtures/dedupe_reference_no_headers.csv +2 -0
  118. data/test/fixtures/dedupe_reference_none.csv +2 -0
  119. data/test/fixtures/dedupe_reference_normalization.csv +3 -0
  120. data/test/fixtures/dedupe_source.csv +6 -0
  121. data/test/fixtures/dedupe_source.tsv +6 -0
  122. data/test/fixtures/dedupe_source_no_headers.csv +5 -0
  123. data/test/fixtures/dedupe_source_normalization.csv +4 -0
  124. metadata +93 -8
  125. data/lib/csvtool/domain/row_randomization_session/randomization_output_destination.rb +0 -31
  126. data/lib/csvtool/domain/row_session/row_output_destination.rb +0 -31
  127. data/test/csvtool/domain/column_session/output_destination_test.rb +0 -18
  128. data/test/csvtool/domain/row_randomization_session/randomization_output_destination_test.rb +0 -21
  129. data/test/csvtool/domain/row_session/row_output_destination_test.rb +0 -23
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9663c50901b31a8073c4a5a0524e9e30c81c20bbe1b736af71649e60a7150a0e
4
- data.tar.gz: a622dad35eb52afeded279726d2575db0cd210c8ed4aa07650506bfd2e2b6de5
3
+ metadata.gz: b96fb7e03fa0629d3412a97d3abff5414492ac46ad08ede2c872e2176fcbfc62
4
+ data.tar.gz: 856b7735a472b5810d5f19dff6371a565a7fcc538ce5b6eba52260fff0028760
5
5
  SHA512:
6
- metadata.gz: 28726bb66d05881caead074ce529d79db5424b85a7552f8b56cca44e891b3bb0c34cd850ff22351d6d93a5ef725e3891ccc8b7ac7e1e62d24d4e4c4d7d9b344d
7
- data.tar.gz: 0fb96011d8737fb757b30e6226b9086453aa3ef6e5def1e95b77bb8bdbd414e25b72adfe6a8e65ee0498bbb88f2a809777a0d2405057689d00b3858c73014b93
6
+ metadata.gz: 5f643d331c6b54cb5feb0fe5db4ff7f8f7bc5c28461f74e3bfca5cf93d25703b84f497e72377302874b2b6302ef0fb542995c72d2d21798e3a998f6d5b294704
7
+ data.tar.gz: 0e254fa75780ce0605054c24b28301d8786535a0f2bbff7adfb45a75f09e60e5315e950648208fa5772d08cdd6abce95ea382838f568947af05ceaa77ba1888f
data/README.md CHANGED
@@ -35,22 +35,30 @@ bundle exec csvtool menu
35
35
  CSV Tool Menu
36
36
  1. Extract column
37
37
  2. Extract rows (range)
38
- 3. Exit
38
+ 3. Randomize rows
39
+ 4. Dedupe using another CSV
40
+ 5. Exit
39
41
  >
40
42
  ```
41
43
 
42
- Select `1` to run extraction.
44
+ Select `1` for column extraction, `2` for row-range extraction, `3` for row randomization, or `4` for cross-CSV dedupe.
43
45
 
44
46
  ### 3. Follow prompts
45
47
 
46
- Prompt flow:
48
+ Each menu action runs through a dedicated CLI workflow (`interface/cli/workflows/*`) that handles prompts/output and delegates execution to an interface-agnostic application use case.
47
49
 
48
- - CSV file path
49
- - Separator (`comma`, `tab`, `semicolon`, `pipe`, or `custom`)
50
- - Optional header filter + column selection
51
- - Skip blanks (`Y/n`, default `Y`)
52
- - Preview + confirmation
53
- - Output destination (`console` or `file`)
50
+ Workflow internals are split into small composable parts:
51
+
52
+ - `workflows/builders/*` for session construction
53
+ - `workflows/support/*` for shared mapping/dispatch utilities
54
+ - `workflows/presenters/*` for output formatting and summaries
55
+
56
+ Prompt flow by action:
57
+
58
+ - `Extract column`: file path, separator, optional header filter + column select, skip blanks, preview/confirm, output destination.
59
+ - `Extract rows (range)`: file path, separator, start row, end row, output destination.
60
+ - `Randomize rows`: file path, separator, headers present, optional seed, output destination.
61
+ - `Dedupe using another CSV`: source/reference files, separators, header modes, key selectors, match options, output destination.
54
62
 
55
63
  ### 4. Example interaction (console output)
56
64
 
@@ -111,6 +119,53 @@ With Bundler:
111
119
  bundle exec csvtool column /path/to/file.csv column_name
112
120
  ```
113
121
 
122
+ ### 7. Dedupe interaction example
123
+
124
+ Legend: ` ` = prompt/menu, `+` = user input, `-` = tool output
125
+
126
+ ```diff
127
+ CSV Tool Menu
128
+ 1. Extract column
129
+ 2. Extract rows (range)
130
+ 3. Randomize rows
131
+ 4. Dedupe using another CSV
132
+ 5. Exit
133
+ +> 4
134
+ +CSV file path: /tmp/source.csv
135
+ Source CSV separator:
136
+ Choose separator:
137
+ 1. comma (,)
138
+ 2. tab (\t)
139
+ 3. semicolon (;)
140
+ 4. pipe (|)
141
+ 5. custom
142
+ +Separator choice [1]: 1
143
+ Source headers present? [Y/n]:
144
+ +Reference CSV file path: /tmp/reference.csv
145
+ Reference CSV separator:
146
+ Choose separator:
147
+ 1. comma (,)
148
+ 2. tab (\t)
149
+ 3. semicolon (;)
150
+ 4. pipe (|)
151
+ 5. custom
152
+ +Separator choice [1]: 1
153
+ Reference headers present? [Y/n]:
154
+ +Source key column name: customer_id
155
+ +Reference key column name: external_id
156
+ Trim whitespace before matching? [Y/n]:
157
+ Case-insensitive matching? [y/N]:
158
+ Output destination:
159
+ 1. console
160
+ 2. file
161
+ +Output destination [1]: 1
162
+ -
163
+ -customer_id,name
164
+ -1,Alice
165
+ -3,Cara
166
+ -Summary: source_rows=5 removed_rows=3 kept_rows=2
167
+ ```
168
+
114
169
  ## Testing
115
170
 
116
171
  Run tests:
@@ -127,7 +182,7 @@ bundle exec rake test
127
182
 
128
183
  ## Alpha release
129
184
 
130
- Current prerelease version: `0.3.0.alpha`
185
+ Current prerelease version: `0.5.0.alpha`
131
186
 
132
187
  Install prerelease from RubyGems:
133
188
 
@@ -137,146 +192,11 @@ gem install csvops --pre
137
192
 
138
193
  Release runbook:
139
194
 
140
- - `docs/release-v0.3.0-alpha.md`
195
+ - `docs/release-v0.5.0-alpha.md`
141
196
 
142
- ## Architecture
143
-
144
- The codebase follows a DDD-lite layered structure:
145
-
146
- - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, and `RandomizationSession` aggregates + supporting entities/value objects).
147
- - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`).
148
- - `infrastructure/`: CSV reading/streaming and output adapters (console/file).
149
- - `interface/cli/`: menu, prompts, and user-facing error presentation.
150
- - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
151
-
152
- ## Domain model
153
-
154
- Bounded contexts: `Column Extraction`, `Row Extraction`, and `Row Randomization`.
155
-
156
- ### Column Extraction
157
-
158
- - Aggregate root: `ColumnSession`
159
- - Controls extraction state transitions (`start`, `with_preview`, `confirm!`, `with_output_destination`).
160
- - Enforces session-level invariants.
161
- - Entities:
162
- - `CsvSource` (file path + `Separator`)
163
- - `ColumnSelection` (chosen header)
164
- - Value objects:
165
- - `Separator`
166
- - `ExtractionOptions` (`skip_blanks`, `preview_limit`)
167
- - `Preview` (list of `ExtractionValue`)
168
- - `ExtractionValue`
169
- - `OutputDestination` (`console` or `file(path)`)
170
- - Application service:
171
- - `Application::UseCases::RunExtraction` orchestrates one extraction request.
172
- - Infrastructure adapters:
173
- - `Infrastructure::CSV::HeaderReader`
174
- - `Infrastructure::CSV::ValueStreamer`
175
- - `Infrastructure::Output::ConsoleWriter`
176
- - `Infrastructure::Output::CsvFileWriter`
177
- - Interface adapters:
178
- - `Interface::CLI::MenuLoop`
179
- - `Interface::CLI::Prompts::*`
180
- - `Interface::CLI::Errors::Presenter`
181
-
182
- ```mermaid
183
- flowchart LR
184
- UI["Interface CLI\n(Menu + Prompts + Errors)"] --> APP["Application Use Case\nRunExtraction"]
185
- APP --> AGG["Domain Aggregate\nColumnSession"]
186
-
187
- AGG --> E1["Entity\nCsvSource"]
188
- AGG --> E2["Entity\nColumnSelection"]
189
- AGG --> V1["Value Objects\nSeparator / ExtractionOptions / Preview / OutputDestination / ExtractionValue"]
190
-
191
- APP --> INFCSV["Infrastructure CSV\nHeaderReader + ValueStreamer"]
192
- APP --> INFOUT["Infrastructure Output\nConsoleWriter + CsvFileWriter"]
193
- ```
194
-
195
- ### Row Extraction
196
-
197
- Core DDD structure:
198
-
199
- - Aggregate root: `RowSession`
200
- - Captures one row-range extraction request.
201
- - Holds selected source, requested range, and output destination.
202
- - Entity:
203
- - `RowSource` (file path + separator)
204
- - Value objects:
205
- - `RowRange` (`start_row`, `end_row`) plus row-range validation errors
206
- - `RowOutputDestination` (`console` or `file(path)`)
207
- - Application service:
208
- - `Application::UseCases::RunRowExtraction` orchestrates row-range extraction.
209
- - Infrastructure adapters:
210
- - `Infrastructure::CSV::HeaderReader`
211
- - `Infrastructure::CSV::RowStreamer`
212
- - `Infrastructure::Output::CsvRowConsoleWriter`
213
- - `Infrastructure::Output::CsvRowFileWriter`
214
- - Interface adapters:
215
- - `Interface::CLI::MenuLoop`
216
- - `Interface::CLI::Prompts::*`
217
- - `Interface::CLI::Errors::Presenter`
218
-
219
- ```mermaid
220
- flowchart LR
221
- UI2["Interface CLI\n(Menu + Prompts + Errors)"] --> APP2["Application Use Case\nRunRowExtraction"]
222
- APP2 --> AGG2["Domain Aggregate\nRowSession"]
223
-
224
- AGG2 --> E3["Entity\nRowSource"]
225
- AGG2 --> V2["Value Objects\nRowRange / RowOutputDestination"]
226
-
227
- APP2 --> INFCSV2["Infrastructure CSV\nHeaderReader + RowStreamer"]
228
- APP2 --> INFOUT2["Infrastructure Output\nCsvRowConsoleWriter + CsvRowFileWriter"]
229
- ```
230
197
 
231
- ### Row Randomization
232
-
233
- Core DDD structure:
234
-
235
- - Aggregate root: `RandomizationSession`
236
- - Captures one randomization request from source + options + output destination.
237
- - Entity:
238
- - `RandomizationSource` (file path + separator + header mode)
239
- - Value objects:
240
- - `RandomizationOptions` (optional deterministic `seed`)
241
- - `RandomizationOutputDestination` (`console` or `file(path)`)
242
- - Application service:
243
- - `Application::UseCases::RunRowRandomization` orchestrates row randomization.
244
- - Infrastructure adapters:
245
- - `Infrastructure::CSV::HeaderReader`
246
- - `Infrastructure::CSV::RowRandomizer` (external chunked `RAND + sort` + merge)
247
- - Interface adapters:
248
- - `Interface::CLI::MenuLoop`
249
- - `Interface::CLI::Prompts::*`
250
- - `Interface::CLI::Errors::Presenter`
251
-
252
- ```mermaid
253
- flowchart LR
254
- UI3["Interface CLI\n(Menu + Prompts + Errors)"] --> APP3["Application Use Case\nRunRowRandomization"]
255
- APP3 --> AGG3["Domain Aggregate\nRandomizationSession"]
256
-
257
- AGG3 --> E4["Entity\nRandomizationSource"]
258
- AGG3 --> V3["Value Objects\nRandomizationOptions / RandomizationOutputDestination"]
259
-
260
- APP3 --> INFCSV3["Infrastructure CSV\nHeaderReader + RowRandomizer"]
261
- ```
198
+ ## Architecture
262
199
 
263
- ## Project layout
200
+ Full architecture and domain documentation lives in:
264
201
 
265
- ```text
266
- bin/tool # CLI entrypoint
267
- lib/csvtool/cli.rb
268
- lib/csvtool/domain/column_session/*
269
- lib/csvtool/domain/row_session/*
270
- lib/csvtool/domain/row_randomization_session/*
271
- lib/csvtool/application/use_cases/run_extraction.rb
272
- lib/csvtool/application/use_cases/run_row_extraction.rb
273
- lib/csvtool/application/use_cases/run_row_randomization.rb
274
- lib/csvtool/infrastructure/csv/*
275
- lib/csvtool/infrastructure/output/*
276
- lib/csvtool/interface/cli/menu_loop.rb
277
- lib/csvtool/interface/cli/prompts/*
278
- lib/csvtool/interface/cli/errors/presenter.rb
279
- test/csvtool/cli_test.rb # end-to-end workflow tests
280
- test/csvtool/**/*_test.rb # focused unit tests by component folder
281
- test/test_helper.rb
282
- ```
202
+ - [`docs/architecture.md`](docs/architecture.md)
data/docs/architecture.md ADDED
@@ -0,0 +1,396 @@
1
+ # Architecture
2
+
3
+ The codebase follows a DDD-lite layered structure:
4
+
5
+ - `domain/`: core domain models and invariants (`ColumnSession`, `RowSession`, `RandomizationSession`, and `CrossCsvDedupeSession` aggregates + supporting entities/value objects).
6
+ - `application/`: use-case orchestration (`RunExtraction`, `RunRowExtraction`, `RunRowRandomization`, `RunCrossCsvDedupe`).
7
+ - `infrastructure/`: CSV reading/streaming and output adapters (console/file), plus cross-CSV dedupe adapter.
8
+ - `interface/cli/`: menu, prompts, workflows, and user-facing error presentation.
9
+ - `Csvtool::CLI`: entrypoint wiring from command args to interface/application flow.
10
+
11
+ ## Workflow boundary (standardized)
12
+
13
+ For all interactive domains (`Column Extraction`, `Row Extraction`, `Row Randomization`, `Cross-CSV Dedupe`), the boundary is:
14
+
15
+ - `interface/cli/workflows/*`: owns prompts, stdout rendering, and user-facing error presentation.
16
+ - `interface/cli/workflows/builders/*`: builds domain sessions/aggregates from prompt results.
17
+ - `interface/cli/workflows/support/*`: shared workflow utilities (error routing, output destination mapping).
18
+ - `interface/cli/workflows/presenters/*`: workflow-level output/summary rendering.
19
+ - `interface/cli/workflows/steps/*`: optional step-pipeline units for complex workflow orchestration.
20
+ - `application/use_cases/*`: interface-agnostic orchestration with request/result style contracts.
21
+ - `domain/*`: invariants and domain policies.
22
+ - `infrastructure/*`: CSV mechanics and output adapters.
23
+
24
+ Write-boundary rule:
25
+ - Use cases coordinate write paths but do not perform direct file writes.
26
+ - Direct write APIs (`CSV.open`, writable `File.open`, `File.write`, `IO.write`) are infrastructure-only.
27
+ - File output behavior is implemented in `infrastructure/output/*` writer adapters.
28
+
29
+ Current usage:
30
+
31
+ - `RunExtractionWorkflow` uses `WorkflowStepPipeline` + `Steps::Extraction::*`.
32
+ - `RunRowExtractionWorkflow` uses `WorkflowStepPipeline` + `Steps::RowExtraction::*`.
33
+ - `RunRowRandomizationWorkflow` uses `WorkflowStepPipeline` + `Steps::RowRandomization::*`.
34
+ - `RunCrossCsvDedupeWorkflow` uses `WorkflowStepPipeline` + `Steps::CrossCsvDedupe::*`.
35
+
36
+ ## Adding New Concepts
37
+
38
+ Use this checklist when introducing a new capability (for example: a new transformation function, validator, comparer, or exporter).
39
+
40
+ ### 1) Classify the concept first
41
+
42
+ - `Workflow concept`: interactive flow and prompt sequence.
43
+ - `Domain concept`: business rule/invariant and core vocabulary.
44
+ - `Application concept`: use-case orchestration and request/result contract.
45
+ - `Infrastructure concept`: file/CSV mechanics, streaming, persistence, or external IO.
46
+
47
+ If it does not clearly fit one layer, split it until each part has one responsibility.
48
+
49
+ ### 2) Add the feature vertically (thin slice)
50
+
51
+ Implement in this order:
52
+
53
+ 1. `interface/cli/workflows/*`: new workflow entry or new branch in an existing workflow.
54
+ 2. `interface/cli/prompts/*`: prompts for user inputs.
55
+ 3. `interface/cli/workflows/builders/*`: build domain session/request objects.
56
+ 4. `application/use_cases/*`: interface-agnostic use case with `Result` success/failure.
57
+ 5. `domain/*`: new entities/value objects/aggregate changes for invariants.
58
+ 6. `infrastructure/*`: adapters needed by the use case.
59
+ 7. `interface/cli/workflows/presenters/*`: output and summaries.
60
+
61
+ Keep each step testable on its own before moving to the next.
62
+
63
+ ### 3) Function type patterns
64
+
65
+ For a new function type, prefer one of these patterns:
66
+
67
+ - `Transform` (changes output rows/values):
68
+ - Domain: transformation options/value objects.
69
+ - Application: orchestrate transform over streamed rows.
70
+ - Infrastructure: stream reader/writer implementation.
71
+ - `Validate` (checks and reports findings):
72
+ - Domain: validation policy and finding model.
73
+ - Application: run checks and return findings in result data.
74
+ - Presenter: format findings and summary.
75
+ - `Compare` (source vs reference logic):
76
+ - Domain: mapping/selectors/match options.
77
+ - Application: compare strategy and stats.
78
+ - Infrastructure: dual-source readers and selector helpers.
79
+ - `Export` (destination-focused):
80
+ - Domain: output destination value object.
81
+ - Application: orchestrate write path only.
82
+ - Infrastructure: writer adapter.
83
+
84
+ ### 4) Required boundaries and rules
85
+
86
+ - Workflows do not contain business rules.
87
+ - Use cases do not prompt or print.
88
+ - Domain does not depend on interface or infrastructure.
89
+ - Infrastructure does not own workflow decisions.
90
+ - Shared workflow helpers belong under `workflows/support/*`.
91
+ - Reusable construction logic belongs under `workflows/builders/*`.
92
+ - Rendering/summary formatting belongs under `workflows/presenters/*`.
93
+
94
+ ### 5) Minimum tests for each new concept
95
+
96
+ - Prompt tests for each new prompt class.
97
+ - Builder tests for each new builder class.
98
+ - Use-case tests for request/result behavior.
99
+ - Workflow behavior tests for prompt + output integration.
100
+ - One end-to-end CLI test for the happy path.
101
+
102
+ ### 6) Naming and structure guidance
103
+
104
+ - Prefer domain-first names (`RowRange`, `ColumnSelection`, `MatchOptions`) over technical names.
105
+ - Use `Run<Concept>` for use cases and `Run<Concept>Workflow` for workflows.
106
+ - Keep one file per class and mirror structure under `test/csvtool/...`.
107
+
108
+ ## Domain model
109
+
110
+ Bounded contexts: `Column Extraction`, `Row Extraction`, `Row Randomization`, and `Cross-CSV Dedupe`.
111
+
112
+ ### Cross-CSV Dedupe (Large-file behavior)
113
+
114
+ - Workflow: removes rows from the source CSV whose key matches a key in the reference CSV.
115
+ - Scaling strategy:
116
+ - Reference CSV keys are loaded into a `Set` for fast membership checks.
117
+ - Source CSV rows are streamed directly to the selected output destination (console or file).
118
+ - Memory tradeoff:
119
+ - Memory is dominated by the number of unique keys in the reference CSV.
120
+ - Source-row memory stays bounded because retained rows are not accumulated in memory before writing.
121
+
122
+ ### Column Extraction
123
+
124
+ - Aggregate root: `ColumnSession`
125
+ - Controls extraction state transitions (`start`, `with_preview`, `confirm!`, `with_output_destination`).
126
+ - Enforces session-level invariants.
127
+ - Entities:
128
+ - `CsvSource` (file path + `Separator`)
129
+ - `ColumnSelection` (chosen header)
130
+ - Value objects:
131
+ - `Separator`
132
+ - `ExtractionOptions` (`skip_blanks`, `preview_limit`)
133
+ - `Preview` (list of `ExtractionValue`)
134
+ - `ExtractionValue`
135
+ - Shared `OutputDestination` (`console` or `file(path)`)
136
+ - Application service:
137
+ - `Application::UseCases::RunExtraction` is interface-agnostic and exposes request/result operations.
138
+ - Infrastructure adapters:
139
+ - `Infrastructure::CSV::HeaderReader`
140
+ - `Infrastructure::CSV::ValueStreamer`
141
+ - `Infrastructure::Output::ConsoleWriter`
142
+ - `Infrastructure::Output::CsvFileWriter`
143
+ - Interface adapters:
144
+ - `Interface::CLI::MenuLoop`
145
+ - `Interface::CLI::Workflows::RunExtractionWorkflow`
146
+ - `Interface::CLI::Workflows::Builders::ColumnSessionBuilder`
147
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
148
+ - `Interface::CLI::Workflows::Steps::Extraction::*`
149
+ - `Interface::CLI::Workflows::Presenters::ColumnExtractionPresenter`
150
+ - `Interface::CLI::Workflows::Support::{OutputDestinationMapper,ResultErrorHandler}`
151
+ - `Interface::CLI::Prompts::*`
152
+ - `Interface::CLI::Errors::Presenter`
153
+
154
+ ```mermaid
155
+ classDiagram
156
+ direction LR
157
+ class MenuLoop
158
+ class RunExtractionWorkflow
159
+ class Prompts
160
+ class Errors
161
+ class RunExtraction
162
+ class ColumnSession
163
+ class CsvSource
164
+ class ColumnSelection
165
+ class ExtractionOptions
166
+ class Preview
167
+ class ExtractionValue
168
+ class OutputDestination
169
+ class HeaderReader
170
+ class ValueStreamer
171
+ class CsvFileWriter
172
+
173
+ MenuLoop --> RunExtractionWorkflow : invokes
174
+ RunExtractionWorkflow --> Prompts : uses
175
+ RunExtractionWorkflow --> Errors : reports failures
176
+ RunExtractionWorkflow --> RunExtraction : calls
177
+ RunExtraction --> ColumnSession : orchestrates
178
+ ColumnSession o-- CsvSource
179
+ ColumnSession o-- ColumnSelection
180
+ ColumnSession o-- ExtractionOptions
181
+ ColumnSession o-- Preview
182
+ Preview o-- ExtractionValue
183
+ ColumnSession o-- OutputDestination
184
+ RunExtraction --> HeaderReader
185
+ RunExtraction --> ValueStreamer
186
+ RunExtraction --> CsvFileWriter
187
+ ```
188
+
189
+ ### Row Extraction
190
+
191
+ Core DDD structure:
192
+
193
+ - Aggregate root: `RowSession`
194
+ - Captures one row-range extraction request.
195
+ - Holds selected source, requested range, and output destination.
196
+ - Entity:
197
+ - `RowSource` (file path + separator)
198
+ - Value objects:
199
+ - `RowRange` (`start_row`, `end_row`) plus row-range validation errors
200
+ - Shared `OutputDestination` (`console` or `file(path)`)
201
+ - Application service:
202
+ - `Application::UseCases::RunRowExtraction` is interface-agnostic and exposes request/result operations.
203
+ - Infrastructure adapters:
204
+ - `Infrastructure::CSV::HeaderReader`
205
+ - `Infrastructure::CSV::RowStreamer`
206
+ - `Infrastructure::Output::CsvRowFileWriter`
207
+ - Interface adapters:
208
+ - `Interface::CLI::MenuLoop`
209
+ - `Interface::CLI::Workflows::RunRowExtractionWorkflow`
210
+ - `Interface::CLI::Workflows::Builders::RowExtractionSessionBuilder`
211
+ - `Interface::CLI::Workflows::Presenters::RowExtractionPresenter`
212
+ - `Interface::CLI::Workflows::Support::{OutputDestinationMapper,ResultErrorHandler}`
213
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
214
+ - `Interface::CLI::Workflows::Steps::RowExtraction::*`
215
+ - `Interface::CLI::Prompts::*`
216
+ - `Interface::CLI::Errors::Presenter`
217
+
218
+ ```mermaid
219
+ classDiagram
220
+ direction LR
221
+ class MenuLoop
222
+ class RunRowExtractionWorkflow
223
+ class Prompts
224
+ class Errors
225
+ class RunRowExtraction
226
+ class RowSession
227
+ class RowSource
228
+ class RowRange
229
+ class OutputDestination
230
+ class HeaderReader
231
+ class RowStreamer
232
+ class CsvRowFileWriter
233
+ MenuLoop --> RunRowExtractionWorkflow : invokes
234
+ RunRowExtractionWorkflow --> Prompts : uses
235
+ RunRowExtractionWorkflow --> Errors : reports failures
236
+ RunRowExtractionWorkflow --> RunRowExtraction : calls
237
+ RunRowExtraction --> RowSession : orchestrates
238
+ RowSession o-- RowSource
239
+ RowSession o-- RowRange
240
+ RowSession o-- OutputDestination
241
+ RunRowExtraction --> HeaderReader
242
+ RunRowExtraction --> RowStreamer
243
+ RunRowExtraction --> CsvRowFileWriter
244
+ ```
245
+
246
+ ### Row Randomization
247
+
248
+ Core DDD structure:
249
+
250
+ - Aggregate root: `RandomizationSession`
251
+ - Captures one randomization request from source + options + output destination.
252
+ - Entity:
253
+ - `RandomizationSource` (file path + separator + header mode)
254
+ - Value objects:
255
+ - `RandomizationOptions` (optional deterministic `seed`)
256
+ - Shared `OutputDestination` (`console` or `file(path)`)
257
+ - Application service:
258
+ - `Application::UseCases::RunRowRandomization` is interface-agnostic and exposes request/result operations.
259
+ - Infrastructure adapters:
260
+ - `Infrastructure::CSV::HeaderReader`
261
+ - `Infrastructure::CSV::RowRandomizer` (external chunked `RAND + sort` + merge)
262
+ - `Infrastructure::Output::CsvRandomizedRowFileWriter`
263
+ - Interface adapters:
264
+ - `Interface::CLI::MenuLoop`
265
+ - `Interface::CLI::Workflows::RunRowRandomizationWorkflow`
266
+ - `Interface::CLI::Workflows::Builders::RowRandomizationSessionBuilder`
267
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
268
+ - `Interface::CLI::Workflows::Steps::RowRandomization::*`
269
+ - `Interface::CLI::Workflows::Presenters::RowRandomizationPresenter`
270
+ - `Interface::CLI::Workflows::Support::{OutputDestinationMapper,ResultErrorHandler}`
271
+ - `Interface::CLI::Prompts::*`
272
+ - `Interface::CLI::Errors::Presenter`
273
+
274
+ ```mermaid
275
+ classDiagram
276
+ direction LR
277
+ class MenuLoop
278
+ class RunRowRandomizationWorkflow
279
+ class Prompts
280
+ class Errors
281
+ class RunRowRandomization
282
+ class RandomizationSession
283
+ class RandomizationSource
284
+ class RandomizationOptions
285
+ class OutputDestination
286
+ class HeaderReader
287
+ class RowRandomizer
288
+ class CsvRandomizedRowFileWriter
289
+
290
+ MenuLoop --> RunRowRandomizationWorkflow : invokes
291
+ RunRowRandomizationWorkflow --> Prompts : uses
292
+ RunRowRandomizationWorkflow --> Errors : reports failures
293
+ RunRowRandomizationWorkflow --> RunRowRandomization : calls
294
+ RunRowRandomization --> RandomizationSession : orchestrates
295
+ RandomizationSession o-- RandomizationSource
296
+ RandomizationSession o-- RandomizationOptions
297
+ RandomizationSession o-- OutputDestination
298
+ RunRowRandomization --> HeaderReader
299
+ RunRowRandomization --> RowRandomizer
300
+ RunRowRandomization --> CsvRandomizedRowFileWriter
301
+ ```
302
+
303
+ ### Cross-CSV Dedupe
304
+
305
+ Core DDD structure:
306
+
307
+ - Aggregate root: `CrossCsvDedupeSession`
308
+ - Captures one dedupe request with source/reference profiles, key mapping, match options, and output destination.
309
+ - Entities:
310
+ - `CsvProfile` (path + separator + header mode) for source and reference CSVs
310
+ - `KeyMapping` (source selector + reference selector)
312
+ - Value objects:
313
+ - `ColumnSelector` (header name or 1-based index mode)
314
+ - `MatchOptions` (`trim_whitespace`, `case_insensitive`, plus normalization behavior)
315
+ - Shared `OutputDestination` (`console` or `file(path)`)
316
+ - Application service:
317
+ - `Application::UseCases::RunCrossCsvDedupe` orchestrates the dedupe workflow.
318
+ - Infrastructure adapters:
319
+ - `Infrastructure::CSV::HeaderReader`
320
+ - `Infrastructure::CSV::SelectorValidator`
321
+ - `Infrastructure::CSV::CrossCsvDeduper` (streams source rows while checking membership against reference key set)
322
+ - `Infrastructure::Output::CsvCrossCsvDedupeFileWriter`
323
+ - Interface adapters:
324
+ - `Interface::CLI::MenuLoop`
325
+ - `Interface::CLI::Workflows::RunCrossCsvDedupeWorkflow`
326
+ - `Interface::CLI::Workflows::Builders::CrossCsvDedupeSessionBuilder`
327
+ - `Interface::CLI::Workflows::Steps::WorkflowStepPipeline`
328
+ - `Interface::CLI::Workflows::Steps::CrossCsvDedupe::*`
329
+ - `Interface::CLI::Workflows::Presenters::CrossCsvDedupePresenter`
330
+ - `Interface::CLI::Workflows::Support::{OutputDestinationMapper,ResultErrorHandler}`
331
+ - `Interface::CLI::Prompts::*`
332
+ - `Interface::CLI::Errors::Presenter`
333
+
334
+ ```mermaid
335
+ classDiagram
336
+ direction LR
337
+ class MenuLoop
338
+ class RunCrossCsvDedupeWorkflow
339
+ class Prompts
340
+ class Errors
341
+ class RunCrossCsvDedupe
342
+ class CrossCsvDedupeSession
343
+ class CsvProfile
344
+ class KeyMapping
345
+ class ColumnSelector
346
+ class MatchOptions
347
+ class OutputDestination
348
+ class HeaderReader
349
+ class SelectorValidator
350
+ class CrossCsvDeduper
351
+ class CsvCrossCsvDedupeFileWriter
352
+
353
+ MenuLoop --> RunCrossCsvDedupeWorkflow : invokes
354
+ RunCrossCsvDedupeWorkflow --> Prompts : uses
355
+ RunCrossCsvDedupeWorkflow --> Errors : reports failures
356
+ RunCrossCsvDedupeWorkflow --> RunCrossCsvDedupe : calls
357
+ RunCrossCsvDedupe --> CrossCsvDedupeSession : orchestrates
358
+ CrossCsvDedupeSession o-- CsvProfile
359
+ CrossCsvDedupeSession o-- KeyMapping
360
+ KeyMapping o-- ColumnSelector
361
+ CrossCsvDedupeSession o-- MatchOptions
362
+ CrossCsvDedupeSession o-- OutputDestination
363
+ RunCrossCsvDedupe --> HeaderReader
364
+ RunCrossCsvDedupe --> SelectorValidator
365
+ RunCrossCsvDedupe --> CrossCsvDeduper
366
+ RunCrossCsvDedupe --> CsvCrossCsvDedupeFileWriter
367
+ ```
368
+
369
+ ## Project layout
370
+
371
+ ```text
372
+ bin/tool # CLI entrypoint
373
+ lib/csvtool/cli.rb
374
+ lib/csvtool/domain/column_session/*
375
+ lib/csvtool/domain/row_session/*
376
+ lib/csvtool/domain/row_randomization_session/*
377
+ lib/csvtool/domain/cross_csv_dedupe_session/*
378
+ lib/csvtool/domain/shared/output_destination.rb
379
+ lib/csvtool/application/use_cases/run_extraction.rb
380
+ lib/csvtool/application/use_cases/run_row_extraction.rb
381
+ lib/csvtool/application/use_cases/run_row_randomization.rb
382
+ lib/csvtool/application/use_cases/run_cross_csv_dedupe.rb
383
+ lib/csvtool/infrastructure/csv/*
384
+ lib/csvtool/infrastructure/output/*
385
+ lib/csvtool/interface/cli/menu_loop.rb
386
+ lib/csvtool/interface/cli/workflows/*
387
+ lib/csvtool/interface/cli/workflows/builders/*
388
+ lib/csvtool/interface/cli/workflows/support/*
389
+ lib/csvtool/interface/cli/workflows/presenters/*
390
+ lib/csvtool/interface/cli/workflows/steps/*
391
+ lib/csvtool/interface/cli/prompts/*
392
+ lib/csvtool/interface/cli/errors/presenter.rb
393
+ test/csvtool/cli_test.rb # end-to-end workflow tests
394
+ test/csvtool/**/*_test.rb # focused unit tests by component folder
395
+ test/test_helper.rb
396
+ ```