@caracal-lynx/sluice 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281) hide show
  1. package/CLAUDE.md +1822 -0
  2. package/LICENCE-FAQ.md +74 -0
  3. package/LICENSE +92 -0
  4. package/README.md +582 -0
  5. package/dist/adapters/source/csv.d.ts +10 -0
  6. package/dist/adapters/source/csv.d.ts.map +1 -0
  7. package/dist/adapters/source/csv.js +110 -0
  8. package/dist/adapters/source/csv.js.map +1 -0
  9. package/dist/adapters/source/index.d.ts +9 -0
  10. package/dist/adapters/source/index.d.ts.map +1 -0
  11. package/dist/adapters/source/index.js +26 -0
  12. package/dist/adapters/source/index.js.map +1 -0
  13. package/dist/adapters/source/mssql.d.ts +11 -0
  14. package/dist/adapters/source/mssql.d.ts.map +1 -0
  15. package/dist/adapters/source/mssql.js +230 -0
  16. package/dist/adapters/source/mssql.js.map +1 -0
  17. package/dist/adapters/source/pg.d.ts +11 -0
  18. package/dist/adapters/source/pg.d.ts.map +1 -0
  19. package/dist/adapters/source/pg.js +88 -0
  20. package/dist/adapters/source/pg.js.map +1 -0
  21. package/dist/adapters/source/registry.d.ts +10 -0
  22. package/dist/adapters/source/registry.d.ts.map +1 -0
  23. package/dist/adapters/source/registry.js +36 -0
  24. package/dist/adapters/source/registry.js.map +1 -0
  25. package/dist/adapters/source/rest.d.ts +16 -0
  26. package/dist/adapters/source/rest.d.ts.map +1 -0
  27. package/dist/adapters/source/rest.js +182 -0
  28. package/dist/adapters/source/rest.js.map +1 -0
  29. package/dist/adapters/source/rest.types.d.ts +15 -0
  30. package/dist/adapters/source/rest.types.d.ts.map +1 -0
  31. package/dist/adapters/source/rest.types.js +6 -0
  32. package/dist/adapters/source/rest.types.js.map +1 -0
  33. package/dist/adapters/source/types.d.ts +23 -0
  34. package/dist/adapters/source/types.d.ts.map +1 -0
  35. package/dist/adapters/source/types.js +4 -0
  36. package/dist/adapters/source/types.js.map +1 -0
  37. package/dist/adapters/source/xlsx.d.ts +10 -0
  38. package/dist/adapters/source/xlsx.d.ts.map +1 -0
  39. package/dist/adapters/source/xlsx.js +71 -0
  40. package/dist/adapters/source/xlsx.js.map +1 -0
  41. package/dist/adapters/target/bc.d.ts +21 -0
  42. package/dist/adapters/target/bc.d.ts.map +1 -0
  43. package/dist/adapters/target/bc.js +188 -0
  44. package/dist/adapters/target/bc.js.map +1 -0
  45. package/dist/adapters/target/bluecherry.d.ts +10 -0
  46. package/dist/adapters/target/bluecherry.d.ts.map +1 -0
  47. package/dist/adapters/target/bluecherry.js +127 -0
  48. package/dist/adapters/target/bluecherry.js.map +1 -0
  49. package/dist/adapters/target/csv.d.ts +10 -0
  50. package/dist/adapters/target/csv.d.ts.map +1 -0
  51. package/dist/adapters/target/csv.js +40 -0
  52. package/dist/adapters/target/csv.js.map +1 -0
  53. package/dist/adapters/target/ifs.d.ts +10 -0
  54. package/dist/adapters/target/ifs.d.ts.map +1 -0
  55. package/dist/adapters/target/ifs.js +55 -0
  56. package/dist/adapters/target/ifs.js.map +1 -0
  57. package/dist/adapters/target/index.d.ts +8 -0
  58. package/dist/adapters/target/index.d.ts.map +1 -0
  59. package/dist/adapters/target/index.js +22 -0
  60. package/dist/adapters/target/index.js.map +1 -0
  61. package/dist/adapters/target/pg.d.ts +11 -0
  62. package/dist/adapters/target/pg.d.ts.map +1 -0
  63. package/dist/adapters/target/pg.js +103 -0
  64. package/dist/adapters/target/pg.js.map +1 -0
  65. package/dist/adapters/target/registry.d.ts +9 -0
  66. package/dist/adapters/target/registry.d.ts.map +1 -0
  67. package/dist/adapters/target/registry.js +29 -0
  68. package/dist/adapters/target/registry.js.map +1 -0
  69. package/dist/adapters/target/types.d.ts +15 -0
  70. package/dist/adapters/target/types.d.ts.map +1 -0
  71. package/dist/adapters/target/types.js +4 -0
  72. package/dist/adapters/target/types.js.map +1 -0
  73. package/dist/cli.d.ts +25 -0
  74. package/dist/cli.d.ts.map +1 -0
  75. package/dist/cli.js +354 -0
  76. package/dist/cli.js.map +1 -0
  77. package/dist/config/index.d.ts +4 -0
  78. package/dist/config/index.d.ts.map +1 -0
  79. package/dist/config/index.js +6 -0
  80. package/dist/config/index.js.map +1 -0
  81. package/dist/config/loader.d.ts +5 -0
  82. package/dist/config/loader.d.ts.map +1 -0
  83. package/dist/config/loader.js +135 -0
  84. package/dist/config/loader.js.map +1 -0
  85. package/dist/config/schema.d.ts +4162 -0
  86. package/dist/config/schema.d.ts.map +1 -0
  87. package/dist/config/schema.js +263 -0
  88. package/dist/config/schema.js.map +1 -0
  89. package/dist/config/types.d.ts +3 -0
  90. package/dist/config/types.d.ts.map +1 -0
  91. package/dist/config/types.js +4 -0
  92. package/dist/config/types.js.map +1 -0
  93. package/dist/dq/engine.d.ts +10 -0
  94. package/dist/dq/engine.d.ts.map +1 -0
  95. package/dist/dq/engine.js +114 -0
  96. package/dist/dq/engine.js.map +1 -0
  97. package/dist/dq/index.d.ts +6 -0
  98. package/dist/dq/index.d.ts.map +1 -0
  99. package/dist/dq/index.js +6 -0
  100. package/dist/dq/index.js.map +1 -0
  101. package/dist/dq/reporter.d.ts +5 -0
  102. package/dist/dq/reporter.d.ts.map +1 -0
  103. package/dist/dq/reporter.js +41 -0
  104. package/dist/dq/reporter.js.map +1 -0
  105. package/dist/dq/rules/allowedValues.d.ts +7 -0
  106. package/dist/dq/rules/allowedValues.d.ts.map +1 -0
  107. package/dist/dq/rules/allowedValues.js +26 -0
  108. package/dist/dq/rules/allowedValues.js.map +1 -0
  109. package/dist/dq/rules/email.d.ts +7 -0
  110. package/dist/dq/rules/email.d.ts.map +1 -0
  111. package/dist/dq/rules/email.js +24 -0
  112. package/dist/dq/rules/email.js.map +1 -0
  113. package/dist/dq/rules/index.d.ts +15 -0
  114. package/dist/dq/rules/index.d.ts.map +1 -0
  115. package/dist/dq/rules/index.js +30 -0
  116. package/dist/dq/rules/index.js.map +1 -0
  117. package/dist/dq/rules/maxLength.d.ts +7 -0
  118. package/dist/dq/rules/maxLength.d.ts.map +1 -0
  119. package/dist/dq/rules/maxLength.js +25 -0
  120. package/dist/dq/rules/maxLength.js.map +1 -0
  121. package/dist/dq/rules/minMax.d.ts +11 -0
  122. package/dist/dq/rules/minMax.d.ts.map +1 -0
  123. package/dist/dq/rules/minMax.js +52 -0
  124. package/dist/dq/rules/minMax.js.map +1 -0
  125. package/dist/dq/rules/notNull.d.ts +7 -0
  126. package/dist/dq/rules/notNull.d.ts.map +1 -0
  127. package/dist/dq/rules/notNull.js +21 -0
  128. package/dist/dq/rules/notNull.js.map +1 -0
  129. package/dist/dq/rules/pattern.d.ts +7 -0
  130. package/dist/dq/rules/pattern.d.ts.map +1 -0
  131. package/dist/dq/rules/pattern.js +31 -0
  132. package/dist/dq/rules/pattern.js.map +1 -0
  133. package/dist/dq/rules/types.d.ts +6 -0
  134. package/dist/dq/rules/types.d.ts.map +1 -0
  135. package/dist/dq/rules/types.js +4 -0
  136. package/dist/dq/rules/types.js.map +1 -0
  137. package/dist/dq/rules/ukPostcode.d.ts +7 -0
  138. package/dist/dq/rules/ukPostcode.d.ts.map +1 -0
  139. package/dist/dq/rules/ukPostcode.js +24 -0
  140. package/dist/dq/rules/ukPostcode.js.map +1 -0
  141. package/dist/dq/rules/unique.d.ts +14 -0
  142. package/dist/dq/rules/unique.d.ts.map +1 -0
  143. package/dist/dq/rules/unique.js +9 -0
  144. package/dist/dq/rules/unique.js.map +1 -0
  145. package/dist/dq/types.d.ts +29 -0
  146. package/dist/dq/types.d.ts.map +1 -0
  147. package/dist/dq/types.js +4 -0
  148. package/dist/dq/types.js.map +1 -0
  149. package/dist/enrich/types.d.ts +87 -0
  150. package/dist/enrich/types.d.ts.map +1 -0
  151. package/dist/enrich/types.js +4 -0
  152. package/dist/enrich/types.js.map +1 -0
  153. package/dist/index.d.ts +17 -0
  154. package/dist/index.d.ts.map +1 -0
  155. package/dist/index.js +17 -0
  156. package/dist/index.js.map +1 -0
  157. package/dist/merge/conflict-log.d.ts +9 -0
  158. package/dist/merge/conflict-log.d.ts.map +1 -0
  159. package/dist/merge/conflict-log.js +28 -0
  160. package/dist/merge/conflict-log.js.map +1 -0
  161. package/dist/merge/engine.d.ts +7 -0
  162. package/dist/merge/engine.d.ts.map +1 -0
  163. package/dist/merge/engine.js +19 -0
  164. package/dist/merge/engine.js.map +1 -0
  165. package/dist/merge/index.d.ts +11 -0
  166. package/dist/merge/index.d.ts.map +1 -0
  167. package/dist/merge/index.js +34 -0
  168. package/dist/merge/index.js.map +1 -0
  169. package/dist/merge/sql-builder.d.ts +19 -0
  170. package/dist/merge/sql-builder.d.ts.map +1 -0
  171. package/dist/merge/sql-builder.js +148 -0
  172. package/dist/merge/sql-builder.js.map +1 -0
  173. package/dist/merge/strategies/coalesce.d.ts +17 -0
  174. package/dist/merge/strategies/coalesce.d.ts.map +1 -0
  175. package/dist/merge/strategies/coalesce.js +77 -0
  176. package/dist/merge/strategies/coalesce.js.map +1 -0
  177. package/dist/merge/strategies/index.d.ts +5 -0
  178. package/dist/merge/strategies/index.d.ts.map +1 -0
  179. package/dist/merge/strategies/index.js +7 -0
  180. package/dist/merge/strategies/index.js.map +1 -0
  181. package/dist/merge/strategies/intersect.d.ts +17 -0
  182. package/dist/merge/strategies/intersect.d.ts.map +1 -0
  183. package/dist/merge/strategies/intersect.js +75 -0
  184. package/dist/merge/strategies/intersect.js.map +1 -0
  185. package/dist/merge/strategies/priority-override.d.ts +16 -0
  186. package/dist/merge/strategies/priority-override.d.ts.map +1 -0
  187. package/dist/merge/strategies/priority-override.js +78 -0
  188. package/dist/merge/strategies/priority-override.js.map +1 -0
  189. package/dist/merge/strategies/registry.d.ts +8 -0
  190. package/dist/merge/strategies/registry.d.ts.map +1 -0
  191. package/dist/merge/strategies/registry.js +19 -0
  192. package/dist/merge/strategies/registry.js.map +1 -0
  193. package/dist/merge/strategies/union.d.ts +15 -0
  194. package/dist/merge/strategies/union.d.ts.map +1 -0
  195. package/dist/merge/strategies/union.js +75 -0
  196. package/dist/merge/strategies/union.js.map +1 -0
  197. package/dist/merge/types.d.ts +24 -0
  198. package/dist/merge/types.d.ts.map +1 -0
  199. package/dist/merge/types.js +4 -0
  200. package/dist/merge/types.js.map +1 -0
  201. package/dist/multi-source-runner.d.ts +22 -0
  202. package/dist/multi-source-runner.d.ts.map +1 -0
  203. package/dist/multi-source-runner.js +398 -0
  204. package/dist/multi-source-runner.js.map +1 -0
  205. package/dist/plugins/index.d.ts +4 -0
  206. package/dist/plugins/index.d.ts.map +1 -0
  207. package/dist/plugins/index.js +5 -0
  208. package/dist/plugins/index.js.map +1 -0
  209. package/dist/plugins/loader.d.ts +22 -0
  210. package/dist/plugins/loader.d.ts.map +1 -0
  211. package/dist/plugins/loader.js +151 -0
  212. package/dist/plugins/loader.js.map +1 -0
  213. package/dist/plugins/registry.d.ts +25 -0
  214. package/dist/plugins/registry.d.ts.map +1 -0
  215. package/dist/plugins/registry.js +42 -0
  216. package/dist/plugins/registry.js.map +1 -0
  217. package/dist/plugins/types.d.ts +61 -0
  218. package/dist/plugins/types.d.ts.map +1 -0
  219. package/dist/plugins/types.js +4 -0
  220. package/dist/plugins/types.js.map +1 -0
  221. package/dist/runner.d.ts +97 -0
  222. package/dist/runner.d.ts.map +1 -0
  223. package/dist/runner.js +520 -0
  224. package/dist/runner.js.map +1 -0
  225. package/dist/staging/index.d.ts +3 -0
  226. package/dist/staging/index.d.ts.map +1 -0
  227. package/dist/staging/index.js +5 -0
  228. package/dist/staging/index.js.map +1 -0
  229. package/dist/staging/schema.d.ts +19 -0
  230. package/dist/staging/schema.d.ts.map +1 -0
  231. package/dist/staging/schema.js +15 -0
  232. package/dist/staging/schema.js.map +1 -0
  233. package/dist/staging/store.d.ts +71 -0
  234. package/dist/staging/store.d.ts.map +1 -0
  235. package/dist/staging/store.js +270 -0
  236. package/dist/staging/store.js.map +1 -0
  237. package/dist/transform/cleanse.d.ts +2 -0
  238. package/dist/transform/cleanse.d.ts.map +1 -0
  239. package/dist/transform/cleanse.js +59 -0
  240. package/dist/transform/cleanse.js.map +1 -0
  241. package/dist/transform/engine.d.ts +10 -0
  242. package/dist/transform/engine.d.ts.map +1 -0
  243. package/dist/transform/engine.js +225 -0
  244. package/dist/transform/engine.js.map +1 -0
  245. package/dist/transform/expression.d.ts +5 -0
  246. package/dist/transform/expression.d.ts.map +1 -0
  247. package/dist/transform/expression.js +52 -0
  248. package/dist/transform/expression.js.map +1 -0
  249. package/dist/transform/index.d.ts +6 -0
  250. package/dist/transform/index.d.ts.map +1 -0
  251. package/dist/transform/index.js +7 -0
  252. package/dist/transform/index.js.map +1 -0
  253. package/dist/transform/lookup.d.ts +10 -0
  254. package/dist/transform/lookup.d.ts.map +1 -0
  255. package/dist/transform/lookup.js +66 -0
  256. package/dist/transform/lookup.js.map +1 -0
  257. package/dist/transform/types.d.ts +10 -0
  258. package/dist/transform/types.d.ts.map +1 -0
  259. package/dist/transform/types.js +4 -0
  260. package/dist/transform/types.js.map +1 -0
  261. package/dist/utils/env.d.ts +3 -0
  262. package/dist/utils/env.d.ts.map +1 -0
  263. package/dist/utils/env.js +26 -0
  264. package/dist/utils/env.js.map +1 -0
  265. package/dist/utils/errors.d.ts +26 -0
  266. package/dist/utils/errors.d.ts.map +1 -0
  267. package/dist/utils/errors.js +39 -0
  268. package/dist/utils/errors.js.map +1 -0
  269. package/dist/utils/index.d.ts +5 -0
  270. package/dist/utils/index.d.ts.map +1 -0
  271. package/dist/utils/index.js +7 -0
  272. package/dist/utils/index.js.map +1 -0
  273. package/dist/utils/logger.d.ts +14 -0
  274. package/dist/utils/logger.d.ts.map +1 -0
  275. package/dist/utils/logger.js +16 -0
  276. package/dist/utils/logger.js.map +1 -0
  277. package/dist/utils/progress.d.ts +66 -0
  278. package/dist/utils/progress.d.ts.map +1 -0
  279. package/dist/utils/progress.js +283 -0
  280. package/dist/utils/progress.js.map +1 -0
  281. package/package.json +92 -0
package/README.md ADDED
@@ -0,0 +1,582 @@
1
+ ![Sluice](./images/sluice_banner.png)
2
+
3
+ > *"A sluice is a channel that controls the flow of water. Sluice is a toolkit that controls the flow of data. Except data doesn't flood your basement. Usually."*
4
+
5
+ **`@caracal-lynx/sluice`** — a config-driven ETL toolkit for ERP data migrations, built by [Caracal Lynx Ltd.](https://caracallynx.com).
6
+
7
+ [![Node 24](https://img.shields.io/badge/Node-24_LTS-green)](https://nodejs.org)
8
+ [![TypeScript](https://img.shields.io/badge/TypeScript-6.x-blue)](https://www.typescriptlang.org)
9
+ [![License](https://img.shields.io/badge/license-Elastic_2.0-blue)](LICENSE)
10
+
11
+ ---
12
+
13
+ ## 🤔 What is this thing?
14
+
15
+ ![Gold Sluice](./images/sluice-for-gold.jpg)
16
+
17
+ Sluice takes the pain out of ERP data migrations. You know the drill — a client has 20 years of customer records in a legacy SQL database, and they need them in a shiny new ERP system by Monday. The data is a mess, the field names are cryptic, and someone has helpfully stored postcodes in a column called `ADDR5`.
18
+
19
+ Sluice lets you describe the entire migration as a **YAML pipeline config** — where to get the data, what quality rules to enforce, how to transform the fields, and where to load the result. The engine is written once; every client engagement is just a folder of YAML files.
20
+
21
+ **No UI. No server. No cloud dependency.** Just the `sluice` CLI, TypeScript modules, and a strong cup of tea. ☕
22
+
23
+ ---
24
+
25
+ ## ✨ What it does
26
+
27
+ The data flows through four stages — like water through a sluice gate:
28
+
29
+ ```
30
+ 💾 Source(s) 🔍 Data Quality ✨ Transform 🎯 Target
31
+ ───────────────── β†’ ───────────────── β†’ ───────────────── β†’ ─────────────────
32
+ MSSQL / CSV / Validate rules Map fields Business Central
33
+ XLSX / REST / Reject bad rows Apply lookups IFS ERP
34
+ PostgreSQL Write DQ report Cleanse values BlueCherry ERP
35
+ Evaluate expressions CSV / PostgreSQL
36
+ (1..N sources)
37
+ ↓
38
+ πŸ”€ Optional Merge
39
+ coalesce, union,
40
+ intersect, priority
41
+ ```
42
+
43
+ Under the bonnet, all extracted data passes through a **local DuckDB staging store** before being transformed and loaded. Think of it as a staging area where data sits while it gets its act together before being presented to the target ERP. 🦆
44
+
45
+ Pipelines can be **single-source** (one YAML per entity, one `source:` block) or **multi-source** — 2+ sources merged on a key column using one of four built-in strategies before DQ and transform run. See [Multi-Source Merge](#-multi-source-merge) below.
46
+
47
+ ---
48
+
49
+ ## πŸ—οΈ Architecture
50
+
51
+ ### Single-source pipeline
52
+
53
+ ```mermaid
54
+ flowchart LR
55
+ A[πŸ“„ Pipeline YAML] --> B[βš™οΈ Config Loader<br/>Zod validation<br/>ENV var resolution<br/>Composite rule expansion]
56
+ B --> C[πŸ”Œ Source Adapter<br/>mssql / pg / csv<br/>xlsx / rest]
57
+ C --> D[(πŸ¦† DuckDB<br/>stg_raw)]
58
+ D --> E[πŸ” DQ Engine<br/>Rules validation<br/>Rejection report]
59
+ E --> F[✨ Transform Engine<br/>Field mapping<br/>Lookup resolution<br/>Cleanse ops<br/>Custom plugins]
60
+ F --> G[(πŸ¦† DuckDB<br/>stg_transformed)]
61
+ G --> H[🎯 Target Adapter<br/>bc / ifs / bluecherry<br/>csv / pg]
62
+ H --> I[πŸ“¦ Output<br/>CSV / REST / DB]
63
+ E -->|❌ critical failures| J[πŸ›‘ Pipeline halted<br/>dq-summary.json<br/>rejected.csv]
64
+ ```
65
+
66
+ ### Multi-source pipeline
67
+
68
+ ```mermaid
69
+ flowchart LR
70
+ A[πŸ“„ Pipeline YAML<br/>sources + merge] --> B[βš™οΈ Config Loader]
71
+ B --> C1[πŸ”Œ Source 1]
72
+ B --> C2[πŸ”Œ Source 2]
73
+ B --> C3[πŸ”Œ Source N]
74
+ C1 --> D1[(πŸ¦† stg_raw_src1<br/>+ rename + per-source DQ)]
75
+ C2 --> D2[(πŸ¦† stg_raw_src2<br/>+ rename + per-source DQ)]
76
+ C3 --> D3[(πŸ¦† stg_raw_srcN<br/>+ rename + per-source DQ)]
77
+ D1 --> M[πŸ”€ MergeEngine<br/>coalesce / union<br/>intersect / priority-override]
78
+ D2 --> M
79
+ D3 --> M
80
+ M --> G[(πŸ¦† stg_merged<br/>+ stg_merge_conflicts.csv)]
81
+ G --> E[πŸ” Post-merge DQ]
82
+ E --> F[✨ Transform β†’ stg_transformed]
83
+ F --> H[🎯 Target Adapter]
84
+ H --> I[πŸ“¦ Output]
85
+ ```
86
+
87
+ ---
88
+
89
+ ## 🧰 Tech Stack
90
+
91
+ | What | Package | Why |
92
+ |------|---------|-----|
93
+ | πŸ”€ Language | TypeScript 5.x `strict` | Because `any` is a cry for help |
94
+ | 🟒 Runtime | Node.js 24 LTS | Active LTS until April 2028; OpenSSL 3.5; ESM-stable |
95
+ | πŸ“‹ Config | `js-yaml` + `zod` | YAML in, typed objects out |
96
+ | πŸ—„οΈ SQL Server | `mssql` | Because the legacy DB is always SQL Server |
97
+ | 📊 Staging | `@duckdb/node-api` (embedded) | Promise-native, ABI-stable — no server, no `npm rebuild` after Node version bumps |
98
+ | πŸ“ CSV | `csv-parse` + `csv-stringify` | Streaming, handles BOM, the works |
99
+ | 📈 Excel | `xlsx` (SheetJS) | Read-only — we're migrating away from it, after all |
100
+ | 🌐 HTTP | `axios` + `axios-retry` | 3 retries, exponential backoff, rate limit respect |
101
+ | πŸ“… Dates | `dayjs` | Because time zones are already somebody else's problem |
102
+ | πŸ–₯️ CLI | `commander` v12 | Clean commands, sane flags |
103
+ | πŸ“ Logging | `pino` | Structured JSON logs β€” pretty in dev, parseable in CI |
104
+ | πŸ§ͺ Testing | `vitest` | Not Jest. Never Jest. |
105
+ | 🔢 Expressions | `expr-eval` | Safe expression parsing — no `eval()` here, thank you very much |
106
+
107
+ ---
108
+
109
+ ## πŸš€ Quick Start
110
+
111
+ ```bash
112
+ # Install
113
+ npm install @caracal-lynx/sluice
114
+
115
+ # Check a pipeline config is valid (no data touched)
116
+ sluice check customers.pipeline.yaml
117
+
118
+ # Run DQ and transform but don't write output
119
+ sluice validate customers.pipeline.yaml
120
+
121
+ # Go for it πŸš€
122
+ sluice run customers.pipeline.yaml
123
+
124
+ # Profile source data (column stats, no DQ)
125
+ sluice profile customers.pipeline.yaml
126
+
127
+ # Inspect loaded plugins and merge strategies
128
+ sluice plugins
129
+ sluice merge list-strategies
130
+ sluice merge info coalesce
131
+ ```
132
+
133
+ ### CLI flags
134
+
135
+ | Flag | What it does |
136
+ |------|-------------|
137
+ | `--log-level debug\|info\|warn\|error` | How chatty do you want the logs? |
138
+ | `--env <file>` | Path to your `.env` file (default: `./.env`) |
139
+ | `--output <dir>` | Override the output directory |
140
+ | `--plugins <dir...>` | Load additional plugin directories (alongside the pipeline `plugins/` folder) |
141
+ | `--dry-run` | Extract + DQ + transform, but don't write a single byte to the target |
142
+
143
+ When multiple plugin directories resolve to the same absolute path (for example,
144
+ passing `--plugins ./plugins` when the pipeline's own `plugins/` folder is already loaded), Sluice de-duplicates them before loading.
145
+
146
+ ### Exit codes
147
+
148
+ | Code | Meaning |
149
+ |------|---------|
150
+ | `0` | βœ… All good |
151
+ | `1` | ❌ Pipeline error |
152
+ | `2` | πŸ›‘ Critical DQ violations halted the pipeline |
153
+ | `3` | πŸ“‹ Config validation failed |
154
+
155
+ ---
156
+
157
+ ## πŸ“„ Pipeline Config Format
158
+
159
+ Each migration entity gets its own YAML file. One entity, one file. Nice and tidy.
160
+
161
+ ```
162
+ πŸ’‘ One YAML file = one migrated entity
163
+ (customers, items, vendors, styles, purchase orders, etc.)
164
+ ```
165
+
166
+ A single-source pipeline has five main sections, plus an optional `run` section:
167
+
168
+ ```yaml
169
+ pipeline: { name, client, version, entity, description }
170
+ source: { adapter, connection/file/endpoint, ... }
171
+ dq: { rules, stopOnCritical, rejectionFile }
172
+ transform: { lookups, fields }
173
+ target: { adapter, output/baseUrl, ... }
174
+ run: { mode, batchSize, logLevel, dryRun, ... } # all optional
175
+ ```
176
+
177
+ A multi-source pipeline swaps `source:` for `sources:` + `merge:`:
178
+
179
+ ```yaml
180
+ pipeline: { ... }
181
+ sources: [ { id, priority, adapter, ..., rename? }, ... ] # 2+ entries
182
+ merge: { key, strategy, onUnmatched, fieldStrategies, conflictLog, incrementalSource? }
183
+ dq: { ... } # rules can be scoped via sourceId
184
+ transform: { ... }
185
+ target: { ... }
186
+ run: { ... }
187
+ ```
188
+
189
+ `PipelineSchema` requires *either* `source:` (single-source) *or* `sources:` together with `merge:` (multi-source) — never a mix of the two shapes. The CLI auto-routes based on which shape the YAML has, so there's no flag to remember.
190
+
191
+ ### πŸ“₯ Source Adapters
192
+
193
+ | Adapter | Use when... |
194
+ |---------|-------------|
195
+ | `mssql` | The legacy system is SQL Server (it's always SQL Server) |
196
+ | `pg` | The legacy system is PostgreSQL (you lucky thing) |
197
+ | `csv` | Someone emailed you a CSV export at 11pm the night before go-live |
198
+ | `xlsx` | Same as above but Excel, complete with merged cells and mystery formatting |
199
+ | `rest` | The source system has an API! Progress! |
200
+
201
+ ### 🎯 Target Adapters
202
+
203
+ | Adapter | Loads to... |
204
+ |---------|-------------|
205
+ | `bc` | Microsoft Dynamics 365 Business Central (via OData REST + OAuth2) |
206
+ | `ifs` | IFS ERP (via fixed-format CSV import β€” no header, specific column order) |
207
+ | `bluecherry` | BlueCherry ERP / CGS (CSV import, US-format dates, headers required) |
208
+ | `csv` | Generic CSV β€” for anything else or for manual inspection |
209
+ | `pg` | PostgreSQL β€” useful for intermediate staging or custom targets |
210
+
211
+ ### πŸ” Data Quality Rules
212
+
213
+ Nine built-in rule types, configurable per field:
214
+
215
+ ```yaml
216
+ dq:
217
+ stopOnCritical: true
218
+ rules:
219
+ - field: CUST_CODE
220
+ checks:
221
+ - { type: notNull, severity: critical } # πŸ’₯ stops the pipeline
222
+ - { type: unique, severity: critical }
223
+ - { type: pattern, value: "^[A-Z0-9]{3,10}$", severity: warning }
224
+
225
+ - field: EMAIL
226
+ checks:
227
+ - { type: email, severity: warning } # ⚠️ flagged but not rejected
228
+
229
+ - field: POST_CODE
230
+ checks:
231
+ - { type: ukPostcode, severity: warning } # πŸ‡¬πŸ‡§ all UK formats
232
+ ```
233
+
234
+ | Rule | What it checks |
235
+ |------|---------------|
236
+ | `notNull` | Not null, not empty, not just whitespace |
237
+ | `unique` | No duplicates across the whole dataset |
238
+ | `pattern` | ECMAScript regex |
239
+ | `email` | RFC 5322-ish email validation |
240
+ | `ukPostcode` | All current UK postcode formats |
241
+ | `maxLength` | String length cap |
242
+ | `min` / `max` | Numeric range |
243
+ | `allowedValues` | Enum-style allowed value list |
244
+
245
+ Severity levels: `critical` (row rejected, pipeline can halt) · `warning` (flagged in report, row kept) · `info` (summary only)
246
+
247
+ ### ✨ Transform: Field Mapping Types
248
+
249
+ | Type | What it does |
250
+ |------|-------------|
251
+ | `string` | Cast + optional cleanse ops + optional truncation |
252
+ | `number` | Integer coercion (NaN = error) |
253
+ | `decimal` | Fixed-precision decimal stored as string |
254
+ | `boolean` | `'1','true','yes','y','t'` → true. Everything else → false |
255
+ | `date` | Parse source date, output in target format |
256
+ | `lookup` | Resolve via a CSV or SQL lookup table |
257
+ | `concat` | Join multiple source fields with a separator |
258
+ | `constant` | Emit a fixed value (e.g. `CustomerGroup: DOMESTIC`) |
259
+ | `expression` | Evaluate an expression against the source row |
260
+ | `custom` | Delegate to a `TransformPlugin` via `customOp` (Phase 2) |
261
+
262
+ ### 🧹 Cleanse Operations
263
+
264
+ Pipe-chain them: `cleanse: trim|titleCase|normaliseUnicode`
265
+
266
+ | Op | Before | After |
267
+ |----|--------|-------|
268
+ | `trim` | `" hello "` | `"hello"` |
269
+ | `uppercase` | `"hello"` | `"HELLO"` |
270
+ | `lowercase` | `"HELLO"` | `"hello"` |
271
+ | `titleCase` | `"john smith"` | `"John Smith"` |
272
+ | `stripNonAlpha` | `"AB-12!"` | `"AB"` |
273
+ | `stripNonNumeric` | `"AB-12!"` | `"12"` |
274
+ | `padStart:6:0` | `"42"` | `"000042"` |
275
+ | `nullIfEmpty` | `""` | `null` |
276
+ | `normaliseUnicode` | `"café"` | `"cafe"` |
277
+ | `normaliseQuotes` | `"it’s"` | `"it's"` |
278
+
279
+ ---
280
+
281
+ ## πŸ“ Repository Structure
282
+
283
+ ```
284
+ sluice/
285
+ β”œβ”€β”€ src/
286
+ β”‚ β”œβ”€β”€ cli.ts ← CLI entry point (commander)
287
+ β”‚ β”œβ”€β”€ runner.ts ← PipelineRunner β€” single-source orchestration
288
+ β”‚ β”œβ”€β”€ multi-source-runner.ts ← MultiSourcePipelineRunner (Phase 3)
289
+ β”‚ β”œβ”€β”€ config/ ← Zod schema, YAML loader, ENV var + composite expansion
290
+ β”‚ β”œβ”€β”€ adapters/
291
+ β”‚ β”‚ β”œβ”€β”€ source/ ← mssql, pg, csv, xlsx, rest
292
+ β”‚ β”‚ └── target/ ← bc, ifs, bluecherry, csv, pg
293
+ β”‚ β”œβ”€β”€ staging/ ← DuckDB wrapper (stg_raw β†’ stg_merged β†’ stg_transformed)
294
+ β”‚ β”œβ”€β”€ dq/ ← DQ engine, rules, rejection reporter
295
+ β”‚ β”œβ”€β”€ transform/ ← Transform engine, lookup resolver, cleanse ops
296
+ β”‚ β”œβ”€β”€ merge/ ← MergeEngine, SQL builder, 4 built-in strategies
297
+ β”‚ β”œβ”€β”€ plugins/ ← Rule/Transform/Merge registries + file & npm loaders
298
+ β”‚ └── utils/ ← logger (pino), errors, env helpers
299
+ β”œβ”€β”€ tests/
300
+ β”‚ β”œβ”€β”€ fixtures/ ← sample pipeline YAMLs, CSV/rules data, plugin files
301
+ β”‚ β”œβ”€β”€ unit/ ← unit tests (all I/O mocked)
302
+ β”‚ └── integration/ ← real DuckDB :memory: + CSV fixtures
303
+ └── clients/ ← πŸ™ˆ gitignored β€” each client has their own repo
304
+ β”œβ”€β”€ cochran/ ← Cochran Group (Annan) pipelines
305
+ └── eribe/ ← EribΓ© Knitwear pipelines
306
+ ```
307
+
308
+ ---
309
+
310
+ ## βš™οΈ Environment Variables
311
+
312
+ Connection strings and credentials live in `.env` (never in YAML files, never in Git).
313
+
314
+ ```bash
315
+ # .env
316
+ COCHRAN_MSSQL=mssql://user:password@server.cochran.local/LegacyDB
317
+ BC_BASE_URL=https://api.businesscentral.dynamics.com/v2.0
318
+ BC_TENANT_ID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
319
+ BC_CLIENT_ID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
320
+ BC_CLIENT_SECRET=your-secret-here
321
+ BC_COMPANY=Example Company Ltd
322
+ ```
323
+
324
+ Reference them in YAML with `${ENV_VAR}` — resolved at runtime, never stored in config:
325
+
326
+ ```yaml
327
+ source:
328
+ adapter: mssql
329
+ connection: ${COCHRAN_MSSQL}
330
+ ```
331
+
332
+ ---
333
+
334
+ ## 🧩 Phase 2: Extension System
335
+
336
+ Phase 2 adds a three-tier plugin system so you can extend Sluice without touching the core engine.
337
+
338
+ ### Tier 1 β€” Composite Rules (YAML) πŸ“‹
339
+
340
+ Name a bundle of checks in a shared rules file and reference them like built-ins:
341
+
342
+ ```yaml
343
+ # shared/rules.yaml
344
+ rules:
345
+ - id: eribeStyleNo
346
+ checks:
347
+ - { type: notNull, severity: critical }
348
+ - { type: pattern, value: "^[A-Z]{2}[0-9]{4}$", severity: critical }
349
+ - { type: maxLength, value: 6, severity: critical }
350
+ ```
351
+
352
+ ```yaml
353
+ # In your pipeline:
354
+ dq:
355
+ rulesFile: ../../shared/rules.yaml
356
+ rules:
357
+ - field: STYLE_NO
358
+ checks:
359
+ - { type: eribeStyleNo } # expands to the three checks above ✨
360
+ ```
361
+
362
+ ### Tier 2 β€” Plugin Files (TypeScript) πŸ”Œ
363
+
364
+ Drop a `*.rule.ts`, `*.transform.ts`, or `*.merge.ts` file into a `plugins/` folder next to your pipeline YAMLs. Auto-discovered at startup:
365
+
366
+ ```typescript
367
+ // plugins/ukVatNumber.rule.ts
368
+ export const rule: RulePlugin = {
369
+ id: 'ukVatNumber',
370
+ validate(value, config, rowIndex, field) {
371
+ const valid = /^GB([0-9]{9}|[0-9]{12}|(GD|HA)[0-9]{3})$/.test(String(value));
372
+ return valid ? null : { field, rowIndex, value, rule: 'ukVatNumber',
373
+ severity: config.severity, message: 'Invalid UK VAT number' };
374
+ }
375
+ };
376
+ ```
377
+
378
+ ### Tier 3 β€” npm Packages πŸ“¦
379
+
380
+ When plugins are useful across multiple clients, promote them to scoped npm packages and declare them in `sluice.config.yaml`:
381
+
382
+ ```yaml
383
+ # sluice.config.yaml
384
+ plugins:
385
+ - package: "@caracal-lynx/etl-rules-uk"
386
+ - package: "@caracal-lynx/etl-rules-fashion"
387
+ - package: "@caracal-lynx/etl-transform-ifs"
388
+ ```
389
+
390
+ All three tiers use the same registry interfaces and are invoked identically by the engines. The engine doesn't know or care which tier a rule came from. 🤷
391
+
392
+ ### List Loaded Plugins
393
+
394
+ ```bash
395
+ sluice plugins
396
+
397
+ # Include extra plugin directories outside the pipeline folder
398
+ sluice plugins --plugins ./shared/plugins ./team/plugins
399
+ ```
400
+
401
+ Output:
402
+ ```
403
+ πŸ“‹ Data Quality Rules:
404
+ β€’ ukVatNumber
405
+ β€’ bcAccountCode
406
+ β€’ iso8601Date
407
+
408
+ πŸ”„ Transform Operations:
409
+ β€’ slugGenerator
410
+ β€’ normalizeCompanyName
411
+ β€’ fixedDecimal
412
+
413
+ πŸ”€ Merge Strategies:
414
+ β€’ coalesce
415
+ β€’ priority-override
416
+ β€’ union
417
+ β€’ intersect
418
+ ```
419
+
420
+ ### Getting Started with Plugins
421
+
422
+ Detailed guide: **[PLUGINS.md](./PLUGINS.md)**
423
+
424
+ - Create a custom DQ rule
425
+ - Create a custom transform operation
426
+ - Create a custom merge strategy
427
+ - Package plugins as npm packages
428
+ - Test and debug plugins
429
+ - Real-world examples
430
+
431
+ ---
432
+
433
+ ## πŸ”€ Multi-Source Merge
434
+
435
+ Phase 3 lets a single pipeline extract from **2+ sources** and merge them on a key column before DQ and transform. Useful when the master record for an entity is scattered across systems — master data in SQL Server, pricing enrichment in an Excel sheet, product descriptions in Odoo, and so on.
436
+
437
+ ### Built-in merge strategies
438
+
439
+ | Strategy | Behaviour | When to use |
440
+ |---|---|---|
441
+ | `coalesce` | First non-null value wins (priority-ordered; whitespace treated as blank) | Enriching a primary source with fallback data from lower-priority sources |
442
+ | `priority-override` | Highest-priority source wins, even if null or blank | Strict priority — the trusted source is the trusted source, full stop |
443
+ | `union` | All rows from all sources, deduplicated by key | Combining independent datasets (e.g. multi-warehouse inventory) |
444
+ | `intersect` | Only rows present in **all** sources | Reconciliation / "find the records that agree" |
445
+
446
+ Custom strategies can be dropped in as `*.merge.ts` plugins or shipped as npm packages — same three-tier model as DQ rules and transforms.
447
+
448
+ ### A minimal multi-source pipeline
449
+
450
+ ```yaml
451
+ pipeline:
452
+ name: eribe-products-merged
453
+ client: eribe-knitwear
454
+ version: "1.0"
455
+ entity: Style
456
+
457
+ sources:
458
+ - id: sql-server # staging table: stg_raw_sql-server
459
+ priority: 1 # lower = higher precedence
460
+ adapter: mssql
461
+ connection: ${ERIBE_MSSQL}
462
+ query: "SELECT STYLE_NO, STYLE_DESC, COST_PRICE FROM dbo.Styles WHERE Active = 1"
463
+
464
+ - id: excel
465
+ priority: 2
466
+ adapter: xlsx
467
+ file: ./data/product-data.xlsx
468
+ sheet: "Products"
469
+ rename: # applied in-place after extract, before DQ
470
+ Style Number: STYLE_NO
471
+ Description: STYLE_DESC
472
+ Fibre: FIBRE_CONTENT
473
+
474
+ merge:
475
+ key: STYLE_NO # single column or array for composite keys
476
+ strategy: coalesce
477
+ onUnmatched: include # include | exclude | warn | error
478
+ fieldStrategies: # per-field overrides
479
+ - { field: FIBRE_CONTENT, source: excel } # pin to one source
480
+ - { field: COST_PRICE, strategy: priority-override }
481
+ conflictLog: ./output/eribe-products-conflicts.csv # optional CSV of field disagreements
482
+
483
+ dq:
484
+ stopOnCritical: true
485
+ rules:
486
+ - field: STYLE_NO # 🎯 pre-merge: scoped to one source
487
+ sourceId: sql-server
488
+ checks: [ { type: notNull, severity: critical }, { type: unique, severity: critical } ]
489
+ - field: STYLE_DESC # 🎯 post-merge: runs against stg_merged
490
+ checks: [ { type: notNull, severity: critical } ]
491
+
492
+ transform: { ... }
493
+ target: { ... }
494
+ ```
495
+
496
+ Pre-merge rules (`sourceId: …`) run against each source's staging table before merging and generate per-source rejection CSVs (suffixed `-{sourceId}`). Post-merge rules (no `sourceId`) run once against `stg_merged`.
497
+
498
+ ### Incremental multi-source
499
+
500
+ ```yaml
501
+ merge:
502
+ incrementalSource: sql-server # must match a source id; required in incremental mode
503
+ run:
504
+ mode: incremental
505
+ incrementalField: UPDATED_AT
506
+ ```
507
+
508
+ Only the named source is filtered by timestamp; other sources run full each time. The state file gains a per-source `sources` block tracking each source's last run time.
509
+
510
+ ### Inspect merge strategies
511
+
512
+ ```bash
513
+ sluice merge list-strategies # ids + descriptions for all registered strategies
514
+ sluice merge info coalesce # details for one strategy
515
+ ```
516
+
517
+ A full working example lives at [tests/fixtures/eribe-products-merged.pipeline.yaml](tests/fixtures/eribe-products-merged.pipeline.yaml).
518
+
519
+ ---
520
+
521
+ ## 🀝 Known Clients
522
+
523
+ | Client | Source | Target | Adapter |
524
+ |--------|--------|--------|---------|
525
+ | Cochran Group (Annan) | MSSQL legacy DB | IFS ERP | `ifs` |
526
+ | Eribé Knitwear | MSSQL / CSV exports | BlueCherry ERP | `bluecherry` |
527
+
528
+ ---
529
+
530
+ ## 🧪 Testing
531
+
532
+ ```bash
533
+ npm test # run tests once
534
+ npm run test:watch # watch mode (great for TDD)
535
+ npm run test:cov # with coverage report
536
+ ```
537
+
538
+ - **Unit tests** mock all I/O with `vi.mock` — no live databases required
539
+ - **Integration tests** use real DuckDB (`:memory:`) with CSV fixtures
540
+ - Target: 80% line coverage across `src/dq/` and `src/transform/`
541
+ - CI runs on `ubuntu-latest` via GitHub Actions
542
+
543
+ ---
544
+
545
+ ## 🏗️ Development
546
+
547
+ ```bash
548
+ npm run build # tsc compile
549
+ npm run dev # tsx watch src/cli.ts (live reload)
550
+ npm run lint # eslint
551
+ npm run format # prettier
552
+
553
+ # Pretty logs in dev:
554
+ npm run dev -- run customers.pipeline.yaml | npx pino-pretty
555
+ ```
556
+
557
+ > **Note:** Uses `tsx`, not `ts-node`. Path aliases work correctly on Windows without extra configuration. 🪟
558
+
559
+ ---
560
+
561
+ ## 🚫 Things Sluice Is Not
562
+
563
+ - ❌ A web application or dashboard (there's no UI — this is a good thing)
563
+ - ❌ A streaming / real-time ingestion platform
564
+ - ❌ A data warehouse
565
+ - ❌ A multi-tenant SaaS product
566
+ - ❌ An excuse to use `eval()` anywhere
568
+
569
+ ---
570
+
571
+ ## 📦 Package Info
572
+
573
+ ```
574
+ npm package: @caracal-lynx/sluice
575
+ owner: Caracal Lynx Ltd. (SC826823)
576
+ author: Michael Scott
577
+ maintainers: Michael Scott, Carolyn Scott, Andrew Scott, Duncan Scott
578
+ ```
579
+
580
+ ---
581
+
582
+ *Clean data flows through.* 💧
@@ -0,0 +1,10 @@
1
+ import type { RunConfig, SourceConfig } from '../../config/types.js';
2
+ import type { StagingStore } from '../../staging/index.js';
3
+ import type { ExtractResult, SourceAdapter } from './types.js';
4
+ export declare class CsvSourceAdapter implements SourceAdapter {
5
+ readonly id = "csv";
6
+ connect(_config: SourceConfig): Promise<void>;
7
+ disconnect(): Promise<void>;
8
+ extract(config: SourceConfig, store: StagingStore, runConfig: RunConfig, onProgress: (rows: number) => void, targetTable?: string): Promise<ExtractResult>;
9
+ }
10
+ //# sourceMappingURL=csv.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"csv.d.ts","sourceRoot":"","sources":["../../../src/adapters/source/csv.ts"],"names":[],"mappings":"AAwBA,OAAO,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AACrE,OAAO,KAAK,EAAc,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGvE,OAAO,KAAK,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AA+B/D,qBAAa,gBAAiB,YAAW,aAAa;IACpD,QAAQ,CAAC,EAAE,SAAS;IAEd,OAAO,CAAC,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IAI7C,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAI3B,OAAO,CACX,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,YAAY,EACnB,SAAS,EAAE,SAAS,EACpB,UAAU,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,EAClC,WAAW,SAAY,GACtB,OAAO,CAAC,aAAa,CAAC;CA6D1B"}