@murumets-ee/imports 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +94 -0
- package/dist/index.d.mts +3 -0
- package/dist/index.mjs +1 -0
- package/dist/plugin.d.mts +38 -0
- package/dist/plugin.d.mts.map +1 -0
- package/dist/plugin.mjs +2 -0
- package/dist/plugin.mjs.map +1 -0
- package/dist/runner-DdhiNybk.mjs +2 -0
- package/dist/runner-DdhiNybk.mjs.map +1 -0
- package/dist/storage-resolver.d.mts +17 -0
- package/dist/storage-resolver.d.mts.map +1 -0
- package/dist/storage-resolver.mjs +2 -0
- package/dist/storage-resolver.mjs.map +1 -0
- package/dist/transform-BUGBTotp.mjs +2 -0
- package/dist/transform-BUGBTotp.mjs.map +1 -0
- package/dist/transform-D_uhdLeo.d.mts +119 -0
- package/dist/transform-D_uhdLeo.d.mts.map +1 -0
- package/dist/transforms.d.mts +57 -0
- package/dist/transforms.d.mts.map +1 -0
- package/dist/transforms.mjs +2 -0
- package/dist/transforms.mjs.map +1 -0
- package/dist/worker-DerGVTSI.d.mts +467 -0
- package/dist/worker-DerGVTSI.d.mts.map +1 -0
- package/dist/worker.d.mts +2 -0
- package/dist/worker.mjs +2 -0
- package/dist/worker.mjs.map +1 -0
- package/package.json +58 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Elastic License 2.0 (ELv2)
|
|
2
|
+
|
|
3
|
+
URL: https://www.elastic.co/licensing/elastic-license
|
|
4
|
+
|
|
5
|
+
## Acceptance
|
|
6
|
+
|
|
7
|
+
By using the software, you agree to all of the terms and conditions below.
|
|
8
|
+
|
|
9
|
+
## Copyright License
|
|
10
|
+
|
|
11
|
+
The licensor grants you a non-exclusive, royalty-free, worldwide,
|
|
12
|
+
non-sublicensable, non-transferable license to use, copy, distribute, make
|
|
13
|
+
available, and prepare derivative works of the software, in each case subject
|
|
14
|
+
to the limitations and conditions below.
|
|
15
|
+
|
|
16
|
+
## Limitations
|
|
17
|
+
|
|
18
|
+
You may not provide the software to third parties as a hosted or managed
|
|
19
|
+
service, where the service provides users with access to any substantial set
|
|
20
|
+
of the features or functionality of the software.
|
|
21
|
+
|
|
22
|
+
You may not move, change, disable, or circumvent the license key functionality
|
|
23
|
+
in the software, and you may not remove or obscure any functionality in the
|
|
24
|
+
software that is protected by the license key.
|
|
25
|
+
|
|
26
|
+
You may not alter, remove, or obscure any licensing, copyright, or other
|
|
27
|
+
notices of the licensor in the software. Any use of the licensor's trademarks
|
|
28
|
+
is subject to applicable law.
|
|
29
|
+
|
|
30
|
+
## Patents
|
|
31
|
+
|
|
32
|
+
The licensor grants you a license, under any patent claims the licensor can
|
|
33
|
+
license, or becomes able to license, to make, have made, use, sell, offer for
|
|
34
|
+
sale, import and have imported the software, in each case subject to the
|
|
35
|
+
limitations and conditions in this license. This license does not cover any
|
|
36
|
+
patent claims that you cause to be infringed by modifications or additions to
|
|
37
|
+
the software. If you or your company make any written claim that the software
|
|
38
|
+
infringes or contributes to infringement of any patent, your patent license
|
|
39
|
+
for the software granted under these terms ends immediately. If your company
|
|
40
|
+
makes such a claim, your patent license ends immediately for work on behalf
|
|
41
|
+
of your company.
|
|
42
|
+
|
|
43
|
+
## Notices
|
|
44
|
+
|
|
45
|
+
You must ensure that anyone who gets a copy of any part of the software from
|
|
46
|
+
you also gets a copy of these terms.
|
|
47
|
+
|
|
48
|
+
If you modify the software, you must include in any modified copies of the
|
|
49
|
+
software prominent notices stating that you have modified the software.
|
|
50
|
+
|
|
51
|
+
## No Other Rights
|
|
52
|
+
|
|
53
|
+
These terms do not imply any licenses other than those expressly granted in
|
|
54
|
+
these terms.
|
|
55
|
+
|
|
56
|
+
## Termination
|
|
57
|
+
|
|
58
|
+
If you use the software in violation of these terms, such use is not licensed,
|
|
59
|
+
and your licenses will automatically terminate. If the licensor provides you
|
|
60
|
+
with a notice of your violation, and you cease all violation of this license
|
|
61
|
+
no later than 30 days after you receive that notice, your licenses will be
|
|
62
|
+
reinstated retroactively. However, if you violate these terms after such
|
|
63
|
+
reinstatement, any additional violation of these terms will cause your
|
|
64
|
+
licenses to terminate automatically and permanently.
|
|
65
|
+
|
|
66
|
+
## No Liability
|
|
67
|
+
|
|
68
|
+
As far as the law allows, the software comes as is, without any warranty or
|
|
69
|
+
condition, and the licensor will not be liable to you for any damages arising
|
|
70
|
+
out of these terms or the use or nature of the software, under any kind of
|
|
71
|
+
legal claim.
|
|
72
|
+
|
|
73
|
+
## Definitions
|
|
74
|
+
|
|
75
|
+
The **licensor** is the entity offering these terms, and the **software** is
|
|
76
|
+
the software the licensor makes available under these terms, including any
|
|
77
|
+
portion of it.
|
|
78
|
+
|
|
79
|
+
**you** refers to the individual or entity agreeing to these terms.
|
|
80
|
+
|
|
81
|
+
**your company** is any legal entity, sole proprietorship, or other kind of
|
|
82
|
+
organization that you work for, plus all organizations that have control over,
|
|
83
|
+
are under the control of, or are under common control with that organization.
|
|
84
|
+
**control** means ownership of substantially all the assets of an entity, or
|
|
85
|
+
the power to direct the management and policies of an entity (for example, by
|
|
86
|
+
voting right, contract, or otherwise). Control can be direct or indirect.
|
|
87
|
+
|
|
88
|
+
**your licenses** are all the licenses granted to you for the software under
|
|
89
|
+
these terms.
|
|
90
|
+
|
|
91
|
+
**use** means anything you do with the software requiring one of your
|
|
92
|
+
licenses.
|
|
93
|
+
|
|
94
|
+
**trademark** means trademarks, service marks, and similar rights.
|
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { C as IMPORT_RUN_STATUSES, E as ImportRunStatus, S as ErrorTrackerConfig, T as ImportRunClient, _ as DEFAULT_MAX_PATTERNS, b as ErrorSample, d as RunImportOptions, f as RunImportResult, g as streamFeed, h as StreamFeedRow, l as DEFAULT_BATCH_SIZE, m as StreamFeedOptions, n as FilePathResolver, p as runImport, t as EsClientResolver, u as ImportRunProgress, v as DEFAULT_MAX_SAMPLES_PER_PATTERN, w as ImportRun, x as ErrorTracker, y as ErrorPattern } from "./worker-DerGVTSI.mjs";
|
|
2
|
+
import { a as TransformName, c as registerImportTransform, i as TransformContext, n as RowResult, o as TransformRegistry, r as RowTransform, s as getTransformRegistry, t as RowError } from "./transform-D_uhdLeo.mjs";
|
|
3
|
+
export { DEFAULT_BATCH_SIZE, DEFAULT_MAX_PATTERNS, DEFAULT_MAX_SAMPLES_PER_PATTERN, type ErrorPattern, type ErrorSample, ErrorTracker, type ErrorTrackerConfig, type EsClientResolver, type FilePathResolver, IMPORT_RUN_STATUSES, ImportRun, type ImportRunClient, type ImportRunProgress, type ImportRunStatus, type RowError, type RowResult, type RowTransform, type RunImportOptions, type RunImportResult, type StreamFeedOptions, type StreamFeedRow, type TransformContext, type TransformName, TransformRegistry, getTransformRegistry, registerImportTransform, runImport, streamFeed };
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import{a as e,i as t,n,r,t as i}from"./transform-BUGBTotp.mjs";import{a,i as o,n as s,o as c,r as l,t as u}from"./runner-DdhiNybk.mjs";export{u as DEFAULT_BATCH_SIZE,o as DEFAULT_MAX_PATTERNS,a as DEFAULT_MAX_SAMPLES_PER_PATTERN,c as ErrorTracker,t as IMPORT_RUN_STATUSES,e as ImportRun,i as TransformRegistry,n as getTransformRegistry,r as registerImportTransform,s as runImport,l as streamFeed};
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { n as FilePathResolver, t as EsClientResolver } from "./worker-DerGVTSI.mjs";
|
|
2
|
+
import { Plugin } from "@murumets-ee/core";
|
|
3
|
+
|
|
4
|
+
//#region src/plugin.d.ts
|
|
5
|
+
interface ImportsPluginOptions {
|
|
6
|
+
/**
|
|
7
|
+
* Elasticsearch client resolver. Lazy so consumers can construct
|
|
8
|
+
* the client inside a route initialiser instead of at plugin-init
|
|
9
|
+
* time. Required — without it, the worker can't bulk-write.
|
|
10
|
+
*/
|
|
11
|
+
esClient: EsClientResolver;
|
|
12
|
+
/**
|
|
13
|
+
* ES index alias to write to. Defaults to `'parts'` (the
|
|
14
|
+
* `PARTS_INDEX_ALIAS` re-exported from `@murumets-ee/search-elasticsearch`,
|
|
15
|
+
* inlined here to avoid a runtime import for a one-line constant).
|
|
16
|
+
* Per D6, always pass an alias, never a physical index.
|
|
17
|
+
*/
|
|
18
|
+
esIndex?: string;
|
|
19
|
+
/**
|
|
20
|
+
* Resolve `import_run.filePath` to a local FS path before streaming.
|
|
21
|
+
* **Required when uploads land in remote storage** (R2/S3/etc.) —
|
|
22
|
+
* without it, the worker hands the storage key to `createReadStream`
|
|
23
|
+
* and crashes with `ENOENT`.
|
|
24
|
+
*
|
|
25
|
+
* Typical wiring downloads the storage object to a tmpfile and
|
|
26
|
+
* returns its path; the optional `cleanup` callback runs after the
|
|
27
|
+
* run finishes (success or failure).
|
|
28
|
+
*
|
|
29
|
+
* Omit this option for the original on-disk PoC setup where the
|
|
30
|
+
* upload route writes directly to a local directory and persists
|
|
31
|
+
* the absolute path on `import_run.filePath`.
|
|
32
|
+
*/
|
|
33
|
+
resolveFilePath?: FilePathResolver;
|
|
34
|
+
}
|
|
35
|
+
/**
 * Create the `@murumets-ee/imports` plugin.
 *
 * The returned plugin contributes the `ImportRun` entity on the server side.
 * During server init it looks for a registered plugin named
 * `@murumets-ee/queue`: when none is present it logs a warning and returns
 * without registering anything; otherwise it loads the queue client and
 * registers the `imports:run` job handler, wired with the supplied
 * `esClient`, the target index (`options.esIndex`, falling back to
 * `'parts'`), and `resolveFilePath` when one was provided.
 *
 * @param options Plugin configuration; only `esClient` is mandatory.
 * @returns The plugin object to place in the application's `plugins` array.
 */
declare function imports(options: ImportsPluginOptions): Plugin;
|
|
36
|
+
//#endregion
|
|
37
|
+
export { ImportsPluginOptions, imports };
|
|
38
|
+
//# sourceMappingURL=plugin.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"plugin.d.mts","names":[],"sources":["../src/plugin.ts"],"mappings":";;;;UAuCiB,oBAAA;;;;;;EAMf,QAAA,EAAU,gBAAA;;;;;;;EAOV,OAAA;;;;;;;;;;;;;;;EAeA,eAAA,GAAkB,gBAAA;AAAA;AAAA,iBAGJ,OAAA,CAAQ,OAAA,EAAS,oBAAA,GAAuB,MAAA"}
|
package/dist/plugin.mjs
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
import { a as ImportRun, n as getTransformRegistry } from "./transform-BUGBTotp.mjs";
import { createRunImportHandler, importsRunJob } from "./worker.mjs";
import { createAdminClient } from "@murumets-ee/core/clients";

/**
 * Build the imports plugin: contributes the `ImportRun` entity and, when the
 * queue plugin is enabled, registers the `imports:run` job handler.
 *
 * @param {object} options
 * @param {Function} options.esClient Lazy Elasticsearch client resolver, handed to the job handler as-is.
 * @param {string} [options.esIndex] Index alias to write to; falls back to `parts` when nullish.
 * @param {Function} [options.resolveFilePath] Forwarded to the handler only when it is not `undefined`.
 * @returns {object} Plugin descriptor (`name` plus `server.entities` / `server.init`).
 */
function imports(options) {
  const esIndex = options.esIndex ?? "parts";

  return {
    name: "@murumets-ee/imports",
    server: {
      entities: [ImportRun],
      async init(app) {
        // The queue package being installed does not mean the queue plugin is
        // active; without the plugin nothing would ever pick the job up.
        const queueEnabled = app.plugins
          .all()
          .some((plugin) => plugin.name === "@murumets-ee/queue");
        if (!queueEnabled) {
          app.logger.warn(
            "imports: queue() plugin not in plugins array — imports:run jobs will not be processed",
          );
          return;
        }

        // Loaded lazily so deployments without the queue plugin never pull
        // the queue client module in.
        const queueClient = await import("@murumets-ee/queue/client");

        const importRuns = createAdminClient(ImportRun, app);
        const transforms = getTransformRegistry();

        // Keys are added in the same order the handler has always received them.
        const handlerOptions = {
          importRuns,
          transforms,
          esClient: options.esClient,
          esIndex,
        };
        if (options.resolveFilePath !== undefined) {
          handlerOptions.resolveFilePath = options.resolveFilePath;
        }
        handlerOptions.logger = app.logger.child({ pkg: "imports" });

        queueClient.registerJob(importsRunJob, createRunImportHandler(handlerOptions));

        app.logger.info({ esIndex }, "Imports plugin initialized");
      },
    },
  };
}

export { imports };
|
|
2
|
+
//# sourceMappingURL=plugin.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"plugin.mjs","names":[],"sources":["../src/plugin.ts"],"sourcesContent":["/**\n * Imports plugin — registers the `import_run` entity and (when ES + queue\n * are wired) registers the `imports:run` job handler.\n *\n * Per PLAN-ECOMMERCE.md PR 7 (PoC scope): plumbing only. The bespoke\n * admin pages (upload form, parts search) live in PR 8a. Per-feed\n * transforms (carmaker, …) live in PR 8 and self-register against the\n * process-global transform registry — this plugin's init does NOT\n * import any transform.\n *\n * @example\n * ```ts\n * import { defineLumiConfig } from '@murumets-ee/core'\n * import { imports } from '@murumets-ee/imports/plugin'\n * import { Client } from '@elastic/elasticsearch'\n *\n * const es = new Client({ node: process.env.ES_URL })\n *\n * export default defineLumiConfig({\n * plugins: [\n * // queue() and the carmaker-transform plugin must be present too.\n * queue(),\n * imports({ esClient: () => es, esIndex: 'parts' }),\n * ],\n * })\n * ```\n */\n\nimport type { Plugin } from '@murumets-ee/core'\nimport { createAdminClient } from '@murumets-ee/core/clients'\nimport { ImportRun } from './entities/import-run.js'\nimport {\n type EsClientResolver,\n type FilePathResolver,\n createRunImportHandler,\n importsRunJob,\n} from './worker.js'\nimport { getTransformRegistry } from './transform.js'\n\nexport interface ImportsPluginOptions {\n /**\n * Elasticsearch client resolver. Lazy so consumers can construct\n * the client inside a route initialiser instead of at plugin-init\n * time. Required — without it, the worker can't bulk-write.\n */\n esClient: EsClientResolver\n /**\n * ES index alias to write to. 
Defaults to `'parts'` (the\n * `PARTS_INDEX_ALIAS` re-exported from `@murumets-ee/search-elasticsearch`,\n * inlined here to avoid a runtime import for a one-line constant).\n * Per D6 always pass an alias, never a physical index.\n */\n esIndex?: string\n /**\n * Resolve `import_run.filePath` to a local FS path before streaming.\n * **Required when uploads land in remote storage** (R2/S3/etc.) —\n * without it, the worker hands the storage key to `createReadStream`\n * and crashes with `ENOENT`.\n *\n * Typical wiring downloads the storage object to a tmpfile and\n * returns its path; the optional `cleanup` callback runs after the\n * run finishes (success or failure).\n *\n * Omit this option for the original on-disk PoC setup where the\n * upload route writes directly to a local directory and persists\n * the absolute path on `import_run.filePath`.\n */\n resolveFilePath?: FilePathResolver\n}\n\nexport function imports(options: ImportsPluginOptions): Plugin {\n const esIndex = options.esIndex ?? 'parts'\n\n return {\n name: '@murumets-ee/imports',\n server: {\n entities: [ImportRun],\n init: async (app) => {\n // Queue PACKAGE being importable doesn't mean the queue() PLUGIN\n // is in the consumer's plugins array. Without it, registerJob()\n // succeeds but no worker ever picks up the job. 
Probe and warn.\n const queuePluginPresent = app.plugins\n .all()\n .some((p) => p.name === '@murumets-ee/queue')\n if (!queuePluginPresent) {\n app.logger.warn(\n 'imports: queue() plugin not in plugins array — imports:run jobs will not be processed',\n )\n return\n }\n\n // Queue client stays dynamic-imported: we already early-returned\n // above when the queue() plugin isn't in the consumer's plugins\n // array, so static-importing here would force-load the queue\n // client module in deployments that don't enable the worker.\n const queueClientModule = await import('@murumets-ee/queue/client')\n\n const importRuns = createAdminClient(ImportRun, app)\n const transforms = getTransformRegistry()\n\n queueClientModule.registerJob(\n importsRunJob,\n createRunImportHandler({\n importRuns,\n transforms,\n esClient: options.esClient,\n esIndex,\n ...(options.resolveFilePath !== undefined && {\n resolveFilePath: options.resolveFilePath,\n }),\n logger: app.logger.child({ pkg: 'imports' }),\n }),\n )\n\n app.logger.info({ esIndex }, 'Imports plugin initialized')\n },\n },\n }\n}\n"],"mappings":"2LAsEA,SAAgB,EAAQ,EAAuC,CAC7D,IAAM,EAAU,EAAQ,SAAW,QAEnC,MAAO,CACL,KAAM,uBACN,OAAQ,CACN,SAAU,CAAC,EAAU,CACrB,KAAM,KAAO,IAAQ,CAOnB,GAAI,CAHuB,EAAI,QAC5B,KAAK,CACL,KAAM,GAAM,EAAE,OAAS,qBACH,CAAE,CACvB,EAAI,OAAO,KACT,wFACD,CACD,OAOF,IAAM,EAAoB,MAAM,OAAO,6BAEjC,EAAa,EAAkB,EAAW,EAAI,CAC9C,EAAa,GAAsB,CAEzC,EAAkB,YAChB,EACA,EAAuB,CACrB,aACA,aACA,SAAU,EAAQ,SAClB,UACA,GAAI,EAAQ,kBAAoB,IAAA,IAAa,CAC3C,gBAAiB,EAAQ,gBAC1B,CACD,OAAQ,EAAI,OAAO,MAAM,CAAE,IAAK,UAAW,CAAC,CAC7C,CAAC,CACH,CAED,EAAI,OAAO,KAAK,CAAE,UAAS,CAAE,6BAA6B,EAE7D,CACF"}
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
import { bulkUpsert } from "@murumets-ee/search-elasticsearch";
import { createReadStream } from "node:fs";
import { parse } from "csv-parse";

/** Default cap on distinct error signatures retained by {@link ErrorTracker}. */
const DEFAULT_MAX_PATTERNS = 50;
/** Default cap on sample rows retained per error signature. */
const DEFAULT_MAX_SAMPLES_PER_PATTERN = 5;

/**
 * Aggregates per-row errors into a bounded set of pattern buckets keyed by
 * `errorType:field:message`, so a feed full of identical failures collapses
 * into a handful of counted signatures.
 */
class ErrorTracker {
  patterns = new Map();
  maxPatterns;
  maxSamplesPerPattern;
  droppedSignatures = 0;

  /**
   * @param {{ maxPatterns?: number, maxSamplesPerPattern?: number }} [config]
   *   Optional caps; nullish values fall back to the module defaults.
   */
  constructor(config = {}) {
    this.maxPatterns = config.maxPatterns ?? DEFAULT_MAX_PATTERNS;
    this.maxSamplesPerPattern = config.maxSamplesPerPattern ?? DEFAULT_MAX_SAMPLES_PER_PATTERN;
  }

  /**
   * Record one error occurrence.
   *
   * @param {number} rowNumber Row the error occurred on.
   * @param {string} errorType Error category.
   * @param {string} message Error message.
   * @param {string | undefined} field Column name; nullish is bucketed as `GENERAL`.
   * @param {*} rowData Sample payload stored with the pattern while below the sample cap.
   * @returns {void}
   */
  addError(rowNumber, errorType, message, field, rowData) {
    const signature = `${errorType}:${field ?? "GENERAL"}:${message}`;

    let pattern = this.patterns.get(signature);
    if (!pattern) {
      // Map is full: count the dropped signature, keep existing buckets growing.
      if (this.patterns.size >= this.maxPatterns) {
        this.droppedSignatures += 1;
        return;
      }
      pattern = {
        errorType,
        field: field ?? null,
        message,
        count: 0,
        firstOccurrence: rowNumber,
        lastOccurrence: rowNumber,
        samples: [],
      };
      this.patterns.set(signature, pattern);
    }

    pattern.count += 1;
    pattern.lastOccurrence = rowNumber;
    if (pattern.samples.length < this.maxSamplesPerPattern) {
      pattern.samples.push({ rowNumber, rowData });
    }
  }

  /** @returns {number} Sum of `count` over every retained pattern. */
  getTotalErrorCount() {
    let total = 0;
    for (const pattern of this.patterns.values()) {
      total += pattern.count;
    }
    return total;
  }

  /** @returns {number} Number of distinct signatures currently retained. */
  getDistinctPatternCount() {
    return this.patterns.size;
  }

  /** @returns {number} Number of errors discarded because the pattern map was full. */
  getDroppedSignatureCount() {
    return this.droppedSignatures;
  }

  /**
   * @returns {Array<object>} Retained patterns sorted by descending count, each
   *   with a copied `samples` array and its `percentage` (0..100) of all
   *   retained errors (`0` when there are none).
   */
  getTopPatterns() {
    const ordered = Array.from(this.patterns.values()).sort((x, y) => y.count - x.count);
    const totalErrors = ordered.reduce((sum, pattern) => sum + pattern.count, 0);
    return ordered.map((pattern) => ({
      errorType: pattern.errorType,
      field: pattern.field,
      message: pattern.message,
      count: pattern.count,
      firstOccurrence: pattern.firstOccurrence,
      lastOccurrence: pattern.lastOccurrence,
      samples: pattern.samples.slice(),
      percentage: totalErrors > 0 ? (pattern.count / totalErrors) * 100 : 0,
    }));
  }

  /** @returns {object} Plain-object summary: totals, dropped count and the top patterns. */
  snapshot() {
    return {
      totalErrors: this.getTotalErrorCount(),
      distinctPatterns: this.getDistinctPatternCount(),
      droppedSignatures: this.droppedSignatures,
      patterns: this.getTopPatterns(),
    };
  }
}

/**
 * Stream a delimited text file as `{ rowNumber, row }` pairs.
 *
 * `rowNumber` starts at 1 for the first emitted record. `row` is always a plain
 * object: keyed by column name when columns are configured, or by string
 * position otherwise. `null`/`undefined` cells are normalised to `''`.
 *
 * @param {object} options
 * @param {string} options.filePath Path of the file to read.
 * @param {string} [options.delimiter] Field delimiter; defaults to a tab.
 * @param {boolean} [options.hasHeader=true] Take column names from the first row.
 * @param {ReadonlyArray<string>} [options.columns] Explicit column names; wins over `hasHeader`.
 * @param {boolean} [options.relaxColumnCount=false] Forwarded to csv-parse `relax_column_count`.
 * @returns {AsyncGenerator<{ rowNumber: number, row: Record<string, string> }>}
 * @throws Rejects the iteration on parser errors and on file open/read errors.
 */
async function* streamFeed(options) {
  const {
    filePath,
    delimiter = "\t",
    hasHeader = true,
    columns,
    relaxColumnCount = false,
  } = options;
  const columnConfig = columns ? Array.from(columns) : hasHeader;

  const fileStream = createReadStream(filePath);
  const parser = fileStream.pipe(
    parse({
      delimiter,
      columns: columnConfig,
      bom: true,
      skip_empty_lines: true,
      relax_column_count: relaxColumnCount,
    }),
  );
  // `.pipe()` does not forward source errors. Without this listener a missing
  // or unreadable file raises an unhandled 'error' event on the read stream
  // (crashing the process) instead of rejecting the consumer's `for await`.
  fileStream.on("error", (err) => parser.destroy(err));

  let rowNumber = 0;
  try {
    for await (const rawRow of parser) {
      rowNumber += 1;
      const row = {};
      if (Array.isArray(rawRow)) {
        for (let i = 0; i < rawRow.length; i += 1) {
          row[String(i)] = rawRow[i] ?? "";
        }
      } else {
        for (const [key, value] of Object.entries(rawRow)) {
          row[key] = value ?? "";
        }
      }
      yield { rowNumber, row };
    }
  } finally {
    // Release the file descriptor even when the consumer stops early or throws.
    fileStream.destroy();
  }
}

/** Default number of documents sent per bulk request. */
const DEFAULT_BATCH_SIZE = 1e3;

/**
 * Run one import: stream rows, apply `transform`, and bulk-write the
 * resulting documents in batches, collecting failures in an error tracker.
 *
 * @param {object} options
 * @param {string} options.importRunId Forwarded to every transform call.
 * @param {string} options.runLabel Forwarded to every transform call.
 * @param {object} options.params Forwarded to every transform call.
 * @param {Function} options.transform `(row, context)` returning (or resolving to)
 *   `{ kind: 'skip' }`, `{ kind: 'error', error }`, or a result carrying `id` and `doc`.
 * @param {object} options.feed Options passed to {@link streamFeed}.
 * @param {*} options.esClient Client handed to `bulkUpsert`.
 * @param {string} options.esIndex Index name handed to `bulkUpsert`.
 * @param {number} [options.batchSize] Rows per bulk request; defaults to {@link DEFAULT_BATCH_SIZE}.
 * @param {Function} [options.onProgress] Called with running totals after each successful batch.
 * @param {number} [options.rowLimit] Stop reading once this many rows were read.
 * @param {AbortSignal} [options.signal] Stops reading when aborted; also forwarded to `bulkUpsert`.
 * @param {ErrorTracker} [options.errorTracker] Tracker to record into; a fresh one by default.
 * @returns {Promise<object>} Final counters plus `errors` (the tracker snapshot).
 * @throws {Error} When `batchSize` is not a number >= 1, and rethrows any
 *   error thrown by `bulkUpsert` or by the feed reader.
 */
async function runImport(options) {
  const {
    importRunId,
    runLabel,
    params,
    transform,
    feed,
    esClient,
    esIndex,
    batchSize = DEFAULT_BATCH_SIZE,
    onProgress,
    rowLimit,
    signal,
    errorTracker = new ErrorTracker(),
  } = options;

  // Written as a negated `>=` so NaN is rejected too: `NaN < 1` is false, and a
  // NaN batch size would otherwise never trigger a flush until end-of-file.
  if (!(batchSize >= 1)) {
    throw new Error(`batchSize must be >= 1 (got ${batchSize})`);
  }

  const startedAt = Date.now();
  let rowsRead = 0;
  let rowsSucceeded = 0;
  let rowsFailed = 0;
  let rowsSkipped = 0;
  let batchesCompleted = 0;
  let pending = [];

  // Sends the buffered documents as one bulk request and folds the outcome
  // into the counters and the error tracker.
  const flush = async () => {
    if (pending.length === 0) {
      return;
    }
    const batch = pending;
    pending = [];

    let result;
    try {
      result = await bulkUpsert(esClient, {
        index: esIndex,
        docs: batch.map(({ id, doc }) => ({ id, doc })),
        ...(signal !== undefined && { signal }),
      });
    } catch (err) {
      // When a signal was supplied its `aborted` flag decides the category;
      // the error itself is only inspected when there is no signal.
      const looksAborted =
        signal?.aborted ??
        (err instanceof Error && (err.name === "AbortError" || /abort/i.test(err.message)));
      const errorType = looksAborted ? "aborted" : "bulk_request_failed";
      const message = err instanceof Error ? err.message : String(err);
      for (const { rowNumber } of batch) {
        errorTracker.addError(rowNumber, errorType, message, undefined, null);
      }
      rowsFailed += batch.length;
      throw err;
    }

    rowsSucceeded += result.succeeded;
    rowsFailed += result.failures.length;
    batchesCompleted += 1;

    if (result.failures.length > 0) {
      const rowNumberById = new Map(batch.map((entry) => [entry.id, entry.rowNumber]));
      for (const failure of result.failures) {
        // -1 marks a failure whose id does not map back to a row of this batch.
        const rowNumber = rowNumberById.get(failure.id) ?? -1;
        errorTracker.addError(rowNumber, failure.type, failure.reason, undefined, {
          id: failure.id,
        });
      }
    }

    if (onProgress) {
      const elapsedSeconds = (Date.now() - startedAt) / 1e3;
      onProgress({
        rowsRead,
        rowsSucceeded,
        rowsFailed,
        rowsSkipped,
        batchesCompleted,
        elapsedSeconds,
        rowsPerSecond: elapsedSeconds > 0 ? rowsRead / elapsedSeconds : 0,
        distinctErrorPatterns: errorTracker.getDistinctPatternCount(),
      });
    }
  };

  for await (const { rowNumber, row } of streamFeed(feed)) {
    if (signal?.aborted || (rowLimit !== undefined && rowsRead >= rowLimit)) {
      break;
    }
    rowsRead += 1;

    const context = { importRunId, params, runLabel, rowNumber };
    let outcome;
    try {
      outcome = await transform(row, context);
    } catch (err) {
      // A throwing transform fails only its own row; the run continues.
      const message = err instanceof Error ? err.message : String(err);
      errorTracker.addError(rowNumber, "transform_threw", message, undefined, row);
      rowsFailed += 1;
      continue;
    }

    if (outcome.kind === "skip") {
      rowsSkipped += 1;
      continue;
    }
    if (outcome.kind === "error") {
      errorTracker.addError(
        rowNumber,
        outcome.error.errorType,
        outcome.error.message,
        outcome.error.field,
        row,
      );
      rowsFailed += 1;
      continue;
    }

    pending.push({ id: outcome.id, doc: outcome.doc, rowNumber });
    if (pending.length >= batchSize) {
      await flush();
    }
  }

  // Write whatever is left in the final, partially filled batch.
  await flush();

  return {
    rowsRead,
    rowsSucceeded,
    rowsFailed,
    rowsSkipped,
    batchesCompleted,
    errors: errorTracker.snapshot(),
  };
}

export {
  DEFAULT_MAX_SAMPLES_PER_PATTERN as a,
  DEFAULT_MAX_PATTERNS as i,
  runImport as n,
  ErrorTracker as o,
  streamFeed as r,
  DEFAULT_BATCH_SIZE as t,
};
|
|
2
|
+
//# sourceMappingURL=runner-DdhiNybk.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner-DdhiNybk.mjs","names":[],"sources":["../src/error-tracker.ts","../src/streaming.ts","../src/runner.ts"],"sourcesContent":["/**\n * Aggregates per-row errors into top-N pattern buckets so a feed of 1M\n * malformed rows surfaces as a handful of actionable signatures rather\n * than a million identical strings.\n *\n * Lifted from giga-test (`backend/src/workers/csv-importer.ts`) and\n * generalized:\n * - Configurable caps so the importer can tune memory bounds per run.\n * - Pure data — no logging side effects, no I/O.\n * - JSON-serialisable output via {@link ErrorTracker.snapshot} for the\n * `import_run.errorSummary` column.\n *\n * Pattern signature shape: `${errorType}:${field || 'GENERAL'}:${message}`.\n * Same `errorType + field + message` collapses to one bucket; differing\n * messages stay separate. This is intentional: a parser error on column\n * `NetPrice/Discount` (\"invalid number 'NA,5'\") and the same on column\n * `GrossPrice` are operationally distinct even if the parser is the same.\n *\n * Memory bounds: the patterns map is capped at `maxPatterns`. Once full,\n * additional NEW signatures are dropped — known patterns keep accumulating\n * counts. This is the \"top-N most common\" model: rare-but-novel errors\n * past the cap are invisible, but the cap protects against a runaway\n * adversarial feed exploding the map. Sample arrays are independently\n * capped at `maxSamplesPerPattern`.\n */\n\n/**\n * Recursively-defined JSON-serialisable value. Mirrors the shape of\n * `JsonValue` in `@murumets-ee/entity` without taking a dependency on\n * that package — error-tracker is otherwise standalone, and the\n * `ImportRun.errorSummary` JSONB column accepts anything in this shape.\n */\nexport type ImportJsonValue =\n | string\n | number\n | boolean\n | null\n | ImportJsonValue[]\n | { [key: string]: ImportJsonValue }\n\n/** One sample row attached to a pattern. `rowData` is the originally-parsed row. 
*/\nexport interface ErrorSample {\n rowNumber: number\n rowData: ImportJsonValue\n}\n\n/** Public shape of an aggregated pattern as returned by {@link ErrorTracker.getTopPatterns}. */\nexport interface ErrorPattern {\n errorType: string\n field: string | null\n message: string\n count: number\n firstOccurrence: number\n lastOccurrence: number\n samples: ReadonlyArray<ErrorSample>\n /** `count / totalErrors`, scaled 0..100. `0` when there are zero errors total. */\n percentage: number\n}\n\nexport interface ErrorTrackerConfig {\n /** Hard cap on distinct signatures. Defaults to 50. New signatures past the cap are dropped. */\n maxPatterns?: number\n /** Hard cap on samples retained per pattern. Defaults to 5. Excess samples are dropped. */\n maxSamplesPerPattern?: number\n}\n\n/** Default caps — match giga-test for compatibility. */\nexport const DEFAULT_MAX_PATTERNS = 50\nexport const DEFAULT_MAX_SAMPLES_PER_PATTERN = 5\n\ninterface InternalPattern {\n errorType: string\n field: string | null\n message: string\n count: number\n firstOccurrence: number\n lastOccurrence: number\n samples: ErrorSample[]\n}\n\nexport class ErrorTracker {\n private readonly patterns = new Map<string, InternalPattern>()\n private readonly maxPatterns: number\n private readonly maxSamplesPerPattern: number\n private droppedSignatures = 0\n\n constructor(config: ErrorTrackerConfig = {}) {\n this.maxPatterns = config.maxPatterns ?? DEFAULT_MAX_PATTERNS\n this.maxSamplesPerPattern = config.maxSamplesPerPattern ?? DEFAULT_MAX_SAMPLES_PER_PATTERN\n }\n\n /**\n * Record one error. Same `(errorType, field, message)` triple bumps the\n * existing bucket; a new triple opens a new one (subject to {@link maxPatterns}).\n *\n * `field` is optional — pass `undefined` for errors not tied to a single\n * column (e.g. parse errors at row level). 
Internally normalised to the\n * literal string `'GENERAL'` so it shares a bucket with other genericised\n * errors of the same type+message.\n */\n addError(\n rowNumber: number,\n errorType: string,\n message: string,\n field: string | undefined,\n rowData: ImportJsonValue,\n ): void {\n const fieldKey = field ?? 'GENERAL'\n const signature = `${errorType}:${fieldKey}:${message}`\n\n let pattern = this.patterns.get(signature)\n if (!pattern) {\n if (this.patterns.size >= this.maxPatterns) {\n this.droppedSignatures += 1\n return\n }\n pattern = {\n errorType,\n field: field ?? null,\n message,\n count: 0,\n firstOccurrence: rowNumber,\n lastOccurrence: rowNumber,\n samples: [],\n }\n this.patterns.set(signature, pattern)\n }\n\n pattern.count += 1\n pattern.lastOccurrence = rowNumber\n if (pattern.samples.length < this.maxSamplesPerPattern) {\n pattern.samples.push({ rowNumber, rowData })\n }\n }\n\n /** Total count across every pattern. Counts errors, not patterns. */\n getTotalErrorCount(): number {\n let sum = 0\n for (const p of this.patterns.values()) sum += p.count\n return sum\n }\n\n /** Number of distinct signatures retained in the map (≤ `maxPatterns`). */\n getDistinctPatternCount(): number {\n return this.patterns.size\n }\n\n /**\n * Number of NEW signatures dropped because the map was already at\n * capacity. 
Surfacing this in the import_run summary tells the operator\n * \"the top-N was saturated — there's a long tail you're not seeing\".\n */\n getDroppedSignatureCount(): number {\n return this.droppedSignatures\n }\n\n /**\n * Top patterns sorted by descending count, capped at `maxPatterns`.\n * Stable secondary order is insertion order (Map iteration order is\n * insertion order; Array.sort is stable in V8).\n */\n getTopPatterns(): ErrorPattern[] {\n const all = Array.from(this.patterns.values()).sort((a, b) => b.count - a.count)\n const totalErrors = all.reduce((sum, p) => sum + p.count, 0)\n return all.map((p) => ({\n errorType: p.errorType,\n field: p.field,\n message: p.message,\n count: p.count,\n firstOccurrence: p.firstOccurrence,\n lastOccurrence: p.lastOccurrence,\n samples: p.samples.slice(),\n percentage: totalErrors > 0 ? (p.count / totalErrors) * 100 : 0,\n }))\n }\n\n /**\n * Compact JSON-serialisable snapshot for `import_run.errorSummary`.\n * Aside from the patterns array, includes the totals so a reader of\n * just this column doesn't have to re-derive them.\n */\n snapshot(): ErrorTrackerSnapshot {\n return {\n totalErrors: this.getTotalErrorCount(),\n distinctPatterns: this.getDistinctPatternCount(),\n droppedSignatures: this.droppedSignatures,\n patterns: this.getTopPatterns(),\n }\n }\n}\n\nexport interface ErrorTrackerSnapshot {\n totalErrors: number\n distinctPatterns: number\n droppedSignatures: number\n patterns: ErrorPattern[]\n}\n","/**\n * Tab-delimited / CSV streaming reader. 
Generic over delimiter so the\n * same path handles `.txt` (tab), `.csv` (comma), and the rare `;`\n * European export dialect.\n *\n * Built on `csv-parse` per giga-test precedent — node-stream-based, low\n * memory, handles UTF-8 BOM (the carmaker feeds are Windows-exported\n * and ship with a BOM that breaks naive split-on-tab parsers).\n *\n * The reader yields `{ rowNumber, row }` pairs where:\n * - `rowNumber` is 1-based and counts the header as row 0.\n * - `row` is `Record<string, string>` keyed by header name. Empty cells\n * are the empty string, NOT `undefined` — feed transforms test with\n * `value === ''` consistently.\n *\n * Why this lives in a streaming reader and not inside the transform\n * itself: the transform sees one already-parsed row at a time, never\n * the file. That keeps transform implementations free of I/O concerns\n * and makes them trivially unit-testable with a fixture row map.\n */\n\nimport { createReadStream } from 'node:fs'\nimport { parse } from 'csv-parse'\n\nexport interface StreamFeedOptions {\n /** Path to the file on disk. The PoC uploads land on local disk; S3-keyed reads come later. */\n filePath: string\n /**\n * Single-character field delimiter. Default `\\t` (the carmaker feed\n * format). Pass `,` for CSV, `;` for some European dialects.\n */\n delimiter?: string\n /**\n * `true` (default): the first row is the header and column names come\n * from it. `false`: rows are emitted as positional `{ \"0\": ..., \"1\": ... }`\n * and the transform reads by index — useful for headerless feeds that\n * commit to a documented column order.\n */\n hasHeader?: boolean\n /**\n * Optional explicit column-name list. When provided, takes precedence\n * over `hasHeader` (header row, if present, is skipped but its values\n * are ignored). Useful when the upstream header is unstable but the\n * positional shape isn't.\n */\n columns?: ReadonlyArray<string>\n /**\n * Forward to `csv-parse` `relax_column_count`. 
Default `false` —\n * a row whose column count doesn't match the header surfaces as a\n * parser error so the transform isn't silently fed truncated data.\n */\n relaxColumnCount?: boolean\n}\n\nexport interface StreamFeedRow {\n /** 1-based row number. Header (when present) is row 0; first data row is row 1. */\n rowNumber: number\n /**\n * Cell values keyed by column name (or string-position when\n * `hasHeader: false` AND no `columns`).\n *\n * **Cell-value invariants:**\n * - Empty cells (`A\\t\\tC`) → `''` (empty string).\n * - Missing TRAILING cells in `relaxColumnCount: true` mode → the\n * key is **absent** from the object, not present-with-`''`. csv-parse\n * does not emit keys for short rows. Transforms reading those\n * columns get `undefined` from `row['col']` and must handle it\n * (`row['col'] ?? ''` is the canonical idiom).\n * - With the default `relaxColumnCount: false`, short rows reject at\n * the parser, so this case never reaches the transform.\n */\n row: Record<string, string>\n}\n\n/**\n * Async-iterable over the parsed rows of a delimited file. Use with\n * `for await (const { rowNumber, row } of streamFeed({ filePath, ... }))`.\n *\n * The iterator owns its file descriptor — the `for await` loop closes\n * the underlying stream when it returns or breaks. Aborting mid-stream\n * (`break`, `throw`, signal) is safe; csv-parse propagates the close.\n */\nexport async function* streamFeed(options: StreamFeedOptions): AsyncIterable<StreamFeedRow> {\n const { filePath, delimiter = '\\t', hasHeader = true, columns, relaxColumnCount = false } = options\n\n // When the caller provides explicit `columns`, prefer them. When the\n // file has a header but no explicit `columns`, csv-parse takes the\n // first row as the column source. 
When neither is true, rows are\n // emitted with string-position keys.\n //\n // Typed against csv-parse's actual `columns?: ColumnOption[] | boolean`\n // signature — `string[]` satisfies `ColumnOption[]` since\n // `ColumnOption = string | undefined | null | false | { name: string }`.\n const columnConfig: string[] | boolean = columns ? Array.from(columns) : hasHeader\n\n const stream = createReadStream(filePath)\n const parser = stream.pipe(\n parse({\n delimiter,\n columns: columnConfig,\n bom: true,\n skip_empty_lines: true,\n // csv-parse defaults to strict column count; opt-in relaxation only.\n relax_column_count: relaxColumnCount,\n }),\n )\n\n let rowNumber = 0\n try {\n for await (const rawRow of parser as AsyncIterable<\n ReadonlyArray<string> | Record<string, string | undefined>\n >) {\n rowNumber += 1\n // csv-parse emits records with string keys when `columns` is\n // truthy, otherwise an Array. Normalise both shapes to\n // `Record<string, string>` so transforms can rely on `value === ''`\n // for missing cells (csv-parse leaves trailing missing cells as\n // `undefined` when `relax_column_count: true`; this collapses\n // them to `''` to keep the contract uniform).\n const row: Record<string, string> = {}\n if (Array.isArray(rawRow)) {\n const arr = rawRow as ReadonlyArray<string | undefined>\n for (let i = 0; i < arr.length; i += 1) {\n row[String(i)] = arr[i] ?? ''\n }\n } else {\n for (const [k, v] of Object.entries(rawRow)) {\n row[k] = v ?? ''\n }\n }\n yield { rowNumber, row }\n }\n } finally {\n // Safety: ensure the underlying file descriptor closes even if the\n // consumer breaks mid-iteration. 
Node closes streams on\n // garbage-collection but this makes it deterministic under tests.\n stream.destroy()\n }\n}\n","/**\n * One-shot importer: stream rows → transform → batched bulk-write to ES,\n * accumulating per-row errors into `ErrorTracker` and reporting progress\n * to the queue every batch.\n *\n * Per PLAN-ECOMMERCE.md PR 7 (PoC scope):\n * - **Batch size 1000.** Matches giga-test precedent. Configurable for\n * integration tests that don't want a 1k row floor.\n * - **No resumability and no automatic retries.** The `imports:run` queue\n * job is registered with `defaultRetries: 0` (see `worker.ts`) so a\n * failed handler does NOT re-enqueue itself — re-running a multi-batch\n * import against the same `import_run.id` while the previous attempt\n * may still be writing is a footgun (duplicate batches, double-counted\n * progress). Operator retries by creating a NEW `import_run` row.\n * - **No per-supplier transform plugin.** The runner takes a single\n * `RowTransform<TDoc>` from the registry and applies it to every row;\n * PR 8 may diverge but only by registering a different transform name.\n * - **Direct `bulkUpsert` into the live aliased index** (D6 alias is\n * set up by PR 4's `ensureAliasedIndex`; the importer doesn't reindex).\n *\n * Per D21 (sanctioned bulk path): this runner intentionally bypasses\n * AdminClient and entity hooks. Per-batch audit / observability lives\n * on the surrounding `import_run` row + queue progress, NOT per-row.\n */\n\nimport { bulkUpsert, type BulkIndexResult, type EsClientLike } from '@murumets-ee/search-elasticsearch'\nimport { ErrorTracker } from './error-tracker.js'\nimport { streamFeed, type StreamFeedOptions } from './streaming.js'\nimport type { RowTransform, TransformContext } from './transform.js'\n\n/** Soft default; chosen to match giga-test. ES bulk requests over ~5MB get split server-side anyway. 
*/\nexport const DEFAULT_BATCH_SIZE = 1000\n\nexport interface RunImportOptions<TDoc> {\n /** UUID of the `import_run` row driving this run. Forwarded to every transform invocation. */\n importRunId: string\n /** Operator-supplied label for the run. Forwarded to the transform context. */\n runLabel: string\n /** Opaque per-run params copied from `import_run.params`. */\n params: Record<string, unknown>\n /** Transform applied to every parsed row. */\n transform: RowTransform<TDoc>\n /** Streaming reader options — file path, delimiter, header config. */\n feed: StreamFeedOptions\n /** ES client (low-level shape from `@murumets-ee/search-elasticsearch`). */\n esClient: EsClientLike\n /** Index alias to write to. Per D6, callers always pass an alias, never a physical index. */\n esIndex: string\n /** Rows per `bulkUpsert` call. Default {@link DEFAULT_BATCH_SIZE}. */\n batchSize?: number\n /**\n * Callback invoked after every batch. The handler in `worker.ts`\n * forwards this to `ctx.updateProgress` for the queue UI; tests\n * inspect it directly. Synchronous + cheap so a slow callback can't\n * back-pressure the importer.\n */\n onProgress?: (progress: ImportRunProgress) => void\n /** Optional: stop processing after this many rows. Tests use it; production passes `undefined`. */\n rowLimit?: number\n /** Abort signal threaded into the underlying ES client request — cooperative cancel. */\n signal?: AbortSignal\n /** Optional ErrorTracker config (caps). Default: top-50 patterns × 5 samples. */\n errorTracker?: ErrorTracker\n}\n\n/**\n * Progress payload written to `toolkit_jobs.progress` after every batch.\n * Caps + flush rules live on the queue's `updateProgress` debounce —\n * callers don't need to throttle.\n */\nexport interface ImportRunProgress {\n rowsRead: number\n rowsSucceeded: number\n rowsFailed: number\n rowsSkipped: number\n batchesCompleted: number\n /** Wall-clock seconds since the runner started. 
*/\n elapsedSeconds: number\n /** Rows / second, computed at every batch. */\n rowsPerSecond: number\n /** Distinct error patterns currently held by the tracker. Saturates at the cap. */\n distinctErrorPatterns: number\n}\n\n/**\n * Final result returned by {@link runImport}. The handler writes these\n * onto the `import_run` row alongside the ErrorTracker snapshot.\n */\nexport interface RunImportResult {\n /** Total rows read from the file (excludes skipped empty lines). */\n rowsRead: number\n /** Rows the transform turned into a successful doc AND the ES cluster acknowledged. */\n rowsSucceeded: number\n /**\n * Rows that the transform rejected (`{ kind: 'error' }`) OR that ES\n * rejected on bulk-write (per-doc failure). Both are aggregated by\n * `errorTracker` for the import_run summary.\n */\n rowsFailed: number\n /** Rows that the transform skipped (`{ kind: 'skip' }`) — header noise, blank lines, intentional drop. */\n rowsSkipped: number\n /** Number of `bulkUpsert` calls made. */\n batchesCompleted: number\n /** Final value of {@link ErrorTracker.snapshot}. */\n errors: ReturnType<ErrorTracker['snapshot']>\n}\n\n/**\n * Apply the runner against a feed file. Stops on rowLimit OR end-of-file\n * OR if `signal` aborts. 
Throws if the streaming reader / ES client\n * throws — caller (the queue handler) catches that and writes\n * `import_run.status = 'failed'` with the error message in\n * `errorSummary.fatal`.\n */\nexport async function runImport<TDoc>(options: RunImportOptions<TDoc>): Promise<RunImportResult> {\n const {\n importRunId,\n runLabel,\n params,\n transform,\n feed,\n esClient,\n esIndex,\n batchSize = DEFAULT_BATCH_SIZE,\n onProgress,\n rowLimit,\n signal,\n errorTracker = new ErrorTracker(),\n } = options\n\n if (batchSize < 1) {\n throw new Error(`batchSize must be >= 1 (got ${batchSize})`)\n }\n\n const startedAt = Date.now()\n let rowsRead = 0\n let rowsSucceeded = 0\n let rowsFailed = 0\n let rowsSkipped = 0\n let batchesCompleted = 0\n\n let pending: Array<{ id: string; doc: TDoc; rowNumber: number }> = []\n\n const flush = async (): Promise<void> => {\n if (pending.length === 0) return\n const batch = pending\n pending = []\n let result: BulkIndexResult\n try {\n result = await bulkUpsert<TDoc>(esClient, {\n index: esIndex,\n docs: batch.map(({ id, doc }) => ({ id, doc })),\n ...(signal !== undefined && { signal }),\n })\n } catch (err) {\n // Cluster- or transport-level failure — the whole batch is\n // unaccounted for. Distinguish abort (operator-driven cancel) from\n // a real cluster failure so the errorSummary doesn't mislabel a\n // cancelled run as broken cluster connectivity.\n const isAbort =\n signal?.aborted ??\n (err instanceof Error && (err.name === 'AbortError' || /abort/i.test(err.message)))\n const errorType = isAbort ? 'aborted' : 'bulk_request_failed'\n const reason = err instanceof Error ? 
err.message : String(err)\n for (const { rowNumber } of batch) {\n errorTracker.addError(rowNumber, errorType, reason, undefined, null)\n }\n rowsFailed += batch.length\n throw err\n }\n\n rowsSucceeded += result.succeeded\n rowsFailed += result.failures.length\n batchesCompleted += 1\n\n if (result.failures.length > 0) {\n // Map each ES failure back to its source row via `id`. The bulk\n // response order matches the request order, but ES doesn't promise\n // that; matching by `id` is the safe path. PoC volume is small\n // enough that the O(failures × batch) cost is irrelevant.\n const byId = new Map(batch.map((b) => [b.id, b.rowNumber]))\n for (const fail of result.failures) {\n const rowNumber = byId.get(fail.id) ?? -1\n errorTracker.addError(rowNumber, fail.type, fail.reason, undefined, { id: fail.id })\n }\n }\n\n if (onProgress) {\n const elapsedSeconds = (Date.now() - startedAt) / 1000\n onProgress({\n rowsRead,\n rowsSucceeded,\n rowsFailed,\n rowsSkipped,\n batchesCompleted,\n elapsedSeconds,\n rowsPerSecond: elapsedSeconds > 0 ? rowsRead / elapsedSeconds : 0,\n distinctErrorPatterns: errorTracker.getDistinctPatternCount(),\n })\n }\n }\n\n for await (const { rowNumber, row } of streamFeed(feed)) {\n if (signal?.aborted) break\n if (rowLimit !== undefined && rowsRead >= rowLimit) break\n rowsRead += 1\n\n const ctx: TransformContext = { importRunId, params, runLabel, rowNumber }\n let result: Awaited<ReturnType<typeof transform>>\n try {\n result = await transform(row, ctx)\n } catch (err) {\n // A throw from the transform is a programmer error — surface it as\n // a row-level error so the run can continue. (If the bug is\n // catastrophic, the operator sees the same message repeated and\n // can stop the run.)\n const reason = err instanceof Error ? 
err.message : String(err)\n errorTracker.addError(rowNumber, 'transform_threw', reason, undefined, row)\n rowsFailed += 1\n continue\n }\n\n if (result.kind === 'skip') {\n rowsSkipped += 1\n continue\n }\n if (result.kind === 'error') {\n errorTracker.addError(\n rowNumber,\n result.error.errorType,\n result.error.message,\n result.error.field,\n row,\n )\n rowsFailed += 1\n continue\n }\n\n pending.push({ id: result.id, doc: result.doc, rowNumber })\n if (pending.length >= batchSize) {\n await flush()\n }\n }\n\n await flush()\n\n return {\n rowsRead,\n rowsSucceeded,\n rowsFailed,\n rowsSkipped,\n batchesCompleted,\n errors: errorTracker.snapshot(),\n }\n}\n"],"mappings":"4IAmEA,MAAa,EAAuB,GACvB,EAAkC,EAY/C,IAAa,EAAb,KAA0B,CACxB,SAA4B,IAAI,IAChC,YACA,qBACA,kBAA4B,EAE5B,YAAY,EAA6B,EAAE,CAAE,CAC3C,KAAK,YAAc,EAAO,aAAA,GAC1B,KAAK,qBAAuB,EAAO,sBAAA,EAYrC,SACE,EACA,EACA,EACA,EACA,EACM,CAEN,IAAM,EAAY,GAAG,EAAU,GADd,GAAS,UACiB,GAAG,IAE1C,EAAU,KAAK,SAAS,IAAI,EAAU,CAC1C,GAAI,CAAC,EAAS,CACZ,GAAI,KAAK,SAAS,MAAQ,KAAK,YAAa,CAC1C,KAAK,mBAAqB,EAC1B,OAEF,EAAU,CACR,YACA,MAAO,GAAS,KAChB,UACA,MAAO,EACP,gBAAiB,EACjB,eAAgB,EAChB,QAAS,EAAE,CACZ,CACD,KAAK,SAAS,IAAI,EAAW,EAAQ,CAGvC,EAAQ,OAAS,EACjB,EAAQ,eAAiB,EACrB,EAAQ,QAAQ,OAAS,KAAK,sBAChC,EAAQ,QAAQ,KAAK,CAAE,YAAW,UAAS,CAAC,CAKhD,oBAA6B,CAC3B,IAAI,EAAM,EACV,IAAK,IAAM,KAAK,KAAK,SAAS,QAAQ,CAAE,GAAO,EAAE,MACjD,OAAO,EAIT,yBAAkC,CAChC,OAAO,KAAK,SAAS,KAQvB,0BAAmC,CACjC,OAAO,KAAK,kBAQd,gBAAiC,CAC/B,IAAM,EAAM,MAAM,KAAK,KAAK,SAAS,QAAQ,CAAC,CAAC,MAAM,EAAG,IAAM,EAAE,MAAQ,EAAE,MAAM,CAC1E,EAAc,EAAI,QAAQ,EAAK,IAAM,EAAM,EAAE,MAAO,EAAE,CAC5D,OAAO,EAAI,IAAK,IAAO,CACrB,UAAW,EAAE,UACb,MAAO,EAAE,MACT,QAAS,EAAE,QACX,MAAO,EAAE,MACT,gBAAiB,EAAE,gBACnB,eAAgB,EAAE,eAClB,QAAS,EAAE,QAAQ,OAAO,CAC1B,WAAY,EAAc,EAAK,EAAE,MAAQ,EAAe,IAAM,EAC/D,EAAE,CAQL,UAAiC,CAC/B,MAAO,CACL,YAAa,KAAK,oBAAoB,CACtC,iBAAkB,KAAK,yBAAyB,CAChD,kBAAmB,KAAK,kBACxB,SAAU,KAAK,gBAAgB,CAChC,GCzGL,eAAuB,EAAW,EAA0D,CAC1F,GAAM,CAAE,WAAU,YAAY,IAAM,YAAY,GAAM,UAAS,mBAAmB,IAAU,EAUtF,EAAmC,EAAU,MAAM,KAAK
,EAAQ,CAAG,EAEnE,EAAS,EAAiB,EAAS,CACnC,EAAS,EAAO,KACpB,EAAM,CACJ,YACA,QAAS,EACT,IAAK,GACL,iBAAkB,GAElB,mBAAoB,EACrB,CAAC,CACH,CAEG,EAAY,EAChB,GAAI,CACF,UAAW,IAAM,KAAU,EAExB,CACD,GAAa,EAOb,IAAM,EAA8B,EAAE,CACtC,GAAI,MAAM,QAAQ,EAAO,CAAE,CACzB,IAAM,EAAM,EACZ,IAAK,IAAI,EAAI,EAAG,EAAI,EAAI,OAAQ,GAAK,EACnC,EAAI,OAAO,EAAE,EAAI,EAAI,IAAM,QAG7B,IAAK,GAAM,CAAC,EAAG,KAAM,OAAO,QAAQ,EAAO,CACzC,EAAI,GAAK,GAAK,GAGlB,KAAM,CAAE,YAAW,MAAK,SAElB,CAIR,EAAO,SAAS,ECzGpB,MAAa,EAAqB,IAmFlC,eAAsB,EAAgB,EAA2D,CAC/F,GAAM,CACJ,cACA,WACA,SACA,YACA,OACA,WACA,UACA,YAAY,EACZ,aACA,WACA,SACA,eAAe,IAAI,GACjB,EAEJ,GAAI,EAAY,EACd,MAAU,MAAM,+BAA+B,EAAU,GAAG,CAG9D,IAAM,EAAY,KAAK,KAAK,CACxB,EAAW,EACX,EAAgB,EAChB,EAAa,EACb,EAAc,EACd,EAAmB,EAEnB,EAA+D,EAAE,CAE/D,EAAQ,SAA2B,CACvC,GAAI,EAAQ,SAAW,EAAG,OAC1B,IAAM,EAAQ,EACd,EAAU,EAAE,CACZ,IAAI,EACJ,GAAI,CACF,EAAS,MAAM,EAAiB,EAAU,CACxC,MAAO,EACP,KAAM,EAAM,KAAK,CAAE,KAAI,UAAW,CAAE,KAAI,MAAK,EAAE,CAC/C,GAAI,IAAW,IAAA,IAAa,CAAE,SAAQ,CACvC,CAAC,OACK,EAAK,CAQZ,IAAM,EAFJ,GAAQ,UACP,aAAe,QAAU,EAAI,OAAS,cAAgB,SAAS,KAAK,EAAI,QAAQ,GACvD,UAAY,sBAClC,EAAS,aAAe,MAAQ,EAAI,QAAU,OAAO,EAAI,CAC/D,IAAK,GAAM,CAAE,eAAe,EAC1B,EAAa,SAAS,EAAW,EAAW,EAAQ,IAAA,GAAW,KAAK,CAGtE,KADA,IAAc,EAAM,OACd,EAOR,GAJA,GAAiB,EAAO,UACxB,GAAc,EAAO,SAAS,OAC9B,GAAoB,EAEhB,EAAO,SAAS,OAAS,EAAG,CAK9B,IAAM,EAAO,IAAI,IAAI,EAAM,IAAK,GAAM,CAAC,EAAE,GAAI,EAAE,UAAU,CAAC,CAAC,CAC3D,IAAK,IAAM,KAAQ,EAAO,SAAU,CAClC,IAAM,EAAY,EAAK,IAAI,EAAK,GAAG,EAAI,GACvC,EAAa,SAAS,EAAW,EAAK,KAAM,EAAK,OAAQ,IAAA,GAAW,CAAE,GAAI,EAAK,GAAI,CAAC,EAIxF,GAAI,EAAY,CACd,IAAM,GAAkB,KAAK,KAAK,CAAG,GAAa,IAClD,EAAW,CACT,WACA,gBACA,aACA,cACA,mBACA,iBACA,cAAe,EAAiB,EAAI,EAAW,EAAiB,EAChE,sBAAuB,EAAa,yBAAyB,CAC9D,CAAC,GAIN,UAAW,GAAM,CAAE,YAAW,SAAS,EAAW,EAAK,CAAE,CAEvD,GADI,GAAQ,SACR,IAAa,IAAA,IAAa,GAAY,EAAU,MACpD,GAAY,EAEZ,IAAM,EAAwB,CAAE,cAAa,SAAQ,WAAU,YAAW,CACtE,EACJ,GAAI,CACF,EAAS,MAAM,EAAU,EAAK,EAAI,OAC3B,EAAK,CAKZ,IAAM,EAAS,aAAe,MAAQ,EAAI,QAAU,OAAO,EAAI,CAC/D,EAAa,SAAS,EAAW,kBAAmB,EAAQ,IAAA,GAAW,EAAI,CAC3E,GAAc,EACd,SAGF,GAAI,EAAO,OAAS,OAAQ,CAC1B,GA
Ae,EACf,SAEF,GAAI,EAAO,OAAS,QAAS,CAC3B,EAAa,SACX,EACA,EAAO,MAAM,UACb,EAAO,MAAM,QACb,EAAO,MAAM,MACb,EACD,CACD,GAAc,EACd,SAGF,EAAQ,KAAK,CAAE,GAAI,EAAO,GAAI,IAAK,EAAO,IAAK,YAAW,CAAC,CACvD,EAAQ,QAAU,GACpB,MAAM,GAAO,CAMjB,OAFA,MAAM,GAAO,CAEN,CACL,WACA,gBACA,aACA,cACA,mBACA,OAAQ,EAAa,UAAU,CAChC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { n as FilePathResolver } from "./worker-DerGVTSI.mjs";
|
|
2
|
+
|
|
3
|
+
//#region src/storage-resolver.d.ts
|
|
4
|
+
/**
|
|
5
|
+
* Read the storage object at `key` into a tmpfile and return the
|
|
6
|
+
* path. Cleans up via `fs.unlink` after the run finishes — best-
|
|
7
|
+
* effort, swallows `ENOENT` (the file may already be gone if the
|
|
8
|
+
* worker crashed and the OS cleaned `/tmp`).
|
|
9
|
+
*
|
|
10
|
+
* Lazy-imports `@murumets-ee/storage` + `@murumets-ee/core` so
|
|
11
|
+
* deployments that don't ingest from remote storage never load the
|
|
12
|
+
* R2 client.
|
|
13
|
+
*/
|
|
14
|
+
declare const storageResolveFilePath: FilePathResolver;
|
|
15
|
+
//#endregion
|
|
16
|
+
export { storageResolveFilePath };
|
|
17
|
+
//# sourceMappingURL=storage-resolver.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"storage-resolver.d.mts","names":[],"sources":["../src/storage-resolver.ts"],"mappings":";;;;;;;;;;;;;cA6Ca,sBAAA,EAAwB,gBAAA"}
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
import { promises as fs } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

/**
 * Download the storage object stored under `storageKey` into a temporary
 * file and hand back its path together with a cleanup callback.
 *
 * The storage/core packages are imported lazily, inside the call, so that
 * merely loading this module does not load them.
 *
 * @param {string} storageKey - Key of the object to download.
 * @returns {Promise<{ localPath: string, cleanup: () => Promise<void> }>}
 *   `localPath` is the tmpfile that was written; `cleanup` removes it and
 *   tolerates the file already being gone.
 * @throws Whatever the dynamic imports, the download, or `fs.writeFile`
 *   reject with. `cleanup` rethrows any unlink error other than `ENOENT`.
 */
const storageResolveFilePath = async (storageKey) => {
  const { createStorageClient } = await import(`@murumets-ee/storage`);
  const { getStorageConfig } = await import(`@murumets-ee/storage/plugin`);
  const { getApp } = await import(`@murumets-ee/core`);

  const storage = createStorageClient(getStorageConfig(), { app: getApp() });
  const { body } = await storage.download(storageKey);

  // The body is either already a Buffer or something `Response` can consume;
  // normalise to a Buffer so a single `writeFile` call is enough.
  const buffer = Buffer.isBuffer(body)
    ? body
    : Buffer.from(await new Response(body).arrayBuffer());

  // Use the last key segment as a readable hint in the tmpfile name.
  // `split` always yields at least one element, so `pop()` is never
  // `undefined` — a `??` fallback could never fire. `||` makes the `feed`
  // fallback apply when the last segment is empty (e.g. a key ending in `/`).
  const suffix = storageKey.split(`/`).pop() || `feed`;
  const localPath = join(tmpdir(), `imports-${crypto.randomUUID()}-${suffix}`);
  await fs.writeFile(localPath, buffer);

  return {
    localPath,
    cleanup: async () => {
      try {
        await fs.unlink(localPath);
      } catch (err) {
        // A missing file means there is nothing left to clean up; every
        // other failure is surfaced to the caller.
        if (err.code !== `ENOENT`) throw err;
      }
    },
  };
};

export { storageResolveFilePath };
|
|
2
|
+
//# sourceMappingURL=storage-resolver.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"storage-resolver.mjs","names":["fs"],"sources":["../src/storage-resolver.ts"],"sourcesContent":["/**\n * Storage-backed `resolveFilePath` for the imports plugin. Downloads\n * an object out of `@murumets-ee/storage` to a tmpfile so the queue\n * worker can stream it via `node:fs.createReadStream`.\n *\n * Wire into the imports plugin like this:\n *\n * ```ts\n * import { imports } from '@murumets-ee/imports/plugin'\n * import { storageResolveFilePath } from '@murumets-ee/imports/storage-resolver'\n *\n * imports({\n * esClient: () => esClient,\n * resolveFilePath: storageResolveFilePath,\n * })\n * ```\n *\n * Why this lives here rather than in the route or in the storage\n * package:\n * - The route shouldn't know about the consumer's runner; it just\n * persists a key on `import_run.filePath`.\n * - The storage package is generic (no opinion on imports).\n * - The imports worker is the natural download point: it owns the\n * run's lifecycle and can guarantee tmpfile cleanup.\n *\n * The helper is in a separate subpath so consumers that use a\n * different upload sink (local disk, S3 with a custom adapter, etc.)\n * don't pull in `@murumets-ee/storage` transitively.\n */\n\nimport { promises as fs } from 'node:fs'\nimport { tmpdir } from 'node:os'\nimport { join } from 'node:path'\nimport type { FilePathResolver } from './worker.js'\n\n/**\n * Read the storage object at `key` into a tmpfile and return the\n * path. 
Cleans up via `fs.unlink` after the run finishes — best-\n * effort, swallows `ENOENT` (the file may already be gone if the\n * worker crashed and the OS cleaned `/tmp`).\n *\n * Lazy-imports `@murumets-ee/storage` + `@murumets-ee/core` so\n * deployments that don't ingest from remote storage never load the\n * R2 client.\n */\nexport const storageResolveFilePath: FilePathResolver = async (storageKey) => {\n const { createStorageClient } = await import('@murumets-ee/storage')\n const { getStorageConfig } = await import('@murumets-ee/storage/plugin')\n const { getApp } = await import('@murumets-ee/core')\n\n const storage = createStorageClient(getStorageConfig(), { app: getApp() })\n const { body } = await storage.download(storageKey)\n\n // `DownloadResult.body` is `Buffer | ReadableStream<Uint8Array>` per\n // storage's adapter contract. Normalise to Buffer for the simple\n // tmpfile-write path; the carmaker feeds top out at ~tens of MB,\n // well within memory.\n const buffer = Buffer.isBuffer(body)\n ? body\n : Buffer.from(await new Response(body).arrayBuffer())\n\n // Suffix carries the basename hint (last storage-key segment) so\n // operator-readable temp paths help debugging without leaking the\n // full original filename through the FS.\n const suffix = storageKey.split('/').pop() ?? 'feed'\n const localPath = join(tmpdir(), `imports-${crypto.randomUUID()}-${suffix}`)\n await fs.writeFile(localPath, buffer)\n\n return {\n localPath,\n cleanup: async () => {\n try {\n await fs.unlink(localPath)\n } catch (err) {\n // ENOENT is fine — file already gone (OS cleanup, manual\n // intervention, etc.). 
Anything else, rethrow so the worker's\n // best-effort cleanup logs a warning.\n if ((err as NodeJS.ErrnoException).code !== 'ENOENT') throw err\n }\n },\n }\n}\n"],"mappings":"qGA6CA,MAAa,EAA2C,KAAO,IAAe,CAC5E,GAAM,CAAE,uBAAwB,MAAM,OAAO,wBACvC,CAAE,oBAAqB,MAAM,OAAO,+BACpC,CAAE,UAAW,MAAM,OAAO,qBAG1B,CAAE,QAAS,MADD,EAAoB,GAAkB,CAAE,CAAE,IAAK,GAAQ,CAAE,CAC3C,CAAC,SAAS,EAAW,CAM7C,EAAS,OAAO,SAAS,EAAK,CAChC,EACA,OAAO,KAAK,MAAM,IAAI,SAAS,EAAK,CAAC,aAAa,CAAC,CAKjD,EAAS,EAAW,MAAM,IAAI,CAAC,KAAK,EAAI,OACxC,EAAY,EAAK,GAAQ,CAAE,WAAW,OAAO,YAAY,CAAC,GAAG,IAAS,CAG5E,OAFA,MAAMA,EAAG,UAAU,EAAW,EAAO,CAE9B,CACL,YACA,QAAS,SAAY,CACnB,GAAI,CACF,MAAMA,EAAG,OAAO,EAAU,OACnB,EAAK,CAIZ,GAAK,EAA8B,OAAS,SAAU,MAAM,IAGjE"}
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
import { behavior, defineEntity, field } from "@murumets-ee/entity";

/** Allowed values of `import_run.status`; `pending` is the field default. */
const IMPORT_RUN_STATUSES = [`pending`, `running`, `succeeded`, `failed`, `cancelled`];

/**
 * Entity definition for one feed-import attempt: label, status, file
 * location, transform name, opaque JSON params/totals/error summary,
 * start/finish dates and the id of the queue job handling it.
 */
const ImportRun = defineEntity({
  name: `import_run`,
  fields: {
    label: field.text({ required: true, maxLength: 255 }),
    status: field.select({
      options: IMPORT_RUN_STATUSES,
      default: `pending`,
      indexed: true,
    }),
    filePath: field.text({ required: true, maxLength: 2048 }),
    transformName: field.text({ required: true, maxLength: 128, indexed: true }),
    params: field.json(),
    totals: field.json(),
    errorSummary: field.json(),
    startedAt: field.date(),
    finishedAt: field.date(),
    queueJobId: field.text({ maxLength: 64, indexed: true }),
  },
  behaviors: [behavior.auditable()],
  scope: `global`,
  admin: {
    group: `imports`,
    label: `Import runs`,
    labelSingular: `Import run`,
    icon: `upload`,
    hideFromMenu: true,
  },
});

/** Name → transform lookup table. Duplicate names are rejected, never overwritten. */
class TransformRegistry {
  entries = new Map();

  /** Store `transform` under `name`; throws if the name is already taken. */
  register(name, transform) {
    const alreadyTaken = this.entries.has(name);
    if (alreadyTaken) {
      throw new Error(`Transform "${name}" is already registered — two plugins cannot register the same transform name.`);
    }
    this.entries.set(name, transform);
  }

  /** The transform registered under `name`, or `undefined` when there is none. */
  get(name) {
    return this.entries.get(name);
  }

  /** Whether a transform is registered under `name`. */
  has(name) {
    return this.entries.has(name);
  }

  /** All registered names in default sort order. */
  list() {
    const names = [...this.entries.keys()];
    names.sort();
    return names;
  }

  /** Remove every registration. */
  clear() {
    this.entries.clear();
  }
}

// `Symbol.for` yields the same symbol for the same string on every call, so
// each evaluation of this module addresses the same slot on `globalThis`.
const REGISTRY_KEY = Symbol.for(`@murumets-ee/imports:transforms`);

/** Return the registry held on `globalThis`, creating it on first access. */
function getTransformRegistry() {
  const existing = globalThis[REGISTRY_KEY];
  if (existing) {
    return existing;
  }
  const created = new TransformRegistry();
  globalThis[REGISTRY_KEY] = created;
  return created;
}

/** Shorthand for `getTransformRegistry().register(name, transform)`. */
function registerImportTransform(name, transform) {
  getTransformRegistry().register(name, transform);
}

// The single-letter export names are the bundler-assigned aliases of this
// chunk (presumably imported under those names by sibling chunks) and must
// stay exactly as they are.
export {
  ImportRun as a,
  IMPORT_RUN_STATUSES as i,
  getTransformRegistry as n,
  registerImportTransform as r,
  TransformRegistry as t,
};
|
|
2
|
+
//# sourceMappingURL=transform-BUGBTotp.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transform-BUGBTotp.mjs","names":[],"sources":["../src/entities/import-run.ts","../src/transform.ts"],"sourcesContent":["/**\n * ImportRun — one feed-import attempt. Tracks file, transform, status,\n * row totals, and the top error patterns surfaced by `ErrorTracker`.\n *\n * Per PLAN-ECOMMERCE.md PR 7 (PoC scope): generic plumbing entity. The\n * commerce-specific transform parameters (brandId, supplierId, supplier\n * code prefix) live in the opaque `params` JSONB so the imports package\n * stays free of commerce-domain dependencies. PR 8 wires a concrete\n * carmaker-feed transform that reads those params; PR 8a renders a\n * bespoke `/admin/commerce/imports` page over this entity's history.\n *\n * `hideFromMenu: true` because PR 7 ships only the entity + plumbing —\n * the operator-facing surface is PR 8a. The auto-EntityListPage at\n * `/admin/import_run` still resolves for direct lookups, but no sidebar\n * entry points to it.\n */\n\nimport { behavior, defineEntity, field } from '@murumets-ee/entity'\nimport type { AdminClient } from '@murumets-ee/entity/admin'\n\n/**\n * Lifecycle stages.\n *\n * - `pending` — row exists, queue job has been enqueued, worker hasn't picked it up yet.\n * - `running` — worker is streaming + bulk-writing.\n * - `succeeded` — worker finished cleanly (note: per-row failures still possible — see `totals.failed`).\n * - `failed` — worker threw and the queue marked the job dead. `errorSummary.fatal` carries the cause.\n * - `cancelled` — manual operator action via PR 8a (out of scope for PR 7).\n */\nexport const IMPORT_RUN_STATUSES = ['pending', 'running', 'succeeded', 'failed', 'cancelled'] as const\nexport type ImportRunStatus = (typeof IMPORT_RUN_STATUSES)[number]\n\nexport const ImportRun = defineEntity({\n name: 'import_run',\n fields: {\n /** Operator-visible label, e.g. `\"MERCEDES — ME_20251027075918.txt\"`. Free-form. 
*/\n label: field.text({ required: true, maxLength: 255 }),\n /** See {@link IMPORT_RUN_STATUSES}. */\n status: field.select({\n options: IMPORT_RUN_STATUSES,\n default: 'pending',\n indexed: true,\n }),\n /**\n * Path or storage key of the uploaded feed file. Generic string —\n * could be `/var/lumi/uploads/<id>.txt` for the local-disk PoC or an\n * S3 object key once storage adapter integration lands. The worker\n * reads this with the configured `readFeed` resolver.\n */\n filePath: field.text({ required: true, maxLength: 2048 }),\n /**\n * Name of the registered transform applied to each row. Resolved at\n * worker-dispatch time against the transform registry contributed by\n * the consumer. PoC: `'commerce:carmaker-feed'`.\n */\n transformName: field.text({ required: true, maxLength: 128, indexed: true }),\n /**\n * Opaque per-transform parameters. The carmaker transform expects\n * `{ brandId, supplierId, codePrefix? }`. Validation happens inside\n * the transform — the imports package never inspects this shape so\n * a new transform with different params doesn't require a schema\n * migration.\n */\n params: field.json(),\n /**\n * Row counters: `{ submitted, succeeded, failed, skipped, batches }`.\n * Updated by the worker as each batch completes. Final values are\n * what the operator reads; intermediate progress comes from\n * `toolkit_jobs.progress` via the queue UI.\n */\n totals: field.json(),\n /**\n * Output of `ErrorTracker.getTopPatterns(totalRows)` — the top-50\n * error signatures with up to 5 sample rows each. Empty `[]` until\n * the worker writes it on completion.\n */\n errorSummary: field.json(),\n /** Set when the worker picks up the job. */\n startedAt: field.date(),\n /** Set when the worker finishes (success OR fatal failure). 
*/\n finishedAt: field.date(),\n /**\n * `toolkit_jobs.id` of the queue job processing this run — link\n * back so PR 8a can show live progress without a second lookup.\n */\n queueJobId: field.text({ maxLength: 64, indexed: true }),\n },\n behaviors: [behavior.auditable()],\n scope: 'global',\n admin: {\n group: 'imports',\n label: 'Import runs',\n labelSingular: 'Import run',\n icon: 'upload',\n hideFromMenu: true,\n },\n})\n\nexport type ImportRunClient = AdminClient<typeof ImportRun.allFields>\n","/**\n * Per-feed transform plugin interface (PLAN-ECOMMERCE.md PR 7 / D14).\n *\n * The streaming reader reads raw rows out of the file (column key →\n * string). The transform turns one row into a typed `OutputDoc` ready\n * for the bulk-write surface (PoC: `PartsDocument` for the parts ES\n * index). The transform is what makes a generic feed-importer\n * commerce-aware — without it, the importer has no opinion on what a\n * row \"means\".\n *\n * Why an interface, not a function:\n * - The transform may need to reject a row (`RowSkip`) without that\n * counting as an error in `ErrorTracker` — e.g. blank lines, header\n * rows mistakenly retained, or \"this is a replacement-code marker\n * row, handled out-of-band\" decisions.\n * - The transform may need to fail a row (`RowError`) with a typed\n * error class so `ErrorTracker` collapses them by `errorType`.\n * - The transform may need access to the run-level params (brand /\n * supplier IDs, code prefix, batch ID) without those being\n * re-derived per row.\n *\n * PR 7 ships only the interface + a registry. PR 8 ships the first\n * concrete carmaker-feed transform.\n */\n\n/** Identifies which transform a registered handler implements. Stable across deploys. */\nexport type TransformName = string\n\n/** Per-run context provided to every row call. Kept narrow on purpose. */\nexport interface TransformContext {\n /** UUID of the `import_run` row. 
Forward to the output doc as `import_batch_id` for source attribution (D20). */\n importRunId: string\n /** Opaque per-run params copied from `import_run.params`. The transform validates the shape it expects. */\n params: Record<string, unknown>\n /** Operator-supplied label, useful for logging / debugging. */\n runLabel: string\n /** 1-based row number across the whole feed (header counts as row 0). Forward to `RowError.rowNumber`. */\n rowNumber: number\n}\n\n/**\n * One of: a successful `OutputDoc` keyed by a stable `id`, an\n * intentional `skip` (counted in `totals.skipped`), or a row-level\n * error (counted in `totals.failed` AND aggregated by `ErrorTracker`).\n *\n * The `id` field on `success` is used as the bulk-upsert primary key\n * (`_id` in ES). Per D4 the parts index uses\n * `<code_normalized>__<supplier_id>` so a re-run of the same feed\n * idempotently overwrites instead of duplicating.\n */\nexport type RowResult<TDoc> =\n | { kind: 'success'; id: string; doc: TDoc }\n | { kind: 'skip'; reason: string }\n | { kind: 'error'; error: RowError }\n\n/** Row-level error. The aggregator uses `(errorType, field, message)` as the dedup signature. */\nexport interface RowError {\n errorType: string\n message: string\n field?: string | undefined\n}\n\n/**\n * The contract every per-feed transform implements. Pure function of\n * `(row, ctx) → RowResult` — no I/O. Async only because the future\n * carmaker variant might consult an in-memory taxonomy lookup.\n */\nexport interface RowTransform<TDoc> {\n (rawRow: Record<string, string>, ctx: TransformContext): Promise<RowResult<TDoc>>\n}\n\n/**\n * Holds the registered transforms for the current process. Each plugin\n * that ships a transform calls {@link TransformRegistry.register} from\n * its `init` hook.\n *\n * Per CLAUDE.md \"Whitelist, don't blacklist\": the worker dispatches\n * by `import_run.transformName` against this registry. 
An unregistered\n * name fails the run rather than running with a default transform.\n *\n * **Production callers MUST go through {@link getTransformRegistry}**, not\n * `new TransformRegistry()`. The constructor is exposed only so tests\n * (and rare custom-worker embeds) can build an isolated instance and\n * inject it into {@link createRunImportHandler} directly. Two `new`\n * instances do NOT share state — registering a transform on one will\n * NOT make it visible to a worker resolving against the other. The\n * package's queue handler (registered by `imports()` plugin) always\n * resolves against the singleton.\n */\nexport class TransformRegistry {\n private readonly entries = new Map<TransformName, RowTransform<unknown>>()\n\n /**\n * Register a transform under a stable name. Throws on duplicate\n * names so two plugins can't silently overwrite each other (matches\n * the `SearchRegistry` pattern in `@murumets-ee/search`).\n */\n register<TDoc>(name: TransformName, transform: RowTransform<TDoc>): void {\n if (this.entries.has(name)) {\n throw new Error(\n `Transform \"${name}\" is already registered — two plugins cannot register the same transform name.`,\n )\n }\n this.entries.set(name, transform as RowTransform<unknown>)\n }\n\n /** Look up a transform by name. Returns `undefined` if no plugin has registered one. */\n get(name: TransformName): RowTransform<unknown> | undefined {\n return this.entries.get(name)\n }\n\n /** True iff a transform is registered for the given name. */\n has(name: TransformName): boolean {\n return this.entries.has(name)\n }\n\n /** All registered names. Useful for the admin Catalog tab and for tests. */\n list(): TransformName[] {\n return Array.from(this.entries.keys()).sort()\n }\n\n /** Drop all registrations. Tests only. 
*/\n clear(): void {\n this.entries.clear()\n }\n}\n\n// ---------------------------------------------------------------------------\n// Process-global singleton (mirrors `@murumets-ee/queue`'s handler registry).\n// ---------------------------------------------------------------------------\n//\n// Why a singleton: PR 8's carmaker-feed transform lives in a different\n// plugin from `@murumets-ee/imports`. Under Next.js HMR, ONE of those\n// plugins might re-evaluate while the other does not. Holding the\n// registry on `globalThis` via `Symbol.for` means the same `Map`\n// instance is shared across module evaluations — the carmaker plugin's\n// `register` and the imports worker's `get` see each other regardless\n// of evaluation order. Same fix shape as the queue's handler-registry\n// HMR bug (#186).\n\nconst REGISTRY_KEY = Symbol.for('@murumets-ee/imports:transforms')\n\ninterface GlobalThisWithTransforms {\n [REGISTRY_KEY]?: TransformRegistry\n}\n\n/** Returns the singleton registry, creating it on first access. 
*/\nexport function getTransformRegistry(): TransformRegistry {\n const g = globalThis as GlobalThisWithTransforms\n let reg = g[REGISTRY_KEY]\n if (!reg) {\n reg = new TransformRegistry()\n g[REGISTRY_KEY] = reg\n }\n return reg\n}\n\n/**\n * Convenience wrapper around `getTransformRegistry().register(name, fn)`.\n * Plugins that ship a transform call this from their `init` hook — same\n * shape as `@murumets-ee/queue/client`'s `registerJob`.\n */\nexport function registerImportTransform<TDoc>(\n name: TransformName,\n transform: RowTransform<TDoc>,\n): void {\n getTransformRegistry().register(name, transform)\n}\n"],"mappings":"4EA6BA,MAAa,EAAsB,CAAC,UAAW,UAAW,YAAa,SAAU,YAAY,CAGhF,EAAY,EAAa,CACpC,KAAM,aACN,OAAQ,CAEN,MAAO,EAAM,KAAK,CAAE,SAAU,GAAM,UAAW,IAAK,CAAC,CAErD,OAAQ,EAAM,OAAO,CACnB,QAAS,EACT,QAAS,UACT,QAAS,GACV,CAAC,CAOF,SAAU,EAAM,KAAK,CAAE,SAAU,GAAM,UAAW,KAAM,CAAC,CAMzD,cAAe,EAAM,KAAK,CAAE,SAAU,GAAM,UAAW,IAAK,QAAS,GAAM,CAAC,CAQ5E,OAAQ,EAAM,MAAM,CAOpB,OAAQ,EAAM,MAAM,CAMpB,aAAc,EAAM,MAAM,CAE1B,UAAW,EAAM,MAAM,CAEvB,WAAY,EAAM,MAAM,CAKxB,WAAY,EAAM,KAAK,CAAE,UAAW,GAAI,QAAS,GAAM,CAAC,CACzD,CACD,UAAW,CAAC,EAAS,WAAW,CAAC,CACjC,MAAO,SACP,MAAO,CACL,MAAO,UACP,MAAO,cACP,cAAe,aACf,KAAM,SACN,aAAc,GACf,CACF,CAAC,CCPF,IAAa,EAAb,KAA+B,CAC7B,QAA2B,IAAI,IAO/B,SAAe,EAAqB,EAAqC,CACvE,GAAI,KAAK,QAAQ,IAAI,EAAK,CACxB,MAAU,MACR,cAAc,EAAK,gFACpB,CAEH,KAAK,QAAQ,IAAI,EAAM,EAAmC,CAI5D,IAAI,EAAwD,CAC1D,OAAO,KAAK,QAAQ,IAAI,EAAK,CAI/B,IAAI,EAA8B,CAChC,OAAO,KAAK,QAAQ,IAAI,EAAK,CAI/B,MAAwB,CACtB,OAAO,MAAM,KAAK,KAAK,QAAQ,MAAM,CAAC,CAAC,MAAM,CAI/C,OAAc,CACZ,KAAK,QAAQ,OAAO,GAiBxB,MAAM,EAAe,OAAO,IAAI,kCAAkC,CAOlE,SAAgB,GAA0C,CACxD,IAAM,EAAI,WACN,EAAM,EAAE,GAKZ,OAJK,IACH,EAAM,IAAI,EACV,EAAE,GAAgB,GAEb,EAQT,SAAgB,EACd,EACA,EACM,CACN,GAAsB,CAAC,SAAS,EAAM,EAAU"}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
//#region src/transform.d.ts
|
|
2
|
+
/**
|
|
3
|
+
* Per-feed transform plugin interface (PLAN-ECOMMERCE.md PR 7 / D14).
|
|
4
|
+
*
|
|
5
|
+
* The streaming reader reads raw rows out of the file (column key →
|
|
6
|
+
* string). The transform turns one row into a typed `OutputDoc` ready
|
|
7
|
+
* for the bulk-write surface (PoC: `PartsDocument` for the parts ES
|
|
8
|
+
* index). The transform is what makes a generic feed-importer
|
|
9
|
+
* commerce-aware — without it, the importer has no opinion on what a
|
|
10
|
+
* row "means".
|
|
11
|
+
*
|
|
12
|
+
* Why an interface, not a function:
|
|
13
|
+
* - The transform may need to reject a row (`RowSkip`) without that
|
|
14
|
+
* counting as an error in `ErrorTracker` — e.g. blank lines, header
|
|
15
|
+
* rows mistakenly retained, or "this is a replacement-code marker
|
|
16
|
+
* row, handled out-of-band" decisions.
|
|
17
|
+
* - The transform may need to fail a row (`RowError`) with a typed
|
|
18
|
+
* error class so `ErrorTracker` collapses them by `errorType`.
|
|
19
|
+
* - The transform may need access to the run-level params (brand /
|
|
20
|
+
* supplier IDs, code prefix, batch ID) without those being
|
|
21
|
+
* re-derived per row.
|
|
22
|
+
*
|
|
23
|
+
* PR 7 ships only the interface + a registry. PR 8 ships the first
|
|
24
|
+
* concrete carmaker-feed transform.
|
|
25
|
+
*/
|
|
26
|
+
/** Identifies which transform a registered handler implements. Stable across deploys. */
|
|
27
|
+
type TransformName = string;
|
|
28
|
+
/** Per-run context provided to every row call. Kept narrow on purpose. */
|
|
29
|
+
interface TransformContext {
|
|
30
|
+
/** UUID of the `import_run` row. Forward to the output doc as `import_batch_id` for source attribution (D20). */
|
|
31
|
+
importRunId: string;
|
|
32
|
+
/** Opaque per-run params copied from `import_run.params`. The transform validates the shape it expects. */
|
|
33
|
+
params: Record<string, unknown>;
|
|
34
|
+
/** Operator-supplied label, useful for logging / debugging. */
|
|
35
|
+
runLabel: string;
|
|
36
|
+
/** 1-based row number across the whole feed (header counts as row 0). Forward to `RowError.rowNumber`. */
|
|
37
|
+
rowNumber: number;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* One of: a successful `OutputDoc` keyed by a stable `id`, an
|
|
41
|
+
* intentional `skip` (counted in `totals.skipped`), or a row-level
|
|
42
|
+
* error (counted in `totals.failed` AND aggregated by `ErrorTracker`).
|
|
43
|
+
*
|
|
44
|
+
* The `id` field on `success` is used as the bulk-upsert primary key
|
|
45
|
+
* (`_id` in ES). Per D4 the parts index uses
|
|
46
|
+
* `<code_normalized>__<supplier_id>` so a re-run of the same feed
|
|
47
|
+
* idempotently overwrites instead of duplicating.
|
|
48
|
+
*/
|
|
49
|
+
type RowResult<TDoc> = {
|
|
50
|
+
kind: 'success';
|
|
51
|
+
id: string;
|
|
52
|
+
doc: TDoc;
|
|
53
|
+
} | {
|
|
54
|
+
kind: 'skip';
|
|
55
|
+
reason: string;
|
|
56
|
+
} | {
|
|
57
|
+
kind: 'error';
|
|
58
|
+
error: RowError;
|
|
59
|
+
};
|
|
60
|
+
/** Row-level error. The aggregator uses `(errorType, field, message)` as the dedup signature. */
|
|
61
|
+
interface RowError {
|
|
62
|
+
errorType: string;
|
|
63
|
+
message: string;
|
|
64
|
+
field?: string | undefined;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* The contract every per-feed transform implements. Pure function of
|
|
68
|
+
* `(row, ctx) → RowResult` — no I/O. Async only because the future
|
|
69
|
+
* carmaker variant might consult an in-memory taxonomy lookup.
|
|
70
|
+
*/
|
|
71
|
+
interface RowTransform<TDoc> {
|
|
72
|
+
(rawRow: Record<string, string>, ctx: TransformContext): Promise<RowResult<TDoc>>;
|
|
73
|
+
}
|
|
74
|
+
/**
|
|
75
|
+
* Holds the registered transforms for the current process. Each plugin
|
|
76
|
+
* that ships a transform calls {@link TransformRegistry.register} from
|
|
77
|
+
* its `init` hook.
|
|
78
|
+
*
|
|
79
|
+
* Per CLAUDE.md "Whitelist, don't blacklist": the worker dispatches
|
|
80
|
+
* by `import_run.transformName` against this registry. An unregistered
|
|
81
|
+
* name fails the run rather than running with a default transform.
|
|
82
|
+
*
|
|
83
|
+
* **Production callers MUST go through {@link getTransformRegistry}**, not
|
|
84
|
+
* `new TransformRegistry()`. The constructor is exposed only so tests
|
|
85
|
+
* (and rare custom-worker embeds) can build an isolated instance and
|
|
86
|
+
* inject it into {@link createRunImportHandler} directly. Two `new`
|
|
87
|
+
* instances do NOT share state — registering a transform on one will
|
|
88
|
+
* NOT make it visible to a worker resolving against the other. The
|
|
89
|
+
* package's queue handler (registered by `imports()` plugin) always
|
|
90
|
+
* resolves against the singleton.
|
|
91
|
+
*/
|
|
92
|
+
declare class TransformRegistry {
|
|
93
|
+
private readonly entries;
|
|
94
|
+
/**
|
|
95
|
+
* Register a transform under a stable name. Throws on duplicate
|
|
96
|
+
* names so two plugins can't silently overwrite each other (matches
|
|
97
|
+
* the `SearchRegistry` pattern in `@murumets-ee/search`).
|
|
98
|
+
*/
|
|
99
|
+
register<TDoc>(name: TransformName, transform: RowTransform<TDoc>): void;
|
|
100
|
+
/** Look up a transform by name. Returns `undefined` if no plugin has registered one. */
|
|
101
|
+
get(name: TransformName): RowTransform<unknown> | undefined;
|
|
102
|
+
/** True iff a transform is registered for the given name. */
|
|
103
|
+
has(name: TransformName): boolean;
|
|
104
|
+
/** All registered names. Useful for the admin Catalog tab and for tests. */
|
|
105
|
+
list(): TransformName[];
|
|
106
|
+
/** Drop all registrations. Tests only. */
|
|
107
|
+
clear(): void;
|
|
108
|
+
}
|
|
109
|
+
/** Returns the singleton registry, creating it on first access. */
|
|
110
|
+
declare function getTransformRegistry(): TransformRegistry;
|
|
111
|
+
/**
|
|
112
|
+
* Convenience wrapper around `getTransformRegistry().register(name, fn)`.
|
|
113
|
+
* Plugins that ship a transform call this from their `init` hook — same
|
|
114
|
+
* shape as `@murumets-ee/queue/client`'s `registerJob`.
|
|
115
|
+
*/
|
|
116
|
+
declare function registerImportTransform<TDoc>(name: TransformName, transform: RowTransform<TDoc>): void;
|
|
117
|
+
//#endregion
|
|
118
|
+
export { TransformName as a, registerImportTransform as c, TransformContext as i, RowResult as n, TransformRegistry as o, RowTransform as r, getTransformRegistry as s, RowError as t };
|
|
119
|
+
//# sourceMappingURL=transform-D_uhdLeo.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transform-D_uhdLeo.d.mts","names":[],"sources":["../src/transform.ts"],"mappings":";;AA0BA;;;;;AAGA;;;;;;;;;;;AAqBA;;;;;;;;KAxBY,aAAA;;UAGK,gBAAA;EAwBX;EAtBJ,WAAA;EAsB0B;EApB1B,MAAA,EAAQ,MAAA;EAoB0B;EAlBlC,QAAA;EAqBuB;EAnBvB,SAAA;AAAA;;;;;;AA8BF;;;;;KAjBY,SAAA;EACN,IAAA;EAAiB,EAAA;EAAY,GAAA,EAAK,IAAA;AAAA;EAClC,IAAA;EAAc,MAAA;AAAA;EACd,IAAA;EAAe,KAAA,EAAO,QAAA;AAAA;;UAGX,QAAA;EACf,SAAA;EACA,OAAA;EACA,KAAA;AAAA;;;;;;UAQe,YAAA;EAAA,CACd,MAAA,EAAQ,MAAA,kBAAwB,GAAA,EAAK,gBAAA,GAAmB,OAAA,CAAQ,SAAA,CAAU,IAAA;AAAA;;;;;;;;;;;;;;;;;;;cAqBhE,iBAAA;EAAA,iBACM,OAAA;EAgCZ;;AAyBP;;;EAlDE,QAAA,MAAA,CAAe,IAAA,EAAM,aAAA,EAAe,SAAA,EAAW,YAAA,CAAa,IAAA;EAkDL;EAxCvD,GAAA,CAAI,IAAA,EAAM,aAAA,GAAgB,YAAA;EAuDW;EAlDrC,GAAA,CAAI,IAAA,EAAM,aAAA;EAmDJ;EA9CN,IAAA,CAAA,GAAQ,aAAA;EA+CG;EA1CX,KAAA,CAAA;AAAA;;iBAyBc,oBAAA,CAAA,GAAwB,iBAAA;;;;;;iBAexB,uBAAA,MAAA,CACd,IAAA,EAAM,aAAA,EACN,SAAA,EAAW,YAAA,CAAa,IAAA"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { r as RowTransform } from "./transform-D_uhdLeo.mjs";
|
|
2
|
+
import { PartsDocument } from "@murumets-ee/search-elasticsearch";
|
|
3
|
+
import { z } from "zod";
|
|
4
|
+
|
|
5
|
+
//#region src/transforms/carmaker-feed.d.ts
|
|
6
|
+
/** Stable name under which this transform is registered. */
|
|
7
|
+
declare const CARMAKER_FEED_TRANSFORM_NAME = "commerce:carmaker-feed";
|
|
8
|
+
/**
|
|
9
|
+
* Per-job configuration. PR 8a's upload route validates the
|
|
10
|
+
* brandId/supplierId UUIDs, looks up the brand slug + supplier
|
|
11
|
+
* display_name from the commerce entities, and writes the resolved
|
|
12
|
+
* shape into `import_run.params` before enqueuing — that keeps
|
|
13
|
+
* this transform free of DB I/O.
|
|
14
|
+
*/
|
|
15
|
+
declare const carmakerFeedParamsSchema: z.ZodObject<{
|
|
16
|
+
/** UUID of the brand. Forwarded to ES doc per D24 (typed FK). */brandId: z.ZodString; /** Brand slug for facet labels — pre-resolved by the route. */
|
|
17
|
+
brandSlug: z.ZodString; /** UUID of the supplier. */
|
|
18
|
+
supplierId: z.ZodString;
|
|
19
|
+
/**
|
|
20
|
+
* Customer-facing supplier alias (D24 anti-disintermediation) —
|
|
21
|
+
* pre-resolved. NEVER `supplier.name` (the legal name).
|
|
22
|
+
*/
|
|
23
|
+
supplierDisplayName: z.ZodString;
|
|
24
|
+
/**
|
|
25
|
+
* Optional supplier-specific code prefix to strip from
|
|
26
|
+
* `ArticleName` before normalization. e.g. `'ME-'` for Mercedes,
|
|
27
|
+
* `'TO-'` for Toyota. When absent, the full ArticleName is
|
|
28
|
+
* normalized as-is.
|
|
29
|
+
*/
|
|
30
|
+
supplierCodePrefix: z.ZodOptional<z.ZodString>;
|
|
31
|
+
/**
|
|
32
|
+
* Stable identifier for this run's batch — written verbatim to
|
|
33
|
+
* `PartsDocument.import_batch_id` so D20 source-attribution holds
|
|
34
|
+
* for the rare cases an operator needs to roll back a single feed
|
|
35
|
+
* import in ES.
|
|
36
|
+
*/
|
|
37
|
+
importBatchId: z.ZodString;
|
|
38
|
+
}, "strip", z.ZodTypeAny, {
|
|
39
|
+
brandId: string;
|
|
40
|
+
brandSlug: string;
|
|
41
|
+
supplierId: string;
|
|
42
|
+
supplierDisplayName: string;
|
|
43
|
+
importBatchId: string;
|
|
44
|
+
supplierCodePrefix?: string | undefined;
|
|
45
|
+
}, {
|
|
46
|
+
brandId: string;
|
|
47
|
+
brandSlug: string;
|
|
48
|
+
supplierId: string;
|
|
49
|
+
supplierDisplayName: string;
|
|
50
|
+
importBatchId: string;
|
|
51
|
+
supplierCodePrefix?: string | undefined;
|
|
52
|
+
}>;
|
|
53
|
+
type CarmakerFeedParams = z.infer<typeof carmakerFeedParamsSchema>;
|
|
54
|
+
declare const carmakerFeedTransform: RowTransform<PartsDocument>;
|
|
55
|
+
//#endregion
|
|
56
|
+
export { CARMAKER_FEED_TRANSFORM_NAME, type CarmakerFeedParams, carmakerFeedParamsSchema, carmakerFeedTransform };
|
|
57
|
+
//# sourceMappingURL=transforms.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transforms.d.mts","names":[],"sources":["../src/transforms/carmaker-feed.ts"],"mappings":";;;;;;cAiCa,4BAAA;;;;;;;;cASA,wBAAA,EAAwB,CAAA,CAAA,SAAA;;;;;;;;;;;;;;;;;;;;AA+BrC;;;;;;;;;;;;;;;;;;KAAY,kBAAA,GAAqB,CAAA,CAAE,KAAA,QAAa,wBAAA;AAAA,cAoFnC,qBAAA,EAAuB,YAAA,CAAa,aAAA"}
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
import { z } from "zod";

/** Stable name under which this transform is registered. */
const CARMAKER_FEED_TRANSFORM_NAME = `commerce:carmaker-feed`;

/**
 * Per-run configuration expected in `ctx.params`.
 *
 * - `brandId`, `supplierId`, `importBatchId` must be UUID strings.
 * - `brandSlug` is 1..64 characters, `supplierDisplayName` 1..255.
 * - `supplierCodePrefix` is optional: at most 8 characters, uppercase
 *   alphanumerics or "-".
 */
const carmakerFeedParamsSchema = z.object({
  brandId: z.string().uuid(),
  brandSlug: z.string().min(1).max(64),
  supplierId: z.string().uuid(),
  supplierDisplayName: z.string().min(1).max(255),
  supplierCodePrefix: z
    .string()
    .max(8)
    .regex(/^[A-Z0-9-]+$/, `must be uppercase alphanumeric or "-"`)
    .optional(),
  importBatchId: z.string().uuid(),
});

/**
 * Parsed params keyed by the raw params object. A WeakMap is used so an
 * entry does not keep the raw object alive. Failed parses are never stored
 * because `parse` throws before `set` is reached.
 */
const paramsCache = new WeakMap();

/**
 * Validate the raw params against {@link carmakerFeedParamsSchema},
 * re-using the previous result when the same object reference is seen again.
 *
 * @param {Record<string, unknown>} raw - `ctx.params` as handed to the transform.
 * @returns {object} The schema-validated params.
 * @throws Whatever `carmakerFeedParamsSchema.parse` throws on invalid input.
 */
function resolveParams(raw) {
  const cached = paramsCache.get(raw);
  if (cached) return cached;
  const parsed = carmakerFeedParamsSchema.parse(raw);
  paramsCache.set(raw, parsed);
  return parsed;
}

const NORMALIZE_CODE_RE = /[\s-]/g;

/**
 * Uppercase the code and drop every whitespace character and dash.
 *
 * @param {string} code
 * @returns {string}
 */
function normalizeCode(code) {
  return code.toUpperCase().replace(NORMALIZE_CODE_RE, ``);
}

/**
 * Whole-string shape of an accepted European-formatted number:
 *   - plain digits with an optional `,` decimal part (`5`, `1,98`, `-5,00`), or
 *   - `.`-grouped thousands (groups of exactly three digits) followed by a
 *     mandatory `,` decimal part (`1.234,56`).
 *
 * Validating BEFORE the separators are rewritten is what keeps a US-style
 * value such as `1,234.56` from being read as `1.23456`: stripping the dots
 * first and validating afterwards cannot tell the two formats apart.
 */
const EUROPEAN_NUMBER_RE = /^-?(?:\d+(?:,\d+)?|\d{1,3}(?:\.\d{3})+,\d+)$/;

/**
 * Parse a European-formatted decimal (`,` decimal separator, optional `.`
 * thousands separator).
 *
 * Accepted: `'1,98'` → 1.98, `'1.234,56'` → 1234.56, `'5'` → 5, `'-5,00'` → -5.
 * Rejected (returns `NaN`): a dot without a comma (`'1.98'`), a comma that
 * precedes a dot (`'1,234.56'`), malformed grouping (`'1.2.3,4'`), multiple
 * commas, surrounding whitespace, trailing junk, the empty string, and any
 * value that does not fit in a finite double.
 *
 * @param {string} s - Cell text; the caller is responsible for trimming.
 * @returns {number} The parsed value, or `NaN` when the text is not accepted.
 */
function parseEuropeanNumber(s) {
  if (!EUROPEAN_NUMBER_RE.test(s)) return Number.NaN;
  // After validation the dots can only be thousands separators and there is
  // at most one comma, so the rewrite below is unambiguous.
  const value = Number.parseFloat(s.replace(/\./g, ``).replace(`,`, `.`));
  // An extremely long digit run parses to Infinity; treat it as unparseable.
  return Number.isFinite(value) ? value : Number.NaN;
}

/**
 * Trim a cell and collapse `undefined` / empty text to `null`.
 *
 * @param {string | undefined} s
 * @returns {string | null}
 */
function emptyToNull(s) {
  if (s === undefined) return null;
  const trimmed = s.trim();
  return trimmed === `` ? null : trimmed;
}

/**
 * Turn one raw feed row into a parts document.
 *
 * Returns `{ kind: 'error', error }` when:
 *   - `ArticleName` is missing or blank (`missing_article_name`),
 *   - `CurrencyCd` is anything other than `EUR` (`unsupported_currency`),
 *   - the code is empty after prefix stripping + normalization
 *     (`empty_normalized_code`),
 *   - `NetPrice/Discount` cannot be parsed (`invalid_net_price`),
 *   - `GrossPrice` is non-empty, not `NA`, and cannot be parsed
 *     (`invalid_gross_price`).
 * Otherwise returns `{ kind: 'success', id, doc }` where `id` is
 * `<code_normalized>__<supplierId>`.
 *
 * @param {Record<string, string>} row - Raw cells keyed by column header.
 * @param {{ params: Record<string, unknown> }} ctx - Per-run context; only `params` is read.
 * @returns {Promise<object>} The row result.
 * @throws When `ctx.params` does not satisfy {@link carmakerFeedParamsSchema}.
 */
const carmakerFeedTransform = async (row, ctx) => {
  const params = resolveParams(ctx.params);

  const articleName = row.ArticleName?.trim();
  if (!articleName) {
    return {
      kind: `error`,
      error: {
        errorType: `missing_article_name`,
        message: `ArticleName is empty`,
        field: `ArticleName`,
      },
    };
  }

  const currency = (row.CurrencyCd ?? ``).trim();
  if (currency !== `EUR`) {
    return {
      kind: `error`,
      error: {
        errorType: `unsupported_currency`,
        message: `Currency "${currency}" not supported (PoC is EUR-only)`,
        field: `CurrencyCd`,
      },
    };
  }

  // Strip the configured prefix only when the code actually starts with it;
  // codes without the prefix are normalized as they are.
  const prefix = params.supplierCodePrefix;
  const stripped =
    prefix && articleName.startsWith(prefix) ? articleName.slice(prefix.length) : articleName;
  const codeNormalized = normalizeCode(stripped);
  if (!codeNormalized) {
    return {
      kind: `error`,
      error: {
        errorType: `empty_normalized_code`,
        message: `Code "${articleName}" is empty after normalization`,
        field: `ArticleName`,
      },
    };
  }

  // Net price is required. It is trimmed like every other cell so that
  // padding around an otherwise valid value does not fail the row.
  const netPriceRaw = (row[`NetPrice/Discount`] ?? ``).trim();
  const netPrice = parseEuropeanNumber(netPriceRaw);
  if (Number.isNaN(netPrice)) {
    return {
      kind: `error`,
      error: {
        errorType: `invalid_net_price`,
        message: `Could not parse "${netPriceRaw}" as a number`,
        field: `NetPrice/Discount`,
      },
    };
  }

  // Gross price is optional: empty or `NA` becomes null, anything else must parse.
  const grossRaw = (row.GrossPrice ?? ``).trim();
  let grossPrice = null;
  if (grossRaw !== `` && grossRaw !== `NA`) {
    const parsedGross = parseEuropeanNumber(grossRaw);
    if (Number.isNaN(parsedGross)) {
      return {
        kind: `error`,
        error: {
          errorType: `invalid_gross_price`,
          message: `Could not parse "${grossRaw}" as a number`,
          field: `GrossPrice`,
        },
      };
    }
    grossPrice = parsedGross;
  }

  const docId = `${codeNormalized}__${params.supplierId}`;
  return {
    kind: `success`,
    id: docId,
    doc: {
      doc_id: docId,
      code: articleName,
      code_normalized: codeNormalized,
      brand_id: params.brandId,
      brand_slug: params.brandSlug,
      supplier_id: params.supplierId,
      supplier_display_name: params.supplierDisplayName,
      net_price_eur: netPrice,
      gross_price_eur: grossPrice,
      currency: `EUR`,
      barcode: emptyToNull(row.Barcode),
      name_de: emptyToNull(row.Description_DE),
      name_en: emptyToNull(row.Description_EN),
      name_es: emptyToNull(row.Description_ES),
      name_fr: emptyToNull(row.Description_FR),
      name_it: emptyToNull(row.Description_IT),
      name_nl: emptyToNull(row.Description_NL),
      name_pt: emptyToNull(row.Description_PT),
      description1: emptyToNull(row.Description1),
      description2: emptyToNull(row.Description2),
      import_batch_id: params.importBatchId,
      // Timestamp is taken per row, at the moment the document is built.
      imported_at: new Date().toISOString(),
    },
  };
};

export { CARMAKER_FEED_TRANSFORM_NAME, carmakerFeedParamsSchema, carmakerFeedTransform };
|
|
2
|
+
//# sourceMappingURL=transforms.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transforms.mjs","names":[],"sources":["../src/transforms/carmaker-feed.ts"],"sourcesContent":["/**\n * Carmaker-feed transform — converts the giga-test 14-column tab-\n * delimited row into a `PartsDocument` for the parts ES alias.\n *\n * Per PLAN-ECOMMERCE.md PR 8 (PoC scope): ONE generic transform\n * covering every supplier whose feed matches this header. Per-job\n * `(brandId, brandSlug, supplierId, supplierDisplayName, supplierCodePrefix?)`\n * config is fully resolved at enqueue time (PR 8a's route looks up\n * the slug + display_name from the UUIDs before enqueuing) so the\n * transform is a pure function with no DB lookups per row.\n *\n * Decisions enforced:\n * - **D2** Code = bare manufacturer code, normalized. Strip the\n * supplier prefix (e.g. `ME-` for Mercedes, `TO-` for Toyota) if\n * present, then uppercase + dash/space-stripped.\n * - **D4** ES `_id` = `<code_normalized>__<supplierId>` so re-running\n * the same feed idempotently overwrites instead of duplicating.\n * - **D9** Single currency (EUR). Non-EUR rows are rejected as\n * row-level errors — at the PoC stage, only EUR data is loaded.\n * - **D21** Bulk-write path bypasses entity hooks. Per-batch audit\n * lives on the surrounding `import_run`, NOT per-row.\n *\n * **`*S` rows pass through unchanged.** The replacement-code side\n * effect (R1 / D11) is deferred to PR 5e — the row goes into ES\n * with its `*S` suffix preserved, no out-of-band write to\n * `replacement_group` / `replacement_member`.\n */\n\nimport type { PartsDocument } from '@murumets-ee/search-elasticsearch'\nimport { z } from 'zod'\nimport type { RowTransform } from '../transform.js'\n\n/** Stable name under which this transform is registered. */\nexport const CARMAKER_FEED_TRANSFORM_NAME = 'commerce:carmaker-feed'\n\n/**\n * Per-job configuration. 
PR 8a's upload route validates the\n * brandId/supplierId UUIDs, looks up the brand slug + supplier\n * display_name from the commerce entities, and writes the resolved\n * shape into `import_run.params` before enqueuing — that keeps\n * this transform free of DB I/O.\n */\nexport const carmakerFeedParamsSchema = z.object({\n /** UUID of the brand. Forwarded to ES doc per D24 (typed FK). */\n brandId: z.string().uuid(),\n /** Brand slug for facet labels — pre-resolved by the route. */\n brandSlug: z.string().min(1).max(64),\n /** UUID of the supplier. */\n supplierId: z.string().uuid(),\n /**\n * Customer-facing supplier alias (D24 anti-disintermediation) —\n * pre-resolved. NEVER `supplier.name` (the legal name).\n */\n supplierDisplayName: z.string().min(1).max(255),\n /**\n * Optional supplier-specific code prefix to strip from\n * `ArticleName` before normalization. e.g. `'ME-'` for Mercedes,\n * `'TO-'` for Toyota. When absent, the full ArticleName is\n * normalized as-is.\n */\n supplierCodePrefix: z\n .string()\n .max(8)\n .regex(/^[A-Z0-9-]+$/, 'must be uppercase alphanumeric or \"-\"')\n .optional(),\n /**\n * Stable identifier for this run's batch — written verbatim to\n * `PartsDocument.import_batch_id` so D20 source-attribution holds\n * for the rare cases an operator needs to roll back a single feed\n * import in ES.\n */\n importBatchId: z.string().uuid(),\n})\nexport type CarmakerFeedParams = z.infer<typeof carmakerFeedParamsSchema>\n\n/**\n * Run-level memoization of the parsed params. The runner passes the\n * same `ctx.params` reference for every row of a run, so this caches\n * the Zod parse result by reference identity. 
Garbage collection\n * clears the entry naturally once the run finishes and the params\n * object goes out of scope.\n *\n * Failed parses are NOT cached — a malformed-params run throws on\n * every row, but the ErrorTracker collapses identical Zod errors\n * into one bucket so the noise is bounded.\n */\nconst paramsCache = new WeakMap<object, CarmakerFeedParams>()\n\nfunction resolveParams(raw: Record<string, unknown>): CarmakerFeedParams {\n const cached = paramsCache.get(raw)\n if (cached) return cached\n const parsed = carmakerFeedParamsSchema.parse(raw)\n paramsCache.set(raw, parsed)\n return parsed\n}\n\nconst NORMALIZE_CODE_RE = /[\\s-]/g\n\n/** Uppercase + dash/whitespace-stripped (per D2). */\nfunction normalizeCode(s: string): string {\n return s.toUpperCase().replace(NORMALIZE_CODE_RE, '')\n}\n\n/** Validates the post-cleanup numeric form before accepting it. */\nconst STRICT_NUMBER_RE = /^-?\\d+(\\.\\d+)?$/\n\n/**\n * Parse a European-formatted decimal: `,` is the decimal separator, `.`\n * is the (optional) thousands separator. Returns `NaN` on unparseable\n * input — the caller surfaces that as a row-level error rather than\n * silently coercing to `0` or to a wrong value.\n *\n * Cases handled:\n * - `'1,98'` → `1.98` (decimal only)\n * - `'1.234,56'` → `1234.56` (thousands + decimal)\n * - `'5'` → `5` (integer)\n * - `'-5,00'` → `-5` (signed)\n *\n * Cases REJECTED (return `NaN`):\n * - `'1.98'` — bare-dot decimal is ambiguous in this format. The\n * carmaker feeds are documented as comma-decimal; a `.` without a\n * `,` could mean US-format decimal (`1.98`) OR a thousands marker\n * without decimal part. Rather than guess, reject. If a future\n * supplier sends US-format prices, that's a per-supplier transform\n * decision, not a quiet promotion of every dot.\n * - `'1,98EUR'`, `'1,98 '`, `'1,98abc'` — trailing junk. 
`parseFloat`\n * would silently accept the leading numeric prefix; the strict regex\n * post-check ensures the entire string is a clean number.\n * - `'1,2,3'` — multiple commas.\n * - `''`, `'NA'`, anything else non-numeric.\n */\nfunction parseEuropeanNumber(s: string): number {\n const hasComma = s.includes(',')\n const hasDot = s.includes('.')\n let cleaned: string\n if (hasComma && hasDot) {\n // European thousands+decimal: strip every `.`, then swap the (sole)\n // `,` for `.`. The strict regex below catches multi-comma input.\n cleaned = s.replace(/\\./g, '').replace(',', '.')\n } else if (hasComma) {\n cleaned = s.replace(',', '.')\n } else if (hasDot) {\n return Number.NaN\n } else {\n cleaned = s\n }\n if (!STRICT_NUMBER_RE.test(cleaned)) return Number.NaN\n return Number.parseFloat(cleaned)\n}\n\n/** Trim + collapse `''` and `undefined` to `null` (matches the `PartsDocument.* | null` shape). */\nfunction emptyToNull(s: string | undefined): string | null {\n if (s === undefined) return null\n const trimmed = s.trim()\n return trimmed === '' ? null : trimmed\n}\n\nexport const carmakerFeedTransform: RowTransform<PartsDocument> = async (row, ctx) => {\n const params = resolveParams(ctx.params)\n\n // 1. ArticleName is the only required cell that drives both `code`\n // and the ES `_id`. An empty value cannot be normalized; reject\n // rather than emit a doc with `_id = '__<supplierId>'`.\n const articleName = row['ArticleName']?.trim()\n if (!articleName) {\n return {\n kind: 'error',\n error: {\n errorType: 'missing_article_name',\n message: 'ArticleName is empty',\n field: 'ArticleName',\n },\n }\n }\n\n // 2. Currency gate (D9). The PoC is EUR-only — non-EUR data is not\n // converted, it's rejected. When PR 5c lands the price-application\n // layer with FxRate, this gate widens (or moves to the indexer).\n const currency = (row['CurrencyCd'] ?? 
'').trim()\n if (currency !== 'EUR') {\n return {\n kind: 'error',\n error: {\n errorType: 'unsupported_currency',\n message: `Currency \"${currency}\" not supported (PoC is EUR-only)`,\n field: 'CurrencyCd',\n },\n }\n }\n\n // 3. Strip supplier prefix if configured + present, then normalize.\n // A row whose ArticleName doesn't start with the configured prefix\n // is normalized as-is — this matches feeds that mix prefixed and\n // bare codes (rare but observed).\n const stripped =\n params.supplierCodePrefix && articleName.startsWith(params.supplierCodePrefix)\n ? articleName.slice(params.supplierCodePrefix.length)\n : articleName\n const codeNormalized = normalizeCode(stripped)\n if (!codeNormalized) {\n return {\n kind: 'error',\n error: {\n errorType: 'empty_normalized_code',\n message: `Code \"${articleName}\" is empty after normalization`,\n field: 'ArticleName',\n },\n }\n }\n\n // 4. Parse prices. Asymmetric: NetPrice is required (empty / unparseable\n // fails as `invalid_net_price`); GrossPrice is optional (`'NA'` or\n // empty → null, only a non-empty unparseable value fails). Required\n // because every parts row carries a wholesale price; gross is a\n // derived retail figure the supplier doesn't always populate.\n const netPriceRaw = row['NetPrice/Discount'] ?? ''\n const netPrice = parseEuropeanNumber(netPriceRaw)\n if (Number.isNaN(netPrice)) {\n return {\n kind: 'error',\n error: {\n errorType: 'invalid_net_price',\n message: `Could not parse \"${netPriceRaw}\" as a number`,\n field: 'NetPrice/Discount',\n },\n }\n }\n\n const grossRaw = (row['GrossPrice'] ?? '').trim()\n let grossPrice: number | null = null\n if (grossRaw !== '' && grossRaw !== 'NA') {\n const g = parseEuropeanNumber(grossRaw)\n if (Number.isNaN(g)) {\n return {\n kind: 'error',\n error: {\n errorType: 'invalid_gross_price',\n message: `Could not parse \"${grossRaw}\" as a number`,\n field: 'GrossPrice',\n },\n }\n }\n grossPrice = g\n }\n\n // 5. Build the ES doc. 
`imported_at` is generated per-row rather\n // than per-batch so the document carries a usable timestamp even\n // when the batch boundary is invisible to the reader.\n const docId = `${codeNormalized}__${params.supplierId}`\n const doc: PartsDocument = {\n doc_id: docId,\n code: articleName,\n code_normalized: codeNormalized,\n brand_id: params.brandId,\n brand_slug: params.brandSlug,\n supplier_id: params.supplierId,\n supplier_display_name: params.supplierDisplayName,\n net_price_eur: netPrice,\n gross_price_eur: grossPrice,\n currency: 'EUR',\n barcode: emptyToNull(row['Barcode']),\n name_de: emptyToNull(row['Description_DE']),\n name_en: emptyToNull(row['Description_EN']),\n name_es: emptyToNull(row['Description_ES']),\n name_fr: emptyToNull(row['Description_FR']),\n name_it: emptyToNull(row['Description_IT']),\n name_nl: emptyToNull(row['Description_NL']),\n name_pt: emptyToNull(row['Description_PT']),\n description1: emptyToNull(row['Description1']),\n description2: emptyToNull(row['Description2']),\n import_batch_id: params.importBatchId,\n imported_at: new Date().toISOString(),\n }\n\n return { kind: 'success', id: docId, doc 
}\n}\n"],"mappings":"wBAiCA,MAAa,EAA+B,yBAS/B,EAA2B,EAAE,OAAO,CAE/C,QAAS,EAAE,QAAQ,CAAC,MAAM,CAE1B,UAAW,EAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,GAAG,CAEpC,WAAY,EAAE,QAAQ,CAAC,MAAM,CAK7B,oBAAqB,EAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,IAAI,CAO/C,mBAAoB,EACjB,QAAQ,CACR,IAAI,EAAE,CACN,MAAM,eAAgB,wCAAwC,CAC9D,UAAU,CAOb,cAAe,EAAE,QAAQ,CAAC,MAAM,CACjC,CAAC,CAcI,EAAc,IAAI,QAExB,SAAS,EAAc,EAAkD,CACvE,IAAM,EAAS,EAAY,IAAI,EAAI,CACnC,GAAI,EAAQ,OAAO,EACnB,IAAM,EAAS,EAAyB,MAAM,EAAI,CAElD,OADA,EAAY,IAAI,EAAK,EAAO,CACrB,EAGT,MAAM,EAAoB,SAG1B,SAAS,EAAc,EAAmB,CACxC,OAAO,EAAE,aAAa,CAAC,QAAQ,EAAmB,GAAG,CAIvD,MAAM,EAAmB,kBA2BzB,SAAS,EAAoB,EAAmB,CAC9C,IAAM,EAAW,EAAE,SAAS,IAAI,CAC1B,EAAS,EAAE,SAAS,IAAI,CAC1B,EACJ,GAAI,GAAY,EAGd,EAAU,EAAE,QAAQ,MAAO,GAAG,CAAC,QAAQ,IAAK,IAAI,SACvC,EACT,EAAU,EAAE,QAAQ,IAAK,IAAI,SACpB,EACT,MAAO,UAEP,EAAU,EAGZ,OADK,EAAiB,KAAK,EAAQ,CAC5B,OAAO,WAAW,EAAQ,CADW,IAK9C,SAAS,EAAY,EAAsC,CACzD,GAAI,IAAM,IAAA,GAAW,OAAO,KAC5B,IAAM,EAAU,EAAE,MAAM,CACxB,OAAO,IAAY,GAAK,KAAO,EAGjC,MAAa,EAAqD,MAAO,EAAK,IAAQ,CACpF,IAAM,EAAS,EAAc,EAAI,OAAO,CAKlC,EAAc,EAAI,aAAgB,MAAM,CAC9C,GAAI,CAAC,EACH,MAAO,CACL,KAAM,QACN,MAAO,CACL,UAAW,uBACX,QAAS,uBACT,MAAO,cACR,CACF,CAMH,IAAM,GAAY,EAAI,YAAiB,IAAI,MAAM,CACjD,GAAI,IAAa,MACf,MAAO,CACL,KAAM,QACN,MAAO,CACL,UAAW,uBACX,QAAS,aAAa,EAAS,mCAC/B,MAAO,aACR,CACF,CAWH,IAAM,EAAiB,EAHrB,EAAO,oBAAsB,EAAY,WAAW,EAAO,mBAAmB,CAC1E,EAAY,MAAM,EAAO,mBAAmB,OAAO,CACnD,EACwC,CAC9C,GAAI,CAAC,EACH,MAAO,CACL,KAAM,QACN,MAAO,CACL,UAAW,wBACX,QAAS,SAAS,EAAY,gCAC9B,MAAO,cACR,CACF,CAQH,IAAM,EAAc,EAAI,sBAAwB,GAC1C,EAAW,EAAoB,EAAY,CACjD,GAAI,OAAO,MAAM,EAAS,CACxB,MAAO,CACL,KAAM,QACN,MAAO,CACL,UAAW,oBACX,QAAS,oBAAoB,EAAY,eACzC,MAAO,oBACR,CACF,CAGH,IAAM,GAAY,EAAI,YAAiB,IAAI,MAAM,CAC7C,EAA4B,KAChC,GAAI,IAAa,IAAM,IAAa,KAAM,CACxC,IAAM,EAAI,EAAoB,EAAS,CACvC,GAAI,OAAO,MAAM,EAAE,CACjB,MAAO,CACL,KAAM,QACN,MAAO,CACL,UAAW,sBACX,QAAS,oBAAoB,EAAS,eACtC,MAAO,aACR,CACF,CAEH,EAAa,EAMf,IAAM,EAAQ,GAAG,EAAe,IAAI,EAAO,aA0B3C,MAAO,CAAE,KAAM,UAAW,GAAI,EAAO,IAAA,CAxBnC,OAAQ,EACR,KAAM,EACN,gBAAiB,EACjB,SAAU,EAAO,QAC
jB,WAAY,EAAO,UACnB,YAAa,EAAO,WACpB,sBAAuB,EAAO,oBAC9B,cAAe,EACf,gBAAiB,EACjB,SAAU,MACV,QAAS,EAAY,EAAI,QAAW,CACpC,QAAS,EAAY,EAAI,eAAkB,CAC3C,QAAS,EAAY,EAAI,eAAkB,CAC3C,QAAS,EAAY,EAAI,eAAkB,CAC3C,QAAS,EAAY,EAAI,eAAkB,CAC3C,QAAS,EAAY,EAAI,eAAkB,CAC3C,QAAS,EAAY,EAAI,eAAkB,CAC3C,QAAS,EAAY,EAAI,eAAkB,CAC3C,aAAc,EAAY,EAAI,aAAgB,CAC9C,aAAc,EAAY,EAAI,aAAgB,CAC9C,gBAAiB,EAAO,cACxB,YAAa,IAAI,MAAM,CAAC,aAAa,CAGC,CAAE"}
|
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
import { o as TransformRegistry, r as RowTransform } from "./transform-D_uhdLeo.mjs";
|
|
2
|
+
import * as _$_murumets_ee_entity0 from "@murumets-ee/entity";
|
|
3
|
+
import { EsClientLike } from "@murumets-ee/search-elasticsearch";
|
|
4
|
+
import { Logger } from "@murumets-ee/core";
|
|
5
|
+
import { JobDefinition } from "@murumets-ee/queue/client";
|
|
6
|
+
import { z } from "zod";
|
|
7
|
+
import { AdminClient } from "@murumets-ee/entity/admin";
|
|
8
|
+
|
|
9
|
+
//#region src/entities/import-run.d.ts
|
|
10
|
+
/**
|
|
11
|
+
* Lifecycle stages.
|
|
12
|
+
*
|
|
13
|
+
* - `pending` — row exists, queue job has been enqueued, worker hasn't picked it up yet.
|
|
14
|
+
* - `running` — worker is streaming + bulk-writing.
|
|
15
|
+
* - `succeeded` — worker finished cleanly (note: per-row failures still possible — see `totals.failed`).
|
|
16
|
+
* - `failed` — worker threw and the queue marked the job dead. `errorSummary.fatal` carries the cause.
|
|
17
|
+
* - `cancelled` — manual operator action via PR 8a (out of scope for PR 7).
|
|
18
|
+
*/
|
|
19
|
+
declare const IMPORT_RUN_STATUSES: readonly ["pending", "running", "succeeded", "failed", "cancelled"];
|
|
20
|
+
type ImportRunStatus = (typeof IMPORT_RUN_STATUSES)[number];
|
|
21
|
+
declare const ImportRun: _$_murumets_ee_entity0.Entity<{
|
|
22
|
+
id: _$_murumets_ee_entity0.IdField;
|
|
23
|
+
} & _$_murumets_ee_entity0.AuditableFields & {
|
|
24
|
+
/** Operator-visible label, e.g. `"MERCEDES — ME_20251027075918.txt"`. Free-form. */label: _$_murumets_ee_entity0.TextField & {
|
|
25
|
+
readonly required: true;
|
|
26
|
+
readonly maxLength: 255;
|
|
27
|
+
}; /** See {@link IMPORT_RUN_STATUSES}. */
|
|
28
|
+
status: _$_murumets_ee_entity0.SelectField & {
|
|
29
|
+
options: readonly ["pending", "running", "succeeded", "failed", "cancelled"];
|
|
30
|
+
} & {
|
|
31
|
+
readonly options: readonly ["pending", "running", "succeeded", "failed", "cancelled"];
|
|
32
|
+
readonly default: "pending";
|
|
33
|
+
readonly indexed: true;
|
|
34
|
+
};
|
|
35
|
+
/**
|
|
36
|
+
* Path or storage key of the uploaded feed file. Generic string —
|
|
37
|
+
* could be `/var/lumi/uploads/<id>.txt` for the local-disk PoC or an
|
|
38
|
+
* S3 object key once storage adapter integration lands. The worker
|
|
39
|
+
* reads this with the configured `readFeed` resolver.
|
|
40
|
+
*/
|
|
41
|
+
filePath: _$_murumets_ee_entity0.TextField & {
|
|
42
|
+
readonly required: true;
|
|
43
|
+
readonly maxLength: 2048;
|
|
44
|
+
};
|
|
45
|
+
/**
|
|
46
|
+
* Name of the registered transform applied to each row. Resolved at
|
|
47
|
+
* worker-dispatch time against the transform registry contributed by
|
|
48
|
+
* the consumer. PoC: `'commerce:carmaker-feed'`.
|
|
49
|
+
*/
|
|
50
|
+
transformName: _$_murumets_ee_entity0.TextField & {
|
|
51
|
+
readonly required: true;
|
|
52
|
+
readonly maxLength: 128;
|
|
53
|
+
readonly indexed: true;
|
|
54
|
+
};
|
|
55
|
+
/**
|
|
56
|
+
* Opaque per-transform parameters. The carmaker transform expects
|
|
57
|
+
   * `{ brandId, brandSlug, supplierId, supplierDisplayName, supplierCodePrefix?, importBatchId }`. Validation happens inside
|
|
58
|
+
* the transform — the imports package never inspects this shape so
|
|
59
|
+
* a new transform with different params doesn't require a schema
|
|
60
|
+
* migration.
|
|
61
|
+
*/
|
|
62
|
+
params: _$_murumets_ee_entity0.JsonField & Partial<_$_murumets_ee_entity0.JsonField>;
|
|
63
|
+
/**
|
|
64
|
+
* Row counters: `{ submitted, succeeded, failed, skipped, batches }`.
|
|
65
|
+
* Updated by the worker as each batch completes. Final values are
|
|
66
|
+
* what the operator reads; intermediate progress comes from
|
|
67
|
+
* `toolkit_jobs.progress` via the queue UI.
|
|
68
|
+
*/
|
|
69
|
+
totals: _$_murumets_ee_entity0.JsonField & Partial<_$_murumets_ee_entity0.JsonField>;
|
|
70
|
+
/**
|
|
71
|
+
* Output of `ErrorTracker.getTopPatterns(totalRows)` — the top-50
|
|
72
|
+
* error signatures with up to 5 sample rows each. Empty `[]` until
|
|
73
|
+
* the worker writes it on completion.
|
|
74
|
+
*/
|
|
75
|
+
errorSummary: _$_murumets_ee_entity0.JsonField & Partial<_$_murumets_ee_entity0.JsonField>; /** Set when the worker picks up the job. */
|
|
76
|
+
startedAt: _$_murumets_ee_entity0.DateField & Partial<_$_murumets_ee_entity0.DateField>; /** Set when the worker finishes (success OR fatal failure). */
|
|
77
|
+
finishedAt: _$_murumets_ee_entity0.DateField & Partial<_$_murumets_ee_entity0.DateField>;
|
|
78
|
+
/**
|
|
79
|
+
* `toolkit_jobs.id` of the queue job processing this run — link
|
|
80
|
+
* back so PR 8a can show live progress without a second lookup.
|
|
81
|
+
*/
|
|
82
|
+
queueJobId: _$_murumets_ee_entity0.TextField & {
|
|
83
|
+
readonly maxLength: 64;
|
|
84
|
+
readonly indexed: true;
|
|
85
|
+
};
|
|
86
|
+
}>;
|
|
87
|
+
type ImportRunClient = AdminClient<typeof ImportRun.allFields>;
|
|
88
|
+
//#endregion
|
|
89
|
+
//#region src/error-tracker.d.ts
|
|
90
|
+
/**
|
|
91
|
+
* Aggregates per-row errors into top-N pattern buckets so a feed of 1M
|
|
92
|
+
* malformed rows surfaces as a handful of actionable signatures rather
|
|
93
|
+
* than a million identical strings.
|
|
94
|
+
*
|
|
95
|
+
* Lifted from giga-test (`backend/src/workers/csv-importer.ts`) and
|
|
96
|
+
* generalized:
|
|
97
|
+
* - Configurable caps so the importer can tune memory bounds per run.
|
|
98
|
+
* - Pure data — no logging side effects, no I/O.
|
|
99
|
+
* - JSON-serialisable output via {@link ErrorTracker.snapshot} for the
|
|
100
|
+
* `import_run.errorSummary` column.
|
|
101
|
+
*
|
|
102
|
+
* Pattern signature shape: `${errorType}:${field || 'GENERAL'}:${message}`.
|
|
103
|
+
* Same `errorType + field + message` collapses to one bucket; differing
|
|
104
|
+
* messages stay separate. This is intentional: a parser error on column
|
|
105
|
+
* `NetPrice/Discount` ("invalid number 'NA,5'") and the same on column
|
|
106
|
+
* `GrossPrice` are operationally distinct even if the parser is the same.
|
|
107
|
+
*
|
|
108
|
+
* Memory bounds: the patterns map is capped at `maxPatterns`. Once full,
|
|
109
|
+
* additional NEW signatures are dropped — known patterns keep accumulating
|
|
110
|
+
* counts. This is the "top-N most common" model: rare-but-novel errors
|
|
111
|
+
* past the cap are invisible, but the cap protects against a runaway
|
|
112
|
+
* adversarial feed exploding the map. Sample arrays are independently
|
|
113
|
+
* capped at `maxSamplesPerPattern`.
|
|
114
|
+
*/
|
|
115
|
+
/**
|
|
116
|
+
* Recursively-defined JSON-serialisable value. Mirrors the shape of
|
|
117
|
+
* `JsonValue` in `@murumets-ee/entity` without taking a dependency on
|
|
118
|
+
* that package — error-tracker is otherwise standalone, and the
|
|
119
|
+
* `ImportRun.errorSummary` JSONB column accepts anything in this shape.
|
|
120
|
+
*/
|
|
121
|
+
type ImportJsonValue = string | number | boolean | null | ImportJsonValue[] | {
|
|
122
|
+
[key: string]: ImportJsonValue;
|
|
123
|
+
};
|
|
124
|
+
/** One sample row attached to a pattern. `rowData` is the originally-parsed row. */
|
|
125
|
+
interface ErrorSample {
|
|
126
|
+
rowNumber: number;
|
|
127
|
+
rowData: ImportJsonValue;
|
|
128
|
+
}
|
|
129
|
+
/** Public shape of an aggregated pattern as returned by {@link ErrorTracker.getTopPatterns}. */
|
|
130
|
+
interface ErrorPattern {
|
|
131
|
+
errorType: string;
|
|
132
|
+
field: string | null;
|
|
133
|
+
message: string;
|
|
134
|
+
count: number;
|
|
135
|
+
firstOccurrence: number;
|
|
136
|
+
lastOccurrence: number;
|
|
137
|
+
samples: ReadonlyArray<ErrorSample>;
|
|
138
|
+
/** `count / totalErrors`, scaled 0..100. `0` when there are zero errors total. */
|
|
139
|
+
percentage: number;
|
|
140
|
+
}
|
|
141
|
+
interface ErrorTrackerConfig {
|
|
142
|
+
/** Hard cap on distinct signatures. Defaults to 50. New signatures past the cap are dropped. */
|
|
143
|
+
maxPatterns?: number;
|
|
144
|
+
/** Hard cap on samples retained per pattern. Defaults to 5. Excess samples are dropped. */
|
|
145
|
+
maxSamplesPerPattern?: number;
|
|
146
|
+
}
|
|
147
|
+
/** Default caps — match giga-test for compatibility. */
|
|
148
|
+
declare const DEFAULT_MAX_PATTERNS = 50;
|
|
149
|
+
declare const DEFAULT_MAX_SAMPLES_PER_PATTERN = 5;
|
|
150
|
+
declare class ErrorTracker {
|
|
151
|
+
private readonly patterns;
|
|
152
|
+
private readonly maxPatterns;
|
|
153
|
+
private readonly maxSamplesPerPattern;
|
|
154
|
+
private droppedSignatures;
|
|
155
|
+
constructor(config?: ErrorTrackerConfig);
|
|
156
|
+
/**
|
|
157
|
+
* Record one error. Same `(errorType, field, message)` triple bumps the
|
|
158
|
+
* existing bucket; a new triple opens a new one (subject to {@link maxPatterns}).
|
|
159
|
+
*
|
|
160
|
+
* `field` is optional — pass `undefined` for errors not tied to a single
|
|
161
|
+
* column (e.g. parse errors at row level). Internally normalised to the
|
|
162
|
+
* literal string `'GENERAL'` so it shares a bucket with other genericised
|
|
163
|
+
* errors of the same type+message.
|
|
164
|
+
*/
|
|
165
|
+
addError(rowNumber: number, errorType: string, message: string, field: string | undefined, rowData: ImportJsonValue): void;
|
|
166
|
+
/** Total count across every pattern. Counts errors, not patterns. */
|
|
167
|
+
getTotalErrorCount(): number;
|
|
168
|
+
/** Number of distinct signatures retained in the map (≤ `maxPatterns`). */
|
|
169
|
+
getDistinctPatternCount(): number;
|
|
170
|
+
/**
|
|
171
|
+
* Number of NEW signatures dropped because the map was already at
|
|
172
|
+
* capacity. Surfacing this in the import_run summary tells the operator
|
|
173
|
+
* "the top-N was saturated — there's a long tail you're not seeing".
|
|
174
|
+
*/
|
|
175
|
+
getDroppedSignatureCount(): number;
|
|
176
|
+
/**
|
|
177
|
+
* Top patterns sorted by descending count, capped at `maxPatterns`.
|
|
178
|
+
* Stable secondary order is insertion order (Map iteration order is
|
|
179
|
+
* insertion order; Array.prototype.sort is guaranteed stable since ES2019).
|
|
180
|
+
*/
|
|
181
|
+
getTopPatterns(): ErrorPattern[];
|
|
182
|
+
/**
|
|
183
|
+
* Compact JSON-serialisable snapshot for `import_run.errorSummary`.
|
|
184
|
+
* Aside from the patterns array, includes the totals so a reader of
|
|
185
|
+
* just this column doesn't have to re-derive them.
|
|
186
|
+
*/
|
|
187
|
+
snapshot(): ErrorTrackerSnapshot;
|
|
188
|
+
}
|
|
189
|
+
interface ErrorTrackerSnapshot {
|
|
190
|
+
totalErrors: number;
|
|
191
|
+
distinctPatterns: number;
|
|
192
|
+
droppedSignatures: number;
|
|
193
|
+
patterns: ErrorPattern[];
|
|
194
|
+
}
|
|
195
|
+
//#endregion
|
|
196
|
+
//#region src/streaming.d.ts
|
|
197
|
+
/**
|
|
198
|
+
* Tab-delimited / CSV streaming reader. Generic over delimiter so the
|
|
199
|
+
* same path handles `.txt` (tab), `.csv` (comma), and the rare `;`
|
|
200
|
+
* European export dialect.
|
|
201
|
+
*
|
|
202
|
+
* Built on `csv-parse` per giga-test precedent — node-stream-based, low
|
|
203
|
+
* memory, handles UTF-8 BOM (the carmaker feeds are Windows-exported
|
|
204
|
+
* and ship with a BOM that breaks naive split-on-tab parsers).
|
|
205
|
+
*
|
|
206
|
+
* The reader yields `{ rowNumber, row }` pairs where:
|
|
207
|
+
* - `rowNumber` numbers data rows from 1; the header, when present, is row 0.
|
|
208
|
+
* - `row` is `Record<string, string>` keyed by header name. Empty cells
|
|
209
|
+
* are the empty string, NOT `undefined` — feed transforms test with
|
|
210
|
+
* `value === ''` consistently.
|
|
211
|
+
*
|
|
212
|
+
* Why this lives in a streaming reader and not inside the transform
|
|
213
|
+
* itself: the transform sees one already-parsed row at a time, never
|
|
214
|
+
* the file. That keeps transform implementations free of I/O concerns
|
|
215
|
+
* and makes them trivially unit-testable with a fixture row map.
|
|
216
|
+
*/
|
|
217
|
+
interface StreamFeedOptions {
|
|
218
|
+
/** Path to the file on local disk. The reader itself only opens local files; remote-storage keys (R2, S3, …) must first be resolved to a local path, e.g. via the worker's `FilePathResolver`. */
|
|
219
|
+
filePath: string;
|
|
220
|
+
/**
|
|
221
|
+
* Single-character field delimiter. Default `\t` (the carmaker feed
|
|
222
|
+
* format). Pass `,` for CSV, `;` for some European dialects.
|
|
223
|
+
*/
|
|
224
|
+
delimiter?: string;
|
|
225
|
+
/**
|
|
226
|
+
* `true` (default): the first row is the header and column names come
|
|
227
|
+
* from it. `false`: rows are emitted as positional `{ "0": ..., "1": ... }`
|
|
228
|
+
* and the transform reads by index — useful for headerless feeds that
|
|
229
|
+
* commit to a documented column order.
|
|
230
|
+
*/
|
|
231
|
+
hasHeader?: boolean;
|
|
232
|
+
/**
|
|
233
|
+
* Optional explicit column-name list. When provided, takes precedence
|
|
234
|
+
* over `hasHeader` (the header row, if present, is skipped and its values
|
|
235
|
+
* are ignored). Useful when the upstream header is unstable but the
|
|
236
|
+
* positional shape isn't.
|
|
237
|
+
*/
|
|
238
|
+
columns?: ReadonlyArray<string>;
|
|
239
|
+
/**
|
|
240
|
+
* Forward to `csv-parse` `relax_column_count`. Default `false` —
|
|
241
|
+
* a row whose column count doesn't match the header surfaces as a
|
|
242
|
+
* parser error so the transform isn't silently fed truncated data.
|
|
243
|
+
*/
|
|
244
|
+
relaxColumnCount?: boolean;
|
|
245
|
+
}
|
|
246
|
+
interface StreamFeedRow {
|
|
247
|
+
/** 1-based row number. Header (when present) is row 0; first data row is row 1. */
|
|
248
|
+
rowNumber: number;
|
|
249
|
+
/**
|
|
250
|
+
* Cell values keyed by column name (or string-position when
|
|
251
|
+
* `hasHeader: false` AND no `columns`).
|
|
252
|
+
*
|
|
253
|
+
* **Cell-value invariants:**
|
|
254
|
+
* - Empty cells (`A\t\tC`) → `''` (empty string).
|
|
255
|
+
* - Missing TRAILING cells in `relaxColumnCount: true` mode → the
|
|
256
|
+
* key is **absent** from the object, not present-with-`''`. csv-parse
|
|
257
|
+
* does not emit keys for short rows. Transforms reading those
|
|
258
|
+
* columns get `undefined` from `row['col']` and must handle it
|
|
259
|
+
* (`row['col'] ?? ''` is the canonical idiom).
|
|
260
|
+
* - With the default `relaxColumnCount: false`, short rows reject at
|
|
261
|
+
* the parser, so this case never reaches the transform.
|
|
262
|
+
*/
|
|
263
|
+
row: Record<string, string>;
|
|
264
|
+
}
|
|
265
|
+
/**
|
|
266
|
+
* Async-iterable over the parsed rows of a delimited file. Use with
|
|
267
|
+
* `for await (const { rowNumber, row } of streamFeed({ filePath, ... }))`.
|
|
268
|
+
*
|
|
269
|
+
* The iterator owns its file descriptor — the `for await` loop closes
|
|
270
|
+
* the underlying stream when it returns or breaks. Aborting mid-stream
|
|
271
|
+
* (`break`, `throw`, signal) is safe; csv-parse propagates the close.
|
|
272
|
+
*/
|
|
273
|
+
declare function streamFeed(options: StreamFeedOptions): AsyncIterable<StreamFeedRow>;
|
|
274
|
+
//#endregion
|
|
275
|
+
//#region src/runner.d.ts
|
|
276
|
+
/** Soft default; chosen to match giga-test. ES bulk requests over ~5MB get split server-side anyway. */
|
|
277
|
+
declare const DEFAULT_BATCH_SIZE = 1000;
|
|
278
|
+
interface RunImportOptions<TDoc> {
|
|
279
|
+
/** UUID of the `import_run` row driving this run. Forwarded to every transform invocation. */
|
|
280
|
+
importRunId: string;
|
|
281
|
+
/** Operator-supplied label for the run. Forwarded to the transform context. */
|
|
282
|
+
runLabel: string;
|
|
283
|
+
/** Opaque per-run params copied from `import_run.params`. */
|
|
284
|
+
params: Record<string, unknown>;
|
|
285
|
+
/** Transform applied to every parsed row. */
|
|
286
|
+
transform: RowTransform<TDoc>;
|
|
287
|
+
/** Streaming reader options — file path, delimiter, header config. */
|
|
288
|
+
feed: StreamFeedOptions;
|
|
289
|
+
/** ES client (low-level shape from `@murumets-ee/search-elasticsearch`). */
|
|
290
|
+
esClient: EsClientLike;
|
|
291
|
+
/** Index alias to write to. Per D6, callers always pass an alias, never a physical index. */
|
|
292
|
+
esIndex: string;
|
|
293
|
+
/** Rows per `bulkUpsert` call. Default {@link DEFAULT_BATCH_SIZE}. */
|
|
294
|
+
batchSize?: number;
|
|
295
|
+
/**
|
|
296
|
+
* Callback invoked after every batch. The handler in `worker.ts`
|
|
297
|
+
* forwards this to `ctx.updateProgress` for the queue UI; tests
|
|
298
|
+
* inspect it directly. Synchronous + cheap so a slow callback can't
|
|
299
|
+
* back-pressure the importer.
|
|
300
|
+
*/
|
|
301
|
+
onProgress?: (progress: ImportRunProgress) => void;
|
|
302
|
+
/** Optional: stop processing after this many rows. Tests use it; production passes `undefined`. */
|
|
303
|
+
rowLimit?: number;
|
|
304
|
+
/** Abort signal threaded into the underlying ES client request — cooperative cancel. */
|
|
305
|
+
signal?: AbortSignal;
|
|
306
|
+
/** Optional pre-built ErrorTracker instance (it carries its own caps). When omitted, the default caps apply: top-50 patterns × 5 samples. */
|
|
307
|
+
errorTracker?: ErrorTracker;
|
|
308
|
+
}
|
|
309
|
+
/**
|
|
310
|
+
* Progress payload written to `toolkit_jobs.progress` after every batch.
|
|
311
|
+
* Caps + flush rules live on the queue's `updateProgress` debounce —
|
|
312
|
+
* callers don't need to throttle.
|
|
313
|
+
*/
|
|
314
|
+
interface ImportRunProgress {
|
|
315
|
+
rowsRead: number;
|
|
316
|
+
rowsSucceeded: number;
|
|
317
|
+
rowsFailed: number;
|
|
318
|
+
rowsSkipped: number;
|
|
319
|
+
batchesCompleted: number;
|
|
320
|
+
/** Wall-clock seconds since the runner started. */
|
|
321
|
+
elapsedSeconds: number;
|
|
322
|
+
/** Rows / second, computed at every batch. */
|
|
323
|
+
rowsPerSecond: number;
|
|
324
|
+
/** Distinct error patterns currently held by the tracker. Saturates at the cap. */
|
|
325
|
+
distinctErrorPatterns: number;
|
|
326
|
+
}
|
|
327
|
+
/**
|
|
328
|
+
* Final result returned by {@link runImport}. The handler writes these
|
|
329
|
+
* onto the `import_run` row alongside the ErrorTracker snapshot.
|
|
330
|
+
*/
|
|
331
|
+
interface RunImportResult {
|
|
332
|
+
/** Total rows read from the file (excludes skipped empty lines). */
|
|
333
|
+
rowsRead: number;
|
|
334
|
+
/** Rows the transform turned into a successful doc AND the ES cluster acknowledged. */
|
|
335
|
+
rowsSucceeded: number;
|
|
336
|
+
/**
|
|
337
|
+
* Rows that the transform rejected (`{ kind: 'error' }`) OR that ES
|
|
338
|
+
* rejected on bulk-write (per-doc failure). Both are aggregated by
|
|
339
|
+
* `errorTracker` for the import_run summary.
|
|
340
|
+
*/
|
|
341
|
+
rowsFailed: number;
|
|
342
|
+
/** Rows that the transform skipped (`{ kind: 'skip' }`) — header noise, blank lines, intentional drop. */
|
|
343
|
+
rowsSkipped: number;
|
|
344
|
+
/** Number of `bulkUpsert` calls made. */
|
|
345
|
+
batchesCompleted: number;
|
|
346
|
+
/** Final value of {@link ErrorTracker.snapshot}. */
|
|
347
|
+
errors: ReturnType<ErrorTracker['snapshot']>;
|
|
348
|
+
}
|
|
349
|
+
/**
|
|
350
|
+
* Apply the runner against a feed file. Stops on rowLimit OR end-of-file
|
|
351
|
+
* OR if `signal` aborts. Throws if the streaming reader / ES client
|
|
352
|
+
* throws — caller (the queue handler) catches that and writes
|
|
353
|
+
* `import_run.status = 'failed'` with the error message in
|
|
354
|
+
* `errorSummary.fatal`.
|
|
355
|
+
*/
|
|
356
|
+
declare function runImport<TDoc>(options: RunImportOptions<TDoc>): Promise<RunImportResult>;
|
|
357
|
+
//#endregion
|
|
358
|
+
//#region src/worker.d.ts
|
|
359
|
+
/**
|
|
360
|
+
* Structural subset of `JobContext` the handler reads. Declared
|
|
361
|
+
* locally so we don't depend on a `JobHandler` import — the queue
|
|
362
|
+
* package doesn't re-export it from any subpath today (matches the
|
|
363
|
+
* `SendEmailJobContext` pattern in `@murumets-ee/notifications`).
|
|
364
|
+
*
|
|
365
|
+
* Function-param contravariance lets a handler returned with this
|
|
366
|
+
* narrower shape be assigned to the wider `JobHandler<TPayload>` that
|
|
367
|
+
* `registerJob` expects.
|
|
368
|
+
*/
|
|
369
|
+
interface ImportsRunJobContext {
|
|
370
|
+
id: string;
|
|
371
|
+
payload: ImportsRunJobPayload;
|
|
372
|
+
updateProgress(data: ImportRunProgress): void;
|
|
373
|
+
}
|
|
374
|
+
/**
|
|
375
|
+
* Payload schema. `importRunId` is the `import_run.id` UUID — the worker
|
|
376
|
+
* looks up everything else (filePath, transformName, params, …) from
|
|
377
|
+
* that row. Keeping the payload tiny means the queue's progress JSON
|
|
378
|
+
* column never bloats with feed metadata duplicated across `toolkit_jobs`.
|
|
379
|
+
*/
|
|
380
|
+
declare const importsRunJobPayloadSchema: z.ZodObject<{
|
|
381
|
+
importRunId: z.ZodString;
|
|
382
|
+
}, "strip", z.ZodTypeAny, {
|
|
383
|
+
importRunId: string;
|
|
384
|
+
}, {
|
|
385
|
+
importRunId: string;
|
|
386
|
+
}>;
|
|
387
|
+
type ImportsRunJobPayload = z.infer<typeof importsRunJobPayloadSchema>;
|
|
388
|
+
/**
|
|
389
|
+
* Job definition. Consumers register their handler against this with
|
|
390
|
+
* `registerJob(importsRunJob, createRunImportHandler({...}))`.
|
|
391
|
+
*
|
|
392
|
+
* `defaultRetries: 0` — re-running a multi-batch import against the
|
|
393
|
+
* same `import_run.id` while the previous handler may still be writing
|
|
394
|
+
* is a footgun (duplicate batches, double-counted progress). The
|
|
395
|
+
* operator handles retries explicitly via PR 8a's "retry" button by
|
|
396
|
+
* creating a new `import_run` row. When the resumable design lands
|
|
397
|
+
* (post-PoC), retries become safe to enable.
|
|
398
|
+
*/
|
|
399
|
+
declare const importsRunJob: JobDefinition<ImportsRunJobPayload>;
|
|
400
|
+
/**
|
|
401
|
+
* Resolves the ES client at handler-invocation time. A function (rather
|
|
402
|
+
* than the bare `EsClientLike`) so the consumer can lazy-construct the
|
|
403
|
+
* client — typical Next.js setups create the ES connection in a
|
|
404
|
+
* route-handler initialiser, not at plugin-init time.
|
|
405
|
+
*/
|
|
406
|
+
type EsClientResolver = () => EsClientLike | Promise<EsClientLike>;
|
|
407
|
+
/**
|
|
408
|
+
* Resolves the value of `import_run.filePath` (whatever the upload
|
|
409
|
+
* route persisted there — typically a storage adapter key) to a
|
|
410
|
+
* readable LOCAL filesystem path that {@link runImport} can hand to
|
|
411
|
+
* `node:fs.createReadStream`.
|
|
412
|
+
*
|
|
413
|
+
* **Why this exists:** the streaming reader (`streamFeed`) reads from
|
|
414
|
+
* local disk via `createReadStream`. The upload route may persist a
|
|
415
|
+
* remote-storage object key (R2, S3, …) on `import_run.filePath`
|
|
416
|
+
* because that's what `@murumets-ee/storage` returns. Without a
|
|
417
|
+
* resolver, `createReadStream('uploads/2026/05/<uuid>/feed.txt')`
|
|
418
|
+
* crashes with `ENOENT`. The resolver is the documented integration
|
|
419
|
+
* point — typical wiring downloads the storage object to a tmpfile
|
|
420
|
+
* and returns its path.
|
|
421
|
+
*
|
|
422
|
+
* **`cleanup`** runs after the run finishes (success OR failure). The
|
|
423
|
+
* worker awaits it best-effort — a failed cleanup logs but does not
|
|
424
|
+
* crash the run.
|
|
425
|
+
*
|
|
426
|
+
* **No-resolver fallback:** when `resolveFilePath` is unset, the
|
|
427
|
+
* worker treats `import_run.filePath` as already a local FS path
|
|
428
|
+
* (back-compat with the original PoC design where uploads landed on
|
|
429
|
+
* local disk). This stays valid for fixture-driven tests + on-disk
|
|
430
|
+
* deployments.
|
|
431
|
+
*/
|
|
432
|
+
type FilePathResolver = (storageKey: string) => Promise<{
|
|
433
|
+
localPath: string;
|
|
434
|
+
cleanup?: () => Promise<void>;
|
|
435
|
+
}>;
|
|
436
|
+
interface RunImportHandlerConfig {
|
|
437
|
+
/** AdminClient over the `import_run` entity. */
|
|
438
|
+
importRuns: ImportRunClient;
|
|
439
|
+
/** Transform registry to dispatch against. Required on this config — the handler has no built-in fallback, so pass the process-global singleton explicitly if that is what you want. */
|
|
440
|
+
transforms: TransformRegistry;
|
|
441
|
+
/** Resolver for the ES client. */
|
|
442
|
+
esClient: EsClientResolver;
|
|
443
|
+
/** ES alias / index to bulk-write into. Per D6 callers always pass an alias. */
|
|
444
|
+
esIndex: string;
|
|
445
|
+
/**
|
|
446
|
+
* Optional resolver for `import_run.filePath`. See {@link FilePathResolver}.
|
|
447
|
+
* Required when uploads land in remote storage (R2/S3); optional for
|
|
448
|
+
* on-disk PoC setups.
|
|
449
|
+
*/
|
|
450
|
+
resolveFilePath?: FilePathResolver;
|
|
451
|
+
/** Optional structured logger. Defaults to silent. */
|
|
452
|
+
logger?: Logger;
|
|
453
|
+
}
|
|
454
|
+
/**
|
|
455
|
+
* Build the `JobHandler` for {@link importsRunJob}. The returned
|
|
456
|
+
* handler is what gets passed to `registerJob`.
|
|
457
|
+
*
|
|
458
|
+
* The body is wrapped in `runAsCli` so AdminClient calls inside have a
|
|
459
|
+
* synthetic `cli` admin context — `auditable()` records `updatedBy:
|
|
460
|
+
* 'cli'` rather than NULL, and the firewall checker passes. This is
|
|
461
|
+
* the documented worker entry-point pattern (see `runAsCli` JSDoc in
|
|
462
|
+
* `@murumets-ee/core`).
|
|
463
|
+
*/
|
|
464
|
+
declare function createRunImportHandler(config: RunImportHandlerConfig): (job: ImportsRunJobContext) => Promise<void>;
|
|
465
|
+
//#endregion
|
|
466
|
+
export { IMPORT_RUN_STATUSES as C, ImportRunStatus as E, ErrorTrackerConfig as S, ImportRunClient as T, DEFAULT_MAX_PATTERNS as _, RunImportHandlerConfig as a, ErrorSample as b, importsRunJobPayloadSchema as c, RunImportOptions as d, RunImportResult as f, streamFeed as g, StreamFeedRow as h, ImportsRunJobPayload as i, DEFAULT_BATCH_SIZE as l, StreamFeedOptions as m, FilePathResolver as n, createRunImportHandler as o, runImport as p, ImportsRunJobContext as r, importsRunJob as s, EsClientResolver as t, ImportRunProgress as u, DEFAULT_MAX_SAMPLES_PER_PATTERN as v, ImportRun as w, ErrorTracker as x, ErrorPattern as y };
|
|
467
|
+
//# sourceMappingURL=worker-DerGVTSI.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"worker-DerGVTSI.d.mts","names":[],"sources":["../src/entities/import-run.ts","../src/error-tracker.ts","../src/streaming.ts","../src/runner.ts","../src/worker.ts"],"mappings":";;;;;;;;;;;AAgCA;;;;;;;cAHa,mBAAA;AAAA,KACD,eAAA,WAA0B,mBAAA;AAAA,cAEzB,SAAA,yBAAS,MAAA;MAgEpB,sBAAA,CAAA,OAAA;AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAEU,eAAA,GAAkB,WAAA,QAAmB,SAAA,CAAU,SAAA;;;;;;;;;;;;AArE3D;;;;;AACA;;;;;AAEA;;;;;;;;;;;;KCAY,eAAA,sCAKR,eAAA;EAAA,CACG,GAAA,WAAc,eAAA;AAAA;;UAGJ,WAAA;EACf,SAAA;EACA,OAAA,EAAS,eAAA;AAAA;;UAIM,YAAA;EACf,SAAA;EACA,KAAA;EACA,OAAA;EACA,KAAA;EACA,eAAA;EACA,cAAA;EACA,OAAA,EAAS,aAAA,CAAc,WAAA;;EAEvB,UAAA;AAAA;AAAA,UAGe,kBAAA;;EAEf,WAAA;;EAEA,oBAAA;AAAA;;cAIW,oBAAA;AAAA,cACA,+BAAA;AAAA,cAYA,YAAA;EAAA,iBACM,QAAA;EAAA,iBACA,WAAA;EAAA,iBACA,oBAAA;EAAA,QACT,iBAAA;cAEI,MAAA,GAAQ,kBAAA;;;;;;;;;;EAcpB,QAAA,CACE,SAAA,UACA,SAAA,UACA,OAAA,UACA,KAAA,sBACA,OAAA,EAAS,eAAA;;EA+BX,kBAAA,CAAA;;EAOA,uBAAA,CAAA;;;;;;EASA,wBAAA,CAAA;;;;;;EASA,cAAA,CAAA,GAAkB,YAAA;;;;;AD/DpB;ECmFE,QAAA,CAAA,GAAY,oBAAA;AAAA;AAAA,UAUG,oBAAA;EACf,WAAA;EACA,gBAAA;EACA,iBAAA;EACA,QAAA,EAAU,YAAA;AAAA;;;;;;;;;;;;ADtKZ;;;;;AACA;;;;;AAEA;UERiB,iBAAA;;EAEf,QAAA;;;;;EAKA,SAAA;;;;;;;EAOA,SAAA;;;;;;;EAOA,OAAA,GAAU,aAAA;;;;;;EAMV,gBAAA;AAAA;AAAA,UAGe,aAAA;;EAEf,SAAA;;;;;;;;;;;;;;;EAeA,GAAA,EAAK,MAAA;AAAA;;;;;;;;;iBAWgB,UAAA,CAAW,OAAA,EAAS,iBAAA,GAAoB,aAAA,CAAc,aAAA;;;;cCnDhE,kBAAA;AAAA,UAEI,gBAAA;;EAEf,WAAA;;EAEA,QAAA;;EAEA,MAAA,EAAQ,MAAA;;EAER,SAAA,EAAW,YAAA,CAAa,IAAA;;EAExB,IAAA,EAAM,iBAAA;;EAEN,QAAA,EAAU,YAAA;;EAEV,OAAA;EHfoB;EGiBpB,SAAA;EHjBoB;;;;;;EGwBpB,UAAA,IAAc,QAAA,EAAU,iBAAA;;EAExB,QAAA;;EAEA,MAAA,GAAS,WAAA;;EAET,YAAA,GAAe,YAAA;AAAA;;;;;;UAQA,iBAAA;EACf,QAAA;EACA,aAAA;EACA,UAAA;EACA,WAAA;EACA,gBAAA;;EAEA,cAAA;;EAEA,aAAA;;EAEA,qBAAA;AAAA;;;;;UAOe,eAAA;;EAEf,QAAA;;EAEA,aAAA;;;;;;EAMA,UAAA;;EAEA,WAAA;EHFU;EGIV,gBAAA;;EAEA,MAAA,EAAQ,UAAA,CAAW,YAAA;AAAA;;;;;;;;iBAUC,SAAA,MAAA,CAAgB,OAAA,EAAS,gBAAA,CAAiB,IAAA,IAAQ,OAAA,CAAQ,eAAA;;;;;;;;;;;;;UCjF/D,oBA
AA;EACf,EAAA;EACA,OAAA,EAAS,oBAAA;EACT,cAAA,CAAe,IAAA,EAAM,iBAAA;AAAA;;;;;;;cASV,0BAAA,EAA0B,CAAA,CAAA,SAAA;;;;;;;KAG3B,oBAAA,GAAuB,CAAA,CAAE,KAAA,QAAa,0BAAA;;;;;;;;;;;;cAarC,aAAA,EAAe,aAAA,CAAc,oBAAA;;;;;;;KAa9B,gBAAA,SAAyB,YAAA,GAAe,OAAA,CAAQ,YAAA;;;;;;;;;;;;;;;;;;;;;;;;;;KA2BhD,gBAAA,IACV,UAAA,aACG,OAAA;EAAU,SAAA;EAAmB,OAAA,SAAgB,OAAA;AAAA;AAAA,UAEjC,sBAAA;EJPwB;EISvC,UAAA,EAAY,eAAA;EJTmC;EIW/C,UAAA,EAAY,iBAAA;EJXsD;EIalE,QAAA,EAAU,gBAAA;;EAEV,OAAA;;AHjFF;;;;EGuFE,eAAA,GAAkB,gBAAA;EHjFb;EGmFL,MAAA,GAAS,MAAA;AAAA;;AHhFX;;;;;;;;;iBG6FgB,sBAAA,CACd,MAAA,EAAQ,sBAAA,IACN,GAAA,EAAK,oBAAA,KAAyB,OAAA"}
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
import { a as RunImportHandlerConfig, c as importsRunJobPayloadSchema, i as ImportsRunJobPayload, n as FilePathResolver, o as createRunImportHandler, r as ImportsRunJobContext, s as importsRunJob, t as EsClientResolver } from "./worker-DerGVTSI.mjs";
|
|
2
|
+
export { EsClientResolver, FilePathResolver, ImportsRunJobContext, ImportsRunJobPayload, RunImportHandlerConfig, createRunImportHandler, importsRunJob, importsRunJobPayloadSchema };
|
package/dist/worker.mjs
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
/**
 * Queue handler module for the `imports:run` job.
 *
 * The job payload carries only `importRunId`; everything else (transform
 * name, file path, label, params) is read from the `import_run` row through
 * the injected `importRuns` client. The handler then runs the import and
 * writes the outcome back onto that row.
 */
import { n as runImport } from "./runner-DdhiNybk.mjs";
import { runAsCli } from "@murumets-ee/core";
import { defineJob } from "@murumets-ee/queue/client";
import { z } from "zod";

/** Payload schema: an object whose `importRunId` must be a UUID string. */
const importsRunJobPayloadSchema = z.object({
  importRunId: z.string().uuid(),
});

/** Job definition for `imports:run`; declared with zero default retries. */
const importsRunJob = defineJob({
  name: "imports:run",
  description: "Stream a feed file, transform rows, bulk-write to Elasticsearch.",
  schema: importsRunJobPayloadSchema,
  defaultRetries: 0,
});

/** Human-readable reason for a thrown value: `message` for Errors, `String(x)` otherwise. */
function describeFailure(thrown) {
  if (thrown instanceof Error) {
    return thrown.message;
  }
  return String(thrown);
}

/**
 * Build the job handler for {@link importsRunJob}.
 *
 * @param config `{ importRuns, transforms, esClient, esIndex, resolveFilePath, logger }`.
 *   `resolveFilePath` and `logger` may be omitted.
 * @returns An async function taking the job context (`id`, `payload`,
 *   `updateProgress`). The whole body executes inside `runAsCli`.
 */
function createRunImportHandler(config) {
  const { importRuns, transforms, esClient, esIndex, resolveFilePath, logger } = config;

  return async (job) =>
    runAsCli(async () => {
      const { importRunId } = job.payload;
      const log = logger?.child({ jobId: job.id, importRunId, type: "imports:run" });

      // A missing row is not an error: warn and finish without throwing.
      const run = await importRuns.findById(importRunId);
      if (!run) {
        log?.warn("import_run row not found — skipping");
        return;
      }

      // Unknown transform: record the failure on the row first, then throw.
      const transform = transforms.get(run.transformName);
      if (!transform) {
        await markFailed(importRuns, importRunId, `Unknown transform "${run.transformName}"`);
        throw new Error(`No transform registered for name "${run.transformName}"`);
      }

      // Only a non-null, non-array object is forwarded as params; anything else becomes `{}`.
      let params = {};
      if (run.params && typeof run.params === "object" && !Array.isArray(run.params)) {
        params = run.params;
      }

      await importRuns.update(importRunId, {
        status: "running",
        queueJobId: job.id,
        startedAt: new Date(),
      });

      // The ES client is resolved lazily, at handler-invocation time.
      let client;
      try {
        client = await esClient();
      } catch (err) {
        await markFailed(importRuns, importRunId, `Could not resolve ES client: ${describeFailure(err)}`);
        throw err;
      }

      // Without a resolver the stored `filePath` is used as-is.
      let localPath;
      let cleanup;
      if (!resolveFilePath) {
        localPath = run.filePath;
      } else {
        try {
          const resolved = await resolveFilePath(run.filePath);
          localPath = resolved.localPath;
          cleanup = resolved.cleanup;
        } catch (err) {
          await markFailed(importRuns, importRunId, `Could not resolve filePath: ${describeFailure(err)}`);
          throw err;
        }
      }

      let result;
      try {
        result = await runImport({
          importRunId,
          runLabel: run.label,
          params,
          transform,
          feed: { filePath: localPath },
          esClient: client,
          esIndex,
          onProgress: (progress) => job.updateProgress(progress),
        });
      } catch (err) {
        // Failure path: mark the row failed, then clean up, then rethrow the original error.
        await markFailed(importRuns, importRunId, describeFailure(err));
        if (cleanup) {
          await cleanup().catch((cleanupErr) => {
            log?.warn({ err: cleanupErr }, "filePath cleanup after failed run threw — ignoring");
          });
        }
        throw err;
      }

      // Success path: cleanup happens BEFORE the row is marked succeeded; a
      // rejected cleanup is only logged.
      if (cleanup) {
        await cleanup().catch((cleanupErr) => {
          log?.warn({ err: cleanupErr }, "filePath cleanup after successful run threw — ignoring");
        });
      }

      const { rowsRead, rowsSucceeded, rowsFailed, rowsSkipped, batchesCompleted } = result;

      await importRuns.update(importRunId, {
        status: "succeeded",
        finishedAt: new Date(),
        totals: { rowsRead, rowsSucceeded, rowsFailed, rowsSkipped, batchesCompleted },
        // JSON round-trip stores the snapshot as plain JSON data.
        errorSummary: JSON.parse(JSON.stringify(result.errors)),
      });

      log?.info(
        { rowsRead, rowsSucceeded, rowsFailed, rowsSkipped, batches: batchesCompleted },
        "import_run completed",
      );
    });
}

/**
 * Best-effort write of a `failed` status plus `{ fatal: reason }` onto the row.
 * Any error from the update is deliberately swallowed so the caller's original
 * error is the one that propagates.
 */
async function markFailed(importRuns, id, reason) {
  try {
    await importRuns.update(id, {
      status: "failed",
      finishedAt: new Date(),
      errorSummary: { fatal: reason },
    });
  } catch {
    // Intentionally ignored — see the function comment.
  }
}

export { createRunImportHandler, importsRunJob, importsRunJobPayloadSchema };
|
|
2
|
+
//# sourceMappingURL=worker.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"worker.mjs","names":[],"sources":["../src/worker.ts"],"sourcesContent":["/**\n * Queue handler for `imports:run`. The route handler that creates the\n * `import_run` row enqueues this job carrying `{ importRunId }`; the\n * worker module here resolves the run, the transform, and the ES\n * client, then runs `runImport` and writes results back.\n *\n * Per PLAN-ECOMMERCE.md PR 7 (PoC scope): one job type, no resumability.\n * A failed handler attempt re-runs from row 1 on retry — fine for the\n * tens-of-MB feed sizes the PoC exercises.\n *\n * Per CLAUDE.md \"leaf packages\" boundary rule: this handler reads the\n * import_run row via an injected `AdminClient`, NOT a direct `db.select`.\n * Construction happens at plugin init in `plugin.ts`.\n */\n\nimport { type Logger, runAsCli } from '@murumets-ee/core'\nimport { defineJob, type JobDefinition } from '@murumets-ee/queue/client'\nimport type { EsClientLike } from '@murumets-ee/search-elasticsearch'\nimport { z } from 'zod'\nimport { type ImportRunClient, type ImportRunStatus } from './entities/import-run.js'\nimport { type ImportRunProgress, runImport } from './runner.js'\nimport { TransformRegistry } from './transform.js'\n\n/**\n * Structural subset of `JobContext` the handler reads. Declared\n * locally so we don't depend on a `JobHandler` import — the queue\n * package doesn't re-export it from any subpath today (matches the\n * `SendEmailJobContext` pattern in `@murumets-ee/notifications`).\n *\n * Function-param contravariance lets a handler returned with this\n * narrower shape be assigned to the wider `JobHandler<TPayload>` that\n * `registerJob` expects.\n */\nexport interface ImportsRunJobContext {\n id: string\n payload: ImportsRunJobPayload\n updateProgress(data: ImportRunProgress): void\n}\n\n/**\n * Payload schema. `importRunId` is the `import_run.id` UUID — the worker\n * looks up everything else (filePath, transformName, params, …) from\n * that row. 
Keeping the payload tiny means the queue's progress JSON\n * column never bloats with feed metadata duplicated across `toolkit_jobs`.\n */\nexport const importsRunJobPayloadSchema = z.object({\n importRunId: z.string().uuid(),\n})\nexport type ImportsRunJobPayload = z.infer<typeof importsRunJobPayloadSchema>\n\n/**\n * Job definition. Consumers register their handler against this with\n * `registerJob(importsRunJob, createRunImportHandler({...}))`.\n *\n * `defaultRetries: 0` — re-running a multi-batch import against the\n * same `import_run.id` while the previous handler may still be writing\n * is a footgun (duplicate batches, double-counted progress). The\n * operator handles retries explicitly via PR 8a's \"retry\" button by\n * creating a new `import_run` row. When the resumable design lands\n * (post-PoC), retries become safe to enable.\n */\nexport const importsRunJob: JobDefinition<ImportsRunJobPayload> = defineJob({\n name: 'imports:run',\n description: 'Stream a feed file, transform rows, bulk-write to Elasticsearch.',\n schema: importsRunJobPayloadSchema,\n defaultRetries: 0,\n})\n\n/**\n * Resolves the ES client at handler-invocation time. A function (rather\n * than the bare `EsClientLike`) so the consumer can lazy-construct the\n * client — typical Next.js setups create the ES connection in a\n * route-handler initialiser, not at plugin-init time.\n */\nexport type EsClientResolver = () => EsClientLike | Promise<EsClientLike>\n\n/**\n * Resolves the value of `import_run.filePath` (whatever the upload\n * route persisted there — typically a storage adapter key) to a\n * readable LOCAL filesystem path that {@link runImport} can hand to\n * `node:fs.createReadStream`.\n *\n * **Why this exists:** the streaming reader (`streamFeed`) reads from\n * local disk via `createReadStream`. The upload route may persist a\n * remote-storage object key (R2, S3, …) on `import_run.filePath`\n * because that's what `@murumets-ee/storage` returns. 
Without a\n * resolver, `createReadStream('uploads/2026/05/<uuid>/feed.txt')`\n * crashes with `ENOENT`. The resolver is the documented integration\n * point — typical wiring downloads the storage object to a tmpfile\n * and returns its path.\n *\n * **`cleanup`** runs after the run finishes (success OR failure). The\n * worker awaits it best-effort — a failed cleanup logs but does not\n * crash the run.\n *\n * **No-resolver fallback:** when `resolveFilePath` is unset, the\n * worker treats `import_run.filePath` as already a local FS path\n * (back-compat with the original PoC design where uploads landed on\n * local disk). This stays valid for fixture-driven tests + on-disk\n * deployments.\n */\nexport type FilePathResolver = (\n storageKey: string,\n) => Promise<{ localPath: string; cleanup?: () => Promise<void> }>\n\nexport interface RunImportHandlerConfig {\n /** AdminClient over the `import_run` entity. */\n importRuns: ImportRunClient\n /** Transform registry to dispatch against. Defaults to the process-global singleton. */\n transforms: TransformRegistry\n /** Resolver for the ES client. */\n esClient: EsClientResolver\n /** ES alias / index to bulk-write into. Per D6 callers always pass an alias. */\n esIndex: string\n /**\n * Optional resolver for `import_run.filePath`. See {@link FilePathResolver}.\n * Required when uploads land in remote storage (R2/S3); optional for\n * on-disk PoC setups.\n */\n resolveFilePath?: FilePathResolver\n /** Optional structured logger. Defaults to silent. */\n logger?: Logger\n}\n\n/**\n * Build the `JobHandler` for {@link importsRunJob}. The returned\n * handler is what gets passed to `registerJob`.\n *\n * The body is wrapped in `runAsCli` so AdminClient calls inside have a\n * synthetic `cli` admin context — `auditable()` records `updatedBy:\n * 'cli'` rather than NULL, and the firewall checker passes. 
This is\n * the documented worker entry-point pattern (see `runAsCli` JSDoc in\n * `@murumets-ee/core`).\n */\nexport function createRunImportHandler(\n config: RunImportHandlerConfig,\n): (job: ImportsRunJobContext) => Promise<void> {\n const { importRuns, transforms, esClient, esIndex, resolveFilePath, logger } = config\n\n return async (job: ImportsRunJobContext): Promise<void> =>\n runAsCli(async () => {\n const { importRunId } = job.payload\n const log = logger?.child({ jobId: job.id, importRunId, type: 'imports:run' })\n\n const run = await importRuns.findById(importRunId)\n if (!run) {\n // Row was deleted between enqueue and dispatch — nothing to do.\n // Don't throw: a thrown handler retries with no payoff because\n // the row stays gone. Log + return so the queue marks the job\n // completed.\n log?.warn('import_run row not found — skipping')\n return\n }\n\n // `transformName`, `filePath`, `label` are declared `field.text({\n // required: true })` so the inferred DTO already types them as\n // `string` — no cast needed. `params` is `field.json()` which\n // infers as `JsonValue | undefined`; narrow to a record so the\n // transform receives an object.\n const transform = transforms.get(run.transformName)\n if (!transform) {\n await markFailed(importRuns, importRunId, `Unknown transform \"${run.transformName}\"`)\n throw new Error(`No transform registered for name \"${run.transformName}\"`)\n }\n const params: Record<string, unknown> =\n run.params && typeof run.params === 'object' && !Array.isArray(run.params)\n ? (run.params as Record<string, unknown>)\n : {}\n\n await importRuns.update(importRunId, {\n status: 'running' satisfies ImportRunStatus,\n queueJobId: job.id,\n startedAt: new Date(),\n })\n\n let resolvedClient: EsClientLike\n try {\n resolvedClient = await esClient()\n } catch (err) {\n const reason = err instanceof Error ? 
err.message : String(err)\n await markFailed(importRuns, importRunId, `Could not resolve ES client: ${reason}`)\n throw err\n }\n\n // Materialise the file locally if the consumer wired a resolver.\n // The `cleanup` callback (when present) runs in the `finally`\n // below regardless of run outcome so a tmpfile created here\n // doesn't outlive the handler.\n let localPath: string\n let fileCleanup: (() => Promise<void>) | undefined\n if (resolveFilePath) {\n try {\n const resolved = await resolveFilePath(run.filePath)\n localPath = resolved.localPath\n fileCleanup = resolved.cleanup\n } catch (err) {\n const reason = err instanceof Error ? err.message : String(err)\n await markFailed(importRuns, importRunId, `Could not resolve filePath: ${reason}`)\n throw err\n }\n } else {\n // Back-compat: no resolver wired → treat `filePath` as already\n // a local FS path. Fine for fixture-driven tests + the original\n // on-disk PoC design.\n localPath = run.filePath\n }\n\n let result: Awaited<ReturnType<typeof runImport>>\n try {\n result = await runImport({\n importRunId,\n runLabel: run.label,\n params,\n transform,\n feed: { filePath: localPath },\n esClient: resolvedClient,\n esIndex,\n onProgress: (progress) => job.updateProgress(progress),\n })\n } catch (err) {\n const reason = err instanceof Error ? 
err.message : String(err)\n await markFailed(importRuns, importRunId, reason)\n if (fileCleanup) {\n await fileCleanup().catch((cleanupErr: unknown) => {\n log?.warn({ err: cleanupErr }, 'filePath cleanup after failed run threw — ignoring')\n })\n }\n throw err\n }\n\n // Cleanup runs before the success-update so a cleanup throw\n // surfaces as a logged warning rather than a crashed handler at\n // the worst possible moment (just after the row was about to be\n // marked succeeded).\n if (fileCleanup) {\n await fileCleanup().catch((cleanupErr: unknown) => {\n log?.warn({ err: cleanupErr }, 'filePath cleanup after successful run threw — ignoring')\n })\n }\n\n await importRuns.update(importRunId, {\n status: 'succeeded' satisfies ImportRunStatus,\n finishedAt: new Date(),\n totals: {\n rowsRead: result.rowsRead,\n rowsSucceeded: result.rowsSucceeded,\n rowsFailed: result.rowsFailed,\n rowsSkipped: result.rowsSkipped,\n batchesCompleted: result.batchesCompleted,\n },\n // The typed `ErrorTrackerSnapshot` is fully JSON-serialisable but\n // doesn't structurally satisfy entity's recursive `JsonValue`\n // (objects without an index signature aren't `Record<string,\n // JsonValue>`). Round-trip through `JSON.parse(JSON.stringify(...))`\n // to land as the recursive JSON shape — the runtime cost is one\n // serialise per import_run completion, the type-cast cost is zero.\n errorSummary: JSON.parse(JSON.stringify(result.errors)),\n })\n\n log?.info(\n {\n rowsRead: result.rowsRead,\n rowsSucceeded: result.rowsSucceeded,\n rowsFailed: result.rowsFailed,\n rowsSkipped: result.rowsSkipped,\n batches: result.batchesCompleted,\n },\n 'import_run completed',\n )\n })\n}\n\n/** Write a `failed` row with a fatal error reason. Best-effort — logs but does not throw on DB failure. 
*/\nasync function markFailed(\n importRuns: ImportRunClient,\n id: string,\n reason: string,\n): Promise<void> {\n try {\n await importRuns.update(id, {\n status: 'failed' satisfies ImportRunStatus,\n finishedAt: new Date(),\n errorSummary: { fatal: reason },\n })\n } catch {\n // The handler already threw / will throw — the queue will surface\n // the original error. A second failure here just logs noise.\n }\n}\n"],"mappings":"qKA6CA,MAAa,EAA6B,EAAE,OAAO,CACjD,YAAa,EAAE,QAAQ,CAAC,MAAM,CAC/B,CAAC,CAcW,EAAqD,EAAU,CAC1E,KAAM,cACN,YAAa,mEACb,OAAQ,EACR,eAAgB,EACjB,CAAC,CAoEF,SAAgB,EACd,EAC8C,CAC9C,GAAM,CAAE,aAAY,aAAY,WAAU,UAAS,kBAAiB,UAAW,EAE/E,OAAO,KAAO,IACZ,EAAS,SAAY,CACnB,GAAM,CAAE,eAAgB,EAAI,QACtB,EAAM,GAAQ,MAAM,CAAE,MAAO,EAAI,GAAI,cAAa,KAAM,cAAe,CAAC,CAExE,EAAM,MAAM,EAAW,SAAS,EAAY,CAClD,GAAI,CAAC,EAAK,CAKR,GAAK,KAAK,sCAAsC,CAChD,OAQF,IAAM,EAAY,EAAW,IAAI,EAAI,cAAc,CACnD,GAAI,CAAC,EAEH,MADA,MAAM,EAAW,EAAY,EAAa,sBAAsB,EAAI,cAAc,GAAG,CAC3E,MAAM,qCAAqC,EAAI,cAAc,GAAG,CAE5E,IAAM,EACJ,EAAI,QAAU,OAAO,EAAI,QAAW,UAAY,CAAC,MAAM,QAAQ,EAAI,OAAO,CACrE,EAAI,OACL,EAAE,CAER,MAAM,EAAW,OAAO,EAAa,CACnC,OAAQ,UACR,WAAY,EAAI,GAChB,UAAW,IAAI,KAChB,CAAC,CAEF,IAAI,EACJ,GAAI,CACF,EAAiB,MAAM,GAAU,OAC1B,EAAK,CAGZ,MADA,MAAM,EAAW,EAAY,EAAa,gCAD3B,aAAe,MAAQ,EAAI,QAAU,OAAO,EAAI,GACoB,CAC7E,EAOR,IAAI,EACA,EACJ,GAAI,EACF,GAAI,CACF,IAAM,EAAW,MAAM,EAAgB,EAAI,SAAS,CACpD,EAAY,EAAS,UACrB,EAAc,EAAS,cAChB,EAAK,CAGZ,MADA,MAAM,EAAW,EAAY,EAAa,+BAD3B,aAAe,MAAQ,EAAI,QAAU,OAAO,EAAI,GACmB,CAC5E,OAMR,EAAY,EAAI,SAGlB,IAAI,EACJ,GAAI,CACF,EAAS,MAAM,EAAU,CACvB,cACA,SAAU,EAAI,MACd,SACA,YACA,KAAM,CAAE,SAAU,EAAW,CAC7B,SAAU,EACV,UACA,WAAa,GAAa,EAAI,eAAe,EAAS,CACvD,CAAC,OACK,EAAK,CAQZ,MANA,MAAM,EAAW,EAAY,EADd,aAAe,MAAQ,EAAI,QAAU,OAAO,EAAI,CACd,CAC7C,GACF,MAAM,GAAa,CAAC,MAAO,GAAwB,CACjD,GAAK,KAAK,CAAE,IAAK,EAAY,CAAE,qDAAqD,EACpF,CAEE,EAOJ,GACF,MAAM,GAAa,CAAC,MAAO,GAAwB,CACjD,GAAK,KAAK,CAAE,IAAK,EAAY,CAAE,yDAAyD,EACxF,CAGJ,MAAM,EAAW,OAAO,EAAa,CACnC,OAAQ,YACR,WAAY,IAAI,KAChB,OAAQ,CACN,SAAU,EAAO,SACjB,cAAe,EAAO,cACtB,WAAY,EA
AO,WACnB,YAAa,EAAO,YACpB,iBAAkB,EAAO,iBAC1B,CAOD,aAAc,KAAK,MAAM,KAAK,UAAU,EAAO,OAAO,CAAC,CACxD,CAAC,CAEF,GAAK,KACH,CACE,SAAU,EAAO,SACjB,cAAe,EAAO,cACtB,WAAY,EAAO,WACnB,YAAa,EAAO,YACpB,QAAS,EAAO,iBACjB,CACD,uBACD,EACD,CAIN,eAAe,EACb,EACA,EACA,EACe,CACf,GAAI,CACF,MAAM,EAAW,OAAO,EAAI,CAC1B,OAAQ,SACR,WAAY,IAAI,KAChB,aAAc,CAAE,MAAO,EAAQ,CAChC,CAAC,MACI"}
|
package/package.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@murumets-ee/imports",
|
|
3
|
+
"version": "0.12.0",
|
|
4
|
+
"license": "Elastic-2.0",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"exports": {
|
|
7
|
+
".": {
|
|
8
|
+
"types": "./dist/index.d.mts",
|
|
9
|
+
"import": "./dist/index.mjs"
|
|
10
|
+
},
|
|
11
|
+
"./plugin": {
|
|
12
|
+
"types": "./dist/plugin.d.mts",
|
|
13
|
+
"import": "./dist/plugin.mjs"
|
|
14
|
+
},
|
|
15
|
+
"./worker": {
|
|
16
|
+
"types": "./dist/worker.d.mts",
|
|
17
|
+
"import": "./dist/worker.mjs"
|
|
18
|
+
},
|
|
19
|
+
"./transforms": {
|
|
20
|
+
"types": "./dist/transforms.d.mts",
|
|
21
|
+
"import": "./dist/transforms.mjs"
|
|
22
|
+
},
|
|
23
|
+
"./storage-resolver": {
|
|
24
|
+
"types": "./dist/storage-resolver.d.mts",
|
|
25
|
+
"import": "./dist/storage-resolver.mjs"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"files": [
|
|
29
|
+
"dist"
|
|
30
|
+
],
|
|
31
|
+
"dependencies": {
|
|
32
|
+
"csv-parse": "^5.5.3",
|
|
33
|
+
"drizzle-orm": "^0.45.2",
|
|
34
|
+
"zod": "^3.24.1",
|
|
35
|
+
"@murumets-ee/core": "0.12.0",
|
|
36
|
+
"@murumets-ee/db": "0.12.0",
|
|
37
|
+
"@murumets-ee/entity": "0.12.0",
|
|
38
|
+
"@murumets-ee/logging": "0.12.0",
|
|
39
|
+
"@murumets-ee/queue": "0.12.0",
|
|
40
|
+
"@murumets-ee/search-elasticsearch": "0.12.0",
|
|
41
|
+
"@murumets-ee/storage": "0.12.0"
|
|
42
|
+
},
|
|
43
|
+
"devDependencies": {
|
|
44
|
+
"@types/node": "^20.19.39",
|
|
45
|
+
"tsdown": "^0.21.10",
|
|
46
|
+
"typescript": "^5.7.3",
|
|
47
|
+
"vitest": "^2.1.8"
|
|
48
|
+
},
|
|
49
|
+
"typeCoverage": {
|
|
50
|
+
"atLeast": 100
|
|
51
|
+
},
|
|
52
|
+
"scripts": {
|
|
53
|
+
"build": "tsdown",
|
|
54
|
+
"dev": "tsdown --watch",
|
|
55
|
+
"test": "vitest",
|
|
56
|
+
"test:integration": "vitest run --config vitest.integration.config.ts"
|
|
57
|
+
}
|
|
58
|
+
}
|