@ordis-dev/ordis 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/index.d.ts +3 -1
- package/dist/core/index.d.ts.map +1 -1
- package/dist/core/index.js +1 -0
- package/dist/core/index.js.map +1 -1
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +14 -1
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/preprocessor.d.ts +35 -0
- package/dist/core/preprocessor.d.ts.map +1 -0
- package/dist/core/preprocessor.js +297 -0
- package/dist/core/preprocessor.js.map +1 -0
- package/dist/core/types.d.ts +22 -0
- package/dist/core/types.d.ts.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
package/dist/core/index.d.ts
CHANGED
|
@@ -4,5 +4,7 @@
|
|
|
4
4
|
export { ExtractionPipeline, extract } from './pipeline.js';
|
|
5
5
|
export { validateExtractedData } from './validator.js';
|
|
6
6
|
export { PipelineError, PipelineErrorCodes } from './errors.js';
|
|
7
|
-
export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, } from './types.js';
|
|
7
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './preprocessor.js';
|
|
8
|
+
export type { PreprocessResult } from './preprocessor.js';
|
|
9
|
+
export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, HtmlStripOptions, PreprocessingConfig, } from './types.js';
|
|
8
10
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/core/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,mBAAmB,CAAC;AAC3B,YAAY,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAC1D,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,EACV,gBAAgB,EAChB,mBAAmB,GACtB,MAAM,YAAY,CAAC"}
|
package/dist/core/index.js
CHANGED
|
@@ -4,4 +4,5 @@
|
|
|
4
4
|
export { ExtractionPipeline, extract } from './pipeline.js';
|
|
5
5
|
export { validateExtractedData } from './validator.js';
|
|
6
6
|
export { PipelineError, PipelineErrorCodes } from './errors.js';
|
|
7
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './preprocessor.js';
|
|
7
8
|
//# sourceMappingURL=index.js.map
|
package/dist/core/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,mBAAmB,CAAC"}
|
package/dist/core/pipeline.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAMH,OAAO,KAAK,EAAE,iBAAiB,EAAE,cAAc,EAAc,MAAM,YAAY,CAAC;AAEhF;;GAEG;AACH,qBAAa,kBAAkB;IAC3B,OAAO,CAAC,KAAK,CAAU;gBAEX,KAAK,GAAE,OAAe;IAIlC;;OAEG;IACG,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC;IAoLlE;;OAEG;IACH,OAAO,CAAC,UAAU;IAsBlB;;OAEG;YACW,eAAe;CAqBhC;AAED;;GAEG;AACH,wBAAsB,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC,CAGjF"}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
import { LLMClient } from '../llm/client.js';
|
|
5
5
|
import { validateExtractedData } from './validator.js';
|
|
6
6
|
import { PipelineError, PipelineErrorCodes } from './errors.js';
|
|
7
|
+
import { preprocessWithDetails } from './preprocessor.js';
|
|
7
8
|
/**
|
|
8
9
|
* Main extraction pipeline
|
|
9
10
|
*/
|
|
@@ -19,6 +20,18 @@ export class ExtractionPipeline {
|
|
|
19
20
|
const startTime = Date.now();
|
|
20
21
|
const steps = [];
|
|
21
22
|
try {
|
|
23
|
+
// Step 0: Preprocess input (if configured)
|
|
24
|
+
let processedInput = request.input;
|
|
25
|
+
if (request.preprocessing) {
|
|
26
|
+
const preprocessStep = this.recordStep('preprocess', () => {
|
|
27
|
+
return preprocessWithDetails(request.input, request.preprocessing);
|
|
28
|
+
});
|
|
29
|
+
steps.push(preprocessStep);
|
|
30
|
+
if (preprocessStep.success && preprocessStep.data) {
|
|
31
|
+
const result = preprocessStep.data;
|
|
32
|
+
processedInput = result.text;
|
|
33
|
+
}
|
|
34
|
+
}
|
|
22
35
|
// Step 1: Create LLM client
|
|
23
36
|
const clientStep = this.recordStep('create_client', () => {
|
|
24
37
|
return new LLMClient(request.llmConfig);
|
|
@@ -32,7 +45,7 @@ export class ExtractionPipeline {
|
|
|
32
45
|
const extractStep = await this.recordStepAsync('llm_extract', async () => {
|
|
33
46
|
return await client.extract({
|
|
34
47
|
schema: request.schema,
|
|
35
|
-
input: request.input,
|
|
48
|
+
input: processedInput,
|
|
36
49
|
});
|
|
37
50
|
});
|
|
38
51
|
steps.push(extractStep);
|
package/dist/core/pipeline.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAG1D;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACnB,KAAK,CAAU;IAEvB,YAAY,QAAiB,KAAK;QAC9B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CAAC,OAA0B;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAiB,EAAE,CAAC;QAE/B,IAAI,CAAC;YACD,2CAA2C;YAC3C,IAAI,cAAc,GAAG,OAAO,CAAC,KAAK,CAAC;YACnC,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;gBACxB,MAAM,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,YAAY,EAAE,GAAG,EAAE;oBACtD,OAAO,qBAAqB,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;gBACvE,CAAC,CAAC,CAAC;gBACH,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAE3B,IAAI,cAAc,CAAC,OAAO,IAAI,cAAc,CAAC,IAAI,EAAE,CAAC;oBAChD,MAAM,MAAM,GAAG,cAAc,CAAC,IAA+C,CAAC;oBAC9E,cAAc,GAAG,MAAM,CAAC,IAAI,CAAC;gBACjC,CAAC;YACL,CAAC;YAED,4BAA4B;YAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACrD,OAAO,IAAI,SAAS,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC5C,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAEvB,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;gBACtB,MAAM,IAAI,aAAa,CACnB,6BAA6B,EAC7B,kBAAkB,CAAC,SAAS,EAC5B,eAAe,CAClB,CAAC;YACN,CAAC;YAED,MAAM,MAAM,GAAG,UAAU,CAAC,IAAiB,CAAC;YAE5C,kCAAkC;YAClC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,aAAa,EAAE,KAAK,IAAI,EAAE;gBACrE,OAAO,MAAM,MAAM,CAAC,OAAO,CAAC;oBACxB,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,KAAK,EAAE,cAAc;iBACxB,CAAC,CAAC;YACP,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAExB,IAAI,CAAC,WAAW,CAAC,OAAO,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;gBAC5C,MAAM,IAAI,aAAa,CACnB,uBAAuB,EACvB,kBAAkB,CAAC,SAAS,EAC5B,aAAa,EACb,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAC/B,CAAC;YACN,CAAC;YAED,MAAM,UAAU,GAAG,WAAW,CAAC,IAI9B,CAAC;YAEF,kCAAkC;YAClC,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACvD,OAAO,qBAAqB,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;YACl
E,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAEzB,MAAM,UAAU,GAAG,YAAY,CAAC,IAA2F,CAAC;YAE5H,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;gBACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,qCAAqC;YACrC,MAAM,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,kBAAkB,EAAE,GAAG,EAAE;gBAC5D,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;oBAC7B,OAAO,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC;gBACpC,CAAC;gBAED,MAAM,EAAE,SAAS,EAAE,mBAAmB,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC;gBACrE,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,IAAI,SAAS,CAAC;gBAE1D,OAAO;oBACH,cAAc;oBACd,UAAU,EAAE,CAAC,cAAc,IAAI,mBAAmB;iBACrD,CAAC;YACN,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAE3B,MAAM,eAAe,GAAG,cAAc,CAAC,IAAyD,CAAC;YAEjG,IAAI,eAAe,CAAC,UAAU,EAAE,CAAC;gBAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,IAAI,EAAE,UAAU,CAAC,IAAI;oBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;oBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;oBAC/C,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,cAAc,UAAU,CAAC,UAAU,qBAAqB,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,SAAS,GAAG;4BACxG,IAAI,EAAE,kBAAkB,CAAC,gBAAgB;yBAC5C;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,WAAW;YACX,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;gBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;gBAC/C,cAAc,EAAE,eAAe,CAAC,cAAc;gBAC9C,MAAM,EAAE,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;oBA
C9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YAExC,IAAI,KAAK,YAAY,aAAa,EAAE,CAAC;gBACjC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,KAAK,CAAC,OAAO;4BACtB,IAAI,EAAE,KAAK,CAAC,IAAI;yBACnB;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,OAAO;gBACH,OAAO,EAAE,KAAK;gBACd,cAAc,EAAE,KAAK;gBACrB,MAAM,EAAE;oBACJ;wBACI,OAAO,EAAG,KAAe,CAAC,OAAO;wBACjC,IAAI,EAAE,eAAe;qBACxB;iBACJ;gBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;oBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,UAAU,CAAI,IAAY,EAAE,EAAW;QAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,EAAE,EAAE,CAAC;YACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,eAAe,CAAI,IAAY,EAAE,EAAoB;QAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,MAAM,EAAE,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;CACJ;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAA
U,OAAO,CAAC,OAA0B;IACpD,MAAM,QAAQ,GAAG,IAAI,kBAAkB,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACvD,OAAO,MAAM,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AAC3C,CAAC"}
|
package/dist/core/preprocessor.d.ts
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML preprocessing module
|
|
3
|
+
* Strips HTML tags and noise from input text before extraction
|
|
4
|
+
*/
|
|
5
|
+
import type { HtmlStripOptions, PreprocessingConfig } from './types.js';
|
|
6
|
+
/**
|
|
7
|
+
* Resolves preprocessing options to concrete HtmlStripOptions
|
|
8
|
+
*/
|
|
9
|
+
export declare function resolveHtmlStripOptions(config: boolean | HtmlStripOptions | undefined): HtmlStripOptions | null;
|
|
10
|
+
/**
|
|
11
|
+
* Strips HTML from input text according to options
|
|
12
|
+
*/
|
|
13
|
+
export declare function stripHtml(input: string, options: HtmlStripOptions): string;
|
|
14
|
+
/**
|
|
15
|
+
* Preprocesses input text according to configuration
|
|
16
|
+
*/
|
|
17
|
+
export declare function preprocess(input: string, config: PreprocessingConfig): string;
|
|
18
|
+
/**
|
|
19
|
+
* Result of preprocessing
|
|
20
|
+
*/
|
|
21
|
+
export interface PreprocessResult {
|
|
22
|
+
/** The preprocessed text */
|
|
23
|
+
text: string;
|
|
24
|
+
/** Whether preprocessing was applied */
|
|
25
|
+
wasProcessed: boolean;
|
|
26
|
+
/** Original input length */
|
|
27
|
+
originalLength: number;
|
|
28
|
+
/** Processed text length */
|
|
29
|
+
processedLength: number;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Preprocesses input with detailed result information
|
|
33
|
+
*/
|
|
34
|
+
export declare function preprocessWithDetails(input: string, config: PreprocessingConfig | undefined): PreprocessResult;
|
|
35
|
+
//# sourceMappingURL=preprocessor.d.ts.map
|
package/dist/core/preprocessor.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"preprocessor.d.ts","sourceRoot":"","sources":["../../src/core/preprocessor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC;AA4CxE;;GAEG;AACH,wBAAgB,uBAAuB,CACnC,MAAM,EAAE,OAAO,GAAG,gBAAgB,GAAG,SAAS,GAC/C,gBAAgB,GAAG,IAAI,CAqBzB;AAuKD;;GAEG;AACH,wBAAgB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,MAAM,CA2C1E;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,mBAAmB,GAAG,MAAM,CAY7E;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,4BAA4B;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,wCAAwC;IACxC,YAAY,EAAE,OAAO,CAAC;IACtB,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,4BAA4B;IAC5B,eAAe,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACjC,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,mBAAmB,GAAG,SAAS,GACxC,gBAAgB,CAkBlB"}
|
package/dist/core/preprocessor.js
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML preprocessing module
|
|
3
|
+
* Strips HTML tags and noise from input text before extraction
|
|
4
|
+
*/
|
|
5
|
+
import { parse, HTMLElement } from 'node-html-parser';
|
|
6
|
+
/**
|
|
7
|
+
* Default selectors to remove from HTML
|
|
8
|
+
* These typically contain non-content elements
|
|
9
|
+
*/
|
|
10
|
+
const DEFAULT_REMOVE_SELECTORS = [
|
|
11
|
+
'script',
|
|
12
|
+
'style',
|
|
13
|
+
'nav',
|
|
14
|
+
'footer',
|
|
15
|
+
'header',
|
|
16
|
+
'aside',
|
|
17
|
+
'noscript',
|
|
18
|
+
'iframe',
|
|
19
|
+
'svg',
|
|
20
|
+
'canvas',
|
|
21
|
+
'form',
|
|
22
|
+
// Common ad and tracking selectors
|
|
23
|
+
'[class*="ad-"]',
|
|
24
|
+
'[class*="advertisement"]',
|
|
25
|
+
'[class*="cookie"]',
|
|
26
|
+
'[class*="subscribe"]',
|
|
27
|
+
'[class*="newsletter"]',
|
|
28
|
+
'[class*="popup"]',
|
|
29
|
+
'[class*="modal"]',
|
|
30
|
+
'[class*="banner"]',
|
|
31
|
+
'[id*="ad-"]',
|
|
32
|
+
'[id*="advertisement"]',
|
|
33
|
+
'[id*="cookie"]',
|
|
34
|
+
];
|
|
35
|
+
/**
|
|
36
|
+
* Elements that should preserve their semantic meaning
|
|
37
|
+
* when preserveStructure is enabled
|
|
38
|
+
*/
|
|
39
|
+
const SEMANTIC_ELEMENTS = {
|
|
40
|
+
headings: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
|
|
41
|
+
lists: ['ul', 'ol', 'li'],
|
|
42
|
+
containers: ['article', 'main', 'section', 'div', 'body', 'html'],
|
|
43
|
+
blocks: ['p', 'blockquote'],
|
|
44
|
+
inline: ['strong', 'b', 'em', 'i', 'a', 'code'],
|
|
45
|
+
};
|
|
46
|
+
/**
|
|
47
|
+
* Resolves preprocessing options to concrete HtmlStripOptions
|
|
48
|
+
*/
|
|
49
|
+
export function resolveHtmlStripOptions(config) {
|
|
50
|
+
if (!config) {
|
|
51
|
+
return null;
|
|
52
|
+
}
|
|
53
|
+
if (config === true) {
|
|
54
|
+
// Default options when stripHtml: true
|
|
55
|
+
return {
|
|
56
|
+
extractText: true,
|
|
57
|
+
preserveStructure: false,
|
|
58
|
+
removeSelectors: [],
|
|
59
|
+
maxLength: undefined,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
return {
|
|
63
|
+
extractText: config.extractText ?? true,
|
|
64
|
+
preserveStructure: config.preserveStructure ?? false,
|
|
65
|
+
removeSelectors: config.removeSelectors ?? [],
|
|
66
|
+
maxLength: config.maxLength,
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Removes elements matching the specified selectors
|
|
71
|
+
*/
|
|
72
|
+
function removeElements(root, selectors) {
|
|
73
|
+
const allSelectors = [...DEFAULT_REMOVE_SELECTORS, ...selectors];
|
|
74
|
+
for (const selector of allSelectors) {
|
|
75
|
+
try {
|
|
76
|
+
const elements = root.querySelectorAll(selector);
|
|
77
|
+
for (const el of elements) {
|
|
78
|
+
el.remove();
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
catch {
|
|
82
|
+
// Invalid selector, skip silently
|
|
83
|
+
// This can happen with complex CSS selectors not supported by node-html-parser
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Converts semantic HTML elements to markdown-like text
|
|
89
|
+
*/
|
|
90
|
+
function convertToStructuredText(root) {
|
|
91
|
+
const lines = [];
|
|
92
|
+
function processNode(node, depth = 0) {
|
|
93
|
+
if (!node)
|
|
94
|
+
return;
|
|
95
|
+
const tagName = node.tagName?.toLowerCase() || '';
|
|
96
|
+
// Handle headings
|
|
97
|
+
if (SEMANTIC_ELEMENTS.headings.includes(tagName)) {
|
|
98
|
+
const level = parseInt(tagName[1], 10);
|
|
99
|
+
const prefix = '#'.repeat(level) + ' ';
|
|
100
|
+
const text = node.text.trim();
|
|
101
|
+
if (text) {
|
|
102
|
+
lines.push('');
|
|
103
|
+
lines.push(prefix + text);
|
|
104
|
+
lines.push('');
|
|
105
|
+
}
|
|
106
|
+
return;
|
|
107
|
+
}
|
|
108
|
+
// Handle list items
|
|
109
|
+
if (tagName === 'li') {
|
|
110
|
+
const parent = node.parentNode;
|
|
111
|
+
const parentTag = parent?.tagName?.toLowerCase();
|
|
112
|
+
const prefix = parentTag === 'ol' ? '1. ' : '- ';
|
|
113
|
+
const text = node.text.trim();
|
|
114
|
+
if (text) {
|
|
115
|
+
lines.push(prefix + text);
|
|
116
|
+
}
|
|
117
|
+
return;
|
|
118
|
+
}
|
|
119
|
+
// Handle lists container
|
|
120
|
+
if (tagName === 'ul' || tagName === 'ol') {
|
|
121
|
+
lines.push('');
|
|
122
|
+
for (const child of node.childNodes) {
|
|
123
|
+
if (child instanceof HTMLElement) {
|
|
124
|
+
processNode(child, depth + 1);
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
lines.push('');
|
|
128
|
+
return;
|
|
129
|
+
}
|
|
130
|
+
// Handle blockquotes
|
|
131
|
+
if (tagName === 'blockquote') {
|
|
132
|
+
const text = node.text.trim();
|
|
133
|
+
if (text) {
|
|
134
|
+
lines.push('');
|
|
135
|
+
lines.push('> ' + text.replace(/\n/g, '\n> '));
|
|
136
|
+
lines.push('');
|
|
137
|
+
}
|
|
138
|
+
return;
|
|
139
|
+
}
|
|
140
|
+
// Handle code blocks
|
|
141
|
+
if (tagName === 'pre' || tagName === 'code') {
|
|
142
|
+
const text = node.text.trim();
|
|
143
|
+
if (text) {
|
|
144
|
+
lines.push('');
|
|
145
|
+
lines.push('```');
|
|
146
|
+
lines.push(text);
|
|
147
|
+
lines.push('```');
|
|
148
|
+
lines.push('');
|
|
149
|
+
}
|
|
150
|
+
return;
|
|
151
|
+
}
|
|
152
|
+
// Handle paragraphs and other block elements
|
|
153
|
+
if (SEMANTIC_ELEMENTS.blocks.includes(tagName)) {
|
|
154
|
+
const text = node.text.trim();
|
|
155
|
+
if (text) {
|
|
156
|
+
lines.push('');
|
|
157
|
+
lines.push(text);
|
|
158
|
+
lines.push('');
|
|
159
|
+
}
|
|
160
|
+
return;
|
|
161
|
+
}
|
|
162
|
+
// Handle container elements - recurse into children
|
|
163
|
+
if (SEMANTIC_ELEMENTS.containers.includes(tagName) || !tagName) {
|
|
164
|
+
for (const child of node.childNodes) {
|
|
165
|
+
if (child instanceof HTMLElement) {
|
|
166
|
+
processNode(child, depth);
|
|
167
|
+
}
|
|
168
|
+
else if (child.nodeType === 3) {
|
|
169
|
+
// Text node
|
|
170
|
+
const text = child.text.trim();
|
|
171
|
+
if (text) {
|
|
172
|
+
lines.push(text);
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
return;
|
|
177
|
+
}
|
|
178
|
+
// Recursively process children for any other elements
|
|
179
|
+
for (const child of node.childNodes) {
|
|
180
|
+
if (child instanceof HTMLElement) {
|
|
181
|
+
processNode(child, depth);
|
|
182
|
+
}
|
|
183
|
+
else if (child.nodeType === 3) {
|
|
184
|
+
// Text node
|
|
185
|
+
const text = child.text.trim();
|
|
186
|
+
if (text) {
|
|
187
|
+
lines.push(text);
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
processNode(root);
|
|
193
|
+
// Clean up multiple blank lines
|
|
194
|
+
return lines
|
|
195
|
+
.join('\n')
|
|
196
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
197
|
+
.trim();
|
|
198
|
+
}
|
|
199
|
+
/**
|
|
200
|
+
* Extracts plain text from HTML, preserving meaningful whitespace
|
|
201
|
+
*/
|
|
202
|
+
function extractPlainText(root) {
|
|
203
|
+
// Get raw text
|
|
204
|
+
let text = root.text;
|
|
205
|
+
// Clean up whitespace while preserving paragraph breaks
|
|
206
|
+
text = text
|
|
207
|
+
// Replace multiple spaces with single space
|
|
208
|
+
.replace(/[ \t]+/g, ' ')
|
|
209
|
+
// Replace multiple newlines with double newline (paragraph break)
|
|
210
|
+
.replace(/\n\s*\n/g, '\n\n')
|
|
211
|
+
// Remove leading/trailing whitespace from each line
|
|
212
|
+
.split('\n')
|
|
213
|
+
.map(line => line.trim())
|
|
214
|
+
.join('\n')
|
|
215
|
+
// Remove more than two consecutive newlines
|
|
216
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
217
|
+
.trim();
|
|
218
|
+
return text;
|
|
219
|
+
}
|
|
220
|
+
/**
|
|
221
|
+
* Strips HTML from input text according to options
|
|
222
|
+
*/
|
|
223
|
+
export function stripHtml(input, options) {
|
|
224
|
+
// Quick check: if no HTML-like content, return as-is
|
|
225
|
+
if (!input.includes('<') || !input.includes('>')) {
|
|
226
|
+
return options.maxLength ? input.slice(0, options.maxLength) : input;
|
|
227
|
+
}
|
|
228
|
+
// Parse HTML
|
|
229
|
+
const root = parse(input, {
|
|
230
|
+
lowerCaseTagName: true,
|
|
231
|
+
comment: false, // Remove comments
|
|
232
|
+
blockTextElements: {
|
|
233
|
+
script: true,
|
|
234
|
+
noscript: true,
|
|
235
|
+
style: true,
|
|
236
|
+
pre: true,
|
|
237
|
+
},
|
|
238
|
+
});
|
|
239
|
+
// Remove unwanted elements
|
|
240
|
+
removeElements(root, options.removeSelectors || []);
|
|
241
|
+
// Extract text based on options
|
|
242
|
+
let result;
|
|
243
|
+
if (options.preserveStructure) {
|
|
244
|
+
result = convertToStructuredText(root);
|
|
245
|
+
}
|
|
246
|
+
else {
|
|
247
|
+
result = extractPlainText(root);
|
|
248
|
+
}
|
|
249
|
+
// Apply max length if specified
|
|
250
|
+
if (options.maxLength && result.length > options.maxLength) {
|
|
251
|
+
result = result.slice(0, options.maxLength);
|
|
252
|
+
// Try to break at a word boundary
|
|
253
|
+
const lastSpace = result.lastIndexOf(' ');
|
|
254
|
+
if (lastSpace > options.maxLength * 0.8) {
|
|
255
|
+
result = result.slice(0, lastSpace) + '...';
|
|
256
|
+
}
|
|
257
|
+
else {
|
|
258
|
+
result += '...';
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
return result;
|
|
262
|
+
}
|
|
263
|
+
/**
|
|
264
|
+
* Preprocesses input text according to configuration
|
|
265
|
+
*/
|
|
266
|
+
export function preprocess(input, config) {
|
|
267
|
+
let result = input;
|
|
268
|
+
// Handle HTML stripping
|
|
269
|
+
if (config.stripHtml) {
|
|
270
|
+
const options = resolveHtmlStripOptions(config.stripHtml);
|
|
271
|
+
if (options) {
|
|
272
|
+
result = stripHtml(result, options);
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
return result;
|
|
276
|
+
}
|
|
277
|
+
/**
|
|
278
|
+
* Preprocesses input with detailed result information
|
|
279
|
+
*/
|
|
280
|
+
export function preprocessWithDetails(input, config) {
|
|
281
|
+
if (!config || (!config.stripHtml)) {
|
|
282
|
+
return {
|
|
283
|
+
text: input,
|
|
284
|
+
wasProcessed: false,
|
|
285
|
+
originalLength: input.length,
|
|
286
|
+
processedLength: input.length,
|
|
287
|
+
};
|
|
288
|
+
}
|
|
289
|
+
const processed = preprocess(input, config);
|
|
290
|
+
return {
|
|
291
|
+
text: processed,
|
|
292
|
+
wasProcessed: processed !== input,
|
|
293
|
+
originalLength: input.length,
|
|
294
|
+
processedLength: processed.length,
|
|
295
|
+
};
|
|
296
|
+
}
|
|
297
|
+
//# sourceMappingURL=preprocessor.js.map
|
package/dist/core/preprocessor.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"preprocessor.js","sourceRoot":"","sources":["../../src/core/preprocessor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,MAAM,wBAAwB,GAAG;IAC7B,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,UAAU;IACV,QAAQ;IACR,KAAK;IACL,QAAQ;IACR,MAAM;IACN,mCAAmC;IACnC,gBAAgB;IAChB,0BAA0B;IAC1B,mBAAmB;IACnB,sBAAsB;IACtB,uBAAuB;IACvB,kBAAkB;IAClB,kBAAkB;IAClB,mBAAmB;IACnB,aAAa;IACb,uBAAuB;IACvB,gBAAgB;CACnB,CAAC;AAEF;;;GAGG;AACH,MAAM,iBAAiB,GAAG;IACtB,QAAQ,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;IAC9C,KAAK,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;IACzB,UAAU,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC;IACjE,MAAM,EAAE,CAAC,GAAG,EAAE,YAAY,CAAC;IAC3B,MAAM,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC;CAClD,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACnC,MAA8C;IAE9C,IAAI,CAAC,MAAM,EAAE,CAAC;QACV,OAAO,IAAI,CAAC;IAChB,CAAC;IAED,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;QAClB,uCAAuC;QACvC,OAAO;YACH,WAAW,EAAE,IAAI;YACjB,iBAAiB,EAAE,KAAK;YACxB,eAAe,EAAE,EAAE;YACnB,SAAS,EAAE,SAAS;SACvB,CAAC;IACN,CAAC;IAED,OAAO;QACH,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,IAAI;QACvC,iBAAiB,EAAE,MAAM,CAAC,iBAAiB,IAAI,KAAK;QACpD,eAAe,EAAE,MAAM,CAAC,eAAe,IAAI,EAAE;QAC7C,SAAS,EAAE,MAAM,CAAC,SAAS;KAC9B,CAAC;AACN,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,IAAiB,EAAE,SAAmB;IAC1D,MAAM,YAAY,GAAG,CAAC,GAAG,wBAAwB,EAAE,GAAG,SAAS,CAAC,CAAC;IAEjE,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE,CAAC;QAClC,IAAI,CAAC;YACD,MAAM,QAAQ,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YACjD,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;gBACxB,EAAE,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACL,kCAAkC;YAClC,+EAA+E;QACnF,CAAC;IACL,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,uBAAuB,CAAC,IAAiB;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,SAAS,WAAW,CAAC,IAAwB,EAAE,QAAgB,CAAC;QAC5D,IAAI,CAAC,IAAI;YAAE,OAAO;QAElB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAElD,kBAAkB;QAClB,IAAI,iBAAiB,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC
,EAAE,CAAC;YAC/C,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACvC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC;YACvC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,oBAAoB;QACpB,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACnB,MAAM,MAAM,GAAG,IAAI,CAAC,UAAgC,CAAC;YACrD,MAAM,SAAS,GAAG,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC;YACjD,MAAM,MAAM,GAAG,SAAS,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;YACjD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;YAC9B,CAAC;YACD,OAAO;QACX,CAAC;QAED,yBAAyB;QACzB,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACvC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;oBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;gBAClC,CAAC;YACL,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,OAAO;QACX,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,KAAK,YAAY,EAAE,CAAC;YAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC;gBAC/C,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,KAAK,KAAK,IAAI,OAAO,KAAK,MAAM,EAAE,CAAC;YAC1C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAClB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,6CAA6C;QAC7C,IAAI,iBAAiB,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YAC7C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KA
AK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,oDAAoD;QACpD,IAAI,iBAAiB,CAAC,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAC7D,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;oBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;gBAC9B,CAAC;qBAAM,IAAI,KAAK,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBAC9B,YAAY;oBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;oBAC/B,IAAI,IAAI,EAAE,CAAC;wBACP,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACrB,CAAC;gBACL,CAAC;YACL,CAAC;YACD,OAAO;QACX,CAAC;QAED,sDAAsD;QACtD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;gBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAC9B,CAAC;iBAAM,IAAI,KAAK,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;gBAC9B,YAAY;gBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;gBAC/B,IAAI,IAAI,EAAE,CAAC;oBACP,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACrB,CAAC;YACL,CAAC;QACL,CAAC;IACL,CAAC;IAED,WAAW,CAAC,IAAI,CAAC,CAAC;IAElB,gCAAgC;IAChC,OAAO,KAAK;SACP,IAAI,CAAC,IAAI,CAAC;SACV,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAiB;IACvC,eAAe;IACf,IAAI,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC;IAErB,wDAAwD;IACxD,IAAI,GAAG,IAAI;QACP,4CAA4C;SAC3C,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;QACxB,kEAAkE;SACjE,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC;QAC5B,oDAAoD;SACnD,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SACxB,IAAI,CAAC,IAAI,CAAC;QACX,4CAA4C;SAC3C,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;IAEZ,OAAO,IAAI,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,KAAa,EAAE,OAAyB;IAC9D,qDAAqD;IACrD,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;IACzE,CAAC;IAED,aAAa;IACb,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,EAAE;QACtB,gBAAgB,EAAE,IAAI;QACtB,OAAO,EAAE,KAAK,EAAE,kBAAkB;QAClC,iBAAiB,EAAE;Y
ACf,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,IAAI;YACd,KAAK,EAAE,IAAI;YACX,GAAG,EAAE,IAAI;SACZ;KACJ,CAAC,CAAC;IAEH,2BAA2B;IAC3B,cAAc,CAAC,IAAI,EAAE,OAAO,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC;IAEpD,gCAAgC;IAChC,IAAI,MAAc,CAAC;IAEnB,IAAI,OAAO,CAAC,iBAAiB,EAAE,CAAC;QAC5B,MAAM,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC3C,CAAC;SAAM,CAAC;QACJ,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;IACpC,CAAC;IAED,gCAAgC;IAChC,IAAI,OAAO,CAAC,SAAS,IAAI,MAAM,CAAC,MAAM,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;QACzD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;QAC5C,kCAAkC;QAClC,MAAM,SAAS,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,SAAS,GAAG,OAAO,CAAC,SAAS,GAAG,GAAG,EAAE,CAAC;YACtC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,KAAK,CAAC;QAChD,CAAC;aAAM,CAAC;YACJ,MAAM,IAAI,KAAK,CAAC;QACpB,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,KAAa,EAAE,MAA2B;IACjE,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,wBAAwB;IACxB,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACnB,MAAM,OAAO,GAAG,uBAAuB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC1D,IAAI,OAAO,EAAE,CAAC;YACV,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACxC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAgBD;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACjC,KAAa,EACb,MAAuC;IAEvC,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC;QACjC,OAAO;YACH,IAAI,EAAE,KAAK;YACX,YAAY,EAAE,KAAK;YACnB,cAAc,EAAE,KAAK,CAAC,MAAM;YAC5B,eAAe,EAAE,KAAK,CAAC,MAAM;SAChC,CAAC;IACN,CAAC;IAED,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAE5C,OAAO;QACH,IAAI,EAAE,SAAS;QACf,YAAY,EAAE,SAAS,KAAK,KAAK;QACjC,cAAc,EAAE,KAAK,CAAC,MAAM;QAC5B,eAAe,EAAE,SAAS,CAAC,MAAM;KACpC,CAAC;AACN,CAAC"}
|
package/dist/core/types.d.ts
CHANGED
|
@@ -3,6 +3,26 @@
|
|
|
3
3
|
*/
|
|
4
4
|
import type { Schema } from '../schemas/types.js';
|
|
5
5
|
import type { LLMConfig } from '../llm/types.js';
|
|
6
|
+
/**
|
|
7
|
+
* HTML stripping options for preprocessing
|
|
8
|
+
*/
|
|
9
|
+
export interface HtmlStripOptions {
|
|
10
|
+
/** Keep text content only (default: true) */
|
|
11
|
+
extractText?: boolean;
|
|
12
|
+
/** Preserve semantic structure like headings, lists (converts to markdown-like format) */
|
|
13
|
+
preserveStructure?: boolean;
|
|
14
|
+
/** Remove specific CSS selectors (e.g., 'nav', 'footer', '.ad', '#sidebar') */
|
|
15
|
+
removeSelectors?: string[];
|
|
16
|
+
/** Max content length after stripping (truncates if exceeded) */
|
|
17
|
+
maxLength?: number;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Preprocessing configuration for input text
|
|
21
|
+
*/
|
|
22
|
+
export interface PreprocessingConfig {
|
|
23
|
+
/** Strip HTML tags from input. When true, uses default options. */
|
|
24
|
+
stripHtml?: boolean | HtmlStripOptions;
|
|
25
|
+
}
|
|
6
26
|
/**
|
|
7
27
|
* Pipeline configuration
|
|
8
28
|
*/
|
|
@@ -19,6 +39,8 @@ export interface ExtractionRequest {
|
|
|
19
39
|
input: string;
|
|
20
40
|
schema: Schema;
|
|
21
41
|
llmConfig: LLMConfig;
|
|
42
|
+
/** Optional preprocessing configuration */
|
|
43
|
+
preprocessing?: PreprocessingConfig;
|
|
22
44
|
debug?: boolean;
|
|
23
45
|
}
|
|
24
46
|
/**
|
package/dist/core/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/core/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAEjD;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;QACV,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,QAAQ,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,UAAU,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACL"}
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/core/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAEjD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,6CAA6C;IAC7C,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,0FAA0F;IAC1F,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAC5B,+EAA+E;IAC/E,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,iEAAiE;IACjE,SAAS,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAChC,mEAAmE;IACnE,SAAS,CAAC,EAAE,OAAO,GAAG,gBAAgB,CAAC;CAC1C;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,2CAA2C;IAC3C,aAAa,CAAC,EAAE,mBAAmB,CAAC;IACpC,KAAK,CAAC,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;QACV,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,QAAQ,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,UAAU,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACL"}
|
package/dist/index.d.ts
CHANGED
|
@@ -5,7 +5,9 @@
|
|
|
5
5
|
export { ExtractionPipeline, extract } from './core/pipeline.js';
|
|
6
6
|
export { validateExtractedData } from './core/validator.js';
|
|
7
7
|
export { PipelineError, PipelineErrorCodes } from './core/errors.js';
|
|
8
|
-
export
|
|
8
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './core/preprocessor.js';
|
|
9
|
+
export type { PreprocessResult } from './core/preprocessor.js';
|
|
10
|
+
export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, HtmlStripOptions, PreprocessingConfig, } from './core/types.js';
|
|
9
11
|
export { loadSchema, parseSchema, loadSchemaFromObject } from './schemas/loader.js';
|
|
10
12
|
export { validateSchema } from './schemas/validator.js';
|
|
11
13
|
export { SchemaValidationError, ErrorCodes as SchemaErrorCodes } from './schemas/errors.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,wBAAwB,CAAC;AAChC,YAAY,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC/D,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,EACV,gBAAgB,EAChB,mBAAmB,GACtB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC5F,YAAY,EACR,MAAM,EACN,eAAe,EACf,SAAS,EACT,eAAe,EACf,gBAAgB,EACnB,MAAM,oBAAoB,CAAC;AAG5B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACtE,YAAY,EACR,SAAS,EACT,WAAW,EACX,WAAW,EACX,UAAU,EACV,WAAW,EACX,iBAAiB,EACjB,kBAAkB,EAClB,WAAW,GACd,MAAM,gBAAgB,CAAC;AACxB,YAAY,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
export { ExtractionPipeline, extract } from './core/pipeline.js';
|
|
7
7
|
export { validateExtractedData } from './core/validator.js';
|
|
8
8
|
export { PipelineError, PipelineErrorCodes } from './core/errors.js';
|
|
9
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './core/preprocessor.js';
|
|
9
10
|
// Schema exports
|
|
10
11
|
export { loadSchema, parseSchema, loadSchemaFromObject } from './schemas/loader.js';
|
|
11
12
|
export { validateSchema } from './schemas/validator.js';
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,wBAAwB;AACxB,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,wBAAwB;AACxB,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,wBAAwB,CAAC;AAWhC,iBAAiB;AACjB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAS5F,cAAc;AACd,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ordis-dev/ordis",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Schema-first LLM extraction tool that turns unstructured text into validated structured data",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -64,5 +64,8 @@
|
|
|
64
64
|
"tsx": "^4.21.0",
|
|
65
65
|
"typescript": "^5.9.3",
|
|
66
66
|
"vitest": "^4.0.15"
|
|
67
|
+
},
|
|
68
|
+
"dependencies": {
|
|
69
|
+
"node-html-parser": "^7.0.2"
|
|
67
70
|
}
|
|
68
71
|
}
|