@ordis-dev/ordis 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,5 +4,7 @@
4
4
  export { ExtractionPipeline, extract } from './pipeline.js';
5
5
  export { validateExtractedData } from './validator.js';
6
6
  export { PipelineError, PipelineErrorCodes } from './errors.js';
7
- export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, } from './types.js';
7
+ export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './preprocessor.js';
8
+ export type { PreprocessResult } from './preprocessor.js';
9
+ export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, HtmlStripOptions, PreprocessingConfig, } from './types.js';
8
10
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,GACb,MAAM,YAAY,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,mBAAmB,CAAC;AAC3B,YAAY,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAC1D,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,EACV,gBAAgB,EAChB,mBAAmB,GACtB,MAAM,YAAY,CAAC"}
@@ -4,4 +4,5 @@
4
4
  export { ExtractionPipeline, extract } from './pipeline.js';
5
5
  export { validateExtractedData } from './validator.js';
6
6
  export { PipelineError, PipelineErrorCodes } from './errors.js';
7
+ export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './preprocessor.js';
7
8
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,mBAAmB,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EAAE,iBAAiB,EAAE,cAAc,EAAc,MAAM,YAAY,CAAC;AAEhF;;GAEG;AACH,qBAAa,kBAAkB;IAC3B,OAAO,CAAC,KAAK,CAAU;gBAEX,KAAK,GAAE,OAAe;IAIlC;;OAEG;IACG,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC;IAsKlE;;OAEG;IACH,OAAO,CAAC,UAAU;IAsBlB;;OAEG;YACW,eAAe;CAqBhC;AAED;;GAEG;AACH,wBAAsB,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC,CAGjF"}
1
+ {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAMH,OAAO,KAAK,EAAE,iBAAiB,EAAE,cAAc,EAAc,MAAM,YAAY,CAAC;AAEhF;;GAEG;AACH,qBAAa,kBAAkB;IAC3B,OAAO,CAAC,KAAK,CAAU;gBAEX,KAAK,GAAE,OAAe;IAIlC;;OAEG;IACG,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC;IAoLlE;;OAEG;IACH,OAAO,CAAC,UAAU;IAsBlB;;OAEG;YACW,eAAe;CAqBhC;AAED;;GAEG;AACH,wBAAsB,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC,CAGjF"}
@@ -4,6 +4,7 @@
4
4
  import { LLMClient } from '../llm/client.js';
5
5
  import { validateExtractedData } from './validator.js';
6
6
  import { PipelineError, PipelineErrorCodes } from './errors.js';
7
+ import { preprocessWithDetails } from './preprocessor.js';
7
8
  /**
8
9
  * Main extraction pipeline
9
10
  */
@@ -19,6 +20,18 @@ export class ExtractionPipeline {
19
20
  const startTime = Date.now();
20
21
  const steps = [];
21
22
  try {
23
+ // Step 0: Preprocess input (if configured)
24
+ let processedInput = request.input;
25
+ if (request.preprocessing) {
26
+ const preprocessStep = this.recordStep('preprocess', () => {
27
+ return preprocessWithDetails(request.input, request.preprocessing);
28
+ });
29
+ steps.push(preprocessStep);
30
+ if (preprocessStep.success && preprocessStep.data) {
31
+ const result = preprocessStep.data;
32
+ processedInput = result.text;
33
+ }
34
+ }
22
35
  // Step 1: Create LLM client
23
36
  const clientStep = this.recordStep('create_client', () => {
24
37
  return new LLMClient(request.llmConfig);
@@ -32,7 +45,7 @@ export class ExtractionPipeline {
32
45
  const extractStep = await this.recordStepAsync('llm_extract', async () => {
33
46
  return await client.extract({
34
47
  schema: request.schema,
35
- input: request.input,
48
+ input: processedInput,
36
49
  });
37
50
  });
38
51
  steps.push(extractStep);
@@ -1 +1 @@
1
- {"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAGhE;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACnB,KAAK,CAAU;IAEvB,YAAY,QAAiB,KAAK;QAC9B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CAAC,OAA0B;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAiB,EAAE,CAAC;QAE/B,IAAI,CAAC;YACD,4BAA4B;YAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACrD,OAAO,IAAI,SAAS,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC5C,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAEvB,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;gBACtB,MAAM,IAAI,aAAa,CACnB,6BAA6B,EAC7B,kBAAkB,CAAC,SAAS,EAC5B,eAAe,CAClB,CAAC;YACN,CAAC;YAED,MAAM,MAAM,GAAG,UAAU,CAAC,IAAiB,CAAC;YAE5C,kCAAkC;YAClC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,aAAa,EAAE,KAAK,IAAI,EAAE;gBACrE,OAAO,MAAM,MAAM,CAAC,OAAO,CAAC;oBACxB,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;iBACvB,CAAC,CAAC;YACP,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAExB,IAAI,CAAC,WAAW,CAAC,OAAO,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;gBAC5C,MAAM,IAAI,aAAa,CACnB,uBAAuB,EACvB,kBAAkB,CAAC,SAAS,EAC5B,aAAa,EACb,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAC/B,CAAC;YACN,CAAC;YAED,MAAM,UAAU,GAAG,WAAW,CAAC,IAI9B,CAAC;YAEF,kCAAkC;YAClC,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACvD,OAAO,qBAAqB,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;YAClE,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAEzB,MAAM,UAAU,GAAG,YAAY,CAAC,IAA2F,CAAC;YAE5H,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;gBACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI
;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,qCAAqC;YACrC,MAAM,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,kBAAkB,EAAE,GAAG,EAAE;gBAC5D,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;oBAC7B,OAAO,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC;gBACpC,CAAC;gBAED,MAAM,EAAE,SAAS,EAAE,mBAAmB,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC;gBACrE,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,IAAI,SAAS,CAAC;gBAE1D,OAAO;oBACH,cAAc;oBACd,UAAU,EAAE,CAAC,cAAc,IAAI,mBAAmB;iBACrD,CAAC;YACN,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAE3B,MAAM,eAAe,GAAG,cAAc,CAAC,IAAyD,CAAC;YAEjG,IAAI,eAAe,CAAC,UAAU,EAAE,CAAC;gBAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,IAAI,EAAE,UAAU,CAAC,IAAI;oBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;oBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;oBAC/C,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,cAAc,UAAU,CAAC,UAAU,qBAAqB,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,SAAS,GAAG;4BACxG,IAAI,EAAE,kBAAkB,CAAC,gBAAgB;yBAC5C;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,WAAW;YACX,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;gBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;gBAC/C,cAAc,EAAE,eAAe,CAAC,cAAc;gBAC9C,MAAM,EAAE,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;oBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YAExC,IAAI,KAAK,YAAY,aAAa,EAAE,CAAC;gBACjC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,KAAK,CAAC,OAAO;4BACtB,IAAI,EAAE,KAAK,CAAC,IAAI;yBACnB;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,
CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,OAAO;gBACH,OAAO,EAAE,KAAK;gBACd,cAAc,EAAE,KAAK;gBACrB,MAAM,EAAE;oBACJ;wBACI,OAAO,EAAG,KAAe,CAAC,OAAO;wBACjC,IAAI,EAAE,eAAe;qBACxB;iBACJ;gBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;oBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,UAAU,CAAI,IAAY,EAAE,EAAW;QAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,EAAE,EAAE,CAAC;YACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,eAAe,CAAI,IAAY,EAAE,EAAoB;QAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,MAAM,EAAE,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;CACJ;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,OAA0B;IACpD,MAAM,QAAQ,GAAG,IAAI,kBAAkB,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACvD,OAAO,MAAM,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AAC3C,CAAC"}
1
+ {"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAG1D;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACnB,KAAK,CAAU;IAEvB,YAAY,QAAiB,KAAK;QAC9B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CAAC,OAA0B;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAiB,EAAE,CAAC;QAE/B,IAAI,CAAC;YACD,2CAA2C;YAC3C,IAAI,cAAc,GAAG,OAAO,CAAC,KAAK,CAAC;YACnC,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;gBACxB,MAAM,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,YAAY,EAAE,GAAG,EAAE;oBACtD,OAAO,qBAAqB,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;gBACvE,CAAC,CAAC,CAAC;gBACH,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAE3B,IAAI,cAAc,CAAC,OAAO,IAAI,cAAc,CAAC,IAAI,EAAE,CAAC;oBAChD,MAAM,MAAM,GAAG,cAAc,CAAC,IAA+C,CAAC;oBAC9E,cAAc,GAAG,MAAM,CAAC,IAAI,CAAC;gBACjC,CAAC;YACL,CAAC;YAED,4BAA4B;YAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACrD,OAAO,IAAI,SAAS,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC5C,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAEvB,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;gBACtB,MAAM,IAAI,aAAa,CACnB,6BAA6B,EAC7B,kBAAkB,CAAC,SAAS,EAC5B,eAAe,CAClB,CAAC;YACN,CAAC;YAED,MAAM,MAAM,GAAG,UAAU,CAAC,IAAiB,CAAC;YAE5C,kCAAkC;YAClC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,aAAa,EAAE,KAAK,IAAI,EAAE;gBACrE,OAAO,MAAM,MAAM,CAAC,OAAO,CAAC;oBACxB,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,KAAK,EAAE,cAAc;iBACxB,CAAC,CAAC;YACP,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAExB,IAAI,CAAC,WAAW,CAAC,OAAO,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;gBAC5C,MAAM,IAAI,aAAa,CACnB,uBAAuB,EACvB,kBAAkB,CAAC,SAAS,EAC5B,aAAa,EACb,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAC/B,CAAC;YACN,CAAC;YAED,MAAM,UAAU,GAAG,WAAW,CAAC,IAI9B,CAAC;YAEF,kCAAkC;YAClC,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACvD,OAAO,qBAAqB,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;YA
ClE,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAEzB,MAAM,UAAU,GAAG,YAAY,CAAC,IAA2F,CAAC;YAE5H,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;gBACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,qCAAqC;YACrC,MAAM,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,kBAAkB,EAAE,GAAG,EAAE;gBAC5D,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;oBAC7B,OAAO,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC;gBACpC,CAAC;gBAED,MAAM,EAAE,SAAS,EAAE,mBAAmB,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC;gBACrE,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,IAAI,SAAS,CAAC;gBAE1D,OAAO;oBACH,cAAc;oBACd,UAAU,EAAE,CAAC,cAAc,IAAI,mBAAmB;iBACrD,CAAC;YACN,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAE3B,MAAM,eAAe,GAAG,cAAc,CAAC,IAAyD,CAAC;YAEjG,IAAI,eAAe,CAAC,UAAU,EAAE,CAAC;gBAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,IAAI,EAAE,UAAU,CAAC,IAAI;oBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;oBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;oBAC/C,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,cAAc,UAAU,CAAC,UAAU,qBAAqB,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,SAAS,GAAG;4BACxG,IAAI,EAAE,kBAAkB,CAAC,gBAAgB;yBAC5C;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,WAAW;YACX,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;gBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;gBAC/C,cAAc,EAAE,eAAe,CAAC,cAAc;gBAC9C,MAAM,EAAE,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;o
BAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YAExC,IAAI,KAAK,YAAY,aAAa,EAAE,CAAC;gBACjC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,KAAK,CAAC,OAAO;4BACtB,IAAI,EAAE,KAAK,CAAC,IAAI;yBACnB;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,OAAO;gBACH,OAAO,EAAE,KAAK;gBACd,cAAc,EAAE,KAAK;gBACrB,MAAM,EAAE;oBACJ;wBACI,OAAO,EAAG,KAAe,CAAC,OAAO;wBACjC,IAAI,EAAE,eAAe;qBACxB;iBACJ;gBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;oBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,UAAU,CAAI,IAAY,EAAE,EAAW;QAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,EAAE,EAAE,CAAC;YACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,eAAe,CAAI,IAAY,EAAE,EAAoB;QAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,MAAM,EAAE,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;CACJ;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,U
AAU,OAAO,CAAC,OAA0B;IACpD,MAAM,QAAQ,GAAG,IAAI,kBAAkB,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACvD,OAAO,MAAM,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AAC3C,CAAC"}
@@ -0,0 +1,35 @@
1
+ /**
2
+ * HTML preprocessing module
3
+ * Strips HTML tags and noise from input text before extraction
4
+ */
5
+ import type { HtmlStripOptions, PreprocessingConfig } from './types.js';
/**
 * Resolves the `stripHtml` preprocessing setting to concrete HtmlStripOptions.
 *
 * @param config - `true` selects the defaults, an options object overrides
 *   individual defaults, and a falsy value disables HTML stripping.
 * @returns The options with defaults filled in (`extractText: true`,
 *   `preserveStructure: false`, `removeSelectors: []`, `maxLength` passed
 *   through), or `null` when `config` is falsy.
 */
export declare function resolveHtmlStripOptions(config: boolean | HtmlStripOptions | undefined): HtmlStripOptions | null;
/**
 * Strips HTML from input text according to options.
 *
 * Input that does not contain both `<` and `>` is not parsed. Otherwise the
 * markup is parsed, the built-in noise selectors plus `options.removeSelectors`
 * are removed, and the remainder is returned as plain text or, when
 * `options.preserveStructure` is set, as markdown-like text.
 * When `options.maxLength` is set, the result is truncated.
 *
 * @param input - Raw text that may contain HTML markup.
 * @param options - Resolved stripping options.
 * @returns The stripped text.
 */
export declare function stripHtml(input: string, options: HtmlStripOptions): string;
/**
 * Preprocesses input text according to configuration.
 *
 * HTML stripping is the only step implemented: it runs when
 * `config.stripHtml` is truthy, otherwise `input` is returned unchanged.
 *
 * @param input - Text to preprocess.
 * @param config - Preprocessing configuration.
 * @returns The preprocessed text.
 */
export declare function preprocess(input: string, config: PreprocessingConfig): string;
/**
 * Result of preprocessing, as returned by preprocessWithDetails().
 */
export interface PreprocessResult {
    /** The preprocessed text (the original input when nothing was configured) */
    text: string;
    /** Whether preprocessing changed the text; false when the output equals the input */
    wasProcessed: boolean;
    /** `length` of the original input string */
    originalLength: number;
    /** `length` of the processed text string */
    processedLength: number;
}
/**
 * Preprocesses input and reports what happened alongside the resulting text.
 *
 * When `config` is missing or `config.stripHtml` is falsy the input is
 * returned untouched with `wasProcessed: false`.
 *
 * @param input - Text to preprocess.
 * @param config - Preprocessing configuration, or `undefined` for none.
 * @returns The processed text together with before/after lengths.
 */
export declare function preprocessWithDetails(input: string, config: PreprocessingConfig | undefined): PreprocessResult;
35
+ //# sourceMappingURL=preprocessor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"preprocessor.d.ts","sourceRoot":"","sources":["../../src/core/preprocessor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC;AA4CxE;;GAEG;AACH,wBAAgB,uBAAuB,CACnC,MAAM,EAAE,OAAO,GAAG,gBAAgB,GAAG,SAAS,GAC/C,gBAAgB,GAAG,IAAI,CAqBzB;AAuKD;;GAEG;AACH,wBAAgB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,MAAM,CA2C1E;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,mBAAmB,GAAG,MAAM,CAY7E;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,4BAA4B;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,wCAAwC;IACxC,YAAY,EAAE,OAAO,CAAC;IACtB,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,4BAA4B;IAC5B,eAAe,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACjC,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,mBAAmB,GAAG,SAAS,GACxC,gBAAgB,CAkBlB"}
@@ -0,0 +1,297 @@
1
+ /**
2
+ * HTML preprocessing module
3
+ * Strips HTML tags and noise from input text before extraction
4
+ */
5
+ import { parse, HTMLElement } from 'node-html-parser';
/**
 * Default selectors to remove from HTML.
 * These typically contain non-content elements.
 *
 * removeElements() always applies this list and then appends any
 * caller-supplied selectors, so callers can extend it but not shrink it.
 */
const DEFAULT_REMOVE_SELECTORS = [
    'script',
    'style',
    'nav',
    'footer',
    'header',
    'aside',
    'noscript',
    'iframe',
    'svg',
    'canvas',
    'form',
    // Common ad and tracking selectors
    // NOTE(review): `*=` is a substring match, so '[class*="ad-"]' also matches
    // class names such as "head-line", "read-more" or "upload-box" — confirm
    // that removing those elements is intended.
    '[class*="ad-"]',
    '[class*="advertisement"]',
    '[class*="cookie"]',
    '[class*="subscribe"]',
    '[class*="newsletter"]',
    '[class*="popup"]',
    '[class*="modal"]',
    '[class*="banner"]',
    '[id*="ad-"]',
    '[id*="advertisement"]',
    '[id*="cookie"]',
];
/**
 * Elements that should preserve their semantic meaning
 * when preserveStructure is enabled.
 *
 * convertToStructuredText() uses `headings` and `blocks` to decide how an
 * element is rendered. Elements in `containers` are walked child by child,
 * which is also what happens to any tag that is not listed here.
 * `lists` and `inline` are not read by any code in this file; list tags are
 * matched by literal tag name instead.
 */
const SEMANTIC_ELEMENTS = {
    headings: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
    lists: ['ul', 'ol', 'li'],
    containers: ['article', 'main', 'section', 'div', 'body', 'html'],
    blocks: ['p', 'blockquote'],
    inline: ['strong', 'b', 'em', 'i', 'a', 'code'],
};
/**
 * Normalises the `stripHtml` setting into a fully populated options object.
 *
 * @param {boolean | object | undefined} config - `true` for the defaults, an
 *   options object to override individual defaults, or a falsy value to
 *   disable stripping.
 * @returns {object | null} `null` when `config` is falsy; otherwise an object
 *   carrying `extractText` (default `true`), `preserveStructure` (default
 *   `false`), `removeSelectors` (default `[]`) and `maxLength` (passed through,
 *   possibly `undefined`).
 */
export function resolveHtmlStripOptions(config) {
    if (!config) {
        return null;
    }
    // `stripHtml: true` means "every default", i.e. an empty set of overrides.
    const overrides = config === true ? {} : config;
    return {
        extractText: overrides.extractText ?? true,
        preserveStructure: overrides.preserveStructure ?? false,
        removeSelectors: overrides.removeSelectors ?? [],
        maxLength: overrides.maxLength,
    };
}
/**
 * Detaches every element that matches a built-in or caller-supplied selector.
 *
 * @param root - Parsed document root; it is mutated in place.
 * @param {string[]} selectors - Extra selectors applied after the defaults.
 */
function removeElements(root, selectors) {
    const dropMatches = (selector) => {
        try {
            for (const match of root.querySelectorAll(selector)) {
                match.remove();
            }
        }
        catch {
            // Invalid selector, skip silently
            // This can happen with complex CSS selectors not supported by node-html-parser
        }
    };
    [...DEFAULT_REMOVE_SELECTORS, ...selectors].forEach((selector) => dropMatches(selector));
}
/**
 * Renders a parsed HTML tree as markdown-like text.
 *
 * Headings become `#`-prefixed lines, list items become `- ` / `1. ` lines,
 * blockquotes are prefixed with `> `, `pre`/`code` content is fenced with
 * triple backticks and paragraphs are separated by blank lines. Every other
 * element is traversed child by child, emitting trimmed text nodes as it goes.
 *
 * @param root - Parsed document root.
 * @returns {string} The structured text, with runs of three or more newlines
 *   collapsed to two and surrounding whitespace trimmed.
 */
function convertToStructuredText(root) {
    const output = [];
    const TEXT_NODE = 3;
    // Produces the lines for elements rendered from their own text content,
    // or null when the element has to be traversed instead.
    const renderLeaf = (tag, node) => {
        if (SEMANTIC_ELEMENTS.headings.includes(tag)) {
            const level = parseInt(tag[1], 10);
            const marker = '#'.repeat(level) + ' ';
            const title = node.text.trim();
            return title ? ['', marker + title, ''] : [];
        }
        if (tag === 'li') {
            const listTag = node.parentNode?.tagName?.toLowerCase();
            const bullet = listTag === 'ol' ? '1. ' : '- ';
            const item = node.text.trim();
            return item ? [bullet + item] : [];
        }
        if (tag === 'blockquote') {
            const quote = node.text.trim();
            return quote ? ['', '> ' + quote.replace(/\n/g, '\n> '), ''] : [];
        }
        if (tag === 'pre' || tag === 'code') {
            const code = node.text.trim();
            return code ? ['', '```', code, '```', ''] : [];
        }
        if (SEMANTIC_ELEMENTS.blocks.includes(tag)) {
            const paragraph = node.text.trim();
            return paragraph ? ['', paragraph, ''] : [];
        }
        return null;
    };
    // Recurses into element children; bare text nodes are emitted only when
    // the caller asks for them (list containers ignore stray text).
    const visitChildren = (parent, withTextNodes) => {
        for (const child of parent.childNodes) {
            if (child instanceof HTMLElement) {
                visit(child);
            }
            else if (withTextNodes && child.nodeType === TEXT_NODE) {
                const loose = child.text.trim();
                if (loose) {
                    output.push(loose);
                }
            }
        }
    };
    function visit(node) {
        if (!node) {
            return;
        }
        const tag = node.tagName?.toLowerCase() || '';
        const leafLines = renderLeaf(tag, node);
        if (leafLines !== null) {
            output.push(...leafLines);
            return;
        }
        // Lists are wrapped in blank lines; all remaining elements are plain traversal.
        const isList = tag === 'ul' || tag === 'ol';
        if (isList) {
            output.push('');
        }
        visitChildren(node, !isList);
        if (isList) {
            output.push('');
        }
    }
    visit(root);
    // Clean up multiple blank lines
    const joined = output.join('\n');
    return joined.replace(/\n{3,}/g, '\n\n').trim();
}
/**
 * Flattens a parsed HTML tree to plain text with tidy whitespace.
 *
 * @param root - Parsed document root; only its `text` property is read.
 * @returns {string} Text with space/tab runs collapsed, every line trimmed and
 *   at most one blank line between paragraphs.
 */
function extractPlainText(root) {
    // Runs of spaces and tabs become a single space.
    const spacesCollapsed = root.text.replace(/[ \t]+/g, ' ');
    // Newlines separated only by whitespace become one paragraph break.
    const breaksNormalised = spacesCollapsed.replace(/\n\s*\n/g, '\n\n');
    // Strip leading/trailing whitespace line by line.
    const trimmedLines = [];
    for (const line of breaksNormalised.split('\n')) {
        trimmedLines.push(line.trim());
    }
    // Trimming can leave extra blank lines behind; cap them at one.
    return trimmedLines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
}
/**
 * Shortens `text` so the returned string never exceeds `maxLength`
 * characters, ellipsis included.
 *
 * The cut prefers the last space when that space lies in the final 20% of
 * the available room, so words are not split needlessly. A missing, zero,
 * negative or NaN `maxLength` means "no limit".
 */
function truncateToMaxLength(text, maxLength) {
    const ELLIPSIS = '...';
    if (!(maxLength > 0) || text.length <= maxLength) {
        return text;
    }
    // Too small to fit any content next to the ellipsis: hard cut.
    if (maxLength <= ELLIPSIS.length) {
        return text.slice(0, maxLength);
    }
    // Reserve room for the ellipsis so the total stays within maxLength.
    const budget = maxLength - ELLIPSIS.length;
    const head = text.slice(0, budget);
    const lastSpace = head.lastIndexOf(' ');
    const kept = lastSpace > budget * 0.8 ? head.slice(0, lastSpace) : head;
    return kept + ELLIPSIS;
}
/**
 * Strips HTML from input text according to options.
 *
 * Input that does not contain both `<` and `>` is treated as plain text and
 * is not parsed. Otherwise the markup is parsed, the default noise selectors
 * plus `options.removeSelectors` are removed, and the remainder is rendered
 * as plain text or, when `options.preserveStructure` is set, as
 * markdown-like text.
 *
 * Both paths apply `options.maxLength` the same way, and the returned string
 * is never longer than `maxLength` (the "..." marker counts toward it).
 * `options.extractText` is not consulted here.
 *
 * @param {string} input - Raw text that may contain HTML.
 * @param {object} options - Resolved strip options (`preserveStructure`,
 *   `removeSelectors`, `maxLength`).
 * @returns {string} The stripped and, if needed, truncated text.
 */
export function stripHtml(input, options) {
    // Quick check: if no HTML-like content, skip the parser entirely
    if (!input.includes('<') || !input.includes('>')) {
        return truncateToMaxLength(input, options.maxLength);
    }
    // Parse HTML
    const root = parse(input, {
        lowerCaseTagName: true,
        comment: false, // Remove comments
        blockTextElements: {
            script: true,
            noscript: true,
            style: true,
            pre: true,
        },
    });
    // Remove unwanted elements
    removeElements(root, options.removeSelectors || []);
    // Extract text based on options
    const text = options.preserveStructure
        ? convertToStructuredText(root)
        : extractPlainText(root);
    return truncateToMaxLength(text, options.maxLength);
}
/**
 * Runs the configured preprocessing steps over `input`.
 *
 * HTML stripping is the only step: it runs when `config.stripHtml` is truthy
 * and resolves to usable options; otherwise the input is returned unchanged.
 *
 * @param {string} input - Text to preprocess.
 * @param {object} config - Preprocessing configuration.
 * @returns {string} The preprocessed text.
 */
export function preprocess(input, config) {
    if (!config.stripHtml) {
        return input;
    }
    const stripOptions = resolveHtmlStripOptions(config.stripHtml);
    return stripOptions ? stripHtml(input, stripOptions) : input;
}
/**
 * Preprocesses `input` and reports the outcome alongside the text.
 *
 * @param {string} input - Text to preprocess.
 * @param {object | undefined} config - Preprocessing configuration; when it is
 *   missing or has no truthy `stripHtml`, the input passes through untouched.
 * @returns {{text: string, wasProcessed: boolean, originalLength: number, processedLength: number}}
 *   The resulting text, whether it differs from the input, and both lengths.
 */
export function preprocessWithDetails(input, config) {
    const stripRequested = Boolean(config && config.stripHtml);
    const text = stripRequested ? preprocess(input, config) : input;
    return {
        text,
        // Pass-through yields the very same string, so this is false in that case.
        wasProcessed: text !== input,
        originalLength: input.length,
        processedLength: text.length,
    };
}
297
+ //# sourceMappingURL=preprocessor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"preprocessor.js","sourceRoot":"","sources":["../../src/core/preprocessor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,MAAM,wBAAwB,GAAG;IAC7B,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,UAAU;IACV,QAAQ;IACR,KAAK;IACL,QAAQ;IACR,MAAM;IACN,mCAAmC;IACnC,gBAAgB;IAChB,0BAA0B;IAC1B,mBAAmB;IACnB,sBAAsB;IACtB,uBAAuB;IACvB,kBAAkB;IAClB,kBAAkB;IAClB,mBAAmB;IACnB,aAAa;IACb,uBAAuB;IACvB,gBAAgB;CACnB,CAAC;AAEF;;;GAGG;AACH,MAAM,iBAAiB,GAAG;IACtB,QAAQ,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;IAC9C,KAAK,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;IACzB,UAAU,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC;IACjE,MAAM,EAAE,CAAC,GAAG,EAAE,YAAY,CAAC;IAC3B,MAAM,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC;CAClD,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACnC,MAA8C;IAE9C,IAAI,CAAC,MAAM,EAAE,CAAC;QACV,OAAO,IAAI,CAAC;IAChB,CAAC;IAED,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;QAClB,uCAAuC;QACvC,OAAO;YACH,WAAW,EAAE,IAAI;YACjB,iBAAiB,EAAE,KAAK;YACxB,eAAe,EAAE,EAAE;YACnB,SAAS,EAAE,SAAS;SACvB,CAAC;IACN,CAAC;IAED,OAAO;QACH,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,IAAI;QACvC,iBAAiB,EAAE,MAAM,CAAC,iBAAiB,IAAI,KAAK;QACpD,eAAe,EAAE,MAAM,CAAC,eAAe,IAAI,EAAE;QAC7C,SAAS,EAAE,MAAM,CAAC,SAAS;KAC9B,CAAC;AACN,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,IAAiB,EAAE,SAAmB;IAC1D,MAAM,YAAY,GAAG,CAAC,GAAG,wBAAwB,EAAE,GAAG,SAAS,CAAC,CAAC;IAEjE,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE,CAAC;QAClC,IAAI,CAAC;YACD,MAAM,QAAQ,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YACjD,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;gBACxB,EAAE,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACL,kCAAkC;YAClC,+EAA+E;QACnF,CAAC;IACL,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,uBAAuB,CAAC,IAAiB;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,SAAS,WAAW,CAAC,IAAwB,EAAE,QAAgB,CAAC;QAC5D,IAAI,CAAC,IAAI;YAAE,OAAO;QAElB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAElD,kBAAkB;QAClB,IAAI,iBAAiB,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CA
AC,EAAE,CAAC;YAC/C,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACvC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC;YACvC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,oBAAoB;QACpB,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACnB,MAAM,MAAM,GAAG,IAAI,CAAC,UAAgC,CAAC;YACrD,MAAM,SAAS,GAAG,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC;YACjD,MAAM,MAAM,GAAG,SAAS,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;YACjD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;YAC9B,CAAC;YACD,OAAO;QACX,CAAC;QAED,yBAAyB;QACzB,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACvC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;oBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;gBAClC,CAAC;YACL,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,OAAO;QACX,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,KAAK,YAAY,EAAE,CAAC;YAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC;gBAC/C,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,KAAK,KAAK,IAAI,OAAO,KAAK,MAAM,EAAE,CAAC;YAC1C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAClB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,6CAA6C;QAC7C,IAAI,iBAAiB,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YAC7C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,
KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,oDAAoD;QACpD,IAAI,iBAAiB,CAAC,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAC7D,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;oBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;gBAC9B,CAAC;qBAAM,IAAI,KAAK,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBAC9B,YAAY;oBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;oBAC/B,IAAI,IAAI,EAAE,CAAC;wBACP,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACrB,CAAC;gBACL,CAAC;YACL,CAAC;YACD,OAAO;QACX,CAAC;QAED,sDAAsD;QACtD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;gBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAC9B,CAAC;iBAAM,IAAI,KAAK,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;gBAC9B,YAAY;gBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;gBAC/B,IAAI,IAAI,EAAE,CAAC;oBACP,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACrB,CAAC;YACL,CAAC;QACL,CAAC;IACL,CAAC;IAED,WAAW,CAAC,IAAI,CAAC,CAAC;IAElB,gCAAgC;IAChC,OAAO,KAAK;SACP,IAAI,CAAC,IAAI,CAAC;SACV,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAiB;IACvC,eAAe;IACf,IAAI,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC;IAErB,wDAAwD;IACxD,IAAI,GAAG,IAAI;QACP,4CAA4C;SAC3C,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;QACxB,kEAAkE;SACjE,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC;QAC5B,oDAAoD;SACnD,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SACxB,IAAI,CAAC,IAAI,CAAC;QACX,4CAA4C;SAC3C,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;IAEZ,OAAO,IAAI,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,KAAa,EAAE,OAAyB;IAC9D,qDAAqD;IACrD,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;IACzE,CAAC;IAED,aAAa;IACb,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,EAAE;QACtB,gBAAgB,EAAE,IAAI;QACtB,OAAO,EAAE,KAAK,EAAE,kBAAkB;QAClC,iBAAiB,EAAE
;YACf,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,IAAI;YACd,KAAK,EAAE,IAAI;YACX,GAAG,EAAE,IAAI;SACZ;KACJ,CAAC,CAAC;IAEH,2BAA2B;IAC3B,cAAc,CAAC,IAAI,EAAE,OAAO,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC;IAEpD,gCAAgC;IAChC,IAAI,MAAc,CAAC;IAEnB,IAAI,OAAO,CAAC,iBAAiB,EAAE,CAAC;QAC5B,MAAM,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC3C,CAAC;SAAM,CAAC;QACJ,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;IACpC,CAAC;IAED,gCAAgC;IAChC,IAAI,OAAO,CAAC,SAAS,IAAI,MAAM,CAAC,MAAM,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;QACzD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;QAC5C,kCAAkC;QAClC,MAAM,SAAS,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,SAAS,GAAG,OAAO,CAAC,SAAS,GAAG,GAAG,EAAE,CAAC;YACtC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,KAAK,CAAC;QAChD,CAAC;aAAM,CAAC;YACJ,MAAM,IAAI,KAAK,CAAC;QACpB,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,KAAa,EAAE,MAA2B;IACjE,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,wBAAwB;IACxB,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACnB,MAAM,OAAO,GAAG,uBAAuB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC1D,IAAI,OAAO,EAAE,CAAC;YACV,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACxC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAgBD;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACjC,KAAa,EACb,MAAuC;IAEvC,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC;QACjC,OAAO;YACH,IAAI,EAAE,KAAK;YACX,YAAY,EAAE,KAAK;YACnB,cAAc,EAAE,KAAK,CAAC,MAAM;YAC5B,eAAe,EAAE,KAAK,CAAC,MAAM;SAChC,CAAC;IACN,CAAC;IAED,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAE5C,OAAO;QACH,IAAI,EAAE,SAAS;QACf,YAAY,EAAE,SAAS,KAAK,KAAK;QACjC,cAAc,EAAE,KAAK,CAAC,MAAM;QAC5B,eAAe,EAAE,SAAS,CAAC,MAAM;KACpC,CAAC;AACN,CAAC"}
@@ -3,6 +3,26 @@
3
3
  */
4
4
  import type { Schema } from '../schemas/types.js';
5
5
  import type { LLMConfig } from '../llm/types.js';
6
+ /**
7
+ * HTML stripping options for preprocessing
8
+ */
9
+ export interface HtmlStripOptions {
10
+ /** Keep text content only (default: true) */
11
+ extractText?: boolean;
12
+ /** Preserve semantic structure like headings, lists (converts to markdown-like format) */
13
+ preserveStructure?: boolean;
14
+ /** Remove specific CSS selectors (e.g., 'nav', 'footer', '.ad', '#sidebar') */
15
+ removeSelectors?: string[];
16
+ /** Max content length after stripping (truncates if exceeded) */
17
+ maxLength?: number;
18
+ }
19
+ /**
20
+ * Preprocessing configuration for input text
21
+ */
22
+ export interface PreprocessingConfig {
23
+ /** Strip HTML tags from input. When true, uses default options. */
24
+ stripHtml?: boolean | HtmlStripOptions;
25
+ }
6
26
  /**
7
27
  * Pipeline configuration
8
28
  */
@@ -19,6 +39,8 @@ export interface ExtractionRequest {
19
39
  input: string;
20
40
  schema: Schema;
21
41
  llmConfig: LLMConfig;
42
+ /** Optional preprocessing configuration */
43
+ preprocessing?: PreprocessingConfig;
22
44
  debug?: boolean;
23
45
  }
24
46
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/core/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAEjD;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;QACV,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,QAAQ,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,UAAU,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACL"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/core/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAEjD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,6CAA6C;IAC7C,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,0FAA0F;IAC1F,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAC5B,+EAA+E;IAC/E,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,iEAAiE;IACjE,SAAS,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAChC,mEAAmE;IACnE,SAAS,CAAC,EAAE,OAAO,GAAG,gBAAgB,CAAC;CAC1C;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,2CAA2C;IAC3C,aAAa,CAAC,EAAE,mBAAmB,CAAC;IACpC,KAAK,CAAC,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;QACV,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,QAAQ,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,UAAU,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACL"}
package/dist/index.d.ts CHANGED
@@ -5,7 +5,9 @@
5
5
  export { ExtractionPipeline, extract } from './core/pipeline.js';
6
6
  export { validateExtractedData } from './core/validator.js';
7
7
  export { PipelineError, PipelineErrorCodes } from './core/errors.js';
8
- export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, } from './core/types.js';
8
+ export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './core/preprocessor.js';
9
+ export type { PreprocessResult } from './core/preprocessor.js';
10
+ export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, HtmlStripOptions, PreprocessingConfig, } from './core/types.js';
9
11
  export { loadSchema, parseSchema, loadSchemaFromObject } from './schemas/loader.js';
10
12
  export { validateSchema } from './schemas/validator.js';
11
13
  export { SchemaValidationError, ErrorCodes as SchemaErrorCodes } from './schemas/errors.js';
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,GACb,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC5F,YAAY,EACR,MAAM,EACN,eAAe,EACf,SAAS,EACT,eAAe,EACf,gBAAgB,EACnB,MAAM,oBAAoB,CAAC;AAG5B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACtE,YAAY,EACR,SAAS,EACT,WAAW,EACX,WAAW,EACX,UAAU,EACV,WAAW,EACX,iBAAiB,EACjB,kBAAkB,EAClB,WAAW,GACd,MAAM,gBAAgB,CAAC;AACxB,YAAY,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,wBAAwB,CAAC;AAChC,YAAY,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC/D,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,EACV,gBAAgB,EAChB,mBAAmB,GACtB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC5F,YAAY,EACR,MAAM,EACN,eAAe,EACf,SAAS,EACT,eAAe,EACf,gBAAgB,EACnB,MAAM,oBAAoB,CAAC;AAG5B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACtE,YAAY,EACR,SAAS,EACT,WAAW,EACX,WAAW,EACX,UAAU,EACV,WAAW,EACX,iBAAiB,EACjB,kBAAkB,EAClB,WAAW,GACd,MAAM,gBAAgB,CAAC;AACxB,YAAY,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC"}
package/dist/index.js CHANGED
@@ -6,6 +6,7 @@
6
6
  export { ExtractionPipeline, extract } from './core/pipeline.js';
7
7
  export { validateExtractedData } from './core/validator.js';
8
8
  export { PipelineError, PipelineErrorCodes } from './core/errors.js';
9
+ export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './core/preprocessor.js';
9
10
  // Schema exports
10
11
  export { loadSchema, parseSchema, loadSchemaFromObject } from './schemas/loader.js';
11
12
  export { validateSchema } from './schemas/validator.js';
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,wBAAwB;AACxB,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAQrE,iBAAiB;AACjB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAS5F,cAAc;AACd,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,wBAAwB;AACxB,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,wBAAwB,CAAC;AAWhC,iBAAiB;AACjB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAS5F,cAAc;AACd,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ordis-dev/ordis",
3
- "version": "0.1.1",
3
+ "version": "0.2.0",
4
4
  "type": "module",
5
5
  "description": "Schema-first LLM extraction tool that turns unstructured text into validated structured data",
6
6
  "main": "dist/index.js",
@@ -64,5 +64,8 @@
64
64
  "tsx": "^4.21.0",
65
65
  "typescript": "^5.9.3",
66
66
  "vitest": "^4.0.15"
67
+ },
68
+ "dependencies": {
69
+ "node-html-parser": "^7.0.2"
67
70
  }
68
71
  }