@ordis-dev/ordis 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -3
- package/dist/cli.js +8 -2
- package/dist/cli.js.map +1 -1
- package/dist/core/index.d.ts +3 -1
- package/dist/core/index.d.ts.map +1 -1
- package/dist/core/index.js +1 -0
- package/dist/core/index.js.map +1 -1
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +14 -1
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/preprocessor.d.ts +35 -0
- package/dist/core/preprocessor.d.ts.map +1 -0
- package/dist/core/preprocessor.js +297 -0
- package/dist/core/preprocessor.js.map +1 -0
- package/dist/core/types.d.ts +22 -0
- package/dist/core/types.d.ts.map +1 -1
- package/dist/core/validator.js +39 -39
- package/dist/core/validator.js.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/schemas/loader.js +1 -1
- package/dist/schemas/loader.js.map +1 -1
- package/dist/schemas/types.d.ts +3 -1
- package/dist/schemas/types.d.ts.map +1 -1
- package/dist/schemas/validator.js +8 -10
- package/dist/schemas/validator.js.map +1 -1
- package/package.json +14 -3
package/README.md
CHANGED
|
@@ -38,7 +38,7 @@ ordis extract \
|
|
|
38
38
|
"invoice_id": { "type": "string" },
|
|
39
39
|
"amount": { "type": "number" },
|
|
40
40
|
"currency": { "type": "string", "enum": ["USD", "SGD", "EUR"] },
|
|
41
|
-
"date": { "type": "date", "optional": true }
|
|
41
|
+
"date": { "type": "string", "format": "date-time", "optional": true }
|
|
42
42
|
}
|
|
43
43
|
}
|
|
44
44
|
```
|
|
@@ -97,6 +97,17 @@ ordis extract \
|
|
|
97
97
|
--debug
|
|
98
98
|
```
|
|
99
99
|
|
|
100
|
+
**With API key** (for providers like OpenAI, Deepseek, etc.):
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
ordis extract \
|
|
104
|
+
--schema examples/invoice.schema.json \
|
|
105
|
+
--input examples/invoice.txt \
|
|
106
|
+
--base https://api.deepseek.com/v1 \
|
|
107
|
+
--model deepseek-chat \
|
|
108
|
+
--api-key your-api-key-here
|
|
109
|
+
```
|
|
110
|
+
|
|
100
111
|
### Programmatic Usage
|
|
101
112
|
|
|
102
113
|
Use ordis as a library in your Node.js application:
|
|
@@ -179,9 +190,19 @@ npm run benchmark
|
|
|
179
190
|
```
|
|
180
191
|
|
|
181
192
|
## Roadmap
|
|
182
|
-
|
|
193
|
+
|
|
194
|
+
**Completed in v0.1.0:**
|
|
195
|
+
- ✅ Core extraction pipeline with schema validation
|
|
196
|
+
- ✅ Token budget awareness and management
|
|
197
|
+
- ✅ Confidence scoring for extracted data
|
|
198
|
+
- ✅ Programmatic API for library usage
|
|
199
|
+
- ✅ CLI tool with debug mode
|
|
200
|
+
- ✅ Comprehensive test suite and benchmarks
|
|
201
|
+
- ✅ Support for any OpenAI-compatible API
|
|
202
|
+
|
|
203
|
+
**Upcoming:**
|
|
204
|
+
- [ ] Smart input truncation ([#40](https://github.com/ordis-dev/ordis/issues/40))
|
|
183
205
|
- [ ] Multi-pass extraction for large inputs ([#41](https://github.com/ordis-dev/ordis/issues/41))
|
|
184
|
-
- [ ]
|
|
185
206
|
- [ ] Config file support ([#16](https://github.com/ordis-dev/ordis/issues/16))
|
|
186
207
|
- [ ] Output formatting options ([#14](https://github.com/ordis-dev/ordis/issues/14))
|
|
187
208
|
- [ ] Batch extraction ([#19](https://github.com/ordis-dev/ordis/issues/19))
|
package/dist/cli.js
CHANGED
|
@@ -3,8 +3,9 @@
|
|
|
3
3
|
* Ordis - Schema-first extraction tool
|
|
4
4
|
* CLI entrypoint
|
|
5
5
|
*/
|
|
6
|
-
import
|
|
7
|
-
import * as
|
|
6
|
+
import process from 'node:process';
|
|
7
|
+
import * as fs from 'node:fs/promises';
|
|
8
|
+
import * as path from 'node:path';
|
|
8
9
|
import { loadSchema } from './schemas/loader.js';
|
|
9
10
|
import { extract } from './core/pipeline.js';
|
|
10
11
|
import packageJson from '../package.json' with { type: 'json' };
|
|
@@ -36,6 +37,9 @@ function parseArgs(args) {
|
|
|
36
37
|
else if (arg === '--model' && args[i + 1]) {
|
|
37
38
|
parsed.model = args[++i];
|
|
38
39
|
}
|
|
40
|
+
else if (arg === '--api-key' && args[i + 1]) {
|
|
41
|
+
parsed.apiKey = args[++i];
|
|
42
|
+
}
|
|
39
43
|
else if (!arg.startsWith('--')) {
|
|
40
44
|
parsed.command = arg;
|
|
41
45
|
}
|
|
@@ -54,6 +58,7 @@ OPTIONS:
|
|
|
54
58
|
--input <path> Path to input text file
|
|
55
59
|
--base <url> Base URL for OpenAI-compatible API
|
|
56
60
|
--model <name> Model name to use for extraction
|
|
61
|
+
--api-key <key> API key for the LLM provider (optional)
|
|
57
62
|
--debug Enable verbose debug output
|
|
58
63
|
--version, -v Show version number
|
|
59
64
|
--help, -h Show this help message
|
|
@@ -124,6 +129,7 @@ async function runExtraction(args) {
|
|
|
124
129
|
const llmConfig = {
|
|
125
130
|
baseURL: args.base,
|
|
126
131
|
model: args.model,
|
|
132
|
+
...(args.apiKey && { apiKey: args.apiKey }),
|
|
127
133
|
};
|
|
128
134
|
if (args.debug) {
|
|
129
135
|
console.log('[DEBUG] LLM config:', {
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;GAGG;AAEH,OAAO,KAAK,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;GAGG;AAEH,OAAO,OAAO,MAAM,cAAc,CAAC;AACnC,OAAO,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACvC,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,MAAM,qBAAqB,CAAC;AACjD,OAAO,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AAE7C,OAAO,WAAW,MAAM,iBAAiB,CAAC,OAAO,IAAI,EAAE,MAAM,EAAE,CAAC;AAYhE,SAAS,SAAS,CAAC,IAAc;IAC7B,MAAM,MAAM,GAAY,EAAE,CAAC;IAE3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QAEpB,IAAI,GAAG,KAAK,QAAQ,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACnC,QAAQ,EAAE,CAAC;YACX,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;QAED,IAAI,GAAG,KAAK,WAAW,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACtC,WAAW,EAAE,CAAC;YACd,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;QAED,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;YACpB,MAAM,CAAC,KAAK,GAAG,IAAI,CAAC;YACpB,SAAS;QACb,CAAC;QAED,IAAI,GAAG,KAAK,UAAU,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YACpC,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9B,CAAC;aAAM,IAAI,GAAG,KAAK,SAAS,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC1C,MAAM,CAAC,KAAK,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC7B,CAAC;aAAM,IAAI,GAAG,KAAK,QAAQ,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YACzC,MAAM,CAAC,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,CAAC;aAAM,IAAI,GAAG,KAAK,SAAS,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC1C,MAAM,CAAC,KAAK,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC7B,CAAC;aAAM,IAAI,GAAG,KAAK,WAAW,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC5C,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9B,CAAC;aAAM,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YAC/B,MAAM,CAAC,OAAO,GAAG,GAAG,CAAC;QACzB,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAED,SAAS,QAAQ;IACb,OAAO,CAAC,GAAG,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA4Bf,CAAC,CAAC;AACH,CAAC;AAED,SAAS,WAAW;IAChB,OAAO,CAAC,GAAG,CAAC,UAAU,WAAW,CAAC,OAAO,EAAE,CAAC,CAAC;AACjD,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,IAAa;IACtC,8BAA8B;IAC9B,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACf,OAAO,
CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;QAC7C,OAAO,CAAC,KAAK,CAAC,iFAAiF,CAAC,CAAC;QACjG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;QACd,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAC5C,OAAO,CAAC,KAAK,CAAC,iFAAiF,CAAC,CAAC;QACjG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC;QAC3C,OAAO,CAAC,KAAK,CAAC,iFAAiF,CAAC,CAAC;QACjG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;QACd,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAC5C,OAAO,CAAC,KAAK,CAAC,iFAAiF,CAAC,CAAC;QACjG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,IAAI,CAAC;QACD,sBAAsB;QACtB,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,gCAAgC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;QAC/D,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAC7C,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,UAAU,CAAC,CAAC;QAE5C,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,qCAAqC,EAAE;gBAC/C,IAAI,EAAE,MAAM,CAAC,QAAQ,EAAE,IAAI;gBAC3B,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC;gBAClC,mBAAmB,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS;aACpD,CAAC,CAAC;QACP,CAAC;QAED,0BAA0B;QAC1B,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,+BAA+B,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;QAC7D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3C,MAAM,SAAS,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QAExD,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,yBAAyB,SAAS,CAAC,MAAM,aAAa,CAAC,CAAC;QACxE,CAAC;QAED,4BAA4B;QAC5B,MAAM,SAAS,GAAc;YACzB,OAAO,EAAE,IAAI,CAAC,IAAI;YAClB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC;SAC9C,CAAC;QAEF,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,qBAAqB,EAAE;gBAC/B,OAAO,EAAE,SAAS,CAAC,OAAO;gBAC1B,KAAK,EAAE,SAAS,CAAC,KAAK;aACzB,CAAC,CAAC;QACP,CAAC;QAED,yBAAyB;QACzB,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC;YA
CzB,KAAK,EAAE,SAAS;YAChB,MAAM;YACN,SAAS;YACT,KAAK,EAAE,IAAI,CAAC,KAAK;SACpB,CAAC,CAAC;QAEH,yBAAyB;QACzB,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC;YAC3C,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE;gBAC3B,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,UAAU,EAAE,MAAM,CAAC,UAAU;gBAC7B,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM;aACnC,CAAC,CAAC;QACP,CAAC;QAED,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACjB,uCAAuC;YACvC,MAAM,MAAM,GAAG;gBACX,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,UAAU,EAAE,MAAM,CAAC,UAAU;gBAC7B,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;gBAC3C,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC5B,CAAC;YAEF,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YAC7C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;aAAM,CAAC;YACJ,6BAA6B;YAC7B,MAAM,MAAM,GAAG;gBACX,OAAO,EAAE,KAAK;gBACd,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,iBAAiB;gBACpC,UAAU,EAAE,MAAM,CAAC,UAAU;gBAC7B,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC5B,CAAC;YAEF,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YAC/C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,2BAA2B;QAC3B,IAAI,IAAI,CAAC,KAAK,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YACvC,OAAO,CAAC,KAAK,CAAC,sBAAsB,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC;QACvD,CAAC;QAED,MAAM,WAAW,GAAG;YAChB,OAAO,EAAE,KAAK;YACd,MAAM,EAAE;gBACJ;oBACI,OAAO,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;oBAC/D,IAAI,EAAG,KAAa,CAAC,IAAI,IAAI,eAAe;iBAC/C;aACJ;SACJ,CAAC;QAEF,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;AACL,CAAC;AAED,KAAK,UAAU,IAAI;IACf,MAAM,IAAI,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAErC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,oEAAoE,CAAC,CAAC;QACpF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,IAAI,IAAI,CAAC,OAAO,KAAK,SAAS,EAAE,CAAC;QAC7B,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;
YACb,OAAO,CAAC,GAAG,CAAC,wCAAwC,EAAE;gBAClD,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,KAAK,EAAE,IAAI,CAAC,KAAK;aACpB,CAAC,CAAC;QACP,CAAC;QAED,MAAM,aAAa,CAAC,IAAI,CAAC,CAAC;IAC9B,CAAC;SAAM,CAAC;QACJ,OAAO,CAAC,KAAK,CAAC,2BAA2B,IAAI,CAAC,OAAO,kCAAkC,CAAC,CAAC;QACzF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;AACL,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACnB,OAAO,CAAC,KAAK,CAAC,cAAc,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAC7C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC"}
|
package/dist/core/index.d.ts
CHANGED
|
@@ -4,5 +4,7 @@
|
|
|
4
4
|
export { ExtractionPipeline, extract } from './pipeline.js';
|
|
5
5
|
export { validateExtractedData } from './validator.js';
|
|
6
6
|
export { PipelineError, PipelineErrorCodes } from './errors.js';
|
|
7
|
-
export
|
|
7
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './preprocessor.js';
|
|
8
|
+
export type { PreprocessResult } from './preprocessor.js';
|
|
9
|
+
export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, HtmlStripOptions, PreprocessingConfig, } from './types.js';
|
|
8
10
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/core/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,mBAAmB,CAAC;AAC3B,YAAY,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAC1D,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,EACV,gBAAgB,EAChB,mBAAmB,GACtB,MAAM,YAAY,CAAC"}
|
package/dist/core/index.js
CHANGED
|
@@ -4,4 +4,5 @@
|
|
|
4
4
|
export { ExtractionPipeline, extract } from './pipeline.js';
|
|
5
5
|
export { validateExtractedData } from './validator.js';
|
|
6
6
|
export { PipelineError, PipelineErrorCodes } from './errors.js';
|
|
7
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './preprocessor.js';
|
|
7
8
|
//# sourceMappingURL=index.js.map
|
package/dist/core/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,mBAAmB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAMH,OAAO,KAAK,EAAE,iBAAiB,EAAE,cAAc,EAAc,MAAM,YAAY,CAAC;AAEhF;;GAEG;AACH,qBAAa,kBAAkB;IAC3B,OAAO,CAAC,KAAK,CAAU;gBAEX,KAAK,GAAE,OAAe;IAIlC;;OAEG;IACG,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC;IAoLlE;;OAEG;IACH,OAAO,CAAC,UAAU;IAsBlB;;OAEG;YACW,eAAe;CAqBhC;AAED;;GAEG;AACH,wBAAsB,OAAO,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,cAAc,CAAC,CAGjF"}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
import { LLMClient } from '../llm/client.js';
|
|
5
5
|
import { validateExtractedData } from './validator.js';
|
|
6
6
|
import { PipelineError, PipelineErrorCodes } from './errors.js';
|
|
7
|
+
import { preprocessWithDetails } from './preprocessor.js';
|
|
7
8
|
/**
|
|
8
9
|
* Main extraction pipeline
|
|
9
10
|
*/
|
|
@@ -19,6 +20,18 @@ export class ExtractionPipeline {
|
|
|
19
20
|
const startTime = Date.now();
|
|
20
21
|
const steps = [];
|
|
21
22
|
try {
|
|
23
|
+
// Step 0: Preprocess input (if configured)
|
|
24
|
+
let processedInput = request.input;
|
|
25
|
+
if (request.preprocessing) {
|
|
26
|
+
const preprocessStep = this.recordStep('preprocess', () => {
|
|
27
|
+
return preprocessWithDetails(request.input, request.preprocessing);
|
|
28
|
+
});
|
|
29
|
+
steps.push(preprocessStep);
|
|
30
|
+
if (preprocessStep.success && preprocessStep.data) {
|
|
31
|
+
const result = preprocessStep.data;
|
|
32
|
+
processedInput = result.text;
|
|
33
|
+
}
|
|
34
|
+
}
|
|
22
35
|
// Step 1: Create LLM client
|
|
23
36
|
const clientStep = this.recordStep('create_client', () => {
|
|
24
37
|
return new LLMClient(request.llmConfig);
|
|
@@ -32,7 +45,7 @@ export class ExtractionPipeline {
|
|
|
32
45
|
const extractStep = await this.recordStepAsync('llm_extract', async () => {
|
|
33
46
|
return await client.extract({
|
|
34
47
|
schema: request.schema,
|
|
35
|
-
input:
|
|
48
|
+
input: processedInput,
|
|
36
49
|
});
|
|
37
50
|
});
|
|
38
51
|
steps.push(extractStep);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/core/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAChE,OAAO,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAG1D;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACnB,KAAK,CAAU;IAEvB,YAAY,QAAiB,KAAK;QAC9B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CAAC,OAA0B;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAiB,EAAE,CAAC;QAE/B,IAAI,CAAC;YACD,2CAA2C;YAC3C,IAAI,cAAc,GAAG,OAAO,CAAC,KAAK,CAAC;YACnC,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;gBACxB,MAAM,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,YAAY,EAAE,GAAG,EAAE;oBACtD,OAAO,qBAAqB,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;gBACvE,CAAC,CAAC,CAAC;gBACH,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAE3B,IAAI,cAAc,CAAC,OAAO,IAAI,cAAc,CAAC,IAAI,EAAE,CAAC;oBAChD,MAAM,MAAM,GAAG,cAAc,CAAC,IAA+C,CAAC;oBAC9E,cAAc,GAAG,MAAM,CAAC,IAAI,CAAC;gBACjC,CAAC;YACL,CAAC;YAED,4BAA4B;YAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACrD,OAAO,IAAI,SAAS,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC5C,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAEvB,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;gBACtB,MAAM,IAAI,aAAa,CACnB,6BAA6B,EAC7B,kBAAkB,CAAC,SAAS,EAC5B,eAAe,CAClB,CAAC;YACN,CAAC;YAED,MAAM,MAAM,GAAG,UAAU,CAAC,IAAiB,CAAC;YAE5C,kCAAkC;YAClC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,aAAa,EAAE,KAAK,IAAI,EAAE;gBACrE,OAAO,MAAM,MAAM,CAAC,OAAO,CAAC;oBACxB,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,KAAK,EAAE,cAAc;iBACxB,CAAC,CAAC;YACP,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAExB,IAAI,CAAC,WAAW,CAAC,OAAO,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;gBAC5C,MAAM,IAAI,aAAa,CACnB,uBAAuB,EACvB,kBAAkB,CAAC,SAAS,EAC5B,aAAa,EACb,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAC/B,CAAC;YACN,CAAC;YAED,MAAM,UAAU,GAAG,WAAW,CAAC,IAI9B,CAAC;YAEF,kCAAkC;YAClC,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,GAAG,EAAE;gBACvD,OAAO,qBAAqB,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;YACl
E,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAEzB,MAAM,UAAU,GAAG,YAAY,CAAC,IAA2F,CAAC;YAE5H,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;gBACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,qCAAqC;YACrC,MAAM,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,kBAAkB,EAAE,GAAG,EAAE;gBAC5D,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;oBAC7B,OAAO,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC;gBACpC,CAAC;gBAED,MAAM,EAAE,SAAS,EAAE,mBAAmB,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC;gBACrE,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,IAAI,SAAS,CAAC;gBAE1D,OAAO;oBACH,cAAc;oBACd,UAAU,EAAE,CAAC,cAAc,IAAI,mBAAmB;iBACrD,CAAC;YACN,CAAC,CAAC,CAAC;YACH,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAE3B,MAAM,eAAe,GAAG,cAAc,CAAC,IAAyD,CAAC;YAEjG,IAAI,eAAe,CAAC,UAAU,EAAE,CAAC;gBAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,IAAI,EAAE,UAAU,CAAC,IAAI;oBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;oBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;oBAC/C,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,cAAc,UAAU,CAAC,UAAU,qBAAqB,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,SAAS,GAAG;4BACxG,IAAI,EAAE,kBAAkB,CAAC,gBAAgB;yBAC5C;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,WAAW;YACX,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,UAAU,CAAC,IAAI;gBACrB,UAAU,EAAE,UAAU,CAAC,UAAU;gBACjC,iBAAiB,EAAE,UAAU,CAAC,iBAAiB;gBAC/C,cAAc,EAAE,eAAe,CAAC,cAAc;gBAC9C,MAAM,EAAE,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;oBA
C9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YAExC,IAAI,KAAK,YAAY,aAAa,EAAE,CAAC;gBACjC,OAAO;oBACH,OAAO,EAAE,KAAK;oBACd,cAAc,EAAE,KAAK;oBACrB,MAAM,EAAE;wBACJ;4BACI,OAAO,EAAE,KAAK,CAAC,OAAO;4BACtB,IAAI,EAAE,KAAK,CAAC,IAAI;yBACnB;qBACJ;oBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;oBACrC,QAAQ,EAAE;wBACN,QAAQ;wBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;wBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;qBAC5C;iBACJ,CAAC;YACN,CAAC;YAED,OAAO;gBACH,OAAO,EAAE,KAAK;gBACd,cAAc,EAAE,KAAK;gBACrB,MAAM,EAAE;oBACJ;wBACI,OAAO,EAAG,KAAe,CAAC,OAAO;wBACjC,IAAI,EAAE,eAAe;qBACxB;iBACJ;gBACD,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;gBACrC,QAAQ,EAAE;oBACN,QAAQ;oBACR,KAAK,EAAE,OAAO,CAAC,SAAS,CAAC,KAAK;oBAC9B,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI;iBAC5C;aACJ,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,UAAU,CAAI,IAAY,EAAE,EAAW;QAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,EAAE,EAAE,CAAC;YACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,eAAe,CAAI,IAAY,EAAE,EAAoB;QAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACD,MAAM,MAAM,GAAG,MAAM,EAAE,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,IAAI;gBACb,IAAI,EAAE,MAAM;gBACZ,QAAQ;aACX,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACxC,OAAO;gBACH,IAAI,EAAE,IAAI;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAG,KAAe,CAAC,OAAO;gBAC/B,QAAQ;aACX,CAAC;QACN,CAAC;IACL,CAAC;CACJ;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAA
U,OAAO,CAAC,OAA0B;IACpD,MAAM,QAAQ,GAAG,IAAI,kBAAkB,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACvD,OAAO,MAAM,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AAC3C,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML preprocessing module
|
|
3
|
+
* Strips HTML tags and noise from input text before extraction
|
|
4
|
+
*/
|
|
5
|
+
import type { HtmlStripOptions, PreprocessingConfig } from './types.js';
|
|
6
|
+
/**
|
|
7
|
+
* Resolves preprocessing options to concrete HtmlStripOptions
|
|
8
|
+
*/
|
|
9
|
+
export declare function resolveHtmlStripOptions(config: boolean | HtmlStripOptions | undefined): HtmlStripOptions | null;
|
|
10
|
+
/**
|
|
11
|
+
* Strips HTML from input text according to options
|
|
12
|
+
*/
|
|
13
|
+
export declare function stripHtml(input: string, options: HtmlStripOptions): string;
|
|
14
|
+
/**
|
|
15
|
+
* Preprocesses input text according to configuration
|
|
16
|
+
*/
|
|
17
|
+
export declare function preprocess(input: string, config: PreprocessingConfig): string;
|
|
18
|
+
/**
|
|
19
|
+
* Result of preprocessing
|
|
20
|
+
*/
|
|
21
|
+
export interface PreprocessResult {
|
|
22
|
+
/** The preprocessed text */
|
|
23
|
+
text: string;
|
|
24
|
+
/** Whether preprocessing was applied */
|
|
25
|
+
wasProcessed: boolean;
|
|
26
|
+
/** Original input length */
|
|
27
|
+
originalLength: number;
|
|
28
|
+
/** Processed text length */
|
|
29
|
+
processedLength: number;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Preprocesses input with detailed result information
|
|
33
|
+
*/
|
|
34
|
+
export declare function preprocessWithDetails(input: string, config: PreprocessingConfig | undefined): PreprocessResult;
|
|
35
|
+
//# sourceMappingURL=preprocessor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"preprocessor.d.ts","sourceRoot":"","sources":["../../src/core/preprocessor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC;AA4CxE;;GAEG;AACH,wBAAgB,uBAAuB,CACnC,MAAM,EAAE,OAAO,GAAG,gBAAgB,GAAG,SAAS,GAC/C,gBAAgB,GAAG,IAAI,CAqBzB;AAuKD;;GAEG;AACH,wBAAgB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,MAAM,CA2C1E;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,mBAAmB,GAAG,MAAM,CAY7E;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,4BAA4B;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,wCAAwC;IACxC,YAAY,EAAE,OAAO,CAAC;IACtB,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,4BAA4B;IAC5B,eAAe,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACjC,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,mBAAmB,GAAG,SAAS,GACxC,gBAAgB,CAkBlB"}
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML preprocessing module
|
|
3
|
+
* Strips HTML tags and noise from input text before extraction
|
|
4
|
+
*/
|
|
5
|
+
import { parse, HTMLElement } from 'node-html-parser';
|
|
6
|
+
/**
|
|
7
|
+
* Default selectors to remove from HTML
|
|
8
|
+
* These typically contain non-content elements
|
|
9
|
+
*/
|
|
10
|
+
const DEFAULT_REMOVE_SELECTORS = [
|
|
11
|
+
'script',
|
|
12
|
+
'style',
|
|
13
|
+
'nav',
|
|
14
|
+
'footer',
|
|
15
|
+
'header',
|
|
16
|
+
'aside',
|
|
17
|
+
'noscript',
|
|
18
|
+
'iframe',
|
|
19
|
+
'svg',
|
|
20
|
+
'canvas',
|
|
21
|
+
'form',
|
|
22
|
+
// Common ad and tracking selectors
|
|
23
|
+
'[class*="ad-"]',
|
|
24
|
+
'[class*="advertisement"]',
|
|
25
|
+
'[class*="cookie"]',
|
|
26
|
+
'[class*="subscribe"]',
|
|
27
|
+
'[class*="newsletter"]',
|
|
28
|
+
'[class*="popup"]',
|
|
29
|
+
'[class*="modal"]',
|
|
30
|
+
'[class*="banner"]',
|
|
31
|
+
'[id*="ad-"]',
|
|
32
|
+
'[id*="advertisement"]',
|
|
33
|
+
'[id*="cookie"]',
|
|
34
|
+
];
|
|
35
|
+
/**
|
|
36
|
+
* Elements that should preserve their semantic meaning
|
|
37
|
+
* when preserveStructure is enabled
|
|
38
|
+
*/
|
|
39
|
+
const SEMANTIC_ELEMENTS = {
|
|
40
|
+
headings: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
|
|
41
|
+
lists: ['ul', 'ol', 'li'],
|
|
42
|
+
containers: ['article', 'main', 'section', 'div', 'body', 'html'],
|
|
43
|
+
blocks: ['p', 'blockquote'],
|
|
44
|
+
inline: ['strong', 'b', 'em', 'i', 'a', 'code'],
|
|
45
|
+
};
|
|
46
|
+
/**
|
|
47
|
+
* Resolves preprocessing options to concrete HtmlStripOptions
|
|
48
|
+
*/
|
|
49
|
+
export function resolveHtmlStripOptions(config) {
|
|
50
|
+
if (!config) {
|
|
51
|
+
return null;
|
|
52
|
+
}
|
|
53
|
+
if (config === true) {
|
|
54
|
+
// Default options when stripHtml: true
|
|
55
|
+
return {
|
|
56
|
+
extractText: true,
|
|
57
|
+
preserveStructure: false,
|
|
58
|
+
removeSelectors: [],
|
|
59
|
+
maxLength: undefined,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
return {
|
|
63
|
+
extractText: config.extractText ?? true,
|
|
64
|
+
preserveStructure: config.preserveStructure ?? false,
|
|
65
|
+
removeSelectors: config.removeSelectors ?? [],
|
|
66
|
+
maxLength: config.maxLength,
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Removes elements matching the specified selectors
|
|
71
|
+
*/
|
|
72
|
+
function removeElements(root, selectors) {
|
|
73
|
+
const allSelectors = [...DEFAULT_REMOVE_SELECTORS, ...selectors];
|
|
74
|
+
for (const selector of allSelectors) {
|
|
75
|
+
try {
|
|
76
|
+
const elements = root.querySelectorAll(selector);
|
|
77
|
+
for (const el of elements) {
|
|
78
|
+
el.remove();
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
catch {
|
|
82
|
+
// Invalid selector, skip silently
|
|
83
|
+
// This can happen with complex CSS selectors not supported by node-html-parser
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Converts semantic HTML elements to markdown-like text
|
|
89
|
+
*/
|
|
90
|
+
function convertToStructuredText(root) {
|
|
91
|
+
const lines = [];
|
|
92
|
+
function processNode(node, depth = 0) {
|
|
93
|
+
if (!node)
|
|
94
|
+
return;
|
|
95
|
+
const tagName = node.tagName?.toLowerCase() || '';
|
|
96
|
+
// Handle headings
|
|
97
|
+
if (SEMANTIC_ELEMENTS.headings.includes(tagName)) {
|
|
98
|
+
const level = parseInt(tagName[1], 10);
|
|
99
|
+
const prefix = '#'.repeat(level) + ' ';
|
|
100
|
+
const text = node.text.trim();
|
|
101
|
+
if (text) {
|
|
102
|
+
lines.push('');
|
|
103
|
+
lines.push(prefix + text);
|
|
104
|
+
lines.push('');
|
|
105
|
+
}
|
|
106
|
+
return;
|
|
107
|
+
}
|
|
108
|
+
// Handle list items
|
|
109
|
+
if (tagName === 'li') {
|
|
110
|
+
const parent = node.parentNode;
|
|
111
|
+
const parentTag = parent?.tagName?.toLowerCase();
|
|
112
|
+
const prefix = parentTag === 'ol' ? '1. ' : '- ';
|
|
113
|
+
const text = node.text.trim();
|
|
114
|
+
if (text) {
|
|
115
|
+
lines.push(prefix + text);
|
|
116
|
+
}
|
|
117
|
+
return;
|
|
118
|
+
}
|
|
119
|
+
// Handle lists container
|
|
120
|
+
if (tagName === 'ul' || tagName === 'ol') {
|
|
121
|
+
lines.push('');
|
|
122
|
+
for (const child of node.childNodes) {
|
|
123
|
+
if (child instanceof HTMLElement) {
|
|
124
|
+
processNode(child, depth + 1);
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
lines.push('');
|
|
128
|
+
return;
|
|
129
|
+
}
|
|
130
|
+
// Handle blockquotes
|
|
131
|
+
if (tagName === 'blockquote') {
|
|
132
|
+
const text = node.text.trim();
|
|
133
|
+
if (text) {
|
|
134
|
+
lines.push('');
|
|
135
|
+
lines.push('> ' + text.replace(/\n/g, '\n> '));
|
|
136
|
+
lines.push('');
|
|
137
|
+
}
|
|
138
|
+
return;
|
|
139
|
+
}
|
|
140
|
+
// Handle code blocks
|
|
141
|
+
if (tagName === 'pre' || tagName === 'code') {
|
|
142
|
+
const text = node.text.trim();
|
|
143
|
+
if (text) {
|
|
144
|
+
lines.push('');
|
|
145
|
+
lines.push('```');
|
|
146
|
+
lines.push(text);
|
|
147
|
+
lines.push('```');
|
|
148
|
+
lines.push('');
|
|
149
|
+
}
|
|
150
|
+
return;
|
|
151
|
+
}
|
|
152
|
+
// Handle paragraphs and other block elements
|
|
153
|
+
if (SEMANTIC_ELEMENTS.blocks.includes(tagName)) {
|
|
154
|
+
const text = node.text.trim();
|
|
155
|
+
if (text) {
|
|
156
|
+
lines.push('');
|
|
157
|
+
lines.push(text);
|
|
158
|
+
lines.push('');
|
|
159
|
+
}
|
|
160
|
+
return;
|
|
161
|
+
}
|
|
162
|
+
// Handle container elements - recurse into children
|
|
163
|
+
if (SEMANTIC_ELEMENTS.containers.includes(tagName) || !tagName) {
|
|
164
|
+
for (const child of node.childNodes) {
|
|
165
|
+
if (child instanceof HTMLElement) {
|
|
166
|
+
processNode(child, depth);
|
|
167
|
+
}
|
|
168
|
+
else if (child.nodeType === 3) {
|
|
169
|
+
// Text node
|
|
170
|
+
const text = child.text.trim();
|
|
171
|
+
if (text) {
|
|
172
|
+
lines.push(text);
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
return;
|
|
177
|
+
}
|
|
178
|
+
// Recursively process children for any other elements
|
|
179
|
+
for (const child of node.childNodes) {
|
|
180
|
+
if (child instanceof HTMLElement) {
|
|
181
|
+
processNode(child, depth);
|
|
182
|
+
}
|
|
183
|
+
else if (child.nodeType === 3) {
|
|
184
|
+
// Text node
|
|
185
|
+
const text = child.text.trim();
|
|
186
|
+
if (text) {
|
|
187
|
+
lines.push(text);
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
processNode(root);
|
|
193
|
+
// Clean up multiple blank lines
|
|
194
|
+
return lines
|
|
195
|
+
.join('\n')
|
|
196
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
197
|
+
.trim();
|
|
198
|
+
}
|
|
199
|
+
/**
 * Flattens a parsed HTML tree into plain text with normalized whitespace.
 *
 * Reads `root.text` and then:
 *   1. collapses runs of spaces/tabs into one space,
 *   2. turns any blank-line gap into exactly one empty line,
 *   3. trims every individual line,
 *   4. caps consecutive newlines at two and trims the whole string.
 *
 * @param root - Parsed node exposing its concatenated text via `.text`.
 * @returns The cleaned text, with paragraph breaks kept as single blank lines.
 */
function extractPlainText(root) {
    const collapsed = root.text
        .replace(/[ \t]+/g, ' ')
        .replace(/\n\s*\n/g, '\n\n');
    // Trim line by line so indentation inherited from the markup is dropped.
    const trimmedLines = [];
    for (const rawLine of collapsed.split('\n')) {
        trimmedLines.push(rawLine.trim());
    }
    // Trimming can leave fresh runs of empty lines behind, so squeeze again.
    const squeezed = trimmedLines.join('\n').replace(/\n{3,}/g, '\n\n');
    return squeezed.trim();
}
|
|
220
|
+
/**
 * Shortens `text` to at most `maxLength` characters and marks the cut with "...".
 *
 * The cut is moved back to the last space when that space lies in the final
 * 20% of the allowed length, so a word is not split in half. The "..." marker
 * is appended after slicing, so a truncated result is up to three characters
 * longer than `maxLength`.
 *
 * @param text - Text to shorten.
 * @param maxLength - Limit; a falsy value (undefined, 0) means "no limit".
 * @returns `text` unchanged when it fits, otherwise the shortened text plus "...".
 */
function truncateWithEllipsis(text, maxLength) {
    if (!maxLength || text.length <= maxLength) {
        return text;
    }
    const clipped = text.slice(0, maxLength);
    const lastSpace = clipped.lastIndexOf(' ');
    const kept = lastSpace > maxLength * 0.8 ? clipped.slice(0, lastSpace) : clipped;
    return kept + '...';
}
/**
 * Strips HTML from input text according to options.
 *
 * Input that contains no `<` or no `>` is treated as plain text and is not
 * parsed. Otherwise the input is parsed, unwanted elements are removed, and
 * the remaining tree is rendered either as structured (markdown-like) text or
 * as plain text. Both paths apply the same `maxLength` truncation, so the
 * result does not depend on whether the input happened to contain markup.
 *
 * @param input - Raw text that may contain HTML.
 * @param options - Resolved strip options; reads `removeSelectors`,
 *   `preserveStructure` and `maxLength`.
 * @returns The stripped (and possibly truncated) text.
 */
export function stripHtml(input, options) {
    // Quick check: nothing tag-like, so skip parsing but still honor maxLength
    if (!input.includes('<') || !input.includes('>')) {
        return truncateWithEllipsis(input, options.maxLength);
    }
    // Parse HTML
    const root = parse(input, {
        lowerCaseTagName: true,
        comment: false, // Remove comments
        blockTextElements: {
            script: true,
            noscript: true,
            style: true,
            pre: true,
        },
    });
    // Remove unwanted elements
    removeElements(root, options.removeSelectors || []);
    // Extract text based on options
    const result = options.preserveStructure
        ? convertToStructuredText(root)
        : extractPlainText(root);
    // Apply max length if specified
    return truncateWithEllipsis(result, options.maxLength);
}
|
|
263
|
+
/**
 * Preprocesses input text according to configuration.
 *
 * Currently the only step is HTML stripping, enabled through
 * `config.stripHtml`. A missing config (undefined/null) is treated the same as
 * an empty one, matching how `preprocessWithDetails` handles it, so the input
 * is returned untouched instead of throwing.
 *
 * @param input - Raw input text.
 * @param config - Optional preprocessing configuration; reads `stripHtml`.
 * @returns The preprocessed text, or `input` itself when nothing was requested.
 */
export function preprocess(input, config) {
    // Nothing requested (or no config at all): pass the input through.
    if (!config || !config.stripHtml) {
        return input;
    }
    // Handle HTML stripping; the resolver may still decline by returning a falsy value.
    const options = resolveHtmlStripOptions(config.stripHtml);
    return options ? stripHtml(input, options) : input;
}
|
|
277
|
+
/**
 * Runs preprocessing and reports what happened alongside the text.
 *
 * When no config is given, or `config.stripHtml` is falsy, the input is
 * returned as-is with `wasProcessed: false`. Otherwise `preprocess` is called
 * and `wasProcessed` reflects whether the text actually changed.
 *
 * @param input - Raw input text.
 * @param config - Optional preprocessing configuration.
 * @returns `{ text, wasProcessed, originalLength, processedLength }`.
 */
export function preprocessWithDetails(input, config) {
    const stripRequested = Boolean(config) && Boolean(config.stripHtml);
    if (stripRequested) {
        const text = preprocess(input, config);
        return {
            text,
            wasProcessed: text !== input,
            originalLength: input.length,
            processedLength: text.length,
        };
    }
    // Pass-through: both lengths describe the untouched input.
    const untouchedLength = input.length;
    return {
        text: input,
        wasProcessed: false,
        originalLength: untouchedLength,
        processedLength: untouchedLength,
    };
}
|
|
297
|
+
//# sourceMappingURL=preprocessor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"preprocessor.js","sourceRoot":"","sources":["../../src/core/preprocessor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,MAAM,wBAAwB,GAAG;IAC7B,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,UAAU;IACV,QAAQ;IACR,KAAK;IACL,QAAQ;IACR,MAAM;IACN,mCAAmC;IACnC,gBAAgB;IAChB,0BAA0B;IAC1B,mBAAmB;IACnB,sBAAsB;IACtB,uBAAuB;IACvB,kBAAkB;IAClB,kBAAkB;IAClB,mBAAmB;IACnB,aAAa;IACb,uBAAuB;IACvB,gBAAgB;CACnB,CAAC;AAEF;;;GAGG;AACH,MAAM,iBAAiB,GAAG;IACtB,QAAQ,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;IAC9C,KAAK,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;IACzB,UAAU,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC;IACjE,MAAM,EAAE,CAAC,GAAG,EAAE,YAAY,CAAC;IAC3B,MAAM,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC;CAClD,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACnC,MAA8C;IAE9C,IAAI,CAAC,MAAM,EAAE,CAAC;QACV,OAAO,IAAI,CAAC;IAChB,CAAC;IAED,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;QAClB,uCAAuC;QACvC,OAAO;YACH,WAAW,EAAE,IAAI;YACjB,iBAAiB,EAAE,KAAK;YACxB,eAAe,EAAE,EAAE;YACnB,SAAS,EAAE,SAAS;SACvB,CAAC;IACN,CAAC;IAED,OAAO;QACH,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,IAAI;QACvC,iBAAiB,EAAE,MAAM,CAAC,iBAAiB,IAAI,KAAK;QACpD,eAAe,EAAE,MAAM,CAAC,eAAe,IAAI,EAAE;QAC7C,SAAS,EAAE,MAAM,CAAC,SAAS;KAC9B,CAAC;AACN,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,IAAiB,EAAE,SAAmB;IAC1D,MAAM,YAAY,GAAG,CAAC,GAAG,wBAAwB,EAAE,GAAG,SAAS,CAAC,CAAC;IAEjE,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE,CAAC;QAClC,IAAI,CAAC;YACD,MAAM,QAAQ,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YACjD,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;gBACxB,EAAE,CAAC,MAAM,EAAE,CAAC;YAChB,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACL,kCAAkC;YAClC,+EAA+E;QACnF,CAAC;IACL,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,uBAAuB,CAAC,IAAiB;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,SAAS,WAAW,CAAC,IAAwB,EAAE,QAAgB,CAAC;QAC5D,IAAI,CAAC,IAAI;YAAE,OAAO;QAElB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAElD,kBAAkB;QAClB,IAAI,iBAAiB,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC
,EAAE,CAAC;YAC/C,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACvC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC;YACvC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,oBAAoB;QACpB,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACnB,MAAM,MAAM,GAAG,IAAI,CAAC,UAAgC,CAAC;YACrD,MAAM,SAAS,GAAG,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC;YACjD,MAAM,MAAM,GAAG,SAAS,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;YACjD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;YAC9B,CAAC;YACD,OAAO;QACX,CAAC;QAED,yBAAyB;QACzB,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACvC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;oBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;gBAClC,CAAC;YACL,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,OAAO;QACX,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,KAAK,YAAY,EAAE,CAAC;YAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC;gBAC/C,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,KAAK,KAAK,IAAI,OAAO,KAAK,MAAM,EAAE,CAAC;YAC1C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAClB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,6CAA6C;QAC7C,IAAI,iBAAiB,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YAC7C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,KA
AK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnB,CAAC;YACD,OAAO;QACX,CAAC;QAED,oDAAoD;QACpD,IAAI,iBAAiB,CAAC,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAC7D,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;oBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;gBAC9B,CAAC;qBAAM,IAAI,KAAK,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBAC9B,YAAY;oBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;oBAC/B,IAAI,IAAI,EAAE,CAAC;wBACP,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACrB,CAAC;gBACL,CAAC;YACL,CAAC;YACD,OAAO;QACX,CAAC;QAED,sDAAsD;QACtD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YAClC,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;gBAC/B,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAC9B,CAAC;iBAAM,IAAI,KAAK,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;gBAC9B,YAAY;gBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;gBAC/B,IAAI,IAAI,EAAE,CAAC;oBACP,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACrB,CAAC;YACL,CAAC;QACL,CAAC;IACL,CAAC;IAED,WAAW,CAAC,IAAI,CAAC,CAAC;IAElB,gCAAgC;IAChC,OAAO,KAAK;SACP,IAAI,CAAC,IAAI,CAAC;SACV,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAiB;IACvC,eAAe;IACf,IAAI,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC;IAErB,wDAAwD;IACxD,IAAI,GAAG,IAAI;QACP,4CAA4C;SAC3C,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;QACxB,kEAAkE;SACjE,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC;QAC5B,oDAAoD;SACnD,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SACxB,IAAI,CAAC,IAAI,CAAC;QACX,4CAA4C;SAC3C,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;IAEZ,OAAO,IAAI,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,KAAa,EAAE,OAAyB;IAC9D,qDAAqD;IACrD,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;IACzE,CAAC;IAED,aAAa;IACb,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,EAAE;QACtB,gBAAgB,EAAE,IAAI;QACtB,OAAO,EAAE,KAAK,EAAE,kBAAkB;QAClC,iBAAiB,EAAE;Y
ACf,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,IAAI;YACd,KAAK,EAAE,IAAI;YACX,GAAG,EAAE,IAAI;SACZ;KACJ,CAAC,CAAC;IAEH,2BAA2B;IAC3B,cAAc,CAAC,IAAI,EAAE,OAAO,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC;IAEpD,gCAAgC;IAChC,IAAI,MAAc,CAAC;IAEnB,IAAI,OAAO,CAAC,iBAAiB,EAAE,CAAC;QAC5B,MAAM,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC3C,CAAC;SAAM,CAAC;QACJ,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;IACpC,CAAC;IAED,gCAAgC;IAChC,IAAI,OAAO,CAAC,SAAS,IAAI,MAAM,CAAC,MAAM,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;QACzD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;QAC5C,kCAAkC;QAClC,MAAM,SAAS,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,SAAS,GAAG,OAAO,CAAC,SAAS,GAAG,GAAG,EAAE,CAAC;YACtC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,KAAK,CAAC;QAChD,CAAC;aAAM,CAAC;YACJ,MAAM,IAAI,KAAK,CAAC;QACpB,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,KAAa,EAAE,MAA2B;IACjE,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,wBAAwB;IACxB,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACnB,MAAM,OAAO,GAAG,uBAAuB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC1D,IAAI,OAAO,EAAE,CAAC;YACV,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACxC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAgBD;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACjC,KAAa,EACb,MAAuC;IAEvC,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC;QACjC,OAAO;YACH,IAAI,EAAE,KAAK;YACX,YAAY,EAAE,KAAK;YACnB,cAAc,EAAE,KAAK,CAAC,MAAM;YAC5B,eAAe,EAAE,KAAK,CAAC,MAAM;SAChC,CAAC;IACN,CAAC;IAED,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAE5C,OAAO;QACH,IAAI,EAAE,SAAS;QACf,YAAY,EAAE,SAAS,KAAK,KAAK;QACjC,cAAc,EAAE,KAAK,CAAC,MAAM;QAC5B,eAAe,EAAE,SAAS,CAAC,MAAM;KACpC,CAAC;AACN,CAAC"}
|
package/dist/core/types.d.ts
CHANGED
|
@@ -3,6 +3,26 @@
|
|
|
3
3
|
*/
|
|
4
4
|
import type { Schema } from '../schemas/types.js';
|
|
5
5
|
import type { LLMConfig } from '../llm/types.js';
|
|
6
|
+
/**
|
|
7
|
+
* HTML stripping options for preprocessing
|
|
8
|
+
*/
|
|
9
|
+
export interface HtmlStripOptions {
|
|
10
|
+
/** Keep text content only (default: true) */
|
|
11
|
+
extractText?: boolean;
|
|
12
|
+
/** Preserve semantic structure like headings, lists (converts to markdown-like format) */
|
|
13
|
+
preserveStructure?: boolean;
|
|
14
|
+
/** Remove specific CSS selectors (e.g., 'nav', 'footer', '.ad', '#sidebar') */
|
|
15
|
+
removeSelectors?: string[];
|
|
16
|
+
/** Max content length after stripping; longer output is cut at this length and may have "..." appended, so the result can be up to 3 characters longer */
|
|
17
|
+
maxLength?: number;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Preprocessing configuration for input text
|
|
21
|
+
*/
|
|
22
|
+
export interface PreprocessingConfig {
|
|
23
|
+
/** Strip HTML tags from input. When true, uses default options. */
|
|
24
|
+
stripHtml?: boolean | HtmlStripOptions;
|
|
25
|
+
}
|
|
6
26
|
/**
|
|
7
27
|
* Pipeline configuration
|
|
8
28
|
*/
|
|
@@ -19,6 +39,8 @@ export interface ExtractionRequest {
|
|
|
19
39
|
input: string;
|
|
20
40
|
schema: Schema;
|
|
21
41
|
llmConfig: LLMConfig;
|
|
42
|
+
/** Optional preprocessing configuration */
|
|
43
|
+
preprocessing?: PreprocessingConfig;
|
|
22
44
|
debug?: boolean;
|
|
23
45
|
}
|
|
24
46
|
/**
|
package/dist/core/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/core/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAEjD;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;QACV,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,QAAQ,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,UAAU,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACL"}
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/core/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAEjD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,6CAA6C;IAC7C,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,0FAA0F;IAC1F,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAC5B,+EAA+E;IAC/E,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,iEAAiE;IACjE,SAAS,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAChC,mEAAmE;IACnE,SAAS,CAAC,EAAE,OAAO,GAAG,gBAAgB,CAAC;CAC1C;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,SAAS,CAAC;IACrB,2CAA2C;IAC3C,aAAa,CAAC,EAAE,mBAAmB,CAAC;IACpC,KAAK,CAAC,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;QACV,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,QAAQ,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,UAAU,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACL"}
|
package/dist/core/validator.js
CHANGED
|
@@ -45,20 +45,32 @@ function validateField(fieldName, value, fieldDef) {
|
|
|
45
45
|
value,
|
|
46
46
|
});
|
|
47
47
|
}
|
|
48
|
-
else
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
});
|
|
58
|
-
}
|
|
48
|
+
else {
|
|
49
|
+
// Check enum constraint
|
|
50
|
+
if (fieldDef.enum && !fieldDef.enum.includes(value)) {
|
|
51
|
+
errors.push({
|
|
52
|
+
field: fieldName,
|
|
53
|
+
message: `Field '${fieldName}' must be one of: ${fieldDef.enum.join(', ')}. Got: ${value}`,
|
|
54
|
+
code: PipelineErrorCodes.FIELD_INVALID,
|
|
55
|
+
value,
|
|
56
|
+
});
|
|
59
57
|
}
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
// Check pattern constraint
|
|
59
|
+
if (fieldDef.pattern) {
|
|
60
|
+
try {
|
|
61
|
+
const regex = new RegExp(fieldDef.pattern);
|
|
62
|
+
if (!regex.test(value)) {
|
|
63
|
+
errors.push({
|
|
64
|
+
field: fieldName,
|
|
65
|
+
message: `Field '${fieldName}' does not match pattern: ${fieldDef.pattern}`,
|
|
66
|
+
code: PipelineErrorCodes.FIELD_INVALID,
|
|
67
|
+
value,
|
|
68
|
+
});
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
catch (e) {
|
|
72
|
+
// Invalid regex in schema - should be caught by schema validation
|
|
73
|
+
}
|
|
62
74
|
}
|
|
63
75
|
}
|
|
64
76
|
break;
|
|
@@ -90,45 +102,33 @@ function validateField(fieldName, value, fieldDef) {
|
|
|
90
102
|
}
|
|
91
103
|
}
|
|
92
104
|
break;
|
|
93
|
-
case '
|
|
94
|
-
|
|
95
|
-
if (typeof value !== 'string' && !(value instanceof Date)) {
|
|
105
|
+
case 'integer':
|
|
106
|
+
if (typeof value !== 'number' || !Number.isInteger(value)) {
|
|
96
107
|
errors.push({
|
|
97
108
|
field: fieldName,
|
|
98
|
-
message: `Field '${fieldName}' must be
|
|
109
|
+
message: `Field '${fieldName}' must be an integer`,
|
|
99
110
|
code: PipelineErrorCodes.TYPE_MISMATCH,
|
|
100
111
|
value,
|
|
101
112
|
});
|
|
102
113
|
}
|
|
103
114
|
else {
|
|
104
|
-
//
|
|
105
|
-
|
|
106
|
-
if (isNaN(dateValue.getTime())) {
|
|
115
|
+
// Check min/max constraints
|
|
116
|
+
if (fieldDef.min !== undefined && value < fieldDef.min) {
|
|
107
117
|
errors.push({
|
|
108
118
|
field: fieldName,
|
|
109
|
-
message: `Field '${fieldName}'
|
|
119
|
+
message: `Field '${fieldName}' must be at least ${fieldDef.min}`,
|
|
120
|
+
code: PipelineErrorCodes.FIELD_INVALID,
|
|
121
|
+
value,
|
|
122
|
+
});
|
|
123
|
+
}
|
|
124
|
+
if (fieldDef.max !== undefined && value > fieldDef.max) {
|
|
125
|
+
errors.push({
|
|
126
|
+
field: fieldName,
|
|
127
|
+
message: `Field '${fieldName}' must be at most ${fieldDef.max}`,
|
|
110
128
|
code: PipelineErrorCodes.FIELD_INVALID,
|
|
111
129
|
value,
|
|
112
130
|
});
|
|
113
131
|
}
|
|
114
|
-
}
|
|
115
|
-
break;
|
|
116
|
-
case 'enum':
|
|
117
|
-
if (typeof value !== 'string') {
|
|
118
|
-
errors.push({
|
|
119
|
-
field: fieldName,
|
|
120
|
-
message: `Field '${fieldName}' must be a string for enum type, got ${typeof value}`,
|
|
121
|
-
code: PipelineErrorCodes.TYPE_MISMATCH,
|
|
122
|
-
value,
|
|
123
|
-
});
|
|
124
|
-
}
|
|
125
|
-
else if (fieldDef.enum && !fieldDef.enum.includes(value)) {
|
|
126
|
-
errors.push({
|
|
127
|
-
field: fieldName,
|
|
128
|
-
message: `Field '${fieldName}' must be one of: ${fieldDef.enum.join(', ')}. Got: ${value}`,
|
|
129
|
-
code: PipelineErrorCodes.FIELD_INVALID,
|
|
130
|
-
value,
|
|
131
|
-
});
|
|
132
132
|
}
|
|
133
133
|
break;
|
|
134
134
|
case 'boolean':
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"validator.js","sourceRoot":"","sources":["../../src/core/validator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EAAiB,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAchE;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACjC,IAA6B,EAC7B,MAAc;IAEd,MAAM,MAAM,GAAsB,EAAE,CAAC;IAErC,wBAAwB;IACxB,KAAK,MAAM,CAAC,SAAS,EAAE,QAAQ,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;QAChE,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC;QAE9B,4BAA4B;QAC5B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACxC,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC;gBACrB,MAAM,CAAC,IAAI,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,mBAAmB,SAAS,cAAc;oBACnD,IAAI,EAAE,kBAAkB,CAAC,aAAa;iBACzC,CAAC,CAAC;YACP,CAAC;YACD,SAAS;QACb,CAAC;QAED,sCAAsC;QACtC,MAAM,WAAW,GAAG,aAAa,CAAC,SAAS,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;QAC9D,MAAM,CAAC,IAAI,CAAC,GAAG,WAAW,CAAC,CAAC;IAChC,CAAC;IAED,OAAO;QACH,KAAK,EAAE,MAAM,CAAC,MAAM,KAAK,CAAC;QAC1B,MAAM;KACT,CAAC;AACN,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAClB,SAAiB,EACjB,KAAc,EACd,QAAyB;IAEzB,MAAM,MAAM,GAAsB,EAAE,CAAC;IAErC,QAAQ,QAAQ,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,QAAQ;YACT,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC5B,MAAM,CAAC,IAAI,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,UAAU,SAAS,2BAA2B,OAAO,KAAK,EAAE;oBACrE,IAAI,EAAE,kBAAkB,CAAC,aAAa;oBACtC,KAAK;iBACR,CAAC,CAAC;YACP,CAAC;iBAAM,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"validator.js","sourceRoot":"","sources":["../../src/core/validator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EAAiB,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAchE;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACjC,IAA6B,EAC7B,MAAc;IAEd,MAAM,MAAM,GAAsB,EAAE,CAAC;IAErC,wBAAwB;IACxB,KAAK,MAAM,CAAC,SAAS,EAAE,QAAQ,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;QAChE,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC;QAE9B,4BAA4B;QAC5B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACxC,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC;gBACrB,MAAM,CAAC,IAAI,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,mBAAmB,SAAS,cAAc;oBACnD,IAAI,EAAE,kBAAkB,CAAC,aAAa;iBACzC,CAAC,CAAC;YACP,CAAC;YACD,SAAS;QACb,CAAC;QAED,sCAAsC;QACtC,MAAM,WAAW,GAAG,aAAa,CAAC,SAAS,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;QAC9D,MAAM,CAAC,IAAI,CAAC,GAAG,WAAW,CAAC,CAAC;IAChC,CAAC;IAED,OAAO;QACH,KAAK,EAAE,MAAM,CAAC,MAAM,KAAK,CAAC;QAC1B,MAAM;KACT,CAAC;AACN,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAClB,SAAiB,EACjB,KAAc,EACd,QAAyB;IAEzB,MAAM,MAAM,GAAsB,EAAE,CAAC;IAErC,QAAQ,QAAQ,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,QAAQ;YACT,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC5B,MAAM,CAAC,IAAI,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,UAAU,SAAS,2BAA2B,OAAO,KAAK,EAAE;oBACrE,IAAI,EAAE,kBAAkB,CAAC,aAAa;oBACtC,KAAK;iBACR,CAAC,CAAC;YACP,CAAC;iBAAM,CAAC;gBACJ,wBAAwB;gBACxB,IAAI,QAAQ,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;oBAClD,MAAM,CAAC,IAAI,CAAC;wBACR,KAAK,EAAE,SAAS;wBAChB,OAAO,EAAE,UAAU,SAAS,qBAAqB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,KAAK,EAAE;wBAC1F,IAAI,EAAE,kBAAkB,CAAC,aAAa;wBACtC,KAAK;qBACR,CAAC,CAAC;gBACP,CAAC;gBACD,2BAA2B;gBAC3B,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;oBACnB,IAAI,CAAC;wBACD,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;wBAC3C,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;4BACrB,MAAM,CAAC,IAAI,CAAC;gCACR,KAAK,EAAE,SAAS;gCAChB,OAAO,EAAE,UAAU,SAAS,6BAA6B,QAAQ,CAAC,OAAO,EAAE;gCAC3E,IAAI,EAAE,kBAAkB,CAAC,aAAa;gCACtC,KAAK;6BACR,CAAC,CAAC;wBACP,CAAC;oBACL,CAAC;oBAAC,OAAO,CAAC,EAAE,CAAC;wBACT,kEAAkE;oBACtE,CAAC;gB
ACL,CAAC;YACL,CAAC;YACD,MAAM;QAEV,KAAK,QAAQ;YACT,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC5C,MAAM,CAAC,IAAI,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,UAAU,SAAS,2BAA2B,OAAO,KAAK,EAAE;oBACrE,IAAI,EAAE,kBAAkB,CAAC,aAAa;oBACtC,KAAK;iBACR,CAAC,CAAC;YACP,CAAC;iBAAM,CAAC;gBACJ,IAAI,QAAQ,CAAC,GAAG,KAAK,SAAS,IAAI,KAAK,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;oBACrD,MAAM,CAAC,IAAI,CAAC;wBACR,KAAK,EAAE,SAAS;wBAChB,OAAO,EAAE,UAAU,SAAS,gBAAgB,QAAQ,CAAC,GAAG,SAAS,KAAK,EAAE;wBACxE,IAAI,EAAE,kBAAkB,CAAC,aAAa;wBACtC,KAAK;qBACR,CAAC,CAAC;gBACP,CAAC;gBACD,IAAI,QAAQ,CAAC,GAAG,KAAK,SAAS,IAAI,KAAK,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;oBACrD,MAAM,CAAC,IAAI,CAAC;wBACR,KAAK,EAAE,SAAS;wBAChB,OAAO,EAAE,UAAU,SAAS,gBAAgB,QAAQ,CAAC,GAAG,SAAS,KAAK,EAAE;wBACxE,IAAI,EAAE,kBAAkB,CAAC,aAAa;wBACtC,KAAK;qBACR,CAAC,CAAC;gBACP,CAAC;YACL,CAAC;YACD,MAAM;QAEV,KAAK,SAAS;YACV,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;gBACxD,MAAM,CAAC,IAAI,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,UAAU,SAAS,sBAAsB;oBAClD,IAAI,EAAE,kBAAkB,CAAC,aAAa;oBACtC,KAAK;iBACR,CAAC,CAAC;YACP,CAAC;iBAAM,CAAC;gBACJ,4BAA4B;gBAC5B,IAAI,QAAQ,CAAC,GAAG,KAAK,SAAS,IAAI,KAAK,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;oBACrD,MAAM,CAAC,IAAI,CAAC;wBACR,KAAK,EAAE,SAAS;wBAChB,OAAO,EAAE,UAAU,SAAS,sBAAsB,QAAQ,CAAC,GAAG,EAAE;wBAChE,IAAI,EAAE,kBAAkB,CAAC,aAAa;wBACtC,KAAK;qBACR,CAAC,CAAC;gBACP,CAAC;gBACD,IAAI,QAAQ,CAAC,GAAG,KAAK,SAAS,IAAI,KAAK,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;oBACrD,MAAM,CAAC,IAAI,CAAC;wBACR,KAAK,EAAE,SAAS;wBAChB,OAAO,EAAE,UAAU,SAAS,qBAAqB,QAAQ,CAAC,GAAG,EAAE;wBAC/D,IAAI,EAAE,kBAAkB,CAAC,aAAa;wBACtC,KAAK;qBACR,CAAC,CAAC;gBACP,CAAC;YACL,CAAC;YACD,MAAM;QAEV,KAAK,SAAS;YACV,IAAI,OAAO,KAAK,KAAK,SAAS,EAAE,CAAC;gBAC7B,MAAM,CAAC,IAAI,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,UAAU,SAAS,4BAA4B,OAAO,KAAK,EAAE;oBACtE,IAAI,EAAE,kBAAkB,CAAC,aAAa;oBACtC,KAAK;iBACR,CAAC,CAAC;YACP,CAAC;YACD,MAAM;IACd,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC"}
|
package/dist/index.d.ts
CHANGED
|
@@ -5,7 +5,9 @@
|
|
|
5
5
|
export { ExtractionPipeline, extract } from './core/pipeline.js';
|
|
6
6
|
export { validateExtractedData } from './core/validator.js';
|
|
7
7
|
export { PipelineError, PipelineErrorCodes } from './core/errors.js';
|
|
8
|
-
export
|
|
8
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './core/preprocessor.js';
|
|
9
|
+
export type { PreprocessResult } from './core/preprocessor.js';
|
|
10
|
+
export type { PipelineConfig, ExtractionRequest, PipelineResult, StepResult, HtmlStripOptions, PreprocessingConfig, } from './core/types.js';
|
|
9
11
|
export { loadSchema, parseSchema, loadSchemaFromObject } from './schemas/loader.js';
|
|
10
12
|
export { validateSchema } from './schemas/validator.js';
|
|
11
13
|
export { SchemaValidationError, ErrorCodes as SchemaErrorCodes } from './schemas/errors.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,wBAAwB,CAAC;AAChC,YAAY,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC/D,YAAY,EACR,cAAc,EACd,iBAAiB,EACjB,cAAc,EACd,UAAU,EACV,gBAAgB,EAChB,mBAAmB,GACtB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC5F,YAAY,EACR,MAAM,EACN,eAAe,EACf,SAAS,EACT,eAAe,EACf,gBAAgB,EACnB,MAAM,oBAAoB,CAAC;AAG5B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACtE,YAAY,EACR,SAAS,EACT,WAAW,EACX,WAAW,EACX,UAAU,EACV,WAAW,EACX,iBAAiB,EACjB,kBAAkB,EAClB,WAAW,GACd,MAAM,gBAAgB,CAAC;AACxB,YAAY,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
export { ExtractionPipeline, extract } from './core/pipeline.js';
|
|
7
7
|
export { validateExtractedData } from './core/validator.js';
|
|
8
8
|
export { PipelineError, PipelineErrorCodes } from './core/errors.js';
|
|
9
|
+
export { stripHtml, preprocess, preprocessWithDetails, resolveHtmlStripOptions, } from './core/preprocessor.js';
|
|
9
10
|
// Schema exports
|
|
10
11
|
export { loadSchema, parseSchema, loadSchemaFromObject } from './schemas/loader.js';
|
|
11
12
|
export { validateSchema } from './schemas/validator.js';
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,wBAAwB;AACxB,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,wBAAwB;AACxB,OAAO,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACrE,OAAO,EACH,SAAS,EACT,UAAU,EACV,qBAAqB,EACrB,uBAAuB,GAC1B,MAAM,wBAAwB,CAAC;AAWhC,iBAAiB;AACjB,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AACpF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,qBAAqB,EAAE,UAAU,IAAI,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAS5F,cAAc;AACd,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC"}
|
package/dist/schemas/loader.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/schemas/loader.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/schemas/loader.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAGvC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,qBAAqB,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEhE;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,QAAgB;IAC7C,IAAI,CAAC;QACD,uCAAuC;QACvC,MAAM,EAAE,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;IACjD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,MAAM,IAAI,qBAAqB,CAC3B,4BAA4B,QAAQ,EAAE,EACtC,UAAU,CAAC,YAAY,EACvB,SAAS,EACT,EAAE,QAAQ,EAAE,KAAK,EAAG,KAAe,CAAC,OAAO,EAAE,CAChD,CAAC;IACN,CAAC;IAED,oBAAoB;IACpB,IAAI,OAAe,CAAC;IACpB,IAAI,CAAC;QACD,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,MAAM,IAAI,qBAAqB,CAC3B,+BAAgC,KAAe,CAAC,OAAO,EAAE,EACzD,UAAU,CAAC,YAAY,EACvB,SAAS,EACT,EAAE,QAAQ,EAAE,CACf,CAAC;IACN,CAAC;IAED,aAAa;IACb,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,MAAM,IAAI,qBAAqB,CAC3B,gCAAiC,KAAe,CAAC,OAAO,EAAE,EAC1D,UAAU,CAAC,YAAY,EACvB,SAAS,EACT,EAAE,QAAQ,EAAE,CACf,CAAC;IACN,CAAC;IAED,4BAA4B;IAC5B,cAAc,CAAC,MAAM,CAAC,CAAC;IAEvB,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,WAAW,CAAC,UAAkB;IAC1C,IAAI,MAAe,CAAC;IAEpB,IAAI,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,MAAM,IAAI,qBAAqB,CAC3B,iBAAkB,KAAe,CAAC,OAAO,EAAE,EAC3C,UAAU,CAAC,YAAY,CAC1B,CAAC;IACN,CAAC;IAED,cAAc,CAAC,MAAM,CAAC,CAAC;IACvB,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,oBAAoB,CAAC,GAAY;IAC7C,cAAc,CAAC,GAAG,CAAC,CAAC;IACpB,OAAO,GAAG,CAAC;AACf,CAAC"}
|
package/dist/schemas/types.d.ts
CHANGED
|
@@ -5,8 +5,9 @@
|
|
|
5
5
|
*/
|
|
6
6
|
/**
|
|
7
7
|
* Supported field types in schema definitions
|
|
8
|
+
* Note: For dates, use type='string' with format='date-time'
|
|
8
9
|
*/
|
|
9
|
-
export type FieldType = 'string' | 'number' | '
|
|
10
|
+
export type FieldType = 'string' | 'number' | 'integer' | 'boolean';
|
|
10
11
|
/**
|
|
11
12
|
* Field definition within a schema
|
|
12
13
|
*/
|
|
@@ -14,6 +15,7 @@ export interface FieldDefinition {
|
|
|
14
15
|
type: FieldType;
|
|
15
16
|
description?: string;
|
|
16
17
|
optional?: boolean;
|
|
18
|
+
format?: string;
|
|
17
19
|
enum?: string[];
|
|
18
20
|
min?: number;
|
|
19
21
|
max?: number;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/schemas/types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/schemas/types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,QAAQ,GAAG,QAAQ,GAAG,SAAS,GAAG,SAAS,CAAC;AAEpE;;GAEG;AACH,MAAM,WAAW,eAAe;IAC5B,IAAI,EAAE,SAAS,CAAC;IAChB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,MAAM;IACnB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,eAAe,CAAC,CAAC;IACxC,QAAQ,CAAC,EAAE;QACP,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,WAAW,CAAC,EAAE,MAAM,CAAC;KACxB,CAAC;IACF,UAAU,CAAC,EAAE;QACT,SAAS,EAAE,MAAM,CAAC;QAClB,mBAAmB,EAAE,OAAO,CAAC;KAChC,CAAC;IACF,MAAM,CAAC,EAAE;QACL,sBAAsB,CAAC,EAAE,OAAO,CAAC;KACpC,CAAC;CACL;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,KAAK,EAAE,OAAO,CAAC;IACf,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC9C;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC7B,OAAO,EAAE,OAAO,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,cAAc,EAAE,OAAO,CAAC;IACxB,QAAQ,CAAC,EAAE;QACP,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,KAAK,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;CACL"}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Schema validator - validates schema definitions
|
|
3
3
|
*/
|
|
4
4
|
import { SchemaValidationError, ErrorCodes } from './errors.js';
|
|
5
|
-
const VALID_FIELD_TYPES = ['string', 'number', '
|
|
5
|
+
const VALID_FIELD_TYPES = ['string', 'number', 'integer', 'boolean'];
|
|
6
6
|
/**
|
|
7
7
|
* Validates a schema definition
|
|
8
8
|
*
|
|
@@ -82,23 +82,21 @@ function validateFieldDefinition(fieldName, fieldDef) {
|
|
|
82
82
|
}
|
|
83
83
|
// Type-specific validations
|
|
84
84
|
const fieldType = def.type;
|
|
85
|
-
if (fieldType === '
|
|
86
|
-
validateEnumField(fieldName, def);
|
|
87
|
-
}
|
|
88
|
-
if (fieldType === 'number') {
|
|
85
|
+
if (fieldType === 'number' || fieldType === 'integer') {
|
|
89
86
|
validateNumberField(fieldName, def);
|
|
90
87
|
}
|
|
91
88
|
if (fieldType === 'string') {
|
|
92
89
|
validateStringField(fieldName, def);
|
|
90
|
+
// Validate enum constraint if present
|
|
91
|
+
if (def.enum) {
|
|
92
|
+
validateEnumConstraint(fieldName, def);
|
|
93
|
+
}
|
|
93
94
|
}
|
|
94
95
|
}
|
|
95
96
|
/**
|
|
96
|
-
* Validates enum
|
|
97
|
+
* Validates enum constraint on string fields
|
|
97
98
|
*/
|
|
98
|
-
function
|
|
99
|
-
if (!def.enum) {
|
|
100
|
-
throw new SchemaValidationError(`Field '${fieldName}' with type 'enum' must have an 'enum' property`, ErrorCodes.MISSING_ENUM_VALUES, fieldName);
|
|
101
|
-
}
|
|
99
|
+
function validateEnumConstraint(fieldName, def) {
|
|
102
100
|
if (!Array.isArray(def.enum)) {
|
|
103
101
|
throw new SchemaValidationError(`Field '${fieldName}' enum property must be an array`, ErrorCodes.INVALID_ENUM_VALUE, fieldName);
|
|
104
102
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"validator.js","sourceRoot":"","sources":["../../src/schemas/validator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EAAE,qBAAqB,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEhE,MAAM,iBAAiB,GAAgB,CAAC,QAAQ,EAAE,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"validator.js","sourceRoot":"","sources":["../../src/schemas/validator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EAAE,qBAAqB,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEhE,MAAM,iBAAiB,GAAgB,CAAC,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;AAElF;;;;;GAKG;AACH,MAAM,UAAU,cAAc,CAAC,MAAe;IAC1C,+BAA+B;IAC/B,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QACxC,MAAM,IAAI,qBAAqB,CAC3B,0BAA0B,EAC1B,UAAU,CAAC,YAAY,CAC1B,CAAC;IACN,CAAC;IAED,MAAM,SAAS,GAAG,MAAiC,CAAC;IAEpD,uCAAuC;IACvC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC;QACpB,MAAM,IAAI,qBAAqB,CAC3B,yCAAyC,EACzC,UAAU,CAAC,cAAc,CAC5B,CAAC;IACN,CAAC;IAED,IAAI,OAAO,SAAS,CAAC,MAAM,KAAK,QAAQ,IAAI,SAAS,CAAC,MAAM,KAAK,IAAI,EAAE,CAAC;QACpE,MAAM,IAAI,qBAAqB,CAC3B,4BAA4B,EAC5B,UAAU,CAAC,cAAc,CAC5B,CAAC;IACN,CAAC;IAED,MAAM,MAAM,GAAG,SAAS,CAAC,MAAiC,CAAC;IAE3D,2BAA2B;IAC3B,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACnC,MAAM,IAAI,qBAAqB,CAC3B,wCAAwC,EACxC,UAAU,CAAC,cAAc,CAC5B,CAAC;IACN,CAAC;IAED,iCAAiC;IACjC,KAAK,MAAM,CAAC,SAAS,EAAE,QAAQ,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QACzD,iBAAiB,CAAC,SAAS,CAAC,CAAC;QAC7B,uBAAuB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;IACjD,CAAC;IAED,+BAA+B;IAC/B,IAAI,SAAS,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;QACnC,gBAAgB,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;IACzC,CAAC;IAED,+CAA+C;IAC/C,IAAI,SAAS,CAAC,UAAU,KAAK,SAAS,EAAE,CAAC;QACrC,wBAAwB,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IACnD,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,SAAiB;IACxC,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;QACxC,MAAM,IAAI,qBAAqB,CAC3B,4BAA4B,EAC5B,UAAU,CAAC,kBAAkB,CAChC,CAAC;IACN,CAAC;IAED,wEAAwE;IACxE,IAAI,CAAC,0BAA0B,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9C,MAAM,IAAI,qBAAqB,CAC3B,uBAAuB,SAAS,gHAAgH,EAChJ,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,uBAAuB,CAAC,SAAiB,EAAE,QAAiB;IACjE,IAAI,CAAC,QAAQ,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,CAAC;QAC5C,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,gCAAgC,EACnD,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;IAED,MAAM,GAAG,GAAG,QAAmC,CAAC;IAEhD
,qCAAqC;IACrC,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;QACZ,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,uCAAuC,EAC1D,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;IAED,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC/B,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,yBAAyB,EAC5C,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;IAED,sBAAsB;IACtB,IAAI,CAAC,iBAAiB,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAiB,CAAC,EAAE,CAAC;QACrD,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,uBAAuB,GAAG,CAAC,IAAI,uBAAuB,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EACvG,UAAU,CAAC,kBAAkB,EAC7B,SAAS,EACT,EAAE,UAAU,EAAE,iBAAiB,EAAE,YAAY,EAAE,GAAG,CAAC,IAAI,EAAE,CAC5D,CAAC;IACN,CAAC;IAED,wCAAwC;IACxC,IAAI,GAAG,CAAC,QAAQ,KAAK,SAAS,IAAI,OAAO,GAAG,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;QAClE,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,uCAAuC,EAC1D,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;IAED,kCAAkC;IAClC,IAAI,GAAG,CAAC,WAAW,KAAK,SAAS,IAAI,OAAO,GAAG,CAAC,WAAW,KAAK,QAAQ,EAAE,CAAC;QACvE,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,gCAAgC,EACnD,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;IAED,4BAA4B;IAC5B,MAAM,SAAS,GAAG,GAAG,CAAC,IAAiB,CAAC;IAExC,IAAI,SAAS,KAAK,QAAQ,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;QACpD,mBAAmB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;IACxC,CAAC;IAED,IAAI,SAAS,KAAK,QAAQ,EAAE,CAAC;QACzB,mBAAmB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QACpC,sCAAsC;QACtC,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC;YACX,sBAAsB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QAC3C,CAAC;IACL,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,sBAAsB,CAAC,SAAiB,EAAE,GAA4B;IAC3E,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,kCAAkC,EACrD,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;IAED,IAAI,GAAG,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,8BAA8B,EACjD,UAAU,CAAC,iBAAiB,EAC5B,SAAS,CACZ,CAAC;IACN,CAAC;IAED,oCAAoC;IACpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC1B,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC5B,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,yBAAyB,CAAC,0BAA0B,OAAO,KAAK,EAAE,
EACrF,UAAU,CAAC,kBAAkB,EAC7B,SAAS,EACT,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,CACtB,CAAC;QACN,CAAC;IACL,CAAC;IAED,6BAA6B;IAC7B,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,YAAY,CAAC,IAAI,KAAK,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACxC,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,kCAAkC,EACrD,UAAU,CAAC,oBAAoB,EAC/B,SAAS,CACZ,CAAC;IACN,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,SAAiB,EAAE,GAA4B;IACxE,IAAI,GAAG,CAAC,GAAG,KAAK,SAAS,EAAE,CAAC;QACxB,IAAI,OAAO,GAAG,CAAC,GAAG,KAAK,QAAQ,EAAE,CAAC;YAC9B,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,mCAAmC,EACtD,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;QACN,CAAC;IACL,CAAC;IAED,IAAI,GAAG,CAAC,GAAG,KAAK,SAAS,EAAE,CAAC;QACxB,IAAI,OAAO,GAAG,CAAC,GAAG,KAAK,QAAQ,EAAE,CAAC;YAC9B,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,mCAAmC,EACtD,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;QACN,CAAC;IACL,CAAC;IAED,IAAI,GAAG,CAAC,GAAG,KAAK,SAAS,IAAI,GAAG,CAAC,GAAG,KAAK,SAAS,EAAE,CAAC;QACjD,IAAK,GAAG,CAAC,GAAc,GAAI,GAAG,CAAC,GAAc,EAAE,CAAC;YAC5C,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,gBAAgB,GAAG,CAAC,GAAG,uCAAuC,GAAG,CAAC,GAAG,GAAG,EAC3F,UAAU,CAAC,mBAAmB,EAC9B,SAAS,EACT,EAAE,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,CACjC,CAAC;QACN,CAAC;IACL,CAAC;IAED,oCAAoC;IACpC,IAAI,GAAG,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;QACzB,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,kDAAkD,EACrE,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,SAAiB,EAAE,GAA4B;IACxE,IAAI,GAAG,CAAC,OAAO,KAAK,SAAS,EAAE,CAAC;QAC5B,IAAI,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YAClC,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,uCAAuC,EAC1D,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;QACN,CAAC;QAED,yBAAyB;QACzB,IAAI,CAAC;YACD,IAAI,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACb,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,gCAAiC,KAAe,CAAC,OAAO,EAAE,EAC7E,UAAU,CAAC,eAAe,EAC1B,SAAS,EACT,EAAE,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,CAC3B,CAAC;QACN,CAAC;IACL,CAAC;IAED,sEAAsE;IACtE,IAAI,GAAG,CAAC,GAAG,KAAK,SAAS,IAAI,GAAG,CAAC,GAAG,KAAK,SAAS,EAAE,CAAC;QACjD,MAAM,IAAI,qBAAqB,CAC3B,UAAU,SAAS,4DAA4D,EAC
/E,UAAU,CAAC,kBAAkB,EAC7B,SAAS,CACZ,CAAC;IACN,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,QAAiB;IACvC,IAAI,OAAO,QAAQ,KAAK,QAAQ,IAAI,QAAQ,KAAK,IAAI,EAAE,CAAC;QACpD,MAAM,IAAI,qBAAqB,CAC3B,mCAAmC,EACnC,UAAU,CAAC,YAAY,CAC1B,CAAC;IACN,CAAC;IAED,MAAM,IAAI,GAAG,QAAmC,CAAC;IAEjD,IAAI,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,OAAO,IAAI,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC3D,MAAM,IAAI,qBAAqB,CAC3B,uCAAuC,EACvC,UAAU,CAAC,YAAY,CAC1B,CAAC;IACN,CAAC;IAED,IAAI,IAAI,CAAC,OAAO,KAAK,SAAS,IAAI,OAAO,IAAI,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;QACjE,MAAM,IAAI,qBAAqB,CAC3B,0CAA0C,EAC1C,UAAU,CAAC,YAAY,CAC1B,CAAC;IACN,CAAC;IAED,IAAI,IAAI,CAAC,WAAW,KAAK,SAAS,IAAI,OAAO,IAAI,CAAC,WAAW,KAAK,QAAQ,EAAE,CAAC;QACzE,MAAM,IAAI,qBAAqB,CAC3B,8CAA8C,EAC9C,UAAU,CAAC,YAAY,CAC1B,CAAC;IACN,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,wBAAwB,CAAC,UAAmB;IACjD,IAAI,OAAO,UAAU,KAAK,QAAQ,IAAI,UAAU,KAAK,IAAI,EAAE,CAAC;QACxD,MAAM,IAAI,qBAAqB,CAC3B,4CAA4C,EAC5C,UAAU,CAAC,yBAAyB,CACvC,CAAC;IACN,CAAC;IAED,MAAM,MAAM,GAAG,UAAqC,CAAC;IAErD,0CAA0C;IAC1C,IAAI,MAAM,CAAC,SAAS,KAAK,SAAS,EAAE,CAAC;QACjC,MAAM,IAAI,qBAAqB,CAC3B,yDAAyD,EACzD,UAAU,CAAC,yBAAyB,CACvC,CAAC;IACN,CAAC;IAED,IAAI,OAAO,MAAM,CAAC,SAAS,KAAK,QAAQ,EAAE,CAAC;QACvC,MAAM,IAAI,qBAAqB,CAC3B,uCAAuC,EACvC,UAAU,CAAC,yBAAyB,EACpC,SAAS,EACT,EAAE,QAAQ,EAAE,OAAO,MAAM,CAAC,SAAS,EAAE,CACxC,CAAC;IACN,CAAC;IAED,mCAAmC;IACnC,IAAI,MAAM,CAAC,SAAS,GAAG,CAAC,IAAI,MAAM,CAAC,SAAS,GAAG,GAAG,EAAE,CAAC;QACjD,MAAM,IAAI,qBAAqB,CAC3B,uDAAuD,MAAM,CAAC,SAAS,EAAE,EACzE,UAAU,CAAC,yBAAyB,EACpC,SAAS,EACT,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAClC,CAAC;IACN,CAAC;IAED,oDAAoD;IACpD,IAAI,MAAM,CAAC,mBAAmB,KAAK,SAAS,EAAE,CAAC;QAC3C,MAAM,IAAI,qBAAqB,CAC3B,mEAAmE,EACnE,UAAU,CAAC,yBAAyB,CACvC,CAAC;IACN,CAAC;IAED,IAAI,OAAO,MAAM,CAAC,mBAAmB,KAAK,SAAS,EAAE,CAAC;QAClD,MAAM,IAAI,qBAAqB,CAC3B,uCAAuC,EACvC,UAAU,CAAC,yBAAyB,EACpC,SAAS,EACT,EAAE,QAAQ,EAAE,OAAO,MAAM,CAAC,mBAAmB,EAAE,CAClD,CAAC;IACN,CAAC;AACL,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ordis-dev/ordis",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Schema-first LLM extraction tool that turns unstructured text into validated structured data",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -34,7 +34,15 @@
|
|
|
34
34
|
"validation",
|
|
35
35
|
"cli",
|
|
36
36
|
"openai",
|
|
37
|
-
"ollama"
|
|
37
|
+
"ollama",
|
|
38
|
+
"cross-platform",
|
|
39
|
+
"deno",
|
|
40
|
+
"bun",
|
|
41
|
+
"nodejs",
|
|
42
|
+
"webstandards",
|
|
43
|
+
"typescript",
|
|
44
|
+
"javascript",
|
|
45
|
+
"jsonschema"
|
|
38
46
|
],
|
|
39
47
|
"author": "Ordis",
|
|
40
48
|
"license": "MIT",
|
|
@@ -56,5 +64,8 @@
|
|
|
56
64
|
"tsx": "^4.21.0",
|
|
57
65
|
"typescript": "^5.9.3",
|
|
58
66
|
"vitest": "^4.0.15"
|
|
67
|
+
},
|
|
68
|
+
"dependencies": {
|
|
69
|
+
"node-html-parser": "^7.0.2"
|
|
59
70
|
}
|
|
60
|
-
}
|
|
71
|
+
}
|