@intuned/browser-dev 0.1.15-dev.1 → 0.1.16-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +3 -0
- package/dist/ai/tests/extractStructuredDataDomMatchingCache.spec.js +87 -0
- package/dist/ai/tests/testMatching.spec.js +38 -0
- package/dist/ai/tests/testValidateMatchesMapping.spec.js +58 -0
- package/dist/common/matching/matching.js +3 -3
- package/dist/common/xpathMapping.js +23 -10
- package/dist/helpers/downloadFile.js +3 -0
- package/dist/helpers/saveFileToS3.js +3 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +16 -0
- package/dist/optimized-extractors/common/aiModelsValidations.js +2 -21
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +2 -3
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +1 -4
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +1 -2
- package/dist/optimized-extractors/common/findTableHeaders.js +2 -2
- package/dist/optimized-extractors/common/index.js +4 -4
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +2 -2
- package/dist/optimized-extractors/common/matching/utils.js +4 -2
- package/dist/optimized-extractors/common/modelStringSupport.test.js +82 -0
- package/dist/optimized-extractors/export.d.ts +2 -50
- package/dist/optimized-extractors/extractArray.js +3 -1
- package/dist/optimized-extractors/index.d.ts +2 -50
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +366 -1
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +43 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +4 -3
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +32 -23
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +2 -2
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +2 -1
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +208 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +4 -2
- package/dist/optimized-extractors/validators.js +5 -6
- package/package.json +1 -1
- package/dist/optimized-extractors/types/aiModelsValidation.js +0 -45
package/.prettierrc
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _extendedTest = require("../../common/extendedTest");
|
|
4
|
+
var _ = require("..");
|
|
5
|
+
var _neverthrow = require("neverthrow");
|
|
6
|
+
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
7
|
+
const DATA_SCHEMA = {
|
|
8
|
+
type: "object",
|
|
9
|
+
required: ["phone"],
|
|
10
|
+
properties: {
|
|
11
|
+
name: {
|
|
12
|
+
type: "string",
|
|
13
|
+
description: "the contact name"
|
|
14
|
+
},
|
|
15
|
+
phone: {
|
|
16
|
+
type: "string",
|
|
17
|
+
description: "the contact phone"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
};
|
|
21
|
+
const CONTACT_HTML = `<div class="contact"><p>Andreas Jansson</p><p>PH: (727) 471-4768</p></div>`;
|
|
22
|
+
function makeOptions(page) {
|
|
23
|
+
return {
|
|
24
|
+
source: page,
|
|
25
|
+
dataSchema: DATA_SCHEMA,
|
|
26
|
+
strategy: "HTML",
|
|
27
|
+
enableDomMatching: true,
|
|
28
|
+
enableCache: true,
|
|
29
|
+
model: "claude-sonnet-4-20250514"
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
async function mockExtractStructuredDataUsingAi(result) {
|
|
33
|
+
const aiModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../extractStructuredDataUsingAi")));
|
|
34
|
+
return _extendedTest.vi.spyOn(aiModule, "extractStructuredDataUsingAi").mockResolvedValue((0, _neverthrow.ok)({
|
|
35
|
+
result,
|
|
36
|
+
xpathMapping: {}
|
|
37
|
+
}));
|
|
38
|
+
}
|
|
39
|
+
async function useInMemoryCache() {
|
|
40
|
+
const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../intunedServices/cache/cache")));
|
|
41
|
+
const store = new Map();
|
|
42
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
|
|
43
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
|
|
44
|
+
store.set(key, value);
|
|
45
|
+
});
|
|
46
|
+
}
|
|
47
|
+
(0, _extendedTest.describe)("extractStructuredData - DOM matching cache (enableDomMatching: true)", () => {
|
|
48
|
+
(0, _extendedTest.afterEach)(() => {
|
|
49
|
+
_extendedTest.vi.restoreAllMocks();
|
|
50
|
+
});
|
|
51
|
+
(0, _extendedTest.test)("reuses the cache on a second call, even when a value is a partial match", async ({
|
|
52
|
+
page
|
|
53
|
+
}) => {
|
|
54
|
+
const aiResult = {
|
|
55
|
+
name: "Andreas Jansson",
|
|
56
|
+
phone: "(727) 471-4768"
|
|
57
|
+
};
|
|
58
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
|
|
59
|
+
await useInMemoryCache();
|
|
60
|
+
await page.setContent(CONTACT_HTML);
|
|
61
|
+
const first = await (0, _.extractStructuredData)(makeOptions(page));
|
|
62
|
+
(0, _extendedTest.expect)(first).toEqual(aiResult);
|
|
63
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
64
|
+
const second = await (0, _.extractStructuredData)(makeOptions(page));
|
|
65
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
66
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
67
|
+
});
|
|
68
|
+
(0, _extendedTest.test)("re-extracts when an extracted value's DOM text changes", async ({
|
|
69
|
+
page
|
|
70
|
+
}) => {
|
|
71
|
+
const aiResult = {
|
|
72
|
+
name: "Andreas Jansson",
|
|
73
|
+
phone: "(727) 471-4768"
|
|
74
|
+
};
|
|
75
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
|
|
76
|
+
await useInMemoryCache();
|
|
77
|
+
await page.setContent(CONTACT_HTML);
|
|
78
|
+
await (0, _.extractStructuredData)(makeOptions(page));
|
|
79
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
80
|
+
await page.evaluate(() => {
|
|
81
|
+
const ps = document.querySelectorAll("div.contact p");
|
|
82
|
+
if (ps[1]) ps[1].textContent = "PH: (999) 999-9999";
|
|
83
|
+
});
|
|
84
|
+
await (0, _.extractStructuredData)(makeOptions(page));
|
|
85
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
|
|
86
|
+
});
|
|
87
|
+
});
|
|
@@ -260,6 +260,44 @@ var _matching = require("../../common/matching/matching");
|
|
|
260
260
|
});
|
|
261
261
|
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchText).toBe("Hello");
|
|
262
262
|
});
|
|
263
|
+
(0, _extendedTest.test)("should prefer a full match over a partial match", async () => {
|
|
264
|
+
const matches = [{
|
|
265
|
+
matched_value: "Hello",
|
|
266
|
+
match_mode: "partial",
|
|
267
|
+
fuzzy_distance: 0,
|
|
268
|
+
match_source: "text_content",
|
|
269
|
+
xpath: "/html/body/div[1]"
|
|
270
|
+
}, {
|
|
271
|
+
matched_value: "Hello World",
|
|
272
|
+
match_mode: "full",
|
|
273
|
+
fuzzy_distance: 0,
|
|
274
|
+
match_source: "text_content",
|
|
275
|
+
xpath: "/html/body/div[2]"
|
|
276
|
+
}];
|
|
277
|
+
const result = (0, _matching.selectBestMatch)({
|
|
278
|
+
original: "Hello World",
|
|
279
|
+
matches
|
|
280
|
+
});
|
|
281
|
+
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchText).toBe("Hello World");
|
|
282
|
+
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchXpath).toBe("/html/body/div[2]");
|
|
283
|
+
});
|
|
284
|
+
(0, _extendedTest.test)("should preserve the attribute name for attribute matches", async () => {
|
|
285
|
+
const matches = [{
|
|
286
|
+
matched_value: "https://example.com/x.pdf",
|
|
287
|
+
match_mode: "full",
|
|
288
|
+
fuzzy_distance: 0,
|
|
289
|
+
match_source: "attribute",
|
|
290
|
+
attribute: "href",
|
|
291
|
+
xpath: "/html/body/a[1]"
|
|
292
|
+
}];
|
|
293
|
+
const result = (0, _matching.selectBestMatch)({
|
|
294
|
+
original: "https://example.com/x.pdf",
|
|
295
|
+
matches
|
|
296
|
+
});
|
|
297
|
+
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchType).toEqual({
|
|
298
|
+
attribute: "href"
|
|
299
|
+
});
|
|
300
|
+
});
|
|
263
301
|
});
|
|
264
302
|
(0, _extendedTest.describe)("matchStringsWithDomContent", () => {
|
|
265
303
|
let browser;
|
|
@@ -262,4 +262,62 @@ const IFRAME_WITH_EXTRA_CONTENT = `
|
|
|
262
262
|
(0, _extendedTest.expect)(isValid2).toBe(true);
|
|
263
263
|
(0, _extendedTest.expect)(isValid3).toBe(true);
|
|
264
264
|
});
|
|
265
|
+
(0, _extendedTest.test)("PARTIAL match (value is a substring of the element text, PH: prefix) stays valid", async () => {
|
|
266
|
+
await page.setContent(`<div class="contact"><p>PH: (727) 471-4768</p></div>`);
|
|
267
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
268
|
+
phone: "(727) 471-4768"
|
|
269
|
+
});
|
|
270
|
+
(0, _extendedTest.expect)(cachedMapping["(727) 471-4768"].length).toBeGreaterThan(0);
|
|
271
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
272
|
+
});
|
|
273
|
+
(0, _extendedTest.test)("value sitting among sibling direct text nodes (br-separated block) stays valid", async () => {
|
|
274
|
+
await page.setContent(`
|
|
275
|
+
<table><tbody><tr><td class="vendor">
|
|
276
|
+
Acme Corp<br/>
|
|
277
|
+
Email: sales@acme.com<br/>
|
|
278
|
+
FAX: (630) 904-4118
|
|
279
|
+
</td></tr></tbody></table>
|
|
280
|
+
`);
|
|
281
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
282
|
+
phone: "(630) 904-4118",
|
|
283
|
+
email: "sales@acme.com"
|
|
284
|
+
});
|
|
285
|
+
(0, _extendedTest.expect)(cachedMapping["(630) 904-4118"].length).toBeGreaterThan(0);
|
|
286
|
+
(0, _extendedTest.expect)(cachedMapping["sales@acme.com"].length).toBeGreaterThan(0);
|
|
287
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
288
|
+
});
|
|
289
|
+
(0, _extendedTest.test)("href / attribute match (relative href resolved to absolute) stays valid", async () => {
|
|
290
|
+
await page.setContent(`<!DOCTYPE html><html><head><base href="https://example.com/files/"></head><body>` + `<div class="doc"><a href="annual-report.pdf">Annual Report</a></div></body></html>`);
|
|
291
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
292
|
+
href: "annual-report.pdf",
|
|
293
|
+
name: "Annual Report"
|
|
294
|
+
});
|
|
295
|
+
(0, _extendedTest.expect)(cachedMapping["annual-report.pdf"].length).toBeGreaterThan(0);
|
|
296
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
297
|
+
});
|
|
298
|
+
(0, _extendedTest.test)("FUZZY match (extracted value has a small typo) stays valid", async () => {
|
|
299
|
+
await page.setContent(`<div class="vendor"><span class="name">International Cleaning Services</span></div>`);
|
|
300
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
301
|
+
name: "Internationl Cleaning Servces"
|
|
302
|
+
});
|
|
303
|
+
(0, _extendedTest.expect)(cachedMapping["Internationl Cleaning Servces"].length).toBeGreaterThan(0);
|
|
304
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
305
|
+
});
|
|
306
|
+
(0, _extendedTest.test)("irregular internal whitespace stays valid", async () => {
|
|
307
|
+
await page.setContent(`<div class="vendor"><span>Audioconferencing systems</span></div>`);
|
|
308
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
309
|
+
category: "Audioconferencing systems"
|
|
310
|
+
});
|
|
311
|
+
(0, _extendedTest.expect)(cachedMapping["Audioconferencing systems"].length).toBeGreaterThan(0);
|
|
312
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
313
|
+
});
|
|
314
|
+
(0, _extendedTest.test)("invalidates when a partial-matched value's text changes", async () => {
|
|
315
|
+
await page.setContent(`<div class="contact"><p>PH: (727) 471-4768</p></div>`);
|
|
316
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
317
|
+
phone: "(727) 471-4768"
|
|
318
|
+
});
|
|
319
|
+
(0, _extendedTest.expect)(cachedMapping["(727) 471-4768"].length).toBeGreaterThan(0);
|
|
320
|
+
await page.setContent(`<div class="contact"><p>PH: (999) 999-9999</p></div>`);
|
|
321
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(false);
|
|
322
|
+
});
|
|
265
323
|
});
|
|
@@ -122,12 +122,12 @@ function selectBestMatch({
|
|
|
122
122
|
}) {
|
|
123
123
|
const exactMatches = matches.filter(match => match.match_mode !== "fuzzy");
|
|
124
124
|
if (exactMatches.length > 0) {
|
|
125
|
-
const bestMatch = exactMatches[0];
|
|
125
|
+
const bestMatch = exactMatches.find(match => match.match_mode === "full") ?? exactMatches[0];
|
|
126
126
|
return {
|
|
127
127
|
matchText: bestMatch.matched_value,
|
|
128
128
|
matchXpath: bestMatch.xpath,
|
|
129
129
|
matchType: bestMatch.match_source === "direct_text_node" ? "direct-text" : bestMatch.match_source === "attribute" ? {
|
|
130
|
-
attribute: "unknown"
|
|
130
|
+
attribute: bestMatch.attribute ?? "unknown"
|
|
131
131
|
} : "all-text"
|
|
132
132
|
};
|
|
133
133
|
}
|
|
@@ -148,7 +148,7 @@ function selectBestMatch({
|
|
|
148
148
|
matchText: bestMatch.matched_value,
|
|
149
149
|
matchXpath: bestMatch.xpath,
|
|
150
150
|
matchType: bestMatch.match_source === "direct_text_node" ? "direct-text" : bestMatch.match_source === "attribute" ? {
|
|
151
|
-
attribute: "unknown"
|
|
151
|
+
attribute: bestMatch.attribute ?? "unknown"
|
|
152
152
|
} : "all-text"
|
|
153
153
|
};
|
|
154
154
|
}
|
|
@@ -6,13 +6,15 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
6
6
|
exports.createXPathMapping = createXPathMapping;
|
|
7
7
|
exports.validateXPathMapping = validateXPathMapping;
|
|
8
8
|
var _ensureBrowserScripts = require("./ensureBrowserScripts");
|
|
9
|
+
var _utils = require("../optimized-extractors/common/matching/utils");
|
|
9
10
|
async function validateXPathMapping(page, cachedMapping, prefix) {
|
|
10
11
|
try {
|
|
11
12
|
for (const [expectedText, entries] of Object.entries(cachedMapping)) {
|
|
12
13
|
for (const entry of entries) {
|
|
13
14
|
const {
|
|
14
15
|
xpath,
|
|
15
|
-
matchType
|
|
16
|
+
matchType,
|
|
17
|
+
sourceText
|
|
16
18
|
} = entry;
|
|
17
19
|
const elementExists = await page.evaluate(({
|
|
18
20
|
xpath,
|
|
@@ -27,34 +29,45 @@ async function validateXPathMapping(page, cachedMapping, prefix) {
|
|
|
27
29
|
if (!elementExists) {
|
|
28
30
|
return false;
|
|
29
31
|
}
|
|
30
|
-
const
|
|
32
|
+
const actualTexts = await page.evaluate(({
|
|
31
33
|
xpath,
|
|
32
34
|
prefix,
|
|
33
35
|
matchType
|
|
34
36
|
}) => {
|
|
35
37
|
const element = document.evaluate(prefix ? `${prefix}/${xpath}` : xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
|
|
36
|
-
if (!element) return
|
|
38
|
+
if (!element) return [];
|
|
37
39
|
if (matchType === "direct-text") {
|
|
38
|
-
|
|
40
|
+
const nodes = [];
|
|
39
41
|
for (const child of element.childNodes) {
|
|
40
42
|
if (child.nodeType === Node.TEXT_NODE) {
|
|
41
|
-
|
|
43
|
+
const t = (child.textContent || "").trim();
|
|
44
|
+
if (t) nodes.push(t);
|
|
42
45
|
}
|
|
43
46
|
}
|
|
44
|
-
return
|
|
47
|
+
return nodes;
|
|
45
48
|
} else if (matchType === "all-text") {
|
|
46
49
|
var _element$textContent;
|
|
47
|
-
return ((_element$textContent = element.textContent) === null || _element$textContent === void 0 ? void 0 : _element$textContent.trim()) || "";
|
|
50
|
+
return [((_element$textContent = element.textContent) === null || _element$textContent === void 0 ? void 0 : _element$textContent.trim()) || ""];
|
|
48
51
|
} else if (typeof matchType === "object" && matchType.attribute) {
|
|
49
|
-
|
|
52
|
+
const attr = matchType.attribute;
|
|
53
|
+
const raw = element.getAttribute(attr) || "";
|
|
54
|
+
const candidates = [raw];
|
|
55
|
+
const reflected = element[attr];
|
|
56
|
+
if (typeof reflected === "string" && reflected) {
|
|
57
|
+
candidates.push(reflected);
|
|
58
|
+
}
|
|
59
|
+
return candidates;
|
|
50
60
|
}
|
|
51
|
-
return
|
|
61
|
+
return [];
|
|
52
62
|
}, {
|
|
53
63
|
xpath,
|
|
54
64
|
prefix,
|
|
55
65
|
matchType
|
|
56
66
|
});
|
|
57
|
-
|
|
67
|
+
const expectedFull = sourceText ?? expectedText;
|
|
68
|
+
const normExpected = (0, _utils.normalizeSpacing)(expectedFull);
|
|
69
|
+
const matched = actualTexts.some(t => (0, _utils.normalizeSpacing)(t) === normExpected);
|
|
70
|
+
if (!matched) {
|
|
58
71
|
return false;
|
|
59
72
|
}
|
|
60
73
|
}
|
|
@@ -30,6 +30,9 @@ const saveFileToS3 = async input => {
|
|
|
30
30
|
if (!page || typeof page.goto !== "function") {
|
|
31
31
|
throw new Error("page must be a playwright Page object");
|
|
32
32
|
}
|
|
33
|
+
if (!trigger) {
|
|
34
|
+
throw new Error("trigger is required");
|
|
35
|
+
}
|
|
33
36
|
const download = await (0, _downloadFile.downloadFile)({
|
|
34
37
|
page,
|
|
35
38
|
trigger,
|
|
@@ -279,6 +279,22 @@ _extendedTest.describe.skip("TestNotInGeneration", () => {
|
|
|
279
279
|
(0, _extendedTest.expect)(cancelled).toContain("canceled");
|
|
280
280
|
}
|
|
281
281
|
});
|
|
282
|
+
(0, _extendedTest.test)("should throw when trigger is missing (wrong param names)", async () => {
|
|
283
|
+
await page.setContent(content);
|
|
284
|
+
await (0, _extendedTest.expect)((0, _.downloadFile)({
|
|
285
|
+
page,
|
|
286
|
+
url: "https://example.com/file.pdf",
|
|
287
|
+
fileName: "test.pdf"
|
|
288
|
+
})).rejects.toThrow("trigger is required");
|
|
289
|
+
});
|
|
290
|
+
(0, _extendedTest.test)("saveFileToS3 should throw when trigger is missing (wrong param names)", async () => {
|
|
291
|
+
await page.setContent(content);
|
|
292
|
+
await (0, _extendedTest.expect)((0, _.saveFileToS3)({
|
|
293
|
+
page,
|
|
294
|
+
url: "https://example.com/file.pdf",
|
|
295
|
+
fileName: "test.pdf"
|
|
296
|
+
})).rejects.toThrow("trigger is required");
|
|
297
|
+
});
|
|
282
298
|
});
|
|
283
299
|
const SIGNED_URL = "https://bucket.r2.cloudflarestorage.com/img.jpg?X-Amz-Signature=abc";
|
|
284
300
|
(0, _extendedTest.describe)("SignedUrlAttachmentInTypedArray", () => {
|
|
@@ -3,30 +3,11 @@
|
|
|
3
3
|
Object.defineProperty(exports, "__esModule", {
|
|
4
4
|
value: true
|
|
5
5
|
});
|
|
6
|
-
exports.
|
|
7
|
-
const CLAUDE_MODELS = exports.CLAUDE_MODELS = ["claude-opus-4-20250514", "claude-sonnet-4-20250514", "claude-3-7-sonnet-20250219", "claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-5-haiku-20241022", "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"];
|
|
6
|
+
exports.MAX_TOKENS_OVERRIDES = void 0;
|
|
8
7
|
const MAX_TOKENS_OVERRIDES = exports.MAX_TOKENS_OVERRIDES = {
|
|
9
8
|
"claude-3-5-sonnet-20240620": 8192,
|
|
10
9
|
"gemini-1.5-pro-002": 8192,
|
|
11
10
|
"gemini-1.5-flash-8b-002": 8192,
|
|
12
11
|
"gemini-1.5-flash-002": 8192,
|
|
13
12
|
"gemini-2.0-flash-exp": 8192
|
|
14
|
-
};
|
|
15
|
-
const CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_VISION_SUPPORTED_MODELS = [...CLAUDE_MODELS];
|
|
16
|
-
const CLAUDE_MODELS_MAPPINGS = exports.CLAUDE_MODELS_MAPPINGS = {
|
|
17
|
-
"claude-3-haiku": "claude-3-haiku-20240307",
|
|
18
|
-
"claude-3-5-haiku": "claude-3-5-haiku-20241022",
|
|
19
|
-
"claude-3-opus": "claude-3-opus-20240229",
|
|
20
|
-
"claude-3-sonnet": "claude-3-sonnet-20240229",
|
|
21
|
-
"claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
|
|
22
|
-
"claude-4-sonnet": "claude-sonnet-4-20250514",
|
|
23
|
-
"claude-4-opus": "claude-opus-4-20250514"
|
|
24
|
-
};
|
|
25
|
-
const GPT_MODELS = exports.GPT_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-4o", "gpt-4o-mini", "gpt-4o-audio-preview", "gpt-4-turbo", "gpt-4", "gpt-3.5-turbo", "o1", "o1-mini", "o1-preview", "o3-mini", "o3", "o4-mini", "chatgpt-4o-latest", "gpt4-turbo"];
|
|
26
|
-
const GOOGLE_MODELS = exports.GOOGLE_MODELS = ["gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite", "gemini-2.5-flash-lite-preview-06-17", "gemini-2.0-flash", "gemini-1.5-pro", "gemini-1.5-pro-latest", "gemini-1.5-flash", "gemini-1.5-flash-latest", "gemini-1.5-flash-8b", "gemini-1.5-flash-8b-latest"];
|
|
27
|
-
const MODELS_MAPPINGS = exports.MODELS_MAPPINGS = {
|
|
28
|
-
...CLAUDE_MODELS_MAPPINGS
|
|
29
|
-
};
|
|
30
|
-
const SUPPPORTED_CLAUDE_MODELS = exports.SUPPPORTED_CLAUDE_MODELS = ["claude-3-5-haiku-20241022", "claude-3-5-haiku-latest", "claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022", "claude-3-5-sonnet-latest", "claude-3-7-sonnet-20250219", "claude-3-7-sonnet-latest", "claude-3-haiku-20240307", "claude-4-opus-20250514", "claude-4-sonnet-20250514", "claude-opus-4-1", "claude-opus-4-1-20250805", "claude-opus-4-20250514", "claude-sonnet-4-20250514"];
|
|
31
|
-
const SUPPPORTED_GPT_MODELS = exports.SUPPPORTED_GPT_MODELS = ["gpt-3.5-turbo", "gpt-3.5-turbo-0125", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-instruct", "gpt-3.5-turbo-instruct-0914", "gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613", "gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4.1", "gpt-4.1-2025-04-14", "gpt-4.1-mini", "gpt-4.1-mini-2025-04-14", "gpt-4.1-nano", "gpt-4.1-nano-2025-04-14", "gpt-4o", "gpt-4o-2024-05-13", "gpt-4o-2024-08-06", "gpt-4o-2024-11-20", "gpt-4o-mini", "gpt-4o-mini-2024-07-18", "gpt-5", "gpt-5-2025-08-07", "gpt-5-chat", "gpt-5-chat-latest", "gpt-5-mini", "gpt-5-mini-2025-08-07", "gpt-5-nano", "gpt-5-nano-2025-08-07", "o1", "o1-2024-12-17", "o1-mini", "o1-mini-2024-09-12", "o1-pro", "o1-pro-2025-03-19", "o3", "o3-2025-04-16", "o3-deep-research", "o3-deep-research-2025-06-26", "o3-mini", "o3-mini-2025-01-31", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-2025-04-16", "o4-mini-deep-research", "o4-mini-deep-research-2025-06-26"];
|
|
32
|
-
const SUPPORTED_MODELS = exports.SUPPORTED_MODELS = [...SUPPPORTED_CLAUDE_MODELS, ...SUPPPORTED_GPT_MODELS];
|
|
13
|
+
};
|
|
@@ -56,8 +56,7 @@ async function extractStructuredDataUsingClaude(input) {
|
|
|
56
56
|
const anthropic = (0, _anthropicModel.createAnthropicInstance)({
|
|
57
57
|
apiKey
|
|
58
58
|
});
|
|
59
|
-
const
|
|
60
|
-
const maxTokens = _aiModelsValidations.MAX_TOKENS_OVERRIDES[modelName] ?? 4096;
|
|
59
|
+
const maxTokens = _aiModelsValidations.MAX_TOKENS_OVERRIDES[model] ?? 4096;
|
|
61
60
|
const response = await (0, _neverthrow.fromPromise)(anthropic.messages.create({
|
|
62
61
|
max_tokens: maxTokens,
|
|
63
62
|
temperature: 0,
|
|
@@ -66,7 +65,7 @@ async function extractStructuredDataUsingClaude(input) {
|
|
|
66
65
|
role: "user",
|
|
67
66
|
content
|
|
68
67
|
}],
|
|
69
|
-
model
|
|
68
|
+
model,
|
|
70
69
|
tools: [{
|
|
71
70
|
input_schema: processedJsonSchema,
|
|
72
71
|
name: toolName,
|
|
@@ -15,10 +15,7 @@ async function extractStructuredDataUsingGoogle(input) {
|
|
|
15
15
|
if (!input.apiKey) {
|
|
16
16
|
return (0, _neverthrow.err)(Errors.invalidInput("Google AI is only supported with a custom API key. Please provide it or use a different AI provider."));
|
|
17
17
|
}
|
|
18
|
-
|
|
19
|
-
if (input.model in _aiModelsValidations.MODELS_MAPPINGS) {
|
|
20
|
-
model = _aiModelsValidations.MODELS_MAPPINGS[input.model];
|
|
21
|
-
}
|
|
18
|
+
const model = input.model;
|
|
22
19
|
const googleGenAi = (0, _google.createGoogleGenerativeAI)({
|
|
23
20
|
apiKey: input.apiKey
|
|
24
21
|
});
|
|
@@ -8,7 +8,6 @@ var _neverthrow = require("neverthrow");
|
|
|
8
8
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
9
9
|
var _utils = require("./utils");
|
|
10
10
|
var _Logger = require("../../common/Logger");
|
|
11
|
-
var _aiModelsValidations = require("../common/aiModelsValidations");
|
|
12
11
|
var _openaiModel = require("../models/openaiModel");
|
|
13
12
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
14
13
|
async function extractStructuredDataUsingOpenAi(input) {
|
|
@@ -50,7 +49,7 @@ async function extractStructuredDataUsingOpenAi(input) {
|
|
|
50
49
|
}));
|
|
51
50
|
content.push(...imageContent);
|
|
52
51
|
}
|
|
53
|
-
const modelName =
|
|
52
|
+
const modelName = input.model;
|
|
54
53
|
const toolName = `extract_${entityName}`;
|
|
55
54
|
const openAiInstance = (0, _openaiModel.createOpenAIInstance)({
|
|
56
55
|
apiKey
|
|
@@ -10,7 +10,7 @@ var _imageSize = require("image-size");
|
|
|
10
10
|
var _neverthrow = require("neverthrow");
|
|
11
11
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
12
12
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
13
|
-
async function getTableHeadersUsingAi(handle,
|
|
13
|
+
async function getTableHeadersUsingAi(handle, model) {
|
|
14
14
|
var _response$error;
|
|
15
15
|
let image;
|
|
16
16
|
try {
|
|
@@ -57,7 +57,7 @@ async function getTableHeadersUsingAi(handle, identifier) {
|
|
|
57
57
|
}
|
|
58
58
|
}]
|
|
59
59
|
}],
|
|
60
|
-
model
|
|
60
|
+
model,
|
|
61
61
|
tools: [{
|
|
62
62
|
input_schema: {
|
|
63
63
|
type: "object",
|
|
@@ -12,15 +12,15 @@ var _extractStructuredDataUsingClaude = require("./extractStructuredDataUsingCla
|
|
|
12
12
|
var _extractStructuredDataUsingOpenAi = require("./extractStructuredDataUsingOpenAi");
|
|
13
13
|
var _utils = require("./utils");
|
|
14
14
|
var _extractStructuredDataUsingGoogle = require("./extractStructuredDataUsingGoogle");
|
|
15
|
-
var
|
|
15
|
+
var _getModelProvider = require("../../common/getModelProvider");
|
|
16
16
|
function isClaudeModel(model) {
|
|
17
|
-
return
|
|
17
|
+
return (0, _getModelProvider.getModelProvider)(model) === "anthropic";
|
|
18
18
|
}
|
|
19
19
|
function isGoogleModel(model) {
|
|
20
|
-
return
|
|
20
|
+
return (0, _getModelProvider.getModelProvider)(model) === "google_vertexai";
|
|
21
21
|
}
|
|
22
22
|
function isOpenAiModel(model) {
|
|
23
|
-
return
|
|
23
|
+
return (0, _getModelProvider.getModelProvider)(model) === "openai";
|
|
24
24
|
}
|
|
25
25
|
async function extractStructuredDataUsingAi(input) {
|
|
26
26
|
let extractionResult;
|
|
@@ -9,7 +9,7 @@ var _zod = require("zod");
|
|
|
9
9
|
var _neverthrow = require("neverthrow");
|
|
10
10
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
11
11
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
12
|
-
async function isTableHeaderOrFooter(content) {
|
|
12
|
+
async function isTableHeaderOrFooter(content, model) {
|
|
13
13
|
var _response$error;
|
|
14
14
|
if (!content) {
|
|
15
15
|
return (0, _neverthrow.ok)({
|
|
@@ -39,7 +39,7 @@ async function isTableHeaderOrFooter(content) {
|
|
|
39
39
|
a table header is a row that contains labels for table columns, and footer usually has pagination information or summary of the table`
|
|
40
40
|
}, itemContent]
|
|
41
41
|
}],
|
|
42
|
-
model
|
|
42
|
+
model,
|
|
43
43
|
tools: [{
|
|
44
44
|
input_schema: {
|
|
45
45
|
type: "object",
|
|
@@ -102,7 +102,8 @@ function selectBestMatch(original, matches) {
|
|
|
102
102
|
return {
|
|
103
103
|
matchText: bestMatch.matchText,
|
|
104
104
|
matchXpath: bestMatch.matchXpath,
|
|
105
|
-
matchType: bestMatch.matchType
|
|
105
|
+
matchType: bestMatch.matchType,
|
|
106
|
+
sourceText: bestMatch.sourceText
|
|
106
107
|
};
|
|
107
108
|
}
|
|
108
109
|
const fuzzyMatches = matches.filter(match => match.isFuzzy).map(match => [match, rankMatch(original, match)]).filter(([_, rank]) => rank === "HIGH");
|
|
@@ -112,7 +113,8 @@ function selectBestMatch(original, matches) {
|
|
|
112
113
|
return {
|
|
113
114
|
matchText: bestMatch.matchText,
|
|
114
115
|
matchXpath: bestMatch.matchXpath,
|
|
115
|
-
matchType: bestMatch.matchType
|
|
116
|
+
matchType: bestMatch.matchType,
|
|
117
|
+
sourceText: bestMatch.sourceText
|
|
116
118
|
};
|
|
117
119
|
}
|
|
118
120
|
return null;
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _vitest = require("vitest");
|
|
4
|
+
var _validators = require("../validators");
|
|
5
|
+
var _index = require("./index");
|
|
6
|
+
(0, _vitest.describe)("strategy model accepts any string", () => {
|
|
7
|
+
(0, _vitest.it)("accepts a non-enum text model for the HTML strategy", () => {
|
|
8
|
+
const result = _validators.strategySchema.safeParse({
|
|
9
|
+
model: "gpt-4.1",
|
|
10
|
+
type: "HTML"
|
|
11
|
+
});
|
|
12
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
13
|
+
if (result.success) {
|
|
14
|
+
(0, _vitest.expect)(result.data.model).toBe("gpt-4.1");
|
|
15
|
+
}
|
|
16
|
+
});
|
|
17
|
+
(0, _vitest.it)("accepts a non-enum vision model for the IMAGE strategy", () => {
|
|
18
|
+
const result = _validators.strategySchema.safeParse({
|
|
19
|
+
model: "gemini-2.5-pro",
|
|
20
|
+
type: "IMAGE"
|
|
21
|
+
});
|
|
22
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
23
|
+
});
|
|
24
|
+
(0, _vitest.it)("accepts a brand new claude model name", () => {
|
|
25
|
+
const result = _validators.strategySchema.safeParse({
|
|
26
|
+
model: "claude-sonnet-4-5",
|
|
27
|
+
type: "HTML"
|
|
28
|
+
});
|
|
29
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
30
|
+
});
|
|
31
|
+
(0, _vitest.it)("rejects an empty model", () => {
|
|
32
|
+
const result = _validators.strategySchema.safeParse({
|
|
33
|
+
model: "",
|
|
34
|
+
type: "HTML"
|
|
35
|
+
});
|
|
36
|
+
(0, _vitest.expect)(result.success).toBe(false);
|
|
37
|
+
});
|
|
38
|
+
(0, _vitest.it)("rejects an unknown strategy type", () => {
|
|
39
|
+
const result = _validators.strategySchema.safeParse({
|
|
40
|
+
model: "gpt-4o",
|
|
41
|
+
type: "FOO"
|
|
42
|
+
});
|
|
43
|
+
(0, _vitest.expect)(result.success).toBe(false);
|
|
44
|
+
});
|
|
45
|
+
(0, _vitest.it)("falls back to the default strategy when omitted", () => {
|
|
46
|
+
const result = _validators.strategySchema.safeParse(undefined);
|
|
47
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
48
|
+
if (result.success) {
|
|
49
|
+
(0, _vitest.expect)(result.data).toEqual({
|
|
50
|
+
model: "claude-haiku-4-5-20251001",
|
|
51
|
+
type: "HTML"
|
|
52
|
+
});
|
|
53
|
+
}
|
|
54
|
+
});
|
|
55
|
+
});
|
|
56
|
+
(0, _vitest.describe)("provider routing handles arbitrary model strings", () => {
|
|
57
|
+
(0, _vitest.it)("routes OpenAI models by prefix", () => {
|
|
58
|
+
for (const model of ["gpt-4.1", "gpt-5", "gpt-4o", "o3", "o4-mini"]) {
|
|
59
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(true);
|
|
60
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(false);
|
|
61
|
+
(0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(false);
|
|
62
|
+
}
|
|
63
|
+
});
|
|
64
|
+
(0, _vitest.it)("routes Google models by prefix", () => {
|
|
65
|
+
for (const model of ["gemini-2.5-pro", "gemini-2.0-flash"]) {
|
|
66
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(true);
|
|
67
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(false);
|
|
68
|
+
(0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(false);
|
|
69
|
+
}
|
|
70
|
+
});
|
|
71
|
+
(0, _vitest.it)("routes Claude models by prefix", () => {
|
|
72
|
+
for (const model of ["claude-sonnet-4-5", "claude-3-7-sonnet-latest", "claude-3-haiku-20240307"]) {
|
|
73
|
+
(0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(true);
|
|
74
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(false);
|
|
75
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(false);
|
|
76
|
+
}
|
|
77
|
+
});
|
|
78
|
+
(0, _vitest.it)("no longer silently misroutes new OpenAI/Google models to Claude", () => {
|
|
79
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)("gpt-4.1")).toBe(true);
|
|
80
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)("gemini-2.5-pro")).toBe(true);
|
|
81
|
+
});
|
|
82
|
+
});
|