@intuned/browser-dev 0.1.15-dev.0 → 0.1.16-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +3 -0
- package/dist/ai/isPageLoaded.js +6 -6
- package/dist/ai/tests/extractStructuredDataDomMatchingCache.spec.js +87 -0
- package/dist/ai/tests/testMatching.spec.js +38 -0
- package/dist/ai/tests/testValidateMatchesMapping.spec.js +58 -0
- package/dist/common/matching/matching.js +3 -3
- package/dist/common/xpathMapping.js +23 -10
- package/dist/helpers/downloadFile.js +3 -0
- package/dist/helpers/saveFileToS3.js +3 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +16 -0
- package/dist/optimized-extractors/common/index.js +4 -4
- package/dist/optimized-extractors/common/matching/utils.js +4 -2
- package/dist/optimized-extractors/common/modelStringSupport.test.js +82 -0
- package/dist/optimized-extractors/export.d.ts +2 -50
- package/dist/optimized-extractors/extractArray.js +3 -1
- package/dist/optimized-extractors/index.d.ts +2 -50
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +366 -1
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +43 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +4 -3
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +31 -22
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +2 -1
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +208 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +4 -2
- package/dist/optimized-extractors/validators.js +4 -5
- package/package.json +1 -1
package/.prettierrc
ADDED
package/dist/ai/isPageLoaded.js
CHANGED
|
@@ -58,9 +58,9 @@ const isPageLoaded = async options => {
|
|
|
58
58
|
}]
|
|
59
59
|
});
|
|
60
60
|
if (response !== null && response !== void 0 && (_response$response = response.response) !== null && _response$response !== void 0 && (_response$response = _response$response.headers) !== null && _response$response !== void 0 && _response$response["x-ai-cost-in-cents"]) {
|
|
61
|
-
_Logger.logger.info(`Total LLM Cost In Cents: ${response.response.headers["x-ai-cost-in-cents"]}`);
|
|
61
|
+
_Logger.logger.info(`[isPageLoaded] Total LLM Cost In Cents: ${response.response.headers["x-ai-cost-in-cents"]}`);
|
|
62
62
|
} else {
|
|
63
|
-
_Logger.logger.info(`Total LLM Tokens: ${response.usage.totalTokens}`);
|
|
63
|
+
_Logger.logger.info(`[isPageLoaded] Total LLM Tokens: ${response.usage.totalTokens}`);
|
|
64
64
|
}
|
|
65
65
|
let llmResult = response.text.trim();
|
|
66
66
|
if (!llmResult) {
|
|
@@ -74,18 +74,18 @@ const isPageLoaded = async options => {
|
|
|
74
74
|
const reason = lines.length > 1 ? lines[1] : null;
|
|
75
75
|
let result;
|
|
76
76
|
if (isTrue) {
|
|
77
|
-
_Logger.logger.info(`Page is loaded.`);
|
|
77
|
+
_Logger.logger.info(`[isPageLoaded] Page is loaded.`);
|
|
78
78
|
result = true;
|
|
79
79
|
} else if (isFalse) {
|
|
80
|
-
_Logger.logger.info(`Page is not loaded.`);
|
|
80
|
+
_Logger.logger.info(`[isPageLoaded] Page is not loaded.`);
|
|
81
81
|
result = false;
|
|
82
82
|
} else if (isDontKnow) {
|
|
83
|
-
_Logger.logger.info(`Page loading status is unknown.`);
|
|
83
|
+
_Logger.logger.info(`[isPageLoaded] Page loading status is unknown.`);
|
|
84
84
|
result = false;
|
|
85
85
|
} else {
|
|
86
86
|
throw new Error("LLM result is not valid");
|
|
87
87
|
}
|
|
88
|
-
_Logger.logger.info(`LLM Reason: ${reason}`);
|
|
88
|
+
_Logger.logger.info(`[isPageLoaded] LLM Reason: ${reason}`);
|
|
89
89
|
return result;
|
|
90
90
|
};
|
|
91
91
|
exports.isPageLoaded = isPageLoaded;
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _extendedTest = require("../../common/extendedTest");
|
|
4
|
+
var _ = require("..");
|
|
5
|
+
var _neverthrow = require("neverthrow");
|
|
6
|
+
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
7
|
+
const DATA_SCHEMA = {
|
|
8
|
+
type: "object",
|
|
9
|
+
required: ["phone"],
|
|
10
|
+
properties: {
|
|
11
|
+
name: {
|
|
12
|
+
type: "string",
|
|
13
|
+
description: "the contact name"
|
|
14
|
+
},
|
|
15
|
+
phone: {
|
|
16
|
+
type: "string",
|
|
17
|
+
description: "the contact phone"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
};
|
|
21
|
+
const CONTACT_HTML = `<div class="contact"><p>Andreas Jansson</p><p>PH: (727) 471-4768</p></div>`;
|
|
22
|
+
function makeOptions(page) {
|
|
23
|
+
return {
|
|
24
|
+
source: page,
|
|
25
|
+
dataSchema: DATA_SCHEMA,
|
|
26
|
+
strategy: "HTML",
|
|
27
|
+
enableDomMatching: true,
|
|
28
|
+
enableCache: true,
|
|
29
|
+
model: "claude-sonnet-4-20250514"
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
async function mockExtractStructuredDataUsingAi(result) {
|
|
33
|
+
const aiModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../extractStructuredDataUsingAi")));
|
|
34
|
+
return _extendedTest.vi.spyOn(aiModule, "extractStructuredDataUsingAi").mockResolvedValue((0, _neverthrow.ok)({
|
|
35
|
+
result,
|
|
36
|
+
xpathMapping: {}
|
|
37
|
+
}));
|
|
38
|
+
}
|
|
39
|
+
async function useInMemoryCache() {
|
|
40
|
+
const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../intunedServices/cache/cache")));
|
|
41
|
+
const store = new Map();
|
|
42
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
|
|
43
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
|
|
44
|
+
store.set(key, value);
|
|
45
|
+
});
|
|
46
|
+
}
|
|
47
|
+
(0, _extendedTest.describe)("extractStructuredData - DOM matching cache (enableDomMatching: true)", () => {
|
|
48
|
+
(0, _extendedTest.afterEach)(() => {
|
|
49
|
+
_extendedTest.vi.restoreAllMocks();
|
|
50
|
+
});
|
|
51
|
+
(0, _extendedTest.test)("reuses the cache on a second call, even when a value is a partial match", async ({
|
|
52
|
+
page
|
|
53
|
+
}) => {
|
|
54
|
+
const aiResult = {
|
|
55
|
+
name: "Andreas Jansson",
|
|
56
|
+
phone: "(727) 471-4768"
|
|
57
|
+
};
|
|
58
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
|
|
59
|
+
await useInMemoryCache();
|
|
60
|
+
await page.setContent(CONTACT_HTML);
|
|
61
|
+
const first = await (0, _.extractStructuredData)(makeOptions(page));
|
|
62
|
+
(0, _extendedTest.expect)(first).toEqual(aiResult);
|
|
63
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
64
|
+
const second = await (0, _.extractStructuredData)(makeOptions(page));
|
|
65
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
66
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
67
|
+
});
|
|
68
|
+
(0, _extendedTest.test)("re-extracts when an extracted value's DOM text changes", async ({
|
|
69
|
+
page
|
|
70
|
+
}) => {
|
|
71
|
+
const aiResult = {
|
|
72
|
+
name: "Andreas Jansson",
|
|
73
|
+
phone: "(727) 471-4768"
|
|
74
|
+
};
|
|
75
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
|
|
76
|
+
await useInMemoryCache();
|
|
77
|
+
await page.setContent(CONTACT_HTML);
|
|
78
|
+
await (0, _.extractStructuredData)(makeOptions(page));
|
|
79
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
80
|
+
await page.evaluate(() => {
|
|
81
|
+
const ps = document.querySelectorAll("div.contact p");
|
|
82
|
+
if (ps[1]) ps[1].textContent = "PH: (999) 999-9999";
|
|
83
|
+
});
|
|
84
|
+
await (0, _.extractStructuredData)(makeOptions(page));
|
|
85
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
|
|
86
|
+
});
|
|
87
|
+
});
|
|
@@ -260,6 +260,44 @@ var _matching = require("../../common/matching/matching");
|
|
|
260
260
|
});
|
|
261
261
|
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchText).toBe("Hello");
|
|
262
262
|
});
|
|
263
|
+
(0, _extendedTest.test)("should prefer a full match over a partial match", async () => {
|
|
264
|
+
const matches = [{
|
|
265
|
+
matched_value: "Hello",
|
|
266
|
+
match_mode: "partial",
|
|
267
|
+
fuzzy_distance: 0,
|
|
268
|
+
match_source: "text_content",
|
|
269
|
+
xpath: "/html/body/div[1]"
|
|
270
|
+
}, {
|
|
271
|
+
matched_value: "Hello World",
|
|
272
|
+
match_mode: "full",
|
|
273
|
+
fuzzy_distance: 0,
|
|
274
|
+
match_source: "text_content",
|
|
275
|
+
xpath: "/html/body/div[2]"
|
|
276
|
+
}];
|
|
277
|
+
const result = (0, _matching.selectBestMatch)({
|
|
278
|
+
original: "Hello World",
|
|
279
|
+
matches
|
|
280
|
+
});
|
|
281
|
+
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchText).toBe("Hello World");
|
|
282
|
+
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchXpath).toBe("/html/body/div[2]");
|
|
283
|
+
});
|
|
284
|
+
(0, _extendedTest.test)("should preserve the attribute name for attribute matches", async () => {
|
|
285
|
+
const matches = [{
|
|
286
|
+
matched_value: "https://example.com/x.pdf",
|
|
287
|
+
match_mode: "full",
|
|
288
|
+
fuzzy_distance: 0,
|
|
289
|
+
match_source: "attribute",
|
|
290
|
+
attribute: "href",
|
|
291
|
+
xpath: "/html/body/a[1]"
|
|
292
|
+
}];
|
|
293
|
+
const result = (0, _matching.selectBestMatch)({
|
|
294
|
+
original: "https://example.com/x.pdf",
|
|
295
|
+
matches
|
|
296
|
+
});
|
|
297
|
+
(0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchType).toEqual({
|
|
298
|
+
attribute: "href"
|
|
299
|
+
});
|
|
300
|
+
});
|
|
263
301
|
});
|
|
264
302
|
(0, _extendedTest.describe)("matchStringsWithDomContent", () => {
|
|
265
303
|
let browser;
|
|
@@ -262,4 +262,62 @@ const IFRAME_WITH_EXTRA_CONTENT = `
|
|
|
262
262
|
(0, _extendedTest.expect)(isValid2).toBe(true);
|
|
263
263
|
(0, _extendedTest.expect)(isValid3).toBe(true);
|
|
264
264
|
});
|
|
265
|
+
(0, _extendedTest.test)("PARTIAL match (value is a substring of the element text, PH: prefix) stays valid", async () => {
|
|
266
|
+
await page.setContent(`<div class="contact"><p>PH: (727) 471-4768</p></div>`);
|
|
267
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
268
|
+
phone: "(727) 471-4768"
|
|
269
|
+
});
|
|
270
|
+
(0, _extendedTest.expect)(cachedMapping["(727) 471-4768"].length).toBeGreaterThan(0);
|
|
271
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
272
|
+
});
|
|
273
|
+
(0, _extendedTest.test)("value sitting among sibling direct text nodes (br-separated block) stays valid", async () => {
|
|
274
|
+
await page.setContent(`
|
|
275
|
+
<table><tbody><tr><td class="vendor">
|
|
276
|
+
Acme Corp<br/>
|
|
277
|
+
Email: sales@acme.com<br/>
|
|
278
|
+
FAX: (630) 904-4118
|
|
279
|
+
</td></tr></tbody></table>
|
|
280
|
+
`);
|
|
281
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
282
|
+
phone: "(630) 904-4118",
|
|
283
|
+
email: "sales@acme.com"
|
|
284
|
+
});
|
|
285
|
+
(0, _extendedTest.expect)(cachedMapping["(630) 904-4118"].length).toBeGreaterThan(0);
|
|
286
|
+
(0, _extendedTest.expect)(cachedMapping["sales@acme.com"].length).toBeGreaterThan(0);
|
|
287
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
288
|
+
});
|
|
289
|
+
(0, _extendedTest.test)("href / attribute match (relative href resolved to absolute) stays valid", async () => {
|
|
290
|
+
await page.setContent(`<!DOCTYPE html><html><head><base href="https://example.com/files/"></head><body>` + `<div class="doc"><a href="annual-report.pdf">Annual Report</a></div></body></html>`);
|
|
291
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
292
|
+
href: "annual-report.pdf",
|
|
293
|
+
name: "Annual Report"
|
|
294
|
+
});
|
|
295
|
+
(0, _extendedTest.expect)(cachedMapping["annual-report.pdf"].length).toBeGreaterThan(0);
|
|
296
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
297
|
+
});
|
|
298
|
+
(0, _extendedTest.test)("FUZZY match (extracted value has a small typo) stays valid", async () => {
|
|
299
|
+
await page.setContent(`<div class="vendor"><span class="name">International Cleaning Services</span></div>`);
|
|
300
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
301
|
+
name: "Internationl Cleaning Servces"
|
|
302
|
+
});
|
|
303
|
+
(0, _extendedTest.expect)(cachedMapping["Internationl Cleaning Servces"].length).toBeGreaterThan(0);
|
|
304
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
305
|
+
});
|
|
306
|
+
(0, _extendedTest.test)("irregular internal whitespace stays valid", async () => {
|
|
307
|
+
await page.setContent(`<div class="vendor"><span>Audioconferencing systems</span></div>`);
|
|
308
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
309
|
+
category: "Audioconferencing systems"
|
|
310
|
+
});
|
|
311
|
+
(0, _extendedTest.expect)(cachedMapping["Audioconferencing systems"].length).toBeGreaterThan(0);
|
|
312
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
|
|
313
|
+
});
|
|
314
|
+
(0, _extendedTest.test)("invalidates when a partial-matched value's text changes", async () => {
|
|
315
|
+
await page.setContent(`<div class="contact"><p>PH: (727) 471-4768</p></div>`);
|
|
316
|
+
const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
|
|
317
|
+
phone: "(727) 471-4768"
|
|
318
|
+
});
|
|
319
|
+
(0, _extendedTest.expect)(cachedMapping["(727) 471-4768"].length).toBeGreaterThan(0);
|
|
320
|
+
await page.setContent(`<div class="contact"><p>PH: (999) 999-9999</p></div>`);
|
|
321
|
+
(0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(false);
|
|
322
|
+
});
|
|
265
323
|
});
|
|
@@ -122,12 +122,12 @@ function selectBestMatch({
|
|
|
122
122
|
}) {
|
|
123
123
|
const exactMatches = matches.filter(match => match.match_mode !== "fuzzy");
|
|
124
124
|
if (exactMatches.length > 0) {
|
|
125
|
-
const bestMatch = exactMatches[0];
|
|
125
|
+
const bestMatch = exactMatches.find(match => match.match_mode === "full") ?? exactMatches[0];
|
|
126
126
|
return {
|
|
127
127
|
matchText: bestMatch.matched_value,
|
|
128
128
|
matchXpath: bestMatch.xpath,
|
|
129
129
|
matchType: bestMatch.match_source === "direct_text_node" ? "direct-text" : bestMatch.match_source === "attribute" ? {
|
|
130
|
-
attribute: "unknown"
|
|
130
|
+
attribute: bestMatch.attribute ?? "unknown"
|
|
131
131
|
} : "all-text"
|
|
132
132
|
};
|
|
133
133
|
}
|
|
@@ -148,7 +148,7 @@ function selectBestMatch({
|
|
|
148
148
|
matchText: bestMatch.matched_value,
|
|
149
149
|
matchXpath: bestMatch.xpath,
|
|
150
150
|
matchType: bestMatch.match_source === "direct_text_node" ? "direct-text" : bestMatch.match_source === "attribute" ? {
|
|
151
|
-
attribute: "unknown"
|
|
151
|
+
attribute: bestMatch.attribute ?? "unknown"
|
|
152
152
|
} : "all-text"
|
|
153
153
|
};
|
|
154
154
|
}
|
|
@@ -6,13 +6,15 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
6
6
|
exports.createXPathMapping = createXPathMapping;
|
|
7
7
|
exports.validateXPathMapping = validateXPathMapping;
|
|
8
8
|
var _ensureBrowserScripts = require("./ensureBrowserScripts");
|
|
9
|
+
var _utils = require("../optimized-extractors/common/matching/utils");
|
|
9
10
|
async function validateXPathMapping(page, cachedMapping, prefix) {
|
|
10
11
|
try {
|
|
11
12
|
for (const [expectedText, entries] of Object.entries(cachedMapping)) {
|
|
12
13
|
for (const entry of entries) {
|
|
13
14
|
const {
|
|
14
15
|
xpath,
|
|
15
|
-
matchType
|
|
16
|
+
matchType,
|
|
17
|
+
sourceText
|
|
16
18
|
} = entry;
|
|
17
19
|
const elementExists = await page.evaluate(({
|
|
18
20
|
xpath,
|
|
@@ -27,34 +29,45 @@ async function validateXPathMapping(page, cachedMapping, prefix) {
|
|
|
27
29
|
if (!elementExists) {
|
|
28
30
|
return false;
|
|
29
31
|
}
|
|
30
|
-
const
|
|
32
|
+
const actualTexts = await page.evaluate(({
|
|
31
33
|
xpath,
|
|
32
34
|
prefix,
|
|
33
35
|
matchType
|
|
34
36
|
}) => {
|
|
35
37
|
const element = document.evaluate(prefix ? `${prefix}/${xpath}` : xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
|
|
36
|
-
if (!element) return
|
|
38
|
+
if (!element) return [];
|
|
37
39
|
if (matchType === "direct-text") {
|
|
38
|
-
|
|
40
|
+
const nodes = [];
|
|
39
41
|
for (const child of element.childNodes) {
|
|
40
42
|
if (child.nodeType === Node.TEXT_NODE) {
|
|
41
|
-
|
|
43
|
+
const t = (child.textContent || "").trim();
|
|
44
|
+
if (t) nodes.push(t);
|
|
42
45
|
}
|
|
43
46
|
}
|
|
44
|
-
return
|
|
47
|
+
return nodes;
|
|
45
48
|
} else if (matchType === "all-text") {
|
|
46
49
|
var _element$textContent;
|
|
47
|
-
return ((_element$textContent = element.textContent) === null || _element$textContent === void 0 ? void 0 : _element$textContent.trim()) || "";
|
|
50
|
+
return [((_element$textContent = element.textContent) === null || _element$textContent === void 0 ? void 0 : _element$textContent.trim()) || ""];
|
|
48
51
|
} else if (typeof matchType === "object" && matchType.attribute) {
|
|
49
|
-
|
|
52
|
+
const attr = matchType.attribute;
|
|
53
|
+
const raw = element.getAttribute(attr) || "";
|
|
54
|
+
const candidates = [raw];
|
|
55
|
+
const reflected = element[attr];
|
|
56
|
+
if (typeof reflected === "string" && reflected) {
|
|
57
|
+
candidates.push(reflected);
|
|
58
|
+
}
|
|
59
|
+
return candidates;
|
|
50
60
|
}
|
|
51
|
-
return
|
|
61
|
+
return [];
|
|
52
62
|
}, {
|
|
53
63
|
xpath,
|
|
54
64
|
prefix,
|
|
55
65
|
matchType
|
|
56
66
|
});
|
|
57
|
-
|
|
67
|
+
const expectedFull = sourceText ?? expectedText;
|
|
68
|
+
const normExpected = (0, _utils.normalizeSpacing)(expectedFull);
|
|
69
|
+
const matched = actualTexts.some(t => (0, _utils.normalizeSpacing)(t) === normExpected);
|
|
70
|
+
if (!matched) {
|
|
58
71
|
return false;
|
|
59
72
|
}
|
|
60
73
|
}
|
|
@@ -30,6 +30,9 @@ const saveFileToS3 = async input => {
|
|
|
30
30
|
if (!page || typeof page.goto !== "function") {
|
|
31
31
|
throw new Error("page must be a playwright Page object");
|
|
32
32
|
}
|
|
33
|
+
if (!trigger) {
|
|
34
|
+
throw new Error("trigger is required");
|
|
35
|
+
}
|
|
33
36
|
const download = await (0, _downloadFile.downloadFile)({
|
|
34
37
|
page,
|
|
35
38
|
trigger,
|
|
@@ -279,6 +279,22 @@ _extendedTest.describe.skip("TestNotInGeneration", () => {
|
|
|
279
279
|
(0, _extendedTest.expect)(cancelled).toContain("canceled");
|
|
280
280
|
}
|
|
281
281
|
});
|
|
282
|
+
(0, _extendedTest.test)("should throw when trigger is missing (wrong param names)", async () => {
|
|
283
|
+
await page.setContent(content);
|
|
284
|
+
await (0, _extendedTest.expect)((0, _.downloadFile)({
|
|
285
|
+
page,
|
|
286
|
+
url: "https://example.com/file.pdf",
|
|
287
|
+
fileName: "test.pdf"
|
|
288
|
+
})).rejects.toThrow("trigger is required");
|
|
289
|
+
});
|
|
290
|
+
(0, _extendedTest.test)("saveFileToS3 should throw when trigger is missing (wrong param names)", async () => {
|
|
291
|
+
await page.setContent(content);
|
|
292
|
+
await (0, _extendedTest.expect)((0, _.saveFileToS3)({
|
|
293
|
+
page,
|
|
294
|
+
url: "https://example.com/file.pdf",
|
|
295
|
+
fileName: "test.pdf"
|
|
296
|
+
})).rejects.toThrow("trigger is required");
|
|
297
|
+
});
|
|
282
298
|
});
|
|
283
299
|
const SIGNED_URL = "https://bucket.r2.cloudflarestorage.com/img.jpg?X-Amz-Signature=abc";
|
|
284
300
|
(0, _extendedTest.describe)("SignedUrlAttachmentInTypedArray", () => {
|
|
@@ -12,15 +12,15 @@ var _extractStructuredDataUsingClaude = require("./extractStructuredDataUsingCla
|
|
|
12
12
|
var _extractStructuredDataUsingOpenAi = require("./extractStructuredDataUsingOpenAi");
|
|
13
13
|
var _utils = require("./utils");
|
|
14
14
|
var _extractStructuredDataUsingGoogle = require("./extractStructuredDataUsingGoogle");
|
|
15
|
-
var
|
|
15
|
+
var _getModelProvider = require("../../common/getModelProvider");
|
|
16
16
|
function isClaudeModel(model) {
|
|
17
|
-
return
|
|
17
|
+
return (0, _getModelProvider.getModelProvider)(model) === "anthropic";
|
|
18
18
|
}
|
|
19
19
|
function isGoogleModel(model) {
|
|
20
|
-
return
|
|
20
|
+
return (0, _getModelProvider.getModelProvider)(model) === "google_vertexai";
|
|
21
21
|
}
|
|
22
22
|
function isOpenAiModel(model) {
|
|
23
|
-
return
|
|
23
|
+
return (0, _getModelProvider.getModelProvider)(model) === "openai";
|
|
24
24
|
}
|
|
25
25
|
async function extractStructuredDataUsingAi(input) {
|
|
26
26
|
let extractionResult;
|
|
@@ -102,7 +102,8 @@ function selectBestMatch(original, matches) {
|
|
|
102
102
|
return {
|
|
103
103
|
matchText: bestMatch.matchText,
|
|
104
104
|
matchXpath: bestMatch.matchXpath,
|
|
105
|
-
matchType: bestMatch.matchType
|
|
105
|
+
matchType: bestMatch.matchType,
|
|
106
|
+
sourceText: bestMatch.sourceText
|
|
106
107
|
};
|
|
107
108
|
}
|
|
108
109
|
const fuzzyMatches = matches.filter(match => match.isFuzzy).map(match => [match, rankMatch(original, match)]).filter(([_, rank]) => rank === "HIGH");
|
|
@@ -112,7 +113,8 @@ function selectBestMatch(original, matches) {
|
|
|
112
113
|
return {
|
|
113
114
|
matchText: bestMatch.matchText,
|
|
114
115
|
matchXpath: bestMatch.matchXpath,
|
|
115
|
-
matchType: bestMatch.matchType
|
|
116
|
+
matchType: bestMatch.matchType,
|
|
117
|
+
sourceText: bestMatch.sourceText
|
|
116
118
|
};
|
|
117
119
|
}
|
|
118
120
|
return null;
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _vitest = require("vitest");
|
|
4
|
+
var _validators = require("../validators");
|
|
5
|
+
var _index = require("./index");
|
|
6
|
+
(0, _vitest.describe)("strategy model accepts any string", () => {
|
|
7
|
+
(0, _vitest.it)("accepts a non-enum text model for the HTML strategy", () => {
|
|
8
|
+
const result = _validators.strategySchema.safeParse({
|
|
9
|
+
model: "gpt-4.1",
|
|
10
|
+
type: "HTML"
|
|
11
|
+
});
|
|
12
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
13
|
+
if (result.success) {
|
|
14
|
+
(0, _vitest.expect)(result.data.model).toBe("gpt-4.1");
|
|
15
|
+
}
|
|
16
|
+
});
|
|
17
|
+
(0, _vitest.it)("accepts a non-enum vision model for the IMAGE strategy", () => {
|
|
18
|
+
const result = _validators.strategySchema.safeParse({
|
|
19
|
+
model: "gemini-2.5-pro",
|
|
20
|
+
type: "IMAGE"
|
|
21
|
+
});
|
|
22
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
23
|
+
});
|
|
24
|
+
(0, _vitest.it)("accepts a brand new claude model name", () => {
|
|
25
|
+
const result = _validators.strategySchema.safeParse({
|
|
26
|
+
model: "claude-sonnet-4-5",
|
|
27
|
+
type: "HTML"
|
|
28
|
+
});
|
|
29
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
30
|
+
});
|
|
31
|
+
(0, _vitest.it)("rejects an empty model", () => {
|
|
32
|
+
const result = _validators.strategySchema.safeParse({
|
|
33
|
+
model: "",
|
|
34
|
+
type: "HTML"
|
|
35
|
+
});
|
|
36
|
+
(0, _vitest.expect)(result.success).toBe(false);
|
|
37
|
+
});
|
|
38
|
+
(0, _vitest.it)("rejects an unknown strategy type", () => {
|
|
39
|
+
const result = _validators.strategySchema.safeParse({
|
|
40
|
+
model: "gpt-4o",
|
|
41
|
+
type: "FOO"
|
|
42
|
+
});
|
|
43
|
+
(0, _vitest.expect)(result.success).toBe(false);
|
|
44
|
+
});
|
|
45
|
+
(0, _vitest.it)("falls back to the default strategy when omitted", () => {
|
|
46
|
+
const result = _validators.strategySchema.safeParse(undefined);
|
|
47
|
+
(0, _vitest.expect)(result.success).toBe(true);
|
|
48
|
+
if (result.success) {
|
|
49
|
+
(0, _vitest.expect)(result.data).toEqual({
|
|
50
|
+
model: "claude-3-haiku",
|
|
51
|
+
type: "HTML"
|
|
52
|
+
});
|
|
53
|
+
}
|
|
54
|
+
});
|
|
55
|
+
});
|
|
56
|
+
(0, _vitest.describe)("provider routing handles arbitrary model strings", () => {
|
|
57
|
+
(0, _vitest.it)("routes OpenAI models by prefix", () => {
|
|
58
|
+
for (const model of ["gpt-4.1", "gpt-5", "gpt-4o", "o3", "o4-mini"]) {
|
|
59
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(true);
|
|
60
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(false);
|
|
61
|
+
(0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(false);
|
|
62
|
+
}
|
|
63
|
+
});
|
|
64
|
+
(0, _vitest.it)("routes Google models by prefix", () => {
|
|
65
|
+
for (const model of ["gemini-2.5-pro", "gemini-2.0-flash"]) {
|
|
66
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(true);
|
|
67
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(false);
|
|
68
|
+
(0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(false);
|
|
69
|
+
}
|
|
70
|
+
});
|
|
71
|
+
(0, _vitest.it)("routes Claude models by prefix", () => {
|
|
72
|
+
for (const model of ["claude-sonnet-4-5", "claude-3-7-sonnet-latest", "claude-3-haiku-20240307"]) {
|
|
73
|
+
(0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(true);
|
|
74
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(false);
|
|
75
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(false);
|
|
76
|
+
}
|
|
77
|
+
});
|
|
78
|
+
(0, _vitest.it)("no longer silently misroutes new OpenAI/Google models to Claude", () => {
|
|
79
|
+
(0, _vitest.expect)((0, _index.isOpenAiModel)("gpt-4.1")).toBe(true);
|
|
80
|
+
(0, _vitest.expect)((0, _index.isGoogleModel)("gemini-2.5-pro")).toBe(true);
|
|
81
|
+
});
|
|
82
|
+
});
|
|
@@ -9,29 +9,7 @@ import { BasicSchema } from "./types/jsonSchema";
|
|
|
9
9
|
* @property type - the type of the strategy
|
|
10
10
|
*/
|
|
11
11
|
export interface ImageStrategy {
|
|
12
|
-
model:
|
|
13
|
-
| "claude-3-haiku"
|
|
14
|
-
| "claude-3-haiku-20240307"
|
|
15
|
-
| "claude-3.5-sonnet"
|
|
16
|
-
| "claude-3-5-sonnet-20240620"
|
|
17
|
-
| "claude-3-5-sonnet-20241022"
|
|
18
|
-
| "claude-opus-4"
|
|
19
|
-
| "claude-opus-4-20250514"
|
|
20
|
-
| "claude-sonnet-4"
|
|
21
|
-
| "claude-sonnet-4-20250514"
|
|
22
|
-
| "gpt4-turbo"
|
|
23
|
-
| "gpt-4-turbo-2024-04-09"
|
|
24
|
-
| "gpt-4o"
|
|
25
|
-
| "gpt-4o-2024-05-13"
|
|
26
|
-
| "gpt-4o-mini"
|
|
27
|
-
| "gpt-4o-mini-2024-07-18"
|
|
28
|
-
| "gemini-1.5-pro"
|
|
29
|
-
| "gemini-1.5-pro-002"
|
|
30
|
-
| "gemini-1.5-flash-8b"
|
|
31
|
-
| "gemini-1.5-flash-8b-002"
|
|
32
|
-
| "gemini-1.5-flash"
|
|
33
|
-
| "gemini-1.5-flash-002"
|
|
34
|
-
| "gemini-2.0-flash-exp";
|
|
12
|
+
model: string;
|
|
35
13
|
type: "IMAGE";
|
|
36
14
|
}
|
|
37
15
|
/**
|
|
@@ -43,33 +21,7 @@ export interface ImageStrategy {
|
|
|
43
21
|
* @property type - the type of the strategy
|
|
44
22
|
*/
|
|
45
23
|
export interface HtmlStrategy {
|
|
46
|
-
model:
|
|
47
|
-
| "claude-3-haiku"
|
|
48
|
-
| "claude-3-haiku-20240307"
|
|
49
|
-
| "claude-3-5-haiku"
|
|
50
|
-
| "claude-3-5-haiku-20241022"
|
|
51
|
-
| "claude-3.5-sonnet"
|
|
52
|
-
| "claude-3-5-sonnet-20240620"
|
|
53
|
-
| "claude-3-5-sonnet-20241022"
|
|
54
|
-
| "claude-opus-4"
|
|
55
|
-
| "claude-opus-4-20250514"
|
|
56
|
-
| "claude-sonnet-4"
|
|
57
|
-
| "claude-sonnet-4-20250514"
|
|
58
|
-
| "gpt4-turbo"
|
|
59
|
-
| "gpt-4-turbo-2024-04-09"
|
|
60
|
-
| "gpt3.5-turbo"
|
|
61
|
-
| "gpt-3.5-turbo-0125"
|
|
62
|
-
| "gpt-4o"
|
|
63
|
-
| "gpt-4o-2024-05-13"
|
|
64
|
-
| "gpt-4o-mini"
|
|
65
|
-
| "gpt-4o-mini-2024-07-18"
|
|
66
|
-
| "gemini-1.5-pro"
|
|
67
|
-
| "gemini-1.5-pro-002"
|
|
68
|
-
| "gemini-1.5-flash-8b"
|
|
69
|
-
| "gemini-1.5-flash-8b-002"
|
|
70
|
-
| "gemini-1.5-flash"
|
|
71
|
-
| "gemini-1.5-flash-002"
|
|
72
|
-
| "gemini-2.0-flash-exp";
|
|
24
|
+
model: string;
|
|
73
25
|
type: "HTML";
|
|
74
26
|
}
|
|
75
27
|
/**
|
|
@@ -81,7 +81,9 @@ const extractArrayFromLocator = async (locator, options) => {
|
|
|
81
81
|
strategy: validOptions.strategy,
|
|
82
82
|
optionalPropertiesInvalidator: validOptions.optionalPropertiesInvalidator,
|
|
83
83
|
variantKey: validOptions.variantKey,
|
|
84
|
-
searchRegion: locator
|
|
84
|
+
searchRegion: locator,
|
|
85
|
+
prompt: validOptions.prompt,
|
|
86
|
+
apiKey: validOptions.apiKey
|
|
85
87
|
});
|
|
86
88
|
if (result.isErr()) {
|
|
87
89
|
switch (result.error.type) {
|
|
@@ -9,29 +9,7 @@ import { BasicSchema } from "./types/jsonSchema";
|
|
|
9
9
|
* @property type - the type of the strategy
|
|
10
10
|
*/
|
|
11
11
|
export interface ImageStrategy {
|
|
12
|
-
model:
|
|
13
|
-
| "claude-3-haiku"
|
|
14
|
-
| "claude-3-haiku-20240307"
|
|
15
|
-
| "claude-3.5-sonnet"
|
|
16
|
-
| "claude-3-5-sonnet-20240620"
|
|
17
|
-
| "claude-3-5-sonnet-20241022"
|
|
18
|
-
| "claude-opus-4"
|
|
19
|
-
| "claude-opus-4-20250514"
|
|
20
|
-
| "claude-sonnet-4"
|
|
21
|
-
| "claude-sonnet-4-20250514"
|
|
22
|
-
| "gpt4-turbo"
|
|
23
|
-
| "gpt-4-turbo-2024-04-09"
|
|
24
|
-
| "gpt-4o"
|
|
25
|
-
| "gpt-4o-2024-05-13"
|
|
26
|
-
| "gpt-4o-mini"
|
|
27
|
-
| "gpt-4o-mini-2024-07-18"
|
|
28
|
-
| "gemini-1.5-pro"
|
|
29
|
-
| "gemini-1.5-pro-002"
|
|
30
|
-
| "gemini-1.5-flash-8b"
|
|
31
|
-
| "gemini-1.5-flash-8b-002"
|
|
32
|
-
| "gemini-1.5-flash"
|
|
33
|
-
| "gemini-1.5-flash-002"
|
|
34
|
-
| "gemini-2.0-flash-exp";
|
|
12
|
+
model: string;
|
|
35
13
|
type: "IMAGE";
|
|
36
14
|
}
|
|
37
15
|
/**
|
|
@@ -43,33 +21,7 @@ export interface ImageStrategy {
|
|
|
43
21
|
* @property type - the type of the strategy
|
|
44
22
|
*/
|
|
45
23
|
export interface HtmlStrategy {
|
|
46
|
-
model:
|
|
47
|
-
| "claude-3-haiku"
|
|
48
|
-
| "claude-3-haiku-20240307"
|
|
49
|
-
| "claude-3-5-haiku"
|
|
50
|
-
| "claude-3-5-haiku-20241022"
|
|
51
|
-
| "claude-3.5-sonnet"
|
|
52
|
-
| "claude-3-5-sonnet-20240620"
|
|
53
|
-
| "claude-3-5-sonnet-20241022"
|
|
54
|
-
| "claude-opus-4"
|
|
55
|
-
| "claude-opus-4-20250514"
|
|
56
|
-
| "claude-sonnet-4"
|
|
57
|
-
| "claude-sonnet-4-20250514"
|
|
58
|
-
| "gpt4-turbo"
|
|
59
|
-
| "gpt-4-turbo-2024-04-09"
|
|
60
|
-
| "gpt3.5-turbo"
|
|
61
|
-
| "gpt-3.5-turbo-0125"
|
|
62
|
-
| "gpt-4o"
|
|
63
|
-
| "gpt-4o-2024-05-13"
|
|
64
|
-
| "gpt-4o-mini"
|
|
65
|
-
| "gpt-4o-mini-2024-07-18"
|
|
66
|
-
| "gemini-1.5-pro"
|
|
67
|
-
| "gemini-1.5-pro-002"
|
|
68
|
-
| "gemini-1.5-flash-8b"
|
|
69
|
-
| "gemini-1.5-flash-8b-002"
|
|
70
|
-
| "gemini-1.5-flash"
|
|
71
|
-
| "gemini-1.5-flash-002"
|
|
72
|
-
| "gemini-2.0-flash-exp";
|
|
24
|
+
model: string;
|
|
73
25
|
type: "HTML";
|
|
74
26
|
}
|
|
75
27
|
/**
|