@intuned/browser-dev 0.1.15-dev.1 → 0.1.16-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +3 -0
- package/dist/ai/tests/extractStructuredDataDomMatchingCache.spec.js +87 -0
- package/dist/ai/tests/testMatching.spec.js +38 -0
- package/dist/ai/tests/testValidateMatchesMapping.spec.js +58 -0
- package/dist/common/matching/matching.js +3 -3
- package/dist/common/xpathMapping.js +23 -10
- package/dist/helpers/downloadFile.js +3 -0
- package/dist/helpers/saveFileToS3.js +3 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +16 -0
- package/dist/optimized-extractors/common/aiModelsValidations.js +2 -21
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +2 -3
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +1 -4
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +1 -2
- package/dist/optimized-extractors/common/findTableHeaders.js +2 -2
- package/dist/optimized-extractors/common/index.js +4 -4
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +2 -2
- package/dist/optimized-extractors/common/matching/utils.js +4 -2
- package/dist/optimized-extractors/common/modelStringSupport.test.js +82 -0
- package/dist/optimized-extractors/export.d.ts +2 -50
- package/dist/optimized-extractors/extractArray.js +3 -1
- package/dist/optimized-extractors/index.d.ts +2 -50
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +366 -1
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +43 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +4 -3
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +32 -23
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +2 -2
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +2 -1
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +208 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +4 -2
- package/dist/optimized-extractors/validators.js +5 -6
- package/package.json +1 -1
- package/dist/optimized-extractors/types/aiModelsValidation.js +0 -45
|
@@ -21,6 +21,24 @@ var _findTableHeaders = require("../common/findTableHeaders");
|
|
|
21
21
|
var _Logger = require("../../common/Logger");
|
|
22
22
|
var _utils = require("../common/matching/utils");
|
|
23
23
|
/*
 * Module-interop helper (appears to be emitted by the transpiler — confirm against the build config).
 *
 * Wraps a `require`d value `e` in a namespace-like object:
 *  - when `t` is falsy and `e.__esModule` is truthy, `e` is returned untouched;
 *  - otherwise a null-prototype object `{ default: e }` is built; if `e` is `null` or is neither an
 *    object nor a function, that wrapper is returned as is;
 *  - otherwise every own enumerable key of `e` except "default" is copied onto the wrapper, re-defining
 *    the property via its descriptor when it has a getter or setter and plain-assigning it otherwise.
 *
 * On the first call the function creates two WeakMaps (when WeakMap exists) and then reassigns
 * `_interopRequireWildcard` to the inner function, so later calls reuse those maps as per-module caches
 * (`n` when `t` is truthy, `r` otherwise).
 */
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
24
|
+
/**
 * Computes the longest "/"-separated prefix shared by every XPath in `xpaths`.
 *
 * The paths are compared segment by segment (after splitting on "/"), so a
 * segment such as `div[1]` only matches an identical `div[1]`.
 *
 * If the shared prefix is itself one of the inputs (for example when a single
 * XPath is given, or when one input is an ancestor of all the others), its last
 * segment is dropped so that the result is a strict ancestor of that input.
 *
 * @param {string[]} xpaths - XPath strings to compare.
 * @returns {string|null} The shared ancestor path, or `null` when `xpaths` is
 *   empty or the inputs share nothing beyond the leading root separator.
 */
function getCommonAncestorXpath(xpaths) {
  if (xpaths.length === 0) return null;

  const segmented = xpaths.map(xpath => xpath.split("/"));
  const minLen = Math.min(...segmented.map(segments => segments.length));
  const [first] = segmented;

  // Count how many leading segments are identical across every path.
  let sharedLen = 0;
  while (sharedLen < minLen && segmented.every(segments => segments[sharedLen] === first[sharedLen])) {
    sharedLen++;
  }
  const common = first.slice(0, sharedLen);

  // When the shared prefix is exactly one of the inputs, step up one level so
  // the result contains that element instead of being that element.
  if (common.length > 1) {
    const sharedPrefix = common.join("/");
    if (xpaths.includes(sharedPrefix)) {
      common.pop();
    }
  }

  // Absolute paths always share the empty segment before the leading "/".
  // Joining only that segment yields "", which is not a usable XPath, so it is
  // reported as "no common ancestor" rather than as an empty string.
  const ancestor = common.join("/");
  return ancestor.length > 0 ? ancestor : null;
}
|
|
24
42
|
async function runAiExtraction(params) {
|
|
25
43
|
return handleNewAiExtraction(params);
|
|
26
44
|
}
|
|
@@ -82,7 +100,8 @@ async function handleNewAiExtraction(params) {
|
|
|
82
100
|
acc[key] = {
|
|
83
101
|
matchText: bestMatch.matchText,
|
|
84
102
|
matchXpath: bestMatch.matchXpath,
|
|
85
|
-
matchType: bestMatch.matchType
|
|
103
|
+
matchType: bestMatch.matchType,
|
|
104
|
+
sourceText: bestMatch.sourceText
|
|
86
105
|
};
|
|
87
106
|
} else {
|
|
88
107
|
_Logger.logger.debug(`value "${value}" for key "${key}" in row ${i + 1} does not have any matches in the page html, dropped for hallucination protection`);
|
|
@@ -100,26 +119,15 @@ async function handleNewAiExtraction(params) {
|
|
|
100
119
|
})));
|
|
101
120
|
let containerPath = null;
|
|
102
121
|
let fullContainerXpath = null;
|
|
103
|
-
if (resultValues.length > 0
|
|
104
|
-
const
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
return window.__INTUNED__.getElementXPath(element.parentElement);
|
|
113
|
-
}
|
|
114
|
-
return null;
|
|
115
|
-
}, primaryXpath);
|
|
116
|
-
if (parentXpath) {
|
|
117
|
-
fullContainerXpath = parentXpath;
|
|
118
|
-
if (hasSearchRegionContainer) {
|
|
119
|
-
containerPath = await (0, _getRelativeContainerXpathSelector.getRelativeContainerXpathSelector)(pageAndSearchRegion.searchRegion, parentXpath);
|
|
120
|
-
} else {
|
|
121
|
-
containerPath = parentXpath;
|
|
122
|
-
}
|
|
122
|
+
if (resultValues.length > 0) {
|
|
123
|
+
const propertyXpaths = Object.values(resultValues[0].result).map(value => value.matchXpath).filter(xpath => !!xpath);
|
|
124
|
+
const itemContainerXpath = getCommonAncestorXpath(propertyXpaths);
|
|
125
|
+
if (itemContainerXpath) {
|
|
126
|
+
fullContainerXpath = itemContainerXpath;
|
|
127
|
+
if (hasSearchRegionContainer) {
|
|
128
|
+
containerPath = await (0, _getRelativeContainerXpathSelector.getRelativeContainerXpathSelector)(pageAndSearchRegion.searchRegion, itemContainerXpath);
|
|
129
|
+
} else {
|
|
130
|
+
containerPath = itemContainerXpath;
|
|
123
131
|
}
|
|
124
132
|
}
|
|
125
133
|
}
|
|
@@ -201,7 +209,7 @@ async function splitDomAndExtractData({
|
|
|
201
209
|
tableLocater
|
|
202
210
|
} = await (0, _tablesUtils.isListTable)(listItemsContainerLocator, itemsSimplifiedHtml);
|
|
203
211
|
const tableAsJsonArray = isTable ? await (0, _tablesUtils.createJsonFromTable)(pageAndSearchRegion.page) : [];
|
|
204
|
-
const tableHeaders = tableLocater ? await (0, _findTableHeaders.getTableHeadersUsingAi)(tableLocater,
|
|
212
|
+
const tableHeaders = tableLocater ? await (0, _findTableHeaders.getTableHeadersUsingAi)(tableLocater, strategy.model) : undefined;
|
|
205
213
|
if (tableHeaders && tableHeaders.isErr()) {
|
|
206
214
|
return (0, _neverthrow.err)(tableHeaders.error);
|
|
207
215
|
}
|
|
@@ -240,7 +248,8 @@ async function splitDomAndExtractData({
|
|
|
240
248
|
acc[key] = {
|
|
241
249
|
matchText: bestMatch.matchText,
|
|
242
250
|
matchXpath: bestMatch.matchXpath,
|
|
243
|
-
matchType: bestMatch.matchType
|
|
251
|
+
matchType: bestMatch.matchType,
|
|
252
|
+
sourceText: bestMatch.sourceText
|
|
244
253
|
};
|
|
245
254
|
} else {
|
|
246
255
|
_Logger.logger.debug(`value "${value}" for key "${key}" in row ${i + 1} does not have any matches in the item's html, dropped for hallucination protection`);
|
package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js
CHANGED
|
@@ -44,7 +44,7 @@ async function extractPropertiesUsingGPT({
|
|
|
44
44
|
}
|
|
45
45
|
if (possibleTableHeaderOrFooter) {
|
|
46
46
|
const content = text ?? image;
|
|
47
|
-
const isHeader = await (0, _isTableHeaderOrFooter.isTableHeaderOrFooter)(content);
|
|
47
|
+
const isHeader = await (0, _isTableHeaderOrFooter.isTableHeaderOrFooter)(content, strategy.model);
|
|
48
48
|
if (isHeader.isErr()) {
|
|
49
49
|
return (0, _neverthrow.err)(isHeader.error);
|
|
50
50
|
}
|
|
@@ -146,7 +146,7 @@ async function extractPropertiesWithHTMLStrategy({
|
|
|
146
146
|
apiKey
|
|
147
147
|
}) {
|
|
148
148
|
const shouldUseTableData = !!tableAsJsonArray && tableAsJsonArray.length === items.length;
|
|
149
|
-
const isWeakModel = strategy.model
|
|
149
|
+
const isWeakModel = strategy.model.includes("haiku") || strategy.model.includes("turbo");
|
|
150
150
|
const averageItemLength = items.reduce((sum, item) => {
|
|
151
151
|
if (item.type !== "text") return sum;
|
|
152
152
|
return sum + (0, _extractionHelpers.compressStringSpaces)(item.text).length;
|
|
@@ -59,7 +59,8 @@ async function runAIExtraction(pageAndSearchRegion, extractionInfo, snapshot, st
|
|
|
59
59
|
xpathMapping[propertyName] = {
|
|
60
60
|
matchXpath: bestMatch.matchXpath,
|
|
61
61
|
matchText: bestMatch.matchText,
|
|
62
|
-
matchType: bestMatch.matchType
|
|
62
|
+
matchType: bestMatch.matchType,
|
|
63
|
+
sourceText: bestMatch.sourceText
|
|
63
64
|
};
|
|
64
65
|
} else {
|
|
65
66
|
_Logger.logger.debug(`Property ${propertyName} not found in the page HTML, dropped for hallucination protection`);
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
var _extendedTest = require("../../../common/extendedTest");
|
|
4
4
|
var _ = require("../..");
|
|
5
|
+
var _neverthrow = require("neverthrow");
|
|
5
6
|
var _uuid = require("uuid");
|
|
7
|
+
/*
 * Module-interop helper (appears to be emitted by the transpiler — confirm against the build config).
 *
 * Wraps a `require`d value `e` in a namespace-like object:
 *  - when `t` is falsy and `e.__esModule` is truthy, `e` is returned untouched;
 *  - otherwise a null-prototype object `{ default: e }` is built; if `e` is `null` or is neither an
 *    object nor a function, that wrapper is returned as is;
 *  - otherwise every own enumerable key of `e` except "default" is copied onto the wrapper, re-defining
 *    the property via its descriptor when it has a getter or setter and plain-assigning it otherwise.
 *
 * On the first call the function creates two WeakMaps (when WeakMap exists) and then reassigns
 * `_interopRequireWildcard` to the inner function, so later calls reuse those maps as per-module caches
 * (`n` when `t` is truthy, `r` otherwise).
 */
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
6
8
|
const productTemplate = `
|
|
7
9
|
<div class="product-page">
|
|
8
10
|
<div class="product-info">
|
|
@@ -104,4 +106,210 @@ _extendedTest.describe.skip("Object Extractor Caching Tests", () => {
|
|
|
104
106
|
console.log("All cache behavior tests completed successfully!");
|
|
105
107
|
});
|
|
106
108
|
});
|
|
109
|
+
});
|
|
110
|
+
/**
 * Test helper: stubs `extractStructuredDataUsingAi` on the `../../common`
 * module so that it resolves to a neverthrow `ok({ result })` without running
 * the real implementation.
 *
 * @param {object} result - Payload to expose under the `result` key of the ok value.
 * @returns {Promise<*>} Whatever `mockResolvedValue` returns for the installed spy
 *   (the tests use it to assert call counts).
 */
async function mockExtractStructuredDataUsingAi(result) {
  const commonModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../common")));
  // Install the spy first, then build the canned response, matching the original evaluation order.
  const aiSpy = _extendedTest.vi.spyOn(commonModule, "extractStructuredDataUsingAi");
  const cannedResponse = (0, _neverthrow.ok)({
    result
  });
  return aiSpy.mockResolvedValue(cannedResponse);
}
|
|
116
|
+
/**
 * Test helper: replaces `cache.get` and `cache.set` from
 * `../../../intunedServices/cache` with spies backed by a Map that is local to
 * this call, so each invocation starts from an empty store.
 *
 * `get` resolves to the stored value, or `null` when the key was never set.
 * `set` stores the value and resolves to `undefined`.
 *
 * @returns {Promise<void>}
 */
async function useInMemoryCache() {
  const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
  const store = new Map();

  const readEntry = async key => {
    if (store.has(key)) {
      return store.get(key);
    }
    return null;
  };
  const writeEntry = async (key, value) => {
    store.set(key, value);
  };

  _extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(readEntry);
  _extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(writeEntry);
}
|
|
124
|
+
(0, _extendedTest.describe)("extractObjectFromLocator - cache round-trip (real matcher, mocked AI)", () => {
|
|
125
|
+
(0, _extendedTest.afterEach)(() => {
|
|
126
|
+
_extendedTest.vi.restoreAllMocks();
|
|
127
|
+
});
|
|
128
|
+
(0, _extendedTest.test)("reuses the cache on a second identical call when the match was partial (PH: prefix)", async ({
|
|
129
|
+
page
|
|
130
|
+
}) => {
|
|
131
|
+
const label = `contact-roundtrip-${(0, _uuid.v4)()}`;
|
|
132
|
+
const aiResult = {
|
|
133
|
+
contact_information_name: "Andreas Jansson",
|
|
134
|
+
contact_information_phone_number: "(727) 471-4768",
|
|
135
|
+
contact_information_email: "andreas.jansson@flowbird.group"
|
|
136
|
+
};
|
|
137
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
|
|
138
|
+
await useInMemoryCache();
|
|
139
|
+
await page.setContent(`
|
|
140
|
+
<div class="vendor">
|
|
141
|
+
<h2>Contact Information</h2>
|
|
142
|
+
<p class="c-name">Andreas Jansson</p>
|
|
143
|
+
<p class="c-phone">PH: (727) 471-4768</p>
|
|
144
|
+
<p class="c-email">andreas.jansson@flowbird.group</p>
|
|
145
|
+
</div>
|
|
146
|
+
`);
|
|
147
|
+
const options = {
|
|
148
|
+
entityName: "supplier_contacts",
|
|
149
|
+
label,
|
|
150
|
+
entitySchema: {
|
|
151
|
+
type: "object",
|
|
152
|
+
required: ["contact_information_email"],
|
|
153
|
+
properties: {
|
|
154
|
+
contact_information_name: {
|
|
155
|
+
type: "string",
|
|
156
|
+
description: "the contact person name"
|
|
157
|
+
},
|
|
158
|
+
contact_information_phone_number: {
|
|
159
|
+
type: "string",
|
|
160
|
+
description: "the contact phone number"
|
|
161
|
+
},
|
|
162
|
+
contact_information_email: {
|
|
163
|
+
type: "string",
|
|
164
|
+
description: "the contact email"
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
},
|
|
168
|
+
strategy: {
|
|
169
|
+
model: "claude-sonnet-4-20250514",
|
|
170
|
+
type: "HTML"
|
|
171
|
+
},
|
|
172
|
+
variantKey: label
|
|
173
|
+
};
|
|
174
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
175
|
+
(0, _extendedTest.expect)(first).toEqual(aiResult);
|
|
176
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
177
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
178
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
179
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
180
|
+
});
|
|
181
|
+
(0, _extendedTest.test)("reuses the cache when fields sit in separate direct text nodes of one element (br-separated block)", async ({
|
|
182
|
+
page
|
|
183
|
+
}) => {
|
|
184
|
+
const label = `vendor-block-${(0, _uuid.v4)()}`;
|
|
185
|
+
const vendorAiResult = {
|
|
186
|
+
primary_vendor_name: "V00000110 - International Cleaning Services, Inc.",
|
|
187
|
+
primary_vendor_contact_email: "monaco.intl@sbcglobal.net",
|
|
188
|
+
primary_vendor_contact_phone: "(630) 904-4118"
|
|
189
|
+
};
|
|
190
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(vendorAiResult);
|
|
191
|
+
await useInMemoryCache();
|
|
192
|
+
await page.setContent(`
|
|
193
|
+
<table><tbody><tr><td class="vendor">
|
|
194
|
+
<a href="#">V00000110 - International Cleaning Services, Inc.</a>
|
|
195
|
+
2415 Comstock Court<br/>
|
|
196
|
+
Naperville, IL 60564<br/>
|
|
197
|
+
US<br/>
|
|
198
|
+
Email: monaco.intl@sbcglobal.net<br/>
|
|
199
|
+
FAX: (630) 904-4118
|
|
200
|
+
</td></tr></tbody></table>
|
|
201
|
+
`);
|
|
202
|
+
const options = {
|
|
203
|
+
entityName: "supplier_contacts",
|
|
204
|
+
label,
|
|
205
|
+
entitySchema: {
|
|
206
|
+
type: "object",
|
|
207
|
+
required: ["primary_vendor_name"],
|
|
208
|
+
properties: {
|
|
209
|
+
primary_vendor_name: {
|
|
210
|
+
type: "string",
|
|
211
|
+
description: "vendor id - company name"
|
|
212
|
+
},
|
|
213
|
+
primary_vendor_contact_email: {
|
|
214
|
+
type: "string",
|
|
215
|
+
description: "vendor email"
|
|
216
|
+
},
|
|
217
|
+
primary_vendor_contact_phone: {
|
|
218
|
+
type: "string",
|
|
219
|
+
description: "vendor phone"
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
},
|
|
223
|
+
strategy: {
|
|
224
|
+
model: "claude-sonnet-4-20250514",
|
|
225
|
+
type: "HTML"
|
|
226
|
+
},
|
|
227
|
+
variantKey: label
|
|
228
|
+
};
|
|
229
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("td.vendor"), options);
|
|
230
|
+
(0, _extendedTest.expect)(first).toEqual(vendorAiResult);
|
|
231
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
232
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("td.vendor"), options);
|
|
233
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
234
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
235
|
+
});
|
|
236
|
+
(0, _extendedTest.test)("reuses the cache for a link whose href is stored resolved but read raw (relative href)", async ({
|
|
237
|
+
page
|
|
238
|
+
}) => {
|
|
239
|
+
const label = `href-${(0, _uuid.v4)()}`;
|
|
240
|
+
const hrefAiResult = {
|
|
241
|
+
attachment_href: "https://example.com/files/annual-report.pdf",
|
|
242
|
+
attachment_name: "Annual Report"
|
|
243
|
+
};
|
|
244
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(hrefAiResult);
|
|
245
|
+
await useInMemoryCache();
|
|
246
|
+
await page.setContent(`<!DOCTYPE html><html><head><base href="https://example.com/files/"></head><body>` + `<div class="doc"><a href="annual-report.pdf">Annual Report</a></div>` + `</body></html>`);
|
|
247
|
+
const options = {
|
|
248
|
+
entityName: "attachment",
|
|
249
|
+
label,
|
|
250
|
+
entitySchema: {
|
|
251
|
+
type: "object",
|
|
252
|
+
required: ["attachment_href"],
|
|
253
|
+
properties: {
|
|
254
|
+
attachment_href: {
|
|
255
|
+
type: "string",
|
|
256
|
+
description: "the file url (href)"
|
|
257
|
+
},
|
|
258
|
+
attachment_name: {
|
|
259
|
+
type: "string",
|
|
260
|
+
description: "the file name"
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
},
|
|
264
|
+
strategy: {
|
|
265
|
+
model: "claude-sonnet-4-20250514",
|
|
266
|
+
type: "HTML"
|
|
267
|
+
},
|
|
268
|
+
variantKey: label
|
|
269
|
+
};
|
|
270
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("div.doc"), options);
|
|
271
|
+
(0, _extendedTest.expect)(first).toEqual(hrefAiResult);
|
|
272
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
273
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("div.doc"), options);
|
|
274
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
275
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
276
|
+
});
|
|
277
|
+
(0, _extendedTest.test)("reuses the cache for a FUZZY match (AI value differs slightly from the DOM)", async ({
|
|
278
|
+
page
|
|
279
|
+
}) => {
|
|
280
|
+
const label = `fuzzy-${(0, _uuid.v4)()}`;
|
|
281
|
+
const fuzzyAiResult = {
|
|
282
|
+
company_name: "Internationl Cleaning Servces"
|
|
283
|
+
};
|
|
284
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(fuzzyAiResult);
|
|
285
|
+
await useInMemoryCache();
|
|
286
|
+
await page.setContent(`<div class="vendor"><span class="name">International Cleaning Services</span></div>`);
|
|
287
|
+
const options = {
|
|
288
|
+
entityName: "vendor",
|
|
289
|
+
label,
|
|
290
|
+
entitySchema: {
|
|
291
|
+
type: "object",
|
|
292
|
+
required: ["company_name"],
|
|
293
|
+
properties: {
|
|
294
|
+
company_name: {
|
|
295
|
+
type: "string",
|
|
296
|
+
description: "the company name"
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
},
|
|
300
|
+
strategy: {
|
|
301
|
+
model: "claude-sonnet-4-20250514",
|
|
302
|
+
type: "HTML"
|
|
303
|
+
},
|
|
304
|
+
variantKey: label
|
|
305
|
+
};
|
|
306
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
307
|
+
(0, _extendedTest.expect)(first).toEqual({
|
|
308
|
+
company_name: "International Cleaning Services"
|
|
309
|
+
});
|
|
310
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
311
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
312
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
313
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
314
|
+
});
|
|
107
315
|
});
|
|
@@ -88,11 +88,13 @@ async function dynamicObjectExtractor(page, identifier, options) {
|
|
|
88
88
|
Object.entries(xpathMappingFromAI).forEach(([_propertyName, {
|
|
89
89
|
matchXpath,
|
|
90
90
|
matchText,
|
|
91
|
-
matchType
|
|
91
|
+
matchType,
|
|
92
|
+
sourceText
|
|
92
93
|
}]) => {
|
|
93
94
|
xpathMapping[matchText] = [{
|
|
94
95
|
xpath: matchXpath,
|
|
95
|
-
matchType
|
|
96
|
+
matchType,
|
|
97
|
+
sourceText
|
|
96
98
|
}];
|
|
97
99
|
});
|
|
98
100
|
const resultsToCache = {
|
|
@@ -5,22 +5,21 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
5
5
|
});
|
|
6
6
|
exports.strategySchema = exports.simpleObjectJsonSchema = exports.simpleArrayItemJsonSchema = exports.extractObjectOptimizedInputSchema = exports.extractArrayOptimizedInputSchema = void 0;
|
|
7
7
|
var _zod = require("zod");
|
|
8
|
-
var _aiModelsValidation = require("./types/aiModelsValidation");
|
|
9
8
|
const htmlStrategySchema = _zod.z.object({
|
|
10
|
-
model: _zod.z.
|
|
9
|
+
model: _zod.z.string({
|
|
11
10
|
required_error: "strategy model is required",
|
|
12
11
|
invalid_type_error: "strategy model is invalid"
|
|
13
|
-
}),
|
|
12
|
+
}).min(1, "strategy model is required"),
|
|
14
13
|
type: _zod.z.literal("HTML", {
|
|
15
14
|
required_error: "strategy type is required",
|
|
16
15
|
invalid_type_error: "strategy type is invalid"
|
|
17
16
|
})
|
|
18
17
|
});
|
|
19
18
|
const imageStrategySchema = _zod.z.object({
|
|
20
|
-
model: _zod.z.
|
|
19
|
+
model: _zod.z.string({
|
|
21
20
|
required_error: "strategy model is required",
|
|
22
21
|
invalid_type_error: "strategy model is invalid"
|
|
23
|
-
}),
|
|
22
|
+
}).min(1, "strategy model is required"),
|
|
24
23
|
type: _zod.z.literal("IMAGE", {
|
|
25
24
|
required_error: "strategy type is required",
|
|
26
25
|
invalid_type_error: "strategy type is invalid"
|
|
@@ -114,7 +113,7 @@ const strategySchema = exports.strategySchema = _zod.z.union([htmlStrategySchema
|
|
|
114
113
|
};
|
|
115
114
|
}
|
|
116
115
|
}).optional().default({
|
|
117
|
-
model: "claude-
|
|
116
|
+
model: "claude-haiku-4-5-20251001",
|
|
118
117
|
type: "HTML"
|
|
119
118
|
});
|
|
120
119
|
const labelSchema = _zod.z.string({
|
package/package.json
CHANGED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
Object.defineProperty(exports, "__esModule", {
|
|
4
|
-
value: true
|
|
5
|
-
});
|
|
6
|
-
exports.SUPPORTED_VISION_MODELS = exports.SUPPORTED_TEXT_MODELS = exports.SUPPORTED_GPT_MODELS = exports.SUPPORTED_GOOGLE_MODELS = exports.SUPPORTED_CLAUDE_MODELS = exports.MODELS_MAPPINGS = exports.MAX_TOKENS_OVERRIDES = exports.GPT_MODELS_MAPPINGS = exports.GOOGLE_MODELS_MAPPINGS = exports.CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_ONLY_TEXT_MODELS = exports.CLAUDE_MODELS_MAPPINGS = void 0;
|
|
7
|
-
const CLAUDE_ONLY_TEXT_MODELS = exports.CLAUDE_ONLY_TEXT_MODELS = ["claude-3-5-haiku", "claude-3-5-haiku-20241022"];
|
|
8
|
-
const CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_VISION_SUPPORTED_MODELS = ["claude-3-haiku", "claude-3-haiku-20240307", "claude-3.5-sonnet", "claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022", "claude-opus-4", "claude-opus-4-20250514", "claude-sonnet-4", "claude-sonnet-4-20250514"];
|
|
9
|
-
const SUPPORTED_CLAUDE_MODELS = exports.SUPPORTED_CLAUDE_MODELS = [...CLAUDE_ONLY_TEXT_MODELS, ...CLAUDE_VISION_SUPPORTED_MODELS];
|
|
10
|
-
const CLAUDE_MODELS_MAPPINGS = exports.CLAUDE_MODELS_MAPPINGS = {
|
|
11
|
-
"claude-3-haiku": "claude-3-haiku-20240307",
|
|
12
|
-
"claude-3-5-haiku": "claude-3-5-haiku-20241022",
|
|
13
|
-
"claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
|
|
14
|
-
"claude-opus-4": "claude-opus-4-20250514",
|
|
15
|
-
"claude-sonnet-4": "claude-sonnet-4-20250514"
|
|
16
|
-
};
|
|
17
|
-
const GPT_ONLY_TEXT_GPT_MODELS = ["gpt3.5-turbo", "gpt-3.5-turbo-0125"];
|
|
18
|
-
const GPT_VISION_SUPPORTED_MODELS = ["gpt4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4o", "gpt-4o-2024-05-13", "gpt-4o-mini", "gpt-4o-mini-2024-07-18"];
|
|
19
|
-
const SUPPORTED_GPT_MODELS = exports.SUPPORTED_GPT_MODELS = [...GPT_ONLY_TEXT_GPT_MODELS, ...GPT_VISION_SUPPORTED_MODELS];
|
|
20
|
-
const GPT_MODELS_MAPPINGS = exports.GPT_MODELS_MAPPINGS = {
|
|
21
|
-
"gpt4-turbo": "gpt-4-turbo-2024-04-09",
|
|
22
|
-
"gpt3.5-turbo": "gpt-3.5-turbo-0125",
|
|
23
|
-
"gpt-4o": "gpt-4o-2024-05-13",
|
|
24
|
-
"gpt-4o-mini": "gpt-4o-mini-2024-07-18"
|
|
25
|
-
};
|
|
26
|
-
const SUPPORTED_GOOGLE_MODELS = exports.SUPPORTED_GOOGLE_MODELS = ["gemini-1.5-pro", "gemini-1.5-pro-002", "gemini-1.5-flash-8b", "gemini-1.5-flash-8b-002", "gemini-1.5-flash", "gemini-1.5-flash-002", "gemini-2.0-flash-exp"];
|
|
27
|
-
const GOOGLE_MODELS_MAPPINGS = exports.GOOGLE_MODELS_MAPPINGS = {
|
|
28
|
-
"gemini-1.5-pro": "gemini-1.5-pro-002",
|
|
29
|
-
"gemini-1.5-flash-8b": "gemini-1.5-flash-8b-002",
|
|
30
|
-
"gemini-1.5-flash": "gemini-1.5-flash-002"
|
|
31
|
-
};
|
|
32
|
-
const SUPPORTED_TEXT_MODELS = exports.SUPPORTED_TEXT_MODELS = [...SUPPORTED_CLAUDE_MODELS, ...SUPPORTED_GPT_MODELS, ...SUPPORTED_GOOGLE_MODELS];
|
|
33
|
-
const SUPPORTED_VISION_MODELS = exports.SUPPORTED_VISION_MODELS = [...CLAUDE_VISION_SUPPORTED_MODELS, ...GPT_VISION_SUPPORTED_MODELS, ...SUPPORTED_GOOGLE_MODELS];
|
|
34
|
-
const MODELS_MAPPINGS = exports.MODELS_MAPPINGS = {
|
|
35
|
-
...GPT_MODELS_MAPPINGS,
|
|
36
|
-
...CLAUDE_MODELS_MAPPINGS,
|
|
37
|
-
...GOOGLE_MODELS_MAPPINGS
|
|
38
|
-
};
|
|
39
|
-
const MAX_TOKENS_OVERRIDES = exports.MAX_TOKENS_OVERRIDES = {
|
|
40
|
-
"claude-3-5-sonnet-20240620": 8192,
|
|
41
|
-
"gemini-1.5-pro-002": 8192,
|
|
42
|
-
"gemini-1.5-flash-8b-002": 8192,
|
|
43
|
-
"gemini-1.5-flash-002": 8192,
|
|
44
|
-
"gemini-2.0-flash-exp": 8192
|
|
45
|
-
};
|