@intuned/browser-dev 0.1.15-dev.0 → 0.1.16-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +3 -0
- package/dist/ai/isPageLoaded.js +6 -6
- package/dist/ai/tests/extractStructuredDataDomMatchingCache.spec.js +87 -0
- package/dist/ai/tests/testMatching.spec.js +38 -0
- package/dist/ai/tests/testValidateMatchesMapping.spec.js +58 -0
- package/dist/common/matching/matching.js +3 -3
- package/dist/common/xpathMapping.js +23 -10
- package/dist/helpers/downloadFile.js +3 -0
- package/dist/helpers/saveFileToS3.js +3 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +16 -0
- package/dist/optimized-extractors/common/index.js +4 -4
- package/dist/optimized-extractors/common/matching/utils.js +4 -2
- package/dist/optimized-extractors/common/modelStringSupport.test.js +82 -0
- package/dist/optimized-extractors/export.d.ts +2 -50
- package/dist/optimized-extractors/extractArray.js +3 -1
- package/dist/optimized-extractors/index.d.ts +2 -50
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +366 -1
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +43 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +4 -3
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +31 -22
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +2 -1
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +208 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +4 -2
- package/dist/optimized-extractors/validators.js +4 -5
- package/package.json +1 -1
package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
var _extendedTest = require("../../../common/extendedTest");
|
|
4
4
|
var _dynamicListExtractor = require("../dynamicListExtractor");
|
|
5
|
+
var _ = require("../../");
|
|
5
6
|
var _uuid = require("uuid");
|
|
6
7
|
var _dotenv = require("dotenv");
|
|
7
8
|
var _neverthrow = require("neverthrow");
|
|
@@ -249,7 +250,6 @@ _extendedTest.describe.skip("Dynamic List Extractor Caching Tests", () => {
|
|
|
249
250
|
fullContainerXpath: "/html/body/div[1]"
|
|
250
251
|
}));
|
|
251
252
|
const consoleSpy = _extendedTest.vi.spyOn(_Logger.logger, "debug");
|
|
252
|
-
const consoleInfoSpy = _extendedTest.vi.spyOn(_Logger.logger, "info");
|
|
253
253
|
const consoleWarnSpy = _extendedTest.vi.spyOn(_Logger.logger, "warn");
|
|
254
254
|
await page.setContent(simpleTemplate);
|
|
255
255
|
const firstResult = await (0, _dynamicListExtractor.dynamicListExtractor)(page, ".books-list", extractionOptions);
|
|
@@ -266,4 +266,369 @@ _extendedTest.describe.skip("Dynamic List Extractor Caching Tests", () => {
|
|
|
266
266
|
console.log("Cache size limit test completed successfully!");
|
|
267
267
|
});
|
|
268
268
|
});
|
|
269
|
+
});
|
|
270
|
+
const FULL_CONTAINER_XPATH = "html[1]/body[1]/div[1]/table[1]/tbody[1]";
|
|
271
|
+
const RELATIVE_CONTAINER_PATH = "div[1]/table[1]/tbody[1]";
|
|
272
|
+
const categoriesEntitySchema = {
|
|
273
|
+
type: "object",
|
|
274
|
+
required: ["unspsc"],
|
|
275
|
+
properties: {
|
|
276
|
+
unspsc: {
|
|
277
|
+
type: "string",
|
|
278
|
+
description: "extract UNSPSC",
|
|
279
|
+
primary: true
|
|
280
|
+
},
|
|
281
|
+
unspsc_description: {
|
|
282
|
+
type: "string",
|
|
283
|
+
description: "extract UNSPSC description"
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
};
|
|
287
|
+
/**
 * Builds the extractor options used by the "categories" list tests.
 *
 * @param {string} label - Used both as the `label` and as the `variantKey` of the options.
 * @param {*} searchRegion - Passed through untouched as `searchRegion`.
 * @returns {object} Options object referencing the shared `categoriesEntitySchema`.
 */
function makeCategoriesOptions(label, searchRegion) {
  const strategy = {
    model: "claude-sonnet-4-20250514",
    type: "HTML"
  };
  const options = {
    itemEntityName: "categories",
    label,
    itemEntitySchema: categoriesEntitySchema,
    strategy,
    variantKey: label,
    searchRegion
  };
  return options;
}
|
|
300
|
+
const categoriesTemplate = `
|
|
301
|
+
<div class="categories">
|
|
302
|
+
<table>
|
|
303
|
+
<tbody>
|
|
304
|
+
<tr><td>45111901</td><td>Audioconferencing systems</td></tr>
|
|
305
|
+
<tr><td>81111809</td><td>System installation services</td></tr>
|
|
306
|
+
</tbody>
|
|
307
|
+
</table>
|
|
308
|
+
</div>
|
|
309
|
+
`;
|
|
310
|
+
const mockResultValues = [{
|
|
311
|
+
rowIndex: 0,
|
|
312
|
+
result: {
|
|
313
|
+
unspsc: {
|
|
314
|
+
matchText: "45111901",
|
|
315
|
+
matchXpath: `${FULL_CONTAINER_XPATH}/tr[1]/td[1]`,
|
|
316
|
+
matchType: "direct-text",
|
|
317
|
+
sourceText: "45111901"
|
|
318
|
+
},
|
|
319
|
+
unspsc_description: {
|
|
320
|
+
matchText: "Audioconferencing systems",
|
|
321
|
+
matchXpath: `${FULL_CONTAINER_XPATH}/tr[1]/td[2]`,
|
|
322
|
+
matchType: "direct-text",
|
|
323
|
+
sourceText: "Audioconferencing systems"
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
}, {
|
|
327
|
+
rowIndex: 1,
|
|
328
|
+
result: {
|
|
329
|
+
unspsc: {
|
|
330
|
+
matchText: "81111809",
|
|
331
|
+
matchXpath: `${FULL_CONTAINER_XPATH}/tr[2]/td[1]`,
|
|
332
|
+
matchType: "direct-text",
|
|
333
|
+
sourceText: "81111809"
|
|
334
|
+
},
|
|
335
|
+
unspsc_description: {
|
|
336
|
+
matchText: "System installation services",
|
|
337
|
+
matchXpath: `${FULL_CONTAINER_XPATH}/tr[2]/td[2]`,
|
|
338
|
+
matchType: "direct-text",
|
|
339
|
+
sourceText: "System installation services"
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
}];
|
|
343
|
+
const expectedCategories = [{
|
|
344
|
+
unspsc: "45111901",
|
|
345
|
+
unspsc_description: "Audioconferencing systems"
|
|
346
|
+
}, {
|
|
347
|
+
unspsc: "81111809",
|
|
348
|
+
unspsc_description: "System installation services"
|
|
349
|
+
}];
|
|
350
|
+
(0, _extendedTest.describe)("dynamicListExtractor - search region cache reuse", () => {
|
|
351
|
+
(0, _extendedTest.test)("reuses the cache on a second identical call (no AI re-extraction)", async ({
|
|
352
|
+
page
|
|
353
|
+
}) => {
|
|
354
|
+
const label = `categories-search-region-${(0, _uuid.v4)()}`;
|
|
355
|
+
const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
|
|
356
|
+
const store = new Map();
|
|
357
|
+
const cacheGetSpy = _extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
|
|
358
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
|
|
359
|
+
store.set(key, value);
|
|
360
|
+
});
|
|
361
|
+
const runAiExtractionModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../runAiExtraction")));
|
|
362
|
+
const runAiExtractionSpy = _extendedTest.vi.spyOn(runAiExtractionModule, "runAiExtraction").mockResolvedValue((0, _neverthrow.ok)({
|
|
363
|
+
resultValues: mockResultValues,
|
|
364
|
+
containerPath: RELATIVE_CONTAINER_PATH,
|
|
365
|
+
fullContainerXpath: FULL_CONTAINER_XPATH,
|
|
366
|
+
matches: new Map()
|
|
367
|
+
}));
|
|
368
|
+
await page.setContent(categoriesTemplate);
|
|
369
|
+
const searchRegion = page.locator("div.categories");
|
|
370
|
+
const first = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
371
|
+
(0, _extendedTest.expect)(first.isOk()).toBe(true);
|
|
372
|
+
(0, _extendedTest.expect)(first._unsafeUnwrap()).toEqual(expectedCategories);
|
|
373
|
+
(0, _extendedTest.expect)(runAiExtractionSpy).toHaveBeenCalledTimes(1);
|
|
374
|
+
const second = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
375
|
+
(0, _extendedTest.expect)(second.isOk()).toBe(true);
|
|
376
|
+
(0, _extendedTest.expect)(second._unsafeUnwrap()).toEqual(expectedCategories);
|
|
377
|
+
(0, _extendedTest.expect)(runAiExtractionSpy).toHaveBeenCalledTimes(1);
|
|
378
|
+
(0, _extendedTest.expect)(cacheGetSpy).toHaveBeenCalledTimes(2);
|
|
379
|
+
runAiExtractionSpy.mockRestore();
|
|
380
|
+
});
|
|
381
|
+
(0, _extendedTest.test)("caches a relative xpath mapping, not an absolute html/body path", async ({
|
|
382
|
+
page
|
|
383
|
+
}) => {
|
|
384
|
+
const label = `categories-mapping-${(0, _uuid.v4)()}`;
|
|
385
|
+
const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
|
|
386
|
+
const store = new Map();
|
|
387
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
|
|
388
|
+
const cacheSetSpy = _extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
|
|
389
|
+
store.set(key, value);
|
|
390
|
+
});
|
|
391
|
+
const runAiExtractionModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../runAiExtraction")));
|
|
392
|
+
const runAiExtractionSpy = _extendedTest.vi.spyOn(runAiExtractionModule, "runAiExtraction").mockResolvedValue((0, _neverthrow.ok)({
|
|
393
|
+
resultValues: mockResultValues,
|
|
394
|
+
containerPath: RELATIVE_CONTAINER_PATH,
|
|
395
|
+
fullContainerXpath: FULL_CONTAINER_XPATH,
|
|
396
|
+
matches: new Map()
|
|
397
|
+
}));
|
|
398
|
+
await page.setContent(categoriesTemplate);
|
|
399
|
+
const searchRegion = page.locator("div.categories");
|
|
400
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
401
|
+
(0, _extendedTest.expect)(cacheSetSpy).toHaveBeenCalledTimes(1);
|
|
402
|
+
const cached = cacheSetSpy.mock.calls[0][1];
|
|
403
|
+
const allXpaths = Object.values(cached.matchesMapping).flat().map(entry => entry.xpath);
|
|
404
|
+
(0, _extendedTest.expect)(allXpaths.length).toBeGreaterThan(0);
|
|
405
|
+
for (const xpath of allXpaths) {
|
|
406
|
+
(0, _extendedTest.expect)(xpath).not.toContain("html[1]/body[1]");
|
|
407
|
+
const resolves = await page.evaluate(({
|
|
408
|
+
prefix,
|
|
409
|
+
rel
|
|
410
|
+
}) => document.evaluate(`${prefix}/${rel}`, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue !== null, {
|
|
411
|
+
prefix: FULL_CONTAINER_XPATH,
|
|
412
|
+
rel: xpath
|
|
413
|
+
});
|
|
414
|
+
(0, _extendedTest.expect)(resolves).toBe(true);
|
|
415
|
+
}
|
|
416
|
+
runAiExtractionSpy.mockRestore();
|
|
417
|
+
});
|
|
418
|
+
});
|
|
419
|
+
/**
 * Builds one mocked extraction row for the two-column categories table.
 *
 * @param {number} rowIndexZeroBased - Zero-based row index; the XPath uses the 1-based `tr[...]` position.
 * @param {string} unspsc - Matched text for the first cell (`td[1]`).
 * @param {string} description - Matched text for the second cell (`td[2]`).
 * @param {{unspsc?: string, description?: string}} [sources] - Optional `sourceText` overrides;
 *   when a key is null/undefined the matched text itself is used as the source text.
 * @returns {{rowIndex: number, result: object}} Row in the shape consumed as `resultValues` entries.
 */
function row(rowIndexZeroBased, unspsc, description, sources) {
  const rowXpath = `${FULL_CONTAINER_XPATH}/tr[${rowIndexZeroBased + 1}]`;
  // Both cells share the same shape; only the column, text and source override differ.
  const directTextMatch = (column, matchText, sourceOverride) => ({
    matchText,
    matchXpath: `${rowXpath}/td[${column}]`,
    matchType: "direct-text",
    sourceText: sourceOverride ?? matchText
  });
  return {
    rowIndex: rowIndexZeroBased,
    result: {
      unspsc: directTextMatch(1, unspsc, sources?.unspsc),
      unspsc_description: directTextMatch(2, description, sources?.description)
    }
  };
}
|
|
439
|
+
/**
 * Wraps mocked rows in the successful result shape the tests feed to the
 * `runAiExtraction` spy.
 *
 * @param {Array<object>} rows - Row objects used as `resultValues`.
 * @returns {*} `ok(...)` result carrying the rows, both container paths and an empty `matches` Map.
 */
function categoriesAiResult(rows) {
  const payload = {
    resultValues: rows,
    containerPath: RELATIVE_CONTAINER_PATH,
    fullContainerXpath: FULL_CONTAINER_XPATH,
    matches: new Map()
  };
  return (0, _neverthrow.ok)(payload);
}
|
|
447
|
+
/**
 * Renders the `div.categories` table fixture used by the cache-validation tests.
 *
 * @param {Array<[string, string]>} cells - One `[firstCell, secondCell]` pair per table row.
 * @returns {string} HTML for a `div.categories > table > tbody` with one `<tr>` per pair.
 */
function tableHtml(cells) {
  const renderRow = ([code, text]) => `<tr><td>${code}</td><td>${text}</td></tr>`;
  const renderedRows = cells.map(renderRow).join("\n ");
  const lines = [
    "",
    `<div class="categories">`,
    "<table>",
    "<tbody>",
    renderedRows,
    "</tbody>",
    "</table>",
    "</div>",
    ""
  ];
  return lines.join("\n");
}
|
|
459
|
+
/**
 * Installs the spies shared by the cache-validation tests.
 *
 * Replaces `cache.get` / `cache.set` with implementations backed by a fresh
 * in-memory Map (a missing key reads as `null`) and spies on `runAiExtraction`
 * without giving it an implementation, so each test configures its own
 * resolved values.
 *
 * @returns {Promise<{aiSpy: *, store: Map}>} The `runAiExtraction` spy and the Map backing the cache.
 */
async function setupCategoriesMocks() {
  const {
    cache
  } = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
  const store = new Map();
  const readFromStore = async key => store.has(key) ? store.get(key) : null;
  const writeToStore = async (key, value) => {
    store.set(key, value);
  };
  _extendedTest.vi.spyOn(cache, "get").mockImplementation(readFromStore);
  _extendedTest.vi.spyOn(cache, "set").mockImplementation(writeToStore);
  const extractionModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../runAiExtraction")));
  return {
    aiSpy: _extendedTest.vi.spyOn(extractionModule, "runAiExtraction"),
    store
  };
}
|
|
473
|
+
(0, _extendedTest.describe)("dynamicListExtractor - cache validation edge cases", () => {
|
|
474
|
+
(0, _extendedTest.afterEach)(() => {
|
|
475
|
+
_extendedTest.vi.restoreAllMocks();
|
|
476
|
+
});
|
|
477
|
+
(0, _extendedTest.test)("invalidates cache when extracted text changes", async ({
|
|
478
|
+
page
|
|
479
|
+
}) => {
|
|
480
|
+
const label = `validation-text-change-${(0, _uuid.v4)()}`;
|
|
481
|
+
const {
|
|
482
|
+
aiSpy
|
|
483
|
+
} = await setupCategoriesMocks();
|
|
484
|
+
aiSpy.mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")])).mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems UPDATED"), row(1, "81111809", "System installation services")]));
|
|
485
|
+
await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
|
|
486
|
+
const searchRegion = page.locator("div.categories");
|
|
487
|
+
const first = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
488
|
+
(0, _extendedTest.expect)(first.isOk()).toBe(true);
|
|
489
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
490
|
+
await page.evaluate(() => {
|
|
491
|
+
const td = document.querySelector("div.categories tbody tr:nth-child(1) td:nth-child(2)");
|
|
492
|
+
if (td) td.textContent = "Audioconferencing systems UPDATED";
|
|
493
|
+
});
|
|
494
|
+
const second = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
495
|
+
(0, _extendedTest.expect)(second.isOk()).toBe(true);
|
|
496
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
|
|
497
|
+
(0, _extendedTest.expect)(second._unsafeUnwrap()[0]).toEqual({
|
|
498
|
+
unspsc: "45111901",
|
|
499
|
+
unspsc_description: "Audioconferencing systems UPDATED"
|
|
500
|
+
});
|
|
501
|
+
});
|
|
502
|
+
(0, _extendedTest.test)("invalidates cache when a row is removed", async ({
|
|
503
|
+
page
|
|
504
|
+
}) => {
|
|
505
|
+
const label = `validation-row-removed-${(0, _uuid.v4)()}`;
|
|
506
|
+
const {
|
|
507
|
+
aiSpy
|
|
508
|
+
} = await setupCategoriesMocks();
|
|
509
|
+
aiSpy.mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")])).mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems")]));
|
|
510
|
+
await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
|
|
511
|
+
const searchRegion = page.locator("div.categories");
|
|
512
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
513
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
514
|
+
await page.evaluate(() => {
|
|
515
|
+
const tr = document.querySelector("div.categories tbody tr:nth-child(2)");
|
|
516
|
+
tr === null || tr === void 0 || tr.remove();
|
|
517
|
+
});
|
|
518
|
+
const second = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
519
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
|
|
520
|
+
(0, _extendedTest.expect)(second._unsafeUnwrap()).toHaveLength(1);
|
|
521
|
+
});
|
|
522
|
+
(0, _extendedTest.test)("reuses cache for a clean exact-match list", async ({
|
|
523
|
+
page
|
|
524
|
+
}) => {
|
|
525
|
+
const label = `validation-exact-reuse-${(0, _uuid.v4)()}`;
|
|
526
|
+
const {
|
|
527
|
+
aiSpy
|
|
528
|
+
} = await setupCategoriesMocks();
|
|
529
|
+
aiSpy.mockResolvedValue(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")]));
|
|
530
|
+
await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
|
|
531
|
+
const searchRegion = page.locator("div.categories");
|
|
532
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
533
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
534
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
535
|
+
});
|
|
536
|
+
(0, _extendedTest.test)("reuses cache when DOM text has irregular internal whitespace", async ({
|
|
537
|
+
page
|
|
538
|
+
}) => {
|
|
539
|
+
const label = `validation-whitespace-${(0, _uuid.v4)()}`;
|
|
540
|
+
const {
|
|
541
|
+
aiSpy
|
|
542
|
+
} = await setupCategoriesMocks();
|
|
543
|
+
aiSpy.mockResolvedValue(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")]));
|
|
544
|
+
await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
|
|
545
|
+
const searchRegion = page.locator("div.categories");
|
|
546
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
547
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
548
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
549
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
550
|
+
});
|
|
551
|
+
(0, _extendedTest.test)("reuses cache when match was partial (full source text matches)", async ({
|
|
552
|
+
page
|
|
553
|
+
}) => {
|
|
554
|
+
const label = `validation-partial-${(0, _uuid.v4)()}`;
|
|
555
|
+
const {
|
|
556
|
+
aiSpy
|
|
557
|
+
} = await setupCategoriesMocks();
|
|
558
|
+
aiSpy.mockResolvedValue(categoriesAiResult([row(0, "45111901", "Audioconferencing systems", {
|
|
559
|
+
description: "Audioconferencing systems and hardware controllers"
|
|
560
|
+
}), row(1, "81111809", "System installation services", {
|
|
561
|
+
description: "System installation services - admin"
|
|
562
|
+
})]));
|
|
563
|
+
await page.setContent(tableHtml([["45111901", "Audioconferencing systems and hardware controllers"], ["81111809", "System installation services - admin"]]));
|
|
564
|
+
const searchRegion = page.locator("div.categories");
|
|
565
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
566
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
567
|
+
await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
|
|
568
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
569
|
+
});
|
|
570
|
+
});
|
|
571
|
+
(0, _extendedTest.describe)("dynamicListExtractor - single-item container round-trip (real matcher, mocked AI)", () => {
|
|
572
|
+
(0, _extendedTest.afterEach)(() => {
|
|
573
|
+
_extendedTest.vi.restoreAllMocks();
|
|
574
|
+
});
|
|
575
|
+
(0, _extendedTest.test)("reuses cache for a single-row list where id and name are in sibling cells", async ({
|
|
576
|
+
page
|
|
577
|
+
}) => {
|
|
578
|
+
const label = `single-row-${(0, _uuid.v4)()}`;
|
|
579
|
+
const listAiModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../utils/extractStructuredListUsingAi")));
|
|
580
|
+
const aiSpy = _extendedTest.vi.spyOn(listAiModule, "extractStructuredListUsingAi").mockImplementation(async (_entityName, schema) => {
|
|
581
|
+
const props = Object.keys((schema === null || schema === void 0 ? void 0 : schema.properties) ?? {});
|
|
582
|
+
return (0, _neverthrow.ok)(props.length <= 1 ? [{
|
|
583
|
+
vendor: "V00000908"
|
|
584
|
+
}] : [{
|
|
585
|
+
vendor: "V00000908",
|
|
586
|
+
vendor_name: "FLAGPOLES INC."
|
|
587
|
+
}]);
|
|
588
|
+
});
|
|
589
|
+
const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
|
|
590
|
+
const store = new Map();
|
|
591
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
|
|
592
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
|
|
593
|
+
store.set(key, value);
|
|
594
|
+
});
|
|
595
|
+
await page.setContent(`
|
|
596
|
+
<table><tbody>
|
|
597
|
+
<tr><td>ID: <a href="#">V00000908</a></td><td>FLAGPOLES INC.</td></tr>
|
|
598
|
+
</tbody></table>
|
|
599
|
+
`);
|
|
600
|
+
const options = {
|
|
601
|
+
label,
|
|
602
|
+
itemEntityName: "vendors",
|
|
603
|
+
itemEntitySchema: {
|
|
604
|
+
type: "object",
|
|
605
|
+
properties: {
|
|
606
|
+
vendor: {
|
|
607
|
+
type: "string",
|
|
608
|
+
primary: true,
|
|
609
|
+
description: "the vendor id"
|
|
610
|
+
},
|
|
611
|
+
vendor_name: {
|
|
612
|
+
type: "string",
|
|
613
|
+
description: "the vendor name"
|
|
614
|
+
}
|
|
615
|
+
},
|
|
616
|
+
required: ["vendor", "vendor_name"]
|
|
617
|
+
},
|
|
618
|
+
strategy: {
|
|
619
|
+
model: "claude-sonnet-4-20250514",
|
|
620
|
+
type: "HTML"
|
|
621
|
+
},
|
|
622
|
+
variantKey: label
|
|
623
|
+
};
|
|
624
|
+
const first = await (0, _.extractArrayFromPage)(page, options);
|
|
625
|
+
(0, _extendedTest.expect)(first).toEqual([{
|
|
626
|
+
vendor: "V00000908",
|
|
627
|
+
vendor_name: "FLAGPOLES INC."
|
|
628
|
+
}]);
|
|
629
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
|
|
630
|
+
const second = await (0, _.extractArrayFromPage)(page, options);
|
|
631
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
632
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
|
|
633
|
+
});
|
|
269
634
|
});
|
|
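package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js
CHANGED
|
@@ -2,7 +2,9 @@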
|
|
|
2
2
|
|
|
3
3
|
var _extendedTest = require("../../../common/extendedTest");
|
|
4
4
|
var _ = require("../..");
|
|
5
|
+
var _neverthrow = require("neverthrow");
|
|
5
6
|
var _uuid = require("uuid");
|
|
7
|
+
// Module interop helper (appears to be transpiler-generated — do not hand-edit).
// - Returns `e` unchanged when `t` is falsy and `e.__esModule` is truthy.
// - Otherwise builds a null-prototype wrapper `{ default: e }`; for null or
//   non-object/non-function values that wrapper is returned as-is.
// - For objects/functions it copies every own enumerable key except "default",
//   re-defining accessor properties (get/set) via Object.defineProperty and
//   plainly assigning the rest.
// - When WeakMap exists, wrappers are cached per value in one of two WeakMaps
//   (chosen by `t`); the first call replaces `_interopRequireWildcard` with the
//   inner function so the caches are created only once.
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
6
8
|
const productListTemplate = `
|
|
7
9
|
<div class="products-container">
|
|
8
10
|
<div class="product-item">
|
|
@@ -143,4 +145,45 @@ _extendedTest.describe.skip("Array Extractor Caching Tests", () => {
|
|
|
143
145
|
console.log("All cache behavior tests completed successfully!");
|
|
144
146
|
});
|
|
145
147
|
});
|
|
148
|
+
});
|
|
149
|
+
(0, _extendedTest.describe)("extractArrayFromLocator - option forwarding", () => {
|
|
150
|
+
(0, _extendedTest.afterEach)(() => {
|
|
151
|
+
_extendedTest.vi.restoreAllMocks();
|
|
152
|
+
});
|
|
153
|
+
(0, _extendedTest.test)("forwards prompt, apiKey and variantKey to dynamicListExtractor", async ({
|
|
154
|
+
page
|
|
155
|
+
}) => {
|
|
156
|
+
const dleModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../dynamicListExtractor")));
|
|
157
|
+
const spy = _extendedTest.vi.spyOn(dleModule, "dynamicListExtractor").mockResolvedValue((0, _neverthrow.ok)([]));
|
|
158
|
+
await page.setContent(`<ul id="list"><li>item one</li></ul>`);
|
|
159
|
+
await (0, _.extractArrayFromLocator)(page.locator("#list"), {
|
|
160
|
+
label: "items",
|
|
161
|
+
itemEntityName: "items",
|
|
162
|
+
itemEntitySchema: {
|
|
163
|
+
type: "object",
|
|
164
|
+
properties: {
|
|
165
|
+
name: {
|
|
166
|
+
type: "string",
|
|
167
|
+
primary: true,
|
|
168
|
+
description: "the item name"
|
|
169
|
+
}
|
|
170
|
+
},
|
|
171
|
+
required: ["name"]
|
|
172
|
+
},
|
|
173
|
+
strategy: {
|
|
174
|
+
model: "claude-sonnet-4-20250514",
|
|
175
|
+
type: "HTML"
|
|
176
|
+
},
|
|
177
|
+
prompt: "MY_UNIQUE_PROMPT",
|
|
178
|
+
apiKey: "MY_API_KEY",
|
|
179
|
+
variantKey: "MY_VARIANT"
|
|
180
|
+
});
|
|
181
|
+
(0, _extendedTest.expect)(spy).toHaveBeenCalledTimes(1);
|
|
182
|
+
const passedOptions = spy.mock.calls[0][2];
|
|
183
|
+
(0, _extendedTest.expect)(passedOptions).toMatchObject({
|
|
184
|
+
prompt: "MY_UNIQUE_PROMPT",
|
|
185
|
+
apiKey: "MY_API_KEY",
|
|
186
|
+
variantKey: "MY_VARIANT"
|
|
187
|
+
});
|
|
188
|
+
});
|
|
146
189
|
});
|
|
package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js
CHANGED
|
@@ -59,7 +59,7 @@ async function dynamicListExtractor(page, identifier, options) {
|
|
|
59
59
|
if (cachedResult.exceedsLimit) {
|
|
60
60
|
_Logger.logger.warn(`Cache key ${extractorInputHash} exceeds cache limit and is not cacheable`);
|
|
61
61
|
} else {
|
|
62
|
-
const isValid = await (0, _xpathMapping.validateXPathMapping)(page, cachedResult.matchesMapping, cachedResult.
|
|
62
|
+
const isValid = await (0, _xpathMapping.validateXPathMapping)(page, cachedResult.matchesMapping, cachedResult.fullContainerXpath);
|
|
63
63
|
if (isValid) {
|
|
64
64
|
const nonRelatedChildrenCount = cachedResult.nonRelatedChildrenCount;
|
|
65
65
|
const currentChildrenCount = await page.evaluate(fullContainerXpath => {
|
|
@@ -127,7 +127,7 @@ async function dynamicListExtractor(page, identifier, options) {
|
|
|
127
127
|
return (0, _neverthrow.ok)(resultsToReturn);
|
|
128
128
|
}
|
|
129
129
|
function buildXpathsMapping(results) {
|
|
130
|
-
const containerXpath = results.
|
|
130
|
+
const containerXpath = results.fullContainerXpath;
|
|
131
131
|
const xpathsMapping = {};
|
|
132
132
|
for (const result of results.resultValues) {
|
|
133
133
|
for (const [_key, valueObj] of Object.entries(result.result)) {
|
|
@@ -137,7 +137,8 @@ function buildXpathsMapping(results) {
|
|
|
137
137
|
const relativePath = matchedXpath.replace(containerXpath + "/", "");
|
|
138
138
|
const xpathEntry = {
|
|
139
139
|
xpath: relativePath,
|
|
140
|
-
matchType: value.matchType
|
|
140
|
+
matchType: value.matchType,
|
|
141
|
+
sourceText: value.sourceText
|
|
141
142
|
};
|
|
142
143
|
if (!xpathsMapping[value.matchText]) {
|
|
143
144
|
xpathsMapping[value.matchText] = [];
|
|
package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js
CHANGED
|
@@ -21,6 +21,24 @@ var _findTableHeaders = require("../common/findTableHeaders");
|
|
|
21
21
|
var _Logger = require("../../common/Logger");
|
|
22
22
|
var _utils = require("../common/matching/utils");
|
|
23
23
|
// Module interop helper (appears to be transpiler-generated — do not hand-edit).
// - Returns `e` unchanged when `t` is falsy and `e.__esModule` is truthy.
// - Otherwise builds a null-prototype wrapper `{ default: e }`; for null or
//   non-object/non-function values that wrapper is returned as-is.
// - For objects/functions it copies every own enumerable key except "default",
//   re-defining accessor properties (get/set) via Object.defineProperty and
//   plainly assigning the rest.
// - When WeakMap exists, wrappers are cached per value in one of two WeakMaps
//   (chosen by `t`); the first call replaces `_interopRequireWildcard` with the
//   inner function so the caches are created only once.
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
24
|
+
/**
 * Finds the longest "/"-separated prefix shared by every XPath, treating it
 * as the container that holds all of the matched nodes.
 *
 * If the shared prefix is itself one of the inputs (e.g. a single XPath, or
 * one match that is an ancestor of the others) and has more than one segment,
 * the last segment is dropped so the result is the parent of that node.
 *
 * @param {string[]} xpaths - XPaths to compare segment by segment.
 * @returns {string|null} The shared ancestor XPath, or `null` when the list is
 *   empty or the paths share nothing but the leading "/" of absolute paths.
 */
function getCommonAncestorXpath(xpaths) {
  if (xpaths.length === 0) return null;
  const segmented = xpaths.map(xpath => xpath.split("/"));
  const [first, ...rest] = segmented;
  const minLen = Math.min(...segmented.map(segments => segments.length));
  let sharedLength = 0;
  while (sharedLength < minLen && rest.every(segments => segments[sharedLength] === first[sharedLength])) {
    sharedLength++;
  }
  const common = first.slice(0, sharedLength);
  // When the shared prefix is one of the matched nodes, step up to its parent.
  if (common.length > 1 && xpaths.includes(common.join("/"))) {
    common.pop();
  }
  const ancestor = common.join("/");
  // Absolute paths split into a leading "" segment; if that is all that is
  // shared, the join is "" — report it as "no common ancestor" instead of
  // leaking an empty XPath to callers.
  return ancestor === "" ? null : ancestor;
}
|
|
24
42
|
async function runAiExtraction(params) {
|
|
25
43
|
return handleNewAiExtraction(params);
|
|
26
44
|
}
|
|
@@ -82,7 +100,8 @@ async function handleNewAiExtraction(params) {
|
|
|
82
100
|
acc[key] = {
|
|
83
101
|
matchText: bestMatch.matchText,
|
|
84
102
|
matchXpath: bestMatch.matchXpath,
|
|
85
|
-
matchType: bestMatch.matchType
|
|
103
|
+
matchType: bestMatch.matchType,
|
|
104
|
+
sourceText: bestMatch.sourceText
|
|
86
105
|
};
|
|
87
106
|
} else {
|
|
88
107
|
_Logger.logger.debug(`value "${value}" for key "${key}" in row ${i + 1} does not have any matches in the page html, dropped for hallucination protection`);
|
|
@@ -100,26 +119,15 @@ async function handleNewAiExtraction(params) {
|
|
|
100
119
|
})));
|
|
101
120
|
let containerPath = null;
|
|
102
121
|
let fullContainerXpath = null;
|
|
103
|
-
if (resultValues.length > 0
|
|
104
|
-
const
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
return window.__INTUNED__.getElementXPath(element.parentElement);
|
|
113
|
-
}
|
|
114
|
-
return null;
|
|
115
|
-
}, primaryXpath);
|
|
116
|
-
if (parentXpath) {
|
|
117
|
-
fullContainerXpath = parentXpath;
|
|
118
|
-
if (hasSearchRegionContainer) {
|
|
119
|
-
containerPath = await (0, _getRelativeContainerXpathSelector.getRelativeContainerXpathSelector)(pageAndSearchRegion.searchRegion, parentXpath);
|
|
120
|
-
} else {
|
|
121
|
-
containerPath = parentXpath;
|
|
122
|
-
}
|
|
122
|
+
if (resultValues.length > 0) {
|
|
123
|
+
const propertyXpaths = Object.values(resultValues[0].result).map(value => value.matchXpath).filter(xpath => !!xpath);
|
|
124
|
+
const itemContainerXpath = getCommonAncestorXpath(propertyXpaths);
|
|
125
|
+
if (itemContainerXpath) {
|
|
126
|
+
fullContainerXpath = itemContainerXpath;
|
|
127
|
+
if (hasSearchRegionContainer) {
|
|
128
|
+
containerPath = await (0, _getRelativeContainerXpathSelector.getRelativeContainerXpathSelector)(pageAndSearchRegion.searchRegion, itemContainerXpath);
|
|
129
|
+
} else {
|
|
130
|
+
containerPath = itemContainerXpath;
|
|
123
131
|
}
|
|
124
132
|
}
|
|
125
133
|
}
|
|
@@ -240,7 +248,8 @@ async function splitDomAndExtractData({
|
|
|
240
248
|
acc[key] = {
|
|
241
249
|
matchText: bestMatch.matchText,
|
|
242
250
|
matchXpath: bestMatch.matchXpath,
|
|
243
|
-
matchType: bestMatch.matchType
|
|
251
|
+
matchType: bestMatch.matchType,
|
|
252
|
+
sourceText: bestMatch.sourceText
|
|
244
253
|
};
|
|
245
254
|
} else {
|
|
246
255
|
_Logger.logger.debug(`value "${value}" for key "${key}" in row ${i + 1} does not have any matches in the item's html, dropped for hallucination protection`);
|
|
package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js
CHANGED
|
@@ -59,7 +59,8 @@ async function runAIExtraction(pageAndSearchRegion, extractionInfo, snapshot, st
|
|
|
59
59
|
xpathMapping[propertyName] = {
|
|
60
60
|
matchXpath: bestMatch.matchXpath,
|
|
61
61
|
matchText: bestMatch.matchText,
|
|
62
|
-
matchType: bestMatch.matchType
|
|
62
|
+
matchType: bestMatch.matchType,
|
|
63
|
+
sourceText: bestMatch.sourceText
|
|
63
64
|
};
|
|
64
65
|
} else {
|
|
65
66
|
_Logger.logger.debug(`Property ${propertyName} not found in the page HTML, dropped for hallucination protection`);
|