@intuned/browser-dev 0.1.15-dev.1 → 0.1.16-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/.prettierrc +3 -0
  2. package/dist/ai/tests/extractStructuredDataDomMatchingCache.spec.js +87 -0
  3. package/dist/ai/tests/testMatching.spec.js +38 -0
  4. package/dist/ai/tests/testValidateMatchesMapping.spec.js +58 -0
  5. package/dist/common/matching/matching.js +3 -3
  6. package/dist/common/xpathMapping.js +23 -10
  7. package/dist/helpers/downloadFile.js +3 -0
  8. package/dist/helpers/saveFileToS3.js +3 -0
  9. package/dist/helpers/tests/testDownloadFile.spec.js +16 -0
  10. package/dist/optimized-extractors/common/aiModelsValidations.js +2 -21
  11. package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +2 -3
  12. package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +1 -4
  13. package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +1 -2
  14. package/dist/optimized-extractors/common/findTableHeaders.js +2 -2
  15. package/dist/optimized-extractors/common/index.js +4 -4
  16. package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +2 -2
  17. package/dist/optimized-extractors/common/matching/utils.js +4 -2
  18. package/dist/optimized-extractors/common/modelStringSupport.test.js +82 -0
  19. package/dist/optimized-extractors/export.d.ts +2 -50
  20. package/dist/optimized-extractors/extractArray.js +3 -1
  21. package/dist/optimized-extractors/index.d.ts +2 -50
  22. package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +366 -1
  23. package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +43 -0
  24. package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +4 -3
  25. package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +32 -23
  26. package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +2 -2
  27. package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +2 -1
  28. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +208 -0
  29. package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +4 -2
  30. package/dist/optimized-extractors/validators.js +5 -6
  31. package/package.json +1 -1
  32. package/dist/optimized-extractors/types/aiModelsValidation.js +0 -45
@@ -9,29 +9,7 @@ import { BasicSchema } from "./types/jsonSchema";
9
9
  * @property type - the type of the strategy
10
10
  */
11
11
  export interface ImageStrategy {
12
- model:
13
- | "claude-3-haiku"
14
- | "claude-3-haiku-20240307"
15
- | "claude-3.5-sonnet"
16
- | "claude-3-5-sonnet-20240620"
17
- | "claude-3-5-sonnet-20241022"
18
- | "claude-opus-4"
19
- | "claude-opus-4-20250514"
20
- | "claude-sonnet-4"
21
- | "claude-sonnet-4-20250514"
22
- | "gpt4-turbo"
23
- | "gpt-4-turbo-2024-04-09"
24
- | "gpt-4o"
25
- | "gpt-4o-2024-05-13"
26
- | "gpt-4o-mini"
27
- | "gpt-4o-mini-2024-07-18"
28
- | "gemini-1.5-pro"
29
- | "gemini-1.5-pro-002"
30
- | "gemini-1.5-flash-8b"
31
- | "gemini-1.5-flash-8b-002"
32
- | "gemini-1.5-flash"
33
- | "gemini-1.5-flash-002"
34
- | "gemini-2.0-flash-exp";
12
+ model: string;
35
13
  type: "IMAGE";
36
14
  }
37
15
  /**
@@ -43,33 +21,7 @@ export interface ImageStrategy {
43
21
  * @property type - the type of the strategy
44
22
  */
45
23
  export interface HtmlStrategy {
46
- model:
47
- | "claude-3-haiku"
48
- | "claude-3-haiku-20240307"
49
- | "claude-3-5-haiku"
50
- | "claude-3-5-haiku-20241022"
51
- | "claude-3.5-sonnet"
52
- | "claude-3-5-sonnet-20240620"
53
- | "claude-3-5-sonnet-20241022"
54
- | "claude-opus-4"
55
- | "claude-opus-4-20250514"
56
- | "claude-sonnet-4"
57
- | "claude-sonnet-4-20250514"
58
- | "gpt4-turbo"
59
- | "gpt-4-turbo-2024-04-09"
60
- | "gpt3.5-turbo"
61
- | "gpt-3.5-turbo-0125"
62
- | "gpt-4o"
63
- | "gpt-4o-2024-05-13"
64
- | "gpt-4o-mini"
65
- | "gpt-4o-mini-2024-07-18"
66
- | "gemini-1.5-pro"
67
- | "gemini-1.5-pro-002"
68
- | "gemini-1.5-flash-8b"
69
- | "gemini-1.5-flash-8b-002"
70
- | "gemini-1.5-flash"
71
- | "gemini-1.5-flash-002"
72
- | "gemini-2.0-flash-exp";
24
+ model: string;
73
25
  type: "HTML";
74
26
  }
75
27
  /**
@@ -81,7 +81,9 @@ const extractArrayFromLocator = async (locator, options) => {
81
81
  strategy: validOptions.strategy,
82
82
  optionalPropertiesInvalidator: validOptions.optionalPropertiesInvalidator,
83
83
  variantKey: validOptions.variantKey,
84
- searchRegion: locator
84
+ searchRegion: locator,
85
+ prompt: validOptions.prompt,
86
+ apiKey: validOptions.apiKey
85
87
  });
86
88
  if (result.isErr()) {
87
89
  switch (result.error.type) {
@@ -9,29 +9,7 @@ import { BasicSchema } from "./types/jsonSchema";
9
9
  * @property type - the type of the strategy
10
10
  */
11
11
  export interface ImageStrategy {
12
- model:
13
- | "claude-3-haiku"
14
- | "claude-3-haiku-20240307"
15
- | "claude-3.5-sonnet"
16
- | "claude-3-5-sonnet-20240620"
17
- | "claude-3-5-sonnet-20241022"
18
- | "claude-opus-4"
19
- | "claude-opus-4-20250514"
20
- | "claude-sonnet-4"
21
- | "claude-sonnet-4-20250514"
22
- | "gpt4-turbo"
23
- | "gpt-4-turbo-2024-04-09"
24
- | "gpt-4o"
25
- | "gpt-4o-2024-05-13"
26
- | "gpt-4o-mini"
27
- | "gpt-4o-mini-2024-07-18"
28
- | "gemini-1.5-pro"
29
- | "gemini-1.5-pro-002"
30
- | "gemini-1.5-flash-8b"
31
- | "gemini-1.5-flash-8b-002"
32
- | "gemini-1.5-flash"
33
- | "gemini-1.5-flash-002"
34
- | "gemini-2.0-flash-exp";
12
+ model: string;
35
13
  type: "IMAGE";
36
14
  }
37
15
  /**
@@ -43,33 +21,7 @@ export interface ImageStrategy {
43
21
  * @property type - the type of the strategy
44
22
  */
45
23
  export interface HtmlStrategy {
46
- model:
47
- | "claude-3-haiku"
48
- | "claude-3-haiku-20240307"
49
- | "claude-3-5-haiku"
50
- | "claude-3-5-haiku-20241022"
51
- | "claude-3.5-sonnet"
52
- | "claude-3-5-sonnet-20240620"
53
- | "claude-3-5-sonnet-20241022"
54
- | "claude-opus-4"
55
- | "claude-opus-4-20250514"
56
- | "claude-sonnet-4"
57
- | "claude-sonnet-4-20250514"
58
- | "gpt4-turbo"
59
- | "gpt-4-turbo-2024-04-09"
60
- | "gpt3.5-turbo"
61
- | "gpt-3.5-turbo-0125"
62
- | "gpt-4o"
63
- | "gpt-4o-2024-05-13"
64
- | "gpt-4o-mini"
65
- | "gpt-4o-mini-2024-07-18"
66
- | "gemini-1.5-pro"
67
- | "gemini-1.5-pro-002"
68
- | "gemini-1.5-flash-8b"
69
- | "gemini-1.5-flash-8b-002"
70
- | "gemini-1.5-flash"
71
- | "gemini-1.5-flash-002"
72
- | "gemini-2.0-flash-exp";
24
+ model: string;
73
25
  type: "HTML";
74
26
  }
75
27
  /**
@@ -2,6 +2,7 @@
2
2
 
3
3
  var _extendedTest = require("../../../common/extendedTest");
4
4
  var _dynamicListExtractor = require("../dynamicListExtractor");
5
+ var _ = require("../../");
5
6
  var _uuid = require("uuid");
6
7
  var _dotenv = require("dotenv");
7
8
  var _neverthrow = require("neverthrow");
@@ -249,7 +250,6 @@ _extendedTest.describe.skip("Dynamic List Extractor Caching Tests", () => {
249
250
  fullContainerXpath: "/html/body/div[1]"
250
251
  }));
251
252
  const consoleSpy = _extendedTest.vi.spyOn(_Logger.logger, "debug");
252
- const consoleInfoSpy = _extendedTest.vi.spyOn(_Logger.logger, "info");
253
253
  const consoleWarnSpy = _extendedTest.vi.spyOn(_Logger.logger, "warn");
254
254
  await page.setContent(simpleTemplate);
255
255
  const firstResult = await (0, _dynamicListExtractor.dynamicListExtractor)(page, ".books-list", extractionOptions);
@@ -266,4 +266,369 @@ _extendedTest.describe.skip("Dynamic List Extractor Caching Tests", () => {
266
266
  console.log("Cache size limit test completed successfully!");
267
267
  });
268
268
  });
269
+ });
270
+ const FULL_CONTAINER_XPATH = "html[1]/body[1]/div[1]/table[1]/tbody[1]";
271
+ const RELATIVE_CONTAINER_PATH = "div[1]/table[1]/tbody[1]";
272
+ const categoriesEntitySchema = {
273
+ type: "object",
274
+ required: ["unspsc"],
275
+ properties: {
276
+ unspsc: {
277
+ type: "string",
278
+ description: "extract UNSPSC",
279
+ primary: true
280
+ },
281
+ unspsc_description: {
282
+ type: "string",
283
+ description: "extract UNSPSC description"
284
+ }
285
+ }
286
+ };
287
+ function makeCategoriesOptions(label, searchRegion) {
288
+ return {
289
+ itemEntityName: "categories",
290
+ label,
291
+ itemEntitySchema: categoriesEntitySchema,
292
+ strategy: {
293
+ model: "claude-sonnet-4-20250514",
294
+ type: "HTML"
295
+ },
296
+ variantKey: label,
297
+ searchRegion: searchRegion
298
+ };
299
+ }
300
+ const categoriesTemplate = `
301
+ <div class="categories">
302
+ <table>
303
+ <tbody>
304
+ <tr><td>45111901</td><td>Audioconferencing systems</td></tr>
305
+ <tr><td>81111809</td><td>System installation services</td></tr>
306
+ </tbody>
307
+ </table>
308
+ </div>
309
+ `;
310
+ const mockResultValues = [{
311
+ rowIndex: 0,
312
+ result: {
313
+ unspsc: {
314
+ matchText: "45111901",
315
+ matchXpath: `${FULL_CONTAINER_XPATH}/tr[1]/td[1]`,
316
+ matchType: "direct-text",
317
+ sourceText: "45111901"
318
+ },
319
+ unspsc_description: {
320
+ matchText: "Audioconferencing systems",
321
+ matchXpath: `${FULL_CONTAINER_XPATH}/tr[1]/td[2]`,
322
+ matchType: "direct-text",
323
+ sourceText: "Audioconferencing systems"
324
+ }
325
+ }
326
+ }, {
327
+ rowIndex: 1,
328
+ result: {
329
+ unspsc: {
330
+ matchText: "81111809",
331
+ matchXpath: `${FULL_CONTAINER_XPATH}/tr[2]/td[1]`,
332
+ matchType: "direct-text",
333
+ sourceText: "81111809"
334
+ },
335
+ unspsc_description: {
336
+ matchText: "System installation services",
337
+ matchXpath: `${FULL_CONTAINER_XPATH}/tr[2]/td[2]`,
338
+ matchType: "direct-text",
339
+ sourceText: "System installation services"
340
+ }
341
+ }
342
+ }];
343
+ const expectedCategories = [{
344
+ unspsc: "45111901",
345
+ unspsc_description: "Audioconferencing systems"
346
+ }, {
347
+ unspsc: "81111809",
348
+ unspsc_description: "System installation services"
349
+ }];
350
+ (0, _extendedTest.describe)("dynamicListExtractor - search region cache reuse", () => {
351
+ (0, _extendedTest.test)("reuses the cache on a second identical call (no AI re-extraction)", async ({
352
+ page
353
+ }) => {
354
+ const label = `categories-search-region-${(0, _uuid.v4)()}`;
355
+ const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
356
+ const store = new Map();
357
+ const cacheGetSpy = _extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
358
+ _extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
359
+ store.set(key, value);
360
+ });
361
+ const runAiExtractionModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../runAiExtraction")));
362
+ const runAiExtractionSpy = _extendedTest.vi.spyOn(runAiExtractionModule, "runAiExtraction").mockResolvedValue((0, _neverthrow.ok)({
363
+ resultValues: mockResultValues,
364
+ containerPath: RELATIVE_CONTAINER_PATH,
365
+ fullContainerXpath: FULL_CONTAINER_XPATH,
366
+ matches: new Map()
367
+ }));
368
+ await page.setContent(categoriesTemplate);
369
+ const searchRegion = page.locator("div.categories");
370
+ const first = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
371
+ (0, _extendedTest.expect)(first.isOk()).toBe(true);
372
+ (0, _extendedTest.expect)(first._unsafeUnwrap()).toEqual(expectedCategories);
373
+ (0, _extendedTest.expect)(runAiExtractionSpy).toHaveBeenCalledTimes(1);
374
+ const second = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
375
+ (0, _extendedTest.expect)(second.isOk()).toBe(true);
376
+ (0, _extendedTest.expect)(second._unsafeUnwrap()).toEqual(expectedCategories);
377
+ (0, _extendedTest.expect)(runAiExtractionSpy).toHaveBeenCalledTimes(1);
378
+ (0, _extendedTest.expect)(cacheGetSpy).toHaveBeenCalledTimes(2);
379
+ runAiExtractionSpy.mockRestore();
380
+ });
381
+ (0, _extendedTest.test)("caches a relative xpath mapping, not an absolute html/body path", async ({
382
+ page
383
+ }) => {
384
+ const label = `categories-mapping-${(0, _uuid.v4)()}`;
385
+ const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
386
+ const store = new Map();
387
+ _extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
388
+ const cacheSetSpy = _extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
389
+ store.set(key, value);
390
+ });
391
+ const runAiExtractionModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../runAiExtraction")));
392
+ const runAiExtractionSpy = _extendedTest.vi.spyOn(runAiExtractionModule, "runAiExtraction").mockResolvedValue((0, _neverthrow.ok)({
393
+ resultValues: mockResultValues,
394
+ containerPath: RELATIVE_CONTAINER_PATH,
395
+ fullContainerXpath: FULL_CONTAINER_XPATH,
396
+ matches: new Map()
397
+ }));
398
+ await page.setContent(categoriesTemplate);
399
+ const searchRegion = page.locator("div.categories");
400
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
401
+ (0, _extendedTest.expect)(cacheSetSpy).toHaveBeenCalledTimes(1);
402
+ const cached = cacheSetSpy.mock.calls[0][1];
403
+ const allXpaths = Object.values(cached.matchesMapping).flat().map(entry => entry.xpath);
404
+ (0, _extendedTest.expect)(allXpaths.length).toBeGreaterThan(0);
405
+ for (const xpath of allXpaths) {
406
+ (0, _extendedTest.expect)(xpath).not.toContain("html[1]/body[1]");
407
+ const resolves = await page.evaluate(({
408
+ prefix,
409
+ rel
410
+ }) => document.evaluate(`${prefix}/${rel}`, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue !== null, {
411
+ prefix: FULL_CONTAINER_XPATH,
412
+ rel: xpath
413
+ });
414
+ (0, _extendedTest.expect)(resolves).toBe(true);
415
+ }
416
+ runAiExtractionSpy.mockRestore();
417
+ });
418
+ });
419
+ function row(rowIndexZeroBased, unspsc, description, sources) {
420
+ const tr = rowIndexZeroBased + 1;
421
+ return {
422
+ rowIndex: rowIndexZeroBased,
423
+ result: {
424
+ unspsc: {
425
+ matchText: unspsc,
426
+ matchXpath: `${FULL_CONTAINER_XPATH}/tr[${tr}]/td[1]`,
427
+ matchType: "direct-text",
428
+ sourceText: (sources === null || sources === void 0 ? void 0 : sources.unspsc) ?? unspsc
429
+ },
430
+ unspsc_description: {
431
+ matchText: description,
432
+ matchXpath: `${FULL_CONTAINER_XPATH}/tr[${tr}]/td[2]`,
433
+ matchType: "direct-text",
434
+ sourceText: (sources === null || sources === void 0 ? void 0 : sources.description) ?? description
435
+ }
436
+ }
437
+ };
438
+ }
439
+ function categoriesAiResult(rows) {
440
+ return (0, _neverthrow.ok)({
441
+ resultValues: rows,
442
+ containerPath: RELATIVE_CONTAINER_PATH,
443
+ fullContainerXpath: FULL_CONTAINER_XPATH,
444
+ matches: new Map()
445
+ });
446
+ }
447
+ function tableHtml(cells) {
448
+ const rows = cells.map(([a, b]) => `<tr><td>${a}</td><td>${b}</td></tr>`).join("\n ");
449
+ return `
450
+ <div class="categories">
451
+ <table>
452
+ <tbody>
453
+ ${rows}
454
+ </tbody>
455
+ </table>
456
+ </div>
457
+ `;
458
+ }
459
+ async function setupCategoriesMocks() {
460
+ const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
461
+ const store = new Map();
462
+ _extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
463
+ _extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
464
+ store.set(key, value);
465
+ });
466
+ const aiModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../runAiExtraction")));
467
+ const aiSpy = _extendedTest.vi.spyOn(aiModule, "runAiExtraction");
468
+ return {
469
+ aiSpy,
470
+ store
471
+ };
472
+ }
473
+ (0, _extendedTest.describe)("dynamicListExtractor - cache validation edge cases", () => {
474
+ (0, _extendedTest.afterEach)(() => {
475
+ _extendedTest.vi.restoreAllMocks();
476
+ });
477
+ (0, _extendedTest.test)("invalidates cache when extracted text changes", async ({
478
+ page
479
+ }) => {
480
+ const label = `validation-text-change-${(0, _uuid.v4)()}`;
481
+ const {
482
+ aiSpy
483
+ } = await setupCategoriesMocks();
484
+ aiSpy.mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")])).mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems UPDATED"), row(1, "81111809", "System installation services")]));
485
+ await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
486
+ const searchRegion = page.locator("div.categories");
487
+ const first = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
488
+ (0, _extendedTest.expect)(first.isOk()).toBe(true);
489
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
490
+ await page.evaluate(() => {
491
+ const td = document.querySelector("div.categories tbody tr:nth-child(1) td:nth-child(2)");
492
+ if (td) td.textContent = "Audioconferencing systems UPDATED";
493
+ });
494
+ const second = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
495
+ (0, _extendedTest.expect)(second.isOk()).toBe(true);
496
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
497
+ (0, _extendedTest.expect)(second._unsafeUnwrap()[0]).toEqual({
498
+ unspsc: "45111901",
499
+ unspsc_description: "Audioconferencing systems UPDATED"
500
+ });
501
+ });
502
+ (0, _extendedTest.test)("invalidates cache when a row is removed", async ({
503
+ page
504
+ }) => {
505
+ const label = `validation-row-removed-${(0, _uuid.v4)()}`;
506
+ const {
507
+ aiSpy
508
+ } = await setupCategoriesMocks();
509
+ aiSpy.mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")])).mockResolvedValueOnce(categoriesAiResult([row(0, "45111901", "Audioconferencing systems")]));
510
+ await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
511
+ const searchRegion = page.locator("div.categories");
512
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
513
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
514
+ await page.evaluate(() => {
515
+ const tr = document.querySelector("div.categories tbody tr:nth-child(2)");
516
+ tr === null || tr === void 0 || tr.remove();
517
+ });
518
+ const second = await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
519
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
520
+ (0, _extendedTest.expect)(second._unsafeUnwrap()).toHaveLength(1);
521
+ });
522
+ (0, _extendedTest.test)("reuses cache for a clean exact-match list", async ({
523
+ page
524
+ }) => {
525
+ const label = `validation-exact-reuse-${(0, _uuid.v4)()}`;
526
+ const {
527
+ aiSpy
528
+ } = await setupCategoriesMocks();
529
+ aiSpy.mockResolvedValue(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")]));
530
+ await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
531
+ const searchRegion = page.locator("div.categories");
532
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
533
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
534
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
535
+ });
536
+ (0, _extendedTest.test)("reuses cache when DOM text has irregular internal whitespace", async ({
537
+ page
538
+ }) => {
539
+ const label = `validation-whitespace-${(0, _uuid.v4)()}`;
540
+ const {
541
+ aiSpy
542
+ } = await setupCategoriesMocks();
543
+ aiSpy.mockResolvedValue(categoriesAiResult([row(0, "45111901", "Audioconferencing systems"), row(1, "81111809", "System installation services")]));
544
+ await page.setContent(tableHtml([["45111901", "Audioconferencing systems"], ["81111809", "System installation services"]]));
545
+ const searchRegion = page.locator("div.categories");
546
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
547
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
548
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
549
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
550
+ });
551
+ (0, _extendedTest.test)("reuses cache when match was partial (full source text matches)", async ({
552
+ page
553
+ }) => {
554
+ const label = `validation-partial-${(0, _uuid.v4)()}`;
555
+ const {
556
+ aiSpy
557
+ } = await setupCategoriesMocks();
558
+ aiSpy.mockResolvedValue(categoriesAiResult([row(0, "45111901", "Audioconferencing systems", {
559
+ description: "Audioconferencing systems and hardware controllers"
560
+ }), row(1, "81111809", "System installation services", {
561
+ description: "System installation services - admin"
562
+ })]));
563
+ await page.setContent(tableHtml([["45111901", "Audioconferencing systems and hardware controllers"], ["81111809", "System installation services - admin"]]));
564
+ const searchRegion = page.locator("div.categories");
565
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
566
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
567
+ await (0, _dynamicListExtractor.dynamicListExtractor)(page, label, makeCategoriesOptions(label, searchRegion));
568
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
569
+ });
570
+ });
571
+ (0, _extendedTest.describe)("dynamicListExtractor - single-item container round-trip (real matcher, mocked AI)", () => {
572
+ (0, _extendedTest.afterEach)(() => {
573
+ _extendedTest.vi.restoreAllMocks();
574
+ });
575
+ (0, _extendedTest.test)("reuses cache for a single-row list where id and name are in sibling cells", async ({
576
+ page
577
+ }) => {
578
+ const label = `single-row-${(0, _uuid.v4)()}`;
579
+ const listAiModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../utils/extractStructuredListUsingAi")));
580
+ const aiSpy = _extendedTest.vi.spyOn(listAiModule, "extractStructuredListUsingAi").mockImplementation(async (_entityName, schema) => {
581
+ const props = Object.keys((schema === null || schema === void 0 ? void 0 : schema.properties) ?? {});
582
+ return (0, _neverthrow.ok)(props.length <= 1 ? [{
583
+ vendor: "V00000908"
584
+ }] : [{
585
+ vendor: "V00000908",
586
+ vendor_name: "FLAGPOLES INC."
587
+ }]);
588
+ });
589
+ const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
590
+ const store = new Map();
591
+ _extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
592
+ _extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
593
+ store.set(key, value);
594
+ });
595
+ await page.setContent(`
596
+ <table><tbody>
597
+ <tr><td>ID: <a href="#">V00000908</a></td><td>FLAGPOLES INC.</td></tr>
598
+ </tbody></table>
599
+ `);
600
+ const options = {
601
+ label,
602
+ itemEntityName: "vendors",
603
+ itemEntitySchema: {
604
+ type: "object",
605
+ properties: {
606
+ vendor: {
607
+ type: "string",
608
+ primary: true,
609
+ description: "the vendor id"
610
+ },
611
+ vendor_name: {
612
+ type: "string",
613
+ description: "the vendor name"
614
+ }
615
+ },
616
+ required: ["vendor", "vendor_name"]
617
+ },
618
+ strategy: {
619
+ model: "claude-sonnet-4-20250514",
620
+ type: "HTML"
621
+ },
622
+ variantKey: label
623
+ };
624
+ const first = await (0, _.extractArrayFromPage)(page, options);
625
+ (0, _extendedTest.expect)(first).toEqual([{
626
+ vendor: "V00000908",
627
+ vendor_name: "FLAGPOLES INC."
628
+ }]);
629
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
630
+ const second = await (0, _.extractArrayFromPage)(page, options);
631
+ (0, _extendedTest.expect)(second).toEqual(first);
632
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
633
+ });
269
634
  });
@@ -2,7 +2,9 @@
2
2
 
3
3
  var _extendedTest = require("../../../common/extendedTest");
4
4
  var _ = require("../..");
5
+ var _neverthrow = require("neverthrow");
5
6
  var _uuid = require("uuid");
7
+ function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
6
8
  const productListTemplate = `
7
9
  <div class="products-container">
8
10
  <div class="product-item">
@@ -143,4 +145,45 @@ _extendedTest.describe.skip("Array Extractor Caching Tests", () => {
143
145
  console.log("All cache behavior tests completed successfully!");
144
146
  });
145
147
  });
148
+ });
149
+ (0, _extendedTest.describe)("extractArrayFromLocator - option forwarding", () => {
150
+ (0, _extendedTest.afterEach)(() => {
151
+ _extendedTest.vi.restoreAllMocks();
152
+ });
153
+ (0, _extendedTest.test)("forwards prompt, apiKey and variantKey to dynamicListExtractor", async ({
154
+ page
155
+ }) => {
156
+ const dleModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../dynamicListExtractor")));
157
+ const spy = _extendedTest.vi.spyOn(dleModule, "dynamicListExtractor").mockResolvedValue((0, _neverthrow.ok)([]));
158
+ await page.setContent(`<ul id="list"><li>item one</li></ul>`);
159
+ await (0, _.extractArrayFromLocator)(page.locator("#list"), {
160
+ label: "items",
161
+ itemEntityName: "items",
162
+ itemEntitySchema: {
163
+ type: "object",
164
+ properties: {
165
+ name: {
166
+ type: "string",
167
+ primary: true,
168
+ description: "the item name"
169
+ }
170
+ },
171
+ required: ["name"]
172
+ },
173
+ strategy: {
174
+ model: "claude-sonnet-4-20250514",
175
+ type: "HTML"
176
+ },
177
+ prompt: "MY_UNIQUE_PROMPT",
178
+ apiKey: "MY_API_KEY",
179
+ variantKey: "MY_VARIANT"
180
+ });
181
+ (0, _extendedTest.expect)(spy).toHaveBeenCalledTimes(1);
182
+ const passedOptions = spy.mock.calls[0][2];
183
+ (0, _extendedTest.expect)(passedOptions).toMatchObject({
184
+ prompt: "MY_UNIQUE_PROMPT",
185
+ apiKey: "MY_API_KEY",
186
+ variantKey: "MY_VARIANT"
187
+ });
188
+ });
146
189
  });
@@ -59,7 +59,7 @@ async function dynamicListExtractor(page, identifier, options) {
59
59
  if (cachedResult.exceedsLimit) {
60
60
  _Logger.logger.warn(`Cache key ${extractorInputHash} exceeds cache limit and is not cacheable`);
61
61
  } else {
62
- const isValid = await (0, _xpathMapping.validateXPathMapping)(page, cachedResult.matchesMapping, cachedResult.containerPath);
62
+ const isValid = await (0, _xpathMapping.validateXPathMapping)(page, cachedResult.matchesMapping, cachedResult.fullContainerXpath);
63
63
  if (isValid) {
64
64
  const nonRelatedChildrenCount = cachedResult.nonRelatedChildrenCount;
65
65
  const currentChildrenCount = await page.evaluate(fullContainerXpath => {
@@ -127,7 +127,7 @@ async function dynamicListExtractor(page, identifier, options) {
127
127
  return (0, _neverthrow.ok)(resultsToReturn);
128
128
  }
129
129
  function buildXpathsMapping(results) {
130
- const containerXpath = results.containerPath;
130
+ const containerXpath = results.fullContainerXpath;
131
131
  const xpathsMapping = {};
132
132
  for (const result of results.resultValues) {
133
133
  for (const [_key, valueObj] of Object.entries(result.result)) {
@@ -137,7 +137,8 @@ function buildXpathsMapping(results) {
137
137
  const relativePath = matchedXpath.replace(containerXpath + "/", "");
138
138
  const xpathEntry = {
139
139
  xpath: relativePath,
140
- matchType: value.matchType
140
+ matchType: value.matchType,
141
+ sourceText: value.sourceText
141
142
  };
142
143
  if (!xpathsMapping[value.matchText]) {
143
144
  xpathsMapping[value.matchText] = [];