@intuned/browser-dev 0.1.15-dev.1 → 0.1.16-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/.prettierrc +3 -0
  2. package/dist/ai/tests/extractStructuredDataDomMatchingCache.spec.js +87 -0
  3. package/dist/ai/tests/testMatching.spec.js +38 -0
  4. package/dist/ai/tests/testValidateMatchesMapping.spec.js +58 -0
  5. package/dist/common/matching/matching.js +3 -3
  6. package/dist/common/xpathMapping.js +23 -10
  7. package/dist/helpers/downloadFile.js +3 -0
  8. package/dist/helpers/saveFileToS3.js +3 -0
  9. package/dist/helpers/tests/testDownloadFile.spec.js +16 -0
  10. package/dist/optimized-extractors/common/aiModelsValidations.js +2 -21
  11. package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +2 -3
  12. package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +1 -4
  13. package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +1 -2
  14. package/dist/optimized-extractors/common/findTableHeaders.js +2 -2
  15. package/dist/optimized-extractors/common/index.js +4 -4
  16. package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +2 -2
  17. package/dist/optimized-extractors/common/matching/utils.js +4 -2
  18. package/dist/optimized-extractors/common/modelStringSupport.test.js +82 -0
  19. package/dist/optimized-extractors/export.d.ts +2 -50
  20. package/dist/optimized-extractors/extractArray.js +3 -1
  21. package/dist/optimized-extractors/index.d.ts +2 -50
  22. package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +366 -1
  23. package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +43 -0
  24. package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +4 -3
  25. package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +32 -23
  26. package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +2 -2
  27. package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +2 -1
  28. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +208 -0
  29. package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +4 -2
  30. package/dist/optimized-extractors/validators.js +5 -6
  31. package/package.json +1 -1
  32. package/dist/optimized-extractors/types/aiModelsValidation.js +0 -45
package/.prettierrc ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "trailingComma": "es5"
3
+ }
@@ -0,0 +1,87 @@
1
+ "use strict";
2
+
3
+ var _extendedTest = require("../../common/extendedTest");
4
+ var _ = require("..");
5
+ var _neverthrow = require("neverthrow");
6
+ function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
7
+ const DATA_SCHEMA = {
8
+ type: "object",
9
+ required: ["phone"],
10
+ properties: {
11
+ name: {
12
+ type: "string",
13
+ description: "the contact name"
14
+ },
15
+ phone: {
16
+ type: "string",
17
+ description: "the contact phone"
18
+ }
19
+ }
20
+ };
21
+ const CONTACT_HTML = `<div class="contact"><p>Andreas Jansson</p><p>PH: (727) 471-4768</p></div>`;
22
+ function makeOptions(page) {
23
+ return {
24
+ source: page,
25
+ dataSchema: DATA_SCHEMA,
26
+ strategy: "HTML",
27
+ enableDomMatching: true,
28
+ enableCache: true,
29
+ model: "claude-sonnet-4-20250514"
30
+ };
31
+ }
32
+ async function mockExtractStructuredDataUsingAi(result) {
33
+ const aiModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../extractStructuredDataUsingAi")));
34
+ return _extendedTest.vi.spyOn(aiModule, "extractStructuredDataUsingAi").mockResolvedValue((0, _neverthrow.ok)({
35
+ result,
36
+ xpathMapping: {}
37
+ }));
38
+ }
39
+ async function useInMemoryCache() {
40
+ const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../intunedServices/cache/cache")));
41
+ const store = new Map();
42
+ _extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
43
+ _extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
44
+ store.set(key, value);
45
+ });
46
+ }
47
+ (0, _extendedTest.describe)("extractStructuredData - DOM matching cache (enableDomMatching: true)", () => {
48
+ (0, _extendedTest.afterEach)(() => {
49
+ _extendedTest.vi.restoreAllMocks();
50
+ });
51
+ (0, _extendedTest.test)("reuses the cache on a second call, even when a value is a partial match", async ({
52
+ page
53
+ }) => {
54
+ const aiResult = {
55
+ name: "Andreas Jansson",
56
+ phone: "(727) 471-4768"
57
+ };
58
+ const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
59
+ await useInMemoryCache();
60
+ await page.setContent(CONTACT_HTML);
61
+ const first = await (0, _.extractStructuredData)(makeOptions(page));
62
+ (0, _extendedTest.expect)(first).toEqual(aiResult);
63
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
64
+ const second = await (0, _.extractStructuredData)(makeOptions(page));
65
+ (0, _extendedTest.expect)(second).toEqual(first);
66
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
67
+ });
68
+ (0, _extendedTest.test)("re-extracts when an extracted value's DOM text changes", async ({
69
+ page
70
+ }) => {
71
+ const aiResult = {
72
+ name: "Andreas Jansson",
73
+ phone: "(727) 471-4768"
74
+ };
75
+ const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
76
+ await useInMemoryCache();
77
+ await page.setContent(CONTACT_HTML);
78
+ await (0, _.extractStructuredData)(makeOptions(page));
79
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
80
+ await page.evaluate(() => {
81
+ const ps = document.querySelectorAll("div.contact p");
82
+ if (ps[1]) ps[1].textContent = "PH: (999) 999-9999";
83
+ });
84
+ await (0, _.extractStructuredData)(makeOptions(page));
85
+ (0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(2);
86
+ });
87
+ });
@@ -260,6 +260,44 @@ var _matching = require("../../common/matching/matching");
260
260
  });
261
261
  (0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchText).toBe("Hello");
262
262
  });
263
+ (0, _extendedTest.test)("should prefer a full match over a partial match", async () => {
264
+ const matches = [{
265
+ matched_value: "Hello",
266
+ match_mode: "partial",
267
+ fuzzy_distance: 0,
268
+ match_source: "text_content",
269
+ xpath: "/html/body/div[1]"
270
+ }, {
271
+ matched_value: "Hello World",
272
+ match_mode: "full",
273
+ fuzzy_distance: 0,
274
+ match_source: "text_content",
275
+ xpath: "/html/body/div[2]"
276
+ }];
277
+ const result = (0, _matching.selectBestMatch)({
278
+ original: "Hello World",
279
+ matches
280
+ });
281
+ (0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchText).toBe("Hello World");
282
+ (0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchXpath).toBe("/html/body/div[2]");
283
+ });
284
+ (0, _extendedTest.test)("should preserve the attribute name for attribute matches", async () => {
285
+ const matches = [{
286
+ matched_value: "https://example.com/x.pdf",
287
+ match_mode: "full",
288
+ fuzzy_distance: 0,
289
+ match_source: "attribute",
290
+ attribute: "href",
291
+ xpath: "/html/body/a[1]"
292
+ }];
293
+ const result = (0, _matching.selectBestMatch)({
294
+ original: "https://example.com/x.pdf",
295
+ matches
296
+ });
297
+ (0, _extendedTest.expect)(result === null || result === void 0 ? void 0 : result.matchType).toEqual({
298
+ attribute: "href"
299
+ });
300
+ });
263
301
  });
264
302
  (0, _extendedTest.describe)("matchStringsWithDomContent", () => {
265
303
  let browser;
@@ -262,4 +262,62 @@ const IFRAME_WITH_EXTRA_CONTENT = `
262
262
  (0, _extendedTest.expect)(isValid2).toBe(true);
263
263
  (0, _extendedTest.expect)(isValid3).toBe(true);
264
264
  });
265
+ (0, _extendedTest.test)("PARTIAL match (value is a substring of the element text, PH: prefix) stays valid", async () => {
266
+ await page.setContent(`<div class="contact"><p>PH: (727) 471-4768</p></div>`);
267
+ const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
268
+ phone: "(727) 471-4768"
269
+ });
270
+ (0, _extendedTest.expect)(cachedMapping["(727) 471-4768"].length).toBeGreaterThan(0);
271
+ (0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
272
+ });
273
+ (0, _extendedTest.test)("value sitting among sibling direct text nodes (br-separated block) stays valid", async () => {
274
+ await page.setContent(`
275
+ <table><tbody><tr><td class="vendor">
276
+ Acme Corp<br/>
277
+ Email: sales@acme.com<br/>
278
+ FAX: (630) 904-4118
279
+ </td></tr></tbody></table>
280
+ `);
281
+ const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
282
+ phone: "(630) 904-4118",
283
+ email: "sales@acme.com"
284
+ });
285
+ (0, _extendedTest.expect)(cachedMapping["(630) 904-4118"].length).toBeGreaterThan(0);
286
+ (0, _extendedTest.expect)(cachedMapping["sales@acme.com"].length).toBeGreaterThan(0);
287
+ (0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
288
+ });
289
+ (0, _extendedTest.test)("href / attribute match (relative href resolved to absolute) stays valid", async () => {
290
+ await page.setContent(`<!DOCTYPE html><html><head><base href="https://example.com/files/"></head><body>` + `<div class="doc"><a href="annual-report.pdf">Annual Report</a></div></body></html>`);
291
+ const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
292
+ href: "annual-report.pdf",
293
+ name: "Annual Report"
294
+ });
295
+ (0, _extendedTest.expect)(cachedMapping["annual-report.pdf"].length).toBeGreaterThan(0);
296
+ (0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
297
+ });
298
+ (0, _extendedTest.test)("FUZZY match (extracted value has a small typo) stays valid", async () => {
299
+ await page.setContent(`<div class="vendor"><span class="name">International Cleaning Services</span></div>`);
300
+ const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
301
+ name: "Internationl Cleaning Servces"
302
+ });
303
+ (0, _extendedTest.expect)(cachedMapping["Internationl Cleaning Servces"].length).toBeGreaterThan(0);
304
+ (0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
305
+ });
306
+ (0, _extendedTest.test)("irregular internal whitespace stays valid", async () => {
307
+ await page.setContent(`<div class="vendor"><span>Audioconferencing systems</span></div>`);
308
+ const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
309
+ category: "Audioconferencing systems"
310
+ });
311
+ (0, _extendedTest.expect)(cachedMapping["Audioconferencing systems"].length).toBeGreaterThan(0);
312
+ (0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(true);
313
+ });
314
+ (0, _extendedTest.test)("invalidates when a partial-matched value's text changes", async () => {
315
+ await page.setContent(`<div class="contact"><p>PH: (727) 471-4768</p></div>`);
316
+ const cachedMapping = await (0, _matching.createMatchesMapping)(page, {
317
+ phone: "(727) 471-4768"
318
+ });
319
+ (0, _extendedTest.expect)(cachedMapping["(727) 471-4768"].length).toBeGreaterThan(0);
320
+ await page.setContent(`<div class="contact"><p>PH: (999) 999-9999</p></div>`);
321
+ (0, _extendedTest.expect)(await (0, _matching.validateMatchesMapping)(page, cachedMapping)).toBe(false);
322
+ });
265
323
  });
@@ -122,12 +122,12 @@ function selectBestMatch({
122
122
  }) {
123
123
  const exactMatches = matches.filter(match => match.match_mode !== "fuzzy");
124
124
  if (exactMatches.length > 0) {
125
- const bestMatch = exactMatches[0];
125
+ const bestMatch = exactMatches.find(match => match.match_mode === "full") ?? exactMatches[0];
126
126
  return {
127
127
  matchText: bestMatch.matched_value,
128
128
  matchXpath: bestMatch.xpath,
129
129
  matchType: bestMatch.match_source === "direct_text_node" ? "direct-text" : bestMatch.match_source === "attribute" ? {
130
- attribute: "unknown"
130
+ attribute: bestMatch.attribute ?? "unknown"
131
131
  } : "all-text"
132
132
  };
133
133
  }
@@ -148,7 +148,7 @@ function selectBestMatch({
148
148
  matchText: bestMatch.matched_value,
149
149
  matchXpath: bestMatch.xpath,
150
150
  matchType: bestMatch.match_source === "direct_text_node" ? "direct-text" : bestMatch.match_source === "attribute" ? {
151
- attribute: "unknown"
151
+ attribute: bestMatch.attribute ?? "unknown"
152
152
  } : "all-text"
153
153
  };
154
154
  }
@@ -6,13 +6,15 @@ Object.defineProperty(exports, "__esModule", {
6
6
  exports.createXPathMapping = createXPathMapping;
7
7
  exports.validateXPathMapping = validateXPathMapping;
8
8
  var _ensureBrowserScripts = require("./ensureBrowserScripts");
9
+ var _utils = require("../optimized-extractors/common/matching/utils");
9
10
  async function validateXPathMapping(page, cachedMapping, prefix) {
10
11
  try {
11
12
  for (const [expectedText, entries] of Object.entries(cachedMapping)) {
12
13
  for (const entry of entries) {
13
14
  const {
14
15
  xpath,
15
- matchType
16
+ matchType,
17
+ sourceText
16
18
  } = entry;
17
19
  const elementExists = await page.evaluate(({
18
20
  xpath,
@@ -27,34 +29,45 @@ async function validateXPathMapping(page, cachedMapping, prefix) {
27
29
  if (!elementExists) {
28
30
  return false;
29
31
  }
30
- const actualText = await page.evaluate(({
32
+ const actualTexts = await page.evaluate(({
31
33
  xpath,
32
34
  prefix,
33
35
  matchType
34
36
  }) => {
35
37
  const element = document.evaluate(prefix ? `${prefix}/${xpath}` : xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
36
- if (!element) return "";
38
+ if (!element) return [];
37
39
  if (matchType === "direct-text") {
38
- let directText = "";
40
+ const nodes = [];
39
41
  for (const child of element.childNodes) {
40
42
  if (child.nodeType === Node.TEXT_NODE) {
41
- directText += child.textContent || "";
43
+ const t = (child.textContent || "").trim();
44
+ if (t) nodes.push(t);
42
45
  }
43
46
  }
44
- return directText.trim();
47
+ return nodes;
45
48
  } else if (matchType === "all-text") {
46
49
  var _element$textContent;
47
- return ((_element$textContent = element.textContent) === null || _element$textContent === void 0 ? void 0 : _element$textContent.trim()) || "";
50
+ return [((_element$textContent = element.textContent) === null || _element$textContent === void 0 ? void 0 : _element$textContent.trim()) || ""];
48
51
  } else if (typeof matchType === "object" && matchType.attribute) {
49
- return element.getAttribute(matchType.attribute) || "";
52
+ const attr = matchType.attribute;
53
+ const raw = element.getAttribute(attr) || "";
54
+ const candidates = [raw];
55
+ const reflected = element[attr];
56
+ if (typeof reflected === "string" && reflected) {
57
+ candidates.push(reflected);
58
+ }
59
+ return candidates;
50
60
  }
51
- return "";
61
+ return [];
52
62
  }, {
53
63
  xpath,
54
64
  prefix,
55
65
  matchType
56
66
  });
57
- if (actualText !== expectedText) {
67
+ const expectedFull = sourceText ?? expectedText;
68
+ const normExpected = (0, _utils.normalizeSpacing)(expectedFull);
69
+ const matched = actualTexts.some(t => (0, _utils.normalizeSpacing)(t) === normExpected);
70
+ if (!matched) {
58
71
  return false;
59
72
  }
60
73
  }
@@ -38,6 +38,9 @@ const downloadFile = async input => {
38
38
  trigger,
39
39
  timeoutInMs
40
40
  } = input;
41
+ if (!trigger) {
42
+ throw new Error("trigger is required");
43
+ }
41
44
  let pageToDownloadFrom = page;
42
45
  let shouldClosePage = false;
43
46
  let downloadPromise;
@@ -30,6 +30,9 @@ const saveFileToS3 = async input => {
30
30
  if (!page || typeof page.goto !== "function") {
31
31
  throw new Error("page must be a playwright Page object");
32
32
  }
33
+ if (!trigger) {
34
+ throw new Error("trigger is required");
35
+ }
33
36
  const download = await (0, _downloadFile.downloadFile)({
34
37
  page,
35
38
  trigger,
@@ -279,6 +279,22 @@ _extendedTest.describe.skip("TestNotInGeneration", () => {
279
279
  (0, _extendedTest.expect)(cancelled).toContain("canceled");
280
280
  }
281
281
  });
282
+ (0, _extendedTest.test)("should throw when trigger is missing (wrong param names)", async () => {
283
+ await page.setContent(content);
284
+ await (0, _extendedTest.expect)((0, _.downloadFile)({
285
+ page,
286
+ url: "https://example.com/file.pdf",
287
+ fileName: "test.pdf"
288
+ })).rejects.toThrow("trigger is required");
289
+ });
290
+ (0, _extendedTest.test)("saveFileToS3 should throw when trigger is missing (wrong param names)", async () => {
291
+ await page.setContent(content);
292
+ await (0, _extendedTest.expect)((0, _.saveFileToS3)({
293
+ page,
294
+ url: "https://example.com/file.pdf",
295
+ fileName: "test.pdf"
296
+ })).rejects.toThrow("trigger is required");
297
+ });
282
298
  });
283
299
  const SIGNED_URL = "https://bucket.r2.cloudflarestorage.com/img.jpg?X-Amz-Signature=abc";
284
300
  (0, _extendedTest.describe)("SignedUrlAttachmentInTypedArray", () => {
@@ -3,30 +3,11 @@
3
3
  Object.defineProperty(exports, "__esModule", {
4
4
  value: true
5
5
  });
6
- exports.SUPPPORTED_GPT_MODELS = exports.SUPPPORTED_CLAUDE_MODELS = exports.SUPPORTED_MODELS = exports.MODELS_MAPPINGS = exports.MAX_TOKENS_OVERRIDES = exports.GPT_MODELS = exports.GOOGLE_MODELS = exports.CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_MODELS_MAPPINGS = exports.CLAUDE_MODELS = void 0;
7
- const CLAUDE_MODELS = exports.CLAUDE_MODELS = ["claude-opus-4-20250514", "claude-sonnet-4-20250514", "claude-3-7-sonnet-20250219", "claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-5-haiku-20241022", "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"];
6
+ exports.MAX_TOKENS_OVERRIDES = void 0;
8
7
  const MAX_TOKENS_OVERRIDES = exports.MAX_TOKENS_OVERRIDES = {
9
8
  "claude-3-5-sonnet-20240620": 8192,
10
9
  "gemini-1.5-pro-002": 8192,
11
10
  "gemini-1.5-flash-8b-002": 8192,
12
11
  "gemini-1.5-flash-002": 8192,
13
12
  "gemini-2.0-flash-exp": 8192
14
- };
15
- const CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_VISION_SUPPORTED_MODELS = [...CLAUDE_MODELS];
16
- const CLAUDE_MODELS_MAPPINGS = exports.CLAUDE_MODELS_MAPPINGS = {
17
- "claude-3-haiku": "claude-3-haiku-20240307",
18
- "claude-3-5-haiku": "claude-3-5-haiku-20241022",
19
- "claude-3-opus": "claude-3-opus-20240229",
20
- "claude-3-sonnet": "claude-3-sonnet-20240229",
21
- "claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
22
- "claude-4-sonnet": "claude-sonnet-4-20250514",
23
- "claude-4-opus": "claude-opus-4-20250514"
24
- };
25
- const GPT_MODELS = exports.GPT_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-4o", "gpt-4o-mini", "gpt-4o-audio-preview", "gpt-4-turbo", "gpt-4", "gpt-3.5-turbo", "o1", "o1-mini", "o1-preview", "o3-mini", "o3", "o4-mini", "chatgpt-4o-latest", "gpt4-turbo"];
26
- const GOOGLE_MODELS = exports.GOOGLE_MODELS = ["gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite", "gemini-2.5-flash-lite-preview-06-17", "gemini-2.0-flash", "gemini-1.5-pro", "gemini-1.5-pro-latest", "gemini-1.5-flash", "gemini-1.5-flash-latest", "gemini-1.5-flash-8b", "gemini-1.5-flash-8b-latest"];
27
- const MODELS_MAPPINGS = exports.MODELS_MAPPINGS = {
28
- ...CLAUDE_MODELS_MAPPINGS
29
- };
30
- const SUPPPORTED_CLAUDE_MODELS = exports.SUPPPORTED_CLAUDE_MODELS = ["claude-3-5-haiku-20241022", "claude-3-5-haiku-latest", "claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022", "claude-3-5-sonnet-latest", "claude-3-7-sonnet-20250219", "claude-3-7-sonnet-latest", "claude-3-haiku-20240307", "claude-4-opus-20250514", "claude-4-sonnet-20250514", "claude-opus-4-1", "claude-opus-4-1-20250805", "claude-opus-4-20250514", "claude-sonnet-4-20250514"];
31
- const SUPPPORTED_GPT_MODELS = exports.SUPPPORTED_GPT_MODELS = ["gpt-3.5-turbo", "gpt-3.5-turbo-0125", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-instruct", "gpt-3.5-turbo-instruct-0914", "gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613", "gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4.1", "gpt-4.1-2025-04-14", "gpt-4.1-mini", "gpt-4.1-mini-2025-04-14", "gpt-4.1-nano", "gpt-4.1-nano-2025-04-14", "gpt-4o", "gpt-4o-2024-05-13", "gpt-4o-2024-08-06", "gpt-4o-2024-11-20", "gpt-4o-mini", "gpt-4o-mini-2024-07-18", "gpt-5", "gpt-5-2025-08-07", "gpt-5-chat", "gpt-5-chat-latest", "gpt-5-mini", "gpt-5-mini-2025-08-07", "gpt-5-nano", "gpt-5-nano-2025-08-07", "o1", "o1-2024-12-17", "o1-mini", "o1-mini-2024-09-12", "o1-pro", "o1-pro-2025-03-19", "o3", "o3-2025-04-16", "o3-deep-research", "o3-deep-research-2025-06-26", "o3-mini", "o3-mini-2025-01-31", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-2025-04-16", "o4-mini-deep-research", "o4-mini-deep-research-2025-06-26"];
32
- const SUPPORTED_MODELS = exports.SUPPORTED_MODELS = [...SUPPPORTED_CLAUDE_MODELS, ...SUPPPORTED_GPT_MODELS];
13
+ };
@@ -56,8 +56,7 @@ async function extractStructuredDataUsingClaude(input) {
56
56
  const anthropic = (0, _anthropicModel.createAnthropicInstance)({
57
57
  apiKey
58
58
  });
59
- const modelName = _aiModelsValidations.CLAUDE_MODELS_MAPPINGS[model] ?? model;
60
- const maxTokens = _aiModelsValidations.MAX_TOKENS_OVERRIDES[modelName] ?? 4096;
59
+ const maxTokens = _aiModelsValidations.MAX_TOKENS_OVERRIDES[model] ?? 4096;
61
60
  const response = await (0, _neverthrow.fromPromise)(anthropic.messages.create({
62
61
  max_tokens: maxTokens,
63
62
  temperature: 0,
@@ -66,7 +65,7 @@ async function extractStructuredDataUsingClaude(input) {
66
65
  role: "user",
67
66
  content
68
67
  }],
69
- model: modelName,
68
+ model,
70
69
  tools: [{
71
70
  input_schema: processedJsonSchema,
72
71
  name: toolName,
@@ -15,10 +15,7 @@ async function extractStructuredDataUsingGoogle(input) {
15
15
  if (!input.apiKey) {
16
16
  return (0, _neverthrow.err)(Errors.invalidInput("Google AI is only supported with a custom API key. Please provide it or use a different AI provider."));
17
17
  }
18
- let model = input.model;
19
- if (input.model in _aiModelsValidations.MODELS_MAPPINGS) {
20
- model = _aiModelsValidations.MODELS_MAPPINGS[input.model];
21
- }
18
+ const model = input.model;
22
19
  const googleGenAi = (0, _google.createGoogleGenerativeAI)({
23
20
  apiKey: input.apiKey
24
21
  });
@@ -8,7 +8,6 @@ var _neverthrow = require("neverthrow");
8
8
  var Errors = _interopRequireWildcard(require("../types/errors"));
9
9
  var _utils = require("./utils");
10
10
  var _Logger = require("../../common/Logger");
11
- var _aiModelsValidations = require("../common/aiModelsValidations");
12
11
  var _openaiModel = require("../models/openaiModel");
13
12
  function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
14
13
  async function extractStructuredDataUsingOpenAi(input) {
@@ -50,7 +49,7 @@ async function extractStructuredDataUsingOpenAi(input) {
50
49
  }));
51
50
  content.push(...imageContent);
52
51
  }
53
- const modelName = _aiModelsValidations.MODELS_MAPPINGS[input.model] ?? input.model;
52
+ const modelName = input.model;
54
53
  const toolName = `extract_${entityName}`;
55
54
  const openAiInstance = (0, _openaiModel.createOpenAIInstance)({
56
55
  apiKey
@@ -10,7 +10,7 @@ var _imageSize = require("image-size");
10
10
  var _neverthrow = require("neverthrow");
11
11
  var Errors = _interopRequireWildcard(require("../types/errors"));
12
12
  function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
13
- async function getTableHeadersUsingAi(handle, identifier) {
13
+ async function getTableHeadersUsingAi(handle, model) {
14
14
  var _response$error;
15
15
  let image;
16
16
  try {
@@ -57,7 +57,7 @@ async function getTableHeadersUsingAi(handle, identifier) {
57
57
  }
58
58
  }]
59
59
  }],
60
- model: "claude-3-haiku-20240307",
60
+ model,
61
61
  tools: [{
62
62
  input_schema: {
63
63
  type: "object",
@@ -12,15 +12,15 @@ var _extractStructuredDataUsingClaude = require("./extractStructuredDataUsingCla
12
12
  var _extractStructuredDataUsingOpenAi = require("./extractStructuredDataUsingOpenAi");
13
13
  var _utils = require("./utils");
14
14
  var _extractStructuredDataUsingGoogle = require("./extractStructuredDataUsingGoogle");
15
- var _aiModelsValidation = require("../types/aiModelsValidation");
15
+ var _getModelProvider = require("../../common/getModelProvider");
16
16
  function isClaudeModel(model) {
17
- return _aiModelsValidation.SUPPORTED_CLAUDE_MODELS.includes(model);
17
+ return (0, _getModelProvider.getModelProvider)(model) === "anthropic";
18
18
  }
19
19
  function isGoogleModel(model) {
20
- return _aiModelsValidation.SUPPORTED_GOOGLE_MODELS.includes(model);
20
+ return (0, _getModelProvider.getModelProvider)(model) === "google_vertexai";
21
21
  }
22
22
  function isOpenAiModel(model) {
23
- return _aiModelsValidation.SUPPORTED_GPT_MODELS.includes(model);
23
+ return (0, _getModelProvider.getModelProvider)(model) === "openai";
24
24
  }
25
25
  async function extractStructuredDataUsingAi(input) {
26
26
  let extractionResult;
@@ -9,7 +9,7 @@ var _zod = require("zod");
9
9
  var _neverthrow = require("neverthrow");
10
10
  var Errors = _interopRequireWildcard(require("../types/errors"));
11
11
  function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
12
- async function isTableHeaderOrFooter(content) {
12
+ async function isTableHeaderOrFooter(content, model) {
13
13
  var _response$error;
14
14
  if (!content) {
15
15
  return (0, _neverthrow.ok)({
@@ -39,7 +39,7 @@ async function isTableHeaderOrFooter(content) {
39
39
  a table header is a row that contains labels for table columns, and footer usually has pagination information or summary of the table`
40
40
  }, itemContent]
41
41
  }],
42
- model: "claude-3-haiku-20240307",
42
+ model,
43
43
  tools: [{
44
44
  input_schema: {
45
45
  type: "object",
@@ -102,7 +102,8 @@ function selectBestMatch(original, matches) {
102
102
  return {
103
103
  matchText: bestMatch.matchText,
104
104
  matchXpath: bestMatch.matchXpath,
105
- matchType: bestMatch.matchType
105
+ matchType: bestMatch.matchType,
106
+ sourceText: bestMatch.sourceText
106
107
  };
107
108
  }
108
109
  const fuzzyMatches = matches.filter(match => match.isFuzzy).map(match => [match, rankMatch(original, match)]).filter(([_, rank]) => rank === "HIGH");
@@ -112,7 +113,8 @@ function selectBestMatch(original, matches) {
112
113
  return {
113
114
  matchText: bestMatch.matchText,
114
115
  matchXpath: bestMatch.matchXpath,
115
- matchType: bestMatch.matchType
116
+ matchType: bestMatch.matchType,
117
+ sourceText: bestMatch.sourceText
116
118
  };
117
119
  }
118
120
  return null;
@@ -0,0 +1,82 @@
1
+ "use strict";
2
+
3
+ var _vitest = require("vitest");
4
+ var _validators = require("../validators");
5
+ var _index = require("./index");
6
+ (0, _vitest.describe)("strategy model accepts any string", () => {
7
+ (0, _vitest.it)("accepts a non-enum text model for the HTML strategy", () => {
8
+ const result = _validators.strategySchema.safeParse({
9
+ model: "gpt-4.1",
10
+ type: "HTML"
11
+ });
12
+ (0, _vitest.expect)(result.success).toBe(true);
13
+ if (result.success) {
14
+ (0, _vitest.expect)(result.data.model).toBe("gpt-4.1");
15
+ }
16
+ });
17
+ (0, _vitest.it)("accepts a non-enum vision model for the IMAGE strategy", () => {
18
+ const result = _validators.strategySchema.safeParse({
19
+ model: "gemini-2.5-pro",
20
+ type: "IMAGE"
21
+ });
22
+ (0, _vitest.expect)(result.success).toBe(true);
23
+ });
24
+ (0, _vitest.it)("accepts a brand new claude model name", () => {
25
+ const result = _validators.strategySchema.safeParse({
26
+ model: "claude-sonnet-4-5",
27
+ type: "HTML"
28
+ });
29
+ (0, _vitest.expect)(result.success).toBe(true);
30
+ });
31
+ (0, _vitest.it)("rejects an empty model", () => {
32
+ const result = _validators.strategySchema.safeParse({
33
+ model: "",
34
+ type: "HTML"
35
+ });
36
+ (0, _vitest.expect)(result.success).toBe(false);
37
+ });
38
+ (0, _vitest.it)("rejects an unknown strategy type", () => {
39
+ const result = _validators.strategySchema.safeParse({
40
+ model: "gpt-4o",
41
+ type: "FOO"
42
+ });
43
+ (0, _vitest.expect)(result.success).toBe(false);
44
+ });
45
+ (0, _vitest.it)("falls back to the default strategy when omitted", () => {
46
+ const result = _validators.strategySchema.safeParse(undefined);
47
+ (0, _vitest.expect)(result.success).toBe(true);
48
+ if (result.success) {
49
+ (0, _vitest.expect)(result.data).toEqual({
50
+ model: "claude-haiku-4-5-20251001",
51
+ type: "HTML"
52
+ });
53
+ }
54
+ });
55
+ });
56
+ (0, _vitest.describe)("provider routing handles arbitrary model strings", () => {
57
+ (0, _vitest.it)("routes OpenAI models by prefix", () => {
58
+ for (const model of ["gpt-4.1", "gpt-5", "gpt-4o", "o3", "o4-mini"]) {
59
+ (0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(true);
60
+ (0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(false);
61
+ (0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(false);
62
+ }
63
+ });
64
+ (0, _vitest.it)("routes Google models by prefix", () => {
65
+ for (const model of ["gemini-2.5-pro", "gemini-2.0-flash"]) {
66
+ (0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(true);
67
+ (0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(false);
68
+ (0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(false);
69
+ }
70
+ });
71
+ (0, _vitest.it)("routes Claude models by prefix", () => {
72
+ for (const model of ["claude-sonnet-4-5", "claude-3-7-sonnet-latest", "claude-3-haiku-20240307"]) {
73
+ (0, _vitest.expect)((0, _index.isClaudeModel)(model)).toBe(true);
74
+ (0, _vitest.expect)((0, _index.isOpenAiModel)(model)).toBe(false);
75
+ (0, _vitest.expect)((0, _index.isGoogleModel)(model)).toBe(false);
76
+ }
77
+ });
78
+ (0, _vitest.it)("no longer silently misroutes new OpenAI/Google models to Claude", () => {
79
+ (0, _vitest.expect)((0, _index.isOpenAiModel)("gpt-4.1")).toBe(true);
80
+ (0, _vitest.expect)((0, _index.isGoogleModel)("gemini-2.5-pro")).toBe(true);
81
+ });
82
+ });