@intuned/browser-dev 2.2.3-unify-sdks.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. package/.babelrc +21 -0
  2. package/.eslintignore +10 -0
  3. package/.eslintrc.js +39 -0
  4. package/LICENSE +43 -0
  5. package/dist/ai-extractors/AnthropicClient/index.js +23 -0
  6. package/dist/ai-extractors/export.d.js +5 -0
  7. package/dist/ai-extractors/export.d.ts +422 -0
  8. package/dist/ai-extractors/extractStructuredData.js +79 -0
  9. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/constants.js +7 -0
  10. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/errors.js +42 -0
  11. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingClaude.js +149 -0
  12. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingGoogle.js +37 -0
  13. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingOpenAi.js +144 -0
  14. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStrucutredDataUsingAiInstance.js +123 -0
  15. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/index.js +55 -0
  16. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/isItemTableHeaderOrFooter.js +96 -0
  17. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/screenshotHelpers.js +55 -0
  18. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/types.js +5 -0
  19. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/utils.js +53 -0
  20. package/dist/ai-extractors/extractionHelpers/types.js +5 -0
  21. package/dist/ai-extractors/fileExtractors.js +176 -0
  22. package/dist/ai-extractors/index.js +31 -0
  23. package/dist/ai-extractors/jsonSchema.d.js +5 -0
  24. package/dist/ai-extractors/jsonSchema.d.ts +49 -0
  25. package/dist/ai-extractors/openAiClients/index.js +23 -0
  26. package/dist/ai-extractors/validators.js +239 -0
  27. package/dist/browser/ai/export.d.js +3 -0
  28. package/dist/browser/ai/export.d.ts +587 -0
  29. package/dist/browser/ai/extractMarkdown.js +15 -0
  30. package/dist/browser/ai/extractStructuredData.js +231 -0
  31. package/dist/browser/ai/extractStructuredDataUsingAi.js +140 -0
  32. package/dist/browser/ai/extractionHelpers/screenshotHelpers.js +55 -0
  33. package/dist/browser/ai/extractionHelpers/validateSchema.js +148 -0
  34. package/dist/browser/ai/index.d.ts +587 -0
  35. package/dist/browser/ai/index.js +19 -0
  36. package/dist/browser/ai/isPageLoaded.js +67 -0
  37. package/dist/browser/ai/prompt.js +39 -0
  38. package/dist/browser/ai/tests/testCheckAllTypesAreStrings.spec.js +143 -0
  39. package/dist/browser/ai/tests/testExtractStructuredData.spec.js +622 -0
  40. package/dist/browser/ai/tools/index.js +48 -0
  41. package/dist/browser/ai/types/errors.js +67 -0
  42. package/dist/browser/ai/types/models.js +45 -0
  43. package/dist/browser/ai/types/types.js +48 -0
  44. package/dist/browser/ai/validators.js +136 -0
  45. package/dist/common/Logger/index.js +60 -0
  46. package/dist/common/Logger/types.js +5 -0
  47. package/dist/common/SdkError.js +50 -0
  48. package/dist/common/aiModelsValidations.js +50 -0
  49. package/dist/common/browser_scripts.js +2596 -0
  50. package/dist/common/ensureBrowserScripts.js +17 -0
  51. package/dist/common/environmentVariables.js +16 -0
  52. package/dist/common/eventTracking/getAiTrackingHeaders.js +31 -0
  53. package/dist/common/eventTracking/getFileTrackingHeaders.js +23 -0
  54. package/dist/common/extendedTest.js +148 -0
  55. package/dist/common/extractionHelpers.js +19 -0
  56. package/dist/common/formatZodError.js +18 -0
  57. package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
  58. package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
  59. package/dist/common/fuzzySearch/utils.js +23 -0
  60. package/dist/common/getModelProvider.js +18 -0
  61. package/dist/common/getSimplifiedHtml.js +122 -0
  62. package/dist/common/hashObject.js +32 -0
  63. package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
  64. package/dist/common/html2markdown/index.js +19 -0
  65. package/dist/common/jwtTokenManager.js +18 -0
  66. package/dist/common/loadRuntime.js +16 -0
  67. package/dist/common/locatorHelpers.js +41 -0
  68. package/dist/common/matching/collectStrings.js +32 -0
  69. package/dist/common/matching/levenshtein.js +40 -0
  70. package/dist/common/matching/matching.js +317 -0
  71. package/dist/common/matching/types.js +1 -0
  72. package/dist/common/noEmpty.js +9 -0
  73. package/dist/common/saveSnapshotWithExamples.js +60 -0
  74. package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
  75. package/dist/common/xpathMapping.js +107 -0
  76. package/dist/helpers/downloadFile.js +125 -0
  77. package/dist/helpers/export.d.js +1 -0
  78. package/dist/helpers/export.d.ts +1294 -0
  79. package/dist/helpers/extractMarkdown.js +35 -0
  80. package/dist/helpers/filterEmptyValues.js +54 -0
  81. package/dist/helpers/gotoUrl.js +93 -0
  82. package/dist/helpers/index.d.ts +1294 -0
  83. package/dist/helpers/index.js +115 -0
  84. package/dist/helpers/processDate.js +25 -0
  85. package/dist/helpers/resolveUrl.js +63 -0
  86. package/dist/helpers/sanitizeHtml.js +73 -0
  87. package/dist/helpers/saveFileToS3.js +46 -0
  88. package/dist/helpers/scrollToLoadContent.js +50 -0
  89. package/dist/helpers/tests/extendedTest.js +130 -0
  90. package/dist/helpers/tests/testDownloadFile.spec.js +197 -0
  91. package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
  92. package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
  93. package/dist/helpers/tests/testIsPageLoaded.spec.js +285 -0
  94. package/dist/helpers/tests/testProcessDate.spec.js +13 -0
  95. package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
  96. package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
  97. package/dist/helpers/tests/testSimplifyHtml.spec.js +251 -0
  98. package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +380 -0
  99. package/dist/helpers/tests/testWaitForDomSettled.spec.js +169 -0
  100. package/dist/helpers/tests/testWaitForNetworkIdle.spec.js +115 -0
  101. package/dist/helpers/types/Attachment.js +81 -0
  102. package/dist/helpers/types/CustomTypeRegistry.js +48 -0
  103. package/dist/helpers/types/RunEnvironment.js +18 -0
  104. package/dist/helpers/types/ValidationError.js +17 -0
  105. package/dist/helpers/types/index.js +51 -0
  106. package/dist/helpers/uploadFileToS3.js +153 -0
  107. package/dist/helpers/utils/getS3Client.js +21 -0
  108. package/dist/helpers/utils/index.js +73 -0
  109. package/dist/helpers/utils/isDownload.js +10 -0
  110. package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
  111. package/dist/helpers/utils/isLocator.js +9 -0
  112. package/dist/helpers/utils/jwtTokenManager.js +18 -0
  113. package/dist/helpers/validateDataUsingSchema.js +119 -0
  114. package/dist/helpers/waitForDomSettled.js +182 -0
  115. package/dist/helpers/waitForNetworkIdle.js +191 -0
  116. package/dist/index.d.js +82 -0
  117. package/dist/index.d.ts +11 -0
  118. package/dist/index.js +84 -0
  119. package/dist/intunedServices/ApiGateway/aiApiGateway.js +87 -0
  120. package/dist/intunedServices/ApiGateway/factory.js +13 -0
  121. package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
  122. package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
  123. package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
  124. package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +221 -0
  125. package/dist/intunedServices/ApiGateway/types.js +11 -0
  126. package/dist/intunedServices/cache/cache.js +61 -0
  127. package/dist/intunedServices/cache/index.js +12 -0
  128. package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
  129. package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
  130. package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
  131. package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +149 -0
  132. package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
  133. package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +145 -0
  134. package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
  135. package/dist/optimized-extractors/common/findTableHeaders.js +175 -0
  136. package/dist/optimized-extractors/common/index.js +55 -0
  137. package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +97 -0
  138. package/dist/optimized-extractors/common/matching/matching.js +212 -0
  139. package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
  140. package/dist/optimized-extractors/common/matching/types.js +18 -0
  141. package/dist/optimized-extractors/common/matching/utils.js +184 -0
  142. package/dist/optimized-extractors/common/utils.js +58 -0
  143. package/dist/optimized-extractors/export.d.js +5 -0
  144. package/dist/optimized-extractors/export.d.ts +397 -0
  145. package/dist/optimized-extractors/extractArray.js +120 -0
  146. package/dist/optimized-extractors/extractObject.js +104 -0
  147. package/dist/optimized-extractors/index.d.ts +397 -0
  148. package/dist/optimized-extractors/index.js +31 -0
  149. package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
  150. package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
  151. package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
  152. package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
  153. package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
  154. package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
  155. package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
  156. package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
  157. package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
  158. package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
  159. package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
  160. package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
  161. package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
  162. package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
  163. package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
  164. package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
  165. package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
  166. package/dist/optimized-extractors/models/anthropicModel.js +23 -0
  167. package/dist/optimized-extractors/models/openaiModel.js +23 -0
  168. package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
  169. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
  170. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
  171. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
  172. package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
  173. package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
  174. package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
  175. package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
  176. package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
  177. package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
  178. package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
  179. package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
  180. package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
  181. package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
  182. package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
  183. package/dist/optimized-extractors/types/errors.js +42 -0
  184. package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
  185. package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
  186. package/dist/optimized-extractors/types/types.js +5 -0
  187. package/dist/optimized-extractors/validators.js +152 -0
  188. package/dist/vite-env.d.js +1 -0
  189. package/dist/vite-env.d.ts +9 -0
  190. package/docs.md +14 -0
  191. package/how-to-run-tests.md +10 -0
  192. package/intuned-runtime-setup.md +13 -0
  193. package/package.json +124 -0
  194. package/tsconfig.eslint.json +5 -0
  195. package/tsconfig.json +26 -0
@@ -0,0 +1,231 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.extractStructuredData = void 0;
7
+ var _extractStructuredDataUsingAi = require("./extractStructuredDataUsingAi");
8
+ var _validators = require("./validators");
9
+ var _screenshotHelpers = require("./extractionHelpers/screenshotHelpers");
10
+ var _formatZodError = require("../../common/formatZodError");
11
+ var _cache = require("../../intunedServices/cache/cache");
12
+ var _locatorHelpers = require("../../common/locatorHelpers");
13
+ var _extractionHelpers = require("../../common/extractionHelpers");
14
+ var _getSimplifiedHtml = require("../../common/getSimplifiedHtml");
15
+ var _hashObject = require("../../common/hashObject");
16
+ var _Logger = require("../../common/Logger");
17
+ var _helpers = require("../../helpers");
18
+ var _xpathMapping = require("../../common/xpathMapping");
19
/**
 * Log lines written on a cache hit, keyed by strategy.
 * `domValidated` is used when the cached XPath mapping was re-validated
 * against the page; `plain` is used when DOM matching is disabled.
 */
const CACHE_HIT_LOG_MESSAGES = {
  HTML: {
    domValidated: "Cached results matched correctly with the current page, returning cached result",
    plain: "Results for the extractor found in the cache, returning cached result"
  },
  IMAGE: {
    domValidated: "Cached results matched correctly with the data in the current page.\nReturning cached result",
    plain: "Results for the extractor found in the cache.\nReturning cached result"
  },
  MARKDOWN: {
    domValidated: "Cached results matched correctly with the data in the current page.\nReturning cached result",
    plain: "Results for the extractor found in the cache.\nReturning cached result"
  }
};

/**
 * Gathers everything a strategy needs before the model is called.
 *
 * @returns {Promise<{content: string, getImages: Function, getFingerprint: Function}>}
 *   `content` is the text sent to the model, `getImages()` returns the image
 *   list (throwing if capturing them failed), and `getFingerprint()` resolves
 *   to the extra cache-key fields used when DOM matching is disabled.
 * @throws {Error} When the region has no HTML or the strategy is unsupported.
 */
const prepareExtractionSource = async ({ strategy, pageOrLocator, pageObject, isPageInput }) => {
  if (strategy === "HTML") {
    const containerHandle = isPageInput
      ? await pageOrLocator.locator("html").elementHandle()
      : await pageOrLocator.elementHandle();
    if (!containerHandle) {
      throw new Error("No HTML content found in the specified region.");
    }
    const simplifiedHtml = await _getSimplifiedHtml.getSimplifiedHtml(containerHandle);
    return {
      content: simplifiedHtml,
      getImages: () => [],
      getFingerprint: async () => ({
        html: _extractionHelpers.compressStringSpaces(simplifiedHtml)
      })
    };
  }
  if (strategy === "IMAGE") {
    const containerHandle = isPageInput ? undefined : await pageOrLocator.elementHandle();
    const images = await _screenshotHelpers.buildImagesFromPageOrHandle(pageObject, containerHandle);
    return {
      content: "Extract structured data from the following images.",
      // The failure is raised lazily so that a cache hit can still be served
      // even when the screenshots could not be taken.
      getImages: () => {
        if (images.isErr()) {
          throw new Error(images.error.context);
        }
        return images.value;
      },
      getFingerprint: async () => ({
        html: await pageObject.evaluate(() => document.documentElement.outerHTML)
      })
    };
  }
  if (strategy === "MARKDOWN") {
    const containerHandle = isPageInput
      ? await pageOrLocator.locator("html").elementHandle()
      : await pageOrLocator.elementHandle();
    const html = containerHandle ? await containerHandle.innerHTML() : undefined;
    if (!html) {
      throw new Error("No HTML content found in the specified region.");
    }
    // Convert the supplied source (page or locator) so that a locator scopes
    // the markdown the same way it scopes the HTML and IMAGE strategies.
    // NOTE(review): assumes extractMarkdown accepts a Locator as `source` — confirm.
    const markdown = await _helpers.extractMarkdown({
      source: pageOrLocator
    });
    return {
      content: markdown,
      getImages: () => [],
      getFingerprint: async () => ({
        html: await pageObject.evaluate(() => document.documentElement.outerHTML),
        markdown
      })
    };
  }
  throw new Error(`Unsupported strategy type: ${strategy}. Supported types are: HTML, IMAGE, and MARKDOWN.`);
};

/**
 * Looks up a cached extraction.
 *
 * Without DOM matching any truthy cache entry is a hit. With DOM matching the
 * entry must carry a `matchesMapping` that `validateXPathMapping` accepts for
 * the current page; the hit value is then the entry's `result`.
 *
 * @returns {Promise<{hit: boolean, value?: *, domValidated?: boolean}>}
 */
const readCachedExtraction = async ({ cacheKey, pageObject, enableDomMatching }) => {
  const cachedResult = await _cache.cache.get(cacheKey);
  if (!cachedResult) {
    return { hit: false };
  }
  if (!enableDomMatching) {
    return { hit: true, value: cachedResult, domValidated: false };
  }
  if (!cachedResult.matchesMapping) {
    return { hit: false };
  }
  const isValid = await _xpathMapping.validateXPathMapping(pageObject, cachedResult.matchesMapping);
  return isValid ? { hit: true, value: cachedResult.result, domValidated: true } : { hit: false };
};

/**
 * Extracts data matching `dataSchema` from a page or locator using an AI model.
 *
 * @param {object} options
 * @param {*} options.source - Page or locator to extract from.
 * @param {string} options.strategy - "HTML", "IMAGE" or "MARKDOWN".
 * @param {*} options.dataSchema - Schema describing the data to extract.
 * @param {*} [options.model] - Model identifier forwarded to the AI call.
 * @param {string} [options.prompt] - Extra instructions for the model.
 * @param {string} [options.apiKey] - API key forwarded to the AI call.
 * @param {boolean} [options.enableDomMatching=false] - Match extracted strings
 *   against the DOM; requires every schema field to be a string.
 * @param {number} [options.maxRetries=3] - Retry budget forwarded to the AI call.
 * @param {boolean} [options.enableCache=true] - Read/write the result cache.
 * @returns {Promise<*>} The extracted (or cached) data.
 * @throws {Error} On invalid input, unsupported strategy, empty region, image
 *   capture failure, or a failed AI extraction.
 */
const extractStructuredData = async options => {
  const pageOrLocator = options.source;
  const isPageInput = _locatorHelpers.isPage(pageOrLocator);
  const {
    model,
    strategy,
    prompt,
    apiKey,
    enableDomMatching = false,
    maxRetries = 3,
    dataSchema,
    enableCache = true
  } = options;
  const inputParsingResult = await _validators.extractDataInputJsonSchema.safeParseAsync({
    source: pageOrLocator,
    model,
    strategy,
    prompt,
    apiKey,
    enableDomMatching,
    enableCache,
    maxRetries,
    dataSchema
  });
  if (!inputParsingResult.success) {
    const errors = _formatZodError.formatZodError(inputParsingResult.error);
    throw new Error(`invalid input parameters for extractStructuredData: ${errors}`);
  }
  // From here on only the validated values are used, in every strategy.
  const validatedData = inputParsingResult.data;
  const pageObject = isPageInput ? pageOrLocator : pageOrLocator.page();
  if (validatedData.enableDomMatching && !_validators.checkAllTypesAreStrings(validatedData.dataSchema)) {
    throw new Error("For DOM matching, all types of the extraction fields must be STRINGS, to match with the DOM.");
  }

  const source = await prepareExtractionSource({
    strategy: validatedData.strategy,
    pageOrLocator,
    pageObject,
    isPageInput
  });

  let cacheKey = "";
  if (validatedData.enableCache) {
    // With DOM matching the page content is left out of the key; the cached
    // XPath mapping is validated against the live page instead.
    cacheKey = _hashObject.hashObject({
      pageUrl: pageObject.url(),
      dataSchema: validatedData.dataSchema,
      strategy: validatedData.strategy,
      model: validatedData.model,
      prompt: validatedData.prompt,
      searchRegion: !isPageInput ? pageOrLocator.toString() : undefined,
      ...(validatedData.enableDomMatching ? {} : await source.getFingerprint())
    }, true);
    const cached = await readCachedExtraction({
      cacheKey,
      pageObject,
      enableDomMatching: validatedData.enableDomMatching
    });
    if (cached.hit) {
      const messages = CACHE_HIT_LOG_MESSAGES[validatedData.strategy];
      _Logger.logger.info(cached.domValidated ? messages.domValidated : messages.plain);
      return cached.value;
    }
  }

  const images = source.getImages();
  const result = await _extractStructuredDataUsingAi.extractStructuredDataUsingAi(pageObject, {
    apiKey: validatedData.apiKey,
    enableDomMatching: validatedData.enableDomMatching,
    jsonSchema: validatedData.dataSchema,
    model: validatedData.model,
    content: source.content,
    prompt: validatedData.prompt,
    images,
    maxRetries: validatedData.maxRetries
  });
  if (result.isErr()) {
    throw new Error(result.error.context);
  }

  if (validatedData.enableCache) {
    const entry = validatedData.enableDomMatching
      ? {
          result: result.value.result,
          matchesMapping: result.value.xpathMapping || {}
        }
      : result.value.result;
    await _cache.cache.set(cacheKey, entry);
  }
  return result.value.result;
};
231
+ exports.extractStructuredData = extractStructuredData;
@@ -0,0 +1,140 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.extractStructuredDataUsingAi = extractStructuredDataUsingAi;
7
+ var _neverthrow = require("neverthrow");
8
+ var Errors = _interopRequireWildcard(require("./types/errors"));
9
+ var _getAiTrackingHeaders = require("../../common/eventTracking/getAiTrackingHeaders");
10
+ var _environmentVariables = require("../../common/environmentVariables");
11
+ var _Logger = require("../../common/Logger");
12
+ var _collectStrings = require("../../common/matching/collectStrings");
13
+ var _matching = require("../../common/matching/matching");
14
+ var _validateSchema = require("./extractionHelpers/validateSchema");
15
+ var _factory = require("../../intunedServices/ApiGateway/factory");
16
+ var _tools = require("./tools");
17
+ var _prompt = require("./prompt");
18
+ var _ai = require("ai");
19
+ var _loadRuntime = require("../../common/loadRuntime");
20
// Module-interop helper (appears to be the Babel-emitted one; kept verbatim).
// Returns `e` unchanged when `t` is falsy and `e.__esModule` is set. Otherwise it
// builds a null-prototype object with `default: e` and copies e's own keys found
// by for...in (except "default"), re-defining accessors via their descriptors.
// When WeakMap exists, results are cached in one of two WeakMaps chosen by `t`.
// The function reassigns itself on first call so the WeakMaps are created once.
+ function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
21
/**
 * Asks the model to call the `extract_data` tool and validates what it returns.
 *
 * Each loop iteration is one attempt. An attempt that yields no tool call, or
 * whose payload fails schema validation, consumes one retry; validation
 * failures are also fed back to the model as a tool error before retrying.
 *
 * @param {*} page - Page used for DOM matching of the extracted strings.
 * @param {object} input
 * @param {string} [input.apiKey] - Forwarded to the gateway's `getModel`.
 * @param {boolean} input.enableDomMatching - Replace extracted strings with
 *   their best DOM matches and return the XPath mapping.
 * @param {object} input.jsonSchema - Schema the tool payload is validated against.
 * @param {*} input.model - Model identifier forwarded to the gateway.
 * @param {string} input.content - Text content for the prompt.
 * @param {string} [input.prompt] - Extra instructions for the model.
 * @param {Array} input.images - Images for the prompt.
 * @param {number} [input.maxRetries=3] - Number of attempts; also forwarded to
 *   `generateText` as its own retry setting.
 * @returns {Promise<*>} A neverthrow result: ok `{ result, usage, xpathMapping }`
 *   where `usage` is the summed `totalTokens`, or err when an exception is
 *   thrown or the attempts are exhausted.
 */
async function extractStructuredDataUsingAi(page, input) {
  const {
    apiKey,
    enableDomMatching,
    jsonSchema,
    model,
    content,
    prompt,
    images,
    maxRetries = 3
  } = input;
  let accumulatedCost = 0;
  const getExecutionContext = await _loadRuntime.loadRuntime();
  const toolName = "extract_data";
  // Read the execution context once instead of once per header field.
  const executionContext = getExecutionContext() || {};
  const headers = _getAiTrackingHeaders.getAiTrackingHeaders({
    environment: _environmentVariables.getEnvironmentVariable("RUN_ENVIRONMENT"),
    type: "DYNAMIC_LIST",
    runId: executionContext.runId || undefined,
    jobId: executionContext.jobId || undefined,
    jobRunId: executionContext.jobRunId || undefined
  });
  const gateway = _factory.GatewayFactory.createAIGateway();
  const gatewayModel = gateway.getModel(model, apiKey);
  const tools = _tools.getTools(toolName, jsonSchema);
  // Grows with re-ask messages when a payload fails validation.
  const messagesHistory = _prompt.getMessages({
    prompt,
    content,
    images,
    enableDomMatching
  });
  let currentRetry = 0;
  while (currentRetry < maxRetries) {
    try {
      const result = await _ai.generateText({
        model: gatewayModel,
        messages: messagesHistory,
        tools: tools.isOk() ? tools.value : {},
        toolChoice: "required",
        maxRetries,
        headers
      });
      const usage = result.usage;
      accumulatedCost += (usage === null || usage === undefined ? undefined : usage.totalTokens) ?? 0;
      _Logger.logger.info(`AI extraction cost: ${accumulatedCost}`);

      const toolCall = Array.isArray(result.toolCalls) ? result.toolCalls[0] : undefined;
      if (!toolCall) {
        // Without this guard `toolCall.input` throws a TypeError that surfaces
        // as a misleading "invalid extraction result"; treat it as a failed
        // attempt and try again instead.
        _Logger.logger.info("AI response did not contain a tool call, retrying extraction");
        currentRetry++;
        continue;
      }

      let extractedData = toolCall.input;
      const isArray = jsonSchema.type === "array";
      // Array payloads may arrive wrapped in an `extracted_data` property; unwrap when present.
      if (isArray && extractedData.extracted_data) {
        extractedData = extractedData.extracted_data;
      }

      const errors = _validateSchema.validateToolCallSchema(extractedData, jsonSchema);
      if (errors.length > 0) {
        const reaskMessage = _validateSchema.createReaskMessage(errors);
        const modelMessages = _ai.convertToModelMessages([{
          role: "assistant",
          parts: [{
            type: "step-start"
          }, {
            type: "text",
            text: result.text,
            state: "done"
          }, {
            type: `tool-${toolName}`,
            state: "output-error",
            toolCallId: toolCall.toolCallId,
            input: extractedData,
            errorText: reaskMessage
          }]
        }]);
        messagesHistory.push(...modelMessages);
        currentRetry++;
        continue;
      }

      if (!enableDomMatching) {
        return _neverthrow.ok({
          result: extractedData,
          usage: accumulatedCost,
          xpathMapping: {}
        });
      }

      const stringsToMatch = _collectStrings.collectStrings({
        dataStructure: extractedData
      });
      if (!stringsToMatch || stringsToMatch.length === 0) {
        // NOTE(review): returns an empty array even for non-array schemas — confirm intent.
        return _neverthrow.ok({
          result: [],
          usage: accumulatedCost,
          xpathMapping: {}
        });
      }

      const {
        replacements,
        xpathMapping
      } = await _matching.replaceWithBestMatches({
        stringsToMatch,
        pageObject: page
      });
      const stringReplacements = {};
      for (const [key, value] of Object.entries(replacements)) {
        stringReplacements[key] = (value === null || value === undefined ? undefined : value.matchText) || null;
      }
      const matchesData = await _validateSchema.recursivelyReplaceStrings(extractedData, stringReplacements);
      return _neverthrow.ok({
        result: matchesData,
        usage: accumulatedCost,
        xpathMapping
      });
    } catch (error) {
      _Logger.logger.error("Error during AI extraction", {
        error,
        model
      });
      return _neverthrow.err(Errors.invalidExtractionResult(error instanceof Error ? error.message : "Unknown error during extraction"));
    }
  }
  return _neverthrow.err(Errors.maxRetriesExceeded(`Max retries of ${maxRetries} exceeded for extraction`));
}
@@ -0,0 +1,55 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.buildImagesFromPageOrHandle = buildImagesFromPageOrHandle;
7
+ exports.captureFullPageImagesWithOverlap = captureFullPageImagesWithOverlap;
8
+ var _neverthrow = require("neverthrow");
9
+ var errors = _interopRequireWildcard(require("../types/errors"));
10
// Module-interop helper (appears to be the Babel-emitted one; kept verbatim).
// Returns `e` unchanged when `t` is falsy and `e.__esModule` is set. Otherwise it
// builds a null-prototype object with `default: e` and copies e's own keys found
// by for...in (except "default"), re-defining accessors via their descriptors.
// When WeakMap exists, results are cached in one of two WeakMaps chosen by `t`.
// The function reassigns itself on first call so the WeakMaps are created once.
+ function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
11
/**
 * Screenshots a page top to bottom as a series of overlapping viewport slices.
 *
 * The viewport is set to 1200 x `sliceHeight`; the scroll position advances by
 * `sliceHeight - overlap` per slice, and every slice after the first is scrolled
 * back by a further `overlap` pixels.
 *
 * @param {*} page - Page exposing `evaluate`, `setViewportSize`,
 *   `waitForTimeout` and `screenshot`.
 * @param {object} [options]
 * @param {number} [options.overlap=200] - Pixels shared between slices.
 * @param {number} [options.sliceHeight=1000] - Viewport height per slice.
 * @param {number} [options.maxSlices=11] - Upper bound on captured slices.
 * @returns {Promise<Array>} Screenshot buffers in top-to-bottom order.
 * @throws {RangeError} When `sliceHeight` is not greater than `overlap`.
 */
async function captureFullPageImagesWithOverlap(page, options = {}) {
  // Defaults are applied per field so a partial options object cannot inject
  // `undefined` into the arithmetic below.
  const {
    overlap = 200,
    sliceHeight = 1000,
    maxSlices = 11
  } = options ?? {};
  const stride = sliceHeight - overlap;
  if (!(stride > 0)) {
    throw new RangeError(`sliceHeight (${sliceHeight}) must be greater than overlap (${overlap}).`);
  }

  const totalHeight = await page.evaluate(() => document.body.scrollHeight);
  const buffers = [];
  if (!(totalHeight > 0)) {
    return buffers;
  }

  // The viewport size is the same for every slice, so set it once.
  await page.setViewportSize({
    width: 1200,
    height: sliceHeight
  });

  let currentHeight = 0;
  while (currentHeight < totalHeight) {
    if (buffers.length >= maxSlices) {
      // Report how much of the page was covered, not the full page height.
      console.info(`the page is too long, only first ${currentHeight} px of the page will be captured.`);
      break;
    }
    await page.evaluate(y => window.scrollTo(0, y), currentHeight - (currentHeight > 0 ? overlap : 0));
    // Fixed pause after scrolling before the screenshot is taken.
    await page.waitForTimeout(500);
    buffers.push(await page.screenshot());
    currentHeight += stride;
  }
  return buffers;
}
35
/**
 * Produces the screenshots used for image-based extraction.
 *
 * With a search-region handle, a single PNG of that element is returned;
 * otherwise the whole page is captured as overlapping slices. The viewport is
 * switched to 1200x800 for the capture and put back to its previous size
 * afterwards on every path, including failures.
 *
 * @param {*} page - Page exposing `viewportSize` and `setViewportSize`.
 * @param {*} [searchRegionHandler] - Element handle limiting the capture.
 * @returns {Promise<*>} neverthrow ok with the array of images, or err when the
 *   search region has no bounding box.
 */
async function buildImagesFromPageOrHandle(page, searchRegionHandler) {
  const originalViewPortSize = page.viewportSize();
  await page.setViewportSize({
    width: 1200,
    height: 800
  });
  try {
    if (searchRegionHandler) {
      const size = await searchRegionHandler.boundingBox();
      if (!size) {
        // A missing bounding box means there is nothing to screenshot; the
        // previous message blamed the region's size, which is never checked.
        return _neverthrow.err(errors.other("the provided search region has no bounding box (it is not visible), so it cannot be captured."));
      }
      return _neverthrow.ok([await searchRegionHandler.screenshot({
        type: "png"
      })]);
    }
    const fullPageImages = await captureFullPageImagesWithOverlap(page);
    return _neverthrow.ok(fullPageImages);
  } finally {
    // viewportSize() may be null, in which case there is nothing to restore.
    if (originalViewPortSize) {
      await page.setViewportSize(originalViewPortSize);
    }
  }
}
@@ -0,0 +1,148 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.createReaskMessage = createReaskMessage;
7
+ exports.recursivelyReplaceStrings = recursivelyReplaceStrings;
8
+ exports.validateJSONSchema = validateJSONSchema;
9
+ exports.validateToolCallSchema = validateToolCallSchema;
10
+ var _neverthrow = require("neverthrow");
11
+ var errors = _interopRequireWildcard(require("../types/errors"));
12
+ var _ajv = _interopRequireDefault(require("ajv"));
13
+ var _ajvFormats = _interopRequireDefault(require("ajv-formats"));
14
/**
 * Normalizes a required module so its default export is reachable as `.default`.
 *
 * @param {*} e - The value returned by `require`.
 * @returns {*} `e` itself when it is truthy and has a truthy `__esModule` flag;
 *   otherwise a fresh `{ default: e }` wrapper.
 */
function _interopRequireDefault(e) {
  const isEsModule = Boolean(e) && Boolean(e.__esModule);
  if (isEsModule) {
    return e;
  }
  return {
    default: e
  };
}
15
/**
 * Module interop helper that produces a namespace-style object for a required module.
 * NOTE(review): this looks like compiler-generated (Babel-style) output — confirm against the build config before hand-editing.
 *
 * Behavior visible in the code:
 * - On the first call it creates two WeakMap caches (only when `WeakMap` exists) and then
 *   replaces itself with the inner implementation, so later calls reuse those caches.
 * - When `t` is falsy and `e.__esModule` is truthy, `e` is returned unchanged.
 * - Otherwise a null-prototype wrapper `{ default: e }` is built. If `e` is `null` or is neither
 *   an object nor a function, that wrapper is returned immediately.
 * - The wrapper is cached per `e` (the cache used depends on whether `t` is truthy) and a cached
 *   wrapper is returned on repeat calls.
 * - Every own enumerable string-keyed property of `e` except "default" is copied onto the wrapper;
 *   accessor properties are copied with their descriptor via `Object.defineProperty`, all others
 *   by plain assignment.
 */
+ function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
16
/**
 * Performs a structural sanity check of a JSON-schema-like object, recursing into
 * array `items` and object `properties`.
 *
 * @param {*} schema - Candidate schema.
 * @returns {object} `ok(schema)` when every check passes, otherwise
 *   `err(errors.invalidJsonSchema(message))` describing the first problem found.
 */
function validateJSONSchema(schema) {
  // Single place that turns a message into the module's error result.
  const reject = message => (0, _neverthrow.err)(errors.invalidJsonSchema(message));

  if (!schema || typeof schema !== "object") {
    return reject("Schema must be an object");
  }
  if (!schema.type) {
    return reject("Schema must have a 'type' property");
  }
  const supportedTypes = ["string", "number", "integer", "boolean", "array", "object"];
  if (!supportedTypes.includes(schema.type)) {
    return reject(`Invalid schema type: ${schema.type}`);
  }

  switch (schema.type) {
    case "array":
      {
        if (!schema.items) {
          return reject("Array schema must have 'items' property");
        }
        // An invalid item schema is reported as-is, without extra wrapping.
        const itemsResult = validateJSONSchema(schema.items);
        if (itemsResult.isErr()) {
          return itemsResult;
        }
        break;
      }
    case "object":
      {
        if (!schema.properties || typeof schema.properties !== "object") {
          return reject("Object schema must have 'properties' object");
        }
        for (const [propertyName, propertySchema] of Object.entries(schema.properties)) {
          const propertyResult = validateJSONSchema(propertySchema);
          if (propertyResult.isErr()) {
            // Nested failures are re-wrapped so the message names the offending property.
            return reject(`Invalid schema for property '${propertyName}': ${propertyResult.error.context}`);
          }
        }
        if (schema.required && !Array.isArray(schema.required)) {
          return reject("'required' must be an array of property names");
        }
        break;
      }
    case "number":
    case "integer":
      {
        const hasBothMaximums = schema.maximum !== undefined && schema.exclusiveMaximum !== undefined;
        if (hasBothMaximums) {
          return reject("Cannot have both 'maximum' and 'exclusiveMaximum'");
        }
        const hasBothMinimums = schema.minimum !== undefined && schema.exclusiveMinimum !== undefined;
        if (hasBothMinimums) {
          return reject("Cannot have both 'minimum' and 'exclusiveMinimum'");
        }
        if (schema.minimum > schema.maximum) {
          return reject("'minimum' cannot be greater than 'maximum'");
        }
        break;
      }
    case "string":
      {
        const hasBothLengths = schema.maxLength !== undefined && schema.minLength !== undefined;
        if (hasBothLengths && schema.minLength > schema.maxLength) {
          return reject("'minLength' cannot be greater than 'maxLength'");
        }
        break;
      }
    default:
      // "boolean" has no additional constraints to check.
      break;
  }
  return (0, _neverthrow.ok)(schema);
}
70
/**
 * Returns a copy of `dataStructure` in which every string or number leaf that has an
 * entry in `replacements` is swapped for that entry. Arrays and plain objects are
 * rebuilt recursively; the input is not mutated. Any other value is returned as-is.
 *
 * @param {*} dataStructure - Value to transform (primitive, array, or object).
 * @param {object} replacements - Lookup keyed by the string form of a leaf value.
 *   Entries whose value is `null` or `undefined` are ignored.
 * @returns {*} The transformed copy.
 */
function recursivelyReplaceStrings(dataStructure, replacements) {
  const leafType = typeof dataStructure;
  if (leafType === "string" || leafType === "number") {
    const lookupKey = String(dataStructure);
    // Only consult own entries: a plain `replacements[key]` also finds inherited members,
    // so data such as "constructor" or "toString" was being replaced by functions.
    const replacement = Object.prototype.hasOwnProperty.call(replacements, lookupKey) ? replacements[lookupKey] : undefined;
    return replacement ?? dataStructure;
  }
  if (Array.isArray(dataStructure)) {
    return dataStructure.map(item => recursivelyReplaceStrings(item, replacements));
  }
  if (dataStructure !== null && leafType === "object") {
    // Object.fromEntries defines each key as an own property, so an own "__proto__" key
    // survives the copy instead of being swallowed by the prototype setter.
    return Object.fromEntries(Object.entries(dataStructure).map(([key, value]) => [key, recursivelyReplaceStrings(value, replacements)]));
  }
  return dataStructure;
}
88
/**
 * Validates `instance` against `schema` with Ajv (all errors collected, formats enabled)
 * and converts each Ajv error into a flat, human-readable record.
 *
 * @param {*} instance - The data to validate.
 * @param {object} schema - JSON schema handed to `ajv.compile`.
 * @returns {Array<{path: string, message: string, value: *, schema_path: string}>}
 *   An empty array when the data is valid; otherwise one record per validation error:
 *   - `path`: location in the data, e.g. `root.items[1].name` (`root` for the top level)
 *   - `message`: Ajv's message, or "Validation error" when none is provided
 *   - `value`: the value found at `path` inside `instance`
 *   - `schema_path`: location of the failing keyword, e.g. `schema.properties.name.type`
 */
function validateToolCallSchema(instance, schema) {
  const ajv = new _ajv.default({
    allErrors: true,
    verbose: true
  });
  (0, _ajvFormats.default)(ajv);
  const validate = ajv.compile(schema);
  if (validate(instance)) {
    return [];
  }
  return (validate.errors ?? []).map(error => {
    const tokens = parseInstancePointerTokens(error.instancePath);
    return {
      path: formatInstancePathTokens(tokens),
      message: error.message || "Validation error",
      value: resolveInstancePathTokens(instance, tokens),
      schema_path: formatSchemaPointer(error.schemaPath)
    };
  });
}

/** Splits a JSON Pointer ("/a/0/b") into decoded reference tokens; "" or undefined yields []. */
function parseInstancePointerTokens(pointer) {
  if (!pointer) {
    return [];
  }
  // "~1" must be decoded before "~0" so that "~01" ends up as "~1", not "/".
  return pointer.slice(1).split("/").map(token => token.replace(/~1/g, "/").replace(/~0/g, "~"));
}

/** Renders tokens as "root.a[0].b": all-digit tokens become index brackets. */
function formatInstancePathTokens(tokens) {
  return tokens.reduce((path, token) => /^\d+$/.test(token) ? `${path}[${token}]` : `${path}.${token}`, "root");
}

/** Walks `root` along `tokens`, returning undefined as soon as a step hits null/undefined. */
function resolveInstancePathTokens(root, tokens) {
  let current = root;
  for (const token of tokens) {
    // The previous check here was inverted (`!value`), so the walk never descended
    // and every error reported the whole root instance as its value.
    if (current === null || current === undefined) {
      return undefined;
    }
    current = current[token];
  }
  return current;
}

/** Renders a schema pointer such as "#/properties/a/type" as "schema.properties.a.type". */
function formatSchemaPointer(schemaPath) {
  // Drop the leading "#/" so the result does not contain an empty segment ("schema..x").
  const trimmed = (schemaPath || "").replace(/^#?\/?/, "");
  return trimmed ? `schema.${trimmed.split("/").join(".")}` : "schema";
}
141
/**
 * Builds the follow-up prompt that lists validation errors and asks for the data
 * to be extracted again.
 *
 * @param {Array<{path: string, message: string}>} validationErrors - Errors to list.
 * @returns {string} The message, with one numbered line (starting at 1) per error.
 */
function createReaskMessage(validationErrors) {
  const numberedLines = [];
  validationErrors.forEach((validationError, position) => {
    const ordinal = position + 1;
    numberedLines.push(`${ordinal}. Path "${validationError.path}": ${validationError.message}`);
  });
  const formattedErrors = numberedLines.join("\n");
  return `The extracted data has the following validation errors that need to be fixed:

${formattedErrors}

Please extract the data again, ensuring it follows the exact schema requirements.`;
}