@intuned/browser-dev 2.2.3-unify-sdks.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/.babelrc +21 -0
  2. package/.eslintignore +10 -0
  3. package/.eslintrc.js +39 -0
  4. package/LICENSE +43 -0
  5. package/dist/ai-extractors/AnthropicClient/index.js +23 -0
  6. package/dist/ai-extractors/export.d.js +5 -0
  7. package/dist/ai-extractors/export.d.ts +422 -0
  8. package/dist/ai-extractors/extractStructuredData.js +79 -0
  9. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/constants.js +7 -0
  10. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/errors.js +42 -0
  11. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingClaude.js +149 -0
  12. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingGoogle.js +37 -0
  13. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingOpenAi.js +144 -0
  14. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStrucutredDataUsingAiInstance.js +123 -0
  15. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/index.js +55 -0
  16. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/isItemTableHeaderOrFooter.js +96 -0
  17. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/screenshotHelpers.js +55 -0
  18. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/types.js +5 -0
  19. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/utils.js +53 -0
  20. package/dist/ai-extractors/extractionHelpers/types.js +5 -0
  21. package/dist/ai-extractors/fileExtractors.js +176 -0
  22. package/dist/ai-extractors/index.js +31 -0
  23. package/dist/ai-extractors/jsonSchema.d.js +5 -0
  24. package/dist/ai-extractors/jsonSchema.d.ts +49 -0
  25. package/dist/ai-extractors/openAiClients/index.js +23 -0
  26. package/dist/ai-extractors/validators.js +239 -0
  27. package/dist/browser/ai/export.d.js +3 -0
  28. package/dist/browser/ai/export.d.ts +587 -0
  29. package/dist/browser/ai/extractMarkdown.js +15 -0
  30. package/dist/browser/ai/extractStructuredData.js +231 -0
  31. package/dist/browser/ai/extractStructuredDataUsingAi.js +140 -0
  32. package/dist/browser/ai/extractionHelpers/screenshotHelpers.js +55 -0
  33. package/dist/browser/ai/extractionHelpers/validateSchema.js +148 -0
  34. package/dist/browser/ai/index.d.ts +587 -0
  35. package/dist/browser/ai/index.js +19 -0
  36. package/dist/browser/ai/isPageLoaded.js +67 -0
  37. package/dist/browser/ai/prompt.js +39 -0
  38. package/dist/browser/ai/tests/testCheckAllTypesAreStrings.spec.js +143 -0
  39. package/dist/browser/ai/tests/testExtractStructuredData.spec.js +622 -0
  40. package/dist/browser/ai/tools/index.js +48 -0
  41. package/dist/browser/ai/types/errors.js +67 -0
  42. package/dist/browser/ai/types/models.js +45 -0
  43. package/dist/browser/ai/types/types.js +48 -0
  44. package/dist/browser/ai/validators.js +136 -0
  45. package/dist/common/Logger/index.js +60 -0
  46. package/dist/common/Logger/types.js +5 -0
  47. package/dist/common/SdkError.js +50 -0
  48. package/dist/common/aiModelsValidations.js +50 -0
  49. package/dist/common/browser_scripts.js +2596 -0
  50. package/dist/common/ensureBrowserScripts.js +17 -0
  51. package/dist/common/environmentVariables.js +16 -0
  52. package/dist/common/eventTracking/getAiTrackingHeaders.js +31 -0
  53. package/dist/common/eventTracking/getFileTrackingHeaders.js +23 -0
  54. package/dist/common/extendedTest.js +148 -0
  55. package/dist/common/extractionHelpers.js +19 -0
  56. package/dist/common/formatZodError.js +18 -0
  57. package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
  58. package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
  59. package/dist/common/fuzzySearch/utils.js +23 -0
  60. package/dist/common/getModelProvider.js +18 -0
  61. package/dist/common/getSimplifiedHtml.js +122 -0
  62. package/dist/common/hashObject.js +32 -0
  63. package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
  64. package/dist/common/html2markdown/index.js +19 -0
  65. package/dist/common/jwtTokenManager.js +18 -0
  66. package/dist/common/loadRuntime.js +16 -0
  67. package/dist/common/locatorHelpers.js +41 -0
  68. package/dist/common/matching/collectStrings.js +32 -0
  69. package/dist/common/matching/levenshtein.js +40 -0
  70. package/dist/common/matching/matching.js +317 -0
  71. package/dist/common/matching/types.js +1 -0
  72. package/dist/common/noEmpty.js +9 -0
  73. package/dist/common/saveSnapshotWithExamples.js +60 -0
  74. package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
  75. package/dist/common/xpathMapping.js +107 -0
  76. package/dist/helpers/downloadFile.js +125 -0
  77. package/dist/helpers/export.d.js +1 -0
  78. package/dist/helpers/export.d.ts +1294 -0
  79. package/dist/helpers/extractMarkdown.js +35 -0
  80. package/dist/helpers/filterEmptyValues.js +54 -0
  81. package/dist/helpers/gotoUrl.js +93 -0
  82. package/dist/helpers/index.d.ts +1294 -0
  83. package/dist/helpers/index.js +115 -0
  84. package/dist/helpers/processDate.js +25 -0
  85. package/dist/helpers/resolveUrl.js +63 -0
  86. package/dist/helpers/sanitizeHtml.js +73 -0
  87. package/dist/helpers/saveFileToS3.js +46 -0
  88. package/dist/helpers/scrollToLoadContent.js +50 -0
  89. package/dist/helpers/tests/extendedTest.js +130 -0
  90. package/dist/helpers/tests/testDownloadFile.spec.js +197 -0
  91. package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
  92. package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
  93. package/dist/helpers/tests/testIsPageLoaded.spec.js +285 -0
  94. package/dist/helpers/tests/testProcessDate.spec.js +13 -0
  95. package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
  96. package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
  97. package/dist/helpers/tests/testSimplifyHtml.spec.js +251 -0
  98. package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +380 -0
  99. package/dist/helpers/tests/testWaitForDomSettled.spec.js +169 -0
  100. package/dist/helpers/tests/testWaitForNetworkIdle.spec.js +115 -0
  101. package/dist/helpers/types/Attachment.js +81 -0
  102. package/dist/helpers/types/CustomTypeRegistry.js +48 -0
  103. package/dist/helpers/types/RunEnvironment.js +18 -0
  104. package/dist/helpers/types/ValidationError.js +17 -0
  105. package/dist/helpers/types/index.js +51 -0
  106. package/dist/helpers/uploadFileToS3.js +153 -0
  107. package/dist/helpers/utils/getS3Client.js +21 -0
  108. package/dist/helpers/utils/index.js +73 -0
  109. package/dist/helpers/utils/isDownload.js +10 -0
  110. package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
  111. package/dist/helpers/utils/isLocator.js +9 -0
  112. package/dist/helpers/utils/jwtTokenManager.js +18 -0
  113. package/dist/helpers/validateDataUsingSchema.js +119 -0
  114. package/dist/helpers/waitForDomSettled.js +182 -0
  115. package/dist/helpers/waitForNetworkIdle.js +191 -0
  116. package/dist/index.d.js +82 -0
  117. package/dist/index.d.ts +11 -0
  118. package/dist/index.js +84 -0
  119. package/dist/intunedServices/ApiGateway/aiApiGateway.js +87 -0
  120. package/dist/intunedServices/ApiGateway/factory.js +13 -0
  121. package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
  122. package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
  123. package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
  124. package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +221 -0
  125. package/dist/intunedServices/ApiGateway/types.js +11 -0
  126. package/dist/intunedServices/cache/cache.js +61 -0
  127. package/dist/intunedServices/cache/index.js +12 -0
  128. package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
  129. package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
  130. package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
  131. package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +149 -0
  132. package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
  133. package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +145 -0
  134. package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
  135. package/dist/optimized-extractors/common/findTableHeaders.js +175 -0
  136. package/dist/optimized-extractors/common/index.js +55 -0
  137. package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +97 -0
  138. package/dist/optimized-extractors/common/matching/matching.js +212 -0
  139. package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
  140. package/dist/optimized-extractors/common/matching/types.js +18 -0
  141. package/dist/optimized-extractors/common/matching/utils.js +184 -0
  142. package/dist/optimized-extractors/common/utils.js +58 -0
  143. package/dist/optimized-extractors/export.d.js +5 -0
  144. package/dist/optimized-extractors/export.d.ts +397 -0
  145. package/dist/optimized-extractors/extractArray.js +120 -0
  146. package/dist/optimized-extractors/extractObject.js +104 -0
  147. package/dist/optimized-extractors/index.d.ts +397 -0
  148. package/dist/optimized-extractors/index.js +31 -0
  149. package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
  150. package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
  151. package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
  152. package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
  153. package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
  154. package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
  155. package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
  156. package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
  157. package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
  158. package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
  159. package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
  160. package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
  161. package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
  162. package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
  163. package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
  164. package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
  165. package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
  166. package/dist/optimized-extractors/models/anthropicModel.js +23 -0
  167. package/dist/optimized-extractors/models/openaiModel.js +23 -0
  168. package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
  169. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
  170. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
  171. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
  172. package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
  173. package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
  174. package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
  175. package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
  176. package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
  177. package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
  178. package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
  179. package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
  180. package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
  181. package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
  182. package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
  183. package/dist/optimized-extractors/types/errors.js +42 -0
  184. package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
  185. package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
  186. package/dist/optimized-extractors/types/types.js +5 -0
  187. package/dist/optimized-extractors/validators.js +152 -0
  188. package/dist/vite-env.d.js +1 -0
  189. package/dist/vite-env.d.ts +9 -0
  190. package/docs.md +14 -0
  191. package/how-to-run-tests.md +10 -0
  192. package/intuned-runtime-setup.md +13 -0
  193. package/package.json +124 -0
  194. package/tsconfig.eslint.json +5 -0
  195. package/tsconfig.json +26 -0
package/.babelrc ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "presets": [
3
+ [
4
+ "@babel/preset-env",
5
+ {
6
+ "targets": {
7
+ "chrome": 115,
8
+ "node": "16"
9
+ },
10
+ "modules": "commonjs"
11
+ }
12
+ ],
13
+ "@babel/preset-typescript"
14
+ ],
15
+ "plugins": [
16
+ "babel-plugin-macros",
17
+ "@babel/plugin-transform-export-namespace-from"
18
+ ],
19
+ "sourceMaps": false,
20
+ "comments": false
21
+ }
package/.eslintignore ADDED
@@ -0,0 +1,10 @@
1
+ node_modules
2
+ dist
3
+ .eslintrc.js
4
+ *.test.ts
5
+ *.js
6
+ types-package
7
+ vite.config.ts
8
+ reports
9
+ intuned
10
+ **/*.d.ts
package/.eslintrc.js ADDED
@@ -0,0 +1,39 @@
1
+ module.exports = {
2
+ root: true,
3
+ parser: "@typescript-eslint/parser",
4
+ parserOptions: {
5
+ project: "./tsconfig.eslint.json",
6
+ tsconfigRootDir: __dirname,
7
+ },
8
+ plugins: ["@typescript-eslint", "deprecation", "prettier"],
9
+ ignorePatterns: ["src/common/browserScripts/rollup.config.mjs"],
10
+ extends: [
11
+ "eslint:recommended",
12
+ "plugin:@typescript-eslint/eslint-recommended",
13
+ "plugin:@typescript-eslint/recommended",
14
+ ],
15
+ rules: {
16
+ "@next/next/no-html-link-for-pages": 0,
17
+ "prettier/prettier": "error",
18
+
19
+ // recommended for safety
20
+ "@typescript-eslint/no-floating-promises": "error", // forgetting to await Activities and Workflow APIs is bad
21
+ "deprecation/deprecation": "warn",
22
+
23
+ // code style preference
24
+ "object-shorthand": ["warn", "always"],
25
+
26
+ // relaxed rules, for convenience
27
+ "@typescript-eslint/no-unused-vars": [
28
+ "warn",
29
+ {
30
+ argsIgnorePattern: "^_",
31
+ varsIgnorePattern: "^_",
32
+ },
33
+ ],
34
+ "@typescript-eslint/no-explicit-any": "off",
35
+ "@typescript-eslint/no-empty-interface": "off",
36
+ "@typescript-eslint/ban-ts-comment": "warn",
37
+ "no-empty": "warn",
38
+ },
39
+ };
package/LICENSE ADDED
@@ -0,0 +1,43 @@
1
+ Acceptance
2
+ By using the software, you agree to all of the terms and conditions below.
3
+
4
+ Copyright License
5
+ The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below.
6
+
7
+ Limitations
8
+ You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software.
9
+
10
+ You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key.
11
+
12
+ You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor’s trademarks is subject to applicable law.
13
+
14
+ Patents
15
+ The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
16
+
17
+ Notices
18
+ You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms.
19
+
20
+ If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software.
21
+
22
+ No Other Rights
23
+ These terms do not imply any licenses other than those expressly granted in these terms.
24
+
25
+ Termination
26
+ If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently.
27
+
28
+ No Liability
29
+ As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.
30
+
31
+ Definitions
32
+ The licensor is the entity offering these terms, and the software is the software the licensor makes available under these terms, including any portion of it.
33
+
34
+ you refers to the individual or entity agreeing to these terms.
35
+
36
+ your company is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. control means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
37
+
38
+ your licenses are all the licenses granted to you for the software under these terms.
39
+
40
+ use means anything you do with the software requiring one of your licenses.
41
+
42
+ trademark means trademarks, service marks, and similar rights.
43
+
@@ -0,0 +1,23 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.createAnthropicInstance = createAnthropicInstance;
7
+ var _dotenv = require("dotenv");
8
+ var _jwtTokenManager = require("../../common/jwtTokenManager");
9
+ var _sdk = _interopRequireDefault(require("@anthropic-ai/sdk"));
10
+ function _interopRequireDefault(e) { return e && e.__esModule ? e : { default: e }; }
11
+ (0, _dotenv.config)();
/**
 * Builds a client from the default export of "@anthropic-ai/sdk".
 *
 * When `options.apiKey` is truthy, the client is constructed with that key
 * and nothing else. Otherwise the client is constructed with a fixed
 * placeholder key, a base URL assembled from the FUNCTIONS_DOMAIN,
 * INTUNED_WORKSPACE_ID and INTUNED_INTEGRATION_ID environment variables, and
 * `backendFunctionsTokenManager.fetchWithToken` (bound to the manager) as
 * its `fetch` implementation.
 *
 * @param {{ apiKey?: string }} [options] Optional settings; may be omitted or null.
 * @returns A new SDK client instance.
 */
function createAnthropicInstance(options) {
  const callerApiKey = options == null ? undefined : options.apiKey;

  if (!callerApiKey) {
    const {
      FUNCTIONS_DOMAIN,
      INTUNED_WORKSPACE_ID,
      INTUNED_INTEGRATION_ID
    } = process.env;
    const tokenManager = _jwtTokenManager.backendFunctionsTokenManager;

    return new _sdk.default({
      // NOTE(review): judging by its text, this placeholder is presumably
      // swapped server-side — confirm before relying on it.
      apiKey: "--THI_VALUE_WILL_BE_REPLACED_BY_INTUNED_BE--",
      baseURL: `${FUNCTIONS_DOMAIN}/api/${INTUNED_WORKSPACE_ID}/functions/${INTUNED_INTEGRATION_ID}/anthropic`,
      // Bound so that fetchWithToken keeps the manager as its `this`.
      fetch: tokenManager.fetchWithToken.bind(tokenManager)
    });
  }

  return new _sdk.default({
    apiKey: callerApiKey
  });
}
@@ -0,0 +1,5 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
@@ -0,0 +1,422 @@
1
+ import { Locator, Page } from "playwright-core";
2
+ import { JsonSchema, ObjectSchema } from "./jsonSchema";
3
+ /**
4
+ *
5
+ * @param file the file you want to extract the data from.
6
+ * @param options.label a label for this extraction process, used for billing and monitoring
7
+ * @param options.dataSchema the json schema of the data you're trying to extract.
8
+ * @param options.prompt optional, a prompt to guide the extraction process and provide more context.
9
+ * @param options.strategy optional, the strategy to use for extraction. use `IMAGE` if the info you're trying to extract is visual and cannot be converted to markdown. Defaults to `MARKDOWN` strategy with `gpt4-turbo` model.
10
+ * @example
11
+ * ```typescript extractStructuredDataFromFile
12
+ * import { extractStructuredDataFromFile } from "@intuned/sdk/ai-extractors";
13
+ *
14
+ * const movie = await extractStructuredDataFromFile({
15
+ * source: {
16
+ * type: "url",
17
+ * data: "<file url>"
18
+ * },
19
+ * type: "pdf",
20
+ * // pages array is optional, do not pass it if you want to include all pages in the process
21
+ * pages: [1, 2]
22
+ * }, {
23
+ * label: "extract_movie",
24
+ * dataSchema: {
25
+ * type: "object",
26
+ * properties: {
27
+ * "name": {
28
+ * type: "string",
29
+ * description: "movie name"
30
+ * },
31
+ * revenue: {
32
+ * type: "string",
33
+ * description: "movie revenue"
34
+ * }
35
+ * }
36
+ * }
37
+ * })
38
+ *
39
+ * ```
40
+ */
41
+ export declare function extractStructuredDataFromFile(
42
+ file: ImageFile | PdfFile | SpreadsheetFile | DocumentFile,
43
+ options: {
44
+ label: string;
45
+ dataSchema: JsonSchema;
46
+ prompt?: string;
47
+ strategy?: MarkdownFileStrategy | ImageFileStrategy;
48
+ }
49
+ ): Promise<any>;
50
+
51
+ /**
52
+ * Converts a file to markdown (ImageFile, PdfFile, SpreadsheetFile, or DocumentFile).
53
+ *
54
+ * @param file - The file you want to extract the markdown content from.
55
+ * @param options.label - A label for this extraction process, used for billing and monitoring.
56
+ * @returns {Promise<string>} A promise that resolves to the extracted markdown content as a string.
57
+ *
58
+ * @example
59
+ * ```typescript extractMarkdownFromFile
60
+ * import { extractMarkdownFromFile } from "@intuned/sdk/ai-extractors";
61
+ *
62
+ * const markdown = await extractMarkdownFromFile({
63
+ * source: {
64
+ * type: "url",
65
+ * data: "<file url>"
66
+ * },
67
+ * type: "pdf",
68
+ * // pages array is optional, do not pass it if you want to include all pages in the process
69
+ * pages: [1, 2]
70
+ * }, {
71
+ * label: "extract_markdown"
72
+ * });
73
+ *
74
+ * console.log(markdown);
75
+ * ```
76
+ */
77
+ export declare function extractMarkdownFromFile(
78
+ file: ImageFile | PdfFile | SpreadsheetFile | DocumentFile,
79
+ options: {
80
+ label: string;
81
+ }
82
+ ): Promise<string>;
83
+
84
+ /**
85
+ * Represents a file source from a buffer.
86
+ *
87
+ * @interface
88
+ * @property type - The type of the file source, which is always "buffer".
89
+ * @property data - The buffer data of the file.
90
+ */
91
+ interface FileBufferSource {
92
+ type: "buffer";
93
+ data: Buffer;
94
+ }
95
+
96
+ /**
97
+ * Represents a file source from a URL.
98
+ *
99
+ * @interface
100
+ * @property type - The type of the file source, which is always "url".
101
+ * @property data - The URL of the file.
102
+ */
103
+ interface FileUrlSource {
104
+ type: "url";
105
+ data: string;
106
+ }
107
+
108
+ /**
109
+ * Represents a file source from a base64 string.
110
+ *
111
+ * @interface
112
+ * @property type - The type of the file source, which is always "base64".
113
+ * @property data - The base64 string of the file data.
114
+ */
115
+ interface FileBase64Source {
116
+ type: "base64";
117
+ data: string;
118
+ }
119
+
120
+ /**
121
+ * Represents an image file source.
122
+ *
123
+ * @interface
124
+ * @property type - The type of the file, which is always "image".
125
+ * @property source - The source of the file data.
126
+ */
127
+ interface ImageFile {
128
+ type: "image";
129
+ source: FileBufferSource | FileUrlSource | FileBase64Source;
130
+ }
131
+
132
+ /**
133
+ * Represents a PDF file source.
134
+ *
135
+ * @interface
136
+ * @property type - The type of the file, which is always "pdf".
137
+ * @property [pages] - Optional. The specific pages of the PDF to extract data from. If not provided, all pages will be included.
138
+ * @property source - The source of the file data.
139
+ */
140
+ export interface PdfFile {
141
+ type: "pdf";
142
+ pages?: number[];
143
+ source: FileBufferSource | FileUrlSource | FileBase64Source;
144
+ }
145
+
146
+ /**
147
+ * Represents a Spreadsheet file source. For now, only .xlsx Excel spreadsheets are supported.
148
+ *
149
+ * @interface
150
+ * @property type - The type of the file, which is always "spreadsheet".
151
+ * @property sheetName - The name of the sheet to extract data from.
152
+ * @property source - The source of the file data.
153
+ */
154
+ export interface SpreadsheetFile {
155
+ type: "spreadsheet";
156
+ sheetName: string;
157
+ source: FileBufferSource | FileUrlSource | FileBase64Source;
158
+ }
159
+
160
+ /**
161
+ * Represents a Document file source. For now, only .docx Word files are supported.
162
+ *
163
+ * @interface
164
+ * @property type - The type of the file, which is always "document".
165
+ * @property [pages] - Optional. The specific pages of the document to extract data from. If not provided, all pages will be included.
166
+ * @property source - The source of the file data.
167
+ * @property config - Optional. Configurations on how the spreadsheet should be processed when it is converted to a document.
168
+ */
169
+ export interface DocumentFile {
170
+ type: "document";
171
+ pages?: number[];
172
+ source: FileBufferSource | FileUrlSource | FileBase64Source;
173
+ }
174
+
175
+ /**
176
+ * Extracts tables from a file (ImageFile, PdfFile, SpreadsheetFile, or DocumentFile).
177
+ *
178
+ * @param file - The file you want to extract the tables from.
179
+ * @param options.label - A label for this extraction process, used for billing and monitoring.
180
+ * @returns {Promise<Array<ExtractedTable>>} A promise that resolves to an array of extracted tables.
181
+ *
182
+ * @example
183
+ * ```typescript extractTablesFromFile
184
+ * import { extractTablesFromFile } from "@intuned/sdk/ai-extractors";
185
+ *
186
+ * const tables = await extractTablesFromFile({
187
+ * source: {
188
+ * type: "url",
189
+ * data: "<file url>"
190
+ * },
191
+ * type: "pdf",
192
+ * // pages array is optional, do not pass it if you want to include all pages in the process
193
+ * pages: [1, 2]
194
+ * }, {
195
+ * label: "extract_tables"
196
+ * });
197
+ *
198
+ * console.log(tables);
199
+ * ```
200
+ */
201
+ export declare function extractTablesFromFile(
202
+ file: ImageFile | PdfFile | SpreadsheetFile | DocumentFile,
203
+ options: {
204
+ label: string;
205
+ }
206
+ ): Promise<Array<ExtractedTable>>;
207
+
208
+ /**
209
+ * Represents a table extracted from a pdf file.
210
+ *
211
+ * @interface
212
+ * @property title - the title of the table if found
213
+ * @property content - a two-dimensional array containing the table values.
214
+ */
215
+ interface ExtractedTable {
216
+ title: string | null;
217
+ content: (string | null)[][];
218
+ }
219
+
220
+ /**
221
+ * this strategy will use a screenshot of the page/locator with some processing to extract the needed data.
222
+ * should be used when the information you're trying to extract is not present in the dom as a text but can be identified visually.
223
+ * @interface
224
+ * @property model - the model to use in the extraction process.
225
+ * @property type - the type of the strategy
226
+ */
227
+
228
+ /**
229
+ * this strategy will use the html of the page/locator to extract the needed data. we filter out some of the attributes to reduce context.
230
+ * the attributes included are only: `aria-label` `data-name` `name` `type` `placeholder` `value` `role` `title` `href` `id` `alt`,
231
+ *
232
+ * @interface
233
+ * @property model - the model to use in the extraction process
234
+ * @property type - the type of the strategy
235
+ */
236
+
237
+ /**
238
+ * this strategy will extract markdown content from the file then run data extraction on it.
239
+ *
240
+ * @interface
241
+ * @property model - the model to use in the extraction process
242
+ * @property type - the type of the strategy
243
+ */
244
+ export interface MarkdownFileStrategy {
245
+ model:
246
+ | "claude-3-haiku"
247
+ | "claude-3-haiku-20240307"
248
+ | "claude-3-5-haiku"
249
+ | "claude-3-5-haiku-20241022"
250
+ | "claude-3.5-sonnet"
251
+ | "claude-3-5-sonnet-20240620"
252
+ | "claude-3-5-sonnet-20241022"
253
+ | "claude-opus-4"
254
+ | "claude-opus-4-20250514"
255
+ | "claude-sonnet-4"
256
+ | "claude-sonnet-4-20250514"
257
+ | "gpt4-turbo"
258
+ | "gpt-4-turbo-2024-04-09"
259
+ | "gpt3.5-turbo"
260
+ | "gpt-3.5-turbo-0125"
261
+ | "gpt-4o"
262
+ | "gpt-4o-2024-05-13"
263
+ | "gpt-4o-mini"
264
+ | "gpt-4o-mini-2024-07-18"
265
+ | "gemini-1.5-pro"
266
+ | "gemini-1.5-pro-002"
267
+ | "gemini-1.5-flash-8b"
268
+ | "gemini-1.5-flash-8b-002"
269
+ | "gemini-1.5-flash"
270
+ | "gemini-1.5-flash-002"
271
+ | "gemini-2.0-flash-exp";
272
+ type: "MARKDOWN";
273
+ }
274
+
275
+ /**
276
+ * this strategy will use the image content of the file to extract the needed data.
277
+ * should be used when the information you're trying to extract cannot be converted to markdown. For example, a checkbox in a pdf file.
278
+ * @interface
279
+ * @property model - the model to use in the extraction process.
280
+ * @property type - the type of the strategy
281
+ */
282
+ export interface ImageFileStrategy {
283
+ model:
284
+ | "claude-3-haiku"
285
+ | "claude-3-haiku-20240307"
286
+ | "claude-3.5-sonnet"
287
+ | "claude-3-5-sonnet-20240620"
288
+ | "claude-3-5-sonnet-20241022"
289
+ | "claude-opus-4"
290
+ | "claude-opus-4-20250514"
291
+ | "claude-sonnet-4"
292
+ | "claude-sonnet-4-20250514"
293
+ | "gpt4-turbo"
294
+ | "gpt-4-turbo-2024-04-09"
295
+ | "gpt-4o"
296
+ | "gpt-4o-2024-05-13"
297
+ | "gpt-4o-mini"
298
+ | "gpt-4o-mini-2024-07-18"
299
+ | "gemini-1.5-pro"
300
+ | "gemini-1.5-pro-002"
301
+ | "gemini-1.5-flash-8b"
302
+ | "gemini-1.5-flash-8b-002"
303
+ | "gemini-1.5-flash"
304
+ | "gemini-1.5-flash-002"
305
+ | "gemini-2.0-flash-exp";
306
+ type: "IMAGE";
307
+ }
308
+
309
+ /**
310
+ * Extracts structured data from content items (text or images).
311
+ *
312
+ * @param content - The content items from which to extract the structured data.
313
+ * @param options.label - A label for this extraction process, used for billing and monitoring.
314
+ * @param options.dataSchema - The JSON schema of the data you're trying to extract.
315
+ * @param [options.prompt] - Optional. A prompt to guide the extraction process.
316
+ * @param options.model - The model to use for extraction.
317
+ * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
318
+ * @returns A promise that resolves to the extracted structured data.
319
+ *
320
+ * @example
321
+ * ```typescript extractStructuredDataFromContent
322
+ * import { extractStructuredDataFromContent } from "@intuned/sdk/ai-extractors";
323
+ *
324
+ * const content = [
325
+ * { type: "text", data: "Sample text data" },
326
+ * {
327
+ * type: "image-url",
328
+ * image_type: "jpeg",
329
+ * data: "https://example.com/image.jpg"
330
+ * }
331
+ * ];
332
+ *
333
+ * const options = {
334
+ * label: "extract_contact_info",
335
+ * dataSchema: {
336
+ * type: "object",
337
+ * properties: {
338
+ * name: { type: "string", description: "contact name" },
339
+ * phone: { type: "string", description: "contact info" }
340
+ * }
341
+ * },
342
+ * model: "gpt4-turbo"
343
+ * };
344
+ *
345
+ * const data = await extractStructuredDataFromContent(content, options);
346
+ * console.log(data);
347
+ * ```
348
+ */
349
+ export declare function extractStructuredDataFromContent(
350
+ content:
351
+ | (TextContentItem | ImageBufferContentItem | ImageUrlContentItem)[]
352
+ | TextContentItem
353
+ | ImageBufferContentItem
354
+ | ImageUrlContentItem,
355
+ options: {
356
+ label: string;
357
+ dataSchema: ObjectSchema;
358
+ prompt?: string;
359
+ model:
360
+ | "claude-3-haiku"
361
+ | "claude-3-haiku-20240307"
362
+ | "claude-3-5-haiku"
363
+ | "claude-3-5-haiku-20241022"
364
+ | "claude-3.5-sonnet"
365
+ | "claude-3-5-sonnet-20240620"
366
+ | "claude-3-5-sonnet-20241022"
367
+ | "claude-opus-4"
368
+ | "claude-opus-4-20250514"
369
+ | "claude-sonnet-4"
370
+ | "claude-sonnet-4-20250514"
371
+ | "gpt4-turbo"
372
+ | "gpt-4-turbo-2024-04-09"
373
+ | "gpt3.5-turbo"
374
+ | "gpt-3.5-turbo-0125"
375
+ | "gpt-4o"
376
+ | "gpt-4o-2024-05-13"
377
+ | "gpt-4o-mini"
378
+ | "gpt-4o-mini-2024-07-18"
379
+ | "gemini-1.5-pro"
380
+ | "gemini-1.5-pro-002"
381
+ | "gemini-1.5-flash-8b"
382
+ | "gemini-1.5-flash-8b-002"
383
+ | "gemini-1.5-flash"
384
+ | "gemini-1.5-flash-002"
385
+ | "gemini-2.0-flash-exp";
386
+ apiKey?: string;
387
+ }
388
+ ): Promise<any>;
389
+
390
+ /**
391
+ * @interface
392
+ * @property type - The type of the content item, which is always "text".
393
+ * @property data - The text data.
394
+ */
395
+ export interface TextContentItem {
396
+ type: "text";
397
+ data: string;
398
+ }
399
+
400
+ /**
401
+ * @interface
402
+ * @property type - The type of the content item, which is always "image-buffer".
403
+ * @property image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
404
+ * @property data - The buffer containing the image data.
405
+ */
406
+ export interface ImageBufferContentItem {
407
+ type: "image-buffer";
408
+ image_type: "png" | "jpeg" | "gif" | "webp";
409
+ data: Buffer;
410
+ }
411
+
412
+ /**
413
+ * @interface
414
+ * @property type - The type of the content item, which is always "image-url".
415
+ * @property image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
416
+ * @property data - The URL of the image.
417
+ */
418
+ export interface ImageUrlContentItem {
419
+ type: "image-url";
420
+ image_type: "png" | "jpeg" | "gif" | "webp";
421
+ data: string;
422
+ }
@@ -0,0 +1,79 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.extractStructuredDataFromContent = extractStructuredDataFromContent;
7
+ var _extractStructuredDataUsingAi = require("./extractionHelpers/extractStructuredDataUsingAi");
8
+ var _validators = require("./validators");
9
+ var _formatZodError = require("../common/formatZodError");
10
+ var _checksumUtils = require("../optimized-extractors/objectExtractionHelpers/checksumUtils");
11
+ var _cache = require("../intunedServices/cache");
12
+ var _Logger = require("../common/Logger");
13
+ var _aiModelsValidations = require("../common/aiModelsValidations");
14
/**
 * Downloads the image behind an "image-url" content item and returns it in the
 * same `{ image_type, data }` shape that "image-buffer" items already have.
 *
 * @param {{ image_type: string, data: string }} item - `data` is the URL to download.
 * @returns {Promise<{ image_type: string, data: Buffer }>} The downloaded image bytes.
 * @throws {Error} When the request fails or the server answers with a non-2xx status.
 */
async function fetchImageAsBuffer({ image_type, data: url }) {
  try {
    const response = await fetch(url);
    // fetch() only rejects on network-level failures; an HTTP error status
    // still resolves. Without this check an error page would be handed to the
    // model (and hashed into the cache key) as if it were the image.
    if (!response.ok) {
      throw new Error(`unexpected HTTP status ${response.status} ${response.statusText}`);
    }
    return {
      image_type,
      data: Buffer.from(await response.arrayBuffer())
    };
  } catch (e) {
    // Same message as before, with the original error kept as `cause`.
    throw new Error(`fetching image:${url} from url Failed: ${e}`, { cause: e });
  }
}

/**
 * Extracts structured data from text and/or image content using an AI model.
 *
 * Flow: validate both arguments, normalise the content to an array, download
 * any URL images, look the request up in the cache, and only call the model
 * on a cache miss. A successful model result is written to the cache before
 * it is returned.
 *
 * @param data - A single content item or an array of items. The item types
 *   handled here are "text", "image-buffer" and "image-url".
 * @param options - Extraction options; this function reads `label`, `prompt`,
 *   `dataSchema`, `model` and `apiKey`.
 * @returns {Promise<any>} The cached result if one exists, otherwise the
 *   freshly extracted result.
 * @throws {Error} If `data` or `options` fail schema validation, if an image
 *   URL cannot be downloaded, or if the AI extraction returns an error result.
 */
async function extractStructuredDataFromContent(...[data, options]) {
  const contentValidationResult = _validators.contentValidationSchema.safeParse(data);
  if (!contentValidationResult.success) {
    const messages = (0, _formatZodError.formatZodError)(contentValidationResult.error);
    throw new Error("extractStructuredDataFromContent content is invalid: \n" + messages.join("\n"));
  }
  const parsingResult = _validators.genericExtractDataInputSchema.safeParse(options);
  if (!parsingResult.success) {
    const messages = (0, _formatZodError.formatZodError)(parsingResult.error);
    throw new Error("extractStructuredDataFromContent input is invalid: \n" + messages.join("\n"));
  }

  const content = Array.isArray(data) ? data : [data];
  const imagesFromBuffers = content
    .filter(c => c.type === "image-buffer")
    .map(c => ({
      image_type: c.image_type,
      data: c.data
    }));
  // All downloads are started together and awaited as a group below.
  const imagesFromUrls = content
    .filter(c => c.type === "image-url")
    .map(c => fetchImageAsBuffer(c));
  // Order matters: URL images come first, then buffer images. The cache key
  // below is derived from this array, so the order must stay stable.
  const images = [...(await Promise.all(imagesFromUrls)), ...imagesFromBuffers];
  const texts = content.filter(c => c.type === "text").map(c => c.data);

  const cacheKey = (0, _checksumUtils.hashObject)({
    identifier: options.label,
    entityName: "data",
    systemMessage: options.prompt,
    images,
    jsonSchema: options.dataSchema,
    // Use the mapped model name when MODELS_MAPPINGS has an entry for it.
    model: _aiModelsValidations.MODELS_MAPPINGS[options.model] ?? options.model,
    text: texts
  }, false);
  const cachedResult = await _cache.cache.get(cacheKey);
  if (cachedResult) {
    _Logger.logger.info(`extractor ${options.label}: used cached result`);
    return cachedResult;
  }

  const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
    logAiCallCost: true,
    identifier: options.label,
    entityName: "data",
    systemMessage: options.prompt,
    images,
    jsonSchema: options.dataSchema,
    model: options.model,
    text: texts,
    apiKey: options.apiKey
  });
  if (result.isErr()) {
    throw new Error(result.error.context);
  }
  await _cache.cache.set(cacheKey, result.value.result);
  return result.value.result;
}
@@ -0,0 +1,7 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.DEFAULT_CLAUDE_MODEL_MAX_TOKEN = void 0;
7
+ const DEFAULT_CLAUDE_MODEL_MAX_TOKEN = exports.DEFAULT_CLAUDE_MODEL_MAX_TOKEN = 4096; // NOTE(review): presumably the default max-token limit used for Claude requests — confirm against callers.