@intuned/browser-dev 2.2.3-test-build.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.babelrc +21 -0
- package/.eslintignore +10 -0
- package/.eslintrc.js +39 -0
- package/LICENSE +43 -0
- package/dist/ai/export.d.js +5 -0
- package/dist/ai/export.d.ts +641 -0
- package/dist/ai/extractStructuredData.js +320 -0
- package/dist/ai/extractStructuredDataUsingAi.js +139 -0
- package/dist/ai/extractionHelpers/screenshotHelpers.js +56 -0
- package/dist/ai/extractionHelpers/validateSchema.js +148 -0
- package/dist/ai/index.d.ts +641 -0
- package/dist/ai/index.js +19 -0
- package/dist/ai/isPageLoaded.js +77 -0
- package/dist/ai/prompt.js +39 -0
- package/dist/ai/tests/testCheckAllTypesAreStrings.spec.js +137 -0
- package/dist/ai/tests/testExtractFromContent.spec.js +372 -0
- package/dist/ai/tests/testExtractStructuredData.spec.js +646 -0
- package/dist/ai/tests/testIsPageLoaded.spec.js +277 -0
- package/dist/ai/tools/index.js +48 -0
- package/dist/ai/types/errors.js +67 -0
- package/dist/ai/types/models.js +45 -0
- package/dist/ai/types/types.js +48 -0
- package/dist/ai/validators.js +167 -0
- package/dist/common/Logger/index.js +60 -0
- package/dist/common/Logger/types.js +5 -0
- package/dist/common/SdkError.js +50 -0
- package/dist/common/aiModelsValidations.js +32 -0
- package/dist/common/browser_scripts.js +2596 -0
- package/dist/common/ensureBrowserScripts.js +18 -0
- package/dist/common/extendedTest.js +148 -0
- package/dist/common/extractionHelpers.js +19 -0
- package/dist/common/formatZodError.js +18 -0
- package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
- package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
- package/dist/common/fuzzySearch/utils.js +23 -0
- package/dist/common/getModelProvider.js +18 -0
- package/dist/common/getSimplifiedHtml.js +122 -0
- package/dist/common/hashObject.js +32 -0
- package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
- package/dist/common/html2markdown/index.js +19 -0
- package/dist/common/jwtTokenManager.js +18 -0
- package/dist/common/loadRuntime.js +16 -0
- package/dist/common/locatorHelpers.js +41 -0
- package/dist/common/matching/collectStrings.js +32 -0
- package/dist/common/matching/levenshtein.js +40 -0
- package/dist/common/matching/matching.js +317 -0
- package/dist/common/matching/types.js +1 -0
- package/dist/common/noEmpty.js +9 -0
- package/dist/common/saveSnapshotWithExamples.js +60 -0
- package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
- package/dist/common/xpathMapping.js +107 -0
- package/dist/helpers/clickUntilExhausted.js +85 -0
- package/dist/helpers/downloadFile.js +125 -0
- package/dist/helpers/export.d.js +5 -0
- package/dist/helpers/export.d.ts +1220 -0
- package/dist/helpers/extractMarkdown.js +35 -0
- package/dist/helpers/filterEmptyValues.js +54 -0
- package/dist/helpers/gotoUrl.js +98 -0
- package/dist/helpers/index.d.ts +1220 -0
- package/dist/helpers/index.js +128 -0
- package/dist/helpers/processDate.js +25 -0
- package/dist/helpers/resolveUrl.js +64 -0
- package/dist/helpers/sanitizeHtml.js +74 -0
- package/dist/helpers/saveFileToS3.js +50 -0
- package/dist/helpers/scrollToLoadContent.js +57 -0
- package/dist/helpers/tests/extendedTest.js +130 -0
- package/dist/helpers/tests/testClickUntilExhausted.spec.js +387 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +204 -0
- package/dist/helpers/tests/testExtractMarkdown.spec.js +290 -0
- package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
- package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
- package/dist/helpers/tests/testProcessDate.spec.js +13 -0
- package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
- package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
- package/dist/helpers/tests/testScrollToLoadContent.spec.js +163 -0
- package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +342 -0
- package/dist/helpers/tests/testWithDomSettledWait.spec.js +164 -0
- package/dist/helpers/tests/testWithNetworkIdleWait.spec.js +114 -0
- package/dist/helpers/types/Attachment.js +115 -0
- package/dist/helpers/types/CustomTypeRegistry.js +48 -0
- package/dist/helpers/types/RunEnvironment.js +18 -0
- package/dist/helpers/types/ValidationError.js +17 -0
- package/dist/helpers/types/index.js +51 -0
- package/dist/helpers/uploadFileToS3.js +154 -0
- package/dist/helpers/utils/getS3Client.js +22 -0
- package/dist/helpers/utils/index.js +73 -0
- package/dist/helpers/utils/isDownload.js +10 -0
- package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
- package/dist/helpers/utils/isLocator.js +9 -0
- package/dist/helpers/utils/jwtTokenManager.js +18 -0
- package/dist/helpers/validateDataUsingSchema.js +103 -0
- package/dist/helpers/waitForDomSettled.js +90 -0
- package/dist/helpers/withNetworkSettledWait.js +91 -0
- package/dist/index.d.js +16 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +16 -0
- package/dist/intunedServices/ApiGateway/aiApiGateway.js +99 -0
- package/dist/intunedServices/ApiGateway/factory.js +13 -0
- package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
- package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
- package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
- package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +224 -0
- package/dist/intunedServices/ApiGateway/types.js +11 -0
- package/dist/intunedServices/cache/cache.js +61 -0
- package/dist/intunedServices/cache/index.js +12 -0
- package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
- package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
- package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +135 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +132 -0
- package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
- package/dist/optimized-extractors/common/findTableHeaders.js +162 -0
- package/dist/optimized-extractors/common/index.js +55 -0
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +84 -0
- package/dist/optimized-extractors/common/matching/matching.js +212 -0
- package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
- package/dist/optimized-extractors/common/matching/types.js +18 -0
- package/dist/optimized-extractors/common/matching/utils.js +184 -0
- package/dist/optimized-extractors/common/utils.js +58 -0
- package/dist/optimized-extractors/export.d.js +5 -0
- package/dist/optimized-extractors/export.d.ts +397 -0
- package/dist/optimized-extractors/extractArray.js +120 -0
- package/dist/optimized-extractors/extractObject.js +104 -0
- package/dist/optimized-extractors/index.d.ts +397 -0
- package/dist/optimized-extractors/index.js +31 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
- package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
- package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
- package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
- package/dist/optimized-extractors/models/anthropicModel.js +23 -0
- package/dist/optimized-extractors/models/openaiModel.js +23 -0
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
- package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
- package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
- package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
- package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
- package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
- package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
- package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
- package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
- package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
- package/dist/optimized-extractors/types/errors.js +42 -0
- package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
- package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
- package/dist/optimized-extractors/types/types.js +5 -0
- package/dist/optimized-extractors/validators.js +152 -0
- package/dist/vite-env.d.js +1 -0
- package/dist/vite-env.d.ts +9 -0
- package/docs.md +14 -0
- package/generated-docs/ai/functions/extractStructuredData.mdx +255 -0
- package/generated-docs/ai/functions/isPageLoaded.mdx +88 -0
- package/generated-docs/ai/interfaces/ArraySchema.mdx +36 -0
- package/generated-docs/ai/interfaces/BasicSchema.mdx +14 -0
- package/generated-docs/ai/interfaces/BooleanSchema.mdx +28 -0
- package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +16 -0
- package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +16 -0
- package/generated-docs/ai/interfaces/NumberSchema.mdx +35 -0
- package/generated-docs/ai/interfaces/ObjectSchema.mdx +39 -0
- package/generated-docs/ai/interfaces/StringSchema.mdx +35 -0
- package/generated-docs/ai/interfaces/TextContentItem.mdx +14 -0
- package/generated-docs/ai/type-aliases/ContentItem.mdx +12 -0
- package/generated-docs/ai/type-aliases/JsonSchema.mdx +47 -0
- package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +85 -0
- package/generated-docs/helpers/functions/downloadFile.mdx +99 -0
- package/generated-docs/helpers/functions/extractMarkdown.mdx +56 -0
- package/generated-docs/helpers/functions/filterEmptyValues.mdx +51 -0
- package/generated-docs/helpers/functions/goToUrl.mdx +124 -0
- package/generated-docs/helpers/functions/processDate.mdx +55 -0
- package/generated-docs/helpers/functions/resolveUrl.mdx +165 -0
- package/generated-docs/helpers/functions/sanitizeHtml.mdx +113 -0
- package/generated-docs/helpers/functions/saveFileToS3.mdx +127 -0
- package/generated-docs/helpers/functions/scrollToLoadContent.mdx +89 -0
- package/generated-docs/helpers/functions/uploadFileToS3.mdx +121 -0
- package/generated-docs/helpers/functions/validateDataUsingSchema.mdx +90 -0
- package/generated-docs/helpers/functions/waitForDomSettled.mdx +91 -0
- package/generated-docs/helpers/functions/withNetworkSettledWait.mdx +76 -0
- package/generated-docs/helpers/interfaces/Attachment.mdx +56 -0
- package/generated-docs/helpers/interfaces/S3Configs.mdx +52 -0
- package/generated-docs/helpers/interfaces/SanitizeHtmlOptions.mdx +22 -0
- package/generated-docs/helpers/type-aliases/AttachmentType.mdx +10 -0
- package/generated-docs/helpers/type-aliases/FileType.mdx +61 -0
- package/generated-docs/helpers/type-aliases/Trigger.mdx +62 -0
- package/how-to-run-tests.md +10 -0
- package/intuned-runtime-setup.md +13 -0
- package/package.json +119 -0
- package/tsconfig.eslint.json +5 -0
- package/tsconfig.json +26 -0
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.extractStructuredData = void 0;
|
|
7
|
+
var _extractStructuredDataUsingAi = require("./extractStructuredDataUsingAi");
|
|
8
|
+
var _validators = require("./validators");
|
|
9
|
+
var _screenshotHelpers = require("./extractionHelpers/screenshotHelpers");
|
|
10
|
+
var _formatZodError = require("../common/formatZodError");
|
|
11
|
+
var _cache = require("../intunedServices/cache/cache");
|
|
12
|
+
var _locatorHelpers = require("../common/locatorHelpers");
|
|
13
|
+
var _extractionHelpers = require("../common/extractionHelpers");
|
|
14
|
+
var _getSimplifiedHtml = require("../common/getSimplifiedHtml");
|
|
15
|
+
var _hashObject = require("../common/hashObject");
|
|
16
|
+
var _Logger = require("../common/Logger");
|
|
17
|
+
var _helpers = require("../helpers");
|
|
18
|
+
var _xpathMapping = require("../common/xpathMapping");
|
|
19
|
+
/** Model passed to the AI extractor when the caller does not specify `model`. */
const DEFAULT_EXTRACTION_MODEL = "claude-3-5-haiku-latest";

/**
 * Gathers the strategy-specific inputs (HTML, IMAGE or MARKDOWN) for an extraction.
 *
 * @param {object} validatedData - Parsed options; only `strategy` is read here.
 * @param {object} pageOrLocator - The caller's `source` option.
 * @param {object} pageObject - The page that owns `pageOrLocator`.
 * @param {boolean} isPageInput - True when `pageOrLocator` is the page itself.
 * @returns {Promise<{content: *, getImages: function(): Array, getCacheFingerprint: function(): Promise<object>}>}
 *   `content` is sent to the model, `getImages` yields the image payloads (and throws when
 *   screenshots failed), `getCacheFingerprint` yields the content-dependent cache-key fields.
 * @throws {Error} When the region has no HTML content or the strategy is unsupported.
 */
async function prepareStrategyInput(validatedData, pageOrLocator, pageObject, isPageInput) {
  // Serialized DOM of the whole page; part of the cache key for IMAGE and MARKDOWN.
  const readDocumentHtml = () => pageObject.evaluate(() => document.documentElement.outerHTML);
  switch (validatedData.strategy) {
    case "HTML":
      {
        const containerHandle = isPageInput ? await pageOrLocator.locator("html").elementHandle() : await pageOrLocator.elementHandle();
        if (!containerHandle) {
          throw new Error("No HTML content found in the specified region.");
        }
        const simplifiedHtml = await (0, _getSimplifiedHtml.getSimplifiedHtml)(containerHandle);
        return {
          content: simplifiedHtml,
          getImages: () => [],
          getCacheFingerprint: async () => ({
            html: (0, _extractionHelpers.compressStringSpaces)(simplifiedHtml)
          })
        };
      }
    case "IMAGE":
      {
        const containerHandle = isPageInput ? undefined : await pageOrLocator.elementHandle();
        const screenshots = await (0, _screenshotHelpers.buildImagesFromPageOrHandle)(pageObject, containerHandle);
        return {
          content: "Extract structured data from the following images.",
          // Screenshot failures are only surfaced when the images are actually needed,
          // i.e. after the cache lookup, so a cache hit still succeeds.
          getImages: () => {
            if (screenshots.isErr()) {
              throw new Error(screenshots.error.context);
            }
            return screenshots.value.map(image => ({
              data: image,
              image_type: "png"
            }));
          },
          getCacheFingerprint: async () => ({
            html: await readDocumentHtml()
          })
        };
      }
    case "MARKDOWN":
      {
        const containerHandle = isPageInput ? await pageOrLocator.locator("html").elementHandle() : await pageOrLocator.elementHandle();
        const html = containerHandle ? await containerHandle.innerHTML() : undefined;
        if (!html) {
          throw new Error("No HTML content found in the specified region.");
        }
        // NOTE(review): markdown is extracted from the whole page even when `source` is a
        // locator; the locator's HTML is only used for the emptiness check above. Confirm
        // whether extractMarkdown should receive the locator instead.
        const markdown = await (0, _helpers.extractMarkdown)({
          source: pageObject
        });
        return {
          content: markdown,
          getImages: () => [],
          getCacheFingerprint: async () => ({
            html: await readDocumentHtml(),
            markdown
          })
        };
      }
    default:
      throw new Error(`Unsupported strategy type: ${validatedData.strategy}. Supported types are: HTML, IMAGE, and MARKDOWN.`);
  }
}

/**
 * Looks up a cached extraction and decides whether it can be reused.
 *
 * Without DOM matching any truthy cache entry is returned as-is. With DOM matching the
 * entry must carry a `matchesMapping` that still validates against the page, and only
 * its `result` field is returned.
 *
 * @param {string} cacheKey - Key produced by `hashObject`.
 * @param {*} domMatchingEnabled - Validated `enableDomMatching` flag.
 * @param {object} pageObject - Page used to validate the cached XPath mapping.
 * @returns {Promise<{hit: boolean, value?: *}>}
 */
async function readCachedExtraction(cacheKey, domMatchingEnabled, pageObject) {
  const cachedResult = await _cache.cache.get(cacheKey);
  if (!cachedResult) {
    return {
      hit: false
    };
  }
  if (!domMatchingEnabled) {
    _Logger.logger.info("Returning cached result");
    return {
      hit: true,
      value: cachedResult
    };
  }
  if (!cachedResult.matchesMapping) {
    return {
      hit: false
    };
  }
  const isValid = await (0, _xpathMapping.validateXPathMapping)(pageObject, cachedResult.matchesMapping);
  if (!isValid) {
    // Stale mapping: fall through to a fresh extraction.
    return {
      hit: false
    };
  }
  _Logger.logger.info("Returning cached result with valid DOM matching");
  return {
    hit: true,
    value: cachedResult.result
  };
}

/**
 * Stores an extraction in the cache: the bare result without DOM matching, or
 * `{ result, matchesMapping }` with DOM matching so the mapping can be re-validated later.
 *
 * @param {string} cacheKey - Key produced by `hashObject`.
 * @param {*} domMatchingEnabled - Validated `enableDomMatching` flag.
 * @param {{result: *, xpathMapping?: object}} extraction - Value of the successful AI result.
 * @returns {Promise<void>}
 */
async function writeCachedExtraction(cacheKey, domMatchingEnabled, extraction) {
  const payload = domMatchingEnabled ? {
    result: extraction.result,
    matchesMapping: extraction.xpathMapping || {}
  } : extraction.result;
  await _cache.cache.set(cacheKey, payload);
}

/**
 * Extracts structured data matching `dataSchema` from a page/locator (`source`) or,
 * when only `content` is given, from caller-supplied content items.
 *
 * For a `source`, the options are validated, the strategy-specific input is built
 * (simplified HTML, screenshots or markdown), the cache is consulted when enabled, and
 * otherwise the AI extractor is called and its result cached.
 *
 * @param {object} options - Extraction options (`source` or `content`, `dataSchema`,
 *   `strategy`, `model`, `prompt`, `apiKey`, `enableDomMatching` = false,
 *   `maxRetries` = 3, `enableCache` = true).
 * @returns {Promise<*>} The extracted data.
 * @throws {Error} On invalid options, non-string schema types with DOM matching,
 *   missing HTML content, an unsupported strategy, or a failed extraction.
 */
const extractStructuredData = async options => {
  // Content-only input (no page/locator) has its own code path.
  if ("content" in options && !("source" in options)) {
    return await extractStructuredDataFromContent(options);
  }
  const pageOrLocator = options.source;
  const isPageInput = (0, _locatorHelpers.isPage)(pageOrLocator);
  const {
    model,
    strategy,
    prompt,
    apiKey,
    enableDomMatching = false,
    maxRetries = 3,
    dataSchema,
    enableCache = true
  } = options;
  const inputParsingResult = await _validators.extractDataInputJsonSchema.safeParseAsync({
    source: pageOrLocator,
    model,
    strategy,
    prompt,
    apiKey,
    enableDomMatching,
    enableCache,
    maxRetries,
    dataSchema
  });
  if (!inputParsingResult.success) {
    const errors = (0, _formatZodError.formatZodError)(inputParsingResult.error);
    throw new Error(`invalid input parameters for extractStructuredData: ${errors}`);
  }
  const validatedData = inputParsingResult.data;
  const pageObject = isPageInput ? pageOrLocator : pageOrLocator.page();
  // Always use the validated flag so every strategy agrees with what the AI call receives.
  const domMatchingEnabled = validatedData.enableDomMatching;
  if (domMatchingEnabled && !(0, _validators.checkAllTypesAreStrings)(validatedData.dataSchema)) {
    throw new Error("For DOM matching, all types of the extraction fields must be STRINGS, to match with the DOM.");
  }
  const strategyInput = await prepareStrategyInput(validatedData, pageOrLocator, pageObject, isPageInput);
  let cacheKey = "";
  if (validatedData.enableCache) {
    cacheKey = (0, _hashObject.hashObject)({
      pageUrl: pageObject.url(),
      dataSchema: validatedData.dataSchema,
      strategy: validatedData.strategy,
      model: validatedData.model,
      prompt: validatedData.prompt,
      searchRegion: !isPageInput ? pageOrLocator.toString() : undefined,
      // With DOM matching the page content is left out of the key; the cached XPath
      // mapping is validated against the live page instead.
      ...(domMatchingEnabled ? {} : await strategyInput.getCacheFingerprint())
    }, true);
    const cached = await readCachedExtraction(cacheKey, domMatchingEnabled, pageObject);
    if (cached.hit) {
      return cached.value;
    }
  }
  const images = strategyInput.getImages();
  const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
    page: pageObject,
    options: {
      apiKey: validatedData.apiKey,
      enableDomMatching: validatedData.enableDomMatching,
      jsonSchema: validatedData.dataSchema,
      model: validatedData.model || DEFAULT_EXTRACTION_MODEL,
      content: strategyInput.content,
      prompt: validatedData.prompt,
      images,
      maxRetries: validatedData.maxRetries
    }
  });
  if (result.isErr()) {
    throw new Error(result.error.context);
  }
  if (validatedData.enableCache) {
    await writeCachedExtraction(cacheKey, domMatchingEnabled, result.value);
  }
  return result.value.result;
};
|
|
246
|
+
exports.extractStructuredData = extractStructuredData;
|
|
247
|
+
/**
 * Extracts structured data from caller-supplied content items instead of a page.
 *
 * `options.content` may be a single item or an array of items of type `"text"`,
 * `"image-buffer"` or `"image-url"`. Image URLs are downloaded into buffers, text items
 * are joined with newlines, and everything is sent to the AI extractor with DOM
 * matching disabled. Results are cached unless `options.enableCache` is `false`.
 *
 * @param {object} options - `content`, `dataSchema`, and optionally `prompt`, `model`,
 *   `apiKey`, `maxRetries`, `enableCache`.
 * @returns {Promise<*>} The extracted data (possibly served from the cache).
 * @throws {Error} When the content or the remaining options fail validation, when an
 *   image URL cannot be fetched (network failure or non-2xx response), or when the
 *   extraction itself fails.
 */
const extractStructuredDataFromContent = async options => {
  const contentValidationResult = _validators.contentValidationSchema.safeParse(options.content);
  if (!contentValidationResult.success) {
    const messages = (0, _formatZodError.formatZodError)(contentValidationResult.error);
    throw new Error("extractStructuredDataFromContent content is invalid: \n" + messages.join("\n"));
  }
  // The content was validated above; the generic schema covers every other option.
  const {
    content: _validatedContent,
    ...rest
  } = options;
  const parsingResult = _validators.genericExtractDataInputSchema.safeParse(rest);
  if (!parsingResult.success) {
    const messages = (0, _formatZodError.formatZodError)(parsingResult.error);
    throw new Error("extractStructuredDataFromContent input is invalid: \n" + messages.join("\n"));
  }
  const contentItems = Array.isArray(options.content) ? options.content : [options.content];
  const imagesFromBuffers = contentItems.filter(item => item.type === "image-buffer").map(item => ({
    image_type: item.image_type,
    data: item.data
  }));
  const imageDownloads = contentItems.filter(item => item.type === "image-url").map(async item => {
    try {
      const response = await fetch(item.data);
      // fetch() resolves on HTTP errors too; without this check an error page body
      // would be forwarded to the model as if it were image data.
      if (!response.ok) {
        throw new Error(`HTTP ${response.status} ${response.statusText}`);
      }
      return {
        image_type: item.image_type,
        data: Buffer.from(await response.arrayBuffer())
      };
    } catch (e) {
      throw new Error(`fetching image:${item.data} from url Failed: ${e}`, {
        cause: e
      });
    }
  });
  // Downloaded images come first, then in-memory buffers; the order feeds the cache key.
  const images = [...(await Promise.all(imageDownloads)), ...imagesFromBuffers];
  const texts = contentItems.filter(item => item.type === "text").map(item => item.data);
  // Caching is on by default; only an explicit `false` turns it off.
  const cacheEnabled = options.enableCache !== false;
  let cacheKey = "";
  if (cacheEnabled) {
    cacheKey = (0, _hashObject.hashObject)({
      systemMessage: options.prompt,
      images,
      jsonSchema: options.dataSchema,
      model: options.model,
      text: texts
    }, false);
    const cachedResult = await _cache.cache.get(cacheKey);
    if (cachedResult) {
      _Logger.logger.info("Returning cached result");
      return cachedResult;
    }
  }
  const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
    options: {
      prompt: options.prompt,
      images,
      jsonSchema: options.dataSchema,
      content: texts.join("\n"),
      enableDomMatching: false,
      apiKey: options.apiKey,
      model: options.model || "claude-3-5-haiku-latest",
      maxRetries: options.maxRetries
    }
  });
  if (result.isErr()) {
    throw new Error(result.error.context);
  }
  if (cacheEnabled) {
    await _cache.cache.set(cacheKey, result.value.result);
  }
  return result.value.result;
};
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.extractStructuredDataUsingAi = extractStructuredDataUsingAi;
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var Errors = _interopRequireWildcard(require("./types/errors"));
|
|
9
|
+
var _Logger = require("../common/Logger");
|
|
10
|
+
var _collectStrings = require("../common/matching/collectStrings");
|
|
11
|
+
var _matching = require("../common/matching/matching");
|
|
12
|
+
var _validateSchema = require("./extractionHelpers/validateSchema");
|
|
13
|
+
var _factory = require("../intunedServices/ApiGateway/factory");
|
|
14
|
+
var _tools = require("./tools");
|
|
15
|
+
var _prompt = require("./prompt");
|
|
16
|
+
var _ai = require("ai");
|
|
17
|
+
// Generated interop helper used for the `require("./types/errors")` namespace import.
// On its first call it creates two WeakMap caches (when WeakMap exists) and replaces
// itself with the inner function, which is then invoked. The inner function:
//   - returns `e` unchanged when `t` is falsy and `e.__esModule` is truthy;
//   - otherwise builds a prototype-less wrapper `{ default: e }`, returning it right away
//     when `e` is null or neither an object nor a function;
//   - reuses a previously built wrapper for the same `e` from the cache selected by `t`;
//   - copies every own enumerable key except "default" onto the wrapper, preserving
//     getters/setters via Object.defineProperty and plain-assigning everything else.
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
18
|
+
async function extractStructuredDataUsingAi(input) {
|
|
19
|
+
const {
|
|
20
|
+
apiKey,
|
|
21
|
+
enableDomMatching,
|
|
22
|
+
jsonSchema,
|
|
23
|
+
model,
|
|
24
|
+
content,
|
|
25
|
+
prompt,
|
|
26
|
+
images,
|
|
27
|
+
maxRetries = 3
|
|
28
|
+
} = input.options;
|
|
29
|
+
let accumulatedTokens = 0;
|
|
30
|
+
const toolName = `extract_data`;
|
|
31
|
+
const gateway = _factory.GatewayFactory.createAIGateway();
|
|
32
|
+
const gatewayModel = gateway.getModel(model, apiKey);
|
|
33
|
+
const tools = (0, _tools.getTools)(toolName, jsonSchema);
|
|
34
|
+
const messages = (0, _prompt.getMessages)({
|
|
35
|
+
prompt,
|
|
36
|
+
content,
|
|
37
|
+
images,
|
|
38
|
+
enableDomMatching
|
|
39
|
+
});
|
|
40
|
+
const messagesHistory = messages;
|
|
41
|
+
let currentRetry = 0;
|
|
42
|
+
let result;
|
|
43
|
+
let isGateway = false;
|
|
44
|
+
while (currentRetry < maxRetries) {
|
|
45
|
+
try {
|
|
46
|
+
result = await (0, _ai.generateText)({
|
|
47
|
+
model: gatewayModel,
|
|
48
|
+
messages: messagesHistory,
|
|
49
|
+
tools: tools.isOk() ? tools.value : {},
|
|
50
|
+
toolChoice: "required",
|
|
51
|
+
maxRetries
|
|
52
|
+
});
|
|
53
|
+
if (result.response.headers["x-ai-cost-in-cents"]) {
|
|
54
|
+
isGateway = true;
|
|
55
|
+
accumulatedTokens += result.response.headers["x-ai-cost-in-cents"];
|
|
56
|
+
} else {
|
|
57
|
+
var _result$usage;
|
|
58
|
+
accumulatedTokens += ((_result$usage = result.usage) === null || _result$usage === void 0 ? void 0 : _result$usage.totalTokens) ?? 0;
|
|
59
|
+
}
|
|
60
|
+
const toolCall = result.toolCalls[0] ?? null;
|
|
61
|
+
let extractedData = toolCall.input;
|
|
62
|
+
const isArray = jsonSchema.type === "array";
|
|
63
|
+
if (isArray && extractedData.extracted_data) {
|
|
64
|
+
extractedData = extractedData.extracted_data;
|
|
65
|
+
}
|
|
66
|
+
const errors = (0, _validateSchema.validateToolCallSchema)(extractedData, jsonSchema);
|
|
67
|
+
if (errors.length > 0) {
|
|
68
|
+
const reaskMessage = (0, _validateSchema.createReaskMessage)(errors);
|
|
69
|
+
const modelMessages = (0, _ai.convertToModelMessages)([{
|
|
70
|
+
role: "assistant",
|
|
71
|
+
parts: [{
|
|
72
|
+
type: "step-start"
|
|
73
|
+
}, {
|
|
74
|
+
type: "text",
|
|
75
|
+
text: result.text,
|
|
76
|
+
state: "done"
|
|
77
|
+
}, {
|
|
78
|
+
type: `tool-${toolName}`,
|
|
79
|
+
state: "output-error",
|
|
80
|
+
toolCallId: toolCall.toolCallId,
|
|
81
|
+
input: extractedData,
|
|
82
|
+
errorText: reaskMessage
|
|
83
|
+
}]
|
|
84
|
+
}]);
|
|
85
|
+
messagesHistory.push(...modelMessages);
|
|
86
|
+
currentRetry++;
|
|
87
|
+
continue;
|
|
88
|
+
}
|
|
89
|
+
if (!enableDomMatching || !input.page) {
|
|
90
|
+
_Logger.logger.info(`Extraction completed, total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
|
|
91
|
+
return (0, _neverthrow.ok)({
|
|
92
|
+
result: extractedData,
|
|
93
|
+
xpathMapping: {}
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
const stringsToMatch = (0, _collectStrings.collectStrings)({
|
|
97
|
+
dataStructure: extractedData
|
|
98
|
+
});
|
|
99
|
+
if (!stringsToMatch || stringsToMatch.length === 0) {
|
|
100
|
+
_Logger.logger.info(`Extraction completed.
|
|
101
|
+
No matching data found.
|
|
102
|
+
Total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
|
|
103
|
+
return (0, _neverthrow.ok)({
|
|
104
|
+
result: [],
|
|
105
|
+
xpathMapping: {}
|
|
106
|
+
});
|
|
107
|
+
}
|
|
108
|
+
const {
|
|
109
|
+
replacements,
|
|
110
|
+
xpathMapping
|
|
111
|
+
} = await (0, _matching.replaceWithBestMatches)({
|
|
112
|
+
stringsToMatch,
|
|
113
|
+
pageObject: input.page
|
|
114
|
+
});
|
|
115
|
+
const stringReplacements = {};
|
|
116
|
+
Object.entries(replacements).forEach(([key, value]) => {
|
|
117
|
+
stringReplacements[key] = (value === null || value === void 0 ? void 0 : value.matchText) || null;
|
|
118
|
+
});
|
|
119
|
+
const matchesData = await (0, _validateSchema.recursivelyReplaceStrings)(extractedData, stringReplacements);
|
|
120
|
+
_Logger.logger.info(`Extraction completed,
|
|
121
|
+
Total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
|
|
122
|
+
return (0, _neverthrow.ok)({
|
|
123
|
+
result: matchesData,
|
|
124
|
+
xpathMapping
|
|
125
|
+
});
|
|
126
|
+
} catch (error) {
|
|
127
|
+
_Logger.logger.error("Error during AI extraction", {
|
|
128
|
+
error,
|
|
129
|
+
model
|
|
130
|
+
});
|
|
131
|
+
_Logger.logger.info(`Extraction failed,
|
|
132
|
+
Total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
|
|
133
|
+
return (0, _neverthrow.err)(Errors.invalidExtractionResult(error instanceof Error ? error.message : "Unknown error during extraction"));
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
_Logger.logger.info(`Extraction failed.
|
|
137
|
+
Total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
|
|
138
|
+
return (0, _neverthrow.err)(Errors.maxRetriesExceeded(`Max retries of ${maxRetries} exceeded for extraction`));
|
|
139
|
+
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.buildImagesFromPageOrHandle = buildImagesFromPageOrHandle;
|
|
7
|
+
exports.captureFullPageImagesWithOverlap = captureFullPageImagesWithOverlap;
|
|
8
|
+
var _neverthrow = require("neverthrow");
|
|
9
|
+
var errors = _interopRequireWildcard(require("../types/errors"));
|
|
10
|
+
var _Logger = require("../../common/Logger");
|
|
11
|
+
// Interop helper that exposes the required module `e` as a namespace-style object.
// - On the first call it creates two WeakMap caches (when WeakMap exists) and then
//   replaces itself with the inner function that closes over them.
// - When `t` is falsy and `e` is truthy with `__esModule` set, `e` is returned unchanged.
// - Otherwise a prototype-less object `{ default: e }` is built; it is returned as-is
//   when `e` is null or is neither an object nor a function.
// - The wrapper is cached per `e` (cache `n` when `t` is truthy, otherwise `r`), so a
//   repeated call for the same `e` returns the same wrapper.
// - Every own enumerable key of `e` other than "default" is copied onto the wrapper;
//   accessor properties (get/set) are copied via their descriptor, others by assignment.
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
12
|
+
/**
 * Captures a page as a sequence of viewport-sized screenshots taken while scrolling down.
 *
 * The viewport is resized to 1200px wide by `sliceHeight` tall and is NOT restored here;
 * callers that care about the previous viewport must restore it themselves.
 *
 * @param {object} page - Page object exposing `evaluate`, `setViewportSize`,
 *   `waitForTimeout` and `screenshot` (presumably a Playwright Page — confirm with callers).
 * @param {{overlap?: number, sliceHeight?: number}} [options] - `sliceHeight` is the viewport
 *   height used for each screenshot, `overlap` is how many pixels consecutive slices share.
 *   Any field left out falls back to its default (overlap 200, sliceHeight 1000).
 * @returns {Promise<Array>} The values returned by `page.screenshot()`, top to bottom.
 *   Capture stops after 11 slices; a log line reports how much of the page was covered.
 */
async function captureFullPageImagesWithOverlap(page, options = {
  overlap: 200,
  sliceHeight: 1000
}) {
  // Default each field on its own: a partial object such as { sliceHeight: 500 } used to
  // leave `overlap` undefined, which turned the scroll step into NaN and ended the loop
  // after a single slice.
  const { overlap = 200, sliceHeight = 1000 } = options ?? {};
  const totalHeight = await page.evaluate(() => document.body.scrollHeight);
  const buffers = [];
  if (!(totalHeight > 0)) {
    // Nothing to capture; leave the viewport untouched, as before.
    return buffers;
  }
  // The slice size never changes, so the viewport only needs to be set once.
  await page.setViewportSize({
    width: 1200,
    height: sliceHeight
  });
  let currentHeight = 0;
  while (currentHeight < totalHeight) {
    if (buffers.length > 10) {
      // Report the height actually covered so far, not the full page height.
      _Logger.logger.info(`Page height exceeds maximum capture limit, only first ${currentHeight}px will be captured`);
      break;
    }
    // Every slice after the first starts `overlap` pixels above the running offset.
    const scrollTop = currentHeight - (currentHeight > 0 ? overlap : 0);
    await page.evaluate(y => window.scrollTo(0, y), scrollTop);
    // Give the page a moment to settle after scrolling before taking the screenshot.
    await page.waitForTimeout(500);
    buffers.push(await page.screenshot());
    currentHeight += sliceHeight - overlap;
  }
  return buffers;
}
|
|
36
|
+
/**
 * Produces the screenshot(s) used for image-based extraction.
 *
 * The viewport is temporarily set to 1200x800. When `searchRegionHandler` is given, a single
 * PNG screenshot of that element is returned; otherwise the whole page is captured in
 * overlapping slices. The viewport reported by `page.viewportSize()` before the call is
 * restored on every exit path (success, error result, or thrown exception).
 *
 * @param {object} page - Page object exposing `viewportSize` and `setViewportSize`.
 * @param {object} [searchRegionHandler] - Optional element handle exposing `boundingBox`
 *   and `screenshot`.
 * @returns {Promise<object>} neverthrow result: `ok` with an array of screenshots, or `err`
 *   when the search region has no bounding box.
 */
async function buildImagesFromPageOrHandle(page, searchRegionHandler) {
  const originalViewPortSize = page.viewportSize();
  await page.setViewportSize({
    width: 1200,
    height: 800
  });
  try {
    if (!searchRegionHandler) {
      return (0, _neverthrow.ok)(await captureFullPageImagesWithOverlap(page));
    }
    const size = await searchRegionHandler.boundingBox();
    if (!size) {
      // NOTE(review): this branch fires when boundingBox() yields nothing, yet the message
      // talks about the region being too large — confirm the wording with the API owner.
      return (0, _neverthrow.err)(errors.other("the provided search region is very large, image extraction support up to 5000px height."));
    }
    const regionImage = await searchRegionHandler.screenshot({
      type: "png"
    });
    return (0, _neverthrow.ok)([regionImage]);
  } finally {
    // Previously only the full-page path restored the viewport; the search-region path
    // (and any thrown error) left the page stuck at 1200x800.
    if (originalViewPortSize) {
      await page.setViewportSize(originalViewPortSize);
    }
  }
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.createReaskMessage = createReaskMessage;
|
|
7
|
+
exports.recursivelyReplaceStrings = recursivelyReplaceStrings;
|
|
8
|
+
exports.validateJSONSchema = validateJSONSchema;
|
|
9
|
+
exports.validateToolCallSchema = validateToolCallSchema;
|
|
10
|
+
var _neverthrow = require("neverthrow");
|
|
11
|
+
var errors = _interopRequireWildcard(require("../types/errors"));
|
|
12
|
+
var _ajv = _interopRequireDefault(require("ajv"));
|
|
13
|
+
var _ajvFormats = _interopRequireDefault(require("ajv-formats"));
|
|
14
|
+
// Interop helper: returns `e` itself when it is flagged as an ES module (`__esModule`),
// otherwise wraps it so the value is reachable through a `default` property.
function _interopRequireDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return {
    default: e
  };
}
|
|
15
|
+
// Interop helper that exposes the required module `e` as a namespace-style object.
// - On the first call it creates two WeakMap caches (when WeakMap exists) and then
//   replaces itself with the inner function that closes over them.
// - When `t` is falsy and `e` is truthy with `__esModule` set, `e` is returned unchanged.
// - Otherwise a prototype-less object `{ default: e }` is built; it is returned as-is
//   when `e` is null or is neither an object nor a function.
// - The wrapper is cached per `e` (cache `n` when `t` is truthy, otherwise `r`), so a
//   repeated call for the same `e` returns the same wrapper.
// - Every own enumerable key of `e` other than "default" is copied onto the wrapper;
//   accessor properties (get/set) are copied via their descriptor, others by assignment.
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
16
|
+
/**
 * Performs a structural sanity check of a JSON-schema-like object before it is used.
 *
 * Checks, in order: the schema is an object with a supported `type`; array schemas have a
 * valid `items` schema; object schemas have a `properties` object whose members are all
 * valid and an array-valued `required` (when present); numeric schemas do not combine
 * inclusive and exclusive bounds or have `minimum` above `maximum`; string schemas do not
 * have `minLength` above `maxLength`.
 *
 * @param {object} schema - Schema to check (nested schemas are checked recursively).
 * @returns {object} neverthrow result: `ok(schema)` when every check passes, otherwise
 *   `err` carrying an `invalidJsonSchema` error describing the first problem found.
 */
function validateJSONSchema(schema) {
  const reject = reason => (0, _neverthrow.err)(errors.invalidJsonSchema(reason));
  if (!schema || typeof schema !== "object") {
    return reject("Schema must be an object");
  }
  if (!schema.type) {
    return reject("Schema must have a 'type' property");
  }
  const supportedTypes = ["string", "number", "integer", "boolean", "array", "object"];
  if (!supportedTypes.includes(schema.type)) {
    return reject(`Invalid schema type: ${schema.type}`);
  }
  switch (schema.type) {
    case "array":
      {
        if (!schema.items) {
          return reject("Array schema must have 'items' property");
        }
        // A failure inside the item schema is passed through untouched.
        const itemsOutcome = validateJSONSchema(schema.items);
        if (itemsOutcome.isErr()) {
          return itemsOutcome;
        }
        break;
      }
    case "object":
      {
        if (!schema.properties || typeof schema.properties !== "object") {
          return reject("Object schema must have 'properties' object");
        }
        for (const [propertyName, propertySchema] of Object.entries(schema.properties)) {
          const propertyOutcome = validateJSONSchema(propertySchema);
          if (propertyOutcome.isErr()) {
            // Nested failures are re-wrapped so the message names the offending property.
            return reject(`Invalid schema for property '${propertyName}': ${propertyOutcome.error.context}`);
          }
        }
        if (schema.required && !Array.isArray(schema.required)) {
          return reject("'required' must be an array of property names");
        }
        break;
      }
    case "number":
    case "integer":
      {
        if (schema.maximum !== undefined && schema.exclusiveMaximum !== undefined) {
          return reject("Cannot have both 'maximum' and 'exclusiveMaximum'");
        }
        if (schema.minimum !== undefined && schema.exclusiveMinimum !== undefined) {
          return reject("Cannot have both 'minimum' and 'exclusiveMinimum'");
        }
        // When either bound is undefined the comparison is false, so nothing is reported.
        if (schema.minimum > schema.maximum) {
          return reject("'minimum' cannot be greater than 'maximum'");
        }
        break;
      }
    case "string":
      {
        const hasBothLengths = schema.maxLength !== undefined && schema.minLength !== undefined;
        if (hasBothLengths && schema.minLength > schema.maxLength) {
          return reject("'minLength' cannot be greater than 'maxLength'");
        }
        break;
      }
    default:
      // "boolean" has no additional constraints to check.
      break;
  }
  return (0, _neverthrow.ok)(schema);
}
|
|
70
|
+
/**
 * Returns a copy of `dataStructure` in which every string (and every number, looked up by
 * its string form) that has a non-null entry in `replacements` is swapped for that entry.
 * Arrays and plain objects are walked recursively; all other values are returned as-is.
 *
 * @param {*} dataStructure - Value to transform; it is not mutated.
 * @param {Object<string, *>} replacements - Map from original text to replacement. Entries
 *   that are `null`/`undefined` mean "keep the original value".
 * @returns {*} The transformed copy.
 */
function recursivelyReplaceStrings(dataStructure, replacements) {
  // Only the map's OWN keys count. A plain-object lookup would otherwise resolve values such
  // as "constructor" or "toString" through Object.prototype and replace the text with a function.
  const ownReplacement = key => Object.prototype.hasOwnProperty.call(replacements, key) ? replacements[key] : undefined;
  const visit = node => {
    if (typeof node === "string") {
      return ownReplacement(node) ?? node;
    }
    if (typeof node === "number") {
      return ownReplacement(node.toString()) ?? node;
    }
    if (Array.isArray(node)) {
      return node.map(visit);
    }
    if (node !== null && typeof node === "object") {
      // Object.fromEntries defines data properties, so a "__proto__" key coming from parsed
      // JSON is preserved instead of being swallowed by the prototype setter.
      return Object.fromEntries(Object.entries(node).map(([key, value]) => [key, visit(value)]));
    }
    return node;
  };
  return visit(dataStructure);
}
|
|
88
|
+
/**
 * Validates `instance` against `schema` with Ajv and returns a flat list of problems.
 *
 * @param {*} instance - Data to validate (the tool-call payload).
 * @param {object} schema - JSON schema compiled with Ajv (all errors collected, formats enabled).
 * @returns {Array<{path: string, message: string, value: *, schema_path: string}>}
 *   Empty when the instance is valid. Otherwise one entry per Ajv error:
 *   `path` is the location in the data (e.g. `root.items[0].name`), `value` is the data found
 *   at that location, `schema_path` is the location of the failing keyword in the schema
 *   (e.g. `schema.properties.name.type`).
 */
function validateToolCallSchema(instance, schema) {
  const ajv = new _ajv.default({
    allErrors: true,
    verbose: true
  });
  (0, _ajvFormats.default)(ajv);
  const validate = ajv.compile(schema);
  if (validate(instance)) {
    return [];
  }
  // Decodes one JSON Pointer reference token: "~1" stands for "/", "~0" for "~".
  const decodeToken = token => token.replace(/~1/g, "/").replace(/~0/g, "~");
  return (validate.errors ?? []).map(error => {
    // instancePath looks like "/items/0/name"; an empty path means the root value.
    const tokens = error.instancePath ? error.instancePath.slice(1).split("/").map(decodeToken) : [];
    let pathString = "root";
    if (tokens.length > 0) {
      const rendered = tokens.map(token => /^\d+$/.test(token) ? `[${token}]` : token);
      // Join with dots, then attach index segments directly: "items.[0]" -> "items[0]".
      pathString = `root.${rendered.join(".")}`.replace(/\.\[/g, "[");
    }
    let schemaPathString = "schema";
    if (error.schemaPath) {
      // schemaPath looks like "#/properties/name/type"; dropping the "#" and empty segments
      // avoids the doubled dot ("schema..properties") the leading "/" used to produce.
      const schemaParts = error.schemaPath.replace(/^#/, "").split("/").filter(part => part !== "");
      if (schemaParts.length > 0) {
        schemaPathString = `schema.${schemaParts.join(".")}`;
      }
    }
    // Walk down to the offending value. The guard was inverted before, so the walk never
    // descended and `value` always held the whole instance.
    let invalidValue = instance;
    for (const token of tokens) {
      if (invalidValue === null || invalidValue === undefined) {
        break;
      }
      invalidValue = invalidValue[token];
    }
    return {
      path: pathString,
      message: error.message || "Validation error",
      value: invalidValue,
      schema_path: schemaPathString
    };
  });
}
|
|
141
|
+
/**
 * Builds the follow-up prompt sent back to the model when its output failed validation.
 *
 * @param {Array<{path: string, message: string}>} validationErrors - Problems to list.
 * @returns {string} A header line, a numbered list of errors (one per line, 1-based) and a
 *   closing instruction, separated by blank lines.
 */
function createReaskMessage(validationErrors) {
  const numberedLines = validationErrors.map((issue, position) => {
    const ordinal = position + 1;
    return `${ordinal}. Path "${issue.path}": ${issue.message}`;
  });
  const sections = [
    "The extracted data has the following validation errors that need to be fixed:",
    numberedLines.join("\n"),
    "Please extract the data again, ensuring it follows the exact schema requirements."
  ];
  // Sections are separated by one blank line.
  return sections.join("\n\n");
}
|