@intuned/browser-dev 2.2.3-test-build.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.babelrc +21 -0
- package/.eslintignore +10 -0
- package/.eslintrc.js +39 -0
- package/LICENSE +43 -0
- package/dist/ai/export.d.js +5 -0
- package/dist/ai/export.d.ts +641 -0
- package/dist/ai/extractStructuredData.js +320 -0
- package/dist/ai/extractStructuredDataUsingAi.js +139 -0
- package/dist/ai/extractionHelpers/screenshotHelpers.js +56 -0
- package/dist/ai/extractionHelpers/validateSchema.js +148 -0
- package/dist/ai/index.d.ts +641 -0
- package/dist/ai/index.js +19 -0
- package/dist/ai/isPageLoaded.js +77 -0
- package/dist/ai/prompt.js +39 -0
- package/dist/ai/tests/testCheckAllTypesAreStrings.spec.js +137 -0
- package/dist/ai/tests/testExtractFromContent.spec.js +372 -0
- package/dist/ai/tests/testExtractStructuredData.spec.js +646 -0
- package/dist/ai/tests/testIsPageLoaded.spec.js +277 -0
- package/dist/ai/tools/index.js +48 -0
- package/dist/ai/types/errors.js +67 -0
- package/dist/ai/types/models.js +45 -0
- package/dist/ai/types/types.js +48 -0
- package/dist/ai/validators.js +167 -0
- package/dist/common/Logger/index.js +60 -0
- package/dist/common/Logger/types.js +5 -0
- package/dist/common/SdkError.js +50 -0
- package/dist/common/aiModelsValidations.js +32 -0
- package/dist/common/browser_scripts.js +2596 -0
- package/dist/common/ensureBrowserScripts.js +18 -0
- package/dist/common/extendedTest.js +148 -0
- package/dist/common/extractionHelpers.js +19 -0
- package/dist/common/formatZodError.js +18 -0
- package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
- package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
- package/dist/common/fuzzySearch/utils.js +23 -0
- package/dist/common/getModelProvider.js +18 -0
- package/dist/common/getSimplifiedHtml.js +122 -0
- package/dist/common/hashObject.js +32 -0
- package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
- package/dist/common/html2markdown/index.js +19 -0
- package/dist/common/jwtTokenManager.js +18 -0
- package/dist/common/loadRuntime.js +16 -0
- package/dist/common/locatorHelpers.js +41 -0
- package/dist/common/matching/collectStrings.js +32 -0
- package/dist/common/matching/levenshtein.js +40 -0
- package/dist/common/matching/matching.js +317 -0
- package/dist/common/matching/types.js +1 -0
- package/dist/common/noEmpty.js +9 -0
- package/dist/common/saveSnapshotWithExamples.js +60 -0
- package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
- package/dist/common/xpathMapping.js +107 -0
- package/dist/helpers/clickUntilExhausted.js +85 -0
- package/dist/helpers/downloadFile.js +125 -0
- package/dist/helpers/export.d.js +5 -0
- package/dist/helpers/export.d.ts +1220 -0
- package/dist/helpers/extractMarkdown.js +35 -0
- package/dist/helpers/filterEmptyValues.js +54 -0
- package/dist/helpers/gotoUrl.js +98 -0
- package/dist/helpers/index.d.ts +1220 -0
- package/dist/helpers/index.js +128 -0
- package/dist/helpers/processDate.js +25 -0
- package/dist/helpers/resolveUrl.js +64 -0
- package/dist/helpers/sanitizeHtml.js +74 -0
- package/dist/helpers/saveFileToS3.js +50 -0
- package/dist/helpers/scrollToLoadContent.js +57 -0
- package/dist/helpers/tests/extendedTest.js +130 -0
- package/dist/helpers/tests/testClickUntilExhausted.spec.js +387 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +204 -0
- package/dist/helpers/tests/testExtractMarkdown.spec.js +290 -0
- package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
- package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
- package/dist/helpers/tests/testProcessDate.spec.js +13 -0
- package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
- package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
- package/dist/helpers/tests/testScrollToLoadContent.spec.js +163 -0
- package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +342 -0
- package/dist/helpers/tests/testWithDomSettledWait.spec.js +164 -0
- package/dist/helpers/tests/testWithNetworkIdleWait.spec.js +114 -0
- package/dist/helpers/types/Attachment.js +115 -0
- package/dist/helpers/types/CustomTypeRegistry.js +48 -0
- package/dist/helpers/types/RunEnvironment.js +18 -0
- package/dist/helpers/types/ValidationError.js +17 -0
- package/dist/helpers/types/index.js +51 -0
- package/dist/helpers/uploadFileToS3.js +154 -0
- package/dist/helpers/utils/getS3Client.js +22 -0
- package/dist/helpers/utils/index.js +73 -0
- package/dist/helpers/utils/isDownload.js +10 -0
- package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
- package/dist/helpers/utils/isLocator.js +9 -0
- package/dist/helpers/utils/jwtTokenManager.js +18 -0
- package/dist/helpers/validateDataUsingSchema.js +103 -0
- package/dist/helpers/waitForDomSettled.js +90 -0
- package/dist/helpers/withNetworkSettledWait.js +91 -0
- package/dist/index.d.js +16 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +16 -0
- package/dist/intunedServices/ApiGateway/aiApiGateway.js +99 -0
- package/dist/intunedServices/ApiGateway/factory.js +13 -0
- package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
- package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
- package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
- package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +224 -0
- package/dist/intunedServices/ApiGateway/types.js +11 -0
- package/dist/intunedServices/cache/cache.js +61 -0
- package/dist/intunedServices/cache/index.js +12 -0
- package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
- package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
- package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +135 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +132 -0
- package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
- package/dist/optimized-extractors/common/findTableHeaders.js +162 -0
- package/dist/optimized-extractors/common/index.js +55 -0
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +84 -0
- package/dist/optimized-extractors/common/matching/matching.js +212 -0
- package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
- package/dist/optimized-extractors/common/matching/types.js +18 -0
- package/dist/optimized-extractors/common/matching/utils.js +184 -0
- package/dist/optimized-extractors/common/utils.js +58 -0
- package/dist/optimized-extractors/export.d.js +5 -0
- package/dist/optimized-extractors/export.d.ts +397 -0
- package/dist/optimized-extractors/extractArray.js +120 -0
- package/dist/optimized-extractors/extractObject.js +104 -0
- package/dist/optimized-extractors/index.d.ts +397 -0
- package/dist/optimized-extractors/index.js +31 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
- package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
- package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
- package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
- package/dist/optimized-extractors/models/anthropicModel.js +23 -0
- package/dist/optimized-extractors/models/openaiModel.js +23 -0
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
- package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
- package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
- package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
- package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
- package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
- package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
- package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
- package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
- package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
- package/dist/optimized-extractors/types/errors.js +42 -0
- package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
- package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
- package/dist/optimized-extractors/types/types.js +5 -0
- package/dist/optimized-extractors/validators.js +152 -0
- package/dist/vite-env.d.js +1 -0
- package/dist/vite-env.d.ts +9 -0
- package/docs.md +14 -0
- package/generated-docs/ai/functions/extractStructuredData.mdx +255 -0
- package/generated-docs/ai/functions/isPageLoaded.mdx +88 -0
- package/generated-docs/ai/interfaces/ArraySchema.mdx +36 -0
- package/generated-docs/ai/interfaces/BasicSchema.mdx +14 -0
- package/generated-docs/ai/interfaces/BooleanSchema.mdx +28 -0
- package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +16 -0
- package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +16 -0
- package/generated-docs/ai/interfaces/NumberSchema.mdx +35 -0
- package/generated-docs/ai/interfaces/ObjectSchema.mdx +39 -0
- package/generated-docs/ai/interfaces/StringSchema.mdx +35 -0
- package/generated-docs/ai/interfaces/TextContentItem.mdx +14 -0
- package/generated-docs/ai/type-aliases/ContentItem.mdx +12 -0
- package/generated-docs/ai/type-aliases/JsonSchema.mdx +47 -0
- package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +85 -0
- package/generated-docs/helpers/functions/downloadFile.mdx +99 -0
- package/generated-docs/helpers/functions/extractMarkdown.mdx +56 -0
- package/generated-docs/helpers/functions/filterEmptyValues.mdx +51 -0
- package/generated-docs/helpers/functions/goToUrl.mdx +124 -0
- package/generated-docs/helpers/functions/processDate.mdx +55 -0
- package/generated-docs/helpers/functions/resolveUrl.mdx +165 -0
- package/generated-docs/helpers/functions/sanitizeHtml.mdx +113 -0
- package/generated-docs/helpers/functions/saveFileToS3.mdx +127 -0
- package/generated-docs/helpers/functions/scrollToLoadContent.mdx +89 -0
- package/generated-docs/helpers/functions/uploadFileToS3.mdx +121 -0
- package/generated-docs/helpers/functions/validateDataUsingSchema.mdx +90 -0
- package/generated-docs/helpers/functions/waitForDomSettled.mdx +91 -0
- package/generated-docs/helpers/functions/withNetworkSettledWait.mdx +76 -0
- package/generated-docs/helpers/interfaces/Attachment.mdx +56 -0
- package/generated-docs/helpers/interfaces/S3Configs.mdx +52 -0
- package/generated-docs/helpers/interfaces/SanitizeHtmlOptions.mdx +22 -0
- package/generated-docs/helpers/type-aliases/AttachmentType.mdx +10 -0
- package/generated-docs/helpers/type-aliases/FileType.mdx +61 -0
- package/generated-docs/helpers/type-aliases/Trigger.mdx +62 -0
- package/how-to-run-tests.md +10 -0
- package/intuned-runtime-setup.md +13 -0
- package/package.json +119 -0
- package/tsconfig.eslint.json +5 -0
- package/tsconfig.json +26 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.runAiExtraction = runAiExtraction;
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var _extractionHelpers = require("../../common/extractionHelpers");
|
|
9
|
+
var _findDomMatches = require("../objectExtractionHelpers/findDomMatches");
|
|
10
|
+
var _getSimplifiedHtml = require("../objectExtractionHelpers/getSimplifiedHtml");
|
|
11
|
+
var Errors = _interopRequireWildcard(require("./errors"));
|
|
12
|
+
var _getListMatches = require("./getListMatches");
|
|
13
|
+
var _extractPropertiesUsingGPTFromArray = require("./utils/extractPropertiesUsingGPTFromArray");
|
|
14
|
+
var _extractStructuredListUsingAi = require("./utils/extractStructuredListUsingAi");
|
|
15
|
+
var _getListContainerXpath = require("./utils/getListContainerXpath");
|
|
16
|
+
var _getRelativeContainerXpathSelector = require("./utils/getRelativeContainerXpathSelector");
|
|
17
|
+
var _getSimplifiedHtmlPerListItem = require("./utils/getSimplifiedHtmlPerListItem");
|
|
18
|
+
var _tablesUtils = require("./utils/tablesUtils");
|
|
19
|
+
var _buildImagesFromPage = require("../common/buildImagesFromPage");
|
|
20
|
+
var _findTableHeaders = require("../common/findTableHeaders");
|
|
21
|
+
var _Logger = require("../../common/Logger");
|
|
22
|
+
var _utils = require("../common/matching/utils");
|
|
23
|
+
/**
 * Compiler-generated interop helper used for `import * as X` style requires.
 *
 * - When `t` is falsy and `e` is flagged with `__esModule`, `e` is returned unchanged.
 * - Otherwise a prototype-less wrapper `{ default: e }` is built. If `e` is null or is
 *   neither an object nor a function, that wrapper is returned as-is.
 * - For objects/functions, every own property reached by `for...in` (except "default")
 *   is copied onto the wrapper; properties that have a getter or setter are copied
 *   with their descriptor instead of by value.
 * - If `WeakMap` is available, wrappers are cached per source object in one of two
 *   WeakMaps (selected by `t`), so repeated calls return the same wrapper.
 *
 * On first invocation the function replaces itself with the inner implementation, so
 * the WeakMaps are created once.
 */
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
24
|
+
/**
 * Exported entry point for AI-driven list extraction.
 *
 * Forwards `params` untouched to `handleNewAiExtraction` and resolves with
 * whatever that call resolves with (or rejects with the same error).
 *
 * @param {object} params - Extraction options; see `handleNewAiExtraction`.
 * @returns {Promise<*>} The value produced by `handleNewAiExtraction`.
 */
async function runAiExtraction(params) {
  const outcome = await handleNewAiExtraction(params);
  return outcome;
}
|
|
27
|
+
/**
 * Extracts a list of entities from a page (or search region) with AI.
 *
 * Flow, as implemented below:
 *  1. Build the AI input (simplified HTML or images) via `getAiExtractionData`.
 *  2. Ask the AI for the primary property only.
 *     - No items found: resolve with an empty result.
 *     - Fewer than two items: ask the AI for the full schema in one shot and
 *       resolve without a container path.
 *  3. Otherwise locate the primary values in the DOM, derive the list container
 *     xpath from those matches, and run per-item extraction via
 *     `splitDomAndExtractData`.
 *
 * @param {object} params
 * @param {string} params.itemEntityName
 * @param {object} params.itemEntitySchema
 * @param {object} params.pageAndSearchRegion - Reads `page`, `searchRegionHandler`
 *   and `searchRegion` from this object.
 * @param {Array} params.primaryProperty - `[name, schema]` pair of the primary property.
 * @param {boolean} params.hasSearchRegionContainer
 * @param {object} params.strategy
 * @param {*} params.identifier
 * @param {*} params.prompt
 * @param {*} params.examples
 * @param {*} params.apiKey
 * @returns {Promise<object>} A neverthrow result: `ok({ resultValues, containerPath, matches })`
 *   or `err(...)` carrying the first failure encountered.
 */
async function handleNewAiExtraction(params) {
  const { ok, err } = _neverthrow;
  const { extractStructuredListUsingAi } = _extractStructuredListUsingAi;
  const { getListMatches } = _getListMatches;
  const { getDomMatches } = _findDomMatches;
  const { getListContainerXpath } = _getListContainerXpath;
  const { getRelativeContainerXpathSelector } = _getRelativeContainerXpathSelector;
  const { selectLocatorsUsingXpath } = _extractionHelpers;

  const {
    itemEntityName,
    itemEntitySchema,
    pageAndSearchRegion,
    primaryProperty,
    hasSearchRegionContainer,
    strategy,
    identifier,
    prompt,
    examples,
    apiKey
  } = params;
  const [primaryPropertyName, primaryPropertyValue] = primaryProperty;

  const aiInput = await getAiExtractionData(strategy, pageAndSearchRegion, hasSearchRegionContainer);
  if (aiInput.isErr()) {
    return err(aiInput.error);
  }

  // First pass: only the primary property, so the list items can be anchored in the DOM.
  const primaryOnlySchema = {
    type: "object",
    properties: {
      [primaryPropertyName]: primaryPropertyValue
    },
    required: [primaryPropertyName]
  };
  const primaryData = await extractStructuredListUsingAi(itemEntityName, primaryOnlySchema, aiInput.value, identifier, strategy, prompt, apiKey);
  if (primaryData.isErr()) {
    return err(primaryData.error);
  }

  if (primaryData.value.length === 0) {
    _Logger.logger.debug(`the ai couldn't find any item with the ${primaryPropertyName} property`);
    return ok({
      resultValues: [],
      containerPath: null,
      matches: new Map()
    });
  }

  const primaryValues = primaryData.value.map(row => row[primaryPropertyName]);
  _Logger.logger.debug(`we were able to detect ${primaryValues.length} items with ${primaryPropertyName} property: ${JSON.stringify(primaryValues)}`);

  if (primaryValues.length < 2) {
    // A single item cannot define a list container; extract everything in one AI call.
    const allData = await extractStructuredListUsingAi(itemEntityName, itemEntitySchema, aiInput.value, identifier, strategy, prompt, apiKey);
    if (allData.isErr()) {
      return err(allData.error);
    }
    allData.value.forEach((row, rowIndex) => {
      _Logger.logger.debug(`ai extraction result for row ${rowIndex}: ${JSON.stringify(row)}`);
    });
    const singleShotResults = allData.value.map((row, rowIndex) => ({
      result: row,
      rowIndex
    }));
    const singleShotMatches = await getListMatches(pageAndSearchRegion.page, pageAndSearchRegion.searchRegionHandler, singleShotResults);
    return ok({
      resultValues: singleShotResults,
      containerPath: null,
      matches: singleShotMatches
    });
  }

  const domMatchesByValue = await getDomMatches(pageAndSearchRegion.page, pageAndSearchRegion.searchRegionHandler, primaryValues);
  const valuesMissingFromDom = [];
  for (const [value, hits] of domMatchesByValue.entries()) {
    if (hits.length === 0) {
      valuesMissingFromDom.push(value);
    }
  }
  if (valuesMissingFromDom.length > 0) {
    _Logger.logger.debug(`the following values returned by AI does not exist in the page dom, [${valuesMissingFromDom}] , this will cause issues finding the list container`);
  }

  const fullContainerXpath = await getListContainerXpath(domMatchesByValue);
  // The reported path is made relative to the search region when one was supplied;
  // the absolute path is still the one used to select the container below.
  const containerPath = hasSearchRegionContainer && fullContainerXpath
    ? await getRelativeContainerXpathSelector(pageAndSearchRegion.searchRegion, fullContainerXpath)
    : fullContainerXpath;
  if (!containerPath) {
    return err(Errors.invalidList());
  }

  const containerLocators = await selectLocatorsUsingXpath(pageAndSearchRegion.page, fullContainerXpath);
  const listItemsContainerLocator = containerLocators[0];
  const extractedData = await splitDomAndExtractData({
    listItemsContainerLocator,
    itemEntityName,
    itemEntitySchema,
    pageAndSearchRegion,
    primaryPropertyName: primaryProperty[0],
    strategy,
    identifier,
    examples,
    apiKey
  });
  if (extractedData.isErr()) {
    return err(extractedData.error);
  }

  const { matches, resultValues } = extractedData.value;
  return ok({
    resultValues,
    containerPath,
    matches
  });
}
|
|
122
|
+
/**
 * Captures a PNG screenshot of every given locator and wraps each capture as an
 * image content item.
 *
 * Locators are processed strictly one after another, in iteration order.
 *
 * @param {Iterable<object>} locators - Objects exposing an async `elementHandle()`
 *   whose result exposes an async `screenshot(options)`.
 * @returns {Promise<Array<{type: string, buffer: *}>>} One `{ type: "image", buffer }`
 *   entry per locator, in the same order.
 */
async function buildImagesForItemsHandles(locators) {
  const imageItems = [];
  for (const locator of locators) {
    const handle = await locator.elementHandle();
    const buffer = await handle.screenshot({
      type: "png"
    });
    imageItems.push({
      type: "image",
      buffer
    });
  }
  return imageItems;
}
|
|
136
|
+
/**
 * Splits a list container into its items, runs AI extraction per item, and keeps
 * only the values that can actually be found in each item's DOM.
 *
 * Steps, as implemented below:
 *  1. Split the container into item locators and compute simplified HTML per item.
 *  2. Detect whether the list is a table; if so, collect a JSON view of the table
 *     and (when a table locator is returned) AI-detected headers.
 *  3. Extract properties for all items (text items for the "HTML" strategy,
 *     screenshots otherwise).
 *  4. For each row that has a primary value, look the extracted values up in the
 *     row's DOM. Values without a DOM match are dropped (hallucination protection),
 *     and rows whose primary value has no DOM match are dropped entirely.
 *  5. Fail if any non-primary required property is missing from any kept row.
 *
 * A schema without a `required` array is treated as having no required
 * properties instead of throwing.
 *
 * @param {object} args
 * @param {object} args.listItemsContainerLocator - Locator of the list container.
 * @param {string} args.itemEntityName
 * @param {object} args.itemEntitySchema - Object schema; reads `properties` and `required`.
 * @param {object} args.pageAndSearchRegion - Reads `page` and `searchRegionHandler`.
 * @param {string} args.primaryPropertyName
 * @param {object} args.strategy - Reads `strategy.type`.
 * @param {*} args.identifier
 * @param {*} args.examples
 * @param {*} args.apiKey
 * @returns {Promise<object>} A neverthrow result: `ok({ resultValues, matches })`, or an
 *   error result from header detection, property extraction, or the required-property check.
 */
async function splitDomAndExtractData({
  listItemsContainerLocator,
  itemEntityName,
  itemEntitySchema,
  pageAndSearchRegion,
  primaryPropertyName,
  strategy,
  identifier,
  examples,
  apiKey
}) {
  const itemsLocators = await (0, _extractionHelpers.splitContainerIntoListLocators)(listItemsContainerLocator);
  const itemsSimplifiedHtml = await (0, _getSimplifiedHtmlPerListItem.getSimplifiedHtmlPerListItem)(itemsLocators);
  const {
    isTable,
    tableLocater
  } = await (0, _tablesUtils.isListTable)(listItemsContainerLocator, itemsSimplifiedHtml);
  const tableAsJsonArray = isTable ? await (0, _tablesUtils.createJsonFromTable)(pageAndSearchRegion.page) : [];
  const tableHeaders = tableLocater ? await (0, _findTableHeaders.getTableHeadersUsingAi)(tableLocater, identifier) : undefined;
  if (tableHeaders && tableHeaders.isErr()) {
    return (0, _neverthrow.err)(tableHeaders.error);
  }

  // Only forward headers when detection ran and produced a non-empty list.
  const detectedHeaders = tableHeaders && tableHeaders.value.headers.length ? tableHeaders.value.headers : undefined;
  const items = strategy.type === "HTML"
    ? itemsSimplifiedHtml.map(html => ({
      type: "text",
      text: html
    }))
    : await buildImagesForItemsHandles(itemsLocators);

  const extractedData = await (0, _extractPropertiesUsingGPTFromArray.extractPropertiesUsingGPTFromArray)({
    itemEntityName,
    itemEntitySchema,
    itemsSimplifiedHtml,
    tableAsJsonArray,
    strategy,
    tableHeaders: detectedHeaders,
    items,
    identifier,
    examples,
    apiKey
  });
  if (extractedData.isErr()) {
    return extractedData;
  }

  const resultValues = [];
  for (let i = 0; i < extractedData.value.length; i++) {
    const rowValues = extractedData.value[i] ?? {};
    const rowLocator = itemsLocators[i];
    const primaryValue = rowValues[primaryPropertyName];
    if (primaryValue === null || primaryValue === undefined) {
      // Rows without a primary value are skipped.
      continue;
    }

    const rowValuesMatches = await (0, _findDomMatches.getDomMatchesFromItemsHandles)(pageAndSearchRegion.page, await rowLocator.elementHandle(), Object.values(rowValues));
    const primaryMatches = rowValuesMatches.get(primaryValue);

    // Keep only the properties whose value was located in this row's DOM.
    const rowValuesWithMatchesOnly = {};
    for (const [key, value] of Object.entries(rowValues)) {
      const valueMatches = rowValuesMatches.get(value);
      const bestMatch = (0, _utils.selectBestMatch)(value, valueMatches ?? []);
      if (valueMatches && valueMatches.length > 0 && bestMatch) {
        rowValuesWithMatchesOnly[key] = {
          matchText: bestMatch.matchText,
          matchXpath: bestMatch.matchXpath,
          matchType: bestMatch.matchType
        };
      } else {
        _Logger.logger.debug(`value "${value}" for key "${key}" in row ${i + 1} does not have any matches in the item's html, dropped for hallucination protection`);
      }
    }

    if (primaryMatches && primaryMatches.length > 0) {
      resultValues.push({
        rowIndex: i,
        result: rowValuesWithMatchesOnly
      });
    }
  }

  // `required` is optional in JSON Schema; an absent list means nothing is required.
  const requiredKeys = itemEntitySchema.required ?? [];
  for (const propertyKey of Object.keys(itemEntitySchema.properties)) {
    const isRequired = requiredKeys.includes(propertyKey);
    const isPrimary = itemEntitySchema.properties[propertyKey].primary;
    if (isPrimary || !isRequired) {
      continue;
    }
    const missingInSomeRow = resultValues.some(row => row.result[propertyKey] === null || row.result[propertyKey] === undefined);
    if (missingInSomeRow) {
      return (0, _neverthrow.err)(Errors.invalidExtractionResult(`Required property ${propertyKey} not found in all rows`));
    }
  }

  // getListMatches receives plain text values, so flatten each match to its matchText.
  const matches = await (0, _getListMatches.getListMatches)(pageAndSearchRegion.page, pageAndSearchRegion.searchRegionHandler, resultValues.map(row => ({
    rowIndex: row.rowIndex,
    result: Object.fromEntries(Object.entries(row.result).map(([key, value]) => [key, value.matchText]))
  })));
  return (0, _neverthrow.ok)({
    resultValues,
    matches
  });
}
|
|
229
|
+
/**
 * Prepares the AI input for the chosen extraction strategy.
 *
 * - "HTML" strategy: resolves with `ok({ text })`, where `text` is the simplified
 *   HTML of the search region handler.
 * - Any other strategy: builds images from the page (scoped to the search region
 *   handler only when `hasSearchRegionContainer` is truthy) and resolves with
 *   `ok({ images })`, or with `err(Errors.other(...))` if image building failed.
 *
 * @param {object} strategy - Reads `strategy.type`.
 * @param {object} pageAndSearchRegion - Reads `page` and `searchRegionHandler`.
 * @param {boolean} hasSearchRegionContainer
 * @returns {Promise<object>} A neverthrow result as described above.
 */
async function getAiExtractionData(strategy, pageAndSearchRegion, hasSearchRegionContainer) {
  const { ok, err } = _neverthrow;

  if (strategy.type !== "HTML") {
    const { buildImagesFromPageOrHandle } = _buildImagesFromPage;
    const images = await buildImagesFromPageOrHandle(pageAndSearchRegion.page, hasSearchRegionContainer ? pageAndSearchRegion.searchRegionHandler : undefined);
    if (images.isErr()) {
      return err(Errors.other(images.error.context));
    }
    return ok({
      images: images.value
    });
  }

  const { getSimplifiedHtml } = _getSimplifiedHtml;
  const text = await getSimplifiedHtml(pageAndSearchRegion.searchRegionHandler);
  return ok({
    text
  });
}
|
package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.extractPropertiesUsingGPTFromArray = extractPropertiesUsingGPTFromArray;
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var _errors = require("../errors");
|
|
9
|
+
var _common = require("../../common/");
|
|
10
|
+
var _isTableHeaderOrFooter = require("../../common/isTableHeaderOrFooter");
|
|
11
|
+
var _Logger = require("../../../common/Logger");
|
|
12
|
+
var _extractionHelpers = require("../../../common/extractionHelpers");
|
|
13
|
+
var _buildExamplesPrompt = require("../../common/buildExamplesPrompt");
|
|
14
|
+
var _utils = require("../../common/utils");
|
|
15
|
+
/**
 * Dispatches per-item property extraction to the implementation matching
 * `input.strategy.type`.
 *
 * @param {object} input - Extraction input; forwarded unchanged to the chosen strategy.
 * @param {object} input.strategy - `type` must be "IMAGE" or "HTML".
 * @returns {Promise<*>} Whatever the selected strategy implementation resolves with.
 * @throws {Error} Rejects with "Invalid strategy type" for any other strategy type.
 */
async function extractPropertiesUsingGPTFromArray(input) {
  switch (input.strategy.type) {
    case "IMAGE":
      return extractPropertiesWithImageStrategy(input);
    case "HTML":
      return extractPropertiesWithHTMLStrategy(input);
    default:
      throw new Error("Invalid strategy type");
  }
}
|
|
23
|
+
/**
 * Extracts one entity from a single list item (text or image) with AI.
 *
 * Behaviour, as implemented below:
 *  - Builds extra user messages from examples, table headers and table-row JSON.
 *  - When the item may be a table header/footer, it is classified first; a
 *    classification error is returned as-is, and a positive classification
 *    short-circuits to `ok({})`.
 *  - A "NoDataFound" extraction error is mapped to `ok({})`; any other extraction
 *    error is wrapped with `invalidExtractionResult`.
 *
 * @param {object} args
 * @param {string} args.itemEntityName
 * @param {object} args.itemEntitySchema
 * @param {string} [args.text] - Item content for text-based extraction.
 * @param {*} [args.image] - Item image for image-based extraction (sent as "png").
 * @param {*} [args.extraRowDataFromTableRow] - JSON view of the same table row, if any.
 * @param {object} args.strategy - Reads `strategy.model`.
 * @param {string[]} [args.tableHeaders]
 * @param {*} args.identifier
 * @param {boolean} args.possibleTableHeaderOrFooter
 * @param {Array} args.examples
 * @param {*} args.apiKey
 * @returns {Promise<object>} A neverthrow result holding the extracted object, `{}`, or an error.
 */
async function extractPropertiesUsingGPT({
  itemEntityName,
  itemEntitySchema,
  text,
  image,
  extraRowDataFromTableRow,
  strategy,
  tableHeaders,
  identifier,
  possibleTableHeaderOrFooter,
  examples,
  apiKey
}) {
  const { ok, err } = _neverthrow;

  const baseSystemMessage = `You are a data extraction assistant, out of this text, you will be asked to extract some data. Be accurate, give complete results. If you cannot find the field data, use null. make sure to follow examples if provided.`;
  const tableSystemSuffix = ` The data will be provided twice, once as a json that was extracted from an html table and once as the html for that row in the table.`;
  const systemMessage = extraRowDataFromTableRow ? `${baseSystemMessage}${tableSystemSuffix}` : baseSystemMessage;

  const extraUserMessages = [];
  if (examples.length > 0) {
    extraUserMessages.push((0, _buildExamplesPrompt.buildExamplesPrompt)({
      entityName: itemEntityName,
      examples
    }));
  }

  if (possibleTableHeaderOrFooter) {
    const headerCheck = await (0, _isTableHeaderOrFooter.isTableHeaderOrFooter)(text ?? image);
    if (headerCheck.isErr()) {
      return err(headerCheck.error);
    }
    if (headerCheck.value.isHeader) {
      // Header/footer rows carry no entity data.
      return ok({});
    }
  }

  if (tableHeaders) {
    extraUserMessages.push(`this data are part of a table, the table headers in order are: ${tableHeaders.join(", ")}`);
  }
  if (extraRowDataFromTableRow) {
    extraUserMessages.push(JSON.stringify(extraRowDataFromTableRow));
  }

  const images = image ? [{
    data: image,
    image_type: "png"
  }] : [];
  const extractionResult = await (0, _common.extractStructuredDataUsingAi)({
    entityName: itemEntityName,
    model: strategy.model,
    text: text ? [text] : undefined,
    jsonSchema: itemEntitySchema,
    systemMessage,
    extraUserMessages,
    images,
    identifier,
    apiKey
  });

  if (!extractionResult.isErr()) {
    return ok(extractionResult.value.result);
  }
  if (extractionResult.error.type === "NoDataFound") {
    return ok({});
  }
  return err((0, _errors.invalidExtractionResult)(extractionResult.error.context));
}
|
|
83
|
+
/**
 * Extracts properties for every item using the image strategy.
 *
 * Items are processed in consecutive chunks (size 1 when `process.env.TEST` is
 * "true", otherwise 5). Items within a chunk are extracted concurrently; chunks
 * run one after another so that results of earlier chunks can be sampled as
 * examples for later ones. The first and last items are flagged as possible
 * table header/footer rows. The first error found in a chunk aborts the whole run.
 *
 * @param {object} args
 * @param {*} args.identifier
 * @param {string} args.itemEntityName
 * @param {object} args.itemEntitySchema
 * @param {Array<object>} args.items - Content items; only `type === "image"` entries contribute a buffer.
 * @param {object} args.strategy
 * @param {Array} [args.tableAsJsonArray] - Used only when its length equals `items.length`.
 * @param {string[]} [args.tableHeaders]
 * @param {Array} args.examples - Previously known examples.
 * @param {*} args.apiKey
 * @returns {Promise<object>} A neverthrow result: `ok(results)` with one entry per item, or the first error.
 */
async function extractPropertiesWithImageStrategy({
  identifier,
  itemEntityName,
  itemEntitySchema,
  items,
  strategy,
  tableAsJsonArray,
  tableHeaders,
  examples: previousExamples,
  apiKey
}) {
  const shouldUseTableData = !!tableAsJsonArray && tableAsJsonArray.length === items.length;
  const CHUNK_SIZE = process.env.TEST === "true" ? 1 : 5;
  const lastItemIndex = items.length - 1;
  const collected = [];

  for (let chunkStart = 0; chunkStart < items.length; chunkStart += CHUNK_SIZE) {
    const chunk = items.slice(chunkStart, chunkStart + CHUNK_SIZE).map((item, offset) => ({
      item,
      index: chunkStart + offset
    }));

    const examples = (0, _utils.getRandomItems)([...collected, ...previousExamples], 3);
    const chunkResults = await Promise.all(chunk.map(({ item, index }) => extractPropertiesUsingGPT({
      itemEntityName,
      itemEntitySchema,
      text: undefined,
      image: item.type === "image" ? item.buffer : undefined,
      extraRowDataFromTableRow: shouldUseTableData ? tableAsJsonArray[index] : undefined,
      strategy,
      tableHeaders,
      identifier,
      possibleTableHeaderOrFooter: index === 0 || index === lastItemIndex,
      examples,
      apiKey
    })));

    const firstFailure = chunkResults.find(r => r.isErr());
    if (firstFailure) {
      return (0, _neverthrow.err)(firstFailure.error);
    }

    const chunkValues = chunkResults.map(r => r._unsafeUnwrap());
    const alreadyCollected = collected.length;
    chunkValues.forEach((value, positionInChunk) => {
      _Logger.logger.debug(`Extracted this info from array item #${positionInChunk + alreadyCollected}: ${JSON.stringify(value)}`);
    });
    collected.push(...chunkValues);
  }

  return (0, _neverthrow.ok)(collected);
}
|
|
137
|
+
/**
 * Extracts one entity per text item by sending the items to
 * `extractPropertiesUsingGPT` in chunks and writing each returned entry into
 * the output array at the index the model reported for it.
 *
 * Chunk size is 10, or 3 when the model is "claude-3-haiku"/"gpt3.5-turbo"
 * and the average compressed item length exceeds 1000 characters. The first
 * chunk is awaited on its own; the remaining chunks are started together, so
 * the examples they sample include whatever the first chunk produced.
 *
 * For models reported by `isGoogleModel`, each item is requested wrapped as
 * `{ value, _isNull }` instead of `oneOf [entity, null]`; that wrapper is
 * unwrapped here before storing the value.
 *
 * @param {object} params
 * @param {*} params.identifier Forwarded unchanged to every request.
 * @param {string} params.itemEntityName Entity name used in the prompt text.
 * @param {object} params.itemEntitySchema JSON schema of a single entity.
 * @param {Array<object>} params.items Items to extract from; every item must
 *   have `type === "text"` and a `text` string.
 * @param {object} params.strategy Strategy object; `strategy.model` is read here.
 * @param {Array|undefined} params.tableAsJsonArray Per-item extra row data;
 *   only used when its length equals `items.length`.
 * @param {*} params.tableHeaders Forwarded unchanged to every request.
 * @param {Array} params.examples Previously known examples.
 * @param {string} params.apiKey Forwarded unchanged to every request.
 * @returns {Promise<object>} `ok(results)` where `results[index]` is the
 *   extracted entity, `null` for "no entity", or `{}` for an entry flagged as
 *   table header/footer; `err(error)` when a chunk request fails.
 * @throws {Error} "Invalid type" (as a rejected promise) when an item is not a
 *   text item.
 */
async function extractPropertiesWithHTMLStrategy({
  identifier,
  itemEntityName,
  itemEntitySchema,
  items,
  strategy,
  tableAsJsonArray,
  tableHeaders,
  examples: previousExamples,
  apiKey
}) {
  // Table rows are matched to items by position, so only use them when the
  // two lists line up.
  const shouldUseTableData = !!tableAsJsonArray && tableAsJsonArray.length === items.length;
  const isWeakModel = strategy.model === "claude-3-haiku" || strategy.model === "gpt3.5-turbo";
  const averageItemLength = items.reduce((sum, item) => {
    if (item.type !== "text") return sum;
    return sum + (0, _extractionHelpers.compressStringSpaces)(item.text).length;
  }, 0) / items.length;
  const shouldUseSmallerChunkSize = isWeakModel && averageItemLength > 1000;
  const CHUNK_SIZE = shouldUseSmallerChunkSize ? 3 : 10;

  // Group items, together with their original index, into chunks.
  const itemsChunks = items.reduce((resultArray, item, index) => {
    const chunkIndex = Math.floor(index / CHUNK_SIZE);
    if (!resultArray[chunkIndex]) {
      resultArray[chunkIndex] = [];
    }
    resultArray[chunkIndex].push({
      index,
      item
    });
    return resultArray;
  }, []);

  // Filled by index as chunks complete; shared by all chunk executions.
  const results = [];

  // Sends one chunk and stores its entries in `results`. Resolves to an `err`
  // result on failure and to `undefined` on success.
  const executeChunk = async chunk => {
    const examples = (0, _utils.getRandomItems)([...results, ...previousExamples], 3);
    const texts = chunk.map(i => {
      if (i.item.type !== "text") {
        throw new Error("Invalid type");
      }
      return `<ITEM_INDEX_${i.index}>\n${i.item.text}\n</ITEM_INDEX_${i.index}>`;
    }).join("\n");
    const extraRowDataFromTableRow = shouldUseTableData ? chunk.map(c => tableAsJsonArray[c.index]) : undefined;
    const shouldWorkaroundGoogleModel = (0, _common.isGoogleModel)(strategy.model);
    const itemSchema = !shouldWorkaroundGoogleModel ? {
      oneOf: [itemEntitySchema, {
        type: "null"
      }]
    } : {
      type: "object",
      properties: {
        value: itemEntitySchema,
        _isNull: {
          type: "boolean",
          description: "If the value is supposed to be null, set this to true and do not provide a value. Otherwise, set it to false and provide the value."
        }
      },
      required: ["_isNull"]
    };
    const itemNullPromptExample = !shouldWorkaroundGoogleModel ? "null" : `{ "_isNull": true }`;
    // NOTE(review): the item text above is wrapped in <ITEM_INDEX_n> tags while
    // the prompts below refer to <LIST_ITEM_INDEX_X>; confirm which spelling is
    // intended before changing either string.
    const result = await extractPropertiesUsingGPT({
      identifier,
      text: texts,
      examples,
      itemEntityName,
      tableHeaders,
      extraRowDataFromTableRow,
      apiKey,
      itemEntitySchema: {
        type: "array",
        description: `Extracted ` + itemEntityName + ` items from the content wrapped with <LIST_ITEM_INDEX_X> and </LIST_ITEM_INDEX_X> tags. each tag represents an item on the list, you should return ` + chunk.length + ` items, one for each index. If the item boundaries do not contain a valid entity, return null for that item with the right index. YOU SHOULD INCLUDE ALL ITEMS IN THE RESPONSE, EVERY INDEX BOUNDARY SHOULD BE REPRESENTED BY AN ITEM. IF the index boundary does not include a valid item you should return null for it.` + ` AND YOU SHOULD ALWAYS HAVE THE RIGHT INDEX FOR THE MISSING ITEMS, **do not put it at the end of the list or change the order of that item if in the list **. if <LIST_ITEM_INDEX_33> and </LIST_ITEM_INDEX_33> does not wrap a valid item, the item you return should be ` + `{ index: 33, isTableHeaderOrFooter: false, item: ${itemNullPromptExample} } where ths index is the number in the tag that's missing the item, ` + ` each <LIST_ITEM_INDEX_X> and </LIST_ITEM_INDEX_X> boundary represents a single list item. do not return the same list item more than once.`,
        minItems: chunk.length,
        maxItems: chunk.length,
        items: {
          type: "object",
          properties: {
            index: {
              type: "number",
              enum: generateRange(chunk[0].index, chunk[chunk.length - 1].index),
              minimum: chunk[0].index,
              maximum: chunk[chunk.length - 1].index,
              description: `Identify the index of an item from the list based on the prefix and suffix around the extracted data. if you extract data between <LIST_ITEM_INDEX_x> and </LIST_ITEM_INDEX_x>, the index should be the value of x. if <LIST_ITEM_INDEX_x> exists and does not have a valid data inside it you still should return an item with index x and value of null , getting the wrong index for the missing item will break the whole extraction. YOU SHOULD INCLUDE ALL ITEMS IN THE RESPONSE, EVERY INDEX BOUNDARY SHOULD BE REPRESENTED BY AN ITEM`
            },
            isTableHeaderOrFooter: {
              type: "boolean",
              description: `If the extracted data is a table header or footer, set this field to true. Otherwise, set it to false.`
            },
            item: itemSchema
          }
        }
      },
      image: undefined,
      strategy
    });
    if (result.isErr()) {
      return (0, _neverthrow.err)(result.error);
    }
    const resultUnWrapped = result.value;
    if (resultUnWrapped === null || resultUnWrapped === undefined) {
      return undefined;
    }
    resultUnWrapped.forEach(r => {
      if (r.isTableHeaderOrFooter) {
        _Logger.logger.debug(`skipping item at index #${r.index + 1}: ${JSON.stringify(r.item)}, it's detected as a table header or footer.`);
        results[r.index] = {};
        return;
      }
      // A missing `item` is treated the same as an explicit null.
      const item = r.item === undefined ? null : r.item;
      let itemToUse;
      if (shouldWorkaroundGoogleModel && item !== null) {
        // Decide by the schema that was actually sent rather than by sniffing
        // `typeof item.value`: the wrapper may legitimately omit `value`
        // (`{ "_isNull": true }`), and a plain entity may have its own
        // object-valued `value` property.
        const hasValue = item.value !== undefined && item.value !== null;
        itemToUse = item._isNull || !hasValue ? null : item.value;
      } else {
        itemToUse = item;
      }
      results[r.index] = itemToUse;
      _Logger.logger.debug(`Extracted this info from array item #${r.index + 1}: ${JSON.stringify(itemToUse)}`);
    });
    return undefined;
  };

  // Run the first chunk alone so later chunks can sample examples from it.
  const [firstChunk, ...otherChunks] = itemsChunks;
  if (firstChunk) {
    const firstChunkResult = await executeChunk(firstChunk);
    if (firstChunkResult && firstChunkResult.isErr()) {
      return (0, _neverthrow.err)(firstChunkResult.error);
    }
  }
  const otherChunksResults = await Promise.all(otherChunks.map(chunk => executeChunk(chunk)));
  const otherChunksError = otherChunksResults.find(r => r && r.isErr());
  if (otherChunksError) {
    return (0, _neverthrow.err)(otherChunksError.error);
  }
  return (0, _neverthrow.ok)(results);
}
|
|
268
|
+
/**
 * Builds the list of values from `start` up to and including `end`,
 * incrementing by one each step.
 *
 * @param {number} start First value of the range.
 * @param {number} end Last value of the range (inclusive).
 * @returns {number[]} The values `start, start + 1, ..., end`.
 * @throws {Error} When `end` is smaller than `start`.
 */
function generateRange(start, end) {
  if (start > end) {
    throw new Error("End value must be greater than or equal to start value.");
  }
  const values = [];
  let cursor = start;
  while (cursor <= end) {
    values.push(cursor);
    cursor++;
  }
  return values;
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.extractStructuredListUsingAi = extractStructuredListUsingAi;
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var _errors = require("../errors");
|
|
9
|
+
var _zod = require("zod");
|
|
10
|
+
var _common = require("../../common");
|
|
11
|
+
/**
 * Asks `extractStructuredDataUsingAi` for an array of `itemSchema` entities
 * and validates that the answer is an array of string-valued records.
 *
 * @param {string} entityName Forwarded as `entityName`.
 * @param {object} itemSchema JSON schema of one list item; wrapped in an
 *   array schema before being sent.
 * @param {object} data Source content; `data.text` (sent as a one-element
 *   array when truthy) and `data.images` (each sent as a "png" image) are read.
 * @param {*} identifier Forwarded unchanged.
 * @param {object} strategy Only `strategy.model` is read.
 * @param {string} prompt Sent as the system message.
 * @param {string} apiKey Forwarded unchanged.
 * @returns {Promise<object>} `ok(records)` on success; `ok([])` when the
 *   extraction reports "NoDataFound"; `err(insufficientAiCredits(...))` for
 *   "InsufficientAiCredits"; `err(invalidExtractionResult(...))` for any other
 *   failure or when the result does not validate.
 */
async function extractStructuredListUsingAi(entityName, itemSchema, data, identifier, strategy, prompt, apiKey) {
  const listSchema = {
    type: "array",
    items: itemSchema
  };
  const toPngImage = image => ({
    data: image,
    image_type: "png"
  });
  const text = data.text ? [data.text] : undefined;
  const sourceImages = data.images;
  const images = sourceImages === null || sourceImages === undefined ? undefined : sourceImages.map(toPngImage);

  const extraction = await (0, _common.extractStructuredDataUsingAi)({
    entityName,
    model: strategy.model,
    jsonSchema: listSchema,
    text,
    images,
    identifier,
    systemMessage: prompt,
    apiKey
  });

  if (extraction.isErr()) {
    switch (extraction.error.type) {
      case "InsufficientAiCredits":
        return (0, _neverthrow.err)((0, _errors.insufficientAiCredits)(extraction.error.context));
      case "NoDataFound":
        // Finding nothing is reported as an empty list, not as a failure.
        return (0, _neverthrow.ok)([]);
      default:
        return (0, _neverthrow.err)((0, _errors.invalidExtractionResult)(extraction.error.context));
    }
  }

  // Every item must be an object whose values are all strings.
  const recordListValidator = _zod.z.array(_zod.z.record(_zod.z.string()));
  const validation = recordListValidator.safeParse(extraction.value.result);
  return validation.success ? (0, _neverthrow.ok)(validation.data) : (0, _neverthrow.err)((0, _errors.invalidExtractionResult)("Failed to parse extraction result."));
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.findSetOfXpathsToCreateAnArrayExtractor = findSetOfXpathsToCreateAnArrayExtractor;
|
|
7
|
+
exports.getContainerElement = getContainerElement;
|
|
8
|
+
exports.getListContainerXpath = getListContainerXpath;
|
|
9
|
+
exports.partOfSameArrayXpath = partOfSameArrayXpath;
|
|
10
|
+
exports.verifyThatAllXpathsArePartOfSameArray = verifyThatAllXpathsArePartOfSameArray;
|
|
11
|
+
/**
 * Derives the XPath of the element containing a list from the match sets
 * found for its values.
 *
 * One XPath per match set is picked with
 * `findSetOfXpathsToCreateAnArrayExtractor`, first using exact matches only
 * and, if that yields nothing, using all matches. The picked XPaths must pass
 * `verifyThatAllXpathsArePartOfSameArray`; their shared prefix is returned.
 *
 * @param {{entries: Function}} matches Collection whose `entries()` yields
 *   `[key, matchSet]` pairs; each match exposes `exact` and `nodeXpath`.
 * @returns {Promise<string|null>} The container XPath, or `null` when no
 *   consistent set of XPaths exists.
 */
async function getListContainerXpath(matches) {
  const matchSets = Array.from(matches.entries()).map(entry => entry[1]);
  // Turns match sets into XPath sets, dropping sets that ended up empty.
  const toXpathSets = sets => sets.map(set => set.map(match => match.nodeXpath)).filter(xpaths => xpaths.length > 0);

  const exactOnly = matchSets.map(set => set.filter(match => match.exact));
  // Prefer exact matches; fall back to every match only when they do not work.
  const pickedXpaths = findSetOfXpathsToCreateAnArrayExtractor(toXpathSets(exactOnly)) || findSetOfXpathsToCreateAnArrayExtractor(toXpathSets(matchSets));

  if (!pickedXpaths || !verifyThatAllXpathsArePartOfSameArray(pickedXpaths)) {
    return null;
  }
  return getContainerElement(pickedXpaths);
}
|
|
30
|
+
/**
 * Picks one XPath from every set so that each picked XPath is, according to
 * `partOfSameArrayXpath`, a sibling of the XPath chosen from the smallest set.
 *
 * Candidates from the smallest set are tried longest string first; the first
 * candidate that finds a partner in every other set wins.
 *
 * @param {string[][]} input XPath sets, one per matched value. The arrays are
 *   not modified.
 * @returns {string[]|null} The winning candidate followed by its partner from
 *   each other set (in `input` order), or `null` when `input` is empty or no
 *   candidate has a partner in every set.
 */
function findSetOfXpathsToCreateAnArrayExtractor(input) {
  if (input.length === 0) return null;
  const smallestSet = input.reduce((acc, curr) => curr.length < acc.length ? curr : acc);
  // Sort a copy: `sort` works in place and would otherwise reorder the
  // caller's array. `smallestSet` itself is kept for the identity check below.
  const candidates = [...smallestSet].sort((a, b) => b.length - a.length);
  for (const xpath of candidates) {
    const result = [xpath];
    for (const set of input) {
      if (set === smallestSet) continue;
      const matched = set.find(otherXpath => partOfSameArrayXpath(xpath, otherXpath));
      if (!matched) {
        // No partner in this set, so this candidate cannot cover every set.
        break;
      }
      result.push(matched);
    }
    if (result.length === input.length) return result;
  }
  return null;
}
|
|
49
|
+
/**
 * Tells whether two XPaths look like siblings of one repeated structure:
 * same number of "/"-separated segments, exactly one segment that differs,
 * and within that segment exactly one differing number (for example
 * `li[1]` versus `li[2]`).
 *
 * @param {string} str1 First XPath.
 * @param {string} str2 Second XPath.
 * @returns {boolean} `true` only when the paths differ by a single number in
 *   a single segment; identical paths return `false`.
 */
function partOfSameArrayXpath(str1, str2) {
  if (str1 === str2) return false;
  const segmentsA = str1.split("/");
  const segmentsB = str2.split("/");
  if (segmentsA.length !== segmentsB.length) return false;

  // All digit runs found in a segment, as numbers.
  const numbersIn = segment => (segment.match(/\d+/g) || []).map(Number);

  let differingSegments = 0;
  for (const [position, segmentA] of segmentsA.entries()) {
    const segmentB = segmentsB[position];
    if (segmentA === segmentB) continue;
    const numbersA = numbersIn(segmentA);
    const numbersB = numbersIn(segmentB);
    if (numbersA.length !== numbersB.length) return false;
    const mismatches = numbersA.filter((value, k) => value !== numbersB[k]).length;
    // A differing segment must differ by exactly one number: zero means the
    // text itself differs, more than one is not a simple index change.
    if (mismatches !== 1) return false;
    differingSegments += 1;
    if (differingSegments > 1) return false;
  }
  return differingSegments === 1;
}
|
|
73
|
+
/**
 * Checks that every XPath after the first one is, per
 * `partOfSameArrayXpath`, a sibling of the first XPath.
 *
 * @param {string[]} xpaths XPaths to check; the first entry is the reference.
 * @returns {boolean} `true` when all later entries pair up with the first one
 *   (also for empty and single-element lists), otherwise `false`.
 */
function verifyThatAllXpathsArePartOfSameArray(xpaths) {
  const reference = xpaths[0];
  return xpaths.slice(1).every(candidate => partOfSameArrayXpath(candidate, reference));
}
|
|
82
|
+
/**
 * Returns the longest leading run of "/"-separated segments shared by all
 * given XPaths, joined back into a path.
 *
 * @param {string[]} xpaths XPaths to compare.
 * @returns {string|null} The shared prefix path, or `null` when `xpaths` is
 *   empty or no segment is shared.
 */
function getContainerElement(xpaths) {
  if (!xpaths.length) return null;
  const sharedSegments = xpaths.slice(1).reduce((prefix, xpath) => {
    const segments = xpath.split("/");
    // First position where this path stops agreeing with the prefix so far.
    const firstMismatch = prefix.findIndex((segment, position) => position >= segments.length || segment !== segments[position]);
    return firstMismatch === -1 ? prefix : prefix.slice(0, firstMismatch);
  }, xpaths[0].split("/"));
  if (sharedSegments.length === 0) {
    return null;
  }
  return sharedSegments.join("/");
}
|
package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.getRelativeContainerXpathSelector = getRelativeContainerXpathSelector;
|
|
7
|
+
var _locatorHelpers = require("../../../common/locatorHelpers");
|
|
8
|
+
/**
 * Expresses `containerPath` relative to the XPath of the search-region
 * locator.
 *
 * @param {*} searchRegionLocator Locator passed to `findXPathForLocator` to
 *   obtain the search region's XPath.
 * @param {string|null|undefined} containerPath XPath of the container.
 * @returns {Promise<string|null>} `null` when the search-region XPath or
 *   `containerPath` is missing; "." when both paths are equal; otherwise
 *   `containerPath` with the leading `<searchRegionXpath>/` removed, or
 *   `containerPath` unchanged when it does not start with that prefix.
 */
async function getRelativeContainerXpathSelector(searchRegionLocator, containerPath) {
  const searchRegionXpath = await (0, _locatorHelpers.findXPathForLocator)(searchRegionLocator);
  if (!searchRegionXpath) {
    return null;
  }
  if (searchRegionXpath === containerPath) {
    return ".";
  }
  if (!containerPath) {
    return null;
  }
  // Strip the region path only when it is a real prefix; a plain `replace`
  // would also cut a match out of the middle of the container path.
  const prefix = `${searchRegionXpath}/`;
  return containerPath.startsWith(prefix) ? containerPath.slice(prefix.length) : containerPath;
}
|