@intuned/browser-dev 0.1.4-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.babelrc +21 -0
- package/.eslintignore +10 -0
- package/.eslintrc.js +39 -0
- package/BROWSER_SCRIPTS_SETUP.md +84 -0
- package/LICENSE +43 -0
- package/README.md +160 -0
- package/RELEASE.md +60 -0
- package/dist/ai/export.d.js +5 -0
- package/dist/ai/export.d.ts +641 -0
- package/dist/ai/extractStructuredData.js +320 -0
- package/dist/ai/extractStructuredDataUsingAi.js +142 -0
- package/dist/ai/extractionHelpers/screenshotHelpers.js +56 -0
- package/dist/ai/extractionHelpers/validateSchema.js +148 -0
- package/dist/ai/index.d.ts +641 -0
- package/dist/ai/index.js +19 -0
- package/dist/ai/isPageLoaded.js +80 -0
- package/dist/ai/prompt.js +39 -0
- package/dist/ai/tests/testCheckAllTypesAreStrings.spec.js +137 -0
- package/dist/ai/tests/testExtractFromContent.spec.js +372 -0
- package/dist/ai/tests/testExtractStructuredData.spec.js +646 -0
- package/dist/ai/tests/testIsPageLoaded.spec.js +277 -0
- package/dist/ai/tools/index.js +48 -0
- package/dist/ai/types/errors.js +67 -0
- package/dist/ai/types/models.js +45 -0
- package/dist/ai/types/types.js +48 -0
- package/dist/ai/validators.js +167 -0
- package/dist/common/Logger/index.js +60 -0
- package/dist/common/Logger/types.js +5 -0
- package/dist/common/SdkError.js +50 -0
- package/dist/common/aiModelsValidations.js +32 -0
- package/dist/common/ensureBrowserScripts.js +14 -0
- package/dist/common/extendedTest.js +157 -0
- package/dist/common/extractionHelpers.js +19 -0
- package/dist/common/formatZodError.js +18 -0
- package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
- package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
- package/dist/common/fuzzySearch/utils.js +23 -0
- package/dist/common/getModelProvider.js +18 -0
- package/dist/common/getSimplifiedHtml.js +122 -0
- package/dist/common/hashObject.js +32 -0
- package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
- package/dist/common/html2markdown/index.js +19 -0
- package/dist/common/jwtTokenManager.js +57 -0
- package/dist/common/loadRuntime.js +16 -0
- package/dist/common/locatorHelpers.js +41 -0
- package/dist/common/matching/collectStrings.js +32 -0
- package/dist/common/matching/levenshtein.js +40 -0
- package/dist/common/matching/matching.js +317 -0
- package/dist/common/matching/types.js +1 -0
- package/dist/common/noEmpty.js +9 -0
- package/dist/common/saveSnapshotWithExamples.js +60 -0
- package/dist/common/script.js +2602 -0
- package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
- package/dist/common/xpathMapping.js +107 -0
- package/dist/helpers/clickUntilExhausted.js +85 -0
- package/dist/helpers/downloadFile.js +125 -0
- package/dist/helpers/export.d.js +5 -0
- package/dist/helpers/export.d.ts +1220 -0
- package/dist/helpers/extractMarkdown.js +35 -0
- package/dist/helpers/filterEmptyValues.js +54 -0
- package/dist/helpers/gotoUrl.js +98 -0
- package/dist/helpers/index.d.ts +1220 -0
- package/dist/helpers/index.js +122 -0
- package/dist/helpers/processDate.js +25 -0
- package/dist/helpers/resolveUrl.js +64 -0
- package/dist/helpers/sanitizeHtml.js +74 -0
- package/dist/helpers/saveFileToS3.js +50 -0
- package/dist/helpers/scrollToLoadContent.js +57 -0
- package/dist/helpers/tests/testClickUntilExhausted.spec.js +372 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +206 -0
- package/dist/helpers/tests/testExtractMarkdown.spec.js +290 -0
- package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
- package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
- package/dist/helpers/tests/testProcessDate.spec.js +13 -0
- package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
- package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
- package/dist/helpers/tests/testScrollToLoadContent.spec.js +163 -0
- package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +342 -0
- package/dist/helpers/tests/testWithDomSettledWait.spec.js +164 -0
- package/dist/helpers/tests/testWithNetworkIdleWait.spec.js +114 -0
- package/dist/helpers/types/Attachment.js +115 -0
- package/dist/helpers/types/CustomTypeRegistry.js +48 -0
- package/dist/helpers/types/RunEnvironment.js +18 -0
- package/dist/helpers/types/ValidationError.js +17 -0
- package/dist/helpers/types/index.js +51 -0
- package/dist/helpers/uploadFileToS3.js +154 -0
- package/dist/helpers/utils/getS3Client.js +22 -0
- package/dist/helpers/utils/index.js +73 -0
- package/dist/helpers/utils/isDownload.js +10 -0
- package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
- package/dist/helpers/utils/isLocator.js +9 -0
- package/dist/helpers/utils/jwtTokenManager.js +18 -0
- package/dist/helpers/validateDataUsingSchema.js +103 -0
- package/dist/helpers/waitForDomSettled.js +90 -0
- package/dist/helpers/withNetworkSettledWait.js +91 -0
- package/dist/index.d.js +16 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +16 -0
- package/dist/intunedServices/ApiGateway/aiApiGateway.js +143 -0
- package/dist/intunedServices/ApiGateway/factory.js +16 -0
- package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
- package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
- package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
- package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +355 -0
- package/dist/intunedServices/ApiGateway/types.js +11 -0
- package/dist/intunedServices/cache/cache.js +61 -0
- package/dist/intunedServices/cache/index.js +12 -0
- package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
- package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
- package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +135 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +132 -0
- package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
- package/dist/optimized-extractors/common/findTableHeaders.js +162 -0
- package/dist/optimized-extractors/common/index.js +55 -0
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +84 -0
- package/dist/optimized-extractors/common/matching/matching.js +212 -0
- package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
- package/dist/optimized-extractors/common/matching/types.js +18 -0
- package/dist/optimized-extractors/common/matching/utils.js +184 -0
- package/dist/optimized-extractors/common/utils.js +58 -0
- package/dist/optimized-extractors/export.d.js +5 -0
- package/dist/optimized-extractors/export.d.ts +397 -0
- package/dist/optimized-extractors/extractArray.js +120 -0
- package/dist/optimized-extractors/extractObject.js +104 -0
- package/dist/optimized-extractors/index.d.ts +397 -0
- package/dist/optimized-extractors/index.js +31 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +269 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +146 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromPage.spec.js +130 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +160 -0
- package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
- package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +243 -0
- package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
- package/dist/optimized-extractors/models/anthropicModel.js +23 -0
- package/dist/optimized-extractors/models/openaiModel.js +23 -0
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
- package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
- package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
- package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
- package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
- package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
- package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
- package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
- package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
- package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
- package/dist/optimized-extractors/types/errors.js +42 -0
- package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
- package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
- package/dist/optimized-extractors/types/types.js +5 -0
- package/dist/optimized-extractors/validators.js +152 -0
- package/dist/types/intuned-runtime.d.js +1 -0
- package/dist/types/intuned-runtime.d.ts +64 -0
- package/dist/vite-env.d.js +1 -0
- package/dist/vite-env.d.ts +9 -0
- package/generated-docs/ai/functions/extractStructuredData.mdx +255 -0
- package/generated-docs/ai/functions/isPageLoaded.mdx +88 -0
- package/generated-docs/ai/interfaces/ArraySchema.mdx +36 -0
- package/generated-docs/ai/interfaces/BasicSchema.mdx +14 -0
- package/generated-docs/ai/interfaces/BooleanSchema.mdx +28 -0
- package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +16 -0
- package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +16 -0
- package/generated-docs/ai/interfaces/NumberSchema.mdx +35 -0
- package/generated-docs/ai/interfaces/ObjectSchema.mdx +39 -0
- package/generated-docs/ai/interfaces/StringSchema.mdx +35 -0
- package/generated-docs/ai/interfaces/TextContentItem.mdx +14 -0
- package/generated-docs/ai/type-aliases/ContentItem.mdx +12 -0
- package/generated-docs/ai/type-aliases/JsonSchema.mdx +47 -0
- package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +85 -0
- package/generated-docs/helpers/functions/downloadFile.mdx +99 -0
- package/generated-docs/helpers/functions/extractMarkdown.mdx +56 -0
- package/generated-docs/helpers/functions/filterEmptyValues.mdx +51 -0
- package/generated-docs/helpers/functions/goToUrl.mdx +124 -0
- package/generated-docs/helpers/functions/processDate.mdx +55 -0
- package/generated-docs/helpers/functions/resolveUrl.mdx +165 -0
- package/generated-docs/helpers/functions/sanitizeHtml.mdx +113 -0
- package/generated-docs/helpers/functions/saveFileToS3.mdx +127 -0
- package/generated-docs/helpers/functions/scrollToLoadContent.mdx +89 -0
- package/generated-docs/helpers/functions/uploadFileToS3.mdx +121 -0
- package/generated-docs/helpers/functions/validateDataUsingSchema.mdx +90 -0
- package/generated-docs/helpers/functions/waitForDomSettled.mdx +91 -0
- package/generated-docs/helpers/functions/withNetworkSettledWait.mdx +76 -0
- package/generated-docs/helpers/interfaces/Attachment.mdx +56 -0
- package/generated-docs/helpers/interfaces/S3Configs.mdx +52 -0
- package/generated-docs/helpers/interfaces/SanitizeHtmlOptions.mdx +22 -0
- package/generated-docs/helpers/type-aliases/AttachmentType.mdx +10 -0
- package/generated-docs/helpers/type-aliases/FileType.mdx +61 -0
- package/generated-docs/helpers/type-aliases/Trigger.mdx +62 -0
- package/how-to-generate-docs.md +61 -0
- package/how-to-run-tests.md +42 -0
- package/intuned-runtime-setup.md +13 -0
- package/package.json +124 -0
- package/tsconfig.eslint.json +5 -0
- package/tsconfig.json +26 -0
package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromPage.spec.js
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _extendedTest = require("../../../common/extendedTest");
|
|
4
|
+
var _ = require("../..");
|
|
5
|
+
var _uuid = require("uuid");
|
|
6
|
+
// Page fixture: three product cards followed by an unrelated "additional-info" section.
const productListTemplate = `
<div class="products-container">
<div class="product-item">
<h2 class="product-title">iPhone 14 Pro</h2>
<div class="price-wrapper">
<span class="price">$999</span>
</div>
<div class="details">
<p class="product-description">Latest iPhone with advanced camera system</p>
</div>
</div>
<div class="product-item">
<h2 class="product-title">MacBook Air M2</h2>
<div class="price-wrapper">
<span class="price">$1199</span>
</div>
<div class="details">
<p class="product-description">Thin and light laptop with M2 chip</p>
</div>
</div>
<div class="product-item">
<h2 class="product-title">AirPods Pro</h2>
<div class="price-wrapper">
<span class="price">$249</span>
</div>
<div class="details">
<p class="product-description">Active noise cancellation earbuds</p>
</div>
</div>
<div class="additional-info">
<div class="shipping-notice">Free shipping on all orders</div>
<div class="return-policy">30-day return policy</div>
</div>
</div>
`;

// The whole suite is skipped; it calls extractArrayFromPage with an Anthropic
// API key taken from the environment.
_extendedTest.describe.skip("Array Extractor from Page Caching Tests", () => {
  (0, _extendedTest.describe)("DOM Changes and Cache Behavior", () => {
    (0, _extendedTest.test)("should demonstrate caching behavior with different types of DOM changes", async ({
      page
    }) => {
      // A fresh label/variant key per run keeps runs from sharing cache entries.
      const uniqueLabel = `product-list-page-cache-test-${(0, _uuid.v4)()}`;
      const extractionOptions = {
        itemEntityName: "product",
        label: uniqueLabel,
        itemEntitySchema: {
          type: "object",
          required: ["title", "price"],
          properties: {
            title: {
              type: "string",
              description: "Product title",
              primary: true
            },
            price: {
              type: "string",
              description: "Product price"
            },
            description: {
              type: "string",
              description: "Product description"
            }
          }
        },
        strategy: {
          model: "claude-3-5-sonnet-20240620",
          type: "HTML"
        },
        variantKey: uniqueLabel,
        apiKey: process.env.ANTHROPIC_API_KEY
      };

      // Replaces the page content, then runs the extractor with the shared options.
      const loadAndExtract = async html => {
        await page.setContent(html);
        return (0, _.extractArrayFromPage)(page, extractionOptions);
      };

      // Asserts the title and price of a single extracted item, in that order.
      const expectProduct = (item, title, price) => {
        (0, _extendedTest.expect)(item).toHaveProperty("title", title);
        (0, _extendedTest.expect)(item).toHaveProperty("price", price);
      };

      // 1. Initial extraction.
      const firstResult = await loadAndExtract(productListTemplate);
      console.log("First extraction result:", firstResult);
      (0, _extendedTest.expect)(firstResult).toHaveLength(3);
      expectProduct(firstResult[0], "iPhone 14 Pro", "$999");
      expectProduct(firstResult[1], "MacBook Air M2", "$1199");
      expectProduct(firstResult[2], "AirPods Pro", "$249");

      // 2. Change extracted values: the result must differ from the first one.
      const modifiedTemplate = productListTemplate
        .replace("iPhone 14 Pro", "iPhone 15 Pro Max")
        .replace("$999", "$1099")
        .replace("MacBook Air M2", "MacBook Pro M3")
        .replace("$1199", "$1999");
      const secondResult = await loadAndExtract(modifiedTemplate);
      console.log("Second extraction result (after relevant change):", secondResult);
      (0, _extendedTest.expect)(secondResult).not.toEqual(firstResult);
      (0, _extendedTest.expect)(secondResult).toHaveLength(3);
      expectProduct(secondResult[0], "iPhone 15 Pro Max", "$1099");
      expectProduct(secondResult[1], "MacBook Pro M3", "$1999");
      expectProduct(secondResult[2], "AirPods Pro", "$249");

      // 3. Change only text outside the product items: result must be unchanged.
      const irrelevantChangeTemplate = modifiedTemplate
        .replace("Free shipping on all orders", "Express shipping available")
        .replace("30-day return policy", "60-day return policy");
      const thirdResult = await loadAndExtract(irrelevantChangeTemplate);
      console.log("Third extraction result (after irrelevant change):", thirdResult);
      (0, _extendedTest.expect)(thirdResult).toEqual(secondResult);
      (0, _extendedTest.expect)(thirdResult).toHaveLength(3);
      expectProduct(thirdResult[0], "iPhone 15 Pro Max", "$1099");

      // 4. Append a new sibling section after the list: result must be unchanged.
      const appendedTemplate = `${irrelevantChangeTemplate}
<div class="newly-added-section">
<div class="customer-service">
<h3>Customer Support</h3>
<p>24/7 support available</p>
</div>
<div class="social-media">
<button class="share-facebook">Share on Facebook</button>
<button class="share-twitter">Share on Twitter</button>
</div>
</div>
`;
      const fourthResult = await loadAndExtract(appendedTemplate);
      console.log("Fourth extraction result (after appending content):", fourthResult);
      (0, _extendedTest.expect)(fourthResult).toEqual(thirdResult);
      (0, _extendedTest.expect)(fourthResult).toHaveLength(3);
      expectProduct(fourthResult[0], "iPhone 15 Pro Max", "$1099");
      console.log("All cache behavior tests completed successfully!");
    });
  });
});
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _getListContainerXpath = require("../utils/getListContainerXpath");
|
|
4
|
+
var _vitest = require("vitest");
|
|
5
|
+
// Checks that sibling <li> XPaths differing only in the li index are treated
// as members of the same array.
(0, _vitest.describe)("verifyThatAllXpathsArePartOfSameArray", () => {
  (0, _vitest.it)("basic case", () => {
    const siblingTitleXpaths = [1, 2, 3, 4].map(position => `html[1]/li[${position}]/article[1]/h3[1]/a[1]/@title`);
    const verdict = (0, _getListContainerXpath.verifyThatAllXpathsArePartOfSameArray)(siblingTitleXpaths);
    (0, _vitest.expect)(verdict).toBe(true);
  });
});
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.dynamicListExtractor = dynamicListExtractor;
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var _captureSnapshot = require("../objectExtractionHelpers/captureSnapshot");
|
|
9
|
+
var _checksumUtils = require("../objectExtractionHelpers/checksumUtils");
|
|
10
|
+
var _runAiExtraction = require("./runAiExtraction");
|
|
11
|
+
var _validateOptions = require("./utils/validateOptions");
|
|
12
|
+
var _cache = require("../../intunedServices/cache");
|
|
13
|
+
var _xpathMapping = require("../../common/xpathMapping");
|
|
14
|
+
var _Logger = require("../../common/Logger");
|
|
15
|
+
/**
 * Extracts a list of entities from the page, reusing a cached extraction when
 * the cached XPath mapping still validates against the current DOM and the
 * list container holds the expected number of children.
 *
 * Flow:
 *  1. Validate the options; a validation error Result is returned as-is.
 *  2. Hash the extraction inputs to build the cache key.
 *  3. Capture a snapshot (only its error state is used here).
 *  4. Return the cached result when it is still valid for the current page.
 *  5. Otherwise run the AI extraction, cache its outcome and return it.
 *
 * NOTE(review): the validated options also carry `invalidate`, `label` and
 * `searchRegionXpath`; none of them is used by this function — confirm that
 * ignoring `invalidate` is intentional.
 *
 * @param {object} page - Page object; must expose `evaluate`.
 * @param {*} identifier - Forwarded to option validation and to the AI extraction.
 * @param {object} options - Raw extractor options, validated before use.
 * @returns {Promise<object>} A neverthrow Result: `ok` with an array of plain
 *   objects (property name -> matched text), or `err` with the failure.
 */
async function dynamicListExtractor(page, identifier, options) {
  const inputValidation = await (0, _validateOptions.validateDynamicListExtractorOptions)(page, identifier, options);
  if (inputValidation.isErr()) {
    return inputValidation;
  }
  const {
    pageUrl,
    searchRegion,
    searchRegionHandler,
    itemEntityName,
    itemEntitySchema,
    variantKey,
    primaryProperty,
    hasSearchRegionContainer,
    searchRegionKey,
    strategy,
    prompt,
    apiKey
  } = inputValidation.value;

  // Everything that influences the extraction output takes part in the cache key.
  const extractorInputHash = (0, _checksumUtils.hashObject)({
    itemEntityName,
    itemEntitySchema,
    variantKey,
    currentPageUrl: pageUrl,
    strategy,
    searchRegionKey,
    prompt
  }, true);
  const pageAndSearchRegion = {
    page,
    searchRegion,
    searchRegionHandler
  };
  const snapshot = await (0, _captureSnapshot.captureSnapshot)(pageAndSearchRegion);
  if (snapshot.isErr()) {
    return (0, _neverthrow.err)(snapshot.error);
  }

  _Logger.logger.info("Looking for value in the cache...");
  const cachedResult = await _cache.cache.get(extractorInputHash);
  if (!cachedResult) {
    _Logger.logger.info("No value found in the cache, running AI extraction");
  } else {
    _Logger.logger.info("Found value in cache");
    if (cachedResult.exceedsLimit) {
      // A previous run stored only a marker because the payload was too large.
      _Logger.logger.warn(`Cache key ${extractorInputHash} exceeds cache limit and is not cacheable`);
    } else if (await isCachedResultUsable(page, cachedResult)) {
      _Logger.logger.info("The values in the cache are the same as the current page, returning the cached result");
      return (0, _neverthrow.ok)(cachedResult.result);
    } else {
      // Logged exactly once, whether the mapping or the child count mismatched.
      _Logger.logger.info("The values in the cache are different from the current page, running AI extraction");
    }
  }

  const aiExtractionResult = await (0, _runAiExtraction.runAiExtraction)({
    pageAndSearchRegion,
    itemEntityName,
    itemEntitySchema,
    primaryProperty,
    hasSearchRegionContainer,
    strategy,
    identifier,
    prompt,
    examples: [],
    apiKey
  });
  if (aiExtractionResult.isErr()) {
    return (0, _neverthrow.err)(aiExtractionResult.error);
  }
  const extraction = aiExtractionResult.value;
  const xpathsMapping = buildXpathsMapping(extraction);
  const resultsToReturn = getResultToReturn(extraction.resultValues);

  // Children of the container that are not list items; stored so a later run
  // can tell whether items were added or removed.
  const allContainerChildrenCount = await countContainerChildren(page, extraction.fullContainerXpath);
  const nonRelatedChildrenCount = allContainerChildrenCount - resultsToReturn.length;
  const resultsToCache = {
    result: resultsToReturn,
    matchesMapping: xpathsMapping,
    containerPath: extraction.containerPath,
    fullContainerXpath: extraction.fullContainerXpath,
    nonRelatedChildrenCount
  };

  // Measure the UTF-8 byte size: the limit and the log message are in bytes,
  // and String#length undercounts non-ASCII text.
  const cacheDataSize = Buffer.byteLength(JSON.stringify(resultsToCache), "utf8");
  const CACHE_SIZE_LIMIT = 380 * 1024;
  if (cacheDataSize > CACHE_SIZE_LIMIT) {
    _Logger.logger.warn(`Results exceed cache limit (${cacheDataSize} bytes > ${CACHE_SIZE_LIMIT} bytes), skipping caching`);
    await _cache.cache.set(extractorInputHash, {
      exceedsLimit: true
    });
  } else {
    _Logger.logger.debug("Caching results...");
    await _cache.cache.set(extractorInputHash, resultsToCache);
    _Logger.logger.debug("Results cached");
  }
  return (0, _neverthrow.ok)(resultsToReturn);
}

/**
 * Decides whether a cached extraction can be returned for the current page:
 * the cached XPath mapping must validate, and the container's child count
 * minus the cached non-item count must equal the cached number of items.
 * The DOM is only queried when the mapping validated.
 */
async function isCachedResultUsable(page, cachedResult) {
  const isValid = await (0, _xpathMapping.validateXPathMapping)(page, cachedResult.matchesMapping, cachedResult.containerPath);
  if (!isValid) {
    return false;
  }
  const currentChildrenCount = await countContainerChildren(page, cachedResult.fullContainerXpath);
  return currentChildrenCount - cachedResult.nonRelatedChildrenCount === cachedResult.result.length;
}

/**
 * Counts the element children of the node addressed by `fullContainerXpath`
 * in the page; returns 0 when the XPath is empty or matches nothing.
 */
function countContainerChildren(page, fullContainerXpath) {
  // The callback is passed to page.evaluate, so it must not close over
  // anything from this module.
  return page.evaluate(xpath => {
    if (!xpath) return 0;
    const containerResult = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
    const containerElement = containerResult.singleNodeValue;
    if (!containerElement) return 0;
    return containerElement.children.length;
  }, fullContainerXpath);
}
|
|
129
|
+
/**
 * Groups the matched XPaths of an extraction by the text they matched.
 *
 * For every extracted property that has a `matchXpath`, the first occurrence
 * of `<containerPath>/` is removed from that XPath and the entry
 * `{ xpath, matchType }` is appended to the list stored under the property's
 * `matchText`. Properties without a `matchXpath` are skipped.
 *
 * Entries are accumulated in a Map so that matched text equal to a name
 * inherited from Object.prototype (for example "constructor" or "__proto__")
 * is treated as ordinary data rather than hitting the inherited member.
 *
 * @param {{containerPath: string, resultValues: Array<{result: object}>}} results
 *   Extraction output; each `result` maps property names to objects carrying
 *   `matchText`, `matchXpath` and `matchType`.
 * @returns {Object<string, Array<{xpath: string, matchType: *}>>} Plain object
 *   keyed by matched text.
 */
function buildXpathsMapping(results) {
  const containerPrefix = `${results.containerPath}/`;
  const entriesByText = new Map();
  for (const { result } of results.resultValues) {
    for (const value of Object.values(result)) {
      const matchedXpath = value.matchXpath;
      if (!matchedXpath) {
        continue;
      }
      // Property keys are strings; coerce up front so Map keys collapse the
      // same way object keys would.
      const textKey = String(value.matchText);
      let entries = entriesByText.get(textKey);
      if (entries === undefined) {
        entries = [];
        entriesByText.set(textKey, entries);
      }
      entries.push({
        xpath: matchedXpath.replace(containerPrefix, ""),
        matchType: value.matchType
      });
    }
  }
  // Object.fromEntries defines own data properties, so even "__proto__"
  // becomes a regular key on the returned plain object.
  return Object.fromEntries(entriesByText);
}
|
|
151
|
+
/**
 * Flattens extraction items into plain records: for each item, every property
 * of `item.result` is replaced by that property's `matchText`.
 *
 * @param {Array<{result: object}>} extractionResult - Items whose `result`
 *   maps property names to objects carrying a `matchText` field.
 * @returns {Array<object>} One object per item, mapping property name to matched text.
 */
function getResultToReturn(extractionResult) {
  const toPlainRecord = item => {
    const record = {};
    const fields = item.result;
    Object.keys(fields).forEach(fieldName => {
      record[fieldName] = fields[fieldName].matchText;
    });
    return record;
  };
  return extractionResult.map(item => toPlainRecord(item));
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.requiredPropertyNotExtracted = exports.other = exports.invalidSearchRegion = exports.invalidList = exports.invalidInput = exports.invalidExtractionResult = exports.invalidAddressUrl = exports.insufficientAiCredits = void 0;
|
|
7
|
+
// Factories for the tagged error objects used by the list extraction helpers.
// Each returns a fresh plain object whose `type` field identifies the error.

/** Generic failure carrying a context and the underlying error. */
function other(context, error) {
  return {
    type: "Other",
    context,
    error
  };
}
exports.other = other;

/** Error object of type "InvalidSearchRegion"; carries no further fields. */
function invalidSearchRegion() {
  return {
    type: "InvalidSearchRegion"
  };
}
exports.invalidSearchRegion = invalidSearchRegion;

/** Error object of type "InvalidList" with a fixed guidance message as context. */
function invalidList() {
  return {
    type: "InvalidList",
    context: "Failed to get container path, please try to choose another primary property and make sure all list items are direct child of a common container"
  };
}
exports.invalidList = invalidList;

/** Error object of type "InvalidInput" with the given context. */
function invalidInput(context) {
  return {
    type: "InvalidInput",
    context
  };
}
exports.invalidInput = invalidInput;

/** Error object of type "InvalidExtractionResult" with the given context. */
function invalidExtractionResult(context) {
  return {
    type: "InvalidExtractionResult",
    context
  };
}
exports.invalidExtractionResult = invalidExtractionResult;

/** Error object of type "InvalidAddressUrl" with the given context. */
function invalidAddressUrl(context) {
  return {
    type: "InvalidAddressUrl",
    context
  };
}
exports.invalidAddressUrl = invalidAddressUrl;

/** Error object of type "RequiredPropertyNotExtracted" with the given context. */
function requiredPropertyNotExtracted(context) {
  return {
    type: "RequiredPropertyNotExtracted",
    context
  };
}
exports.requiredPropertyNotExtracted = requiredPropertyNotExtracted;

/** Error object of type "InsufficientAiCredits" with the given context. */
function insufficientAiCredits(context) {
  return {
    type: "InsufficientAiCredits",
    context
  };
}
exports.insufficientAiCredits = insufficientAiCredits;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.getListMatches = getListMatches;
|
|
7
|
+
var _noEmpty = _interopRequireDefault(require("../../common/noEmpty"));
|
|
8
|
+
var _findDomMatches = require("../objectExtractionHelpers/findDomMatches");
|
|
9
|
+
// Generated interop helper: returns `e` unchanged when it is truthy and flagged
// `__esModule`; otherwise wraps it as `{ default: e }` so `.default` access works.
function _interopRequireDefault(e) { return e && e.__esModule ? e : { default: e }; }
|
|
10
|
+
/**
 * Collects every extracted property value from the list items and resolves
 * their DOM matches through `getDomMatches`.
 *
 * @param {object} page - Forwarded to `getDomMatches`.
 * @param {*} searchRegionHandler - Forwarded to `getDomMatches`.
 * @param {Array<{result: object}>} results - Extracted items; the values of
 *   each `result` object are gathered into one flat array.
 * @returns {Promise<*>} Whatever `getDomMatches` resolves to.
 */
async function getListMatches(page, searchRegionHandler, results) {
  const valuesPerItem = results.map(item => Object.values(item.result));
  // NOTE(review): the filter runs on the per-item arrays, before flattening,
  // so it does not look at individual values — confirm this order is intended.
  const keptItems = valuesPerItem.filter(_noEmpty.default);
  const allExtractedValues = keptItems.flat();
  return await (0, _findDomMatches.getDomMatches)(page, searchRegionHandler, allExtractedValues);
}
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.runAiExtraction = runAiExtraction;
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var _extractionHelpers = require("../../common/extractionHelpers");
|
|
9
|
+
var _findDomMatches = require("../objectExtractionHelpers/findDomMatches");
|
|
10
|
+
var _getSimplifiedHtml = require("../objectExtractionHelpers/getSimplifiedHtml");
|
|
11
|
+
var Errors = _interopRequireWildcard(require("./errors"));
|
|
12
|
+
var _getListMatches = require("./getListMatches");
|
|
13
|
+
var _extractPropertiesUsingGPTFromArray = require("./utils/extractPropertiesUsingGPTFromArray");
|
|
14
|
+
var _extractStructuredListUsingAi = require("./utils/extractStructuredListUsingAi");
|
|
15
|
+
var _getListContainerXpath = require("./utils/getListContainerXpath");
|
|
16
|
+
var _getRelativeContainerXpathSelector = require("./utils/getRelativeContainerXpathSelector");
|
|
17
|
+
var _getSimplifiedHtmlPerListItem = require("./utils/getSimplifiedHtmlPerListItem");
|
|
18
|
+
var _tablesUtils = require("./utils/tablesUtils");
|
|
19
|
+
var _buildImagesFromPage = require("../common/buildImagesFromPage");
|
|
20
|
+
var _findTableHeaders = require("../common/findTableHeaders");
|
|
21
|
+
var _Logger = require("../../common/Logger");
|
|
22
|
+
var _utils = require("../common/matching/utils");
|
|
23
|
+
// Babel-generated interop helper for `import * as X` (used here for `./errors`).
// On first call it creates two WeakMap caches (when WeakMap exists), then replaces
// itself with the inner function and invokes it. The inner function:
//   - returns `e` unchanged when `t` is falsy and `e.__esModule` is set;
//   - otherwise builds a null-prototype namespace object `{ default: e }`;
//   - returns that object immediately when `e` is null or neither object nor function;
//   - caches the namespace per input (cache `n` when `t` is truthy, else `r`);
//   - copies every own enumerable key except "default", re-defining accessors
//     (get/set) via Object.defineProperty and assigning plain values directly.
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
24
|
+
/**
 * Exported entry point for AI list extraction.
 * Forwards `params` untouched to `handleNewAiExtraction` and resolves to its outcome.
 *
 * @param {object} params - Extraction parameters (see `handleNewAiExtraction`).
 * @returns {Promise<*>} The value `handleNewAiExtraction` resolves to.
 */
async function runAiExtraction(params) {
  const extractionOutcome = await handleNewAiExtraction(params);
  return extractionOutcome;
}
|
|
27
|
+
/**
 * Extracts a list of entities with AI and, when possible, locates the list
 * container in the DOM so each item can be extracted individually.
 *
 * Flow:
 *  1. Build the AI input (simplified HTML or images) via `getAiExtractionData`.
 *  2. Ask the AI for the primary property only.
 *  3. No items: resolve ok with empty results and null container paths.
 *     Fewer than two items: run one full-schema AI call over the whole input
 *     and return its rows with null container paths.
 *  4. Otherwise find the primary values in the DOM, derive the container xpath,
 *     and delegate per-item extraction to `splitDomAndExtractData`.
 *
 * Note: `prompt` is only used for the whole-region AI calls; it is not forwarded
 * to the per-item extraction.
 *
 * @param {object} params
 * @param {string} params.itemEntityName
 * @param {object} params.itemEntitySchema - Full item schema.
 * @param {object} params.pageAndSearchRegion - Reads `page`, `searchRegionHandler`, `searchRegion`.
 * @param {[string, object]} params.primaryProperty - `[name, schema]` of the primary property.
 * @param {boolean} params.hasSearchRegionContainer
 * @param {{type: string}} params.strategy
 * @param {*} params.identifier
 * @param {*} params.prompt
 * @param {*} params.examples
 * @param {*} params.apiKey
 * @returns {Promise<*>} neverthrow Result: ok `{ resultValues, containerPath,
 *   fullContainerXpath, matches }`, or err with the first failure encountered
 *   (`Errors.invalidList()` when no list container can be resolved).
 */
async function handleNewAiExtraction(params) {
  const {
    itemEntityName,
    itemEntitySchema,
    pageAndSearchRegion,
    primaryProperty,
    hasSearchRegionContainer,
    strategy,
    identifier,
    prompt,
    examples,
    apiKey
  } = params;
  const [primaryPropertyName, primaryPropertyValue] = primaryProperty;

  const aiExtractionData = await getAiExtractionData(strategy, pageAndSearchRegion, hasSearchRegionContainer);
  if (aiExtractionData.isErr()) return (0, _neverthrow.err)(aiExtractionData.error);

  // First pass: extract only the primary property to discover the list items.
  const primaryData = await (0, _extractStructuredListUsingAi.extractStructuredListUsingAi)(itemEntityName, {
    type: "object",
    properties: {
      [primaryPropertyName]: primaryPropertyValue
    },
    required: [primaryPropertyName]
  }, aiExtractionData.value, identifier, strategy, prompt, apiKey);
  if (primaryData.isErr()) {
    return (0, _neverthrow.err)(primaryData.error);
  }
  if (primaryData.value.length === 0) {
    _Logger.logger.debug(`the ai couldn't find any item with the ${primaryPropertyName} property`);
    return (0, _neverthrow.ok)({
      resultValues: [],
      containerPath: null,
      fullContainerXpath: null,
      matches: new Map()
    });
  }

  const primaryValues = primaryData.value.map(item => item[primaryPropertyName]);
  _Logger.logger.debug(`we were able to detect ${primaryValues.length} items with ${primaryPropertyName} property: ${JSON.stringify(primaryValues)}`);

  // With fewer than two items no list container is derived; extract the full
  // schema from the whole input in one call instead.
  if (primaryValues.length < 2) {
    const allData = await (0, _extractStructuredListUsingAi.extractStructuredListUsingAi)(itemEntityName, itemEntitySchema, aiExtractionData.value, identifier, strategy, prompt, apiKey);
    if (allData.isErr()) {
      return (0, _neverthrow.err)(allData.error);
    }
    allData.value.forEach((row, rowIndex) => {
      _Logger.logger.debug(`ai extraction result for row ${rowIndex}: ${JSON.stringify(row)}`);
    });
    const aiResults = allData.value.map((row, rowIndex) => ({
      result: row,
      rowIndex
    }));
    const matches = await (0, _getListMatches.getListMatches)(pageAndSearchRegion.page, pageAndSearchRegion.searchRegionHandler, aiResults);
    return (0, _neverthrow.ok)({
      resultValues: aiResults,
      containerPath: null,
      fullContainerXpath: null,
      matches
    });
  }

  const primaryValuesDomMatches = await (0, _findDomMatches.getDomMatches)(pageAndSearchRegion.page, pageAndSearchRegion.searchRegionHandler, primaryValues);
  const valuesDoesNotExistInDOM = Array.from(primaryValuesDomMatches.entries()).filter(([_, v]) => v.length === 0).map(entry => entry[0]);
  if (valuesDoesNotExistInDOM.length > 0) {
    _Logger.logger.debug(`the following values returned by AI does not exist in the page dom, [${valuesDoesNotExistInDOM}] , this will cause issues finding the list container`);
  }

  const fullContainerXpath = await (0, _getListContainerXpath.getListContainerXpath)(primaryValuesDomMatches);
  let containerPath = fullContainerXpath;
  if (hasSearchRegionContainer && containerPath) {
    containerPath = await (0, _getRelativeContainerXpathSelector.getRelativeContainerXpathSelector)(pageAndSearchRegion.searchRegion, containerPath);
  }
  if (!containerPath) {
    return (0, _neverthrow.err)(Errors.invalidList());
  }

  const listItemsContainerLocator = (await (0, _extractionHelpers.selectLocatorsUsingXpath)(pageAndSearchRegion.page, fullContainerXpath))[0];
  // The xpath may resolve to no element; report an invalid list rather than
  // handing `undefined` to the per-item extraction.
  if (!listItemsContainerLocator) {
    _Logger.logger.debug(`no element found for list container xpath ${fullContainerXpath}`);
    return (0, _neverthrow.err)(Errors.invalidList());
  }

  const extractedData = await splitDomAndExtractData({
    listItemsContainerLocator,
    itemEntityName,
    itemEntitySchema,
    pageAndSearchRegion,
    primaryPropertyName,
    strategy,
    identifier,
    examples,
    apiKey
  });
  if (extractedData.isErr()) {
    return (0, _neverthrow.err)(extractedData.error);
  }
  const {
    matches,
    resultValues
  } = extractedData.value;
  return (0, _neverthrow.ok)({
    resultValues,
    containerPath,
    fullContainerXpath,
    matches
  });
}
|
|
125
|
+
/**
 * Takes a PNG screenshot of each locator's element, one after another, and
 * wraps every screenshot as an image item.
 *
 * @param {Iterable<*>} locators - Objects exposing an async `elementHandle()`
 *   whose result exposes an async `screenshot(options)`.
 * @returns {Promise<Array<{type: string, buffer: *}>>} One `{ type: "image", buffer }`
 *   entry per locator, in input order.
 */
async function buildImagesForItemsHandles(locators) {
  const imageItems = [];
  for (const itemLocator of locators) {
    const handle = await itemLocator.elementHandle();
    const pngBuffer = await handle.screenshot({ type: "png" });
    imageItems.push({ type: "image", buffer: pngBuffer });
  }
  return imageItems;
}
|
|
139
|
+
/**
 * Splits the list container into items, extracts the schema properties for each
 * item with AI, and keeps only values that can be matched in the item's own DOM.
 *
 * Steps:
 *  1. Split the container into item locators and build simplified HTML per item.
 *  2. Detect whether the list is a table; if so collect its JSON form and headers.
 *  3. Run the AI extraction over the items (text for the "HTML" strategy, PNG
 *     screenshots otherwise).
 *  4. Per row: skip rows with no primary value, drop values with no DOM match,
 *     and keep the row only when its primary value matched.
 *  5. Fail when a required, non-primary property is missing from any kept row.
 *  6. Compute region-wide matches for the kept values.
 *
 * @param {object} args
 * @param {*} args.listItemsContainerLocator
 * @param {string} args.itemEntityName
 * @param {{properties: object, required?: string[]}} args.itemEntitySchema -
 *   `required` may be omitted; it is then treated as empty.
 * @param {object} args.pageAndSearchRegion - Reads `page` and `searchRegionHandler`.
 * @param {string} args.primaryPropertyName
 * @param {{type: string}} args.strategy
 * @param {*} args.identifier
 * @param {*} args.examples
 * @param {*} args.apiKey
 * @returns {Promise<*>} neverthrow Result: ok `{ resultValues, matches }`, or err
 *   from table-header detection, the AI extraction, or the required-property check.
 */
async function splitDomAndExtractData({
  listItemsContainerLocator,
  itemEntityName,
  itemEntitySchema,
  pageAndSearchRegion,
  primaryPropertyName,
  strategy,
  identifier,
  examples,
  apiKey
}) {
  const itemsLocators = await (0, _extractionHelpers.splitContainerIntoListLocators)(listItemsContainerLocator);
  const itemsSimplifiedHtml = await (0, _getSimplifiedHtmlPerListItem.getSimplifiedHtmlPerListItem)(itemsLocators);
  const {
    isTable,
    tableLocater
  } = await (0, _tablesUtils.isListTable)(listItemsContainerLocator, itemsSimplifiedHtml);
  const tableAsJsonArray = isTable ? await (0, _tablesUtils.createJsonFromTable)(pageAndSearchRegion.page) : [];
  const tableHeaders = tableLocater ? await (0, _findTableHeaders.getTableHeadersUsingAi)(tableLocater, identifier) : undefined;
  if (tableHeaders && tableHeaders.isErr()) {
    return (0, _neverthrow.err)(tableHeaders.error);
  }

  // Only forward headers when detection ran and produced at least one header.
  const headerNames = tableHeaders !== null && tableHeaders !== undefined && tableHeaders.value.headers.length ? tableHeaders.value.headers : undefined;
  const items = strategy.type === "HTML" ? itemsSimplifiedHtml.map(html => ({
    type: "text",
    text: html
  })) : await buildImagesForItemsHandles(itemsLocators);

  const extractedData = await (0, _extractPropertiesUsingGPTFromArray.extractPropertiesUsingGPTFromArray)({
    itemEntityName,
    itemEntitySchema,
    itemsSimplifiedHtml,
    tableAsJsonArray,
    strategy,
    tableHeaders: headerNames,
    items,
    identifier,
    examples,
    apiKey
  });
  if (extractedData.isErr()) {
    return extractedData;
  }

  const resultValues = [];
  for (let i = 0; i < extractedData.value.length; i++) {
    const rowValues = extractedData.value[i] ?? {};
    const rowLocator = itemsLocators[i];
    const primaryValue = rowValues[primaryPropertyName];
    if (primaryValue === null || primaryValue === undefined) {
      continue;
    }
    // Matches are searched inside this row's element only.
    const rowValuesMatches = await (0, _findDomMatches.getDomMatchesFromItemsHandles)(pageAndSearchRegion.page, await rowLocator.elementHandle(), Object.values(rowValues));
    const primaryValueMatches = rowValuesMatches.get(primaryValue);
    const rowValuesWithMatchesOnly = {};
    for (const [key, value] of Object.entries(rowValues)) {
      const valueMatches = rowValuesMatches.get(value);
      const bestMatch = (0, _utils.selectBestMatch)(value, valueMatches ?? []);
      if (valueMatches && valueMatches.length > 0 && bestMatch) {
        rowValuesWithMatchesOnly[key] = {
          matchText: bestMatch.matchText,
          matchXpath: bestMatch.matchXpath,
          matchType: bestMatch.matchType
        };
      } else {
        _Logger.logger.debug(`value "${value}" for key "${key}" in row ${i + 1} does not have any matches in the item's html, dropped for hallucination protection`);
      }
    }
    // A row is kept only when its primary value was found in the row's DOM.
    if (primaryValueMatches && primaryValueMatches.length > 0) {
      resultValues.push({
        rowIndex: i,
        result: rowValuesWithMatchesOnly
      });
    }
  }

  // `required` is optional in a schema; a missing array means nothing is
  // required (previously this threw a TypeError on `.includes`).
  const requiredKeys = itemEntitySchema.required ?? [];
  for (const propertyKey of Object.keys(itemEntitySchema.properties)) {
    const isPrimary = itemEntitySchema.properties[propertyKey].primary;
    if (isPrimary || !requiredKeys.includes(propertyKey)) {
      continue;
    }
    const missingInSomeRow = resultValues.some(row => row.result[propertyKey] === null || row.result[propertyKey] === undefined);
    if (missingInSomeRow) {
      return (0, _neverthrow.err)(Errors.invalidExtractionResult(`Required property ${propertyKey} not found in all rows`));
    }
  }

  // Region-wide matching works on the matched text, not on the raw AI values.
  const matches = await (0, _getListMatches.getListMatches)(pageAndSearchRegion.page, pageAndSearchRegion.searchRegionHandler, resultValues.map(row => ({
    rowIndex: row.rowIndex,
    result: Object.fromEntries(Object.entries(row.result).map(([key, value]) => [key, value.matchText]))
  })));
  return (0, _neverthrow.ok)({
    resultValues,
    matches
  });
}
|
|
232
|
+
/**
 * Prepares the AI input for the chosen strategy.
 *
 * For the "HTML" strategy this is the simplified HTML of the search region;
 * for any other strategy it is a set of images built from the page, limited to
 * the search-region handle when `hasSearchRegionContainer` is truthy.
 *
 * @param {{type: string}} strategy
 * @param {object} pageAndSearchRegion - Reads `page` and `searchRegionHandler`.
 * @param {boolean} hasSearchRegionContainer
 * @returns {Promise<*>} neverthrow Result: ok `{ text }` or ok `{ images }`, or
 *   err `Errors.other(context)` when image building fails.
 */
async function getAiExtractionData(strategy, pageAndSearchRegion, hasSearchRegionContainer) {
  if (strategy.type !== "HTML") {
    const screenshotScope = hasSearchRegionContainer ? pageAndSearchRegion.searchRegionHandler : undefined;
    const imagesResult = await (0, _buildImagesFromPage.buildImagesFromPageOrHandle)(pageAndSearchRegion.page, screenshotScope);
    if (imagesResult.isErr()) {
      return (0, _neverthrow.err)(Errors.other(imagesResult.error.context));
    }
    return (0, _neverthrow.ok)({
      images: imagesResult.value
    });
  }
  const simplifiedHtml = await (0, _getSimplifiedHtml.getSimplifiedHtml)(pageAndSearchRegion.searchRegionHandler);
  return (0, _neverthrow.ok)({
    text: simplifiedHtml
  });
}
|