@intuned/browser-dev 2.2.3-unify-sdks.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.babelrc +21 -0
- package/.eslintignore +10 -0
- package/.eslintrc.js +39 -0
- package/LICENSE +43 -0
- package/dist/ai-extractors/AnthropicClient/index.js +23 -0
- package/dist/ai-extractors/export.d.js +5 -0
- package/dist/ai-extractors/export.d.ts +422 -0
- package/dist/ai-extractors/extractStructuredData.js +79 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/constants.js +7 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/errors.js +42 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingClaude.js +149 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingGoogle.js +37 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingOpenAi.js +144 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStrucutredDataUsingAiInstance.js +123 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/index.js +55 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/isItemTableHeaderOrFooter.js +96 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/screenshotHelpers.js +55 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/types.js +5 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/utils.js +53 -0
- package/dist/ai-extractors/extractionHelpers/types.js +5 -0
- package/dist/ai-extractors/fileExtractors.js +176 -0
- package/dist/ai-extractors/index.js +31 -0
- package/dist/ai-extractors/jsonSchema.d.js +5 -0
- package/dist/ai-extractors/jsonSchema.d.ts +49 -0
- package/dist/ai-extractors/openAiClients/index.js +23 -0
- package/dist/ai-extractors/validators.js +239 -0
- package/dist/browser/ai/export.d.js +3 -0
- package/dist/browser/ai/export.d.ts +587 -0
- package/dist/browser/ai/extractMarkdown.js +15 -0
- package/dist/browser/ai/extractStructuredData.js +231 -0
- package/dist/browser/ai/extractStructuredDataUsingAi.js +140 -0
- package/dist/browser/ai/extractionHelpers/screenshotHelpers.js +55 -0
- package/dist/browser/ai/extractionHelpers/validateSchema.js +148 -0
- package/dist/browser/ai/index.d.ts +587 -0
- package/dist/browser/ai/index.js +19 -0
- package/dist/browser/ai/isPageLoaded.js +67 -0
- package/dist/browser/ai/prompt.js +39 -0
- package/dist/browser/ai/tests/testCheckAllTypesAreStrings.spec.js +143 -0
- package/dist/browser/ai/tests/testExtractStructuredData.spec.js +622 -0
- package/dist/browser/ai/tools/index.js +48 -0
- package/dist/browser/ai/types/errors.js +67 -0
- package/dist/browser/ai/types/models.js +45 -0
- package/dist/browser/ai/types/types.js +48 -0
- package/dist/browser/ai/validators.js +136 -0
- package/dist/common/Logger/index.js +60 -0
- package/dist/common/Logger/types.js +5 -0
- package/dist/common/SdkError.js +50 -0
- package/dist/common/aiModelsValidations.js +50 -0
- package/dist/common/browser_scripts.js +2596 -0
- package/dist/common/ensureBrowserScripts.js +17 -0
- package/dist/common/environmentVariables.js +16 -0
- package/dist/common/eventTracking/getAiTrackingHeaders.js +31 -0
- package/dist/common/eventTracking/getFileTrackingHeaders.js +23 -0
- package/dist/common/extendedTest.js +148 -0
- package/dist/common/extractionHelpers.js +19 -0
- package/dist/common/formatZodError.js +18 -0
- package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
- package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
- package/dist/common/fuzzySearch/utils.js +23 -0
- package/dist/common/getModelProvider.js +18 -0
- package/dist/common/getSimplifiedHtml.js +122 -0
- package/dist/common/hashObject.js +32 -0
- package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
- package/dist/common/html2markdown/index.js +19 -0
- package/dist/common/jwtTokenManager.js +18 -0
- package/dist/common/loadRuntime.js +16 -0
- package/dist/common/locatorHelpers.js +41 -0
- package/dist/common/matching/collectStrings.js +32 -0
- package/dist/common/matching/levenshtein.js +40 -0
- package/dist/common/matching/matching.js +317 -0
- package/dist/common/matching/types.js +1 -0
- package/dist/common/noEmpty.js +9 -0
- package/dist/common/saveSnapshotWithExamples.js +60 -0
- package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
- package/dist/common/xpathMapping.js +107 -0
- package/dist/helpers/downloadFile.js +125 -0
- package/dist/helpers/export.d.js +1 -0
- package/dist/helpers/export.d.ts +1294 -0
- package/dist/helpers/extractMarkdown.js +35 -0
- package/dist/helpers/filterEmptyValues.js +54 -0
- package/dist/helpers/gotoUrl.js +93 -0
- package/dist/helpers/index.d.ts +1294 -0
- package/dist/helpers/index.js +115 -0
- package/dist/helpers/processDate.js +25 -0
- package/dist/helpers/resolveUrl.js +63 -0
- package/dist/helpers/sanitizeHtml.js +73 -0
- package/dist/helpers/saveFileToS3.js +46 -0
- package/dist/helpers/scrollToLoadContent.js +50 -0
- package/dist/helpers/tests/extendedTest.js +130 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +197 -0
- package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
- package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
- package/dist/helpers/tests/testIsPageLoaded.spec.js +285 -0
- package/dist/helpers/tests/testProcessDate.spec.js +13 -0
- package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
- package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
- package/dist/helpers/tests/testSimplifyHtml.spec.js +251 -0
- package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +380 -0
- package/dist/helpers/tests/testWaitForDomSettled.spec.js +169 -0
- package/dist/helpers/tests/testWaitForNetworkIdle.spec.js +115 -0
- package/dist/helpers/types/Attachment.js +81 -0
- package/dist/helpers/types/CustomTypeRegistry.js +48 -0
- package/dist/helpers/types/RunEnvironment.js +18 -0
- package/dist/helpers/types/ValidationError.js +17 -0
- package/dist/helpers/types/index.js +51 -0
- package/dist/helpers/uploadFileToS3.js +153 -0
- package/dist/helpers/utils/getS3Client.js +21 -0
- package/dist/helpers/utils/index.js +73 -0
- package/dist/helpers/utils/isDownload.js +10 -0
- package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
- package/dist/helpers/utils/isLocator.js +9 -0
- package/dist/helpers/utils/jwtTokenManager.js +18 -0
- package/dist/helpers/validateDataUsingSchema.js +119 -0
- package/dist/helpers/waitForDomSettled.js +182 -0
- package/dist/helpers/waitForNetworkIdle.js +191 -0
- package/dist/index.d.js +82 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +84 -0
- package/dist/intunedServices/ApiGateway/aiApiGateway.js +87 -0
- package/dist/intunedServices/ApiGateway/factory.js +13 -0
- package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
- package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
- package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
- package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +221 -0
- package/dist/intunedServices/ApiGateway/types.js +11 -0
- package/dist/intunedServices/cache/cache.js +61 -0
- package/dist/intunedServices/cache/index.js +12 -0
- package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
- package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
- package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +149 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +145 -0
- package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
- package/dist/optimized-extractors/common/findTableHeaders.js +175 -0
- package/dist/optimized-extractors/common/index.js +55 -0
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +97 -0
- package/dist/optimized-extractors/common/matching/matching.js +212 -0
- package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
- package/dist/optimized-extractors/common/matching/types.js +18 -0
- package/dist/optimized-extractors/common/matching/utils.js +184 -0
- package/dist/optimized-extractors/common/utils.js +58 -0
- package/dist/optimized-extractors/export.d.js +5 -0
- package/dist/optimized-extractors/export.d.ts +397 -0
- package/dist/optimized-extractors/extractArray.js +120 -0
- package/dist/optimized-extractors/extractObject.js +104 -0
- package/dist/optimized-extractors/index.d.ts +397 -0
- package/dist/optimized-extractors/index.js +31 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
- package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
- package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
- package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
- package/dist/optimized-extractors/models/anthropicModel.js +23 -0
- package/dist/optimized-extractors/models/openaiModel.js +23 -0
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
- package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
- package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
- package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
- package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
- package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
- package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
- package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
- package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
- package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
- package/dist/optimized-extractors/types/errors.js +42 -0
- package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
- package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
- package/dist/optimized-extractors/types/types.js +5 -0
- package/dist/optimized-extractors/validators.js +152 -0
- package/dist/vite-env.d.js +1 -0
- package/dist/vite-env.d.ts +9 -0
- package/docs.md +14 -0
- package/how-to-run-tests.md +10 -0
- package/intuned-runtime-setup.md +13 -0
- package/package.json +124 -0
- package/tsconfig.eslint.json +5 -0
- package/tsconfig.json +26 -0
package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _extendedTest = require("../../../common/extendedTest");
|
|
4
|
+
var _dynamicListExtractor = require("../dynamicListExtractor");
|
|
5
|
+
var _uuid = require("uuid");
|
|
6
|
+
var _dotenv = require("dotenv");
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var _Logger = require("../../../common/Logger");
|
|
9
|
+
/**
 * Babel interop helper: turns the result of a CommonJS `require` into a
 * namespace-like object.
 *
 * - A value flagged `__esModule` is returned untouched unless `t` is truthy.
 * - Any other value is wrapped in a null-prototype object whose `default` is
 *   the value itself; own enumerable properties (except `default`) are copied
 *   over, with accessors preserved as accessors.
 * - Wrappers for objects/functions are memoised (when WeakMap is available),
 *   using one cache for `t` truthy and a separate one for `t` falsy.
 *
 * The first call replaces `_interopRequireWildcard` with the inner converter so
 * the caches are created once and shared by every later call.
 *
 * @param {*} e - The required module value.
 * @param {boolean} [t] - When truthy, `__esModule` values are wrapped as well.
 * @returns {*} `e` itself or its namespace wrapper.
 */
function _interopRequireWildcard(e, t) {
  const weakMapAvailable = typeof WeakMap === "function";
  const plainCache = weakMapAvailable ? new WeakMap() : undefined;
  const nodeInteropCache = weakMapAvailable ? new WeakMap() : undefined;

  const convert = function (moduleValue, nodeInterop) {
    if (!nodeInterop && moduleValue && moduleValue.__esModule) {
      return moduleValue;
    }

    const namespace = { __proto__: null, default: moduleValue };
    const isObjectLike =
      moduleValue !== null &&
      (typeof moduleValue === "object" || typeof moduleValue === "function");
    if (!isObjectLike) {
      return namespace;
    }

    const cache = nodeInterop ? nodeInteropCache : plainCache;
    if (cache) {
      if (cache.has(moduleValue)) {
        return cache.get(moduleValue);
      }
      // Registered before copying so the wrapper is already cached while its
      // properties are being read.
      cache.set(moduleValue, namespace);
    }

    for (const key in moduleValue) {
      if (key === "default" || !{}.hasOwnProperty.call(moduleValue, key)) {
        continue;
      }
      const descriptor = Object.getOwnPropertyDescriptor(moduleValue, key);
      if (descriptor && (descriptor.get || descriptor.set)) {
        // Keep getters/setters live instead of snapshotting their value.
        Object.defineProperty(namespace, key, descriptor);
      } else {
        namespace[key] = moduleValue[key];
      }
    }
    return namespace;
  };

  _interopRequireWildcard = convert;
  return convert(e, t);
}
|
|
10
|
+
(0, _dotenv.config)();
|
|
11
|
+
const booksTemplate = `
|
|
12
|
+
<div class="books-list">
|
|
13
|
+
<div class="book-page">
|
|
14
|
+
<div class="book-info">
|
|
15
|
+
<h1 class="book-title">The Great Gatsby</h1>
|
|
16
|
+
<div class="price-container">
|
|
17
|
+
<span class="price">$15.99</span>
|
|
18
|
+
</div>
|
|
19
|
+
<div class="description">
|
|
20
|
+
<p class="book-description">A classic novel about the American Dream</p>
|
|
21
|
+
</div>
|
|
22
|
+
</div>
|
|
23
|
+
</div>
|
|
24
|
+
|
|
25
|
+
<div class="book-page">
|
|
26
|
+
<div class="book-info">
|
|
27
|
+
<h1 class="book-title">1984</h1>
|
|
28
|
+
<div class="price-container">
|
|
29
|
+
<span class="price">$12.99</span>
|
|
30
|
+
</div>
|
|
31
|
+
<div class="description">
|
|
32
|
+
<p class="book-description">A dystopian social science fiction novel</p>
|
|
33
|
+
</div>
|
|
34
|
+
</div>
|
|
35
|
+
</div>
|
|
36
|
+
|
|
37
|
+
<div class="book-page">
|
|
38
|
+
<div class="book-info">
|
|
39
|
+
<h1 class="book-title">To Kill a Mockingbird</h1>
|
|
40
|
+
<div class="price-container">
|
|
41
|
+
<span class="price">$14.99</span>
|
|
42
|
+
</div>
|
|
43
|
+
<div class="description">
|
|
44
|
+
<p class="book-description">A story of racial injustice and loss of innocence</p>
|
|
45
|
+
</div>
|
|
46
|
+
</div>
|
|
47
|
+
</div>
|
|
48
|
+
</div>
|
|
49
|
+
`;
|
|
50
|
+
(0, _extendedTest.describe)("Dynamic List Extractor Caching Tests", () => {
|
|
51
|
+
(0, _extendedTest.describe)("DOM Changes and Cache Behavior", () => {
|
|
52
|
+
// Runs dynamicListExtractor, with one fixed label/variantKey, against six
// successive versions of the books markup and checks the extracted list after
// each change:
//   1. baseline markup                      -> three books with exact matches
//   2. extra class on the container         -> same result as step 1
//   3. first book's title and price edited  -> result differs, new text matched
//   4. a section inserted before the books  -> result differs from step 3
//   5. a fourth book appended               -> four books
//   6. a second list added outside target   -> still four books
// The success message is logged once, after the last scenario; the original
// also logged it after step 5, before the test had actually finished.
(0, _extendedTest.test)("should demonstrate caching behavior with different types of DOM changes", async ({
  page
}) => {
  // A fresh UUID per run gives this test its own label and variant key
  // (presumably so results cached by earlier runs are not reused — confirm
  // against the cache implementation).
  const testLabel = `books-cache-test-${(0, _uuid.v4)()}`;
  const variantKey = testLabel;
  const entitySchema = {
    type: "object",
    required: ["name"],
    properties: {
      name: {
        type: "string",
        description: "book name",
        primary: true
      },
      price: {
        type: "string",
        description: "book price"
      },
      description: {
        type: "string",
        description: "book description"
      }
    }
  };
  const extractionOptions = {
    itemEntityName: "book",
    label: testLabel,
    itemEntitySchema: entitySchema,
    strategy: {
      model: "claude-3-5-sonnet-20240620",
      type: "HTML"
    },
    variantKey,
    apiKey: process.env.ANTHROPIC_API_KEY
  };

  // Renders `html`, runs the extractor on `.books-list`, asserts the Result is
  // ok and returns the unwrapped list.
  const extractBooks = async html => {
    await page.setContent(html);
    const result = await (0, _dynamicListExtractor.dynamicListExtractor)(page, ".books-list", extractionOptions);
    (0, _extendedTest.expect)(result.isOk()).toBe(true);
    return result._unsafeUnwrap();
  };

  // Builds one expected field match.
  const directText = (matchText, matchXpath) => ({
    matchText,
    matchType: "direct-text",
    matchXpath
  });

  // Builds the expected entry for the book at 1-based `position` in the list.
  const expectedBook = (position, name, price, description) => {
    const bookInfoXpath = `html[1]/body[1]/div[1]/div[${position}]/div[1]`;
    return {
      name: directText(name, `${bookInfoXpath}/h1[1]`),
      price: directText(price, `${bookInfoXpath}/div[1]/span[1]`),
      description: directText(description, `${bookInfoXpath}/div[2]/p[1]`)
    };
  };

  // 1. Baseline extraction.
  const books = await extractBooks(booksTemplate);
  const expectedResult = [
    expectedBook(1, "The Great Gatsby", "$15.99", "A classic novel about the American Dream"),
    expectedBook(2, "1984", "$12.99", "A dystopian social science fiction novel"),
    expectedBook(3, "To Kill a Mockingbird", "$14.99", "A story of racial injustice and loss of innocence")
  ];
  (0, _extendedTest.expect)(books).toHaveLength(3);
  // toEqual on each entry already covers its price and description, so the
  // separate toHaveProperty checks from the original are not repeated.
  expectedResult.forEach((expected, index) => {
    (0, _extendedTest.expect)(books[index]).toEqual(expected);
  });

  // 2. A change that does not touch the list items.
  const irrelevantChangeTemplate = booksTemplate.replace('class="books-list"', 'class="books-list featured-books"');
  (0, _extendedTest.expect)(await extractBooks(irrelevantChangeTemplate)).toEqual(books);

  // 3. Text of the first book changes.
  const modifiedTemplate = booksTemplate.replace("The Great Gatsby", "The Great Gatsby (Deluxe Edition)").replace("$15.99", "$19.99");
  const modifiedBooks = await extractBooks(modifiedTemplate);
  (0, _extendedTest.expect)(modifiedBooks).not.toEqual(books);
  (0, _extendedTest.expect)(modifiedBooks[0]).toHaveProperty("name", directText("The Great Gatsby (Deluxe Edition)", "html[1]/body[1]/div[1]/div[1]/div[1]/h1[1]"));
  (0, _extendedTest.expect)(modifiedBooks[0]).toHaveProperty("price", directText("$19.99", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/span[1]"));

  // 4. A new sibling section is inserted ahead of the book entries.
  const insertedTemplate = `
<div class="books-list">
<div class="featured-section">
<h2>Featured Books</h2>
</div>
${booksTemplate.split('<div class="books-list">')[1]}
`;
  (0, _extendedTest.expect)(await extractBooks(insertedTemplate)).not.toEqual(modifiedBooks);

  // 5. A fourth book is spliced into the markup.
  const appendedTemplate = insertedTemplate.replace("</div>\n</div>", `</div>
<div class="book-page">
<div class="book-info">
<h1 class="book-title">Dune</h1>
<div class="price-container">
<span class="price">$16.99</span>
</div>
<div class="description">
<p class="book-description">A science fiction masterpiece</p>
</div>
</div>
</div>
</div>`);
  (0, _extendedTest.expect)(await extractBooks(appendedTemplate)).toHaveLength(4);

  // 6. Book-like markup outside `.books-list` must not be picked up.
  const outsideTemplate = appendedTemplate + `
<div class="outside-books-list">
<div class="book-page">
<div class="book-info">
<h1 class="book-title">Dune</h1>
</div>
</div>
</div>
`;
  (0, _extendedTest.expect)(await extractBooks(outsideTemplate)).toHaveLength(4);
  console.log("All cache behavior tests completed successfully!");
});
|
|
215
|
+
// Verifies the extractor's behavior when the extraction result is too large to
// cache: runAiExtraction is mocked to return 1000 verbose rows, and the test
// checks the warnings emitted through logger.warn on the first and second
// extraction.
//
// Changes from the original: every spy is restored in a `finally` block, so a
// failing assertion can no longer leave runAiExtraction mocked (or logger.warn
// spied on) for later tests. The original never restored the warn/info spies
// and only restored the others on the success path. The debug and info spies
// were never asserted on and have been removed.
(0, _extendedTest.test)("should handle cache size limit correctly", async ({
  page
}) => {
  const testLabel = `books-cache-size-test-${(0, _uuid.v4)()}`;
  const variantKey = testLabel;
  const simpleTemplate = `
<div class="books-list">
<div class="book-page">
<div class="book-info">
<h1 class="book-title">Test Book</h1>
<div class="price-container">
<span class="price">$15.99</span>
</div>
<div class="description">
<p class="book-description">A test book</p>
</div>
</div>
</div>
</div>
`;
  const entitySchema = {
    type: "object",
    required: ["name"],
    properties: {
      name: {
        type: "string",
        description: "book name",
        primary: true
      },
      price: {
        type: "string",
        description: "book price"
      },
      description: {
        type: "string",
        description: "book description"
      }
    }
  };
  const extractionOptions = {
    itemEntityName: "book",
    label: testLabel,
    itemEntitySchema: entitySchema,
    strategy: {
      model: "claude-3-5-sonnet-20240620",
      type: "HTML"
    },
    variantKey,
    apiKey: process.env.ANTHROPIC_API_KEY
  };

  // 1000 rows with deliberately long texts; per the texts themselves the
  // intent is to push the serialized result past the 380KB cache limit.
  const largeMockData = Array.from({
    length: 1000
  }, (_, i) => ({
    rowIndex: i,
    result: {
      name: {
        matchText: `Book Title ${i} - This is a very long book title with extensive descriptive text to make the cached data structure large enough to exceed the 380KB limit when we have many items like this in the response`,
        matchXpath: "/html/body/div[1]/div[1]/div[1]/h1",
        matchType: "all-text"
      },
      price: {
        matchText: `$${(15 + i % 50).toFixed(2)}`,
        matchXpath: "/html/body/div[1]/div[1]/div[1]/div[1]/span",
        matchType: "all-text"
      },
      description: {
        matchText: `A very detailed description of book ${i} with extensive content to make the cached data structure large enough to exceed the 380KB limit. This description contains multiple sentences and detailed information about the plot, characters, themes, and critical reception of the book. The description is intentionally verbose to increase the JSON size when serialized for caching purposes. Additional padding text to ensure we reach the size limit for testing cache overflow behavior.`,
        matchXpath: "/html/body/div[1]/div[1]/div[1]/div[2]/p",
        matchType: "all-text"
      }
    }
  }));

  // Compiled form of `await import("../runAiExtraction")`.
  const runAiExtractionModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../runAiExtraction")));
  const runAiExtractionSpy = _extendedTest.vi.spyOn(runAiExtractionModule, "runAiExtraction").mockResolvedValue((0, _neverthrow.ok)({
    resultValues: largeMockData,
    containerPath: "/html/body/div[1]",
    matches: new Map()
  }));
  const warnSpy = _extendedTest.vi.spyOn(_Logger.logger, "warn");

  try {
    await page.setContent(simpleTemplate);

    // First run: the oversized result is returned but reported as not cached.
    const firstResult = await (0, _dynamicListExtractor.dynamicListExtractor)(page, ".books-list", extractionOptions);
    (0, _extendedTest.expect)(firstResult.isOk()).toBe(true);
    const books = firstResult._unsafeUnwrap();
    (0, _extendedTest.expect)(books).toHaveLength(1000);
    (0, _extendedTest.expect)(warnSpy).toHaveBeenCalledWith(_extendedTest.expect.stringContaining("Results exceed cache limit"));
    (0, _extendedTest.expect)(warnSpy).toHaveBeenCalledWith(_extendedTest.expect.stringContaining("skipping caching"));

    // Second run on the same page: the non-cacheable warning is expected.
    const secondResult = await (0, _dynamicListExtractor.dynamicListExtractor)(page, ".books-list", extractionOptions);
    (0, _extendedTest.expect)(secondResult.isOk()).toBe(true);
    (0, _extendedTest.expect)(warnSpy).toHaveBeenCalledWith(_extendedTest.expect.stringContaining("exceeds cache limit and is not cacheable"));
    console.log("Cache size limit test completed successfully!");
  } finally {
    // Always undo the spies, even when an assertion above throws.
    warnSpy.mockRestore();
    runAiExtractionSpy.mockRestore();
  }
});
|
|
311
|
+
});
|
|
312
|
+
});
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _getListContainerXpath = require("../utils/getListContainerXpath");
|
|
4
|
+
var _vitest = require("vitest");
|
|
5
|
+
(0, _vitest.describe)("find xpaths to create an array", () => {
|
|
6
|
+
// The smallest group holds a single xpath (the `@title` attribute), so the
// expected result is that same attribute path for every list item, ordered by
// `li` index.
(0, _vitest.it)("basic case | 1 element in smallest", () => {
  const xpathGroups = [
    [
      "html[1]/li[2]/article[1]/h3[1]",
      "html[1]/li[2]/article[1]/h3[1]/a[1]/@title",
      "html[1]/li[2]/article[1]/h3[1]/a[1]/text()"
    ],
    [
      "html[1]/li[3]/article[1]/h3[1]/a[1]/@title",
      "html[1]/li[3]/article[1]/h3[1]/a[1]/@title"
    ],
    [
      "html[1]/li[1]/article[1]/h3[1]/a[1]/@title"
    ],
    [
      "html[1]/li[4]/article[1]/h3[1]",
      "html[1]/li[4]/article[1]/h3[1]/a[1]/@title",
      "html[1]/li[4]/article[1]/h3[1]/a[1]/@title",
      "html[1]/li[4]/article[1]/h3[1]/a[1]/text()"
    ]
  ];
  const expectedXpaths = [
    "html[1]/li[1]/article[1]/h3[1]/a[1]/@title",
    "html[1]/li[2]/article[1]/h3[1]/a[1]/@title",
    "html[1]/li[3]/article[1]/h3[1]/a[1]/@title",
    "html[1]/li[4]/article[1]/h3[1]/a[1]/@title"
  ];
  const actualXpaths = (0, _getListContainerXpath.findSetOfXpathsToCreateAnArrayExtractor)(xpathGroups);
  (0, _vitest.expect)(actualXpaths).toEqual(expectedXpaths);
});
|
|
10
|
+
// Every group contains the `@href` attribute path, and the smallest groups
// have two entries; the expected result is the `@href` path of each list item.
(0, _vitest.it)("basic case | 2 elements in smallest", () => {
  const xpathGroups = [
    [
      "html[1]/li[1]/article[1]/h3[1]",
      "html[1]/li[1]/article[1]/h3[1]/a[1]/@href"
    ],
    [
      "html[1]/li[2]/article[1]/h3[1]/a[1]/@href",
      "html[1]/li[2]/article[1]/h3[1]/a[1]/text()"
    ],
    [
      "html[1]/li[3]/article[1]/h3[1]/a[1]/@href",
      "html[1]/li[3]/article[1]/h3[1]/a[1]/text()",
      "html[1]/li[3]/article[1]/h3[1]/a[1]/@title"
    ]
  ];
  const expectedXpaths = [
    "html[1]/li[1]/article[1]/h3[1]/a[1]/@href",
    "html[1]/li[2]/article[1]/h3[1]/a[1]/@href",
    "html[1]/li[3]/article[1]/h3[1]/a[1]/@href"
  ];
  const actualXpaths = (0, _getListContainerXpath.findSetOfXpathsToCreateAnArrayExtractor)(xpathGroups);
  (0, _vitest.expect)(actualXpaths).toEqual(expectedXpaths);
});
|
|
14
|
+
// Both groups offer the same two candidates at different depths (`h3` and the
// `a` beneath it); the expected result is the deeper `a[1]` path per item.
(0, _vitest.it)("2 sets | 2 items different depth", () => {
  const xpathGroups = [
    [
      "html[1]/li[1]/article[1]/h3[1]",
      "html[1]/li[1]/article[1]/h3[1]/a[1]"
    ],
    [
      "html[1]/li[2]/article[1]/h3[1]",
      "html[1]/li[2]/article[1]/h3[1]/a[1]"
    ]
  ];
  const expectedXpaths = [
    "html[1]/li[1]/article[1]/h3[1]/a[1]",
    "html[1]/li[2]/article[1]/h3[1]/a[1]"
  ];
  const actualXpaths = (0, _getListContainerXpath.findSetOfXpathsToCreateAnArrayExtractor)(xpathGroups);
  (0, _vitest.expect)(actualXpaths).toEqual(expectedXpaths);
});
|
|
18
|
+
(0, _vitest.it)("Real world", () => {
|
|
19
|
+
const input = [["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[1]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[2]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[2]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[2]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[2]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[3]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[3]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[3]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[3]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[4]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[4]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[4]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[4]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[5]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[6]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[6]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[6]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[6]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[7]/article[1]/h3[1]/a[1]/@title"], 
["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[8]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[9]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[10]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[10]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[10]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[10]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[11]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[12]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[12]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[12]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[12]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[13]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[13]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[13]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[13]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[14]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[15]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[16]/article[1]/h3[1]/a[1]/@title"], 
["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[17]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[17]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[17]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[17]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[18]/article[1]/h3[1]/a[1]/@title"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[19]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[19]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[19]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[19]/article[1]/h3[1]/a[1]/text()"], ["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[20]/article[1]/h3[1]", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[20]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[20]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[20]/article[1]/h3[1]/a[1]/text()"]];
|
|
20
|
+
(0, _vitest.expect)((0, _getListContainerXpath.findSetOfXpathsToCreateAnArrayExtractor)(input)).toEqual(["html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[1]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[2]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[3]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[4]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[5]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[6]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[7]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[8]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[9]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[10]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[11]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[12]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[13]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[14]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[15]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[16]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[17]/article[1]/h3[1]/a[1]/@title", 
"html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[18]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[19]/article[1]/h3[1]/a[1]/@title", "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]/li[20]/article[1]/h3[1]/a[1]/@title"]);
|
|
21
|
+
});
|
|
22
|
+
});
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _getListContainerXpath = require("../utils/getListContainerXpath");
|
|
4
|
+
var _vitest = require("vitest");
|
|
5
|
+
// Table-driven checks for getContainerElement: every scenario feeds a list of
// XPaths to the helper and compares the returned container XPath (or null).
(0, _vitest.describe)("getContainerElement", () => {
  const { it, expect } = _vitest;

  // Builds ".../li[n]/article[1]/h3[1]/a[1]/@title" under the given list root.
  const titleXpath = (root, index) => `${root}/li[${index}]/article[1]/h3[1]/a[1]/@title`;
  const realListRoot = "html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/section[1]/div[2]/ol[1]";

  const scenarios = [
    {
      title: "basic case",
      xpaths: [1, 2, 3, 4].map(index => titleXpath("html[1]", index)),
      expected: "html[1]"
    },
    {
      title: "real case",
      xpaths: [1, 4, 7, 10, 13, 16, 19].map(index => titleXpath(realListRoot, index)),
      expected: realListRoot
    },
    {
      title: "empty array",
      xpaths: [],
      expected: null
    },
    {
      title: "no common parent",
      xpaths: ["html[1]/li[1]/article[1]/h3[1]/a[1]/@title", "html[2]/div[1]/span[1]/@text"],
      expected: null
    },
    {
      title: "should return the container element even if the list starts at 2",
      xpaths: [
        "html[1]/body[1]/main[1]/div[3]/div[1]/div[4]/div[2]",
        "html[1]/body[1]/main[1]/div[3]/div[1]/div[7]/div[2]",
        "html[1]/body[1]/main[1]/div[3]/div[1]/div[10]/div[2]"
      ],
      expected: "html[1]/body[1]/main[1]/div[3]/div[1]"
    }
  ];

  for (const { title, xpaths, expected } of scenarios) {
    it(title, () => {
      const outcome = expect((0, _getListContainerXpath.getContainerElement)(xpaths));
      // Null expectations keep using the dedicated matcher, as the original cases did.
      if (expected === null) {
        outcome.toBeNull();
      } else {
        outcome.toBe(expected);
      }
    });
  }
});
|
package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _getListContainerXpath = require("../utils/getListContainerXpath");
|
|
4
|
+
var _vitest = require("vitest");
|
|
5
|
+
// Table-driven checks for partOfSameArrayXpath: each row is a pair of XPaths
// and the boolean the helper is expected to return for that pair.
(0, _vitest.describe)("XPath Matcher", () => {
  const { it, expect } = _vitest;

  const scenarios = [
    {
      title: "basic case",
      pair: ["html[1]/li[11]/article[1]/h3[1]/a[1]/@href", "html[1]/li[22]/article[1]/h3[1]/a[1]/@href"],
      expected: true
    },
    {
      title: "different structure case",
      pair: ["html[1]/li[1]/article[1]/h3[1]/a[1]/@href", "html[1]/li[2]/article[1]/h3[1]/a[1]/text()"],
      expected: false
    },
    {
      title: "multiple numbers in a segment",
      pair: ["html[1]/li[3]/article[2]/h3[1]/a[1]/@href", "html[1]/li[3]/article[2]/h3[1]/a[2]/@href"],
      expected: true
    },
    {
      title: "two numeric differences",
      pair: ["html[1]/li[1]/article[1]/h3[1]/a[1]/@href", "html[1]/li[2]/article[2]/h3[1]/a[1]/@href"],
      expected: false
    },
    {
      title: "non-numeric differences",
      pair: ["html[1]/ul[1]/article[1]/h3[1]/a[1]/@href", "html[1]/li[1]/article[1]/h3[1]/a[1]/@href"],
      expected: false
    },
    {
      title: "one numeric difference, same structure",
      pair: ["html[1]/li[1]/article[1]/h3[1]/a[1]/@href", "html[1]/li[2]/article[1]/h3[1]/a[1]/@href"],
      expected: true
    },
    {
      title: "exact same xpath",
      pair: ["html[1]/li[1]/article[1]/h3[1]/a[1]/@href", "html[1]/li[1]/article[1]/h3[1]/a[1]/@href"],
      expected: false
    },
    {
      title: "non-numeric differences in numeric segment",
      pair: ["html[1]/li[1a]/article[1]/h3[1]/a[1]/@href", "html[1]/li[1b]/article[1]/h3[1]/a[1]/@href"],
      expected: false
    },
    {
      title: "different lengths",
      pair: ["html[1]/li[1]/article[1]/h3[1]/a[1]/@href", "html[1]/li[1]/article[1]/h3[1]/@href"],
      expected: false
    }
  ];

  for (const { title, pair, expected } of scenarios) {
    it(title, () => {
      const [first, second] = pair;
      expect((0, _getListContainerXpath.partOfSameArrayXpath)(first, second)).toBe(expected);
    });
  }
});
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
var _getListContainerXpath = require("../utils/getListContainerXpath");
|
|
4
|
+
var _vitest = require("vitest");
|
|
5
|
+
// Single sanity check: four sibling list-item XPaths that differ only in the
// li index are reported as belonging to the same array.
(0, _vitest.describe)("verifyThatAllXpathsArePartOfSameArray", () => {
  const { it, expect } = _vitest;

  it("basic case", () => {
    const siblingTitles = [
      "html[1]/li[1]/article[1]/h3[1]/a[1]/@title",
      "html[1]/li[2]/article[1]/h3[1]/a[1]/@title",
      "html[1]/li[3]/article[1]/h3[1]/a[1]/@title",
      "html[1]/li[4]/article[1]/h3[1]/a[1]/@title"
    ];
    const verdict = (0, _getListContainerXpath.verifyThatAllXpathsArePartOfSameArray)(siblingTitles);
    expect(verdict).toBe(true);
  });
});
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.dynamicListExtractor = dynamicListExtractor;
|
|
7
|
+
var _neverthrow = require("neverthrow");
|
|
8
|
+
var _captureSnapshot = require("../objectExtractionHelpers/captureSnapshot");
|
|
9
|
+
var _checksumUtils = require("../objectExtractionHelpers/checksumUtils");
|
|
10
|
+
var _runAiExtraction = require("./runAiExtraction");
|
|
11
|
+
var _validateOptions = require("./utils/validateOptions");
|
|
12
|
+
var _cache = require("../../intunedServices/cache");
|
|
13
|
+
var _xpathMapping = require("../../common/xpathMapping");
|
|
14
|
+
var _Logger = require("../../common/Logger");
|
|
15
|
+
/**
 * Counts the direct element children of the node addressed by `containerPath`.
 *
 * The lookup runs inside the page via `page.evaluate`; it yields 0 when the
 * path is falsy or when the XPath does not resolve to a node.
 *
 * @param {object} page - Page object exposing `evaluate(fn, arg)`.
 * @param {string | undefined | null} containerPath - XPath of the list container.
 * @returns {Promise<number>} Number of element children of the container.
 */
async function countContainerChildren(page, containerPath) {
  return page.evaluate(path => {
    if (!path) return 0;
    const containerResult = document.evaluate(path, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
    const containerElement = containerResult.singleNodeValue;
    if (!containerElement) return 0;
    return containerElement.children.length;
  }, containerPath);
}

/**
 * Extracts a list of items from the page, reusing a cached extraction when it
 * still matches the current page and falling back to AI extraction otherwise.
 *
 * Steps:
 *  1. Validate the options; a validation error result is returned as-is.
 *  2. Hash the extraction inputs to obtain the cache key.
 *  3. Capture a snapshot of the page/search region (only its error is used here).
 *  4. On a cache hit, return the cached result when the cached XPath mapping is
 *     still valid and the container has the expected number of children.
 *  5. Otherwise run the AI extraction, cache the outcome (or an
 *     `exceedsLimit` marker when the payload is too large) and return it.
 *
 * @param {object} page - Page to extract from; `page.evaluate` is called directly.
 * @param {string} identifier - Forwarded to option validation and AI extraction.
 * @param {object} options - Raw extractor options, validated before use.
 * @returns {Promise<object>} A neverthrow result: `ok(items)` or `err(error)`.
 */
async function dynamicListExtractor(page, identifier, options) {
  const inputValidation = await (0, _validateOptions.validateDynamicListExtractorOptions)(page, identifier, options);
  if (inputValidation.isErr()) {
    return inputValidation;
  }
  const {
    pageUrl,
    searchRegion,
    searchRegionHandler,
    itemEntityName,
    itemEntitySchema,
    variantKey,
    primaryProperty,
    hasSearchRegionContainer,
    searchRegionKey,
    strategy,
    prompt,
    apiKey
  } = inputValidation.value;

  // Everything that influences the extraction output takes part in the cache key.
  const extractorInputHash = (0, _checksumUtils.hashObject)({
    itemEntityName,
    itemEntitySchema,
    variantKey,
    currentPageUrl: pageUrl,
    strategy,
    searchRegionKey,
    prompt
  }, true);
  const pageAndSearchRegion = {
    page,
    searchRegion,
    searchRegionHandler
  };

  // The snapshot value itself is not used below; a failure aborts the extraction.
  const snapshot = await (0, _captureSnapshot.captureSnapshot)(pageAndSearchRegion);
  if (snapshot.isErr()) {
    return (0, _neverthrow.err)(snapshot.error);
  }

  _Logger.logger.info("Looking for value in the cache...");
  const cachedResult = await _cache.cache.get(extractorInputHash);
  if (cachedResult) {
    _Logger.logger.info("Found value in cache");
    if (cachedResult.exceedsLimit) {
      _Logger.logger.warn(`Cache key ${extractorInputHash} exceeds cache limit and is not cacheable`);
    } else {
      const isValid = await (0, _xpathMapping.validateXPathMapping)(page, cachedResult.matchesMapping, cachedResult.containerPath);
      if (isValid) {
        const currentChildrenCount = await countContainerChildren(page, cachedResult.containerPath);
        // The cache is fresh when the container still holds exactly the cached
        // items plus the children that were unrelated at caching time.
        if (currentChildrenCount - cachedResult.nonRelatedChildrenCount === cachedResult.result.length) {
          _Logger.logger.info("The values in the cache are the same as the current page, returning the cached result");
          return (0, _neverthrow.ok)(cachedResult.result);
        }
      }
      // Logged exactly once for both "mapping invalid" and "child count changed".
      _Logger.logger.info("The values in the cache are different from the current page, running AI extraction");
    }
  } else {
    _Logger.logger.info("No value found in the cache, running AI extraction");
  }

  const aiExtractionResult = await (0, _runAiExtraction.runAiExtraction)({
    pageAndSearchRegion,
    itemEntityName,
    itemEntitySchema,
    primaryProperty,
    hasSearchRegionContainer,
    strategy,
    identifier,
    prompt,
    examples: [],
    apiKey
  });
  if (aiExtractionResult.isErr()) {
    return (0, _neverthrow.err)(aiExtractionResult.error);
  }

  const xpathsMapping = buildXpathsMapping(aiExtractionResult.value);
  const resultsToReturn = getResultToReturn(aiExtractionResult.value.resultValues);
  const allContainerChildrenCount = await countContainerChildren(page, aiExtractionResult.value.containerPath);
  const nonRelatedChildrenCount = allContainerChildrenCount - resultsToReturn.length;
  const resultsToCache = {
    result: resultsToReturn,
    matchesMapping: xpathsMapping,
    containerPath: aiExtractionResult.value.containerPath,
    nonRelatedChildrenCount
  };

  // Measure the serialized payload in UTF-8 bytes so the comparison (and the
  // log message, which reports bytes) is accurate for non-ASCII content.
  const cacheDataSize = Buffer.byteLength(JSON.stringify(resultsToCache), "utf8");
  const CACHE_SIZE_LIMIT = 380 * 1024;
  if (cacheDataSize > CACHE_SIZE_LIMIT) {
    _Logger.logger.warn(`Results exceed cache limit (${cacheDataSize} bytes > ${CACHE_SIZE_LIMIT} bytes), skipping caching`);
    // Store a marker so later runs know this key is not cacheable.
    await _cache.cache.set(extractorInputHash, {
      exceedsLimit: true
    });
  } else {
    _Logger.logger.debug("Caching results...");
    await _cache.cache.set(extractorInputHash, resultsToCache);
    _Logger.logger.debug("Results cached");
  }
  return (0, _neverthrow.ok)(resultsToReturn);
}
|
|
128
|
+
/**
 * Groups the matched XPaths of an extraction by the text they matched.
 *
 * For every extracted value that carries a `matchXpath`, an entry
 * `{ xpath, matchType }` is appended under the value's `matchText`, where
 * `xpath` is the matched XPath with the first occurrence of
 * `"<containerPath>/"` removed. Values without a `matchXpath` are skipped.
 *
 * Entries are accumulated in a Map and converted with `Object.fromEntries`
 * so that page text such as "constructor", "toString" or "__proto__" is
 * stored as an ordinary own key instead of colliding with members inherited
 * from `Object.prototype`.
 *
 * @param {{ containerPath: string, resultValues: Array<{ result: object }> }} results
 *   Extraction output holding the container XPath and the per-item values.
 * @returns {Object<string, Array<{ xpath: string, matchType: * }>>}
 *   Plain object mapping each matched text to its XPath entries.
 */
function buildXpathsMapping(results) {
  const containerXpath = results.containerPath;
  const entriesByText = new Map();
  for (const result of results.resultValues) {
    for (const value of Object.values(result.result)) {
      const matchedXpath = value.matchXpath;
      if (!matchedXpath) {
        continue;
      }
      const xpathEntry = {
        xpath: matchedXpath.replace(containerXpath + "/", ""),
        matchType: value.matchType
      };
      // Normalize to the string a property key would have been coerced to.
      const textKey = String(value.matchText);
      const bucket = entriesByText.get(textKey);
      if (bucket) {
        bucket.push(xpathEntry);
      } else {
        entriesByText.set(textKey, [xpathEntry]);
      }
    }
  }
  return Object.fromEntries(entriesByText);
}
|
|
150
|
+
/**
 * Unwraps the extraction entries into the list handed back to the caller.
 *
 * @param {Array<{ result: * }>} extractionResult - Entries produced by the extraction.
 * @returns {Array<*>} The `result` property of every entry, in the same order.
 */
function getResultToReturn(extractionResult) {
  const pickResult = entry => entry.result;
  return extractionResult.map(pickResult);
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.requiredPropertyNotExtracted = exports.other = exports.invalidSearchRegion = exports.invalidList = exports.invalidInput = exports.invalidExtractionResult = exports.invalidAddressUrl = exports.insufficientAiCredits = void 0;
|
|
7
|
+
/**
 * Creates an error descriptor of type "Other".
 *
 * @param {*} context - Stored on the descriptor as `context`.
 * @param {*} error - Stored on the descriptor as `error`.
 * @returns {{ type: "Other", context: *, error: * }} The descriptor object.
 */
const other = (context, error) => {
  const descriptor = {
    type: "Other",
    context,
    error
  };
  return descriptor;
};
|
|
12
|
+
exports.other = other;
|
|
13
|
+
/**
 * Creates an error descriptor of type "InvalidSearchRegion".
 *
 * @returns {{ type: "InvalidSearchRegion" }} The descriptor object.
 */
const invalidSearchRegion = () => {
  return {
    type: "InvalidSearchRegion"
  };
};
|
|
16
|
+
exports.invalidSearchRegion = invalidSearchRegion;
|
|
17
|
+
/**
 * Creates an error descriptor of type "InvalidList" with a fixed explanatory
 * context message about the container path lookup failing.
 *
 * @returns {{ type: "InvalidList", context: string }} The descriptor object.
 */
const invalidList = () => {
  const context = "Failed to get container path, please try to choose another primary property and make sure all list items are direct child of a common container";
  return {
    type: "InvalidList",
    context
  };
};
|
|
21
|
+
exports.invalidList = invalidList;
|
|
22
|
+
/**
 * Creates an error descriptor of type "InvalidInput".
 *
 * @param {*} context - Stored on the descriptor as `context`.
 * @returns {{ type: "InvalidInput", context: * }} The descriptor object.
 */
const invalidInput = context => {
  return {
    type: "InvalidInput",
    context
  };
};
|
|
26
|
+
exports.invalidInput = invalidInput;
|
|
27
|
+
/**
 * Creates an error descriptor of type "InvalidExtractionResult".
 *
 * @param {*} context - Stored on the descriptor as `context`.
 * @returns {{ type: "InvalidExtractionResult", context: * }} The descriptor object.
 */
const invalidExtractionResult = context => {
  return {
    type: "InvalidExtractionResult",
    context
  };
};
|
|
31
|
+
exports.invalidExtractionResult = invalidExtractionResult;
|
|
32
|
+
/**
 * Creates an error descriptor of type "InvalidAddressUrl".
 *
 * @param {*} context - Stored on the descriptor as `context`.
 * @returns {{ type: "InvalidAddressUrl", context: * }} The descriptor object.
 */
const invalidAddressUrl = context => {
  return {
    type: "InvalidAddressUrl",
    context
  };
};
|
|
36
|
+
exports.invalidAddressUrl = invalidAddressUrl;
|
|
37
|
+
/**
 * Creates an error descriptor of type "RequiredPropertyNotExtracted".
 *
 * @param {*} context - Stored on the descriptor as `context`.
 * @returns {{ type: "RequiredPropertyNotExtracted", context: * }} The descriptor object.
 */
const requiredPropertyNotExtracted = context => {
  return {
    type: "RequiredPropertyNotExtracted",
    context
  };
};
|
|
41
|
+
exports.requiredPropertyNotExtracted = requiredPropertyNotExtracted;
|
|
42
|
+
/**
 * Creates an error descriptor of type "InsufficientAiCredits".
 *
 * @param {*} context - Stored on the descriptor as `context`.
 * @returns {{ type: "InsufficientAiCredits", context: * }} The descriptor object.
 */
const insufficientAiCredits = context => {
  return {
    type: "InsufficientAiCredits",
    context
  };
};
|
|
46
|
+
exports.insufficientAiCredits = insufficientAiCredits;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.getListMatches = getListMatches;
|
|
7
|
+
var _noEmpty = _interopRequireDefault(require("../../common/noEmpty"));
|
|
8
|
+
var _findDomMatches = require("../objectExtractionHelpers/findDomMatches");
|
|
9
|
+
/**
 * Module interop helper: returns `e` unchanged when it is truthy and flagged
 * with `__esModule`; otherwise wraps it as `{ default: e }`.
 */
function _interopRequireDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return {
    default: e
  };
}
|
|
10
|
+
/**
 * Collects every extracted value from the list results and looks up their DOM
 * matches.
 *
 * @param {object} page - Forwarded to `getDomMatches`.
 * @param {object} searchRegionHandler - Forwarded to `getDomMatches`.
 * @param {Array<{ result: object }>} results - Extraction entries whose
 *   `result` property values are gathered.
 * @returns {Promise<*>} Whatever `getDomMatches` resolves to for the gathered values.
 */
async function getListMatches(page, searchRegionHandler, results) {
  // One array of property values per extraction entry.
  const valuesPerEntry = results.map(entry => Object.values(entry.result));
  // NOTE(review): the noEmpty filter runs on the per-entry arrays, before
  // flattening, so it does not inspect individual values — confirm intended.
  const keptGroups = valuesPerEntry.filter(_noEmpty.default);
  const allExtractedValues = keptGroups.flat();
  return await (0, _findDomMatches.getDomMatches)(page, searchRegionHandler, allExtractedValues);
}
|