@intuned/browser-dev 2.2.3-unify-sdks.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.babelrc +21 -0
- package/.eslintignore +10 -0
- package/.eslintrc.js +39 -0
- package/LICENSE +43 -0
- package/dist/ai-extractors/AnthropicClient/index.js +23 -0
- package/dist/ai-extractors/export.d.js +5 -0
- package/dist/ai-extractors/export.d.ts +422 -0
- package/dist/ai-extractors/extractStructuredData.js +79 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/constants.js +7 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/errors.js +42 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingClaude.js +149 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingGoogle.js +37 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingOpenAi.js +144 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStrucutredDataUsingAiInstance.js +123 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/index.js +55 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/isItemTableHeaderOrFooter.js +96 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/screenshotHelpers.js +55 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/types.js +5 -0
- package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/utils.js +53 -0
- package/dist/ai-extractors/extractionHelpers/types.js +5 -0
- package/dist/ai-extractors/fileExtractors.js +176 -0
- package/dist/ai-extractors/index.js +31 -0
- package/dist/ai-extractors/jsonSchema.d.js +5 -0
- package/dist/ai-extractors/jsonSchema.d.ts +49 -0
- package/dist/ai-extractors/openAiClients/index.js +23 -0
- package/dist/ai-extractors/validators.js +239 -0
- package/dist/browser/ai/export.d.js +3 -0
- package/dist/browser/ai/export.d.ts +587 -0
- package/dist/browser/ai/extractMarkdown.js +15 -0
- package/dist/browser/ai/extractStructuredData.js +231 -0
- package/dist/browser/ai/extractStructuredDataUsingAi.js +140 -0
- package/dist/browser/ai/extractionHelpers/screenshotHelpers.js +55 -0
- package/dist/browser/ai/extractionHelpers/validateSchema.js +148 -0
- package/dist/browser/ai/index.d.ts +587 -0
- package/dist/browser/ai/index.js +19 -0
- package/dist/browser/ai/isPageLoaded.js +67 -0
- package/dist/browser/ai/prompt.js +39 -0
- package/dist/browser/ai/tests/testCheckAllTypesAreStrings.spec.js +143 -0
- package/dist/browser/ai/tests/testExtractStructuredData.spec.js +622 -0
- package/dist/browser/ai/tools/index.js +48 -0
- package/dist/browser/ai/types/errors.js +67 -0
- package/dist/browser/ai/types/models.js +45 -0
- package/dist/browser/ai/types/types.js +48 -0
- package/dist/browser/ai/validators.js +136 -0
- package/dist/common/Logger/index.js +60 -0
- package/dist/common/Logger/types.js +5 -0
- package/dist/common/SdkError.js +50 -0
- package/dist/common/aiModelsValidations.js +50 -0
- package/dist/common/browser_scripts.js +2596 -0
- package/dist/common/ensureBrowserScripts.js +17 -0
- package/dist/common/environmentVariables.js +16 -0
- package/dist/common/eventTracking/getAiTrackingHeaders.js +31 -0
- package/dist/common/eventTracking/getFileTrackingHeaders.js +23 -0
- package/dist/common/extendedTest.js +148 -0
- package/dist/common/extractionHelpers.js +19 -0
- package/dist/common/formatZodError.js +18 -0
- package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
- package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
- package/dist/common/fuzzySearch/utils.js +23 -0
- package/dist/common/getModelProvider.js +18 -0
- package/dist/common/getSimplifiedHtml.js +122 -0
- package/dist/common/hashObject.js +32 -0
- package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
- package/dist/common/html2markdown/index.js +19 -0
- package/dist/common/jwtTokenManager.js +18 -0
- package/dist/common/loadRuntime.js +16 -0
- package/dist/common/locatorHelpers.js +41 -0
- package/dist/common/matching/collectStrings.js +32 -0
- package/dist/common/matching/levenshtein.js +40 -0
- package/dist/common/matching/matching.js +317 -0
- package/dist/common/matching/types.js +1 -0
- package/dist/common/noEmpty.js +9 -0
- package/dist/common/saveSnapshotWithExamples.js +60 -0
- package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
- package/dist/common/xpathMapping.js +107 -0
- package/dist/helpers/downloadFile.js +125 -0
- package/dist/helpers/export.d.js +1 -0
- package/dist/helpers/export.d.ts +1294 -0
- package/dist/helpers/extractMarkdown.js +35 -0
- package/dist/helpers/filterEmptyValues.js +54 -0
- package/dist/helpers/gotoUrl.js +93 -0
- package/dist/helpers/index.d.ts +1294 -0
- package/dist/helpers/index.js +115 -0
- package/dist/helpers/processDate.js +25 -0
- package/dist/helpers/resolveUrl.js +63 -0
- package/dist/helpers/sanitizeHtml.js +73 -0
- package/dist/helpers/saveFileToS3.js +46 -0
- package/dist/helpers/scrollToLoadContent.js +50 -0
- package/dist/helpers/tests/extendedTest.js +130 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +197 -0
- package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
- package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
- package/dist/helpers/tests/testIsPageLoaded.spec.js +285 -0
- package/dist/helpers/tests/testProcessDate.spec.js +13 -0
- package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
- package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
- package/dist/helpers/tests/testSimplifyHtml.spec.js +251 -0
- package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +380 -0
- package/dist/helpers/tests/testWaitForDomSettled.spec.js +169 -0
- package/dist/helpers/tests/testWaitForNetworkIdle.spec.js +115 -0
- package/dist/helpers/types/Attachment.js +81 -0
- package/dist/helpers/types/CustomTypeRegistry.js +48 -0
- package/dist/helpers/types/RunEnvironment.js +18 -0
- package/dist/helpers/types/ValidationError.js +17 -0
- package/dist/helpers/types/index.js +51 -0
- package/dist/helpers/uploadFileToS3.js +153 -0
- package/dist/helpers/utils/getS3Client.js +21 -0
- package/dist/helpers/utils/index.js +73 -0
- package/dist/helpers/utils/isDownload.js +10 -0
- package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
- package/dist/helpers/utils/isLocator.js +9 -0
- package/dist/helpers/utils/jwtTokenManager.js +18 -0
- package/dist/helpers/validateDataUsingSchema.js +119 -0
- package/dist/helpers/waitForDomSettled.js +182 -0
- package/dist/helpers/waitForNetworkIdle.js +191 -0
- package/dist/index.d.js +82 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +84 -0
- package/dist/intunedServices/ApiGateway/aiApiGateway.js +87 -0
- package/dist/intunedServices/ApiGateway/factory.js +13 -0
- package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
- package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
- package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
- package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +221 -0
- package/dist/intunedServices/ApiGateway/types.js +11 -0
- package/dist/intunedServices/cache/cache.js +61 -0
- package/dist/intunedServices/cache/index.js +12 -0
- package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
- package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
- package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +149 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +145 -0
- package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
- package/dist/optimized-extractors/common/findTableHeaders.js +175 -0
- package/dist/optimized-extractors/common/index.js +55 -0
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +97 -0
- package/dist/optimized-extractors/common/matching/matching.js +212 -0
- package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
- package/dist/optimized-extractors/common/matching/types.js +18 -0
- package/dist/optimized-extractors/common/matching/utils.js +184 -0
- package/dist/optimized-extractors/common/utils.js +58 -0
- package/dist/optimized-extractors/export.d.js +5 -0
- package/dist/optimized-extractors/export.d.ts +397 -0
- package/dist/optimized-extractors/extractArray.js +120 -0
- package/dist/optimized-extractors/extractObject.js +104 -0
- package/dist/optimized-extractors/index.d.ts +397 -0
- package/dist/optimized-extractors/index.js +31 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
- package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
- package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
- package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
- package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
- package/dist/optimized-extractors/models/anthropicModel.js +23 -0
- package/dist/optimized-extractors/models/openaiModel.js +23 -0
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
- package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
- package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
- package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
- package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
- package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
- package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
- package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
- package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
- package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
- package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
- package/dist/optimized-extractors/types/errors.js +42 -0
- package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
- package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
- package/dist/optimized-extractors/types/types.js +5 -0
- package/dist/optimized-extractors/validators.js +152 -0
- package/dist/vite-env.d.js +1 -0
- package/dist/vite-env.d.ts +9 -0
- package/docs.md +14 -0
- package/how-to-run-tests.md +10 -0
- package/intuned-runtime-setup.md +13 -0
- package/package.json +124 -0
- package/tsconfig.eslint.json +5 -0
- package/tsconfig.json +26 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.MatchSource = exports.MatchMode = void 0;
|
|
7
|
+
// String identifiers for the place a match was taken from:
// "attribute", "text_content" or "direct_text_node".
let MatchSource = exports.MatchSource = {
  ATTRIBUTE: "attribute",
  TEXT_CONTENT: "text_content",
  DIRECT_TEXT_NODE: "direct_text_node"
};
|
|
13
|
+
// String identifiers for how a match was made: "full", "partial" or "fuzzy".
let MatchMode = exports.MatchMode = {
  FULL: "full",
  PARTIAL: "partial",
  FUZZY: "fuzzy"
};
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.calculateMaxLDist = calculateMaxLDist;
|
|
7
|
+
exports.findClosestMatch = findClosestMatch;
|
|
8
|
+
exports.getElementXPath = getElementXPath;
|
|
9
|
+
exports.hasNonFuzzyOrCloseFuzzyMatch = hasNonFuzzyOrCloseFuzzyMatch;
|
|
10
|
+
exports.isFuzzMatch = isFuzzMatch;
|
|
11
|
+
exports.isMatchExact = isMatchExact;
|
|
12
|
+
exports.isPartOfString = isPartOfString;
|
|
13
|
+
exports.normalizeSpacing = normalizeSpacing;
|
|
14
|
+
exports.rankMatch = rankMatch;
|
|
15
|
+
exports.removePunctuationAndSpaces = removePunctuationAndSpaces;
|
|
16
|
+
exports.selectBestMatch = selectBestMatch;
|
|
17
|
+
exports.traverseAndPrune = traverseAndPrune;
|
|
18
|
+
var _types = require("./types");
|
|
19
|
+
var _levenshteinSearch = require("../../../common/fuzzySearch/levenshtein-search");
|
|
20
|
+
function findClosestMatch(searchTerm, content, maxLDist) {
|
|
21
|
+
const results = [];
|
|
22
|
+
for (const result of (0, _levenshteinSearch.fuzzySearch)(searchTerm, content, maxLDist)) {
|
|
23
|
+
results.push(result);
|
|
24
|
+
}
|
|
25
|
+
results.sort((a, b) => {
|
|
26
|
+
if (a.dist === b.dist) {
|
|
27
|
+
return b.end - b.start - (a.end - a.start);
|
|
28
|
+
}
|
|
29
|
+
return a.dist - b.dist;
|
|
30
|
+
});
|
|
31
|
+
return results[0];
|
|
32
|
+
}
|
|
33
|
+
function normalizeSpacing(text) {
|
|
34
|
+
if (!text) {
|
|
35
|
+
return "";
|
|
36
|
+
}
|
|
37
|
+
let normalized = text.replace(/\n/g, " ").replace(/\t/g, " ");
|
|
38
|
+
normalized = normalized.split(/\s+/).join(" ");
|
|
39
|
+
return normalized.trim();
|
|
40
|
+
}
|
|
41
|
+
function isMatchExact(data, value) {
|
|
42
|
+
if (!data || !value) {
|
|
43
|
+
return [false, null];
|
|
44
|
+
}
|
|
45
|
+
const normalizedData = normalizeSpacing(data);
|
|
46
|
+
const normalizedValue = normalizeSpacing(value);
|
|
47
|
+
return [normalizedData === normalizedValue, normalizedValue];
|
|
48
|
+
}
|
|
49
|
+
function calculateMaxLDist(value) {
|
|
50
|
+
const length = value.length;
|
|
51
|
+
const Pmax = 0.2;
|
|
52
|
+
const Pmin = 0.05;
|
|
53
|
+
const lengthAtPmax = 10;
|
|
54
|
+
let percentage;
|
|
55
|
+
if (length <= lengthAtPmax) {
|
|
56
|
+
percentage = Pmax;
|
|
57
|
+
} else {
|
|
58
|
+
const k = -Math.log(Pmin / Pmax) / (600 - lengthAtPmax);
|
|
59
|
+
percentage = Pmax * Math.exp(-k * (length - lengthAtPmax));
|
|
60
|
+
}
|
|
61
|
+
percentage = Math.max(Pmin, percentage);
|
|
62
|
+
return Math.max(1, Math.floor(length * percentage));
|
|
63
|
+
}
|
|
64
|
+
function isFuzzMatch(searchTerm, content) {
|
|
65
|
+
if (!searchTerm || !content) {
|
|
66
|
+
return {
|
|
67
|
+
found: false,
|
|
68
|
+
matchedValue: null,
|
|
69
|
+
distance: null,
|
|
70
|
+
matchedSourceValue: null
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
const maxLDist = calculateMaxLDist(searchTerm);
|
|
74
|
+
const normalizedSearchTerm = normalizeSpacing(searchTerm);
|
|
75
|
+
const normalizedContent = normalizeSpacing(content);
|
|
76
|
+
const match = findClosestMatch(normalizedSearchTerm.toLowerCase(), normalizedContent.toLowerCase(), maxLDist);
|
|
77
|
+
if (!match) {
|
|
78
|
+
return {
|
|
79
|
+
found: false,
|
|
80
|
+
matchedValue: null,
|
|
81
|
+
distance: null,
|
|
82
|
+
matchedSourceValue: null
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
return {
|
|
86
|
+
found: true,
|
|
87
|
+
matchedValue: normalizedContent.slice(match.start, match.end),
|
|
88
|
+
matchedSourceValue: normalizedContent,
|
|
89
|
+
distance: match.dist
|
|
90
|
+
};
|
|
91
|
+
}
|
|
92
|
+
function hasNonFuzzyOrCloseFuzzyMatch(matches) {
|
|
93
|
+
const hasNonFuzzyMatch = matches.some(match => match.match_mode !== _types.MatchMode.FUZZY);
|
|
94
|
+
const hasVeryCloseFuzzyMatch = matches.some(match => match.match_mode === _types.MatchMode.FUZZY && match.fuzzy_distance && match.fuzzy_distance < 5);
|
|
95
|
+
return hasNonFuzzyMatch || hasVeryCloseFuzzyMatch;
|
|
96
|
+
}
|
|
97
|
+
function selectBestMatch(original, matches) {
|
|
98
|
+
const exactMatches = matches.filter(match => match.matchText !== _types.MatchMode.FUZZY);
|
|
99
|
+
if (exactMatches.length > 0) {
|
|
100
|
+
const fullExactMatch = exactMatches.find(i => i.exact);
|
|
101
|
+
const bestMatch = fullExactMatch ?? exactMatches[0];
|
|
102
|
+
return {
|
|
103
|
+
matchText: bestMatch.matchText,
|
|
104
|
+
matchXpath: bestMatch.matchXpath,
|
|
105
|
+
matchType: bestMatch.matchType
|
|
106
|
+
};
|
|
107
|
+
}
|
|
108
|
+
const fuzzyMatches = matches.filter(match => match.isFuzzy).map(match => [match, rankMatch(original, match)]).filter(([_, rank]) => rank === "HIGH");
|
|
109
|
+
if (fuzzyMatches.length > 0) {
|
|
110
|
+
fuzzyMatches.sort((a, b) => a[0].fuzzyDistance - b[0].fuzzyDistance);
|
|
111
|
+
const bestMatch = fuzzyMatches[0][0];
|
|
112
|
+
return {
|
|
113
|
+
matchText: bestMatch.matchText,
|
|
114
|
+
matchXpath: bestMatch.matchXpath,
|
|
115
|
+
matchType: bestMatch.matchType
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
return null;
|
|
119
|
+
}
|
|
120
|
+
function getElementXPath(element) {
|
|
121
|
+
if (!element || !element.parentNode || element.nodeName === "#document") {
|
|
122
|
+
return null;
|
|
123
|
+
}
|
|
124
|
+
let siblingsCount = 1;
|
|
125
|
+
const parent = element.parentNode;
|
|
126
|
+
const nodeName = element.nodeName.toLowerCase();
|
|
127
|
+
const siblings = Array.from(parent.childNodes).filter(node => node.nodeType === 1);
|
|
128
|
+
for (const sibling of siblings) {
|
|
129
|
+
if (sibling === element) {
|
|
130
|
+
break;
|
|
131
|
+
}
|
|
132
|
+
if (sibling.nodeName.toLowerCase() === nodeName) {
|
|
133
|
+
siblingsCount++;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
const parentXPath = getElementXPath(parent);
|
|
137
|
+
if (element.nodeName === "#text") {
|
|
138
|
+
return parentXPath;
|
|
139
|
+
}
|
|
140
|
+
return parentXPath ? `${parentXPath}/${nodeName}[${siblingsCount}]` : `${nodeName}[${siblingsCount}]`;
|
|
141
|
+
}
|
|
142
|
+
function traverseAndPrune(node, conditionFunc) {
|
|
143
|
+
const children = Array.from(node.children ?? []);
|
|
144
|
+
children.forEach(child => {
|
|
145
|
+
if (child.children) {
|
|
146
|
+
if (!conditionFunc(child)) {
|
|
147
|
+
traverseAndPrune(child, conditionFunc);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
});
|
|
151
|
+
}
|
|
152
|
+
function isPartOfString(input, dom) {
|
|
153
|
+
if (!input || !dom) {
|
|
154
|
+
return [false, null, null];
|
|
155
|
+
}
|
|
156
|
+
const normalizedInput = normalizeSpacing(input);
|
|
157
|
+
const normalizedDom = normalizeSpacing(dom);
|
|
158
|
+
const matchIndex = normalizedDom.toLowerCase().indexOf(normalizedInput.toLowerCase());
|
|
159
|
+
const matchedText = matchIndex !== -1 ? normalizedDom.substring(matchIndex, matchIndex + normalizedInput.length) : null;
|
|
160
|
+
return [matchIndex !== -1, matchedText, normalizedDom];
|
|
161
|
+
}
|
|
162
|
+
function removePunctuationAndSpaces(s) {
|
|
163
|
+
return s.replace(/[^\w\s]|_/g, "").replace(/\s+/g, "");
|
|
164
|
+
}
|
|
165
|
+
function similarityRatio(s1, s2, distance) {
|
|
166
|
+
const maxLength = Math.max(s1.length, s2.length);
|
|
167
|
+
return 1 - distance / maxLength;
|
|
168
|
+
}
|
|
169
|
+
function rankMatch(original, match) {
|
|
170
|
+
if (!original || !match.matchText) {
|
|
171
|
+
return "LOW";
|
|
172
|
+
}
|
|
173
|
+
const normalizedOriginal = normalizeSpacing(original).toLowerCase();
|
|
174
|
+
const normalizedMatch = normalizeSpacing(match.matchText).toLowerCase();
|
|
175
|
+
const ratio = similarityRatio(original, match.matchText, match.fuzzyDistance);
|
|
176
|
+
const lenOriginal = normalizedOriginal.length;
|
|
177
|
+
if (lenOriginal > 20 && ratio > 0.85) {
|
|
178
|
+
return "HIGH";
|
|
179
|
+
}
|
|
180
|
+
if (removePunctuationAndSpaces(normalizedOriginal) === removePunctuationAndSpaces(normalizedMatch)) {
|
|
181
|
+
return "HIGH";
|
|
182
|
+
}
|
|
183
|
+
return "LOW";
|
|
184
|
+
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.cleanupAiResult = cleanupAiResult;
|
|
7
|
+
exports.getRandomItems = getRandomItems;
|
|
8
|
+
exports.getResultFromOutputSchema = getResultFromOutputSchema;
|
|
9
|
+
exports.processInputSchema = processInputSchema;
|
|
10
|
+
function getRandomItems(arr, numItems) {
|
|
11
|
+
const shuffled = arr.sort(() => 0.5 - Math.random());
|
|
12
|
+
return shuffled.slice(0, numItems);
|
|
13
|
+
}
|
|
14
|
+
function processInputSchema(originalJsonSchema, entityName) {
|
|
15
|
+
const internalSchema = structuredClone(originalJsonSchema);
|
|
16
|
+
delete internalSchema.description;
|
|
17
|
+
if (originalJsonSchema.type === "array") {
|
|
18
|
+
return {
|
|
19
|
+
type: "object",
|
|
20
|
+
properties: {
|
|
21
|
+
[`number_of_${entityName}`]: {
|
|
22
|
+
type: "number",
|
|
23
|
+
description: `The number of ${entityName} items in the text - not the overall total. Relay on the text to find this, if the number is not mentioned in the text, this should be null. For example, some lists say 'showing 5 our of 20 items' - 5 is the number of items in the list.`
|
|
24
|
+
},
|
|
25
|
+
[`${entityName}`]: internalSchema
|
|
26
|
+
},
|
|
27
|
+
required: [`number_of_${entityName}`, `${entityName}`]
|
|
28
|
+
};
|
|
29
|
+
}
|
|
30
|
+
return internalSchema;
|
|
31
|
+
}
|
|
32
|
+
function getResultFromOutputSchema(originalJsonSchema, entityName, parsedData) {
|
|
33
|
+
if (originalJsonSchema.type === "array") {
|
|
34
|
+
const items = parsedData[entityName] ?? [];
|
|
35
|
+
return items;
|
|
36
|
+
}
|
|
37
|
+
return parsedData;
|
|
38
|
+
}
|
|
39
|
+
function cleanUpAiReturnedString(str) {
|
|
40
|
+
return str.replaceAll(/&/g, "&");
|
|
41
|
+
}
|
|
42
|
+
function cleanupAiResult(obj) {
|
|
43
|
+
if (typeof obj === "string") {
|
|
44
|
+
return cleanUpAiReturnedString(obj);
|
|
45
|
+
}
|
|
46
|
+
if (typeof obj !== "object" || obj === null) {
|
|
47
|
+
return obj;
|
|
48
|
+
}
|
|
49
|
+
if (Array.isArray(obj)) {
|
|
50
|
+
return obj.map(cleanupAiResult);
|
|
51
|
+
}
|
|
52
|
+
return Object.entries(obj).reduce((acc, [key, value]) => {
|
|
53
|
+
if (value !== null && value !== undefined && value !== "") {
|
|
54
|
+
acc[key] = cleanupAiResult(value);
|
|
55
|
+
}
|
|
56
|
+
return acc;
|
|
57
|
+
}, {});
|
|
58
|
+
}
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
import { Locator, Page } from "playwright-core";
|
|
2
|
+
import { BasicSchema } from "./types/jsonSchema";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* this strategy will use a screenshot of the page/locator with some processing to extract the needed data.
|
|
6
|
+
* should be used when the information you're trying to extract is not present in the dom as a text but can be identified visually.
|
|
7
|
+
* @interface
|
|
8
|
+
* @property model - the model to use in the extraction process.
|
|
9
|
+
* @property type - the type of the strategy
|
|
10
|
+
*/
|
|
11
|
+
export interface ImageStrategy {
|
|
12
|
+
model:
|
|
13
|
+
| "claude-3-haiku"
|
|
14
|
+
| "claude-3-haiku-20240307"
|
|
15
|
+
| "claude-3.5-sonnet"
|
|
16
|
+
| "claude-3-5-sonnet-20240620"
|
|
17
|
+
| "claude-3-5-sonnet-20241022"
|
|
18
|
+
| "claude-opus-4"
|
|
19
|
+
| "claude-opus-4-20250514"
|
|
20
|
+
| "claude-sonnet-4"
|
|
21
|
+
| "claude-sonnet-4-20250514"
|
|
22
|
+
| "gpt4-turbo"
|
|
23
|
+
| "gpt-4-turbo-2024-04-09"
|
|
24
|
+
| "gpt-4o"
|
|
25
|
+
| "gpt-4o-2024-05-13"
|
|
26
|
+
| "gpt-4o-mini"
|
|
27
|
+
| "gpt-4o-mini-2024-07-18"
|
|
28
|
+
| "gemini-1.5-pro"
|
|
29
|
+
| "gemini-1.5-pro-002"
|
|
30
|
+
| "gemini-1.5-flash-8b"
|
|
31
|
+
| "gemini-1.5-flash-8b-002"
|
|
32
|
+
| "gemini-1.5-flash"
|
|
33
|
+
| "gemini-1.5-flash-002"
|
|
34
|
+
| "gemini-2.0-flash-exp";
|
|
35
|
+
type: "IMAGE";
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* this strategy will use the html of the page/locator to extract the needed data. we filter out some of the attributes to reduce context.
|
|
39
|
+
* The attributes included are only: `aria-label`, `data-name`, `name`, `type`, `placeholder`, `value`, `role`, `title`, `href`, `id`, and `alt`.
|
|
40
|
+
*
|
|
41
|
+
* @interface
|
|
42
|
+
* @property model - the model to use in the extraction process
|
|
43
|
+
* @property type - the type of the strategy
|
|
44
|
+
*/
|
|
45
|
+
export interface HtmlStrategy {
|
|
46
|
+
model:
|
|
47
|
+
| "claude-3-haiku"
|
|
48
|
+
| "claude-3-haiku-20240307"
|
|
49
|
+
| "claude-3-5-haiku"
|
|
50
|
+
| "claude-3-5-haiku-20241022"
|
|
51
|
+
| "claude-3.5-sonnet"
|
|
52
|
+
| "claude-3-5-sonnet-20240620"
|
|
53
|
+
| "claude-3-5-sonnet-20241022"
|
|
54
|
+
| "claude-opus-4"
|
|
55
|
+
| "claude-opus-4-20250514"
|
|
56
|
+
| "claude-sonnet-4"
|
|
57
|
+
| "claude-sonnet-4-20250514"
|
|
58
|
+
| "gpt4-turbo"
|
|
59
|
+
| "gpt-4-turbo-2024-04-09"
|
|
60
|
+
| "gpt3.5-turbo"
|
|
61
|
+
| "gpt-3.5-turbo-0125"
|
|
62
|
+
| "gpt-4o"
|
|
63
|
+
| "gpt-4o-2024-05-13"
|
|
64
|
+
| "gpt-4o-mini"
|
|
65
|
+
| "gpt-4o-mini-2024-07-18"
|
|
66
|
+
| "gemini-1.5-pro"
|
|
67
|
+
| "gemini-1.5-pro-002"
|
|
68
|
+
| "gemini-1.5-flash-8b"
|
|
69
|
+
| "gemini-1.5-flash-8b-002"
|
|
70
|
+
| "gemini-1.5-flash"
|
|
71
|
+
| "gemini-1.5-flash-002"
|
|
72
|
+
| "gemini-2.0-flash-exp";
|
|
73
|
+
type: "HTML";
|
|
74
|
+
}
|
|
75
|
+
/**
|
|
76
|
+
* Extracts an array of structured data from a web page in an optimized way. This function will use AI for the first n times, until it collects multiple examples,
|
|
77
|
+
* then it will build reliable selectors in the background to make the process more efficient
|
|
78
|
+
*
|
|
79
|
+
* @param page - The Playwright Page object from which to extract the data.
|
|
80
|
+
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
81
|
+
* @param options.itemEntityName - The name of the entity items being extracted, it must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
82
|
+
* @param options.itemEntitySchema - The schema of the entity items being extracted.
|
|
83
|
+
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
84
|
+
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
85
|
+
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
86
|
+
* @param options.variantKey - Optional. A variant key for the extraction process, use this when the page has multiple variants/shapes.
|
|
87
|
+
* @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
|
|
88
|
+
* @returns A promise that resolves to a list of extracted data.
|
|
89
|
+
*
|
|
90
|
+
* @example
|
|
91
|
+
* ```typescript extractArrayFromPage
|
|
92
|
+
* import { extractArrayFromPage } from "@intuned/sdk/optimized-extractors";
|
|
93
|
+
*
|
|
94
|
+
* await page.goto("https://books.toscrape.com/")
|
|
95
|
+
* const books = await extractArrayFromPage(page,
|
|
96
|
+
* {
|
|
97
|
+
* strategy: {
|
|
98
|
+
* model: "gpt4-turbo",
|
|
99
|
+
* type: "HTML"
|
|
100
|
+
* },
|
|
101
|
+
* itemEntityName: "book",
|
|
102
|
+
* label: "books-extraction",
|
|
103
|
+
* itemEntitySchema: {
|
|
104
|
+
* type: "object",
|
|
105
|
+
* required: ["name"],
|
|
106
|
+
* properties: {
|
|
107
|
+
* name: {
|
|
108
|
+
* type: "string",
|
|
109
|
+
* description: "book name",
|
|
110
|
+
* primary: true
|
|
111
|
+
* }
|
|
112
|
+
* }
|
|
113
|
+
* }
|
|
114
|
+
* },
|
|
115
|
+
* )
|
|
116
|
+
*
|
|
117
|
+
* console.log(books)
|
|
118
|
+
*
|
|
119
|
+
* // output:
|
|
120
|
+
* // [
|
|
121
|
+
* // ...
|
|
122
|
+
* // { name: 'Olio' },
|
|
123
|
+
* // { name: 'Mesaerion: The Best Science Fiction Stories 1800-1849' },
|
|
124
|
+
* // { name: 'Libertarianism for Beginners' },
|
|
125
|
+
* // { name: "It's Only the Himalayas" }
|
|
126
|
+
* // ...
|
|
127
|
+
* // ]
|
|
128
|
+
*
|
|
129
|
+
* ```
|
|
130
|
+
*/
|
|
131
|
+
export declare function extractArrayFromPage(
|
|
132
|
+
page: Page,
|
|
133
|
+
options: {
|
|
134
|
+
label: string;
|
|
135
|
+
itemEntityName: string;
|
|
136
|
+
itemEntitySchema: SimpleArrayItemSchema;
|
|
137
|
+
strategy?: ImageStrategy | HtmlStrategy;
|
|
138
|
+
prompt?: string;
|
|
139
|
+
optionalPropertiesInvalidator?: (
|
|
140
|
+
result: Record<string, string>[]
|
|
141
|
+
) => string[];
|
|
142
|
+
variantKey?: string;
|
|
143
|
+
apiKey?: string;
|
|
144
|
+
}
|
|
145
|
+
): Promise<Record<string, string>[]>;
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* Extracts an array of structured data from a locator.
|
|
149
|
+
*
|
|
150
|
+
* @param locator - The Playwright Locator object from which to extract the data.
|
|
151
|
+
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
152
|
+
* @param options.itemEntityName - The name of the entity items being extracted. it must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
153
|
+
* @param options.itemEntitySchema - The schema of the entity items being extracted.
|
|
154
|
+
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
155
|
+
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
156
|
+
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
157
|
+
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
158
|
+
* @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
|
|
159
|
+
* @returns A promise that resolves to a list of extracted data.
|
|
160
|
+
*
|
|
161
|
+
* @example
|
|
162
|
+
* ```typescript extractArrayFromLocator
|
|
163
|
+
* import { extractArrayFromLocator } from "@intuned/sdk/optimized-extractors";
|
|
164
|
+
*
|
|
165
|
+
* await page.goto("https://books.toscrape.com/")
|
|
166
|
+
* const books = await extractArrayFromLocator(page.locator("section"),
|
|
167
|
+
* {
|
|
168
|
+
* itemEntityName: "book",
|
|
169
|
+
* label: "books-extraction",
|
|
170
|
+
* itemEntitySchema: {
|
|
171
|
+
* type: "object",
|
|
172
|
+
* required: ["name"],
|
|
173
|
+
* properties: {
|
|
174
|
+
* name: {
|
|
175
|
+
* type: "string",
|
|
176
|
+
* description: "book name",
|
|
177
|
+
* primary: true
|
|
178
|
+
* }
|
|
179
|
+
* }
|
|
180
|
+
* }
|
|
181
|
+
* },
|
|
182
|
+
* )
|
|
183
|
+
*
|
|
184
|
+
* console.log(books)
|
|
185
|
+
*
|
|
186
|
+
* // output:
|
|
187
|
+
* // [
|
|
188
|
+
* // ...
|
|
189
|
+
* // { name: 'Olio' },
|
|
190
|
+
* // { name: 'Mesaerion: The Best Science Fiction Stories 1800-1849' },
|
|
191
|
+
* // { name: 'Libertarianism for Beginners' },
|
|
192
|
+
* // { name: "It's Only the Himalayas" }
|
|
193
|
+
* // ...
|
|
194
|
+
* // ]
|
|
195
|
+
*
|
|
196
|
+
* ```
|
|
197
|
+
*/
|
|
198
|
+
export declare function extractArrayFromLocator(
|
|
199
|
+
locator: Locator,
|
|
200
|
+
options: {
|
|
201
|
+
label: string;
|
|
202
|
+
itemEntityName: string;
|
|
203
|
+
itemEntitySchema: SimpleArrayItemSchema;
|
|
204
|
+
strategy?: ImageStrategy | HtmlStrategy;
|
|
205
|
+
prompt?: string;
|
|
206
|
+
optionalPropertiesInvalidator?: (
|
|
207
|
+
result: Record<string, string>[]
|
|
208
|
+
) => string[];
|
|
209
|
+
variantKey?: string;
|
|
210
|
+
apiKey?: string;
|
|
211
|
+
}
|
|
212
|
+
): Promise<Record<string, string>[]>;
|
|
213
|
+
|
|
214
|
+
/**
|
|
215
|
+
* The schema of a single string-typed property inside a simple object schema.
|
|
216
|
+
* @interface SimpleObjectStringSchema
|
|
217
|
+
* @extends BasicSchema
|
|
218
|
+
* @property type - The type of the schema, which is always "string".
|
|
219
|
+
*/
|
|
220
|
+
interface SimpleObjectStringSchema extends BasicSchema {
|
|
221
|
+
type: "string";
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/**
|
|
225
|
+
* The schema of a single string-typed property inside a simple array item schema.
|
|
226
|
+
* @interface SimpleArrayStringSchema
|
|
227
|
+
* @extends BasicSchema
|
|
228
|
+
* @property type - The type of the schema, which is always "string".
|
|
229
|
+
* @property [primary] - Optional. Indicates whether this is a primary property.
|
|
230
|
+
*/
|
|
231
|
+
interface SimpleArrayStringSchema extends BasicSchema {
|
|
232
|
+
type: "string";
|
|
233
|
+
primary?: boolean;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
/**
|
|
237
|
+
* A simple object schema with properties.
|
|
238
|
+
* @interface SimpleObjectSchema
|
|
239
|
+
* @extends BasicSchema
|
|
240
|
+
* @property type - The type of the schema, which is always "object".
|
|
241
|
+
* @property properties - The properties of the object.
|
|
242
|
+
* @property required - The required properties of the object.
|
|
243
|
+
*/
|
|
244
|
+
export interface SimpleObjectSchema extends BasicSchema {
|
|
245
|
+
type: "object";
|
|
246
|
+
properties: Record<string, SimpleObjectStringSchema>;
|
|
247
|
+
required: string[];
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/**
|
|
251
|
+
* A simple array item schema with properties.
|
|
252
|
+
* @interface SimpleArrayItemSchema
|
|
253
|
+
* @extends BasicSchema
|
|
254
|
+
* @property type - The type of the schema, which is always "object".
|
|
255
|
+
* @property properties - The properties of the array item.
|
|
256
|
+
* @property required - The required properties of the array item.
|
|
257
|
+
*/
|
|
258
|
+
export interface SimpleArrayItemSchema extends BasicSchema {
|
|
259
|
+
type: "object";
|
|
260
|
+
properties: Record<string, SimpleArrayStringSchema>;
|
|
261
|
+
required: string[];
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
/**
|
|
265
|
+
* Extracts a structured object from a web page.
|
|
266
|
+
*
|
|
267
|
+
* @param page - The Playwright Page object from which to extract the data.
|
|
268
|
+
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
269
|
+
* @param options.entityName - The name of the entity being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
270
|
+
* @param options.entitySchema - The schema of the entity being extracted.
|
|
271
|
+
* @param options.strategy - Optional. The strategy to use for extraction. If not provided, the HTML strategy with Claude Haiku will be used.
|
|
272
|
+
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
273
|
+
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
274
|
+
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
275
|
+
* @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
|
|
276
|
+
* @returns A promise that resolves to the extracted object.
|
|
277
|
+
* @example
|
|
278
|
+
* ```typescript extractObjectFromPage
|
|
279
|
+
* import { extractObjectFromPage } from "@intuned/sdk/optimized-extractors";
|
|
280
|
+
*
|
|
281
|
+
* await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
|
|
282
|
+
* const book = await extractObjectFromPage(page,
|
|
283
|
+
* {
|
|
284
|
+
* entityName: "book",
|
|
285
|
+
* label: "book-extraction",
|
|
286
|
+
* entitySchema: {
|
|
287
|
+
* type: "object",
|
|
288
|
+
* required: ["name","price","reviews"],
|
|
289
|
+
* properties: {
|
|
290
|
+
* name: {
|
|
291
|
+
* type: "string",
|
|
292
|
+
* description: "book name",
|
|
293
|
+
* },
|
|
294
|
+
* price: {
|
|
295
|
+
* type: "string",
|
|
296
|
+
* description: "book price"
|
|
297
|
+
* },
|
|
298
|
+
* reviews: {
|
|
299
|
+
* type: "string",
|
|
300
|
+
* description: "Number of reviews"
|
|
301
|
+
* }
|
|
302
|
+
*
|
|
303
|
+
* }
|
|
304
|
+
* }
|
|
305
|
+
* },
|
|
306
|
+
* )
|
|
307
|
+
*
|
|
308
|
+
* console.log(book)
|
|
309
|
+
*
|
|
310
|
+
* // output:
|
|
311
|
+
* // { name: 'A Light in the Attic', price: '£51.77', reviews: '0' }
|
|
312
|
+
*
|
|
313
|
+
* ```
|
|
314
|
+
*/
|
|
315
|
+
export declare function extractObjectFromPage(
|
|
316
|
+
page: Page,
|
|
317
|
+
options: {
|
|
318
|
+
label: string;
|
|
319
|
+
entityName: string;
|
|
320
|
+
entitySchema: SimpleObjectSchema;
|
|
321
|
+
strategy?: ImageStrategy | HtmlStrategy;
|
|
322
|
+
prompt?: string;
|
|
323
|
+
optionalPropertiesInvalidator?: (
|
|
324
|
+
result: Record<string, string | null> | null
|
|
325
|
+
) => string[];
|
|
326
|
+
variantKey?: string;
|
|
327
|
+
apiKey?: string;
|
|
328
|
+
}
|
|
329
|
+
): Promise<Record<string, string | null> | null>;
|
|
330
|
+
|
|
331
|
+
/**
|
|
332
|
+
* Extracts a structured object from a locator.
|
|
333
|
+
*
|
|
334
|
+
* @param locator - The Playwright Locator object from which to extract the data.
|
|
335
|
+
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
336
|
+
* @param options.entityName - The name of the entity being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
337
|
+
* @param options.entitySchema - The schema of the entity being extracted.
|
|
338
|
+
* @param options.strategy - Optional. The strategy to use for extraction. If not provided, the HTML strategy with Claude Haiku will be used.
|
|
339
|
+
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
340
|
+
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
341
|
+
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
342
|
+
* @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
|
|
343
|
+
* @returns A promise that resolves to the extracted object.
|
|
344
|
+
*
|
|
345
|
+
* @example
|
|
346
|
+
* ```typescript extractObjectFromLocator
|
|
347
|
+
* import { extractObjectFromLocator } from "@intuned/sdk/optimized-extractors";
|
|
348
|
+
*
|
|
349
|
+
* await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
|
|
350
|
+
* const book = await extractObjectFromLocator(page.locator(".page_inner"),
|
|
351
|
+
* {
|
|
352
|
+
* entityName: "book",
|
|
353
|
+
* label: "book-extraction",
|
|
354
|
+
* entitySchema: {
|
|
355
|
+
* type: "object",
|
|
356
|
+
* required: ["name","price","reviews"],
|
|
357
|
+
* properties: {
|
|
358
|
+
* name: {
|
|
359
|
+
* type: "string",
|
|
360
|
+
* description: "book name",
|
|
361
|
+
* },
|
|
362
|
+
* price: {
|
|
363
|
+
* type: "string",
|
|
364
|
+
* description: "book price"
|
|
365
|
+
* },
|
|
366
|
+
* reviews: {
|
|
367
|
+
* type: "string",
|
|
368
|
+
* description: "Number of reviews"
|
|
369
|
+
* }
|
|
370
|
+
*
|
|
371
|
+
* }
|
|
372
|
+
* }
|
|
373
|
+
* },
|
|
374
|
+
* )
|
|
375
|
+
*
|
|
376
|
+
* console.log(book)
|
|
377
|
+
*
|
|
378
|
+
* // output:
|
|
379
|
+
* // { name: 'A Light in the Attic', price: '£51.77', reviews: '0' }
|
|
380
|
+
*
|
|
381
|
+
* ```
|
|
382
|
+
*/
|
|
383
|
+
export declare function extractObjectFromLocator(
|
|
384
|
+
locator: Locator,
|
|
385
|
+
options: {
|
|
386
|
+
label: string;
|
|
387
|
+
entityName: string;
|
|
388
|
+
entitySchema: SimpleObjectSchema;
|
|
389
|
+
strategy?: ImageStrategy | HtmlStrategy;
|
|
390
|
+
prompt?: string;
|
|
391
|
+
optionalPropertiesInvalidator?: (
|
|
392
|
+
result: Record<string, string | null> | null
|
|
393
|
+
) => string[];
|
|
394
|
+
variantKey?: string;
|
|
395
|
+
apiKey?: string;
|
|
396
|
+
}
|
|
397
|
+
): Promise<Record<string, string | null> | null>;
|