@intuned/browser-dev 2.2.3-unify-sdks.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/.babelrc +21 -0
  2. package/.eslintignore +10 -0
  3. package/.eslintrc.js +39 -0
  4. package/LICENSE +43 -0
  5. package/dist/ai-extractors/AnthropicClient/index.js +23 -0
  6. package/dist/ai-extractors/export.d.js +5 -0
  7. package/dist/ai-extractors/export.d.ts +422 -0
  8. package/dist/ai-extractors/extractStructuredData.js +79 -0
  9. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/constants.js +7 -0
  10. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/errors.js +42 -0
  11. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingClaude.js +149 -0
  12. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingGoogle.js +37 -0
  13. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingOpenAi.js +144 -0
  14. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStrucutredDataUsingAiInstance.js +123 -0
  15. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/index.js +55 -0
  16. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/isItemTableHeaderOrFooter.js +96 -0
  17. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/screenshotHelpers.js +55 -0
  18. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/types.js +5 -0
  19. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/utils.js +53 -0
  20. package/dist/ai-extractors/extractionHelpers/types.js +5 -0
  21. package/dist/ai-extractors/fileExtractors.js +176 -0
  22. package/dist/ai-extractors/index.js +31 -0
  23. package/dist/ai-extractors/jsonSchema.d.js +5 -0
  24. package/dist/ai-extractors/jsonSchema.d.ts +49 -0
  25. package/dist/ai-extractors/openAiClients/index.js +23 -0
  26. package/dist/ai-extractors/validators.js +239 -0
  27. package/dist/browser/ai/export.d.js +3 -0
  28. package/dist/browser/ai/export.d.ts +587 -0
  29. package/dist/browser/ai/extractMarkdown.js +15 -0
  30. package/dist/browser/ai/extractStructuredData.js +231 -0
  31. package/dist/browser/ai/extractStructuredDataUsingAi.js +140 -0
  32. package/dist/browser/ai/extractionHelpers/screenshotHelpers.js +55 -0
  33. package/dist/browser/ai/extractionHelpers/validateSchema.js +148 -0
  34. package/dist/browser/ai/index.d.ts +587 -0
  35. package/dist/browser/ai/index.js +19 -0
  36. package/dist/browser/ai/isPageLoaded.js +67 -0
  37. package/dist/browser/ai/prompt.js +39 -0
  38. package/dist/browser/ai/tests/testCheckAllTypesAreStrings.spec.js +143 -0
  39. package/dist/browser/ai/tests/testExtractStructuredData.spec.js +622 -0
  40. package/dist/browser/ai/tools/index.js +48 -0
  41. package/dist/browser/ai/types/errors.js +67 -0
  42. package/dist/browser/ai/types/models.js +45 -0
  43. package/dist/browser/ai/types/types.js +48 -0
  44. package/dist/browser/ai/validators.js +136 -0
  45. package/dist/common/Logger/index.js +60 -0
  46. package/dist/common/Logger/types.js +5 -0
  47. package/dist/common/SdkError.js +50 -0
  48. package/dist/common/aiModelsValidations.js +50 -0
  49. package/dist/common/browser_scripts.js +2596 -0
  50. package/dist/common/ensureBrowserScripts.js +17 -0
  51. package/dist/common/environmentVariables.js +16 -0
  52. package/dist/common/eventTracking/getAiTrackingHeaders.js +31 -0
  53. package/dist/common/eventTracking/getFileTrackingHeaders.js +23 -0
  54. package/dist/common/extendedTest.js +148 -0
  55. package/dist/common/extractionHelpers.js +19 -0
  56. package/dist/common/formatZodError.js +18 -0
  57. package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
  58. package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
  59. package/dist/common/fuzzySearch/utils.js +23 -0
  60. package/dist/common/getModelProvider.js +18 -0
  61. package/dist/common/getSimplifiedHtml.js +122 -0
  62. package/dist/common/hashObject.js +32 -0
  63. package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
  64. package/dist/common/html2markdown/index.js +19 -0
  65. package/dist/common/jwtTokenManager.js +18 -0
  66. package/dist/common/loadRuntime.js +16 -0
  67. package/dist/common/locatorHelpers.js +41 -0
  68. package/dist/common/matching/collectStrings.js +32 -0
  69. package/dist/common/matching/levenshtein.js +40 -0
  70. package/dist/common/matching/matching.js +317 -0
  71. package/dist/common/matching/types.js +1 -0
  72. package/dist/common/noEmpty.js +9 -0
  73. package/dist/common/saveSnapshotWithExamples.js +60 -0
  74. package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
  75. package/dist/common/xpathMapping.js +107 -0
  76. package/dist/helpers/downloadFile.js +125 -0
  77. package/dist/helpers/export.d.js +1 -0
  78. package/dist/helpers/export.d.ts +1294 -0
  79. package/dist/helpers/extractMarkdown.js +35 -0
  80. package/dist/helpers/filterEmptyValues.js +54 -0
  81. package/dist/helpers/gotoUrl.js +93 -0
  82. package/dist/helpers/index.d.ts +1294 -0
  83. package/dist/helpers/index.js +115 -0
  84. package/dist/helpers/processDate.js +25 -0
  85. package/dist/helpers/resolveUrl.js +63 -0
  86. package/dist/helpers/sanitizeHtml.js +73 -0
  87. package/dist/helpers/saveFileToS3.js +46 -0
  88. package/dist/helpers/scrollToLoadContent.js +50 -0
  89. package/dist/helpers/tests/extendedTest.js +130 -0
  90. package/dist/helpers/tests/testDownloadFile.spec.js +197 -0
  91. package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
  92. package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
  93. package/dist/helpers/tests/testIsPageLoaded.spec.js +285 -0
  94. package/dist/helpers/tests/testProcessDate.spec.js +13 -0
  95. package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
  96. package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
  97. package/dist/helpers/tests/testSimplifyHtml.spec.js +251 -0
  98. package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +380 -0
  99. package/dist/helpers/tests/testWaitForDomSettled.spec.js +169 -0
  100. package/dist/helpers/tests/testWaitForNetworkIdle.spec.js +115 -0
  101. package/dist/helpers/types/Attachment.js +81 -0
  102. package/dist/helpers/types/CustomTypeRegistry.js +48 -0
  103. package/dist/helpers/types/RunEnvironment.js +18 -0
  104. package/dist/helpers/types/ValidationError.js +17 -0
  105. package/dist/helpers/types/index.js +51 -0
  106. package/dist/helpers/uploadFileToS3.js +153 -0
  107. package/dist/helpers/utils/getS3Client.js +21 -0
  108. package/dist/helpers/utils/index.js +73 -0
  109. package/dist/helpers/utils/isDownload.js +10 -0
  110. package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
  111. package/dist/helpers/utils/isLocator.js +9 -0
  112. package/dist/helpers/utils/jwtTokenManager.js +18 -0
  113. package/dist/helpers/validateDataUsingSchema.js +119 -0
  114. package/dist/helpers/waitForDomSettled.js +182 -0
  115. package/dist/helpers/waitForNetworkIdle.js +191 -0
  116. package/dist/index.d.js +82 -0
  117. package/dist/index.d.ts +11 -0
  118. package/dist/index.js +84 -0
  119. package/dist/intunedServices/ApiGateway/aiApiGateway.js +87 -0
  120. package/dist/intunedServices/ApiGateway/factory.js +13 -0
  121. package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
  122. package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
  123. package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
  124. package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +221 -0
  125. package/dist/intunedServices/ApiGateway/types.js +11 -0
  126. package/dist/intunedServices/cache/cache.js +61 -0
  127. package/dist/intunedServices/cache/index.js +12 -0
  128. package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
  129. package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
  130. package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
  131. package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +149 -0
  132. package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
  133. package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +145 -0
  134. package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
  135. package/dist/optimized-extractors/common/findTableHeaders.js +175 -0
  136. package/dist/optimized-extractors/common/index.js +55 -0
  137. package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +97 -0
  138. package/dist/optimized-extractors/common/matching/matching.js +212 -0
  139. package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
  140. package/dist/optimized-extractors/common/matching/types.js +18 -0
  141. package/dist/optimized-extractors/common/matching/utils.js +184 -0
  142. package/dist/optimized-extractors/common/utils.js +58 -0
  143. package/dist/optimized-extractors/export.d.js +5 -0
  144. package/dist/optimized-extractors/export.d.ts +397 -0
  145. package/dist/optimized-extractors/extractArray.js +120 -0
  146. package/dist/optimized-extractors/extractObject.js +104 -0
  147. package/dist/optimized-extractors/index.d.ts +397 -0
  148. package/dist/optimized-extractors/index.js +31 -0
  149. package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
  150. package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
  151. package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
  152. package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
  153. package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
  154. package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
  155. package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
  156. package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
  157. package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
  158. package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
  159. package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
  160. package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
  161. package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
  162. package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
  163. package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
  164. package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
  165. package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
  166. package/dist/optimized-extractors/models/anthropicModel.js +23 -0
  167. package/dist/optimized-extractors/models/openaiModel.js +23 -0
  168. package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
  169. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
  170. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
  171. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
  172. package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
  173. package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
  174. package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
  175. package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
  176. package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
  177. package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
  178. package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
  179. package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
  180. package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
  181. package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
  182. package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
  183. package/dist/optimized-extractors/types/errors.js +42 -0
  184. package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
  185. package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
  186. package/dist/optimized-extractors/types/types.js +5 -0
  187. package/dist/optimized-extractors/validators.js +152 -0
  188. package/dist/vite-env.d.js +1 -0
  189. package/dist/vite-env.d.ts +9 -0
  190. package/docs.md +14 -0
  191. package/how-to-run-tests.md +10 -0
  192. package/intuned-runtime-setup.md +13 -0
  193. package/package.json +124 -0
  194. package/tsconfig.eslint.json +5 -0
  195. package/tsconfig.json +26 -0
@@ -0,0 +1,18 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.MatchSource = exports.MatchMode = void 0;
7
/**
 * String constants naming the possible sources of a match:
 * an attribute, text content, or a direct text node.
 */
const MatchSource = (exports.MatchSource = {
  ATTRIBUTE: "attribute",
  TEXT_CONTENT: "text_content",
  DIRECT_TEXT_NODE: "direct_text_node",
});
13
/**
 * String constants naming the possible match modes: full, partial, or fuzzy.
 */
const MatchMode = (exports.MatchMode = {
  FULL: "full",
  PARTIAL: "partial",
  FUZZY: "fuzzy",
});
@@ -0,0 +1,184 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.calculateMaxLDist = calculateMaxLDist;
7
+ exports.findClosestMatch = findClosestMatch;
8
+ exports.getElementXPath = getElementXPath;
9
+ exports.hasNonFuzzyOrCloseFuzzyMatch = hasNonFuzzyOrCloseFuzzyMatch;
10
+ exports.isFuzzMatch = isFuzzMatch;
11
+ exports.isMatchExact = isMatchExact;
12
+ exports.isPartOfString = isPartOfString;
13
+ exports.normalizeSpacing = normalizeSpacing;
14
+ exports.rankMatch = rankMatch;
15
+ exports.removePunctuationAndSpaces = removePunctuationAndSpaces;
16
+ exports.selectBestMatch = selectBestMatch;
17
+ exports.traverseAndPrune = traverseAndPrune;
18
+ var _types = require("./types");
19
+ var _levenshteinSearch = require("../../../common/fuzzySearch/levenshtein-search");
20
/**
 * Runs the fuzzy search and returns its single best hit.
 *
 * Hits are ordered by ascending `dist`; hits with the same `dist` are ordered
 * by descending span length (`end - start`).
 *
 * @param searchTerm - forwarded to `fuzzySearch` as its first argument
 * @param content - forwarded to `fuzzySearch` as its second argument
 * @param maxLDist - forwarded to `fuzzySearch` as its third argument
 * @returns the first hit after ordering, or `undefined` when there are no hits
 */
function findClosestMatch(searchTerm, content, maxLDist) {
  const byDistanceThenLongestSpan = (left, right) =>
    left.dist === right.dist
      ? right.end - right.start - (left.end - left.start)
      : left.dist - right.dist;

  const hits = [...(0, _levenshteinSearch.fuzzySearch)(searchTerm, content, maxLDist)];
  hits.sort(byDistanceThenLongestSpan);
  return hits[0];
}
33
/**
 * Collapses every run of whitespace (including newlines and tabs) into a
 * single space and trims both ends.
 *
 * @param text - the string to normalize; any falsy value yields ""
 * @returns the normalized string
 */
function normalizeSpacing(text) {
  return text ? text.replace(/\s+/g, " ").trim() : "";
}
41
/**
 * Tests whether two strings are equal once their whitespace is normalized.
 *
 * @param data - first string
 * @param value - second string
 * @returns a pair `[isEqual, normalizedValue]`; `[false, null]` when either
 *          argument is falsy
 */
function isMatchExact(data, value) {
  if (data && value) {
    const normalizedData = normalizeSpacing(data);
    const normalizedValue = normalizeSpacing(value);
    return [normalizedData === normalizedValue, normalizedValue];
  }
  return [false, null];
}
49
/**
 * Computes the edit-distance budget for a value from its length.
 *
 * Values of up to 10 characters get 20% of their length. Longer values get a
 * percentage that decays exponentially (reaching 5% at length 600) and is
 * never allowed below 5%. The result is floored and is at least 1.
 *
 * @param value - anything with a `length`
 * @returns the maximum allowed distance, an integer >= 1
 */
function calculateMaxLDist(value) {
  const maxPercentage = 0.2;
  const minPercentage = 0.05;
  const shortLength = 10;
  const size = value.length;

  const rawPercentage =
    size <= shortLength
      ? maxPercentage
      : maxPercentage *
        Math.exp(
          -(-Math.log(minPercentage / maxPercentage) / (600 - shortLength)) * (size - shortLength)
        );

  const percentage = Math.max(minPercentage, rawPercentage);
  return Math.max(1, Math.floor(size * percentage));
}
64
/**
 * Looks for an approximate, case-insensitive occurrence of `searchTerm`
 * inside `content`, after normalizing whitespace in both.
 *
 * The edit-distance budget is derived from the normalized search term, i.e.
 * from the text that is actually searched, so stray whitespace in the input
 * does not widen the budget.
 *
 * @param searchTerm - text to look for
 * @param content - text to search in
 * @returns `{ found, matchedValue, matchedSourceValue, distance }`; all fields
 *          other than `found` are `null` when nothing is found or when either
 *          input is empty after normalization
 */
function isFuzzMatch(searchTerm, content) {
  // Fresh object on every call so callers can never share a mutated result.
  const notFound = () => ({
    found: false,
    matchedValue: null,
    distance: null,
    matchedSourceValue: null
  });

  if (!searchTerm || !content) {
    return notFound();
  }

  const normalizedSearchTerm = normalizeSpacing(searchTerm);
  const normalizedContent = normalizeSpacing(content);
  // Whitespace-only inputs normalize to "" and cannot produce a meaningful match.
  if (!normalizedSearchTerm || !normalizedContent) {
    return notFound();
  }

  const maxLDist = calculateMaxLDist(normalizedSearchTerm);
  const match = findClosestMatch(
    normalizedSearchTerm.toLowerCase(),
    normalizedContent.toLowerCase(),
    maxLDist
  );
  if (!match) {
    return notFound();
  }

  return {
    found: true,
    // NOTE(review): offsets come from the lower-cased copy; this assumes
    // lower-casing does not change the string's length.
    matchedValue: normalizedContent.slice(match.start, match.end),
    matchedSourceValue: normalizedContent,
    distance: match.dist
  };
}
92
/**
 * Reports whether the list contains at least one trustworthy match: either a
 * match whose `match_mode` is not FUZZY, or a FUZZY match whose
 * `fuzzy_distance` is below 5.
 *
 * A distance of 0 counts as close; only a missing (null/undefined) distance
 * disqualifies a fuzzy match.
 *
 * @param matches - array of match records with `match_mode` and `fuzzy_distance`
 * @returns `true` when such a match exists, otherwise `false`
 */
function hasNonFuzzyOrCloseFuzzyMatch(matches) {
  const closeFuzzyThreshold = 5;
  return matches.some(match => {
    if (match.match_mode !== _types.MatchMode.FUZZY) {
      return true;
    }
    const distance = match.fuzzy_distance;
    // Explicit null check: a truthiness test would wrongly reject distance 0.
    return distance != null && distance < closeFuzzyThreshold;
  });
}
97
/**
 * Picks the best candidate from a list of matches.
 *
 * Non-fuzzy matches always win: the first one flagged `exact` is preferred,
 * otherwise the first non-fuzzy match. If there are none, fuzzy matches that
 * `rankMatch` rates "HIGH" are considered and the one with the smallest
 * `fuzzyDistance` is chosen.
 *
 * @param original - the value the matches were searched for
 * @param matches - candidates carrying `matchText`, `matchXpath`, `matchType`,
 *                  `isFuzzy`, and optionally `exact` / `fuzzyDistance`
 * @returns `{ matchText, matchXpath, matchType }` of the winner, or `null`
 *          when no candidate qualifies
 */
function selectBestMatch(original, matches) {
  const toResult = winner => ({
    matchText: winner.matchText,
    matchXpath: winner.matchXpath,
    matchType: winner.matchType
  });

  // Partition on the fuzzy flag itself; comparing matchText against the
  // "fuzzy" mode constant would let fuzzy matches through as exact ones.
  const exactMatches = matches.filter(match => !match.isFuzzy);
  if (exactMatches.length > 0) {
    const fullExactMatch = exactMatches.find(candidate => candidate.exact);
    return toResult(fullExactMatch ?? exactMatches[0]);
  }

  const fuzzyMatches = matches
    .filter(match => match.isFuzzy)
    .filter(match => rankMatch(original, match) === "HIGH");
  if (fuzzyMatches.length > 0) {
    fuzzyMatches.sort((a, b) => a.fuzzyDistance - b.fuzzyDistance);
    return toResult(fuzzyMatches[0]);
  }

  return null;
}
120
/**
 * Builds a positional path for a node by walking up its `parentNode` chain.
 *
 * Each step has the form `tag[n]`, where `n` is the 1-based position of the
 * node among the preceding element siblings (nodeType 1) with the same
 * lower-cased node name. A `#text` node resolves to its parent's path.
 *
 * @param element - the node to describe
 * @returns the path string, or `null` for a missing node, a node without a
 *          parent, or a `#document` node
 */
function getElementXPath(element) {
  if (!element || !element.parentNode || element.nodeName === "#document") {
    return null;
  }

  const parent = element.parentNode;
  const tag = element.nodeName.toLowerCase();

  // Count same-tag element siblings that come before this node.
  let position = 1;
  for (const node of Array.from(parent.childNodes)) {
    if (node.nodeType !== 1) {
      continue;
    }
    if (node === element) {
      break;
    }
    if (node.nodeName.toLowerCase() === tag) {
      position += 1;
    }
  }

  const parentXPath = getElementXPath(parent);
  if (element.nodeName === "#text") {
    return parentXPath;
  }

  const step = `${tag}[${position}]`;
  return parentXPath ? `${parentXPath}/${step}` : step;
}
142
/**
 * Walks the `children` tree below `node` depth-first.
 *
 * `conditionFunc` is called for every descendant that has a truthy `children`
 * property; when it returns a truthy value the walk does not descend into
 * that descendant. The child list is snapshotted before iterating.
 *
 * @param node - the starting node; a missing `children` is treated as empty
 * @param conditionFunc - callback deciding whether to stop at a child
 */
function traverseAndPrune(node, conditionFunc) {
  const snapshot = Array.from(node.children ?? []);
  for (const child of snapshot) {
    if (child.children && !conditionFunc(child)) {
      traverseAndPrune(child, conditionFunc);
    }
  }
}
152
/**
 * Case-insensitive substring test performed on whitespace-normalized strings.
 *
 * @param input - the text to look for
 * @param dom - the text to search in
 * @returns a triple `[found, matchedText, normalizedDom]`; `matchedText` is the
 *          matching slice of the normalized `dom` (original casing) or `null`.
 *          `[false, null, null]` when either argument is falsy.
 */
function isPartOfString(input, dom) {
  if (!input || !dom) {
    return [false, null, null];
  }

  const needle = normalizeSpacing(input);
  const haystack = normalizeSpacing(dom);
  const position = haystack.toLowerCase().indexOf(needle.toLowerCase());

  if (position === -1) {
    return [false, null, haystack];
  }
  return [true, haystack.substring(position, position + needle.length), haystack];
}
162
/**
 * Strips everything except ASCII letters and digits: punctuation, underscores,
 * and all whitespace are removed.
 *
 * @param s - the string to strip
 * @returns the remaining letters and digits, in order
 */
function removePunctuationAndSpaces(s) {
  return s.replace(/[^A-Za-z0-9]/g, "");
}
165
/**
 * Converts an edit distance into a similarity score relative to the longer
 * of the two strings: `1 - distance / max(len1, len2)`.
 *
 * @param s1 - first string
 * @param s2 - second string
 * @param distance - edit distance between them
 * @returns the similarity score (1 means distance 0)
 */
function similarityRatio(s1, s2, distance) {
  return 1 - distance / Math.max(s1.length, s2.length);
}
169
/**
 * Rates how trustworthy a fuzzy match is.
 *
 * Returns "HIGH" when the normalized original is longer than 20 characters
 * and the similarity ratio exceeds 0.85, or when original and match are
 * identical once punctuation and whitespace are ignored. Otherwise "LOW".
 *
 * The similarity ratio is computed on the same normalized, lower-cased strings
 * used for the other checks, so extra whitespace in the raw input cannot
 * inflate the score.
 *
 * @param original - the value that was searched for
 * @param match - a match record with `matchText` and `fuzzyDistance`
 * @returns "HIGH" or "LOW"
 */
function rankMatch(original, match) {
  if (!original || !match.matchText) {
    return "LOW";
  }

  const normalizedOriginal = normalizeSpacing(original).toLowerCase();
  const normalizedMatch = normalizeSpacing(match.matchText).toLowerCase();
  const ratio = similarityRatio(normalizedOriginal, normalizedMatch, match.fuzzyDistance);

  if (normalizedOriginal.length > 20 && ratio > 0.85) {
    return "HIGH";
  }
  if (removePunctuationAndSpaces(normalizedOriginal) === removePunctuationAndSpaces(normalizedMatch)) {
    return "HIGH";
  }
  return "LOW";
}
@@ -0,0 +1,58 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.cleanupAiResult = cleanupAiResult;
7
+ exports.getRandomItems = getRandomItems;
8
+ exports.getResultFromOutputSchema = getResultFromOutputSchema;
9
+ exports.processInputSchema = processInputSchema;
10
/**
 * Returns up to `numItems` elements of `arr` in random order.
 *
 * Works on a copy, so the caller's array is left untouched, and uses a
 * Fisher-Yates shuffle so every ordering is equally likely.
 *
 * @param arr - the source array (not modified)
 * @param numItems - how many items to return; values larger than the array
 *                   length return every item
 * @returns a new array with the selected items
 */
function getRandomItems(arr, numItems) {
  const pool = [...arr];
  for (let last = pool.length - 1; last > 0; last--) {
    const pick = Math.floor(Math.random() * (last + 1));
    [pool[last], pool[pick]] = [pool[pick], pool[last]];
  }
  return pool.slice(0, numItems);
}
14
/**
 * Prepares a caller-supplied JSON schema for use in the extraction request.
 *
 * The schema is deep-cloned and its top-level `description` is removed. When
 * the original schema is an array schema, the clone is wrapped in an object
 * schema with two required properties: `number_of_<entityName>` (a number)
 * and `<entityName>` (the cloned array schema). The input is never modified.
 *
 * @param originalJsonSchema - the schema supplied by the caller
 * @param entityName - name used to build the wrapper's property keys
 * @returns the wrapped object schema for arrays, otherwise the cleaned clone
 */
function processInputSchema(originalJsonSchema, entityName) {
  const internalSchema = structuredClone(originalJsonSchema);
  delete internalSchema.description;

  if (originalJsonSchema.type !== "array") {
    return internalSchema;
  }

  const countKey = `number_of_${entityName}`;
  const itemsKey = `${entityName}`;
  return {
    type: "object",
    properties: {
      [countKey]: {
        type: "number",
        description: `The number of ${entityName} items in the text - not the overall total. Rely on the text to find this, if the number is not mentioned in the text, this should be null. For example, some lists say 'showing 5 out of 20 items' - 5 is the number of items in the list.`
      },
      [itemsKey]: internalSchema
    },
    required: [countKey, itemsKey]
  };
}
32
/**
 * Unwraps parsed model output according to the original schema type.
 *
 * For an array schema the items are read from `parsedData[entityName]`,
 * defaulting to an empty array when that key is null or undefined. For any
 * other schema the parsed data is returned as is.
 *
 * @param originalJsonSchema - the schema supplied by the caller
 * @param entityName - key under which array items are stored
 * @param parsedData - the parsed output
 * @returns the item array or the parsed data itself
 */
function getResultFromOutputSchema(originalJsonSchema, entityName, parsedData) {
  const isArraySchema = originalJsonSchema.type === "array";
  return isArraySchema ? (parsedData[entityName] ?? []) : parsedData;
}
39
/**
 * Replaces every `&amp;` in the string with a plain `&` (single pass).
 *
 * @param str - the string to clean
 * @returns the string with `&amp;` occurrences replaced
 */
function cleanUpAiReturnedString(str) {
  return str.replace(/&amp;/g, "&");
}
42
/**
 * Recursively cleans a value.
 *
 * - strings are passed through `cleanUpAiReturnedString`;
 * - arrays are cleaned element by element (their length is preserved);
 * - plain objects are rebuilt without properties whose value is `null`,
 *   `undefined`, or the empty string, and the remaining values are cleaned;
 * - every other value is returned unchanged.
 *
 * @param obj - the value to clean
 * @returns the cleaned value
 */
function cleanupAiResult(obj) {
  if (typeof obj === "string") {
    return cleanUpAiReturnedString(obj);
  }
  if (obj === null || typeof obj !== "object") {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map(cleanupAiResult);
  }

  const cleaned = {};
  for (const [key, value] of Object.entries(obj)) {
    if (value === null || value === undefined || value === "") {
      continue;
    }
    cleaned[key] = cleanupAiResult(value);
  }
  return cleaned;
}
@@ -0,0 +1,5 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
@@ -0,0 +1,397 @@
1
+ import { Locator, Page } from "playwright-core";
2
+ import { BasicSchema } from "./types/jsonSchema";
3
+
4
+ /**
5
+ * this strategy will use a screenshot of the page/locator with some processing to extract the needed data.
6
+ * should be used when the information you're trying to extract is not present in the dom as a text but can be identified visually.
7
+ * @interface
8
+ * @property model - the model to use in the extraction process.
9
+ * @property type - the type of the strategy
10
+ */
11
+ export interface ImageStrategy {
12
+ model:
13
+ | "claude-3-haiku"
14
+ | "claude-3-haiku-20240307"
15
+ | "claude-3.5-sonnet"
16
+ | "claude-3-5-sonnet-20240620"
17
+ | "claude-3-5-sonnet-20241022"
18
+ | "claude-opus-4"
19
+ | "claude-opus-4-20250514"
20
+ | "claude-sonnet-4"
21
+ | "claude-sonnet-4-20250514"
22
+ | "gpt4-turbo"
23
+ | "gpt-4-turbo-2024-04-09"
24
+ | "gpt-4o"
25
+ | "gpt-4o-2024-05-13"
26
+ | "gpt-4o-mini"
27
+ | "gpt-4o-mini-2024-07-18"
28
+ | "gemini-1.5-pro"
29
+ | "gemini-1.5-pro-002"
30
+ | "gemini-1.5-flash-8b"
31
+ | "gemini-1.5-flash-8b-002"
32
+ | "gemini-1.5-flash"
33
+ | "gemini-1.5-flash-002"
34
+ | "gemini-2.0-flash-exp";
35
+ type: "IMAGE";
36
+ }
37
+ /**
38
+ * this strategy will use the html of the page/locator to extract the needed data. we filter out some of the attributes to reduce context.
39
+ * the attributes included are only: `aria-label` `data-name` `name` `type` `placeholder` `value` `role` `title` `href` `id` `alt`,
40
+ *
41
+ * @interface
42
+ * @property model - the model to use in the extraction process
43
+ * @property type - the type of the strategy
44
+ */
45
+ export interface HtmlStrategy {
46
+ model:
47
+ | "claude-3-haiku"
48
+ | "claude-3-haiku-20240307"
49
+ | "claude-3-5-haiku"
50
+ | "claude-3-5-haiku-20241022"
51
+ | "claude-3.5-sonnet"
52
+ | "claude-3-5-sonnet-20240620"
53
+ | "claude-3-5-sonnet-20241022"
54
+ | "claude-opus-4"
55
+ | "claude-opus-4-20250514"
56
+ | "claude-sonnet-4"
57
+ | "claude-sonnet-4-20250514"
58
+ | "gpt4-turbo"
59
+ | "gpt-4-turbo-2024-04-09"
60
+ | "gpt3.5-turbo"
61
+ | "gpt-3.5-turbo-0125"
62
+ | "gpt-4o"
63
+ | "gpt-4o-2024-05-13"
64
+ | "gpt-4o-mini"
65
+ | "gpt-4o-mini-2024-07-18"
66
+ | "gemini-1.5-pro"
67
+ | "gemini-1.5-pro-002"
68
+ | "gemini-1.5-flash-8b"
69
+ | "gemini-1.5-flash-8b-002"
70
+ | "gemini-1.5-flash"
71
+ | "gemini-1.5-flash-002"
72
+ | "gemini-2.0-flash-exp";
73
+ type: "HTML";
74
+ }
75
+ /**
76
+ * Extracts an array of structured data from a web page in an optimized way. This function will use AI for the first n times, until it collects multiple examples,
77
+ * then it will build reliable selectors in the background to make the process more efficient
78
+ *
79
+ * @param page - The Playwright Page object from which to extract the data.
80
+ * @param options.label - A label for this extraction process, used for billing and monitoring.
81
+ * @param options.itemEntityName - The name of the entity items being extracted, it must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
82
+ * @param options.itemEntitySchema - The schema of the entity items being extracted.
83
+ * @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
84
+ * @param options.prompt - Optional. A prompt to guide the extraction process.
85
+ * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
86
+ * @param options.variantKey - Optional. A variant key for the extraction process, use this when the page has multiple variants/shapes.
87
+ * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
88
+ * @returns A promise that resolves to a list of extracted data.
89
+ *
90
+ * @example
91
+ * ```typescript extractArrayFromPage
92
+ * import { extractArrayFromPage } from "@intuned/sdk/optimized-extractors";
93
+ *
94
+ * await page.goto("https://books.toscrape.com/")
95
+ * const books = await extractArrayFromPage(page,
96
+ * {
97
+ * strategy: {
98
+ * model: "gpt4-turbo",
99
+ * type: "HTML"
100
+ * },
101
+ * itemEntityName: "book",
102
+ * label: "books-extraction",
103
+ * itemEntitySchema: {
104
+ * type: "object",
105
+ * required: ["name"],
106
+ * properties: {
107
+ * name: {
108
+ * type: "string",
109
+ * description: "book name",
110
+ * primary: true
111
+ * }
112
+ * }
113
+ * }
114
+ * },
115
+ * )
116
+ *
117
+ * console.log(books)
118
+ *
119
+ * // output:
120
+ * // [
121
+ * // ...
122
+ * // { name: 'Olio' },
123
+ * // { name: 'Mesaerion: The Best Science Fiction Stories 1800-1849' },
124
+ * // { name: 'Libertarianism for Beginners' },
125
+ * // { name: "It's Only the Himalayas" }
126
+ * // ...
127
+ * // ]
128
+ *
129
+ * ```
130
+ */
131
+ export declare function extractArrayFromPage(
132
+ page: Page,
133
+ options: {
134
+ label: string;
135
+ itemEntityName: string;
136
+ itemEntitySchema: SimpleArrayItemSchema;
137
+ strategy?: ImageStrategy | HtmlStrategy;
138
+ prompt?: string;
139
+ optionalPropertiesInvalidator?: (
140
+ result: Record<string, string>[]
141
+ ) => string[];
142
+ variantKey?: string;
143
+ apiKey?: string;
144
+ }
145
+ ): Promise<Record<string, string>[]>;
146
+
147
+ /**
148
+ * Extracts an array of structured data from a locator.
149
+ *
150
+ * @param locator - The Playwright Locator object from which to extract the data.
151
+ * @param options.label - A label for this extraction process, used for billing and monitoring.
152
+ * @param options.itemEntityName - The name of the entity items being extracted. it must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
153
+ * @param options.itemEntitySchema - The schema of the entity items being extracted.
154
+ * @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
155
+ * @param options.prompt - Optional. A prompt to guide the extraction process.
156
+ * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
157
+ * @param options.variantKey - Optional. A variant key for the extraction process.
158
+ * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
159
+ * @returns A promise that resolves to a list of extracted data.
160
+ *
161
+ * @example
162
+ * ```typescript extractArrayFromLocator
163
+ * import { extractArrayFromLocator } from "@intuned/sdk/optimized-extractors";
164
+ *
165
+ * await page.goto("https://books.toscrape.com/")
166
+ * const books = await extractArrayFromLocator(page.locator("section"),
167
+ * {
168
+ * itemEntityName: "book",
169
+ * label: "books-extraction",
170
+ * itemEntitySchema: {
171
+ * type: "object",
172
+ * required: ["name"],
173
+ * properties: {
174
+ * name: {
175
+ * type: "string",
176
+ * description: "book name",
177
+ * primary: true
178
+ * }
179
+ * }
180
+ * }
181
+ * },
182
+ * )
183
+ *
184
+ * console.log(books)
185
+ *
186
+ * // output:
187
+ * // [
188
+ * // ...
189
+ * // { name: 'Olio' },
190
+ * // { name: 'Mesaerion: The Best Science Fiction Stories 1800-1849' },
191
+ * // { name: 'Libertarianism for Beginners' },
192
+ * // { name: "It's Only the Himalayas" }
193
+ * // ...
194
+ * // ]
195
+ *
196
+ * ```
197
+ */
198
+ export declare function extractArrayFromLocator(
199
+ locator: Locator,
200
+ options: {
201
+ label: string;
202
+ itemEntityName: string;
203
+ itemEntitySchema: SimpleArrayItemSchema;
204
+ strategy?: ImageStrategy | HtmlStrategy;
205
+ prompt?: string;
206
+ optionalPropertiesInvalidator?: (
207
+ result: Record<string, string>[]
208
+ ) => string[];
209
+ variantKey?: string;
210
+ apiKey?: string;
211
+ }
212
+ ): Promise<Record<string, string>[]>;
213
+
214
+ /**
215
+ * Schema for a string-valued property of a simple object.
216
+ * @interface SimpleObjectStringSchema
217
+ * @extends BasicSchema
218
+ * @property type - The type of the schema, which is always "string".
219
+ */
220
+ interface SimpleObjectStringSchema extends BasicSchema {
221
+ type: "string";
222
+ }
223
+
224
+ /**
225
+ * Schema for a string-valued property of a simple array item.
226
+ * @interface SimpleArrayStringSchema
227
+ * @extends BasicSchema
228
+ * @property type - The type of the schema, which is always "string".
229
+ * @property [primary] - Optional. Indicates whether this is a primary property.
230
+ */
231
+ interface SimpleArrayStringSchema extends BasicSchema {
232
+ type: "string";
233
+ primary?: boolean;
234
+ }
235
+
236
+ /**
237
+ * A simple object schema whose properties are all string-valued.
238
+ * @interface SimpleObjectSchema
239
+ * @extends BasicSchema
240
+ * @property type - The type of the schema, which is always "object".
241
+ * @property properties - The properties of the object, keyed by property name; each value is a SimpleObjectStringSchema.
242
+ * @property required - The names of the required properties of the object.
243
+ */
244
+ export interface SimpleObjectSchema extends BasicSchema {
245
+ type: "object";
246
+ properties: Record<string, SimpleObjectStringSchema>;
247
+ required: string[];
248
+ }
249
+
250
+ /**
251
+ * A simple array item schema whose properties are all string-valued.
252
+ * @interface SimpleArrayItemSchema
253
+ * @extends BasicSchema
254
+ * @property type - The type of the schema, which is always "object".
255
+ * @property properties - The properties of the array item, keyed by property name; each value is a SimpleArrayStringSchema.
256
+ * @property required - The names of the required properties of the array item.
257
+ */
258
+ export interface SimpleArrayItemSchema extends BasicSchema {
259
+ type: "object";
260
+ properties: Record<string, SimpleArrayStringSchema>;
261
+ required: string[];
262
+ }
263
+
264
+ /**
265
+ * Extracts a structured object from a web page.
266
+ *
267
+ * @param page - The Playwright Page object from which to extract the data.
268
+ * @param options.label - A label for this extraction process, used for billing and monitoring.
269
+ * @param options.entityName - The name of the entity being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
270
+ * @param options.entitySchema - The schema of the entity being extracted.
271
+ * @param options.strategy - Optional. The strategy to use for extraction. If not provided, the HTML strategy with Claude Haiku is used.
272
+ * @param options.prompt - Optional. A prompt to guide the extraction process.
273
+ * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties. It receives the extraction result and returns a list of strings (presumably the names of the properties to invalidate; confirm against the implementation).
274
+ * @param options.variantKey - Optional. A variant key for the extraction process.
275
+ * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
276
+ * @returns A promise that resolves to the extracted object. The declared return type also permits `null`, and individual property values may be `null`.
277
+ * @example
278
+ * ```typescript extractObjectFromPage
279
+ * import { extractObjectFromPage } from "@intuned/sdk/optimized-extractors";
280
+ *
281
+ * await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
282
+ * const book = await extractObjectFromPage(page,
283
+ * {
284
+ * entityName: "book",
285
+ * label: "book-extraction",
286
+ * entitySchema: {
287
+ * type: "object",
288
+ * required: ["name","price","reviews"],
289
+ * properties: {
290
+ * name: {
291
+ * type: "string",
292
+ * description: "book name",
293
+ * },
294
+ * price: {
295
+ * type: "string",
296
+ * description: "book price"
297
+ * },
298
+ * reviews: {
299
+ * type: "string",
300
+ * description: "Number of reviews"
301
+ * }
302
+ *
303
+ * }
304
+ * }
305
+ * },
306
+ * )
307
+ *
308
+ * console.log(book)
309
+ *
310
+ * // output:
311
+ * // { name: 'A Light in the Attic', price: '£51.77', reviews: '0' }
312
+ *
313
+ * ```
314
+ */
315
+ export declare function extractObjectFromPage(
316
+ page: Page,
317
+ options: {
318
+ label: string;
319
+ entityName: string;
320
+ entitySchema: SimpleObjectSchema;
321
+ strategy?: ImageStrategy | HtmlStrategy;
322
+ prompt?: string;
323
+ optionalPropertiesInvalidator?: (
324
+ result: Record<string, string | null> | null
325
+ ) => string[];
326
+ variantKey?: string;
327
+ apiKey?: string;
328
+ }
329
+ ): Promise<Record<string, string | null> | null>;
330
+
331
+ /**
332
+ * Extracts a structured object from a locator.
333
+ *
334
+ * @param locator - The Playwright Locator object from which to extract the data.
335
+ * @param options.label - A label for this extraction process, used for billing and monitoring.
336
+ * @param options.entityName - The name of the entity being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
337
+ * @param options.entitySchema - The schema of the entity being extracted.
338
+ * @param options.strategy - Optional. The strategy to use for extraction. If not provided, the HTML strategy with Claude Haiku is used.
339
+ * @param options.prompt - Optional. A prompt to guide the extraction process.
340
+ * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties. It receives the extraction result and returns a list of strings (presumably the names of the properties to invalidate; confirm against the implementation).
341
+ * @param options.variantKey - Optional. A variant key for the extraction process.
342
+ * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
343
+ * @returns A promise that resolves to the extracted object. The declared return type also permits `null`, and individual property values may be `null`.
344
+ *
345
+ * @example
346
+ * ```typescript extractObjectFromLocator
347
+ * import { extractObjectFromLocator } from "@intuned/sdk/optimized-extractors";
348
+ *
349
+ * await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
350
+ * const book = await extractObjectFromLocator(page.locator(".page_inner"),
351
+ * {
352
+ * entityName: "book",
353
+ * label: "book-extraction",
354
+ * entitySchema: {
355
+ * type: "object",
356
+ * required: ["name","price","reviews"],
357
+ * properties: {
358
+ * name: {
359
+ * type: "string",
360
+ * description: "book name",
361
+ * },
362
+ * price: {
363
+ * type: "string",
364
+ * description: "book price"
365
+ * },
366
+ * reviews: {
367
+ * type: "string",
368
+ * description: "Number of reviews"
369
+ * }
370
+ *
371
+ * }
372
+ * }
373
+ * },
374
+ * )
375
+ *
376
+ * console.log(book)
377
+ *
378
+ * // output:
379
+ * // { name: 'A Light in the Attic', price: '£51.77', reviews: '0' }
380
+ *
381
+ * ```
382
+ */
383
+ export declare function extractObjectFromLocator(
384
+ locator: Locator,
385
+ options: {
386
+ label: string;
387
+ entityName: string;
388
+ entitySchema: SimpleObjectSchema;
389
+ strategy?: ImageStrategy | HtmlStrategy;
390
+ prompt?: string;
391
+ optionalPropertiesInvalidator?: (
392
+ result: Record<string, string | null> | null
393
+ ) => string[];
394
+ variantKey?: string;
395
+ apiKey?: string;
396
+ }
397
+ ): Promise<Record<string, string | null> | null>;