@intuned/browser-dev 2.2.3-unify-sdks.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/.babelrc +21 -0
  2. package/.eslintignore +10 -0
  3. package/.eslintrc.js +39 -0
  4. package/LICENSE +43 -0
  5. package/dist/ai-extractors/AnthropicClient/index.js +23 -0
  6. package/dist/ai-extractors/export.d.js +5 -0
  7. package/dist/ai-extractors/export.d.ts +422 -0
  8. package/dist/ai-extractors/extractStructuredData.js +79 -0
  9. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/constants.js +7 -0
  10. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/errors.js +42 -0
  11. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingClaude.js +149 -0
  12. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingGoogle.js +37 -0
  13. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingOpenAi.js +144 -0
  14. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStrucutredDataUsingAiInstance.js +123 -0
  15. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/index.js +55 -0
  16. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/isItemTableHeaderOrFooter.js +96 -0
  17. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/screenshotHelpers.js +55 -0
  18. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/types.js +5 -0
  19. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/utils.js +53 -0
  20. package/dist/ai-extractors/extractionHelpers/types.js +5 -0
  21. package/dist/ai-extractors/fileExtractors.js +176 -0
  22. package/dist/ai-extractors/index.js +31 -0
  23. package/dist/ai-extractors/jsonSchema.d.js +5 -0
  24. package/dist/ai-extractors/jsonSchema.d.ts +49 -0
  25. package/dist/ai-extractors/openAiClients/index.js +23 -0
  26. package/dist/ai-extractors/validators.js +239 -0
  27. package/dist/browser/ai/export.d.js +3 -0
  28. package/dist/browser/ai/export.d.ts +587 -0
  29. package/dist/browser/ai/extractMarkdown.js +15 -0
  30. package/dist/browser/ai/extractStructuredData.js +231 -0
  31. package/dist/browser/ai/extractStructuredDataUsingAi.js +140 -0
  32. package/dist/browser/ai/extractionHelpers/screenshotHelpers.js +55 -0
  33. package/dist/browser/ai/extractionHelpers/validateSchema.js +148 -0
  34. package/dist/browser/ai/index.d.ts +587 -0
  35. package/dist/browser/ai/index.js +19 -0
  36. package/dist/browser/ai/isPageLoaded.js +67 -0
  37. package/dist/browser/ai/prompt.js +39 -0
  38. package/dist/browser/ai/tests/testCheckAllTypesAreStrings.spec.js +143 -0
  39. package/dist/browser/ai/tests/testExtractStructuredData.spec.js +622 -0
  40. package/dist/browser/ai/tools/index.js +48 -0
  41. package/dist/browser/ai/types/errors.js +67 -0
  42. package/dist/browser/ai/types/models.js +45 -0
  43. package/dist/browser/ai/types/types.js +48 -0
  44. package/dist/browser/ai/validators.js +136 -0
  45. package/dist/common/Logger/index.js +60 -0
  46. package/dist/common/Logger/types.js +5 -0
  47. package/dist/common/SdkError.js +50 -0
  48. package/dist/common/aiModelsValidations.js +50 -0
  49. package/dist/common/browser_scripts.js +2596 -0
  50. package/dist/common/ensureBrowserScripts.js +17 -0
  51. package/dist/common/environmentVariables.js +16 -0
  52. package/dist/common/eventTracking/getAiTrackingHeaders.js +31 -0
  53. package/dist/common/eventTracking/getFileTrackingHeaders.js +23 -0
  54. package/dist/common/extendedTest.js +148 -0
  55. package/dist/common/extractionHelpers.js +19 -0
  56. package/dist/common/formatZodError.js +18 -0
  57. package/dist/common/fuzzySearch/fuzzySearch.test.js +250 -0
  58. package/dist/common/fuzzySearch/levenshtein-search.js +298 -0
  59. package/dist/common/fuzzySearch/utils.js +23 -0
  60. package/dist/common/getModelProvider.js +18 -0
  61. package/dist/common/getSimplifiedHtml.js +122 -0
  62. package/dist/common/hashObject.js +32 -0
  63. package/dist/common/html2markdown/convertElementToMarkdown.js +469 -0
  64. package/dist/common/html2markdown/index.js +19 -0
  65. package/dist/common/jwtTokenManager.js +18 -0
  66. package/dist/common/loadRuntime.js +16 -0
  67. package/dist/common/locatorHelpers.js +41 -0
  68. package/dist/common/matching/collectStrings.js +32 -0
  69. package/dist/common/matching/levenshtein.js +40 -0
  70. package/dist/common/matching/matching.js +317 -0
  71. package/dist/common/matching/types.js +1 -0
  72. package/dist/common/noEmpty.js +9 -0
  73. package/dist/common/saveSnapshotWithExamples.js +60 -0
  74. package/dist/common/tests/testEnsureBrowserScript.spec.js +31 -0
  75. package/dist/common/xpathMapping.js +107 -0
  76. package/dist/helpers/downloadFile.js +125 -0
  77. package/dist/helpers/export.d.js +1 -0
  78. package/dist/helpers/export.d.ts +1294 -0
  79. package/dist/helpers/extractMarkdown.js +35 -0
  80. package/dist/helpers/filterEmptyValues.js +54 -0
  81. package/dist/helpers/gotoUrl.js +93 -0
  82. package/dist/helpers/index.d.ts +1294 -0
  83. package/dist/helpers/index.js +115 -0
  84. package/dist/helpers/processDate.js +25 -0
  85. package/dist/helpers/resolveUrl.js +63 -0
  86. package/dist/helpers/sanitizeHtml.js +73 -0
  87. package/dist/helpers/saveFileToS3.js +46 -0
  88. package/dist/helpers/scrollToLoadContent.js +50 -0
  89. package/dist/helpers/tests/extendedTest.js +130 -0
  90. package/dist/helpers/tests/testDownloadFile.spec.js +197 -0
  91. package/dist/helpers/tests/testFilterEmptyValues.spec.js +151 -0
  92. package/dist/helpers/tests/testGoToUrl.spec.js +37 -0
  93. package/dist/helpers/tests/testIsPageLoaded.spec.js +285 -0
  94. package/dist/helpers/tests/testProcessDate.spec.js +13 -0
  95. package/dist/helpers/tests/testResolveUrl.spec.js +341 -0
  96. package/dist/helpers/tests/testSanitizeHtml.spec.js +330 -0
  97. package/dist/helpers/tests/testSimplifyHtml.spec.js +251 -0
  98. package/dist/helpers/tests/testValidateDataUsingSchema.spec.js +380 -0
  99. package/dist/helpers/tests/testWaitForDomSettled.spec.js +169 -0
  100. package/dist/helpers/tests/testWaitForNetworkIdle.spec.js +115 -0
  101. package/dist/helpers/types/Attachment.js +81 -0
  102. package/dist/helpers/types/CustomTypeRegistry.js +48 -0
  103. package/dist/helpers/types/RunEnvironment.js +18 -0
  104. package/dist/helpers/types/ValidationError.js +17 -0
  105. package/dist/helpers/types/index.js +51 -0
  106. package/dist/helpers/uploadFileToS3.js +153 -0
  107. package/dist/helpers/utils/getS3Client.js +21 -0
  108. package/dist/helpers/utils/index.js +73 -0
  109. package/dist/helpers/utils/isDownload.js +10 -0
  110. package/dist/helpers/utils/isGenerateCodeMode.js +9 -0
  111. package/dist/helpers/utils/isLocator.js +9 -0
  112. package/dist/helpers/utils/jwtTokenManager.js +18 -0
  113. package/dist/helpers/validateDataUsingSchema.js +119 -0
  114. package/dist/helpers/waitForDomSettled.js +182 -0
  115. package/dist/helpers/waitForNetworkIdle.js +191 -0
  116. package/dist/index.d.js +82 -0
  117. package/dist/index.d.ts +11 -0
  118. package/dist/index.js +84 -0
  119. package/dist/intunedServices/ApiGateway/aiApiGateway.js +87 -0
  120. package/dist/intunedServices/ApiGateway/factory.js +13 -0
  121. package/dist/intunedServices/ApiGateway/providers/Anthropic.js +26 -0
  122. package/dist/intunedServices/ApiGateway/providers/Gemini.js +29 -0
  123. package/dist/intunedServices/ApiGateway/providers/OpenAI.js +29 -0
  124. package/dist/intunedServices/ApiGateway/tests/testApiGateway.spec.js +221 -0
  125. package/dist/intunedServices/ApiGateway/types.js +11 -0
  126. package/dist/intunedServices/cache/cache.js +61 -0
  127. package/dist/intunedServices/cache/index.js +12 -0
  128. package/dist/intunedServices/cache/tests/testCache.spec.js +117 -0
  129. package/dist/optimized-extractors/common/buildExamplesPrompt.js +12 -0
  130. package/dist/optimized-extractors/common/buildImagesFromPage.js +55 -0
  131. package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +149 -0
  132. package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +37 -0
  133. package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +145 -0
  134. package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +122 -0
  135. package/dist/optimized-extractors/common/findTableHeaders.js +175 -0
  136. package/dist/optimized-extractors/common/index.js +55 -0
  137. package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +97 -0
  138. package/dist/optimized-extractors/common/matching/matching.js +212 -0
  139. package/dist/optimized-extractors/common/matching/matching.test.js +655 -0
  140. package/dist/optimized-extractors/common/matching/types.js +18 -0
  141. package/dist/optimized-extractors/common/matching/utils.js +184 -0
  142. package/dist/optimized-extractors/common/utils.js +58 -0
  143. package/dist/optimized-extractors/export.d.js +5 -0
  144. package/dist/optimized-extractors/export.d.ts +397 -0
  145. package/dist/optimized-extractors/extractArray.js +120 -0
  146. package/dist/optimized-extractors/extractObject.js +104 -0
  147. package/dist/optimized-extractors/index.d.ts +397 -0
  148. package/dist/optimized-extractors/index.js +31 -0
  149. package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +312 -0
  150. package/dist/optimized-extractors/listExtractionHelpers/__tests__/findSetOfXpathsToCreateAnArrayExtractor.test.js +22 -0
  151. package/dist/optimized-extractors/listExtractionHelpers/__tests__/getContainerElement.test.js +21 -0
  152. package/dist/optimized-extractors/listExtractionHelpers/__tests__/partOfSameArrayXpath.test.js +42 -0
  153. package/dist/optimized-extractors/listExtractionHelpers/__tests__/verifyThatAllXpathsArePartOfSameArray.test.js +9 -0
  154. package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +152 -0
  155. package/dist/optimized-extractors/listExtractionHelpers/errors.js +46 -0
  156. package/dist/optimized-extractors/listExtractionHelpers/getListMatches.js +14 -0
  157. package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +240 -0
  158. package/dist/optimized-extractors/listExtractionHelpers/typesAndSchema.js +5 -0
  159. package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +277 -0
  160. package/dist/optimized-extractors/listExtractionHelpers/utils/extractStructuredListUsingAi.js +44 -0
  161. package/dist/optimized-extractors/listExtractionHelpers/utils/getListContainerXpath.js +94 -0
  162. package/dist/optimized-extractors/listExtractionHelpers/utils/getRelativeContainerXpathSelector.js +20 -0
  163. package/dist/optimized-extractors/listExtractionHelpers/utils/getSimplifiedHtmlPerListItem.js +21 -0
  164. package/dist/optimized-extractors/listExtractionHelpers/utils/tablesUtils.js +48 -0
  165. package/dist/optimized-extractors/listExtractionHelpers/utils/validateOptions.js +52 -0
  166. package/dist/optimized-extractors/models/anthropicModel.js +23 -0
  167. package/dist/optimized-extractors/models/openaiModel.js +23 -0
  168. package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +73 -0
  169. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/checksumUtils.test.js +103 -0
  170. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +107 -0
  171. package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromPage.spec.js +107 -0
  172. package/dist/optimized-extractors/objectExtractionHelpers/calculateObjectExampleHash.js +28 -0
  173. package/dist/optimized-extractors/objectExtractionHelpers/captureSnapshot.js +26 -0
  174. package/dist/optimized-extractors/objectExtractionHelpers/checksumUtils.js +32 -0
  175. package/dist/optimized-extractors/objectExtractionHelpers/constants.js +7 -0
  176. package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +106 -0
  177. package/dist/optimized-extractors/objectExtractionHelpers/errors.js +42 -0
  178. package/dist/optimized-extractors/objectExtractionHelpers/findDomMatches.js +54 -0
  179. package/dist/optimized-extractors/objectExtractionHelpers/getSimplifiedHtml.js +122 -0
  180. package/dist/optimized-extractors/objectExtractionHelpers/typesAndSchemas.js +5 -0
  181. package/dist/optimized-extractors/objectExtractionHelpers/validateDynamicObjectExtractorOptions.js +52 -0
  182. package/dist/optimized-extractors/types/aiModelsValidation.js +45 -0
  183. package/dist/optimized-extractors/types/errors.js +42 -0
  184. package/dist/optimized-extractors/types/jsonSchema.d.js +5 -0
  185. package/dist/optimized-extractors/types/jsonSchema.d.ts +50 -0
  186. package/dist/optimized-extractors/types/types.js +5 -0
  187. package/dist/optimized-extractors/validators.js +152 -0
  188. package/dist/vite-env.d.js +1 -0
  189. package/dist/vite-env.d.ts +9 -0
  190. package/docs.md +14 -0
  191. package/how-to-run-tests.md +10 -0
  192. package/intuned-runtime-setup.md +13 -0
  193. package/package.json +124 -0
  194. package/tsconfig.eslint.json +5 -0
  195. package/tsconfig.json +26 -0
@@ -0,0 +1,587 @@
1
+ import { Locator, Page } from "playwright-core";
2
+ import { ObjectSchema } from "./jsonSchema";
3
+ import { JSONSchema7TypeName } from "json-schema";
4
+
5
+ /**
6
+ * Base schema interface that all JSON schema types extend from.
7
+ * Provides common properties like type and description.
8
+ *
9
+ * @interface BaseSchema
10
+ */
11
+ export interface BaseSchema {
12
+ /** The JSON schema type(s) for this schema definition */
13
+ type: JSONSchema7TypeName | JSONSchema7TypeName[];
14
+ /** Optional description of what this schema represents */
15
+ description?: string;
16
+ }
17
+
18
+ /**
19
+ * Schema definition for string values with validation constraints.
20
+ *
21
+ * @interface StringSchema
22
+ * @extends BaseSchema
23
+ * @example
24
+ * ```typescript
25
+ * const nameSchema: StringSchema = {
26
+ * type: "string",
27
+ * minLength: 2,
28
+ * maxLength: 50,
29
+ * pattern: "^[A-Za-z\\s]+$",
30
+ * description: "Person's full name"
31
+ * };
32
+ * ```
33
+ */
34
+ export interface StringSchema extends BaseSchema {
35
+ /** Must be "string" for string schemas */
36
+ type: "string";
37
+ /** Array of allowed string values (enumeration) */
38
+ enum?: string[];
39
+ /** Maximum allowed string length */
40
+ maxLength?: number;
41
+ /** Minimum required string length */
42
+ minLength?: number;
43
+ /** Regular expression pattern the string must match */
44
+ pattern?: string;
45
+ }
46
+
47
+ /**
48
+ * Schema definition for numeric values (numbers and integers) with validation constraints.
49
+ *
50
+ * @interface NumberSchema
51
+ * @extends BaseSchema
52
+ * @example
53
+ * ```typescript
54
+ * const ageSchema: NumberSchema = {
55
+ * type: "integer",
56
+ * minimum: 0,
57
+ * maximum: 150,
58
+ * description: "Person's age in years"
59
+ * };
60
+ * ```
61
+ */
62
+ export interface NumberSchema extends BaseSchema {
63
+ /** Must be "number" or "integer" for numeric schemas */
64
+ type: "number" | "integer";
65
+ /** Number must be a multiple of this value */
66
+ multipleOf?: number;
67
+ /** Maximum allowed value (inclusive) */
68
+ maximum?: number;
69
+ /** Maximum allowed value (exclusive) */
70
+ exclusiveMaximum?: number;
71
+ /** Minimum allowed value (inclusive) */
72
+ minimum?: number;
73
+ /** Minimum allowed value (exclusive) */
74
+ exclusiveMinimum?: number;
75
+ }
76
+
77
+ /**
78
+ * Schema definition for boolean values.
79
+ *
80
+ * @interface BooleanSchema
81
+ * @extends BaseSchema
82
+ * @example
83
+ * ```typescript
84
+ * const isActiveSchema: BooleanSchema = {
85
+ * type: "boolean",
86
+ * description: "Whether the user account is active"
87
+ * };
88
+ * ```
89
+ */
90
+ export interface BooleanSchema extends BaseSchema {
91
+ /** Must be "boolean" for boolean schemas */
92
+ type: "boolean";
93
+ }
94
+
95
+ /**
96
+ * Schema definition for array values with item validation and constraints.
97
+ *
98
+ * @interface ArraySchema
99
+ * @extends BaseSchema
100
+ * @example
101
+ * ```typescript
102
+ * const tagsSchema: ArraySchema = {
103
+ * type: "array",
104
+ * items: { type: "string" },
105
+ * minItems: 1,
106
+ * maxItems: 10,
107
+ * uniqueItems: true,
108
+ * description: "List of tags"
109
+ * };
110
+ * ```
111
+ */
112
+ export interface ArraySchema extends BaseSchema {
113
+ /** Must be "array" for array schemas */
114
+ type: "array";
115
+ /** Schema definition for array items */
116
+ items: JSONSchema;
117
+ /** Maximum number of items allowed */
118
+ maxItems?: number;
119
+ /** Minimum number of items required */
120
+ minItems?: number;
121
+ /** Whether all items must be unique */
122
+ uniqueItems?: boolean;
123
+ }
124
+
125
+ /**
126
+ * Schema definition for object values with property validation and constraints.
127
+ * NOTE(review): this file also imports a symbol named ObjectSchema from "./jsonSchema" at the top; confirm whether that import should be removed or aliased, since it shares its name with this local declaration.
128
+ * @interface ObjectSchema
129
+ * @extends BaseSchema
130
+ * @example
131
+ * ```typescript
132
+ * const userSchema: ObjectSchema = {
133
+ * type: "object",
134
+ * properties: {
135
+ * name: { type: "string" },
136
+ * email: { type: "string", pattern: "^[^@]+@[^@]+\\.[^@]+$" },
137
+ * age: { type: "integer", minimum: 0 }
138
+ * },
139
+ * required: ["name", "email"],
140
+ * description: "User profile information"
141
+ * };
142
+ * ```
143
+ */
144
+ export interface ObjectSchema extends BaseSchema {
145
+ /** Must be "object" for object schemas */
146
+ type: "object";
147
+ /** Schema definitions for object properties, keyed by property name. This member is required (it is not marked optional). */
148
+ properties: { [key: string]: JSONSchema };
149
+ /** Array of required property names */
150
+ required?: string[];
151
+ /** Maximum number of properties allowed */
152
+ maxProperties?: number;
153
+ /** Minimum number of properties required */
154
+ minProperties?: number;
155
+ }
156
+ /**
157
+ * Extract structured data from web pages using AI-powered content analysis.
158
+ *
159
+ * This function provides intelligent data extraction from web pages using various strategies
160
+ * including HTML parsing, image analysis, and Markdown conversion. It supports extraction
161
+ * from entire pages or specific elements, with built-in caching and retry mechanisms.
162
+ *
163
+ * @param options - Configuration object containing extraction parameters
164
+ * @param {Page | Locator} options.source - Playwright Page object to extract data from the entire page, or Locator object to extract data from a specific element
165
+ * @param {SUPPORTED_MODELS} [options.model] - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
166
+ * @param {string} [options.strategy] - Type of extraction: "HTML", "IMAGE", or "MARKDOWN"
167
+ * @param {JSONSchema} options.dataSchema - [JSONSchema](../interfaces/JSONSchema) defining the structure of the data to extract
168
+ * @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context
169
+ * @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account)
170
+ * @param {boolean} [options.enableDomMatching=false] - Whether to enable DOM element matching during extraction. Defaults to false. When set to true, all types in the schema must be strings to match with the DOM elements. The extracted results will be matched with the DOM elements and returned, then cached in a smart fashion so that the next time the same data is extracted, the result will be returned from the cache even if the DOM has minor changes.
171
+ * @param {boolean} [options.enableCache=true] - Whether to enable caching of the extracted data. default true
172
+ * @param {number} [options.maxRetries=3] - Maximum number of retry attempts on failure. Defaults to 3.
173
+ *
174
+ * @returns Promise resolving to the extracted structured data matching the provided schema
175
+
176
+ * @example
177
+ * ```typescript Extract Product Information from Entire Page
178
+ * import { extractStructuredData } from './extractors';
179
+ *
180
+ * const productSchema = {
181
+ * type: "object",
182
+ * properties: {
183
+ * name: { type: "string" },
184
+ * price: { type: "string" },
185
+ * description: { type: "string" },
186
+ * inStock: { type: "boolean" }
187
+ * },
188
+ * required: ["name", "price"]
189
+ * };
190
+ *
191
+ * const product = await extractStructuredData({
192
+ * source: page,
193
+ * strategy: "HTML",
194
+ * dataSchema: productSchema,
195
+ * model: "gpt-4",
196
+ * enableCache: true,
197
+ * prompt: "Extract product details from this e-commerce page"
198
+ * });
199
+ *
200
+ * console.log(`Found product: ${product.name} - ${product.price}`);
201
+ * ```
202
+ *
203
+ * @example
204
+ * ```typescript Extract Article Data from Specific Element
205
+ * import { extractStructuredData } from './extractors';
206
+ *
207
+ * const articleSchema = {
208
+ * type: "object",
209
+ * properties: {
210
+ * title: { type: "string" },
211
+ * author: { type: "string" },
212
+ * publishDate: { type: "string" },
213
+ * content: { type: "string" },
214
+ * tags: { type: "array", items: { type: "string" } }
215
+ * },
216
+ * required: ["title", "content"]
217
+ * };
218
+ *
219
+ * const articleContainer = page.locator("article.main-content");
220
+ * const article = await extractStructuredData({
221
+ * source: articleContainer,
222
+ * strategy: "MARKDOWN",
223
+ * dataSchema: articleSchema,
224
+ * model: "claude-3-5-sonnet-20240620",
225
+ * enableCache: true,
226
+ * maxRetries: 5
227
+ * });
228
+ *
229
+ * console.log(`Article: ${article.title} by ${article.author}`);
230
+ * ```
231
+ *
232
+ * @example
233
+ * ```typescript Extract Data from Screenshots using Image Strategy
234
+ * import { extractStructuredData } from './extractors';
235
+ *
236
+ * const chartSchema = {
237
+ * type: "object",
238
+ * properties: {
239
+ * title: { type: "string" },
240
+ * dataPoints: {
241
+ * type: "array",
242
+ * items: {
243
+ * type: "object",
244
+ * properties: {
245
+ * label: { type: "string" },
246
+ * value: { type: "number" }
247
+ * }
248
+ * }
249
+ * }
250
+ * }
251
+ * };
252
+ *
253
+ * const chartElement = page.locator("#data-visualization");
254
+ * const chartData = await extractStructuredData({
255
+ * source: chartElement,
255
+ * strategy: "IMAGE",
256
+ * dataSchema: chartSchema,
257
+ * model: "gpt-4o",
258
+ * enableCache: true,
260
+ * prompt: "Extract the chart title and all data points with their values"
261
+ * });
262
+ *
263
+ * console.log(`Chart: ${chartData.title}`);
264
+ * chartData.dataPoints.forEach(point => {
265
+ * console.log(`${point.label}: ${point.value}`);
266
+ * });
267
+ * ```
268
+ */
269
+ export declare function extractStructuredData(options: {
270
+ source: Page | Locator;
271
+ dataSchema: JSONSchema;
272
+ prompt?: string;
273
+ strategy?: "IMAGE" | "MARKDOWN" | "HTML";
274
+ model?: SUPPORTED_MODELS;
275
+ apiKey?: string;
276
+ enableDomMatching?: boolean;
277
+ enableCache?: boolean;
278
+ maxRetries?: number;
279
+ }): Promise<any>;
280
+
281
+ /**
282
+ * This type defines the supported AI models for data extraction.
283
+ * It includes models from OpenAI, Anthropic, and Google Gemini.
284
+ * The models are used in the extraction strategies to process and analyze the content of web pages or elements.
285
+ * @type SUPPORTED_MODELS
286
+ */
287
+ type SUPPORTED_MODELS =
288
+ | "claude-opus-4-1-20250805"
289
+ | "claude-opus-4-20250514"
290
+ | "claude-sonnet-4-20250514"
291
+ | "claude-3-7-sonnet-20250219"
292
+ | "claude-3-5-sonnet-20240620"
293
+ | "claude-3-5-haiku-latest"
294
+ | "gpt-5"
295
+ | "gpt-5-mini"
296
+ | "gpt-5-nano"
297
+ | "gpt-5-chat"
298
+ | "gpt-5-chat-latest"
299
+ | "gpt-5-2025-08-07"
300
+ | "gpt-5-mini-2025-08-07"
301
+ | "gpt-5-nano-2025-08-07"
302
+ | "gpt-4.1"
303
+ | "gpt-4.1-mini"
304
+ | "gpt-4.1-nano"
305
+ | "o4-mini"
306
+ | "o3-mini"
307
+ | "o3"
308
+ | "o1-mini"
309
+ | "o1-preview"
310
+ | "gpt-4o-mini"
311
+ | "gpt-4o-mini-2024-07-18"
312
+ | "gpt-4o"
313
+ | "gpt-4o-2024-08-06"
314
+ | "gpt-4o-2024-05-13"
316
+ | "gpt-4-turbo"
317
+ | "gpt-4-turbo-preview"
318
+ | "gpt-4-0125-preview"
319
+ | "gpt-4-1106-preview"
320
+ | "gpt-3.5-turbo-1106"
321
+ | "gpt-3.5-turbo"
322
+ | "gpt-3.5-turbo-0301"
323
+ | "gpt-3.5-turbo-0613"
324
+ | "gpt-3.5-turbo-16k"
325
+ | "gpt-3.5-turbo-16k-0613"
326
+ | "gpt-4"
327
+ | "gpt-4-0314"
328
+ | "gpt-4-0613"
329
+ | "gpt-4-32k"
330
+ | "gpt-4-32k-0314"
331
+ | "gpt-4-32k-0613"
332
+ | "gemini-pro"
333
+ | "gemini-1.5-pro-latest"
334
+ | "gemini-2.0-flash"
335
+ | "gemini-2.0-flash-exp"
336
+ | "gemini-2.0-flash-lite-preview-02-05";
337
+
338
+ /**
339
+ * Represents a JSON Schema definition for validating data structures.
340
+ * Supports various schema types including string, number, boolean, array, and object schemas
341
+ * with their respective validation rules and constraints.
342
+ *
343
+ * This type is a union of different schema types:
344
+ * - StringSchema: For string validation with length and pattern constraints
345
+ * - NumberSchema: For number/integer validation with range constraints
346
+ * - BooleanSchema: For boolean values
347
+ * - ArraySchema: For array validation with item constraints
348
+ * - ObjectSchema: For object validation with property constraints
349
+ *
350
+ * @type JSONSchema
351
+ * @example
352
+ * ```typescript String Schema
353
+ * const stringSchema: JSONSchema = {
354
+ * type: "string",
355
+ * minLength: 3,
356
+ * maxLength: 50,
357
+ * pattern: "^[A-Za-z]+$"
358
+ * };
359
+ * ```
360
+ *
361
+ * @example
362
+ * ```typescript Number Schema
363
+ * const numberSchema: JSONSchema = {
364
+ * type: "number",
365
+ * minimum: 0,
366
+ * maximum: 100,
367
+ * multipleOf: 0.5
368
+ * };
369
+ * ```
370
+ *
371
+ * @example
372
+ * ```typescript Array Schema
373
+ * const arraySchema: JSONSchema = {
374
+ * type: "array",
375
+ * items: {
376
+ * type: "string"
377
+ * },
378
+ * minItems: 1,
379
+ * maxItems: 10,
380
+ * uniqueItems: true
381
+ * };
382
+ * ```
383
+ *
384
+ * @example
385
+ * ```typescript Object Schema
386
+ * const objectSchema: JSONSchema = {
387
+ * type: "object",
388
+ * properties: {
389
+ * name: { type: "string" },
390
+ * age: { type: "number", minimum: 0 },
391
+ * email: { type: "string", pattern: "^[^@]+@[^@]+\\.[^@]+$" }
392
+ * },
393
+ * required: ["name", "email"]
394
+ * };
395
+ * ```
396
+ */
397
+ export type JSONSchema =
398
+ | StringSchema
399
+ | NumberSchema
400
+ | BooleanSchema
401
+ | ArraySchema
402
+ | ObjectSchema
403
+ | BaseSchema;
404
+
405
+ /**
406
+ * @interface HTMLStrategy
407
+ * Represents a strategy for extracting data from HTML content using AI models.
408
+ *
409
+ * This strategy processes the HTML structure of a page or element, focusing on semantic attributes
410
+ * for better context understanding. It automatically filters and includes only relevant HTML attributes:
411
+ * `aria-label`, `data-name`, `name`, `type`, `placeholder`, `value`, `role`, `title`, `href`, `id`, `alt`
412
+ *
413
+ * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
414
+ * @param {string} type - Type of extraction: 'HTML'
415
+ *
416
+ * @example
417
+ * ```typescript Basic HTML Extraction
418
+ * const htmlStrategy: HTMLStrategy = {
419
+ * type: "HTML",
420
+ * model: "gpt-4"
421
+ * };
422
+ *
423
+ * const data = await extractStructuredData({
424
+ * page: page,
425
+ * strategy: htmlStrategy,
426
+ * // ... other options
427
+ * });
428
+ * ```
429
+ *
430
+ * @example
431
+ * ```typescript Advanced HTML Extraction
432
+ * const htmlStrategy: HTMLStrategy = {
433
+ * type: "HTML",
434
+ * model: "claude-3-5-sonnet-20240620"
435
+ * };
436
+ *
437
+ * // Extract product details from a specific container
438
+ * const productData = await extractStructuredData({
439
+ * locator: page.locator('.product-container'),
440
+ * strategy: htmlStrategy,
441
+ * dataSchema: productSchema,
442
+ * entityName: "product",
443
+ * label: "product-extractor"
444
+ * });
445
+ * ```
446
+ */
447
+ export interface HTMLStrategy {
448
+ /** The AI model to use for content analysis and data extraction */
449
+ model: SUPPORTED_MODELS;
450
+
451
+ /** Strategy type identifier, must be "HTML" for HTML-based extraction */
452
+ type: "HTML";
453
+ }
454
+
455
+ /**
456
+ * @interface ImageStrategy
457
+ * Represents a strategy for extracting data from visual content using AI vision models.
458
+ *
459
+ * This strategy captures screenshots of the target page or element and uses AI vision
460
+ * capabilities to extract information. It's particularly useful for:
461
+ * - Data embedded in images or charts
462
+ * - Content with complex visual layouts
463
+ * - Information that's not directly accessible in the HTML
464
+ *
465
+ * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
466
+ * @param {string} type - Type of extraction: 'IMAGE'
467
+ * @example
468
+ * ```typescript Basic Image Analysis
469
+ * const imageStrategy: ImageStrategy = {
470
+ * type: "IMAGE",
471
+ * model: "gpt-4o"
472
+ * };
473
+ *
474
+ * const chartData = await extractStructuredData({
475
+ * locator: page.locator('.chart-container'),
476
+ * strategy: imageStrategy,
477
+ * dataSchema: chartSchema,
478
+ * entityName: "chart",
479
+ * label: "chart-data-extractor"
480
+ * });
481
+ * ```
482
+ *
483
+ * @example
484
+ * ```typescript Complex Visual Extraction
485
+ * const imageStrategy: ImageStrategy = {
486
+ * type: "IMAGE",
487
+ * model: "claude-3-5-sonnet-20240620"
488
+ * };
489
+ *
490
+ * // Extract data from a complex dashboard
491
+ * const dashboardData = await extractStructuredData({
492
+ * page: page,
493
+ * strategy: imageStrategy,
494
+ * dataSchema: dashboardSchema,
495
+ * entityName: "dashboard",
496
+ * label: "dashboard-metrics",
497
+ * prompt: "Extract all metrics and their values from this dashboard view"
498
+ * });
499
+ * ```
500
+ */
501
+ export interface ImageStrategy {
502
+ /** The AI vision model to use for image analysis and data extraction */
503
+ model: SUPPORTED_MODELS;
504
+
505
+ /** Strategy type identifier, must be "IMAGE" for image-based extraction */
506
+ type: "IMAGE";
507
+ }
508
+
509
+ /**
510
+ * @interface MarkDownStrategy
511
+ * Represents a strategy for extracting data from content after converting it to Markdown format.
512
+ *
513
+ * This strategy first converts the HTML content to semantic Markdown before processing,
514
+ * which helps in:
515
+ * - Preserving content hierarchy and structure
516
+ * - Removing unnecessary styling and formatting
517
+ * - Focusing on semantic meaning of the content
518
+ * - Handling content-heavy pages more efficiently
519
+ *
520
+ * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
521
+ * @param {string} type - Type of extraction: 'MARKDOWN'
522
+ * @example
523
+ * ```typescript Basic Article Extraction
524
+ * const markdownStrategy: MarkDownStrategy = {
525
+ * type: "MARKDOWN",
526
+ * model: "gpt-4"
527
+ * };
528
+ *
529
+ * const articleData = await extractStructuredData({
530
+ * locator: page.locator('article'),
531
+ * strategy: markdownStrategy,
532
+ * dataSchema: articleSchema,
533
+ * entityName: "article",
534
+ * label: "article-content"
535
+ * });
536
+ * ```
537
+ *
538
+ * @example
539
+ * ```typescript Documentation Extraction
540
+ * const markdownStrategy: MarkDownStrategy = {
541
+ * type: "MARKDOWN",
542
+ * model: "claude-3-5-sonnet-20240620"
543
+ * };
544
+ *
545
+ * // Extract structured data from documentation pages
546
+ * const docData = await extractStructuredData({
547
+ * page: page,
548
+ * strategy: markdownStrategy,
549
+ * dataSchema: documentationSchema,
550
+ * entityName: "documentation",
551
+ * label: "docs-extractor",
552
+ * prompt: "Extract main concepts, code examples, and API references"
553
+ * });
554
+ * ```
555
+ */
556
+ export interface MarkDownStrategy {
557
+ /** The AI model to use for processing the Markdown content */
558
+ model: SUPPORTED_MODELS;
559
+
560
+ /** Strategy type identifier, must be "MARKDOWN" for Markdown-based extraction */
561
+ type: "MARKDOWN";
562
+ }
563
+
564
+ /**
565
+ * @interface HtmlStrategy
566
+ * Represents a strategy for extracting data from HTML content using AI models. It declares the same two members (`type: "HTML"` and `model`) as the HTMLStrategy interface in this file. NOTE(review): confirm whether both spellings need to stay exported.
567
+ * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
568
+ * @param {string} type - Type of extraction: 'HTML'
569
+ * @example
570
+ * ```typescript Basic HTML Extraction
571
+ * const htmlStrategy: HtmlStrategy = {
572
+ * type: "HTML",
573
+ * model: "gpt-4"
574
+ * };
575
+ * ```
576
+ * @example
577
+ * ```typescript Advanced HTML Extraction
578
+ * const htmlStrategy: HtmlStrategy = {
579
+ * type: "HTML",
580
+ * model: "claude-3-5-sonnet-20240620"
581
+ * };
582
+ * ```
583
+ */
584
+ export interface HtmlStrategy {
585
+ type: "HTML"; // Strategy type identifier; the only allowed value is "HTML"
586
+ model: SUPPORTED_MODELS; // AI model to use, restricted to the SUPPORTED_MODELS literals
587
+ }
@@ -0,0 +1,15 @@
1
+ "use strict";
2
+
3
+ Object.defineProperty(exports, "__esModule", {
4
+ value: true
5
+ });
6
+ exports.extractMarkdownFromLocator = extractMarkdownFromLocator;
7
+ exports.extractMarkdownFromPage = extractMarkdownFromPage;
8
+ var _html2markdown = require("../../common/html2markdown");
9
+ async function extractMarkdownFromPage(page) { // Converts a whole page to Markdown by handing its "html" locator to convertLocatorToMarkdown; `page` must expose a locator() method.
10
+ const locator = page.locator("html"); // select with the "html" selector so the conversion starts from the document root
11
+ return (0, _html2markdown.convertLocatorToMarkdown)(locator); // (0, fn)(...) invokes the imported helper without the module object as `this`; its result is returned unchanged
12
+ }
13
+ async function extractMarkdownFromLocator(locator) { // Converts the content targeted by `locator` to Markdown; a thin wrapper over convertLocatorToMarkdown.
14
+ return (0, _html2markdown.convertLocatorToMarkdown)(locator); // the locator is forwarded as-is and the helper's result is returned unchanged
15
+ }