@intuned/browser-dev 2.2.3-unify-sdks.26 → 2.2.3-unify-sdks.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/ai/export.d.ts +40 -239
  2. package/dist/ai/index.d.ts +40 -239
  3. package/dist/ai/isPageLoaded.js +8 -8
  4. package/dist/ai/tests/testExtractStructuredData.spec.js +1 -1
  5. package/dist/helpers/export.d.js +5 -1
  6. package/dist/helpers/export.d.ts +277 -464
  7. package/dist/helpers/gotoUrl.js +2 -1
  8. package/dist/helpers/index.d.ts +277 -464
  9. package/dist/helpers/index.js +3 -3
  10. package/dist/helpers/resolveUrl.js +4 -3
  11. package/dist/helpers/scrollToLoadContent.js +10 -5
  12. package/dist/helpers/tests/testIsPageLoaded.spec.js +12 -6
  13. package/dist/helpers/tests/testWaitForDomSettled.spec.js +36 -41
  14. package/dist/helpers/tests/{testWaitForNetworkIdle.spec.js → testWithNetworkIdleWait.spec.js} +44 -45
  15. package/dist/helpers/types/Attachment.js +43 -9
  16. package/dist/helpers/uploadFileToS3.js +5 -5
  17. package/dist/helpers/utils/getS3Client.js +3 -3
  18. package/dist/helpers/waitForDomSettled.js +4 -97
  19. package/dist/helpers/withNetworkIdleWait.js +91 -0
  20. package/dist/optimized-extractors/export.d.ts +4 -4
  21. package/dist/optimized-extractors/index.d.ts +4 -4
  22. package/generated-docs/ai/functions/extractStructuredData.mdx +168 -0
  23. package/generated-docs/ai/functions/isPageLoaded.mdx +139 -0
  24. package/generated-docs/ai/interfaces/ArraySchema.mdx +33 -0
  25. package/generated-docs/ai/interfaces/BasicSchema.mdx +14 -0
  26. package/generated-docs/ai/interfaces/BooleanSchema.mdx +25 -0
  27. package/generated-docs/ai/interfaces/NumberSchema.mdx +32 -0
  28. package/generated-docs/ai/interfaces/ObjectSchema.mdx +36 -0
  29. package/generated-docs/ai/interfaces/StringSchema.mdx +32 -0
  30. package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +12 -0
  31. package/generated-docs/helpers/functions/downloadFile.mdx +95 -0
  32. package/generated-docs/helpers/functions/extractMarkdown.mdx +53 -0
  33. package/generated-docs/helpers/functions/filterEmptyValues.mdx +48 -0
  34. package/generated-docs/helpers/functions/goToUrl.mdx +97 -0
  35. package/generated-docs/helpers/functions/processDate.mdx +52 -0
  36. package/generated-docs/helpers/functions/resolveUrl.mdx +79 -0
  37. package/generated-docs/helpers/functions/sanitizeHtml.mdx +113 -0
  38. package/generated-docs/helpers/functions/saveFileToS3.mdx +144 -0
  39. package/generated-docs/helpers/functions/scrollToLoadContent.mdx +87 -0
  40. package/generated-docs/helpers/functions/uploadFileToS3.mdx +104 -0
  41. package/generated-docs/helpers/functions/validateDataUsingSchema.mdx +66 -0
  42. package/generated-docs/helpers/functions/waitForDomSettled.mdx +95 -0
  43. package/generated-docs/helpers/functions/withNetworkIdle.mdx +93 -0
  44. package/generated-docs/helpers/interfaces/Attachment.mdx +45 -0
  45. package/generated-docs/helpers/interfaces/S3Configs.mdx +36 -0
  46. package/generated-docs/helpers/interfaces/S3UploadOptions.mdx +40 -0
  47. package/generated-docs/helpers/interfaces/SanitizeHtmlOptions.mdx +22 -0
  48. package/generated-docs/helpers/type-aliases/AttachmentType.mdx +12 -0
  49. package/generated-docs/helpers/type-aliases/DataInput.mdx +11 -0
  50. package/generated-docs/helpers/type-aliases/DataObject.mdx +11 -0
  51. package/generated-docs/helpers/type-aliases/S3UploadableFile.mdx +10 -0
  52. package/generated-docs/helpers/type-aliases/Trigger.mdx +13 -0
  53. package/package.json +3 -19
  54. package/dist/ai-extractors/AnthropicClient/index.js +0 -23
  55. package/dist/ai-extractors/export.d.js +0 -5
  56. package/dist/ai-extractors/export.d.ts +0 -425
  57. package/dist/ai-extractors/extractStructuredData.js +0 -79
  58. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/constants.js +0 -7
  59. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/errors.js +0 -42
  60. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingClaude.js +0 -149
  61. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingGoogle.js +0 -37
  62. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStructuredDataUsingOpenAi.js +0 -144
  63. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/extractStrucutredDataUsingAiInstance.js +0 -123
  64. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/index.js +0 -55
  65. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/isItemTableHeaderOrFooter.js +0 -96
  66. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/screenshotHelpers.js +0 -55
  67. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/types.js +0 -5
  68. package/dist/ai-extractors/extractionHelpers/extractStructuredDataUsingAi/utils.js +0 -53
  69. package/dist/ai-extractors/extractionHelpers/types.js +0 -5
  70. package/dist/ai-extractors/fileExtractors.js +0 -176
  71. package/dist/ai-extractors/index.d.ts +0 -425
  72. package/dist/ai-extractors/index.js +0 -31
  73. package/dist/ai-extractors/openAiClients/index.js +0 -23
  74. package/dist/ai-extractors/validators.js +0 -239
  75. package/dist/helpers/waitForNetworkIdle.js +0 -192
  76. package/dist/playwright/export.d.js +0 -5
  77. package/dist/playwright/export.d.ts +0 -220
  78. package/dist/playwright/index.d.ts +0 -220
  79. package/dist/playwright/index.js +0 -18
  80. package/dist/playwright/staticExtractors/extractHelpers.js +0 -170
  81. package/dist/playwright/staticExtractors/getArrayUsingArrayExtractor.js +0 -84
  82. package/dist/playwright/staticExtractors/getObjectUsingObjectExtractor.js +0 -45
  83. package/dist/playwright/staticExtractors/index.js +0 -37
  84. package/dist/playwright/staticExtractors/types.js +0 -26
@@ -1,5 +1,5 @@
1
1
  import { Locator, Page } from "playwright-core";
2
- import { JSONSchema7TypeName } from "json-schema";
2
+
3
3
  /**
4
4
  * Base schema interface that all JSON schema types extend from.
5
5
  * Provides common properties like type and description.
@@ -159,22 +159,22 @@ export interface ObjectSchema extends BasicSchema {
159
159
  * including HTML parsing, image analysis, and Markdown conversion. It supports extraction
160
160
  * from entire pages or specific elements, with built-in caching and retry mechanisms.
161
161
  *
162
- * @param options - Configuration object containing extraction parameters
163
- * @param {Page | Locator} [options.source] - Playwright Page object to extract data from the entire page or Locator object to extract data from a specific element
162
+ * @param {Object} options - Configuration object containing extraction parameters
163
+ * @param {Page | Locator} options.source - Playwright Page object to extract data from the entire page or Locator object to extract data from a specific element
164
+ * @param {JsonSchema} options.dataSchema - [JsonSchema](../interfaces/JsonSchema) defining the structure of the data to extract
164
165
  * @param {SUPPORTED_MODELS} [options.model] - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
165
166
  * @param {string} [options.strategy] - Type of extraction: "HTML", "IMAGE", or "MARKDOWN"
166
- * @param {JsonSchema} options.dataSchema - [JsonSchema](../interfaces/JsonSchema) defining the structure of the data to extract
167
167
  * @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context
168
168
  * @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account)
169
169
  * @param {boolean} [options.enableDomMatching=false] - Whether to enable DOM element matching during extraction. Defaults to false. When set to true, all types in the schema must be strings to match with the DOM elements. The extracted results will be matched with the DOM elements and returned, then cached in a smart fashion so that the next time the same data is extracted, the result will be returned from the cache even if the DOM has minor changes.
170
170
  * @param {boolean} [options.enableCache=true] - Whether to enable caching of the extracted data. default true
171
- * @param {integer} [options.retries=3] - Maximum number of retry attempts on failure. default 3
171
+ * @param {integer} [options.retries=3] - Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc.
172
172
  *
173
173
  * @returns Promise resolving to the extracted structured data matching the provided schema
174
174
 
175
175
  * @example
176
176
  * ```typescript Extract Product Information from Entire Page
177
- * import { extractStructuredData } from './extractors';
177
+ * import { extractStructuredData } from '@intuned/browser/ai';
178
178
  *
179
179
  * const productSchema = {
180
180
  * type: "object",
@@ -188,11 +188,10 @@ export interface ObjectSchema extends BasicSchema {
188
188
  * };
189
189
  *
190
190
  * const product = await extractStructuredData({
191
- * page: page,
192
- * strategy: { type: "HTML", model: "gpt-4" },
191
+ * source: page,
192
+ * strategy: "HTML",
193
+ * model: "gpt-4o",
193
194
  * dataSchema: productSchema,
194
- * entityName: "product",
195
- * label: "product-extractor",
196
195
  * prompt: "Extract product details from this e-commerce page"
197
196
  * });
198
197
  *
@@ -201,7 +200,7 @@ export interface ObjectSchema extends BasicSchema {
201
200
  *
202
201
  * @example
203
202
  * ```typescript Extract Article Data from Specific Element
204
- * import { extractStructuredData } from './extractors';
203
+ * import { extractStructuredData } from '@intuned/browser/ai';
205
204
  *
206
205
  * const articleSchema = {
207
206
  * type: "object",
@@ -217,12 +216,11 @@ export interface ObjectSchema extends BasicSchema {
217
216
  *
218
217
  * const articleContainer = page.locator("article.main-content");
219
218
  * const article = await extractStructuredData({
220
- * locator: articleContainer,
221
- * strategy: { type: "MARKDOWN", model: "claude-3" },
219
+ * source: articleContainer,
220
+ * strategy: "MARKDOWN",
221
+ * model: "claude-3",
222
222
  * dataSchema: articleSchema,
223
- * entityName: "article",
224
- * label: "article-extractor",
225
- * retries: 5
223
+ * maxRetries: 5
226
224
  * });
227
225
  *
228
226
  * console.log(`Article: ${article.title} by ${article.author}`);
@@ -230,7 +228,7 @@ export interface ObjectSchema extends BasicSchema {
230
228
  *
231
229
  * @example
232
230
  * ```typescript Extract Data from Screenshots using Image Strategy
233
- * import { extractStructuredData } from './extractors';
231
+ * import { extractStructuredData } from '@intuned/browser/ai';
234
232
  *
235
233
  * const chartSchema = {
236
234
  * type: "object",
@@ -251,11 +249,10 @@ export interface ObjectSchema extends BasicSchema {
251
249
  *
252
250
  * const chartElement = page.locator("#data-visualization");
253
251
  * const chartData = await extractStructuredData({
254
- * locator: chartElement,
255
- * strategy: { type: "IMAGE", model: "gpt-4-vision" },
252
+ * source: chartElement,
253
+ * strategy: "IMAGE",
254
+ * model: "gpt-4o",
256
255
  * dataSchema: chartSchema,
257
- * entityName: "chart",
258
- * label: "chart-extractor",
259
256
  * prompt: "Extract the chart title and all data points with their values"
260
257
  * });
261
258
  *
@@ -417,229 +414,43 @@ type SUPPORTED_MODELS = SUPPORTED_CLAUDE_MODELS | SUPPORTED_OPENAI_MODELS;
417
414
  * ```
418
415
  */
419
416
 
420
- /**
421
- * @interface HTMLStrategy
422
- * Represents a strategy for extracting data from HTML content using AI models.
423
- *
424
- * This strategy processes the HTML structure of a page or element, focusing on semantic attributes
425
- * for better context understanding. It automatically filters and includes only relevant HTML attributes:
426
- * `aria-label`, `data-name`, `name`, `type`, `placeholder`, `value`, `role`, `title`, `href`, `id`, `alt`
427
- *
428
- * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
429
- * @param {string} type - Type of extraction: 'HTML'
430
- *
431
- * @example
432
- * ```typescript Basic HTML Extraction
433
- * const htmlStrategy: HTMLStrategy = {
434
- * type: "HTML",
435
- * model: "gpt-4"
436
- * };
437
- *
438
- * const data = await extractStructuredData({
439
- * page: page,
440
- * strategy: htmlStrategy,
441
- * // ... other options
442
- * });
443
- * ```
444
- *
445
- * @example
446
- * ```typescript Advanced HTML Extraction
447
- * const htmlStrategy: HTMLStrategy = {
448
- * type: "HTML",
449
- * model: "claude-3-sonnet-20240620"
450
- * };
451
- *
452
- * // Extract product details from a specific container
453
- * const productData = await extractStructuredData({
454
- * locator: page.locator('.product-container'),
455
- * strategy: htmlStrategy,
456
- * dataSchema: productSchema,
457
- * entityName: "product",
458
- * label: "product-extractor"
459
- * });
460
- * ```
461
- */
462
- export interface HTMLStrategy {
463
- /** The AI model to use for content analysis and data extraction */
464
- model: SUPPORTED_MODELS;
465
-
466
- /** Strategy type identifier, must be "HTML" for HTML-based extraction */
467
- type: "HTML";
468
- }
469
-
470
- /**
471
- * @interface ImageStrategy
472
- * Represents a strategy for extracting data from visual content using AI vision models.
473
- *
474
- * This strategy captures screenshots of the target page or element and uses AI vision
475
- * capabilities to extract information. It's particularly useful for:
476
- * - Data embedded in images or charts
477
- * - Content with complex visual layouts
478
- * - Information that's not directly accessible in the HTML
479
- *
480
- * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
481
- * @param {string} type - Type of extraction: 'IMAGE'
482
- * @example
483
- * ```typescript Basic Image Analysis
484
- * const imageStrategy: ImageStrategy = {
485
- * type: "IMAGE",
486
- * model: "gpt-4-vision"
487
- * };
488
- *
489
- * const chartData = await extractStructuredData({
490
- * locator: page.locator('.chart-container'),
491
- * strategy: imageStrategy,
492
- * dataSchema: chartSchema,
493
- * entityName: "chart",
494
- * label: "chart-data-extractor"
495
- * });
496
- * ```
497
- *
498
- * @example
499
- * ```typescript Complex Visual Extraction
500
- * const imageStrategy: ImageStrategy = {
501
- * type: "IMAGE",
502
- * model: "claude-3-sonnet-20240620"
503
- * };
504
- *
505
- * // Extract data from a complex dashboard
506
- * const dashboardData = await extractStructuredData({
507
- * page: page,
508
- * strategy: imageStrategy,
509
- * dataSchema: dashboardSchema,
510
- * entityName: "dashboard",
511
- * label: "dashboard-metrics",
512
- * prompt: "Extract all metrics and their values from this dashboard view"
513
- * });
514
- * ```
515
- */
516
- export interface ImageStrategy {
517
- /** The AI vision model to use for image analysis and data extraction */
518
- model: SUPPORTED_MODELS;
519
-
520
- /** Strategy type identifier, must be "IMAGE" for image-based extraction */
521
- type: "IMAGE";
522
- }
523
-
524
- /**
525
- * @interface MarkDownStrategy
526
- * Represents a strategy for extracting data from content after converting it to Markdown format.
527
- *
528
- * This strategy first converts the HTML content to semantic Markdown before processing,
529
- * which helps in:
530
- * - Preserving content hierarchy and structure
531
- * - Removing unnecessary styling and formatting
532
- * - Focusing on semantic meaning of the content
533
- * - Handling content-heavy pages more efficiently
534
- *
535
- * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
536
- * @param {string} type - Type of extraction: 'MARKDOWN'
537
- * @example
538
- * ```typescript Basic Article Extraction
539
- * const markdownStrategy: MarkDownStrategy = {
540
- * type: "MARKDOWN",
541
- * model: "gpt-4"
542
- * };
543
- *
544
- * const articleData = await extractStructuredData({
545
- * locator: page.locator('article'),
546
- * strategy: markdownStrategy,
547
- * dataSchema: articleSchema,
548
- * entityName: "article",
549
- * label: "article-content"
550
- * });
551
- * ```
552
- *
553
- * @example
554
- * ```typescript Documentation Extraction
555
- * const markdownStrategy: MarkDownStrategy = {
556
- * type: "MARKDOWN",
557
- * model: "claude-3-sonnet-20240620"
558
- * };
559
- *
560
- * // Extract structured data from documentation pages
561
- * const docData = await extractStructuredData({
562
- * page: page,
563
- * strategy: markdownStrategy,
564
- * dataSchema: documentationSchema,
565
- * entityName: "documentation",
566
- * label: "docs-extractor",
567
- * prompt: "Extract main concepts, code examples, and API references"
568
- * });
569
- * ```
570
- */
571
- export interface MarkDownStrategy {
572
- /** The AI model to use for processing the Markdown content */
573
- model: SUPPORTED_MODELS;
574
-
575
- /** Strategy type identifier, must be "MARKDOWN" for Markdown-based extraction */
576
- type: "MARKDOWN";
577
- }
578
-
579
- /**
580
- * @interface HtmlStrategy
581
- * Represents a strategy for extracting data from HTML content using AI models.
582
- * @param {SUPPORTED_MODELS} model - AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models.
583
- * @param {string} type - Type of extraction: 'HTML'
584
- * @example
585
- * ```typescript Basic HTML Extraction
586
- * const htmlStrategy: HtmlStrategy = {
587
- * type: "HTML",
588
- * model: "gpt-4"
589
- * };
590
- * ```
591
- * @example
592
- * ```typescript Advanced HTML Extraction
593
- * const htmlStrategy: HtmlStrategy = {
594
- * type: "HTML",
595
- * model: "claude-3-sonnet-20240620"
596
- * };
597
- * ```
598
- */
599
- export interface HtmlStrategy {
600
- type: "HTML";
601
- model: SUPPORTED_MODELS;
602
- }
603
-
604
417
  /**
605
418
  * Uses AI vision to determine if a webpage has finished loading by analyzing a screenshot.
606
419
  * Detects loading spinners, blank content, or incomplete page states.
607
- *
608
- * @param {Page} page - The Playwright page to check
609
- * @param {Object} [options] - Optional configuration object
610
- * @param {SUPPORTED_MODELS} [options.model="gpt-4o-2024-08-06"] - [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) to use for the AI Check. default gpt-4o-2024-08-06
611
- * @param {number} [options.timeoutInMs=10000] - Screenshot timeout in milliseconds
612
- * @param {string} [options.apiKey] - Optional API key for the AI service
613
- * @returns {Promise.<{status: LoadingStatus, reason: (string|null|undefined), cost: (number|undefined)}>}
614
- * - `status`: "True" if page is loaded, "False" if still loading, "Dont know" if uncertain
615
- * - `reason`: Optional reason for the status (e.g., detected loading spinner)
616
- * - `cost`: Optional cost of the AI analysis (if applicable)
420
+ * @param {Object} input - Input object containing the page to check
421
+ * @param {Page} input.page - The Playwright page to check
422
+ * @param {number} [input.timeoutInMs=10000] - Screenshot timeout in milliseconds
423
+ * @param {SUPPORTED_MODELS} [input.model="gpt-4o-2024-08-06"] - [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) to use for the AI Check. default gpt-4o-2024-08-06
424
+ * @param {string} [input.apiKey] - Optional API key for the AI service
425
+ * @returns {Promise.<boolean>} True if page is loaded, false if still loading
617
426
  * @example
618
427
  * ```typescript Check Page Loading
619
- * import { isPageLoaded } from "@intuned/sdk/helpers";
428
+ * import { isPageLoaded } from "@intuned/browser/ai";
620
429
  *
621
430
  * // Wait for page to finish loading
622
431
  * await page.goto('https://example.com');
623
432
  *
624
- * const pageLoaded = await isPageLoaded(page);
625
- * if (pageLoaded['status']) {
626
- * console.log("Page loaded:", pageLoaded['reason']);
433
+ * const pageLoaded = await isPageLoaded({page});
434
+ * if (pageLoaded) {
627
435
  * // Continue with scraping or interactions
628
436
  * } else {
629
- * console.log("Still loading:", pageLoaded['reason']);
630
437
  * // Wait longer or retry
631
438
  * }
632
439
  * ```
633
440
  *
634
441
  * @example
635
442
  * ```typescript Loading Loop
636
- * import { isPageLoaded } from "@intuned/sdk/helpers";
443
+ * import { isPageLoaded } from "@intuned/browser/ai";
637
444
  *
638
445
  * // Keep checking until page loads
639
446
  * let attempts = 0;
640
447
  * while (attempts < 10) {
641
- * const pageLoaded = await isPageLoaded(page, "gpt-4o", 5);
642
- * if (pageLoaded['status']) break;
448
+ * const pageLoaded = await isPageLoaded({
449
+ * page,
450
+ * model: "gpt-4o",
451
+ * timeoutInMs: 5000
452
+ * });
453
+ * if (pageLoaded) break;
643
454
  *
644
455
  * await page.waitForTimeout(2000);
645
456
  * attempts++;
@@ -647,23 +458,13 @@ export interface HtmlStrategy {
647
458
  * ```
648
459
  *
649
460
  */
650
- export declare function isPageLoaded(
651
- page: Page,
652
- options?: {
653
- timeoutInMs?: number;
654
- model?: SUPPORTED_MODELS;
655
- apiKey?: string;
656
- }
657
- ): Promise<{
658
- status: LoadingStatus;
659
- reason?: string | null;
660
- cost?: number;
661
- }>;
461
+ export declare function isPageLoaded(input: {
462
+ page: Page;
463
+ timeoutInMs?: number;
464
+ model?: SUPPORTED_MODELS;
465
+ apiKey?: string;
466
+ }): Promise<boolean>;
662
467
 
663
- /**
664
- * LoadingStatus is a union of true, false, and "Dont know".
665
- */
666
- export type LoadingStatus = true | false | "Dont know";
667
468
  export type JsonSchema =
668
469
  | StringSchema
669
470
  | NumberSchema