@isdk/web-fetcher 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.cn.md +19 -0
  2. package/README.engine.cn.md +34 -6
  3. package/README.engine.md +29 -1
  4. package/README.md +21 -1
  5. package/dist/index.d.mts +1515 -1490
  6. package/dist/index.d.ts +1515 -1490
  7. package/dist/index.js +1 -1
  8. package/dist/index.mjs +1 -1
  9. package/docs/README.md +21 -1
  10. package/docs/_media/README.cn.md +19 -0
  11. package/docs/_media/README.engine.md +29 -1
  12. package/docs/classes/CheerioFetchEngine.md +95 -95
  13. package/docs/classes/ClickAction.md +29 -29
  14. package/docs/classes/EngineUpgradeError.md +335 -0
  15. package/docs/classes/EvaluateAction.md +29 -29
  16. package/docs/classes/ExtractAction.md +29 -29
  17. package/docs/classes/FetchAction.md +29 -29
  18. package/docs/classes/FetchEngine.md +93 -93
  19. package/docs/classes/FetchSession.md +14 -14
  20. package/docs/classes/FillAction.md +29 -29
  21. package/docs/classes/GetContentAction.md +29 -29
  22. package/docs/classes/GotoAction.md +29 -29
  23. package/docs/classes/KeyboardPressAction.md +29 -29
  24. package/docs/classes/KeyboardTypeAction.md +29 -29
  25. package/docs/classes/MouseClickAction.md +29 -29
  26. package/docs/classes/MouseMoveAction.md +29 -29
  27. package/docs/classes/MouseWheelAction.md +29 -29
  28. package/docs/classes/PauseAction.md +29 -29
  29. package/docs/classes/PlaywrightFetchEngine.md +101 -101
  30. package/docs/classes/ScrollIntoViewAction.md +29 -29
  31. package/docs/classes/SubmitAction.md +29 -29
  32. package/docs/classes/TrimAction.md +29 -29
  33. package/docs/classes/WaitForAction.md +29 -29
  34. package/docs/classes/WebFetcher.md +5 -5
  35. package/docs/enumerations/FetchActionResultStatus.md +4 -4
  36. package/docs/functions/fetchWeb.md +2 -2
  37. package/docs/functions/getRandomDelay.md +1 -1
  38. package/docs/globals.md +3 -1
  39. package/docs/interfaces/BaseFetchActionProperties.md +13 -13
  40. package/docs/interfaces/BaseFetchCollectorActionProperties.md +17 -17
  41. package/docs/interfaces/BaseFetcherProperties.md +44 -28
  42. package/docs/interfaces/DispatchedEngineAction.md +4 -4
  43. package/docs/interfaces/EvaluateActionOptions.md +3 -3
  44. package/docs/interfaces/ExtractActionProperties.md +13 -13
  45. package/docs/interfaces/FetchActionMeta.md +73 -0
  46. package/docs/interfaces/FetchActionProperties.md +15 -19
  47. package/docs/interfaces/FetchActionResult.md +7 -7
  48. package/docs/interfaces/FetchContext.md +65 -41
  49. package/docs/interfaces/FetchEngineContext.md +57 -33
  50. package/docs/interfaces/FetchMetadata.md +5 -5
  51. package/docs/interfaces/FetchResponse.md +14 -14
  52. package/docs/interfaces/FetchReturnTypeRegistry.md +7 -7
  53. package/docs/interfaces/FetchSite.md +55 -31
  54. package/docs/interfaces/FetcherOptions.md +55 -31
  55. package/docs/interfaces/GotoActionOptions.md +8 -8
  56. package/docs/interfaces/KeyboardPressParams.md +3 -3
  57. package/docs/interfaces/KeyboardTypeParams.md +3 -3
  58. package/docs/interfaces/MouseClickParams.md +6 -6
  59. package/docs/interfaces/MouseMoveParams.md +5 -5
  60. package/docs/interfaces/MouseWheelParams.md +7 -7
  61. package/docs/interfaces/PendingEngineRequest.md +3 -3
  62. package/docs/interfaces/ScrollIntoViewParams.md +2 -2
  63. package/docs/interfaces/StorageOptions.md +5 -5
  64. package/docs/interfaces/SubmitActionOptions.md +2 -2
  65. package/docs/interfaces/TrimActionOptions.md +3 -3
  66. package/docs/interfaces/WaitForActionOptions.md +5 -5
  67. package/docs/type-aliases/BaseFetchActionOptions.md +1 -1
  68. package/docs/type-aliases/BaseFetchCollectorOptions.md +1 -1
  69. package/docs/type-aliases/BrowserEngine.md +1 -1
  70. package/docs/type-aliases/FetchActionCapabilities.md +1 -1
  71. package/docs/type-aliases/FetchActionCapabilityMode.md +1 -1
  72. package/docs/type-aliases/FetchActionInContext.md +38 -0
  73. package/docs/type-aliases/FetchActionOptions.md +1 -1
  74. package/docs/type-aliases/FetchEngineAction.md +1 -1
  75. package/docs/type-aliases/FetchEngineType.md +1 -1
  76. package/docs/type-aliases/FetchReturnType.md +1 -1
  77. package/docs/type-aliases/FetchReturnTypeFor.md +1 -1
  78. package/docs/type-aliases/OnFetchPauseCallback.md +1 -1
  79. package/docs/type-aliases/ResourceType.md +1 -1
  80. package/docs/type-aliases/TrimPreset.md +1 -1
  81. package/docs/variables/DefaultFetcherProperties.md +1 -1
  82. package/docs/variables/FetcherOptionKeys.md +1 -1
  83. package/docs/variables/TRIM_PRESETS.md +1 -1
  84. package/package.json +1 -1
  85. package/docs/interfaces/FetchActionInContext.md +0 -190
package/dist/index.d.mts CHANGED
@@ -1,4 +1,4 @@
1
- import { CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration, Cookie, Session, CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions, PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions, SessionPoolOptions } from 'crawlee';
1
+ import { Cookie, SessionPoolOptions, CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration, Session, CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions, PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions } from 'crawlee';
2
2
  export { Cookie } from 'crawlee';
3
3
  import { EventEmitter } from 'events-ex';
4
4
 
@@ -731,1810 +731,1835 @@ type _RequireAtLeastOne<
731
731
  Except<ObjectType, KeysType>;
732
732
 
733
733
  /**
734
- * Represents the engine-specific execution scope (e.g., a Cheerio node or a Playwright Locator).
735
- * It acts as the target for extraction and interaction actions.
736
- */
737
- type FetchElementScope = any;
738
- /**
739
- * Interface representing the minimal engine capabilities required for extraction.
734
+ * Represents the state of an action being executed within a context.
740
735
  *
741
736
  * @remarks
742
- * This interface abstracts the underlying DOM manipulation library (Cheerio or Playwright).
743
- * Implementing classes must ensure consistent behavior across different engines, especially
744
- * regarding scope handling (Element vs Array of Elements) and DOM traversal.
737
+ * Extends the basic action properties with runtime metadata like execution index,
738
+ * nesting depth, and any errors encountered during execution.
745
739
  */
746
- interface IExtractEngine {
747
- /**
748
- * Finds all elements matching the selector within the given scope.
749
- *
750
- * @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
751
- * @param selector - The CSS selector to match.
752
- * @returns A promise resolving to an array of found element scopes.
753
- *
754
- * @remarks
755
- * **Behavior Contract:**
756
- * 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
757
- * 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
758
- * 3. **Array Scope**: If `scope` is an array:
759
- * - It MUST process elements in the order they appear in the array (which should match document order).
760
- * - It MUST perform the check (Self + Descendants) for *each* element in the array.
761
- * - It MUST flatten the results into a single array.
762
- * - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
763
- * but generally, preserving document order is the priority.
764
- */
765
- _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
740
+ type FetchActionInContext = FetchActionOptions & {
766
741
  /**
767
- * Extracts a primitive value from the element based on the schema configuration.
768
- *
769
- * @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
770
- * @param scope - The specific element to extract data from.
771
- * @returns A promise resolving to the extracted value (string, number, boolean, or null).
772
- *
773
- * @remarks
774
- * **Behavior Contract:**
775
- * - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
776
- * - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
777
- * - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
778
- * - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
779
- * - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
742
+ * The 0-based index of the action in the execution sequence.
780
743
  */
781
- _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
744
+ index?: number;
782
745
  /**
783
- * Gets the parent element of the given scope.
784
- *
785
- * @param scope - The element to find the parent of.
786
- * @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
746
+ * Error encountered during action execution, if any.
787
747
  */
788
- _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
748
+ error?: Error;
789
749
  /**
790
- * Checks if two element scopes refer to the exact same DOM node.
791
- *
792
- * @param scope1 - The first element scope.
793
- * @param scope2 - The second element scope.
794
- * @returns A promise resolving to `true` if they are the same node, `false` otherwise.
795
- *
796
- * @remarks
797
- * This comparison MUST be identity-based, not just content-based.
750
+ * The nesting depth of the action. Top-level actions (executed directly by the session) have a depth of 0.
798
751
  */
799
- _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
752
+ depth?: number;
753
+ };
754
+ /**
755
+ * Base internal state used by fetch engines to maintain their runtime environment.
756
+ *
757
+ * @internal
758
+ */
759
+ interface BaseFetchContextInteralState {
800
760
  /**
801
- * Retrieves all subsequent sibling elements of the `scope` element, stopping *before* the first sibling that matches `untilSelector`.
802
- *
803
- * @param scope - The anchor element (starting point). The returned list starts *after* this element.
804
- * @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
805
- * If omitted or null, returns all following siblings.
806
- * @returns A promise resolving to an array of sibling element scopes.
807
- *
808
- * @remarks
809
- * **Behavior Contract:**
810
- * - **Starting Point**: The `scope` element itself IS NOT included in the result.
811
- * - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
812
- * - **Direction**: Only scans *following* siblings (next siblings).
813
- * - **Flattening**: The result is a flat list of siblings, not a nested structure.
761
+ * The active engine instance (e.g., CheerioFetchEngine or PlaywrightFetchEngine)
762
+ * associated with this context.
814
763
  */
815
- _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
764
+ engine?: any;
816
765
  /**
817
- * Finds the closest ancestor of the `scope` element (including the element itself) that is present in the `candidates` array.
818
- *
819
- * @param scope - The starting element from which to ascend the DOM tree.
820
- * @param candidates - An array of potential ancestor elements to check against.
821
- * @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
822
- *
823
- * @remarks
824
- * **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
825
- * It effectively answers: "Which of these container candidates does my current element belong to?"
826
- *
827
- * **Implementation Guidelines**:
828
- * - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
829
- * - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
766
+ * Additional implementation-specific internal state.
830
767
  */
831
- _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
768
+ [key: string]: any;
769
+ }
770
+ /**
771
+ * Extended internal state for the fetch context, including action lifecycle management.
772
+ *
773
+ * @internal
774
+ */
775
+ interface FetchContextInteralState extends BaseFetchContextInteralState {
832
776
  /**
833
- * Checks if the `container` element contains the `element` (descendant).
834
- *
835
- * @param container - The potential ancestor element.
836
- * @param element - The potential descendant element.
837
- * @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
838
- *
839
- * @remarks
840
- * **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
841
- *
842
- * @performance-critical Used extensively in boundary checks for Segmented extraction.
843
- * - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
844
- * - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
777
+ * Stack of actions currently being executed, used to manage nested action calls.
845
778
  */
846
- _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
779
+ actionStack?: FetchActionInContext[];
847
780
  /**
848
- * Finds the Lowest Common Ancestor (LCA) of two element scopes.
849
- *
850
- * @param scope1 - The first element.
851
- * @param scope2 - The second element.
852
- * @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
853
- *
854
- * @remarks
855
- * This is a fundamental tree operation used to find the point where two element paths diverge.
856
- * **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
781
+ * Global counter for actions executed within the session, used to assign auto-incrementing indices.
857
782
  */
858
- _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
783
+ actionIndex?: number;
784
+ }
785
+ /**
786
+ * Context provided to the Fetch Engine during navigation and request handling.
787
+ *
788
+ * @remarks
789
+ * This interface contains the minimum set of properties required by an engine
790
+ * to perform a fetch operation and build a response.
791
+ */
792
+ interface FetchEngineContext extends BaseFetcherProperties {
859
793
  /**
860
- * Finds the direct child of the `container` that contains the `element` (or is the `element` itself).
861
- *
862
- * @param element - The descendant element.
863
- * @param container - The ancestor container.
864
- * @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
865
- *
866
- * @remarks
867
- * This method traverses up from `element` until it finds the node whose parent is `container`.
868
- * **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
794
+ * Unique identifier for the session or request batch.
869
795
  */
870
- _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
796
+ id: string;
871
797
  /**
872
- * Logs debug information if debug mode is enabled.
873
- * @param category - The category of the log message.
874
- * @param args - Arguments to log.
798
+ * The target URL for the next navigation, if specified.
875
799
  */
876
- _logDebug(category: string, ...args: any[]): void;
877
- }
878
- /**
879
- * Base configuration for all extraction schemas.
880
- */
881
- interface BaseExtractSchema {
800
+ url?: string;
882
801
  /**
883
- * Whether this field is required. If true and the value is null,
884
- * the containing object or array item will be skipped (or throw error in strict mode).
802
+ * The final URL after all redirects have been followed.
885
803
  */
886
- required?: boolean;
804
+ finalUrl?: string;
887
805
  /**
888
- * Whether to enable strict mode for this extraction.
889
- * If true, missing required fields will throw an error instead of being skipped.
806
+ * The standardized response object from the most recent navigation.
890
807
  */
891
- strict?: boolean;
808
+ lastResponse?: FetchResponse;
892
809
  /**
893
- * Specifies the starting anchor for extraction of this field.
894
- * - Field Name: Uses the DOM element of a previously extracted field as the anchor.
895
- * - CSS Selector: Re-queries the selector within the current context to find the anchor.
896
- *
897
- * Once anchored, the search scope for this field becomes the siblings following the anchor.
810
+ * The result object from the most recent action execution.
898
811
  */
899
- anchor?: string;
812
+ lastResult?: FetchActionResult;
900
813
  /**
901
- * The maximum number of levels to bubble up from the anchor or matched element.
902
- * - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
903
- * - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
904
- * - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
814
+ * Engine-specific internal state.
905
815
  */
906
- depth?: number;
816
+ internal: BaseFetchContextInteralState;
907
817
  }
908
818
  /**
909
- * Extraction schema types.
910
- */
911
- type ExtractSchema = ExtractObjectSchema | ExtractArraySchema | ExtractValueSchema;
912
- /**
913
- * Configuration for extracting a single value.
819
+ * The full execution context for a Web Fetcher session or action batch.
820
+ *
821
+ * @remarks
822
+ * This object is the central state container for the fetch operation. It provides
823
+ * access to configuration, the event bus, shared outputs, and the execution engine.
824
+ * It is passed to every action during execution.
914
825
  */
915
- interface ExtractValueSchema extends BaseExtractSchema {
826
+ interface FetchContext extends FetchEngineContext {
916
827
  /**
917
- * The data type to cast the extracted value to.
918
- * @default 'string'
828
+ * Metadata about the action currently being executed.
919
829
  */
920
- type?: 'string' | 'number' | 'boolean' | 'html';
830
+ currentAction?: FetchActionInContext;
921
831
  /**
922
- * Extraction behavior mode.
923
- * - 'text': (Default) Uses textContent.
924
- * - 'innerText': Uses rendered text (respects CSS line breaks).
925
- * - 'html': Returns innerHTML.
926
- * - 'outerHTML': Returns HTML including the element's tag.
832
+ * A shared key-value store for storing data extracted from pages or
833
+ * metadata generated during action execution.
927
834
  */
928
- mode?: 'text' | 'innerText' | 'html' | 'outerHTML';
835
+ outputs: Record<string, any>;
929
836
  /**
930
- * CSS selector to locate the element within the current context.
837
+ * Executes a FetchAction within the current context.
838
+ *
839
+ * @param actionOptions - Configuration for the action to be executed.
840
+ * @returns A promise that resolves to the action's result.
931
841
  */
932
- selector?: string;
842
+ execute<R extends FetchReturnType = 'any'>(actionOptions: FetchActionOptions): Promise<FetchActionResult<R>>;
933
843
  /**
934
- * Attribute name to extract (e.g., 'href', 'src').
935
- * If omitted, the text content or HTML is extracted based on `type`.
844
+ * Convenience method to execute an action by its registered name or ID.
845
+ *
846
+ * @param name - The registered name or ID of the action.
847
+ * @param params - Parameters specific to the action type.
848
+ * @param options - Additional execution options (e.g., storeAs, failOnError).
849
+ * @returns A promise that resolves to a result.
936
850
  */
937
- attribute?: string;
851
+ action<R extends FetchReturnType = 'any'>(name: string, params?: any, options?: Partial<FetchActionOptions>): Promise<FetchActionResult<R>>;
938
852
  /**
939
- * Filter elements that contain a descendant matching this CSS selector.
853
+ * Internal state for engine and lifecycle management.
940
854
  */
941
- has?: string;
855
+ internal: FetchContextInteralState;
942
856
  /**
943
- * Exclude elements matching this CSS selector.
857
+ * The central event bus for publishing and subscribing to session and action events.
944
858
  */
945
- exclude?: string;
859
+ eventBus: EventEmitter;
946
860
  }
947
- /**
948
- * Names of the supported array extraction modes.
949
- */
950
- type ExtractArrayModeName = 'nested' | 'columnar' | 'segmented';
951
- /**
952
- * Base options for array extraction modes.
953
- */
954
- interface BaseModeOptions {
955
- type: ExtractArrayModeName;
956
- /**
957
- * Whether to enable strict mode for this specific array mode.
958
- * @default false
959
- */
960
- strict?: boolean;
961
- }
962
- /**
963
- * Options for columnar (column-alignment) extraction.
964
- */
965
- interface ColumnarOptions extends BaseModeOptions {
966
- type: 'columnar';
967
- /**
968
- * Whether to enable heuristic inference.
969
- * If true, tries to find a common parent to infer item wrappers when counts mismatch.
970
- * @default false
971
- */
972
- inference?: boolean;
861
+
862
+ type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
863
+ interface FetchReturnTypeRegistry {
864
+ response: FetchResponse;
865
+ context: FetchContext;
866
+ result: FetchActionResult<any> | undefined;
867
+ outputs: Record<string, any>;
868
+ any: any;
869
+ none: void;
973
870
  }
974
- /**
975
- * Options for segmented (anchor-based) extraction.
976
- */
977
- interface SegmentedOptions extends BaseModeOptions {
978
- type: 'segmented';
871
+ type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
872
+
873
+ declare enum FetchActionResultStatus {
979
874
  /**
980
- * The name of the field in `items` to use as a segment anchor, or a direct CSS selector.
981
- * Defaults to the first property key's selector defined in `items`.
875
+ * 动作执行失败但未抛出(通常因 failOnError=false);错误信息在 error 字段
982
876
  */
983
- anchor?: string;
877
+ Failed = 0,
984
878
  /**
985
- * Where to start searching for fields within each segment.
986
- * - 'anchor': (Default) All fields are searched within the entire segment.
987
- * - 'previous': Each field is searched starting from after the previous field's match.
879
+ * 动作按预期完成(即便产生 warnings)
988
880
  */
989
- relativeTo?: 'anchor' | 'previous';
881
+ Success = 1,
990
882
  /**
991
- * The maximum number of levels to bubble up from the anchor to find a segment container.
992
- * If omitted, it bubbles up as high as possible without conflicting with neighboring segments.
883
+ * 动作被判定为不执行/降级为 noop(比如引擎不支持且 degradeTo='noop')
884
+ * 能力不支持且 degradeTo='noop' 时:status='skipped',warnings 增加 { code:'capability-not-supported' }
993
885
  */
994
- depth?: number;
886
+ Skipped = 2
995
887
  }
996
- /**
997
- * Union type for array extraction modes and their options.
998
- */
999
- type ExtractArrayMode = ExtractArrayModeName | ColumnarOptions | SegmentedOptions;
1000
- /**
1001
- * Configuration for extracting an array of items.
1002
- */
1003
- interface ExtractArraySchema extends BaseExtractSchema {
1004
- type: 'array';
1005
- /**
1006
- * CSS selector for items (in 'nested' mode) or the container (in 'columnar'/'segmented' modes).
1007
- */
1008
- selector: string;
1009
- /**
1010
- * Filter items/containers that contain a descendant matching this CSS selector.
1011
- */
1012
- has?: string;
1013
- /**
1014
- * Exclude items/containers matching this CSS selector.
1015
- */
1016
- exclude?: string;
1017
- /**
1018
- * Schema applied recursively to each extracted item.
1019
- * If omitted, defaults to extracting text.
1020
- */
1021
- items?: ExtractSchema;
1022
- /**
1023
- * Shortcut for `items` to extract a specific attribute directly.
1024
- */
1025
- attribute?: string;
1026
- /**
1027
- * Array extraction mode.
1028
- * - 'nested': (Default) Items are elements matched by `selector`.
1029
- * - 'columnar': `selector` is a container, fields in `items` are parallel columns aligned by index.
1030
- * - 'segmented': `selector` is a container, items are segmented by an anchor field.
1031
- */
1032
- mode?: ExtractArrayMode;
888
+ type FetchActionCapabilityMode = 'native' | 'simulate' | 'noop';
889
+ interface FetchActionMeta {
890
+ id: string;
891
+ index?: number;
892
+ engineType?: FetchEngineType;
893
+ capability?: FetchActionCapabilityMode;
894
+ response?: FetchResponse;
895
+ timings?: {
896
+ start: number;
897
+ total: number;
898
+ };
899
+ retries?: number;
900
+ }
901
+ interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
902
+ status: FetchActionResultStatus;
903
+ returnType?: R;
904
+ result?: FetchReturnTypeFor<R>;
905
+ error?: Error;
906
+ meta?: FetchActionMeta;
907
+ }
908
+ interface BaseFetchActionProperties {
909
+ id?: string;
910
+ name?: string;
911
+ action?: string | any;
912
+ index?: number;
913
+ params?: any;
914
+ args?: any;
915
+ storeAs?: string;
916
+ failOnError?: boolean;
917
+ failOnTimeout?: boolean;
918
+ timeoutMs?: number;
919
+ maxRetries?: number;
920
+ [key: string]: any;
921
+ }
922
+ type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
923
+ interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
924
+ activateOn?: string | RegExp | Array<string | RegExp>;
925
+ deactivateOn?: string | RegExp | Array<string | RegExp>;
926
+ collectOn?: string | RegExp | Array<string | RegExp>;
927
+ background?: boolean;
928
+ }
929
+ type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
930
+ interface FetchActionProperties extends BaseFetchActionProperties {
931
+ collectors?: BaseFetchCollectorOptions[];
932
+ }
933
+ type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
934
+ declare class EngineUpgradeError extends Error {
935
+ res: FetchResponse;
936
+ code: string;
937
+ constructor(res: FetchResponse);
1033
938
  }
939
+ type FetchEngineType = 'http' | 'browser';
940
+ type BrowserEngine = 'playwright' | 'puppeteer';
941
+ type FetchEngineMode = FetchEngineType | 'auto' | string;
942
+ type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
1034
943
  /**
1035
- * Configuration for extracting an object with multiple properties.
944
+ * Storage configuration options for the fetch engine.
945
+ *
946
+ * @remarks
947
+ * Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
1036
948
  */
1037
- interface ExtractObjectSchema extends BaseExtractSchema {
1038
- type: 'object';
949
+ interface StorageOptions {
1039
950
  /**
1040
- * Root selector for the object. If provided, sub-properties are searched within this element.
951
+ * Custom identifier for the storage.
952
+ * If provided, multiple sessions can share the same storage by using the same ID.
953
+ * If not provided, a unique session ID is used (strong isolation).
1041
954
  */
1042
- selector?: string;
955
+ id?: string;
1043
956
  /**
1044
- * Filter the object element based on descendants.
957
+ * Whether to persist storage to disk.
958
+ * If true, uses Crawlee's disk persistence. If false, data might be stored in memory or temporary directory.
959
+ * Corresponds to Crawlee's `persistStorage` configuration.
1045
960
  */
1046
- has?: string;
961
+ persist?: boolean;
1047
962
  /**
1048
- * Exclude the object element if it matches this selector.
963
+ * Whether to delete the storage (RequestQueue and KeyValueStore) when the session is closed.
964
+ * Defaults to true. Set to false if you want to keep data for future reuse with the same `id`.
1049
965
  */
1050
- exclude?: string;
966
+ purge?: boolean;
1051
967
  /**
1052
- * Where to start searching for fields within this object.
1053
- * Only applicable when the object is being extracted from an array of elements (e.g. in 'segmented' mode).
1054
- * - 'anchor': (Default) All fields are searched within the entire scope.
1055
- * - 'previous': Each field is searched starting from after the previous field's match.
968
+ * Additional Crawlee configuration options.
969
+ * Allows fine-grained control over the underlying Crawlee instance.
1056
970
  */
1057
- relativeTo?: 'anchor' | 'previous';
971
+ config?: Record<string, any>;
972
+ }
973
+ interface BaseFetcherProperties {
1058
974
  /**
1059
- * Explicit order of property extraction.
1060
- * Useful when using `relativeTo: 'previous'`.
975
+ * 抓取模式
976
+ *
977
+ * - `http`: 使用 HTTP 进行抓取
978
+ * - `browser`: 使用浏览器进行抓取
979
+ * - `auto`: auto 会走“智能探测”选择 http 或 browser, 但是如果没有启用 smart,并且在站点注册表中没有,那么则等价为 http.
1061
980
  */
1062
- order?: string[];
981
+ engine?: FetchEngineMode;
982
+ enableSmart?: boolean;
983
+ syncStateOnUpgrade?: boolean;
984
+ upgradeThresholdMs?: number;
985
+ useSiteRegistry?: boolean;
986
+ antibot?: boolean;
987
+ debug?: boolean | string | string[];
988
+ headers?: Record<string, string>;
989
+ cookies?: Cookie[];
990
+ sessionState?: any;
991
+ sessionPoolOptions?: SessionPoolOptions;
992
+ overrideSessionState?: boolean;
993
+ throwHttpErrors?: boolean;
994
+ output?: {
995
+ cookies?: boolean;
996
+ sessionState?: boolean;
997
+ };
998
+ proxy?: string | string[];
999
+ blockResources?: ResourceType[];
1063
1000
  /**
1064
- * Definition of the object's properties and their corresponding extraction schemas.
1001
+ * Storage configuration for session isolation and persistence.
1065
1002
  */
1066
- properties: {
1067
- [key: string]: ExtractSchema;
1003
+ storage?: StorageOptions;
1004
+ ignoreSslErrors?: boolean;
1005
+ browser?: {
1006
+ /**
1007
+ * 浏览器引擎,默认为 playwright
1008
+ *
1009
+ * - `playwright`: 使用 Playwright 引擎
1010
+ * - `puppeteer`: 使用 Puppeteer 引擎
1011
+ */
1012
+ engine?: BrowserEngine;
1013
+ headless?: boolean;
1014
+ waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
1015
+ launchOptions?: Record<string, any>;
1016
+ };
1017
+ http?: {
1018
+ method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
1019
+ body?: any;
1068
1020
  };
1069
- }
1070
-
1071
- interface PromiseLock extends Promise<void> {
1072
- release: () => void;
1073
- }
1074
-
1075
- /**
1076
- * Options for the {@link FetchEngine.goto}, allowing configuration of HTTP method, payload, headers, and navigation behavior.
1077
- *
1078
- * @remarks
1079
- * Used when navigating to a URL to specify additional parameters beyond the basic URL.
1080
- *
1081
- * @example
1082
- * ```ts
1083
- * await engine.goto('https://example.com', {
1084
- * method: 'POST',
1085
- * payload: { username: 'user', password: 'pass' },
1086
- * headers: { 'Content-Type': 'application/json' },
1087
- * waitUntil: 'networkidle'
1088
- * });
1089
- * ```
1090
- */
1091
- interface GotoActionOptions {
1092
- method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
1093
- payload?: any;
1094
- headers?: Record<string, string>;
1095
- waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
1096
1021
  timeoutMs?: number;
1097
- simulate?: boolean;
1022
+ requestHandlerTimeoutSecs?: number;
1023
+ maxConcurrency?: number;
1024
+ maxRequestsPerMinute?: number;
1025
+ delayBetweenRequestsMs?: number;
1026
+ retries?: number;
1027
+ sites?: FetchSite[];
1028
+ url?: string;
1098
1029
  }
1099
- /**
1100
- * Options for the {@link FetchEngine.waitFor} action, specifying conditions to wait for before continuing.
1101
- *
1102
- * @remarks
1103
- * Controls timing behavior for interactions, allowing waiting for elements, time intervals, or network conditions.
1104
- */
1105
- interface WaitForActionOptions {
1106
- ms?: number;
1107
- selector?: string;
1108
- networkIdle?: boolean;
1109
- failOnTimeout?: boolean;
1030
+ interface FetchSite extends BaseFetcherProperties {
1031
+ domain: string;
1032
+ pathScope?: string[];
1033
+ meta?: {
1034
+ updatedAt?: number;
1035
+ ttlMs?: number;
1036
+ source?: 'manual' | 'smart';
1037
+ };
1038
+ }
1039
+ type OnFetchPauseCallback = (options: {
1040
+ message?: string;
1041
+ }) => Promise<void>;
1042
+ interface FetcherOptions extends BaseFetcherProperties {
1043
+ actions?: FetchActionOptions[];
1044
+ onPause?: OnFetchPauseCallback;
1045
+ }
1046
+ interface FetchMetadata {
1047
+ mode: FetchEngineType;
1048
+ engine?: BrowserEngine;
1049
+ timings?: {
1050
+ start: number;
1051
+ total: number;
1052
+ ttfb?: number;
1053
+ dns?: number;
1054
+ tcp?: number;
1055
+ firstByte?: number;
1056
+ download?: number;
1057
+ };
1058
+ proxy?: string;
1059
+ [key: string]: any;
1060
+ }
1061
+ interface FetchResponse {
1062
+ url: string;
1063
+ finalUrl: string;
1064
+ statusCode?: number;
1065
+ statusText?: string;
1066
+ headers: Record<string, string>;
1067
+ contentType?: string;
1068
+ body?: string | Buffer<ArrayBufferLike>;
1069
+ html?: string;
1070
+ text?: string;
1071
+ json?: any;
1072
+ cookies?: Cookie[];
1073
+ sessionState?: any;
1074
+ metadata?: FetchMetadata;
1110
1075
  }
1076
+ declare const DefaultFetcherProperties: BaseFetcherProperties;
1077
+ declare const FetcherOptionKeys: string[];
1078
+
1111
1079
  /**
1112
- * Options for the {@link FetchEngine.submit} action, configuring form submission behavior.
1080
+ * Represents a stateful web fetching session.
1113
1081
  *
1114
1082
  * @remarks
1115
- * Specifies encoding type for form submissions, particularly relevant for JSON-based APIs.
1116
- */
1117
- interface SubmitActionOptions {
1118
- enctype?: 'application/x-www-form-urlencoded' | 'application/json' | 'multipart/form-data';
1119
- }
1120
- /**
1121
- * Predefined cleanup groups for the {@link FetchEngine.trim} action.
1122
- */
1123
- type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
1124
- /**
1125
- * Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
1083
+ * A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
1084
+ * cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
1085
+ * session-level configurations and outputs.
1086
+ *
1087
+ * Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
1126
1088
  */
1127
- interface TrimActionOptions {
1128
- selectors?: string | string[];
1129
- presets?: TrimPreset | TrimPreset[];
1089
+ declare class FetchSession {
1090
+ protected options: FetcherOptions;
1091
+ /**
1092
+ * Unique identifier for the session.
1093
+ */
1094
+ readonly id: string;
1095
+ /**
1096
+ * The execution context for this session, containing configurations, event bus, and shared state.
1097
+ */
1098
+ readonly context: FetchContext;
1099
+ protected closed: boolean;
1100
+ /**
1101
+ * Creates a new FetchSession.
1102
+ *
1103
+ * @param options - Configuration options for the fetcher.
1104
+ */
1105
+ constructor(options?: FetcherOptions);
1106
+ protected _logDebug(category: string, ...args: any[]): void;
1107
+ /**
1108
+ * Executes a single action within the session.
1109
+ *
1110
+ * @param actionOptions - Configuration for the action to be executed.
1111
+ * @param context - Optional context override for this specific execution. Defaults to the session context.
1112
+ * @returns A promise that resolves to the result of the action.
1113
+ * @template R - The expected return type of the action.
1114
+ *
1115
+ * @example
1116
+ * ```ts
1117
+ * await session.execute({ name: 'goto', params: { url: 'https://example.com' } });
1118
+ * ```
1119
+ */
1120
+ execute<R extends FetchReturnType = 'response'>(actionOptions: FetchActionOptions, context?: FetchContext): Promise<FetchActionResult<R>>;
1121
+ /**
1122
+ * Executes a sequence of actions.
1123
+ *
1124
+ * @param actions - An array of action options to be executed in order.
1125
+ * @param options - Optional temporary configuration overrides (e.g., timeoutMs, headers) for this batch of actions.
1126
+ * These overrides do not affect the main session context.
1127
+ * @returns A promise that resolves to an object containing the result of the last action and all accumulated outputs.
1128
+ *
1129
+ * @example
1130
+ * ```ts
1131
+ * const { result, outputs } = await session.executeAll([
1132
+ * { name: 'goto', params: { url: 'https://example.com' } },
1133
+ * { name: 'extract', params: { schema: { title: 'h1' } }, storeAs: 'data' }
1134
+ * ], { timeoutMs: 30000 });
1135
+ * ```
1136
+ */
1137
+ executeAll(actions: FetchActionOptions[], options?: Partial<FetcherOptions> & {
1138
+ index?: number;
1139
+ }): Promise<{
1140
+ result: FetchResponse | undefined;
1141
+ outputs: Record<string, any>;
1142
+ }>;
1143
+ /**
1144
+ * Retrieves all outputs accumulated during the session.
1145
+ *
1146
+ * @returns A record of stored output data.
1147
+ */
1148
+ getOutputs(): Record<string, any>;
1149
+ /**
1150
+ * Gets the current state of the session, including cookies and engine-specific state.
1151
+ *
1152
+ * @returns A promise resolving to the session state, or undefined if no engine is initialized.
1153
+ */
1154
+ getState(): Promise<{
1155
+ cookies: Cookie[];
1156
+ sessionState?: any;
1157
+ } | undefined>;
1158
+ /**
1159
+ * Disposes of the session and its associated engine.
1160
+ *
1161
+ * @remarks
1162
+ * This method should be called when the session is no longer needed to free up resources
1163
+ * (e.g., closing browser instances, purging temporary storage).
1164
+ */
1165
+ dispose(): Promise<void>;
1166
+ private ensureEngine;
1167
+ protected createContext(options?: FetcherOptions): FetchContext;
1130
1168
  }
1131
- declare const TRIM_PRESETS: Record<string, string[]>;
1169
+
1132
1170
  /**
1133
- * Options for the {@link FetchEngine.evaluate} action, specifying the function to execute and its arguments.
1171
+ * High-level entry point for the Web Fetcher library.
1134
1172
  *
1135
1173
  * @remarks
1136
- * This action allows executing custom JavaScript logic within the page context.
1137
- *
1138
- * **Execution Environments:**
1139
- * - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
1140
- * - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
1141
- * including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
1142
- *
1143
- * **Navigation Handling:**
1144
- * If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
1145
- * automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
1146
- *
1147
- * @example
1148
- * ```json
1149
- * {
1150
- * "action": "evaluate",
1151
- * "params": {
1152
- * "fn": "([a, b]) => a + b",
1153
- * "args": [1, 2]
1154
- * }
1155
- * }
1156
- * ```
1174
+ * The `WebFetcher` provides a simplified API for fetching web content without manually managing sessions.
1175
+ * It can be used for one-off requests or as a factory for more complex `FetchSession` instances.
1157
1176
  *
1158
1177
  * @example
1159
- * ```json
1160
- * {
1161
- * "action": "evaluate",
1162
- * "params": {
1163
- * "fn": "({ x, y }) => x * y",
1164
- * "args": { "x": 6, "y": 7 }
1165
- * }
1166
- * }
1178
+ * ```ts
1179
+ * const fetcher = new WebFetcher();
1180
+ * const { result } = await fetcher.fetch('https://example.com');
1167
1181
  * ```
1168
1182
  */
1169
- interface EvaluateActionOptions {
1183
+ declare class WebFetcher {
1184
+ private defaults;
1170
1185
  /**
1171
- * The function or expression to execute.
1186
+ * Creates a new WebFetcher with default options.
1172
1187
  *
1173
- * @remarks
1174
- * Can be:
1175
- * 1. A function object (only available when using the API directly).
1176
- * 2. A string containing a function definition, e.g., `"async (args) => { ... }"`
1177
- * 3. A string containing a direct expression, e.g., `"document.title"`
1188
+ * @param defaults - Default configuration options applied to all sessions and requests.
1189
+ */
1190
+ constructor(defaults?: FetcherOptions);
1191
+ /**
1192
+ * Creates a new FetchSession.
1178
1193
  *
1179
- * **Note:** When using a function, it receives exactly ONE argument: the value provided in {@link args}.
1180
- * Use destructuring to handle multiple parameters.
1194
+ * @param options - Configuration options for the session, merged with defaults.
1195
+ * @returns A promise resolving to a new FetchSession instance.
1181
1196
  */
1182
- fn: string | ((...args: any[]) => any);
1197
+ createSession(options?: FetcherOptions): Promise<FetchSession>;
1183
1198
  /**
1184
- * Data to pass to the function.
1199
+ * Fetches content from a URL or executes a complex action script.
1185
1200
  *
1186
1201
  * @remarks
1187
- * This value is passed as the first and only argument to the function defined in {@link fn}.
1188
- * Recommended to use an array or object for multiple values.
1202
+ * This method automatically creates a session, executes the specified actions,
1203
+ * retrieves the content, and disposes of the session.
1204
+ *
1205
+ * @param url - The target URL or a complete FetcherOptions object.
1206
+ * @param options - Additional options when the first parameter is a URL string.
1207
+ * @returns A promise resolving to the final response and any extracted outputs.
1189
1208
  */
1190
- args?: any;
1209
+ fetch(url: string, options?: FetcherOptions): Promise<{
1210
+ result: FetchResponse | undefined;
1211
+ outputs: Record<string, any>;
1212
+ }>;
1213
+ fetch(options: FetcherOptions): Promise<{
1214
+ result: FetchResponse | undefined;
1215
+ outputs: Record<string, any>;
1216
+ }>;
1191
1217
  }
1218
+
1192
1219
  /**
1193
- * Union type representing all possible engine actions that can be dispatched.
1220
+ * Represents the engine-specific execution scope (e.g., a Cheerio node or a Playwright Locator).
1221
+ * It acts as the target for extraction and interaction actions.
1222
+ */
1223
+ type FetchElementScope = any;
1224
+ /**
1225
+ * Interface representing the minimal engine capabilities required for extraction.
1194
1226
  *
1195
1227
  * @remarks
1196
- * Defines the command structure processed during page interactions. Each action type corresponds to
1197
- * a specific user interaction or navigation command within the action loop architecture.
1228
+ * This interface abstracts the underlying DOM manipulation library (Cheerio or Playwright).
1229
+ * Implementing classes must ensure consistent behavior across different engines, especially
1230
+ * regarding scope handling (Element vs Array of Elements) and DOM traversal.
1198
1231
  */
1199
- type FetchEngineAction = {
1200
- type: 'click';
1201
- selector: string;
1202
- } | {
1203
- type: 'fill';
1204
- selector: string;
1205
- value: string;
1206
- } | {
1207
- type: 'mouseMove';
1208
- params: {
1209
- x?: number;
1210
- y?: number;
1211
- selector?: string;
1212
- steps?: number;
1213
- };
1214
- } | {
1215
- type: 'mouseClick';
1216
- params: {
1217
- x?: number;
1218
- y?: number;
1219
- button?: 'left' | 'right' | 'middle';
1220
- clickCount?: number;
1221
- delay?: number;
1222
- steps?: number;
1223
- };
1224
- } | {
1225
- type: 'mouseWheel';
1226
- params: {
1227
- x?: number;
1228
- y?: number;
1229
- selector?: string;
1230
- deltaX?: number;
1231
- deltaY?: number;
1232
- steps?: number;
1233
- };
1234
- } | {
1235
- type: 'keyboardType';
1236
- params: {
1237
- text: string;
1238
- delay?: number;
1239
- };
1240
- } | {
1241
- type: 'keyboardPress';
1242
- params: {
1243
- key: string;
1244
- delay?: number;
1245
- };
1246
- } | {
1247
- type: 'scrollIntoView';
1248
- params: {
1249
- selector: string;
1250
- };
1251
- } | {
1252
- type: 'waitFor';
1253
- options?: WaitForActionOptions;
1254
- } | {
1255
- type: 'submit';
1256
- selector?: any;
1257
- options?: SubmitActionOptions;
1258
- } | {
1259
- type: 'getContent';
1260
- } | {
1261
- type: 'navigate';
1262
- url: string;
1263
- opts?: GotoActionOptions;
1264
- } | {
1265
- type: 'extract';
1266
- schema: ExtractSchema;
1267
- } | {
1268
- type: 'pause';
1269
- message?: string;
1270
- } | {
1271
- type: 'trim';
1272
- options: TrimActionOptions;
1273
- } | {
1274
- type: 'evaluate';
1275
- params: EvaluateActionOptions;
1276
- } | {
1277
- type: 'dispose';
1278
- };
1279
- /**
1280
- * Represents an action that has been dispatched and is awaiting execution in the active page context.
1281
- *
1282
- * @remarks
1283
- * Connects the action request with its resolution mechanism. Used internally by the action dispatch system
1284
- * to handle promises while maintaining the page context validity window.
1285
- */
1286
- interface DispatchedEngineAction {
1287
- action: FetchEngineAction;
1288
- resolve: (value?: any) => void;
1289
- reject: (reason?: any) => void;
1290
- }
1291
- /**
1292
- * Represents a pending navigation request awaiting resolution.
1293
- *
1294
- * @remarks
1295
- * Tracks navigation requests that have been queued but not yet processed by the request handler.
1296
- */
1297
- interface PendingEngineRequest {
1298
- resolve: (value: any) => void;
1299
- reject: (reason?: any) => void;
1300
- }
1301
- /**
1302
- * Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
1303
- *
1304
- * @remarks
1305
- * The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
1306
- * `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
1307
- * content retrieval, and user interaction.
1308
- *
1309
- * The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
1310
- * the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
1311
- * page context validity across asynchronous operations.
1312
- *
1313
- * @example
1314
- * ```ts
1315
- * import "./playwright"; // 引入注册 Playwright browser 引擎
1316
- * const engine = await FetchEngine.create(context, { engine: 'browser' });
1317
- * await engine.goto('https://example.com');
1318
- * await engine.fill('#username', 'user');
1319
- * await engine.click('#submit');
1320
- * const response = await engine.getContent();
1321
- * ```
1322
- */
1323
- type AnyFetchEngine = FetchEngine<any, any, any>;
1324
- type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
1325
- declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
1326
- private static registry;
1232
+ interface IExtractEngine {
1327
1233
  /**
1328
- * Registers a fetch engine implementation with the global registry.
1234
+ * Finds all elements matching the selector within the given scope.
1329
1235
  *
1330
- * @param engineClass - The engine class to register
1331
- * @throws {Error} When engine class lacks static `id` or ID is already registered
1236
+ * @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
1237
+ * @param selector - The CSS selector to match.
1238
+ * @returns A promise resolving to an array of found element scopes.
1332
1239
  *
1333
- * @example
1334
- * ```ts
1335
- * FetchEngine.register(CheerioFetchEngine);
1336
- * ```
1240
+ * @remarks
1241
+ * **Behavior Contract:**
1242
+ * 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
1243
+ * 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
1244
+ * 3. **Array Scope**: If `scope` is an array:
1245
+ * - It MUST process elements in the order they appear in the array (which should match document order).
1246
+ * - It MUST perform the check (Self + Descendants) for *each* element in the array.
1247
+ * - It MUST flatten the results into a single array.
1248
+ * - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
1249
+ * but generally, preserving document order is the priority.
1337
1250
  */
1338
- static register(engineClass: AnyFetchEngineCtor): void;
1251
+ _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
1339
1252
  /**
1340
- * Retrieves a fetch engine implementation by its unique ID.
1253
+ * Extracts a primitive value from the element based on the schema configuration.
1341
1254
  *
1342
- * @param id - The ID of the engine to retrieve
1343
- * @returns Engine class if found, otherwise `undefined`
1255
+ * @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
1256
+ * @param scope - The specific element to extract data from.
1257
+ * @returns A promise resolving to the extracted value (string, number, boolean, or null).
1258
+ *
1259
+ * @remarks
1260
+ * **Behavior Contract:**
1261
+ * - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
1262
+ * - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
1263
+ * - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
1264
+ * - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
1265
+ * - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
1344
1266
  */
1345
- static get(id: string): AnyFetchEngineCtor | undefined;
1267
+ _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
1346
1268
  /**
1347
- * Retrieves a fetch engine implementation by execution mode.
1269
+ * Gets the parent element of the given scope.
1348
1270
  *
1349
- * @param mode - Execution mode (`'http'` or `'browser'`)
1350
- * @returns Engine class if found, otherwise `undefined`
1271
+ * @param scope - The element to find the parent of.
1272
+ * @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
1351
1273
  */
1352
- static getByMode(mode: FetchEngineType): AnyFetchEngineCtor | undefined;
1274
+ _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
1353
1275
  /**
1354
- * Factory method to create and initialize a fetch engine instance.
1276
+ * Checks if two element scopes refer to the exact same DOM node.
1355
1277
  *
1356
- * @param ctx - Fetch engine context
1357
- * @param options - Configuration options
1358
- * @returns Initialized fetch engine instance
1359
- * @throws {Error} When no suitable engine implementation is found
1278
+ * @param scope1 - The first element scope.
1279
+ * @param scope2 - The second element scope.
1280
+ * @returns A promise resolving to `true` if they are the same node, `false` otherwise.
1360
1281
  *
1361
1282
  * @remarks
1362
- * Primary entry point for engine creation. Selects appropriate implementation based on `engine` name of the option or context.
1283
+ * This comparison MUST be identity-based, not just content-based.
1363
1284
  */
1364
- static create(ctx: FetchEngineContext, options?: BaseFetcherProperties): Promise<AnyFetchEngine | undefined>;
1285
+ _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
1365
1286
  /**
1366
- * Unique identifier for the engine implementation.
1287
+ * Retrieves all subsequent sibling elements of the `scope` element, stopping *before* the first sibling that matches `untilSelector`.
1288
+ *
1289
+ * @param scope - The anchor element (starting point). The returned list starts *after* this element.
1290
+ * @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
1291
+ * If omitted or null, returns all following siblings.
1292
+ * @returns A promise resolving to an array of sibling element scopes.
1367
1293
  *
1368
1294
  * @remarks
1369
- * Must be defined by concrete implementations. Used for registration and lookup in engine registry.
1295
+ * **Behavior Contract:**
1296
+ * - **Starting Point**: The `scope` element itself IS NOT included in the result.
1297
+ * - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
1298
+ * - **Direction**: Only scans *following* siblings (next siblings).
1299
+ * - **Flattening**: The result is a flat list of siblings, not a nested structure.
1370
1300
  */
1371
- static readonly id: string;
1301
+ _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
1372
1302
  /**
1373
- * Execution mode of the engine (`'http'` or `'browser'`).
1303
+ * Finds the closest ancestor of the `scope` element (including the element itself) that is present in the `candidates` array.
1304
+ *
1305
+ * @param scope - The starting element from which to ascend the DOM tree.
1306
+ * @param candidates - An array of potential ancestor elements to check against.
1307
+ * @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
1374
1308
  *
1375
1309
  * @remarks
1376
- * Must be defined by concrete implementations. Indicates whether engine operates at HTTP level or uses full browser.
1377
- */
1378
- static readonly mode: FetchEngineType;
1379
- protected ctx?: FetchEngineContext;
1380
- protected opts?: BaseFetcherProperties;
1381
- protected crawler?: TCrawler;
1382
- protected isCrawlerReady?: boolean;
1383
- protected crawlerRunPromise?: Promise<FinalStatistics>;
1384
- protected config?: Configuration;
1385
- protected requestQueue?: RequestQueue;
1386
- protected kvStore?: KeyValueStore;
1387
- protected proxyConfiguration?: ProxyConfiguration;
1388
- protected hdrs: Record<string, string>;
1389
- protected _initialCookies?: Cookie[];
1390
- protected _initializedSessions: Set<string>;
1391
- protected currentSession?: Session;
1392
- protected pendingRequests: Map<string, PendingEngineRequest>;
1393
- protected requestCounter: number;
1394
- protected actionEmitter: EventEmitter;
1395
- protected isPageActive: boolean;
1396
- protected isEngineDisposed: boolean;
1397
- protected navigationLock: PromiseLock;
1398
- protected activeContext?: TContext;
1399
- protected isExecutingAction: boolean;
1400
- protected lastResponse?: FetchResponse;
1401
- protected actionQueue: DispatchedEngineAction[];
1402
- protected isProcessingActionLoop: boolean;
1403
- protected blockedTypes: Set<string>;
1404
- _logDebug(category: string, ...args: any[]): void;
1405
- protected _cleanup?(): Promise<void>;
1406
- protected _getTrimInfo(options: TrimActionOptions): {
1407
- selectors: string[];
1408
- removeComments: boolean;
1409
- removeHidden: boolean;
1410
- };
1411
- /**
1412
- * Finds all elements matching the selector within the given scope.
1310
+ * **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
1311
+ * It effectively answers: "Which of these container candidates does my current element belong to?"
1413
1312
  *
1414
- * @param scope - The scope to search in (Engine-specific element/node or array of nodes).
1415
- * @param selector - CSS selector.
1416
- * @returns List of matching elements.
1417
- * @see {@link IExtractEngine._querySelectorAll} for behavior contract.
1418
- * @internal
1313
+ * **Implementation Guidelines**:
1314
+ * - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
1315
+ * - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
1419
1316
  */
1420
- abstract _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
1317
+ _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
1421
1318
  /**
1422
- * Extracts a primitive value from the element based on schema.
1319
+ * Checks if the `container` element contains the `element` (descendant).
1423
1320
  *
1424
- * @param schema - Value extraction schema.
1425
- * @param scope - The element scope.
1426
- * @returns Extracted value.
1427
- * @see {@link IExtractEngine._extractValue} for behavior contract.
1428
- * @internal
1429
- */
1430
- abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
1431
- /**
1432
- * Gets the parent element of the given element.
1321
+ * @param container - The potential ancestor element.
1322
+ * @param element - The potential descendant element.
1323
+ * @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
1433
1324
  *
1434
- * @param scope - The element scope.
1435
- * @returns Parent element or null.
1436
- * @internal
1325
+ * @remarks
1326
+ * **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
1327
+ *
1328
+ * @performance-critical Used extensively in boundary checks for Segmented extraction.
1329
+ * - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
1330
+ * - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
1437
1331
  */
1438
- abstract _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
1332
+ _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
1439
1333
  /**
1440
- * Checks if two elements are the same identity.
1334
+ * Finds the Lowest Common Ancestor (LCA) of two element scopes.
1441
1335
  *
1442
- * @param scope1 - First element scope.
1443
- * @param scope2 - Second element scope.
1444
- * @returns True if they are the same DOM node.
1445
- * @internal
1336
+ * @param scope1 - The first element.
1337
+ * @param scope2 - The second element.
1338
+ * @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
1339
+ *
1340
+ * @remarks
1341
+ * This is a fundamental tree operation used to find the point where two element paths diverge.
1342
+ * **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
1446
1343
  */
1447
- abstract _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
1344
+ _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
1448
1345
  /**
1449
- * Gets all subsequent siblings of an element until a sibling matches the selector.
1450
- * Used in 'segmented' extraction mode.
1346
+ * Finds the direct child of the `container` that contains the `element` (or is the `element` itself).
1451
1347
  *
1452
- * @param scope - The anchor element scope.
1453
- * @param untilSelector - Optional selector that marks the end of the segment (exclusive).
1454
- * @returns List of sibling elements between anchor and untilSelector.
1455
- * @internal
1348
+ * @param element - The descendant element.
1349
+ * @param container - The ancestor container.
1350
+ * @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
1351
+ *
1352
+ * @remarks
1353
+ * This method traverses up from `element` until it finds the node whose parent is `container`.
1354
+ * **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
1456
1355
  */
1457
- abstract _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
1356
+ _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
1458
1357
  /**
1459
- * Finds the closest ancestor of `scope` (including itself) that exists in the `candidates` array.
1460
- *
1461
- * @param scope - The starting element.
1462
- * @param candidates - The array of potential ancestor scopes.
1463
- * @returns A promise resolving to the matching candidate scope, or `null` if none found.
1464
- * @see {@link IExtractEngine._findClosestAncestor} for implementation details.
1465
- * @internal
1358
+ * Logs debug information if debug mode is enabled.
1359
+ * @param category - The category of the log message.
1360
+ * @param args - Arguments to log.
1466
1361
  */
1467
- abstract _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
1362
+ _logDebug(category: string, ...args: any[]): void;
1363
+ }
1364
+ /**
1365
+ * Base configuration for all extraction schemas.
1366
+ */
1367
+ interface BaseExtractSchema {
1468
1368
  /**
1469
- * Checks if the `container` scope contains the `element` scope.
1470
- *
1471
- * @param container - The potential ancestor element.
1472
- * @param element - The potential descendant element.
1473
- * @returns A promise resolving to `true` if `container` contains `element`.
1474
- * @see {@link IExtractEngine._contains} for implementation details.
1475
- * @internal
1369
+ * Whether this field is required. If true and the value is null,
1370
+ * the containing object or array item will be skipped (or throw error in strict mode).
1476
1371
  */
1477
- abstract _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
1372
+ required?: boolean;
1478
1373
  /**
1479
- * Finds the Lowest Common Ancestor (LCA) of two element scopes.
1480
- *
1481
- * @param scope1 - The first element scope.
1482
- * @param scope2 - The second element scope.
1483
- * @returns A promise resolving to the LCA element scope, or `null` if none found.
1484
- * @internal
1374
+ * Whether to enable strict mode for this extraction.
1375
+ * If true, missing required fields will throw an error instead of being skipped.
1485
1376
  */
1486
- abstract _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
1377
+ strict?: boolean;
1487
1378
  /**
1488
- * Finds the direct child of container that contains element.
1379
+ * Specifies the starting anchor for extraction of this field.
1380
+ * - Field Name: Uses the DOM element of a previously extracted field as the anchor.
1381
+ * - CSS Selector: Re-queries the selector within the current context to find the anchor.
1489
1382
  *
1490
- * @param element - The descendant element.
1491
- * @param container - The container element.
1492
- * @returns The child element of container, or null.
1493
- * @internal
1383
+ * Once anchored, the search scope for this field becomes the siblings following the anchor.
1494
1384
  */
1495
- abstract _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
1496
- protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
1385
+ anchor?: string;
1497
1386
  /**
1498
- * Normalizes the array extraction mode into an options object.
1499
- * @param mode - The mode string or options object.
1500
- * @internal
1387
+ * The maximum number of levels to bubble up from the anchor or matched element.
1388
+ * - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
1389
+ * - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
1390
+ * - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
1501
1391
  */
1502
- protected _normalizeArrayMode(mode?: ExtractArrayMode): {
1503
- type: ExtractArrayModeName;
1504
- } & any;
1392
+ depth?: number;
1393
+ }
1394
+ /**
1395
+ * Extraction schema types.
1396
+ */
1397
+ type ExtractSchema = ExtractObjectSchema | ExtractArraySchema | ExtractValueSchema;
1398
+ /**
1399
+ * Configuration for extracting a single value.
1400
+ */
1401
+ interface ExtractValueSchema extends BaseExtractSchema {
1505
1402
  /**
1506
- * Performs standard nested array extraction.
1507
- * @param items - The schema for each item.
1508
- * @param elements - The list of item elements.
1509
- * @internal
1403
+ * The data type to cast the extracted value to.
1404
+ * @default 'string'
1510
1405
  */
1511
- protected _extractNested(items: ExtractSchema, elements: FetchElementScope[], opts?: {
1512
- strict?: boolean;
1513
- }): Promise<any[]>;
1406
+ type?: 'string' | 'number' | 'boolean' | 'html';
1514
1407
  /**
1515
- * Performs columnar extraction (Column Alignment Mode).
1516
- *
1517
- * @param schema - The schema for a single item (must be an object or implicit object).
1518
- * @param container - The container element to search within.
1519
- * @param opts - Columnar extraction options (strict, inference).
1520
- * @returns An array of extracted items, or null if requirements aren't met.
1521
- * @internal
1408
+ * Extraction behavior mode.
1409
+ * - 'text': (Default) Uses textContent.
1410
+ * - 'innerText': Uses rendered text (respects CSS line breaks).
1411
+ * - 'html': Returns innerHTML.
1412
+ * - 'outerHTML': Returns HTML including the element's tag.
1522
1413
  */
1523
- protected _extractColumnar(schema: ExtractSchema, container: FetchElementScope, opts?: ColumnarOptions): Promise<any[] | null>;
1414
+ mode?: 'text' | 'innerText' | 'html' | 'outerHTML';
1524
1415
  /**
1525
- * Performs segmented extraction (Anchor-based Scanning).
1526
- *
1527
- * @param schema - The schema for a single item (must be an object).
1528
- * @param container - The container element to scan.
1529
- * @param opts - Segmented extraction options (anchor).
1530
- * @returns An array of extracted items.
1531
- * @internal
1416
+ * CSS selector to locate the element within the current context.
1532
1417
  */
1533
- protected _extractSegmented(schema: ExtractSchema, container: FetchElementScope, opts?: SegmentedOptions): Promise<any[] | null>;
1418
+ selector?: string;
1534
1419
  /**
1535
- * Creates the crawler instance for the specific engine implementation.
1536
- * @param options - The final crawler options.
1537
- * @internal
1420
+ * Attribute name to extract (e.g., 'href', 'src').
1421
+ * If omitted, the text content or HTML is extracted based on `type`.
1538
1422
  */
1539
- protected abstract _createCrawler(options: TOptions, config?: Configuration): TCrawler;
1423
+ attribute?: string;
1540
1424
  /**
1541
- * Gets the crawler-specific options from the subclass.
1542
- * @param ctx - The fetch engine context.
1543
- * @internal
1425
+ * Filter elements that contain a descendant matching this CSS selector.
1544
1426
  */
1545
- protected abstract _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<TOptions>> | Partial<TOptions>;
1427
+ has?: string;
1546
1428
  /**
1547
- * Abstract method for building standard [FetchResponse] from Crawlee context.
1548
- *
1549
- * @param context - Crawlee crawling context
1550
- * @returns Promise resolving to [FetchResponse] object
1551
- *
1552
- * @remarks
1553
- * Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
1554
- * @internal
1429
+ * Exclude elements matching this CSS selector.
1555
1430
  */
1556
- protected abstract _buildResponse(context: TContext): Promise<FetchResponse>;
1557
- protected buildResponse(context: TContext): Promise<FetchResponse>;
1431
+ exclude?: string;
1432
+ }
1433
+ /**
1434
+ * Names of the supported array extraction modes.
1435
+ */
1436
+ type ExtractArrayModeName = 'nested' | 'columnar' | 'segmented';
1437
+ /**
1438
+ * Base options for array extraction modes.
1439
+ */
1440
+ interface BaseModeOptions {
1441
+ type: ExtractArrayModeName;
1558
1442
  /**
1559
- * Abstract method for executing action within current page context.
1560
- *
1561
- * @param context - Crawlee crawling context
1562
- * @param action - Action to execute
1563
- * @returns Promise resolving to action result
1564
- *
1565
- * @remarks
1566
- * Handles specific user interactions using underlying technology (Playwright/Cheerio).
1567
- * @internal
1443
+ * Whether to enable strict mode for this specific array mode.
1444
+ * @default false
1568
1445
  */
1569
- protected abstract executeAction(context: TContext, action: FetchEngineAction): Promise<any>;
1446
+ strict?: boolean;
1447
+ }
1448
+ /**
1449
+ * Options for columnar (column-alignment) extraction.
1450
+ */
1451
+ interface ColumnarOptions extends BaseModeOptions {
1452
+ type: 'columnar';
1570
1453
  /**
1571
- * Navigates to the specified URL.
1572
- *
1573
- * @param url - Target URL
1574
- * @param params - Navigation options
1575
- * @returns Promise resolving when navigation completes
1576
- *
1577
- * @example
1578
- * ```ts
1579
- * await engine.goto('https://example.com');
1580
- * ```
1454
+ * Whether to enable heuristic inference.
1455
+ * If true, tries to find a common parent to infer item wrappers when counts mismatch.
1456
+ * @default false
1581
1457
  */
1582
- abstract goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
1458
+ inference?: boolean;
1459
+ }
1460
+ /**
1461
+ * Options for segmented (anchor-based) extraction.
1462
+ */
1463
+ interface SegmentedOptions extends BaseModeOptions {
1464
+ type: 'segmented';
1583
1465
  /**
1584
- * Waits for specified condition before continuing.
1585
- *
1586
- * @param params - Wait conditions
1587
- * @returns Promise resolving when wait condition is met
1588
- *
1589
- * @example
1590
- * ```ts
1591
- * await engine.waitFor({ ms: 1000 }); // Wait 1 second
1592
- * await engine.waitFor({ selector: '#content' }); // Wait for element
1593
- * ```
1466
+ * The name of the field in `items` to use as a segment anchor, or a direct CSS selector.
1467
+ * Defaults to the first property key's selector defined in `items`.
1594
1468
  */
1595
- waitFor(params?: WaitForActionOptions): Promise<void>;
1469
+ anchor?: string;
1596
1470
  /**
1597
- * Clicks on element matching selector.
1598
- *
1599
- * @param selector - CSS selector of element to click
1600
- * @returns Promise resolving when click is processed
1601
- * @throws {Error} When no active page context exists
1471
+ * Where to start searching for fields within each segment.
1472
+ * - 'anchor': (Default) All fields are searched within the entire segment.
1473
+ * - 'previous': Each field is searched starting from after the previous field's match.
1602
1474
  */
1603
- click(selector: string): Promise<void>;
1475
+ relativeTo?: 'anchor' | 'previous';
1604
1476
  /**
1605
- * Moves mouse to specified position or element.
1606
- *
1607
- * @param params - Move parameters (x, y, selector, steps)
1608
- */
1609
- mouseMove(params: {
1610
- x?: number;
1611
- y?: number;
1612
- selector?: string;
1613
- steps?: number;
1614
- }): Promise<void>;
1615
- /**
1616
- * Clicks at current position or specified position.
1617
- *
1618
- * @param params - Click parameters (x, y, button, clickCount, delay)
1477
+ * The maximum number of levels to bubble up from the anchor to find a segment container.
1478
+ * If omitted, it bubbles up as high as possible without conflicting with neighboring segments.
1619
1479
  */
1620
- mouseClick(params: {
1621
- x?: number;
1622
- y?: number;
1623
- button?: 'left' | 'right' | 'middle';
1624
- clickCount?: number;
1625
- delay?: number;
1626
- }): Promise<void>;
1480
+ depth?: number;
1481
+ }
1482
+ /**
1483
+ * Union type for array extraction modes and their options.
1484
+ */
1485
+ type ExtractArrayMode = ExtractArrayModeName | ColumnarOptions | SegmentedOptions;
1486
+ /**
1487
+ * Configuration for extracting an array of items.
1488
+ */
1489
+ interface ExtractArraySchema extends BaseExtractSchema {
1490
+ type: 'array';
1627
1491
  /**
1628
- * Scrolls the mouse wheel.
1629
- *
1630
- * @param params - Wheel parameters (x, y, selector, deltaX, deltaY, steps)
1492
+ * CSS selector for items (in 'nested' mode) or the container (in 'columnar'/'segmented' modes).
1631
1493
  */
1632
- mouseWheel(params: {
1633
- x?: number;
1634
- y?: number;
1635
- selector?: string;
1636
- deltaX?: number;
1637
- deltaY?: number;
1638
- steps?: number;
1639
- }): Promise<void>;
1494
+ selector: string;
1640
1495
  /**
1641
- * Scrolls the element into view.
1642
- *
1643
- * @param params - Scroll parameters (selector)
1496
+ * Filter items/containers that contain a descendant matching this CSS selector.
1644
1497
  */
1645
- scrollIntoView(params: {
1646
- selector: string;
1647
- }): Promise<void>;
1498
+ has?: string;
1648
1499
  /**
1649
- * Types text into current focused element.
1650
- *
1651
- * @param text - Text to type
1652
- * @param delay - Delay between key presses
1500
+ * Exclude items/containers matching this CSS selector.
1653
1501
  */
1654
- keyboardType(text: string, delay?: number): Promise<void>;
1502
+ exclude?: string;
1655
1503
  /**
1656
- * Presses specified key.
1657
- *
1658
- * @param key - Key to press
1659
- * @param delay - Delay after key press
1504
+ * Schema applied recursively to each extracted item.
1505
+ * If omitted, defaults to extracting text.
1660
1506
  */
1661
- keyboardPress(key: string, delay?: number): Promise<void>;
1507
+ items?: ExtractSchema;
1662
1508
  /**
1663
- * Fills input element with specified value.
1664
- *
1665
- * @param selector - CSS selector of input element
1666
- * @param value - Value to fill
1667
- * @returns Promise resolving when fill operation completes
1668
- * @throws {Error} When no active page context exists
1509
+ * Shortcut for `items` to extract a specific attribute directly.
1669
1510
  */
1670
- fill(selector: string, value: string): Promise<void>;
1511
+ attribute?: string;
1671
1512
  /**
1672
- * Submits a form.
1673
- *
1674
- * @param selector - Optional form/submit button selector
1675
- * @param options - Submission options
1676
- * @returns Promise resolving when form is submitted
1677
- * @throws {Error} When no active page context exists
1513
+ * Array extraction mode.
1514
+ * - 'nested': (Default) Items are elements matched by `selector`.
1515
+ * - 'columnar': `selector` is a container, fields in `items` are parallel columns aligned by index.
1516
+ * - 'segmented': `selector` is a container, items are segmented by an anchor field.
1678
1517
  */
1679
- submit(selector?: any, options?: SubmitActionOptions): Promise<void>;
1518
+ mode?: ExtractArrayMode;
1519
+ }
1520
+ /**
1521
+ * Configuration for extracting an object with multiple properties.
1522
+ */
1523
+ interface ExtractObjectSchema extends BaseExtractSchema {
1524
+ type: 'object';
1680
1525
  /**
1681
- * Removes elements from the DOM based on selectors and presets.
1682
- *
1683
- * @param options - Trim options specifying selectors and presets
1684
- * @returns Promise resolving when trim operation completes
1685
- * @throws {Error} When no active page context exists
1526
+ * Root selector for the object. If provided, sub-properties are searched within this element.
1686
1527
  */
1687
- trim(options: TrimActionOptions): Promise<void>;
1528
+ selector?: string;
1688
1529
  /**
1689
- * Pauses execution, allowing for manual intervention or inspection.
1690
- *
1691
- * @param message - Optional message to display during pause
1692
- * @returns Promise resolving when execution is resumed
1693
- * @throws {Error} When no active page context exists
1530
+ * Filter the object element based on descendants.
1694
1531
  */
1695
- pause(message?: string): Promise<void>;
1532
+ has?: string;
1696
1533
  /**
1697
- * Executes a custom function or expression within the current page context.
1698
- *
1699
- * @remarks
1700
- * This is a powerful action that allows running custom logic to interact with the DOM,
1701
- * calculate values, or trigger navigations.
1702
- *
1703
- * - In **Browser Mode**, it runs in the real browser.
1704
- * - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
1705
- *
1706
- * The action handles automatic navigation if `window.location` is modified.
1707
- *
1708
- * @param params - Configuration for the execution, including the function and arguments.
1709
- * @returns A promise resolving to the result of the execution.
1710
- * @throws {Error} If no active page context exists or if execution fails.
1711
- *
1712
- * @see {@link EvaluateActionOptions} for detailed parameter options and examples.
1534
+ * Exclude the object element if it matches this selector.
1713
1535
  */
1714
- evaluate(params: EvaluateActionOptions): Promise<any>;
1536
+ exclude?: string;
1715
1537
  /**
1716
- * Extracts structured data from the current page content.
1717
- *
1718
- * @param schema - An object defining the data to extract.
1719
- * @returns A promise that resolves to an object with the extracted data.
1538
+ * Where to start searching for fields within this object.
1539
+ * Only applicable when the object is being extracted from an array of elements (e.g. in 'segmented' mode).
1540
+ * - 'anchor': (Default) All fields are searched within the entire scope.
1541
+ * - 'previous': Each field is searched starting from after the previous field's match.
1720
1542
  */
1721
- extract<T>(schema: ExtractSchema): Promise<T>;
1543
+ relativeTo?: 'anchor' | 'previous';
1722
1544
  /**
1723
- * Gets the unique identifier of this engine implementation.
1545
+ * Explicit order of property extraction.
1546
+ * Useful when using `relativeTo: 'previous'`.
1724
1547
  */
1725
- get id(): string;
1548
+ order?: string[];
1726
1549
  /**
1727
- * Returns the current state of the engine (cookies)
1728
- * that can be used to restore the session later.
1550
+ * Definition of the object's properties and their corresponding extraction schemas.
1729
1551
  */
1730
- getState(): Promise<{
1731
- cookies: Cookie[];
1732
- sessionState?: any;
1733
- }>;
1552
+ properties: {
1553
+ [key: string]: ExtractSchema;
1554
+ };
1555
+ }
1556
+
1557
+ interface PromiseLock extends Promise<void> {
1558
+ release: () => void;
1559
+ }
1560
+
1561
+ /**
1562
+ * Options for the {@link FetchEngine.goto}, allowing configuration of HTTP method, payload, headers, and navigation behavior.
1563
+ *
1564
+ * @remarks
1565
+ * Used when navigating to a URL to specify additional parameters beyond the basic URL.
1566
+ *
1567
+ * @example
1568
+ * ```ts
1569
+ * await engine.goto('https://example.com', {
1570
+ * method: 'POST',
1571
+ * payload: { username: 'user', password: 'pass' },
1572
+ * headers: { 'Content-Type': 'application/json' },
1573
+ * waitUntil: 'networkidle'
1574
+ * });
1575
+ * ```
1576
+ */
1577
+ interface GotoActionOptions {
1578
+ method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
1579
+ payload?: any;
1580
+ headers?: Record<string, string>;
1581
+ waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
1582
+ timeoutMs?: number;
1583
+ simulate?: boolean;
1584
+ }
1585
+ /**
1586
+ * Options for the {@link FetchEngine.waitFor} action, specifying conditions to wait for before continuing.
1587
+ *
1588
+ * @remarks
1589
+ * Controls timing behavior for interactions, allowing waiting for elements, time intervals, or network conditions.
1590
+ */
1591
+ interface WaitForActionOptions {
1592
+ ms?: number;
1593
+ selector?: string;
1594
+ networkIdle?: boolean;
1595
+ failOnTimeout?: boolean;
1596
+ }
1597
+ /**
1598
+ * Options for the {@link FetchEngine.submit} action, configuring form submission behavior.
1599
+ *
1600
+ * @remarks
1601
+ * Specifies encoding type for form submissions, particularly relevant for JSON-based APIs.
1602
+ */
1603
+ interface SubmitActionOptions {
1604
+ enctype?: 'application/x-www-form-urlencoded' | 'application/json' | 'multipart/form-data';
1605
+ }
1606
+ /**
1607
+ * Predefined cleanup groups for the {@link FetchEngine.trim} action.
1608
+ */
1609
+ type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
1610
+ /**
1611
+ * Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
1612
+ */
1613
+ interface TrimActionOptions {
1614
+ selectors?: string | string[];
1615
+ presets?: TrimPreset | TrimPreset[];
1616
+ }
1617
+ declare const TRIM_PRESETS: Record<string, string[]>;
1618
+ /**
1619
+ * Options for the {@link FetchEngine.evaluate} action, specifying the function to execute and its arguments.
1620
+ *
1621
+ * @remarks
1622
+ * This action allows executing custom JavaScript logic within the page context.
1623
+ *
1624
+ * **Execution Environments:**
1625
+ * - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
1626
+ * - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
1627
+ * including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
1628
+ *
1629
+ * **Navigation Handling:**
1630
+ * If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
1631
+ * automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
1632
+ *
1633
+ * @example
1634
+ * ```json
1635
+ * {
1636
+ * "action": "evaluate",
1637
+ * "params": {
1638
+ * "fn": "([a, b]) => a + b",
1639
+ * "args": [1, 2]
1640
+ * }
1641
+ * }
1642
+ * ```
1643
+ *
1644
+ * @example
1645
+ * ```json
1646
+ * {
1647
+ * "action": "evaluate",
1648
+ * "params": {
1649
+ * "fn": "({ x, y }) => x * y",
1650
+ * "args": { "x": 6, "y": 7 }
1651
+ * }
1652
+ * }
1653
+ * ```
1654
+ */
1655
+ interface EvaluateActionOptions {
1734
1656
  /**
1735
- * Gets the execution mode of this engine (`'http'` or `'browser'`).
1736
- */
1737
- get mode(): FetchEngineType;
1738
- /**
1739
- * Gets the fetch engine context associated with this instance.
1657
+ * The function or expression to execute.
1658
+ *
1659
+ * @remarks
1660
+ * Can be:
1661
+ * 1. A function object (only available when using the API directly).
1662
+ * 2. A string containing a function definition, e.g., `"async (args) => { ... }"`
1663
+ * 3. A string containing a direct expression, e.g., `"document.title"`
1664
+ *
1665
+ * **Note:** When using a function, it receives exactly ONE argument: the value provided in {@link args}.
1666
+ * Use destructuring to handle multiple parameters.
1740
1667
  */
1741
- get context(): FetchEngineContext | undefined;
1668
+ fn: string | ((...args: any[]) => any);
1742
1669
  /**
1743
- * Initializes the fetch engine with provided context and options.
1744
- *
1745
- * @param context - Fetch engine context
1746
- * @param options - Configuration options
1747
- * @returns Promise resolving when initialization completes
1670
+ * Data to pass to the function.
1748
1671
  *
1749
1672
  * @remarks
1750
- * Sets up internal state and calls implementation-specific [_initialize](file:///home/riceball/Documents/mywork/public/@isdk/ai-tools/packages/web-fetcher/src/engine/cheerio.ts#L169-L204) method.
1751
- * Automatically called when creating engine via `FetchEngine.create()`.
1673
+ * This value is passed as the first and only argument to the function defined in {@link fn}.
1674
+ * Recommended to use an array or object for multiple values.
1752
1675
  */
1753
- initialize(context: FetchEngineContext, options?: BaseFetcherProperties): Promise<void>;
1754
- cleanup(): Promise<void>;
1676
+ args?: any;
1677
+ }
1678
+ /**
1679
+ * Union type representing all possible engine actions that can be dispatched.
1680
+ *
1681
+ * @remarks
1682
+ * Defines the command structure processed during page interactions. Each action type corresponds to
1683
+ * a specific user interaction or navigation command within the action loop architecture.
1684
+ */
1685
+ type FetchEngineAction = {
1686
+ type: 'click';
1687
+ selector: string;
1688
+ } | {
1689
+ type: 'fill';
1690
+ selector: string;
1691
+ value: string;
1692
+ } | {
1693
+ type: 'mouseMove';
1694
+ params: {
1695
+ x?: number;
1696
+ y?: number;
1697
+ selector?: string;
1698
+ steps?: number;
1699
+ };
1700
+ } | {
1701
+ type: 'mouseClick';
1702
+ params: {
1703
+ x?: number;
1704
+ y?: number;
1705
+ button?: 'left' | 'right' | 'middle';
1706
+ clickCount?: number;
1707
+ delay?: number;
1708
+ steps?: number;
1709
+ };
1710
+ } | {
1711
+ type: 'mouseWheel';
1712
+ params: {
1713
+ x?: number;
1714
+ y?: number;
1715
+ selector?: string;
1716
+ deltaX?: number;
1717
+ deltaY?: number;
1718
+ steps?: number;
1719
+ };
1720
+ } | {
1721
+ type: 'keyboardType';
1722
+ params: {
1723
+ text: string;
1724
+ delay?: number;
1725
+ };
1726
+ } | {
1727
+ type: 'keyboardPress';
1728
+ params: {
1729
+ key: string;
1730
+ delay?: number;
1731
+ };
1732
+ } | {
1733
+ type: 'scrollIntoView';
1734
+ params: {
1735
+ selector: string;
1736
+ };
1737
+ } | {
1738
+ type: 'waitFor';
1739
+ options?: WaitForActionOptions;
1740
+ } | {
1741
+ type: 'submit';
1742
+ selector?: any;
1743
+ options?: SubmitActionOptions;
1744
+ } | {
1745
+ type: 'getContent';
1746
+ } | {
1747
+ type: 'navigate';
1748
+ url: string;
1749
+ opts?: GotoActionOptions;
1750
+ } | {
1751
+ type: 'extract';
1752
+ schema: ExtractSchema;
1753
+ } | {
1754
+ type: 'pause';
1755
+ message?: string;
1756
+ } | {
1757
+ type: 'trim';
1758
+ options: TrimActionOptions;
1759
+ } | {
1760
+ type: 'evaluate';
1761
+ params: EvaluateActionOptions;
1762
+ } | {
1763
+ type: 'dispose';
1764
+ };
1765
+ /**
1766
+ * Represents an action that has been dispatched and is awaiting execution in the active page context.
1767
+ *
1768
+ * @remarks
1769
+ * Connects the action request with its resolution mechanism. Used internally by the action dispatch system
1770
+ * to handle promises while maintaining the page context validity window.
1771
+ */
1772
+ interface DispatchedEngineAction {
1773
+ action: FetchEngineAction;
1774
+ resolve: (value?: any) => void;
1775
+ reject: (reason?: any) => void;
1776
+ }
1777
+ /**
1778
+ * Represents a pending navigation request awaiting resolution.
1779
+ *
1780
+ * @remarks
1781
+ * Tracks navigation requests that have been queued but not yet processed by the request handler.
1782
+ */
1783
+ interface PendingEngineRequest {
1784
+ resolve: (value: any) => void;
1785
+ reject: (reason?: any) => void;
1786
+ }
1787
+ /**
1788
+ * Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
1789
+ *
1790
+ * @remarks
1791
+ * The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
1792
+ * `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
1793
+ * content retrieval, and user interaction.
1794
+ *
1795
+ * The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
1796
+ * the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
1797
+ * page context validity across asynchronous operations.
1798
+ *
1799
+ * @example
1800
+ * ```ts
1801
+ * import "./playwright"; // 引入注册 Playwright browser 引擎
1802
+ * const engine = await FetchEngine.create(context, { engine: 'browser' });
1803
+ * await engine.goto('https://example.com');
1804
+ * await engine.fill('#username', 'user');
1805
+ * await engine.click('#submit');
1806
+ * const response = await engine.getContent();
1807
+ * ```
1808
+ */
1809
+ type AnyFetchEngine = FetchEngine<any, any, any>;
1810
+ type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
1811
+ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
1812
+ private static registry;
1755
1813
  /**
1756
- * Gets the initial scope for extraction for the specific engine.
1757
- * @param context - Crawlee crawling context
1758
- * @internal
1814
+ * Registers a fetch engine implementation with the global registry.
1815
+ *
1816
+ * @param engineClass - The engine class to register
1817
+ * @throws {Error} When engine class lacks static `id` or ID is already registered
1818
+ *
1819
+ * @example
1820
+ * ```ts
1821
+ * FetchEngine.register(CheerioFetchEngine);
1822
+ * ```
1759
1823
  */
1760
- protected abstract _getInitialElementScope(context: TContext): FetchElementScope;
1824
+ static register(engineClass: AnyFetchEngineCtor): void;
1761
1825
  /**
1762
- * Unified action processor that handles engine-agnostic actions.
1763
- * @param context - Crawlee crawling context
1764
- * @param action - Action to execute
1765
- * @internal
1826
+ * Retrieves a fetch engine implementation by its unique ID.
1827
+ *
1828
+ * @param id - The ID of the engine to retrieve
1829
+ * @returns Engine class if found, otherwise `undefined`
1766
1830
  */
1767
- protected _processAction(context: TContext, action: FetchEngineAction): Promise<any>;
1768
- protected _handlePause(action: {
1769
- message?: string;
1770
- }): Promise<void>;
1831
+ static get(id: string): AnyFetchEngineCtor | undefined;
1771
1832
  /**
1772
- * Executes all pending fetch engine actions within the current Crawlee request handler context.
1773
- *
1774
- * **Critical Execution Constraint**: This method **MUST** be awaited within the synchronous execution path
1775
- * of Crawlee's [requestHandler](https://crawlee.dev/js/api/basic-crawler) (before any `await` that yields control back to the event loop).
1776
- *
1777
- * ### Why This Constraint Exists
1778
- * - Crawlee's page context ([PlaywrightCrawler](https://crawlee.dev/js/api/playwright-crawler)'s `page` or [CheerioCrawler](https://crawlee.dev/js/api/cheerio-crawler)'s `$`)
1779
- * is **only valid during the synchronous execution phase** of the request handler
1780
- * - After any `await` (e.g., `await page.goto()`), the page context may be destroyed
1781
- * due to Crawlee's internal resource management
1782
- *
1783
- * ### How It Works
1784
- * 1. Processes all actions queued via {@link dispatchAction} (click, fill, submit, etc.)
1785
- * 2. Maintains the page context validity window via {@link isPageActive} lifecycle flag
1786
- * 3. Automatically cleans up event listeners upon completion
1833
+ * Retrieves a fetch engine implementation by execution mode.
1787
1834
  *
1788
- * Usage see {@link _sharedRequestHandler}
1789
- * @see {@link _sharedRequestHandler}
1790
- * @param context The active Crawlee crawling context containing the page/$ object
1791
- * @throws {Error} If called outside valid page context window (`!this.isPageActive`)
1792
- * @internal Engine infrastructure method - not for direct consumer use
1835
+ * @param mode - Execution mode (`'http'` or `'browser'`)
1836
+ * @returns Engine class if found, otherwise `undefined`
1793
1837
  */
1794
- protected _executePendingActions(context: TContext): Promise<void>;
1795
- protected _sharedRequestHandler(context: TContext): Promise<void>;
1796
- protected _sharedFailedRequestHandler(context: TContext, error?: Error): Promise<void>;
1797
- protected dispatchAction<T>(action: FetchEngineAction): Promise<T>;
1798
- private _requestHandler;
1799
- private _failedRequestHandler;
1800
- protected _commonCleanup(): Promise<void>;
1838
+ static getByMode(mode: FetchEngineType): AnyFetchEngineCtor | undefined;
1801
1839
  /**
1802
- * Blocks specified resource types from loading.
1840
+ * Factory method to create and initialize a fetch engine instance.
1803
1841
  *
1804
- * @param types - Resource types to block
1805
- * @param overwrite - Whether to replace existing blocked types
1806
- * @returns Number of blocked resource types
1842
+ * @param ctx - Fetch engine context
1843
+ * @param options - Configuration options
1844
+ * @returns Initialized fetch engine instance
1845
+ * @throws {Error} When no suitable engine implementation is found
1807
1846
  *
1808
- * @example
1809
- * ```ts
1810
- * await engine.blockResources(['image', 'stylesheet']);
1811
- * await engine.blockResources(['script'], true); // Replace existing
1812
- * ```
1847
+ * @remarks
1848
+ * Primary entry point for engine creation. Selects appropriate implementation based on `engine` name of the option or context.
1813
1849
  */
1814
- blockResources(types: ResourceType[], overwrite?: boolean): Promise<number>;
1850
+ static create(ctx: FetchEngineContext, options?: BaseFetcherProperties): Promise<AnyFetchEngine | undefined>;
1815
1851
  /**
1816
- * Gets content of current page.
1852
+ * Unique identifier for the engine implementation.
1817
1853
  *
1818
- * @returns Promise resolving to fetch response
1819
- * @throws {Error} When no content has been fetched yet
1854
+ * @remarks
1855
+ * Must be defined by concrete implementations. Used for registration and lookup in engine registry.
1820
1856
  */
1821
- getContent(): Promise<FetchResponse>;
1857
+ static readonly id: string;
1822
1858
  /**
1823
- * Manages HTTP headers for requests with multiple overloads.
1824
- *
1825
- * @overload
1826
- * Gets all headers.
1827
- * @returns All headers as record
1859
+ * Execution mode of the engine (`'http'` or `'browser'`).
1828
1860
  *
1829
- * @overload
1830
- * Gets specific header value.
1831
- * @param name - Header name
1832
- * @returns Header value
1833
- *
1834
- * @overload
1835
- * Sets multiple headers.
1836
- * @param headers - Headers to set
1837
- * @param replaced - Whether to replace all existing headers
1838
- * @returns `true` if successful
1839
- *
1840
- * @overload
1841
- * Sets single header.
1842
- * @param name - Header name
1843
- * @param value - Header value or `null` to remove
1844
- * @returns `true` if successful
1845
- *
1846
- * @example
1847
- * ```ts
1848
- * const allHeaders = await engine.headers();
1849
- * const userAgent = await engine.headers('user-agent');
1850
- * await engine.headers({ 'x-custom': 'value' });
1851
- * await engine.headers('auth', 'token');
1852
- * ```
1861
+ * @remarks
1862
+ * Must be defined by concrete implementations. Indicates whether engine operates at HTTP level or uses full browser.
1853
1863
  */
1854
- headers(): Promise<Record<string, string>>;
1855
- headers(name: string): Promise<string>;
1856
- headers(headers: Record<string, string>, replaced?: boolean): Promise<boolean>;
1857
- headers(name: string, value: string | null): Promise<boolean>;
1864
+ static readonly mode: FetchEngineType;
1865
+ protected ctx?: FetchEngineContext;
1866
+ protected opts?: BaseFetcherProperties;
1867
+ protected crawler?: TCrawler;
1868
+ protected isCrawlerReady?: boolean;
1869
+ protected crawlerRunPromise?: Promise<FinalStatistics>;
1870
+ protected config?: Configuration;
1871
+ protected requestQueue?: RequestQueue;
1872
+ protected kvStore?: KeyValueStore;
1873
+ protected proxyConfiguration?: ProxyConfiguration;
1874
+ protected hdrs: Record<string, string>;
1875
+ protected _initialCookies?: Cookie[];
1876
+ protected _initializedSessions: Set<string>;
1877
+ protected currentSession?: Session;
1878
+ protected pendingRequests: Map<string, PendingEngineRequest>;
1879
+ protected requestCounter: number;
1880
+ protected actionEmitter: EventEmitter;
1881
+ protected isPageActive: boolean;
1882
+ protected isEngineDisposed: boolean;
1883
+ protected navigationLock: PromiseLock;
1884
+ protected activeContext?: TContext;
1885
+ protected isExecutingAction: boolean;
1886
+ protected lastResponse?: FetchResponse;
1887
+ protected actionQueue: DispatchedEngineAction[];
1888
+ protected isProcessingActionLoop: boolean;
1889
+ protected blockedTypes: Set<string>;
1890
+ _logDebug(category: string, ...args: any[]): void;
1891
+ protected _cleanup?(): Promise<void>;
1892
+ protected _getTrimInfo(options: TrimActionOptions): {
1893
+ selectors: string[];
1894
+ removeComments: boolean;
1895
+ removeHidden: boolean;
1896
+ };
1858
1897
  /**
1859
- * Manages cookies for current session with multiple overloads.
1860
- *
1861
- * @overload
1862
- * Gets all cookies.
1863
- * @returns Array of cookies
1864
- *
1865
- * @overload
1866
- * Sets cookies for session.
1867
- * @param cookies - Cookies to set
1868
- * @returns `true` if successful
1898
+ * Finds all elements matching the selector within the given scope.
1869
1899
  *
1870
- * @example
1871
- * ```ts
1872
- * const cookies = await engine.cookies();
1873
- * await engine.cookies([{ name: 'session', value: '123' }]);
1874
- * ```
1900
+ * @param scope - The scope to search in (Engine-specific element/node or array of nodes).
1901
+ * @param selector - CSS selector.
1902
+ * @returns List of matching elements.
1903
+ * @see {@link IExtractEngine._querySelectorAll} for behavior contract.
1904
+ * @internal
1875
1905
  */
1876
- cookies(): Promise<Cookie[]>;
1877
- cookies(cookies: Cookie[]): Promise<boolean>;
1906
+ abstract _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
1878
1907
  /**
1879
- * Disposes of engine, cleaning up all resources.
1908
+ * Extracts a primitive value from the element based on schema.
1880
1909
  *
1881
- * @returns Promise resolving when disposal completes
1910
+ * @param schema - Value extraction schema.
1911
+ * @param scope - The element scope.
1912
+ * @returns Extracted value.
1913
+ * @see {@link IExtractEngine._extractValue} for behavior contract.
1914
+ * @internal
1882
1915
  */
1883
- dispose(): Promise<void>;
1884
- }
1885
- declare function getRandomDelay(base: number, variance?: number): number;
1886
-
1887
- type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
1888
- interface FetchReturnTypeRegistry {
1889
- response: FetchResponse;
1890
- context: FetchContext;
1891
- result: FetchActionResult<any> | undefined;
1892
- outputs: Record<string, any>;
1893
- any: any;
1894
- none: void;
1895
- }
1896
- type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
1897
-
1898
- /**
1899
- * Represents the state of an action being executed within a context.
1900
- *
1901
- * @remarks
1902
- * Extends the basic action properties with runtime metadata like execution index,
1903
- * nesting depth, and any errors encountered during execution.
1904
- */
1905
- interface FetchActionInContext extends FetchActionProperties {
1916
+ abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
1906
1917
  /**
1907
- * The 0-based index of the action in the execution sequence.
1918
+ * Gets the parent element of the given element.
1919
+ *
1920
+ * @param scope - The element scope.
1921
+ * @returns Parent element or null.
1922
+ * @internal
1908
1923
  */
1909
- index?: number;
1924
+ abstract _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
1910
1925
  /**
1911
- * Error encountered during action execution, if any.
1926
+ * Checks if two elements are the same identity.
1927
+ *
1928
+ * @param scope1 - First element scope.
1929
+ * @param scope2 - Second element scope.
1930
+ * @returns True if they are the same DOM node.
1931
+ * @internal
1912
1932
  */
1913
- error?: Error;
1933
+ abstract _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
1914
1934
  /**
1915
- * The nesting depth of the action. Top-level actions (executed directly by the session) have a depth of 0.
1935
+ * Gets all subsequent siblings of an element until a sibling matches the selector.
1936
+ * Used in 'segmented' extraction mode.
1937
+ *
1938
+ * @param scope - The anchor element scope.
1939
+ * @param untilSelector - Optional selector that marks the end of the segment (exclusive).
1940
+ * @returns List of sibling elements between anchor and untilSelector.
1941
+ * @internal
1916
1942
  */
1917
- depth?: number;
1918
- }
1919
- /**
1920
- * Base internal state used by fetch engines to maintain their runtime environment.
1921
- *
1922
- * @internal
1923
- */
1924
- interface BaseFetchContextInteralState {
1943
+ abstract _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
1925
1944
  /**
1926
- * The active engine instance (e.g., CheerioFetchEngine or PlaywrightFetchEngine)
1927
- * associated with this context.
1945
+ * Finds the closest ancestor of `scope` (including itself) that exists in the `candidates` array.
1946
+ *
1947
+ * @param scope - The starting element.
1948
+ * @param candidates - The array of potential ancestor scopes.
1949
+ * @returns A promise resolving to the matching candidate scope, or `null` if none found.
1950
+ * @see {@link IExtractEngine._findClosestAncestor} for implementation details.
1951
+ * @internal
1928
1952
  */
1929
- engine?: FetchEngine;
1953
+ abstract _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
1930
1954
  /**
1931
- * Additional implementation-specific internal state.
1955
+ * Checks if the `container` scope contains the `element` scope.
1956
+ *
1957
+ * @param container - The potential ancestor element.
1958
+ * @param element - The potential descendant element.
1959
+ * @returns A promise resolving to `true` if `container` contains `element`.
1960
+ * @see {@link IExtractEngine._contains} for implementation details.
1961
+ * @internal
1932
1962
  */
1933
- [key: string]: any;
1934
- }
1935
- /**
1936
- * Extended internal state for the fetch context, including action lifecycle management.
1937
- *
1938
- * @internal
1939
- */
1940
- interface FetchContextInteralState extends BaseFetchContextInteralState {
1963
+ abstract _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
1941
1964
  /**
1942
- * Stack of actions currently being executed, used to manage nested action calls.
1965
+ * Finds the Lowest Common Ancestor (LCA) of two element scopes.
1966
+ *
1967
+ * @param scope1 - The first element scope.
1968
+ * @param scope2 - The second element scope.
1969
+ * @returns A promise resolving to the LCA element scope, or `null` if none found.
1970
+ * @internal
1943
1971
  */
1944
- actionStack?: FetchActionInContext[];
1972
+ abstract _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
1945
1973
  /**
1946
- * Global counter for actions executed within the session, used to assign auto-incrementing indices.
1974
+ * Finds the direct child of container that contains element.
1975
+ *
1976
+ * @param element - The descendant element.
1977
+ * @param container - The container element.
1978
+ * @returns The child element of container, or null.
1979
+ * @internal
1947
1980
  */
1948
- actionIndex?: number;
1949
- }
1950
- /**
1951
- * Context provided to the Fetch Engine during navigation and request handling.
1952
- *
1953
- * @remarks
1954
- * This interface contains the minimum set of properties required by an engine
1955
- * to perform a fetch operation and build a response.
1956
- */
1957
- interface FetchEngineContext extends BaseFetcherProperties {
1981
+ abstract _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
1982
+ protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
1958
1983
  /**
1959
- * Unique identifier for the session or request batch.
1984
+ * Normalizes the array extraction mode into an options object.
1985
+ * @param mode - The mode string or options object.
1986
+ * @internal
1960
1987
  */
1961
- id: string;
1988
+ protected _normalizeArrayMode(mode?: ExtractArrayMode): {
1989
+ type: ExtractArrayModeName;
1990
+ } & any;
1962
1991
  /**
1963
- * The target URL for the next navigation, if specified.
1992
+ * Performs standard nested array extraction.
1993
+ * @param items - The schema for each item.
1994
+ * @param elements - The list of item elements.
1995
+ * @internal
1964
1996
  */
1965
- url?: string;
1997
+ protected _extractNested(items: ExtractSchema, elements: FetchElementScope[], opts?: {
1998
+ strict?: boolean;
1999
+ }): Promise<any[]>;
1966
2000
  /**
1967
- * The final URL after all redirects have been followed.
2001
+ * Performs columnar extraction (Column Alignment Mode).
2002
+ *
2003
+ * @param schema - The schema for a single item (must be an object or implicit object).
2004
+ * @param container - The container element to search within.
2005
+ * @param opts - Columnar extraction options (strict, inference).
2006
+ * @returns An array of extracted items, or null if requirements aren't met.
2007
+ * @internal
1968
2008
  */
1969
- finalUrl?: string;
2009
+ protected _extractColumnar(schema: ExtractSchema, container: FetchElementScope, opts?: ColumnarOptions): Promise<any[] | null>;
1970
2010
  /**
1971
- * The standardized response object from the most recent navigation.
2011
+ * Performs segmented extraction (Anchor-based Scanning).
2012
+ *
2013
+ * @param schema - The schema for a single item (must be an object).
2014
+ * @param container - The container element to scan.
2015
+ * @param opts - Segmented extraction options (anchor).
2016
+ * @returns An array of extracted items.
2017
+ * @internal
1972
2018
  */
1973
- lastResponse?: FetchResponse;
2019
+ protected _extractSegmented(schema: ExtractSchema, container: FetchElementScope, opts?: SegmentedOptions): Promise<any[] | null>;
1974
2020
  /**
1975
- * The result object from the most recent action execution.
2021
+ * Creates the crawler instance for the specific engine implementation.
2022
+ * @param options - The final crawler options.
2023
+ * @internal
1976
2024
  */
1977
- lastResult?: FetchActionResult;
2025
+ protected abstract _createCrawler(options: TOptions, config?: Configuration): TCrawler;
1978
2026
  /**
1979
- * Engine-specific internal state.
2027
+ * Gets the crawler-specific options from the subclass.
2028
+ * @param ctx - The fetch engine context.
2029
+ * @internal
1980
2030
  */
1981
- internal: BaseFetchContextInteralState;
1982
- }
1983
- /**
1984
- * The full execution context for a Web Fetcher session or action batch.
1985
- *
1986
- * @remarks
1987
- * This object is the central state container for the fetch operation. It provides
1988
- * access to configuration, the event bus, shared outputs, and the execution engine.
1989
- * It is passed to every action during execution.
1990
- */
1991
- interface FetchContext extends FetchEngineContext {
2031
+ protected abstract _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<TOptions>> | Partial<TOptions>;
1992
2032
  /**
1993
- * Metadata about the action currently being executed.
2033
+ * Abstract method for building standard [FetchResponse] from Crawlee context.
2034
+ *
2035
+ * @param context - Crawlee crawling context
2036
+ * @returns Promise resolving to [FetchResponse] object
2037
+ *
2038
+ * @remarks
2039
+ * Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
2040
+ * @internal
1994
2041
  */
1995
- currentAction?: FetchActionInContext;
2042
+ protected abstract _buildResponse(context: TContext): Promise<FetchResponse>;
2043
+ protected buildResponse(context: TContext): Promise<FetchResponse>;
1996
2044
  /**
1997
- * A shared key-value store for storing data extracted from pages or
1998
- * metadata generated during action execution.
2045
+ * Abstract method for executing action within current page context.
2046
+ *
2047
+ * @param context - Crawlee crawling context
2048
+ * @param action - Action to execute
2049
+ * @returns Promise resolving to action result
2050
+ *
2051
+ * @remarks
2052
+ * Handles specific user interactions using underlying technology (Playwright/Cheerio).
2053
+ * @internal
1999
2054
  */
2000
- outputs: Record<string, any>;
2055
+ protected abstract executeAction(context: TContext, action: FetchEngineAction): Promise<any>;
2001
2056
  /**
2002
- * Executes a FetchAction within the current context.
2057
+ * Navigates to the specified URL.
2003
2058
  *
2004
- * @param actionOptions - Configuration for the action to be executed.
2005
- * @returns A promise that resolves to the action's result.
2059
+ * @param url - Target URL
2060
+ * @param params - Navigation options
2061
+ * @returns Promise resolving when navigation completes
2062
+ *
2063
+ * @example
2064
+ * ```ts
2065
+ * await engine.goto('https://example.com');
2066
+ * ```
2006
2067
  */
2007
- execute<R extends FetchReturnType = 'any'>(actionOptions: FetchActionOptions): Promise<FetchActionResult<R>>;
2068
+ abstract goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
2008
2069
  /**
2009
- * Convenience method to execute an action by its registered name or ID.
2070
+ * Waits for specified condition before continuing.
2010
2071
  *
2011
- * @param name - The registered name or ID of the action.
2012
- * @param params - Parameters specific to the action type.
2013
- * @param options - Additional execution options (e.g., storeAs, failOnError).
2014
- * @returns A promise that resolves to the action's result.
2072
+ * @param params - Wait conditions
2073
+ * @returns Promise resolving when wait condition is met
2074
+ *
2075
+ * @example
2076
+ * ```ts
2077
+ * await engine.waitFor({ ms: 1000 }); // Wait 1 second
2078
+ * await engine.waitFor({ selector: '#content' }); // Wait for element
2079
+ * ```
2015
2080
  */
2016
- action<R extends FetchReturnType = 'any'>(name: string, params?: any, options?: Partial<FetchActionOptions>): Promise<FetchActionResult<R>>;
2081
+ waitFor(params?: WaitForActionOptions): Promise<void>;
2017
2082
  /**
2018
- * Internal state for engine and lifecycle management.
2083
+ * Clicks on element matching selector.
2084
+ *
2085
+ * @param selector - CSS selector of element to click
2086
+ * @returns Promise resolving when click is processed
2087
+ * @throws {Error} When no active page context exists
2019
2088
  */
2020
- internal: FetchContextInteralState;
2089
+ click(selector: string): Promise<void>;
2021
2090
  /**
2022
- * The central event bus for publishing and subscribing to session and action events.
2091
+ * Moves mouse to specified position or element.
2092
+ *
2093
+ * @param params - Move parameters (x, y, selector, steps)
2023
2094
  */
2024
- eventBus: EventEmitter;
2025
- }
2026
-
2027
- type CheerioAPI = NonNullable<CheerioCrawlingContext['$']>;
2028
- type CheerioSelection = ReturnType<CheerioAPI>;
2029
- type CheerioNode = ReturnType<CheerioSelection['first']>;
2030
- declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
2031
- static readonly id = "cheerio";
2032
- static readonly mode = "http";
2033
- private _ensureCheerioContext;
2034
- protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
2035
- _querySelectorAll(scope: {
2036
- $: CheerioAPI;
2037
- el: any;
2038
- } | any[], selector: string): Promise<FetchElementScope[]>;
2039
- _nextSiblingsUntil(scope: {
2040
- $: CheerioAPI;
2041
- el: CheerioNode;
2042
- }, untilSelector?: string): Promise<FetchElementScope[]>;
2043
- _parentElement(scope: {
2044
- $: CheerioAPI;
2045
- el: CheerioNode;
2046
- }): Promise<FetchElementScope | null>;
2047
- _isSameElement(scope1: {
2048
- el: CheerioNode;
2049
- }, scope2: {
2050
- el: CheerioNode;
2051
- }): Promise<boolean>;
2052
- _findClosestAncestor(scope: {
2053
- $: CheerioAPI;
2054
- el: CheerioNode;
2055
- }, candidates: {
2056
- $: CheerioAPI;
2057
- el: CheerioNode;
2058
- }[]): Promise<FetchElementScope | null>;
2059
- _contains(container: {
2060
- $: CheerioAPI;
2061
- el: CheerioNode;
2062
- }, element: {
2063
- $: CheerioAPI;
2064
- el: CheerioNode;
2065
- }): Promise<boolean>;
2066
- _findCommonAncestor(scope1: {
2067
- $: CheerioAPI;
2068
- el: CheerioNode;
2069
- }, scope2: {
2070
- $: CheerioAPI;
2071
- el: CheerioNode;
2072
- }): Promise<FetchElementScope | null>;
2073
- _findContainerChild(element: {
2074
- $: CheerioAPI;
2075
- el: CheerioNode;
2076
- }, container: {
2077
- $: CheerioAPI;
2078
- el: CheerioNode;
2079
- }): Promise<FetchElementScope | null>;
2080
- _extractValue(schema: ExtractValueSchema, scope: {
2081
- $: CheerioAPI;
2082
- el: CheerioNode;
2083
- }): Promise<any>;
2084
- protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
2085
- protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
2086
- protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
2087
- url: string;
2088
- method: string;
2089
- body?: any;
2090
- headers?: Record<string, string>;
2091
- }): Promise<any>;
2092
- protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
2093
- protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
2094
- protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
2095
- goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
2096
- }
2097
-
2098
- type Page = NonNullable<PlaywrightCrawlingContext['page']>;
2099
- type Locator = ReturnType<Page['locator']>;
2100
- declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
2101
- static readonly id = "playwright";
2102
- static readonly mode = "browser";
2103
- protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
2104
- _querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
2105
- _nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
2106
- _parentElement(scope: Locator): Promise<FetchElementScope | null>;
2107
- _isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
2108
- _findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
2109
- _contains(container: Locator, element: Locator): Promise<boolean>;
2110
- _findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
2111
- _findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
2112
- _extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
2113
- protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
2114
- protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
2115
- protected currentMousePos: {
2116
- x: number;
2117
- y: number;
2118
- };
2119
- protected _sharedRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
2120
- protected mouseInitialized: boolean;
2121
- protected _initializeMousePos(page: Page): Promise<void>;
2122
- protected _getTrajectory(start: {
2123
- x: number;
2124
- y: number;
2125
- }, end: {
2126
- x: number;
2127
- y: number;
2128
- }, steps?: number): {
2129
- x: number;
2130
- y: number;
2131
- }[];
2132
- protected _moveToPos(context: PlaywrightCrawlingContext, target: {
2133
- x: number;
2134
- y: number;
2135
- }, steps?: number): Promise<{
2136
- x: number;
2137
- y: number;
2138
- }>;
2139
- protected _ensureVisible(context: PlaywrightCrawlingContext, selector: string): Promise<{
2140
- x: number;
2141
- y: number;
2142
- }>;
2143
- protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
2144
- x: number;
2145
- y: number;
2146
- }>;
2147
- protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
2148
- protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
2149
- protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
2150
- goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
2151
- }
2152
-
2153
- declare enum FetchActionResultStatus {
2095
+ mouseMove(params: {
2096
+ x?: number;
2097
+ y?: number;
2098
+ selector?: string;
2099
+ steps?: number;
2100
+ }): Promise<void>;
2154
2101
  /**
2155
- * 动作执行失败但未抛出(通常因 failOnError=false);错误信息在 error 字段
2102
+ * Clicks at current position or specified position.
2103
+ *
2104
+ * @param params - Click parameters (x, y, button, clickCount, delay)
2156
2105
  */
2157
- Failed = 0,
2106
+ mouseClick(params: {
2107
+ x?: number;
2108
+ y?: number;
2109
+ button?: 'left' | 'right' | 'middle';
2110
+ clickCount?: number;
2111
+ delay?: number;
2112
+ }): Promise<void>;
2158
2113
  /**
2159
- * 动作按预期完成(即便产生 warnings)
2114
+ * Scrolls the mouse wheel.
2115
+ *
2116
+ * @param params - Wheel parameters (x, y, selector, deltaX, deltaY, steps)
2160
2117
  */
2161
- Success = 1,
2118
+ mouseWheel(params: {
2119
+ x?: number;
2120
+ y?: number;
2121
+ selector?: string;
2122
+ deltaX?: number;
2123
+ deltaY?: number;
2124
+ steps?: number;
2125
+ }): Promise<void>;
2162
2126
  /**
2163
- * 动作被判定为不执行/降级为 noop(比如引擎不支持且 degradeTo='noop')
2164
- * 能力不支持且 degradeTo='noop' 时:status='skipped',warnings 增加 { code:'capability-not-supported' }
2127
+ * Scrolls the element into view.
2128
+ *
2129
+ * @param params - Scroll parameters (selector)
2165
2130
  */
2166
- Skipped = 2
2167
- }
2168
- type FetchActionCapabilityMode = 'native' | 'simulate' | 'noop';
2169
- interface FetchActionMeta {
2170
- id: string;
2171
- index?: number;
2172
- engineType?: FetchEngineType;
2173
- capability?: FetchActionCapabilityMode;
2174
- response?: FetchResponse;
2175
- timings?: {
2176
- start: number;
2177
- total: number;
2178
- };
2179
- retries?: number;
2180
- }
2181
- interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
2182
- status: FetchActionResultStatus;
2183
- returnType?: R;
2184
- result?: FetchReturnTypeFor<R>;
2185
- error?: Error;
2186
- meta?: FetchActionMeta;
2187
- }
2188
- interface BaseFetchActionProperties {
2189
- id?: string;
2190
- name?: string;
2191
- action?: string | FetchAction;
2192
- index?: number;
2193
- params?: any;
2194
- args?: any;
2195
- storeAs?: string;
2196
- failOnError?: boolean;
2197
- failOnTimeout?: boolean;
2198
- timeoutMs?: number;
2199
- maxRetries?: number;
2200
- [key: string]: any;
2201
- }
2202
- type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
2203
- interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
2204
- activateOn?: string | RegExp | Array<string | RegExp>;
2205
- deactivateOn?: string | RegExp | Array<string | RegExp>;
2206
- collectOn?: string | RegExp | Array<string | RegExp>;
2207
- background?: boolean;
2208
- }
2209
- type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
2210
- interface FetchActionProperties extends BaseFetchActionProperties {
2211
- collectors?: BaseFetchCollectorOptions[];
2212
- }
2213
- type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
2214
- type FetchActionCapabilities = {
2215
- [mode in FetchEngineType]?: FetchActionCapabilityMode;
2216
- };
2217
- declare abstract class FetchAction {
2218
- private static registry;
2219
- static register(actionClass: typeof FetchAction): void;
2220
- static get(id: string): typeof FetchAction | undefined;
2221
- static create(id: FetchActionOptions): FetchAction | undefined;
2222
- static create(id: string): FetchAction | undefined;
2223
- static has(name: string): boolean;
2224
- static list(): string[];
2225
- static id: string;
2226
- static returnType: FetchReturnType;
2227
- static capabilities: FetchActionCapabilities;
2228
- static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2229
- getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2230
- get id(): string;
2231
- get returnType(): FetchReturnType;
2232
- get capabilities(): FetchActionCapabilities;
2233
- protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2234
- protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2235
- abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
2236
- protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
2237
- protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
2131
+ scrollIntoView(params: {
2132
+ selector: string;
2133
+ }): Promise<void>;
2238
2134
  /**
2239
- * Action 开始生命周期
2240
- * 负责:初始化 stack、设置 currentAction、触发事件、调用钩子
2135
+ * Types text into current focused element.
2136
+ *
2137
+ * @param text - Text to type
2138
+ * @param delay - Delay between key presses
2241
2139
  */
2242
- beforeExec(context: FetchContext, options?: FetchActionProperties): Promise<{
2243
- entry: FetchActionInContext;
2244
- collectors: CollectorsRuntime | undefined;
2245
- }>;
2140
+ keyboardType(text: string, delay?: number): Promise<void>;
2246
2141
  /**
2247
- * Action 结束生命周期
2248
- * 负责:调用钩子、赋值lastResult, 触发事件、清理 stack、恢复 currentAction
2142
+ * Presses specified key.
2143
+ *
2144
+ * @param key - Key to press
2145
+ * @param delay - Delay after key press
2249
2146
  */
2250
- afterExec(context: FetchContext, options?: BaseFetchCollectorActionProperties, result?: FetchActionResult, scope?: {
2251
- entry: FetchActionInContext;
2252
- collectors?: CollectorsRuntime;
2253
- }): Promise<void>;
2254
- execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
2255
- }
2256
- type CollectorsRuntime = {
2257
- cleanup: () => void;
2258
- awaitExecPendings: () => Promise<void>;
2259
- };
2260
-
2261
- type FetchEngineType = 'http' | 'browser';
2262
- type BrowserEngine = 'playwright' | 'puppeteer';
2263
- type FetchEngineMode = FetchEngineType | 'auto' | string;
2264
- type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
2265
- /**
2266
- * Storage configuration options for the fetch engine.
2267
- *
2268
- * @remarks
2269
- * Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
2270
- */
2271
- interface StorageOptions {
2147
+ keyboardPress(key: string, delay?: number): Promise<void>;
2272
2148
  /**
2273
- * Custom identifier for the storage.
2274
- * If provided, multiple sessions can share the same storage by using the same ID.
2275
- * If not provided, a unique session ID is used (strong isolation).
2149
+ * Fills input element with specified value.
2150
+ *
2151
+ * @param selector - CSS selector of input element
2152
+ * @param value - Value to fill
2153
+ * @returns Promise resolving when fill operation completes
2154
+ * @throws {Error} When no active page context exists
2276
2155
  */
2277
- id?: string;
2156
+ fill(selector: string, value: string): Promise<void>;
2278
2157
  /**
2279
- * Whether to persist storage to disk.
2280
- * If true, uses Crawlee's disk persistence. If false, data might be stored in memory or temporary directory.
2281
- * Corresponds to Crawlee's `persistStorage` configuration.
2158
+ * Submits a form.
2159
+ *
2160
+ * @param selector - Optional form/submit button selector
2161
+ * @param options - Submission options
2162
+ * @returns Promise resolving when form is submitted
2163
+ * @throws {Error} When no active page context exists
2282
2164
  */
2283
- persist?: boolean;
2165
+ submit(selector?: any, options?: SubmitActionOptions): Promise<void>;
2284
2166
  /**
2285
- * Whether to delete the storage (RequestQueue and KeyValueStore) when the session is closed.
2286
- * Defaults to true. Set to false if you want to keep data for future reuse with the same `id`.
2167
+ * Removes elements from the DOM based on selectors and presets.
2168
+ *
2169
+ * @param options - Trim options specifying selectors and presets
2170
+ * @returns Promise resolving when trim operation completes
2171
+ * @throws {Error} When no active page context exists
2287
2172
  */
2288
- purge?: boolean;
2173
+ trim(options: TrimActionOptions): Promise<void>;
2289
2174
  /**
2290
- * Additional Crawlee configuration options.
2291
- * Allows fine-grained control over the underlying Crawlee instance.
2175
+ * Pauses execution, allowing for manual intervention or inspection.
2176
+ *
2177
+ * @param message - Optional message to display during pause
2178
+ * @returns Promise resolving when execution is resumed
2179
+ * @throws {Error} When no active page context exists
2292
2180
  */
2293
- config?: Record<string, any>;
2294
- }
2295
- interface BaseFetcherProperties {
2181
+ pause(message?: string): Promise<void>;
2296
2182
  /**
2297
- * 抓取模式
2183
+ * Executes a custom function or expression within the current page context.
2298
2184
  *
2299
- * - `http`: 使用 HTTP 进行抓取
2300
- * - `browser`: 使用浏览器进行抓取
2301
- * - `auto`: auto 会走“智能探测”选择 http 或 browser, 但是如果没有启用 smart,并且在站点注册表中没有,那么则等价为 http.
2185
+ * @remarks
2186
+ * This is a powerful action that allows running custom logic to interact with the DOM,
2187
+ * calculate values, or trigger navigations.
2188
+ *
2189
+ * - In **Browser Mode**, it runs in the real browser.
2190
+ * - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
2191
+ *
2192
+ * The action handles automatic navigation if `window.location` is modified.
2193
+ *
2194
+ * @param params - Configuration for the execution, including the function and arguments.
2195
+ * @returns A promise resolving to the result of the execution.
2196
+ * @throws {Error} If no active page context exists or if execution fails.
2197
+ *
2198
+ * @see {@link EvaluateActionOptions} for detailed parameter options and examples.
2302
2199
  */
2303
- engine?: FetchEngineMode;
2304
- enableSmart?: boolean;
2305
- useSiteRegistry?: boolean;
2306
- antibot?: boolean;
2307
- debug?: boolean | string | string[];
2308
- headers?: Record<string, string>;
2309
- cookies?: Cookie[];
2310
- sessionState?: any;
2311
- sessionPoolOptions?: SessionPoolOptions;
2312
- overrideSessionState?: boolean;
2313
- throwHttpErrors?: boolean;
2314
- output?: {
2315
- cookies?: boolean;
2316
- sessionState?: boolean;
2317
- };
2318
- proxy?: string | string[];
2319
- blockResources?: ResourceType[];
2200
+ evaluate(params: EvaluateActionOptions): Promise<any>;
2320
2201
  /**
2321
- * Storage configuration for session isolation and persistence.
2202
+ * Extracts structured data from the current page content.
2203
+ *
2204
+ * @param schema - An object defining the data to extract.
2205
+ * @returns A promise that resolves to an object with the extracted data.
2322
2206
  */
2323
- storage?: StorageOptions;
2324
- ignoreSslErrors?: boolean;
2325
- browser?: {
2326
- /**
2327
- * 浏览器引擎,默认为 playwright
2328
- *
2329
- * - `playwright`: 使用 Playwright 引擎
2330
- * - `puppeteer`: 使用 Puppeteer 引擎
2331
- */
2332
- engine?: BrowserEngine;
2333
- headless?: boolean;
2334
- waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
2335
- launchOptions?: Record<string, any>;
2336
- };
2337
- http?: {
2338
- method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
2339
- body?: any;
2340
- };
2341
- timeoutMs?: number;
2342
- requestHandlerTimeoutSecs?: number;
2343
- maxConcurrency?: number;
2344
- maxRequestsPerMinute?: number;
2345
- delayBetweenRequestsMs?: number;
2346
- retries?: number;
2347
- sites?: FetchSite[];
2348
- url?: string;
2349
- }
2350
- interface FetchSite extends BaseFetcherProperties {
2351
- domain: string;
2352
- pathScope?: string[];
2353
- meta?: {
2354
- updatedAt?: number;
2355
- ttlMs?: number;
2356
- source?: 'manual' | 'smart';
2357
- };
2358
- }
2359
- type OnFetchPauseCallback = (options: {
2360
- message?: string;
2361
- }) => Promise<void>;
2362
- interface FetcherOptions extends BaseFetcherProperties {
2363
- actions?: FetchActionOptions[];
2364
- onPause?: OnFetchPauseCallback;
2365
- }
2366
- interface FetchMetadata {
2367
- mode: FetchEngineType;
2368
- engine?: BrowserEngine;
2369
- timings?: {
2370
- start: number;
2371
- total: number;
2372
- ttfb?: number;
2373
- dns?: number;
2374
- tcp?: number;
2375
- firstByte?: number;
2376
- download?: number;
2377
- };
2378
- proxy?: string;
2379
- [key: string]: any;
2380
- }
2381
- interface FetchResponse {
2382
- url: string;
2383
- finalUrl: string;
2384
- statusCode?: number;
2385
- statusText?: string;
2386
- headers: Record<string, string>;
2387
- contentType?: string;
2388
- body?: string | Buffer<ArrayBufferLike>;
2389
- html?: string;
2390
- text?: string;
2391
- json?: any;
2392
- cookies?: Cookie[];
2393
- sessionState?: any;
2394
- metadata?: FetchMetadata;
2395
- }
2396
- declare const DefaultFetcherProperties: BaseFetcherProperties;
2397
- declare const FetcherOptionKeys: string[];
2398
-
2399
- /**
2400
- * Represents a stateful web fetching session.
2401
- *
2402
- * @remarks
2403
- * A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
2404
- * cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
2405
- * session-level configurations and outputs.
2406
- *
2407
- * Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
2408
- */
2409
- declare class FetchSession {
2410
- protected options: FetcherOptions;
2207
+ extract<T>(schema: ExtractSchema): Promise<T>;
2411
2208
  /**
2412
- * Unique identifier for the session.
2209
+ * Gets the unique identifier of this engine implementation.
2413
2210
  */
2414
- readonly id: string;
2211
+ get id(): string;
2415
2212
  /**
2416
- * The execution context for this session, containing configurations, event bus, and shared state.
2213
+ * Returns the current state of the engine (cookies)
2214
+ * that can be used to restore the session later.
2417
2215
  */
2418
- readonly context: FetchContext;
2419
- protected closed: boolean;
2216
+ getState(): Promise<{
2217
+ cookies: Cookie[];
2218
+ sessionState?: any;
2219
+ }>;
2420
2220
  /**
2421
- * Creates a new FetchSession.
2221
+ * Gets the execution mode of this engine (`'http'` or `'browser'`).
2222
+ */
2223
+ get mode(): FetchEngineType;
2224
+ /**
2225
+ * Gets the fetch engine context associated with this instance.
2226
+ */
2227
+ get context(): FetchEngineContext | undefined;
2228
+ /**
2229
+ * Initializes the fetch engine with provided context and options.
2422
2230
  *
2423
- * @param options - Configuration options for the fetcher.
2231
+ * @param context - Fetch engine context
2232
+ * @param options - Configuration options
2233
+ * @returns Promise resolving when initialization completes
2234
+ *
2235
+ * @remarks
2236
+ * Sets up internal state and calls implementation-specific [_initialize](file:///home/riceball/Documents/mywork/public/@isdk/ai-tools/packages/web-fetcher/src/engine/cheerio.ts#L169-L204) method.
2237
+ * Automatically called when creating engine via `FetchEngine.create()`.
2424
2238
  */
2425
- constructor(options?: FetcherOptions);
2426
- protected _logDebug(category: string, ...args: any[]): void;
2239
+ initialize(context: FetchEngineContext, options?: BaseFetcherProperties): Promise<void>;
2240
+ cleanup(): Promise<void>;
2427
2241
  /**
2428
- * Executes a single action within the session.
2242
+ * Gets the initial scope for extraction for the specific engine.
2243
+ * @param context - Crawlee crawling context
2244
+ * @internal
2245
+ */
2246
+ protected abstract _getInitialElementScope(context: TContext): FetchElementScope;
2247
+ /**
2248
+ * Unified action processor that handles engine-agnostic actions.
2249
+ * @param context - Crawlee crawling context
2250
+ * @param action - Action to execute
2251
+ * @internal
2252
+ */
2253
+ protected _processAction(context: TContext, action: FetchEngineAction): Promise<any>;
2254
+ protected _handlePause(action: {
2255
+ message?: string;
2256
+ }): Promise<void>;
2257
+ /**
2258
+ * Executes all pending fetch engine actions within the current Crawlee request handler context.
2429
2259
  *
2430
- * @param actionOptions - Configuration for the action to be executed.
2431
- * @param context - Optional context override for this specific execution. Defaults to the session context.
2432
- * @returns A promise that resolves to the result of the action.
2433
- * @template R - The expected return type of the action.
2260
+ * **Critical Execution Constraint**: This method **MUST** be awaited within the synchronous execution path
2261
+ * of Crawlee's [requestHandler](https://crawlee.dev/js/api/basic-crawler) (before any `await` that yields control back to the event loop).
2262
+ *
2263
+ * ### Why This Constraint Exists
2264
+ * - Crawlee's page context ([PlaywrightCrawler](https://crawlee.dev/js/api/playwright-crawler)'s `page` or [CheerioCrawler](https://crawlee.dev/js/api/cheerio-crawler)'s `$`)
2265
+ * is **only valid during the synchronous execution phase** of the request handler
2266
+ * - After any `await` (e.g., `await page.goto()`), the page context may be destroyed
2267
+ * due to Crawlee's internal resource management
2268
+ *
2269
+ * ### How It Works
2270
+ * 1. Processes all actions queued via {@link dispatchAction} (click, fill, submit, etc.)
2271
+ * 2. Maintains the page context validity window via {@link isPageActive} lifecycle flag
2272
+ * 3. Automatically cleans up event listeners upon completion
2273
+ *
2274
+ * Usage see {@link _sharedRequestHandler}
2275
+ * @see {@link _sharedRequestHandler}
2276
+ * @param context The active Crawlee crawling context containing the page/$ object
2277
+ * @throws {Error} If called outside valid page context window (`!this.isPageActive`)
2278
+ * @internal Engine infrastructure method - not for direct consumer use
2279
+ */
2280
+ protected _executePendingActions(context: TContext): Promise<void>;
2281
+ protected _sharedRequestHandler(context: TContext): Promise<void>;
2282
+ protected _sharedFailedRequestHandler(context: TContext & {
2283
+ response?: FetchResponse;
2284
+ body?: string | Buffer;
2285
+ }, error?: Error): Promise<void>;
2286
+ protected dispatchAction<T>(action: FetchEngineAction): Promise<T>;
2287
+ private _requestHandler;
2288
+ private _failedRequestHandler;
2289
+ protected _commonCleanup(): Promise<void>;
2290
+ /**
2291
+ * Blocks specified resource types from loading.
2292
+ *
2293
+ * @param types - Resource types to block
2294
+ * @param overwrite - Whether to replace existing blocked types
2295
+ * @returns Number of blocked resource types
2434
2296
  *
2435
2297
  * @example
2436
2298
  * ```ts
2437
- * await session.execute({ name: 'goto', params: { url: 'https://example.com' } });
2299
+ * await engine.blockResources(['image', 'stylesheet']);
2300
+ * await engine.blockResources(['script'], true); // Replace existing
2438
2301
  * ```
2439
2302
  */
2440
- execute<R extends FetchReturnType = 'response'>(actionOptions: FetchActionOptions, context?: FetchContext): Promise<FetchActionResult<R>>;
2303
+ blockResources(types: ResourceType[], overwrite?: boolean): Promise<number>;
2441
2304
  /**
2442
- * Executes a sequence of actions.
2305
+ * Gets content of current page.
2443
2306
  *
2444
- * @param actions - An array of action options to be executed in order.
2445
- * @param options - Optional temporary configuration overrides (e.g., timeoutMs, headers) for this batch of actions.
2446
- * These overrides do not affect the main session context.
2447
- * @returns A promise that resolves to an object containing the result of the last action and all accumulated outputs.
2307
+ * @returns Promise resolving to fetch response
2308
+ * @throws {Error} When no content has been fetched yet
2309
+ */
2310
+ getContent(): Promise<FetchResponse>;
2311
+ /**
2312
+ * Manages HTTP headers for requests with multiple overloads.
2313
+ *
2314
+ * @overload
2315
+ * Gets all headers.
2316
+ * @returns All headers as record
2317
+ *
2318
+ * @overload
2319
+ * Gets specific header value.
2320
+ * @param name - Header name
2321
+ * @returns Header value
2322
+ *
2323
+ * @overload
2324
+ * Sets multiple headers.
2325
+ * @param headers - Headers to set
2326
+ * @param replaced - Whether to replace all existing headers
2327
+ * @returns `true` if successful
2328
+ *
2329
+ * @overload
2330
+ * Sets single header.
2331
+ * @param name - Header name
2332
+ * @param value - Header value or `null` to remove
2333
+ * @returns `true` if successful
2448
2334
  *
2449
2335
  * @example
2450
2336
  * ```ts
2451
- * const { result, outputs } = await session.executeAll([
2452
- * { name: 'goto', params: { url: 'https://example.com' } },
2453
- * { name: 'extract', params: { schema: { title: 'h1' } }, storeAs: 'data' }
2454
- * ], { timeoutMs: 30000 });
2337
+ * const allHeaders = await engine.headers();
2338
+ * const userAgent = await engine.headers('user-agent');
2339
+ * await engine.headers({ 'x-custom': 'value' });
2340
+ * await engine.headers('auth', 'token');
2455
2341
  * ```
2456
2342
  */
2457
- executeAll(actions: FetchActionOptions[], options?: Partial<FetcherOptions> & {
2458
- index?: number;
2459
- }): Promise<{
2460
- result: FetchResponse | undefined;
2461
- outputs: Record<string, any>;
2462
- }>;
2343
+ headers(): Promise<Record<string, string>>;
2344
+ headers(name: string): Promise<string>;
2345
+ headers(headers: Record<string, string>, replaced?: boolean): Promise<boolean>;
2346
+ headers(name: string, value: string | null): Promise<boolean>;
2463
2347
  /**
2464
- * Retrieves all outputs accumulated during the session.
2348
+ * Manages cookies for current session with multiple overloads.
2465
2349
  *
2466
- * @returns A record of stored output data.
2467
- */
2468
- getOutputs(): Record<string, any>;
2469
- /**
2470
- * Gets the current state of the session, including cookies and engine-specific state.
2350
+ * @overload
2351
+ * Gets all cookies.
2352
+ * @returns Array of cookies
2471
2353
  *
2472
- * @returns A promise resolving to the session state, or undefined if no engine is initialized.
2354
+ * @overload
2355
+ * Sets cookies for session.
2356
+ * @param cookies - Cookies to set
2357
+ * @returns `true` if successful
2358
+ *
2359
+ * @example
2360
+ * ```ts
2361
+ * const cookies = await engine.cookies();
2362
+ * await engine.cookies([{ name: 'session', value: '123' }]);
2363
+ * ```
2473
2364
  */
2474
- getState(): Promise<{
2475
- cookies: Cookie[];
2476
- sessionState?: any;
2477
- } | undefined>;
2365
+ cookies(): Promise<Cookie[]>;
2366
+ cookies(cookies: Cookie[]): Promise<boolean>;
2478
2367
  /**
2479
- * Disposes of the session and its associated engine.
2368
+ * Disposes of engine, cleaning up all resources.
2480
2369
  *
2481
- * @remarks
2482
- * This method should be called when the session is no longer needed to free up resources
2483
- * (e.g., closing browser instances, purging temporary storage).
2370
+ * @returns Promise resolving when disposal completes
2484
2371
  */
2485
2372
  dispose(): Promise<void>;
2486
- private ensureEngine;
2487
- protected createContext(options?: FetcherOptions): FetchContext;
2488
2373
  }
2374
+ declare function getRandomDelay(base: number, variance?: number): number;
2489
2375
 
2490
- /**
2491
- * High-level entry point for the Web Fetcher library.
2492
- *
2493
- * @remarks
2494
- * The `WebFetcher` provides a simplified API for fetching web content without manually managing sessions.
2495
- * It can be used for one-off requests or as a factory for more complex `FetchSession` instances.
2496
- *
2497
- * @example
2498
- * ```ts
2499
- * const fetcher = new WebFetcher();
2500
- * const { result } = await fetcher.fetch('https://example.com');
2501
- * ```
2502
- */
2503
- declare class WebFetcher {
2504
- private defaults;
2505
- /**
2506
- * Creates a new WebFetcher with default options.
2507
- *
2508
- * @param defaults - Default configuration options applied to all sessions and requests.
2509
- */
2510
- constructor(defaults?: FetcherOptions);
2376
+ type CheerioAPI = NonNullable<CheerioCrawlingContext['$']>;
2377
+ type CheerioSelection = ReturnType<CheerioAPI>;
2378
+ type CheerioNode = ReturnType<CheerioSelection['first']>;
2379
+ declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
2380
+ static readonly id = "cheerio";
2381
+ static readonly mode = "http";
2382
+ private _ensureCheerioContext;
2383
+ protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
2384
+ _querySelectorAll(scope: {
2385
+ $: CheerioAPI;
2386
+ el: any;
2387
+ } | any[], selector: string): Promise<FetchElementScope[]>;
2388
+ _nextSiblingsUntil(scope: {
2389
+ $: CheerioAPI;
2390
+ el: CheerioNode;
2391
+ }, untilSelector?: string): Promise<FetchElementScope[]>;
2392
+ _parentElement(scope: {
2393
+ $: CheerioAPI;
2394
+ el: CheerioNode;
2395
+ }): Promise<FetchElementScope | null>;
2396
+ _isSameElement(scope1: {
2397
+ el: CheerioNode;
2398
+ }, scope2: {
2399
+ el: CheerioNode;
2400
+ }): Promise<boolean>;
2401
+ _findClosestAncestor(scope: {
2402
+ $: CheerioAPI;
2403
+ el: CheerioNode;
2404
+ }, candidates: {
2405
+ $: CheerioAPI;
2406
+ el: CheerioNode;
2407
+ }[]): Promise<FetchElementScope | null>;
2408
+ _contains(container: {
2409
+ $: CheerioAPI;
2410
+ el: CheerioNode;
2411
+ }, element: {
2412
+ $: CheerioAPI;
2413
+ el: CheerioNode;
2414
+ }): Promise<boolean>;
2415
+ _findCommonAncestor(scope1: {
2416
+ $: CheerioAPI;
2417
+ el: CheerioNode;
2418
+ }, scope2: {
2419
+ $: CheerioAPI;
2420
+ el: CheerioNode;
2421
+ }): Promise<FetchElementScope | null>;
2422
+ _findContainerChild(element: {
2423
+ $: CheerioAPI;
2424
+ el: CheerioNode;
2425
+ }, container: {
2426
+ $: CheerioAPI;
2427
+ el: CheerioNode;
2428
+ }): Promise<FetchElementScope | null>;
2429
+ _extractValue(schema: ExtractValueSchema, scope: {
2430
+ $: CheerioAPI;
2431
+ el: CheerioNode;
2432
+ }): Promise<any>;
2433
+ protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
2434
+ protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
2435
+ protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
2436
+ url: string;
2437
+ method: string;
2438
+ body?: any;
2439
+ headers?: Record<string, string>;
2440
+ }): Promise<any>;
2441
+ protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
2442
+ protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
2443
+ protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
2444
+ goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
2445
+ }
2446
+
2447
+ type Page = NonNullable<PlaywrightCrawlingContext['page']>;
2448
+ type Locator = ReturnType<Page['locator']>;
2449
+ declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
2450
+ static readonly id = "playwright";
2451
+ static readonly mode = "browser";
2452
+ protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
2453
+ _querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
2454
+ _nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
2455
+ _parentElement(scope: Locator): Promise<FetchElementScope | null>;
2456
+ _isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
2457
+ _findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
2458
+ _contains(container: Locator, element: Locator): Promise<boolean>;
2459
+ _findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
2460
+ _findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
2461
+ _extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
2462
+ protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
2463
+ protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
2464
+ protected currentMousePos: {
2465
+ x: number;
2466
+ y: number;
2467
+ };
2468
+ protected _sharedRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
2469
+ protected mouseInitialized: boolean;
2470
+ protected _initializeMousePos(page: Page): Promise<void>;
2471
+ protected _getTrajectory(start: {
2472
+ x: number;
2473
+ y: number;
2474
+ }, end: {
2475
+ x: number;
2476
+ y: number;
2477
+ }, steps?: number): {
2478
+ x: number;
2479
+ y: number;
2480
+ }[];
2481
+ protected _moveToPos(context: PlaywrightCrawlingContext, target: {
2482
+ x: number;
2483
+ y: number;
2484
+ }, steps?: number): Promise<{
2485
+ x: number;
2486
+ y: number;
2487
+ }>;
2488
+ protected _ensureVisible(context: PlaywrightCrawlingContext, selector: string): Promise<{
2489
+ x: number;
2490
+ y: number;
2491
+ }>;
2492
+ protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
2493
+ x: number;
2494
+ y: number;
2495
+ }>;
2496
+ protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
2497
+ protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
2498
+ protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
2499
+ goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
2500
+ }
2501
+
2502
+ type FetchActionCapabilities = {
2503
+ [mode in FetchEngineType]?: FetchActionCapabilityMode;
2504
+ };
2505
+ declare abstract class FetchAction {
2506
+ private static registry;
2507
+ static register(actionClass: any): void;
2508
+ static get(id: string): any | undefined;
2509
+ static create(id: FetchActionOptions): FetchAction | undefined;
2510
+ static create(id: string): FetchAction | undefined;
2511
+ static has(name: string): boolean;
2512
+ static list(): string[];
2513
+ static id: string;
2514
+ static returnType: FetchReturnType;
2515
+ static capabilities: FetchActionCapabilities;
2516
+ static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2517
+ getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2518
+ get id(): string;
2519
+ get returnType(): FetchReturnType;
2520
+ get capabilities(): FetchActionCapabilities;
2521
+ protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2522
+ protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2523
+ abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
2524
+ protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
2525
+ protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
2511
2526
  /**
2512
- * Creates a new FetchSession.
2513
- *
2514
- * @param options - Configuration options for the session, merged with defaults.
2515
- * @returns A promise resolving to a new FetchSession instance.
2527
+ * Action 开始生命周期
2528
+ * 负责:初始化 stack、设置 currentAction、触发事件、调用钩子
2516
2529
  */
2517
- createSession(options?: FetcherOptions): Promise<FetchSession>;
2530
+ beforeExec(context: FetchContext, options?: FetchActionProperties): Promise<{
2531
+ entry: Required<Pick<FetchActionProperties, "action">> & Partial<Pick<FetchActionProperties, "id" | "name">> & {
2532
+ [x: string]: any;
2533
+ collectors?: BaseFetchCollectorOptions[] | undefined;
2534
+ index?: number | undefined;
2535
+ params?: any;
2536
+ args?: any;
2537
+ storeAs?: string | undefined;
2538
+ failOnError?: boolean | undefined;
2539
+ failOnTimeout?: boolean | undefined;
2540
+ timeoutMs?: number | undefined;
2541
+ maxRetries?: number | undefined;
2542
+ } & {
2543
+ index?: number;
2544
+ error?: Error;
2545
+ depth?: number;
2546
+ };
2547
+ collectors: CollectorsRuntime | undefined;
2548
+ }>;
2518
2549
  /**
2519
- * Fetches content from a URL or executes a complex action script.
2520
- *
2521
- * @remarks
2522
- * This method automatically creates a session, executes the specified actions,
2523
- * retrieves the content, and disposes of the session.
2524
- *
2525
- * @param url - The target URL or a complete FetcherOptions object.
2526
- * @param options - Additional options when the first parameter is a URL string.
2527
- * @returns A promise resolving to the final response and any extracted outputs.
2550
+ * Action 结束生命周期
2551
+ * 负责:调用钩子、赋值lastResult, 触发事件、清理 stack、恢复 currentAction
2528
2552
  */
2529
- fetch(url: string, options?: FetcherOptions): Promise<{
2530
- result: FetchResponse | undefined;
2531
- outputs: Record<string, any>;
2532
- }>;
2533
- fetch(options: FetcherOptions): Promise<{
2534
- result: FetchResponse | undefined;
2535
- outputs: Record<string, any>;
2536
- }>;
2553
+ afterExec(context: FetchContext, options?: BaseFetchCollectorActionProperties, result?: FetchActionResult, scope?: {
2554
+ entry: FetchActionInContext;
2555
+ collectors?: CollectorsRuntime;
2556
+ }): Promise<void>;
2557
+ execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
2537
2558
  }
2559
+ type CollectorsRuntime = {
2560
+ cleanup: () => void;
2561
+ awaitExecPendings: () => Promise<void>;
2562
+ };
2538
2563
 
2539
2564
  declare class ClickAction extends FetchAction {
2540
2565
  static id: string;
@@ -2779,4 +2804,4 @@ declare function fetchWeb(url: string, options?: FetcherOptions): Promise<{
2779
2804
  outputs: Record<string, any>;
2780
2805
  }>;
2781
2806
 
2782
- export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, MouseWheelAction, type MouseWheelParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, ScrollIntoViewAction, type ScrollIntoViewParams, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb, getRandomDelay };
2807
+ export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EngineUpgradeError, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionMeta, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, MouseWheelAction, type MouseWheelParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, ScrollIntoViewAction, type ScrollIntoViewParams, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb, getRandomDelay };