@isdk/web-fetcher 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/README.action.cn.md +28 -4
  2. package/README.action.md +27 -4
  3. package/README.cn.md +21 -0
  4. package/README.engine.cn.md +35 -7
  5. package/README.engine.md +30 -2
  6. package/README.md +23 -1
  7. package/dist/index.d.mts +1571 -1448
  8. package/dist/index.d.ts +1571 -1448
  9. package/dist/index.js +1 -1
  10. package/dist/index.mjs +1 -1
  11. package/docs/README.md +23 -1
  12. package/docs/_media/README.action.md +27 -4
  13. package/docs/_media/README.cn.md +21 -0
  14. package/docs/_media/README.engine.md +30 -2
  15. package/docs/classes/CheerioFetchEngine.md +169 -93
  16. package/docs/classes/ClickAction.md +29 -29
  17. package/docs/classes/EngineUpgradeError.md +335 -0
  18. package/docs/classes/EvaluateAction.md +29 -29
  19. package/docs/classes/ExtractAction.md +29 -29
  20. package/docs/classes/FetchAction.md +31 -29
  21. package/docs/classes/FetchEngine.md +159 -91
  22. package/docs/classes/FetchSession.md +14 -14
  23. package/docs/classes/FillAction.md +29 -29
  24. package/docs/classes/GetContentAction.md +29 -29
  25. package/docs/classes/GotoAction.md +29 -29
  26. package/docs/classes/KeyboardPressAction.md +29 -29
  27. package/docs/classes/KeyboardTypeAction.md +29 -29
  28. package/docs/classes/MouseClickAction.md +29 -29
  29. package/docs/classes/MouseMoveAction.md +29 -29
  30. package/docs/classes/MouseWheelAction.md +533 -0
  31. package/docs/classes/PauseAction.md +29 -29
  32. package/docs/classes/PlaywrightFetchEngine.md +252 -118
  33. package/docs/classes/ScrollIntoViewAction.md +533 -0
  34. package/docs/classes/SubmitAction.md +29 -29
  35. package/docs/classes/TrimAction.md +29 -29
  36. package/docs/classes/WaitForAction.md +29 -29
  37. package/docs/classes/WebFetcher.md +5 -5
  38. package/docs/enumerations/FetchActionResultStatus.md +4 -4
  39. package/docs/functions/fetchWeb.md +2 -2
  40. package/docs/functions/getRandomDelay.md +25 -0
  41. package/docs/globals.md +8 -1
  42. package/docs/interfaces/BaseFetchActionProperties.md +13 -13
  43. package/docs/interfaces/BaseFetchCollectorActionProperties.md +17 -17
  44. package/docs/interfaces/BaseFetcherProperties.md +44 -28
  45. package/docs/interfaces/DispatchedEngineAction.md +4 -4
  46. package/docs/interfaces/EvaluateActionOptions.md +3 -3
  47. package/docs/interfaces/ExtractActionProperties.md +13 -13
  48. package/docs/interfaces/FetchActionMeta.md +73 -0
  49. package/docs/interfaces/FetchActionProperties.md +15 -19
  50. package/docs/interfaces/FetchActionResult.md +7 -7
  51. package/docs/interfaces/FetchContext.md +65 -41
  52. package/docs/interfaces/FetchEngineContext.md +57 -33
  53. package/docs/interfaces/FetchMetadata.md +5 -5
  54. package/docs/interfaces/FetchResponse.md +14 -14
  55. package/docs/interfaces/FetchReturnTypeRegistry.md +7 -7
  56. package/docs/interfaces/FetchSite.md +55 -31
  57. package/docs/interfaces/FetcherOptions.md +55 -31
  58. package/docs/interfaces/GotoActionOptions.md +8 -8
  59. package/docs/interfaces/KeyboardPressParams.md +3 -3
  60. package/docs/interfaces/KeyboardTypeParams.md +3 -3
  61. package/docs/interfaces/MouseClickParams.md +6 -6
  62. package/docs/interfaces/MouseMoveParams.md +5 -5
  63. package/docs/interfaces/MouseWheelParams.md +69 -0
  64. package/docs/interfaces/PendingEngineRequest.md +3 -3
  65. package/docs/interfaces/ScrollIntoViewParams.md +17 -0
  66. package/docs/interfaces/StorageOptions.md +5 -5
  67. package/docs/interfaces/SubmitActionOptions.md +2 -2
  68. package/docs/interfaces/TrimActionOptions.md +3 -3
  69. package/docs/interfaces/WaitForActionOptions.md +5 -5
  70. package/docs/type-aliases/BaseFetchActionOptions.md +1 -1
  71. package/docs/type-aliases/BaseFetchCollectorOptions.md +1 -1
  72. package/docs/type-aliases/BrowserEngine.md +1 -1
  73. package/docs/type-aliases/FetchActionCapabilities.md +1 -1
  74. package/docs/type-aliases/FetchActionCapabilityMode.md +1 -1
  75. package/docs/type-aliases/FetchActionInContext.md +38 -0
  76. package/docs/type-aliases/FetchActionOptions.md +1 -1
  77. package/docs/type-aliases/FetchEngineAction.md +2 -2
  78. package/docs/type-aliases/FetchEngineType.md +1 -1
  79. package/docs/type-aliases/FetchReturnType.md +1 -1
  80. package/docs/type-aliases/FetchReturnTypeFor.md +1 -1
  81. package/docs/type-aliases/OnFetchPauseCallback.md +1 -1
  82. package/docs/type-aliases/ResourceType.md +1 -1
  83. package/docs/type-aliases/TrimPreset.md +1 -1
  84. package/docs/variables/DefaultFetcherProperties.md +1 -1
  85. package/docs/variables/FetcherOptionKeys.md +1 -1
  86. package/docs/variables/TRIM_PRESETS.md +1 -1
  87. package/package.json +7 -7
  88. package/docs/interfaces/FetchActionInContext.md +0 -190
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration, Cookie, Session, CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions, PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions, SessionPoolOptions } from 'crawlee';
1
+ import { Cookie, SessionPoolOptions, CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration, Session, CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions, PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions } from 'crawlee';
2
2
  export { Cookie } from 'crawlee';
3
3
  import { EventEmitter } from 'events-ex';
4
4
 
@@ -731,1759 +731,1835 @@ type _RequireAtLeastOne<
731
731
  Except<ObjectType, KeysType>;
732
732
 
733
733
  /**
734
- * Represents the engine-specific execution scope (e.g., a Cheerio node or a Playwright Locator).
735
- * It acts as the target for extraction and interaction actions.
736
- */
737
- type FetchElementScope = any;
738
- /**
739
- * Interface representing the minimal engine capabilities required for extraction.
734
+ * Represents the state of an action being executed within a context.
740
735
  *
741
736
  * @remarks
742
- * This interface abstracts the underlying DOM manipulation library (Cheerio or Playwright).
743
- * Implementing classes must ensure consistent behavior across different engines, especially
744
- * regarding scope handling (Element vs Array of Elements) and DOM traversal.
737
+ * Extends the basic action properties with runtime metadata like execution index,
738
+ * nesting depth, and any errors encountered during execution.
745
739
  */
746
- interface IExtractEngine {
747
- /**
748
- * Finds all elements matching the selector within the given scope.
749
- *
750
- * @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
751
- * @param selector - The CSS selector to match.
752
- * @returns A promise resolving to an array of found element scopes.
753
- *
754
- * @remarks
755
- * **Behavior Contract:**
756
- * 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
757
- * 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
758
- * 3. **Array Scope**: If `scope` is an array:
759
- * - It MUST process elements in the order they appear in the array (which should match document order).
760
- * - It MUST perform the check (Self + Descendants) for *each* element in the array.
761
- * - It MUST flatten the results into a single array.
762
- * - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
763
- * but generally, preserving document order is the priority.
764
- */
765
- _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
740
+ type FetchActionInContext = FetchActionOptions & {
766
741
  /**
767
- * Extracts a primitive value from the element based on the schema configuration.
768
- *
769
- * @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
770
- * @param scope - The specific element to extract data from.
771
- * @returns A promise resolving to the extracted value (string, number, boolean, or null).
772
- *
773
- * @remarks
774
- * **Behavior Contract:**
775
- * - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
776
- * - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
777
- * - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
778
- * - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
779
- * - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
742
+ * The 0-based index of the action in the execution sequence.
780
743
  */
781
- _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
744
+ index?: number;
782
745
  /**
783
- * Gets the parent element of the given scope.
784
- *
785
- * @param scope - The element to find the parent of.
786
- * @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
746
+ * Error encountered during action execution, if any.
787
747
  */
788
- _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
748
+ error?: Error;
789
749
  /**
790
- * Checks if two element scopes refer to the exact same DOM node.
791
- *
792
- * @param scope1 - The first element scope.
793
- * @param scope2 - The second element scope.
794
- * @returns A promise resolving to `true` if they are the same node, `false` otherwise.
795
- *
796
- * @remarks
797
- * This comparison MUST be identity-based, not just content-based.
750
+ * The nesting depth of the action. Top-level actions (executed directly by the session) have a depth of 0.
798
751
  */
799
- _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
752
+ depth?: number;
753
+ };
754
+ /**
755
+ * Base internal state used by fetch engines to maintain their runtime environment.
756
+ *
757
+ * @internal
758
+ */
759
+ interface BaseFetchContextInteralState {
800
760
  /**
801
- * Retrieves all subsequent sibling elements of the `scope` element, stopping *before* the first sibling that matches `untilSelector`.
802
- *
803
- * @param scope - The anchor element (starting point). The returned list starts *after* this element.
804
- * @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
805
- * If omitted or null, returns all following siblings.
806
- * @returns A promise resolving to an array of sibling element scopes.
807
- *
808
- * @remarks
809
- * **Behavior Contract:**
810
- * - **Starting Point**: The `scope` element itself IS NOT included in the result.
811
- * - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
812
- * - **Direction**: Only scans *following* siblings (next siblings).
813
- * - **Flattening**: The result is a flat list of siblings, not a nested structure.
761
+ * The active engine instance (e.g., CheerioFetchEngine or PlaywrightFetchEngine)
762
+ * associated with this context.
814
763
  */
815
- _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
764
+ engine?: any;
816
765
  /**
817
- * Finds the closest ancestor of the `scope` element (including the element itself) that is present in the `candidates` array.
818
- *
819
- * @param scope - The starting element from which to ascend the DOM tree.
820
- * @param candidates - An array of potential ancestor elements to check against.
821
- * @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
822
- *
823
- * @remarks
824
- * **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
825
- * It effectively answers: "Which of these container candidates does my current element belong to?"
826
- *
827
- * **Implementation Guidelines**:
828
- * - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
829
- * - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
766
+ * Additional implementation-specific internal state.
830
767
  */
831
- _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
768
+ [key: string]: any;
769
+ }
770
+ /**
771
+ * Extended internal state for the fetch context, including action lifecycle management.
772
+ *
773
+ * @internal
774
+ */
775
+ interface FetchContextInteralState extends BaseFetchContextInteralState {
832
776
  /**
833
- * Checks if the `container` element contains the `element` (descendant).
834
- *
835
- * @param container - The potential ancestor element.
836
- * @param element - The potential descendant element.
837
- * @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
838
- *
839
- * @remarks
840
- * **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
841
- *
842
- * @performance-critical Used extensively in boundary checks for Segmented extraction.
843
- * - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
844
- * - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
777
+ * Stack of actions currently being executed, used to manage nested action calls.
845
778
  */
846
- _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
779
+ actionStack?: FetchActionInContext[];
847
780
  /**
848
- * Finds the Lowest Common Ancestor (LCA) of two element scopes.
849
- *
850
- * @param scope1 - The first element.
851
- * @param scope2 - The second element.
852
- * @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
853
- *
854
- * @remarks
855
- * This is a fundamental tree operation used to find the point where two element paths diverge.
856
- * **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
781
+ * Global counter for actions executed within the session, used to assign auto-incrementing indices.
857
782
  */
858
- _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
783
+ actionIndex?: number;
784
+ }
785
+ /**
786
+ * Context provided to the Fetch Engine during navigation and request handling.
787
+ *
788
+ * @remarks
789
+ * This interface contains the minimum set of properties required by an engine
790
+ * to perform a fetch operation and build a response.
791
+ */
792
+ interface FetchEngineContext extends BaseFetcherProperties {
859
793
  /**
860
- * Finds the direct child of the `container` that contains the `element` (or is the `element` itself).
861
- *
862
- * @param element - The descendant element.
863
- * @param container - The ancestor container.
864
- * @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
865
- *
866
- * @remarks
867
- * This method traverses up from `element` until it finds the node whose parent is `container`.
868
- * **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
794
+ * Unique identifier for the session or request batch.
869
795
  */
870
- _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
796
+ id: string;
871
797
  /**
872
- * Logs debug information if debug mode is enabled.
873
- * @param category - The category of the log message.
874
- * @param args - Arguments to log.
798
+ * The target URL for the next navigation, if specified.
875
799
  */
876
- _logDebug(category: string, ...args: any[]): void;
877
- }
878
- /**
879
- * Base configuration for all extraction schemas.
880
- */
881
- interface BaseExtractSchema {
800
+ url?: string;
882
801
  /**
883
- * Whether this field is required. If true and the value is null,
884
- * the containing object or array item will be skipped (or throw error in strict mode).
802
+ * The final URL after all redirects have been followed.
885
803
  */
886
- required?: boolean;
804
+ finalUrl?: string;
887
805
  /**
888
- * Whether to enable strict mode for this extraction.
889
- * If true, missing required fields will throw an error instead of being skipped.
806
+ * The standardized response object from the most recent navigation.
890
807
  */
891
- strict?: boolean;
808
+ lastResponse?: FetchResponse;
892
809
  /**
893
- * Specifies the starting anchor for extraction of this field.
894
- * - Field Name: Uses the DOM element of a previously extracted field as the anchor.
895
- * - CSS Selector: Re-queries the selector within the current context to find the anchor.
896
- *
897
- * Once anchored, the search scope for this field becomes the siblings following the anchor.
810
+ * The result object from the most recent action execution.
898
811
  */
899
- anchor?: string;
812
+ lastResult?: FetchActionResult;
900
813
  /**
901
- * The maximum number of levels to bubble up from the anchor or matched element.
902
- * - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
903
- * - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
904
- * - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
814
+ * Engine-specific internal state.
905
815
  */
906
- depth?: number;
816
+ internal: BaseFetchContextInteralState;
907
817
  }
908
818
  /**
909
- * Extraction schema types.
910
- */
911
- type ExtractSchema = ExtractObjectSchema | ExtractArraySchema | ExtractValueSchema;
912
- /**
913
- * Configuration for extracting a single value.
819
+ * The full execution context for a Web Fetcher session or action batch.
820
+ *
821
+ * @remarks
822
+ * This object is the central state container for the fetch operation. It provides
823
+ * access to configuration, the event bus, shared outputs, and the execution engine.
824
+ * It is passed to every action during execution.
914
825
  */
915
- interface ExtractValueSchema extends BaseExtractSchema {
826
+ interface FetchContext extends FetchEngineContext {
916
827
  /**
917
- * The data type to cast the extracted value to.
918
- * @default 'string'
828
+ * Metadata about the action currently being executed.
919
829
  */
920
- type?: 'string' | 'number' | 'boolean' | 'html';
830
+ currentAction?: FetchActionInContext;
921
831
  /**
922
- * Extraction behavior mode.
923
- * - 'text': (Default) Uses textContent.
924
- * - 'innerText': Uses rendered text (respects CSS line breaks).
925
- * - 'html': Returns innerHTML.
926
- * - 'outerHTML': Returns HTML including the element's tag.
832
+ * A shared key-value store for storing data extracted from pages or
833
+ * metadata generated during action execution.
927
834
  */
928
- mode?: 'text' | 'innerText' | 'html' | 'outerHTML';
835
+ outputs: Record<string, any>;
929
836
  /**
930
- * CSS selector to locate the element within the current context.
837
+ * Executes a FetchAction within the current context.
838
+ *
839
+ * @param actionOptions - Configuration for the action to be executed.
840
+ * @returns A promise that resolves to the action's result.
931
841
  */
932
- selector?: string;
842
+ execute<R extends FetchReturnType = 'any'>(actionOptions: FetchActionOptions): Promise<FetchActionResult<R>>;
933
843
  /**
934
- * Attribute name to extract (e.g., 'href', 'src').
935
- * If omitted, the text content or HTML is extracted based on `type`.
844
+ * Convenience method to execute an action by its registered name or ID.
845
+ *
846
+ * @param name - The registered name or ID of the action.
847
+ * @param params - Parameters specific to the action type.
848
+ * @param options - Additional execution options (e.g., storeAs, failOnError).
849
+ * @returns A promise that resolves to a result.
936
850
  */
937
- attribute?: string;
851
+ action<R extends FetchReturnType = 'any'>(name: string, params?: any, options?: Partial<FetchActionOptions>): Promise<FetchActionResult<R>>;
938
852
  /**
939
- * Filter elements that contain a descendant matching this CSS selector.
853
+ * Internal state for engine and lifecycle management.
940
854
  */
941
- has?: string;
855
+ internal: FetchContextInteralState;
942
856
  /**
943
- * Exclude elements matching this CSS selector.
857
+ * The central event bus for publishing and subscribing to session and action events.
944
858
  */
945
- exclude?: string;
859
+ eventBus: EventEmitter;
946
860
  }
947
- /**
948
- * Names of the supported array extraction modes.
949
- */
950
- type ExtractArrayModeName = 'nested' | 'columnar' | 'segmented';
951
- /**
952
- * Base options for array extraction modes.
953
- */
954
- interface BaseModeOptions {
955
- type: ExtractArrayModeName;
956
- /**
957
- * Whether to enable strict mode for this specific array mode.
958
- * @default false
959
- */
960
- strict?: boolean;
961
- }
962
- /**
963
- * Options for columnar (column-alignment) extraction.
964
- */
965
- interface ColumnarOptions extends BaseModeOptions {
966
- type: 'columnar';
967
- /**
968
- * Whether to enable heuristic inference.
969
- * If true, tries to find a common parent to infer item wrappers when counts mismatch.
970
- * @default false
971
- */
972
- inference?: boolean;
861
+
862
+ type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
863
+ interface FetchReturnTypeRegistry {
864
+ response: FetchResponse;
865
+ context: FetchContext;
866
+ result: FetchActionResult<any> | undefined;
867
+ outputs: Record<string, any>;
868
+ any: any;
869
+ none: void;
973
870
  }
974
- /**
975
- * Options for segmented (anchor-based) extraction.
976
- */
977
- interface SegmentedOptions extends BaseModeOptions {
978
- type: 'segmented';
871
+ type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
872
+
873
+ declare enum FetchActionResultStatus {
979
874
  /**
980
- * The name of the field in `items` to use as a segment anchor, or a direct CSS selector.
981
- * Defaults to the first property key's selector defined in `items`.
875
+ * 动作执行失败但未抛出(通常因 failOnError=false);错误信息在 error 字段
982
876
  */
983
- anchor?: string;
877
+ Failed = 0,
984
878
  /**
985
- * Where to start searching for fields within each segment.
986
- * - 'anchor': (Default) All fields are searched within the entire segment.
987
- * - 'previous': Each field is searched starting from after the previous field's match.
879
+ * 动作按预期完成(即便产生 warnings)
988
880
  */
989
- relativeTo?: 'anchor' | 'previous';
881
+ Success = 1,
990
882
  /**
991
- * The maximum number of levels to bubble up from the anchor to find a segment container.
992
- * If omitted, it bubbles up as high as possible without conflicting with neighboring segments.
883
+ * 动作被判定为不执行/降级为 noop(比如引擎不支持且 degradeTo='noop')
884
+ * 能力不支持且 degradeTo='noop' 时:status='skipped',warnings 增加 { code:'capability-not-supported' }
993
885
  */
994
- depth?: number;
886
+ Skipped = 2
995
887
  }
996
- /**
997
- * Union type for array extraction modes and their options.
998
- */
999
- type ExtractArrayMode = ExtractArrayModeName | ColumnarOptions | SegmentedOptions;
1000
- /**
1001
- * Configuration for extracting an array of items.
1002
- */
1003
- interface ExtractArraySchema extends BaseExtractSchema {
1004
- type: 'array';
1005
- /**
1006
- * CSS selector for items (in 'nested' mode) or the container (in 'columnar'/'segmented' modes).
1007
- */
1008
- selector: string;
1009
- /**
1010
- * Filter items/containers that contain a descendant matching this CSS selector.
1011
- */
1012
- has?: string;
1013
- /**
1014
- * Exclude items/containers matching this CSS selector.
1015
- */
1016
- exclude?: string;
1017
- /**
1018
- * Schema applied recursively to each extracted item.
1019
- * If omitted, defaults to extracting text.
1020
- */
1021
- items?: ExtractSchema;
1022
- /**
1023
- * Shortcut for `items` to extract a specific attribute directly.
1024
- */
1025
- attribute?: string;
1026
- /**
1027
- * Array extraction mode.
1028
- * - 'nested': (Default) Items are elements matched by `selector`.
1029
- * - 'columnar': `selector` is a container, fields in `items` are parallel columns aligned by index.
1030
- * - 'segmented': `selector` is a container, items are segmented by an anchor field.
1031
- */
1032
- mode?: ExtractArrayMode;
888
+ type FetchActionCapabilityMode = 'native' | 'simulate' | 'noop';
889
+ interface FetchActionMeta {
890
+ id: string;
891
+ index?: number;
892
+ engineType?: FetchEngineType;
893
+ capability?: FetchActionCapabilityMode;
894
+ response?: FetchResponse;
895
+ timings?: {
896
+ start: number;
897
+ total: number;
898
+ };
899
+ retries?: number;
900
+ }
901
+ interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
902
+ status: FetchActionResultStatus;
903
+ returnType?: R;
904
+ result?: FetchReturnTypeFor<R>;
905
+ error?: Error;
906
+ meta?: FetchActionMeta;
907
+ }
908
+ interface BaseFetchActionProperties {
909
+ id?: string;
910
+ name?: string;
911
+ action?: string | any;
912
+ index?: number;
913
+ params?: any;
914
+ args?: any;
915
+ storeAs?: string;
916
+ failOnError?: boolean;
917
+ failOnTimeout?: boolean;
918
+ timeoutMs?: number;
919
+ maxRetries?: number;
920
+ [key: string]: any;
921
+ }
922
+ type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
923
+ interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
924
+ activateOn?: string | RegExp | Array<string | RegExp>;
925
+ deactivateOn?: string | RegExp | Array<string | RegExp>;
926
+ collectOn?: string | RegExp | Array<string | RegExp>;
927
+ background?: boolean;
928
+ }
929
+ type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
930
+ interface FetchActionProperties extends BaseFetchActionProperties {
931
+ collectors?: BaseFetchCollectorOptions[];
932
+ }
933
+ type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
934
+ declare class EngineUpgradeError extends Error {
935
+ res: FetchResponse;
936
+ code: string;
937
+ constructor(res: FetchResponse);
1033
938
  }
939
+ type FetchEngineType = 'http' | 'browser';
940
+ type BrowserEngine = 'playwright' | 'puppeteer';
941
+ type FetchEngineMode = FetchEngineType | 'auto' | string;
942
+ type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
1034
943
  /**
1035
- * Configuration for extracting an object with multiple properties.
944
+ * Storage configuration options for the fetch engine.
945
+ *
946
+ * @remarks
947
+ * Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
1036
948
  */
1037
- interface ExtractObjectSchema extends BaseExtractSchema {
1038
- type: 'object';
949
+ interface StorageOptions {
1039
950
  /**
1040
- * Root selector for the object. If provided, sub-properties are searched within this element.
951
+ * Custom identifier for the storage.
952
+ * If provided, multiple sessions can share the same storage by using the same ID.
953
+ * If not provided, a unique session ID is used (strong isolation).
1041
954
  */
1042
- selector?: string;
955
+ id?: string;
1043
956
  /**
1044
- * Filter the object element based on descendants.
957
+ * Whether to persist storage to disk.
958
+ * If true, uses Crawlee's disk persistence. If false, data might be stored in memory or temporary directory.
959
+ * Corresponds to Crawlee's `persistStorage` configuration.
1045
960
  */
1046
- has?: string;
961
+ persist?: boolean;
1047
962
  /**
1048
- * Exclude the object element if it matches this selector.
963
+ * Whether to delete the storage (RequestQueue and KeyValueStore) when the session is closed.
964
+ * Defaults to true. Set to false if you want to keep data for future reuse with the same `id`.
1049
965
  */
1050
- exclude?: string;
966
+ purge?: boolean;
1051
967
  /**
1052
- * Where to start searching for fields within this object.
1053
- * Only applicable when the object is being extracted from an array of elements (e.g. in 'segmented' mode).
1054
- * - 'anchor': (Default) All fields are searched within the entire scope.
1055
- * - 'previous': Each field is searched starting from after the previous field's match.
968
+ * Additional Crawlee configuration options.
969
+ * Allows fine-grained control over the underlying Crawlee instance.
1056
970
  */
1057
- relativeTo?: 'anchor' | 'previous';
971
+ config?: Record<string, any>;
972
+ }
973
+ interface BaseFetcherProperties {
1058
974
  /**
1059
- * Explicit order of property extraction.
1060
- * Useful when using `relativeTo: 'previous'`.
975
+ * 抓取模式
976
+ *
977
+ * - `http`: 使用 HTTP 进行抓取
978
+ * - `browser`: 使用浏览器进行抓取
979
+ * - `auto`: auto 会走“智能探测”选择 http 或 browser, 但是如果没有启用 smart,并且在站点注册表中没有,那么则等价为 http.
1061
980
  */
1062
- order?: string[];
981
+ engine?: FetchEngineMode;
982
+ enableSmart?: boolean;
983
+ syncStateOnUpgrade?: boolean;
984
+ upgradeThresholdMs?: number;
985
+ useSiteRegistry?: boolean;
986
+ antibot?: boolean;
987
+ debug?: boolean | string | string[];
988
+ headers?: Record<string, string>;
989
+ cookies?: Cookie[];
990
+ sessionState?: any;
991
+ sessionPoolOptions?: SessionPoolOptions;
992
+ overrideSessionState?: boolean;
993
+ throwHttpErrors?: boolean;
994
+ output?: {
995
+ cookies?: boolean;
996
+ sessionState?: boolean;
997
+ };
998
+ proxy?: string | string[];
999
+ blockResources?: ResourceType[];
1063
1000
  /**
1064
- * Definition of the object's properties and their corresponding extraction schemas.
1001
+ * Storage configuration for session isolation and persistence.
1065
1002
  */
1066
- properties: {
1067
- [key: string]: ExtractSchema;
1003
+ storage?: StorageOptions;
1004
+ ignoreSslErrors?: boolean;
1005
+ browser?: {
1006
+ /**
1007
+ * 浏览器引擎,默认为 playwright
1008
+ *
1009
+ * - `playwright`: 使用 Playwright 引擎
1010
+ * - `puppeteer`: 使用 Puppeteer 引擎
1011
+ */
1012
+ engine?: BrowserEngine;
1013
+ headless?: boolean;
1014
+ waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
1015
+ launchOptions?: Record<string, any>;
1016
+ };
1017
+ http?: {
1018
+ method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
1019
+ body?: any;
1068
1020
  };
1069
- }
1070
-
1071
- interface PromiseLock extends Promise<void> {
1072
- release: () => void;
1073
- }
1074
-
1075
- /**
1076
- * Options for the {@link FetchEngine.goto}, allowing configuration of HTTP method, payload, headers, and navigation behavior.
1077
- *
1078
- * @remarks
1079
- * Used when navigating to a URL to specify additional parameters beyond the basic URL.
1080
- *
1081
- * @example
1082
- * ```ts
1083
- * await engine.goto('https://example.com', {
1084
- * method: 'POST',
1085
- * payload: { username: 'user', password: 'pass' },
1086
- * headers: { 'Content-Type': 'application/json' },
1087
- * waitUntil: 'networkidle'
1088
- * });
1089
- * ```
1090
- */
1091
- interface GotoActionOptions {
1092
- method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
1093
- payload?: any;
1094
- headers?: Record<string, string>;
1095
- waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
1096
1021
  timeoutMs?: number;
1097
- simulate?: boolean;
1022
+ requestHandlerTimeoutSecs?: number;
1023
+ maxConcurrency?: number;
1024
+ maxRequestsPerMinute?: number;
1025
+ delayBetweenRequestsMs?: number;
1026
+ retries?: number;
1027
+ sites?: FetchSite[];
1028
+ url?: string;
1098
1029
  }
1099
- /**
1100
- * Options for the {@link FetchEngine.waitFor} action, specifying conditions to wait for before continuing.
1101
- *
1102
- * @remarks
1103
- * Controls timing behavior for interactions, allowing waiting for elements, time intervals, or network conditions.
1104
- */
1105
- interface WaitForActionOptions {
1106
- ms?: number;
1107
- selector?: string;
1108
- networkIdle?: boolean;
1109
- failOnTimeout?: boolean;
1030
+ interface FetchSite extends BaseFetcherProperties {
1031
+ domain: string;
1032
+ pathScope?: string[];
1033
+ meta?: {
1034
+ updatedAt?: number;
1035
+ ttlMs?: number;
1036
+ source?: 'manual' | 'smart';
1037
+ };
1038
+ }
1039
+ type OnFetchPauseCallback = (options: {
1040
+ message?: string;
1041
+ }) => Promise<void>;
1042
+ interface FetcherOptions extends BaseFetcherProperties {
1043
+ actions?: FetchActionOptions[];
1044
+ onPause?: OnFetchPauseCallback;
1045
+ }
1046
+ interface FetchMetadata {
1047
+ mode: FetchEngineType;
1048
+ engine?: BrowserEngine;
1049
+ timings?: {
1050
+ start: number;
1051
+ total: number;
1052
+ ttfb?: number;
1053
+ dns?: number;
1054
+ tcp?: number;
1055
+ firstByte?: number;
1056
+ download?: number;
1057
+ };
1058
+ proxy?: string;
1059
+ [key: string]: any;
1060
+ }
1061
+ interface FetchResponse {
1062
+ url: string;
1063
+ finalUrl: string;
1064
+ statusCode?: number;
1065
+ statusText?: string;
1066
+ headers: Record<string, string>;
1067
+ contentType?: string;
1068
+ body?: string | Buffer<ArrayBufferLike>;
1069
+ html?: string;
1070
+ text?: string;
1071
+ json?: any;
1072
+ cookies?: Cookie[];
1073
+ sessionState?: any;
1074
+ metadata?: FetchMetadata;
1110
1075
  }
1076
+ declare const DefaultFetcherProperties: BaseFetcherProperties;
1077
+ declare const FetcherOptionKeys: string[];
1078
+
1111
1079
  /**
1112
- * Options for the {@link FetchEngine.submit} action, configuring form submission behavior.
1080
+ * Represents a stateful web fetching session.
1113
1081
  *
1114
1082
  * @remarks
1115
- * Specifies encoding type for form submissions, particularly relevant for JSON-based APIs.
1116
- */
1117
- interface SubmitActionOptions {
1118
- enctype?: 'application/x-www-form-urlencoded' | 'application/json' | 'multipart/form-data';
1119
- }
1120
- /**
1121
- * Predefined cleanup groups for the {@link FetchEngine.trim} action.
1122
- */
1123
- type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
1124
- /**
1125
- * Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
1083
+ * A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
1084
+ * cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
1085
+ * session-level configurations and outputs.
1086
+ *
1087
+ * Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
1126
1088
  */
1127
- interface TrimActionOptions {
1128
- selectors?: string | string[];
1129
- presets?: TrimPreset | TrimPreset[];
1089
+ declare class FetchSession {
1090
+ protected options: FetcherOptions;
1091
+ /**
1092
+ * Unique identifier for the session.
1093
+ */
1094
+ readonly id: string;
1095
+ /**
1096
+ * The execution context for this session, containing configurations, event bus, and shared state.
1097
+ */
1098
+ readonly context: FetchContext;
1099
+ protected closed: boolean;
1100
+ /**
1101
+ * Creates a new FetchSession.
1102
+ *
1103
+ * @param options - Configuration options for the fetcher.
1104
+ */
1105
+ constructor(options?: FetcherOptions);
1106
+ protected _logDebug(category: string, ...args: any[]): void;
1107
+ /**
1108
+ * Executes a single action within the session.
1109
+ *
1110
+ * @param actionOptions - Configuration for the action to be executed.
1111
+ * @param context - Optional context override for this specific execution. Defaults to the session context.
1112
+ * @returns A promise that resolves to the result of the action.
1113
+ * @template R - The expected return type of the action.
1114
+ *
1115
+ * @example
1116
+ * ```ts
1117
+ * await session.execute({ name: 'goto', params: { url: 'https://example.com' } });
1118
+ * ```
1119
+ */
1120
+ execute<R extends FetchReturnType = 'response'>(actionOptions: FetchActionOptions, context?: FetchContext): Promise<FetchActionResult<R>>;
1121
+ /**
1122
+ * Executes a sequence of actions.
1123
+ *
1124
+ * @param actions - An array of action options to be executed in order.
1125
+ * @param options - Optional temporary configuration overrides (e.g., timeoutMs, headers) for this batch of actions.
1126
+ * These overrides do not affect the main session context.
1127
+ * @returns A promise that resolves to an object containing the result of the last action and all accumulated outputs.
1128
+ *
1129
+ * @example
1130
+ * ```ts
1131
+ * const { result, outputs } = await session.executeAll([
1132
+ * { name: 'goto', params: { url: 'https://example.com' } },
1133
+ * { name: 'extract', params: { schema: { title: 'h1' } }, storeAs: 'data' }
1134
+ * ], { timeoutMs: 30000 });
1135
+ * ```
1136
+ */
1137
+ executeAll(actions: FetchActionOptions[], options?: Partial<FetcherOptions> & {
1138
+ index?: number;
1139
+ }): Promise<{
1140
+ result: FetchResponse | undefined;
1141
+ outputs: Record<string, any>;
1142
+ }>;
1143
+ /**
1144
+ * Retrieves all outputs accumulated during the session.
1145
+ *
1146
+ * @returns A record of stored output data.
1147
+ */
1148
+ getOutputs(): Record<string, any>;
1149
+ /**
1150
+ * Gets the current state of the session, including cookies and engine-specific state.
1151
+ *
1152
+ * @returns A promise resolving to the session state, or undefined if no engine is initialized.
1153
+ */
1154
+ getState(): Promise<{
1155
+ cookies: Cookie[];
1156
+ sessionState?: any;
1157
+ } | undefined>;
1158
+ /**
1159
+ * Disposes of the session and its associated engine.
1160
+ *
1161
+ * @remarks
1162
+ * This method should be called when the session is no longer needed to free up resources
1163
+ * (e.g., closing browser instances, purging temporary storage).
1164
+ */
1165
+ dispose(): Promise<void>;
1166
+ private ensureEngine;
1167
+ protected createContext(options?: FetcherOptions): FetchContext;
1130
1168
  }
1131
- declare const TRIM_PRESETS: Record<string, string[]>;
1169
+
1132
1170
  /**
1133
- * Options for the {@link FetchEngine.evaluate} action, specifying the function to execute and its arguments.
1171
+ * High-level entry point for the Web Fetcher library.
1134
1172
  *
1135
1173
  * @remarks
1136
- * This action allows executing custom JavaScript logic within the page context.
1137
- *
1138
- * **Execution Environments:**
1139
- * - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
1140
- * - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
1141
- * including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
1142
- *
1143
- * **Navigation Handling:**
1144
- * If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
1145
- * automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
1146
- *
1147
- * @example
1148
- * ```json
1149
- * {
1150
- * "action": "evaluate",
1151
- * "params": {
1152
- * "fn": "([a, b]) => a + b",
1153
- * "args": [1, 2]
1154
- * }
1155
- * }
1156
- * ```
1174
+ * The `WebFetcher` provides a simplified API for fetching web content without manually managing sessions.
1175
+ * It can be used for one-off requests or as a factory for more complex `FetchSession` instances.
1157
1176
  *
1158
1177
  * @example
1159
- * ```json
1160
- * {
1161
- * "action": "evaluate",
1162
- * "params": {
1163
- * "fn": "({ x, y }) => x * y",
1164
- * "args": { "x": 6, "y": 7 }
1165
- * }
1166
- * }
1178
+ * ```ts
1179
+ * const fetcher = new WebFetcher();
1180
+ * const { result } = await fetcher.fetch('https://example.com');
1167
1181
  * ```
1168
1182
  */
1169
- interface EvaluateActionOptions {
1183
+ declare class WebFetcher {
1184
+ private defaults;
1170
1185
  /**
1171
- * The function or expression to execute.
1186
+ * Creates a new WebFetcher with default options.
1172
1187
  *
1173
- * @remarks
1174
- * Can be:
1175
- * 1. A function object (only available when using the API directly).
1176
- * 2. A string containing a function definition, e.g., `"async (args) => { ... }"`
1177
- * 3. A string containing a direct expression, e.g., `"document.title"`
1188
+ * @param defaults - Default configuration options applied to all sessions and requests.
1189
+ */
1190
+ constructor(defaults?: FetcherOptions);
1191
+ /**
1192
+ * Creates a new FetchSession.
1178
1193
  *
1179
- * **Note:** When using a function, it receives exactly ONE argument: the value provided in {@link args}.
1180
- * Use destructuring to handle multiple parameters.
1194
+ * @param options - Configuration options for the session, merged with defaults.
1195
+ * @returns A promise resolving to a new FetchSession instance.
1181
1196
  */
1182
- fn: string | ((...args: any[]) => any);
1197
+ createSession(options?: FetcherOptions): Promise<FetchSession>;
1183
1198
  /**
1184
- * Data to pass to the function.
1199
+ * Fetches content from a URL or executes a complex action script.
1185
1200
  *
1186
1201
  * @remarks
1187
- * This value is passed as the first and only argument to the function defined in {@link fn}.
1188
- * Recommended to use an array or object for multiple values.
1202
+ * This method automatically creates a session, executes the specified actions,
1203
+ * retrieves the content, and disposes of the session.
1204
+ *
1205
+ * @param url - The target URL or a complete FetcherOptions object.
1206
+ * @param options - Additional options when the first parameter is a URL string.
1207
+ * @returns A promise resolving to the final response and any extracted outputs.
1189
1208
  */
1190
- args?: any;
1209
+ fetch(url: string, options?: FetcherOptions): Promise<{
1210
+ result: FetchResponse | undefined;
1211
+ outputs: Record<string, any>;
1212
+ }>;
1213
+ fetch(options: FetcherOptions): Promise<{
1214
+ result: FetchResponse | undefined;
1215
+ outputs: Record<string, any>;
1216
+ }>;
1191
1217
  }
1218
+
1192
1219
  /**
1193
- * Union type representing all possible engine actions that can be dispatched.
1220
+ * Represents the engine-specific execution scope (e.g., a Cheerio node or a Playwright Locator).
1221
+ * It acts as the target for extraction and interaction actions.
1222
+ */
1223
+ type FetchElementScope = any;
1224
+ /**
1225
+ * Interface representing the minimal engine capabilities required for extraction.
1194
1226
  *
1195
1227
  * @remarks
1196
- * Defines the command structure processed during page interactions. Each action type corresponds to
1197
- * a specific user interaction or navigation command within the action loop architecture.
1228
+ * This interface abstracts the underlying DOM manipulation library (Cheerio or Playwright).
1229
+ * Implementing classes must ensure consistent behavior across different engines, especially
1230
+ * regarding scope handling (Element vs Array of Elements) and DOM traversal.
1198
1231
  */
1199
- type FetchEngineAction = {
1200
- type: 'click';
1201
- selector: string;
1202
- } | {
1203
- type: 'fill';
1204
- selector: string;
1205
- value: string;
1206
- } | {
1207
- type: 'mouseMove';
1208
- params: {
1209
- x?: number;
1210
- y?: number;
1211
- selector?: string;
1212
- steps?: number;
1213
- };
1214
- } | {
1215
- type: 'mouseClick';
1216
- params: {
1217
- x?: number;
1218
- y?: number;
1219
- button?: 'left' | 'right' | 'middle';
1220
- clickCount?: number;
1221
- delay?: number;
1222
- };
1223
- } | {
1224
- type: 'keyboardType';
1225
- params: {
1226
- text: string;
1227
- delay?: number;
1228
- };
1229
- } | {
1230
- type: 'keyboardPress';
1231
- params: {
1232
- key: string;
1233
- delay?: number;
1234
- };
1235
- } | {
1236
- type: 'waitFor';
1237
- options?: WaitForActionOptions;
1238
- } | {
1239
- type: 'submit';
1240
- selector?: any;
1241
- options?: SubmitActionOptions;
1242
- } | {
1243
- type: 'getContent';
1244
- } | {
1245
- type: 'navigate';
1246
- url: string;
1247
- opts?: GotoActionOptions;
1248
- } | {
1249
- type: 'extract';
1250
- schema: ExtractSchema;
1251
- } | {
1252
- type: 'pause';
1253
- message?: string;
1254
- } | {
1255
- type: 'trim';
1256
- options: TrimActionOptions;
1257
- } | {
1258
- type: 'evaluate';
1259
- params: EvaluateActionOptions;
1260
- } | {
1261
- type: 'dispose';
1262
- };
1263
- /**
1264
- * Represents an action that has been dispatched and is awaiting execution in the active page context.
1265
- *
1266
- * @remarks
1267
- * Connects the action request with its resolution mechanism. Used internally by the action dispatch system
1268
- * to handle promises while maintaining the page context validity window.
1269
- */
1270
- interface DispatchedEngineAction {
1271
- action: FetchEngineAction;
1272
- resolve: (value?: any) => void;
1273
- reject: (reason?: any) => void;
1274
- }
1275
- /**
1276
- * Represents a pending navigation request awaiting resolution.
1277
- *
1278
- * @remarks
1279
- * Tracks navigation requests that have been queued but not yet processed by the request handler.
1280
- */
1281
- interface PendingEngineRequest {
1282
- resolve: (value: any) => void;
1283
- reject: (reason?: any) => void;
1284
- }
1285
- /**
1286
- * Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
1287
- *
1288
- * @remarks
1289
- * The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
1290
- * `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
1291
- * content retrieval, and user interaction.
1292
- *
1293
- * The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
1294
- * the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
1295
- * page context validity across asynchronous operations.
1296
- *
1297
- * @example
1298
- * ```ts
1299
- * import "./playwright"; // 引入注册 Playwright browser 引擎
1300
- * const engine = await FetchEngine.create(context, { engine: 'browser' });
1301
- * await engine.goto('https://example.com');
1302
- * await engine.fill('#username', 'user');
1303
- * await engine.click('#submit');
1304
- * const response = await engine.getContent();
1305
- * ```
1306
- */
1307
- type AnyFetchEngine = FetchEngine<any, any, any>;
1308
- type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
1309
- declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
1310
- private static registry;
1232
+ interface IExtractEngine {
1311
1233
  /**
1312
- * Registers a fetch engine implementation with the global registry.
1234
+ * Finds all elements matching the selector within the given scope.
1313
1235
  *
1314
- * @param engineClass - The engine class to register
1315
- * @throws {Error} When engine class lacks static `id` or ID is already registered
1236
+ * @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
1237
+ * @param selector - The CSS selector to match.
1238
+ * @returns A promise resolving to an array of found element scopes.
1316
1239
  *
1317
- * @example
1318
- * ```ts
1319
- * FetchEngine.register(CheerioFetchEngine);
1320
- * ```
1240
+ * @remarks
1241
+ * **Behavior Contract:**
1242
+ * 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
1243
+ * 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
1244
+ * 3. **Array Scope**: If `scope` is an array:
1245
+ * - It MUST process elements in the order they appear in the array (which should match document order).
1246
+ * - It MUST perform the check (Self + Descendants) for *each* element in the array.
1247
+ * - It MUST flatten the results into a single array.
1248
+ * - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
1249
+ * but generally, preserving document order is the priority.
1321
1250
  */
1322
- static register(engineClass: AnyFetchEngineCtor): void;
1251
+ _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
1323
1252
  /**
1324
- * Retrieves a fetch engine implementation by its unique ID.
1253
+ * Extracts a primitive value from the element based on the schema configuration.
1325
1254
  *
1326
- * @param id - The ID of the engine to retrieve
1327
- * @returns Engine class if found, otherwise `undefined`
1255
+ * @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
1256
+ * @param scope - The specific element to extract data from.
1257
+ * @returns A promise resolving to the extracted value (string, number, boolean, or null).
1258
+ *
1259
+ * @remarks
1260
+ * **Behavior Contract:**
1261
+ * - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
1262
+ * - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
1263
+ * - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
1264
+ * - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
1265
+ * - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
1328
1266
  */
1329
- static get(id: string): AnyFetchEngineCtor | undefined;
1267
+ _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
1330
1268
  /**
1331
- * Retrieves a fetch engine implementation by execution mode.
1269
+ * Gets the parent element of the given scope.
1332
1270
  *
1333
- * @param mode - Execution mode (`'http'` or `'browser'`)
1334
- * @returns Engine class if found, otherwise `undefined`
1271
+ * @param scope - The element to find the parent of.
1272
+ * @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
1335
1273
  */
1336
- static getByMode(mode: FetchEngineType): AnyFetchEngineCtor | undefined;
1274
+ _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
1337
1275
  /**
1338
- * Factory method to create and initialize a fetch engine instance.
1276
+ * Checks if two element scopes refer to the exact same DOM node.
1339
1277
  *
1340
- * @param ctx - Fetch engine context
1341
- * @param options - Configuration options
1342
- * @returns Initialized fetch engine instance
1343
- * @throws {Error} When no suitable engine implementation is found
1278
+ * @param scope1 - The first element scope.
1279
+ * @param scope2 - The second element scope.
1280
+ * @returns A promise resolving to `true` if they are the same node, `false` otherwise.
1344
1281
  *
1345
1282
  * @remarks
1346
- * Primary entry point for engine creation. Selects appropriate implementation based on `engine` name of the option or context.
1283
+ * This comparison MUST be identity-based, not just content-based.
1347
1284
  */
1348
- static create(ctx: FetchEngineContext, options?: BaseFetcherProperties): Promise<AnyFetchEngine | undefined>;
1285
+ _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
1349
1286
  /**
1350
- * Unique identifier for the engine implementation.
1287
+ * Retrieves all subsequent sibling elements of the `scope` element, stopping *before* the first sibling that matches `untilSelector`.
1288
+ *
1289
+ * @param scope - The anchor element (starting point). The returned list starts *after* this element.
1290
+ * @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
1291
+ * If omitted or null, returns all following siblings.
1292
+ * @returns A promise resolving to an array of sibling element scopes.
1351
1293
  *
1352
1294
  * @remarks
1353
- * Must be defined by concrete implementations. Used for registration and lookup in engine registry.
1295
+ * **Behavior Contract:**
1296
+ * - **Starting Point**: The `scope` element itself IS NOT included in the result.
1297
+ * - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
1298
+ * - **Direction**: Only scans *following* siblings (next siblings).
1299
+ * - **Flattening**: The result is a flat list of siblings, not a nested structure.
1354
1300
  */
1355
- static readonly id: string;
1301
+ _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
1356
1302
  /**
1357
- * Execution mode of the engine (`'http'` or `'browser'`).
1303
+ * Finds the closest ancestor of the `scope` element (including the element itself) that is present in the `candidates` array.
1304
+ *
1305
+ * @param scope - The starting element from which to ascend the DOM tree.
1306
+ * @param candidates - An array of potential ancestor elements to check against.
1307
+ * @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
1358
1308
  *
1359
1309
  * @remarks
1360
- * Must be defined by concrete implementations. Indicates whether engine operates at HTTP level or uses full browser.
1361
- */
1362
- static readonly mode: FetchEngineType;
1363
- protected ctx?: FetchEngineContext;
1364
- protected opts?: BaseFetcherProperties;
1365
- protected crawler?: TCrawler;
1366
- protected isCrawlerReady?: boolean;
1367
- protected crawlerRunPromise?: Promise<FinalStatistics>;
1368
- protected config?: Configuration;
1369
- protected requestQueue?: RequestQueue;
1370
- protected kvStore?: KeyValueStore;
1371
- protected proxyConfiguration?: ProxyConfiguration;
1372
- protected hdrs: Record<string, string>;
1373
- protected _initialCookies?: Cookie[];
1374
- protected _initializedSessions: Set<string>;
1375
- protected currentSession?: Session;
1376
- protected pendingRequests: Map<string, PendingEngineRequest>;
1377
- protected requestCounter: number;
1378
- protected actionEmitter: EventEmitter;
1379
- protected isPageActive: boolean;
1380
- protected isEngineDisposed: boolean;
1381
- protected navigationLock: PromiseLock;
1382
- protected activeContext?: TContext;
1383
- protected isExecutingAction: boolean;
1384
- protected lastResponse?: FetchResponse;
1385
- protected actionQueue: DispatchedEngineAction[];
1386
- protected isProcessingActionLoop: boolean;
1387
- protected blockedTypes: Set<string>;
1388
- _logDebug(category: string, ...args: any[]): void;
1389
- protected _cleanup?(): Promise<void>;
1390
- protected _getTrimInfo(options: TrimActionOptions): {
1391
- selectors: string[];
1392
- removeComments: boolean;
1393
- removeHidden: boolean;
1394
- };
1395
- /**
1396
- * Finds all elements matching the selector within the given scope.
1310
+ * **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
1311
+ * It effectively answers: "Which of these container candidates does my current element belong to?"
1397
1312
  *
1398
- * @param scope - The scope to search in (Engine-specific element/node or array of nodes).
1399
- * @param selector - CSS selector.
1400
- * @returns List of matching elements.
1401
- * @see {@link IExtractEngine._querySelectorAll} for behavior contract.
1402
- * @internal
1313
+ * **Implementation Guidelines**:
1314
+ * - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
1315
+ * - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
1403
1316
  */
1404
- abstract _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
1317
+ _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
1405
1318
  /**
1406
- * Extracts a primitive value from the element based on schema.
1319
+ * Checks if the `container` element contains the `element` (descendant).
1407
1320
  *
1408
- * @param schema - Value extraction schema.
1409
- * @param scope - The element scope.
1410
- * @returns Extracted value.
1411
- * @see {@link IExtractEngine._extractValue} for behavior contract.
1412
- * @internal
1321
+ * @param container - The potential ancestor element.
1322
+ * @param element - The potential descendant element.
1323
+ * @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
1324
+ *
1325
+ * @remarks
1326
+ * **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
1327
+ *
1328
+ * @performance-critical Used extensively in boundary checks for Segmented extraction.
1329
+ * - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
1330
+ * - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
1413
1331
  */
1414
- abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
1332
+ _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
1415
1333
  /**
1416
- * Gets the parent element of the given element.
1334
+ * Finds the Lowest Common Ancestor (LCA) of two element scopes.
1417
1335
  *
1418
- * @param scope - The element scope.
1419
- * @returns Parent element or null.
1420
- * @internal
1336
+ * @param scope1 - The first element.
1337
+ * @param scope2 - The second element.
1338
+ * @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
1339
+ *
1340
+ * @remarks
1341
+ * This is a fundamental tree operation used to find the point where two element paths diverge.
1342
+ * **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
1421
1343
  */
1422
- abstract _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
1344
+ _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
1423
1345
  /**
1424
- * Checks if two elements are the same identity.
1346
+ * Finds the direct child of the `container` that contains the `element` (or is the `element` itself).
1425
1347
  *
1426
- * @param scope1 - First element scope.
1427
- * @param scope2 - Second element scope.
1428
- * @returns True if they are the same DOM node.
1429
- * @internal
1348
+ * @param element - The descendant element.
1349
+ * @param container - The ancestor container.
1350
+ * @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
1351
+ *
1352
+ * @remarks
1353
+ * This method traverses up from `element` until it finds the node whose parent is `container`.
1354
+ * **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
1430
1355
  */
1431
- abstract _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
1356
+ _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
1432
1357
  /**
1433
- * Gets all subsequent siblings of an element until a sibling matches the selector.
1434
- * Used in 'segmented' extraction mode.
1435
- *
1436
- * @param scope - The anchor element scope.
1437
- * @param untilSelector - Optional selector that marks the end of the segment (exclusive).
1438
- * @returns List of sibling elements between anchor and untilSelector.
1439
- * @internal
1358
+ * Logs debug information if debug mode is enabled.
1359
+ * @param category - The category of the log message.
1360
+ * @param args - Arguments to log.
1440
1361
  */
1441
- abstract _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
1362
+ _logDebug(category: string, ...args: any[]): void;
1363
+ }
1364
+ /**
1365
+ * Base configuration for all extraction schemas.
1366
+ */
1367
+ interface BaseExtractSchema {
1442
1368
  /**
1443
- * Finds the closest ancestor of `scope` (including itself) that exists in the `candidates` array.
1444
- *
1445
- * @param scope - The starting element.
1446
- * @param candidates - The array of potential ancestor scopes.
1447
- * @returns A promise resolving to the matching candidate scope, or `null` if none found.
1448
- * @see {@link IExtractEngine._findClosestAncestor} for implementation details.
1449
- * @internal
1369
+ * Whether this field is required. If true and the value is null,
1370
+ * the containing object or array item will be skipped (or throw error in strict mode).
1450
1371
  */
1451
- abstract _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
1372
+ required?: boolean;
1452
1373
  /**
1453
- * Checks if the `container` scope contains the `element` scope.
1454
- *
1455
- * @param container - The potential ancestor element.
1456
- * @param element - The potential descendant element.
1457
- * @returns A promise resolving to `true` if `container` contains `element`.
1458
- * @see {@link IExtractEngine._contains} for implementation details.
1459
- * @internal
1374
+ * Whether to enable strict mode for this extraction.
1375
+ * If true, missing required fields will throw an error instead of being skipped.
1460
1376
  */
1461
- abstract _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
1377
+ strict?: boolean;
1462
1378
  /**
1463
- * Finds the Lowest Common Ancestor (LCA) of two element scopes.
1379
+ * Specifies the starting anchor for extraction of this field.
1380
+ * - Field Name: Uses the DOM element of a previously extracted field as the anchor.
1381
+ * - CSS Selector: Re-queries the selector within the current context to find the anchor.
1464
1382
  *
1465
- * @param scope1 - The first element scope.
1466
- * @param scope2 - The second element scope.
1467
- * @returns A promise resolving to the LCA element scope, or `null` if none found.
1468
- * @internal
1383
+ * Once anchored, the search scope for this field becomes the siblings following the anchor.
1469
1384
  */
1470
- abstract _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
1385
+ anchor?: string;
1471
1386
  /**
1472
- * Finds the direct child of container that contains element.
1473
- *
1474
- * @param element - The descendant element.
1475
- * @param container - The container element.
1476
- * @returns The child element of container, or null.
1477
- * @internal
1387
+ * The maximum number of levels to bubble up from the anchor or matched element.
1388
+ * - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
1389
+ * - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
1390
+ * - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
1478
1391
  */
1479
- abstract _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
1480
- protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
1392
+ depth?: number;
1393
+ }
1394
+ /**
1395
+ * Extraction schema types.
1396
+ */
1397
+ type ExtractSchema = ExtractObjectSchema | ExtractArraySchema | ExtractValueSchema;
1398
+ /**
1399
+ * Configuration for extracting a single value.
1400
+ */
1401
+ interface ExtractValueSchema extends BaseExtractSchema {
1481
1402
  /**
1482
- * Normalizes the array extraction mode into an options object.
1483
- * @param mode - The mode string or options object.
1484
- * @internal
1403
+ * The data type to cast the extracted value to.
1404
+ * @default 'string'
1485
1405
  */
1486
- protected _normalizeArrayMode(mode?: ExtractArrayMode): {
1487
- type: ExtractArrayModeName;
1488
- } & any;
1406
+ type?: 'string' | 'number' | 'boolean' | 'html';
1489
1407
  /**
1490
- * Performs standard nested array extraction.
1491
- * @param items - The schema for each item.
1492
- * @param elements - The list of item elements.
1493
- * @internal
1408
+ * Extraction behavior mode.
1409
+ * - 'text': (Default) Uses textContent.
1410
+ * - 'innerText': Uses rendered text (respects CSS line breaks).
1411
+ * - 'html': Returns innerHTML.
1412
+ * - 'outerHTML': Returns HTML including the element's tag.
1494
1413
  */
1495
- protected _extractNested(items: ExtractSchema, elements: FetchElementScope[], opts?: {
1496
- strict?: boolean;
1497
- }): Promise<any[]>;
1414
+ mode?: 'text' | 'innerText' | 'html' | 'outerHTML';
1498
1415
  /**
1499
- * Performs columnar extraction (Column Alignment Mode).
1500
- *
1501
- * @param schema - The schema for a single item (must be an object or implicit object).
1502
- * @param container - The container element to search within.
1503
- * @param opts - Columnar extraction options (strict, inference).
1504
- * @returns An array of extracted items, or null if requirements aren't met.
1505
- * @internal
1416
+ * CSS selector to locate the element within the current context.
1506
1417
  */
1507
- protected _extractColumnar(schema: ExtractSchema, container: FetchElementScope, opts?: ColumnarOptions): Promise<any[] | null>;
1418
+ selector?: string;
1508
1419
  /**
1509
- * Performs segmented extraction (Anchor-based Scanning).
1510
- *
1511
- * @param schema - The schema for a single item (must be an object).
1512
- * @param container - The container element to scan.
1513
- * @param opts - Segmented extraction options (anchor).
1514
- * @returns An array of extracted items.
1515
- * @internal
1420
+ * Attribute name to extract (e.g., 'href', 'src').
1421
+ * If omitted, the text content or HTML is extracted based on `type`.
1516
1422
  */
1517
- protected _extractSegmented(schema: ExtractSchema, container: FetchElementScope, opts?: SegmentedOptions): Promise<any[] | null>;
1423
+ attribute?: string;
1518
1424
  /**
1519
- * Creates the crawler instance for the specific engine implementation.
1520
- * @param options - The final crawler options.
1521
- * @internal
1425
+ * Filter elements that contain a descendant matching this CSS selector.
1522
1426
  */
1523
- protected abstract _createCrawler(options: TOptions, config?: Configuration): TCrawler;
1427
+ has?: string;
1524
1428
  /**
1525
- * Gets the crawler-specific options from the subclass.
1526
- * @param ctx - The fetch engine context.
1527
- * @internal
1429
+ * Exclude elements matching this CSS selector.
1528
1430
  */
1529
- protected abstract _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<TOptions>> | Partial<TOptions>;
1431
+ exclude?: string;
1432
+ }
1433
+ /**
1434
+ * Names of the supported array extraction modes.
1435
+ */
1436
+ type ExtractArrayModeName = 'nested' | 'columnar' | 'segmented';
1437
+ /**
1438
+ * Base options for array extraction modes.
1439
+ */
1440
+ interface BaseModeOptions {
1441
+ type: ExtractArrayModeName;
1530
1442
  /**
1531
- * Abstract method for building standard [FetchResponse] from Crawlee context.
1532
- *
1533
- * @param context - Crawlee crawling context
1534
- * @returns Promise resolving to [FetchResponse] object
1535
- *
1536
- * @remarks
1537
- * Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
1538
- * @internal
1443
+ * Whether to enable strict mode for this specific array mode.
1444
+ * @default false
1539
1445
  */
1540
- protected abstract _buildResponse(context: TContext): Promise<FetchResponse>;
1541
- protected buildResponse(context: TContext): Promise<FetchResponse>;
1446
+ strict?: boolean;
1447
+ }
1448
+ /**
1449
+ * Options for columnar (column-alignment) extraction.
1450
+ */
1451
+ interface ColumnarOptions extends BaseModeOptions {
1452
+ type: 'columnar';
1542
1453
  /**
1543
- * Abstract method for executing action within current page context.
1544
- *
1545
- * @param context - Crawlee crawling context
1546
- * @param action - Action to execute
1547
- * @returns Promise resolving to action result
1548
- *
1549
- * @remarks
1550
- * Handles specific user interactions using underlying technology (Playwright/Cheerio).
1551
- * @internal
1454
+ * Whether to enable heuristic inference.
1455
+ * If true, tries to find a common parent to infer item wrappers when counts mismatch.
1456
+ * @default false
1552
1457
  */
1553
- protected abstract executeAction(context: TContext, action: FetchEngineAction): Promise<any>;
1458
+ inference?: boolean;
1459
+ }
1460
+ /**
1461
+ * Options for segmented (anchor-based) extraction.
1462
+ */
1463
+ interface SegmentedOptions extends BaseModeOptions {
1464
+ type: 'segmented';
1554
1465
  /**
1555
- * Navigates to the specified URL.
1556
- *
1557
- * @param url - Target URL
1558
- * @param params - Navigation options
1559
- * @returns Promise resolving when navigation completes
1560
- *
1561
- * @example
1562
- * ```ts
1563
- * await engine.goto('https://example.com');
1564
- * ```
1466
+ * The name of the field in `items` to use as a segment anchor, or a direct CSS selector.
1467
+ * Defaults to the first property key's selector defined in `items`.
1565
1468
  */
1566
- abstract goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
1469
+ anchor?: string;
1567
1470
  /**
1568
- * Waits for specified condition before continuing.
1569
- *
1570
- * @param params - Wait conditions
1571
- * @returns Promise resolving when wait condition is met
1572
- *
1573
- * @example
1574
- * ```ts
1575
- * await engine.waitFor({ ms: 1000 }); // Wait 1 second
1576
- * await engine.waitFor({ selector: '#content' }); // Wait for element
1577
- * ```
1471
+ * Where to start searching for fields within each segment.
1472
+ * - 'anchor': (Default) All fields are searched within the entire segment.
1473
+ * - 'previous': Each field is searched starting from after the previous field's match.
1578
1474
  */
1579
- waitFor(params?: WaitForActionOptions): Promise<void>;
1475
+ relativeTo?: 'anchor' | 'previous';
1580
1476
  /**
1581
- * Clicks on element matching selector.
1582
- *
1583
- * @param selector - CSS selector of element to click
1584
- * @returns Promise resolving when click is processed
1585
- * @throws {Error} When no active page context exists
1477
+ * The maximum number of levels to bubble up from the anchor to find a segment container.
1478
+ * If omitted, it bubbles up as high as possible without conflicting with neighboring segments.
1586
1479
  */
1587
- click(selector: string): Promise<void>;
1480
+ depth?: number;
1481
+ }
1482
+ /**
1483
+ * Union type for array extraction modes and their options.
1484
+ */
1485
+ type ExtractArrayMode = ExtractArrayModeName | ColumnarOptions | SegmentedOptions;
1486
+ /**
1487
+ * Configuration for extracting an array of items.
1488
+ */
1489
+ interface ExtractArraySchema extends BaseExtractSchema {
1490
+ type: 'array';
1588
1491
  /**
1589
- * Moves mouse to specified position or element.
1590
- *
1591
- * @param params - Move parameters (x, y, selector, steps)
1492
+ * CSS selector for items (in 'nested' mode) or the container (in 'columnar'/'segmented' modes).
1592
1493
  */
1593
- mouseMove(params: {
1594
- x?: number;
1595
- y?: number;
1596
- selector?: string;
1597
- steps?: number;
1598
- }): Promise<void>;
1599
- /**
1600
- * Clicks at current position or specified position.
1601
- *
1602
- * @param params - Click parameters (x, y, button, clickCount, delay)
1603
- */
1604
- mouseClick(params: {
1605
- x?: number;
1606
- y?: number;
1607
- button?: 'left' | 'right' | 'middle';
1608
- clickCount?: number;
1609
- delay?: number;
1610
- }): Promise<void>;
1611
- /**
1612
- * Types text into current focused element.
1613
- *
1614
- * @param text - Text to type
1615
- * @param delay - Delay between key presses
1616
- */
1617
- keyboardType(text: string, delay?: number): Promise<void>;
1494
+ selector: string;
1618
1495
  /**
1619
- * Presses specified key.
1620
- *
1621
- * @param key - Key to press
1622
- * @param delay - Delay after key press
1496
+ * Filter items/containers that contain a descendant matching this CSS selector.
1623
1497
  */
1624
- keyboardPress(key: string, delay?: number): Promise<void>;
1498
+ has?: string;
1625
1499
  /**
1626
- * Fills input element with specified value.
1627
- *
1628
- * @param selector - CSS selector of input element
1629
- * @param value - Value to fill
1630
- * @returns Promise resolving when fill operation completes
1631
- * @throws {Error} When no active page context exists
1500
+ * Exclude items/containers matching this CSS selector.
1632
1501
  */
1633
- fill(selector: string, value: string): Promise<void>;
1502
+ exclude?: string;
1634
1503
  /**
1635
- * Submits a form.
1636
- *
1637
- * @param selector - Optional form/submit button selector
1638
- * @param options - Submission options
1639
- * @returns Promise resolving when form is submitted
1640
- * @throws {Error} When no active page context exists
1504
+ * Schema applied recursively to each extracted item.
1505
+ * If omitted, defaults to extracting text.
1641
1506
  */
1642
- submit(selector?: any, options?: SubmitActionOptions): Promise<void>;
1507
+ items?: ExtractSchema;
1643
1508
  /**
1644
- * Removes elements from the DOM based on selectors and presets.
1645
- *
1646
- * @param options - Trim options specifying selectors and presets
1647
- * @returns Promise resolving when trim operation completes
1648
- * @throws {Error} When no active page context exists
1509
+ * Shortcut for `items` to extract a specific attribute directly.
1649
1510
  */
1650
- trim(options: TrimActionOptions): Promise<void>;
1511
+ attribute?: string;
1651
1512
  /**
1652
- * Pauses execution, allowing for manual intervention or inspection.
1653
- *
1654
- * @param message - Optional message to display during pause
1655
- * @returns Promise resolving when execution is resumed
1656
- * @throws {Error} When no active page context exists
1513
+ * Array extraction mode.
1514
+ * - 'nested': (Default) Items are elements matched by `selector`.
1515
+ * - 'columnar': `selector` is a container, fields in `items` are parallel columns aligned by index.
1516
+ * - 'segmented': `selector` is a container, items are segmented by an anchor field.
1657
1517
  */
1658
- pause(message?: string): Promise<void>;
1518
+ mode?: ExtractArrayMode;
1519
+ }
1520
+ /**
1521
+ * Configuration for extracting an object with multiple properties.
1522
+ */
1523
+ interface ExtractObjectSchema extends BaseExtractSchema {
1524
+ type: 'object';
1659
1525
  /**
1660
- * Executes a custom function or expression within the current page context.
1661
- *
1662
- * @remarks
1663
- * This is a powerful action that allows running custom logic to interact with the DOM,
1664
- * calculate values, or trigger navigations.
1665
- *
1666
- * - In **Browser Mode**, it runs in the real browser.
1667
- * - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
1668
- *
1669
- * The action handles automatic navigation if `window.location` is modified.
1670
- *
1671
- * @param params - Configuration for the execution, including the function and arguments.
1672
- * @returns A promise resolving to the result of the execution.
1673
- * @throws {Error} If no active page context exists or if execution fails.
1674
- *
1675
- * @see {@link EvaluateActionOptions} for detailed parameter options and examples.
1526
+ * Root selector for the object. If provided, sub-properties are searched within this element.
1676
1527
  */
1677
- evaluate(params: EvaluateActionOptions): Promise<any>;
1528
+ selector?: string;
1678
1529
  /**
1679
- * Extracts structured data from the current page content.
1680
- *
1681
- * @param schema - An object defining the data to extract.
1682
- * @returns A promise that resolves to an object with the extracted data.
1530
+ * Filter the object element based on descendants.
1683
1531
  */
1684
- extract<T>(schema: ExtractSchema): Promise<T>;
1532
+ has?: string;
1685
1533
  /**
1686
- * Gets the unique identifier of this engine implementation.
1534
+ * Exclude the object element if it matches this selector.
1687
1535
  */
1688
- get id(): string;
1536
+ exclude?: string;
1689
1537
  /**
1690
- * Returns the current state of the engine (cookies)
1691
- * that can be used to restore the session later.
1538
+ * Where to start searching for fields within this object.
1539
+ * Only applicable when the object is being extracted from an array of elements (e.g. in 'segmented' mode).
1540
+ * - 'anchor': (Default) All fields are searched within the entire scope.
1541
+ * - 'previous': Each field is searched starting from after the previous field's match.
1692
1542
  */
1693
- getState(): Promise<{
1694
- cookies: Cookie[];
1695
- sessionState?: any;
1696
- }>;
1543
+ relativeTo?: 'anchor' | 'previous';
1697
1544
  /**
1698
- * Gets the execution mode of this engine (`'http'` or `'browser'`).
1545
+ * Explicit order of property extraction.
1546
+ * Useful when using `relativeTo: 'previous'`.
1699
1547
  */
1700
- get mode(): FetchEngineType;
1548
+ order?: string[];
1701
1549
  /**
1702
- * Gets the fetch engine context associated with this instance.
1550
+ * Definition of the object's properties and their corresponding extraction schemas.
1703
1551
  */
1704
- get context(): FetchEngineContext | undefined;
1552
+ properties: {
1553
+ [key: string]: ExtractSchema;
1554
+ };
1555
+ }
1556
+
1557
+ interface PromiseLock extends Promise<void> {
1558
+ release: () => void;
1559
+ }
1560
+
1561
+ /**
1562
+ * Options for the {@link FetchEngine.goto}, allowing configuration of HTTP method, payload, headers, and navigation behavior.
1563
+ *
1564
+ * @remarks
1565
+ * Used when navigating to a URL to specify additional parameters beyond the basic URL.
1566
+ *
1567
+ * @example
1568
+ * ```ts
1569
+ * await engine.goto('https://example.com', {
1570
+ * method: 'POST',
1571
+ * payload: { username: 'user', password: 'pass' },
1572
+ * headers: { 'Content-Type': 'application/json' },
1573
+ * waitUntil: 'networkidle'
1574
+ * });
1575
+ * ```
1576
+ */
1577
+ interface GotoActionOptions {
1578
+ method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
1579
+ payload?: any;
1580
+ headers?: Record<string, string>;
1581
+ waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
1582
+ timeoutMs?: number;
1583
+ simulate?: boolean;
1584
+ }
1585
+ /**
1586
+ * Options for the {@link FetchEngine.waitFor} action, specifying conditions to wait for before continuing.
1587
+ *
1588
+ * @remarks
1589
+ * Controls timing behavior for interactions, allowing waiting for elements, time intervals, or network conditions.
1590
+ */
1591
+ interface WaitForActionOptions {
1592
+ ms?: number;
1593
+ selector?: string;
1594
+ networkIdle?: boolean;
1595
+ failOnTimeout?: boolean;
1596
+ }
1597
+ /**
1598
+ * Options for the {@link FetchEngine.submit} action, configuring form submission behavior.
1599
+ *
1600
+ * @remarks
1601
+ * Specifies encoding type for form submissions, particularly relevant for JSON-based APIs.
1602
+ */
1603
+ interface SubmitActionOptions {
1604
+ enctype?: 'application/x-www-form-urlencoded' | 'application/json' | 'multipart/form-data';
1605
+ }
1606
+ /**
1607
+ * Predefined cleanup groups for the {@link FetchEngine.trim} action.
1608
+ */
1609
+ type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
1610
+ /**
1611
+ * Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
1612
+ */
1613
+ interface TrimActionOptions {
1614
+ selectors?: string | string[];
1615
+ presets?: TrimPreset | TrimPreset[];
1616
+ }
1617
+ declare const TRIM_PRESETS: Record<string, string[]>;
1618
+ /**
1619
+ * Options for the {@link FetchEngine.evaluate} action, specifying the function to execute and its arguments.
1620
+ *
1621
+ * @remarks
1622
+ * This action allows executing custom JavaScript logic within the page context.
1623
+ *
1624
+ * **Execution Environments:**
1625
+ * - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
1626
+ * - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
1627
+ * including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
1628
+ *
1629
+ * **Navigation Handling:**
1630
+ * If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
1631
+ * automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
1632
+ *
1633
+ * @example
1634
+ * ```json
1635
+ * {
1636
+ * "action": "evaluate",
1637
+ * "params": {
1638
+ * "fn": "([a, b]) => a + b",
1639
+ * "args": [1, 2]
1640
+ * }
1641
+ * }
1642
+ * ```
1643
+ *
1644
+ * @example
1645
+ * ```json
1646
+ * {
1647
+ * "action": "evaluate",
1648
+ * "params": {
1649
+ * "fn": "({ x, y }) => x * y",
1650
+ * "args": { "x": 6, "y": 7 }
1651
+ * }
1652
+ * }
1653
+ * ```
1654
+ */
1655
+ interface EvaluateActionOptions {
1705
1656
  /**
1706
- * Initializes the fetch engine with provided context and options.
1707
- *
1708
- * @param context - Fetch engine context
1709
- * @param options - Configuration options
1710
- * @returns Promise resolving when initialization completes
1657
+ * The function or expression to execute.
1711
1658
  *
1712
1659
  * @remarks
1713
- * Sets up internal state and calls implementation-specific [_initialize](file:///home/riceball/Documents/mywork/public/@isdk/ai-tools/packages/web-fetcher/src/engine/cheerio.ts#L169-L204) method.
1714
- * Automatically called when creating engine via `FetchEngine.create()`.
1660
+ * Can be:
1661
+ * 1. A function object (only available when using the API directly).
1662
+ * 2. A string containing a function definition, e.g., `"async (args) => { ... }"`
1663
+ * 3. A string containing a direct expression, e.g., `"document.title"`
1664
+ *
1665
+ * **Note:** When using a function, it receives exactly ONE argument: the value provided in {@link args}.
1666
+ * Use destructuring to handle multiple parameters.
1715
1667
  */
1716
- initialize(context: FetchEngineContext, options?: BaseFetcherProperties): Promise<void>;
1717
- cleanup(): Promise<void>;
1668
+ fn: string | ((...args: any[]) => any);
1718
1669
  /**
1719
- * Gets the initial scope for extraction for the specific engine.
1720
- * @param context - Crawlee crawling context
1721
- * @internal
1670
+ * Data to pass to the function.
1671
+ *
1672
+ * @remarks
1673
+ * This value is passed as the first and only argument to the function defined in {@link fn}.
1674
+ * Recommended to use an array or object for multiple values.
1722
1675
  */
1723
- protected abstract _getInitialElementScope(context: TContext): FetchElementScope;
1724
- /**
1725
- * Unified action processor that handles engine-agnostic actions.
1726
- * @param context - Crawlee crawling context
1727
- * @param action - Action to execute
1728
- * @internal
1729
- */
1730
- protected _processAction(context: TContext, action: FetchEngineAction): Promise<any>;
1731
- protected _handlePause(action: {
1732
- message?: string;
1733
- }): Promise<void>;
1734
- /**
1735
- * Executes all pending fetch engine actions within the current Crawlee request handler context.
1736
- *
1737
- * **Critical Execution Constraint**: This method **MUST** be awaited within the synchronous execution path
1738
- * of Crawlee's [requestHandler](https://crawlee.dev/js/api/basic-crawler) (before any `await` that yields control back to the event loop).
1739
- *
1740
- * ### Why This Constraint Exists
1741
- * - Crawlee's page context ([PlaywrightCrawler](https://crawlee.dev/js/api/playwright-crawler)'s `page` or [CheerioCrawler](https://crawlee.dev/js/api/cheerio-crawler)'s `$`)
1742
- * is **only valid during the synchronous execution phase** of the request handler
1743
- * - After any `await` (e.g., `await page.goto()`), the page context may be destroyed
1744
- * due to Crawlee's internal resource management
1745
- *
1746
- * ### How It Works
1747
- * 1. Processes all actions queued via {@link dispatchAction} (click, fill, submit, etc.)
1748
- * 2. Maintains the page context validity window via {@link isPageActive} lifecycle flag
1749
- * 3. Automatically cleans up event listeners upon completion
1750
- *
1751
- * Usage see {@link _sharedRequestHandler}
1752
- * @see {@link _sharedRequestHandler}
1753
- * @param context The active Crawlee crawling context containing the page/$ object
1754
- * @throws {Error} If called outside valid page context window (`!this.isPageActive`)
1755
- * @internal Engine infrastructure method - not for direct consumer use
1756
- */
1757
- protected _executePendingActions(context: TContext): Promise<void>;
1758
- protected _sharedRequestHandler(context: TContext): Promise<void>;
1759
- protected _sharedFailedRequestHandler(context: TContext, error?: Error): Promise<void>;
1760
- protected dispatchAction<T>(action: FetchEngineAction): Promise<T>;
1761
- private _requestHandler;
1762
- private _failedRequestHandler;
1763
- protected _commonCleanup(): Promise<void>;
1676
+ args?: any;
1677
+ }
1678
+ /**
1679
+ * Union type representing all possible engine actions that can be dispatched.
1680
+ *
1681
+ * @remarks
1682
+ * Defines the command structure processed during page interactions. Each action type corresponds to
1683
+ * a specific user interaction or navigation command within the action loop architecture.
1684
+ */
1685
+ type FetchEngineAction = {
1686
+ type: 'click';
1687
+ selector: string;
1688
+ } | {
1689
+ type: 'fill';
1690
+ selector: string;
1691
+ value: string;
1692
+ } | {
1693
+ type: 'mouseMove';
1694
+ params: {
1695
+ x?: number;
1696
+ y?: number;
1697
+ selector?: string;
1698
+ steps?: number;
1699
+ };
1700
+ } | {
1701
+ type: 'mouseClick';
1702
+ params: {
1703
+ x?: number;
1704
+ y?: number;
1705
+ button?: 'left' | 'right' | 'middle';
1706
+ clickCount?: number;
1707
+ delay?: number;
1708
+ steps?: number;
1709
+ };
1710
+ } | {
1711
+ type: 'mouseWheel';
1712
+ params: {
1713
+ x?: number;
1714
+ y?: number;
1715
+ selector?: string;
1716
+ deltaX?: number;
1717
+ deltaY?: number;
1718
+ steps?: number;
1719
+ };
1720
+ } | {
1721
+ type: 'keyboardType';
1722
+ params: {
1723
+ text: string;
1724
+ delay?: number;
1725
+ };
1726
+ } | {
1727
+ type: 'keyboardPress';
1728
+ params: {
1729
+ key: string;
1730
+ delay?: number;
1731
+ };
1732
+ } | {
1733
+ type: 'scrollIntoView';
1734
+ params: {
1735
+ selector: string;
1736
+ };
1737
+ } | {
1738
+ type: 'waitFor';
1739
+ options?: WaitForActionOptions;
1740
+ } | {
1741
+ type: 'submit';
1742
+ selector?: any;
1743
+ options?: SubmitActionOptions;
1744
+ } | {
1745
+ type: 'getContent';
1746
+ } | {
1747
+ type: 'navigate';
1748
+ url: string;
1749
+ opts?: GotoActionOptions;
1750
+ } | {
1751
+ type: 'extract';
1752
+ schema: ExtractSchema;
1753
+ } | {
1754
+ type: 'pause';
1755
+ message?: string;
1756
+ } | {
1757
+ type: 'trim';
1758
+ options: TrimActionOptions;
1759
+ } | {
1760
+ type: 'evaluate';
1761
+ params: EvaluateActionOptions;
1762
+ } | {
1763
+ type: 'dispose';
1764
+ };
1765
+ /**
1766
+ * Represents an action that has been dispatched and is awaiting execution in the active page context.
1767
+ *
1768
+ * @remarks
1769
+ * Connects the action request with its resolution mechanism. Used internally by the action dispatch system
1770
+ * to handle promises while maintaining the page context validity window.
1771
+ */
1772
+ interface DispatchedEngineAction {
1773
+ action: FetchEngineAction;
1774
+ resolve: (value?: any) => void;
1775
+ reject: (reason?: any) => void;
1776
+ }
1777
+ /**
1778
+ * Represents a pending navigation request awaiting resolution.
1779
+ *
1780
+ * @remarks
1781
+ * Tracks navigation requests that have been queued but not yet processed by the request handler.
1782
+ */
1783
+ interface PendingEngineRequest {
1784
+ resolve: (value: any) => void;
1785
+ reject: (reason?: any) => void;
1786
+ }
1787
+ /**
1788
+ * Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
1789
+ *
1790
+ * @remarks
1791
+ * The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
1792
+ * `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
1793
+ * content retrieval, and user interaction.
1794
+ *
1795
+ * The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
1796
+ * the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
1797
+ * page context validity across asynchronous operations.
1798
+ *
1799
+ * @example
1800
+ * ```ts
1801
+ * import "./playwright"; // 引入注册 Playwright browser 引擎
1802
+ * const engine = await FetchEngine.create(context, { engine: 'browser' });
1803
+ * await engine.goto('https://example.com');
1804
+ * await engine.fill('#username', 'user');
1805
+ * await engine.click('#submit');
1806
+ * const response = await engine.getContent();
1807
+ * ```
1808
+ */
1809
+ type AnyFetchEngine = FetchEngine<any, any, any>;
1810
+ type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
1811
+ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
1812
+ private static registry;
1764
1813
  /**
1765
- * Blocks specified resource types from loading.
1814
+ * Registers a fetch engine implementation with the global registry.
1766
1815
  *
1767
- * @param types - Resource types to block
1768
- * @param overwrite - Whether to replace existing blocked types
1769
- * @returns Number of blocked resource types
1816
+ * @param engineClass - The engine class to register
1817
+ * @throws {Error} When engine class lacks static `id` or ID is already registered
1770
1818
  *
1771
1819
  * @example
1772
1820
  * ```ts
1773
- * await engine.blockResources(['image', 'stylesheet']);
1774
- * await engine.blockResources(['script'], true); // Replace existing
1821
+ * FetchEngine.register(CheerioFetchEngine);
1775
1822
  * ```
1776
1823
  */
1777
- blockResources(types: ResourceType[], overwrite?: boolean): Promise<number>;
1824
+ static register(engineClass: AnyFetchEngineCtor): void;
1778
1825
  /**
1779
- * Gets content of current page.
1826
+ * Retrieves a fetch engine implementation by its unique ID.
1780
1827
  *
1781
- * @returns Promise resolving to fetch response
1782
- * @throws {Error} When no content has been fetched yet
1828
+ * @param id - The ID of the engine to retrieve
1829
+ * @returns Engine class if found, otherwise `undefined`
1783
1830
  */
1784
- getContent(): Promise<FetchResponse>;
1831
+ static get(id: string): AnyFetchEngineCtor | undefined;
1785
1832
  /**
1786
- * Manages HTTP headers for requests with multiple overloads.
1787
- *
1788
- * @overload
1789
- * Gets all headers.
1790
- * @returns All headers as record
1791
- *
1792
- * @overload
1793
- * Gets specific header value.
1794
- * @param name - Header name
1795
- * @returns Header value
1796
- *
1797
- * @overload
1798
- * Sets multiple headers.
1799
- * @param headers - Headers to set
1800
- * @param replaced - Whether to replace all existing headers
1801
- * @returns `true` if successful
1802
- *
1803
- * @overload
1804
- * Sets single header.
1805
- * @param name - Header name
1806
- * @param value - Header value or `null` to remove
1807
- * @returns `true` if successful
1833
+ * Retrieves a fetch engine implementation by execution mode.
1808
1834
  *
1809
- * @example
1810
- * ```ts
1811
- * const allHeaders = await engine.headers();
1812
- * const userAgent = await engine.headers('user-agent');
1813
- * await engine.headers({ 'x-custom': 'value' });
1814
- * await engine.headers('auth', 'token');
1815
- * ```
1835
+ * @param mode - Execution mode (`'http'` or `'browser'`)
1836
+ * @returns Engine class if found, otherwise `undefined`
1816
1837
  */
1817
- headers(): Promise<Record<string, string>>;
1818
- headers(name: string): Promise<string>;
1819
- headers(headers: Record<string, string>, replaced?: boolean): Promise<boolean>;
1820
- headers(name: string, value: string | null): Promise<boolean>;
1838
+ static getByMode(mode: FetchEngineType): AnyFetchEngineCtor | undefined;
1821
1839
  /**
1822
- * Manages cookies for current session with multiple overloads.
1823
- *
1824
- * @overload
1825
- * Gets all cookies.
1826
- * @returns Array of cookies
1840
+ * Factory method to create and initialize a fetch engine instance.
1827
1841
  *
1828
- * @overload
1829
- * Sets cookies for session.
1830
- * @param cookies - Cookies to set
1831
- * @returns `true` if successful
1842
+ * @param ctx - Fetch engine context
1843
+ * @param options - Configuration options
1844
+ * @returns Initialized fetch engine instance
1845
+ * @throws {Error} When no suitable engine implementation is found
1832
1846
  *
1833
- * @example
1834
- * ```ts
1835
- * const cookies = await engine.cookies();
1836
- * await engine.cookies([{ name: 'session', value: '123' }]);
1837
- * ```
1847
+ * @remarks
1848
+ * Primary entry point for engine creation. Selects appropriate implementation based on `engine` name of the option or context.
1838
1849
  */
1839
- cookies(): Promise<Cookie[]>;
1840
- cookies(cookies: Cookie[]): Promise<boolean>;
1850
+ static create(ctx: FetchEngineContext, options?: BaseFetcherProperties): Promise<AnyFetchEngine | undefined>;
1841
1851
  /**
1842
- * Disposes of engine, cleaning up all resources.
1852
+ * Unique identifier for the engine implementation.
1843
1853
  *
1844
- * @returns Promise resolving when disposal completes
1845
- */
1846
- dispose(): Promise<void>;
1847
- }
1848
-
1849
- type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
1850
- interface FetchReturnTypeRegistry {
1851
- response: FetchResponse;
1852
- context: FetchContext;
1853
- result: FetchActionResult<any> | undefined;
1854
- outputs: Record<string, any>;
1855
- any: any;
1856
- none: void;
1857
- }
1858
- type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
1859
-
1860
- /**
1861
- * Represents the state of an action being executed within a context.
1862
- *
1863
- * @remarks
1864
- * Extends the basic action properties with runtime metadata like execution index,
1865
- * nesting depth, and any errors encountered during execution.
1866
- */
1867
- interface FetchActionInContext extends FetchActionProperties {
1868
- /**
1869
- * The 0-based index of the action in the execution sequence.
1870
- */
1871
- index?: number;
1872
- /**
1873
- * Error encountered during action execution, if any.
1854
+ * @remarks
1855
+ * Must be defined by concrete implementations. Used for registration and lookup in engine registry.
1874
1856
  */
1875
- error?: Error;
1857
+ static readonly id: string;
1876
1858
  /**
1877
- * The nesting depth of the action. Top-level actions (executed directly by the session) have a depth of 0.
1859
+ * Execution mode of the engine (`'http'` or `'browser'`).
1860
+ *
1861
+ * @remarks
1862
+ * Must be defined by concrete implementations. Indicates whether engine operates at HTTP level or uses full browser.
1878
1863
  */
1879
- depth?: number;
1880
- }
1881
- /**
1882
- * Base internal state used by fetch engines to maintain their runtime environment.
1883
- *
1884
- * @internal
1885
- */
1886
- interface BaseFetchContextInteralState {
1864
+ static readonly mode: FetchEngineType;
1865
+ protected ctx?: FetchEngineContext;
1866
+ protected opts?: BaseFetcherProperties;
1867
+ protected crawler?: TCrawler;
1868
+ protected isCrawlerReady?: boolean;
1869
+ protected crawlerRunPromise?: Promise<FinalStatistics>;
1870
+ protected config?: Configuration;
1871
+ protected requestQueue?: RequestQueue;
1872
+ protected kvStore?: KeyValueStore;
1873
+ protected proxyConfiguration?: ProxyConfiguration;
1874
+ protected hdrs: Record<string, string>;
1875
+ protected _initialCookies?: Cookie[];
1876
+ protected _initializedSessions: Set<string>;
1877
+ protected currentSession?: Session;
1878
+ protected pendingRequests: Map<string, PendingEngineRequest>;
1879
+ protected requestCounter: number;
1880
+ protected actionEmitter: EventEmitter;
1881
+ protected isPageActive: boolean;
1882
+ protected isEngineDisposed: boolean;
1883
+ protected navigationLock: PromiseLock;
1884
+ protected activeContext?: TContext;
1885
+ protected isExecutingAction: boolean;
1886
+ protected lastResponse?: FetchResponse;
1887
+ protected actionQueue: DispatchedEngineAction[];
1888
+ protected isProcessingActionLoop: boolean;
1889
+ protected blockedTypes: Set<string>;
1890
+ _logDebug(category: string, ...args: any[]): void;
1891
+ protected _cleanup?(): Promise<void>;
1892
+ protected _getTrimInfo(options: TrimActionOptions): {
1893
+ selectors: string[];
1894
+ removeComments: boolean;
1895
+ removeHidden: boolean;
1896
+ };
1887
1897
  /**
1888
- * The active engine instance (e.g., CheerioFetchEngine or PlaywrightFetchEngine)
1889
- * associated with this context.
1898
+ * Finds all elements matching the selector within the given scope.
1899
+ *
1900
+ * @param scope - The scope to search in (Engine-specific element/node or array of nodes).
1901
+ * @param selector - CSS selector.
1902
+ * @returns List of matching elements.
1903
+ * @see {@link IExtractEngine._querySelectorAll} for behavior contract.
1904
+ * @internal
1890
1905
  */
1891
- engine?: FetchEngine;
1906
+ abstract _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
1892
1907
  /**
1893
- * Additional implementation-specific internal state.
1908
+ * Extracts a primitive value from the element based on schema.
1909
+ *
1910
+ * @param schema - Value extraction schema.
1911
+ * @param scope - The element scope.
1912
+ * @returns Extracted value.
1913
+ * @see {@link IExtractEngine._extractValue} for behavior contract.
1914
+ * @internal
1894
1915
  */
1895
- [key: string]: any;
1896
- }
1897
- /**
1898
- * Extended internal state for the fetch context, including action lifecycle management.
1899
- *
1900
- * @internal
1901
- */
1902
- interface FetchContextInteralState extends BaseFetchContextInteralState {
1916
+ abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
1903
1917
  /**
1904
- * Stack of actions currently being executed, used to manage nested action calls.
1918
+ * Gets the parent element of the given element.
1919
+ *
1920
+ * @param scope - The element scope.
1921
+ * @returns Parent element or null.
1922
+ * @internal
1905
1923
  */
1906
- actionStack?: FetchActionInContext[];
1924
+ abstract _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
1907
1925
  /**
1908
- * Global counter for actions executed within the session, used to assign auto-incrementing indices.
1926
+ * Checks if two elements are the same identity.
1927
+ *
1928
+ * @param scope1 - First element scope.
1929
+ * @param scope2 - Second element scope.
1930
+ * @returns True if they are the same DOM node.
1931
+ * @internal
1909
1932
  */
1910
- actionIndex?: number;
1911
- }
1912
- /**
1913
- * Context provided to the Fetch Engine during navigation and request handling.
1914
- *
1915
- * @remarks
1916
- * This interface contains the minimum set of properties required by an engine
1917
- * to perform a fetch operation and build a response.
1918
- */
1919
- interface FetchEngineContext extends BaseFetcherProperties {
1933
+ abstract _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
1920
1934
  /**
1921
- * Unique identifier for the session or request batch.
1935
+ * Gets all subsequent siblings of an element until a sibling matches the selector.
1936
+ * Used in 'segmented' extraction mode.
1937
+ *
1938
+ * @param scope - The anchor element scope.
1939
+ * @param untilSelector - Optional selector that marks the end of the segment (exclusive).
1940
+ * @returns List of sibling elements between anchor and untilSelector.
1941
+ * @internal
1922
1942
  */
1923
- id: string;
1943
+ abstract _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
1924
1944
  /**
1925
- * The target URL for the next navigation, if specified.
1945
+ * Finds the closest ancestor of `scope` (including itself) that exists in the `candidates` array.
1946
+ *
1947
+ * @param scope - The starting element.
1948
+ * @param candidates - The array of potential ancestor scopes.
1949
+ * @returns A promise resolving to the matching candidate scope, or `null` if none found.
1950
+ * @see {@link IExtractEngine._findClosestAncestor} for implementation details.
1951
+ * @internal
1926
1952
  */
1927
- url?: string;
1953
+ abstract _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
1928
1954
  /**
1929
- * The final URL after all redirects have been followed.
1955
+ * Checks if the `container` scope contains the `element` scope.
1956
+ *
1957
+ * @param container - The potential ancestor element.
1958
+ * @param element - The potential descendant element.
1959
+ * @returns A promise resolving to `true` if `container` contains `element`.
1960
+ * @see {@link IExtractEngine._contains} for implementation details.
1961
+ * @internal
1930
1962
  */
1931
- finalUrl?: string;
1963
+ abstract _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
1932
1964
  /**
1933
- * The standardized response object from the most recent navigation.
1965
+ * Finds the Lowest Common Ancestor (LCA) of two element scopes.
1966
+ *
1967
+ * @param scope1 - The first element scope.
1968
+ * @param scope2 - The second element scope.
1969
+ * @returns A promise resolving to the LCA element scope, or `null` if none found.
1970
+ * @internal
1934
1971
  */
1935
- lastResponse?: FetchResponse;
1972
+ abstract _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
1936
1973
  /**
1937
- * The result object from the most recent action execution.
1974
+ * Finds the direct child of container that contains element.
1975
+ *
1976
+ * @param element - The descendant element.
1977
+ * @param container - The container element.
1978
+ * @returns The child element of container, or null.
1979
+ * @internal
1938
1980
  */
1939
- lastResult?: FetchActionResult;
1981
+ abstract _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
1982
+ protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
1940
1983
  /**
1941
- * Engine-specific internal state.
1984
+ * Normalizes the array extraction mode into an options object.
1985
+ * @param mode - The mode string or options object.
1986
+ * @internal
1942
1987
  */
1943
- internal: BaseFetchContextInteralState;
1944
- }
1945
- /**
1946
- * The full execution context for a Web Fetcher session or action batch.
1947
- *
1948
- * @remarks
1949
- * This object is the central state container for the fetch operation. It provides
1950
- * access to configuration, the event bus, shared outputs, and the execution engine.
1951
- * It is passed to every action during execution.
1952
- */
1953
- interface FetchContext extends FetchEngineContext {
1988
+ protected _normalizeArrayMode(mode?: ExtractArrayMode): {
1989
+ type: ExtractArrayModeName;
1990
+ } & any;
1954
1991
  /**
1955
- * Metadata about the action currently being executed.
1992
+ * Performs standard nested array extraction.
1993
+ * @param items - The schema for each item.
1994
+ * @param elements - The list of item elements.
1995
+ * @internal
1956
1996
  */
1957
- currentAction?: FetchActionInContext;
1997
+ protected _extractNested(items: ExtractSchema, elements: FetchElementScope[], opts?: {
1998
+ strict?: boolean;
1999
+ }): Promise<any[]>;
1958
2000
  /**
1959
- * A shared key-value store for storing data extracted from pages or
1960
- * metadata generated during action execution.
2001
+ * Performs columnar extraction (Column Alignment Mode).
2002
+ *
2003
+ * @param schema - The schema for a single item (must be an object or implicit object).
2004
+ * @param container - The container element to search within.
2005
+ * @param opts - Columnar extraction options (strict, inference).
2006
+ * @returns An array of extracted items, or null if requirements aren't met.
2007
+ * @internal
1961
2008
  */
1962
- outputs: Record<string, any>;
2009
+ protected _extractColumnar(schema: ExtractSchema, container: FetchElementScope, opts?: ColumnarOptions): Promise<any[] | null>;
1963
2010
  /**
1964
- * Executes a FetchAction within the current context.
2011
+ * Performs segmented extraction (Anchor-based Scanning).
1965
2012
  *
1966
- * @param actionOptions - Configuration for the action to be executed.
1967
- * @returns A promise that resolves to the action's result.
2013
+ * @param schema - The schema for a single item (must be an object).
2014
+ * @param container - The container element to scan.
2015
+ * @param opts - Segmented extraction options (anchor).
2016
+ * @returns An array of extracted items.
2017
+ * @internal
1968
2018
  */
1969
- execute<R extends FetchReturnType = 'any'>(actionOptions: FetchActionOptions): Promise<FetchActionResult<R>>;
2019
+ protected _extractSegmented(schema: ExtractSchema, container: FetchElementScope, opts?: SegmentedOptions): Promise<any[] | null>;
1970
2020
  /**
1971
- * Convenience method to execute an action by its registered name or ID.
1972
- *
1973
- * @param name - The registered name or ID of the action.
1974
- * @param params - Parameters specific to the action type.
1975
- * @param options - Additional execution options (e.g., storeAs, failOnError).
1976
- * @returns A promise that resolves to the action's result.
2021
+ * Creates the crawler instance for the specific engine implementation.
2022
+ * @param options - The final crawler options.
2023
+ * @internal
1977
2024
  */
1978
- action<R extends FetchReturnType = 'any'>(name: string, params?: any, options?: Partial<FetchActionOptions>): Promise<FetchActionResult<R>>;
2025
+ protected abstract _createCrawler(options: TOptions, config?: Configuration): TCrawler;
1979
2026
  /**
1980
- * Internal state for engine and lifecycle management.
2027
+ * Gets the crawler-specific options from the subclass.
2028
+ * @param ctx - The fetch engine context.
2029
+ * @internal
1981
2030
  */
1982
- internal: FetchContextInteralState;
2031
+ protected abstract _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<TOptions>> | Partial<TOptions>;
1983
2032
  /**
1984
- * The central event bus for publishing and subscribing to session and action events.
2033
+ * Abstract method for building a standard {@link FetchResponse} from the Crawlee context.
2034
+ *
2035
+ * @param context - Crawlee crawling context
2036
+ * @returns Promise resolving to a {@link FetchResponse} object
2037
+ *
2038
+ * @remarks
2039
+ * Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
2040
+ * @internal
1985
2041
  */
1986
- eventBus: EventEmitter;
1987
- }
1988
-
1989
- type CheerioAPI = NonNullable<CheerioCrawlingContext['$']>;
1990
- type CheerioSelection = ReturnType<CheerioAPI>;
1991
- type CheerioNode = ReturnType<CheerioSelection['first']>;
1992
- declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
1993
- static readonly id = "cheerio";
1994
- static readonly mode = "http";
1995
- private _ensureCheerioContext;
1996
- protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
1997
- _querySelectorAll(scope: {
1998
- $: CheerioAPI;
1999
- el: any;
2000
- } | any[], selector: string): Promise<FetchElementScope[]>;
2001
- _nextSiblingsUntil(scope: {
2002
- $: CheerioAPI;
2003
- el: CheerioNode;
2004
- }, untilSelector?: string): Promise<FetchElementScope[]>;
2005
- _parentElement(scope: {
2006
- $: CheerioAPI;
2007
- el: CheerioNode;
2008
- }): Promise<FetchElementScope | null>;
2009
- _isSameElement(scope1: {
2010
- el: CheerioNode;
2011
- }, scope2: {
2012
- el: CheerioNode;
2013
- }): Promise<boolean>;
2014
- _findClosestAncestor(scope: {
2015
- $: CheerioAPI;
2016
- el: CheerioNode;
2017
- }, candidates: {
2018
- $: CheerioAPI;
2019
- el: CheerioNode;
2020
- }[]): Promise<FetchElementScope | null>;
2021
- _contains(container: {
2022
- $: CheerioAPI;
2023
- el: CheerioNode;
2024
- }, element: {
2025
- $: CheerioAPI;
2026
- el: CheerioNode;
2027
- }): Promise<boolean>;
2028
- _findCommonAncestor(scope1: {
2029
- $: CheerioAPI;
2030
- el: CheerioNode;
2031
- }, scope2: {
2032
- $: CheerioAPI;
2033
- el: CheerioNode;
2034
- }): Promise<FetchElementScope | null>;
2035
- _findContainerChild(element: {
2036
- $: CheerioAPI;
2037
- el: CheerioNode;
2038
- }, container: {
2039
- $: CheerioAPI;
2040
- el: CheerioNode;
2041
- }): Promise<FetchElementScope | null>;
2042
- _extractValue(schema: ExtractValueSchema, scope: {
2043
- $: CheerioAPI;
2044
- el: CheerioNode;
2045
- }): Promise<any>;
2046
- protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
2047
- protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
2048
- protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
2049
- url: string;
2050
- method: string;
2051
- body?: any;
2052
- headers?: Record<string, string>;
2053
- }): Promise<any>;
2054
- protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
2055
- protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
2056
- protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
2057
- goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
2058
- }
2059
-
2060
- type Page = NonNullable<PlaywrightCrawlingContext['page']>;
2061
- type Locator = ReturnType<Page['locator']>;
2062
- declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
2063
- static readonly id = "playwright";
2064
- static readonly mode = "browser";
2065
- protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
2066
- _querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
2067
- _nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
2068
- _parentElement(scope: Locator): Promise<FetchElementScope | null>;
2069
- _isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
2070
- _findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
2071
- _contains(container: Locator, element: Locator): Promise<boolean>;
2072
- _findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
2073
- _findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
2074
- _extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
2075
- protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
2076
- protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
2077
- protected currentMousePos: {
2078
- x: number;
2079
- y: number;
2080
- };
2081
- protected _getRandomDelay(base: number, variance?: number): number;
2082
- protected _getTrajectory(start: {
2083
- x: number;
2084
- y: number;
2085
- }, end: {
2086
- x: number;
2087
- y: number;
2088
- }, steps?: number): {
2089
- x: number;
2090
- y: number;
2091
- }[];
2092
- protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
2093
- x: number;
2094
- y: number;
2095
- }>;
2096
- protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
2097
- protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
2098
- protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
2099
- goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
2100
- }
2101
-
2102
- declare enum FetchActionResultStatus {
2042
+ protected abstract _buildResponse(context: TContext): Promise<FetchResponse>;
2043
+ protected buildResponse(context: TContext): Promise<FetchResponse>;
2103
2044
  /**
2104
- * 动作执行失败但未抛出(通常因 failOnError=false);错误信息在 error 字段
2045
+ * Abstract method for executing action within current page context.
2046
+ *
2047
+ * @param context - Crawlee crawling context
2048
+ * @param action - Action to execute
2049
+ * @returns Promise resolving to action result
2050
+ *
2051
+ * @remarks
2052
+ * Handles specific user interactions using underlying technology (Playwright/Cheerio).
2053
+ * @internal
2105
2054
  */
2106
- Failed = 0,
2055
+ protected abstract executeAction(context: TContext, action: FetchEngineAction): Promise<any>;
2107
2056
  /**
2108
- * 动作按预期完成(即便产生 warnings)
2057
+ * Navigates to the specified URL.
2058
+ *
2059
+ * @param url - Target URL
2060
+ * @param params - Navigation options
2061
+ * @returns Promise resolving when navigation completes
2062
+ *
2063
+ * @example
2064
+ * ```ts
2065
+ * await engine.goto('https://example.com');
2066
+ * ```
2109
2067
  */
2110
- Success = 1,
2068
+ abstract goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
2111
2069
  /**
2112
- * 动作被判定为不执行/降级为 noop(比如引擎不支持且 degradeTo='noop')
2113
- * 能力不支持且 degradeTo='noop' 时:status='skipped',warnings 增加 { code:'capability-not-supported' }
2070
+ * Waits for specified condition before continuing.
2071
+ *
2072
+ * @param params - Wait conditions
2073
+ * @returns Promise resolving when wait condition is met
2074
+ *
2075
+ * @example
2076
+ * ```ts
2077
+ * await engine.waitFor({ ms: 1000 }); // Wait 1 second
2078
+ * await engine.waitFor({ selector: '#content' }); // Wait for element
2079
+ * ```
2114
2080
  */
2115
- Skipped = 2
2116
- }
2117
- type FetchActionCapabilityMode = 'native' | 'simulate' | 'noop';
2118
- interface FetchActionMeta {
2119
- id: string;
2120
- index?: number;
2121
- engineType?: FetchEngineType;
2122
- capability?: FetchActionCapabilityMode;
2123
- response?: FetchResponse;
2124
- timings?: {
2125
- start: number;
2126
- total: number;
2127
- };
2128
- retries?: number;
2129
- }
2130
- interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
2131
- status: FetchActionResultStatus;
2132
- returnType?: R;
2133
- result?: FetchReturnTypeFor<R>;
2134
- error?: Error;
2135
- meta?: FetchActionMeta;
2136
- }
2137
- interface BaseFetchActionProperties {
2138
- id?: string;
2139
- name?: string;
2140
- action?: string | FetchAction;
2141
- index?: number;
2142
- params?: any;
2143
- args?: any;
2144
- storeAs?: string;
2145
- failOnError?: boolean;
2146
- failOnTimeout?: boolean;
2147
- timeoutMs?: number;
2148
- maxRetries?: number;
2149
- [key: string]: any;
2150
- }
2151
- type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
2152
- interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
2153
- activateOn?: string | RegExp | Array<string | RegExp>;
2154
- deactivateOn?: string | RegExp | Array<string | RegExp>;
2155
- collectOn?: string | RegExp | Array<string | RegExp>;
2156
- background?: boolean;
2157
- }
2158
- type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
2159
- interface FetchActionProperties extends BaseFetchActionProperties {
2160
- collectors?: BaseFetchCollectorOptions[];
2161
- }
2162
- type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
2163
- type FetchActionCapabilities = {
2164
- [mode in FetchEngineType]?: FetchActionCapabilityMode;
2165
- };
2166
- declare abstract class FetchAction {
2167
- private static registry;
2168
- static register(actionClass: typeof FetchAction): void;
2169
- static get(id: string): typeof FetchAction | undefined;
2170
- static create(id: FetchActionOptions): FetchAction | undefined;
2171
- static create(id: string): FetchAction | undefined;
2172
- static has(name: string): boolean;
2173
- static list(): string[];
2174
- static id: string;
2175
- static returnType: FetchReturnType;
2176
- static capabilities: FetchActionCapabilities;
2177
- static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2178
- getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2179
- get id(): string;
2180
- get returnType(): FetchReturnType;
2181
- get capabilities(): FetchActionCapabilities;
2182
- protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2183
- protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2184
- abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
2185
- protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
2186
- protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
2081
+ waitFor(params?: WaitForActionOptions): Promise<void>;
2187
2082
  /**
2188
- * Action 开始生命周期
2189
- * 负责:初始化 stack、设置 currentAction、触发事件、调用钩子
2083
+ * Clicks on element matching selector.
2084
+ *
2085
+ * @param selector - CSS selector of element to click
2086
+ * @returns Promise resolving when click is processed
2087
+ * @throws {Error} When no active page context exists
2190
2088
  */
2191
- beforeExec(context: FetchContext, options?: FetchActionProperties): Promise<{
2192
- entry: FetchActionInContext;
2193
- collectors: CollectorsRuntime | undefined;
2194
- }>;
2089
+ click(selector: string): Promise<void>;
2195
2090
  /**
2196
- * Action 结束生命周期
2197
- * 负责:调用钩子、赋值lastResult, 触发事件、清理 stack、恢复 currentAction
2091
+ * Moves mouse to specified position or element.
2092
+ *
2093
+ * @param params - Move parameters (x, y, selector, steps)
2198
2094
  */
2199
- afterExec(context: FetchContext, options?: BaseFetchCollectorActionProperties, result?: FetchActionResult, scope?: {
2200
- entry: FetchActionInContext;
2201
- collectors?: CollectorsRuntime;
2095
+ mouseMove(params: {
2096
+ x?: number;
2097
+ y?: number;
2098
+ selector?: string;
2099
+ steps?: number;
2202
2100
  }): Promise<void>;
2203
- execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
2204
- }
2205
- type CollectorsRuntime = {
2206
- cleanup: () => void;
2207
- awaitExecPendings: () => Promise<void>;
2208
- };
2209
-
2210
- type FetchEngineType = 'http' | 'browser';
2211
- type BrowserEngine = 'playwright' | 'puppeteer';
2212
- type FetchEngineMode = FetchEngineType | 'auto' | string;
2213
- type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
2214
- /**
2215
- * Storage configuration options for the fetch engine.
2216
- *
2217
- * @remarks
2218
- * Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
2219
- */
2220
- interface StorageOptions {
2221
2101
  /**
2222
- * Custom identifier for the storage.
2223
- * If provided, multiple sessions can share the same storage by using the same ID.
2224
- * If not provided, a unique session ID is used (strong isolation).
2102
+ * Clicks at current position or specified position.
2103
+ *
2104
+ * @param params - Click parameters (x, y, button, clickCount, delay)
2225
2105
  */
2226
- id?: string;
2106
+ mouseClick(params: {
2107
+ x?: number;
2108
+ y?: number;
2109
+ button?: 'left' | 'right' | 'middle';
2110
+ clickCount?: number;
2111
+ delay?: number;
2112
+ }): Promise<void>;
2227
2113
  /**
2228
- * Whether to persist storage to disk.
2229
- * If true, uses Crawlee's disk persistence. If false, data might be stored in memory or temporary directory.
2230
- * Corresponds to Crawlee's `persistStorage` configuration.
2114
+ * Scrolls the mouse wheel.
2115
+ *
2116
+ * @param params - Wheel parameters (x, y, selector, deltaX, deltaY, steps)
2231
2117
  */
2232
- persist?: boolean;
2118
+ mouseWheel(params: {
2119
+ x?: number;
2120
+ y?: number;
2121
+ selector?: string;
2122
+ deltaX?: number;
2123
+ deltaY?: number;
2124
+ steps?: number;
2125
+ }): Promise<void>;
2233
2126
  /**
2234
- * Whether to delete the storage (RequestQueue and KeyValueStore) when the session is closed.
2235
- * Defaults to true. Set to false if you want to keep data for future reuse with the same `id`.
2127
+ * Scrolls the element into view.
2128
+ *
2129
+ * @param params - Scroll parameters (selector)
2236
2130
  */
2237
- purge?: boolean;
2131
+ scrollIntoView(params: {
2132
+ selector: string;
2133
+ }): Promise<void>;
2238
2134
  /**
2239
- * Additional Crawlee configuration options.
2240
- * Allows fine-grained control over the underlying Crawlee instance.
2135
+ * Types text into current focused element.
2136
+ *
2137
+ * @param text - Text to type
2138
+ * @param delay - Delay between key presses
2139
+ */
2140
+ keyboardType(text: string, delay?: number): Promise<void>;
2141
+ /**
2142
+ * Presses specified key.
2143
+ *
2144
+ * @param key - Key to press
2145
+ * @param delay - Delay after key press
2146
+ */
2147
+ keyboardPress(key: string, delay?: number): Promise<void>;
2148
+ /**
2149
+ * Fills input element with specified value.
2150
+ *
2151
+ * @param selector - CSS selector of input element
2152
+ * @param value - Value to fill
2153
+ * @returns Promise resolving when fill operation completes
2154
+ * @throws {Error} When no active page context exists
2155
+ */
2156
+ fill(selector: string, value: string): Promise<void>;
2157
+ /**
2158
+ * Submits a form.
2159
+ *
2160
+ * @param selector - Optional form/submit button selector
2161
+ * @param options - Submission options
2162
+ * @returns Promise resolving when form is submitted
2163
+ * @throws {Error} When no active page context exists
2164
+ */
2165
+ submit(selector?: any, options?: SubmitActionOptions): Promise<void>;
2166
+ /**
2167
+ * Removes elements from the DOM based on selectors and presets.
2168
+ *
2169
+ * @param options - Trim options specifying selectors and presets
2170
+ * @returns Promise resolving when trim operation completes
2171
+ * @throws {Error} When no active page context exists
2172
+ */
2173
+ trim(options: TrimActionOptions): Promise<void>;
2174
+ /**
2175
+ * Pauses execution, allowing for manual intervention or inspection.
2176
+ *
2177
+ * @param message - Optional message to display during pause
2178
+ * @returns Promise resolving when execution is resumed
2179
+ * @throws {Error} When no active page context exists
2180
+ */
2181
+ pause(message?: string): Promise<void>;
2182
+ /**
2183
+ * Executes a custom function or expression within the current page context.
2184
+ *
2185
+ * @remarks
2186
+ * This is a powerful action that allows running custom logic to interact with the DOM,
2187
+ * calculate values, or trigger navigations.
2188
+ *
2189
+ * - In **Browser Mode**, it runs in the real browser.
2190
+ * - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
2191
+ *
2192
+ * The action handles automatic navigation if `window.location` is modified.
2193
+ *
2194
+ * @param params - Configuration for the execution, including the function and arguments.
2195
+ * @returns A promise resolving to the result of the execution.
2196
+ * @throws {Error} If no active page context exists or if execution fails.
2197
+ *
2198
+ * @see {@link EvaluateActionOptions} for detailed parameter options and examples.
2199
+ */
2200
+ evaluate(params: EvaluateActionOptions): Promise<any>;
2201
+ /**
2202
+ * Extracts structured data from the current page content.
2203
+ *
2204
+ * @param schema - An object defining the data to extract.
2205
+ * @returns A promise that resolves to an object with the extracted data.
2206
+ */
2207
+ extract<T>(schema: ExtractSchema): Promise<T>;
2208
+ /**
2209
+ * Gets the unique identifier of this engine implementation.
2210
+ */
2211
+ get id(): string;
2212
+ /**
2213
+ * Returns the current state of the engine (cookies and, optionally, session state)
2214
+ * that can be used to restore the session later.
2215
+ */
2216
+ getState(): Promise<{
2217
+ cookies: Cookie[];
2218
+ sessionState?: any;
2219
+ }>;
2220
+ /**
2221
+ * Gets the execution mode of this engine (`'http'` or `'browser'`).
2241
2222
  */
2242
- config?: Record<string, any>;
2243
- }
2244
- interface BaseFetcherProperties {
2223
+ get mode(): FetchEngineType;
2245
2224
  /**
2246
- * 抓取模式
2247
- *
2248
- * - `http`: 使用 HTTP 进行抓取
2249
- * - `browser`: 使用浏览器进行抓取
2250
- * - `auto`: auto 会走“智能探测”选择 http 或 browser, 但是如果没有启用 smart,并且在站点注册表中没有,那么则等价为 http.
2225
+ * Gets the fetch engine context associated with this instance.
2251
2226
  */
2252
- engine?: FetchEngineMode;
2253
- enableSmart?: boolean;
2254
- useSiteRegistry?: boolean;
2255
- antibot?: boolean;
2256
- debug?: boolean | string | string[];
2257
- headers?: Record<string, string>;
2258
- cookies?: Cookie[];
2259
- sessionState?: any;
2260
- sessionPoolOptions?: SessionPoolOptions;
2261
- overrideSessionState?: boolean;
2262
- throwHttpErrors?: boolean;
2263
- output?: {
2264
- cookies?: boolean;
2265
- sessionState?: boolean;
2266
- };
2267
- proxy?: string | string[];
2268
- blockResources?: ResourceType[];
2227
+ get context(): FetchEngineContext | undefined;
2269
2228
  /**
2270
- * Storage configuration for session isolation and persistence.
2229
+ * Initializes the fetch engine with provided context and options.
2230
+ *
2231
+ * @param context - Fetch engine context
2232
+ * @param options - Configuration options
2233
+ * @returns Promise resolving when initialization completes
2234
+ *
2235
+ * @remarks
2236
+ * Sets up internal state and calls the implementation-specific `_initialize` method.
2237
+ * Automatically called when creating engine via `FetchEngine.create()`.
2271
2238
  */
2272
- storage?: StorageOptions;
2273
- ignoreSslErrors?: boolean;
2274
- browser?: {
2275
- /**
2276
- * 浏览器引擎,默认为 playwright
2277
- *
2278
- * - `playwright`: 使用 Playwright 引擎
2279
- * - `puppeteer`: 使用 Puppeteer 引擎
2280
- */
2281
- engine?: BrowserEngine;
2282
- headless?: boolean;
2283
- waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
2284
- launchOptions?: Record<string, any>;
2285
- };
2286
- http?: {
2287
- method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
2288
- body?: any;
2289
- };
2290
- timeoutMs?: number;
2291
- requestHandlerTimeoutSecs?: number;
2292
- maxConcurrency?: number;
2293
- maxRequestsPerMinute?: number;
2294
- delayBetweenRequestsMs?: number;
2295
- retries?: number;
2296
- sites?: FetchSite[];
2297
- url?: string;
2298
- }
2299
- interface FetchSite extends BaseFetcherProperties {
2300
- domain: string;
2301
- pathScope?: string[];
2302
- meta?: {
2303
- updatedAt?: number;
2304
- ttlMs?: number;
2305
- source?: 'manual' | 'smart';
2306
- };
2307
- }
2308
- type OnFetchPauseCallback = (options: {
2309
- message?: string;
2310
- }) => Promise<void>;
2311
- interface FetcherOptions extends BaseFetcherProperties {
2312
- actions?: FetchActionOptions[];
2313
- onPause?: OnFetchPauseCallback;
2314
- }
2315
- interface FetchMetadata {
2316
- mode: FetchEngineType;
2317
- engine?: BrowserEngine;
2318
- timings?: {
2319
- start: number;
2320
- total: number;
2321
- ttfb?: number;
2322
- dns?: number;
2323
- tcp?: number;
2324
- firstByte?: number;
2325
- download?: number;
2326
- };
2327
- proxy?: string;
2328
- [key: string]: any;
2329
- }
2330
- interface FetchResponse {
2331
- url: string;
2332
- finalUrl: string;
2333
- statusCode?: number;
2334
- statusText?: string;
2335
- headers: Record<string, string>;
2336
- contentType?: string;
2337
- body?: string | Buffer<ArrayBufferLike>;
2338
- html?: string;
2339
- text?: string;
2340
- json?: any;
2341
- cookies?: Cookie[];
2342
- sessionState?: any;
2343
- metadata?: FetchMetadata;
2344
- }
2345
- declare const DefaultFetcherProperties: BaseFetcherProperties;
2346
- declare const FetcherOptionKeys: string[];
2347
-
2348
- /**
2349
- * Represents a stateful web fetching session.
2350
- *
2351
- * @remarks
2352
- * A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
2353
- * cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
2354
- * session-level configurations and outputs.
2355
- *
2356
- * Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
2357
- */
2358
- declare class FetchSession {
2359
- protected options: FetcherOptions;
2239
+ initialize(context: FetchEngineContext, options?: BaseFetcherProperties): Promise<void>;
2240
+ cleanup(): Promise<void>;
2360
2241
  /**
2361
- * Unique identifier for the session.
2242
+ * Gets the initial scope for extraction for the specific engine.
2243
+ * @param context - Crawlee crawling context
2244
+ * @internal
2362
2245
  */
2363
- readonly id: string;
2246
+ protected abstract _getInitialElementScope(context: TContext): FetchElementScope;
2364
2247
  /**
2365
- * The execution context for this session, containing configurations, event bus, and shared state.
2248
+ * Unified action processor that handles engine-agnostic actions.
2249
+ * @param context - Crawlee crawling context
2250
+ * @param action - Action to execute
2251
+ * @internal
2366
2252
  */
2367
- readonly context: FetchContext;
2368
- protected closed: boolean;
2253
+ protected _processAction(context: TContext, action: FetchEngineAction): Promise<any>;
2254
+ protected _handlePause(action: {
2255
+ message?: string;
2256
+ }): Promise<void>;
2369
2257
  /**
2370
- * Creates a new FetchSession.
2258
+ * Executes all pending fetch engine actions within the current Crawlee request handler context.
2371
2259
  *
2372
- * @param options - Configuration options for the fetcher.
2260
+ * **Critical Execution Constraint**: This method **MUST** be awaited within the synchronous execution path
2261
+ * of Crawlee's [requestHandler](https://crawlee.dev/js/api/basic-crawler) (before any `await` that yields control back to the event loop).
2262
+ *
2263
+ * ### Why This Constraint Exists
2264
+ * - Crawlee's page context ([PlaywrightCrawler](https://crawlee.dev/js/api/playwright-crawler)'s `page` or [CheerioCrawler](https://crawlee.dev/js/api/cheerio-crawler)'s `$`)
2265
+ * is **only valid during the synchronous execution phase** of the request handler
2266
+ * - After any `await` (e.g., `await page.goto()`), the page context may be destroyed
2267
+ * due to Crawlee's internal resource management
2268
+ *
2269
+ * ### How It Works
2270
+ * 1. Processes all actions queued via {@link dispatchAction} (click, fill, submit, etc.)
2271
+ * 2. Maintains the page context validity window via {@link isPageActive} lifecycle flag
2272
+ * 3. Automatically cleans up event listeners upon completion
2273
+ *
2274
+ * For usage, see {@link _sharedRequestHandler}.
2275
+ * @see {@link _sharedRequestHandler}
2276
+ * @param context The active Crawlee crawling context containing the page/$ object
2277
+ * @throws {Error} If called outside valid page context window (`!this.isPageActive`)
2278
+ * @internal Engine infrastructure method - not for direct consumer use
2373
2279
  */
2374
- constructor(options?: FetcherOptions);
2375
- protected _logDebug(category: string, ...args: any[]): void;
2280
+ protected _executePendingActions(context: TContext): Promise<void>;
2281
+ protected _sharedRequestHandler(context: TContext): Promise<void>;
2282
+ protected _sharedFailedRequestHandler(context: TContext & {
2283
+ response?: FetchResponse;
2284
+ body?: string | Buffer;
2285
+ }, error?: Error): Promise<void>;
2286
+ protected dispatchAction<T>(action: FetchEngineAction): Promise<T>;
2287
+ private _requestHandler;
2288
+ private _failedRequestHandler;
2289
+ protected _commonCleanup(): Promise<void>;
2376
2290
  /**
2377
- * Executes a single action within the session.
2291
+ * Blocks specified resource types from loading.
2378
2292
  *
2379
- * @param actionOptions - Configuration for the action to be executed.
2380
- * @param context - Optional context override for this specific execution. Defaults to the session context.
2381
- * @returns A promise that resolves to the result of the action.
2382
- * @template R - The expected return type of the action.
2293
+ * @param types - Resource types to block
2294
+ * @param overwrite - Whether to replace existing blocked types
2295
+ * @returns Number of blocked resource types
2383
2296
  *
2384
2297
  * @example
2385
2298
  * ```ts
2386
- * await session.execute({ name: 'goto', params: { url: 'https://example.com' } });
2299
+ * await engine.blockResources(['image', 'stylesheet']);
2300
+ * await engine.blockResources(['script'], true); // Replace existing
2387
2301
  * ```
2388
2302
  */
2389
- execute<R extends FetchReturnType = 'response'>(actionOptions: FetchActionOptions, context?: FetchContext): Promise<FetchActionResult<R>>;
2303
+ blockResources(types: ResourceType[], overwrite?: boolean): Promise<number>;
2390
2304
  /**
2391
- * Executes a sequence of actions.
2305
+ * Gets content of current page.
2392
2306
  *
2393
- * @param actions - An array of action options to be executed in order.
2394
- * @param options - Optional temporary configuration overrides (e.g., timeoutMs, headers) for this batch of actions.
2395
- * These overrides do not affect the main session context.
2396
- * @returns A promise that resolves to an object containing the result of the last action and all accumulated outputs.
2307
+ * @returns Promise resolving to fetch response
2308
+ * @throws {Error} When no content has been fetched yet
2309
+ */
2310
+ getContent(): Promise<FetchResponse>;
2311
+ /**
2312
+ * Manages HTTP headers for requests with multiple overloads.
2313
+ *
2314
+ * @overload
2315
+ * Gets all headers.
2316
+ * @returns All headers as record
2317
+ *
2318
+ * @overload
2319
+ * Gets specific header value.
2320
+ * @param name - Header name
2321
+ * @returns Header value
2322
+ *
2323
+ * @overload
2324
+ * Sets multiple headers.
2325
+ * @param headers - Headers to set
2326
+ * @param replaced - Whether to replace all existing headers
2327
+ * @returns `true` if successful
2328
+ *
2329
+ * @overload
2330
+ * Sets single header.
2331
+ * @param name - Header name
2332
+ * @param value - Header value or `null` to remove
2333
+ * @returns `true` if successful
2397
2334
  *
2398
2335
  * @example
2399
2336
  * ```ts
2400
- * const { result, outputs } = await session.executeAll([
2401
- * { name: 'goto', params: { url: 'https://example.com' } },
2402
- * { name: 'extract', params: { schema: { title: 'h1' } }, storeAs: 'data' }
2403
- * ], { timeoutMs: 30000 });
2337
+ * const allHeaders = await engine.headers();
2338
+ * const userAgent = await engine.headers('user-agent');
2339
+ * await engine.headers({ 'x-custom': 'value' });
2340
+ * await engine.headers('auth', 'token');
2404
2341
  * ```
2405
2342
  */
2406
- executeAll(actions: FetchActionOptions[], options?: Partial<FetcherOptions> & {
2407
- index?: number;
2408
- }): Promise<{
2409
- result: FetchResponse | undefined;
2410
- outputs: Record<string, any>;
2411
- }>;
2343
+ headers(): Promise<Record<string, string>>;
2344
+ headers(name: string): Promise<string>;
2345
+ headers(headers: Record<string, string>, replaced?: boolean): Promise<boolean>;
2346
+ headers(name: string, value: string | null): Promise<boolean>;
2412
2347
  /**
2413
- * Retrieves all outputs accumulated during the session.
2348
+ * Manages cookies for current session with multiple overloads.
2414
2349
  *
2415
- * @returns A record of stored output data.
2416
- */
2417
- getOutputs(): Record<string, any>;
2418
- /**
2419
- * Gets the current state of the session, including cookies and engine-specific state.
2350
+ * @overload
2351
+ * Gets all cookies.
2352
+ * @returns Array of cookies
2353
+ *
2354
+ * @overload
2355
+ * Sets cookies for session.
2356
+ * @param cookies - Cookies to set
2357
+ * @returns `true` if successful
2420
2358
  *
2421
- * @returns A promise resolving to the session state, or undefined if no engine is initialized.
2359
+ * @example
2360
+ * ```ts
2361
+ * const cookies = await engine.cookies();
2362
+ * await engine.cookies([{ name: 'session', value: '123' }]);
2363
+ * ```
2422
2364
  */
2423
- getState(): Promise<{
2424
- cookies: Cookie[];
2425
- sessionState?: any;
2426
- } | undefined>;
2365
+ cookies(): Promise<Cookie[]>;
2366
+ cookies(cookies: Cookie[]): Promise<boolean>;
2427
2367
  /**
2428
- * Disposes of the session and its associated engine.
2368
+ * Disposes of engine, cleaning up all resources.
2429
2369
  *
2430
- * @remarks
2431
- * This method should be called when the session is no longer needed to free up resources
2432
- * (e.g., closing browser instances, purging temporary storage).
2370
+ * @returns Promise resolving when disposal completes
2433
2371
  */
2434
2372
  dispose(): Promise<void>;
2435
- private ensureEngine;
2436
- protected createContext(options?: FetcherOptions): FetchContext;
2437
2373
  }
2374
+ declare function getRandomDelay(base: number, variance?: number): number;
2438
2375
 
2439
- /**
2440
- * High-level entry point for the Web Fetcher library.
2441
- *
2442
- * @remarks
2443
- * The `WebFetcher` provides a simplified API for fetching web content without manually managing sessions.
2444
- * It can be used for one-off requests or as a factory for more complex `FetchSession` instances.
2445
- *
2446
- * @example
2447
- * ```ts
2448
- * const fetcher = new WebFetcher();
2449
- * const { result } = await fetcher.fetch('https://example.com');
2450
- * ```
2451
- */
2452
- declare class WebFetcher {
2453
- private defaults;
2454
- /**
2455
- * Creates a new WebFetcher with default options.
2456
- *
2457
- * @param defaults - Default configuration options applied to all sessions and requests.
2458
- */
2459
- constructor(defaults?: FetcherOptions);
2376
+ type CheerioAPI = NonNullable<CheerioCrawlingContext['$']>;
2377
+ type CheerioSelection = ReturnType<CheerioAPI>;
2378
+ type CheerioNode = ReturnType<CheerioSelection['first']>;
2379
+ declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
2380
+ static readonly id = "cheerio";
2381
+ static readonly mode = "http";
2382
+ private _ensureCheerioContext;
2383
+ protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
2384
+ _querySelectorAll(scope: {
2385
+ $: CheerioAPI;
2386
+ el: any;
2387
+ } | any[], selector: string): Promise<FetchElementScope[]>;
2388
+ _nextSiblingsUntil(scope: {
2389
+ $: CheerioAPI;
2390
+ el: CheerioNode;
2391
+ }, untilSelector?: string): Promise<FetchElementScope[]>;
2392
+ _parentElement(scope: {
2393
+ $: CheerioAPI;
2394
+ el: CheerioNode;
2395
+ }): Promise<FetchElementScope | null>;
2396
+ _isSameElement(scope1: {
2397
+ el: CheerioNode;
2398
+ }, scope2: {
2399
+ el: CheerioNode;
2400
+ }): Promise<boolean>;
2401
+ _findClosestAncestor(scope: {
2402
+ $: CheerioAPI;
2403
+ el: CheerioNode;
2404
+ }, candidates: {
2405
+ $: CheerioAPI;
2406
+ el: CheerioNode;
2407
+ }[]): Promise<FetchElementScope | null>;
2408
+ _contains(container: {
2409
+ $: CheerioAPI;
2410
+ el: CheerioNode;
2411
+ }, element: {
2412
+ $: CheerioAPI;
2413
+ el: CheerioNode;
2414
+ }): Promise<boolean>;
2415
+ _findCommonAncestor(scope1: {
2416
+ $: CheerioAPI;
2417
+ el: CheerioNode;
2418
+ }, scope2: {
2419
+ $: CheerioAPI;
2420
+ el: CheerioNode;
2421
+ }): Promise<FetchElementScope | null>;
2422
+ _findContainerChild(element: {
2423
+ $: CheerioAPI;
2424
+ el: CheerioNode;
2425
+ }, container: {
2426
+ $: CheerioAPI;
2427
+ el: CheerioNode;
2428
+ }): Promise<FetchElementScope | null>;
2429
+ _extractValue(schema: ExtractValueSchema, scope: {
2430
+ $: CheerioAPI;
2431
+ el: CheerioNode;
2432
+ }): Promise<any>;
2433
+ protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
2434
+ protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
2435
+ protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
2436
+ url: string;
2437
+ method: string;
2438
+ body?: any;
2439
+ headers?: Record<string, string>;
2440
+ }): Promise<any>;
2441
+ protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
2442
+ protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
2443
+ protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
2444
+ goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
2445
+ }
2446
+
2447
+ type Page = NonNullable<PlaywrightCrawlingContext['page']>;
2448
+ type Locator = ReturnType<Page['locator']>;
2449
+ declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
2450
+ static readonly id = "playwright";
2451
+ static readonly mode = "browser";
2452
+ protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
2453
+ _querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
2454
+ _nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
2455
+ _parentElement(scope: Locator): Promise<FetchElementScope | null>;
2456
+ _isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
2457
+ _findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
2458
+ _contains(container: Locator, element: Locator): Promise<boolean>;
2459
+ _findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
2460
+ _findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
2461
+ _extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
2462
+ protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
2463
+ protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
2464
+ protected currentMousePos: {
2465
+ x: number;
2466
+ y: number;
2467
+ };
2468
+ protected _sharedRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
2469
+ protected mouseInitialized: boolean;
2470
+ protected _initializeMousePos(page: Page): Promise<void>;
2471
+ protected _getTrajectory(start: {
2472
+ x: number;
2473
+ y: number;
2474
+ }, end: {
2475
+ x: number;
2476
+ y: number;
2477
+ }, steps?: number): {
2478
+ x: number;
2479
+ y: number;
2480
+ }[];
2481
+ protected _moveToPos(context: PlaywrightCrawlingContext, target: {
2482
+ x: number;
2483
+ y: number;
2484
+ }, steps?: number): Promise<{
2485
+ x: number;
2486
+ y: number;
2487
+ }>;
2488
+ protected _ensureVisible(context: PlaywrightCrawlingContext, selector: string): Promise<{
2489
+ x: number;
2490
+ y: number;
2491
+ }>;
2492
+ protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
2493
+ x: number;
2494
+ y: number;
2495
+ }>;
2496
+ protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
2497
+ protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
2498
+ protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
2499
+ goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
2500
+ }
2501
+
2502
+ type FetchActionCapabilities = {
2503
+ [mode in FetchEngineType]?: FetchActionCapabilityMode;
2504
+ };
2505
+ declare abstract class FetchAction {
2506
+ private static registry;
2507
+ static register(actionClass: any): void;
2508
+ static get(id: string): any | undefined;
2509
+ static create(id: FetchActionOptions): FetchAction | undefined;
2510
+ static create(id: string): FetchAction | undefined;
2511
+ static has(name: string): boolean;
2512
+ static list(): string[];
2513
+ static id: string;
2514
+ static returnType: FetchReturnType;
2515
+ static capabilities: FetchActionCapabilities;
2516
+ static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2517
+ getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
2518
+ get id(): string;
2519
+ get returnType(): FetchReturnType;
2520
+ get capabilities(): FetchActionCapabilities;
2521
+ protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2522
+ protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
2523
+ abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
2524
+ protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
2525
+ protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
2460
2526
  /**
2461
- * Creates a new FetchSession.
2462
- *
2463
- * @param options - Configuration options for the session, merged with defaults.
2464
- * @returns A promise resolving to a new FetchSession instance.
2527
+ * Start of the action lifecycle.
2528
+ * Responsible for: initializing the stack, setting currentAction, emitting events, and invoking hooks.
2465
2529
  */
2466
- createSession(options?: FetcherOptions): Promise<FetchSession>;
2530
+ beforeExec(context: FetchContext, options?: FetchActionProperties): Promise<{
2531
+ entry: Required<Pick<FetchActionProperties, "action">> & Partial<Pick<FetchActionProperties, "id" | "name">> & {
2532
+ [x: string]: any;
2533
+ collectors?: BaseFetchCollectorOptions[] | undefined;
2534
+ index?: number | undefined;
2535
+ params?: any;
2536
+ args?: any;
2537
+ storeAs?: string | undefined;
2538
+ failOnError?: boolean | undefined;
2539
+ failOnTimeout?: boolean | undefined;
2540
+ timeoutMs?: number | undefined;
2541
+ maxRetries?: number | undefined;
2542
+ } & {
2543
+ index?: number;
2544
+ error?: Error;
2545
+ depth?: number;
2546
+ };
2547
+ collectors: CollectorsRuntime | undefined;
2548
+ }>;
2467
2549
  /**
2468
- * Fetches content from a URL or executes a complex action script.
2469
- *
2470
- * @remarks
2471
- * This method automatically creates a session, executes the specified actions,
2472
- * retrieves the content, and disposes of the session.
2473
- *
2474
- * @param url - The target URL or a complete FetcherOptions object.
2475
- * @param options - Additional options when the first parameter is a URL string.
2476
- * @returns A promise resolving to the final response and any extracted outputs.
2550
+ * End of the action lifecycle.
2551
+ * Responsible for: invoking hooks, assigning lastResult, emitting events, cleaning up the stack, and restoring currentAction.
2477
2552
  */
2478
- fetch(url: string, options?: FetcherOptions): Promise<{
2479
- result: FetchResponse | undefined;
2480
- outputs: Record<string, any>;
2481
- }>;
2482
- fetch(options: FetcherOptions): Promise<{
2483
- result: FetchResponse | undefined;
2484
- outputs: Record<string, any>;
2485
- }>;
2553
+ afterExec(context: FetchContext, options?: BaseFetchCollectorActionProperties, result?: FetchActionResult, scope?: {
2554
+ entry: FetchActionInContext;
2555
+ collectors?: CollectorsRuntime;
2556
+ }): Promise<void>;
2557
+ execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
2486
2558
  }
2559
+ type CollectorsRuntime = {
2560
+ cleanup: () => void;
2561
+ awaitExecPendings: () => Promise<void>;
2562
+ };
2487
2563
 
2488
2564
  declare class ClickAction extends FetchAction {
2489
2565
  static id: string;
@@ -2644,6 +2720,53 @@ declare class MouseClickAction extends FetchAction {
2644
2720
  };
2645
2721
  onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
2646
2722
  }
2723
+ interface ScrollIntoViewParams {
2724
+ selector: string;
2725
+ }
2726
+ declare class ScrollIntoViewAction extends FetchAction {
2727
+ static id: string;
2728
+ static returnType: "none";
2729
+ static capabilities: {
2730
+ http: "noop";
2731
+ browser: "native";
2732
+ };
2733
+ onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
2734
+ }
2735
+ interface MouseWheelParams {
2736
+ /**
2737
+ * Target X coordinate for the mouse wheel event.
2738
+ */
2739
+ x?: number;
2740
+ /**
2741
+ * Target Y coordinate for the mouse wheel event.
2742
+ */
2743
+ y?: number;
2744
+ /**
2745
+ * Selector for the element to scroll. If provided, mouse will move to this element before scrolling.
2746
+ */
2747
+ selector?: string;
2748
+ /**
2749
+ * Horizontal scroll delta.
2750
+ */
2751
+ deltaX?: number;
2752
+ /**
2753
+ * Vertical scroll delta.
2754
+ */
2755
+ deltaY?: number;
2756
+ /**
2757
+ * Number of steps to split the scroll into for simulating human-like behavior.
2758
+ */
2759
+ steps?: number;
2760
+ }
2761
+ declare class MouseWheelAction extends FetchAction {
2762
+ static id: string;
2763
+ static returnType: "none";
2764
+ static capabilities: {
2765
+ http: "noop";
2766
+ browser: "native";
2767
+ };
2768
+ onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
2769
+ }
2647
2770
 
2648
2771
  interface KeyboardTypeParams {
2649
2772
  text: string;
@@ -2681,4 +2804,4 @@ declare function fetchWeb(url: string, options?: FetcherOptions): Promise<{
2681
2804
  outputs: Record<string, any>;
2682
2805
  }>;
2683
2806
 
2684
- export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb };
2807
+ export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EngineUpgradeError, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionMeta, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, MouseWheelAction, type MouseWheelParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, ScrollIntoViewAction, type ScrollIntoViewParams, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb, getRandomDelay };