@isdk/web-fetcher 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.action.cn.md +28 -4
- package/README.action.md +27 -4
- package/README.cn.md +21 -0
- package/README.engine.cn.md +35 -7
- package/README.engine.md +30 -2
- package/README.md +23 -1
- package/dist/index.d.mts +1571 -1448
- package/dist/index.d.ts +1571 -1448
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/docs/README.md +23 -1
- package/docs/_media/README.action.md +27 -4
- package/docs/_media/README.cn.md +21 -0
- package/docs/_media/README.engine.md +30 -2
- package/docs/classes/CheerioFetchEngine.md +169 -93
- package/docs/classes/ClickAction.md +29 -29
- package/docs/classes/EngineUpgradeError.md +335 -0
- package/docs/classes/EvaluateAction.md +29 -29
- package/docs/classes/ExtractAction.md +29 -29
- package/docs/classes/FetchAction.md +31 -29
- package/docs/classes/FetchEngine.md +159 -91
- package/docs/classes/FetchSession.md +14 -14
- package/docs/classes/FillAction.md +29 -29
- package/docs/classes/GetContentAction.md +29 -29
- package/docs/classes/GotoAction.md +29 -29
- package/docs/classes/KeyboardPressAction.md +29 -29
- package/docs/classes/KeyboardTypeAction.md +29 -29
- package/docs/classes/MouseClickAction.md +29 -29
- package/docs/classes/MouseMoveAction.md +29 -29
- package/docs/classes/MouseWheelAction.md +533 -0
- package/docs/classes/PauseAction.md +29 -29
- package/docs/classes/PlaywrightFetchEngine.md +252 -118
- package/docs/classes/ScrollIntoViewAction.md +533 -0
- package/docs/classes/SubmitAction.md +29 -29
- package/docs/classes/TrimAction.md +29 -29
- package/docs/classes/WaitForAction.md +29 -29
- package/docs/classes/WebFetcher.md +5 -5
- package/docs/enumerations/FetchActionResultStatus.md +4 -4
- package/docs/functions/fetchWeb.md +2 -2
- package/docs/functions/getRandomDelay.md +25 -0
- package/docs/globals.md +8 -1
- package/docs/interfaces/BaseFetchActionProperties.md +13 -13
- package/docs/interfaces/BaseFetchCollectorActionProperties.md +17 -17
- package/docs/interfaces/BaseFetcherProperties.md +44 -28
- package/docs/interfaces/DispatchedEngineAction.md +4 -4
- package/docs/interfaces/EvaluateActionOptions.md +3 -3
- package/docs/interfaces/ExtractActionProperties.md +13 -13
- package/docs/interfaces/FetchActionMeta.md +73 -0
- package/docs/interfaces/FetchActionProperties.md +15 -19
- package/docs/interfaces/FetchActionResult.md +7 -7
- package/docs/interfaces/FetchContext.md +65 -41
- package/docs/interfaces/FetchEngineContext.md +57 -33
- package/docs/interfaces/FetchMetadata.md +5 -5
- package/docs/interfaces/FetchResponse.md +14 -14
- package/docs/interfaces/FetchReturnTypeRegistry.md +7 -7
- package/docs/interfaces/FetchSite.md +55 -31
- package/docs/interfaces/FetcherOptions.md +55 -31
- package/docs/interfaces/GotoActionOptions.md +8 -8
- package/docs/interfaces/KeyboardPressParams.md +3 -3
- package/docs/interfaces/KeyboardTypeParams.md +3 -3
- package/docs/interfaces/MouseClickParams.md +6 -6
- package/docs/interfaces/MouseMoveParams.md +5 -5
- package/docs/interfaces/MouseWheelParams.md +69 -0
- package/docs/interfaces/PendingEngineRequest.md +3 -3
- package/docs/interfaces/ScrollIntoViewParams.md +17 -0
- package/docs/interfaces/StorageOptions.md +5 -5
- package/docs/interfaces/SubmitActionOptions.md +2 -2
- package/docs/interfaces/TrimActionOptions.md +3 -3
- package/docs/interfaces/WaitForActionOptions.md +5 -5
- package/docs/type-aliases/BaseFetchActionOptions.md +1 -1
- package/docs/type-aliases/BaseFetchCollectorOptions.md +1 -1
- package/docs/type-aliases/BrowserEngine.md +1 -1
- package/docs/type-aliases/FetchActionCapabilities.md +1 -1
- package/docs/type-aliases/FetchActionCapabilityMode.md +1 -1
- package/docs/type-aliases/FetchActionInContext.md +38 -0
- package/docs/type-aliases/FetchActionOptions.md +1 -1
- package/docs/type-aliases/FetchEngineAction.md +2 -2
- package/docs/type-aliases/FetchEngineType.md +1 -1
- package/docs/type-aliases/FetchReturnType.md +1 -1
- package/docs/type-aliases/FetchReturnTypeFor.md +1 -1
- package/docs/type-aliases/OnFetchPauseCallback.md +1 -1
- package/docs/type-aliases/ResourceType.md +1 -1
- package/docs/type-aliases/TrimPreset.md +1 -1
- package/docs/variables/DefaultFetcherProperties.md +1 -1
- package/docs/variables/FetcherOptionKeys.md +1 -1
- package/docs/variables/TRIM_PRESETS.md +1 -1
- package/package.json +7 -7
- package/docs/interfaces/FetchActionInContext.md +0 -190
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration,
|
|
1
|
+
import { Cookie, SessionPoolOptions, CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration, Session, CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions, PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions } from 'crawlee';
|
|
2
2
|
export { Cookie } from 'crawlee';
|
|
3
3
|
import { EventEmitter } from 'events-ex';
|
|
4
4
|
|
|
@@ -731,1759 +731,1835 @@ type _RequireAtLeastOne<
|
|
|
731
731
|
Except<ObjectType, KeysType>;
|
|
732
732
|
|
|
733
733
|
/**
|
|
734
|
-
* Represents the
|
|
735
|
-
* It acts as the target for extraction and interaction actions.
|
|
736
|
-
*/
|
|
737
|
-
type FetchElementScope = any;
|
|
738
|
-
/**
|
|
739
|
-
* Interface representing the minimal engine capabilities required for extraction.
|
|
734
|
+
* Represents the state of an action being executed within a context.
|
|
740
735
|
*
|
|
741
736
|
* @remarks
|
|
742
|
-
*
|
|
743
|
-
*
|
|
744
|
-
* regarding scope handling (Element vs Array of Elements) and DOM traversal.
|
|
737
|
+
* Extends the basic action properties with runtime metadata like execution index,
|
|
738
|
+
* nesting depth, and any errors encountered during execution.
|
|
745
739
|
*/
|
|
746
|
-
|
|
747
|
-
/**
|
|
748
|
-
* Finds all elements matching the selector within the given scope.
|
|
749
|
-
*
|
|
750
|
-
* @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
|
|
751
|
-
* @param selector - The CSS selector to match.
|
|
752
|
-
* @returns A promise resolving to an array of found element scopes.
|
|
753
|
-
*
|
|
754
|
-
* @remarks
|
|
755
|
-
* **Behavior Contract:**
|
|
756
|
-
* 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
|
|
757
|
-
* 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
|
|
758
|
-
* 3. **Array Scope**: If `scope` is an array:
|
|
759
|
-
* - It MUST process elements in the order they appear in the array (which should match document order).
|
|
760
|
-
* - It MUST perform the check (Self + Descendants) for *each* element in the array.
|
|
761
|
-
* - It MUST flatten the results into a single array.
|
|
762
|
-
* - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
|
|
763
|
-
* but generally, preserving document order is the priority.
|
|
764
|
-
*/
|
|
765
|
-
_querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
740
|
+
type FetchActionInContext = FetchActionOptions & {
|
|
766
741
|
/**
|
|
767
|
-
*
|
|
768
|
-
*
|
|
769
|
-
* @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
|
|
770
|
-
* @param scope - The specific element to extract data from.
|
|
771
|
-
* @returns A promise resolving to the extracted value (string, number, boolean, or null).
|
|
772
|
-
*
|
|
773
|
-
* @remarks
|
|
774
|
-
* **Behavior Contract:**
|
|
775
|
-
* - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
|
|
776
|
-
* - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
|
|
777
|
-
* - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
|
|
778
|
-
* - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
|
|
779
|
-
* - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
|
|
742
|
+
* The 0-based index of the action in the execution sequence.
|
|
780
743
|
*/
|
|
781
|
-
|
|
744
|
+
index?: number;
|
|
782
745
|
/**
|
|
783
|
-
*
|
|
784
|
-
*
|
|
785
|
-
* @param scope - The element to find the parent of.
|
|
786
|
-
* @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
|
|
746
|
+
* Error encountered during action execution, if any.
|
|
787
747
|
*/
|
|
788
|
-
|
|
748
|
+
error?: Error;
|
|
789
749
|
/**
|
|
790
|
-
*
|
|
791
|
-
*
|
|
792
|
-
* @param scope1 - The first element scope.
|
|
793
|
-
* @param scope2 - The second element scope.
|
|
794
|
-
* @returns A promise resolving to `true` if they are the same node, `false` otherwise.
|
|
795
|
-
*
|
|
796
|
-
* @remarks
|
|
797
|
-
* This comparison MUST be identity-based, not just content-based.
|
|
750
|
+
* The nesting depth of the action. Top-level actions (executed directly by the session) have a depth of 0.
|
|
798
751
|
*/
|
|
799
|
-
|
|
752
|
+
depth?: number;
|
|
753
|
+
};
|
|
754
|
+
/**
|
|
755
|
+
* Base internal state used by fetch engines to maintain their runtime environment.
|
|
756
|
+
*
|
|
757
|
+
* @internal
|
|
758
|
+
*/
|
|
759
|
+
interface BaseFetchContextInteralState {
|
|
800
760
|
/**
|
|
801
|
-
*
|
|
802
|
-
*
|
|
803
|
-
* @param scope - The anchor element (starting point). The returned list starts *after* this element.
|
|
804
|
-
* @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
|
|
805
|
-
* If omitted or null, returns all following siblings.
|
|
806
|
-
* @returns A promise resolving to an array of sibling element scopes.
|
|
807
|
-
*
|
|
808
|
-
* @remarks
|
|
809
|
-
* **Behavior Contract:**
|
|
810
|
-
* - **Starting Point**: The `scope` element itself IS NOT included in the result.
|
|
811
|
-
* - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
|
|
812
|
-
* - **Direction**: Only scans *following* siblings (next siblings).
|
|
813
|
-
* - **Flattening**: The result is a flat list of siblings, not a nested structure.
|
|
761
|
+
* The active engine instance (e.g., CheerioFetchEngine or PlaywrightFetchEngine)
|
|
762
|
+
* associated with this context.
|
|
814
763
|
*/
|
|
815
|
-
|
|
764
|
+
engine?: any;
|
|
816
765
|
/**
|
|
817
|
-
*
|
|
818
|
-
*
|
|
819
|
-
* @param scope - The starting element from which to ascend the DOM tree.
|
|
820
|
-
* @param candidates - An array of potential ancestor elements to check against.
|
|
821
|
-
* @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
|
|
822
|
-
*
|
|
823
|
-
* @remarks
|
|
824
|
-
* **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
|
|
825
|
-
* It effectively answers: "Which of these container candidates does my current element belong to?"
|
|
826
|
-
*
|
|
827
|
-
* **Implementation Guidelines**:
|
|
828
|
-
* - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
|
|
829
|
-
* - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
|
|
766
|
+
* Additional implementation-specific internal state.
|
|
830
767
|
*/
|
|
831
|
-
|
|
768
|
+
[key: string]: any;
|
|
769
|
+
}
|
|
770
|
+
/**
|
|
771
|
+
* Extended internal state for the fetch context, including action lifecycle management.
|
|
772
|
+
*
|
|
773
|
+
* @internal
|
|
774
|
+
*/
|
|
775
|
+
interface FetchContextInteralState extends BaseFetchContextInteralState {
|
|
832
776
|
/**
|
|
833
|
-
*
|
|
834
|
-
*
|
|
835
|
-
* @param container - The potential ancestor element.
|
|
836
|
-
* @param element - The potential descendant element.
|
|
837
|
-
* @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
|
|
838
|
-
*
|
|
839
|
-
* @remarks
|
|
840
|
-
* **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
|
|
841
|
-
*
|
|
842
|
-
* @performance-critical Used extensively in boundary checks for Segmented extraction.
|
|
843
|
-
* - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
|
|
844
|
-
* - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
|
|
777
|
+
* Stack of actions currently being executed, used to manage nested action calls.
|
|
845
778
|
*/
|
|
846
|
-
|
|
779
|
+
actionStack?: FetchActionInContext[];
|
|
847
780
|
/**
|
|
848
|
-
*
|
|
849
|
-
*
|
|
850
|
-
* @param scope1 - The first element.
|
|
851
|
-
* @param scope2 - The second element.
|
|
852
|
-
* @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
|
|
853
|
-
*
|
|
854
|
-
* @remarks
|
|
855
|
-
* This is a fundamental tree operation used to find the point where two element paths diverge.
|
|
856
|
-
* **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
|
|
781
|
+
* Global counter for actions executed within the session, used to assign auto-incrementing indices.
|
|
857
782
|
*/
|
|
858
|
-
|
|
783
|
+
actionIndex?: number;
|
|
784
|
+
}
|
|
785
|
+
/**
|
|
786
|
+
* Context provided to the Fetch Engine during navigation and request handling.
|
|
787
|
+
*
|
|
788
|
+
* @remarks
|
|
789
|
+
* This interface contains the minimum set of properties required by an engine
|
|
790
|
+
* to perform a fetch operation and build a response.
|
|
791
|
+
*/
|
|
792
|
+
interface FetchEngineContext extends BaseFetcherProperties {
|
|
859
793
|
/**
|
|
860
|
-
*
|
|
861
|
-
*
|
|
862
|
-
* @param element - The descendant element.
|
|
863
|
-
* @param container - The ancestor container.
|
|
864
|
-
* @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
|
|
865
|
-
*
|
|
866
|
-
* @remarks
|
|
867
|
-
* This method traverses up from `element` until it finds the node whose parent is `container`.
|
|
868
|
-
* **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
|
|
794
|
+
* Unique identifier for the session or request batch.
|
|
869
795
|
*/
|
|
870
|
-
|
|
796
|
+
id: string;
|
|
871
797
|
/**
|
|
872
|
-
*
|
|
873
|
-
* @param category - The category of the log message.
|
|
874
|
-
* @param args - Arguments to log.
|
|
798
|
+
* The target URL for the next navigation, if specified.
|
|
875
799
|
*/
|
|
876
|
-
|
|
877
|
-
}
|
|
878
|
-
/**
|
|
879
|
-
* Base configuration for all extraction schemas.
|
|
880
|
-
*/
|
|
881
|
-
interface BaseExtractSchema {
|
|
800
|
+
url?: string;
|
|
882
801
|
/**
|
|
883
|
-
*
|
|
884
|
-
* the containing object or array item will be skipped (or throw error in strict mode).
|
|
802
|
+
* The final URL after all redirects have been followed.
|
|
885
803
|
*/
|
|
886
|
-
|
|
804
|
+
finalUrl?: string;
|
|
887
805
|
/**
|
|
888
|
-
*
|
|
889
|
-
* If true, missing required fields will throw an error instead of being skipped.
|
|
806
|
+
* The standardized response object from the most recent navigation.
|
|
890
807
|
*/
|
|
891
|
-
|
|
808
|
+
lastResponse?: FetchResponse;
|
|
892
809
|
/**
|
|
893
|
-
*
|
|
894
|
-
* - Field Name: Uses the DOM element of a previously extracted field as the anchor.
|
|
895
|
-
* - CSS Selector: Re-queries the selector within the current context to find the anchor.
|
|
896
|
-
*
|
|
897
|
-
* Once anchored, the search scope for this field becomes the siblings following the anchor.
|
|
810
|
+
* The result object from the most recent action execution.
|
|
898
811
|
*/
|
|
899
|
-
|
|
812
|
+
lastResult?: FetchActionResult;
|
|
900
813
|
/**
|
|
901
|
-
*
|
|
902
|
-
* - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
|
|
903
|
-
* - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
|
|
904
|
-
* - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
|
|
814
|
+
* Engine-specific internal state.
|
|
905
815
|
*/
|
|
906
|
-
|
|
816
|
+
internal: BaseFetchContextInteralState;
|
|
907
817
|
}
|
|
908
818
|
/**
|
|
909
|
-
*
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
*
|
|
819
|
+
* The full execution context for a Web Fetcher session or action batch.
|
|
820
|
+
*
|
|
821
|
+
* @remarks
|
|
822
|
+
* This object is the central state container for the fetch operation. It provides
|
|
823
|
+
* access to configuration, the event bus, shared outputs, and the execution engine.
|
|
824
|
+
* It is passed to every action during execution.
|
|
914
825
|
*/
|
|
915
|
-
interface
|
|
826
|
+
interface FetchContext extends FetchEngineContext {
|
|
916
827
|
/**
|
|
917
|
-
*
|
|
918
|
-
* @default 'string'
|
|
828
|
+
* Metadata about the action currently being executed.
|
|
919
829
|
*/
|
|
920
|
-
|
|
830
|
+
currentAction?: FetchActionInContext;
|
|
921
831
|
/**
|
|
922
|
-
*
|
|
923
|
-
*
|
|
924
|
-
* - 'innerText': Uses rendered text (respects CSS line breaks).
|
|
925
|
-
* - 'html': Returns innerHTML.
|
|
926
|
-
* - 'outerHTML': Returns HTML including the element's tag.
|
|
832
|
+
* A shared key-value store for storing data extracted from pages or
|
|
833
|
+
* metadata generated during action execution.
|
|
927
834
|
*/
|
|
928
|
-
|
|
835
|
+
outputs: Record<string, any>;
|
|
929
836
|
/**
|
|
930
|
-
*
|
|
837
|
+
* Executes a FetchAction within the current context.
|
|
838
|
+
*
|
|
839
|
+
* @param actionOptions - Configuration for the action to be executed.
|
|
840
|
+
* @returns A promise that resolves to the action's result.
|
|
931
841
|
*/
|
|
932
|
-
|
|
842
|
+
execute<R extends FetchReturnType = 'any'>(actionOptions: FetchActionOptions): Promise<FetchActionResult<R>>;
|
|
933
843
|
/**
|
|
934
|
-
*
|
|
935
|
-
*
|
|
844
|
+
* Convenience method to execute an action by its registered name or ID.
|
|
845
|
+
*
|
|
846
|
+
* @param name - The registered name or ID of the action.
|
|
847
|
+
* @param params - Parameters specific to the action type.
|
|
848
|
+
* @param options - Additional execution options (e.g., storeAs, failOnError).
|
|
849
|
+
* @returns A promise that resolves to a result.
|
|
936
850
|
*/
|
|
937
|
-
|
|
851
|
+
action<R extends FetchReturnType = 'any'>(name: string, params?: any, options?: Partial<FetchActionOptions>): Promise<FetchActionResult<R>>;
|
|
938
852
|
/**
|
|
939
|
-
*
|
|
853
|
+
* Internal state for engine and lifecycle management.
|
|
940
854
|
*/
|
|
941
|
-
|
|
855
|
+
internal: FetchContextInteralState;
|
|
942
856
|
/**
|
|
943
|
-
*
|
|
857
|
+
* The central event bus for publishing and subscribing to session and action events.
|
|
944
858
|
*/
|
|
945
|
-
|
|
859
|
+
eventBus: EventEmitter;
|
|
946
860
|
}
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
/**
|
|
957
|
-
* Whether to enable strict mode for this specific array mode.
|
|
958
|
-
* @default false
|
|
959
|
-
*/
|
|
960
|
-
strict?: boolean;
|
|
961
|
-
}
|
|
962
|
-
/**
|
|
963
|
-
* Options for columnar (column-alignment) extraction.
|
|
964
|
-
*/
|
|
965
|
-
interface ColumnarOptions extends BaseModeOptions {
|
|
966
|
-
type: 'columnar';
|
|
967
|
-
/**
|
|
968
|
-
* Whether to enable heuristic inference.
|
|
969
|
-
* If true, tries to find a common parent to infer item wrappers when counts mismatch.
|
|
970
|
-
* @default false
|
|
971
|
-
*/
|
|
972
|
-
inference?: boolean;
|
|
861
|
+
|
|
862
|
+
type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
|
|
863
|
+
interface FetchReturnTypeRegistry {
|
|
864
|
+
response: FetchResponse;
|
|
865
|
+
context: FetchContext;
|
|
866
|
+
result: FetchActionResult<any> | undefined;
|
|
867
|
+
outputs: Record<string, any>;
|
|
868
|
+
any: any;
|
|
869
|
+
none: void;
|
|
973
870
|
}
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
interface SegmentedOptions extends BaseModeOptions {
|
|
978
|
-
type: 'segmented';
|
|
871
|
+
type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
|
|
872
|
+
|
|
873
|
+
declare enum FetchActionResultStatus {
|
|
979
874
|
/**
|
|
980
|
-
*
|
|
981
|
-
* Defaults to the first property key's selector defined in `items`.
|
|
875
|
+
* 动作执行失败但未抛出(通常因 failOnError=false);错误信息在 error 字段
|
|
982
876
|
*/
|
|
983
|
-
|
|
877
|
+
Failed = 0,
|
|
984
878
|
/**
|
|
985
|
-
*
|
|
986
|
-
* - 'anchor': (Default) All fields are searched within the entire segment.
|
|
987
|
-
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
879
|
+
* 动作按预期完成(即便产生 warnings)
|
|
988
880
|
*/
|
|
989
|
-
|
|
881
|
+
Success = 1,
|
|
990
882
|
/**
|
|
991
|
-
*
|
|
992
|
-
*
|
|
883
|
+
* 动作被判定为不执行/降级为 noop(比如引擎不支持且 degradeTo='noop')
|
|
884
|
+
* 能力不支持且 degradeTo='noop' 时:status='skipped',warnings 增加 { code:'capability-not-supported' }
|
|
993
885
|
*/
|
|
994
|
-
|
|
886
|
+
Skipped = 2
|
|
995
887
|
}
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
888
|
+
type FetchActionCapabilityMode = 'native' | 'simulate' | 'noop';
|
|
889
|
+
interface FetchActionMeta {
|
|
890
|
+
id: string;
|
|
891
|
+
index?: number;
|
|
892
|
+
engineType?: FetchEngineType;
|
|
893
|
+
capability?: FetchActionCapabilityMode;
|
|
894
|
+
response?: FetchResponse;
|
|
895
|
+
timings?: {
|
|
896
|
+
start: number;
|
|
897
|
+
total: number;
|
|
898
|
+
};
|
|
899
|
+
retries?: number;
|
|
900
|
+
}
|
|
901
|
+
interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
|
|
902
|
+
status: FetchActionResultStatus;
|
|
903
|
+
returnType?: R;
|
|
904
|
+
result?: FetchReturnTypeFor<R>;
|
|
905
|
+
error?: Error;
|
|
906
|
+
meta?: FetchActionMeta;
|
|
907
|
+
}
|
|
908
|
+
interface BaseFetchActionProperties {
|
|
909
|
+
id?: string;
|
|
910
|
+
name?: string;
|
|
911
|
+
action?: string | any;
|
|
912
|
+
index?: number;
|
|
913
|
+
params?: any;
|
|
914
|
+
args?: any;
|
|
915
|
+
storeAs?: string;
|
|
916
|
+
failOnError?: boolean;
|
|
917
|
+
failOnTimeout?: boolean;
|
|
918
|
+
timeoutMs?: number;
|
|
919
|
+
maxRetries?: number;
|
|
920
|
+
[key: string]: any;
|
|
921
|
+
}
|
|
922
|
+
type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
|
|
923
|
+
interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
|
|
924
|
+
activateOn?: string | RegExp | Array<string | RegExp>;
|
|
925
|
+
deactivateOn?: string | RegExp | Array<string | RegExp>;
|
|
926
|
+
collectOn?: string | RegExp | Array<string | RegExp>;
|
|
927
|
+
background?: boolean;
|
|
928
|
+
}
|
|
929
|
+
type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
|
|
930
|
+
interface FetchActionProperties extends BaseFetchActionProperties {
|
|
931
|
+
collectors?: BaseFetchCollectorOptions[];
|
|
932
|
+
}
|
|
933
|
+
type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
|
|
934
|
+
declare class EngineUpgradeError extends Error {
|
|
935
|
+
res: FetchResponse;
|
|
936
|
+
code: string;
|
|
937
|
+
constructor(res: FetchResponse);
|
|
1033
938
|
}
|
|
939
|
+
type FetchEngineType = 'http' | 'browser';
|
|
940
|
+
type BrowserEngine = 'playwright' | 'puppeteer';
|
|
941
|
+
type FetchEngineMode = FetchEngineType | 'auto' | string;
|
|
942
|
+
type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
|
|
1034
943
|
/**
|
|
1035
|
-
*
|
|
944
|
+
* Storage configuration options for the fetch engine.
|
|
945
|
+
*
|
|
946
|
+
* @remarks
|
|
947
|
+
* Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
|
|
1036
948
|
*/
|
|
1037
|
-
interface
|
|
1038
|
-
type: 'object';
|
|
949
|
+
interface StorageOptions {
|
|
1039
950
|
/**
|
|
1040
|
-
*
|
|
951
|
+
* Custom identifier for the storage.
|
|
952
|
+
* If provided, multiple sessions can share the same storage by using the same ID.
|
|
953
|
+
* If not provided, a unique session ID is used (strong isolation).
|
|
1041
954
|
*/
|
|
1042
|
-
|
|
955
|
+
id?: string;
|
|
1043
956
|
/**
|
|
1044
|
-
*
|
|
957
|
+
* Whether to persist storage to disk.
|
|
958
|
+
* If true, uses Crawlee's disk persistence. If false, data might be stored in memory or temporary directory.
|
|
959
|
+
* Corresponds to Crawlee's `persistStorage` configuration.
|
|
1045
960
|
*/
|
|
1046
|
-
|
|
961
|
+
persist?: boolean;
|
|
1047
962
|
/**
|
|
1048
|
-
*
|
|
963
|
+
* Whether to delete the storage (RequestQueue and KeyValueStore) when the session is closed.
|
|
964
|
+
* Defaults to true. Set to false if you want to keep data for future reuse with the same `id`.
|
|
1049
965
|
*/
|
|
1050
|
-
|
|
966
|
+
purge?: boolean;
|
|
1051
967
|
/**
|
|
1052
|
-
*
|
|
1053
|
-
*
|
|
1054
|
-
* - 'anchor': (Default) All fields are searched within the entire scope.
|
|
1055
|
-
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
968
|
+
* Additional Crawlee configuration options.
|
|
969
|
+
* Allows fine-grained control over the underlying Crawlee instance.
|
|
1056
970
|
*/
|
|
1057
|
-
|
|
971
|
+
config?: Record<string, any>;
|
|
972
|
+
}
|
|
973
|
+
interface BaseFetcherProperties {
|
|
1058
974
|
/**
|
|
1059
|
-
*
|
|
1060
|
-
*
|
|
975
|
+
* 抓取模式
|
|
976
|
+
*
|
|
977
|
+
* - `http`: 使用 HTTP 进行抓取
|
|
978
|
+
* - `browser`: 使用浏览器进行抓取
|
|
979
|
+
* - `auto`: auto 会走“智能探测”选择 http 或 browser, 但是如果没有启用 smart,并且在站点注册表中没有,那么则等价为 http.
|
|
1061
980
|
*/
|
|
1062
|
-
|
|
981
|
+
engine?: FetchEngineMode;
|
|
982
|
+
enableSmart?: boolean;
|
|
983
|
+
syncStateOnUpgrade?: boolean;
|
|
984
|
+
upgradeThresholdMs?: number;
|
|
985
|
+
useSiteRegistry?: boolean;
|
|
986
|
+
antibot?: boolean;
|
|
987
|
+
debug?: boolean | string | string[];
|
|
988
|
+
headers?: Record<string, string>;
|
|
989
|
+
cookies?: Cookie[];
|
|
990
|
+
sessionState?: any;
|
|
991
|
+
sessionPoolOptions?: SessionPoolOptions;
|
|
992
|
+
overrideSessionState?: boolean;
|
|
993
|
+
throwHttpErrors?: boolean;
|
|
994
|
+
output?: {
|
|
995
|
+
cookies?: boolean;
|
|
996
|
+
sessionState?: boolean;
|
|
997
|
+
};
|
|
998
|
+
proxy?: string | string[];
|
|
999
|
+
blockResources?: ResourceType[];
|
|
1063
1000
|
/**
|
|
1064
|
-
*
|
|
1001
|
+
* Storage configuration for session isolation and persistence.
|
|
1065
1002
|
*/
|
|
1066
|
-
|
|
1067
|
-
|
|
1003
|
+
storage?: StorageOptions;
|
|
1004
|
+
ignoreSslErrors?: boolean;
|
|
1005
|
+
browser?: {
|
|
1006
|
+
/**
|
|
1007
|
+
* 浏览器引擎,默认为 playwright
|
|
1008
|
+
*
|
|
1009
|
+
* - `playwright`: 使用 Playwright 引擎
|
|
1010
|
+
* - `puppeteer`: 使用 Puppeteer 引擎
|
|
1011
|
+
*/
|
|
1012
|
+
engine?: BrowserEngine;
|
|
1013
|
+
headless?: boolean;
|
|
1014
|
+
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
1015
|
+
launchOptions?: Record<string, any>;
|
|
1016
|
+
};
|
|
1017
|
+
http?: {
|
|
1018
|
+
method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
|
|
1019
|
+
body?: any;
|
|
1068
1020
|
};
|
|
1069
|
-
}
|
|
1070
|
-
|
|
1071
|
-
interface PromiseLock extends Promise<void> {
|
|
1072
|
-
release: () => void;
|
|
1073
|
-
}
|
|
1074
|
-
|
|
1075
|
-
/**
|
|
1076
|
-
* Options for the {@link FetchEngine.goto}, allowing configuration of HTTP method, payload, headers, and navigation behavior.
|
|
1077
|
-
*
|
|
1078
|
-
* @remarks
|
|
1079
|
-
* Used when navigating to a URL to specify additional parameters beyond the basic URL.
|
|
1080
|
-
*
|
|
1081
|
-
* @example
|
|
1082
|
-
* ```ts
|
|
1083
|
-
* await engine.goto('https://example.com', {
|
|
1084
|
-
* method: 'POST',
|
|
1085
|
-
* payload: { username: 'user', password: 'pass' },
|
|
1086
|
-
* headers: { 'Content-Type': 'application/json' },
|
|
1087
|
-
* waitUntil: 'networkidle'
|
|
1088
|
-
* });
|
|
1089
|
-
* ```
|
|
1090
|
-
*/
|
|
1091
|
-
interface GotoActionOptions {
|
|
1092
|
-
method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
|
|
1093
|
-
payload?: any;
|
|
1094
|
-
headers?: Record<string, string>;
|
|
1095
|
-
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
1096
1021
|
timeoutMs?: number;
|
|
1097
|
-
|
|
1022
|
+
requestHandlerTimeoutSecs?: number;
|
|
1023
|
+
maxConcurrency?: number;
|
|
1024
|
+
maxRequestsPerMinute?: number;
|
|
1025
|
+
delayBetweenRequestsMs?: number;
|
|
1026
|
+
retries?: number;
|
|
1027
|
+
sites?: FetchSite[];
|
|
1028
|
+
url?: string;
|
|
1098
1029
|
}
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1030
|
+
interface FetchSite extends BaseFetcherProperties {
|
|
1031
|
+
domain: string;
|
|
1032
|
+
pathScope?: string[];
|
|
1033
|
+
meta?: {
|
|
1034
|
+
updatedAt?: number;
|
|
1035
|
+
ttlMs?: number;
|
|
1036
|
+
source?: 'manual' | 'smart';
|
|
1037
|
+
};
|
|
1038
|
+
}
|
|
1039
|
+
type OnFetchPauseCallback = (options: {
|
|
1040
|
+
message?: string;
|
|
1041
|
+
}) => Promise<void>;
|
|
1042
|
+
interface FetcherOptions extends BaseFetcherProperties {
|
|
1043
|
+
actions?: FetchActionOptions[];
|
|
1044
|
+
onPause?: OnFetchPauseCallback;
|
|
1045
|
+
}
|
|
1046
|
+
interface FetchMetadata {
|
|
1047
|
+
mode: FetchEngineType;
|
|
1048
|
+
engine?: BrowserEngine;
|
|
1049
|
+
timings?: {
|
|
1050
|
+
start: number;
|
|
1051
|
+
total: number;
|
|
1052
|
+
ttfb?: number;
|
|
1053
|
+
dns?: number;
|
|
1054
|
+
tcp?: number;
|
|
1055
|
+
firstByte?: number;
|
|
1056
|
+
download?: number;
|
|
1057
|
+
};
|
|
1058
|
+
proxy?: string;
|
|
1059
|
+
[key: string]: any;
|
|
1060
|
+
}
|
|
1061
|
+
interface FetchResponse {
|
|
1062
|
+
url: string;
|
|
1063
|
+
finalUrl: string;
|
|
1064
|
+
statusCode?: number;
|
|
1065
|
+
statusText?: string;
|
|
1066
|
+
headers: Record<string, string>;
|
|
1067
|
+
contentType?: string;
|
|
1068
|
+
body?: string | Buffer<ArrayBufferLike>;
|
|
1069
|
+
html?: string;
|
|
1070
|
+
text?: string;
|
|
1071
|
+
json?: any;
|
|
1072
|
+
cookies?: Cookie[];
|
|
1073
|
+
sessionState?: any;
|
|
1074
|
+
metadata?: FetchMetadata;
|
|
1110
1075
|
}
|
|
1076
|
+
declare const DefaultFetcherProperties: BaseFetcherProperties;
|
|
1077
|
+
declare const FetcherOptionKeys: string[];
|
|
1078
|
+
|
|
1111
1079
|
/**
|
|
1112
|
-
*
|
|
1080
|
+
* Represents a stateful web fetching session.
|
|
1113
1081
|
*
|
|
1114
1082
|
* @remarks
|
|
1115
|
-
*
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
/**
|
|
1121
|
-
* Predefined cleanup groups for the {@link FetchEngine.trim} action.
|
|
1122
|
-
*/
|
|
1123
|
-
type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
|
|
1124
|
-
/**
|
|
1125
|
-
* Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
|
|
1083
|
+
* A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
|
|
1084
|
+
* cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
|
|
1085
|
+
* session-level configurations and outputs.
|
|
1086
|
+
*
|
|
1087
|
+
* Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
|
|
1126
1088
|
*/
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1089
|
+
declare class FetchSession {
|
|
1090
|
+
protected options: FetcherOptions;
|
|
1091
|
+
/**
|
|
1092
|
+
* Unique identifier for the session.
|
|
1093
|
+
*/
|
|
1094
|
+
readonly id: string;
|
|
1095
|
+
/**
|
|
1096
|
+
* The execution context for this session, containing configurations, event bus, and shared state.
|
|
1097
|
+
*/
|
|
1098
|
+
readonly context: FetchContext;
|
|
1099
|
+
protected closed: boolean;
|
|
1100
|
+
/**
|
|
1101
|
+
* Creates a new FetchSession.
|
|
1102
|
+
*
|
|
1103
|
+
* @param options - Configuration options for the fetcher.
|
|
1104
|
+
*/
|
|
1105
|
+
constructor(options?: FetcherOptions);
|
|
1106
|
+
protected _logDebug(category: string, ...args: any[]): void;
|
|
1107
|
+
/**
|
|
1108
|
+
* Executes a single action within the session.
|
|
1109
|
+
*
|
|
1110
|
+
* @param actionOptions - Configuration for the action to be executed.
|
|
1111
|
+
* @param context - Optional context override for this specific execution. Defaults to the session context.
|
|
1112
|
+
* @returns A promise that resolves to the result of the action.
|
|
1113
|
+
* @template R - The expected return type of the action.
|
|
1114
|
+
*
|
|
1115
|
+
* @example
|
|
1116
|
+
* ```ts
|
|
1117
|
+
* await session.execute({ name: 'goto', params: { url: 'https://example.com' } });
|
|
1118
|
+
* ```
|
|
1119
|
+
*/
|
|
1120
|
+
execute<R extends FetchReturnType = 'response'>(actionOptions: FetchActionOptions, context?: FetchContext): Promise<FetchActionResult<R>>;
|
|
1121
|
+
/**
|
|
1122
|
+
* Executes a sequence of actions.
|
|
1123
|
+
*
|
|
1124
|
+
* @param actions - An array of action options to be executed in order.
|
|
1125
|
+
* @param options - Optional temporary configuration overrides (e.g., timeoutMs, headers) for this batch of actions.
|
|
1126
|
+
* These overrides do not affect the main session context.
|
|
1127
|
+
* @returns A promise that resolves to an object containing the result of the last action and all accumulated outputs.
|
|
1128
|
+
*
|
|
1129
|
+
* @example
|
|
1130
|
+
* ```ts
|
|
1131
|
+
* const { result, outputs } = await session.executeAll([
|
|
1132
|
+
* { name: 'goto', params: { url: 'https://example.com' } },
|
|
1133
|
+
* { name: 'extract', params: { schema: { title: 'h1' } }, storeAs: 'data' }
|
|
1134
|
+
* ], { timeoutMs: 30000 });
|
|
1135
|
+
* ```
|
|
1136
|
+
*/
|
|
1137
|
+
executeAll(actions: FetchActionOptions[], options?: Partial<FetcherOptions> & {
|
|
1138
|
+
index?: number;
|
|
1139
|
+
}): Promise<{
|
|
1140
|
+
result: FetchResponse | undefined;
|
|
1141
|
+
outputs: Record<string, any>;
|
|
1142
|
+
}>;
|
|
1143
|
+
/**
|
|
1144
|
+
* Retrieves all outputs accumulated during the session.
|
|
1145
|
+
*
|
|
1146
|
+
* @returns A record of stored output data.
|
|
1147
|
+
*/
|
|
1148
|
+
getOutputs(): Record<string, any>;
|
|
1149
|
+
/**
|
|
1150
|
+
* Gets the current state of the session, including cookies and engine-specific state.
|
|
1151
|
+
*
|
|
1152
|
+
* @returns A promise resolving to the session state, or undefined if no engine is initialized.
|
|
1153
|
+
*/
|
|
1154
|
+
getState(): Promise<{
|
|
1155
|
+
cookies: Cookie[];
|
|
1156
|
+
sessionState?: any;
|
|
1157
|
+
} | undefined>;
|
|
1158
|
+
/**
|
|
1159
|
+
* Disposes of the session and its associated engine.
|
|
1160
|
+
*
|
|
1161
|
+
* @remarks
|
|
1162
|
+
* This method should be called when the session is no longer needed to free up resources
|
|
1163
|
+
* (e.g., closing browser instances, purging temporary storage).
|
|
1164
|
+
*/
|
|
1165
|
+
dispose(): Promise<void>;
|
|
1166
|
+
private ensureEngine;
|
|
1167
|
+
protected createContext(options?: FetcherOptions): FetchContext;
|
|
1130
1168
|
}
|
|
1131
|
-
|
|
1169
|
+
|
|
1132
1170
|
/**
|
|
1133
|
-
*
|
|
1171
|
+
* High-level entry point for the Web Fetcher library.
|
|
1134
1172
|
*
|
|
1135
1173
|
* @remarks
|
|
1136
|
-
*
|
|
1137
|
-
*
|
|
1138
|
-
* **Execution Environments:**
|
|
1139
|
-
* - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
|
|
1140
|
-
* - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
|
|
1141
|
-
* including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
|
|
1142
|
-
*
|
|
1143
|
-
* **Navigation Handling:**
|
|
1144
|
-
* If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
|
|
1145
|
-
* automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
|
|
1146
|
-
*
|
|
1147
|
-
* @example
|
|
1148
|
-
* ```json
|
|
1149
|
-
* {
|
|
1150
|
-
* "action": "evaluate",
|
|
1151
|
-
* "params": {
|
|
1152
|
-
* "fn": "([a, b]) => a + b",
|
|
1153
|
-
* "args": [1, 2]
|
|
1154
|
-
* }
|
|
1155
|
-
* }
|
|
1156
|
-
* ```
|
|
1174
|
+
* The `WebFetcher` provides a simplified API for fetching web content without manually managing sessions.
|
|
1175
|
+
* It can be used for one-off requests or as a factory for more complex `FetchSession` instances.
|
|
1157
1176
|
*
|
|
1158
1177
|
* @example
|
|
1159
|
-
* ```
|
|
1160
|
-
*
|
|
1161
|
-
*
|
|
1162
|
-
* "params": {
|
|
1163
|
-
* "fn": "({ x, y }) => x * y",
|
|
1164
|
-
* "args": { "x": 6, "y": 7 }
|
|
1165
|
-
* }
|
|
1166
|
-
* }
|
|
1178
|
+
* ```ts
|
|
1179
|
+
* const fetcher = new WebFetcher();
|
|
1180
|
+
* const { result } = await fetcher.fetch('https://example.com');
|
|
1167
1181
|
* ```
|
|
1168
1182
|
*/
|
|
1169
|
-
|
|
1183
|
+
declare class WebFetcher {
|
|
1184
|
+
private defaults;
|
|
1170
1185
|
/**
|
|
1171
|
-
*
|
|
1186
|
+
* Creates a new WebFetcher with default options.
|
|
1172
1187
|
*
|
|
1173
|
-
* @
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
*
|
|
1188
|
+
* @param defaults - Default configuration options applied to all sessions and requests.
|
|
1189
|
+
*/
|
|
1190
|
+
constructor(defaults?: FetcherOptions);
|
|
1191
|
+
/**
|
|
1192
|
+
* Creates a new FetchSession.
|
|
1178
1193
|
*
|
|
1179
|
-
*
|
|
1180
|
-
*
|
|
1194
|
+
* @param options - Configuration options for the session, merged with defaults.
|
|
1195
|
+
* @returns A promise resolving to a new FetchSession instance.
|
|
1181
1196
|
*/
|
|
1182
|
-
|
|
1197
|
+
createSession(options?: FetcherOptions): Promise<FetchSession>;
|
|
1183
1198
|
/**
|
|
1184
|
-
*
|
|
1199
|
+
* Fetches content from a URL or executes a complex action script.
|
|
1185
1200
|
*
|
|
1186
1201
|
* @remarks
|
|
1187
|
-
* This
|
|
1188
|
-
*
|
|
1202
|
+
* This method automatically creates a session, executes the specified actions,
|
|
1203
|
+
* retrieves the content, and disposes of the session.
|
|
1204
|
+
*
|
|
1205
|
+
* @param url - The target URL or a complete FetcherOptions object.
|
|
1206
|
+
* @param options - Additional options when the first parameter is a URL string.
|
|
1207
|
+
* @returns A promise resolving to the final response and any extracted outputs.
|
|
1189
1208
|
*/
|
|
1190
|
-
|
|
1209
|
+
fetch(url: string, options?: FetcherOptions): Promise<{
|
|
1210
|
+
result: FetchResponse | undefined;
|
|
1211
|
+
outputs: Record<string, any>;
|
|
1212
|
+
}>;
|
|
1213
|
+
fetch(options: FetcherOptions): Promise<{
|
|
1214
|
+
result: FetchResponse | undefined;
|
|
1215
|
+
outputs: Record<string, any>;
|
|
1216
|
+
}>;
|
|
1191
1217
|
}
|
|
1218
|
+
|
|
1192
1219
|
/**
|
|
1193
|
-
*
|
|
1220
|
+
* Represents the engine-specific execution scope (e.g., a Cheerio node or a Playwright Locator).
|
|
1221
|
+
* It acts as the target for extraction and interaction actions.
|
|
1222
|
+
*/
|
|
1223
|
+
type FetchElementScope = any;
|
|
1224
|
+
/**
|
|
1225
|
+
* Interface representing the minimal engine capabilities required for extraction.
|
|
1194
1226
|
*
|
|
1195
1227
|
* @remarks
|
|
1196
|
-
*
|
|
1197
|
-
*
|
|
1228
|
+
* This interface abstracts the underlying DOM manipulation library (Cheerio or Playwright).
|
|
1229
|
+
* Implementing classes must ensure consistent behavior across different engines, especially
|
|
1230
|
+
* regarding scope handling (Element vs Array of Elements) and DOM traversal.
|
|
1198
1231
|
*/
|
|
1199
|
-
|
|
1200
|
-
type: 'click';
|
|
1201
|
-
selector: string;
|
|
1202
|
-
} | {
|
|
1203
|
-
type: 'fill';
|
|
1204
|
-
selector: string;
|
|
1205
|
-
value: string;
|
|
1206
|
-
} | {
|
|
1207
|
-
type: 'mouseMove';
|
|
1208
|
-
params: {
|
|
1209
|
-
x?: number;
|
|
1210
|
-
y?: number;
|
|
1211
|
-
selector?: string;
|
|
1212
|
-
steps?: number;
|
|
1213
|
-
};
|
|
1214
|
-
} | {
|
|
1215
|
-
type: 'mouseClick';
|
|
1216
|
-
params: {
|
|
1217
|
-
x?: number;
|
|
1218
|
-
y?: number;
|
|
1219
|
-
button?: 'left' | 'right' | 'middle';
|
|
1220
|
-
clickCount?: number;
|
|
1221
|
-
delay?: number;
|
|
1222
|
-
};
|
|
1223
|
-
} | {
|
|
1224
|
-
type: 'keyboardType';
|
|
1225
|
-
params: {
|
|
1226
|
-
text: string;
|
|
1227
|
-
delay?: number;
|
|
1228
|
-
};
|
|
1229
|
-
} | {
|
|
1230
|
-
type: 'keyboardPress';
|
|
1231
|
-
params: {
|
|
1232
|
-
key: string;
|
|
1233
|
-
delay?: number;
|
|
1234
|
-
};
|
|
1235
|
-
} | {
|
|
1236
|
-
type: 'waitFor';
|
|
1237
|
-
options?: WaitForActionOptions;
|
|
1238
|
-
} | {
|
|
1239
|
-
type: 'submit';
|
|
1240
|
-
selector?: any;
|
|
1241
|
-
options?: SubmitActionOptions;
|
|
1242
|
-
} | {
|
|
1243
|
-
type: 'getContent';
|
|
1244
|
-
} | {
|
|
1245
|
-
type: 'navigate';
|
|
1246
|
-
url: string;
|
|
1247
|
-
opts?: GotoActionOptions;
|
|
1248
|
-
} | {
|
|
1249
|
-
type: 'extract';
|
|
1250
|
-
schema: ExtractSchema;
|
|
1251
|
-
} | {
|
|
1252
|
-
type: 'pause';
|
|
1253
|
-
message?: string;
|
|
1254
|
-
} | {
|
|
1255
|
-
type: 'trim';
|
|
1256
|
-
options: TrimActionOptions;
|
|
1257
|
-
} | {
|
|
1258
|
-
type: 'evaluate';
|
|
1259
|
-
params: EvaluateActionOptions;
|
|
1260
|
-
} | {
|
|
1261
|
-
type: 'dispose';
|
|
1262
|
-
};
|
|
1263
|
-
/**
|
|
1264
|
-
* Represents an action that has been dispatched and is awaiting execution in the active page context.
|
|
1265
|
-
*
|
|
1266
|
-
* @remarks
|
|
1267
|
-
* Connects the action request with its resolution mechanism. Used internally by the action dispatch system
|
|
1268
|
-
* to handle promises while maintaining the page context validity window.
|
|
1269
|
-
*/
|
|
1270
|
-
interface DispatchedEngineAction {
|
|
1271
|
-
action: FetchEngineAction;
|
|
1272
|
-
resolve: (value?: any) => void;
|
|
1273
|
-
reject: (reason?: any) => void;
|
|
1274
|
-
}
|
|
1275
|
-
/**
|
|
1276
|
-
* Represents a pending navigation request awaiting resolution.
|
|
1277
|
-
*
|
|
1278
|
-
* @remarks
|
|
1279
|
-
* Tracks navigation requests that have been queued but not yet processed by the request handler.
|
|
1280
|
-
*/
|
|
1281
|
-
interface PendingEngineRequest {
|
|
1282
|
-
resolve: (value: any) => void;
|
|
1283
|
-
reject: (reason?: any) => void;
|
|
1284
|
-
}
|
|
1285
|
-
/**
|
|
1286
|
-
* Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
|
|
1287
|
-
*
|
|
1288
|
-
* @remarks
|
|
1289
|
-
* The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
|
|
1290
|
-
* `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
|
|
1291
|
-
* content retrieval, and user interaction.
|
|
1292
|
-
*
|
|
1293
|
-
* The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
|
|
1294
|
-
* the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
|
|
1295
|
-
* page context validity across asynchronous operations.
|
|
1296
|
-
*
|
|
1297
|
-
* @example
|
|
1298
|
-
* ```ts
|
|
1299
|
-
* import "./playwright"; // 引入注册 Playwright browser 引擎
|
|
1300
|
-
* const engine = await FetchEngine.create(context, { engine: 'browser' });
|
|
1301
|
-
* await engine.goto('https://example.com');
|
|
1302
|
-
* await engine.fill('#username', 'user');
|
|
1303
|
-
* await engine.click('#submit');
|
|
1304
|
-
* const response = await engine.getContent();
|
|
1305
|
-
* ```
|
|
1306
|
-
*/
|
|
1307
|
-
type AnyFetchEngine = FetchEngine<any, any, any>;
|
|
1308
|
-
type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
|
|
1309
|
-
declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
|
|
1310
|
-
private static registry;
|
|
1232
|
+
interface IExtractEngine {
|
|
1311
1233
|
/**
|
|
1312
|
-
*
|
|
1234
|
+
* Finds all elements matching the selector within the given scope.
|
|
1313
1235
|
*
|
|
1314
|
-
* @param
|
|
1315
|
-
* @
|
|
1236
|
+
* @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
|
|
1237
|
+
* @param selector - The CSS selector to match.
|
|
1238
|
+
* @returns A promise resolving to an array of found element scopes.
|
|
1316
1239
|
*
|
|
1317
|
-
* @
|
|
1318
|
-
*
|
|
1319
|
-
*
|
|
1320
|
-
*
|
|
1240
|
+
* @remarks
|
|
1241
|
+
* **Behavior Contract:**
|
|
1242
|
+
* 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
|
|
1243
|
+
* 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
|
|
1244
|
+
* 3. **Array Scope**: If `scope` is an array:
|
|
1245
|
+
* - It MUST process elements in the order they appear in the array (which should match document order).
|
|
1246
|
+
* - It MUST perform the check (Self + Descendants) for *each* element in the array.
|
|
1247
|
+
* - It MUST flatten the results into a single array.
|
|
1248
|
+
* - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
|
|
1249
|
+
* but generally, preserving document order is the priority.
|
|
1321
1250
|
*/
|
|
1322
|
-
|
|
1251
|
+
_querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
1323
1252
|
/**
|
|
1324
|
-
*
|
|
1253
|
+
* Extracts a primitive value from the element based on the schema configuration.
|
|
1325
1254
|
*
|
|
1326
|
-
* @param
|
|
1327
|
-
* @
|
|
1255
|
+
* @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
|
|
1256
|
+
* @param scope - The specific element to extract data from.
|
|
1257
|
+
* @returns A promise resolving to the extracted value (string, number, boolean, or null).
|
|
1258
|
+
*
|
|
1259
|
+
* @remarks
|
|
1260
|
+
* **Behavior Contract:**
|
|
1261
|
+
* - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
|
|
1262
|
+
* - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
|
|
1263
|
+
* - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
|
|
1264
|
+
* - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
|
|
1265
|
+
* - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
|
|
1328
1266
|
*/
|
|
1329
|
-
|
|
1267
|
+
_extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
|
|
1330
1268
|
/**
|
|
1331
|
-
*
|
|
1269
|
+
* Gets the parent element of the given scope.
|
|
1332
1270
|
*
|
|
1333
|
-
* @param
|
|
1334
|
-
* @returns
|
|
1271
|
+
* @param scope - The element to find the parent of.
|
|
1272
|
+
* @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
|
|
1335
1273
|
*/
|
|
1336
|
-
|
|
1274
|
+
_parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1337
1275
|
/**
|
|
1338
|
-
*
|
|
1276
|
+
* Checks if two element scopes refer to the exact same DOM node.
|
|
1339
1277
|
*
|
|
1340
|
-
* @param
|
|
1341
|
-
* @param
|
|
1342
|
-
* @returns
|
|
1343
|
-
* @throws {Error} When no suitable engine implementation is found
|
|
1278
|
+
* @param scope1 - The first element scope.
|
|
1279
|
+
* @param scope2 - The second element scope.
|
|
1280
|
+
* @returns A promise resolving to `true` if they are the same node, `false` otherwise.
|
|
1344
1281
|
*
|
|
1345
1282
|
* @remarks
|
|
1346
|
-
*
|
|
1283
|
+
* This comparison MUST be identity-based, not just content-based.
|
|
1347
1284
|
*/
|
|
1348
|
-
|
|
1285
|
+
_isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
|
|
1349
1286
|
/**
|
|
1350
|
-
*
|
|
1287
|
+
* Retrieves all subsequent sibling elements of the `scope` element, stopping *before* the first sibling that matches `untilSelector`.
|
|
1288
|
+
*
|
|
1289
|
+
* @param scope - The anchor element (starting point). The returned list starts *after* this element.
|
|
1290
|
+
* @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
|
|
1291
|
+
* If omitted or null, returns all following siblings.
|
|
1292
|
+
* @returns A promise resolving to an array of sibling element scopes.
|
|
1351
1293
|
*
|
|
1352
1294
|
* @remarks
|
|
1353
|
-
*
|
|
1295
|
+
* **Behavior Contract:**
|
|
1296
|
+
* - **Starting Point**: The `scope` element itself IS NOT included in the result.
|
|
1297
|
+
* - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
|
|
1298
|
+
* - **Direction**: Only scans *following* siblings (next siblings).
|
|
1299
|
+
* - **Flattening**: The result is a flat list of siblings, not a nested structure.
|
|
1354
1300
|
*/
|
|
1355
|
-
|
|
1301
|
+
_nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
1356
1302
|
/**
|
|
1357
|
-
*
|
|
1303
|
+
* Finds the closest ancestor of the `scope` element (including the element itself) that is present in the `candidates` array.
|
|
1304
|
+
*
|
|
1305
|
+
* @param scope - The starting element from which to ascend the DOM tree.
|
|
1306
|
+
* @param candidates - An array of potential ancestor elements to check against.
|
|
1307
|
+
* @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
|
|
1358
1308
|
*
|
|
1359
1309
|
* @remarks
|
|
1360
|
-
*
|
|
1361
|
-
|
|
1362
|
-
static readonly mode: FetchEngineType;
|
|
1363
|
-
protected ctx?: FetchEngineContext;
|
|
1364
|
-
protected opts?: BaseFetcherProperties;
|
|
1365
|
-
protected crawler?: TCrawler;
|
|
1366
|
-
protected isCrawlerReady?: boolean;
|
|
1367
|
-
protected crawlerRunPromise?: Promise<FinalStatistics>;
|
|
1368
|
-
protected config?: Configuration;
|
|
1369
|
-
protected requestQueue?: RequestQueue;
|
|
1370
|
-
protected kvStore?: KeyValueStore;
|
|
1371
|
-
protected proxyConfiguration?: ProxyConfiguration;
|
|
1372
|
-
protected hdrs: Record<string, string>;
|
|
1373
|
-
protected _initialCookies?: Cookie[];
|
|
1374
|
-
protected _initializedSessions: Set<string>;
|
|
1375
|
-
protected currentSession?: Session;
|
|
1376
|
-
protected pendingRequests: Map<string, PendingEngineRequest>;
|
|
1377
|
-
protected requestCounter: number;
|
|
1378
|
-
protected actionEmitter: EventEmitter;
|
|
1379
|
-
protected isPageActive: boolean;
|
|
1380
|
-
protected isEngineDisposed: boolean;
|
|
1381
|
-
protected navigationLock: PromiseLock;
|
|
1382
|
-
protected activeContext?: TContext;
|
|
1383
|
-
protected isExecutingAction: boolean;
|
|
1384
|
-
protected lastResponse?: FetchResponse;
|
|
1385
|
-
protected actionQueue: DispatchedEngineAction[];
|
|
1386
|
-
protected isProcessingActionLoop: boolean;
|
|
1387
|
-
protected blockedTypes: Set<string>;
|
|
1388
|
-
_logDebug(category: string, ...args: any[]): void;
|
|
1389
|
-
protected _cleanup?(): Promise<void>;
|
|
1390
|
-
protected _getTrimInfo(options: TrimActionOptions): {
|
|
1391
|
-
selectors: string[];
|
|
1392
|
-
removeComments: boolean;
|
|
1393
|
-
removeHidden: boolean;
|
|
1394
|
-
};
|
|
1395
|
-
/**
|
|
1396
|
-
* Finds all elements matching the selector within the given scope.
|
|
1310
|
+
* **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
|
|
1311
|
+
* It effectively answers: "Which of these container candidates does my current element belong to?"
|
|
1397
1312
|
*
|
|
1398
|
-
*
|
|
1399
|
-
*
|
|
1400
|
-
*
|
|
1401
|
-
* @see {@link IExtractEngine._querySelectorAll} for behavior contract.
|
|
1402
|
-
* @internal
|
|
1313
|
+
* **Implementation Guidelines**:
|
|
1314
|
+
* - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
|
|
1315
|
+
* - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
|
|
1403
1316
|
*/
|
|
1404
|
-
|
|
1317
|
+
_findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
|
|
1405
1318
|
/**
|
|
1406
|
-
*
|
|
1319
|
+
* Checks if the `container` element contains the `element` (descendant).
|
|
1407
1320
|
*
|
|
1408
|
-
* @param
|
|
1409
|
-
* @param
|
|
1410
|
-
* @returns
|
|
1411
|
-
*
|
|
1412
|
-
* @
|
|
1321
|
+
* @param container - The potential ancestor element.
|
|
1322
|
+
* @param element - The potential descendant element.
|
|
1323
|
+
* @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
|
|
1324
|
+
*
|
|
1325
|
+
* @remarks
|
|
1326
|
+
* **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
|
|
1327
|
+
*
|
|
1328
|
+
* @performance-critical Used extensively in boundary checks for Segmented extraction.
|
|
1329
|
+
* - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
|
|
1330
|
+
* - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
|
|
1413
1331
|
*/
|
|
1414
|
-
|
|
1332
|
+
_contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
|
|
1415
1333
|
/**
|
|
1416
|
-
*
|
|
1334
|
+
* Finds the Lowest Common Ancestor (LCA) of two element scopes.
|
|
1417
1335
|
*
|
|
1418
|
-
* @param
|
|
1419
|
-
* @
|
|
1420
|
-
* @
|
|
1336
|
+
* @param scope1 - The first element.
|
|
1337
|
+
* @param scope2 - The second element.
|
|
1338
|
+
* @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
|
|
1339
|
+
*
|
|
1340
|
+
* @remarks
|
|
1341
|
+
* This is a fundamental tree operation used to find the point where two element paths diverge.
|
|
1342
|
+
* **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
|
|
1421
1343
|
*/
|
|
1422
|
-
|
|
1344
|
+
_findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1423
1345
|
/**
|
|
1424
|
-
*
|
|
1346
|
+
* Finds the direct child of the `container` that contains the `element` (or is the `element` itself).
|
|
1425
1347
|
*
|
|
1426
|
-
* @param
|
|
1427
|
-
* @param
|
|
1428
|
-
* @returns
|
|
1429
|
-
*
|
|
1348
|
+
* @param element - The descendant element.
|
|
1349
|
+
* @param container - The ancestor container.
|
|
1350
|
+
* @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
|
|
1351
|
+
*
|
|
1352
|
+
* @remarks
|
|
1353
|
+
* This method traverses up from `element` until it finds the node whose parent is `container`.
|
|
1354
|
+
* **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
|
|
1430
1355
|
*/
|
|
1431
|
-
|
|
1356
|
+
_findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1432
1357
|
/**
|
|
1433
|
-
*
|
|
1434
|
-
*
|
|
1435
|
-
*
|
|
1436
|
-
* @param scope - The anchor element scope.
|
|
1437
|
-
* @param untilSelector - Optional selector that marks the end of the segment (exclusive).
|
|
1438
|
-
* @returns List of sibling elements between anchor and untilSelector.
|
|
1439
|
-
* @internal
|
|
1358
|
+
* Logs debug information if debug mode is enabled.
|
|
1359
|
+
* @param category - The category of the log message.
|
|
1360
|
+
* @param args - Arguments to log.
|
|
1440
1361
|
*/
|
|
1441
|
-
|
|
1362
|
+
_logDebug(category: string, ...args: any[]): void;
|
|
1363
|
+
}
|
|
1364
|
+
/**
|
|
1365
|
+
* Base configuration for all extraction schemas.
|
|
1366
|
+
*/
|
|
1367
|
+
interface BaseExtractSchema {
|
|
1442
1368
|
/**
|
|
1443
|
-
*
|
|
1444
|
-
*
|
|
1445
|
-
* @param scope - The starting element.
|
|
1446
|
-
* @param candidates - The array of potential ancestor scopes.
|
|
1447
|
-
* @returns A promise resolving to the matching candidate scope, or `null` if none found.
|
|
1448
|
-
* @see {@link IExtractEngine._findClosestAncestor} for implementation details.
|
|
1449
|
-
* @internal
|
|
1369
|
+
* Whether this field is required. If true and the value is null,
|
|
1370
|
+
* the containing object or array item will be skipped (or throw error in strict mode).
|
|
1450
1371
|
*/
|
|
1451
|
-
|
|
1372
|
+
required?: boolean;
|
|
1452
1373
|
/**
|
|
1453
|
-
*
|
|
1454
|
-
*
|
|
1455
|
-
* @param container - The potential ancestor element.
|
|
1456
|
-
* @param element - The potential descendant element.
|
|
1457
|
-
* @returns A promise resolving to `true` if `container` contains `element`.
|
|
1458
|
-
* @see {@link IExtractEngine._contains} for implementation details.
|
|
1459
|
-
* @internal
|
|
1374
|
+
* Whether to enable strict mode for this extraction.
|
|
1375
|
+
* If true, missing required fields will throw an error instead of being skipped.
|
|
1460
1376
|
*/
|
|
1461
|
-
|
|
1377
|
+
strict?: boolean;
|
|
1462
1378
|
/**
|
|
1463
|
-
*
|
|
1379
|
+
* Specifies the starting anchor for extraction of this field.
|
|
1380
|
+
* - Field Name: Uses the DOM element of a previously extracted field as the anchor.
|
|
1381
|
+
* - CSS Selector: Re-queries the selector within the current context to find the anchor.
|
|
1464
1382
|
*
|
|
1465
|
-
*
|
|
1466
|
-
* @param scope2 - The second element scope.
|
|
1467
|
-
* @returns A promise resolving to the LCA element scope, or `null` if none found.
|
|
1468
|
-
* @internal
|
|
1383
|
+
* Once anchored, the search scope for this field becomes the siblings following the anchor.
|
|
1469
1384
|
*/
|
|
1470
|
-
|
|
1385
|
+
anchor?: string;
|
|
1471
1386
|
/**
|
|
1472
|
-
*
|
|
1473
|
-
*
|
|
1474
|
-
*
|
|
1475
|
-
*
|
|
1476
|
-
* @returns The child element of container, or null.
|
|
1477
|
-
* @internal
|
|
1387
|
+
* The maximum number of levels to bubble up from the anchor or matched element.
|
|
1388
|
+
* - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
|
|
1389
|
+
* - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
|
|
1390
|
+
* - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
|
|
1478
1391
|
*/
|
|
1479
|
-
|
|
1480
|
-
|
|
1392
|
+
depth?: number;
|
|
1393
|
+
}
|
|
1394
|
+
/**
|
|
1395
|
+
* Extraction schema types.
|
|
1396
|
+
*/
|
|
1397
|
+
type ExtractSchema = ExtractObjectSchema | ExtractArraySchema | ExtractValueSchema;
|
|
1398
|
+
/**
|
|
1399
|
+
* Configuration for extracting a single value.
|
|
1400
|
+
*/
|
|
1401
|
+
interface ExtractValueSchema extends BaseExtractSchema {
|
|
1481
1402
|
/**
|
|
1482
|
-
*
|
|
1483
|
-
* @
|
|
1484
|
-
* @internal
|
|
1403
|
+
* The data type to cast the extracted value to.
|
|
1404
|
+
* @default 'string'
|
|
1485
1405
|
*/
|
|
1486
|
-
|
|
1487
|
-
type: ExtractArrayModeName;
|
|
1488
|
-
} & any;
|
|
1406
|
+
type?: 'string' | 'number' | 'boolean' | 'html';
|
|
1489
1407
|
/**
|
|
1490
|
-
*
|
|
1491
|
-
*
|
|
1492
|
-
*
|
|
1493
|
-
*
|
|
1408
|
+
* Extraction behavior mode.
|
|
1409
|
+
* - 'text': (Default) Uses textContent.
|
|
1410
|
+
* - 'innerText': Uses rendered text (respects CSS line breaks).
|
|
1411
|
+
* - 'html': Returns innerHTML.
|
|
1412
|
+
* - 'outerHTML': Returns HTML including the element's tag.
|
|
1494
1413
|
*/
|
|
1495
|
-
|
|
1496
|
-
strict?: boolean;
|
|
1497
|
-
}): Promise<any[]>;
|
|
1414
|
+
mode?: 'text' | 'innerText' | 'html' | 'outerHTML';
|
|
1498
1415
|
/**
|
|
1499
|
-
*
|
|
1500
|
-
*
|
|
1501
|
-
* @param schema - The schema for a single item (must be an object or implicit object).
|
|
1502
|
-
* @param container - The container element to search within.
|
|
1503
|
-
* @param opts - Columnar extraction options (strict, inference).
|
|
1504
|
-
* @returns An array of extracted items, or null if requirements aren't met.
|
|
1505
|
-
* @internal
|
|
1416
|
+
* CSS selector to locate the element within the current context.
|
|
1506
1417
|
*/
|
|
1507
|
-
|
|
1418
|
+
selector?: string;
|
|
1508
1419
|
/**
|
|
1509
|
-
*
|
|
1510
|
-
*
|
|
1511
|
-
* @param schema - The schema for a single item (must be an object).
|
|
1512
|
-
* @param container - The container element to scan.
|
|
1513
|
-
* @param opts - Segmented extraction options (anchor).
|
|
1514
|
-
* @returns An array of extracted items.
|
|
1515
|
-
* @internal
|
|
1420
|
+
* Attribute name to extract (e.g., 'href', 'src').
|
|
1421
|
+
* If omitted, the text content or HTML is extracted based on `type`.
|
|
1516
1422
|
*/
|
|
1517
|
-
|
|
1423
|
+
attribute?: string;
|
|
1518
1424
|
/**
|
|
1519
|
-
*
|
|
1520
|
-
* @param options - The final crawler options.
|
|
1521
|
-
* @internal
|
|
1425
|
+
* Filter elements that contain a descendant matching this CSS selector.
|
|
1522
1426
|
*/
|
|
1523
|
-
|
|
1427
|
+
has?: string;
|
|
1524
1428
|
/**
|
|
1525
|
-
*
|
|
1526
|
-
* @param ctx - The fetch engine context.
|
|
1527
|
-
* @internal
|
|
1429
|
+
* Exclude elements matching this CSS selector.
|
|
1528
1430
|
*/
|
|
1529
|
-
|
|
1431
|
+
exclude?: string;
|
|
1432
|
+
}
|
|
1433
|
+
/**
|
|
1434
|
+
* Names of the supported array extraction modes.
|
|
1435
|
+
*/
|
|
1436
|
+
type ExtractArrayModeName = 'nested' | 'columnar' | 'segmented';
|
|
1437
|
+
/**
|
|
1438
|
+
* Base options for array extraction modes.
|
|
1439
|
+
*/
|
|
1440
|
+
interface BaseModeOptions {
|
|
1441
|
+
type: ExtractArrayModeName;
|
|
1530
1442
|
/**
|
|
1531
|
-
*
|
|
1532
|
-
*
|
|
1533
|
-
* @param context - Crawlee crawling context
|
|
1534
|
-
* @returns Promise resolving to [FetchResponse] object
|
|
1535
|
-
*
|
|
1536
|
-
* @remarks
|
|
1537
|
-
* Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
|
|
1538
|
-
* @internal
|
|
1443
|
+
* Whether to enable strict mode for this specific array mode.
|
|
1444
|
+
* @default false
|
|
1539
1445
|
*/
|
|
1540
|
-
|
|
1541
|
-
|
|
1446
|
+
strict?: boolean;
|
|
1447
|
+
}
|
|
1448
|
+
/**
|
|
1449
|
+
* Options for columnar (column-alignment) extraction.
|
|
1450
|
+
*/
|
|
1451
|
+
interface ColumnarOptions extends BaseModeOptions {
|
|
1452
|
+
type: 'columnar';
|
|
1542
1453
|
/**
|
|
1543
|
-
*
|
|
1544
|
-
*
|
|
1545
|
-
* @
|
|
1546
|
-
* @param action - Action to execute
|
|
1547
|
-
* @returns Promise resolving to action result
|
|
1548
|
-
*
|
|
1549
|
-
* @remarks
|
|
1550
|
-
* Handles specific user interactions using underlying technology (Playwright/Cheerio).
|
|
1551
|
-
* @internal
|
|
1454
|
+
* Whether to enable heuristic inference.
|
|
1455
|
+
* If true, tries to find a common parent to infer item wrappers when counts mismatch.
|
|
1456
|
+
* @default false
|
|
1552
1457
|
*/
|
|
1553
|
-
|
|
1458
|
+
inference?: boolean;
|
|
1459
|
+
}
|
|
1460
|
+
/**
|
|
1461
|
+
* Options for segmented (anchor-based) extraction.
|
|
1462
|
+
*/
|
|
1463
|
+
interface SegmentedOptions extends BaseModeOptions {
|
|
1464
|
+
type: 'segmented';
|
|
1554
1465
|
/**
|
|
1555
|
-
*
|
|
1556
|
-
*
|
|
1557
|
-
* @param url - Target URL
|
|
1558
|
-
* @param params - Navigation options
|
|
1559
|
-
* @returns Promise resolving when navigation completes
|
|
1560
|
-
*
|
|
1561
|
-
* @example
|
|
1562
|
-
* ```ts
|
|
1563
|
-
* await engine.goto('https://example.com');
|
|
1564
|
-
* ```
|
|
1466
|
+
* The name of the field in `items` to use as a segment anchor, or a direct CSS selector.
|
|
1467
|
+
* Defaults to the selector of the first property defined in `items`.
|
|
1565
1468
|
*/
|
|
1566
|
-
|
|
1469
|
+
anchor?: string;
|
|
1567
1470
|
/**
|
|
1568
|
-
*
|
|
1569
|
-
*
|
|
1570
|
-
*
|
|
1571
|
-
* @returns Promise resolving when wait condition is met
|
|
1572
|
-
*
|
|
1573
|
-
* @example
|
|
1574
|
-
* ```ts
|
|
1575
|
-
* await engine.waitFor({ ms: 1000 }); // Wait 1 second
|
|
1576
|
-
* await engine.waitFor({ selector: '#content' }); // Wait for element
|
|
1577
|
-
* ```
|
|
1471
|
+
* Where to start searching for fields within each segment.
|
|
1472
|
+
* - 'anchor': (Default) All fields are searched within the entire segment.
|
|
1473
|
+
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
1578
1474
|
*/
|
|
1579
|
-
|
|
1475
|
+
relativeTo?: 'anchor' | 'previous';
|
|
1580
1476
|
/**
|
|
1581
|
-
*
|
|
1582
|
-
*
|
|
1583
|
-
* @param selector - CSS selector of element to click
|
|
1584
|
-
* @returns Promise resolving when click is processed
|
|
1585
|
-
* @throws {Error} When no active page context exists
|
|
1477
|
+
* The maximum number of levels to bubble up from the anchor to find a segment container.
|
|
1478
|
+
* If omitted, it bubbles up as high as possible without conflicting with neighboring segments.
|
|
1586
1479
|
*/
|
|
1587
|
-
|
|
1480
|
+
depth?: number;
|
|
1481
|
+
}
|
|
1482
|
+
/**
|
|
1483
|
+
* Union type for array extraction modes and their options.
|
|
1484
|
+
*/
|
|
1485
|
+
type ExtractArrayMode = ExtractArrayModeName | ColumnarOptions | SegmentedOptions;
|
|
1486
|
+
/**
|
|
1487
|
+
* Configuration for extracting an array of items.
|
|
1488
|
+
*/
|
|
1489
|
+
interface ExtractArraySchema extends BaseExtractSchema {
|
|
1490
|
+
type: 'array';
|
|
1588
1491
|
/**
|
|
1589
|
-
*
|
|
1590
|
-
*
|
|
1591
|
-
* @param params - Move parameters (x, y, selector, steps)
|
|
1492
|
+
* CSS selector for items (in 'nested' mode) or the container (in 'columnar'/'segmented' modes).
|
|
1592
1493
|
*/
|
|
1593
|
-
|
|
1594
|
-
x?: number;
|
|
1595
|
-
y?: number;
|
|
1596
|
-
selector?: string;
|
|
1597
|
-
steps?: number;
|
|
1598
|
-
}): Promise<void>;
|
|
1599
|
-
/**
|
|
1600
|
-
* Clicks at current position or specified position.
|
|
1601
|
-
*
|
|
1602
|
-
* @param params - Click parameters (x, y, button, clickCount, delay)
|
|
1603
|
-
*/
|
|
1604
|
-
mouseClick(params: {
|
|
1605
|
-
x?: number;
|
|
1606
|
-
y?: number;
|
|
1607
|
-
button?: 'left' | 'right' | 'middle';
|
|
1608
|
-
clickCount?: number;
|
|
1609
|
-
delay?: number;
|
|
1610
|
-
}): Promise<void>;
|
|
1611
|
-
/**
|
|
1612
|
-
* Types text into current focused element.
|
|
1613
|
-
*
|
|
1614
|
-
* @param text - Text to type
|
|
1615
|
-
* @param delay - Delay between key presses
|
|
1616
|
-
*/
|
|
1617
|
-
keyboardType(text: string, delay?: number): Promise<void>;
|
|
1494
|
+
selector: string;
|
|
1618
1495
|
/**
|
|
1619
|
-
*
|
|
1620
|
-
*
|
|
1621
|
-
* @param key - Key to press
|
|
1622
|
-
* @param delay - Delay after key press
|
|
1496
|
+
* Filter items/containers that contain a descendant matching this CSS selector.
|
|
1623
1497
|
*/
|
|
1624
|
-
|
|
1498
|
+
has?: string;
|
|
1625
1499
|
/**
|
|
1626
|
-
*
|
|
1627
|
-
*
|
|
1628
|
-
* @param selector - CSS selector of input element
|
|
1629
|
-
* @param value - Value to fill
|
|
1630
|
-
* @returns Promise resolving when fill operation completes
|
|
1631
|
-
* @throws {Error} When no active page context exists
|
|
1500
|
+
* Exclude items/containers matching this CSS selector.
|
|
1632
1501
|
*/
|
|
1633
|
-
|
|
1502
|
+
exclude?: string;
|
|
1634
1503
|
/**
|
|
1635
|
-
*
|
|
1636
|
-
*
|
|
1637
|
-
* @param selector - Optional form/submit button selector
|
|
1638
|
-
* @param options - Submission options
|
|
1639
|
-
* @returns Promise resolving when form is submitted
|
|
1640
|
-
* @throws {Error} When no active page context exists
|
|
1504
|
+
* Schema applied recursively to each extracted item.
|
|
1505
|
+
* If omitted, defaults to extracting text.
|
|
1641
1506
|
*/
|
|
1642
|
-
|
|
1507
|
+
items?: ExtractSchema;
|
|
1643
1508
|
/**
|
|
1644
|
-
*
|
|
1645
|
-
*
|
|
1646
|
-
* @param options - Trim options specifying selectors and presets
|
|
1647
|
-
* @returns Promise resolving when trim operation completes
|
|
1648
|
-
* @throws {Error} When no active page context exists
|
|
1509
|
+
* Shortcut for `items` to extract a specific attribute directly.
|
|
1649
1510
|
*/
|
|
1650
|
-
|
|
1511
|
+
attribute?: string;
|
|
1651
1512
|
/**
|
|
1652
|
-
*
|
|
1653
|
-
*
|
|
1654
|
-
*
|
|
1655
|
-
*
|
|
1656
|
-
* @throws {Error} When no active page context exists
|
|
1513
|
+
* Array extraction mode.
|
|
1514
|
+
* - 'nested': (Default) Items are elements matched by `selector`.
|
|
1515
|
+
* - 'columnar': `selector` is a container, fields in `items` are parallel columns aligned by index.
|
|
1516
|
+
* - 'segmented': `selector` is a container, items are segmented by an anchor field.
|
|
1657
1517
|
*/
|
|
1658
|
-
|
|
1518
|
+
mode?: ExtractArrayMode;
|
|
1519
|
+
}
|
|
1520
|
+
/**
|
|
1521
|
+
* Configuration for extracting an object with multiple properties.
|
|
1522
|
+
*/
|
|
1523
|
+
interface ExtractObjectSchema extends BaseExtractSchema {
|
|
1524
|
+
type: 'object';
|
|
1659
1525
|
/**
|
|
1660
|
-
*
|
|
1661
|
-
*
|
|
1662
|
-
* @remarks
|
|
1663
|
-
* This is a powerful action that allows running custom logic to interact with the DOM,
|
|
1664
|
-
* calculate values, or trigger navigations.
|
|
1665
|
-
*
|
|
1666
|
-
* - In **Browser Mode**, it runs in the real browser.
|
|
1667
|
-
* - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
|
|
1668
|
-
*
|
|
1669
|
-
* The action handles automatic navigation if `window.location` is modified.
|
|
1670
|
-
*
|
|
1671
|
-
* @param params - Configuration for the execution, including the function and arguments.
|
|
1672
|
-
* @returns A promise resolving to the result of the execution.
|
|
1673
|
-
* @throws {Error} If no active page context exists or if execution fails.
|
|
1674
|
-
*
|
|
1675
|
-
* @see {@link EvaluateActionOptions} for detailed parameter options and examples.
|
|
1526
|
+
* Root selector for the object. If provided, sub-properties are searched within this element.
|
|
1676
1527
|
*/
|
|
1677
|
-
|
|
1528
|
+
selector?: string;
|
|
1678
1529
|
/**
|
|
1679
|
-
*
|
|
1680
|
-
*
|
|
1681
|
-
* @param schema - An object defining the data to extract.
|
|
1682
|
-
* @returns A promise that resolves to an object with the extracted data.
|
|
1530
|
+
* Filter the object element based on descendants.
|
|
1683
1531
|
*/
|
|
1684
|
-
|
|
1532
|
+
has?: string;
|
|
1685
1533
|
/**
|
|
1686
|
-
*
|
|
1534
|
+
* Exclude the object element if it matches this selector.
|
|
1687
1535
|
*/
|
|
1688
|
-
|
|
1536
|
+
exclude?: string;
|
|
1689
1537
|
/**
|
|
1690
|
-
*
|
|
1691
|
-
*
|
|
1538
|
+
* Where to start searching for fields within this object.
|
|
1539
|
+
* Only applicable when the object is being extracted from an array of elements (e.g. in 'segmented' mode).
|
|
1540
|
+
* - 'anchor': (Default) All fields are searched within the entire scope.
|
|
1541
|
+
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
1692
1542
|
*/
|
|
1693
|
-
|
|
1694
|
-
cookies: Cookie[];
|
|
1695
|
-
sessionState?: any;
|
|
1696
|
-
}>;
|
|
1543
|
+
relativeTo?: 'anchor' | 'previous';
|
|
1697
1544
|
/**
|
|
1698
|
-
*
|
|
1545
|
+
* Explicit order of property extraction.
|
|
1546
|
+
* Useful when using `relativeTo: 'previous'`.
|
|
1699
1547
|
*/
|
|
1700
|
-
|
|
1548
|
+
order?: string[];
|
|
1701
1549
|
/**
|
|
1702
|
-
*
|
|
1550
|
+
* Definition of the object's properties and their corresponding extraction schemas.
|
|
1703
1551
|
*/
|
|
1704
|
-
|
|
1552
|
+
properties: {
|
|
1553
|
+
[key: string]: ExtractSchema;
|
|
1554
|
+
};
|
|
1555
|
+
}
|
|
1556
|
+
|
|
1557
|
+
interface PromiseLock extends Promise<void> {
|
|
1558
|
+
release: () => void;
|
|
1559
|
+
}
|
|
1560
|
+
|
|
1561
|
+
/**
|
|
1562
|
+
* Options for the {@link FetchEngine.goto} action, allowing configuration of HTTP method, payload, headers, and navigation behavior.
|
|
1563
|
+
*
|
|
1564
|
+
* @remarks
|
|
1565
|
+
* Used when navigating to a URL to specify additional parameters beyond the basic URL.
|
|
1566
|
+
*
|
|
1567
|
+
* @example
|
|
1568
|
+
* ```ts
|
|
1569
|
+
* await engine.goto('https://example.com', {
|
|
1570
|
+
* method: 'POST',
|
|
1571
|
+
* payload: { username: 'user', password: 'pass' },
|
|
1572
|
+
* headers: { 'Content-Type': 'application/json' },
|
|
1573
|
+
* waitUntil: 'networkidle'
|
|
1574
|
+
* });
|
|
1575
|
+
* ```
|
|
1576
|
+
*/
|
|
1577
|
+
interface GotoActionOptions {
|
|
1578
|
+
method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
|
|
1579
|
+
payload?: any;
|
|
1580
|
+
headers?: Record<string, string>;
|
|
1581
|
+
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
1582
|
+
timeoutMs?: number;
|
|
1583
|
+
simulate?: boolean;
|
|
1584
|
+
}
|
|
1585
|
+
/**
|
|
1586
|
+
* Options for the {@link FetchEngine.waitFor} action, specifying conditions to wait for before continuing.
|
|
1587
|
+
*
|
|
1588
|
+
* @remarks
|
|
1589
|
+
* Controls timing behavior for interactions, allowing waiting for elements, time intervals, or network conditions.
|
|
1590
|
+
*/
|
|
1591
|
+
interface WaitForActionOptions {
|
|
1592
|
+
ms?: number;
|
|
1593
|
+
selector?: string;
|
|
1594
|
+
networkIdle?: boolean;
|
|
1595
|
+
failOnTimeout?: boolean;
|
|
1596
|
+
}
|
|
1597
|
+
/**
|
|
1598
|
+
* Options for the {@link FetchEngine.submit} action, configuring form submission behavior.
|
|
1599
|
+
*
|
|
1600
|
+
* @remarks
|
|
1601
|
+
* Specifies encoding type for form submissions, particularly relevant for JSON-based APIs.
|
|
1602
|
+
*/
|
|
1603
|
+
interface SubmitActionOptions {
|
|
1604
|
+
enctype?: 'application/x-www-form-urlencoded' | 'application/json' | 'multipart/form-data';
|
|
1605
|
+
}
|
|
1606
|
+
/**
|
|
1607
|
+
* Predefined cleanup groups for the {@link FetchEngine.trim} action.
|
|
1608
|
+
*/
|
|
1609
|
+
type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
|
|
1610
|
+
/**
|
|
1611
|
+
* Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
|
|
1612
|
+
*/
|
|
1613
|
+
interface TrimActionOptions {
|
|
1614
|
+
selectors?: string | string[];
|
|
1615
|
+
presets?: TrimPreset | TrimPreset[];
|
|
1616
|
+
}
|
|
1617
|
+
declare const TRIM_PRESETS: Record<string, string[]>;
|
|
1618
|
+
/**
|
|
1619
|
+
* Options for the {@link FetchEngine.evaluate} action, specifying the function to execute and its arguments.
|
|
1620
|
+
*
|
|
1621
|
+
* @remarks
|
|
1622
|
+
* This action allows executing custom JavaScript logic within the page context.
|
|
1623
|
+
*
|
|
1624
|
+
* **Execution Environments:**
|
|
1625
|
+
* - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
|
|
1626
|
+
* - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
|
|
1627
|
+
* including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
|
|
1628
|
+
*
|
|
1629
|
+
* **Navigation Handling:**
|
|
1630
|
+
* If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
|
|
1631
|
+
* automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
|
|
1632
|
+
*
|
|
1633
|
+
* @example
|
|
1634
|
+
* ```json
|
|
1635
|
+
* {
|
|
1636
|
+
* "action": "evaluate",
|
|
1637
|
+
* "params": {
|
|
1638
|
+
* "fn": "([a, b]) => a + b",
|
|
1639
|
+
* "args": [1, 2]
|
|
1640
|
+
* }
|
|
1641
|
+
* }
|
|
1642
|
+
* ```
|
|
1643
|
+
*
|
|
1644
|
+
* @example
|
|
1645
|
+
* ```json
|
|
1646
|
+
* {
|
|
1647
|
+
* "action": "evaluate",
|
|
1648
|
+
* "params": {
|
|
1649
|
+
* "fn": "({ x, y }) => x * y",
|
|
1650
|
+
* "args": { "x": 6, "y": 7 }
|
|
1651
|
+
* }
|
|
1652
|
+
* }
|
|
1653
|
+
* ```
|
|
1654
|
+
*/
|
|
1655
|
+
interface EvaluateActionOptions {
|
|
1705
1656
|
/**
|
|
1706
|
-
*
|
|
1707
|
-
*
|
|
1708
|
-
* @param context - Fetch engine context
|
|
1709
|
-
* @param options - Configuration options
|
|
1710
|
-
* @returns Promise resolving when initialization completes
|
|
1657
|
+
* The function or expression to execute.
|
|
1711
1658
|
*
|
|
1712
1659
|
* @remarks
|
|
1713
|
-
*
|
|
1714
|
-
*
|
|
1660
|
+
* Can be:
|
|
1661
|
+
* 1. A function object (only available when using the API directly).
|
|
1662
|
+
* 2. A string containing a function definition, e.g., `"async (args) => { ... }"`
|
|
1663
|
+
* 3. A string containing a direct expression, e.g., `"document.title"`
|
|
1664
|
+
*
|
|
1665
|
+
* **Note:** When using a function, it receives exactly ONE argument: the value provided in {@link args}.
|
|
1666
|
+
* Use destructuring to handle multiple parameters.
|
|
1715
1667
|
*/
|
|
1716
|
-
|
|
1717
|
-
cleanup(): Promise<void>;
|
|
1668
|
+
fn: string | ((...args: any[]) => any);
|
|
1718
1669
|
/**
|
|
1719
|
-
*
|
|
1720
|
-
*
|
|
1721
|
-
* @
|
|
1670
|
+
* Data to pass to the function.
|
|
1671
|
+
*
|
|
1672
|
+
* @remarks
|
|
1673
|
+
* This value is passed as the first and only argument to the function defined in {@link fn}.
|
|
1674
|
+
* Recommended to use an array or object for multiple values.
|
|
1722
1675
|
*/
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1676
|
+
args?: any;
|
|
1677
|
+
}
|
|
1678
|
+
/**
|
|
1679
|
+
* Union type representing all possible engine actions that can be dispatched.
|
|
1680
|
+
*
|
|
1681
|
+
* @remarks
|
|
1682
|
+
* Defines the command structure processed during page interactions. Each action type corresponds to
|
|
1683
|
+
* a specific user interaction or navigation command within the action loop architecture.
|
|
1684
|
+
*/
|
|
1685
|
+
type FetchEngineAction = {
|
|
1686
|
+
type: 'click';
|
|
1687
|
+
selector: string;
|
|
1688
|
+
} | {
|
|
1689
|
+
type: 'fill';
|
|
1690
|
+
selector: string;
|
|
1691
|
+
value: string;
|
|
1692
|
+
} | {
|
|
1693
|
+
type: 'mouseMove';
|
|
1694
|
+
params: {
|
|
1695
|
+
x?: number;
|
|
1696
|
+
y?: number;
|
|
1697
|
+
selector?: string;
|
|
1698
|
+
steps?: number;
|
|
1699
|
+
};
|
|
1700
|
+
} | {
|
|
1701
|
+
type: 'mouseClick';
|
|
1702
|
+
params: {
|
|
1703
|
+
x?: number;
|
|
1704
|
+
y?: number;
|
|
1705
|
+
button?: 'left' | 'right' | 'middle';
|
|
1706
|
+
clickCount?: number;
|
|
1707
|
+
delay?: number;
|
|
1708
|
+
steps?: number;
|
|
1709
|
+
};
|
|
1710
|
+
} | {
|
|
1711
|
+
type: 'mouseWheel';
|
|
1712
|
+
params: {
|
|
1713
|
+
x?: number;
|
|
1714
|
+
y?: number;
|
|
1715
|
+
selector?: string;
|
|
1716
|
+
deltaX?: number;
|
|
1717
|
+
deltaY?: number;
|
|
1718
|
+
steps?: number;
|
|
1719
|
+
};
|
|
1720
|
+
} | {
|
|
1721
|
+
type: 'keyboardType';
|
|
1722
|
+
params: {
|
|
1723
|
+
text: string;
|
|
1724
|
+
delay?: number;
|
|
1725
|
+
};
|
|
1726
|
+
} | {
|
|
1727
|
+
type: 'keyboardPress';
|
|
1728
|
+
params: {
|
|
1729
|
+
key: string;
|
|
1730
|
+
delay?: number;
|
|
1731
|
+
};
|
|
1732
|
+
} | {
|
|
1733
|
+
type: 'scrollIntoView';
|
|
1734
|
+
params: {
|
|
1735
|
+
selector: string;
|
|
1736
|
+
};
|
|
1737
|
+
} | {
|
|
1738
|
+
type: 'waitFor';
|
|
1739
|
+
options?: WaitForActionOptions;
|
|
1740
|
+
} | {
|
|
1741
|
+
type: 'submit';
|
|
1742
|
+
selector?: any;
|
|
1743
|
+
options?: SubmitActionOptions;
|
|
1744
|
+
} | {
|
|
1745
|
+
type: 'getContent';
|
|
1746
|
+
} | {
|
|
1747
|
+
type: 'navigate';
|
|
1748
|
+
url: string;
|
|
1749
|
+
opts?: GotoActionOptions;
|
|
1750
|
+
} | {
|
|
1751
|
+
type: 'extract';
|
|
1752
|
+
schema: ExtractSchema;
|
|
1753
|
+
} | {
|
|
1754
|
+
type: 'pause';
|
|
1755
|
+
message?: string;
|
|
1756
|
+
} | {
|
|
1757
|
+
type: 'trim';
|
|
1758
|
+
options: TrimActionOptions;
|
|
1759
|
+
} | {
|
|
1760
|
+
type: 'evaluate';
|
|
1761
|
+
params: EvaluateActionOptions;
|
|
1762
|
+
} | {
|
|
1763
|
+
type: 'dispose';
|
|
1764
|
+
};
|
|
1765
|
+
/**
|
|
1766
|
+
* Represents an action that has been dispatched and is awaiting execution in the active page context.
|
|
1767
|
+
*
|
|
1768
|
+
* @remarks
|
|
1769
|
+
* Connects the action request with its resolution mechanism. Used internally by the action dispatch system
|
|
1770
|
+
* to handle promises while maintaining the page context validity window.
|
|
1771
|
+
*/
|
|
1772
|
+
interface DispatchedEngineAction {
|
|
1773
|
+
action: FetchEngineAction;
|
|
1774
|
+
resolve: (value?: any) => void;
|
|
1775
|
+
reject: (reason?: any) => void;
|
|
1776
|
+
}
|
|
1777
|
+
/**
|
|
1778
|
+
* Represents a pending navigation request awaiting resolution.
|
|
1779
|
+
*
|
|
1780
|
+
* @remarks
|
|
1781
|
+
* Tracks navigation requests that have been queued but not yet processed by the request handler.
|
|
1782
|
+
*/
|
|
1783
|
+
interface PendingEngineRequest {
|
|
1784
|
+
resolve: (value: any) => void;
|
|
1785
|
+
reject: (reason?: any) => void;
|
|
1786
|
+
}
|
|
1787
|
+
/**
|
|
1788
|
+
* Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
|
|
1789
|
+
*
|
|
1790
|
+
* @remarks
|
|
1791
|
+
* The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
|
|
1792
|
+
* `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
|
|
1793
|
+
* content retrieval, and user interaction.
|
|
1794
|
+
*
|
|
1795
|
+
* The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
|
|
1796
|
+
* the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
|
|
1797
|
+
* page context validity across asynchronous operations.
|
|
1798
|
+
*
|
|
1799
|
+
* @example
|
|
1800
|
+
* ```ts
|
|
1801
|
+
* import "./playwright"; // Import to register the Playwright browser engine
|
|
1802
|
+
* const engine = await FetchEngine.create(context, { engine: 'browser' });
|
|
1803
|
+
* await engine.goto('https://example.com');
|
|
1804
|
+
* await engine.fill('#username', 'user');
|
|
1805
|
+
* await engine.click('#submit');
|
|
1806
|
+
* const response = await engine.getContent();
|
|
1807
|
+
* ```
|
|
1808
|
+
*/
|
|
1809
|
+
type AnyFetchEngine = FetchEngine<any, any, any>;
|
|
1810
|
+
type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
|
|
1811
|
+
declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
|
|
1812
|
+
private static registry;
|
|
1764
1813
|
/**
|
|
1765
|
-
*
|
|
1814
|
+
* Registers a fetch engine implementation with the global registry.
|
|
1766
1815
|
*
|
|
1767
|
-
* @param
|
|
1768
|
-
* @
|
|
1769
|
-
* @returns Number of blocked resource types
|
|
1816
|
+
* @param engineClass - The engine class to register
|
|
1817
|
+
* @throws {Error} When engine class lacks static `id` or ID is already registered
|
|
1770
1818
|
*
|
|
1771
1819
|
* @example
|
|
1772
1820
|
* ```ts
|
|
1773
|
-
*
|
|
1774
|
-
* await engine.blockResources(['script'], true); // Replace existing
|
|
1821
|
+
* FetchEngine.register(CheerioFetchEngine);
|
|
1775
1822
|
* ```
|
|
1776
1823
|
*/
|
|
1777
|
-
|
|
1824
|
+
static register(engineClass: AnyFetchEngineCtor): void;
|
|
1778
1825
|
/**
|
|
1779
|
-
*
|
|
1826
|
+
* Retrieves a fetch engine implementation by its unique ID.
|
|
1780
1827
|
*
|
|
1781
|
-
* @
|
|
1782
|
-
* @
|
|
1828
|
+
* @param id - The ID of the engine to retrieve
|
|
1829
|
+
* @returns Engine class if found, otherwise `undefined`
|
|
1783
1830
|
*/
|
|
1784
|
-
|
|
1831
|
+
static get(id: string): AnyFetchEngineCtor | undefined;
|
|
1785
1832
|
/**
|
|
1786
|
-
*
|
|
1787
|
-
*
|
|
1788
|
-
* @overload
|
|
1789
|
-
* Gets all headers.
|
|
1790
|
-
* @returns All headers as record
|
|
1791
|
-
*
|
|
1792
|
-
* @overload
|
|
1793
|
-
* Gets specific header value.
|
|
1794
|
-
* @param name - Header name
|
|
1795
|
-
* @returns Header value
|
|
1796
|
-
*
|
|
1797
|
-
* @overload
|
|
1798
|
-
* Sets multiple headers.
|
|
1799
|
-
* @param headers - Headers to set
|
|
1800
|
-
* @param replaced - Whether to replace all existing headers
|
|
1801
|
-
* @returns `true` if successful
|
|
1802
|
-
*
|
|
1803
|
-
* @overload
|
|
1804
|
-
* Sets single header.
|
|
1805
|
-
* @param name - Header name
|
|
1806
|
-
* @param value - Header value or `null` to remove
|
|
1807
|
-
* @returns `true` if successful
|
|
1833
|
+
* Retrieves a fetch engine implementation by execution mode.
|
|
1808
1834
|
*
|
|
1809
|
-
* @
|
|
1810
|
-
*
|
|
1811
|
-
* const allHeaders = await engine.headers();
|
|
1812
|
-
* const userAgent = await engine.headers('user-agent');
|
|
1813
|
-
* await engine.headers({ 'x-custom': 'value' });
|
|
1814
|
-
* await engine.headers('auth', 'token');
|
|
1815
|
-
* ```
|
|
1835
|
+
* @param mode - Execution mode (`'http'` or `'browser'`)
|
|
1836
|
+
* @returns Engine class if found, otherwise `undefined`
|
|
1816
1837
|
*/
|
|
1817
|
-
|
|
1818
|
-
headers(name: string): Promise<string>;
|
|
1819
|
-
headers(headers: Record<string, string>, replaced?: boolean): Promise<boolean>;
|
|
1820
|
-
headers(name: string, value: string | null): Promise<boolean>;
|
|
1838
|
+
static getByMode(mode: FetchEngineType): AnyFetchEngineCtor | undefined;
|
|
1821
1839
|
/**
|
|
1822
|
-
*
|
|
1823
|
-
*
|
|
1824
|
-
* @overload
|
|
1825
|
-
* Gets all cookies.
|
|
1826
|
-
* @returns Array of cookies
|
|
1840
|
+
* Factory method to create and initialize a fetch engine instance.
|
|
1827
1841
|
*
|
|
1828
|
-
* @
|
|
1829
|
-
*
|
|
1830
|
-
* @
|
|
1831
|
-
* @
|
|
1842
|
+
* @param ctx - Fetch engine context
|
|
1843
|
+
* @param options - Configuration options
|
|
1844
|
+
* @returns Initialized fetch engine instance
|
|
1845
|
+
* @throws {Error} When no suitable engine implementation is found
|
|
1832
1846
|
*
|
|
1833
|
-
* @
|
|
1834
|
-
*
|
|
1835
|
-
* const cookies = await engine.cookies();
|
|
1836
|
-
* await engine.cookies([{ name: 'session', value: '123' }]);
|
|
1837
|
-
* ```
|
|
1847
|
+
* @remarks
|
|
1848
|
+
* Primary entry point for engine creation. Selects the appropriate implementation based on the `engine` name given in the options or the context.
|
|
1838
1849
|
*/
|
|
1839
|
-
|
|
1840
|
-
cookies(cookies: Cookie[]): Promise<boolean>;
|
|
1850
|
+
static create(ctx: FetchEngineContext, options?: BaseFetcherProperties): Promise<AnyFetchEngine | undefined>;
|
|
1841
1851
|
/**
|
|
1842
|
-
*
|
|
1852
|
+
* Unique identifier for the engine implementation.
|
|
1843
1853
|
*
|
|
1844
|
-
* @
|
|
1845
|
-
|
|
1846
|
-
dispose(): Promise<void>;
|
|
1847
|
-
}
|
|
1848
|
-
|
|
1849
|
-
type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
|
|
1850
|
-
interface FetchReturnTypeRegistry {
|
|
1851
|
-
response: FetchResponse;
|
|
1852
|
-
context: FetchContext;
|
|
1853
|
-
result: FetchActionResult<any> | undefined;
|
|
1854
|
-
outputs: Record<string, any>;
|
|
1855
|
-
any: any;
|
|
1856
|
-
none: void;
|
|
1857
|
-
}
|
|
1858
|
-
type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
|
|
1859
|
-
|
|
1860
|
-
/**
|
|
1861
|
-
* Represents the state of an action being executed within a context.
|
|
1862
|
-
*
|
|
1863
|
-
* @remarks
|
|
1864
|
-
* Extends the basic action properties with runtime metadata like execution index,
|
|
1865
|
-
* nesting depth, and any errors encountered during execution.
|
|
1866
|
-
*/
|
|
1867
|
-
interface FetchActionInContext extends FetchActionProperties {
|
|
1868
|
-
/**
|
|
1869
|
-
* The 0-based index of the action in the execution sequence.
|
|
1870
|
-
*/
|
|
1871
|
-
index?: number;
|
|
1872
|
-
/**
|
|
1873
|
-
* Error encountered during action execution, if any.
|
|
1854
|
+
* @remarks
|
|
1855
|
+
* Must be defined by concrete implementations. Used for registration and lookup in engine registry.
|
|
1874
1856
|
*/
|
|
1875
|
-
|
|
1857
|
+
static readonly id: string;
|
|
1876
1858
|
/**
|
|
1877
|
-
*
|
|
1859
|
+
* Execution mode of the engine (`'http'` or `'browser'`).
|
|
1860
|
+
*
|
|
1861
|
+
* @remarks
|
|
1862
|
+
* Must be defined by concrete implementations. Indicates whether engine operates at HTTP level or uses full browser.
|
|
1878
1863
|
*/
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1864
|
+
static readonly mode: FetchEngineType;
|
|
1865
|
+
protected ctx?: FetchEngineContext;
|
|
1866
|
+
protected opts?: BaseFetcherProperties;
|
|
1867
|
+
protected crawler?: TCrawler;
|
|
1868
|
+
protected isCrawlerReady?: boolean;
|
|
1869
|
+
protected crawlerRunPromise?: Promise<FinalStatistics>;
|
|
1870
|
+
protected config?: Configuration;
|
|
1871
|
+
protected requestQueue?: RequestQueue;
|
|
1872
|
+
protected kvStore?: KeyValueStore;
|
|
1873
|
+
protected proxyConfiguration?: ProxyConfiguration;
|
|
1874
|
+
protected hdrs: Record<string, string>;
|
|
1875
|
+
protected _initialCookies?: Cookie[];
|
|
1876
|
+
protected _initializedSessions: Set<string>;
|
|
1877
|
+
protected currentSession?: Session;
|
|
1878
|
+
protected pendingRequests: Map<string, PendingEngineRequest>;
|
|
1879
|
+
protected requestCounter: number;
|
|
1880
|
+
protected actionEmitter: EventEmitter;
|
|
1881
|
+
protected isPageActive: boolean;
|
|
1882
|
+
protected isEngineDisposed: boolean;
|
|
1883
|
+
protected navigationLock: PromiseLock;
|
|
1884
|
+
protected activeContext?: TContext;
|
|
1885
|
+
protected isExecutingAction: boolean;
|
|
1886
|
+
protected lastResponse?: FetchResponse;
|
|
1887
|
+
protected actionQueue: DispatchedEngineAction[];
|
|
1888
|
+
protected isProcessingActionLoop: boolean;
|
|
1889
|
+
protected blockedTypes: Set<string>;
|
|
1890
|
+
_logDebug(category: string, ...args: any[]): void;
|
|
1891
|
+
protected _cleanup?(): Promise<void>;
|
|
1892
|
+
protected _getTrimInfo(options: TrimActionOptions): {
|
|
1893
|
+
selectors: string[];
|
|
1894
|
+
removeComments: boolean;
|
|
1895
|
+
removeHidden: boolean;
|
|
1896
|
+
};
|
|
1887
1897
|
/**
|
|
1888
|
-
*
|
|
1889
|
-
*
|
|
1898
|
+
* Finds all elements matching the selector within the given scope.
|
|
1899
|
+
*
|
|
1900
|
+
* @param scope - The scope to search in (Engine-specific element/node or array of nodes).
|
|
1901
|
+
* @param selector - CSS selector.
|
|
1902
|
+
* @returns List of matching elements.
|
|
1903
|
+
* @see {@link IExtractEngine._querySelectorAll} for behavior contract.
|
|
1904
|
+
* @internal
|
|
1890
1905
|
*/
|
|
1891
|
-
|
|
1906
|
+
abstract _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
1892
1907
|
/**
|
|
1893
|
-
*
|
|
1908
|
+
* Extracts a primitive value from the element based on schema.
|
|
1909
|
+
*
|
|
1910
|
+
* @param schema - Value extraction schema.
|
|
1911
|
+
* @param scope - The element scope.
|
|
1912
|
+
* @returns Extracted value.
|
|
1913
|
+
* @see {@link IExtractEngine._extractValue} for behavior contract.
|
|
1914
|
+
* @internal
|
|
1894
1915
|
*/
|
|
1895
|
-
|
|
1896
|
-
}
|
|
1897
|
-
/**
|
|
1898
|
-
* Extended internal state for the fetch context, including action lifecycle management.
|
|
1899
|
-
*
|
|
1900
|
-
* @internal
|
|
1901
|
-
*/
|
|
1902
|
-
interface FetchContextInteralState extends BaseFetchContextInteralState {
|
|
1916
|
+
abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
|
|
1903
1917
|
/**
|
|
1904
|
-
*
|
|
1918
|
+
* Gets the parent element of the given element.
|
|
1919
|
+
*
|
|
1920
|
+
* @param scope - The element scope.
|
|
1921
|
+
* @returns Parent element or null.
|
|
1922
|
+
* @internal
|
|
1905
1923
|
*/
|
|
1906
|
-
|
|
1924
|
+
abstract _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1907
1925
|
/**
|
|
1908
|
-
*
|
|
1926
|
+
* Checks whether two element scopes refer to the same DOM node.
|
|
1927
|
+
*
|
|
1928
|
+
* @param scope1 - First element scope.
|
|
1929
|
+
* @param scope2 - Second element scope.
|
|
1930
|
+
* @returns True if they are the same DOM node.
|
|
1931
|
+
* @internal
|
|
1909
1932
|
*/
|
|
1910
|
-
|
|
1911
|
-
}
|
|
1912
|
-
/**
|
|
1913
|
-
* Context provided to the Fetch Engine during navigation and request handling.
|
|
1914
|
-
*
|
|
1915
|
-
* @remarks
|
|
1916
|
-
* This interface contains the minimum set of properties required by an engine
|
|
1917
|
-
* to perform a fetch operation and build a response.
|
|
1918
|
-
*/
|
|
1919
|
-
interface FetchEngineContext extends BaseFetcherProperties {
|
|
1933
|
+
abstract _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
|
|
1920
1934
|
/**
|
|
1921
|
-
*
|
|
1935
|
+
* Gets all subsequent siblings of an element until a sibling matches the selector.
|
|
1936
|
+
* Used in 'segmented' extraction mode.
|
|
1937
|
+
*
|
|
1938
|
+
* @param scope - The anchor element scope.
|
|
1939
|
+
* @param untilSelector - Optional selector that marks the end of the segment (exclusive).
|
|
1940
|
+
* @returns List of sibling elements between anchor and untilSelector.
|
|
1941
|
+
* @internal
|
|
1922
1942
|
*/
|
|
1923
|
-
|
|
1943
|
+
abstract _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
1924
1944
|
/**
|
|
1925
|
-
*
|
|
1945
|
+
* Finds the closest ancestor of `scope` (including itself) that exists in the `candidates` array.
|
|
1946
|
+
*
|
|
1947
|
+
* @param scope - The starting element.
|
|
1948
|
+
* @param candidates - The array of potential ancestor scopes.
|
|
1949
|
+
* @returns A promise resolving to the matching candidate scope, or `null` if none found.
|
|
1950
|
+
* @see {@link IExtractEngine._findClosestAncestor} for implementation details.
|
|
1951
|
+
* @internal
|
|
1926
1952
|
*/
|
|
1927
|
-
|
|
1953
|
+
abstract _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
|
|
1928
1954
|
/**
|
|
1929
|
-
*
|
|
1955
|
+
* Checks if the `container` scope contains the `element` scope.
|
|
1956
|
+
*
|
|
1957
|
+
* @param container - The potential ancestor element.
|
|
1958
|
+
* @param element - The potential descendant element.
|
|
1959
|
+
* @returns A promise resolving to `true` if `container` contains `element`.
|
|
1960
|
+
* @see {@link IExtractEngine._contains} for implementation details.
|
|
1961
|
+
* @internal
|
|
1930
1962
|
*/
|
|
1931
|
-
|
|
1963
|
+
abstract _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
|
|
1932
1964
|
/**
|
|
1933
|
-
*
|
|
1965
|
+
* Finds the Lowest Common Ancestor (LCA) of two element scopes.
|
|
1966
|
+
*
|
|
1967
|
+
* @param scope1 - The first element scope.
|
|
1968
|
+
* @param scope2 - The second element scope.
|
|
1969
|
+
* @returns A promise resolving to the LCA element scope, or `null` if none found.
|
|
1970
|
+
* @internal
|
|
1934
1971
|
*/
|
|
1935
|
-
|
|
1972
|
+
abstract _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1936
1973
|
/**
|
|
1937
|
-
*
|
|
1974
|
+
* Finds the direct child of container that contains element.
|
|
1975
|
+
*
|
|
1976
|
+
* @param element - The descendant element.
|
|
1977
|
+
* @param container - The container element.
|
|
1978
|
+
* @returns The child element of container, or null.
|
|
1979
|
+
* @internal
|
|
1938
1980
|
*/
|
|
1939
|
-
|
|
1981
|
+
abstract _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1982
|
+
protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
|
|
1940
1983
|
/**
|
|
1941
|
-
*
|
|
1984
|
+
* Normalizes the array extraction mode into an options object.
|
|
1985
|
+
* @param mode - The mode string or options object.
|
|
1986
|
+
* @internal
|
|
1942
1987
|
*/
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
* The full execution context for a Web Fetcher session or action batch.
|
|
1947
|
-
*
|
|
1948
|
-
* @remarks
|
|
1949
|
-
* This object is the central state container for the fetch operation. It provides
|
|
1950
|
-
* access to configuration, the event bus, shared outputs, and the execution engine.
|
|
1951
|
-
* It is passed to every action during execution.
|
|
1952
|
-
*/
|
|
1953
|
-
interface FetchContext extends FetchEngineContext {
|
|
1988
|
+
protected _normalizeArrayMode(mode?: ExtractArrayMode): {
|
|
1989
|
+
type: ExtractArrayModeName;
|
|
1990
|
+
} & any;
|
|
1954
1991
|
/**
|
|
1955
|
-
*
|
|
1992
|
+
* Performs standard nested array extraction.
|
|
1993
|
+
* @param items - The schema for each item.
|
|
1994
|
+
* @param elements - The list of item elements.
|
|
1995
|
+
* @internal
|
|
1956
1996
|
*/
|
|
1957
|
-
|
|
1997
|
+
protected _extractNested(items: ExtractSchema, elements: FetchElementScope[], opts?: {
|
|
1998
|
+
strict?: boolean;
|
|
1999
|
+
}): Promise<any[]>;
|
|
1958
2000
|
/**
|
|
1959
|
-
*
|
|
1960
|
-
*
|
|
2001
|
+
* Performs columnar extraction (Column Alignment Mode).
|
|
2002
|
+
*
|
|
2003
|
+
* @param schema - The schema for a single item (must be an object or implicit object).
|
|
2004
|
+
* @param container - The container element to search within.
|
|
2005
|
+
* @param opts - Columnar extraction options (strict, inference).
|
|
2006
|
+
* @returns An array of extracted items, or null if requirements aren't met.
|
|
2007
|
+
* @internal
|
|
1961
2008
|
*/
|
|
1962
|
-
|
|
2009
|
+
protected _extractColumnar(schema: ExtractSchema, container: FetchElementScope, opts?: ColumnarOptions): Promise<any[] | null>;
|
|
1963
2010
|
/**
|
|
1964
|
-
*
|
|
2011
|
+
* Performs segmented extraction (Anchor-based Scanning).
|
|
1965
2012
|
*
|
|
1966
|
-
* @param
|
|
1967
|
-
* @
|
|
2013
|
+
* @param schema - The schema for a single item (must be an object).
|
|
2014
|
+
* @param container - The container element to scan.
|
|
2015
|
+
* @param opts - Segmented extraction options (anchor).
|
|
2016
|
+
* @returns An array of extracted items.
|
|
2017
|
+
* @internal
|
|
1968
2018
|
*/
|
|
1969
|
-
|
|
2019
|
+
protected _extractSegmented(schema: ExtractSchema, container: FetchElementScope, opts?: SegmentedOptions): Promise<any[] | null>;
|
|
1970
2020
|
/**
|
|
1971
|
-
*
|
|
1972
|
-
*
|
|
1973
|
-
* @
|
|
1974
|
-
* @param params - Parameters specific to the action type.
|
|
1975
|
-
* @param options - Additional execution options (e.g., storeAs, failOnError).
|
|
1976
|
-
* @returns A promise that resolves to the action's result.
|
|
2021
|
+
* Creates the crawler instance for the specific engine implementation.
|
|
2022
|
+
* @param options - The final crawler options.
|
|
2023
|
+
* @internal
|
|
1977
2024
|
*/
|
|
1978
|
-
|
|
2025
|
+
protected abstract _createCrawler(options: TOptions, config?: Configuration): TCrawler;
|
|
1979
2026
|
/**
|
|
1980
|
-
*
|
|
2027
|
+
* Gets the crawler-specific options from the subclass.
|
|
2028
|
+
* @param ctx - The fetch engine context.
|
|
2029
|
+
* @internal
|
|
1981
2030
|
*/
|
|
1982
|
-
|
|
2031
|
+
protected abstract _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<TOptions>> | Partial<TOptions>;
|
|
1983
2032
|
/**
|
|
1984
|
-
*
|
|
2033
|
+
* Abstract method for building standard [FetchResponse] from Crawlee context.
|
|
2034
|
+
*
|
|
2035
|
+
* @param context - Crawlee crawling context
|
|
2036
|
+
* @returns Promise resolving to [FetchResponse] object
|
|
2037
|
+
*
|
|
2038
|
+
* @remarks
|
|
2039
|
+
* Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
|
|
2040
|
+
* @internal
|
|
1985
2041
|
*/
|
|
1986
|
-
|
|
1987
|
-
|
|
1988
|
-
|
|
1989
|
-
type CheerioAPI = NonNullable<CheerioCrawlingContext['$']>;
|
|
1990
|
-
type CheerioSelection = ReturnType<CheerioAPI>;
|
|
1991
|
-
type CheerioNode = ReturnType<CheerioSelection['first']>;
|
|
1992
|
-
declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
|
|
1993
|
-
static readonly id = "cheerio";
|
|
1994
|
-
static readonly mode = "http";
|
|
1995
|
-
private _ensureCheerioContext;
|
|
1996
|
-
protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
|
|
1997
|
-
_querySelectorAll(scope: {
|
|
1998
|
-
$: CheerioAPI;
|
|
1999
|
-
el: any;
|
|
2000
|
-
} | any[], selector: string): Promise<FetchElementScope[]>;
|
|
2001
|
-
_nextSiblingsUntil(scope: {
|
|
2002
|
-
$: CheerioAPI;
|
|
2003
|
-
el: CheerioNode;
|
|
2004
|
-
}, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2005
|
-
_parentElement(scope: {
|
|
2006
|
-
$: CheerioAPI;
|
|
2007
|
-
el: CheerioNode;
|
|
2008
|
-
}): Promise<FetchElementScope | null>;
|
|
2009
|
-
_isSameElement(scope1: {
|
|
2010
|
-
el: CheerioNode;
|
|
2011
|
-
}, scope2: {
|
|
2012
|
-
el: CheerioNode;
|
|
2013
|
-
}): Promise<boolean>;
|
|
2014
|
-
_findClosestAncestor(scope: {
|
|
2015
|
-
$: CheerioAPI;
|
|
2016
|
-
el: CheerioNode;
|
|
2017
|
-
}, candidates: {
|
|
2018
|
-
$: CheerioAPI;
|
|
2019
|
-
el: CheerioNode;
|
|
2020
|
-
}[]): Promise<FetchElementScope | null>;
|
|
2021
|
-
_contains(container: {
|
|
2022
|
-
$: CheerioAPI;
|
|
2023
|
-
el: CheerioNode;
|
|
2024
|
-
}, element: {
|
|
2025
|
-
$: CheerioAPI;
|
|
2026
|
-
el: CheerioNode;
|
|
2027
|
-
}): Promise<boolean>;
|
|
2028
|
-
_findCommonAncestor(scope1: {
|
|
2029
|
-
$: CheerioAPI;
|
|
2030
|
-
el: CheerioNode;
|
|
2031
|
-
}, scope2: {
|
|
2032
|
-
$: CheerioAPI;
|
|
2033
|
-
el: CheerioNode;
|
|
2034
|
-
}): Promise<FetchElementScope | null>;
|
|
2035
|
-
_findContainerChild(element: {
|
|
2036
|
-
$: CheerioAPI;
|
|
2037
|
-
el: CheerioNode;
|
|
2038
|
-
}, container: {
|
|
2039
|
-
$: CheerioAPI;
|
|
2040
|
-
el: CheerioNode;
|
|
2041
|
-
}): Promise<FetchElementScope | null>;
|
|
2042
|
-
_extractValue(schema: ExtractValueSchema, scope: {
|
|
2043
|
-
$: CheerioAPI;
|
|
2044
|
-
el: CheerioNode;
|
|
2045
|
-
}): Promise<any>;
|
|
2046
|
-
protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
|
|
2047
|
-
protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2048
|
-
protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
|
|
2049
|
-
url: string;
|
|
2050
|
-
method: string;
|
|
2051
|
-
body?: any;
|
|
2052
|
-
headers?: Record<string, string>;
|
|
2053
|
-
}): Promise<any>;
|
|
2054
|
-
protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
|
|
2055
|
-
protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
|
|
2056
|
-
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
|
|
2057
|
-
goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
|
|
2058
|
-
}
|
|
2059
|
-
|
|
2060
|
-
type Page = NonNullable<PlaywrightCrawlingContext['page']>;
|
|
2061
|
-
type Locator = ReturnType<Page['locator']>;
|
|
2062
|
-
declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
|
|
2063
|
-
static readonly id = "playwright";
|
|
2064
|
-
static readonly mode = "browser";
|
|
2065
|
-
protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
|
|
2066
|
-
_querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
|
|
2067
|
-
_nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2068
|
-
_parentElement(scope: Locator): Promise<FetchElementScope | null>;
|
|
2069
|
-
_isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
|
|
2070
|
-
_findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
|
|
2071
|
-
_contains(container: Locator, element: Locator): Promise<boolean>;
|
|
2072
|
-
_findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
|
|
2073
|
-
_findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
|
|
2074
|
-
_extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
|
|
2075
|
-
protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
|
|
2076
|
-
protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
|
|
2077
|
-
protected currentMousePos: {
|
|
2078
|
-
x: number;
|
|
2079
|
-
y: number;
|
|
2080
|
-
};
|
|
2081
|
-
protected _getRandomDelay(base: number, variance?: number): number;
|
|
2082
|
-
protected _getTrajectory(start: {
|
|
2083
|
-
x: number;
|
|
2084
|
-
y: number;
|
|
2085
|
-
}, end: {
|
|
2086
|
-
x: number;
|
|
2087
|
-
y: number;
|
|
2088
|
-
}, steps?: number): {
|
|
2089
|
-
x: number;
|
|
2090
|
-
y: number;
|
|
2091
|
-
}[];
|
|
2092
|
-
protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
|
|
2093
|
-
x: number;
|
|
2094
|
-
y: number;
|
|
2095
|
-
}>;
|
|
2096
|
-
protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2097
|
-
protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
|
|
2098
|
-
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
|
|
2099
|
-
goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
|
|
2100
|
-
}
|
|
2101
|
-
|
|
2102
|
-
declare enum FetchActionResultStatus {
|
|
2042
|
+
protected abstract _buildResponse(context: TContext): Promise<FetchResponse>;
|
|
2043
|
+
protected buildResponse(context: TContext): Promise<FetchResponse>;
|
|
2103
2044
|
/**
|
|
2104
|
-
*
|
|
2045
|
+
* Abstract method for executing action within current page context.
|
|
2046
|
+
*
|
|
2047
|
+
* @param context - Crawlee crawling context
|
|
2048
|
+
* @param action - Action to execute
|
|
2049
|
+
* @returns Promise resolving to action result
|
|
2050
|
+
*
|
|
2051
|
+
* @remarks
|
|
2052
|
+
* Handles specific user interactions using underlying technology (Playwright/Cheerio).
|
|
2053
|
+
* @internal
|
|
2105
2054
|
*/
|
|
2106
|
-
|
|
2055
|
+
protected abstract executeAction(context: TContext, action: FetchEngineAction): Promise<any>;
|
|
2107
2056
|
/**
|
|
2108
|
-
*
|
|
2057
|
+
* Navigates to the specified URL.
|
|
2058
|
+
*
|
|
2059
|
+
* @param url - Target URL
|
|
2060
|
+
* @param params - Navigation options
|
|
2061
|
+
* @returns Promise resolving when navigation completes
|
|
2062
|
+
*
|
|
2063
|
+
* @example
|
|
2064
|
+
* ```ts
|
|
2065
|
+
* await engine.goto('https://example.com');
|
|
2066
|
+
* ```
|
|
2109
2067
|
*/
|
|
2110
|
-
|
|
2068
|
+
abstract goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
|
|
2111
2069
|
/**
|
|
2112
|
-
*
|
|
2113
|
-
*
|
|
2070
|
+
* Waits for specified condition before continuing.
|
|
2071
|
+
*
|
|
2072
|
+
* @param params - Wait conditions
|
|
2073
|
+
* @returns Promise resolving when wait condition is met
|
|
2074
|
+
*
|
|
2075
|
+
* @example
|
|
2076
|
+
* ```ts
|
|
2077
|
+
* await engine.waitFor({ ms: 1000 }); // Wait 1 second
|
|
2078
|
+
* await engine.waitFor({ selector: '#content' }); // Wait for element
|
|
2079
|
+
* ```
|
|
2114
2080
|
*/
|
|
2115
|
-
|
|
2116
|
-
}
|
|
2117
|
-
type FetchActionCapabilityMode = 'native' | 'simulate' | 'noop';
|
|
2118
|
-
interface FetchActionMeta {
|
|
2119
|
-
id: string;
|
|
2120
|
-
index?: number;
|
|
2121
|
-
engineType?: FetchEngineType;
|
|
2122
|
-
capability?: FetchActionCapabilityMode;
|
|
2123
|
-
response?: FetchResponse;
|
|
2124
|
-
timings?: {
|
|
2125
|
-
start: number;
|
|
2126
|
-
total: number;
|
|
2127
|
-
};
|
|
2128
|
-
retries?: number;
|
|
2129
|
-
}
|
|
2130
|
-
interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
|
|
2131
|
-
status: FetchActionResultStatus;
|
|
2132
|
-
returnType?: R;
|
|
2133
|
-
result?: FetchReturnTypeFor<R>;
|
|
2134
|
-
error?: Error;
|
|
2135
|
-
meta?: FetchActionMeta;
|
|
2136
|
-
}
|
|
2137
|
-
interface BaseFetchActionProperties {
|
|
2138
|
-
id?: string;
|
|
2139
|
-
name?: string;
|
|
2140
|
-
action?: string | FetchAction;
|
|
2141
|
-
index?: number;
|
|
2142
|
-
params?: any;
|
|
2143
|
-
args?: any;
|
|
2144
|
-
storeAs?: string;
|
|
2145
|
-
failOnError?: boolean;
|
|
2146
|
-
failOnTimeout?: boolean;
|
|
2147
|
-
timeoutMs?: number;
|
|
2148
|
-
maxRetries?: number;
|
|
2149
|
-
[key: string]: any;
|
|
2150
|
-
}
|
|
2151
|
-
type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
|
|
2152
|
-
interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
|
|
2153
|
-
activateOn?: string | RegExp | Array<string | RegExp>;
|
|
2154
|
-
deactivateOn?: string | RegExp | Array<string | RegExp>;
|
|
2155
|
-
collectOn?: string | RegExp | Array<string | RegExp>;
|
|
2156
|
-
background?: boolean;
|
|
2157
|
-
}
|
|
2158
|
-
type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
|
|
2159
|
-
interface FetchActionProperties extends BaseFetchActionProperties {
|
|
2160
|
-
collectors?: BaseFetchCollectorOptions[];
|
|
2161
|
-
}
|
|
2162
|
-
type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
|
|
2163
|
-
type FetchActionCapabilities = {
|
|
2164
|
-
[mode in FetchEngineType]?: FetchActionCapabilityMode;
|
|
2165
|
-
};
|
|
2166
|
-
declare abstract class FetchAction {
|
|
2167
|
-
private static registry;
|
|
2168
|
-
static register(actionClass: typeof FetchAction): void;
|
|
2169
|
-
static get(id: string): typeof FetchAction | undefined;
|
|
2170
|
-
static create(id: FetchActionOptions): FetchAction | undefined;
|
|
2171
|
-
static create(id: string): FetchAction | undefined;
|
|
2172
|
-
static has(name: string): boolean;
|
|
2173
|
-
static list(): string[];
|
|
2174
|
-
static id: string;
|
|
2175
|
-
static returnType: FetchReturnType;
|
|
2176
|
-
static capabilities: FetchActionCapabilities;
|
|
2177
|
-
static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2178
|
-
getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2179
|
-
get id(): string;
|
|
2180
|
-
get returnType(): FetchReturnType;
|
|
2181
|
-
get capabilities(): FetchActionCapabilities;
|
|
2182
|
-
protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2183
|
-
protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2184
|
-
abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
|
|
2185
|
-
protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
|
|
2186
|
-
protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
|
|
2081
|
+
waitFor(params?: WaitForActionOptions): Promise<void>;
|
|
2187
2082
|
/**
|
|
2188
|
-
*
|
|
2189
|
-
*
|
|
2083
|
+
* Clicks on element matching selector.
|
|
2084
|
+
*
|
|
2085
|
+
* @param selector - CSS selector of element to click
|
|
2086
|
+
* @returns Promise resolving when click is processed
|
|
2087
|
+
* @throws {Error} When no active page context exists
|
|
2190
2088
|
*/
|
|
2191
|
-
|
|
2192
|
-
entry: FetchActionInContext;
|
|
2193
|
-
collectors: CollectorsRuntime | undefined;
|
|
2194
|
-
}>;
|
|
2089
|
+
click(selector: string): Promise<void>;
|
|
2195
2090
|
/**
|
|
2196
|
-
*
|
|
2197
|
-
*
|
|
2091
|
+
* Moves mouse to specified position or element.
|
|
2092
|
+
*
|
|
2093
|
+
* @param params - Move parameters (x, y, selector, steps)
|
|
2198
2094
|
*/
|
|
2199
|
-
|
|
2200
|
-
|
|
2201
|
-
|
|
2095
|
+
mouseMove(params: {
|
|
2096
|
+
x?: number;
|
|
2097
|
+
y?: number;
|
|
2098
|
+
selector?: string;
|
|
2099
|
+
steps?: number;
|
|
2202
2100
|
}): Promise<void>;
|
|
2203
|
-
execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
|
|
2204
|
-
}
|
|
2205
|
-
type CollectorsRuntime = {
|
|
2206
|
-
cleanup: () => void;
|
|
2207
|
-
awaitExecPendings: () => Promise<void>;
|
|
2208
|
-
};
|
|
2209
|
-
|
|
2210
|
-
type FetchEngineType = 'http' | 'browser';
|
|
2211
|
-
type BrowserEngine = 'playwright' | 'puppeteer';
|
|
2212
|
-
type FetchEngineMode = FetchEngineType | 'auto' | string;
|
|
2213
|
-
type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
|
|
2214
|
-
/**
|
|
2215
|
-
* Storage configuration options for the fetch engine.
|
|
2216
|
-
*
|
|
2217
|
-
* @remarks
|
|
2218
|
-
* Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
|
|
2219
|
-
*/
|
|
2220
|
-
interface StorageOptions {
|
|
2221
2101
|
/**
|
|
2222
|
-
*
|
|
2223
|
-
*
|
|
2224
|
-
*
|
|
2102
|
+
* Clicks at current position or specified position.
|
|
2103
|
+
*
|
|
2104
|
+
* @param params - Click parameters (x, y, button, clickCount, delay)
|
|
2225
2105
|
*/
|
|
2226
|
-
|
|
2106
|
+
mouseClick(params: {
|
|
2107
|
+
x?: number;
|
|
2108
|
+
y?: number;
|
|
2109
|
+
button?: 'left' | 'right' | 'middle';
|
|
2110
|
+
clickCount?: number;
|
|
2111
|
+
delay?: number;
|
|
2112
|
+
}): Promise<void>;
|
|
2227
2113
|
/**
|
|
2228
|
-
*
|
|
2229
|
-
*
|
|
2230
|
-
*
|
|
2114
|
+
* Scrolls the mouse wheel.
|
|
2115
|
+
*
|
|
2116
|
+
* @param params - Wheel parameters (x, y, selector, deltaX, deltaY, steps)
|
|
2231
2117
|
*/
|
|
2232
|
-
|
|
2118
|
+
mouseWheel(params: {
|
|
2119
|
+
x?: number;
|
|
2120
|
+
y?: number;
|
|
2121
|
+
selector?: string;
|
|
2122
|
+
deltaX?: number;
|
|
2123
|
+
deltaY?: number;
|
|
2124
|
+
steps?: number;
|
|
2125
|
+
}): Promise<void>;
|
|
2233
2126
|
/**
|
|
2234
|
-
*
|
|
2235
|
-
*
|
|
2127
|
+
* Scrolls the element into view.
|
|
2128
|
+
*
|
|
2129
|
+
* @param params - Scroll parameters (selector)
|
|
2236
2130
|
*/
|
|
2237
|
-
|
|
2131
|
+
scrollIntoView(params: {
|
|
2132
|
+
selector: string;
|
|
2133
|
+
}): Promise<void>;
|
|
2238
2134
|
/**
|
|
2239
|
-
*
|
|
2240
|
-
*
|
|
2135
|
+
* Types text into current focused element.
|
|
2136
|
+
*
|
|
2137
|
+
* @param text - Text to type
|
|
2138
|
+
* @param delay - Delay between key presses
|
|
2139
|
+
*/
|
|
2140
|
+
keyboardType(text: string, delay?: number): Promise<void>;
|
|
2141
|
+
/**
|
|
2142
|
+
* Presses specified key.
|
|
2143
|
+
*
|
|
2144
|
+
* @param key - Key to press
|
|
2145
|
+
* @param delay - Delay after key press
|
|
2146
|
+
*/
|
|
2147
|
+
keyboardPress(key: string, delay?: number): Promise<void>;
|
|
2148
|
+
/**
|
|
2149
|
+
* Fills input element with specified value.
|
|
2150
|
+
*
|
|
2151
|
+
* @param selector - CSS selector of input element
|
|
2152
|
+
* @param value - Value to fill
|
|
2153
|
+
* @returns Promise resolving when fill operation completes
|
|
2154
|
+
* @throws {Error} When no active page context exists
|
|
2155
|
+
*/
|
|
2156
|
+
fill(selector: string, value: string): Promise<void>;
|
|
2157
|
+
/**
|
|
2158
|
+
* Submits a form.
|
|
2159
|
+
*
|
|
2160
|
+
* @param selector - Optional form/submit button selector
|
|
2161
|
+
* @param options - Submission options
|
|
2162
|
+
* @returns Promise resolving when form is submitted
|
|
2163
|
+
* @throws {Error} When no active page context exists
|
|
2164
|
+
*/
|
|
2165
|
+
submit(selector?: any, options?: SubmitActionOptions): Promise<void>;
|
|
2166
|
+
/**
|
|
2167
|
+
* Removes elements from the DOM based on selectors and presets.
|
|
2168
|
+
*
|
|
2169
|
+
* @param options - Trim options specifying selectors and presets
|
|
2170
|
+
* @returns Promise resolving when trim operation completes
|
|
2171
|
+
* @throws {Error} When no active page context exists
|
|
2172
|
+
*/
|
|
2173
|
+
trim(options: TrimActionOptions): Promise<void>;
|
|
2174
|
+
/**
|
|
2175
|
+
* Pauses execution, allowing for manual intervention or inspection.
|
|
2176
|
+
*
|
|
2177
|
+
* @param message - Optional message to display during pause
|
|
2178
|
+
* @returns Promise resolving when execution is resumed
|
|
2179
|
+
* @throws {Error} When no active page context exists
|
|
2180
|
+
*/
|
|
2181
|
+
pause(message?: string): Promise<void>;
|
|
2182
|
+
/**
|
|
2183
|
+
* Executes a custom function or expression within the current page context.
|
|
2184
|
+
*
|
|
2185
|
+
* @remarks
|
|
2186
|
+
* This is a powerful action that allows running custom logic to interact with the DOM,
|
|
2187
|
+
* calculate values, or trigger navigations.
|
|
2188
|
+
*
|
|
2189
|
+
* - In **Browser Mode**, it runs in the real browser.
|
|
2190
|
+
* - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
|
|
2191
|
+
*
|
|
2192
|
+
* The action handles automatic navigation if `window.location` is modified.
|
|
2193
|
+
*
|
|
2194
|
+
* @param params - Configuration for the execution, including the function and arguments.
|
|
2195
|
+
* @returns A promise resolving to the result of the execution.
|
|
2196
|
+
* @throws {Error} If no active page context exists or if execution fails.
|
|
2197
|
+
*
|
|
2198
|
+
* @see {@link EvaluateActionOptions} for detailed parameter options and examples.
|
|
2199
|
+
*/
|
|
2200
|
+
evaluate(params: EvaluateActionOptions): Promise<any>;
|
|
2201
|
+
/**
|
|
2202
|
+
* Extracts structured data from the current page content.
|
|
2203
|
+
*
|
|
2204
|
+
* @param schema - An object defining the data to extract.
|
|
2205
|
+
* @returns A promise that resolves to an object with the extracted data.
|
|
2206
|
+
*/
|
|
2207
|
+
extract<T>(schema: ExtractSchema): Promise<T>;
|
|
2208
|
+
/**
|
|
2209
|
+
* Gets the unique identifier of this engine implementation.
|
|
2210
|
+
*/
|
|
2211
|
+
get id(): string;
|
|
2212
|
+
/**
|
|
2213
|
+
* Returns the current state of the engine (cookies)
|
|
2214
|
+
* that can be used to restore the session later.
|
|
2215
|
+
*/
|
|
2216
|
+
getState(): Promise<{
|
|
2217
|
+
cookies: Cookie[];
|
|
2218
|
+
sessionState?: any;
|
|
2219
|
+
}>;
|
|
2220
|
+
/**
|
|
2221
|
+
* Gets the execution mode of this engine (`'http'` or `'browser'`).
|
|
2241
2222
|
*/
|
|
2242
|
-
|
|
2243
|
-
}
|
|
2244
|
-
interface BaseFetcherProperties {
|
|
2223
|
+
get mode(): FetchEngineType;
|
|
2245
2224
|
/**
|
|
2246
|
-
*
|
|
2247
|
-
*
|
|
2248
|
-
* - `http`: 使用 HTTP 进行抓取
|
|
2249
|
-
* - `browser`: 使用浏览器进行抓取
|
|
2250
|
-
* - `auto`: auto 会走“智能探测”选择 http 或 browser, 但是如果没有启用 smart,并且在站点注册表中没有,那么则等价为 http.
|
|
2225
|
+
* Gets the fetch engine context associated with this instance.
|
|
2251
2226
|
*/
|
|
2252
|
-
|
|
2253
|
-
enableSmart?: boolean;
|
|
2254
|
-
useSiteRegistry?: boolean;
|
|
2255
|
-
antibot?: boolean;
|
|
2256
|
-
debug?: boolean | string | string[];
|
|
2257
|
-
headers?: Record<string, string>;
|
|
2258
|
-
cookies?: Cookie[];
|
|
2259
|
-
sessionState?: any;
|
|
2260
|
-
sessionPoolOptions?: SessionPoolOptions;
|
|
2261
|
-
overrideSessionState?: boolean;
|
|
2262
|
-
throwHttpErrors?: boolean;
|
|
2263
|
-
output?: {
|
|
2264
|
-
cookies?: boolean;
|
|
2265
|
-
sessionState?: boolean;
|
|
2266
|
-
};
|
|
2267
|
-
proxy?: string | string[];
|
|
2268
|
-
blockResources?: ResourceType[];
|
|
2227
|
+
get context(): FetchEngineContext | undefined;
|
|
2269
2228
|
/**
|
|
2270
|
-
*
|
|
2229
|
+
* Initializes the fetch engine with provided context and options.
|
|
2230
|
+
*
|
|
2231
|
+
* @param context - Fetch engine context
|
|
2232
|
+
* @param options - Configuration options
|
|
2233
|
+
* @returns Promise resolving when initialization completes
|
|
2234
|
+
*
|
|
2235
|
+
* @remarks
|
|
2236
|
+
* Sets up internal state and calls implementation-specific [_initialize](file:///home/riceball/Documents/mywork/public/@isdk/ai-tools/packages/web-fetcher/src/engine/cheerio.ts#L169-L204) method.
|
|
2237
|
+
* Automatically called when creating engine via `FetchEngine.create()`.
|
|
2271
2238
|
*/
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
browser?: {
|
|
2275
|
-
/**
|
|
2276
|
-
* 浏览器引擎,默认为 playwright
|
|
2277
|
-
*
|
|
2278
|
-
* - `playwright`: 使用 Playwright 引擎
|
|
2279
|
-
* - `puppeteer`: 使用 Puppeteer 引擎
|
|
2280
|
-
*/
|
|
2281
|
-
engine?: BrowserEngine;
|
|
2282
|
-
headless?: boolean;
|
|
2283
|
-
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
2284
|
-
launchOptions?: Record<string, any>;
|
|
2285
|
-
};
|
|
2286
|
-
http?: {
|
|
2287
|
-
method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
|
|
2288
|
-
body?: any;
|
|
2289
|
-
};
|
|
2290
|
-
timeoutMs?: number;
|
|
2291
|
-
requestHandlerTimeoutSecs?: number;
|
|
2292
|
-
maxConcurrency?: number;
|
|
2293
|
-
maxRequestsPerMinute?: number;
|
|
2294
|
-
delayBetweenRequestsMs?: number;
|
|
2295
|
-
retries?: number;
|
|
2296
|
-
sites?: FetchSite[];
|
|
2297
|
-
url?: string;
|
|
2298
|
-
}
|
|
2299
|
-
interface FetchSite extends BaseFetcherProperties {
|
|
2300
|
-
domain: string;
|
|
2301
|
-
pathScope?: string[];
|
|
2302
|
-
meta?: {
|
|
2303
|
-
updatedAt?: number;
|
|
2304
|
-
ttlMs?: number;
|
|
2305
|
-
source?: 'manual' | 'smart';
|
|
2306
|
-
};
|
|
2307
|
-
}
|
|
2308
|
-
type OnFetchPauseCallback = (options: {
|
|
2309
|
-
message?: string;
|
|
2310
|
-
}) => Promise<void>;
|
|
2311
|
-
interface FetcherOptions extends BaseFetcherProperties {
|
|
2312
|
-
actions?: FetchActionOptions[];
|
|
2313
|
-
onPause?: OnFetchPauseCallback;
|
|
2314
|
-
}
|
|
2315
|
-
interface FetchMetadata {
|
|
2316
|
-
mode: FetchEngineType;
|
|
2317
|
-
engine?: BrowserEngine;
|
|
2318
|
-
timings?: {
|
|
2319
|
-
start: number;
|
|
2320
|
-
total: number;
|
|
2321
|
-
ttfb?: number;
|
|
2322
|
-
dns?: number;
|
|
2323
|
-
tcp?: number;
|
|
2324
|
-
firstByte?: number;
|
|
2325
|
-
download?: number;
|
|
2326
|
-
};
|
|
2327
|
-
proxy?: string;
|
|
2328
|
-
[key: string]: any;
|
|
2329
|
-
}
|
|
2330
|
-
interface FetchResponse {
|
|
2331
|
-
url: string;
|
|
2332
|
-
finalUrl: string;
|
|
2333
|
-
statusCode?: number;
|
|
2334
|
-
statusText?: string;
|
|
2335
|
-
headers: Record<string, string>;
|
|
2336
|
-
contentType?: string;
|
|
2337
|
-
body?: string | Buffer<ArrayBufferLike>;
|
|
2338
|
-
html?: string;
|
|
2339
|
-
text?: string;
|
|
2340
|
-
json?: any;
|
|
2341
|
-
cookies?: Cookie[];
|
|
2342
|
-
sessionState?: any;
|
|
2343
|
-
metadata?: FetchMetadata;
|
|
2344
|
-
}
|
|
2345
|
-
declare const DefaultFetcherProperties: BaseFetcherProperties;
|
|
2346
|
-
declare const FetcherOptionKeys: string[];
|
|
2347
|
-
|
|
2348
|
-
/**
|
|
2349
|
-
* Represents a stateful web fetching session.
|
|
2350
|
-
*
|
|
2351
|
-
* @remarks
|
|
2352
|
-
* A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
|
|
2353
|
-
* cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
|
|
2354
|
-
* session-level configurations and outputs.
|
|
2355
|
-
*
|
|
2356
|
-
* Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
|
|
2357
|
-
*/
|
|
2358
|
-
declare class FetchSession {
|
|
2359
|
-
protected options: FetcherOptions;
|
|
2239
|
+
initialize(context: FetchEngineContext, options?: BaseFetcherProperties): Promise<void>;
|
|
2240
|
+
cleanup(): Promise<void>;
|
|
2360
2241
|
/**
|
|
2361
|
-
*
|
|
2242
|
+
* Gets the initial scope for extraction for the specific engine.
|
|
2243
|
+
* @param context - Crawlee crawling context
|
|
2244
|
+
* @internal
|
|
2362
2245
|
*/
|
|
2363
|
-
|
|
2246
|
+
protected abstract _getInitialElementScope(context: TContext): FetchElementScope;
|
|
2364
2247
|
/**
|
|
2365
|
-
*
|
|
2248
|
+
* Unified action processor that handles engine-agnostic actions.
|
|
2249
|
+
* @param context - Crawlee crawling context
|
|
2250
|
+
* @param action - Action to execute
|
|
2251
|
+
* @internal
|
|
2366
2252
|
*/
|
|
2367
|
-
|
|
2368
|
-
protected
|
|
2253
|
+
protected _processAction(context: TContext, action: FetchEngineAction): Promise<any>;
|
|
2254
|
+
protected _handlePause(action: {
|
|
2255
|
+
message?: string;
|
|
2256
|
+
}): Promise<void>;
|
|
2369
2257
|
/**
|
|
2370
|
-
*
|
|
2258
|
+
* Executes all pending fetch engine actions within the current Crawlee request handler context.
|
|
2371
2259
|
*
|
|
2372
|
-
*
|
|
2260
|
+
* **Critical Execution Constraint**: This method **MUST** be awaited within the synchronous execution path
|
|
2261
|
+
* of Crawlee's [requestHandler](https://crawlee.dev/js/api/basic-crawler) (before any `await` that yields control back to the event loop).
|
|
2262
|
+
*
|
|
2263
|
+
* ### Why This Constraint Exists
|
|
2264
|
+
* - Crawlee's page context ([PlaywrightCrawler](https://crawlee.dev/js/api/playwright-crawler)'s `page` or [CheerioCrawler](https://crawlee.dev/js/api/cheerio-crawler)'s `$`)
|
|
2265
|
+
* is **only valid during the synchronous execution phase** of the request handler
|
|
2266
|
+
* - After any `await` (e.g., `await page.goto()`), the page context may be destroyed
|
|
2267
|
+
* due to Crawlee's internal resource management
|
|
2268
|
+
*
|
|
2269
|
+
* ### How It Works
|
|
2270
|
+
* 1. Processes all actions queued via {@link dispatchAction} (click, fill, submit, etc.)
|
|
2271
|
+
* 2. Maintains the page context validity window via {@link isPageActive} lifecycle flag
|
|
2272
|
+
* 3. Automatically cleans up event listeners upon completion
|
|
2273
|
+
*
|
|
2274
|
+
* Usage see {@link _sharedRequestHandler}
|
|
2275
|
+
* @see {@link _sharedRequestHandler}
|
|
2276
|
+
* @param context The active Crawlee crawling context containing the page/$ object
|
|
2277
|
+
* @throws {Error} If called outside valid page context window (`!this.isPageActive`)
|
|
2278
|
+
* @internal Engine infrastructure method - not for direct consumer use
|
|
2373
2279
|
*/
|
|
2374
|
-
|
|
2375
|
-
protected
|
|
2280
|
+
protected _executePendingActions(context: TContext): Promise<void>;
|
|
2281
|
+
protected _sharedRequestHandler(context: TContext): Promise<void>;
|
|
2282
|
+
protected _sharedFailedRequestHandler(context: TContext & {
|
|
2283
|
+
response?: FetchResponse;
|
|
2284
|
+
body?: string | Buffer;
|
|
2285
|
+
}, error?: Error): Promise<void>;
|
|
2286
|
+
protected dispatchAction<T>(action: FetchEngineAction): Promise<T>;
|
|
2287
|
+
private _requestHandler;
|
|
2288
|
+
private _failedRequestHandler;
|
|
2289
|
+
protected _commonCleanup(): Promise<void>;
|
|
2376
2290
|
/**
|
|
2377
|
-
*
|
|
2291
|
+
* Blocks specified resource types from loading.
|
|
2378
2292
|
*
|
|
2379
|
-
* @param
|
|
2380
|
-
* @param
|
|
2381
|
-
* @returns
|
|
2382
|
-
* @template R - The expected return type of the action.
|
|
2293
|
+
* @param types - Resource types to block
|
|
2294
|
+
* @param overwrite - Whether to replace existing blocked types
|
|
2295
|
+
* @returns Number of blocked resource types
|
|
2383
2296
|
*
|
|
2384
2297
|
* @example
|
|
2385
2298
|
* ```ts
|
|
2386
|
-
* await
|
|
2299
|
+
* await engine.blockResources(['image', 'stylesheet']);
|
|
2300
|
+
* await engine.blockResources(['script'], true); // Replace existing
|
|
2387
2301
|
* ```
|
|
2388
2302
|
*/
|
|
2389
|
-
|
|
2303
|
+
blockResources(types: ResourceType[], overwrite?: boolean): Promise<number>;
|
|
2390
2304
|
/**
|
|
2391
|
-
*
|
|
2305
|
+
* Gets content of current page.
|
|
2392
2306
|
*
|
|
2393
|
-
* @
|
|
2394
|
-
* @
|
|
2395
|
-
|
|
2396
|
-
|
|
2307
|
+
* @returns Promise resolving to fetch response
|
|
2308
|
+
* @throws {Error} When no content has been fetched yet
|
|
2309
|
+
*/
|
|
2310
|
+
getContent(): Promise<FetchResponse>;
|
|
2311
|
+
/**
|
|
2312
|
+
* Manages HTTP headers for requests with multiple overloads.
|
|
2313
|
+
*
|
|
2314
|
+
* @overload
|
|
2315
|
+
* Gets all headers.
|
|
2316
|
+
* @returns All headers as record
|
|
2317
|
+
*
|
|
2318
|
+
* @overload
|
|
2319
|
+
* Gets specific header value.
|
|
2320
|
+
* @param name - Header name
|
|
2321
|
+
* @returns Header value
|
|
2322
|
+
*
|
|
2323
|
+
* @overload
|
|
2324
|
+
* Sets multiple headers.
|
|
2325
|
+
* @param headers - Headers to set
|
|
2326
|
+
* @param replaced - Whether to replace all existing headers
|
|
2327
|
+
* @returns `true` if successful
|
|
2328
|
+
*
|
|
2329
|
+
* @overload
|
|
2330
|
+
* Sets single header.
|
|
2331
|
+
* @param name - Header name
|
|
2332
|
+
* @param value - Header value or `null` to remove
|
|
2333
|
+
* @returns `true` if successful
|
|
2397
2334
|
*
|
|
2398
2335
|
* @example
|
|
2399
2336
|
* ```ts
|
|
2400
|
-
* const
|
|
2401
|
-
*
|
|
2402
|
-
*
|
|
2403
|
-
*
|
|
2337
|
+
* const allHeaders = await engine.headers();
|
|
2338
|
+
* const userAgent = await engine.headers('user-agent');
|
|
2339
|
+
* await engine.headers({ 'x-custom': 'value' });
|
|
2340
|
+
* await engine.headers('auth', 'token');
|
|
2404
2341
|
* ```
|
|
2405
2342
|
*/
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
outputs: Record<string, any>;
|
|
2411
|
-
}>;
|
|
2343
|
+
headers(): Promise<Record<string, string>>;
|
|
2344
|
+
headers(name: string): Promise<string>;
|
|
2345
|
+
headers(headers: Record<string, string>, replaced?: boolean): Promise<boolean>;
|
|
2346
|
+
headers(name: string, value: string | null): Promise<boolean>;
|
|
2412
2347
|
/**
|
|
2413
|
-
*
|
|
2348
|
+
* Manages cookies for current session with multiple overloads.
|
|
2414
2349
|
*
|
|
2415
|
-
* @
|
|
2416
|
-
|
|
2417
|
-
|
|
2418
|
-
|
|
2419
|
-
*
|
|
2350
|
+
* @overload
|
|
2351
|
+
* Gets all cookies.
|
|
2352
|
+
* @returns Array of cookies
|
|
2353
|
+
*
|
|
2354
|
+
* @overload
|
|
2355
|
+
* Sets cookies for session.
|
|
2356
|
+
* @param cookies - Cookies to set
|
|
2357
|
+
* @returns `true` if successful
|
|
2420
2358
|
*
|
|
2421
|
-
* @
|
|
2359
|
+
* @example
|
|
2360
|
+
* ```ts
|
|
2361
|
+
* const cookies = await engine.cookies();
|
|
2362
|
+
* await engine.cookies([{ name: 'session', value: '123' }]);
|
|
2363
|
+
* ```
|
|
2422
2364
|
*/
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
sessionState?: any;
|
|
2426
|
-
} | undefined>;
|
|
2365
|
+
cookies(): Promise<Cookie[]>;
|
|
2366
|
+
cookies(cookies: Cookie[]): Promise<boolean>;
|
|
2427
2367
|
/**
|
|
2428
|
-
* Disposes of
|
|
2368
|
+
* Disposes of engine, cleaning up all resources.
|
|
2429
2369
|
*
|
|
2430
|
-
* @
|
|
2431
|
-
* This method should be called when the session is no longer needed to free up resources
|
|
2432
|
-
* (e.g., closing browser instances, purging temporary storage).
|
|
2370
|
+
* @returns Promise resolving when disposal completes
|
|
2433
2371
|
*/
|
|
2434
2372
|
dispose(): Promise<void>;
|
|
2435
|
-
private ensureEngine;
|
|
2436
|
-
protected createContext(options?: FetcherOptions): FetchContext;
|
|
2437
2373
|
}
|
|
2374
|
+
declare function getRandomDelay(base: number, variance?: number): number;
|
|
2438
2375
|
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
|
|
2376
|
+
type CheerioAPI = NonNullable<CheerioCrawlingContext['$']>;
|
|
2377
|
+
type CheerioSelection = ReturnType<CheerioAPI>;
|
|
2378
|
+
type CheerioNode = ReturnType<CheerioSelection['first']>;
|
|
2379
|
+
declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
|
|
2380
|
+
static readonly id = "cheerio";
|
|
2381
|
+
static readonly mode = "http";
|
|
2382
|
+
private _ensureCheerioContext;
|
|
2383
|
+
protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
|
|
2384
|
+
_querySelectorAll(scope: {
|
|
2385
|
+
$: CheerioAPI;
|
|
2386
|
+
el: any;
|
|
2387
|
+
} | any[], selector: string): Promise<FetchElementScope[]>;
|
|
2388
|
+
_nextSiblingsUntil(scope: {
|
|
2389
|
+
$: CheerioAPI;
|
|
2390
|
+
el: CheerioNode;
|
|
2391
|
+
}, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2392
|
+
_parentElement(scope: {
|
|
2393
|
+
$: CheerioAPI;
|
|
2394
|
+
el: CheerioNode;
|
|
2395
|
+
}): Promise<FetchElementScope | null>;
|
|
2396
|
+
_isSameElement(scope1: {
|
|
2397
|
+
el: CheerioNode;
|
|
2398
|
+
}, scope2: {
|
|
2399
|
+
el: CheerioNode;
|
|
2400
|
+
}): Promise<boolean>;
|
|
2401
|
+
_findClosestAncestor(scope: {
|
|
2402
|
+
$: CheerioAPI;
|
|
2403
|
+
el: CheerioNode;
|
|
2404
|
+
}, candidates: {
|
|
2405
|
+
$: CheerioAPI;
|
|
2406
|
+
el: CheerioNode;
|
|
2407
|
+
}[]): Promise<FetchElementScope | null>;
|
|
2408
|
+
_contains(container: {
|
|
2409
|
+
$: CheerioAPI;
|
|
2410
|
+
el: CheerioNode;
|
|
2411
|
+
}, element: {
|
|
2412
|
+
$: CheerioAPI;
|
|
2413
|
+
el: CheerioNode;
|
|
2414
|
+
}): Promise<boolean>;
|
|
2415
|
+
_findCommonAncestor(scope1: {
|
|
2416
|
+
$: CheerioAPI;
|
|
2417
|
+
el: CheerioNode;
|
|
2418
|
+
}, scope2: {
|
|
2419
|
+
$: CheerioAPI;
|
|
2420
|
+
el: CheerioNode;
|
|
2421
|
+
}): Promise<FetchElementScope | null>;
|
|
2422
|
+
_findContainerChild(element: {
|
|
2423
|
+
$: CheerioAPI;
|
|
2424
|
+
el: CheerioNode;
|
|
2425
|
+
}, container: {
|
|
2426
|
+
$: CheerioAPI;
|
|
2427
|
+
el: CheerioNode;
|
|
2428
|
+
}): Promise<FetchElementScope | null>;
|
|
2429
|
+
_extractValue(schema: ExtractValueSchema, scope: {
|
|
2430
|
+
$: CheerioAPI;
|
|
2431
|
+
el: CheerioNode;
|
|
2432
|
+
}): Promise<any>;
|
|
2433
|
+
protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
|
|
2434
|
+
protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2435
|
+
protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
|
|
2436
|
+
url: string;
|
|
2437
|
+
method: string;
|
|
2438
|
+
body?: any;
|
|
2439
|
+
headers?: Record<string, string>;
|
|
2440
|
+
}): Promise<any>;
|
|
2441
|
+
protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
|
|
2442
|
+
protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
|
|
2443
|
+
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
|
|
2444
|
+
goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
|
|
2445
|
+
}
|
|
2446
|
+
|
|
2447
|
+
type Page = NonNullable<PlaywrightCrawlingContext['page']>;
|
|
2448
|
+
type Locator = ReturnType<Page['locator']>;
|
|
2449
|
+
declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
|
|
2450
|
+
static readonly id = "playwright";
|
|
2451
|
+
static readonly mode = "browser";
|
|
2452
|
+
protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
|
|
2453
|
+
_querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
|
|
2454
|
+
_nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2455
|
+
_parentElement(scope: Locator): Promise<FetchElementScope | null>;
|
|
2456
|
+
_isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
|
|
2457
|
+
_findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
|
|
2458
|
+
_contains(container: Locator, element: Locator): Promise<boolean>;
|
|
2459
|
+
_findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
|
|
2460
|
+
_findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
|
|
2461
|
+
_extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
|
|
2462
|
+
protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
|
|
2463
|
+
protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
|
|
2464
|
+
protected currentMousePos: {
|
|
2465
|
+
x: number;
|
|
2466
|
+
y: number;
|
|
2467
|
+
};
|
|
2468
|
+
protected _sharedRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
|
|
2469
|
+
protected mouseInitialized: boolean;
|
|
2470
|
+
protected _initializeMousePos(page: Page): Promise<void>;
|
|
2471
|
+
protected _getTrajectory(start: {
|
|
2472
|
+
x: number;
|
|
2473
|
+
y: number;
|
|
2474
|
+
}, end: {
|
|
2475
|
+
x: number;
|
|
2476
|
+
y: number;
|
|
2477
|
+
}, steps?: number): {
|
|
2478
|
+
x: number;
|
|
2479
|
+
y: number;
|
|
2480
|
+
}[];
|
|
2481
|
+
protected _moveToPos(context: PlaywrightCrawlingContext, target: {
|
|
2482
|
+
x: number;
|
|
2483
|
+
y: number;
|
|
2484
|
+
}, steps?: number): Promise<{
|
|
2485
|
+
x: number;
|
|
2486
|
+
y: number;
|
|
2487
|
+
}>;
|
|
2488
|
+
protected _ensureVisible(context: PlaywrightCrawlingContext, selector: string): Promise<{
|
|
2489
|
+
x: number;
|
|
2490
|
+
y: number;
|
|
2491
|
+
}>;
|
|
2492
|
+
protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
|
|
2493
|
+
x: number;
|
|
2494
|
+
y: number;
|
|
2495
|
+
}>;
|
|
2496
|
+
protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2497
|
+
protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
|
|
2498
|
+
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
|
|
2499
|
+
goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
|
|
2500
|
+
}
|
|
2501
|
+
|
|
2502
|
+
type FetchActionCapabilities = {
|
|
2503
|
+
[mode in FetchEngineType]?: FetchActionCapabilityMode;
|
|
2504
|
+
};
|
|
2505
|
+
declare abstract class FetchAction {
|
|
2506
|
+
private static registry;
|
|
2507
|
+
static register(actionClass: any): void;
|
|
2508
|
+
static get(id: string): any | undefined;
|
|
2509
|
+
static create(id: FetchActionOptions): FetchAction | undefined;
|
|
2510
|
+
static create(id: string): FetchAction | undefined;
|
|
2511
|
+
static has(name: string): boolean;
|
|
2512
|
+
static list(): string[];
|
|
2513
|
+
static id: string;
|
|
2514
|
+
static returnType: FetchReturnType;
|
|
2515
|
+
static capabilities: FetchActionCapabilities;
|
|
2516
|
+
static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2517
|
+
getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2518
|
+
get id(): string;
|
|
2519
|
+
get returnType(): FetchReturnType;
|
|
2520
|
+
get capabilities(): FetchActionCapabilities;
|
|
2521
|
+
protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2522
|
+
protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2523
|
+
abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
|
|
2524
|
+
protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
|
|
2525
|
+
protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
|
|
2460
2526
|
/**
|
|
2461
|
-
*
|
|
2462
|
-
*
|
|
2463
|
-
* @param options - Configuration options for the session, merged with defaults.
|
|
2464
|
-
* @returns A promise resolving to a new FetchSession instance.
|
|
2527
|
+
* Action 开始生命周期
|
|
2528
|
+
* 负责:初始化 stack、设置 currentAction、触发事件、调用钩子
|
|
2465
2529
|
*/
|
|
2466
|
-
|
|
2530
|
+
beforeExec(context: FetchContext, options?: FetchActionProperties): Promise<{
|
|
2531
|
+
entry: Required<Pick<FetchActionProperties, "action">> & Partial<Pick<FetchActionProperties, "id" | "name">> & {
|
|
2532
|
+
[x: string]: any;
|
|
2533
|
+
collectors?: BaseFetchCollectorOptions[] | undefined;
|
|
2534
|
+
index?: number | undefined;
|
|
2535
|
+
params?: any;
|
|
2536
|
+
args?: any;
|
|
2537
|
+
storeAs?: string | undefined;
|
|
2538
|
+
failOnError?: boolean | undefined;
|
|
2539
|
+
failOnTimeout?: boolean | undefined;
|
|
2540
|
+
timeoutMs?: number | undefined;
|
|
2541
|
+
maxRetries?: number | undefined;
|
|
2542
|
+
} & {
|
|
2543
|
+
index?: number;
|
|
2544
|
+
error?: Error;
|
|
2545
|
+
depth?: number;
|
|
2546
|
+
};
|
|
2547
|
+
collectors: CollectorsRuntime | undefined;
|
|
2548
|
+
}>;
|
|
2467
2549
|
/**
|
|
2468
|
-
*
|
|
2469
|
-
*
|
|
2470
|
-
* @remarks
|
|
2471
|
-
* This method automatically creates a session, executes the specified actions,
|
|
2472
|
-
* retrieves the content, and disposes of the session.
|
|
2473
|
-
*
|
|
2474
|
-
* @param url - The target URL or a complete FetcherOptions object.
|
|
2475
|
-
* @param options - Additional options when the first parameter is a URL string.
|
|
2476
|
-
* @returns A promise resolving to the final response and any extracted outputs.
|
|
2550
|
+
* Action 结束生命周期
|
|
2551
|
+
* 负责:调用钩子、赋值lastResult, 触发事件、清理 stack、恢复 currentAction
|
|
2477
2552
|
*/
|
|
2478
|
-
|
|
2479
|
-
|
|
2480
|
-
|
|
2481
|
-
}>;
|
|
2482
|
-
|
|
2483
|
-
result: FetchResponse | undefined;
|
|
2484
|
-
outputs: Record<string, any>;
|
|
2485
|
-
}>;
|
|
2553
|
+
afterExec(context: FetchContext, options?: BaseFetchCollectorActionProperties, result?: FetchActionResult, scope?: {
|
|
2554
|
+
entry: FetchActionInContext;
|
|
2555
|
+
collectors?: CollectorsRuntime;
|
|
2556
|
+
}): Promise<void>;
|
|
2557
|
+
execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
|
|
2486
2558
|
}
|
|
2559
|
+
type CollectorsRuntime = {
|
|
2560
|
+
cleanup: () => void;
|
|
2561
|
+
awaitExecPendings: () => Promise<void>;
|
|
2562
|
+
};
|
|
2487
2563
|
|
|
2488
2564
|
declare class ClickAction extends FetchAction {
|
|
2489
2565
|
static id: string;
|
|
@@ -2644,6 +2720,53 @@ declare class MouseClickAction extends FetchAction {
|
|
|
2644
2720
|
};
|
|
2645
2721
|
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2646
2722
|
}
|
|
2723
|
+
interface ScrollIntoViewParams {
|
|
2724
|
+
selector: string;
|
|
2725
|
+
}
|
|
2726
|
+
declare class ScrollIntoViewAction extends FetchAction {
|
|
2727
|
+
static id: string;
|
|
2728
|
+
static returnType: "none";
|
|
2729
|
+
static capabilities: {
|
|
2730
|
+
http: "noop";
|
|
2731
|
+
browser: "native";
|
|
2732
|
+
};
|
|
2733
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2734
|
+
}
|
|
2735
|
+
interface MouseWheelParams {
|
|
2736
|
+
/**
|
|
2737
|
+
* Target X coordinate for the mouse wheel event.
|
|
2738
|
+
*/
|
|
2739
|
+
x?: number;
|
|
2740
|
+
/**
|
|
2741
|
+
* Target Y coordinate for the mouse wheel event.
|
|
2742
|
+
*/
|
|
2743
|
+
y?: number;
|
|
2744
|
+
/**
|
|
2745
|
+
* Selector for the element to scroll. If provided, mouse will move to this element before scrolling.
|
|
2746
|
+
*/
|
|
2747
|
+
selector?: string;
|
|
2748
|
+
/**
|
|
2749
|
+
* Horizontal scroll delta.
|
|
2750
|
+
*/
|
|
2751
|
+
deltaX?: number;
|
|
2752
|
+
/**
|
|
2753
|
+
* Vertical scroll delta.
|
|
2754
|
+
*/
|
|
2755
|
+
deltaY?: number;
|
|
2756
|
+
/**
|
|
2757
|
+
* Number of steps to split the scroll into for simulating human-like behavior.
|
|
2758
|
+
*/
|
|
2759
|
+
steps?: number;
|
|
2760
|
+
}
|
|
2761
|
+
declare class MouseWheelAction extends FetchAction {
|
|
2762
|
+
static id: string;
|
|
2763
|
+
static returnType: "none";
|
|
2764
|
+
static capabilities: {
|
|
2765
|
+
http: "noop";
|
|
2766
|
+
browser: "native";
|
|
2767
|
+
};
|
|
2768
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2769
|
+
}
|
|
2647
2770
|
|
|
2648
2771
|
interface KeyboardTypeParams {
|
|
2649
2772
|
text: string;
|
|
@@ -2681,4 +2804,4 @@ declare function fetchWeb(url: string, options?: FetcherOptions): Promise<{
|
|
|
2681
2804
|
outputs: Record<string, any>;
|
|
2682
2805
|
}>;
|
|
2683
2806
|
|
|
2684
|
-
export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb };
|
|
2807
|
+
export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EngineUpgradeError, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionMeta, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, MouseWheelAction, type MouseWheelParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, ScrollIntoViewAction, type ScrollIntoViewParams, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb, getRandomDelay };
|