@isdk/web-fetcher 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.cn.md +19 -0
- package/README.engine.cn.md +34 -6
- package/README.engine.md +29 -1
- package/README.md +21 -1
- package/dist/index.d.mts +1515 -1490
- package/dist/index.d.ts +1515 -1490
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/docs/README.md +21 -1
- package/docs/_media/README.cn.md +19 -0
- package/docs/_media/README.engine.md +29 -1
- package/docs/classes/CheerioFetchEngine.md +95 -95
- package/docs/classes/ClickAction.md +29 -29
- package/docs/classes/EngineUpgradeError.md +335 -0
- package/docs/classes/EvaluateAction.md +29 -29
- package/docs/classes/ExtractAction.md +29 -29
- package/docs/classes/FetchAction.md +29 -29
- package/docs/classes/FetchEngine.md +93 -93
- package/docs/classes/FetchSession.md +14 -14
- package/docs/classes/FillAction.md +29 -29
- package/docs/classes/GetContentAction.md +29 -29
- package/docs/classes/GotoAction.md +29 -29
- package/docs/classes/KeyboardPressAction.md +29 -29
- package/docs/classes/KeyboardTypeAction.md +29 -29
- package/docs/classes/MouseClickAction.md +29 -29
- package/docs/classes/MouseMoveAction.md +29 -29
- package/docs/classes/MouseWheelAction.md +29 -29
- package/docs/classes/PauseAction.md +29 -29
- package/docs/classes/PlaywrightFetchEngine.md +101 -101
- package/docs/classes/ScrollIntoViewAction.md +29 -29
- package/docs/classes/SubmitAction.md +29 -29
- package/docs/classes/TrimAction.md +29 -29
- package/docs/classes/WaitForAction.md +29 -29
- package/docs/classes/WebFetcher.md +5 -5
- package/docs/enumerations/FetchActionResultStatus.md +4 -4
- package/docs/functions/fetchWeb.md +2 -2
- package/docs/functions/getRandomDelay.md +1 -1
- package/docs/globals.md +3 -1
- package/docs/interfaces/BaseFetchActionProperties.md +13 -13
- package/docs/interfaces/BaseFetchCollectorActionProperties.md +17 -17
- package/docs/interfaces/BaseFetcherProperties.md +44 -28
- package/docs/interfaces/DispatchedEngineAction.md +4 -4
- package/docs/interfaces/EvaluateActionOptions.md +3 -3
- package/docs/interfaces/ExtractActionProperties.md +13 -13
- package/docs/interfaces/FetchActionMeta.md +73 -0
- package/docs/interfaces/FetchActionProperties.md +15 -19
- package/docs/interfaces/FetchActionResult.md +7 -7
- package/docs/interfaces/FetchContext.md +65 -41
- package/docs/interfaces/FetchEngineContext.md +57 -33
- package/docs/interfaces/FetchMetadata.md +5 -5
- package/docs/interfaces/FetchResponse.md +14 -14
- package/docs/interfaces/FetchReturnTypeRegistry.md +7 -7
- package/docs/interfaces/FetchSite.md +55 -31
- package/docs/interfaces/FetcherOptions.md +55 -31
- package/docs/interfaces/GotoActionOptions.md +8 -8
- package/docs/interfaces/KeyboardPressParams.md +3 -3
- package/docs/interfaces/KeyboardTypeParams.md +3 -3
- package/docs/interfaces/MouseClickParams.md +6 -6
- package/docs/interfaces/MouseMoveParams.md +5 -5
- package/docs/interfaces/MouseWheelParams.md +7 -7
- package/docs/interfaces/PendingEngineRequest.md +3 -3
- package/docs/interfaces/ScrollIntoViewParams.md +2 -2
- package/docs/interfaces/StorageOptions.md +5 -5
- package/docs/interfaces/SubmitActionOptions.md +2 -2
- package/docs/interfaces/TrimActionOptions.md +3 -3
- package/docs/interfaces/WaitForActionOptions.md +5 -5
- package/docs/type-aliases/BaseFetchActionOptions.md +1 -1
- package/docs/type-aliases/BaseFetchCollectorOptions.md +1 -1
- package/docs/type-aliases/BrowserEngine.md +1 -1
- package/docs/type-aliases/FetchActionCapabilities.md +1 -1
- package/docs/type-aliases/FetchActionCapabilityMode.md +1 -1
- package/docs/type-aliases/FetchActionInContext.md +38 -0
- package/docs/type-aliases/FetchActionOptions.md +1 -1
- package/docs/type-aliases/FetchEngineAction.md +1 -1
- package/docs/type-aliases/FetchEngineType.md +1 -1
- package/docs/type-aliases/FetchReturnType.md +1 -1
- package/docs/type-aliases/FetchReturnTypeFor.md +1 -1
- package/docs/type-aliases/OnFetchPauseCallback.md +1 -1
- package/docs/type-aliases/ResourceType.md +1 -1
- package/docs/type-aliases/TrimPreset.md +1 -1
- package/docs/variables/DefaultFetcherProperties.md +1 -1
- package/docs/variables/FetcherOptionKeys.md +1 -1
- package/docs/variables/TRIM_PRESETS.md +1 -1
- package/package.json +1 -1
- package/docs/interfaces/FetchActionInContext.md +0 -190
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration,
|
|
1
|
+
import { Cookie, SessionPoolOptions, CrawlingContext, BasicCrawler, BasicCrawlerOptions, FinalStatistics, Configuration, RequestQueue, KeyValueStore, ProxyConfiguration, Session, CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions, PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions } from 'crawlee';
|
|
2
2
|
export { Cookie } from 'crawlee';
|
|
3
3
|
import { EventEmitter } from 'events-ex';
|
|
4
4
|
|
|
@@ -731,1810 +731,1835 @@ type _RequireAtLeastOne<
|
|
|
731
731
|
Except<ObjectType, KeysType>;
|
|
732
732
|
|
|
733
733
|
/**
|
|
734
|
-
* Represents the
|
|
735
|
-
* It acts as the target for extraction and interaction actions.
|
|
736
|
-
*/
|
|
737
|
-
type FetchElementScope = any;
|
|
738
|
-
/**
|
|
739
|
-
* Interface representing the minimal engine capabilities required for extraction.
|
|
734
|
+
* Represents the state of an action being executed within a context.
|
|
740
735
|
*
|
|
741
736
|
* @remarks
|
|
742
|
-
*
|
|
743
|
-
*
|
|
744
|
-
* regarding scope handling (Element vs Array of Elements) and DOM traversal.
|
|
737
|
+
* Extends the basic action properties with runtime metadata like execution index,
|
|
738
|
+
* nesting depth, and any errors encountered during execution.
|
|
745
739
|
*/
|
|
746
|
-
|
|
747
|
-
/**
|
|
748
|
-
* Finds all elements matching the selector within the given scope.
|
|
749
|
-
*
|
|
750
|
-
* @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
|
|
751
|
-
* @param selector - The CSS selector to match.
|
|
752
|
-
* @returns A promise resolving to an array of found element scopes.
|
|
753
|
-
*
|
|
754
|
-
* @remarks
|
|
755
|
-
* **Behavior Contract:**
|
|
756
|
-
* 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
|
|
757
|
-
* 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
|
|
758
|
-
* 3. **Array Scope**: If `scope` is an array:
|
|
759
|
-
* - It MUST process elements in the order they appear in the array (which should match document order).
|
|
760
|
-
* - It MUST perform the check (Self + Descendants) for *each* element in the array.
|
|
761
|
-
* - It MUST flatten the results into a single array.
|
|
762
|
-
* - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
|
|
763
|
-
* but generally, preserving document order is the priority.
|
|
764
|
-
*/
|
|
765
|
-
_querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
740
|
+
type FetchActionInContext = FetchActionOptions & {
|
|
766
741
|
/**
|
|
767
|
-
*
|
|
768
|
-
*
|
|
769
|
-
* @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
|
|
770
|
-
* @param scope - The specific element to extract data from.
|
|
771
|
-
* @returns A promise resolving to the extracted value (string, number, boolean, or null).
|
|
772
|
-
*
|
|
773
|
-
* @remarks
|
|
774
|
-
* **Behavior Contract:**
|
|
775
|
-
* - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
|
|
776
|
-
* - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
|
|
777
|
-
* - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
|
|
778
|
-
* - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
|
|
779
|
-
* - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
|
|
742
|
+
* The 0-based index of the action in the execution sequence.
|
|
780
743
|
*/
|
|
781
|
-
|
|
744
|
+
index?: number;
|
|
782
745
|
/**
|
|
783
|
-
*
|
|
784
|
-
*
|
|
785
|
-
* @param scope - The element to find the parent of.
|
|
786
|
-
* @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
|
|
746
|
+
* Error encountered during action execution, if any.
|
|
787
747
|
*/
|
|
788
|
-
|
|
748
|
+
error?: Error;
|
|
789
749
|
/**
|
|
790
|
-
*
|
|
791
|
-
*
|
|
792
|
-
* @param scope1 - The first element scope.
|
|
793
|
-
* @param scope2 - The second element scope.
|
|
794
|
-
* @returns A promise resolving to `true` if they are the same node, `false` otherwise.
|
|
795
|
-
*
|
|
796
|
-
* @remarks
|
|
797
|
-
* This comparison MUST be identity-based, not just content-based.
|
|
750
|
+
* The nesting depth of the action. Top-level actions (executed directly by the session) have a depth of 0.
|
|
798
751
|
*/
|
|
799
|
-
|
|
752
|
+
depth?: number;
|
|
753
|
+
};
|
|
754
|
+
/**
|
|
755
|
+
* Base internal state used by fetch engines to maintain their runtime environment.
|
|
756
|
+
*
|
|
757
|
+
* @internal
|
|
758
|
+
*/
|
|
759
|
+
interface BaseFetchContextInteralState {
|
|
800
760
|
/**
|
|
801
|
-
*
|
|
802
|
-
*
|
|
803
|
-
* @param scope - The anchor element (starting point). The returned list starts *after* this element.
|
|
804
|
-
* @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
|
|
805
|
-
* If omitted or null, returns all following siblings.
|
|
806
|
-
* @returns A promise resolving to an array of sibling element scopes.
|
|
807
|
-
*
|
|
808
|
-
* @remarks
|
|
809
|
-
* **Behavior Contract:**
|
|
810
|
-
* - **Starting Point**: The `scope` element itself IS NOT included in the result.
|
|
811
|
-
* - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
|
|
812
|
-
* - **Direction**: Only scans *following* siblings (next siblings).
|
|
813
|
-
* - **Flattening**: The result is a flat list of siblings, not a nested structure.
|
|
761
|
+
* The active engine instance (e.g., CheerioFetchEngine or PlaywrightFetchEngine)
|
|
762
|
+
* associated with this context.
|
|
814
763
|
*/
|
|
815
|
-
|
|
764
|
+
engine?: any;
|
|
816
765
|
/**
|
|
817
|
-
*
|
|
818
|
-
*
|
|
819
|
-
* @param scope - The starting element from which to ascend the DOM tree.
|
|
820
|
-
* @param candidates - An array of potential ancestor elements to check against.
|
|
821
|
-
* @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
|
|
822
|
-
*
|
|
823
|
-
* @remarks
|
|
824
|
-
* **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
|
|
825
|
-
* It effectively answers: "Which of these container candidates does my current element belong to?"
|
|
826
|
-
*
|
|
827
|
-
* **Implementation Guidelines**:
|
|
828
|
-
* - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
|
|
829
|
-
* - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
|
|
766
|
+
* Additional implementation-specific internal state.
|
|
830
767
|
*/
|
|
831
|
-
|
|
768
|
+
[key: string]: any;
|
|
769
|
+
}
|
|
770
|
+
/**
|
|
771
|
+
* Extended internal state for the fetch context, including action lifecycle management.
|
|
772
|
+
*
|
|
773
|
+
* @internal
|
|
774
|
+
*/
|
|
775
|
+
interface FetchContextInteralState extends BaseFetchContextInteralState {
|
|
832
776
|
/**
|
|
833
|
-
*
|
|
834
|
-
*
|
|
835
|
-
* @param container - The potential ancestor element.
|
|
836
|
-
* @param element - The potential descendant element.
|
|
837
|
-
* @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
|
|
838
|
-
*
|
|
839
|
-
* @remarks
|
|
840
|
-
* **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
|
|
841
|
-
*
|
|
842
|
-
* @performance-critical Used extensively in boundary checks for Segmented extraction.
|
|
843
|
-
* - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
|
|
844
|
-
* - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
|
|
777
|
+
* Stack of actions currently being executed, used to manage nested action calls.
|
|
845
778
|
*/
|
|
846
|
-
|
|
779
|
+
actionStack?: FetchActionInContext[];
|
|
847
780
|
/**
|
|
848
|
-
*
|
|
849
|
-
*
|
|
850
|
-
* @param scope1 - The first element.
|
|
851
|
-
* @param scope2 - The second element.
|
|
852
|
-
* @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
|
|
853
|
-
*
|
|
854
|
-
* @remarks
|
|
855
|
-
* This is a fundamental tree operation used to find the point where two element paths diverge.
|
|
856
|
-
* **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
|
|
781
|
+
* Global counter for actions executed within the session, used to assign auto-incrementing indices.
|
|
857
782
|
*/
|
|
858
|
-
|
|
783
|
+
actionIndex?: number;
|
|
784
|
+
}
|
|
785
|
+
/**
|
|
786
|
+
* Context provided to the Fetch Engine during navigation and request handling.
|
|
787
|
+
*
|
|
788
|
+
* @remarks
|
|
789
|
+
* This interface contains the minimum set of properties required by an engine
|
|
790
|
+
* to perform a fetch operation and build a response.
|
|
791
|
+
*/
|
|
792
|
+
interface FetchEngineContext extends BaseFetcherProperties {
|
|
859
793
|
/**
|
|
860
|
-
*
|
|
861
|
-
*
|
|
862
|
-
* @param element - The descendant element.
|
|
863
|
-
* @param container - The ancestor container.
|
|
864
|
-
* @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
|
|
865
|
-
*
|
|
866
|
-
* @remarks
|
|
867
|
-
* This method traverses up from `element` until it finds the node whose parent is `container`.
|
|
868
|
-
* **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
|
|
794
|
+
* Unique identifier for the session or request batch.
|
|
869
795
|
*/
|
|
870
|
-
|
|
796
|
+
id: string;
|
|
871
797
|
/**
|
|
872
|
-
*
|
|
873
|
-
* @param category - The category of the log message.
|
|
874
|
-
* @param args - Arguments to log.
|
|
798
|
+
* The target URL for the next navigation, if specified.
|
|
875
799
|
*/
|
|
876
|
-
|
|
877
|
-
}
|
|
878
|
-
/**
|
|
879
|
-
* Base configuration for all extraction schemas.
|
|
880
|
-
*/
|
|
881
|
-
interface BaseExtractSchema {
|
|
800
|
+
url?: string;
|
|
882
801
|
/**
|
|
883
|
-
*
|
|
884
|
-
* the containing object or array item will be skipped (or throw error in strict mode).
|
|
802
|
+
* The final URL after all redirects have been followed.
|
|
885
803
|
*/
|
|
886
|
-
|
|
804
|
+
finalUrl?: string;
|
|
887
805
|
/**
|
|
888
|
-
*
|
|
889
|
-
* If true, missing required fields will throw an error instead of being skipped.
|
|
806
|
+
* The standardized response object from the most recent navigation.
|
|
890
807
|
*/
|
|
891
|
-
|
|
808
|
+
lastResponse?: FetchResponse;
|
|
892
809
|
/**
|
|
893
|
-
*
|
|
894
|
-
* - Field Name: Uses the DOM element of a previously extracted field as the anchor.
|
|
895
|
-
* - CSS Selector: Re-queries the selector within the current context to find the anchor.
|
|
896
|
-
*
|
|
897
|
-
* Once anchored, the search scope for this field becomes the siblings following the anchor.
|
|
810
|
+
* The result object from the most recent action execution.
|
|
898
811
|
*/
|
|
899
|
-
|
|
812
|
+
lastResult?: FetchActionResult;
|
|
900
813
|
/**
|
|
901
|
-
*
|
|
902
|
-
* - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
|
|
903
|
-
* - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
|
|
904
|
-
* - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
|
|
814
|
+
* Engine-specific internal state.
|
|
905
815
|
*/
|
|
906
|
-
|
|
816
|
+
internal: BaseFetchContextInteralState;
|
|
907
817
|
}
|
|
908
818
|
/**
|
|
909
|
-
*
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
*
|
|
819
|
+
* The full execution context for a Web Fetcher session or action batch.
|
|
820
|
+
*
|
|
821
|
+
* @remarks
|
|
822
|
+
* This object is the central state container for the fetch operation. It provides
|
|
823
|
+
* access to configuration, the event bus, shared outputs, and the execution engine.
|
|
824
|
+
* It is passed to every action during execution.
|
|
914
825
|
*/
|
|
915
|
-
interface
|
|
826
|
+
interface FetchContext extends FetchEngineContext {
|
|
916
827
|
/**
|
|
917
|
-
*
|
|
918
|
-
* @default 'string'
|
|
828
|
+
* Metadata about the action currently being executed.
|
|
919
829
|
*/
|
|
920
|
-
|
|
830
|
+
currentAction?: FetchActionInContext;
|
|
921
831
|
/**
|
|
922
|
-
*
|
|
923
|
-
*
|
|
924
|
-
* - 'innerText': Uses rendered text (respects CSS line breaks).
|
|
925
|
-
* - 'html': Returns innerHTML.
|
|
926
|
-
* - 'outerHTML': Returns HTML including the element's tag.
|
|
832
|
+
* A shared key-value store for storing data extracted from pages or
|
|
833
|
+
* metadata generated during action execution.
|
|
927
834
|
*/
|
|
928
|
-
|
|
835
|
+
outputs: Record<string, any>;
|
|
929
836
|
/**
|
|
930
|
-
*
|
|
837
|
+
* Executes a FetchAction within the current context.
|
|
838
|
+
*
|
|
839
|
+
* @param actionOptions - Configuration for the action to be executed.
|
|
840
|
+
* @returns A promise that resolves to the action's result.
|
|
931
841
|
*/
|
|
932
|
-
|
|
842
|
+
execute<R extends FetchReturnType = 'any'>(actionOptions: FetchActionOptions): Promise<FetchActionResult<R>>;
|
|
933
843
|
/**
|
|
934
|
-
*
|
|
935
|
-
*
|
|
844
|
+
* Convenience method to execute an action by its registered name or ID.
|
|
845
|
+
*
|
|
846
|
+
* @param name - The registered name or ID of the action.
|
|
847
|
+
* @param params - Parameters specific to the action type.
|
|
848
|
+
* @param options - Additional execution options (e.g., storeAs, failOnError).
|
|
849
|
+
* @returns A promise that resolves to a result.
|
|
936
850
|
*/
|
|
937
|
-
|
|
851
|
+
action<R extends FetchReturnType = 'any'>(name: string, params?: any, options?: Partial<FetchActionOptions>): Promise<FetchActionResult<R>>;
|
|
938
852
|
/**
|
|
939
|
-
*
|
|
853
|
+
* Internal state for engine and lifecycle management.
|
|
940
854
|
*/
|
|
941
|
-
|
|
855
|
+
internal: FetchContextInteralState;
|
|
942
856
|
/**
|
|
943
|
-
*
|
|
857
|
+
* The central event bus for publishing and subscribing to session and action events.
|
|
944
858
|
*/
|
|
945
|
-
|
|
859
|
+
eventBus: EventEmitter;
|
|
946
860
|
}
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
/**
|
|
957
|
-
* Whether to enable strict mode for this specific array mode.
|
|
958
|
-
* @default false
|
|
959
|
-
*/
|
|
960
|
-
strict?: boolean;
|
|
961
|
-
}
|
|
962
|
-
/**
|
|
963
|
-
* Options for columnar (column-alignment) extraction.
|
|
964
|
-
*/
|
|
965
|
-
interface ColumnarOptions extends BaseModeOptions {
|
|
966
|
-
type: 'columnar';
|
|
967
|
-
/**
|
|
968
|
-
* Whether to enable heuristic inference.
|
|
969
|
-
* If true, tries to find a common parent to infer item wrappers when counts mismatch.
|
|
970
|
-
* @default false
|
|
971
|
-
*/
|
|
972
|
-
inference?: boolean;
|
|
861
|
+
|
|
862
|
+
type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
|
|
863
|
+
interface FetchReturnTypeRegistry {
|
|
864
|
+
response: FetchResponse;
|
|
865
|
+
context: FetchContext;
|
|
866
|
+
result: FetchActionResult<any> | undefined;
|
|
867
|
+
outputs: Record<string, any>;
|
|
868
|
+
any: any;
|
|
869
|
+
none: void;
|
|
973
870
|
}
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
interface SegmentedOptions extends BaseModeOptions {
|
|
978
|
-
type: 'segmented';
|
|
871
|
+
type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
|
|
872
|
+
|
|
873
|
+
declare enum FetchActionResultStatus {
|
|
979
874
|
/**
|
|
980
|
-
*
|
|
981
|
-
* Defaults to the first property key's selector defined in `items`.
|
|
875
|
+
* The action failed but did not throw (usually because failOnError=false); the error information is in the error field
|
|
982
876
|
*/
|
|
983
|
-
|
|
877
|
+
Failed = 0,
|
|
984
878
|
/**
|
|
985
|
-
*
|
|
986
|
-
* - 'anchor': (Default) All fields are searched within the entire segment.
|
|
987
|
-
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
879
|
+
* The action completed as expected (even if warnings were produced)
|
|
988
880
|
*/
|
|
989
|
-
|
|
881
|
+
Success = 1,
|
|
990
882
|
/**
|
|
991
|
-
*
|
|
992
|
-
*
|
|
883
|
+
* The action was determined not to run / was degraded to a noop (e.g. the engine does not support it and degradeTo='noop')
|
|
884
|
+
* When the capability is not supported and degradeTo='noop': status='skipped', and { code:'capability-not-supported' } is added to warnings
|
|
993
885
|
*/
|
|
994
|
-
|
|
886
|
+
Skipped = 2
|
|
995
887
|
}
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
888
|
+
type FetchActionCapabilityMode = 'native' | 'simulate' | 'noop';
|
|
889
|
+
interface FetchActionMeta {
|
|
890
|
+
id: string;
|
|
891
|
+
index?: number;
|
|
892
|
+
engineType?: FetchEngineType;
|
|
893
|
+
capability?: FetchActionCapabilityMode;
|
|
894
|
+
response?: FetchResponse;
|
|
895
|
+
timings?: {
|
|
896
|
+
start: number;
|
|
897
|
+
total: number;
|
|
898
|
+
};
|
|
899
|
+
retries?: number;
|
|
900
|
+
}
|
|
901
|
+
interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
|
|
902
|
+
status: FetchActionResultStatus;
|
|
903
|
+
returnType?: R;
|
|
904
|
+
result?: FetchReturnTypeFor<R>;
|
|
905
|
+
error?: Error;
|
|
906
|
+
meta?: FetchActionMeta;
|
|
907
|
+
}
|
|
908
|
+
interface BaseFetchActionProperties {
|
|
909
|
+
id?: string;
|
|
910
|
+
name?: string;
|
|
911
|
+
action?: string | any;
|
|
912
|
+
index?: number;
|
|
913
|
+
params?: any;
|
|
914
|
+
args?: any;
|
|
915
|
+
storeAs?: string;
|
|
916
|
+
failOnError?: boolean;
|
|
917
|
+
failOnTimeout?: boolean;
|
|
918
|
+
timeoutMs?: number;
|
|
919
|
+
maxRetries?: number;
|
|
920
|
+
[key: string]: any;
|
|
921
|
+
}
|
|
922
|
+
type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
|
|
923
|
+
interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
|
|
924
|
+
activateOn?: string | RegExp | Array<string | RegExp>;
|
|
925
|
+
deactivateOn?: string | RegExp | Array<string | RegExp>;
|
|
926
|
+
collectOn?: string | RegExp | Array<string | RegExp>;
|
|
927
|
+
background?: boolean;
|
|
928
|
+
}
|
|
929
|
+
type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
|
|
930
|
+
interface FetchActionProperties extends BaseFetchActionProperties {
|
|
931
|
+
collectors?: BaseFetchCollectorOptions[];
|
|
932
|
+
}
|
|
933
|
+
type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
|
|
934
|
+
declare class EngineUpgradeError extends Error {
|
|
935
|
+
res: FetchResponse;
|
|
936
|
+
code: string;
|
|
937
|
+
constructor(res: FetchResponse);
|
|
1033
938
|
}
|
|
939
|
+
type FetchEngineType = 'http' | 'browser';
|
|
940
|
+
type BrowserEngine = 'playwright' | 'puppeteer';
|
|
941
|
+
type FetchEngineMode = FetchEngineType | 'auto' | string;
|
|
942
|
+
type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
|
|
1034
943
|
/**
|
|
1035
|
-
*
|
|
944
|
+
* Storage configuration options for the fetch engine.
|
|
945
|
+
*
|
|
946
|
+
* @remarks
|
|
947
|
+
* Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
|
|
1036
948
|
*/
|
|
1037
|
-
interface
|
|
1038
|
-
type: 'object';
|
|
949
|
+
interface StorageOptions {
|
|
1039
950
|
/**
|
|
1040
|
-
*
|
|
951
|
+
* Custom identifier for the storage.
|
|
952
|
+
* If provided, multiple sessions can share the same storage by using the same ID.
|
|
953
|
+
* If not provided, a unique session ID is used (strong isolation).
|
|
1041
954
|
*/
|
|
1042
|
-
|
|
955
|
+
id?: string;
|
|
1043
956
|
/**
|
|
1044
|
-
*
|
|
957
|
+
* Whether to persist storage to disk.
|
|
958
|
+
* If true, uses Crawlee's disk persistence. If false, data might be stored in memory or in a temporary directory.
|
|
959
|
+
* Corresponds to Crawlee's `persistStorage` configuration.
|
|
1045
960
|
*/
|
|
1046
|
-
|
|
961
|
+
persist?: boolean;
|
|
1047
962
|
/**
|
|
1048
|
-
*
|
|
963
|
+
* Whether to delete the storage (RequestQueue and KeyValueStore) when the session is closed.
|
|
964
|
+
* Defaults to true. Set to false if you want to keep data for future reuse with the same `id`.
|
|
1049
965
|
*/
|
|
1050
|
-
|
|
966
|
+
purge?: boolean;
|
|
1051
967
|
/**
|
|
1052
|
-
*
|
|
1053
|
-
*
|
|
1054
|
-
* - 'anchor': (Default) All fields are searched within the entire scope.
|
|
1055
|
-
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
968
|
+
* Additional Crawlee configuration options.
|
|
969
|
+
* Allows fine-grained control over the underlying Crawlee instance.
|
|
1056
970
|
*/
|
|
1057
|
-
|
|
971
|
+
config?: Record<string, any>;
|
|
972
|
+
}
|
|
973
|
+
interface BaseFetcherProperties {
|
|
1058
974
|
/**
|
|
1059
|
-
*
|
|
1060
|
-
*
|
|
975
|
+
* Fetch mode
|
|
976
|
+
*
|
|
977
|
+
* - `http`: fetch using HTTP
|
|
978
|
+
* - `browser`: fetch using a browser
|
|
979
|
+
* - `auto`: uses "smart detection" to choose between http and browser; however, if smart is not enabled and the site is not in the site registry, it is equivalent to http.
|
|
1061
980
|
*/
|
|
1062
|
-
|
|
981
|
+
engine?: FetchEngineMode;
|
|
982
|
+
enableSmart?: boolean;
|
|
983
|
+
syncStateOnUpgrade?: boolean;
|
|
984
|
+
upgradeThresholdMs?: number;
|
|
985
|
+
useSiteRegistry?: boolean;
|
|
986
|
+
antibot?: boolean;
|
|
987
|
+
debug?: boolean | string | string[];
|
|
988
|
+
headers?: Record<string, string>;
|
|
989
|
+
cookies?: Cookie[];
|
|
990
|
+
sessionState?: any;
|
|
991
|
+
sessionPoolOptions?: SessionPoolOptions;
|
|
992
|
+
overrideSessionState?: boolean;
|
|
993
|
+
throwHttpErrors?: boolean;
|
|
994
|
+
output?: {
|
|
995
|
+
cookies?: boolean;
|
|
996
|
+
sessionState?: boolean;
|
|
997
|
+
};
|
|
998
|
+
proxy?: string | string[];
|
|
999
|
+
blockResources?: ResourceType[];
|
|
1063
1000
|
/**
|
|
1064
|
-
*
|
|
1001
|
+
* Storage configuration for session isolation and persistence.
|
|
1065
1002
|
*/
|
|
1066
|
-
|
|
1067
|
-
|
|
1003
|
+
storage?: StorageOptions;
|
|
1004
|
+
ignoreSslErrors?: boolean;
|
|
1005
|
+
browser?: {
|
|
1006
|
+
/**
|
|
1007
|
+
* Browser engine; defaults to playwright
|
|
1008
|
+
*
|
|
1009
|
+
* - `playwright`: use the Playwright engine
|
|
1010
|
+
* - `puppeteer`: use the Puppeteer engine
|
|
1011
|
+
*/
|
|
1012
|
+
engine?: BrowserEngine;
|
|
1013
|
+
headless?: boolean;
|
|
1014
|
+
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
1015
|
+
launchOptions?: Record<string, any>;
|
|
1016
|
+
};
|
|
1017
|
+
http?: {
|
|
1018
|
+
method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
|
|
1019
|
+
body?: any;
|
|
1068
1020
|
};
|
|
1069
|
-
}
|
|
1070
|
-
|
|
1071
|
-
interface PromiseLock extends Promise<void> {
|
|
1072
|
-
release: () => void;
|
|
1073
|
-
}
|
|
1074
|
-
|
|
1075
|
-
/**
|
|
1076
|
-
* Options for the {@link FetchEngine.goto}, allowing configuration of HTTP method, payload, headers, and navigation behavior.
|
|
1077
|
-
*
|
|
1078
|
-
* @remarks
|
|
1079
|
-
* Used when navigating to a URL to specify additional parameters beyond the basic URL.
|
|
1080
|
-
*
|
|
1081
|
-
* @example
|
|
1082
|
-
* ```ts
|
|
1083
|
-
* await engine.goto('https://example.com', {
|
|
1084
|
-
* method: 'POST',
|
|
1085
|
-
* payload: { username: 'user', password: 'pass' },
|
|
1086
|
-
* headers: { 'Content-Type': 'application/json' },
|
|
1087
|
-
* waitUntil: 'networkidle'
|
|
1088
|
-
* });
|
|
1089
|
-
* ```
|
|
1090
|
-
*/
|
|
1091
|
-
interface GotoActionOptions {
|
|
1092
|
-
method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
|
|
1093
|
-
payload?: any;
|
|
1094
|
-
headers?: Record<string, string>;
|
|
1095
|
-
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
1096
1021
|
timeoutMs?: number;
|
|
1097
|
-
|
|
1022
|
+
requestHandlerTimeoutSecs?: number;
|
|
1023
|
+
maxConcurrency?: number;
|
|
1024
|
+
maxRequestsPerMinute?: number;
|
|
1025
|
+
delayBetweenRequestsMs?: number;
|
|
1026
|
+
retries?: number;
|
|
1027
|
+
sites?: FetchSite[];
|
|
1028
|
+
url?: string;
|
|
1098
1029
|
}
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1030
|
+
interface FetchSite extends BaseFetcherProperties {
|
|
1031
|
+
domain: string;
|
|
1032
|
+
pathScope?: string[];
|
|
1033
|
+
meta?: {
|
|
1034
|
+
updatedAt?: number;
|
|
1035
|
+
ttlMs?: number;
|
|
1036
|
+
source?: 'manual' | 'smart';
|
|
1037
|
+
};
|
|
1038
|
+
}
|
|
1039
|
+
type OnFetchPauseCallback = (options: {
|
|
1040
|
+
message?: string;
|
|
1041
|
+
}) => Promise<void>;
|
|
1042
|
+
interface FetcherOptions extends BaseFetcherProperties {
|
|
1043
|
+
actions?: FetchActionOptions[];
|
|
1044
|
+
onPause?: OnFetchPauseCallback;
|
|
1045
|
+
}
|
|
1046
|
+
interface FetchMetadata {
|
|
1047
|
+
mode: FetchEngineType;
|
|
1048
|
+
engine?: BrowserEngine;
|
|
1049
|
+
timings?: {
|
|
1050
|
+
start: number;
|
|
1051
|
+
total: number;
|
|
1052
|
+
ttfb?: number;
|
|
1053
|
+
dns?: number;
|
|
1054
|
+
tcp?: number;
|
|
1055
|
+
firstByte?: number;
|
|
1056
|
+
download?: number;
|
|
1057
|
+
};
|
|
1058
|
+
proxy?: string;
|
|
1059
|
+
[key: string]: any;
|
|
1060
|
+
}
|
|
1061
|
+
interface FetchResponse {
|
|
1062
|
+
url: string;
|
|
1063
|
+
finalUrl: string;
|
|
1064
|
+
statusCode?: number;
|
|
1065
|
+
statusText?: string;
|
|
1066
|
+
headers: Record<string, string>;
|
|
1067
|
+
contentType?: string;
|
|
1068
|
+
body?: string | Buffer<ArrayBufferLike>;
|
|
1069
|
+
html?: string;
|
|
1070
|
+
text?: string;
|
|
1071
|
+
json?: any;
|
|
1072
|
+
cookies?: Cookie[];
|
|
1073
|
+
sessionState?: any;
|
|
1074
|
+
metadata?: FetchMetadata;
|
|
1110
1075
|
}
|
|
1076
|
+
declare const DefaultFetcherProperties: BaseFetcherProperties;
|
|
1077
|
+
declare const FetcherOptionKeys: string[];
|
|
1078
|
+
|
|
1111
1079
|
/**
|
|
1112
|
-
*
|
|
1080
|
+
* Represents a stateful web fetching session.
|
|
1113
1081
|
*
|
|
1114
1082
|
* @remarks
|
|
1115
|
-
*
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
/**
|
|
1121
|
-
* Predefined cleanup groups for the {@link FetchEngine.trim} action.
|
|
1122
|
-
*/
|
|
1123
|
-
type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
|
|
1124
|
-
/**
|
|
1125
|
-
* Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
|
|
1083
|
+
* A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
|
|
1084
|
+
* cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
|
|
1085
|
+
* session-level configurations and outputs.
|
|
1086
|
+
*
|
|
1087
|
+
* Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
|
|
1126
1088
|
*/
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1089
|
+
declare class FetchSession {
|
|
1090
|
+
protected options: FetcherOptions;
|
|
1091
|
+
/**
|
|
1092
|
+
* Unique identifier for the session.
|
|
1093
|
+
*/
|
|
1094
|
+
readonly id: string;
|
|
1095
|
+
/**
|
|
1096
|
+
* The execution context for this session, containing configurations, event bus, and shared state.
|
|
1097
|
+
*/
|
|
1098
|
+
readonly context: FetchContext;
|
|
1099
|
+
protected closed: boolean;
|
|
1100
|
+
/**
|
|
1101
|
+
* Creates a new FetchSession.
|
|
1102
|
+
*
|
|
1103
|
+
* @param options - Configuration options for the fetcher.
|
|
1104
|
+
*/
|
|
1105
|
+
constructor(options?: FetcherOptions);
|
|
1106
|
+
protected _logDebug(category: string, ...args: any[]): void;
|
|
1107
|
+
/**
|
|
1108
|
+
* Executes a single action within the session.
|
|
1109
|
+
*
|
|
1110
|
+
* @param actionOptions - Configuration for the action to be executed.
|
|
1111
|
+
* @param context - Optional context override for this specific execution. Defaults to the session context.
|
|
1112
|
+
* @returns A promise that resolves to the result of the action.
|
|
1113
|
+
* @template R - The expected return type of the action.
|
|
1114
|
+
*
|
|
1115
|
+
* @example
|
|
1116
|
+
* ```ts
|
|
1117
|
+
* await session.execute({ name: 'goto', params: { url: 'https://example.com' } });
|
|
1118
|
+
* ```
|
|
1119
|
+
*/
|
|
1120
|
+
execute<R extends FetchReturnType = 'response'>(actionOptions: FetchActionOptions, context?: FetchContext): Promise<FetchActionResult<R>>;
|
|
1121
|
+
/**
|
|
1122
|
+
* Executes a sequence of actions.
|
|
1123
|
+
*
|
|
1124
|
+
* @param actions - An array of action options to be executed in order.
|
|
1125
|
+
* @param options - Optional temporary configuration overrides (e.g., timeoutMs, headers) for this batch of actions.
|
|
1126
|
+
* These overrides do not affect the main session context.
|
|
1127
|
+
* @returns A promise that resolves to an object containing the result of the last action and all accumulated outputs.
|
|
1128
|
+
*
|
|
1129
|
+
* @example
|
|
1130
|
+
* ```ts
|
|
1131
|
+
* const { result, outputs } = await session.executeAll([
|
|
1132
|
+
* { name: 'goto', params: { url: 'https://example.com' } },
|
|
1133
|
+
* { name: 'extract', params: { schema: { title: 'h1' } }, storeAs: 'data' }
|
|
1134
|
+
* ], { timeoutMs: 30000 });
|
|
1135
|
+
* ```
|
|
1136
|
+
*/
|
|
1137
|
+
executeAll(actions: FetchActionOptions[], options?: Partial<FetcherOptions> & {
|
|
1138
|
+
index?: number;
|
|
1139
|
+
}): Promise<{
|
|
1140
|
+
result: FetchResponse | undefined;
|
|
1141
|
+
outputs: Record<string, any>;
|
|
1142
|
+
}>;
|
|
1143
|
+
/**
|
|
1144
|
+
* Retrieves all outputs accumulated during the session.
|
|
1145
|
+
*
|
|
1146
|
+
* @returns A record of stored output data.
|
|
1147
|
+
*/
|
|
1148
|
+
getOutputs(): Record<string, any>;
|
|
1149
|
+
/**
|
|
1150
|
+
* Gets the current state of the session, including cookies and engine-specific state.
|
|
1151
|
+
*
|
|
1152
|
+
* @returns A promise resolving to the session state, or undefined if no engine is initialized.
|
|
1153
|
+
*/
|
|
1154
|
+
getState(): Promise<{
|
|
1155
|
+
cookies: Cookie[];
|
|
1156
|
+
sessionState?: any;
|
|
1157
|
+
} | undefined>;
|
|
1158
|
+
/**
|
|
1159
|
+
* Disposes of the session and its associated engine.
|
|
1160
|
+
*
|
|
1161
|
+
* @remarks
|
|
1162
|
+
* This method should be called when the session is no longer needed to free up resources
|
|
1163
|
+
* (e.g., closing browser instances, purging temporary storage).
|
|
1164
|
+
*/
|
|
1165
|
+
dispose(): Promise<void>;
|
|
1166
|
+
private ensureEngine;
|
|
1167
|
+
protected createContext(options?: FetcherOptions): FetchContext;
|
|
1130
1168
|
}
|
|
1131
|
-
|
|
1169
|
+
|
|
1132
1170
|
/**
|
|
1133
|
-
*
|
|
1171
|
+
* High-level entry point for the Web Fetcher library.
|
|
1134
1172
|
*
|
|
1135
1173
|
* @remarks
|
|
1136
|
-
*
|
|
1137
|
-
*
|
|
1138
|
-
* **Execution Environments:**
|
|
1139
|
-
* - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
|
|
1140
|
-
* - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
|
|
1141
|
-
* including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
|
|
1142
|
-
*
|
|
1143
|
-
* **Navigation Handling:**
|
|
1144
|
-
* If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
|
|
1145
|
-
* automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
|
|
1146
|
-
*
|
|
1147
|
-
* @example
|
|
1148
|
-
* ```json
|
|
1149
|
-
* {
|
|
1150
|
-
* "action": "evaluate",
|
|
1151
|
-
* "params": {
|
|
1152
|
-
* "fn": "([a, b]) => a + b",
|
|
1153
|
-
* "args": [1, 2]
|
|
1154
|
-
* }
|
|
1155
|
-
* }
|
|
1156
|
-
* ```
|
|
1174
|
+
* The `WebFetcher` provides a simplified API for fetching web content without manually managing sessions.
|
|
1175
|
+
* It can be used for one-off requests or as a factory for more complex `FetchSession` instances.
|
|
1157
1176
|
*
|
|
1158
1177
|
* @example
|
|
1159
|
-
* ```
|
|
1160
|
-
*
|
|
1161
|
-
*
|
|
1162
|
-
* "params": {
|
|
1163
|
-
* "fn": "({ x, y }) => x * y",
|
|
1164
|
-
* "args": { "x": 6, "y": 7 }
|
|
1165
|
-
* }
|
|
1166
|
-
* }
|
|
1178
|
+
* ```ts
|
|
1179
|
+
* const fetcher = new WebFetcher();
|
|
1180
|
+
* const { result } = await fetcher.fetch('https://example.com');
|
|
1167
1181
|
* ```
|
|
1168
1182
|
*/
|
|
1169
|
-
|
|
1183
|
+
declare class WebFetcher {
|
|
1184
|
+
private defaults;
|
|
1170
1185
|
/**
|
|
1171
|
-
*
|
|
1186
|
+
* Creates a new WebFetcher with default options.
|
|
1172
1187
|
*
|
|
1173
|
-
* @
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
*
|
|
1188
|
+
* @param defaults - Default configuration options applied to all sessions and requests.
|
|
1189
|
+
*/
|
|
1190
|
+
constructor(defaults?: FetcherOptions);
|
|
1191
|
+
/**
|
|
1192
|
+
* Creates a new FetchSession.
|
|
1178
1193
|
*
|
|
1179
|
-
*
|
|
1180
|
-
*
|
|
1194
|
+
* @param options - Configuration options for the session, merged with defaults.
|
|
1195
|
+
* @returns A promise resolving to a new FetchSession instance.
|
|
1181
1196
|
*/
|
|
1182
|
-
|
|
1197
|
+
createSession(options?: FetcherOptions): Promise<FetchSession>;
|
|
1183
1198
|
/**
|
|
1184
|
-
*
|
|
1199
|
+
* Fetches content from a URL or executes a complex action script.
|
|
1185
1200
|
*
|
|
1186
1201
|
* @remarks
|
|
1187
|
-
* This
|
|
1188
|
-
*
|
|
1202
|
+
* This method automatically creates a session, executes the specified actions,
|
|
1203
|
+
* retrieves the content, and disposes of the session.
|
|
1204
|
+
*
|
|
1205
|
+
* @param url - The target URL or a complete FetcherOptions object.
|
|
1206
|
+
* @param options - Additional options when the first parameter is a URL string.
|
|
1207
|
+
* @returns A promise resolving to the final response and any extracted outputs.
|
|
1189
1208
|
*/
|
|
1190
|
-
|
|
1209
|
+
fetch(url: string, options?: FetcherOptions): Promise<{
|
|
1210
|
+
result: FetchResponse | undefined;
|
|
1211
|
+
outputs: Record<string, any>;
|
|
1212
|
+
}>;
|
|
1213
|
+
fetch(options: FetcherOptions): Promise<{
|
|
1214
|
+
result: FetchResponse | undefined;
|
|
1215
|
+
outputs: Record<string, any>;
|
|
1216
|
+
}>;
|
|
1191
1217
|
}
|
|
1218
|
+
|
|
1192
1219
|
/**
|
|
1193
|
-
*
|
|
1220
|
+
* Represents the engine-specific execution scope (e.g., a Cheerio node or a Playwright Locator).
|
|
1221
|
+
* It acts as the target for extraction and interaction actions.
|
|
1222
|
+
*/
|
|
1223
|
+
type FetchElementScope = any;
|
|
1224
|
+
/**
|
|
1225
|
+
* Interface representing the minimal engine capabilities required for extraction.
|
|
1194
1226
|
*
|
|
1195
1227
|
* @remarks
|
|
1196
|
-
*
|
|
1197
|
-
*
|
|
1228
|
+
* This interface abstracts the underlying DOM manipulation library (Cheerio or Playwright).
|
|
1229
|
+
* Implementing classes must ensure consistent behavior across different engines, especially
|
|
1230
|
+
* regarding scope handling (Element vs Array of Elements) and DOM traversal.
|
|
1198
1231
|
*/
|
|
1199
|
-
|
|
1200
|
-
type: 'click';
|
|
1201
|
-
selector: string;
|
|
1202
|
-
} | {
|
|
1203
|
-
type: 'fill';
|
|
1204
|
-
selector: string;
|
|
1205
|
-
value: string;
|
|
1206
|
-
} | {
|
|
1207
|
-
type: 'mouseMove';
|
|
1208
|
-
params: {
|
|
1209
|
-
x?: number;
|
|
1210
|
-
y?: number;
|
|
1211
|
-
selector?: string;
|
|
1212
|
-
steps?: number;
|
|
1213
|
-
};
|
|
1214
|
-
} | {
|
|
1215
|
-
type: 'mouseClick';
|
|
1216
|
-
params: {
|
|
1217
|
-
x?: number;
|
|
1218
|
-
y?: number;
|
|
1219
|
-
button?: 'left' | 'right' | 'middle';
|
|
1220
|
-
clickCount?: number;
|
|
1221
|
-
delay?: number;
|
|
1222
|
-
steps?: number;
|
|
1223
|
-
};
|
|
1224
|
-
} | {
|
|
1225
|
-
type: 'mouseWheel';
|
|
1226
|
-
params: {
|
|
1227
|
-
x?: number;
|
|
1228
|
-
y?: number;
|
|
1229
|
-
selector?: string;
|
|
1230
|
-
deltaX?: number;
|
|
1231
|
-
deltaY?: number;
|
|
1232
|
-
steps?: number;
|
|
1233
|
-
};
|
|
1234
|
-
} | {
|
|
1235
|
-
type: 'keyboardType';
|
|
1236
|
-
params: {
|
|
1237
|
-
text: string;
|
|
1238
|
-
delay?: number;
|
|
1239
|
-
};
|
|
1240
|
-
} | {
|
|
1241
|
-
type: 'keyboardPress';
|
|
1242
|
-
params: {
|
|
1243
|
-
key: string;
|
|
1244
|
-
delay?: number;
|
|
1245
|
-
};
|
|
1246
|
-
} | {
|
|
1247
|
-
type: 'scrollIntoView';
|
|
1248
|
-
params: {
|
|
1249
|
-
selector: string;
|
|
1250
|
-
};
|
|
1251
|
-
} | {
|
|
1252
|
-
type: 'waitFor';
|
|
1253
|
-
options?: WaitForActionOptions;
|
|
1254
|
-
} | {
|
|
1255
|
-
type: 'submit';
|
|
1256
|
-
selector?: any;
|
|
1257
|
-
options?: SubmitActionOptions;
|
|
1258
|
-
} | {
|
|
1259
|
-
type: 'getContent';
|
|
1260
|
-
} | {
|
|
1261
|
-
type: 'navigate';
|
|
1262
|
-
url: string;
|
|
1263
|
-
opts?: GotoActionOptions;
|
|
1264
|
-
} | {
|
|
1265
|
-
type: 'extract';
|
|
1266
|
-
schema: ExtractSchema;
|
|
1267
|
-
} | {
|
|
1268
|
-
type: 'pause';
|
|
1269
|
-
message?: string;
|
|
1270
|
-
} | {
|
|
1271
|
-
type: 'trim';
|
|
1272
|
-
options: TrimActionOptions;
|
|
1273
|
-
} | {
|
|
1274
|
-
type: 'evaluate';
|
|
1275
|
-
params: EvaluateActionOptions;
|
|
1276
|
-
} | {
|
|
1277
|
-
type: 'dispose';
|
|
1278
|
-
};
|
|
1279
|
-
/**
|
|
1280
|
-
* Represents an action that has been dispatched and is awaiting execution in the active page context.
|
|
1281
|
-
*
|
|
1282
|
-
* @remarks
|
|
1283
|
-
* Connects the action request with its resolution mechanism. Used internally by the action dispatch system
|
|
1284
|
-
* to handle promises while maintaining the page context validity window.
|
|
1285
|
-
*/
|
|
1286
|
-
interface DispatchedEngineAction {
|
|
1287
|
-
action: FetchEngineAction;
|
|
1288
|
-
resolve: (value?: any) => void;
|
|
1289
|
-
reject: (reason?: any) => void;
|
|
1290
|
-
}
|
|
1291
|
-
/**
|
|
1292
|
-
* Represents a pending navigation request awaiting resolution.
|
|
1293
|
-
*
|
|
1294
|
-
* @remarks
|
|
1295
|
-
* Tracks navigation requests that have been queued but not yet processed by the request handler.
|
|
1296
|
-
*/
|
|
1297
|
-
interface PendingEngineRequest {
|
|
1298
|
-
resolve: (value: any) => void;
|
|
1299
|
-
reject: (reason?: any) => void;
|
|
1300
|
-
}
|
|
1301
|
-
/**
|
|
1302
|
-
* Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
|
|
1303
|
-
*
|
|
1304
|
-
* @remarks
|
|
1305
|
-
* The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
|
|
1306
|
-
* `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
|
|
1307
|
-
* content retrieval, and user interaction.
|
|
1308
|
-
*
|
|
1309
|
-
* The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
|
|
1310
|
-
* the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
|
|
1311
|
-
* page context validity across asynchronous operations.
|
|
1312
|
-
*
|
|
1313
|
-
* @example
|
|
1314
|
-
* ```ts
|
|
1315
|
-
* import "./playwright"; // 引入注册 Playwright browser 引擎
|
|
1316
|
-
* const engine = await FetchEngine.create(context, { engine: 'browser' });
|
|
1317
|
-
* await engine.goto('https://example.com');
|
|
1318
|
-
* await engine.fill('#username', 'user');
|
|
1319
|
-
* await engine.click('#submit');
|
|
1320
|
-
* const response = await engine.getContent();
|
|
1321
|
-
* ```
|
|
1322
|
-
*/
|
|
1323
|
-
type AnyFetchEngine = FetchEngine<any, any, any>;
|
|
1324
|
-
type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
|
|
1325
|
-
declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
|
|
1326
|
-
private static registry;
|
|
1232
|
+
interface IExtractEngine {
|
|
1327
1233
|
/**
|
|
1328
|
-
*
|
|
1234
|
+
* Finds all elements matching the selector within the given scope.
|
|
1329
1235
|
*
|
|
1330
|
-
* @param
|
|
1331
|
-
* @
|
|
1236
|
+
* @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
|
|
1237
|
+
* @param selector - The CSS selector to match.
|
|
1238
|
+
* @returns A promise resolving to an array of found element scopes.
|
|
1332
1239
|
*
|
|
1333
|
-
* @
|
|
1334
|
-
*
|
|
1335
|
-
*
|
|
1336
|
-
*
|
|
1240
|
+
* @remarks
|
|
1241
|
+
* **Behavior Contract:**
|
|
1242
|
+
* 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
|
|
1243
|
+
* 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
|
|
1244
|
+
* 3. **Array Scope**: If `scope` is an array:
|
|
1245
|
+
* - It MUST process elements in the order they appear in the array (which should match document order).
|
|
1246
|
+
* - It MUST perform the check (Self + Descendants) for *each* element in the array.
|
|
1247
|
+
* - It MUST flatten the results into a single array.
|
|
1248
|
+
* - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
|
|
1249
|
+
* but generally, preserving document order is the priority.
|
|
1337
1250
|
*/
|
|
1338
|
-
|
|
1251
|
+
_querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
1339
1252
|
/**
|
|
1340
|
-
*
|
|
1253
|
+
* Extracts a primitive value from the element based on the schema configuration.
|
|
1341
1254
|
*
|
|
1342
|
-
* @param
|
|
1343
|
-
* @
|
|
1255
|
+
* @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
|
|
1256
|
+
* @param scope - The specific element to extract data from.
|
|
1257
|
+
* @returns A promise resolving to the extracted value (string, number, boolean, or null).
|
|
1258
|
+
*
|
|
1259
|
+
* @remarks
|
|
1260
|
+
* **Behavior Contract:**
|
|
1261
|
+
* - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
|
|
1262
|
+
* - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
|
|
1263
|
+
* - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
|
|
1264
|
+
* - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
|
|
1265
|
+
* - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
|
|
1344
1266
|
*/
|
|
1345
|
-
|
|
1267
|
+
_extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
|
|
1346
1268
|
/**
|
|
1347
|
-
*
|
|
1269
|
+
* Gets the parent element of the given scope.
|
|
1348
1270
|
*
|
|
1349
|
-
* @param
|
|
1350
|
-
* @returns
|
|
1271
|
+
* @param scope - The element to find the parent of.
|
|
1272
|
+
* @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
|
|
1351
1273
|
*/
|
|
1352
|
-
|
|
1274
|
+
_parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1353
1275
|
/**
|
|
1354
|
-
*
|
|
1276
|
+
* Checks if two element scopes refer to the exact same DOM node.
|
|
1355
1277
|
*
|
|
1356
|
-
* @param
|
|
1357
|
-
* @param
|
|
1358
|
-
* @returns
|
|
1359
|
-
* @throws {Error} When no suitable engine implementation is found
|
|
1278
|
+
* @param scope1 - The first element scope.
|
|
1279
|
+
* @param scope2 - The second element scope.
|
|
1280
|
+
* @returns A promise resolving to `true` if they are the same node, `false` otherwise.
|
|
1360
1281
|
*
|
|
1361
1282
|
* @remarks
|
|
1362
|
-
*
|
|
1283
|
+
* This comparison MUST be identity-based, not just content-based.
|
|
1363
1284
|
*/
|
|
1364
|
-
|
|
1285
|
+
_isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
|
|
1365
1286
|
/**
|
|
1366
|
-
*
|
|
1287
|
+
* Retrieves all subsequent sibling elements of the `scope` element, stopping *before* the first sibling that matches `untilSelector`.
|
|
1288
|
+
*
|
|
1289
|
+
* @param scope - The anchor element (starting point). The returned list starts *after* this element.
|
|
1290
|
+
* @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
|
|
1291
|
+
* If omitted or null, returns all following siblings.
|
|
1292
|
+
* @returns A promise resolving to an array of sibling element scopes.
|
|
1367
1293
|
*
|
|
1368
1294
|
* @remarks
|
|
1369
|
-
*
|
|
1295
|
+
* **Behavior Contract:**
|
|
1296
|
+
* - **Starting Point**: The `scope` element itself IS NOT included in the result.
|
|
1297
|
+
* - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
|
|
1298
|
+
* - **Direction**: Only scans *following* siblings (next siblings).
|
|
1299
|
+
* - **Flattening**: The result is a flat list of siblings, not a nested structure.
|
|
1370
1300
|
*/
|
|
1371
|
-
|
|
1301
|
+
_nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
1372
1302
|
/**
|
|
1373
|
-
*
|
|
1303
|
+
* Finds the closest ancestor of the `scope` element (including the element itself) that is present in the `candidates` array.
|
|
1304
|
+
*
|
|
1305
|
+
* @param scope - The starting element from which to ascend the DOM tree.
|
|
1306
|
+
* @param candidates - An array of potential ancestor elements to check against.
|
|
1307
|
+
* @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
|
|
1374
1308
|
*
|
|
1375
1309
|
* @remarks
|
|
1376
|
-
*
|
|
1377
|
-
|
|
1378
|
-
static readonly mode: FetchEngineType;
|
|
1379
|
-
protected ctx?: FetchEngineContext;
|
|
1380
|
-
protected opts?: BaseFetcherProperties;
|
|
1381
|
-
protected crawler?: TCrawler;
|
|
1382
|
-
protected isCrawlerReady?: boolean;
|
|
1383
|
-
protected crawlerRunPromise?: Promise<FinalStatistics>;
|
|
1384
|
-
protected config?: Configuration;
|
|
1385
|
-
protected requestQueue?: RequestQueue;
|
|
1386
|
-
protected kvStore?: KeyValueStore;
|
|
1387
|
-
protected proxyConfiguration?: ProxyConfiguration;
|
|
1388
|
-
protected hdrs: Record<string, string>;
|
|
1389
|
-
protected _initialCookies?: Cookie[];
|
|
1390
|
-
protected _initializedSessions: Set<string>;
|
|
1391
|
-
protected currentSession?: Session;
|
|
1392
|
-
protected pendingRequests: Map<string, PendingEngineRequest>;
|
|
1393
|
-
protected requestCounter: number;
|
|
1394
|
-
protected actionEmitter: EventEmitter;
|
|
1395
|
-
protected isPageActive: boolean;
|
|
1396
|
-
protected isEngineDisposed: boolean;
|
|
1397
|
-
protected navigationLock: PromiseLock;
|
|
1398
|
-
protected activeContext?: TContext;
|
|
1399
|
-
protected isExecutingAction: boolean;
|
|
1400
|
-
protected lastResponse?: FetchResponse;
|
|
1401
|
-
protected actionQueue: DispatchedEngineAction[];
|
|
1402
|
-
protected isProcessingActionLoop: boolean;
|
|
1403
|
-
protected blockedTypes: Set<string>;
|
|
1404
|
-
_logDebug(category: string, ...args: any[]): void;
|
|
1405
|
-
protected _cleanup?(): Promise<void>;
|
|
1406
|
-
protected _getTrimInfo(options: TrimActionOptions): {
|
|
1407
|
-
selectors: string[];
|
|
1408
|
-
removeComments: boolean;
|
|
1409
|
-
removeHidden: boolean;
|
|
1410
|
-
};
|
|
1411
|
-
/**
|
|
1412
|
-
* Finds all elements matching the selector within the given scope.
|
|
1310
|
+
* **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
|
|
1311
|
+
* It effectively answers: "Which of these container candidates does my current element belong to?"
|
|
1413
1312
|
*
|
|
1414
|
-
*
|
|
1415
|
-
*
|
|
1416
|
-
*
|
|
1417
|
-
* @see {@link IExtractEngine._querySelectorAll} for behavior contract.
|
|
1418
|
-
* @internal
|
|
1313
|
+
* **Implementation Guidelines**:
|
|
1314
|
+
* - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
|
|
1315
|
+
* - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
|
|
1419
1316
|
*/
|
|
1420
|
-
|
|
1317
|
+
_findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
|
|
1421
1318
|
/**
|
|
1422
|
-
*
|
|
1319
|
+
* Checks if the `container` element contains the `element` (descendant).
|
|
1423
1320
|
*
|
|
1424
|
-
* @param
|
|
1425
|
-
* @param
|
|
1426
|
-
* @returns
|
|
1427
|
-
* @see {@link IExtractEngine._extractValue} for behavior contract.
|
|
1428
|
-
* @internal
|
|
1429
|
-
*/
|
|
1430
|
-
abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
|
|
1431
|
-
/**
|
|
1432
|
-
* Gets the parent element of the given element.
|
|
1321
|
+
* @param container - The potential ancestor element.
|
|
1322
|
+
* @param element - The potential descendant element.
|
|
1323
|
+
* @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
|
|
1433
1324
|
*
|
|
1434
|
-
* @
|
|
1435
|
-
*
|
|
1436
|
-
*
|
|
1325
|
+
* @remarks
|
|
1326
|
+
* **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
|
|
1327
|
+
*
|
|
1328
|
+
* @performance-critical Used extensively in boundary checks for Segmented extraction.
|
|
1329
|
+
* - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
|
|
1330
|
+
* - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
|
|
1437
1331
|
*/
|
|
1438
|
-
|
|
1332
|
+
_contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
|
|
1439
1333
|
/**
|
|
1440
|
-
*
|
|
1334
|
+
* Finds the Lowest Common Ancestor (LCA) of two element scopes.
|
|
1441
1335
|
*
|
|
1442
|
-
* @param scope1 -
|
|
1443
|
-
* @param scope2 -
|
|
1444
|
-
* @returns
|
|
1445
|
-
*
|
|
1336
|
+
* @param scope1 - The first element.
|
|
1337
|
+
* @param scope2 - The second element.
|
|
1338
|
+
* @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
|
|
1339
|
+
*
|
|
1340
|
+
* @remarks
|
|
1341
|
+
* This is a fundamental tree operation used to find the point where two element paths diverge.
|
|
1342
|
+
* **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
|
|
1446
1343
|
*/
|
|
1447
|
-
|
|
1344
|
+
_findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1448
1345
|
/**
|
|
1449
|
-
*
|
|
1450
|
-
* Used in 'segmented' extraction mode.
|
|
1346
|
+
* Finds the direct child of the `container` that contains the `element` (or is the `element` itself).
|
|
1451
1347
|
*
|
|
1452
|
-
* @param
|
|
1453
|
-
* @param
|
|
1454
|
-
* @returns
|
|
1455
|
-
*
|
|
1348
|
+
* @param element - The descendant element.
|
|
1349
|
+
* @param container - The ancestor container.
|
|
1350
|
+
* @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
|
|
1351
|
+
*
|
|
1352
|
+
* @remarks
|
|
1353
|
+
* This method traverses up from `element` until it finds the node whose parent is `container`.
|
|
1354
|
+
* **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
|
|
1456
1355
|
*/
|
|
1457
|
-
|
|
1356
|
+
_findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1458
1357
|
/**
|
|
1459
|
-
*
|
|
1460
|
-
*
|
|
1461
|
-
* @param
|
|
1462
|
-
* @param candidates - The array of potential ancestor scopes.
|
|
1463
|
-
* @returns A promise resolving to the matching candidate scope, or `null` if none found.
|
|
1464
|
-
* @see {@link IExtractEngine._findClosestAncestor} for implementation details.
|
|
1465
|
-
* @internal
|
|
1358
|
+
* Logs debug information if debug mode is enabled.
|
|
1359
|
+
* @param category - The category of the log message.
|
|
1360
|
+
* @param args - Arguments to log.
|
|
1466
1361
|
*/
|
|
1467
|
-
|
|
1362
|
+
_logDebug(category: string, ...args: any[]): void;
|
|
1363
|
+
}
|
|
1364
|
+
/**
|
|
1365
|
+
* Base configuration for all extraction schemas.
|
|
1366
|
+
*/
|
|
1367
|
+
interface BaseExtractSchema {
|
|
1468
1368
|
/**
|
|
1469
|
-
*
|
|
1470
|
-
*
|
|
1471
|
-
* @param container - The potential ancestor element.
|
|
1472
|
-
* @param element - The potential descendant element.
|
|
1473
|
-
* @returns A promise resolving to `true` if `container` contains `element`.
|
|
1474
|
-
* @see {@link IExtractEngine._contains} for implementation details.
|
|
1475
|
-
* @internal
|
|
1369
|
+
* Whether this field is required. If true and the value is null,
|
|
1370
|
+
* the containing object or array item will be skipped (or throw error in strict mode).
|
|
1476
1371
|
*/
|
|
1477
|
-
|
|
1372
|
+
required?: boolean;
|
|
1478
1373
|
/**
|
|
1479
|
-
*
|
|
1480
|
-
*
|
|
1481
|
-
* @param scope1 - The first element scope.
|
|
1482
|
-
* @param scope2 - The second element scope.
|
|
1483
|
-
* @returns A promise resolving to the LCA element scope, or `null` if none found.
|
|
1484
|
-
* @internal
|
|
1374
|
+
* Whether to enable strict mode for this extraction.
|
|
1375
|
+
* If true, missing required fields will throw an error instead of being skipped.
|
|
1485
1376
|
*/
|
|
1486
|
-
|
|
1377
|
+
strict?: boolean;
|
|
1487
1378
|
/**
|
|
1488
|
-
*
|
|
1379
|
+
* Specifies the starting anchor for extraction of this field.
|
|
1380
|
+
* - Field Name: Uses the DOM element of a previously extracted field as the anchor.
|
|
1381
|
+
* - CSS Selector: Re-queries the selector within the current context to find the anchor.
|
|
1489
1382
|
*
|
|
1490
|
-
*
|
|
1491
|
-
* @param container - The container element.
|
|
1492
|
-
* @returns The child element of container, or null.
|
|
1493
|
-
* @internal
|
|
1383
|
+
* Once anchored, the search scope for this field becomes the siblings following the anchor.
|
|
1494
1384
|
*/
|
|
1495
|
-
|
|
1496
|
-
protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
|
|
1385
|
+
anchor?: string;
|
|
1497
1386
|
/**
|
|
1498
|
-
*
|
|
1499
|
-
*
|
|
1500
|
-
*
|
|
1387
|
+
* The maximum number of levels to bubble up from the anchor or matched element.
|
|
1388
|
+
* - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
|
|
1389
|
+
* - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
|
|
1390
|
+
* - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
|
|
1501
1391
|
*/
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1392
|
+
depth?: number;
|
|
1393
|
+
}
|
|
1394
|
+
/**
|
|
1395
|
+
* Extraction schema types.
|
|
1396
|
+
*/
|
|
1397
|
+
type ExtractSchema = ExtractObjectSchema | ExtractArraySchema | ExtractValueSchema;
|
|
1398
|
+
/**
|
|
1399
|
+
* Configuration for extracting a single value.
|
|
1400
|
+
*/
|
|
1401
|
+
interface ExtractValueSchema extends BaseExtractSchema {
|
|
1505
1402
|
/**
|
|
1506
|
-
*
|
|
1507
|
-
* @
|
|
1508
|
-
* @param elements - The list of item elements.
|
|
1509
|
-
* @internal
|
|
1403
|
+
* The data type to cast the extracted value to.
|
|
1404
|
+
* @default 'string'
|
|
1510
1405
|
*/
|
|
1511
|
-
|
|
1512
|
-
strict?: boolean;
|
|
1513
|
-
}): Promise<any[]>;
|
|
1406
|
+
type?: 'string' | 'number' | 'boolean' | 'html';
|
|
1514
1407
|
/**
|
|
1515
|
-
*
|
|
1516
|
-
*
|
|
1517
|
-
*
|
|
1518
|
-
*
|
|
1519
|
-
*
|
|
1520
|
-
* @returns An array of extracted items, or null if requirements aren't met.
|
|
1521
|
-
* @internal
|
|
1408
|
+
* Extraction behavior mode.
|
|
1409
|
+
* - 'text': (Default) Uses textContent.
|
|
1410
|
+
* - 'innerText': Uses rendered text (respects CSS line breaks).
|
|
1411
|
+
* - 'html': Returns innerHTML.
|
|
1412
|
+
* - 'outerHTML': Returns HTML including the element's tag.
|
|
1522
1413
|
*/
|
|
1523
|
-
|
|
1414
|
+
mode?: 'text' | 'innerText' | 'html' | 'outerHTML';
|
|
1524
1415
|
/**
|
|
1525
|
-
*
|
|
1526
|
-
*
|
|
1527
|
-
* @param schema - The schema for a single item (must be an object).
|
|
1528
|
-
* @param container - The container element to scan.
|
|
1529
|
-
* @param opts - Segmented extraction options (anchor).
|
|
1530
|
-
* @returns An array of extracted items.
|
|
1531
|
-
* @internal
|
|
1416
|
+
* CSS selector to locate the element within the current context.
|
|
1532
1417
|
*/
|
|
1533
|
-
|
|
1418
|
+
selector?: string;
|
|
1534
1419
|
/**
|
|
1535
|
-
*
|
|
1536
|
-
*
|
|
1537
|
-
* @internal
|
|
1420
|
+
* Attribute name to extract (e.g., 'href', 'src').
|
|
1421
|
+
* If omitted, the text content or HTML is extracted based on `type`.
|
|
1538
1422
|
*/
|
|
1539
|
-
|
|
1423
|
+
attribute?: string;
|
|
1540
1424
|
/**
|
|
1541
|
-
*
|
|
1542
|
-
* @param ctx - The fetch engine context.
|
|
1543
|
-
* @internal
|
|
1425
|
+
* Filter elements that contain a descendant matching this CSS selector.
|
|
1544
1426
|
*/
|
|
1545
|
-
|
|
1427
|
+
has?: string;
|
|
1546
1428
|
/**
|
|
1547
|
-
*
|
|
1548
|
-
*
|
|
1549
|
-
* @param context - Crawlee crawling context
|
|
1550
|
-
* @returns Promise resolving to [FetchResponse] object
|
|
1551
|
-
*
|
|
1552
|
-
* @remarks
|
|
1553
|
-
* Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
|
|
1554
|
-
* @internal
|
|
1429
|
+
* Exclude elements matching this CSS selector.
|
|
1555
1430
|
*/
|
|
1556
|
-
|
|
1557
|
-
|
|
1431
|
+
exclude?: string;
|
|
1432
|
+
}
|
|
1433
|
+
/**
|
|
1434
|
+
* Names of the supported array extraction modes.
|
|
1435
|
+
*/
|
|
1436
|
+
type ExtractArrayModeName = 'nested' | 'columnar' | 'segmented';
|
|
1437
|
+
/**
|
|
1438
|
+
* Base options for array extraction modes.
|
|
1439
|
+
*/
|
|
1440
|
+
interface BaseModeOptions {
|
|
1441
|
+
type: ExtractArrayModeName;
|
|
1558
1442
|
/**
|
|
1559
|
-
*
|
|
1560
|
-
*
|
|
1561
|
-
* @param context - Crawlee crawling context
|
|
1562
|
-
* @param action - Action to execute
|
|
1563
|
-
* @returns Promise resolving to action result
|
|
1564
|
-
*
|
|
1565
|
-
* @remarks
|
|
1566
|
-
* Handles specific user interactions using underlying technology (Playwright/Cheerio).
|
|
1567
|
-
* @internal
|
|
1443
|
+
* Whether to enable strict mode for this specific array mode.
|
|
1444
|
+
* @default false
|
|
1568
1445
|
*/
|
|
1569
|
-
|
|
1446
|
+
strict?: boolean;
|
|
1447
|
+
}
|
|
1448
|
+
/**
|
|
1449
|
+
* Options for columnar (column-alignment) extraction.
|
|
1450
|
+
*/
|
|
1451
|
+
interface ColumnarOptions extends BaseModeOptions {
|
|
1452
|
+
type: 'columnar';
|
|
1570
1453
|
/**
|
|
1571
|
-
*
|
|
1572
|
-
*
|
|
1573
|
-
* @
|
|
1574
|
-
* @param params - Navigation options
|
|
1575
|
-
* @returns Promise resolving when navigation completes
|
|
1576
|
-
*
|
|
1577
|
-
* @example
|
|
1578
|
-
* ```ts
|
|
1579
|
-
* await engine.goto('https://example.com');
|
|
1580
|
-
* ```
|
|
1454
|
+
* Whether to enable heuristic inference.
|
|
1455
|
+
* If true, tries to find a common parent to infer item wrappers when counts mismatch.
|
|
1456
|
+
* @default false
|
|
1581
1457
|
*/
|
|
1582
|
-
|
|
1458
|
+
inference?: boolean;
|
|
1459
|
+
}
|
|
1460
|
+
/**
|
|
1461
|
+
* Options for segmented (anchor-based) extraction.
|
|
1462
|
+
*/
|
|
1463
|
+
interface SegmentedOptions extends BaseModeOptions {
|
|
1464
|
+
type: 'segmented';
|
|
1583
1465
|
/**
|
|
1584
|
-
*
|
|
1585
|
-
*
|
|
1586
|
-
* @param params - Wait conditions
|
|
1587
|
-
* @returns Promise resolving when wait condition is met
|
|
1588
|
-
*
|
|
1589
|
-
* @example
|
|
1590
|
-
* ```ts
|
|
1591
|
-
* await engine.waitFor({ ms: 1000 }); // Wait 1 second
|
|
1592
|
-
* await engine.waitFor({ selector: '#content' }); // Wait for element
|
|
1593
|
-
* ```
|
|
1466
|
+
* The name of the field in `items` to use as a segment anchor, or a direct CSS selector.
|
|
1467
|
+
* Defaults to the first property key's selector defined in `items`.
|
|
1594
1468
|
*/
|
|
1595
|
-
|
|
1469
|
+
anchor?: string;
|
|
1596
1470
|
/**
|
|
1597
|
-
*
|
|
1598
|
-
*
|
|
1599
|
-
*
|
|
1600
|
-
* @returns Promise resolving when click is processed
|
|
1601
|
-
* @throws {Error} When no active page context exists
|
|
1471
|
+
* Where to start searching for fields within each segment.
|
|
1472
|
+
* - 'anchor': (Default) All fields are searched within the entire segment.
|
|
1473
|
+
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
1602
1474
|
*/
|
|
1603
|
-
|
|
1475
|
+
relativeTo?: 'anchor' | 'previous';
|
|
1604
1476
|
/**
|
|
1605
|
-
*
|
|
1606
|
-
*
|
|
1607
|
-
* @param params - Move parameters (x, y, selector, steps)
|
|
1608
|
-
*/
|
|
1609
|
-
mouseMove(params: {
|
|
1610
|
-
x?: number;
|
|
1611
|
-
y?: number;
|
|
1612
|
-
selector?: string;
|
|
1613
|
-
steps?: number;
|
|
1614
|
-
}): Promise<void>;
|
|
1615
|
-
/**
|
|
1616
|
-
* Clicks at current position or specified position.
|
|
1617
|
-
*
|
|
1618
|
-
* @param params - Click parameters (x, y, button, clickCount, delay)
|
|
1477
|
+
* The maximum number of levels to bubble up from the anchor to find a segment container.
|
|
1478
|
+
* If omitted, it bubbles up as high as possible without conflicting with neighboring segments.
|
|
1619
1479
|
*/
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1480
|
+
depth?: number;
|
|
1481
|
+
}
|
|
1482
|
+
/**
|
|
1483
|
+
* Union type for array extraction modes and their options.
|
|
1484
|
+
*/
|
|
1485
|
+
type ExtractArrayMode = ExtractArrayModeName | ColumnarOptions | SegmentedOptions;
|
|
1486
|
+
/**
|
|
1487
|
+
* Configuration for extracting an array of items.
|
|
1488
|
+
*/
|
|
1489
|
+
interface ExtractArraySchema extends BaseExtractSchema {
|
|
1490
|
+
type: 'array';
|
|
1627
1491
|
/**
|
|
1628
|
-
*
|
|
1629
|
-
*
|
|
1630
|
-
* @param params - Wheel parameters (x, y, selector, deltaX, deltaY, steps)
|
|
1492
|
+
* CSS selector for items (in 'nested' mode) or the container (in 'columnar'/'segmented' modes).
|
|
1631
1493
|
*/
|
|
1632
|
-
|
|
1633
|
-
x?: number;
|
|
1634
|
-
y?: number;
|
|
1635
|
-
selector?: string;
|
|
1636
|
-
deltaX?: number;
|
|
1637
|
-
deltaY?: number;
|
|
1638
|
-
steps?: number;
|
|
1639
|
-
}): Promise<void>;
|
|
1494
|
+
selector: string;
|
|
1640
1495
|
/**
|
|
1641
|
-
*
|
|
1642
|
-
*
|
|
1643
|
-
* @param params - Scroll parameters (selector)
|
|
1496
|
+
* Filter items/containers that contain a descendant matching this CSS selector.
|
|
1644
1497
|
*/
|
|
1645
|
-
|
|
1646
|
-
selector: string;
|
|
1647
|
-
}): Promise<void>;
|
|
1498
|
+
has?: string;
|
|
1648
1499
|
/**
|
|
1649
|
-
*
|
|
1650
|
-
*
|
|
1651
|
-
* @param text - Text to type
|
|
1652
|
-
* @param delay - Delay between key presses
|
|
1500
|
+
* Exclude items/containers matching this CSS selector.
|
|
1653
1501
|
*/
|
|
1654
|
-
|
|
1502
|
+
exclude?: string;
|
|
1655
1503
|
/**
|
|
1656
|
-
*
|
|
1657
|
-
*
|
|
1658
|
-
* @param key - Key to press
|
|
1659
|
-
* @param delay - Delay after key press
|
|
1504
|
+
* Schema applied recursively to each extracted item.
|
|
1505
|
+
* If omitted, defaults to extracting text.
|
|
1660
1506
|
*/
|
|
1661
|
-
|
|
1507
|
+
items?: ExtractSchema;
|
|
1662
1508
|
/**
|
|
1663
|
-
*
|
|
1664
|
-
*
|
|
1665
|
-
* @param selector - CSS selector of input element
|
|
1666
|
-
* @param value - Value to fill
|
|
1667
|
-
* @returns Promise resolving when fill operation completes
|
|
1668
|
-
* @throws {Error} When no active page context exists
|
|
1509
|
+
* Shortcut for `items` to extract a specific attribute directly.
|
|
1669
1510
|
*/
|
|
1670
|
-
|
|
1511
|
+
attribute?: string;
|
|
1671
1512
|
/**
|
|
1672
|
-
*
|
|
1673
|
-
*
|
|
1674
|
-
*
|
|
1675
|
-
*
|
|
1676
|
-
* @returns Promise resolving when form is submitted
|
|
1677
|
-
* @throws {Error} When no active page context exists
|
|
1513
|
+
* Array extraction mode.
|
|
1514
|
+
* - 'nested': (Default) Items are elements matched by `selector`.
|
|
1515
|
+
* - 'columnar': `selector` is a container, fields in `items` are parallel columns aligned by index.
|
|
1516
|
+
* - 'segmented': `selector` is a container, items are segmented by an anchor field.
|
|
1678
1517
|
*/
|
|
1679
|
-
|
|
1518
|
+
mode?: ExtractArrayMode;
|
|
1519
|
+
}
|
|
1520
|
+
/**
|
|
1521
|
+
* Configuration for extracting an object with multiple properties.
|
|
1522
|
+
*/
|
|
1523
|
+
interface ExtractObjectSchema extends BaseExtractSchema {
|
|
1524
|
+
type: 'object';
|
|
1680
1525
|
/**
|
|
1681
|
-
*
|
|
1682
|
-
*
|
|
1683
|
-
* @param options - Trim options specifying selectors and presets
|
|
1684
|
-
* @returns Promise resolving when trim operation completes
|
|
1685
|
-
* @throws {Error} When no active page context exists
|
|
1526
|
+
* Root selector for the object. If provided, sub-properties are searched within this element.
|
|
1686
1527
|
*/
|
|
1687
|
-
|
|
1528
|
+
selector?: string;
|
|
1688
1529
|
/**
|
|
1689
|
-
*
|
|
1690
|
-
*
|
|
1691
|
-
* @param message - Optional message to display during pause
|
|
1692
|
-
* @returns Promise resolving when execution is resumed
|
|
1693
|
-
* @throws {Error} When no active page context exists
|
|
1530
|
+
* Filter the object element based on descendants.
|
|
1694
1531
|
*/
|
|
1695
|
-
|
|
1532
|
+
has?: string;
|
|
1696
1533
|
/**
|
|
1697
|
-
*
|
|
1698
|
-
*
|
|
1699
|
-
* @remarks
|
|
1700
|
-
* This is a powerful action that allows running custom logic to interact with the DOM,
|
|
1701
|
-
* calculate values, or trigger navigations.
|
|
1702
|
-
*
|
|
1703
|
-
* - In **Browser Mode**, it runs in the real browser.
|
|
1704
|
-
* - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
|
|
1705
|
-
*
|
|
1706
|
-
* The action handles automatic navigation if `window.location` is modified.
|
|
1707
|
-
*
|
|
1708
|
-
* @param params - Configuration for the execution, including the function and arguments.
|
|
1709
|
-
* @returns A promise resolving to the result of the execution.
|
|
1710
|
-
* @throws {Error} If no active page context exists or if execution fails.
|
|
1711
|
-
*
|
|
1712
|
-
* @see {@link EvaluateActionOptions} for detailed parameter options and examples.
|
|
1534
|
+
* Exclude the object element if it matches this selector.
|
|
1713
1535
|
*/
|
|
1714
|
-
|
|
1536
|
+
exclude?: string;
|
|
1715
1537
|
/**
|
|
1716
|
-
*
|
|
1717
|
-
*
|
|
1718
|
-
*
|
|
1719
|
-
*
|
|
1538
|
+
* Where to start searching for fields within this object.
|
|
1539
|
+
* Only applicable when the object is being extracted from an array of elements (e.g. in 'segmented' mode).
|
|
1540
|
+
* - 'anchor': (Default) All fields are searched within the entire scope.
|
|
1541
|
+
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
1720
1542
|
*/
|
|
1721
|
-
|
|
1543
|
+
relativeTo?: 'anchor' | 'previous';
|
|
1722
1544
|
/**
|
|
1723
|
-
*
|
|
1545
|
+
* Explicit order of property extraction.
|
|
1546
|
+
* Useful when using `relativeTo: 'previous'`.
|
|
1724
1547
|
*/
|
|
1725
|
-
|
|
1548
|
+
order?: string[];
|
|
1726
1549
|
/**
|
|
1727
|
-
*
|
|
1728
|
-
* that can be used to restore the session later.
|
|
1550
|
+
* Definition of the object's properties and their corresponding extraction schemas.
|
|
1729
1551
|
*/
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1552
|
+
properties: {
|
|
1553
|
+
[key: string]: ExtractSchema;
|
|
1554
|
+
};
|
|
1555
|
+
}
|
|
1556
|
+
|
|
1557
|
+
interface PromiseLock extends Promise<void> {
|
|
1558
|
+
release: () => void;
|
|
1559
|
+
}
|
|
1560
|
+
|
|
1561
|
+
/**
|
|
1562
|
+
* Options for the {@link FetchEngine.goto} action, allowing configuration of HTTP method, payload, headers, and navigation behavior.
|
|
1563
|
+
*
|
|
1564
|
+
* @remarks
|
|
1565
|
+
* Used when navigating to a URL to specify additional parameters beyond the basic URL.
|
|
1566
|
+
*
|
|
1567
|
+
* @example
|
|
1568
|
+
* ```ts
|
|
1569
|
+
* await engine.goto('https://example.com', {
|
|
1570
|
+
* method: 'POST',
|
|
1571
|
+
* payload: { username: 'user', password: 'pass' },
|
|
1572
|
+
* headers: { 'Content-Type': 'application/json' },
|
|
1573
|
+
* waitUntil: 'networkidle'
|
|
1574
|
+
* });
|
|
1575
|
+
* ```
|
|
1576
|
+
*/
|
|
1577
|
+
interface GotoActionOptions {
|
|
1578
|
+
method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH';
|
|
1579
|
+
payload?: any;
|
|
1580
|
+
headers?: Record<string, string>;
|
|
1581
|
+
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
1582
|
+
timeoutMs?: number;
|
|
1583
|
+
simulate?: boolean;
|
|
1584
|
+
}
|
|
1585
|
+
/**
|
|
1586
|
+
* Options for the {@link FetchEngine.waitFor} action, specifying conditions to wait for before continuing.
|
|
1587
|
+
*
|
|
1588
|
+
* @remarks
|
|
1589
|
+
* Controls timing behavior for interactions, allowing waiting for elements, time intervals, or network conditions.
|
|
1590
|
+
*/
|
|
1591
|
+
interface WaitForActionOptions {
|
|
1592
|
+
ms?: number;
|
|
1593
|
+
selector?: string;
|
|
1594
|
+
networkIdle?: boolean;
|
|
1595
|
+
failOnTimeout?: boolean;
|
|
1596
|
+
}
|
|
1597
|
+
/**
|
|
1598
|
+
* Options for the {@link FetchEngine.submit} action, configuring form submission behavior.
|
|
1599
|
+
*
|
|
1600
|
+
* @remarks
|
|
1601
|
+
* Specifies encoding type for form submissions, particularly relevant for JSON-based APIs.
|
|
1602
|
+
*/
|
|
1603
|
+
interface SubmitActionOptions {
|
|
1604
|
+
enctype?: 'application/x-www-form-urlencoded' | 'application/json' | 'multipart/form-data';
|
|
1605
|
+
}
|
|
1606
|
+
/**
|
|
1607
|
+
* Predefined cleanup groups for the {@link FetchEngine.trim} action.
|
|
1608
|
+
*/
|
|
1609
|
+
type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
|
|
1610
|
+
/**
|
|
1611
|
+
* Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
|
|
1612
|
+
*/
|
|
1613
|
+
interface TrimActionOptions {
|
|
1614
|
+
selectors?: string | string[];
|
|
1615
|
+
presets?: TrimPreset | TrimPreset[];
|
|
1616
|
+
}
|
|
1617
|
+
declare const TRIM_PRESETS: Record<string, string[]>;
|
|
1618
|
+
/**
|
|
1619
|
+
* Options for the {@link FetchEngine.evaluate} action, specifying the function to execute and its arguments.
|
|
1620
|
+
*
|
|
1621
|
+
* @remarks
|
|
1622
|
+
* This action allows executing custom JavaScript logic within the page context.
|
|
1623
|
+
*
|
|
1624
|
+
* **Execution Environments:**
|
|
1625
|
+
* - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
|
|
1626
|
+
* - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
|
|
1627
|
+
* including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
|
|
1628
|
+
*
|
|
1629
|
+
* **Navigation Handling:**
|
|
1630
|
+
* If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
|
|
1631
|
+
* automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
|
|
1632
|
+
*
|
|
1633
|
+
* @example
|
|
1634
|
+
* ```json
|
|
1635
|
+
* {
|
|
1636
|
+
* "action": "evaluate",
|
|
1637
|
+
* "params": {
|
|
1638
|
+
* "fn": "([a, b]) => a + b",
|
|
1639
|
+
* "args": [1, 2]
|
|
1640
|
+
* }
|
|
1641
|
+
* }
|
|
1642
|
+
* ```
|
|
1643
|
+
*
|
|
1644
|
+
* @example
|
|
1645
|
+
* ```json
|
|
1646
|
+
* {
|
|
1647
|
+
* "action": "evaluate",
|
|
1648
|
+
* "params": {
|
|
1649
|
+
* "fn": "({ x, y }) => x * y",
|
|
1650
|
+
* "args": { "x": 6, "y": 7 }
|
|
1651
|
+
* }
|
|
1652
|
+
* }
|
|
1653
|
+
* ```
|
|
1654
|
+
*/
|
|
1655
|
+
interface EvaluateActionOptions {
|
|
1734
1656
|
/**
|
|
1735
|
-
*
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
*
|
|
1657
|
+
* The function or expression to execute.
|
|
1658
|
+
*
|
|
1659
|
+
* @remarks
|
|
1660
|
+
* Can be:
|
|
1661
|
+
* 1. A function object (only available when using the API directly).
|
|
1662
|
+
* 2. A string containing a function definition, e.g., `"async (args) => { ... }"`
|
|
1663
|
+
* 3. A string containing a direct expression, e.g., `"document.title"`
|
|
1664
|
+
*
|
|
1665
|
+
* **Note:** When using a function, it receives exactly ONE argument: the value provided in {@link args}.
|
|
1666
|
+
* Use destructuring to handle multiple parameters.
|
|
1740
1667
|
*/
|
|
1741
|
-
|
|
1668
|
+
fn: string | ((...args: any[]) => any);
|
|
1742
1669
|
/**
|
|
1743
|
-
*
|
|
1744
|
-
*
|
|
1745
|
-
* @param context - Fetch engine context
|
|
1746
|
-
* @param options - Configuration options
|
|
1747
|
-
* @returns Promise resolving when initialization completes
|
|
1670
|
+
* Data to pass to the function.
|
|
1748
1671
|
*
|
|
1749
1672
|
* @remarks
|
|
1750
|
-
*
|
|
1751
|
-
*
|
|
1673
|
+
* This value is passed as the first and only argument to the function defined in {@link fn}.
|
|
1674
|
+
* Recommended to use an array or object for multiple values.
|
|
1752
1675
|
*/
|
|
1753
|
-
|
|
1754
|
-
|
|
1676
|
+
args?: any;
|
|
1677
|
+
}
|
|
1678
|
+
/**
|
|
1679
|
+
* Union type representing all possible engine actions that can be dispatched.
|
|
1680
|
+
*
|
|
1681
|
+
* @remarks
|
|
1682
|
+
* Defines the command structure processed during page interactions. Each action type corresponds to
|
|
1683
|
+
* a specific user interaction or navigation command within the action loop architecture.
|
|
1684
|
+
*/
|
|
1685
|
+
type FetchEngineAction = {
|
|
1686
|
+
type: 'click';
|
|
1687
|
+
selector: string;
|
|
1688
|
+
} | {
|
|
1689
|
+
type: 'fill';
|
|
1690
|
+
selector: string;
|
|
1691
|
+
value: string;
|
|
1692
|
+
} | {
|
|
1693
|
+
type: 'mouseMove';
|
|
1694
|
+
params: {
|
|
1695
|
+
x?: number;
|
|
1696
|
+
y?: number;
|
|
1697
|
+
selector?: string;
|
|
1698
|
+
steps?: number;
|
|
1699
|
+
};
|
|
1700
|
+
} | {
|
|
1701
|
+
type: 'mouseClick';
|
|
1702
|
+
params: {
|
|
1703
|
+
x?: number;
|
|
1704
|
+
y?: number;
|
|
1705
|
+
button?: 'left' | 'right' | 'middle';
|
|
1706
|
+
clickCount?: number;
|
|
1707
|
+
delay?: number;
|
|
1708
|
+
steps?: number;
|
|
1709
|
+
};
|
|
1710
|
+
} | {
|
|
1711
|
+
type: 'mouseWheel';
|
|
1712
|
+
params: {
|
|
1713
|
+
x?: number;
|
|
1714
|
+
y?: number;
|
|
1715
|
+
selector?: string;
|
|
1716
|
+
deltaX?: number;
|
|
1717
|
+
deltaY?: number;
|
|
1718
|
+
steps?: number;
|
|
1719
|
+
};
|
|
1720
|
+
} | {
|
|
1721
|
+
type: 'keyboardType';
|
|
1722
|
+
params: {
|
|
1723
|
+
text: string;
|
|
1724
|
+
delay?: number;
|
|
1725
|
+
};
|
|
1726
|
+
} | {
|
|
1727
|
+
type: 'keyboardPress';
|
|
1728
|
+
params: {
|
|
1729
|
+
key: string;
|
|
1730
|
+
delay?: number;
|
|
1731
|
+
};
|
|
1732
|
+
} | {
|
|
1733
|
+
type: 'scrollIntoView';
|
|
1734
|
+
params: {
|
|
1735
|
+
selector: string;
|
|
1736
|
+
};
|
|
1737
|
+
} | {
|
|
1738
|
+
type: 'waitFor';
|
|
1739
|
+
options?: WaitForActionOptions;
|
|
1740
|
+
} | {
|
|
1741
|
+
type: 'submit';
|
|
1742
|
+
selector?: any;
|
|
1743
|
+
options?: SubmitActionOptions;
|
|
1744
|
+
} | {
|
|
1745
|
+
type: 'getContent';
|
|
1746
|
+
} | {
|
|
1747
|
+
type: 'navigate';
|
|
1748
|
+
url: string;
|
|
1749
|
+
opts?: GotoActionOptions;
|
|
1750
|
+
} | {
|
|
1751
|
+
type: 'extract';
|
|
1752
|
+
schema: ExtractSchema;
|
|
1753
|
+
} | {
|
|
1754
|
+
type: 'pause';
|
|
1755
|
+
message?: string;
|
|
1756
|
+
} | {
|
|
1757
|
+
type: 'trim';
|
|
1758
|
+
options: TrimActionOptions;
|
|
1759
|
+
} | {
|
|
1760
|
+
type: 'evaluate';
|
|
1761
|
+
params: EvaluateActionOptions;
|
|
1762
|
+
} | {
|
|
1763
|
+
type: 'dispose';
|
|
1764
|
+
};
|
|
1765
|
+
/**
|
|
1766
|
+
* Represents an action that has been dispatched and is awaiting execution in the active page context.
|
|
1767
|
+
*
|
|
1768
|
+
* @remarks
|
|
1769
|
+
* Connects the action request with its resolution mechanism. Used internally by the action dispatch system
|
|
1770
|
+
* to handle promises while maintaining the page context validity window.
|
|
1771
|
+
*/
|
|
1772
|
+
interface DispatchedEngineAction {
|
|
1773
|
+
action: FetchEngineAction;
|
|
1774
|
+
resolve: (value?: any) => void;
|
|
1775
|
+
reject: (reason?: any) => void;
|
|
1776
|
+
}
|
|
1777
|
+
/**
|
|
1778
|
+
* Represents a pending navigation request awaiting resolution.
|
|
1779
|
+
*
|
|
1780
|
+
* @remarks
|
|
1781
|
+
* Tracks navigation requests that have been queued but not yet processed by the request handler.
|
|
1782
|
+
*/
|
|
1783
|
+
interface PendingEngineRequest {
|
|
1784
|
+
resolve: (value: any) => void;
|
|
1785
|
+
reject: (reason?: any) => void;
|
|
1786
|
+
}
|
|
1787
|
+
/**
|
|
1788
|
+
* Abstract base class for all fetch engines, providing a unified interface for web content fetching and interaction.
|
|
1789
|
+
*
|
|
1790
|
+
* @remarks
|
|
1791
|
+
* The `FetchEngine` class serves as the foundation for concrete engine implementations (e.g., `CheerioFetchEngine`,
|
|
1792
|
+
* `PlaywrightFetchEngine`). It abstracts underlying crawling technology and provides a consistent API for navigation,
|
|
1793
|
+
* content retrieval, and user interaction.
|
|
1794
|
+
*
|
|
1795
|
+
* The engine architecture uses an event-driven action loop to bridge Crawlee's stateless request handling with
|
|
1796
|
+
* the need for a stateful, sequential API for page interactions. This solves the critical challenge of maintaining
|
|
1797
|
+
* page context validity across asynchronous operations.
|
|
1798
|
+
*
|
|
1799
|
+
* @example
|
|
1800
|
+
* ```ts
|
|
1801
|
+
* import "./playwright"; // import to register the Playwright browser engine
|
|
1802
|
+
* const engine = await FetchEngine.create(context, { engine: 'browser' });
|
|
1803
|
+
* await engine.goto('https://example.com');
|
|
1804
|
+
* await engine.fill('#username', 'user');
|
|
1805
|
+
* await engine.click('#submit');
|
|
1806
|
+
* const response = await engine.getContent();
|
|
1807
|
+
* ```
|
|
1808
|
+
*/
|
|
1809
|
+
type AnyFetchEngine = FetchEngine<any, any, any>;
|
|
1810
|
+
type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
|
|
1811
|
+
declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
|
|
1812
|
+
private static registry;
|
|
1755
1813
|
/**
|
|
1756
|
-
*
|
|
1757
|
-
*
|
|
1758
|
-
* @
|
|
1814
|
+
* Registers a fetch engine implementation with the global registry.
|
|
1815
|
+
*
|
|
1816
|
+
* @param engineClass - The engine class to register
|
|
1817
|
+
* @throws {Error} When engine class lacks static `id` or ID is already registered
|
|
1818
|
+
*
|
|
1819
|
+
* @example
|
|
1820
|
+
* ```ts
|
|
1821
|
+
* FetchEngine.register(CheerioFetchEngine);
|
|
1822
|
+
* ```
|
|
1759
1823
|
*/
|
|
1760
|
-
|
|
1824
|
+
static register(engineClass: AnyFetchEngineCtor): void;
|
|
1761
1825
|
/**
|
|
1762
|
-
*
|
|
1763
|
-
*
|
|
1764
|
-
* @param
|
|
1765
|
-
* @
|
|
1826
|
+
* Retrieves a fetch engine implementation by its unique ID.
|
|
1827
|
+
*
|
|
1828
|
+
* @param id - The ID of the engine to retrieve
|
|
1829
|
+
* @returns Engine class if found, otherwise `undefined`
|
|
1766
1830
|
*/
|
|
1767
|
-
|
|
1768
|
-
protected _handlePause(action: {
|
|
1769
|
-
message?: string;
|
|
1770
|
-
}): Promise<void>;
|
|
1831
|
+
static get(id: string): AnyFetchEngineCtor | undefined;
|
|
1771
1832
|
/**
|
|
1772
|
-
*
|
|
1773
|
-
*
|
|
1774
|
-
* **Critical Execution Constraint**: This method **MUST** be awaited within the synchronous execution path
|
|
1775
|
-
* of Crawlee's [requestHandler](https://crawlee.dev/js/api/basic-crawler) (before any `await` that yields control back to the event loop).
|
|
1776
|
-
*
|
|
1777
|
-
* ### Why This Constraint Exists
|
|
1778
|
-
* - Crawlee's page context ([PlaywrightCrawler](https://crawlee.dev/js/api/playwright-crawler)'s `page` or [CheerioCrawler](https://crawlee.dev/js/api/cheerio-crawler)'s `$`)
|
|
1779
|
-
* is **only valid during the synchronous execution phase** of the request handler
|
|
1780
|
-
* - After any `await` (e.g., `await page.goto()`), the page context may be destroyed
|
|
1781
|
-
* due to Crawlee's internal resource management
|
|
1782
|
-
*
|
|
1783
|
-
* ### How It Works
|
|
1784
|
-
* 1. Processes all actions queued via {@link dispatchAction} (click, fill, submit, etc.)
|
|
1785
|
-
* 2. Maintains the page context validity window via {@link isPageActive} lifecycle flag
|
|
1786
|
-
* 3. Automatically cleans up event listeners upon completion
|
|
1833
|
+
* Retrieves a fetch engine implementation by execution mode.
|
|
1787
1834
|
*
|
|
1788
|
-
*
|
|
1789
|
-
* @
|
|
1790
|
-
* @param context The active Crawlee crawling context containing the page/$ object
|
|
1791
|
-
* @throws {Error} If called outside valid page context window (`!this.isPageActive`)
|
|
1792
|
-
* @internal Engine infrastructure method - not for direct consumer use
|
|
1835
|
+
* @param mode - Execution mode (`'http'` or `'browser'`)
|
|
1836
|
+
* @returns Engine class if found, otherwise `undefined`
|
|
1793
1837
|
*/
|
|
1794
|
-
|
|
1795
|
-
protected _sharedRequestHandler(context: TContext): Promise<void>;
|
|
1796
|
-
protected _sharedFailedRequestHandler(context: TContext, error?: Error): Promise<void>;
|
|
1797
|
-
protected dispatchAction<T>(action: FetchEngineAction): Promise<T>;
|
|
1798
|
-
private _requestHandler;
|
|
1799
|
-
private _failedRequestHandler;
|
|
1800
|
-
protected _commonCleanup(): Promise<void>;
|
|
1838
|
+
static getByMode(mode: FetchEngineType): AnyFetchEngineCtor | undefined;
|
|
1801
1839
|
/**
|
|
1802
|
-
*
|
|
1840
|
+
* Factory method to create and initialize a fetch engine instance.
|
|
1803
1841
|
*
|
|
1804
|
-
* @param
|
|
1805
|
-
* @param
|
|
1806
|
-
* @returns
|
|
1842
|
+
* @param ctx - Fetch engine context
|
|
1843
|
+
* @param options - Configuration options
|
|
1844
|
+
* @returns Initialized fetch engine instance
|
|
1845
|
+
* @throws {Error} When no suitable engine implementation is found
|
|
1807
1846
|
*
|
|
1808
|
-
* @
|
|
1809
|
-
*
|
|
1810
|
-
* await engine.blockResources(['image', 'stylesheet']);
|
|
1811
|
-
* await engine.blockResources(['script'], true); // Replace existing
|
|
1812
|
-
* ```
|
|
1847
|
+
* @remarks
|
|
1848
|
+
* Primary entry point for engine creation. Selects the appropriate implementation based on the `engine` name given in the options or the context.
|
|
1813
1849
|
*/
|
|
1814
|
-
|
|
1850
|
+
static create(ctx: FetchEngineContext, options?: BaseFetcherProperties): Promise<AnyFetchEngine | undefined>;
|
|
1815
1851
|
/**
|
|
1816
|
-
*
|
|
1852
|
+
* Unique identifier for the engine implementation.
|
|
1817
1853
|
*
|
|
1818
|
-
* @
|
|
1819
|
-
*
|
|
1854
|
+
* @remarks
|
|
1855
|
+
* Must be defined by concrete implementations. Used for registration and lookup in engine registry.
|
|
1820
1856
|
*/
|
|
1821
|
-
|
|
1857
|
+
static readonly id: string;
|
|
1822
1858
|
/**
|
|
1823
|
-
*
|
|
1824
|
-
*
|
|
1825
|
-
* @overload
|
|
1826
|
-
* Gets all headers.
|
|
1827
|
-
* @returns All headers as record
|
|
1859
|
+
* Execution mode of the engine (`'http'` or `'browser'`).
|
|
1828
1860
|
*
|
|
1829
|
-
* @
|
|
1830
|
-
*
|
|
1831
|
-
* @param name - Header name
|
|
1832
|
-
* @returns Header value
|
|
1833
|
-
*
|
|
1834
|
-
* @overload
|
|
1835
|
-
* Sets multiple headers.
|
|
1836
|
-
* @param headers - Headers to set
|
|
1837
|
-
* @param replaced - Whether to replace all existing headers
|
|
1838
|
-
* @returns `true` if successful
|
|
1839
|
-
*
|
|
1840
|
-
* @overload
|
|
1841
|
-
* Sets single header.
|
|
1842
|
-
* @param name - Header name
|
|
1843
|
-
* @param value - Header value or `null` to remove
|
|
1844
|
-
* @returns `true` if successful
|
|
1845
|
-
*
|
|
1846
|
-
* @example
|
|
1847
|
-
* ```ts
|
|
1848
|
-
* const allHeaders = await engine.headers();
|
|
1849
|
-
* const userAgent = await engine.headers('user-agent');
|
|
1850
|
-
* await engine.headers({ 'x-custom': 'value' });
|
|
1851
|
-
* await engine.headers('auth', 'token');
|
|
1852
|
-
* ```
|
|
1861
|
+
* @remarks
|
|
1862
|
+
* Must be defined by concrete implementations. Indicates whether engine operates at HTTP level or uses full browser.
|
|
1853
1863
|
*/
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1864
|
+
static readonly mode: FetchEngineType;
|
|
1865
|
+
protected ctx?: FetchEngineContext;
|
|
1866
|
+
protected opts?: BaseFetcherProperties;
|
|
1867
|
+
protected crawler?: TCrawler;
|
|
1868
|
+
protected isCrawlerReady?: boolean;
|
|
1869
|
+
protected crawlerRunPromise?: Promise<FinalStatistics>;
|
|
1870
|
+
protected config?: Configuration;
|
|
1871
|
+
protected requestQueue?: RequestQueue;
|
|
1872
|
+
protected kvStore?: KeyValueStore;
|
|
1873
|
+
protected proxyConfiguration?: ProxyConfiguration;
|
|
1874
|
+
protected hdrs: Record<string, string>;
|
|
1875
|
+
protected _initialCookies?: Cookie[];
|
|
1876
|
+
protected _initializedSessions: Set<string>;
|
|
1877
|
+
protected currentSession?: Session;
|
|
1878
|
+
protected pendingRequests: Map<string, PendingEngineRequest>;
|
|
1879
|
+
protected requestCounter: number;
|
|
1880
|
+
protected actionEmitter: EventEmitter;
|
|
1881
|
+
protected isPageActive: boolean;
|
|
1882
|
+
protected isEngineDisposed: boolean;
|
|
1883
|
+
protected navigationLock: PromiseLock;
|
|
1884
|
+
protected activeContext?: TContext;
|
|
1885
|
+
protected isExecutingAction: boolean;
|
|
1886
|
+
protected lastResponse?: FetchResponse;
|
|
1887
|
+
protected actionQueue: DispatchedEngineAction[];
|
|
1888
|
+
protected isProcessingActionLoop: boolean;
|
|
1889
|
+
protected blockedTypes: Set<string>;
|
|
1890
|
+
_logDebug(category: string, ...args: any[]): void;
|
|
1891
|
+
protected _cleanup?(): Promise<void>;
|
|
1892
|
+
protected _getTrimInfo(options: TrimActionOptions): {
|
|
1893
|
+
selectors: string[];
|
|
1894
|
+
removeComments: boolean;
|
|
1895
|
+
removeHidden: boolean;
|
|
1896
|
+
};
|
|
1858
1897
|
/**
|
|
1859
|
-
*
|
|
1860
|
-
*
|
|
1861
|
-
* @overload
|
|
1862
|
-
* Gets all cookies.
|
|
1863
|
-
* @returns Array of cookies
|
|
1864
|
-
*
|
|
1865
|
-
* @overload
|
|
1866
|
-
* Sets cookies for session.
|
|
1867
|
-
* @param cookies - Cookies to set
|
|
1868
|
-
* @returns `true` if successful
|
|
1898
|
+
* Finds all elements matching the selector within the given scope.
|
|
1869
1899
|
*
|
|
1870
|
-
* @
|
|
1871
|
-
*
|
|
1872
|
-
*
|
|
1873
|
-
*
|
|
1874
|
-
*
|
|
1900
|
+
* @param scope - The scope to search in (Engine-specific element/node or array of nodes).
|
|
1901
|
+
* @param selector - CSS selector.
|
|
1902
|
+
* @returns List of matching elements.
|
|
1903
|
+
* @see {@link IExtractEngine._querySelectorAll} for behavior contract.
|
|
1904
|
+
* @internal
|
|
1875
1905
|
*/
|
|
1876
|
-
|
|
1877
|
-
cookies(cookies: Cookie[]): Promise<boolean>;
|
|
1906
|
+
abstract _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
1878
1907
|
/**
|
|
1879
|
-
*
|
|
1908
|
+
* Extracts a primitive value from the element based on schema.
|
|
1880
1909
|
*
|
|
1881
|
-
* @
|
|
1910
|
+
* @param schema - Value extraction schema.
|
|
1911
|
+
* @param scope - The element scope.
|
|
1912
|
+
* @returns Extracted value.
|
|
1913
|
+
* @see {@link IExtractEngine._extractValue} for behavior contract.
|
|
1914
|
+
* @internal
|
|
1882
1915
|
*/
|
|
1883
|
-
|
|
1884
|
-
}
|
|
1885
|
-
declare function getRandomDelay(base: number, variance?: number): number;
|
|
1886
|
-
|
|
1887
|
-
type FetchReturnType = 'response' | 'context' | 'outputs' | 'any' | 'none';
|
|
1888
|
-
interface FetchReturnTypeRegistry {
|
|
1889
|
-
response: FetchResponse;
|
|
1890
|
-
context: FetchContext;
|
|
1891
|
-
result: FetchActionResult<any> | undefined;
|
|
1892
|
-
outputs: Record<string, any>;
|
|
1893
|
-
any: any;
|
|
1894
|
-
none: void;
|
|
1895
|
-
}
|
|
1896
|
-
type FetchReturnTypeFor<R extends FetchReturnType> = R extends keyof FetchReturnTypeRegistry ? FetchReturnTypeRegistry[R] : never;
|
|
1897
|
-
|
|
1898
|
-
/**
|
|
1899
|
-
* Represents the state of an action being executed within a context.
|
|
1900
|
-
*
|
|
1901
|
-
* @remarks
|
|
1902
|
-
* Extends the basic action properties with runtime metadata like execution index,
|
|
1903
|
-
* nesting depth, and any errors encountered during execution.
|
|
1904
|
-
*/
|
|
1905
|
-
interface FetchActionInContext extends FetchActionProperties {
|
|
1916
|
+
abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
|
|
1906
1917
|
/**
|
|
1907
|
-
*
|
|
1918
|
+
* Gets the parent element of the given element.
|
|
1919
|
+
*
|
|
1920
|
+
* @param scope - The element scope.
|
|
1921
|
+
* @returns Parent element or null.
|
|
1922
|
+
* @internal
|
|
1908
1923
|
*/
|
|
1909
|
-
|
|
1924
|
+
abstract _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1910
1925
|
/**
|
|
1911
|
-
*
|
|
1926
|
+
* Checks whether two element scopes refer to the same DOM node.
|
|
1927
|
+
*
|
|
1928
|
+
* @param scope1 - First element scope.
|
|
1929
|
+
* @param scope2 - Second element scope.
|
|
1930
|
+
* @returns True if they are the same DOM node.
|
|
1931
|
+
* @internal
|
|
1912
1932
|
*/
|
|
1913
|
-
|
|
1933
|
+
abstract _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
|
|
1914
1934
|
/**
|
|
1915
|
-
*
|
|
1935
|
+
* Gets all subsequent siblings of an element until a sibling matches the selector.
|
|
1936
|
+
* Used in 'segmented' extraction mode.
|
|
1937
|
+
*
|
|
1938
|
+
* @param scope - The anchor element scope.
|
|
1939
|
+
* @param untilSelector - Optional selector that marks the end of the segment (exclusive).
|
|
1940
|
+
* @returns List of sibling elements between anchor and untilSelector.
|
|
1941
|
+
* @internal
|
|
1916
1942
|
*/
|
|
1917
|
-
|
|
1918
|
-
}
|
|
1919
|
-
/**
|
|
1920
|
-
* Base internal state used by fetch engines to maintain their runtime environment.
|
|
1921
|
-
*
|
|
1922
|
-
* @internal
|
|
1923
|
-
*/
|
|
1924
|
-
interface BaseFetchContextInteralState {
|
|
1943
|
+
abstract _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
1925
1944
|
/**
|
|
1926
|
-
*
|
|
1927
|
-
*
|
|
1945
|
+
* Finds the closest ancestor of `scope` (including itself) that exists in the `candidates` array.
|
|
1946
|
+
*
|
|
1947
|
+
* @param scope - The starting element.
|
|
1948
|
+
* @param candidates - The array of potential ancestor scopes.
|
|
1949
|
+
* @returns A promise resolving to the matching candidate scope, or `null` if none found.
|
|
1950
|
+
* @see {@link IExtractEngine._findClosestAncestor} for implementation details.
|
|
1951
|
+
* @internal
|
|
1928
1952
|
*/
|
|
1929
|
-
|
|
1953
|
+
abstract _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
|
|
1930
1954
|
/**
|
|
1931
|
-
*
|
|
1955
|
+
* Checks if the `container` scope contains the `element` scope.
|
|
1956
|
+
*
|
|
1957
|
+
* @param container - The potential ancestor element.
|
|
1958
|
+
* @param element - The potential descendant element.
|
|
1959
|
+
* @returns A promise resolving to `true` if `container` contains `element`.
|
|
1960
|
+
* @see {@link IExtractEngine._contains} for implementation details.
|
|
1961
|
+
* @internal
|
|
1932
1962
|
*/
|
|
1933
|
-
|
|
1934
|
-
}
|
|
1935
|
-
/**
|
|
1936
|
-
* Extended internal state for the fetch context, including action lifecycle management.
|
|
1937
|
-
*
|
|
1938
|
-
* @internal
|
|
1939
|
-
*/
|
|
1940
|
-
interface FetchContextInteralState extends BaseFetchContextInteralState {
|
|
1963
|
+
abstract _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
|
|
1941
1964
|
/**
|
|
1942
|
-
*
|
|
1965
|
+
* Finds the Lowest Common Ancestor (LCA) of two element scopes.
|
|
1966
|
+
*
|
|
1967
|
+
* @param scope1 - The first element scope.
|
|
1968
|
+
* @param scope2 - The second element scope.
|
|
1969
|
+
* @returns A promise resolving to the LCA element scope, or `null` if none found.
|
|
1970
|
+
* @internal
|
|
1943
1971
|
*/
|
|
1944
|
-
|
|
1972
|
+
abstract _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1945
1973
|
/**
|
|
1946
|
-
*
|
|
1974
|
+
* Finds the direct child of container that contains element.
|
|
1975
|
+
*
|
|
1976
|
+
* @param element - The descendant element.
|
|
1977
|
+
* @param container - The container element.
|
|
1978
|
+
* @returns The child element of container, or null.
|
|
1979
|
+
* @internal
|
|
1947
1980
|
*/
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
/**
|
|
1951
|
-
* Context provided to the Fetch Engine during navigation and request handling.
|
|
1952
|
-
*
|
|
1953
|
-
* @remarks
|
|
1954
|
-
* This interface contains the minimum set of properties required by an engine
|
|
1955
|
-
* to perform a fetch operation and build a response.
|
|
1956
|
-
*/
|
|
1957
|
-
interface FetchEngineContext extends BaseFetcherProperties {
|
|
1981
|
+
abstract _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1982
|
+
protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
|
|
1958
1983
|
/**
|
|
1959
|
-
*
|
|
1984
|
+
* Normalizes the array extraction mode into an options object.
|
|
1985
|
+
* @param mode - The mode string or options object.
|
|
1986
|
+
* @internal
|
|
1960
1987
|
*/
|
|
1961
|
-
|
|
1988
|
+
protected _normalizeArrayMode(mode?: ExtractArrayMode): {
|
|
1989
|
+
type: ExtractArrayModeName;
|
|
1990
|
+
} & any;
|
|
1962
1991
|
/**
|
|
1963
|
-
*
|
|
1992
|
+
* Performs standard nested array extraction.
|
|
1993
|
+
* @param items - The schema for each item.
|
|
1994
|
+
* @param elements - The list of item elements.
|
|
1995
|
+
* @internal
|
|
1964
1996
|
*/
|
|
1965
|
-
|
|
1997
|
+
protected _extractNested(items: ExtractSchema, elements: FetchElementScope[], opts?: {
|
|
1998
|
+
strict?: boolean;
|
|
1999
|
+
}): Promise<any[]>;
|
|
1966
2000
|
/**
|
|
1967
|
-
*
|
|
2001
|
+
* Performs columnar extraction (Column Alignment Mode).
|
|
2002
|
+
*
|
|
2003
|
+
* @param schema - The schema for a single item (must be an object or implicit object).
|
|
2004
|
+
* @param container - The container element to search within.
|
|
2005
|
+
* @param opts - Columnar extraction options (strict, inference).
|
|
2006
|
+
* @returns An array of extracted items, or null if requirements aren't met.
|
|
2007
|
+
* @internal
|
|
1968
2008
|
*/
|
|
1969
|
-
|
|
2009
|
+
protected _extractColumnar(schema: ExtractSchema, container: FetchElementScope, opts?: ColumnarOptions): Promise<any[] | null>;
|
|
1970
2010
|
/**
|
|
1971
|
-
*
|
|
2011
|
+
* Performs segmented extraction (Anchor-based Scanning).
|
|
2012
|
+
*
|
|
2013
|
+
* @param schema - The schema for a single item (must be an object).
|
|
2014
|
+
* @param container - The container element to scan.
|
|
2015
|
+
* @param opts - Segmented extraction options (anchor).
|
|
2016
|
+
* @returns An array of extracted items.
|
|
2017
|
+
* @internal
|
|
1972
2018
|
*/
|
|
1973
|
-
|
|
2019
|
+
protected _extractSegmented(schema: ExtractSchema, container: FetchElementScope, opts?: SegmentedOptions): Promise<any[] | null>;
|
|
1974
2020
|
/**
|
|
1975
|
-
*
|
|
2021
|
+
* Creates the crawler instance for the specific engine implementation.
|
|
2022
|
+
* @param options - The final crawler options.
|
|
2023
|
+
* @internal
|
|
1976
2024
|
*/
|
|
1977
|
-
|
|
2025
|
+
protected abstract _createCrawler(options: TOptions, config?: Configuration): TCrawler;
|
|
1978
2026
|
/**
|
|
1979
|
-
*
|
|
2027
|
+
* Gets the crawler-specific options from the subclass.
|
|
2028
|
+
* @param ctx - The fetch engine context.
|
|
2029
|
+
* @internal
|
|
1980
2030
|
*/
|
|
1981
|
-
|
|
1982
|
-
}
|
|
1983
|
-
/**
|
|
1984
|
-
* The full execution context for a Web Fetcher session or action batch.
|
|
1985
|
-
*
|
|
1986
|
-
* @remarks
|
|
1987
|
-
* This object is the central state container for the fetch operation. It provides
|
|
1988
|
-
* access to configuration, the event bus, shared outputs, and the execution engine.
|
|
1989
|
-
* It is passed to every action during execution.
|
|
1990
|
-
*/
|
|
1991
|
-
interface FetchContext extends FetchEngineContext {
|
|
2031
|
+
protected abstract _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<TOptions>> | Partial<TOptions>;
|
|
1992
2032
|
/**
|
|
1993
|
-
*
|
|
2033
|
+
* Abstract method for building standard [FetchResponse] from Crawlee context.
|
|
2034
|
+
*
|
|
2035
|
+
* @param context - Crawlee crawling context
|
|
2036
|
+
* @returns Promise resolving to [FetchResponse] object
|
|
2037
|
+
*
|
|
2038
|
+
* @remarks
|
|
2039
|
+
* Converts implementation-specific context (Playwright `page` or Cheerio `$`) to standardized response.
|
|
2040
|
+
* @internal
|
|
1994
2041
|
*/
|
|
1995
|
-
|
|
2042
|
+
protected abstract _buildResponse(context: TContext): Promise<FetchResponse>;
|
|
2043
|
+
protected buildResponse(context: TContext): Promise<FetchResponse>;
|
|
1996
2044
|
/**
|
|
1997
|
-
*
|
|
1998
|
-
*
|
|
2045
|
+
* Abstract method for executing action within current page context.
|
|
2046
|
+
*
|
|
2047
|
+
* @param context - Crawlee crawling context
|
|
2048
|
+
* @param action - Action to execute
|
|
2049
|
+
* @returns Promise resolving to action result
|
|
2050
|
+
*
|
|
2051
|
+
* @remarks
|
|
2052
|
+
* Handles specific user interactions using underlying technology (Playwright/Cheerio).
|
|
2053
|
+
* @internal
|
|
1999
2054
|
*/
|
|
2000
|
-
|
|
2055
|
+
protected abstract executeAction(context: TContext, action: FetchEngineAction): Promise<any>;
|
|
2001
2056
|
/**
|
|
2002
|
-
*
|
|
2057
|
+
* Navigates to the specified URL.
|
|
2003
2058
|
*
|
|
2004
|
-
* @param
|
|
2005
|
-
* @
|
|
2059
|
+
* @param url - Target URL
|
|
2060
|
+
* @param params - Navigation options
|
|
2061
|
+
* @returns Promise resolving when navigation completes
|
|
2062
|
+
*
|
|
2063
|
+
* @example
|
|
2064
|
+
* ```ts
|
|
2065
|
+
* await engine.goto('https://example.com');
|
|
2066
|
+
* ```
|
|
2006
2067
|
*/
|
|
2007
|
-
|
|
2068
|
+
abstract goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
|
|
2008
2069
|
/**
|
|
2009
|
-
*
|
|
2070
|
+
* Waits for specified condition before continuing.
|
|
2010
2071
|
*
|
|
2011
|
-
* @param
|
|
2012
|
-
* @
|
|
2013
|
-
*
|
|
2014
|
-
* @
|
|
2072
|
+
* @param params - Wait conditions
|
|
2073
|
+
* @returns Promise resolving when wait condition is met
|
|
2074
|
+
*
|
|
2075
|
+
* @example
|
|
2076
|
+
* ```ts
|
|
2077
|
+
* await engine.waitFor({ ms: 1000 }); // Wait 1 second
|
|
2078
|
+
* await engine.waitFor({ selector: '#content' }); // Wait for element
|
|
2079
|
+
* ```
|
|
2015
2080
|
*/
|
|
2016
|
-
|
|
2081
|
+
waitFor(params?: WaitForActionOptions): Promise<void>;
|
|
2017
2082
|
/**
|
|
2018
|
-
*
|
|
2083
|
+
* Clicks on element matching selector.
|
|
2084
|
+
*
|
|
2085
|
+
* @param selector - CSS selector of element to click
|
|
2086
|
+
* @returns Promise resolving when click is processed
|
|
2087
|
+
* @throws {Error} When no active page context exists
|
|
2019
2088
|
*/
|
|
2020
|
-
|
|
2089
|
+
click(selector: string): Promise<void>;
|
|
2021
2090
|
/**
|
|
2022
|
-
*
|
|
2091
|
+
* Moves mouse to specified position or element.
|
|
2092
|
+
*
|
|
2093
|
+
* @param params - Move parameters (x, y, selector, steps)
|
|
2023
2094
|
*/
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
|
|
2031
|
-
static readonly id = "cheerio";
|
|
2032
|
-
static readonly mode = "http";
|
|
2033
|
-
private _ensureCheerioContext;
|
|
2034
|
-
protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
|
|
2035
|
-
_querySelectorAll(scope: {
|
|
2036
|
-
$: CheerioAPI;
|
|
2037
|
-
el: any;
|
|
2038
|
-
} | any[], selector: string): Promise<FetchElementScope[]>;
|
|
2039
|
-
_nextSiblingsUntil(scope: {
|
|
2040
|
-
$: CheerioAPI;
|
|
2041
|
-
el: CheerioNode;
|
|
2042
|
-
}, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2043
|
-
_parentElement(scope: {
|
|
2044
|
-
$: CheerioAPI;
|
|
2045
|
-
el: CheerioNode;
|
|
2046
|
-
}): Promise<FetchElementScope | null>;
|
|
2047
|
-
_isSameElement(scope1: {
|
|
2048
|
-
el: CheerioNode;
|
|
2049
|
-
}, scope2: {
|
|
2050
|
-
el: CheerioNode;
|
|
2051
|
-
}): Promise<boolean>;
|
|
2052
|
-
_findClosestAncestor(scope: {
|
|
2053
|
-
$: CheerioAPI;
|
|
2054
|
-
el: CheerioNode;
|
|
2055
|
-
}, candidates: {
|
|
2056
|
-
$: CheerioAPI;
|
|
2057
|
-
el: CheerioNode;
|
|
2058
|
-
}[]): Promise<FetchElementScope | null>;
|
|
2059
|
-
_contains(container: {
|
|
2060
|
-
$: CheerioAPI;
|
|
2061
|
-
el: CheerioNode;
|
|
2062
|
-
}, element: {
|
|
2063
|
-
$: CheerioAPI;
|
|
2064
|
-
el: CheerioNode;
|
|
2065
|
-
}): Promise<boolean>;
|
|
2066
|
-
_findCommonAncestor(scope1: {
|
|
2067
|
-
$: CheerioAPI;
|
|
2068
|
-
el: CheerioNode;
|
|
2069
|
-
}, scope2: {
|
|
2070
|
-
$: CheerioAPI;
|
|
2071
|
-
el: CheerioNode;
|
|
2072
|
-
}): Promise<FetchElementScope | null>;
|
|
2073
|
-
_findContainerChild(element: {
|
|
2074
|
-
$: CheerioAPI;
|
|
2075
|
-
el: CheerioNode;
|
|
2076
|
-
}, container: {
|
|
2077
|
-
$: CheerioAPI;
|
|
2078
|
-
el: CheerioNode;
|
|
2079
|
-
}): Promise<FetchElementScope | null>;
|
|
2080
|
-
_extractValue(schema: ExtractValueSchema, scope: {
|
|
2081
|
-
$: CheerioAPI;
|
|
2082
|
-
el: CheerioNode;
|
|
2083
|
-
}): Promise<any>;
|
|
2084
|
-
protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
|
|
2085
|
-
protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2086
|
-
protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
|
|
2087
|
-
url: string;
|
|
2088
|
-
method: string;
|
|
2089
|
-
body?: any;
|
|
2090
|
-
headers?: Record<string, string>;
|
|
2091
|
-
}): Promise<any>;
|
|
2092
|
-
protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
|
|
2093
|
-
protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
|
|
2094
|
-
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
|
|
2095
|
-
goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
|
|
2096
|
-
}
|
|
2097
|
-
|
|
2098
|
-
type Page = NonNullable<PlaywrightCrawlingContext['page']>;
|
|
2099
|
-
type Locator = ReturnType<Page['locator']>;
|
|
2100
|
-
declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
|
|
2101
|
-
static readonly id = "playwright";
|
|
2102
|
-
static readonly mode = "browser";
|
|
2103
|
-
protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
|
|
2104
|
-
_querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
|
|
2105
|
-
_nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2106
|
-
_parentElement(scope: Locator): Promise<FetchElementScope | null>;
|
|
2107
|
-
_isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
|
|
2108
|
-
_findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
|
|
2109
|
-
_contains(container: Locator, element: Locator): Promise<boolean>;
|
|
2110
|
-
_findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
|
|
2111
|
-
_findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
|
|
2112
|
-
_extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
|
|
2113
|
-
protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
|
|
2114
|
-
protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
|
|
2115
|
-
protected currentMousePos: {
|
|
2116
|
-
x: number;
|
|
2117
|
-
y: number;
|
|
2118
|
-
};
|
|
2119
|
-
protected _sharedRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
|
|
2120
|
-
protected mouseInitialized: boolean;
|
|
2121
|
-
protected _initializeMousePos(page: Page): Promise<void>;
|
|
2122
|
-
protected _getTrajectory(start: {
|
|
2123
|
-
x: number;
|
|
2124
|
-
y: number;
|
|
2125
|
-
}, end: {
|
|
2126
|
-
x: number;
|
|
2127
|
-
y: number;
|
|
2128
|
-
}, steps?: number): {
|
|
2129
|
-
x: number;
|
|
2130
|
-
y: number;
|
|
2131
|
-
}[];
|
|
2132
|
-
protected _moveToPos(context: PlaywrightCrawlingContext, target: {
|
|
2133
|
-
x: number;
|
|
2134
|
-
y: number;
|
|
2135
|
-
}, steps?: number): Promise<{
|
|
2136
|
-
x: number;
|
|
2137
|
-
y: number;
|
|
2138
|
-
}>;
|
|
2139
|
-
protected _ensureVisible(context: PlaywrightCrawlingContext, selector: string): Promise<{
|
|
2140
|
-
x: number;
|
|
2141
|
-
y: number;
|
|
2142
|
-
}>;
|
|
2143
|
-
protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
|
|
2144
|
-
x: number;
|
|
2145
|
-
y: number;
|
|
2146
|
-
}>;
|
|
2147
|
-
protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2148
|
-
protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
|
|
2149
|
-
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
|
|
2150
|
-
goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
|
|
2151
|
-
}
|
|
2152
|
-
|
|
2153
|
-
declare enum FetchActionResultStatus {
|
|
2095
|
+
mouseMove(params: {
|
|
2096
|
+
x?: number;
|
|
2097
|
+
y?: number;
|
|
2098
|
+
selector?: string;
|
|
2099
|
+
steps?: number;
|
|
2100
|
+
}): Promise<void>;
|
|
2154
2101
|
/**
|
|
2155
|
-
*
|
|
2102
|
+
* Clicks at current position or specified position.
|
|
2103
|
+
*
|
|
2104
|
+
* @param params - Click parameters (x, y, button, clickCount, delay)
|
|
2156
2105
|
*/
|
|
2157
|
-
|
|
2106
|
+
mouseClick(params: {
|
|
2107
|
+
x?: number;
|
|
2108
|
+
y?: number;
|
|
2109
|
+
button?: 'left' | 'right' | 'middle';
|
|
2110
|
+
clickCount?: number;
|
|
2111
|
+
delay?: number;
|
|
2112
|
+
}): Promise<void>;
|
|
2158
2113
|
/**
|
|
2159
|
-
*
|
|
2114
|
+
* Scrolls the mouse wheel.
|
|
2115
|
+
*
|
|
2116
|
+
* @param params - Wheel parameters (x, y, selector, deltaX, deltaY, steps)
|
|
2160
2117
|
*/
|
|
2161
|
-
|
|
2118
|
+
mouseWheel(params: {
|
|
2119
|
+
x?: number;
|
|
2120
|
+
y?: number;
|
|
2121
|
+
selector?: string;
|
|
2122
|
+
deltaX?: number;
|
|
2123
|
+
deltaY?: number;
|
|
2124
|
+
steps?: number;
|
|
2125
|
+
}): Promise<void>;
|
|
2162
2126
|
/**
|
|
2163
|
-
*
|
|
2164
|
-
*
|
|
2127
|
+
* Scrolls the element into view.
|
|
2128
|
+
*
|
|
2129
|
+
* @param params - Scroll parameters (selector)
|
|
2165
2130
|
*/
|
|
2166
|
-
|
|
2167
|
-
|
|
2168
|
-
|
|
2169
|
-
interface FetchActionMeta {
|
|
2170
|
-
id: string;
|
|
2171
|
-
index?: number;
|
|
2172
|
-
engineType?: FetchEngineType;
|
|
2173
|
-
capability?: FetchActionCapabilityMode;
|
|
2174
|
-
response?: FetchResponse;
|
|
2175
|
-
timings?: {
|
|
2176
|
-
start: number;
|
|
2177
|
-
total: number;
|
|
2178
|
-
};
|
|
2179
|
-
retries?: number;
|
|
2180
|
-
}
|
|
2181
|
-
interface FetchActionResult<R extends FetchReturnType = FetchReturnType> {
|
|
2182
|
-
status: FetchActionResultStatus;
|
|
2183
|
-
returnType?: R;
|
|
2184
|
-
result?: FetchReturnTypeFor<R>;
|
|
2185
|
-
error?: Error;
|
|
2186
|
-
meta?: FetchActionMeta;
|
|
2187
|
-
}
|
|
2188
|
-
interface BaseFetchActionProperties {
|
|
2189
|
-
id?: string;
|
|
2190
|
-
name?: string;
|
|
2191
|
-
action?: string | FetchAction;
|
|
2192
|
-
index?: number;
|
|
2193
|
-
params?: any;
|
|
2194
|
-
args?: any;
|
|
2195
|
-
storeAs?: string;
|
|
2196
|
-
failOnError?: boolean;
|
|
2197
|
-
failOnTimeout?: boolean;
|
|
2198
|
-
timeoutMs?: number;
|
|
2199
|
-
maxRetries?: number;
|
|
2200
|
-
[key: string]: any;
|
|
2201
|
-
}
|
|
2202
|
-
type BaseFetchActionOptions = RequireAtLeastOne<BaseFetchActionProperties, 'id' | 'name' | 'action'>;
|
|
2203
|
-
interface BaseFetchCollectorActionProperties extends BaseFetchActionProperties {
|
|
2204
|
-
activateOn?: string | RegExp | Array<string | RegExp>;
|
|
2205
|
-
deactivateOn?: string | RegExp | Array<string | RegExp>;
|
|
2206
|
-
collectOn?: string | RegExp | Array<string | RegExp>;
|
|
2207
|
-
background?: boolean;
|
|
2208
|
-
}
|
|
2209
|
-
type BaseFetchCollectorOptions = RequireAtLeastOne<BaseFetchCollectorActionProperties, 'id' | 'name' | 'action'>;
|
|
2210
|
-
interface FetchActionProperties extends BaseFetchActionProperties {
|
|
2211
|
-
collectors?: BaseFetchCollectorOptions[];
|
|
2212
|
-
}
|
|
2213
|
-
type FetchActionOptions = RequireAtLeastOne<FetchActionProperties, 'id' | 'name' | 'action'>;
|
|
2214
|
-
type FetchActionCapabilities = {
|
|
2215
|
-
[mode in FetchEngineType]?: FetchActionCapabilityMode;
|
|
2216
|
-
};
|
|
2217
|
-
declare abstract class FetchAction {
|
|
2218
|
-
private static registry;
|
|
2219
|
-
static register(actionClass: typeof FetchAction): void;
|
|
2220
|
-
static get(id: string): typeof FetchAction | undefined;
|
|
2221
|
-
static create(id: FetchActionOptions): FetchAction | undefined;
|
|
2222
|
-
static create(id: string): FetchAction | undefined;
|
|
2223
|
-
static has(name: string): boolean;
|
|
2224
|
-
static list(): string[];
|
|
2225
|
-
static id: string;
|
|
2226
|
-
static returnType: FetchReturnType;
|
|
2227
|
-
static capabilities: FetchActionCapabilities;
|
|
2228
|
-
static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2229
|
-
getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2230
|
-
get id(): string;
|
|
2231
|
-
get returnType(): FetchReturnType;
|
|
2232
|
-
get capabilities(): FetchActionCapabilities;
|
|
2233
|
-
protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2234
|
-
protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2235
|
-
abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
|
|
2236
|
-
protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
|
|
2237
|
-
protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
|
|
2131
|
+
scrollIntoView(params: {
|
|
2132
|
+
selector: string;
|
|
2133
|
+
}): Promise<void>;
|
|
2238
2134
|
/**
|
|
2239
|
-
*
|
|
2240
|
-
*
|
|
2135
|
+
* Types text into current focused element.
|
|
2136
|
+
*
|
|
2137
|
+
* @param text - Text to type
|
|
2138
|
+
* @param delay - Delay between key presses
|
|
2241
2139
|
*/
|
|
2242
|
-
|
|
2243
|
-
entry: FetchActionInContext;
|
|
2244
|
-
collectors: CollectorsRuntime | undefined;
|
|
2245
|
-
}>;
|
|
2140
|
+
keyboardType(text: string, delay?: number): Promise<void>;
|
|
2246
2141
|
/**
|
|
2247
|
-
*
|
|
2248
|
-
*
|
|
2142
|
+
* Presses specified key.
|
|
2143
|
+
*
|
|
2144
|
+
* @param key - Key to press
|
|
2145
|
+
* @param delay - Delay after key press
|
|
2249
2146
|
*/
|
|
2250
|
-
|
|
2251
|
-
entry: FetchActionInContext;
|
|
2252
|
-
collectors?: CollectorsRuntime;
|
|
2253
|
-
}): Promise<void>;
|
|
2254
|
-
execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
|
|
2255
|
-
}
|
|
2256
|
-
type CollectorsRuntime = {
|
|
2257
|
-
cleanup: () => void;
|
|
2258
|
-
awaitExecPendings: () => Promise<void>;
|
|
2259
|
-
};
|
|
2260
|
-
|
|
2261
|
-
type FetchEngineType = 'http' | 'browser';
|
|
2262
|
-
type BrowserEngine = 'playwright' | 'puppeteer';
|
|
2263
|
-
type FetchEngineMode = FetchEngineType | 'auto' | string;
|
|
2264
|
-
type ResourceType = 'image' | 'stylesheet' | 'font' | 'script' | 'media' | string;
|
|
2265
|
-
/**
|
|
2266
|
-
* Storage configuration options for the fetch engine.
|
|
2267
|
-
*
|
|
2268
|
-
* @remarks
|
|
2269
|
-
* Controls how Crawlee's internal storage (RequestQueue, KeyValueStore, SessionPool) is managed.
|
|
2270
|
-
*/
|
|
2271
|
-
interface StorageOptions {
|
|
2147
|
+
keyboardPress(key: string, delay?: number): Promise<void>;
|
|
2272
2148
|
/**
|
|
2273
|
-
*
|
|
2274
|
-
*
|
|
2275
|
-
*
|
|
2149
|
+
* Fills input element with specified value.
|
|
2150
|
+
*
|
|
2151
|
+
* @param selector - CSS selector of input element
|
|
2152
|
+
* @param value - Value to fill
|
|
2153
|
+
* @returns Promise resolving when fill operation completes
|
|
2154
|
+
* @throws {Error} When no active page context exists
|
|
2276
2155
|
*/
|
|
2277
|
-
|
|
2156
|
+
fill(selector: string, value: string): Promise<void>;
|
|
2278
2157
|
/**
|
|
2279
|
-
*
|
|
2280
|
-
*
|
|
2281
|
-
*
|
|
2158
|
+
* Submits a form.
|
|
2159
|
+
*
|
|
2160
|
+
* @param selector - Optional form/submit button selector
|
|
2161
|
+
* @param options - Submission options
|
|
2162
|
+
* @returns Promise resolving when form is submitted
|
|
2163
|
+
* @throws {Error} When no active page context exists
|
|
2282
2164
|
*/
|
|
2283
|
-
|
|
2165
|
+
submit(selector?: any, options?: SubmitActionOptions): Promise<void>;
|
|
2284
2166
|
/**
|
|
2285
|
-
*
|
|
2286
|
-
*
|
|
2167
|
+
* Removes elements from the DOM based on selectors and presets.
|
|
2168
|
+
*
|
|
2169
|
+
* @param options - Trim options specifying selectors and presets
|
|
2170
|
+
* @returns Promise resolving when trim operation completes
|
|
2171
|
+
* @throws {Error} When no active page context exists
|
|
2287
2172
|
*/
|
|
2288
|
-
|
|
2173
|
+
trim(options: TrimActionOptions): Promise<void>;
|
|
2289
2174
|
/**
|
|
2290
|
-
*
|
|
2291
|
-
*
|
|
2175
|
+
* Pauses execution, allowing for manual intervention or inspection.
|
|
2176
|
+
*
|
|
2177
|
+
* @param message - Optional message to display during pause
|
|
2178
|
+
* @returns Promise resolving when execution is resumed
|
|
2179
|
+
* @throws {Error} When no active page context exists
|
|
2292
2180
|
*/
|
|
2293
|
-
|
|
2294
|
-
}
|
|
2295
|
-
interface BaseFetcherProperties {
|
|
2181
|
+
pause(message?: string): Promise<void>;
|
|
2296
2182
|
/**
|
|
2297
|
-
*
|
|
2183
|
+
* Executes a custom function or expression within the current page context.
|
|
2298
2184
|
*
|
|
2299
|
-
*
|
|
2300
|
-
*
|
|
2301
|
-
*
|
|
2185
|
+
* @remarks
|
|
2186
|
+
* This is a powerful action that allows running custom logic to interact with the DOM,
|
|
2187
|
+
* calculate values, or trigger navigations.
|
|
2188
|
+
*
|
|
2189
|
+
* - In **Browser Mode**, it runs in the real browser.
|
|
2190
|
+
* - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
|
|
2191
|
+
*
|
|
2192
|
+
* The action handles automatic navigation if `window.location` is modified.
|
|
2193
|
+
*
|
|
2194
|
+
* @param params - Configuration for the execution, including the function and arguments.
|
|
2195
|
+
* @returns A promise resolving to the result of the execution.
|
|
2196
|
+
* @throws {Error} If no active page context exists or if execution fails.
|
|
2197
|
+
*
|
|
2198
|
+
* @see {@link EvaluateActionOptions} for detailed parameter options and examples.
|
|
2302
2199
|
*/
|
|
2303
|
-
|
|
2304
|
-
enableSmart?: boolean;
|
|
2305
|
-
useSiteRegistry?: boolean;
|
|
2306
|
-
antibot?: boolean;
|
|
2307
|
-
debug?: boolean | string | string[];
|
|
2308
|
-
headers?: Record<string, string>;
|
|
2309
|
-
cookies?: Cookie[];
|
|
2310
|
-
sessionState?: any;
|
|
2311
|
-
sessionPoolOptions?: SessionPoolOptions;
|
|
2312
|
-
overrideSessionState?: boolean;
|
|
2313
|
-
throwHttpErrors?: boolean;
|
|
2314
|
-
output?: {
|
|
2315
|
-
cookies?: boolean;
|
|
2316
|
-
sessionState?: boolean;
|
|
2317
|
-
};
|
|
2318
|
-
proxy?: string | string[];
|
|
2319
|
-
blockResources?: ResourceType[];
|
|
2200
|
+
evaluate(params: EvaluateActionOptions): Promise<any>;
|
|
2320
2201
|
/**
|
|
2321
|
-
*
|
|
2202
|
+
* Extracts structured data from the current page content.
|
|
2203
|
+
*
|
|
2204
|
+
* @param schema - An object defining the data to extract.
|
|
2205
|
+
* @returns A promise that resolves to an object with the extracted data.
|
|
2322
2206
|
*/
|
|
2323
|
-
|
|
2324
|
-
ignoreSslErrors?: boolean;
|
|
2325
|
-
browser?: {
|
|
2326
|
-
/**
|
|
2327
|
-
* 浏览器引擎,默认为 playwright
|
|
2328
|
-
*
|
|
2329
|
-
* - `playwright`: 使用 Playwright 引擎
|
|
2330
|
-
* - `puppeteer`: 使用 Puppeteer 引擎
|
|
2331
|
-
*/
|
|
2332
|
-
engine?: BrowserEngine;
|
|
2333
|
-
headless?: boolean;
|
|
2334
|
-
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
2335
|
-
launchOptions?: Record<string, any>;
|
|
2336
|
-
};
|
|
2337
|
-
http?: {
|
|
2338
|
-
method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
|
|
2339
|
-
body?: any;
|
|
2340
|
-
};
|
|
2341
|
-
timeoutMs?: number;
|
|
2342
|
-
requestHandlerTimeoutSecs?: number;
|
|
2343
|
-
maxConcurrency?: number;
|
|
2344
|
-
maxRequestsPerMinute?: number;
|
|
2345
|
-
delayBetweenRequestsMs?: number;
|
|
2346
|
-
retries?: number;
|
|
2347
|
-
sites?: FetchSite[];
|
|
2348
|
-
url?: string;
|
|
2349
|
-
}
|
|
2350
|
-
interface FetchSite extends BaseFetcherProperties {
|
|
2351
|
-
domain: string;
|
|
2352
|
-
pathScope?: string[];
|
|
2353
|
-
meta?: {
|
|
2354
|
-
updatedAt?: number;
|
|
2355
|
-
ttlMs?: number;
|
|
2356
|
-
source?: 'manual' | 'smart';
|
|
2357
|
-
};
|
|
2358
|
-
}
|
|
2359
|
-
type OnFetchPauseCallback = (options: {
|
|
2360
|
-
message?: string;
|
|
2361
|
-
}) => Promise<void>;
|
|
2362
|
-
interface FetcherOptions extends BaseFetcherProperties {
|
|
2363
|
-
actions?: FetchActionOptions[];
|
|
2364
|
-
onPause?: OnFetchPauseCallback;
|
|
2365
|
-
}
|
|
2366
|
-
interface FetchMetadata {
|
|
2367
|
-
mode: FetchEngineType;
|
|
2368
|
-
engine?: BrowserEngine;
|
|
2369
|
-
timings?: {
|
|
2370
|
-
start: number;
|
|
2371
|
-
total: number;
|
|
2372
|
-
ttfb?: number;
|
|
2373
|
-
dns?: number;
|
|
2374
|
-
tcp?: number;
|
|
2375
|
-
firstByte?: number;
|
|
2376
|
-
download?: number;
|
|
2377
|
-
};
|
|
2378
|
-
proxy?: string;
|
|
2379
|
-
[key: string]: any;
|
|
2380
|
-
}
|
|
2381
|
-
interface FetchResponse {
|
|
2382
|
-
url: string;
|
|
2383
|
-
finalUrl: string;
|
|
2384
|
-
statusCode?: number;
|
|
2385
|
-
statusText?: string;
|
|
2386
|
-
headers: Record<string, string>;
|
|
2387
|
-
contentType?: string;
|
|
2388
|
-
body?: string | Buffer<ArrayBufferLike>;
|
|
2389
|
-
html?: string;
|
|
2390
|
-
text?: string;
|
|
2391
|
-
json?: any;
|
|
2392
|
-
cookies?: Cookie[];
|
|
2393
|
-
sessionState?: any;
|
|
2394
|
-
metadata?: FetchMetadata;
|
|
2395
|
-
}
|
|
2396
|
-
declare const DefaultFetcherProperties: BaseFetcherProperties;
|
|
2397
|
-
declare const FetcherOptionKeys: string[];
|
|
2398
|
-
|
|
2399
|
-
/**
|
|
2400
|
-
* Represents a stateful web fetching session.
|
|
2401
|
-
*
|
|
2402
|
-
* @remarks
|
|
2403
|
-
* A `FetchSession` manages the lifecycle of a single crawling operation, including engine initialization,
|
|
2404
|
-
* cookie persistence, and sequential action execution. It maintains a `FetchContext` that stores
|
|
2405
|
-
* session-level configurations and outputs.
|
|
2406
|
-
*
|
|
2407
|
-
* Sessions are isolated; each has its own unique ID and (by default) its own storage and cookies.
|
|
2408
|
-
*/
|
|
2409
|
-
declare class FetchSession {
|
|
2410
|
-
protected options: FetcherOptions;
|
|
2207
|
+
extract<T>(schema: ExtractSchema): Promise<T>;
|
|
2411
2208
|
/**
|
|
2412
|
-
*
|
|
2209
|
+
* Gets the unique identifier of this engine implementation.
|
|
2413
2210
|
*/
|
|
2414
|
-
|
|
2211
|
+
get id(): string;
|
|
2415
2212
|
/**
|
|
2416
|
-
*
|
|
2213
|
+
* Returns the current state of the engine (cookies)
|
|
2214
|
+
* that can be used to restore the session later.
|
|
2417
2215
|
*/
|
|
2418
|
-
|
|
2419
|
-
|
|
2216
|
+
getState(): Promise<{
|
|
2217
|
+
cookies: Cookie[];
|
|
2218
|
+
sessionState?: any;
|
|
2219
|
+
}>;
|
|
2420
2220
|
/**
|
|
2421
|
-
*
|
|
2221
|
+
* Gets the execution mode of this engine (`'http'` or `'browser'`).
|
|
2222
|
+
*/
|
|
2223
|
+
get mode(): FetchEngineType;
|
|
2224
|
+
/**
|
|
2225
|
+
* Gets the fetch engine context associated with this instance.
|
|
2226
|
+
*/
|
|
2227
|
+
get context(): FetchEngineContext | undefined;
|
|
2228
|
+
/**
|
|
2229
|
+
* Initializes the fetch engine with provided context and options.
|
|
2422
2230
|
*
|
|
2423
|
-
* @param
|
|
2231
|
+
* @param context - Fetch engine context
|
|
2232
|
+
* @param options - Configuration options
|
|
2233
|
+
* @returns Promise resolving when initialization completes
|
|
2234
|
+
*
|
|
2235
|
+
* @remarks
|
|
2236
|
+
* Sets up internal state and calls implementation-specific [_initialize](file:///home/riceball/Documents/mywork/public/@isdk/ai-tools/packages/web-fetcher/src/engine/cheerio.ts#L169-L204) method.
|
|
2237
|
+
* Automatically called when creating engine via `FetchEngine.create()`.
|
|
2424
2238
|
*/
|
|
2425
|
-
|
|
2426
|
-
|
|
2239
|
+
initialize(context: FetchEngineContext, options?: BaseFetcherProperties): Promise<void>;
|
|
2240
|
+
cleanup(): Promise<void>;
|
|
2427
2241
|
/**
|
|
2428
|
-
*
|
|
2242
|
+
* Gets the initial scope for extraction for the specific engine.
|
|
2243
|
+
* @param context - Crawlee crawling context
|
|
2244
|
+
* @internal
|
|
2245
|
+
*/
|
|
2246
|
+
protected abstract _getInitialElementScope(context: TContext): FetchElementScope;
|
|
2247
|
+
/**
|
|
2248
|
+
* Unified action processor that handles engine-agnostic actions.
|
|
2249
|
+
* @param context - Crawlee crawling context
|
|
2250
|
+
* @param action - Action to execute
|
|
2251
|
+
* @internal
|
|
2252
|
+
*/
|
|
2253
|
+
protected _processAction(context: TContext, action: FetchEngineAction): Promise<any>;
|
|
2254
|
+
protected _handlePause(action: {
|
|
2255
|
+
message?: string;
|
|
2256
|
+
}): Promise<void>;
|
|
2257
|
+
/**
|
|
2258
|
+
* Executes all pending fetch engine actions within the current Crawlee request handler context.
|
|
2429
2259
|
*
|
|
2430
|
-
*
|
|
2431
|
-
*
|
|
2432
|
-
*
|
|
2433
|
-
*
|
|
2260
|
+
* **Critical Execution Constraint**: This method **MUST** be awaited within the synchronous execution path
|
|
2261
|
+
* of Crawlee's [requestHandler](https://crawlee.dev/js/api/basic-crawler) (before any `await` that yields control back to the event loop).
|
|
2262
|
+
*
|
|
2263
|
+
* ### Why This Constraint Exists
|
|
2264
|
+
* - Crawlee's page context ([PlaywrightCrawler](https://crawlee.dev/js/api/playwright-crawler)'s `page` or [CheerioCrawler](https://crawlee.dev/js/api/cheerio-crawler)'s `$`)
|
|
2265
|
+
* is **only valid during the synchronous execution phase** of the request handler
|
|
2266
|
+
* - After any `await` (e.g., `await page.goto()`), the page context may be destroyed
|
|
2267
|
+
* due to Crawlee's internal resource management
|
|
2268
|
+
*
|
|
2269
|
+
* ### How It Works
|
|
2270
|
+
* 1. Processes all actions queued via {@link dispatchAction} (click, fill, submit, etc.)
|
|
2271
|
+
* 2. Maintains the page context validity window via {@link isPageActive} lifecycle flag
|
|
2272
|
+
* 3. Automatically cleans up event listeners upon completion
|
|
2273
|
+
*
|
|
2274
|
+
* For usage, see {@link _sharedRequestHandler}.
|
|
2275
|
+
* @see {@link _sharedRequestHandler}
|
|
2276
|
+
* @param context The active Crawlee crawling context containing the page/$ object
|
|
2277
|
+
* @throws {Error} If called outside valid page context window (`!this.isPageActive`)
|
|
2278
|
+
* @internal Engine infrastructure method - not for direct consumer use
|
|
2279
|
+
*/
|
|
2280
|
+
protected _executePendingActions(context: TContext): Promise<void>;
|
|
2281
|
+
protected _sharedRequestHandler(context: TContext): Promise<void>;
|
|
2282
|
+
protected _sharedFailedRequestHandler(context: TContext & {
|
|
2283
|
+
response?: FetchResponse;
|
|
2284
|
+
body?: string | Buffer;
|
|
2285
|
+
}, error?: Error): Promise<void>;
|
|
2286
|
+
protected dispatchAction<T>(action: FetchEngineAction): Promise<T>;
|
|
2287
|
+
private _requestHandler;
|
|
2288
|
+
private _failedRequestHandler;
|
|
2289
|
+
protected _commonCleanup(): Promise<void>;
|
|
2290
|
+
/**
|
|
2291
|
+
* Blocks specified resource types from loading.
|
|
2292
|
+
*
|
|
2293
|
+
* @param types - Resource types to block
|
|
2294
|
+
* @param overwrite - Whether to replace existing blocked types
|
|
2295
|
+
* @returns Number of blocked resource types
|
|
2434
2296
|
*
|
|
2435
2297
|
* @example
|
|
2436
2298
|
* ```ts
|
|
2437
|
-
* await
|
|
2299
|
+
* await engine.blockResources(['image', 'stylesheet']);
|
|
2300
|
+
* await engine.blockResources(['script'], true); // Replace existing
|
|
2438
2301
|
* ```
|
|
2439
2302
|
*/
|
|
2440
|
-
|
|
2303
|
+
blockResources(types: ResourceType[], overwrite?: boolean): Promise<number>;
|
|
2441
2304
|
/**
|
|
2442
|
-
*
|
|
2305
|
+
* Gets the content of the current page.
|
|
2443
2306
|
*
|
|
2444
|
-
* @
|
|
2445
|
-
* @
|
|
2446
|
-
|
|
2447
|
-
|
|
2307
|
+
* @returns Promise resolving to fetch response
|
|
2308
|
+
* @throws {Error} When no content has been fetched yet
|
|
2309
|
+
*/
|
|
2310
|
+
getContent(): Promise<FetchResponse>;
|
|
2311
|
+
/**
|
|
2312
|
+
* Manages HTTP headers for requests with multiple overloads.
|
|
2313
|
+
*
|
|
2314
|
+
* @overload
|
|
2315
|
+
* Gets all headers.
|
|
2316
|
+
* @returns All headers as record
|
|
2317
|
+
*
|
|
2318
|
+
* @overload
|
|
2319
|
+
* Gets specific header value.
|
|
2320
|
+
* @param name - Header name
|
|
2321
|
+
* @returns Header value
|
|
2322
|
+
*
|
|
2323
|
+
* @overload
|
|
2324
|
+
* Sets multiple headers.
|
|
2325
|
+
* @param headers - Headers to set
|
|
2326
|
+
* @param replaced - Whether to replace all existing headers
|
|
2327
|
+
* @returns `true` if successful
|
|
2328
|
+
*
|
|
2329
|
+
* @overload
|
|
2330
|
+
* Sets single header.
|
|
2331
|
+
* @param name - Header name
|
|
2332
|
+
* @param value - Header value or `null` to remove
|
|
2333
|
+
* @returns `true` if successful
|
|
2448
2334
|
*
|
|
2449
2335
|
* @example
|
|
2450
2336
|
* ```ts
|
|
2451
|
-
* const
|
|
2452
|
-
*
|
|
2453
|
-
*
|
|
2454
|
-
*
|
|
2337
|
+
* const allHeaders = await engine.headers();
|
|
2338
|
+
* const userAgent = await engine.headers('user-agent');
|
|
2339
|
+
* await engine.headers({ 'x-custom': 'value' });
|
|
2340
|
+
* await engine.headers('auth', 'token');
|
|
2455
2341
|
* ```
|
|
2456
2342
|
*/
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
|
|
2460
|
-
|
|
2461
|
-
outputs: Record<string, any>;
|
|
2462
|
-
}>;
|
|
2343
|
+
headers(): Promise<Record<string, string>>;
|
|
2344
|
+
headers(name: string): Promise<string>;
|
|
2345
|
+
headers(headers: Record<string, string>, replaced?: boolean): Promise<boolean>;
|
|
2346
|
+
headers(name: string, value: string | null): Promise<boolean>;
|
|
2463
2347
|
/**
|
|
2464
|
-
*
|
|
2348
|
+
* Manages cookies for current session with multiple overloads.
|
|
2465
2349
|
*
|
|
2466
|
-
* @
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
/**
|
|
2470
|
-
* Gets the current state of the session, including cookies and engine-specific state.
|
|
2350
|
+
* @overload
|
|
2351
|
+
* Gets all cookies.
|
|
2352
|
+
* @returns Array of cookies
|
|
2471
2353
|
*
|
|
2472
|
-
* @
|
|
2354
|
+
* @overload
|
|
2355
|
+
* Sets cookies for session.
|
|
2356
|
+
* @param cookies - Cookies to set
|
|
2357
|
+
* @returns `true` if successful
|
|
2358
|
+
*
|
|
2359
|
+
* @example
|
|
2360
|
+
* ```ts
|
|
2361
|
+
* const cookies = await engine.cookies();
|
|
2362
|
+
* await engine.cookies([{ name: 'session', value: '123' }]);
|
|
2363
|
+
* ```
|
|
2473
2364
|
*/
|
|
2474
|
-
|
|
2475
|
-
|
|
2476
|
-
sessionState?: any;
|
|
2477
|
-
} | undefined>;
|
|
2365
|
+
cookies(): Promise<Cookie[]>;
|
|
2366
|
+
cookies(cookies: Cookie[]): Promise<boolean>;
|
|
2478
2367
|
/**
|
|
2479
|
-
* Disposes of
|
|
2368
|
+
* Disposes of the engine, cleaning up all resources.
|
|
2480
2369
|
*
|
|
2481
|
-
* @
|
|
2482
|
-
* This method should be called when the session is no longer needed to free up resources
|
|
2483
|
-
* (e.g., closing browser instances, purging temporary storage).
|
|
2370
|
+
* @returns Promise resolving when disposal completes
|
|
2484
2371
|
*/
|
|
2485
2372
|
dispose(): Promise<void>;
|
|
2486
|
-
private ensureEngine;
|
|
2487
|
-
protected createContext(options?: FetcherOptions): FetchContext;
|
|
2488
2373
|
}
|
|
2374
|
+
declare function getRandomDelay(base: number, variance?: number): number;
|
|
2489
2375
|
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
|
|
2503
|
-
|
|
2504
|
-
|
|
2505
|
-
|
|
2506
|
-
|
|
2507
|
-
|
|
2508
|
-
|
|
2509
|
-
|
|
2510
|
-
|
|
2376
|
+
type CheerioAPI = NonNullable<CheerioCrawlingContext['$']>;
|
|
2377
|
+
type CheerioSelection = ReturnType<CheerioAPI>;
|
|
2378
|
+
type CheerioNode = ReturnType<CheerioSelection['first']>;
|
|
2379
|
+
declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, CheerioCrawler, CheerioCrawlerOptions> {
|
|
2380
|
+
static readonly id = "cheerio";
|
|
2381
|
+
static readonly mode = "http";
|
|
2382
|
+
private _ensureCheerioContext;
|
|
2383
|
+
protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
|
|
2384
|
+
_querySelectorAll(scope: {
|
|
2385
|
+
$: CheerioAPI;
|
|
2386
|
+
el: any;
|
|
2387
|
+
} | any[], selector: string): Promise<FetchElementScope[]>;
|
|
2388
|
+
_nextSiblingsUntil(scope: {
|
|
2389
|
+
$: CheerioAPI;
|
|
2390
|
+
el: CheerioNode;
|
|
2391
|
+
}, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2392
|
+
_parentElement(scope: {
|
|
2393
|
+
$: CheerioAPI;
|
|
2394
|
+
el: CheerioNode;
|
|
2395
|
+
}): Promise<FetchElementScope | null>;
|
|
2396
|
+
_isSameElement(scope1: {
|
|
2397
|
+
el: CheerioNode;
|
|
2398
|
+
}, scope2: {
|
|
2399
|
+
el: CheerioNode;
|
|
2400
|
+
}): Promise<boolean>;
|
|
2401
|
+
_findClosestAncestor(scope: {
|
|
2402
|
+
$: CheerioAPI;
|
|
2403
|
+
el: CheerioNode;
|
|
2404
|
+
}, candidates: {
|
|
2405
|
+
$: CheerioAPI;
|
|
2406
|
+
el: CheerioNode;
|
|
2407
|
+
}[]): Promise<FetchElementScope | null>;
|
|
2408
|
+
_contains(container: {
|
|
2409
|
+
$: CheerioAPI;
|
|
2410
|
+
el: CheerioNode;
|
|
2411
|
+
}, element: {
|
|
2412
|
+
$: CheerioAPI;
|
|
2413
|
+
el: CheerioNode;
|
|
2414
|
+
}): Promise<boolean>;
|
|
2415
|
+
_findCommonAncestor(scope1: {
|
|
2416
|
+
$: CheerioAPI;
|
|
2417
|
+
el: CheerioNode;
|
|
2418
|
+
}, scope2: {
|
|
2419
|
+
$: CheerioAPI;
|
|
2420
|
+
el: CheerioNode;
|
|
2421
|
+
}): Promise<FetchElementScope | null>;
|
|
2422
|
+
_findContainerChild(element: {
|
|
2423
|
+
$: CheerioAPI;
|
|
2424
|
+
el: CheerioNode;
|
|
2425
|
+
}, container: {
|
|
2426
|
+
$: CheerioAPI;
|
|
2427
|
+
el: CheerioNode;
|
|
2428
|
+
}): Promise<FetchElementScope | null>;
|
|
2429
|
+
_extractValue(schema: ExtractValueSchema, scope: {
|
|
2430
|
+
$: CheerioAPI;
|
|
2431
|
+
el: CheerioNode;
|
|
2432
|
+
}): Promise<any>;
|
|
2433
|
+
protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
|
|
2434
|
+
protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2435
|
+
protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
|
|
2436
|
+
url: string;
|
|
2437
|
+
method: string;
|
|
2438
|
+
body?: any;
|
|
2439
|
+
headers?: Record<string, string>;
|
|
2440
|
+
}): Promise<any>;
|
|
2441
|
+
protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
|
|
2442
|
+
protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
|
|
2443
|
+
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
|
|
2444
|
+
goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
|
|
2445
|
+
}
|
|
2446
|
+
|
|
2447
|
+
type Page = NonNullable<PlaywrightCrawlingContext['page']>;
|
|
2448
|
+
type Locator = ReturnType<Page['locator']>;
|
|
2449
|
+
declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContext, PlaywrightCrawler, PlaywrightCrawlerOptions> {
|
|
2450
|
+
static readonly id = "playwright";
|
|
2451
|
+
static readonly mode = "browser";
|
|
2452
|
+
protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
|
|
2453
|
+
_querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
|
|
2454
|
+
_nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2455
|
+
_parentElement(scope: Locator): Promise<FetchElementScope | null>;
|
|
2456
|
+
_isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
|
|
2457
|
+
_findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
|
|
2458
|
+
_contains(container: Locator, element: Locator): Promise<boolean>;
|
|
2459
|
+
_findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
|
|
2460
|
+
_findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
|
|
2461
|
+
_extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
|
|
2462
|
+
protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
|
|
2463
|
+
protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
|
|
2464
|
+
protected currentMousePos: {
|
|
2465
|
+
x: number;
|
|
2466
|
+
y: number;
|
|
2467
|
+
};
|
|
2468
|
+
protected _sharedRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
|
|
2469
|
+
protected mouseInitialized: boolean;
|
|
2470
|
+
protected _initializeMousePos(page: Page): Promise<void>;
|
|
2471
|
+
protected _getTrajectory(start: {
|
|
2472
|
+
x: number;
|
|
2473
|
+
y: number;
|
|
2474
|
+
}, end: {
|
|
2475
|
+
x: number;
|
|
2476
|
+
y: number;
|
|
2477
|
+
}, steps?: number): {
|
|
2478
|
+
x: number;
|
|
2479
|
+
y: number;
|
|
2480
|
+
}[];
|
|
2481
|
+
protected _moveToPos(context: PlaywrightCrawlingContext, target: {
|
|
2482
|
+
x: number;
|
|
2483
|
+
y: number;
|
|
2484
|
+
}, steps?: number): Promise<{
|
|
2485
|
+
x: number;
|
|
2486
|
+
y: number;
|
|
2487
|
+
}>;
|
|
2488
|
+
protected _ensureVisible(context: PlaywrightCrawlingContext, selector: string): Promise<{
|
|
2489
|
+
x: number;
|
|
2490
|
+
y: number;
|
|
2491
|
+
}>;
|
|
2492
|
+
protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
|
|
2493
|
+
x: number;
|
|
2494
|
+
y: number;
|
|
2495
|
+
}>;
|
|
2496
|
+
protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
2497
|
+
protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
|
|
2498
|
+
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
|
|
2499
|
+
goto(url: string, opts?: GotoActionOptions): Promise<FetchResponse>;
|
|
2500
|
+
}
|
|
2501
|
+
|
|
2502
|
+
type FetchActionCapabilities = {
|
|
2503
|
+
[mode in FetchEngineType]?: FetchActionCapabilityMode;
|
|
2504
|
+
};
|
|
2505
|
+
declare abstract class FetchAction {
|
|
2506
|
+
private static registry;
|
|
2507
|
+
static register(actionClass: any): void;
|
|
2508
|
+
static get(id: string): any | undefined;
|
|
2509
|
+
static create(id: FetchActionOptions): FetchAction | undefined;
|
|
2510
|
+
static create(id: string): FetchAction | undefined;
|
|
2511
|
+
static has(name: string): boolean;
|
|
2512
|
+
static list(): string[];
|
|
2513
|
+
static id: string;
|
|
2514
|
+
static returnType: FetchReturnType;
|
|
2515
|
+
static capabilities: FetchActionCapabilities;
|
|
2516
|
+
static getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2517
|
+
getCapability(mode?: FetchEngineType): FetchActionCapabilityMode;
|
|
2518
|
+
get id(): string;
|
|
2519
|
+
get returnType(): FetchReturnType;
|
|
2520
|
+
get capabilities(): FetchActionCapabilities;
|
|
2521
|
+
protected onBeforeExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2522
|
+
protected onAfterExec?(context: FetchContext, options?: FetchActionProperties): Promise<void> | void;
|
|
2523
|
+
abstract onExecute(context: FetchContext, options?: FetchActionProperties, eventPayload?: any): Promise<any> | any;
|
|
2524
|
+
protected delegateToEngine(context: FetchContext, method: keyof FetchEngine, ...args: any[]): Promise<any>;
|
|
2525
|
+
protected installCollectors(context: FetchContext, options?: FetchActionProperties): CollectorsRuntime | undefined;
|
|
2511
2526
|
/**
|
|
2512
|
-
*
|
|
2513
|
-
*
|
|
2514
|
-
* @param options - Configuration options for the session, merged with defaults.
|
|
2515
|
-
* @returns A promise resolving to a new FetchSession instance.
|
|
2527
|
+
* Start of the action lifecycle.
|
|
2528
|
+
* Responsible for: initializing the stack, setting currentAction, emitting events, and calling hooks.
|
|
2516
2529
|
*/
|
|
2517
|
-
|
|
2530
|
+
beforeExec(context: FetchContext, options?: FetchActionProperties): Promise<{
|
|
2531
|
+
entry: Required<Pick<FetchActionProperties, "action">> & Partial<Pick<FetchActionProperties, "id" | "name">> & {
|
|
2532
|
+
[x: string]: any;
|
|
2533
|
+
collectors?: BaseFetchCollectorOptions[] | undefined;
|
|
2534
|
+
index?: number | undefined;
|
|
2535
|
+
params?: any;
|
|
2536
|
+
args?: any;
|
|
2537
|
+
storeAs?: string | undefined;
|
|
2538
|
+
failOnError?: boolean | undefined;
|
|
2539
|
+
failOnTimeout?: boolean | undefined;
|
|
2540
|
+
timeoutMs?: number | undefined;
|
|
2541
|
+
maxRetries?: number | undefined;
|
|
2542
|
+
} & {
|
|
2543
|
+
index?: number;
|
|
2544
|
+
error?: Error;
|
|
2545
|
+
depth?: number;
|
|
2546
|
+
};
|
|
2547
|
+
collectors: CollectorsRuntime | undefined;
|
|
2548
|
+
}>;
|
|
2518
2549
|
/**
|
|
2519
|
-
*
|
|
2520
|
-
*
|
|
2521
|
-
* @remarks
|
|
2522
|
-
* This method automatically creates a session, executes the specified actions,
|
|
2523
|
-
* retrieves the content, and disposes of the session.
|
|
2524
|
-
*
|
|
2525
|
-
* @param url - The target URL or a complete FetcherOptions object.
|
|
2526
|
-
* @param options - Additional options when the first parameter is a URL string.
|
|
2527
|
-
* @returns A promise resolving to the final response and any extracted outputs.
|
|
2550
|
+
* End of the action lifecycle.
|
|
2551
|
+
* Responsible for: calling hooks, assigning lastResult, emitting events, cleaning up the stack, and restoring currentAction.
|
|
2528
2552
|
*/
|
|
2529
|
-
|
|
2530
|
-
|
|
2531
|
-
|
|
2532
|
-
}>;
|
|
2533
|
-
|
|
2534
|
-
result: FetchResponse | undefined;
|
|
2535
|
-
outputs: Record<string, any>;
|
|
2536
|
-
}>;
|
|
2553
|
+
afterExec(context: FetchContext, options?: BaseFetchCollectorActionProperties, result?: FetchActionResult, scope?: {
|
|
2554
|
+
entry: FetchActionInContext;
|
|
2555
|
+
collectors?: CollectorsRuntime;
|
|
2556
|
+
}): Promise<void>;
|
|
2557
|
+
execute<R extends FetchReturnType = 'any'>(context: FetchContext, options?: FetchActionProperties): Promise<FetchActionResult<R>>;
|
|
2537
2558
|
}
|
|
2559
|
+
type CollectorsRuntime = {
|
|
2560
|
+
cleanup: () => void;
|
|
2561
|
+
awaitExecPendings: () => Promise<void>;
|
|
2562
|
+
};
|
|
2538
2563
|
|
|
2539
2564
|
declare class ClickAction extends FetchAction {
|
|
2540
2565
|
static id: string;
|
|
@@ -2779,4 +2804,4 @@ declare function fetchWeb(url: string, options?: FetcherOptions): Promise<{
|
|
|
2779
2804
|
outputs: Record<string, any>;
|
|
2780
2805
|
}>;
|
|
2781
2806
|
|
|
2782
|
-
export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, MouseWheelAction, type MouseWheelParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, ScrollIntoViewAction, type ScrollIntoViewParams, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb, getRandomDelay };
|
|
2807
|
+
export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EngineUpgradeError, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionMeta, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, MouseWheelAction, type MouseWheelParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, ScrollIntoViewAction, type ScrollIntoViewParams, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb, getRandomDelay };
|