@isdk/web-fetcher 0.2.12 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.action.cn.md +197 -155
- package/README.action.extract.cn.md +263 -0
- package/README.action.extract.md +263 -0
- package/README.action.md +202 -147
- package/README.cn.md +25 -15
- package/README.engine.cn.md +118 -14
- package/README.engine.md +115 -14
- package/README.md +19 -10
- package/dist/index.d.mts +667 -50
- package/dist/index.d.ts +667 -50
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/docs/README.md +19 -10
- package/docs/_media/README.action.md +202 -147
- package/docs/_media/README.cn.md +25 -15
- package/docs/_media/README.engine.md +115 -14
- package/docs/classes/CheerioFetchEngine.md +805 -135
- package/docs/classes/ClickAction.md +33 -33
- package/docs/classes/EvaluateAction.md +559 -0
- package/docs/classes/ExtractAction.md +33 -33
- package/docs/classes/FetchAction.md +39 -33
- package/docs/classes/FetchEngine.md +660 -122
- package/docs/classes/FetchSession.md +38 -16
- package/docs/classes/FillAction.md +33 -33
- package/docs/classes/GetContentAction.md +33 -33
- package/docs/classes/GotoAction.md +33 -33
- package/docs/classes/KeyboardPressAction.md +533 -0
- package/docs/classes/KeyboardTypeAction.md +533 -0
- package/docs/classes/MouseClickAction.md +533 -0
- package/docs/classes/MouseMoveAction.md +533 -0
- package/docs/classes/PauseAction.md +33 -33
- package/docs/classes/PlaywrightFetchEngine.md +820 -122
- package/docs/classes/SubmitAction.md +33 -33
- package/docs/classes/TrimAction.md +533 -0
- package/docs/classes/WaitForAction.md +33 -33
- package/docs/classes/WebFetcher.md +9 -9
- package/docs/enumerations/FetchActionResultStatus.md +4 -4
- package/docs/functions/fetchWeb.md +6 -6
- package/docs/globals.md +14 -0
- package/docs/interfaces/BaseFetchActionProperties.md +12 -12
- package/docs/interfaces/BaseFetchCollectorActionProperties.md +16 -16
- package/docs/interfaces/BaseFetcherProperties.md +32 -28
- package/docs/interfaces/Cookie.md +14 -14
- package/docs/interfaces/DispatchedEngineAction.md +4 -4
- package/docs/interfaces/EvaluateActionOptions.md +81 -0
- package/docs/interfaces/ExtractActionProperties.md +12 -12
- package/docs/interfaces/FetchActionInContext.md +15 -15
- package/docs/interfaces/FetchActionProperties.md +13 -13
- package/docs/interfaces/FetchActionResult.md +6 -6
- package/docs/interfaces/FetchContext.md +42 -38
- package/docs/interfaces/FetchEngineContext.md +37 -33
- package/docs/interfaces/FetchMetadata.md +5 -5
- package/docs/interfaces/FetchResponse.md +14 -14
- package/docs/interfaces/FetchReturnTypeRegistry.md +8 -8
- package/docs/interfaces/FetchSite.md +35 -31
- package/docs/interfaces/FetcherOptions.md +34 -30
- package/docs/interfaces/GotoActionOptions.md +14 -6
- package/docs/interfaces/KeyboardPressParams.md +25 -0
- package/docs/interfaces/KeyboardTypeParams.md +25 -0
- package/docs/interfaces/MouseClickParams.md +49 -0
- package/docs/interfaces/MouseMoveParams.md +41 -0
- package/docs/interfaces/PendingEngineRequest.md +3 -3
- package/docs/interfaces/StorageOptions.md +5 -5
- package/docs/interfaces/SubmitActionOptions.md +2 -2
- package/docs/interfaces/TrimActionOptions.md +27 -0
- package/docs/interfaces/WaitForActionOptions.md +5 -5
- package/docs/type-aliases/BaseFetchActionOptions.md +1 -1
- package/docs/type-aliases/BaseFetchCollectorOptions.md +1 -1
- package/docs/type-aliases/BrowserEngine.md +1 -1
- package/docs/type-aliases/FetchActionCapabilities.md +1 -1
- package/docs/type-aliases/FetchActionCapabilityMode.md +1 -1
- package/docs/type-aliases/FetchActionOptions.md +1 -1
- package/docs/type-aliases/FetchEngineAction.md +2 -2
- package/docs/type-aliases/FetchEngineType.md +1 -1
- package/docs/type-aliases/FetchReturnType.md +1 -1
- package/docs/type-aliases/FetchReturnTypeFor.md +1 -1
- package/docs/type-aliases/OnFetchPauseCallback.md +1 -1
- package/docs/type-aliases/ResourceType.md +1 -1
- package/docs/type-aliases/TrimPreset.md +13 -0
- package/docs/variables/DefaultFetcherProperties.md +1 -1
- package/docs/variables/FetcherOptionKeys.md +1 -1
- package/docs/variables/TRIM_PRESETS.md +11 -0
- package/package.json +11 -11
package/dist/index.d.mts
CHANGED
|
@@ -730,6 +730,181 @@ type _RequireAtLeastOne<
|
|
|
730
730
|
// 3. Add the remaining keys not in `KeysType`
|
|
731
731
|
Except<ObjectType, KeysType>;
|
|
732
732
|
|
|
733
|
+
/**
|
|
734
|
+
* Represents the engine-specific execution scope (e.g., a Cheerio node or a Playwright Locator).
|
|
735
|
+
* It acts as the target for extraction and interaction actions.
|
|
736
|
+
*/
|
|
737
|
+
type FetchElementScope = any;
|
|
738
|
+
/**
|
|
739
|
+
* Interface representing the minimal engine capabilities required for extraction.
|
|
740
|
+
*
|
|
741
|
+
* @remarks
|
|
742
|
+
* This interface abstracts the underlying DOM manipulation library (Cheerio or Playwright).
|
|
743
|
+
* Implementing classes must ensure consistent behavior across different engines, especially
|
|
744
|
+
* regarding scope handling (Element vs Array of Elements) and DOM traversal.
|
|
745
|
+
*/
|
|
746
|
+
interface IExtractEngine {
|
|
747
|
+
/**
|
|
748
|
+
* Finds all elements matching the selector within the given scope.
|
|
749
|
+
*
|
|
750
|
+
* @param scope - The context to search in. Can be a single element or an array of elements (e.g., in segmented mode).
|
|
751
|
+
* @param selector - The CSS selector to match.
|
|
752
|
+
* @returns A promise resolving to an array of found element scopes.
|
|
753
|
+
*
|
|
754
|
+
* @remarks
|
|
755
|
+
* **Behavior Contract:**
|
|
756
|
+
* 1. **Descendants**: It MUST search for descendants matching the selector within the scope.
|
|
757
|
+
* 2. **Self-Matching**: It MUST check if the scope element(s) *themselves* match the selector.
|
|
758
|
+
* 3. **Array Scope**: If `scope` is an array:
|
|
759
|
+
* - It MUST process elements in the order they appear in the array (which should match document order).
|
|
760
|
+
* - It MUST perform the check (Self + Descendants) for *each* element in the array.
|
|
761
|
+
* - It MUST flatten the results into a single array.
|
|
762
|
+
* - It SHOULD dedup the results if the engine's query mechanism naturally produces duplicates (e.g. nested scopes),
|
|
763
|
+
* but generally, preserving document order is the priority.
|
|
764
|
+
*/
|
|
765
|
+
_querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
766
|
+
/**
|
|
767
|
+
* Extracts a primitive value from the element based on the schema configuration.
|
|
768
|
+
*
|
|
769
|
+
* @param schema - The value extraction schema defining `type`, `mode`, and `attribute`.
|
|
770
|
+
* @param scope - The specific element to extract data from.
|
|
771
|
+
* @returns A promise resolving to the extracted value (string, number, boolean, or null).
|
|
772
|
+
*
|
|
773
|
+
* @remarks
|
|
774
|
+
* **Behavior Contract:**
|
|
775
|
+
* - **Attribute**: If `schema.attribute` is set, returns the attribute value. If missing, returns `null` or empty string based on engine.
|
|
776
|
+
* - **HTML**: If `schema.mode` is 'html', returns `innerHTML`.
|
|
777
|
+
* - **OuterHTML**: If `schema.mode` is 'outerHTML', returns `outerHTML`.
|
|
778
|
+
* - **Text**: If `schema.mode` is 'text', returns `textContent` (trimmed by default in most implementations).
|
|
779
|
+
* - **InnerText**: If `schema.mode` is 'innerText', returns rendered text (visual approximation in Cheerio).
|
|
780
|
+
*/
|
|
781
|
+
_extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
|
|
782
|
+
/**
|
|
783
|
+
* Gets the parent element of the given scope.
|
|
784
|
+
*
|
|
785
|
+
* @param scope - The element to find the parent of.
|
|
786
|
+
* @returns A promise resolving to the parent element scope, or `null` if the element is root or detached.
|
|
787
|
+
*/
|
|
788
|
+
_parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
|
|
789
|
+
/**
|
|
790
|
+
* Checks if two element scopes refer to the exact same DOM node.
|
|
791
|
+
*
|
|
792
|
+
* @param scope1 - The first element scope.
|
|
793
|
+
* @param scope2 - The second element scope.
|
|
794
|
+
* @returns A promise resolving to `true` if they are the same node, `false` otherwise.
|
|
795
|
+
*
|
|
796
|
+
* @remarks
|
|
797
|
+
* This comparison MUST be identity-based, not just content-based.
|
|
798
|
+
*/
|
|
799
|
+
_isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
|
|
800
|
+
/**
|
|
801
|
+
* Retrieves all subsequent sibling elements of the `scope` element, stopping *before* the first sibling that matches `untilSelector`.
|
|
802
|
+
*
|
|
803
|
+
* @param scope - The anchor element (starting point). The returned list starts *after* this element.
|
|
804
|
+
* @param untilSelector - Optional. A CSS selector. If provided, the scanning stops when a sibling matches this selector (exclusive).
|
|
805
|
+
* If omitted or null, returns all following siblings.
|
|
806
|
+
* @returns A promise resolving to an array of sibling element scopes.
|
|
807
|
+
*
|
|
808
|
+
* @remarks
|
|
809
|
+
* **Behavior Contract:**
|
|
810
|
+
* - **Starting Point**: The `scope` element itself IS NOT included in the result.
|
|
811
|
+
* - **Ending Point**: The element matching `untilSelector` IS NOT included in the result.
|
|
812
|
+
* - **Direction**: Only scans *following* siblings (next siblings).
|
|
813
|
+
* - **Flattening**: The result is a flat list of siblings, not a nested structure.
|
|
814
|
+
*/
|
|
815
|
+
_nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
816
|
+
/**
|
|
817
|
+
* Finds the closest ancestor of the `scope` element (including the element itself) that is present in the `candidates` array.
|
|
818
|
+
*
|
|
819
|
+
* @param scope - The starting element from which to ascend the DOM tree.
|
|
820
|
+
* @param candidates - An array of potential ancestor elements to check against.
|
|
821
|
+
* @returns A promise resolving to the matching candidate element from the array, or `null` if no match is found.
|
|
822
|
+
*
|
|
823
|
+
* @remarks
|
|
824
|
+
* **Performance Critical**: This method is a key optimization for "bubbling up" logic (e.g., in Segmented extraction).
|
|
825
|
+
* It effectively answers: "Which of these container candidates does my current element belong to?"
|
|
826
|
+
*
|
|
827
|
+
* **Implementation Guidelines**:
|
|
828
|
+
* - **Cheerio**: Should use a `Set` for O(1) candidate lookup during tree traversal (Total O(Depth)).
|
|
829
|
+
* - **Playwright**: Should perform the entire traversal within a single `page.evaluate` call to avoid O(Depth) IPC round-trips.
|
|
830
|
+
*/
|
|
831
|
+
_findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
|
|
832
|
+
/**
|
|
833
|
+
* Checks if the `container` element contains the `element` (descendant).
|
|
834
|
+
*
|
|
835
|
+
* @param container - The potential ancestor element.
|
|
836
|
+
* @param element - The potential descendant element.
|
|
837
|
+
* @returns A promise resolving to `true` if `container` contains `element`, `false` otherwise.
|
|
838
|
+
*
|
|
839
|
+
* @remarks
|
|
840
|
+
* **Standard Compliance**: This mirrors the DOM [Node.contains()](https://developer.mozilla.org/en-US/docs/Web/API/Node/contains) behavior.
|
|
841
|
+
*
|
|
842
|
+
* @performance-critical Used extensively in boundary checks for Segmented extraction.
|
|
843
|
+
* - **Playwright**: MUST use `elementHandle.evaluate` to use native `Node.contains` in the browser context, reducing IPC overhead.
|
|
844
|
+
* - **Cheerio**: Should use efficient lookups like `$.contains` or `.find()`.
|
|
845
|
+
*/
|
|
846
|
+
_contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
|
|
847
|
+
/**
|
|
848
|
+
* Finds the Lowest Common Ancestor (LCA) of two element scopes.
|
|
849
|
+
*
|
|
850
|
+
* @param scope1 - The first element.
|
|
851
|
+
* @param scope2 - The second element.
|
|
852
|
+
* @returns A promise resolving to the LCA element, or null if they are in different documents/trees.
|
|
853
|
+
*
|
|
854
|
+
* @remarks
|
|
855
|
+
* This is a fundamental tree operation used to find the point where two element paths diverge.
|
|
856
|
+
* **Performance Critical**: For Playwright, this MUST be implemented in a single `evaluate` call.
|
|
857
|
+
*/
|
|
858
|
+
_findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
|
|
859
|
+
/**
|
|
860
|
+
* Finds the direct child of the `container` that contains the `element` (or is the `element` itself).
|
|
861
|
+
*
|
|
862
|
+
* @param element - The descendant element.
|
|
863
|
+
* @param container - The ancestor container.
|
|
864
|
+
* @returns A promise resolving to the child element, or null if `element` is not a descendant of `container`.
|
|
865
|
+
*
|
|
866
|
+
* @remarks
|
|
867
|
+
* This method traverses up from `element` until it finds the node whose parent is `container`.
|
|
868
|
+
* **Performance Critical**: This replaces the manual bubble-up loop in Node.js.
|
|
869
|
+
*/
|
|
870
|
+
_findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
|
|
871
|
+
/**
|
|
872
|
+
* Logs debug information if debug mode is enabled.
|
|
873
|
+
* @param category - The category of the log message.
|
|
874
|
+
* @param args - Arguments to log.
|
|
875
|
+
*/
|
|
876
|
+
_logDebug(category: string, ...args: any[]): void;
|
|
877
|
+
}
|
|
878
|
+
/**
|
|
879
|
+
* Base configuration for all extraction schemas.
|
|
880
|
+
*/
|
|
881
|
+
interface BaseExtractSchema {
|
|
882
|
+
/**
|
|
883
|
+
* Whether this field is required. If true and the value is null,
|
|
884
|
+
* the containing object or array item will be skipped (or throw error in strict mode).
|
|
885
|
+
*/
|
|
886
|
+
required?: boolean;
|
|
887
|
+
/**
|
|
888
|
+
* Whether to enable strict mode for this extraction.
|
|
889
|
+
* If true, missing required fields will throw an error instead of being skipped.
|
|
890
|
+
*/
|
|
891
|
+
strict?: boolean;
|
|
892
|
+
/**
|
|
893
|
+
* Specifies the starting anchor for extraction of this field.
|
|
894
|
+
* - Field Name: Uses the DOM element of a previously extracted field as the anchor.
|
|
895
|
+
* - CSS Selector: Re-queries the selector within the current context to find the anchor.
|
|
896
|
+
*
|
|
897
|
+
* Once anchored, the search scope for this field becomes the siblings following the anchor.
|
|
898
|
+
*/
|
|
899
|
+
anchor?: string;
|
|
900
|
+
/**
|
|
901
|
+
* The maximum number of levels to bubble up from the anchor or matched element.
|
|
902
|
+
* - In 'anchor' mode: Defines how many parent levels to traverse to collect following siblings.
|
|
903
|
+
* - In 'segmented' mode: Defines the maximum levels to ascend from the anchor to find a container.
|
|
904
|
+
* - In 'object' mode: Enables "Try-And-Bubble". Attempts extraction at current level; if required fields are missing, bubbles up (max `depth` levels) to retry.
|
|
905
|
+
*/
|
|
906
|
+
depth?: number;
|
|
907
|
+
}
|
|
733
908
|
/**
|
|
734
909
|
* Extraction schema types.
|
|
735
910
|
*/
|
|
@@ -737,7 +912,7 @@ type ExtractSchema = ExtractObjectSchema | ExtractArraySchema | ExtractValueSche
|
|
|
737
912
|
/**
|
|
738
913
|
* Configuration for extracting a single value.
|
|
739
914
|
*/
|
|
740
|
-
interface ExtractValueSchema {
|
|
915
|
+
interface ExtractValueSchema extends BaseExtractSchema {
|
|
741
916
|
/**
|
|
742
917
|
* The data type to cast the extracted value to.
|
|
743
918
|
* @default 'string'
|
|
@@ -774,16 +949,21 @@ interface ExtractValueSchema {
|
|
|
774
949
|
*/
|
|
775
950
|
type ExtractArrayModeName = 'nested' | 'columnar' | 'segmented';
|
|
776
951
|
/**
|
|
777
|
-
* Options for columnar (column-alignment) extraction.
|
|
952
|
+
* Base options for array extraction modes.
|
|
778
953
|
*/
|
|
779
|
-
interface ColumnarOptions {
|
|
780
|
-
type: 'columnar';
|
|
954
|
+
interface BaseModeOptions {
|
|
955
|
+
type: ExtractArrayModeName;
|
|
781
956
|
/**
|
|
782
|
-
* Whether to enable strict mode.
|
|
783
|
-
*
|
|
784
|
-
* @default true
|
|
957
|
+
* Whether to enable strict mode for this specific array mode.
|
|
958
|
+
* @default false
|
|
785
959
|
*/
|
|
786
960
|
strict?: boolean;
|
|
961
|
+
}
|
|
962
|
+
/**
|
|
963
|
+
* Options for columnar (column-alignment) extraction.
|
|
964
|
+
*/
|
|
965
|
+
interface ColumnarOptions extends BaseModeOptions {
|
|
966
|
+
type: 'columnar';
|
|
787
967
|
/**
|
|
788
968
|
* Whether to enable heuristic inference.
|
|
789
969
|
* If true, tries to find a common parent to infer item wrappers when counts mismatch.
|
|
@@ -794,13 +974,24 @@ interface ColumnarOptions {
|
|
|
794
974
|
/**
|
|
795
975
|
* Options for segmented (anchor-based) extraction.
|
|
796
976
|
*/
|
|
797
|
-
interface SegmentedOptions {
|
|
977
|
+
interface SegmentedOptions extends BaseModeOptions {
|
|
798
978
|
type: 'segmented';
|
|
799
979
|
/**
|
|
800
|
-
* The name of the field in `items` to use as a segment anchor.
|
|
801
|
-
* Defaults to the first property key defined in `items`.
|
|
980
|
+
* The name of the field in `items` to use as a segment anchor, or a direct CSS selector.
|
|
981
|
+
* Defaults to the first property key's selector defined in `items`.
|
|
802
982
|
*/
|
|
803
983
|
anchor?: string;
|
|
984
|
+
/**
|
|
985
|
+
* Where to start searching for fields within each segment.
|
|
986
|
+
* - 'anchor': (Default) All fields are searched within the entire segment.
|
|
987
|
+
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
988
|
+
*/
|
|
989
|
+
relativeTo?: 'anchor' | 'previous';
|
|
990
|
+
/**
|
|
991
|
+
* The maximum number of levels to bubble up from the anchor to find a segment container.
|
|
992
|
+
* If omitted, it bubbles up as high as possible without conflicting with neighboring segments.
|
|
993
|
+
*/
|
|
994
|
+
depth?: number;
|
|
804
995
|
}
|
|
805
996
|
/**
|
|
806
997
|
* Union type for array extraction modes and their options.
|
|
@@ -809,7 +1000,7 @@ type ExtractArrayMode = ExtractArrayModeName | ColumnarOptions | SegmentedOption
|
|
|
809
1000
|
/**
|
|
810
1001
|
* Configuration for extracting an array of items.
|
|
811
1002
|
*/
|
|
812
|
-
interface ExtractArraySchema {
|
|
1003
|
+
interface ExtractArraySchema extends BaseExtractSchema {
|
|
813
1004
|
type: 'array';
|
|
814
1005
|
/**
|
|
815
1006
|
* CSS selector for items (in 'nested' mode) or the container (in 'columnar'/'segmented' modes).
|
|
@@ -843,7 +1034,7 @@ interface ExtractArraySchema {
|
|
|
843
1034
|
/**
|
|
844
1035
|
* Configuration for extracting an object with multiple properties.
|
|
845
1036
|
*/
|
|
846
|
-
interface ExtractObjectSchema {
|
|
1037
|
+
interface ExtractObjectSchema extends BaseExtractSchema {
|
|
847
1038
|
type: 'object';
|
|
848
1039
|
/**
|
|
849
1040
|
* Root selector for the object. If provided, sub-properties are searched within this element.
|
|
@@ -857,6 +1048,18 @@ interface ExtractObjectSchema {
|
|
|
857
1048
|
* Exclude the object element if it matches this selector.
|
|
858
1049
|
*/
|
|
859
1050
|
exclude?: string;
|
|
1051
|
+
/**
|
|
1052
|
+
* Where to start searching for fields within this object.
|
|
1053
|
+
* Only applicable when the object is being extracted from an array of elements (e.g. in 'segmented' mode).
|
|
1054
|
+
* - 'anchor': (Default) All fields are searched within the entire scope.
|
|
1055
|
+
* - 'previous': Each field is searched starting from after the previous field's match.
|
|
1056
|
+
*/
|
|
1057
|
+
relativeTo?: 'anchor' | 'previous';
|
|
1058
|
+
/**
|
|
1059
|
+
* Explicit order of property extraction.
|
|
1060
|
+
* Useful when using `relativeTo: 'previous'`.
|
|
1061
|
+
*/
|
|
1062
|
+
order?: string[];
|
|
860
1063
|
/**
|
|
861
1064
|
* Definition of the object's properties and their corresponding extraction schemas.
|
|
862
1065
|
*/
|
|
@@ -891,6 +1094,7 @@ interface GotoActionOptions {
|
|
|
891
1094
|
headers?: Record<string, string>;
|
|
892
1095
|
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
893
1096
|
timeoutMs?: number;
|
|
1097
|
+
simulate?: boolean;
|
|
894
1098
|
}
|
|
895
1099
|
/**
|
|
896
1100
|
* Options for the {@link FetchEngine.waitFor} action, specifying conditions to wait for before continuing.
|
|
@@ -913,6 +1117,78 @@ interface WaitForActionOptions {
|
|
|
913
1117
|
interface SubmitActionOptions {
|
|
914
1118
|
enctype?: 'application/x-www-form-urlencoded' | 'application/json' | 'multipart/form-data';
|
|
915
1119
|
}
|
|
1120
|
+
/**
|
|
1121
|
+
* Predefined cleanup groups for the {@link FetchEngine.trim} action.
|
|
1122
|
+
*/
|
|
1123
|
+
type TrimPreset = 'scripts' | 'styles' | 'svgs' | 'images' | 'comments' | 'hidden' | 'all';
|
|
1124
|
+
/**
|
|
1125
|
+
* Options for the {@link FetchEngine.trim} action, specifying which elements to remove from the DOM.
|
|
1126
|
+
*/
|
|
1127
|
+
interface TrimActionOptions {
|
|
1128
|
+
selectors?: string | string[];
|
|
1129
|
+
presets?: TrimPreset | TrimPreset[];
|
|
1130
|
+
}
|
|
1131
|
+
declare const TRIM_PRESETS: Record<string, string[]>;
|
|
1132
|
+
/**
|
|
1133
|
+
* Options for the {@link FetchEngine.evaluate} action, specifying the function to execute and its arguments.
|
|
1134
|
+
*
|
|
1135
|
+
* @remarks
|
|
1136
|
+
* This action allows executing custom JavaScript logic within the page context.
|
|
1137
|
+
*
|
|
1138
|
+
* **Execution Environments:**
|
|
1139
|
+
* - **`browser` mode (Playwright)**: Executes directly in the real browser's execution context.
|
|
1140
|
+
* - **`http` mode (Cheerio)**: Executes in a Node.js sandbox using `newFunction`. It provides a mocked browser environment
|
|
1141
|
+
* including `window`, `document` (with `querySelector`, `querySelectorAll`, etc.), and `console`.
|
|
1142
|
+
*
|
|
1143
|
+
* **Navigation Handling:**
|
|
1144
|
+
* If the executed code modifies `window.location.href` (or calls `assign()`/`replace()`), the engine will
|
|
1145
|
+
* automatically detect the change, trigger a navigation, and wait for the new page to load before resolving the action.
|
|
1146
|
+
*
|
|
1147
|
+
* @example
|
|
1148
|
+
* ```json
|
|
1149
|
+
* {
|
|
1150
|
+
* "action": "evaluate",
|
|
1151
|
+
* "params": {
|
|
1152
|
+
* "fn": "([a, b]) => a + b",
|
|
1153
|
+
* "args": [1, 2]
|
|
1154
|
+
* }
|
|
1155
|
+
* }
|
|
1156
|
+
* ```
|
|
1157
|
+
*
|
|
1158
|
+
* @example
|
|
1159
|
+
* ```json
|
|
1160
|
+
* {
|
|
1161
|
+
* "action": "evaluate",
|
|
1162
|
+
* "params": {
|
|
1163
|
+
* "fn": "({ x, y }) => x * y",
|
|
1164
|
+
* "args": { "x": 6, "y": 7 }
|
|
1165
|
+
* }
|
|
1166
|
+
* }
|
|
1167
|
+
* ```
|
|
1168
|
+
*/
|
|
1169
|
+
interface EvaluateActionOptions {
|
|
1170
|
+
/**
|
|
1171
|
+
* The function or expression to execute.
|
|
1172
|
+
*
|
|
1173
|
+
* @remarks
|
|
1174
|
+
* Can be:
|
|
1175
|
+
* 1. A function object (only available when using the API directly).
|
|
1176
|
+
* 2. A string containing a function definition, e.g., `"async (args) => { ... }"`
|
|
1177
|
+
* 3. A string containing a direct expression, e.g., `"document.title"`
|
|
1178
|
+
*
|
|
1179
|
+
* **Note:** When using a function, it receives exactly ONE argument: the value provided in {@link args}.
|
|
1180
|
+
* Use destructuring to handle multiple parameters.
|
|
1181
|
+
*/
|
|
1182
|
+
fn: string | ((...args: any[]) => any);
|
|
1183
|
+
/**
|
|
1184
|
+
* Data to pass to the function.
|
|
1185
|
+
*
|
|
1186
|
+
* @remarks
|
|
1187
|
+
* This value is passed as the first and only argument to the function defined in {@link fn}.
|
|
1188
|
+
* Recommended to use an array or object for multiple values.
|
|
1189
|
+
*/
|
|
1190
|
+
args?: any;
|
|
1191
|
+
}
|
|
916
1192
|
/**
|
|
917
1193
|
* Union type representing all possible engine actions that can be dispatched.
|
|
918
1194
|
*
|
|
@@ -927,6 +1203,35 @@ type FetchEngineAction = {
|
|
|
927
1203
|
type: 'fill';
|
|
928
1204
|
selector: string;
|
|
929
1205
|
value: string;
|
|
1206
|
+
} | {
|
|
1207
|
+
type: 'mouseMove';
|
|
1208
|
+
params: {
|
|
1209
|
+
x?: number;
|
|
1210
|
+
y?: number;
|
|
1211
|
+
selector?: string;
|
|
1212
|
+
steps?: number;
|
|
1213
|
+
};
|
|
1214
|
+
} | {
|
|
1215
|
+
type: 'mouseClick';
|
|
1216
|
+
params: {
|
|
1217
|
+
x?: number;
|
|
1218
|
+
y?: number;
|
|
1219
|
+
button?: 'left' | 'right' | 'middle';
|
|
1220
|
+
clickCount?: number;
|
|
1221
|
+
delay?: number;
|
|
1222
|
+
};
|
|
1223
|
+
} | {
|
|
1224
|
+
type: 'keyboardType';
|
|
1225
|
+
params: {
|
|
1226
|
+
text: string;
|
|
1227
|
+
delay?: number;
|
|
1228
|
+
};
|
|
1229
|
+
} | {
|
|
1230
|
+
type: 'keyboardPress';
|
|
1231
|
+
params: {
|
|
1232
|
+
key: string;
|
|
1233
|
+
delay?: number;
|
|
1234
|
+
};
|
|
930
1235
|
} | {
|
|
931
1236
|
type: 'waitFor';
|
|
932
1237
|
options?: WaitForActionOptions;
|
|
@@ -946,6 +1251,12 @@ type FetchEngineAction = {
|
|
|
946
1251
|
} | {
|
|
947
1252
|
type: 'pause';
|
|
948
1253
|
message?: string;
|
|
1254
|
+
} | {
|
|
1255
|
+
type: 'trim';
|
|
1256
|
+
options: TrimActionOptions;
|
|
1257
|
+
} | {
|
|
1258
|
+
type: 'evaluate';
|
|
1259
|
+
params: EvaluateActionOptions;
|
|
949
1260
|
} | {
|
|
950
1261
|
type: 'dispose';
|
|
951
1262
|
};
|
|
@@ -995,7 +1306,7 @@ interface PendingEngineRequest {
|
|
|
995
1306
|
*/
|
|
996
1307
|
type AnyFetchEngine = FetchEngine<any, any, any>;
|
|
997
1308
|
type AnyFetchEngineCtor = new (...args: any[]) => AnyFetchEngine;
|
|
998
|
-
declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> {
|
|
1309
|
+
declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCrawler extends BasicCrawler<TContext> = any, TOptions extends BasicCrawlerOptions<TContext> = any> implements IExtractEngine {
|
|
999
1310
|
private static registry;
|
|
1000
1311
|
/**
|
|
1001
1312
|
* Registers a fetch engine implementation with the global registry.
|
|
@@ -1068,46 +1379,105 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1068
1379
|
protected isPageActive: boolean;
|
|
1069
1380
|
protected isEngineDisposed: boolean;
|
|
1070
1381
|
protected navigationLock: PromiseLock;
|
|
1382
|
+
protected activeContext?: TContext;
|
|
1383
|
+
protected isExecutingAction: boolean;
|
|
1071
1384
|
protected lastResponse?: FetchResponse;
|
|
1385
|
+
protected actionQueue: DispatchedEngineAction[];
|
|
1386
|
+
protected isProcessingActionLoop: boolean;
|
|
1072
1387
|
protected blockedTypes: Set<string>;
|
|
1388
|
+
_logDebug(category: string, ...args: any[]): void;
|
|
1073
1389
|
protected _cleanup?(): Promise<void>;
|
|
1390
|
+
protected _getTrimInfo(options: TrimActionOptions): {
|
|
1391
|
+
selectors: string[];
|
|
1392
|
+
removeComments: boolean;
|
|
1393
|
+
removeHidden: boolean;
|
|
1394
|
+
};
|
|
1074
1395
|
/**
|
|
1075
|
-
* Finds all elements matching the selector within the given
|
|
1076
|
-
*
|
|
1396
|
+
* Finds all elements matching the selector within the given scope.
|
|
1397
|
+
*
|
|
1398
|
+
* @param scope - The scope to search in (Engine-specific element/node or array of nodes).
|
|
1077
1399
|
* @param selector - CSS selector.
|
|
1400
|
+
* @returns List of matching elements.
|
|
1401
|
+
* @see {@link IExtractEngine._querySelectorAll} for behavior contract.
|
|
1078
1402
|
* @internal
|
|
1079
1403
|
*/
|
|
1080
|
-
|
|
1404
|
+
abstract _querySelectorAll(scope: FetchElementScope, selector: string): Promise<FetchElementScope[]>;
|
|
1081
1405
|
/**
|
|
1082
1406
|
* Extracts a primitive value from the element based on schema.
|
|
1407
|
+
*
|
|
1083
1408
|
* @param schema - Value extraction schema.
|
|
1084
|
-
* @param
|
|
1409
|
+
* @param scope - The element scope.
|
|
1410
|
+
* @returns Extracted value.
|
|
1411
|
+
* @see {@link IExtractEngine._extractValue} for behavior contract.
|
|
1085
1412
|
* @internal
|
|
1086
1413
|
*/
|
|
1087
|
-
|
|
1414
|
+
abstract _extractValue(schema: ExtractValueSchema, scope: FetchElementScope): Promise<any>;
|
|
1088
1415
|
/**
|
|
1089
1416
|
* Gets the parent element of the given element.
|
|
1090
|
-
*
|
|
1417
|
+
*
|
|
1418
|
+
* @param scope - The element scope.
|
|
1419
|
+
* @returns Parent element or null.
|
|
1091
1420
|
* @internal
|
|
1092
1421
|
*/
|
|
1093
|
-
|
|
1422
|
+
abstract _parentElement(scope: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1094
1423
|
/**
|
|
1095
|
-
* Checks if two elements are the same.
|
|
1096
|
-
*
|
|
1097
|
-
* @param
|
|
1424
|
+
* Checks if two elements are the same identity.
|
|
1425
|
+
*
|
|
1426
|
+
* @param scope1 - First element scope.
|
|
1427
|
+
* @param scope2 - Second element scope.
|
|
1428
|
+
* @returns True if they are the same DOM node.
|
|
1098
1429
|
* @internal
|
|
1099
1430
|
*/
|
|
1100
|
-
|
|
1431
|
+
abstract _isSameElement(scope1: FetchElementScope, scope2: FetchElementScope): Promise<boolean>;
|
|
1101
1432
|
/**
|
|
1102
1433
|
* Gets all subsequent siblings of an element until a sibling matches the selector.
|
|
1103
1434
|
* Used in 'segmented' extraction mode.
|
|
1104
|
-
*
|
|
1105
|
-
* @param
|
|
1435
|
+
*
|
|
1436
|
+
* @param scope - The anchor element scope.
|
|
1437
|
+
* @param untilSelector - Optional selector that marks the end of the segment (exclusive).
|
|
1438
|
+
* @returns List of sibling elements between anchor and untilSelector.
|
|
1106
1439
|
* @internal
|
|
1107
1440
|
*/
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1441
|
+
abstract _nextSiblingsUntil(scope: FetchElementScope, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
1442
|
+
/**
|
|
1443
|
+
* Finds the closest ancestor of `scope` (including itself) that exists in the `candidates` array.
|
|
1444
|
+
*
|
|
1445
|
+
* @param scope - The starting element.
|
|
1446
|
+
* @param candidates - The array of potential ancestor scopes.
|
|
1447
|
+
* @returns A promise resolving to the matching candidate scope, or `null` if none found.
|
|
1448
|
+
* @see {@link IExtractEngine._findClosestAncestor} for implementation details.
|
|
1449
|
+
* @internal
|
|
1450
|
+
*/
|
|
1451
|
+
abstract _findClosestAncestor(scope: FetchElementScope, candidates: FetchElementScope[]): Promise<FetchElementScope | null>;
|
|
1452
|
+
/**
|
|
1453
|
+
* Checks if the `container` scope contains the `element` scope.
|
|
1454
|
+
*
|
|
1455
|
+
* @param container - The potential ancestor element.
|
|
1456
|
+
* @param element - The potential descendant element.
|
|
1457
|
+
* @returns A promise resolving to `true` if `container` contains `element`.
|
|
1458
|
+
* @see {@link IExtractEngine._contains} for implementation details.
|
|
1459
|
+
* @internal
|
|
1460
|
+
*/
|
|
1461
|
+
abstract _contains(container: FetchElementScope, element: FetchElementScope): Promise<boolean>;
|
|
1462
|
+
/**
|
|
1463
|
+
* Finds the Lowest Common Ancestor (LCA) of two element scopes.
|
|
1464
|
+
*
|
|
1465
|
+
* @param scope1 - The first element scope.
|
|
1466
|
+
* @param scope2 - The second element scope.
|
|
1467
|
+
* @returns A promise resolving to the LCA element scope, or `null` if none found.
|
|
1468
|
+
* @internal
|
|
1469
|
+
*/
|
|
1470
|
+
abstract _findCommonAncestor(scope1: FetchElementScope, scope2: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1471
|
+
/**
|
|
1472
|
+
* Finds the direct child of container that contains element.
|
|
1473
|
+
*
|
|
1474
|
+
* @param element - The descendant element.
|
|
1475
|
+
* @param container - The container element.
|
|
1476
|
+
* @returns The child element of container, or null.
|
|
1477
|
+
* @internal
|
|
1478
|
+
*/
|
|
1479
|
+
abstract _findContainerChild(element: FetchElementScope, container: FetchElementScope): Promise<FetchElementScope | null>;
|
|
1480
|
+
protected _extract(schema: ExtractSchema, scope: FetchElementScope, parentStrict?: boolean): Promise<any>;
|
|
1111
1481
|
/**
|
|
1112
1482
|
* Normalizes the array extraction mode into an options object.
|
|
1113
1483
|
* @param mode - The mode string or options object.
|
|
@@ -1122,7 +1492,9 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1122
1492
|
* @param elements - The list of item elements.
|
|
1123
1493
|
* @internal
|
|
1124
1494
|
*/
|
|
1125
|
-
protected _extractNested(items: ExtractSchema, elements:
|
|
1495
|
+
protected _extractNested(items: ExtractSchema, elements: FetchElementScope[], opts?: {
|
|
1496
|
+
strict?: boolean;
|
|
1497
|
+
}): Promise<any[]>;
|
|
1126
1498
|
/**
|
|
1127
1499
|
* Performs columnar extraction (Column Alignment Mode).
|
|
1128
1500
|
*
|
|
@@ -1132,7 +1504,7 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1132
1504
|
* @returns An array of extracted items, or null if requirements aren't met.
|
|
1133
1505
|
* @internal
|
|
1134
1506
|
*/
|
|
1135
|
-
protected _extractColumnar(schema: ExtractSchema, container:
|
|
1507
|
+
protected _extractColumnar(schema: ExtractSchema, container: FetchElementScope, opts?: ColumnarOptions): Promise<any[] | null>;
|
|
1136
1508
|
/**
|
|
1137
1509
|
* Performs segmented extraction (Anchor-based Scanning).
|
|
1138
1510
|
*
|
|
@@ -1142,7 +1514,7 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1142
1514
|
* @returns An array of extracted items.
|
|
1143
1515
|
* @internal
|
|
1144
1516
|
*/
|
|
1145
|
-
protected _extractSegmented(schema: ExtractSchema, container:
|
|
1517
|
+
protected _extractSegmented(schema: ExtractSchema, container: FetchElementScope, opts?: SegmentedOptions): Promise<any[] | null>;
|
|
1146
1518
|
/**
|
|
1147
1519
|
* Creates the crawler instance for the specific engine implementation.
|
|
1148
1520
|
* @param options - The final crawler options.
|
|
@@ -1213,6 +1585,43 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1213
1585
|
* @throws {Error} When no active page context exists
|
|
1214
1586
|
*/
|
|
1215
1587
|
click(selector: string): Promise<void>;
|
|
1588
|
+
/**
|
|
1589
|
+
* Moves mouse to specified position or element.
|
|
1590
|
+
*
|
|
1591
|
+
* @param params - Move parameters (x, y, selector, steps)
|
|
1592
|
+
*/
|
|
1593
|
+
mouseMove(params: {
|
|
1594
|
+
x?: number;
|
|
1595
|
+
y?: number;
|
|
1596
|
+
selector?: string;
|
|
1597
|
+
steps?: number;
|
|
1598
|
+
}): Promise<void>;
|
|
1599
|
+
/**
|
|
1600
|
+
* Clicks at current position or specified position.
|
|
1601
|
+
*
|
|
1602
|
+
* @param params - Click parameters (x, y, button, clickCount, delay)
|
|
1603
|
+
*/
|
|
1604
|
+
mouseClick(params: {
|
|
1605
|
+
x?: number;
|
|
1606
|
+
y?: number;
|
|
1607
|
+
button?: 'left' | 'right' | 'middle';
|
|
1608
|
+
clickCount?: number;
|
|
1609
|
+
delay?: number;
|
|
1610
|
+
}): Promise<void>;
|
|
1611
|
+
/**
|
|
1612
|
+
* Types text into current focused element.
|
|
1613
|
+
*
|
|
1614
|
+
* @param text - Text to type
|
|
1615
|
+
* @param delay - Delay between key presses
|
|
1616
|
+
*/
|
|
1617
|
+
keyboardType(text: string, delay?: number): Promise<void>;
|
|
1618
|
+
/**
|
|
1619
|
+
* Presses specified key.
|
|
1620
|
+
*
|
|
1621
|
+
* @param key - Key to press
|
|
1622
|
+
* @param delay - Delay after key press
|
|
1623
|
+
*/
|
|
1624
|
+
keyboardPress(key: string, delay?: number): Promise<void>;
|
|
1216
1625
|
/**
|
|
1217
1626
|
* Fills input element with specified value.
|
|
1218
1627
|
*
|
|
@@ -1231,6 +1640,14 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1231
1640
|
* @throws {Error} When no active page context exists
|
|
1232
1641
|
*/
|
|
1233
1642
|
submit(selector?: any, options?: SubmitActionOptions): Promise<void>;
|
|
1643
|
+
/**
|
|
1644
|
+
* Removes elements from the DOM based on selectors and presets.
|
|
1645
|
+
*
|
|
1646
|
+
* @param options - Trim options specifying selectors and presets
|
|
1647
|
+
* @returns Promise resolving when trim operation completes
|
|
1648
|
+
* @throws {Error} When no active page context exists
|
|
1649
|
+
*/
|
|
1650
|
+
trim(options: TrimActionOptions): Promise<void>;
|
|
1234
1651
|
/**
|
|
1235
1652
|
* Pauses execution, allowing for manual intervention or inspection.
|
|
1236
1653
|
*
|
|
@@ -1239,6 +1656,25 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1239
1656
|
* @throws {Error} When no active page context exists
|
|
1240
1657
|
*/
|
|
1241
1658
|
pause(message?: string): Promise<void>;
|
|
1659
|
+
/**
|
|
1660
|
+
* Executes a custom function or expression within the current page context.
|
|
1661
|
+
*
|
|
1662
|
+
* @remarks
|
|
1663
|
+
* This is a powerful action that allows running custom logic to interact with the DOM,
|
|
1664
|
+
* calculate values, or trigger navigations.
|
|
1665
|
+
*
|
|
1666
|
+
* - In **Browser Mode**, it runs in the real browser.
|
|
1667
|
+
* - In **HTTP Mode**, it runs in a Node.js sandbox with a mocked DOM.
|
|
1668
|
+
*
|
|
1669
|
+
* The action handles automatic navigation if `window.location` is modified.
|
|
1670
|
+
*
|
|
1671
|
+
* @param params - Configuration for the execution, including the function and arguments.
|
|
1672
|
+
* @returns A promise resolving to the result of the execution.
|
|
1673
|
+
* @throws {Error} If no active page context exists or if execution fails.
|
|
1674
|
+
*
|
|
1675
|
+
* @see {@link EvaluateActionOptions} for detailed parameter options and examples.
|
|
1676
|
+
*/
|
|
1677
|
+
evaluate(params: EvaluateActionOptions): Promise<any>;
|
|
1242
1678
|
/**
|
|
1243
1679
|
* Extracts structured data from the current page content.
|
|
1244
1680
|
*
|
|
@@ -1246,7 +1682,6 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1246
1682
|
* @returns A promise that resolves to an object with the extracted data.
|
|
1247
1683
|
*/
|
|
1248
1684
|
extract<T>(schema: ExtractSchema): Promise<T>;
|
|
1249
|
-
protected _normalizeSchema(schema: ExtractSchema): ExtractSchema;
|
|
1250
1685
|
/**
|
|
1251
1686
|
* Gets the unique identifier of this engine implementation.
|
|
1252
1687
|
*/
|
|
@@ -1280,6 +1715,22 @@ declare abstract class FetchEngine<TContext extends CrawlingContext = any, TCraw
|
|
|
1280
1715
|
*/
|
|
1281
1716
|
initialize(context: FetchEngineContext, options?: BaseFetcherProperties): Promise<void>;
|
|
1282
1717
|
cleanup(): Promise<void>;
|
|
1718
|
+
/**
|
|
1719
|
+
* Gets the initial scope for extraction for the specific engine.
|
|
1720
|
+
* @param context - Crawlee crawling context
|
|
1721
|
+
* @internal
|
|
1722
|
+
*/
|
|
1723
|
+
protected abstract _getInitialElementScope(context: TContext): FetchElementScope;
|
|
1724
|
+
/**
|
|
1725
|
+
* Unified action processor that handles engine-agnostic actions.
|
|
1726
|
+
* @param context - Crawlee crawling context
|
|
1727
|
+
* @param action - Action to execute
|
|
1728
|
+
* @internal
|
|
1729
|
+
*/
|
|
1730
|
+
protected _processAction(context: TContext, action: FetchEngineAction): Promise<any>;
|
|
1731
|
+
protected _handlePause(action: {
|
|
1732
|
+
message?: string;
|
|
1733
|
+
}): Promise<void>;
|
|
1283
1734
|
/**
|
|
1284
1735
|
* Executes all pending fetch engine actions within the current Crawlee request handler context.
|
|
1285
1736
|
*
|
|
@@ -1543,29 +1994,64 @@ declare class CheerioFetchEngine extends FetchEngine<CheerioCrawlingContext, Che
|
|
|
1543
1994
|
static readonly mode = "http";
|
|
1544
1995
|
private _ensureCheerioContext;
|
|
1545
1996
|
protected _buildResponse(context: CheerioCrawlingContext): Promise<FetchResponse>;
|
|
1546
|
-
|
|
1997
|
+
_querySelectorAll(scope: {
|
|
1547
1998
|
$: CheerioAPI;
|
|
1548
1999
|
el: any;
|
|
1549
|
-
} | any[], selector: string): Promise<
|
|
1550
|
-
|
|
2000
|
+
} | any[], selector: string): Promise<FetchElementScope[]>;
|
|
2001
|
+
_nextSiblingsUntil(scope: {
|
|
2002
|
+
$: CheerioAPI;
|
|
2003
|
+
el: CheerioNode;
|
|
2004
|
+
}, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2005
|
+
_parentElement(scope: {
|
|
2006
|
+
$: CheerioAPI;
|
|
2007
|
+
el: CheerioNode;
|
|
2008
|
+
}): Promise<FetchElementScope | null>;
|
|
2009
|
+
_isSameElement(scope1: {
|
|
2010
|
+
el: CheerioNode;
|
|
2011
|
+
}, scope2: {
|
|
2012
|
+
el: CheerioNode;
|
|
2013
|
+
}): Promise<boolean>;
|
|
2014
|
+
_findClosestAncestor(scope: {
|
|
1551
2015
|
$: CheerioAPI;
|
|
1552
2016
|
el: CheerioNode;
|
|
1553
|
-
},
|
|
1554
|
-
protected _parentElement(context: {
|
|
2017
|
+
}, candidates: {
|
|
1555
2018
|
$: CheerioAPI;
|
|
1556
2019
|
el: CheerioNode;
|
|
1557
|
-
}): Promise<
|
|
1558
|
-
|
|
2020
|
+
}[]): Promise<FetchElementScope | null>;
|
|
2021
|
+
_contains(container: {
|
|
2022
|
+
$: CheerioAPI;
|
|
1559
2023
|
el: CheerioNode;
|
|
1560
|
-
},
|
|
2024
|
+
}, element: {
|
|
2025
|
+
$: CheerioAPI;
|
|
1561
2026
|
el: CheerioNode;
|
|
1562
2027
|
}): Promise<boolean>;
|
|
1563
|
-
|
|
2028
|
+
_findCommonAncestor(scope1: {
|
|
2029
|
+
$: CheerioAPI;
|
|
2030
|
+
el: CheerioNode;
|
|
2031
|
+
}, scope2: {
|
|
2032
|
+
$: CheerioAPI;
|
|
2033
|
+
el: CheerioNode;
|
|
2034
|
+
}): Promise<FetchElementScope | null>;
|
|
2035
|
+
_findContainerChild(element: {
|
|
2036
|
+
$: CheerioAPI;
|
|
2037
|
+
el: CheerioNode;
|
|
2038
|
+
}, container: {
|
|
2039
|
+
$: CheerioAPI;
|
|
2040
|
+
el: CheerioNode;
|
|
2041
|
+
}): Promise<FetchElementScope | null>;
|
|
2042
|
+
_extractValue(schema: ExtractValueSchema, scope: {
|
|
1564
2043
|
$: CheerioAPI;
|
|
1565
2044
|
el: CheerioNode;
|
|
1566
2045
|
}): Promise<any>;
|
|
2046
|
+
protected _getInitialElementScope(context: CheerioCrawlingContext): FetchElementScope;
|
|
1567
2047
|
protected executeAction(context: CheerioCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
1568
|
-
|
|
2048
|
+
protected _requestWithRedirects(context: CheerioCrawlingContext, options: {
|
|
2049
|
+
url: string;
|
|
2050
|
+
method: string;
|
|
2051
|
+
body?: any;
|
|
2052
|
+
headers?: Record<string, string>;
|
|
2053
|
+
}): Promise<any>;
|
|
2054
|
+
protected _updateStateAfterNavigation(context: CheerioCrawlingContext, loadedRequest: any): Promise<void>;
|
|
1569
2055
|
protected _createCrawler(options: CheerioCrawlerOptions, config?: Configuration): CheerioCrawler;
|
|
1570
2056
|
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): CheerioCrawlerOptions;
|
|
1571
2057
|
goto(url: string, params?: GotoActionOptions): Promise<void | FetchResponse>;
|
|
@@ -1577,11 +2063,36 @@ declare class PlaywrightFetchEngine extends FetchEngine<PlaywrightCrawlingContex
|
|
|
1577
2063
|
static readonly id = "playwright";
|
|
1578
2064
|
static readonly mode = "browser";
|
|
1579
2065
|
protected _buildResponse(context: PlaywrightCrawlingContext): Promise<FetchResponse>;
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
2066
|
+
_querySelectorAll(scope: Locator | Locator[], selector: string): Promise<FetchElementScope[]>;
|
|
2067
|
+
_nextSiblingsUntil(scope: Locator, untilSelector?: string): Promise<FetchElementScope[]>;
|
|
2068
|
+
_parentElement(scope: Locator): Promise<FetchElementScope | null>;
|
|
2069
|
+
_isSameElement(scope1: Locator, scope2: Locator): Promise<boolean>;
|
|
2070
|
+
_findClosestAncestor(scope: Locator, candidates: Locator[]): Promise<FetchElementScope | null>;
|
|
2071
|
+
_contains(container: Locator, element: Locator): Promise<boolean>;
|
|
2072
|
+
_findCommonAncestor(scope1: Locator, scope2: Locator): Promise<FetchElementScope | null>;
|
|
2073
|
+
_findContainerChild(element: Locator, container: Locator): Promise<FetchElementScope | null>;
|
|
2074
|
+
_extractValue(schema: ExtractValueSchema, scope: Locator): Promise<any>;
|
|
2075
|
+
protected _getInitialElementScope(context: PlaywrightCrawlingContext): FetchElementScope;
|
|
2076
|
+
protected _waitForNavigation(context: PlaywrightCrawlingContext, oldUrl: string, actionType: string): Promise<void>;
|
|
2077
|
+
protected currentMousePos: {
|
|
2078
|
+
x: number;
|
|
2079
|
+
y: number;
|
|
2080
|
+
};
|
|
2081
|
+
protected _getRandomDelay(base: number, variance?: number): number;
|
|
2082
|
+
protected _getTrajectory(start: {
|
|
2083
|
+
x: number;
|
|
2084
|
+
y: number;
|
|
2085
|
+
}, end: {
|
|
2086
|
+
x: number;
|
|
2087
|
+
y: number;
|
|
2088
|
+
}, steps?: number): {
|
|
2089
|
+
x: number;
|
|
2090
|
+
y: number;
|
|
2091
|
+
}[];
|
|
2092
|
+
protected _moveToSelector(context: PlaywrightCrawlingContext, selector: string, steps?: number): Promise<{
|
|
2093
|
+
x: number;
|
|
2094
|
+
y: number;
|
|
2095
|
+
}>;
|
|
1585
2096
|
protected executeAction(context: PlaywrightCrawlingContext, action: FetchEngineAction): Promise<any>;
|
|
1586
2097
|
protected _createCrawler(options: PlaywrightCrawlerOptions, config?: Configuration): PlaywrightCrawler;
|
|
1587
2098
|
protected _getSpecificCrawlerOptions(ctx: FetchEngineContext): Promise<Partial<PlaywrightCrawlerOptions>>;
|
|
@@ -1742,7 +2253,7 @@ interface BaseFetcherProperties {
|
|
|
1742
2253
|
enableSmart?: boolean;
|
|
1743
2254
|
useSiteRegistry?: boolean;
|
|
1744
2255
|
antibot?: boolean;
|
|
1745
|
-
debug?: boolean;
|
|
2256
|
+
debug?: boolean | string | string[];
|
|
1746
2257
|
headers?: Record<string, string>;
|
|
1747
2258
|
cookies?: Cookie[];
|
|
1748
2259
|
sessionState?: any;
|
|
@@ -1770,6 +2281,7 @@ interface BaseFetcherProperties {
|
|
|
1770
2281
|
engine?: BrowserEngine;
|
|
1771
2282
|
headless?: boolean;
|
|
1772
2283
|
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle' | 'commit';
|
|
2284
|
+
launchOptions?: Record<string, any>;
|
|
1773
2285
|
};
|
|
1774
2286
|
http?: {
|
|
1775
2287
|
method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
|
|
@@ -1860,6 +2372,7 @@ declare class FetchSession {
|
|
|
1860
2372
|
* @param options - Configuration options for the fetcher.
|
|
1861
2373
|
*/
|
|
1862
2374
|
constructor(options?: FetcherOptions);
|
|
2375
|
+
protected _logDebug(category: string, ...args: any[]): void;
|
|
1863
2376
|
/**
|
|
1864
2377
|
* Executes a single action within the session.
|
|
1865
2378
|
*
|
|
@@ -2055,6 +2568,110 @@ declare class PauseAction extends FetchAction {
|
|
|
2055
2568
|
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2056
2569
|
}
|
|
2057
2570
|
|
|
2571
|
+
declare class TrimAction extends FetchAction {
|
|
2572
|
+
static id: string;
|
|
2573
|
+
static returnType: "none";
|
|
2574
|
+
static capabilities: {
|
|
2575
|
+
http: "simulate";
|
|
2576
|
+
browser: "native";
|
|
2577
|
+
};
|
|
2578
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2579
|
+
}
|
|
2580
|
+
|
|
2581
|
+
/**
|
|
2582
|
+
* Action that evaluates a JavaScript function or expression in the context of the page.
|
|
2583
|
+
*
|
|
2584
|
+
* @remarks
|
|
2585
|
+
* This action is cross-engine compatible. In Cheerio (HTTP) mode, it simulates a browser environment
|
|
2586
|
+
* by providing `window` and `document` objects linked to the Cheerio instance.
|
|
2587
|
+
*
|
|
2588
|
+
* Key features:
|
|
2589
|
+
* - Supports async functions.
|
|
2590
|
+
* - Supports direct expressions (e.g., `"document.title"`).
|
|
2591
|
+
* - Detects URL changes and triggers navigation.
|
|
2592
|
+
* - Consistent parameter passing with Playwright (single argument).
|
|
2593
|
+
*
|
|
2594
|
+
* @example
|
|
2595
|
+
* ```json
|
|
2596
|
+
* {
|
|
2597
|
+
* "action": "evaluate",
|
|
2598
|
+
* "params": {
|
|
2599
|
+
* "fn": "({ selector }) => document.querySelector(selector).textContent",
|
|
2600
|
+
* "args": { "selector": "h1" }
|
|
2601
|
+
* },
|
|
2602
|
+
* "storeAs": "pageTitle"
|
|
2603
|
+
* }
|
|
2604
|
+
* ```
|
|
2605
|
+
*/
|
|
2606
|
+
declare class EvaluateAction extends FetchAction {
|
|
2607
|
+
static id: string;
|
|
2608
|
+
static returnType: "any";
|
|
2609
|
+
static capabilities: {
|
|
2610
|
+
http: "simulate";
|
|
2611
|
+
browser: "native";
|
|
2612
|
+
};
|
|
2613
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<any>;
|
|
2614
|
+
}
|
|
2615
|
+
|
|
2616
|
+
interface MouseMoveParams {
|
|
2617
|
+
x?: number;
|
|
2618
|
+
y?: number;
|
|
2619
|
+
selector?: string;
|
|
2620
|
+
steps?: number;
|
|
2621
|
+
}
|
|
2622
|
+
declare class MouseMoveAction extends FetchAction {
|
|
2623
|
+
static id: string;
|
|
2624
|
+
static returnType: "none";
|
|
2625
|
+
static capabilities: {
|
|
2626
|
+
http: "noop";
|
|
2627
|
+
browser: "native";
|
|
2628
|
+
};
|
|
2629
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2630
|
+
}
|
|
2631
|
+
interface MouseClickParams {
|
|
2632
|
+
x?: number;
|
|
2633
|
+
y?: number;
|
|
2634
|
+
button?: 'left' | 'right' | 'middle';
|
|
2635
|
+
clickCount?: number;
|
|
2636
|
+
delay?: number;
|
|
2637
|
+
}
|
|
2638
|
+
declare class MouseClickAction extends FetchAction {
|
|
2639
|
+
static id: string;
|
|
2640
|
+
static returnType: "none";
|
|
2641
|
+
static capabilities: {
|
|
2642
|
+
http: "noop";
|
|
2643
|
+
browser: "native";
|
|
2644
|
+
};
|
|
2645
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2646
|
+
}
|
|
2647
|
+
|
|
2648
|
+
interface KeyboardTypeParams {
|
|
2649
|
+
text: string;
|
|
2650
|
+
delay?: number;
|
|
2651
|
+
}
|
|
2652
|
+
declare class KeyboardTypeAction extends FetchAction {
|
|
2653
|
+
static id: string;
|
|
2654
|
+
static returnType: "none";
|
|
2655
|
+
static capabilities: {
|
|
2656
|
+
http: "noop";
|
|
2657
|
+
browser: "native";
|
|
2658
|
+
};
|
|
2659
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2660
|
+
}
|
|
2661
|
+
interface KeyboardPressParams {
|
|
2662
|
+
key: string;
|
|
2663
|
+
delay?: number;
|
|
2664
|
+
}
|
|
2665
|
+
declare class KeyboardPressAction extends FetchAction {
|
|
2666
|
+
static id: string;
|
|
2667
|
+
static returnType: "none";
|
|
2668
|
+
static capabilities: {
|
|
2669
|
+
http: "noop";
|
|
2670
|
+
browser: "native";
|
|
2671
|
+
};
|
|
2672
|
+
onExecute(context: FetchContext, options?: BaseFetchActionProperties): Promise<void>;
|
|
2673
|
+
}
|
|
2674
|
+
|
|
2058
2675
|
declare function fetchWeb(options: FetcherOptions): Promise<{
|
|
2059
2676
|
result: FetchResponse | undefined;
|
|
2060
2677
|
outputs: Record<string, any>;
|
|
@@ -2064,4 +2681,4 @@ declare function fetchWeb(url: string, options?: FetcherOptions): Promise<{
|
|
|
2064
2681
|
outputs: Record<string, any>;
|
|
2065
2682
|
}>;
|
|
2066
2683
|
|
|
2067
|
-
export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, type StorageOptions, SubmitAction, type SubmitActionOptions, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb };
|
|
2684
|
+
export { type BaseFetchActionOptions, type BaseFetchActionProperties, type BaseFetchCollectorActionProperties, type BaseFetchCollectorOptions, type BaseFetcherProperties, type BrowserEngine, CheerioFetchEngine, ClickAction, DefaultFetcherProperties, type DispatchedEngineAction, EvaluateAction, type EvaluateActionOptions, ExtractAction, type ExtractActionProperties, FetchAction, type FetchActionCapabilities, type FetchActionCapabilityMode, type FetchActionInContext, type FetchActionOptions, type FetchActionProperties, type FetchActionResult, FetchActionResultStatus, type FetchContext, FetchEngine, type FetchEngineAction, type FetchEngineContext, type FetchEngineType, type FetchMetadata, type FetchResponse, type FetchReturnType, type FetchReturnTypeFor, type FetchReturnTypeRegistry, FetchSession, type FetchSite, FetcherOptionKeys, type FetcherOptions, FillAction, GetContentAction, GotoAction, type GotoActionOptions, KeyboardPressAction, type KeyboardPressParams, KeyboardTypeAction, type KeyboardTypeParams, MouseClickAction, type MouseClickParams, MouseMoveAction, type MouseMoveParams, type OnFetchPauseCallback, PauseAction, type PendingEngineRequest, PlaywrightFetchEngine, type ResourceType, type StorageOptions, SubmitAction, type SubmitActionOptions, TRIM_PRESETS, TrimAction, type TrimActionOptions, type TrimPreset, WaitForAction, type WaitForActionOptions, WebFetcher, fetchWeb };
|