@letsscrapedata/scraper 0.0.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
2
+ import { Proxy } from '@letsscrapedata/proxy';
3
+ import { LogFunction } from '@letsscrapedata/utils';
4
+
5
+ type TemplateId = number;
6
+ type HttpHeaders = Record<string, string>;
7
+ interface ScraperStateData extends BrowserStateData {
8
+ /**
9
+ * @default {}
10
+ */
11
+ headers: HttpHeaders;
12
+ /**
13
+ * @default {}
14
+ */
15
+ userData: Record<string, string>;
16
+ }
17
+ /**
18
+ * Network context used to execute the task
19
+ */
20
+ interface TaskNetworkContext {
21
+ /**
22
+ * Proxy that is used to access the target website:
23
+ * * null only when domainId is 0, which means no network resources are required to execute the task
24
+ * @default null
25
+ */
26
+ proxy: Proxy | null;
27
+ /**
28
+ * browser page that is used to open web pages when executing the task
29
+ * * null when domainNum is less than 0, which means no browser page is required to execute the task
30
+ * @default null
31
+ */
32
+ page: LsdPage | null;
33
+ /**
34
+ * LsdApiContext that shares the state data within the same browser context:
35
+ * * null only when domainId is 0, which means no network resources are required to execute the task
36
+ * @default null
37
+ */
38
+ browserApiContext: LsdApiContext | null;
39
+ /**
40
+ * Standalone LsdApiContext that shares the state data between the tasks that use the same standalone LsdApiContext:
41
+ * * null only when domainId is 0, which means no network resources are required to execute the task
42
+ * * it is not recommended to use this context unless you never use a browser to access web pages.
43
+ * @default null
44
+ */
45
+ standaloneApiContext: LsdApiContext | null;
46
+ }
47
+ type DataRecord = Record<string, string>;
48
+ type ExecData = Record<string, DataRecord[]>;
49
+ interface Subtask {
50
+ tid: number;
51
+ parasstr: string;
52
+ idx?: number;
53
+ sapFlag?: boolean;
54
+ }
55
+ interface TaskMisc {
56
+ taskId: number;
57
+ message: string;
58
+ stack: string;
59
+ variables: Record<string, string>;
60
+ }
61
+ interface TaskData {
62
+ templateId: TemplateId;
63
+ parasStr: string;
64
+ credits: number;
65
+ execData: ExecData;
66
+ subtasks: Subtask[];
67
+ }
68
+ interface TaskResult {
69
+ taskData: TaskData;
70
+ subtaskDatas?: TaskData[];
71
+ /**
72
+ * included if credits >= 0
73
+ */
74
+ newStateData?: ScraperStateData;
75
+ /**
76
+ * included if credits < 0
77
+ */
78
+ misc?: TaskMisc;
79
+ }
80
+ type TaskType = "indAsync" | "indSync" | "memSync";
81
+ interface TemplateTasks {
82
+ tid: number;
83
+ parasstrs: string[];
84
+ }
85
+ /**
86
+ * Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
87
+ */
88
+ interface BrowserConfig {
89
+ browserControllerType?: BrowserControllerType;
90
+ /**
91
+ * url used to connect to the current browser
92
+ ** url starts with "http://", such as "http://localhost:9222/"
93
+ ** browserUrl can be used when logging in manually in advance.
94
+ */
95
+ browserUrl?: string;
96
+ /**
97
+ * proxy
98
+ ** no proxy will be used if proxyUrl is ""
99
+ ** valid only if !browserUrl
100
+ */
101
+ proxyUrl?: string;
102
+ /**
103
+ * type of browser to be launched
104
+ * valid only if !browserUrl
105
+ * @default "chromium"
106
+ */
107
+ browserType?: LsdBrowserType;
108
+ }
109
+ interface TemplatePara {
110
+ templateId: number;
111
+ /**
112
+ * code for reading or getting the template
113
+ * @default "" - only public templates can be got if readCode is ""
114
+ */
115
+ readCode?: string;
116
+ /**
117
+ * the maximum number of concurrent tasks that can execute the same template in a browserContext
118
+ * @default 1
119
+ */
120
+ maxConncurrency?: number;
121
+ }
122
+ type DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
123
+ interface ScraperConfig {
124
+ /**
125
+ * @default false
126
+ */
127
+ exitWhenCompleted?: boolean;
128
+ /**
129
+ * whether to use the parasstr in XML if parasstr of a task is ""
130
+ * @default false
131
+ */
132
+ useParasstrInXmlIfNeeded?: boolean;
133
+ /**
134
+ * whether to load unfinished tasks
135
+ * @default false
136
+ */
137
+ loadUnfinishedTasks?: boolean;
138
+ /**
139
+ * @default "", which will use current directory of process + "/data/"
140
+ * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
141
+ */
142
+ baseDir?: string;
143
+ /**
144
+ * where are the templates saved
145
+ * @default "", which means to get the templates from LSD server
146
+ */
147
+ templateDir?: string;
148
+ /**
149
+ * filename in action_setvar_get/get_file must include inputFileDirPart for security.
150
+ * @default "LetsScrapeData"
151
+ */
152
+ inputFileDirPart?: string;
153
+ /**
154
+ * whether to use puppeteer-extra-plugin-stealth; use patchright instead
155
+ * @default false
156
+ */
157
+ useStealthPlugin?: boolean;
158
+ /**
159
+ * default browserControllerType of BrowserConfig
160
+ * @default "patchright"
161
+ */
162
+ browserControllerType?: BrowserControllerType;
163
+ /**
164
+ * default browserType of BrowserConfig
165
+ * @default "chromium"
166
+ */
167
+ browserType?: LsdBrowserType;
168
+ /**
169
+ * @default { headless: false }
170
+ */
171
+ lsdLaunchOptions?: LsdLaunchOptions;
172
+ /**
173
+ * @default {browserUrl: ""}
174
+ */
175
+ lsdConnectOptions?: LsdConnectOptions;
176
+ /**
177
+ * Important: browsers to be launched or connected using proxyUrl
178
+ * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
179
+ */
180
+ browserConfigs?: BrowserConfig[];
181
+ captcha?: {
182
+ /**
183
+ * clientKey of 2captcha
184
+ */
185
+ clientKey: string;
186
+ };
187
+ templateUrl?: string;
188
+ /**
189
+ * the default maximum number of concurrent tasks that can execute the same template in a browserContext
190
+ * @default 1
191
+ */
192
+ maxConcurrency?: number;
193
+ /**
194
+ * @default ""
195
+ */
196
+ readCode?: string;
197
+ /**
198
+ * @default []
199
+ */
200
+ templateParas?: TemplatePara[];
201
+ /**
202
+ * @default 10
203
+ */
204
+ totalMaxConcurrency?: number;
205
+ /**
206
+ * minimum milliseconds between two tasks of the same template
207
+ * @default 2000
208
+ */
209
+ minMiliseconds?: number;
210
+ /**
211
+ * whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
212
+ * @default false
213
+ */
214
+ moveDataWhenStart?: boolean;
215
+ /**
216
+ ** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
217
+ * @default "jsonl"
218
+ */
219
+ dataFileFormat?: DataFileFormat;
220
+ /**
221
+ * valid only when dataFileFormat is "jsonl"
222
+ * @default true
223
+ */
224
+ useNickName?: boolean;
225
+ /**
226
+ * valid only when dataFileFormat is "txt"
227
+ * @default "::"
228
+ */
229
+ columnSeperator?: string;
230
+ }
231
+
232
+ declare function setScraperLogFun(logFun: LogFunction): boolean;
233
+
234
+ declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number, useNickName?: boolean): Promise<TaskResult>;
235
+
236
+ declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
237
+ declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;
238
+
239
+ export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };
@@ -0,0 +1,239 @@
1
+ import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
2
+ import { Proxy } from '@letsscrapedata/proxy';
3
+ import { LogFunction } from '@letsscrapedata/utils';
4
+
5
+ type TemplateId = number;
6
+ type HttpHeaders = Record<string, string>;
7
+ interface ScraperStateData extends BrowserStateData {
8
+ /**
9
+ * @default {}
10
+ */
11
+ headers: HttpHeaders;
12
+ /**
13
+ * @default {}
14
+ */
15
+ userData: Record<string, string>;
16
+ }
17
+ /**
18
+ * Network context used to execute the task
19
+ */
20
+ interface TaskNetworkContext {
21
+ /**
22
+ * Proxy that is used to access the target website:
23
+ * * null only when domainId is 0, which means no network resources are required to execute the task
24
+ * @default null
25
+ */
26
+ proxy: Proxy | null;
27
+ /**
28
+ * browser page that is used to open web pages when executing the task
29
+ * * null when domainNum is less than 0, which means no browser page is required to execute the task
30
+ * @default null
31
+ */
32
+ page: LsdPage | null;
33
+ /**
34
+ * LsdApiContext that shares the state data within the same browser context:
35
+ * * null only when domainId is 0, which means no network resources are required to execute the task
36
+ * @default null
37
+ */
38
+ browserApiContext: LsdApiContext | null;
39
+ /**
40
+ * Standalone LsdApiContext that shares the state data between the tasks that use the same standalone LsdApiContext:
41
+ * * null only when domainId is 0, which means no network resources are required to execute the task
42
+ * * it is not recommended to use this context unless you never use a browser to access web pages.
43
+ * @default null
44
+ */
45
+ standaloneApiContext: LsdApiContext | null;
46
+ }
47
+ type DataRecord = Record<string, string>;
48
+ type ExecData = Record<string, DataRecord[]>;
49
+ interface Subtask {
50
+ tid: number;
51
+ parasstr: string;
52
+ idx?: number;
53
+ sapFlag?: boolean;
54
+ }
55
+ interface TaskMisc {
56
+ taskId: number;
57
+ message: string;
58
+ stack: string;
59
+ variables: Record<string, string>;
60
+ }
61
+ interface TaskData {
62
+ templateId: TemplateId;
63
+ parasStr: string;
64
+ credits: number;
65
+ execData: ExecData;
66
+ subtasks: Subtask[];
67
+ }
68
+ interface TaskResult {
69
+ taskData: TaskData;
70
+ subtaskDatas?: TaskData[];
71
+ /**
72
+ * included if credits >= 0
73
+ */
74
+ newStateData?: ScraperStateData;
75
+ /**
76
+ * included if credits < 0
77
+ */
78
+ misc?: TaskMisc;
79
+ }
80
+ type TaskType = "indAsync" | "indSync" | "memSync";
81
+ interface TemplateTasks {
82
+ tid: number;
83
+ parasstrs: string[];
84
+ }
85
+ /**
86
+ * Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
87
+ */
88
+ interface BrowserConfig {
89
+ browserControllerType?: BrowserControllerType;
90
+ /**
91
+ * url used to connect to the current browser
92
+ ** url starts with "http://", such as "http://localhost:9222/"
93
+ ** browserUrl can be used when logging in manually in advance.
94
+ */
95
+ browserUrl?: string;
96
+ /**
97
+ * proxy
98
+ ** no proxy will be used if proxyUrl is ""
99
+ ** valid only if !browserUrl
100
+ */
101
+ proxyUrl?: string;
102
+ /**
103
+ * type of browser to be launched
104
+ * valid only if !browserUrl
105
+ * @default "chromium"
106
+ */
107
+ browserType?: LsdBrowserType;
108
+ }
109
+ interface TemplatePara {
110
+ templateId: number;
111
+ /**
112
+ * code for reading or getting the template
113
+ * @default "" - only public templates can be got if readCode is ""
114
+ */
115
+ readCode?: string;
116
+ /**
117
+ * the maximum number of concurrent tasks that can execute the same template in a browserContext
118
+ * @default 1
119
+ */
120
+ maxConncurrency?: number;
121
+ }
122
+ type DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
123
+ interface ScraperConfig {
124
+ /**
125
+ * @default false
126
+ */
127
+ exitWhenCompleted?: boolean;
128
+ /**
129
+ * whether to use the parasstr in XML if parasstr of a task is ""
130
+ * @default false
131
+ */
132
+ useParasstrInXmlIfNeeded?: boolean;
133
+ /**
134
+ * whether to load unfinished tasks
135
+ * @default false
136
+ */
137
+ loadUnfinishedTasks?: boolean;
138
+ /**
139
+ * @default "", which will use current directory of process + "/data/"
140
+ * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
141
+ */
142
+ baseDir?: string;
143
+ /**
144
+ * where are the templates saved
145
+ * @default "", which means to get the templates from LSD server
146
+ */
147
+ templateDir?: string;
148
+ /**
149
+ * filename in action_setvar_get/get_file must include inputFileDirPart for security.
150
+ * @default "LetsScrapeData"
151
+ */
152
+ inputFileDirPart?: string;
153
+ /**
154
+ * whether to use puppeteer-extra-plugin-stealth; use patchright instead
155
+ * @default false
156
+ */
157
+ useStealthPlugin?: boolean;
158
+ /**
159
+ * default browserControllerType of BrowserConfig
160
+ * @default "patchright"
161
+ */
162
+ browserControllerType?: BrowserControllerType;
163
+ /**
164
+ * default browserType of BrowserConfig
165
+ * @default "chromium"
166
+ */
167
+ browserType?: LsdBrowserType;
168
+ /**
169
+ * @default { headless: false }
170
+ */
171
+ lsdLaunchOptions?: LsdLaunchOptions;
172
+ /**
173
+ * @default {browserUrl: ""}
174
+ */
175
+ lsdConnectOptions?: LsdConnectOptions;
176
+ /**
177
+ * Important: browsers to be launched or connected using proxyUrl
178
+ * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
179
+ */
180
+ browserConfigs?: BrowserConfig[];
181
+ captcha?: {
182
+ /**
183
+ * clientKey of 2captcha
184
+ */
185
+ clientKey: string;
186
+ };
187
+ templateUrl?: string;
188
+ /**
189
+ * the default maximum number of concurrent tasks that can execute the same template in a browserContext
190
+ * @default 1
191
+ */
192
+ maxConcurrency?: number;
193
+ /**
194
+ * @default ""
195
+ */
196
+ readCode?: string;
197
+ /**
198
+ * @default []
199
+ */
200
+ templateParas?: TemplatePara[];
201
+ /**
202
+ * @default 10
203
+ */
204
+ totalMaxConcurrency?: number;
205
+ /**
206
+ * minimum milliseconds between two tasks of the same template
207
+ * @default 2000
208
+ */
209
+ minMiliseconds?: number;
210
+ /**
211
+ * whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
212
+ * @default false
213
+ */
214
+ moveDataWhenStart?: boolean;
215
+ /**
216
+ ** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
217
+ * @default "jsonl"
218
+ */
219
+ dataFileFormat?: DataFileFormat;
220
+ /**
221
+ * valid only when dataFileFormat is "jsonl"
222
+ * @default true
223
+ */
224
+ useNickName?: boolean;
225
+ /**
226
+ * valid only when dataFileFormat is "txt"
227
+ * @default "::"
228
+ */
229
+ columnSeperator?: string;
230
+ }
231
+
232
+ declare function setScraperLogFun(logFun: LogFunction): boolean;
233
+
234
+ declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number, useNickName?: boolean): Promise<TaskResult>;
235
+
236
+ declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
237
+ declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;
238
+
239
+ export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };