@letsscrapedata/controller 0.0.43 → 0.0.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +13 -1
  2. package/readme.md +462 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@letsscrapedata/controller",
3
- "version": "0.0.43",
3
+ "version": "0.0.44",
4
4
  "description": "Unified browser / HTML controller interfaces that support playwright, puppeteer and cheerio",
5
5
  "type": "module",
6
6
  "main": "./dist/index.mjs",
@@ -33,6 +33,18 @@
33
33
  "node": ">=18"
34
34
  },
35
35
  "license": "Apache-2.0",
36
+ "keywords": [
37
+ "letsscrapedata",
38
+ "controller",
39
+ "headless",
40
+ "chrome",
41
+ "playwright",
42
+ "puppeteer",
43
+ "cheerio",
44
+ "crawler",
45
+ "scraper",
46
+ "apify"
47
+ ],
36
48
  "dependencies": {
37
49
  "@letsscrapedata/utils": "^0.0.12",
38
50
  "cheerio": "^1.0.0",
package/readme.md ADDED
@@ -0,0 +1,462 @@
1
+ <div align="center">
2
+ <div>
3
+ <a href="https://www.LetsScrapeData.com" style="text-decoration: none" target="_blank">
4
+ <img src="https://www.letsscrapedata.com/assets/logo.svg" width="160" alt="LetsScrapeData">
5
+ </a>
6
+ </div>
7
+ <!-- <div>This is part of LetsScrapeData <a href="https://www.npmjs.com/~letsscrapedata"> web scraping suites </a>.</div> -->
8
+ <div>You can use a free <a href="https://www.LetsScrapeData.com">LetsScrapeData App</a> if you want to scrape web data without programming.</div>
9
+ <br/>
10
+ </div>
11
+
12
+ <font size=4>Please get help and discuss how to scrape a website on the [discord server](https://discord.gg/46atZ8kPVb), which can respond quickly. It is better to submit issues on [github](https://github.com/LetsScrapeData/controller) for better tracking.</font>
13
+
14
+ ## Features
15
+ This package is used by [@letsscrapedata/scraper](https://www.npmjs.com/package/@letsscrapedata/scraper) to facilitate switching between different types of browser controllers and to facilitate support for the new anti-bot browser controller without modifying existing programs.
16
+ * Same interface for playwright, puppeteer and cheerio (more to come): easy to switch between them
17
+ * Web browsing automation: goto(open) / click / input / hover / select / scroll
18
+ * State data management: cookies, localStorage, HTTP Headers, custom session data
19
+ * Request and response interception management: data and HTTP headers
20
+ * Elements selection by CSS selectors or XPath: whether in frames or not
21
+ * Element's attributes: innerHtml, innerText, outerHtml, textContent, etc
22
+ * Automatic file saving: such as screenshot, pdf
23
+ * CDP message
24
+ * Page evaluate
25
+ * Fills in functions that individual browser controllers do not support, and provides workarounds for known issues
26
+
27
+ ## Install
28
+ ```sh
29
+ npm install @letsscrapedata/controller
30
+ ```
31
+
32
+ ## Examples
33
+ ```typescript
34
+ import { controller } from "@letsscrapedata/controller";
35
+
36
+ const browser = await controller.launch("playwright", "chromium", { headless: false });
37
+ const browserContext = await browser.newBrowserContext();
38
+ const page = await browserContext.getPage();
39
+
40
+ await page.goto("https://www.letsscrapedata.com/pages/listexample1.html");
41
+ await page.screenshot({path: "screenshot.png"});
42
+ await browser.close();
43
+ ```
44
+
45
+ ## Same interfaces
46
+ * LsdElement
47
+ * LsdPage
48
+ * LsdBrowserContext
49
+ * LsdBrowser
50
+ * LsdBrowserController
51
+
52
+ ### LsdPage
53
+ ```typescript
54
+ export interface LsdPage extends EventEmitter {
55
+ /**
56
+ * Get the LsdApiContext associated with this page's LsdBrowserContext
57
+ * * only valid in playwright
58
+ */
59
+ apiContext(): LsdApiContext;
60
+
61
+ bringToFront(): Promise<boolean>;
62
+
63
+ browserContext(): LsdBrowserContext;
64
+
65
+ /**
66
+ * clear the cookies of the current page(url)
67
+ * * Prerequisites: page must have a valid url, such as by calling goto(url)
68
+ */
69
+ clearCookies(): Promise<boolean>;
70
+
71
+ /**
72
+ * clear the localStorage of the current page(url)
73
+ * * Prerequisites: page must have a valid url, such as by calling goto(url)
74
+ */
75
+ clearLocalStorage(): Promise<boolean>;
76
+
77
+ /**
78
+ * Clear all request interceptions on the page
79
+ */
80
+ clearRequestInterceptions(): Promise<boolean>;
81
+ /**
82
+ * Clear all response interceptions on the page
83
+ */
84
+ clearResponseInterceptions(): Promise<boolean>;
85
+
86
+ /**
87
+ * clear the stateData of the current page(url):
88
+ * * stateData: cookies, localStorage, indexedDB
89
+ * * Prerequisites: page must have a valid url, such as by calling goto(url)
90
+ */
91
+ clearStateData(): Promise<boolean>;
92
+
93
+ /**
94
+ * Only free page can be closed!
95
+ */
96
+ close(): Promise<boolean>;
97
+
98
+ /**
99
+ * Get the full HTML content of the page or descendant frame
100
+ * @param iframeOptions default [], selectors of descendant frames
101
+ */
102
+ content(iframeOptions?: IframeOption[]): Promise<string>;
103
+
104
+ cookies(): Promise<CookieItem[]>;
105
+
106
+ evalute(fun: Function, args?: any[]): Promise<any>;
107
+
108
+ /**
109
+ * @returns the first element matching the given CSS selector or XPath
110
+ * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
111
+ * @param iframeOptions default [], options to select descendant frame
112
+ */
113
+ findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement | null>;
114
+
115
+ /**
116
+ * @returns elements matching the given CSS selector or XPath
117
+ * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
118
+ * @param iframeOptions default [], options to select descendant frame
119
+ */
120
+
121
+ findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement[]>;
122
+
123
+ /**
124
+ * Free a busy page. All request and response interceptions will be cleared.
125
+ */
126
+ free(): Promise<boolean>;
127
+
128
+ /**
129
+ * Navigate the page to the given url
129
+ * @param url the URL to open
131
+ */
132
+ goto(url: string, options?: GotoOptions): Promise<boolean>;
133
+
134
+ id(): string;
135
+
136
+ isFree(): boolean;
137
+
138
+ /**
139
+ * valid only in CheerioPage
140
+ * @param html
141
+ * @param isHtml default true
142
+ */
143
+ load(html: string, isHtml?: boolean): boolean;
144
+
145
+ localStroage(): Promise<LocalStorageOrigin[]>;
146
+
147
+ mainFrame(): AllFrame;
148
+
149
+ maximizeViewport(): Promise<boolean>;
150
+
151
+ pageHeight(): Promise<number>;
152
+
153
+ pageInfo(): PageInfo;
154
+
155
+ pageWidth(): Promise<number>;
156
+
157
+ pdf(options?: PDFOptions): Promise<Buffer>;
158
+
159
+ screenshot(options?: ScreenshotOptions): Promise<Buffer>;
160
+
161
+ scrollBy(x: number, y: number): Promise<boolean>;
162
+
163
+ scrollTo(x: number, y: number): Promise<boolean>;
164
+
165
+ /**
166
+ *
167
+ * Send a CDP message over the current(not detached) or new CDP session
168
+ * @param method protocol method name
169
+ * @param params default null(ignored), method parameters
170
+ * @param detach default true, whether to detach the CDPSession from target
171
+ */
172
+ sendCDPMessage(method: string, params?: object | null, detach?: boolean): Promise<any>;
173
+
174
+ setCookies(cookies: CookieItem[]): Promise<boolean>;
175
+
176
+ setExtraHTTPHeaders(headers: Record<string, string>): Promise<boolean>;
177
+
178
+ /**
179
+ * set localStorage on the current web page(page.url())
180
+ * @param localStorageItems
181
+ */
182
+ setLocalStroage(localStorageItems: LocalStorageItem[]): Promise<boolean>;
183
+
184
+ setPageInfo(pageInfo: UpdatablePageInfo): boolean;
185
+
186
+ /**
187
+ * Intercept requests that meet the conditions(requestMatch) to perform an action(action and fulfill).
188
+ * @param options
189
+ */
190
+ setRequestInterception(options: RequestInterceptionOption | RequestInterceptionOption[]): Promise<boolean>;
191
+ /**
192
+ * Intercept responses that meet the conditions(requestMatch and responseMatch) to perform actions(cacheArray and handler )
193
+ * @param options
194
+ */
195
+ setResponseInterception(options: ResponseInterceptionOption | ResponseInterceptionOption[]): Promise<boolean>;
196
+
197
+ /**
198
+ * Shortcut for LsdPage.browserContext().setStateData(stateData)
199
+ * @param stateData
200
+ */
201
+ setStateData(stateData: BrowserStateData): Promise<boolean>;
202
+
203
+ /**
204
+ * valid only in puppeteer
205
+ * @param userAgent
206
+ */
207
+ setUserAgent(userAgent: string): Promise<boolean>;
208
+
209
+ setViewportSize(viewPortSize: ViewportSize): Promise<boolean>;
210
+
211
+ stateData(): Promise<BrowserStateData>;
212
+
213
+ status(): PageStatus;
214
+
215
+ title(): Promise<string>;
216
+
217
+ url(): string;
218
+
219
+ /**
220
+ * start to use this free page
221
+ */
222
+ use(): boolean;
223
+
224
+ /**
225
+ *
226
+ * @param selector CSS selector, not XPath
227
+ * @param options
228
+ */
229
+ waitForElement(selector: string, options?: WaitElementOptions): Promise<boolean>;
230
+
231
+ /**
232
+ *
233
+ * @param options
234
+ */
235
+ waitForNavigation(options: WaitNavigationOptions): Promise<boolean>;
236
+
237
+ /**
238
+ * obj=window?.[key1]...?.[keyn]
239
+ * @return obj ? JSON.stringify(obj) : ""
240
+ * @param keys
241
+ */
242
+ windowMember(keys: string[]): Promise<string>;
243
+
244
+ _origPage(): AllPage;
245
+ }
246
+ ```
247
+
248
+ ### LsdElement
249
+ ```typescript
250
+ export interface LsdElement {
251
+ /////////////////////////////////////////////////////////////////////////////// methods used to extract data from the element
252
+ /**
253
+ *
254
+ * @return the value of a specified attribute on the element
255
+ * @param attributeName
256
+ */
257
+ attribute(attributeName: string): Promise<string>;
258
+ /**
259
+ * @returns the attribute names of the element
260
+ */
261
+ attributeNames(): Promise<string[]>;
262
+ /**
263
+ * @returns the first element matching the given CSS selector or XPath
264
+ * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
265
+ * @param iframeOptions default [], options to select descendant frame
266
+ * @param absolute valid only if iframeOptions.length===0
267
+ */
268
+ findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement | null>;
269
+ /**
270
+ * @returns elements matching the given CSS selector or XPath
271
+ * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
272
+ * @param iframeOptions default [], options to select descendant frame
273
+ * @param absolute valid only if iframeOptions.length===0
274
+ */
275
+ findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement[]>;
276
+ /**
277
+ * @returns whether the element has the specified attribute or not
278
+ * @param attributeName
279
+ */
280
+ hasAttribute(attributeName: string): Promise<boolean>;
281
+ /**
282
+ * @returns the HTML or XML markup contained within the element
283
+ */
284
+ innerHtml(): Promise<string>;
285
+
286
+ /**
287
+ * @returns innerText of element
288
+ * @param onlyChild default false, whether to include only the text of the child text nodes
289
+ */
290
+ innerText(onlyChild?: boolean): Promise<string>;
291
+ /**
292
+ * @returns the serialized HTML fragment describing the element including its descendants
293
+ */
294
+ outerHtml(): Promise<string>;
295
+ textContent(): Promise<string>;
296
+
297
+ /////////////////////////////////////////////////////////////////////////////// methods to operate the element(only valid for browser)
298
+ /**
299
+ * Click this element.
300
+ * @param options default {button: "left", count: 1, delay: 0, modifies: []}
301
+ */
302
+ click(options?: MouseClickOptions): Promise<boolean>;
303
+ focus(): Promise<boolean>;
304
+ hover(): Promise<boolean>;
305
+ /**
306
+ * * playwright: fill
307
+ * * puppeteer: type
308
+ */
309
+ input(value: string, options?: InputOptions): Promise<boolean>;
310
+ press(key: KeyInput, options: KeyPressOptions): Promise<boolean>;
311
+ screenshot(options?: ScreenshotOptions): Promise<Buffer>;
312
+ scrollIntoView(): Promise<boolean>;
313
+ select(options: SelectOptions): Promise<boolean>;
314
+ setAttribute(attributeName: string, newValue: string): Promise<boolean>;
315
+ _origElement(): AllElement;
316
+ }
317
+ ```
318
+
319
+ ### LsdBrowserContext
320
+ ```typescript
321
+ export interface LsdBrowserContext extends EventEmitter {
322
+ /**
323
+ * Get the LsdApiContext associated with this LsdBrowserContext
324
+ * * only valid in playwright
325
+ */
326
+ apiContext(): LsdApiContext;
327
+
328
+ browser(): LsdBrowser;
329
+
330
+ close(): Promise<boolean>;
331
+
332
+ /**
333
+ * close pages that have been free for more than maxPageFreeSeconds if maxPageFreeSeconds > 0
334
+ * * but the last page in the browserContext will not be closed
335
+ * @default 0 the default maxPageFreeSeconds of the browserContext will be used
336
+ */
337
+ closeFreePages(maxPageFreeSeconds?: number): Promise<boolean>;
338
+
339
+ /**
340
+ * does this browser meet browserContextRequirements (incognitos ignored in browser)?
341
+ * @param browserContextRequirements
342
+ */
343
+ doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;
344
+
345
+ /**
346
+ * get a free page from current pages or by creating a new page
347
+ */
348
+ getPage(always?: boolean): Promise<LsdPage | null>;
349
+
350
+ /**
351
+ * whether the given number of free page(s) can be obtained
352
+ * * refer to getPage()
353
+ * @param pageNum default 1, the number of free pages
354
+ */
355
+ hasFreePage(pageNum?: number): boolean;
356
+
357
+ id(): string;
358
+
359
+ isIncognito(): boolean;
360
+
361
+ page(pageIdx: number): LsdPage | null;
362
+
363
+ pages(): LsdPage[];
364
+
365
+ proxy(): ProxyInController | null; // reserved for future use
366
+
367
+ setStateData(stateData: BrowserStateData): Promise<boolean>;
368
+
369
+ _origBrowserContext(): AllBrowserContext;
370
+ }
371
+
372
+ ```
373
+
374
+ ### LsdBrowser
375
+ ```typescript
376
+ export interface LsdBrowser extends EventEmitter {
377
+ // By default, constructor can be called in LsdBrowserController.launch/connect to create new instance
378
+ // main methods
379
+ newBrowserContext(options?: LsdBrowserContextOptions): Promise<LsdBrowserContext | null>;
380
+ /**
381
+ * 1. launched: close all browserContexts and this browser
382
+ * 2. connected:
383
+ * * in puppeteer: close all browserContexts and this browser???
384
+ * * in playwright: only browserContexts created by newContext will be closed, browser is disconnected and browser will not be closed
385
+ */
386
+ close(): Promise<boolean>;
387
+
388
+ // other methods
389
+ browserContexts(): LsdBrowserContext[];
390
+ browserControllerType(): BrowserControllerType;
391
+ browserCreationMethod(): BrowserCreationMethod;
392
+ browserType(): LsdBrowserType;
393
+
394
+ /**
395
+ * does this browser meet browserContextRequirements (incognitos ignored in browser)?
396
+ * @param browserContextRequirements
397
+ */
398
+ doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;
399
+
400
+ /**
401
+ * @returns
402
+ * 1. launched: actual executable path
403
+ * 2. connected: exectuablePath in LsdConnectOptions, default ""(unknown)
404
+ */
405
+ executablePath(): string;
406
+
407
+ id(): string;
408
+ isConnected(): boolean;
409
+ isHeadless(): boolean;
410
+ options(): LsdLaunchOptions | LsdConnectOptions;
411
+ /**
412
+ * * puppeteer: return pid of connected or launched browser
413
+ * * playwright: return pid of connected browser that is launched manually or using launchServer, or else return 0
414
+ */
415
+ pid(): number;
416
+ /**
417
+ * get the CPU utilization(%) and memory usage(MB) of browser processes if pid is greater than 0 (refer to pid())
418
+ */
419
+ pidUsage(): Promise<{ cpu: number, memory: number }>;
420
+ version(): Promise<string>; // playwright: sync; puppeteer: async
421
+
422
+ _origBrowser(): AllBrowser;
423
+ }
424
+ ```
425
+
426
+ ### LsdBrowserController
427
+ ```typescript
428
+ export interface LsdBrowserController {
429
+ /**
430
+ * launch a new browser using related browser controller
431
+ * @param browserControllerType
432
+ * @param browserType
433
+ * @param options
434
+ */
435
+ launch(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdLaunchOptions): Promise<LsdBrowser>;
436
+
437
+ /**
438
+ * connect to the current browser using related browser controller
439
+ * @param browserControllerType
440
+ * @param browserType
441
+ * @param options
442
+ */
443
+ connect(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdConnectOptions): Promise<LsdBrowser>;
444
+
445
+ /**
446
+ *
447
+ * @param puppeteer null means use puppeteer-extra-plugin-stealth based on puppeteer-extra
448
+ */
449
+ setPuppeteerNode(puppeteer: PuppeteerNode | null): boolean;
450
+
451
+ /**
452
+ *
453
+ * @param playwrightBrowserType null means use puppeteer-extra-plugin-stealth based on playwright-extra
454
+ */
455
+ setPlaywrightBrowserType(browserType: LsdBrowserType, playwrightBrowserType: BrowserType | null): boolean;
456
+
457
+ /**
458
+ * Create a new LsdApiContext, valid in playwright;
459
+ */
460
+ newApiContext(options?: LsdApiContextOptions): Promise<LsdApiContext>;
461
+ }
462
+ ```