portadom 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +202 -0
- package/dist/cjs/dom/dom.d.ts +19 -0
- package/dist/cjs/dom/dom.js +813 -0
- package/dist/cjs/dom/dom.js.map +1 -0
- package/dist/cjs/dom/domUtils.d.ts +42 -0
- package/dist/cjs/dom/domUtils.js +126 -0
- package/dist/cjs/dom/domUtils.js.map +1 -0
- package/dist/cjs/dom/types.d.ts +371 -0
- package/dist/cjs/dom/types.js +216 -0
- package/dist/cjs/dom/types.js.map +1 -0
- package/dist/cjs/index.d.ts +6 -0
- package/dist/cjs/index.js +22 -0
- package/dist/cjs/index.js.map +1 -0
- package/dist/cjs/page/page.d.ts +12 -0
- package/dist/cjs/page/page.js +105 -0
- package/dist/cjs/page/page.js.map +1 -0
- package/dist/cjs/page/pageUtils.d.ts +16 -0
- package/dist/cjs/page/pageUtils.js +116 -0
- package/dist/cjs/page/pageUtils.js.map +1 -0
- package/dist/cjs/page/types.d.ts +61 -0
- package/dist/cjs/page/types.js +3 -0
- package/dist/cjs/page/types.js.map +1 -0
- package/dist/cjs/utils/async.d.ts +19 -0
- package/dist/cjs/utils/async.js +74 -0
- package/dist/cjs/utils/async.js.map +1 -0
- package/dist/cjs/utils/error.d.ts +1 -0
- package/dist/cjs/utils/error.js +10 -0
- package/dist/cjs/utils/error.js.map +1 -0
- package/dist/cjs/utils/format.d.ts +9 -0
- package/dist/cjs/utils/format.js +19 -0
- package/dist/cjs/utils/format.js.map +1 -0
- package/dist/cjs/utils/types.d.ts +6 -0
- package/dist/cjs/utils/types.js +9 -0
- package/dist/cjs/utils/types.js.map +1 -0
- package/dist/cjs/utils/url.d.ts +9 -0
- package/dist/cjs/utils/url.js +32 -0
- package/dist/cjs/utils/url.js.map +1 -0
- package/package.json +68 -0
package/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# Portadom
|
|
2
|
+
|
|
3
|
+
*Single DOM manipulation interface across Browser API, JSDOM, Cheerio, Playwright.*
|
|
4
|
+
|
|
5
|
+
If you write web scrapers, you will know that you have multiple ways of parsing and manipulating the HTML / DOM:
|
|
6
|
+
- Download the HTML and feed into JSDOM or Cheerio.
|
|
7
|
+
- Through browser automation like Playwright, Puppeteer, or Selenium.
|
|
8
|
+
- Or right from inside the DevTools console, if you need to test something out.
|
|
9
|
+
|
|
10
|
+
When I'm writing scrapers, my approach is usually:
|
|
11
|
+
1. Define the transformations in DevTools with vanilla JS.
|
|
12
|
+
2. Check if the HTML data can be extracted statically, just from the HTML (no JS).
|
|
13
|
+
3. If static HTML is enough, then migrate vanilla JS to JSDOM or Cheerio.
|
|
14
|
+
4. If I need JS runtime, migrate the vanilla JS to Playwright or other browser automation tool.
|
|
15
|
+
|
|
16
|
+
Migrating from one to another can be prone to errors, and you may miss some features.
|
|
17
|
+
|
|
18
|
+
Portadom takes care of this. Here's how you can move the same DOM manipulation logic from Cheerio to Playwright:
|
|
19
|
+
|
|
20
|
+
Before:
|
|
21
|
+
|
|
22
|
+
```js
|
|
23
|
+
import { load as loadCheerio } from 'cheerio';
|
|
24
|
+
import { cheerioPortadom } from 'portadom';
|
|
25
|
+
|
|
26
|
+
// Loading step changes
|
|
27
|
+
const html = `<div>
|
|
28
|
+
<a href="#">Click Me!</a>
|
|
29
|
+
</div>`;
|
|
30
|
+
const $ = loadCheerio(html);
|
|
31
|
+
const dom = cheerioPortadom($.root(), url);
|
|
32
|
+
|
|
33
|
+
// DOM manipulation remains the same
|
|
34
|
+
const btn = dom.findOne('a');
|
|
35
|
+
const btnText = await btn.text();
|
|
36
|
+
// btnText == "Click Me!"
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
After:
|
|
40
|
+
|
|
41
|
+
```js
|
|
42
|
+
import { playwrightLocatorPortadom } from 'portadom';
|
|
43
|
+
|
|
44
|
+
// Loading step changes
|
|
45
|
+
const page = await somehowLoadPage();
|
|
46
|
+
const bodyLoc = page.locator('body');
|
|
47
|
+
const dom = playwrightLocatorPortadom(bodyLoc, page);
|
|
48
|
+
|
|
49
|
+
// DOM manipulation remains the same
|
|
50
|
+
const btn = dom.findOne('a');
|
|
51
|
+
const btnText = await btn.text();
|
|
52
|
+
// btnText == "Click Me!"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
```sh
|
|
58
|
+
npm install portadom
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Basic usage
|
|
62
|
+
|
|
63
|
+
```js
|
|
64
|
+
const html = `<div>
|
|
65
|
+
<a href="#">Click Me!</a>
|
|
66
|
+
</div>`;
|
|
67
|
+
const $ = loadCheerio(html);
|
|
68
|
+
const dom = cheerioPortadom($.root(), url);
|
|
69
|
+
|
|
70
|
+
const btn = dom.findOne('a');
|
|
71
|
+
const btnText = await btn.text();
|
|
72
|
+
// btnText == "Click Me!"
|
|
73
|
+
|
|
74
|
+
const btnProp = await btn.href();
|
|
75
|
+
// btnProp == "https://example.com#"
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Loading
|
|
79
|
+
|
|
80
|
+
Here is how you can load DOM in different environments:
|
|
81
|
+
|
|
82
|
+
#### Browser
|
|
83
|
+
|
|
84
|
+
When working with the browser [Document](https://developer.mozilla.org/en-US/docs/Web/API/Document), the `node` is an [Element](https://developer.mozilla.org/en-US/docs/Web/API/Element).
|
|
85
|
+
|
|
86
|
+
```js
|
|
87
|
+
import { browserPortadom } from 'portadom';
|
|
88
|
+
|
|
89
|
+
const dom = browserPortadom(document.body);
|
|
90
|
+
const btnNode = await dom.findOne('a').node;
|
|
91
|
+
|
|
92
|
+
// Or
|
|
93
|
+
const startNode = document.querySelector('...');
|
|
94
|
+
const dom = browserPortadom(startNode);
|
|
95
|
+
const btnNode = await dom.findOne('a').node;
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
#### Cheerio
|
|
99
|
+
|
|
100
|
+
In [Cheerio](https://cheerio.js.org/), the `node` is the Cheerio Element wrapper. [See DOM traversal with Cheerio](https://cheerio.js.org/docs/basics/traversing).
|
|
101
|
+
|
|
102
|
+
```js
|
|
103
|
+
import { cheerioPortadom } from 'portadom';
|
|
104
|
+
import { load as loadCheerio } from 'cheerio';
|
|
105
|
+
|
|
106
|
+
const $ = loadCheerio(html);
|
|
107
|
+
const dom = cheerioPortadom($.root(), url);
|
|
108
|
+
const btnNode = await dom.findOne('a').node;
|
|
109
|
+
|
|
110
|
+
// Or
|
|
111
|
+
const startNode = $('a');
|
|
112
|
+
const dom = cheerioPortadom(startNode, url);
|
|
113
|
+
const btnNode = await dom.findOne('a').node;
|
|
114
|
+
|
|
115
|
+
// Set `null` if you don't have a URL for the HTML
|
|
116
|
+
const dom = cheerioPortadom($.root(), null);
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
#### Playwright (using Locators)
|
|
120
|
+
|
|
121
|
+
In [Playwright](https://playwright.dev/), you can either work with the [Locators](https://playwright.dev/docs/api/class-locator) or the [ElementHandles](https://playwright.dev/docs/api/class-elementhandle).
|
|
122
|
+
|
|
123
|
+
When using Locators, the `node` is a Locator instance.
|
|
124
|
+
|
|
125
|
+
```js
|
|
126
|
+
import { playwrightLocatorPortadom } from 'portadom';
|
|
127
|
+
|
|
128
|
+
const page = await somehowLoadPage();
|
|
129
|
+
const bodyLoc = page.locator('body');
|
|
130
|
+
const dom = playwrightLocatorPortadom(bodyLoc, page);
|
|
131
|
+
const btnNode = await dom.findOne('a').node;
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
#### Playwright (using Handles)
|
|
135
|
+
|
|
136
|
+
When using ElementHandles, the `node` is an ElementHandle instance.
|
|
137
|
+
|
|
138
|
+
NOTE: You can pass a Locator to `playwrightHandlePortadom`, but it will be converted to a JSHandle internally.
|
|
139
|
+
|
|
140
|
+
```js
|
|
141
|
+
import { playwrightHandlePortadom } from 'portadom';
|
|
142
|
+
|
|
143
|
+
const page = await somehowLoadPage();
|
|
144
|
+
|
|
145
|
+
// Use `evaluateHandle` with page-side logic to query the target element
|
|
146
|
+
const handle = await page.evaluateHandle(() => document.body);
|
|
147
|
+
const handle = await page.evaluateHandle(() => document.querySelector('.myClass'));
|
|
148
|
+
|
|
149
|
+
// Or use other helpers such as `getByText`
|
|
150
|
+
const handle = page.getByText('hello');
|
|
151
|
+
|
|
152
|
+
// Or use locators
|
|
153
|
+
const handle = page.locator('body');
|
|
154
|
+
|
|
155
|
+
const dom = playwrightHandlePortadom(handle, page);
|
|
156
|
+
const btnNode = await dom.findOne('a').node;
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Chaining
|
|
160
|
+
|
|
161
|
+
For cross-compatibility, each method on a Portadom instance returns
|
|
162
|
+
a Promise.
|
|
163
|
+
|
|
164
|
+
But this then leads to `then` / `await` hell when you need to call multiple methods in a row:
|
|
165
|
+
|
|
166
|
+
```js
|
|
167
|
+
const employerName = (await (await el.findOne('.employer'))?.text()) ?? null;
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
To get around that, the results are wrapped in a chainable instance. This applies to each method that returns a Portadom instance, or an array of Portadom instances.
|
|
171
|
+
|
|
172
|
+
So instead, we can call:
|
|
173
|
+
|
|
174
|
+
```js
|
|
175
|
+
const employerName = await el.findOne('.employer').text();
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
You don't have to chain the commands. Instead, you can access the associated promise under the `promise` property. For example, this:
|
|
179
|
+
|
|
180
|
+
```js
|
|
181
|
+
const mapPromises = await dom.findOne('ul')
|
|
182
|
+
.parent()
|
|
183
|
+
.findMany('li[data-id]')
|
|
184
|
+
.map((li) => li.attr('data-id'));
|
|
185
|
+
const attrs = await Promise.all(mapPromises);
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Is the same as:
|
|
189
|
+
|
|
190
|
+
```js
|
|
191
|
+
const ul = await dom.findOne('ul').promise;
|
|
192
|
+
const parent = await ul?.parent().promise;
|
|
193
|
+
const idEls = await parent?.findMany('li[data-id]').promise;
|
|
194
|
+
const mapPromises = idEls?.map((li) => li.attr('data-id')) ?? [];
|
|
195
|
+
const attrs = await Promise.all(mapPromises);
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Reference
|
|
199
|
+
|
|
200
|
+
See the [full documentation here](./docs/typedoc/modules.md).
|
|
201
|
+
- [Portadom](./docs/typedoc/interfaces/Portadom.md)
|
|
202
|
+
- [Portapage](./docs/typedoc/interfaces/Portapage.md)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { AnyNode, Cheerio } from 'cheerio';
|
|
2
|
+
import type { ElementHandle, Locator, Page } from 'playwright';
|
|
3
|
+
import { type Portadom } from './types';
|
|
4
|
+
/** Portadom type for the browser (Browser API): `Portadom` specialized to an `Element` subtype `T` (defaults to `Element`), with its second type argument fixed to `Element`. */
|
|
5
|
+
export type BrowserPortadom<T extends Element = Element> = Portadom<T, Element>;
|
|
6
|
+
/** Implementation of Portadom in browser (using Browser API). Takes the starting `Element` as `node` and returns a `BrowserPortadom` typed by that element type. */
|
|
7
|
+
export declare const browserPortadom: <El extends Element>(node: El) => BrowserPortadom<El>;
|
|
8
|
+
/** Portadom type for Cheerio: `Portadom` specialized to a `Cheerio<AnyNode>` subtype `El` (defaults to `Cheerio<AnyNode>`), with its second type argument fixed to `Cheerio<AnyNode>`. */
|
|
9
|
+
export type CheerioPortadom<El extends Cheerio<AnyNode> = Cheerio<AnyNode>> = Portadom<El, Cheerio<AnyNode>>;
|
|
10
|
+
/** Implementation of Portadom in Cheerio. Takes the starting Cheerio wrapper as `cheerioNode` and the source URL of the HTML as `srcUrl` (`null` when there is none; presumably used to resolve relative URLs - confirm against the implementation). */
|
|
11
|
+
export declare const cheerioPortadom: <El extends Cheerio<AnyNode>>(cheerioNode: El, srcUrl: string | null) => CheerioPortadom<El>;
|
|
12
|
+
/** Portadom type for Playwright using Handles: `Portadom` specialized to `El`, a `Locator` or `ElementHandle<Node>` (defaults to their union), with its second type argument fixed to `Locator | ElementHandle<Node>`. */
|
|
13
|
+
export type PlaywrightHandlePortadom<El extends Locator | ElementHandle<Node> = Locator | ElementHandle<Node>> = Portadom<El, Locator | ElementHandle<Node>>;
|
|
14
|
+
/** Implementation of Portadom in Playwright using Handles. Takes the starting `ElementHandle<Node>` or `Locator` as `node`, plus the Playwright `page`, and returns a `PlaywrightHandlePortadom` typed by `node`. */
|
|
15
|
+
export declare const playwrightHandlePortadom: <El extends ElementHandle<Node> | Locator>(node: El, page: Page) => PlaywrightHandlePortadom<El>;
|
|
16
|
+
/** Portadom type for Playwright using Locators: `Portadom` specialized to a `Locator` subtype `El` (defaults to `Locator`), with its second type argument fixed to `Locator`. */
|
|
17
|
+
export type PlaywrightLocatorPortadom<El extends Locator = Locator> = Portadom<El, Locator>;
|
|
18
|
+
/** Implementation of Portadom in Playwright using Locators. Takes the starting `Locator` as `node`, plus the Playwright `page`, and returns a `PlaywrightLocatorPortadom` typed by `node`. */
|
|
19
|
+
export declare const playwrightLocatorPortadom: <El extends Locator>(node: El, page: Page) => PlaywrightLocatorPortadom<El>;
|