portadom 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +202 -0
  2. package/dist/cjs/dom/dom.d.ts +19 -0
  3. package/dist/cjs/dom/dom.js +813 -0
  4. package/dist/cjs/dom/dom.js.map +1 -0
  5. package/dist/cjs/dom/domUtils.d.ts +42 -0
  6. package/dist/cjs/dom/domUtils.js +126 -0
  7. package/dist/cjs/dom/domUtils.js.map +1 -0
  8. package/dist/cjs/dom/types.d.ts +371 -0
  9. package/dist/cjs/dom/types.js +216 -0
  10. package/dist/cjs/dom/types.js.map +1 -0
  11. package/dist/cjs/index.d.ts +6 -0
  12. package/dist/cjs/index.js +22 -0
  13. package/dist/cjs/index.js.map +1 -0
  14. package/dist/cjs/page/page.d.ts +12 -0
  15. package/dist/cjs/page/page.js +105 -0
  16. package/dist/cjs/page/page.js.map +1 -0
  17. package/dist/cjs/page/pageUtils.d.ts +16 -0
  18. package/dist/cjs/page/pageUtils.js +116 -0
  19. package/dist/cjs/page/pageUtils.js.map +1 -0
  20. package/dist/cjs/page/types.d.ts +61 -0
  21. package/dist/cjs/page/types.js +3 -0
  22. package/dist/cjs/page/types.js.map +1 -0
  23. package/dist/cjs/utils/async.d.ts +19 -0
  24. package/dist/cjs/utils/async.js +74 -0
  25. package/dist/cjs/utils/async.js.map +1 -0
  26. package/dist/cjs/utils/error.d.ts +1 -0
  27. package/dist/cjs/utils/error.js +10 -0
  28. package/dist/cjs/utils/error.js.map +1 -0
  29. package/dist/cjs/utils/format.d.ts +9 -0
  30. package/dist/cjs/utils/format.js +19 -0
  31. package/dist/cjs/utils/format.js.map +1 -0
  32. package/dist/cjs/utils/types.d.ts +6 -0
  33. package/dist/cjs/utils/types.js +9 -0
  34. package/dist/cjs/utils/types.js.map +1 -0
  35. package/dist/cjs/utils/url.d.ts +9 -0
  36. package/dist/cjs/utils/url.js +32 -0
  37. package/dist/cjs/utils/url.js.map +1 -0
  38. package/package.json +68 -0
package/README.md ADDED
@@ -0,0 +1,202 @@
1
+ # Portadom
2
+
3
+ *Single DOM manipulation interface across Browser API, JSDOM, Cheerio, Playwright.*
4
+
5
+ If you write web scrapers, you will know that you have multiple ways of parsing and manipulating the HTML / DOM:
6
+ - Download the HTML and feed into JSDOM or Cheerio.
7
+ - Through browser automation like Playwright, Puppeteer, or Selenium.
8
+ - Or right from inside the DevTools console, if you need to test something out.
9
+
10
+ When I'm writing scrapers, my approach is usually:
11
+ 1. Define the transformations in DevTools with vanilla JS.
12
+ 2. Check if the HTML data can be extracted statically, just from the HTML (no JS).
13
+ 3. If static HTML is enough, then migrate vanilla JS to JSDOM or Cheerio.
14
+ 4. If I need JS runtime, migrate the vanilla JS to Playwright or other browser automation tool.
15
+
16
+ Migrating from one to another can be prone to errors, and you may miss some features.
17
+
18
+ Portadom takes care of this. Here's how you can move the same DOM manipulation logic from Cheerio to Playwright:
19
+
20
+ Before:
21
+
22
+ ```js
23
+ import { load as loadCheerio } from 'cheerio';
24
+ import { cheerioPortadom } from 'portadom';
25
+
26
+ // Loading step changes
27
+ const html = `<div>
28
+ <a href="#">Click Me!</a>
29
+ </div>`;
30
+ const $ = loadCheerio(html);
31
+ const dom = cheerioPortadom($.root(), url);
32
+
33
+ // DOM manipulation remains the same
34
+ const btn = dom.findOne('a');
35
+ const btnText = await btn.text();
36
+ // btnText == "Click Me!"
37
+ ```
38
+
39
+ After:
40
+
41
+ ```js
42
+ import { playwrightLocatorPortadom } from 'portadom';
43
+
44
+ // Loading step changes
45
+ const page = await somehowLoadPage();
46
+ const bodyLoc = page.locator('body');
47
+ const dom = playwrightLocatorPortadom(bodyLoc, page);
48
+
49
+ // DOM manipulation remains the same
50
+ const btn = dom.findOne('a');
51
+ const btnText = await btn.text();
52
+ // btnText == "Click Me!"
53
+ ```
54
+
55
+ ## Installation
56
+
57
+ ```sh
58
+ npm install portadom
59
+ ```
60
+
61
+ ## Basic usage
62
+
63
+ ```js
64
+ const html = `<div>
65
+ <a href="#">Click Me!</a>
66
+ </div>`;
67
+ const $ = loadCheerio(html);
68
+ const dom = cheerioPortadom($.root(), url);
69
+
70
+ const btn = dom.findOne('a');
71
+ const btnText = await btn.text();
72
+ // btnText == "Click Me!"
73
+
74
+ const btnProp = await btn.href();
75
+ // btnProp == "https://example.com#"
76
+ ```
77
+
78
+ ### Loading
79
+
80
+ Here is how you can load DOM in different environments:
81
+
82
+ #### Browser
83
+
84
+ When working with browser [Document](https://developer.mozilla.org/en-US/docs/Web/API/Document), the `node` is an [Element](https://developer.mozilla.org/en-US/docs/Web/API/Element).
85
+
86
+ ```js
87
+ import { browserPortadom } from 'portadom';
88
+
89
+ const dom = browserPortadom(document.body);
90
+ const btnNode = await dom.findOne('a').node;
91
+
92
+ // Or
93
+ const startNode = document.querySelector('...');
94
+ const dom = browserPortadom(startNode);
95
+ const btnNode = await dom.findOne('a').node;
96
+ ```
97
+
98
+ #### Cheerio
99
+
100
+ In [Cheerio](https://cheerio.js.org/), the `node` is the Cheerio Element wrapper. [See DOM traversal with Cheerio](https://cheerio.js.org/docs/basics/traversing).
101
+
102
+ ```js
103
+ import { cheerioPortadom } from 'portadom';
104
+ import { load as loadCheerio } from 'cheerio';
105
+
106
+ const $ = loadCheerio(html);
107
+ const dom = cheerioPortadom($.root(), url);
108
+ const btnNode = await dom.findOne('a').node;
109
+
110
+ // Or
111
+ const startNode = $('a');
112
+ const dom = cheerioPortadom(startNode, url);
113
+ const btnNode = await dom.findOne('a').node;
114
+
115
+ // Pass `null` if you don't have a URL for the HTML
116
+ const dom = cheerioPortadom($.root(), null);
117
+ ```
118
+
119
+ #### Playwright (using Locators)
120
+
121
+ In [Playwright](https://playwright.dev/), you can either work with the [Locators](https://playwright.dev/docs/api/class-locator) or the [ElementHandles](https://playwright.dev/docs/api/class-elementhandle).
122
+
123
+ When using Locators, the `node` is a Locator instance.
124
+
125
+ ```js
126
+ import { playwrightLocatorPortadom } from 'portadom';
127
+
128
+ const page = await somehowLoadPage();
129
+ const bodyLoc = page.locator('body');
130
+ const dom = playwrightLocatorPortadom(bodyLoc, page);
131
+ const btnNode = await dom.findOne('a').node;
132
+ ```
133
+
134
+ #### Playwright (using Handles)
135
+
136
+ When using ElementHandles, the `node` is an ElementHandle instance.
137
+
138
+ NOTE: You can pass a Locator to `playwrightHandlePortadom`, but it will be converted to a JSHandle internally.
139
+
140
+ ```js
141
+ import { playwrightHandlePortadom } from 'portadom';
142
+
143
+ const page = await somehowLoadPage();
144
+
145
+ // Use `evaluateHandle` with page-side logic to query the target element
146
+ const handle = await page.evaluateHandle(() => document.body);
147
+ const handle = await page.evaluateHandle(() => document.querySelector('.myClass'));
148
+
149
+ // Or use other helpers such as `getByText`
150
+ const handle = await page.getByText('hello');
151
+
152
+ // Or use locators
153
+ const handle = page.locator('body');
154
+
155
+ const dom = playwrightHandlePortadom(handle, page);
156
+ const btnNode = await dom.findOne('a').node;
157
+ ```
158
+
159
+ ### Chaining
160
+
161
+ For cross-compatibility, each method on a Portadom instance returns
162
+ a Promise.
163
+
164
+ But this then leads to `then` / `await` hell when you need to call multiple methods in a row:
165
+
166
+ ```js
167
+ const employerName = (await (await el.findOne('.employer'))?.text()) ?? null;
168
+ ```
169
+
170
+ To get around that, the results are wrapped in a chainable instance. This applies to each method that returns a Portadom instance, or an array of Portadom instances.
171
+
172
+ So instead, we can call:
173
+
174
+ ```js
175
+ const employerName = await el.findOne('.employer').text();
176
+ ```
177
+
178
+ You don't have to chain the commands. Instead, you can access the underlying promise via the `promise` property. For example, this:
179
+
180
+ ```js
181
+ const mapPromises = await dom.findOne('ul')
182
+ .parent()
183
+ .findMany('li[data-id]')
184
+ .map((li) => li.attr('data-id'));
185
+ const attrs = await Promise.all(mapPromises);
186
+ ```
187
+
188
+ Is the same as:
189
+
190
+ ```js
191
+ const ul = await dom.findOne('ul').promise;
192
+ const parent = await ul?.parent().promise;
193
+ const idEls = await parent?.findMany('li[data-id]').promise;
194
+ const mapPromises = idEls?.map((li) => li.attr('data-id')) ?? [];
195
+ const attrs = await Promise.all(mapPromises);
196
+ ```
197
+
198
+ ## Reference
199
+
200
+ See the [full documentation here](./docs/typedoc/modules.md).
201
+ - [Portadom](./docs/typedoc/interfaces/Portadom.md)
202
+ - [Portapage](./docs/typedoc/interfaces/Portapage.md)
@@ -0,0 +1,19 @@
1
+ import type { AnyNode, Cheerio } from 'cheerio';
2
+ import type { ElementHandle, Locator, Page } from 'playwright';
3
+ import { type Portadom } from './types';
4
+ /** Portadom type specialised to browser `Element` nodes (Browser API); `T` is the concrete element type being wrapped. */
5
+ export type BrowserPortadom<T extends Element = Element> = Portadom<T, Element>;
6
+ /** Factory for the Browser API implementation: takes the starting `Element` (`node`) and returns a {@link BrowserPortadom} typed to that element. */
7
+ export declare const browserPortadom: <El extends Element>(node: El) => BrowserPortadom<El>;
8
+ /** Portadom type specialised to Cheerio wrappers (`Cheerio<AnyNode>`); `El` is the concrete wrapper type being wrapped. */
9
+ export type CheerioPortadom<El extends Cheerio<AnyNode> = Cheerio<AnyNode>> = Portadom<El, Cheerio<AnyNode>>;
10
+ /** Factory for the Cheerio implementation: takes the starting Cheerio wrapper (`cheerioNode`) and the URL of the source HTML (`srcUrl`, or `null` when there is none) and returns a {@link CheerioPortadom}. NOTE(review): `srcUrl` is presumably used to resolve relative URLs such as `href` - confirm against the implementation. */
11
+ export declare const cheerioPortadom: <El extends Cheerio<AnyNode>>(cheerioNode: El, srcUrl: string | null) => CheerioPortadom<El>;
12
+ /** Portadom type for the Playwright handle-based implementation; the wrapped node is either a `Locator` or an `ElementHandle<Node>`. */
13
+ export type PlaywrightHandlePortadom<El extends Locator | ElementHandle<Node> = Locator | ElementHandle<Node>> = Portadom<El, Locator | ElementHandle<Node>>;
14
+ /** Factory for the Playwright handle-based implementation: takes the starting `ElementHandle` or `Locator` (`node`) plus the Playwright `page` and returns a {@link PlaywrightHandlePortadom}. */
15
+ export declare const playwrightHandlePortadom: <El extends ElementHandle<Node> | Locator>(node: El, page: Page) => PlaywrightHandlePortadom<El>;
16
+ /** Portadom type for the Playwright locator-based implementation; the wrapped node is a `Locator`. */
17
+ export type PlaywrightLocatorPortadom<El extends Locator = Locator> = Portadom<El, Locator>;
18
+ /** Factory for the Playwright locator-based implementation: takes the starting `Locator` (`node`) plus the Playwright `page` and returns a {@link PlaywrightLocatorPortadom}. */
19
+ export declare const playwrightLocatorPortadom: <El extends Locator>(node: El, page: Page) => PlaywrightLocatorPortadom<El>;