portadom 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -58,7 +58,9 @@ const btnText = await btn.text();
|
|
|
58
58
|
npm install portadom
|
|
59
59
|
```
|
|
60
60
|
|
|
61
|
-
##
|
|
61
|
+
## How to use
|
|
62
|
+
|
|
63
|
+
### Minimal example
|
|
62
64
|
|
|
63
65
|
```js
|
|
64
66
|
const html = `<div>
|
|
@@ -75,6 +77,59 @@ const btnProp = await btn.href();
|
|
|
75
77
|
// btnProp == "https://example.com#"
|
|
76
78
|
```
|
|
77
79
|
|
|
80
|
+
### Full example
|
|
81
|
+
|
|
82
|
+
```js
|
|
83
|
+
const $ = loadCheerio(html);
|
|
84
|
+
const dom = cheerioPortadom($.root(), url);
|
|
85
|
+
// ...
|
|
86
|
+
const rootEl = dom.root();
|
|
87
|
+
const url = await dom.url();
|
|
88
|
+
|
|
89
|
+
// Find and extract data
|
|
90
|
+
const entries = await rootEl.findMany('.list-row:not(.native-agent):not(.reach-list)')
|
|
91
|
+
.mapAsyncSerial(async (el) => {
|
|
92
|
+
const employerName = await el.findOne('.employer').text();
|
|
93
|
+
const employerUrl = await el.findOne('.offer-company-logo-link').href();
|
|
94
|
+
const employerLogoUrl = await el.findOne('.offer-company-logo-link img').src();
|
|
95
|
+
|
|
96
|
+
const offerUrlEl = el.findOne('h2 a');
|
|
97
|
+
const offerUrl = await offerUrlEl.href();
|
|
98
|
+
const offerName = await offerUrlEl.text();
|
|
99
|
+
const offerId = offerUrl?.match(/O\d{2,}/)?.[0] ?? null;
|
|
100
|
+
|
|
101
|
+
const location = await el.findOne('.job-location').text();
|
|
102
|
+
|
|
103
|
+
const salaryText = await el.findOne('.label-group > a[data-dimension7="Salary label"]').text();
|
|
104
|
+
|
|
105
|
+
const labels = await el.findMany('.label-group > a:not([data-dimension7="Salary label"])')
|
|
106
|
+
.mapAsyncSerial((el) => el.text())
|
|
107
|
+
.then((arr) => arr.filter(Boolean) as string[]);
|
|
108
|
+
|
|
109
|
+
const footerInfoEl = el.findOne('.list-footer .info');
|
|
110
|
+
const lastChangeRelativeTimeEl = footerInfoEl.findOne('strong');
|
|
111
|
+
const lastChangeRelativeTime = await lastChangeRelativeTimeEl.text();
|
|
112
|
+
// Remove the element so it's easier to get the text content
|
|
113
|
+
await lastChangeRelativeTimeEl.remove();
|
|
114
|
+
const lastChangeTypeText = await footerInfoEl.textAsLower();
|
|
115
|
+
const lastChangeType = lastChangeTypeText === 'pridané' ? 'added' : 'modified';
|
|
116
|
+
|
|
117
|
+
return {
|
|
118
|
+
listingUrl: url,
|
|
119
|
+
employerName,
|
|
120
|
+
employerUrl,
|
|
121
|
+
employerLogoUrl,
|
|
122
|
+
offerName,
|
|
123
|
+
offerUrl,
|
|
124
|
+
offerId,
|
|
125
|
+
location,
|
|
126
|
+
labels,
|
|
127
|
+
lastChangeRelativeTime,
|
|
128
|
+
lastChangeType,
|
|
129
|
+
};
|
|
130
|
+
});
|
|
131
|
+
```
|
|
132
|
+
|
|
78
133
|
### Loading
|
|
79
134
|
|
|
80
135
|
Here is how you can load DOM in different environments:
|
|
@@ -200,3 +255,10 @@ const attrs = await Promise.all(mapPromises);
|
|
|
200
255
|
See the [full documentation here](./docs/typedoc/modules.md).
|
|
201
256
|
- [Portadom](./docs/typedoc/interfaces/Portadom.md)
|
|
202
257
|
- [Portapage](./docs/typedoc/interfaces/Portapage.md)
|
|
258
|
+
|
|
259
|
+
## Real-life examples
|
|
260
|
+
|
|
261
|
+
- [Profesia.sk Scraper](https://github.com/JuroOravec/apify-actor-profesia-sk)
|
|
262
|
+
- [Example 1](https://github.com/JuroOravec/apify-actor-profesia-sk/blob/3793915632bd81dc257d36699808635c8bc3f87e/src/pageActions/jobListing.ts#L128)
|
|
263
|
+
- [Example 2](https://github.com/JuroOravec/apify-actor-profesia-sk/blob/3793915632bd81dc257d36699808635c8bc3f87e/src/pageActions/jobDetail.ts#L75)
|
|
264
|
+
- [SKCRIS Scraper](https://github.com/JuroOravec/apify-actor-skcris/blob/9ce92f9bd55ffcde91f22744e49ba97b6b4f0e44/src/pageActions/detail.ts#L510)
|
package/package.json
CHANGED