@crawlee/playwright 3.7.4-beta.8 → 3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/index.d.ts +2 -0
- package/index.d.ts.map +1 -1
- package/index.js +1 -0
- package/index.js.map +1 -1
- package/index.mjs +3 -0
- package/internals/adaptive-playwright-crawler.d.ts +112 -0
- package/internals/adaptive-playwright-crawler.d.ts.map +1 -0
- package/internals/adaptive-playwright-crawler.js +269 -0
- package/internals/adaptive-playwright-crawler.js.map +1 -0
- package/internals/utils/playwright-utils.d.ts +2 -0
- package/internals/utils/playwright-utils.d.ts.map +1 -1
- package/internals/utils/playwright-utils.js +2 -0
- package/internals/utils/playwright-utils.js.map +1 -1
- package/internals/utils/rendering-type-prediction.d.ts +34 -0
- package/internals/utils/rendering-type-prediction.d.ts.map +1 -0
- package/internals/utils/rendering-type-prediction.js +108 -0
- package/internals/utils/rendering-type-prediction.js.map +1 -0
- package/package.json +11 -6
- package/tsconfig.build.tsbuildinfo +1 -1
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.RenderingTypePredictor = void 0;
|
|
4
|
+
const tslib_1 = require("tslib");
|
|
5
|
+
const ml_logistic_regression_1 = tslib_1.__importDefault(require("ml-logistic-regression"));
|
|
6
|
+
const ml_matrix_1 = require("ml-matrix");
|
|
7
|
+
const string_comparison_1 = tslib_1.__importDefault(require("string-comparison"));
|
|
8
|
+
/**
 * Breaks a URL down into the ordered list of components used for similarity comparison.
 *
 * @param {URL} url Object exposing `hostname` and `pathname` string properties.
 * @returns {string[]} The hostname followed by every piece of the pathname split on `/`
 *   (a pathname starting with `/` therefore contributes a leading empty string).
 */
const urlComponents = (url) => {
    const { hostname, pathname } = url;
    return [hostname].concat(pathname.split('/'));
};
|
|
11
|
+
/**
 * Scores how alike two URLs are, given their component lists (hostname first, then path pieces).
 *
 * @param {string[]} a Components of the first URL.
 * @param {string[]} b Components of the second URL.
 * @returns {number} 0 when the hostnames (index 0) differ; otherwise the number of path positions
 *   whose Jaro-Winkler similarity exceeds 0.8, divided by the length of the longer component list.
 */
const calculateUrlSimilarity = (a, b) => {
    // URLs on different hostnames are never considered similar.
    if (a[0] !== b[0]) {
        return 0;
    }

    const longest = Math.max(a.length, b.length);
    const matches = [];

    // Index 0 (the hostname) was already compared above; a missing component counts as ''.
    for (let index = 1; index < longest; index += 1) {
        const similarity = string_comparison_1.default.jaroWinkler.similarity(a[index] ?? '', b[index] ?? '');
        matches.push(similarity > 0.8 ? 1 : 0);
    }

    // The divisor is the full component count (hostname slot included), not the number of compared positions.
    return sum(matches) / longest;
};
|
|
21
|
+
/**
 * Adds up a list of numbers.
 *
 * The explicit initial value of 0 makes an empty list sum to 0; without it, `reduce` throws a
 * TypeError when called on an empty array.
 *
 * @param {number[]} values Numbers to add together.
 * @returns {number} The total of all values, or 0 when the list is empty.
 */
const sum = (values) => values.reduce((acc, value) => acc + value, 0);
|
|
22
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values Numbers to average.
 * @returns {number|undefined} The mean, or `undefined` when the list is empty.
 */
const mean = (values) => {
    if (values.length > 0) {
        return sum(values) / values.length;
    }
    return undefined;
};
|
|
23
|
+
/**
 * Remembers which rendering type was detected for URLs that were already crawled, predicts the
 * rendering type of URLs that have not been crawled yet, and recommends how likely a fresh
 * rendering type detection should be performed.
 *
 * @experimental
 */
class RenderingTypePredictor {
    /**
     * @param {object} options
     * @param {number} options.detectionRatio Base factor used when computing the detection
     *   probability recommendation in `predict`.
     */
    constructor({ detectionRatio }) {
        // Instance fields are declared as enumerable, configurable, writable own properties
        // before they receive their real values.
        const fieldInitializers = [
            ['renderingTypeDetectionResults', new Map()],
            ['detectionRatio', undefined],
            ['logreg', undefined],
        ];
        for (const [fieldName, initialValue] of fieldInitializers) {
            Object.defineProperty(this, fieldName, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: initialValue,
            });
        }

        this.detectionRatio = detectionRatio;
        this.logreg = new ml_logistic_regression_1.default({ numSteps: 1000, learningRate: 0.05 });
    }

    /**
     * Predict the rendering type for a given URL and request label.
     *
     * @param {URL} url URL to classify.
     * @param {*} label Request label used to look up previously stored results.
     * @returns {{ renderingType: string, detectionProbabilityRecommendation: number }}
     *   `renderingType` is `'static'` or `'clientOnly'`.
     */
    predict(url, label) {
        // No classifiers yet: answer 'clientOnly' and recommend detection with value 1.
        if (this.logreg.classifiers.length === 0) {
            return { renderingType: 'clientOnly', detectionProbabilityRecommendation: 1 };
        }

        const components = urlComponents(url);
        const urlFeature = new ml_matrix_1.Matrix([this.calculateFeatureVector(components, label)]);
        const [prediction] = this.logreg.predict(urlFeature);
        const [firstScore, secondScore] = [0, 1].map((index) => this.logreg.classifiers[index].testScores(urlFeature));

        const renderingType = prediction === 1 ? 'static' : 'clientOnly';

        // When the two classifier scores are within 0.1 of each other the recommendation stays at 1;
        // otherwise it is the detection ratio, boosted while fewer than 5 results exist for the label.
        let detectionProbabilityRecommendation = 1;
        if (!(Math.abs(firstScore - secondScore) < 0.1)) {
            detectionProbabilityRecommendation = this.detectionRatio * Math.max(1, 5 - this.resultCount(label));
        }

        return { renderingType, detectionProbabilityRecommendation };
    }

    /**
     * Store the rendering type for a given URL and request label. The prediction model is
     * retrained on every call, which may be costly.
     *
     * @param {URL} url URL whose rendering type was detected.
     * @param {*} label Request label the URL belongs to.
     * @param {string} renderingType Detected rendering type (`'static'` is the value treated as class 1 when retraining).
     */
    storeResult(url, label, renderingType) {
        const resultsByType = this.renderingTypeDetectionResults;
        if (!resultsByType.has(renderingType)) {
            resultsByType.set(renderingType, new Map());
        }

        const resultsByLabel = resultsByType.get(renderingType);
        if (!resultsByLabel.has(label)) {
            resultsByLabel.set(label, []);
        }

        resultsByLabel.get(label).push(urlComponents(url));
        this.retrain();
    }

    /**
     * Counts the stored results for a label across all rendering types.
     *
     * @param {*} label Request label to count results for.
     * @returns {number} Total number of stored URLs for the label.
     */
    resultCount(label) {
        let total = 0;
        for (const resultsByLabel of this.renderingTypeDetectionResults.values()) {
            total += resultsByLabel.get(label)?.length ?? 0;
        }
        return total;
    }

    /**
     * Builds the two-element feature vector for a URL: its mean similarity to the stored
     * `'static'` URLs and to the stored `'clientOnly'` URLs of the same label (0 when none exist).
     *
     * @param {string[]} url URL components (hostname first, then path pieces).
     * @param {*} label Request label selecting which stored URLs to compare against.
     * @returns {number[]} `[meanSimilarityToStatic, meanSimilarityToClientOnly]`.
     */
    calculateFeatureVector(url, label) {
        const meanSimilarityTo = (renderingType) => {
            const storedUrls = this.renderingTypeDetectionResults.get(renderingType)?.get(label) ?? [];
            const similarities = storedUrls.map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0);
            return mean(similarities) ?? 0;
        };

        return [meanSimilarityTo('static'), meanSimilarityTo('clientOnly')];
    }

    /**
     * Retrains the logistic regression model from every stored result.
     */
    retrain() {
        // Two fixed samples, one per class, are always part of the training set.
        const features = [
            [0, 1],
            [1, 0],
        ];
        const targets = [0, 1];

        this.renderingTypeDetectionResults.forEach((urlsByLabel, renderingType) => {
            urlsByLabel.forEach((urls, label) => {
                for (const url of urls) {
                    features.push(this.calculateFeatureVector(url, label));
                    targets.push(renderingType === 'static' ? 1 : 0);
                }
            });
        });

        this.logreg.train(new ml_matrix_1.Matrix(features), ml_matrix_1.Matrix.columnVector(targets));
    }
}
|
|
107
|
+
exports.RenderingTypePredictor = RenderingTypePredictor;
|
|
108
|
+
//# sourceMappingURL=rendering-type-prediction.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rendering-type-prediction.js","sourceRoot":"","sources":["../../../src/internals/utils/rendering-type-prediction.ts"],"names":[],"mappings":";;;;AAAA,4FAAwD;AACxD,yCAAmC;AACnC,kFAAiD;AAMjD,MAAM,aAAa,GAAG,CAAC,GAAQ,EAAiB,EAAE;IAC9C,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,GAAG,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,sBAAsB,GAAG,CAAC,CAAgB,EAAE,CAAgB,EAAsB,EAAE;IACtF,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAChB,OAAO,CAAC,CAAC;IACb,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACpD,MAAM,CAAC,IAAI,CAAC,2BAAgB,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/F,CAAC;IAED,OAAO,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,GAAG,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC;AAC7E,MAAM,IAAI,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;AASjG;;;;GAIG;AACH,MAAa,sBAAsB;IAK/B,YAAY,EAAE,cAAc,EAAiC;QAJrD;;;;mBAAgC,IAAI,GAAG,EAA2D;WAAC;QACnG;;;;;WAAuB;QACvB;;;;;WAA2B;QAG/B,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;QACrC,IAAI,CAAC,MAAM,GAAG,IAAI,gCAAkB,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC;IACjF,CAAC;IAED;;OAEG;IACI,OAAO,CAAC,GAAQ,EAAE,KAAyB;QAC9C,IAAI,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvC,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,kCAAkC,EAAE,CAAC,EAAE,CAAC;QAClF,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,kBAAM,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,aAAa,CAAC,GAAG,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC;QACxF,MAAM,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;QACrD,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,CA
AC,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC;QAEtH,OAAO;YACH,aAAa,EAAE,UAAU,KAAK,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,YAAY;YACzD,kCAAkC,EAAE,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;SACjJ,CAAC;IACN,CAAC;IAED;;OAEG;IACI,WAAW,CAAC,GAAQ,EAAE,KAAyB,EAAE,aAA4B;QAChF,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC;YACzD,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;QACrE,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YACrE,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC1E,CAAC;QAED,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;QAC5F,IAAI,CAAC,OAAO,EAAE,CAAC;IACnB,CAAC;IAEO,WAAW,CAAC,KAAyB;QACzC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,MAAM,EAAE,CAAC;aACzD,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,MAAM,IAAI,CAAC,CAAC;aACjD,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC;IAChD,CAAC;IAES,sBAAsB,CAAC,GAAkB,EAAE,KAAyB;QAC1E,OAAO;YACH,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;YAC7I,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,YAAY,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;SACpJ,CAAC;IACN,CAAC;IAES,OAAO;QACb,MAAM,CAAC,GAAoB;YACvB,CAAC,CAAC,EAAE,CAAC,CAAC;YACN,CAAC,CAAC,EAAE,CAAC,CAAC;SACT,CAAC;QACF,MAAM,CAAC,GAAa,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAE3B,KAAK,MAAM,CAAC,aAAa,EAAE,WAAW,CAAC,IAAI,IAAI,CAAC,6BAA6B,CAAC,OAAO,EAAE,EAAE,CAAC;YACtF,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI
,CAAC,IAAI,WAAW,EAAE,CAAC;gBACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;oBACrB,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,sBAAsB,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,CAAC;oBAChD,CAAC,CAAC,IAAI,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC/C,CAAC;YACL,CAAC;QACL,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,kBAAM,CAAC,CAAC,CAAC,EAAE,kBAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7D,CAAC;CACJ;AA3ED,wDA2EC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/playwright",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.8.0",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=16.0.0"
|
|
@@ -55,14 +55,19 @@
|
|
|
55
55
|
"dependencies": {
|
|
56
56
|
"@apify/datastructures": "^2.0.0",
|
|
57
57
|
"@apify/log": "^2.4.0",
|
|
58
|
-
"@
|
|
59
|
-
"@crawlee/browser
|
|
60
|
-
"@crawlee/
|
|
61
|
-
"@crawlee/
|
|
58
|
+
"@apify/timeout": "^0.3.1",
|
|
59
|
+
"@crawlee/browser": "3.8.0",
|
|
60
|
+
"@crawlee/browser-pool": "3.8.0",
|
|
61
|
+
"@crawlee/core": "3.8.0",
|
|
62
|
+
"@crawlee/types": "3.8.0",
|
|
63
|
+
"@crawlee/utils": "3.8.0",
|
|
62
64
|
"cheerio": "^1.0.0-rc.12",
|
|
63
65
|
"idcac-playwright": "^0.1.2",
|
|
64
66
|
"jquery": "^3.6.0",
|
|
67
|
+
"lodash.isequal": "^4.5.0",
|
|
68
|
+
"ml-logistic-regression": "^2.0.0",
|
|
65
69
|
"ow": "^0.28.1",
|
|
70
|
+
"string-comparison": "^1.3.0",
|
|
66
71
|
"tslib": "^2.4.0"
|
|
67
72
|
},
|
|
68
73
|
"peerDependencies": {
|
|
@@ -80,5 +85,5 @@
|
|
|
80
85
|
}
|
|
81
86
|
}
|
|
82
87
|
},
|
|
83
|
-
"gitHead": "
|
|
88
|
+
"gitHead": "bd430d3de22c0a9f064cb00654a6cad3bc6cd601"
|
|
84
89
|
}
|