@crawlee/playwright 4.0.0-beta.2 → 4.0.0-beta.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/internals/adaptive-playwright-crawler.d.ts +38 -54
- package/internals/adaptive-playwright-crawler.d.ts.map +1 -1
- package/internals/adaptive-playwright-crawler.js +244 -169
- package/internals/adaptive-playwright-crawler.js.map +1 -1
- package/internals/enqueue-links/click-elements.d.ts +10 -0
- package/internals/enqueue-links/click-elements.d.ts.map +1 -1
- package/internals/enqueue-links/click-elements.js +14 -2
- package/internals/enqueue-links/click-elements.js.map +1 -1
- package/internals/playwright-crawler.d.ts +19 -42
- package/internals/playwright-crawler.d.ts.map +1 -1
- package/internals/playwright-crawler.js +47 -9
- package/internals/playwright-crawler.js.map +1 -1
- package/internals/utils/playwright-utils.d.ts +10 -3
- package/internals/utils/playwright-utils.d.ts.map +1 -1
- package/internals/utils/playwright-utils.js +36 -38
- package/internals/utils/playwright-utils.js.map +1 -1
- package/internals/utils/rendering-type-prediction.d.ts +8 -3
- package/internals/utils/rendering-type-prediction.d.ts.map +1 -1
- package/internals/utils/rendering-type-prediction.js +22 -10
- package/internals/utils/rendering-type-prediction.js.map +1 -1
- package/package.json +12 -8
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { RecoverableState } from '@crawlee/core';
|
|
1
2
|
import LogisticRegression from 'ml-logistic-regression';
|
|
2
3
|
import { Matrix } from 'ml-matrix';
|
|
3
4
|
import stringComparison from 'string-comparison';
|
|
@@ -24,25 +25,36 @@ const mean = (values) => (values.length > 0 ? sum(values) / values.length : unde
|
|
|
24
25
|
export class RenderingTypePredictor {
|
|
25
26
|
renderingTypeDetectionResults = new Map();
|
|
26
27
|
detectionRatio;
|
|
27
|
-
logreg;
|
|
28
|
-
constructor({ detectionRatio }) {
|
|
28
|
+
state;
|
|
29
|
+
constructor({ detectionRatio, persistenceOptions }) {
|
|
29
30
|
this.detectionRatio = detectionRatio;
|
|
30
|
-
this.logreg = new LogisticRegression({ numSteps: 1000, learningRate: 0.05 });
|
|
31
|
+
this.state = new RecoverableState({
|
|
32
|
+
defaultState: { logreg: new LogisticRegression({ numSteps: 1000, learningRate: 0.05 }) },
|
|
33
|
+
serialize: (state) => JSON.stringify({ logreg: state.logreg.toJSON() }),
|
|
34
|
+
deserialize: (serializedState) => ({ logreg: LogisticRegression.load(JSON.parse(serializedState).logreg) }),
|
|
35
|
+
persistStateKey: 'rendering-type-predictor-state',
|
|
36
|
+
persistenceEnabled: true,
|
|
37
|
+
...persistenceOptions,
|
|
38
|
+
});
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Initialize the predictor by restoring persisted state.
|
|
42
|
+
*/
|
|
43
|
+
async initialize() {
|
|
44
|
+
await this.state.initialize();
|
|
31
45
|
}
|
|
32
46
|
/**
|
|
33
47
|
* Predict the rendering type for a given URL and request label.
|
|
34
48
|
*/
|
|
35
49
|
predict({ url, loadedUrl, label }) {
|
|
36
|
-
if (this.logreg.classifiers.length === 0) {
|
|
50
|
+
const { logreg } = this.state.currentValue;
|
|
51
|
+
if (logreg.classifiers.length === 0) {
|
|
37
52
|
return { renderingType: 'clientOnly', detectionProbabilityRecommendation: 1 };
|
|
38
53
|
}
|
|
39
54
|
const predictionUrl = new URL(loadedUrl ?? url);
|
|
40
55
|
const urlFeature = new Matrix([this.calculateFeatureVector(urlComponents(predictionUrl), label)]);
|
|
41
|
-
const [prediction] = this.logreg.predict(urlFeature);
|
|
42
|
-
const scores = [
|
|
43
|
-
this.logreg.classifiers[0].testScores(urlFeature),
|
|
44
|
-
this.logreg.classifiers[1].testScores(urlFeature),
|
|
45
|
-
];
|
|
56
|
+
const [prediction] = logreg.predict(urlFeature);
|
|
57
|
+
const scores = [logreg.classifiers[0].testScores(urlFeature), logreg.classifiers[1].testScores(urlFeature)];
|
|
46
58
|
return {
|
|
47
59
|
renderingType: prediction === 1 ? 'static' : 'clientOnly',
|
|
48
60
|
detectionProbabilityRecommendation: Math.abs(scores[0] - scores[1]) < 0.1
|
|
@@ -89,7 +101,7 @@ export class RenderingTypePredictor {
|
|
|
89
101
|
}
|
|
90
102
|
}
|
|
91
103
|
}
|
|
92
|
-
this.logreg.train(new Matrix(X), Matrix.columnVector(Y));
|
|
104
|
+
this.state.currentValue.logreg.train(new Matrix(X), Matrix.columnVector(Y));
|
|
93
105
|
}
|
|
94
106
|
}
|
|
95
107
|
//# sourceMappingURL=rendering-type-prediction.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"rendering-type-prediction.js","sourceRoot":"","sources":["../../../src/internals/utils/rendering-type-prediction.ts"],"names":[],"mappings":"AACA,OAAO,kBAAkB,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,MAAM,EAAE,MAAM,WAAW,CAAC;AACnC,OAAO,gBAAgB,MAAM,mBAAmB,CAAC;AAMjD,MAAM,aAAa,GAAG,CAAC,GAAQ,EAAiB,EAAE;IAC9C,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,GAAG,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,sBAAsB,GAAG,CAAC,CAAgB,EAAE,CAAgB,EAAsB,EAAE;IACtF,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAChB,OAAO,CAAC,CAAC;IACb,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACpD,MAAM,CAAC,IAAI,CAAC,gBAAgB,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/F,CAAC;IAED,OAAO,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,GAAG,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC;AAC7E,MAAM,IAAI,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"rendering-type-prediction.js","sourceRoot":"","sources":["../../../src/internals/utils/rendering-type-prediction.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AACjD,OAAO,kBAAkB,MAAM,wBAAwB,CAAC;AACxD,OAAO,EAAE,MAAM,EAAE,MAAM,WAAW,CAAC;AACnC,OAAO,gBAAgB,MAAM,mBAAmB,CAAC;AAMjD,MAAM,aAAa,GAAG,CAAC,GAAQ,EAAiB,EAAE;IAC9C,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,GAAG,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,sBAAsB,GAAG,CAAC,CAAgB,EAAE,CAAgB,EAAsB,EAAE;IACtF,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAChB,OAAO,CAAC,CAAC;IACb,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACpD,MAAM,CAAC,IAAI,CAAC,gBAAgB,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/F,CAAC;IAED,OAAO,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,GAAG,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC;AAC7E,MAAM,IAAI,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;AAUjG;;;;GAIG;AACH,MAAM,OAAO,sBAAsB;IACvB,6BAA6B,GAAG,IAAI,GAAG,EAA2D,CAAC;IACnG,cAAc,CAAS;IACvB,KAAK,CAAmD;IAEhE,YAAY,EAAE,cAAc,EAAE,kBAAkB,EAAiC;QAC7E,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;QACrC,IAAI,CAAC,KAAK,GAAG,IAAI,gBAAgB,CAAC;YAC9B,YAAY,EAAE,EAAE,MAAM,EAAE,IAAI,kBAAkB,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;YACxF,SAAS,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC;YACvE,WAAW,EAAE,CAAC,eAAe,EAAE,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC;YAC3G,eAAe,EAAE,gCAAgC;YACjD,kBAAkB,E
AAE,IAAI;YACxB,GAAG,kBAAkB;SACxB,CAAC,CAAC;IACP,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU;QACZ,MAAM,IAAI,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC;IAClC,CAAC;IAED;;OAEG;IACI,OAAO,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAW;QAI7C,MAAM,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC;QAC3C,IAAI,MAAM,CAAC,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,kCAAkC,EAAE,CAAC,EAAE,CAAC;QAClF,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,SAAS,IAAI,GAAG,CAAC,CAAC;QAEhD,MAAM,UAAU,GAAG,IAAI,MAAM,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,aAAa,CAAC,aAAa,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC;QAClG,MAAM,CAAC,UAAU,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;QAChD,MAAM,MAAM,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC;QAE5G,OAAO;YACH,aAAa,EAAE,UAAU,KAAK,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,YAAY;YACzD,kCAAkC,EAC9B,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG;gBACjC,CAAC,CAAC,CAAC;gBACH,CAAC,CAAC,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;SAC3E,CAAC;IACN,CAAC;IAED;;OAEG;IACI,WAAW,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAW,EAAE,aAA4B;QAC/E,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,SAAS,IAAI,GAAG,CAAC,CAAC;QAE5C,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC;YACzD,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;QACrE,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YACrE,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC1E,CAAC;QAED,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC,CAAC;QAClG,IAAI,CAAC,OAAO,EAAE,CAAC;IACnB,CAAC;IAEO,WAAW,CAAC,KAAyB;QACzC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,MAAM,EAAE,CAAC;aACzD,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,MAAM,IAAI,CAAC,CAAC;aACjD,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAA
K,EAAE,CAAC,CAAC,CAAC;IAChD,CAAC;IAES,sBAAsB,CAAC,GAAkB,EAAE,KAAyB;QAC1E,OAAO;YACH,IAAI,CACA,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,GAAG,CACpE,CAAC,QAAQ,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,CAC3D,CACJ,IAAI,CAAC;YACN,IAAI,CACA,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,YAAY,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,GAAG,CACxE,CAAC,QAAQ,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,CAC3D,CACJ,IAAI,CAAC;SACT,CAAC;IACN,CAAC;IAES,OAAO;QACb,MAAM,CAAC,GAAoB;YACvB,CAAC,CAAC,EAAE,CAAC,CAAC;YACN,CAAC,CAAC,EAAE,CAAC,CAAC;SACT,CAAC;QACF,MAAM,CAAC,GAAa,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAE3B,KAAK,MAAM,CAAC,aAAa,EAAE,WAAW,CAAC,IAAI,IAAI,CAAC,6BAA6B,CAAC,OAAO,EAAE,EAAE,CAAC;YACtF,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,WAAW,EAAE,CAAC;gBACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;oBACrB,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,sBAAsB,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,CAAC;oBAChD,CAAC,CAAC,IAAI,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC/C,CAAC;YACL,CAAC;QACL,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;IAChF,CAAC;CACJ"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/playwright",
|
|
3
|
-
"version": "4.0.0-beta.2",
|
|
3
|
+
"version": "4.0.0-beta.21",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=22.0.0"
|
|
@@ -50,15 +50,15 @@
|
|
|
50
50
|
"@apify/datastructures": "^2.0.3",
|
|
51
51
|
"@apify/log": "^2.5.18",
|
|
52
52
|
"@apify/timeout": "^0.3.2",
|
|
53
|
-
"@crawlee/browser": "4.0.0-beta.2",
|
|
54
|
-
"@crawlee/browser-pool": "4.0.0-beta.2",
|
|
55
|
-
"@crawlee/core": "4.0.0-beta.2",
|
|
56
|
-
"@crawlee/types": "4.0.0-beta.2",
|
|
57
|
-
"@crawlee/utils": "4.0.0-beta.2",
|
|
53
|
+
"@crawlee/browser": "4.0.0-beta.21",
|
|
54
|
+
"@crawlee/browser-pool": "4.0.0-beta.21",
|
|
55
|
+
"@crawlee/cheerio": "4.0.0-beta.21",
|
|
56
|
+
"@crawlee/core": "4.0.0-beta.21",
|
|
57
|
+
"@crawlee/types": "4.0.0-beta.21",
|
|
58
|
+
"@crawlee/utils": "4.0.0-beta.21",
|
|
58
59
|
"cheerio": "^1.0.0",
|
|
59
60
|
"idcac-playwright": "^0.1.3",
|
|
60
61
|
"jquery": "^3.7.1",
|
|
61
|
-
"lodash.isequal": "^4.5.0",
|
|
62
62
|
"ml-logistic-regression": "^2.0.0",
|
|
63
63
|
"ml-matrix": "^6.12.1",
|
|
64
64
|
"ow": "^2.0.0",
|
|
@@ -66,9 +66,13 @@
|
|
|
66
66
|
"tslib": "^2.8.1"
|
|
67
67
|
},
|
|
68
68
|
"peerDependencies": {
|
|
69
|
+
"idcac-playwright": "^0.1.2",
|
|
69
70
|
"playwright": "*"
|
|
70
71
|
},
|
|
71
72
|
"peerDependenciesMeta": {
|
|
73
|
+
"idcac-playwright": {
|
|
74
|
+
"optional": true
|
|
75
|
+
},
|
|
72
76
|
"playwright": {
|
|
73
77
|
"optional": true
|
|
74
78
|
}
|
|
@@ -80,5 +84,5 @@
|
|
|
80
84
|
}
|
|
81
85
|
}
|
|
82
86
|
},
|
|
83
|
-
"gitHead": "
|
|
87
|
+
"gitHead": "e370ab2f4ffcf4f63b52a61c9b6e97081d525e64"
|
|
84
88
|
}
|