@crawlee/playwright 3.7.4-beta.8 → 3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.RenderingTypePredictor = void 0;
4
+ const tslib_1 = require("tslib");
5
+ const ml_logistic_regression_1 = tslib_1.__importDefault(require("ml-logistic-regression"));
6
+ const ml_matrix_1 = require("ml-matrix");
7
+ const string_comparison_1 = tslib_1.__importDefault(require("string-comparison"));
8
/**
 * Breaks a URL into the ordered parts used for similarity comparison.
 *
 * The first element is always the hostname; the remaining elements are the
 * pieces of the pathname split on `/` (empty pieces, such as the one before
 * the leading slash, are kept).
 *
 * @param {URL} url Object exposing `hostname` and `pathname`.
 * @returns {string[]} `[hostname, ...pathSegments]`
 */
const urlComponents = (url) => {
    const { hostname, pathname } = url;
    const pathSegments = pathname.split('/');
    return [hostname].concat(pathSegments);
};
11
/**
 * Scores how alike two URLs are, given their component lists
 * (`[hostname, ...pathSegments]`).
 *
 * - Different hostnames (index 0) always score 0.
 * - Otherwise every following position counts as a match when the
 *   Jaro-Winkler similarity of the two components is above 0.8; a missing
 *   component is compared as an empty string.
 * - The number of matches is divided by the length of the longer list. The
 *   hostname slot is part of that length but never counted as a match, so two
 *   identical lists of length n score (n - 1) / n rather than 1.
 *
 * @param {string[]} a Components of the first URL.
 * @param {string[]} b Components of the second URL.
 * @returns {number} Similarity score; 0 when there is nothing to compare.
 */
const calculateUrlSimilarity = (a, b) => {
    if (a[0] !== b[0]) {
        return 0;
    }

    const componentCount = Math.max(a.length, b.length);

    // Two empty lists pass the hostname check (undefined === undefined);
    // bail out here instead of returning 0 / 0 = NaN.
    if (componentCount === 0) {
        return 0;
    }

    // Count matches directly instead of collecting 0/1 flags and summing them:
    // with hostname-only lists there are no flags, and summing an empty list
    // with an unseeded reduce throws a TypeError.
    let matchCount = 0;
    for (let i = 1; i < componentCount; i++) {
        if (string_comparison_1.default.jaroWinkler.similarity(a[i] ?? '', b[i] ?? '') > 0.8) {
            matchCount += 1;
        }
    }

    return matchCount / componentCount;
};
21
/**
 * Adds up a list of numbers.
 *
 * The reduction is seeded with 0 so that an empty list yields 0 instead of
 * throwing a TypeError.
 *
 * @param {number[]} values Numbers to add.
 * @returns {number} The total, or 0 for an empty list.
 */
const sum = (values) => values.reduce((total, value) => total + value, 0);
22
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values Numbers to average.
 * @returns {number|undefined} The average, or `undefined` for an empty list.
 */
const mean = (values) => {
    if (!(values.length > 0)) {
        return undefined;
    }
    return sum(values) / values.length;
};
23
/**
 * Stores rendering type information for previously crawled URLs, predicts the
 * rendering type for URLs that have yet to be crawled, and recommends when
 * rendering type detection should be performed.
 *
 * Results are kept in a two-level map: rendering type ('static' or
 * 'clientOnly') -> request label -> list of URL component arrays. A logistic
 * regression model is retrained from that map every time a result is stored.
 *
 * @experimental
 */
class RenderingTypePredictor {
    /**
     * @param {object} options
     * @param {number} options.detectionRatio Multiplier used when computing
     *   the recommended detection probability in {@link predict}.
     */
    constructor({ detectionRatio }) {
        // Fields are defined (not assigned) so they are always own, enumerable,
        // configurable and writable data properties, created in this order.
        Object.defineProperties(this, {
            renderingTypeDetectionResults: {
                enumerable: true,
                configurable: true,
                writable: true,
                value: new Map(),
            },
            detectionRatio: {
                enumerable: true,
                configurable: true,
                writable: true,
                value: detectionRatio,
            },
            logreg: {
                enumerable: true,
                configurable: true,
                writable: true,
                value: new ml_logistic_regression_1.default({ numSteps: 1000, learningRate: 0.05 }),
            },
        });
    }

    /**
     * Predict the rendering type for a given URL and request label.
     *
     * @param {URL} url URL to classify.
     * @param {string|undefined} label Request label; only results stored under
     *   the same label contribute to the feature vector.
     * @returns {{ renderingType: string, detectionProbabilityRecommendation: number }}
     *   The predicted type ('static' or 'clientOnly') and a recommended
     *   probability, never above 1, of running rendering type detection.
     */
    predict(url, label) {
        // The model has no classifiers yet (presumably nothing was trained):
        // fall back to 'clientOnly' and always recommend detection.
        if (this.logreg.classifiers.length === 0) {
            return { renderingType: 'clientOnly', detectionProbabilityRecommendation: 1 };
        }

        const urlFeature = new ml_matrix_1.Matrix([this.calculateFeatureVector(urlComponents(url), label)]);
        const [prediction] = this.logreg.predict(urlFeature);

        // NOTE(review): the subtraction below relies on each `testScores` result
        // coercing to a single number for this one-row input - confirm against
        // the ml-logistic-regression API.
        const scores = [
            this.logreg.classifiers[0].testScores(urlFeature),
            this.logreg.classifiers[1].testScores(urlFeature),
        ];
        const renderingType = prediction === 1 ? 'static' : 'clientOnly';

        // The two classifiers score almost the same: always recommend detection.
        if (Math.abs(scores[0] - scores[1]) < 0.1) {
            return { renderingType, detectionProbabilityRecommendation: 1 };
        }

        // Labels with fewer than five stored results get the ratio multiplied
        // by up to 5. The product is capped at 1 so the value stays a valid
        // probability even when detectionRatio is larger than 0.2.
        const multiplier = Math.max(1, 5 - this.resultCount(label));
        return {
            renderingType,
            detectionProbabilityRecommendation: Math.min(1, this.detectionRatio * multiplier),
        };
    }

    /**
     * Store the rendering type for a given URL and request label. This updates
     * the underlying prediction model, which may be costly.
     *
     * @param {URL} url URL whose rendering type was detected.
     * @param {string|undefined} label Request label the URL was crawled with.
     * @param {string} renderingType Detected type ('static' or 'clientOnly').
     */
    storeResult(url, label, renderingType) {
        let urlsByLabel = this.renderingTypeDetectionResults.get(renderingType);
        if (urlsByLabel === undefined) {
            urlsByLabel = new Map();
            this.renderingTypeDetectionResults.set(renderingType, urlsByLabel);
        }

        let urls = urlsByLabel.get(label);
        if (urls === undefined) {
            urls = [];
            urlsByLabel.set(label, urls);
        }

        urls.push(urlComponents(url));
        this.retrain();
    }

    /**
     * Number of stored results for a label, summed over all rendering types.
     *
     * @param {string|undefined} label Request label to count.
     * @returns {number}
     */
    resultCount(label) {
        let count = 0;
        for (const urlsByLabel of this.renderingTypeDetectionResults.values()) {
            count += urlsByLabel.get(label)?.length ?? 0;
        }
        return count;
    }

    /**
     * Builds the two-element feature vector for a URL:
     * `[mean similarity to stored 'static' URLs, mean similarity to stored
     * 'clientOnly' URLs]`, both restricted to the given label. A side with no
     * stored URLs contributes 0.
     *
     * @param {string[]} url URL components as produced by `urlComponents`.
     * @param {string|undefined} label Request label.
     * @returns {number[]}
     */
    calculateFeatureVector(url, label) {
        const meanSimilarityTo = (renderingType) => {
            const knownUrls = this.renderingTypeDetectionResults.get(renderingType)?.get(label) ?? [];
            return mean(knownUrls.map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0)) ?? 0;
        };

        return [meanSimilarityTo('static'), meanSimilarityTo('clientOnly')];
    }

    /**
     * Retrains the logistic regression model from every stored result.
     * Target 1 means 'static'; any other rendering type is target 0.
     */
    retrain() {
        // Two fixed samples are always part of the training set, so both
        // targets (0 and 1) are present even before any result is stored.
        const X = [
            [0, 1],
            [1, 0],
        ];
        const Y = [0, 1];

        for (const [renderingType, urlsByLabel] of this.renderingTypeDetectionResults) {
            const target = renderingType === 'static' ? 1 : 0;
            for (const [label, urls] of urlsByLabel) {
                for (const url of urls) {
                    X.push(this.calculateFeatureVector(url, label));
                    Y.push(target);
                }
            }
        }

        this.logreg.train(new ml_matrix_1.Matrix(X), ml_matrix_1.Matrix.columnVector(Y));
    }
}
107
+ exports.RenderingTypePredictor = RenderingTypePredictor;
108
+ //# sourceMappingURL=rendering-type-prediction.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rendering-type-prediction.js","sourceRoot":"","sources":["../../../src/internals/utils/rendering-type-prediction.ts"],"names":[],"mappings":";;;;AAAA,4FAAwD;AACxD,yCAAmC;AACnC,kFAAiD;AAMjD,MAAM,aAAa,GAAG,CAAC,GAAQ,EAAiB,EAAE;IAC9C,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,GAAG,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,sBAAsB,GAAG,CAAC,CAAgB,EAAE,CAAgB,EAAsB,EAAE;IACtF,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAChB,OAAO,CAAC,CAAC;IACb,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACpD,MAAM,CAAC,IAAI,CAAC,2BAAgB,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/F,CAAC;IAED,OAAO,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC;AACtD,CAAC,CAAC;AAEF,MAAM,GAAG,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC;AAC7E,MAAM,IAAI,GAAG,CAAC,MAAgB,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;AASjG;;;;GAIG;AACH,MAAa,sBAAsB;IAK/B,YAAY,EAAE,cAAc,EAAiC;QAJrD;;;;mBAAgC,IAAI,GAAG,EAA2D;WAAC;QACnG;;;;;WAAuB;QACvB;;;;;WAA2B;QAG/B,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;QACrC,IAAI,CAAC,MAAM,GAAG,IAAI,gCAAkB,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC;IACjF,CAAC;IAED;;OAEG;IACI,OAAO,CAAC,GAAQ,EAAE,KAAyB;QAC9C,IAAI,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvC,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,kCAAkC,EAAE,CAAC,EAAE,CAAC;QAClF,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,kBAAM,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,aAAa,CAAC,GAAG,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC;QACxF,MAAM,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;QACrD,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,
CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC;QAEtH,OAAO;YACH,aAAa,EAAE,UAAU,KAAK,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,YAAY;YACzD,kCAAkC,EAAE,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;SACjJ,CAAC;IACN,CAAC;IAED;;OAEG;IACI,WAAW,CAAC,GAAQ,EAAE,KAAyB,EAAE,aAA4B;QAChF,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC;YACzD,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;QACrE,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YACrE,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC1E,CAAC;QAED,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,aAAa,CAAE,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;QAC5F,IAAI,CAAC,OAAO,EAAE,CAAC;IACnB,CAAC;IAEO,WAAW,CAAC,KAAyB;QACzC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,6BAA6B,CAAC,MAAM,EAAE,CAAC;aACzD,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,MAAM,IAAI,CAAC,CAAC;aACjD,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC;IAChD,CAAC;IAES,sBAAsB,CAAC,GAAkB,EAAE,KAAyB;QAC1E,OAAO;YACH,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;YAC7I,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,GAAG,CAAC,YAAY,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;SACpJ,CAAC;IACN,CAAC;IAES,OAAO;QACb,MAAM,CAAC,GAAoB;YACvB,CAAC,CAAC,EAAE,CAAC,CAAC;YACN,CAAC,CAAC,EAAE,CAAC,CAAC;SACT,CAAC;QACF,MAAM,CAAC,GAAa,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAE3B,KAAK,MAAM,CAAC,aAAa,EAAE,WAAW,CAAC,IAAI,IAAI,CAAC,6BAA6B,CAAC,OAAO,EAAE,EAAE,CAAC;YACtF,KAAK,MAAM,CAAC,KAAK,EAAE,IA
AI,CAAC,IAAI,WAAW,EAAE,CAAC;gBACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;oBACrB,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,sBAAsB,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,CAAC;oBAChD,CAAC,CAAC,IAAI,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC/C,CAAC;YACL,CAAC;QACL,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,kBAAM,CAAC,CAAC,CAAC,EAAE,kBAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7D,CAAC;CACJ;AA3ED,wDA2EC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@crawlee/playwright",
3
- "version": "3.7.4-beta.8",
3
+ "version": "3.8.0",
4
4
  "description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
5
5
  "engines": {
6
6
  "node": ">=16.0.0"
@@ -55,14 +55,19 @@
55
55
  "dependencies": {
56
56
  "@apify/datastructures": "^2.0.0",
57
57
  "@apify/log": "^2.4.0",
58
- "@crawlee/browser": "3.7.4-beta.8",
59
- "@crawlee/browser-pool": "3.7.4-beta.8",
60
- "@crawlee/types": "3.7.4-beta.8",
61
- "@crawlee/utils": "3.7.4-beta.8",
58
+ "@apify/timeout": "^0.3.1",
59
+ "@crawlee/browser": "3.8.0",
60
+ "@crawlee/browser-pool": "3.8.0",
61
+ "@crawlee/core": "3.8.0",
62
+ "@crawlee/types": "3.8.0",
63
+ "@crawlee/utils": "3.8.0",
62
64
  "cheerio": "^1.0.0-rc.12",
63
65
  "idcac-playwright": "^0.1.2",
64
66
  "jquery": "^3.6.0",
67
+ "lodash.isequal": "^4.5.0",
68
+ "ml-logistic-regression": "^2.0.0",
65
69
  "ow": "^0.28.1",
70
+ "string-comparison": "^1.3.0",
66
71
  "tslib": "^2.4.0"
67
72
  },
68
73
  "peerDependencies": {
@@ -80,5 +85,5 @@
80
85
  }
81
86
  }
82
87
  },
83
- "gitHead": "83abc8f1527c11bac121512097b50c6124ebf4fa"
88
+ "gitHead": "bd430d3de22c0a9f064cb00654a6cad3bc6cd601"
84
89
  }