scraply 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -19,7 +19,28 @@ export const shouldRetry = (error) => {
|
|
|
19
19
|
const shouldIncludeURL = (url) => {
|
|
20
20
|
try {
|
|
21
21
|
const urlObj = new URL(url);
|
|
22
|
-
|
|
22
|
+
|
|
23
|
+
// Check if the URL matches any include pattern
|
|
24
|
+
const isIncluded = CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => {
|
|
25
|
+
if (typeof pattern === 'string') {
|
|
26
|
+
return urlObj.toString().includes(pattern); // Check if the URL contains the string pattern
|
|
27
|
+
} else if (pattern instanceof RegExp) {
|
|
28
|
+
return pattern.test(urlObj.toString()); // Check if the URL matches the RegExp pattern
|
|
29
|
+
}
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
if (!isIncluded) return false; // This URL is not included
|
|
33
|
+
|
|
34
|
+
// Check if the URL matches any exclude pattern
|
|
35
|
+
const isExcluded = CONFIG.CRAWLER.EXCLUDE_PATTERNS.some(pattern => {
|
|
36
|
+
if (typeof pattern === 'string') {
|
|
37
|
+
return urlObj.pathname.includes(pattern); // Check if the URL pathname contains the string pattern
|
|
38
|
+
} else if (pattern instanceof RegExp) {
|
|
39
|
+
return pattern.test(urlObj.pathname); // Check if the URL pathname matches the RegExp pattern
|
|
40
|
+
}
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
return !isExcluded; // This URL is included and not excluded
|
|
23
44
|
} catch (error) {
|
|
24
45
|
return false;
|
|
25
46
|
}
|