scraply 1.0.7 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/defaultConfig.js +9 -9
- package/src/loadConfig.js +5 -0
- package/src/utils/crawl/url/handlers.js +13 -19
package/package.json
CHANGED
package/src/defaultConfig.js
CHANGED
|
@@ -52,15 +52,15 @@ export const DEFAULT_CONFIG = {
|
|
|
52
52
|
},
|
|
53
53
|
},
|
|
54
54
|
HARD_CODED_LINKS: [
|
|
55
|
-
{
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
}
|
|
55
|
+
// {
|
|
56
|
+
// file_name: 'hc-links.json',
|
|
57
|
+
// data: [
|
|
58
|
+
// {
|
|
59
|
+
// "url": "https://custom-link.com",
|
|
60
|
+
// "content": "That's a custom link content, you can add as many as you want."
|
|
61
|
+
// },
|
|
62
|
+
// ]
|
|
63
|
+
// }
|
|
64
64
|
]
|
|
65
65
|
}
|
|
66
66
|
};
|
package/src/loadConfig.js
CHANGED
|
@@ -26,5 +26,10 @@ export function loadConfig(userConfig = {}) {
|
|
|
26
26
|
config.CRAWLER.INCLUDE_URLS = config.CRAWLER.INITIAL_URLS.map(url => `${url}.*`);
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
+
// If HARD_CODED_LINKS is missing, set to an empty array by default
|
|
30
|
+
if (!config.DATA_FORMATTER.HARD_CODED_LINKS) {
|
|
31
|
+
config.DATA_FORMATTER.HARD_CODED_LINKS = [];
|
|
32
|
+
}
|
|
33
|
+
|
|
29
34
|
return config;
|
|
30
35
|
};
|
package/src/utils/crawl/url/handlers.js
CHANGED
|
@@ -18,30 +18,24 @@ export const shouldRetry = (error) => {
|
|
|
18
18
|
|
|
19
19
|
const shouldIncludeURL = (url) => {
|
|
20
20
|
try {
|
|
21
|
-
const
|
|
21
|
+
const { INITIAL_URLS, INCLUDE_URLS, EXCLUDE_PATTERNS } = CONFIG.CRAWLER;
|
|
22
22
|
|
|
23
|
-
|
|
24
|
-
const isIncluded = CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => {
|
|
25
|
-
if (typeof pattern === 'string') {
|
|
26
|
-
return urlObj.toString().includes(pattern); // Check if the URL contains the string pattern
|
|
27
|
-
} else if (pattern instanceof RegExp) {
|
|
28
|
-
return pattern.test(urlObj.toString()); // Check if the URL matches the RegExp pattern
|
|
29
|
-
}
|
|
30
|
-
});
|
|
23
|
+
if (INITIAL_URLS.includes(url)) return true;
|
|
31
24
|
|
|
32
|
-
|
|
25
|
+
// Pre-compile string patterns into regular expressions for both include and exclude patterns
|
|
26
|
+
const compiledExcludePatterns = EXCLUDE_PATTERNS.map(pattern =>
|
|
27
|
+
typeof pattern === 'string' ? new RegExp(pattern) : pattern
|
|
28
|
+
);
|
|
29
|
+
const compiledIncludePatterns = INCLUDE_URLS.map(pattern =>
|
|
30
|
+
typeof pattern === 'string' ? new RegExp(pattern) : pattern
|
|
31
|
+
);
|
|
33
32
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
if (typeof pattern === 'string') {
|
|
37
|
-
return urlObj.pathname.includes(pattern); // Check if the URL pathname contains the string pattern
|
|
38
|
-
} else if (pattern instanceof RegExp) {
|
|
39
|
-
return pattern.test(urlObj.pathname); // Check if the URL pathname matches the RegExp pattern
|
|
40
|
-
}
|
|
41
|
-
});
|
|
33
|
+
if (compiledExcludePatterns.some(pattern => pattern.test(url))) return false;
|
|
34
|
+
if (compiledIncludePatterns.some(pattern => pattern.test(url))) return true;
|
|
42
35
|
|
|
43
|
-
return
|
|
36
|
+
return false; // If the URL doesn't match any include patterns, exclude it.
|
|
44
37
|
} catch (error) {
|
|
38
|
+
console.error(`Error processing URL: ${url}`, error);
|
|
45
39
|
return false;
|
|
46
40
|
}
|
|
47
41
|
};
|