scraply 1.0.6 → 1.0.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "scraply",
3
3
  "description": "A simple, configurable and functional content scraper",
4
- "version": "1.0.6",
4
+ "version": "1.0.8",
5
5
  "main": "src/scraply.js",
6
6
  "type": "module",
7
7
  "scripts": {
@@ -18,9 +18,24 @@ export const shouldRetry = (error) => {
18
18
 
19
19
  const shouldIncludeURL = (url) => {
20
20
  try {
21
- const urlObj = new URL(url);
22
- return CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => new RegExp(pattern).test(urlObj.toString())) && !CONFIG.CRAWLER.EXCLUDE_PATTERNS.some(pattern => new RegExp(pattern).test(urlObj.pathname));
21
+ const { INITIAL_URLS, INCLUDE_URLS, EXCLUDE_PATTERNS } = CONFIG.CRAWLER;
22
+
23
+ if (INITIAL_URLS.includes(url)) return true;
24
+
25
+ // Pre-compile string patterns into regular expressions for both include and exclude patterns
26
+ const compiledExcludePatterns = EXCLUDE_PATTERNS.map(pattern =>
27
+ typeof pattern === 'string' ? new RegExp(pattern) : pattern
28
+ );
29
+ const compiledIncludePatterns = INCLUDE_URLS.map(pattern =>
30
+ typeof pattern === 'string' ? new RegExp(pattern) : pattern
31
+ );
32
+
33
+ if (compiledExcludePatterns.some(pattern => pattern.test(url))) return false;
34
+ if (compiledIncludePatterns.some(pattern => pattern.test(url))) return true;
35
+
36
+ return false; // If the URL doesn't match any include patterns, exclude it.
23
37
  } catch (error) {
38
+ console.error(`Error processing URL: ${url}`, error);
24
39
  return false;
25
40
  }
26
41
  };