scraply 1.0.19 → 1.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "scraply",
3
3
  "description": "A simple, configurable and functional content scraper",
4
- "version": "1.0.19",
4
+ "version": "1.0.21",
5
5
  "main": "src/scraply.js",
6
6
  "type": "module",
7
7
  "scripts": {
@@ -13,9 +13,11 @@
13
13
  ],
14
14
  "author": "Pau Serrat Gutiérrez",
15
15
  "dependencies": {
16
- "axios": "^1.7.7",
16
+ "axios": "^1.7.9",
17
17
  "cheerio": "^1.0.0",
18
- "he": "^1.2.0"
18
+ "he": "^1.2.0",
19
+ "puppeteer": "^24.2.0",
20
+ "puppeteer-cluster": "^0.24.0"
19
21
  },
20
22
  "publishConfig": {
21
23
  "registry": "https://registry.npmjs.org/",
package/readme.md CHANGED
@@ -41,7 +41,7 @@ CRAWLER: {
41
41
  'https://crawler-test.com/'
42
42
  ],
43
43
  INCLUDE_URLS: [
44
- 'https://crawler-test.com/.*'
44
+ 'https://crawler-test.com/'
45
45
  ],
46
46
  ALLOWED_CONTENT_TYPES: [
47
47
  'text/html'
@@ -70,15 +70,16 @@ CRAWLER: {
70
70
  'aside',
71
71
  'button'
72
72
  ],
73
- RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
74
- REQUEST_TIMEOUT: 4000,
75
- MAX_REDIRECTS: 3,
76
- MAX_RETRIES: 2,
73
+ RETRY_STATUS_CODES: [408, 500, 502, 503, 504],
74
+ REQUEST_TIMEOUT: 3000,
75
+ MAX_REDIRECTS: 2,
76
+ MAX_CONTENT_LENGTH: 20 * 1024 * 1024, // 20MB
77
+ MAX_RETRIES: 1,
77
78
  CRAWL_DELAY_MS: 200,
78
- CRAWL_ERROR_RETRY_DELAY_MS: 800,
79
+ CRAWL_ERROR_RETRY_DELAY_MS: 1000,
79
80
  CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS: 60000,
80
- EXIT_CODE_RATE_LIMIT: 10,
81
- EXIT_ON_RATE_LIMIT: true
81
+ EXIT_ON_RATE_LIMIT: true, // If true, forces exit instantly. If false, only exits after retries (if still 429)
82
+ EXIT_CODE_RATE_LIMIT: 10
82
83
  },
83
84
 
84
85
  DATA_FORMATTER: {
@@ -6,7 +6,7 @@ export const DEFAULT_CONFIG = {
6
6
  'https://crawler-test.com/'
7
7
  ],
8
8
  INCLUDE_URLS: [
9
- 'https://crawler-test.com/.*'
9
+ 'https://crawler-test.com/'
10
10
  ],
11
11
  ALLOWED_CONTENT_TYPES: [
12
12
  'text/html'
@@ -33,16 +33,25 @@ export const DEFAULT_CONFIG = {
33
33
  'header',
34
34
  'footer',
35
35
  'aside',
36
- 'button'
36
+ 'button',
37
+ '[aria-modal]',
38
+ '[role="dialog"]',
39
+ '[role="alert"]',
40
+ '[role="banner"]',
41
+ '[role="form"]',
42
+ '[role="navigation"]',
43
+ '[role="search"]'
37
44
  ],
38
- RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
39
- REQUEST_TIMEOUT: 4000,
40
- MAX_REDIRECTS: 3,
41
- MAX_RETRIES: 2,
45
+ DYNAMIC_CRAWLING: false,
46
+ RETRY_STATUS_CODES: [408, 500, 502, 503, 504],
47
+ REQUEST_TIMEOUT: 3000,
48
+ MAX_REDIRECTS: 2,
49
+ MAX_CONTENT_LENGTH: 20 * 1024 * 1024, // 20MB
50
+ MAX_RETRIES: 1,
42
51
  CRAWL_DELAY_MS: 200,
43
- CRAWL_ERROR_RETRY_DELAY_MS: 800,
52
+ CRAWL_ERROR_RETRY_DELAY_MS: 1000,
44
53
  CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS: 60000,
45
- EXIT_ON_RATE_LIMIT: true,
54
+ EXIT_ON_RATE_LIMIT: true, // If true, forces exit instantly. If false, only exits after retries (if still 429)
46
55
  EXIT_CODE_RATE_LIMIT: 10
47
56
  },
48
57
 
package/src/scraply.js CHANGED
@@ -3,6 +3,7 @@ import { normalizeURL } from './utils/crawl/url/normalize.js';
3
3
  import { loadJSON, saveQueue, deleteDataFiles, deleteUntrackedFiles } from './utils/crawl/fileOperations.js';
4
4
  import { processURL } from './utils/crawl/url/processor.js';
5
5
  import { formatData, saveSortedFormattedJSON } from './utils/format/formatData.js';
6
+ import { initializeCluster, closeCluster } from './utils/crawl/browser/helper.js';
6
7
 
7
8
  let urlData = [];
8
9
  let CONFIG = {};
@@ -52,6 +53,10 @@ const start = async () => {
52
53
  - Crawl Error Retry Delay: ${CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS}ms
53
54
  `);
54
55
 
56
+ if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) {
57
+ await initializeCluster();
58
+ }
59
+
55
60
  let fileNumber = urlData.filter(entry => entry.file).length + 1;
56
61
  for await (const entry of urlData) {
57
62
  if (!entry.file) {
@@ -104,6 +109,10 @@ const start = async () => {
104
109
  console.log(`\nCLEANING UP UNTRACKED FILES...`);
105
110
  deleteUntrackedFiles(CONFIG.DATA_FORMATTER.FORMATTED_PATH, generatedFiles); // Delete files not generated during this crawl
106
111
  generatedFiles.clear(); // Clear the set to prepare for future crawls
112
+
113
+ if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) {
114
+ await closeCluster();
115
+ }
107
116
  };
108
117
 
109
118
  // Main function to be exported and used
@@ -0,0 +1,142 @@
1
+ import { Cluster } from 'puppeteer-cluster';
2
+ import { delay } from '../delay.js';
3
+
4
+ let cluster;
5
+ let initializing = false;
6
+
7
// Chromium command-line flags passed to every browser launched by the cluster.
const CHROMIUM_ARGS = [
  // '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-accelerated-2d-canvas',
  '--disable-gpu',
  // '--single-process',
  '--disable-background-networking',
  '--disable-background-timer-throttling',
  '--disable-breakpad',
  '--disable-client-side-phishing-detection',
  '--disable-default-apps',
  '--disable-extensions',
  '--disable-hang-monitor',
  '--disable-popup-blocking',
  '--disable-prompt-on-repost',
  '--disable-sync',
  '--disable-translate',
  '--metrics-recording-only',
  '--no-first-run',
  '--safebrowsing-disable-auto-update',
  '--enable-automation',
  '--password-store=basic',
  '--use-mock-keychain',
  '--disable-software-rasterizer',
  '--no-zygote',
  '--disable-infobars',
  '--disable-blink-features=AutomationControlled',
  '--disable-component-extensions-with-background-pages',
  '--mute-audio',
  '--window-size=1280,800', // Moderate window size
  '--window-position=0,0',
  '--ignore-certificate-errors',
  '--ignore-certificate-errors-skip-list',
  '--hide-scrollbars',
  '--disable-notifications',
  '--disable-backgrounding-occluded-windows',
  '--disable-features=TranslateUI,BlinkGenPropertyTrees',
  '--disable-ipc-flooding-protection',
  '--disable-renderer-backgrounding',
  '--enable-features=NetworkService,NetworkServiceInProcess',
  '--force-color-profile=srgb'
];

// Value of the `timeout` launch option handed to Puppeteer (milliseconds).
const BROWSER_LAUNCH_TIMEOUT_MS = 15000;

// Maximum time a single page navigation may take (milliseconds).
const NAVIGATION_TIMEOUT_MS = 15000;

// Resource types that are aborted instead of downloaded while crawling.
const SKIPPED_RESOURCE_TYPES = ['image', 'stylesheet', 'font'];

// Interval between checks while waiting for a concurrent initialization (milliseconds).
const INIT_POLL_INTERVAL_MS = 100;

/**
 * Cluster task: loads `url` in the given page and reports its HTML and response metadata.
 *
 * @param {{ page: object, data: { url: string } }} job - Page and job data supplied by the cluster.
 * @returns {Promise<{ content: string, headers: object, statusCode: number }>}
 *   `statusCode` is 0 and `headers` is `{}` when no response could be determined.
 * @throws Rethrows any navigation error after logging it.
 */
const crawlPageTask = async ({ page, data: { url } }) => {
  let matchedResponse;
  try {
    // Remember the response whose URL is exactly the requested one.
    page.on('response', (res) => {
      if (res.url() === url) matchedResponse = res;
    });

    // Skip downloading resources
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (SKIPPED_RESOURCE_TYPES.includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Possible waitUntil values: load, domcontentloaded, networkidle0, networkidle2
    const navigationResponse = await page.goto(url, {
      timeout: NAVIGATION_TIMEOUT_MS,
      waitUntil: 'domcontentloaded'
    });
    const content = await page.content();

    // The exact-URL match can miss when the browser reports the URL in a
    // different textual form; fall back to the navigation response so the
    // caller does not receive a bogus status code of 0.
    const response = matchedResponse ?? navigationResponse;
    const headers = response ? response.headers() : {};
    const statusCode = response ? response.status() : 0;
    return { content, headers, statusCode };
  } catch (error) {
    console.error(`Error in cluster task for URL ${url}:`, error);
    throw error;
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (closeError) {
        console.error('Error closing page:', closeError);
      }
    }
  }
};

/**
 * Launches the shared Puppeteer cluster once and registers the crawl task on it.
 *
 * Concurrent callers wait (polling) until the in-flight initialization ends.
 * If the launch fails, the error propagates to the caller that started it and
 * the module is left ready for another attempt.
 *
 * @returns {Promise<object|undefined>} The module-level cluster; unset when a
 *   caller was only waiting on an initialization that failed.
 * @throws Whatever `Cluster.launch` throws.
 */
export const initializeCluster = async () => {
  if (initializing) {
    console.log('Cluster is already being initialized, waiting...');
    while (initializing) {
      await delay(INIT_POLL_INTERVAL_MS); // Wait for initialization to complete
    }
    return cluster;
  }

  if (cluster) return cluster;

  initializing = true;
  try {
    console.log('Initializing Puppeteer cluster...');
    const launchedCluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: 1, // Lower concurrency for stability
      puppeteerOptions: {
        headless: true,
        args: CHROMIUM_ARGS,
        timeout: BROWSER_LAUNCH_TIMEOUT_MS
      }
    });

    // Register the task before publishing the cluster so nobody can use a
    // cluster that has no task handler yet.
    launchedCluster.task(crawlPageTask);
    cluster = launchedCluster;
    console.log('Puppeteer cluster initialized!');
  } finally {
    // Always clear the flag; otherwise a failed launch would make every
    // later caller wait forever in the polling loop above.
    initializing = false;
  }

  return cluster;
};
111
+
112
/**
 * Loads `url` through the Puppeteer cluster and returns an axios-like result.
 *
 * The cluster is initialized on demand when it does not exist yet.
 *
 * @param {string} url - Page to load.
 * @param {string} selector - Forwarded in the job data alongside `url`.
 * @returns {Promise<{ data: string, headers: object, status: number }>}
 *   Page HTML, response headers and HTTP status code.
 * @throws {Error} With a `response` property of `{ status, statusText }`
 *   (`status` is the failed job's `statusCode`, which may be undefined) and
 *   the original failure available as `cause`.
 */
export const fetchPageContent = async (url, selector) => {
  if (!cluster) await initializeCluster();
  try {
    const result = await cluster.execute({ url, selector }); // Returns the page content
    return { data: result.content, headers: result.headers, status: result.statusCode };
  } catch (error) {
    console.error(`Error fetching page content for URL ${url}:`, error);
    // Throw a real Error (keeps the stack and the root cause) while exposing
    // the same `response` shape callers already inspect.
    const fetchError = new Error(
      error?.message ?? `Failed to fetch page content for URL ${url}`,
      { cause: error }
    );
    fetchError.response = { status: error?.statusCode, statusText: error?.message };
    throw fetchError;
  }
};
124
+
125
/**
 * Waits for the cluster to become idle, closes it and drops the module-level handle.
 *
 * Does nothing when no cluster exists. The handle is released before the
 * asynchronous shutdown starts, so a second call made while closing is a
 * no-op and a failed shutdown never leaves a stale handle behind.
 *
 * @returns {Promise<void>}
 * @throws Whatever `idle()` or `close()` throws; `close()` is still attempted
 *   when `idle()` fails.
 */
export const closeCluster = async () => {
  if (!cluster) return;

  const activeCluster = cluster;
  cluster = null;

  console.log('Closing Puppeteer cluster...');
  try {
    await activeCluster.idle();
  } finally {
    // Close even if waiting for idle failed, so the browser is not leaked.
    await activeCluster.close();
  }
  console.log('Puppeteer cluster closed!');
};
134
+
135
/**
 * Handle app termination: close the Puppeteer cluster, then exit the process.
 *
 * Exits with code 0 after a clean shutdown. If closing the cluster fails, the
 * error is logged and the process exits with code 1 instead of being left
 * running with an unhandled rejection.
 *
 * @returns {Promise<void>}
 */
const handleAppTermination = async () => {
  let exitCode = 0;
  try {
    await closeCluster();
  } catch (error) {
    console.error('Error closing Puppeteer cluster during shutdown:', error);
    exitCode = 1;
  }
  process.exit(exitCode);
};

process.on('SIGINT', handleAppTermination);
process.on('SIGTERM', handleAppTermination);
@@ -1,38 +1,62 @@
1
1
  import axios from 'axios';
2
2
  import { delay } from '../delay.js';
3
3
  import { shouldRetry } from './handlers.js';
4
+ import { fetchPageContent } from '../browser/helper.js';
5
+ // import { normalizeURL } from './normalize.js';
4
6
 
5
7
  export async function fetchURL(url, retries = 2) {
6
8
  try {
7
- const response = await axios.get(url, {
8
- timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT,
9
- maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS
10
- });
9
+ let response;
10
+
11
+ if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) { // JavaScript Dynamic Content
12
+ response = await fetchPageContent(url, 'body'); // Returns a custom object with content, headers and status, similar to axios
11
13
 
12
- const { 'content-type': contentType } = response.headers;
14
+ // Manually handle redirects (Puppeteer doesn't follow them, axios does automatically)
15
+ if (response.status >= 300 && response.status < 400) { // 3xx Redirect
16
+ const redirectUrl = response.headers.location;
17
+ if (redirectUrl) {
18
+ if (CONFIG.CRAWLER.MAX_REDIRECTS <= 0) {
19
+ const error = new Error(`Max redirects reached`);
20
+ error.response = { status: response.status, headers: response.headers, data: response.data };
21
+ throw error;
22
+ }
23
+ // Normalize URL ?
24
+ const newUrl = new URL(redirectUrl, url).href;
25
+ return fetchURL(newUrl, retries - 1);
26
+ }
27
+ }
13
28
 
14
- // Validate content type
15
- if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) {
16
- return {
17
- error: `Content-Type ${contentType} is not allowed.`,
18
- status: response.status
19
- };
20
- };
29
+ // Validate status code (Puppeteer doesn't throw, axios does automatically)
30
+ if (response.status < 200 || response.status >= 300) {
31
+ const error = new Error(`Invalid status code: ${response.status}`);
32
+ error.response = { status: response.status, headers: response.headers, data: response.data };
33
+ throw error;
34
+ }
35
+ } else { // Static Content
36
+ response = await axios.get(url, {
37
+ timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT,
38
+ maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS,
39
+ maxContentLength: CONFIG.CRAWLER.MAX_CONTENT_LENGTH
40
+ });
41
+ }
42
+
43
+ // Validate content type header
44
+ const { 'content-type': contentType } = response.headers;
45
+ if (!contentType) return { error: `Missing Content-Type header`, status: response.status };
46
+ if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) return { error: `Content-Type ${contentType} is not allowed.`, status: response.status };
21
47
 
22
48
  return { data: response.data, status: response.status };
23
49
  } catch (error) {
24
50
  if (retries > 0 && (await shouldRetry(error))) {
25
51
  const retryCount = CONFIG.CRAWLER.MAX_RETRIES - retries + 1;
26
52
  console.log(`Retrying (${retryCount}/${CONFIG.CRAWLER.MAX_RETRIES}) -> ${url}`);
27
-
28
- if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
29
-
53
+ if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
30
54
  return fetchURL(url, retries - 1);
31
55
  }
32
56
 
33
57
  // If still 429 after retries, exit with configured code
34
58
  if (error.response?.status === 429) {
35
- console.log(`Force exiting with code ${CONFIG.CRAWLER.EXIT_CODE_RATE_LIMIT} after retries...`);
59
+ console.log(`Force exiting with code ${CONFIG.CRAWLER.EXIT_CODE_RATE_LIMIT} (after retries)...`);
36
60
  process.exit(CONFIG.CRAWLER.EXIT_CODE_RATE_LIMIT);
37
61
  }
38
62
 
@@ -2,7 +2,6 @@ import { URL } from 'node:url';
2
2
  import { normalizeURL } from './normalize.js';
3
3
  import { delay } from '../delay.js';
4
4
 
5
- // Handle HTML Status Codes HERE!
6
5
  export const shouldRetry = async (error) => {
7
6
  if (!error.response) return true;
8
7
 
@@ -42,21 +41,16 @@ export const shouldRetry = async (error) => {
42
41
  const shouldIncludeURL = (url) => {
43
42
  try {
44
43
  const { INITIAL_URLS, INCLUDE_URLS, EXCLUDE_PATTERNS } = CONFIG.CRAWLER;
45
-
46
44
  if (INITIAL_URLS.includes(url)) return true;
47
45
 
48
46
  // Pre-compile string patterns into regular expressions for both include and exclude patterns
49
- const compiledExcludePatterns = EXCLUDE_PATTERNS.map(pattern =>
50
- typeof pattern === 'string' ? new RegExp(pattern) : pattern
51
- );
52
- const compiledIncludePatterns = INCLUDE_URLS.map(pattern =>
53
- typeof pattern === 'string' ? new RegExp(pattern) : pattern
54
- );
47
+ const compiledExcludePatterns = EXCLUDE_PATTERNS.map(p => typeof p === 'string' ? new RegExp(p) : p);
48
+ if (compiledExcludePatterns.some(p => p.test(url))) return false;
55
49
 
56
- if (compiledExcludePatterns.some(pattern => pattern.test(url))) return false;
57
- if (compiledIncludePatterns.some(pattern => pattern.test(url))) return true;
50
+ const compiledIncludePatterns = INCLUDE_URLS.map(p => typeof p === 'string' ? new RegExp(p) : p);
51
+ if (compiledIncludePatterns.some(p => p.test(url))) return true;
58
52
 
59
- return false; // If the URL doesn't match any include patterns, exclude it.
53
+ return false; // URL doesn't match any include patterns, it is excluded.
60
54
  } catch (error) {
61
55
  console.error(`Error processing URL: ${url}`, error);
62
56
  return false;
@@ -2,7 +2,13 @@ export const normalizeURL = (url) => {
2
2
  const urlObj = new URL(url);
3
3
  urlObj.hash = ''; // Remove the fragment part
4
4
  urlObj.search = ''; // Remove the query part
5
- urlObj.pathname = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slashes
6
- urlObj.pathname = urlObj.pathname === '' ? '/' : urlObj.pathname; // Handle the root URL separately
5
+ urlObj.protocol = 'https:'; // Force HTTPS
6
+ urlObj.pathname = urlObj.pathname.endsWith('/')
7
+ ? urlObj.pathname.slice(0, -1)
8
+ : urlObj.pathname; // Remove trailing slashes
9
+ urlObj.pathname = urlObj.pathname === ''
10
+ ? '/'
11
+ : urlObj.pathname; // Handle the root URL separately
12
+ urlObj.hostname = urlObj.hostname.replace(/^www\./, ''); // Remove 'www.' prefix
7
13
  return urlObj.toString();
8
14
  };