scraply 1.0.20 → 1.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -3
- package/readme.md +1 -1
- package/src/defaultConfig.js +10 -2
- package/src/scraply.js +9 -0
- package/src/utils/crawl/browser/helper.js +142 -0
- package/src/utils/crawl/url/fetch.js +38 -17
- package/src/utils/crawl/url/handlers.js +5 -11
- package/src/utils/crawl/url/normalize.js +8 -2
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scraply",
|
|
3
3
|
"description": "A simple, configurable and functional content scraper",
|
|
4
|
-
"version": "1.0.20",
|
|
4
|
+
"version": "1.0.21",
|
|
5
5
|
"main": "src/scraply.js",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"scripts": {
|
|
@@ -13,9 +13,11 @@
|
|
|
13
13
|
],
|
|
14
14
|
"author": "Pau Serrat Gutiérrez",
|
|
15
15
|
"dependencies": {
|
|
16
|
-
"axios": "^1.7.
|
|
16
|
+
"axios": "^1.7.9",
|
|
17
17
|
"cheerio": "^1.0.0",
|
|
18
|
-
"he": "^1.2.0"
|
|
18
|
+
"he": "^1.2.0",
|
|
19
|
+
"puppeteer": "^24.2.0",
|
|
20
|
+
"puppeteer-cluster": "^0.24.0"
|
|
19
21
|
},
|
|
20
22
|
"publishConfig": {
|
|
21
23
|
"registry": "https://registry.npmjs.org/",
|
package/readme.md
CHANGED
package/src/defaultConfig.js
CHANGED
|
@@ -6,7 +6,7 @@ export const DEFAULT_CONFIG = {
|
|
|
6
6
|
'https://crawler-test.com/'
|
|
7
7
|
],
|
|
8
8
|
INCLUDE_URLS: [
|
|
9
|
-
'https://crawler-test.com
|
|
9
|
+
'https://crawler-test.com/'
|
|
10
10
|
],
|
|
11
11
|
ALLOWED_CONTENT_TYPES: [
|
|
12
12
|
'text/html'
|
|
@@ -33,8 +33,16 @@ export const DEFAULT_CONFIG = {
|
|
|
33
33
|
'header',
|
|
34
34
|
'footer',
|
|
35
35
|
'aside',
|
|
36
|
-
'button'
|
|
36
|
+
'button',
|
|
37
|
+
'[aria-modal]',
|
|
38
|
+
'[role="dialog"]',
|
|
39
|
+
'[role="alert"]',
|
|
40
|
+
'[role="banner"]',
|
|
41
|
+
'[role="form"]',
|
|
42
|
+
'[role="navigation"]',
|
|
43
|
+
'[role="search"]'
|
|
37
44
|
],
|
|
45
|
+
DYNAMIC_CRAWLING: false,
|
|
38
46
|
RETRY_STATUS_CODES: [408, 500, 502, 503, 504],
|
|
39
47
|
REQUEST_TIMEOUT: 3000,
|
|
40
48
|
MAX_REDIRECTS: 2,
|
package/src/scraply.js
CHANGED
|
@@ -3,6 +3,7 @@ import { normalizeURL } from './utils/crawl/url/normalize.js';
|
|
|
3
3
|
import { loadJSON, saveQueue, deleteDataFiles, deleteUntrackedFiles } from './utils/crawl/fileOperations.js';
|
|
4
4
|
import { processURL } from './utils/crawl/url/processor.js';
|
|
5
5
|
import { formatData, saveSortedFormattedJSON } from './utils/format/formatData.js';
|
|
6
|
+
import { initializeCluster, closeCluster } from './utils/crawl/browser/helper.js';
|
|
6
7
|
|
|
7
8
|
let urlData = [];
|
|
8
9
|
let CONFIG = {};
|
|
@@ -52,6 +53,10 @@ const start = async () => {
|
|
|
52
53
|
- Crawl Error Retry Delay: ${CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS}ms
|
|
53
54
|
`);
|
|
54
55
|
|
|
56
|
+
if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) {
|
|
57
|
+
await initializeCluster();
|
|
58
|
+
}
|
|
59
|
+
|
|
55
60
|
let fileNumber = urlData.filter(entry => entry.file).length + 1;
|
|
56
61
|
for await (const entry of urlData) {
|
|
57
62
|
if (!entry.file) {
|
|
@@ -104,6 +109,10 @@ const start = async () => {
|
|
|
104
109
|
console.log(`\nCLEANING UP UNTRACKED FILES...`);
|
|
105
110
|
deleteUntrackedFiles(CONFIG.DATA_FORMATTER.FORMATTED_PATH, generatedFiles); // Delete files not generated during this crawl
|
|
106
111
|
generatedFiles.clear(); // Clear the set to prepare for future crawls
|
|
112
|
+
|
|
113
|
+
if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) {
|
|
114
|
+
await closeCluster();
|
|
115
|
+
}
|
|
107
116
|
};
|
|
108
117
|
|
|
109
118
|
// Main function to be exported and used
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import { Cluster } from 'puppeteer-cluster';
|
|
2
|
+
import { delay } from '../delay.js';
|
|
3
|
+
|
|
4
|
+
let cluster;
|
|
5
|
+
let initializing = false;
|
|
6
|
+
|
|
7
|
+
// Chromium command-line switches passed to every browser the cluster launches.
const CHROMIUM_ARGS = [
  // '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-accelerated-2d-canvas',
  '--disable-gpu',
  // '--single-process',
  '--disable-background-networking',
  '--disable-background-timer-throttling',
  '--disable-breakpad',
  '--disable-client-side-phishing-detection',
  '--disable-default-apps',
  '--disable-extensions',
  '--disable-hang-monitor',
  '--disable-popup-blocking',
  '--disable-prompt-on-repost',
  '--disable-sync',
  '--disable-translate',
  '--metrics-recording-only',
  '--no-first-run',
  '--safebrowsing-disable-auto-update',
  '--enable-automation',
  '--password-store=basic',
  '--use-mock-keychain',
  '--disable-software-rasterizer',
  '--no-zygote',
  '--disable-infobars',
  '--disable-blink-features=AutomationControlled',
  '--disable-component-extensions-with-background-pages',
  '--mute-audio',
  '--window-size=1280,800', // Moderate window size
  '--window-position=0,0',
  '--ignore-certificate-errors',
  '--ignore-certificate-errors-skip-list',
  '--hide-scrollbars',
  '--disable-notifications',
  '--disable-backgrounding-occluded-windows',
  '--disable-features=TranslateUI,BlinkGenPropertyTrees',
  '--disable-ipc-flooding-protection',
  '--disable-renderer-backgrounding',
  '--enable-features=NetworkService,NetworkServiceInProcess',
  '--force-color-profile=srgb'
];

// Requests for these resource types are aborted instead of downloaded.
const SKIPPED_RESOURCE_TYPES = ['image', 'stylesheet', 'font'];

/**
 * Cluster task: navigates `page` to `url` and returns the serialized page
 * together with the headers and status code of the main response.
 *
 * @param {{ page: object, data: { url: string } }} job - Page and job data handed over by the cluster.
 * @returns {Promise<{ content: string, headers: object, statusCode: number }>}
 *   `headers` is `{}` and `statusCode` is `0` when no response could be determined.
 * @throws Rethrows any error raised while loading the page (after logging it).
 */
const crawlPageTask = async ({ page, data: { url } }) => {
  let response;
  try {
    // Remember the response whose URL is exactly the requested one. When the
    // server redirects, this is the 3xx response itself rather than the final page.
    page.on('response', (res) => {
      if (res.url() === url) response = res;
    });

    // Skip downloading resources
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (SKIPPED_RESOURCE_TYPES.includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Possible values for waitUntil: load, domcontentloaded, networkidle0, networkidle2
    const navigationResponse = await page.goto(url, { timeout: 15000, waitUntil: 'domcontentloaded' });
    const content = await page.content();

    // The strict URL comparison above misses when the browser reports the URL in
    // a different textual form; fall back to the response returned by goto()
    // instead of reporting status 0 for a page that actually loaded.
    const mainResponse = response || navigationResponse;
    const headers = mainResponse ? mainResponse.headers() : {};
    const statusCode = mainResponse ? mainResponse.status() : 0;
    return { content, headers, statusCode };
  } catch (error) {
    console.error(`Error in cluster task for URL ${url}:`, error);
    throw error;
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (closeError) {
        console.error('Error closing page:', closeError);
      }
    }
  }
};

/**
 * Launches the shared Puppeteer cluster (once) and registers the crawl task.
 *
 * If a cluster already exists it is returned as-is. If another call is in the
 * middle of launching, this call polls every 100 ms until that launch settles.
 *
 * @returns {Promise<object|undefined>} The module-level cluster; `undefined`
 *   for a waiting caller whose concurrent launch failed.
 * @throws Propagates the error from `Cluster.launch` when the launch fails.
 */
export const initializeCluster = async () => {
  if (!cluster && !initializing) {
    initializing = true;
    try {
      console.log('Initializing Puppeteer cluster...');
      const launched = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: 1, // Lower concurrency for stability
        puppeteerOptions: {
          headless: true,
          args: CHROMIUM_ARGS,
          timeout: 15000
        }
      });

      launched.task(crawlPageTask);
      // Publish the cluster only once its task is registered.
      cluster = launched;
      console.log('Puppeteer cluster initialized!');
    } finally {
      // Always release the flag: if it stayed set after a failed launch, every
      // later caller would wait in the loop below forever.
      initializing = false;
    }
  } else if (initializing) {
    console.log('Cluster is already being initialized, waiting...');
    while (initializing) {
      await delay(100); // Wait for initialization to complete
    }
  }

  return cluster;
};
|
|
111
|
+
|
|
112
|
+
/**
 * Loads `url` through the Puppeteer cluster and returns the result in an
 * axios-like shape.
 *
 * @param {string} url - Page to load.
 * @param {string} selector - Forwarded to the cluster as part of the job data.
 * @returns {Promise<{ data: string, headers: object, status: number }>}
 * @throws {Error} An Error whose `response` property is
 *   `{ status, statusText }` (taken from the underlying error's `statusCode`
 *   and `message`) and whose `cause` is the underlying error.
 */
export const fetchPageContent = async (url, selector) => {
  if (!cluster) await initializeCluster();
  try {
    const { content, headers, statusCode } = await cluster.execute({ url, selector }); // Resolves with the task's return value
    return { data: content, headers, status: statusCode };
  } catch (error) {
    console.error(`Error fetching page content for URL ${url}:`, error);
    // Throw a real Error (stack trace, instanceof checks) instead of a bare
    // object literal, while keeping the `response` shape callers inspect.
    const wrapped = new Error(`Failed to fetch page content for URL ${url}: ${error.message}`, { cause: error });
    wrapped.response = { status: error.statusCode, statusText: error.message };
    throw wrapped;
  }
};
|
|
124
|
+
|
|
125
|
+
/**
 * Waits for the cluster to become idle, closes it and forgets the module-level
 * reference. Does nothing when no cluster exists.
 *
 * The reference is cleared even when `idle()` or `close()` rejects, so a later
 * `initializeCluster()` launches a fresh cluster instead of reusing a broken one.
 *
 * @returns {Promise<void>}
 * @throws Propagates errors from `cluster.idle()` / `cluster.close()`.
 */
export const closeCluster = async () => {
  if (!cluster) return;

  const closing = cluster;
  console.log('Closing Puppeteer cluster...');
  try {
    await closing.idle();
  } finally {
    try {
      // Attempt the close even if idle() failed.
      await closing.close();
      console.log('Puppeteer cluster closed!');
    } finally {
      cluster = null;
    }
  }
};
|
|
134
|
+
|
|
135
|
+
// Handle app termination
|
|
136
|
+
/**
 * Signal handler: closes the Puppeteer cluster, then terminates the process.
 *
 * Exits with code 0 after a clean shutdown. If closing the cluster fails, the
 * error is logged and the process still exits, with code 1, rather than
 * leaving the outcome to an unhandled promise rejection.
 */
const handleAppTermination = async () => {
  let exitCode = 0;
  try {
    await closeCluster();
  } catch (error) {
    console.error('Error closing Puppeteer cluster during shutdown:', error);
    exitCode = 1;
  } finally {
    process.exit(exitCode);
  }
};

// Handle app termination
process.on('SIGINT', handleAppTermination);
process.on('SIGTERM', handleAppTermination);
|
|
@@ -1,35 +1,56 @@
|
|
|
1
1
|
import axios from 'axios';
|
|
2
2
|
import { delay } from '../delay.js';
|
|
3
3
|
import { shouldRetry } from './handlers.js';
|
|
4
|
+
import { fetchPageContent } from '../browser/helper.js';
|
|
5
|
+
// import { normalizeURL } from './normalize.js';
|
|
4
6
|
|
|
5
7
|
export async function fetchURL(url, retries = 2) {
|
|
6
8
|
try {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
});
|
|
9
|
+
let response;
|
|
10
|
+
|
|
11
|
+
if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) { // JavaScript Dynamic Content
|
|
12
|
+
response = await fetchPageContent(url, 'body'); // Returns a custom object with content, headers and status, similar to axios
|
|
12
13
|
|
|
13
|
-
|
|
14
|
+
// Manually handle redirects (Puppeteer doesn't follow them, axios does automatically)
|
|
15
|
+
if (response.status >= 300 && response.status < 400) { // 3xx Redirect
|
|
16
|
+
const redirectUrl = response.headers.location;
|
|
17
|
+
if (redirectUrl) {
|
|
18
|
+
if (CONFIG.CRAWLER.MAX_REDIRECTS <= 0) {
|
|
19
|
+
const error = new Error(`Max redirects reached`);
|
|
20
|
+
error.response = { status: response.status, headers: response.headers, data: response.data };
|
|
21
|
+
throw error;
|
|
22
|
+
}
|
|
23
|
+
// Normalize URL ?
|
|
24
|
+
const newUrl = new URL(redirectUrl, url).href;
|
|
25
|
+
return fetchURL(newUrl, retries - 1);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
14
28
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
29
|
+
// Validate status code (Puppeteer doesn't throw, axios does automatically)
|
|
30
|
+
if (response.status < 200 || response.status >= 300) {
|
|
31
|
+
const error = new Error(`Invalid status code: ${response.status}`);
|
|
32
|
+
error.response = { status: response.status, headers: response.headers, data: response.data };
|
|
33
|
+
throw error;
|
|
34
|
+
}
|
|
35
|
+
} else { // Static Content
|
|
36
|
+
response = await axios.get(url, {
|
|
37
|
+
timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT,
|
|
38
|
+
maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS,
|
|
39
|
+
maxContentLength: CONFIG.CRAWLER.MAX_CONTENT_LENGTH
|
|
40
|
+
});
|
|
41
|
+
}
|
|
19
42
|
|
|
20
|
-
// Validate content type
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
};
|
|
43
|
+
// Validate content type header
|
|
44
|
+
const { 'content-type': contentType } = response.headers;
|
|
45
|
+
if (!contentType) return { error: `Missing Content-Type header`, status: response.status };
|
|
46
|
+
if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) return { error: `Content-Type ${contentType} is not allowed.`, status: response.status };
|
|
24
47
|
|
|
25
48
|
return { data: response.data, status: response.status };
|
|
26
49
|
} catch (error) {
|
|
27
50
|
if (retries > 0 && (await shouldRetry(error))) {
|
|
28
51
|
const retryCount = CONFIG.CRAWLER.MAX_RETRIES - retries + 1;
|
|
29
52
|
console.log(`Retrying (${retryCount}/${CONFIG.CRAWLER.MAX_RETRIES}) -> ${url}`);
|
|
30
|
-
|
|
31
|
-
if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
|
|
32
|
-
|
|
53
|
+
if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
|
|
33
54
|
return fetchURL(url, retries - 1);
|
|
34
55
|
}
|
|
35
56
|
|
|
@@ -2,7 +2,6 @@ import { URL } from 'node:url';
|
|
|
2
2
|
import { normalizeURL } from './normalize.js';
|
|
3
3
|
import { delay } from '../delay.js';
|
|
4
4
|
|
|
5
|
-
// Handle HTML Status Codes HERE!
|
|
6
5
|
export const shouldRetry = async (error) => {
|
|
7
6
|
if (!error.response) return true;
|
|
8
7
|
|
|
@@ -42,21 +41,16 @@ export const shouldRetry = async (error) => {
|
|
|
42
41
|
const shouldIncludeURL = (url) => {
|
|
43
42
|
try {
|
|
44
43
|
const { INITIAL_URLS, INCLUDE_URLS, EXCLUDE_PATTERNS } = CONFIG.CRAWLER;
|
|
45
|
-
|
|
46
44
|
if (INITIAL_URLS.includes(url)) return true;
|
|
47
45
|
|
|
48
46
|
// Pre-compile string patterns into regular expressions for both include and exclude patterns
|
|
49
|
-
const compiledExcludePatterns = EXCLUDE_PATTERNS.map(
|
|
50
|
-
|
|
51
|
-
);
|
|
52
|
-
const compiledIncludePatterns = INCLUDE_URLS.map(pattern =>
|
|
53
|
-
typeof pattern === 'string' ? new RegExp(pattern) : pattern
|
|
54
|
-
);
|
|
47
|
+
const compiledExcludePatterns = EXCLUDE_PATTERNS.map(p => typeof p === 'string' ? new RegExp(p) : p);
|
|
48
|
+
if (compiledExcludePatterns.some(p => p.test(url))) return false;
|
|
55
49
|
|
|
56
|
-
|
|
57
|
-
if (compiledIncludePatterns.some(
|
|
50
|
+
const compiledIncludePatterns = INCLUDE_URLS.map(p => typeof p === 'string' ? new RegExp(p) : p);
|
|
51
|
+
if (compiledIncludePatterns.some(p => p.test(url))) return true;
|
|
58
52
|
|
|
59
|
-
return false; //
|
|
53
|
+
return false; // URL doesn't match any include patterns, it is excluded.
|
|
60
54
|
} catch (error) {
|
|
61
55
|
console.error(`Error processing URL: ${url}`, error);
|
|
62
56
|
return false;
|
|
@@ -2,7 +2,13 @@ export const normalizeURL = (url) => {
|
|
|
2
2
|
const urlObj = new URL(url);
|
|
3
3
|
urlObj.hash = ''; // Remove the fragment part
|
|
4
4
|
urlObj.search = ''; // Remove the query part
|
|
5
|
-
urlObj.
|
|
6
|
-
urlObj.pathname = urlObj.pathname
|
|
5
|
+
urlObj.protocol = 'https:'; // Force HTTPS
|
|
6
|
+
urlObj.pathname = urlObj.pathname.endsWith('/')
|
|
7
|
+
? urlObj.pathname.slice(0, -1)
|
|
8
|
+
: urlObj.pathname; // Remove trailing slashes
|
|
9
|
+
urlObj.pathname = urlObj.pathname === ''
|
|
10
|
+
? '/'
|
|
11
|
+
: urlObj.pathname; // Handle the root URL separately
|
|
12
|
+
urlObj.hostname = urlObj.hostname.replace(/^www\./, ''); // Remove 'www.' prefix
|
|
7
13
|
return urlObj.toString();
|
|
8
14
|
};
|