scraply 1.0.14 → 1.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/readme.md +4 -12
- package/src/defaultConfig.js +4 -12
- package/src/loadConfig.js +4 -4
- package/src/scraply.js +3 -14
- package/src/utils/crawl/fileOperations.js +3 -3
- package/src/utils/crawl/url/fetch.js +26 -9
- package/src/utils/crawl/url/handlers.js +31 -11
- package/src/utils/crawl/url/processor.js +6 -7
- package/src/utils/format/formatData.js +1 -1
package/package.json
CHANGED
package/readme.md
CHANGED
|
@@ -76,6 +76,9 @@ CRAWLER: {
|
|
|
76
76
|
MAX_RETRIES: 2,
|
|
77
77
|
CRAWL_DELAY_MS: 200,
|
|
78
78
|
CRAWL_ERROR_RETRY_DELAY_MS: 800,
|
|
79
|
+
CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS: 60000,
|
|
80
|
+
EXIT_CODE_RATE_LIMIT: 10,
|
|
81
|
+
EXIT_ON_RATE_LIMIT: true
|
|
79
82
|
},
|
|
80
83
|
|
|
81
84
|
DATA_FORMATTER: {
|
|
@@ -85,17 +88,6 @@ DATA_FORMATTER: {
|
|
|
85
88
|
'mobile': 'mobile.json',
|
|
86
89
|
'*': 'general.json'
|
|
87
90
|
},
|
|
88
|
-
}
|
|
89
|
-
HARD_CODED_LINKS: [
|
|
90
|
-
{
|
|
91
|
-
file_name: 'hc-links.json',
|
|
92
|
-
data: [
|
|
93
|
-
{
|
|
94
|
-
"url": "https://custom-link.com",
|
|
95
|
-
"content": "That's a custom link content, you can add as many as you want."
|
|
96
|
-
},
|
|
97
|
-
]
|
|
98
|
-
}
|
|
99
|
-
]
|
|
91
|
+
}
|
|
100
92
|
}
|
|
101
93
|
```
|
package/src/defaultConfig.js
CHANGED
|
@@ -41,6 +41,9 @@ export const DEFAULT_CONFIG = {
|
|
|
41
41
|
MAX_RETRIES: 2,
|
|
42
42
|
CRAWL_DELAY_MS: 200,
|
|
43
43
|
CRAWL_ERROR_RETRY_DELAY_MS: 800,
|
|
44
|
+
CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS: 60000,
|
|
45
|
+
EXIT_CODE_RATE_LIMIT: 10,
|
|
46
|
+
EXIT_ON_RATE_LIMIT: true
|
|
44
47
|
},
|
|
45
48
|
|
|
46
49
|
DATA_FORMATTER: {
|
|
@@ -50,17 +53,6 @@ export const DEFAULT_CONFIG = {
|
|
|
50
53
|
'mobile': 'mobile.json',
|
|
51
54
|
'*': 'general.json'
|
|
52
55
|
},
|
|
53
|
-
}
|
|
54
|
-
HARD_CODED_LINKS: [
|
|
55
|
-
// {
|
|
56
|
-
// file_name: 'hc-links.json',
|
|
57
|
-
// data: [
|
|
58
|
-
// {
|
|
59
|
-
// "url": "https://custom-link.com",
|
|
60
|
-
// "content": "That's a custom link content, you can add as many as you want."
|
|
61
|
-
// },
|
|
62
|
-
// ]
|
|
63
|
-
// }
|
|
64
|
-
]
|
|
56
|
+
}
|
|
65
57
|
}
|
|
66
58
|
};
|
package/src/loadConfig.js
CHANGED
|
@@ -16,10 +16,10 @@ export function loadConfig(userConfig = {}) {
|
|
|
16
16
|
const config = deepMerge(DEFAULT_CONFIG, userConfig);
|
|
17
17
|
|
|
18
18
|
// Dynamically construct paths using MAIN_DIR
|
|
19
|
-
config.CRAWLER.QUEUE_PATH = path.join(config.MAIN_DIR, 'queue.json');
|
|
20
|
-
config.CRAWLER.CRAWLED_PATH = path.join(config.MAIN_DIR, 'crawled');
|
|
21
|
-
config.DATA_FORMATTER.FORMATTED_PATH = path.join(config.MAIN_DIR, 'formatted');
|
|
22
|
-
config.DATA_FORMATTER.ERROR_REPORT_PATH = path.join(config.MAIN_DIR, 'error-report.json');
|
|
19
|
+
config.CRAWLER.QUEUE_PATH = path.posix.join(config.MAIN_DIR, 'queue.json');
|
|
20
|
+
config.CRAWLER.CRAWLED_PATH = path.posix.join(config.MAIN_DIR, 'crawled');
|
|
21
|
+
config.DATA_FORMATTER.FORMATTED_PATH = path.posix.join(config.MAIN_DIR, 'formatted');
|
|
22
|
+
config.DATA_FORMATTER.ERROR_REPORT_PATH = path.posix.join(config.MAIN_DIR, 'error-report.json');
|
|
23
23
|
|
|
24
24
|
// If INCLUDE_URLS is not specified, set it to INITIAL_URLS by default
|
|
25
25
|
if (!config.CRAWLER.INCLUDE_URLS || config.CRAWLER.INCLUDE_URLS.length === 0) {
|
package/src/scraply.js
CHANGED
|
@@ -5,7 +5,6 @@ import { processURL } from './utils/crawl/url/processor.js';
|
|
|
5
5
|
import { formatData, saveSortedFormattedJSON } from './utils/format/formatData.js';
|
|
6
6
|
|
|
7
7
|
let urlData = [];
|
|
8
|
-
let urlMetadata = {};
|
|
9
8
|
let CONFIG = {};
|
|
10
9
|
let generatedFiles = new Set(); // Track files generated in the current crawl session.
|
|
11
10
|
|
|
@@ -17,7 +16,7 @@ const init = () => {
|
|
|
17
16
|
|
|
18
17
|
CONFIG.CRAWLER.INITIAL_URLS.forEach(url => {
|
|
19
18
|
const normalizedURL = normalizeURL(url);
|
|
20
|
-
urlData.push({ url: normalizedURL, file: null, status: null, error: null });
|
|
19
|
+
urlData.push({ url: normalizedURL, file: null, status: null, error: null, referrerUrl: null, depth: 0 });
|
|
21
20
|
});
|
|
22
21
|
saveQueue(urlData);
|
|
23
22
|
} else { // If the queue is not empty
|
|
@@ -27,12 +26,10 @@ const init = () => {
|
|
|
27
26
|
|
|
28
27
|
// Reset data for a fresh crawl.
|
|
29
28
|
urlData = [];
|
|
30
|
-
urlMetadata = {};
|
|
31
29
|
|
|
32
30
|
// Delete everything except CONFIG.DATA_FORMATTER.FORMATTED_PATH, so that the formatted data is always preserved until the crawler really finalizes the data.
|
|
33
31
|
deleteDataFiles(CONFIG.CRAWLER.QUEUE_PATH);
|
|
34
32
|
deleteDataFiles(CONFIG.CRAWLER.CRAWLED_PATH);
|
|
35
|
-
deleteDataFiles(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH);
|
|
36
33
|
|
|
37
34
|
init();
|
|
38
35
|
} else { // If there are URLs that haven't been processed yet, resume from the queue.
|
|
@@ -58,7 +55,7 @@ const start = async () => {
|
|
|
58
55
|
let fileNumber = urlData.filter(entry => entry.file).length + 1;
|
|
59
56
|
for await (const entry of urlData) {
|
|
60
57
|
if (!entry.file) {
|
|
61
|
-
const processedFile = await processURL(entry, fileNumber, urlData
|
|
58
|
+
const processedFile = await processURL(entry, fileNumber, urlData);
|
|
62
59
|
if (processedFile) {
|
|
63
60
|
generatedFiles.add(processedFile); // Track the file generated
|
|
64
61
|
}
|
|
@@ -71,7 +68,7 @@ const start = async () => {
|
|
|
71
68
|
const notCrawledUrls = totalUrls - crawledUrls;
|
|
72
69
|
const errorUrls = urlData.filter(entry => entry.error !== null);
|
|
73
70
|
|
|
74
|
-
console.log(`\nCRAWLING COMPLETED! ${crawledUrls} of ${totalUrls} (${notCrawledUrls} not crawled)`);
|
|
71
|
+
console.log(`\nCRAWLING COMPLETED! ${crawledUrls} of ${totalUrls} (${notCrawledUrls} not crawled, ${errorUrls.length} errors)`);
|
|
75
72
|
|
|
76
73
|
// Iterate over all the urlData and save all the url & content to files, categorized by CONFIG.DATA_FORMATTER.CATEGORISED_PATHS. Exclude the URLs that match the patterns in CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS. Save in CONFIG.DATA_FORMATTER.FORMATTED_PATH.
|
|
77
74
|
console.log(`\nFORMATTING DATA...`);
|
|
@@ -103,14 +100,6 @@ const start = async () => {
|
|
|
103
100
|
};
|
|
104
101
|
console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
105
102
|
|
|
106
|
-
// Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
|
|
107
|
-
const errorData = errorUrls.map(entry => {
|
|
108
|
-
return { url: entry.url, status: entry.status, error: entry.error };
|
|
109
|
-
});
|
|
110
|
-
saveSortedFormattedJSON(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH, errorData);
|
|
111
|
-
|
|
112
|
-
console.log(`Errors: ${errorData.length} -> ${CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH}.`);
|
|
113
|
-
|
|
114
103
|
// After formatting data, delete untracked files
|
|
115
104
|
console.log(`\nCLEANING UP UNTRACKED FILES...`);
|
|
116
105
|
deleteUntrackedFiles(CONFIG.DATA_FORMATTER.FORMATTED_PATH, generatedFiles); // Delete files not generated during this crawl
|
|
@@ -11,7 +11,7 @@ export const saveJSON = (filePath, data) => {
|
|
|
11
11
|
|
|
12
12
|
export const saveDataset = (data, fileNumber) => {
|
|
13
13
|
if (!fs.existsSync(CONFIG.CRAWLER.CRAWLED_PATH)) fs.mkdirSync(CONFIG.CRAWLER.CRAWLED_PATH, { recursive: true });
|
|
14
|
-
const filename =
|
|
14
|
+
const filename = path.posix.join(CONFIG.CRAWLER.CRAWLED_PATH, `${fileNumber}.json`);
|
|
15
15
|
fs.writeFileSync(filename, JSON.stringify(data, null, 2), 'utf8');
|
|
16
16
|
return filename;
|
|
17
17
|
};
|
|
@@ -22,7 +22,7 @@ export const deleteDataFiles = (filePath) => {
|
|
|
22
22
|
if (fs.existsSync(filePath)) {
|
|
23
23
|
if (fs.lstatSync(filePath).isDirectory()) {
|
|
24
24
|
fs.readdirSync(filePath).forEach((file) => {
|
|
25
|
-
const currentPath = path.join(filePath, file);
|
|
25
|
+
const currentPath = path.posix.join(filePath, file);
|
|
26
26
|
if (fs.lstatSync(currentPath).isDirectory()) {
|
|
27
27
|
deleteDataFiles(currentPath);
|
|
28
28
|
} else {
|
|
@@ -39,7 +39,7 @@ export const deleteDataFiles = (filePath) => {
|
|
|
39
39
|
export const deleteUntrackedFiles = (folderPath, trackedFiles) => {
|
|
40
40
|
if (fs.existsSync(folderPath)) {
|
|
41
41
|
fs.readdirSync(folderPath).forEach((file) => {
|
|
42
|
-
const currentPath = path.join(folderPath, file);
|
|
42
|
+
const currentPath = path.posix.join(folderPath, file);
|
|
43
43
|
if (fs.lstatSync(currentPath).isDirectory()) {
|
|
44
44
|
deleteUntrackedFiles(currentPath, trackedFiles);
|
|
45
45
|
} else if (!trackedFiles.has(currentPath)) {
|
|
@@ -2,24 +2,41 @@ import axios from 'axios';
|
|
|
2
2
|
import { delay } from '../delay.js';
|
|
3
3
|
import { shouldRetry } from './handlers.js';
|
|
4
4
|
|
|
5
|
-
export
|
|
5
|
+
export async function fetchURL(url, retries = 2) {
|
|
6
6
|
try {
|
|
7
|
-
const response = await axios.get(url, {
|
|
8
|
-
|
|
7
|
+
const response = await axios.get(url, {
|
|
8
|
+
timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT,
|
|
9
|
+
maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS
|
|
10
|
+
});
|
|
9
11
|
|
|
12
|
+
const { 'content-type': contentType } = response.headers;
|
|
13
|
+
|
|
14
|
+
// Validate content type
|
|
10
15
|
if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) {
|
|
11
|
-
return {
|
|
16
|
+
return {
|
|
17
|
+
error: `Content-Type ${contentType} is not allowed.`,
|
|
18
|
+
status: response.status
|
|
19
|
+
};
|
|
12
20
|
};
|
|
13
21
|
|
|
14
|
-
return {
|
|
22
|
+
return {
|
|
23
|
+
data: response.data,
|
|
24
|
+
status: response.status
|
|
25
|
+
};
|
|
15
26
|
} catch (error) {
|
|
16
27
|
if (retries > 0 && shouldRetry(error)) {
|
|
17
|
-
|
|
28
|
+
const retryCount = CONFIG.CRAWLER.MAX_RETRIES - retries + 1;
|
|
29
|
+
console.log(`Retrying (${retryCount}/${CONFIG.CRAWLER.MAX_RETRIES}) -> ${url}`);
|
|
30
|
+
|
|
18
31
|
if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
|
|
32
|
+
|
|
19
33
|
return fetchURL(url, retries - 1);
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
console.error(`Failed to fetch ${url} -> ${error.message}`);
|
|
37
|
+
return {
|
|
38
|
+
error: error.message,
|
|
39
|
+
status: error.response?.status
|
|
23
40
|
};
|
|
24
41
|
};
|
|
25
42
|
};
|
|
@@ -1,19 +1,40 @@
|
|
|
1
1
|
import { URL } from 'node:url';
|
|
2
2
|
import { normalizeURL } from './normalize.js';
|
|
3
|
+
import { delay } from '../delay.js';
|
|
3
4
|
|
|
4
5
|
// Handle HTTP Status Codes HERE!
|
|
5
|
-
export const shouldRetry = (error) => {
|
|
6
|
+
export const shouldRetry = async (error) => {
|
|
6
7
|
if (!error.response) return true;
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
|
|
9
|
+
const { status, headers } = error.response;
|
|
10
|
+
const retryAfter = headers?.['retry-after'];
|
|
11
|
+
const rateLimitReset = headers?.['x-ratelimit-reset'];
|
|
12
|
+
|
|
13
|
+
if (status === 429) {
|
|
14
|
+
let waitTime = null;
|
|
15
|
+
|
|
16
|
+
if (retryAfter) {
|
|
17
|
+
waitTime = isNaN(retryAfter)
|
|
18
|
+
? Math.ceil((new Date(retryAfter).getTime() - Date.now()) / 1000) // HTTP date
|
|
19
|
+
: parseInt(retryAfter, 10); // Seconds
|
|
20
|
+
console.log(`Rate limited. Retrying after ${waitTime} seconds...`);
|
|
21
|
+
} else if (rateLimitReset) {
|
|
22
|
+
waitTime = Math.max(parseInt(rateLimitReset, 10) - Math.floor(Date.now() / 1000), 0);
|
|
23
|
+
console.log(`Rate limited. Retrying after ${waitTime} seconds...`);
|
|
11
24
|
} else {
|
|
12
|
-
|
|
25
|
+
waitTime = CONFIG.CRAWLER.CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS / 1000;
|
|
26
|
+
console.log(`Rate limited. No 'retry-after' or 'x-ratelimit-reset' headers found. Falling back to ${waitTime} seconds...`);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
if (CONFIG.CRAWLER.EXIT_ON_RATE_LIMIT) {
|
|
30
|
+
console.log(`Exiting due to rate limit.`);
|
|
31
|
+
process.exit(CONFIG.CRAWLER.EXIT_CODE_RATE_LIMIT); // GitHub Actions Docker uses values ranged from 0 to 255, so any bigger value will be % 256!
|
|
32
|
+
} else {
|
|
33
|
+
await delay(waitTime * 1000);
|
|
13
34
|
}
|
|
14
|
-
process.exit(10); // GitHub Actions Docker uses values ranged from 0 to 255, so any bigger value will be % 256!
|
|
15
35
|
}
|
|
16
|
-
|
|
36
|
+
|
|
37
|
+
return CONFIG.CRAWLER.RETRY_STATUS_CODES.includes(status); // Retry on the specified status codes
|
|
17
38
|
};
|
|
18
39
|
|
|
19
40
|
const shouldIncludeURL = (url) => {
|
|
@@ -40,7 +61,7 @@ const shouldIncludeURL = (url) => {
|
|
|
40
61
|
}
|
|
41
62
|
};
|
|
42
63
|
|
|
43
|
-
export const enqueueURLs = (urlData,
|
|
64
|
+
export const enqueueURLs = (urlData, $, baseURL, depth) => {
|
|
44
65
|
$('a[href]').each((_, element) => {
|
|
45
66
|
const href = $(element).attr('href');
|
|
46
67
|
if (!href) return;
|
|
@@ -49,8 +70,7 @@ export const enqueueURLs = (urlData, urlMetadata, $, baseURL, referrer, depth) =
|
|
|
49
70
|
const newURL = new URL(href, baseURL).toString();
|
|
50
71
|
const normalizedURL = normalizeURL(newURL);
|
|
51
72
|
if (shouldIncludeURL(normalizedURL) && !urlData.some(entry => entry.url === normalizedURL)) {
|
|
52
|
-
urlData.push({ url: normalizedURL, file: null, status: null, error: null });
|
|
53
|
-
urlMetadata[normalizedURL] = { referrer, depth };
|
|
73
|
+
urlData.push({ url: normalizedURL, file: null, status: null, error: null, referrerUrl: baseURL, depth });
|
|
54
74
|
}
|
|
55
75
|
} catch (error) {
|
|
56
76
|
console.error(`Failed to enqueue URL: ${href} from ${baseURL}: ${error.message}`);
|
|
@@ -5,21 +5,20 @@ import { shouldRetry, enqueueURLs } from './handlers.js';
|
|
|
5
5
|
import { fetchURL } from './fetch.js';
|
|
6
6
|
import { saveDataset, saveQueue } from '../fileOperations.js';
|
|
7
7
|
|
|
8
|
-
export const processURL = async (entry, fileNumber, urlData
|
|
9
|
-
|
|
8
|
+
export const processURL = async (entry, fileNumber, urlData) => {
|
|
9
|
+
const startTime = new Date().getTime();
|
|
10
|
+
const { url, referrer, depth } = entry;
|
|
10
11
|
|
|
12
|
+
if (entry.file || (entry.error && !(await shouldRetry({ response: { status: entry.status } })))) return;
|
|
13
|
+
|
|
11
14
|
console.log(`- ${fileNumber}/${urlData.length} -> ${entry.url}`);
|
|
12
15
|
|
|
13
|
-
const { url } = entry;
|
|
14
|
-
const { referrer, depth } = urlMetadata[url] || { referrer: null, depth: 0 }; // Default depth is 0.
|
|
15
|
-
|
|
16
|
-
const startTime = new Date().getTime();
|
|
17
16
|
try {
|
|
18
17
|
const result = await fetchURL(url, CONFIG.CRAWLER.MAX_RETRIES);
|
|
19
18
|
if (result && result.data) {
|
|
20
19
|
const { data: html, status } = result;
|
|
21
20
|
const $ = cheerio.load(html);
|
|
22
|
-
enqueueURLs(urlData,
|
|
21
|
+
enqueueURLs(urlData, $, url, depth + 1);
|
|
23
22
|
|
|
24
23
|
const content = cleanHTML($);
|
|
25
24
|
const filename = saveDataset({ url, referrerURL: referrer, statusCode: status, depth, content }, fileNumber);
|
|
@@ -24,7 +24,7 @@ export const formatData = (entry) => {
|
|
|
24
24
|
|
|
25
25
|
// Fallback to wildcard match ('*') if no specific path is found
|
|
26
26
|
if (!categorisedPath) categorisedPath = categorisedPaths['*'];
|
|
27
|
-
if (categorisedPath) return path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
|
|
27
|
+
if (categorisedPath) return path.posix.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
|
|
28
28
|
}
|
|
29
29
|
}
|
|
30
30
|
} catch (e) {
|