scraply 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +30 -0
- package/package.json +5 -1
- package/readme.md +89 -4
- package/src/{config.js → defaultConfig.js} +55 -55
- package/src/loadConfig.js +30 -25
- package/src/scraply.js +121 -121
- package/src/utils/crawl/cleanHTML.js +38 -38
- package/src/utils/crawl/delay.js +1 -1
- package/src/utils/crawl/fileOperations.js +37 -37
- package/src/utils/crawl/url/fetch.js +25 -25
- package/src/utils/crawl/url/handlers.js +44 -44
- package/src/utils/crawl/url/normalize.js +8 -8
- package/src/utils/crawl/url/processor.js +44 -44
- package/src/utils/format/formatData.js +56 -56
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Publishes the package to the npm registry.
# Runs on every push to `main` and on every pushed tag matching `v*`.
# NOTE(review): `npm publish` rejects a version that already exists, so a push
# to `main` that does not bump "version" in package.json will fail this job.
# Confirm that triggering on `main` (and not only on tags) is intended.
name: Publish to NPM

on:
  push:
    branches:
      - main
    tags:
      - 'v*'

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20.17'
          # Configures npm to authenticate against this registry with NODE_AUTH_TOKEN.
          registry-url: 'https://registry.npmjs.org/'

      - name: Install dependencies
        run: npm install

      - name: Publish to npm
        run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scraply",
|
|
3
3
|
"description": "A simple, configurable and functional content scraper",
|
|
4
|
-
"version": "1.0.2",
|
|
4
|
+
"version": "1.0.4",
|
|
5
5
|
"main": "src/scraply.js",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"scripts": {
|
|
@@ -16,5 +16,9 @@
|
|
|
16
16
|
"axios": "^1.7.7",
|
|
17
17
|
"cheerio": "^1.0.0",
|
|
18
18
|
"he": "^1.2.0"
|
|
19
|
+
},
|
|
20
|
+
"publishConfig": {
|
|
21
|
+
"registry": "https://registry.npmjs.org/",
|
|
22
|
+
"access": "public"
|
|
19
23
|
}
|
|
20
24
|
}
|
package/readme.md
CHANGED
|
@@ -1,4 +1,89 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
# Scraply
|
|
2
|
+
Scraply is a customizable and efficient web crawler and data scraper for Node.js, designed to handle various web crawling needs with ease. You can define the URLs to crawl, configure patterns to include/exclude, and format the output data in JSON. Scraply is built to be flexible, with user-configurable settings and dynamic paths.
|
|
3
|
+
|
|
4
|
+
Bug Reports & Dev Stuff on: [Scraply's GitHub](https://github.com/pauserratgutierrez/scraply)
|
|
5
|
+
|
|
6
|
+
## Installation
|
|
7
|
+
Using npm:
|
|
8
|
+
``npm install scraply``
|
|
9
|
+
|
|
10
|
+
## Working Example
|
|
11
|
+
Initialize Scraply with provided URLs to start crawling:
|
|
12
|
+
```
|
|
13
|
+
import { scraply } from 'scraply';
|
|
14
|
+
scraply({
|
|
15
|
+
CRAWLER: {
|
|
16
|
+
INITIAL_URLS: ['https://example.com']
|
|
17
|
+
}
|
|
18
|
+
});
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## How Scraply Works
|
|
22
|
+
### Persistent Data Storage
|
|
23
|
+
Scraply persistently saves the state of the crawler in JSON files (the queue, crawled data, etc.). If the crawler is interrupted or rate-limited, all progress is saved, and the crawler will automatically stop. When restarted, Scraply resumes crawling exactly where it left off, without reprocessing already crawled URLs.
|
|
24
|
+
|
|
25
|
+
### Handling Rate Limiting
|
|
26
|
+
Scraply is designed to handle rate-limiting gracefully. If the crawler encounters rate-limited responses (e.g., status code `429`), it stops processing further requests and saves everything in the queue. Once restarted, it resumes the crawling process from where it stopped.
|
|
27
|
+
|
|
28
|
+
This makes Scraply ideal for long-running, continuous crawling tasks. You can integrate Scraply with GitHub Actions or other CI/CD pipelines to perform endless crawling jobs over time. Simply schedule Scraply to run periodically, and it will continue gathering data without duplicating work.
|
|
29
|
+
|
|
30
|
+
### Integration with GitHub Actions
|
|
31
|
+
Scraply can be easily integrated into a GitHub Action workflow for continuous, long-running crawling tasks. You can set it up to crawl for a set duration or number of URLs, persistently saving the progress, and then resuming where it left off on the next run.
|
|
32
|
+
|
|
33
|
+
## Config Options
|
|
34
|
+
Scraply allows you to pass a configuration object to the main `scraply()` function to customize the crawling behavior. Below are the current configuration options:
|
|
35
|
+
```
|
|
36
|
+
MAIN_DIR: 'dataset',
|
|
37
|
+
|
|
38
|
+
CRAWLER: {
|
|
39
|
+
INITIAL_URLS: [
|
|
40
|
+
'https://crawler-test.com/'
|
|
41
|
+
],
|
|
42
|
+
INCLUDE_URLS: [
|
|
43
|
+
'https://crawler-test.com/.*'
|
|
44
|
+
],
|
|
45
|
+
ALLOWED_CONTENT_TYPES: [
|
|
46
|
+
'text/html'
|
|
47
|
+
],
|
|
48
|
+
EXCLUDE_PATTERNS: [
|
|
49
|
+
'/cdn-cgi/',
|
|
50
|
+
/\.(zip|rar|webp|png|jpg|jpeg|gif|mp3|mp4|pdf|css|js|svg|ico|eot|ttf|woff|woff2|otf|webm|ogg|wav|flac|m4a|mkv|mov|avi|wmv|flv|swf|exe|msi|dmg|iso|bin)$/,
|
|
51
|
+
],
|
|
52
|
+
DOM_ELEMENTS_REMOVE: [
|
|
53
|
+
'script',
|
|
54
|
+
'noscript',
|
|
55
|
+
'style',
|
|
56
|
+
'meta',
|
|
57
|
+
'link',
|
|
58
|
+
'svg',
|
|
59
|
+
'path',
|
|
60
|
+
'img',
|
|
61
|
+
'input',
|
|
62
|
+
'textarea',
|
|
63
|
+
'embed',
|
|
64
|
+
'object',
|
|
65
|
+
'iframe',
|
|
66
|
+
'nav',
|
|
67
|
+
'header',
|
|
68
|
+
'footer',
|
|
69
|
+
'aside',
|
|
70
|
+
'button'
|
|
71
|
+
],
|
|
72
|
+
RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
|
|
73
|
+
REQUEST_TIMEOUT: 4000,
|
|
74
|
+
MAX_REDIRECTS: 3,
|
|
75
|
+
MAX_RETRIES: 2,
|
|
76
|
+
CRAWL_DELAY_MS: 200,
|
|
77
|
+
CRAWL_ERROR_RETRY_DELAY_MS: 800,
|
|
78
|
+
},
|
|
79
|
+
|
|
80
|
+
DATA_FORMATTER: {
|
|
81
|
+
EXCLUDED_PATTERNS: [],
|
|
82
|
+
CATEGORISED_PATHS: {
|
|
83
|
+
'https://crawler-test.com': {
|
|
84
|
+
'mobile': 'mobile.json',
|
|
85
|
+
'fallback': 'general.json'
|
|
86
|
+
},
|
|
87
|
+
},
|
|
88
|
+
}
|
|
89
|
+
```
|
|
@@ -1,55 +1,55 @@
|
|
|
1
|
-
export const DEFAULT_CONFIG = {
|
|
2
|
-
MAIN_DIR: 'dataset',
|
|
3
|
-
|
|
4
|
-
CRAWLER: {
|
|
5
|
-
INITIAL_URLS: [
|
|
6
|
-
'https://crawler-test.com/'
|
|
7
|
-
],
|
|
8
|
-
INCLUDE_URLS: [
|
|
9
|
-
'https://crawler-test.com/.*'
|
|
10
|
-
],
|
|
11
|
-
ALLOWED_CONTENT_TYPES: [
|
|
12
|
-
'text/html'
|
|
13
|
-
],
|
|
14
|
-
EXCLUDE_PATTERNS: [
|
|
15
|
-
'/cdn-cgi/',
|
|
16
|
-
/\.(zip|rar|webp|png|jpg|jpeg|gif|mp3|mp4|pdf|css|js|svg|ico|eot|ttf|woff|woff2|otf|webm|ogg|wav|flac|m4a|mkv|mov|avi|wmv|flv|swf|exe|msi|dmg|iso|bin)$/,
|
|
17
|
-
],
|
|
18
|
-
DOM_ELEMENTS_REMOVE: [
|
|
19
|
-
'script',
|
|
20
|
-
'noscript',
|
|
21
|
-
'style',
|
|
22
|
-
'meta',
|
|
23
|
-
'link',
|
|
24
|
-
'svg',
|
|
25
|
-
'path',
|
|
26
|
-
'img',
|
|
27
|
-
'input',
|
|
28
|
-
'textarea',
|
|
29
|
-
'embed',
|
|
30
|
-
'object',
|
|
31
|
-
'iframe',
|
|
32
|
-
'nav',
|
|
33
|
-
'header',
|
|
34
|
-
'footer',
|
|
35
|
-
'aside',
|
|
36
|
-
'button'
|
|
37
|
-
],
|
|
38
|
-
RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
|
|
39
|
-
REQUEST_TIMEOUT: 4000,
|
|
40
|
-
MAX_REDIRECTS: 3,
|
|
41
|
-
MAX_RETRIES: 2,
|
|
42
|
-
CRAWL_DELAY_MS: 200,
|
|
43
|
-
CRAWL_ERROR_RETRY_DELAY_MS: 800,
|
|
44
|
-
},
|
|
45
|
-
|
|
46
|
-
DATA_FORMATTER: {
|
|
47
|
-
EXCLUDED_PATTERNS: [],
|
|
48
|
-
CATEGORISED_PATHS: {
|
|
49
|
-
'https://crawler-test.com': {
|
|
50
|
-
'mobile': 'mobile.json',
|
|
51
|
-
'fallback': 'general.json'
|
|
52
|
-
},
|
|
53
|
-
},
|
|
54
|
-
}
|
|
55
|
-
};
|
|
1
|
+
/**
 * Built-in settings. loadConfig() merges the caller's configuration over
 * these values, so every key here acts as the fallback for that option.
 */
export const DEFAULT_CONFIG = {
  // Base directory; loadConfig() derives the queue, crawled, formatted and
  // error-report paths from it.
  MAIN_DIR: 'dataset',

  CRAWLER: {
    // URLs used to seed an empty queue.
    INITIAL_URLS: ['https://crawler-test.com/'],
    INCLUDE_URLS: ['https://crawler-test.com/.*'],
    // A response is accepted when its Content-Type contains one of these.
    ALLOWED_CONTENT_TYPES: ['text/html'],
    EXCLUDE_PATTERNS: [
      '/cdn-cgi/',
      /\.(zip|rar|webp|png|jpg|jpeg|gif|mp3|mp4|pdf|css|js|svg|ico|eot|ttf|woff|woff2|otf|webm|ogg|wav|flac|m4a|mkv|mov|avi|wmv|flv|swf|exe|msi|dmg|iso|bin)$/,
    ],
    // Joined into one selector; matching elements are removed before the
    // page text is extracted.
    DOM_ELEMENTS_REMOVE: [
      'script', 'noscript', 'style', 'meta', 'link', 'svg',
      'path', 'img', 'input', 'textarea', 'embed', 'object',
      'iframe', 'nav', 'header', 'footer', 'aside', 'button',
    ],
    RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
    REQUEST_TIMEOUT: 4000, // passed to axios as `timeout`
    MAX_REDIRECTS: 3, // passed to axios as `maxRedirects`
    MAX_RETRIES: 2,
    CRAWL_DELAY_MS: 200,
    CRAWL_ERROR_RETRY_DELAY_MS: 800, // pause before a failed request is retried
  },

  DATA_FORMATTER: {
    EXCLUDED_PATTERNS: [],
    CATEGORISED_PATHS: {
      'https://crawler-test.com': {
        mobile: 'mobile.json',
        fallback: 'general.json',
      },
    },
  },
};
|
package/src/loadConfig.js
CHANGED
|
@@ -1,25 +1,30 @@
|
|
|
1
|
-
import path from 'node:path';
|
|
2
|
-
import { DEFAULT_CONFIG } from './config.js';
|
|
3
|
-
|
|
4
|
-
// A utility function to perform a deep merge of objects
|
|
5
|
-
function deepMerge(target, source) {
|
|
6
|
-
for (const key in source) {
|
|
7
|
-
if (source[key] instanceof Object && key in target) {
|
|
8
|
-
Object.assign(source[key], deepMerge(target[key], source[key]));
|
|
9
|
-
}
|
|
10
|
-
}
|
|
11
|
-
return { ...target, ...source };
|
|
12
|
-
};
|
|
13
|
-
|
|
14
|
-
export function loadConfig(userConfig = {}) {
|
|
15
|
-
// Merge the user config with the default config
|
|
16
|
-
const config = deepMerge(DEFAULT_CONFIG, userConfig);
|
|
17
|
-
|
|
18
|
-
// Dynamically construct paths using MAIN_DIR
|
|
19
|
-
config.CRAWLER.QUEUE_PATH = path.join(config.MAIN_DIR, 'queue.json');
|
|
20
|
-
config.CRAWLER.CRAWLED_PATH = path.join(config.MAIN_DIR, 'crawled');
|
|
21
|
-
config.DATA_FORMATTER.FORMATTED_PATH = path.join(config.MAIN_DIR, 'formatted');
|
|
22
|
-
config.DATA_FORMATTER.ERROR_REPORT_PATH = path.join(config.MAIN_DIR, 'error-report.json');
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { DEFAULT_CONFIG } from './defaultConfig.js';
|
|
3
|
+
|
|
4
|
+
// A utility function to perform a deep merge of objects
|
|
5
|
+
/**
 * Recursively merges `source` over `target` and returns a new object.
 *
 * Only plain objects are merged key by key. Any other value found in `source`
 * (array, RegExp, primitive, null) replaces the value from `target` wholesale,
 * so a user-supplied list such as `['script']` is never padded with the
 * trailing entries of a longer default list.
 *
 * Neither argument is mutated, and nested plain objects in the result are
 * fresh copies, so writing to the result cannot leak into `target`.
 *
 * @param {object} target - Base values (for example the defaults).
 * @param {object} source - Overrides; wins on conflicts.
 * @returns {object} A new merged object.
 */
function deepMerge(target, source) {
  const isPlainObject = (value) =>
    Object.prototype.toString.call(value) === '[object Object]';
  const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key);

  const base = isPlainObject(target) ? target : {};
  const overrides = isPlainObject(source) ? source : {};
  const merged = {};

  // Base keys first, then keys that only exist in the overrides.
  for (const key of new Set([...Object.keys(base), ...Object.keys(overrides)])) {
    // Assigning to `__proto__` would swap the result's prototype instead of adding a key.
    if (key === '__proto__') continue;

    const overridden = hasOwn(overrides, key);
    const value = overridden ? overrides[key] : base[key];

    merged[key] = isPlainObject(value)
      ? deepMerge(isPlainObject(base[key]) ? base[key] : {}, overridden ? value : {})
      : value;
  }

  return merged;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Builds the effective configuration for a run.
 *
 * The user's values are merged over DEFAULT_CONFIG, the storage paths are
 * derived from MAIN_DIR, and INCLUDE_URLS falls back to one `<url>.*` pattern
 * per initial URL when the caller did not provide any.
 *
 * @param {object} [userConfig={}] - Partial configuration supplied by the caller.
 * @returns {object} The merged configuration including the derived paths.
 */
export function loadConfig(userConfig = {}) {
  // Decide this before merging: once merged, the config always carries the
  // default INCLUDE_URLS, so "not specified by the caller" can no longer be
  // told apart and the fallback below would never apply.
  const userIncludeUrls = userConfig?.CRAWLER?.INCLUDE_URLS;
  const includeUrlsSpecified = Boolean(userIncludeUrls) && userIncludeUrls.length !== 0;

  // Merge the user config with the default config
  const merged = deepMerge(DEFAULT_CONFIG, userConfig ?? {});

  // Copy the nested sections so the derived values below are never written
  // onto DEFAULT_CONFIG or onto the caller's own object.
  const config = {
    ...merged,
    CRAWLER: { ...merged.CRAWLER },
    DATA_FORMATTER: { ...merged.DATA_FORMATTER },
  };

  // Dynamically construct paths using MAIN_DIR
  config.CRAWLER.QUEUE_PATH = path.join(config.MAIN_DIR, 'queue.json');
  config.CRAWLER.CRAWLED_PATH = path.join(config.MAIN_DIR, 'crawled');
  config.DATA_FORMATTER.FORMATTED_PATH = path.join(config.MAIN_DIR, 'formatted');
  config.DATA_FORMATTER.ERROR_REPORT_PATH = path.join(config.MAIN_DIR, 'error-report.json');

  // If INCLUDE_URLS is not specified, derive it from INITIAL_URLS.
  // NOTE(review): the URL is appended with `.*` without escaping; the default
  // pattern suggests these are regular-expression sources — confirm how the
  // consumer matches them.
  if (!includeUrlsSpecified) {
    config.CRAWLER.INCLUDE_URLS = config.CRAWLER.INITIAL_URLS.map((url) => `${url}.*`);
  }

  return config;
}
|
package/src/scraply.js
CHANGED
|
@@ -1,121 +1,121 @@
|
|
|
1
|
-
import { loadConfig } from './loadConfig.js';
|
|
2
|
-
import { normalizeURL } from './utils/crawl/url/normalize.js';
|
|
3
|
-
import { loadJSON, saveQueue, deleteDataFiles } from './utils/crawl/fileOperations.js';
|
|
4
|
-
import { processURL } from './utils/crawl/url/processor.js';
|
|
5
|
-
import { formatData, saveSortedFormattedJSON, saveHardcodedExtraLinks } from './utils/format/formatData.js';
|
|
6
|
-
|
|
7
|
-
let urlData = [];
|
|
8
|
-
let urlMetadata = {};
|
|
9
|
-
let CONFIG = {};
|
|
10
|
-
|
|
11
|
-
const
|
|
12
|
-
urlData = loadJSON(CONFIG.CRAWLER.QUEUE_PATH);
|
|
13
|
-
|
|
14
|
-
if (urlData.length === 0) { // If the queue is empty, start fresh with the initial URLs.
|
|
15
|
-
console.log(`Starting fresh! No URLs found in ${CONFIG.CRAWLER.QUEUE_PATH}\n`);
|
|
16
|
-
|
|
17
|
-
CONFIG.CRAWLER.INITIAL_URLS.forEach(url => {
|
|
18
|
-
const normalizedURL = normalizeURL(url);
|
|
19
|
-
urlData.push({ url: normalizedURL, file: null, status: null, error: null });
|
|
20
|
-
});
|
|
21
|
-
saveQueue(urlData);
|
|
22
|
-
} else { // If the queue is not empty
|
|
23
|
-
const allProcessed = urlData.every(entry => entry.file !== null || entry.error !== null);
|
|
24
|
-
if (allProcessed) { // If all URLs have been processed
|
|
25
|
-
console.log(`All URLs in ${CONFIG.CRAWLER.QUEUE_PATH} have been processed. Deleting persistent storage and starting a fresh Crawl...\n`);
|
|
26
|
-
|
|
27
|
-
urlData = [];
|
|
28
|
-
urlMetadata = {};
|
|
29
|
-
|
|
30
|
-
// Delete everything except CONFIG.DATA_FORMATTER.FORMATTED_PATH, so that the formatted data is always preserved until the crawler really finalizes the data. This way, the Discord Bot will fetch the correct & latest data from the GitHub repo, without fetching any incomplete data or empty data, as it watches for file diffs!
|
|
31
|
-
deleteDataFiles(CONFIG.CRAWLER.QUEUE_PATH);
|
|
32
|
-
deleteDataFiles(CONFIG.CRAWLER.CRAWLED_PATH);
|
|
33
|
-
deleteDataFiles(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH);
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
} else { // If there are URLs that haven't been processed yet, resume from the queue.
|
|
37
|
-
console.log(`Resuming from ${CONFIG.CRAWLER.QUEUE_PATH} with ${urlData.length} total found URLs\n`);
|
|
38
|
-
}
|
|
39
|
-
}
|
|
40
|
-
};
|
|
41
|
-
|
|
42
|
-
const
|
|
43
|
-
console.log(`STARTING CRAWLER
|
|
44
|
-
- Initial URLs: ${CONFIG.CRAWLER.INITIAL_URLS}
|
|
45
|
-
- Include URLs: ${CONFIG.CRAWLER.INCLUDE_URLS}
|
|
46
|
-
- Excluded Patterns: ${CONFIG.CRAWLER.EXCLUDE_PATTERNS}
|
|
47
|
-
- Allowed Content Types: ${CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES}
|
|
48
|
-
- Retry Status Codes: ${CONFIG.CRAWLER.RETRY_STATUS_CODES}
|
|
49
|
-
- Request Timeout: ${CONFIG.CRAWLER.REQUEST_TIMEOUT}
|
|
50
|
-
- Max Redirects: ${CONFIG.CRAWLER.MAX_REDIRECTS}
|
|
51
|
-
- Max Retries: ${CONFIG.CRAWLER.MAX_RETRIES}
|
|
52
|
-
- Crawl Delay: ${CONFIG.CRAWLER.CRAWL_DELAY_MS}ms
|
|
53
|
-
- Crawl Error Retry Delay: ${CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS}ms
|
|
54
|
-
`);
|
|
55
|
-
|
|
56
|
-
let fileNumber = urlData.filter(entry => entry.file).length + 1;
|
|
57
|
-
for await (const entry of urlData) {
|
|
58
|
-
if (!entry.file) { // Only process URLs that haven't been processed yet.
|
|
59
|
-
await processURL(entry, fileNumber, urlData, urlMetadata);
|
|
60
|
-
fileNumber++; // Increment the file number only if the URL was processed successfully.
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
const totalUrls = urlData.length;
|
|
65
|
-
const crawledUrls = urlData.filter(entry => entry.file !== null).length;
|
|
66
|
-
const notCrawledUrls = totalUrls - crawledUrls;
|
|
67
|
-
const errorUrls = urlData.filter(entry => entry.error !== null);
|
|
68
|
-
|
|
69
|
-
console.log(`\nCRAWLING COMPLETED! ${crawledUrls} of ${totalUrls} (${notCrawledUrls} not crawled)`);
|
|
70
|
-
|
|
71
|
-
// Iterate over all the urlData and save all the url & content to files, categorized by CONFIG.DATA_FORMATTER.CATEGORISED_PATHS. Exclude the URLs that match the patterns in CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS. Save in CONFIG.DATA_FORMATTER.FORMATTED_PATH.
|
|
72
|
-
console.log(`\nFORMATTING DATA...`);
|
|
73
|
-
const dataToSave = {};
|
|
74
|
-
|
|
75
|
-
for await (const entry of urlData) {
|
|
76
|
-
const savePath = formatData(entry);
|
|
77
|
-
if (savePath) { // If the URL should be saved.
|
|
78
|
-
if (!dataToSave[savePath]) dataToSave[savePath] = [];
|
|
79
|
-
|
|
80
|
-
// Load content from the file referenced by entry.file
|
|
81
|
-
let content = null;
|
|
82
|
-
try {
|
|
83
|
-
content = loadJSON(entry.file);
|
|
84
|
-
dataToSave[savePath].push({ url: entry.url, content: content.content });
|
|
85
|
-
} catch (e) {
|
|
86
|
-
console.error(`Error loading content from ${entry.file}: ${e.message}`);
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
};
|
|
90
|
-
|
|
91
|
-
// Save the data to files.
|
|
92
|
-
let totalSavedURLs = 0;
|
|
93
|
-
for (const [savePath, data] of Object.entries(dataToSave)) {
|
|
94
|
-
totalSavedURLs += data.length;
|
|
95
|
-
console.log(`${data.length} -> ${savePath}`);
|
|
96
|
-
saveSortedFormattedJSON(savePath, data);
|
|
97
|
-
};
|
|
98
|
-
console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
99
|
-
|
|
100
|
-
// Save hardcoded extra links to files.
|
|
101
|
-
await saveHardcodedExtraLinks();
|
|
102
|
-
console.log(`Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
103
|
-
|
|
104
|
-
// Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
|
|
105
|
-
const errorData = errorUrls.map(entry => {
|
|
106
|
-
return { url: entry.url, status: entry.status, error: entry.error };
|
|
107
|
-
});
|
|
108
|
-
|
|
109
|
-
saveSortedFormattedJSON(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH, errorData);
|
|
110
|
-
|
|
111
|
-
console.log(`Errors: ${errorData.length} -> ${CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH}.`);
|
|
112
|
-
};
|
|
113
|
-
|
|
114
|
-
// Main function to be exported and used
|
|
115
|
-
export const scraply = async (userConfig = {}) => {
|
|
116
|
-
CONFIG = loadConfig(userConfig);
|
|
117
|
-
global.CONFIG = CONFIG;
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
await
|
|
121
|
-
};
|
|
1
|
+
import { loadConfig } from './loadConfig.js';
|
|
2
|
+
import { normalizeURL } from './utils/crawl/url/normalize.js';
|
|
3
|
+
import { loadJSON, saveQueue, deleteDataFiles } from './utils/crawl/fileOperations.js';
|
|
4
|
+
import { processURL } from './utils/crawl/url/processor.js';
|
|
5
|
+
import { formatData, saveSortedFormattedJSON, saveHardcodedExtraLinks } from './utils/format/formatData.js';
|
|
6
|
+
|
|
7
|
+
let urlData = [];
|
|
8
|
+
let urlMetadata = {};
|
|
9
|
+
let CONFIG = {};
|
|
10
|
+
|
|
11
|
+
/**
 * Prepares the module-level queue (`urlData`) before crawling.
 *
 * - No persisted queue: seed it with the configured initial URLs and save it.
 * - Persisted queue with unfinished entries: keep it, so the crawl resumes.
 * - Persisted queue where every entry has a file or an error: wipe the
 *   persisted crawl state and start over.
 */
const init = () => {
  const queuePath = CONFIG.CRAWLER.QUEUE_PATH;
  urlData = loadJSON(queuePath);

  // Empty queue: start fresh with the initial URLs.
  if (urlData.length === 0) {
    console.log(`Starting fresh! No URLs found in ${queuePath}\n`);

    for (const initialURL of CONFIG.CRAWLER.INITIAL_URLS) {
      urlData.push({ url: normalizeURL(initialURL), file: null, status: null, error: null });
    }
    saveQueue(urlData);
    return;
  }

  // An entry is pending while it has neither a stored file nor an error.
  const hasPending = urlData.some((entry) => entry.file === null && entry.error === null);
  if (hasPending) {
    console.log(`Resuming from ${queuePath} with ${urlData.length} total found URLs\n`);
    return;
  }

  console.log(`All URLs in ${queuePath} have been processed. Deleting persistent storage and starting a fresh Crawl...\n`);

  urlData = [];
  urlMetadata = {};

  // FORMATTED_PATH is deliberately left in place: the previously formatted
  // data stays available (and unchanged for anything watching file diffs)
  // until the new crawl has produced its final output.
  deleteDataFiles(queuePath);
  deleteDataFiles(CONFIG.CRAWLER.CRAWLED_PATH);
  deleteDataFiles(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH);

  // The queue file is gone now, so this call takes the "start fresh" path.
  init();
};
|
|
41
|
+
|
|
42
|
+
/**
 * Crawls every queue entry that has no stored file yet, then writes the
 * formatted output files and the error report.
 *
 * Operates on the module-level `urlData`, `urlMetadata` and `CONFIG`.
 *
 * @returns {Promise<void>}
 */
const start = async () => {
  console.log(`STARTING SCRAPLY CRAWLER...
- Initial URLs: ${CONFIG.CRAWLER.INITIAL_URLS}
- Include URLs: ${CONFIG.CRAWLER.INCLUDE_URLS}
- Excluded Patterns: ${CONFIG.CRAWLER.EXCLUDE_PATTERNS}
- Allowed Content Types: ${CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES}
- Retry Status Codes: ${CONFIG.CRAWLER.RETRY_STATUS_CODES}
- Request Timeout: ${CONFIG.CRAWLER.REQUEST_TIMEOUT}
- Max Redirects: ${CONFIG.CRAWLER.MAX_REDIRECTS}
- Max Retries: ${CONFIG.CRAWLER.MAX_RETRIES}
- Crawl Delay: ${CONFIG.CRAWLER.CRAWL_DELAY_MS}ms
- Crawl Error Retry Delay: ${CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS}ms
`);

  // Pick the first file number that cannot collide with a file from an earlier
  // run. The counter below advances after failed URLs too, so on resume the
  // number of stored files can be lower than the highest number already used;
  // starting from the count alone could overwrite an existing file.
  // Presumably entry.file ends in `<number>.json` (the name saveDataset
  // builds); entries that do not match fall back to the plain count.
  const storedCount = urlData.filter((entry) => entry.file).length;
  const highestUsedNumber = urlData.reduce((highest, entry) => {
    const match = typeof entry.file === 'string' ? /(\d+)\.json$/.exec(entry.file) : null;
    return match ? Math.max(highest, Number(match[1])) : highest;
  }, 0);
  let fileNumber = Math.max(storedCount, highestUsedNumber) + 1;

  for (const entry of urlData) {
    if (!entry.file) { // Only process URLs that have no stored file yet.
      await processURL(entry, fileNumber, urlData, urlMetadata);
      fileNumber++; // Advanced after every attempt, so a number is never handed out twice.
    }
  }

  const totalUrls = urlData.length;
  const crawledUrls = urlData.filter((entry) => entry.file !== null).length;
  const notCrawledUrls = totalUrls - crawledUrls;
  const errorUrls = urlData.filter((entry) => entry.error !== null);

  console.log(`\nCRAWLING COMPLETED! ${crawledUrls} of ${totalUrls} (${notCrawledUrls} not crawled)`);

  // Group url + content by the output path formatData() picks for each entry;
  // entries for which it returns nothing are left out.
  console.log(`\nFORMATTING DATA...`);
  const dataToSave = {};

  for (const entry of urlData) {
    const savePath = formatData(entry);
    if (savePath) { // If the URL should be saved.
      if (!dataToSave[savePath]) dataToSave[savePath] = [];

      // Load content from the file referenced by entry.file
      try {
        const content = loadJSON(entry.file);
        dataToSave[savePath].push({ url: entry.url, content: content.content });
      } catch (e) {
        console.error(`Error loading content from ${entry.file}: ${e.message}`);
      }
    }
  }

  // Save the data to files.
  let totalSavedURLs = 0;
  for (const [savePath, data] of Object.entries(dataToSave)) {
    totalSavedURLs += data.length;
    console.log(`${data.length} -> ${savePath}`);
    saveSortedFormattedJSON(savePath, data);
  }
  console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);

  // Save hardcoded extra links to files.
  await saveHardcodedExtraLinks();
  console.log(`Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);

  // Error report: url, status code and error of every entry that failed.
  const errorData = errorUrls.map((entry) => ({
    url: entry.url,
    status: entry.status,
    error: entry.error,
  }));

  saveSortedFormattedJSON(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH, errorData);

  console.log(`Errors: ${errorData.length} -> ${CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH}.`);
};
|
|
113
|
+
|
|
114
|
+
// Main function to be exported and used
|
|
115
|
+
/**
 * Public entry point: resolves the configuration, prepares the queue and runs
 * the crawl.
 *
 * @param {object} [userConfig={}] - Partial configuration merged over the defaults.
 * @returns {Promise<void>} Resolves once crawling and formatting have finished.
 */
export const scraply = async (userConfig = {}) => {
  const resolvedConfig = loadConfig(userConfig);

  CONFIG = resolvedConfig;
  // Also published on the global object: the helper modules reference a bare
  // `CONFIG` without importing it.
  global.CONFIG = resolvedConfig;

  init();
  await start();
};
|
|
@@ -1,38 +1,38 @@
|
|
|
1
|
-
import he from 'he';
|
|
2
|
-
|
|
3
|
-
export const cleanHTML = ($) => {
|
|
4
|
-
// Remove unwanted elements
|
|
5
|
-
const $aux = $;
|
|
6
|
-
$aux(CONFIG.CRAWLER.DOM_ELEMENTS_REMOVE.join(',')).remove();
|
|
7
|
-
$aux('*').contents().filter((_, el) => el.type === 'comment').remove();
|
|
8
|
-
|
|
9
|
-
// Get the text content of the body and decode HTML entities
|
|
10
|
-
// let bodyText = he.decode($aux('body').text(), { level: 'all' });
|
|
11
|
-
|
|
12
|
-
// Get the text content of the body element, ensuring spaces between child elements
|
|
13
|
-
let bodyText = getTextWithSpaces($aux, $aux('body'));
|
|
14
|
-
|
|
15
|
-
// Decode HTML entities
|
|
16
|
-
bodyText = he.decode(bodyText, { level: 'all' });
|
|
17
|
-
|
|
18
|
-
// Clean up the resulting text
|
|
19
|
-
return bodyText
|
|
20
|
-
.replace(/\n/g, ' ') // Replace newlines with a space
|
|
21
|
-
.replace(/\\['"\\]/g, match => match.slice(1)) // Replace escaped characters with the unescaped character
|
|
22
|
-
.replace(/[\u200B\u00A0\u2028\u2029\u202F\u00AD\u2060\uFEFF]/g, ' ') // Replace zero-width spaces with a space
|
|
23
|
-
.replace(/\s{2,}/g, ' ') // Replace multiple spaces with a single space
|
|
24
|
-
.trim();
|
|
25
|
-
};
|
|
26
|
-
|
|
27
|
-
// Custom function to get text content with spaces between elements
|
|
28
|
-
const getTextWithSpaces = ($, element) => {
|
|
29
|
-
let text = '';
|
|
30
|
-
element.contents().each((_, el) => {
|
|
31
|
-
if (el.type === 'text') {
|
|
32
|
-
text += $(el).text() + ' ';
|
|
33
|
-
} else if (el.type === 'tag') {
|
|
34
|
-
text += getTextWithSpaces($, $(el));
|
|
35
|
-
}
|
|
36
|
-
});
|
|
37
|
-
return text;
|
|
38
|
-
};
|
|
1
|
+
import he from 'he';
|
|
2
|
+
|
|
3
|
+
/**
 * Reduces a loaded document to a single line of plain text.
 *
 * Removes the elements listed in CONFIG.CRAWLER.DOM_ELEMENTS_REMOVE and all
 * comment nodes (this mutates the document behind `$`), collects the body
 * text, decodes HTML entities and normalises whitespace.
 *
 * @param {Function} $ - Document query function (presumably a cheerio instance — TODO confirm).
 * @returns {string} The cleaned text of the body.
 */
export const cleanHTML = ($) => {
  // Drop unwanted elements and comment nodes.
  const removalSelector = CONFIG.CRAWLER.DOM_ELEMENTS_REMOVE.join(',');
  $(removalSelector).remove();
  $('*')
    .contents()
    .filter((_, node) => node.type === 'comment')
    .remove();

  // Body text with a space after each text node, then entity-decoded.
  const rawText = getTextWithSpaces($, $('body'));
  const decodedText = he.decode(rawText, { level: 'all' });

  // Newlines become spaces.
  const singleLine = decodedText.replace(/\n/g, ' ');
  // \' \" and \\ lose their leading backslash.
  const unescaped = singleLine.replace(/\\['"\\]/g, (match) => match.slice(1));
  // Invisible and special space/separator characters become plain spaces.
  const plainSpaces = unescaped.replace(/[\u200B\u00A0\u2028\u2029\u202F\u00AD\u2060\uFEFF]/g, ' ');
  // Runs of whitespace collapse to one space.
  return plainSpaces.replace(/\s{2,}/g, ' ').trim();
};
|
|
26
|
+
|
|
27
|
+
// Custom function to get text content with spaces between elements
|
|
28
|
+
/**
 * Collects the text below `element`, appending a space after every text node
 * so that words from adjacent elements do not run together.
 *
 * Text nodes contribute their text; tag nodes are descended into; every other
 * node type is ignored.
 *
 * @param {Function} $ - Wraps a raw node so `.text()` / `.contents()` can be called on it.
 * @param {object} element - Wrapped element whose contents are walked.
 * @returns {string} The concatenated text.
 */
const getTextWithSpaces = ($, element) => {
  const pieces = [];

  element.contents().each((_, node) => {
    switch (node.type) {
      case 'text':
        pieces.push(`${$(node).text()} `);
        break;
      case 'tag':
        pieces.push(getTextWithSpaces($, $(node)));
        break;
      default:
        break;
    }
  });

  return pieces.join('');
};
|
package/src/utils/crawl/delay.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export const delay = async (ms) => new Promise(resolve => setTimeout(resolve, ms));
|
|
1
|
+
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Time to wait in milliseconds.
 * @returns {Promise<void>}
 */
export const delay = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
|
|
@@ -1,37 +1,37 @@
|
|
|
1
|
-
import fs from 'node:fs';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
|
|
4
|
-
export const loadJSON = (filePath) => fs.existsSync(filePath) ? JSON.parse(fs.readFileSync(filePath, 'utf8')) : [];
|
|
5
|
-
|
|
6
|
-
export const saveJSON = (filePath, data) => {
|
|
7
|
-
const dir = path.dirname(filePath);
|
|
8
|
-
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
9
|
-
return fs.writeFileSync(filePath, JSON.stringify(data, null, 2), 'utf8');
|
|
10
|
-
};
|
|
11
|
-
|
|
12
|
-
export const saveDataset = (data, fileNumber) => {
|
|
13
|
-
if (!fs.existsSync(CONFIG.CRAWLER.CRAWLED_PATH)) fs.mkdirSync(CONFIG.CRAWLER.CRAWLED_PATH, { recursive: true });
|
|
14
|
-
const filename = `${CONFIG.CRAWLER.CRAWLED_PATH}/${fileNumber}.json`;
|
|
15
|
-
fs.writeFileSync(filename, JSON.stringify(data, null, 2), 'utf8');
|
|
16
|
-
return filename;
|
|
17
|
-
};
|
|
18
|
-
|
|
19
|
-
export const saveQueue = (urlData) => saveJSON(CONFIG.CRAWLER.QUEUE_PATH, urlData);
|
|
20
|
-
|
|
21
|
-
export const deleteDataFiles = (filePath) => {
|
|
22
|
-
if (fs.existsSync(filePath)) {
|
|
23
|
-
if (fs.lstatSync(filePath).isDirectory()) {
|
|
24
|
-
fs.readdirSync(filePath).forEach((file) => {
|
|
25
|
-
const currentPath = path.join(filePath, file);
|
|
26
|
-
if (fs.lstatSync(currentPath).isDirectory()) {
|
|
27
|
-
deleteDataFiles(currentPath);
|
|
28
|
-
} else {
|
|
29
|
-
fs.unlinkSync(currentPath);
|
|
30
|
-
}
|
|
31
|
-
});
|
|
32
|
-
fs.rmdirSync(filePath);
|
|
33
|
-
} else {
|
|
34
|
-
fs.unlinkSync(filePath);
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
};
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
|
|
4
|
+
/**
 * Reads and parses a JSON file.
 *
 * @param {string} filePath - File to read.
 * @returns {*} The parsed content, or an empty array when nothing exists at `filePath`.
 * @throws {SyntaxError} When the file exists but does not contain valid JSON.
 */
export const loadJSON = (filePath) => {
  if (!fs.existsSync(filePath)) {
    return [];
  }

  const raw = fs.readFileSync(filePath, 'utf8');
  return JSON.parse(raw);
};
|
|
5
|
+
|
|
6
|
+
/**
 * Writes `data` to `filePath` as pretty-printed (2-space) JSON, creating the
 * parent directories when they are missing.
 *
 * @param {string} filePath - Destination file.
 * @param {*} data - JSON-serialisable value.
 * @returns {undefined} The result of fs.writeFileSync.
 */
export const saveJSON = (filePath, data) => {
  // With `recursive: true` this is a no-op for an existing directory, so no
  // separate existence check (and no check-then-create race) is needed.
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  return fs.writeFileSync(filePath, JSON.stringify(data, null, 2), 'utf8');
};
|
|
11
|
+
|
|
12
|
+
/**
 * Stores one crawled page as `<CRAWLED_PATH>/<fileNumber>.json`
 * (pretty-printed, 2-space JSON), creating the directory when it is missing.
 *
 * @param {*} data - JSON-serialisable page data.
 * @param {number|string} fileNumber - Used as the file's base name.
 * @returns {string} The path of the file that was written.
 */
export const saveDataset = (data, fileNumber) => {
  const crawledDir = CONFIG.CRAWLER.CRAWLED_PATH;

  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(crawledDir, { recursive: true });

  const filename = `${crawledDir}/${fileNumber}.json`;
  fs.writeFileSync(filename, JSON.stringify(data, null, 2), 'utf8');
  return filename;
};
|
|
18
|
+
|
|
19
|
+
/**
 * Persists the crawl queue to CONFIG.CRAWLER.QUEUE_PATH.
 *
 * @param {Array} urlData - The queue entries to store.
 * @returns {undefined} Whatever saveJSON returns.
 */
export const saveQueue = (urlData) => {
  const queuePath = CONFIG.CRAWLER.QUEUE_PATH;
  return saveJSON(queuePath, urlData);
};
|
|
20
|
+
|
|
21
|
+
/**
 * Deletes a file, or a directory together with everything inside it.
 * A missing or empty path is a no-op.
 *
 * @param {string} filePath - File or directory to remove.
 * @returns {void}
 */
export const deleteDataFiles = (filePath) => {
  if (!filePath) return;

  // `recursive` removes directory trees; `force` ignores a path that does not exist.
  fs.rmSync(filePath, { recursive: true, force: true });
};
|
|
@@ -1,25 +1,25 @@
|
|
|
1
|
-
import axios from 'axios';
|
|
2
|
-
import { delay } from '../delay.js';
|
|
3
|
-
import { shouldRetry } from './handlers.js';
|
|
4
|
-
|
|
5
|
-
export const fetchURL = async (url, retries = 2) => {
|
|
6
|
-
try {
|
|
7
|
-
const response = await axios.get(url, { timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT, maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS });
|
|
8
|
-
const contentType = response.headers['content-type'];
|
|
9
|
-
|
|
10
|
-
if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) {
|
|
11
|
-
return { error: `Content-Type ${contentType} is not allowed.`, status: response.status };
|
|
12
|
-
};
|
|
13
|
-
|
|
14
|
-
return { data: response.data, status: response.status };
|
|
15
|
-
} catch (error) {
|
|
16
|
-
if (retries > 0 && shouldRetry(error)) {
|
|
17
|
-
console.log(`Retrying (${CONFIG.CRAWLER.MAX_RETRIES - retries + 1}/${CONFIG.CRAWLER.MAX_RETRIES}) -> ${url}`);
|
|
18
|
-
if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
|
|
19
|
-
return fetchURL(url, retries - 1);
|
|
20
|
-
} else {
|
|
21
|
-
console.error(`Failed to fetch ${url} -> ${error.message}`);
|
|
22
|
-
return { error: error.message, status: error.response ? error.response.status : null };
|
|
23
|
-
};
|
|
24
|
-
};
|
|
25
|
-
};
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import { delay } from '../delay.js';
|
|
3
|
+
import { shouldRetry } from './handlers.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Fetch a URL with axios, rejecting responses whose Content-Type is not allowed
 * and retrying failed requests.
 *
 * Reads `REQUEST_TIMEOUT`, `MAX_REDIRECTS`, `ALLOWED_CONTENT_TYPES`, `MAX_RETRIES`
 * and `CRAWL_ERROR_RETRY_DELAY_MS` from `CONFIG.CRAWLER`.
 *
 * @param {string} url - Address to request.
 * @param {number} [retries=2] - Remaining retry attempts.
 * @returns {Promise<{data: *, status: number} | {error: string, status: (number|null)}>}
 *   `data` + `status` on success; otherwise an `error` message with the HTTP status
 *   when one was received. Request failures are reported this way, not thrown.
 */
export const fetchURL = async (url, retries = 2) => {
  try {
    const response = await axios.get(url, {
      timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT,
      maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS,
    });
    const contentType = response.headers['content-type'];

    // The header may be absent. Check its type first so a missing header is
    // reported as "not allowed" instead of throwing into the catch below, where
    // it would be mistaken for a network failure and the request re-sent.
    const isAllowedType = typeof contentType === 'string'
      && CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some((type) => contentType.includes(type));

    if (!isAllowedType) {
      return { error: `Content-Type ${contentType} is not allowed.`, status: response.status };
    }

    return { data: response.data, status: response.status };
  } catch (error) {
    if (retries > 0 && shouldRetry(error)) {
      console.log(`Retrying (${CONFIG.CRAWLER.MAX_RETRIES - retries + 1}/${CONFIG.CRAWLER.MAX_RETRIES}) -> ${url}`);
      if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
      return fetchURL(url, retries - 1);
    }

    console.error(`Failed to fetch ${url} -> ${error.message}`);
    return { error: error.message, status: error.response ? error.response.status : null };
  }
};
|
|
@@ -1,44 +1,44 @@
|
|
|
1
|
-
import { URL } from 'node:url';
|
|
2
|
-
import { normalizeURL } from './normalize.js';
|
|
3
|
-
|
|
4
|
-
// Handle HTML Status Codes HERE!
|
|
5
|
-
export const shouldRetry = (error) => {
|
|
6
|
-
if (!error.response) return true;
|
|
7
|
-
if (error.response.status === 429) {
|
|
8
|
-
const waitTime = error.response.headers ? error.response.headers['retry-after'] : null;
|
|
9
|
-
if (waitTime) {
|
|
10
|
-
console.log(`Rate limited for ${waitTime} seconds, exiting Crawler...`);
|
|
11
|
-
} else {
|
|
12
|
-
console.log(`Rate limited, no retry-after header found, exiting Crawler...`);
|
|
13
|
-
}
|
|
14
|
-
process.exit(10); // GitHub Actions Docker uses values ranged from 0 to 255, so any bigger value will be % 256!
|
|
15
|
-
}
|
|
16
|
-
return CONFIG.CRAWLER.RETRY_STATUS_CODES.includes(error.response.status); // Retry only on specific status codes
|
|
17
|
-
};
|
|
18
|
-
|
|
19
|
-
const shouldIncludeURL = (url) => {
|
|
20
|
-
try {
|
|
21
|
-
const urlObj = new URL(url);
|
|
22
|
-
return CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => new RegExp(pattern).test(urlObj.toString())) && !CONFIG.CRAWLER.EXCLUDE_PATTERNS.some(pattern => new RegExp(pattern).test(urlObj.pathname));
|
|
23
|
-
} catch (error) {
|
|
24
|
-
return false;
|
|
25
|
-
}
|
|
26
|
-
};
|
|
27
|
-
|
|
28
|
-
export const enqueueURLs = (urlData, urlMetadata, $, baseURL, referrer, depth) => {
|
|
29
|
-
$('a[href]').each((_, element) => {
|
|
30
|
-
const href = $(element).attr('href');
|
|
31
|
-
if (!href) return;
|
|
32
|
-
|
|
33
|
-
try {
|
|
34
|
-
const newURL = new URL(href, baseURL).toString();
|
|
35
|
-
const normalizedURL = normalizeURL(newURL);
|
|
36
|
-
if (shouldIncludeURL(normalizedURL) && !urlData.some(entry => entry.url === normalizedURL)) {
|
|
37
|
-
urlData.push({ url: normalizedURL, file: null, status: null, error: null });
|
|
38
|
-
urlMetadata[normalizedURL] = { referrer, depth };
|
|
39
|
-
}
|
|
40
|
-
} catch (error) {
|
|
41
|
-
console.error(`Failed to enqueue URL: ${href} from ${baseURL}: ${error.message}`);
|
|
42
|
-
}
|
|
43
|
-
});
|
|
44
|
-
};
|
|
1
|
+
import { URL } from 'node:url';
|
|
2
|
+
import { normalizeURL } from './normalize.js';
|
|
3
|
+
|
|
4
|
+
// Handle HTTP Status Codes HERE!
|
|
5
|
+
/**
 * Decide whether a failed request is worth retrying.
 *
 * - No `response` on the error: retry.
 * - HTTP 429: log the rate-limit notice and terminate the process with exit code 10.
 * - Anything else: retry only when the status is listed in
 *   `CONFIG.CRAWLER.RETRY_STATUS_CODES`.
 *
 * @param {{response?: {status: *, headers?: Object}}} error - Error-like object.
 * @returns {boolean} `true` when the request should be retried.
 */
export const shouldRetry = (error) => {
  const { response } = error;
  if (!response) return true;

  if (response.status === 429) {
    const retryAfter = response.headers ? response.headers['retry-after'] : null;
    const notice = retryAfter
      ? `Rate limited for ${retryAfter} seconds, exiting Crawler...`
      : `Rate limited, no retry-after header found, exiting Crawler...`;
    console.log(notice);
    process.exit(10); // GitHub Actions Docker uses values ranged from 0 to 255, so any bigger value will be % 256!
  }

  // Retry only on specific status codes
  return CONFIG.CRAWLER.RETRY_STATUS_CODES.includes(response.status);
};
|
|
18
|
+
|
|
19
|
+
/**
 * Check a URL against the crawler's include/exclude configuration.
 *
 * A URL qualifies when its full string matches at least one
 * `CONFIG.CRAWLER.INCLUDE_URLS` pattern and its pathname matches none of the
 * `CONFIG.CRAWLER.EXCLUDE_PATTERNS`.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} `false` also when the URL (or a pattern) cannot be parsed.
 */
const shouldIncludeURL = (url) => {
  const matches = (pattern, subject) => new RegExp(pattern).test(subject);

  try {
    const parsed = new URL(url);

    const isIncluded = CONFIG.CRAWLER.INCLUDE_URLS.some((pattern) => matches(pattern, parsed.toString()));
    if (!isIncluded) return false;

    const isExcluded = CONFIG.CRAWLER.EXCLUDE_PATTERNS.some((pattern) => matches(pattern, parsed.pathname));
    return !isExcluded;
  } catch (error) {
    return false;
  }
};
|
|
27
|
+
|
|
28
|
+
/**
 * Collect the links of a parsed page and append the new, crawlable ones to the queue.
 *
 * Every `a[href]` is resolved against `baseURL`, normalised, and added to `urlData`
 * when it passes `shouldIncludeURL` and is not queued yet. `urlMetadata` records the
 * referrer and depth for each added URL. Both arguments are mutated in place.
 *
 * @param {Array<{url: string}>} urlData - Crawl queue; new entries are pushed onto it.
 * @param {Object} urlMetadata - Map of URL to `{ referrer, depth }`.
 * @param {Function} $ - Selector function for the loaded document (called with
 *   `'a[href]'` and with each matched element).
 * @param {string} baseURL - Base used to resolve relative links.
 * @param {*} referrer - Stored as the referrer of every URL added.
 * @param {*} depth - Stored as the depth of every URL added.
 * @returns {void}
 */
export const enqueueURLs = (urlData, urlMetadata, $, baseURL, referrer, depth) => {
  // Index the queue once so the duplicate check does not rescan it for every link.
  const queuedURLs = new Set(urlData.map((entry) => entry.url));

  $('a[href]').each((_, element) => {
    const href = $(element).attr('href');
    if (!href) return;

    try {
      const newURL = new URL(href, baseURL).toString();
      const normalizedURL = normalizeURL(newURL);
      if (queuedURLs.has(normalizedURL) || !shouldIncludeURL(normalizedURL)) return;

      queuedURLs.add(normalizedURL);
      urlData.push({ url: normalizedURL, file: null, status: null, error: null });
      urlMetadata[normalizedURL] = { referrer, depth };
    } catch (error) {
      // One malformed link must not stop the rest of the page from being processed.
      console.error(`Failed to enqueue URL: ${href} from ${baseURL}: ${error.message}`);
    }
  });
};
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
export const normalizeURL = (url) => {
|
|
2
|
-
const urlObj = new URL(url);
|
|
3
|
-
urlObj.hash = ''; // Remove the fragment part
|
|
4
|
-
urlObj.search = ''; // Remove the query part
|
|
5
|
-
urlObj.pathname = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slashes
|
|
6
|
-
urlObj.pathname = urlObj.pathname === '' ? '/' : urlObj.pathname; // Handle the root URL separately
|
|
7
|
-
return urlObj.toString();
|
|
8
|
-
};
|
|
1
|
+
/**
 * Reduce a URL to a canonical form so equivalent links compare equal.
 *
 * Drops the fragment and the query string and strips every trailing slash from
 * the path (the root path stays `/`). Normalising an already normalised URL
 * returns it unchanged.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The normalised URL.
 * @throws {TypeError} When `url` cannot be parsed.
 */
export const normalizeURL = (url) => {
  const urlObj = new URL(url);
  urlObj.hash = ''; // Remove the fragment part
  urlObj.search = ''; // Remove the query part

  // Strip ALL trailing slashes, not just one: "/a//" would otherwise become
  // "/a/" and only reach "/a" on a second pass.
  const { pathname } = urlObj;
  let end = pathname.length;
  while (end > 0 && pathname[end - 1] === '/') end -= 1;

  // Handle the root URL separately: an empty path means "/".
  urlObj.pathname = end === 0 ? '/' : pathname.slice(0, end);
  return urlObj.toString();
};
|
|
@@ -1,44 +1,44 @@
|
|
|
1
|
-
import { delay } from '../delay.js';
|
|
2
|
-
import { cleanHTML } from '../cleanHTML.js';
|
|
3
|
-
|
|
4
|
-
import * as cheerio from 'cheerio';
|
|
5
|
-
import { shouldRetry, enqueueURLs } from './handlers.js';
|
|
6
|
-
import { fetchURL } from './fetch.js';
|
|
7
|
-
import { saveDataset, saveQueue } from '../fileOperations.js';
|
|
8
|
-
|
|
9
|
-
export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
|
|
10
|
-
if (entry.file || (entry.error && !shouldRetry({ response: { status: entry.status } }))) return;
|
|
11
|
-
|
|
12
|
-
console.log(`- ${fileNumber}/${urlData.length} -> ${entry.url}`);
|
|
13
|
-
|
|
14
|
-
const { url } = entry;
|
|
15
|
-
const { referrer, depth } = urlMetadata[url] || { referrer: null, depth: 0 }; // Default depth is 0.
|
|
16
|
-
|
|
17
|
-
const startTime = new Date().getTime();
|
|
18
|
-
try {
|
|
19
|
-
const result = await fetchURL(url, CONFIG.CRAWLER.MAX_RETRIES);
|
|
20
|
-
if (result && result.data) {
|
|
21
|
-
const { data: html, status } = result;
|
|
22
|
-
const $ = cheerio.load(html);
|
|
23
|
-
enqueueURLs(urlData, urlMetadata, $, url, url, depth + 1);
|
|
24
|
-
const content = cleanHTML($);
|
|
25
|
-
const filename = saveDataset({ url, referrerURL: referrer, statusCode: status, depth, content }, fileNumber);
|
|
26
|
-
entry.file = filename;
|
|
27
|
-
entry.status = status;
|
|
28
|
-
entry.error = null;
|
|
29
|
-
} else {
|
|
30
|
-
entry.error = result.error;
|
|
31
|
-
entry.status = result.status;
|
|
32
|
-
}
|
|
33
|
-
} catch (error) {
|
|
34
|
-
entry.error = error.message;
|
|
35
|
-
entry.status = null;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
saveQueue(urlData);
|
|
39
|
-
|
|
40
|
-
const endTime = new Date().getTime();
|
|
41
|
-
const elapsedTime = endTime - startTime;
|
|
42
|
-
|
|
43
|
-
if (CONFIG.CRAWLER.CRAWL_DELAY_MS > 0 && elapsedTime < CONFIG.CRAWLER.CRAWL_DELAY_MS) await delay(CONFIG.CRAWLER.CRAWL_DELAY_MS - elapsedTime);
|
|
44
|
-
};
|
|
1
|
+
import { delay } from '../delay.js';
|
|
2
|
+
import { cleanHTML } from '../cleanHTML.js';
|
|
3
|
+
|
|
4
|
+
import * as cheerio from 'cheerio';
|
|
5
|
+
import { shouldRetry, enqueueURLs } from './handlers.js';
|
|
6
|
+
import { fetchURL } from './fetch.js';
|
|
7
|
+
import { saveDataset, saveQueue } from '../fileOperations.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Crawl one queue entry: fetch it, enqueue its links, store the cleaned content,
 * record the outcome on the entry and persist the queue.
 *
 * Entries that already have a `file`, or that carry an error whose status
 * `shouldRetry` rejects, are skipped without any work. After processing, the
 * function waits out whatever remains of `CONFIG.CRAWLER.CRAWL_DELAY_MS`.
 *
 * @param {{url: string, file: *, status: *, error: *}} entry - Queue entry, updated in place.
 * @param {number|string} fileNumber - Passed to `saveDataset` and used in the progress log.
 * @param {Array} urlData - The whole queue (extended by `enqueueURLs`, saved by `saveQueue`).
 * @param {Object} urlMetadata - Map of URL to `{ referrer, depth }`.
 * @returns {Promise<void>}
 */
export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
  if (entry.file) return;
  if (entry.error && !shouldRetry({ response: { status: entry.status } })) return;

  console.log(`- ${fileNumber}/${urlData.length} -> ${entry.url}`);

  const { url } = entry;
  const metadata = urlMetadata[url] || { referrer: null, depth: 0 }; // Default depth is 0.
  const { referrer, depth } = metadata;

  const startTime = Date.now();
  try {
    const result = await fetchURL(url, CONFIG.CRAWLER.MAX_RETRIES);

    if (result && result.data) {
      const { data: html, status } = result;
      const $ = cheerio.load(html);
      enqueueURLs(urlData, urlMetadata, $, url, url, depth + 1);

      const record = { url, referrerURL: referrer, statusCode: status, depth, content: cleanHTML($) };
      entry.file = saveDataset(record, fileNumber);
      entry.status = status;
      entry.error = null;
    } else {
      entry.error = result.error;
      entry.status = result.status;
    }
  } catch (error) {
    entry.error = error.message;
    entry.status = null;
  }

  saveQueue(urlData);

  // Only sleep for the part of the crawl delay this request has not already used.
  const elapsedTime = Date.now() - startTime;
  const crawlDelay = CONFIG.CRAWLER.CRAWL_DELAY_MS;
  if (crawlDelay > 0 && elapsedTime < crawlDelay) await delay(crawlDelay - elapsedTime);
};
|
|
@@ -1,56 +1,56 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
|
|
4
|
-
export const formatData = (entry) => {
|
|
5
|
-
if (entry.file && entry.error === null) {
|
|
6
|
-
try {
|
|
7
|
-
const url = new URL(entry.url);
|
|
8
|
-
const pathname = url.pathname;
|
|
9
|
-
const isExcluded = CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS.some(pattern => new RegExp(pattern).test(entry.url));
|
|
10
|
-
|
|
11
|
-
if (!isExcluded) {
|
|
12
|
-
const categorisedPath = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.[pathname.split('/')[1]] || CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.fallback;
|
|
13
|
-
if (categorisedPath) {
|
|
14
|
-
return path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
|
|
15
|
-
}
|
|
16
|
-
}
|
|
17
|
-
} catch (e) {
|
|
18
|
-
console.error(`Error formatting data for ${entry.url}: ${e.message}`);
|
|
19
|
-
}
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
return null;
|
|
23
|
-
};
|
|
24
|
-
|
|
25
|
-
// Sort data consistently to always save it in the same order between each run, so GitHub doesn't show a diff for the same data.
|
|
26
|
-
function sortData(data, sortKey) {
|
|
27
|
-
return data.sort((a, b) => {
|
|
28
|
-
if (a[sortKey] < b[sortKey]) return -1;
|
|
29
|
-
if (a[sortKey] > b[sortKey]) return 1;
|
|
30
|
-
return 0;
|
|
31
|
-
});
|
|
32
|
-
};
|
|
33
|
-
|
|
34
|
-
export const saveSortedFormattedJSON = (filePath, data) => {
|
|
35
|
-
const dir = path.dirname(filePath);
|
|
36
|
-
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
37
|
-
const sortedData = sortData(data, 'url'); // ensure data is sorted before saving
|
|
38
|
-
return fs.writeFileSync(filePath, JSON.stringify(sortedData, null, 2), 'utf8');
|
|
39
|
-
};
|
|
40
|
-
|
|
41
|
-
export const saveHardcodedExtraLinks = async () => {
|
|
42
|
-
const data = {
|
|
43
|
-
file_name: 'cs-links.json',
|
|
44
|
-
data: [
|
|
45
|
-
{
|
|
46
|
-
"url": "https://elemn.to/ai",
|
|
47
|
-
"content": "🧠 AI - How to save time - This page provides valuable insights on how to leverage AI tools for optimizing workflows and saving time across various tasks."
|
|
48
|
-
},
|
|
49
|
-
],
|
|
50
|
-
};
|
|
51
|
-
|
|
52
|
-
const filePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, data.file_name);
|
|
53
|
-
saveSortedFormattedJSON(filePath, data.data);
|
|
54
|
-
|
|
55
|
-
return data.data.length;
|
|
56
|
-
};
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
|
|
4
|
+
/**
 * Work out where the formatted data for a crawled entry should be stored.
 *
 * Only entries with a `file` and `error === null` qualify. The entry's URL must not
 * match any `CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS`; its first path segment is then
 * looked up in `CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[<origin>]`, falling back to
 * that origin's `fallback` entry.
 *
 * @param {{url: string, file: *, error: *}} entry - Queue entry.
 * @returns {string|null} Target path under `CONFIG.DATA_FORMATTER.FORMATTED_PATH`,
 *   or `null` when the entry does not qualify, has no category, or fails to parse.
 */
export const formatData = (entry) => {
  if (!entry.file || entry.error !== null) return null;

  try {
    const url = new URL(entry.url);
    const isExcluded = CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS.some((pattern) => new RegExp(pattern).test(entry.url));
    if (isExcluded) return null;

    const originPaths = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin];

    // Own properties only: a path segment such as "constructor" or "toString"
    // must not resolve to an inherited Object.prototype member.
    const ownPath = (key) => (
      originPaths != null && Object.prototype.hasOwnProperty.call(originPaths, key)
        ? originPaths[key]
        : undefined
    );

    const categorisedPath = ownPath(url.pathname.split('/')[1]) || ownPath('fallback');
    if (categorisedPath) {
      return path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
    }
  } catch (e) {
    console.error(`Error formatting data for ${entry.url}: ${e.message}`);
  }

  return null;
};
|
|
24
|
+
|
|
25
|
+
/**
 * Order records by one of their properties, ascending.
 *
 * Sorting keeps the output in the same order between runs, so GitHub doesn't
 * show a diff for the same data. The array is sorted in place and the same
 * array instance is returned.
 *
 * @param {Array<Object>} data - Records to sort (mutated).
 * @param {string} sortKey - Property to compare with `<` / `>`.
 * @returns {Array<Object>} `data`, now sorted.
 */
function sortData(data, sortKey) {
  const compareByKey = (left, right) => {
    const leftValue = left[sortKey];
    const rightValue = right[sortKey];

    if (leftValue < rightValue) return -1;
    return leftValue > rightValue ? 1 : 0;
  };

  return data.sort(compareByKey);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Sort `data` by `url` and write it to `filePath` as pretty-printed JSON,
 * creating the parent directory when needed.
 *
 * @param {string} filePath - Destination file.
 * @param {Array<Object>} data - Records to store; handed to `sortData`.
 * @returns {undefined} The result of `fs.writeFileSync`.
 */
export const saveSortedFormattedJSON = (filePath, data) => {
  const parentDir = path.dirname(filePath);
  if (!fs.existsSync(parentDir)) {
    fs.mkdirSync(parentDir, { recursive: true });
  }

  // ensure data is sorted before saving
  const serialised = JSON.stringify(sortData(data, 'url'), null, 2);
  return fs.writeFileSync(filePath, serialised, 'utf8');
};
|
|
40
|
+
|
|
41
|
+
/**
 * Write the built-in list of extra links to `cs-links.json` inside
 * `CONFIG.DATA_FORMATTER.FORMATTED_PATH`.
 *
 * @returns {Promise<number>} Number of link records written.
 */
export const saveHardcodedExtraLinks = async () => {
  const fileName = 'cs-links.json';
  const links = [
    {
      "url": "https://elemn.to/ai",
      "content": "🧠 AI - How to save time - This page provides valuable insights on how to leverage AI tools for optimizing workflows and saving time across various tasks."
    },
  ];

  const target = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, fileName);
  saveSortedFormattedJSON(target, links);

  return links.length;
};
|