scraply 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENCE ADDED
File without changes
package/package.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "name": "scraply",
3
+ "description": "A simple, configurable and functional content scraper",
4
+ "version": "1.0.0",
5
+ "main": "src/scraply.js",
6
+ "type": "module",
7
+ "scripts": {
8
+ "start": "node ."
9
+ },
10
+ "keywords": ["crawler", "scraper"],
11
+ "author": "Pau Serrat Gutiérrez",
12
+ "dependencies": {
13
+ "axios": "^1.7.7",
14
+ "cheerio": "^1.0.0",
15
+ "he": "^1.2.0"
16
+ }
17
+ }
package/readme.md ADDED
File without changes
package/src/config.js ADDED
@@ -0,0 +1,55 @@
1
/**
 * Built-in defaults for the crawler and the data formatter.
 *
 * `loadConfig` merges user overrides on top of this object and derives the
 * storage paths from `MAIN_DIR`.
 */
export const DEFAULT_CONFIG = {
  // Root directory under which the queue, crawled pages, formatted output
  // and error report are stored.
  MAIN_DIR: 'dataset',

  CRAWLER: {
    // Seed URLs used when no queue exists yet.
    INITIAL_URLS: ['https://crawler-test.com/'],
    // Patterns (compiled with `new RegExp`) a full URL must match to be enqueued.
    INCLUDE_URLS: ['https://crawler-test.com/.*'],
    // A response is kept only if its Content-Type contains one of these.
    ALLOWED_CONTENT_TYPES: ['text/html'],
    // Patterns tested against the URL pathname; a match rejects the URL.
    EXCLUDE_PATTERNS: [
      '/cdn-cgi/',
      /\.(zip|rar|webp|png|jpg|jpeg|gif|mp3|mp4|pdf|css|js|svg|ico|eot|ttf|woff|woff2|otf|webm|ogg|wav|flac|m4a|mkv|mov|avi|wmv|flv|swf|exe|msi|dmg|iso|bin)$/,
    ],
    // Elements stripped from the DOM before the page text is extracted.
    DOM_ELEMENTS_REMOVE: [
      'script', 'noscript', 'style', 'meta', 'link', 'svg',
      'path', 'img', 'input', 'textarea', 'embed', 'object',
      'iframe', 'nav', 'header', 'footer', 'aside', 'button',
    ],
    // HTTP status codes considered worth retrying.
    RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
    REQUEST_TIMEOUT: 4000,
    MAX_REDIRECTS: 3,
    MAX_RETRIES: 2,
    CRAWL_DELAY_MS: 200,
    CRAWL_ERROR_RETRY_DELAY_MS: 800,
  },

  DATA_FORMATTER: {
    // Patterns tested against the full URL; a match keeps the page out of the output.
    EXCLUDED_PATTERNS: [],
    // origin -> { first path segment -> output file }, with `fallback` as the default file.
    CATEGORISED_PATHS: {
      'https://crawler-test.com': {
        mobile: 'mobile.json',
        fallback: 'general.json',
      },
    },
  },
};
@@ -0,0 +1,25 @@
1
+ import path from 'node:path';
2
+ import { DEFAULT_CONFIG } from './config.js';
3
+
4
/**
 * Tell whether `value` is a plain object (object literal or null-prototype
 * object). Arrays, RegExps, Dates and other class instances are not.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isPlainObject(value) {
  if (value === null || typeof value !== 'object') return false;
  const proto = Object.getPrototypeOf(value);
  return proto === Object.prototype || proto === null;
}

/**
 * Deep-merge `source` over `target` and return a NEW object.
 *
 * Only plain objects are merged key by key. Every other value (arrays,
 * RegExps, primitives, ...) from `source` replaces the value in `target`
 * wholesale, so a user-supplied list overrides the default list instead of
 * being blended with it index by index. Neither argument is mutated.
 *
 * @param {object} target - Base values (e.g. the defaults).
 * @param {object} [source] - Overrides; `null`/`undefined` means "no overrides".
 * @returns {object} The merged result.
 */
function deepMerge(target, source) {
  const merged = { ...target };

  for (const [key, value] of Object.entries(source ?? {})) {
    // Assigning to `__proto__` would swap the prototype of `merged` instead of
    // creating a property, so that key is never merged.
    if (key === '__proto__') continue;

    merged[key] = isPlainObject(value) && isPlainObject(merged[key])
      ? deepMerge(merged[key], value)
      : value;
  }

  return merged;
}
13
+
14
/**
 * Build the effective configuration: user overrides merged over
 * `DEFAULT_CONFIG`, plus storage paths derived from `MAIN_DIR`.
 *
 * The derived paths are written onto fresh copies of the `CRAWLER` and
 * `DATA_FORMATTER` sections. Without the copy, a section the user did not
 * override is the very same object as the one inside `DEFAULT_CONFIG`, and
 * adding the paths would silently modify the exported defaults.
 *
 * @param {object} [userConfig={}] - Partial configuration overriding the defaults.
 * @returns {object} The merged configuration including `CRAWLER.QUEUE_PATH`,
 *   `CRAWLER.CRAWLED_PATH`, `DATA_FORMATTER.FORMATTED_PATH` and
 *   `DATA_FORMATTER.ERROR_REPORT_PATH`.
 */
export function loadConfig(userConfig = {}) {
  const merged = deepMerge(DEFAULT_CONFIG, userConfig);
  const { MAIN_DIR } = merged;

  return {
    ...merged,
    CRAWLER: {
      ...merged.CRAWLER,
      QUEUE_PATH: path.join(MAIN_DIR, 'queue.json'),
      CRAWLED_PATH: path.join(MAIN_DIR, 'crawled'),
    },
    DATA_FORMATTER: {
      ...merged.DATA_FORMATTER,
      FORMATTED_PATH: path.join(MAIN_DIR, 'formatted'),
      ERROR_REPORT_PATH: path.join(MAIN_DIR, 'error-report.json'),
    },
  };
}
package/src/scraply.js ADDED
@@ -0,0 +1,120 @@
1
+ import { loadConfig } from './loadConfig.js';
2
+ import { normalizeURL } from './utils/crawl/url/normalize.js';
3
+ import { loadJSON, saveQueue, deleteDataFiles } from './utils/crawl/fileOperations.js';
4
+ import { processURL } from './utils/crawl/url/processor.js';
5
+ import { formatData, saveSortedFormattedJSON, saveHardcodedExtraLinks } from './utils/format/formatData.js';
6
+
7
// User-supplied overrides; empty here, so the defaults apply unchanged.
const userConfig = {};

// Resolve the effective configuration and publish it as a process-wide
// global: the helper modules read `CONFIG` without importing it.
const CONFIG = loadConfig(userConfig);
globalThis.CONFIG = CONFIG;

// Crawl state shared by initializeCrawler() and app():
//   urlData     - the queue, one { url, file, status, error } entry per URL
//   urlMetadata - per-URL { referrer, depth } recorded when a link is discovered
let urlData = [];
let urlMetadata = {};
15
+
16
/**
 * Load the persisted queue into `urlData` and decide how to start:
 *   - empty queue               -> seed it with the configured initial URLs;
 *   - some entries still pending -> resume from the queue as it is;
 *   - every entry finished       -> wipe the persisted crawl state and start over.
 *
 * An entry counts as finished once it has either a `file` or an `error`.
 */
const initializeCrawler = () => {
  const { QUEUE_PATH, CRAWLED_PATH, INITIAL_URLS } = CONFIG.CRAWLER;

  urlData = loadJSON(QUEUE_PATH);

  // Nothing persisted yet: start fresh from the initial URLs.
  if (urlData.length === 0) {
    console.log(`Starting fresh! No URLs found in ${QUEUE_PATH}\n`);

    for (const url of INITIAL_URLS) {
      urlData.push({ url: normalizeURL(url), file: null, status: null, error: null });
    }
    saveQueue(urlData);
    return;
  }

  // Unfinished work left over from a previous run: resume it.
  const hasPendingEntries = urlData.some(entry => entry.file === null && entry.error === null);
  if (hasPendingEntries) {
    console.log(`Resuming from ${QUEUE_PATH} with ${urlData.length} total found URLs\n`);
    return;
  }

  // The previous crawl ran to completion: reset and crawl again.
  console.log(`All URLs in ${QUEUE_PATH} have been processed. Deleting persistent storage and starting a fresh Crawl...\n`);

  urlData = [];
  urlMetadata = {};

  // CONFIG.DATA_FORMATTER.FORMATTED_PATH is deliberately NOT deleted: the
  // previously formatted data must stay available until this crawl has
  // produced its replacement. Per the original author's note, a Discord bot
  // watches file diffs of that data in the GitHub repo and must never fetch
  // incomplete or empty data.
  deleteDataFiles(QUEUE_PATH);
  deleteDataFiles(CRAWLED_PATH);
  deleteDataFiles(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH);

  initializeCrawler();
};
46
+
47
/**
 * Run the crawl over `urlData`, then write the formatted output files and the
 * error report.
 *
 * Steps:
 *   1. Process every queue entry that has no file yet (the queue may grow
 *      while it is being iterated, as processed pages enqueue new links).
 *   2. Group the crawled pages by the output path chosen by `formatData` and
 *      save each group with `saveSortedFormattedJSON`.
 *   3. Save the hardcoded extra links.
 *   4. Save the error report to CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH.
 *
 * @returns {Promise<void>}
 */
const app = async () => {
  const { CRAWLER, DATA_FORMATTER } = CONFIG;

  console.log([
    'STARTING CRAWLER',
    `- Initial URLs: ${CRAWLER.INITIAL_URLS}`,
    `- Include URLs: ${CRAWLER.INCLUDE_URLS}`,
    `- Excluded Patterns: ${CRAWLER.EXCLUDE_PATTERNS}`,
    `- Allowed Content Types: ${CRAWLER.ALLOWED_CONTENT_TYPES}`,
    `- Retry Status Codes: ${CRAWLER.RETRY_STATUS_CODES}`,
    `- Request Timeout: ${CRAWLER.REQUEST_TIMEOUT}`,
    `- Max Redirects: ${CRAWLER.MAX_REDIRECTS}`,
    `- Max Retries: ${CRAWLER.MAX_RETRIES}`,
    `- Crawl Delay: ${CRAWLER.CRAWL_DELAY_MS}ms`,
    `- Crawl Error Retry Delay: ${CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS}ms`,
    '',
  ].join('\n'));

  // The next file number is "files written so far + 1". It must advance only
  // when a file was actually written: advancing on failed or skipped entries
  // leaves gaps, and after a restart the recomputed number would then point at
  // a file that already exists and overwrite it.
  let fileNumber = urlData.filter(entry => entry.file).length + 1;
  for (const entry of urlData) {
    if (entry.file) continue; // Already crawled.

    await processURL(entry, fileNumber, urlData, urlMetadata);
    if (entry.file) fileNumber++;
  }

  const totalUrls = urlData.length;
  const crawledUrls = urlData.filter(entry => entry.file !== null).length;
  const notCrawledUrls = totalUrls - crawledUrls;
  const errorUrls = urlData.filter(entry => entry.error !== null);

  console.log(`\nCRAWLING COMPLETED! ${crawledUrls} of ${totalUrls} (${notCrawledUrls} not crawled)`);

  // Group url + content by output file. `formatData` returns the target path
  // (based on CONFIG.DATA_FORMATTER.CATEGORISED_PATHS) or null for entries
  // that must not be saved (failed, excluded or uncategorised).
  console.log(`\nFORMATTING DATA...`);
  const dataToSave = {};

  for (const entry of urlData) {
    const savePath = formatData(entry);
    if (!savePath) continue;

    if (!dataToSave[savePath]) dataToSave[savePath] = [];

    // Load the page content from the file referenced by entry.file.
    try {
      const content = loadJSON(entry.file);
      dataToSave[savePath].push({ url: entry.url, content: content.content });
    } catch (e) {
      console.error(`Error loading content from ${entry.file}: ${e.message}`);
    }
  }

  // Save the grouped data to files.
  let totalSavedURLs = 0;
  for (const [savePath, data] of Object.entries(dataToSave)) {
    totalSavedURLs += data.length;
    console.log(`${data.length} -> ${savePath}`);
    saveSortedFormattedJSON(savePath, data);
  }
  console.log(`${totalSavedURLs} total saved URLs to ${DATA_FORMATTER.FORMATTED_PATH}`);

  // Save hardcoded extra links to files.
  await saveHardcodedExtraLinks();
  console.log(`Hardcoded extra links saved to ${DATA_FORMATTER.FORMATTED_PATH}`);

  // Error report: url, status code and error message of every failed entry.
  const errorData = errorUrls.map(({ url, status, error }) => ({ url, status, error }));

  saveSortedFormattedJSON(DATA_FORMATTER.ERROR_REPORT_PATH, errorData);

  console.log(`Errors: ${errorData.length} -> ${DATA_FORMATTER.ERROR_REPORT_PATH}.`);
};
118
+
119
+ initializeCrawler();
120
+ await app();
@@ -0,0 +1,38 @@
1
+ import he from 'he';
2
+
3
/**
 * Reduce a loaded document to a single line of plain text.
 *
 * Removes the elements listed in CONFIG.CRAWLER.DOM_ELEMENTS_REMOVE and all
 * comment nodes from the document (the passed-in `$` is modified), collects
 * the body text with a space after every text node, decodes HTML entities
 * and normalises whitespace.
 *
 * @param {Function} $ - Document handle as returned by `cheerio.load`.
 * @returns {string} The cleaned text content of `<body>`.
 */
export const cleanHTML = ($) => {
  // Strip unwanted elements and comments.
  const unwantedSelector = CONFIG.CRAWLER.DOM_ELEMENTS_REMOVE.join(',');
  $(unwantedSelector).remove();
  $('*').contents().filter((_, node) => node.type === 'comment').remove();

  // Body text with spaces between child elements, then entity decoding.
  const rawText = getTextWithSpaces($, $('body'));
  const decodedText = he.decode(rawText, { level: 'all' });

  // Newlines become spaces.
  const singleLine = decodedText.replace(/\n/g, ' ');
  // A backslash followed by ', " or \ is replaced by that character alone.
  const unescaped = singleLine.replace(/\\['"\\]/g, match => match.slice(1));
  // Each of the listed special Unicode characters becomes a plain space.
  const plainSpaces = unescaped.replace(/[\u200B\u00A0\u2028\u2029\u202F\u00AD\u2060\uFEFF]/g, ' ');
  // Runs of whitespace collapse to one space; the ends are trimmed.
  return plainSpaces.replace(/\s{2,}/g, ' ').trim();
};
26
+
27
/**
 * Collect the text below `element`, depth first, appending one space after
 * every text node so that words of adjacent elements do not run together.
 * Nodes that are neither text nor tags are ignored.
 *
 * @param {Function} $ - Wrapper function turning a node into a selection.
 * @param {object} element - Selection whose contents are walked.
 * @returns {string} The concatenated text.
 */
const getTextWithSpaces = ($, element) => {
  const pieces = [];

  element.contents().each((_, node) => {
    switch (node.type) {
      case 'text':
        pieces.push(`${$(node).text()} `);
        break;
      case 'tag':
        pieces.push(getTextWithSpaces($, $(node)));
        break;
      default:
        break;
    }
  });

  return pieces.join('');
};
@@ -0,0 +1 @@
1
/**
 * Wait for the given time.
 *
 * @param {number} ms - Delay passed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
export const delay = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
@@ -0,0 +1,37 @@
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+
4
/**
 * Read and parse a JSON file.
 *
 * @param {string} filePath - File to read.
 * @returns {*} The parsed content, or an empty array when the file does not exist.
 * @throws {SyntaxError} When the file exists but does not contain valid JSON.
 */
export const loadJSON = (filePath) => {
  if (!fs.existsSync(filePath)) {
    return [];
  }

  const raw = fs.readFileSync(filePath, 'utf8');
  return JSON.parse(raw);
};
5
+
6
/**
 * Write `data` to `filePath` as JSON indented with two spaces, creating the
 * parent directory first when it does not exist.
 *
 * @param {string} filePath - Destination file.
 * @param {*} data - Value to serialise.
 * @returns {undefined} The result of `fs.writeFileSync`.
 */
export const saveJSON = (filePath, data) => {
  const parentDir = path.dirname(filePath);
  const parentExists = fs.existsSync(parentDir);

  if (!parentExists) {
    fs.mkdirSync(parentDir, { recursive: true });
  }

  const serialized = JSON.stringify(data, null, 2);
  return fs.writeFileSync(filePath, serialized, 'utf8');
};
11
+
12
/**
 * Store one crawled page as `<CONFIG.CRAWLER.CRAWLED_PATH>/<fileNumber>.json`,
 * creating the directory when needed.
 *
 * @param {*} data - Page record to serialise.
 * @param {number|string} fileNumber - Used as the file's base name.
 * @returns {string} The path of the written file.
 */
export const saveDataset = (data, fileNumber) => {
  const { CRAWLED_PATH } = CONFIG.CRAWLER;

  if (!fs.existsSync(CRAWLED_PATH)) {
    fs.mkdirSync(CRAWLED_PATH, { recursive: true });
  }

  const filename = `${CRAWLED_PATH}/${fileNumber}.json`;
  const serialized = JSON.stringify(data, null, 2);
  fs.writeFileSync(filename, serialized, 'utf8');

  return filename;
};
18
+
19
/**
 * Persist the crawl queue to CONFIG.CRAWLER.QUEUE_PATH.
 *
 * @param {Array<object>} urlData - Queue entries to store.
 * @returns {undefined} The result of `saveJSON`.
 */
export const saveQueue = (urlData) => {
  const { QUEUE_PATH } = CONFIG.CRAWLER;
  return saveJSON(QUEUE_PATH, urlData);
};
20
+
21
/**
 * Delete a file, or a directory together with everything inside it.
 * Does nothing when the path does not exist.
 *
 * Uses the standard library's recursive removal instead of walking the
 * directory tree by hand.
 *
 * @param {string} filePath - File or directory to remove.
 * @returns {void}
 */
export const deleteDataFiles = (filePath) => {
  // Keeps the "missing (or invalid) path is a no-op" behaviour callers rely on.
  if (!fs.existsSync(filePath)) return;

  // `force` covers the case where the path disappears between the check and
  // the removal.
  fs.rmSync(filePath, { recursive: true, force: true });
};
@@ -0,0 +1,25 @@
1
+ import axios from 'axios';
2
+ import { delay } from '../delay.js';
3
+ import { shouldRetry } from './handlers.js';
4
+
5
/**
 * Fetch a URL and return its body when the content type is allowed.
 *
 * Failures never reject; they are reported through the returned object.
 * A failed request is retried while `retries` is left and `shouldRetry`
 * approves the error, waiting CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS
 * between attempts.
 *
 * @param {string} url - Address to request.
 * @param {number} [retries=2] - Remaining retry attempts.
 * @returns {Promise<{data: *, status: number} | {error: string, status: (number|null)}>}
 *   `data` + `status` on success, otherwise `error` + `status` (`null` when
 *   no response was received).
 */
export const fetchURL = async (url, retries = 2) => {
  const {
    REQUEST_TIMEOUT,
    MAX_REDIRECTS,
    MAX_RETRIES,
    ALLOWED_CONTENT_TYPES,
    CRAWL_ERROR_RETRY_DELAY_MS,
  } = CONFIG.CRAWLER;

  try {
    const response = await axios.get(url, { timeout: REQUEST_TIMEOUT, maxRedirects: MAX_REDIRECTS });

    // A response may carry no Content-Type header at all. Treat that as "not
    // allowed" instead of letting `.includes` throw on undefined, which would
    // land in the catch block and be retried like a network failure.
    const contentType = String(response.headers['content-type'] ?? '');
    const isAllowed = ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type));

    if (!isAllowed) {
      return { error: `Content-Type ${contentType || '(none)'} is not allowed.`, status: response.status };
    }

    return { data: response.data, status: response.status };
  } catch (error) {
    if (retries > 0 && shouldRetry(error)) {
      console.log(`Retrying (${MAX_RETRIES - retries + 1}/${MAX_RETRIES}) -> ${url}`);
      if (CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CRAWL_ERROR_RETRY_DELAY_MS);
      return fetchURL(url, retries - 1);
    }

    console.error(`Failed to fetch ${url} -> ${error.message}`);
    return { error: error.message, status: error.response ? error.response.status : null };
  }
};
@@ -0,0 +1,44 @@
1
+ import { URL } from 'node:url';
2
+ import { normalizeURL } from './normalize.js';
3
+
4
/**
 * Decide whether a failed request should be retried. HTTP status codes are
 * handled here.
 *
 *   - No response at all (e.g. a network error): retry.
 *   - 429: log the rate limit and terminate the process with exit code 10.
 *   - Otherwise: retry only for codes in CONFIG.CRAWLER.RETRY_STATUS_CODES.
 *
 * @param {{response?: {status: (number|null), headers?: object}}} error -
 *   Request error, or an object shaped like one.
 * @returns {boolean} `true` when the request should be attempted again.
 */
export const shouldRetry = (error) => {
  const { response } = error;
  if (!response) return true;

  const { status, headers } = response;

  if (status === 429) {
    const waitTime = headers ? headers['retry-after'] : null;
    const message = waitTime
      ? `Rate limited for ${waitTime} seconds, exiting Crawler...`
      : `Rate limited, no retry-after header found, exiting Crawler...`;
    console.log(message);

    // GitHub Actions Docker uses exit codes from 0 to 255; bigger values wrap (% 256).
    process.exit(10);
  }

  return CONFIG.CRAWLER.RETRY_STATUS_CODES.includes(status);
};
18
+
19
/**
 * Tell whether a URL is inside the crawl scope: its full form must match at
 * least one CONFIG.CRAWLER.INCLUDE_URLS pattern and its pathname must match
 * none of CONFIG.CRAWLER.EXCLUDE_PATTERNS.
 *
 * @param {string} url - URL to check.
 * @returns {boolean} `false` as well when the URL or a pattern cannot be parsed.
 */
const shouldIncludeURL = (url) => {
  const matches = (pattern, subject) => new RegExp(pattern).test(subject);

  try {
    const parsed = new URL(url);
    const href = parsed.toString();

    const isIncluded = CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => matches(pattern, href));
    if (!isIncluded) return false;

    const isExcluded = CONFIG.CRAWLER.EXCLUDE_PATTERNS.some(pattern => matches(pattern, parsed.pathname));
    return !isExcluded;
  } catch (error) {
    return false;
  }
};
27
+
28
/**
 * Append every not-yet-known, in-scope link of a page to the crawl queue.
 *
 * Each `a[href]` is resolved against `baseURL`, normalised, and pushed onto
 * `urlData` when `shouldIncludeURL` accepts it and the queue does not contain
 * it yet; its `{ referrer, depth }` is recorded in `urlMetadata`. Links that
 * cannot be resolved are logged and skipped.
 *
 * @param {Array<object>} urlData - Crawl queue; extended in place.
 * @param {object} urlMetadata - Map of URL to `{ referrer, depth }`; extended in place.
 * @param {Function} $ - Document handle of the page being scanned.
 * @param {string} baseURL - URL used to resolve relative links.
 * @param {string} referrer - Stored as the referrer of new entries.
 * @param {number} depth - Stored as the depth of new entries.
 * @returns {void}
 */
export const enqueueURLs = (urlData, urlMetadata, $, baseURL, referrer, depth) => {
  // Index the queue once so each link costs a constant-time lookup instead of
  // a scan over the whole (growing) queue.
  const knownURLs = new Set(urlData.map(entry => entry.url));

  $('a[href]').each((_, element) => {
    const href = $(element).attr('href');
    if (!href) return;

    try {
      const normalizedURL = normalizeURL(new URL(href, baseURL).toString());
      if (knownURLs.has(normalizedURL) || !shouldIncludeURL(normalizedURL)) return;

      knownURLs.add(normalizedURL);
      urlData.push({ url: normalizedURL, file: null, status: null, error: null });
      urlMetadata[normalizedURL] = { referrer, depth };
    } catch (error) {
      console.error(`Failed to enqueue URL: ${href} from ${baseURL}: ${error.message}`);
    }
  });
};
@@ -0,0 +1,8 @@
1
/**
 * Bring a URL into the canonical form used as the queue key: no fragment,
 * no query string, and no trailing slash on the path (one slash is removed;
 * the root path stays `/`).
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The normalised URL.
 * @throws {TypeError} When `url` cannot be parsed.
 */
export const normalizeURL = (url) => {
  const parsed = new URL(url);

  parsed.hash = '';
  parsed.search = '';

  let { pathname } = parsed;
  if (pathname.endsWith('/')) {
    pathname = pathname.slice(0, -1);
  }
  // An empty path means the root URL, which keeps its slash.
  parsed.pathname = pathname === '' ? '/' : pathname;

  return parsed.toString();
};
@@ -0,0 +1,44 @@
1
+ import { delay } from '../delay.js';
2
+ import { cleanHTML } from '../cleanHTML.js';
3
+
4
+ import * as cheerio from 'cheerio';
5
+ import { shouldRetry, enqueueURLs } from './handlers.js';
6
+ import { fetchURL } from './fetch.js';
7
+ import { saveDataset, saveQueue } from '../fileOperations.js';
8
+
9
/**
 * Crawl one queue entry: fetch it, enqueue its links, store its cleaned text
 * and record the outcome on the entry.
 *
 * Entries that already have a file are skipped, and so are entries whose
 * recorded error is not retryable according to `shouldRetry`. After the
 * attempt the queue is persisted, and the call waits so that it takes at
 * least CONFIG.CRAWLER.CRAWL_DELAY_MS in total.
 *
 * @param {{url: string, file: (string|null), status: (number|null), error: (string|null)}} entry -
 *   Queue entry; `file`, `status` and `error` are updated in place.
 * @param {number} fileNumber - Number used for the stored page file and the progress log.
 * @param {Array<object>} urlData - The whole queue (extended with new links, then saved).
 * @param {object} urlMetadata - Map of URL to `{ referrer, depth }`.
 * @returns {Promise<void>}
 */
export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
  if (entry.file || (entry.error && !shouldRetry({ response: { status: entry.status } }))) return;

  console.log(`- ${fileNumber}/${urlData.length} -> ${entry.url}`);

  const { url } = entry;
  const { referrer, depth } = urlMetadata[url] || { referrer: null, depth: 0 }; // Default depth is 0.

  const startTime = Date.now();
  try {
    const result = await fetchURL(url, CONFIG.CRAWLER.MAX_RETRIES);
    if (result && result.data) {
      const { data: html, status } = result;
      const $ = cheerio.load(html);
      // Links found on this page get this page as referrer and are one level deeper.
      enqueueURLs(urlData, urlMetadata, $, url, url, depth + 1);
      const content = cleanHTML($);
      entry.file = saveDataset({ url, referrerURL: referrer, statusCode: status, depth, content }, fileNumber);
      entry.status = status;
      entry.error = null;
    } else {
      // A successful response with an empty body carries no error message.
      // Record one explicitly: an undefined `error` would make the entry look
      // neither crawled nor failed, and is dropped when the queue is saved as JSON.
      entry.error = result.error ?? 'Empty response body';
      entry.status = result.status ?? null;
    }
  } catch (error) {
    entry.error = error.message;
    entry.status = null;
  }

  saveQueue(urlData);

  const elapsedTime = Date.now() - startTime;
  const { CRAWL_DELAY_MS } = CONFIG.CRAWLER;

  // Throttle: top the time spent on this URL up to the configured crawl delay.
  if (CRAWL_DELAY_MS > 0 && elapsedTime < CRAWL_DELAY_MS) await delay(CRAWL_DELAY_MS - elapsedTime);
};
@@ -0,0 +1,56 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+
4
/**
 * Decide into which formatted output file a crawled entry belongs.
 *
 * The file is chosen from CONFIG.DATA_FORMATTER.CATEGORISED_PATHS by the URL's
 * origin and first path segment, falling back to the origin's `fallback`
 * entry. Only the category map's OWN keys are considered, so a path segment
 * such as `constructor` or `toString` cannot resolve to a member of
 * `Object.prototype` and falls through to the fallback like any other
 * unknown segment.
 *
 * @param {{url: string, file: (string|null), error: (string|null)}} entry - Queue entry.
 * @returns {string|null} Target path below CONFIG.DATA_FORMATTER.FORMATTED_PATH,
 *   or `null` when the entry was not crawled successfully, is excluded by
 *   CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS, or has no category.
 */
export const formatData = (entry) => {
  if (!entry.file || entry.error !== null) return null;

  const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key);

  try {
    const { EXCLUDED_PATTERNS, CATEGORISED_PATHS, FORMATTED_PATH } = CONFIG.DATA_FORMATTER;

    const isExcluded = EXCLUDED_PATTERNS.some(pattern => new RegExp(pattern).test(entry.url));
    if (isExcluded) return null;

    const url = new URL(entry.url);
    const categories = hasOwn(CATEGORISED_PATHS, url.origin) ? CATEGORISED_PATHS[url.origin] : undefined;
    if (!categories) return null;

    const firstSegment = url.pathname.split('/')[1];
    const categorisedPath = (hasOwn(categories, firstSegment) && categories[firstSegment]) || categories.fallback;

    if (categorisedPath) {
      return path.join(FORMATTED_PATH, categorisedPath); // Where the data should be saved.
    }
  } catch (e) {
    console.error(`Error formatting data for ${entry.url}: ${e.message}`);
  }

  return null;
};
24
+
25
/**
 * Sort `data` in place by the value of `sortKey`, ascending, and return it.
 *
 * A fixed order keeps the saved files identical between runs with identical
 * data, so GitHub shows no diff for unchanged content.
 *
 * @param {Array<object>} data - Records to sort (modified in place).
 * @param {string} sortKey - Property to compare.
 * @returns {Array<object>} The same array, sorted.
 */
function sortData(data, sortKey) {
  const byKey = (left, right) => {
    const leftValue = left[sortKey];
    const rightValue = right[sortKey];

    if (leftValue < rightValue) return -1;
    return leftValue > rightValue ? 1 : 0;
  };

  return data.sort(byKey);
}
33
+
34
/**
 * Sort `data` by `url` and write it to `filePath` as JSON indented with two
 * spaces, creating the parent directory when it does not exist.
 *
 * @param {string} filePath - Destination file.
 * @param {Array<object>} data - Records to save; sorted via `sortData`.
 * @returns {undefined} The result of `fs.writeFileSync`.
 */
export const saveSortedFormattedJSON = (filePath, data) => {
  const parentDir = path.dirname(filePath);
  if (!fs.existsSync(parentDir)) {
    fs.mkdirSync(parentDir, { recursive: true });
  }

  // Sorting first keeps the output stable between runs.
  const serialized = JSON.stringify(sortData(data, 'url'), null, 2);
  return fs.writeFileSync(filePath, serialized, 'utf8');
};
40
+
41
/**
 * Write the fixed list of extra links to `cs-links.json` inside
 * CONFIG.DATA_FORMATTER.FORMATTED_PATH.
 *
 * @returns {Promise<number>} How many links were written.
 */
export const saveHardcodedExtraLinks = async () => {
  const fileName = 'cs-links.json';
  const links = [
    {
      url: 'https://elemn.to/ai',
      content: '🧠 AI - How to save time - This page provides valuable insights on how to leverage AI tools for optimizing workflows and saving time across various tasks.',
    },
  ];

  const targetPath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, fileName);
  saveSortedFormattedJSON(targetPath, links);

  return links.length;
};