npm - scraply - Versions diffs - 1.0.0 - Mend

scraply 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/LICENCE +0 -0
package/package.json +17 -0
package/readme.md +0 -0
package/src/config.js +55 -0
package/src/loadConfig.js +25 -0
package/src/scraply.js +120 -0
package/src/utils/crawl/cleanHTML.js +38 -0
package/src/utils/crawl/delay.js +1 -0
package/src/utils/crawl/fileOperations.js +37 -0
package/src/utils/crawl/url/fetch.js +25 -0
package/src/utils/crawl/url/handlers.js +44 -0
package/src/utils/crawl/url/normalize.js +8 -0
package/src/utils/crawl/url/processor.js +44 -0
package/src/utils/format/formatData.js +56 -0

package/LICENCE ADDED Viewed

File without changes

package/package.json ADDED Viewed

@@ -0,0 +1,17 @@
+{
+  "name": "scraply",
+  "description": "A simple, configurable and functional content scraper",
+  "version": "1.0.0",
+  "main": "src/scraply.js",
+  "type": "module",
+  "scripts": {
+    "start": "node ."
+  },
+  "keywords": ["crawler", "scraper"],
+  "author": "Pau Serrat Gutiérrez",
+  "dependencies": {
+    "axios": "^1.7.7",
+    "cheerio": "^1.0.0",
+    "he": "^1.2.0"
+  }
+}

package/readme.md ADDED Viewed

File without changes

package/src/config.js ADDED Viewed

@@ -0,0 +1,55 @@
+export const DEFAULT_CONFIG = { // Default settings; loadConfig() merges user overrides on top of these.
+  MAIN_DIR: 'dataset', // Root directory; loadConfig() derives the queue, crawled, formatted and error-report paths from it.
+  CRAWLER: {
+    INITIAL_URLS: [ // Seed URLs pushed into the queue when it is empty.
+      'https://crawler-test.com/'
+    ],
+    INCLUDE_URLS: [ // Regex sources tested against the full normalized URL; a link must match at least one to be enqueued.
+      'https://crawler-test.com/.*'
+    ],
+    ALLOWED_CONTENT_TYPES: [ // Substrings looked for in the response Content-Type header.
+      'text/html'
+    ],
+    EXCLUDE_PATTERNS: [ // Strings or RegExps tested against the URL pathname only; any match rejects the link.
+      '/cdn-cgi/',
+      /\.(zip|rar|webp|png|jpg|jpeg|gif|mp3|mp4|pdf|css|js|svg|ico|eot|ttf|woff|woff2|otf|webm|ogg|wav|flac|m4a|mkv|mov|avi|wmv|flv|swf|exe|msi|dmg|iso|bin)$/,
+    ],
+    DOM_ELEMENTS_REMOVE: [ // Joined with ',' into one selector; matching elements are removed before text extraction.
+      'script',
+      'noscript',
+      'style',
+      'meta',
+      'link',
+      'svg',
+      'path',
+      'img',
+      'input',
+      'textarea',
+      'embed',
+      'object',
+      'iframe',
+      'nav',
+      'header',
+      'footer',
+      'aside',
+      'button'
+    ],
+    RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504], // Statuses shouldRetry() accepts; note that 429 makes shouldRetry() exit the process before this list is consulted.
+    REQUEST_TIMEOUT: 4000, // Passed to axios as `timeout`.
+    MAX_REDIRECTS: 3, // Passed to axios as `maxRedirects`.
+    MAX_RETRIES: 2, // Retry budget processURL() hands to fetchURL().
+    CRAWL_DELAY_MS: 200, // Minimum time spent per URL; processURL() sleeps for whatever remains.
+    CRAWL_ERROR_RETRY_DELAY_MS: 800, // Pause before each fetch retry.
+  },
+  DATA_FORMATTER: {
+    EXCLUDED_PATTERNS: [], // Regex sources tested against the full URL; matching pages are left out of the formatted output.
+    CATEGORISED_PATHS: { // origin -> { first path segment -> output file name }; 'fallback' is used when no segment matches.
+      'https://crawler-test.com': {
+        'mobile': 'mobile.json',
+        'fallback': 'general.json'
+      },
+    },
+  }
+};

package/src/loadConfig.js ADDED Viewed

@@ -0,0 +1,25 @@
+import path from 'node:path';
+import { DEFAULT_CONFIG } from './config.js';
+// A utility function to perform a deep merge of objects
+function deepMerge(target, source) {
+  for (const key in source) {
+    if (source[key] instanceof Object && key in target) {
+      Object.assign(source[key], deepMerge(target[key], source[key]));
+    }
+  }
+  return { ...target, ...source };
+};
+export function loadConfig(userConfig = {}) {
+  // Merge the user config with the default config
+  const config = deepMerge(DEFAULT_CONFIG, userConfig);
+  // Dynamically construct paths using MAIN_DIR
+  config.CRAWLER.QUEUE_PATH = path.join(config.MAIN_DIR, 'queue.json');
+  config.CRAWLER.CRAWLED_PATH = path.join(config.MAIN_DIR, 'crawled');
+  config.DATA_FORMATTER.FORMATTED_PATH = path.join(config.MAIN_DIR, 'formatted');
+  config.DATA_FORMATTER.ERROR_REPORT_PATH = path.join(config.MAIN_DIR, 'error-report.json');
+  return config;
+};

package/src/scraply.js ADDED Viewed

@@ -0,0 +1,120 @@
+import { loadConfig } from './loadConfig.js';
+import { normalizeURL } from './utils/crawl/url/normalize.js';
+import { loadJSON, saveQueue, deleteDataFiles } from './utils/crawl/fileOperations.js';
+import { processURL } from './utils/crawl/url/processor.js';
+import { formatData, saveSortedFormattedJSON, saveHardcodedExtraLinks } from './utils/format/formatData.js';
+const userConfig = {}; // User overrides; left empty here, so the defaults from config.js apply.
+// Load and merge the configuration
+const CONFIG = loadConfig(userConfig);
+global.CONFIG = CONFIG; // Exposed as a global: the utility modules read `CONFIG` without importing it.
+let urlData = []; // Crawl queue; entries have the shape { url, file, status, error }.
+let urlMetadata = {}; // Per-URL { referrer, depth } keyed by normalized URL; kept in memory only (saveQueue persists urlData alone).
+const initializeCrawler = () => {
+  urlData = loadJSON(CONFIG.CRAWLER.QUEUE_PATH);
+  if (urlData.length === 0) { // If the queue is empty, start fresh with the initial URLs.
+    console.log(`Starting fresh! No URLs found in ${CONFIG.CRAWLER.QUEUE_PATH}\n`);
+    CONFIG.CRAWLER.INITIAL_URLS.forEach(url => {
+      const normalizedURL = normalizeURL(url);
+      urlData.push({ url: normalizedURL, file: null, status: null, error: null });
+    });
+    saveQueue(urlData);
+  } else { // If the queue is not empty
+    const allProcessed = urlData.every(entry => entry.file !== null || entry.error !== null);
+    if (allProcessed) { // If all URLs have been processed
+      console.log(`All URLs in ${CONFIG.CRAWLER.QUEUE_PATH} have been processed. Deleting persistent storage and starting a fresh Crawl...\n`);
+      urlData = [];
+      urlMetadata = {};
+      // Delete everything except CONFIG.DATA_FORMATTER.FORMATTED_PATH, so that the formatted data is always preserved until the crawler really finalizes the data. This way, the Discord Bot will fetch the correct & latest data from the GitHub repo, without fetching any incomplete data or empty data, as it watches for file diffs!
+      deleteDataFiles(CONFIG.CRAWLER.QUEUE_PATH);
+      deleteDataFiles(CONFIG.CRAWLER.CRAWLED_PATH);
+      deleteDataFiles(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH);
+      initializeCrawler();
+    } else { // If there are URLs that haven't been processed yet, resume from the queue.
+      console.log(`Resuming from ${CONFIG.CRAWLER.QUEUE_PATH} with ${urlData.length} total found URLs\n`);
+    }
+  }
+};
+const app = async () => {
+  console.log(`STARTING CRAWLER
+  - Initial URLs: ${CONFIG.CRAWLER.INITIAL_URLS}
+  - Include URLs: ${CONFIG.CRAWLER.INCLUDE_URLS}
+  - Excluded Patterns: ${CONFIG.CRAWLER.EXCLUDE_PATTERNS}
+  - Allowed Content Types: ${CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES}
+  - Retry Status Codes: ${CONFIG.CRAWLER.RETRY_STATUS_CODES}
+  - Request Timeout: ${CONFIG.CRAWLER.REQUEST_TIMEOUT}
+  - Max Redirects: ${CONFIG.CRAWLER.MAX_REDIRECTS}
+  - Max Retries: ${CONFIG.CRAWLER.MAX_RETRIES}
+  - Crawl Delay: ${CONFIG.CRAWLER.CRAWL_DELAY_MS}ms
+  - Crawl Error Retry Delay: ${CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS}ms
+  `);
+  let fileNumber = urlData.filter(entry => entry.file).length + 1;
+  for await (const entry of urlData) {
+    if (!entry.file) { // Only process URLs that haven't been processed yet.
+      await processURL(entry, fileNumber, urlData, urlMetadata);
+      fileNumber++; // Increment the file number only if the URL was processed successfully.
+    }
+  }
+  const totalUrls = urlData.length;
+  const crawledUrls = urlData.filter(entry => entry.file !== null).length;
+  const notCrawledUrls = totalUrls - crawledUrls;
+  const errorUrls = urlData.filter(entry => entry.error !== null);
+  console.log(`\nCRAWLING COMPLETED! ${crawledUrls} of ${totalUrls} (${notCrawledUrls} not crawled)`);
+  // Iterate over all the urlData and save all the url & content to files, categorized by CONFIG.DATA_FORMATTER.CATEGORISED_PATHS. Exclude the URLs that match the patterns in CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS. Save in CONFIG.DATA_FORMATTER.FORMATTED_PATH.
+  console.log(`\nFORMATTING DATA...`);
+  const dataToSave = {};
+  for await (const entry of urlData) {
+    const savePath = formatData(entry);
+    if (savePath) { // If the URL should be saved.
+      if (!dataToSave[savePath]) dataToSave[savePath] = [];
+      // Load content from the file referenced by entry.file
+      let content = null;
+      try {
+        content = loadJSON(entry.file);
+        dataToSave[savePath].push({ url: entry.url, content: content.content });
+      } catch (e) {
+        console.error(`Error loading content from ${entry.file}: ${e.message}`);
+      }
+    }
+  };
+  // Save the data to files.
+  let totalSavedURLs = 0;
+  for (const [savePath, data] of Object.entries(dataToSave)) {
+    totalSavedURLs += data.length;
+    console.log(`${data.length} -> ${savePath}`);
+    saveSortedFormattedJSON(savePath, data);
+  };
+  console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
+  // Save hardcoded extra links to files.
+  await saveHardcodedExtraLinks();
+  console.log(`Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
+  // Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
+  const errorData = errorUrls.map(entry => {
+    return { url: entry.url, status: entry.status, error: entry.error };
+  });
+  saveSortedFormattedJSON(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH, errorData);
+  console.log(`Errors: ${errorData.length} -> ${CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH}.`);
+};
+initializeCrawler(); // Load, seed or reset the persisted queue before crawling.
+await app(); // Top-level await: run the crawl, then the formatting and the error report.

package/src/utils/crawl/cleanHTML.js ADDED Viewed

@@ -0,0 +1,38 @@
+import he from 'he';
+export const cleanHTML = ($) => {
+  // Remove unwanted elements
+  const $aux = $;
+  $aux(CONFIG.CRAWLER.DOM_ELEMENTS_REMOVE.join(',')).remove();
+  $aux('*').contents().filter((_, el) => el.type === 'comment').remove();
+  // Get the text content of the body and decode HTML entities
+  // let bodyText = he.decode($aux('body').text(), { level: 'all' });
+  // Get the text content of the body element, ensuring spaces between child elements
+  let bodyText = getTextWithSpaces($aux, $aux('body'));
+  // Decode HTML entities
+  bodyText = he.decode(bodyText, { level: 'all' });
+  // Clean up the resulting text
+  return bodyText
+    .replace(/\n/g, ' ') // Replace newlines with a space
+    .replace(/\\['"\\]/g, match => match.slice(1)) // Replace escaped characters with the unescaped character
+    .replace(/[\u200B\u00A0\u2028\u2029\u202F\u00AD\u2060\uFEFF]/g, ' ') // Replace zero-width spaces with a space
+    .replace(/\s{2,}/g, ' ') // Replace multiple spaces with a single space
+    .trim();
+};
+// Custom function to get text content with spaces between elements
+const getTextWithSpaces = ($, element) => {
+  let text = '';
+  element.contents().each((_, el) => {
+    if (el.type === 'text') {
+      text += $(el).text() + ' ';
+    } else if (el.type === 'tag') {
+      text += getTextWithSpaces($, $(el));
+    }
+  });
+  return text;
+};

package/src/utils/crawl/delay.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export const delay = async (ms) => new Promise(resolve => setTimeout(resolve, ms)); // Returns a promise that resolves once a setTimeout of `ms` fires; await it to pause.

package/src/utils/crawl/fileOperations.js ADDED Viewed

@@ -0,0 +1,37 @@
+import fs from 'node:fs';
+import path from 'node:path';
+export const loadJSON = (filePath) => fs.existsSync(filePath) ? JSON.parse(fs.readFileSync(filePath, 'utf8')) : [];
+export const saveJSON = (filePath, data) => {
+  const dir = path.dirname(filePath);
+  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
+  return fs.writeFileSync(filePath, JSON.stringify(data, null, 2), 'utf8');
+};
+export const saveDataset = (data, fileNumber) => {
+  if (!fs.existsSync(CONFIG.CRAWLER.CRAWLED_PATH)) fs.mkdirSync(CONFIG.CRAWLER.CRAWLED_PATH, { recursive: true });
+  const filename = `${CONFIG.CRAWLER.CRAWLED_PATH}/${fileNumber}.json`;
+  fs.writeFileSync(filename, JSON.stringify(data, null, 2), 'utf8');
+  return filename;
+};
+export const saveQueue = (urlData) => saveJSON(CONFIG.CRAWLER.QUEUE_PATH, urlData);
+export const deleteDataFiles = (filePath) => {
+  if (fs.existsSync(filePath)) {
+    if (fs.lstatSync(filePath).isDirectory()) {
+      fs.readdirSync(filePath).forEach((file) => {
+        const currentPath = path.join(filePath, file);
+        if (fs.lstatSync(currentPath).isDirectory()) {
+          deleteDataFiles(currentPath);
+        } else {
+          fs.unlinkSync(currentPath);
+        }
+      });
+      fs.rmdirSync(filePath);
+    } else {
+      fs.unlinkSync(filePath);
+    }
+  }
+};

package/src/utils/crawl/url/fetch.js ADDED Viewed

@@ -0,0 +1,25 @@
+import axios from 'axios';
+import { delay } from '../delay.js';
+import { shouldRetry } from './handlers.js';
+export const fetchURL = async (url, retries = 2) => {
+  try {
+    const response = await axios.get(url, { timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT, maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS });
+    const contentType = response.headers['content-type'];
+    if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) {
+      return { error: `Content-Type ${contentType} is not allowed.`, status: response.status };
+    };
+    return { data: response.data, status: response.status };
+  } catch (error) {
+    if (retries > 0 && shouldRetry(error)) {
+      console.log(`Retrying (${CONFIG.CRAWLER.MAX_RETRIES - retries + 1}/${CONFIG.CRAWLER.MAX_RETRIES}) -> ${url}`);
+      if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
+      return fetchURL(url, retries - 1);
+    } else {
+      console.error(`Failed to fetch ${url} -> ${error.message}`);
+      return { error: error.message, status: error.response ? error.response.status : null };
+    };
+  };
+};

package/src/utils/crawl/url/handlers.js ADDED Viewed

@@ -0,0 +1,44 @@
+import { URL } from 'node:url';
+import { normalizeURL } from './normalize.js';
+/**
+ * Decides from a request error whether the request is worth retrying.
+ * HTTP status handling for the crawler lives here.
+ *
+ * - No `response` on the error: retry.
+ * - Status 429: log the rate limit and terminate the process with exit code 10.
+ * - Any other status: retry only if it is listed in CONFIG.CRAWLER.RETRY_STATUS_CODES.
+ *
+ * @param {object} error - Error-like object; only `error.response` is inspected.
+ * @returns {boolean} True when the request should be retried.
+ */
+export const shouldRetry = (error) => {
+  const { response } = error;
+  if (!response) return true;
+  const { status } = response;
+  if (status === 429) {
+    const waitTime = response.headers?.['retry-after'];
+    const message = waitTime
+      ? `Rate limited for ${waitTime} seconds, exiting Crawler...`
+      : `Rate limited, no retry-after header found, exiting Crawler...`;
+    console.log(message);
+    process.exit(10); // GitHub Actions Docker uses values ranged from 0 to 255, so any bigger value will be % 256!
+  }
+  // Retry only on the configured status codes.
+  return CONFIG.CRAWLER.RETRY_STATUS_CODES.includes(status);
+};
+const shouldIncludeURL = (url) => {
+  try {
+    const urlObj = new URL(url);
+    return CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => new RegExp(pattern).test(urlObj.toString())) && !CONFIG.CRAWLER.EXCLUDE_PATTERNS.some(pattern => new RegExp(pattern).test(urlObj.pathname));
+  } catch (error) {
+    return false;
+  }
+};
+export const enqueueURLs = (urlData, urlMetadata, $, baseURL, referrer, depth) => {
+  $('a[href]').each((_, element) => {
+    const href = $(element).attr('href');
+    if (!href) return;
+    try {
+      const newURL = new URL(href, baseURL).toString();
+      const normalizedURL = normalizeURL(newURL);
+      if (shouldIncludeURL(normalizedURL) && !urlData.some(entry => entry.url === normalizedURL)) {
+        urlData.push({ url: normalizedURL, file: null, status: null, error: null });
+        urlMetadata[normalizedURL] = { referrer, depth };
+      }
+    } catch (error) {
+      console.error(`Failed to enqueue URL: ${href} from ${baseURL}: ${error.message}`);
+    }
+  });
+};

package/src/utils/crawl/url/normalize.js ADDED Viewed

@@ -0,0 +1,8 @@
+export const normalizeURL = (url) => {
+  const urlObj = new URL(url);
+  urlObj.hash = ''; // Remove the fragment part
+  urlObj.search = ''; // Remove the query part
+  urlObj.pathname = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slashes
+  urlObj.pathname = urlObj.pathname === '' ? '/' : urlObj.pathname; // Handle the root URL separately
+  return urlObj.toString();
+};

package/src/utils/crawl/url/processor.js ADDED Viewed

@@ -0,0 +1,44 @@
+import { delay } from '../delay.js';
+import { cleanHTML } from '../cleanHTML.js';
+import * as cheerio from 'cheerio';
+import { shouldRetry, enqueueURLs } from './handlers.js';
+import { fetchURL } from './fetch.js';
+import { saveDataset, saveQueue } from '../fileOperations.js';
+export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
+  if (entry.file || (entry.error && !shouldRetry({ response: { status: entry.status } }))) return;
+  console.log(`- ${fileNumber}/${urlData.length} -> ${entry.url}`);
+  const { url } = entry;
+  const { referrer, depth } = urlMetadata[url] || { referrer: null, depth: 0 }; // Default depth is 0.
+  const startTime = new Date().getTime();
+  try {
+    const result = await fetchURL(url, CONFIG.CRAWLER.MAX_RETRIES);
+    if (result && result.data) {
+      const { data: html, status } = result;
+      const $ = cheerio.load(html);
+      enqueueURLs(urlData, urlMetadata, $, url, url, depth + 1);
+      const content = cleanHTML($);
+      const filename = saveDataset({ url, referrerURL: referrer, statusCode: status, depth, content }, fileNumber);
+      entry.file = filename;
+      entry.status = status;
+      entry.error = null;
+    } else {
+      entry.error = result.error;
+      entry.status = result.status;
+    }
+  } catch (error) {
+    entry.error = error.message;
+    entry.status = null;
+  }
+  saveQueue(urlData);
+  const endTime = new Date().getTime();
+  const elapsedTime = endTime - startTime;
+  if (CONFIG.CRAWLER.CRAWL_DELAY_MS > 0 && elapsedTime < CONFIG.CRAWLER.CRAWL_DELAY_MS) await delay(CONFIG.CRAWLER.CRAWL_DELAY_MS - elapsedTime);
+};

package/src/utils/format/formatData.js ADDED Viewed

@@ -0,0 +1,56 @@
+import fs from 'fs';
+import path from 'path';
+export const formatData = (entry) => {
+  if (entry.file && entry.error === null) {
+    try {
+      const url = new URL(entry.url);
+      const pathname = url.pathname;
+      const isExcluded = CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS.some(pattern => new RegExp(pattern).test(entry.url));
+      if (!isExcluded) {
+        const categorisedPath = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.[pathname.split('/')[1]] || CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.fallback;
+        if (categorisedPath) {
+          return path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
+        }
+      }
+    } catch (e) {
+      console.error(`Error formatting data for ${entry.url}: ${e.message}`);
+    }
+  }
+  return null;
+};
+// Sort data consistently to always save it in the same order between each run, so GitHub doesn't show a diff for the same data.
+function sortData(data, sortKey) {
+  return data.sort((a, b) => {
+    if (a[sortKey] < b[sortKey]) return -1;
+    if (a[sortKey] > b[sortKey]) return 1;
+    return 0;
+  });
+};
+export const saveSortedFormattedJSON = (filePath, data) => {
+  const dir = path.dirname(filePath);
+  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
+  const sortedData = sortData(data, 'url'); // ensure data is sorted before saving
+  return fs.writeFileSync(filePath, JSON.stringify(sortedData, null, 2), 'utf8');
+};
+export const saveHardcodedExtraLinks = async () => {
+  const data = {
+    file_name: 'cs-links.json',
+    data: [
+        {
+          "url": "https://elemn.to/ai",
+          "content": "🧠 AI - How to save time - This page provides valuable insights on how to leverage AI tools for optimizing workflows and saving time across various tasks."
+        },
+    ],
+  };
+  const filePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, data.file_name);
+  saveSortedFormattedJSON(filePath, data.data);
+  return data.data.length;
+};