scraply 1.0.10 → 1.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "scraply",
3
3
  "description": "A simple, configurable and functional content scraper",
4
- "version": "1.0.10",
4
+ "version": "1.0.12",
5
5
  "main": "src/scraply.js",
6
6
  "type": "module",
7
7
  "scripts": {
package/src/scraply.js CHANGED
@@ -1,12 +1,14 @@
1
1
  import { loadConfig } from './loadConfig.js';
2
2
  import { normalizeURL } from './utils/crawl/url/normalize.js';
3
- import { loadJSON, saveQueue, deleteDataFiles } from './utils/crawl/fileOperations.js';
3
+ import { loadJSON, saveQueue, deleteDataFiles, deleteUntrackedFiles } from './utils/crawl/fileOperations.js';
4
4
  import { processURL } from './utils/crawl/url/processor.js';
5
5
  import { formatData, saveSortedFormattedJSON, saveHardcodedExtraLinks } from './utils/format/formatData.js';
6
+ import path from 'node:path';
6
7
 
7
8
  let urlData = [];
8
9
  let urlMetadata = {};
9
10
  let CONFIG = {};
11
+ let generatedFiles = new Set(); // Track files generated in the current crawl session.
10
12
 
11
13
  const init = () => {
12
14
  urlData = loadJSON(CONFIG.CRAWLER.QUEUE_PATH);
@@ -24,10 +26,11 @@ const init = () => {
24
26
  if (allProcessed) { // If all URLs have been processed
25
27
  console.log(`All URLs in ${CONFIG.CRAWLER.QUEUE_PATH} have been processed. Deleting persistent storage and starting a fresh Crawl...\n`);
26
28
 
29
+ // Reset data for a fresh crawl.
27
30
  urlData = [];
28
31
  urlMetadata = {};
29
32
 
30
- // Delete everything except CONFIG.DATA_FORMATTER.FORMATTED_PATH, so that the formatted data is always preserved until the crawler really finalizes the data. This way, the Discord Bot will fetch the correct & latest data from the GitHub repo, without fetching any incomplete data or empty data, as it watches for file diffs!
33
+ // Delete everything except CONFIG.DATA_FORMATTER.FORMATTED_PATH, so that the formatted data is always preserved until the crawler really finalizes the data.
31
34
  deleteDataFiles(CONFIG.CRAWLER.QUEUE_PATH);
32
35
  deleteDataFiles(CONFIG.CRAWLER.CRAWLED_PATH);
33
36
  deleteDataFiles(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH);
@@ -55,9 +58,12 @@ const start = async () => {
55
58
 
56
59
  let fileNumber = urlData.filter(entry => entry.file).length + 1;
57
60
  for await (const entry of urlData) {
58
- if (!entry.file) { // Only process URLs that haven't been processed yet.
59
- await processURL(entry, fileNumber, urlData, urlMetadata);
60
- fileNumber++; // Increment the file number only if the URL was processed successfully.
61
+ if (!entry.file) {
62
+ const processedFile = await processURL(entry, fileNumber, urlData, urlMetadata);
63
+ if (processedFile) {
64
+ generatedFiles.add(processedFile); // Track the file generated
65
+ }
66
+ fileNumber++;
61
67
  }
62
68
  }
63
69
 
@@ -94,6 +100,7 @@ const start = async () => {
94
100
  totalSavedURLs += data.length;
95
101
  console.log(`${data.length} -> ${savePath}`);
96
102
  saveSortedFormattedJSON(savePath, data);
103
+ generatedFiles.add(savePath); // Track the file saved
97
104
  };
98
105
  console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
99
106
 
@@ -101,14 +108,24 @@ const start = async () => {
101
108
  const totalHardcodedLinks = await saveHardcodedExtraLinks();
102
109
  console.log(`${totalHardcodedLinks} Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
103
110
 
111
+ // Track the files generated for hardcoded links with full paths.
112
+ CONFIG.DATA_FORMATTER.HARD_CODED_LINKS.forEach(link => {
113
+ const hardcodedFilePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, link.file_name);
114
+ generatedFiles.add(hardcodedFilePath); // Add the full path to the set
115
+ });
116
+
104
117
  // Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
105
118
  const errorData = errorUrls.map(entry => {
106
119
  return { url: entry.url, status: entry.status, error: entry.error };
107
120
  });
108
-
109
121
  saveSortedFormattedJSON(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH, errorData);
110
122
 
111
123
  console.log(`Errors: ${errorData.length} -> ${CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH}.`);
124
+
125
+ // After formatting data, delete untracked files
126
+ console.log(`\nCLEANING UP UNTRACKED FILES...`);
127
+ deleteUntrackedFiles(CONFIG.DATA_FORMATTER.FORMATTED_PATH, generatedFiles); // Delete files not generated during this crawl
128
+ generatedFiles.clear(); // Clear the set to prepare for future crawls
112
129
  };
113
130
 
114
131
  // Main function to be exported and used
@@ -35,3 +35,17 @@ export const deleteDataFiles = (filePath) => {
35
35
  }
36
36
  }
37
37
  };
38
+
39
+ export const deleteUntrackedFiles = (folderPath, trackedFiles) => {
40
+ if (fs.existsSync(folderPath)) {
41
+ fs.readdirSync(folderPath).forEach((file) => {
42
+ const currentPath = path.join(folderPath, file);
43
+ if (fs.lstatSync(currentPath).isDirectory()) {
44
+ deleteUntrackedFiles(currentPath, trackedFiles);
45
+ } else if (!trackedFiles.has(currentPath)) {
46
+ console.log(`Deleting untracked file: ${currentPath}`);
47
+ fs.unlinkSync(currentPath);
48
+ }
49
+ });
50
+ }
51
+ };
@@ -1,6 +1,5 @@
1
1
  import { delay } from '../delay.js';
2
2
  import { cleanHTML } from '../cleanHTML.js';
3
-
4
3
  import * as cheerio from 'cheerio';
5
4
  import { shouldRetry, enqueueURLs } from './handlers.js';
6
5
  import { fetchURL } from './fetch.js';
@@ -13,7 +12,7 @@ export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
13
12
 
14
13
  const { url } = entry;
15
14
  const { referrer, depth } = urlMetadata[url] || { referrer: null, depth: 0 }; // Default depth is 0.
16
-
15
+
17
16
  const startTime = new Date().getTime();
18
17
  try {
19
18
  const result = await fetchURL(url, CONFIG.CRAWLER.MAX_RETRIES);
@@ -21,11 +20,15 @@ export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
21
20
  const { data: html, status } = result;
22
21
  const $ = cheerio.load(html);
23
22
  enqueueURLs(urlData, urlMetadata, $, url, url, depth + 1);
23
+
24
24
  const content = cleanHTML($);
25
25
  const filename = saveDataset({ url, referrerURL: referrer, statusCode: status, depth, content }, fileNumber);
26
+
26
27
  entry.file = filename;
27
28
  entry.status = status;
28
29
  entry.error = null;
30
+
31
+ return filename; // Return the filename of the saved dataset
29
32
  } else {
30
33
  entry.error = result.error;
31
34
  entry.status = result.status;
@@ -33,12 +36,18 @@ export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
33
36
  } catch (error) {
34
37
  entry.error = error.message;
35
38
  entry.status = null;
36
- }
39
+ } finally {
40
+ // Save the queue state whether successful or not
41
+ saveQueue(urlData);
37
42
 
38
- saveQueue(urlData);
43
+ const endTime = new Date().getTime();
44
+ const elapsedTime = endTime - startTime;
39
45
 
40
- const endTime = new Date().getTime();
41
- const elapsedTime = endTime - startTime;
46
+ // Apply delay if necessary
47
+ if (CONFIG.CRAWLER.CRAWL_DELAY_MS > 0 && elapsedTime < CONFIG.CRAWLER.CRAWL_DELAY_MS) {
48
+ await delay(CONFIG.CRAWLER.CRAWL_DELAY_MS - elapsedTime);
49
+ }
50
+ }
42
51
 
43
- if (CONFIG.CRAWLER.CRAWL_DELAY_MS > 0 && elapsedTime < CONFIG.CRAWLER.CRAWL_DELAY_MS) await delay(CONFIG.CRAWLER.CRAWL_DELAY_MS - elapsedTime);
52
+ return null; // Return null if no file was generated
44
53
  };