scraply 1.0.11 → 1.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
package/src/scraply.js
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
import { loadConfig } from './loadConfig.js';
|
|
2
2
|
import { normalizeURL } from './utils/crawl/url/normalize.js';
|
|
3
|
-
import { loadJSON, saveQueue, deleteDataFiles } from './utils/crawl/fileOperations.js';
|
|
3
|
+
import { loadJSON, saveQueue, deleteDataFiles, deleteUntrackedFiles } from './utils/crawl/fileOperations.js';
|
|
4
4
|
import { processURL } from './utils/crawl/url/processor.js';
|
|
5
5
|
import { formatData, saveSortedFormattedJSON, saveHardcodedExtraLinks } from './utils/format/formatData.js';
|
|
6
|
+
import path from 'node:path';
|
|
6
7
|
|
|
7
8
|
let urlData = [];
|
|
8
9
|
let urlMetadata = {};
|
|
9
10
|
let CONFIG = {};
|
|
11
|
+
let generatedFiles = new Set(); // Track files generated in the current crawl session.
|
|
10
12
|
|
|
11
13
|
const init = () => {
|
|
12
14
|
urlData = loadJSON(CONFIG.CRAWLER.QUEUE_PATH);
|
|
@@ -19,14 +21,12 @@ const init = () => {
|
|
|
19
21
|
urlData.push({ url: normalizedURL, file: null, status: null, error: null });
|
|
20
22
|
});
|
|
21
23
|
saveQueue(urlData);
|
|
22
|
-
|
|
23
|
-
// Delete everything under CONFIG.MAIN_DIR
|
|
24
|
-
deleteDataFiles(CONFIG.MAIN_DIR);
|
|
25
24
|
} else { // If the queue is not empty
|
|
26
25
|
const allProcessed = urlData.every(entry => entry.file !== null || entry.error !== null);
|
|
27
26
|
if (allProcessed) { // If all URLs have been processed
|
|
28
27
|
console.log(`All URLs in ${CONFIG.CRAWLER.QUEUE_PATH} have been processed. Deleting persistent storage and starting a fresh Crawl...\n`);
|
|
29
28
|
|
|
29
|
+
// Reset data for a fresh crawl.
|
|
30
30
|
urlData = [];
|
|
31
31
|
urlMetadata = {};
|
|
32
32
|
|
|
@@ -35,7 +35,6 @@ const init = () => {
|
|
|
35
35
|
deleteDataFiles(CONFIG.CRAWLER.CRAWLED_PATH);
|
|
36
36
|
deleteDataFiles(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH);
|
|
37
37
|
|
|
38
|
-
|
|
39
38
|
init();
|
|
40
39
|
} else { // If there are URLs that haven't been processed yet, resume from the queue.
|
|
41
40
|
console.log(`Resuming from ${CONFIG.CRAWLER.QUEUE_PATH} with ${urlData.length} total found URLs\n`);
|
|
@@ -59,9 +58,12 @@ const start = async () => {
|
|
|
59
58
|
|
|
60
59
|
let fileNumber = urlData.filter(entry => entry.file).length + 1;
|
|
61
60
|
for await (const entry of urlData) {
|
|
62
|
-
if (!entry.file) {
|
|
63
|
-
await processURL(entry, fileNumber, urlData, urlMetadata);
|
|
64
|
-
|
|
61
|
+
if (!entry.file) {
|
|
62
|
+
const processedFile = await processURL(entry, fileNumber, urlData, urlMetadata);
|
|
63
|
+
if (processedFile) {
|
|
64
|
+
generatedFiles.add(processedFile); // Track the file generated
|
|
65
|
+
}
|
|
66
|
+
fileNumber++;
|
|
65
67
|
}
|
|
66
68
|
}
|
|
67
69
|
|
|
@@ -98,6 +100,7 @@ const start = async () => {
|
|
|
98
100
|
totalSavedURLs += data.length;
|
|
99
101
|
console.log(`${data.length} -> ${savePath}`);
|
|
100
102
|
saveSortedFormattedJSON(savePath, data);
|
|
103
|
+
generatedFiles.add(savePath); // Track the file saved
|
|
101
104
|
};
|
|
102
105
|
console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
103
106
|
|
|
@@ -105,14 +108,24 @@ const start = async () => {
|
|
|
105
108
|
const totalHardcodedLinks = await saveHardcodedExtraLinks();
|
|
106
109
|
console.log(`${totalHardcodedLinks} Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
107
110
|
|
|
111
|
+
// Track the files generated for hardcoded links with full paths.
|
|
112
|
+
CONFIG.DATA_FORMATTER.HARD_CODED_LINKS.forEach(link => {
|
|
113
|
+
const hardcodedFilePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, link.file_name);
|
|
114
|
+
generatedFiles.add(hardcodedFilePath); // Add the full path to the set
|
|
115
|
+
});
|
|
116
|
+
|
|
108
117
|
// Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
|
|
109
118
|
const errorData = errorUrls.map(entry => {
|
|
110
119
|
return { url: entry.url, status: entry.status, error: entry.error };
|
|
111
120
|
});
|
|
112
|
-
|
|
113
121
|
saveSortedFormattedJSON(CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH, errorData);
|
|
114
122
|
|
|
115
123
|
console.log(`Errors: ${errorData.length} -> ${CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH}.`);
|
|
124
|
+
|
|
125
|
+
// After formatting data, delete untracked files
|
|
126
|
+
console.log(`\nCLEANING UP UNTRACKED FILES...`);
|
|
127
|
+
deleteUntrackedFiles(CONFIG.DATA_FORMATTER.FORMATTED_PATH, generatedFiles); // Delete files not generated during this crawl
|
|
128
|
+
generatedFiles.clear(); // Clear the set to prepare for future crawls
|
|
116
129
|
};
|
|
117
130
|
|
|
118
131
|
// Main function to be exported and used
|
|
@@ -35,3 +35,17 @@ export const deleteDataFiles = (filePath) => {
|
|
|
35
35
|
}
|
|
36
36
|
}
|
|
37
37
|
};
|
|
38
|
+
|
|
39
|
+
export const deleteUntrackedFiles = (folderPath, trackedFiles) => {
|
|
40
|
+
if (fs.existsSync(folderPath)) {
|
|
41
|
+
fs.readdirSync(folderPath).forEach((file) => {
|
|
42
|
+
const currentPath = path.join(folderPath, file);
|
|
43
|
+
if (fs.lstatSync(currentPath).isDirectory()) {
|
|
44
|
+
deleteUntrackedFiles(currentPath, trackedFiles);
|
|
45
|
+
} else if (!trackedFiles.has(currentPath)) {
|
|
46
|
+
console.log(`Deleting untracked file: ${currentPath}`);
|
|
47
|
+
fs.unlinkSync(currentPath);
|
|
48
|
+
}
|
|
49
|
+
});
|
|
50
|
+
}
|
|
51
|
+
};
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { delay } from '../delay.js';
|
|
2
2
|
import { cleanHTML } from '../cleanHTML.js';
|
|
3
|
-
|
|
4
3
|
import * as cheerio from 'cheerio';
|
|
5
4
|
import { shouldRetry, enqueueURLs } from './handlers.js';
|
|
6
5
|
import { fetchURL } from './fetch.js';
|
|
@@ -13,7 +12,7 @@ export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
|
|
|
13
12
|
|
|
14
13
|
const { url } = entry;
|
|
15
14
|
const { referrer, depth } = urlMetadata[url] || { referrer: null, depth: 0 }; // Default depth is 0.
|
|
16
|
-
|
|
15
|
+
|
|
17
16
|
const startTime = new Date().getTime();
|
|
18
17
|
try {
|
|
19
18
|
const result = await fetchURL(url, CONFIG.CRAWLER.MAX_RETRIES);
|
|
@@ -21,11 +20,15 @@ export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
|
|
|
21
20
|
const { data: html, status } = result;
|
|
22
21
|
const $ = cheerio.load(html);
|
|
23
22
|
enqueueURLs(urlData, urlMetadata, $, url, url, depth + 1);
|
|
23
|
+
|
|
24
24
|
const content = cleanHTML($);
|
|
25
25
|
const filename = saveDataset({ url, referrerURL: referrer, statusCode: status, depth, content }, fileNumber);
|
|
26
|
+
|
|
26
27
|
entry.file = filename;
|
|
27
28
|
entry.status = status;
|
|
28
29
|
entry.error = null;
|
|
30
|
+
|
|
31
|
+
return filename; // Return the filename of the saved dataset
|
|
29
32
|
} else {
|
|
30
33
|
entry.error = result.error;
|
|
31
34
|
entry.status = result.status;
|
|
@@ -33,12 +36,18 @@ export const processURL = async (entry, fileNumber, urlData, urlMetadata) => {
|
|
|
33
36
|
} catch (error) {
|
|
34
37
|
entry.error = error.message;
|
|
35
38
|
entry.status = null;
|
|
36
|
-
}
|
|
39
|
+
} finally {
|
|
40
|
+
// Save the queue state whether successful or not
|
|
41
|
+
saveQueue(urlData);
|
|
37
42
|
|
|
38
|
-
|
|
43
|
+
const endTime = new Date().getTime();
|
|
44
|
+
const elapsedTime = endTime - startTime;
|
|
39
45
|
|
|
40
|
-
|
|
41
|
-
|
|
46
|
+
// Apply delay if necessary
|
|
47
|
+
if (CONFIG.CRAWLER.CRAWL_DELAY_MS > 0 && elapsedTime < CONFIG.CRAWLER.CRAWL_DELAY_MS) {
|
|
48
|
+
await delay(CONFIG.CRAWLER.CRAWL_DELAY_MS - elapsedTime);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
42
51
|
|
|
43
|
-
|
|
52
|
+
return null; // Return null if no file was generated
|
|
44
53
|
};
|