scraply 1.0.12 → 1.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -16,11 +16,11 @@ jobs:
|
|
|
16
16
|
- name: Set up Node.js
|
|
17
17
|
uses: actions/setup-node@v3
|
|
18
18
|
with:
|
|
19
|
-
node-version: '
|
|
19
|
+
node-version: '22'
|
|
20
20
|
registry-url: 'https://registry.npmjs.org/'
|
|
21
21
|
|
|
22
22
|
- name: Install dependencies
|
|
23
|
-
run: npm
|
|
23
|
+
run: npm ci
|
|
24
24
|
|
|
25
25
|
- name: Publish to npm
|
|
26
26
|
run: npm publish
|
package/package.json
CHANGED
package/src/scraply.js
CHANGED
|
@@ -2,8 +2,7 @@ import { loadConfig } from './loadConfig.js';
|
|
|
2
2
|
import { normalizeURL } from './utils/crawl/url/normalize.js';
|
|
3
3
|
import { loadJSON, saveQueue, deleteDataFiles, deleteUntrackedFiles } from './utils/crawl/fileOperations.js';
|
|
4
4
|
import { processURL } from './utils/crawl/url/processor.js';
|
|
5
|
-
import { formatData, saveSortedFormattedJSON, saveHardcodedExtraLinks } from './utils/format/formatData.js';
|
|
6
|
-
import path from 'node:path';
|
|
5
|
+
import { formatData, saveSortedFormattedJSON } from './utils/format/formatData.js';
|
|
7
6
|
|
|
8
7
|
let urlData = [];
|
|
9
8
|
let urlMetadata = {};
|
|
@@ -104,16 +103,6 @@ const start = async () => {
|
|
|
104
103
|
};
|
|
105
104
|
console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
106
105
|
|
|
107
|
-
// Save hardcoded extra links to files.
|
|
108
|
-
const totalHardcodedLinks = await saveHardcodedExtraLinks();
|
|
109
|
-
console.log(`${totalHardcodedLinks} Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
110
|
-
|
|
111
|
-
// Track the files generated for hardcoded links with full paths.
|
|
112
|
-
CONFIG.DATA_FORMATTER.HARD_CODED_LINKS.forEach(link => {
|
|
113
|
-
const hardcodedFilePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, link.file_name);
|
|
114
|
-
generatedFiles.add(hardcodedFilePath); // Add the full path to the set
|
|
115
|
-
});
|
|
116
|
-
|
|
117
106
|
// Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
|
|
118
107
|
const errorData = errorUrls.map(entry => {
|
|
119
108
|
return { url: entry.url, status: entry.status, error: entry.error };
|
|
@@ -9,17 +9,22 @@ export const formatData = (entry) => {
|
|
|
9
9
|
const isExcluded = CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS.some(pattern => new RegExp(pattern).test(entry.url));
|
|
10
10
|
|
|
11
11
|
if (!isExcluded) {
|
|
12
|
-
|
|
13
|
-
const
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
12
|
+
const pathSegments = pathname.split('/').filter(Boolean); // filter out empty segments
|
|
13
|
+
const categorisedPaths = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin];
|
|
14
|
+
|
|
15
|
+
if (categorisedPaths) {
|
|
16
|
+
// Try to match the full path segments, reducing specificity step by step
|
|
17
|
+
let categorisedPath = null;
|
|
18
|
+
|
|
19
|
+
for (let i = pathSegments.length; i >= 1; i--) {
|
|
20
|
+
const pathKey = pathSegments.slice(0, i).join('/');
|
|
21
|
+
categorisedPath = categorisedPaths[pathKey];
|
|
22
|
+
if (categorisedPath) break;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// Fallback to wildcard match ('*') if no specific path is found
|
|
26
|
+
if (!categorisedPath) categorisedPath = categorisedPaths['*'];
|
|
27
|
+
if (categorisedPath) return path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
|
|
23
28
|
}
|
|
24
29
|
}
|
|
25
30
|
} catch (e) {
|
|
@@ -45,14 +50,3 @@ export const saveSortedFormattedJSON = (filePath, data) => {
|
|
|
45
50
|
const sortedData = sortData(data, 'url'); // ensure data is sorted before saving
|
|
46
51
|
return fs.writeFileSync(filePath, JSON.stringify(sortedData, null, 2), 'utf8');
|
|
47
52
|
};
|
|
48
|
-
|
|
49
|
-
export const saveHardcodedExtraLinks = async () => {
|
|
50
|
-
const hardcodedLinks = CONFIG.DATA_FORMATTER.HARD_CODED_LINKS;
|
|
51
|
-
|
|
52
|
-
for (const link of hardcodedLinks) {
|
|
53
|
-
const filePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, link.file_name);
|
|
54
|
-
saveSortedFormattedJSON(filePath, link.data);
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
return hardcodedLinks.reduce((acc, link) => acc + link.data.length, 0); // Total number of links saved
|
|
58
|
-
};
|