scraply 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish.yml +4 -6
- package/package.json +1 -1
- package/readme.md +13 -2
- package/src/defaultConfig.js +12 -1
- package/src/scraply.js +2 -2
- package/src/utils/format/formatData.js +19 -17
- package/LICENCE +0 -0
|
@@ -2,10 +2,8 @@ name: Publish to NPM
|
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
4
|
push:
|
|
5
|
-
branches:
|
|
6
|
-
- main
|
|
7
5
|
tags:
|
|
8
|
-
- 'v*'
|
|
6
|
+
- 'v*' # Trigger only when a version tag is pushed
|
|
9
7
|
|
|
10
8
|
jobs:
|
|
11
9
|
publish:
|
|
@@ -13,12 +11,12 @@ jobs:
|
|
|
13
11
|
|
|
14
12
|
steps:
|
|
15
13
|
- name: Check out the repository
|
|
16
|
-
uses: actions/checkout@
|
|
14
|
+
uses: actions/checkout@v3
|
|
17
15
|
|
|
18
16
|
- name: Set up Node.js
|
|
19
|
-
uses: actions/setup-node@
|
|
17
|
+
uses: actions/setup-node@v3
|
|
20
18
|
with:
|
|
21
|
-
node-version: '20
|
|
19
|
+
node-version: '20'
|
|
22
20
|
registry-url: 'https://registry.npmjs.org/'
|
|
23
21
|
|
|
24
22
|
- name: Install dependencies
|
package/package.json
CHANGED
package/readme.md
CHANGED
|
@@ -82,8 +82,19 @@ DATA_FORMATTER: {
|
|
|
82
82
|
CATEGORISED_PATHS: {
|
|
83
83
|
'https://crawler-test.com': {
|
|
84
84
|
'mobile': 'mobile.json',
|
|
85
|
-
'
|
|
85
|
+
'*': 'general.json'
|
|
86
86
|
},
|
|
87
87
|
},
|
|
88
|
+
HARD_CODED_LINKS: [
|
|
89
|
+
{
|
|
90
|
+
file_name: 'hc-links.json',
|
|
91
|
+
data: [
|
|
92
|
+
{
|
|
93
|
+
"url": "https://custom-link.com",
|
|
94
|
+
"content": "That's a custom link content, you can add as many as you want."
|
|
95
|
+
},
|
|
96
|
+
]
|
|
97
|
+
}
|
|
98
|
+
]
|
|
88
99
|
}
|
|
89
|
-
```
|
|
100
|
+
```
|
package/src/defaultConfig.js
CHANGED
|
@@ -48,8 +48,19 @@ export const DEFAULT_CONFIG = {
|
|
|
48
48
|
CATEGORISED_PATHS: {
|
|
49
49
|
'https://crawler-test.com': {
|
|
50
50
|
'mobile': 'mobile.json',
|
|
51
|
-
'
|
|
51
|
+
'*': 'general.json'
|
|
52
52
|
},
|
|
53
53
|
},
|
|
54
|
+
HARD_CODED_LINKS: [
|
|
55
|
+
{
|
|
56
|
+
file_name: 'hc-links.json',
|
|
57
|
+
data: [
|
|
58
|
+
{
|
|
59
|
+
"url": "https://custom-link.com",
|
|
60
|
+
"content": "That's a custom link content, you can add as many as you want."
|
|
61
|
+
},
|
|
62
|
+
]
|
|
63
|
+
}
|
|
64
|
+
]
|
|
54
65
|
}
|
|
55
66
|
};
|
package/src/scraply.js
CHANGED
|
@@ -98,8 +98,8 @@ const start = async () => {
|
|
|
98
98
|
console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
99
99
|
|
|
100
100
|
// Save hardcoded extra links to files.
|
|
101
|
-
await saveHardcodedExtraLinks();
|
|
102
|
-
console.log(
|
|
101
|
+
const totalHardcodedLinks = await saveHardcodedExtraLinks();
|
|
102
|
+
console.log(`${totalHardcodedLinks} Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
|
|
103
103
|
|
|
104
104
|
// Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
|
|
105
105
|
const errorData = errorUrls.map(entry => {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
3
|
|
|
4
4
|
export const formatData = (entry) => {
|
|
5
5
|
if (entry.file && entry.error === null) {
|
|
@@ -9,7 +9,15 @@ export const formatData = (entry) => {
|
|
|
9
9
|
const isExcluded = CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS.some(pattern => new RegExp(pattern).test(entry.url));
|
|
10
10
|
|
|
11
11
|
if (!isExcluded) {
|
|
12
|
-
|
|
12
|
+
// Check for the specific category path
|
|
13
|
+
const pathSegments = pathname.split('/');
|
|
14
|
+
let categorisedPath = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.[pathSegments[1]];
|
|
15
|
+
|
|
16
|
+
// If no specific category path is found, use the "*" fallback
|
|
17
|
+
if (!categorisedPath) {
|
|
18
|
+
categorisedPath = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.['*'];
|
|
19
|
+
}
|
|
20
|
+
|
|
13
21
|
if (categorisedPath) {
|
|
14
22
|
return path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
|
|
15
23
|
}
|
|
@@ -39,18 +47,12 @@ export const saveSortedFormattedJSON = (filePath, data) => {
|
|
|
39
47
|
};
|
|
40
48
|
|
|
41
49
|
export const saveHardcodedExtraLinks = async () => {
|
|
42
|
-
const
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
};
|
|
51
|
-
|
|
52
|
-
const filePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, data.file_name);
|
|
53
|
-
saveSortedFormattedJSON(filePath, data.data);
|
|
54
|
-
|
|
55
|
-
return data.data.length;
|
|
50
|
+
const hardcodedLinks = CONFIG.DATA_FORMATTER.HARD_CODED_LINKS;
|
|
51
|
+
|
|
52
|
+
for (const link of hardcodedLinks) {
|
|
53
|
+
const filePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, link.file_name);
|
|
54
|
+
saveSortedFormattedJSON(filePath, link.data);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
return hardcodedLinks.reduce((acc, link) => acc + link.data.length, 0); // Total number of links saved
|
|
56
58
|
};
|
package/LICENCE
DELETED
|
File without changes
|