scraply 1.0.5 → 1.0.7

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "scraply",
3
3
  "description": "A simple, configurable and functional content scraper",
4
- "version": "1.0.5",
4
+ "version": "1.0.7",
5
5
  "main": "src/scraply.js",
6
6
  "type": "module",
7
7
  "scripts": {
package/readme.md CHANGED
@@ -82,8 +82,19 @@ DATA_FORMATTER: {
82
82
  CATEGORISED_PATHS: {
83
83
  'https://crawler-test.com': {
84
84
  'mobile': 'mobile.json',
85
- 'fallback': 'general.json'
85
+ '*': 'general.json'
86
86
  },
87
87
  },
88
+ HARD_CODED_LINKS: [
89
+ {
90
+ file_name: 'hc-links.json',
91
+ data: [
92
+ {
93
+ "url": "https://custom-link.com",
94
+ "content": "That's a custom link content, you can add as many as you want."
95
+ },
96
+ ]
97
+ }
98
+ ]
88
99
  }
89
- ```
100
+ ```
@@ -48,8 +48,19 @@ export const DEFAULT_CONFIG = {
48
48
  CATEGORISED_PATHS: {
49
49
  'https://crawler-test.com': {
50
50
  'mobile': 'mobile.json',
51
- 'fallback': 'general.json'
51
+ '*': 'general.json'
52
52
  },
53
53
  },
54
+ HARD_CODED_LINKS: [
55
+ {
56
+ file_name: 'hc-links.json',
57
+ data: [
58
+ {
59
+ "url": "https://custom-link.com",
60
+ "content": "That's a custom link content, you can add as many as you want."
61
+ },
62
+ ]
63
+ }
64
+ ]
54
65
  }
55
66
  };
package/src/scraply.js CHANGED
@@ -98,8 +98,8 @@ const start = async () => {
98
98
  console.log(`${totalSavedURLs} total saved URLs to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
99
99
 
100
100
  // Save hardcoded extra links to files.
101
- await saveHardcodedExtraLinks();
102
- console.log(`Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
101
+ const totalHardcodedLinks = await saveHardcodedExtraLinks();
102
+ console.log(`${totalHardcodedLinks} Hardcoded extra links saved to ${CONFIG.DATA_FORMATTER.FORMATTED_PATH}`);
103
103
 
104
104
  // Error reporting: Save into CONFIG.DATA_FORMATTER.ERROR_REPORT_PATH the URLs that had any error: Save the url, the referrer, status code and error!
105
105
  const errorData = errorUrls.map(entry => {
@@ -19,7 +19,28 @@ export const shouldRetry = (error) => {
19
19
  const shouldIncludeURL = (url) => {
20
20
  try {
21
21
  const urlObj = new URL(url);
22
- return CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => new RegExp(pattern).test(urlObj.toString())) && !CONFIG.CRAWLER.EXCLUDE_PATTERNS.some(pattern => new RegExp(pattern).test(urlObj.pathname));
22
+
23
+ // Check if the URL matches any include pattern
24
+ const isIncluded = CONFIG.CRAWLER.INCLUDE_URLS.some(pattern => {
25
+ if (typeof pattern === 'string') {
26
+ return urlObj.toString().includes(pattern); // Check if the URL contains the string pattern
27
+ } else if (pattern instanceof RegExp) {
28
+ return pattern.test(urlObj.toString()); // Check if the URL matches the RegExp pattern
29
+ }
30
+ });
31
+
32
+ if (!isIncluded) return false; // This URL is not included
33
+
34
+ // Check if the URL matches any exclude pattern
35
+ const isExcluded = CONFIG.CRAWLER.EXCLUDE_PATTERNS.some(pattern => {
36
+ if (typeof pattern === 'string') {
37
+ return urlObj.pathname.includes(pattern); // Check if the URL pathname contains the string pattern
38
+ } else if (pattern instanceof RegExp) {
39
+ return pattern.test(urlObj.pathname); // Check if the URL pathname matches the RegExp pattern
40
+ }
41
+ });
42
+
43
+ return !isExcluded; // This URL is included and not excluded
23
44
  } catch (error) {
24
45
  return false;
25
46
  }
@@ -1,5 +1,5 @@
1
- import fs from 'fs';
2
- import path from 'path';
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
3
 
4
4
  export const formatData = (entry) => {
5
5
  if (entry.file && entry.error === null) {
@@ -9,7 +9,15 @@ export const formatData = (entry) => {
9
9
  const isExcluded = CONFIG.DATA_FORMATTER.EXCLUDED_PATTERNS.some(pattern => new RegExp(pattern).test(entry.url));
10
10
 
11
11
  if (!isExcluded) {
12
- const categorisedPath = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.[pathname.split('/')[1]] || CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.fallback;
12
+ // Check for the specific category path
13
+ const pathSegments = pathname.split('/');
14
+ let categorisedPath = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.[pathSegments[1]];
15
+
16
+ // If no specific category path is found, use the "*" fallback
17
+ if (!categorisedPath) {
18
+ categorisedPath = CONFIG.DATA_FORMATTER.CATEGORISED_PATHS[url.origin]?.['*'];
19
+ }
20
+
13
21
  if (categorisedPath) {
14
22
  return path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, categorisedPath); // Return the path where the data should be saved.
15
23
  }
@@ -39,18 +47,12 @@ export const saveSortedFormattedJSON = (filePath, data) => {
39
47
  };
40
48
 
41
49
  export const saveHardcodedExtraLinks = async () => {
42
- const data = {
43
- file_name: 'cs-links.json',
44
- data: [
45
- {
46
- "url": "https://elemn.to/ai",
47
- "content": "🧠 AI - How to save time - This page provides valuable insights on how to leverage AI tools for optimizing workflows and saving time across various tasks."
48
- },
49
- ],
50
- };
51
-
52
- const filePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, data.file_name);
53
- saveSortedFormattedJSON(filePath, data.data);
54
-
55
- return data.data.length;
50
+ const hardcodedLinks = CONFIG.DATA_FORMATTER.HARD_CODED_LINKS;
51
+
52
+ for (const link of hardcodedLinks) {
53
+ const filePath = path.join(CONFIG.DATA_FORMATTER.FORMATTED_PATH, link.file_name);
54
+ saveSortedFormattedJSON(filePath, link.data);
55
+ }
56
+
57
+ return hardcodedLinks.reduce((acc, link) => acc + link.data.length, 0); // Total number of links saved
56
58
  };
package/LICENCE DELETED
File without changes