clean-web-scraper 1.0.5

@@ -0,0 +1,19 @@
+ name: Publish on NPM registry
+
+ on:
+   push:
+     branches: ['main']
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v2
+       - uses: actions/setup-node@v2
+         with:
+           node-version: 20.x
+           registry-url: https://registry.npmjs.org/
+       # - run: npm install
+       - run: npm publish --access public
+         env:
+           NODE_AUTH_TOKEN: ${{secrets.NPM_TOKEN}}
package/README.md ADDED
@@ -0,0 +1,70 @@
+ # πŸ•ΈοΈ Web Content Scraper
+
+ A powerful Node.js web scraper that extracts clean, readable content from websites while keeping everything nicely organized. Perfect for creating AI training datasets! πŸ€–
+
+ ## ✨ Features
+
+ - 🌐 Smart recursive web crawling of internal links
+ - πŸ“ Clean content extraction using Mozilla's Readability
+ - πŸ—‚οΈ Maintains original URL structure in saved files
+ - 🚫 Excludes unwanted paths from scraping
+ - πŸ”„ Handles relative and absolute URLs like a pro
+ - 🎯 No duplicate page visits
+ - πŸ“Š AI-friendly clean text output (perfect for LLM fine-tuning!)
+
+ ## πŸ› οΈ Prerequisites
+
+ - Node.js (v18 or higher)
+ - npm
+
+ ## πŸ“¦ Dependencies
+
+ - **axios** - HTTP requests master
+ - **jsdom** - DOM parsing wizard
+ - **@mozilla/readability** - Content extraction genius
+
+ ## πŸš€ Installation
+
+ ```bash
+ npm i clean-web-scraper
+
+ # OR
+
+ git clone https://github.com/mlibre/Clean-Web-Scraper
+ cd Clean-Web-Scraper
+ npm install
+ ```
+
+ ## πŸ’» Usage
+
+ 1. Configure the base URL in `main.js` (the default is set to decolonizepalestine.com)
+ 2. Set up the exclude lists (`excludeList`, `exactExcludeList`) if needed
+ 3. Run the scraper:
+
+ ```bash
+ node main.js
+ ```
+
+ ## πŸ“€ Output
+
+ Your AI-ready content is saved in a clean, structured format:
+
+ - πŸ“ Base folder: `<folderPath>/<domain>/`, e.g. `./dataset/decolonizepalestine.com/`
+ - πŸ“‘ Files preserve original URL paths
+ - πŸ“ Pure text `.txt` files, with a `.json` metadata sidecar per page, perfect for LLM training and fine-tuning
+ - πŸ€– No HTML, no mess - just clean, structured text ready for AI consumption
+
+ ## πŸ€– AI/LLM Training Ready
+
+ The output is specifically formatted for AI training purposes:
+
+ - Clean, processed text without HTML markup
+ - Consistent formatting across all documents
+ - Structured content perfect for fine-tuning LLMs
+ - Ready to use in your ML pipelines
+
+ ## Standing with Palestine πŸ‡΅πŸ‡Έ
+
+ This project supports Palestinian rights and stands in solidarity with Palestine. We believe in the importance of documenting and preserving Palestinian narratives, history, and struggles for justice and liberation.
+
+ Free Palestine πŸ‡΅πŸ‡Έ
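The README's usage steps map directly onto the `WebScraper` class shipped in this release (shown in full under `package/src/WebScraper.js` below). Here is a minimal programmatic sketch; it assumes the published tarball exposes `src/WebScraper` the same way `main.js` does and that no exports map blocks deep requires (the package.json below has none). The target site and paths are purely illustrative:

```js
// Minimal usage sketch; the URL and folder names here are hypothetical examples.
const WebScraper = require( "clean-web-scraper/src/WebScraper" );

const scraper = new WebScraper({
	baseURL: "https://example.com",                  // site to crawl
	folderPath: "./dataset",                         // output root; the hostname is appended automatically
	excludeList: ["https://example.com/tag"],        // skip any URL starting with these prefixes
	exactExcludeList: ["https://example.com/about"]  // skip only these exact URLs
});

// Crawls internal links recursively, saving a .txt/.json pair per readable page.
scraper.start();
```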
package/eslint.config.cjs ADDED
@@ -0,0 +1,64 @@
+ // eslint.config.cjs
+
+ module.exports = [
+ 	{
+ 		languageOptions: {
+ 			parserOptions: {
+ 				ecmaVersion: 13,
+ 				impliedStrict: true,
+ 			}
+ 		},
+ 		rules: {
+ 			"no-trailing-spaces": "error",
+ 			"linebreak-style": ["error", "unix"],
+ 			"quotes": ["error", "double"],
+ 			"one-var": ["error", "never"],
+ 			"brace-style": ["error", "allman", { allowSingleLine: true }],
+ 			"space-before-blocks": "warn",
+ 			"func-call-spacing": "error",
+ 			"space-before-function-paren": "error",
+ 			"space-in-parens": ["error", "always", { exceptions: ["{}"] }],
+ 			"keyword-spacing": "error",
+ 			"comma-spacing": "error",
+ 			"space-unary-ops": "error",
+ 			"block-spacing": "error",
+ 			"arrow-spacing": "error",
+ 			"key-spacing": "error",
+ 			"comma-style": "error",
+ 			"space-infix-ops": "error",
+ 			"array-bracket-spacing": "error",
+ 			"object-curly-spacing": ["error", "always"],
+ 			"no-multi-spaces": "error",
+ 			"operator-linebreak": "error",
+ 			"function-paren-newline": "warn",
+ 			"arrow-body-style": ["error", "always"],
+ 			"no-template-curly-in-string": "error",
+ 			"no-new-object": "error",
+ 			"no-extra-parens": ["error", "all", { conditionalAssign: false }],
+ 			"no-empty-function": "error",
+ 			"no-empty": ["warn", { allowEmptyCatch: true }],
+ 			"no-eq-null": "error",
+ 			"no-extra-bind": "error",
+ 			"no-self-compare": "error",
+ 			"no-useless-call": "error",
+ 			"no-undefined": "error",
+ 			"no-array-constructor": "error",
+ 			"prefer-destructuring": ["error",
+ 				{ VariableDeclarator: { array: false, object: true },
+ 					AssignmentExpression: { array: false, object: false } },
+ 				{ enforceForRenamedProperties: false }
+ 			],
+ 			"object-shorthand": "warn",
+ 			"prefer-spread": "warn",
+ 			"prefer-template": "warn",
+ 			"no-loop-func": "warn",
+ 			"prefer-rest-params": "warn",
+ 			"no-new-func": "warn",
+ 			"no-unneeded-ternary": "warn",
+ 			"no-process-exit": "off",
+ 			"require-await": "warn",
+ 			"indent": ["error", "tab", { MemberExpression: 0 }],
+ 			"no-tabs": 0,
+ 		},
+ 	},
+ ];
package/main.js ADDED
@@ -0,0 +1,16 @@
+ const WebScraper = require( "./src/WebScraper" );
+
+ const baseURL = "https://decolonizepalestine.com";
+ const folderPath = "./dataset";
+ const excludeList = [
+ 	"https://decolonizepalestine.com/cdn-cgi",
+ 	"https://decolonizepalestine.com/introduction-to-palestine",
+ 	"https://decolonizepalestine.com/myths",
+ 	"https://decolonizepalestine.com/reading-list",
+ 	"https://decolonizepalestine.com/support-us"
+ ];
+ const exactExcludeList = ["https://decolonizepalestine.com/rainbow-washing"];
+
+
+ const scraper = new WebScraper({ baseURL, folderPath, excludeList, exactExcludeList });
+ scraper.start();
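Each page saved by the run above gets a `.txt` file and a `.json` metadata sidecar (see `saveArticle` in `package/src/WebScraper.js` below). A sketch of the sidecar's shape, with illustrative values for the site's root page:

```js
// Field names come from WebScraper's saveArticle; the values below are illustrative.
const exampleMetadata = {
	url: "https://decolonizepalestine.com",    // page the text was extracted from
	dateScraped: "2025-01-01T00:00:00.000Z",   // new Date().toISOString() at save time
	contentLength: 12345,                      // length of the processed text
	fileName: "index.txt"                      // the root path "/" is saved as "index"
};
```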
package/package.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "name": "clean-web-scraper",
+   "version": "1.0.5",
+   "main": "main.js",
+   "scripts": {
+     "start": "node main.js",
+     "test": "echo \"Error: no test specified\" && exit 1"
+   },
+   "keywords": [],
+   "author": "",
+   "license": "ISC",
+   "description": "",
+   "dependencies": {
+     "@mozilla/readability": "^0.5.0",
+     "axios": "^1.7.9",
+     "eslint": "^9.17.0",
+     "jsdom": "^26.0.0"
+   }
+ }
package/src/WebScraper.js ADDED
@@ -0,0 +1,161 @@
+ const axios = require( "axios" );
+ const jsdom = require( "jsdom" );
+ const { JSDOM } = jsdom;
+ const { Readability } = require( "@mozilla/readability" );
+ const fs = require( "fs" );
+ const path = require( "path" );
+
+ class WebScraper
+ {
+ 	constructor ({ baseURL, folderPath, excludeList = [], exactExcludeList = [] })
+ 	{
+ 		this.baseURL = baseURL;
+ 		this.folderPath = path.join( folderPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
+ 		this.visited = new Set();
+ 		this.excludeList = excludeList;
+ 		this.exactExcludeList = exactExcludeList;
+ 		this.createOutputDirectory();
+ 	}
+
+ 	async start ()
+ 	{
+ 		this.visited.add( this.baseURL );
+ 		await this.fetchPage( this.baseURL );
+ 	}
+
+ 	async fetchPage ( url )
+ 	{
+ 		try
+ 		{
+ 			const { data } = await axios.get( url );
+ 			const dom = new JSDOM( data, { url });
+
+ 			// Only save if the URL is not excluded
+ 			if ( !this.isExcluded( url ) )
+ 			{
+ 				const reader = new Readability( dom.window.document, { charThreshold: 500, nbTopCandidates: 20 });
+ 				const article = reader.parse();
+
+ 				if ( article )
+ 				{
+ 					this.saveArticle( url, article.textContent );
+ 				}
+ 				else
+ 				{
+ 					console.error( `No readable content found at ${url}` );
+ 				}
+ 			}
+
+ 			const links = this.extractLinks( data );
+ 			for ( const link of links )
+ 			{
+ 				if ( !this.visited.has( link ) )
+ 				{
+ 					this.visited.add( link );
+ 					await this.fetchPage( link );
+ 				}
+ 			}
+ 		}
+ 		catch ( error )
+ 		{
+ 			console.error( `Error fetching ${url}:`, error.message );
+ 		}
+ 	}
+
+ 	extractLinks ( data )
+ 	{
+ 		const links = new Set();
+ 		const regex = /<a\s+(?:[^>]*?\s+)?href=("|')(.*?)\1/gi;
+ 		let match;
+
+ 		while ( ( match = regex.exec( data ) ) !== null )
+ 		{
+ 			let href = match[2];
+ 			if ( href.endsWith( "/" ) )
+ 			{
+ 				href = href.slice( 0, -1 );
+ 			}
+ 			if ( href.startsWith( this.baseURL ) )
+ 			{
+ 				links.add( href );
+ 			}
+ 			else if ( href.startsWith( "/" ) )
+ 			{
+ 				links.add( new URL( href, this.baseURL ).href );
+ 			}
+ 		}
+
+ 		return links;
+ 	}
+
+ 	saveArticle ( url, content )
+ 	{
+ 		const processedContent = this.processContent( content );
+
+ 		let urlPath = new URL( url ).pathname;
+ 		if ( urlPath === "/" )
+ 		{
+ 			urlPath = "/index";
+ 		}
+ 		const filePath = path.join( __dirname, this.folderPath, urlPath );
+ 		const dir = path.dirname( filePath );
+
+ 		// Create metadata object
+ 		const metadata = {
+ 			url,
+ 			dateScraped: new Date().toISOString(),
+ 			contentLength: processedContent.length,
+ 			fileName: `${path.basename( filePath )}.txt`
+ 		};
+
+ 		// Create directory if it doesn't exist
+ 		fs.mkdirSync( dir, { recursive: true });
+
+ 		// Save the text content
+ 		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
+
+ 		// Save the JSON metadata
+ 		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
+
+ 		console.log( `Saved: ${filePath}.txt` );
+ 		console.log( `Saved: ${filePath}.json` );
+ 	}
+
+ 	// Light post-processing of the extracted text before saving
+ 	processContent ( content )
+ 	{
+ 		let processed = content;
+
+ 		// Remove "[You can read more about this here]" and similar patterns
+ 		processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
+
+ 		// Add more processing rules as needed:
+ 		// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
+ 		// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
+
+ 		return processed;
+ 	}
+
+ 	isExcluded ( url )
+ 	{
+ 		if ( this.exactExcludeList.includes( url ) )
+ 		{
+ 			return true;
+ 		}
+ 		return this.excludeList.some( excluded => { return url.startsWith( excluded ) });
+ 	}
+
+ 	createOutputDirectory ()
+ 	{
+ 		if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+ 		{
+ 			fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
+ 		}
+ 		if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+ 		{
+ 			fs.mkdirSync( path.join( __dirname, this.folderPath ), { recursive: true });
+ 		}
+ 	}
+ }
+
+ module.exports = WebScraper;
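One detail worth seeing in action: `extractLinks` scans the raw HTML with a regex rather than the parsed DOM, strips trailing slashes, resolves relative hrefs against `baseURL`, and drops off-site links. A small sketch of that behavior (the HTML snippet and example.com URLs are made up; note that the constructor creates, and wipes, the output directory as a side effect):

```js
const WebScraper = require( "./src/WebScraper" );

const scraper = new WebScraper({
	baseURL: "https://example.com",  // hypothetical target
	folderPath: "./tmp-dataset"      // created (and cleared if present) by the constructor
});

const links = scraper.extractLinks( `
	<a href="/a/">relative, trailing slash</a>
	<a href="https://example.com/b">absolute, on-site</a>
	<a href="https://other.site/c">absolute, off-site</a>
` );

console.log( [...links] ); // [ "https://example.com/a", "https://example.com/b" ]
```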