clean-web-scraper 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of clean-web-scraper might be problematic.

@@ -0,0 +1,19 @@
+ name: Publish on NPM registry
+
+ on:
+   push:
+     branches: ['main']
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v2
+       - uses: actions/setup-node@v2
+         with:
+           node-version: 20.x
+           registry-url: https://registry.npmjs.org/
+       # - run: npm install
+       - run: npm publish --access public
+         env:
+           NODE_AUTH_TOKEN: ${{secrets.NPM_TOKEN}}
package/README.md ADDED
@@ -0,0 +1,83 @@
+ # πŸ•ΈοΈ Web Content Scraper
+
+ A powerful Node.js web scraper that extracts clean, readable content from websites while keeping everything nicely organized. Perfect for creating AI training datasets! πŸ€–
+
+ ## ✨ Features
+
+ - 🌐 Smart recursive web crawling of internal links
+ - πŸ“ Clean content extraction using Mozilla's Readability
+ - 🧹 Smart content processing and cleaning
+ - πŸ—‚οΈ Maintains original URL structure in saved files
+ - 🚫 Excludes unwanted paths from scraping
+ - πŸ”„ Handles relative and absolute URLs like a pro
+ - 🎯 No duplicate page visits
+ - πŸ“Š Generates JSONL output file for ML training
+ - πŸ“Š AI-friendly clean text output (perfect for LLM fine-tuning!)
+
+ ## πŸ› οΈ Prerequisites
+
+ - Node.js (v18 or higher)
+ - npm
+
+ ## πŸ“¦ Dependencies
+
+ - **axios** - HTTP requests master
+ - **jsdom** - DOM parsing wizard
+ - **@mozilla/readability** - Content extraction genius
+
+ ## πŸš€ Installation
+
+ ```bash
+ npm i clean-web-scraper
+
+ # OR
+
+ git clone https://github.com/mlibre/Clean-Web-Scraper
+ cd Clean-Web-Scraper
+ npm install
+ ```
+
+ ## πŸ’» Usage
+
+ ```js
+ const WebScraper = require('clean-web-scraper');
+
+ const scraper = new WebScraper({
+   baseURL: 'https://example.com',       // Required: The website to scrape
+   folderPath: './output',               // Required: Where to save the content
+   excludeList: ['/admin', '/private'],  // Optional: Paths to exclude
+   exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+   jsonlPath: 'output.jsonl'             // Optional: Custom JSONL output path
+ });
+
+ scraper.start();
+ ```
+
+ ```bash
+ node example-usage.js
+ ```
+
+ ## πŸ“€ Output
+
+ Your AI-ready content is saved in a clean, structured format:
+
+ - πŸ“ Base folder: ./folderPath/example.com/
+ - πŸ“‘ Files preserve original URL paths
+ - πŸ“ Pure text format, perfect for LLM training and fine-tuning
+ - πŸ€– No HTML, no mess - just clean, structured text ready for AI consumption
+ - πŸ“Š JSONL output for ML training
+
+ ## πŸ€– AI/LLM Training Ready
+
+ The output is specifically formatted for AI training purposes:
+
+ - Clean, processed text without HTML markup
+ - Consistent formatting across all documents
+ - Structured content perfect for fine-tuning LLMs
+ - Ready to use in your ML pipelines
+
+ ## Standing with Palestine πŸ‡΅πŸ‡Έ
+
+ This project supports Palestinian rights and stands in solidarity with Palestine. We believe in the importance of documenting and preserving Palestinian narratives, history, and struggles for justice and liberation.
+
+ Free Palestine πŸ‡΅πŸ‡Έ
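
The JSONL output mentioned in the README holds one JSON object per line, each with a single `text` field (see `createJSONLFile` in `src/WebScraper.js` further down). As a rough sketch of how such a file could be consumed in a training pipeline β€” the helper name and the file path are hypothetical, not part of the package:

```js
// load-jsonl.js β€” hypothetical helper for reading the scraper's JSONL output
const fs = require( "fs" );

// Each line is an independent JSON object, e.g. {"text":"Clean article text..."}
const documents = fs.readFileSync( "output.jsonl", "utf-8" )
	.split( "\n" )
	.filter( line => { return line.trim().length > 0; } )
	.map( line => { return JSON.parse( line ).text; } );

console.log( `Loaded ${documents.length} documents` );
```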
package/eslint.config.cjs ADDED
@@ -0,0 +1,64 @@
+ // eslint.config.cjs
+
+ module.exports = [
+ 	{
+ 		languageOptions: {
+ 			parserOptions: {
+ 				ecmaVersion: 13,
+ 				impliedStrict: true,
+ 			}
+ 		},
+ 		rules: {
+ 			"no-trailing-spaces": "error",
+ 			"linebreak-style": ["error", "unix"],
+ 			"quotes": ["error", "double"],
+ 			"one-var": ["error", "never"],
+ 			"brace-style": ["error", "allman", { allowSingleLine: true }],
+ 			"space-before-blocks": "warn",
+ 			"func-call-spacing": "error",
+ 			"space-before-function-paren": "error",
+ 			"space-in-parens": ["error", "always", { exceptions: ["{}"] }],
+ 			"keyword-spacing": "error",
+ 			"comma-spacing": "error",
+ 			"space-unary-ops": "error",
+ 			"block-spacing": "error",
+ 			"arrow-spacing": "error",
+ 			"key-spacing": "error",
+ 			"comma-style": "error",
+ 			"space-infix-ops": "error",
+ 			"array-bracket-spacing": "error",
+ 			"object-curly-spacing": ["error", "always"],
+ 			"no-multi-spaces": "error",
+ 			"operator-linebreak": "error",
+ 			"function-paren-newline": "warn",
+ 			"arrow-body-style": ["error", "always"],
+ 			"no-template-curly-in-string": "error",
+ 			"no-new-object": "error",
+ 			"no-extra-parens": ["error", "all", { conditionalAssign: false }],
+ 			"no-empty-function": "error",
+ 			"no-empty": ["warn", { allowEmptyCatch: true }],
+ 			"no-eq-null": "error",
+ 			"no-extra-bind": "error",
+ 			"no-self-compare": "error",
+ 			"no-useless-call": "error",
+ 			"no-undefined": "error",
+ 			"no-array-constructor": "error",
+ 			"prefer-destructuring": ["error",
+ 				{
+ 					VariableDeclarator: { array: false, object: true }, AssignmentExpression: { array: false, object: false } }, { enforceForRenamedProperties: false
+ 				}
+ 			],
+ 			"object-shorthand": "warn",
+ 			"prefer-spread": "warn",
+ 			"prefer-template": "warn",
+ 			"no-loop-func": "warn",
+ 			"prefer-rest-params": "warn",
+ 			"no-new-func": "warn",
+ 			"no-unneeded-ternary": "warn",
+ 			"no-process-exit": "off",
+ 			"require-await": "warn",
+ 			"indent": ["error", "tab", { MemberExpression: 0 }],
+ 			"no-tabs": 0,
+ 		},
+ 	},
+ ];
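
For illustration only, here is a tiny function formatted the way the config above demands (Allman braces, tab indentation, double quotes, and spaces inside parentheses); it is not part of the package:

```js
// Hypothetical snippet that satisfies the lint rules above.
function stripTrailingSlash ( url )
{
	// "space-in-parens": "always" puts spaces inside the parentheses;
	// "quotes" enforces double quotes; "indent" enforces tabs.
	return url.endsWith( "/" ) ? url.slice( 0, -1 ) : url;
}
```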
package/example-usage.js ADDED
@@ -0,0 +1,25 @@
+ const WebScraper = require( "./src/WebScraper" );
+
+ const baseURL = "https://decolonizepalestine.com";
+ const folderPath = "./dataset";
+ const excludeList = [
+ 	"https://decolonizepalestine.com/cdn-cgi",
+ 	"https://decolonizepalestine.com/introduction-to-palestine",
+ 	"https://decolonizepalestine.com/myths",
+ 	"https://decolonizepalestine.com/reading-list",
+ 	"https://decolonizepalestine.com/support-us"
+ ];
+ const exactExcludeList = [
+ 	"https://decolonizepalestine.com/rainbow-washing",
+ 	"https://decolonizepalestine.com/"
+ ]
+
+
+ const scraper = new WebScraper({
+ 	baseURL,
+ 	folderPath,
+ 	excludeList,
+ 	exactExcludeList,
+ 	jsonlPath: "./dataset/final.jsonl"
+ });
+ scraper.start();
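
A note on the two lists above, based on `isExcluded` and `normalizeExcludeList` in `src/WebScraper.js` further down: `excludeList` entries are matched as URL prefixes, while `exactExcludeList` entries only match that exact URL (with or without a trailing slash). A standalone sketch of the check, with hypothetical URLs:

```js
// Hypothetical illustration mirroring WebScraper.isExcluded below.
function isExcluded ( url, prefixList, exactSet )
{
	if ( exactSet.has( url ) ) return true; // exact-match list
	return prefixList.some( prefix => { return url.startsWith( prefix ); }); // prefix list
}

const exactSet = new Set([ "https://decolonizepalestine.com/", "https://decolonizepalestine.com" ]);
const prefixList = [ "https://decolonizepalestine.com/myths" ];

console.log( isExcluded( "https://decolonizepalestine.com/myths/some-myth", prefixList, exactSet ) ); // true (prefix match)
console.log( isExcluded( "https://decolonizepalestine.com/", prefixList, exactSet ) );                // true (exact match)
console.log( isExcluded( "https://decolonizepalestine.com/faq", prefixList, exactSet ) );             // false
```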
package/main.js ADDED
@@ -0,0 +1,2 @@
+ const WebScraper = require( "./src/WebScraper" );
+ module.exports = WebScraper;
package/package.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "name": "clean-web-scraper",
+   "version": "2.0.1",
+   "main": "main.js",
+   "scripts": {
+     "start": "node main.js",
+     "test": "echo \"Error: no test specified\" && exit 1"
+   },
+   "keywords": [
+     "clean-web-scraper",
+     "web-scraper",
+     "scraper",
+     "scraper-js",
+     "scraper-js-library",
+     "web-scraper-js",
+     "ai-ready-web-scraper",
+     "ai",
+     "fine-tune",
+     "data-processing"
+   ],
+   "author": "",
+   "license": "ISC",
+   "description": "",
+   "dependencies": {
+     "@mozilla/readability": "^0.5.0",
+     "axios": "^1.7.9",
+     "eslint": "^9.17.0",
+     "jsdom": "^26.0.0"
+   }
+ }
package/src/WebScraper.js ADDED
@@ -0,0 +1,208 @@
+ const axios = require( "axios" );
+ const jsdom = require( "jsdom" );
+ const { JSDOM } = jsdom;
+ const { Readability } = require( "@mozilla/readability" );
+ const fs = require( "fs" );
+ const path = require( "path" );
+
+ class WebScraper
+ {
+ 	constructor ({ baseURL, folderPath, excludeList, exactExcludeList, jsonlPath })
+ 	{
+ 		this.baseURL = baseURL;
+ 		this.jsonlPath = jsonlPath || "output.jsonl";
+ 		this.folderPath = path.join( folderPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
+ 		this.visited = new Set();
+ 		this.excludeList = new Set( excludeList );
+ 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
+ 		this.processedContent = []; // Collected article texts, written out later by createJSONLFile
+ 		this.createOutputDirectory();
+ 	}
+
+ 	async start ()
+ 	{
+ 		this.visited.add( this.baseURL );
+ 		await this.fetchPage( this.baseURL );
+ 		this.createJSONLFile();
+ 	}
+
+ 	async fetchPage ( url )
+ 	{
+ 		try
+ 		{
+ 			const { data } = await axios.get( url );
+ 			const dom = new JSDOM( data, { url });
+
+ 			// Only save if the URL is not excluded
+ 			if ( !this.isExcluded( url ) )
+ 			{
+ 				const reader = new Readability( dom.window.document, { charThreshold: 500, nbTopCandidates: 20 });
+ 				const article = reader.parse();
+
+ 				if ( article )
+ 				{
+ 					this.saveArticle( url, article.textContent );
+ 				}
+ 				else
+ 				{
+ 					console.error( `No readable content found at ${url}` );
+ 				}
+ 			}
+
+ 			const links = this.extractLinks( data );
+ 			for ( const link of links )
+ 			{
+ 				if ( !this.visited.has( link ) )
+ 				{
+ 					this.visited.add( link );
+ 					await this.fetchPage( link );
+ 				}
+ 			}
+ 		}
+ 		catch ( error )
+ 		{
+ 			console.error( `Error fetching ${url}:`, error.message );
+ 		}
+ 	}
+
+ 	extractLinks ( data )
+ 	{
+ 		const links = new Set();
+ 		const regex = /<a\s+(?:[^>]*?\s+)?href=("|')(.*?)\1/gi;
+ 		let match;
+
+ 		while ( ( match = regex.exec( data ) ) !== null )
+ 		{
+ 			let href = match[2];
+ 			if ( href.endsWith( "/" ) )
+ 			{
+ 				href = href.slice( 0, -1 );
+ 			}
+ 			if ( href.startsWith( this.baseURL ) )
+ 			{
+ 				links.add( href );
+ 			}
+ 			else if ( href.startsWith( "/" ) )
+ 			{
+ 				links.add( new URL( href, this.baseURL ).href );
+ 			}
+ 		}
+
+ 		return links;
+ 	}
+
+ 	saveArticle ( url, content )
+ 	{
+ 		const processedContent = this.processContent( content );
+
+ 		this.processedContent.push({
+ 			text: processedContent.trim()
+ 		});
+
+ 		let urlPath = new URL( url ).pathname;
+ 		if ( urlPath === "/" )
+ 		{
+ 			urlPath = "/index";
+ 		}
+ 		const filePath = path.join( __dirname, this.folderPath, urlPath );
+ 		const dir = path.dirname( filePath );
+
+ 		// Create metadata object
+ 		const metadata = {
+ 			url,
+ 			dateScraped: new Date().toISOString(),
+ 			contentLength: processedContent.length,
+ 			fileName: `${path.basename( filePath )}.txt`
+ 		};
+
+ 		// Create directory if it doesn't exist
+ 		fs.mkdirSync( dir, { recursive: true });
+
+ 		// Save the text content
+ 		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
+
+ 		// Save the JSON metadata
+ 		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
+
+ 		console.log( `Saved: ${filePath}.txt` );
+ 		console.log( `Saved: ${filePath}.json` );
+ 	}
+
+ 	createJSONLFile ()
+ 	{
+ 		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlPath ) );
+
+ 		for ( const content of this.processedContent )
+ 		{
+ 			const jsonLine = `${JSON.stringify( content )}\n`;
+ 			writeStream.write( jsonLine );
+ 		}
+
+ 		writeStream.end();
+ 		console.log( `Created JSONL file at: ${this.jsonlPath}` );
+ 	}
+
+ 	processContent ( content )
+ 	{
+ 		let processed = content;
+
+ 		// Remove "[You can read more about this here]" and similar patterns
+ 		processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
+
+ 		// Trim each line
+ 		processed = processed.split( "\n" )
+ 		.map( line => { return line.trim() })
+ 		.join( "\n" );
+
+ 		// Replace 3 or more newlines with a single newline
+ 		processed = processed.replace( /\n{3,}/g, "\n\n" );
+
+ 		// Add more processing rules as needed:
+ 		// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
+ 		// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
+
+ 		return processed;
+ 	}
+
+ 	normalizeExcludeList ( list )
+ 	{
+ 		const normalizedSet = new Set();
+ 		for ( let i = 0; i < list.length; i++ )
+ 		{
+ 			const item = list[i];
+ 			if ( item.endsWith( "/" ) )
+ 			{
+ 				normalizedSet.add( item.slice( 0, -1 ) );
+ 			}
+ 			else
+ 			{
+ 				normalizedSet.add( item );
+ 			}
+ 			normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item }/`}` );
+ 		}
+ 		return normalizedSet;
+ 	}
+
+ 	isExcluded ( url )
+ 	{
+ 		if ( this.exactExcludeList.has( url ) )
+ 		{
+ 			return true;
+ 		}
+ 		return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
+ 	}
+
+ 	createOutputDirectory ()
+ 	{
+ 		if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+ 		{
+ 			fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
+ 		}
+ 		if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+ 		{
+ 			fs.mkdirSync( path.join( __dirname, this.folderPath ), { recursive: true });
+ 		}
+ 	}
+ }
+
+ module.exports = WebScraper;
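
One usage note grounded in the class above: `start()` is async, so a caller that needs to run code after the crawl should await it (the README example calls it without awaiting). A minimal driver sketch, assuming the package is installed from npm; the URLs and paths are placeholders:

```js
// Hypothetical driver script; WebScraper is the class exported by main.js above.
const WebScraper = require( "clean-web-scraper" );

( async () =>
{
	const scraper = new WebScraper({
		baseURL: "https://example.com",
		folderPath: "./output",
		excludeList: [],      // passed explicitly; the constructor wraps it in a Set
		exactExcludeList: [], // passed explicitly; normalizeExcludeList iterates over it
		jsonlPath: "./output/final.jsonl"
	});
	await scraper.start(); // resolves once the recursive crawl has finished and the JSONL write has been started
	console.log( "Scraping finished" );
})();
```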