clean-web-scraper 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of clean-web-scraper might be problematic.
- package/.github/workflows/npm.yml +19 -0
- package/README.md +83 -0
- package/eslint.config.cjs +64 -0
- package/example-usage.js +25 -0
- package/main.js +2 -0
- package/package.json +30 -0
- package/src/WebScraper.js +208 -0
- package/src/my-custom-output.jsonl +192 -0
package/.github/workflows/npm.yml
ADDED
@@ -0,0 +1,19 @@
+name: Publish on NPM registry
+
+on:
+  push:
+    branches: ['main']
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-node@v2
+        with:
+          node-version: 20.x
+          registry-url: https://registry.npmjs.org/
+      # - run: npm install
+      - run: npm publish --access public
+        env:
+          NODE_AUTH_TOKEN: ${{secrets.NPM_TOKEN}}
package/README.md
ADDED
@@ -0,0 +1,83 @@
+# 🕸️ Web Content Scraper
+
+A powerful Node.js web scraper that extracts clean, readable content from websites while keeping everything nicely organized. Perfect for creating AI training datasets! 🤖
+
+## ✨ Features
+
+- 🔄 Smart recursive web crawling of internal links
+- 📖 Clean content extraction using Mozilla's Readability
+- 🧹 Smart content processing and cleaning
+- 🗂️ Maintains original URL structure in saved files
+- 🚫 Excludes unwanted paths from scraping
+- 🔗 Handles relative and absolute URLs like a pro
+- 🎯 No duplicate page visits
+- 📝 Generates JSONL output file for ML training
+- 📚 AI-friendly clean text output (perfect for LLM fine-tuning!)
+
+## 🛠️ Prerequisites
+
+- Node.js (v18 or higher)
+- npm
+
+## 📦 Dependencies
+
+- **axios** - HTTP requests master
+- **jsdom** - DOM parsing wizard
+- **@mozilla/readability** - Content extraction genius
+
+## 🚀 Installation
+
+```bash
+npm i clean-web-scraper
+
+# OR
+
+git clone https://github.com/mlibre/Clean-Web-Scraper
+cd Clean-Web-Scraper
+npm install
+```
+
+## 💻 Usage
+
+```js
+const WebScraper = require('clean-web-scraper');
+
+const scraper = new WebScraper({
+  baseURL: 'https://example.com',       // Required: The website to scrape
+  folderPath: './output',               // Required: Where to save the content
+  excludeList: ['/admin', '/private'],  // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+  jsonlPath: 'output.jsonl'             // Optional: Custom JSONL output path
+});
+
+scraper.start();
+```
+
+```bash
+node example-usage.js
+```
+
+## 📤 Output
+
+Your AI-ready content is saved in a clean, structured format:
+
+- 📁 Base folder: ./folderPath/example.com/
+- 📄 Files preserve original URL paths
+- 📝 Pure text format, perfect for LLM training and fine-tuning
+- 🤖 No HTML, no mess - just clean, structured text ready for AI consumption
+- 📊 JSONL output for ML training
+
+## 🤖 AI/LLM Training Ready
+
+The output is specifically formatted for AI training purposes:
+
+- Clean, processed text without HTML markup
+- Consistent formatting across all documents
+- Structured content perfect for fine-tuning LLMs
+- Ready to use in your ML pipelines
+
+## Standing with Palestine 🇵🇸
+
+This project supports Palestinian rights and stands in solidarity with Palestine. We believe in the importance of documenting and preserving Palestinian narratives, history, and struggles for justice and liberation.
+
+Free Palestine 🇵🇸
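A note on the JSONL output the README mentions: judging from `saveArticle` and `createJSONLFile` in `package/src/WebScraper.js` further down, each scraped page is stored as a single-field JSON object, one per line. A representative line of `output.jsonl` might plausibly look like this (the article text is invented for illustration):

```json
{"text":"Example Article\n\nFirst paragraph of the cleaned, extracted text.\n\nSecond paragraph."}
```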
package/eslint.config.cjs
ADDED
@@ -0,0 +1,64 @@
+// eslint.config.cjs
+
+module.exports = [
+	{
+		languageOptions: {
+			parserOptions: {
+				ecmaVersion: 13,
+				impliedStrict: true,
+			}
+		},
+		rules: {
+			"no-trailing-spaces": "error",
+			"linebreak-style": ["error", "unix"],
+			"quotes": ["error", "double"],
+			"one-var": ["error", "never"],
+			"brace-style": ["error", "allman", { allowSingleLine: true }],
+			"space-before-blocks": "warn",
+			"func-call-spacing": "error",
+			"space-before-function-paren": "error",
+			"space-in-parens": ["error", "always", { exceptions: ["{}"] }],
+			"keyword-spacing": "error",
+			"comma-spacing": "error",
+			"space-unary-ops": "error",
+			"block-spacing": "error",
+			"arrow-spacing": "error",
+			"key-spacing": "error",
+			"comma-style": "error",
+			"space-infix-ops": "error",
+			"array-bracket-spacing": "error",
+			"object-curly-spacing": ["error", "always"],
+			"no-multi-spaces": "error",
+			"operator-linebreak": "error",
+			"function-paren-newline": "warn",
+			"arrow-body-style": ["error", "always"],
+			"no-template-curly-in-string": "error",
+			"no-new-object": "error",
+			"no-extra-parens": ["error", "all", { conditionalAssign: false }],
+			"no-empty-function": "error",
+			"no-empty": ["warn", { allowEmptyCatch: true }],
+			"no-eq-null": "error",
+			"no-extra-bind": "error",
+			"no-self-compare": "error",
+			"no-useless-call": "error",
+			"no-undefined": "error",
+			"no-array-constructor": "error",
+			"prefer-destructuring": ["error",
+				{
+					VariableDeclarator: { array: false, object: true }, AssignmentExpression: { array: false, object: false } }, { enforceForRenamedProperties: false
+				}
+			],
+			"object-shorthand": "warn",
+			"prefer-spread": "warn",
+			"prefer-template": "warn",
+			"no-loop-func": "warn",
+			"prefer-rest-params": "warn",
+			"no-new-func": "warn",
+			"no-unneeded-ternary": "warn",
+			"no-process-exit": "off",
+			"require-await": "warn",
+			"indent": ["error", "tab", { MemberExpression: 0 }],
+			"no-tabs": 0,
+		},
+	},
+];
package/example-usage.js
ADDED
@@ -0,0 +1,25 @@
+const WebScraper = require( "./src/WebScraper" );
+
+const baseURL = "https://decolonizepalestine.com";
+const folderPath = "./dataset";
+const excludeList = [
+	"https://decolonizepalestine.com/cdn-cgi",
+	"https://decolonizepalestine.com/introduction-to-palestine",
+	"https://decolonizepalestine.com/myths",
+	"https://decolonizepalestine.com/reading-list",
+	"https://decolonizepalestine.com/support-us"
+];
+const exactExcludeList = [
+	"https://decolonizepalestine.com/rainbow-washing",
+	"https://decolonizepalestine.com/"
+];
+
+
+const scraper = new WebScraper({
+	baseURL,
+	folderPath,
+	excludeList,
+	exactExcludeList,
+	jsonlPath: "./dataset/final.jsonl"
+});
+scraper.start();
package/main.js
ADDED
package/package.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "name": "clean-web-scraper",
+  "version": "2.0.1",
+  "main": "main.js",
+  "scripts": {
+    "start": "node main.js",
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [
+    "clean-web-scraper",
+    "web-scraper",
+    "scraper",
+    "scraper-js",
+    "scraper-js-library",
+    "web-scraper-js",
+    "ai-ready-web-scraper",
+    "ai",
+    "fine-tune",
+    "data-processing"
+  ],
+  "author": "",
+  "license": "ISC",
+  "description": "",
+  "dependencies": {
+    "@mozilla/readability": "^0.5.0",
+    "axios": "^1.7.9",
+    "eslint": "^9.17.0",
+    "jsdom": "^26.0.0"
+  }
+}
package/src/WebScraper.js
ADDED
@@ -0,0 +1,208 @@
+const axios = require( "axios" );
+const jsdom = require( "jsdom" );
+const { JSDOM } = jsdom;
+const { Readability } = require( "@mozilla/readability" );
+const fs = require( "fs" );
+const path = require( "path" );
+
+class WebScraper
+{
+	constructor ({ baseURL, folderPath, excludeList, exactExcludeList, jsonlPath })
+	{
+		this.baseURL = baseURL;
+		this.jsonlPath = jsonlPath || "output.jsonl";
+		this.folderPath = path.join( folderPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
+		this.visited = new Set();
+		this.excludeList = new Set( excludeList );
+		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
+		this.processedContent = []; // Collected page texts, written out as JSONL at the end
+		this.createOutputDirectory();
+	}
+
+	async start ()
+	{
+		this.visited.add( this.baseURL );
+		await this.fetchPage( this.baseURL );
+		this.createJSONLFile();
+	}
+
+	async fetchPage ( url )
+	{
+		try
+		{
+			const { data } = await axios.get( url );
+			const dom = new JSDOM( data, { url });
+
+			// Only save if the URL is not excluded
+			if ( !this.isExcluded( url ) )
+			{
+				const reader = new Readability( dom.window.document, { charThreshold: 500, nbTopCandidates: 20 });
+				const article = reader.parse();
+
+				if ( article )
+				{
+					this.saveArticle( url, article.textContent );
+				}
+				else
+				{
+					console.error( `No readable content found at ${url}` );
+				}
+			}
+
+			const links = this.extractLinks( data );
+			for ( const link of links )
+			{
+				if ( !this.visited.has( link ) )
+				{
+					this.visited.add( link );
+					await this.fetchPage( link );
+				}
+			}
+		}
+		catch ( error )
+		{
+			console.error( `Error fetching ${url}:`, error.message );
+		}
+	}
+
+	extractLinks ( data )
+	{
+		const links = new Set();
+		const regex = /<a\s+(?:[^>]*?\s+)?href=("|')(.*?)\1/gi;
+		let match;
+
+		while ( ( match = regex.exec( data ) ) !== null )
+		{
+			let href = match[2];
+			if ( href.endsWith( "/" ) )
+			{
+				href = href.slice( 0, -1 );
+			}
+			if ( href.startsWith( this.baseURL ) )
+			{
+				links.add( href );
+			}
+			else if ( href.startsWith( "/" ) )
+			{
+				links.add( new URL( href, this.baseURL ).href );
+			}
+		}
+
+		return links;
+	}
+
+	saveArticle ( url, content )
+	{
+		const processedContent = this.processContent( content );
+
+		this.processedContent.push({
+			text: processedContent.trim()
+		});
+
+		let urlPath = new URL( url ).pathname;
+		if ( urlPath === "/" )
+		{
+			urlPath = "/index";
+		}
+		const filePath = path.join( __dirname, this.folderPath, urlPath );
+		const dir = path.dirname( filePath );
+
+		// Create metadata object
+		const metadata = {
+			url,
+			dateScraped: new Date().toISOString(),
+			contentLength: processedContent.length,
+			fileName: `${path.basename( filePath )}.txt`
+		};
+
+		// Create directory if it doesn't exist
+		fs.mkdirSync( dir, { recursive: true });
+
+		// Save the text content
+		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
+
+		// Save the JSON metadata
+		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
+
+		console.log( `Saved: ${filePath}.txt` );
+		console.log( `Saved: ${filePath}.json` );
+	}
+
+	createJSONLFile ()
+	{
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlPath ) );
+
+		for ( const content of this.processedContent )
+		{
+			const jsonLine = `${JSON.stringify( content )}\n`;
+			writeStream.write( jsonLine );
+		}
+
+		writeStream.end();
+		console.log( `Created JSONL file at: ${this.jsonlPath}` );
+	}
+
+	processContent ( content )
+	{
+		let processed = content;
+
+		// Remove "[You can read more about this here]" and similar patterns
+		processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
+
+		// Trim each line
+		processed = processed.split( "\n" )
+		.map( line => { return line.trim() })
+		.join( "\n" );
+
+		// Collapse runs of 3 or more newlines into a double newline
+		processed = processed.replace( /\n{3,}/g, "\n\n" );
+
+		// Add more processing rules as needed:
+		// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
+		// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
+
+		return processed;
+	}
+
+	normalizeExcludeList ( list )
+	{
+		const normalizedSet = new Set();
+		for ( let i = 0; i < list.length; i++ )
+		{
+			const item = list[i];
+			if ( item.endsWith( "/" ) )
+			{
+				normalizedSet.add( item.slice( 0, -1 ) );
+			}
+			else
+			{
+				normalizedSet.add( item );
+			}
+			// Also add the trailing-slash variant, so both forms of each URL are matched
+			normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item}/`}` );
+		}
+		return normalizedSet;
+	}
+
+	isExcluded ( url )
+	{
+		if ( this.exactExcludeList.has( url ) )
+		{
+			return true;
+		}
+		return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
+	}
+
+	createOutputDirectory ()
+	{
+		if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+		{
+			fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
+		}
+		if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+		{
+			fs.mkdirSync( path.join( __dirname, this.folderPath ), { recursive: true });
+		}
+	}
+}
+
+module.exports = WebScraper;
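One behavior of the class above worth noting: `excludeList` entries act as URL prefixes (`url.startsWith( excluded )`), while `exactExcludeList` is normalized so that both the slash and no-slash form of each URL match exactly. A minimal sketch of those semantics, with hypothetical URLs, and assuming the package's entry point exports the class:

```js
const WebScraper = require( "clean-web-scraper" );

// Caution: the constructor immediately wipes and recreates
// ./out/example.com (relative to the module's directory) on disk.
const scraper = new WebScraper({
	baseURL: "https://example.com",
	folderPath: "./out",
	excludeList: [ "https://example.com/admin" ],        // prefix match
	exactExcludeList: [ "https://example.com/landing" ] // exact match, slash-normalized
});

console.log( scraper.isExcluded( "https://example.com/admin/users" ) );  // true: prefix hit
console.log( scraper.isExcluded( "https://example.com/landing/" ) );     // true: "/landing" and "/landing/" both excluded
console.log( scraper.isExcluded( "https://example.com/landing-page" ) ); // false: exact entries never prefix-match
```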