clean-web-scraper 3.2.3 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/README.md +8 -5
  2. package/package.json +1 -1
package/README.md CHANGED
@@ -53,9 +53,6 @@ const scraper = new WebScraper({
53
53
  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
54
54
  textOutputPath: "./example.com/texts", // Optional: Custom text output path
55
55
  csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
56
- maxDepth: 3, // Optional: Maximum depth for recursive crawling
57
- includeMetadata: false, // Optional: Include metadata in output files
58
- metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
59
56
  });
60
57
  await scraper.start();
61
58
  ```
@@ -72,13 +69,19 @@ const WebScraper = require('clean-web-scraper');
72
69
  // Scrape documentation website
73
70
  const docsScraper = new WebScraper({
74
71
  baseURL: 'https://docs.example.com',
75
- scrapResultPath: './datasets/docs'
72
+ scrapResultPath: './datasets/docs',
73
+ maxDepth: 3, // Optional: Maximum depth for recursive crawling
74
+ includeMetadata: true, // Optional: Include metadata in output files
75
+ metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
76
76
  });
77
77
 
78
78
  // Scrape blog website
79
79
  const blogScraper = new WebScraper({
80
80
  baseURL: 'https://blog.example.com',
81
- scrapResultPath: './datasets/blog'
81
+ scrapResultPath: './datasets/blog',
82
+ maxDepth: 3, // Optional: Maximum depth for recursive crawling
83
+ includeMetadata: true, // Optional: Include metadata in output files
84
+ metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
82
85
  });
83
86
 
84
87
  // Start scraping both sites
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.2.3",
3
+ "version": "3.2.4",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",