meatscraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +157 -0
- package/dist/cli.d.ts +10 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +64 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +60 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +74 -0
- package/dist/index.js.map +1 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts +10 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts.map +1 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.js +44 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.js.map +1 -0
- package/dist/metascraper-plugins/metascraper-reddit.d.ts +8 -0
- package/dist/metascraper-plugins/metascraper-reddit.d.ts.map +1 -0
- package/dist/metascraper-plugins/metascraper-reddit.js +47 -0
- package/dist/metascraper-plugins/metascraper-reddit.js.map +1 -0
- package/dist/metascraper-setup.d.ts +23 -0
- package/dist/metascraper-setup.d.ts.map +1 -0
- package/dist/metascraper-setup.js +78 -0
- package/dist/metascraper-setup.js.map +1 -0
- package/dist/modes/file-mode.d.ts +12 -0
- package/dist/modes/file-mode.d.ts.map +1 -0
- package/dist/modes/file-mode.js +63 -0
- package/dist/modes/file-mode.js.map +1 -0
- package/dist/modes/http-mode.d.ts +12 -0
- package/dist/modes/http-mode.d.ts.map +1 -0
- package/dist/modes/http-mode.js +111 -0
- package/dist/modes/http-mode.js.map +1 -0
- package/dist/pipeline.d.ts +23 -0
- package/dist/pipeline.d.ts.map +1 -0
- package/dist/pipeline.js +59 -0
- package/dist/pipeline.js.map +1 -0
- package/dist/steps/step1-metadata.d.ts +9 -0
- package/dist/steps/step1-metadata.d.ts.map +1 -0
- package/dist/steps/step1-metadata.js +42 -0
- package/dist/steps/step1-metadata.js.map +1 -0
- package/dist/steps/step2-readable.d.ts +16 -0
- package/dist/steps/step2-readable.d.ts.map +1 -0
- package/dist/steps/step2-readable.js +45 -0
- package/dist/steps/step2-readable.js.map +1 -0
- package/dist/steps/step3-sanitize.d.ts +15 -0
- package/dist/steps/step3-sanitize.d.ts.map +1 -0
- package/dist/steps/step3-sanitize.js +43 -0
- package/dist/steps/step3-sanitize.js.map +1 -0
- package/dist/steps/step4-plaintext.d.ts +14 -0
- package/dist/steps/step4-plaintext.d.ts.map +1 -0
- package/dist/steps/step4-plaintext.js +47 -0
- package/dist/steps/step4-plaintext.js.map +1 -0
- package/dist/steps/step5-image.d.ts +22 -0
- package/dist/steps/step5-image.d.ts.map +1 -0
- package/dist/steps/step5-image.js +121 -0
- package/dist/steps/step5-image.js.map +1 -0
- package/dist/types.d.ts +56 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/formatters.d.ts +45 -0
- package/dist/utils/formatters.d.ts.map +1 -0
- package/dist/utils/formatters.js +61 -0
- package/dist/utils/formatters.js.map +1 -0
- package/dist/utils.d.ts +17 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +34 -0
- package/dist/utils.js.map +1 -0
- package/package.json +72 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 paulohgodinho
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# meatscraper
|
|
2
|
+
|
|
3
|
+
Extract content from webpages! Perfect for bookmarking tools and AI ;)
|
|
4
|
+
|
|
5
|
+
Clean text content, metadata, and primary images from any webpage using [Metascraper](https://github.com/microlinkhq/metascraper), [Readability](https://github.com/mozilla/readability), [DOMPurify](https://github.com/cure53/DOMPurify) and custom logic.
|
|
6
|
+
|
|
7
|
+
*Disclaimer: This project was vibe coded.*
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Install as a library
|
|
13
|
+
npm install meatscraper
|
|
14
|
+
|
|
15
|
+
# Or install globally for CLI access
|
|
16
|
+
npm install -g meatscraper
|
|
17
|
+
|
|
18
|
+
# Or use directly with npx (no install needed)
|
|
19
|
+
npx meatscraper serve
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Inspiration
|
|
23
|
+
This project is based on [Karakeep](https://github.com/karakeep/karakeep). They have done an amazing job building a content extraction pipeline. I wanted to use that functionality in other projects, so I pulled it from them and the created this library/CLI/server around it.
|
|
24
|
+
|
|
25
|
+
## Quick Example
|
|
26
|
+
|
|
27
|
+
**Input HTML:**
|
|
28
|
+
```html
|
|
29
|
+
<html>
|
|
30
|
+
<head><title>My Article</title></head>
|
|
31
|
+
<body>
|
|
32
|
+
<h1>Hello World</h1>
|
|
33
|
+
<p>This is the actual content you want to keep.</p>
|
|
34
|
+
</body>
|
|
35
|
+
</html>
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
**Output JSON:**
|
|
39
|
+
```json
|
|
40
|
+
{
|
|
41
|
+
"success": true,
|
|
42
|
+
"data": {
|
|
43
|
+
"content": "Hello World\nThis is the actual content you want to keep.",
|
|
44
|
+
"image": null,
|
|
45
|
+
"metadata": {
|
|
46
|
+
"title": "My Article"
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
### As a Library (TypeScript/JavaScript)
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
import { meatExtractor } from 'meatscraper';
|
|
58
|
+
|
|
59
|
+
const result = await meatExtractor(htmlString);
|
|
60
|
+
console.log(result.content); // Clean text only
|
|
61
|
+
console.log(result.image); // Primary image URL
|
|
62
|
+
console.log(result.metadata); // {title, author, date, ...}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### CLI - Process Local File
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# After global install
|
|
69
|
+
meatscraper ./article.html
|
|
70
|
+
|
|
71
|
+
# Or with npx (no install needed)
|
|
72
|
+
npx meatscraper ./article.html
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Output is printed as JSON to stdout.
|
|
76
|
+
|
|
77
|
+
### CLI - Start HTTP Server
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# After global install
|
|
81
|
+
meatscraper serve
|
|
82
|
+
|
|
83
|
+
# Or with npx
|
|
84
|
+
npx meatscraper serve
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Server runs on port 8676. Send HTML via POST:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
curl -X POST http://localhost:8676/extract \
|
|
91
|
+
-H "Content-Type: application/json" \
|
|
92
|
+
-d '{"html":"<html>...</html>"}'
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Endpoints:
|
|
96
|
+
- `POST /extract` - Extract content from HTML
|
|
97
|
+
- `GET /health` - Health check
|
|
98
|
+
- `GET /stats` - Server statistics
|
|
99
|
+
|
|
100
|
+
### Docker
|
|
101
|
+
|
|
102
|
+
Pull and run the latest published image:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# Server mode
|
|
106
|
+
docker run -p 8676:8676 ghcr.io/paulohgodinho/meatscraper:main serve
|
|
107
|
+
|
|
108
|
+
# File mode (requires mounted volume)
|
|
109
|
+
docker run -v $(pwd):/data ghcr.io/paulohgodinho/meatscraper:main /data/article.html
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## API Response
|
|
113
|
+
|
|
114
|
+
Complete response structure:
|
|
115
|
+
|
|
116
|
+
```json
|
|
117
|
+
{
|
|
118
|
+
"success": true,
|
|
119
|
+
"data": {
|
|
120
|
+
"content": "Hello World\nThis is the actual content you want to keep.",
|
|
121
|
+
"image": "https://example.com/image.jpg",
|
|
122
|
+
"metadata": {
|
|
123
|
+
"title": "My Article",
|
|
124
|
+
"description": "Article description here",
|
|
125
|
+
"author": "John Doe",
|
|
126
|
+
"publisher": "Example Publication",
|
|
127
|
+
"datePublished": "2024-01-15T10:30:00Z",
|
|
128
|
+
"dateModified": "2024-01-15T12:00:00Z",
|
|
129
|
+
"url": "https://example.com/article",
|
|
130
|
+
"logo": "https://example.com/logo.png",
|
|
131
|
+
"youtubeVideoId": null,
|
|
132
|
+
"youtubeChannelName": null,
|
|
133
|
+
"youtubeChannelId": null,
|
|
134
|
+
"twitterHandle": null,
|
|
135
|
+
"twitterCreator": null,
|
|
136
|
+
"amazonPrice": null,
|
|
137
|
+
"amazonProductTitle": null,
|
|
138
|
+
"redditSubreddit": null,
|
|
139
|
+
"redditAuthor": null
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Features
|
|
146
|
+
|
|
147
|
+
- **5-step processing pipeline** - Metadata extraction, readability analysis, sanitization, plain text conversion, image selection
|
|
148
|
+
- **Rich metadata extraction** - Extracts 20+ fields including title, author, publish date, image, and platform-specific data
|
|
149
|
+
- **Multiple platforms** - Special handling for YouTube, Twitter, Amazon, Reddit
|
|
150
|
+
- **HTML sanitization** - Removes scripts, styles, and dangerous content
|
|
151
|
+
- **Plain text output** - No HTML tags, clean readable text
|
|
152
|
+
- **Image selection** - Finds and returns the best primary image
|
|
153
|
+
- **Three usage modes** - Library, CLI, or HTTP server
|
|
154
|
+
|
|
155
|
+
## License
|
|
156
|
+
|
|
157
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI entry point for meatscraper
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* meatscraper <file-path> # File mode: extract from HTML file
|
|
7
|
+
* meatscraper serve # Server mode: start HTTP server on port 8676
|
|
8
|
+
*/
|
|
9
|
+
export {};
|
|
10
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;;GAMG"}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* CLI entry point for meatscraper
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* meatscraper <file-path> # File mode: extract from HTML file
|
|
8
|
+
* meatscraper serve # Server mode: start HTTP server on port 8676
|
|
9
|
+
*/
|
|
10
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
+
const file_mode_1 = require("./modes/file-mode");
|
|
12
|
+
const http_mode_1 = require("./modes/http-mode");
|
|
13
|
+
async function main() {
|
|
14
|
+
const args = process.argv.slice(2);
|
|
15
|
+
const command = args[0];
|
|
16
|
+
// No arguments provided
|
|
17
|
+
if (!command) {
|
|
18
|
+
console.error("❌ No command provided");
|
|
19
|
+
console.error("\nUsage:");
|
|
20
|
+
console.error(" meatscraper <file-path> Extract content from an HTML file");
|
|
21
|
+
console.error(" meatscraper serve Start HTTP server on port 8676");
|
|
22
|
+
console.error("\nExamples:");
|
|
23
|
+
console.error(" meatscraper ./example.html");
|
|
24
|
+
console.error(" meatscraper /path/to/file.html");
|
|
25
|
+
console.error(" meatscraper serve");
|
|
26
|
+
process.exit(1);
|
|
27
|
+
}
|
|
28
|
+
// Server mode
|
|
29
|
+
if (command === "serve") {
|
|
30
|
+
try {
|
|
31
|
+
(0, http_mode_1.startHttpServer)(8676);
|
|
32
|
+
// Keep the process running
|
|
33
|
+
process.on("SIGINT", () => {
|
|
34
|
+
console.log("\n\n👋 Server shutting down...");
|
|
35
|
+
process.exit(0);
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
catch (error) {
|
|
39
|
+
console.error("❌ Failed to start server:");
|
|
40
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
41
|
+
process.exit(1);
|
|
42
|
+
}
|
|
43
|
+
return;
|
|
44
|
+
}
|
|
45
|
+
// File mode
|
|
46
|
+
try {
|
|
47
|
+
const result = await (0, file_mode_1.processFileMode)(command);
|
|
48
|
+
// Output to stdout for piping/redirection
|
|
49
|
+
console.log(result);
|
|
50
|
+
process.exit(0);
|
|
51
|
+
}
|
|
52
|
+
catch (error) {
|
|
53
|
+
console.error("❌ Error processing file:");
|
|
54
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
55
|
+
process.exit(1);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
// Run the CLI
|
|
59
|
+
main().catch((error) => {
|
|
60
|
+
console.error("❌ Unexpected error:");
|
|
61
|
+
console.error(error);
|
|
62
|
+
process.exit(1);
|
|
63
|
+
});
|
|
64
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";;AAEA;;;;;;GAMG;;AAEH,iDAAoD;AACpD,iDAAoD;AAEpD,KAAK,UAAU,IAAI;IACjB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;IAExB,wBAAwB;IACxB,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;QACvC,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;QAC1B,OAAO,CAAC,KAAK,CAAC,iEAAiE,CAAC,CAAC;QACjF,OAAO,CAAC,KAAK,CAAC,8DAA8D,CAAC,CAAC;QAC9E,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QAC9C,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QAClD,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;QACrC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,cAAc;IACd,IAAI,OAAO,KAAK,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC;YACH,IAAA,2BAAe,EAAC,IAAI,CAAC,CAAC;YACtB,2BAA2B;YAC3B,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE;gBACxB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;gBAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC;YAC3C,OAAO,CAAC,KAAK,CACX,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CACvD,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QACD,OAAO;IACT,CAAC;IAED,YAAY;IACZ,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,IAAA,2BAAe,EAAC,OAAO,CAAC,CAAC;QAC9C,0CAA0C;QAC1C,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACpB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC1C,OAAO,CAAC,KAAK,CACX,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CACvD,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED,cAAc;AACd,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACrB,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACrC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* meatscraper - Extract text content and primary image from webpages
|
|
3
|
+
*
|
|
4
|
+
* A comprehensive web scraping package that processes HTML through multiple
|
|
5
|
+
* cleaning and extraction steps to produce clean text and metadata.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* import { meatExtractor } from 'meatscraper';
|
|
10
|
+
*
|
|
11
|
+
* const result = await meatExtractor(htmlString);
|
|
12
|
+
* console.log(result.content); // Plain text
|
|
13
|
+
* console.log(result.image); // Image URL
|
|
14
|
+
* console.log(result.metadata); // Full metadata
|
|
15
|
+
* ```
|
|
16
|
+
*/
|
|
17
|
+
import { MeatExtractorResult, MeatExtractorOptions } from "./types";
|
|
18
|
+
/**
|
|
19
|
+
* Extract text content and metadata from HTML
|
|
20
|
+
*
|
|
21
|
+
* Processes HTML through a 5-step pipeline:
|
|
22
|
+
* 1. Metadata extraction (metascraper with 12+ plugins)
|
|
23
|
+
* 2. Readable content extraction (Mozilla Readability)
|
|
24
|
+
* 3. HTML sanitization (DOMPurify)
|
|
25
|
+
* 4. Plain text conversion (html-to-text)
|
|
26
|
+
* 5. Image selection (best primary image)
|
|
27
|
+
*
|
|
28
|
+
* @param htmlString - Raw HTML content to process
|
|
29
|
+
* @param options - Configuration options
|
|
30
|
+
* @returns Promise resolving to extraction result with content, image, and metadata
|
|
31
|
+
*
|
|
32
|
+
* @example
|
|
33
|
+
* ```typescript
|
|
34
|
+
* // Basic usage
|
|
35
|
+
* const result = await meatExtractor(html);
|
|
36
|
+
* console.log(result.content); // Plain text
|
|
37
|
+
* console.log(result.image); // Image URL or null
|
|
38
|
+
* console.log(result.metadata); // {title, description, author, ...}
|
|
39
|
+
*
|
|
40
|
+
* // With debugging
|
|
41
|
+
* const result = await meatExtractor(html, { debug: true });
|
|
42
|
+
* console.log(result.debug?.step1_metadata);
|
|
43
|
+
* console.log(result.debug?.step2_readableContent);
|
|
44
|
+
* console.log(result.debug?.step3_sanitizedContent);
|
|
45
|
+
*
|
|
46
|
+
* // With URL hint
|
|
47
|
+
* const result = await meatExtractor(html, {
|
|
48
|
+
* url: 'https://example.com/article'
|
|
49
|
+
* });
|
|
50
|
+
* ```
|
|
51
|
+
*/
|
|
52
|
+
export declare function meatExtractor(htmlString: string, options?: MeatExtractorOptions): Promise<MeatExtractorResult>;
|
|
53
|
+
export type { MeatExtractorResult, MeatExtractorOptions, MetadataResult, DebugInfo, } from "./types";
|
|
54
|
+
export { step1ExtractMetadata } from "./steps/step1-metadata";
|
|
55
|
+
export { step2ExtractReadableContent } from "./steps/step2-readable";
|
|
56
|
+
export { step3SanitizeHtml } from "./steps/step3-sanitize";
|
|
57
|
+
export { step4ConvertToPlainText } from "./steps/step4-plaintext";
|
|
58
|
+
export { step5SelectImage } from "./steps/step5-image";
|
|
59
|
+
export { extractMetadata, createMetascraperParser } from "./metascraper-setup";
|
|
60
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,SAAS,CAAC;AAGpE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACH,wBAAsB,aAAa,CACjC,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,oBAAoB,GAC7B,OAAO,CAAC,mBAAmB,CAAC,CAE9B;AAGD,YAAY,EACV,mBAAmB,EACnB,oBAAoB,EACpB,cAAc,EACd,SAAS,GACV,MAAM,SAAS,CAAC;AAGjB,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,2BAA2B,EAAE,MAAM,wBAAwB,CAAC;AACrE,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,uBAAuB,EAAE,MAAM,yBAAyB,CAAC;AAClE,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAGvD,OAAO,EAAE,eAAe,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* meatscraper - Extract text content and primary image from webpages
|
|
4
|
+
*
|
|
5
|
+
* A comprehensive web scraping package that processes HTML through multiple
|
|
6
|
+
* cleaning and extraction steps to produce clean text and metadata.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```typescript
|
|
10
|
+
* import { meatExtractor } from 'meatscraper';
|
|
11
|
+
*
|
|
12
|
+
* const result = await meatExtractor(htmlString);
|
|
13
|
+
* console.log(result.content); // Plain text
|
|
14
|
+
* console.log(result.image); // Image URL
|
|
15
|
+
* console.log(result.metadata); // Full metadata
|
|
16
|
+
* ```
|
|
17
|
+
*/
|
|
18
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
19
|
+
exports.createMetascraperParser = exports.extractMetadata = exports.step5SelectImage = exports.step4ConvertToPlainText = exports.step3SanitizeHtml = exports.step2ExtractReadableContent = exports.step1ExtractMetadata = void 0;
|
|
20
|
+
exports.meatExtractor = meatExtractor;
|
|
21
|
+
const pipeline_1 = require("./pipeline");
|
|
22
|
+
/**
|
|
23
|
+
* Extract text content and metadata from HTML
|
|
24
|
+
*
|
|
25
|
+
* Processes HTML through a 5-step pipeline:
|
|
26
|
+
* 1. Metadata extraction (metascraper with 12+ plugins)
|
|
27
|
+
* 2. Readable content extraction (Mozilla Readability)
|
|
28
|
+
* 3. HTML sanitization (DOMPurify)
|
|
29
|
+
* 4. Plain text conversion (html-to-text)
|
|
30
|
+
* 5. Image selection (best primary image)
|
|
31
|
+
*
|
|
32
|
+
* @param htmlString - Raw HTML content to process
|
|
33
|
+
* @param options - Configuration options
|
|
34
|
+
* @returns Promise resolving to extraction result with content, image, and metadata
|
|
35
|
+
*
|
|
36
|
+
* @example
|
|
37
|
+
* ```typescript
|
|
38
|
+
* // Basic usage
|
|
39
|
+
* const result = await meatExtractor(html);
|
|
40
|
+
* console.log(result.content); // Plain text
|
|
41
|
+
* console.log(result.image); // Image URL or null
|
|
42
|
+
* console.log(result.metadata); // {title, description, author, ...}
|
|
43
|
+
*
|
|
44
|
+
* // With debugging
|
|
45
|
+
* const result = await meatExtractor(html, { debug: true });
|
|
46
|
+
* console.log(result.debug?.step1_metadata);
|
|
47
|
+
* console.log(result.debug?.step2_readableContent);
|
|
48
|
+
* console.log(result.debug?.step3_sanitizedContent);
|
|
49
|
+
*
|
|
50
|
+
* // With URL hint
|
|
51
|
+
* const result = await meatExtractor(html, {
|
|
52
|
+
* url: 'https://example.com/article'
|
|
53
|
+
* });
|
|
54
|
+
* ```
|
|
55
|
+
*/
|
|
56
|
+
async function meatExtractor(htmlString, options) {
|
|
57
|
+
return (0, pipeline_1.executePipeline)(htmlString, options);
|
|
58
|
+
}
|
|
59
|
+
// Re-export individual step functions for advanced usage
|
|
60
|
+
var step1_metadata_1 = require("./steps/step1-metadata");
|
|
61
|
+
Object.defineProperty(exports, "step1ExtractMetadata", { enumerable: true, get: function () { return step1_metadata_1.step1ExtractMetadata; } });
|
|
62
|
+
var step2_readable_1 = require("./steps/step2-readable");
|
|
63
|
+
Object.defineProperty(exports, "step2ExtractReadableContent", { enumerable: true, get: function () { return step2_readable_1.step2ExtractReadableContent; } });
|
|
64
|
+
var step3_sanitize_1 = require("./steps/step3-sanitize");
|
|
65
|
+
Object.defineProperty(exports, "step3SanitizeHtml", { enumerable: true, get: function () { return step3_sanitize_1.step3SanitizeHtml; } });
|
|
66
|
+
var step4_plaintext_1 = require("./steps/step4-plaintext");
|
|
67
|
+
Object.defineProperty(exports, "step4ConvertToPlainText", { enumerable: true, get: function () { return step4_plaintext_1.step4ConvertToPlainText; } });
|
|
68
|
+
var step5_image_1 = require("./steps/step5-image");
|
|
69
|
+
Object.defineProperty(exports, "step5SelectImage", { enumerable: true, get: function () { return step5_image_1.step5SelectImage; } });
|
|
70
|
+
// Re-export metascraper utilities
|
|
71
|
+
var metascraper_setup_1 = require("./metascraper-setup");
|
|
72
|
+
Object.defineProperty(exports, "extractMetadata", { enumerable: true, get: function () { return metascraper_setup_1.extractMetadata; } });
|
|
73
|
+
Object.defineProperty(exports, "createMetascraperParser", { enumerable: true, get: function () { return metascraper_setup_1.createMetascraperParser; } });
|
|
74
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;GAeG;;;AAuCH,sCAKC;AAzCD,yCAA6C;AAE7C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACI,KAAK,UAAU,aAAa,CACjC,UAAkB,EAClB,OAA8B;IAE9B,OAAO,IAAA,0BAAe,EAAC,UAAU,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC;AAUD,yDAAyD;AACzD,yDAA8D;AAArD,sHAAA,oBAAoB,OAAA;AAC7B,yDAAqE;AAA5D,6HAAA,2BAA2B,OAAA;AACpC,yDAA2D;AAAlD,mHAAA,iBAAiB,OAAA;AAC1B,2DAAkE;AAAzD,0HAAA,uBAAuB,OAAA;AAChC,mDAAuD;AAA9C,+GAAA,gBAAgB,OAAA;AAEzB,kCAAkC;AAClC,yDAA+E;AAAtE,oHAAA,eAAe,OAAA;AAAE,4HAAA,uBAAuB,OAAA"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Metascraper plugin for improved Amazon product image extraction
|
|
3
|
+
*
|
|
4
|
+
* Fixes image extraction bug in standard metascraperAmazon by prioritizing
|
|
5
|
+
* high-quality product images over generic site logos.
|
|
6
|
+
*
|
|
7
|
+
* MUST be used before metascraperAmazon() in the plugin chain
|
|
8
|
+
*/
|
|
9
|
+
export default function metascraperAmazonImproved(opts?: any): any;
|
|
10
|
+
//# sourceMappingURL=metascraper-amazon-improved.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metascraper-amazon-improved.d.ts","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-amazon-improved.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,MAAM,CAAC,OAAO,UAAU,yBAAyB,CAAC,IAAI,CAAC,EAAE,GAAG,OA6C3D"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Metascraper plugin for improved Amazon product image extraction
|
|
4
|
+
*
|
|
5
|
+
* Fixes image extraction bug in standard metascraperAmazon by prioritizing
|
|
6
|
+
* high-quality product images over generic site logos.
|
|
7
|
+
*
|
|
8
|
+
* MUST be used before metascraperAmazon() in the plugin chain
|
|
9
|
+
*/
|
|
10
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
+
exports.default = metascraperAmazonImproved;
|
|
12
|
+
const helpers_1 = require("@metascraper/helpers");
|
|
13
|
+
function metascraperAmazonImproved(opts) {
|
|
14
|
+
const toImage = (0, helpers_1.toRule)(helpers_1.image, opts);
|
|
15
|
+
const rules = {
|
|
16
|
+
image: [
|
|
17
|
+
// Amazon product main image (most specific selector)
|
|
18
|
+
toImage(($) => $('img[data-a-dynamic-image]')
|
|
19
|
+
.first()
|
|
20
|
+
.attr('src') ||
|
|
21
|
+
$('img[data-a-dynamic-image]').first().attr('data-src')),
|
|
22
|
+
// Amazon product landing page images
|
|
23
|
+
toImage(($) => $('.a-dynamic-image img')
|
|
24
|
+
.first()
|
|
25
|
+
.attr('src') ||
|
|
26
|
+
$('.a-dynamic-image img').first().attr('data-src')),
|
|
27
|
+
// Amazon image container
|
|
28
|
+
toImage(($) => $('img.a-dynamic-image')
|
|
29
|
+
.first()
|
|
30
|
+
.attr('src') ||
|
|
31
|
+
$('img.a-dynamic-image').first().attr('data-src')),
|
|
32
|
+
// Generic product image
|
|
33
|
+
toImage(($) => $('img[alt*="product"]')
|
|
34
|
+
.first()
|
|
35
|
+
.attr('src') ||
|
|
36
|
+
$('img[alt*="product"]').first().attr('data-src')),
|
|
37
|
+
// Fallback to og:image
|
|
38
|
+
toImage(($) => $('meta[property="og:image"]').attr('content')),
|
|
39
|
+
],
|
|
40
|
+
};
|
|
41
|
+
rules.pkgName = 'metascraper-amazon-improved';
|
|
42
|
+
return rules;
|
|
43
|
+
}
|
|
44
|
+
//# sourceMappingURL=metascraper-amazon-improved.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metascraper-amazon-improved.js","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-amazon-improved.ts"],"names":[],"mappings":";AAAA;;;;;;;GAOG;;AAIH,4CA6CC;AA/CD,kDAAoD;AAEpD,SAAwB,yBAAyB,CAAC,IAAU;IAC1D,MAAM,OAAO,GAAG,IAAA,gBAAM,EAAC,eAAK,EAAE,IAAI,CAAC,CAAA;IAEnC,MAAM,KAAK,GAAQ;QACjB,KAAK,EAAE;YACL,qDAAqD;YACrD,OAAO,CACL,CAAC,CAAM,EAAE,EAAE,CACT,CAAC,CAAC,2BAA2B,CAAC;iBAC3B,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,2BAA2B,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CAC1D;YACD,qCAAqC;YACrC,OAAO,CACL,CAAC,CAAO,EAAE,EAAE,CACV,CAAC,CAAC,sBAAsB,CAAC;iBACtB,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,sBAAsB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CACrD;YACD,yBAAyB;YACzB,OAAO,CACL,CAAC,CAAO,EAAE,EAAE,CACV,CAAC,CAAC,qBAAqB,CAAC;iBACrB,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,qBAAqB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CACpD;YACD,wBAAwB;YACxB,OAAO,CACL,CAAC,CAAO,EAAE,EAAE,CACV,CAAC,CAAC,qBAAqB,CAAC;iBACrB,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,qBAAqB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CACpD;YACD,uBAAuB;YACvB,OAAO,CAAC,CAAC,CAAO,EAAE,EAAE,CAAC,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;SACrE;KACF,CAAA;IAED,KAAK,CAAC,OAAO,GAAG,6BAA6B,CAAA;IAE7C,OAAO,KAAK,CAAA;AACd,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Metascraper plugin for Reddit post metadata extraction
|
|
3
|
+
*
|
|
4
|
+
* Extracts Reddit-specific metadata including subreddit, author,
|
|
5
|
+
* upvote count, and other post information
|
|
6
|
+
*/
|
|
7
|
+
export default function metascraperReddit(opts?: any): any;
|
|
8
|
+
//# sourceMappingURL=metascraper-reddit.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metascraper-reddit.d.ts","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-reddit.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,MAAM,CAAC,OAAO,UAAU,iBAAiB,CAAC,IAAI,CAAC,EAAE,GAAG,OAgDnD"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Metascraper plugin for Reddit post metadata extraction
|
|
4
|
+
*
|
|
5
|
+
* Extracts Reddit-specific metadata including subreddit, author,
|
|
6
|
+
* upvote count, and other post information
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.default = metascraperReddit;
|
|
10
|
+
const helpers_1 = require("@metascraper/helpers");
|
|
11
|
+
// Identity processor for custom fields that don't need special handling
|
|
12
|
+
const identity = (value) => value;
|
|
13
|
+
function metascraperReddit(opts) {
|
|
14
|
+
const toCustom = (0, helpers_1.toRule)(identity, opts);
|
|
15
|
+
const rules = {
|
|
16
|
+
// Extract subreddit from the og:url meta tag
|
|
17
|
+
subreddit: [
|
|
18
|
+
toCustom(($) => {
|
|
19
|
+
const ogUrl = $('meta[property="og:url"]').attr('content');
|
|
20
|
+
const match = ogUrl?.match(/\/r\/([^/]+)/);
|
|
21
|
+
return match ? match[1] : undefined;
|
|
22
|
+
}),
|
|
23
|
+
],
|
|
24
|
+
// Extract author from meta tags
|
|
25
|
+
author: [
|
|
26
|
+
toCustom(($) => $('meta[name="author"]').attr('content')),
|
|
27
|
+
toCustom(($) => (0, helpers_1.$filter)($, $('[data-testid="post_author_by_line"] a[href*="/user/"]'))),
|
|
28
|
+
],
|
|
29
|
+
// Extract description from og:description
|
|
30
|
+
description: [
|
|
31
|
+
toCustom(($) => $('meta[property="og:description"]').attr('content')),
|
|
32
|
+
],
|
|
33
|
+
// Extract Reddit-specific metadata (upvotes)
|
|
34
|
+
redditUpvotes: [
|
|
35
|
+
toCustom(($) => {
|
|
36
|
+
const upvoteText = $('._1rZjMh_0').text();
|
|
37
|
+
const match = upvoteText?.match(/([\d.,]+)/);
|
|
38
|
+
return match
|
|
39
|
+
? parseInt(match[1].replace(/[,.']/g, ''), 10)
|
|
40
|
+
: undefined;
|
|
41
|
+
}),
|
|
42
|
+
],
|
|
43
|
+
};
|
|
44
|
+
rules.pkgName = 'metascraper-reddit';
|
|
45
|
+
return rules;
|
|
46
|
+
}
|
|
47
|
+
//# sourceMappingURL=metascraper-reddit.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metascraper-reddit.js","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-reddit.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;AAOH,oCAgDC;AArDD,kDAAsD;AAEtD,wEAAwE;AACxE,MAAM,QAAQ,GAAG,CAAC,KAAU,EAAE,EAAE,CAAC,KAAK,CAAA;AAEtC,SAAwB,iBAAiB,CAAC,IAAU;IAClD,MAAM,QAAQ,GAAG,IAAA,gBAAM,EAAC,QAAQ,EAAE,IAAI,CAAC,CAAA;IAEvC,MAAM,KAAK,GAAQ;QACjB,6CAA6C;QAC7C,SAAS,EAAE;YACT,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE;gBACnB,MAAM,KAAK,GAAG,CAAC,CAAC,yBAAyB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBAC1D,MAAM,KAAK,GAAG,KAAK,EAAE,KAAK,CAAC,cAAc,CAAC,CAAA;gBAC1C,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;YACrC,CAAC,CAAC;SACH;QAED,gCAAgC;QAChC,MAAM,EAAE;YACN,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE,CAAC,CAAC,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC/D,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE,CACnB,IAAA,iBAAO,EACL,CAAC,EACD,CAAC,CACC,uDAAuD,CACxD,CACF,CACF;SACF;QAED,0CAA0C;QAC1C,WAAW,EAAE;YACX,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE,CACnB,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CACrD;SACF;QAED,6CAA6C;QAC7C,aAAa,EAAE;YACb,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE;gBACnB,MAAM,UAAU,GAAG,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAA;gBACzC,MAAM,KAAK,GAAG,UAAU,EAAE,KAAK,CAAC,WAAW,CAAC,CAAA;gBAC5C,OAAO,KAAK;oBACV,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;oBAC9C,CAAC,CAAC,SAAS,CAAA;YACf,CAAC,CAAC;SACH;KACF,CAAA;IAED,KAAK,CAAC,OAAO,GAAG,oBAAoB,CAAA;IAEpC,OAAO,KAAK,CAAA;AACd,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Metascraper configuration with all plugins
 *
 * This sets up the complete metascraper parser with all plugins
 * in the correct order to extract comprehensive metadata from webpages
 */
import metascraper from "metascraper";
/**
 * Create and return the configured metascraper parser
 *
 * Plugin order is important - some plugins must come before others
 * to ensure correct extraction priority
 *
 * @returns A metascraper instance configured with this package's plugin set
 */
export declare function createMetascraperParser(): metascraper.Metascraper;
/**
 * Extract metadata from HTML content
 *
 * @param htmlContent - The raw HTML string to extract metadata from
 * @param url - Optional URL for context (helps canonicalization)
 * @returns Promise resolving to metadata object (keys depend on which
 *          plugins matched the page)
 */
export declare function extractMetadata(htmlContent: string, url?: string): Promise<Record<string, any>>;
//# sourceMappingURL=metascraper-setup.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metascraper-setup.d.ts","sourceRoot":"","sources":["../src/metascraper-setup.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,WAAW,MAAM,aAAa,CAAC;AAgBtC;;;;;GAKG;AACH,wBAAgB,uBAAuB,4BAgCtC;AAED;;;;;;GAMG;AACH,wBAAsB,eAAe,CACnC,WAAW,EAAE,MAAM,EACnB,GAAG,CAAC,EAAE,MAAM,GACX,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAW9B"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"use strict";
/**
 * Metascraper configuration with all plugins
 *
 * This sets up the complete metascraper parser with all plugins
 * in the correct order to extract comprehensive metadata from webpages
 */
// TypeScript-emitted interop helper: wraps a CommonJS export in
// { default: mod } unless the module already carries the __esModule
// marker, so `import x from "mod"` behaves consistently.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module (functions are hoisted, so assigning here is safe).
exports.createMetascraperParser = createMetascraperParser;
exports.extractMetadata = extractMetadata;
|
|
14
|
+
const metascraper_1 = __importDefault(require("metascraper"));
|
|
15
|
+
const metascraper_amazon_1 = __importDefault(require("metascraper-amazon"));
|
|
16
|
+
const metascraper_author_1 = __importDefault(require("metascraper-author"));
|
|
17
|
+
const metascraper_date_1 = __importDefault(require("metascraper-date"));
|
|
18
|
+
const metascraper_description_1 = __importDefault(require("metascraper-description"));
|
|
19
|
+
const metascraper_image_1 = __importDefault(require("metascraper-image"));
|
|
20
|
+
const metascraper_logo_favicon_1 = __importDefault(require("metascraper-logo-favicon"));
|
|
21
|
+
const metascraper_publisher_1 = __importDefault(require("metascraper-publisher"));
|
|
22
|
+
const metascraper_title_1 = __importDefault(require("metascraper-title"));
|
|
23
|
+
const metascraper_url_1 = __importDefault(require("metascraper-url"));
|
|
24
|
+
const metascraper_x_1 = __importDefault(require("metascraper-x"));
|
|
25
|
+
const metascraper_youtube_1 = __importDefault(require("metascraper-youtube"));
|
|
26
|
+
const metascraper_amazon_improved_1 = __importDefault(require("./metascraper-plugins/metascraper-amazon-improved"));
|
|
27
|
+
const metascraper_reddit_1 = __importDefault(require("./metascraper-plugins/metascraper-reddit"));
|
|
28
|
+
/**
 * Build the metascraper instance used by this package.
 *
 * The plugin list is ordered deliberately: date extraction runs first,
 * site-specific rules (the improved Amazon plugin before the stock Amazon
 * plugin, then YouTube and Reddit) take priority over the generic
 * extractors, the primary image source is resolved before the
 * logo/favicon fallback, and URL canonicalization runs last.
 */
function createMetascraperParser() {
    const plugins = [
        // Dates first so later rules see normalized date values.
        (0, metascraper_date_1.default)({
            dateModified: true,
            datePublished: true,
        }),
        // Site-specific rules win over the generic ones below; the improved
        // Amazon plugin MUST precede the stock Amazon plugin.
        (0, metascraper_amazon_improved_1.default)(),
        (0, metascraper_amazon_1.default)(),
        (0, metascraper_youtube_1.default)(),
        (0, metascraper_reddit_1.default)(),
        // Generic metadata extraction.
        (0, metascraper_author_1.default)(),
        (0, metascraper_publisher_1.default)(),
        (0, metascraper_title_1.default)(),
        (0, metascraper_description_1.default)(),
        (0, metascraper_x_1.default)(),
        // Primary image source, then logo/favicon as a fallback image.
        (0, metascraper_image_1.default)(),
        (0, metascraper_logo_favicon_1.default)(),
        // URL canonicalization should always run last.
        (0, metascraper_url_1.default)(),
    ];
    return (0, metascraper_1.default)(plugins);
}
|
|
61
|
+
/**
 * Extract metadata from HTML content
 *
 * @param htmlContent - The raw HTML string to extract metadata from
 * @param url - Optional URL for context (helps canonicalization)
 * @returns Promise resolving to metadata object
 */
async function extractMetadata(htmlContent, url) {
    // Fix: the original rebuilt the parser — instantiating every plugin —
    // on each call, although the configuration is static. Construct it once
    // and cache it on the function object for reuse across calls.
    if (!extractMetadata._parser) {
        extractMetadata._parser = createMetascraperParser();
    }
    const meta = await extractMetadata._parser({
        html: htmlContent,
        url: url || "about:blank",
        // Don't validate URL - we're processing pre-fetched HTML
        validateUrl: false,
    });
    return meta;
}
|
|
78
|
+
//# sourceMappingURL=metascraper-setup.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metascraper-setup.js","sourceRoot":"","sources":["../src/metascraper-setup.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;;;AAwBH,0DAgCC;AASD,0CAcC;AA7ED,8DAAsC;AACtC,4EAAmD;AACnD,4EAAmD;AACnD,wEAA+C;AAC/C,sFAA6D;AAC7D,0EAAiD;AACjD,wFAAuD;AACvD,kFAAyD;AACzD,0EAAiD;AACjD,sEAA6C;AAC7C,kEAAyC;AACzC,8EAAqD;AAErD,oHAA0F;AAC1F,kGAAyE;AAEzE;;;;;GAKG;AACH,SAAgB,uBAAuB;IACrC,OAAO,IAAA,qBAAW,EAAC;QACjB,gCAAgC;QAChC,IAAA,0BAAe,EAAC;YACd,YAAY,EAAE,IAAI;YAClB,aAAa,EAAE,IAAI;SACpB,CAAC;QAEF,sDAAsD;QACtD,IAAA,qCAAyB,GAAS;QAClC,IAAA,4BAAiB,GAAE;QAEnB,+BAA+B;QAC/B,IAAA,6BAAkB,GAAE;QACpB,IAAA,4BAAiB,GAAS;QAE1B,8BAA8B;QAC9B,IAAA,4BAAiB,GAAE;QACnB,IAAA,+BAAoB,GAAE;QACtB,IAAA,2BAAgB,GAAE;QAClB,IAAA,iCAAsB,GAAE;QACxB,IAAA,uBAAY,GAAE;QAEd,0CAA0C;QAC1C,IAAA,2BAAgB,GAAE;QAElB,iCAAiC;QACjC,IAAA,kCAAe,GAAE;QAEjB,wCAAwC;QACxC,IAAA,yBAAc,GAAE;KACV,CAAC,CAAC;AACZ,CAAC;AAED;;;;;;GAMG;AACI,KAAK,UAAU,eAAe,CACnC,WAAmB,EACnB,GAAY;IAEZ,MAAM,MAAM,GAAG,uBAAuB,EAAE,CAAC;IAEzC,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC;QACxB,IAAI,EAAE,WAAW;QACjB,GAAG,EAAE,GAAG,IAAI,aAAa;QACzB,yDAAyD;QACzD,WAAW,EAAE,KAAK;KACnB,CAAC,CAAC;IAEH,OAAO,IAAI,CAAC;AACd,CAAC"}
|