meatscraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +157 -0
  3. package/dist/cli.d.ts +10 -0
  4. package/dist/cli.d.ts.map +1 -0
  5. package/dist/cli.js +64 -0
  6. package/dist/cli.js.map +1 -0
  7. package/dist/index.d.ts +60 -0
  8. package/dist/index.d.ts.map +1 -0
  9. package/dist/index.js +74 -0
  10. package/dist/index.js.map +1 -0
  11. package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts +10 -0
  12. package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts.map +1 -0
  13. package/dist/metascraper-plugins/metascraper-amazon-improved.js +44 -0
  14. package/dist/metascraper-plugins/metascraper-amazon-improved.js.map +1 -0
  15. package/dist/metascraper-plugins/metascraper-reddit.d.ts +8 -0
  16. package/dist/metascraper-plugins/metascraper-reddit.d.ts.map +1 -0
  17. package/dist/metascraper-plugins/metascraper-reddit.js +47 -0
  18. package/dist/metascraper-plugins/metascraper-reddit.js.map +1 -0
  19. package/dist/metascraper-setup.d.ts +23 -0
  20. package/dist/metascraper-setup.d.ts.map +1 -0
  21. package/dist/metascraper-setup.js +78 -0
  22. package/dist/metascraper-setup.js.map +1 -0
  23. package/dist/modes/file-mode.d.ts +12 -0
  24. package/dist/modes/file-mode.d.ts.map +1 -0
  25. package/dist/modes/file-mode.js +63 -0
  26. package/dist/modes/file-mode.js.map +1 -0
  27. package/dist/modes/http-mode.d.ts +12 -0
  28. package/dist/modes/http-mode.d.ts.map +1 -0
  29. package/dist/modes/http-mode.js +111 -0
  30. package/dist/modes/http-mode.js.map +1 -0
  31. package/dist/pipeline.d.ts +23 -0
  32. package/dist/pipeline.d.ts.map +1 -0
  33. package/dist/pipeline.js +59 -0
  34. package/dist/pipeline.js.map +1 -0
  35. package/dist/steps/step1-metadata.d.ts +9 -0
  36. package/dist/steps/step1-metadata.d.ts.map +1 -0
  37. package/dist/steps/step1-metadata.js +42 -0
  38. package/dist/steps/step1-metadata.js.map +1 -0
  39. package/dist/steps/step2-readable.d.ts +16 -0
  40. package/dist/steps/step2-readable.d.ts.map +1 -0
  41. package/dist/steps/step2-readable.js +45 -0
  42. package/dist/steps/step2-readable.js.map +1 -0
  43. package/dist/steps/step3-sanitize.d.ts +15 -0
  44. package/dist/steps/step3-sanitize.d.ts.map +1 -0
  45. package/dist/steps/step3-sanitize.js +43 -0
  46. package/dist/steps/step3-sanitize.js.map +1 -0
  47. package/dist/steps/step4-plaintext.d.ts +14 -0
  48. package/dist/steps/step4-plaintext.d.ts.map +1 -0
  49. package/dist/steps/step4-plaintext.js +47 -0
  50. package/dist/steps/step4-plaintext.js.map +1 -0
  51. package/dist/steps/step5-image.d.ts +22 -0
  52. package/dist/steps/step5-image.d.ts.map +1 -0
  53. package/dist/steps/step5-image.js +121 -0
  54. package/dist/steps/step5-image.js.map +1 -0
  55. package/dist/types.d.ts +56 -0
  56. package/dist/types.d.ts.map +1 -0
  57. package/dist/types.js +3 -0
  58. package/dist/types.js.map +1 -0
  59. package/dist/utils/formatters.d.ts +45 -0
  60. package/dist/utils/formatters.d.ts.map +1 -0
  61. package/dist/utils/formatters.js +61 -0
  62. package/dist/utils/formatters.js.map +1 -0
  63. package/dist/utils.d.ts +17 -0
  64. package/dist/utils.d.ts.map +1 -0
  65. package/dist/utils.js +34 -0
  66. package/dist/utils.js.map +1 -0
  67. package/package.json +72 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 paulohgodinho
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,157 @@
1
+ # meatscraper
2
+
3
+ Extract content from webpages! Perfect for bookmarking tools and AI ;)
4
+
5
+ Clean text content, metadata, and primary images from any webpage using [Metascraper](https://github.com/microlinkhq/metascraper), [Readability](https://github.com/mozilla/readability), [DOMPurify](https://github.com/cure53/DOMPurify) and custom logic.
6
+
7
+ *Disclaimer: This project was vibe coded.*
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ # Install as a library
13
+ npm install meatscraper
14
+
15
+ # Or install globally for CLI access
16
+ npm install -g meatscraper
17
+
18
+ # Or use directly with npx (no install needed)
19
+ npx meatscraper serve
20
+ ```
21
+
22
+ ## Inspiration
23
+ This project is based on [Karakeep](https://github.com/karakeep/karakeep). They have done an amazing job building a content extraction pipeline. I wanted to use that functionality in other projects, so I pulled it from them and then created this library/CLI/server around it.
24
+
25
+ ## Quick Example
26
+
27
+ **Input HTML:**
28
+ ```html
29
+ <html>
30
+ <head><title>My Article</title></head>
31
+ <body>
32
+ <h1>Hello World</h1>
33
+ <p>This is the actual content you want to keep.</p>
34
+ </body>
35
+ </html>
36
+ ```
37
+
38
+ **Output JSON:**
39
+ ```json
40
+ {
41
+ "success": true,
42
+ "data": {
43
+ "content": "Hello World\nThis is the actual content you want to keep.",
44
+ "image": null,
45
+ "metadata": {
46
+ "title": "My Article"
47
+ }
48
+ }
49
+ }
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ### As a Library (TypeScript/JavaScript)
55
+
56
+ ```typescript
57
+ import { meatExtractor } from 'meatscraper';
58
+
59
+ const result = await meatExtractor(htmlString);
60
+ console.log(result.content); // Clean text only
61
+ console.log(result.image); // Primary image URL
62
+ console.log(result.metadata); // {title, author, date, ...}
63
+ ```
64
+
65
+ ### CLI - Process Local File
66
+
67
+ ```bash
68
+ # After global install
69
+ meatscraper ./article.html
70
+
71
+ # Or with npx (no install needed)
72
+ npx meatscraper ./article.html
73
+ ```
74
+
75
+ Output is printed as JSON to stdout.
76
+
77
+ ### CLI - Start HTTP Server
78
+
79
+ ```bash
80
+ # After global install
81
+ meatscraper serve
82
+
83
+ # Or with npx
84
+ npx meatscraper serve
85
+ ```
86
+
87
+ Server runs on port 8676. Send HTML via POST:
88
+
89
+ ```bash
90
+ curl -X POST http://localhost:8676/extract \
91
+ -H "Content-Type: application/json" \
92
+ -d '{"html":"<html>...</html>"}'
93
+ ```
94
+
95
+ Endpoints:
96
+ - `POST /extract` - Extract content from HTML
97
+ - `GET /health` - Health check
98
+ - `GET /stats` - Server statistics
99
+
100
+ ### Docker
101
+
102
+ Pull and run the latest published image:
103
+
104
+ ```bash
105
+ # Server mode
106
+ docker run -p 8676:8676 ghcr.io/paulohgodinho/meatscraper:main serve
107
+
108
+ # File mode (requires mounted volume)
109
+ docker run -v $(pwd):/data ghcr.io/paulohgodinho/meatscraper:main /data/article.html
110
+ ```
111
+
112
+ ## API Response
113
+
114
+ Complete response structure:
115
+
116
+ ```json
117
+ {
118
+ "success": true,
119
+ "data": {
120
+ "content": "Hello World\nThis is the actual content you want to keep.",
121
+ "image": "https://example.com/image.jpg",
122
+ "metadata": {
123
+ "title": "My Article",
124
+ "description": "Article description here",
125
+ "author": "John Doe",
126
+ "publisher": "Example Publication",
127
+ "datePublished": "2024-01-15T10:30:00Z",
128
+ "dateModified": "2024-01-15T12:00:00Z",
129
+ "url": "https://example.com/article",
130
+ "logo": "https://example.com/logo.png",
131
+ "youtubeVideoId": null,
132
+ "youtubeChannelName": null,
133
+ "youtubeChannelId": null,
134
+ "twitterHandle": null,
135
+ "twitterCreator": null,
136
+ "amazonPrice": null,
137
+ "amazonProductTitle": null,
138
+ "redditSubreddit": null,
139
+ "redditAuthor": null
140
+ }
141
+ }
142
+ }
143
+ ```
144
+
145
+ ## Features
146
+
147
+ - **5-step processing pipeline** - Metadata extraction, readability analysis, sanitization, plain text conversion, image selection
148
+ - **Rich metadata extraction** - Extracts 20+ fields including title, author, publish date, image, and platform-specific data
149
+ - **Multiple platforms** - Special handling for YouTube, Twitter, Amazon, Reddit
150
+ - **HTML sanitization** - Removes scripts, styles, and dangerous content
151
+ - **Plain text output** - No HTML tags, clean readable text
152
+ - **Image selection** - Finds and returns the best primary image
153
+ - **Three usage modes** - Library, CLI, or HTTP server
154
+
155
+ ## License
156
+
157
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * CLI entry point for meatscraper
4
+ *
5
+ * Usage:
6
+ * meatscraper <file-path> # File mode: extract from HTML file
7
+ * meatscraper serve # Server mode: start HTTP server on port 8676
8
+ */
9
+ export {};
10
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;;GAMG"}
package/dist/cli.js ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
+ /**
4
+ * CLI entry point for meatscraper
5
+ *
6
+ * Usage:
7
+ * meatscraper <file-path> # File mode: extract from HTML file
8
+ * meatscraper serve # Server mode: start HTTP server on port 8676
9
+ */
10
+ Object.defineProperty(exports, "__esModule", { value: true });
11
+ const file_mode_1 = require("./modes/file-mode");
12
+ const http_mode_1 = require("./modes/http-mode");
13
+ async function main() {
14
+ const args = process.argv.slice(2);
15
+ const command = args[0];
16
+ // No arguments provided
17
+ if (!command) {
18
+ console.error("❌ No command provided");
19
+ console.error("\nUsage:");
20
+ console.error(" meatscraper <file-path> Extract content from an HTML file");
21
+ console.error(" meatscraper serve Start HTTP server on port 8676");
22
+ console.error("\nExamples:");
23
+ console.error(" meatscraper ./example.html");
24
+ console.error(" meatscraper /path/to/file.html");
25
+ console.error(" meatscraper serve");
26
+ process.exit(1);
27
+ }
28
+ // Server mode
29
+ if (command === "serve") {
30
+ try {
31
+ (0, http_mode_1.startHttpServer)(8676);
32
+ // Keep the process running
33
+ process.on("SIGINT", () => {
34
+ console.log("\n\n👋 Server shutting down...");
35
+ process.exit(0);
36
+ });
37
+ }
38
+ catch (error) {
39
+ console.error("❌ Failed to start server:");
40
+ console.error(error instanceof Error ? error.message : String(error));
41
+ process.exit(1);
42
+ }
43
+ return;
44
+ }
45
+ // File mode
46
+ try {
47
+ const result = await (0, file_mode_1.processFileMode)(command);
48
+ // Output to stdout for piping/redirection
49
+ console.log(result);
50
+ process.exit(0);
51
+ }
52
+ catch (error) {
53
+ console.error("❌ Error processing file:");
54
+ console.error(error instanceof Error ? error.message : String(error));
55
+ process.exit(1);
56
+ }
57
+ }
58
+ // Run the CLI
59
+ main().catch((error) => {
60
+ console.error("❌ Unexpected error:");
61
+ console.error(error);
62
+ process.exit(1);
63
+ });
64
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";;AAEA;;;;;;GAMG;;AAEH,iDAAoD;AACpD,iDAAoD;AAEpD,KAAK,UAAU,IAAI;IACjB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;IAExB,wBAAwB;IACxB,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;QACvC,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;QAC1B,OAAO,CAAC,KAAK,CAAC,iEAAiE,CAAC,CAAC;QACjF,OAAO,CAAC,KAAK,CAAC,8DAA8D,CAAC,CAAC;QAC9E,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QAC9C,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QAClD,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;QACrC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,cAAc;IACd,IAAI,OAAO,KAAK,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC;YACH,IAAA,2BAAe,EAAC,IAAI,CAAC,CAAC;YACtB,2BAA2B;YAC3B,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE;gBACxB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;gBAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC;YAC3C,OAAO,CAAC,KAAK,CACX,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CACvD,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QACD,OAAO;IACT,CAAC;IAED,YAAY;IACZ,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,IAAA,2BAAe,EAAC,OAAO,CAAC,CAAC;QAC9C,0CAA0C;QAC1C,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACpB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC1C,OAAO,CAAC,KAAK,CACX,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CACvD,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED,cAAc;AACd,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACrB,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACrC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * meatscraper - Extract text content and primary image from webpages
3
+ *
4
+ * A comprehensive web scraping package that processes HTML through multiple
5
+ * cleaning and extraction steps to produce clean text and metadata.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { meatExtractor } from 'meatscraper';
10
+ *
11
+ * const result = await meatExtractor(htmlString);
12
+ * console.log(result.content); // Plain text
13
+ * console.log(result.image); // Image URL
14
+ * console.log(result.metadata); // Full metadata
15
+ * ```
16
+ */
17
+ import { MeatExtractorResult, MeatExtractorOptions } from "./types";
18
+ /**
19
+ * Extract text content and metadata from HTML
20
+ *
21
+ * Processes HTML through a 5-step pipeline:
22
+ * 1. Metadata extraction (metascraper with 12+ plugins)
23
+ * 2. Readable content extraction (Mozilla Readability)
24
+ * 3. HTML sanitization (DOMPurify)
25
+ * 4. Plain text conversion (html-to-text)
26
+ * 5. Image selection (best primary image)
27
+ *
28
+ * @param htmlString - Raw HTML content to process
29
+ * @param options - Configuration options
30
+ * @returns Promise resolving to extraction result with content, image, and metadata
31
+ *
32
+ * @example
33
+ * ```typescript
34
+ * // Basic usage
35
+ * const result = await meatExtractor(html);
36
+ * console.log(result.content); // Plain text
37
+ * console.log(result.image); // Image URL or null
38
+ * console.log(result.metadata); // {title, description, author, ...}
39
+ *
40
+ * // With debugging
41
+ * const result = await meatExtractor(html, { debug: true });
42
+ * console.log(result.debug?.step1_metadata);
43
+ * console.log(result.debug?.step2_readableContent);
44
+ * console.log(result.debug?.step3_sanitizedContent);
45
+ *
46
+ * // With URL hint
47
+ * const result = await meatExtractor(html, {
48
+ * url: 'https://example.com/article'
49
+ * });
50
+ * ```
51
+ */
52
+ export declare function meatExtractor(htmlString: string, options?: MeatExtractorOptions): Promise<MeatExtractorResult>;
53
+ export type { MeatExtractorResult, MeatExtractorOptions, MetadataResult, DebugInfo, } from "./types";
54
+ export { step1ExtractMetadata } from "./steps/step1-metadata";
55
+ export { step2ExtractReadableContent } from "./steps/step2-readable";
56
+ export { step3SanitizeHtml } from "./steps/step3-sanitize";
57
+ export { step4ConvertToPlainText } from "./steps/step4-plaintext";
58
+ export { step5SelectImage } from "./steps/step5-image";
59
+ export { extractMetadata, createMetascraperParser } from "./metascraper-setup";
60
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,SAAS,CAAC;AAGpE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACH,wBAAsB,aAAa,CACjC,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,oBAAoB,GAC7B,OAAO,CAAC,mBAAmB,CAAC,CAE9B;AAGD,YAAY,EACV,mBAAmB,EACnB,oBAAoB,EACpB,cAAc,EACd,SAAS,GACV,MAAM,SAAS,CAAC;AAGjB,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,2BAA2B,EAAE,MAAM,wBAAwB,CAAC;AACrE,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,uBAAuB,EAAE,MAAM,yBAAyB,CAAC;AAClE,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAGvD,OAAO,EAAE,eAAe,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,74 @@
1
+ "use strict";
2
+ /**
3
+ * meatscraper - Extract text content and primary image from webpages
4
+ *
5
+ * A comprehensive web scraping package that processes HTML through multiple
6
+ * cleaning and extraction steps to produce clean text and metadata.
7
+ *
8
+ * @example
9
+ * ```typescript
10
+ * import { meatExtractor } from 'meatscraper';
11
+ *
12
+ * const result = await meatExtractor(htmlString);
13
+ * console.log(result.content); // Plain text
14
+ * console.log(result.image); // Image URL
15
+ * console.log(result.metadata); // Full metadata
16
+ * ```
17
+ */
18
+ Object.defineProperty(exports, "__esModule", { value: true });
19
+ exports.createMetascraperParser = exports.extractMetadata = exports.step5SelectImage = exports.step4ConvertToPlainText = exports.step3SanitizeHtml = exports.step2ExtractReadableContent = exports.step1ExtractMetadata = void 0;
20
+ exports.meatExtractor = meatExtractor;
21
+ const pipeline_1 = require("./pipeline");
22
+ /**
23
+ * Extract text content and metadata from HTML
24
+ *
25
+ * Processes HTML through a 5-step pipeline:
26
+ * 1. Metadata extraction (metascraper with 12+ plugins)
27
+ * 2. Readable content extraction (Mozilla Readability)
28
+ * 3. HTML sanitization (DOMPurify)
29
+ * 4. Plain text conversion (html-to-text)
30
+ * 5. Image selection (best primary image)
31
+ *
32
+ * @param htmlString - Raw HTML content to process
33
+ * @param options - Configuration options
34
+ * @returns Promise resolving to extraction result with content, image, and metadata
35
+ *
36
+ * @example
37
+ * ```typescript
38
+ * // Basic usage
39
+ * const result = await meatExtractor(html);
40
+ * console.log(result.content); // Plain text
41
+ * console.log(result.image); // Image URL or null
42
+ * console.log(result.metadata); // {title, description, author, ...}
43
+ *
44
+ * // With debugging
45
+ * const result = await meatExtractor(html, { debug: true });
46
+ * console.log(result.debug?.step1_metadata);
47
+ * console.log(result.debug?.step2_readableContent);
48
+ * console.log(result.debug?.step3_sanitizedContent);
49
+ *
50
+ * // With URL hint
51
+ * const result = await meatExtractor(html, {
52
+ * url: 'https://example.com/article'
53
+ * });
54
+ * ```
55
+ */
56
+ async function meatExtractor(htmlString, options) {
57
+ return (0, pipeline_1.executePipeline)(htmlString, options);
58
+ }
59
+ // Re-export individual step functions for advanced usage
60
+ var step1_metadata_1 = require("./steps/step1-metadata");
61
+ Object.defineProperty(exports, "step1ExtractMetadata", { enumerable: true, get: function () { return step1_metadata_1.step1ExtractMetadata; } });
62
+ var step2_readable_1 = require("./steps/step2-readable");
63
+ Object.defineProperty(exports, "step2ExtractReadableContent", { enumerable: true, get: function () { return step2_readable_1.step2ExtractReadableContent; } });
64
+ var step3_sanitize_1 = require("./steps/step3-sanitize");
65
+ Object.defineProperty(exports, "step3SanitizeHtml", { enumerable: true, get: function () { return step3_sanitize_1.step3SanitizeHtml; } });
66
+ var step4_plaintext_1 = require("./steps/step4-plaintext");
67
+ Object.defineProperty(exports, "step4ConvertToPlainText", { enumerable: true, get: function () { return step4_plaintext_1.step4ConvertToPlainText; } });
68
+ var step5_image_1 = require("./steps/step5-image");
69
+ Object.defineProperty(exports, "step5SelectImage", { enumerable: true, get: function () { return step5_image_1.step5SelectImage; } });
70
+ // Re-export metascraper utilities
71
+ var metascraper_setup_1 = require("./metascraper-setup");
72
+ Object.defineProperty(exports, "extractMetadata", { enumerable: true, get: function () { return metascraper_setup_1.extractMetadata; } });
73
+ Object.defineProperty(exports, "createMetascraperParser", { enumerable: true, get: function () { return metascraper_setup_1.createMetascraperParser; } });
74
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;GAeG;;;AAuCH,sCAKC;AAzCD,yCAA6C;AAE7C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACI,KAAK,UAAU,aAAa,CACjC,UAAkB,EAClB,OAA8B;IAE9B,OAAO,IAAA,0BAAe,EAAC,UAAU,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC;AAUD,yDAAyD;AACzD,yDAA8D;AAArD,sHAAA,oBAAoB,OAAA;AAC7B,yDAAqE;AAA5D,6HAAA,2BAA2B,OAAA;AACpC,yDAA2D;AAAlD,mHAAA,iBAAiB,OAAA;AAC1B,2DAAkE;AAAzD,0HAAA,uBAAuB,OAAA;AAChC,mDAAuD;AAA9C,+GAAA,gBAAgB,OAAA;AAEzB,kCAAkC;AAClC,yDAA+E;AAAtE,oHAAA,eAAe,OAAA;AAAE,4HAAA,uBAAuB,OAAA"}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Metascraper plugin for improved Amazon product image extraction
3
+ *
4
+ * Fixes image extraction bug in standard metascraperAmazon by prioritizing
5
+ * high-quality product images over generic site logos.
6
+ *
7
+ * MUST be used before metascraperAmazon() in the plugin chain
8
+ */
9
+ export default function metascraperAmazonImproved(opts?: any): any;
10
+ //# sourceMappingURL=metascraper-amazon-improved.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metascraper-amazon-improved.d.ts","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-amazon-improved.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,MAAM,CAAC,OAAO,UAAU,yBAAyB,CAAC,IAAI,CAAC,EAAE,GAAG,OA6C3D"}
@@ -0,0 +1,44 @@
1
+ "use strict";
2
+ /**
3
+ * Metascraper plugin for improved Amazon product image extraction
4
+ *
5
+ * Fixes image extraction bug in standard metascraperAmazon by prioritizing
6
+ * high-quality product images over generic site logos.
7
+ *
8
+ * MUST be used before metascraperAmazon() in the plugin chain
9
+ */
10
+ Object.defineProperty(exports, "__esModule", { value: true });
11
+ exports.default = metascraperAmazonImproved;
12
+ const helpers_1 = require("@metascraper/helpers");
13
+ function metascraperAmazonImproved(opts) {
14
+ const toImage = (0, helpers_1.toRule)(helpers_1.image, opts);
15
+ const rules = {
16
+ image: [
17
+ // Amazon product main image (most specific selector)
18
+ toImage(($) => $('img[data-a-dynamic-image]')
19
+ .first()
20
+ .attr('src') ||
21
+ $('img[data-a-dynamic-image]').first().attr('data-src')),
22
+ // Amazon product landing page images
23
+ toImage(($) => $('.a-dynamic-image img')
24
+ .first()
25
+ .attr('src') ||
26
+ $('.a-dynamic-image img').first().attr('data-src')),
27
+ // Amazon image container
28
+ toImage(($) => $('img.a-dynamic-image')
29
+ .first()
30
+ .attr('src') ||
31
+ $('img.a-dynamic-image').first().attr('data-src')),
32
+ // Generic product image
33
+ toImage(($) => $('img[alt*="product"]')
34
+ .first()
35
+ .attr('src') ||
36
+ $('img[alt*="product"]').first().attr('data-src')),
37
+ // Fallback to og:image
38
+ toImage(($) => $('meta[property="og:image"]').attr('content')),
39
+ ],
40
+ };
41
+ rules.pkgName = 'metascraper-amazon-improved';
42
+ return rules;
43
+ }
44
+ //# sourceMappingURL=metascraper-amazon-improved.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metascraper-amazon-improved.js","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-amazon-improved.ts"],"names":[],"mappings":";AAAA;;;;;;;GAOG;;AAIH,4CA6CC;AA/CD,kDAAoD;AAEpD,SAAwB,yBAAyB,CAAC,IAAU;IAC1D,MAAM,OAAO,GAAG,IAAA,gBAAM,EAAC,eAAK,EAAE,IAAI,CAAC,CAAA;IAEnC,MAAM,KAAK,GAAQ;QACjB,KAAK,EAAE;YACL,qDAAqD;YACrD,OAAO,CACL,CAAC,CAAM,EAAE,EAAE,CACT,CAAC,CAAC,2BAA2B,CAAC;iBAC3B,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,2BAA2B,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CAC1D;YACD,qCAAqC;YACrC,OAAO,CACL,CAAC,CAAO,EAAE,EAAE,CACV,CAAC,CAAC,sBAAsB,CAAC;iBACtB,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,sBAAsB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CACrD;YACD,yBAAyB;YACzB,OAAO,CACL,CAAC,CAAO,EAAE,EAAE,CACV,CAAC,CAAC,qBAAqB,CAAC;iBACrB,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,qBAAqB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CACpD;YACD,wBAAwB;YACxB,OAAO,CACL,CAAC,CAAO,EAAE,EAAE,CACV,CAAC,CAAC,qBAAqB,CAAC;iBACrB,KAAK,EAAE;iBACP,IAAI,CAAC,KAAK,CAAC;gBACd,CAAC,CAAC,qBAAqB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CACpD;YACD,uBAAuB;YACvB,OAAO,CAAC,CAAC,CAAO,EAAE,EAAE,CAAC,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;SACrE;KACF,CAAA;IAED,KAAK,CAAC,OAAO,GAAG,6BAA6B,CAAA;IAE7C,OAAO,KAAK,CAAA;AACd,CAAC"}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Metascraper plugin for Reddit post metadata extraction
3
+ *
4
+ * Extracts Reddit-specific metadata including subreddit, author,
5
+ * upvote count, and other post information
6
+ */
7
+ export default function metascraperReddit(opts?: any): any;
8
+ //# sourceMappingURL=metascraper-reddit.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metascraper-reddit.d.ts","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-reddit.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,MAAM,CAAC,OAAO,UAAU,iBAAiB,CAAC,IAAI,CAAC,EAAE,GAAG,OAgDnD"}
@@ -0,0 +1,47 @@
1
+ "use strict";
2
+ /**
3
+ * Metascraper plugin for Reddit post metadata extraction
4
+ *
5
+ * Extracts Reddit-specific metadata including subreddit, author,
6
+ * upvote count, and other post information
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.default = metascraperReddit;
10
+ const helpers_1 = require("@metascraper/helpers");
11
+ // Identity processor for custom fields that don't need special handling
12
+ const identity = (value) => value;
13
+ function metascraperReddit(opts) {
14
+ const toCustom = (0, helpers_1.toRule)(identity, opts);
15
+ const rules = {
16
+ // Extract subreddit from the og:url meta tag
17
+ subreddit: [
18
+ toCustom(($) => {
19
+ const ogUrl = $('meta[property="og:url"]').attr('content');
20
+ const match = ogUrl?.match(/\/r\/([^/]+)/);
21
+ return match ? match[1] : undefined;
22
+ }),
23
+ ],
24
+ // Extract author from meta tags
25
+ author: [
26
+ toCustom(($) => $('meta[name="author"]').attr('content')),
27
+ toCustom(($) => (0, helpers_1.$filter)($, $('[data-testid="post_author_by_line"] a[href*="/user/"]'))),
28
+ ],
29
+ // Extract description from og:description
30
+ description: [
31
+ toCustom(($) => $('meta[property="og:description"]').attr('content')),
32
+ ],
33
+ // Extract Reddit-specific metadata (upvotes)
34
+ redditUpvotes: [
35
+ toCustom(($) => {
36
+ const upvoteText = $('._1rZjMh_0').text();
37
+ const match = upvoteText?.match(/([\d.,]+)/);
38
+ return match
39
+ ? parseInt(match[1].replace(/[,.']/g, ''), 10)
40
+ : undefined;
41
+ }),
42
+ ],
43
+ };
44
+ rules.pkgName = 'metascraper-reddit';
45
+ return rules;
46
+ }
47
+ //# sourceMappingURL=metascraper-reddit.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metascraper-reddit.js","sourceRoot":"","sources":["../../src/metascraper-plugins/metascraper-reddit.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;AAOH,oCAgDC;AArDD,kDAAsD;AAEtD,wEAAwE;AACxE,MAAM,QAAQ,GAAG,CAAC,KAAU,EAAE,EAAE,CAAC,KAAK,CAAA;AAEtC,SAAwB,iBAAiB,CAAC,IAAU;IAClD,MAAM,QAAQ,GAAG,IAAA,gBAAM,EAAC,QAAQ,EAAE,IAAI,CAAC,CAAA;IAEvC,MAAM,KAAK,GAAQ;QACjB,6CAA6C;QAC7C,SAAS,EAAE;YACT,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE;gBACnB,MAAM,KAAK,GAAG,CAAC,CAAC,yBAAyB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBAC1D,MAAM,KAAK,GAAG,KAAK,EAAE,KAAK,CAAC,cAAc,CAAC,CAAA;gBAC1C,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;YACrC,CAAC,CAAC;SACH;QAED,gCAAgC;QAChC,MAAM,EAAE;YACN,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE,CAAC,CAAC,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC/D,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE,CACnB,IAAA,iBAAO,EACL,CAAC,EACD,CAAC,CACC,uDAAuD,CACxD,CACF,CACF;SACF;QAED,0CAA0C;QAC1C,WAAW,EAAE;YACX,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE,CACnB,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CACrD;SACF;QAED,6CAA6C;QAC7C,aAAa,EAAE;YACb,QAAQ,CAAC,CAAC,CAAO,EAAE,EAAE;gBACnB,MAAM,UAAU,GAAG,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAA;gBACzC,MAAM,KAAK,GAAG,UAAU,EAAE,KAAK,CAAC,WAAW,CAAC,CAAA;gBAC5C,OAAO,KAAK;oBACV,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;oBAC9C,CAAC,CAAC,SAAS,CAAA;YACf,CAAC,CAAC;SACH;KACF,CAAA;IAED,KAAK,CAAC,OAAO,GAAG,oBAAoB,CAAA;IAEpC,OAAO,KAAK,CAAA;AACd,CAAC"}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Metascraper configuration with all plugins
3
+ *
4
+ * This sets up the complete metascraper parser with all plugins
5
+ * in the correct order to extract comprehensive metadata from webpages
6
+ */
7
+ import metascraper from "metascraper";
8
+ /**
9
+ * Create and return the configured metascraper parser
10
+ *
11
+ * Plugin order is important - some plugins must come before others
12
+ * to ensure correct extraction priority
13
+ */
14
+ export declare function createMetascraperParser(): metascraper.Metascraper;
15
+ /**
16
+ * Extract metadata from HTML content
17
+ *
18
+ * @param htmlContent - The raw HTML string to extract metadata from
19
+ * @param url - Optional URL for context (helps canonicalization)
20
+ * @returns Promise resolving to metadata object
21
+ */
22
+ export declare function extractMetadata(htmlContent: string, url?: string): Promise<Record<string, any>>;
23
+ //# sourceMappingURL=metascraper-setup.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metascraper-setup.d.ts","sourceRoot":"","sources":["../src/metascraper-setup.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,WAAW,MAAM,aAAa,CAAC;AAgBtC;;;;;GAKG;AACH,wBAAgB,uBAAuB,4BAgCtC;AAED;;;;;;GAMG;AACH,wBAAsB,eAAe,CACnC,WAAW,EAAE,MAAM,EACnB,GAAG,CAAC,EAAE,MAAM,GACX,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAW9B"}
@@ -0,0 +1,78 @@
1
+ "use strict";
2
+ /**
3
+ * Metascraper configuration with all plugins
4
+ *
5
+ * This sets up the complete metascraper parser with all plugins
6
+ * in the correct order to extract comprehensive metadata from webpages
7
+ */
8
+ var __importDefault = (this && this.__importDefault) || function (mod) {
9
+ return (mod && mod.__esModule) ? mod : { "default": mod };
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.createMetascraperParser = createMetascraperParser;
13
+ exports.extractMetadata = extractMetadata;
14
+ const metascraper_1 = __importDefault(require("metascraper"));
15
+ const metascraper_amazon_1 = __importDefault(require("metascraper-amazon"));
16
+ const metascraper_author_1 = __importDefault(require("metascraper-author"));
17
+ const metascraper_date_1 = __importDefault(require("metascraper-date"));
18
+ const metascraper_description_1 = __importDefault(require("metascraper-description"));
19
+ const metascraper_image_1 = __importDefault(require("metascraper-image"));
20
+ const metascraper_logo_favicon_1 = __importDefault(require("metascraper-logo-favicon"));
21
+ const metascraper_publisher_1 = __importDefault(require("metascraper-publisher"));
22
+ const metascraper_title_1 = __importDefault(require("metascraper-title"));
23
+ const metascraper_url_1 = __importDefault(require("metascraper-url"));
24
+ const metascraper_x_1 = __importDefault(require("metascraper-x"));
25
+ const metascraper_youtube_1 = __importDefault(require("metascraper-youtube"));
26
+ const metascraper_amazon_improved_1 = __importDefault(require("./metascraper-plugins/metascraper-amazon-improved"));
27
+ const metascraper_reddit_1 = __importDefault(require("./metascraper-plugins/metascraper-reddit"));
28
+ /**
29
+ * Create and return the configured metascraper parser
30
+ *
31
+ * Plugin order is important - some plugins must come before others
32
+ * to ensure correct extraction priority
33
+ */
34
+ function createMetascraperParser() {
35
+ return (0, metascraper_1.default)([
36
+ // Date extraction must be early
37
+ (0, metascraper_date_1.default)({
38
+ dateModified: true,
39
+ datePublished: true,
40
+ }),
41
+ // Amazon improved MUST come before base Amazon plugin
42
+ (0, metascraper_amazon_improved_1.default)(),
43
+ (0, metascraper_amazon_1.default)(),
44
+ // Platform-specific extractors
45
+ (0, metascraper_youtube_1.default)(),
46
+ (0, metascraper_reddit_1.default)(),
47
+ // General metadata extractors
48
+ (0, metascraper_author_1.default)(),
49
+ (0, metascraper_publisher_1.default)(),
50
+ (0, metascraper_title_1.default)(),
51
+ (0, metascraper_description_1.default)(),
52
+ (0, metascraper_x_1.default)(),
53
+ // Image extraction - PRIMARY image source
54
+ (0, metascraper_image_1.default)(),
55
+ // Logo/favicon as fallback image
56
+ (0, metascraper_logo_favicon_1.default)(),
57
+ // URL canonicalization - should be last
58
+ (0, metascraper_url_1.default)(),
59
+ ]);
60
+ }
61
+ /**
62
+ * Extract metadata from HTML content
63
+ *
64
+ * @param htmlContent - The raw HTML string to extract metadata from
65
+ * @param url - Optional URL for context (helps canonicalization)
66
+ * @returns Promise resolving to metadata object
67
+ */
68
+ async function extractMetadata(htmlContent, url) {
69
+ const parser = createMetascraperParser();
70
+ const meta = await parser({
71
+ html: htmlContent,
72
+ url: url || "about:blank",
73
+ // Don't validate URL - we're processing pre-fetched HTML
74
+ validateUrl: false,
75
+ });
76
+ return meta;
77
+ }
78
+ //# sourceMappingURL=metascraper-setup.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metascraper-setup.js","sourceRoot":"","sources":["../src/metascraper-setup.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;;;AAwBH,0DAgCC;AASD,0CAcC;AA7ED,8DAAsC;AACtC,4EAAmD;AACnD,4EAAmD;AACnD,wEAA+C;AAC/C,sFAA6D;AAC7D,0EAAiD;AACjD,wFAAuD;AACvD,kFAAyD;AACzD,0EAAiD;AACjD,sEAA6C;AAC7C,kEAAyC;AACzC,8EAAqD;AAErD,oHAA0F;AAC1F,kGAAyE;AAEzE;;;;;GAKG;AACH,SAAgB,uBAAuB;IACrC,OAAO,IAAA,qBAAW,EAAC;QACjB,gCAAgC;QAChC,IAAA,0BAAe,EAAC;YACd,YAAY,EAAE,IAAI;YAClB,aAAa,EAAE,IAAI;SACpB,CAAC;QAEF,sDAAsD;QACtD,IAAA,qCAAyB,GAAS;QAClC,IAAA,4BAAiB,GAAE;QAEnB,+BAA+B;QAC/B,IAAA,6BAAkB,GAAE;QACpB,IAAA,4BAAiB,GAAS;QAE1B,8BAA8B;QAC9B,IAAA,4BAAiB,GAAE;QACnB,IAAA,+BAAoB,GAAE;QACtB,IAAA,2BAAgB,GAAE;QAClB,IAAA,iCAAsB,GAAE;QACxB,IAAA,uBAAY,GAAE;QAEd,0CAA0C;QAC1C,IAAA,2BAAgB,GAAE;QAElB,iCAAiC;QACjC,IAAA,kCAAe,GAAE;QAEjB,wCAAwC;QACxC,IAAA,yBAAc,GAAE;KACV,CAAC,CAAC;AACZ,CAAC;AAED;;;;;;GAMG;AACI,KAAK,UAAU,eAAe,CACnC,WAAmB,EACnB,GAAY;IAEZ,MAAM,MAAM,GAAG,uBAAuB,EAAE,CAAC;IAEzC,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC;QACxB,IAAI,EAAE,WAAW;QACjB,GAAG,EAAE,GAAG,IAAI,aAAa;QACzB,yDAAyD;QACzD,WAAW,EAAE,KAAK;KACnB,CAAC,CAAC;IAEH,OAAO,IAAI,CAAC;AACd,CAAC"}