meatscraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +157 -0
- package/dist/cli.d.ts +10 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +64 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +60 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +74 -0
- package/dist/index.js.map +1 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts +10 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts.map +1 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.js +44 -0
- package/dist/metascraper-plugins/metascraper-amazon-improved.js.map +1 -0
- package/dist/metascraper-plugins/metascraper-reddit.d.ts +8 -0
- package/dist/metascraper-plugins/metascraper-reddit.d.ts.map +1 -0
- package/dist/metascraper-plugins/metascraper-reddit.js +47 -0
- package/dist/metascraper-plugins/metascraper-reddit.js.map +1 -0
- package/dist/metascraper-setup.d.ts +23 -0
- package/dist/metascraper-setup.d.ts.map +1 -0
- package/dist/metascraper-setup.js +78 -0
- package/dist/metascraper-setup.js.map +1 -0
- package/dist/modes/file-mode.d.ts +12 -0
- package/dist/modes/file-mode.d.ts.map +1 -0
- package/dist/modes/file-mode.js +63 -0
- package/dist/modes/file-mode.js.map +1 -0
- package/dist/modes/http-mode.d.ts +12 -0
- package/dist/modes/http-mode.d.ts.map +1 -0
- package/dist/modes/http-mode.js +111 -0
- package/dist/modes/http-mode.js.map +1 -0
- package/dist/pipeline.d.ts +23 -0
- package/dist/pipeline.d.ts.map +1 -0
- package/dist/pipeline.js +59 -0
- package/dist/pipeline.js.map +1 -0
- package/dist/steps/step1-metadata.d.ts +9 -0
- package/dist/steps/step1-metadata.d.ts.map +1 -0
- package/dist/steps/step1-metadata.js +42 -0
- package/dist/steps/step1-metadata.js.map +1 -0
- package/dist/steps/step2-readable.d.ts +16 -0
- package/dist/steps/step2-readable.d.ts.map +1 -0
- package/dist/steps/step2-readable.js +45 -0
- package/dist/steps/step2-readable.js.map +1 -0
- package/dist/steps/step3-sanitize.d.ts +15 -0
- package/dist/steps/step3-sanitize.d.ts.map +1 -0
- package/dist/steps/step3-sanitize.js +43 -0
- package/dist/steps/step3-sanitize.js.map +1 -0
- package/dist/steps/step4-plaintext.d.ts +14 -0
- package/dist/steps/step4-plaintext.d.ts.map +1 -0
- package/dist/steps/step4-plaintext.js +47 -0
- package/dist/steps/step4-plaintext.js.map +1 -0
- package/dist/steps/step5-image.d.ts +22 -0
- package/dist/steps/step5-image.d.ts.map +1 -0
- package/dist/steps/step5-image.js +121 -0
- package/dist/steps/step5-image.js.map +1 -0
- package/dist/types.d.ts +56 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/formatters.d.ts +45 -0
- package/dist/utils/formatters.d.ts.map +1 -0
- package/dist/utils/formatters.js +61 -0
- package/dist/utils/formatters.js.map +1 -0
- package/dist/utils.d.ts +17 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +34 -0
- package/dist/utils.js.map +1 -0
- package/package.json +72 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Step 5: Select the primary image
|
|
4
|
+
*
|
|
5
|
+
* Extracts the best primary image URL from metadata.
|
|
6
|
+
* Uses metascraperImage result as primary, falls back to logo if needed.
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.step5SelectImage = step5SelectImage;
|
|
10
|
+
const jsdom_1 = require("jsdom");
|
|
11
|
+
const url_1 = require("url");
|
|
12
|
+
/**
 * Extract a favicon URL from the <link> tags of an HTML document.
 *
 * The rel values are tried in order of preference: "icon", "shortcut icon",
 * "apple-touch-icon", "apple-touch-icon-precomposed". Every element matching
 * a selector is examined, so an unusable first match (missing href, data URI,
 * unparsable URL) does not hide a usable later one with the same rel.
 *
 * @param htmlContent - Raw HTML string to parse
 * @param baseUrl - Base URL for resolving relative paths
 * @returns Absolute favicon URL, or null if none is usable or parsing fails
 */
function extractFaviconFromHtml(htmlContent, baseUrl) {
    try {
        const document = new jsdom_1.JSDOM(htmlContent).window.document;
        // Search for favicon link tags in order of preference
        const faviconSelectors = [
            'link[rel="icon"]',
            'link[rel="shortcut icon"]',
            'link[rel="apple-touch-icon"]',
            'link[rel="apple-touch-icon-precomposed"]',
        ];
        for (const selector of faviconSelectors) {
            for (const linkElement of document.querySelectorAll(selector)) {
                const href = linkElement.getAttribute('href');
                if (!href) {
                    continue;
                }
                let resolvedUrl;
                try {
                    // Resolve relative URLs to absolute
                    resolvedUrl = new url_1.URL(href, baseUrl);
                }
                catch (e) {
                    // Invalid URL, try the next candidate
                    continue;
                }
                // Skip data URIs. The parsed protocol is checked rather than the raw
                // attribute text because the URL parser lowercases the scheme and
                // ignores leading whitespace, which a plain prefix test would miss.
                if (resolvedUrl.protocol === 'data:') {
                    continue;
                }
                return resolvedUrl.href;
            }
        }
        return null;
    }
    catch (error) {
        // HTML parsing failed; favicon lookup is best-effort, so report "not found"
        return null;
    }
}
|
|
58
|
+
/**
 * Select the best primary image from metadata.
 *
 * Candidates are tried in this order:
 *   1. metadata.image
 *   2. metadata.logo
 *   3. a favicon <link> found in htmlContent (only when a base URL is known)
 * Data URIs are never selected. The data: scheme is detected
 * case-insensitively and with leading whitespace ignored.
 *
 * @param metadata - Metadata object from step 1
 * @param htmlContent - Raw HTML content for favicon extraction fallback
 * @param url - Base URL for resolving relative favicon paths
 * @returns Object with extracted image (the raw metadata.image candidate,
 *          even when it was rejected), selected image, and reason
 */
function step5SelectImage(metadata, htmlContent, url) {
    // Helper to extract first string from image (handles array or string)
    const getImageUrl = (img) => {
        if (!img)
            return null;
        if (typeof img === "string")
            return img;
        if (Array.isArray(img) && img.length > 0) {
            return typeof img[0] === "string" ? img[0] : null;
        }
        return null;
    };
    // URL schemes are case-insensitive and URL parsers strip leading whitespace,
    // so "DATA:..." and " data:..." are data URIs too.
    const isDataUri = (candidate) => /^\s*data:/i.test(candidate);
    // Primary source: metascraperImage result.
    // Data URIs are rejected: they're typically very large and not useful for
    // external reference.
    const imageUrl = getImageUrl(metadata.image);
    if (imageUrl && !isDataUri(imageUrl)) {
        return {
            extracted: imageUrl,
            selected: imageUrl,
            reason: "Primary image from metascraper",
        };
    }
    // Fallback 1: Logo/favicon
    const logoUrl = getImageUrl(metadata.logo);
    if (logoUrl && !isDataUri(logoUrl)) {
        return {
            extracted: imageUrl,
            selected: logoUrl,
            reason: "Fallback to logo/favicon",
        };
    }
    // Fallback 2: Search HTML for favicon <link> tags
    if (htmlContent) {
        // Prefer metadata.url (from og:url) over passed url parameter
        const baseUrl = metadata.url || url;
        if (baseUrl) {
            const htmlFavicon = extractFaviconFromHtml(htmlContent, baseUrl);
            if (htmlFavicon) {
                return {
                    extracted: imageUrl,
                    selected: htmlFavicon,
                    reason: "Favicon extracted from HTML <link> tags",
                };
            }
        }
    }
    // No suitable image found
    return {
        extracted: imageUrl,
        selected: null,
        reason: "No suitable image found",
    };
}
|
|
121
|
+
//# sourceMappingURL=step5-image.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"step5-image.js","sourceRoot":"","sources":["../../src/steps/step5-image.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;AAqEH,4CA6DC;AA/HD,iCAA8B;AAC9B,6BAA0B;AAQ1B;;;;;;GAMG;AACH,SAAS,sBAAsB,CAAC,WAAmB,EAAE,OAAe;IAClE,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,WAAW,CAAC,CAAC;QACnC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;QAErC,sDAAsD;QACtD,MAAM,gBAAgB,GAAG;YACvB,kBAAkB;YAClB,2BAA2B;YAC3B,8BAA8B;YAC9B,0CAA0C;SAC3C,CAAC;QAEF,KAAK,MAAM,QAAQ,IAAI,gBAAgB,EAAE,CAAC;YACxC,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YACrD,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,IAAI,GAAG,WAAW,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;gBAC9C,IAAI,IAAI,EAAE,CAAC;oBACT,iBAAiB;oBACjB,IAAI,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;wBAC7B,SAAS;oBACX,CAAC;oBAED,oCAAoC;oBACpC,IAAI,CAAC;wBACH,MAAM,WAAW,GAAG,IAAI,SAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;wBAC3C,OAAO,WAAW,CAAC,IAAI,CAAC;oBAC1B,CAAC;oBAAC,OAAO,CAAC,EAAE,CAAC;wBACX,wBAAwB;wBACxB,SAAS;oBACX,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,sBAAsB;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,SAAgB,gBAAgB,CAC9B,QAAwB,EACxB,WAAoB,EACpB,GAAY;IAEZ,sEAAsE;IACtE,MAAM,WAAW,GAAG,CAAC,GAAQ,EAAiB,EAAE;QAC9C,IAAI,CAAC,GAAG;YAAE,OAAO,IAAI,CAAC;QACtB,IAAI,OAAO,GAAG,KAAK,QAAQ;YAAE,OAAO,GAAG,CAAC;QACxC,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzC,OAAO,OAAO,GAAG,CAAC,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QACpD,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC;IAEF,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,WAAW,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;IAC7C,IAAI,QAAQ,EAAE,CAAC;QACb,2FAA2F;QAC3F,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAClC,OAAO;gBACL,SAAS,EAAE,QAAQ;gBACnB,QAAQ,EAAE,QAAQ;gBAClB,MAAM,EAAE,gCAAgC;aACzC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,MAAM,OAAO,GAAG,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC3C,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC5C,OAAO;YACL,SAAS,EAAE,QAAQ;YACnB,QAAQ,EAAE,OAAO;YACjB,MAAM,EAAE,0
BAA0B;SACnC,CAAC;IACJ,CAAC;IAED,kDAAkD;IAClD,IAAI,WAAW,EAAE,CAAC;QAChB,8DAA8D;QAC9D,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,IAAI,GAAG,CAAC;QAEpC,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,WAAW,GAAG,sBAAsB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;YACjE,IAAI,WAAW,EAAE,CAAC;gBAChB,OAAO;oBACL,SAAS,EAAE,QAAQ;oBACnB,QAAQ,EAAE,WAAW;oBACrB,MAAM,EAAE,yCAAyC;iBAClD,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,0BAA0B;IAC1B,OAAO;QACL,SAAS,EAAE,QAAQ;QACnB,QAAQ,EAAE,IAAI;QACd,MAAM,EAAE,yBAAyB;KAClC,CAAC;AACJ,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Metadata extracted from webpage by metascraper
|
|
3
|
+
*/
|
|
4
|
+
export interface MetadataResult {
|
|
5
|
+
title?: string;
|
|
6
|
+
description?: string;
|
|
7
|
+
image?: string;
|
|
8
|
+
logo?: string;
|
|
9
|
+
author?: string;
|
|
10
|
+
publisher?: string;
|
|
11
|
+
datePublished?: string;
|
|
12
|
+
dateModified?: string;
|
|
13
|
+
url?: string;
|
|
14
|
+
youtubeVideoId?: string;
|
|
15
|
+
youtubeChannelName?: string;
|
|
16
|
+
youtubeChannelId?: string;
|
|
17
|
+
twitterHandle?: string;
|
|
18
|
+
twitterCreator?: string;
|
|
19
|
+
amazonPrice?: string;
|
|
20
|
+
amazonProductTitle?: string;
|
|
21
|
+
redditSubreddit?: string;
|
|
22
|
+
redditAuthor?: string;
|
|
23
|
+
readableContentHtml?: string;
|
|
24
|
+
[key: string]: any;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Debug information from each pipeline step
|
|
28
|
+
*/
|
|
29
|
+
export interface DebugInfo {
|
|
30
|
+
step1_metadata: MetadataResult;
|
|
31
|
+
step2_readableContent: string;
|
|
32
|
+
step3_sanitizedContent: string;
|
|
33
|
+
step4_plaintext: string;
|
|
34
|
+
step5_imageSelection: {
|
|
35
|
+
extracted: string | null;
|
|
36
|
+
selected: string | null;
|
|
37
|
+
reason: string;
|
|
38
|
+
};
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Final result from meatscraper extraction
|
|
42
|
+
*/
|
|
43
|
+
export interface MeatExtractorResult {
|
|
44
|
+
content: string;
|
|
45
|
+
image: string | null;
|
|
46
|
+
metadata: MetadataResult;
|
|
47
|
+
debug?: DebugInfo;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Options for meatscraper function
|
|
51
|
+
*/
|
|
52
|
+
export interface MeatExtractorOptions {
|
|
53
|
+
debug?: boolean;
|
|
54
|
+
url?: string;
|
|
55
|
+
}
|
|
56
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,WAAW,cAAc;IAE7B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,GAAG,CAAC,EAAE,MAAM,CAAC;IAGb,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAG7B,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,cAAc,EAAE,cAAc,CAAC;IAC/B,qBAAqB,EAAE,MAAM,CAAC;IAC9B,sBAAsB,EAAE,MAAM,CAAC;IAC/B,eAAe,EAAE,MAAM,CAAC;IACxB,oBAAoB,EAAE;QACpB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;QACzB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;QACxB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAElC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,QAAQ,EAAE,cAAc,CAAC;IAGzB,KAAK,CAAC,EAAE,SAAS,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Output formatters for different modes and response types
|
|
3
|
+
*/
|
|
4
|
+
import { MeatExtractorResult } from "../index";
|
|
5
|
+
/**
|
|
6
|
+
* Success response format with extracted data
|
|
7
|
+
*/
|
|
8
|
+
export interface SuccessResponse {
|
|
9
|
+
success: true;
|
|
10
|
+
data: {
|
|
11
|
+
content: string;
|
|
12
|
+
image: string | null;
|
|
13
|
+
metadata: any;
|
|
14
|
+
};
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Error response format
|
|
18
|
+
*/
|
|
19
|
+
export interface ErrorResponse {
|
|
20
|
+
success: false;
|
|
21
|
+
error: string;
|
|
22
|
+
code?: string;
|
|
23
|
+
}
|
|
24
|
+
export type ApiResponse = SuccessResponse | ErrorResponse;
|
|
25
|
+
/**
|
|
26
|
+
* Format a successful meatscraper result for API/file responses
|
|
27
|
+
*/
|
|
28
|
+
export declare function formatSuccessResponse(result: MeatExtractorResult): SuccessResponse;
|
|
29
|
+
/**
|
|
30
|
+
* Format an error response
|
|
31
|
+
*/
|
|
32
|
+
export declare function formatErrorResponse(error: Error | string, code?: string): ErrorResponse;
|
|
33
|
+
/**
|
|
34
|
+
* Format response as pretty-printed JSON string (2 spaces)
|
|
35
|
+
*/
|
|
36
|
+
export declare function formatAsJson(response: ApiResponse): string;
|
|
37
|
+
/**
|
|
38
|
+
* Get current timestamp in ISO format for logging
|
|
39
|
+
*/
|
|
40
|
+
export declare function getTimestamp(): string;
|
|
41
|
+
/**
|
|
42
|
+
* Format log message with timestamp
|
|
43
|
+
*/
|
|
44
|
+
export declare function formatLogMessage(method: string, path: string, statusCode: number, durationMs: number, responseSize?: number, clientId?: string): string;
|
|
45
|
+
//# sourceMappingURL=formatters.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"formatters.d.ts","sourceRoot":"","sources":["../../src/utils/formatters.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,UAAU,CAAC;AAE/C;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,IAAI,CAAC;IACd,IAAI,EAAE;QACJ,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;QACrB,QAAQ,EAAE,GAAG,CAAC;KACf,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,KAAK,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED,MAAM,MAAM,WAAW,GAAG,eAAe,GAAG,aAAa,CAAC;AAE1D;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,mBAAmB,GAAG,eAAe,CASlF;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CACjC,KAAK,EAAE,KAAK,GAAG,MAAM,EACrB,IAAI,CAAC,EAAE,MAAM,GACZ,aAAa,CAOf;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,QAAQ,EAAE,WAAW,GAAG,MAAM,CAE1D;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,MAAM,CAErC;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,MAAM,EACZ,UAAU,EAAE,MAAM,EAClB,UAAU,EAAE,MAAM,EAClB,YAAY,CAAC,EAAE,MAAM,EACrB,QAAQ,CAAC,EAAE,MAAM,GAChB,MAAM,CAUR"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Output formatters for different modes and response types
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.formatSuccessResponse = formatSuccessResponse;
|
|
7
|
+
exports.formatErrorResponse = formatErrorResponse;
|
|
8
|
+
exports.formatAsJson = formatAsJson;
|
|
9
|
+
exports.getTimestamp = getTimestamp;
|
|
10
|
+
exports.formatLogMessage = formatLogMessage;
|
|
11
|
+
/**
 * Wrap a meatscraper result in the success envelope used for API/file output.
 *
 * Only content, image and metadata are copied into the envelope; any other
 * property on the result is left out.
 *
 * @param result - Extraction result to wrap
 * @returns Envelope of the form { success: true, data: { content, image, metadata } }
 */
function formatSuccessResponse(result) {
    const { content, image, metadata } = result;
    const data = { content, image, metadata };
    return { success: true, data };
}
|
|
24
|
+
/**
 * Format an error response.
 *
 * Accepts an Error or a plain string. Because anything can be thrown in
 * JavaScript, other values are tolerated too: they are converted with
 * String() so the formatter never throws and never emits an undefined
 * "error" field (which JSON.stringify would silently drop).
 *
 * @param error - Error instance or message string
 * @param code - Optional machine-readable error code; omitted when falsy
 * @returns Envelope of the form { success: false, error, code? }
 */
function formatErrorResponse(error, code) {
    let message;
    if (typeof error === "string") {
        message = error;
    }
    else if (error != null && typeof error.message === "string") {
        message = error.message;
    }
    else {
        message = String(error);
    }
    return {
        success: false,
        error: message,
        ...(code && { code }),
    };
}
|
|
35
|
+
/**
 * Serialize a response object as human-readable JSON.
 *
 * @param response - Success or error envelope to serialize
 * @returns JSON text indented with two spaces per nesting level
 */
function formatAsJson(response) {
    const indentWidth = 2;
    const serialized = JSON.stringify(response, null, indentWidth);
    return serialized;
}
|
|
41
|
+
/**
 * Produce the current time as an ISO 8601 string, for use in log output.
 *
 * @returns Current instant as returned by Date.prototype.toISOString()
 */
function getTimestamp() {
    const now = new Date();
    return now.toISOString();
}
|
|
47
|
+
/**
 * Format a request log line prefixed with a timestamp.
 *
 * Shape: "[<timestamp>] <method> <path> - <statusCode> (<durationMs>ms)",
 * optionally followed by " - <responseSize> bytes" and " [<clientId>]".
 *
 * @param method - HTTP method
 * @param path - Request path
 * @param statusCode - Response status code
 * @param durationMs - Handling time in milliseconds
 * @param responseSize - Optional response size in bytes; 0 is a valid size and is logged
 * @param clientId - Optional client identifier; omitted when empty
 * @returns The formatted log line
 */
function formatLogMessage(method, path, statusCode, durationMs, responseSize, clientId) {
    // NOTE(review): this is a locale-formatted timestamp, unlike getTimestamp()
    // in this module, which is ISO 8601. Kept as-is to preserve the log format.
    const timestamp = new Date().toLocaleString();
    let message = `[${timestamp}] ${method} ${path} - ${statusCode} (${durationMs}ms)`;
    // Null-check rather than truthiness: an empty body is 0 bytes, and that is
    // still worth logging.
    if (responseSize != null) {
        message += ` - ${responseSize} bytes`;
    }
    if (clientId) {
        message += ` [${clientId}]`;
    }
    return message;
}
|
|
61
|
+
//# sourceMappingURL=formatters.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"formatters.js","sourceRoot":"","sources":["../../src/utils/formatters.ts"],"names":[],"mappings":";AAAA;;GAEG;;AA8BH,sDASC;AAKD,kDAUC;AAKD,oCAEC;AAKD,oCAEC;AAKD,4CAiBC;AA/DD;;GAEG;AACH,SAAgB,qBAAqB,CAAC,MAA2B;IAC/D,OAAO;QACL,OAAO,EAAE,IAAI;QACb,IAAI,EAAE;YACJ,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,QAAQ,EAAE,MAAM,CAAC,QAAQ;SAC1B;KACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAgB,mBAAmB,CACjC,KAAqB,EACrB,IAAa;IAEb,MAAM,OAAO,GAAG,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAClE,OAAO;QACL,OAAO,EAAE,KAAK;QACd,KAAK,EAAE,OAAO;QACd,GAAG,CAAC,IAAI,IAAI,EAAE,IAAI,EAAE,CAAC;KACtB,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAgB,YAAY,CAAC,QAAqB;IAChD,OAAO,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,SAAgB,YAAY;IAC1B,OAAO,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;AAClC,CAAC;AAED;;GAEG;AACH,SAAgB,gBAAgB,CAC9B,MAAc,EACd,IAAY,EACZ,UAAkB,EAClB,UAAkB,EAClB,YAAqB,EACrB,QAAiB;IAEjB,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,cAAc,EAAE,CAAC;IAC9C,IAAI,OAAO,GAAG,IAAI,SAAS,KAAK,MAAM,IAAI,IAAI,MAAM,UAAU,KAAK,UAAU,KAAK,CAAC;IACnF,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,IAAI,MAAM,YAAY,QAAQ,CAAC;IACxC,CAAC;IACD,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO,IAAI,KAAK,QAAQ,GAAG,CAAC;IAC9B,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions for meatscraper
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Normalize a Content-Type header by stripping parameters (e.g., charset)
|
|
6
|
+
* and lowercasing the media type
|
|
7
|
+
*/
|
|
8
|
+
export declare function normalizeContentType(header: string | null): string | null;
|
|
9
|
+
/**
|
|
10
|
+
* Check if a string is empty or whitespace only
|
|
11
|
+
*/
|
|
12
|
+
export declare function isEmpty(str: string | null | undefined): boolean;
|
|
13
|
+
/**
|
|
14
|
+
* Truncate a string to a maximum length
|
|
15
|
+
*/
|
|
16
|
+
export declare function truncate(str: string, maxLength?: number, suffix?: string): string;
|
|
17
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,MAAM,GAAG,IAAI,GAAG,MAAM,GAAG,IAAI,CAKzE;AAED;;GAEG;AACH,wBAAgB,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,OAAO,CAE/D;AAED;;GAEG;AACH,wBAAgB,QAAQ,CACtB,GAAG,EAAE,MAAM,EACX,SAAS,GAAE,MAAY,EACvB,MAAM,GAAE,MAAc,GACrB,MAAM,CAKR"}
|
package/dist/utils.js
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Utility functions for meatscraper
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.normalizeContentType = normalizeContentType;
|
|
7
|
+
exports.isEmpty = isEmpty;
|
|
8
|
+
exports.truncate = truncate;
|
|
9
|
+
/**
 * Reduce a Content-Type header to its bare media type.
 *
 * Everything from the first ";" onward (parameters such as charset) is
 * discarded, surrounding whitespace is trimmed, and the result is lowercased.
 *
 * @param header - Raw Content-Type header value, or null
 * @returns Normalized media type, or null when the header is missing or
 *          nothing remains after stripping
 */
function normalizeContentType(header) {
    if (!header) {
        return null;
    }
    const semicolonAt = header.indexOf(";");
    const rawType = semicolonAt === -1 ? header : header.slice(0, semicolonAt);
    const mediaType = rawType.trim().toLowerCase();
    return mediaType === "" ? null : mediaType;
}
|
|
19
|
+
/**
 * Tell whether a string carries no visible content.
 *
 * @param str - String to inspect; null and undefined count as empty
 * @returns true when str is falsy or consists only of whitespace
 */
function isEmpty(str) {
    if (!str) {
        return true;
    }
    return str.trim() === "";
}
|
|
25
|
+
/**
 * Truncate a string to a maximum length.
 *
 * Strings no longer than maxLength are returned unchanged. Longer strings
 * are cut to at most maxLength UTF-16 code units and suffix is appended,
 * so the result can be up to maxLength + suffix.length long.
 * The cut never lands inside a surrogate pair: if it would, the cut moves
 * back one unit so the output does not end in a lone, unprintable surrogate.
 *
 * @param str - String to truncate
 * @param maxLength - Maximum number of code units kept from str (default 100)
 * @param suffix - Text appended when truncation happens (default "...")
 * @returns The original string, or its truncated prefix followed by suffix
 */
function truncate(str, maxLength = 100, suffix = "...") {
    if (str.length <= maxLength) {
        return str;
    }
    let end = maxLength;
    if (end > 0) {
        const before = str.charCodeAt(end - 1);
        const after = str.charCodeAt(end);
        const splitsPair = before >= 0xd800 && before <= 0xdbff &&
            after >= 0xdc00 && after <= 0xdfff;
        if (splitsPair) {
            // Drop the dangling high surrogate instead of emitting half a character
            end -= 1;
        }
    }
    return str.substring(0, end) + suffix;
}
|
|
34
|
+
//# sourceMappingURL=utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAMH,oDAKC;AAKD,0BAEC;AAKD,4BASC;AA9BD;;;GAGG;AACH,SAAgB,oBAAoB,CAAC,MAAqB;IACxD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,IAAI,CAAC;AAC/D,CAAC;AAED;;GAEG;AACH,SAAgB,OAAO,CAAC,GAA8B;IACpD,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,CAAC;AACzC,CAAC;AAED;;GAEG;AACH,SAAgB,QAAQ,CACtB,GAAW,EACX,YAAoB,GAAG,EACvB,SAAiB,KAAK;IAEtB,IAAI,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC5B,OAAO,GAAG,CAAC;IACb,CAAC;IACD,OAAO,GAAG,CAAC,SAAS,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,MAAM,CAAC;AAC9C,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "meatscraper",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Extract text content and primary image from webpages using advanced scraping techniques",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"bin": {
|
|
8
|
+
"meatscraper": "./dist/cli.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist",
|
|
12
|
+
"README.md",
|
|
13
|
+
"LICENSE"
|
|
14
|
+
],
|
|
15
|
+
"scripts": {
|
|
16
|
+
"build": "tsc",
|
|
17
|
+
"dev": "tsc --watch",
|
|
18
|
+
"clean": "rm -rf dist",
|
|
19
|
+
"prepublishOnly": "npm run build",
|
|
20
|
+
"start": "node dist/cli.js",
|
|
21
|
+
"start:file": "node dist/cli.js",
|
|
22
|
+
"start:serve": "node dist/cli.js serve"
|
|
23
|
+
},
|
|
24
|
+
"keywords": [
|
|
25
|
+
"web-scraping",
|
|
26
|
+
"content-extraction",
|
|
27
|
+
"metadata",
|
|
28
|
+
"readability",
|
|
29
|
+
"sanitization",
|
|
30
|
+
"html-parser",
|
|
31
|
+
"web-content",
|
|
32
|
+
"article-extraction"
|
|
33
|
+
],
|
|
34
|
+
"author": "paulohgodinho",
|
|
35
|
+
"license": "MIT",
|
|
36
|
+
"repository": {
|
|
37
|
+
"type": "git",
|
|
38
|
+
"url": "https://github.com/paulohgodinho/meatscraper.git"
|
|
39
|
+
},
|
|
40
|
+
"homepage": "https://github.com/paulohgodinho/meatscraper#readme",
|
|
41
|
+
"bugs": {
|
|
42
|
+
"url": "https://github.com/paulohgodinho/meatscraper/issues"
|
|
43
|
+
},
|
|
44
|
+
"dependencies": {
|
|
45
|
+
"@mozilla/readability": "^0.6.0",
|
|
46
|
+
"@types/express": "^5.0.6",
|
|
47
|
+
"dompurify": "^3.2.4",
|
|
48
|
+
"dotenv": "^17.2.3",
|
|
49
|
+
"express": "^5.2.1",
|
|
50
|
+
"html-to-text": "^9.0.0",
|
|
51
|
+
"jsdom": "^24.0.0",
|
|
52
|
+
"metascraper": "^5.49.5",
|
|
53
|
+
"metascraper-amazon": "^5.49.5",
|
|
54
|
+
"metascraper-author": "^5.49.5",
|
|
55
|
+
"metascraper-date": "^5.49.5",
|
|
56
|
+
"metascraper-description": "^5.49.5",
|
|
57
|
+
"metascraper-image": "^5.49.5",
|
|
58
|
+
"metascraper-logo-favicon": "^5.49.5",
|
|
59
|
+
"metascraper-publisher": "^5.49.5",
|
|
60
|
+
"metascraper-title": "^5.49.5",
|
|
61
|
+
"metascraper-url": "^5.49.5",
|
|
62
|
+
"metascraper-x": "^5.49.5",
|
|
63
|
+
"metascraper-youtube": "^5.49.5"
|
|
64
|
+
},
|
|
65
|
+
"devDependencies": {
|
|
66
|
+
"@types/dompurify": "^3.0.5",
|
|
67
|
+
"@types/html-to-text": "^9.0.4",
|
|
68
|
+
"@types/jsdom": "^27.0.0",
|
|
69
|
+
"@types/node": "^20.19.30",
|
|
70
|
+
"typescript": "^5.3.3"
|
|
71
|
+
}
|
|
72
|
+
}
|