meatscraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +157 -0
  3. package/dist/cli.d.ts +10 -0
  4. package/dist/cli.d.ts.map +1 -0
  5. package/dist/cli.js +64 -0
  6. package/dist/cli.js.map +1 -0
  7. package/dist/index.d.ts +60 -0
  8. package/dist/index.d.ts.map +1 -0
  9. package/dist/index.js +74 -0
  10. package/dist/index.js.map +1 -0
  11. package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts +10 -0
  12. package/dist/metascraper-plugins/metascraper-amazon-improved.d.ts.map +1 -0
  13. package/dist/metascraper-plugins/metascraper-amazon-improved.js +44 -0
  14. package/dist/metascraper-plugins/metascraper-amazon-improved.js.map +1 -0
  15. package/dist/metascraper-plugins/metascraper-reddit.d.ts +8 -0
  16. package/dist/metascraper-plugins/metascraper-reddit.d.ts.map +1 -0
  17. package/dist/metascraper-plugins/metascraper-reddit.js +47 -0
  18. package/dist/metascraper-plugins/metascraper-reddit.js.map +1 -0
  19. package/dist/metascraper-setup.d.ts +23 -0
  20. package/dist/metascraper-setup.d.ts.map +1 -0
  21. package/dist/metascraper-setup.js +78 -0
  22. package/dist/metascraper-setup.js.map +1 -0
  23. package/dist/modes/file-mode.d.ts +12 -0
  24. package/dist/modes/file-mode.d.ts.map +1 -0
  25. package/dist/modes/file-mode.js +63 -0
  26. package/dist/modes/file-mode.js.map +1 -0
  27. package/dist/modes/http-mode.d.ts +12 -0
  28. package/dist/modes/http-mode.d.ts.map +1 -0
  29. package/dist/modes/http-mode.js +111 -0
  30. package/dist/modes/http-mode.js.map +1 -0
  31. package/dist/pipeline.d.ts +23 -0
  32. package/dist/pipeline.d.ts.map +1 -0
  33. package/dist/pipeline.js +59 -0
  34. package/dist/pipeline.js.map +1 -0
  35. package/dist/steps/step1-metadata.d.ts +9 -0
  36. package/dist/steps/step1-metadata.d.ts.map +1 -0
  37. package/dist/steps/step1-metadata.js +42 -0
  38. package/dist/steps/step1-metadata.js.map +1 -0
  39. package/dist/steps/step2-readable.d.ts +16 -0
  40. package/dist/steps/step2-readable.d.ts.map +1 -0
  41. package/dist/steps/step2-readable.js +45 -0
  42. package/dist/steps/step2-readable.js.map +1 -0
  43. package/dist/steps/step3-sanitize.d.ts +15 -0
  44. package/dist/steps/step3-sanitize.d.ts.map +1 -0
  45. package/dist/steps/step3-sanitize.js +43 -0
  46. package/dist/steps/step3-sanitize.js.map +1 -0
  47. package/dist/steps/step4-plaintext.d.ts +14 -0
  48. package/dist/steps/step4-plaintext.d.ts.map +1 -0
  49. package/dist/steps/step4-plaintext.js +47 -0
  50. package/dist/steps/step4-plaintext.js.map +1 -0
  51. package/dist/steps/step5-image.d.ts +22 -0
  52. package/dist/steps/step5-image.d.ts.map +1 -0
  53. package/dist/steps/step5-image.js +121 -0
  54. package/dist/steps/step5-image.js.map +1 -0
  55. package/dist/types.d.ts +56 -0
  56. package/dist/types.d.ts.map +1 -0
  57. package/dist/types.js +3 -0
  58. package/dist/types.js.map +1 -0
  59. package/dist/utils/formatters.d.ts +45 -0
  60. package/dist/utils/formatters.d.ts.map +1 -0
  61. package/dist/utils/formatters.js +61 -0
  62. package/dist/utils/formatters.js.map +1 -0
  63. package/dist/utils.d.ts +17 -0
  64. package/dist/utils.d.ts.map +1 -0
  65. package/dist/utils.js +34 -0
  66. package/dist/utils.js.map +1 -0
  67. package/package.json +72 -0
@@ -0,0 +1,121 @@
1
+ "use strict";
2
+ /**
3
+ * Step 5: Select the primary image
4
+ *
5
+ * Extracts the best primary image URL from metadata.
6
+ * Uses metascraperImage result as primary, falls back to logo if needed.
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.step5SelectImage = step5SelectImage;
10
+ const jsdom_1 = require("jsdom");
11
+ const url_1 = require("url");
12
/**
 * Extract a favicon URL from the <link> tags of an HTML document.
 *
 * Selectors are tried in order of preference. Within one selector every
 * matching element is examined in document order, so a page whose first
 * icon link is unusable (no href, a data: URI, or an href that cannot be
 * resolved against baseUrl) can still yield a later, usable one.
 *
 * @param htmlContent - Raw HTML string to parse
 * @param baseUrl - Base URL for resolving relative paths
 * @returns Absolute favicon URL, or null if none is found or parsing fails
 */
function extractFaviconFromHtml(htmlContent, baseUrl) {
    // Favicon link tags in order of preference
    const faviconSelectors = [
        'link[rel="icon"]',
        'link[rel="shortcut icon"]',
        'link[rel="apple-touch-icon"]',
        'link[rel="apple-touch-icon-precomposed"]',
    ];
    let dom = null;
    try {
        dom = new jsdom_1.JSDOM(htmlContent);
        const document = dom.window.document;
        for (const selector of faviconSelectors) {
            // querySelectorAll (not querySelector): a rejected first match must
            // not hide the remaining candidates for the same selector.
            for (const linkElement of document.querySelectorAll(selector)) {
                const href = linkElement.getAttribute('href');
                // Skip links without an href and data URIs
                if (!href || href.startsWith('data:')) {
                    continue;
                }
                // Resolve relative URLs to absolute
                try {
                    return new url_1.URL(href, baseUrl).href;
                }
                catch (e) {
                    // Invalid URL, try the next candidate
                }
            }
        }
        return null;
    }
    catch (error) {
        // Best effort: HTML parsing failed, report "no favicon"
        return null;
    }
    finally {
        // Close the jsdom window once the result has been computed so the
        // parsed document is not kept around longer than needed.
        dom?.window.close();
    }
}
58
/**
 * Step 5 entry point: choose the primary image for a page.
 *
 * Candidates are tried in this order and the first usable one wins:
 *   1. metadata.image
 *   2. metadata.logo
 *   3. a favicon found in the HTML <link> tags (needs htmlContent and a base URL)
 * A candidate starting with "data:" is never selected.
 *
 * @param metadata - Metadata object from step 1
 * @param htmlContent - Raw HTML content for favicon extraction fallback
 * @param url - Base URL for resolving relative favicon paths
 * @returns Object with `extracted` (the value read from metadata.image, even
 *          when it was rejected), `selected` (the chosen URL or null) and
 *          `reason` (explanation of the choice)
 */
function step5SelectImage(metadata, htmlContent, url) {
    // A field may hold a string or an array; for arrays only the first entry counts.
    const firstUrlOf = (value) => {
        if (Array.isArray(value)) {
            return typeof value[0] === "string" ? value[0] : null;
        }
        return typeof value === "string" && value !== "" ? value : null;
    };
    // data: URIs are typically very large and not useful for external reference.
    const isUsable = (candidate) => Boolean(candidate) && !candidate.startsWith("data:");
    const imageUrl = firstUrlOf(metadata.image);
    // Every outcome reports the same `extracted` value.
    const outcome = (selected, reason) => ({ extracted: imageUrl, selected, reason });
    // Primary source: the image field
    if (isUsable(imageUrl)) {
        return outcome(imageUrl, "Primary image from metascraper");
    }
    // Fallback 1: the logo field
    const logoUrl = firstUrlOf(metadata.logo);
    if (isUsable(logoUrl)) {
        return outcome(logoUrl, "Fallback to logo/favicon");
    }
    // Fallback 2: favicon <link> tags; metadata.url takes precedence over the url argument
    const baseUrl = htmlContent ? metadata.url || url : null;
    const htmlFavicon = baseUrl ? extractFaviconFromHtml(htmlContent, baseUrl) : null;
    if (htmlFavicon) {
        return outcome(htmlFavicon, "Favicon extracted from HTML <link> tags");
    }
    // Nothing usable
    return outcome(null, "No suitable image found");
}
121
+ //# sourceMappingURL=step5-image.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"step5-image.js","sourceRoot":"","sources":["../../src/steps/step5-image.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;AAqEH,4CA6DC;AA/HD,iCAA8B;AAC9B,6BAA0B;AAQ1B;;;;;;GAMG;AACH,SAAS,sBAAsB,CAAC,WAAmB,EAAE,OAAe;IAClE,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,WAAW,CAAC,CAAC;QACnC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;QAErC,sDAAsD;QACtD,MAAM,gBAAgB,GAAG;YACvB,kBAAkB;YAClB,2BAA2B;YAC3B,8BAA8B;YAC9B,0CAA0C;SAC3C,CAAC;QAEF,KAAK,MAAM,QAAQ,IAAI,gBAAgB,EAAE,CAAC;YACxC,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YACrD,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,IAAI,GAAG,WAAW,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;gBAC9C,IAAI,IAAI,EAAE,CAAC;oBACT,iBAAiB;oBACjB,IAAI,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;wBAC7B,SAAS;oBACX,CAAC;oBAED,oCAAoC;oBACpC,IAAI,CAAC;wBACH,MAAM,WAAW,GAAG,IAAI,SAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;wBAC3C,OAAO,WAAW,CAAC,IAAI,CAAC;oBAC1B,CAAC;oBAAC,OAAO,CAAC,EAAE,CAAC;wBACX,wBAAwB;wBACxB,SAAS;oBACX,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,sBAAsB;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,SAAgB,gBAAgB,CAC9B,QAAwB,EACxB,WAAoB,EACpB,GAAY;IAEZ,sEAAsE;IACtE,MAAM,WAAW,GAAG,CAAC,GAAQ,EAAiB,EAAE;QAC9C,IAAI,CAAC,GAAG;YAAE,OAAO,IAAI,CAAC;QACtB,IAAI,OAAO,GAAG,KAAK,QAAQ;YAAE,OAAO,GAAG,CAAC;QACxC,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzC,OAAO,OAAO,GAAG,CAAC,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QACpD,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC;IAEF,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,WAAW,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;IAC7C,IAAI,QAAQ,EAAE,CAAC;QACb,2FAA2F;QAC3F,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAClC,OAAO;gBACL,SAAS,EAAE,QAAQ;gBACnB,QAAQ,EAAE,QAAQ;gBAClB,MAAM,EAAE,gCAAgC;aACzC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,MAAM,OAAO,GAAG,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC3C,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC5C,OAAO;YACL,SAAS,EAAE,QAAQ;YACnB,QAAQ,EAAE,OAAO;YACjB,MAAM,EAAE
,0BAA0B;SACnC,CAAC;IACJ,CAAC;IAED,kDAAkD;IAClD,IAAI,WAAW,EAAE,CAAC;QAChB,8DAA8D;QAC9D,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,IAAI,GAAG,CAAC;QAEpC,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,WAAW,GAAG,sBAAsB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;YACjE,IAAI,WAAW,EAAE,CAAC;gBAChB,OAAO;oBACL,SAAS,EAAE,QAAQ;oBACnB,QAAQ,EAAE,WAAW;oBACrB,MAAM,EAAE,yCAAyC;iBAClD,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,0BAA0B;IAC1B,OAAO;QACL,SAAS,EAAE,QAAQ;QACnB,QAAQ,EAAE,IAAI;QACd,MAAM,EAAE,yBAAyB;KAClC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Metadata extracted from a webpage by metascraper. Every named field is optional, and the index signature admits additional keys of any type.
3
+ */
4
+ export interface MetadataResult {
5
+ title?: string;
6
+ description?: string;
7
+ image?: string;
8
+ logo?: string;
9
+ author?: string;
10
+ publisher?: string;
11
+ datePublished?: string;
12
+ dateModified?: string;
13
+ url?: string;
14
+ youtubeVideoId?: string;
15
+ youtubeChannelName?: string;
16
+ youtubeChannelId?: string;
17
+ twitterHandle?: string;
18
+ twitterCreator?: string;
19
+ amazonPrice?: string;
20
+ amazonProductTitle?: string;
21
+ redditSubreddit?: string;
22
+ redditAuthor?: string;
23
+ readableContentHtml?: string;
24
+ [key: string]: any;
25
+ }
26
+ /**
27
+ * Debug information from each pipeline step: one entry per step (step1 through step5), where step 5 records the extracted image, the selected image and the reason for the choice.
28
+ */
29
+ export interface DebugInfo {
30
+ step1_metadata: MetadataResult;
31
+ step2_readableContent: string;
32
+ step3_sanitizedContent: string;
33
+ step4_plaintext: string;
34
+ step5_imageSelection: {
35
+ extracted: string | null;
36
+ selected: string | null;
37
+ reason: string;
38
+ };
39
+ }
40
+ /**
41
+ * Final result from meatscraper extraction: content, image (a string or null) and metadata are always present; debug is optional.
42
+ */
43
+ export interface MeatExtractorResult {
44
+ content: string;
45
+ image: string | null;
46
+ metadata: MetadataResult;
47
+ debug?: DebugInfo;
48
+ }
49
+ /**
50
+ * Options for the meatscraper function; both debug and url are optional.
51
+ */
52
+ export interface MeatExtractorOptions {
53
+ debug?: boolean;
54
+ url?: string;
55
+ }
56
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,WAAW,cAAc;IAE7B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,GAAG,CAAC,EAAE,MAAM,CAAC;IAGb,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAG7B,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,cAAc,EAAE,cAAc,CAAC;IAC/B,qBAAqB,EAAE,MAAM,CAAC;IAC9B,sBAAsB,EAAE,MAAM,CAAC;IAC/B,eAAe,EAAE,MAAM,CAAC;IACxB,oBAAoB,EAAE;QACpB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;QACzB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;QACxB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAElC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,QAAQ,EAAE,cAAc,CAAC;IAGzB,KAAK,CAAC,EAAE,SAAS,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd"}
package/dist/types.js ADDED
@@ -0,0 +1,3 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Output formatters for different modes and response types
3
+ */
4
+ import { MeatExtractorResult } from "../index";
5
+ /**
6
+ * Success response format with extracted data: success is always the literal true, and data carries content, image (a string or null) and metadata.
7
+ */
8
+ export interface SuccessResponse {
9
+ success: true;
10
+ data: {
11
+ content: string;
12
+ image: string | null;
13
+ metadata: any;
14
+ };
15
+ }
16
+ /**
17
+ * Error response format: success is always the literal false, error is a required string and code is optional.
18
+ */
19
+ export interface ErrorResponse {
20
+ success: false;
21
+ error: string;
22
+ code?: string;
23
+ }
24
+ export type ApiResponse = SuccessResponse | ErrorResponse; // union told apart by the literal `success` field (true vs false)
25
+ /**
26
+ * Format a successful meatscraper result for API/file responses; takes a MeatExtractorResult and returns a SuccessResponse.
27
+ */
28
+ export declare function formatSuccessResponse(result: MeatExtractorResult): SuccessResponse;
29
+ /**
30
+ * Format an error response from an Error or a message string, with an optional code.
31
+ */
32
+ export declare function formatErrorResponse(error: Error | string, code?: string): ErrorResponse;
33
+ /**
34
+ * Format an ApiResponse (success or error) as a pretty-printed JSON string (2 spaces).
35
+ */
36
+ export declare function formatAsJson(response: ApiResponse): string;
37
+ /**
38
+ * Get the current timestamp in ISO format for logging; takes no arguments and returns a string.
39
+ */
40
+ export declare function getTimestamp(): string;
41
+ /**
42
+ * Format a log message with timestamp from method, path, status code and duration; responseSize and clientId are optional.
43
+ */
44
+ export declare function formatLogMessage(method: string, path: string, statusCode: number, durationMs: number, responseSize?: number, clientId?: string): string;
45
+ //# sourceMappingURL=formatters.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"formatters.d.ts","sourceRoot":"","sources":["../../src/utils/formatters.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,UAAU,CAAC;AAE/C;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,IAAI,CAAC;IACd,IAAI,EAAE;QACJ,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;QACrB,QAAQ,EAAE,GAAG,CAAC;KACf,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,KAAK,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED,MAAM,MAAM,WAAW,GAAG,eAAe,GAAG,aAAa,CAAC;AAE1D;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,mBAAmB,GAAG,eAAe,CASlF;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CACjC,KAAK,EAAE,KAAK,GAAG,MAAM,EACrB,IAAI,CAAC,EAAE,MAAM,GACZ,aAAa,CAOf;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,QAAQ,EAAE,WAAW,GAAG,MAAM,CAE1D;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,MAAM,CAErC;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,MAAM,EACZ,UAAU,EAAE,MAAM,EAClB,UAAU,EAAE,MAAM,EAClB,YAAY,CAAC,EAAE,MAAM,EACrB,QAAQ,CAAC,EAAE,MAAM,GAChB,MAAM,CAUR"}
@@ -0,0 +1,61 @@
1
+ "use strict";
2
+ /**
3
+ * Output formatters for different modes and response types
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.formatSuccessResponse = formatSuccessResponse;
7
+ exports.formatErrorResponse = formatErrorResponse;
8
+ exports.formatAsJson = formatAsJson;
9
+ exports.getTimestamp = getTimestamp;
10
+ exports.formatLogMessage = formatLogMessage;
11
/**
 * Wrap a meatscraper result in the success envelope used for API/file output.
 *
 * Only `content`, `image` and `metadata` are copied into `data`; any other
 * property of the result is left out.
 *
 * @param result - Extraction result to wrap
 * @returns `{ success: true, data: { content, image, metadata } }`
 */
function formatSuccessResponse(result) {
    const { content, image, metadata } = result;
    const data = { content, image, metadata };
    return { success: true, data };
}
24
/**
 * Format an error response.
 *
 * The `error` field of the result is always a string:
 *   - a string argument is used as-is;
 *   - otherwise a non-nullish `message` property is used, converted to a string;
 *   - null/undefined become "Unknown error";
 *   - anything else is converted with String().
 * This matters for values caught from a `catch` clause, which are not
 * guaranteed to be Error instances.
 *
 * @param error - Error object or message string
 * @param code - Optional error code; the `code` key is omitted when falsy
 * @returns `{ success: false, error, code? }`
 */
function formatErrorResponse(error, code) {
    let message;
    if (typeof error === "string") {
        message = error;
    }
    else if (error != null && error.message != null) {
        message = String(error.message);
    }
    else {
        message = error == null ? "Unknown error" : String(error);
    }
    return {
        success: false,
        error: message,
        ...(code && { code }),
    };
}
35
/**
 * Serialize a response object to human-readable JSON.
 *
 * @param response - Response object to serialize
 * @returns JSON text indented with two spaces per nesting level
 */
function formatAsJson(response) {
    const indentWidth = 2;
    const serialized = JSON.stringify(response, null, indentWidth);
    return serialized;
}
41
/**
 * Current time as an ISO 8601 string, for logging.
 *
 * @returns The value of `Date.prototype.toISOString()` for the moment of the call
 */
function getTimestamp() {
    const now = new Date();
    return now.toISOString();
}
47
/**
 * Format a one-line request log message.
 *
 * Shape: `[<timestamp>] <method> <path> - <statusCode> (<durationMs>ms)`,
 * optionally followed by ` - <responseSize> bytes` and ` [<clientId>]`.
 * The timestamp comes from `Date.prototype.toLocaleString()`, so its exact
 * format depends on the runtime locale.
 *
 * @param method - HTTP method
 * @param path - Request path
 * @param statusCode - Response status code
 * @param durationMs - Request duration in milliseconds
 * @param responseSize - Optional response size in bytes; a size of 0 is logged
 * @param clientId - Optional client identifier; omitted when falsy
 * @returns The formatted log line
 */
function formatLogMessage(method, path, statusCode, durationMs, responseSize, clientId) {
    const timestamp = new Date().toLocaleString();
    let message = `[${timestamp}] ${method} ${path} - ${statusCode} (${durationMs}ms)`;
    // 0 is a legitimate size (e.g. an empty body) and must not be dropped by a
    // truthiness check; other falsy values still mean "size not provided".
    const hasResponseSize = responseSize === 0 || Boolean(responseSize);
    if (hasResponseSize) {
        message += ` - ${responseSize} bytes`;
    }
    if (clientId) {
        message += ` [${clientId}]`;
    }
    return message;
}
61
+ //# sourceMappingURL=formatters.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"formatters.js","sourceRoot":"","sources":["../../src/utils/formatters.ts"],"names":[],"mappings":";AAAA;;GAEG;;AA8BH,sDASC;AAKD,kDAUC;AAKD,oCAEC;AAKD,oCAEC;AAKD,4CAiBC;AA/DD;;GAEG;AACH,SAAgB,qBAAqB,CAAC,MAA2B;IAC/D,OAAO;QACL,OAAO,EAAE,IAAI;QACb,IAAI,EAAE;YACJ,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,QAAQ,EAAE,MAAM,CAAC,QAAQ;SAC1B;KACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAgB,mBAAmB,CACjC,KAAqB,EACrB,IAAa;IAEb,MAAM,OAAO,GAAG,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAClE,OAAO;QACL,OAAO,EAAE,KAAK;QACd,KAAK,EAAE,OAAO;QACd,GAAG,CAAC,IAAI,IAAI,EAAE,IAAI,EAAE,CAAC;KACtB,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAgB,YAAY,CAAC,QAAqB;IAChD,OAAO,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,SAAgB,YAAY;IAC1B,OAAO,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;AAClC,CAAC;AAED;;GAEG;AACH,SAAgB,gBAAgB,CAC9B,MAAc,EACd,IAAY,EACZ,UAAkB,EAClB,UAAkB,EAClB,YAAqB,EACrB,QAAiB;IAEjB,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,cAAc,EAAE,CAAC;IAC9C,IAAI,OAAO,GAAG,IAAI,SAAS,KAAK,MAAM,IAAI,IAAI,MAAM,UAAU,KAAK,UAAU,KAAK,CAAC;IACnF,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,IAAI,MAAM,YAAY,QAAQ,CAAC;IACxC,CAAC;IACD,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO,IAAI,KAAK,QAAQ,GAAG,CAAC;IAC9B,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Utility functions for meatscraper
3
+ */
4
+ /**
5
+ * Normalize a Content-Type header by stripping parameters (e.g., charset)
6
+ * and lowercasing the media type; accepts null and may return null
7
+ */
8
+ export declare function normalizeContentType(header: string | null): string | null;
9
+ /**
10
+ * Check if a string is empty or whitespace only; null and undefined are accepted as input
11
+ */
12
+ export declare function isEmpty(str: string | null | undefined): boolean;
13
+ /**
14
+ * Truncate a string to a maximum length; maxLength and suffix are optional
15
+ */
16
+ export declare function truncate(str: string, maxLength?: number, suffix?: string): string;
17
+ //# sourceMappingURL=utils.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,MAAM,GAAG,IAAI,GAAG,MAAM,GAAG,IAAI,CAKzE;AAED;;GAEG;AACH,wBAAgB,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,OAAO,CAE/D;AAED;;GAEG;AACH,wBAAgB,QAAQ,CACtB,GAAG,EAAE,MAAM,EACX,SAAS,GAAE,MAAY,EACvB,MAAM,GAAE,MAAc,GACrB,MAAM,CAKR"}
package/dist/utils.js ADDED
@@ -0,0 +1,34 @@
1
+ "use strict";
2
+ /**
3
+ * Utility functions for meatscraper
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.normalizeContentType = normalizeContentType;
7
+ exports.isEmpty = isEmpty;
8
+ exports.truncate = truncate;
9
+ /**
10
+ * Normalize a Content-Type header by stripping parameters (e.g., charset)
11
+ * and lowercasing the media type
12
+ */
13
+ function normalizeContentType(header) {
14
+ if (!header) {
15
+ return null;
16
+ }
17
+ return header.split(";", 1)[0]?.trim().toLowerCase() || null;
18
+ }
19
/**
 * Tell whether a value carries no visible text.
 *
 * @param str - String to inspect; null and undefined are allowed
 * @returns true for null, undefined, "" and whitespace-only strings; false otherwise
 */
function isEmpty(str) {
    if (!str) {
        return true;
    }
    return str.trim() === "";
}
25
/**
 * Shorten a string that is longer than maxLength.
 *
 * A string of at most maxLength characters is returned unchanged. A longer
 * one is cut to its first maxLength characters and the suffix is appended
 * after the cut, so the result can be up to suffix.length characters longer
 * than maxLength.
 *
 * @param str - String to shorten
 * @param maxLength - Number of characters to keep (default 100)
 * @param suffix - Marker appended to a shortened string (default "...")
 * @returns The original string, or its shortened form followed by the suffix
 */
function truncate(str, maxLength = 100, suffix = "...") {
    const fits = str.length <= maxLength;
    return fits ? str : str.substring(0, maxLength) + suffix;
}
34
+ //# sourceMappingURL=utils.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAMH,oDAKC;AAKD,0BAEC;AAKD,4BASC;AA9BD;;;GAGG;AACH,SAAgB,oBAAoB,CAAC,MAAqB;IACxD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,IAAI,CAAC;AAC/D,CAAC;AAED;;GAEG;AACH,SAAgB,OAAO,CAAC,GAA8B;IACpD,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,CAAC;AACzC,CAAC;AAED;;GAEG;AACH,SAAgB,QAAQ,CACtB,GAAW,EACX,YAAoB,GAAG,EACvB,SAAiB,KAAK;IAEtB,IAAI,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC5B,OAAO,GAAG,CAAC;IACb,CAAC;IACD,OAAO,GAAG,CAAC,SAAS,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,MAAM,CAAC;AAC9C,CAAC"}
package/package.json ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "name": "meatscraper",
3
+ "version": "1.0.0",
4
+ "description": "Extract text content and primary image from webpages using advanced scraping techniques",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "bin": {
8
+ "meatscraper": "./dist/cli.js"
9
+ },
10
+ "files": [
11
+ "dist",
12
+ "README.md",
13
+ "LICENSE"
14
+ ],
15
+ "scripts": {
16
+ "build": "tsc",
17
+ "dev": "tsc --watch",
18
+ "clean": "rm -rf dist",
19
+ "prepublishOnly": "npm run build",
20
+ "start": "node dist/cli.js",
21
+ "start:file": "node dist/cli.js",
22
+ "start:serve": "node dist/cli.js serve"
23
+ },
24
+ "keywords": [
25
+ "web-scraping",
26
+ "content-extraction",
27
+ "metadata",
28
+ "readability",
29
+ "sanitization",
30
+ "html-parser",
31
+ "web-content",
32
+ "article-extraction"
33
+ ],
34
+ "author": "paulohgodinho",
35
+ "license": "MIT",
36
+ "repository": {
37
+ "type": "git",
38
+ "url": "https://github.com/paulohgodinho/meatscraper.git"
39
+ },
40
+ "homepage": "https://github.com/paulohgodinho/meatscraper#readme",
41
+ "bugs": {
42
+ "url": "https://github.com/paulohgodinho/meatscraper/issues"
43
+ },
44
+ "dependencies": {
45
+ "@mozilla/readability": "^0.6.0",
46
+ "@types/express": "^5.0.6",
47
+ "dompurify": "^3.2.4",
48
+ "dotenv": "^17.2.3",
49
+ "express": "^5.2.1",
50
+ "html-to-text": "^9.0.0",
51
+ "jsdom": "^24.0.0",
52
+ "metascraper": "^5.49.5",
53
+ "metascraper-amazon": "^5.49.5",
54
+ "metascraper-author": "^5.49.5",
55
+ "metascraper-date": "^5.49.5",
56
+ "metascraper-description": "^5.49.5",
57
+ "metascraper-image": "^5.49.5",
58
+ "metascraper-logo-favicon": "^5.49.5",
59
+ "metascraper-publisher": "^5.49.5",
60
+ "metascraper-title": "^5.49.5",
61
+ "metascraper-url": "^5.49.5",
62
+ "metascraper-x": "^5.49.5",
63
+ "metascraper-youtube": "^5.49.5"
64
+ },
65
+ "devDependencies": {
66
+ "@types/dompurify": "^3.0.5",
67
+ "@types/html-to-text": "^9.0.4",
68
+ "@types/jsdom": "^27.0.0",
69
+ "@types/node": "^20.19.30",
70
+ "typescript": "^5.3.3"
71
+ }
72
+ }