ogscrap 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -0
- package/app/example.ts +4 -0
- package/app/page-metadata.ts +127 -0
- package/package.json +21 -0
- package/tsconfig.json +51 -0
package/README.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# ogscrap
|
|
2
|
+
|
|
3
|
+
Lightweight utility to fetch a web page and extract clean metadata.
|
|
4
|
+
|
|
5
|
+
It returns a simple JSON object with:
|
|
6
|
+
|
|
7
|
+
- `url`
|
|
8
|
+
- `title`
|
|
9
|
+
- `description`
|
|
10
|
+
- `image`
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
npm install
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```ts
|
|
21
|
+
import { fetchPageMetadata } from "./app/page-metadata";
|
|
22
|
+
|
|
23
|
+
const data = await fetchPageMetadata("https://example.com/post");
|
|
24
|
+
console.log(data);
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Response shape:
|
|
28
|
+
|
|
29
|
+
```json
|
|
30
|
+
{
|
|
31
|
+
"url": "https://example.com/post",
|
|
32
|
+
"title": "Page title",
|
|
33
|
+
"description": "Short summary about the content",
|
|
34
|
+
"image": "https://example.com/image.jpg"
|
|
35
|
+
}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Local Example
|
|
39
|
+
|
|
40
|
+
You can run the sample in `app/example.ts` (depending on your TS runtime setup):
|
|
41
|
+
|
|
42
|
+
```ts
|
|
43
|
+
import { fetchPageMetadata } from "./page-metadata";
|
|
44
|
+
|
|
45
|
+
const data = await fetchPageMetadata("https://example.com/post");
|
|
46
|
+
console.log(data);
|
|
47
|
+
```
|
package/app/page-metadata.ts
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
import createDOMPurify from "dompurify";
|
|
3
|
+
import { JSDOM } from "jsdom";
|
|
4
|
+
|
|
5
|
+
/**
 * Result returned by `fetchPageMetadata`.
 *
 * Every field is a string; the extraction helpers in this module fall back to
 * an empty string when no usable value is found.
 */
export type PageMetadata = {
  /** The page URL, as normalized by `toAbsoluteUrl`. */
  url: string;
  /** Page title after sanitizing. */
  title: string;
  /** Page description after sanitizing. */
  description: string;
  /** Image URL after resolving against the page URL and sanitizing. */
  image: string;
};
|
|
11
|
+
|
|
12
|
+
// DOMPurify needs a window object to work with; an empty JSDOM document
// supplies one for the lifetime of the module.
const { window: purifyWindow } = new JSDOM("");

// Module-wide sanitizer instance bound to the JSDOM window above.
const DOMPurify = createDOMPurify(purifyWindow);
|
|
14
|
+
|
|
15
|
+
/**
 * Normalizes whitespace in a piece of text.
 *
 * @param value - Text to normalize; a missing value is treated as empty.
 * @returns The text with every run of whitespace collapsed to a single space
 *   and leading/trailing whitespace removed.
 */
function cleanText(value: string | undefined): string {
  if (value == null) {
    return "";
  }
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
18
|
+
|
|
19
|
+
/**
 * Strips all markup from a value and normalizes its whitespace.
 *
 * @param value - Untrusted text taken from a fetched page; a missing value is
 *   treated as empty.
 * @returns The sanitizer output passed through `cleanText`.
 */
function sanitizePlainText(value: string | undefined): string {
  const raw = value ?? "";
  // Empty allow-lists: no tag and no attribute is permitted to survive.
  // NOTE(review): the sanitizer presumably returns HTML-serialized text, so
  // characters such as `&` may come back as entities (`&amp;`) — confirm
  // before using this on URLs or other non-HTML values.
  const stripped = DOMPurify.sanitize(raw, { ALLOWED_TAGS: [], ALLOWED_ATTR: [] });
  return cleanText(stripped);
}
|
|
26
|
+
|
|
27
|
+
/**
 * Picks the first candidate that still has content after sanitizing.
 *
 * @param values - Candidates in priority order; entries may be undefined.
 * @returns The sanitized form of the first candidate that is not empty, or an
 *   empty string when every candidate sanitizes to nothing.
 */
function firstNonEmpty(values: Array<string | undefined>): string {
  let found = "";
  // `some` stops at the first hit, so later candidates are never sanitized.
  values.some((candidate) => {
    found = sanitizePlainText(candidate);
    return found !== "";
  });
  return found;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Resolves a possibly relative URL against a base URL.
 *
 * @param baseUrl - URL used as the base for relative references.
 * @param maybeUrl - Candidate URL; it is sanitized before parsing.
 * @returns The resolved URL as a string, or an empty string when the
 *   candidate is empty after sanitizing or cannot be parsed.
 */
function toAbsoluteUrl(baseUrl: string, maybeUrl: string | undefined): string {
  const candidate = sanitizePlainText(maybeUrl);
  if (candidate === "") {
    return "";
  }

  let resolved: URL;
  try {
    resolved = new URL(candidate, baseUrl);
  } catch {
    // Unparseable input is reported as "no URL" rather than as an error.
    return "";
  }
  return resolved.toString();
}
|
|
44
|
+
|
|
45
|
+
/**
 * Extracts the text of the page's main content area.
 *
 * Tries `main article`, then `article`, then `main`, using the first matched
 * element of each selector.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns Sanitized text of the first selector that yields any, or an empty
 *   string when none does.
 */
function getMainText($: cheerio.CheerioAPI): string {
  const selectors = ["main article", "article", "main"];
  for (const selector of selectors) {
    const text = sanitizePlainText($(selector).first().text());
    if (text) {
      return text;
    }
  }
  return "";
}
|
|
53
|
+
|
|
54
|
+
/**
 * Finds an image inside the page content, preferring the most specific scope.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @param url - Page URL, used as the base for resolving a relative `src`.
 * @returns The resolved image URL, or an empty string when no `src` is found
 *   or it cannot be resolved.
 */
function getMainImage($: cheerio.CheerioAPI, url: string): string {
  // Ordered from the narrowest content scope to any image in the document.
  const scopes = ["main article img", "article img", "main img", "img"];
  const sources = scopes.map((scope) => $(scope).first().attr("src"));
  return toAbsoluteUrl(url, firstNonEmpty(sources));
}
|
|
63
|
+
|
|
64
|
+
/**
 * Determines the page title.
 *
 * Priority: `meta[name="title"]`, Twitter title, Open Graph title, the
 * `<title>` element, then the first `<h1>`.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns The first non-empty sanitized candidate, or an empty string.
 */
function getTitle($: cheerio.CheerioAPI): string {
  const metaContent = (selector: string) => $(selector).attr("content");
  const firstText = (selector: string) => $(selector).first().text();

  const candidates = [
    metaContent('meta[name="title"]'),
    metaContent('meta[name="twitter:title"]'),
    metaContent('meta[property="og:title"]'),
    firstText("title"),
    firstText("h1"),
  ];
  return firstNonEmpty(candidates);
}
|
|
73
|
+
|
|
74
|
+
/**
 * Determines the page description.
 *
 * Priority: the standard description meta tag, the Twitter description, the
 * Open Graph description, then the text of the first `<p>` element.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns The first non-empty sanitized candidate, or an empty string.
 */
function getDescription($: cheerio.CheerioAPI): string {
  // Each meta selector is queried once. `.attr()` reads from the first matched
  // element, so repeating the lookup with `.first()` could never yield a
  // different value.
  return firstNonEmpty([
    $('meta[name="description"]').attr("content"),
    $('meta[name="twitter:description"]').attr("content"),
    $('meta[property="og:description"]').attr("content"),
    $("p").first().text(),
  ]);
}
|
|
85
|
+
|
|
86
|
+
/**
 * Determines the page's representative image.
 *
 * Priority: `meta[name="image"]`, Open Graph image, Twitter image,
 * `link[rel="image_src"]`, the first `<img>` in the document, the Apple touch
 * icon, then the favicon.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @param url - Page URL, used as the base for resolving a relative reference.
 * @returns The resolved image URL, or an empty string when nothing usable is
 *   found.
 */
function getImage($: cheerio.CheerioAPI, url: string): string {
  const metaContent = (selector: string) => $(selector).attr("content");
  const linkHref = (selector: string) => $(selector).attr("href");

  const candidates = [
    metaContent('meta[name="image"]'),
    metaContent('meta[property="og:image"]'),
    metaContent('meta[name="twitter:image"]'),
    linkHref('link[rel="image_src"]'),
    $("img").first().attr("src"),
    linkHref('link[rel="apple-touch-icon"]'),
    linkHref('link[rel="icon"]'),
  ];
  return toAbsoluteUrl(url, firstNonEmpty(candidates));
}
|
|
98
|
+
|
|
99
|
+
/**
 * Fetches a web page and extracts its title, description, and image.
 *
 * The description comes from the page's meta tags when available; otherwise
 * the first 300 characters of the main content text are used. The image comes
 * from `getImage`, falling back to `getMainImage`.
 *
 * @param url - Address of the page to fetch; also the base for resolving
 *   relative image URLs.
 * @returns The extracted metadata. Individual fields are empty strings when
 *   nothing usable is found.
 * @throws Error when the response status is not OK. Failures raised by
 *   `fetch` itself propagate unchanged.
 */
export async function fetchPageMetadata(url: string): Promise<PageMetadata> {
  const response = await fetch(url, {
    headers: {
      "user-agent": "Mozilla/5.0 (compatible; page-metadata/1.0)",
      accept: "text/html,application/xhtml+xml",
    },
    // `referrerPolicy` is a request option, not an HTTP header. Placed inside
    // `headers` it was sent as a literal "referrerPolicy" header and the
    // policy itself was never applied.
    referrerPolicy: "no-referrer",
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
  }

  const html = await response.text();
  const $ = cheerio.load(html);

  const title = getTitle($);
  const descriptionFromMeta = getDescription($);
  const mainText = getMainText($);
  // Fall back to a 300-character excerpt of the main content when the page
  // declares no description.
  const description = sanitizePlainText(descriptionFromMeta || mainText.slice(0, 300));
  const image = getImage($, url) || getMainImage($, url);

  // NOTE(review): `url` and `image` are passed through the HTML sanitizer; if
  // it entity-encodes `&`, query strings may be altered — confirm.
  return {
    url: toAbsoluteUrl(url, url),
    title: sanitizePlainText(title),
    description,
    image: sanitizePlainText(image),
  };
}
|
package/package.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ogscrap",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Lightweight utility to fetch a web page and extract clean metadata.",
|
|
5
|
+
"main": "./dist/index.js",
|
|
6
|
+
"scripts": {
|
|
7
|
+
"build" : "tsc"
|
|
8
|
+
},
|
|
9
|
+
"keywords": ["metadata" , "ogscrap", "page-metadata", "scraper"],
|
|
10
|
+
"author": "edah",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"type": "module",
|
|
13
|
+
"dependencies": {
|
|
14
|
+
"cheerio": "^1.2.0",
|
|
15
|
+
"dompurify": "^3.3.1",
|
|
16
|
+
"jsdom": "^28.1.0"
|
|
17
|
+
},
|
|
18
|
+
"devDependencies": {
|
|
19
|
+
"@types/jsdom": "^27.0.0"
|
|
20
|
+
}
|
|
21
|
+
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
// Visit https://aka.ms/tsconfig to read more about this file
|
|
3
|
+
"compilerOptions": {
|
|
4
|
+
// File Layout
|
|
5
|
+
"rootDir": "./app",
|
|
6
|
+
"outDir": "./dist",
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// Environment Settings
|
|
12
|
+
// See also https://aka.ms/tsconfig/module
|
|
13
|
+
"module": "nodenext",
|
|
14
|
+
"target": "esnext",
|
|
15
|
+
"types": [],
|
|
16
|
+
// For nodejs:
|
|
17
|
+
// "lib": ["esnext"],
|
|
18
|
+
// "types": ["node"],
|
|
19
|
+
// and npm install -D @types/node
|
|
20
|
+
|
|
21
|
+
// Other Outputs
|
|
22
|
+
"sourceMap": false,
|
|
23
|
+
"declaration": true,
|
|
24
|
+
"declarationMap": false,
|
|
25
|
+
|
|
26
|
+
// Stricter Typechecking Options
|
|
27
|
+
"noUncheckedIndexedAccess": true,
|
|
28
|
+
"exactOptionalPropertyTypes": true,
|
|
29
|
+
|
|
30
|
+
// Style Options
|
|
31
|
+
// "noImplicitReturns": true,
|
|
32
|
+
// "noImplicitOverride": true,
|
|
33
|
+
// "noUnusedLocals": true,
|
|
34
|
+
// "noUnusedParameters": true,
|
|
35
|
+
// "noFallthroughCasesInSwitch": true,
|
|
36
|
+
// "noPropertyAccessFromIndexSignature": true,
|
|
37
|
+
|
|
38
|
+
// Recommended Options
|
|
39
|
+
"strict": true,
|
|
40
|
+
|
|
41
|
+
"verbatimModuleSyntax": true,
|
|
42
|
+
"isolatedModules": true,
|
|
43
|
+
"noUncheckedSideEffectImports": true,
|
|
44
|
+
"moduleDetection": "force",
|
|
45
|
+
"skipLibCheck": true,
|
|
46
|
+
},
|
|
47
|
+
"exclude" : [
|
|
48
|
+
"node_modules",
|
|
49
|
+
"./app/example.ts"
|
|
50
|
+
],
|
|
51
|
+
}
|