ogscrap 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # ogscrap
2
+
3
+ Lightweight utility to fetch a web page and extract clean metadata.
4
+
5
+ It returns a simple JSON object with:
6
+
7
+ - `url`
8
+ - `title`
9
+ - `description`
10
+ - `image`
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ npm install
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ```ts
21
+ import { fetchPageMetadata } from "./app/page-metadata";
22
+
23
+ const data = await fetchPageMetadata("https://example.com/post");
24
+ console.log(data);
25
+ ```
26
+
27
+ Response shape:
28
+
29
+ ```json
30
+ {
31
+ "url": "https://example.com/post",
32
+ "title": "Page title",
33
+ "description": "Short summary about the content",
34
+ "image": "https://example.com/image.jpg"
35
+ }
36
+ ```
37
+
38
+ ## Local Example
39
+
40
+ You can run the sample in `app/example.ts` (depending on your TS runtime setup):
41
+
42
+ ```ts
43
+ import { fetchPageMetadata } from "./page-metadata.js";
44
+
45
+ const data = await fetchPageMetadata("https://example.com/post");
46
+ console.log(data);
47
+ ```
package/app/example.ts ADDED
@@ -0,0 +1,4 @@
1
+ import { fetchPageMetadata } from "./page-metadata.js";
2
// Sample run: fetch the metadata of a public article and print it.
// Alternative sample URL:
//   https://fr.wikipedia.org/wiki/Ski_acrobatique_aux_Jeux_olympiques_de_2026
console.log(
  await fetchPageMetadata("https://medium.com/@pshubham/using-react-with-cordova-f235de698cc3"),
);
@@ -0,0 +1,127 @@
1
+ import * as cheerio from "cheerio";
2
+ import createDOMPurify from "dompurify";
3
+ import { JSDOM } from "jsdom";
4
+
5
/**
 * Metadata extracted from a single web page.
 *
 * Every field is always present; a value that could not be determined is
 * represented by an empty string.
 */
export type PageMetadata = {
  /** Absolute form of the URL that was requested. */
  url: string;
  /** Page title taken from meta tags, `<title>` or the first `<h1>`. */
  title: string;
  /** Short summary taken from meta tags or from the page text. */
  description: string;
  /** Absolute URL of an image chosen for the page. */
  image: string;
};
11
+
12
// DOMPurify needs a DOM to work with. An empty jsdom document provides the
// window object it is initialised from; both are created once at module load
// and reused by every sanitizing call in this file.
const purifyWindow = new JSDOM("").window;
// The former `purifyWindow as typeof purifyWindow` assertion cast the value to
// its own type and therefore did nothing; the window is passed directly.
const DOMPurify = createDOMPurify(purifyWindow);
14
+
15
/**
 * Normalizes whitespace in a string.
 *
 * Each run of whitespace characters is replaced by a single space and the
 * result is trimmed at both ends.
 *
 * @param value - Text to normalize; a missing value is treated as empty.
 * @returns The normalized text, or `""` when there is nothing left.
 */
function cleanText(value: string | undefined): string {
  if (value == null) {
    return "";
  }
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
}
18
+
19
/**
 * Runs a string through DOMPurify with no tags and no attributes allowed,
 * then normalizes its whitespace with `cleanText`.
 *
 * NOTE(review): DOMPurify returns a serialized HTML string, so characters
 * such as `&` or `<` may come back entity-encoded when the input contained
 * markup - confirm that callers expect this.
 *
 * @param value - Untrusted text; a missing value is treated as empty.
 * @returns The sanitized, whitespace-normalized text.
 */
function sanitizePlainText(value: string | undefined): string {
  const input = value ?? "";
  // Empty allow-lists: no element and no attribute is permitted in the output.
  return cleanText(DOMPurify.sanitize(input, { ALLOWED_TAGS: [], ALLOWED_ATTR: [] }));
}
26
+
27
/**
 * Returns the first candidate that still has content after sanitizing.
 *
 * Candidates are sanitized one at a time, in order, and evaluation stops at
 * the first one that is not empty.
 *
 * @param values - Candidate strings in priority order; entries may be missing.
 * @returns The sanitized winner, or `""` when every candidate is empty.
 */
function firstNonEmpty(values: Array<string | undefined>): string {
  for (let index = 0; index < values.length; index += 1) {
    const text = sanitizePlainText(values[index]);
    if (text.length > 0) {
      return text;
    }
  }
  return "";
}
34
+
35
/**
 * Resolves a possibly relative URL against a base URL.
 *
 * @param baseUrl - URL that relative references are resolved against.
 * @param maybeUrl - Candidate URL; it is sanitized before being parsed.
 * @returns The absolute URL as a string, or `""` when the candidate is empty
 *          or cannot be parsed.
 */
function toAbsoluteUrl(baseUrl: string, maybeUrl: string | undefined): string {
  const candidate = sanitizePlainText(maybeUrl);
  if (candidate.length === 0) {
    return "";
  }

  let resolved: URL;
  try {
    resolved = new URL(candidate, baseUrl);
  } catch {
    // The URL constructor throws on unparsable input; report "no URL" instead.
    return "";
  }
  return resolved.toString();
}
44
+
45
/**
 * Extracts the text of the page's main content area.
 *
 * The first `main article`, `article` and `main` elements are tried in that
 * order, and the first one whose sanitized text is not empty wins.
 *
 * @param $ - Loaded cheerio document.
 * @returns The sanitized text, or `""` when none of the areas has any text.
 */
function getMainText($: cheerio.CheerioAPI): string {
  // firstNonEmpty already implements "first candidate that is non-empty after
  // sanitizing", so the winner is sanitized once rather than once to test it
  // and a second time to return it.
  return firstNonEmpty([
    $("main article").first().text(),
    $("article").first().text(),
    $("main").first().text(),
  ]);
}
53
+
54
/**
 * Finds an image inside the page content and returns it as an absolute URL.
 *
 * The `src` of the first image matching each selector is considered, from the
 * most specific content area down to any image in the document.
 *
 * @param $ - Loaded cheerio document.
 * @param url - Base URL used to resolve a relative `src`.
 * @returns The absolute image URL, or `""` when no usable `src` is found.
 */
function getMainImage($: cheerio.CheerioAPI, url: string): string {
  const selectors = ["main article img", "article img", "main img", "img"];
  const sources = selectors.map((selector) => $(selector).first().attr("src"));
  return toAbsoluteUrl(url, firstNonEmpty(sources));
}
63
+
64
/**
 * Determines the page title.
 *
 * Title meta tags are consulted first, followed by the `<title>` element and
 * finally the first `<h1>`.
 *
 * @param $ - Loaded cheerio document.
 * @returns The first non-empty sanitized title, or `""` when none is found.
 */
function getTitle($: cheerio.CheerioAPI): string {
  const metaSelectors = [
    'meta[name="title"]',
    'meta[name="twitter:title"]',
    'meta[property="og:title"]',
  ];
  const fromMeta = metaSelectors.map((selector) => $(selector).attr("content"));
  const fromDocument = [$("title").first().text(), $("h1").first().text()];
  return firstNonEmpty([...fromMeta, ...fromDocument]);
}
73
+
74
/**
 * Determines the page description.
 *
 * Description meta tags are consulted first; the text of the first `<p>` is
 * the last resort.
 *
 * @param $ - Loaded cheerio document.
 * @returns The first non-empty sanitized description, or `""` when none is
 *          found.
 */
function getDescription($: cheerio.CheerioAPI): string {
  // Each meta selector is queried once. The list used to repeat the same three
  // selectors with `.first()`, which could never produce a different value
  // because `.attr()` already reads from the first matched element.
  return firstNonEmpty([
    $('meta[name="description"]').attr("content"),
    $('meta[name="twitter:description"]').attr("content"),
    $('meta[property="og:description"]').attr("content"),
    $("p").first().text(),
  ]);
}
85
+
86
/**
 * Chooses an image for the page and returns it as an absolute URL.
 *
 * Sources are tried in stages, and a later stage is used only when the
 * previous one yields nothing usable:
 *   1. image metadata declared by the page (meta tags, `link[rel="image_src"]`);
 *   2. images in the page content, as selected by `getMainImage`;
 *   3. site icons.
 *
 * @param $ - Loaded cheerio document.
 * @param url - Base URL used to resolve relative references.
 * @returns The absolute image URL, or `""` when no stage produces one.
 */
function getImage($: cheerio.CheerioAPI, url: string): string {
  const declared = toAbsoluteUrl(
    url,
    firstNonEmpty([
      $('meta[name="image"]').attr("content"),
      $('meta[property="og:image"]').attr("content"),
      $('meta[name="twitter:image"]').attr("content"),
      $('link[rel="image_src"]').attr("href"),
    ]),
  );
  if (declared) return declared;

  // Previously the document's very first <img> was taken at this point, which
  // made the content-aware order in getMainImage unreachable whenever any
  // earlier image (a logo, for instance) had a src. getMainImage still falls
  // back to that first <img>, so no candidate is lost.
  const fromContent = getMainImage($, url);
  if (fromContent) return fromContent;

  return toAbsoluteUrl(
    url,
    firstNonEmpty([
      $('link[rel="apple-touch-icon"]').attr("href"),
      $('link[rel="icon"]').attr("href"),
    ]),
  );
}
98
+
99
/**
 * Downloads a web page and extracts its title, description and image.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted metadata; a field that cannot be determined is an
 *          empty string.
 * @throws Error when the response status is not successful (`response.ok` is
 *         false). Errors raised by `fetch` itself propagate unchanged.
 */
export async function fetchPageMetadata(url: string): Promise<PageMetadata> {
  const response = await fetch(url, {
    headers: {
      "user-agent": "Mozilla/5.0 (compatible; page-metadata/1.0)",
      accept: "text/html,application/xhtml+xml",
    },
    // referrerPolicy is a fetch option, not an HTTP header. Inside `headers`
    // it was transmitted as a meaningless "referrerPolicy" request header and
    // had no effect on the referrer behaviour.
    referrerPolicy: "no-referrer",
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
  }

  const html = await response.text();
  const $ = cheerio.load(html);

  // Relative references in the document are relative to the address the
  // content was finally served from, which differs from `url` after a
  // redirect. Fall back to `url` if the response does not report one.
  const baseUrl = response.url || url;

  const title = getTitle($);
  const descriptionFromMeta = getDescription($);
  const mainText = getMainText($);
  // Without a meta description, the first 300 characters of the main text are
  // used; sanitizing again trims whitespace left at the cut.
  const description = sanitizePlainText(descriptionFromMeta || mainText.slice(0, 300));
  const image = getImage($, baseUrl) || getMainImage($, baseUrl);

  return {
    url: toAbsoluteUrl(url, url),
    title: sanitizePlainText(title),
    description,
    image: sanitizePlainText(image),
  };
}
package/package.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "name": "ogscrap",
3
+ "version": "1.0.0",
4
+ "description": "Lightweight utility to fetch a web page and extract clean metadata.",
5
+ "main": "./dist/index.js",
6
+ "scripts": {
7
+ "build" : "tsc"
8
+ },
9
+ "keywords": ["metadata" , "ogscrap", "page-metadata", "scraper"],
10
+ "author": "edah",
11
+ "license": "MIT",
12
+ "type": "module",
13
+ "dependencies": {
14
+ "cheerio": "^1.2.0",
15
+ "dompurify": "^3.3.1",
16
+ "jsdom": "^28.1.0"
17
+ },
18
+ "devDependencies": {
19
+ "@types/jsdom": "^27.0.0"
20
+ }
21
+ }
package/tsconfig.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ // Visit https://aka.ms/tsconfig to read more about this file
3
+ "compilerOptions": {
4
+ // File Layout
5
+ "rootDir": "./app",
6
+ "outDir": "./dist",
7
+
8
+
9
+
10
+
11
+ // Environment Settings
12
+ // See also https://aka.ms/tsconfig/module
13
+ "module": "nodenext",
14
+ "target": "esnext",
15
+ "types": [],
16
+ // For nodejs:
17
+ // "lib": ["esnext"],
18
+ // "types": ["node"],
19
+ // and npm install -D @types/node
20
+
21
+ // Other Outputs
22
+ "sourceMap": false,
23
+ "declaration": true,
24
+ "declarationMap": false,
25
+
26
+ // Stricter Typechecking Options
27
+ "noUncheckedIndexedAccess": true,
28
+ "exactOptionalPropertyTypes": true,
29
+
30
+ // Style Options
31
+ // "noImplicitReturns": true,
32
+ // "noImplicitOverride": true,
33
+ // "noUnusedLocals": true,
34
+ // "noUnusedParameters": true,
35
+ // "noFallthroughCasesInSwitch": true,
36
+ // "noPropertyAccessFromIndexSignature": true,
37
+
38
+ // Recommended Options
39
+ "strict": true,
40
+
41
+ "verbatimModuleSyntax": true,
42
+ "isolatedModules": true,
43
+ "noUncheckedSideEffectImports": true,
44
+ "moduleDetection": "force",
45
+ "skipLibCheck": true,
46
+ },
47
+ "exclude" : [
48
+ "node_modules",
49
+ "./app/example.ts"
50
+ ],
51
+ }