metanova 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,263 @@
1
+ # MetaNova
2
+
3
+ MetaNova is a modular JavaScript and TypeScript library for extracting, analyzing, scoring, and normalizing metadata from web pages and public URLs.
4
+
5
+ It is designed for link previews, bots, bookmark managers, search systems, AI agents, browser extensions, dashboards, CMS integrations, and content aggregation platforms.
6
+
7
+ ## Features
8
+
9
+ - Open Graph, Twitter Cards, JSON-LD, Schema.org, oEmbed discovery, standard HTML metadata, canonical URLs, favicons, images, videos, and audio.
10
+ - Unified typed JSON output with diagnostics.
11
+ - Embedded data extraction from `__NEXT_DATA__`, Nuxt payloads, window state objects, Apollo-like caches, and JSON script blobs.
12
+ - MediaDiscoveryEngine for deterministic image/video/audio candidate unification.
13
+ - Smart image scoring for deterministic `bestImage` selection with human-readable diagnostics.
14
+ - ConfidenceEngine and completeness scoring from 0 to 100.
15
+ - Redirect-aware fetch pipeline with timeouts, retries, byte limits, and SSRF protections.
16
+ - Browser-like request headers by default, with user override options.
17
+ - Site adapter layer for Reddit, Pinterest, Behance, YouTube, TikTok, X/Twitter, Facebook, and Instagram.
18
+ - Plugin API for custom extractors, adapters, and scorers.
19
+ - ESM and CommonJS builds with TypeScript declarations.
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ npm install metanova
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```ts
30
+ import { createPreviewCard, fetchMetadata } from "metanova";
31
+
32
+ const metadata = await fetchMetadata("https://example.com/article");
33
+ const card = createPreviewCard(metadata);
34
+
35
+ console.log(metadata.bestImage);
36
+ console.log(card);
37
+ ```
38
+
39
+ For already-downloaded HTML:
40
+
41
+ ```ts
42
+ import { parseMetadata } from "metanova";
43
+
44
+ const metadata = parseMetadata(html, "https://example.com/article");
45
+ ```
46
+
47
+ ## Unified Output
48
+
49
+ ```ts
50
+ {
51
+ ok: true,
52
+ url: "https://example.com/article",
53
+ finalUrl: "https://example.com/article",
54
+ type: "article",
55
+ title: "...",
56
+ description: "...",
57
+ siteName: "...",
58
+ canonicalUrl: "...",
59
+ confidence: 94,
60
+ completeness: 88,
61
+ bestImage: "...",
62
+ images: [],
63
+ videos: [],
64
+ audio: [],
65
+ favicons: [],
66
+ article: {},
67
+ product: {},
68
+ diagnostics: {
69
+ statusCode: 200,
70
+ contentType: "text/html; charset=utf-8",
71
+ redirects: [],
72
+ sourcesUsed: ["openGraph", "jsonLd", "html"],
73
+ warnings: [],
74
+ trace: ["downloaded page", "parsed Open Graph", "selected image from openGraph"],
75
+ selectedImageReason: "Selected openGraph image with 1200x630, score 100.",
76
+ extractedAt: "..."
77
+ }
78
+ }
79
+ ```
80
+
81
+ ## Public API
82
+
83
+ ```ts
84
+ fetchMetadata(url, options)
85
+ parseMetadata(html, url, options)
86
+ parseMetadataAsync(html, url, options)
87
+ normalizeMetadata(rawSources)
88
+ extractOpenGraph(html)
89
+ extractTwitterCards(html)
90
+ extractJsonLd(html)
91
+ extractOEmbed(html, url)
92
+ extractEmbeddedData(html)
93
+ extractImages(html, baseUrl)
94
+ extractVideos(html, baseUrl)
95
+ extractAudio(html, baseUrl)
96
+ resolveUrl(url, baseUrl)
97
+ scoreImages(images)
98
+ discoverMedia(rawSources, finalUrl)
99
+ calculateConfidence(input)
100
+ calculateCompleteness(input)
101
+ createPreviewCard(metadata)
102
+ MetaNova.use(plugin)
103
+ ```
104
+
105
+ ## Architecture
106
+
107
+ MetaNova is a layered extraction pipeline:
108
+
109
+ 1. URL validation, short-link detection, redirect resolution, and secure fetch.
110
+ 2. Browser-like download with realistic `User-Agent`, `Accept`, `Accept-Language`, and `Accept-Encoding` headers.
111
+ 3. Source extractors for Open Graph, Twitter Cards, JSON-LD, embedded application data, oEmbed, HTML metadata, and media tags.
112
+ 4. Site adapters for social and content platforms.
113
+ 5. `MediaDiscoveryEngine` unifies media candidates from every source.
114
+ 6. Image scoring ranks candidates and explains `bestImage`.
115
+ 7. `ConfidenceEngine` and completeness scoring measure result quality from 0 to 100.
116
+ 8. Normalization returns a stable JSON shape plus diagnostics and extraction trace.
117
+
118
+ ## Adapters
119
+
120
+ Built-in adapters recover title, description, images, videos, author, publish date, and identifiers for Reddit, Pinterest, Behance, YouTube, TikTok, X/Twitter, Facebook, and Instagram. They fall back to embedded app data and discovered media when a page's Open Graph metadata is missing or incomplete.
121
+
122
+ ## Confidence Engine
123
+
124
+ `confidence` is an integer from 0 to 100. It considers title quality, description quality, image quality, canonical URL, structured data, adapter success, embedded data, and warnings.
125
+
126
+ `completeness` is also scored from 0 to 100. It measures how many useful preview fields are present.
127
+
128
+ `reliability` is scored from 0 to 100 and combines confidence, completeness, adapter success, media quality, and warnings.
129
+
130
+ ## Media Discovery Engine
131
+
132
+ The engine searches Open Graph, Twitter Cards, JSON-LD, embedded app data, oEmbed, HTML images, `srcset`, `picture`, lazy-loaded attributes, video posters, social platform media, and fallback images. It resolves relative URLs, filters weak candidates, and deduplicates near-identical media.
133
+
134
+ ## Diagnostics And Extraction Trace
135
+
136
+ Diagnostics include `sourcesUsed`, `warnings`, `redirects`, `selectedImageReason`, and `trace`.
137
+
138
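+ The result also records which source supplied each important field (`sources`) and which site adapter matched (`diagnostics.adapter`), as in the first example below; the second example shows a sample `diagnostics.trace`: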
139
+
140
+ ```json
141
+ {
142
+ "sources": {
143
+ "title": "jsonLd",
144
+ "description": "openGraph",
145
+ "author": "youtubeAdapter",
146
+ "image": "twitter"
147
+ },
148
+ "diagnostics": {
149
+ "adapter": {
150
+ "matched": true,
151
+ "name": "youtubeAdapter",
152
+ "confidence": 95
153
+ }
154
+ }
155
+ }
156
+ ```
157
+
158
+ ```json
159
+ [
160
+ "downloaded page",
161
+ "parsed Open Graph",
162
+ "parsed JSON-LD",
163
+ "adapter matched: redditAdapter",
164
+ "selected image from redditAdapter (openGraph)"
165
+ ]
166
+ ```
167
+
168
+ ## Performance
169
+
170
+ MetaNova stays lightweight by default: no browser automation in core, bounded response size, request timeouts, retry controls, cache hooks, and mostly synchronous parsing for already-downloaded HTML.
171
+
172
+ ## Security Defaults
173
+
174
+ `fetchMetadata` blocks risky targets by default:
175
+
176
+ - `localhost`
177
+ - loopback IPs
178
+ - private network IPs
179
+ - link-local and reserved networks
180
+ - unsupported protocols
181
+ - oversized responses
182
+ - malicious redirect targets
183
+
184
+ You can opt into trusted internal targets:
185
+
186
+ ```ts
187
+ await fetchMetadata("http://localhost:3000", {
188
+ allowLocalhost: true,
189
+ allowPrivateNetwork: true
190
+ });
191
+ ```
192
+
193
+ ## Plugins
194
+
195
+ ```ts
196
+ import { MetaNova, type MetaNovaPlugin } from "metanova";
197
+
198
+ const plugin: MetaNovaPlugin = {
199
+ name: "custom-docs",
200
+ setup(api) {
201
+ api.addExtractor("docs", ({ $ }) => ({
202
+ source: "docs",
203
+ title: $("meta[name='doc:title']").attr("content"),
204
+ siteName: "Docs"
205
+ }));
206
+
207
+ api.addImageScorer((image) => (image.url.includes("/hero/") ? 10 : 0));
208
+ }
209
+ };
210
+
211
+ MetaNova.use(plugin);
212
+ ```
213
+
214
+ ## Development
215
+
216
+ ```bash
217
+ npm install
218
+ npm run typecheck
219
+ npm test
220
+ npm run lint
221
+ npm run build
222
+ ```
223
+
224
+ Mock examples use local HTML fixtures and do not require network access:
225
+
226
+ ```bash
227
+ node examples/reddit.mjs
228
+ node examples/pinterest.mjs
229
+ node examples/behance.mjs
230
+ node examples/youtube.mjs
231
+ node examples/diagnostics.mjs
232
+ ```
233
+
234
+ Live network examples take the target URL as a command-line argument; no real URLs are hard-coded in the scripts (replace `https://example.com` with the page you want to inspect):
235
+
236
+ ```bash
237
+ node examples/live-fetch.mjs https://example.com
238
+ node examples/youtube-video.mjs https://example.com
239
+ node examples/youtube-playlist.mjs https://example.com
240
+ node examples/social-preview.mjs https://example.com
241
+ ```
242
+
243
+ For a full from-zero walkthrough, see [USAGE_GUIDE.md](./USAGE_GUIDE.md).
244
+
245
+ ## Project Layout
246
+
247
+ ```txt
248
+ src/
249
+ adapters/
250
+ diagnostics/
251
+ extractors/
252
+ fetcher/
253
+ normalizers/
254
+ plugins/
255
+ scorers/
256
+ types/
257
+ utils/
258
+ index.ts
259
+ ```
260
+
261
+ ## Publishing
262
+
263
+ The package is configured with dual ESM/CommonJS exports, generated type declarations, source maps, an npm `files` allowlist, and Node engine constraints. Run `npm run build` before publishing.