metanova 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +263 -0
- package/USAGE_GUIDE.md +829 -0
- package/dist/index.cjs +3756 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +493 -0
- package/dist/index.d.ts +493 -0
- package/dist/index.js +3674 -0
- package/dist/index.js.map +1 -0
- package/examples/behance.mjs +23 -0
- package/examples/commonjs.cjs +12 -0
- package/examples/custom-adapter.mjs +41 -0
- package/examples/custom-plugin.mjs +26 -0
- package/examples/diagnostics.mjs +17 -0
- package/examples/live-fetch.mjs +21 -0
- package/examples/parse-html.mjs +15 -0
- package/examples/pinterest.mjs +22 -0
- package/examples/preview-card.mjs +11 -0
- package/examples/quick-start.mjs +24 -0
- package/examples/reddit.mjs +23 -0
- package/examples/social-links.mjs +28 -0
- package/examples/social-preview.mjs +21 -0
- package/examples/youtube-playlist.mjs +19 -0
- package/examples/youtube-video.mjs +22 -0
- package/examples/youtube.mjs +22 -0
- package/package.json +70 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
# MetaNova
|
|
2
|
+
|
|
3
|
+
MetaNova is a modular JavaScript and TypeScript library for extracting, analyzing, scoring, and normalizing metadata from web pages and public URLs.
|
|
4
|
+
|
|
5
|
+
It is designed for link previews, bots, bookmark managers, search systems, AI agents, browser extensions, dashboards, CMS integrations, and content aggregation platforms.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Open Graph, Twitter Cards, JSON-LD, Schema.org, oEmbed discovery, standard HTML metadata, canonical URLs, favicons, images, videos, and audio.
|
|
10
|
+
- Unified typed JSON output with diagnostics.
|
|
11
|
+
- Embedded data extraction from `__NEXT_DATA__`, Nuxt payloads, window state objects, Apollo-like caches, and JSON script blobs.
|
|
12
|
+
- MediaDiscoveryEngine for deterministic image/video/audio candidate unification.
|
|
13
|
+
- Smart image scoring for deterministic `bestImage` selection with human-readable diagnostics.
|
|
14
|
+
- ConfidenceEngine and completeness scoring from 0 to 100.
|
|
15
|
+
- Redirect-aware fetch pipeline with timeouts, retries, byte limits, and SSRF protections.
|
|
16
|
+
- Browser-like request headers by default, with user override options.
|
|
17
|
+
- Site adapter layer for Reddit, Pinterest, Behance, YouTube, TikTok, X/Twitter, Facebook, and Instagram.
|
|
18
|
+
- Plugin API for custom extractors, adapters, and scorers.
|
|
19
|
+
- ESM and CommonJS builds with TypeScript declarations.
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npm install metanova
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
```ts
|
|
30
|
+
import { createPreviewCard, fetchMetadata } from "metanova";
|
|
31
|
+
|
|
32
|
+
const metadata = await fetchMetadata("https://example.com/article");
|
|
33
|
+
const card = createPreviewCard(metadata);
|
|
34
|
+
|
|
35
|
+
console.log(metadata.bestImage);
|
|
36
|
+
console.log(card);
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
For already-downloaded HTML:
|
|
40
|
+
|
|
41
|
+
```ts
|
|
42
|
+
import { parseMetadata } from "metanova";
|
|
43
|
+
|
|
44
|
+
const metadata = parseMetadata(html, "https://example.com/article");
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Unified Output
|
|
48
|
+
|
|
49
|
+
```ts
|
|
50
|
+
{
|
|
51
|
+
ok: true,
|
|
52
|
+
url: "https://example.com/article",
|
|
53
|
+
finalUrl: "https://example.com/article",
|
|
54
|
+
type: "article",
|
|
55
|
+
title: "...",
|
|
56
|
+
description: "...",
|
|
57
|
+
siteName: "...",
|
|
58
|
+
canonicalUrl: "...",
|
|
59
|
+
confidence: 94,
|
|
60
|
+
completeness: 88,
|
|
61
|
+
bestImage: "...",
|
|
62
|
+
images: [],
|
|
63
|
+
videos: [],
|
|
64
|
+
audio: [],
|
|
65
|
+
favicons: [],
|
|
66
|
+
article: {},
|
|
67
|
+
product: {},
|
|
68
|
+
diagnostics: {
|
|
69
|
+
statusCode: 200,
|
|
70
|
+
contentType: "text/html; charset=utf-8",
|
|
71
|
+
redirects: [],
|
|
72
|
+
sourcesUsed: ["openGraph", "jsonLd", "html"],
|
|
73
|
+
warnings: [],
|
|
74
|
+
trace: ["downloaded page", "parsed Open Graph", "selected image from openGraph"],
|
|
75
|
+
selectedImageReason: "Selected openGraph image with 1200x630, score 100.",
|
|
76
|
+
extractedAt: "..."
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Public API
|
|
82
|
+
|
|
83
|
+
```ts
|
|
84
|
+
fetchMetadata(url, options)
|
|
85
|
+
parseMetadata(html, url, options)
|
|
86
|
+
parseMetadataAsync(html, url, options)
|
|
87
|
+
normalizeMetadata(rawSources)
|
|
88
|
+
extractOpenGraph(html)
|
|
89
|
+
extractTwitterCards(html)
|
|
90
|
+
extractJsonLd(html)
|
|
91
|
+
extractOEmbed(html, url)
|
|
92
|
+
extractEmbeddedData(html)
|
|
93
|
+
extractImages(html, baseUrl)
|
|
94
|
+
extractVideos(html, baseUrl)
|
|
95
|
+
extractAudio(html, baseUrl)
|
|
96
|
+
resolveUrl(url, baseUrl)
|
|
97
|
+
scoreImages(images)
|
|
98
|
+
discoverMedia(rawSources, finalUrl)
|
|
99
|
+
calculateConfidence(input)
|
|
100
|
+
calculateCompleteness(input)
|
|
101
|
+
createPreviewCard(metadata)
|
|
102
|
+
MetaNova.use(plugin)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Architecture
|
|
106
|
+
|
|
107
|
+
MetaNova is a layered extraction pipeline:
|
|
108
|
+
|
|
109
|
+
1. URL validation, short-link detection, redirect resolution, and secure fetch.
|
|
110
|
+
2. Browser-like download with realistic `User-Agent`, `Accept`, `Accept-Language`, and `Accept-Encoding` headers.
|
|
111
|
+
3. Source extractors for Open Graph, Twitter Cards, JSON-LD, embedded application data, oEmbed, HTML metadata, and media tags.
|
|
112
|
+
4. Site adapters for social and content platforms.
|
|
113
|
+
5. `MediaDiscoveryEngine` unifies media candidates from every source.
|
|
114
|
+
6. Image scoring ranks candidates and explains `bestImage`.
|
|
115
|
+
7. `ConfidenceEngine` and completeness scoring measure result quality from 0 to 100.
|
|
116
|
+
8. Normalization returns a stable JSON shape plus diagnostics and extraction trace.
|
|
117
|
+
|
|
118
|
+
## Adapters
|
|
119
|
+
|
|
120
|
+
Built-in adapters recover title, description, images, videos, author, publish date, and identifiers for Reddit, Pinterest, Behance, YouTube, TikTok, X/Twitter, Facebook, and Instagram. They use embedded app data and discovered media as fallbacks when Open Graph is weak.
|
|
121
|
+
|
|
122
|
+
## Confidence Engine
|
|
123
|
+
|
|
124
|
+
`confidence` is an integer from 0 to 100. It considers title quality, description quality, image quality, canonical URL, structured data, adapter success, embedded data, and warnings.
|
|
125
|
+
|
|
126
|
+
`completeness` is also 0 to 100. It measures how many useful preview fields are present.
|
|
127
|
+
|
|
128
|
+
`reliability` is 0 to 100 and combines confidence, completeness, adapter success, media quality, and warnings.
|
|
129
|
+
|
|
130
|
+
## Media Discovery Engine
|
|
131
|
+
|
|
132
|
+
The engine searches Open Graph, Twitter Cards, JSON-LD, embedded app data, oEmbed, HTML images, `srcset`, `picture`, lazy-loaded attributes, video posters, social platform media, and fallback images. It resolves relative URLs, filters weak candidates, and deduplicates near-identical media.
|
|
133
|
+
|
|
134
|
+
## Diagnostics And Extraction Trace
|
|
135
|
+
|
|
136
|
+
Diagnostics include `sourcesUsed`, `warnings`, `redirects`, `selectedImageReason`, and `trace`.
|
|
137
|
+
|
|
138
|
+
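Important fields also include source attribution in a `sources` map, shown in the first example below; the second example shows a `diagnostics.trace` array: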
|
|
139
|
+
|
|
140
|
+
```json
|
|
141
|
+
{
|
|
142
|
+
"sources": {
|
|
143
|
+
"title": "jsonLd",
|
|
144
|
+
"description": "openGraph",
|
|
145
|
+
"author": "youtubeAdapter",
|
|
146
|
+
"image": "twitter"
|
|
147
|
+
},
|
|
148
|
+
"diagnostics": {
|
|
149
|
+
"adapter": {
|
|
150
|
+
"matched": true,
|
|
151
|
+
"name": "youtubeAdapter",
|
|
152
|
+
"confidence": 95
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
```json
|
|
159
|
+
[
|
|
160
|
+
"downloaded page",
|
|
161
|
+
"parsed Open Graph",
|
|
162
|
+
"parsed JSON-LD",
|
|
163
|
+
"adapter matched: redditAdapter",
|
|
164
|
+
"selected image from redditAdapter (openGraph)"
|
|
165
|
+
]
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Performance
|
|
169
|
+
|
|
170
|
+
MetaNova stays lightweight by default: no browser automation in core, bounded response size, request timeouts, retry controls, cache hooks, and mostly synchronous parsing for already-downloaded HTML.
|
|
171
|
+
|
|
172
|
+
## Security Defaults
|
|
173
|
+
|
|
174
|
+
`fetchMetadata` blocks risky targets by default:
|
|
175
|
+
|
|
176
|
+
- `localhost`
|
|
177
|
+
- loopback IPs
|
|
178
|
+
- private network IPs
|
|
179
|
+
- link-local and reserved networks
|
|
180
|
+
- unsupported protocols
|
|
181
|
+
- oversized responses
|
|
182
|
+
- malicious redirect targets
|
|
183
|
+
|
|
184
|
+
You can opt into trusted internal targets:
|
|
185
|
+
|
|
186
|
+
```ts
|
|
187
|
+
await fetchMetadata("http://localhost:3000", {
|
|
188
|
+
allowLocalhost: true,
|
|
189
|
+
allowPrivateNetwork: true
|
|
190
|
+
});
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Plugins
|
|
194
|
+
|
|
195
|
+
```ts
|
|
196
|
+
import { MetaNova, type MetaNovaPlugin } from "metanova";
|
|
197
|
+
|
|
198
|
+
const plugin: MetaNovaPlugin = {
|
|
199
|
+
name: "custom-docs",
|
|
200
|
+
setup(api) {
|
|
201
|
+
api.addExtractor("docs", ({ $ }) => ({
|
|
202
|
+
source: "docs",
|
|
203
|
+
title: $("meta[name='doc:title']").attr("content"),
|
|
204
|
+
siteName: "Docs"
|
|
205
|
+
}));
|
|
206
|
+
|
|
207
|
+
api.addImageScorer((image) => (image.url.includes("/hero/") ? 10 : 0));
|
|
208
|
+
}
|
|
209
|
+
};
|
|
210
|
+
|
|
211
|
+
MetaNova.use(plugin);
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Development
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
npm install
|
|
218
|
+
npm run typecheck
|
|
219
|
+
npm test
|
|
220
|
+
npm run lint
|
|
221
|
+
npm run build
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Mock examples use local HTML fixtures and do not require network access:
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
node examples/reddit.mjs
|
|
228
|
+
node examples/pinterest.mjs
|
|
229
|
+
node examples/behance.mjs
|
|
230
|
+
node examples/youtube.mjs
|
|
231
|
+
node examples/diagnostics.mjs
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Live network examples take the target URL as a command-line argument and intentionally do not hard-code any URLs in the scripts:
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
node examples/live-fetch.mjs https://example.com
|
|
238
|
+
node examples/youtube-video.mjs https://example.com
|
|
239
|
+
node examples/youtube-playlist.mjs https://example.com
|
|
240
|
+
node examples/social-preview.mjs https://example.com
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
For a full from-zero walkthrough, see [USAGE_GUIDE.md](./USAGE_GUIDE.md).
|
|
244
|
+
|
|
245
|
+
## Project Layout
|
|
246
|
+
|
|
247
|
+
```txt
|
|
248
|
+
src/
|
|
249
|
+
adapters/
|
|
250
|
+
diagnostics/
|
|
251
|
+
extractors/
|
|
252
|
+
fetcher/
|
|
253
|
+
normalizers/
|
|
254
|
+
plugins/
|
|
255
|
+
scorers/
|
|
256
|
+
types/
|
|
257
|
+
utils/
|
|
258
|
+
index.ts
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## Publishing
|
|
262
|
+
|
|
263
|
+
The package is configured with dual ESM/CommonJS exports, generated type declarations, source maps, an npm `files` allowlist, and Node engine constraints. Run `npm run build` before publishing.
|