@d-zero/beholder 2.1.6 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +44 -0
  2. package/README.md +26 -0
  3. package/dist/dom-evaluation.d.ts +72 -24
  4. package/dist/dom-evaluation.js +310 -84
  5. package/dist/extract-meta.d.ts +98 -0
  6. package/dist/extract-meta.js +75 -0
  7. package/dist/index.d.ts +3 -1
  8. package/dist/index.js +1 -0
  9. package/dist/meta/classify.d.ts +52 -0
  10. package/dist/meta/classify.js +731 -0
  11. package/dist/meta/collect-head.d.ts +63 -0
  12. package/dist/meta/collect-head.js +223 -0
  13. package/dist/meta/id-extractors.d.ts +40 -0
  14. package/dist/meta/id-extractors.js +196 -0
  15. package/dist/meta/keys.d.ts +41 -0
  16. package/dist/meta/keys.js +507 -0
  17. package/dist/meta/parsers.d.ts +74 -0
  18. package/dist/meta/parsers.js +293 -0
  19. package/dist/meta/tag-detection.d.ts +59 -0
  20. package/dist/meta/tag-detection.js +120 -0
  21. package/dist/meta/types.d.ts +874 -0
  22. package/dist/meta/types.js +12 -0
  23. package/dist/scraper.js +15 -13
  24. package/dist/types.d.ts +3 -38
  25. package/package.json +8 -5
  26. package/src/dom-evaluation.spec.ts +301 -73
  27. package/src/dom-evaluation.ts +417 -88
  28. package/src/extract-meta.spec.ts +247 -0
  29. package/src/extract-meta.ts +121 -0
  30. package/src/index.ts +45 -0
  31. package/src/meta/classify.spec.ts +281 -0
  32. package/src/meta/classify.ts +810 -0
  33. package/src/meta/collect-head.ts +247 -0
  34. package/src/meta/id-extractors.spec.ts +69 -0
  35. package/src/meta/id-extractors.ts +206 -0
  36. package/src/meta/keys.ts +568 -0
  37. package/src/meta/parsers.spec.ts +178 -0
  38. package/src/meta/parsers.ts +304 -0
  39. package/src/meta/simple-wappalyzer.d.ts +37 -0
  40. package/src/meta/tag-detection.spec.ts +134 -0
  41. package/src/meta/tag-detection.ts +161 -0
  42. package/src/meta/types.ts +949 -0
  43. package/src/scraper.ts +19 -13
  44. package/src/types.ts +49 -55
  45. package/tsconfig.tsbuildinfo +1 -1
package/src/meta/classify.spec.ts
@@ -0,0 +1,281 @@
1
+ import type { RawHeadEntry } from './types.js';
2
+
3
+ import { describe, expect, it } from 'vitest';
4
+
5
+ import { classify, emptyMeta, setByPath } from './classify.js';
6
+
7
+ describe('emptyMeta', () => {
8
+ it('initializes all required fields with empty values', () => {
9
+ const meta = emptyMeta();
10
+ expect(meta.title).toBe('');
11
+ expect(meta.originTrial).toEqual([]);
12
+ expect(meta.jsonLd).toEqual([]);
13
+ expect(meta.speculationRules).toEqual([]);
14
+ expect(meta.tags).toEqual({ detected: {}, entries: [] });
15
+ expect(meta.others).toEqual({
16
+ meta: {},
17
+ property: {},
18
+ httpEquiv: {},
19
+ itemprop: {},
20
+ link: [],
21
+ script: [],
22
+ iframe: [],
23
+ });
24
+ });
25
+ });
26
+
27
+ describe('setByPath', () => {
28
+ it('creates intermediate objects', () => {
29
+ const obj: Record<string, unknown> = {};
30
+ setByPath(obj, 'a.b.c', 1, false);
31
+ expect(obj).toEqual({ a: { b: { c: 1 } } });
32
+ });
33
+
34
+ it('keeps first assignment when multi=false', () => {
35
+ const obj: Record<string, unknown> = {};
36
+ setByPath(obj, 'k', 'first', false);
37
+ setByPath(obj, 'k', 'second', false);
38
+ expect(obj.k).toBe('first');
39
+ });
40
+
41
+ it('appends to leaf array when multi=true', () => {
42
+ const obj: Record<string, unknown> = {};
43
+ setByPath(obj, 'list', 'a', true);
44
+ setByPath(obj, 'list', 'b', true);
45
+ expect(obj.list).toEqual(['a', 'b']);
46
+ });
47
+ });
48
+
49
+ describe('classify', () => {
50
+ it('captures title and html attributes', () => {
51
+ const raw: RawHeadEntry[] = [
52
+ {
53
+ kind: 'html',
54
+ lang: 'ja',
55
+ dir: 'ltr',
56
+ prefix: 'og: https://ogp.me/ns#',
57
+ itemscope: true,
58
+ itemtype: 'https://schema.org/WebSite',
59
+ },
60
+ { kind: 'title', content: 'Example' },
61
+ ];
62
+ const meta = classify(raw);
63
+ expect(meta.title).toBe('Example');
64
+ expect(meta.lang).toBe('ja');
65
+ expect(meta.dir).toBe('ltr');
66
+ expect(meta.prefix).toBe('og: https://ogp.me/ns#');
67
+ expect(meta.rdfa?.prefix).toBe('og: https://ogp.me/ns#');
68
+ expect(meta.itemType).toBe('https://schema.org/WebSite');
69
+ expect(meta.microdata?.itemscope).toBe(true);
70
+ });
71
+
72
+ it('routes meta name="description" and meta property="og:image"', () => {
73
+ const raw: RawHeadEntry[] = [
74
+ { kind: 'meta', name: 'description', content: 'Page desc' },
75
+ { kind: 'meta', property: 'og:image', content: 'https://x.test/a.png' },
76
+ { kind: 'meta', property: 'og:image', content: 'https://x.test/b.png' },
77
+ ];
78
+ const meta = classify(raw);
79
+ expect(meta.description).toBe('Page desc');
80
+ expect(meta.og?.image).toEqual(['https://x.test/a.png', 'https://x.test/b.png']);
81
+ });
82
+
83
+ it('parses viewport meta', () => {
84
+ const raw: RawHeadEntry[] = [
85
+ { kind: 'meta', name: 'viewport', content: 'width=device-width, initial-scale=1' },
86
+ ];
87
+ const meta = classify(raw);
88
+ expect(meta.viewport?.width).toBe('device-width');
89
+ expect(meta.viewport?.initialScale).toBe(1);
90
+ });
91
+
92
+ it('parses robots meta with mixed flags and directives', () => {
93
+ const raw: RawHeadEntry[] = [
94
+ { kind: 'meta', name: 'robots', content: 'noindex, max-snippet:30' },
95
+ ];
96
+ const meta = classify(raw);
97
+ expect(meta.robots?.noindex).toBe(true);
98
+ expect(meta.robots?.maxSnippet).toBe(30);
99
+ });
100
+
101
+ it('routes theme-color by media attribute', () => {
102
+ const raw: RawHeadEntry[] = [
103
+ { kind: 'meta', name: 'theme-color', content: '#fff' },
104
+ {
105
+ kind: 'meta',
106
+ name: 'theme-color',
107
+ content: '#000',
108
+ media: '(prefers-color-scheme: dark)',
109
+ },
110
+ {
111
+ kind: 'meta',
112
+ name: 'theme-color',
113
+ content: '#eee',
114
+ media: '(prefers-color-scheme: light)',
115
+ },
116
+ ];
117
+ const meta = classify(raw);
118
+ expect(meta.themeColor).toBe('#fff');
119
+ expect(meta.themeColorDark).toBe('#000');
120
+ expect(meta.themeColorLight).toBe('#eee');
121
+ });
122
+
123
+ it('parses http-equiv refresh', () => {
124
+ const raw: RawHeadEntry[] = [
125
+ { kind: 'meta', httpEquiv: 'refresh', content: '5; url=https://example.com/' },
126
+ ];
127
+ const meta = classify(raw);
128
+ expect(meta.httpEquiv?.refresh?.seconds).toBe(5);
129
+ expect(meta.httpEquiv?.refresh?.url).toBe('https://example.com/');
130
+ });
131
+
132
+ it('routes canonical link to Meta.link.canonical', () => {
133
+ const raw: RawHeadEntry[] = [
134
+ { kind: 'link', rel: ['canonical'], href: 'https://example.com/' },
135
+ ];
136
+ const meta = classify(raw);
137
+ expect(meta.link?.canonical).toBe('https://example.com/');
138
+ });
139
+
140
+ it('routes alternate rss feed by type', () => {
141
+ const raw: RawHeadEntry[] = [
142
+ {
143
+ kind: 'link',
144
+ rel: ['alternate'],
145
+ href: '/feed.xml',
146
+ type: 'application/rss+xml',
147
+ title: 'RSS',
148
+ },
149
+ ];
150
+ const meta = classify(raw);
151
+ expect(meta.link?.alternateRss).toHaveLength(1);
152
+ expect(meta.link?.alternateRss[0]?.title).toBe('RSS');
153
+ });
154
+
155
+ it('refines icon by sizes/type', () => {
156
+ const raw: RawHeadEntry[] = [
157
+ { kind: 'link', rel: ['icon'], href: '/icon.svg', type: 'image/svg+xml' },
158
+ { kind: 'link', rel: ['icon'], href: '/icon.ico' },
159
+ { kind: 'link', rel: ['icon'], href: '/icon-32.png', sizes: '32x32' },
160
+ { kind: 'link', rel: ['icon'], href: '/icon-any.png', sizes: 'any' },
161
+ ];
162
+ const meta = classify(raw);
163
+ expect(meta.link?.iconSvg?.href).toBe('/icon.svg');
164
+ expect(meta.link?.icon?.href).toBe('/icon.ico');
165
+ expect(meta.link?.iconSized).toHaveLength(1);
166
+ expect(meta.link?.iconAny?.href).toBe('/icon-any.png');
167
+ });
168
+
169
+ it('refines apple-touch-icon by sizes', () => {
170
+ const raw: RawHeadEntry[] = [
171
+ { kind: 'link', rel: ['apple-touch-icon'], href: '/apple-touch-icon.png' },
172
+ {
173
+ kind: 'link',
174
+ rel: ['apple-touch-icon'],
175
+ href: '/apple-touch-icon-180.png',
176
+ sizes: '180x180',
177
+ },
178
+ ];
179
+ const meta = classify(raw);
180
+ expect(meta.link?.appleTouchIcon?.href).toBe('/apple-touch-icon.png');
181
+ expect(meta.link?.appleTouchIconSized).toHaveLength(1);
182
+ });
183
+
184
+ it('preserves unknown meta name in others.meta', () => {
185
+ const raw: RawHeadEntry[] = [
186
+ { kind: 'meta', name: 'x-d-zero-custom', content: 'value-a' },
187
+ { kind: 'meta', name: 'x-d-zero-custom', content: 'value-b' },
188
+ ];
189
+ const meta = classify(raw);
190
+ expect(meta.others.meta['x-d-zero-custom']).toEqual(['value-a', 'value-b']);
191
+ });
192
+
193
+ it('preserves unknown link rel in others.link', () => {
194
+ const raw: RawHeadEntry[] = [{ kind: 'link', rel: ['some-future-rel'], href: '/x' }];
195
+ const meta = classify(raw);
196
+ expect(meta.others.link).toHaveLength(1);
197
+ expect(meta.others.link[0]?.href).toBe('/x');
198
+ });
199
+
200
+ it('parses application/ld+json into jsonLd', () => {
201
+ const raw: RawHeadEntry[] = [
202
+ {
203
+ kind: 'script',
204
+ scriptType: 'application/ld+json',
205
+ content: '{"@type":"WebSite","name":"X"}',
206
+ location: 'head',
207
+ },
208
+ ];
209
+ const meta = classify(raw);
210
+ expect(meta.jsonLd).toHaveLength(1);
211
+ expect(meta.jsonLd[0]?.parsed).toEqual({ '@type': 'WebSite', name: 'X' });
212
+ });
213
+
214
+ it('records jsonLd parseError on invalid JSON', () => {
215
+ const raw: RawHeadEntry[] = [
216
+ {
217
+ kind: 'script',
218
+ scriptType: 'application/ld+json',
219
+ content: '{not valid',
220
+ location: 'head',
221
+ },
222
+ ];
223
+ const meta = classify(raw);
224
+ expect(meta.jsonLd).toHaveLength(1);
225
+ expect(meta.jsonLd[0]?.parseError).toBeDefined();
226
+ });
227
+
228
+ it('captures iframes into others.iframe', () => {
229
+ const raw: RawHeadEntry[] = [
230
+ {
231
+ kind: 'iframe',
232
+ src: 'https://www.googletagmanager.com/ns.html?id=GTM-XYZ',
233
+ location: 'noscript',
234
+ },
235
+ ];
236
+ const meta = classify(raw);
237
+ expect(meta.others.iframe).toHaveLength(1);
238
+ expect(meta.others.iframe[0]?.location).toBe('noscript');
239
+ });
240
+
241
+ it('writes cross-reference paths: msapplication-config goes to both', () => {
242
+ const raw: RawHeadEntry[] = [
243
+ { kind: 'meta', name: 'msapplication-config', content: '/browserconfig.xml' },
244
+ ];
245
+ const meta = classify(raw);
246
+ expect(meta.msapplication?.config).toBe('/browserconfig.xml');
247
+ expect(meta.msapplication?.configFile).toBe('/browserconfig.xml');
248
+ });
249
+
250
+ it('writes verification.google for google-site-verification', () => {
251
+ const raw: RawHeadEntry[] = [
252
+ { kind: 'meta', name: 'google-site-verification', content: 'abc123' },
253
+ ];
254
+ const meta = classify(raw);
255
+ expect(meta.verification?.google).toBe('abc123');
256
+ });
257
+
258
+ it('honors `includeRaw` option', () => {
259
+ const raw: RawHeadEntry[] = [{ kind: 'title', content: 'X' }];
260
+ const meta = classify(raw, { includeRaw: true });
261
+ expect(meta._raw).toBe(raw);
262
+ });
263
+
264
+ it('integrates external tags option', () => {
265
+ const meta = classify([], {
266
+ tags: {
267
+ detected: { Analytics: { 'Google Analytics': { ids: ['G-1'] } } },
268
+ entries: [
269
+ {
270
+ provider: 'Google Analytics',
271
+ categories: ['Analytics'],
272
+ id: 'G-1',
273
+ sources: [{ type: 'html' }],
274
+ },
275
+ ],
276
+ },
277
+ });
278
+ expect(meta.tags.detected.Analytics?.['Google Analytics']?.ids).toEqual(['G-1']);
279
+ expect(meta.tags.entries).toHaveLength(1);
280
+ });
281
+ });