@jackwener/opencli 0.7.0 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -28
- package/README.md +6 -5
- package/README.zh-CN.md +5 -4
- package/SKILL.md +18 -4
- package/dist/browser.js +2 -3
- package/dist/cli-manifest.json +195 -22
- package/dist/clis/linkedin/search.d.ts +1 -0
- package/dist/clis/linkedin/search.js +366 -0
- package/dist/clis/reddit/read.d.ts +1 -0
- package/dist/clis/reddit/read.js +184 -0
- package/dist/clis/youtube/transcript-group.d.ts +44 -0
- package/dist/clis/youtube/transcript-group.js +226 -0
- package/dist/clis/youtube/transcript-group.test.d.ts +1 -0
- package/dist/clis/youtube/transcript-group.test.js +99 -0
- package/dist/clis/youtube/transcript.d.ts +1 -0
- package/dist/clis/youtube/transcript.js +264 -0
- package/dist/clis/youtube/utils.d.ts +8 -0
- package/dist/clis/youtube/utils.js +28 -0
- package/dist/clis/youtube/video.d.ts +1 -0
- package/dist/clis/youtube/video.js +114 -0
- package/dist/engine.js +2 -1
- package/dist/main.js +10 -2
- package/dist/output.js +2 -1
- package/dist/registry.d.ts +1 -8
- package/dist/snapshotFormatter.d.ts +9 -0
- package/dist/snapshotFormatter.js +352 -15
- package/dist/snapshotFormatter.test.d.ts +7 -0
- package/dist/snapshotFormatter.test.js +521 -0
- package/dist/validate.d.ts +14 -2
- package/dist/verify.d.ts +14 -2
- package/package.json +2 -2
- package/src/browser.ts +2 -4
- package/src/clis/linkedin/search.ts +416 -0
- package/src/clis/reddit/read.ts +186 -0
- package/src/clis/youtube/transcript-group.test.ts +108 -0
- package/src/clis/youtube/transcript-group.ts +287 -0
- package/src/clis/youtube/transcript.ts +280 -0
- package/src/clis/youtube/utils.ts +28 -0
- package/src/clis/youtube/video.ts +116 -0
- package/src/engine.ts +4 -1
- package/src/main.ts +10 -2
- package/src/output.ts +2 -1
- package/src/registry.ts +1 -8
- package/src/snapshotFormatter.test.ts +579 -0
- package/src/snapshotFormatter.ts +399 -13
- package/src/validate.ts +19 -4
- package/src/verify.ts +17 -3
- package/vitest.config.ts +15 -1
- package/dist/clis/reddit/read.yaml +0 -76
- package/src/clis/reddit/read.yaml +0 -76
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YouTube video metadata — read ytInitialPlayerResponse + ytInitialData from video page.
|
|
3
|
+
*/
|
|
4
|
+
import { cli, Strategy } from '../../registry.js';
|
|
5
|
+
import { parseVideoId } from './utils.js';
|
|
6
|
+
cli({
    site: 'youtube',
    name: 'video',
    description: 'Get YouTube video metadata (title, views, description, etc.)',
    domain: 'www.youtube.com',
    strategy: Strategy.COOKIE,
    args: [
        { name: 'url', required: true, help: 'YouTube video URL or video ID' },
    ],
    columns: ['field', 'value'],
    /**
     * Open the watch page for the requested video and read its metadata from
     * the page globals `ytInitialPlayerResponse` and `ytInitialData`.
     *
     * @param page   Browser page handle; must provide goto(), wait() and evaluate().
     * @param kwargs Parsed CLI arguments; `kwargs.url` is a video URL or a bare video ID.
     * @returns An array of `{ field, value }` rows, one per metadata field, with
     *          every value converted to a string.
     * @throws Error when the page yields no metadata object, or when the in-page
     *         script reports that `ytInitialPlayerResponse` is missing.
     */
    func: async (page, kwargs) => {
        const videoId = parseVideoId(kwargs.url);
        // Percent-encode the id so that characters such as '&' or '#' in
        // unexpected input cannot add extra query parameters to the watch URL.
        const videoUrl = `https://www.youtube.com/watch?v=${encodeURIComponent(videoId)}`;
        await page.goto(videoUrl);
        // Fixed pause before reading the page globals.
        // NOTE(review): the unit of wait() is presumably seconds — confirm against the page API.
        await page.wait(3);
        // The script below runs inside the page; it returns either a flat
        // metadata object or `{ error }` when the player response is absent.
        const data = await page.evaluate(`
      (async () => {
        const player = window.ytInitialPlayerResponse;
        const yt = window.ytInitialData;
        if (!player) return { error: 'ytInitialPlayerResponse not found' };

        const details = player.videoDetails || {};
        const microformat = player.microformat?.playerMicroformatRenderer || {};

        // Try to get full description from ytInitialData
        let fullDescription = details.shortDescription || '';
        try {
          const contents = yt?.contents?.twoColumnWatchNextResults
            ?.results?.results?.contents;
          if (contents) {
            for (const c of contents) {
              const desc = c.videoSecondaryInfoRenderer?.attributedDescription?.content;
              if (desc) { fullDescription = desc; break; }
            }
          }
        } catch {}

        // Get like count if available
        let likes = '';
        try {
          const contents = yt?.contents?.twoColumnWatchNextResults
            ?.results?.results?.contents;
          if (contents) {
            for (const c of contents) {
              const buttons = c.videoPrimaryInfoRenderer?.videoActions
                ?.menuRenderer?.topLevelButtons;
              if (buttons) {
                for (const b of buttons) {
                  const toggle = b.segmentedLikeDislikeButtonViewModel
                    ?.likeButtonViewModel?.likeButtonViewModel?.toggleButtonViewModel
                    ?.toggleButtonViewModel?.defaultButtonViewModel?.buttonViewModel;
                  if (toggle?.title) { likes = toggle.title; break; }
                }
              }
            }
          }
        } catch {}

        // Get publish date
        const publishDate = microformat.publishDate
          || microformat.uploadDate
          || details.publishDate || '';

        // Get category
        const category = microformat.category || '';

        // Get channel subscriber count if available
        let subscribers = '';
        try {
          const contents = yt?.contents?.twoColumnWatchNextResults
            ?.results?.results?.contents;
          if (contents) {
            for (const c of contents) {
              const owner = c.videoSecondaryInfoRenderer?.owner
                ?.videoOwnerRenderer?.subscriberCountText?.simpleText;
              if (owner) { subscribers = owner; break; }
            }
          }
        } catch {}

        return {
          title: details.title || '',
          channel: details.author || '',
          channelId: details.channelId || '',
          videoId: details.videoId || '',
          views: details.viewCount || '',
          likes,
          subscribers,
          duration: details.lengthSeconds ? details.lengthSeconds + 's' : '',
          publishDate,
          category,
          description: fullDescription,
          keywords: (details.keywords || []).join(', '),
          isLive: details.isLiveContent || false,
          thumbnail: details.thumbnail?.thumbnails?.slice(-1)?.[0]?.url || '',
        };
      })()
    `);
        if (!data || typeof data !== 'object')
            throw new Error('Failed to extract video metadata from page');
        if (data.error)
            throw new Error(data.error);
        // Return as field/value pairs for table display
        return Object.entries(data).map(([field, value]) => ({
            field,
            value: String(value),
        }));
    },
});
|
package/dist/engine.js
CHANGED
|
@@ -100,7 +100,8 @@ async function discoverClisFromFs(dir) {
|
|
|
100
100
|
if (file.endsWith('.yaml') || file.endsWith('.yml')) {
|
|
101
101
|
registerYamlCli(filePath, site);
|
|
102
102
|
}
|
|
103
|
-
else if (file.endsWith('.js') && !file.endsWith('.d.js'))
|
|
103
|
+
else if ((file.endsWith('.js') && !file.endsWith('.d.js')) ||
|
|
104
|
+
(file.endsWith('.ts') && !file.endsWith('.d.ts') && !file.endsWith('.test.ts'))) {
|
|
104
105
|
promises.push(import(`file://${filePath}`).catch((err) => {
|
|
105
106
|
process.stderr.write(`Warning: failed to load module ${filePath}: ${err.message}\n`);
|
|
106
107
|
}));
|
package/dist/main.js
CHANGED
|
@@ -65,9 +65,17 @@ program.command('list').description('List all available CLI commands').option('-
|
|
|
65
65
|
console.log();
|
|
66
66
|
});
|
|
67
67
|
program.command('validate').description('Validate CLI definitions').argument('[target]', 'site or site/name')
|
|
68
|
-
.action(async (target) => {
|
|
68
|
+
.action(async (target) => {
|
|
69
|
+
const { validateClisWithTarget, renderValidationReport } = await import('./validate.js');
|
|
70
|
+
console.log(renderValidationReport(validateClisWithTarget([BUILTIN_CLIS, USER_CLIS], target)));
|
|
71
|
+
});
|
|
69
72
|
program.command('verify').description('Validate + smoke test').argument('[target]').option('--smoke', 'Run smoke tests', false)
|
|
70
|
-
.action(async (target, opts) => {
|
|
73
|
+
.action(async (target, opts) => {
|
|
74
|
+
const { verifyClis, renderVerifyReport } = await import('./verify.js');
|
|
75
|
+
const r = await verifyClis({ builtinClis: BUILTIN_CLIS, userClis: USER_CLIS, target, smoke: opts.smoke });
|
|
76
|
+
console.log(renderVerifyReport(r));
|
|
77
|
+
process.exitCode = r.ok ? 0 : 1;
|
|
78
|
+
});
|
|
71
79
|
program.command('explore').alias('probe').description('Explore a website: discover APIs, stores, and recommend strategies').argument('<url>').option('--site <name>').option('--goal <text>').option('--wait <s>', '', '3').option('--auto', 'Enable interactive fuzzing (simulate clicks to trigger lazy APIs)').option('--click <labels>', 'Comma-separated labels to click before fuzzing (e.g. "字幕,CC,评论")')
|
|
72
80
|
.action(async (url, opts) => { const { exploreUrl, renderExploreSummary } = await import('./explore.js'); const clickLabels = opts.click ? opts.click.split(',').map((s) => s.trim()) : undefined; console.log(renderExploreSummary(await exploreUrl(url, { BrowserFactory: PlaywrightMCP, site: opts.site, goal: opts.goal, waitSeconds: parseFloat(opts.wait), auto: opts.auto, clickLabels }))); });
|
|
73
81
|
program.command('synthesize').description('Synthesize CLIs from explore').argument('<target>').option('--top <n>', '', '3')
|
package/dist/output.js
CHANGED
|
@@ -85,7 +85,8 @@ function renderCsv(data, opts) {
|
|
|
85
85
|
for (const row of rows) {
|
|
86
86
|
console.log(columns.map(c => {
|
|
87
87
|
const v = String(row[c] ?? '');
|
|
88
|
-
return v.includes(',') || v.includes('"')
|
|
88
|
+
return v.includes(',') || v.includes('"') || v.includes('\n')
|
|
89
|
+
? `"${v.replace(/"/g, '""')}"` : v;
|
|
89
90
|
}).join(','));
|
|
90
91
|
}
|
|
91
92
|
}
|
package/dist/registry.d.ts
CHANGED
|
@@ -37,18 +37,11 @@ export interface InternalCliCommand extends CliCommand {
|
|
|
37
37
|
_lazy?: boolean;
|
|
38
38
|
_modulePath?: string;
|
|
39
39
|
}
|
|
40
|
-
export interface CliOptions {
|
|
40
|
+
export interface CliOptions extends Partial<Omit<CliCommand, 'args' | 'description'>> {
|
|
41
41
|
site: string;
|
|
42
42
|
name: string;
|
|
43
43
|
description?: string;
|
|
44
|
-
domain?: string;
|
|
45
|
-
strategy?: Strategy;
|
|
46
|
-
browser?: boolean;
|
|
47
44
|
args?: Arg[];
|
|
48
|
-
columns?: string[];
|
|
49
|
-
func?: (page: IPage, kwargs: Record<string, any>, debug?: boolean) => Promise<any>;
|
|
50
|
-
pipeline?: any[];
|
|
51
|
-
timeoutSeconds?: number;
|
|
52
45
|
}
|
|
53
46
|
export declare function cli(opts: CliOptions): CliCommand;
|
|
54
47
|
export declare function getRegistry(): Map<string, CliCommand>;
|
|
@@ -1,9 +1,18 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Aria snapshot formatter: parses Playwright MCP snapshot text into clean format.
|
|
3
|
+
*
|
|
4
|
+
* Multi-pass pipeline:
|
|
5
|
+
* 1. Parse & filter: strip annotations, metadata, noise roles, ads, decorators
|
|
6
|
+
* 2. Deduplicate: generic/text child matching parent label
|
|
7
|
+
* 3. Deduplicate: heading + link with identical labels
|
|
8
|
+
* 4. Deduplicate: nested identical links
|
|
9
|
+
* 5. Prune: empty containers (iterative bottom-up)
|
|
10
|
+
* 6. Collapse: single-child containers
|
|
3
11
|
*/
|
|
4
12
|
export interface FormatOptions {
|
|
5
13
|
interactive?: boolean;
|
|
6
14
|
compact?: boolean;
|
|
7
15
|
maxDepth?: number;
|
|
16
|
+
maxTextLength?: number;
|
|
8
17
|
}
|
|
9
18
|
export declare function formatSnapshot(raw: string, opts?: FormatOptions): string;
|
|
@@ -1,41 +1,378 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Aria snapshot formatter: parses Playwright MCP snapshot text into clean format.
|
|
3
|
+
*
|
|
4
|
+
* Multi-pass pipeline:
|
|
5
|
+
* 1. Parse & filter: strip annotations, metadata, noise roles, ads, decorators
|
|
6
|
+
* 2. Deduplicate: generic/text child matching parent label
|
|
7
|
+
* 3. Deduplicate: heading + link with identical labels
|
|
8
|
+
* 4. Deduplicate: nested identical links
|
|
9
|
+
* 5. Prune: empty containers (iterative bottom-up)
|
|
10
|
+
* 6. Collapse: single-child containers
|
|
3
11
|
*/
|
|
12
|
+
// Default cap on the length of one formatted line before it is truncated.
const DEFAULT_MAX_TEXT_LENGTH = 200;

// Roles that carry no useful information and are always dropped.
const NOISE_ROLES = new Set([
    'none',
    'presentation',
    'separator',
    'paragraph',
    'tooltip',
    'status',
]);

// Roles that are dropped together with everything nested beneath them.
const SUBTREE_NOISE_ROLES = new Set([
    'contentinfo',
]);

// Roles the user can click or type into; these receive [@n] references.
const INTERACTIVE_ROLES = new Set([
    'button',
    'link',
    'textbox',
    'checkbox',
    'radio',
    'combobox',
    'tab',
    'menuitem',
    'option',
    'switch',
    'slider',
    'spinbutton',
    'searchbox',
]);

// Structural roles that survive the interactive-mode filter.
const LANDMARK_ROLES = new Set([
    'main',
    'navigation',
    'banner',
    'heading',
    'search',
    'region',
    'list',
    'listitem',
    'article',
    'complementary',
    'group',
    'toolbar',
    'tablist',
]);

// Roles treated as containers by the prune and collapse passes.
const CONTAINER_ROLES = new Set([
    'list',
    'listitem',
    'group',
    'toolbar',
    'tablist',
    'navigation',
    'region',
    'complementary',
    'search',
    'article',
    'paragraph',
    'figure',
]);

// Standalone separator glyphs that add no meaning on their own.
const DECORATOR_TEXT = new Set([
    '•',
    '·',
    '|',
    '—',
    '-',
    '/',
    '\\',
]);

// URL fragments that identify ad-tracking links.
const AD_URL_PATTERNS = [
    'googleadservices.com/pagead/',
    'alb.reddit.com/cr?',
    'doubleclick.net/',
    'cm.bilibili.com/cm/api/fees/',
];

// Lower-case label fragments of boilerplate controls (back-to-top and similar).
const BOILERPLATE_LABELS = [
    '回到顶部',
    'back to top',
    'scroll to top',
    'go to top',
];
|
|
52
|
+
/**
 * Parse role, quoted label and trailing text from a trimmed snapshot line.
 *
 * Lines that Playwright wrapped in outer single quotes are unwrapped first.
 * The role is the leading alphabetic word (lower-cased), the label is the
 * first double-quoted string, and the trailing text is whatever follows the
 * last colon that lies outside every double-quoted string and every
 * `[...]` annotation.
 *
 * @param {string} trimmed Snapshot line without indentation or leading "- ".
 * @returns {{role: string, text: string, hasText: boolean, trailingText: string}}
 *          `hasText` is true when either the label or the trailing text is non-empty.
 */
function parseLine(trimmed) {
    // Unwrap outer single quotes if present (Playwright wraps lines with special chars)
    let line = trimmed;
    if (line.startsWith("'") && line.endsWith("':")) {
        line = `${line.slice(1, -2)}:`;
    }
    else if (line.startsWith("'") && line.endsWith("'")) {
        line = line.slice(1, -1);
    }
    // Role is the first word
    const roleMatch = line.match(/^([a-zA-Z]+)\b/);
    const role = roleMatch ? roleMatch[1].toLowerCase() : '';
    // Extract quoted text content (the semantic label)
    const textMatch = line.match(/"([^"]*)"/);
    const text = textMatch ? textMatch[1] : '';
    // Locate the last colon that is NOT inside a quoted string or a bracket
    // annotation. Quoted strings and annotations are consumed as whole tokens,
    // so a colon inside a label such as "Account: user@email.com" is never
    // reported, and the index refers to the original line directly.
    let colonIdx = -1;
    for (const token of line.matchAll(/"[^"]*"|\[[^\]]*\]|:/g)) {
        if (token[0] === ':') {
            colonIdx = token.index;
        }
    }
    let trailingText = '';
    if (colonIdx !== -1) {
        const tail = line.slice(colonIdx + 1);
        // Bracket annotations alone do not count as trailing text; a quoted
        // string does (it is reduced to "" here, which is still non-empty).
        const meaningful = tail
            .replace(/"[^"]*"/g, '""')
            .replace(/\[[^\]]*\]/g, '')
            .trim();
        if (meaningful.length > 0) {
            trailingText = tail.trim();
        }
    }
    return { role, text, hasText: text.length > 0 || trailingText.length > 0, trailingText };
}
|
|
93
|
+
/**
 * Remove every `[...]` annotation that sits outside double-quoted strings.
 *
 * A line wrapped in outer single quotes (optionally followed by a colon) is
 * unwrapped first. Runs of two or more whitespace characters left behind are
 * collapsed to one space and the result is trimmed.
 *
 * @param {string} content Snapshot line content.
 * @returns {string} The content without bracket annotations.
 */
function stripAnnotations(content) {
    let unwrapped = content;
    if (content.startsWith("'")) {
        if (content.endsWith("':")) {
            unwrapped = content.slice(1, -2) + ':';
        }
        else if (content.endsWith("'")) {
            unwrapped = content.slice(1, -1);
        }
    }
    // Splitting on '"' puts text outside quotes at even positions and quoted
    // text at odd positions; only the former is scrubbed.
    const rebuilt = unwrapped
        .split('"')
        .map((segment, position) => (position % 2 === 0
            ? segment.replace(/\s*\[[^\]]*\]/g, '')
            : segment))
        .join('"');
    return rebuilt.replace(/\s{2,}/g, ' ').trim();
}
|
|
117
|
+
/**
 * Tell whether a line is a metadata entry: a slash, one or more ASCII
 * letters, then a colon, all at the very start (for example `/url: ...`).
 *
 * @param {string} trimmed Line content without indentation.
 * @returns {boolean} True for metadata lines.
 */
function isMetadataLine(trimmed) {
    const metadataPrefix = /^\/[a-zA-Z]+:/;
    return metadataPrefix.test(trimmed);
}
|
|
123
|
+
/**
 * Tell whether the text, once trimmed, is exactly one of the separator
 * glyphs listed in DECORATOR_TEXT.
 *
 * @param {string} text Candidate text.
 * @returns {boolean} True when the text is purely decorative.
 */
function isDecoratorText(text) {
    const candidate = text.trim();
    return DECORATOR_TEXT.has(candidate);
}
|
|
129
|
+
/**
 * Tell whether a node looks like advertising.
 *
 * The label and trailing text are checked case-insensitively for the words
 * "sponsored", "advertisement" and "广告"; failing that, each is checked
 * (case-sensitively) for any fragment listed in AD_URL_PATTERNS.
 *
 * @param {string} text Quoted label of the node.
 * @param {string} trailingText Text after the node's colon.
 * @returns {boolean} True when the node is ad-related.
 */
function isAdNode(text, trailingText) {
    const haystack = `${text} ${trailingText}`.toLowerCase();
    const mentionsAd = ['sponsored', 'advertisement', '广告']
        .some((keyword) => haystack.includes(keyword));
    if (mentionsAd) {
        return true;
    }
    // Check for ad tracking URLs in the label or trailing text
    return AD_URL_PATTERNS.some((fragment) => text.includes(fragment) || trailingText.includes(fragment));
}
|
|
145
|
+
/**
 * Tell whether a label belongs to boilerplate UI: the lower-cased label
 * contains any entry of BOILERPLATE_LABELS (back-to-top and similar).
 *
 * @param {string} text Quoted label of the node.
 * @returns {boolean} True for boilerplate labels.
 */
function isBoilerplateNode(text) {
    const lowered = text.toLowerCase();
    for (const label of BOILERPLATE_LABELS) {
        if (lowered.includes(label)) {
            return true;
        }
    }
    return false;
}
|
|
152
|
+
/**
 * Decide whether a node is noise and should be dropped from the output.
 *
 * A node is noise when its role is in NOISE_ROLES, when it is a `generic`
 * or `img` node without any text, or when it is a `generic`/`text` node
 * whose text is only a decorator glyph.
 *
 * @param {string} role Lower-cased role of the node.
 * @param {boolean} hasText Whether the node has a label or trailing text.
 * @param {string} text Quoted label of the node.
 * @param {string} trailingText Text after the node's colon.
 * @returns {boolean} True when the node should be filtered out.
 */
function isNoiseNode(role, hasText, text, trailingText) {
    if (NOISE_ROLES.has(role)) {
        return true;
    }
    if (!hasText) {
        // Text-less wrappers and images without alt text carry nothing.
        return role === 'generic' || role === 'img';
    }
    if (role !== 'generic' && role !== 'text') {
        return false;
    }
    // Decorator-only text nodes; trailing text takes precedence over the label.
    return isDecoratorText(trailingText || text);
}
|
|
4
172
|
/**
 * Turn raw Playwright MCP aria-snapshot text into a cleaned, indented outline.
 *
 * The snapshot is processed in passes: parse and filter lines, drop ad and
 * boilerplate subtrees, remove duplicated labels (child text equal to its
 * parent, heading followed by an identical link, link nested in an identical
 * link), prune containers left without descendants, and finally merge a
 * container with its only descendant into one "parent > child" line.
 *
 * @param {string} raw Snapshot text; depth is derived from the leading
 *        indentation, two columns per level.
 * @param {object} [opts]
 * @param {boolean} [opts.interactive] Drop nodes that are neither interactive
 *        nor landmarks and have no text.
 * @param {boolean} [opts.compact] Additionally strip every `[...]` group and
 *        collapse whitespace in each line.
 * @param {number} [opts.maxDepth] Skip lines nested deeper than this level.
 * @param {number} [opts.maxTextLength] Truncate lines longer than this and
 *        append '…'; defaults to DEFAULT_MAX_TEXT_LENGTH, values <= 0 disable
 *        truncation.
 * @returns {string} Formatted outline, or '' when `raw` is empty or not a string.
 */
export function formatSnapshot(raw, opts = {}) {
    if (!raw || typeof raw !== 'string')
        return '';
    const maxTextLen = opts.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
    const lines = raw.split('\n');
    // === Pass 1: Parse, filter, and collect entries ===
    const entries = [];
    let refCounter = 0;
    let skipUntilDepth = -1; // When >= 0, skip all nodes at depth > this value
    for (const line of lines) {
        if (!line.trim())
            continue;
        const indent = line.length - line.trimStart().length;
        const depth = Math.floor(indent / 2);
        // If we're in a subtree skip zone, check depth
        if (skipUntilDepth >= 0) {
            if (depth > skipUntilDepth)
                continue; // still inside subtree
            skipUntilDepth = -1; // exited subtree
        }
        let content = line.trimStart();
        // Strip leading "- "
        if (content.startsWith('- ')) {
            content = content.slice(2);
        }
        // Skip metadata lines
        if (isMetadataLine(content))
            continue;
        // Apply maxDepth filter
        if (opts.maxDepth !== undefined && depth > opts.maxDepth)
            continue;
        const { role, text, hasText, trailingText } = parseLine(content);
        // Skip noise nodes (their descendants stay, at their original depth)
        if (isNoiseNode(role, hasText, text, trailingText))
            continue;
        // Skip subtree noise roles (contentinfo footer, etc.) — skip entire subtree
        if (SUBTREE_NOISE_ROLES.has(role)) {
            skipUntilDepth = depth;
            continue;
        }
        // Strip annotations
        content = stripAnnotations(content);
        // Check if node should trigger subtree skip (ads, boilerplate)
        const isSubtreeSkip = isAdNode(text, trailingText) || isBoilerplateNode(text);
        // Interactive mode filter
        const isInteractive = INTERACTIVE_ROLES.has(role);
        const isLandmark = LANDMARK_ROLES.has(role);
        if (opts.interactive && !isInteractive && !isLandmark && !hasText)
            continue;
        // Compact mode
        if (opts.compact) {
            content = content
                .replace(/\s*\[.*?\]\s*/g, ' ')
                .replace(/\s+/g, ' ')
                .trim();
        }
        // Text truncation (applied before the [@n] prefix is added)
        if (maxTextLen > 0 && content.length > maxTextLen) {
            content = content.slice(0, maxTextLen) + '…';
        }
        // Assign refs to interactive elements
        if (isInteractive) {
            refCounter++;
            content = `[@${refCounter}] ${content}`;
        }
        entries.push({ depth, content, role, text, trailingText, isInteractive, isLandmark, isSubtreeSkip });
    }
    // === Pass 2: Remove subtree-skip nodes (ads, boilerplate) with their descendants ===
    const noAds = [];
    for (let i = 0; i < entries.length; i++) {
        const entry = entries[i];
        if (!entry.isSubtreeSkip) {
            noAds.push(entry);
            continue;
        }
        // Advance past every following entry nested deeper than the skipped node.
        while (i + 1 < entries.length && entries[i + 1].depth > entry.depth) {
            i++;
        }
    }
    // === Pass 3: Deduplicate child generic/text matching parent label ===
    const deduped = [];
    for (const entry of noAds) {
        if (entry.role === 'generic' || entry.role === 'text') {
            // Walk back through kept entries: the first shallower one is the
            // parent; meeting an entry at the same depth first ends the search.
            let parent;
            for (let j = deduped.length - 1; j >= 0; j--) {
                if (deduped[j].depth < entry.depth) {
                    parent = deduped[j];
                    break;
                }
                if (deduped[j].depth === entry.depth)
                    break;
            }
            if (parent) {
                const childText = entry.trailingText || entry.text;
                if (childText && parent.text && childText === parent.text) {
                    continue;
                }
            }
        }
        deduped.push(entry);
    }
    // === Pass 4: Deduplicate heading + child link with identical label ===
    // Pattern: heading "Title": → link "Title": (same text) → skip the link.
    // The link's own descendants are kept and retain their original depth.
    const deduped2 = [];
    for (let i = 0; i < deduped.length; i++) {
        const entry = deduped[i];
        if (entry.role === 'heading' && entry.text) {
            const next = deduped[i + 1];
            if (next && next.role === 'link' && next.text === entry.text && next.depth === entry.depth + 1) {
                deduped2.push(entry);
                i++; // skip the link
                continue;
            }
        }
        deduped2.push(entry);
    }
    // === Pass 5: Deduplicate nested identical links ===
    const deduped3 = [];
    for (let i = 0; i < deduped2.length; i++) {
        const entry = deduped2[i];
        if (entry.role === 'link' && entry.text) {
            const next = deduped2[i + 1];
            if (next && next.role === 'link' && next.text === entry.text && next.depth === entry.depth + 1) {
                continue; // Skip parent, keep child
            }
        }
        deduped3.push(entry);
    }
    // === Pass 6: Iteratively prune empty containers (bottom-up) ===
    let current = deduped3;
    let changed = true;
    while (changed) {
        changed = false;
        const kept = [];
        for (let i = 0; i < current.length; i++) {
            const entry = current[i];
            if (CONTAINER_ROLES.has(entry.role) && !entry.text && !entry.trailingText) {
                // A container has descendants exactly when the next entry is deeper.
                const following = current[i + 1];
                const hasChildren = following !== undefined && following.depth > entry.depth;
                if (!hasChildren) {
                    changed = true;
                    continue;
                }
            }
            kept.push(entry);
        }
        current = kept;
    }
    // === Pass 7: Collapse single-child containers ===
    const collapsed = [];
    for (let i = 0; i < current.length; i++) {
        const entry = current[i];
        if (CONTAINER_ROLES.has(entry.role) && !entry.text && !entry.trailingText) {
            let childCount = 0;
            let childIdx = -1;
            for (let j = i + 1; j < current.length; j++) {
                if (current[j].depth <= entry.depth)
                    break;
                if (current[j].depth === entry.depth + 1) {
                    childCount++;
                    if (childCount === 1)
                        childIdx = j;
                }
            }
            // Only collapse when the single direct child is also the very next
            // entry. Deeper orphans (whose own parent was filtered out) may sit
            // between the container and that child; merging then would drop
            // the orphan and emit the child twice.
            if (childCount === 1 && childIdx === i + 1) {
                const child = current[childIdx];
                const afterChild = current[childIdx + 1];
                const hasGrandchildren = afterChild !== undefined && afterChild.depth > child.depth;
                if (!hasGrandchildren) {
                    const mergedContent = entry.content.replace(/:$/, '') + ' > ' + child.content;
                    collapsed.push({
                        ...entry,
                        content: mergedContent,
                        role: child.role,
                        text: child.text,
                        trailingText: child.trailingText,
                        isInteractive: child.isInteractive,
                    });
                    i++; // the child has been merged into this line
                    continue;
                }
            }
        }
        collapsed.push(entry);
    }
    return collapsed.map(e => ' '.repeat(e.depth) + e.content).join('\n');
}
|