@jackwener/opencli 0.7.0 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/LICENSE +190 -28
  2. package/README.md +6 -5
  3. package/README.zh-CN.md +5 -4
  4. package/SKILL.md +18 -4
  5. package/dist/browser.js +2 -3
  6. package/dist/cli-manifest.json +195 -22
  7. package/dist/clis/linkedin/search.d.ts +1 -0
  8. package/dist/clis/linkedin/search.js +366 -0
  9. package/dist/clis/reddit/read.d.ts +1 -0
  10. package/dist/clis/reddit/read.js +184 -0
  11. package/dist/clis/youtube/transcript-group.d.ts +44 -0
  12. package/dist/clis/youtube/transcript-group.js +226 -0
  13. package/dist/clis/youtube/transcript-group.test.d.ts +1 -0
  14. package/dist/clis/youtube/transcript-group.test.js +99 -0
  15. package/dist/clis/youtube/transcript.d.ts +1 -0
  16. package/dist/clis/youtube/transcript.js +264 -0
  17. package/dist/clis/youtube/utils.d.ts +8 -0
  18. package/dist/clis/youtube/utils.js +28 -0
  19. package/dist/clis/youtube/video.d.ts +1 -0
  20. package/dist/clis/youtube/video.js +114 -0
  21. package/dist/engine.js +2 -1
  22. package/dist/main.js +10 -2
  23. package/dist/output.js +2 -1
  24. package/dist/registry.d.ts +1 -8
  25. package/dist/snapshotFormatter.d.ts +9 -0
  26. package/dist/snapshotFormatter.js +352 -15
  27. package/dist/snapshotFormatter.test.d.ts +7 -0
  28. package/dist/snapshotFormatter.test.js +521 -0
  29. package/dist/validate.d.ts +14 -2
  30. package/dist/verify.d.ts +14 -2
  31. package/package.json +2 -2
  32. package/src/browser.ts +2 -4
  33. package/src/clis/linkedin/search.ts +416 -0
  34. package/src/clis/reddit/read.ts +186 -0
  35. package/src/clis/youtube/transcript-group.test.ts +108 -0
  36. package/src/clis/youtube/transcript-group.ts +287 -0
  37. package/src/clis/youtube/transcript.ts +280 -0
  38. package/src/clis/youtube/utils.ts +28 -0
  39. package/src/clis/youtube/video.ts +116 -0
  40. package/src/engine.ts +4 -1
  41. package/src/main.ts +10 -2
  42. package/src/output.ts +2 -1
  43. package/src/registry.ts +1 -8
  44. package/src/snapshotFormatter.test.ts +579 -0
  45. package/src/snapshotFormatter.ts +399 -13
  46. package/src/validate.ts +19 -4
  47. package/src/verify.ts +17 -3
  48. package/vitest.config.ts +15 -1
  49. package/dist/clis/reddit/read.yaml +0 -76
  50. package/src/clis/reddit/read.yaml +0 -76
@@ -0,0 +1,114 @@
1
+ /**
2
+ * YouTube video metadata — read ytInitialPlayerResponse + ytInitialData from video page.
3
+ */
4
+ import { cli, Strategy } from '../../registry.js';
5
+ import { parseVideoId } from './utils.js';
6
/**
 * Registers the `youtube video` command.
 *
 * The command navigates to the watch page for the requested video, evaluates a
 * script in the page that reads `window.ytInitialPlayerResponse` and
 * `window.ytInitialData`, and returns the collected metadata as
 * `{ field, value }` rows (every value stringified) for table display.
 *
 * Throws an Error when the page script returns nothing usable or reports that
 * `ytInitialPlayerResponse` is missing.
 */
cli({
    site: 'youtube',
    name: 'video',
    description: 'Get YouTube video metadata (title, views, description, etc.)',
    domain: 'www.youtube.com',
    strategy: Strategy.COOKIE,
    args: [
        { name: 'url', required: true, help: 'YouTube video URL or video ID' },
    ],
    columns: ['field', 'value'],
    func: async (page, kwargs) => {
        const videoId = parseVideoId(kwargs.url);
        // Encode defensively: a well-formed video id is unchanged by encoding, while
        // unexpected input can no longer inject extra query parameters.
        const videoUrl = `https://www.youtube.com/watch?v=${encodeURIComponent(videoId)}`;
        await page.goto(videoUrl);
        // Fixed settle delay before reading page globals (presumably seconds — confirm
        // against the page API).
        await page.wait(3);
        const data = await page.evaluate(`
      (async () => {
        const player = window.ytInitialPlayerResponse;
        const yt = window.ytInitialData;
        if (!player) return { error: 'ytInitialPlayerResponse not found' };

        const details = player.videoDetails || {};
        const microformat = player.microformat?.playerMicroformatRenderer || {};

        // Watch-page sections; tolerate a missing or malformed structure.
        const rawSections = yt?.contents?.twoColumnWatchNextResults
          ?.results?.results?.contents;
        const sections = Array.isArray(rawSections) ? rawSections : [];

        // Return the first truthy value produced by pick(section), or ''.
        const firstMatch = (pick) => {
          for (const section of sections) {
            const value = pick(section);
            if (value) return value;
          }
          return '';
        };

        // Prefer the full description from ytInitialData, else the short one.
        const fullDescription = firstMatch((s) =>
          s?.videoSecondaryInfoRenderer?.attributedDescription?.content
        ) || details.shortDescription || '';

        // Like count label, if the like button view model is present.
        const likes = firstMatch((s) => {
          const buttons = s?.videoPrimaryInfoRenderer?.videoActions
            ?.menuRenderer?.topLevelButtons;
          if (!Array.isArray(buttons)) return '';
          for (const b of buttons) {
            const title = b?.segmentedLikeDislikeButtonViewModel
              ?.likeButtonViewModel?.likeButtonViewModel?.toggleButtonViewModel
              ?.toggleButtonViewModel?.defaultButtonViewModel?.buttonViewModel?.title;
            if (title) return title;
          }
          return '';
        });

        // Channel subscriber count text, if available.
        const subscribers = firstMatch((s) =>
          s?.videoSecondaryInfoRenderer?.owner
            ?.videoOwnerRenderer?.subscriberCountText?.simpleText
        );

        const publishDate = microformat.publishDate
          || microformat.uploadDate
          || details.publishDate || '';
        const category = microformat.category || '';

        return {
          title: details.title || '',
          channel: details.author || '',
          channelId: details.channelId || '',
          videoId: details.videoId || '',
          views: details.viewCount || '',
          likes,
          subscribers,
          duration: details.lengthSeconds ? details.lengthSeconds + 's' : '',
          publishDate,
          category,
          description: fullDescription,
          keywords: (details.keywords || []).join(', '),
          isLive: details.isLiveContent || false,
          thumbnail: details.thumbnail?.thumbnails?.slice(-1)?.[0]?.url || '',
        };
      })()
    `);
        if (!data || typeof data !== 'object')
            throw new Error('Failed to extract video metadata from page');
        if (data.error)
            throw new Error(data.error);
        // Return as field/value pairs for table display
        return Object.entries(data).map(([field, value]) => ({
            field,
            value: String(value),
        }));
    },
});
package/dist/engine.js CHANGED
@@ -100,7 +100,8 @@ async function discoverClisFromFs(dir) {
100
100
  if (file.endsWith('.yaml') || file.endsWith('.yml')) {
101
101
  registerYamlCli(filePath, site);
102
102
  }
103
- else if (file.endsWith('.js') && !file.endsWith('.d.js')) {
103
+ else if ((file.endsWith('.js') && !file.endsWith('.d.js')) ||
104
+ (file.endsWith('.ts') && !file.endsWith('.d.ts') && !file.endsWith('.test.ts'))) {
104
105
  promises.push(import(`file://${filePath}`).catch((err) => {
105
106
  process.stderr.write(`Warning: failed to load module ${filePath}: ${err.message}\n`);
106
107
  }));
package/dist/main.js CHANGED
@@ -65,9 +65,17 @@ program.command('list').description('List all available CLI commands').option('-
65
65
  console.log();
66
66
  });
67
67
  program.command('validate').description('Validate CLI definitions').argument('[target]', 'site or site/name')
68
- .action(async (target) => { const { validateClisWithTarget, renderValidationReport } = await import('./validate.js'); console.log(renderValidationReport(validateClisWithTarget([BUILTIN_CLIS, USER_CLIS], target))); });
68
+ .action(async (target) => {
69
+ const { validateClisWithTarget, renderValidationReport } = await import('./validate.js');
70
+ console.log(renderValidationReport(validateClisWithTarget([BUILTIN_CLIS, USER_CLIS], target)));
71
+ });
69
72
  program.command('verify').description('Validate + smoke test').argument('[target]').option('--smoke', 'Run smoke tests', false)
70
- .action(async (target, opts) => { const { verifyClis, renderVerifyReport } = await import('./verify.js'); const r = await verifyClis({ builtinClis: BUILTIN_CLIS, userClis: USER_CLIS, target, smoke: opts.smoke }); console.log(renderVerifyReport(r)); process.exitCode = r.ok ? 0 : 1; });
73
+ .action(async (target, opts) => {
74
+ const { verifyClis, renderVerifyReport } = await import('./verify.js');
75
+ const r = await verifyClis({ builtinClis: BUILTIN_CLIS, userClis: USER_CLIS, target, smoke: opts.smoke });
76
+ console.log(renderVerifyReport(r));
77
+ process.exitCode = r.ok ? 0 : 1;
78
+ });
71
79
  program.command('explore').alias('probe').description('Explore a website: discover APIs, stores, and recommend strategies').argument('<url>').option('--site <name>').option('--goal <text>').option('--wait <s>', '', '3').option('--auto', 'Enable interactive fuzzing (simulate clicks to trigger lazy APIs)').option('--click <labels>', 'Comma-separated labels to click before fuzzing (e.g. "字幕,CC,评论")')
72
80
  .action(async (url, opts) => { const { exploreUrl, renderExploreSummary } = await import('./explore.js'); const clickLabels = opts.click ? opts.click.split(',').map((s) => s.trim()) : undefined; console.log(renderExploreSummary(await exploreUrl(url, { BrowserFactory: PlaywrightMCP, site: opts.site, goal: opts.goal, waitSeconds: parseFloat(opts.wait), auto: opts.auto, clickLabels }))); });
73
81
  program.command('synthesize').description('Synthesize CLIs from explore').argument('<target>').option('--top <n>', '', '3')
package/dist/output.js CHANGED
@@ -85,7 +85,8 @@ function renderCsv(data, opts) {
85
85
  for (const row of rows) {
86
86
  console.log(columns.map(c => {
87
87
  const v = String(row[c] ?? '');
88
- return v.includes(',') || v.includes('"') ? `"${v.replace(/"/g, '""')}"` : v;
88
+ return v.includes(',') || v.includes('"') || v.includes('\n')
89
+ ? `"${v.replace(/"/g, '""')}"` : v;
89
90
  }).join(','));
90
91
  }
91
92
  }
@@ -37,18 +37,11 @@ export interface InternalCliCommand extends CliCommand {
37
37
  _lazy?: boolean;
38
38
  _modulePath?: string;
39
39
  }
40
- export interface CliOptions {
40
+ export interface CliOptions extends Partial<Omit<CliCommand, 'args' | 'description'>> {
41
41
  site: string;
42
42
  name: string;
43
43
  description?: string;
44
- domain?: string;
45
- strategy?: Strategy;
46
- browser?: boolean;
47
44
  args?: Arg[];
48
- columns?: string[];
49
- func?: (page: IPage, kwargs: Record<string, any>, debug?: boolean) => Promise<any>;
50
- pipeline?: any[];
51
- timeoutSeconds?: number;
52
45
  }
53
46
  export declare function cli(opts: CliOptions): CliCommand;
54
47
  export declare function getRegistry(): Map<string, CliCommand>;
@@ -1,9 +1,18 @@
1
1
  /**
2
2
  * Aria snapshot formatter: parses Playwright MCP snapshot text into clean format.
3
+ *
4
+ * Multi-pass pipeline:
5
+ * 1. Parse & filter: strip annotations, metadata, noise roles, ads, decorators
6
+ * 2. Deduplicate: generic/text child matching parent label
7
+ * 3. Deduplicate: heading + link with identical labels
8
+ * 4. Deduplicate: nested identical links
9
+ * 5. Prune: empty containers (iterative bottom-up)
10
+ * 6. Collapse: single-child containers
3
11
  */
4
12
  export interface FormatOptions {
5
13
  interactive?: boolean;
6
14
  compact?: boolean;
7
15
  maxDepth?: number;
16
+ maxTextLength?: number;
8
17
  }
9
18
  export declare function formatSnapshot(raw: string, opts?: FormatOptions): string;
@@ -1,41 +1,378 @@
1
1
  /**
2
2
  * Aria snapshot formatter: parses Playwright MCP snapshot text into clean format.
3
+ *
4
+ * Multi-pass pipeline:
5
+ * 1. Parse & filter: strip annotations, metadata, noise roles, ads, decorators
6
+ * 2. Deduplicate: generic/text child matching parent label
7
+ * 3. Deduplicate: heading + link with identical labels
8
+ * 4. Deduplicate: nested identical links
9
+ * 5. Prune: empty containers (iterative bottom-up)
10
+ * 6. Collapse: single-child containers
3
11
  */
12
// Fallback per-line character cap used when the caller supplies no maxTextLength.
const DEFAULT_MAX_TEXT_LENGTH = 200;

// Roles dropped unconditionally: they carry no useful content on their own.
const NOISE_ROLES = new Set([
    'none',
    'presentation',
    'separator',
    'paragraph',
    'tooltip',
    'status',
]);

// Roles whose whole subtree is discarded (e.g. page-footer boilerplate).
const SUBTREE_NOISE_ROLES = new Set(['contentinfo']);

// Roles the user can click or type into; these receive [@N] references.
const INTERACTIVE_ROLES = new Set([
    'button',
    'link',
    'textbox',
    'checkbox',
    'radio',
    'combobox',
    'tab',
    'menuitem',
    'option',
    'switch',
    'slider',
    'spinbutton',
    'searchbox',
]);

// Structural roles that survive the interactive-mode filter.
const LANDMARK_ROLES = new Set([
    'main',
    'navigation',
    'banner',
    'heading',
    'search',
    'region',
    'list',
    'listitem',
    'article',
    'complementary',
    'group',
    'toolbar',
    'tablist',
]);

// Wrapper roles that may be pruned when empty or collapsed into a lone child.
const CONTAINER_ROLES = new Set([
    'list',
    'listitem',
    'group',
    'toolbar',
    'tablist',
    'navigation',
    'region',
    'complementary',
    'search',
    'article',
    'paragraph',
    'figure',
]);

// Stand-alone separator glyphs that add no meaning to the outline.
const DECORATOR_TEXT = new Set(['•', '·', '|', '—', '-', '/', '\\']);

// Substrings of URLs that identify ad / tracking links.
const AD_URL_PATTERNS = [
    'googleadservices.com/pagead/',
    'alb.reddit.com/cr?',
    'doubleclick.net/',
    'cm.bilibili.com/cm/api/fees/',
];

// Lower-case label fragments of boilerplate controls (back-to-top and similar).
const BOILERPLATE_LABELS = [
    '回到顶部',
    'back to top',
    'scroll to top',
    'go to top',
];
52
/**
 * Parse the role, quoted label and trailing text out of one snapshot line.
 *
 * The line is expected to be already trimmed and free of the leading "- ".
 * A line wrapped in outer single quotes ('...' or '...':) is unwrapped first.
 *
 * - `role` is the leading alphabetic word, lower-cased ('' when absent).
 * - `text` is the content of the first double-quoted string ('' when absent).
 * - `trailingText` is whatever follows the last colon that lies outside every
 *   double-quoted string and every [bracket annotation]; it is taken verbatim
 *   from the line (annotations included) and trimmed.
 * - `hasText` is true when either `text` or `trailingText` is non-empty.
 *
 * @param {string} trimmed - One snapshot line without indentation.
 * @returns {{role: string, text: string, hasText: boolean, trailingText: string}}
 */
function parseLine(trimmed) {
    // Unwrap outer single quotes if present (lines with special chars are wrapped).
    let line = trimmed;
    if (line.startsWith("'") && line.endsWith("':")) {
        line = line.slice(1, -2) + ':';
    }
    else if (line.startsWith("'") && line.endsWith("'")) {
        line = line.slice(1, -1);
    }
    // Role is the first word.
    const roleMatch = line.match(/^([a-zA-Z]+)\b/);
    const role = roleMatch ? roleMatch[1].toLowerCase() : '';
    // The first quoted string is the semantic label.
    const textMatch = line.match(/"([^"]*)"/);
    const text = textMatch ? textMatch[1] : '';
    // Blank out quoted content and bracket annotations WITHOUT changing the
    // string length, so a colon index found in the masked copy is valid in the
    // original line too. (Searching the original line separately would land on
    // colons inside labels or annotations and truncate the trailing text.)
    const masked = line
        .replace(/"[^"]*"/g, (quoted) => `"${' '.repeat(quoted.length - 2)}"`)
        .replace(/\[[^\]]*\]/g, (annotation) => ' '.repeat(annotation.length));
    const colonIdx = masked.lastIndexOf(':');
    let trailingText = '';
    if (colonIdx !== -1 && masked.slice(colonIdx + 1).trim().length > 0) {
        trailingText = line.slice(colonIdx + 1).trim();
    }
    return { role, text, hasText: text.length > 0 || trailingText.length > 0, trailingText };
}
93
/**
 * Remove every [bracket annotation] that sits outside double-quoted strings.
 *
 * Outer single quotes ('...' or '...':) are unwrapped first. After stripping,
 * runs of two or more whitespace characters are collapsed to one space and the
 * result is trimmed.
 *
 * @param {string} content - One snapshot line without indentation.
 * @returns {string} The line with annotations removed.
 */
function stripAnnotations(content) {
    let unwrapped = content;
    if (content.startsWith("'")) {
        if (content.endsWith("':")) {
            unwrapped = content.slice(1, -2) + ':';
        }
        else if (content.endsWith("'")) {
            unwrapped = content.slice(1, -1);
        }
    }
    // Splitting on '"' puts unquoted text at even indices and quoted text at odd
    // indices, so only the even segments are cleaned.
    return unwrapped
        .split('"')
        .map((segment, idx) => (idx % 2 === 0 ? segment.replace(/\s*\[[^\]]*\]/g, '') : segment))
        .join('"')
        .replace(/\s{2,}/g, ' ')
        .trim();
}
117
/**
 * Tell whether a line is a metadata entry: a slash, one or more ASCII letters,
 * then a colon at the very start (for example "/url: ...").
 *
 * @param {string} trimmed - One snapshot line without indentation.
 * @returns {boolean} True for metadata lines.
 */
function isMetadataLine(trimmed) {
    const metadataPrefix = /^\/[a-zA-Z]+:/;
    return metadataPrefix.test(trimmed);
}
123
/**
 * Tell whether the text, once trimmed, is exactly one of the DECORATOR_TEXT
 * glyphs (bullets, pipes, dashes, slashes).
 *
 * @param {string} text - Candidate text content.
 * @returns {boolean} True when the text is purely decorative.
 */
function isDecoratorText(text) {
    const candidate = text.trim();
    return DECORATOR_TEXT.has(candidate);
}
129
/**
 * Tell whether a node looks ad-related.
 *
 * A node matches when its label plus trailing text (case-insensitively)
 * contains "sponsored", "advertisement" or "广告", or when either string
 * contains one of the AD_URL_PATTERNS substrings (case-sensitively).
 *
 * @param {string} text - Quoted label of the node.
 * @param {string} trailingText - Text after the node's colon.
 * @returns {boolean} True when the node should be treated as an ad.
 */
function isAdNode(text, trailingText) {
    const haystack = (text + ' ' + trailingText).toLowerCase();
    const adKeywords = ['sponsored', 'advertisement', '广告'];
    if (adKeywords.some((keyword) => haystack.includes(keyword))) {
        return true;
    }
    // Ad tracking URLs are matched against the original-case strings.
    return AD_URL_PATTERNS.some((pattern) => text.includes(pattern) || trailingText.includes(pattern));
}
145
/**
 * Tell whether a label belongs to boilerplate UI (back-to-top and similar):
 * true when the lower-cased label contains any BOILERPLATE_LABELS entry.
 *
 * @param {string} text - Quoted label of the node.
 * @returns {boolean} True for boilerplate labels.
 */
function isBoilerplateNode(text) {
    const lowered = text.toLowerCase();
    for (const label of BOILERPLATE_LABELS) {
        if (lowered.includes(label)) {
            return true;
        }
    }
    return false;
}
152
/**
 * Decide whether a parsed node is noise that should be dropped.
 *
 * Noise means: a role in NOISE_ROLES; a text-less `generic` or `img`; or a
 * `generic`/`text` node whose content (trailing text, else label) is only a
 * decorator glyph.
 *
 * @param {string} role - Lower-cased role of the node.
 * @param {boolean} hasText - Whether the node has a label or trailing text.
 * @param {string} text - Quoted label of the node.
 * @param {string} trailingText - Text after the node's colon.
 * @returns {boolean} True when the node should be filtered out.
 */
function isNoiseNode(role, hasText, text, trailingText) {
    if (NOISE_ROLES.has(role)) {
        return true;
    }
    if (!hasText) {
        // Bare wrappers and images without alt text carry nothing.
        return role === 'generic' || role === 'img';
    }
    const isTextualRole = role === 'generic' || role === 'text';
    return isTextualRole && isDecoratorText(trailingText || text);
}
4
172
  export function formatSnapshot(raw, opts = {}) {
5
173
  if (!raw || typeof raw !== 'string')
6
174
  return '';
175
+ const maxTextLen = opts.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
7
176
  const lines = raw.split('\n');
8
- const result = [];
177
+ // === Pass 1: Parse, filter, and collect entries ===
178
+ const entries = [];
9
179
  let refCounter = 0;
10
- for (const line of lines) {
180
+ let skipUntilDepth = -1; // When >= 0, skip all nodes at depth > this value
181
+ for (let i = 0; i < lines.length; i++) {
182
+ const line = lines[i];
11
183
  if (!line.trim())
12
184
  continue;
13
185
  const indent = line.length - line.trimStart().length;
14
186
  const depth = Math.floor(indent / 2);
15
- if (opts.maxDepth && depth > opts.maxDepth)
16
- continue;
187
+ // If we're in a subtree skip zone, check depth
188
+ if (skipUntilDepth >= 0) {
189
+ if (depth > skipUntilDepth)
190
+ continue; // still inside subtree
191
+ skipUntilDepth = -1; // exited subtree
192
+ }
17
193
  let content = line.trimStart();
18
- // Skip non-interactive elements in interactive mode
19
- if (opts.interactive) {
20
- const interactiveRoles = ['button', 'link', 'textbox', 'checkbox', 'radio', 'combobox', 'tab', 'menuitem', 'option'];
21
- const role = content.split(/[\s[]/)[0]?.toLowerCase() ?? '';
22
- if (!interactiveRoles.some(r => role.includes(r)) && depth > 1)
23
- continue;
194
+ // Strip leading "- "
195
+ if (content.startsWith('- ')) {
196
+ content = content.slice(2);
197
+ }
198
+ // Skip metadata lines
199
+ if (isMetadataLine(content))
200
+ continue;
201
+ // Apply maxDepth filter
202
+ if (opts.maxDepth !== undefined && depth > opts.maxDepth)
203
+ continue;
204
+ const { role, text, hasText, trailingText } = parseLine(content);
205
+ // Skip noise nodes
206
+ if (isNoiseNode(role, hasText, text, trailingText))
207
+ continue;
208
+ // Skip subtree noise roles (contentinfo footer, etc.) — skip entire subtree
209
+ if (SUBTREE_NOISE_ROLES.has(role)) {
210
+ skipUntilDepth = depth;
211
+ continue;
24
212
  }
25
- // Compact: strip verbose role descriptions
213
+ // Strip annotations
214
+ content = stripAnnotations(content);
215
+ // Check if node should trigger subtree skip (ads, boilerplate)
216
+ const isSubtreeSkip = isAdNode(text, trailingText) || isBoilerplateNode(text);
217
+ // Interactive mode filter
218
+ const isInteractive = INTERACTIVE_ROLES.has(role);
219
+ const isLandmark = LANDMARK_ROLES.has(role);
220
+ if (opts.interactive && !isInteractive && !isLandmark && !hasText)
221
+ continue;
222
+ // Compact mode
26
223
  if (opts.compact) {
27
224
  content = content
28
225
  .replace(/\s*\[.*?\]\s*/g, ' ')
29
226
  .replace(/\s+/g, ' ')
30
227
  .trim();
31
228
  }
229
+ // Text truncation
230
+ if (maxTextLen > 0 && content.length > maxTextLen) {
231
+ content = content.slice(0, maxTextLen) + '…';
232
+ }
32
233
  // Assign refs to interactive elements
33
- const interactivePattern = /^(button|link|textbox|checkbox|radio|combobox|tab|menuitem|option)\b/i;
34
- if (interactivePattern.test(content)) {
234
+ if (isInteractive) {
35
235
  refCounter++;
36
236
  content = `[@${refCounter}] ${content}`;
37
237
  }
38
- result.push(' '.repeat(depth) + content);
238
+ entries.push({ depth, content, role, text, trailingText, isInteractive, isLandmark, isSubtreeSkip });
239
+ }
240
+ // === Pass 2: Remove subtree-skip nodes (ads, boilerplate, contentinfo) ===
241
+ let noAds = [];
242
+ for (let i = 0; i < entries.length; i++) {
243
+ const entry = entries[i];
244
+ if (entry.isSubtreeSkip) {
245
+ const skipDepth = entry.depth;
246
+ i++;
247
+ while (i < entries.length && entries[i].depth > skipDepth) {
248
+ i++;
249
+ }
250
+ i--;
251
+ continue;
252
+ }
253
+ noAds.push(entry);
254
+ }
255
+ // === Pass 3: Deduplicate child generic/text matching parent label ===
256
+ let deduped = [];
257
+ for (let i = 0; i < noAds.length; i++) {
258
+ const entry = noAds[i];
259
+ if (entry.role === 'generic' || entry.role === 'text') {
260
+ let parent;
261
+ for (let j = deduped.length - 1; j >= 0; j--) {
262
+ if (deduped[j].depth < entry.depth) {
263
+ parent = deduped[j];
264
+ break;
265
+ }
266
+ if (deduped[j].depth === entry.depth)
267
+ break;
268
+ }
269
+ if (parent) {
270
+ const childText = entry.trailingText || entry.text;
271
+ if (childText && parent.text && childText === parent.text) {
272
+ continue;
273
+ }
274
+ }
275
+ }
276
+ deduped.push(entry);
277
+ }
278
+ // === Pass 4: Deduplicate heading + child link with identical label ===
279
+ // Pattern: heading "Title": → link "Title": (same text) → skip the link
280
+ const deduped2 = [];
281
+ for (let i = 0; i < deduped.length; i++) {
282
+ const entry = deduped[i];
283
+ if (entry.role === 'heading' && entry.text) {
284
+ const next = deduped[i + 1];
285
+ if (next && next.role === 'link' && next.text === entry.text && next.depth === entry.depth + 1) {
286
+ // Keep the heading, skip the link. But preserve link's children re-parented.
287
+ deduped2.push(entry);
288
+ i++; // skip the link
289
+ continue;
290
+ }
291
+ }
292
+ deduped2.push(entry);
293
+ }
294
+ // === Pass 5: Deduplicate nested identical links ===
295
+ const deduped3 = [];
296
+ for (let i = 0; i < deduped2.length; i++) {
297
+ const entry = deduped2[i];
298
+ if (entry.role === 'link' && entry.text) {
299
+ const next = deduped2[i + 1];
300
+ if (next && next.role === 'link' && next.text === entry.text && next.depth === entry.depth + 1) {
301
+ continue; // Skip parent, keep child
302
+ }
303
+ }
304
+ deduped3.push(entry);
305
+ }
306
+ // === Pass 6: Iteratively prune empty containers (bottom-up) ===
307
+ let current = deduped3;
308
+ let changed = true;
309
+ while (changed) {
310
+ changed = false;
311
+ const next = [];
312
+ for (let i = 0; i < current.length; i++) {
313
+ const entry = current[i];
314
+ if (CONTAINER_ROLES.has(entry.role) && !entry.text && !entry.trailingText) {
315
+ let hasChildren = false;
316
+ for (let j = i + 1; j < current.length; j++) {
317
+ if (current[j].depth <= entry.depth)
318
+ break;
319
+ if (current[j].depth > entry.depth) {
320
+ hasChildren = true;
321
+ break;
322
+ }
323
+ }
324
+ if (!hasChildren) {
325
+ changed = true;
326
+ continue;
327
+ }
328
+ }
329
+ next.push(entry);
330
+ }
331
+ current = next;
332
+ }
333
+ // === Pass 7: Collapse single-child containers ===
334
+ const collapsed = [];
335
+ for (let i = 0; i < current.length; i++) {
336
+ const entry = current[i];
337
+ if (CONTAINER_ROLES.has(entry.role) && !entry.text && !entry.trailingText) {
338
+ let childCount = 0;
339
+ let childIdx = -1;
340
+ for (let j = i + 1; j < current.length; j++) {
341
+ if (current[j].depth <= entry.depth)
342
+ break;
343
+ if (current[j].depth === entry.depth + 1) {
344
+ childCount++;
345
+ if (childCount === 1)
346
+ childIdx = j;
347
+ }
348
+ }
349
+ if (childCount === 1 && childIdx !== -1) {
350
+ const child = current[childIdx];
351
+ let hasGrandchildren = false;
352
+ for (let j = childIdx + 1; j < current.length; j++) {
353
+ if (current[j].depth <= child.depth)
354
+ break;
355
+ if (current[j].depth > child.depth) {
356
+ hasGrandchildren = true;
357
+ break;
358
+ }
359
+ }
360
+ if (!hasGrandchildren) {
361
+ const mergedContent = entry.content.replace(/:$/, '') + ' > ' + child.content;
362
+ collapsed.push({
363
+ ...entry,
364
+ content: mergedContent,
365
+ role: child.role,
366
+ text: child.text,
367
+ trailingText: child.trailingText,
368
+ isInteractive: child.isInteractive,
369
+ });
370
+ i++;
371
+ continue;
372
+ }
373
+ }
374
+ }
375
+ collapsed.push(entry);
39
376
  }
40
- return result.join('\n');
377
+ return collapsed.map(e => ' '.repeat(e.depth) + e.content).join('\n');
41
378
  }
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Tests for snapshotFormatter.ts: Playwright MCP snapshot tree filtering.
3
+ *
4
+ * Uses sanitized excerpts from real websites (GitHub, Bilibili, Twitter)
5
+ * to validate noise filtering, annotation stripping, and output quality.
6
+ */
7
+ export {};