@lightcone-ai/daemon 0.15.52 → 0.15.54
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/mcp-servers/_thin-proxy/forward.js +80 -0
- package/mcp-servers/official/audience-research/index.js +24 -376
- package/mcp-servers/official/hook-pattern-library/index.js +17 -410
- package/mcp-servers/official/keyword-research/index.js +17 -324
- package/mcp-servers/official/page-understanding/index.js +17 -96
- package/mcp-servers/official/platform-policy-db/index.js +19 -264
- package/mcp-servers/official/video-narration-planner/index.js +30 -130
- package/package.json +1 -1
- package/mcp-servers/official/keyword-research/keyword-fixtures.json +0 -58
- package/mcp-servers/official/platform-policy-db/policy-fixtures.json +0 -257
- package/mcp-servers/official/video-narration-planner/core.js +0 -1403
- package/mcp-servers/official/video-narration-planner/planner-config.json +0 -112
- package/src/_vendor/video/understanding/analyze-page.js +0 -737
- package/src/_vendor/video/understanding/heuristics.js +0 -826
- package/src/_vendor/video/understanding/index.js +0 -11
- package/src/_vendor/video/understanding/llm-client.js +0 -261
- package/src/_vendor/video/understanding/schema.js +0 -254
- package/src/_vendor/video/understanding/site-selectors.js +0 -47
|
@@ -1,737 +0,0 @@
|
|
|
1
|
-
import { existsSync } from 'fs';
|
|
2
|
-
import { createRequire } from 'module';
|
|
3
|
-
import { dirname } from 'path';
|
|
4
|
-
import { fileURLToPath } from 'url';
|
|
5
|
-
|
|
6
|
-
import {
|
|
7
|
-
buildHeuristicCoreMessage,
|
|
8
|
-
buildHeuristicHighlights,
|
|
9
|
-
buildRecruitmentSemanticSlots,
|
|
10
|
-
computeFocusRange,
|
|
11
|
-
derivePageType,
|
|
12
|
-
} from './heuristics.js';
|
|
13
|
-
import { isAnthropicConfigured, runTextLlm, runVisionLlm } from './llm-client.js';
|
|
14
|
-
import { chooseUnderstandingPath, normalizePageUnderstanding } from './schema.js';
|
|
15
|
-
import { SEMANTIC_CORE_SELECTORS, SITE_CORE_SELECTORS } from './site-selectors.js';
|
|
16
|
-
|
|
17
|
-
// User agent string sent with every browser context so pages are served as
// if to Safari on an iPhone.
const IOS_UA = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';

// ES modules do not provide `__filename`, `__dirname` or `require`;
// rebuild all three from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const require = createRequire(import.meta.url);
|
|
22
|
-
|
|
23
|
-
/**
 * Fallback values used by `normalizeOptions` whenever a caller omits an
 * option or supplies a non-numeric value. Frozen so it cannot be mutated.
 */
const DEFAULT_OPTIONS = Object.freeze({
  // Browser viewport, in CSS pixels.
  viewportWidth: 1080,
  viewportHeight: 1920,

  // Extra wait after load before the DOM is inspected (ms).
  settleMs: 3500,
  // Navigation timeout (ms).
  timeoutMs: 120000,

  // Text binning: vertical bin size (px) and the character count a bin must exceed.
  binHeight: 400,
  minBinChars: 30,
  // Forwarded to `chooseUnderstandingPath` as `minCount`.
  minTextBins: 10,

  // Screenshot chunking for the vision path.
  maxVisionChunks: 6,
  visionChunkHeight: 2200,

  // Feature switches.
  allowVisionFallback: true,
  useLlm: true,
});
|
|
36
|
-
|
|
37
|
-
/**
 * Coerce `value` to an integer and clamp it into the inclusive range
 * [`min`, `max`].
 *
 * @param {*} value - Candidate value (number or numeric string).
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @param {number} fallback - Returned when `value` is missing or not a finite number.
 * @returns {number} The rounded, clamped integer, or `fallback`.
 */
function clampInt(value, min, max, fallback) {
  // Number(null) and Number('') are both 0, which would silently clamp an
  // unset option to `min` rather than using the intended default.
  if (value == null) return fallback;
  if (typeof value === 'string' && value.trim() === '') return fallback;

  const n = Number(value);
  if (!Number.isFinite(n)) return fallback;
  return Math.max(min, Math.min(max, Math.round(n)));
}
|
|
42
|
-
|
|
43
|
-
/**
 * Build a fully populated options object from caller input.
 *
 * Numeric options are clamped to fixed bounds and fall back to
 * `DEFAULT_OPTIONS`. The boolean switches are true unless explicitly set to
 * `false`. `executablePath` is trimmed, or `''` when not a string.
 *
 * @param {object} [options] - Raw caller options.
 * @returns {object} Normalized options.
 */
function normalizeOptions(options = {}) {
  // [option name, lower bound, upper bound]
  const numericBounds = [
    ['viewportWidth', 360, 2160],
    ['viewportHeight', 480, 3840],
    ['settleMs', 500, 30000],
    ['timeoutMs', 5000, 240000],
    ['binHeight', 200, 900],
    ['minBinChars', 12, 200],
    ['minTextBins', 3, 40],
    ['maxVisionChunks', 1, 12],
    ['visionChunkHeight', 700, 3000],
  ];

  const normalized = {};
  for (const [key, lower, upper] of numericBounds) {
    normalized[key] = clampInt(options[key], lower, upper, DEFAULT_OPTIONS[key]);
  }

  normalized.allowVisionFallback = options.allowVisionFallback !== false;
  normalized.useLlm = options.useLlm !== false;
  normalized.executablePath = typeof options.executablePath === 'string'
    ? options.executablePath.trim()
    : '';
  normalized.disableSandbox = options.disableSandbox !== false;
  return normalized;
}
|
|
60
|
-
|
|
61
|
-
/**
 * Locate the Readability script shipped by `@mozilla/readability`.
 *
 * @returns {string} Resolved path, or `''` when it cannot be resolved.
 */
function resolveReadabilityScriptPath() {
  let scriptPath = '';
  try {
    scriptPath = require.resolve('@mozilla/readability/Readability.js');
  } catch {
    // Resolution failed; callers treat '' as "not available".
  }
  return scriptPath;
}
|
|
68
|
-
|
|
69
|
-
/**
 * Find a Chrome/Chromium binary on disk.
 *
 * Candidates are tried in priority order: the explicit option, then three
 * environment variables, then common Linux install paths. The first one that
 * exists wins.
 *
 * @param {object} [options]
 * @param {string} [options.executablePath] - Preferred binary path.
 * @returns {string} Path of the first existing candidate, or `''` if none exist.
 */
function resolveChromeExecutable(options = {}) {
  const rawCandidates = [
    options.executablePath,
    process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH,
    process.env.GOOGLE_CHROME_BIN,
    process.env.GOOGLE_CHROME_SHIM,
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
  ];

  const usable = [];
  for (const raw of rawCandidates) {
    const candidate = String(raw ?? '').trim();
    if (candidate) usable.push(candidate);
  }

  const found = usable.find((candidate) => existsSync(candidate));
  return found === undefined ? '' : found;
}
|
|
88
|
-
|
|
89
|
-
/**
 * Lazily import `playwright-core` and return its `chromium` export.
 *
 * @returns {Promise<object>} The `chromium` export of `playwright-core`.
 * @throws {Error} `playwright_core_missing: ...` when the import fails. The
 *   underlying import error is attached as `cause` so the real reason
 *   (missing module, broken install, ...) is not lost.
 */
async function loadPlaywrightChromium() {
  try {
    const mod = await import('playwright-core');
    return mod.chromium;
  } catch (err) {
    throw new Error('playwright_core_missing: run `npm install playwright-core`', { cause: err });
  }
}
|
|
97
|
-
|
|
98
|
-
/**
 * Best-effort injection of the Readability script into `page`.
 *
 * @param {object} page - Page object exposing `addScriptTag`.
 * @returns {Promise<boolean>} `true` when the script tag was added, `false`
 *   when the script could not be located or the injection failed.
 */
async function maybeInjectReadability(page) {
  const scriptPath = resolveReadabilityScriptPath();
  if (scriptPath) {
    try {
      await page.addScriptTag({ path: scriptPath });
      return true;
    } catch {
      // Injection is optional; report failure through the return value.
    }
  }
  return false;
}
|
|
108
|
-
|
|
109
|
-
/**
 * Project arbitrary input onto the fixed set of meta fields.
 *
 * @param {*} meta - Raw meta object; non-objects are treated as empty.
 * @returns {object} Object with exactly the known meta keys. Any key that is
 *   missing or nullish in the input is `null`.
 */
function normalizeMeta(meta) {
  const source = meta && typeof meta === 'object' ? meta : {};
  const fieldNames = [
    'og_title',
    'og_description',
    'og_image',
    'title',
    'description',
    'publish_time',
    'date',
  ];
  return Object.fromEntries(
    fieldNames.map((field) => [field, source[field] ?? null]),
  );
}
|
|
121
|
-
|
|
122
|
-
/**
 * Inspect the loaded page and describe where its core content lives.
 *
 * All DOM work happens inside a single `page.evaluate` callback. That
 * callback runs in the page, so every helper it needs is defined inside it
 * and all inputs are passed in as the argument object.
 *
 * The core range is detected in priority order:
 *   1. site-specific selectors whose domain key is contained in the hostname,
 *   2. Readability (when `window.Readability` is available),
 *   3. generic semantic selectors,
 *   4. the whole body.
 *
 * @param {object} page - Page object exposing `evaluate`.
 * @param {object} opts
 * @param {number} opts.binHeight - Vertical size of one text bin, in px.
 * @param {number} opts.minBinChars - A bin is kept only when its text is longer than this.
 * @returns {Promise<object>} `{ strategy, hostname, total_height,
 *   viewport_height, core_y_range, focus_y_range, bins, hotspots, meta }`,
 *   with `meta` passed through `normalizeMeta`.
 */
async function extractStructure(page, { binHeight, minBinChars }) {
  const structure = await page.evaluate(({
    siteSelectors,
    semanticSelectors,
    binHeightPx,
    minBinCharsCount,
  }) => {
    // Collapse whitespace runs and trim.
    function cleanText(value) {
      return String(value ?? '').replace(/\s+/g, ' ').trim();
    }

    // An element counts as visible when it is at least `minHeight` x 200 px
    // and is not hidden via display/visibility.
    function isVisibleEnough(element, minHeight = 120) {
      if (!element) return false;
      const rect = element.getBoundingClientRect();
      if (!rect || rect.height < minHeight || rect.width < 200) return false;
      const style = window.getComputedStyle(element);
      return style.display !== 'none' && style.visibility !== 'hidden';
    }

    // First selector whose first match is visible enough, or null.
    function pickElementFromSelectors(selectors) {
      for (const selector of selectors) {
        const el = document.querySelector(selector);
        if (isVisibleEnough(el, 120)) return { element: el, selector };
      }
      return null;
    }

    // Turn matched y positions into a padded [top, bottom] range
    // (120px above the first match, 420px below the last).
    function rangeFromMatches(matches, fallbackTotalHeight) {
      if (!matches.length) return null;
      const ys = matches.map(item => item.y).filter(Number.isFinite);
      if (!ys.length) return null;
      const top = Math.max(0, Math.min(...ys) - 120);
      const bottom = Math.min(fallbackTotalHeight, Math.max(...ys) + 420);
      if (bottom <= top + 100) return null;
      return [Math.round(top), Math.round(bottom)];
    }

    // Use Readability's extracted article text to find where the article
    // sits in the live DOM: take short snippets of its sentences and look
    // for text nodes containing them.
    function readabilityRange() {
      if (typeof window.Readability !== 'function') return null;
      // Parse a clone of the document rather than the live one.
      const cloned = document.cloneNode(true);
      let article = null;
      try {
        article = new window.Readability(cloned).parse();
      } catch {
        article = null;
      }
      if (!article?.textContent || article.textContent.length < 260) return null;

      const snippets = article.textContent
        .split(/[\n。!?!?]/g)
        .map(text => cleanText(text))
        .filter(text => text.length >= 24)
        .slice(0, 28)
        .map(text => text.slice(0, 32));

      if (!snippets.length) return null;

      const root = document.scrollingElement || document.documentElement;
      const scrollTop = root.scrollTop;
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
        acceptNode(node) {
          const text = cleanText(node.nodeValue || '');
          if (text.length < 20) return NodeFilter.FILTER_REJECT;
          return NodeFilter.FILTER_ACCEPT;
        },
      });

      const matches = [];
      let node;
      while ((node = walker.nextNode())) {
        const text = cleanText(node.nodeValue || '');
        const hit = snippets.some(snippet => text.includes(snippet));
        if (!hit) continue;
        const parent = node.parentElement;
        if (!parent) continue;
        const rect = parent.getBoundingClientRect();
        const y = Math.round(rect.top + scrollTop);
        if (!Number.isFinite(y) || rect.height < 10) continue;
        matches.push({ y });
        if (matches.length >= 400) break;
      }

      return rangeFromMatches(matches, root.scrollHeight);
    }

    // Pull the bottom of the range up to the first noise element (comments,
    // related articles, footers, share bars) found below the body text.
    // `pageHeight` is accepted but not used.
    function trimNoiseFromBottom(range, pageHeight, detectionStrategy) {
      // Site-specific selectors already point to the exact content element — trust them.
      if (detectionStrategy && detectionStrategy.startsWith('site:')) return range;

      const [coreTop, rawBottom] = range;
      const minContentHeight = 600; // always keep at least 600px of core content

      // Selectors that reliably identify non-article noise below the body text.
      const noiseSelectors = [
        // Comment sections
        '[class*="comment-list"]', '[class*="comment-wrap"]', '[class*="commentList"]',
        '[id*="comment"]', '[class*="discuss"]', '[class*="reply-list"]',
        // Related / recommended articles
        '[class*="related-read"]', '[class*="related-article"]', '[class*="relatedArticle"]',
        '[class*="recommend"]', '[class*="more-article"]', '[class*="further-reading"]',
        // Article footer / bottom toolbars
        '[class*="article-footer"]', '[class*="post-footer"]', '[class*="article-bottom"]',
        // Social sharing bars that sit below the body
        '[class*="share-bar"]', '[class*="share-wrap"]', '[class*="article-share"]',
        // Generic page footer inside the core element
        'footer',
      ];

      const root = document.scrollingElement || document.documentElement;
      let trimBottom = rawBottom;

      for (const sel of noiseSelectors) {
        try {
          const elements = document.querySelectorAll(sel);
          for (const el of elements) {
            if (!isVisibleEnough(el, 40)) continue;
            const rect = el.getBoundingClientRect();
            const elTop = Math.round(rect.top + root.scrollTop);
            if (
              elTop > coreTop + minContentHeight
              && elTop < trimBottom
              && rect.height > 60
            ) {
              trimBottom = elTop;
            }
          }
        } catch {
          // ignore selector errors for individual sites
        }
      }

      return [coreTop, trimBottom];
    }

    // Group the text nodes inside the range into fixed-height vertical bins.
    function collectTextBins({ range, coreElement }) {
      const [rangeTop, rangeBottom] = range;
      const root = document.scrollingElement || document.documentElement;
      const scrollTop = root.scrollTop;
      const scope = coreElement || document.body;
      const chunks = [];

      const walker = document.createTreeWalker(scope, NodeFilter.SHOW_TEXT, {
        acceptNode(node) {
          const text = cleanText(node.nodeValue || '');
          if (text.length < 4) return NodeFilter.FILTER_REJECT;
          return NodeFilter.FILTER_ACCEPT;
        },
      });

      let node;
      while ((node = walker.nextNode())) {
        const text = cleanText(node.nodeValue || '');
        const parent = node.parentElement;
        if (!parent) continue;
        const rect = parent.getBoundingClientRect();
        if (!rect || rect.height < 8) continue;
        const y = Math.round(rect.top + scrollTop);
        if (y < rangeTop || y > rangeBottom) continue;
        chunks.push({ y, text });
      }

      chunks.sort((a, b) => a.y - b.y);
      const bins = [];
      // Align the first bin to a multiple of the bin height.
      const startY = Math.floor(rangeTop / binHeightPx) * binHeightPx;

      for (let cursor = startY, idx = 0; cursor < rangeBottom; cursor += binHeightPx) {
        const inBin = chunks.filter(item => item.y >= cursor && item.y < cursor + binHeightPx);
        if (!inBin.length) continue;
        const text = cleanText(inBin.map(item => item.text).join(' ')).slice(0, 420);
        if (text.length <= minBinCharsCount) continue;
        bins.push({
          id: `bin_${idx}`,
          y_start: cursor,
          y_center: cursor + Math.floor(binHeightPx / 2),
          text,
        });
        idx += 1;
      }

      return bins;
    }

    // Collect headings, emphasised text and images inside the range as
    // candidate points of interest, capped at `maxHotspots` in total.
    function collectHotspots({ range }) {
      const [rangeTop, rangeBottom] = range;
      const root = document.scrollingElement || document.documentElement;
      const scrollTop = root.scrollTop;
      const selectors = ['h1', 'h2', 'h3', 'strong', 'b', 'img'];
      const maxHotspots = 24;
      const list = [];

      for (const selector of selectors) {
        const nodes = document.querySelectorAll(selector);
        for (const node of nodes) {
          const rect = node.getBoundingClientRect();
          if (!rect || rect.height < 8 || rect.width < 40) continue;
          const y = Math.round(rect.top + scrollTop + rect.height / 2);
          if (!Number.isFinite(y) || y < rangeTop || y > rangeBottom) continue;
          const text = cleanText(node.textContent || '');
          list.push({
            id: `${selector}_${list.length + 1}`,
            y,
            type: selector,
            reason: text ? text.slice(0, 80) : `anchor_${selector}`,
            weight: selector.startsWith('h') ? 9 : selector === 'img' ? 8 : 7,
          });
          // Stop entirely once the cap is hit. Breaking only the inner loop
          // would let every remaining selector append one more entry.
          if (list.length >= maxHotspots) return list;
        }
      }

      return list;
    }

    // Read title/description/date hints from meta tags and well-known nodes.
    function collectMeta() {
      const pick = (selector, attr = 'content') => {
        const element = document.querySelector(selector);
        if (!element) return null;
        const value = element.getAttribute(attr);
        return cleanText(value);
      };

      return {
        og_title: pick('meta[property="og:title"]'),
        og_description: pick('meta[property="og:description"]'),
        og_image: pick('meta[property="og:image"]'),
        title: cleanText(document.title || ''),
        description: pick('meta[name="description"]'),
        publish_time: cleanText(document.querySelector('#publish_time')?.textContent || ''),
        date: cleanText(document.querySelector('#publish_time, time, meta[property=\"article:published_time\"]')?.getAttribute?.('content')
          || document.querySelector('#publish_time, time')?.textContent
          || ''),
      };
    }

    const host = String(location.hostname || '').toLowerCase();
    const root = document.scrollingElement || document.documentElement;
    const totalHeight = Math.max(root.scrollHeight, document.body?.scrollHeight || 0, window.innerHeight);
    let strategy = 'fallback:body';
    let coreElement = null;
    let coreRange = null;

    // 1. Site-specific selectors.
    for (const [domain, selectors] of Object.entries(siteSelectors || {})) {
      if (!host.includes(domain)) continue;
      const picked = pickElementFromSelectors(selectors);
      if (!picked) continue;
      const rect = picked.element.getBoundingClientRect();
      const top = Math.round(rect.top + root.scrollTop);
      const bottom = Math.round(rect.bottom + root.scrollTop);
      coreElement = picked.element;
      coreRange = [Math.max(0, top), Math.min(totalHeight, bottom)];
      strategy = `site:${domain}:${picked.selector}`;
      break;
    }

    // 2. Readability-derived range.
    if (!coreRange) {
      const readableRange = readabilityRange();
      if (readableRange) {
        coreRange = readableRange;
        strategy = 'readability';
      }
    }

    // 3. Generic semantic selectors.
    if (!coreRange) {
      const semanticPicked = pickElementFromSelectors(semanticSelectors || []);
      if (semanticPicked) {
        const rect = semanticPicked.element.getBoundingClientRect();
        const top = Math.round(rect.top + root.scrollTop);
        const bottom = Math.round(rect.bottom + root.scrollTop);
        coreElement = semanticPicked.element;
        coreRange = [Math.max(0, top), Math.min(totalHeight, bottom)];
        strategy = `semantic:${semanticPicked.selector}`;
      }
    }

    // 4. Whole page.
    if (!coreRange) {
      coreRange = [0, totalHeight];
      coreElement = document.body;
      strategy = 'fallback:body';
    }

    // Guard against an empty or inverted range.
    if (coreRange[1] <= coreRange[0]) {
      coreRange = [0, totalHeight];
      strategy = `${strategy}:range_fixed`;
    }

    // Trim coreBottom to exclude footer noise (comments, related articles, recommendations)
    // that commonly appear below the article body.
    coreRange = trimNoiseFromBottom(coreRange, totalHeight, strategy);

    const bins = collectTextBins({ range: coreRange, coreElement });
    // Short ranges are used whole; long ones keep only the top 70%.
    const focusRange = (() => {
      const span = coreRange[1] - coreRange[0];
      if (span <= 3000) return [...coreRange];
      return [coreRange[0], coreRange[0] + Math.round(span * 0.7)];
    })();

    return {
      strategy,
      hostname: host,
      total_height: totalHeight,
      viewport_height: window.innerHeight,
      core_y_range: coreRange,
      focus_y_range: focusRange,
      bins,
      hotspots: collectHotspots({ range: coreRange }),
      meta: collectMeta(),
    };
  }, {
    siteSelectors: SITE_CORE_SELECTORS,
    semanticSelectors: SEMANTIC_CORE_SELECTORS,
    binHeightPx: binHeight,
    minBinCharsCount: minBinChars,
  });

  return {
    ...structure,
    meta: normalizeMeta(structure.meta),
  };
}
|
|
439
|
-
|
|
440
|
-
/**
 * Screenshot the focus range of the page as a sequence of vertical chunks.
 *
 * @param {object} page - Page object exposing `screenshot`.
 * @param {object} structure - Must provide `focus_y_range` as `[top, bottom]`.
 * @param {object} options
 * @param {number} options.visionChunkHeight - Maximum height of one chunk, in px.
 * @param {number} options.maxVisionChunks - Maximum number of chunks to capture.
 * @param {number} options.viewportWidth - Width of the clip rectangle, in px.
 * @returns {Promise<Array<{id: string, y_start: number, y_end: number, image_base64: string}>>}
 */
async function captureVisionChunks(page, structure, options) {
  const chunks = [];
  const [focusTop, focusBottom] = structure.focus_y_range;
  const chunkHeight = options.visionChunkHeight;
  const maxChunks = options.maxVisionChunks;

  let y = focusTop;
  let idx = 0;
  while (y < focusBottom && idx < maxChunks) {
    // The last chunk may be shorter than the nominal chunk height.
    const height = Math.min(chunkHeight, focusBottom - y);
    const shot = await page.screenshot({
      type: 'png',
      fullPage: true,
      clip: {
        x: 0,
        y,
        width: options.viewportWidth,
        height,
      },
      encoding: 'base64',
    });

    // Playwright's screenshot() has no `encoding` option and resolves to a
    // Buffer, so encode binary results here. A string result is passed
    // through unchanged in case the page implementation honours `encoding`.
    const imageBase64 = shot instanceof Uint8Array
      ? Buffer.from(shot).toString('base64')
      : String(shot ?? '');

    chunks.push({
      id: `chunk_${idx}`,
      y_start: y,
      y_end: y + height,
      image_base64: imageBase64,
    });

    y += height;
    idx += 1;
  }

  return chunks;
}
|
|
475
|
-
|
|
476
|
-
/**
 * Sanitize a list of highlight rows and keep at most the first three that
 * fall inside the focus range.
 *
 * @param {*} highlights - Candidate rows; anything that is not an array yields `[]`.
 * @param {[number, number]} focusRange - Inclusive `[top, bottom]` bounds for `y`.
 * @returns {Array<{y: number, from_chunk: string, reason: string}>}
 */
function coerceHighlights(highlights, focusRange) {
  const [lowerY, upperY] = focusRange;
  const rows = Array.isArray(highlights) ? highlights : [];
  const accepted = [];

  for (const entry of rows) {
    const numericY = Number(entry?.y);
    if (!Number.isFinite(numericY)) continue;

    const y = Math.round(numericY);
    const insideFocus = y >= lowerY && y <= upperY;
    if (!insideFocus) continue;

    accepted.push({
      y,
      from_chunk: String(entry?.from_chunk ?? '').trim() || 'unknown',
      reason: String(entry?.reason ?? '').trim() || 'highlight',
    });
    if (accepted.length === 3) break;
  }

  return accepted;
}
|
|
487
|
-
|
|
488
|
-
/**
 * Produce a core message and highlights from the extracted text structure.
 *
 * Tries the text LLM when `options.useLlm` is set and Anthropic is
 * configured. If that is skipped or throws, the result is built from the
 * local heuristics instead.
 *
 * @param {object} args
 * @param {string} args.persona
 * @param {object} args.structure - Output of `extractStructure`.
 * @param {object} args.options - Normalized options.
 * @returns {Promise<object>} LLM result, or `{ core_message, highlights, model_used: 'heuristic' }`.
 */
async function runTextPath({ persona, structure, options }) {
  const llmAvailable = options.useLlm && isAnthropicConfigured();
  if (llmAvailable) {
    try {
      return await runTextLlm({ persona, structure });
    } catch {
      // fall through to heuristic
    }
  }

  const coreMessage = buildHeuristicCoreMessage({
    persona,
    meta: structure.meta,
    bins: structure.bins,
  });
  const heuristicHighlights = buildHeuristicHighlights({
    bins: structure.bins,
    hotspots: structure.hotspots,
    coreRange: structure.core_y_range,
    focusRange: structure.focus_y_range,
  });

  return {
    core_message: coreMessage,
    highlights: heuristicHighlights,
    model_used: 'heuristic',
  };
}
|
|
512
|
-
|
|
513
|
-
/**
 * Produce a core message and highlights from screenshots of the page.
 *
 * Screenshots are only captured when they will actually be sent to the
 * vision LLM (`options.useLlm` set and Anthropic configured). Previously
 * they were always captured and then discarded on the heuristic path. If
 * the LLM is unavailable, yields no chunks, or throws, the result is built
 * from the local heuristics.
 *
 * @param {object} args
 * @param {object} args.page - Page object used for screenshots.
 * @param {string} args.persona
 * @param {object} args.structure - Output of `extractStructure`.
 * @param {object} args.options - Normalized options.
 * @returns {Promise<object>} LLM result, or `{ core_message, highlights, model_used: 'heuristic' }`.
 */
async function runVisionPath({ page, persona, structure, options }) {
  if (options.useLlm && isAnthropicConfigured()) {
    const chunks = await captureVisionChunks(page, structure, options);
    if (chunks.length > 0) {
      try {
        return await runVisionLlm({
          persona,
          structure,
          chunks,
        });
      } catch {
        // fall through to heuristic
      }
    }
  }

  return {
    core_message: buildHeuristicCoreMessage({
      persona,
      meta: structure.meta,
      bins: structure.bins,
    }),
    highlights: buildHeuristicHighlights({
      bins: structure.bins,
      hotspots: structure.hotspots,
      coreRange: structure.core_y_range,
      focusRange: structure.focus_y_range,
    }),
    model_used: 'heuristic',
  };
}
|
|
542
|
-
|
|
543
|
-
/**
 * List the vertical regions that lie outside the core content range.
 *
 * @param {object} structure - Must provide `core_y_range` (`[top, bottom]`) and `total_height`.
 * @returns {Array<{y_range: [number, number], reason: string}>} Zero, one or
 *   two zones. A zone is omitted when it would be empty.
 */
function buildSkipZones(structure) {
  const [coreStart, coreEnd] = structure.core_y_range;
  const pageHeight = structure.total_height;

  const above = coreStart > 0
    ? { y_range: [0, coreStart], reason: 'non_core_top' }
    : null;
  const below = coreEnd < pageHeight
    ? { y_range: [coreEnd, pageHeight], reason: 'non_core_bottom' }
    : null;

  return [above, below].filter((zone) => zone !== null);
}
|
|
553
|
-
|
|
554
|
-
/**
 * Map the number of highlights to a recommended `[min, max]` duration in seconds.
 *
 * @param {number} highlightsCount
 * @returns {[number, number]} `[45, 65]` for three or more, `[35, 55]` for
 *   exactly two, otherwise `[28, 45]`.
 */
function buildRecommendedDuration(highlightsCount) {
  let duration = [28, 45];
  if (highlightsCount >= 3) {
    duration = [45, 65];
  } else if (highlightsCount === 2) {
    duration = [35, 55];
  }
  return duration;
}
|
|
559
|
-
|
|
560
|
-
/**
 * Load `url` in headless Chromium and produce a normalized page understanding.
 *
 * Steps: normalize options, launch the browser, navigate and let the page
 * settle, extract the content structure, run the text or vision path (text
 * may fall back to vision when it yields no highlights), then assemble the
 * result through `normalizePageUnderstanding`. The browser is always closed.
 *
 * @param {object} [args]
 * @param {string} args.url - Page to analyze (required).
 * @param {string} [args.persona=''] - Persona forwarded to the LLM and heuristics.
 * @param {object} [args.options={}] - Raw options; see `normalizeOptions`.
 * @returns {Promise<object>} Result of `normalizePageUnderstanding`.
 * @throws {Error} `url_required` when no URL is given, or
 *   `chrome_executable_not_found` when no browser binary can be located.
 */
export async function analyzePage({
  url,
  persona = '',
  options = {},
} = {}) {
  const targetUrl = String(url ?? '').trim();
  if (targetUrl === '') {
    throw new Error('url_required');
  }

  const settings = normalizeOptions(options);
  const chromium = await loadPlaywrightChromium();
  const executablePath = resolveChromeExecutable(settings);
  if (!executablePath) {
    throw new Error('chrome_executable_not_found');
  }

  const launchArgs = ['--disable-dev-shm-usage'];
  if (settings.disableSandbox) {
    launchArgs.unshift('--no-sandbox', '--disable-setuid-sandbox');
  }
  const browser = await chromium.launch({
    executablePath,
    headless: true,
    args: launchArgs,
  });

  let pagePath = 'vision';
  try {
    const context = await browser.newContext({
      viewport: {
        width: settings.viewportWidth,
        height: settings.viewportHeight,
      },
      isMobile: true,
      hasTouch: true,
      userAgent: IOS_UA,
      deviceScaleFactor: 1,
    });
    const page = await context.newPage();

    await page.goto(targetUrl, {
      waitUntil: 'domcontentloaded',
      timeout: settings.timeoutMs,
    });
    try {
      await page.waitForLoadState('networkidle', { timeout: 10000 });
    } catch {
      // ignore networkidle timeout
    }
    await page.waitForTimeout(settings.settleMs);
    await maybeInjectReadability(page);

    const structure = await extractStructure(page, settings);
    // Replace the in-page focus range with the shared heuristic's version.
    structure.focus_y_range = computeFocusRange(structure.core_y_range);
    structure.url = targetUrl;

    pagePath = chooseUnderstandingPath(structure, {
      minCount: settings.minTextBins,
      minChars: settings.minBinChars,
    });

    let llmResult = null;
    if (pagePath !== 'text') {
      llmResult = await runVisionPath({ page, persona, structure, options: settings });
    } else {
      llmResult = await runTextPath({ page, persona, structure, options: settings });
      const highlightCount = llmResult.highlights?.length ?? 0;
      if (settings.allowVisionFallback && highlightCount < 1) {
        const visionResult = await runVisionPath({ page, persona, structure, options: settings });
        llmResult = { ...visionResult, fallback_from: 'text' };
        pagePath = 'vision_fallback';
      }
    }

    const candidateHotspots = coerceHighlights(llmResult.highlights, structure.focus_y_range);
    const pageType = derivePageType({ hostname: structure.hostname, meta: structure.meta });

    const { semantic_slots, mode_hint, mode_hint_confidence } = buildRecruitmentSemanticSlots({
      pageType,
      url: targetUrl,
      meta: structure.meta,
      bins: structure.bins,
      hotspots: structure.hotspots,
      coreRange: structure.core_y_range,
      focusRange: structure.focus_y_range,
    });

    return normalizePageUnderstanding({
      url: targetUrl,
      page_type: pageType,
      hostname: structure.hostname,
      core_message: llmResult.core_message || buildHeuristicCoreMessage({
        persona,
        meta: structure.meta,
        bins: structure.bins,
      }),
      core_y_range: structure.core_y_range,
      focus_y_range: structure.focus_y_range,
      total_height: structure.total_height,
      candidate_hotspots: candidateHotspots,
      skip_zones: buildSkipZones(structure),
      recommended_duration_s: buildRecommendedDuration(candidateHotspots.length),
      semantic_slots,
      mode_hint,
      mode_hint_confidence,
      meta: structure.meta,
      hotspots: structure.hotspots,
      debug: {
        extraction_strategy: structure.strategy,
        path: pagePath,
        model_used: llmResult.model_used ?? 'heuristic',
        bins_count: Array.isArray(structure.bins) ? structure.bins.length : 0,
      },
    });
  } finally {
    // Closing is best-effort; a failure here must not mask the real outcome.
    await browser.close().catch(() => {});
  }
}
|
|
680
|
-
|
|
681
|
-
/**
 * Build a page understanding from pre-extracted fixture data. No browser or
 * LLM is involved; only the local heuristics run.
 *
 * Skip zones come from `buildSkipZones`, the same helper `analyzePage` uses,
 * so the fixture path no longer emits zero-height zones (for example when
 * `coreRange` starts at 0) that the live path would never produce.
 *
 * @param {object} [args]
 * @param {string} [args.url='https://fixture.local']
 * @param {string} [args.hostname='fixture.local']
 * @param {[number, number]} [args.coreRange=[200, 2200]]
 * @param {[number, number]} [args.focusRange=[200, 1800]]
 * @param {number} [args.totalHeight=3000]
 * @param {Array} [args.bins=[]]
 * @param {Array} [args.hotspots=[]]
 * @param {object} [args.meta={}]
 * @param {string} [args.persona='']
 * @returns {Promise<object>} Result of `normalizePageUnderstanding`.
 */
export async function analyzePageFromHtmlFixture({
  url = 'https://fixture.local',
  hostname = 'fixture.local',
  coreRange = [200, 2200],
  focusRange = [200, 1800],
  totalHeight = 3000,
  bins = [],
  hotspots = [],
  meta = {},
  persona = '',
} = {}) {
  const highlights = buildHeuristicHighlights({
    bins,
    hotspots,
    coreRange,
    focusRange,
  });

  const pageType = derivePageType({ hostname, meta });
  const semanticOutput = buildRecruitmentSemanticSlots({
    pageType,
    url,
    meta,
    bins,
    hotspots,
    coreRange,
    focusRange,
  });

  return normalizePageUnderstanding({
    url,
    page_type: pageType,
    hostname,
    core_message: buildHeuristicCoreMessage({ persona, meta, bins }),
    core_y_range: coreRange,
    focus_y_range: focusRange,
    total_height: totalHeight,
    candidate_hotspots: highlights,
    skip_zones: buildSkipZones({ core_y_range: coreRange, total_height: totalHeight }),
    recommended_duration_s: buildRecommendedDuration(highlights.length),
    semantic_slots: semanticOutput.semantic_slots,
    mode_hint: semanticOutput.mode_hint,
    mode_hint_confidence: semanticOutput.mode_hint_confidence,
    meta,
    hotspots,
    debug: {
      extraction_strategy: 'fixture',
      path: chooseUnderstandingPath({ bins }),
      model_used: 'heuristic',
      bins_count: bins.length,
      fixture_dir: __dirname,
    },
  });
}
|