khoji 2.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -0
- package/dist/ai/GeminiAdapter.d.ts +7 -0
- package/dist/ai/GeminiAdapter.d.ts.map +1 -0
- package/dist/ai/GeminiAdapter.js +40 -0
- package/dist/ai/GeminiAdapter.js.map +1 -0
- package/dist/browser/BrowserManager.d.ts +17 -0
- package/dist/browser/BrowserManager.d.ts.map +1 -0
- package/dist/browser/BrowserManager.js +61 -0
- package/dist/browser/BrowserManager.js.map +1 -0
- package/dist/browser/PageLoader.d.ts +21 -0
- package/dist/browser/PageLoader.d.ts.map +1 -0
- package/dist/browser/PageLoader.js +116 -0
- package/dist/browser/PageLoader.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +98 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/extractors/AnimationExtractor.d.ts +12 -0
- package/dist/extractors/AnimationExtractor.d.ts.map +1 -0
- package/dist/extractors/AnimationExtractor.js +247 -0
- package/dist/extractors/AnimationExtractor.js.map +1 -0
- package/dist/extractors/AssetExtractor.d.ts +11 -0
- package/dist/extractors/AssetExtractor.d.ts.map +1 -0
- package/dist/extractors/AssetExtractor.js +124 -0
- package/dist/extractors/AssetExtractor.js.map +1 -0
- package/dist/extractors/ContentExtractor.d.ts +13 -0
- package/dist/extractors/ContentExtractor.d.ts.map +1 -0
- package/dist/extractors/ContentExtractor.js +60 -0
- package/dist/extractors/ContentExtractor.js.map +1 -0
- package/dist/extractors/DomExtractor.d.ts +11 -0
- package/dist/extractors/DomExtractor.d.ts.map +1 -0
- package/dist/extractors/DomExtractor.js +68 -0
- package/dist/extractors/DomExtractor.js.map +1 -0
- package/dist/extractors/InteractionExtractor.d.ts +10 -0
- package/dist/extractors/InteractionExtractor.d.ts.map +1 -0
- package/dist/extractors/InteractionExtractor.js +64 -0
- package/dist/extractors/InteractionExtractor.js.map +1 -0
- package/dist/extractors/MetaExtractor.d.ts +8 -0
- package/dist/extractors/MetaExtractor.d.ts.map +1 -0
- package/dist/extractors/MetaExtractor.js +33 -0
- package/dist/extractors/MetaExtractor.js.map +1 -0
- package/dist/extractors/StyleExtractor.d.ts +10 -0
- package/dist/extractors/StyleExtractor.d.ts.map +1 -0
- package/dist/extractors/StyleExtractor.js +87 -0
- package/dist/extractors/StyleExtractor.js.map +1 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/output/Writer.d.ts +5 -0
- package/dist/output/Writer.d.ts.map +1 -0
- package/dist/output/Writer.js +13 -0
- package/dist/output/Writer.js.map +1 -0
- package/dist/pipeline/Cleaner.d.ts +12 -0
- package/dist/pipeline/Cleaner.d.ts.map +1 -0
- package/dist/pipeline/Cleaner.js +41 -0
- package/dist/pipeline/Cleaner.js.map +1 -0
- package/dist/pipeline/ComponentDetector.d.ts +8 -0
- package/dist/pipeline/ComponentDetector.d.ts.map +1 -0
- package/dist/pipeline/ComponentDetector.js +43 -0
- package/dist/pipeline/ComponentDetector.js.map +1 -0
- package/dist/pipeline/runner.d.ts +3 -0
- package/dist/pipeline/runner.d.ts.map +1 -0
- package/dist/pipeline/runner.js +182 -0
- package/dist/pipeline/runner.js.map +1 -0
- package/dist/prompting/PromptGenerator.d.ts +5 -0
- package/dist/prompting/PromptGenerator.d.ts.map +1 -0
- package/dist/prompting/PromptGenerator.js +30 -0
- package/dist/prompting/PromptGenerator.js.map +1 -0
- package/dist/serializer/JsonSerializer.d.ts +6 -0
- package/dist/serializer/JsonSerializer.d.ts.map +1 -0
- package/dist/serializer/JsonSerializer.js +7 -0
- package/dist/serializer/JsonSerializer.js.map +1 -0
- package/dist/serializer/MarkdownSerializer.d.ts +7 -0
- package/dist/serializer/MarkdownSerializer.d.ts.map +1 -0
- package/dist/serializer/MarkdownSerializer.js +143 -0
- package/dist/serializer/MarkdownSerializer.js.map +1 -0
- package/dist/types/KhojContext.d.ts +141 -0
- package/dist/types/KhojContext.d.ts.map +1 -0
- package/dist/types/KhojContext.js +6 -0
- package/dist/types/KhojContext.js.map +1 -0
- package/dist/utils/logger.d.ts +15 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +70 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/text.d.ts +2 -0
- package/dist/utils/text.d.ts.map +1 -0
- package/dist/utils/text.js +6 -0
- package/dist/utils/text.js.map +1 -0
- package/dist/utils/tokenEstimator.d.ts +10 -0
- package/dist/utils/tokenEstimator.d.ts.map +1 -0
- package/dist/utils/tokenEstimator.js +17 -0
- package/dist/utils/tokenEstimator.js.map +1 -0
- package/khoj-context.schema.json +48 -0
- package/package.json +75 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
 * AnimationExtractor — gathers animation information in three passes:
 *
 *   Pass 1: CSS @keyframes and transitions, read from document.styleSheets
 *   Pass 2: JavaScript animation libraries (GSAP, Framer Motion, AOS, Lottie, ...)
 *   Pass 3: purpose inference for GIFs, based on their alt text, selector and URL
 *
 * Passes 1 and 2 run inside the page and are started together; pass 3 is a
 * plain synchronous computation over the `gifs` argument.
 *
 * @param page  Page handle exposing `evaluate()`.
 * @param gifs  GIF asset records (`url`, `selector`, `alt`).
 * @returns An object with `cssAnimations`, `cssTransitions`, `jsAnimations`,
 *          `scrollAnimations`, `gifAnimations` and a one-line `summary`.
 */
export async function extractAnimations(page, gifs) {
    // Kick off both in-page passes before awaiting so they overlap.
    const cssPass = extractCSSAnimations(page);
    const jsPass = extractJSAnimations(page);
    const [
        { animations: cssAnimations, transitions: cssTransitions },
        { jsAnimations, scrollAnimations },
    ] = await Promise.all([cssPass, jsPass]);

    const gifAnimations = inferGifPurpose(gifs);

    return {
        cssAnimations,
        cssTransitions,
        jsAnimations,
        scrollAnimations,
        gifAnimations,
        summary: buildSummary(cssAnimations, cssTransitions, jsAnimations, scrollAnimations, gifAnimations),
    };
}
|
|
24
|
+
// ─── Pass 1: CSS ──────────────────────────────────────────────────────────────
|
|
25
|
+
/**
 * Pass 1 — reads CSS animations and transitions from the page's style sheets.
 *
 * Everything runs inside `page.evaluate`, so all helpers are defined within the
 * callback (it is executed in the page, not in this module's scope).
 *
 * Style sheets whose `cssRules` cannot be read (access throws) are skipped.
 * Rules nested inside grouping rules such as `@media`, `@supports` or `@layer`
 * are included. At most 50 animations and 50 transitions are returned, one per
 * distinct selector text.
 *
 * @param page  Page handle exposing `evaluate()`.
 * @returns `{ animations, transitions }`.
 */
async function extractCSSAnimations(page) {
    return page.evaluate(() => {
        const MAX_RESULTS = 50;

        // Flattens a rule list, descending into grouping rules. Keyframes rules are
        // not descended into here (their children are keyframe steps, handled below),
        // and style rules are kept as leaves.
        const collectRules = (ruleList, sink) => {
            for (const rule of Array.from(ruleList)) {
                sink.push(rule);
                if (rule instanceof CSSKeyframesRule || rule instanceof CSSStyleRule)
                    continue;
                let nested;
                try {
                    nested = rule.cssRules;
                }
                catch {
                    nested = undefined;
                }
                if (nested)
                    collectRules(nested, sink);
            }
        };

        // Splits a CSS value on commas that are not inside parentheses, so that
        // `cubic-bezier(0.4, 0, 0.2, 1)` stays in one piece.
        const splitTopLevel = (value) => {
            const pieces = [];
            let depth = 0;
            let current = '';
            for (const ch of value) {
                if (ch === '(')
                    depth += 1;
                else if (ch === ')')
                    depth = Math.max(0, depth - 1);
                if (ch === ',' && depth === 0) {
                    pieces.push(current.trim());
                    current = '';
                }
                else {
                    current += ch;
                }
            }
            pieces.push(current.trim());
            return pieces.filter(Boolean);
        };

        // First pseudo-class from `pseudos` that occurs in the selector, else `fallback`.
        const pseudoTrigger = (selector, pseudos, fallback) => pseudos.find((p) => selector.includes(`:${p}`)) ?? fallback;

        const allRules = [];
        for (const sheet of Array.from(document.styleSheets)) {
            let rules;
            try {
                rules = sheet.cssRules;
            }
            catch {
                continue; // unreadable sheet — skip it
            }
            collectRules(rules, allRules);
        }

        // name → description of steps
        const keyframeMap = new Map();
        for (const rule of allRules) {
            if (!(rule instanceof CSSKeyframesRule))
                continue;
            const steps = Array.from(rule.cssRules)
                .map((r) => `${r.keyText}: ${r.style.cssText.slice(0, 80)}`)
                .join('; ');
            keyframeMap.set(rule.name, steps);
        }

        // Animation names declared by a rule. Prefers `animation-name`; otherwise
        // picks the shorthand token that names a known @keyframes block, falling
        // back to the first token.
        const animationNamesOf = (style) => {
            const explicit = style.getPropertyValue('animation-name');
            if (explicit)
                return splitTopLevel(explicit);
            const tokens = (style.getPropertyValue('animation') ?? '').split(/\s+/).filter(Boolean);
            if (tokens.length === 0)
                return [];
            return [tokens.find((t) => keyframeMap.has(t)) ?? tokens[0]];
        };

        const animationRules = [];
        const transitionRules = [];
        const processedSelectors = new Set();

        for (const rule of allRules) {
            if (!(rule instanceof CSSStyleRule))
                continue;
            const style = rule.style;
            const selectorText = rule.selectorText;

            // CSS animations
            const names = animationNamesOf(style).filter((n) => n !== 'none');
            const animKey = `anim:${selectorText}`;
            if (names.length > 0 && !processedSelectors.has(animKey)) {
                processedSelectors.add(animKey);
                const animName = names.join(', ');
                const keyframeDesc = names
                    .map((n) => keyframeMap.get(n))
                    .filter(Boolean)
                    .join(' | ');
                animationRules.push({
                    name: animName,
                    selector: selectorText.slice(0, 120),
                    duration: style.getPropertyValue('animation-duration') || '1s',
                    timingFunction: style.getPropertyValue('animation-timing-function') || 'ease',
                    iterationCount: style.getPropertyValue('animation-iteration-count') || '1',
                    delay: style.getPropertyValue('animation-delay') || '0s',
                    trigger: pseudoTrigger(selectorText, ['hover', 'focus'], 'page-load'),
                    description: keyframeDesc
                        ? `"${animName}" — ${keyframeDesc.slice(0, 150)}`
                        : `Animation "${animName}" applied`,
                });
            }

            // CSS transitions
            const transitionProp = style.getPropertyValue('transition');
            const transKey = `trans:${selectorText}`;
            if (transitionProp && transitionProp !== 'none' && !processedSelectors.has(transKey)) {
                processedSelectors.add(transKey);
                const trigger = pseudoTrigger(selectorText, ['hover', 'focus', 'active'], 'unknown');
                const props = splitTopLevel(transitionProp).map((segment) => segment.split(/\s+/)[0]);
                transitionRules.push({
                    selector: selectorText.slice(0, 120),
                    properties: props,
                    // Accept both `s` and `ms` units; the first time value is the duration.
                    duration: transitionProp.match(/[\d.]+m?s\b/)?.[0] ?? '0.3s',
                    timingFunction: transitionProp.match(/ease[a-z-]*|linear|step-start|step-end|steps\([^)]+\)|cubic-bezier\([^)]+\)/)?.[0] ?? 'ease',
                    trigger,
                    description: `Transition on ${props.join(', ')} — ${trigger}`,
                });
            }
        }

        return {
            animations: animationRules.slice(0, MAX_RESULTS),
            transitions: transitionRules.slice(0, MAX_RESULTS),
        };
    });
}
|
|
110
|
+
// ─── Pass 2: JS Libraries ─────────────────────────────────────────────────────
|
|
111
|
+
/**
 * Pass 2 — detects JavaScript animation libraries and scroll-driven animations.
 *
 * Detection is heuristic: it checks for well-known globals on `window` and for
 * marker attributes/elements in the DOM. Runs inside `page.evaluate`, so all
 * helpers live inside the callback.
 *
 * @param page  Page handle exposing `evaluate()`.
 * @returns `{ jsAnimations, scrollAnimations }`. Each scroll library contributes
 *          at most 20 elements.
 */
async function extractJSAnimations(page) {
    return page.evaluate(() => {
        const win = window;
        const MAX_SCROLL_ELEMENTS = 20;

        // `tag.firstClass` (or just `tag`). Uses classList rather than className,
        // because className is an SVGAnimatedString object on SVG elements and a
        // raw string that may start with whitespace on HTML elements.
        const describeElement = (el) => {
            const tag = el.tagName.toLowerCase();
            const firstClass = el.classList[0];
            return firstClass ? `${tag}.${firstClass}` : tag;
        };

        // Probes are evaluated in order; every probe that fires adds one entry.
        const libraryProbes = [
            {
                library: 'gsap',
                selector: 'document',
                description: 'GSAP detected — timeline-based animation library in use',
                present: () => Boolean(win['gsap']),
            },
            {
                library: 'framer-motion',
                selector: '[data-framer-motion]',
                description: 'Framer Motion detected — declarative React animation library',
                present: () => Boolean(win['Motion'] || win['__framer_motion__'] || document.querySelector('[data-framer-motion]')),
            },
            {
                library: 'animejs',
                selector: 'document',
                description: 'Anime.js detected — lightweight JavaScript animation library',
                present: () => Boolean(win['anime']),
            },
            {
                library: 'lottie',
                selector: 'lottie-player, [data-lottie]',
                description: 'Lottie detected — JSON-based vector animations in use',
                present: () => Boolean(win['lottie'] || win['Lottie'] || document.querySelector('lottie-player, [data-lottie]')),
            },
            {
                // Element.prototype.animate existing says nothing about usage, so this
                // is only flagged when marker elements are present (heuristic).
                library: 'web-animations-api',
                selector: '[data-animate], .animate',
                description: 'Web Animations API in use',
                present: () => typeof Element.prototype.animate === 'function' &&
                    document.querySelector('[data-animate], .animate') !== null,
            },
        ];

        const jsAnimations = libraryProbes
            .filter((probe) => probe.present())
            .map(({ library, selector, description }) => ({
                library,
                selector,
                description,
                trigger: 'page-load',
            }));

        const scrollAnimations = [];

        // AOS (scroll animations)
        if (win['AOS'] || document.querySelector('[data-aos]')) {
            const aosEls = Array.from(document.querySelectorAll('[data-aos]')).slice(0, MAX_SCROLL_ELEMENTS);
            for (const el of aosEls) {
                // `||` so that an empty attribute also falls back to the default.
                const animationType = el.getAttribute('data-aos') || 'fade';
                scrollAnimations.push({
                    selector: describeElement(el),
                    library: 'aos',
                    animationType,
                    description: `AOS scroll animation: ${animationType} — triggers when element enters viewport`,
                });
            }
        }

        // GSAP ScrollTrigger heuristic
        if (win['ScrollTrigger'] || (win['gsap'] && document.querySelector('[data-scroll]'))) {
            const scrollEls = Array.from(document.querySelectorAll('[data-scroll], [data-scroll-trigger]'))
                .slice(0, MAX_SCROLL_ELEMENTS);
            for (const el of scrollEls) {
                scrollAnimations.push({
                    selector: describeElement(el),
                    library: 'gsap-scrolltrigger',
                    animationType: el.getAttribute('data-scroll') || 'scroll-driven',
                    description: 'GSAP ScrollTrigger — animation tied to scroll position',
                });
            }
        }

        return { jsAnimations, scrollAnimations };
    });
}
|
|
191
|
+
// ─── Pass 3: GIF Purpose Inference ───────────────────────────────────────────
|
|
192
|
+
/**
 * Pass 3 — guesses what each GIF is for from its alt text, selector and URL path.
 *
 * Keywords only count when they start a word (i.e. are not preceded by a letter),
 * so `ajax-loader.gif` is a spinner while `/uploads/cat.gif` is not. The URL's
 * scheme and host are ignored so that a domain name such as `example.com` cannot
 * decide the classification.
 *
 * Categories are tested in order: loading-spinner, tutorial / product-demo,
 * decorative; anything else is `'unknown'`.
 *
 * @param gifs  GIF records with `url`, `selector` and `alt` (missing `alt` /
 *              `selector` / `url` are treated as empty strings for matching).
 * @returns One `{ url, selector, alt, purpose }` object per input, in input order.
 */
function inferGifPurpose(gifs) {
    const SPINNER = /(?<![a-z])(?:load|spin|wait|progress|preload|buffer)/;
    const SHOWCASE = /(?<![a-z])(?:demo|preview|product|feature|how|tutorial|guide|walkthrough|example)/;
    const TUTORIAL = /(?<![a-z])tutorial/;
    const DECORATIVE = /(?<![a-z])(?:bg|background|decoration|hero|banner|pattern|abstract)/;

    // Path + query of an absolute URL; relative or unparsable values are used as-is.
    const pathOf = (url) => {
        try {
            const parsed = new URL(url);
            return `${parsed.pathname}${parsed.search}`;
        }
        catch {
            return url;
        }
    };

    return gifs.map((gif) => {
        const haystack = `${gif.alt ?? ''} ${gif.selector ?? ''} ${pathOf(String(gif.url ?? ''))}`.toLowerCase();

        let purpose = 'unknown';
        if (SPINNER.test(haystack)) {
            purpose = 'loading-spinner';
        }
        else if (SHOWCASE.test(haystack)) {
            purpose = TUTORIAL.test(haystack) ? 'tutorial' : 'product-demo';
        }
        else if (DECORATIVE.test(haystack)) {
            purpose = 'decorative';
        }

        return {
            url: gif.url,
            selector: gif.selector,
            alt: gif.alt,
            purpose,
        };
    });
}
|
|
218
|
+
// ─── Summary Builder ──────────────────────────────────────────────────────────
|
|
219
|
+
/**
 * Builds the one-line, human-readable summary of everything the three passes found.
 *
 * Each non-empty category contributes one sentence fragment; fragments are joined
 * with `'. '` and terminated with a period. Counts are pluralised, and every
 * distinct scroll-animation library is listed.
 *
 * @param cssAnimations     Entries with a `name`.
 * @param cssTransitions    Entries with a `trigger`.
 * @param jsAnimations      Entries with a `library`.
 * @param scrollAnimations  Entries with a `library`.
 * @param gifAnimations     Entries with a `purpose`.
 * @returns The summary sentence, or `'No animations detected.'` when all inputs are empty.
 */
function buildSummary(cssAnimations, cssTransitions, jsAnimations, scrollAnimations, gifAnimations) {
    const plural = (count, noun) => `${count} ${noun}${count === 1 ? '' : 's'}`;
    const unique = (values) => [...new Set(values)];
    const countWhere = (items, predicate) => items.filter(predicate).length;

    const parts = [];

    if (cssAnimations.length > 0) {
        // Show up to three distinct names; the total counts rules, not names.
        const names = unique(cssAnimations.map((a) => a.name)).slice(0, 3).join(', ');
        parts.push(`CSS animations: ${names} (${cssAnimations.length} total)`);
    }

    if (cssTransitions.length > 0) {
        const hoverCount = countWhere(cssTransitions, (t) => t.trigger === 'hover');
        const hoverNote = hoverCount > 0 ? ` (${hoverCount} on hover)` : '';
        parts.push(`${plural(cssTransitions.length, 'CSS transition')}${hoverNote}`);
    }

    if (jsAnimations.length > 0) {
        parts.push(`JS: ${unique(jsAnimations.map((j) => j.library)).join(', ')}`);
    }

    if (scrollAnimations.length > 0) {
        const libs = unique(scrollAnimations.map((s) => s.library ?? 'scroll')).join(', ');
        parts.push(`${plural(scrollAnimations.length, 'scroll animation')} (${libs})`);
    }

    if (gifAnimations.length > 0) {
        const spinners = countWhere(gifAnimations, (g) => g.purpose === 'loading-spinner');
        const demos = countWhere(gifAnimations, (g) => g.purpose === 'product-demo');
        const detail = [
            spinners > 0 && plural(spinners, 'spinner'),
            demos > 0 && plural(demos, 'demo'),
        ]
            .filter(Boolean)
            .join(', ');
        parts.push(`${plural(gifAnimations.length, 'GIF')}${detail ? ` (${detail})` : ''}`);
    }

    return parts.length > 0 ? parts.join('. ') + '.' : 'No animations detected.';
}
|
|
247
|
+
//# sourceMappingURL=AnimationExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AnimationExtractor.js","sourceRoot":"","sources":["../../src/extractors/AnimationExtractor.ts"],"names":[],"mappings":"AAcA;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACnC,IAAU,EACV,IAAkB;IAElB,MAAM,CAAC,SAAS,EAAE,QAAQ,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC5C,oBAAoB,CAAC,IAAI,CAAC;QAC1B,mBAAmB,CAAC,IAAI,CAAC;KAC5B,CAAC,CAAC;IAEH,MAAM,aAAa,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;IAE5C,MAAM,OAAO,GAAG,YAAY,CACxB,SAAS,CAAC,UAAU,EACpB,SAAS,CAAC,WAAW,EACrB,QAAQ,CAAC,YAAY,EACrB,QAAQ,CAAC,gBAAgB,EACzB,aAAa,CAChB,CAAC;IAEF,OAAO;QACH,aAAa,EAAE,SAAS,CAAC,UAAU;QACnC,cAAc,EAAE,SAAS,CAAC,WAAW;QACrC,YAAY,EAAE,QAAQ,CAAC,YAAY;QACnC,gBAAgB,EAAE,QAAQ,CAAC,gBAAgB;QAC3C,aAAa;QACb,OAAO;KACV,CAAC;AACN,CAAC;AAED,iFAAiF;AAEjF,KAAK,UAAU,oBAAoB,CAC/B,IAAU;IAEV,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAiE,EAAE;QACpF,MAAM,WAAW,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAG,8BAA8B;QAC/E,MAAM,cAAc,GAAmB,EAAE,CAAC;QAC1C,MAAM,eAAe,GAAoB,EAAE,CAAC;QAE5C,+BAA+B;QAC/B,KAAK,MAAM,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;YACnD,IAAI,KAAkB,CAAC;YACvB,IAAI,CAAC;gBAAC,KAAK,GAAG,KAAK,CAAC,QAAQ,CAAC;YAAC,CAAC;YAAC,MAAM,CAAC;gBAAC,SAAS;YAAC,CAAC;YAEnD,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBACnC,IAAI,IAAI,YAAY,gBAAgB,EAAE,CAAC;oBACnC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC;yBAClC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAI,CAAqB,CAAC,OAAO,KAAM,CAAqB,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;yBACrG,IAAI,CAAC,IAAI,CAAC,CAAC;oBAChB,WAAW,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;gBACtC,CAAC;YACL,CAAC;QACL,CAAC;QAED,uDAAuD;QACvD,MAAM,kBAAkB,GAAG,IAAI,GAAG,EAAU,CAAC;QAE7C,KAAK,MAAM,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;YACnD,IAAI,KAAkB,CAAC;YACvB,IAAI,CAAC;gBAAC,KAAK,GAAG,KAAK,CAAC,QAAQ,CAAC;YAAC,CAAC;YAAC,MAAM,CAAC;gBAAC,SAAS;YAAC,CAAC;YAEnD,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBACnC,IAAI,CAAC,CAAC,IAAI,YAAY,YAAY,CAAC;oBAAE,SAAS;gBAC9C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC;gBAEzB,iBAAiB;gBACjB,MAAM,QAAQ,GAAG,KA
AK,CAAC,gBAAgB,CAAC,gBAAgB,CAAC;oBACrD,KAAK,CAAC,gBAAgB,CAAC,WAAW,CAAC,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAE7D,IAAI,QAAQ,IAAI,QAAQ,KAAK,MAAM,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,YAAY,EAAE,CAAC,EAAE,CAAC;oBAC1F,kBAAkB,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,YAAY,EAAE,CAAC,CAAC;oBACpD,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;oBACrD,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,OAAO;wBAC1D,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,OAAO;4BAC5C,CAAC,CAAC,WAAW,CAAC;oBAEtB,cAAc,CAAC,IAAI,CAAC;wBAChB,IAAI,EAAE,QAAQ;wBACd,QAAQ,EAAE,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;wBACzC,QAAQ,EAAE,KAAK,CAAC,gBAAgB,CAAC,oBAAoB,CAAC,IAAI,IAAI;wBAC9D,cAAc,EAAE,KAAK,CAAC,gBAAgB,CAAC,2BAA2B,CAAC,IAAI,MAAM;wBAC7E,cAAc,EAAE,KAAK,CAAC,gBAAgB,CAAC,2BAA2B,CAAC,IAAI,GAAG;wBAC1E,KAAK,EAAE,KAAK,CAAC,gBAAgB,CAAC,iBAAiB,CAAC,IAAI,IAAI;wBACxD,OAAO;wBACP,WAAW,EAAE,YAAY;4BACrB,CAAC,CAAC,IAAI,QAAQ,OAAO,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;4BACjD,CAAC,CAAC,cAAc,QAAQ,WAAW;qBAC1C,CAAC,CAAC;gBACP,CAAC;gBAED,kBAAkB;gBAClB,MAAM,cAAc,GAAG,KAAK,CAAC,gBAAgB,CAAC,YAAY,CAAC,CAAC;gBAC5D,IAAI,cAAc,IAAI,cAAc,KAAK,MAAM,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,SAAS,IAAI,CAAC,YAAY,EAAE,CAAC,EAAE,CAAC;oBACvG,kBAAkB,CAAC,GAAG,CAAC,SAAS,IAAI,CAAC,YAAY,EAAE,CAAC,CAAC;oBACrD,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,OAAO;wBAC1D,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,OAAO;4BAC5C,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,QAAQ;gCAC9C,CAAC,CAAC,SAAS,CAAC;oBAExB,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;oBAE3E,eAAe,CAAC,IAAI,CAAC;wBACjB,QAAQ,EAAE,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;wBACzC,UAAU,EAAE,KAAK;wBACjB,QAAQ,EAAE,cAAc,CAAC,KAAK,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,MAAM;wBACxD,cAAc,EAAE,cAAc,CAAC,KAAK,CAAC,0CAA0C,CAAC,EAAE,CA
AC,CAAC,CAAC,IAAI,MAAM;wBAC/F,OAAO,EAAE,OAAmC;wBAC5C,WAAW,EAAE,iBAAiB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,OAAO,EAAE;qBAChE,CAAC,CAAC;gBACP,CAAC;YACL,CAAC;QACL,CAAC;QAED,OAAO;YACH,UAAU,EAAE,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;YACvC,WAAW,EAAE,eAAe,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;SAC5C,CAAC;IACN,CAAC,CAAC,CAAC;AACP,CAAC;AAED,iFAAiF;AAEjF,KAAK,UAAU,mBAAmB,CAC9B,IAAU;IAEV,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAyE,EAAE;QAC5F,MAAM,GAAG,GAAG,MAA4C,CAAC;QACzD,MAAM,YAAY,GAAkB,EAAE,CAAC;QACvC,MAAM,gBAAgB,GAAsB,EAAE,CAAC;QAE/C,OAAO;QACP,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YACd,YAAY,CAAC,IAAI,CAAC;gBACd,OAAO,EAAE,MAAM;gBACf,QAAQ,EAAE,UAAU;gBACpB,WAAW,EAAE,yDAAyD;gBACtE,OAAO,EAAE,WAAW;aACvB,CAAC,CAAC;QACP,CAAC;QAED,gBAAgB;QAChB,IAAI,GAAG,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,mBAAmB,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,sBAAsB,CAAC,EAAE,CAAC;YAC9F,YAAY,CAAC,IAAI,CAAC;gBACd,OAAO,EAAE,eAAe;gBACxB,QAAQ,EAAE,sBAAsB;gBAChC,WAAW,EAAE,8DAA8D;gBAC3E,OAAO,EAAE,WAAW;aACvB,CAAC,CAAC;QACP,CAAC;QAED,WAAW;QACX,IAAI,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACf,YAAY,CAAC,IAAI,CAAC;gBACd,OAAO,EAAE,SAAS;gBAClB,QAAQ,EAAE,UAAU;gBACpB,WAAW,EAAE,8DAA8D;gBAC3E,OAAO,EAAE,WAAW;aACvB,CAAC,CAAC;QACP,CAAC;QAED,SAAS;QACT,IAAI,GAAG,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,8BAA8B,CAAC,EAAE,CAAC;YAC3F,YAAY,CAAC,IAAI,CAAC;gBACd,OAAO,EAAE,QAAQ;gBACjB,QAAQ,EAAE,8BAA8B;gBACxC,WAAW,EAAE,uDAAuD;gBACpE,OAAO,EAAE,WAAW;aACvB,CAAC,CAAC;QACP,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,OAAO,CAAC,SAAS,CAAC,OAAO,KAAK,UAAU,EAAE,CAAC;YAClD,kGAAkG;YAClG,MAAM,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAC,0BAA0B,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YAClF,IAAI,QAAQ,EAAE,CAAC;gBACX,YAAY,CAAC,IAAI,CAAC;oBACd,OAAO,EAAE,oBAAoB;oBAC7B,QAAQ,EAAE,0BAA0B;oBACpC,WAAW,EAAE,2BAA2B;oBACxC,OAAO,EAAE,WAAW;iBACvB,CAAC,CAAC;YACP,CAAC;QACL,CAAC;QAED,0BAA0B;QAC1B,IAAI,GAAG,CAAC,KAAK,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,YAAY,CAAC,EAAE,CAAC;YACrD,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,YAAY,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAChF,MAAM,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,
EAAE;gBAClB,gBAAgB,CAAC,IAAI,CAAC;oBAClB,QAAQ,EAAE,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBACtG,OAAO,EAAE,KAAK;oBACd,aAAa,EAAE,EAAE,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,MAAM;oBACpD,WAAW,EAAE,yBAAyB,EAAE,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,MAAM,0CAA0C;iBACxH,CAAC,CAAC;YACP,CAAC,CAAC,CAAC;QACP,CAAC;QAED,+BAA+B;QAC/B,IAAI,GAAG,CAAC,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,eAAe,CAAC,CAAC,EAAE,CAAC;YACnF,QAAQ,CAAC,gBAAgB,CAAC,sCAAsC,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;gBAC7E,gBAAgB,CAAC,IAAI,CAAC;oBAClB,QAAQ,EAAE,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE;oBAClC,OAAO,EAAE,oBAAoB;oBAC7B,aAAa,EAAE,EAAE,CAAC,YAAY,CAAC,aAAa,CAAC,IAAI,eAAe;oBAChE,WAAW,EAAE,wDAAwD;iBACxE,CAAC,CAAC;YACP,CAAC,CAAC,CAAC;QACP,CAAC;QAED,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,CAAC;IAC9C,CAAC,CAAC,CAAC;AACP,CAAC;AAED,gFAAgF;AAEhF,SAAS,eAAe,CAAC,IAAkB;IACvC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;QACpB,MAAM,QAAQ,GAAG,GAAG,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,QAAQ,IAAI,GAAG,CAAC,GAAG,EAAE,CAAC,WAAW,EAAE,CAAC;QAEvE,IAAI,OAAO,GAAe,SAAS,CAAC;QAEpC,IAAI,wCAAwC,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1D,OAAO,GAAG,iBAAiB,CAAC;QAChC,CAAC;aAAM,IAAI,qEAAqE,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC9F,OAAO,GAAG,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,UAAU,CAAC;gBACrF,CAAC,CAAC,UAAU;gBACZ,CAAC,CAAC,cAAc,CAAC;QACzB,CAAC;aAAM,IAAI,uDAAuD,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;YAChF,OAAO,GAAG,YAAY,CAAC;QAC3B,CAAC;aAAM,IAAI,GAAG,CAAC,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7D,OAAO,GAAG,YAAY,CAAC;QAC3B,CAAC;QAED,OAAO;YACH,GAAG,EAAE,GAAG,CAAC,GAAG;YACZ,QAAQ,EAAE,GAAG,CAAC,QAAQ;YACtB,GAAG,EAAE,GAAG,CAAC,GAAG;YACZ,OAAO;SACV,CAAC;IACN,CAAC,CAAC,CAAC;AACP,CAAC;AAED,iFAAiF;AAEjF,SAAS,YAAY,CACjB,aAA6B,EAC7B,cAA+B,EAC/B,YAA2B,EAC3B,gBAAmC,EACnC,aAA6B;IAE7B,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,
EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpF,KAAK,CAAC,IAAI,CAAC,mBAAmB,KAAK,KAAK,aAAa,CAAC,MAAM,SAAS,CAAC,CAAC;IAC3E,CAAC;IAED,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5B,MAAM,UAAU,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,OAAO,CAAC,CAAC,MAAM,CAAC;QAC9E,KAAK,CAAC,IAAI,CAAC,GAAG,cAAc,CAAC,MAAM,mBAAmB,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,UAAU,YAAY,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC/G,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzE,KAAK,CAAC,IAAI,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC;IAC9B,CAAC;IAED,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC9B,MAAM,GAAG,GAAG,gBAAgB,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,QAAQ,CAAC;QACrD,KAAK,CAAC,IAAI,CAAC,GAAG,gBAAgB,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC,CAAC;IACxE,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,MAAM,QAAQ,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,iBAAiB,CAAC,CAAC,MAAM,CAAC;QACrF,MAAM,KAAK,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,cAAc,CAAC,CAAC,MAAM,CAAC;QAC/E,MAAM,MAAM,GAAG,CAAC,QAAQ,GAAG,CAAC,IAAI,GAAG,QAAQ,UAAU,EAAE,KAAK,GAAG,CAAC,IAAI,GAAG,KAAK,OAAO,CAAC;aAC/E,MAAM,CAAC,OAAO,CAAC;aACf,IAAI,CAAC,IAAI,CAAC,CAAC;QAChB,KAAK,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,MAAM,OAAO,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,KAAK,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACnH,CAAC;IAED,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,yBAAyB,CAAC;AACjF,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { Page } from 'playwright';
|
|
2
|
+
import type { AssetMap } from '../types/KhojContext.js';
|
|
3
|
+
/**
|
|
4
|
+
* Collects all assets from the page:
|
|
5
|
+
* - Images and GIFs with metadata (alt, dimensions, lazy-load flag, CSS selector)
|
|
6
|
+
* - External scripts
|
|
7
|
+
* - Favicon and icon links
|
|
8
|
+
* - Font URLs from @font-face rules
|
|
9
|
+
*/
|
|
10
|
+
// `baseUrl` is the base that relative asset URLs are resolved against; the promise resolves to the collected AssetMap.
export declare function extractAssets(page: Page, baseUrl: string): Promise<AssetMap>;
|
|
11
|
+
//# sourceMappingURL=AssetExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AssetExtractor.d.ts","sourceRoot":"","sources":["../../src/extractors/AssetExtractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,KAAK,EAAE,QAAQ,EAAc,MAAM,yBAAyB,CAAC;AAEpE;;;;;;GAMG;AACH,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,CAAC,CA2HlF"}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
 * Collects all assets from the page:
 *  - Images and GIFs with metadata (alt, dimensions, lazy-load flag, CSS selector)
 *  - External scripts (origin differs from the page's origin)
 *  - Favicon and icon links
 *  - Font URLs from @font-face rules
 *
 * The DOM is inspected inside `page.evaluate`; afterwards GIF URLs listed in the
 * page's `window.__khoj_gifs__` array (if present) are merged in.
 * NOTE(review): that array is presumably filled by a network interceptor set up
 * elsewhere — confirm against the page loader.
 *
 * @param page     Page handle exposing `evaluate()`.
 * @param baseUrl  Base used to turn relative URLs into absolute ones.
 * @returns `{ images, gifs, fonts, icons, scripts }`; script, icon and font lists
 *          are de-duplicated.
 */
export async function extractAssets(page, baseUrl) {
    const result = await page.evaluate(({ base }) => {
        // Resolves `url` against `against` (the page base by default). Empty values,
        // data: URIs and unparsable URLs are returned unchanged.
        function toAbsolute(url, against = base) {
            if (!url || url.startsWith('data:'))
                return url;
            try {
                return new URL(url, against).href;
            }
            catch {
                return url;
            }
        }

        // Short descriptive selector: `#id`, or up to four `tag.firstClass` steps
        // joined with ` > `, stopping below <body>. classList is used instead of
        // className because className is not a string on SVG elements.
        function getCssSelector(el) {
            if (el.id)
                return `#${el.id}`;
            const parts = [];
            let current = el;
            while (current && current !== document.body && parts.length < 4) {
                let selector = current.tagName.toLowerCase();
                const firstClass = current.classList[0];
                if (firstClass)
                    selector += `.${firstClass}`;
                parts.unshift(selector);
                current = current.parentElement;
            }
            return parts.join(' > ');
        }

        // True when `.gif` is a file extension: at the end of the URL or directly
        // followed by a query, fragment, parameter or path separator. This rejects
        // look-alikes such as `.gifv` or a host named `my.gifs.com`.
        function isGif(url) {
            return /\.gif(?:$|[?#&/])/i.test(url);
        }

        // ── Images ────────────────────────────────────────────────────────────
        const rawImages = [];
        document.querySelectorAll('img').forEach((img) => {
            const declaredSrc = img.getAttribute('src') ?? '';
            const lazySrc = img.getAttribute('data-src') ?? '';
            // Lazy-loaded images often carry an empty or data: placeholder in `src`
            // and the real URL in `data-src`; prefer the real one in that case.
            const src = (!declaredSrc || declaredSrc.startsWith('data:')) && lazySrc ? lazySrc : declaredSrc;
            if (!src || src.startsWith('data:'))
                return;

            // Natural dimensions describe whatever `src` loaded; when the URL came
            // from `data-src` they belong to the placeholder, so use attributes only.
            const usesLazySource = src !== declaredSrc;
            const dimension = (natural, attribute) => (usesLazySource ? 0 : natural) ||
                Number.parseInt(img.getAttribute(attribute) ?? '0', 10) ||
                null;

            const absoluteSrc = toAbsolute(src);
            rawImages.push({
                url: absoluteSrc,
                alt: img.getAttribute('alt') ?? '',
                type: isGif(absoluteSrc) ? 'gif' : 'image',
                width: dimension(img.naturalWidth, 'width'),
                height: dimension(img.naturalHeight, 'height'),
                isLazy: img.getAttribute('loading') === 'lazy' || !!img.getAttribute('data-src'),
                selector: getCssSelector(img),
            });
        });

        // ── Scripts ───────────────────────────────────────────────────────────
        const scripts = [];
        document.querySelectorAll('script[src]').forEach((s) => {
            const src = s.src;
            if (!src)
                return;
            let isExternal;
            try {
                // Compare parsed origins: a string prefix test would treat
                // `https://a.com.evil.test` as same-origin with `https://a.com`.
                isExternal = new URL(src).origin !== window.location.origin;
            }
            catch {
                isExternal = !src.startsWith(window.location.origin);
            }
            if (isExternal)
                scripts.push(toAbsolute(src));
        });

        // ── Icons / Favicons ──────────────────────────────────────────────────
        const icons = [];
        document.querySelectorAll('link[rel~="icon"], link[rel~="apple-touch-icon"], link[rel~="shortcut"]').forEach((l) => {
            const href = l.href;
            if (href)
                icons.push(toAbsolute(href));
        });

        // ── Fonts ─────────────────────────────────────────────────────────────
        const fonts = [];
        for (const sheet of Array.from(document.styleSheets)) {
            try {
                // url() values in a linked sheet are relative to the sheet, not the page.
                const sheetBase = sheet.href ?? base;
                for (const rule of Array.from(sheet.cssRules)) {
                    if (!(rule instanceof CSSFontFaceRule))
                        continue;
                    const src = rule.style.getPropertyValue('src');
                    for (const match of src.matchAll(/url\(['"]?([^'")\s]+)['"]?\)/g)) {
                        fonts.push(toAbsolute(match[1], sheetBase));
                    }
                }
            }
            catch {
                continue; // unreadable sheet — skip it
            }
        }

        return {
            rawImages,
            scripts: [...new Set(scripts)],
            icons: [...new Set(icons)],
            fonts: [...new Set(fonts)],
        };
    }, { base: baseUrl });

    // GIF URLs recorded on the page under window.__khoj_gifs__ (empty when absent).
    const networkGifs = await page.evaluate(() => {
        return window.__khoj_gifs__ ?? [];
    });

    const allImages = result.rawImages;

    // Merge GIFs that were not found in <img> tags, adding each URL only once.
    const knownUrls = new Set(allImages.map((i) => i.url));
    for (const gifUrl of networkGifs) {
        if (knownUrls.has(gifUrl))
            continue;
        knownUrls.add(gifUrl);
        allImages.push({
            url: gifUrl,
            alt: '',
            type: 'gif',
            width: null,
            height: null,
            isLazy: false,
            selector: '',
        });
    }

    return {
        images: allImages.filter((i) => i.type === 'image'),
        gifs: allImages.filter((i) => i.type === 'gif'),
        fonts: result.fonts,
        icons: result.icons,
        scripts: result.scripts,
    };
}
|
|
124
|
+
//# sourceMappingURL=AssetExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AssetExtractor.js","sourceRoot":"","sources":["../../src/extractors/AssetExtractor.ts"],"names":[],"mappings":"AAGA;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,IAAU,EAAE,OAAe;IAC3D,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAC9B,CAAC,EAAE,IAAI,EAAoB,EAAwD,EAAE;QACjF,SAAS,UAAU,CAAC,GAAW;YAC3B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC;gBAAE,OAAO,GAAG,CAAC;YAChD,IAAI,CAAC;gBACD,OAAO,IAAI,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC;YACnC,CAAC;YAAC,MAAM,CAAC;gBACL,OAAO,GAAG,CAAC;YACf,CAAC;QACL,CAAC;QAED,SAAS,cAAc,CAAC,EAAW;YAC/B,IAAI,EAAE,CAAC,EAAE;gBAAE,OAAO,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,IAAI,OAAO,GAAmB,EAAE,CAAC;YACjC,OAAO,OAAO,IAAI,OAAO,KAAK,QAAQ,CAAC,IAAI,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC9D,IAAI,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;gBAC7C,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;oBACpB,MAAM,UAAU,GAAG,OAAO,CAAC,SAAS,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;oBACvE,IAAI,UAAU;wBAAE,QAAQ,IAAI,IAAI,UAAU,EAAE,CAAC;gBACjD,CAAC;gBACD,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;gBACxB,OAAO,GAAG,OAAO,CAAC,aAAa,CAAC;YACpC,CAAC;YACD,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC7B,CAAC;QAED,SAAS,KAAK,CAAC,GAAW;YACtB,OAAO,GAAG,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpF,CAAC;QAED,yEAAyE;QACzE,MAAM,SAAS,GAAiB,EAAE,CAAC;QACnC,QAAQ,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;YAC7C,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;YAC1E,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC;gBAAE,OAAO;YAE5C,MAAM,WAAW,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;YACpC,SAAS,CAAC,IAAI,CAAC;gBACX,GAAG,EAAE,WAAW;gBAChB,GAAG,EAAE,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE;gBAClC,IAAI,EAAE,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO;gBAC1C,KAAK,EAAE,GAAG,CAAC,YAAY,IAAI,QAAQ,CAAC,GAAG,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,IAAI;gBAC7E,MAAM,EAAE,GAAG,CAAC,aAAa,IAAI,QAAQ,CAAC,GAAG,CAAC,YA
AY,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,IAAI,IAAI;gBAChF,MAAM,EAAE,GAAG,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,MAAM,IAAI,CAAC,CAAC,GAAG,CAAC,YAAY,CAAC,UAAU,CAAC;gBAChF,QAAQ,EAAE,cAAc,CAAC,GAAG,CAAC;aAChC,CAAC,CAAC;QACP,CAAC,CAAC,CAAC;QAEH,yEAAyE;QACzE,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,QAAQ,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE;YACnD,MAAM,GAAG,GAAI,CAAuB,CAAC,GAAG,CAAC;YACzC,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;gBACjD,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;YAClC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,yEAAyE;QACzE,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,QAAQ,CAAC,gBAAgB,CAAC,yEAAyE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE;YAC/G,MAAM,IAAI,GAAI,CAAqB,CAAC,IAAI,CAAC;YACzC,IAAI,IAAI;gBAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,yEAAyE;QACzE,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,MAAM,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;YACnD,IAAI,CAAC;gBACD,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAC5C,IAAI,IAAI,YAAY,eAAe,EAAE,CAAC;wBAClC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC;wBAC/C,MAAM,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,+BAA+B,CAAC,IAAI,EAAE,CAAC;wBACjE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE;4BAClB,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;4BACzD,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC;gCAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;wBAC3D,CAAC,CAAC,CAAC;oBACP,CAAC;gBACL,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACL,SAAS;YACb,CAAC;QACL,CAAC;QAED,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC;IAC7H,CAAC,EACD,EAAE,IAAI,EAAE,OAAO,EAAE,CACpB,CAAC;IAEF,mDAAmD;IACnD,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAa,EAAE;QACnD,OAAQ,MAAgD,CAAC,aAAa,IAAI,EAAE,CAAC;IACjF,CAAC,CAAC,CAAC;IAEH,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;IAEnC,2DAA2D;IAC3D,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,
GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvD,KAAK,MAAM,MAAM,IAAI,WAAW,EAAE,CAAC;QAC/B,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YACzB,SAAS,CAAC,IAAI,CAAC;gBACX,GAAG,EAAE,MAAM;gBACX,GAAG,EAAE,EAAE;gBACP,IAAI,EAAE,KAAK;gBACX,KAAK,EAAE,IAAI;gBACX,MAAM,EAAE,IAAI;gBACZ,MAAM,EAAE,KAAK;gBACb,QAAQ,EAAE,EAAE;aACf,CAAC,CAAC;QACP,CAAC;IACL,CAAC;IAED,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC,CAAC;IAC3D,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC;IAEvD,OAAO;QACH,MAAM;QACN,IAAI;QACJ,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,OAAO,EAAE,MAAM,CAAC,OAAO;KAC1B,CAAC;AACN,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Page } from 'playwright';
|
|
2
|
+
import type { ContentBlock } from '../types/KhojContext.js';
|
|
3
|
+
/**
|
|
4
|
+
* Extracts structured content blocks:
|
|
5
|
+
* - Headings h1–h6 with level
|
|
6
|
+
* - Paragraphs (truncated to 300 chars)
|
|
7
|
+
* - Button labels
|
|
8
|
+
* - Anchor links with href
|
|
9
|
+
* - Form labels
|
|
10
|
+
* Deduplicates by block type and text.
|
|
11
|
+
*/
|
|
12
|
+
// Resolves to the blocks in collection order: headings (h1 through h6),
// then paragraphs, buttons, links and labels.
export declare function extractContent(page: Page): Promise<ContentBlock[]>;
|
|
13
|
+
//# sourceMappingURL=ContentExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ContentExtractor.d.ts","sourceRoot":"","sources":["../../src/extractors/ContentExtractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAI5D;;;;;;;;GAQG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,CAuDxE"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
const MAX_PARAGRAPH_LENGTH = 300;
|
|
2
|
+
/**
|
|
3
|
+
* Extracts structured content blocks:
|
|
4
|
+
* - Headings h1–h6 with level
|
|
5
|
+
* - Paragraphs (truncated to 300 chars)
|
|
6
|
+
* - Button labels
|
|
7
|
+
* - Anchor links with href
|
|
8
|
+
* - Form labels
|
|
9
|
+
* Deduplicates by text.
|
|
10
|
+
*/
|
|
11
|
+
export async function extractContent(page) {
|
|
12
|
+
return page.evaluate(({ maxParaLen }) => {
|
|
13
|
+
const blocks = [];
|
|
14
|
+
const seen = new Set();
|
|
15
|
+
function add(block) {
|
|
16
|
+
const key = `${block.type}:${block.text}`;
|
|
17
|
+
if (!seen.has(key) && block.text.length > 0) {
|
|
18
|
+
seen.add(key);
|
|
19
|
+
blocks.push(block);
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
// Headings
|
|
23
|
+
for (let level = 1; level <= 6; level++) {
|
|
24
|
+
document.querySelectorAll(`h${level}`).forEach((el) => {
|
|
25
|
+
const text = el.textContent?.trim() ?? '';
|
|
26
|
+
if (text)
|
|
27
|
+
add({ type: 'heading', text, level });
|
|
28
|
+
});
|
|
29
|
+
}
|
|
30
|
+
// Paragraphs
|
|
31
|
+
document.querySelectorAll('p').forEach((el) => {
|
|
32
|
+
const text = (el.textContent?.trim() ?? '').slice(0, maxParaLen);
|
|
33
|
+
if (text.split(' ').length >= 3) {
|
|
34
|
+
add({ type: 'paragraph', text });
|
|
35
|
+
}
|
|
36
|
+
});
|
|
37
|
+
// Buttons
|
|
38
|
+
document.querySelectorAll('button, [role="button"], input[type="submit"], input[type="button"]').forEach((el) => {
|
|
39
|
+
const text = el.textContent?.trim() ?? el.value ?? '';
|
|
40
|
+
if (text)
|
|
41
|
+
add({ type: 'button', text });
|
|
42
|
+
});
|
|
43
|
+
// Links
|
|
44
|
+
document.querySelectorAll('a[href]').forEach((el) => {
|
|
45
|
+
const text = el.textContent?.trim() ?? '';
|
|
46
|
+
const href = el.href;
|
|
47
|
+
if (text && href && !href.startsWith('javascript:')) {
|
|
48
|
+
add({ type: 'link', text, href });
|
|
49
|
+
}
|
|
50
|
+
});
|
|
51
|
+
// Labels
|
|
52
|
+
document.querySelectorAll('label').forEach((el) => {
|
|
53
|
+
const text = el.textContent?.trim() ?? '';
|
|
54
|
+
if (text)
|
|
55
|
+
add({ type: 'label', text });
|
|
56
|
+
});
|
|
57
|
+
return blocks;
|
|
58
|
+
}, { maxParaLen: MAX_PARAGRAPH_LENGTH });
|
|
59
|
+
}
|
|
60
|
+
//# sourceMappingURL=ContentExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ContentExtractor.js","sourceRoot":"","sources":["../../src/extractors/ContentExtractor.ts"],"names":[],"mappings":"AAGA,MAAM,oBAAoB,GAAG,GAAG,CAAC;AAEjC;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAU;IAC3C,OAAO,IAAI,CAAC,QAAQ,CAChB,CAAC,EAAE,UAAU,EAA0B,EAAkB,EAAE;QACvD,MAAM,MAAM,GAAmB,EAAE,CAAC;QAClC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAE/B,SAAS,GAAG,CAAC,KAAmB;YAC5B,MAAM,GAAG,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC1C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;gBACd,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACvB,CAAC;QACL,CAAC;QAED,WAAW;QACX,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,KAAK,EAAE,EAAE,CAAC;YACtC,QAAQ,CAAC,gBAAgB,CAAC,IAAI,KAAK,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;gBAClD,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;gBAC1C,IAAI,IAAI;oBAAE,GAAG,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;YACpD,CAAC,CAAC,CAAC;QACP,CAAC;QAED,aAAa;QACb,QAAQ,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;YAC1C,MAAM,IAAI,GAAG,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;YACjE,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;gBAC9B,GAAG,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC;YACrC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,UAAU;QACV,QAAQ,CAAC,gBAAgB,CAAC,qEAAqE,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;YAC5G,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAK,EAAuB,CAAC,KAAK,IAAI,EAAE,CAAC;YAC5E,IAAI,IAAI;gBAAE,GAAG,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,QAAQ;QACR,QAAQ,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;YAChD,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC1C,MAAM,IAAI,GAAI,EAAwB,CAAC,IAAI,CAAC;YAC5C,IAAI,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;gBAClD,GAAG,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;YACtC,CAAC;QACL,CAAC,
CAAC,CAAC;QAEH,SAAS;QACT,QAAQ,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;YAC9C,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC1C,IAAI,IAAI;gBAAE,GAAG,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAClB,CAAC,EACD,EAAE,UAAU,EAAE,oBAAoB,EAAE,CACvC,CAAC;AACN,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { Page } from 'playwright';
|
|
2
|
+
import type { DomNode } from '../types/KhojContext.js';
|
|
3
|
+
/**
|
|
4
|
+
* Extracts a semantic DOM tree from the page body.
|
|
5
|
+
* - Skips noise tags (script, style, svg, etc.)
|
|
6
|
+
* - Caps depth at 10 levels
|
|
7
|
+
* - Trims text content to 200 characters
|
|
8
|
+
* - Captures: tag, id, classes, role, text
|
|
9
|
+
*/
|
|
10
|
+
// Resolves to one tree per direct child element of <body> that survives
// filtering, or to an empty array when the document has no body.
export declare function extractDom(page: Page): Promise<DomNode[]>;
|
|
11
|
+
//# sourceMappingURL=DomExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"DomExtractor.d.ts","sourceRoot":"","sources":["../../src/extractors/DomExtractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,yBAAyB,CAAC;AAYvD;;;;;;GAMG;AACH,wBAAsB,UAAU,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC,CA4D/D"}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// Tags to skip entirely — they add noise, not signal
|
|
2
|
+
const NOISE_TAGS = new Set([
|
|
3
|
+
'script', 'style', 'noscript', 'svg', 'path', 'defs',
|
|
4
|
+
'symbol', 'use', 'clippath', 'head', 'meta', 'link',
|
|
5
|
+
'br', 'hr', 'wbr', 'template', 'iframe',
|
|
6
|
+
]);
|
|
7
|
+
const MAX_DEPTH = 10;
|
|
8
|
+
const MAX_TEXT_LENGTH = 200;
|
|
9
|
+
/**
|
|
10
|
+
* Extracts a semantic DOM tree from the page body.
|
|
11
|
+
* - Skips noise tags (script, style, svg, etc.)
|
|
12
|
+
* - Caps depth at 10 levels
|
|
13
|
+
* - Trims text content to 200 characters
|
|
14
|
+
* - Captures: tag, id, classes, role, text
|
|
15
|
+
*/
|
|
16
|
+
export async function extractDom(page) {
|
|
17
|
+
return page.evaluate(({ noiseTags, maxDepth, maxTextLen }) => {
|
|
18
|
+
const noiseSet = new Set(noiseTags);
|
|
19
|
+
function walk(el, depth) {
|
|
20
|
+
if (depth > maxDepth)
|
|
21
|
+
return null;
|
|
22
|
+
const tag = el.tagName.toLowerCase();
|
|
23
|
+
if (noiseSet.has(tag))
|
|
24
|
+
return null;
|
|
25
|
+
const children = [];
|
|
26
|
+
for (const child of Array.from(el.children)) {
|
|
27
|
+
const node = walk(child, depth + 1);
|
|
28
|
+
if (node)
|
|
29
|
+
children.push(node);
|
|
30
|
+
}
|
|
31
|
+
// Get direct text (not from children)
|
|
32
|
+
let text;
|
|
33
|
+
const directText = Array.from(el.childNodes)
|
|
34
|
+
.filter((n) => n.nodeType === Node.TEXT_NODE)
|
|
35
|
+
.map((n) => n.textContent?.trim() ?? '')
|
|
36
|
+
.join(' ')
|
|
37
|
+
.trim();
|
|
38
|
+
if (directText.length > 0) {
|
|
39
|
+
text = directText.slice(0, maxTextLen);
|
|
40
|
+
}
|
|
41
|
+
const node = {
|
|
42
|
+
tag,
|
|
43
|
+
classes: Array.from(el.classList),
|
|
44
|
+
children: children.length > 0 ? children : undefined,
|
|
45
|
+
};
|
|
46
|
+
const id = el.getAttribute('id');
|
|
47
|
+
if (id)
|
|
48
|
+
node.id = id;
|
|
49
|
+
const role = el.getAttribute('role');
|
|
50
|
+
if (role)
|
|
51
|
+
node.role = role;
|
|
52
|
+
if (text)
|
|
53
|
+
node.text = text;
|
|
54
|
+
// Prune leaf nodes with no info
|
|
55
|
+
if (!text && !node.id && node.classes.length === 0 && !node.role && !children.length) {
|
|
56
|
+
return null;
|
|
57
|
+
}
|
|
58
|
+
return node;
|
|
59
|
+
}
|
|
60
|
+
const body = document.body;
|
|
61
|
+
if (!body)
|
|
62
|
+
return [];
|
|
63
|
+
return Array.from(body.children)
|
|
64
|
+
.map((child) => walk(child, 1))
|
|
65
|
+
.filter((n) => n !== null);
|
|
66
|
+
}, { noiseTags: Array.from(NOISE_TAGS), maxDepth: MAX_DEPTH, maxTextLen: MAX_TEXT_LENGTH });
|
|
67
|
+
}
|
|
68
|
+
//# sourceMappingURL=DomExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"DomExtractor.js","sourceRoot":"","sources":["../../src/extractors/DomExtractor.ts"],"names":[],"mappings":"AAGA,qDAAqD;AACrD,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC;IACvB,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM;IACpD,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IACnD,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,UAAU,EAAE,QAAQ;CAC1C,CAAC,CAAC;AAEH,MAAM,SAAS,GAAG,EAAE,CAAC;AACrB,MAAM,eAAe,GAAG,GAAG,CAAC;AAE5B;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,IAAU;IACvC,OAAO,IAAI,CAAC,QAAQ,CAChB,CAAC,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAiE,EAAa,EAAE;QAC9G,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;QAEpC,SAAS,IAAI,CAAC,EAAW,EAAE,KAAa;YACpC,IAAI,KAAK,GAAG,QAAQ;gBAAE,OAAO,IAAI,CAAC;YAElC,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YACrC,IAAI,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,OAAO,IAAI,CAAC;YAEnC,MAAM,QAAQ,GAAc,EAAE,CAAC;YAC/B,KAAK,MAAM,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC1C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;gBACpC,IAAI,IAAI;oBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClC,CAAC;YAED,sCAAsC;YACtC,IAAI,IAAwB,CAAC;YAC7B,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC;iBACvC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,IAAI,CAAC,SAAS,CAAC;iBAC5C,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;iBACvC,IAAI,CAAC,GAAG,CAAC;iBACT,IAAI,EAAE,CAAC;YAEZ,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;YAC3C,CAAC;YAED,MAAM,IAAI,GAAY;gBAClB,GAAG;gBACH,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,SAAS,CAAC;gBACjC,QAAQ,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;aACvD,CAAC;YAEF,MAAM,EAAE,GAAG,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;YACjC,IAAI,EAAE;gBAAE,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC;YAErB,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YACrC,IAAI,IAAI;gBAAE,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;YAE3B,IAAI,IAAI;gBAAE,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;YAE3B,gCAAgC;YAChC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,EAAE,IAA
I,IAAI,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC;gBACnF,OAAO,IAAI,CAAC;YAChB,CAAC;YAED,OAAO,IAAI,CAAC;QAChB,CAAC;QAED,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;QAC3B,IAAI,CAAC,IAAI;YAAE,OAAO,EAAE,CAAC;QAErB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC;aAC3B,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;aAC9B,MAAM,CAAC,CAAC,CAAC,EAAgB,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC;IACjD,CAAC,EACD,EAAE,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,eAAe,EAAE,CAC1F,CAAC;AACN,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { Page } from 'playwright';
|
|
2
|
+
import type { Interaction } from '../types/KhojContext.js';
|
|
3
|
+
/**
|
|
4
|
+
* Maps interactive regions of the page:
|
|
5
|
+
* - Forms with all their input fields
|
|
6
|
+
* - Navigation elements
|
|
7
|
+
* - Interactive button-like elements
|
|
8
|
+
*/
|
|
9
|
+
// Type declaration only: takes a Playwright Page and resolves to an array of
// Interaction records (type imported from ../types/KhojContext.js).
export declare function extractInteractions(page: Page): Promise<Interaction[]>;
|
|
10
|
+
//# sourceMappingURL=InteractionExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"InteractionExtractor.d.ts","sourceRoot":"","sources":["../../src/extractors/InteractionExtractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,KAAK,EAAE,WAAW,EAAa,MAAM,yBAAyB,CAAC;AAEtE;;;;;GAKG;AACH,wBAAsB,mBAAmB,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAmE5E"}
|