khoji 2.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -0
- package/dist/ai/GeminiAdapter.d.ts +7 -0
- package/dist/ai/GeminiAdapter.d.ts.map +1 -0
- package/dist/ai/GeminiAdapter.js +40 -0
- package/dist/ai/GeminiAdapter.js.map +1 -0
- package/dist/browser/BrowserManager.d.ts +17 -0
- package/dist/browser/BrowserManager.d.ts.map +1 -0
- package/dist/browser/BrowserManager.js +61 -0
- package/dist/browser/BrowserManager.js.map +1 -0
- package/dist/browser/PageLoader.d.ts +21 -0
- package/dist/browser/PageLoader.d.ts.map +1 -0
- package/dist/browser/PageLoader.js +116 -0
- package/dist/browser/PageLoader.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +98 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/extractors/AnimationExtractor.d.ts +12 -0
- package/dist/extractors/AnimationExtractor.d.ts.map +1 -0
- package/dist/extractors/AnimationExtractor.js +247 -0
- package/dist/extractors/AnimationExtractor.js.map +1 -0
- package/dist/extractors/AssetExtractor.d.ts +11 -0
- package/dist/extractors/AssetExtractor.d.ts.map +1 -0
- package/dist/extractors/AssetExtractor.js +124 -0
- package/dist/extractors/AssetExtractor.js.map +1 -0
- package/dist/extractors/ContentExtractor.d.ts +13 -0
- package/dist/extractors/ContentExtractor.d.ts.map +1 -0
- package/dist/extractors/ContentExtractor.js +60 -0
- package/dist/extractors/ContentExtractor.js.map +1 -0
- package/dist/extractors/DomExtractor.d.ts +11 -0
- package/dist/extractors/DomExtractor.d.ts.map +1 -0
- package/dist/extractors/DomExtractor.js +68 -0
- package/dist/extractors/DomExtractor.js.map +1 -0
- package/dist/extractors/InteractionExtractor.d.ts +10 -0
- package/dist/extractors/InteractionExtractor.d.ts.map +1 -0
- package/dist/extractors/InteractionExtractor.js +64 -0
- package/dist/extractors/InteractionExtractor.js.map +1 -0
- package/dist/extractors/MetaExtractor.d.ts +8 -0
- package/dist/extractors/MetaExtractor.d.ts.map +1 -0
- package/dist/extractors/MetaExtractor.js +33 -0
- package/dist/extractors/MetaExtractor.js.map +1 -0
- package/dist/extractors/StyleExtractor.d.ts +10 -0
- package/dist/extractors/StyleExtractor.d.ts.map +1 -0
- package/dist/extractors/StyleExtractor.js +87 -0
- package/dist/extractors/StyleExtractor.js.map +1 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/output/Writer.d.ts +5 -0
- package/dist/output/Writer.d.ts.map +1 -0
- package/dist/output/Writer.js +13 -0
- package/dist/output/Writer.js.map +1 -0
- package/dist/pipeline/Cleaner.d.ts +12 -0
- package/dist/pipeline/Cleaner.d.ts.map +1 -0
- package/dist/pipeline/Cleaner.js +41 -0
- package/dist/pipeline/Cleaner.js.map +1 -0
- package/dist/pipeline/ComponentDetector.d.ts +8 -0
- package/dist/pipeline/ComponentDetector.d.ts.map +1 -0
- package/dist/pipeline/ComponentDetector.js +43 -0
- package/dist/pipeline/ComponentDetector.js.map +1 -0
- package/dist/pipeline/runner.d.ts +3 -0
- package/dist/pipeline/runner.d.ts.map +1 -0
- package/dist/pipeline/runner.js +182 -0
- package/dist/pipeline/runner.js.map +1 -0
- package/dist/prompting/PromptGenerator.d.ts +5 -0
- package/dist/prompting/PromptGenerator.d.ts.map +1 -0
- package/dist/prompting/PromptGenerator.js +30 -0
- package/dist/prompting/PromptGenerator.js.map +1 -0
- package/dist/serializer/JsonSerializer.d.ts +6 -0
- package/dist/serializer/JsonSerializer.d.ts.map +1 -0
- package/dist/serializer/JsonSerializer.js +7 -0
- package/dist/serializer/JsonSerializer.js.map +1 -0
- package/dist/serializer/MarkdownSerializer.d.ts +7 -0
- package/dist/serializer/MarkdownSerializer.d.ts.map +1 -0
- package/dist/serializer/MarkdownSerializer.js +143 -0
- package/dist/serializer/MarkdownSerializer.js.map +1 -0
- package/dist/types/KhojContext.d.ts +141 -0
- package/dist/types/KhojContext.d.ts.map +1 -0
- package/dist/types/KhojContext.js +6 -0
- package/dist/types/KhojContext.js.map +1 -0
- package/dist/utils/logger.d.ts +15 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +70 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/text.d.ts +2 -0
- package/dist/utils/text.d.ts.map +1 -0
- package/dist/utils/text.js +6 -0
- package/dist/utils/text.js.map +1 -0
- package/dist/utils/tokenEstimator.d.ts +10 -0
- package/dist/utils/tokenEstimator.d.ts.map +1 -0
- package/dist/utils/tokenEstimator.js +17 -0
- package/dist/utils/tokenEstimator.js.map +1 -0
- package/khoj-context.schema.json +48 -0
- package/package.json +75 -0
package/README.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# Khoj (खोज)
|
|
2
|
+
|
|
3
|
+
[npm package](https://npmjs.org/package/khoji)
|
|
4
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
**Khoj** is a focused, open-source CLI tool and Node.js package that visits any public URL and extracts only the meaningful layers of the website — outputting a compact `khoj-context.json` specifically designed for AI agents (like Gemini, Claude, and GPT-4).
|
|
7
|
+
|
|
8
|
+
Raw HTML is noisy and wastes LLM tokens. Khoj solves the **token bloat problem** by stripping the noise and feeding your agent exactly what it needs to understand the page.
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
- **Token Efficient**: Reduces raw HTML token size by up to 90%.
|
|
13
|
+
- **Design Tokens**: Automatically extracts CSS custom properties (colors, spacing, fonts).
|
|
14
|
+
- **Animation Aware**: Detects CSS animations, GSAP timelines, Framer Motion, AOS, and infers the purpose of GIFs.
|
|
15
|
+
- **Semantic DOM**: Provides a clean, depth-capped, text-truncated structural tree.
|
|
16
|
+
- **Component Detection**: Automatically flags repeating patterns (e.g., Cards, ListItems).
|
|
17
|
+
- **Interactive Map**: Extracts forms, fields, and navigation menus.
|
|
18
|
+
- **Clone Mode**: Extracts a full-page PNG screenshot, the raw HTML, and a concatenated CSS file for pixel-perfect AI reproduction.
|
|
19
|
+
- **Gemini Native**: Built-in `--send-to-gemini` flag to pipe context straight to an LLM.
|
|
20
|
+
|
|
21
|
+
## Installation & Usage
|
|
22
|
+
|
|
23
|
+
Khoj can be run instantly, or installed either globally or locally to suit your workflow.
|
|
24
|
+
|
|
25
|
+
### 1. Run Instantly (No Install)
|
|
26
|
+
If you don't want to install anything, you can run Khoj directly using `npx`:
|
|
27
|
+
```bash
|
|
28
|
+
npx khoji https://example.com --send-to-gemini --prompt "Identify all primary call-to-action buttons."
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### 3. Install as a Dev Dependency
|
|
32
|
+
If you are building an AI project and want Khoj locally:
|
|
33
|
+
```bash
|
|
34
|
+
npm install -D khoji
|
|
35
|
+
npx khoji https://example.com
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Bypassing "Click to Enter" Preloaders
|
|
39
|
+
Many high-end award-winning sites hide their entire layout behind an initial "Click to Enter" or "Start Experience" overlay screen. If you extract the site normally, you will only capture the loader screen.
|
|
40
|
+
|
|
41
|
+
To bypass this natively, inspect the website to find the CSS Selector of the start button (e.g., `#enter-button` or `.preloader-enter`), and pass it to Khoj using the `--click` flag:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
npx khoji https://dich-fashion.webflow.io/ --clone --click ".preloader-enter"
|
|
45
|
+
```
|
|
46
|
+
Khoj will automatically navigate to the site, wait for the overlay button, click it, wait for the intro animations to clear, and *then* run the full clone extraction of the underlying page!
|
|
47
|
+
|
|
48
|
+
### 2. Install Globally
|
|
49
|
+
If you plan to use Khoj frequently from your terminal:
|
|
50
|
+
```bash
|
|
51
|
+
npm install -g khoji
|
|
52
|
+
```
|
|
53
|
+
Once installed globally, you can drop the `npx` prefix and just type:
|
|
54
|
+
```bash
|
|
55
|
+
khoj https://example.com
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
> **Tip:** If you just type `khoj` or `npx khoji` in your terminal without any URL, it will print out the full help menu and list all available options.
|
|
59
|
+
|
|
60
|
+
### What happens next?
|
|
61
|
+
Whichever way you run it, Khoj will create an `output/` folder in your **current working directory**. Inside that folder, you will find a subdirectory named after the website (e.g., `output/example.com/`).
|
|
62
|
+
|
|
63
|
+
You can then manually drag and drop these generated files (`khoj-context.json` or `khoj-context.md`) into ChatGPT, Claude, Cursor, or any other AI coding agent as highly-efficient context!
|
|
64
|
+
|
|
65
|
+
### Options
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
Usage: khoj <url> [options]
|
|
69
|
+
|
|
70
|
+
Extract token-efficient website context for AI agents
|
|
71
|
+
|
|
72
|
+
Arguments:
|
|
73
|
+
url Target URL to extract context from
|
|
74
|
+
|
|
75
|
+
Options:
|
|
76
|
+
-o, --output <dir> Output directory (default: "./output")
|
|
77
|
+
-f, --format <type> Output format: json | markdown | both (default: "both")
|
|
78
|
+
-t, --timeout <ms> Page load timeout in milliseconds (default: "30000")
|
|
79
|
+
--fast Fast mode: skip image loading (reduces extraction time)
|
|
80
|
+
--clone Clone mode: Extract full-page screenshot, raw HTML, and CSS
|
|
81
|
+
--send-to-gemini Send output to Gemini API after extraction
|
|
82
|
+
--prompt <text> Custom instruction to send to Gemini along with context
|
|
83
|
+
-V, --version output the version number
|
|
84
|
+
-h, --help display help for command
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Output Location
|
|
88
|
+
|
|
89
|
+
All extracted data is automatically placed in a subdirectory named after the target domain (e.g., `./output/stripe.com/`).
|
|
90
|
+
|
|
91
|
+
### Programmatic API
|
|
92
|
+
|
|
93
|
+
You can use Khoj within your own Node.js or TypeScript projects:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
npm install khoji playwright
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
```typescript
|
|
100
|
+
import { runExtraction } from 'khoji';
|
|
101
|
+
|
|
102
|
+
await runExtraction({
|
|
103
|
+
url: 'https://example.com',
|
|
104
|
+
outputDir: './context',
|
|
105
|
+
format: 'json',
|
|
106
|
+
timeout: 30000,
|
|
107
|
+
fast: false
|
|
108
|
+
});
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Output Structure
|
|
112
|
+
|
|
113
|
+
Khoj produces a structured JSON output (and an optional Markdown summary). See the [JSON Schema definition](./khoj-context.schema.json).
|
|
114
|
+
|
|
115
|
+
Key sections in `khoj-context.json`:
|
|
116
|
+
- `meta`: Page title, OpenGraph image, theme-color, JSON-LD
|
|
117
|
+
- `structure`: Cleaned semantic tree
|
|
118
|
+
- `designTokens`: Colors, fonts, typography, spacing, breakpoints
|
|
119
|
+
- `components`: Detected repeating UI patterns
|
|
120
|
+
- `assets`: Images, isolated GIFs, fonts, icons, external scripts
|
|
121
|
+
- `content`: Extracted headings, buttons, and text blocks
|
|
122
|
+
- `interactions`: Actionable forms and nav menus
|
|
123
|
+
- `animations`: CSS keyframes, transitions, JS libraries (GSAP, Framer), and GIF intents.
|
|
124
|
+
|
|
125
|
+
### Clone Mode Artifacts
|
|
126
|
+
When using the `--clone` flag, three additional raw files are saved directly into the domain folder:
|
|
127
|
+
- **`khoj-clone-YYYY-MM-DD.png`**: A full-page visual screenshot captured by Playwright.
|
|
128
|
+
- **`khoj-clone-YYYY-MM-DD.html`**: The fully hydrated, raw HTML source code.
|
|
129
|
+
- **`khoj-clone-YYYY-MM-DD.css`**: All styling rules needed for pixel-perfect cloning (combines inline `<style>` and external `<link rel="stylesheet">` tags).
|
|
130
|
+
|
|
131
|
+
## Requirements
|
|
132
|
+
- Node.js >= 18
|
|
133
|
+
- Playwright (installed automatically)
|
|
134
|
+
|
|
135
|
+
## License
|
|
136
|
+
MIT
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { KhojContext } from '../types/KhojContext.js';
|
|
2
|
+
/**
|
|
3
|
+
* Sends the extracted KhojContext to Google Gemini with an optional user prompt.
|
|
4
|
+
* Streams the model response to stdout.
*
* @param ctx - The extracted site context to send.
* @param prompt - Optional user instruction sent along with the context.
* @returns A promise that resolves with no value.
|
|
5
|
+
*/
|
|
6
|
+
export declare function sendToGemini(ctx: KhojContext, prompt?: string): Promise<void>;
|
|
7
|
+
//# sourceMappingURL=GeminiAdapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"GeminiAdapter.d.ts","sourceRoot":"","sources":["../../src/ai/GeminiAdapter.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AAI3D;;;GAGG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,WAAW,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAmCnF"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { GoogleGenerativeAI } from '@google/generative-ai';
|
|
2
|
+
import { logger } from '../utils/logger.js';
|
|
3
|
+
const DEFAULT_MODEL = 'gemini-1.5-flash';
|
|
4
|
+
/**
 * Forwards a KhojContext snapshot to Google Gemini and streams the reply to stdout.
 *
 * The API key is read from the GEMINI_API_KEY environment variable; when it is
 * missing an error is logged and the function returns without calling the API.
 * The model name is taken from GEMINI_MODEL, falling back to DEFAULT_MODEL.
 * A failed request is logged rather than thrown.
 *
 * @param ctx - Extracted site context; embedded in the prompt as indented JSON.
 * @param prompt - Optional user instruction; a generic "summarise and replicate"
 *   request is substituted when it is null or undefined.
 * @returns A promise that resolves once the streamed reply has been written,
 *   or once the failure has been logged.
 */
export async function sendToGemini(ctx, prompt) {
    const key = process.env['GEMINI_API_KEY'];
    if (!key) {
        logger.error('GEMINI_API_KEY is not set. Add it to your .env file.');
        return;
    }
    const modelName = process.env['GEMINI_MODEL'] ?? DEFAULT_MODEL;
    const client = new GoogleGenerativeAI(key);
    const generator = client.getGenerativeModel({ model: modelName });
    // The snapshot is wrapped in <site-context> tags so the model can tell
    // the data apart from the instruction that follows it.
    const siteJson = JSON.stringify(ctx, null, 2);
    const request = prompt ?? 'Summarise this website and suggest how to replicate it with modern web technologies.';
    const message = `You are a professional web developer assistant. Below is a structured JSON snapshot of a website, extracted by the Khoj tool. Use this to answer the user's request.

<site-context>
${siteJson}
</site-context>

User request: ${request}`;
    logger.step('🤖', `Sending to Gemini (${modelName})...`);
    try {
        const reply = await generator.generateContentStream(message);
        process.stdout.write('\n');
        // Echo each streamed chunk as soon as it arrives.
        for await (const part of reply.stream) {
            process.stdout.write(part.text());
        }
        process.stdout.write('\n\n');
        logger.success('Gemini response complete');
    }
    catch (failure) {
        logger.error('Gemini API request failed', failure);
    }
}
|
|
40
|
+
//# sourceMappingURL=GeminiAdapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"GeminiAdapter.js","sourceRoot":"","sources":["../../src/ai/GeminiAdapter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAC3D,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAG5C,MAAM,aAAa,GAAG,kBAAkB,CAAC;AAEzC;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,GAAgB,EAAE,MAAe;IAChE,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IAC7C,IAAI,CAAC,MAAM,EAAE,CAAC;QACV,MAAM,CAAC,KAAK,CAAC,sDAAsD,CAAC,CAAC;QACrE,OAAO;IACX,CAAC;IAED,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,aAAa,CAAC;IAC3D,MAAM,KAAK,GAAG,IAAI,kBAAkB,CAAC,MAAM,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,KAAK,CAAC,kBAAkB,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;IAExD,MAAM,aAAa,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;IACnD,MAAM,UAAU,GAAG,MAAM,IAAI,sFAAsF,CAAC;IAEpH,MAAM,UAAU,GAAG;;;EAGrB,aAAa;;;gBAGC,UAAU,EAAE,CAAC;IAEzB,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,sBAAsB,KAAK,MAAM,CAAC,CAAC;IAErD,IAAI,CAAC;QACD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,qBAAqB,CAAC,UAAU,CAAC,CAAC;QACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC3B,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YACtC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;QACvC,CAAC;QACD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAC7B,MAAM,CAAC,OAAO,CAAC,0BAA0B,CAAC,CAAC;IAC/C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACX,MAAM,CAAC,KAAK,CAAC,2BAA2B,EAAE,GAAG,CAAC,CAAC;IACnD,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { type Browser, type BrowserContext } from 'playwright';
|
|
2
|
+
/** Resource-loading mode accepted by `BrowserManager.launch`. */
export type LoadMode = 'full' | 'fast';
|
|
3
|
+
/** Pairs a Playwright browser with a browser context. */
export interface BrowserSession {
|
|
4
|
+
/** The Playwright browser instance. */
browser: Browser;
|
|
5
|
+
/** The Playwright browser context. */
context: BrowserContext;
|
|
6
|
+
}
|
|
7
|
+
/**
|
|
8
|
+
* Manages the Playwright browser lifecycle for a single extraction run.
|
|
9
|
+
* Each run gets an isolated browser context — no shared cookies, storage, or state.
|
|
10
|
+
*/
|
|
11
|
+
export declare class BrowserManager {
|
|
12
|
+
/** Browser held by this manager (private). */
private browser;
|
|
13
|
+
/** Browser context held by this manager (private). */
private context;
|
|
14
|
+
/**
* Starts the browser and resolves with a browser context.
* @param mode - Optional load mode.
*/
launch(mode?: LoadMode): Promise<BrowserContext>;
|
|
15
|
+
/** Shuts the manager down; resolves with no value. */
close(): Promise<void>;
|
|
16
|
+
}
|
|
17
|
+
//# sourceMappingURL=BrowserManager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"BrowserManager.d.ts","sourceRoot":"","sources":["../../src/browser/BrowserManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAY,KAAK,OAAO,EAAE,KAAK,cAAc,EAAE,MAAM,YAAY,CAAC;AAGzE,MAAM,MAAM,QAAQ,GAAG,MAAM,GAAG,MAAM,CAAC;AAEvC,MAAM,WAAW,cAAc;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,EAAE,cAAc,CAAC;CAC3B;AAED;;;GAGG;AACH,qBAAa,cAAc;IACvB,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,OAAO,CAA+B;IAExC,MAAM,CAAC,IAAI,GAAE,QAAiB,GAAG,OAAO,CAAC,cAAc,CAAC;IA8CxD,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAW/B"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { chromium } from 'playwright';
|
|
2
|
+
import { logger } from '../utils/logger.js';
|
|
3
|
+
/**
|
|
4
|
+
* Manages the Playwright browser lifecycle for a single extraction run.
|
|
5
|
+
* Each run gets an isolated browser context — no shared cookies, storage, or state.
|
|
6
|
+
*/
|
|
7
|
+
export class BrowserManager {
|
|
8
|
+
browser = null;
|
|
9
|
+
context = null;
|
|
10
|
+
async launch(mode = 'fast') {
|
|
11
|
+
logger.step('🌐', 'Launching browser...');
|
|
12
|
+
this.browser = await chromium.launch({
|
|
13
|
+
headless: true,
|
|
14
|
+
args: [
|
|
15
|
+
'--no-sandbox',
|
|
16
|
+
'--disable-setuid-sandbox',
|
|
17
|
+
'--disable-dev-shm-usage',
|
|
18
|
+
'--disable-gpu',
|
|
19
|
+
'--no-zygote',
|
|
20
|
+
],
|
|
21
|
+
});
|
|
22
|
+
this.context = await this.browser.newContext({
|
|
23
|
+
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
|
|
24
|
+
locale: 'en-US',
|
|
25
|
+
timezoneId: 'America/New_York',
|
|
26
|
+
viewport: { width: 1440, height: 900 },
|
|
27
|
+
javaScriptEnabled: true,
|
|
28
|
+
ignoreHTTPSErrors: true,
|
|
29
|
+
extraHTTPHeaders: {
|
|
30
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
31
|
+
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
|
32
|
+
},
|
|
33
|
+
});
|
|
34
|
+
// Block heavy resources in fast mode to speed up load
|
|
35
|
+
if (mode === 'fast') {
|
|
36
|
+
await this.context.route('**/*', (route) => {
|
|
37
|
+
const resourceType = route.request().resourceType();
|
|
38
|
+
// Allow documents and scripts (needed for JS-rendered content)
|
|
39
|
+
// Block media, fonts, and stylesheets for speed
|
|
40
|
+
if (['media', 'font', 'websocket', 'eventsource'].includes(resourceType)) {
|
|
41
|
+
return route.abort();
|
|
42
|
+
}
|
|
43
|
+
return route.continue();
|
|
44
|
+
});
|
|
45
|
+
}
|
|
46
|
+
logger.success('Browser ready');
|
|
47
|
+
return this.context;
|
|
48
|
+
}
|
|
49
|
+
async close() {
|
|
50
|
+
if (this.context) {
|
|
51
|
+
await this.context.close().catch(() => undefined);
|
|
52
|
+
this.context = null;
|
|
53
|
+
}
|
|
54
|
+
if (this.browser) {
|
|
55
|
+
await this.browser.close().catch(() => undefined);
|
|
56
|
+
this.browser = null;
|
|
57
|
+
}
|
|
58
|
+
logger.dim('Browser closed');
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
//# sourceMappingURL=BrowserManager.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"BrowserManager.js","sourceRoot":"","sources":["../../src/browser/BrowserManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAqC,MAAM,YAAY,CAAC;AACzE,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAS5C;;;GAGG;AACH,MAAM,OAAO,cAAc;IACf,OAAO,GAAmB,IAAI,CAAC;IAC/B,OAAO,GAA0B,IAAI,CAAC;IAE9C,KAAK,CAAC,MAAM,CAAC,OAAiB,MAAM;QAChC,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,sBAAsB,CAAC,CAAC;QAE1C,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC;YACjC,QAAQ,EAAE,IAAI;YACd,IAAI,EAAE;gBACF,cAAc;gBACd,0BAA0B;gBAC1B,yBAAyB;gBACzB,eAAe;gBACf,aAAa;aAChB;SACJ,CAAC,CAAC;QAEH,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;YACzC,SAAS,EACL,uHAAuH;YAC3H,MAAM,EAAE,OAAO;YACf,UAAU,EAAE,kBAAkB;YAC9B,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;YACtC,iBAAiB,EAAE,IAAI;YACvB,iBAAiB,EAAE,IAAI;YACvB,gBAAgB,EAAE;gBACd,iBAAiB,EAAE,gBAAgB;gBACnC,MAAM,EACF,uFAAuF;aAC9F;SACJ,CAAC,CAAC;QAEH,sDAAsD;QACtD,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;YAClB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;gBACvC,MAAM,YAAY,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;gBACpD,+DAA+D;gBAC/D,gDAAgD;gBAChD,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,aAAa,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;oBACvE,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;gBACzB,CAAC;gBACD,OAAO,KAAK,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;QACP,CAAC;QAED,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;QAChC,OAAO,IAAI,CAAC,OAAO,CAAC;IACxB,CAAC;IAED,KAAK,CAAC,KAAK;QACP,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACf,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,SAAS,CAAC,CAAC;YAClD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACxB,CAAC;QACD,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACf,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,SAAS,CAAC,CAAC;YAClD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACxB,CAAC;QACD,MAAM,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IACjC,CAAC;CACJ"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { BrowserContext, Page } from 'playwright';
|
|
2
|
+
/** Outcome of `PageLoader.load`. */
export interface LoadResult {
|
|
3
|
+
/** The Playwright page the URL was loaded into. */
page: Page;
|
|
4
|
+
/** URL reported for the load (a string). */
finalUrl: string;
|
|
5
|
+
/** HTTP status code as a number, or null when none is available. */
statusCode: number | null;
|
|
6
|
+
/** Load duration as a number — presumably milliseconds; confirm against the implementation. */
loadTime: number;
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Loads a URL into a new Playwright page with:
|
|
10
|
+
* - networkidle wait strategy (all network activity settles)
|
|
11
|
+
* - configurable timeout
|
|
12
|
+
* - redirect tracking
|
|
13
|
+
* - graceful failure with partial result
|
|
14
|
+
*/
|
|
15
|
+
export declare class PageLoader {
|
|
16
|
+
/** Browser context supplied to the constructor (private, read-only). */
private readonly context;
|
|
17
|
+
/** Timeout value supplied to the constructor (private, read-only). */
private readonly timeoutMs;
|
|
18
|
+
/**
* @param context - Browser context the loader works with.
* @param timeoutMs - Optional timeout; the name suggests milliseconds.
*/
constructor(context: BrowserContext, timeoutMs?: number);
|
|
19
|
+
/**
* Loads the given URL and resolves with a LoadResult.
* @param url - Address to load.
* @param clickSelector - Optional selector string.
*/
load(url: string, clickSelector?: string): Promise<LoadResult>;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=PageLoader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PageLoader.d.ts","sourceRoot":"","sources":["../../src/browser/PageLoader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAGvD,MAAM,WAAW,UAAU;IACvB,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,QAAQ,EAAE,MAAM,CAAC;CACpB;AAED;;;;;;GAMG;AACH,qBAAa,UAAU;IAEf,OAAO,CAAC,QAAQ,CAAC,OAAO;IACxB,OAAO,CAAC,QAAQ,CAAC,SAAS;gBADT,OAAO,EAAE,cAAc,EACvB,SAAS,GAAE,MAAe;IAGzC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,aAAa,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC;CAgHvE"}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { logger } from '../utils/logger.js';
|
|
2
|
+
/**
|
|
3
|
+
* Loads a URL into a new Playwright page with:
|
|
4
|
+
* - networkidle wait strategy (all network activity settles)
|
|
5
|
+
* - configurable timeout
|
|
6
|
+
* - redirect tracking
|
|
7
|
+
* - graceful failure with partial result
|
|
8
|
+
*/
|
|
9
|
+
export class PageLoader {
|
|
10
|
+
context;
|
|
11
|
+
timeoutMs;
|
|
12
|
+
constructor(context, timeoutMs = 30_000) {
|
|
13
|
+
this.context = context;
|
|
14
|
+
this.timeoutMs = timeoutMs;
|
|
15
|
+
}
|
|
16
|
+
async load(url, clickSelector) {
|
|
17
|
+
const page = await this.context.newPage();
|
|
18
|
+
const start = Date.now();
|
|
19
|
+
// Track GIF/image responses for the asset extractor
|
|
20
|
+
page.on('response', (response) => {
|
|
21
|
+
const contentType = response.headers()['content-type'] ?? '';
|
|
22
|
+
const reqUrl = response.url();
|
|
23
|
+
if (contentType.includes('image/gif') || reqUrl.endsWith('.gif')) {
|
|
24
|
+
// Store on page for later retrieval by AssetExtractor
|
|
25
|
+
page.evaluate((u) => {
|
|
26
|
+
window.__khoj_gifs__ = [
|
|
27
|
+
...(window.__khoj_gifs__ ?? []),
|
|
28
|
+
u,
|
|
29
|
+
];
|
|
30
|
+
}, reqUrl).catch(() => undefined);
|
|
31
|
+
}
|
|
32
|
+
});
|
|
33
|
+
let statusCode = null;
|
|
34
|
+
try {
|
|
35
|
+
const response = await page.goto(url, {
|
|
36
|
+
waitUntil: 'networkidle',
|
|
37
|
+
timeout: this.timeoutMs,
|
|
38
|
+
});
|
|
39
|
+
statusCode = response?.status() ?? null;
|
|
40
|
+
if (statusCode !== null && statusCode >= 400) {
|
|
41
|
+
logger.warn(`Server responded with HTTP ${statusCode} for ${url}`);
|
|
42
|
+
}
|
|
43
|
+
// Handle Click-to-Enter Preloaders
|
|
44
|
+
if (clickSelector) {
|
|
45
|
+
logger.step('🖱️', `Found --click flag. Waiting for and clicking: ${clickSelector}`);
|
|
46
|
+
try {
|
|
47
|
+
await page.waitForSelector(clickSelector, { timeout: 10000 });
|
|
48
|
+
await page.click(clickSelector);
|
|
49
|
+
// Wait 3 seconds for intro animations/overlays to fade out
|
|
50
|
+
await page.waitForTimeout(3000);
|
|
51
|
+
}
|
|
52
|
+
catch (e) {
|
|
53
|
+
logger.warn(`Failed to click selector "${clickSelector}". Proceeding anyway.`);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
// Auto-scroll the page to trigger lazy-loaded images and intersection observers (scroll animations)
|
|
57
|
+
// We use native mouse.wheel events here instead of window.scrollBy because award-winning
|
|
58
|
+
// websites often use Virtual Scroll libraries (Locomotive, Lenis) that ONLY respond to real WheelEvents.
|
|
59
|
+
logger.step('⏬', 'Scrolling page to trigger GSAP/Virtual-Scroll animations & lazy-loading...');
|
|
60
|
+
// Move mouse to center of screen to ensure wheel events are captured by the main body
|
|
61
|
+
const viewport = page.viewportSize();
|
|
62
|
+
if (viewport) {
|
|
63
|
+
await page.mouse.move(viewport.width / 2, viewport.height / 2);
|
|
64
|
+
}
|
|
65
|
+
let previousScrollY = -1;
|
|
66
|
+
let unchangedCount = 0;
|
|
67
|
+
const maxScrolls = 50;
|
|
68
|
+
for (let i = 0; i < maxScrolls; i++) {
|
|
69
|
+
await page.mouse.wheel(0, 400); // 400px per scroll tick
|
|
70
|
+
// Wait 150ms for smooth scroll momentum (Lenis/Locomotive) AND animations to render
|
|
71
|
+
await page.waitForTimeout(150);
|
|
72
|
+
const scrollData = await page.evaluate(() => {
|
|
73
|
+
return {
|
|
74
|
+
scrollY: window.scrollY,
|
|
75
|
+
scrollHeight: document.body.scrollHeight,
|
|
76
|
+
innerHeight: window.innerHeight
|
|
77
|
+
};
|
|
78
|
+
});
|
|
79
|
+
if (scrollData.scrollY === previousScrollY) {
|
|
80
|
+
unchangedCount++;
|
|
81
|
+
// If height hasn't changed for 5 ticks, we hit bottom or scroll is fully hijacked
|
|
82
|
+
if (unchangedCount > 5)
|
|
83
|
+
break;
|
|
84
|
+
}
|
|
85
|
+
else {
|
|
86
|
+
unchangedCount = 0;
|
|
87
|
+
}
|
|
88
|
+
previousScrollY = scrollData.scrollY;
|
|
89
|
+
// Break if we natively hit the bottom bounds
|
|
90
|
+
if (scrollData.scrollY + scrollData.innerHeight >= scrollData.scrollHeight - 10) {
|
|
91
|
+
break;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
// Scroll instantly back to top so screenshot looks normal
|
|
95
|
+
await page.evaluate(() => window.scrollTo(0, 0));
|
|
96
|
+
// Extra settle time for single-page apps running animations or deferred renders
|
|
97
|
+
await page.waitForTimeout(1000);
|
|
98
|
+
const loadTime = Date.now() - start;
|
|
99
|
+
const finalUrl = page.url();
|
|
100
|
+
logger.success(`Page loaded in ${(loadTime / 1000).toFixed(2)}s → ${finalUrl}`);
|
|
101
|
+
return { page, finalUrl, statusCode, loadTime };
|
|
102
|
+
}
|
|
103
|
+
catch (err) {
|
|
104
|
+
const loadTime = Date.now() - start;
|
|
105
|
+
if (err instanceof Error && err.message.includes('timeout')) {
|
|
106
|
+
logger.warn(`Page load timed out after ${this.timeoutMs / 1000}s — continuing with partial content`);
|
|
107
|
+
}
|
|
108
|
+
else {
|
|
109
|
+
logger.error('Failed to load page', err);
|
|
110
|
+
throw err;
|
|
111
|
+
}
|
|
112
|
+
return { page, finalUrl: page.url() || url, statusCode, loadTime };
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
//# sourceMappingURL=PageLoader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PageLoader.js","sourceRoot":"","sources":["../../src/browser/PageLoader.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAS5C;;;;;;GAMG;AACH,MAAM,OAAO,UAAU;IAEE;IACA;IAFrB,YACqB,OAAuB,EACvB,YAAoB,MAAM;QAD1B,YAAO,GAAP,OAAO,CAAgB;QACvB,cAAS,GAAT,SAAS,CAAiB;IAC3C,CAAC;IAEL,KAAK,CAAC,IAAI,CAAC,GAAW,EAAE,aAAsB;QAC1C,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAEzB,oDAAoD;QACpD,IAAI,CAAC,EAAE,CAAC,UAAU,EAAE,CAAC,QAAQ,EAAE,EAAE;YAC7B,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC7D,MAAM,MAAM,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;YAC9B,IAAI,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC/D,sDAAsD;gBACtD,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,EAAE;oBACf,MAAgD,CAAC,aAAa,GAAG;wBAC9D,GAAG,CAAE,MAAgD,CAAC,aAAa,IAAI,EAAE,CAAC;wBAC1E,CAAC;qBACJ,CAAC;gBACN,CAAC,EAAE,MAAM,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,SAAS,CAAC,CAAC;YACtC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,IAAI,UAAU,GAAkB,IAAI,CAAC;QAErC,IAAI,CAAC;YACD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBAClC,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,IAAI,CAAC,SAAS;aAC1B,CAAC,CAAC;YAEH,UAAU,GAAG,QAAQ,EAAE,MAAM,EAAE,IAAI,IAAI,CAAC;YAExC,IAAI,UAAU,KAAK,IAAI,IAAI,UAAU,IAAI,GAAG,EAAE,CAAC;gBAC3C,MAAM,CAAC,IAAI,CAAC,8BAA8B,UAAU,QAAQ,GAAG,EAAE,CAAC,CAAC;YACvE,CAAC;YAED,mCAAmC;YACnC,IAAI,aAAa,EAAE,CAAC;gBAChB,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,iDAAiD,aAAa,EAAE,CAAC,CAAC;gBACrF,IAAI,CAAC;oBACD,MAAM,IAAI,CAAC,eAAe,CAAC,aAAa,EAAE,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;oBAC9D,MAAM,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;oBAChC,2DAA2D;oBAC3D,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;gBACpC,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACT,MAAM,CAAC,IAAI,CAAC,6BAA6B,aAAa,uBAAuB,CAAC,CAAC;gBACnF,CAAC;YACL,CAAC;YAED,oGAAoG;YACpG,0FAA0F;YAC1F,yGAAyG;YACzG,MAAM,CAAC,IAAI,CAAC,GAAG,EAAE,4EAA4E,CAAC,CAAC;YAE/F,sFAAsF;YACtF,MAAM,QAAQ,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;YACrC,IAAI,QAAQ,EAAE,CAAC;gBACX,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,GAAG,CA
AC,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACnE,CAAC;YAED,IAAI,eAAe,GAAG,CAAC,CAAC,CAAC;YACzB,IAAI,cAAc,GAAG,CAAC,CAAC;YACvB,MAAM,UAAU,GAAG,EAAE,CAAC;YAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;gBAClC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,wBAAwB;gBACxD,oFAAoF;gBACpF,MAAM,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;gBAE/B,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,EAAE;oBACxC,OAAO;wBACH,OAAO,EAAE,MAAM,CAAC,OAAO;wBACvB,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,YAAY;wBACxC,WAAW,EAAE,MAAM,CAAC,WAAW;qBAClC,CAAC;gBACN,CAAC,CAAC,CAAC;gBAEH,IAAI,UAAU,CAAC,OAAO,KAAK,eAAe,EAAE,CAAC;oBACzC,cAAc,EAAE,CAAC;oBACjB,kFAAkF;oBAClF,IAAI,cAAc,GAAG,CAAC;wBAAE,MAAM;gBAClC,CAAC;qBAAM,CAAC;oBACJ,cAAc,GAAG,CAAC,CAAC;gBACvB,CAAC;gBACD,eAAe,GAAG,UAAU,CAAC,OAAO,CAAC;gBAErC,6CAA6C;gBAC7C,IAAI,UAAU,CAAC,OAAO,GAAG,UAAU,CAAC,WAAW,IAAI,UAAU,CAAC,YAAY,GAAG,EAAE,EAAE,CAAC;oBAC9E,MAAM;gBACV,CAAC;YACL,CAAC;YAED,0DAA0D;YAC1D,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YAEjD,gFAAgF;YAChF,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAEhC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;YACpC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAE5B,MAAM,CAAC,OAAO,CAAC,kBAAkB,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,QAAQ,EAAE,CAAC,CAAC;YAEhF,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,EAAE,CAAC;QACpD,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;YACpC,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC1D,MAAM,CAAC,IAAI,CAAC,6BAA6B,IAAI,CAAC,SAAS,GAAG,IAAI,qCAAqC,CAAC,CAAC;YACzG,CAAC;iBAAM,CAAC;gBACJ,MAAM,CAAC,KAAK,CAAC,qBAAqB,EAAE,GAAG,CAAC,CAAC;gBACzC,MAAM,GAAG,CAAC;YACd,CAAC;YACD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,IAAI,GAAG,EAAE,UAAU,EAAE,QAAQ,EAAE,CAAC;QACvE,CAAC;IACL,CAAC;CACJ"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/cli/index.ts"],"names":[],"mappings":";AACA,OAAO,eAAe,CAAC"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import 'dotenv/config';
|
|
3
|
+
import { Command } from 'commander';
|
|
4
|
+
import chalk from 'chalk';
|
|
5
|
+
import { confirm, checkbox } from '@inquirer/prompts';
|
|
6
|
+
import { logger } from '../utils/logger.js';
|
|
7
|
+
import { runExtraction } from '../pipeline/runner.js';
|
|
8
|
+
/**
 * Khoj command-line entry point.
 *
 * Declares the CLI surface (one positional URL plus options), validates the
 * user input, optionally asks which clone guidelines to apply, and hands a
 * normalized options object to `runExtraction`.
 */

/** Every guideline set offered in clone mode; used when the user ticks "All of the above". */
const ALL_CLONE_SKILLS = ['frontend-design', 'seo-audit', 'web-design-guidelines', 'award-winning-website'];

/** Values accepted by `--format`. */
const OUTPUT_FORMATS = ['json', 'markdown', 'both'];

/** Smallest `--timeout` value (in milliseconds) the CLI accepts. */
const MIN_TIMEOUT_MS = 1000;

/**
 * Parses the raw `--timeout` value.
 *
 * Only plain base-10 digit strings are accepted, so inputs with trailing
 * garbage (e.g. `30000abc`), which a bare `parseInt` would silently
 * truncate, are rejected.
 *
 * @param raw Raw option value as delivered by commander.
 * @returns The timeout in milliseconds, or `NaN` when the value is not a whole number.
 */
function parseTimeout(raw) {
    const text = String(raw).trim();
    return /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
}

/**
 * Interactively asks which guideline sets should accompany a clone.
 *
 * @returns The selected guideline ids (the full list when "all" is ticked),
 *          or `undefined` when the user declines the custom prompt.
 */
async function promptCloneSkills() {
    const wantsPrompt = await confirm({
        message: 'Do you want to generate a custom AI instruction prompt for this clone?',
        default: true
    });
    if (!wantsPrompt) {
        return undefined;
    }
    const selections = await checkbox({
        message: 'Select the guidelines the AI should follow when rebuilding this site (Press <space> to select):',
        choices: [
            { name: '★ All of the above', value: 'all' },
            { name: 'Frontend Design (Avoid cliché AI traits)', value: 'frontend-design' },
            { name: 'SEO Best Practices', value: 'seo-audit' },
            { name: 'Web Design Guidelines (a11y, contrast)', value: 'web-design-guidelines' },
            { name: 'Award-Winning Site (3D, GSAP, etc.)', value: 'award-winning-website' }
        ]
    });
    // 'all' is a convenience entry, not a guideline id: expand it to the full list.
    return selections.includes('all') ? [...ALL_CLONE_SKILLS] : selections;
}

const program = new Command();
program
    .name('khoj')
    .description('Extract token-efficient website context for AI agents')
    .version('2.1.4')
    .argument('<url>', 'Target URL to extract context from')
    .option('-o, --output <dir>', 'Output directory', './output')
    .option('-f, --format <type>', 'Output format: json | markdown | both', 'both')
    .option('-t, --timeout <ms>', 'Page load timeout in milliseconds', '30000')
    .option('--send-to-gemini', 'Send output to Gemini API after extraction')
    .option('--prompt <text>', 'Custom instruction to send to Gemini along with context')
    .option('--fast', 'Fast mode: skip image loading (reduces extraction time)')
    .option('--clone', 'Clone mode: Extract full-page screenshot, raw HTML, and CSS')
    .option('--click <selector>', 'CSS selector of an element to click before extraction (useful for "Enter Site" preloaders)')
    .action(async (url, options) => {
        logger.banner();

        // The target must parse as an absolute URL (scheme included).
        try {
            new URL(url);
        }
        catch {
            logger.error(`Invalid URL provided: ${url}`);
            logger.error(`Please ensure the URL includes http:// or https:// (e.g., https://${url})`);
            process.exit(1);
        }
        logger.step('🚀', `Starting Khoj extraction for ${chalk.cyan(url)}`);

        // Validate the flags BEFORE any interactive prompt, so a typo in
        // --format/--timeout is reported immediately instead of after the
        // user has already answered the clone questions.
        if (!OUTPUT_FORMATS.includes(options.format)) {
            logger.error('--format must be one of: json, markdown, both');
            process.exit(1);
        }
        const timeout = parseTimeout(options.timeout);
        if (Number.isNaN(timeout) || timeout < MIN_TIMEOUT_MS) {
            logger.error('--timeout must be a number >= 1000 (ms)');
            process.exit(1);
        }

        // Guideline selection only applies to clone mode.
        const cloneSkills = options.clone ? await promptCloneSkills() : undefined;

        const extractOpts = {
            url,
            outputDir: options.output,
            format: options.format,
            timeout,
            fast: options.fast ?? false, // commander leaves unset boolean flags undefined
            clone: options.clone,
            cloneSkills,
            sendToGemini: options.sendToGemini,
            prompt: options.prompt,
            clickSelector: options.click,
        };

        logger.step('🔎', `Analysing: ${url}`);
        logger.divider();
        try {
            await runExtraction(extractOpts);
        }
        catch (err) {
            logger.error('Extraction failed', err);
            process.exit(1);
        }
    });

// With no arguments at all, show the help text instead of commander's
// "missing required argument" error.
if (!process.argv.slice(2).length) {
    program.outputHelp();
    process.exit(0);
}

// The action is async, so parseAsync returns a promise. Anything that rejects
// outside the extraction try/catch (for example a prompt the user aborts)
// would otherwise surface as an unhandled rejection.
program.parseAsync(process.argv).catch((err) => {
    logger.error('Unexpected CLI failure', err);
    process.exit(1);
});
|
|
98
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/cli/index.ts"],"names":[],"mappings":";AACA,OAAO,eAAe,CAAC;AACvB,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAC5C,OAAO,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAItD,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACF,IAAI,CAAC,MAAM,CAAC;KACZ,WAAW,CAAC,uDAAuD,CAAC;KACpE,OAAO,CAAC,OAAO,CAAC;KAChB,QAAQ,CAAC,OAAO,EAAE,oCAAoC,CAAC;KACvD,MAAM,CAAC,oBAAoB,EAAE,kBAAkB,EAAE,UAAU,CAAC;KAC5D,MAAM,CAAC,qBAAqB,EAAE,uCAAuC,EAAE,MAAM,CAAC;KAC9E,MAAM,CAAC,oBAAoB,EAAE,mCAAmC,EAAE,OAAO,CAAC;KAC1E,MAAM,CAAC,kBAAkB,EAAE,4CAA4C,CAAC;KACxE,MAAM,CAAC,iBAAiB,EAAE,yDAAyD,CAAC;KACpF,MAAM,CAAC,QAAQ,EAAE,yDAAyD,CAAC;KAC3E,MAAM,CAAC,SAAS,EAAE,6DAA6D,CAAC;KAChF,MAAM,CAAC,oBAAoB,EAAE,4FAA4F,CAAC;KAC1H,MAAM,CAAC,KAAK,EAAE,GAAW,EAAE,OAS3B,EAAE,EAAE;IACD,MAAM,CAAC,MAAM,EAAE,CAAC;IAEhB,kCAAkC;IAClC,IAAI,CAAC;QACD,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IACjB,CAAC;IAAC,MAAM,CAAC;QACL,MAAM,CAAC,KAAK,CAAC,yBAAyB,GAAG,EAAE,CAAC,CAAC;QAC7C,MAAM,CAAC,KAAK,CAAC,qEAAqE,GAAG,GAAG,CAAC,CAAC;QAC1F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,gCAAgC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAErE,IAAI,WAAW,GAA6B,SAAS,CAAC;IAEtD,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAChB,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC;YAC9B,OAAO,EAAE,wEAAwE;YACjF,OAAO,EAAE,IAAI;SAChB,CAAC,CAAC;QAEH,IAAI,WAAW,EAAE,CAAC;YACd,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC;gBAC9B,OAAO,EAAE,iGAAiG;gBAC1G,OAAO,EAAE;oBACL,EAAE,IAAI,EAAE,oBAAoB,EAAE,KAAK,EAAE,KAAK,EAAE;oBAC5C,EAAE,IAAI,EAAE,0CAA0C,EAAE,KAAK,EAAE,iBAAiB,EAAE;oBAC9E,EAAE,IAAI,EAAE,oBAAoB,EAAE,KAAK,EAAE,WAAW,EAAE;oBAClD,EAAE,IAAI,EAAE,wCAAwC,EAAE,KAAK,EAAE,uBAAuB,EAAE;oBAClF,EAAE,IAAI,EAAE,qCAAqC,EAAE,KAAK,EAAE,uBAAuB,EAAE;iBAClF;aACJ,CAAC,CAAC;YAEH,IAAI,UAAU,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC7B,WAAW,GAAG,CAAC,iBAAiB,EAAE,WAAW,EAAE,uBAAuB,EAAE,uBAAuB,CAAC,CAAC;YACrG,CAAC;iBAAM,CAAC;gBACJ,0DAA0D;gBAC1D,WAAW
,GAAG,UAA0B,CAAC;YAC7C,CAAC;QACL,CAAC;IACL,CAAC;IAED,MAAM,WAAW,GAAsB;QACnC,GAAG;QACH,SAAS,EAAE,OAAO,CAAC,MAAM;QACzB,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,yCAAyC;QACjE,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;QACtC,IAAI,EAAE,OAAO,CAAC,IAAI,IAAI,KAAK,EAAE,yBAAyB;QACtD,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,WAAW,EAAE,WAAW;QACxB,YAAY,EAAE,OAAO,CAAC,YAAY;QAClC,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,aAAa,EAAE,OAAO,CAAC,KAAK;KAC/B,CAAC;IAEF,yBAAyB;IACzB,IAAI,CAAC,CAAC,MAAM,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7D,MAAM,CAAC,KAAK,CAAC,+CAA+C,CAAC,CAAC;QAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,0BAA0B;IAC1B,IAAI,KAAK,CAAC,WAAW,CAAC,OAAO,CAAC,IAAI,WAAW,CAAC,OAAO,GAAG,IAAI,EAAE,CAAC;QAC3D,MAAM,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;QACxD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,cAAc,GAAG,EAAE,CAAC,CAAC;IACvC,MAAM,CAAC,OAAO,EAAE,CAAC;IAEjB,IAAI,CAAC;QACD,0DAA0D;QAC1D,MAAM,aAAa,CAAC,WAAW,CAAC,CAAC;IACrC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACX,MAAM,CAAC,KAAK,CAAC,mBAAmB,EAAE,GAAG,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;AACL,CAAC,CAAC,CAAC;AAEP,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;IAChC,OAAO,CAAC,UAAU,EAAE,CAAC;IACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACpB,CAAC;AAED,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { Page } from 'playwright';
|
|
2
|
+
import type { AnimationMap, GifAnimationPurpose, ImageAsset } from '../types/KhojContext.js';
|
|
3
|
+
/**
 * AnimationExtractor — 3-pass animation intelligence:
 *
 * Pass 1: CSS @keyframes + transitions (from document.styleSheets)
 * Pass 2: JS animation library detection (GSAP, Framer Motion, AOS, Lottie, etc.)
 * Pass 3: GIF purpose inference from context
 *
 * @param page - Playwright page to analyse.
 * @param gifs - Image assets handed to the extractor; presumably the GIFs
 *               whose purpose pass 3 infers (TODO confirm against the caller).
 * @returns A promise resolving to the AnimationMap built by the extractor.
 */
|
|
10
|
+
export declare function extractAnimations(page: Page, gifs: ImageAsset[]): Promise<AnimationMap>;
|
|
11
|
+
export type { GifAnimationPurpose };
|
|
12
|
+
//# sourceMappingURL=AnimationExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AnimationExtractor.d.ts","sourceRoot":"","sources":["../../src/extractors/AnimationExtractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,KAAK,EACR,YAAY,EAMZ,mBAAmB,EACnB,UAAU,EACb,MAAM,yBAAyB,CAAC;AAIjC;;;;;;GAMG;AACH,wBAAsB,iBAAiB,CACnC,IAAI,EAAE,IAAI,EACV,IAAI,EAAE,UAAU,EAAE,GACnB,OAAO,CAAC,YAAY,CAAC,CAwBvB;AAmQD,YAAY,EAAE,mBAAmB,EAAE,CAAC"}
|