contextmd-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
+ name: Publish to NPM
+
+ on:
+   release:
+     types: [created]
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-node@v4
+         with:
+           node-version: '18.x'
+           registry-url: 'https://registry.npmjs.org'
+
+       - run: npm ci
+       - run: npm run build
+
+       # This publishes to NPM using the token you added to GitHub Secrets
+       - run: npm publish
+         env:
+           NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Antigravity
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,146 @@
+ # 🧠 ContextMD
+
+ <p align="center">
+   <strong>Feed your Agents the Context they deserve.</strong>
+ </p>
+
+ <p align="center">
+   <a href="https://github.com/UditAkhourii/contextmd/actions"><img src="https://img.shields.io/github/actions/workflow/status/UditAkhourii/contextmd/ci.yml?branch=main&style=for-the-badge" alt="CI status"></a>
+   <a href="https://www.npmjs.com/package/contextmd-cli"><img src="https://img.shields.io/npm/v/contextmd-cli?style=for-the-badge&color=blue" alt="NPM Version"></a>
+   <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-green.svg?style=for-the-badge" alt="MIT License"></a>
+   <a href="https://typescriptlang.org"><img src="https://img.shields.io/badge/Written_in-TypeScript-3178C6?style=for-the-badge&logo=typescript&logoColor=white" alt="TypeScript"></a>
+ </p>
+
+ ---
+
+ **ContextMD** is the ultimate terminal utility for turning complex documentation websites into a single, high-density **AI Context File**.
+
+ Modern LLMs and Agents (like Claude 3.5 Sonnet, GPT-4o, or Gemini 1.5 Pro) are powerful, but they struggle to navigate multi-page documentation sites effectively. They get lost in navigation bars, footers, duplicate content, and fragmented pages.
+
+ **ContextMD solves this.** It crawls, cleans, and distills entire documentation sites into a single `context.md` file that you can drop directly into your LLM's context window.
+
+ ## ✨ Features
+
+ - **🕷️ Deep Crawling**: Intelligently traverses documentation sites, following links and building a comprehensive map of the content.
+ - **🧠 AI-Powered Refinement**: Uses OpenAI's models (configurable) to "read" each page and rewrite it for machine comprehension, stripping fluff and prioritizing logic, API signatures, and examples.
+ - **🧹 Noise Reduction**: Automatically detects and separates main content from sidebars, headers, footers, and advertisements.
+ - **⚡ High Performance**: Concurrent processing with a beautiful, real-time CLI dashboard.
+ - **📄 Single File Output**: Produces a consolidated Markdown file with clear headers and structure, perfect for RAG systems or direct LLM context (a sample of the output shape is shown below).
+
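+ Each crawled page becomes one `## Source:` section in the compiled file, preceded by a generated metadata header. An illustrative sketch of the output shape (URLs and timestamp are placeholders):
+
+ ```markdown
+ # Documentation Context
+
+ Generated by ContextMD from https://docs.example.com at 2024-01-01T00:00:00.000Z
+
+ ---
+
+ ## Source: [Getting Started](https://docs.example.com/getting-started)
+
+ ...refined, high-density Markdown for this page...
+
+ ---
+ ```
+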
+ ## 🚀 Installation
+
+ Ensure you have **Node.js 18+** installed.
+
+ ### Global Install (Recommended)
+
+ ```bash
+ npm install -g contextmd-cli
+ ```
+
+ ### Run via npx (No install required)
+
+ ```bash
+ npx contextmd https://docs.example.com
+ ```
+
+ ## 🛠️ Usage
+
+ ### Quick Start
+
+ 1. **Get an OpenAI API Key**: ContextMD uses AI to compress and refine the content.
+ 2. **Run the tool**:
+
+ ```bash
+ export OPENAI_API_KEY=sk-proj-...
+ contextmd https://docs.turso.tech
+ ```
+
+ This will generate a `context.md` file in your current directory.
+
+ ### Command Line Options
+
+ ```bash
+ Usage: contextmd [options] <url>
+
+ Arguments:
+   url                   Base URL of the documentation to convert
+
+ Options:
+   -k, --key <key>       OpenAI API Key (can also be set via OPENAI_API_KEY env var)
+   -o, --output <path>   Output file path (default: "context.md")
+   -l, --limit <number>  Max pages to crawl (default: "100")
+   -h, --help            display help for command
+ ```
+
+ ### Examples
+
+ **Crawl a specific documentation site with a page limit:**
+
+ ```bash
+ contextmd https://developer.spotify.com/documentation/web-api --limit 50
+ ```
+
+ **Save to a specific location:**
+
+ ```bash
+ contextmd https://stripe.com/docs/api -o ./stripe-context.md
+ ```
+
+ ## 🏗️ How It Works
+
+ ContextMD operates in a three-stage pipeline (a programmatic sketch follows this list):
+
+ 1. **The Crawler**:
+     * Starts at the provided `url`.
+     * Uses a Breadth-First Search (BFS) algorithm to find internal links.
+     * Filters out external links, social media, and irrelevant pages.
+     * Respects the `--limit` flag to cap the crawl on massive sites.
+
+ 2. **The Processor (The "Brain")**:
+     * Downloads the raw HTML of each discovered page.
+     * Uses `turndown` and `cheerio` to convert HTML to Markdown.
+     * **AI Step**: Sends the raw Markdown to an LLM with a specialized system prompt designed to:
+         * Summarize verbose sections.
+         * Preserve code blocks and API schemas exactly.
+         * Remove marketing fluff.
+         * Standardize formatting.
+
+ 3. **The Compiler**:
+     * Stitches all processed pages into a single `context.md` file.
+     * Adds a metadata header and a table-of-contents structure (implicitly, via Markdown headers).
+
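+ A minimal sketch of driving this pipeline programmatically, using the package's own `Crawler` and `Processor` classes (see `src/crawler.ts` and `src/processor.ts`). The deep `dist/` import paths are an assumption, since the package does not declare an `exports` map:
+
+ ```typescript
+ import * as fs from 'fs/promises';
+ // Assumed deep imports into the compiled dist/ files.
+ import { Crawler } from 'contextmd-cli/dist/crawler';
+ import { Processor } from 'contextmd-cli/dist/processor';
+
+ async function main() {
+   // Stage 1: BFS-crawl the site, capped at 25 pages.
+   const crawler = new Crawler('https://docs.example.com');
+   const pages = await crawler.crawl(25, (url) => console.log('crawling:', url));
+
+   // Stage 2: strip clutter from each page and refine it with the LLM.
+   const processor = new Processor(process.env.OPENAI_API_KEY!);
+   const sections: string[] = [];
+   for (const page of pages) {
+     sections.push(await processor.processPage(page));
+   }
+
+   // Stage 3: compile everything into a single context file.
+   await fs.writeFile('context.md', sections.join('\n'));
+ }
+
+ main();
+ ```
+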
+ ## 📦 For Developers
+
+ Want to build this from source?
+
+ 1. **Clone the repo**:
+     ```bash
+     git clone https://github.com/UditAkhourii/contextmd.git
+     cd contextmd
+     ```
+
+ 2. **Install dependencies**:
+     ```bash
+     npm install
+     ```
+
+ 3. **Build**:
+     ```bash
+     npm run build
+     ```
+
+ 4. **Run locally**:
+     ```bash
+     node dist/index.js https://example.com
+     ```
+
+ ## 🤝 Contributing
+
+ We welcome contributions! Please open an issue or submit a PR if you have ideas for:
+ - Support for local LLMs (Ollama, etc.); a sketch follows below.
+ - Better crawling heuristics for SPAs (single-page apps).
+ - Output formats (JSON, JSONL for fine-tuning).
+
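+ For the local-LLM idea, one plausible direction (a sketch only, not current functionality): the OpenAI SDK that `Processor` already uses accepts a custom `baseURL`, and Ollama exposes an OpenAI-compatible endpoint, so the client construction could be made configurable:
+
+ ```typescript
+ import { OpenAI } from 'openai';
+
+ // Hypothetical: point the existing OpenAI client at a local Ollama server.
+ const openai = new OpenAI({
+   baseURL: 'http://localhost:11434/v1', // Ollama's OpenAI-compatible API
+   apiKey: 'ollama', // Ollama ignores the key, but the SDK requires a value
+ });
+ ```
+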
+ ## 📄 License
+
+ MIT © [Antigravity](https://github.com/UditAkhourii)
package/dist/crawler.js ADDED
@@ -0,0 +1,107 @@
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     var desc = Object.getOwnPropertyDescriptor(m, k);
+     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+         desc = { enumerable: true, get: function() { return m[k]; } };
+     }
+     Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+     Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+     o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+     var ownKeys = function(o) {
+         ownKeys = Object.getOwnPropertyNames || function (o) {
+             var ar = [];
+             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+             return ar;
+         };
+         return ownKeys(o);
+     };
+     return function (mod) {
+         if (mod && mod.__esModule) return mod;
+         var result = {};
+         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+         __setModuleDefault(result, mod);
+         return result;
+     };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.Crawler = void 0;
+ const axios_1 = __importDefault(require("axios"));
+ const cheerio = __importStar(require("cheerio"));
+ const url_1 = require("url");
+ class Crawler {
+     constructor(baseUrl) {
+         this.visited = new Set();
+         this.queue = [];
+         this.baseUrl = baseUrl;
+         this.domain = new url_1.URL(baseUrl).hostname;
+         this.queue.push(baseUrl);
+     }
+     normalizeUrl(url, currentUrl) {
+         try {
+             const absolute = new url_1.URL(url, currentUrl);
+             // Only keep http(s)
+             if (!['http:', 'https:'].includes(absolute.protocol))
+                 return null;
+             // Stay on domain
+             if (absolute.hostname !== this.domain)
+                 return null;
+             // Remove hash
+             absolute.hash = '';
+             return absolute.toString();
+         }
+         catch (e) {
+             return null;
+         }
+     }
+     async crawl(maxPages = 500, onUrlFound) {
+         const pages = [];
+         while (this.queue.length > 0 && pages.length < maxPages) {
+             const url = this.queue.shift();
+             if (this.visited.has(url))
+                 continue;
+             this.visited.add(url);
+             if (onUrlFound)
+                 onUrlFound(url);
+             try {
+                 const { data, headers } = await axios_1.default.get(url, {
+                     headers: { 'User-Agent': 'AgenticDocsConverter/1.0' },
+                     timeout: 10000
+                 });
+                 const contentType = headers['content-type'] || '';
+                 if (!contentType.includes('text/html'))
+                     continue;
+                 const $ = cheerio.load(data);
+                 const title = $('title').text() || url;
+                 // Extract links
+                 $('a').each((_, element) => {
+                     const href = $(element).attr('href');
+                     if (href) {
+                         const normalized = this.normalizeUrl(href, url);
+                         if (normalized && !this.visited.has(normalized) && !this.queue.includes(normalized)) {
+                             this.queue.push(normalized);
+                         }
+                     }
+                 });
+                 pages.push({ url, content: data, title });
+             }
+             catch (error) {
+                 // console.error(`Failed to crawl ${url}: ${(error as Error).message}`);
+                 // Continue despite errors
+             }
+         }
+         return pages;
+     }
+ }
+ exports.Crawler = Crawler;
package/dist/index.js ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env node
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     var desc = Object.getOwnPropertyDescriptor(m, k);
+     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+         desc = { enumerable: true, get: function() { return m[k]; } };
+     }
+     Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+     Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+     o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+     var ownKeys = function(o) {
+         ownKeys = Object.getOwnPropertyNames || function (o) {
+             var ar = [];
+             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+             return ar;
+         };
+         return ownKeys(o);
+     };
+     return function (mod) {
+         if (mod && mod.__esModule) return mod;
+         var result = {};
+         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+         __setModuleDefault(result, mod);
+         return result;
+     };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const commander_1 = require("commander");
+ const ora_1 = __importDefault(require("ora"));
+ const chalk_1 = __importDefault(require("chalk"));
+ const fs = __importStar(require("fs/promises"));
+ const path = __importStar(require("path"));
+ require("dotenv/config"); // Load .env if present
+ const crawler_1 = require("./crawler");
+ const processor_1 = require("./processor");
+ const program = new commander_1.Command();
+ program
+     .name('contextmd')
+     .description('Convert any documentation into an Agentic AI ready context file')
+     .argument('<url>', 'Base URL of the documentation to convert')
+     .option('-k, --key <key>', 'OpenAI API Key (or set OPENAI_API_KEY env var)')
+     .option('-o, --output <path>', 'Output file path', 'context.md')
+     .option('-l, --limit <number>', 'Max pages to crawl', '100')
+     .action(async (url, options) => {
+     try {
+         console.log(chalk_1.default.bold.cyan('\n🚀 ContextMD - Agentic AI Context Generator\n'));
+         const apiKey = options.key || process.env.OPENAI_API_KEY;
+         if (!apiKey) {
+             console.error(chalk_1.default.red('❌ Error: OpenAI API Key is required. Provide it via -k flag or OPENAI_API_KEY env var.'));
+             process.exit(1);
+         }
+         const crawler = new crawler_1.Crawler(url);
+         const processor = new processor_1.Processor(apiKey);
+         const spinner = (0, ora_1.default)('Initializing crawler...').start();
+         // 1. Crawl
+         spinner.text = `Crawling ${url}...`;
+         const pages = await crawler.crawl(parseInt(options.limit), (foundUrl) => {
+             spinner.text = `Crawling... Found: ${foundUrl}`;
+         });
+         spinner.succeed(chalk_1.default.green(`Crawling complete! Found ${pages.length} pages.`));
+         // 2. Process
+         const outputContent = [];
+         // Header for context.md
+         outputContent.push(`# Documentation Context\n\nGenerated by ContextMD from ${url} at ${new Date().toISOString()}\n\n---\n\n`);
+         const processSpinner = (0, ora_1.default)('Converting and refining pages with AI...').start();
+         // Process in small batches to avoid rate limits
+         const batchSize = 5;
+         let processedCount = 0;
+         for (let i = 0; i < pages.length; i += batchSize) {
+             const batch = pages.slice(i, i + batchSize);
+             const results = await Promise.all(batch.map(async (page) => {
+                 const result = await processor.processPage(page);
+                 return result;
+             }));
+             outputContent.push(...results);
+             processedCount += batch.length;
+             processSpinner.text = `Processing pages... (${Math.min(processedCount, pages.length)}/${pages.length})`;
+         }
+         processSpinner.succeed(chalk_1.default.green('Conversion complete!'));
+         // 3. Write
+         const outputPath = path.resolve(process.cwd(), options.output);
+         await fs.writeFile(outputPath, outputContent.join('\n'));
+         console.log(chalk_1.default.bold.green(`\n✅ Success! Agentic context written to: ${outputPath}`));
+         console.log(chalk_1.default.dim(`\nUsage tip: Give this file to your Agent/LLM to fully understand "${url}".\n`));
+     }
+     catch (error) {
+         console.error(chalk_1.default.red('\n❌ Fatal Error:'), error.message);
+         process.exit(1);
+     }
+ });
+ program.parse(process.argv);
package/dist/processor.js ADDED
@@ -0,0 +1,107 @@
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     var desc = Object.getOwnPropertyDescriptor(m, k);
+     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+         desc = { enumerable: true, get: function() { return m[k]; } };
+     }
+     Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+     Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+     o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+     var ownKeys = function(o) {
+         ownKeys = Object.getOwnPropertyNames || function (o) {
+             var ar = [];
+             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+             return ar;
+         };
+         return ownKeys(o);
+     };
+     return function (mod) {
+         if (mod && mod.__esModule) return mod;
+         var result = {};
+         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+         __setModuleDefault(result, mod);
+         return result;
+     };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.Processor = void 0;
+ const turndown_1 = __importDefault(require("turndown"));
+ const openai_1 = require("openai");
+ const cheerio = __importStar(require("cheerio"));
+ class Processor {
+     constructor(apiKey) {
+         this.openai = new openai_1.OpenAI({ apiKey });
+         this.turndown = new turndown_1.default({
+             headingStyle: 'atx',
+             codeBlockStyle: 'fenced'
+         });
+     }
+     cleanHtml(html) {
+         const $ = cheerio.load(html);
+         // Remove clutter
+         $('script').remove();
+         $('style').remove();
+         $('nav').remove();
+         $('footer').remove();
+         $('iframe').remove();
+         $('noscript').remove();
+         $('[role="navigation"]').remove();
+         $('.nav').remove();
+         $('.footer').remove();
+         $('.sidebar').remove(); // Risk: might remove content if class naming is bad, but usually safe for docs sidebars
+         // Get main content if possible
+         const main = $('main').html() || $('article').html() || $('body').html() || '';
+         return main;
+     }
+     async processPage(page) {
+         // 1. Clean HTML
+         const cleanedHtml = this.cleanHtml(page.content);
+         // 2. Convert to Markdown
+         let markdown = this.turndown.turndown(cleanedHtml);
+         // 3. Enhance with LLM
+         // We use a cheap, fast model for basic formatting/cleanup
+         try {
+             const response = await this.openai.chat.completions.create({
+                 model: 'gpt-4o-mini', // Cost effective
+                 messages: [
+                     {
+                         role: 'system',
+                         content: `You are an expert technical writer optimizing documentation for AI Agents.
+ Your task is to rewrite the provided documentation Markdown to be:
+ 1. Extremely high-density and concise.
+ 2. Optimized for retrieval (keywords, clear logic).
+ 3. Stripped of conversational filler ("In this tutorial you will...").
+ 4. Strictly preserving ALL code blocks and technical constraints.
+ 5. Formatted with clear headers.
+
+ Input is a raw scrape. Fix broken markdown if any.
+ Return ONLY the refined markdown.`
+                     },
+                     {
+                         role: 'user',
+                         content: `URL: ${page.url}\nTitle: ${page.title}\n\nContent:\n${markdown}`
+                     }
+                 ],
+                 temperature: 0.1,
+             });
+             return `## Source: [${page.title}](${page.url})\n\n${response.choices[0].message.content || markdown}\n\n---\n\n`;
+         }
+         catch (e) {
+             // Fallback to raw markdown if OpenAI fails
+             return `## Source: [${page.title}](${page.url})\n\n${markdown}\n\n---\n\n`;
+         }
+     }
+ }
+ exports.Processor = Processor;
package/package.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "name": "contextmd-cli",
+   "version": "1.0.0",
+   "description": "The ultimate Agentic AI Context Generator for Documentation.",
+   "bin": {
+     "contextmd": "dist/index.js"
+   },
+   "main": "dist/index.js",
+   "scripts": {
+     "build": "tsc",
+     "start": "node dist/index.js",
+     "test": "echo \"Error: no test specified\" && exit 1"
+   },
+   "keywords": [],
+   "author": "Antigravity",
+   "license": "MIT",
+   "type": "commonjs",
+   "devDependencies": {
+     "@types/node": "^25.0.10",
+     "@types/turndown": "^5.0.6",
+     "ts-node": "^10.9.2",
+     "typescript": "^5.9.3"
+   },
+   "dependencies": {
+     "axios": "^1.13.3",
+     "chalk": "^4.1.2",
+     "cheerio": "^1.2.0",
+     "commander": "^14.0.2",
+     "dotenv": "^17.2.3",
+     "gpt-3-encoder": "^1.1.4",
+     "openai": "^6.16.0",
+     "ora": "^5.4.1",
+     "turndown": "^7.2.2"
+   },
+   "repository": {
+     "type": "git",
+     "url": "git+https://github.com/UditAkhourii/contextmd.git"
+   }
+ }
package/src/crawler.ts ADDED
@@ -0,0 +1,83 @@
+
+ import axios from 'axios';
+ import * as cheerio from 'cheerio';
+ import { URL } from 'url';
+
+ export interface Page {
+   url: string;
+   content: string; // HTML
+   title: string;
+ }
+
+ export class Crawler {
+   private visited = new Set<string>();
+   private queue: string[] = [];
+   private baseUrl: string;
+   private domain: string;
+
+   constructor(baseUrl: string) {
+     this.baseUrl = baseUrl;
+     this.domain = new URL(baseUrl).hostname;
+     this.queue.push(baseUrl);
+   }
+
+   private normalizeUrl(url: string, currentUrl: string): string | null {
+     try {
+       const absolute = new URL(url, currentUrl);
+       // Only keep http(s)
+       if (!['http:', 'https:'].includes(absolute.protocol)) return null;
+       // Stay on domain
+       if (absolute.hostname !== this.domain) return null;
+       // Remove hash
+       absolute.hash = '';
+       return absolute.toString();
+     } catch (e) {
+       return null;
+     }
+   }
+
+   async crawl(maxPages: number = 500, onUrlFound?: (url: string) => void): Promise<Page[]> {
+     const pages: Page[] = [];
+
+     while (this.queue.length > 0 && pages.length < maxPages) {
+       const url = this.queue.shift()!;
+
+       if (this.visited.has(url)) continue;
+       this.visited.add(url);
+
+       if (onUrlFound) onUrlFound(url);
+
+       try {
+         const { data, headers } = await axios.get(url, {
+           headers: { 'User-Agent': 'AgenticDocsConverter/1.0' },
+           timeout: 10000
+         });
+
+         const contentType = headers['content-type'] || '';
+         if (!contentType.includes('text/html')) continue;
+
+         const $ = cheerio.load(data);
+         const title = $('title').text() || url;
+
+         // Extract links
+         $('a').each((_, element) => {
+           const href = $(element).attr('href');
+           if (href) {
+             const normalized = this.normalizeUrl(href, url);
+             if (normalized && !this.visited.has(normalized) && !this.queue.includes(normalized)) {
+               this.queue.push(normalized);
+             }
+           }
+         });
+
+         pages.push({ url, content: data, title });
+
+       } catch (error) {
+         // console.error(`Failed to crawl ${url}: ${(error as Error).message}`);
+         // Continue despite errors
+       }
+     }
+
+     return pages;
+   }
+ }
package/src/index.ts ADDED
@@ -0,0 +1,84 @@
+ #!/usr/bin/env node
+ import { Command } from 'commander';
+ import ora from 'ora';
+ import chalk from 'chalk';
+ import * as fs from 'fs/promises';
+ import * as path from 'path';
+ import 'dotenv/config'; // Load .env if present
+
+ import { Crawler } from './crawler';
+ import { Processor } from './processor';
+
+ const program = new Command();
+
+ program
+   .name('contextmd')
+   .description('Convert any documentation into an Agentic AI ready context file')
+   .argument('<url>', 'Base URL of the documentation to convert')
+   .option('-k, --key <key>', 'OpenAI API Key (or set OPENAI_API_KEY env var)')
+   .option('-o, --output <path>', 'Output file path', 'context.md')
+   .option('-l, --limit <number>', 'Max pages to crawl', '100')
+   .action(async (url, options) => {
+     try {
+       console.log(chalk.bold.cyan('\n🚀 ContextMD - Agentic AI Context Generator\n'));
+
+       const apiKey = options.key || process.env.OPENAI_API_KEY;
+
+       if (!apiKey) {
+         console.error(chalk.red('❌ Error: OpenAI API Key is required. Provide it via -k flag or OPENAI_API_KEY env var.'));
+         process.exit(1);
+       }
+
+       const crawler = new Crawler(url);
+       const processor = new Processor(apiKey);
+
+       const spinner = ora('Initializing crawler...').start();
+
+       // 1. Crawl
+       spinner.text = `Crawling ${url}...`;
+       const pages = await crawler.crawl(parseInt(options.limit), (foundUrl) => {
+         spinner.text = `Crawling... Found: ${foundUrl}`;
+       });
+
+       spinner.succeed(chalk.green(`Crawling complete! Found ${pages.length} pages.`));
+
+       // 2. Process
+       const outputContent: string[] = [];
+
+       // Header for context.md
+       outputContent.push(`# Documentation Context\n\nGenerated by ContextMD from ${url} at ${new Date().toISOString()}\n\n---\n\n`);
+
+       const processSpinner = ora('Converting and refining pages with AI...').start();
+
+       // Process in small batches to avoid rate limits
+       const batchSize = 5;
+       let processedCount = 0;
+
+       for (let i = 0; i < pages.length; i += batchSize) {
+         const batch = pages.slice(i, i + batchSize);
+         const results = await Promise.all(batch.map(async (page) => {
+           const result = await processor.processPage(page);
+           return result;
+         }));
+
+         outputContent.push(...results);
+         processedCount += batch.length;
+         processSpinner.text = `Processing pages... (${Math.min(processedCount, pages.length)}/${pages.length})`;
+       }
+
+       processSpinner.succeed(chalk.green('Conversion complete!'));
+
+       // 3. Write
+       const outputPath = path.resolve(process.cwd(), options.output);
+       await fs.writeFile(outputPath, outputContent.join('\n'));
+
+       console.log(chalk.bold.green(`\n✅ Success! Agentic context written to: ${outputPath}`));
+       console.log(chalk.dim(`\nUsage tip: Give this file to your Agent/LLM to fully understand "${url}".\n`));
+
+     } catch (error) {
+       console.error(chalk.red('\n❌ Fatal Error:'), (error as Error).message);
+       process.exit(1);
+     }
+   });
+
+ program.parse(process.argv);
package/src/processor.ts ADDED
@@ -0,0 +1,80 @@
+
+ import TurndownService from 'turndown';
+ import { OpenAI } from 'openai';
+ import * as cheerio from 'cheerio';
+ import { Page } from './crawler';
+
+ export class Processor {
+   private openai: OpenAI;
+   private turndown: TurndownService;
+
+   constructor(apiKey: string) {
+     this.openai = new OpenAI({ apiKey });
+     this.turndown = new TurndownService({
+       headingStyle: 'atx',
+       codeBlockStyle: 'fenced'
+     });
+   }
+
+   private cleanHtml(html: string): string {
+     const $ = cheerio.load(html);
+
+     // Remove clutter
+     $('script').remove();
+     $('style').remove();
+     $('nav').remove();
+     $('footer').remove();
+     $('iframe').remove();
+     $('noscript').remove();
+     $('[role="navigation"]').remove();
+     $('.nav').remove();
+     $('.footer').remove();
+     $('.sidebar').remove(); // Risk: might remove content if class naming is bad, but usually safe for docs sidebars
+
+     // Get main content if possible
+     const main = $('main').html() || $('article').html() || $('body').html() || '';
+     return main;
+   }
+
+   async processPage(page: Page): Promise<string> {
+     // 1. Clean HTML
+     const cleanedHtml = this.cleanHtml(page.content);
+
+     // 2. Convert to Markdown
+     let markdown = this.turndown.turndown(cleanedHtml);
+
+     // 3. Enhance with LLM
+     // We use a cheap, fast model for basic formatting/cleanup
+     try {
+       const response = await this.openai.chat.completions.create({
+         model: 'gpt-4o-mini', // Cost effective
+         messages: [
+           {
+             role: 'system',
+             content: `You are an expert technical writer optimizing documentation for AI Agents.
+ Your task is to rewrite the provided documentation Markdown to be:
+ 1. Extremely high-density and concise.
+ 2. Optimized for retrieval (keywords, clear logic).
+ 3. Stripped of conversational filler ("In this tutorial you will...").
+ 4. Strictly preserving ALL code blocks and technical constraints.
+ 5. Formatted with clear headers.
+
+ Input is a raw scrape. Fix broken markdown if any.
+ Return ONLY the refined markdown.`
+           },
+           {
+             role: 'user',
+             content: `URL: ${page.url}\nTitle: ${page.title}\n\nContent:\n${markdown}`
+           }
+         ],
+         temperature: 0.1,
+       });
+
+       return `## Source: [${page.title}](${page.url})\n\n${response.choices[0].message.content || markdown}\n\n---\n\n`;
+
+     } catch (e) {
+       // Fallback to raw markdown if OpenAI fails
+       return `## Source: [${page.title}](${page.url})\n\n${markdown}\n\n---\n\n`;
+     }
+   }
+ }
package/tsconfig.json ADDED
@@ -0,0 +1,15 @@
+
+ {
+   "compilerOptions": {
+     "target": "es2020",
+     "module": "commonjs",
+     "outDir": "./dist",
+     "rootDir": "./src",
+     "strict": true,
+     "esModuleInterop": true,
+     "skipLibCheck": true,
+     "forceConsistentCasingInFileNames": true
+   },
+   "include": ["src/**/*"],
+   "exclude": ["node_modules"]
+ }