contextmd-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +24 -0
- package/LICENSE +21 -0
- package/README.md +146 -0
- package/dist/crawler.js +107 -0
- package/dist/index.js +103 -0
- package/dist/processor.js +107 -0
- package/package.json +39 -0
- package/src/crawler.ts +83 -0
- package/src/index.ts +84 -0
- package/src/processor.ts +80 -0
- package/tsconfig.json +15 -0
package/.github/workflows/publish.yml
ADDED
@@ -0,0 +1,24 @@
```yaml
name: Publish to NPM

on:
  release:
    types: [created]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: '18.x'
          registry-url: 'https://registry.npmjs.org'

      - run: npm ci
      - run: npm run build

      # This publishes to NPM using the token you added to GitHub Secrets
      - run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
```
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Antigravity

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,146 @@
# 🧠 ContextMD

<p align="center">
  <strong>Feed your Agents the Context they deserve.</strong>
</p>

<p align="center">
  <a href="https://github.com/UditAkhourii/contextmd/actions"><img src="https://img.shields.io/github/actions/workflow/status/UditAkhourii/contextmd/ci.yml?branch=main&style=for-the-badge" alt="CI status"></a>
  <a href="https://www.npmjs.com/package/contextmd-cli"><img src="https://img.shields.io/npm/v/contextmd-cli?style=for-the-badge&color=blue" alt="NPM Version"></a>
  <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-green.svg?style=for-the-badge" alt="MIT License"></a>
  <a href="https://typescriptlang.org"><img src="https://img.shields.io/badge/Written_in-TypeScript-3178C6?style=for-the-badge&logo=typescript&logoColor=white" alt="TypeScript"></a>
</p>

---

**ContextMD** is the ultimate terminal utility for turning complex documentation websites into a single, high-density **AI Context File**.

Modern LLMs and Agents (like Claude 3.5 Sonnet, GPT-4o, or Gemini 1.5 Pro) are powerful, but they struggle to navigate multi-page documentation sites effectively. They get lost in navigation bars, footers, duplicate content, and fragmented pages.

**ContextMD solves this.** It crawls, cleans, and chemically refines entire documentation sites into a single `context.md` file that you can drop directly into your LLM's context window.

## ✨ Features

- **🕷️ Deep Crawling**: Intelligently traverses documentation sites, following links and building a comprehensive map of the content.
- **🧠 AI-Powered Refinement**: Uses OpenAI's models (configurable) to "read" each page and rewrite it for machine comprehension, stripping fluff and prioritizing logic, API signatures, and examples.
- **🧹 Noise Reduction**: Automatically detects and separates main content from sidebars, headers, footers, and advertisements.
- **⚡ High Performance**: Concurrent processing with a beautiful, real-time CLI dashboard.
- **📄 Single File Output**: Produces a consolidated Markdown file with clear headers and structure, perfect for RAG systems or direct LLM context.

## 🚀 Installation

Ensure you have **Node.js 18+** installed.

### Global Install (Recommended)

```bash
npm install -g contextmd-cli
```

### Run via npx (No install required)

```bash
npx contextmd https://docs.example.com
```

## 🛠️ Usage

### Quick Start

1. **Get an OpenAI API Key**: ContextMD uses AI to compress and refine the content.
2. **Run the tool**:

```bash
export OPENAI_API_KEY=sk-proj-...
contextmd https://docs.turso.tech
```

This will generate a `context.md` file in your current directory.

### Command Line Options

```bash
Usage: contextmd [options] <url>

Arguments:
  url                   Base URL of the documentation to convert

Options:
  -k, --key <key>       OpenAI API Key (can also be set via OPENAI_API_KEY env var)
  -o, --output <path>   Output file path (default: "context.md")
  -l, --limit <number>  Max pages to crawl (default: "100")
  -h, --help            display help for command
```

### Examples

**Crawl a specific documentation site with a page limit:**

```bash
contextmd https://developer.spotify.com/documentation/web-api --limit 50
```

**Save to a specific location:**

```bash
contextmd https://stripe.com/docs/api -o ./stripe-context.md
```

## 🏗️ How It Works

ContextMD operates in a three-stage pipeline (condensed into a sketch after this list):

1. **The Crawler**:
   * Starts at the provided `url`.
   * Uses a Breadth-First Search (BFS) algorithm to find internal links.
   * Filters out external links, social media, and irrelevant pages.
   * Respects the `--limit` flag to prevent infinite loops on massive sites.

2. **The Processor (The "Brain")**:
   * Downloads the raw HTML of each discovered page.
   * Uses `turndown` and `cheerio` to convert HTML to Markdown.
   * **AI Step**: Sends the raw Markdown to an LLM with a specialized system prompt designed to:
     * Summarize verbose sections.
     * Preserve code blocks and API schemas exactly.
     * Remove marketing fluff.
     * Standardize formatting.

3. **The Compiler**:
   * Stitches all processed pages into a single `context.md` file.
   * Adds a metadata header and table-of-contents structure (implicitly, via Markdown headers).
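In code, the pipeline reduces to roughly this sketch (simplified from `src/crawler.ts`, `src/processor.ts`, and `src/index.ts`; the real CLI adds spinners, batching, and error handling, and the URL here is a placeholder):

```typescript
import { Crawler } from './crawler';
import { Processor } from './processor';
import * as fs from 'fs/promises';

// Simplified pipeline: crawl -> clean/convert/refine -> compile one file
async function buildContext(url: string, apiKey: string, limit = 100) {
    const pages = await new Crawler(url).crawl(limit);       // Stage 1: BFS crawl

    const processor = new Processor(apiKey);
    const sections = [`# Documentation Context\n\nGenerated from ${url}\n\n---\n`];
    for (const page of pages) {
        sections.push(await processor.processPage(page));    // Stage 2: HTML -> Markdown -> LLM
    }

    await fs.writeFile('context.md', sections.join('\n'));   // Stage 3: compile
}
```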
## 📦 For Developers

Want to build this from source?

1. **Clone the repo**:
   ```bash
   git clone https://github.com/UditAkhourii/contextmd.git
   cd contextmd
   ```

2. **Install dependencies**:
   ```bash
   npm install
   ```

3. **Build**:
   ```bash
   npm run build
   ```

4. **Run locally**:
   ```bash
   node dist/index.js https://example.com
   ```

## 🤝 Contributing

We welcome contributions! Please open an issue or submit a PR if you have ideas for:
- Support for local LLMs (Ollama, etc.)
- Better crawling heuristics for SPAs (Single-Page Apps)
- Output formats (JSON, JSONL for fine-tuning)

## 📜 License

MIT © [Antigravity](https://github.com/UditAkhourii)
package/dist/crawler.js
ADDED
@@ -0,0 +1,107 @@
```js
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Crawler = void 0;
const axios_1 = __importDefault(require("axios"));
const cheerio = __importStar(require("cheerio"));
const url_1 = require("url");
class Crawler {
    constructor(baseUrl) {
        this.visited = new Set();
        this.queue = [];
        this.baseUrl = baseUrl;
        this.domain = new url_1.URL(baseUrl).hostname;
        this.queue.push(baseUrl);
    }
    normalizeUrl(url, currentUrl) {
        try {
            const absolute = new url_1.URL(url, currentUrl);
            // Only keep http(s)
            if (!['http:', 'https:'].includes(absolute.protocol))
                return null;
            // Stay on domain
            if (absolute.hostname !== this.domain)
                return null;
            // Remove hash
            absolute.hash = '';
            return absolute.toString();
        }
        catch (e) {
            return null;
        }
    }
    async crawl(maxPages = 500, onUrlFound) {
        const pages = [];
        while (this.queue.length > 0 && pages.length < maxPages) {
            const url = this.queue.shift();
            if (this.visited.has(url))
                continue;
            this.visited.add(url);
            if (onUrlFound)
                onUrlFound(url);
            try {
                const { data, headers } = await axios_1.default.get(url, {
                    headers: { 'User-Agent': 'AgenticDocsConverter/1.0' },
                    timeout: 10000
                });
                const contentType = headers['content-type'] || '';
                if (!contentType.includes('text/html'))
                    continue;
                const $ = cheerio.load(data);
                const title = $('title').text() || url;
                // Extract links
                $('a').each((_, element) => {
                    const href = $(element).attr('href');
                    if (href) {
                        const normalized = this.normalizeUrl(href, url);
                        if (normalized && !this.visited.has(normalized) && !this.queue.includes(normalized)) {
                            this.queue.push(normalized);
                        }
                    }
                });
                pages.push({ url, content: data, title });
            }
            catch (error) {
                // console.error(`Failed to crawl ${url}: ${(error as Error).message}`);
                // Continue despite errors
            }
        }
        return pages;
    }
}
exports.Crawler = Crawler;
```
package/dist/index.js
ADDED
@@ -0,0 +1,103 @@
```js
#!/usr/bin/env node
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const commander_1 = require("commander");
const ora_1 = __importDefault(require("ora"));
const chalk_1 = __importDefault(require("chalk"));
const fs = __importStar(require("fs/promises"));
const path = __importStar(require("path"));
require("dotenv/config"); // Load .env if present
const crawler_1 = require("./crawler");
const processor_1 = require("./processor");
const program = new commander_1.Command();
program
    .name('contextmd')
    .description('Convert any documentation into an Agentic AI ready context file')
    .argument('<url>', 'Base URL of the documentation to convert')
    .option('-k, --key <key>', 'OpenAI API Key (or set OPENAI_API_KEY env var)')
    .option('-o, --output <path>', 'Output file path', 'context.md')
    .option('-l, --limit <number>', 'Max pages to crawl', '100')
    .action(async (url, options) => {
    try {
        console.log(chalk_1.default.bold.cyan('\n🚀 ContextMD - Agentic AI Context Generator\n'));
        const apiKey = options.key || process.env.OPENAI_API_KEY;
        if (!apiKey) {
            console.error(chalk_1.default.red('❌ Error: OpenAI API Key is required. Provide it via -k flag or OPENAI_API_KEY env var.'));
            process.exit(1);
        }
        const crawler = new crawler_1.Crawler(url);
        const processor = new processor_1.Processor(apiKey);
        const spinner = (0, ora_1.default)('Initializing crawler...').start();
        // 1. Crawl
        spinner.text = `Crawling ${url}...`;
        const pages = await crawler.crawl(parseInt(options.limit), (foundUrl) => {
            spinner.text = `Crawling... Found: ${foundUrl}`;
        });
        spinner.succeed(chalk_1.default.green(`Crawling complete! Found ${pages.length} pages.`));
        // 2. Process
        const outputContent = [];
        // Header for context.md
        outputContent.push(`# Documentation Context\n\nGenerated by ContextMD from ${url} at ${new Date().toISOString()}\n\n---\n\n`);
        const processSpinner = (0, ora_1.default)('Converting and refining pages with AI...').start();
        // Process sequentially or in small batches to avoid Rate Limits
        const batchSize = 5;
        let processedCount = 0;
        for (let i = 0; i < pages.length; i += batchSize) {
            const batch = pages.slice(i, i + batchSize);
            const results = await Promise.all(batch.map(async (page) => {
                const result = await processor.processPage(page);
                return result;
            }));
            outputContent.push(...results);
            processedCount += batch.length;
            processSpinner.text = `Processing pages... (${Math.min(processedCount, pages.length)}/${pages.length})`;
        }
        processSpinner.succeed(chalk_1.default.green('Conversion complete!'));
        // 3. Write
        const outputPath = path.resolve(process.cwd(), options.output);
        await fs.writeFile(outputPath, outputContent.join('\n'));
        console.log(chalk_1.default.bold.green(`\n✅ Success! Agentic context written to: ${outputPath}`));
        console.log(chalk_1.default.dim(`\nUsage tip: Give this file to your Agent/LLM to fully understand "${url}".\n`));
    }
    catch (error) {
        console.error(chalk_1.default.red('\n❌ Fatal Error:'), error.message);
        process.exit(1);
    }
});
program.parse(process.argv);
```
package/dist/processor.js
ADDED
@@ -0,0 +1,107 @@
```js
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Processor = void 0;
const turndown_1 = __importDefault(require("turndown"));
const openai_1 = require("openai");
const cheerio = __importStar(require("cheerio"));
class Processor {
    constructor(apiKey) {
        this.openai = new openai_1.OpenAI({ apiKey });
        this.turndown = new turndown_1.default({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced'
        });
    }
    cleanHtml(html) {
        const $ = cheerio.load(html);
        // Remove clutter
        $('script').remove();
        $('style').remove();
        $('nav').remove();
        $('footer').remove();
        $('iframe').remove();
        $('noscript').remove();
        $('[role="navigation"]').remove();
        $('.nav').remove();
        $('.footer').remove();
        $('.sidebar').remove(); // Risk: might remove content if class naming is bad, but usually safe for docs sidebar
        // Get main content if possible
        const main = $('main').html() || $('article').html() || $('body').html() || '';
        return main;
    }
    async processPage(page) {
        // 1. Clean HTML
        const cleanedHtml = this.cleanHtml(page.content);
        // 2. Convert to Markdown
        let markdown = this.turndown.turndown(cleanedHtml);
        // 3. Enhance with LLM
        // We use a cheap fast model for basic formatting/cleanup
        try {
            const response = await this.openai.chat.completions.create({
                model: 'gpt-4o-mini', // Cost effective
                messages: [
                    {
                        role: 'system',
                        content: `You are an expert technical writer optimizing documentation for AI Agents.
Your task is to rewrite the provided documentation Markdown to be:
1. Extremely high-density and concise.
2. Optimized for retrieval (keywords, clear logic).
3. Stripped of conversational filler ("In this tutorial you will...").
4. Strictly preserving ALL code blocks and technical constraints.
5. Formatted with clear headers.

Input is a raw scrape. Fix broken markdown if any.
Return ONLY the refined markdown.`
                    },
                    {
                        role: 'user',
                        content: `URL: ${page.url}\nTitle: ${page.title}\n\nContent:\n${markdown}`
                    }
                ],
                temperature: 0.1,
            });
            return `## Source: [${page.title}](${page.url})\n\n${response.choices[0].message.content || markdown}\n\n---\n\n`;
        }
        catch (e) {
            // Fallback to raw markdown if OpenAI fails
            return `## Source: [${page.title}](${page.url})\n\n${markdown}\n\n---\n\n`;
        }
    }
}
exports.Processor = Processor;
```
package/package.json
ADDED
@@ -0,0 +1,39 @@
```json
{
  "name": "contextmd-cli",
  "version": "1.0.0",
  "description": "The ultimate Agentic AI Context Generator for Documentation.",
  "bin": {
    "contextmd": "dist/index.js"
  },
  "main": "dist/index.js",
  "scripts": {
    "build": "tsc",
    "start": "node dist/index.js",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "Antigravity",
  "license": "MIT",
  "type": "commonjs",
  "devDependencies": {
    "@types/node": "^25.0.10",
    "@types/turndown": "^5.0.6",
    "ts-node": "^10.9.2",
    "typescript": "^5.9.3"
  },
  "dependencies": {
    "axios": "^1.13.3",
    "chalk": "^4.1.2",
    "cheerio": "^1.2.0",
    "commander": "^14.0.2",
    "dotenv": "^17.2.3",
    "gpt-3-encoder": "^1.1.4",
    "openai": "^6.16.0",
    "ora": "^5.4.1",
    "turndown": "^7.2.2"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/UditAkhourii/contextmd.git"
  }
}
```
package/src/crawler.ts
ADDED
@@ -0,0 +1,83 @@
```ts
import axios from 'axios';
import * as cheerio from 'cheerio';
import { URL } from 'url';

export interface Page {
    url: string;
    content: string; // HTML
    title: string;
}

export class Crawler {
    private visited = new Set<string>();
    private queue: string[] = [];
    private baseUrl: string;
    private domain: string;

    constructor(baseUrl: string) {
        this.baseUrl = baseUrl;
        this.domain = new URL(baseUrl).hostname;
        this.queue.push(baseUrl);
    }

    private normalizeUrl(url: string, currentUrl: string): string | null {
        try {
            const absolute = new URL(url, currentUrl);
            // Only keep http(s)
            if (!['http:', 'https:'].includes(absolute.protocol)) return null;
            // Stay on domain
            if (absolute.hostname !== this.domain) return null;
            // Remove hash
            absolute.hash = '';
            return absolute.toString();
        } catch (e) {
            return null;
        }
    }

    async crawl(maxPages: number = 500, onUrlFound?: (url: string) => void): Promise<Page[]> {
        const pages: Page[] = [];

        while (this.queue.length > 0 && pages.length < maxPages) {
            const url = this.queue.shift()!;

            if (this.visited.has(url)) continue;
            this.visited.add(url);

            if (onUrlFound) onUrlFound(url);

            try {
                const { data, headers } = await axios.get(url, {
                    headers: { 'User-Agent': 'AgenticDocsConverter/1.0' },
                    timeout: 10000
                });

                const contentType = headers['content-type'] || '';
                if (!contentType.includes('text/html')) continue;

                const $ = cheerio.load(data);
                const title = $('title').text() || url;

                // Extract links
                $('a').each((_, element) => {
                    const href = $(element).attr('href');
                    if (href) {
                        const normalized = this.normalizeUrl(href, url);
                        if (normalized && !this.visited.has(normalized) && !this.queue.includes(normalized)) {
                            this.queue.push(normalized);
                        }
                    }
                });

                pages.push({ url, content: data, title });

            } catch (error) {
                // console.error(`Failed to crawl ${url}: ${(error as Error).message}`);
                // Continue despite errors
            }
        }

        return pages;
    }
}
```
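A minimal standalone driver for the class above (illustrative sketch only; `https://docs.example.com` is a placeholder, and the published CLI wires this up in `src/index.ts` instead):

```ts
import { Crawler } from './crawler';

async function main() {
    const crawler = new Crawler('https://docs.example.com');
    // Cap the crawl at 10 pages and log each URL as it is visited
    const pages = await crawler.crawl(10, (url) => console.log('Visiting:', url));
    console.log(`Fetched ${pages.length} HTML pages`);
}

main().catch(console.error);
```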
package/src/index.ts
ADDED
@@ -0,0 +1,84 @@
```ts
#!/usr/bin/env node
import { Command } from 'commander';
import ora from 'ora';
import chalk from 'chalk';
import * as fs from 'fs/promises';
import * as path from 'path';
import 'dotenv/config'; // Load .env if present

import { Crawler } from './crawler';
import { Processor } from './processor';

const program = new Command();

program
    .name('contextmd')
    .description('Convert any documentation into an Agentic AI ready context file')
    .argument('<url>', 'Base URL of the documentation to convert')
    .option('-k, --key <key>', 'OpenAI API Key (or set OPENAI_API_KEY env var)')
    .option('-o, --output <path>', 'Output file path', 'context.md')
    .option('-l, --limit <number>', 'Max pages to crawl', '100')
    .action(async (url, options) => {
        try {
            console.log(chalk.bold.cyan('\n🚀 ContextMD - Agentic AI Context Generator\n'));

            const apiKey = options.key || process.env.OPENAI_API_KEY;

            if (!apiKey) {
                console.error(chalk.red('❌ Error: OpenAI API Key is required. Provide it via -k flag or OPENAI_API_KEY env var.'));
                process.exit(1);
            }

            const crawler = new Crawler(url);
            const processor = new Processor(apiKey);

            const spinner = ora('Initializing crawler...').start();

            // 1. Crawl
            spinner.text = `Crawling ${url}...`;
            const pages = await crawler.crawl(parseInt(options.limit), (foundUrl) => {
                spinner.text = `Crawling... Found: ${foundUrl}`;
            });

            spinner.succeed(chalk.green(`Crawling complete! Found ${pages.length} pages.`));

            // 2. Process
            const outputContent: string[] = [];

            // Header for context.md
            outputContent.push(`# Documentation Context\n\nGenerated by ContextMD from ${url} at ${new Date().toISOString()}\n\n---\n\n`);

            const processSpinner = ora('Converting and refining pages with AI...').start();

            // Process sequentially or in small batches to avoid Rate Limits
            const batchSize = 5;
            let processedCount = 0;

            for (let i = 0; i < pages.length; i += batchSize) {
                const batch = pages.slice(i, i + batchSize);
                const results = await Promise.all(batch.map(async (page) => {
                    const result = await processor.processPage(page);
                    return result;
                }));

                outputContent.push(...results);
                processedCount += batch.length;
                processSpinner.text = `Processing pages... (${Math.min(processedCount, pages.length)}/${pages.length})`;
            }

            processSpinner.succeed(chalk.green('Conversion complete!'));

            // 3. Write
            const outputPath = path.resolve(process.cwd(), options.output);
            await fs.writeFile(outputPath, outputContent.join('\n'));

            console.log(chalk.bold.green(`\n✅ Success! Agentic context written to: ${outputPath}`));
            console.log(chalk.dim(`\nUsage tip: Give this file to your Agent/LLM to fully understand "${url}".\n`));

        } catch (error) {
            console.error(chalk.red('\n❌ Fatal Error:'), (error as Error).message);
            process.exit(1);
        }
    });

program.parse(process.argv);
```
package/src/processor.ts
ADDED
@@ -0,0 +1,80 @@
```ts
import TurndownService from 'turndown';
import { OpenAI } from 'openai';
import * as cheerio from 'cheerio';
import { Page } from './crawler';

export class Processor {
    private openai: OpenAI;
    private turndown: TurndownService;

    constructor(apiKey: string) {
        this.openai = new OpenAI({ apiKey });
        this.turndown = new TurndownService({
            headingStyle: 'atx',
            codeBlockStyle: 'fenced'
        });
    }

    private cleanHtml(html: string): string {
        const $ = cheerio.load(html);

        // Remove clutter
        $('script').remove();
        $('style').remove();
        $('nav').remove();
        $('footer').remove();
        $('iframe').remove();
        $('noscript').remove();
        $('[role="navigation"]').remove();
        $('.nav').remove();
        $('.footer').remove();
        $('.sidebar').remove(); // Risk: might remove content if class naming is bad, but usually safe for docs sidebar

        // Get main content if possible
        const main = $('main').html() || $('article').html() || $('body').html() || '';
        return main;
    }

    async processPage(page: Page): Promise<string> {
        // 1. Clean HTML
        const cleanedHtml = this.cleanHtml(page.content);

        // 2. Convert to Markdown
        let markdown = this.turndown.turndown(cleanedHtml);

        // 3. Enhance with LLM
        // We use a cheap fast model for basic formatting/cleanup
        try {
            const response = await this.openai.chat.completions.create({
                model: 'gpt-4o-mini', // Cost effective
                messages: [
                    {
                        role: 'system',
                        content: `You are an expert technical writer optimizing documentation for AI Agents.
Your task is to rewrite the provided documentation Markdown to be:
1. Extremely high-density and concise.
2. Optimized for retrieval (keywords, clear logic).
3. Stripped of conversational filler ("In this tutorial you will...").
4. Strictly preserving ALL code blocks and technical constraints.
5. Formatted with clear headers.

Input is a raw scrape. Fix broken markdown if any.
Return ONLY the refined markdown.`
                    },
                    {
                        role: 'user',
                        content: `URL: ${page.url}\nTitle: ${page.title}\n\nContent:\n${markdown}`
                    }
                ],
                temperature: 0.1,
            });

            return `## Source: [${page.title}](${page.url})\n\n${response.choices[0].message.content || markdown}\n\n---\n\n`;

        } catch (e) {
            // Fallback to raw markdown if OpenAI fails
            return `## Source: [${page.title}](${page.url})\n\n${markdown}\n\n---\n\n`;
        }
    }
}
```
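A minimal sketch showing how `Processor` consumes a crawled `Page` (illustrative only; the URL and inline HTML below are made-up placeholders, and a valid `OPENAI_API_KEY` is assumed):

```ts
import { Processor } from './processor';
import { Page } from './crawler';

async function main() {
    // Hypothetical page; in the real pipeline this comes from Crawler.crawl()
    const page: Page = {
        url: 'https://docs.example.com/install',
        title: 'Installation',
        content: '<main><h1>Installation</h1><pre><code>npm i example</code></pre></main>'
    };

    // Falls back to the raw Turndown Markdown if the OpenAI call fails
    const processor = new Processor(process.env.OPENAI_API_KEY!);
    console.log(await processor.processPage(page));
}

main().catch(console.error);
```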
package/tsconfig.json
ADDED
@@ -0,0 +1,15 @@
```json
{
  "compilerOptions": {
    "target": "es2020",
    "module": "commonjs",
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules"]
}
```