unified-sf-docs-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -0
- package/dist/db.js +173 -0
- package/dist/index.js +149 -0
- package/dist/scraper.js +275 -0
- package/package.json +45 -0
package/README.md
ADDED
@@ -0,0 +1,82 @@
# Unified Salesforce Documentation MCP Server

A Model Context Protocol (MCP) server that lets LLMs scrape, digest, and search both modern and legacy Salesforce documentation. It handles the deeply nested shadow DOMs typical of Lightning Web Components (LWC) pages as well as legacy iframe-based documentation structures.

## Features

- **Deep Shadow DOM Piercing:** Bypasses 400KB+ of SPA boilerplate on `help.salesforce.com` and `developer.salesforce.com` to extract only the article content as clean Markdown.
- **Hierarchical Spidering:** Automatically queues and scrapes all related pages linked from a central guide using `mass_extract_guide`.
- **Offline RAG Capabilities:** Chunks and indexes scraped Markdown into a local SQLite database (`db/salesforce-docs.db`), enabling instantaneous local search via `search_local_docs`.

## Available Tools

1. **`scrape_single_page`**: Provide a Salesforce documentation URL. The server uses a headless browser (Puppeteer) to load the page, wait for dynamic content, pierce all shadow DOMs, and return clean Markdown.
2. **`mass_extract_guide`**: Provide a "Table of Contents" or central guide URL. The server extracts the parent page, finds all hierarchical child links, scrapes them one by one, chunks their content, and saves them to a local SQLite database for offline querying.
3. **`search_local_docs`**: Provide a natural-language query (e.g., `LWC lifecycle hooks`). The server runs a keyword-based SQL search (LIKE matching with term-density scoring) against the SQLite database and returns the best-matching pre-scraped chunks of documentation. A sketch of driving these tools from an MCP client follows this list.
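
For orientation, here is a minimal sketch of calling these tools from your own script with the MCP SDK's stdio client (the same `@modelcontextprotocol/sdk` package this server depends on). Treat it as an illustration rather than a supported entry point: the path and URLs are placeholders, and the client API surface may shift between SDK versions.

```javascript
// Hypothetical client-side sketch; not shipped with this package.
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

// Spawn the server over stdio, exactly as Claude Desktop or Cursor would.
const transport = new StdioClientTransport({
  command: "node",
  args: ["/PATH/TO/unified-sf-docs-mcp/dist/index.js"],
});
const client = new Client({ name: "demo-client", version: "1.0.0" });
await client.connect(transport);

// Spider a guide into the local SQLite database...
await client.callTool({
  name: "mass_extract_guide",
  arguments: { rootUrl: "https://developer.salesforce.com/docs/...", maxPages: 10 },
});

// ...then query it offline.
const res = await client.callTool({
  name: "search_local_docs",
  arguments: { query: "LWC lifecycle hooks", maxResults: 5 },
});
console.log(res.content[0].text);
```
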
## Quick Start Installation

For anyone downloading this project for the first time:

1. **Clone the Repository:**
   ```bash
   git clone https://github.com/ttrevisan-ilmn/unified-sf-docs-mcp.git
   cd unified-sf-docs-mcp
   ```
2. **Install Dependencies:**
   ```bash
   npm install
   ```
3. **Build the Project:**
   ```bash
   npm run build
   ```
   *(Note: The server runs from the compiled `/dist` directory, so building is required.)*
   *(Note: To use the tools interactively, integrate this MCP server with an MCP client such as Claude Desktop or Cursor.)*

## Testing

You can use the provided test scripts to verify the core functionality and the scraper against different Salesforce URL layouts:

```bash
# Test the database, chunking, and search functionality
npx tsx tests/test-core.js

# Test the robust Shadow DOM scraper against 4 different URL permutations
npx tsx tests/test-all.js
```

## Integrating with AI Assistants

MCP servers act as a bridge between an LLM and local tools. To actually use this server, plug it into an AI coding assistant such as **Cursor** or **Claude Desktop**.

### 1. Claude Desktop

1. Open the Claude Desktop configuration file:
   - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
   - Windows: `%APPDATA%\Claude\claude_desktop_config.json`
2. Add the following entry to your `mcpServers` object, replacing `/PATH/TO` with the absolute path to where you cloned this repository:

```json
{
  "mcpServers": {
    "unified-sf-docs": {
      "command": "node",
      "args": [
        "/PATH/TO/unified-sf-docs-mcp/dist/index.js"
      ]
    }
  }
}
```

3. Restart Claude Desktop. The tools will now be available when talking to Claude!

### 2. Cursor

1. Open Cursor Settings -> Features -> MCP
2. Click **+ Add new MCP server**
3. Configure the settings:
   - **Type**: `command`
   - **Name**: `unified-sf-docs`
   - **Command**: `node /PATH/TO/unified-sf-docs-mcp/dist/index.js` (be sure to use the absolute path)
4. Click Save. Cursor will connect to the server and surface the three new tools to Cursor Agent.
package/dist/db.js
ADDED
@@ -0,0 +1,173 @@
// @ts-ignore
import initSqlJs from "sql.js";
import { fileURLToPath } from "url";
import { dirname, join } from "path";
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const DATA_DIR = join(__dirname, "..", "..", "db");
const DB_PATH = join(DATA_DIR, "salesforce-docs.db");
let db = null;
let SQL = null;
async function initSql() {
    if (!SQL) {
        SQL = await initSqlJs();
    }
    return SQL;
}
export async function getDatabase() {
    if (!db) {
        const sqljs = await initSql();
        if (!existsSync(DATA_DIR))
            mkdirSync(DATA_DIR, { recursive: true });
        if (existsSync(DB_PATH)) {
            const buffer = readFileSync(DB_PATH);
            db = new sqljs.Database(buffer);
        }
        else {
            db = new sqljs.Database();
            initializeDatabaseSchema(db);
        }
    }
    return db;
}
function initializeDatabaseSchema(dbInstance) {
    dbInstance.run(`
    CREATE TABLE IF NOT EXISTS documents (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      url TEXT NOT NULL UNIQUE,
      title TEXT NOT NULL,
      hash TEXT NOT NULL,
      category TEXT,
      last_scraped DATETIME DEFAULT CURRENT_TIMESTAMP
    )
  `);
    dbInstance.run(`
    CREATE TABLE IF NOT EXISTS chunks (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      document_id INTEGER NOT NULL,
      chunk_index INTEGER NOT NULL,
      content TEXT NOT NULL,
      content_lower TEXT NOT NULL,
      FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
    )
  `);
    dbInstance.run(`CREATE INDEX IF NOT EXISTS idx_documents_url ON documents(url)`);
    dbInstance.run(`CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)`);
    saveDatabase(dbInstance);
}
// sql.js keeps the database in memory, so persist it to disk after every write.
export function saveDatabase(dbInstance = db) {
    if (dbInstance) {
        const data = dbInstance.export();
        const buffer = Buffer.from(data);
        writeFileSync(DB_PATH, buffer);
    }
}
export async function saveDocument(url, title, markdown, hash, category) {
    const database = await getDatabase();
    // Check if the document already exists and hasn't changed
    const checkStmt = database.prepare('SELECT id, hash FROM documents WHERE url = ?');
    checkStmt.bind([url]);
    let existingId = null;
    let existingHash = null;
    if (checkStmt.step()) {
        const row = checkStmt.get();
        existingId = row[0];
        existingHash = row[1];
    }
    checkStmt.free();
    if (existingId && existingHash === hash) {
        // Unchanged, skip
        return { action: 'skipped', id: existingId };
    }
    if (existingId) {
        // Delete old chunks before re-chunking the new content
        database.run('DELETE FROM chunks WHERE document_id = ?', [existingId]);
        database.run('UPDATE documents SET title = ?, hash = ?, last_scraped = CURRENT_TIMESTAMP WHERE id = ?', [title, hash, existingId]);
    }
    else {
        database.run('INSERT INTO documents (url, title, hash, category) VALUES (?, ?, ?, ?)', [url, title, hash, category]);
        const res = database.exec('SELECT last_insert_rowid()');
        existingId = res[0].values[0][0];
    }
    // Split markdown into chunks (approx. 1000 chars)
    const chunks = splitIntoChunks(markdown, 1000);
    for (let i = 0; i < chunks.length; i++) {
        database.run('INSERT INTO chunks (document_id, chunk_index, content, content_lower) VALUES (?, ?, ?, ?)', [existingId, i, chunks[i], chunks[i].toLowerCase()]);
    }
    saveDatabase();
    return { action: 'saved', id: existingId };
}
// Greedy paragraph packing: paragraphs are appended to the current chunk until
// adding the next one would push it past maxLen.
function splitIntoChunks(text, maxLen) {
    const chunks = [];
    const paragraphs = text.split('\n\n');
    let currentChunk = '';
    for (const p of paragraphs) {
        if ((currentChunk.length + p.length) > maxLen && currentChunk.length > 0) {
            chunks.push(currentChunk.trim());
            currentChunk = '';
        }
        currentChunk += p + '\n\n';
    }
    if (currentChunk.trim().length > 0)
        chunks.push(currentChunk.trim());
    return chunks;
}
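// For example, with maxLen = 20:
//   splitIntoChunks('aaaa\n\nbbbb\n\ncccccccccccccccc', 20)
//   -> ['aaaa\n\nbbbb', 'cccccccccccccccc']
// A single paragraph longer than maxLen still becomes one oversized chunk.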
export async function searchDocuments(query, maxResults = 5) {
    const database = await getDatabase();
    const queryLower = query.toLowerCase();
    const searchTerms = queryLower.split(/\s+/).filter(w => w.length > 2);
    if (searchTerms.length === 0)
        return [];
    const likeConditions = searchTerms.map(t => 'c.content_lower LIKE ?').join(' OR ');
    const params = searchTerms.map(t => `%${t}%`);
    const sql = `
    SELECT
      d.id, d.url, d.title, d.category,
      c.content, c.content_lower
    FROM chunks c
    JOIN documents d ON c.document_id = d.id
    WHERE (${likeConditions})
    LIMIT 200
  `;
    const stmt = database.prepare(sql);
    stmt.bind(params);
    const rows = [];
    const columns = stmt.getColumnNames();
    while (stmt.step()) {
        const rowData = stmt.get();
        const row = {};
        columns.forEach((col, idx) => row[col] = rowData[idx]);
        rows.push(row);
    }
    stmt.free();
    // Score each chunk by the fraction of query terms it contains
    const scoredRows = rows.map(row => {
        let hits = 0;
        for (const term of searchTerms) {
            if (row.content_lower.includes(term))
                hits++;
        }
        const density = hits / searchTerms.length;
        return { ...row, score: density, hits };
    });
    scoredRows.sort((a, b) => b.score - a.score);
    // Deduplicate by URL so each document appears at most once
    const seenUrls = new Set();
    const finalResults = [];
    for (const row of scoredRows) {
        if (!seenUrls.has(row.url)) {
            seenUrls.add(row.url);
            finalResults.push({
                url: row.url,
                title: row.title,
                category: row.category,
                matchContent: row.content,
                score: row.score
            });
            if (finalResults.length >= maxResults)
                break;
        }
    }
    return finalResults;
}
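// Usage sketch (assumed flow, mirroring the calls made from index.js):
//   const { action } = await saveDocument(url, title, markdown, hash, 'general');
//   const hits = await searchDocuments('lwc lifecycle hooks', 3);
//   hits.forEach(h => console.log(h.score.toFixed(2), h.title, h.url));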
package/dist/index.js
ADDED
@@ -0,0 +1,149 @@
#!/usr/bin/env node
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
import { scrapePage, closeBrowser } from "./scraper.js";
import { saveDocument, searchDocuments } from "./db.js";
import { z } from "zod";
const server = new Server({ name: "unified-sf-docs-mcp", version: "1.0.0" }, { capabilities: { tools: {} } });
const ScrapePageSchema = z.object({
    url: z.string().url(),
    category: z.string().optional().default("general")
});
const MassExtractSchema = z.object({
    rootUrl: z.string().url(),
    maxPages: z.number().int().min(1).max(100).optional().default(20),
    category: z.string().optional().default("general")
});
const SearchDocsSchema = z.object({
    query: z.string().min(1).max(500),
    maxResults: z.number().int().min(1).max(20).optional().default(5)
});
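// Illustrative note: Zod parsing applies the declared defaults, e.g.
// SearchDocsSchema.parse({ query: "flows" }) yields { query: "flows", maxResults: 5 }.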
server.setRequestHandler(ListToolsRequestSchema, async () => {
    return {
        tools: [
            {
                name: "scrape_single_page",
                description: "Scrape a single Salesforce documentation page (handles both developer.salesforce and help.salesforce iframes/structures). Returns markdown.",
                inputSchema: {
                    type: "object",
                    properties: {
                        url: { type: "string" },
                        category: { type: "string" }
                    },
                    required: ["url"]
                }
            },
            {
                name: "mass_extract_guide",
                description: "Spiders a root Salesforce documentation page, extracts hierarchical links, and scrapes them in bulk. Stores contents in a local SQLite database for later searching.",
                inputSchema: {
                    type: "object",
                    properties: {
                        rootUrl: { type: "string", description: "The Table of Contents or landing page." },
                        maxPages: { type: "number", description: "Maximum number of pages to extract (default 20, max 100)." },
                        category: { type: "string" }
                    },
                    required: ["rootUrl"]
                }
            },
            {
                name: "search_local_docs",
                description: "Search locally extracted Salesforce documentation in the SQLite database.",
                inputSchema: {
                    type: "object",
                    properties: {
                        query: { type: "string" },
                        maxResults: { type: "number" }
                    },
                    required: ["query"]
                }
            }
        ]
    };
});
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const { name, arguments: args } = request.params;
    try {
        if (name === "scrape_single_page") {
            const { url, category } = ScrapePageSchema.parse(args);
            console.error(`Scraping ${url}...`);
            const result = await scrapePage(url);
            if (result.error) {
                return { content: [{ type: "text", text: `Failed to scrape: ${result.error}` }], isError: true };
            }
            // Automatically save to the local DB as well
            await saveDocument(url, result.title, result.markdown, result.hash, category);
            return {
                content: [{ type: "text", text: `# ${result.title}\n\n${result.markdown}` }]
            };
        }
        if (name === "mass_extract_guide") {
            const { rootUrl, maxPages, category } = MassExtractSchema.parse(args);
            console.error(`Starting mass extraction at ${rootUrl}`);
            // Scrape the root page to collect child links
            const rootResult = await scrapePage(rootUrl, new URL(rootUrl).origin);
            if (rootResult.error) {
                return { content: [{ type: "text", text: `Root scrape failed: ${rootResult.error}` }], isError: true };
            }
            await saveDocument(rootUrl, rootResult.title, rootResult.markdown, rootResult.hash, category);
            const queue = [...new Set(rootResult.childLinks)].filter(l => l !== rootUrl).slice(0, maxPages);
            let successCount = 1; // counts the root page itself
            let failureCount = 0;
            for (const link of queue) {
                console.error(`Scraping queued link: ${link}`);
                const pg = await scrapePage(link, new URL(rootUrl).origin);
                if (!pg.error) {
                    await saveDocument(pg.url, pg.title, pg.markdown, pg.hash, category);
                    successCount++;
                }
                else {
                    console.error(`Failed on ${link}: ${pg.error}`);
                    failureCount++;
                }
            }
            return {
                content: [{ type: "text", text: `Mass extraction complete.\nSuccessfully extracted and saved ${successCount} pages.\nFailed: ${failureCount} pages.\nDatabase updated.` }]
            };
        }
        if (name === "search_local_docs") {
            const { query, maxResults } = SearchDocsSchema.parse(args);
            const results = await searchDocuments(query, maxResults);
            if (results.length === 0) {
                return { content: [{ type: "text", text: "No results found in the local database." }] };
            }
            let output = `# Search Results for "${query}"\n\n`;
            for (const r of results) {
                output += `## [${r.title}](${r.url})\n*Category: ${r.category}* | *Score: ${(r.score * 100).toFixed(1)}%*\n\n`;
                output += `> ${r.matchContent.substring(0, 500)}...\n\n---\n`;
            }
            return { content: [{ type: "text", text: output }] };
        }
        return {
            content: [{ type: "text", text: `Unknown tool: ${name}` }],
            isError: true
        };
    }
    catch (e) {
        return {
            content: [{ type: "text", text: `Error: ${e.message}` }],
            isError: true
        };
    }
});
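// Note: diagnostics go to stderr (console.error) throughout this file because
// stdout is reserved for the MCP stdio protocol stream.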
// Clean up Puppeteer on exit
process.on('SIGINT', async () => {
    await closeBrowser();
    process.exit(0);
});
process.on('SIGTERM', async () => {
    await closeBrowser();
    process.exit(0);
});
async function main() {
    console.error("Starting Unified Salesforce Docs MCP Server...");
    const transport = new StdioServerTransport();
    await server.connect(transport);
    console.error("Server running on stdio transport.");
}
main().catch(console.error);
package/dist/scraper.js
ADDED
@@ -0,0 +1,275 @@
import puppeteer from 'puppeteer';
import TurndownService from 'turndown';
// @ts-ignore
import { gfm } from 'turndown-plugin-gfm';
import crypto from 'crypto';
// Configure Turndown for better markdown output
const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
});
turndownService.use(gfm);
turndownService.addRule('codeBlocks', {
    filter: ['pre'],
    replacement: function (_content, node) {
        const element = node;
        const code = element.querySelector('code');
        const language = code?.className?.match(/language-(\w+)/)?.[1] || '';
        const codeContent = code?.textContent || element.textContent || '';
        return `\n\`\`\`${language}\n${codeContent}\n\`\`\`\n`;
    },
});
let browser = null;
async function getBrowser() {
    if (!browser) {
        browser = await puppeteer.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
        });
    }
    return browser;
}
/**
 * Extracts content from a single URL, handling shadow DOMs, iframes, and various SFDC template structures.
 */
export async function scrapePage(url, baseDomain) {
    const browserInstance = await getBrowser();
    const page = await browserInstance.newPage();
    try {
        await page.setViewport({ width: 1280, height: 800 });
        // Use a normal-looking user agent so the docs sites serve the full page
        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Wait until the network is idle to handle SPA renders and iframe loads
        await page.goto(url, { waitUntil: 'networkidle0', timeout: 60000 });
        // Wait for specific Salesforce content locators to appear, to avoid grabbing 'Loading...' pages
        try {
            if (url.includes('help.salesforce.com')) {
                await page.waitForSelector('.slds-text-longform', { timeout: 15000 });
            }
            else if (url.includes('developer.salesforce.com')) {
                await page.waitForFunction(() => {
                    return document.querySelector('doc-content-layout') ||
                        document.querySelector('doc-xml-content') ||
                        document.querySelector('iframe');
                }, { timeout: 10000 });
            }
        }
        catch (e) {
            console.warn(`Timeout waiting for specific content selectors on ${url}`);
        }
        // Additional wait in case visual components are still sliding in
        await new Promise(r => setTimeout(r, 2000));
        // Opportunistic screenshot for debugging layout issues during development
        if (url.includes('help.salesforce.com')) {
            await page.screenshot({ path: 'help_debug.png' }).catch(() => { });
        }
        // In-page extraction script
        const extraction = await page.evaluate(() => {
            let title = 'Untitled';
            const childLinks = new Set();
            // Collect all same-site hierarchical links (to help spidering)
            document.querySelectorAll('a').forEach(a => {
                if (a.href && !a.href.startsWith('javascript') && !a.href.startsWith('mailto')) {
                    childLinks.add(a.href);
                }
            });
            // Helper function to extract readable HTML, piercing shadow DOMs (legacy sf-doc-scraper behavior)
            function extractReadableHTML(element) {
                if (!element)
                    return '';
                const tagName = element.tagName?.toLowerCase();
                if (tagName === 'doc-heading') {
                    const headingEl = element.shadowRoot?.querySelector('h2, h3, h4');
                    const headingContent = element.shadowRoot?.querySelector('doc-heading-content');
                    const titleSpan = headingContent?.shadowRoot?.querySelector('.title');
                    const headingText = titleSpan?.textContent?.trim() || element.getAttribute('header') || '';
                    const level = headingEl?.tagName?.toLowerCase() || 'h2';
                    return `<${level}>${headingText}</${level}>`;
                }
                if (tagName === 'doc-content-callout') {
                    const shadowDiv = element.shadowRoot?.querySelector('.dx-callout');
                    const isTip = shadowDiv?.classList?.contains('dx-callout-tip');
                    const isWarning = shadowDiv?.classList?.contains('dx-callout-warning');
                    let calloutType = 'Note';
                    if (isTip)
                        calloutType = 'Tip';
                    if (isWarning)
                        calloutType = 'Warning';
                    const slottedContent = element.innerHTML;
                    return `<blockquote><strong>${calloutType}:</strong> ${slottedContent}</blockquote>`;
                }
                if (tagName === 'dx-code-block') {
                    const language = element.getAttribute('language') || '';
                    const code = element.getAttribute('code-block') || element.textContent || '';
                    // Decode the HTML entities stored in the code-block attribute
                    const decodedCode = code
                        .replace(/&quot;/g, '"')
                        .replace(/&lt;/g, '<')
                        .replace(/&gt;/g, '>')
                        .replace(/&amp;/g, '&');
                    return `<pre><code class="language-${language}">${decodedCode}</code></pre>`;
                }
                if (tagName === 'div' && element.classList?.contains('custom-code-block')) {
                    const codeBlock = element.querySelector('dx-code-block');
                    if (codeBlock)
                        return extractReadableHTML(codeBlock);
                }
                return element.outerHTML;
            }
            // Helper function to find an element deep inside nested shadow DOMs
            function deepQuerySelector(root, selector) {
                const found = root.querySelector(selector);
                if (found)
                    return found;
                const allElements = root.querySelectorAll('*');
                for (const el of Array.from(allElements)) {
                    if (el.shadowRoot) {
                        const deepFound = deepQuerySelector(el.shadowRoot, selector);
                        if (deepFound)
                            return deepFound;
                    }
                }
                return null;
            }
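            // For example, deepQuerySelector(document, '.slds-text-longform') checks the
            // light DOM first, then recursively descends into every element's shadowRoot.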
            // 1. Try to extract from an iframe (older developer guides like life_sciences_dev_guide)
            const iframe = document.querySelector('iframe');
            if (iframe && iframe.contentDocument && iframe.contentDocument.body) {
                // Find the main content inside the iframe
                const docHtml = iframe.contentDocument.querySelector('#doc')?.innerHTML ||
                    iframe.contentDocument.querySelector('body')?.innerHTML || '';
                const docTitle = iframe.contentDocument.querySelector('title')?.innerText ||
                    iframe.contentDocument.querySelector('h1')?.innerText || 'Untitled';
                // Collect links inside the iframe
                iframe.contentDocument.querySelectorAll('a').forEach(a => {
                    if (a.href && !a.href.startsWith('javascript') && !a.href.startsWith('mailto')) {
                        childLinks.add(a.href);
                    }
                });
                if (docHtml.length > 500) {
                    return { html: docHtml, title: docTitle, childLinks: Array.from(childLinks) };
                }
            }
            // 2. Try the help.salesforce.com structure (often inside shadow DOMs)
            const sldsText = deepQuerySelector(document, '.slds-text-longform');
            if (sldsText) {
                const rawTitle = document.querySelector('title')?.innerText || 'Untitled';
                const cleanTitle = rawTitle.replace(' | Salesforce', '').trim();
                return { html: sldsText.innerHTML, title: cleanTitle, childLinks: Array.from(childLinks) };
            }
            // 2.5 Try doc-xml-content (legacy developer guides, like Health Cloud / Life Sciences)
            const docXmlContent = document.querySelector('doc-xml-content');
            if (docXmlContent?.shadowRoot) {
                const docContent = docXmlContent.shadowRoot.querySelector('doc-content');
                if (docContent?.shadowRoot) {
                    const innerHtml = docContent.shadowRoot.innerHTML;
                    // Extract the title from the first h1
                    const h1Match = innerHtml.match(/<h1[^>]*>(.*?)<\/h1>/);
                    if (h1Match)
                        title = h1Match[1].replace(/<[^>]*>?/gm, '');
                    // Find child links within the shadow DOM
                    const shadowLinks = docContent.shadowRoot.querySelectorAll('a');
                    shadowLinks.forEach(a => {
                        if (a.href && !a.href.startsWith('javascript') && !a.href.startsWith('mailto')) {
                            childLinks.add(a.href);
                        }
                    });
                    return { html: innerHtml, title, childLinks: Array.from(childLinks) };
                }
            }
            // 3. Try doc-content-layout / doc-amf-reference (new developer guides)
            const docRef = document.querySelector('doc-amf-reference');
            if (docRef) {
                const markdownContent = docRef.querySelector('.markdown-content');
                if (markdownContent) {
                    let refHtml = '';
                    for (const el of Array.from(markdownContent.children)) {
                        if (el.tagName?.toLowerCase() === 'h1')
                            title = el.textContent?.trim() || title;
                        refHtml += extractReadableHTML(el);
                    }
                    if (!title || title === 'Untitled')
                        title = document.querySelector('title')?.innerText || 'Untitled';
                    return { html: refHtml, title, childLinks: Array.from(childLinks) };
                }
            }
            const docLayout = document.querySelector('doc-content-layout');
            if (docLayout?.shadowRoot) {
                const slot = docLayout.shadowRoot.querySelector('.content-body slot');
                if (slot) {
                    const assignedElements = slot.assignedElements();
                    if (assignedElements.length > 0) {
                        let guideHtml = '';
                        for (const el of assignedElements) {
                            if (el.tagName?.toLowerCase() === 'h1')
                                title = el.textContent?.trim() || title;
                            guideHtml += extractReadableHTML(el);
                        }
                        if (!title || title === 'Untitled')
                            title = document.querySelector('title')?.innerText || 'Untitled';
                        return { html: guideHtml, title, childLinks: Array.from(childLinks) };
                    }
                }
            }
            // 4. Fallback: <article> or <main>
            const container = document.querySelector('article') || document.querySelector('main');
            if (container) {
                title = document.querySelector('h1')?.innerText || document.querySelector('title')?.innerText || 'Untitled';
                return { html: container.innerHTML, title, childLinks: Array.from(childLinks) };
            }
            // Complete fallback: the whole body
            return {
                html: document.body.innerHTML,
                title: document.querySelector('title')?.innerText || 'Untitled',
                childLinks: Array.from(childLinks)
            };
        });
        if (!extraction.html || extraction.html.trim() === '') {
            return {
                url,
                title: 'Untitled',
                markdown: '',
                hash: '',
                error: 'No content found on page',
                childLinks: []
            };
        }
        // Convert to markdown
        const markdown = turndownService.turndown(extraction.html);
        // Filter child links to stay within the domain/base if provided, to avoid runaway spidering
        let validLinks = extraction.childLinks;
        if (baseDomain) {
            validLinks = validLinks.filter(l => l.startsWith(baseDomain));
        }
        const hash = crypto.createHash('sha256').update(markdown).digest('hex');
        return {
            url,
            title: extraction.title,
            markdown,
            hash,
            childLinks: validLinks,
        };
    }
    catch (error) {
        return {
            url,
            title: 'Error',
            markdown: '',
            hash: '',
            error: error.message,
            childLinks: [],
        };
    }
    finally {
        await page.close();
    }
}
/**
 * Ensures the browser is closed when the application shuts down.
 */
export async function closeBrowser() {
    if (browser) {
        await browser.close();
        browser = null;
    }
}
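// Usage sketch (assumed, mirroring how index.js calls this module):
//   const result = await scrapePage('https://help.salesforce.com/s/articleView?id=...', 'https://help.salesforce.com');
//   if (!result.error) console.log(result.title, result.markdown.length, result.childLinks.length);
//   await closeBrowser();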
package/package.json
ADDED
@@ -0,0 +1,45 @@
{
  "name": "unified-sf-docs-mcp",
  "version": "1.0.0",
  "description": "Unified MCP server for scraping Salesforce developer documentation and searching a local SQLite keyword-search database.",
  "type": "module",
  "main": "dist/index.js",
  "bin": {
    "unified-sf-docs-mcp": "dist/index.js"
  },
  "files": [
    "dist"
  ],
  "scripts": {
    "build": "tsc",
    "start": "node dist/index.js",
    "dev": "tsx src/index.ts",
    "test": "tsx tests/test-all.js",
    "test:core": "tsx tests/test-core.js"
  },
  "keywords": [
    "salesforce",
    "mcp",
    "model-context-protocol"
  ],
  "author": "",
  "license": "MIT",
  "dependencies": {
    "@modelcontextprotocol/sdk": "^1.0.0",
    "puppeteer": "^24.1.0",
    "turndown": "^7.2.0",
    "turndown-plugin-gfm": "^1.0.2",
    "sql.js": "^1.10.0",
    "lru-cache": "^10.0.0",
    "zod": "^3.22.0"
  },
  "devDependencies": {
    "@types/node": "^20.0.0",
    "@types/turndown": "^5.0.4",
    "tsx": "^4.0.0",
    "typescript": "^5.3.0"
  },
  "engines": {
    "node": ">=18.0.0"
  }
}