@syndash/research-vault-mcp 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -0
- package/bin/research-vault-mcp.mjs +49 -0
- package/package.json +46 -0
- package/src/amplify.ts +245 -0
- package/src/ingest/arxiv.ts +64 -0
- package/src/ingest/html.ts +46 -0
- package/src/ingest/pdf.ts +30 -0
- package/src/server.ts +301 -0
- package/src/types.ts +77 -0
- package/src/vault.ts +310 -0
- package/src/vault_jobs.ts +88 -0
- package/src/vault_write.ts +347 -0
package/README.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# @syndash/research-vault-mcp
|
|
2
|
+
|
|
3
|
+
MCP (Model Context Protocol) server for [Nolan's research vault](https://github.com/Fearvox/dash-research-vault) — semantic search + memory persistence over 200+ markdown documents via local Gemma (Atomic Chat) or cloud LLM fallback.
|
|
4
|
+
|
|
5
|
+
**Part of**: DASH SHATTER / SynDASH ecosystem.
|
|
6
|
+
**Home**: [github.com/Fearvox/Evensong](https://github.com/Fearvox/Evensong) — `packages/research-vault-mcp/`
|
|
7
|
+
**Status**: Wave 3+ — not yet published to npm. Plan: `docs/superpowers/plans/2026-04-19-wave2d-submodule-mcp-package-prep.md`.
|
|
8
|
+
|
|
9
|
+
## Install & Run (future, post-publish)
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Via bun (recommended — native TS execution)
|
|
13
|
+
bunx @syndash/research-vault-mcp
|
|
14
|
+
|
|
15
|
+
# Via Node
|
|
16
|
+
npx @syndash/research-vault-mcp
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Configure Claude Code / Claude Desktop
|
|
20
|
+
|
|
21
|
+
Add to `~/.claude/settings.json` or Claude Desktop config:
|
|
22
|
+
|
|
23
|
+
```json
|
|
24
|
+
{
|
|
25
|
+
"mcpServers": {
|
|
26
|
+
"research-vault": {
|
|
27
|
+
"command": "bunx",
|
|
28
|
+
"args": ["@syndash/research-vault-mcp"]
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
For direct local dev from this monorepo:
|
|
35
|
+
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"mcpServers": {
|
|
39
|
+
"research-vault-dev": {
|
|
40
|
+
"command": "bun",
|
|
41
|
+
"args": ["run", "packages/research-vault-mcp/src/server.ts"]
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Tools Exposed (MCP contract)
|
|
48
|
+
|
|
49
|
+
See `src/vault.ts` and `src/amplify.ts` for current tool definitions:
|
|
50
|
+
|
|
51
|
+
- `vault_search` — hybrid search over analyzed knowledge base
|
|
52
|
+
- `vault_status` — decay scores + retention health
|
|
53
|
+
- `vault_taxonomy` — category tree + item counts
|
|
54
|
+
- `vault_batch_analyze` — raw queue status + preview
|
|
55
|
+
- `amplify_*` — remote RAG query layer (currently requires an Amplify API key — see `docs.evermind.ai`; Wave 3+ will add a local Gemma fallback path via the built-in retrieval chain of `@syndash/research-vault-mcp`)
|
|
56
|
+
|
|
57
|
+
## Architecture
|
|
58
|
+
|
|
59
|
+
Per parent spec [2026-04-19 vault foundation & preamble design](https://github.com/Fearvox/Evensong/blob/main/docs/superpowers/specs/2026-04-19-vault-foundation-and-preamble-design.md) §3.4, retrieval uses a **unified multi-signal ranker** (not 3 separate subsystems):
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
score(d, q, t) = 0.35·BM25(q,d) + 0.35·cosine(embed(q), embed(d))
|
|
63
|
+
+ 0.15·exp(-(t - lastAccess)/stability)
|
|
64
|
+
+ 0.10·log1p(accessCount)/log1p(MAX_ACCESS)
|
|
65
|
+
+ 0.05·summary_level_weight(d)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Primary LLM**: Atomic Chat local Gemma-4-E4B-Uncensored-Q4_K_M (`http://127.0.0.1:1337/v1`).
|
|
69
|
+
**Fallback chain**: xai-fast → minimax-m27 → openrouter/qwen3.6-plus → openrouter/llama-3.1-8b-free.
|
|
70
|
+
|
|
71
|
+
**Prior art**: EverMemOS (arXiv 2601.02163, EverMind/Shanda, 2026-01) — LLM-orchestrated hybrid retrieval. This package adopts its Stage-1 hybrid candidate generation but replaces the Stage-2 verifier loop with a direct listwise LLM judge (simpler and more deterministic).
|
|
72
|
+
|
|
73
|
+
## License
|
|
74
|
+
|
|
75
|
+
`UNLICENSED` for now (pending org-level license decision). See parent repo LICENSE.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI entry point for @syndash/research-vault-mcp.
|
|
4
|
+
* Invoked via `npx @syndash/research-vault-mcp` or `bunx @syndash/research-vault-mcp`.
|
|
5
|
+
* Delegates to src/server.ts (compiled or via bun direct).
|
|
6
|
+
*
|
|
7
|
+
* Part of DASH SHATTER (Fearvox/Evensong repo, SynDASH org).
|
|
8
|
+
* See packages/research-vault-mcp/README.md for MCP client config.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { fileURLToPath, pathToFileURL } from 'url'
import { dirname, join } from 'path'
import { existsSync } from 'fs'
|
|
14
|
+
|
|
15
|
+
const __filename = fileURLToPath(import.meta.url)
const __dirname = dirname(__filename)
const pkgRoot = join(__dirname, '..')

// Candidate server entry points, in order of preference: compiled JS (post-build),
// then the TypeScript source (which needs a runtime that can execute TS, e.g. bun).
const compiledServer = join(pkgRoot, 'dist', 'server.js')
const sourceServer = join(pkgRoot, 'src', 'server.ts')

/**
 * CLI entry point.
 *
 * Reads an optional `--transport <name>` / `--transport=<name>` flag (default `sse`),
 * publishes it to the server through `process.env.MCP_TRANSPORT`, then loads the first
 * server entry point that exists on disk. Exits with status 1 when neither exists.
 */
async function main() {
  const args = process.argv.slice(2)
  let transport = 'sse'

  for (let i = 0; i < args.length; i++) {
    if (args[i] === '--transport' && args[i + 1]) {
      transport = args[i + 1]
    } else if (args[i].startsWith('--transport=')) {
      transport = args[i].split('=')[1]
    }
  }
  process.env.MCP_TRANSPORT = transport

  const entry = [compiledServer, sourceServer].find(candidate => existsSync(candidate))
  if (!entry) {
    console.error('research-vault-mcp: neither dist/server.js nor src/server.ts found')
    process.exit(1)
  }

  // import() takes a URL-like specifier, not a filesystem path. A raw absolute path
  // such as `C:\...\server.js` is rejected by the ESM loader on Windows, so convert
  // the path to a file:// URL first.
  await import(pathToFileURL(entry).href)
}

main().catch(err => {
  console.error('research-vault-mcp fatal:', err)
  process.exit(1)
})
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@syndash/research-vault-mcp",
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "MCP server for Nolan's research vault — semantic search + memory persistence over 200+ markdown docs via local Gemma (Atomic Chat) or cloud LLM fallback. Part of DASH SHATTER / SynDASH.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"research-vault-mcp": "./bin/research-vault-mcp.mjs"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"dev": "bun run src/server.ts",
|
|
11
|
+
"build": "bun build src/server.ts --outdir=dist --target=bun",
|
|
12
|
+
"test": "bun test"
|
|
13
|
+
},
|
|
14
|
+
"repository": {
|
|
15
|
+
"type": "git",
|
|
16
|
+
"url": "https://github.com/Fearvox/Evensong.git",
|
|
17
|
+
"directory": "packages/research-vault-mcp"
|
|
18
|
+
},
|
|
19
|
+
"homepage": "https://github.com/Fearvox/Evensong/tree/main/packages/research-vault-mcp",
|
|
20
|
+
"bugs": {
|
|
21
|
+
"url": "https://github.com/Fearvox/Evensong/issues"
|
|
22
|
+
},
|
|
23
|
+
"license": "UNLICENSED",
|
|
24
|
+
"publishConfig": {
|
|
25
|
+
"access": "public"
|
|
26
|
+
},
|
|
27
|
+
"files": [
|
|
28
|
+
"src/**/*.ts",
|
|
29
|
+
"bin/**/*.mjs",
|
|
30
|
+
"README.md",
|
|
31
|
+
"package.json"
|
|
32
|
+
],
|
|
33
|
+
"keywords": [
|
|
34
|
+
"mcp",
|
|
35
|
+
"model-context-protocol",
|
|
36
|
+
"research-vault",
|
|
37
|
+
"claude-code",
|
|
38
|
+
"evermind",
|
|
39
|
+
"dash-shatter",
|
|
40
|
+
"syndash"
|
|
41
|
+
],
|
|
42
|
+
"dependencies": {
|
|
43
|
+
"@anthropic-ai/sdk": "^0.80.0",
|
|
44
|
+
"markitdown": "latest"
|
|
45
|
+
}
|
|
46
|
+
}
|
package/src/amplify.ts
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
// Amplify API MCP Tools
|
|
2
|
+
// Vanderbilt AI Amplify platform — chat, models, file management
|
|
3
|
+
|
|
4
|
+
/** Base URL that every Amplify endpoint path in this module is appended to. */
const AMPLIFY_BASE = 'https://prod-api.vanderbilt.ai'

/** Credentials used to authenticate against the Amplify API. */
export interface AmplifyConfig {
  apiKey: string
}

// Module-level configuration; stays null until configureAmplify() has been called.
let config: AmplifyConfig | null = null

/**
 * Store the API key used for all subsequent Amplify requests.
 *
 * @param apiKey Key sent as the bearer token.
 */
export function configureAmplify(apiKey: string) {
  config = { apiKey: apiKey }
}

/**
 * Build the HTTP headers for an authenticated JSON request.
 *
 * @throws Error when no (non-empty) API key has been configured yet.
 */
function getHeaders() {
  const key = config?.apiKey
  if (key) {
    return {
      'Authorization': `Bearer ${key}`,
      'Content-Type': 'application/json'
    }
  }
  throw new Error('Amplify API key not configured. Call configureAmplify() first.')
}
|
|
23
|
+
|
|
24
|
+
/**
 * Descriptor of a model offered by Amplify.
 * NOTE(review): presumably mirrors entries of the `/available_models` response — confirm
 * against the API, nothing in this module reads these fields.
 */
export interface ModelInfo {
  id: string;
  name: string;
  provider: string;
  inputContextWindow: number;
  outputTokenLimit: number;
  supportsImages: boolean;
  supportsSystemPrompts: boolean;
  systemPrompt?: string;
}

/** A single chat turn: who is speaking and what was said. */
export interface ChatMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

/**
 * Optional tuning knobs for a chat request.
 * NOTE(review): not referenced by the tool definitions in this module — confirm intended use.
 */
export interface ChatOptions {
  temperature?: number;
  maxTokens?: number;
  dataSources?: string[];
  modelId?: string;
  ragOnly?: boolean;
  skipRag?: boolean;
}
|
|
48
|
+
|
|
49
|
+
/** Result shape returned by every tool's `call`. */
type AmplifyToolResult = { content: { type: string; text: string }[]; isError?: boolean }

/** Wrap plain text in a successful tool result. */
function amplifyTextResult(text: string): AmplifyToolResult {
  return { content: [{ type: 'text', text }] }
}

/** Convert any thrown value into an error tool result (non-Error values are stringified). */
function amplifyErrorResult(e: unknown): AmplifyToolResult {
  const message = e instanceof Error ? e.message : String(e)
  return { content: [{ type: 'text', text: `Error: ${message}` }], isError: true }
}

/** Call an Amplify JSON endpoint and return its pretty-printed body, or an error result. */
async function amplifyJsonResult(
  path: string,
  init: { method?: string; body?: string } = {}
): Promise<AmplifyToolResult> {
  try {
    const res = await fetch(`${AMPLIFY_BASE}${path}`, { ...init, headers: getHeaders() })
    if (!res.ok) throw new Error(`HTTP ${res.status}`)
    return amplifyTextResult(JSON.stringify(await res.json(), null, 2))
  } catch (e: unknown) {
    return amplifyErrorResult(e)
  }
}

/**
 * Read an SSE body and invoke `onData` with the parsed JSON payload of each `data: ` line.
 * Text is buffered across reads so a line split over two network chunks is parsed once,
 * when complete. Lines whose payload is not valid JSON are skipped.
 */
async function forEachAmplifySseData(
  body: ReadableStream<Uint8Array>,
  onData: (payload: unknown) => void
): Promise<void> {
  const reader = body.getReader()
  const decoder = new TextDecoder()
  let pending = ''

  const handleLine = (line: string): void => {
    if (!line.startsWith('data: ')) return
    let payload: unknown
    try {
      payload = JSON.parse(line.slice(6))
    } catch {
      return // not JSON (keep-alives, sentinels): ignore
    }
    onData(payload)
  }

  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      pending += decoder.decode(value, { stream: true })
      const lines = pending.split('\n')
      // The last element is an incomplete line (or ''); keep it for the next read.
      pending = lines.pop() ?? ''
      lines.forEach(handleLine)
    }
    pending += decoder.decode()
    if (pending) handleLine(pending)
  } finally {
    // Release the connection if we stopped early; harmless once the stream has ended.
    await reader.cancel().catch(() => undefined)
  }
}

/**
 * MCP tool definitions for the Amplify API. Each entry exposes `name`, `description`,
 * `inputSchema` and an async `call` that never throws: failures are reported as a
 * result with `isError: true`.
 */
export const amplifyTools = [
  {
    name: 'amplify_list_models',
    description: 'List available models on Vanderbilt Amplify. Returns model IDs, context windows, providers, and pricing tiers.',
    inputSchema: { type: 'object', properties: {} },
    call: async (): Promise<AmplifyToolResult> => amplifyJsonResult('/available_models')
  },

  {
    name: 'amplify_chat',
    description: 'Send a streaming chat message to Amplify. Returns Claude/GPT/Mistral responses via SSE.',
    inputSchema: {
      type: 'object',
      required: ['message'],
      properties: {
        message: { type: 'string', description: 'User message' },
        modelId: { type: 'string', description: 'Model ID (from amplify_list_models)' },
        systemPrompt: { type: 'string', description: 'Optional system prompt override' },
        temperature: { type: 'number', description: 'Temperature (0-2, default 0.7)' },
        maxTokens: { type: 'number', description: 'Max output tokens (default 4000)' },
        stream: { type: 'boolean', description: 'If true, yield chunks via onProgress callback instead of waiting for complete response (default false)' }
      }
    },
    /**
     * Send one chat request and consume its SSE response.
     *
     * With `stream` set and an `onProgress` callback supplied, each content chunk is
     * forwarded as `{ type: 'chunk', text }` while it arrives and the result text is
     * `(streamed)`. Otherwise the chunks are concatenated and returned as the result
     * text (`(no response)` when nothing was received). Exactly one request is sent
     * in either mode.
     */
    call: async ({ message, modelId, systemPrompt, temperature = 0.7, maxTokens = 4000, stream = false }: {
      message: string, modelId?: string, systemPrompt?: string, temperature?: number, maxTokens?: number, stream?: boolean
    }, onProgress?: (data: { type: string; text?: string }) => void): Promise<AmplifyToolResult> => {
      try {
        const messages: { role: string; content: string }[] = [{ role: 'user', content: message }]
        if (systemPrompt) {
          messages.unshift({ role: 'system', content: systemPrompt })
        }
        const body = {
          data: {
            model: modelId || 'gpt-4o',
            temperature,
            max_tokens: maxTokens,
            messages
          }
        }

        const res = await fetch(`${AMPLIFY_BASE}/chat`, {
          method: 'POST',
          headers: getHeaders(),
          body: JSON.stringify(body)
        })
        if (!res.ok) {
          const err = await res.text()
          throw new Error(`HTTP ${res.status}: ${err}`)
        }
        if (!res.body) throw new Error('No response body')

        // Progress is only reported when the caller asked for streaming AND gave a callback.
        const progress = stream ? onProgress : undefined
        let fullText = ''

        await forEachAmplifySseData(res.body, payload => {
          const data = (payload as { data?: unknown } | null)?.data
          if (!data) return
          const content = (data as { content?: unknown }).content
          if (progress) {
            if (content) progress({ type: 'chunk', text: content as string })
            return
          }
          if (content) fullText += String(content)
          // Payloads without `content` are kept verbatim so nothing is silently lost.
          else fullText += typeof data === 'string' ? data : JSON.stringify(data)
        })

        return amplifyTextResult(progress ? '(streamed)' : fullText || '(no response)')
      } catch (e: unknown) {
        return amplifyErrorResult(e)
      }
    }
  },

  {
    name: 'amplify_files_query',
    description: 'Query uploaded files on Amplify using semantic search. Returns relevant file chunks.',
    inputSchema: {
      type: 'object',
      required: ['query'],
      properties: {
        query: { type: 'string', description: 'Search query' },
        limit: { type: 'number', description: 'Max results (default 5)' }
      }
    },
    call: async ({ query, limit = 5 }: { query: string, limit?: number }): Promise<AmplifyToolResult> =>
      amplifyJsonResult('/files/query', { method: 'POST', body: JSON.stringify({ query, limit }) })
  },

  {
    name: 'amplify_files_list',
    description: 'List tags/categories of uploaded files on Amplify.',
    inputSchema: { type: 'object', properties: {} },
    call: async (): Promise<AmplifyToolResult> => amplifyJsonResult('/files/tags/list')
  },

  {
    name: 'amplify_assistants_list',
    description: 'List your Amplify assistants.',
    inputSchema: { type: 'object', properties: {} },
    call: async (): Promise<AmplifyToolResult> => amplifyJsonResult('/assistant/list')
  }
]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import type { ArxivMetadata } from '../types.js'
|
|
2
|
+
|
|
3
|
+
const ARXIV_API = 'https://export.arxiv.org/api/query'
|
|
4
|
+
|
|
5
|
+
/**
 * Parse an ArXiv ID (new-style `YYMM.NNNNN`, optional `vN` suffix) from common notations.
 * Handles:
 *   https://arxiv.org/abs/2501.00001
 *   http://arxiv.org/abs/2501.00001v2
 *   https://arxiv.org/pdf/2501.00001v2.pdf
 *   abs/2501.00001
 *   arXiv:2501.00001
 *   2501.00001v2
 *
 * @param value URL, shorthand, or bare identifier; surrounding whitespace is ignored.
 * @returns The identifier (including any version suffix), or null when none is found.
 */
export function parseArxivId(value: string): string | null {
  const input = value.trim()

  // Bare (optionally versioned) ID: 2501.00001v2
  if (/^\d{4}\.\d{4,}(v\d+)?$/.test(input)) {
    return input
  }

  // Citation-style prefix: arXiv:2501.00001
  const prefixed = input.match(/^arxiv:\s*(\d{4}\.\d{4,}(?:v\d+)?)$/i)
  if (prefixed) return prefixed[1]

  // abs/ or pdf/ URL, or the abs/ shorthand
  const m = input.match(/(?:arxiv\.org\/(?:abs|pdf)\/|abs\/?)(\d{4}\.\d{4,}(?:v\d+)?)/i)
  return m ? m[1] : null
}
|
|
22
|
+
|
|
23
|
+
/**
 * Fetch the ArXiv API record for one identifier and parse it into metadata.
 *
 * @param id ArXiv identifier, e.g. `2501.00001v2`.
 * @throws Error `ArXiv API error: <status>` when the API answers with a non-2xx status.
 */
export async function fetchArxivMetadata(id: string): Promise<ArxivMetadata> {
  // Set the parameter through URLSearchParams so characters such as `&` or `=` in `id`
  // are percent-encoded instead of being interpreted as additional query parameters.
  const url = new URL(ARXIV_API)
  url.searchParams.set('id_list', id)

  const res = await fetch(url.href)
  if (!res.ok) throw new Error(`ArXiv API error: ${res.status}`)
  const xml = await res.text()
  return parseArxivXml(xml)
}
|
|
30
|
+
|
|
31
|
+
/** Decode the five predefined XML entities and numeric character references in one pass. */
function decodeArxivXmlEntities(text: string): string {
  const named = new Map([['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"]])
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole: string, ref: string) => {
    if (ref[0] !== '#') return named.get(ref) ?? whole
    const isHex = ref[1] === 'x' || ref[1] === 'X'
    const code = parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10)
    return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole
  })
}

/**
 * Extract title, authors, abstract and categories from an ArXiv API Atom response.
 *
 * Fields are read from the first `<entry>` element. The Atom feed carries its own
 * `<title>` (the query description) ahead of the entries, so matching against the
 * whole document would return that instead of the paper title. When the input has
 * no `<entry>` element, the whole string is searched.
 *
 * Missing fields are returned as null; `arxivId` is always null and is expected to be
 * filled in by the caller.
 */
function parseArxivXml(xml: string): ArxivMetadata {
  const entryMatch = xml.match(/<entry[\s>][\s\S]*?<\/entry>/i)
  const scope = entryMatch ? entryMatch[0] : xml

  // Collapse the feed's line wrapping, then decode entities such as `&amp;`.
  const clean = (raw: string): string => decodeArxivXmlEntities(raw.replace(/\s+/g, ' ').trim())

  const titleMatch = scope.match(/<title[^>]*>([\s\S]*?)<\/title>/i)
  const summaryMatch = scope.match(/<summary[^>]*>([\s\S]*?)<\/summary>/i)

  const authors = Array.from(
    scope.matchAll(/<author>[\s\S]*?<name>([\s\S]*?)<\/name>[\s\S]*?<\/author>/gi),
    match => clean(match[1])
  )
  const categories = Array.from(
    scope.matchAll(/<category[^>]*term="([^"]+)"/gi),
    match => match[1]
  )

  return {
    title: titleMatch ? clean(titleMatch[1]) : null,
    authors: authors.length ? authors : null,
    abstract: summaryMatch ? clean(summaryMatch[1]) : null,
    arxivId: null, // set by caller
    categories: categories.length ? categories : null
  }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// packages/research-vault-mcp/src/ingest/html.ts
|
|
2
|
+
|
|
3
|
+
/**
 * Fetch a URL and convert HTML to plain markdown-like text.
 * Strips scripts, styles, nav, footer, header, aside elements and comments, turns
 * block-level boundaries and `<br>` into newlines, removes the remaining tags, decodes
 * character entities, and collapses runs of three or more newlines.
 * Uses Bun's native fetch — no external dependencies.
 *
 * @param url Page to download.
 * @returns The extracted text, trimmed.
 * @throws Error `HTTP <status> fetching <url>` on a non-2xx response.
 */
export async function fetchHtml(url: string): Promise<string> {
  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 research-vault-mcp/1.1.0',
      'Accept': 'text/html'
    }
  })
  if (!res.ok) throw new Error(`HTTP ${res.status} fetching ${url}`)
  const html = await res.text()

  let text = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<nav[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer[\s\S]*?<\/footer>/gi, '')
    .replace(/<header[\s\S]*?<\/header>/gi, '')
    .replace(/<aside[\s\S]*?<\/aside>/gi, '')
    .replace(/<!--[\s\S]*?-->/g, '')

  // Line breaks and block element ends → newlines. <br> is a void element, so it has
  // to be matched in its opening / self-closing form, not only as `</br>`.
  text = text
    .replace(/<\/?br\s*\/?>/gi, '\n')
    .replace(/<\/(p|div|h[1-6]|li|tr)>/gi, '\n')

  // Remove all remaining tags
  text = text.replace(/<[^>]+>/g, '')

  // Decode entities only after tag removal so a decoded `<` is never mistaken for markup.
  text = decodeHtmlEntities(text)

  // Collapse whitespace
  text = text.replace(/\n{3,}/g, '\n\n').trim()

  return text
}

/**
 * Decode common named entities (nbsp, amp, lt, gt, quot, apos) and numeric character
 * references. Works in a single pass, so already-escaped text is not decoded twice;
 * unknown entities are left untouched.
 */
function decodeHtmlEntities(text: string): string {
  const named = new Map([
    ['nbsp', ' '], ['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"]
  ])
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (whole: string, ref: string) => {
    if (ref[0] !== '#') return named.get(ref.toLowerCase()) ?? whole
    const isHex = ref[1] === 'x' || ref[1] === 'X'
    const code = parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10)
    return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole
  })
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// packages/research-vault-mcp/src/ingest/pdf.ts
|
|
2
|
+
|
|
3
|
+
/**
 * Convert PDF to markdown using markitdown (preferred) or pandoc.
 * Uses Bun.spawn for process execution — no child_process module needed.
 *
 * @param pdfPath Path of the PDF file handed to the converter.
 * @returns The converter's stdout, or null if neither tool is available, exits with
 *          status 0, and prints non-blank output.
 */
export async function convertPdfToMarkdown(pdfPath: string): Promise<string | null> {
  // Try markitdown first
  const viaMarkitdown = await runPdfConverter(['markitdown', pdfPath])
  if (viaMarkitdown !== null) return viaMarkitdown

  // Fallback: pandoc
  // NOTE(review): pandoc's documentation does not list PDF among its input formats,
  // so this fallback may never succeed — confirm before relying on it.
  return runPdfConverter(['pandoc', '--to', 'markdown', pdfPath])
}

/**
 * Run one converter command with a 60 s timeout and return its stdout, or null when
 * it cannot be started, exits non-zero, or prints only whitespace.
 */
async function runPdfConverter(cmd: string[]): Promise<string | null> {
  try {
    const proc = Bun.spawn(cmd, { timeout: 60_000, stdout: 'pipe' })
    // `proc.exited` resolves to the numeric exit code (it is not an array). Drain
    // stdout while waiting so a large document cannot fill the pipe and stall the child.
    const [output, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      proc.exited
    ])
    return exitCode === 0 && output.trim() ? output : null
  } catch {
    // Best effort: a missing binary makes spawn throw; treat that as "tool unavailable".
    return null
  }
}
|