@meyverick/omnicode 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -35
- package/package.json +6 -3
- package/src/bin/omnicode-runtime.js +66 -20
- package/src/bin/omnicode.js +59 -8
- package/src/installer/AGENTS.template.md +13 -0
- package/src/installer/lib.js +917 -22
- package/src/installer/mineru-client.js +164 -0
- package/src/installer/tree-sitter.js +270 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MinerU API Client Integration
|
|
3
|
+
*
|
|
4
|
+
* Provides concurrent, non-blocking interaction with the MinerU API
|
|
5
|
+
* for complex document structural extraction.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Root URL shared by every MinerU endpoint used in this module.
const API_BASE = 'https://mineru.net/api/v4';
// Default number of attempts fetchWithRetry makes before giving up.
const MAX_RETRIES = 3;

/**
 * Resolve (with no value) once `ms` milliseconds have elapsed.
 */
const sleep = (ms) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
|
|
15
|
+
|
|
16
|
+
/**
 * Fetch wrapper with a per-attempt timeout and retry logic for transient failures.
 *
 * Retried: network errors, timeouts (each attempt is aborted after 30s),
 * HTTP 5xx, HTTP 408/429, and any other non-OK status outside the 4xx range.
 * Not retried: every other 4xx status (including 401/402 for an invalid key or
 * exhausted quota). Those are thrown immediately with `error.status` set,
 * because repeating the same request cannot succeed.
 *
 * @param {string} url - Absolute URL to request.
 * @param {object} options - `fetch` init options; any `signal` is replaced by the timeout signal.
 * @param {number} [retries=MAX_RETRIES] - Maximum number of attempts; at least one is always made.
 * @returns {Promise<Response>} The first response whose `ok` flag is true.
 * @throws {Error} Immediately for non-retryable 4xx responses, or after the final
 *   attempt with the last failure attached as `cause`.
 */
async function fetchWithRetry(url, options, retries = MAX_RETRIES) {
  // Guarantee one attempt so the function never silently resolves to undefined.
  const maxAttempts = Math.max(1, retries);

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout

      let response;
      try {
        response = await fetch(url, {
          ...options,
          signal: controller.signal
        });
      } finally {
        // Clear on failure too, otherwise the timer keeps the event loop alive.
        clearTimeout(timeoutId);
      }

      if (response.ok) {
        return response;
      }

      const { status } = response;
      let message;
      if (status === 401 || status === 402) {
        message = `MinerU API rejected request: HTTP ${status}`;
      } else if (status >= 500 && status < 600) {
        message = `MinerU API returned 5xx error: HTTP ${status}`;
      } else {
        message = `MinerU API error: HTTP ${status}`;
      }

      const httpError = new Error(message);
      httpError.status = status;
      // 408 (timeout) and 429 (rate limit) are the only 4xx codes worth repeating.
      httpError.retryable = !(status >= 400 && status < 500 && status !== 408 && status !== 429);
      throw httpError;
    } catch (error) {
      if (error.retryable === false) {
        throw error;
      }

      if (attempt === maxAttempts) {
        throw new Error(
          `MinerU API request failed after ${maxAttempts} attempts: ${error.message}`,
          { cause: error }
        );
      }

      // Linear backoff: 2s after the first failure, 4s after the second, ...
      await sleep(attempt * 2000);
    }
  }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Upload a document to the MinerU extraction endpoint and return the new task's ID.
 *
 * The file is sent as multipart form data together with `language=en` and
 * `is_ocr=true`.
 *
 * @param {Buffer} fileBuffer - The binary content of the file.
 * @param {string} fileName - File name attached to the upload.
 * @param {string} apiKey - The MINERU_API_KEY, sent as a Bearer token.
 * @returns {Promise<string>} The MinerU task ID.
 * @throws {Error} When the response body carries no `taskId`.
 */
export async function submitExtractionTask(fileBuffer, fileName, apiKey) {
  const payload = new FormData();
  // FormData needs a Blob, not a raw buffer.
  payload.append('file', new Blob([fileBuffer]), fileName);
  payload.append('language', 'en');
  payload.append('is_ocr', 'true');

  // No Content-Type header: fetch derives the multipart boundary from the FormData body.
  const requestInit = {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`
    },
    body: payload
  };

  const reply = await fetchWithRetry(`${API_BASE}/extract/task`, requestInit);
  const parsed = await reply.json();

  if (parsed && parsed.taskId) {
    return parsed.taskId;
  }
  throw new Error('Invalid response from MinerU API: missing taskId');
}
|
|
98
|
+
|
|
99
|
+
/**
 * Poll a MinerU task every 5 seconds (for up to 5 minutes) and, once it
 * finishes, download and return its extraction result.
 *
 * A status of `completed` or `done` triggers the download; `failed` or
 * `error` aborts with the API's `errorMessage`. The downloaded body is
 * returned as-is unless it parses as JSON, in which case its `markdown`
 * (or else `content`) field is preferred.
 *
 * @param {string} taskId - The task ID returned from submission.
 * @param {string} apiKey - The MINERU_API_KEY, sent as a Bearer token.
 * @returns {Promise<string>} The extracted markdown content.
 * @throws {Error} On task failure, a finished task without `downloadUrl`, or timeout.
 */
export async function pollAndDownloadExtraction(taskId, apiKey) {
  const POLL_INTERVAL_MS = 5000; // 5 seconds
  const MAX_WAIT_MS = 300000; // 5 minutes max wait
  const FINISHED_STATES = ['completed', 'done'];
  const FAILED_STATES = ['failed', 'error'];

  const statusUrl = `${API_BASE}/extract/task/${taskId}`;
  const startedAt = Date.now();

  while (Date.now() - startedAt < MAX_WAIT_MS) {
    const statusReply = await fetchWithRetry(statusUrl, {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${apiKey}`
      }
    });
    const task = await statusReply.json();

    if (FINISHED_STATES.includes(task.status)) {
      if (!task.downloadUrl) {
        throw new Error('MinerU API reported completion but no downloadUrl was provided.');
      }

      // The download URL is requested without the Authorization header.
      const resultReply = await fetchWithRetry(task.downloadUrl, {
        method: 'GET'
      });
      const rawBody = await resultReply.text();

      // Either a JSON envelope holding the markdown, or the markdown itself.
      try {
        const envelope = JSON.parse(rawBody);
        return envelope.markdown || envelope.content || rawBody;
      } catch (e) {
        return rawBody;
      }
    }

    if (FAILED_STATES.includes(task.status)) {
      throw new Error(`MinerU API extraction failed: ${task.errorMessage || 'Unknown error'}`);
    }

    // Still running: wait before asking again.
    await sleep(POLL_INTERVAL_MS);
  }

  throw new Error('MinerU API polling timed out');
}
|
|
152
|
+
|
|
153
|
+
/**
 * Run the full MinerU pipeline for one document: submit it, then wait for
 * and download the extraction result.
 *
 * @param {Buffer} fileBuffer - The binary content of the file.
 * @param {string} fileName - File name attached to the upload.
 * @param {string} apiKey - The MINERU_API_KEY.
 * @returns {Promise<string>} The markdown result.
 */
export async function processComplexDocument(fileBuffer, fileName, apiKey) {
  const submittedTaskId = await submitExtractionTask(fileBuffer, fileName, apiKey);
  return await pollAndDownloadExtraction(submittedTaskId, apiKey);
}
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import fsPromises from "fs/promises";
|
|
3
|
+
import path from "path";
|
|
4
|
+
import os from "os";
|
|
5
|
+
import Parser from "web-tree-sitter";
|
|
6
|
+
|
|
7
|
+
// Downloaded grammars are cached per user under ~/.config/omnicode/grammars
const CONFIG_DIR = path.join(os.homedir(), ".config", "omnicode");
const GRAMMARS_CACHE_DIR = path.join(CONFIG_DIR, "grammars");

// Create the cache directory (and any missing parents) when the module is first loaded
if (fs.existsSync(GRAMMARS_CACHE_DIR) === false) {
  fs.mkdirSync(GRAMMARS_CACHE_DIR, { recursive: true });
}
|
|
15
|
+
|
|
16
|
+
// File extension (without the dot) -> grammar name used in the `tree-sitter-<name>.wasm` file name.
const EXTENSION_MAP = Object.fromEntries([
  ["js", "javascript"],
  ["ts", "typescript"],
  ["py", "python"],
  ["go", "go"],
  ["rs", "rust"],
  ["cpp", "cpp"],
  ["c", "c"],
  ["cs", "c_sharp"],
  ["java", "java"],
  ["rb", "ruby"],
  ["php", "php"]
]);
|
|
29
|
+
|
|
30
|
+
// Set once Parser.init() has completed; it has to run before any parser or language is used
let parserInitialized = false;
// Grammar name -> loaded language object, so each grammar is loaded once per process
const LOADED_LANGUAGES_CACHE = new Map();
// Idle Parser instances kept for reuse between chunking calls
const parserPool = [];

/**
 * Run `Parser.init()` unless an earlier call has already completed it.
 */
async function ensureParserInitialized() {
  if (parserInitialized) {
    return;
  }
  await Parser.init();
  parserInitialized = true;
}
|
|
40
|
+
|
|
41
|
+
// Grammar name -> in-flight load promise, so concurrent callers share one
// download instead of racing each other on the same cache file.
const PENDING_LANGUAGE_LOADS = new Map();

/**
 * Load a grammar from the on-disk cache, downloading it from the CDN first when
 * it is missing or unreadable. Never rejects: every failure is logged and
 * reported as `null` so callers can fall back to linear chunking.
 */
async function loadLanguageFromCacheOrCdn(grammarName) {
  const localWasmPath = path.join(GRAMMARS_CACHE_DIR, `tree-sitter-${grammarName}.wasm`);

  // 1. Check local cache
  if (fs.existsSync(localWasmPath)) {
    try {
      const lang = await Parser.Language.load(localWasmPath);
      LOADED_LANGUAGES_CACHE.set(grammarName, lang);
      return lang;
    } catch (err) {
      console.warn(`[omnicode] Error loading cached parser ${grammarName}: ${err.message}. Removing cache and retrying.`);
      try {
        // force: a file that already vanished is not an error
        fs.rmSync(localWasmPath, { force: true });
      } catch (removeErr) {
        // Keep going: the download below overwrites the file anyway.
        console.warn(`[omnicode] Could not remove cached parser ${localWasmPath}: ${removeErr.message}`);
      }
    }
  }

  // 2. Download from CDN
  try {
    // Pinned to tree-sitter-wasms@0.1.11.
    // NOTE(review): confirm this build matches the ABI of the installed web-tree-sitter.
    const cdnUrl = `https://unpkg.com/tree-sitter-wasms@0.1.11/out/tree-sitter-${grammarName}.wasm`;
    console.log(`[omnicode] Downloading Tree-sitter parser for ${grammarName}...`);

    const response = await fetch(cdnUrl);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const buffer = Buffer.from(await response.arrayBuffer());
    await fsPromises.writeFile(localWasmPath, buffer);

    const lang = await Parser.Language.load(localWasmPath);
    LOADED_LANGUAGES_CACHE.set(grammarName, lang);
    return lang;
  } catch (error) {
    console.warn(`[omnicode] Failed to download parser for ${grammarName}: ${error.message}. Falling back to linear chunking.`);
    return null;
  }
}

/**
 * Return the Tree-sitter language for a file extension, using (in order) the
 * in-memory cache, the on-disk grammar cache, and finally a CDN download.
 *
 * The extension is matched case-insensitively and only against EXTENSION_MAP's
 * own keys. Failed loads are not cached, so a later call tries again.
 *
 * @param {string} extension The file extension, with or without leading dot (e.g. 'js', '.go')
 * @returns {Promise<Parser.Language|null>} The language, or null when the
 *   extension is unsupported or the grammar could not be loaded.
 */
export async function getOrDownloadLanguage(extension) {
  await ensureParserInitialized();

  // Clean extension (remove leading dot, ignore case)
  const ext = (extension.startsWith(".") ? extension.substring(1) : extension).toLowerCase();

  // Own-property check: names like "constructor" must not resolve via the prototype.
  const grammarName = Object.hasOwn(EXTENSION_MAP, ext) ? EXTENSION_MAP[ext] : undefined;
  if (!grammarName) {
    return null; // Unsupported language
  }

  // Check memory cache first
  if (LOADED_LANGUAGES_CACHE.has(grammarName)) {
    return LOADED_LANGUAGES_CACHE.get(grammarName);
  }

  let pendingLoad = PENDING_LANGUAGE_LOADS.get(grammarName);
  if (!pendingLoad) {
    pendingLoad = loadLanguageFromCacheOrCdn(grammarName).finally(() => {
      PENDING_LANGUAGE_LOADS.delete(grammarName);
    });
    PENDING_LANGUAGE_LOADS.set(grammarName, pendingLoad);
  }
  return pendingLoad;
}
|
|
99
|
+
|
|
100
|
+
/**
 * Manually (re-)download a grammar via CLI.
 *
 * Accepts either a grammar name ("javascript") or a file extension ("js").
 * Both the in-memory and the on-disk cache entries are dropped first, so the
 * grammar is always fetched again rather than served from a stale cache.
 *
 * @param {string} language Grammar name or file extension.
 * @returns {Promise<boolean>} true when the grammar was downloaded and loaded;
 *   false when the language is unsupported, the old cache file could not be
 *   removed, or the download failed.
 */
export async function downloadLanguageCmd(language) {
  // Allow user to pass "js" or "javascript"
  let grammarName;
  if (Object.values(EXTENSION_MAP).includes(language)) {
    grammarName = language;
  } else if (Object.hasOwn(EXTENSION_MAP, language)) {
    // Own-property check: "constructor" etc. must not resolve via the prototype.
    grammarName = EXTENSION_MAP[language];
  }

  if (!grammarName) {
    console.error(`[omnicode] Error: Unsupported language '${language}'. Supported: ${Object.keys(EXTENSION_MAP).join(", ")}`);
    return false;
  }

  // Map the grammar back to an extension so the shared download logic can be reused
  const mockExtension = Object.keys(EXTENSION_MAP).find((key) => EXTENSION_MAP[key] === grammarName);
  const localWasmPath = path.join(GRAMMARS_CACHE_DIR, `tree-sitter-${grammarName}.wasm`);

  // Force re-download: forget the loaded language and delete the cached file.
  LOADED_LANGUAGES_CACHE.delete(grammarName);
  try {
    fs.rmSync(localWasmPath, { force: true });
  } catch (err) {
    console.error(`[omnicode] Error: Could not remove cached parser ${localWasmPath}: ${err.message}`);
    return false;
  }

  const lang = await getOrDownloadLanguage(mockExtension);
  if (lang) {
    console.log(`[omnicode] Successfully downloaded and cached parser for ${grammarName} at ${localWasmPath}`);
    return true;
  }
  return false;
}
|
|
129
|
+
|
|
130
|
+
// Upper bound, in characters, for the text of a single chunk.
const MAX_CHUNK_SIZE = 4000;

/**
 * Return the lines `startLine`..`endLine` (0-indexed, inclusive, matching
 * tree-sitter rows) of `sourceCode`, re-joined and trimmed.
 */
function getLines(sourceCode, startLine, endLine) {
  const allLines = sourceCode.split("\n");
  const lastLineExclusive = endLine + 1;
  const selected = allLines.slice(startLine, lastLineExclusive);
  return selected.join("\n").trim();
}
|
|
137
|
+
|
|
138
|
+
// Child node types that a too-large node is split by (common block definitions across languages).
const STRUCTURAL_CHILD_TYPES = new Set([
  "method_definition",
  "function_declaration",
  "class_declaration",
  "declaration",
  "function_definition",
  "statement_block",
  "block"
]);

/**
 * Turn one AST node into chunks, recursively splitting nodes larger than MAX_CHUNK_SIZE.
 *
 * - A node that fits becomes a single chunk and its rows are marked covered.
 * - A larger node is split by its direct structural children. Its own rows are
 *   NOT marked covered, so text outside those children (e.g. a class
 *   signature) is left for the caller's orphaned-lines pass.
 * - A larger node without structural children is sliced linearly. Slices never
 *   cut a surrogate pair in half, and each slice reports its own line range.
 *
 * @param {object} node Tree-sitter node (reads type, startIndex/endIndex,
 *   startPosition/endPosition rows, childCount and child(i)).
 * @param {string} sourceCode Full source text the node indices refer to.
 * @param {Array<{text: string, type: string, startLine: number, endLine: number}>} chunks
 *   Output array; chunks are appended in place.
 * @param {boolean[]} coveredLines Per-row flags, set to true for rows emitted as chunks.
 */
function extractChunksFromNode(node, sourceCode, chunks, coveredLines) {
  const nodeText = sourceCode.substring(node.startIndex, node.endIndex);
  const firstRow = node.startPosition.row;
  const lastRow = node.endPosition.row;

  const markNodeRowsCovered = () => {
    for (let row = firstRow; row <= lastRow; row++) {
      coveredLines[row] = true;
    }
  };

  if (nodeText.length <= MAX_CHUNK_SIZE) {
    chunks.push({
      text: nodeText,
      type: node.type,
      startLine: firstRow,
      endLine: lastRow
    });
    markNodeRowsCovered();
    return;
  }

  // Node too large, attempt to split by structural children
  let hasStructuralChildren = false;
  for (let i = 0; i < node.childCount; i++) {
    const child = node.child(i);
    if (STRUCTURAL_CHILD_TYPES.has(child.type)) {
      hasStructuralChildren = true;
      extractChunksFromNode(child, sourceCode, chunks, coveredLines);
    }
  }

  if (hasStructuralChildren) {
    // The caller's orphaned-lines logic picks up the unextracted boilerplate.
    return;
  }

  // No structural children to split by, slice linearly
  let index = 0;
  let sliceStartRow = firstRow;
  while (index < nodeText.length) {
    let end = Math.min(index + MAX_CHUNK_SIZE, nodeText.length);

    // Back off one code unit rather than separating a high surrogate from its
    // low surrogate (the `end - 1 > index` guard keeps the loop advancing).
    if (end < nodeText.length && end - 1 > index) {
      const lastUnit = nodeText.charCodeAt(end - 1);
      const nextUnit = nodeText.charCodeAt(end);
      const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
      const nextIsLowSurrogate = nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
      if (endsOnHighSurrogate && nextIsLowSurrogate) {
        end -= 1;
      }
    }

    const sliceText = nodeText.substring(index, end);
    const newlineCount = sliceText.split("\n").length - 1;
    chunks.push({
      text: sliceText,
      type: `${node.type}_slice`,
      startLine: sliceStartRow,
      endLine: sliceStartRow + newlineCount
    });

    sliceStartRow += newlineCount;
    index = end;
  }
  markNodeRowsCovered();
}
|
|
192
|
+
|
|
193
|
+
// Node types that are handed to extractChunksFromNode as a unit.
const TOP_LEVEL_CHUNK_TYPES = new Set([
  "function_declaration",
  "class_declaration",
  "method_definition",
  "function_definition",
  "type_declaration",
  "interface_declaration"
]);

/**
 * Chunks a file structurally using Tree-sitter.
 *
 * Definitions found in the AST are emitted first (in traversal order). Every
 * run of lines not covered by one of them is then emitted as a
 * "Global/Module Scope Fragment" chunk prefixed with the file path.
 *
 * @param {string} content Full source text of the file.
 * @param {string} filePath Path of the file; its extension selects the grammar.
 * @returns {Promise<string[]|null>} Array of text chunks, or null if the file
 *   has no extension, the language is unsupported, or parsing fails (callers
 *   then fall back to the linear chunker).
 */
export async function chunkWithTreeSitter(content, filePath) {
  const ext = path.extname(filePath);
  if (!ext) return null;

  const language = await getOrDownloadLanguage(ext);
  if (!language) return null; // Fall back to linear chunker

  await ensureParserInitialized();
  const parser = parserPool.pop() || new Parser();
  let tree = null;

  try {
    // Inside the try so an incompatible grammar falls back instead of rejecting,
    // and the parser still goes back to the pool.
    parser.setLanguage(language);
    tree = parser.parse(content);

    const chunks = [];

    // Track covered lines for orphaned lines capture
    const sourceLines = content.split("\n");
    const coveredLines = new Array(sourceLines.length).fill(false);

    const traverse = (node) => {
      if (TOP_LEVEL_CHUNK_TYPES.has(node.type)) {
        // Don't traverse inside, extractChunksFromNode handles it if needed
        extractChunksFromNode(node, content, chunks, coveredLines);
        return;
      }
      for (let i = 0; i < node.childCount; i++) {
        traverse(node.child(i));
      }
    };
    traverse(tree.rootNode);

    // Collect orphaned lines: consecutive uncovered lines form one fragment
    let currentOrphanBlock = [];
    const flushOrphanBlock = () => {
      if (currentOrphanBlock.length === 0) return;
      const orphanText = currentOrphanBlock.join("\n").trim();
      if (orphanText.length > 0) {
        chunks.push({
          text: `// Global/Module Scope Fragment from: ${filePath}\n${orphanText}`,
          type: "module_scope"
        });
      }
      currentOrphanBlock = [];
    };

    for (let i = 0; i < sourceLines.length; i++) {
      if (coveredLines[i]) {
        flushOrphanBlock();
      } else {
        currentOrphanBlock.push(sourceLines[i]);
      }
    }
    flushOrphanBlock();

    // Filter and return plain text array matching `chunkFile` signature
    return chunks.map((c) => c.text).filter((t) => t.length > 0);
  } catch (err) {
    console.warn(`[omnicode] Tree-sitter parsing failed for ${filePath}: ${err.message}. Falling back to linear chunker.`);
    return null;
  } finally {
    // Trees live in WASM memory and are not garbage-collected; free them explicitly.
    tree?.delete();
    parserPool.push(parser);
  }
}
|