@meyverick/omnicode 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ /**
2
+ * MinerU API Client Integration
3
+ *
4
+ * Provides concurrent, non-blocking interaction with the MinerU API
5
+ * for complex document structural extraction.
6
+ */
7
+
/** Root URL of the MinerU v4 REST API. */
const API_BASE = 'https://mineru.net/api/v4';

/** Default number of attempts used by the retrying fetch wrapper. */
const MAX_RETRIES = 3;

/**
 * Pause helper: returns a promise that resolves (with no value) once
 * `ms` milliseconds have elapsed.
 */
const sleep = (ms) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
15
+
/**
 * Fetch wrapper with a per-attempt timeout and retry logic.
 *
 * Retries are performed only for failures that can plausibly succeed later:
 * network errors, timeouts (aborted requests), HTTP 5xx, and HTTP 408/429.
 * Every other non-2xx status (including 401/402 for an invalid key or an
 * exhausted quota) is surfaced immediately, because repeating the same
 * request cannot change the outcome.
 *
 * @param {string} url - Absolute URL to request.
 * @param {object} [options] - Options forwarded to `fetch`; `signal` is overridden.
 * @param {number} [retries=MAX_RETRIES] - Maximum number of attempts (>= 1).
 * @returns {Promise<Response>} The first successful (2xx) response.
 * @throws {Error} With a numeric `status` property for non-retryable HTTP
 *   errors, or a wrapping error (original failure in `cause`) once all
 *   attempts are exhausted.
 */
async function fetchWithRetry(url, options, retries = MAX_RETRIES) {
  const requestTimeoutMs = 30000;
  const backoffBaseMs = 2000;

  for (let attempt = 1; attempt <= retries; attempt++) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), requestTimeoutMs);

    let failure;
    let retryable = true; // network errors and timeouts are always retryable

    try {
      const response = await fetch(url, {
        ...options,
        signal: controller.signal
      });

      if (response.ok) {
        return response;
      }

      const { status } = response;
      const isServerError = status >= 500 && status < 600;

      if (status === 401 || status === 402) {
        // Invalid key or quota exhaustion
        failure = new Error(`MinerU API rejected request: HTTP ${status}`);
      } else if (isServerError) {
        failure = new Error(`MinerU API returned 5xx error: HTTP ${status}`);
      } else {
        failure = new Error(`MinerU API error: HTTP ${status}`);
      }
      failure.status = status;
      retryable = isServerError || status === 408 || status === 429;
    } catch (error) {
      failure = error;
    } finally {
      // Always release the timer, otherwise a rejected fetch would leave a
      // live 30s timeout behind that keeps the process from exiting.
      clearTimeout(timeoutId);
    }

    if (!retryable) {
      throw failure;
    }

    if (attempt === retries) {
      throw new Error(
        `MinerU API request failed after ${retries} attempts: ${failure.message}`,
        { cause: failure }
      );
    }

    // Exponential backoff: 2s, 4s, 8s, ...
    await sleep(backoffBaseMs * 2 ** (attempt - 1));
  }

  // Only reachable when the loop never ran (retries < 1 or not a number).
  throw new Error('MinerU API request was not attempted: retries must be at least 1');
}
65
+
/**
 * Upload a document to the MinerU extraction endpoint and obtain its task ID.
 *
 * The file is sent as multipart form data together with the fixed fields
 * `language=en` and `is_ocr=true`.
 *
 * @param {Buffer} fileBuffer - The binary content of the file.
 * @param {string} fileName - File name attached to the upload part.
 * @param {string} apiKey - The MINERU_API_KEY, sent as a Bearer token.
 * @returns {Promise<string>} The `taskId` field of the JSON response.
 * @throws {Error} When the response JSON is empty or has no `taskId`.
 */
export async function submitExtractionTask(fileBuffer, fileName, apiKey) {
  const payload = new FormData();
  // FormData needs a Blob, not a raw Buffer
  payload.append('file', new Blob([fileBuffer]), fileName);
  payload.append('language', 'en');
  payload.append('is_ocr', 'true');

  // No explicit Content-Type: fetch derives the multipart boundary from the FormData body.
  const requestInit = {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`
    },
    body: payload
  };

  const response = await fetchWithRetry(`${API_BASE}/extract/task`, requestInit);
  const result = await response.json();
  const taskId = result ? result.taskId : undefined;

  if (!taskId) {
    throw new Error('Invalid response from MinerU API: missing taskId');
  }

  return taskId;
}
98
+
/**
 * Poll the MinerU API for task completion and download the extraction result.
 *
 * The task status endpoint is queried every 5 seconds for at most 5 minutes.
 * Once the status is `completed` or `done`, the payload's `downloadUrl` is
 * fetched. The downloaded body may be plain markdown or a JSON document that
 * carries the markdown in a string `markdown` or `content` field; anything
 * else is returned verbatim as text.
 *
 * @param {string} taskId - The task ID returned from submission.
 * @param {string} apiKey - The MINERU_API_KEY.
 * @returns {Promise<string>} The extracted markdown content.
 * @throws {Error} If the status payload is malformed, the task reports
 *   `failed`/`error`, completion arrives without a `downloadUrl`, or the
 *   polling window elapses.
 */
export async function pollAndDownloadExtraction(taskId, apiKey) {
  const pollInterval = 5000; // 5 seconds
  const maxWaitTime = 300000; // 5 minutes max wait
  const startTime = Date.now();

  // The task ID is used as a URL path segment, so encode it to keep reserved
  // characters ("/", "?", "#", spaces) from altering the request target.
  const statusUrl = `${API_BASE}/extract/task/${encodeURIComponent(taskId)}`;

  while (Date.now() - startTime < maxWaitTime) {
    const response = await fetchWithRetry(statusUrl, {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${apiKey}`
      }
    });

    const data = await response.json();
    if (data === null || typeof data !== 'object') {
      throw new Error('Invalid response from MinerU API: malformed task status payload');
    }

    // Assuming status 'done' or 'completed' — TODO confirm against the API docs
    if (data.status === 'completed' || data.status === 'done') {
      if (!data.downloadUrl) {
        throw new Error('MinerU API reported completion but no downloadUrl was provided.');
      }

      const downloadResponse = await fetchWithRetry(data.downloadUrl, {
        method: 'GET'
      });
      const textResponse = await downloadResponse.text();

      let jsonPayload;
      try {
        jsonPayload = JSON.parse(textResponse);
      } catch {
        return textResponse; // Was just plain text/markdown
      }

      // Only accept string fields so the documented Promise<string> contract
      // holds even when the JSON carries structured data under these keys.
      const markdown = [jsonPayload?.markdown, jsonPayload?.content].find(
        (value) => typeof value === 'string' && value.length > 0
      );
      return markdown ?? textResponse;
    }

    if (data.status === 'failed' || data.status === 'error') {
      throw new Error(`MinerU API extraction failed: ${data.errorMessage || 'Unknown error'}`);
    }

    // Wait before polling again
    await sleep(pollInterval);
  }

  throw new Error('MinerU API polling timed out');
}
152
+
/**
 * End-to-end convenience wrapper: submits the document for extraction and
 * then waits for, and downloads, the resulting markdown.
 *
 * @param {Buffer} fileBuffer - Binary content of the document.
 * @param {string} fileName - Name of the document file.
 * @param {string} apiKey - The MINERU_API_KEY.
 * @returns {Promise<string>} The markdown result.
 */
export async function processComplexDocument(fileBuffer, fileName, apiKey) {
  const submittedTaskId = await submitExtractionTask(fileBuffer, fileName, apiKey);
  return pollAndDownloadExtraction(submittedTaskId, apiKey);
}
@@ -0,0 +1,270 @@
1
+ import fs from "fs";
2
+ import fsPromises from "fs/promises";
3
+ import path from "path";
4
+ import os from "os";
5
+ import Parser from "web-tree-sitter";
6
+
// Per-user locations: <home>/.config/omnicode holds the configuration and
// its "grammars" subdirectory caches downloaded grammar files.
const CONFIG_DIR = path.join(os.homedir(), ".config", "omnicode");
const GRAMMARS_CACHE_DIR = path.join(CONFIG_DIR, "grammars");

// Create the cache directory (and any missing parents) on first load
fs.existsSync(GRAMMARS_CACHE_DIR) || fs.mkdirSync(GRAMMARS_CACHE_DIR, { recursive: true });
15
+
// File extension (without the dot) -> grammar name used in the
// `tree-sitter-<grammar>.wasm` file names.
const EXTENSION_MAP = Object.fromEntries([
  ["js", "javascript"],
  ["ts", "typescript"],
  ["py", "python"],
  ["go", "go"],
  ["rs", "rust"],
  ["cpp", "cpp"],
  ["c", "c"],
  ["cs", "c_sharp"],
  ["java", "java"],
  ["rb", "ruby"],
  ["php", "php"]
]);
29
+
// Module-level state shared by the functions below:
//  - parserInitialized: set once Parser.init() has completed
//  - LOADED_LANGUAGES_CACHE: grammar name -> loaded language object
//  - parserPool: idle Parser instances available for reuse
let parserInitialized = false;
const LOADED_LANGUAGES_CACHE = new Map();
const parserPool = [];

/**
 * Runs `Parser.init()` the first time it is called; later calls made after
 * initialization has completed return without doing anything.
 */
async function ensureParserInitialized() {
  if (parserInitialized) {
    return;
  }
  await Parser.init();
  parserInitialized = true;
}
40
+
/**
 * Resolves the tree-sitter language for a file extension, loading it from
 * (in order) the in-memory cache, the on-disk grammar cache, or the CDN.
 *
 * Extensions are matched case-insensitively and only against the map's own
 * keys, so names inherited from Object.prototype (e.g. "constructor") are
 * treated as unsupported instead of producing a bogus grammar name.
 *
 * @param {string} extension The file extension, with or without the leading
 *   dot (e.g. 'js', '.go')
 * @returns {Promise<Parser.Language|null>} The loaded language, or null when
 *   the extension is unsupported or the grammar could not be downloaded/loaded.
 */
export async function getOrDownloadLanguage(extension) {
  await ensureParserInitialized();

  // Clean extension (remove leading dot, normalize case)
  const rawExt = extension.startsWith(".") ? extension.substring(1) : extension;
  const ext = rawExt.toLowerCase();
  const grammarName = Object.hasOwn(EXTENSION_MAP, ext) ? EXTENSION_MAP[ext] : undefined;

  if (!grammarName) {
    return null; // Unsupported language
  }

  // Check memory cache first
  if (LOADED_LANGUAGES_CACHE.has(grammarName)) {
    return LOADED_LANGUAGES_CACHE.get(grammarName);
  }

  const localWasmPath = path.join(GRAMMARS_CACHE_DIR, `tree-sitter-${grammarName}.wasm`);

  // 1. Check local cache
  if (fs.existsSync(localWasmPath)) {
    try {
      const lang = await Parser.Language.load(localWasmPath);
      LOADED_LANGUAGES_CACHE.set(grammarName, lang);
      return lang;
    } catch (err) {
      console.warn(`[omnicode] Error loading cached parser ${grammarName}: ${err.message}. Removing cache and retrying.`);
      try {
        fs.unlinkSync(localWasmPath);
      } catch (unlinkErr) {
        // Best effort: the download below overwrites the file anyway, and a
        // failed cleanup must not turn the documented fallback into a throw.
        console.warn(`[omnicode] Could not remove cached parser ${grammarName}: ${unlinkErr.message}`);
      }
    }
  }

  // 2. Download from CDN
  try {
    // Grammar binaries are pinned to tree-sitter-wasms@0.1.11.
    // NOTE(review): confirm this release is ABI-compatible with the installed
    // web-tree-sitter version, and bump the two together.
    const cdnUrl = `https://unpkg.com/tree-sitter-wasms@0.1.11/out/tree-sitter-${grammarName}.wasm`;
    console.log(`[omnicode] Downloading Tree-sitter parser for ${grammarName}...`);

    const response = await fetch(cdnUrl);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);

    await fsPromises.writeFile(localWasmPath, buffer);
    const lang = await Parser.Language.load(localWasmPath);
    LOADED_LANGUAGES_CACHE.set(grammarName, lang);
    return lang;
  } catch (error) {
    console.warn(`[omnicode] Failed to download parser for ${grammarName}: ${error.message}. Falling back to linear chunking.`);
    return null;
  }
}
99
+
/**
 * Manually (re-)download a grammar via CLI.
 *
 * Accepts either a grammar name ("javascript") or a file extension ("js").
 * Any previously cached copy is discarded first — both the `.wasm` file on
 * disk and the entry in the in-memory language cache — so the grammar is
 * fetched again rather than served from a stale cache.
 *
 * @param {string} language Grammar name or file extension.
 * @returns {Promise<boolean>} true when the grammar was obtained, false when
 *   the language is unsupported or the download failed.
 */
export async function downloadLanguageCmd(language) {
  // Allow user to pass "js" or "javascript". Own-key check only, so inherited
  // names such as "constructor" are rejected as unsupported.
  let grammarName;
  if (Object.values(EXTENSION_MAP).includes(language)) {
    grammarName = language;
  } else if (Object.hasOwn(EXTENSION_MAP, language)) {
    grammarName = EXTENSION_MAP[language];
  }

  if (!grammarName) {
    console.error(`[omnicode] Error: Unsupported language '${language}'. Supported: ${Object.keys(EXTENSION_MAP).join(", ")}`);
    return false;
  }

  // Force extension representation to reuse download logic
  const mockExtension = Object.keys(EXTENSION_MAP).find(key => EXTENSION_MAP[key] === grammarName);

  // Force re-download: without clearing the memory cache the lookup below
  // would return the already-loaded language and never hit the network,
  // while the file on disk had just been deleted.
  LOADED_LANGUAGES_CACHE.delete(grammarName);
  const localWasmPath = path.join(GRAMMARS_CACHE_DIR, `tree-sitter-${grammarName}.wasm`);
  if (fs.existsSync(localWasmPath)) {
    fs.unlinkSync(localWasmPath);
  }

  const lang = await getOrDownloadLanguage(mockExtension);
  if (lang) {
    console.log(`[omnicode] Successfully downloaded and cached parser for ${grammarName} at ${localWasmPath}`);
    return true;
  }
  return false;
}
129
+
// Upper bound, in characters, for the text of a single chunk.
const MAX_CHUNK_SIZE = 4000;

/**
 * Returns the text of lines `startLine` through `endLine` (both inclusive,
 * 0-indexed like tree-sitter rows), joined with "\n" and trimmed.
 */
function getLines(sourceCode, startLine, endLine) {
  const selected = sourceCode.split("\n").slice(startLine, endLine + 1);
  return selected.join("\n").trim();
}
137
+
/**
 * Turns one AST node into chunks, appending them to `chunks` and flagging
 * the rows they span in `coveredLines`.
 *
 * - A node whose text fits within MAX_CHUNK_SIZE becomes a single chunk.
 * - A larger node is split by recursing into those direct children whose
 *   type is one of the known block-like types; in that case the parent's own
 *   rows are NOT flagged, so text outside the extracted children (such as a
 *   class signature) is left for the caller's orphaned-lines pass.
 * - A larger node without any such child is cut into consecutive
 *   MAX_CHUNK_SIZE-character slices typed `<node type>_slice`.
 */
function extractChunksFromNode(node, sourceCode, chunks, coveredLines) {
  const firstRow = node.startPosition.row;
  const lastRow = node.endPosition.row;
  const nodeText = sourceCode.substring(node.startIndex, node.endIndex);

  const markRowsCovered = () => {
    for (let row = firstRow; row <= lastRow; row++) {
      coveredLines[row] = true;
    }
  };

  // Small enough: emit the node as-is
  if (nodeText.length <= MAX_CHUNK_SIZE) {
    chunks.push({
      text: nodeText,
      type: node.type,
      startLine: firstRow,
      endLine: lastRow
    });
    markRowsCovered();
    return;
  }

  // Too large: descend into block-like children (type names shared by several grammars)
  const splittableTypes = [
    "method_definition",
    "function_declaration",
    "class_declaration",
    "declaration",
    "function_definition",
    "statement_block",
    "block"
  ];
  let descended = false;
  for (let i = 0; i < node.childCount; i++) {
    const child = node.child(i);
    if (!splittableTypes.includes(child.type)) {
      continue;
    }
    descended = true;
    extractChunksFromNode(child, sourceCode, chunks, coveredLines);
  }

  if (descended) {
    // Leave the parent's remaining rows uncovered on purpose (see doc comment)
    return;
  }

  // Nothing structural to split on: fall back to fixed-size slices
  for (let offset = 0; offset < nodeText.length; offset += MAX_CHUNK_SIZE) {
    chunks.push({
      text: nodeText.substring(offset, offset + MAX_CHUNK_SIZE),
      type: `${node.type}_slice`,
      startLine: firstRow,
      endLine: lastRow
    });
  }
  markRowsCovered();
}
192
+
/**
 * Chunks a file structurally using Tree-sitter.
 *
 * Function/class/method/type/interface nodes are handed to
 * extractChunksFromNode; every run of lines not covered by those chunks is
 * emitted afterwards as a "module scope" fragment prefixed with a header
 * comment naming the file. Structural chunks come first in the result,
 * followed by the module-scope fragments.
 *
 * @param {string} content Source text of the file.
 * @param {string} filePath Path of the file; its extension selects the grammar.
 * @returns {Promise<string[]|null>} Array of text chunks, or null if the file
 *   has no extension, the language is unsupported, or parsing fails (callers
 *   then fall back to the linear chunker).
 */
export async function chunkWithTreeSitter(content, filePath) {
  const ext = path.extname(filePath);
  if (!ext) return null;

  const language = await getOrDownloadLanguage(ext);
  if (!language) return null; // Fall back to linear chunker

  await ensureParserInitialized();
  const parser = parserPool.pop() || new Parser();
  let tree = null;

  try {
    // Inside the try so an incompatible grammar falls back to the linear
    // chunker and the parser still goes back to the pool.
    parser.setLanguage(language);
    tree = parser.parse(content);
    const chunks = [];

    // Track covered lines for orphaned lines capture
    const sourceLines = content.split("\n");
    const coveredLines = new Array(sourceLines.length).fill(false);

    const chunkRootTypes = new Set([
      "function_declaration",
      "class_declaration",
      "method_definition",
      "function_definition",
      "type_declaration",
      "interface_declaration"
    ]);

    const traverse = (node) => {
      if (chunkRootTypes.has(node.type)) {
        extractChunksFromNode(node, content, chunks, coveredLines);
        // Don't traverse inside, extractChunksFromNode handles it if needed
        return;
      }
      for (let i = 0; i < node.childCount; i++) {
        traverse(node.child(i));
      }
    };

    traverse(tree.rootNode);

    // Collect orphaned lines: consecutive uncovered lines form one fragment
    let currentOrphanBlock = [];
    const flushOrphanBlock = () => {
      if (currentOrphanBlock.length === 0) return;
      const orphanText = currentOrphanBlock.join("\n").trim();
      if (orphanText.length > 0) {
        chunks.push({
          text: `// Global/Module Scope Fragment from: ${filePath}\n${orphanText}`,
          type: "module_scope"
        });
      }
      currentOrphanBlock = [];
    };

    for (let i = 0; i < sourceLines.length; i++) {
      if (!coveredLines[i]) {
        currentOrphanBlock.push(sourceLines[i]);
      } else {
        flushOrphanBlock();
      }
    }
    flushOrphanBlock();

    // Filter and return plain text array matching `chunkFile` signature
    return chunks.map(c => c.text).filter(t => t.length > 0);
  } catch (err) {
    console.warn(`[omnicode] Tree-sitter parsing failed for ${filePath}: ${err.message}. Falling back to linear chunker.`);
    return null;
  } finally {
    // web-tree-sitter trees live in WASM memory that the JS garbage collector
    // does not reclaim; release it explicitly once the chunk strings exist.
    if (tree) {
      tree.delete();
    }
    parserPool.push(parser);
  }
}