byterover-cli 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -5
- package/dist/core/domain/cipher/errors/file-system-error.d.ts +11 -0
- package/dist/core/domain/cipher/errors/file-system-error.js +17 -0
- package/dist/core/domain/cipher/file-system/types.d.ts +40 -6
- package/dist/core/domain/entities/agent.d.ts +1 -1
- package/dist/core/domain/entities/agent.js +5 -0
- package/dist/core/interfaces/cipher/cipher-services.d.ts +0 -3
- package/dist/core/interfaces/cipher/index.d.ts +0 -2
- package/dist/infra/cipher/file-system/binary-utils.d.ts +15 -2
- package/dist/infra/cipher/file-system/binary-utils.js +26 -3
- package/dist/infra/cipher/file-system/file-system-service.d.ts +9 -0
- package/dist/infra/cipher/file-system/file-system-service.js +91 -8
- package/dist/infra/cipher/file-system/pdf-extractor.d.ts +100 -0
- package/dist/infra/cipher/file-system/pdf-extractor.js +226 -0
- package/dist/infra/cipher/tools/implementations/read-file-tool.js +24 -4
- package/dist/infra/connectors/rules/rules-connector-config.d.ts +4 -0
- package/dist/infra/connectors/rules/rules-connector-config.js +4 -0
- package/dist/infra/mcp/tools/brv-curate-tool.d.ts +10 -4
- package/dist/infra/mcp/tools/brv-curate-tool.js +9 -4
- package/dist/infra/repl/commands/space/switch-command.js +0 -2
- package/dist/infra/usecase/curate-use-case.js +10 -4
- package/dist/infra/usecase/space-switch-use-case.d.ts +0 -10
- package/dist/infra/usecase/space-switch-use-case.js +7 -37
- package/dist/oclif/hooks/init/welcome.js +4 -17
- package/dist/resources/prompts/curate.yml +1 -0
- package/dist/resources/tools/read_file.txt +5 -2
- package/dist/utils/file-validator.js +8 -4
- package/oclif.manifest.json +1 -54
- package/package.json +4 -2
- package/dist/core/interfaces/cipher/i-coding-agent-log-parser.d.ts +0 -20
- package/dist/core/interfaces/cipher/i-coding-agent-log-parser.js +0 -1
- package/dist/core/interfaces/cipher/i-coding-agent-log-watcher.d.ts +0 -31
- package/dist/core/interfaces/cipher/i-coding-agent-log-watcher.js +0 -1
- package/dist/core/interfaces/i-file-watcher-service.d.ts +0 -41
- package/dist/core/interfaces/i-file-watcher-service.js +0 -1
- package/dist/core/interfaces/parser/i-clean-parser-service.d.ts +0 -18
- package/dist/core/interfaces/parser/i-clean-parser-service.js +0 -1
- package/dist/core/interfaces/parser/i-raw-parser-service.d.ts +0 -17
- package/dist/core/interfaces/parser/i-raw-parser-service.js +0 -1
- package/dist/core/interfaces/parser/i-session-normalizer.d.ts +0 -56
- package/dist/core/interfaces/parser/i-session-normalizer.js +0 -1
- package/dist/infra/cipher/parsers/coding-agent-log-parser.d.ts +0 -24
- package/dist/infra/cipher/parsers/coding-agent-log-parser.js +0 -51
- package/dist/infra/cipher/watcher/coding-agent-log-watcher.d.ts +0 -14
- package/dist/infra/cipher/watcher/coding-agent-log-watcher.js +0 -55
- package/dist/infra/parsers/clean/clean-claude-service.d.ts +0 -111
- package/dist/infra/parsers/clean/clean-claude-service.js +0 -271
- package/dist/infra/parsers/clean/clean-codex-service.d.ts +0 -231
- package/dist/infra/parsers/clean/clean-codex-service.js +0 -534
- package/dist/infra/parsers/clean/clean-copilot-service.d.ts +0 -255
- package/dist/infra/parsers/clean/clean-copilot-service.js +0 -729
- package/dist/infra/parsers/clean/clean-cursor-service.d.ts +0 -161
- package/dist/infra/parsers/clean/clean-cursor-service.js +0 -432
- package/dist/infra/parsers/clean/clean-parser-service-factory.d.ts +0 -54
- package/dist/infra/parsers/clean/clean-parser-service-factory.js +0 -80
- package/dist/infra/parsers/clean/shared.d.ts +0 -84
- package/dist/infra/parsers/clean/shared.js +0 -273
- package/dist/infra/parsers/raw/raw-claude-service.d.ts +0 -195
- package/dist/infra/parsers/raw/raw-claude-service.js +0 -548
- package/dist/infra/parsers/raw/raw-codex-service.d.ts +0 -313
- package/dist/infra/parsers/raw/raw-codex-service.js +0 -782
- package/dist/infra/parsers/raw/raw-copilot-service.d.ts +0 -196
- package/dist/infra/parsers/raw/raw-copilot-service.js +0 -558
- package/dist/infra/parsers/raw/raw-cursor-service.d.ts +0 -316
- package/dist/infra/parsers/raw/raw-cursor-service.js +0 -818
- package/dist/infra/parsers/raw/raw-parser-service-factory.d.ts +0 -54
- package/dist/infra/parsers/raw/raw-parser-service-factory.js +0 -81
- package/dist/infra/watcher/file-watcher-service.d.ts +0 -10
- package/dist/infra/watcher/file-watcher-service.js +0 -81
- package/dist/oclif/commands/watch.d.ts +0 -25
- package/dist/oclif/commands/watch.js +0 -175
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# ByteRover CLI
|
|
2
2
|
|
|
3
|
-
Command-line interface for ByteRover, featuring an interactive REPL with a modern React/Ink terminal UI for managing your project's context tree and knowledge storage. Seamlessly integrate with
|
|
3
|
+
Command-line interface for ByteRover, featuring an interactive REPL with a modern React/Ink terminal UI for managing your project's context tree and knowledge storage. Seamlessly integrate with 19 AI coding agents via modern skill files, MCP tools, or rules-based integration—supports Claude Code, Cursor, Windsurf, GitHub Copilot, Cline, and more.
|
|
4
4
|
|
|
5
5
|
[](https://npmjs.org/package/byterover-cli)
|
|
6
6
|
[](https://npmjs.org/package/byterover-cli)
|
|
@@ -154,7 +154,7 @@ The **Context Tree** is ByteRover's structured knowledge system that helps you a
|
|
|
154
154
|
- **Organized Knowledge**: Structure your project knowledge by domain and topic
|
|
155
155
|
- **Easy Retrieval**: Find relevant context quickly when you need it
|
|
156
156
|
- **Persistent Memory**: Maintain project-specific knowledge across sessions
|
|
157
|
-
- **Agent-Friendly**: Works seamlessly with
|
|
157
|
+
- **Agent-Friendly**: Works seamlessly with 19 AI coding agents (Claude Code, Cursor, Windsurf, GitHub Copilot, Cline, and more) via skill files, MCP tools, hooks, or rules
|
|
158
158
|
- **Cloud Sync**: Push and sync your context tree to ByteRover's cloud storage for backup and team collaboration
|
|
159
159
|
- **Dynamic Domains**: Automatically creates new domains as your knowledge grows
|
|
160
160
|
|
|
@@ -167,13 +167,16 @@ The context tree organizes knowledge into:
|
|
|
167
167
|
|
|
168
168
|
## Supported AI Agents
|
|
169
169
|
|
|
170
|
-
ByteRover integrates with
|
|
170
|
+
ByteRover integrates with 19 AI coding agents:
|
|
171
171
|
|
|
172
172
|
**Skill Connector (Default):**
|
|
173
173
|
- Claude Code, Cursor
|
|
174
174
|
|
|
175
175
|
**MCP Connector (Default):**
|
|
176
|
-
- Amp, Augment Code, Cline, Gemini CLI, Github Copilot, Junie, Kilo Code, Kiro, Qoder, Qwen Code, Roo Code, Trae.ai, Warp, Windsurf, Zed
|
|
176
|
+
- Amp, Augment Code, Cline, Codex, Gemini CLI, Github Copilot, Junie, Kilo Code, Kiro, Qoder, Qwen Code, Roo Code, Trae.ai, Warp, Windsurf, Zed
|
|
177
|
+
|
|
178
|
+
**Rules Connector (Default):**
|
|
179
|
+
- Antigravity (rules-only integration)
|
|
177
180
|
|
|
178
181
|
**All agents support rules-based integration as a universal fallback option.**
|
|
179
182
|
|
|
@@ -190,12 +193,13 @@ Use `/connectors` to manage integrations with your AI coding agents:
|
|
|
190
193
|
ByteRover supports four connector types:
|
|
191
194
|
|
|
192
195
|
1. **Skill integration** (Claude Code, Cursor - default): Modern integration that writes 3 markdown files (SKILL.md, TROUBLESHOOTING.md, WORKFLOWS.md) to your agent's skills directory for easy discovery and guidance
|
|
193
|
-
2. **MCP integration** (16
|
|
196
|
+
2. **MCP integration** (16 agents - default): Exposes brv-query and brv-curate as Model Context Protocol tools that AI agents can call directly
|
|
194
197
|
3. **Rules-based** (all agents): Generates agent-specific rule files (e.g., CLAUDE.md, .cursorrules) with instructions for using ByteRover
|
|
195
198
|
4. **Hook integration** (Claude Code only - legacy): Direct injection via IDE settings, replaced by skill connector
|
|
196
199
|
|
|
197
200
|
**Defaults by agent:**
|
|
198
201
|
- Claude Code, Cursor: Skill connector
|
|
202
|
+
- Antigravity: Rules connector (only supported type)
|
|
199
203
|
- All others (16 agents): MCP connector
|
|
200
204
|
- Rules: Available for all agents as fallback
|
|
201
205
|
|
|
@@ -269,6 +273,7 @@ ByteRover supports four connector types:
|
|
|
269
273
|
|
|
270
274
|
**Defaults:**
|
|
271
275
|
- Claude Code, Cursor: `skill`
|
|
276
|
+
- Antigravity: `rules` (only supported type)
|
|
272
277
|
- All others: `mcp`
|
|
273
278
|
|
|
274
279
|
**Reset options:**
|
|
@@ -210,3 +210,14 @@ export declare class TooManyResultsError extends FileSystemError {
|
|
|
210
210
|
*/
|
|
211
211
|
constructor(operation: string, count: number, maxResults: number);
|
|
212
212
|
}
|
|
213
|
+
/**
|
|
214
|
+
* Error thrown when PDF text extraction fails.
|
|
215
|
+
*/
|
|
216
|
+
export declare class PdfExtractionError extends FileSystemError {
|
|
217
|
+
/**
|
|
218
|
+
* Creates a new PDF extraction error
|
|
219
|
+
* @param path - Path to the PDF file
|
|
220
|
+
* @param reason - Reason for the extraction failure
|
|
221
|
+
*/
|
|
222
|
+
constructor(path: string, reason: string);
|
|
223
|
+
}
|
|
@@ -290,3 +290,20 @@ export class TooManyResultsError extends FileSystemError {
|
|
|
290
290
|
this.name = 'TooManyResultsError';
|
|
291
291
|
}
|
|
292
292
|
}
|
|
293
|
+
/**
|
|
294
|
+
* Error thrown when PDF text extraction fails.
|
|
295
|
+
*/
|
|
296
|
+
export class PdfExtractionError extends FileSystemError {
|
|
297
|
+
/**
|
|
298
|
+
* Creates a new PDF extraction error
|
|
299
|
+
* @param path - Path to the PDF file
|
|
300
|
+
* @param reason - Reason for the extraction failure
|
|
301
|
+
*/
|
|
302
|
+
constructor(path, reason) {
|
|
303
|
+
super(`Failed to extract text from PDF: ${path}. ${reason}`, 'PDF_EXTRACTION_FAILED', {
|
|
304
|
+
path,
|
|
305
|
+
reason,
|
|
306
|
+
});
|
|
307
|
+
this.name = 'PdfExtractionError';
|
|
308
|
+
}
|
|
309
|
+
}
|
|
@@ -18,16 +18,46 @@ export interface FileSystemConfig {
|
|
|
18
18
|
/** Working directory for relative path resolution */
|
|
19
19
|
workingDirectory: string;
|
|
20
20
|
}
|
|
21
|
+
/**
|
|
22
|
+
* PDF read mode for controlling how PDF files are returned.
|
|
23
|
+
* - 'text': Extract text content page by page (default)
|
|
24
|
+
* - 'base64': Return raw PDF as base64 attachment (for multimodal LLMs)
|
|
25
|
+
*/
|
|
26
|
+
export type PdfReadMode = 'base64' | 'text';
|
|
27
|
+
/**
|
|
28
|
+
* Metadata extracted from a PDF file.
|
|
29
|
+
*/
|
|
30
|
+
export interface PdfMetadata {
|
|
31
|
+
/** Author of the PDF (if available) */
|
|
32
|
+
author?: string;
|
|
33
|
+
/** Creation date of the PDF (if available) */
|
|
34
|
+
creationDate?: Date;
|
|
35
|
+
/** Total number of pages in the PDF */
|
|
36
|
+
pageCount: number;
|
|
37
|
+
/** Title of the PDF (if available) */
|
|
38
|
+
title?: string;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Content extracted from a single PDF page.
|
|
42
|
+
*/
|
|
43
|
+
export interface PdfPageContent {
|
|
44
|
+
/** 1-based page number */
|
|
45
|
+
pageNumber: number;
|
|
46
|
+
/** Extracted text content from the page */
|
|
47
|
+
text: string;
|
|
48
|
+
}
|
|
21
49
|
/**
|
|
22
50
|
* Options for reading files.
|
|
23
51
|
*/
|
|
24
52
|
export interface ReadFileOptions {
|
|
25
53
|
/** Character encoding */
|
|
26
54
|
encoding?: BufferEncoding;
|
|
27
|
-
/** Maximum number of lines to read */
|
|
55
|
+
/** Maximum number of lines to read (for text files) or pages (for PDFs in text mode) */
|
|
28
56
|
limit?: number;
|
|
29
|
-
/** Starting line number (1-based
|
|
57
|
+
/** Starting line number (1-based) for text files, or starting page number for PDFs */
|
|
30
58
|
offset?: number;
|
|
59
|
+
/** PDF read mode: 'text' (default) extracts text, 'base64' returns raw attachment */
|
|
60
|
+
pdfMode?: PdfReadMode;
|
|
31
61
|
}
|
|
32
62
|
/**
|
|
33
63
|
* Options for writing files.
|
|
@@ -126,23 +156,27 @@ export interface FileAttachment {
|
|
|
126
156
|
* Result of a file read operation.
|
|
127
157
|
*/
|
|
128
158
|
export interface FileContent {
|
|
129
|
-
/** Attachment data for binary files (images, PDFs) */
|
|
159
|
+
/** Attachment data for binary files (images, PDFs in base64 mode) */
|
|
130
160
|
attachment?: FileAttachment;
|
|
131
161
|
/** File content as string */
|
|
132
162
|
content: string;
|
|
133
163
|
/** Character encoding used */
|
|
134
164
|
encoding: string;
|
|
135
|
-
/** Formatted content with line numbers (00001| content format) */
|
|
165
|
+
/** Formatted content with line numbers (00001| content format) or PDF page separators */
|
|
136
166
|
formattedContent: string;
|
|
137
|
-
/** Total number of lines in the returned content */
|
|
167
|
+
/** Total number of lines in the returned content (or pages for PDF text mode) */
|
|
138
168
|
lines: number;
|
|
139
169
|
/** Human-readable message about file status (truncation info, etc.) */
|
|
140
170
|
message: string;
|
|
171
|
+
/** PDF metadata when reading PDF in text mode */
|
|
172
|
+
pdfMetadata?: PdfMetadata;
|
|
173
|
+
/** PDF page contents when reading PDF in text mode */
|
|
174
|
+
pdfPages?: PdfPageContent[];
|
|
141
175
|
/** Preview of content (first 20 lines) for UI display */
|
|
142
176
|
preview?: string;
|
|
143
177
|
/** File size in bytes */
|
|
144
178
|
size: number;
|
|
145
|
-
/** Total lines in the entire file */
|
|
179
|
+
/** Total lines in the entire file (or total pages for PDF text mode) */
|
|
146
180
|
totalLines: number;
|
|
147
181
|
/** Whether content was truncated due to size/line limits */
|
|
148
182
|
truncated: boolean;
|
|
@@ -2,7 +2,7 @@ import type { ConnectorType } from './connector-type.js';
|
|
|
2
2
|
/**
|
|
3
3
|
* Array of all supported Agents.
|
|
4
4
|
*/
|
|
5
|
-
export declare const AGENT_VALUES: readonly ["Amp", "Augment Code", "Claude Code", "Cline", "Codex", "Cursor", "Gemini CLI", "Github Copilot", "Junie", "Kilo Code", "Kiro", "Qoder", "Qwen Code", "Roo Code", "Trae.ai", "Warp", "Windsurf", "Zed"];
|
|
5
|
+
export declare const AGENT_VALUES: readonly ["Amp", "Antigravity", "Augment Code", "Claude Code", "Cline", "Codex", "Cursor", "Gemini CLI", "Github Copilot", "Junie", "Kilo Code", "Kiro", "Qoder", "Qwen Code", "Roo Code", "Trae.ai", "Warp", "Windsurf", "Zed"];
|
|
6
6
|
export type Agent = (typeof AGENT_VALUES)[number];
|
|
7
7
|
/**
|
|
8
8
|
* Connector availability configuration for an agent.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
*/
|
|
4
4
|
export const AGENT_VALUES = [
|
|
5
5
|
'Amp',
|
|
6
|
+
'Antigravity',
|
|
6
7
|
'Augment Code',
|
|
7
8
|
'Claude Code',
|
|
8
9
|
'Cline',
|
|
@@ -30,6 +31,10 @@ export const AGENT_CONNECTOR_CONFIG = {
|
|
|
30
31
|
default: 'mcp',
|
|
31
32
|
supported: ['rules', 'mcp'],
|
|
32
33
|
},
|
|
34
|
+
Antigravity: {
|
|
35
|
+
default: 'rules',
|
|
36
|
+
supported: ['rules'],
|
|
37
|
+
},
|
|
33
38
|
'Augment Code': {
|
|
34
39
|
default: 'mcp',
|
|
35
40
|
supported: ['rules', 'mcp'],
|
|
@@ -8,7 +8,6 @@ import type { SystemPromptManager } from '../../../infra/cipher/system-prompt/sy
|
|
|
8
8
|
import type { ToolManager } from '../../../infra/cipher/tools/tool-manager.js';
|
|
9
9
|
import type { ToolProvider } from '../../../infra/cipher/tools/tool-provider.js';
|
|
10
10
|
import type { IBlobStorage } from './i-blob-storage.js';
|
|
11
|
-
import type { ICodingAgentLogWatcher } from './i-coding-agent-log-watcher.js';
|
|
12
11
|
import type { IHistoryStorage } from './i-history-storage.js';
|
|
13
12
|
import type { ILLMService } from './i-llm-service.js';
|
|
14
13
|
import type { IPolicyEngine } from './i-policy-engine.js';
|
|
@@ -28,12 +27,10 @@ import type { IToolScheduler } from './i-tool-scheduler.js';
|
|
|
28
27
|
* - HistoryStorage: Conversation history persistence
|
|
29
28
|
* - MemoryManager: Agent memory system
|
|
30
29
|
* - ToolProvider: Provides available tools
|
|
31
|
-
* - CodingAgentLogWatcher: Watches coding agent logs for learning (optional)
|
|
32
30
|
*/
|
|
33
31
|
export interface CipherAgentServices {
|
|
34
32
|
agentEventBus: AgentEventBus;
|
|
35
33
|
blobStorage: IBlobStorage;
|
|
36
|
-
codingAgentLogWatcher?: ICodingAgentLogWatcher;
|
|
37
34
|
/**
|
|
38
35
|
* CompactionService for context overflow management.
|
|
39
36
|
* Only available when granular storage is enabled (useGranularStorage: true).
|
|
@@ -8,8 +8,6 @@ export * from './cipher-services.js';
|
|
|
8
8
|
export type { IBlobStorage } from './i-blob-storage.js';
|
|
9
9
|
export type { IChatSession } from './i-chat-session.js';
|
|
10
10
|
export type { ICipherAgent } from './i-cipher-agent.js';
|
|
11
|
-
export type { ICodingAgentLogParser } from './i-coding-agent-log-parser.js';
|
|
12
|
-
export type { ICodingAgentLogWatcher } from './i-coding-agent-log-watcher.js';
|
|
13
11
|
export type { IContentGenerator } from './i-content-generator.js';
|
|
14
12
|
export type { IEventEmitter } from './i-event-emitter.js';
|
|
15
13
|
export type { IFileSystem } from './i-file-system.js';
|
|
@@ -33,6 +33,19 @@ export declare function isPdfFile(filePath: string, buffer?: Buffer): boolean;
|
|
|
33
33
|
*/
|
|
34
34
|
export declare function getMimeType(filePath: string): null | string;
|
|
35
35
|
/**
|
|
36
|
-
* Checks if a file is a media file (
|
|
36
|
+
* Checks if a file is a media file (only images supported at this point). PDFs are handled separately.
|
|
37
|
+
* @param filePath - Path to the file
|
|
38
|
+
*/
|
|
39
|
+
export declare function isMediaFile(filePath: string): boolean;
|
|
40
|
+
/**
|
|
41
|
+
* Determines if a file should be returned as a base64 attachment.
|
|
42
|
+
*
|
|
43
|
+
* - Images: Always returned as attachment
|
|
44
|
+
* - PDFs: Depends on pdfMode ('base64' = attachment, 'text' = extract text)
|
|
45
|
+
* - Other files: Never returned as attachment
|
|
46
|
+
*
|
|
47
|
+
* @param filePath - Path to the file
|
|
48
|
+
* @param pdfMode - PDF read mode ('text' | 'base64'), defaults to 'text'
|
|
49
|
+
* @returns true if file should be returned as base64 attachment
|
|
37
50
|
*/
|
|
38
|
-
export declare function
|
|
51
|
+
export declare function shouldReturnAsAttachment(filePath: string, pdfMode?: 'base64' | 'text'): boolean;
|
|
@@ -172,8 +172,31 @@ export function getMimeType(filePath) {
|
|
|
172
172
|
return MIME_TYPES[ext] ?? null;
|
|
173
173
|
}
|
|
174
174
|
/**
|
|
175
|
-
* Checks if a file is a media file (
|
|
175
|
+
* Checks if a file is a media file (only images supported at this point). PDFs are handled separately.
|
|
176
|
+
* @param filePath - Path to the file
|
|
176
177
|
*/
|
|
177
|
-
export function isMediaFile(filePath
|
|
178
|
-
return isImageFile(filePath)
|
|
178
|
+
export function isMediaFile(filePath) {
|
|
179
|
+
return isImageFile(filePath);
|
|
180
|
+
}
|
|
181
|
+
/**
|
|
182
|
+
* Determines if a file should be returned as a base64 attachment.
|
|
183
|
+
*
|
|
184
|
+
* - Images: Always returned as attachment
|
|
185
|
+
* - PDFs: Depends on pdfMode ('base64' = attachment, 'text' = extract text)
|
|
186
|
+
* - Other files: Never returned as attachment
|
|
187
|
+
*
|
|
188
|
+
* @param filePath - Path to the file
|
|
189
|
+
* @param pdfMode - PDF read mode ('text' | 'base64'), defaults to 'text'
|
|
190
|
+
* @returns true if file should be returned as base64 attachment
|
|
191
|
+
*/
|
|
192
|
+
export function shouldReturnAsAttachment(filePath, pdfMode) {
|
|
193
|
+
// Images are always returned as attachments
|
|
194
|
+
if (isImageFile(filePath)) {
|
|
195
|
+
return true;
|
|
196
|
+
}
|
|
197
|
+
// PDFs depend on pdfMode (if pdfMode is 'base64', return true)
|
|
198
|
+
if (isPdfFile(filePath) && pdfMode === 'base64') {
|
|
199
|
+
return true;
|
|
200
|
+
}
|
|
201
|
+
return false;
|
|
179
202
|
}
|
|
@@ -89,6 +89,15 @@ export declare class FileSystemService implements IFileSystem {
|
|
|
89
89
|
* Returns null if grep is not available or fails.
|
|
90
90
|
*/
|
|
91
91
|
private executeSystemGrep;
|
|
92
|
+
/**
|
|
93
|
+
* Extracts text content from a PDF file with pagination support.
|
|
94
|
+
* @param buffer - PDF file buffer
|
|
95
|
+
* @param filePath - Path to the PDF file
|
|
96
|
+
* @param fileSize - Size of the file in bytes
|
|
97
|
+
* @param options - Read options including offset and limit
|
|
98
|
+
* @returns FileContent with extracted text
|
|
99
|
+
*/
|
|
100
|
+
private extractPdfTextContent;
|
|
92
101
|
/**
|
|
93
102
|
* Checks if a command is available in the system's PATH.
|
|
94
103
|
*/
|
|
@@ -3,12 +3,13 @@ import { spawn } from 'node:child_process';
|
|
|
3
3
|
import fs from 'node:fs/promises';
|
|
4
4
|
import { EOL } from 'node:os';
|
|
5
5
|
import path from 'node:path';
|
|
6
|
-
import { DirectoryNotFoundError, EditOperationError, FileNotFoundError, FileTooLargeError, GlobOperationError, InvalidExtensionError, InvalidPathError, InvalidPatternError, PathBlockedError, PathNotAllowedError, PathTraversalError, ReadOperationError, SearchOperationError, ServiceNotInitializedError, StringNotFoundError, StringNotUniqueError, WriteOperationError, } from '../../../core/domain/cipher/errors/file-system-error.js';
|
|
6
|
+
import { DirectoryNotFoundError, EditOperationError, FileNotFoundError, FileTooLargeError, GlobOperationError, InvalidExtensionError, InvalidPathError, InvalidPatternError, PathBlockedError, PathNotAllowedError, PathTraversalError, PdfExtractionError, ReadOperationError, SearchOperationError, ServiceNotInitializedError, StringNotFoundError, StringNotUniqueError, WriteOperationError, } from '../../../core/domain/cipher/errors/file-system-error.js';
|
|
7
7
|
import { getErrorMessage } from '../../../utils/error-helpers.js';
|
|
8
|
-
import { getMimeType, isBinaryFile,
|
|
8
|
+
import { getMimeType, isBinaryFile, isImageFile, isPdfFile, shouldReturnAsAttachment } from './binary-utils.js';
|
|
9
9
|
import { createGitignoreFilter } from './gitignore-filter.js';
|
|
10
10
|
import { collectFileMetadata, escapeIfExactMatch, extractPaths, sortFilesByRecency } from './glob-utils.js';
|
|
11
11
|
import { PathValidator } from './path-validator.js';
|
|
12
|
+
import { formatPdfContent, PdfExtractor } from './pdf-extractor.js';
|
|
12
13
|
/**
|
|
13
14
|
* Maximum line length for search results.
|
|
14
15
|
* Prevents context overflow from minified files or long lines.
|
|
@@ -434,12 +435,12 @@ export class FileSystemService {
|
|
|
434
435
|
if (stats.size > this.config.maxFileSize) {
|
|
435
436
|
throw new FileTooLargeError(normalizedPath, stats.size, this.config.maxFileSize);
|
|
436
437
|
}
|
|
437
|
-
// Handle
|
|
438
|
-
if (
|
|
438
|
+
// Handle files that should be returned as base64 attachments (images always, PDFs when pdfMode='base64')
|
|
439
|
+
if (shouldReturnAsAttachment(normalizedPath, options.pdfMode)) {
|
|
439
440
|
const buffer = await fs.readFile(normalizedPath);
|
|
440
|
-
const mimeType = getMimeType(normalizedPath) ?? 'application/octet-stream';
|
|
441
|
-
const fileType = isPdfFile(normalizedPath) ? 'PDF' : 'Image';
|
|
442
441
|
const baseName = path.basename(normalizedPath);
|
|
442
|
+
const mimeType = getMimeType(normalizedPath) ?? 'application/octet-stream';
|
|
443
|
+
const fileType = isImageFile(normalizedPath) ? 'Image' : 'PDF';
|
|
443
444
|
return {
|
|
444
445
|
attachment: {
|
|
445
446
|
base64: buffer.toString('base64'),
|
|
@@ -456,6 +457,11 @@ export class FileSystemService {
|
|
|
456
457
|
truncated: false,
|
|
457
458
|
};
|
|
458
459
|
}
|
|
460
|
+
// Handle PDF files with text extraction (pdfMode='text')
|
|
461
|
+
if (isPdfFile(normalizedPath)) {
|
|
462
|
+
const buffer = await fs.readFile(normalizedPath);
|
|
463
|
+
return this.extractPdfTextContent(buffer, normalizedPath, stats.size, options);
|
|
464
|
+
}
|
|
459
465
|
// Check for binary files (read first 4KB for detection)
|
|
460
466
|
const handle = await fs.open(normalizedPath, 'r');
|
|
461
467
|
const sampleBuffer = Buffer.alloc(BINARY_DETECTION_BUFFER_SIZE);
|
|
@@ -486,7 +492,7 @@ export class FileSystemService {
|
|
|
486
492
|
if (truncated) {
|
|
487
493
|
const remainingLines = totalLines - lastReadLine;
|
|
488
494
|
message =
|
|
489
|
-
`Read lines ${offset + 1}-${lastReadLine}
|
|
495
|
+
`Read lines ${offset + 1}-${lastReadLine}. ` +
|
|
490
496
|
`${remainingLines} more lines available. Use offset=${lastReadLine + 1} to continue reading.`;
|
|
491
497
|
}
|
|
492
498
|
else {
|
|
@@ -520,7 +526,8 @@ export class FileSystemService {
|
|
|
520
526
|
error instanceof PathNotAllowedError ||
|
|
521
527
|
error instanceof PathTraversalError ||
|
|
522
528
|
error instanceof PathBlockedError ||
|
|
523
|
-
error instanceof ReadOperationError
|
|
529
|
+
error instanceof ReadOperationError ||
|
|
530
|
+
error instanceof PdfExtractionError) {
|
|
524
531
|
throw error;
|
|
525
532
|
}
|
|
526
533
|
// Wrap other errors
|
|
@@ -725,6 +732,82 @@ export class FileSystemService {
|
|
|
725
732
|
return null;
|
|
726
733
|
}
|
|
727
734
|
}
|
|
735
|
+
/**
|
|
736
|
+
* Extracts text content from a PDF file with pagination support.
|
|
737
|
+
* @param buffer - PDF file buffer
|
|
738
|
+
* @param filePath - Path to the PDF file
|
|
739
|
+
* @param fileSize - Size of the file in bytes
|
|
740
|
+
* @param options - Read options including offset and limit
|
|
741
|
+
* @returns FileContent with extracted text
|
|
742
|
+
*/
|
|
743
|
+
async extractPdfTextContent(buffer, filePath, fileSize, options) {
|
|
744
|
+
// Extract text with pagination
|
|
745
|
+
const result = await PdfExtractor.extractText(buffer, filePath, {
|
|
746
|
+
limit: options.limit,
|
|
747
|
+
offset: options.offset,
|
|
748
|
+
});
|
|
749
|
+
const { hasMore, metadata, pages } = result;
|
|
750
|
+
const totalPages = metadata.pageCount;
|
|
751
|
+
// Check if PDF has no extractable text
|
|
752
|
+
const hasText = pages.some((p) => p.text.trim().length > 0);
|
|
753
|
+
if (!hasText && pages.length > 0) {
|
|
754
|
+
// Return helpful message for scanned/image-only PDFs
|
|
755
|
+
const metaInfo = metadata.title ? ` Title: "${metadata.title}".` : '';
|
|
756
|
+
return {
|
|
757
|
+
content: '',
|
|
758
|
+
encoding: 'utf8',
|
|
759
|
+
formattedContent: `<file type="pdf" pages="${totalPages}">\n[PDF has no extractable text - likely scanned or image-only]${metaInfo}\n</file>`,
|
|
760
|
+
lines: 0,
|
|
761
|
+
message: `PDF has no extractable text (${totalPages} pages).${metaInfo} ` +
|
|
762
|
+
"This PDF may be scanned or contain only images. Try reading with pdfMode='base64' for multimodal analysis.",
|
|
763
|
+
pdfMetadata: metadata,
|
|
764
|
+
pdfPages: pages,
|
|
765
|
+
size: fileSize,
|
|
766
|
+
totalLines: totalPages,
|
|
767
|
+
truncated: false,
|
|
768
|
+
};
|
|
769
|
+
}
|
|
770
|
+
// Calculate next offset for continuation
|
|
771
|
+
const startPage = options.offset ?? 1;
|
|
772
|
+
const pagesRead = pages.length;
|
|
773
|
+
const nextOffset = startPage + pagesRead;
|
|
774
|
+
// Format content with page separators
|
|
775
|
+
const formattedText = formatPdfContent(pages, metadata, hasMore, nextOffset);
|
|
776
|
+
// Build XML-wrapped formatted content
|
|
777
|
+
const formattedContent = `<file type="pdf" pages="${totalPages}">\n${formattedText}\n</file>`;
|
|
778
|
+
// Build message
|
|
779
|
+
let message;
|
|
780
|
+
if (pagesRead === 0) {
|
|
781
|
+
message = `PDF has ${totalPages} pages. Requested offset ${startPage} is beyond the last page.`;
|
|
782
|
+
}
|
|
783
|
+
else if (hasMore) {
|
|
784
|
+
const endPage = startPage + pagesRead - 1;
|
|
785
|
+
const remainingPages = totalPages - endPage;
|
|
786
|
+
message =
|
|
787
|
+
`Read pages ${startPage}-${endPage}. ` +
|
|
788
|
+
`${remainingPages} more pages available. Must set offset=${nextOffset} to continue reading.`;
|
|
789
|
+
}
|
|
790
|
+
else {
|
|
791
|
+
message = `End of PDF - read ${pagesRead} pages (${totalPages} total).`;
|
|
792
|
+
}
|
|
793
|
+
// Generate preview (first page text, truncated)
|
|
794
|
+
const previewText = pages[0]?.text ?? '';
|
|
795
|
+
const previewLines = previewText.split('\n').slice(0, PREVIEW_LINES);
|
|
796
|
+
const preview = previewLines.join('\n');
|
|
797
|
+
return {
|
|
798
|
+
content: pages.map((p) => p.text).join('\n\n'),
|
|
799
|
+
encoding: 'utf8',
|
|
800
|
+
formattedContent,
|
|
801
|
+
lines: pagesRead,
|
|
802
|
+
message,
|
|
803
|
+
pdfMetadata: metadata,
|
|
804
|
+
pdfPages: pages,
|
|
805
|
+
preview,
|
|
806
|
+
size: fileSize,
|
|
807
|
+
totalLines: totalPages,
|
|
808
|
+
truncated: hasMore,
|
|
809
|
+
};
|
|
810
|
+
}
|
|
728
811
|
/**
|
|
729
812
|
* Checks if a command is available in the system's PATH.
|
|
730
813
|
*/
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import type { PdfMetadata, PdfPageContent } from '../../../core/domain/cipher/file-system/types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Options for PDF text extraction.
|
|
4
|
+
*/
|
|
5
|
+
export interface PdfExtractOptions {
|
|
6
|
+
/** Maximum number of pages to extract (default: 100, max: 200) */
|
|
7
|
+
limit?: number;
|
|
8
|
+
/** Starting page number (1-based, default: 1) */
|
|
9
|
+
offset?: number;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Result of PDF text extraction.
|
|
13
|
+
*/
|
|
14
|
+
export interface PdfExtractResult {
|
|
15
|
+
/** Whether there are more pages available after this extraction */
|
|
16
|
+
hasMore: boolean;
|
|
17
|
+
/** PDF metadata (page count, title, author, etc.) */
|
|
18
|
+
metadata: PdfMetadata;
|
|
19
|
+
/** Extracted page contents */
|
|
20
|
+
pages: PdfPageContent[];
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* PDF text extraction and metadata extraction utility.
|
|
24
|
+
* Provides page-by-page extraction with pagination support.
|
|
25
|
+
*
|
|
26
|
+
* Features:
|
|
27
|
+
* - Magic byte validation
|
|
28
|
+
* - Fast metadata-only extraction
|
|
29
|
+
* - Page-by-page text extraction with offset/limit
|
|
30
|
+
* - Default: 100 pages, max: 200 pages per extraction
|
|
31
|
+
*/
|
|
32
|
+
export declare class PdfExtractor {
|
|
33
|
+
/**
|
|
34
|
+
* Extracts metadata from a PDF buffer without extracting text.
|
|
35
|
+
* This is a fast path when you only need page count, title, author, etc.
|
|
36
|
+
*
|
|
37
|
+
* @param buffer - PDF file buffer
|
|
38
|
+
* @param filePath - Path to the PDF file (for error messages)
|
|
39
|
+
* @returns PDF metadata
|
|
40
|
+
*/
|
|
41
|
+
static extractMetadata(buffer: Buffer, filePath: string): Promise<PdfMetadata>;
|
|
42
|
+
/**
|
|
43
|
+
* Extracts text from a PDF buffer with pagination support.
|
|
44
|
+
*
|
|
45
|
+
* @param buffer - PDF file buffer
|
|
46
|
+
* @param filePath - Path to the PDF file (for error messages)
|
|
47
|
+
* @param options - Extraction options (offset, limit)
|
|
48
|
+
* @returns Extraction result with pages, metadata, and continuation info
|
|
49
|
+
*/
|
|
50
|
+
static extractText(buffer: Buffer, filePath: string, options?: PdfExtractOptions): Promise<PdfExtractResult>;
|
|
51
|
+
/**
|
|
52
|
+
* Checks if a buffer contains valid PDF magic bytes.
|
|
53
|
+
* @param buffer - Buffer to check
|
|
54
|
+
* @returns true if buffer starts with %PDF-
|
|
55
|
+
*/
|
|
56
|
+
static isValidPdf(buffer: Buffer): boolean;
|
|
57
|
+
/**
|
|
58
|
+
* Builds PdfMetadata from unpdf meta info object.
|
|
59
|
+
* @param pageCount - Total number of pages
|
|
60
|
+
* @param info - Optional info object from unpdf getMeta
|
|
61
|
+
* @returns PdfMetadata object
|
|
62
|
+
*/
|
|
63
|
+
private static buildMetadataFromInfo;
|
|
64
|
+
/**
|
|
65
|
+
* Extracts text from specific pages of a PDF document.
|
|
66
|
+
* Uses PDF.js page-level API for efficient extraction of page ranges.
|
|
67
|
+
*
|
|
68
|
+
* @param pdf - PDF document proxy from unpdf
|
|
69
|
+
* @param startPage - Starting page number (1-based)
|
|
70
|
+
* @param endPage - Ending page number (1-based, inclusive)
|
|
71
|
+
* @returns Array of PdfPageContent with extracted text
|
|
72
|
+
*/
|
|
73
|
+
private static extractPagesFromDocument;
|
|
74
|
+
/**
|
|
75
|
+
* Extracts a meaningful error message from an unknown error.
|
|
76
|
+
*/
|
|
77
|
+
private static getExtractionErrorMessage;
|
|
78
|
+
/**
|
|
79
|
+
* Parses PDF date string format (D:YYYYMMDDHHmmSS) to Date object.
|
|
80
|
+
* @param dateStr - PDF date string
|
|
81
|
+
* @returns Parsed Date or undefined if invalid
|
|
82
|
+
*/
|
|
83
|
+
private static parsePdfDate;
|
|
84
|
+
/**
|
|
85
|
+
* Wraps extraction errors with appropriate PdfExtractionError.
|
|
86
|
+
* @param error - The caught error
|
|
87
|
+
* @param filePath - Path to the PDF file
|
|
88
|
+
* @returns PdfExtractionError with appropriate message
|
|
89
|
+
*/
|
|
90
|
+
private static wrapExtractionError;
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Formats extracted PDF pages into a readable string with page separators.
|
|
94
|
+
* @param pages - Array of extracted page contents
|
|
95
|
+
* @param metadata - PDF metadata
|
|
96
|
+
* @param hasMore - Whether there are more pages
|
|
97
|
+
* @param nextOffset - Next offset for continuation (if hasMore is true)
|
|
98
|
+
* @returns Formatted string with page separators
|
|
99
|
+
*/
|
|
100
|
+
export declare function formatPdfContent(pages: PdfPageContent[], metadata: PdfMetadata, hasMore: boolean, nextOffset: number): string;
|