byterover-cli 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/README.md +10 -5
  2. package/dist/core/domain/cipher/errors/file-system-error.d.ts +11 -0
  3. package/dist/core/domain/cipher/errors/file-system-error.js +17 -0
  4. package/dist/core/domain/cipher/file-system/types.d.ts +40 -6
  5. package/dist/core/domain/entities/agent.d.ts +1 -1
  6. package/dist/core/domain/entities/agent.js +5 -0
  7. package/dist/core/interfaces/cipher/cipher-services.d.ts +0 -3
  8. package/dist/core/interfaces/cipher/index.d.ts +0 -2
  9. package/dist/infra/cipher/file-system/binary-utils.d.ts +15 -2
  10. package/dist/infra/cipher/file-system/binary-utils.js +26 -3
  11. package/dist/infra/cipher/file-system/file-system-service.d.ts +9 -0
  12. package/dist/infra/cipher/file-system/file-system-service.js +91 -8
  13. package/dist/infra/cipher/file-system/pdf-extractor.d.ts +100 -0
  14. package/dist/infra/cipher/file-system/pdf-extractor.js +226 -0
  15. package/dist/infra/cipher/tools/implementations/read-file-tool.js +24 -4
  16. package/dist/infra/connectors/rules/rules-connector-config.d.ts +4 -0
  17. package/dist/infra/connectors/rules/rules-connector-config.js +4 -0
  18. package/dist/infra/mcp/tools/brv-curate-tool.d.ts +10 -4
  19. package/dist/infra/mcp/tools/brv-curate-tool.js +9 -4
  20. package/dist/infra/repl/commands/space/switch-command.js +0 -2
  21. package/dist/infra/usecase/curate-use-case.js +10 -4
  22. package/dist/infra/usecase/space-switch-use-case.d.ts +0 -10
  23. package/dist/infra/usecase/space-switch-use-case.js +7 -37
  24. package/dist/oclif/hooks/init/welcome.js +4 -17
  25. package/dist/resources/prompts/curate.yml +1 -0
  26. package/dist/resources/tools/read_file.txt +5 -2
  27. package/dist/utils/file-validator.js +8 -4
  28. package/oclif.manifest.json +1 -54
  29. package/package.json +4 -2
  30. package/dist/core/interfaces/cipher/i-coding-agent-log-parser.d.ts +0 -20
  31. package/dist/core/interfaces/cipher/i-coding-agent-log-parser.js +0 -1
  32. package/dist/core/interfaces/cipher/i-coding-agent-log-watcher.d.ts +0 -31
  33. package/dist/core/interfaces/cipher/i-coding-agent-log-watcher.js +0 -1
  34. package/dist/core/interfaces/i-file-watcher-service.d.ts +0 -41
  35. package/dist/core/interfaces/i-file-watcher-service.js +0 -1
  36. package/dist/core/interfaces/parser/i-clean-parser-service.d.ts +0 -18
  37. package/dist/core/interfaces/parser/i-clean-parser-service.js +0 -1
  38. package/dist/core/interfaces/parser/i-raw-parser-service.d.ts +0 -17
  39. package/dist/core/interfaces/parser/i-raw-parser-service.js +0 -1
  40. package/dist/core/interfaces/parser/i-session-normalizer.d.ts +0 -56
  41. package/dist/core/interfaces/parser/i-session-normalizer.js +0 -1
  42. package/dist/infra/cipher/parsers/coding-agent-log-parser.d.ts +0 -24
  43. package/dist/infra/cipher/parsers/coding-agent-log-parser.js +0 -51
  44. package/dist/infra/cipher/watcher/coding-agent-log-watcher.d.ts +0 -14
  45. package/dist/infra/cipher/watcher/coding-agent-log-watcher.js +0 -55
  46. package/dist/infra/parsers/clean/clean-claude-service.d.ts +0 -111
  47. package/dist/infra/parsers/clean/clean-claude-service.js +0 -271
  48. package/dist/infra/parsers/clean/clean-codex-service.d.ts +0 -231
  49. package/dist/infra/parsers/clean/clean-codex-service.js +0 -534
  50. package/dist/infra/parsers/clean/clean-copilot-service.d.ts +0 -255
  51. package/dist/infra/parsers/clean/clean-copilot-service.js +0 -729
  52. package/dist/infra/parsers/clean/clean-cursor-service.d.ts +0 -161
  53. package/dist/infra/parsers/clean/clean-cursor-service.js +0 -432
  54. package/dist/infra/parsers/clean/clean-parser-service-factory.d.ts +0 -54
  55. package/dist/infra/parsers/clean/clean-parser-service-factory.js +0 -80
  56. package/dist/infra/parsers/clean/shared.d.ts +0 -84
  57. package/dist/infra/parsers/clean/shared.js +0 -273
  58. package/dist/infra/parsers/raw/raw-claude-service.d.ts +0 -195
  59. package/dist/infra/parsers/raw/raw-claude-service.js +0 -548
  60. package/dist/infra/parsers/raw/raw-codex-service.d.ts +0 -313
  61. package/dist/infra/parsers/raw/raw-codex-service.js +0 -782
  62. package/dist/infra/parsers/raw/raw-copilot-service.d.ts +0 -196
  63. package/dist/infra/parsers/raw/raw-copilot-service.js +0 -558
  64. package/dist/infra/parsers/raw/raw-cursor-service.d.ts +0 -316
  65. package/dist/infra/parsers/raw/raw-cursor-service.js +0 -818
  66. package/dist/infra/parsers/raw/raw-parser-service-factory.d.ts +0 -54
  67. package/dist/infra/parsers/raw/raw-parser-service-factory.js +0 -81
  68. package/dist/infra/watcher/file-watcher-service.d.ts +0 -10
  69. package/dist/infra/watcher/file-watcher-service.js +0 -81
  70. package/dist/oclif/commands/watch.d.ts +0 -25
  71. package/dist/oclif/commands/watch.js +0 -175
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # ByteRover CLI
2
2
 
3
- Command-line interface for ByteRover, featuring an interactive REPL with a modern React/Ink terminal UI for managing your project's context tree and knowledge storage. Seamlessly integrate with 18 AI coding agents via modern skill files, MCP tools, or rules-based integration—supports Claude Code, Cursor, Windsurf, GitHub Copilot, Cline, and 13 more.
3
+ Command-line interface for ByteRover, featuring an interactive REPL with a modern React/Ink terminal UI for managing your project's context tree and knowledge storage. Seamlessly integrate with 19 AI coding agents via modern skill files, MCP tools, or rules-based integration—supports Claude Code, Cursor, Windsurf, GitHub Copilot, Cline, and more.
4
4
 
5
5
  [![Version](https://img.shields.io/npm/v/byterover-cli.svg)](https://npmjs.org/package/byterover-cli)
6
6
  [![Downloads/week](https://img.shields.io/npm/dw/byterover-cli.svg)](https://npmjs.org/package/byterover-cli)
@@ -154,7 +154,7 @@ The **Context Tree** is ByteRover's structured knowledge system that helps you a
154
154
  - **Organized Knowledge**: Structure your project knowledge by domain and topic
155
155
  - **Easy Retrieval**: Find relevant context quickly when you need it
156
156
  - **Persistent Memory**: Maintain project-specific knowledge across sessions
157
- - **Agent-Friendly**: Works seamlessly with 18 AI coding agents (Claude Code, Cursor, Windsurf, GitHub Copilot, Cline, and 13 more) via skill files, MCP tools, hooks, or rules
157
+ - **Agent-Friendly**: Works seamlessly with 19 AI coding agents (Claude Code, Cursor, Windsurf, GitHub Copilot, Cline, and more) via skill files, MCP tools, hooks, or rules
158
158
  - **Cloud Sync**: Push and sync your context tree to ByteRover's cloud storage for backup and team collaboration
159
159
  - **Dynamic Domains**: Automatically creates new domains as your knowledge grows
160
160
 
@@ -167,13 +167,16 @@ The context tree organizes knowledge into:
167
167
 
168
168
  ## Supported AI Agents
169
169
 
170
- ByteRover integrates with 18 AI coding agents:
170
+ ByteRover integrates with 19 AI coding agents:
171
171
 
172
172
  **Skill Connector (Default):**
173
173
  - Claude Code, Cursor
174
174
 
175
175
  **MCP Connector (Default):**
176
- - Amp, Augment Code, Cline, Gemini CLI, Github Copilot, Junie, Kilo Code, Kiro, Qoder, Qwen Code, Roo Code, Trae.ai, Warp, Windsurf, Zed (and Codex via global scope)
176
+ - Amp, Augment Code, Cline, Codex, Gemini CLI, Github Copilot, Junie, Kilo Code, Kiro, Qoder, Qwen Code, Roo Code, Trae.ai, Warp, Windsurf, Zed
177
+
178
+ **Rules Connector (Default):**
179
+ - Antigravity (rules-only integration)
177
180
 
178
181
  **All agents support rules-based integration as a universal fallback option.**
179
182
 
@@ -190,12 +193,13 @@ Use `/connectors` to manage integrations with your AI coding agents:
190
193
  ByteRover supports four connector types:
191
194
 
192
195
  1. **Skill integration** (Claude Code, Cursor - default): Modern integration that writes 3 markdown files (SKILL.md, TROUBLESHOOTING.md, WORKFLOWS.md) to your agent's skills directory for easy discovery and guidance
193
- 2. **MCP integration** (16 other agents - default): Exposes brv-query and brv-curate as Model Context Protocol tools that AI agents can call directly
196
+ 2. **MCP integration** (16 agents - default): Exposes brv-query and brv-curate as Model Context Protocol tools that AI agents can call directly
194
197
  3. **Rules-based** (all agents): Generates agent-specific rule files (e.g., CLAUDE.md, .cursorrules) with instructions for using ByteRover
195
198
  4. **Hook integration** (Claude Code only - legacy): Direct injection via IDE settings, replaced by skill connector
196
199
 
197
200
  **Defaults by agent:**
198
201
  - Claude Code, Cursor: Skill connector
202
+ - Antigravity: Rules connector (only supported type)
199
203
  - All others (16 agents): MCP connector
200
204
  - Rules: Available for all agents as fallback
201
205
 
@@ -269,6 +273,7 @@ ByteRover supports four connector types:
269
273
 
270
274
  **Defaults:**
271
275
  - Claude Code, Cursor: `skill`
276
+ - Antigravity: `rules` (only supported type)
272
277
  - All others: `mcp`
273
278
 
274
279
  **Reset options:**
@@ -210,3 +210,14 @@ export declare class TooManyResultsError extends FileSystemError {
210
210
  */
211
211
  constructor(operation: string, count: number, maxResults: number);
212
212
  }
213
+ /**
214
+ * Error thrown when PDF text extraction fails.
215
+ */
216
+ export declare class PdfExtractionError extends FileSystemError {
217
+ /**
218
+ * Creates a new PDF extraction error
219
+ * @param path - Path to the PDF file
220
+ * @param reason - Reason for the extraction failure
221
+ */
222
+ constructor(path: string, reason: string);
223
+ }
@@ -290,3 +290,20 @@ export class TooManyResultsError extends FileSystemError {
290
290
  this.name = 'TooManyResultsError';
291
291
  }
292
292
  }
293
+ /**
294
+ * Error thrown when PDF text extraction fails.
295
+ */
296
+ export class PdfExtractionError extends FileSystemError {
297
+ /**
298
+ * Creates a new PDF extraction error
299
+ * @param path - Path to the PDF file
300
+ * @param reason - Reason for the extraction failure
301
+ */
302
+ constructor(path, reason) {
303
+ super(`Failed to extract text from PDF: ${path}. ${reason}`, 'PDF_EXTRACTION_FAILED', {
304
+ path,
305
+ reason,
306
+ });
307
+ this.name = 'PdfExtractionError';
308
+ }
309
+ }
@@ -18,16 +18,46 @@ export interface FileSystemConfig {
18
18
  /** Working directory for relative path resolution */
19
19
  workingDirectory: string;
20
20
  }
21
+ /**
22
+ * PDF read mode for controlling how PDF files are returned.
23
+ * - 'text': Extract text content page by page (default)
24
+ * - 'base64': Return raw PDF as base64 attachment (for multimodal LLMs)
25
+ */
26
+ export type PdfReadMode = 'base64' | 'text';
27
+ /**
28
+ * Metadata extracted from a PDF file.
29
+ */
30
+ export interface PdfMetadata {
31
+ /** Author of the PDF (if available) */
32
+ author?: string;
33
+ /** Creation date of the PDF (if available) */
34
+ creationDate?: Date;
35
+ /** Total number of pages in the PDF */
36
+ pageCount: number;
37
+ /** Title of the PDF (if available) */
38
+ title?: string;
39
+ }
40
+ /**
41
+ * Content extracted from a single PDF page.
42
+ */
43
+ export interface PdfPageContent {
44
+ /** 1-based page number */
45
+ pageNumber: number;
46
+ /** Extracted text content from the page */
47
+ text: string;
48
+ }
21
49
  /**
22
50
  * Options for reading files.
23
51
  */
24
52
  export interface ReadFileOptions {
25
53
  /** Character encoding */
26
54
  encoding?: BufferEncoding;
27
- /** Maximum number of lines to read */
55
+ /** Maximum number of lines to read (for text files) or pages (for PDFs in text mode) */
28
56
  limit?: number;
29
- /** Starting line number (1-based, like text editors) */
57
+ /** Starting line number (1-based) for text files, or starting page number for PDFs */
30
58
  offset?: number;
59
+ /** PDF read mode: 'text' (default) extracts text, 'base64' returns raw attachment */
60
+ pdfMode?: PdfReadMode;
31
61
  }
32
62
  /**
33
63
  * Options for writing files.
@@ -126,23 +156,27 @@ export interface FileAttachment {
126
156
  * Result of a file read operation.
127
157
  */
128
158
  export interface FileContent {
129
- /** Attachment data for binary files (images, PDFs) */
159
+ /** Attachment data for binary files (images, PDFs in base64 mode) */
130
160
  attachment?: FileAttachment;
131
161
  /** File content as string */
132
162
  content: string;
133
163
  /** Character encoding used */
134
164
  encoding: string;
135
- /** Formatted content with line numbers (00001| content format) */
165
+ /** Formatted content with line numbers (00001| content format) or PDF page separators */
136
166
  formattedContent: string;
137
- /** Total number of lines in the returned content */
167
+ /** Total number of lines in the returned content (or pages for PDF text mode) */
138
168
  lines: number;
139
169
  /** Human-readable message about file status (truncation info, etc.) */
140
170
  message: string;
171
+ /** PDF metadata when reading PDF in text mode */
172
+ pdfMetadata?: PdfMetadata;
173
+ /** PDF page contents when reading PDF in text mode */
174
+ pdfPages?: PdfPageContent[];
141
175
  /** Preview of content (first 20 lines) for UI display */
142
176
  preview?: string;
143
177
  /** File size in bytes */
144
178
  size: number;
145
- /** Total lines in the entire file */
179
+ /** Total lines in the entire file (or total pages for PDF text mode) */
146
180
  totalLines: number;
147
181
  /** Whether content was truncated due to size/line limits */
148
182
  truncated: boolean;
@@ -2,7 +2,7 @@ import type { ConnectorType } from './connector-type.js';
2
2
  /**
3
3
  * Array of all supported Agents.
4
4
  */
5
- export declare const AGENT_VALUES: readonly ["Amp", "Augment Code", "Claude Code", "Cline", "Codex", "Cursor", "Gemini CLI", "Github Copilot", "Junie", "Kilo Code", "Kiro", "Qoder", "Qwen Code", "Roo Code", "Trae.ai", "Warp", "Windsurf", "Zed"];
5
+ export declare const AGENT_VALUES: readonly ["Amp", "Antigravity", "Augment Code", "Claude Code", "Cline", "Codex", "Cursor", "Gemini CLI", "Github Copilot", "Junie", "Kilo Code", "Kiro", "Qoder", "Qwen Code", "Roo Code", "Trae.ai", "Warp", "Windsurf", "Zed"];
6
6
  export type Agent = (typeof AGENT_VALUES)[number];
7
7
  /**
8
8
  * Connector availability configuration for an agent.
@@ -3,6 +3,7 @@
3
3
  */
4
4
  export const AGENT_VALUES = [
5
5
  'Amp',
6
+ 'Antigravity',
6
7
  'Augment Code',
7
8
  'Claude Code',
8
9
  'Cline',
@@ -30,6 +31,10 @@ export const AGENT_CONNECTOR_CONFIG = {
30
31
  default: 'mcp',
31
32
  supported: ['rules', 'mcp'],
32
33
  },
34
+ Antigravity: {
35
+ default: 'rules',
36
+ supported: ['rules'],
37
+ },
33
38
  'Augment Code': {
34
39
  default: 'mcp',
35
40
  supported: ['rules', 'mcp'],
@@ -8,7 +8,6 @@ import type { SystemPromptManager } from '../../../infra/cipher/system-prompt/sy
8
8
  import type { ToolManager } from '../../../infra/cipher/tools/tool-manager.js';
9
9
  import type { ToolProvider } from '../../../infra/cipher/tools/tool-provider.js';
10
10
  import type { IBlobStorage } from './i-blob-storage.js';
11
- import type { ICodingAgentLogWatcher } from './i-coding-agent-log-watcher.js';
12
11
  import type { IHistoryStorage } from './i-history-storage.js';
13
12
  import type { ILLMService } from './i-llm-service.js';
14
13
  import type { IPolicyEngine } from './i-policy-engine.js';
@@ -28,12 +27,10 @@ import type { IToolScheduler } from './i-tool-scheduler.js';
28
27
  * - HistoryStorage: Conversation history persistence
29
28
  * - MemoryManager: Agent memory system
30
29
  * - ToolProvider: Provides available tools
31
- * - CodingAgentLogWatcher: Watches coding agent logs for learning (optional)
32
30
  */
33
31
  export interface CipherAgentServices {
34
32
  agentEventBus: AgentEventBus;
35
33
  blobStorage: IBlobStorage;
36
- codingAgentLogWatcher?: ICodingAgentLogWatcher;
37
34
  /**
38
35
  * CompactionService for context overflow management.
39
36
  * Only available when granular storage is enabled (useGranularStorage: true).
@@ -8,8 +8,6 @@ export * from './cipher-services.js';
8
8
  export type { IBlobStorage } from './i-blob-storage.js';
9
9
  export type { IChatSession } from './i-chat-session.js';
10
10
  export type { ICipherAgent } from './i-cipher-agent.js';
11
- export type { ICodingAgentLogParser } from './i-coding-agent-log-parser.js';
12
- export type { ICodingAgentLogWatcher } from './i-coding-agent-log-watcher.js';
13
11
  export type { IContentGenerator } from './i-content-generator.js';
14
12
  export type { IEventEmitter } from './i-event-emitter.js';
15
13
  export type { IFileSystem } from './i-file-system.js';
@@ -33,6 +33,19 @@ export declare function isPdfFile(filePath: string, buffer?: Buffer): boolean;
33
33
  */
34
34
  export declare function getMimeType(filePath: string): null | string;
35
35
  /**
36
- * Checks if a file is a media file (image or PDF) for base64 attachment handling.
36
+ * Checks if a file is a media file (only images supported at this point). PDFs are handled separately.
37
+ * @param filePath - Path to the file
38
+ */
39
+ export declare function isMediaFile(filePath: string): boolean;
40
+ /**
41
+ * Determines if a file should be returned as a base64 attachment.
42
+ *
43
+ * - Images: Always returned as attachment
44
+ * - PDFs: Depends on pdfMode ('base64' = attachment, 'text' = extract text)
45
+ * - Other files: Never returned as attachment
46
+ *
47
+ * @param filePath - Path to the file
48
+ * @param pdfMode - PDF read mode ('text' | 'base64'), defaults to 'text'
49
+ * @returns true if file should be returned as base64 attachment
37
50
  */
38
- export declare function isMediaFile(filePath: string, buffer?: Buffer): boolean;
51
+ export declare function shouldReturnAsAttachment(filePath: string, pdfMode?: 'base64' | 'text'): boolean;
@@ -172,8 +172,31 @@ export function getMimeType(filePath) {
172
172
  return MIME_TYPES[ext] ?? null;
173
173
  }
174
174
  /**
175
- * Checks if a file is a media file (image or PDF) for base64 attachment handling.
175
+ * Checks if a file is a media file (only images supported at this point). PDFs are handled separately.
176
+ * @param filePath - Path to the file
176
177
  */
177
- export function isMediaFile(filePath, buffer) {
178
- return isImageFile(filePath) || isPdfFile(filePath, buffer);
178
+ export function isMediaFile(filePath) {
179
+ return isImageFile(filePath);
180
+ }
181
+ /**
182
+ * Determines if a file should be returned as a base64 attachment.
183
+ *
184
+ * - Images: Always returned as attachment
185
+ * - PDFs: Depends on pdfMode ('base64' = attachment, 'text' = extract text)
186
+ * - Other files: Never returned as attachment
187
+ *
188
+ * @param filePath - Path to the file
189
+ * @param pdfMode - PDF read mode ('text' | 'base64'), defaults to 'text'
190
+ * @returns true if file should be returned as base64 attachment
191
+ */
192
+ export function shouldReturnAsAttachment(filePath, pdfMode) {
193
+ // Images are always returned as attachments
194
+ if (isImageFile(filePath)) {
195
+ return true;
196
+ }
197
+ // PDFs depend on pdfMode (if pdfMode is 'base64', return true)
198
+ if (isPdfFile(filePath) && pdfMode === 'base64') {
199
+ return true;
200
+ }
201
+ return false;
179
202
  }
@@ -89,6 +89,15 @@ export declare class FileSystemService implements IFileSystem {
89
89
  * Returns null if grep is not available or fails.
90
90
  */
91
91
  private executeSystemGrep;
92
+ /**
93
+ * Extracts text content from a PDF file with pagination support.
94
+ * @param buffer - PDF file buffer
95
+ * @param filePath - Path to the PDF file
96
+ * @param fileSize - Size of the file in bytes
97
+ * @param options - Read options including offset and limit
98
+ * @returns FileContent with extracted text
99
+ */
100
+ private extractPdfTextContent;
92
101
  /**
93
102
  * Checks if a command is available in the system's PATH.
94
103
  */
@@ -3,12 +3,13 @@ import { spawn } from 'node:child_process';
3
3
  import fs from 'node:fs/promises';
4
4
  import { EOL } from 'node:os';
5
5
  import path from 'node:path';
6
- import { DirectoryNotFoundError, EditOperationError, FileNotFoundError, FileTooLargeError, GlobOperationError, InvalidExtensionError, InvalidPathError, InvalidPatternError, PathBlockedError, PathNotAllowedError, PathTraversalError, ReadOperationError, SearchOperationError, ServiceNotInitializedError, StringNotFoundError, StringNotUniqueError, WriteOperationError, } from '../../../core/domain/cipher/errors/file-system-error.js';
6
+ import { DirectoryNotFoundError, EditOperationError, FileNotFoundError, FileTooLargeError, GlobOperationError, InvalidExtensionError, InvalidPathError, InvalidPatternError, PathBlockedError, PathNotAllowedError, PathTraversalError, PdfExtractionError, ReadOperationError, SearchOperationError, ServiceNotInitializedError, StringNotFoundError, StringNotUniqueError, WriteOperationError, } from '../../../core/domain/cipher/errors/file-system-error.js';
7
7
  import { getErrorMessage } from '../../../utils/error-helpers.js';
8
- import { getMimeType, isBinaryFile, isMediaFile, isPdfFile } from './binary-utils.js';
8
+ import { getMimeType, isBinaryFile, isImageFile, isPdfFile, shouldReturnAsAttachment } from './binary-utils.js';
9
9
  import { createGitignoreFilter } from './gitignore-filter.js';
10
10
  import { collectFileMetadata, escapeIfExactMatch, extractPaths, sortFilesByRecency } from './glob-utils.js';
11
11
  import { PathValidator } from './path-validator.js';
12
+ import { formatPdfContent, PdfExtractor } from './pdf-extractor.js';
12
13
  /**
13
14
  * Maximum line length for search results.
14
15
  * Prevents context overflow from minified files or long lines.
@@ -434,12 +435,12 @@ export class FileSystemService {
434
435
  if (stats.size > this.config.maxFileSize) {
435
436
  throw new FileTooLargeError(normalizedPath, stats.size, this.config.maxFileSize);
436
437
  }
437
- // Handle image/PDF files - return as base64 attachment
438
- if (isMediaFile(normalizedPath)) {
438
+ // Handle files that should be returned as base64 attachments (images always, PDFs when pdfMode='base64')
439
+ if (shouldReturnAsAttachment(normalizedPath, options.pdfMode)) {
439
440
  const buffer = await fs.readFile(normalizedPath);
440
- const mimeType = getMimeType(normalizedPath) ?? 'application/octet-stream';
441
- const fileType = isPdfFile(normalizedPath) ? 'PDF' : 'Image';
442
441
  const baseName = path.basename(normalizedPath);
442
+ const mimeType = getMimeType(normalizedPath) ?? 'application/octet-stream';
443
+ const fileType = isImageFile(normalizedPath) ? 'Image' : 'PDF';
443
444
  return {
444
445
  attachment: {
445
446
  base64: buffer.toString('base64'),
@@ -456,6 +457,11 @@ export class FileSystemService {
456
457
  truncated: false,
457
458
  };
458
459
  }
460
+ // Handle PDF files with text extraction (pdfMode='text')
461
+ if (isPdfFile(normalizedPath)) {
462
+ const buffer = await fs.readFile(normalizedPath);
463
+ return this.extractPdfTextContent(buffer, normalizedPath, stats.size, options);
464
+ }
459
465
  // Check for binary files (read first 4KB for detection)
460
466
  const handle = await fs.open(normalizedPath, 'r');
461
467
  const sampleBuffer = Buffer.alloc(BINARY_DETECTION_BUFFER_SIZE);
@@ -486,7 +492,7 @@ export class FileSystemService {
486
492
  if (truncated) {
487
493
  const remainingLines = totalLines - lastReadLine;
488
494
  message =
489
- `Read lines ${offset + 1}-${lastReadLine} of ${totalLines} total lines. ` +
495
+ `Read lines ${offset + 1}-${lastReadLine}. ` +
490
496
  `${remainingLines} more lines available. Use offset=${lastReadLine + 1} to continue reading.`;
491
497
  }
492
498
  else {
@@ -520,7 +526,8 @@ export class FileSystemService {
520
526
  error instanceof PathNotAllowedError ||
521
527
  error instanceof PathTraversalError ||
522
528
  error instanceof PathBlockedError ||
523
- error instanceof ReadOperationError) {
529
+ error instanceof ReadOperationError ||
530
+ error instanceof PdfExtractionError) {
524
531
  throw error;
525
532
  }
526
533
  // Wrap other errors
@@ -725,6 +732,82 @@ export class FileSystemService {
725
732
  return null;
726
733
  }
727
734
  }
735
+ /**
736
+ * Extracts text content from a PDF file with pagination support.
737
+ * @param buffer - PDF file buffer
738
+ * @param filePath - Path to the PDF file
739
+ * @param fileSize - Size of the file in bytes
740
+ * @param options - Read options including offset and limit
741
+ * @returns FileContent with extracted text
742
+ */
743
+ async extractPdfTextContent(buffer, filePath, fileSize, options) {
744
+ // Extract text with pagination
745
+ const result = await PdfExtractor.extractText(buffer, filePath, {
746
+ limit: options.limit,
747
+ offset: options.offset,
748
+ });
749
+ const { hasMore, metadata, pages } = result;
750
+ const totalPages = metadata.pageCount;
751
+ // Check if PDF has no extractable text
752
+ const hasText = pages.some((p) => p.text.trim().length > 0);
753
+ if (!hasText && pages.length > 0) {
754
+ // Return helpful message for scanned/image-only PDFs
755
+ const metaInfo = metadata.title ? ` Title: "${metadata.title}".` : '';
756
+ return {
757
+ content: '',
758
+ encoding: 'utf8',
759
+ formattedContent: `<file type="pdf" pages="${totalPages}">\n[PDF has no extractable text - likely scanned or image-only]${metaInfo}\n</file>`,
760
+ lines: 0,
761
+ message: `PDF has no extractable text (${totalPages} pages).${metaInfo} ` +
762
+ "This PDF may be scanned or contain only images. Try reading with pdfMode='base64' for multimodal analysis.",
763
+ pdfMetadata: metadata,
764
+ pdfPages: pages,
765
+ size: fileSize,
766
+ totalLines: totalPages,
767
+ truncated: false,
768
+ };
769
+ }
770
+ // Calculate next offset for continuation
771
+ const startPage = options.offset ?? 1;
772
+ const pagesRead = pages.length;
773
+ const nextOffset = startPage + pagesRead;
774
+ // Format content with page separators
775
+ const formattedText = formatPdfContent(pages, metadata, hasMore, nextOffset);
776
+ // Build XML-wrapped formatted content
777
+ const formattedContent = `<file type="pdf" pages="${totalPages}">\n${formattedText}\n</file>`;
778
+ // Build message
779
+ let message;
780
+ if (pagesRead === 0) {
781
+ message = `PDF has ${totalPages} pages. Requested offset ${startPage} is beyond the last page.`;
782
+ }
783
+ else if (hasMore) {
784
+ const endPage = startPage + pagesRead - 1;
785
+ const remainingPages = totalPages - endPage;
786
+ message =
787
+ `Read pages ${startPage}-${endPage}. ` +
788
+ `${remainingPages} more pages available. Must set offset=${nextOffset} to continue reading.`;
789
+ }
790
+ else {
791
+ message = `End of PDF - read ${pagesRead} pages (${totalPages} total).`;
792
+ }
793
+ // Generate preview (first page text, truncated)
794
+ const previewText = pages[0]?.text ?? '';
795
+ const previewLines = previewText.split('\n').slice(0, PREVIEW_LINES);
796
+ const preview = previewLines.join('\n');
797
+ return {
798
+ content: pages.map((p) => p.text).join('\n\n'),
799
+ encoding: 'utf8',
800
+ formattedContent,
801
+ lines: pagesRead,
802
+ message,
803
+ pdfMetadata: metadata,
804
+ pdfPages: pages,
805
+ preview,
806
+ size: fileSize,
807
+ totalLines: totalPages,
808
+ truncated: hasMore,
809
+ };
810
+ }
728
811
  /**
729
812
  * Checks if a command is available in the system's PATH.
730
813
  */
@@ -0,0 +1,100 @@
1
+ import type { PdfMetadata, PdfPageContent } from '../../../core/domain/cipher/file-system/types.js';
2
+ /**
3
+ * Options for PDF text extraction.
4
+ */
5
+ export interface PdfExtractOptions {
6
+ /** Maximum number of pages to extract (default: 100, max: 200) */
7
+ limit?: number;
8
+ /** Starting page number (1-based, default: 1) */
9
+ offset?: number;
10
+ }
11
+ /**
12
+ * Result of PDF text extraction.
13
+ */
14
+ export interface PdfExtractResult {
15
+ /** Whether there are more pages available after this extraction */
16
+ hasMore: boolean;
17
+ /** PDF metadata (page count, title, author, etc.) */
18
+ metadata: PdfMetadata;
19
+ /** Extracted page contents */
20
+ pages: PdfPageContent[];
21
+ }
22
+ /**
23
+ * PDF text extraction and metadata extraction utility.
24
+ * Provides page-by-page extraction with pagination support.
25
+ *
26
+ * Features:
27
+ * - Magic byte validation
28
+ * - Fast metadata-only extraction
29
+ * - Page-by-page text extraction with offset/limit
30
+ * - Default: 100 pages, max: 200 pages per extraction
31
+ */
32
+ export declare class PdfExtractor {
33
+ /**
34
+ * Extracts metadata from a PDF buffer without extracting text.
35
+ * This is a fast path when you only need page count, title, author, etc.
36
+ *
37
+ * @param buffer - PDF file buffer
38
+ * @param filePath - Path to the PDF file (for error messages)
39
+ * @returns PDF metadata
40
+ */
41
+ static extractMetadata(buffer: Buffer, filePath: string): Promise<PdfMetadata>;
42
+ /**
43
+ * Extracts text from a PDF buffer with pagination support.
44
+ *
45
+ * @param buffer - PDF file buffer
46
+ * @param filePath - Path to the PDF file (for error messages)
47
+ * @param options - Extraction options (offset, limit)
48
+ * @returns Extraction result with pages, metadata, and continuation info
49
+ */
50
+ static extractText(buffer: Buffer, filePath: string, options?: PdfExtractOptions): Promise<PdfExtractResult>;
51
+ /**
52
+ * Checks if a buffer contains valid PDF magic bytes.
53
+ * @param buffer - Buffer to check
54
+ * @returns true if buffer starts with %PDF-
55
+ */
56
+ static isValidPdf(buffer: Buffer): boolean;
57
+ /**
58
+ * Builds PdfMetadata from unpdf meta info object.
59
+ * @param pageCount - Total number of pages
60
+ * @param info - Optional info object from unpdf getMeta
61
+ * @returns PdfMetadata object
62
+ */
63
+ private static buildMetadataFromInfo;
64
+ /**
65
+ * Extracts text from specific pages of a PDF document.
66
+ * Uses PDF.js page-level API for efficient extraction of page ranges.
67
+ *
68
+ * @param pdf - PDF document proxy from unpdf
69
+ * @param startPage - Starting page number (1-based)
70
+ * @param endPage - Ending page number (1-based, inclusive)
71
+ * @returns Array of PdfPageContent with extracted text
72
+ */
73
+ private static extractPagesFromDocument;
74
+ /**
75
+ * Extracts a meaningful error message from an unknown error.
76
+ */
77
+ private static getExtractionErrorMessage;
78
+ /**
79
+ * Parses PDF date string format (D:YYYYMMDDHHmmSS) to Date object.
80
+ * @param dateStr - PDF date string
81
+ * @returns Parsed Date or undefined if invalid
82
+ */
83
+ private static parsePdfDate;
84
+ /**
85
+ * Wraps extraction errors with appropriate PdfExtractionError.
86
+ * @param error - The caught error
87
+ * @param filePath - Path to the PDF file
88
+ * @returns PdfExtractionError with appropriate message
89
+ */
90
+ private static wrapExtractionError;
91
+ }
92
+ /**
93
+ * Formats extracted PDF pages into a readable string with page separators.
94
+ * @param pages - Array of extracted page contents
95
+ * @param metadata - PDF metadata
96
+ * @param hasMore - Whether there are more pages
97
+ * @param nextOffset - Next offset for continuation (if hasMore is true)
98
+ * @returns Formatted string with page separators
99
+ */
100
+ export declare function formatPdfContent(pages: PdfPageContent[], metadata: PdfMetadata, hasMore: boolean, nextOffset: number): string;