codevault 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +37 -0
- package/LICENSE +21 -0
- package/README.md +174 -0
- package/dist/chunking/file-grouper.d.ts +39 -0
- package/dist/chunking/file-grouper.d.ts.map +1 -0
- package/dist/chunking/file-grouper.js +181 -0
- package/dist/chunking/file-grouper.js.map +1 -0
- package/dist/chunking/semantic-chunker.d.ts +37 -0
- package/dist/chunking/semantic-chunker.d.ts.map +1 -0
- package/dist/chunking/semantic-chunker.js +172 -0
- package/dist/chunking/semantic-chunker.js.map +1 -0
- package/dist/chunking/token-counter.d.ts +28 -0
- package/dist/chunking/token-counter.d.ts.map +1 -0
- package/dist/chunking/token-counter.js +207 -0
- package/dist/chunking/token-counter.js.map +1 -0
- package/dist/cli/commands/context.d.ts +3 -0
- package/dist/cli/commands/context.d.ts.map +1 -0
- package/dist/cli/commands/context.js +98 -0
- package/dist/cli/commands/context.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +222 -0
- package/dist/cli.js.map +1 -0
- package/dist/codemap/io.d.ts +5 -0
- package/dist/codemap/io.d.ts.map +1 -0
- package/dist/codemap/io.js +30 -0
- package/dist/codemap/io.js.map +1 -0
- package/dist/context/packs.d.ts +33 -0
- package/dist/context/packs.d.ts.map +1 -0
- package/dist/context/packs.js +180 -0
- package/dist/context/packs.js.map +1 -0
- package/dist/core/indexer.d.ts +3 -0
- package/dist/core/indexer.d.ts.map +1 -0
- package/dist/core/indexer.js +610 -0
- package/dist/core/indexer.js.map +1 -0
- package/dist/core/metadata.d.ts +19 -0
- package/dist/core/metadata.d.ts.map +1 -0
- package/dist/core/metadata.js +161 -0
- package/dist/core/metadata.js.map +1 -0
- package/dist/core/search.d.ts +6 -0
- package/dist/core/search.d.ts.map +1 -0
- package/dist/core/search.js +482 -0
- package/dist/core/search.js.map +1 -0
- package/dist/core/symbol-extractor.d.ts +3 -0
- package/dist/core/symbol-extractor.d.ts.map +1 -0
- package/dist/core/symbol-extractor.js +78 -0
- package/dist/core/symbol-extractor.js.map +1 -0
- package/dist/core/types.d.ts +96 -0
- package/dist/core/types.d.ts.map +1 -0
- package/dist/core/types.js +2 -0
- package/dist/core/types.js.map +1 -0
- package/dist/database/db.d.ts +63 -0
- package/dist/database/db.d.ts.map +1 -0
- package/dist/database/db.js +205 -0
- package/dist/database/db.js.map +1 -0
- package/dist/indexer/merkle.d.ts +13 -0
- package/dist/indexer/merkle.d.ts.map +1 -0
- package/dist/indexer/merkle.js +86 -0
- package/dist/indexer/merkle.js.map +1 -0
- package/dist/indexer/update.d.ts +19 -0
- package/dist/indexer/update.d.ts.map +1 -0
- package/dist/indexer/update.js +40 -0
- package/dist/indexer/update.js.map +1 -0
- package/dist/indexer/watch.d.ts +21 -0
- package/dist/indexer/watch.d.ts.map +1 -0
- package/dist/indexer/watch.js +204 -0
- package/dist/indexer/watch.js.map +1 -0
- package/dist/languages/rules.d.ts +11 -0
- package/dist/languages/rules.d.ts.map +1 -0
- package/dist/languages/rules.js +371 -0
- package/dist/languages/rules.js.map +1 -0
- package/dist/languages/tree-sitter-loader.d.ts +27 -0
- package/dist/languages/tree-sitter-loader.d.ts.map +1 -0
- package/dist/languages/tree-sitter-loader.js +75 -0
- package/dist/languages/tree-sitter-loader.js.map +1 -0
- package/dist/mcp/tools/use-context-pack.d.ts +57 -0
- package/dist/mcp/tools/use-context-pack.d.ts.map +1 -0
- package/dist/mcp/tools/use-context-pack.js +91 -0
- package/dist/mcp/tools/use-context-pack.js.map +1 -0
- package/dist/mcp-server.d.ts +3 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +363 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/providers/base.d.ts +34 -0
- package/dist/providers/base.d.ts.map +1 -0
- package/dist/providers/base.js +160 -0
- package/dist/providers/base.js.map +1 -0
- package/dist/providers/index.d.ts +6 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +22 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/ollama.d.ts +13 -0
- package/dist/providers/ollama.d.ts.map +1 -0
- package/dist/providers/ollama.js +50 -0
- package/dist/providers/ollama.js.map +1 -0
- package/dist/providers/openai.d.ts +13 -0
- package/dist/providers/openai.d.ts.map +1 -0
- package/dist/providers/openai.js +59 -0
- package/dist/providers/openai.js.map +1 -0
- package/dist/providers/token-counter.d.ts +2 -0
- package/dist/providers/token-counter.d.ts.map +1 -0
- package/dist/providers/token-counter.js +18 -0
- package/dist/providers/token-counter.js.map +1 -0
- package/dist/ranking/api-reranker.d.ts +18 -0
- package/dist/ranking/api-reranker.d.ts.map +1 -0
- package/dist/ranking/api-reranker.js +148 -0
- package/dist/ranking/api-reranker.js.map +1 -0
- package/dist/ranking/symbol-boost.d.ts +15 -0
- package/dist/ranking/symbol-boost.d.ts.map +1 -0
- package/dist/ranking/symbol-boost.js +154 -0
- package/dist/ranking/symbol-boost.js.map +1 -0
- package/dist/search/bm25.d.ts +17 -0
- package/dist/search/bm25.d.ts.map +1 -0
- package/dist/search/bm25.js +56 -0
- package/dist/search/bm25.js.map +1 -0
- package/dist/search/hybrid.d.ts +21 -0
- package/dist/search/hybrid.d.ts.map +1 -0
- package/dist/search/hybrid.js +50 -0
- package/dist/search/hybrid.js.map +1 -0
- package/dist/search/scope.d.ts +5 -0
- package/dist/search/scope.d.ts.map +1 -0
- package/dist/search/scope.js +107 -0
- package/dist/search/scope.js.map +1 -0
- package/dist/storage/encrypted-chunks.d.ts +40 -0
- package/dist/storage/encrypted-chunks.d.ts.map +1 -0
- package/dist/storage/encrypted-chunks.js +236 -0
- package/dist/storage/encrypted-chunks.js.map +1 -0
- package/dist/symbols/extract.d.ts +15 -0
- package/dist/symbols/extract.d.ts.map +1 -0
- package/dist/symbols/extract.js +187 -0
- package/dist/symbols/extract.js.map +1 -0
- package/dist/symbols/graph.d.ts +3 -0
- package/dist/symbols/graph.d.ts.map +1 -0
- package/dist/symbols/graph.js +89 -0
- package/dist/symbols/graph.js.map +1 -0
- package/dist/types/ast.d.ts +3 -0
- package/dist/types/ast.d.ts.map +1 -0
- package/dist/types/ast.js +2 -0
- package/dist/types/ast.js.map +1 -0
- package/dist/types/codemap.d.ts +58 -0
- package/dist/types/codemap.d.ts.map +1 -0
- package/dist/types/codemap.js +224 -0
- package/dist/types/codemap.js.map +1 -0
- package/dist/types/context-pack.d.ts +47 -0
- package/dist/types/context-pack.d.ts.map +1 -0
- package/dist/types/context-pack.js +44 -0
- package/dist/types/context-pack.js.map +1 -0
- package/dist/types/search.d.ts +15 -0
- package/dist/types/search.d.ts.map +1 -0
- package/dist/types/search.js +11 -0
- package/dist/types/search.js.map +1 -0
- package/dist/utils/rate-limiter.d.ts +31 -0
- package/dist/utils/rate-limiter.d.ts.map +1 -0
- package/dist/utils/rate-limiter.js +168 -0
- package/dist/utils/rate-limiter.js.map +1 -0
- package/package.json +92 -0
package/.env.example
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# OpenAI / Nebius API Configuration
|
|
2
|
+
# For Nebius AI (Qwen embeddings), use your Nebius JWT token and base URL
|
|
3
|
+
OPENAI_API_KEY=your-api-key-here
|
|
4
|
+
OPENAI_BASE_URL=https://api.openai.com/v1
|
|
5
|
+
|
|
6
|
+
# Embedding Model Configuration
|
|
7
|
+
# For standard OpenAI: text-embedding-3-large
|
|
8
|
+
# For Nebius/Qwen: Qwen/Qwen3-Embedding-8B
|
|
9
|
+
CODEVAULT_OPENAI_EMBEDDING_MODEL=text-embedding-3-large
|
|
10
|
+
|
|
11
|
+
# For Ollama (local embeddings)
|
|
12
|
+
CODEVAULT_OLLAMA_MODEL=nomic-embed-text
|
|
13
|
+
|
|
14
|
+
# Chunking Configuration
|
|
15
|
+
# Standard: 8192 tokens, Qwen: 32000 tokens
|
|
16
|
+
CODEVAULT_MAX_TOKENS=8192
|
|
17
|
+
|
|
18
|
+
# Embedding Dimensions
|
|
19
|
+
# OpenAI text-embedding-3-large: 3072
|
|
20
|
+
# Qwen3-Embedding-8B: 4096
|
|
21
|
+
# Ollama nomic-embed-text: 768
|
|
22
|
+
CODEVAULT_DIMENSIONS=3072
|
|
23
|
+
|
|
24
|
+
# Rate Limiting
|
|
25
|
+
# Adjust based on your API tier
|
|
26
|
+
CODEVAULT_RATE_LIMIT_RPM=10000
|
|
27
|
+
CODEVAULT_RATE_LIMIT_TPM=600000
|
|
28
|
+
|
|
29
|
+
# Reranking API (Optional)
|
|
30
|
+
# For NVIDIA reranking service
|
|
31
|
+
CODEVAULT_RERANK_API_URL=https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking
|
|
32
|
+
CODEVAULT_RERANK_API_KEY=your-nvidia-api-key-here
|
|
33
|
+
CODEVAULT_RERANK_MODEL=nvidia/nv-rerankqa-mistral-4b-v3
|
|
34
|
+
|
|
35
|
+
# Encryption (Optional)
|
|
36
|
+
# 32-byte key encoded as base64 or hex
|
|
37
|
+
# CODEVAULT_ENCRYPTION_KEY=your-encryption-key-here
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Shariq Riaz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# CodeVault
|
|
2
|
+
|
|
3
|
+
> AI-powered semantic code search via Model Context Protocol (MCP)
|
|
4
|
+
|
|
5
|
+
CodeVault is an intelligent code indexing and search system that enables AI assistants to understand and navigate your codebase using semantic search, symbol-aware ranking, and hybrid retrieval techniques.
|
|
6
|
+
|
|
7
|
+
## 🌟 Features
|
|
8
|
+
|
|
9
|
+
- **🔍 Semantic Code Search**: Find code by meaning, not just keywords
|
|
10
|
+
- **🤖 MCP Integration**: Native support for Claude Desktop and other MCP clients
|
|
11
|
+
- **🎯 Symbol-Aware Ranking**: Boost results based on function signatures and relationships
|
|
12
|
+
- **⚡ Hybrid Search**: Combines vector embeddings with BM25 keyword matching
|
|
13
|
+
- **🔄 Context Packs**: Save and reuse search scopes for different projects
|
|
14
|
+
- **📊 Multi-Language Support**: 25+ programming languages via Tree-sitter
|
|
15
|
+
- **🏠 Local-First**: Works with local models (Ollama) or cloud APIs
|
|
16
|
+
- **🔐 Optional Encryption**: Secure your indexed code chunks
|
|
17
|
+
- **📈 Smart Chunking**: Token-aware code splitting for optimal context
|
|
18
|
+
|
|
19
|
+
## 🚀 Quick Start
|
|
20
|
+
|
|
21
|
+
### Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npm install
|
|
25
|
+
npm run build
|
|
26
|
+
npm link
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Index Your Project
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# Navigate to your project
|
|
33
|
+
cd /path/to/your/project
|
|
34
|
+
|
|
35
|
+
# Using Ollama (local, no API key required)
|
|
36
|
+
codevault index --provider ollama
|
|
37
|
+
|
|
38
|
+
# Using OpenAI
|
|
39
|
+
export OPENAI_API_KEY=your-key-here
|
|
40
|
+
codevault index --provider openai
|
|
41
|
+
|
|
42
|
+
# Using Qwen (via Nebius AI Studio)
|
|
43
|
+
export OPENAI_API_KEY=your-nebius-api-key
|
|
44
|
+
export OPENAI_BASE_URL=https://api.studio.nebius.com/v1/
|
|
45
|
+
export CODEVAULT_OPENAI_EMBEDDING_MODEL=Qwen/Qwen3-Embedding-8B
|
|
46
|
+
codevault index --provider openai
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Search Your Code
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
codevault search "authentication function"
|
|
53
|
+
codevault search "stripe checkout" --tags stripe --lang php
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Use with Claude Desktop
|
|
57
|
+
|
|
58
|
+
Add to your `claude_desktop_config.json`:
|
|
59
|
+
|
|
60
|
+
```json
|
|
61
|
+
{
|
|
62
|
+
"mcpServers": {
|
|
63
|
+
"codevault": {
|
|
64
|
+
"command": "node",
|
|
65
|
+
"args": ["/path/to/codevault-v2/dist/mcp-server.js"],
|
|
66
|
+
"env": {
|
|
67
|
+
"OPENAI_API_KEY": "your-api-key-here"
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 📖 Documentation
|
|
75
|
+
|
|
76
|
+
### Supported Languages
|
|
77
|
+
|
|
78
|
+
- **Web**: JavaScript, TypeScript, TSX, HTML, CSS, JSON, Markdown
|
|
79
|
+
- **Backend**: Python, PHP, Go, Java, Kotlin, C#, Ruby, Scala, Swift
|
|
80
|
+
- **Systems**: C, C++, Rust
|
|
81
|
+
- **Functional**: Haskell, OCaml, Elixir
|
|
82
|
+
- **Scripting**: Bash, Lua
|
|
83
|
+
|
|
84
|
+
### Embedding Providers
|
|
85
|
+
|
|
86
|
+
| Provider | Model | Dimensions | Context | Best For | API Key Required |
|
|
87
|
+
|----------|-------|------------|---------|----------|------------------|
|
|
88
|
+
| **ollama** | nomic-embed-text | 768 | 8K | Local, no API costs | ❌ No |
|
|
89
|
+
| **openai** | text-embedding-3-large | 3072 | 8K | Highest quality | ✅ Yes |
|
|
90
|
+
| **openai** | Qwen/Qwen3-Embedding-8B | 4096 | 32K | Large context, high quality | ✅ Yes (Nebius AI) |
|
|
91
|
+
|
|
92
|
+
### CLI Commands
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Indexing
|
|
96
|
+
codevault index [path] # Index project
|
|
97
|
+
codevault update [path] # Update existing index
|
|
98
|
+
codevault watch [path] # Watch for changes
|
|
99
|
+
|
|
100
|
+
# Searching
|
|
101
|
+
codevault search <query> [path] # Search code
|
|
102
|
+
--limit <num> # Max results
|
|
103
|
+
--provider <name> # Embedding provider
|
|
104
|
+
--path_glob <pattern> # Filter by file pattern
|
|
105
|
+
--tags <tag...> # Filter by tags
|
|
106
|
+
--lang <language...> # Filter by language
|
|
107
|
+
|
|
108
|
+
# Context Packs
|
|
109
|
+
codevault context list # List saved contexts
|
|
110
|
+
codevault context show <name> # Show context pack
|
|
111
|
+
codevault context use <name> # Activate context pack
|
|
112
|
+
|
|
113
|
+
# Utilities
|
|
114
|
+
codevault info # Project statistics
|
|
115
|
+
codevault mcp # Start MCP server
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### MCP Tools
|
|
119
|
+
|
|
120
|
+
- **`search_code`**: Semantic code search with filters
|
|
121
|
+
- **`search_code_with_chunks`**: Search + retrieve full code
|
|
122
|
+
- **`get_code_chunk`**: Get specific code by SHA
|
|
123
|
+
- **`index_project`**: Index a new project
|
|
124
|
+
- **`update_project`**: Update existing index
|
|
125
|
+
- **`get_project_stats`**: Project overview
|
|
126
|
+
- **`use_context_pack`**: Apply saved search context
|
|
127
|
+
|
|
128
|
+
### Environment Variables
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# OpenAI Configuration
|
|
132
|
+
OPENAI_API_KEY=sk-...
|
|
133
|
+
OPENAI_BASE_URL=https://api.openai.com/v1
|
|
134
|
+
CODEVAULT_OPENAI_EMBEDDING_MODEL=text-embedding-3-large
|
|
135
|
+
|
|
136
|
+
# Ollama Configuration
|
|
137
|
+
CODEVAULT_OLLAMA_MODEL=nomic-embed-text
|
|
138
|
+
|
|
139
|
+
# Chunking
|
|
140
|
+
CODEVAULT_MAX_TOKENS=8192
|
|
141
|
+
CODEVAULT_DIMENSIONS=3072
|
|
142
|
+
|
|
143
|
+
# Rate Limiting
|
|
144
|
+
CODEVAULT_RATE_LIMIT_RPM=10000 # Requests per minute
|
|
145
|
+
CODEVAULT_RATE_LIMIT_TPM=600000 # Tokens per minute
|
|
146
|
+
|
|
147
|
+
# Reranking
|
|
148
|
+
CODEVAULT_RERANK_API_URL=...
|
|
149
|
+
CODEVAULT_RERANK_API_KEY=...
|
|
150
|
+
CODEVAULT_RERANK_MODEL=...
|
|
151
|
+
|
|
152
|
+
# Encryption
|
|
153
|
+
CODEVAULT_ENCRYPTION_KEY=...
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## 🏗️ Architecture
|
|
157
|
+
|
|
158
|
+
### Project Structure
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
.codevault/
|
|
162
|
+
├── codevault.db # SQLite database
|
|
163
|
+
├── chunks/ # Compressed code chunks
|
|
164
|
+
└── contextpacks/ # Saved search contexts
|
|
165
|
+
codevault.codemap.json # Lightweight index
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## 📄 License
|
|
169
|
+
|
|
170
|
+
MIT License
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
**Built by Shariq Riaz**
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import type { ModelProfile } from '../providers/base.js';
import type { TreeSitterNode } from '../types/ast.js';
/** Per-language chunking rule: an open-ended bag of settings keyed by name. */
interface LanguageRule {
    subdivisionTypes?: Record<string, string[]>;
    [key: string]: any;
}
/** A node paired with its measured size and its sliced source text. */
interface NodeAnalysis {
    node: TreeSitterNode;
    size: number;
    code: string;
}
/** A run of nodes that belong together: a container declaration, or a plain file section. */
interface SemanticGroup {
    type: string;
    containerType?: string;
    nodes: TreeSitterNode[];
    analyses: NodeAnalysis[];
    parentNode: TreeSitterNode | null;
}
/** Packed output unit: nodes combined toward the profile's optimal chunk size. */
export interface NodeGroup {
    nodes: TreeSitterNode[];
    totalSize: number;
    groupInfo: SemanticGroup[];
}
/** Bucket a file's top-level nodes into chunk-sized NodeGroups. */
export declare function groupNodesForChunking(nodes: TreeSitterNode[], source: string, profile: ModelProfile, rule: LanguageRule): Promise<NodeGroup[]>;
/** A merged chunk: joined code plus a synthetic node spanning the whole group. */
export interface CombinedChunk {
    code: string;
    node: TreeSitterNode & {
        type: string;
    };
    metadata: {
        isGroup: boolean;
        nodeCount: number;
        totalSize: number;
        groupTypes: string[];
    };
}
/** Merge a NodeGroup into one CombinedChunk; returns null for an empty group. */
export declare function createCombinedChunk(nodeGroup: NodeGroup, source: string, filerel: string): CombinedChunk | null;
export {};
//# sourceMappingURL=file-grouper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"file-grouper.d.ts","sourceRoot":"","sources":["../../src/chunking/file-grouper.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEtD,UAAU,YAAY;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IAC5C,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;CACpB;AAUD,UAAU,YAAY;IACpB,IAAI,EAAE,cAAc,CAAC;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd;AAED,UAAU,aAAa;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE,cAAc,EAAE,CAAC;IACxB,QAAQ,EAAE,YAAY,EAAE,CAAC;IACzB,UAAU,EAAE,cAAc,GAAG,IAAI,CAAC;CACnC;AAED,MAAM,WAAW,SAAS;IACxB,KAAK,EAAE,cAAc,EAAE,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,aAAa,EAAE,CAAC;CAC5B;AA2KD,wBAAsB,qBAAqB,CACzC,KAAK,EAAE,cAAc,EAAE,EACvB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,YAAY,EACrB,IAAI,EAAE,YAAY,GACjB,OAAO,CAAC,SAAS,EAAE,CAAC,CAkBtB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,cAAc,GAAG;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC;IACxC,QAAQ,EAAE;QACR,OAAO,EAAE,OAAO,CAAC;QACjB,SAAS,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;CACH;AAED,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,aAAa,GAAG,IAAI,CA4B/G"}
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { batchAnalyzeCodeSize } from './token-counter.js';
|
|
2
|
+
/**
 * Derive chunk-size limits from a model profile.
 *
 * Token-based limits are used only when the profile both opts into tokens
 * (`useTokens`) and actually supplies a `tokenCounter`; in every other case
 * the character-based limits are returned.
 *
 * @param {object} profile - Model profile carrying *Tokens / *Chars limits.
 * @returns {{optimal: number, min: number, max: number, overlap: number, unit: string}}
 */
function getSizeLimits(profile) {
    const tokenBased = Boolean(profile.useTokens && profile.tokenCounter);
    if (tokenBased) {
        const { optimalTokens, minChunkTokens, maxChunkTokens, overlapTokens } = profile;
        return {
            optimal: optimalTokens,
            min: minChunkTokens,
            max: maxChunkTokens,
            overlap: overlapTokens,
            unit: 'tokens',
        };
    }
    const { optimalChars, minChunkChars, maxChunkChars, overlapChars } = profile;
    return {
        optimal: optimalChars,
        min: minChunkChars,
        max: maxChunkChars,
        overlap: overlapChars,
        unit: 'characters',
    };
}
|
|
20
|
+
/**
 * Measure each node's source slice under the profile's sizing scheme.
 *
 * When the profile provides a token counter, sizes come from
 * `batchAnalyzeCodeSize` (tokens); otherwise each size is simply the
 * slice's character length.
 *
 * @param {object[]} nodes - Tree-sitter nodes (reads startIndex/endIndex).
 * @param {string} source - Full file source the nodes index into.
 * @param {object} profile - Model profile used for sizing.
 * @returns {Promise<Array<{node: object, size: number, code: string}>>}
 */
async function batchAnalyzeNodesInternal(nodes, source, profile) {
    const codes = nodes.map((n) => source.slice(n.startIndex, n.endIndex));
    const limits = getSizeLimits(profile);
    const countTokens = Boolean(profile.useTokens && profile.tokenCounter);
    if (!countTokens) {
        return codes.map((code, i) => ({ node: nodes[i], size: code.length, code }));
    }
    const analyses = await batchAnalyzeCodeSize(codes, limits, profile.tokenCounter, false);
    return codes.map((code, i) => ({ node: nodes[i], size: analyses[i].size, code }));
}
|
|
37
|
+
// Node types that open a named scope (class/interface/module/...).
// Hoisted to module level as a Set: the old code rebuilt an array and did an
// O(n) `includes` on every call, and this runs once per top-level node.
const CONTAINER_NODE_TYPES = new Set([
    'class_declaration',
    'class_definition',
    'interface_declaration',
    'module_declaration',
    'namespace_declaration',
    'trait_declaration',
    'enum_declaration',
]);
/**
 * Report whether `node` is a container-style declaration that should form
 * its own semantic group rather than be merged with its neighbors.
 *
 * @param {object} node - Tree-sitter node; only `.type` is read.
 * @param {object} rule - Language rule (unused here; kept for signature compatibility).
 * @returns {boolean}
 */
function isContainerNode(node, rule) {
    return CONTAINER_NODE_TYPES.has(node.type);
}
|
|
49
|
+
/**
 * Partition top-level nodes, in order, into semantic groups.
 *
 * Every container declaration (class, module, ...) becomes a one-node
 * 'container' group of its own; the stretches of ordinary nodes between
 * containers become 'file_section' groups. Empty sections are dropped.
 *
 * @param {object[]} nodes - Top-level tree-sitter nodes, in file order.
 * @param {string} source - Unused here; kept for signature compatibility.
 * @param {object[]} nodeAnalyses - Parallel array of {node, size, code}.
 * @param {object} rule - Language rule forwarded to isContainerNode.
 * @returns {object[]} SemanticGroup list.
 */
function identifySemanticGroups(nodes, source, nodeAnalyses, rule) {
    const groups = [];
    const newSection = () => ({
        type: 'file_section',
        nodes: [],
        analyses: [],
        parentNode: null,
    });
    let section = newSection();
    const flushSection = () => {
        if (section.nodes.length > 0) {
            groups.push(section);
        }
        section = newSection();
    };
    nodes.forEach((node, i) => {
        const analysis = nodeAnalyses[i];
        if (!isContainerNode(node, rule)) {
            // Ordinary node: extend the current file section.
            section.nodes.push(node);
            section.analyses.push(analysis);
            return;
        }
        // Container: close out the running section, then emit the container
        // as its own single-node group.
        flushSection();
        groups.push({
            type: 'container',
            containerType: node.type,
            nodes: [node],
            analyses: [analysis],
            parentNode: node,
        });
    });
    flushSection();
    return groups;
}
|
|
89
|
+
/**
 * Pack semantic groups into combined groups near the optimal size.
 *
 * For each incoming group (sized by summing its analyses' sizes), in order:
 *  - a group already larger than `limits.optimal` is emitted on its own;
 *  - a group that would push the running combination past `limits.max`
 *    flushes the combination and starts a fresh one seeded with the group;
 *  - otherwise the group is appended, and the combination is flushed once
 *    it reaches 90% of the optimal size.
 *
 * Fix: the previous version wrote `nodes: group.nodes`, aliasing the
 * caller's array — a later `push(...)` into the running combination then
 * mutated the caller's semantic group. The node arrays are now copied, so
 * inputs are never mutated; returned values are unchanged.
 *
 * @param {object[]} semanticGroups - Groups from identifySemanticGroups.
 * @param {string} source - Unused; kept for signature compatibility.
 * @param {object} profile - Unused; kept for signature compatibility.
 * @param {{optimal: number, max: number}} limits - Size limits from getSizeLimits.
 * @returns {Promise<Array<{nodes: object[], totalSize: number, groupInfo: object[]}>>}
 */
async function combineGroupsToOptimalSize(semanticGroups, source, profile, limits) {
    const optimalGroups = [];
    let current = { nodes: [], totalSize: 0, groupInfo: [] };
    const flush = () => {
        if (current.nodes.length > 0) {
            optimalGroups.push(current);
        }
        current = { nodes: [], totalSize: 0, groupInfo: [] };
    };
    for (const group of semanticGroups) {
        const groupTotalSize = group.analyses.reduce((sum, a) => sum + a.size, 0);
        if (groupTotalSize > limits.optimal) {
            // Oversized group: emit alone, never merged.
            flush();
            optimalGroups.push({
                nodes: [...group.nodes],
                totalSize: groupTotalSize,
                groupInfo: [group],
            });
            continue;
        }
        if (current.totalSize + groupTotalSize > limits.max) {
            // Would overflow the hard cap: close the running combination and
            // seed a new one with this group (copied, not aliased).
            flush();
            current = {
                nodes: [...group.nodes],
                totalSize: groupTotalSize,
                groupInfo: [group],
            };
            continue;
        }
        current.nodes.push(...group.nodes);
        current.totalSize += groupTotalSize;
        current.groupInfo.push(group);
        if (current.totalSize >= limits.optimal * 0.9) {
            // Close enough to optimal: flush eagerly.
            flush();
        }
    }
    flush();
    return optimalGroups;
}
|
|
142
|
+
/**
 * Top-level entry: bucket a file's top-level nodes into chunk-sized groups.
 *
 * Small inputs (<= 10 nodes) skip measurement entirely and yield one group
 * per node — note totalSize is 0 on that path because nothing was sized.
 * Larger inputs are measured, split into semantic groups, and then packed
 * toward the profile's optimal chunk size.
 *
 * @param {object[]|null|undefined} nodes - Top-level tree-sitter nodes.
 * @param {string} source - Full file source.
 * @param {object} profile - Model profile used for sizing.
 * @param {object} rule - Language rule for container detection.
 * @returns {Promise<object[]>} NodeGroup list (empty for empty/missing input).
 */
export async function groupNodesForChunking(nodes, source, profile, rule) {
    if (!nodes || nodes.length === 0) {
        return [];
    }
    const limits = getSizeLimits(profile);
    if (nodes.length <= 10) {
        return nodes.map((node) => ({ nodes: [node], totalSize: 0, groupInfo: [] }));
    }
    const analyses = await batchAnalyzeNodesInternal(nodes, source, profile);
    const semanticGroups = identifySemanticGroups(nodes, source, analyses, rule);
    return combineGroupsToOptimalSize(semanticGroups, source, profile, limits);
}
|
|
158
|
+
/**
 * Merge a node group into a single chunk.
 *
 * Slices each node's source text, joins the pieces with blank lines, and
 * builds a synthetic node spanning from the first node's start to the last
 * node's end; its type encodes the member count (e.g. "fn_group_3").
 *
 * @param {object} nodeGroup - {nodes, totalSize, groupInfo} from grouping.
 * @param {string} source - Full file source the nodes index into.
 * @param {string} filerel - Unused here; kept for signature compatibility.
 * @returns {object|null} CombinedChunk, or null for an empty group.
 */
export function createCombinedChunk(nodeGroup, source, filerel) {
    const members = nodeGroup.nodes;
    if (!members || members.length === 0) {
        return null;
    }
    const pieces = members.map((n) => source.slice(n.startIndex, n.endIndex));
    const first = members[0];
    const last = members[members.length - 1];
    // NOTE(review): spreading `first` copies only own enumerable properties;
    // tree-sitter nodes may expose fields via prototype getters — confirm
    // upstream that startIndex/type survive the spread.
    const syntheticNode = {
        ...first,
        type: `${first.type}_group_${members.length}`,
        endIndex: last.endIndex,
    };
    return {
        code: pieces.join('\n\n'),
        node: syntheticNode,
        metadata: {
            isGroup: true,
            nodeCount: members.length,
            totalSize: nodeGroup.totalSize,
            groupTypes: nodeGroup.groupInfo?.map((info) => info.type) || ['combined'],
        },
    };
}
|
|
181
|
+
//# sourceMappingURL=file-grouper.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"file-grouper.js","sourceRoot":"","sources":["../../src/chunking/file-grouper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAmB,oBAAoB,EAAyB,MAAM,oBAAoB,CAAC;AAqClG,SAAS,aAAa,CAAC,OAAqB;IAC1C,IAAI,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,YAAY,EAAE,CAAC;QAC9C,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,aAAa;YAC9B,GAAG,EAAE,OAAO,CAAC,cAAc;YAC3B,GAAG,EAAE,OAAO,CAAC,cAAc;YAC3B,OAAO,EAAE,OAAO,CAAC,aAAa;YAC9B,IAAI,EAAE,QAAQ;SACf,CAAC;IACJ,CAAC;IACD,OAAO;QACL,OAAO,EAAE,OAAO,CAAC,YAAY;QAC7B,GAAG,EAAE,OAAO,CAAC,aAAa;QAC1B,GAAG,EAAE,OAAO,CAAC,aAAa;QAC1B,OAAO,EAAE,OAAO,CAAC,YAAY;QAC7B,IAAI,EAAE,YAAY;KACnB,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,yBAAyB,CAAC,KAAuB,EAAE,MAAc,EAAE,OAAqB;IACrG,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC9E,MAAM,MAAM,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,YAAY,EAAE,CAAC;QAC9C,MAAM,QAAQ,GAAG,MAAM,oBAAoB,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;QACxF,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC7B,IAAI;YACJ,IAAI,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI;YACtB,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC;SACf,CAAC,CAAC,CAAC;IACN,CAAC;IAED,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7B,IAAI;QACJ,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM;QACrB,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC;KACf,CAAC,CAAC,CAAC;AACN,CAAC;AAED,SAAS,eAAe,CAAC,IAAoB,EAAE,IAAkB;IAC/D,MAAM,cAAc,GAAG;QACrB,mBAAmB;QACnB,kBAAkB;QAClB,uBAAuB;QACvB,oBAAoB;QACpB,uBAAuB;QACvB,mBAAmB;QACnB,kBAAkB;KACnB,CAAC;IAEF,OAAO,cAAc,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,sBAAsB,CAAC,KAAuB,EAAE,MAAc,EAAE,YAA4B,EAAE,IAAkB;IACvH,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,IAAI,YAAY,GAAkB;QAChC,IAAI,EAAE,cAAc;QACpB,KAAK,EAAE,EAAE;QACT,QAAQ,EAAE,EAAE;QACZ,UAAU,EAAE,IAAI;KACjB,CAAC;IAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,QAAQ,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;QAEjC,IAAI,eAAe,CAA
C,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC;YAChC,IAAI,YAAY,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAC5B,CAAC;YAED,YAAY,GAAG;gBACb,IAAI,EAAE,WAAW;gBACjB,aAAa,EAAE,IAAI,CAAC,IAAI;gBACxB,KAAK,EAAE,CAAC,IAAI,CAAC;gBACb,QAAQ,EAAE,CAAC,QAAQ,CAAC;gBACpB,UAAU,EAAE,IAAI;aACjB,CAAC;YAEF,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAE1B,YAAY,GAAG;gBACb,IAAI,EAAE,cAAc;gBACpB,KAAK,EAAE,EAAE;gBACT,QAAQ,EAAE,EAAE;gBACZ,UAAU,EAAE,IAAI;aACjB,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,YAAY,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC9B,YAAY,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClC,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IAC5B,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,KAAK,UAAU,0BAA0B,CACvC,cAA+B,EAC/B,MAAc,EACd,OAAqB,EACrB,MAAkB;IAElB,MAAM,aAAa,GAAgB,EAAE,CAAC;IACtC,IAAI,oBAAoB,GAAc;QACpC,KAAK,EAAE,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,SAAS,EAAE,EAAE;KACd,CAAC;IAEF,KAAK,MAAM,KAAK,IAAI,cAAc,EAAE,CAAC;QACnC,MAAM,cAAc,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAE1E,IAAI,cAAc,GAAG,MAAM,CAAC,OAAO,EAAE,CAAC;YACpC,IAAI,oBAAoB,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1C,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YAC3C,CAAC;YAED,aAAa,CAAC,IAAI,CAAC;gBACjB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,SAAS,EAAE,cAAc;gBACzB,SAAS,EAAE,CAAC,KAAK,CAAC;aACnB,CAAC,CAAC;YAEH,oBAAoB,GAAG;gBACrB,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,CAAC;gBACZ,SAAS,EAAE,EAAE;aACd,CAAC;YACF,SAAS;QACX,CAAC;QAED,IAAI,oBAAoB,CAAC,SAAS,GAAG,cAAc,GAAG,MAAM,CAAC,GAAG,EAAE,CAAC;YACjE,IAAI,oBAAoB,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1C,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YAC3C,CAAC;YAED,oBAAoB,GAAG;gBACrB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,SAAS,EAAE,cAAc;gBACzB,SAAS,EAAE,CAAC,KAAK,CAAC;aACnB,CAAC;YACF,SAAS;QACX,CAAC;QAED,oBAAoB,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC;QAChD,oBAAoB,CAAC,SAAS,IAAI,cAAc,CAAC;QACjD,oBAAoB,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAE3C,IAAI,oBAAoB,CAAC,SAAS,I
AAI,MAAM,CAAC,OAAO,GAAG,GAAG,EAAE,CAAC;YAC3D,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YACzC,oBAAoB,GAAG;gBACrB,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,CAAC;gBACZ,SAAS,EAAE,EAAE;aACd,CAAC;QACJ,CAAC;IACH,CAAC;IAED,IAAI,oBAAoB,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1C,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,aAAa,CAAC;AACvB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,KAAuB,EACvB,MAAc,EACd,OAAqB,EACrB,IAAkB;IAElB,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE5C,MAAM,MAAM,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;IAEtC,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;QACvB,OAAO,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACxB,KAAK,EAAE,CAAC,IAAI,CAAC;YACb,SAAS,EAAE,CAAC;YACZ,SAAS,EAAE,EAAE;SACd,CAAC,CAAC,CAAC;IACN,CAAC;IAED,MAAM,YAAY,GAAG,MAAM,yBAAyB,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;IAC7E,MAAM,cAAc,GAAG,sBAAsB,CAAC,KAAK,EAAE,MAAM,EAAE,YAAY,EAAE,IAAI,CAAC,CAAC;IACjF,MAAM,aAAa,GAAG,MAAM,0BAA0B,CAAC,cAAc,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;IAEhG,OAAO,aAAa,CAAC;AACvB,CAAC;AAaD,MAAM,UAAU,mBAAmB,CAAC,SAAoB,EAAE,MAAc,EAAE,OAAe;IACvF,IAAI,CAAC,SAAS,CAAC,KAAK,IAAI,SAAS,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CACvC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,QAAQ,CAAC,CAC7C,CAAC;IAEF,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAExC,MAAM,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACrC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE7D,OAAO;QACL,IAAI,EAAE,YAAY;QAClB,IAAI,EAAE;YACJ,GAAG,SAAS;YACZ,IAAI,EAAE,GAAG,SAAS,CAAC,IAAI,UAAU,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE;YACzD,QAAQ,EAAE,QAAQ,CAAC,QAAQ;SAC5B;QACD,QAAQ,EAAE;YACR,OAAO,EAAE,IAAI;YACb,SAAS,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM;YACjC,SAAS,EAAE,SAAS,CAAC,SAAS;YAC9B,UAAU,EAAE,SAAS,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;SAClE;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { ModelProfile } from '../providers/base.js';
|
|
2
|
+
import type { TreeSitterNode } from '../types/ast.js';
|
|
3
|
+
interface LanguageRule {
|
|
4
|
+
subdivisionTypes?: Record<string, string[]>;
|
|
5
|
+
[key: string]: any;
|
|
6
|
+
}
|
|
7
|
+
export declare function findSemanticSubdivisions(node: TreeSitterNode, rule: LanguageRule): TreeSitterNode[];
|
|
8
|
+
export declare function findLastCompleteBoundary(code: string, maxSize: number): number;
|
|
9
|
+
export declare function extractSignature(node: TreeSitterNode, source: string): string;
|
|
10
|
+
export declare function extractLinesBeforeNode(node: TreeSitterNode, source: string, numLines: number): string;
|
|
11
|
+
export declare function extractParentContext(node: TreeSitterNode, source: string): {
|
|
12
|
+
signature: string;
|
|
13
|
+
startLine: number;
|
|
14
|
+
endLine: number;
|
|
15
|
+
};
|
|
16
|
+
export declare function getLineNumber(byteOffset: number, source: string): number;
|
|
17
|
+
export interface NodeAnalysis {
|
|
18
|
+
isSingleChunk: boolean;
|
|
19
|
+
needsSubdivision: boolean;
|
|
20
|
+
subdivisionCandidates: TreeSitterNode[];
|
|
21
|
+
size: number;
|
|
22
|
+
unit: string;
|
|
23
|
+
method: string;
|
|
24
|
+
estimatedSubchunks: number;
|
|
25
|
+
}
|
|
26
|
+
export declare function analyzeNodeForChunking(node: TreeSitterNode, source: string, rule: LanguageRule, profile: ModelProfile): Promise<NodeAnalysis>;
|
|
27
|
+
export declare function batchAnalyzeNodes(nodes: TreeSitterNode[], source: string, rule: LanguageRule, profile: ModelProfile, isSubdivision?: boolean): Promise<Array<NodeAnalysis & {
|
|
28
|
+
node: TreeSitterNode;
|
|
29
|
+
}>>;
|
|
30
|
+
export interface StatementChunk {
|
|
31
|
+
code: string;
|
|
32
|
+
size: number;
|
|
33
|
+
unit: string;
|
|
34
|
+
}
|
|
35
|
+
export declare function yieldStatementChunks(node: TreeSitterNode, source: string, maxSize: number, overlapSize: number, profile: ModelProfile): Promise<StatementChunk[]>;
|
|
36
|
+
export {};
|
|
37
|
+
//# sourceMappingURL=semantic-chunker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semantic-chunker.d.ts","sourceRoot":"","sources":["../../src/chunking/semantic-chunker.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEtD,UAAU,YAAY;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IAC5C,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;CACpB;AAUD,wBAAgB,wBAAwB,CAAC,IAAI,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,GAAG,cAAc,EAAE,CAwBnG;AAED,wBAAgB,wBAAwB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAgB9E;AAED,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,CAO7E;AAED,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,CAIrG;AAED,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,GAAG;IAC1E,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;CACjB,CAMA;AAED,wBAAgB,aAAa,CAAC,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,CAGxE;AAqBD,MAAM,WAAW,YAAY;IAC3B,aAAa,EAAE,OAAO,CAAC;IACvB,gBAAgB,EAAE,OAAO,CAAC;IAC1B,qBAAqB,EAAE,cAAc,EAAE,CAAC;IACxC,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,kBAAkB,EAAE,MAAM,CAAC;CAC5B;AAED,wBAAsB,sBAAsB,CAC1C,IAAI,EAAE,cAAc,EACpB,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,YAAY,EAClB,OAAO,EAAE,YAAY,GACpB,OAAO,CAAC,YAAY,CAAC,CA2BvB;AAED,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,cAAc,EAAE,EACvB,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,YAAY,EAClB,OAAO,EAAE,YAAY,EACrB,aAAa,UAAQ,GACpB,OAAO,CAAC,KAAK,CAAC,YAAY,GAAG;IAAE,IAAI,EAAE,cAAc,CAAA;CAAE,CAAC,CAAC,CAgCzD;AAED,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd;AAED,wBAAsB,oBAAoB,CACxC,IAAI,EAAE,cAAc,EACpB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,EACf,WAAW,EAAE,MAAM,EACnB,OAAO,EAAE,YAAY,GACpB,OAAO,CAAC,cAAc,EAAE,CAAC,CAwC3B"}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { analyzeCodeSize, batchAnalyzeCodeSize } from './token-counter.js';
|
|
2
|
+
export function findSemanticSubdivisions(node, rule) {
  // Without a node or a language rule there is nothing to subdivide.
  if (!node || !rule) {
    return [];
  }
  // The rule maps a parent node type to the child node types that may serve
  // as subdivision points for it.
  const targetTypes = rule.subdivisionTypes?.[node.type] || [];
  if (targetTypes.length === 0) {
    return [];
  }
  // Iterative pre-order walk. The root itself is never a candidate (only
  // strictly-descendant nodes match), so seed the stack with the root's
  // direct children — reversed, so popping preserves left-to-right order.
  const found = [];
  const stack = [];
  for (let i = node.childCount - 1; i >= 0; i--) {
    const child = node.child(i);
    if (child) {
      stack.push(child);
    }
  }
  while (stack.length > 0) {
    const current = stack.pop();
    if (targetTypes.includes(current.type)) {
      // Matched: record it and do not descend into this subtree.
      found.push(current);
      continue;
    }
    for (let i = current.childCount - 1; i >= 0; i--) {
      const child = current.child(i);
      if (child) {
        stack.push(child);
      }
    }
  }
  return found;
}
|
|
24
|
+
export function findLastCompleteBoundary(code, maxSize) {
  // Find the latest "safe" cut point within the first maxSize characters.
  // Candidate boundaries, strongest first: a closing brace on its own line,
  // then a statement-terminating semicolon, then any blank line end.
  const window = code.substring(0, maxSize);
  const patterns = [/\n\s*}\s*$/gm, /;\s*$/gm, /\n\s*$/gm];
  for (const pattern of patterns) {
    let last = null;
    for (const match of window.matchAll(pattern)) {
      last = match;
    }
    if (last !== null) {
      // Cut just after the end of the last boundary match.
      return last.index + last[0].length;
    }
  }
  // No boundary of any kind found: fall back to a hard cut at maxSize.
  return maxSize;
}
|
|
39
|
+
export function extractSignature(node, source) {
  // A node's "signature" is everything up to (and including) its opening
  // brace, trimmed; brace-less nodes fall back to their first source line.
  const text = source.slice(node.startIndex, node.endIndex);
  const braceAt = text.indexOf('{');
  if (braceAt === -1) {
    return text.split('\n')[0];
  }
  return `${text.substring(0, braceAt).trim()} {`;
}
|
|
47
|
+
export function extractLinesBeforeNode(node, source, numLines) {
  // Return the last `numLines` lines of source text that precede the node
  // (e.g. to pick up leading comments/context for a chunk).
  //
  // BUG FIX: the original did `lines.slice(-numLines)` unconditionally, and
  // since slice(-0) === slice(0), asking for 0 lines returned ALL preceding
  // lines instead of none. Guard explicitly.
  if (numLines <= 0) {
    return '';
  }
  const beforeCode = source.substring(0, node.startIndex);
  const lines = beforeCode.split('\n');
  return lines.slice(-numLines).join('\n');
}
|
|
52
|
+
export function extractParentContext(node, source) {
  // Summarize an enclosing node for sub-chunk metadata: its signature plus
  // the 1-based line span it covers in the source.
  const signature = extractSignature(node, source);
  const startLine = getLineNumber(node.startIndex, source);
  const endLine = getLineNumber(node.endIndex, source);
  return { signature, startLine, endLine };
}
|
|
59
|
+
export function getLineNumber(byteOffset, source) {
  // 1-based line number at the given offset: one plus the number of
  // newline characters strictly before it.
  let line = 1;
  for (let i = 0; i < byteOffset && i < source.length; i++) {
    if (source.charCodeAt(i) === 10) {
      line++;
    }
  }
  return line;
}
|
|
63
|
+
function getSizeLimits(profile) {
  // Resolve the active chunk-size limits from the model profile. Token
  // limits apply only when the profile both opts in (useTokens) and actually
  // supplies a tokenCounter; otherwise character limits are used.
  const useTokenLimits = Boolean(profile.useTokens && profile.tokenCounter);
  const [optimal, min, max, overlap, unit] = useTokenLimits
    ? [profile.optimalTokens, profile.minChunkTokens, profile.maxChunkTokens, profile.overlapTokens, 'tokens']
    : [profile.optimalChars, profile.minChunkChars, profile.maxChunkChars, profile.overlapChars, 'characters'];
  return { optimal, min, max, overlap, unit };
}
|
|
81
|
+
export async function analyzeNodeForChunking(node, source, rule, profile) {
  // Measure one AST node and decide whether it fits in a single chunk or
  // must be subdivided along the rule's semantic boundaries.
  const code = source.slice(node.startIndex, node.endIndex);
  const limits = getSizeLimits(profile);

  // Token counting applies only when the profile opts in AND provides a
  // counter; otherwise plain character length is the size metric.
  let size;
  let method;
  if (profile.useTokens && profile.tokenCounter) {
    ({ size, method } = await analyzeCodeSize(code, limits, profile.tokenCounter));
  } else {
    size = code.length;
    method = 'chars';
  }

  const fitsInOneChunk = size <= limits.max;
  return {
    isSingleChunk: fitsInOneChunk,
    needsSubdivision: !fitsInOneChunk,
    subdivisionCandidates: findSemanticSubdivisions(node, rule),
    size,
    unit: limits.unit,
    method,
    // Rough count of optimal-sized chunks this node would produce.
    estimatedSubchunks: Math.ceil(size / limits.optimal),
  };
}
|
|
106
|
+
export async function batchAnalyzeNodes(nodes, source, rule, profile, isSubdivision = false) {
  // Analyze many nodes at once so token counting can be batched in a single
  // call instead of one request per node.
  const limits = getSizeLimits(profile);
  const snippets = nodes.map((n) => source.slice(n.startIndex, n.endIndex));

  // Size each snippet: batched token counting when available, otherwise
  // plain character length (only `size` and `method` are consumed below).
  let measured;
  if (profile.useTokens && profile.tokenCounter) {
    measured = await batchAnalyzeCodeSize(snippets, limits, profile.tokenCounter, isSubdivision);
  } else {
    measured = snippets.map((snippet) => ({ size: snippet.length, method: 'chars' }));
  }

  return nodes.map((node, i) => {
    const { size, method } = measured[i];
    const withinLimit = size <= limits.max;
    return {
      node,
      isSingleChunk: withinLimit,
      needsSubdivision: !withinLimit,
      subdivisionCandidates: findSemanticSubdivisions(node, rule),
      size,
      unit: limits.unit,
      method,
      estimatedSubchunks: Math.ceil(size / limits.optimal),
    };
  });
}
|
|
138
|
+
export async function yieldStatementChunks(node, source, maxSize, overlapSize, profile) {
  // Fallback chunker: split a node's text line-by-line when no semantic
  // subdivision is possible. Emits chunks of at most ~maxSize, carrying
  // roughly 20% of each chunk's trailing lines into the next as overlap.
  //
  // NOTE(review): `overlapSize` is currently unused — the overlap is derived
  // from line count (20%) instead; the parameter is kept for interface
  // compatibility. Confirm against callers before removing.
  const code = source.slice(node.startIndex, node.endIndex);
  const lines = code.split('\n');
  const chunks = [];
  // Measures text in tokens when a counter is configured, else characters.
  const measure = async (text) => (profile.useTokens && profile.tokenCounter
    ? await profile.tokenCounter(text)
    : text.length);
  const unit = profile.useTokens ? 'tokens' : 'characters';

  let currentChunk = [];
  let currentSize = 0;
  for (const line of lines) {
    const lineSize = await measure(line);
    if (currentSize + lineSize > maxSize && currentChunk.length > 0) {
      chunks.push({
        code: currentChunk.join('\n'),
        size: currentSize,
        unit
      });
      // BUG FIX: the original did `currentChunk.slice(-overlapLines)`
      // unconditionally; when overlapLines was 0 (chunks under 5 lines),
      // slice(-0) === slice(0) kept the ENTIRE previous chunk, duplicating
      // its content into every subsequent chunk. Reset to empty instead.
      const overlapLines = Math.floor(currentChunk.length * 0.2);
      currentChunk = overlapLines > 0 ? currentChunk.slice(-overlapLines) : [];
      currentSize = currentChunk.length > 0
        ? await measure(currentChunk.join('\n'))
        : 0;
    }
    currentChunk.push(line);
    currentSize += lineSize;
  }

  if (currentChunk.length > 0) {
    chunks.push({
      code: currentChunk.join('\n'),
      size: currentSize,
      unit
    });
  }
  return chunks;
}
|
|
172
|
+
//# sourceMappingURL=semantic-chunker.js.map
|