codevault 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/.env.example +37 -0
  2. package/LICENSE +21 -0
  3. package/README.md +174 -0
  4. package/dist/chunking/file-grouper.d.ts +39 -0
  5. package/dist/chunking/file-grouper.d.ts.map +1 -0
  6. package/dist/chunking/file-grouper.js +181 -0
  7. package/dist/chunking/file-grouper.js.map +1 -0
  8. package/dist/chunking/semantic-chunker.d.ts +37 -0
  9. package/dist/chunking/semantic-chunker.d.ts.map +1 -0
  10. package/dist/chunking/semantic-chunker.js +172 -0
  11. package/dist/chunking/semantic-chunker.js.map +1 -0
  12. package/dist/chunking/token-counter.d.ts +28 -0
  13. package/dist/chunking/token-counter.d.ts.map +1 -0
  14. package/dist/chunking/token-counter.js +207 -0
  15. package/dist/chunking/token-counter.js.map +1 -0
  16. package/dist/cli/commands/context.d.ts +3 -0
  17. package/dist/cli/commands/context.d.ts.map +1 -0
  18. package/dist/cli/commands/context.js +98 -0
  19. package/dist/cli/commands/context.js.map +1 -0
  20. package/dist/cli.d.ts +3 -0
  21. package/dist/cli.d.ts.map +1 -0
  22. package/dist/cli.js +222 -0
  23. package/dist/cli.js.map +1 -0
  24. package/dist/codemap/io.d.ts +5 -0
  25. package/dist/codemap/io.d.ts.map +1 -0
  26. package/dist/codemap/io.js +30 -0
  27. package/dist/codemap/io.js.map +1 -0
  28. package/dist/context/packs.d.ts +33 -0
  29. package/dist/context/packs.d.ts.map +1 -0
  30. package/dist/context/packs.js +180 -0
  31. package/dist/context/packs.js.map +1 -0
  32. package/dist/core/indexer.d.ts +3 -0
  33. package/dist/core/indexer.d.ts.map +1 -0
  34. package/dist/core/indexer.js +610 -0
  35. package/dist/core/indexer.js.map +1 -0
  36. package/dist/core/metadata.d.ts +19 -0
  37. package/dist/core/metadata.d.ts.map +1 -0
  38. package/dist/core/metadata.js +161 -0
  39. package/dist/core/metadata.js.map +1 -0
  40. package/dist/core/search.d.ts +6 -0
  41. package/dist/core/search.d.ts.map +1 -0
  42. package/dist/core/search.js +482 -0
  43. package/dist/core/search.js.map +1 -0
  44. package/dist/core/symbol-extractor.d.ts +3 -0
  45. package/dist/core/symbol-extractor.d.ts.map +1 -0
  46. package/dist/core/symbol-extractor.js +78 -0
  47. package/dist/core/symbol-extractor.js.map +1 -0
  48. package/dist/core/types.d.ts +96 -0
  49. package/dist/core/types.d.ts.map +1 -0
  50. package/dist/core/types.js +2 -0
  51. package/dist/core/types.js.map +1 -0
  52. package/dist/database/db.d.ts +63 -0
  53. package/dist/database/db.d.ts.map +1 -0
  54. package/dist/database/db.js +205 -0
  55. package/dist/database/db.js.map +1 -0
  56. package/dist/indexer/merkle.d.ts +13 -0
  57. package/dist/indexer/merkle.d.ts.map +1 -0
  58. package/dist/indexer/merkle.js +86 -0
  59. package/dist/indexer/merkle.js.map +1 -0
  60. package/dist/indexer/update.d.ts +19 -0
  61. package/dist/indexer/update.d.ts.map +1 -0
  62. package/dist/indexer/update.js +40 -0
  63. package/dist/indexer/update.js.map +1 -0
  64. package/dist/indexer/watch.d.ts +21 -0
  65. package/dist/indexer/watch.d.ts.map +1 -0
  66. package/dist/indexer/watch.js +204 -0
  67. package/dist/indexer/watch.js.map +1 -0
  68. package/dist/languages/rules.d.ts +11 -0
  69. package/dist/languages/rules.d.ts.map +1 -0
  70. package/dist/languages/rules.js +371 -0
  71. package/dist/languages/rules.js.map +1 -0
  72. package/dist/languages/tree-sitter-loader.d.ts +27 -0
  73. package/dist/languages/tree-sitter-loader.d.ts.map +1 -0
  74. package/dist/languages/tree-sitter-loader.js +75 -0
  75. package/dist/languages/tree-sitter-loader.js.map +1 -0
  76. package/dist/mcp/tools/use-context-pack.d.ts +57 -0
  77. package/dist/mcp/tools/use-context-pack.d.ts.map +1 -0
  78. package/dist/mcp/tools/use-context-pack.js +91 -0
  79. package/dist/mcp/tools/use-context-pack.js.map +1 -0
  80. package/dist/mcp-server.d.ts +3 -0
  81. package/dist/mcp-server.d.ts.map +1 -0
  82. package/dist/mcp-server.js +363 -0
  83. package/dist/mcp-server.js.map +1 -0
  84. package/dist/providers/base.d.ts +34 -0
  85. package/dist/providers/base.d.ts.map +1 -0
  86. package/dist/providers/base.js +160 -0
  87. package/dist/providers/base.js.map +1 -0
  88. package/dist/providers/index.d.ts +6 -0
  89. package/dist/providers/index.d.ts.map +1 -0
  90. package/dist/providers/index.js +22 -0
  91. package/dist/providers/index.js.map +1 -0
  92. package/dist/providers/ollama.d.ts +13 -0
  93. package/dist/providers/ollama.d.ts.map +1 -0
  94. package/dist/providers/ollama.js +50 -0
  95. package/dist/providers/ollama.js.map +1 -0
  96. package/dist/providers/openai.d.ts +13 -0
  97. package/dist/providers/openai.d.ts.map +1 -0
  98. package/dist/providers/openai.js +59 -0
  99. package/dist/providers/openai.js.map +1 -0
  100. package/dist/providers/token-counter.d.ts +2 -0
  101. package/dist/providers/token-counter.d.ts.map +1 -0
  102. package/dist/providers/token-counter.js +18 -0
  103. package/dist/providers/token-counter.js.map +1 -0
  104. package/dist/ranking/api-reranker.d.ts +18 -0
  105. package/dist/ranking/api-reranker.d.ts.map +1 -0
  106. package/dist/ranking/api-reranker.js +148 -0
  107. package/dist/ranking/api-reranker.js.map +1 -0
  108. package/dist/ranking/symbol-boost.d.ts +15 -0
  109. package/dist/ranking/symbol-boost.d.ts.map +1 -0
  110. package/dist/ranking/symbol-boost.js +154 -0
  111. package/dist/ranking/symbol-boost.js.map +1 -0
  112. package/dist/search/bm25.d.ts +17 -0
  113. package/dist/search/bm25.d.ts.map +1 -0
  114. package/dist/search/bm25.js +56 -0
  115. package/dist/search/bm25.js.map +1 -0
  116. package/dist/search/hybrid.d.ts +21 -0
  117. package/dist/search/hybrid.d.ts.map +1 -0
  118. package/dist/search/hybrid.js +50 -0
  119. package/dist/search/hybrid.js.map +1 -0
  120. package/dist/search/scope.d.ts +5 -0
  121. package/dist/search/scope.d.ts.map +1 -0
  122. package/dist/search/scope.js +107 -0
  123. package/dist/search/scope.js.map +1 -0
  124. package/dist/storage/encrypted-chunks.d.ts +40 -0
  125. package/dist/storage/encrypted-chunks.d.ts.map +1 -0
  126. package/dist/storage/encrypted-chunks.js +236 -0
  127. package/dist/storage/encrypted-chunks.js.map +1 -0
  128. package/dist/symbols/extract.d.ts +15 -0
  129. package/dist/symbols/extract.d.ts.map +1 -0
  130. package/dist/symbols/extract.js +187 -0
  131. package/dist/symbols/extract.js.map +1 -0
  132. package/dist/symbols/graph.d.ts +3 -0
  133. package/dist/symbols/graph.d.ts.map +1 -0
  134. package/dist/symbols/graph.js +89 -0
  135. package/dist/symbols/graph.js.map +1 -0
  136. package/dist/types/ast.d.ts +3 -0
  137. package/dist/types/ast.d.ts.map +1 -0
  138. package/dist/types/ast.js +2 -0
  139. package/dist/types/ast.js.map +1 -0
  140. package/dist/types/codemap.d.ts +58 -0
  141. package/dist/types/codemap.d.ts.map +1 -0
  142. package/dist/types/codemap.js +224 -0
  143. package/dist/types/codemap.js.map +1 -0
  144. package/dist/types/context-pack.d.ts +47 -0
  145. package/dist/types/context-pack.d.ts.map +1 -0
  146. package/dist/types/context-pack.js +44 -0
  147. package/dist/types/context-pack.js.map +1 -0
  148. package/dist/types/search.d.ts +15 -0
  149. package/dist/types/search.d.ts.map +1 -0
  150. package/dist/types/search.js +11 -0
  151. package/dist/types/search.js.map +1 -0
  152. package/dist/utils/rate-limiter.d.ts +31 -0
  153. package/dist/utils/rate-limiter.d.ts.map +1 -0
  154. package/dist/utils/rate-limiter.js +168 -0
  155. package/dist/utils/rate-limiter.js.map +1 -0
  156. package/package.json +92 -0
package/.env.example ADDED
@@ -0,0 +1,37 @@
1
+ # OpenAI / Nebius API Configuration
2
+ # For Nebius AI (Qwen embeddings), use your Nebius JWT token and base URL
3
+ OPENAI_API_KEY=your-api-key-here
4
+ OPENAI_BASE_URL=https://api.openai.com/v1
5
+
6
+ # Embedding Model Configuration
7
+ # For standard OpenAI: text-embedding-3-large
8
+ # For Nebius/Qwen: Qwen/Qwen3-Embedding-8B
9
+ CODEVAULT_OPENAI_EMBEDDING_MODEL=text-embedding-3-large
10
+
11
+ # For Ollama (local embeddings)
12
+ CODEVAULT_OLLAMA_MODEL=nomic-embed-text
13
+
14
+ # Chunking Configuration
15
+ # Standard: 8192 tokens, Qwen: 32000 tokens
16
+ CODEVAULT_MAX_TOKENS=8192
17
+
18
+ # Embedding Dimensions
19
+ # OpenAI text-embedding-3-large: 3072
20
+ # Qwen3-Embedding-8B: 4096
21
+ # Ollama nomic-embed-text: 768
22
+ CODEVAULT_DIMENSIONS=3072
23
+
24
+ # Rate Limiting
25
+ # Adjust based on your API tier
26
+ CODEVAULT_RATE_LIMIT_RPM=10000
27
+ CODEVAULT_RATE_LIMIT_TPM=600000
28
+
29
+ # Reranking API (Optional)
30
+ # For NVIDIA reranking service
31
+ CODEVAULT_RERANK_API_URL=https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking
32
+ CODEVAULT_RERANK_API_KEY=your-nvidia-api-key-here
33
+ CODEVAULT_RERANK_MODEL=nvidia/nv-rerankqa-mistral-4b-v3
34
+
35
+ # Encryption (Optional)
36
+ # 32-byte key encoded as base64 or hex
37
+ # CODEVAULT_ENCRYPTION_KEY=your-encryption-key-here
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shariq Riaz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,174 @@
1
+ # CodeVault
2
+
3
+ > AI-powered semantic code search via Model Context Protocol (MCP)
4
+
5
+ CodeVault is an intelligent code indexing and search system that enables AI assistants to understand and navigate your codebase using semantic search, symbol-aware ranking, and hybrid retrieval techniques.
6
+
7
+ ## 🌟 Features
8
+
9
+ - **🔍 Semantic Code Search**: Find code by meaning, not just keywords
10
+ - **🤖 MCP Integration**: Native support for Claude Desktop and other MCP clients
11
+ - **🎯 Symbol-Aware Ranking**: Boost results based on function signatures and relationships
12
+ - **⚡ Hybrid Search**: Combines vector embeddings with BM25 keyword matching
13
+ - **🔄 Context Packs**: Save and reuse search scopes for different projects
14
+ - **📊 Multi-Language Support**: 25+ programming languages via Tree-sitter
15
+ - **🏠 Local-First**: Works with local models (Ollama) or cloud APIs
16
+ - **🔐 Optional Encryption**: Secure your indexed code chunks
17
+ - **📈 Smart Chunking**: Token-aware code splitting for optimal context
18
+
19
+ ## 🚀 Quick Start
20
+
21
+ ### Installation
22
+
23
+ ```bash
24
+ npm install
25
+ npm run build
26
+ npm link
27
+ ```
28
+
29
+ ### Index Your Project
30
+
31
+ ```bash
32
+ # Navigate to your project
33
+ cd /path/to/your/project
34
+
35
+ # Using Ollama (local, no API key required)
36
+ codevault index --provider ollama
37
+
38
+ # Using OpenAI
39
+ export OPENAI_API_KEY=your-key-here
40
+ codevault index --provider openai
41
+
42
+ # Using Qwen (via Nebius AI Studio)
43
+ export OPENAI_API_KEY=your-nebius-api-key
44
+ export OPENAI_BASE_URL=https://api.studio.nebius.com/v1/
45
+ export CODEVAULT_OPENAI_EMBEDDING_MODEL=Qwen/Qwen3-Embedding-8B
46
+ codevault index --provider openai
47
+ ```
48
+
49
+ ### Search Your Code
50
+
51
+ ```bash
52
+ codevault search "authentication function"
53
+ codevault search "stripe checkout" --tags stripe --lang php
54
+ ```
55
+
56
+ ### Use with Claude Desktop
57
+
58
+ Add to your `claude_desktop_config.json`:
59
+
60
+ ```json
61
+ {
62
+ "mcpServers": {
63
+ "codevault": {
64
+ "command": "node",
65
+ "args": ["/path/to/codevault-v2/dist/mcp-server.js"],
66
+ "env": {
67
+ "OPENAI_API_KEY": "your-api-key-here"
68
+ }
69
+ }
70
+ }
71
+ }
72
+ ```
73
+
74
+ ## 📖 Documentation
75
+
76
+ ### Supported Languages
77
+
78
+ - **Web**: JavaScript, TypeScript, TSX, HTML, CSS, JSON, Markdown
79
+ - **Backend**: Python, PHP, Go, Java, Kotlin, C#, Ruby, Scala, Swift
80
+ - **Systems**: C, C++, Rust
81
+ - **Functional**: Haskell, OCaml, Elixir
82
+ - **Scripting**: Bash, Lua
83
+
84
+ ### Embedding Providers
85
+
86
+ | Provider | Model | Dimensions | Context | Best For | API Key Required |
87
+ |----------|-------|------------|---------|----------|------------------|
88
+ | **ollama** | nomic-embed-text | 768 | 8K | Local, no API costs | ❌ No |
89
+ | **openai** | text-embedding-3-large | 3072 | 8K | Highest quality | ✅ Yes |
90
+ | **openai** | Qwen/Qwen3-Embedding-8B | 4096 | 32K | Large context, high quality | ✅ Yes (Nebius AI) |
91
+
92
+ ### CLI Commands
93
+
94
+ ```bash
95
+ # Indexing
96
+ codevault index [path] # Index project
97
+ codevault update [path] # Update existing index
98
+ codevault watch [path] # Watch for changes
99
+
100
+ # Searching
101
+ codevault search <query> [path] # Search code
102
+ --limit <num> # Max results
103
+ --provider <name> # Embedding provider
104
+ --path_glob <pattern> # Filter by file pattern
105
+ --tags <tag...> # Filter by tags
106
+ --lang <language...> # Filter by language
107
+
108
+ # Context Packs
109
+ codevault context list # List saved contexts
110
+ codevault context show <name> # Show context pack
111
+ codevault context use <name> # Activate context pack
112
+
113
+ # Utilities
114
+ codevault info # Project statistics
115
+ codevault mcp # Start MCP server
116
+ ```
117
+
118
+ ### MCP Tools
119
+
120
+ - **`search_code`**: Semantic code search with filters
121
+ - **`search_code_with_chunks`**: Search + retrieve full code
122
+ - **`get_code_chunk`**: Get specific code by SHA
123
+ - **`index_project`**: Index a new project
124
+ - **`update_project`**: Update existing index
125
+ - **`get_project_stats`**: Project overview
126
+ - **`use_context_pack`**: Apply saved search context
127
+
128
+ ### Environment Variables
129
+
130
+ ```bash
131
+ # OpenAI Configuration
132
+ OPENAI_API_KEY=sk-...
133
+ OPENAI_BASE_URL=https://api.openai.com/v1
134
+ CODEVAULT_OPENAI_EMBEDDING_MODEL=text-embedding-3-large
135
+
136
+ # Ollama Configuration
137
+ CODEVAULT_OLLAMA_MODEL=nomic-embed-text
138
+
139
+ # Chunking
140
+ CODEVAULT_MAX_TOKENS=8192
141
+ CODEVAULT_DIMENSIONS=3072
142
+
143
+ # Rate Limiting
144
+ CODEVAULT_RATE_LIMIT_RPM=10000 # Requests per minute
145
+ CODEVAULT_RATE_LIMIT_TPM=600000 # Tokens per minute
146
+
147
+ # Reranking
148
+ CODEVAULT_RERANK_API_URL=...
149
+ CODEVAULT_RERANK_API_KEY=...
150
+ CODEVAULT_RERANK_MODEL=...
151
+
152
+ # Encryption
153
+ CODEVAULT_ENCRYPTION_KEY=...
154
+ ```
155
+
156
+ ## 🏗️ Architecture
157
+
158
+ ### Project Structure
159
+
160
+ ```
161
+ .codevault/
162
+ ├── codevault.db # SQLite database
163
+ ├── chunks/ # Compressed code chunks
164
+ └── contextpacks/ # Saved search contexts
165
+ codevault.codemap.json # Lightweight index
166
+ ```
167
+
168
+ ## 📄 License
169
+
170
+ MIT License
171
+
172
+ ---
173
+
174
+ **Built by Shariq Riaz**
@@ -0,0 +1,39 @@
1
+ import type { ModelProfile } from '../providers/base.js';
2
+ import type { TreeSitterNode } from '../types/ast.js';
3
+ interface LanguageRule {
4
+ subdivisionTypes?: Record<string, string[]>;
5
+ [key: string]: any;
6
+ }
7
+ interface NodeAnalysis {
8
+ node: TreeSitterNode;
9
+ size: number;
10
+ code: string;
11
+ }
12
+ interface SemanticGroup {
13
+ type: string;
14
+ containerType?: string;
15
+ nodes: TreeSitterNode[];
16
+ analyses: NodeAnalysis[];
17
+ parentNode: TreeSitterNode | null;
18
+ }
19
+ export interface NodeGroup {
20
+ nodes: TreeSitterNode[];
21
+ totalSize: number;
22
+ groupInfo: SemanticGroup[];
23
+ }
24
+ export declare function groupNodesForChunking(nodes: TreeSitterNode[], source: string, profile: ModelProfile, rule: LanguageRule): Promise<NodeGroup[]>;
25
+ export interface CombinedChunk {
26
+ code: string;
27
+ node: TreeSitterNode & {
28
+ type: string;
29
+ };
30
+ metadata: {
31
+ isGroup: boolean;
32
+ nodeCount: number;
33
+ totalSize: number;
34
+ groupTypes: string[];
35
+ };
36
+ }
37
+ export declare function createCombinedChunk(nodeGroup: NodeGroup, source: string, filerel: string): CombinedChunk | null;
38
+ export {};
39
+ //# sourceMappingURL=file-grouper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"file-grouper.d.ts","sourceRoot":"","sources":["../../src/chunking/file-grouper.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEtD,UAAU,YAAY;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IAC5C,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;CACpB;AAUD,UAAU,YAAY;IACpB,IAAI,EAAE,cAAc,CAAC;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd;AAED,UAAU,aAAa;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE,cAAc,EAAE,CAAC;IACxB,QAAQ,EAAE,YAAY,EAAE,CAAC;IACzB,UAAU,EAAE,cAAc,GAAG,IAAI,CAAC;CACnC;AAED,MAAM,WAAW,SAAS;IACxB,KAAK,EAAE,cAAc,EAAE,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,aAAa,EAAE,CAAC;CAC5B;AA2KD,wBAAsB,qBAAqB,CACzC,KAAK,EAAE,cAAc,EAAE,EACvB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,YAAY,EACrB,IAAI,EAAE,YAAY,GACjB,OAAO,CAAC,SAAS,EAAE,CAAC,CAkBtB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,cAAc,GAAG;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC;IACxC,QAAQ,EAAE;QACR,OAAO,EAAE,OAAO,CAAC;QACjB,SAAS,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,EAAE,CAAC;KACtB,CAAC;CACH;AAED,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,aAAa,GAAG,IAAI,CA4B/G"}
@@ -0,0 +1,181 @@
1
+ import { batchAnalyzeCodeSize } from './token-counter.js';
2
/**
 * Resolve the chunk-size budget for a model profile.
 *
 * Token-based limits are used when the profile both opts into tokens and
 * supplies a token counter; otherwise character-based limits apply.
 *
 * @param {object} profile - Model profile carrying *Tokens / *Chars limits.
 * @returns {{optimal: number, min: number, max: number, overlap: number, unit: string}}
 */
function getSizeLimits(profile) {
    const useTokenLimits = Boolean(profile.useTokens && profile.tokenCounter);
    return useTokenLimits
        ? {
            optimal: profile.optimalTokens,
            min: profile.minChunkTokens,
            max: profile.maxChunkTokens,
            overlap: profile.overlapTokens,
            unit: 'tokens'
        }
        : {
            optimal: profile.optimalChars,
            min: profile.minChunkChars,
            max: profile.maxChunkChars,
            overlap: profile.overlapChars,
            unit: 'characters'
        };
}
20
/**
 * Measure the source-text size of each AST node.
 *
 * When the profile supports token counting, sizes come from a batched
 * token analysis; otherwise each node's raw character length is used.
 *
 * @param {Array} nodes - Tree-sitter nodes with startIndex/endIndex.
 * @param {string} source - Full file source the nodes index into.
 * @param {object} profile - Model profile (may provide a tokenCounter).
 * @returns {Promise<Array<{node: object, size: number, code: string}>>}
 */
async function batchAnalyzeNodesInternal(nodes, source, profile) {
    const snippets = nodes.map((n) => source.slice(n.startIndex, n.endIndex));

    let sizes;
    if (profile.useTokens && profile.tokenCounter) {
        const limits = getSizeLimits(profile);
        const analyses = await batchAnalyzeCodeSize(snippets, limits, profile.tokenCounter, false);
        sizes = analyses.map((a) => a.size);
    }
    else {
        sizes = snippets.map((s) => s.length);
    }

    return nodes.map((node, i) => ({
        node,
        size: sizes[i],
        code: snippets[i]
    }));
}
37
// Node types treated as top-level semantic containers (classes, modules, …).
// Hoisted to module scope so the lookup table is built once instead of on
// every call, and a Set gives O(1) membership checks instead of a linear scan.
const CONTAINER_NODE_TYPES = new Set([
    'class_declaration',
    'class_definition',
    'interface_declaration',
    'module_declaration',
    'namespace_declaration',
    'trait_declaration',
    'enum_declaration'
]);

/**
 * Report whether a node is a container construct that should start its
 * own semantic group.
 *
 * @param {object} node - Tree-sitter node with a `type` string.
 * @param {object} rule - Language rule (currently unused; kept for
 *   call-site compatibility).
 * @returns {boolean}
 */
function isContainerNode(node, rule) {
    return CONTAINER_NODE_TYPES.has(node.type);
}
49
/**
 * Partition a flat node list into semantic groups.
 *
 * Container nodes (classes, modules, …) each become a single-node
 * "container" group; runs of non-container nodes between them are
 * collected into "file_section" groups. Empty sections are dropped.
 *
 * @param {Array} nodes - Top-level nodes in source order.
 * @param {string} source - Full source text (unused; kept for call-site compatibility).
 * @param {Array} nodeAnalyses - Per-node size analyses, parallel to `nodes`.
 * @param {object} rule - Language rule, forwarded to the container check.
 * @returns {Array} Ordered list of semantic groups.
 */
function identifySemanticGroups(nodes, source, nodeAnalyses, rule) {
    const result = [];
    const freshSection = () => ({
        type: 'file_section',
        nodes: [],
        analyses: [],
        parentNode: null
    });

    let section = freshSection();
    const flushSection = () => {
        if (section.nodes.length > 0) {
            result.push(section);
        }
        section = freshSection();
    };

    nodes.forEach((node, idx) => {
        if (isContainerNode(node, rule)) {
            // Close the current run of loose nodes, then emit the
            // container as its own group.
            flushSection();
            result.push({
                type: 'container',
                containerType: node.type,
                nodes: [node],
                analyses: [nodeAnalyses[idx]],
                parentNode: node
            });
        }
        else {
            section.nodes.push(node);
            section.analyses.push(nodeAnalyses[idx]);
        }
    });

    flushSection();
    return result;
}
89
/**
 * Pack semantic groups into chunks near the optimal size budget.
 *
 * Groups larger than `limits.optimal` are emitted as stand-alone chunks.
 * Otherwise groups accumulate into a running chunk, which is flushed
 * either when adding the next group would exceed `limits.max`, or once
 * it reaches 90% of the optimal size.
 *
 * Fix: the original aliased `group.nodes` directly into the running
 * chunk in the overflow branch, so subsequent `push(...)` calls mutated
 * the caller's input semantic group. The node list is now copied.
 *
 * @param {Array} semanticGroups - Groups from identifySemanticGroups.
 * @param {string} source - Full source text (unused; kept for call-site compatibility).
 * @param {object} profile - Model profile (unused; kept for call-site compatibility).
 * @param {{optimal: number, max: number}} limits - Size budget.
 * @returns {Promise<Array<{nodes: Array, totalSize: number, groupInfo: Array}>>}
 */
async function combineGroupsToOptimalSize(semanticGroups, source, profile, limits) {
    const packed = [];
    let pending = { nodes: [], totalSize: 0, groupInfo: [] };

    const flushPending = () => {
        if (pending.nodes.length > 0) {
            packed.push(pending);
        }
        pending = { nodes: [], totalSize: 0, groupInfo: [] };
    };

    for (const group of semanticGroups) {
        const groupSize = group.analyses.reduce((sum, a) => sum + a.size, 0);

        // Oversized group: emit it alone, never merged with neighbors.
        if (groupSize > limits.optimal) {
            flushPending();
            packed.push({
                nodes: [...group.nodes],
                totalSize: groupSize,
                groupInfo: [group]
            });
            continue;
        }

        // Would blow past the hard cap: flush, then restart with this group.
        // Copy group.nodes so later appends don't mutate the input group.
        if (pending.totalSize + groupSize > limits.max) {
            flushPending();
            pending = {
                nodes: [...group.nodes],
                totalSize: groupSize,
                groupInfo: [group]
            };
            continue;
        }

        pending.nodes.push(...group.nodes);
        pending.totalSize += groupSize;
        pending.groupInfo.push(group);

        // Close enough to the target: flush eagerly.
        if (pending.totalSize >= limits.optimal * 0.9) {
            flushPending();
        }
    }

    flushPending();
    return packed;
}
142
/**
 * Group top-level AST nodes into size-bounded chunks for embedding.
 *
 * Small files (10 nodes or fewer) bypass the grouping pipeline and emit
 * one group per node, with `totalSize` left at 0 and no group info.
 * Larger files are measured, partitioned into semantic groups, and
 * packed toward the profile's optimal chunk size.
 *
 * @param {Array} nodes - Top-level tree-sitter nodes in source order.
 * @param {string} source - Full file source text.
 * @param {object} profile - Model profile providing size limits.
 * @param {object} rule - Language rule for the file's language.
 * @returns {Promise<Array>} Node groups ready for chunk emission.
 */
export async function groupNodesForChunking(nodes, source, profile, rule) {
    if (!nodes || nodes.length === 0) {
        return [];
    }

    // Fast path: tiny files get one (unsized) group per node.
    if (nodes.length <= 10) {
        return nodes.map((node) => ({ nodes: [node], totalSize: 0, groupInfo: [] }));
    }

    const limits = getSizeLimits(profile);
    const analyses = await batchAnalyzeNodesInternal(nodes, source, profile);
    const groups = identifySemanticGroups(nodes, source, analyses, rule);
    return combineGroupsToOptimalSize(groups, source, profile, limits);
}
158
/**
 * Merge a node group into a single combined chunk.
 *
 * The chunk concatenates each node's source text (blank-line separated)
 * and synthesizes a pseudo-node that spans from the first node's start
 * to the last node's end, tagging its type with a `_group_<n>` suffix.
 *
 * @param {{nodes: Array, totalSize: number, groupInfo: Array}} nodeGroup
 * @param {string} source - Full file source text.
 * @param {string} filerel - Relative file path (unused; kept for call-site compatibility).
 * @returns {object|null} Combined chunk, or null for an empty group.
 */
export function createCombinedChunk(nodeGroup, source, filerel) {
    const members = nodeGroup.nodes;
    if (!members || members.length === 0) {
        return null;
    }

    const pieces = [];
    for (const member of members) {
        pieces.push(source.slice(member.startIndex, member.endIndex));
    }

    const first = members[0];
    const last = members[members.length - 1];

    return {
        code: pieces.join('\n\n'),
        node: {
            ...first,
            type: `${first.type}_group_${members.length}`,
            endIndex: last.endIndex
        },
        metadata: {
            isGroup: true,
            nodeCount: members.length,
            totalSize: nodeGroup.totalSize,
            groupTypes: nodeGroup.groupInfo?.map((g) => g.type) || ['combined']
        }
    };
}
181
+ //# sourceMappingURL=file-grouper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"file-grouper.js","sourceRoot":"","sources":["../../src/chunking/file-grouper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAmB,oBAAoB,EAAyB,MAAM,oBAAoB,CAAC;AAqClG,SAAS,aAAa,CAAC,OAAqB;IAC1C,IAAI,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,YAAY,EAAE,CAAC;QAC9C,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,aAAa;YAC9B,GAAG,EAAE,OAAO,CAAC,cAAc;YAC3B,GAAG,EAAE,OAAO,CAAC,cAAc;YAC3B,OAAO,EAAE,OAAO,CAAC,aAAa;YAC9B,IAAI,EAAE,QAAQ;SACf,CAAC;IACJ,CAAC;IACD,OAAO;QACL,OAAO,EAAE,OAAO,CAAC,YAAY;QAC7B,GAAG,EAAE,OAAO,CAAC,aAAa;QAC1B,GAAG,EAAE,OAAO,CAAC,aAAa;QAC1B,OAAO,EAAE,OAAO,CAAC,YAAY;QAC7B,IAAI,EAAE,YAAY;KACnB,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,yBAAyB,CAAC,KAAuB,EAAE,MAAc,EAAE,OAAqB;IACrG,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC9E,MAAM,MAAM,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,YAAY,EAAE,CAAC;QAC9C,MAAM,QAAQ,GAAG,MAAM,oBAAoB,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;QACxF,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC7B,IAAI;YACJ,IAAI,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI;YACtB,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC;SACf,CAAC,CAAC,CAAC;IACN,CAAC;IAED,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7B,IAAI;QACJ,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM;QACrB,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC;KACf,CAAC,CAAC,CAAC;AACN,CAAC;AAED,SAAS,eAAe,CAAC,IAAoB,EAAE,IAAkB;IAC/D,MAAM,cAAc,GAAG;QACrB,mBAAmB;QACnB,kBAAkB;QAClB,uBAAuB;QACvB,oBAAoB;QACpB,uBAAuB;QACvB,mBAAmB;QACnB,kBAAkB;KACnB,CAAC;IAEF,OAAO,cAAc,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,sBAAsB,CAAC,KAAuB,EAAE,MAAc,EAAE,YAA4B,EAAE,IAAkB;IACvH,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,IAAI,YAAY,GAAkB;QAChC,IAAI,EAAE,cAAc;QACpB,KAAK,EAAE,EAAE;QACT,QAAQ,EAAE,EAAE;QACZ,UAAU,EAAE,IAAI;KACjB,CAAC;IAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,QAAQ,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;QAEjC,IAAI,eAAe,C
AAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC;YAChC,IAAI,YAAY,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAC5B,CAAC;YAED,YAAY,GAAG;gBACb,IAAI,EAAE,WAAW;gBACjB,aAAa,EAAE,IAAI,CAAC,IAAI;gBACxB,KAAK,EAAE,CAAC,IAAI,CAAC;gBACb,QAAQ,EAAE,CAAC,QAAQ,CAAC;gBACpB,UAAU,EAAE,IAAI;aACjB,CAAC;YAEF,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAE1B,YAAY,GAAG;gBACb,IAAI,EAAE,cAAc;gBACpB,KAAK,EAAE,EAAE;gBACT,QAAQ,EAAE,EAAE;gBACZ,UAAU,EAAE,IAAI;aACjB,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,YAAY,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC9B,YAAY,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClC,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IAC5B,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,KAAK,UAAU,0BAA0B,CACvC,cAA+B,EAC/B,MAAc,EACd,OAAqB,EACrB,MAAkB;IAElB,MAAM,aAAa,GAAgB,EAAE,CAAC;IACtC,IAAI,oBAAoB,GAAc;QACpC,KAAK,EAAE,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,SAAS,EAAE,EAAE;KACd,CAAC;IAEF,KAAK,MAAM,KAAK,IAAI,cAAc,EAAE,CAAC;QACnC,MAAM,cAAc,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAE1E,IAAI,cAAc,GAAG,MAAM,CAAC,OAAO,EAAE,CAAC;YACpC,IAAI,oBAAoB,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1C,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YAC3C,CAAC;YAED,aAAa,CAAC,IAAI,CAAC;gBACjB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,SAAS,EAAE,cAAc;gBACzB,SAAS,EAAE,CAAC,KAAK,CAAC;aACnB,CAAC,CAAC;YAEH,oBAAoB,GAAG;gBACrB,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,CAAC;gBACZ,SAAS,EAAE,EAAE;aACd,CAAC;YACF,SAAS;QACX,CAAC;QAED,IAAI,oBAAoB,CAAC,SAAS,GAAG,cAAc,GAAG,MAAM,CAAC,GAAG,EAAE,CAAC;YACjE,IAAI,oBAAoB,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1C,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YAC3C,CAAC;YAED,oBAAoB,GAAG;gBACrB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,SAAS,EAAE,cAAc;gBACzB,SAAS,EAAE,CAAC,KAAK,CAAC;aACnB,CAAC;YACF,SAAS;QACX,CAAC;QAED,oBAAoB,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC;QAChD,oBAAoB,CAAC,SAAS,IAAI,cAAc,CAAC;QACjD,oBAAoB,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAE3C,IAAI,oBAAoB,CAAC,SAAS
,IAAI,MAAM,CAAC,OAAO,GAAG,GAAG,EAAE,CAAC;YAC3D,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YACzC,oBAAoB,GAAG;gBACrB,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,CAAC;gBACZ,SAAS,EAAE,EAAE;aACd,CAAC;QACJ,CAAC;IACH,CAAC;IAED,IAAI,oBAAoB,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1C,aAAa,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,aAAa,CAAC;AACvB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,KAAuB,EACvB,MAAc,EACd,OAAqB,EACrB,IAAkB;IAElB,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE5C,MAAM,MAAM,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;IAEtC,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;QACvB,OAAO,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACxB,KAAK,EAAE,CAAC,IAAI,CAAC;YACb,SAAS,EAAE,CAAC;YACZ,SAAS,EAAE,EAAE;SACd,CAAC,CAAC,CAAC;IACN,CAAC;IAED,MAAM,YAAY,GAAG,MAAM,yBAAyB,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;IAC7E,MAAM,cAAc,GAAG,sBAAsB,CAAC,KAAK,EAAE,MAAM,EAAE,YAAY,EAAE,IAAI,CAAC,CAAC;IACjF,MAAM,aAAa,GAAG,MAAM,0BAA0B,CAAC,cAAc,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;IAEhG,OAAO,aAAa,CAAC;AACvB,CAAC;AAaD,MAAM,UAAU,mBAAmB,CAAC,SAAoB,EAAE,MAAc,EAAE,OAAe;IACvF,IAAI,CAAC,SAAS,CAAC,KAAK,IAAI,SAAS,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CACvC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,QAAQ,CAAC,CAC7C,CAAC;IAEF,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAExC,MAAM,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACrC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE7D,OAAO;QACL,IAAI,EAAE,YAAY;QAClB,IAAI,EAAE;YACJ,GAAG,SAAS;YACZ,IAAI,EAAE,GAAG,SAAS,CAAC,IAAI,UAAU,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE;YACzD,QAAQ,EAAE,QAAQ,CAAC,QAAQ;SAC5B;QACD,QAAQ,EAAE;YACR,OAAO,EAAE,IAAI;YACb,SAAS,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM;YACjC,SAAS,EAAE,SAAS,CAAC,SAAS;YAC9B,UAAU,EAAE,SAAS,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;SAClE;KACF,CAAC;AACJ,CAAC"}
@@ -0,0 +1,37 @@
1
+ import type { ModelProfile } from '../providers/base.js';
2
+ import type { TreeSitterNode } from '../types/ast.js';
3
+ interface LanguageRule {
4
+ subdivisionTypes?: Record<string, string[]>;
5
+ [key: string]: any;
6
+ }
7
+ export declare function findSemanticSubdivisions(node: TreeSitterNode, rule: LanguageRule): TreeSitterNode[];
8
+ export declare function findLastCompleteBoundary(code: string, maxSize: number): number;
9
+ export declare function extractSignature(node: TreeSitterNode, source: string): string;
10
+ export declare function extractLinesBeforeNode(node: TreeSitterNode, source: string, numLines: number): string;
11
+ export declare function extractParentContext(node: TreeSitterNode, source: string): {
12
+ signature: string;
13
+ startLine: number;
14
+ endLine: number;
15
+ };
16
+ export declare function getLineNumber(byteOffset: number, source: string): number;
17
+ export interface NodeAnalysis {
18
+ isSingleChunk: boolean;
19
+ needsSubdivision: boolean;
20
+ subdivisionCandidates: TreeSitterNode[];
21
+ size: number;
22
+ unit: string;
23
+ method: string;
24
+ estimatedSubchunks: number;
25
+ }
26
+ export declare function analyzeNodeForChunking(node: TreeSitterNode, source: string, rule: LanguageRule, profile: ModelProfile): Promise<NodeAnalysis>;
27
+ export declare function batchAnalyzeNodes(nodes: TreeSitterNode[], source: string, rule: LanguageRule, profile: ModelProfile, isSubdivision?: boolean): Promise<Array<NodeAnalysis & {
28
+ node: TreeSitterNode;
29
+ }>>;
30
+ export interface StatementChunk {
31
+ code: string;
32
+ size: number;
33
+ unit: string;
34
+ }
35
+ export declare function yieldStatementChunks(node: TreeSitterNode, source: string, maxSize: number, overlapSize: number, profile: ModelProfile): Promise<StatementChunk[]>;
36
+ export {};
37
+ //# sourceMappingURL=semantic-chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semantic-chunker.d.ts","sourceRoot":"","sources":["../../src/chunking/semantic-chunker.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEtD,UAAU,YAAY;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IAC5C,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;CACpB;AAUD,wBAAgB,wBAAwB,CAAC,IAAI,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,GAAG,cAAc,EAAE,CAwBnG;AAED,wBAAgB,wBAAwB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAgB9E;AAED,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,CAO7E;AAED,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,CAIrG;AAED,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,GAAG;IAC1E,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;CACjB,CAMA;AAED,wBAAgB,aAAa,CAAC,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,CAGxE;AAqBD,MAAM,WAAW,YAAY;IAC3B,aAAa,EAAE,OAAO,CAAC;IACvB,gBAAgB,EAAE,OAAO,CAAC;IAC1B,qBAAqB,EAAE,cAAc,EAAE,CAAC;IACxC,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,kBAAkB,EAAE,MAAM,CAAC;CAC5B;AAED,wBAAsB,sBAAsB,CAC1C,IAAI,EAAE,cAAc,EACpB,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,YAAY,EAClB,OAAO,EAAE,YAAY,GACpB,OAAO,CAAC,YAAY,CAAC,CA2BvB;AAED,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,cAAc,EAAE,EACvB,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,YAAY,EAClB,OAAO,EAAE,YAAY,EACrB,aAAa,UAAQ,GACpB,OAAO,CAAC,KAAK,CAAC,YAAY,GAAG;IAAE,IAAI,EAAE,cAAc,CAAA;CAAE,CAAC,CAAC,CAgCzD;AAED,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd;AAED,wBAAsB,oBAAoB,CACxC,IAAI,EAAE,cAAc,EACpB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,EACf,WAAW,EAAE,MAAM,EACnB,OAAO,EAAE,YAAY,GACpB,OAAO,CAAC,cAAc,EAAE,CAAC,CAwC3B"}
@@ -0,0 +1,172 @@
1
+ import { analyzeCodeSize, batchAnalyzeCodeSize } from './token-counter.js';
2
/**
 * Find the nearest descendants of `node` whose type is registered as a
 * subdivision point for `node.type` in the language rule.
 *
 * Descent stops at the first matching node on each branch (its own
 * subtree is not searched further), and the root itself is never a
 * candidate even if its type matches. Results are in document order.
 *
 * @param {TreeSitterNode} node - Syntax-tree node to subdivide.
 * @param {ChunkingRule} rule - Rule whose `subdivisionTypes` maps a parent
 *   node type to the child node types that may become separate chunks.
 * @returns {TreeSitterNode[]} Matching descendant nodes (possibly empty).
 */
export function findSemanticSubdivisions(node, rule) {
    if (!node || !rule)
        return [];
    const targetTypes = rule.subdivisionTypes?.[node.type] || [];
    if (targetTypes.length === 0)
        return [];
    const found = [];
    // Iterative pre-order DFS; children pushed in reverse so they pop in
    // document order. Seed with the root's children (root is excluded).
    const stack = [];
    for (let i = node.childCount - 1; i >= 0; i--) {
        const child = node.child(i);
        if (child)
            stack.push(child);
    }
    while (stack.length > 0) {
        const current = stack.pop();
        if (targetTypes.includes(current.type)) {
            found.push(current);
            continue; // do not descend into a matched subdivision
        }
        for (let i = current.childCount - 1; i >= 0; i--) {
            const child = current.child(i);
            if (child)
                stack.push(child);
        }
    }
    return found;
}
24
/**
 * Locate a cut point no later than `maxSize` that ends on a "complete"
 * syntactic boundary, preferring stronger boundaries first:
 * closing brace at end of line, then semicolon at end of line, then any
 * line break. Falls back to `maxSize` itself when nothing matches.
 *
 * @param {string} code - Text to cut.
 * @param {number} maxSize - Maximum allowed cut position (in code units).
 * @returns {number} Offset just past the last boundary found, or `maxSize`.
 */
export function findLastCompleteBoundary(code, maxSize) {
    const window = code.substring(0, maxSize);
    // Ordered strongest-to-weakest; each pattern consumes at least one
    // character, so the exec loop below always terminates.
    const patterns = [/\n\s*}\s*$/gm, /;\s*$/gm, /\n\s*$/gm];
    for (const pattern of patterns) {
        let cut = -1;
        let match;
        while ((match = pattern.exec(window)) !== null) {
            cut = match.index + match[0].length;
        }
        if (cut !== -1)
            return cut;
    }
    return maxSize;
}
39
/**
 * Extract a one-line signature for a node: everything before its first
 * `{` (trimmed, with ` {` re-appended), or the first line of its text
 * when the node has no brace-delimited body.
 *
 * @param {TreeSitterNode} node - Node with `startIndex`/`endIndex` offsets.
 * @param {string} source - Full source text the offsets refer to.
 * @returns {string} Signature line for the node.
 */
export function extractSignature(node, source) {
    const text = source.slice(node.startIndex, node.endIndex);
    const bodyStart = text.indexOf('{');
    return bodyStart === -1
        ? text.split('\n')[0]
        : `${text.slice(0, bodyStart).trim()} {`;
}
47
/**
 * Return up to the last `numLines` lines of source text preceding a node,
 * joined with newlines (useful as leading context for a chunk).
 *
 * Fix: the original used `lines.slice(-numLines)` unconditionally; with
 * `numLines === 0` that evaluates `slice(-0)` === `slice(0)` and returned
 * ALL preceding lines instead of none. Non-positive counts now yield ''.
 *
 * @param {TreeSitterNode} node - Node with a `startIndex` offset.
 * @param {string} source - Full source text the offset refers to.
 * @param {number} numLines - How many trailing lines of context to keep.
 * @returns {string} The preceding lines, newline-joined ('' when none).
 */
export function extractLinesBeforeNode(node, source, numLines) {
    if (numLines <= 0)
        return '';
    const precedingLines = source.substring(0, node.startIndex).split('\n');
    return precedingLines.slice(-numLines).join('\n');
}
52
/**
 * Build a compact context record for a parent node: its signature line
 * plus its 1-based start/end line numbers in the source.
 *
 * @param {TreeSitterNode} node - Node with `startIndex`/`endIndex` offsets.
 * @param {string} source - Full source text the offsets refer to.
 * @returns {{ signature: string, startLine: number, endLine: number }}
 */
export function extractParentContext(node, source) {
    const signature = extractSignature(node, source);
    const startLine = getLineNumber(node.startIndex, source);
    const endLine = getLineNumber(node.endIndex, source);
    return { signature, startLine, endLine };
}
59
/**
 * Convert an offset into a 1-based line number by counting newlines
 * before it. Offsets past the end of `source` clamp to the last line.
 *
 * NOTE(review): the parameter is named `byteOffset` (tree-sitter offsets
 * are byte-based) but is used as a JS string (UTF-16 code-unit) index;
 * the two diverge on non-ASCII source — confirm against the parser setup.
 *
 * @param {number} byteOffset - Offset into `source`.
 * @param {string} source - Full source text.
 * @returns {number} 1-based line number containing the offset.
 */
export function getLineNumber(byteOffset, source) {
    let line = 1;
    for (let i = 0; i < byteOffset && i < source.length; i++) {
        if (source[i] === '\n')
            line++;
    }
    return line;
}
63
/**
 * Normalize a model profile into a single set of size limits.
 * Token-based limits apply only when the profile both opts in
 * (`useTokens`) and actually supplies a `tokenCounter`; otherwise the
 * character-based limits are used.
 *
 * @param {ModelProfile} profile - Chunking profile to read limits from.
 * @returns {{ optimal: number, min: number, max: number, overlap: number,
 *   unit: 'tokens' | 'characters' }}
 */
function getSizeLimits(profile) {
    const tokenMode = Boolean(profile.useTokens && profile.tokenCounter);
    if (tokenMode) {
        const { optimalTokens, minChunkTokens, maxChunkTokens, overlapTokens } = profile;
        return {
            optimal: optimalTokens,
            min: minChunkTokens,
            max: maxChunkTokens,
            overlap: overlapTokens,
            unit: 'tokens'
        };
    }
    const { optimalChars, minChunkChars, maxChunkChars, overlapChars } = profile;
    return {
        optimal: optimalChars,
        min: minChunkChars,
        max: maxChunkChars,
        overlap: overlapChars,
        unit: 'characters'
    };
}
81
/**
 * Measure a single node's text against the profile's size limits and
 * report whether it fits in one chunk or needs subdivision.
 *
 * Size is counted in tokens (via `analyzeCodeSize`) when the profile has
 * both `useTokens` and a `tokenCounter`; otherwise in characters.
 *
 * @param {TreeSitterNode} node - Node to analyze.
 * @param {string} source - Full source text the node's offsets refer to.
 * @param {ChunkingRule} rule - Rule used to find subdivision candidates.
 * @param {ModelProfile} profile - Size limits and optional token counter.
 * @returns {Promise<NodeAnalysis>} Analysis record for the node.
 */
export async function analyzeNodeForChunking(node, source, rule, profile) {
    const code = source.slice(node.startIndex, node.endIndex);
    const limits = getSizeLimits(profile);
    let size;
    let method;
    if (profile.useTokens && profile.tokenCounter) {
        ({ size, method } = await analyzeCodeSize(code, limits, profile.tokenCounter));
    }
    else {
        size = code.length;
        method = 'chars';
    }
    // A node larger than the hard max must be split further.
    const oversized = size > limits.max;
    return {
        isSingleChunk: !oversized,
        needsSubdivision: oversized,
        subdivisionCandidates: findSemanticSubdivisions(node, rule),
        size,
        unit: limits.unit,
        method,
        estimatedSubchunks: Math.ceil(size / limits.optimal)
    };
}
106
/**
 * Analyze many nodes at once against the profile's size limits.
 *
 * In token mode the snippets are measured in one `batchAnalyzeCodeSize`
 * call; otherwise each snippet is sized by character count and tagged
 * with a coarse decision relative to the limits.
 *
 * @param {TreeSitterNode[]} nodes - Nodes to analyze.
 * @param {string} source - Full source text the nodes' offsets refer to.
 * @param {ChunkingRule} rule - Rule used to find subdivision candidates.
 * @param {ModelProfile} profile - Size limits and optional token counter.
 * @param {boolean} [isSubdivision=false] - Forwarded to the batch sizer.
 * @returns {Promise<Array<NodeAnalysis & { node: TreeSitterNode }>>}
 */
export async function batchAnalyzeNodes(nodes, source, rule, profile, isSubdivision = false) {
    const limits = getSizeLimits(profile);
    const snippets = nodes.map((n) => source.slice(n.startIndex, n.endIndex));
    let results;
    if (profile.useTokens && profile.tokenCounter) {
        results = await batchAnalyzeCodeSize(snippets, limits, profile.tokenCounter, isSubdivision);
    }
    else {
        // Character-mode classification relative to the limits.
        const classify = (len) => {
            if (len < limits.min)
                return 'too_small';
            if (len > limits.max)
                return 'too_large';
            return len <= limits.optimal ? 'optimal' : 'needs_tokenization';
        };
        results = snippets.map((snippet) => ({
            size: snippet.length,
            decision: classify(snippet.length),
            method: 'chars'
        }));
    }
    return nodes.map((n, i) => {
        const { size, method } = results[i];
        const oversized = size > limits.max;
        return {
            node: n,
            isSingleChunk: !oversized,
            needsSubdivision: oversized,
            subdivisionCandidates: findSemanticSubdivisions(n, rule),
            size,
            unit: limits.unit,
            method,
            estimatedSubchunks: Math.ceil(size / limits.optimal)
        };
    });
}
138
/**
 * Split a node's text into line-based chunks no larger than `maxSize`,
 * carrying roughly 20% of each emitted chunk's trailing lines into the
 * next chunk as overlap.
 *
 * Fixes:
 * - When a flushed chunk had fewer than 5 lines, `Math.floor(len * 0.2)`
 *   is 0 and `currentChunk.slice(-0)` (=== `slice(0)`) kept the WHOLE
 *   chunk as "overlap", duplicating its content into every subsequent
 *   chunk. Zero overlap now resets the chunk to empty.
 * - The `unit` label said 'tokens' whenever `profile.useTokens` was set,
 *   even though sizes are measured in characters when no `tokenCounter`
 *   exists; the label now matches the measurement actually used.
 *
 * NOTE(review): `overlapSize` is accepted but unused — overlap comes from
 * the 20%-of-lines heuristic; confirm whether callers expect it honored.
 * NOTE(review): `size` sums per-line sizes without the '\n' separators,
 * matching the original accounting.
 *
 * @param {TreeSitterNode} node - Node with `startIndex`/`endIndex` offsets.
 * @param {string} source - Full source text the offsets refer to.
 * @param {number} maxSize - Maximum chunk size (tokens or characters).
 * @param {number} overlapSize - Requested overlap (currently unused).
 * @param {ModelProfile} profile - Supplies `useTokens`/`tokenCounter`.
 * @returns {Promise<StatementChunk[]>} Ordered chunks covering the node.
 */
export async function yieldStatementChunks(node, source, maxSize, overlapSize, profile) {
    const tokenMode = Boolean(profile.useTokens && profile.tokenCounter);
    const unit = tokenMode ? 'tokens' : 'characters';
    const measure = async (text) => (tokenMode ? profile.tokenCounter(text) : text.length);
    const code = source.slice(node.startIndex, node.endIndex);
    const lines = code.split('\n');
    const chunks = [];
    let currentChunk = [];
    let currentSize = 0;
    for (const line of lines) {
        const lineSize = await measure(line);
        if (currentSize + lineSize > maxSize && currentChunk.length > 0) {
            chunks.push({
                code: currentChunk.join('\n'),
                size: currentSize,
                unit
            });
            // Keep ~20% of the flushed chunk's trailing lines as overlap.
            // Guard the zero case: slice(-0) would retain the whole array.
            const overlapLines = Math.floor(currentChunk.length * 0.2);
            currentChunk = overlapLines > 0 ? currentChunk.slice(-overlapLines) : [];
            currentSize = currentChunk.length > 0
                ? await measure(currentChunk.join('\n'))
                : 0;
        }
        currentChunk.push(line);
        currentSize += lineSize;
    }
    if (currentChunk.length > 0) {
        chunks.push({
            code: currentChunk.join('\n'),
            size: currentSize,
            unit
        });
    }
    return chunks;
}
172
+ //# sourceMappingURL=semantic-chunker.js.map