headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
"""Code structure handler using AST parsing.
|
|
2
|
+
|
|
3
|
+
Extracts structural elements from source code:
|
|
4
|
+
- Import statements
|
|
5
|
+
- Function/method signatures
|
|
6
|
+
- Class definitions
|
|
7
|
+
- Type annotations
|
|
8
|
+
- Decorators
|
|
9
|
+
|
|
10
|
+
Function bodies are marked as compressible while preserving signatures.
|
|
11
|
+
This enables the LLM to see all available functions/methods while body
|
|
12
|
+
implementations are compressed.
|
|
13
|
+
|
|
14
|
+
Uses tree-sitter for parsing when available, falls back to regex patterns.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
import threading
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
from enum import Enum
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from headroom.compression.handlers.base import BaseStructureHandler, HandlerResult
|
|
27
|
+
from headroom.compression.masks import StructureMask
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
# Lazy-loaded tree-sitter
|
|
32
|
+
_tree_sitter_available: bool | None = None
|
|
33
|
+
_tree_sitter_parsers: dict[str, Any] = {}
|
|
34
|
+
_tree_sitter_lock = threading.Lock()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _check_tree_sitter() -> bool:
    """Report whether ``tree_sitter_language_pack`` can be imported.

    The import is attempted only on the first call; the outcome is stored in
    the module-level ``_tree_sitter_available`` flag and returned directly on
    every later call.

    Returns:
        True if the import succeeded, False if it raised ImportError.
    """
    global _tree_sitter_available

    # Already probed once: reuse the remembered answer.
    if _tree_sitter_available is not None:
        return _tree_sitter_available

    try:
        import tree_sitter_language_pack  # noqa: F401
    except ImportError:
        _tree_sitter_available = False
    else:
        _tree_sitter_available = True

    return _tree_sitter_available
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _get_parser(language: str) -> Any:
    """Return the tree-sitter parser for ``language``, creating it on first use.

    Parsers are kept in the module-level ``_tree_sitter_parsers`` dict; both
    the lookup and the creation happen while holding ``_tree_sitter_lock``.

    Args:
        language: Language name passed to ``get_parser``.

    Returns:
        The parser object stored for ``language``.

    Raises:
        ImportError: If ``tree_sitter_language_pack`` is not importable.
    """
    if not _check_tree_sitter():
        raise ImportError("tree-sitter-language-pack not installed")

    with _tree_sitter_lock:
        # Cache hit: hand back the parser built by an earlier call.
        if language in _tree_sitter_parsers:
            return _tree_sitter_parsers[language]

        from tree_sitter_language_pack import get_parser

        parser = get_parser(language)  # type: ignore[arg-type]
        _tree_sitter_parsers[language] = parser
        return parser
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class CodeLanguage(Enum):
    """Programming languages named by this module.

    Each member's value is the lowercase language name as a string.
    """

    # Dynamic / scripting languages
    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"

    # Compiled languages
    GO = "go"
    RUST = "rust"
    JAVA = "java"
    C = "c"
    CPP = "cpp"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class CodeSpan:
    """One contiguous region of source text tagged with its structural role."""

    # Offset of the first position covered by the span.
    start: int
    # Offset one past the last covered position (consumers iterate start..end-1).
    end: int
    # Free-form label such as "import", "signature", "body" or "comment".
    role: str
    # True when the region should be preserved, False when it is compressible.
    is_structural: bool
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Language-specific AST node types that are structural
|
|
90
|
+
_STRUCTURAL_NODE_TYPES: dict[str, set[str]] = {
    # Keyed by language name; each value is the set of tree-sitter node types
    # treated as structural for that language. Node types whose name contains
    # "function" or "method" get signature-only treatment by the extractor.
    "python": {
        # imports
        "import_statement",
        "import_from_statement",
        # definitions
        "function_definition",
        "class_definition",
        "decorated_definition",
        "type_alias_statement",
    },
    "javascript": {
        # module boundary
        "import_statement",
        "export_statement",
        # definitions
        "function_declaration",
        "arrow_function",
        "class_declaration",
        "method_definition",
    },
    "typescript": {
        # module boundary
        "import_statement",
        "export_statement",
        # definitions
        "function_declaration",
        "class_declaration",
        "method_definition",
        # type-level declarations
        "interface_declaration",
        "type_alias_declaration",
    },
    "go": {
        "import_declaration",
        "function_declaration",
        "method_declaration",
        "type_declaration",
        "interface_type",
    },
    "rust": {
        "use_declaration",
        "function_item",
        "impl_item",
        "struct_item",
        "enum_item",
        "trait_item",
    },
    "java": {
        "import_declaration",
        "class_declaration",
        "interface_declaration",
        "method_declaration",
        "annotation",
    },
}
|
|
139
|
+
|
|
140
|
+
# Regex patterns for fallback detection
|
|
141
|
+
_SIGNATURE_PATTERNS: dict[str, list[re.Pattern[str]]] = {
    # Keyed by language name; each value is a list of line-anchored
    # (re.MULTILINE) patterns whose matches are treated as structural
    # signatures by the regex fallback.
    "python": [
        # def / async def, with optional return annotation
        re.compile(r"^\s*(async\s+)?def\s+\w+\s*\([^)]*\)\s*(->\s*[^:]+)?:", re.MULTILINE),
        # class header with optional base list
        re.compile(r"^\s*class\s+\w+(\([^)]*\))?:", re.MULTILINE),
        # Decorator line. The name may be dotted (``@app.route(...)``,
        # ``@functools.wraps(f)``); a bare ``\w+`` would miss those.
        re.compile(r"^\s*@[\w.]+(\([^)]*\))?\s*$", re.MULTILINE),
    ],
    "javascript": [
        re.compile(r"^\s*(async\s+)?function\s+\w+\s*\([^)]*\)", re.MULTILINE),
        re.compile(r"^\s*class\s+\w+(\s+extends\s+\w+)?", re.MULTILINE),
        # const/let/var bound to an arrow function
        re.compile(r"^\s*(const|let|var)\s+\w+\s*=\s*(async\s+)?\([^)]*\)\s*=>", re.MULTILINE),
    ],
    "typescript": [
        re.compile(r"^\s*(async\s+)?function\s+\w+\s*(<[^>]+>)?\s*\([^)]*\)", re.MULTILINE),
        re.compile(r"^\s*class\s+\w+(<[^>]+>)?(\s+extends\s+\w+)?", re.MULTILINE),
        re.compile(r"^\s*interface\s+\w+(<[^>]+>)?", re.MULTILINE),
        re.compile(r"^\s*type\s+\w+(<[^>]+>)?\s*=", re.MULTILINE),
    ],
    "go": [
        # func with optional receiver
        re.compile(r"^\s*func\s+(\([^)]+\)\s+)?\w+\s*\([^)]*\)", re.MULTILINE),
        re.compile(r"^\s*type\s+\w+\s+(struct|interface)", re.MULTILINE),
    ],
    "rust": [
        re.compile(r"^\s*(pub\s+)?(async\s+)?fn\s+\w+\s*(<[^>]+>)?\s*\([^)]*\)", re.MULTILINE),
        re.compile(r"^\s*(pub\s+)?struct\s+\w+", re.MULTILINE),
        re.compile(r"^\s*(pub\s+)?enum\s+\w+", re.MULTILINE),
        re.compile(r"^\s*(pub\s+)?trait\s+\w+", re.MULTILINE),
        re.compile(r"^\s*impl(<[^>]+>)?\s+\w+", re.MULTILINE),
    ],
    "java": [
        re.compile(
            r"^\s*(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\([^)]*\)", re.MULTILINE
        ),
        re.compile(r"^\s*(public\s+)?(class|interface|enum)\s+\w+", re.MULTILINE),
        # Annotation line; dotted to accept fully qualified annotation names.
        re.compile(r"^\s*@[\w.]+(\([^)]*\))?\s*$", re.MULTILINE),
    ],
}
|
|
177
|
+
|
|
178
|
+
# Import patterns for fallback
|
|
179
|
+
_IMPORT_PATTERNS: dict[str, re.Pattern[str]] = {
    # Keyed by language name; each pattern is line-anchored (re.MULTILINE) and
    # only needs to match the start of an import line, because the regex
    # extractor extends every match to the end of its line.
    #
    # Python: the module after ``from`` may be dotted or relative
    # (``from a.b import c``, ``from . import x``), so it is matched with
    # ``[\w.]+`` rather than ``\w+``.
    "python": re.compile(r"^\s*(import\s+\w+|from\s+[\w.]+\s+import)", re.MULTILINE),
    "javascript": re.compile(r"^\s*(import\s+.*from|require\s*\()", re.MULTILINE),
    "typescript": re.compile(r"^\s*(import\s+.*from|require\s*\()", re.MULTILINE),
    "go": re.compile(r'^\s*import\s+(\(|")', re.MULTILINE),
    "rust": re.compile(r"^\s*use\s+\w+", re.MULTILINE),
    # Java: also accept ``import static ...;`` and wildcard ``import pkg.*;``.
    "java": re.compile(r"^\s*import\s+(?:static\s+)?[\w.]+(?:\.\*)?\s*;", re.MULTILINE),
}
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class CodeStructureHandler(BaseStructureHandler):
|
|
190
|
+
"""Handler for source code.
|
|
191
|
+
|
|
192
|
+
Preserves:
|
|
193
|
+
- Import/use statements
|
|
194
|
+
- Function/method signatures (not bodies)
|
|
195
|
+
- Class/struct/interface definitions
|
|
196
|
+
- Type declarations
|
|
197
|
+
- Decorators/annotations
|
|
198
|
+
|
|
199
|
+
Marks as compressible:
|
|
200
|
+
- Function/method bodies
|
|
201
|
+
- Comments (optionally preserved)
|
|
202
|
+
- Whitespace
|
|
203
|
+
|
|
204
|
+
Example:
|
|
205
|
+
>>> handler = CodeStructureHandler()
|
|
206
|
+
>>> code = '''
|
|
207
|
+
... def hello(name: str) -> str:
|
|
208
|
+
... message = f"Hello, {name}!"
|
|
209
|
+
... return message
|
|
210
|
+
... '''
|
|
211
|
+
>>> result = handler.get_mask(code, language="python")
|
|
212
|
+
>>> # Signature "def hello(name: str) -> str:" preserved
|
|
213
|
+
>>> # Body content compressed
|
|
214
|
+
"""
|
|
215
|
+
|
|
216
|
+
def __init__(
    self,
    preserve_comments: bool = False,
    use_tree_sitter: bool = True,
    default_language: str = "python",
):
    """Set up the handler under the name "code" and record its options.

    Args:
        preserve_comments: When True, comment nodes are marked structural
            by the tree-sitter extractor.
        use_tree_sitter: When True, tree-sitter is tried before the regex
            fallback (which is also used if tree-sitter is unavailable).
        default_language: Language returned by detection when no marker
            matches.
    """
    super().__init__(name="code")

    # Plain option storage; read later by the extraction methods.
    self.default_language = default_language
    self.use_tree_sitter = use_tree_sitter
    self.preserve_comments = preserve_comments
|
|
234
|
+
|
|
235
|
+
def can_handle(self, content: str) -> bool:
    """Return True when ``content`` contains any common code keyword.

    This is a plain substring test against a fixed keyword list (each
    keyword includes its trailing space); no parsing is performed.

    Args:
        content: Text to inspect.

    Returns:
        True if at least one keyword occurs in ``content``, else False.
    """
    keywords = (
        "def ",
        "class ",
        "function ",
        "import ",
        "const ",
        "let ",
        "var ",
        "func ",
        "fn ",
        "pub ",
        "package ",
        "struct ",
        "interface ",
    )
    for keyword in keywords:
        if keyword in content:
            return True
    return False
|
|
254
|
+
|
|
255
|
+
def _extract_mask(
    self,
    content: str,
    tokens: list[str],
    language: str | None = None,
    **kwargs: Any,
) -> HandlerResult:
    """Build the structure mask for ``content``.

    Tree-sitter extraction is attempted when it is enabled on the handler
    and importable; any exception it raises is logged at debug level and
    the regex extractor is used instead.

    Args:
        content: Source code content.
        tokens: Tokens forwarded unchanged to the chosen extractor.
        language: Language name; detected from ``content`` when None.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        HandlerResult produced by the tree-sitter or regex extractor.
    """
    resolved = self._detect_language(content) if language is None else language

    # Tree-sitter disabled or not installed: regex is the only option.
    if not (self.use_tree_sitter and _check_tree_sitter()):
        return self._extract_with_regex(content, tokens, resolved)

    try:
        return self._extract_with_tree_sitter(content, tokens, resolved)
    except Exception as e:
        # Best-effort: a parser failure must not abort compression.
        logger.debug("Tree-sitter parsing failed, using fallback: %s", e)

    return self._extract_with_regex(content, tokens, resolved)
|
|
286
|
+
|
|
287
|
+
def _extract_with_tree_sitter(
    self,
    content: str,
    tokens: list[str],
    language: str,
) -> HandlerResult:
    """Extract structure using the tree-sitter AST.

    Every node whose type is listed for ``language`` in
    ``_STRUCTURAL_NODE_TYPES`` yields a structural span. For node types
    containing "function" or "method", the span stops where the body child
    ("block", "statement_block" or "compound_statement") begins, and the
    body is recorded as a separate non-structural span. Comment nodes are
    recorded as structural only when ``preserve_comments`` is set.

    Span offsets are character offsets into ``content``: tree-sitter
    reports UTF-8 byte offsets, which are translated here so the spans line
    up with the character-indexed mask even for non-ASCII source.

    Args:
        content: Source code.
        tokens: Tokens stored on the returned StructureMask.
        language: Language name used to select the parser and node types.

    Returns:
        HandlerResult with confidence 0.95 and metadata describing the
        language, the parser used and the number of structural spans.

    Raises:
        ImportError: Propagated from ``_get_parser`` when tree-sitter is
            not installed.
    """
    parser = _get_parser(language)
    source_bytes = content.encode("utf-8")
    tree = parser.parse(source_bytes)

    # Byte offset -> character offset table. Only needed when the text has
    # multi-byte characters; for pure ASCII the two offsets coincide.
    char_at_byte: list[int] | None = None
    if len(source_bytes) != len(content):
        char_at_byte = [0] * (len(source_bytes) + 1)
        byte_pos = 0
        for char_index, char in enumerate(content):
            code_point = ord(char)
            # UTF-8 width of this code point.
            if code_point < 0x80:
                width = 1
            elif code_point < 0x800:
                width = 2
            elif code_point < 0x10000:
                width = 3
            else:
                width = 4
            char_at_byte[byte_pos : byte_pos + width] = [char_index] * width
            byte_pos += width
        # End-of-input offset maps to one past the last character.
        char_at_byte[byte_pos] = len(content)

    def to_char(byte_offset: int) -> int:
        """Translate a tree-sitter byte offset into a character offset."""
        if char_at_byte is None:
            return byte_offset
        return char_at_byte[byte_offset]

    structural_types = _STRUCTURAL_NODE_TYPES.get(language, set())
    body_types = ("block", "statement_block", "compound_statement")

    # Collect structural spans with an explicit stack (pre-order, children
    # left to right) so deeply nested trees cannot exhaust the recursion
    # limit; parents are therefore always recorded before their children.
    spans: list[CodeSpan] = []
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        node_type = node.type

        if node_type in structural_types:
            body_node = None
            if "function" in node_type or "method" in node_type:
                # For callables only the signature is structural.
                body_node = next(
                    (child for child in node.children if child.type in body_types),
                    None,
                )

            if body_node is not None:
                # Signature runs from the node start to the body start.
                spans.append(
                    CodeSpan(
                        start=to_char(node.start_byte),
                        end=to_char(body_node.start_byte),
                        role="signature",
                        is_structural=True,
                    )
                )
                # Body is compressible.
                spans.append(
                    CodeSpan(
                        start=to_char(body_node.start_byte),
                        end=to_char(body_node.end_byte),
                        role="body",
                        is_structural=False,
                    )
                )
            else:
                # Non-callable structural node, or a callable without a
                # recognised body child: preserve the whole node.
                spans.append(
                    CodeSpan(
                        start=to_char(node.start_byte),
                        end=to_char(node.end_byte),
                        role=node_type,
                        is_structural=True,
                    )
                )
        elif node_type == "comment" and self.preserve_comments:
            spans.append(
                CodeSpan(
                    start=to_char(node.start_byte),
                    end=to_char(node.end_byte),
                    role="comment",
                    is_structural=True,
                )
            )

        # Reversed so the leftmost child is popped (visited) first.
        pending.extend(reversed(node.children))

    # Build mask from spans
    mask = self._spans_to_mask(spans, len(content))

    return HandlerResult(
        mask=StructureMask(tokens=tokens, mask=mask),
        handler_name=self.name,
        confidence=0.95,
        metadata={
            "language": language,
            "parser": "tree-sitter",
            "structural_spans": sum(1 for span in spans if span.is_structural),
        },
    )
|
|
393
|
+
|
|
394
|
+
def _extract_with_regex(
    self,
    content: str,
    tokens: list[str],
    language: str,
) -> HandlerResult:
    """Regex-based fallback for structure extraction.

    Import matches are stretched to the end of their line; signature
    matches are used exactly as matched. Every collected span is
    structural.

    Args:
        content: Source code.
        tokens: Tokens stored on the returned StructureMask.
        language: Language name used to pick the pattern sets.

    Returns:
        HandlerResult with confidence 0.7 and metadata describing the
        language, the parser used and the number of spans found.
    """
    total = len(content)
    found: list[CodeSpan] = []

    # Imports: from the match start through the rest of that line.
    import_re = _IMPORT_PATTERNS.get(language)
    if import_re is not None:
        for hit in import_re.finditer(content):
            newline_at = content.find("\n", hit.end())
            line_end = total if newline_at == -1 else newline_at
            found.append(
                CodeSpan(start=hit.start(), end=line_end, role="import", is_structural=True)
            )

    # Signatures: one span per match of every pattern for this language.
    found.extend(
        CodeSpan(start=hit.start(), end=hit.end(), role="signature", is_structural=True)
        for signature_re in _SIGNATURE_PATTERNS.get(language, [])
        for hit in signature_re.finditer(content)
    )

    char_mask = self._spans_to_mask(found, total)

    return HandlerResult(
        mask=StructureMask(tokens=tokens, mask=char_mask),
        handler_name=self.name,
        confidence=0.7,  # Lower confidence for regex
        metadata={
            "language": language,
            "parser": "regex",
            "structural_spans": len(found),
        },
    )
|
|
455
|
+
|
|
456
|
+
def _spans_to_mask(self, spans: list[CodeSpan], length: int) -> list[bool]:
    """Convert spans to a character-level mask.

    The mask starts all False. Spans are applied in list order: a
    structural span sets its range to True and a non-structural span sets
    it back to False. A later span therefore overrides an earlier one where
    they overlap, which lets a compressible body carve a hole out of an
    enclosing structural span that was listed before it.

    Span bounds are clamped to ``[0, length]``; empty or inverted spans are
    ignored.

    Args:
        spans: Code spans, outermost first where they nest.
        length: Total content length, i.e. the length of the mask.

    Returns:
        Boolean mask with ``length`` entries; True marks preserved
        positions.
    """
    mask = [False] * length

    for span in spans:
        lo = max(span.start, 0)
        hi = min(span.end, length)
        if hi > lo:
            # Slice assignment of an equal-length list keeps len(mask) fixed.
            mask[lo:hi] = [span.is_structural] * (hi - lo)

    return mask
|
|
474
|
+
|
|
475
|
+
def _detect_language(self, content: str) -> str:
    """Guess the programming language of ``content`` from keyword markers.

    Each language scores one point per marker substring found in
    ``content``. The highest score wins; on a tie the language listed
    first wins. If no marker is found at all, the handler's
    ``default_language`` is returned.

    Args:
        content: Source code content.

    Returns:
        Language name (lowercase), or ``self.default_language``.
    """
    marker_table = (
        ("python", ("def ", "import ", "from ", "class ", "async def")),
        ("javascript", ("function ", "const ", "let ", "var ", "=>")),
        ("typescript", ("interface ", "type ", ": string", ": number")),
        ("go", ("func ", "package ", "import (", "type ")),
        ("rust", ("fn ", "let mut", "impl ", "pub fn", "use ")),
        ("java", ("public class", "private ", "protected ", "void ")),
    )

    winner: str | None = None
    top_score = 0
    for lang, needles in marker_table:
        hits = sum(needle in content for needle in needles)
        # Strictly greater keeps the earliest language on a tie.
        if hits > top_score:
            winner, top_score = lang, hits

    if winner is None:
        return self.default_language
    return winner
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def is_tree_sitter_available() -> bool:
    """Public wrapper around ``_check_tree_sitter``.

    Returns:
        Whatever ``_check_tree_sitter()`` reports.
    """
    available = _check_tree_sitter()
    return available
|