code-finder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_context/__init__.py +33 -0
- claude_context/agentic_integration.py +309 -0
- claude_context/ast_chunker.py +646 -0
- claude_context/config.py +239 -0
- claude_context/context_manager.py +627 -0
- claude_context/embeddings.py +307 -0
- claude_context/embeddings_interface.py +226 -0
- claude_context/enhanced_ast_chunker.py +1129 -0
- claude_context/explorer.py +951 -0
- claude_context/explorer_with_context.py +1008 -0
- claude_context/indexer.py +893 -0
- claude_context/markdown_chunker.py +421 -0
- claude_context/mode_handler.py +1774 -0
- claude_context/query_metrics.py +164 -0
- claude_context/question_generator.py +800 -0
- claude_context/readme_extractor.py +485 -0
- claude_context/repository_adapter.py +399 -0
- claude_context/search.py +493 -0
- claude_context/skills/__init__.py +11 -0
- claude_context/skills/_cli_common.py +74 -0
- claude_context/skills/_index_manager.py +98 -0
- claude_context/skills/api_surface.py +219 -0
- claude_context/skills/evidence_retrieval.py +151 -0
- claude_context/skills/grounded_review.py +212 -0
- claude_context/synthesis/__init__.py +8 -0
- claude_context/synthesis/editor_agent.py +391 -0
- claude_context/synthesis/llm_synthesizer.py +153 -0
- claude_context/synthesis/logic_explainer.py +235 -0
- claude_context/synthesis/multi_review_pipeline.py +717 -0
- claude_context/synthesis/prompt_builder.py +439 -0
- claude_context/synthesis/providers.py +115 -0
- claude_context/synthesis/validators.py +458 -0
- code_finder-0.1.0.dist-info/METADATA +823 -0
- code_finder-0.1.0.dist-info/RECORD +37 -0
- code_finder-0.1.0.dist-info/WHEEL +5 -0
- code_finder-0.1.0.dist-info/entry_points.txt +4 -0
- code_finder-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Markdown Chunker for Claude Context
|
|
3
|
+
|
|
4
|
+
Section-aware chunking of markdown files (README, docs) that:
|
|
5
|
+
1. Preserves header hierarchy (H1 > H2 > H3) as scope chains
|
|
6
|
+
2. Extracts code blocks with language tags
|
|
7
|
+
3. Tracks links and references
|
|
8
|
+
4. Creates semantically meaningful chunks for retrieval
|
|
9
|
+
|
|
10
|
+
This complements the AST chunker by handling documentation content
|
|
11
|
+
that explains "what/why/how-to-use" rather than implementation details.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
import logging
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class MarkdownChunk:
    """
    One retrievable unit cut out of a markdown document.

    Attributes:
        text: Raw markdown content of the chunk.
        contextualized_text: The text with its header context prepended.
        chunk_type: Kind of chunk; documented values are 'section',
            'code_block', 'list' and 'paragraph'.
        name: Section title, or a label derived from the code language.
        line_range: (start, end) line numbers, 0-indexed.
        file_path: File the chunk was taken from, if known.
        scope: Header chain leading to the chunk,
            e.g. ["Getting Started", "Installation"].
        header_level: 1 for H1, 2 for H2, etc.; 0 when not applicable.
        code_language: Language tag (code block chunks only).
        code_content: Code without its fences (code block chunks only).
        links: Links found in the chunk, as [{"text": "...", "url": "..."}].
    """

    # --- content -----------------------------------------------------
    text: str
    contextualized_text: str
    chunk_type: str
    name: str

    # --- location ----------------------------------------------------
    line_range: Tuple[int, int]
    file_path: Optional[Path] = None

    # --- hierarchy ---------------------------------------------------
    scope: List[str] = field(default_factory=list)
    header_level: int = 0

    # --- code block metadata -----------------------------------------
    code_language: Optional[str] = None
    code_content: Optional[str] = None

    # --- links -------------------------------------------------------
    links: List[Dict[str, str]] = field(default_factory=list)

    @property
    def size_chars(self) -> int:
        """Length of the raw chunk text, in characters."""
        raw = self.text
        return len(raw)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class MarkdownChunker:
    """
    Markdown-aware chunker that creates semantically meaningful chunks.

    Unlike generic text splitting, this:
    - Keeps sections together as logical units
    - Tracks header hierarchy for scope chains
    - Extracts code blocks as separate, searchable chunks
    - Preserves context through header prepending
    """

    # A line consisting solely of a placeholder written by _extract_code_blocks.
    _PLACEHOLDER_PATTERN = re.compile(r'\[CODE_BLOCK_(\d+)\]')

    def __init__(
        self,
        max_chunk_size: int = 2000,
        min_chunk_size: int = 100,
        extract_code_blocks: bool = True,
        include_header_context: bool = True
    ):
        """
        Initialize the markdown chunker.

        Args:
            max_chunk_size: Maximum characters per chunk
            min_chunk_size: Minimum characters to create a chunk
            extract_code_blocks: Whether to extract code blocks as separate chunks
            include_header_context: Whether to prepend header hierarchy to chunks
        """
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.extract_code_blocks = extract_code_blocks
        self.include_header_context = include_header_context

        # Regex patterns
        # [ \t]+ (not \s+) so a lone "#" line cannot swallow the following line
        # as its title when the pattern is used with finditer over a whole file.
        self.header_pattern = re.compile(r'^(#{1,6})[ \t]+(.+)$', re.MULTILINE)
        # Group 1 is the language token; the rest of the info string (e.g.
        # ' title="x"' or a trailing '\r') is tolerated but not captured.
        # Accepting tokens such as "c++" / "objective-c" keeps opening and
        # closing fences paired correctly.
        self.code_block_pattern = re.compile(
            r'^```[ \t]*([\w+#.-]*)[^\n`]*\n(.*?)^```',
            re.MULTILINE | re.DOTALL
        )
        self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
        self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

    def chunk_file(self, file_path: Path, content: Optional[str] = None) -> List["MarkdownChunk"]:
        """
        Chunk a markdown file into semantic units.

        Args:
            file_path: Path to the markdown file
            content: Optional content string (if not provided, reads from file)

        Returns:
            List of MarkdownChunk objects: section chunks first (in document
            order), followed by code block chunks. Empty if the file cannot
            be read or contains only whitespace.
        """
        if content is None:
            try:
                content = file_path.read_text(encoding='utf-8')
            except Exception as e:
                # Deliberately best-effort: an unreadable file yields no chunks
                # instead of aborting the caller.
                logger.error("Failed to read %s: %s", file_path, e)
                return []

        if not content.strip():
            return []

        chunks = []

        # First, extract and replace code blocks with placeholders
        code_blocks = []
        if self.extract_code_blocks:
            content, code_blocks = self._extract_code_blocks(content)

        # Parse sections based on headers; code_blocks lets the parser map
        # placeholder-text line numbers back to original file line numbers.
        sections = self._parse_sections(content, file_path, code_blocks)

        # Create chunks from sections
        for section in sections:
            chunks.extend(self._chunk_section(section, file_path))

        # Add code blocks as separate chunks with their section context
        for cb in code_blocks:
            chunk = self._create_code_block_chunk(cb, file_path)
            if chunk:
                chunks.append(chunk)

        logger.debug("Created %d chunks from %s", len(chunks), file_path.name)
        return chunks

    def _extract_code_blocks(self, content: str) -> Tuple[str, List[Dict]]:
        """
        Extract fenced code blocks and replace each with a placeholder line.

        Args:
            content: Full markdown text.

        Returns:
            (placeholder_content, code_blocks). Block ``i`` is replaced by
            "\\n[CODE_BLOCK_i]\\n". Each entry of ``code_blocks`` is a dict with
            keys 'language', 'code', 'start_line', 'end_line', 'scope' and
            'full_match'; the line numbers are 0-indexed positions of the
            opening and closing fence in the ORIGINAL content.
        """
        code_blocks = []
        pieces = []
        cursor = 0        # end of the previous match in `content`
        line_no = 0       # number of newlines before `counted_to`
        counted_to = 0

        for i, match in enumerate(self.code_block_pattern.finditer(content)):
            full_match = match.group(0)
            language = match.group(1) or 'text'
            code = match.group(2).strip()

            # Count newlines incrementally instead of rescanning from the top.
            line_no += content.count('\n', counted_to, match.start())
            counted_to = match.start()
            start_line = line_no
            # Use the real extent of the match: the stripped code undercounts
            # when the block is empty or has blank lines at its edges.
            end_line = start_line + full_match.count('\n')

            # Find the section context (headers preceding this code block)
            scope = self._find_scope_at_position(content, match.start())

            code_blocks.append({
                'language': language,
                'code': code,
                'start_line': start_line,
                'end_line': end_line,
                'scope': scope,
                'full_match': full_match
            })

            # Splice by offset so exactly this occurrence is replaced.
            pieces.append(content[cursor:match.start()])
            pieces.append(f"\n[CODE_BLOCK_{i}]\n")
            cursor = match.end()

        pieces.append(content[cursor:])
        return ''.join(pieces), code_blocks

    def _find_scope_at_position(self, content: str, position: int) -> List[str]:
        """
        Find the header hierarchy at a given position in the content.

        Args:
            content: Full markdown text (code fences still present).
            position: Character offset to compute the scope for.

        Returns:
            Titles of the enclosing headers, outermost first. Lines that look
            like headers but sit inside a complete fenced code block before
            ``position`` (e.g. shell comments) are ignored.
        """
        pre_content = content[:position]
        fenced_spans = [m.span() for m in self.code_block_pattern.finditer(pre_content)]

        stack = []  # [(level, title), ...] - same scheme as _parse_sections
        span_index = 0

        for match in self.header_pattern.finditer(pre_content):
            header_start = match.start()

            # Both sequences are ordered, so a single forward cursor suffices.
            while span_index < len(fenced_spans) and fenced_spans[span_index][1] <= header_start:
                span_index += 1
            if span_index < len(fenced_spans) and fenced_spans[span_index][0] <= header_start:
                continue  # inside fenced code, not a real header

            level = len(match.group(1))
            title = match.group(2).strip()

            # Compare header LEVELS (not stack depth) so siblings replace each
            # other even when the document skips levels or has no H1.
            while stack and stack[-1][0] >= level:
                stack.pop()
            stack.append((level, title))

        return [title for _, title in stack]

    def _parse_sections(
        self,
        content: str,
        file_path: Path,
        code_blocks: Optional[List[Dict]] = None
    ) -> List[Dict]:
        """
        Parse markdown into sections based on headers.

        Args:
            content: Markdown text, normally with code blocks already replaced
                by placeholders.
            file_path: Source file; its stem titles any text before the first
                header.
            code_blocks: Optional block metadata from _extract_code_blocks.
                When given, 'start_line'/'end_line' of the returned sections
                are expressed in original-file line numbers; otherwise they
                are line numbers of ``content`` itself.

        Returns:
            List of dicts with keys 'title', 'level', 'content_lines',
            'start_line', 'end_line' and 'scope'. Sections without any content
            line are omitted.
        """
        lines = content.split('\n')
        sections = []
        current_section = {
            'title': file_path.stem,  # Use filename as default title
            'level': 0,
            'content_lines': [],
            'start_line': 0,
            'scope': []
        }

        scope_stack = []  # [(level, title), ...]
        line_offset = 0   # original line number = index in `lines` + line_offset
        in_fence = False  # inside a fence that was not extracted

        for i, line in enumerate(lines):
            if line.startswith('```'):
                # Fences are still present when extraction is disabled (or a
                # fence is unterminated); '#' lines inside them are code.
                in_fence = not in_fence
                current_section['content_lines'].append(line)
                continue

            header_match = None if in_fence else self.header_pattern.match(line)

            if header_match:
                # Save current section if it has content
                if current_section['content_lines']:
                    current_section['end_line'] = i - 1 + line_offset
                    sections.append(current_section)

                level = len(header_match.group(1))
                title = header_match.group(2).strip()

                # Update scope stack
                while scope_stack and scope_stack[-1][0] >= level:
                    scope_stack.pop()
                scope_stack.append((level, title))

                # Create new section
                current_section = {
                    'title': title,
                    'level': level,
                    'content_lines': [],
                    'start_line': i + line_offset,
                    'scope': [s[1] for s in scope_stack]
                }
                continue

            current_section['content_lines'].append(line)

            if code_blocks:
                placeholder = self._PLACEHOLDER_PATTERN.fullmatch(line)
                if placeholder:
                    index = int(placeholder.group(1))
                    # The line before a genuine placeholder is the emptied
                    # opening-fence line; requiring it to map onto the block's
                    # start guards against literal "[CODE_BLOCK_n]" text.
                    if (index < len(code_blocks)
                            and i - 1 + line_offset == code_blocks[index]['start_line']):
                        # The next line holds the remainder of the closing
                        # fence line, i.e. original line 'end_line'.
                        line_offset = code_blocks[index]['end_line'] - (i + 1)

        # Don't forget the last section
        if current_section['content_lines']:
            current_section['end_line'] = len(lines) - 1 + line_offset
            sections.append(current_section)

        return sections

    def _make_section_chunk(
        self,
        section: Dict,
        file_path: Path,
        text: str,
        context_prefix: str,
        line_range: Tuple[int, int]
    ) -> "MarkdownChunk":
        """Build one 'section' chunk for ``text`` taken from ``section``."""
        return MarkdownChunk(
            text=text,
            contextualized_text=context_prefix + text,
            chunk_type='section',
            name=section['title'],
            line_range=line_range,
            file_path=file_path,
            scope=section['scope'][:-1],  # Parent scope (excluding self)
            header_level=section['level'],
            links=self._extract_links(text)
        )

    def _chunk_section(self, section: Dict, file_path: Path) -> List["MarkdownChunk"]:
        """
        Create chunks from a section, splitting if too large.

        Args:
            section: Section dict as produced by _parse_sections.
            file_path: Source file recorded on every chunk.

        Returns:
            Zero or more 'section' chunks. Sections shorter than
            ``min_chunk_size`` (even after prepending their title) yield
            nothing; sections longer than ``max_chunk_size`` are split on
            paragraph boundaries. A single paragraph longer than the maximum
            is kept whole.
        """
        content = '\n'.join(section['content_lines']).strip()

        if not content or len(content) < self.min_chunk_size:
            # Skip very small sections or merge with title only
            if section['title'] and section['level'] > 0:
                content = f"# {section['title']}\n\n{content}" if content else f"# {section['title']}"
            if len(content) < self.min_chunk_size:
                return []

        scope = section['scope']

        # Build contextualized text
        if self.include_header_context and scope:
            context_prefix = " > ".join(scope) + "\n\n"
        else:
            context_prefix = ""

        start_line = section['start_line']

        if len(content) <= self.max_chunk_size:
            # Single chunk for this section
            line_range = (start_line, section.get('end_line', start_line))
            return [self._make_section_chunk(section, file_path, content, context_prefix, line_range)]

        # Split large sections by paragraphs
        chunks = []
        pending = []       # paragraphs collected for the chunk being built
        pending_size = 0   # length of '\n\n'.join(pending)
        chunk_start = start_line

        for para in self._split_by_paragraphs(content):
            # Length of the joined text if this paragraph were added; the
            # "\n\n" joiner counts toward max_chunk_size as well.
            projected = pending_size + len(para) + (2 if pending else 0)

            if pending and projected > self.max_chunk_size:
                # Emit current chunk
                chunk_content = '\n\n'.join(pending)
                # Line numbers of intermediate chunks are estimated from the
                # chunk text, since paragraph splitting drops exact positions.
                line_span = chunk_content.count('\n')
                chunks.append(self._make_section_chunk(
                    section, file_path, chunk_content, context_prefix,
                    (chunk_start, chunk_start + line_span)
                ))
                chunk_start = chunk_start + line_span + 1
                pending = []
                projected = len(para)

            pending.append(para)
            pending_size = projected

        # Don't forget the last chunk
        if pending:
            chunk_content = '\n\n'.join(pending)
            chunks.append(self._make_section_chunk(
                section, file_path, chunk_content, context_prefix,
                (chunk_start, section.get('end_line', chunk_start))
            ))

        return chunks

    def _create_code_block_chunk(self, cb: Dict, file_path: Path) -> Optional["MarkdownChunk"]:
        """
        Create a chunk from an extracted code block.

        Args:
            cb: Block dict as produced by _extract_code_blocks.
            file_path: Source file recorded on the chunk.

        Returns:
            A 'code_block' chunk, or None when the code is shorter than half
            of ``min_chunk_size``.
        """
        code = cb['code']
        if len(code) < self.min_chunk_size // 2:  # Allow smaller code blocks
            return None

        language = cb['language']
        scope = cb['scope']

        # Build contextualized text
        if self.include_header_context and scope:
            context_prefix = " > ".join(scope) + f"\n\nCode example ({language}):\n\n"
        else:
            context_prefix = f"Code example ({language}):\n\n"

        return MarkdownChunk(
            text=f"```{language}\n{code}\n```",
            contextualized_text=context_prefix + code,
            chunk_type='code_block',
            name=f"{language} example",
            line_range=(cb['start_line'], cb['end_line']),
            file_path=file_path,
            scope=scope,
            header_level=0,
            code_language=language,
            code_content=code
        )

    def _split_by_paragraphs(self, content: str) -> List[str]:
        """Split content on blank lines and return the non-empty, stripped paragraphs."""
        paragraphs = re.split(r'\n\s*\n', content)
        return [p.strip() for p in paragraphs if p.strip()]

    def _extract_links(self, content: str) -> List[Dict[str, str]]:
        """
        Extract markdown links from content.

        Returns:
            One {"text": ..., "url": ...} dict per ``[text](url)`` link, in
            order of appearance. Image references (``![alt](src)``) are skipped.
        """
        links = []

        for match in self.link_pattern.finditer(content):
            # Skip images: the character just before '[' is '!'. The slice is
            # empty (never wraps around) when the match starts at offset 0.
            if content[match.start() - 1:match.start()] == '!':
                continue
            links.append({
                'text': match.group(1),
                'url': match.group(2)
            })

        return links
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
# Convenience function
|
|
378
|
+
def chunk_markdown_file(file_path: Path, **kwargs) -> List[MarkdownChunk]:
    """
    Chunk one markdown file with a freshly constructed MarkdownChunker.

    Args:
        file_path: Path to the markdown file
        **kwargs: Forwarded unchanged to the MarkdownChunker constructor

    Returns:
        List of MarkdownChunk objects
    """
    return MarkdownChunker(**kwargs).chunk_file(file_path)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
if __name__ == "__main__":
    # Manual smoke test: chunk the file named on the command line
    # (README.md when no argument is given) and print a summary per chunk.
    import sys

    test_file = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("README.md")

    if not test_file.exists():
        print(f"File not found: {test_file}")
    else:
        chunker = MarkdownChunker()
        chunks = chunker.chunk_file(test_file)

        print(f"\n📄 Chunked {test_file.name}: {len(chunks)} chunks\n")

        for i, chunk in enumerate(chunks, 1):
            first_line, last_line = chunk.line_range[0], chunk.line_range[1]

            print(f"{'='*60}")
            print(f"Chunk {i}: {chunk.name} ({chunk.chunk_type})")
            print(f" Scope: {' > '.join(chunk.scope) if chunk.scope else '(root)'}")
            print(f" Lines: {first_line}-{last_line}")
            print(f" Size: {chunk.size_chars} chars")
            # Optional details are only shown when present.
            if chunk.code_language:
                print(f" Language: {chunk.code_language}")
            if chunk.links:
                print(f" Links: {len(chunk.links)}")
            print(f"\n Preview: {chunk.text[:150]}...")
            print()
|