code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,421 @@
1
+ """
2
+ Markdown Chunker for Claude Context
3
+
4
+ Section-aware chunking of markdown files (README, docs) that:
5
+ 1. Preserves header hierarchy (H1 > H2 > H3) as scope chains
6
+ 2. Extracts code blocks with language tags
7
+ 3. Tracks links and references
8
+ 4. Creates semantically meaningful chunks for retrieval
9
+
10
+ This complements the AST chunker by handling documentation content
11
+ that explains "what/why/how-to-use" rather than implementation details.
12
+ """
13
+
14
+ import re
15
+ import logging
16
+ from pathlib import Path
17
+ from dataclasses import dataclass, field
18
+ from typing import List, Dict, Any, Optional, Tuple
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@dataclass
class MarkdownChunk:
    """One retrievable unit cut from a markdown document.

    The first five fields are required and positional; everything after
    them is optional metadata with a default.
    """

    # --- Required content -------------------------------------------------
    # Markdown exactly as stored for this chunk.
    text: str
    # Same content with the header trail (and, for code, a short label)
    # placed in front of it.
    contextualized_text: str
    # Kind of chunk: 'section', 'code_block', 'list' or 'paragraph'.
    chunk_type: str
    # Section title, or a "<language> example" label for code blocks.
    name: str
    # Inclusive (start, end) line span, counted from 0.
    line_range: Tuple[int, int]

    # --- Origin -----------------------------------------------------------
    file_path: Optional[Path] = field(default=None)

    # --- Position in the header tree --------------------------------------
    # Header trail, e.g. ["Getting Started", "Installation"].
    scope: List[str] = field(default_factory=list)
    # 1 for H1, 2 for H2, ...; 0 when the chunk is not tied to a header.
    header_level: int = field(default=0)

    # --- Code-block details (left as None for prose chunks) ---------------
    code_language: Optional[str] = field(default=None)
    code_content: Optional[str] = field(default=None)

    # --- Hyperlinks seen in the chunk: [{"text": "...", "url": "..."}] -----
    links: List[Dict[str, str]] = field(default_factory=list)

    @property
    def size_chars(self) -> int:
        """Number of characters in ``text``."""
        raw = self.text
        return len(raw)
49
+
50
+
51
class MarkdownChunker:
    """
    Markdown-aware chunker that creates semantically meaningful chunks.

    Unlike generic text splitting, this:
    - Keeps sections together as logical units
    - Tracks header hierarchy for scope chains
    - Extracts code blocks as separate, searchable chunks
    - Preserves context through header prepending

    All line numbers produced by this class are 0-indexed and refer to the
    text that was passed in (or read from the file).
    """

    # A line that opens or closes a fenced code block: up to three spaces of
    # indentation, a run of 3+ backticks or tildes, then an optional info
    # string that contains no backtick.
    _FENCE_PATTERN = re.compile(r'^ {0,3}(`{3,}|~{3,})([^`]*)$')

    # The blank-line padding that _extract_code_blocks leaves after a
    # placeholder so that line numbers stay aligned with the input.
    _PLACEHOLDER_PADDING = re.compile(r'(\[CODE_BLOCK_\d+\])\n{3,}')

    def __init__(
        self,
        max_chunk_size: int = 2000,
        min_chunk_size: int = 100,
        extract_code_blocks: bool = True,
        include_header_context: bool = True
    ):
        """
        Initialize the markdown chunker.

        Args:
            max_chunk_size: Maximum characters per chunk. A single paragraph
                longer than this is still emitted whole (it is never cut
                mid-paragraph).
            min_chunk_size: Minimum characters to create a chunk
            extract_code_blocks: Whether to extract code blocks as separate chunks
            include_header_context: Whether to prepend header hierarchy to chunks
        """
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.extract_code_blocks = extract_code_blocks
        self.include_header_context = include_header_context

        # Regex patterns
        # [ \t]+ (not \s+) so a bare "#" line cannot swallow the following
        # line as its title when the pattern is used across several lines.
        self.header_pattern = re.compile(r'^(#{1,6})[ \t]+(.+)$', re.MULTILINE)
        # group(1) is the whole info string ("python", "c++", "js title=x");
        # group(2) is the code between the fences.
        self.code_block_pattern = re.compile(
            r'^```([^\n`]*)\n(.*?)^```',
            re.MULTILINE | re.DOTALL
        )
        # Link text may not contain "[" so that "[![alt](img)](url)" is not
        # misread as a link whose text is "![alt".
        self.link_pattern = re.compile(r'\[([^\[\]]+)\]\(([^)]+)\)')
        self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

    def chunk_file(self, file_path: Path, content: Optional[str] = None) -> List["MarkdownChunk"]:
        """
        Chunk a markdown file into semantic units.

        Section chunks come first (in document order), followed by the
        extracted code-block chunks (in document order).

        Args:
            file_path: Path to the markdown file
            content: Optional content string (if not provided, reads from file)

        Returns:
            List of MarkdownChunk objects; empty if the file cannot be read
            or contains only whitespace.
        """
        if content is None:
            try:
                content = file_path.read_text(encoding='utf-8')
            except Exception as e:
                # Best-effort by design: an unreadable file yields no chunks
                # instead of aborting the caller.
                logger.error(f"Failed to read {file_path}: {e}")
                return []

        if not content.strip():
            return []

        # First, extract and replace code blocks with placeholders
        code_blocks = []
        if self.extract_code_blocks:
            content, code_blocks = self._extract_code_blocks(content)

        chunks = []

        # Create chunks from the header-delimited sections
        for section in self._parse_sections(content, file_path):
            chunks.extend(self._chunk_section(section, file_path))

        # Add code blocks as separate chunks with their section context
        for cb in code_blocks:
            chunk = self._create_code_block_chunk(cb, file_path)
            if chunk is not None:
                chunks.append(chunk)

        logger.debug(f"Created {len(chunks)} chunks from {file_path.name}")
        return chunks

    def _fence_state(self, line: str, open_fence: Optional[str]) -> Optional[str]:
        """
        Track whether we are inside a fenced code block.

        Args:
            line: The current line (without its newline)
            open_fence: Marker of the fence currently open, or None

        Returns:
            The marker of the open fence after this line, or None when no
            fence is open.
        """
        match = self._FENCE_PATTERN.match(line)
        if match is None:
            return open_fence

        marker = match.group(1)
        if open_fence is None:
            return marker

        # A fence is closed only by the same character, at least as long as
        # the opener, with nothing but whitespace after it.
        closes = (
            marker[0] == open_fence[0]
            and len(marker) >= len(open_fence)
            and not match.group(2).strip()
        )
        return None if closes else open_fence

    @staticmethod
    def _push_header(stack: List[Tuple[int, str]], level: int, title: str) -> None:
        """Update a [(level, title), ...] stack in place for a new header."""
        # Drop siblings and deeper headers; what remains are true ancestors.
        while stack and stack[-1][0] >= level:
            stack.pop()
        stack.append((level, title))

    def _extract_code_blocks(self, content: str) -> Tuple[str, List[Dict]]:
        """
        Extract code blocks and replace with placeholders.

        Each block is replaced by "[CODE_BLOCK_<i>]" padded with newlines so
        the returned text has exactly as many lines as the input.

        Returns:
            (text with placeholders, list of dicts with keys 'language',
            'code', 'start_line', 'end_line', 'scope', 'full_match')
        """
        code_blocks = []
        pieces = []
        cursor = 0   # end of the previous match in `content`
        line_no = 0  # line number of `cursor`

        for i, match in enumerate(self.code_block_pattern.finditer(content)):
            info = match.group(1).strip()
            language = info.split()[0] if info else 'text'
            code = match.group(2).strip()
            block_text = match.group(0)
            newline_count = block_text.count('\n')  # always >= 1

            # Line numbers come from the raw match, so blank lines at the
            # edges of the code (removed by strip()) are still counted.
            line_no += content.count('\n', cursor, match.start())
            start_line = line_no
            end_line = start_line + newline_count

            # Find the section context (headers before this code block)
            scope = self._find_scope_at_position(content, match.start())

            code_blocks.append({
                'language': language,
                'code': code,
                'start_line': start_line,
                'end_line': end_line,
                'scope': scope,
                'full_match': block_text
            })

            # Rebuild from match spans rather than str.replace(), which could
            # hit an identical piece of text elsewhere in the document.
            pieces.append(content[cursor:match.start()])
            pieces.append(f"\n[CODE_BLOCK_{i}]" + '\n' * (newline_count - 1))

            cursor = match.end()
            line_no = end_line

        pieces.append(content[cursor:])
        return ''.join(pieces), code_blocks

    def _find_scope_at_position(self, content: str, position: int) -> List[str]:
        """
        Find the header hierarchy at a given position in the content.

        Headers are tracked by level, so a header replaces any earlier header
        of the same or deeper level. Lines inside fenced code blocks are
        ignored, so "# comment" lines in code do not count as headers.

        Args:
            content: Full markdown text
            position: Character offset into `content`

        Returns:
            Titles of the enclosing headers, outermost first.
        """
        stack = []  # [(level, title), ...]
        open_fence = None

        for line in content[:position].split('\n'):
            was_open = open_fence
            open_fence = self._fence_state(line, open_fence)
            if was_open is not None or open_fence is not None:
                continue  # fence line or code line

            header_match = self.header_pattern.match(line)
            if header_match:
                self._push_header(
                    stack,
                    len(header_match.group(1)),
                    header_match.group(2).strip()
                )

        return [title for _, title in stack]

    def _parse_sections(self, content: str, file_path: Path) -> List[Dict]:
        """
        Parse markdown into sections based on headers.

        Text before the first header becomes a level-0 section titled with
        the file stem. A section is only kept if at least one line follows
        its header. Header-like lines inside fenced code are treated as
        ordinary content.

        Returns:
            List of dicts with keys 'title', 'level', 'content_lines',
            'start_line', 'end_line', 'scope' (scope includes the section's
            own title as its last element).
        """
        lines = content.split('\n')
        sections = []
        current_section = {
            'title': file_path.stem,  # Use filename as default title
            'level': 0,
            'content_lines': [],
            'start_line': 0,
            'scope': []
        }

        scope_stack = []  # [(level, title), ...]
        open_fence = None

        for i, line in enumerate(lines):
            was_open = open_fence
            open_fence = self._fence_state(line, open_fence)
            in_code = was_open is not None or open_fence is not None

            header_match = None if in_code else self.header_pattern.match(line)
            if header_match is None:
                current_section['content_lines'].append(line)
                continue

            # Save current section if it has content
            if current_section['content_lines']:
                current_section['end_line'] = i - 1
                sections.append(current_section)

            level = len(header_match.group(1))
            title = header_match.group(2).strip()
            self._push_header(scope_stack, level, title)

            # Create new section
            current_section = {
                'title': title,
                'level': level,
                'content_lines': [],
                'start_line': i,
                'scope': [s[1] for s in scope_stack]
            }

        # Don't forget the last section
        if current_section['content_lines']:
            current_section['end_line'] = len(lines) - 1
            sections.append(current_section)

        return sections

    def _build_section_chunk(
        self,
        section: Dict,
        file_path: Path,
        text: str,
        context_prefix: str,
        line_range: Tuple[int, int]
    ) -> "MarkdownChunk":
        """Create one 'section' chunk for `text` taken from `section`."""
        scope = section['scope']
        return MarkdownChunk(
            text=text,
            contextualized_text=context_prefix + text,
            chunk_type='section',
            name=section['title'],
            line_range=line_range,
            file_path=file_path,
            scope=scope[:-1] if scope else [],  # Parent scope (excluding self)
            header_level=section['level'],
            links=self._extract_links(text)
        )

    @staticmethod
    def _paragraph_spans(lines: List[str], first_line: int) -> List[Tuple[str, int, int]]:
        """
        Group lines into paragraphs separated by whitespace-only lines.

        Args:
            lines: Consecutive lines of text
            first_line: Line number of lines[0]

        Returns:
            List of (stripped paragraph text, start line, end line).
        """
        spans = []
        run = []
        run_start = first_line

        for offset, line in enumerate(lines):
            if line.strip():
                if not run:
                    run_start = first_line + offset
                run.append(line)
            elif run:
                spans.append(('\n'.join(run).strip(), run_start, first_line + offset - 1))
                run = []

        if run:
            spans.append(('\n'.join(run).strip(), run_start, first_line + len(lines) - 1))

        return spans

    def _chunk_section(self, section: Dict, file_path: Path) -> List["MarkdownChunk"]:
        """
        Create chunks from a section, splitting if too large.

        Sections shorter than min_chunk_size get their title prepended as
        "# <title>"; if still too short they produce no chunk. Sections
        longer than max_chunk_size are split at paragraph boundaries.

        Args:
            section: A section dict as produced by _parse_sections
            file_path: Path recorded on the resulting chunks

        Returns:
            List of MarkdownChunk objects (possibly empty)
        """
        content = '\n'.join(section['content_lines']).strip()
        # Remove the line-preserving padding after code-block placeholders so
        # it neither bloats the text nor counts toward the size limits.
        content = self._PLACEHOLDER_PADDING.sub(r'\1\n\n', content)

        heading_paragraph = None
        if not content or len(content) < self.min_chunk_size:
            # Skip very small sections or merge with title only
            if section['title'] and section['level'] > 0:
                heading_paragraph = f"# {section['title']}"
                content = f"{heading_paragraph}\n\n{content}" if content else heading_paragraph
            if len(content) < self.min_chunk_size:
                return []

        scope = section['scope']

        # Build contextualized text
        if self.include_header_context and scope:
            context_prefix = " > ".join(scope) + "\n\n"
        else:
            context_prefix = ""

        first_line = section['start_line']

        if len(content) <= self.max_chunk_size:
            # Single chunk for this section
            line_range = (first_line, section.get('end_line', first_line))
            return [self._build_section_chunk(section, file_path, content, context_prefix, line_range)]

        # Split large sections by paragraphs, keeping each paragraph's line
        # span. For header sections 'start_line' is the header line, so the
        # body starts one line later; the level-0 preamble starts on its own
        # 'start_line' (this matches what _parse_sections produces).
        body_start = first_line + (1 if section['level'] > 0 else 0)
        paragraphs = self._paragraph_spans(section['content_lines'], body_start)
        if heading_paragraph is not None:
            paragraphs.insert(0, (heading_paragraph, first_line, first_line))

        chunks = []
        pending = []        # paragraph texts of the chunk being built
        pending_size = 0    # length of '\n\n'.join(pending)
        pending_end = first_line
        chunk_first_line = first_line

        for text, para_start, para_end in paragraphs:
            separator = 2 if pending else 0  # the '\n\n' joiner

            if pending and pending_size + separator + len(text) > self.max_chunk_size:
                # Emit current chunk
                chunks.append(self._build_section_chunk(
                    section, file_path, '\n\n'.join(pending), context_prefix,
                    (chunk_first_line, pending_end)
                ))
                pending = []
                pending_size = 0
                separator = 0
                chunk_first_line = para_start

            pending.append(text)
            pending_size += separator + len(text)
            pending_end = para_end

        # Don't forget the last chunk
        if pending:
            chunks.append(self._build_section_chunk(
                section, file_path, '\n\n'.join(pending), context_prefix,
                (chunk_first_line, section.get('end_line', pending_end))
            ))

        return chunks

    def _create_code_block_chunk(self, cb: Dict, file_path: Path) -> Optional["MarkdownChunk"]:
        """
        Create a chunk from an extracted code block.

        Args:
            cb: A code-block dict as produced by _extract_code_blocks
            file_path: Path recorded on the resulting chunk

        Returns:
            The chunk, or None when the code is shorter than half of
            min_chunk_size.
        """
        code = cb['code']
        if len(code) < self.min_chunk_size // 2:  # Allow smaller code blocks
            return None

        language = cb['language']
        scope = cb['scope']

        # Build contextualized text
        if self.include_header_context and scope:
            context_prefix = " > ".join(scope) + f"\n\nCode example ({language}):\n\n"
        else:
            context_prefix = f"Code example ({language}):\n\n"

        return MarkdownChunk(
            text=f"```{language}\n{code}\n```",
            contextualized_text=context_prefix + code,
            chunk_type='code_block',
            name=f"{language} example",
            line_range=(cb['start_line'], cb['end_line']),
            file_path=file_path,
            scope=scope,
            header_level=0,
            code_language=language,
            code_content=code
        )

    def _split_by_paragraphs(self, content: str) -> List[str]:
        """Split content into stripped, non-empty paragraphs (blank-line separated)."""
        paragraphs = re.split(r'\n\s*\n', content)
        return [p.strip() for p in paragraphs if p.strip()]

    def _extract_links(self, content: str) -> List[Dict[str, str]]:
        """
        Extract markdown links from content.

        Images ("![alt](src)") are skipped.

        Returns:
            List of {"text": ..., "url": ...} dicts in order of appearance.
        """
        links = []

        for match in self.link_pattern.finditer(content):
            start = match.start()
            # Skip images: same syntax as a link, preceded by "!"
            if start > 0 and content[start - 1] == '!':
                continue
            links.append({
                'text': match.group(1),
                'url': match.group(2)
            })

        return links
375
+
376
+
377
+ # Convenience function
378
def chunk_markdown_file(file_path: Path, **kwargs) -> List[MarkdownChunk]:
    """
    One-call helper: build a MarkdownChunker and chunk a single file.

    Args:
        file_path: Path to the markdown file
        **kwargs: Forwarded unchanged to the MarkdownChunker constructor;
            with none given, the chunker's defaults apply

    Returns:
        List of MarkdownChunk objects
    """
    return MarkdownChunker(**kwargs).chunk_file(file_path)
391
+
392
+
393
if __name__ == "__main__":
    # Manual smoke test: chunk the file named on the command line
    # (README.md when no argument is given) and print a per-chunk summary.
    import sys

    test_file = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("README.md")

    if not test_file.exists():
        print(f"File not found: {test_file}")
    else:
        chunker = MarkdownChunker()
        chunks = chunker.chunk_file(test_file)

        print(f"\n📄 Chunked {test_file.name}: {len(chunks)} chunks\n")

        for i, chunk in enumerate(chunks, 1):
            scope_label = ' > '.join(chunk.scope) if chunk.scope else '(root)'
            first_line, last_line = chunk.line_range[0], chunk.line_range[1]

            print('=' * 60)
            print(f"Chunk {i}: {chunk.name} ({chunk.chunk_type})")
            print(f"  Scope: {scope_label}")
            print(f"  Lines: {first_line}-{last_line}")
            print(f"  Size: {chunk.size_chars} chars")
            if chunk.code_language:
                print(f"  Language: {chunk.code_language}")
            if chunk.links:
                print(f"  Links: {len(chunk.links)}")
            print(f"\n  Preview: {chunk.text[:150]}...")
            print()