git-llm-tool 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
+ """Smart chunking strategies for git diffs."""
+
+ import re
+ from typing import List, Tuple, Optional
+ from dataclasses import dataclass
+ from langchain_core.documents import Document
+
+
+ @dataclass
+ class ChunkInfo:
+     """Information about a chunk."""
+     content: str
+     file_path: Optional[str]
+     chunk_type: str  # 'file', 'hunk', 'size-based'
+     size: int
+     is_complete_file: bool
+
+
+ class SmartChunker:
+     """Smart chunker that prioritizes file-based splitting over size-based."""
+
+     def __init__(self, chunk_size: int = 10000, chunk_overlap: int = 300):
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+
+     def chunk_diff(self, diff: str) -> List[ChunkInfo]:
+         """
+         Intelligently chunk a git diff.
+
+         Strategy:
+         1. First try to split by files
+         2. If files are too large, split by hunks
+         3. If hunks are still too large, apply size-based splitting to oversized hunks
+         4. As last resort, use pure size-based splitting
+         """
+         chunks = []
+
+         # Split diff into files
+         file_sections = self._split_by_files(diff)
+
+         for file_path, file_content in file_sections:
+             if len(file_content) <= self.chunk_size:
+                 # File fits in one chunk
+                 chunks.append(ChunkInfo(
+                     content=file_content,
+                     file_path=file_path,
+                     chunk_type='file',
+                     size=len(file_content),
+                     is_complete_file=True
+                 ))
+             else:
+                 # File is too large, try to split by hunks
+                 hunk_chunks = self._split_file_by_hunks(file_content, file_path)
+                 if hunk_chunks:
+                     # Check if any hunk chunks are still too large
+                     final_chunks = []
+                     for hunk_chunk in hunk_chunks:
+                         if len(hunk_chunk.content) <= self.chunk_size:
+                             # Hunk chunk is reasonable size
+                             final_chunks.append(hunk_chunk)
+                         else:
+                             # Hunk chunk is still too large, apply size-based splitting
+                             oversized_chunks = self._split_by_size(hunk_chunk.content, file_path)
+                             # Update chunk types to indicate mixed strategy
+                             for chunk in oversized_chunks:
+                                 chunk.chunk_type = 'hunk-size-based'
+                             final_chunks.extend(oversized_chunks)
+                     chunks.extend(final_chunks)
+                 else:
+                     # Fallback to pure size-based splitting
+                     size_chunks = self._split_by_size(file_content, file_path)
+                     chunks.extend(size_chunks)
+
+         return chunks
+
+     def _split_by_files(self, diff: str) -> List[Tuple[Optional[str], str]]:
+         """Split diff by files."""
+         files = []
+         current_file = []
+         current_path = None
+
+         lines = diff.split('\n')
+
+         for line in lines:
+             if line.startswith('diff --git'):
+                 # Start of new file
+                 if current_file:
+                     files.append((current_path, '\n'.join(current_file)))
+
+                 current_file = [line]
+                 # Extract file path
+                 match = re.search(r'diff --git a/(.+?) b/', line)
+                 current_path = match.group(1) if match else None
+             else:
+                 current_file.append(line)
+
+         # Add last file
+         if current_file:
+             files.append((current_path, '\n'.join(current_file)))
+
+         return files
+
+     def _split_file_by_hunks(self, file_content: str, file_path: Optional[str]) -> List[ChunkInfo]:
+         """Split a large file by hunks."""
+         chunks = []
+         lines = file_content.split('\n')
+
+         # Keep file header
+         header_lines = []
+         content_start = 0
+         found_hunk = False
+
+         for i, line in enumerate(lines):
+             if line.startswith('@@'):
+                 content_start = i
+                 found_hunk = True
+                 break
+             header_lines.append(line)
+
+         # Check if we found any hunks
+         if not found_hunk:
+             return []  # No hunk markers found, not a proper git diff format
+
+         header = '\n'.join(header_lines)
+
+         # Split by hunks
+         current_hunk = []
+         hunks = []
+
+         for line in lines[content_start:]:
+             if line.startswith('@@') and current_hunk:
+                 # Start of new hunk, save current
+                 hunks.append('\n'.join(current_hunk))
+                 current_hunk = [line]
+             else:
+                 current_hunk.append(line)
+
+         # Add last hunk
+         if current_hunk:
+             hunks.append('\n'.join(current_hunk))
+
+         # Create chunks from hunks
+         current_chunk_lines = header_lines[:]
+         current_size = len(header)
+
+         for hunk in hunks:
+             hunk_size = len(hunk)
+
+             if current_size + hunk_size <= self.chunk_size:
+                 # Add hunk to current chunk
+                 current_chunk_lines.extend(hunk.split('\n'))
+                 current_size += hunk_size
+             else:
+                 # Save current chunk and start new one
+                 if len(current_chunk_lines) > len(header_lines):
+                     chunks.append(ChunkInfo(
+                         content='\n'.join(current_chunk_lines),
+                         file_path=file_path,
+                         chunk_type='hunk',
+                         size=current_size,
+                         is_complete_file=False
+                     ))
+
+                 # Start new chunk with header + current hunk
+                 current_chunk_lines = header_lines[:] + hunk.split('\n')
+                 current_size = len(header) + hunk_size
+
+         # Add final chunk
+         if len(current_chunk_lines) > len(header_lines):
+             chunks.append(ChunkInfo(
+                 content='\n'.join(current_chunk_lines),
+                 file_path=file_path,
+                 chunk_type='hunk',
+                 size=current_size,
+                 is_complete_file=False
+             ))
+
+         return chunks
+
+     def _split_by_size(self, content: str, file_path: Optional[str]) -> List[ChunkInfo]:
+         """Fallback size-based splitting."""
+         chunks = []
+         lines = content.split('\n')
+
+         current_chunk = []
+         current_size = 0
+
+         for line in lines:
+             line_size = len(line) + 1  # +1 for newline
+
+             if current_size + line_size > self.chunk_size and current_chunk:
+                 # Save current chunk
+                 chunks.append(ChunkInfo(
+                     content='\n'.join(current_chunk),
+                     file_path=file_path,
+                     chunk_type='size-based',
+                     size=current_size,
+                     is_complete_file=False
+                 ))
+
+                 # Start new chunk with overlap
+                 if self.chunk_overlap > 0:
+                     overlap_lines = current_chunk[-self.chunk_overlap // 50:]  # Rough estimation (~50 chars per line)
+                     current_chunk = overlap_lines + [line]
+                     current_size = sum(len(l) + 1 for l in current_chunk)
+                 else:
+                     current_chunk = [line]
+                     current_size = line_size
+             else:
+                 current_chunk.append(line)
+                 current_size += line_size
+
+         # Add final chunk
+         if current_chunk:
+             chunks.append(ChunkInfo(
+                 content='\n'.join(current_chunk),
+                 file_path=file_path,
+                 chunk_type='size-based',
+                 size=current_size,
+                 is_complete_file=False
+             ))
+
+         return chunks
+
+     def chunks_to_documents(self, chunks: List[ChunkInfo]) -> List[Document]:
+         """Convert ChunkInfo to LangChain Documents."""
+         documents = []
+
+         for i, chunk in enumerate(chunks):
+             metadata = {
+                 'chunk_id': i,
+                 'file_path': chunk.file_path,
+                 'chunk_type': chunk.chunk_type,
+                 'size': chunk.size,
+                 'is_complete_file': chunk.is_complete_file
+             }
+
+             documents.append(Document(
+                 page_content=chunk.content,
+                 metadata=metadata
+             ))
+
+         return documents
+
+     def get_chunking_stats(self, chunks: List[ChunkInfo]) -> dict:
+         """Get statistics about the chunking process."""
+         total_size = sum(chunk.size for chunk in chunks)
+         file_chunks = len([c for c in chunks if c.chunk_type == 'file'])
+         hunk_chunks = len([c for c in chunks if c.chunk_type == 'hunk'])
+         size_chunks = len([c for c in chunks if c.chunk_type == 'size-based'])
+         mixed_chunks = len([c for c in chunks if c.chunk_type == 'hunk-size-based'])
+
+         return {
+             'total_chunks': len(chunks),
+             'total_size': total_size,
+             'file_chunks': file_chunks,
+             'hunk_chunks': hunk_chunks,
+             'size_based_chunks': size_chunks,
+             'mixed_hunk_size_chunks': mixed_chunks,
+             'average_chunk_size': total_size // len(chunks) if chunks else 0,
+             'complete_files': len([c for c in chunks if c.is_complete_file])
+         }
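The chunk_diff docstring above describes a four-step fallback (whole file → hunks → oversized hunks split by size → pure size-based splitting). A minimal usage sketch of that flow, using only the SmartChunker API shown in this file; the import path, the sample diff file, and the 2000-character chunk size are illustrative assumptions, not package defaults:

    from git_llm_tool.core.diff_chunker import SmartChunker  # import path assumed; only the class is shown in this diff

    chunker = SmartChunker(chunk_size=2000, chunk_overlap=100)  # illustrative sizes (defaults are 10000 / 300)
    with open("example.diff") as f:                             # any `git diff` output
        diff_text = f.read()

    chunks = chunker.chunk_diff(diff_text)             # List[ChunkInfo], split per file where possible
    documents = chunker.chunks_to_documents(chunks)    # LangChain Documents carrying chunk metadata
    print(chunker.get_chunking_stats(chunks))          # counts per chunk_type, average size, complete files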
@@ -0,0 +1,169 @@
+ """Accurate token counting using tiktoken."""
+
+ import tiktoken
+ from typing import Optional, Dict
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class TokenStats:
+     """Token counting statistics."""
+     text_length: int
+     token_count: int
+     tokens_per_char: float
+     model_used: str
+
+
+ class TokenCounter:
+     """Accurate token counter using tiktoken."""
+
+     # Model to encoding mapping
+     MODEL_ENCODINGS = {
+         # OpenAI models
+         'gpt-4': 'cl100k_base',
+         'gpt-4-turbo': 'cl100k_base',
+         'gpt-4o': 'o200k_base',
+         'gpt-4o-mini': 'o200k_base',
+         'gpt-3.5-turbo': 'cl100k_base',
+         'text-embedding-3-small': 'cl100k_base',
+         'text-embedding-3-large': 'cl100k_base',
+
+         # Anthropic models (use OpenAI-compatible encoding as an approximation)
+         'claude-3-sonnet': 'cl100k_base',
+         'claude-3-haiku': 'cl100k_base',
+         'claude-3-opus': 'cl100k_base',
+         'claude-3.5-sonnet': 'cl100k_base',
+
+         # Fallback
+         'default': 'cl100k_base'
+     }
+
+     def __init__(self, model_name: str = "gpt-4o"):
+         """Initialize token counter for a specific model."""
+         self.model_name = model_name
+         self.encoding_name = self._get_encoding_name(model_name)
+
+         try:
+             self.encoding = tiktoken.get_encoding(self.encoding_name)
+         except Exception:
+             # Fall back to the default encoding
+             self.encoding = tiktoken.get_encoding('cl100k_base')
+             self.encoding_name = 'cl100k_base'
+
+     def _get_encoding_name(self, model_name: str) -> str:
+         """Get the appropriate encoding name for a model."""
+         # Try an exact match first
+         if model_name in self.MODEL_ENCODINGS:
+             return self.MODEL_ENCODINGS[model_name]
+
+         # Try partial matches
+         model_lower = model_name.lower()
+         for model_key in self.MODEL_ENCODINGS:
+             if model_key in model_lower or model_lower in model_key:
+                 return self.MODEL_ENCODINGS[model_key]
+
+         # Default fallback
+         return self.MODEL_ENCODINGS['default']
+
+     def count_tokens(self, text: str) -> int:
+         """Count tokens in text accurately."""
+         if not text:
+             return 0
+
+         try:
+             return len(self.encoding.encode(text))
+         except Exception:
+             # Fall back to a rough estimate (~4 characters per token) if encoding fails
+             return len(text) // 4
+
+     def get_token_stats(self, text: str) -> TokenStats:
+         """Get detailed token statistics."""
+         token_count = self.count_tokens(text)
+         text_length = len(text)
+
+         return TokenStats(
+             text_length=text_length,
+             token_count=token_count,
+             tokens_per_char=token_count / text_length if text_length > 0 else 0,
+             model_used=f"{self.model_name} ({self.encoding_name})"
+         )
+
+     def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
+         """Truncate text to a specific token count."""
+         if not text:
+             return text
+
+         try:
+             tokens = self.encoding.encode(text)
+             if len(tokens) <= max_tokens:
+                 return text
+
+             # Truncate and decode
+             truncated_tokens = tokens[:max_tokens]
+             return self.encoding.decode(truncated_tokens)
+         except Exception:
+             # Fall back to character-based truncation
+             estimated_chars = max_tokens * 4
+             return text[:estimated_chars] if len(text) > estimated_chars else text
+
+     def split_by_tokens(self, text: str, max_tokens: int, overlap: int = 0) -> list[str]:
+         """Split text into chunks by token count."""
+         if not text:
+             return []
+
+         try:
+             tokens = self.encoding.encode(text)
+             if len(tokens) <= max_tokens:
+                 return [text]
+
+             chunks = []
+             start = 0
+
+             while start < len(tokens):
+                 end = min(start + max_tokens, len(tokens))
+                 chunk_tokens = tokens[start:end]
+                 chunk_text = self.encoding.decode(chunk_tokens)
+                 chunks.append(chunk_text)
+
+                 # Stop once the end is reached; otherwise step back by the overlap.
+                 # (Stepping back unconditionally would repeat the tail chunk forever when overlap > 0.)
+                 if end >= len(tokens):
+                     break
+                 start = end - overlap
+
+             return chunks
+         except Exception:
+             # Fall back to character-based splitting
+             estimated_chars = max_tokens * 4
+             overlap_chars = overlap * 4
+
+             chunks = []
+             start = 0
+
+             while start < len(text):
+                 end = min(start + estimated_chars, len(text))
+                 chunks.append(text[start:end])
+                 if end >= len(text):
+                     break
+                 start = end - overlap_chars
+
+             return chunks
+
+     def estimate_cost(self, text: str, input_cost_per_1k: float = 0.0, output_cost_per_1k: float = 0.0) -> dict:
+         """Estimate API cost based on token count."""
+         token_count = self.count_tokens(text)
+
+         return {
+             'tokens': token_count,
+             'input_cost': (token_count / 1000) * input_cost_per_1k,
+             'output_cost': (token_count / 1000) * output_cost_per_1k,
+             'total_cost': (token_count / 1000) * (input_cost_per_1k + output_cost_per_1k)
+         }
+
+     def is_within_limit(self, text: str, max_tokens: int) -> bool:
+         """Check if text is within a token limit."""
+         return self.count_tokens(text) <= max_tokens
+
+     @classmethod
+     def create_for_model(cls, model_name: str) -> 'TokenCounter':
+         """Factory method to create a counter for a specific model."""
+         return cls(model_name)
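The counter above prefers exact tiktoken counts and degrades to a rough 4-characters-per-token estimate when encoding fails. A short hedged example using only the methods defined in this file; the import path and the per-1k cost figure are assumptions for illustration:

    from git_llm_tool.core.token_counter import TokenCounter  # import path assumed from context

    counter = TokenCounter("gpt-4o")                   # maps to the o200k_base encoding
    text = "\n".join(f"line {i}" for i in range(500))

    print(counter.count_tokens(text))                  # exact tiktoken count (len(text) // 4 on failure)
    print(counter.is_within_limit(text, 4096))         # compare against a model's context limit
    pieces = counter.split_by_tokens(text, max_tokens=1000, overlap=50)
    print(counter.estimate_cost(text, input_cost_per_1k=0.005))  # illustrative price, not a real quote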
@@ -0,0 +1,21 @@
+ """LLM providers module."""
+
+ from git_llm_tool.providers.base import LlmProvider
+ from git_llm_tool.providers.factory import get_provider
+
+ # LangChain providers (primary providers)
+ from git_llm_tool.providers.openai_langchain import OpenAiLangChainProvider
+ from git_llm_tool.providers.anthropic_langchain import AnthropicLangChainProvider
+ from git_llm_tool.providers.azure_openai_langchain import AzureOpenAiLangChainProvider
+ from git_llm_tool.providers.ollama_langchain import OllamaLangChainProvider
+ from git_llm_tool.providers.gemini_langchain import GeminiLangChainProvider
+
+ __all__ = [
+     "LlmProvider",
+     "get_provider",
+     "OpenAiLangChainProvider",
+     "AnthropicLangChainProvider",
+     "AzureOpenAiLangChainProvider",
+     "OllamaLangChainProvider",
+     "GeminiLangChainProvider"
+ ]
@@ -0,0 +1,42 @@
+ """Anthropic Claude LangChain provider implementation."""
+
+ from langchain_anthropic import ChatAnthropic
+ from langchain_core.language_models import BaseLanguageModel
+
+ from git_llm_tool.core.config import AppConfig
+ from git_llm_tool.core.exceptions import ApiError
+ from git_llm_tool.providers.langchain_base import LangChainProvider
+
+
+ class AnthropicLangChainProvider(LangChainProvider):
+     """Anthropic Claude provider using LangChain with intelligent chunking support."""
+
+     def _create_llm(self) -> BaseLanguageModel:
+         """Create Anthropic LangChain LLM instance."""
+         # Get API key
+         api_key = self.config.llm.api_keys.get("anthropic")
+         if not api_key:
+             raise ApiError("Anthropic API key not found in configuration")
+
+         # Determine model
+         model = self.config.llm.default_model
+         if not model.startswith("claude-"):
+             # Fall back to Claude 3.5 Sonnet if the configured model doesn't look like an Anthropic model
+             model = "claude-3-5-sonnet-20241022"
+
+         try:
+             # Create LangChain Anthropic instance
+             return ChatAnthropic(
+                 api_key=api_key,
+                 model=model,
+                 temperature=0.7,
+                 max_tokens=500,  # Increased for better commit messages
+                 # LangChain will handle retries and error handling automatically
+             )
+
+         except Exception as e:
+             raise ApiError(f"Failed to create Anthropic LangChain instance: {e}")
+
+     def __str__(self) -> str:
+         """String representation for debugging."""
+         return f"AnthropicLangChainProvider(model={self.llm.model})"
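AnthropicLangChainProvider only reads two values from the configuration: config.llm.api_keys["anthropic"] and config.llm.default_model. A hedged sketch of that slice of the settings, written as a plain dict because the actual AppConfig layout lives in git_llm_tool.core.config and is not part of this diff:

    # Values consumed by AnthropicLangChainProvider._create_llm above (shape assumed):
    llm_settings = {
        "api_keys": {"anthropic": "sk-ant-..."},        # placeholder, never commit a real key
        "default_model": "claude-3-5-sonnet-20241022",  # anything not starting with "claude-" falls back to this model
    }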
@@ -0,0 +1,59 @@
+ """Azure OpenAI LangChain provider implementation."""
+
+ from langchain_openai import AzureChatOpenAI
+ from langchain_core.language_models import BaseLanguageModel
+
+ from git_llm_tool.core.config import AppConfig
+ from git_llm_tool.core.exceptions import ApiError
+ from git_llm_tool.providers.langchain_base import LangChainProvider
+
+
+ class AzureOpenAiLangChainProvider(LangChainProvider):
+     """Azure OpenAI provider using LangChain with intelligent chunking support."""
+
+     def _create_llm(self) -> BaseLanguageModel:
+         """Create Azure OpenAI LangChain LLM instance."""
+         # Get Azure OpenAI configuration
+         azure_config = self.config.llm.azure_openai
+         if not azure_config.get("endpoint"):
+             raise ApiError("Azure OpenAI endpoint not found in configuration")
+
+         api_key = self.config.llm.api_keys.get("azure_openai")
+         if not api_key:
+             raise ApiError("Azure OpenAI API key not found in configuration")
+
+         # Default values for Azure OpenAI
+         api_version = azure_config.get("api_version", "2024-02-15-preview")
+         deployment_name = azure_config.get("deployment_name")
+
+         # Determine model/deployment name
+         if deployment_name:
+             model = deployment_name
+         else:
+             # Azure typically addresses models by deployment name rather than model name
+             model = self.config.llm.default_model
+             if not model.startswith(("gpt-", "o1-")):
+                 # Default to a gpt-4o deployment if the model doesn't look like an OpenAI model
+                 model = "gpt-4o"
+
+         try:
+             # Create LangChain Azure OpenAI instance
+             return AzureChatOpenAI(
+                 api_key=api_key,
+                 api_version=api_version,
+                 azure_endpoint=azure_config["endpoint"],
+                 deployment_name=model,  # In Azure, this is the deployment name
+                 temperature=0.7,
+                 max_tokens=500,  # Increased for better commit messages
+                 # LangChain will handle retries and error handling automatically
+             )
+
+         except Exception as e:
+             raise ApiError(f"Failed to create Azure OpenAI LangChain instance: {e}")
+
+     def __str__(self) -> str:
+         """String representation for debugging."""
+         deployment = self.config.llm.azure_openai.get("deployment_name", "unknown")
+         return f"AzureOpenAiLangChainProvider(deployment={deployment})"
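Similarly, AzureOpenAiLangChainProvider reads config.llm.api_keys["azure_openai"] plus the config.llm.azure_openai mapping. The keys it consumes, again sketched as a plain dict since the real AppConfig structure is not shown in this diff:

    # Keys consumed by AzureOpenAiLangChainProvider._create_llm above (shape assumed):
    azure_openai_settings = {
        "endpoint": "https://my-resource.openai.azure.com/",  # required; ApiError if missing
        "api_version": "2024-02-15-preview",                  # optional; this default is used above
        "deployment_name": "gpt-4o",                          # optional; otherwise default_model or "gpt-4o"
    }
    api_keys = {"azure_openai": "..."}                        # required; ApiError if missing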