git-llm-tool 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
+ """Smart chunking strategies for git diffs."""
+
+ import re
+ from typing import List, Tuple, Optional
+ from dataclasses import dataclass
+ from langchain_core.documents import Document
+
+
+ @dataclass
+ class ChunkInfo:
+     """Information about a chunk."""
+     content: str
+     file_path: Optional[str]
+     chunk_type: str  # 'file', 'hunk', 'size-based'
+     size: int
+     is_complete_file: bool
+
+
+ class SmartChunker:
+     """Smart chunker that prioritizes file-based splitting over size-based."""
+
+     def __init__(self, chunk_size: int = 10000, chunk_overlap: int = 300):
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+
+     def chunk_diff(self, diff: str) -> List[ChunkInfo]:
+         """
+         Intelligently chunk a git diff.
+
+         Strategy:
+         1. First try to split by files
+         2. If files are too large, split by hunks
+         3. If hunks are still too large, apply size-based splitting to oversized hunks
+         4. As last resort, use pure size-based splitting
+         """
+         chunks = []
+
+         # Split diff into files
+         file_sections = self._split_by_files(diff)
+
+         for file_path, file_content in file_sections:
+             if len(file_content) <= self.chunk_size:
+                 # File fits in one chunk
+                 chunks.append(ChunkInfo(
+                     content=file_content,
+                     file_path=file_path,
+                     chunk_type='file',
+                     size=len(file_content),
+                     is_complete_file=True
+                 ))
+             else:
+                 # File is too large, try to split by hunks
+                 hunk_chunks = self._split_file_by_hunks(file_content, file_path)
+                 if hunk_chunks:
+                     # Check if any hunk chunks are still too large
+                     final_chunks = []
+                     for hunk_chunk in hunk_chunks:
+                         if len(hunk_chunk.content) <= self.chunk_size:
+                             # Hunk chunk is reasonable size
+                             final_chunks.append(hunk_chunk)
+                         else:
+                             # Hunk chunk is still too large, apply size-based splitting
+                             oversized_chunks = self._split_by_size(hunk_chunk.content, file_path)
+                             # Update chunk types to indicate mixed strategy
+                             for chunk in oversized_chunks:
+                                 chunk.chunk_type = 'hunk-size-based'
+                             final_chunks.extend(oversized_chunks)
+                     chunks.extend(final_chunks)
+                 else:
+                     # Fallback to pure size-based splitting
+                     size_chunks = self._split_by_size(file_content, file_path)
+                     chunks.extend(size_chunks)
+
+         return chunks
+
+     def _split_by_files(self, diff: str) -> List[Tuple[Optional[str], str]]:
+         """Split diff by files."""
+         files = []
+         current_file = []
+         current_path = None
+
+         lines = diff.split('\n')
+
+         for line in lines:
+             if line.startswith('diff --git'):
+                 # Start of new file
+                 if current_file:
+                     files.append((current_path, '\n'.join(current_file)))
+
+                 current_file = [line]
+                 # Extract file path
+                 match = re.search(r'diff --git a/(.+?) b/', line)
+                 current_path = match.group(1) if match else None
+             else:
+                 current_file.append(line)
+
+         # Add last file
+         if current_file:
+             files.append((current_path, '\n'.join(current_file)))
+
+         return files
+
+     def _split_file_by_hunks(self, file_content: str, file_path: Optional[str]) -> List[ChunkInfo]:
+         """Split a large file by hunks."""
+         chunks = []
+         lines = file_content.split('\n')
+
+         # Keep file header
+         header_lines = []
+         content_start = 0
+         found_hunk = False
+
+         for i, line in enumerate(lines):
+             if line.startswith('@@'):
+                 content_start = i
+                 found_hunk = True
+                 break
+             header_lines.append(line)
+
+         # Check if we found any hunks
+         if not found_hunk:
+             return []  # No hunk markers found, not a proper git diff format
+
+         header = '\n'.join(header_lines)
+
+         # Split by hunks
+         current_hunk = []
+         hunks = []
+
+         for line in lines[content_start:]:
+             if line.startswith('@@') and current_hunk:
+                 # Start of new hunk, save current
+                 hunks.append('\n'.join(current_hunk))
+                 current_hunk = [line]
+             else:
+                 current_hunk.append(line)
+
+         # Add last hunk
+         if current_hunk:
+             hunks.append('\n'.join(current_hunk))
+
+         # Create chunks from hunks
+         current_chunk_lines = header_lines[:]
+         current_size = len(header)
+
+         for hunk in hunks:
+             hunk_size = len(hunk)
+
+             if current_size + hunk_size <= self.chunk_size:
+                 # Add hunk to current chunk
+                 current_chunk_lines.extend(hunk.split('\n'))
+                 current_size += hunk_size
+             else:
+                 # Save current chunk and start new one
+                 if len(current_chunk_lines) > len(header_lines):
+                     chunks.append(ChunkInfo(
+                         content='\n'.join(current_chunk_lines),
+                         file_path=file_path,
+                         chunk_type='hunk',
+                         size=current_size,
+                         is_complete_file=False
+                     ))
+
+                 # Start new chunk with header + current hunk
+                 current_chunk_lines = header_lines[:] + hunk.split('\n')
+                 current_size = len(header) + hunk_size
+
+         # Add final chunk
+         if len(current_chunk_lines) > len(header_lines):
+             chunks.append(ChunkInfo(
+                 content='\n'.join(current_chunk_lines),
+                 file_path=file_path,
+                 chunk_type='hunk',
+                 size=current_size,
+                 is_complete_file=False
+             ))
+
+         return chunks
+
+     def _split_by_size(self, content: str, file_path: Optional[str]) -> List[ChunkInfo]:
+         """Fallback size-based splitting."""
+         chunks = []
+         lines = content.split('\n')
+
+         current_chunk = []
+         current_size = 0
+
+         for line in lines:
+             line_size = len(line) + 1  # +1 for newline
+
+             if current_size + line_size > self.chunk_size and current_chunk:
+                 # Save current chunk
+                 chunks.append(ChunkInfo(
+                     content='\n'.join(current_chunk),
+                     file_path=file_path,
+                     chunk_type='size-based',
+                     size=current_size,
+                     is_complete_file=False
+                 ))
+
+                 # Start new chunk with overlap
+                 if self.chunk_overlap > 0:
+                     overlap_lines = current_chunk[-self.chunk_overlap // 50:]  # Rough estimation (~50 chars per line)
+                     current_chunk = overlap_lines + [line]
+                     current_size = sum(len(l) + 1 for l in current_chunk)
+                 else:
+                     current_chunk = [line]
+                     current_size = line_size
+             else:
+                 current_chunk.append(line)
+                 current_size += line_size
+
+         # Add final chunk
+         if current_chunk:
+             chunks.append(ChunkInfo(
+                 content='\n'.join(current_chunk),
+                 file_path=file_path,
+                 chunk_type='size-based',
+                 size=current_size,
+                 is_complete_file=False
+             ))
+
+         return chunks
+
+     def chunks_to_documents(self, chunks: List[ChunkInfo]) -> List[Document]:
+         """Convert ChunkInfo to LangChain Documents."""
+         documents = []
+
+         for i, chunk in enumerate(chunks):
+             metadata = {
+                 'chunk_id': i,
+                 'file_path': chunk.file_path,
+                 'chunk_type': chunk.chunk_type,
+                 'size': chunk.size,
+                 'is_complete_file': chunk.is_complete_file
+             }
+
+             documents.append(Document(
+                 page_content=chunk.content,
+                 metadata=metadata
+             ))
+
+         return documents
+
+     def get_chunking_stats(self, chunks: List[ChunkInfo]) -> dict:
+         """Get statistics about the chunking process."""
+         total_size = sum(chunk.size for chunk in chunks)
+         file_chunks = len([c for c in chunks if c.chunk_type == 'file'])
+         hunk_chunks = len([c for c in chunks if c.chunk_type == 'hunk'])
+         size_chunks = len([c for c in chunks if c.chunk_type == 'size-based'])
+         mixed_chunks = len([c for c in chunks if c.chunk_type == 'hunk-size-based'])
+
+         return {
+             'total_chunks': len(chunks),
+             'total_size': total_size,
+             'file_chunks': file_chunks,
+             'hunk_chunks': hunk_chunks,
+             'size_based_chunks': size_chunks,
+             'mixed_hunk_size_chunks': mixed_chunks,
+             'average_chunk_size': total_size // len(chunks) if chunks else 0,
+             'complete_files': len([c for c in chunks if c.is_complete_file])
+         }
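The chunk_diff docstring above describes a four-step fallback (whole file → hunks → oversized hunks split by size → pure size-based splitting). A minimal usage sketch of that flow, using only the SmartChunker API shown in this file; the import path, the sample diff file, and the 2000-character chunk size are illustrative assumptions, not package defaults:

    from git_llm_tool.core.diff_chunker import SmartChunker  # import path assumed; only the class is shown in this diff

    chunker = SmartChunker(chunk_size=2000, chunk_overlap=100)  # illustrative sizes (defaults are 10000 / 300)
    with open("example.diff") as f:                             # any `git diff` output
        diff_text = f.read()

    chunks = chunker.chunk_diff(diff_text)             # List[ChunkInfo], split per file where possible
    documents = chunker.chunks_to_documents(chunks)    # LangChain Documents carrying chunk metadata
    print(chunker.get_chunking_stats(chunks))          # counts per chunk_type, average size, complete files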
@@ -0,0 +1,169 @@
+ """Accurate token counting using tiktoken."""
+
+ import tiktoken
+ from typing import Optional, Dict
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class TokenStats:
+     """Token counting statistics."""
+     text_length: int
+     token_count: int
+     tokens_per_char: float
+     model_used: str
+
+
+ class TokenCounter:
+     """Accurate token counter using tiktoken."""
+
+     # Model to encoding mapping
+     MODEL_ENCODINGS = {
+         # OpenAI models
+         'gpt-4': 'cl100k_base',
+         'gpt-4-turbo': 'cl100k_base',
+         'gpt-4o': 'o200k_base',
+         'gpt-4o-mini': 'o200k_base',
+         'gpt-3.5-turbo': 'cl100k_base',
+         'text-embedding-3-small': 'cl100k_base',
+         'text-embedding-3-large': 'cl100k_base',
+
+         # Anthropic models (use OpenAI-compatible encoding as an approximation)
+         'claude-3-sonnet': 'cl100k_base',
+         'claude-3-haiku': 'cl100k_base',
+         'claude-3-opus': 'cl100k_base',
+         'claude-3.5-sonnet': 'cl100k_base',
+
+         # Fallback
+         'default': 'cl100k_base'
+     }
+
+     def __init__(self, model_name: str = "gpt-4o"):
+         """Initialize token counter for a specific model."""
+         self.model_name = model_name
+         self.encoding_name = self._get_encoding_name(model_name)
+
+         try:
+             self.encoding = tiktoken.get_encoding(self.encoding_name)
+         except Exception:
+             # Fall back to the default encoding
+             self.encoding = tiktoken.get_encoding('cl100k_base')
+             self.encoding_name = 'cl100k_base'
+
+     def _get_encoding_name(self, model_name: str) -> str:
+         """Get the appropriate encoding name for a model."""
+         # Try an exact match first
+         if model_name in self.MODEL_ENCODINGS:
+             return self.MODEL_ENCODINGS[model_name]
+
+         # Try partial matches
+         model_lower = model_name.lower()
+         for model_key in self.MODEL_ENCODINGS:
+             if model_key in model_lower or model_lower in model_key:
+                 return self.MODEL_ENCODINGS[model_key]
+
+         # Default fallback
+         return self.MODEL_ENCODINGS['default']
+
+     def count_tokens(self, text: str) -> int:
+         """Count tokens in text accurately."""
+         if not text:
+             return 0
+
+         try:
+             return len(self.encoding.encode(text))
+         except Exception:
+             # Fall back to a rough estimate (~4 characters per token) if encoding fails
+             return len(text) // 4
+
+     def get_token_stats(self, text: str) -> TokenStats:
+         """Get detailed token statistics."""
+         token_count = self.count_tokens(text)
+         text_length = len(text)
+
+         return TokenStats(
+             text_length=text_length,
+             token_count=token_count,
+             tokens_per_char=token_count / text_length if text_length > 0 else 0,
+             model_used=f"{self.model_name} ({self.encoding_name})"
+         )
+
+     def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
+         """Truncate text to a specific token count."""
+         if not text:
+             return text
+
+         try:
+             tokens = self.encoding.encode(text)
+             if len(tokens) <= max_tokens:
+                 return text
+
+             # Truncate and decode
+             truncated_tokens = tokens[:max_tokens]
+             return self.encoding.decode(truncated_tokens)
+         except Exception:
+             # Fall back to character-based truncation
+             estimated_chars = max_tokens * 4
+             return text[:estimated_chars] if len(text) > estimated_chars else text
+
+     def split_by_tokens(self, text: str, max_tokens: int, overlap: int = 0) -> list[str]:
+         """Split text into chunks by token count."""
+         if not text:
+             return []
+
+         try:
+             tokens = self.encoding.encode(text)
+             if len(tokens) <= max_tokens:
+                 return [text]
+
+             chunks = []
+             start = 0
+
+             while start < len(tokens):
+                 end = min(start + max_tokens, len(tokens))
+                 chunk_tokens = tokens[start:end]
+                 chunk_text = self.encoding.decode(chunk_tokens)
+                 chunks.append(chunk_text)
+
+                 # Stop once the end is reached; otherwise step back by the overlap.
+                 # (Stepping back unconditionally would repeat the tail chunk forever when overlap > 0.)
+                 if end >= len(tokens):
+                     break
+                 start = end - overlap
+
+             return chunks
+         except Exception:
+             # Fall back to character-based splitting
+             estimated_chars = max_tokens * 4
+             overlap_chars = overlap * 4
+
+             chunks = []
+             start = 0
+
+             while start < len(text):
+                 end = min(start + estimated_chars, len(text))
+                 chunks.append(text[start:end])
+                 if end >= len(text):
+                     break
+                 start = end - overlap_chars
+
+             return chunks
+
+     def estimate_cost(self, text: str, input_cost_per_1k: float = 0.0, output_cost_per_1k: float = 0.0) -> dict:
+         """Estimate API cost based on token count."""
+         token_count = self.count_tokens(text)
+
+         return {
+             'tokens': token_count,
+             'input_cost': (token_count / 1000) * input_cost_per_1k,
+             'output_cost': (token_count / 1000) * output_cost_per_1k,
+             'total_cost': (token_count / 1000) * (input_cost_per_1k + output_cost_per_1k)
+         }
+
+     def is_within_limit(self, text: str, max_tokens: int) -> bool:
+         """Check if text is within a token limit."""
+         return self.count_tokens(text) <= max_tokens
+
+     @classmethod
+     def create_for_model(cls, model_name: str) -> 'TokenCounter':
+         """Factory method to create a counter for a specific model."""
+         return cls(model_name)
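The counter above prefers exact tiktoken counts and degrades to a rough 4-characters-per-token estimate when encoding fails. A short hedged example using only the methods defined in this file; the import path and the per-1k cost figure are assumptions for illustration:

    from git_llm_tool.core.token_counter import TokenCounter  # import path assumed from context

    counter = TokenCounter("gpt-4o")                   # maps to the o200k_base encoding
    text = "\n".join(f"line {i}" for i in range(500))

    print(counter.count_tokens(text))                  # exact tiktoken count (len(text) // 4 on failure)
    print(counter.is_within_limit(text, 4096))         # compare against a model's context limit
    pieces = counter.split_by_tokens(text, max_tokens=1000, overlap=50)
    print(counter.estimate_cost(text, input_cost_per_1k=0.005))  # illustrative price, not a real quote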
@@ -0,0 +1,21 @@
+ """LLM providers module."""
+
+ from git_llm_tool.providers.base import LlmProvider
+ from git_llm_tool.providers.factory import get_provider
+
+ # LangChain providers (primary providers)
+ from git_llm_tool.providers.openai_langchain import OpenAiLangChainProvider
+ from git_llm_tool.providers.anthropic_langchain import AnthropicLangChainProvider
+ from git_llm_tool.providers.azure_openai_langchain import AzureOpenAiLangChainProvider
+ from git_llm_tool.providers.ollama_langchain import OllamaLangChainProvider
+ from git_llm_tool.providers.gemini_langchain import GeminiLangChainProvider
+
+ __all__ = [
+     "LlmProvider",
+     "get_provider",
+     "OpenAiLangChainProvider",
+     "AnthropicLangChainProvider",
+     "AzureOpenAiLangChainProvider",
+     "OllamaLangChainProvider",
+     "GeminiLangChainProvider"
+ ]
@@ -0,0 +1,42 @@
+ """Anthropic Claude LangChain provider implementation."""
+
+ from langchain_anthropic import ChatAnthropic
+ from langchain_core.language_models import BaseLanguageModel
+
+ from git_llm_tool.core.config import AppConfig
+ from git_llm_tool.core.exceptions import ApiError
+ from git_llm_tool.providers.langchain_base import LangChainProvider
+
+
+ class AnthropicLangChainProvider(LangChainProvider):
+     """Anthropic Claude provider using LangChain with intelligent chunking support."""
+
+     def _create_llm(self) -> BaseLanguageModel:
+         """Create Anthropic LangChain LLM instance."""
+         # Get API key
+         api_key = self.config.llm.api_keys.get("anthropic")
+         if not api_key:
+             raise ApiError("Anthropic API key not found in configuration")
+
+         # Determine model
+         model = self.config.llm.default_model
+         if not model.startswith("claude-"):
+             # Fall back to Claude 3.5 Sonnet if the configured model doesn't look like an Anthropic model
+             model = "claude-3-5-sonnet-20241022"
+
+         try:
+             # Create LangChain Anthropic instance
+             return ChatAnthropic(
+                 api_key=api_key,
+                 model=model,
+                 temperature=0.7,
+                 max_tokens=500,  # Increased for better commit messages
+                 # LangChain will handle retries and error handling automatically
+             )
+
+         except Exception as e:
+             raise ApiError(f"Failed to create Anthropic LangChain instance: {e}")
+
+     def __str__(self) -> str:
+         """String representation for debugging."""
+         return f"AnthropicLangChainProvider(model={self.llm.model})"
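AnthropicLangChainProvider only reads two values from the configuration: config.llm.api_keys["anthropic"] and config.llm.default_model. A hedged sketch of that slice of the settings, written as a plain dict because the actual AppConfig layout lives in git_llm_tool.core.config and is not part of this diff:

    # Values consumed by AnthropicLangChainProvider._create_llm above (shape assumed):
    llm_settings = {
        "api_keys": {"anthropic": "sk-ant-..."},        # placeholder, never commit a real key
        "default_model": "claude-3-5-sonnet-20241022",  # anything not starting with "claude-" falls back to this model
    }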
@@ -0,0 +1,59 @@
+ """Azure OpenAI LangChain provider implementation."""
+
+ from langchain_openai import AzureChatOpenAI
+ from langchain_core.language_models import BaseLanguageModel
+
+ from git_llm_tool.core.config import AppConfig
+ from git_llm_tool.core.exceptions import ApiError
+ from git_llm_tool.providers.langchain_base import LangChainProvider
+
+
+ class AzureOpenAiLangChainProvider(LangChainProvider):
+     """Azure OpenAI provider using LangChain with intelligent chunking support."""
+
+     def _create_llm(self) -> BaseLanguageModel:
+         """Create Azure OpenAI LangChain LLM instance."""
+         # Get Azure OpenAI configuration
+         azure_config = self.config.llm.azure_openai
+         if not azure_config.get("endpoint"):
+             raise ApiError("Azure OpenAI endpoint not found in configuration")
+
+         api_key = self.config.llm.api_keys.get("azure_openai")
+         if not api_key:
+             raise ApiError("Azure OpenAI API key not found in configuration")
+
+         # Default values for Azure OpenAI
+         api_version = azure_config.get("api_version", "2024-02-15-preview")
+         deployment_name = azure_config.get("deployment_name")
+
+         # Determine model/deployment name
+         if deployment_name:
+             model = deployment_name
+         else:
+             # Azure typically addresses models by deployment name rather than model name
+             model = self.config.llm.default_model
+             if not model.startswith(("gpt-", "o1-")):
+                 # Default to a gpt-4o deployment if the model doesn't look like an OpenAI model
+                 model = "gpt-4o"
+
+         try:
+             # Create LangChain Azure OpenAI instance
+             return AzureChatOpenAI(
+                 api_key=api_key,
+                 api_version=api_version,
+                 azure_endpoint=azure_config["endpoint"],
+                 deployment_name=model,  # In Azure, this is the deployment name
+                 temperature=0.7,
+                 max_tokens=500,  # Increased for better commit messages
+                 # LangChain will handle retries and error handling automatically
+             )
+
+         except Exception as e:
+             raise ApiError(f"Failed to create Azure OpenAI LangChain instance: {e}")
+
+     def __str__(self) -> str:
+         """String representation for debugging."""
+         deployment = self.config.llm.azure_openai.get("deployment_name", "unknown")
+         return f"AzureOpenAiLangChainProvider(deployment={deployment})"
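Similarly, AzureOpenAiLangChainProvider reads config.llm.api_keys["azure_openai"] plus the config.llm.azure_openai mapping. The keys it consumes, again sketched as a plain dict since the real AppConfig structure is not shown in this diff:

    # Keys consumed by AzureOpenAiLangChainProvider._create_llm above (shape assumed):
    azure_openai_settings = {
        "endpoint": "https://my-resource.openai.azure.com/",  # required; ApiError if missing
        "api_version": "2024-02-15-preview",                  # optional; this default is used above
        "deployment_name": "gpt-4o",                          # optional; otherwise default_model or "gpt-4o"
    }
    api_keys = {"azure_openai": "..."}                        # required; ApiError if missing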