repgen_ai-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. repgen/__init__.py +51 -0
  2. repgen/__pycache__/__init__.cpython-313.pyc +0 -0
  3. repgen/__pycache__/cli.cpython-313.pyc +0 -0
  4. repgen/__pycache__/core.cpython-313.pyc +0 -0
  5. repgen/__pycache__/server.cpython-313.pyc +0 -0
  6. repgen/__pycache__/utils.cpython-313.pyc +0 -0
  7. repgen/cli.py +375 -0
  8. repgen/core.py +239 -0
  9. repgen/retrieval/__init__.py +4 -0
  10. repgen/retrieval/__pycache__/__init__.cpython-313.pyc +0 -0
  11. repgen/retrieval/__pycache__/config.cpython-313.pyc +0 -0
  12. repgen/retrieval/__pycache__/pipeline.cpython-313.pyc +0 -0
  13. repgen/retrieval/config.py +53 -0
  14. repgen/retrieval/core/__init__.py +0 -0
  15. repgen/retrieval/core/__pycache__/__init__.cpython-313.pyc +0 -0
  16. repgen/retrieval/core/__pycache__/code_indexer.cpython-313.pyc +0 -0
  17. repgen/retrieval/core/__pycache__/dependency_analyzer.cpython-313.pyc +0 -0
  18. repgen/retrieval/core/__pycache__/module_analyzer.cpython-313.pyc +0 -0
  19. repgen/retrieval/core/__pycache__/training_code_detector.cpython-313.pyc +0 -0
  20. repgen/retrieval/core/__pycache__/utils.cpython-313.pyc +0 -0
  21. repgen/retrieval/core/code_indexer.py +138 -0
  22. repgen/retrieval/core/dependency_analyzer.py +121 -0
  23. repgen/retrieval/core/module_analyzer.py +65 -0
  24. repgen/retrieval/core/training_code_detector.py +240 -0
  25. repgen/retrieval/core/utils.py +52 -0
  26. repgen/retrieval/models/__init__.py +0 -0
  27. repgen/retrieval/models/__pycache__/__init__.cpython-313.pyc +0 -0
  28. repgen/retrieval/models/__pycache__/hybrid_search.cpython-313.pyc +0 -0
  29. repgen/retrieval/models/hybrid_search.py +151 -0
  30. repgen/retrieval/pipeline.py +166 -0
  31. repgen/server.py +111 -0
  32. repgen/utils.py +550 -0
  33. repgen_ai-0.1.0.dist-info/METADATA +199 -0
  34. repgen_ai-0.1.0.dist-info/RECORD +36 -0
  35. repgen_ai-0.1.0.dist-info/WHEEL +5 -0
  36. repgen_ai-0.1.0.dist-info/top_level.txt +1 -0
repgen/retrieval/models/hybrid_search.py ADDED
@@ -0,0 +1,151 @@
+ import json
+ import logging
+ from pathlib import Path
+ from typing import Any, List, Optional, Tuple
+
+ import numpy as np
+ import torch
+ from annoy import AnnoyIndex
+ from rank_bm25 import BM25Okapi
+ from sentence_transformers import CrossEncoder, SentenceTransformer
+ from sklearn.preprocessing import normalize
+ from transformers import AutoTokenizer
+
+ from ..core.utils import tokenize
+
+ # Suppress logs from transformers and sentence_transformers
+ logging.getLogger("transformers").setLevel(logging.ERROR)
+ logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
+
+
+ class HybridSearchIndex:
+     def __init__(
+         self,
+         embedding_model: str,
+         reranker_model: str,
+         device: str = "cuda" if torch.cuda.is_available() else "cpu",
+         config: Optional[Any] = None,
+     ):
+         self.device = device
+         self.encoder = SentenceTransformer(embedding_model, device=device)
+         self.cross_encoder = CrossEncoder(reranker_model, device=device)
+         self.tokenizer = AutoTokenizer.from_pretrained(reranker_model)
+         self.max_seq_length = 512
+         self.bm25 = None
+         self.embeddings = None
+         self.code_chunks = None
+         self.annoy_index = None
+         self.config = config
+
+     def build_index(self, code_chunks: List[dict], corpus: List[List[str]]) -> None:
+         """Build the hybrid search index."""
+         self.code_chunks = code_chunks
+         self.bm25 = BM25Okapi(corpus)
+
+         texts = [chunk["page_content"] for chunk in code_chunks]
+         self.embeddings = self.encoder.encode(
+             texts, convert_to_tensor=True, show_progress_bar=False
+         )
+         self.embeddings = normalize(self.embeddings.cpu().numpy())
+
+         self._build_annoy_index()
+
+     def _build_annoy_index(self) -> None:
+         """Build Annoy index for approximate nearest neighbor search."""
+         dim = self.embeddings.shape[1]
+         self.annoy_index = AnnoyIndex(dim, "angular")
+         for i, vec in enumerate(self.embeddings):
+             self.annoy_index.add_item(i, vec)
+         self.annoy_index.build(n_trees=50)
+
+     def save_index(self, index_dir: Path) -> None:
+         """Save the index to disk."""
+         index_dir.mkdir(exist_ok=True)
+
+         with open(index_dir / "documents.json", "w") as f:
+             json.dump(self.code_chunks, f)
+
+         np.save(index_dir / "embeddings.npy", self.embeddings)
+         self.annoy_index.save(str(index_dir / "annoy_index.ann"))
+
+     def load_index(self, index_dir: Path) -> None:
+         """Load the index from disk."""
+         with open(index_dir / "documents.json", "r") as f:
+             self.code_chunks = json.load(f)
+
+         self.embeddings = np.load(index_dir / "embeddings.npy")
+
+         corpus = [tokenize(doc["page_content"]) for doc in self.code_chunks]
+         self.bm25 = BM25Okapi(corpus)
+
+         self.annoy_index = AnnoyIndex(self.embeddings.shape[1], "angular")
+         self.annoy_index.load(str(index_dir / "annoy_index.ann"))
+
+     def semantic_search(
+         self, query_embedding: np.ndarray, top_k: int
+     ) -> Tuple[np.ndarray, np.ndarray]:
+         """Perform semantic search using the index."""
+         indices, distances = self.annoy_index.get_nns_by_vector(
+             query_embedding.flatten(), top_k, include_distances=True
+         )
+         return np.array(indices), 1 - np.array(distances)
+
+     def search(
+         self,
+         query: str,
+         top_k: int = 200,
+         alpha: float = 0.55,
+         rerank_top_k: int = 20,
+         ann_top_k: int = 200,
+     ) -> List[dict]:
+         """Perform hybrid search with BM25 and semantic search."""
+         if not query or not isinstance(query, str):
+             return []
+
+         if self.config:
+             alpha = self.config.ALPHA
+             rerank_top_k = self.config.RERANK_TOP_K
+
+         query = query[:100000]  # Truncate very long queries
+
+         # BM25 search
+         query_tokens = tokenize(query)
+         bm25_scores = np.array(self.bm25.get_scores(query_tokens))
+         bm25_scores = (bm25_scores - np.min(bm25_scores)) / (
+             np.max(bm25_scores) - np.min(bm25_scores) + 1e-6
+         )
+
+         # Semantic search
+         query_embedding = self.encoder.encode(query, convert_to_tensor=True)
+         query_embedding = normalize(query_embedding.cpu().numpy().reshape(1, -1))
+         ann_indices, ann_scores = self.semantic_search(query_embedding, ann_top_k)
+         ann_indices = np.array(ann_indices, dtype=int)
+
+         if len(ann_indices) == 0:
+             return []
+
+         # Combine scores
+         combined_scores = (1 - alpha) * bm25_scores[ann_indices] + alpha * ann_scores
+         combined_indices_sorted = ann_indices[np.argsort(combined_scores)[::-1]]
+         top_combined_indices = combined_indices_sorted[:rerank_top_k]
+
+         # Prepare for cross-encoder
+         top_chunks = [self.code_chunks[i] for i in top_combined_indices]
+
+         # Tokenize with proper truncation
+         features = self.tokenizer(
+             [query] * len(top_chunks),
+             [chunk["page_content"][:100000] for chunk in top_chunks],
+             padding=True,
+             truncation="longest_first",
+             max_length=self.max_seq_length,
+             return_tensors="pt",
+         ).to(self.device)
+
+         # Run cross-encoder
+         with torch.no_grad():
+             rerank_scores = self.cross_encoder.model(**features).logits.squeeze()
+
+         # Sort by cross-encoder scores
+         reranked_indices = np.argsort(rerank_scores.cpu().numpy())[::-1]
+         return [top_chunks[i] for i in reranked_indices]
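The class above exposes a small build/save/load/search API. A minimal usage sketch (not part of the package; the chunk data and the embedding and reranker model names are placeholders chosen for illustration):

from pathlib import Path

from repgen.retrieval.core.utils import tokenize
from repgen.retrieval.models.hybrid_search import HybridSearchIndex

# Hypothetical code chunks; each dict needs a "page_content" key, and any
# extra keys (such as "path") are carried through to the search results.
code_chunks = [
    {"path": "train.py", "page_content": "def train(model, loader):\n    ..."},
    {"path": "data.py", "page_content": "class DataModule:\n    ..."},
]
corpus = [tokenize(chunk["page_content"]) for chunk in code_chunks]

index = HybridSearchIndex(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model
    reranker_model="cross-encoder/ms-marco-MiniLM-L-6-v2",     # placeholder model
)
index.build_index(code_chunks, corpus)
index.save_index(Path("./hybrid_index"))  # writes documents.json, embeddings.npy, annoy_index.ann

hits = index.search("training loop crashes with CUDA out of memory", rerank_top_k=5)
for chunk in hits:
    print(chunk["path"])

In search(), BM25 scores are min-max normalised and blended with the Annoy similarity scores using alpha over the ANN candidate set, and the cross-encoder then reranks the top rerank_top_k candidates.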
repgen/retrieval/pipeline.py ADDED
@@ -0,0 +1,166 @@
+ import json
+ import logging
+ from typing import Any, Dict, Optional
+
+ from .config import Config
+ from .core.code_indexer import CodeIndexer
+ from .core.dependency_analyzer import DependencyAnalyzer
+ from .core.module_analyzer import ModuleAnalyzer
+ from .core.training_code_detector import TrainingCodeDetector
+ from .core.utils import load_bug_report
+
+ logger = logging.getLogger(__name__)
+
+
+ class RetrievalPipeline:
+     def __init__(
+         self,
+         repo_path: str,
+         bug_report_path: str,
+         output_dir: str,
+         config: Config = None,
+     ):
+         self.config = config or Config(
+             repo_path=repo_path, bug_report_path=bug_report_path, output_dir=output_dir
+         )
+         self.code_indexer = CodeIndexer(self.config)
+         self.module_analyzer = ModuleAnalyzer(self.config)
+         self.training_detector = TrainingCodeDetector(self.config)
+         self.dependency_analyzer = DependencyAnalyzer(self.config)
+
+     def run_pipeline(self) -> Optional[Dict[str, Any]]:
+         """Run full pipeline for the configured bug report and repository."""
+         try:
+             # 1. Setup paths
+             bug_report_path = self.config.BUG_REPORT_FILE
+             code_dir = self.config.CODE_DIR
+
+             if not bug_report_path.exists():
+                 raise FileNotFoundError(f"Bug report not found: {bug_report_path}")
+             if not code_dir.exists():
+                 raise FileNotFoundError(f"Code directory not found: {code_dir}")
+
+             # 2. Index codebase
+             logger.info(f"Indexing codebase: {code_dir.name}")
+             hybrid_index = self.code_indexer.index_codebase(code_dir)
+
+             # 3. Find relevant code
+             logger.info(f"Finding relevant code for bug report: {bug_report_path.name}")
+             bug_report = load_bug_report(bug_report_path)
+             relevant_code = self.code_indexer.find_relevant_code(
+                 bug_report, hybrid_index
+             )
+             if not relevant_code:
+                 logger.warning(
+                     f"No relevant code found for bug report: {bug_report_path.name}"
+                 )
+                 return None
+
+             # 4. Analyze modules
+             logger.info("Analyzing modules")
+             module_report = {
+                 "bug_report": self.config.PROJECT_ID,
+                 "modules": self.module_analyzer.analyze_modules(relevant_code),
+             }
+             # 5. Detect training code
+             logger.info("Detecting training code")
+             training_report = self.training_detector.detect_training_code(
+                 module_report, bug_report_path
+             )
+
+             # 6. Analyze dependencies
+             logger.info("Analyzing dependencies")
+             dependency_report = self.dependency_analyzer.analyze_training_dependencies(
+                 training_report
+             )
+
+             # 7. Generate final context
+             logger.info("Generating final context")
+             self.create_context_files(
+                 self.config.PROJECT_ID, module_report, dependency_report
+             )
+             return {
+                 "status": "success",
+                 "context_dir": str(self.config.CONTEXT_DIR_OUT),
+             }
+         except Exception as e:
+             logger.error(f"Pipeline failed: {str(e)}")
+             raise
+
+     def create_context_files(self, project_id, module_report, dependency_report):
+         # Create the context directory if it doesn't exist
+         context_dir = self.config.CONTEXT_DIR_OUT
+
+         dependencies = dependency_report.get("dependencies", [])
+         if not dependencies:
+             for i, module in enumerate(module_report["modules"], 1):
+                 context = {
+                     "bug_report": project_id,
+                     "module": module,
+                     "module_snippets": [],
+                 }
+
+                 # Find relevant module snippets for this file
+                 for file in module["files"]:
+                     context["module_snippets"].append(
+                         {"file": file["path"], "snippets": file["snippets"]}
+                     )
+
+                 # Create the output file path
+                 output_filename = f"{project_id}_module_{i}.json"
+                 filename = context_dir / output_filename
+
+                 # Write to file
+                 with open(filename, "w") as f:
+                     json.dump(context, f, indent=2)
+
+         else:
+             for i, dep in enumerate(dependencies[:5], 1):
+                 training_file_path = self.config.CODE_DIR / dep["file"]
+                 try:
+                     with open(training_file_path, "r") as f:
+                         training_file_content = f.read()
+                 except FileNotFoundError:
+                     print(f"Warning: File not found - {training_file_path}")
+                     training_file_content = (
+                         f"Content not available for {training_file_path}"
+                     )
+
+                 # Create the context structure
+                 context = {
+                     "bug_report": project_id,
+                     "rank": i,
+                     "score": dep["score"],
+                     "main_file": {
+                         "path": dep["file"],
+                         "content": training_file_content,
+                     },
+                     "module_snippets": [],
+                     "dependencies": dep["external_dependencies"],
+                 }
+
+                 # Find relevant module snippets for this file
+                 for module in module_report["modules"]:
+                     for file in module["files"]:
+                         if file["path"] == dep["file"]:
+                             context["module_snippets"].append(
+                                 {"file": file["path"], "snippets": file["snippets"]}
+                             )
+                         else:
+                             # Check if this module file is a dependency
+                             for ext_dep in dep["external_dependencies"]:
+                                 if ext_dep["module"].replace("/", "_") in file["path"]:
+                                     context["module_snippets"].append(
+                                         {
+                                             "file": file["path"],
+                                             "snippets": file["snippets"],
+                                         }
+                                     )
+
+                 # Create the output file path
+                 output_filename = f"{project_id}_{i}.json"
+                 filename = context_dir / output_filename
+
+                 # Write to file
+                 with open(filename, "w") as f:
+                     json.dump(context, f, indent=2)
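For reference, a minimal way to drive the pipeline end to end (illustrative only; the paths are placeholders, and Config is assumed to derive BUG_REPORT_FILE, CODE_DIR, PROJECT_ID and CONTEXT_DIR_OUT from the arguments shown):

from repgen.retrieval.pipeline import RetrievalPipeline

pipeline = RetrievalPipeline(
    repo_path="./checkouts/example-project",  # repository to index
    bug_report_path="./reports/bug_001.md",   # bug report to localize
    output_dir="./repgen_output",             # where context JSON files are written
)

result = pipeline.run_pipeline()
if result and result["status"] == "success":
    print("Context files written to", result["context_dir"])
else:
    print("No relevant code found for this bug report")

Each context file pairs the bug report ID either with a ranked training file plus its dependencies or, when no dependencies are detected, with the per-module snippets.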
repgen/server.py ADDED
@@ -0,0 +1,111 @@
+ import uuid
+ from typing import Any, Dict, Optional
+
+ from fastapi import BackgroundTasks, FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+
+ from .core import RepGenService
+
+ app = FastAPI(title="RepGen API")
+
+ # Configure CORS
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # In-memory store for task status
+ # (In production, use Redis/db)
+ tasks: Dict[str, Dict[str, Any]] = {}
+
+
+ class ReproductionRequest(BaseModel):
+     bug_report: str
+     repo_url: str
+     backend: str = "openai"  # Default
+     model: str = "gpt-4o"
+     commit: Optional[str] = None
+     api_key: Optional[str] = None
+
+
+ def run_reproduction_task(task_id: str, req: ReproductionRequest):
+     service = RepGenService(output_dir="./repgen_results")
+
+     tasks[task_id]["status"] = "running"
+
+     def progress_cb(stage, msg, data=None):
+         tasks[task_id]["logs"].append(f"[{stage}] {msg}")
+         tasks[task_id]["stage"] = stage
+         if data:
+             if "artifacts" not in tasks[task_id]:
+                 tasks[task_id]["artifacts"] = {}
+             if data["type"] == "refined_report":
+                 tasks[task_id]["artifacts"]["refined_report"] = data["content"]
+             elif data["type"] == "plan":
+                 tasks[task_id]["artifacts"]["plan"] = data["content"]
+             elif data["type"] == "code":
+                 tasks[task_id]["artifacts"]["code"] = data["content"]
+                 tasks[task_id]["artifacts"]["code_path"] = data["path"]
+             elif data["type"] == "context":
+                 tasks[task_id]["artifacts"]["context"] = data[
+                     "content"
+                 ]  # Maybe accumulate? For now, last one.
+
+     try:
+         result = service.run_reproduction(
+             bug_report_source=req.bug_report,
+             repo_source=req.repo_url,
+             backend=req.backend,
+             model=req.model,
+             commit=req.commit,
+             api_key=req.api_key,
+             progress_callback=progress_cb,
+         )
+
+         if result["success"]:
+             tasks[task_id]["status"] = "completed"
+             tasks[task_id]["result"] = result[
+                 "files"
+             ]  # This will now be a list of dicts {path, content}
+         else:
+             tasks[task_id]["status"] = "failed"
+             tasks[task_id]["error"] = result["error"]
+
+     except Exception as e:
+         tasks[task_id]["status"] = "failed"
+         tasks[task_id]["error"] = str(e)
+
+
+ @app.post("/api/reproduce")
+ async def start_reproduction(
+     req: ReproductionRequest, background_tasks: BackgroundTasks
+ ):
+     task_id = str(uuid.uuid4())
+     tasks[task_id] = {
+         "status": "pending",
+         "stage": "init",
+         "logs": [],
+         "artifacts": {},
+         "result": None,
+         "error": None,
+     }
+
+     background_tasks.add_task(run_reproduction_task, task_id, req)
+     return {"task_id": task_id}
+
+
+ @app.get("/api/status/{task_id}")
+ async def get_status(task_id: str):
+     if task_id not in tasks:
+         raise HTTPException(status_code=404, detail="Task not found")
+     return tasks[task_id]
+
+
+ # Serve UI (We will build this next)
+ # app.mount("/", StaticFiles(directory="ui/dist", html=True), name="ui")
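A quick way to exercise the API locally (a sketch; the port, URL, and payload values are placeholders, and the request fields mirror ReproductionRequest above):

# Start the server first:
#   uvicorn repgen.server:app --host 0.0.0.0 --port 8000

import time

import requests

payload = {
    "bug_report": "Training crashes with a CUDA out-of-memory error after epoch 3 ...",
    "repo_url": "https://github.com/example/project",
    "backend": "openai",
    "model": "gpt-4o",
}
task_id = requests.post("http://localhost:8000/api/reproduce", json=payload).json()["task_id"]

# Poll /api/status/{task_id} until the background task finishes.
while True:
    status = requests.get(f"http://localhost:8000/api/status/{task_id}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)

print(status["status"], status.get("error"))

Because task state lives in the in-memory tasks dict, status is lost on restart; the comment in the module already flags Redis or a database for production use.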