repgen_ai-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. repgen/__init__.py +51 -0
  2. repgen/__pycache__/__init__.cpython-313.pyc +0 -0
  3. repgen/__pycache__/cli.cpython-313.pyc +0 -0
  4. repgen/__pycache__/core.cpython-313.pyc +0 -0
  5. repgen/__pycache__/server.cpython-313.pyc +0 -0
  6. repgen/__pycache__/utils.cpython-313.pyc +0 -0
  7. repgen/cli.py +375 -0
  8. repgen/core.py +239 -0
  9. repgen/retrieval/__init__.py +4 -0
  10. repgen/retrieval/__pycache__/__init__.cpython-313.pyc +0 -0
  11. repgen/retrieval/__pycache__/config.cpython-313.pyc +0 -0
  12. repgen/retrieval/__pycache__/pipeline.cpython-313.pyc +0 -0
  13. repgen/retrieval/config.py +53 -0
  14. repgen/retrieval/core/__init__.py +0 -0
  15. repgen/retrieval/core/__pycache__/__init__.cpython-313.pyc +0 -0
  16. repgen/retrieval/core/__pycache__/code_indexer.cpython-313.pyc +0 -0
  17. repgen/retrieval/core/__pycache__/dependency_analyzer.cpython-313.pyc +0 -0
  18. repgen/retrieval/core/__pycache__/module_analyzer.cpython-313.pyc +0 -0
  19. repgen/retrieval/core/__pycache__/training_code_detector.cpython-313.pyc +0 -0
  20. repgen/retrieval/core/__pycache__/utils.cpython-313.pyc +0 -0
  21. repgen/retrieval/core/code_indexer.py +138 -0
  22. repgen/retrieval/core/dependency_analyzer.py +121 -0
  23. repgen/retrieval/core/module_analyzer.py +65 -0
  24. repgen/retrieval/core/training_code_detector.py +240 -0
  25. repgen/retrieval/core/utils.py +52 -0
  26. repgen/retrieval/models/__init__.py +0 -0
  27. repgen/retrieval/models/__pycache__/__init__.cpython-313.pyc +0 -0
  28. repgen/retrieval/models/__pycache__/hybrid_search.cpython-313.pyc +0 -0
  29. repgen/retrieval/models/hybrid_search.py +151 -0
  30. repgen/retrieval/pipeline.py +166 -0
  31. repgen/server.py +111 -0
  32. repgen/utils.py +550 -0
  33. repgen_ai-0.1.0.dist-info/METADATA +199 -0
  34. repgen_ai-0.1.0.dist-info/RECORD +36 -0
  35. repgen_ai-0.1.0.dist-info/WHEEL +5 -0
  36. repgen_ai-0.1.0.dist-info/top_level.txt +1 -0
repgen/utils.py ADDED
@@ -0,0 +1,550 @@
+ import ast
+ import json
+ import logging
+ import os
+ import re
+ import subprocess
+ import time
+ from pathlib import Path
+ from typing import Optional
+
+ import requests
+
+ # Rich imports (needed for logging in utils)
+ from rich.console import Console
+ from rich.logging import RichHandler
+ from rich.theme import Theme
+
+ # ==========================================
+ # RICH CONFIGURATION (Shared)
+ # ==========================================
+
+ custom_theme = Theme(
+     {
+         "info": "cyan",
+         "warning": "yellow",
+         "error": "bold red",
+         "success": "bold green",
+         "highlight": "magenta",
+         "code": "bold white on black",
+     }
+ )
+ console = Console(theme=custom_theme)
+
+ # Configure logging to use RichHandler.
+ # Note: this configuration should stay consistent across the repgen modules.
+ logging.basicConfig(
+     level="INFO",
+     format="%(message)s",
+     datefmt="[%X]",
+     handlers=[RichHandler(console=console, rich_tracebacks=True, markup=True)],
+ )
+ logger = logging.getLogger("repgen.utils")
+
+ # ==========================================
+ # INPUT HELPERS (REMOTE & LOCAL)
+ # ==========================================
+
+
+ def fetch_content(source: str) -> str:
+     """
+     Fetches content from a local file or a URL (e.g., a GitHub issue).
+     """
+     if source.startswith(("http://", "https://")):
+         # Special case: GitHub issues are fetched via the REST API.
+         # Page URL: https://github.com/owner/repo/issues/num
+         # API URL:  https://api.github.com/repos/owner/repo/issues/num
+         github_issue_regex = r"github\.com/([^/]+)/([^/]+)/issues/(\d+)"
+         match = re.search(github_issue_regex, source)
+
+         if match:
+             owner, repo, issue_num = match.groups()
+             api_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_num}"
+             console.log(
+                 f"[info]Detected GitHub issue. Fetching from API: {api_url}[/info]"
+             )
+             try:
+                 # Try without a token first (works for public repos).
+                 response = requests.get(api_url, timeout=30)
+                 if response.status_code in (403, 404):
+                     # No token support yet; warn, then let raise_for_status() surface the error.
+                     console.log(
+                         f"[warning]Failed to access public API ({response.status_code}).[/warning]"
+                     )
+
+                 response.raise_for_status()
+                 data = response.json()
+                 title = data.get("title", "")
+                 body = data.get("body", "")
+                 return f"Title: {title}\n\nBody:\n{body}"
+             except Exception as e:
+                 console.log(f"[error]Error fetching GitHub issue: {e}[/error]")
+                 raise
+         else:
+             # Generic URL
+             console.log(f"[info]Fetching generic URL: {source}[/info]")
+             try:
+                 response = requests.get(source, timeout=30)
+                 response.raise_for_status()
+                 return response.text
+             except Exception as e:
+                 console.log(f"[error]Error fetching URL: {e}[/error]")
+                 raise
+
+     else:
+         # Local file
+         path = Path(source)
+         if not path.exists():
+             raise FileNotFoundError(f"Local file not found: {path}")
+         return path.read_text()
+
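+
+ # --- Editor's illustration (not part of the published package) ---
+ # A minimal sketch of how fetch_content() is meant to be called; the issue
+ # URL is hypothetical and the demo is not invoked anywhere.
+ def _demo_fetch_content() -> None:
+     text = fetch_content("https://github.com/octocat/Hello-World/issues/1")
+     print(text.splitlines()[0])  # e.g. "Title: ..."
+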
+
+ def prepare_repository(source: str, temp_dir: Optional[str] = None) -> str:
+     """
+     Prepare the repository. If source is a remote URL, clone it; if it is a
+     local path, validate it. Returns the absolute path to the local repository.
+     Clones go into a temporary directory outside the project when temp_dir
+     is not provided.
+     """
+     if temp_dir is None:
+         import tempfile
+
+         temp_dir = tempfile.mkdtemp(prefix="repgen_")
+     if source.startswith(("http://", "https://", "git@")):
+         # It's a remote repo
+         repo_name = source.rstrip("/").split("/")[-1].removesuffix(".git")
+         local_path = Path(temp_dir) / repo_name
+
+         if local_path.exists():
+             console.log(
+                 f"[warning]Directory {local_path} exists. Using existing content.[/warning]"
+             )
+             # Optional: pull the latest? For safety, we just reuse what is there.
+         else:
+             console.log(f"[info]Cloning {source} to {local_path}...[/info]")
+             try:
+                 subprocess.run(
+                     ["git", "clone", source, str(local_path)],
+                     check=True,
+                     capture_output=True,
+                 )
+                 console.log("[success]Cloned successfully.[/success]")
+             except subprocess.CalledProcessError as e:
+                 raise RuntimeError(f"Failed to clone repository: {e}")
+
+         return str(local_path.resolve())
+     else:
+         # Local path
+         path = Path(source).resolve()
+         if not path.exists():
+             raise FileNotFoundError(f"Repository path not found: {path}")
+         return str(path)
+
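+
+ # --- Editor's illustration (not part of the published package) ---
+ # Sketch of the two input modes; the URL is a placeholder and nothing here
+ # runs at import time.
+ def _demo_prepare_repository() -> None:
+     local = prepare_repository(".")  # validate a local path
+     cloned = prepare_repository("https://github.com/owner/repo.git")  # clone to temp dir
+     print(local, cloned)
+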
+
+ # ==========================================
+ # LLM BACKENDS
+ # ==========================================
+
+
+ def query_ollama(prompt: str, model: str) -> str:
+     try:
+         cmd = ["ollama", "run", model]
+         process = subprocess.Popen(
+             cmd,
+             stdin=subprocess.PIPE,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+             encoding="utf-8",
+         )
+         stdout, stderr = process.communicate(input=prompt)
+         if process.returncode != 0:
+             logger.error(f"Ollama Error ({model}): {stderr.strip()}")
+             return ""
+         return stdout.strip()
+     except Exception as e:
+         logger.error(f"Ollama execution failed: {e}")
+         return ""
+
+
+ def query_openai(
+     prompt: str,
+     model: str = "gpt-4o",
+     temperature: float = 0.0,
+     max_retries: int = 3,
+     api_key: Optional[str] = None,
+ ) -> str:
+     try:
+         from openai import OpenAI
+     except ImportError:
+         logger.error("OpenAI library not installed. Please run 'pip install openai'")
+         return ""
+
+     # Use the provided key or fall back to the environment variable.
+     api_key = api_key or os.getenv("OPENAI_API_KEY")
+     if not api_key:
+         logger.error("No OpenAI API key: pass api_key or set OPENAI_API_KEY")
+         return ""
+
+     client = OpenAI(api_key=api_key)
+     for attempt in range(max_retries):
+         try:
+             response = client.chat.completions.create(
+                 model=model,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=temperature,
+                 max_tokens=4096,
+             )
+             content = response.choices[0].message.content
+             return content.strip() if content else ""
+         except Exception as e:
+             logger.warning(f"OpenAI API error (attempt {attempt+1}/{max_retries}): {e}")
+             if attempt < max_retries - 1:
+                 time.sleep(2**attempt)  # exponential backoff: 1s, 2s, ...
+             else:
+                 logger.error("Max retries reached for OpenAI API")
+     return ""
+
+
+ def query_gemini(
+     prompt: str,
+     model: str = "gemini-1.5-pro",
+     temperature: float = 0.0,
+     api_key: Optional[str] = None,
+ ) -> str:
+     api_key = api_key or os.getenv("GEMINI_API_KEY")
+     if not api_key:
+         logger.error("No Gemini API key: pass api_key or set GEMINI_API_KEY")
+         return ""
+
+     url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
+     headers = {"Content-Type": "application/json"}
+     payload = {
+         "contents": [{"parts": [{"text": prompt}]}],
+         "generationConfig": {"temperature": temperature},
+     }
+
+     try:
+         response = requests.post(url, headers=headers, json=payload, timeout=120)
+         response.raise_for_status()
+         result = response.json()
+         return result["candidates"][0]["content"]["parts"][0]["text"].strip()
+     except Exception as e:
+         logger.error(f"Gemini API error: {e}")
+         return ""
+
+
+ def query_claude(
+     prompt: str,
+     model: str = "claude-3-5-sonnet-20240620",
+     temperature: float = 0.0,
+     api_key: Optional[str] = None,
+ ) -> str:
+     api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+     if not api_key:
+         logger.error("No Anthropic API key: pass api_key or set ANTHROPIC_API_KEY")
+         return ""
+
+     url = "https://api.anthropic.com/v1/messages"
+     headers = {
+         "x-api-key": api_key,
+         "anthropic-version": "2023-06-01",
+         "content-type": "application/json",
+     }
+     payload = {
+         "model": model,
+         "max_tokens": 4096,
+         "messages": [{"role": "user", "content": prompt}],
+         "temperature": temperature,
+     }
+
+     try:
+         response = requests.post(url, headers=headers, json=payload, timeout=120)
+         response.raise_for_status()
+         result = response.json()
+         return result["content"][0]["text"].strip()
+     except Exception as e:
+         logger.error(f"Claude API error: {e}")
+         return ""
+
+
+ def query_llm(
+     prompt: str, backend: str, model: str, api_key: Optional[str] = None
+ ) -> str:
+     if backend == "ollama":
+         return query_ollama(prompt, model)
+     elif backend == "openai":
+         return query_openai(prompt, model, api_key=api_key)
+     elif backend == "gemini":
+         return query_gemini(prompt, model, api_key=api_key)
+     elif backend == "claude":
+         return query_claude(prompt, model, api_key=api_key)
+     else:
+         logger.error(f"Unknown backend: {backend}")
+         return ""
+
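+
+ # --- Editor's illustration (not part of the published package) ---
+ # Sketch of dispatching through query_llm(); the backend/model pairs are
+ # illustrative, and each call needs the matching API key or a local Ollama.
+ def _demo_query_llm() -> None:
+     for backend, model in [("ollama", "llama3"), ("openai", "gpt-4o")]:
+         answer = query_llm("Say 'ok'.", backend=backend, model=model)
+         print(backend, "->", answer or "<no response>")
+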
+
+ # ==========================================
+ # PROMPT BUILDERS
+ # ==========================================
+
+
+ def create_prompt_refinement(bug_report_content):
+     return f"""You are a software development assistant. Analyze and restructure this bug report.
+
+ Original bug report:
+ {bug_report_content}
+
+ Provide your analysis in exactly this format:
+
+ TITLE
+ [One-line summary of the core issue]
+
+ SYMPTOMS
+ • [List each observed problem]
+ • [Include error messages exactly as shown]
+ • [Include all reported unexpected behaviors]
+
+ EXPECTED BEHAVIOR
+ [Describe what should happen when the software works correctly]
+
+ REPRODUCTION STEPS
+ 1. [First step to reproduce]
+ 2. [Next step]
+ 3. [Continue until complete]
+
+ Begin your structured analysis:"""
+
+
+ def create_prompt_plan(bug_report_content, context):
+     if "main_file" in context:
+         file_paths = [context["main_file"]["path"]]
+         file_contents = [context["main_file"]["content"]]
+         dependencies = context.get("dependencies", [])
+         dep_string = (
+             "\n\nImport Dependencies:\n" + json.dumps(dependencies, indent=2)
+             if dependencies
+             else ""
+         )
+     elif "module" in context:
+         file_paths = [file["path"] for file in context["module"]["files"]]
+         file_contents = [
+             snippet["code"]
+             for file in context["module"]["files"]
+             for snippet in file["snippets"]
+         ]
+         dep_string = ""
+     else:
+         file_paths = []
+         file_contents = []
+         dep_string = ""
+
+     # Interpolate paths and code directly; JSON-encoding them would escape
+     # newlines and make the code context unreadable for the model.
+     file_paths_string = "\n\n".join(
+         [f"Module Path {i+1}:\n{path}" for i, path in enumerate(file_paths)]
+     )
+     file_contents_string = "\n\n".join(
+         [f"Code Context {i+1}:\n{content}" for i, content in enumerate(file_contents)]
+     )
+
+     return f"""You are a code generation planner. Create a detailed step-by-step plan to reproduce this bug. Focus on concrete, technical steps with specific values and assertions.
+
+ Bug Report:
+ {bug_report_content}
+
+ {file_paths_string}
+
+ {file_contents_string}{dep_string}
+
+ Your task is to create a precise technical plan that an LLM can follow to generate code that reproduces this bug. Each step should be specific and actionable.
+
+ Requirements:
+ - Include specific technical details (e.g., dimensions, batch sizes, function parameters)
+ - Focus only on reproducing the bug, not fixing it
+ - Include setup steps (imports, data preparation)
+ - Include validation steps to verify the bug occurs
+ - Make steps granular and specific
+
+ Output must be a valid JSON array of strings, formatted like this example:
+ [
+ "Import TensorFlow and the inception module from inception_test.py",
+ "Define a batch size of 5 and image dimensions of 299x299",
+ "Create random uniform input data with shape (batch_size, height, width, 3)",
+ "Call inception_v3 function with num_classes=1000",
+ "Verify output contains NaN values in loss calculation"
+ ]
+
+ Generate plan steps:"""
+
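+
+ # --- Editor's illustration (not part of the published package) ---
+ # Sketch of the context shape create_prompt_plan() expects in its
+ # "main_file" branch; the paths and code are made up.
+ def _demo_create_prompt_plan() -> None:
+     context = {
+         "main_file": {"path": "pkg/model.py", "content": "def train(): ..."},
+         "dependencies": [{"path": "pkg/data.py"}],
+     }
+     prompt = create_prompt_plan("Loss becomes NaN after step 10.", context)
+     print(prompt[:200])
+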
+
+ def _build_prompt(
+     bug_report: str, code_context: str, plan: str, feedback: str = ""
+ ) -> str:
+     prompt = f"""You are a senior software engineer fluent in reproducing deep learning bugs. Generate a code snippet to reproduce this bug:
+
+ Bug Report:
+ {bug_report}
+
+ Relevant Code Context:
+ {code_context}
+
+ Reproduction Plan:
+ {plan}"""
+
+     if feedback:
+         prompt += f"""
+
+ Previous Attempt Feedback:
+ {feedback}"""
+
+     prompt += """
+
+ Requirements:
+ 1. Minimal Python script
+ 2. Include necessary setup
+ 3. Output pure code only; do not add explanatory comments or anything else
+ 4. Use standard libraries where possible
+ 5. Mention dependencies in comments if needed
+ 6. Do not regenerate code for the module itself; rely on the existing imports
+ 7. Use the existing imports and their respective methods from the main file to generate the code snippet
+ 8. Output ONLY the code without explanation:"""
+     return prompt
+
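+
+ # --- Editor's illustration (not part of the published package) ---
+ # Sketch of the regeneration loop _build_prompt() supports through its
+ # feedback argument; all string values are placeholders.
+ def _demo_build_prompt() -> None:
+     first = _build_prompt("<bug report>", "<code context>", "<plan>")
+     retry = _build_prompt(
+         "<bug report>", "<code context>", "<plan>",
+         feedback="Previous snippet raised ImportError: no module named foo",
+     )
+     print(len(first), len(retry))
+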
+
+ # ==========================================
+ # VERIFICATION HELPERS
+ # ==========================================
+
+
+ def check_structural_correctness(code: str) -> tuple[bool, str]:
+     try:
+         ast.parse(code)
+         return True, ""
+     except SyntaxError as e:
+         error_msg = f"Syntax Error: {e.msg}\nLine {e.lineno}: {e.text.strip() if e.text else 'N/A'}"
+         # Deliberately return True so the feedback loop can attempt a fix,
+         # but log the error and hand the message back to the caller.
+         logger.warning(error_msg)
+         return True, error_msg
+     except Exception as e:
+         return False, f"Structural Error: {str(e)}"
+
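+
+ # --- Editor's illustration (not part of the published package) ---
+ # Sketch of the contract: a SyntaxError still yields ok=True plus a message,
+ # so callers must check the message rather than the flag alone.
+ def _demo_check_structural_correctness() -> None:
+     ok, msg = check_structural_correctness("def broken(:\n    pass")
+     assert ok and "Syntax Error" in msg
+     ok, msg = check_structural_correctness("x = 1")
+     assert ok and msg == ""
+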
+
+ def extract_json_content(text):
+     pattern = r"```(?:json)?\s*(.*?)\s*```"
+     match = re.search(pattern, text, re.DOTALL)
+     if match:
+         return match.group(1)
+     stripped = text.strip()
+     if stripped.startswith("{") and stripped.endswith("}"):
+         return stripped
+     if stripped.startswith("[") and stripped.endswith("]"):
+         return stripped
+     return None
+
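+
+ # --- Editor's illustration (not part of the published package) ---
+ # Sketch showing the three inputs extract_json_content() handles:
+ # fenced blocks, bare objects, and bare arrays.
+ def _demo_extract_json_content() -> None:
+     assert extract_json_content('```json\n{"a": 1}\n```') == '{"a": 1}'
+     assert extract_json_content('{"a": 1}') == '{"a": 1}'
+     assert extract_json_content("[1, 2]") == "[1, 2]"
+     assert extract_json_content("no json here") is None
+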
+
+ def check_relevance(
+     bug_report: str, code: str, backend: str, model: str, api_key: Optional[str] = None
+ ) -> bool:
+     prompt = f"""Analyze the provided bug report and the code segment to determine if the code segment is relevant to the bug.
+ Output only JSON: {{"relevance": "yes"}} or {{"relevance": "no"}}.
+
+ Bug Report: {bug_report}
+ Code Segment: {code}"""
+     stdout = query_llm(prompt, backend, model, api_key=api_key)
+     try:
+         json_str = extract_json_content(stdout) or stdout
+         response = json.loads(json_str)
+         return response.get("relevance", "").lower() == "yes"
+     except Exception:
+         # Fall back to a loose string match if the model ignored the JSON format.
+         return "yes" in stdout.lower()
+
+
+ # ==========================================
+ # GIT HELPERS
+ # ==========================================
+
+
+ def checkout_commit(repo_path: str, commit: str) -> bool:
+     try:
+         logger.info(f"Checking out commit: {commit}")
+         # Check that it's a git repo
+         if not (Path(repo_path) / ".git").exists():
+             logger.error(f"Not a git repository: {repo_path}")
+             return False
+
+         # Run git checkout
+         subprocess.run(
+             ["git", "checkout", commit],
+             cwd=repo_path,
+             check=True,
+             capture_output=True,
+             text=True,
+         )
+         console.log(f"[success]Successfully checked out {commit}[/success]")
+         return True
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Git checkout failed: {e.stderr.strip()}")
+         return False
+     except Exception as e:
+         logger.error(f"Error during git checkout: {e}")
+         return False
+
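+
+ # --- Editor's illustration (not part of the published package) ---
+ # Sketch pairing prepare_repository() with checkout_commit(); the URL and
+ # commit sha are placeholders.
+ def _demo_checkout_commit() -> None:
+     repo = prepare_repository("https://github.com/owner/repo.git")
+     if checkout_commit(repo, "abc1234"):
+         print("pinned to buggy commit")
+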
+
+ # check_ollama_status and ensure_ollama_model stay in cli.py: they are only
+ # used by get_interactive_config there, and query_ollama above does not need
+ # them. Move them here only if get_interactive_config moves out of the CLI.
+
+
+ # ==========================================
+ # CLEANING HELPERS
+ # ==========================================
+
+
+ def clean_context(context_json_str: str) -> str:
+     """
+     Parses the context JSON string and returns a user-friendly summary.
+     Removes absolute paths and unnecessary details.
+     """
+     try:
+         data = json.loads(context_json_str)
+         output = []
+
+         if "main_file" in data:
+             path = Path(data["main_file"]["path"]).name
+             output.append(f"Main File: {path}")
+
+         if "dependencies" in data and data["dependencies"]:
+             output.append("\nDependencies:")
+             for dep in data["dependencies"]:
+                 path = Path(dep["path"]).name if "path" in dep else "Unknown"
+                 output.append(f" - {path}")
+
+         if "module" in data:
+             output.append("\nModule Files:")
+             for file in data["module"]["files"]:
+                 path = Path(file["path"]).name
+                 output.append(f" - {path}")
+
+         return "\n".join(output)
+     except Exception as e:
+         logger.error(f"Error cleaning context: {e}")
+         return "Error parsing context."