repr_cli-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- repr/__init__.py +10 -0
- repr/analyzer.py +915 -0
- repr/api.py +263 -0
- repr/auth.py +300 -0
- repr/cli.py +858 -0
- repr/config.py +392 -0
- repr/discovery.py +472 -0
- repr/extractor.py +388 -0
- repr/highlights.py +712 -0
- repr/openai_analysis.py +597 -0
- repr/tools.py +446 -0
- repr/ui.py +430 -0
- repr_cli-0.1.0.dist-info/METADATA +326 -0
- repr_cli-0.1.0.dist-info/RECORD +18 -0
- repr_cli-0.1.0.dist-info/WHEEL +5 -0
- repr_cli-0.1.0.dist-info/entry_points.txt +2 -0
- repr_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- repr_cli-0.1.0.dist-info/top_level.txt +1 -0
repr/openai_analysis.py
ADDED
@@ -0,0 +1,597 @@
"""
OpenAI-based analysis for repository profiling.

This module implements a direct OpenAI integration for analyzing git repositories
using a two-phase approach:
1. EXTRACTION: Process batches of commits with diffs using gpt-5-nano
2. SYNTHESIS: Combine summaries into final profile using gpt-5.2
"""

import asyncio
from typing import Any

from openai import AsyncOpenAI

from .tools import get_commits_with_diffs
from .discovery import RepoInfo
from .config import get_litellm_config, get_llm_config, get_api_base


# Model configuration (defaults for OpenAI)
DEFAULT_EXTRACTION_MODEL = "openai/gpt-5-nano-2025-08-07"
DEFAULT_SYNTHESIS_MODEL = "openai/gpt-5.2-2025-12-11"
EXTRACTION_TEMPERATURE = 0.3
SYNTHESIS_TEMPERATURE = 0.7
COMMITS_PER_BATCH = 25


def get_openai_client(api_key: str = None, base_url: str = None) -> AsyncOpenAI:
    """
    Get OpenAI-compatible client that proxies through our backend.

    Args:
        api_key: API key (optional, for local LLM mode)
        base_url: Base URL for API (optional, for local LLM mode)

    Returns:
        AsyncOpenAI client
    """
    # If explicit parameters provided, use them (for local mode)
    if api_key:
        kwargs = {"api_key": api_key}
        if base_url:
            kwargs["base_url"] = base_url
        return AsyncOpenAI(**kwargs)

    # Use our backend as the proxy - it will forward to LiteLLM
    # The rf_* token is used to authenticate with our backend
    _, litellm_key = get_litellm_config()
    if not litellm_key:
        raise ValueError("Not logged in. Please run 'rf login' first.")

    # Point to our backend's LLM proxy endpoint
    backend_url = get_api_base().replace("/api/cli", "")

    return AsyncOpenAI(
        api_key=litellm_key,
        base_url=f"{backend_url}/api/llm/v1"
    )
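
For the local-LLM path, the explicit parameters take precedence over the backend proxy. A minimal sketch of that mode — the URL and placeholder key below are assumptions for illustration (Ollama's usual OpenAI-compatible endpoint), not values shipped in this package:

# Illustrative only — endpoint and key are assumptions, not package defaults.
client = get_openai_client(
    api_key="ollama",  # local servers typically ignore the key, but one is required
    base_url="http://localhost:11434/v1",
)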


async def extract_commit_batch(
    client: AsyncOpenAI,
    commits: list[dict[str, Any]],
    batch_num: int,
    total_batches: int,
    model: str = None,
) -> str:
    """
    Extraction phase: Extract accomplishments from a batch of commits.

    Args:
        client: OpenAI client
        commits: List of commits with diffs
        batch_num: Current batch number (for context)
        total_batches: Total number of batches
        model: Model name to use (defaults to stored config or DEFAULT_EXTRACTION_MODEL)

    Returns:
        Summary of technical accomplishments in this batch
    """
    if not model:
        llm_config = get_llm_config()
        model = llm_config.get("extraction_model") or DEFAULT_EXTRACTION_MODEL
    # Format commits for the prompt
    commits_text = []
    for commit in commits:
        commit_text = f"""
Commit: {commit['sha']}
Date: {commit['date']}
Message: {commit['message']}

Files changed:"""

        for file_info in commit['files'][:10]:  # Limit files per commit
            change_type = {
                'A': 'Added',
                'D': 'Deleted',
                'M': 'Modified',
                'R': 'Renamed'
            }.get(file_info['change_type'], 'Changed')

            commit_text += f"\n {change_type}: {file_info['path']}"

            if file_info['diff']:
                # Truncate diff if too long (for token management)
                diff = file_info['diff'][:2000]
                commit_text += f"\n```diff\n{diff}\n```"

        commits_text.append(commit_text)

    commits_formatted = "\n\n---\n".join(commits_text)

    system_prompt = """You are analyzing a developer's actual code commits to extract specific technical accomplishments WITH the reasoning behind them.

Your job: Read the commit messages and diffs, then list CONCRETE technical accomplishments with SPECIFIC details AND infer WHY those decisions were made.

For each accomplishment, capture:
1. WHAT was built (the technical implementation)
2. WHY it was needed (the problem being solved, the user/business need, or the technical constraint)

Rules:
- Use EXACT technology names from the code (FastAPI, React, SQLAlchemy, not "web framework")
- Describe SPECIFIC features built (e.g., "JWT authentication with refresh tokens", not "auth system")
- INFER the motivation when possible:
  - Performance changes → what latency/throughput problem was being solved?
  - New features → what user capability was being enabled?
  - Refactors → what maintainability or scalability issue was being addressed?
  - Error handling → what failure mode was being prevented?
- Mention architectural patterns when evident (microservices, event-driven, REST API, etc.)
- Include scale indicators (number of endpoints, integrations, etc.)
- Be concise but specific - bullet points are fine

What NOT to do:
- Don't write vague statements like "worked on backend"
- Don't guess technologies not shown in the diffs
- Don't include process/methodology unless there's evidence
- Don't fabricate motivations that aren't supported by the code/commits"""

    user_prompt = f"""Analyze commits batch {batch_num}/{total_batches} and extract technical accomplishments:

{commits_formatted}

List the specific technical work done in this batch. For each item:
1. What was BUILT (the concrete implementation)
2. Why it was needed (infer from context: what problem was solved? what user need? what constraint?)

Focus on substance, not process."""

    response = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=EXTRACTION_TEMPERATURE,
        max_tokens=16000,  # Increased for reasoning models that use tokens for thinking
    )

    return response.choices[0].message.content or ""
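
The commit dicts consumed above come from get_commits_with_diffs; their shape can be read off the field accesses in this function. A sketch of one entry, with invented values:

# Shape inferred from the accesses above; all values are made up.
example_commit = {
    "sha": "a1b2c3d",
    "date": "2025-01-15",
    "message": "Add JWT refresh-token rotation",
    "files": [
        {"change_type": "M", "path": "api/auth.py", "diff": "@@ -10,6 +10,18 @@ ..."},
    ],
}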


async def synthesize_profile(
    client: AsyncOpenAI,
    summaries: list[str],
    repo_info: dict[str, Any],
    model: str = None,
) -> str:
    """
    Synthesis phase: Combine batch summaries into final developer profile.

    Args:
        client: OpenAI client
        summaries: List of batch summaries from extraction phase
        repo_info: Repository metadata
        model: Model name to use (defaults to stored config or DEFAULT_SYNTHESIS_MODEL)

    Returns:
        Final developer profile in markdown
    """
    if not model:
        llm_config = get_llm_config()
        model = llm_config.get("synthesis_model") or DEFAULT_SYNTHESIS_MODEL
    summaries_text = "\n\n---\n\n".join([
        f"## Batch {i+1}\n\n{summary}"
        for i, summary in enumerate(summaries)
    ])

    system_prompt = """You are an expert technical resume writer creating a developer profile from their ACTUAL code commits.

Transform the batch analyses into COMPELLING RESUME CONTENT that shows not just WHAT was built, but WHY decisions were made.

CRITICAL - NO GENERIC STATEMENTS:
- ❌ "Experience with web frameworks" → ✅ "Built REST APIs with FastAPI including WebSocket support for real-time updates"
- ❌ "Strong Python skills" → ✅ "Architected async Python backend with SQLAlchemy, Celery task queues, and Redis caching"
- ❌ "Agile methodologies" → Don't mention process/methodology

CRITICAL - INCLUDE THE WHY:
For significant technical work, explain the reasoning:
- ✅ "Built WebSocket token streaming—users expect ChatGPT-like instant feedback; REST endpoints that return only after full completion feel broken for 10-30 second responses"
- ✅ "Implemented Redis-backed auth caching to short-circuit repeated Supabase validation—every API call was adding 50-100ms of overhead"
- ✅ "Added explicit rollback paths in DB transactions—SQLAlchemy's implicit rollback doesn't always fire when expected, causing connection pool pollution"

The WHY demonstrates engineering judgment:
- What problem was being solved?
- What tradeoffs were considered?
- What would have happened without this change?
- What user/business need drove this?

STRUCTURE:
1. **Summary**: 2-3 sentences capturing UNIQUE expertise (not generic "versatile developer")
2. **Key Technical Skills (used in this codebase)**: ONLY technologies ACTUALLY used, with context of HOW they were used
3. **Notable Projects & Contributions**: SPECIFIC features/achievements with technical details AND the reasoning behind key decisions. Group related work under descriptive subsection headers. For each major piece of work, include a "**Why**:" line explaining the problem/motivation.
4. **Development Philosophy (evidence-based)**: ONLY if there's clear evidence (comprehensive tests, specific patterns). Include *Why?* explanations that show the thinking.

Use strong action verbs: Built, Architected, Implemented, Designed, Optimized, Integrated
Every claim must be backed by evidence from the commits."""

    # Build metadata header (injected directly, not LLM-generated)
    languages = repo_info.get('languages', {})
    languages_str = ", ".join([f"{k} ({v}%)" for k, v in languages.items()]) if languages else "Unknown"

    # Calculate age display
    age_months = repo_info.get('age_months', 0)
    if age_months < 1:
        age_str = "< 1 month"
    elif age_months < 12:
        age_str = f"{age_months} months"
    else:
        years = age_months // 12
        remaining_months = age_months % 12
        age_str = f"{years} year{'s' if years > 1 else ''}" + (f", {remaining_months} months" if remaining_months else "")

    # Format remote URL (clean up if present)
    remote_url = repo_info.get('remote_url', '')
    if remote_url:
        remote_display = remote_url.replace('git@github.com:', 'github.com/').replace('.git', '')
        if remote_display.startswith('https://'):
            remote_display = remote_display[8:]
    else:
        remote_display = None

    # Build the metadata header to prepend
    metadata_lines = [
        f"- **Repository**: {repo_info.get('name', 'Unknown')}",
        f"- **Languages**: {languages_str}",
        f"- **Total Commits**: {repo_info.get('commit_count', 'Unknown')}",
        f"- **Contributors**: {repo_info.get('contributors', 'Unknown')}",
        f"- **Active Period**: {repo_info.get('first_commit_date', 'Unknown')} to {repo_info.get('last_commit_date', 'Unknown')} ({age_str})",
    ]
    if remote_display:
        metadata_lines.append(f"- **Remote**: {remote_display}")
    if repo_info.get('is_fork'):
        metadata_lines.append("- **Fork**: Yes")

    metadata_header = "\n".join(metadata_lines)

    user_prompt = f"""Create a developer profile from these commit analyses:

## Technical Work (from commit analysis):

{summaries_text}

---

Synthesize this into a cohesive developer profile in Markdown format starting with Summary, then Key Technical Skills, Notable Projects & Contributions, and Development Philosophy.

Focus on CONCRETE technical accomplishments AND the reasoning behind key decisions. For each major feature or system, explain WHY it was built that way—what problem it solved, what user need it addressed, or what technical constraint it navigated."""

    response = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=SYNTHESIS_TEMPERATURE,
        max_tokens=16000,  # Increased for reasoning models
    )

    llm_content = response.choices[0].message.content or ""

    # Prepend metadata header
    return f"{metadata_header}\n\n---\n\n{llm_content}"


async def analyze_repo_openai(
    repo: RepoInfo,
    api_key: str = None,
    base_url: str = None,
    extraction_model: str = None,
    synthesis_model: str = None,
    verbose: bool = False,
    progress_callback: callable = None,
) -> str:
    """
    Analyze a single repository using OpenAI-compatible API.

    Args:
        repo: Repository information
        api_key: API key (defaults to OPENAI_API_KEY env var)
        base_url: Base URL for API (for local LLMs like Ollama)
        extraction_model: Model for extracting accomplishments (defaults to DEFAULT_EXTRACTION_MODEL)
        synthesis_model: Model for synthesizing profile (defaults to DEFAULT_SYNTHESIS_MODEL)
        verbose: Whether to print verbose output
        progress_callback: Optional callback for progress updates
            Signature: callback(step: str, detail: str, repo: str, progress: float)

    Returns:
        Repository analysis/narrative in markdown
    """
    client = get_openai_client(api_key=api_key, base_url=base_url)

    if progress_callback:
        progress_callback(
            step="Extracting",
            detail=f"Reading git history ({repo.commit_count} commits)",
            repo=repo.name,
            progress=5.0,
        )

    # Get commits with diffs
    commits = get_commits_with_diffs(
        repo_path=repo.path,
        count=200,  # Last 200 commits
        days=730,  # Last 2 years
    )

    if not commits:
        return f"No commits found in {repo.name}"

    if progress_callback:
        progress_callback(
            step="Preparing",
            detail=f"Found {len(commits)} commits with diffs to analyze",
            repo=repo.name,
            progress=10.0,
        )

    # Split into batches
    batches = [
        commits[i:i + COMMITS_PER_BATCH]
        for i in range(0, len(commits), COMMITS_PER_BATCH)
    ]

    total_batches = len(batches)

    if progress_callback:
        progress_callback(
            step="Analyzing",
            detail=f"Processing {total_batches} batches ({COMMITS_PER_BATCH} commits each)",
            repo=repo.name,
            progress=15.0,
        )

    # EXTRACTION phase: Process batches with progress tracking
    async def process_batch_with_progress(batch, batch_num):
        """Process a single batch and report progress."""
        result = await extract_commit_batch(client, batch, batch_num, total_batches, model=extraction_model)
        if progress_callback:
            # Progress goes from 15% to 75% during extraction phase
            batch_progress = 15.0 + (60.0 * batch_num / total_batches)
            progress_callback(
                step="Analyzing",
                detail=f"Batch {batch_num}/{total_batches} complete",
                repo=repo.name,
                progress=batch_progress,
            )
        return result

    # Process batches concurrently but track progress
    extraction_tasks = [
        process_batch_with_progress(batch, i + 1)
        for i, batch in enumerate(batches)
    ]

    summaries = await asyncio.gather(*extraction_tasks)

    # Filter out empty summaries
    summaries = [s for s in summaries if s.strip()]

    if not summaries:
        return f"Could not extract meaningful information from {repo.name}"

    if progress_callback:
        progress_callback(
            step="Synthesizing",
            detail="Generating developer profile from analysis...",
            repo=repo.name,
            progress=80.0,
        )

    # SYNTHESIS phase: Combine into final profile
    repo_dict = {
        "name": repo.name,
        "path": str(repo.path),
        "languages": repo.languages,
        "primary_language": repo.primary_language,
        "commit_count": repo.commit_count,
        "contributors": repo.contributors,
        "first_commit_date": repo.first_commit_date.isoformat() if repo.first_commit_date else None,
        "last_commit_date": repo.last_commit_date.isoformat() if repo.last_commit_date else None,
        "remote_url": repo.remote_url,
        "is_fork": repo.is_fork,
        "age_months": repo.age_months,
    }

    profile = await synthesize_profile(client, summaries, repo_dict, model=synthesis_model)

    if progress_callback:
        progress_callback(
            step="Complete",
            detail=f"Profile generated for {repo.name}",
            repo=repo.name,
            progress=100.0,
        )

    return profile
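
A minimal driver sketch for the single-repo path. The callback is a hypothetical stand-in for the CLI's own progress UI, and the RepoInfo is assumed to come from the package's discovery step (its construction is not shown here):

import asyncio

def print_progress(step: str, detail: str, repo: str, progress: float) -> None:
    # Hypothetical callback matching the documented signature.
    print(f"[{progress:5.1f}%] {step} {repo}: {detail}")

async def run_one(repo):
    # `repo` is assumed to be a RepoInfo produced by repr.discovery.
    return await analyze_repo_openai(repo, progress_callback=print_progress)

# profile_md = asyncio.run(run_one(some_discovered_repo))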


async def analyze_repos_openai(
    repos: list[RepoInfo],
    api_key: str = None,
    base_url: str = None,
    extraction_model: str = None,
    synthesis_model: str = None,
    verbose: bool = False,
    progress_callback: callable = None,
) -> str:
    """
    Analyze multiple repositories and create a combined profile.

    Args:
        repos: List of repositories to analyze
        api_key: API key (defaults to OPENAI_API_KEY env var)
        base_url: Base URL for API (for local LLMs like Ollama)
        extraction_model: Model for extracting accomplishments (defaults to DEFAULT_EXTRACTION_MODEL)
        synthesis_model: Model for synthesizing profile (defaults to DEFAULT_SYNTHESIS_MODEL)
        verbose: Whether to print verbose output
        progress_callback: Optional callback for progress updates
            Signature: callback(step: str, detail: str, repo: str, progress: float)

    Returns:
        Combined developer profile in markdown
    """
    if not repos:
        return "No repositories to analyze"

    total_repos = len(repos)

    if progress_callback:
        progress_callback(
            step="Starting",
            detail=f"Analyzing {total_repos} {'repository' if total_repos == 1 else 'repositories'}",
            repo="",
            progress=0.0,
        )

    # Analyze each repo
    repo_profiles = []
    for i, repo in enumerate(repos):
        # Create a scoped progress callback for this repo
        def make_repo_callback(repo_idx, repo_name):
            def repo_callback(step, detail, repo, progress):
                # Scale progress: each repo gets equal share
                repo_start = (repo_idx / total_repos) * 90  # Save 10% for final merge
                repo_end = ((repo_idx + 1) / total_repos) * 90
                scaled_progress = repo_start + (progress / 100) * (repo_end - repo_start)

                if progress_callback:
                    progress_callback(
                        step=step,
                        detail=f"[{repo_idx + 1}/{total_repos}] {detail}",
                        repo=repo_name,
                        progress=scaled_progress,
                    )
            return repo_callback

        profile = await analyze_repo_openai(
            repo,
            api_key=api_key,
            base_url=base_url,
            extraction_model=extraction_model,
            synthesis_model=synthesis_model,
            verbose=verbose,
            progress_callback=make_repo_callback(i, repo.name),
        )
        repo_profiles.append({
            "name": repo.name,
            "profile": profile,
        })

    # If only one repo, return its profile directly
    if len(repos) == 1:
        return repo_profiles[0]["profile"]

    # Multiple repos: combine them
    if progress_callback:
        progress_callback(
            step="Merging",
            detail=f"Combining profiles from {total_repos} repositories...",
            repo="all",
            progress=92.0,
        )

    client = get_openai_client(api_key=api_key, base_url=base_url)

    # Aggregate metadata from all repos (injected directly, not LLM-generated)
    total_commits = sum(r.commit_count for r in repos)
    all_languages = {}
    for repo in repos:
        if repo.languages:
            for lang, pct in repo.languages.items():
                all_languages[lang] = all_languages.get(lang, 0) + pct
    # Normalize percentages
    if all_languages:
        total_pct = sum(all_languages.values())
        all_languages = {k: round(v * 100 / total_pct) for k, v in sorted(all_languages.items(), key=lambda x: -x[1])}

    # Find date range across all repos
    first_dates = [r.first_commit_date for r in repos if r.first_commit_date]
    last_dates = [r.last_commit_date for r in repos if r.last_commit_date]
    earliest_date = min(first_dates).isoformat() if first_dates else "Unknown"
    latest_date = max(last_dates).isoformat() if last_dates else "Unknown"

    # Build metadata header to prepend
    repos_list = ", ".join(r.name for r in repos)
    languages_str = ", ".join([f"{k} ({v}%)" for k, v in all_languages.items()]) if all_languages else "Unknown"

    metadata_header = f"""- **Repositories**: {repos_list}
- **Total Commits**: {total_commits}
- **Languages**: {languages_str}
- **Active Period**: {earliest_date} to {latest_date}"""

    profiles_text = "\n\n---\n\n".join([
        f"## Repository: {rp['name']}\n\n{rp['profile']}"
        for rp in repo_profiles
    ])

    system_prompt = """You are creating a unified developer profile from multiple project analyses.

Combine the insights into a single cohesive profile that:
1. Highlights the breadth of technical skills across projects
2. Identifies common patterns and expertise areas
3. Showcases the most impressive accomplishments WITH the reasoning behind them
4. Maintains specificity - don't generalize away the concrete details
5. Preserves the "why" explanations that demonstrate engineering judgment

Structure:
1. **Summary**: Overall technical profile (2-3 sentences)
2. **Key Technical Skills (used across these codebases)**: Technologies used across projects, with context on HOW they were used
3. **Notable Projects & Contributions**: One section per major project with key accomplishments. For significant work, include "**Why**:" explanations that show the problem being solved or the motivation behind the decision.
4. **Development Philosophy (evidence-based)**: Patterns that emerge across the work, with evidence-based reasoning (e.g., "Instrument first, optimize with data—introduced timing utilities before optimization to avoid guessing at bottlenecks")"""

    user_prompt = f"""Combine these repository analyses into a unified developer profile:

{profiles_text}

Create a cohesive markdown profile that represents the developer's complete body of work, starting with Summary.

Preserve and highlight the "why" explanations that demonstrate engineering judgment—these show the developer thinks about problems, not just code."""

    # Get model for final synthesis
    final_synthesis_model = synthesis_model
    if not final_synthesis_model:
        llm_config = get_llm_config()
        final_synthesis_model = llm_config.get("synthesis_model") or DEFAULT_SYNTHESIS_MODEL

    if progress_callback:
        progress_callback(
            step="Finalizing",
            detail="Generating unified developer profile...",
            repo="all",
            progress=95.0,
        )

    response = await client.chat.completions.create(
        model=final_synthesis_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=SYNTHESIS_TEMPERATURE,
        max_tokens=16000,
    )

    if progress_callback:
        progress_callback(
            step="Complete",
            detail="Profile ready!",
            repo="",
            progress=100.0,
        )

    llm_content = response.choices[0].message.content or ""

    # Prepend metadata header
    return f"{metadata_header}\n\n---\n\n{llm_content}"
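
To make the progress scaling in make_repo_callback concrete, here is a worked example taken directly from its arithmetic: with three repos, the second repo (index 1) reporting 50% internally maps to 45% overall, since each repo owns an equal slice of the first 90% and the final 10% is reserved for the merge step.

# Worked example of the scaling in make_repo_callback (values chosen for illustration).
total_repos, repo_idx, progress = 3, 1, 50.0
repo_start = (repo_idx / total_repos) * 90        # 30.0
repo_end = ((repo_idx + 1) / total_repos) * 90    # 60.0
print(repo_start + (progress / 100) * (repo_end - repo_start))  # 45.0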