rex-machine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """rex-machine - Extract lessons learned from code repositories."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from rex_machine.cli import app
2
+
3
+ app()
rex_machine/agents.py ADDED
@@ -0,0 +1,638 @@
1
+ """Agentic pipeline for rex-machine using the Anthropic SDK.
2
+
3
+ Each sub-agent runs an autonomous tool-use loop where Claude navigates
4
+ the repository by calling tools (list_files, read_file, grep), similar
5
+ to how Claude Code explores a codebase.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import fnmatch
11
+ import json
12
+ import logging
13
+ import os
14
+ import re
15
+ from datetime import datetime, timezone
16
+ from enum import Enum
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ import anthropic
21
+ import anyio
22
+ from jinja2 import Environment, FileSystemLoader
23
+
24
+ from rex_machine.models import RepoQuality, RexReport
25
+ from rex_machine.scanner import SKIP_DIRS, RepoMap, scan_repo
26
+
27
# Package-wide logger used by every stage of the pipeline in this module.
logger = logging.getLogger("rex_machine")

# ``max_tokens`` budgets for sub-agent turns and for the synthesis call.
SUBAGENT_MAX_TOKENS = 4 * 1024
SYNTHESIS_MAX_TOKENS = 8 * 1024
# Default number of model round-trips a sub-agent may spend exploring.
DEFAULT_MAX_TOOL_CALLS = 30

# Language code -> language name handed to the prompt templates.
LANG_NAMES = dict(en="English", fr="French")
SUPPORTED_LANGS = frozenset(LANG_NAMES.keys())
35
+
36
+ # ─── Prompt templates ────────────────────────────────────────────
37
+
38
# Prompt templates are looked up in templates/prompts/ next to this module.
_TEMPLATE_DIR = Path(__file__).parent.joinpath("templates", "prompts")
# Prompts are plain text: HTML auto-escaping is off and a template's
# trailing newline is not kept.
_jinja = Environment(
    autoescape=False,
    keep_trailing_newline=False,
    loader=FileSystemLoader(str(_TEMPLATE_DIR)),
)
44
+
45
+
46
def _load_prompt(name: str, **kwargs: object) -> str:
    """Render the prompt template ``<name>.j2`` and return the stripped text.

    Args:
        name: Template base name, without the ``.j2`` suffix.
        **kwargs: Variables made available to the template.

    Returns:
        The rendered prompt with leading and trailing whitespace removed.
    """
    template = _jinja.get_template(f"{name}.j2")
    rendered = template.render(**kwargs)
    return rendered.strip()
48
+
49
+
50
+ # ─── Provider / Client ───────────────────────────────────────────
51
+
52
+
53
class Provider(str, Enum):
    """Backends through which the pipeline can reach Claude.

    Members subclass ``str``, so each one compares equal to its plain
    string value and can be looked up with ``Provider("bedrock")``.
    """

    ANTHROPIC = "anthropic"  # default provider
    FOUNDRY = "foundry"  # served from an *.services.ai.azure.com host
    BEDROCK = "bedrock"  # configured with an AWS region
    VERTEX = "vertex"  # configured with a GCP project and region
58
+
59
+
60
# Union of the SDK client classes that ``create_client`` may return.
AsyncClient = (
    anthropic.AsyncAnthropic
    | anthropic.AsyncAnthropicBedrock
    | anthropic.AsyncAnthropicVertex
)
63
+
64
+
65
def create_client(
    provider: Provider = Provider.ANTHROPIC,
    *,
    api_key: str | None = None,
    foundry_resource: str | None = None,
    foundry_api_key: str | None = None,
    aws_region: str | None = None,
    gcp_project_id: str | None = None,
    gcp_region: str | None = None,
) -> AsyncClient:
    """Create the appropriate async client for the chosen provider.

    Args:
        provider: Backend to talk to. Defaults to the Anthropic API.
        api_key: API key for the Anthropic provider; passed through as-is
            (``None`` is forwarded to the SDK unchanged).
        foundry_resource: Foundry resource name; becomes the sub-domain of
            the ``services.ai.azure.com`` endpoint.
        foundry_api_key: Key for the Foundry endpoint; sent both as the SDK
            ``api_key`` and as an ``api-key`` header.
        aws_region: Bedrock region, ``"us-east-1"`` when omitted.
        gcp_project_id: Vertex project id.
        gcp_region: Vertex region, ``"us-east5"`` when omitted.

    Returns:
        An async SDK client for the requested provider.

    Raises:
        ValueError: If the Foundry provider is selected without a resource
            name or without an API key.
    """
    if provider == Provider.FOUNDRY:
        # Without a resource name the URL would point at the invalid host
        # "https://.services.ai.azure.com", so fail fast instead.
        if not foundry_resource:
            raise ValueError("Foundry resource name is required")
        if not foundry_api_key:
            raise ValueError("Foundry API key is required")
        return anthropic.AsyncAnthropic(
            api_key=foundry_api_key,
            base_url=f"https://{foundry_resource}.services.ai.azure.com/anthropic/",
            default_headers={"api-key": foundry_api_key},
        )

    if provider == Provider.BEDROCK:
        return anthropic.AsyncAnthropicBedrock(aws_region=aws_region or "us-east-1")

    if provider == Provider.VERTEX:
        vertex_kwargs: dict[str, Any] = {"region": gcp_region or "us-east5"}
        # NOTE(review): the project id is only forwarded when one was given,
        # on the assumption that the SDK resolves a missing project id itself
        # (environment / credentials); passing "" would pin an empty project.
        # Confirm against the installed anthropic SDK version.
        if gcp_project_id:
            vertex_kwargs["project_id"] = gcp_project_id
        return anthropic.AsyncAnthropicVertex(**vertex_kwargs)

    return anthropic.AsyncAnthropic(api_key=api_key)
96
+
97
+
98
+ # ─── Tool definitions (JSON Schema for Claude) ──────────────────
99
+
100
+
101
def _string_prop(description: str) -> dict[str, Any]:
    """Build the JSON Schema entry for a string-typed tool parameter."""
    return {"type": "string", "description": description}


def _integer_prop(description: str, **constraints: Any) -> dict[str, Any]:
    """Build the JSON Schema entry for an integer parameter plus constraints."""
    return {"type": "integer", "description": description, **constraints}


def _tool_spec(
    name: str,
    description: str,
    properties: dict[str, Any],
    required: list[str],
) -> dict[str, Any]:
    """Assemble one tool definition with an object-typed input schema."""
    return {
        "name": name,
        "description": description,
        "input_schema": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


# Tool definitions offered to the model; the names match the handlers
# dispatched by ``ToolExecutor.execute``.
REPO_TOOLS: list[dict[str, Any]] = [
    _tool_spec(
        "list_files",
        "List files and directories at a path in the repository. "
        "Use 'pattern' for glob matching (e.g. '*.py', '**/*.test.ts').",
        {
            "path": _string_prop("Relative directory path. Defaults to repo root."),
            "pattern": _string_prop("Glob pattern to filter files (e.g. '*.py')."),
        },
        [],
    ),
    _tool_spec(
        "read_file",
        "Read the contents of a file. Returns text with line numbers. "
        "For large files, use start_line/end_line to read a section.",
        {
            "path": _string_prop("Relative file path within the repository."),
            "start_line": _integer_prop("First line to read (1-based).", minimum=1),
            "end_line": _integer_prop("Last line to read (inclusive).", minimum=1),
        },
        ["path"],
    ),
    _tool_spec(
        "grep",
        "Search for a text or regex pattern across files. "
        "Returns matching lines with file paths and line numbers.",
        {
            "pattern": _string_prop("Text or regex pattern to search for."),
            "path": _string_prop("Restrict search to this subdirectory."),
            "file_pattern": _string_prop("Glob to filter file types (e.g. '*.py')."),
            "max_results": _integer_prop(
                "Max matches to return (default 50, max 100).",
                default=50,
                maximum=100,
            ),
        },
        ["pattern"],
    ),
]
182
+
183
+
184
+ # ─── Tool executor (read-only, scoped to repo) ──────────────────
185
+
186
+
187
class ToolExecutor:
    """Executes repository exploration tools. All operations are read-only.

    Each tool receives the raw input dict produced by the model and returns
    a plain-text result. Paths supplied by the model are resolved against
    the repository root and rejected when they would fall outside it.
    """

    _MAX_LISTED_ENTRIES = 200
    _MAX_READ_BYTES = 2 * 1024 * 1024  # above this, only ranged reads are served
    _MAX_UNRANGED_LINES = 500
    _MAX_STREAMED_LINES = 2000  # window cap for ranged reads of oversized files
    _MAX_GREP_FILE_BYTES = 1024 * 1024
    _DEFAULT_GREP_RESULTS = 50
    _MAX_GREP_RESULTS = 100

    def __init__(self, repo_path: str) -> None:
        self.root = Path(repo_path).resolve()

    def _safe_path(self, relative: str) -> Path:
        """Resolve a path safely within the repo root.

        Raises:
            ValueError: If the resolved path is not located under the root.
        """
        cleaned = relative.replace("\\", "/").lstrip("/")
        resolved = (self.root / cleaned).resolve()
        try:
            resolved.relative_to(self.root)
        except ValueError:
            # The relative_to() failure adds nothing useful to the message.
            raise ValueError(f"Path outside repository: {relative}") from None
        return resolved

    def _is_inside_repo(self, path: Path) -> bool:
        """Return True when *path*, with symlinks resolved, is under the root."""
        try:
            path.resolve().relative_to(self.root)
        except (OSError, RuntimeError, ValueError):
            return False
        return True

    def execute(self, name: str, input_data: dict[str, Any]) -> str:
        """Execute a tool call and return the result string.

        Unknown tool names and any exception raised by a handler are
        reported as text, so the caller always gets a string back.
        """
        dispatch = {
            "list_files": self._list_files,
            "read_file": self._read_file,
            "grep": self._grep,
        }
        handler = dispatch.get(name)
        if not handler:
            return f"Unknown tool: {name}"
        try:
            return handler(input_data)
        except Exception as exc:
            # Deliberately broad: this is the tool boundary and the error
            # text is handed back to the model instead of aborting the run.
            return f"Error: {exc}"

    def _format_entry(self, item: Path) -> str:
        """Format one listing line: ``dir/`` or ``file (N bytes)``."""
        rel = str(item.relative_to(self.root)).replace("\\", "/")
        if item.is_dir():
            return f" {rel}/"
        try:
            return f" {rel} ({item.stat().st_size:,} bytes)"
        except OSError:
            return f" {rel}"

    def _list_files(self, data: dict[str, Any]) -> str:
        """List a directory, or recursively glob beneath it when 'pattern' is set."""
        path = data.get("path") or "."
        pattern = data.get("pattern")
        target = self._safe_path(path)

        if not target.is_dir():
            return f"Not a directory: {path}"

        limit = self._MAX_LISTED_ENTRIES

        if pattern:
            depth = len(target.parts)
            entries: list[str] = []
            capped = False
            for match in sorted(target.rglob(pattern)):
                # Ignore anything inside a skipped directory (and the skipped
                # directory itself), as the plain listing and grep do.
                ancestors = match.parts[depth:-1]
                if any(part in SKIP_DIRS for part in ancestors):
                    continue
                if match.name in SKIP_DIRS and match.is_dir():
                    continue
                if len(entries) >= limit:
                    # Only report the cap when a further match really exists.
                    capped = True
                    break
                entries.append(self._format_entry(match))
            if not entries:
                return f"No files matching '{pattern}' in {path}/"
            if capped:
                entries.append(" ... (capped at 200)")
            return f"Files matching '{pattern}' in {path}/:\n" + "\n".join(entries)

        entries = [
            self._format_entry(item)
            for item in sorted(target.iterdir())
            if not (item.is_dir() and item.name in SKIP_DIRS)
        ]
        if not entries:
            return f"Empty directory: {path}/"
        shown = entries[:limit]
        if len(entries) > limit:
            shown.append(" ... (capped at 200)")
        return f"Contents of {path}/:\n" + "\n".join(shown)

    @staticmethod
    def _format_range(path: str, selected: list[str], first: int, total: int) -> str:
        """Number *selected* starting at *first* and prepend the range header."""
        numbered = "\n".join(f"{first + i:4d} | {ln}" for i, ln in enumerate(selected))
        last = first + len(selected) - 1 if selected else first
        return f"{path} (lines {first}-{last} of {total}):\n{numbered}"

    def _read_large_range(
        self,
        target: Path,
        path: str,
        start: int | None,
        end: int | None,
    ) -> str:
        """Serve a ranged read of an oversized file without loading it whole.

        The file is streamed line by line; at most ``_MAX_STREAMED_LINES``
        lines are returned. Lines are split on ``\\n``/``\\r``/``\\r\\n`` only,
        unlike the ``str.splitlines`` rules used for regular-sized files.
        """
        first = max(1, start or 1)
        window_end = first + self._MAX_STREAMED_LINES - 1
        last = min(end, window_end) if end else window_end

        selected: list[str] = []
        total = 0
        try:
            with target.open(encoding="utf-8", errors="replace") as handle:
                for total, line in enumerate(handle, 1):
                    if first <= total <= last:
                        selected.append(line.rstrip("\n"))
        except OSError as exc:
            return f"Cannot read: {exc}"

        result = self._format_range(path, selected, first, total)
        wanted_more = not end or end > last
        if wanted_more and total > last:
            result += "\n\n... [truncated — use start_line/end_line for more]"
        return result

    def _read_file(self, data: dict[str, Any]) -> str:
        """Return a file's text with line numbers, optionally a line range.

        Without a range, at most the first 500 lines are returned. Files
        larger than 2 MiB are only served through start_line/end_line.
        """
        path = data["path"]
        start = data.get("start_line")
        end = data.get("end_line")
        target = self._safe_path(path)

        if not target.is_file():
            return f"File not found: {path}"

        ranged = bool(start or end)
        size = target.stat().st_size
        if size > self._MAX_READ_BYTES:
            if not ranged:
                return f"File too large ({size:,} bytes). Use start_line/end_line to read a section."
            # Honour the advice given in the message above: a ranged request
            # must work on large files too.
            return self._read_large_range(target, path, start, end)

        try:
            text = target.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            return f"Cannot read: {exc}"

        lines = text.splitlines()
        total = len(lines)

        if ranged:
            first = max(1, start or 1)
            last = min(total, end or total)
            return self._format_range(path, lines[first - 1:last], first, total)

        if total > self._MAX_UNRANGED_LINES:
            head = lines[: self._MAX_UNRANGED_LINES]
            numbered = "\n".join(f"{i + 1:4d} | {ln}" for i, ln in enumerate(head))
            return (
                f"{path} ({total} lines, showing 1-500):\n{numbered}\n\n"
                f"... [truncated — use start_line/end_line for more]"
            )
        return self._format_range(path, lines, 1, total)

    def _grep(self, data: dict[str, Any]) -> str:
        """Search files under 'path' for a case-insensitive regex.

        An invalid regex is searched for as literal text. Directories in
        SKIP_DIRS, dot-directories, files over 1 MiB and symlinks that
        resolve outside the repository are not searched.
        """
        pattern_str = data["pattern"]
        path = data.get("path") or "."
        file_pattern = data.get("file_pattern")
        requested = data.get("max_results")
        if requested is None:
            requested = self._DEFAULT_GREP_RESULTS
        # Clamp into [1, 100] so zero or negative values cannot yield a bogus
        # "capped at 0" result.
        max_results = max(1, min(int(requested), self._MAX_GREP_RESULTS))

        target = self._safe_path(path)
        if not target.exists():
            return f"Path not found: {path}"

        try:
            regex = re.compile(pattern_str, re.IGNORECASE)
        except re.error:
            regex = re.compile(re.escape(pattern_str), re.IGNORECASE)

        if target.is_file():
            # Present a single file as a one-entry walk so it gets searched.
            walk: Any = [(str(target.parent), [], [target.name])]
        else:
            walk = os.walk(target)

        matches: list[str] = []
        files_searched = 0

        for root_dir, dirs, files in walk:
            if len(matches) >= max_results:
                break  # stop walking the tree once the cap is reached
            # Pruned in place so os.walk does not descend; sorted so the
            # output order does not depend on filesystem ordering.
            dirs[:] = sorted(d for d in dirs if d not in SKIP_DIRS and not d.startswith("."))
            for fname in sorted(files):
                if len(matches) >= max_results:
                    break
                if file_pattern and not fnmatch.fnmatch(fname, file_pattern):
                    continue
                fp = Path(root_dir) / fname
                # read_file refuses paths that resolve outside the repo; do
                # the same here for symlinks found while walking.
                if fp.is_symlink() and not self._is_inside_repo(fp):
                    continue
                try:
                    if fp.stat().st_size > self._MAX_GREP_FILE_BYTES:
                        continue
                    text = fp.read_text(encoding="utf-8", errors="replace")
                except OSError:
                    continue
                files_searched += 1
                rel = str(fp.relative_to(self.root)).replace("\\", "/")
                for line_num, line in enumerate(text.splitlines(), 1):
                    if regex.search(line):
                        matches.append(f" {rel}:{line_num}: {line.rstrip()[:200]}")
                        if len(matches) >= max_results:
                            break

        if not matches:
            return f"No matches for '{pattern_str}' in {path}/ ({files_searched} files searched)"
        header = f"Found {len(matches)} match(es) for '{pattern_str}'"
        if len(matches) >= max_results:
            header += f" (capped at {max_results})"
        header += f" ({files_searched} files searched)"
        return header + ":\n" + "\n".join(matches)
350
+
351
+
352
+ # ─── Agentic loop ────────────────────────────────────────────────
353
+
354
+
355
def _response_text(response: Any) -> str:
    """Concatenate the text of every ``text`` content block in *response*."""
    return "".join(block.text for block in response.content if block.type == "text")


async def run_subagent(
    client: AsyncClient,
    model: str,
    system_prompt: str,
    file_tree: str,
    repo_path: str,
    label: str,
    max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
) -> str:
    """Run a sub-agent with an autonomous tool-use loop.

    Claude navigates the repository by calling tools (list_files, read_file,
    grep), deciding on its own what to explore based on its analysis mandate.
    The loop continues until Claude produces a final text response or hits
    the tool call limit.

    Args:
        client: Async SDK client used for every request.
        model: Model identifier passed to the API.
        system_prompt: System prompt describing this sub-agent's mandate.
        file_tree: Textual file tree embedded in the first user message.
        repo_path: Repository root that the tools are scoped to.
        label: Human-readable sub-agent name, used in log messages.
        max_tool_calls: Maximum number of model round-trips in the loop. One
            round-trip may contain several tool calls; all of them are run.

    Returns:
        The concatenated text of the model's final response.
    """
    # Imported here so the block does not depend on ``anyio.to_thread`` being
    # reachable as an attribute of the top-level ``anyio`` import.
    from anyio import to_thread

    logger.info("Starting sub-agent: %s", label)
    executor = ToolExecutor(repo_path)

    user_message = (
        f"Here is the repository file tree:\n\n```\n{file_tree}\n```\n\n"
        f"Use the available tools to explore this repository. Read the files "
        f"you need, search for patterns, and build your analysis from actual "
        f"code evidence. Start by reading key files (README, entry points, "
        f"configs), then dig deeper based on what you find."
    )

    messages: list[dict[str, Any]] = [{"role": "user", "content": user_message}]

    for turn in range(max_tool_calls):
        response = await client.messages.create(
            model=model,
            max_tokens=SUBAGENT_MAX_TOKENS,
            system=system_prompt,
            messages=messages,
            tools=REPO_TOOLS,
            temperature=0.0,
        )

        if response.stop_reason != "tool_use":
            result = _response_text(response)
            logger.info(
                "Sub-agent %s completed after %d turn(s) (%d chars)",
                label,
                turn + 1,
                len(result),
            )
            return result

        messages.append({"role": "assistant", "content": response.content})

        tool_results: list[dict[str, Any]] = []
        for block in response.content:
            if block.type != "tool_use":
                continue
            logger.debug(
                " %s → %s(%s)",
                label,
                block.name,
                json.dumps(block.input)[:120],
            )
            # The tools do blocking filesystem work; run them in a worker
            # thread so sibling sub-agents sharing the event loop keep going.
            output = await to_thread.run_sync(executor.execute, block.name, block.input)
            tool_results.append(
                {
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": output,
                }
            )

        messages.append({"role": "user", "content": tool_results})

    logger.warning(
        "Sub-agent %s hit tool call limit (%d). Requesting final answer.",
        label,
        max_tool_calls,
    )
    messages.append(
        {
            "role": "user",
            "content": (
                "You've reached the exploration limit. Produce your final "
                "analysis now based on everything you've read so far."
            ),
        }
    )
    # The history contains tool_use/tool_result blocks, so the request still
    # has to declare the tools; tool_choice "none" keeps the model from
    # calling them again and forces a plain-text answer.
    response = await client.messages.create(
        model=model,
        max_tokens=SUBAGENT_MAX_TOKENS,
        system=system_prompt,
        messages=messages,
        tools=REPO_TOOLS,
        tool_choice={"type": "none"},
        temperature=0.0,
    )
    return _response_text(response)
455
+
456
+
457
+ # ─── Synthesis (forced structured output via tool_use) ───────────
458
+
459
+
460
async def _run_synthesis(
    client: AsyncClient,
    model: str,
    repo_name: str,
    repo_path: str,
    files_scanned: int,
    subagent_reports: dict[str, str],
    lang: str = "en",
) -> dict[str, Any]:
    """Merge all sub-agent reports into a structured RexReport.

    Uses tool_choice to force Claude to output valid JSON matching the schema.

    Args:
        client: Async SDK client used for the request.
        model: Model identifier passed to the API and recorded in the report.
        repo_name: Repository name recorded in the report.
        repo_path: Repository path recorded in the report.
        files_scanned: Number of scanned files recorded in the report.
        subagent_reports: Mapping of sub-agent label to its text report.
        lang: Language code; unknown codes fall back to English.

    Returns:
        The report data produced by the model, with the metadata fields the
        caller already knows (repo_name, repo_path, analyzed_at, model_used,
        files_scanned) overwritten by their authoritative values.

    Raises:
        RuntimeError: If the response contains no ``produce_rex_report`` call.
    """
    logger.info("Starting synthesis agent")

    # The model has no clock, so the timestamp is taken here rather than
    # asking it to supply the "current" time.
    analyzed_at = datetime.now(timezone.utc).isoformat()

    parts = [
        f"Repository: {repo_name}",
        f"Path: {repo_path}",
        f"Files scanned: {files_scanned}",
        "",
    ]
    for agent_name, report in subagent_reports.items():
        parts.extend((f"## {agent_name} Report\n", report, ""))

    combined = "\n".join(parts)

    lang_name = LANG_NAMES.get(lang, "English")
    main_prompt = _load_prompt("main", lang_name=lang_name)
    synth_prompt = _load_prompt("synthesis", lang_name=lang_name)
    system = f"{main_prompt}\n\n{synth_prompt}"

    user_message = (
        f"Based on the following sub-agent analysis reports, produce the final "
        f"REX report as a JSON object matching the RexReport schema.\n\n"
        f"Important fields to fill:\n"
        f'- repo_name: "{repo_name}"\n'
        f'- repo_path: "{repo_path}"\n'
        f'- analyzed_at: "{analyzed_at}"\n'
        f'- model_used: "{model}"\n'
        f"- files_scanned: {files_scanned}\n\n"
        f"Sub-agent reports:\n\n{combined}\n\n"
        f"Output the complete RexReport as valid JSON. Use the exact field "
        f"names and enum values from the schema."
    )

    rex_report_schema = RexReport.model_json_schema()

    response = await client.messages.create(
        model=model,
        max_tokens=SYNTHESIS_MAX_TOKENS,
        system=system,
        messages=[{"role": "user", "content": user_message}],
        tools=[
            {
                "name": "produce_rex_report",
                "description": (
                    "Produce the final REX report. Call this tool with the "
                    "complete report data as a single JSON object."
                ),
                "input_schema": rex_report_schema,
            }
        ],
        tool_choice={"type": "tool", "name": "produce_rex_report"},
        temperature=0.0,
    )

    if response.stop_reason == "max_tokens":
        logger.warning(
            "Synthesis output hit the %d-token limit; the report may be incomplete",
            SYNTHESIS_MAX_TOKENS,
        )

    for block in response.content:
        if block.type == "tool_use" and block.name == "produce_rex_report":
            logger.info("Synthesis complete")
            report_data: dict[str, Any] = dict(block.input)
            # These values are known for certain on this side; do not rely on
            # the model having copied them faithfully.
            report_data.update(
                repo_name=repo_name,
                repo_path=repo_path,
                analyzed_at=analyzed_at,
                model_used=model,
                files_scanned=files_scanned,
            )
            return report_data

    raise RuntimeError("Synthesis agent did not produce a valid report")
534
+
535
+
536
+ # ─── Main pipeline ───────────────────────────────────────────────
537
+
538
+
539
async def run_analysis(
    repo_path: str,
    model: str = "claude-sonnet-4-6",
    api_key: str | None = None,
    provider: Provider = Provider.ANTHROPIC,
    foundry_resource: str | None = None,
    foundry_api_key: str | None = None,
    aws_region: str | None = None,
    gcp_project_id: str | None = None,
    gcp_region: str | None = None,
    max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
    lang: str = "en",
) -> RexReport:
    """Run the full rex-machine analysis pipeline on a repository.

    1. Scan repo for file tree
    2. Run 4 sub-agents in parallel (each with autonomous tool-use loop)
    3. Synthesize findings into a structured RexReport

    Args:
        repo_path: Path of the repository to analyse.
        model: Model identifier used by the sub-agents and the synthesis.
        api_key, provider, foundry_resource, foundry_api_key, aws_region,
            gcp_project_id, gcp_region: Forwarded to ``create_client``.
        max_tool_calls: Exploration limit handed to each sub-agent.
        lang: Report language code; unknown codes fall back to English.

    Returns:
        The validated report. When the scan finds no files, a fixed
        "insufficient" report is returned without calling the model.
    """
    logger.info("Scanning repository: %s", repo_path)
    repo_map: RepoMap = scan_repo(repo_path)
    logger.info(
        "Found %d files (%d source files)",
        repo_map.total_files,
        len(repo_map.source_files),
    )

    if repo_map.total_files == 0:
        return RexReport(
            repo_name=_extract_repo_name(repo_path),
            repo_path=str(repo_path),
            analyzed_at=datetime.now(timezone.utc).isoformat(),
            model_used=model,
            files_scanned=0,
            repo_quality=RepoQuality.INSUFFICIENT,
            warnings=["Repository contains no scannable files."],
            rex_items=[],
            global_summary=("The repository is empty or contains only binary/ignored files."),
            strengths=[],
            improvement_suggestions=["Add source code to the repository."],
        )

    client = create_client(
        provider,
        api_key=api_key,
        foundry_resource=foundry_resource,
        foundry_api_key=foundry_api_key,
        aws_region=aws_region,
        gcp_project_id=gcp_project_id,
        gcp_region=gcp_region,
    )

    subagent_configs = [
        (_load_prompt("structure_analyzer"), "Structure Analyzer"),
        (_load_prompt("code_pattern_analyzer"), "Code Pattern Analyzer"),
        (_load_prompt("doc_analyzer"), "Documentation Analyzer"),
        (_load_prompt("config_analyzer"), "Configuration Analyzer"),
    ]

    if lang not in LANG_NAMES:
        logger.warning("Unsupported language %r; falling back to English", lang)
    lang_name = LANG_NAMES.get(lang, "English")
    # The shared preamble is identical for every sub-agent: render it once.
    main_prompt = _load_prompt("main", lang_name=lang_name)

    # Slot i receives the report of subagent_configs[i]; it stays None if the
    # sub-agent never stored a result.
    results: list[str | None] = [None] * len(subagent_configs)

    async def _run_one(index: int, prompt: str, label: str) -> None:
        results[index] = await run_subagent(
            client,
            model,
            f"{main_prompt}\n\n{prompt}",
            repo_map.file_tree,
            repo_path,
            label,
            max_tool_calls=max_tool_calls,
        )

    async with anyio.create_task_group() as tg:
        for i, (prompt, label) in enumerate(subagent_configs):
            tg.start_soon(_run_one, i, prompt, label)

    subagent_reports: dict[str, str] = {}
    for (_, label), result in zip(subagent_configs, results):
        if result:
            subagent_reports[label] = result
        else:
            # Empty reports are left out of the synthesis input; say so
            # instead of dropping them silently.
            logger.warning("Sub-agent %s produced no report", label)

    report_data = await _run_synthesis(
        client=client,
        model=model,
        repo_name=_extract_repo_name(repo_path),
        repo_path=str(repo_path),
        files_scanned=repo_map.total_files,
        subagent_reports=subagent_reports,
        lang=lang,
    )

    return RexReport.model_validate(report_data)
634
+
635
+
636
+ def _extract_repo_name(repo_path: str) -> str:
637
+ """Extract a human-readable repo name from a path."""
638
+ return Path(repo_path).resolve().name