rex-machine 0.2.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rex_machine-0.2.2 → rex_machine-0.3.0}/PKG-INFO +8 -1
- {rex_machine-0.2.2 → rex_machine-0.3.0}/pyproject.toml +9 -1
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/__init__.py +1 -1
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/agents.py +43 -16
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/cli.py +27 -4
- rex_machine-0.3.0/src/rex_machine/documents.py +79 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/scanner.py +10 -5
- rex_machine-0.3.0/src/rex_machine/tokens.py +33 -0
- rex_machine-0.3.0/tests/test_documents.py +62 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/tests/test_scanner.py +28 -0
- rex_machine-0.3.0/tests/test_tokens.py +50 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/.claude/settings.local.json +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/workflows/ci.yml +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/workflows/release.yml +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/.gitignore +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/CHANGELOG.md +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/CLAUDE.md +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/CONTRIBUTING.md +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/LICENSE +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/README.md +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/__main__.py +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/config.py +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/models.py +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/py.typed +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/renderer.py +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/code_pattern_analyzer.j2 +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/config_analyzer.j2 +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/doc_analyzer.j2 +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/main.j2 +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/structure_analyzer.j2 +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/synthesis.j2 +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/report.md.j2 +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/tests/__init__.py +0 -0
- {rex_machine-0.2.2 → rex_machine-0.3.0}/tests/test_models.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rex-machine
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Extract lessons learned (REX) from code repositories
|
|
5
5
|
Project-URL: Homepage, https://github.com/NicoJuiced/rex-machine
|
|
6
6
|
Project-URL: Repository, https://github.com/NicoJuiced/rex-machine
|
|
@@ -27,9 +27,16 @@ Requires-Dist: pydantic>=2.0.0
|
|
|
27
27
|
Requires-Dist: rich>=13.0.0
|
|
28
28
|
Requires-Dist: typer>=0.12.0
|
|
29
29
|
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pypdf>=4.0.0; extra == 'dev'
|
|
30
31
|
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
31
32
|
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: python-docx>=1.0.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: python-pptx>=1.0.0; extra == 'dev'
|
|
32
35
|
Requires-Dist: ruff>=0.4.0; extra == 'dev'
|
|
36
|
+
Provides-Extra: docs
|
|
37
|
+
Requires-Dist: pypdf>=4.0.0; extra == 'docs'
|
|
38
|
+
Requires-Dist: python-docx>=1.0.0; extra == 'docs'
|
|
39
|
+
Requires-Dist: python-pptx>=1.0.0; extra == 'docs'
|
|
33
40
|
Description-Content-Type: text/markdown
|
|
34
41
|
|
|
35
42
|
# rex-machine
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "rex-machine"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Extract lessons learned (REX) from code repositories"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -35,10 +35,18 @@ dependencies = [
|
|
|
35
35
|
]
|
|
36
36
|
|
|
37
37
|
[project.optional-dependencies]
|
|
38
|
+
docs = [
|
|
39
|
+
"python-docx>=1.0.0",
|
|
40
|
+
"python-pptx>=1.0.0",
|
|
41
|
+
"pypdf>=4.0.0",
|
|
42
|
+
]
|
|
38
43
|
dev = [
|
|
39
44
|
"pytest>=8.0.0",
|
|
40
45
|
"pytest-asyncio>=0.23.0",
|
|
41
46
|
"ruff>=0.4.0",
|
|
47
|
+
"python-docx>=1.0.0",
|
|
48
|
+
"python-pptx>=1.0.0",
|
|
49
|
+
"pypdf>=4.0.0",
|
|
42
50
|
]
|
|
43
51
|
|
|
44
52
|
[project.urls]
|
|
@@ -21,8 +21,10 @@ import anthropic
|
|
|
21
21
|
import anyio
|
|
22
22
|
from jinja2 import Environment, FileSystemLoader
|
|
23
23
|
|
|
24
|
+
from rex_machine.documents import DOCUMENT_EXTENSIONS, extract_text
|
|
24
25
|
from rex_machine.models import RepoQuality, RexReport
|
|
25
26
|
from rex_machine.scanner import SKIP_DIRS, RepoMap, scan_repo
|
|
27
|
+
from rex_machine.tokens import TokenTracker
|
|
26
28
|
|
|
27
29
|
logger = logging.getLogger("rex_machine")
|
|
28
30
|
|
|
@@ -267,6 +269,12 @@ class ToolExecutor:
|
|
|
267
269
|
if not target.is_file():
|
|
268
270
|
return f"File not found: {path}"
|
|
269
271
|
|
|
272
|
+
if target.suffix.lower() in DOCUMENT_EXTENSIONS:
|
|
273
|
+
result = extract_text(target)
|
|
274
|
+
if result is None:
|
|
275
|
+
return f"Unsupported document format: {path}"
|
|
276
|
+
return f"{path} (document):\n{result}"
|
|
277
|
+
|
|
270
278
|
size = target.stat().st_size
|
|
271
279
|
if size > 2 * 1024 * 1024:
|
|
272
280
|
return f"File too large ({size:,} bytes). Use start_line/end_line to read a section."
|
|
@@ -360,6 +368,7 @@ async def run_subagent(
|
|
|
360
368
|
repo_path: str,
|
|
361
369
|
label: str,
|
|
362
370
|
max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
|
|
371
|
+
tracker: TokenTracker | None = None,
|
|
363
372
|
) -> str:
|
|
364
373
|
"""Run a sub-agent with an autonomous tool-use loop.
|
|
365
374
|
|
|
@@ -390,6 +399,8 @@ async def run_subagent(
|
|
|
390
399
|
tools=REPO_TOOLS,
|
|
391
400
|
temperature=0.0,
|
|
392
401
|
)
|
|
402
|
+
if tracker:
|
|
403
|
+
tracker.record(response.usage.input_tokens, response.usage.output_tokens)
|
|
393
404
|
|
|
394
405
|
if response.stop_reason != "tool_use":
|
|
395
406
|
result = ""
|
|
@@ -404,6 +415,10 @@ async def run_subagent(
|
|
|
404
415
|
)
|
|
405
416
|
return result
|
|
406
417
|
|
|
418
|
+
if tracker and tracker.budget_exceeded:
|
|
419
|
+
logger.warning("Sub-agent %s stopped: token budget exceeded", label)
|
|
420
|
+
break
|
|
421
|
+
|
|
407
422
|
messages.append({"role": "assistant", "content": response.content})
|
|
408
423
|
|
|
409
424
|
tool_results: list[dict[str, Any]] = []
|
|
@@ -427,9 +442,8 @@ async def run_subagent(
|
|
|
427
442
|
messages.append({"role": "user", "content": tool_results})
|
|
428
443
|
|
|
429
444
|
logger.warning(
|
|
430
|
-
"Sub-agent %s hit
|
|
445
|
+
"Sub-agent %s hit limit. Requesting final answer.",
|
|
431
446
|
label,
|
|
432
|
-
max_tool_calls,
|
|
433
447
|
)
|
|
434
448
|
messages.append(
|
|
435
449
|
{
|
|
@@ -447,6 +461,8 @@ async def run_subagent(
|
|
|
447
461
|
messages=messages,
|
|
448
462
|
temperature=0.0,
|
|
449
463
|
)
|
|
464
|
+
if tracker:
|
|
465
|
+
tracker.record(response.usage.input_tokens, response.usage.output_tokens)
|
|
450
466
|
result = ""
|
|
451
467
|
for block in response.content:
|
|
452
468
|
if block.type == "text":
|
|
@@ -465,6 +481,7 @@ async def _run_synthesis(
|
|
|
465
481
|
files_scanned: int,
|
|
466
482
|
subagent_reports: dict[str, str],
|
|
467
483
|
lang: str = "en",
|
|
484
|
+
tracker: TokenTracker | None = None,
|
|
468
485
|
) -> dict[str, Any]:
|
|
469
486
|
"""Merge all sub-agent reports into a structured RexReport.
|
|
470
487
|
|
|
@@ -524,6 +541,8 @@ async def _run_synthesis(
|
|
|
524
541
|
tool_choice={"type": "tool", "name": "produce_rex_report"},
|
|
525
542
|
temperature=0.0,
|
|
526
543
|
)
|
|
544
|
+
if tracker:
|
|
545
|
+
tracker.record(response.usage.input_tokens, response.usage.output_tokens)
|
|
527
546
|
|
|
528
547
|
for block in response.content:
|
|
529
548
|
if block.type == "tool_use" and block.name == "produce_rex_report":
|
|
@@ -548,13 +567,16 @@ async def run_analysis(
|
|
|
548
567
|
gcp_region: str | None = None,
|
|
549
568
|
max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
|
|
550
569
|
lang: str = "en",
|
|
551
|
-
|
|
570
|
+
max_tokens: int | None = None,
|
|
571
|
+
) -> tuple[RexReport, TokenTracker]:
|
|
552
572
|
"""Run the full rex-machine analysis pipeline on a repository.
|
|
553
573
|
|
|
554
574
|
1. Scan repo for file tree
|
|
555
575
|
2. Run 4 sub-agents in parallel (each with autonomous tool-use loop)
|
|
556
576
|
3. Synthesize findings into a structured RexReport
|
|
557
577
|
"""
|
|
578
|
+
tracker = TokenTracker(max_tokens=max_tokens)
|
|
579
|
+
|
|
558
580
|
logger.info("Scanning repository: %s", repo_path)
|
|
559
581
|
repo_map: RepoMap = scan_repo(repo_path)
|
|
560
582
|
logger.info(
|
|
@@ -564,18 +586,21 @@ async def run_analysis(
|
|
|
564
586
|
)
|
|
565
587
|
|
|
566
588
|
if repo_map.total_files == 0:
|
|
567
|
-
return RexReport(
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
589
|
+
return (
|
|
590
|
+
RexReport(
|
|
591
|
+
repo_name=_extract_repo_name(repo_path),
|
|
592
|
+
repo_path=str(repo_path),
|
|
593
|
+
analyzed_at=datetime.now(timezone.utc).isoformat(),
|
|
594
|
+
model_used=model,
|
|
595
|
+
files_scanned=0,
|
|
596
|
+
repo_quality=RepoQuality.INSUFFICIENT,
|
|
597
|
+
warnings=["Repository contains no scannable files."],
|
|
598
|
+
rex_items=[],
|
|
599
|
+
global_summary=("The repository is empty or contains only binary/ignored files."),
|
|
600
|
+
strengths=[],
|
|
601
|
+
improvement_suggestions=["Add source code to the repository."],
|
|
602
|
+
),
|
|
603
|
+
tracker,
|
|
579
604
|
)
|
|
580
605
|
|
|
581
606
|
client = create_client(
|
|
@@ -610,6 +635,7 @@ async def run_analysis(
|
|
|
610
635
|
repo_path,
|
|
611
636
|
label,
|
|
612
637
|
max_tool_calls=max_tool_calls,
|
|
638
|
+
tracker=tracker,
|
|
613
639
|
)
|
|
614
640
|
|
|
615
641
|
async with anyio.create_task_group() as tg:
|
|
@@ -628,9 +654,10 @@ async def run_analysis(
|
|
|
628
654
|
files_scanned=repo_map.total_files,
|
|
629
655
|
subagent_reports=subagent_reports,
|
|
630
656
|
lang=lang,
|
|
657
|
+
tracker=tracker,
|
|
631
658
|
)
|
|
632
659
|
|
|
633
|
-
return RexReport.model_validate(report_data)
|
|
660
|
+
return RexReport.model_validate(report_data), tracker
|
|
634
661
|
|
|
635
662
|
|
|
636
663
|
def _extract_repo_name(repo_path: str) -> str:
|
|
@@ -28,8 +28,8 @@ from rex_machine.config import (
|
|
|
28
28
|
save_global,
|
|
29
29
|
save_project,
|
|
30
30
|
)
|
|
31
|
-
from rex_machine.models import RexReport
|
|
32
31
|
from rex_machine.renderer import render_console, render_json, render_markdown
|
|
32
|
+
from rex_machine.tokens import TokenTracker
|
|
33
33
|
|
|
34
34
|
app = typer.Typer(
|
|
35
35
|
name="rex",
|
|
@@ -430,6 +430,10 @@ def extract(
|
|
|
430
430
|
Provider | None,
|
|
431
431
|
typer.Option("--provider", "-p", help="API provider (overrides config)."),
|
|
432
432
|
] = None,
|
|
433
|
+
max_tokens: Annotated[
|
|
434
|
+
int | None,
|
|
435
|
+
typer.Option("--max-tokens", "-t", help="Global token budget (stops when exceeded)."),
|
|
436
|
+
] = None,
|
|
433
437
|
lang: Annotated[
|
|
434
438
|
str,
|
|
435
439
|
typer.Option("--lang", "-l", help="Output language: en (default) or fr."),
|
|
@@ -478,29 +482,31 @@ def extract(
|
|
|
478
482
|
console.print(f"Analyzing: [cyan]{repo_path}[/cyan]")
|
|
479
483
|
console.print(f"Model: [cyan]{effective_model}[/cyan]")
|
|
480
484
|
console.print(f"Provider: [cyan]{effective_provider.value}[/cyan]")
|
|
485
|
+
if max_tokens:
|
|
486
|
+
console.print(f"Token budget: [cyan]{max_tokens:,}[/cyan]")
|
|
481
487
|
if lang != "en":
|
|
482
488
|
console.print(f"Language: [cyan]{lang}[/cyan]")
|
|
483
489
|
console.print()
|
|
484
490
|
|
|
485
491
|
try:
|
|
486
492
|
|
|
487
|
-
async def _run() -> RexReport:
|
|
493
|
+
async def _run() -> tuple:
|
|
488
494
|
return await run_analysis(
|
|
489
495
|
repo_path,
|
|
490
496
|
effective_model,
|
|
491
497
|
provider=effective_provider,
|
|
492
498
|
max_tool_calls=max_tool_calls,
|
|
493
499
|
lang=lang,
|
|
500
|
+
max_tokens=max_tokens,
|
|
494
501
|
**creds,
|
|
495
502
|
)
|
|
496
503
|
|
|
497
504
|
with console.status("[bold green]Analyzing repository...", spinner="dots"):
|
|
498
|
-
report = anyio.run(_run)
|
|
505
|
+
report, tracker = anyio.run(_run)
|
|
499
506
|
except KeyboardInterrupt:
|
|
500
507
|
console.print("\n[yellow]Analysis cancelled.[/yellow]")
|
|
501
508
|
raise typer.Exit(code=130)
|
|
502
509
|
except BaseException as exc:
|
|
503
|
-
# Unwrap ExceptionGroup → first cause only (except* requires Python 3.11+)
|
|
504
510
|
cause = exc
|
|
505
511
|
while hasattr(cause, "exceptions"):
|
|
506
512
|
cause = cause.exceptions[0]
|
|
@@ -528,6 +534,23 @@ def extract(
|
|
|
528
534
|
else:
|
|
529
535
|
console.print_json(result)
|
|
530
536
|
|
|
537
|
+
_print_token_summary(tracker)
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def _print_token_summary(tracker: TokenTracker) -> None:
|
|
541
|
+
table = Table(title="Token Usage", show_header=False, border_style="dim")
|
|
542
|
+
table.add_column("Metric", style="bold")
|
|
543
|
+
table.add_column("Value", justify="right")
|
|
544
|
+
table.add_row("Input tokens", f"{tracker.input_tokens:,}")
|
|
545
|
+
table.add_row("Output tokens", f"{tracker.output_tokens:,}")
|
|
546
|
+
table.add_row("Total tokens", f"{tracker.total_tokens:,}")
|
|
547
|
+
table.add_row("API calls", str(tracker.api_calls))
|
|
548
|
+
if tracker.max_tokens:
|
|
549
|
+
pct = tracker.total_tokens / tracker.max_tokens * 100
|
|
550
|
+
table.add_row("Budget used", f"{pct:.1f}%")
|
|
551
|
+
console.print()
|
|
552
|
+
console.print(table)
|
|
553
|
+
|
|
531
554
|
|
|
532
555
|
def _write_file(path: Path, content: str) -> None:
|
|
533
556
|
try:
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Document text extraction for rex-machine.
|
|
2
|
+
|
|
3
|
+
Supports .docx, .pptx, and .pdf files. Libraries are imported at runtime
|
|
4
|
+
so the tool works without them — install with `pip install rex-machine[docs]`.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
DOCUMENT_EXTENSIONS = frozenset({".docx", ".pptx", ".pdf"})
|
|
12
|
+
|
|
13
|
+
_INSTALL_HINT = "Install document support with: pip install rex-machine[docs]"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def extract_text(filepath: Path, max_chars: int = 5000) -> str | None:
|
|
17
|
+
"""Extract plain text from a document file.
|
|
18
|
+
|
|
19
|
+
Returns None if the file is not a supported document format.
|
|
20
|
+
Returns an error message string if extraction fails.
|
|
21
|
+
"""
|
|
22
|
+
ext = filepath.suffix.lower()
|
|
23
|
+
if ext not in DOCUMENT_EXTENSIONS:
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
extractors = {
|
|
27
|
+
".docx": _extract_docx,
|
|
28
|
+
".pptx": _extract_pptx,
|
|
29
|
+
".pdf": _extract_pdf,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
text = extractors[ext](filepath)
|
|
34
|
+
except ImportError:
|
|
35
|
+
return f"Cannot read {ext} files. {_INSTALL_HINT}"
|
|
36
|
+
except Exception as exc:
|
|
37
|
+
return f"Error reading document: {exc}"
|
|
38
|
+
|
|
39
|
+
if not text.strip():
|
|
40
|
+
return "(empty document)"
|
|
41
|
+
|
|
42
|
+
if len(text) > max_chars:
|
|
43
|
+
return text[:max_chars] + "\n\n... [truncated]"
|
|
44
|
+
return text
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _extract_docx(filepath: Path) -> str:
|
|
48
|
+
from docx import Document
|
|
49
|
+
|
|
50
|
+
doc = Document(str(filepath))
|
|
51
|
+
return "\n".join(p.text for p in doc.paragraphs)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _extract_pptx(filepath: Path) -> str:
|
|
55
|
+
from pptx import Presentation
|
|
56
|
+
|
|
57
|
+
prs = Presentation(str(filepath))
|
|
58
|
+
parts: list[str] = []
|
|
59
|
+
for i, slide in enumerate(prs.slides, 1):
|
|
60
|
+
slide_texts: list[str] = []
|
|
61
|
+
for shape in slide.shapes:
|
|
62
|
+
if shape.has_text_frame:
|
|
63
|
+
slide_texts.append(shape.text_frame.text)
|
|
64
|
+
if slide_texts:
|
|
65
|
+
parts.append(f"--- Slide {i} ---")
|
|
66
|
+
parts.append("\n".join(slide_texts))
|
|
67
|
+
return "\n".join(parts)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _extract_pdf(filepath: Path) -> str:
|
|
71
|
+
from pypdf import PdfReader
|
|
72
|
+
|
|
73
|
+
reader = PdfReader(str(filepath))
|
|
74
|
+
parts: list[str] = []
|
|
75
|
+
for page in reader.pages:
|
|
76
|
+
text = page.extract_text()
|
|
77
|
+
if text:
|
|
78
|
+
parts.append(text)
|
|
79
|
+
return "\n".join(parts)
|
|
@@ -8,6 +8,8 @@ import os
|
|
|
8
8
|
from dataclasses import dataclass, field
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
|
|
11
|
+
from rex_machine.documents import DOCUMENT_EXTENSIONS
|
|
12
|
+
|
|
11
13
|
# Extensions that are almost certainly binary
|
|
12
14
|
BINARY_EXTENSIONS = frozenset(
|
|
13
15
|
{
|
|
@@ -42,13 +44,10 @@ BINARY_EXTENSIONS = frozenset(
|
|
|
42
44
|
".pyo",
|
|
43
45
|
".class",
|
|
44
46
|
".wasm",
|
|
45
|
-
".pdf",
|
|
46
47
|
".doc",
|
|
47
|
-
".docx",
|
|
48
48
|
".xls",
|
|
49
49
|
".xlsx",
|
|
50
50
|
".ppt",
|
|
51
|
-
".pptx",
|
|
52
51
|
".ttf",
|
|
53
52
|
".otf",
|
|
54
53
|
".woff",
|
|
@@ -101,6 +100,7 @@ class FileInfo:
|
|
|
101
100
|
size_bytes: int
|
|
102
101
|
extension: str
|
|
103
102
|
is_binary: bool
|
|
103
|
+
is_document: bool = False
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
@dataclass
|
|
@@ -117,9 +117,9 @@ class RepoMap:
|
|
|
117
117
|
def file_tree(self) -> str:
|
|
118
118
|
"""Return a textual file tree representation."""
|
|
119
119
|
lines: list[str] = []
|
|
120
|
+
doc_paths = {f.relative_path for f in self.files if f.is_document}
|
|
120
121
|
dirs: dict[str, list[str]] = {}
|
|
121
122
|
for f in self.files:
|
|
122
|
-
# Use forward-slash splitting to avoid Windows backslash issues
|
|
123
123
|
if "/" in f.relative_path:
|
|
124
124
|
parent = f.relative_path.rsplit("/", 1)[0]
|
|
125
125
|
name = f.relative_path.rsplit("/", 1)[1]
|
|
@@ -133,7 +133,9 @@ class RepoMap:
|
|
|
133
133
|
lines.append(f"{dir_path}/")
|
|
134
134
|
for name in sorted(dirs[dir_path]):
|
|
135
135
|
prefix = f" {dir_path}/" if dir_path else ""
|
|
136
|
-
|
|
136
|
+
rel = f"{dir_path}/{name}" if dir_path else name
|
|
137
|
+
tag = " [doc]" if rel in doc_paths else ""
|
|
138
|
+
lines.append(f" {prefix}{name}{tag}")
|
|
137
139
|
|
|
138
140
|
return "\n".join(lines)
|
|
139
141
|
|
|
@@ -188,6 +190,8 @@ def _should_skip_dir(dirname: str) -> bool:
|
|
|
188
190
|
def _is_binary_file(filepath: Path) -> bool:
|
|
189
191
|
"""Heuristic check for binary files."""
|
|
190
192
|
ext = filepath.suffix.lower()
|
|
193
|
+
if ext in DOCUMENT_EXTENSIONS:
|
|
194
|
+
return False
|
|
191
195
|
if ext in BINARY_EXTENSIONS:
|
|
192
196
|
return True
|
|
193
197
|
|
|
@@ -266,6 +270,7 @@ def scan_repo(path: str | Path) -> RepoMap:
|
|
|
266
270
|
size_bytes=size,
|
|
267
271
|
extension=ext,
|
|
268
272
|
is_binary=is_binary,
|
|
273
|
+
is_document=ext in DOCUMENT_EXTENSIONS,
|
|
269
274
|
)
|
|
270
275
|
repo_map.files.append(file_info)
|
|
271
276
|
repo_map.total_files += 1
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Token usage tracking for rex-machine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
|
|
10
|
+
class TokenTracker:
|
|
11
|
+
"""Tracks token usage across all API calls in a run."""
|
|
12
|
+
|
|
13
|
+
max_tokens: int | None = None
|
|
14
|
+
input_tokens: int = 0
|
|
15
|
+
output_tokens: int = 0
|
|
16
|
+
api_calls: int = 0
|
|
17
|
+
_lock: threading.Lock = field(default_factory=threading.Lock, repr=False)
|
|
18
|
+
|
|
19
|
+
@property
|
|
20
|
+
def total_tokens(self) -> int:
|
|
21
|
+
return self.input_tokens + self.output_tokens
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def budget_exceeded(self) -> bool:
|
|
25
|
+
if self.max_tokens is None:
|
|
26
|
+
return False
|
|
27
|
+
return self.total_tokens >= self.max_tokens
|
|
28
|
+
|
|
29
|
+
def record(self, input_tokens: int, output_tokens: int) -> None:
|
|
30
|
+
with self._lock:
|
|
31
|
+
self.input_tokens += input_tokens
|
|
32
|
+
self.output_tokens += output_tokens
|
|
33
|
+
self.api_calls += 1
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Tests for documents.py."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rex_machine.documents import DOCUMENT_EXTENSIONS, extract_text
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestDocumentExtensions:
|
|
9
|
+
def test_supported_formats(self):
|
|
10
|
+
assert ".docx" in DOCUMENT_EXTENSIONS
|
|
11
|
+
assert ".pptx" in DOCUMENT_EXTENSIONS
|
|
12
|
+
assert ".pdf" in DOCUMENT_EXTENSIONS
|
|
13
|
+
|
|
14
|
+
def test_old_formats_not_included(self):
|
|
15
|
+
assert ".doc" not in DOCUMENT_EXTENSIONS
|
|
16
|
+
assert ".ppt" not in DOCUMENT_EXTENSIONS
|
|
17
|
+
assert ".xls" not in DOCUMENT_EXTENSIONS
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TestExtractText:
|
|
21
|
+
def test_non_document_returns_none(self, tmp_path: Path):
|
|
22
|
+
f = tmp_path / "code.py"
|
|
23
|
+
f.write_text("print('hi')", encoding="utf-8")
|
|
24
|
+
assert extract_text(f) is None
|
|
25
|
+
|
|
26
|
+
def test_missing_library_returns_hint(self, tmp_path: Path, monkeypatch):
|
|
27
|
+
f = tmp_path / "test.docx"
|
|
28
|
+
f.write_bytes(b"fake docx content")
|
|
29
|
+
|
|
30
|
+
def _raise_import(filepath):
|
|
31
|
+
raise ImportError("No module named 'docx'")
|
|
32
|
+
|
|
33
|
+
monkeypatch.setattr("rex_machine.documents._extract_docx", _raise_import)
|
|
34
|
+
result = extract_text(f)
|
|
35
|
+
assert result is not None
|
|
36
|
+
assert "Cannot read .docx" in result
|
|
37
|
+
assert "pip install" in result
|
|
38
|
+
|
|
39
|
+
def test_corrupt_file_returns_error(self, tmp_path: Path):
|
|
40
|
+
f = tmp_path / "bad.pdf"
|
|
41
|
+
f.write_bytes(b"not a real pdf")
|
|
42
|
+
result = extract_text(f)
|
|
43
|
+
assert result is not None
|
|
44
|
+
assert "Error reading document" in result
|
|
45
|
+
|
|
46
|
+
def test_truncation(self, tmp_path: Path, monkeypatch):
|
|
47
|
+
long_text = "x" * 10000
|
|
48
|
+
|
|
49
|
+
monkeypatch.setattr("rex_machine.documents._extract_docx", lambda _: long_text)
|
|
50
|
+
f = tmp_path / "big.docx"
|
|
51
|
+
f.write_bytes(b"fake")
|
|
52
|
+
result = extract_text(f, max_chars=100)
|
|
53
|
+
assert result is not None
|
|
54
|
+
assert len(result) < 200
|
|
55
|
+
assert "[truncated]" in result
|
|
56
|
+
|
|
57
|
+
def test_empty_document(self, tmp_path: Path, monkeypatch):
|
|
58
|
+
monkeypatch.setattr("rex_machine.documents._extract_docx", lambda _: " ")
|
|
59
|
+
f = tmp_path / "empty.docx"
|
|
60
|
+
f.write_bytes(b"fake")
|
|
61
|
+
result = extract_text(f)
|
|
62
|
+
assert result == "(empty document)"
|
|
@@ -93,6 +93,21 @@ class TestIsBinaryFile:
|
|
|
93
93
|
f.write_bytes(b"hello\x00world")
|
|
94
94
|
assert _is_binary_file(f) is True
|
|
95
95
|
|
|
96
|
+
def test_docx_not_binary(self, tmp_path: Path):
|
|
97
|
+
f = tmp_path / "report.docx"
|
|
98
|
+
f.write_bytes(b"PK\x03\x04fake")
|
|
99
|
+
assert _is_binary_file(f) is False
|
|
100
|
+
|
|
101
|
+
def test_pptx_not_binary(self, tmp_path: Path):
|
|
102
|
+
f = tmp_path / "slides.pptx"
|
|
103
|
+
f.write_bytes(b"PK\x03\x04fake")
|
|
104
|
+
assert _is_binary_file(f) is False
|
|
105
|
+
|
|
106
|
+
def test_pdf_not_binary(self, tmp_path: Path):
|
|
107
|
+
f = tmp_path / "doc.pdf"
|
|
108
|
+
f.write_bytes(b"%PDF-1.4 fake")
|
|
109
|
+
assert _is_binary_file(f) is False
|
|
110
|
+
|
|
96
111
|
|
|
97
112
|
# ─── scan_repo ───────────────────────────────────────────────────
|
|
98
113
|
|
|
@@ -135,6 +150,19 @@ class TestScanRepo:
|
|
|
135
150
|
source_paths = {f.relative_path for f in repo_map.source_files}
|
|
136
151
|
assert "image.png" not in source_paths
|
|
137
152
|
|
|
153
|
+
def test_document_files_detected(self, tmp_repo: Path):
|
|
154
|
+
(tmp_repo / "report.docx").write_bytes(b"PK\x03\x04fake")
|
|
155
|
+
(tmp_repo / "slides.pptx").write_bytes(b"PK\x03\x04fake")
|
|
156
|
+
repo_map = scan_repo(tmp_repo)
|
|
157
|
+
doc_files = {f.relative_path for f in repo_map.files if f.is_document}
|
|
158
|
+
assert "report.docx" in doc_files
|
|
159
|
+
assert "slides.pptx" in doc_files
|
|
160
|
+
|
|
161
|
+
def test_document_in_file_tree_tagged(self, tmp_repo: Path):
|
|
162
|
+
(tmp_repo / "notes.docx").write_bytes(b"PK\x03\x04fake")
|
|
163
|
+
repo_map = scan_repo(tmp_repo)
|
|
164
|
+
assert "[doc]" in repo_map.file_tree
|
|
165
|
+
|
|
138
166
|
|
|
139
167
|
# ─── SKIP_DIRS consistency ───────────────────────────────────────
|
|
140
168
|
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Tests for tokens.py."""
|
|
2
|
+
|
|
3
|
+
from rex_machine.tokens import TokenTracker
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TestTokenTracker:
|
|
7
|
+
def test_initial_state(self):
|
|
8
|
+
t = TokenTracker()
|
|
9
|
+
assert t.input_tokens == 0
|
|
10
|
+
assert t.output_tokens == 0
|
|
11
|
+
assert t.total_tokens == 0
|
|
12
|
+
assert t.api_calls == 0
|
|
13
|
+
assert t.budget_exceeded is False
|
|
14
|
+
|
|
15
|
+
def test_record(self):
|
|
16
|
+
t = TokenTracker()
|
|
17
|
+
t.record(100, 50)
|
|
18
|
+
assert t.input_tokens == 100
|
|
19
|
+
assert t.output_tokens == 50
|
|
20
|
+
assert t.total_tokens == 150
|
|
21
|
+
assert t.api_calls == 1
|
|
22
|
+
|
|
23
|
+
def test_multiple_records(self):
|
|
24
|
+
t = TokenTracker()
|
|
25
|
+
t.record(100, 50)
|
|
26
|
+
t.record(200, 80)
|
|
27
|
+
assert t.input_tokens == 300
|
|
28
|
+
assert t.output_tokens == 130
|
|
29
|
+
assert t.total_tokens == 430
|
|
30
|
+
assert t.api_calls == 2
|
|
31
|
+
|
|
32
|
+
def test_budget_not_exceeded(self):
|
|
33
|
+
t = TokenTracker(max_tokens=1000)
|
|
34
|
+
t.record(200, 100)
|
|
35
|
+
assert t.budget_exceeded is False
|
|
36
|
+
|
|
37
|
+
def test_budget_exceeded(self):
|
|
38
|
+
t = TokenTracker(max_tokens=500)
|
|
39
|
+
t.record(300, 250)
|
|
40
|
+
assert t.budget_exceeded is True
|
|
41
|
+
|
|
42
|
+
def test_no_budget_never_exceeded(self):
|
|
43
|
+
t = TokenTracker()
|
|
44
|
+
t.record(999999, 999999)
|
|
45
|
+
assert t.budget_exceeded is False
|
|
46
|
+
|
|
47
|
+
def test_budget_exact_boundary(self):
|
|
48
|
+
t = TokenTracker(max_tokens=100)
|
|
49
|
+
t.record(50, 50)
|
|
50
|
+
assert t.budget_exceeded is True
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/code_pattern_analyzer.j2
RENAMED
|
File without changes
|
{rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/config_analyzer.j2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/structure_analyzer.j2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|