PyPI - rex-machine - Versions diffs - 0.2.2__tar.gz → 0.3.0__tar.gz - Mend

rex-machine 0.2.2 (tar.gz) → 0.3.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{rex_machine-0.2.2 → rex_machine-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rex-machine
-Version: 0.2.2
+Version: 0.3.0
 Summary: Extract lessons learned (REX) from code repositories
 Project-URL: Homepage, https://github.com/NicoJuiced/rex-machine
 Project-URL: Repository, https://github.com/NicoJuiced/rex-machine
@@ -27,9 +27,16 @@ Requires-Dist: pydantic>=2.0.0
 Requires-Dist: rich>=13.0.0
 Requires-Dist: typer>=0.12.0
 Provides-Extra: dev
+Requires-Dist: pypdf>=4.0.0; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
 Requires-Dist: pytest>=8.0.0; extra == 'dev'
+Requires-Dist: python-docx>=1.0.0; extra == 'dev'
+Requires-Dist: python-pptx>=1.0.0; extra == 'dev'
 Requires-Dist: ruff>=0.4.0; extra == 'dev'
+Provides-Extra: docs
+Requires-Dist: pypdf>=4.0.0; extra == 'docs'
+Requires-Dist: python-docx>=1.0.0; extra == 'docs'
+Requires-Dist: python-pptx>=1.0.0; extra == 'docs'
 Description-Content-Type: text/markdown
 # rex-machine

{rex_machine-0.2.2 → rex_machine-0.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "rex-machine"
-version = "0.2.2"
+version = "0.3.0"
 description = "Extract lessons learned (REX) from code repositories"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -35,10 +35,18 @@ dependencies = [
 ]
 [project.optional-dependencies]
+docs = [
+    "python-docx>=1.0.0",
+    "python-pptx>=1.0.0",
+    "pypdf>=4.0.0",
+]
 dev = [
     "pytest>=8.0.0",
     "pytest-asyncio>=0.23.0",
     "ruff>=0.4.0",
+    "python-docx>=1.0.0",
+    "python-pptx>=1.0.0",
+    "pypdf>=4.0.0",
 ]
 [project.urls]

{rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """rex-machine - Extract lessons learned from code repositories."""
-__version__ = "0.2.2"
+__version__ = "0.3.0"

{rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/agents.py RENAMED Viewed

@@ -21,8 +21,10 @@ import anthropic
 import anyio
 from jinja2 import Environment, FileSystemLoader
+from rex_machine.documents import DOCUMENT_EXTENSIONS, extract_text
 from rex_machine.models import RepoQuality, RexReport
 from rex_machine.scanner import SKIP_DIRS, RepoMap, scan_repo
+from rex_machine.tokens import TokenTracker
 logger = logging.getLogger("rex_machine")
@@ -267,6 +269,12 @@ class ToolExecutor:
         if not target.is_file():
             return f"File not found: {path}"
+        if target.suffix.lower() in DOCUMENT_EXTENSIONS:
+            result = extract_text(target)
+            if result is None:
+                return f"Unsupported document format: {path}"
+            return f"{path} (document):\n{result}"
         size = target.stat().st_size
         if size > 2 * 1024 * 1024:
             return f"File too large ({size:,} bytes). Use start_line/end_line to read a section."
@@ -360,6 +368,7 @@ async def run_subagent(
     repo_path: str,
     label: str,
     max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
+    tracker: TokenTracker | None = None,
 ) -> str:
     """Run a sub-agent with an autonomous tool-use loop.
@@ -390,6 +399,8 @@ async def run_subagent(
             tools=REPO_TOOLS,
             temperature=0.0,
         )
+        if tracker:
+            tracker.record(response.usage.input_tokens, response.usage.output_tokens)
         if response.stop_reason != "tool_use":
             result = ""
@@ -404,6 +415,10 @@ async def run_subagent(
             )
             return result
+        if tracker and tracker.budget_exceeded:
+            logger.warning("Sub-agent %s stopped: token budget exceeded", label)
+            break
         messages.append({"role": "assistant", "content": response.content})
         tool_results: list[dict[str, Any]] = []
@@ -427,9 +442,8 @@ async def run_subagent(
         messages.append({"role": "user", "content": tool_results})
     logger.warning(
-        "Sub-agent %s hit tool call limit (%d). Requesting final answer.",
+        "Sub-agent %s hit limit. Requesting final answer.",
         label,
-        max_tool_calls,
     )
     messages.append(
         {
@@ -447,6 +461,8 @@ async def run_subagent(
         messages=messages,
         temperature=0.0,
     )
+    if tracker:
+        tracker.record(response.usage.input_tokens, response.usage.output_tokens)
     result = ""
     for block in response.content:
         if block.type == "text":
@@ -465,6 +481,7 @@ async def _run_synthesis(
     files_scanned: int,
     subagent_reports: dict[str, str],
     lang: str = "en",
+    tracker: TokenTracker | None = None,
 ) -> dict[str, Any]:
     """Merge all sub-agent reports into a structured RexReport.
@@ -524,6 +541,8 @@ async def _run_synthesis(
         tool_choice={"type": "tool", "name": "produce_rex_report"},
         temperature=0.0,
     )
+    if tracker:
+        tracker.record(response.usage.input_tokens, response.usage.output_tokens)
     for block in response.content:
         if block.type == "tool_use" and block.name == "produce_rex_report":
@@ -548,13 +567,16 @@ async def run_analysis(
     gcp_region: str | None = None,
     max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
     lang: str = "en",
-) -> RexReport:
+    max_tokens: int | None = None,
+) -> tuple[RexReport, TokenTracker]:
     """Run the full rex-machine analysis pipeline on a repository.
     1. Scan repo for file tree
     2. Run 4 sub-agents in parallel (each with autonomous tool-use loop)
     3. Synthesize findings into a structured RexReport
     """
+    tracker = TokenTracker(max_tokens=max_tokens)
     logger.info("Scanning repository: %s", repo_path)
     repo_map: RepoMap = scan_repo(repo_path)
     logger.info(
@@ -564,18 +586,21 @@ async def run_analysis(
     )
     if repo_map.total_files == 0:
-        return RexReport(
-            repo_name=_extract_repo_name(repo_path),
-            repo_path=str(repo_path),
-            analyzed_at=datetime.now(timezone.utc).isoformat(),
-            model_used=model,
-            files_scanned=0,
-            repo_quality=RepoQuality.INSUFFICIENT,
-            warnings=["Repository contains no scannable files."],
-            rex_items=[],
-            global_summary=("The repository is empty or contains only binary/ignored files."),
-            strengths=[],
-            improvement_suggestions=["Add source code to the repository."],
+        return (
+            RexReport(
+                repo_name=_extract_repo_name(repo_path),
+                repo_path=str(repo_path),
+                analyzed_at=datetime.now(timezone.utc).isoformat(),
+                model_used=model,
+                files_scanned=0,
+                repo_quality=RepoQuality.INSUFFICIENT,
+                warnings=["Repository contains no scannable files."],
+                rex_items=[],
+                global_summary=("The repository is empty or contains only binary/ignored files."),
+                strengths=[],
+                improvement_suggestions=["Add source code to the repository."],
+            ),
+            tracker,
         )
     client = create_client(
@@ -610,6 +635,7 @@ async def run_analysis(
             repo_path,
             label,
             max_tool_calls=max_tool_calls,
+            tracker=tracker,
         )
     async with anyio.create_task_group() as tg:
@@ -628,9 +654,10 @@ async def run_analysis(
         files_scanned=repo_map.total_files,
         subagent_reports=subagent_reports,
         lang=lang,
+        tracker=tracker,
     )
-    return RexReport.model_validate(report_data)
+    return RexReport.model_validate(report_data), tracker
 def _extract_repo_name(repo_path: str) -> str:

{rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/cli.py RENAMED Viewed

@@ -28,8 +28,8 @@ from rex_machine.config import (
     save_global,
     save_project,
 )
-from rex_machine.models import RexReport
 from rex_machine.renderer import render_console, render_json, render_markdown
+from rex_machine.tokens import TokenTracker
 app = typer.Typer(
     name="rex",
@@ -430,6 +430,10 @@ def extract(
         Provider | None,
         typer.Option("--provider", "-p", help="API provider (overrides config)."),
     ] = None,
+    max_tokens: Annotated[
+        int | None,
+        typer.Option("--max-tokens", "-t", help="Global token budget (stops when exceeded)."),
+    ] = None,
     lang: Annotated[
         str,
         typer.Option("--lang", "-l", help="Output language: en (default) or fr."),
@@ -478,29 +482,31 @@ def extract(
     console.print(f"Analyzing: [cyan]{repo_path}[/cyan]")
     console.print(f"Model: [cyan]{effective_model}[/cyan]")
     console.print(f"Provider: [cyan]{effective_provider.value}[/cyan]")
+    if max_tokens:
+        console.print(f"Token budget: [cyan]{max_tokens:,}[/cyan]")
     if lang != "en":
         console.print(f"Language: [cyan]{lang}[/cyan]")
     console.print()
     try:
-        async def _run() -> RexReport:
+        async def _run() -> tuple:
             return await run_analysis(
                 repo_path,
                 effective_model,
                 provider=effective_provider,
                 max_tool_calls=max_tool_calls,
                 lang=lang,
+                max_tokens=max_tokens,
                 **creds,
             )
         with console.status("[bold green]Analyzing repository...", spinner="dots"):
-            report = anyio.run(_run)
+            report, tracker = anyio.run(_run)
     except KeyboardInterrupt:
         console.print("\n[yellow]Analysis cancelled.[/yellow]")
         raise typer.Exit(code=130)
     except BaseException as exc:
-        # Unwrap ExceptionGroup → first cause only (except* requires Python 3.11+)
         cause = exc
         while hasattr(cause, "exceptions"):
             cause = cause.exceptions[0]
@@ -528,6 +534,23 @@ def extract(
         else:
             console.print_json(result)
+    _print_token_summary(tracker)
+def _print_token_summary(tracker: TokenTracker) -> None:
+    table = Table(title="Token Usage", show_header=False, border_style="dim")
+    table.add_column("Metric", style="bold")
+    table.add_column("Value", justify="right")
+    table.add_row("Input tokens", f"{tracker.input_tokens:,}")
+    table.add_row("Output tokens", f"{tracker.output_tokens:,}")
+    table.add_row("Total tokens", f"{tracker.total_tokens:,}")
+    table.add_row("API calls", str(tracker.api_calls))
+    if tracker.max_tokens:
+        pct = tracker.total_tokens / tracker.max_tokens * 100
+        table.add_row("Budget used", f"{pct:.1f}%")
+    console.print()
+    console.print(table)
 def _write_file(path: Path, content: str) -> None:
     try:

rex_machine-0.3.0/src/rex_machine/documents.py ADDED Viewed

@@ -0,0 +1,79 @@
+"""Document text extraction for rex-machine.
+Supports .docx, .pptx, and .pdf files. Libraries are imported at runtime
+so the tool works without them — install with `pip install rex-machine[docs]`.
+"""
+from __future__ import annotations
+from pathlib import Path
+DOCUMENT_EXTENSIONS = frozenset({".docx", ".pptx", ".pdf"})
+_INSTALL_HINT = "Install document support with: pip install rex-machine[docs]"
+def extract_text(filepath: Path, max_chars: int = 5000) -> str | None:
+    """Extract plain text from a document file.
+    Returns None if the file is not a supported document format.
+    Returns an error message string if extraction fails.
+    """
+    ext = filepath.suffix.lower()
+    if ext not in DOCUMENT_EXTENSIONS:
+        return None
+    extractors = {
+        ".docx": _extract_docx,
+        ".pptx": _extract_pptx,
+        ".pdf": _extract_pdf,
+    }
+    try:
+        text = extractors[ext](filepath)
+    except ImportError:
+        return f"Cannot read {ext} files. {_INSTALL_HINT}"
+    except Exception as exc:
+        return f"Error reading document: {exc}"
+    if not text.strip():
+        return "(empty document)"
+    if len(text) > max_chars:
+        return text[:max_chars] + "\n\n... [truncated]"
+    return text
+def _extract_docx(filepath: Path) -> str:
+    from docx import Document
+    doc = Document(str(filepath))
+    return "\n".join(p.text for p in doc.paragraphs)
+def _extract_pptx(filepath: Path) -> str:
+    from pptx import Presentation
+    prs = Presentation(str(filepath))
+    parts: list[str] = []
+    for i, slide in enumerate(prs.slides, 1):
+        slide_texts: list[str] = []
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                slide_texts.append(shape.text_frame.text)
+        if slide_texts:
+            parts.append(f"--- Slide {i} ---")
+            parts.append("\n".join(slide_texts))
+    return "\n".join(parts)
+def _extract_pdf(filepath: Path) -> str:
+    from pypdf import PdfReader
+    reader = PdfReader(str(filepath))
+    parts: list[str] = []
+    for page in reader.pages:
+        text = page.extract_text()
+        if text:
+            parts.append(text)
+    return "\n".join(parts)

{rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/scanner.py RENAMED Viewed

@@ -8,6 +8,8 @@ import os
 from dataclasses import dataclass, field
 from pathlib import Path
+from rex_machine.documents import DOCUMENT_EXTENSIONS
 # Extensions that are almost certainly binary
 BINARY_EXTENSIONS = frozenset(
     {
@@ -42,13 +44,10 @@ BINARY_EXTENSIONS = frozenset(
         ".pyo",
         ".class",
         ".wasm",
-        ".pdf",
         ".doc",
-        ".docx",
         ".xls",
         ".xlsx",
         ".ppt",
-        ".pptx",
         ".ttf",
         ".otf",
         ".woff",
@@ -101,6 +100,7 @@ class FileInfo:
     size_bytes: int
     extension: str
     is_binary: bool
+    is_document: bool = False
 @dataclass
@@ -117,9 +117,9 @@ class RepoMap:
     def file_tree(self) -> str:
         """Return a textual file tree representation."""
         lines: list[str] = []
+        doc_paths = {f.relative_path for f in self.files if f.is_document}
         dirs: dict[str, list[str]] = {}
         for f in self.files:
-            # Use forward-slash splitting to avoid Windows backslash issues
             if "/" in f.relative_path:
                 parent = f.relative_path.rsplit("/", 1)[0]
                 name = f.relative_path.rsplit("/", 1)[1]
@@ -133,7 +133,9 @@ class RepoMap:
                 lines.append(f"{dir_path}/")
             for name in sorted(dirs[dir_path]):
                 prefix = f"  {dir_path}/" if dir_path else ""
-                lines.append(f"  {prefix}{name}")
+                rel = f"{dir_path}/{name}" if dir_path else name
+                tag = "  [doc]" if rel in doc_paths else ""
+                lines.append(f"  {prefix}{name}{tag}")
         return "\n".join(lines)
@@ -188,6 +190,8 @@ def _should_skip_dir(dirname: str) -> bool:
 def _is_binary_file(filepath: Path) -> bool:
     """Heuristic check for binary files."""
     ext = filepath.suffix.lower()
+    if ext in DOCUMENT_EXTENSIONS:
+        return False
     if ext in BINARY_EXTENSIONS:
         return True
@@ -266,6 +270,7 @@ def scan_repo(path: str | Path) -> RepoMap:
                 size_bytes=size,
                 extension=ext,
                 is_binary=is_binary,
+                is_document=ext in DOCUMENT_EXTENSIONS,
             )
             repo_map.files.append(file_info)
             repo_map.total_files += 1

rex_machine-0.3.0/src/rex_machine/tokens.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Token usage tracking for rex-machine."""
+from __future__ import annotations
+import threading
+from dataclasses import dataclass, field
+@dataclass
+class TokenTracker:
+    """Tracks token usage across all API calls in a run."""
+    max_tokens: int | None = None
+    input_tokens: int = 0
+    output_tokens: int = 0
+    api_calls: int = 0
+    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)
+    @property
+    def total_tokens(self) -> int:
+        return self.input_tokens + self.output_tokens
+    @property
+    def budget_exceeded(self) -> bool:
+        if self.max_tokens is None:
+            return False
+        return self.total_tokens >= self.max_tokens
+    def record(self, input_tokens: int, output_tokens: int) -> None:
+        with self._lock:
+            self.input_tokens += input_tokens
+            self.output_tokens += output_tokens
+            self.api_calls += 1

rex_machine-0.3.0/tests/test_documents.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""Tests for documents.py."""
+from pathlib import Path
+from rex_machine.documents import DOCUMENT_EXTENSIONS, extract_text
+class TestDocumentExtensions:
+    def test_supported_formats(self):
+        assert ".docx" in DOCUMENT_EXTENSIONS
+        assert ".pptx" in DOCUMENT_EXTENSIONS
+        assert ".pdf" in DOCUMENT_EXTENSIONS
+    def test_old_formats_not_included(self):
+        assert ".doc" not in DOCUMENT_EXTENSIONS
+        assert ".ppt" not in DOCUMENT_EXTENSIONS
+        assert ".xls" not in DOCUMENT_EXTENSIONS
+class TestExtractText:
+    def test_non_document_returns_none(self, tmp_path: Path):
+        f = tmp_path / "code.py"
+        f.write_text("print('hi')", encoding="utf-8")
+        assert extract_text(f) is None
+    def test_missing_library_returns_hint(self, tmp_path: Path, monkeypatch):
+        f = tmp_path / "test.docx"
+        f.write_bytes(b"fake docx content")
+        def _raise_import(filepath):
+            raise ImportError("No module named 'docx'")
+        monkeypatch.setattr("rex_machine.documents._extract_docx", _raise_import)
+        result = extract_text(f)
+        assert result is not None
+        assert "Cannot read .docx" in result
+        assert "pip install" in result
+    def test_corrupt_file_returns_error(self, tmp_path: Path):
+        f = tmp_path / "bad.pdf"
+        f.write_bytes(b"not a real pdf")
+        result = extract_text(f)
+        assert result is not None
+        assert "Error reading document" in result
+    def test_truncation(self, tmp_path: Path, monkeypatch):
+        long_text = "x" * 10000
+        monkeypatch.setattr("rex_machine.documents._extract_docx", lambda _: long_text)
+        f = tmp_path / "big.docx"
+        f.write_bytes(b"fake")
+        result = extract_text(f, max_chars=100)
+        assert result is not None
+        assert len(result) < 200
+        assert "[truncated]" in result
+    def test_empty_document(self, tmp_path: Path, monkeypatch):
+        monkeypatch.setattr("rex_machine.documents._extract_docx", lambda _: "   ")
+        f = tmp_path / "empty.docx"
+        f.write_bytes(b"fake")
+        result = extract_text(f)
+        assert result == "(empty document)"

{rex_machine-0.2.2 → rex_machine-0.3.0}/tests/test_scanner.py RENAMED Viewed

@@ -93,6 +93,21 @@ class TestIsBinaryFile:
         f.write_bytes(b"hello\x00world")
         assert _is_binary_file(f) is True
+    def test_docx_not_binary(self, tmp_path: Path):
+        f = tmp_path / "report.docx"
+        f.write_bytes(b"PK\x03\x04fake")
+        assert _is_binary_file(f) is False
+    def test_pptx_not_binary(self, tmp_path: Path):
+        f = tmp_path / "slides.pptx"
+        f.write_bytes(b"PK\x03\x04fake")
+        assert _is_binary_file(f) is False
+    def test_pdf_not_binary(self, tmp_path: Path):
+        f = tmp_path / "doc.pdf"
+        f.write_bytes(b"%PDF-1.4 fake")
+        assert _is_binary_file(f) is False
 # ─── scan_repo ───────────────────────────────────────────────────
@@ -135,6 +150,19 @@ class TestScanRepo:
         source_paths = {f.relative_path for f in repo_map.source_files}
         assert "image.png" not in source_paths
+    def test_document_files_detected(self, tmp_repo: Path):
+        (tmp_repo / "report.docx").write_bytes(b"PK\x03\x04fake")
+        (tmp_repo / "slides.pptx").write_bytes(b"PK\x03\x04fake")
+        repo_map = scan_repo(tmp_repo)
+        doc_files = {f.relative_path for f in repo_map.files if f.is_document}
+        assert "report.docx" in doc_files
+        assert "slides.pptx" in doc_files
+    def test_document_in_file_tree_tagged(self, tmp_repo: Path):
+        (tmp_repo / "notes.docx").write_bytes(b"PK\x03\x04fake")
+        repo_map = scan_repo(tmp_repo)
+        assert "[doc]" in repo_map.file_tree
 # ─── SKIP_DIRS consistency ───────────────────────────────────────

rex_machine-0.3.0/tests/test_tokens.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Tests for tokens.py."""
+from rex_machine.tokens import TokenTracker
+class TestTokenTracker:
+    def test_initial_state(self):
+        t = TokenTracker()
+        assert t.input_tokens == 0
+        assert t.output_tokens == 0
+        assert t.total_tokens == 0
+        assert t.api_calls == 0
+        assert t.budget_exceeded is False
+    def test_record(self):
+        t = TokenTracker()
+        t.record(100, 50)
+        assert t.input_tokens == 100
+        assert t.output_tokens == 50
+        assert t.total_tokens == 150
+        assert t.api_calls == 1
+    def test_multiple_records(self):
+        t = TokenTracker()
+        t.record(100, 50)
+        t.record(200, 80)
+        assert t.input_tokens == 300
+        assert t.output_tokens == 130
+        assert t.total_tokens == 430
+        assert t.api_calls == 2
+    def test_budget_not_exceeded(self):
+        t = TokenTracker(max_tokens=1000)
+        t.record(200, 100)
+        assert t.budget_exceeded is False
+    def test_budget_exceeded(self):
+        t = TokenTracker(max_tokens=500)
+        t.record(300, 250)
+        assert t.budget_exceeded is True
+    def test_no_budget_never_exceeded(self):
+        t = TokenTracker()
+        t.record(999999, 999999)
+        assert t.budget_exceeded is False
+    def test_budget_exact_boundary(self):
+        t = TokenTracker(max_tokens=100)
+        t.record(50, 50)
+        assert t.budget_exceeded is True