rex-machine 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {rex_machine-0.2.2 → rex_machine-0.3.0}/PKG-INFO +8 -1
  2. {rex_machine-0.2.2 → rex_machine-0.3.0}/pyproject.toml +9 -1
  3. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/__init__.py +1 -1
  4. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/agents.py +43 -16
  5. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/cli.py +27 -4
  6. rex_machine-0.3.0/src/rex_machine/documents.py +79 -0
  7. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/scanner.py +10 -5
  8. rex_machine-0.3.0/src/rex_machine/tokens.py +33 -0
  9. rex_machine-0.3.0/tests/test_documents.py +62 -0
  10. {rex_machine-0.2.2 → rex_machine-0.3.0}/tests/test_scanner.py +28 -0
  11. rex_machine-0.3.0/tests/test_tokens.py +50 -0
  12. {rex_machine-0.2.2 → rex_machine-0.3.0}/.claude/settings.local.json +0 -0
  13. {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  14. {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  15. {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/workflows/ci.yml +0 -0
  16. {rex_machine-0.2.2 → rex_machine-0.3.0}/.github/workflows/release.yml +0 -0
  17. {rex_machine-0.2.2 → rex_machine-0.3.0}/.gitignore +0 -0
  18. {rex_machine-0.2.2 → rex_machine-0.3.0}/CHANGELOG.md +0 -0
  19. {rex_machine-0.2.2 → rex_machine-0.3.0}/CLAUDE.md +0 -0
  20. {rex_machine-0.2.2 → rex_machine-0.3.0}/CONTRIBUTING.md +0 -0
  21. {rex_machine-0.2.2 → rex_machine-0.3.0}/LICENSE +0 -0
  22. {rex_machine-0.2.2 → rex_machine-0.3.0}/README.md +0 -0
  23. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/__main__.py +0 -0
  24. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/config.py +0 -0
  25. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/models.py +0 -0
  26. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/py.typed +0 -0
  27. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/renderer.py +0 -0
  28. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/code_pattern_analyzer.j2 +0 -0
  29. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/config_analyzer.j2 +0 -0
  30. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/doc_analyzer.j2 +0 -0
  31. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/main.j2 +0 -0
  32. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/structure_analyzer.j2 +0 -0
  33. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/prompts/synthesis.j2 +0 -0
  34. {rex_machine-0.2.2 → rex_machine-0.3.0}/src/rex_machine/templates/report.md.j2 +0 -0
  35. {rex_machine-0.2.2 → rex_machine-0.3.0}/tests/__init__.py +0 -0
  36. {rex_machine-0.2.2 → rex_machine-0.3.0}/tests/test_models.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rex-machine
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: Extract lessons learned (REX) from code repositories
5
5
  Project-URL: Homepage, https://github.com/NicoJuiced/rex-machine
6
6
  Project-URL: Repository, https://github.com/NicoJuiced/rex-machine
@@ -27,9 +27,16 @@ Requires-Dist: pydantic>=2.0.0
27
27
  Requires-Dist: rich>=13.0.0
28
28
  Requires-Dist: typer>=0.12.0
29
29
  Provides-Extra: dev
30
+ Requires-Dist: pypdf>=4.0.0; extra == 'dev'
30
31
  Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
31
32
  Requires-Dist: pytest>=8.0.0; extra == 'dev'
33
+ Requires-Dist: python-docx>=1.0.0; extra == 'dev'
34
+ Requires-Dist: python-pptx>=1.0.0; extra == 'dev'
32
35
  Requires-Dist: ruff>=0.4.0; extra == 'dev'
36
+ Provides-Extra: docs
37
+ Requires-Dist: pypdf>=4.0.0; extra == 'docs'
38
+ Requires-Dist: python-docx>=1.0.0; extra == 'docs'
39
+ Requires-Dist: python-pptx>=1.0.0; extra == 'docs'
33
40
  Description-Content-Type: text/markdown
34
41
 
35
42
  # rex-machine
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "rex-machine"
7
- version = "0.2.2"
7
+ version = "0.3.0"
8
8
  description = "Extract lessons learned (REX) from code repositories"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -35,10 +35,18 @@ dependencies = [
35
35
  ]
36
36
 
37
37
  [project.optional-dependencies]
38
+ docs = [
39
+ "python-docx>=1.0.0",
40
+ "python-pptx>=1.0.0",
41
+ "pypdf>=4.0.0",
42
+ ]
38
43
  dev = [
39
44
  "pytest>=8.0.0",
40
45
  "pytest-asyncio>=0.23.0",
41
46
  "ruff>=0.4.0",
47
+ "python-docx>=1.0.0",
48
+ "python-pptx>=1.0.0",
49
+ "pypdf>=4.0.0",
42
50
  ]
43
51
 
44
52
  [project.urls]
@@ -1,3 +1,3 @@
1
1
  """rex-machine - Extract lessons learned from code repositories."""
2
2
 
3
- __version__ = "0.2.2"
3
+ __version__ = "0.3.0"
@@ -21,8 +21,10 @@ import anthropic
21
21
  import anyio
22
22
  from jinja2 import Environment, FileSystemLoader
23
23
 
24
+ from rex_machine.documents import DOCUMENT_EXTENSIONS, extract_text
24
25
  from rex_machine.models import RepoQuality, RexReport
25
26
  from rex_machine.scanner import SKIP_DIRS, RepoMap, scan_repo
27
+ from rex_machine.tokens import TokenTracker
26
28
 
27
29
  logger = logging.getLogger("rex_machine")
28
30
 
@@ -267,6 +269,12 @@ class ToolExecutor:
267
269
  if not target.is_file():
268
270
  return f"File not found: {path}"
269
271
 
272
+ if target.suffix.lower() in DOCUMENT_EXTENSIONS:
273
+ result = extract_text(target)
274
+ if result is None:
275
+ return f"Unsupported document format: {path}"
276
+ return f"{path} (document):\n{result}"
277
+
270
278
  size = target.stat().st_size
271
279
  if size > 2 * 1024 * 1024:
272
280
  return f"File too large ({size:,} bytes). Use start_line/end_line to read a section."
@@ -360,6 +368,7 @@ async def run_subagent(
360
368
  repo_path: str,
361
369
  label: str,
362
370
  max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
371
+ tracker: TokenTracker | None = None,
363
372
  ) -> str:
364
373
  """Run a sub-agent with an autonomous tool-use loop.
365
374
 
@@ -390,6 +399,8 @@ async def run_subagent(
390
399
  tools=REPO_TOOLS,
391
400
  temperature=0.0,
392
401
  )
402
+ if tracker:
403
+ tracker.record(response.usage.input_tokens, response.usage.output_tokens)
393
404
 
394
405
  if response.stop_reason != "tool_use":
395
406
  result = ""
@@ -404,6 +415,10 @@ async def run_subagent(
404
415
  )
405
416
  return result
406
417
 
418
+ if tracker and tracker.budget_exceeded:
419
+ logger.warning("Sub-agent %s stopped: token budget exceeded", label)
420
+ break
421
+
407
422
  messages.append({"role": "assistant", "content": response.content})
408
423
 
409
424
  tool_results: list[dict[str, Any]] = []
@@ -427,9 +442,8 @@ async def run_subagent(
427
442
  messages.append({"role": "user", "content": tool_results})
428
443
 
429
444
  logger.warning(
430
- "Sub-agent %s hit tool call limit (%d). Requesting final answer.",
445
+ "Sub-agent %s hit limit. Requesting final answer.",
431
446
  label,
432
- max_tool_calls,
433
447
  )
434
448
  messages.append(
435
449
  {
@@ -447,6 +461,8 @@ async def run_subagent(
447
461
  messages=messages,
448
462
  temperature=0.0,
449
463
  )
464
+ if tracker:
465
+ tracker.record(response.usage.input_tokens, response.usage.output_tokens)
450
466
  result = ""
451
467
  for block in response.content:
452
468
  if block.type == "text":
@@ -465,6 +481,7 @@ async def _run_synthesis(
465
481
  files_scanned: int,
466
482
  subagent_reports: dict[str, str],
467
483
  lang: str = "en",
484
+ tracker: TokenTracker | None = None,
468
485
  ) -> dict[str, Any]:
469
486
  """Merge all sub-agent reports into a structured RexReport.
470
487
 
@@ -524,6 +541,8 @@ async def _run_synthesis(
524
541
  tool_choice={"type": "tool", "name": "produce_rex_report"},
525
542
  temperature=0.0,
526
543
  )
544
+ if tracker:
545
+ tracker.record(response.usage.input_tokens, response.usage.output_tokens)
527
546
 
528
547
  for block in response.content:
529
548
  if block.type == "tool_use" and block.name == "produce_rex_report":
@@ -548,13 +567,16 @@ async def run_analysis(
548
567
  gcp_region: str | None = None,
549
568
  max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS,
550
569
  lang: str = "en",
551
- ) -> RexReport:
570
+ max_tokens: int | None = None,
571
+ ) -> tuple[RexReport, TokenTracker]:
552
572
  """Run the full rex-machine analysis pipeline on a repository.
553
573
 
554
574
  1. Scan repo for file tree
555
575
  2. Run 4 sub-agents in parallel (each with autonomous tool-use loop)
556
576
  3. Synthesize findings into a structured RexReport
557
577
  """
578
+ tracker = TokenTracker(max_tokens=max_tokens)
579
+
558
580
  logger.info("Scanning repository: %s", repo_path)
559
581
  repo_map: RepoMap = scan_repo(repo_path)
560
582
  logger.info(
@@ -564,18 +586,21 @@ async def run_analysis(
564
586
  )
565
587
 
566
588
  if repo_map.total_files == 0:
567
- return RexReport(
568
- repo_name=_extract_repo_name(repo_path),
569
- repo_path=str(repo_path),
570
- analyzed_at=datetime.now(timezone.utc).isoformat(),
571
- model_used=model,
572
- files_scanned=0,
573
- repo_quality=RepoQuality.INSUFFICIENT,
574
- warnings=["Repository contains no scannable files."],
575
- rex_items=[],
576
- global_summary=("The repository is empty or contains only binary/ignored files."),
577
- strengths=[],
578
- improvement_suggestions=["Add source code to the repository."],
589
+ return (
590
+ RexReport(
591
+ repo_name=_extract_repo_name(repo_path),
592
+ repo_path=str(repo_path),
593
+ analyzed_at=datetime.now(timezone.utc).isoformat(),
594
+ model_used=model,
595
+ files_scanned=0,
596
+ repo_quality=RepoQuality.INSUFFICIENT,
597
+ warnings=["Repository contains no scannable files."],
598
+ rex_items=[],
599
+ global_summary=("The repository is empty or contains only binary/ignored files."),
600
+ strengths=[],
601
+ improvement_suggestions=["Add source code to the repository."],
602
+ ),
603
+ tracker,
579
604
  )
580
605
 
581
606
  client = create_client(
@@ -610,6 +635,7 @@ async def run_analysis(
610
635
  repo_path,
611
636
  label,
612
637
  max_tool_calls=max_tool_calls,
638
+ tracker=tracker,
613
639
  )
614
640
 
615
641
  async with anyio.create_task_group() as tg:
@@ -628,9 +654,10 @@ async def run_analysis(
628
654
  files_scanned=repo_map.total_files,
629
655
  subagent_reports=subagent_reports,
630
656
  lang=lang,
657
+ tracker=tracker,
631
658
  )
632
659
 
633
- return RexReport.model_validate(report_data)
660
+ return RexReport.model_validate(report_data), tracker
634
661
 
635
662
 
636
663
  def _extract_repo_name(repo_path: str) -> str:
@@ -28,8 +28,8 @@ from rex_machine.config import (
28
28
  save_global,
29
29
  save_project,
30
30
  )
31
- from rex_machine.models import RexReport
32
31
  from rex_machine.renderer import render_console, render_json, render_markdown
32
+ from rex_machine.tokens import TokenTracker
33
33
 
34
34
  app = typer.Typer(
35
35
  name="rex",
@@ -430,6 +430,10 @@ def extract(
430
430
  Provider | None,
431
431
  typer.Option("--provider", "-p", help="API provider (overrides config)."),
432
432
  ] = None,
433
+ max_tokens: Annotated[
434
+ int | None,
435
+ typer.Option("--max-tokens", "-t", help="Global token budget (stops when exceeded)."),
436
+ ] = None,
433
437
  lang: Annotated[
434
438
  str,
435
439
  typer.Option("--lang", "-l", help="Output language: en (default) or fr."),
@@ -478,29 +482,31 @@ def extract(
478
482
  console.print(f"Analyzing: [cyan]{repo_path}[/cyan]")
479
483
  console.print(f"Model: [cyan]{effective_model}[/cyan]")
480
484
  console.print(f"Provider: [cyan]{effective_provider.value}[/cyan]")
485
+ if max_tokens:
486
+ console.print(f"Token budget: [cyan]{max_tokens:,}[/cyan]")
481
487
  if lang != "en":
482
488
  console.print(f"Language: [cyan]{lang}[/cyan]")
483
489
  console.print()
484
490
 
485
491
  try:
486
492
 
487
- async def _run() -> RexReport:
493
+ async def _run() -> tuple:
488
494
  return await run_analysis(
489
495
  repo_path,
490
496
  effective_model,
491
497
  provider=effective_provider,
492
498
  max_tool_calls=max_tool_calls,
493
499
  lang=lang,
500
+ max_tokens=max_tokens,
494
501
  **creds,
495
502
  )
496
503
 
497
504
  with console.status("[bold green]Analyzing repository...", spinner="dots"):
498
- report = anyio.run(_run)
505
+ report, tracker = anyio.run(_run)
499
506
  except KeyboardInterrupt:
500
507
  console.print("\n[yellow]Analysis cancelled.[/yellow]")
501
508
  raise typer.Exit(code=130)
502
509
  except BaseException as exc:
503
- # Unwrap ExceptionGroup → first cause only (except* requires Python 3.11+)
504
510
  cause = exc
505
511
  while hasattr(cause, "exceptions"):
506
512
  cause = cause.exceptions[0]
@@ -528,6 +534,23 @@ def extract(
528
534
  else:
529
535
  console.print_json(result)
530
536
 
537
+ _print_token_summary(tracker)
538
+
539
+
540
+ def _print_token_summary(tracker: TokenTracker) -> None:
541
+ table = Table(title="Token Usage", show_header=False, border_style="dim")
542
+ table.add_column("Metric", style="bold")
543
+ table.add_column("Value", justify="right")
544
+ table.add_row("Input tokens", f"{tracker.input_tokens:,}")
545
+ table.add_row("Output tokens", f"{tracker.output_tokens:,}")
546
+ table.add_row("Total tokens", f"{tracker.total_tokens:,}")
547
+ table.add_row("API calls", str(tracker.api_calls))
548
+ if tracker.max_tokens:
549
+ pct = tracker.total_tokens / tracker.max_tokens * 100
550
+ table.add_row("Budget used", f"{pct:.1f}%")
551
+ console.print()
552
+ console.print(table)
553
+
531
554
 
532
555
  def _write_file(path: Path, content: str) -> None:
533
556
  try:
@@ -0,0 +1,79 @@
1
+ """Document text extraction for rex-machine.
2
+
3
+ Supports .docx, .pptx, and .pdf files. Libraries are imported at runtime
4
+ so the tool works without them — install with `pip install rex-machine[docs]`.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ DOCUMENT_EXTENSIONS = frozenset({".docx", ".pptx", ".pdf"})
12
+
13
+ _INSTALL_HINT = "Install document support with: pip install rex-machine[docs]"
14
+
15
+
16
+ def extract_text(filepath: Path, max_chars: int = 5000) -> str | None:
17
+ """Extract plain text from a document file.
18
+
19
+ Returns None if the file is not a supported document format.
20
+ Returns an error message string if extraction fails.
21
+ """
22
+ ext = filepath.suffix.lower()
23
+ if ext not in DOCUMENT_EXTENSIONS:
24
+ return None
25
+
26
+ extractors = {
27
+ ".docx": _extract_docx,
28
+ ".pptx": _extract_pptx,
29
+ ".pdf": _extract_pdf,
30
+ }
31
+
32
+ try:
33
+ text = extractors[ext](filepath)
34
+ except ImportError:
35
+ return f"Cannot read {ext} files. {_INSTALL_HINT}"
36
+ except Exception as exc:
37
+ return f"Error reading document: {exc}"
38
+
39
+ if not text.strip():
40
+ return "(empty document)"
41
+
42
+ if len(text) > max_chars:
43
+ return text[:max_chars] + "\n\n... [truncated]"
44
+ return text
45
+
46
+
47
+ def _extract_docx(filepath: Path) -> str:
48
+ from docx import Document
49
+
50
+ doc = Document(str(filepath))
51
+ return "\n".join(p.text for p in doc.paragraphs)
52
+
53
+
54
+ def _extract_pptx(filepath: Path) -> str:
55
+ from pptx import Presentation
56
+
57
+ prs = Presentation(str(filepath))
58
+ parts: list[str] = []
59
+ for i, slide in enumerate(prs.slides, 1):
60
+ slide_texts: list[str] = []
61
+ for shape in slide.shapes:
62
+ if shape.has_text_frame:
63
+ slide_texts.append(shape.text_frame.text)
64
+ if slide_texts:
65
+ parts.append(f"--- Slide {i} ---")
66
+ parts.append("\n".join(slide_texts))
67
+ return "\n".join(parts)
68
+
69
+
70
+ def _extract_pdf(filepath: Path) -> str:
71
+ from pypdf import PdfReader
72
+
73
+ reader = PdfReader(str(filepath))
74
+ parts: list[str] = []
75
+ for page in reader.pages:
76
+ text = page.extract_text()
77
+ if text:
78
+ parts.append(text)
79
+ return "\n".join(parts)
@@ -8,6 +8,8 @@ import os
8
8
  from dataclasses import dataclass, field
9
9
  from pathlib import Path
10
10
 
11
+ from rex_machine.documents import DOCUMENT_EXTENSIONS
12
+
11
13
  # Extensions that are almost certainly binary
12
14
  BINARY_EXTENSIONS = frozenset(
13
15
  {
@@ -42,13 +44,10 @@ BINARY_EXTENSIONS = frozenset(
42
44
  ".pyo",
43
45
  ".class",
44
46
  ".wasm",
45
- ".pdf",
46
47
  ".doc",
47
- ".docx",
48
48
  ".xls",
49
49
  ".xlsx",
50
50
  ".ppt",
51
- ".pptx",
52
51
  ".ttf",
53
52
  ".otf",
54
53
  ".woff",
@@ -101,6 +100,7 @@ class FileInfo:
101
100
  size_bytes: int
102
101
  extension: str
103
102
  is_binary: bool
103
+ is_document: bool = False
104
104
 
105
105
 
106
106
  @dataclass
@@ -117,9 +117,9 @@ class RepoMap:
117
117
  def file_tree(self) -> str:
118
118
  """Return a textual file tree representation."""
119
119
  lines: list[str] = []
120
+ doc_paths = {f.relative_path for f in self.files if f.is_document}
120
121
  dirs: dict[str, list[str]] = {}
121
122
  for f in self.files:
122
- # Use forward-slash splitting to avoid Windows backslash issues
123
123
  if "/" in f.relative_path:
124
124
  parent = f.relative_path.rsplit("/", 1)[0]
125
125
  name = f.relative_path.rsplit("/", 1)[1]
@@ -133,7 +133,9 @@ class RepoMap:
133
133
  lines.append(f"{dir_path}/")
134
134
  for name in sorted(dirs[dir_path]):
135
135
  prefix = f" {dir_path}/" if dir_path else ""
136
- lines.append(f" {prefix}{name}")
136
+ rel = f"{dir_path}/{name}" if dir_path else name
137
+ tag = " [doc]" if rel in doc_paths else ""
138
+ lines.append(f" {prefix}{name}{tag}")
137
139
 
138
140
  return "\n".join(lines)
139
141
 
@@ -188,6 +190,8 @@ def _should_skip_dir(dirname: str) -> bool:
188
190
  def _is_binary_file(filepath: Path) -> bool:
189
191
  """Heuristic check for binary files."""
190
192
  ext = filepath.suffix.lower()
193
+ if ext in DOCUMENT_EXTENSIONS:
194
+ return False
191
195
  if ext in BINARY_EXTENSIONS:
192
196
  return True
193
197
 
@@ -266,6 +270,7 @@ def scan_repo(path: str | Path) -> RepoMap:
266
270
  size_bytes=size,
267
271
  extension=ext,
268
272
  is_binary=is_binary,
273
+ is_document=ext in DOCUMENT_EXTENSIONS,
269
274
  )
270
275
  repo_map.files.append(file_info)
271
276
  repo_map.total_files += 1
@@ -0,0 +1,33 @@
1
+ """Token usage tracking for rex-machine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import threading
6
+ from dataclasses import dataclass, field
7
+
8
+
9
+ @dataclass
10
+ class TokenTracker:
11
+ """Tracks token usage across all API calls in a run."""
12
+
13
+ max_tokens: int | None = None
14
+ input_tokens: int = 0
15
+ output_tokens: int = 0
16
+ api_calls: int = 0
17
+ _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)
18
+
19
+ @property
20
+ def total_tokens(self) -> int:
21
+ return self.input_tokens + self.output_tokens
22
+
23
+ @property
24
+ def budget_exceeded(self) -> bool:
25
+ if self.max_tokens is None:
26
+ return False
27
+ return self.total_tokens >= self.max_tokens
28
+
29
+ def record(self, input_tokens: int, output_tokens: int) -> None:
30
+ with self._lock:
31
+ self.input_tokens += input_tokens
32
+ self.output_tokens += output_tokens
33
+ self.api_calls += 1
@@ -0,0 +1,62 @@
1
+ """Tests for documents.py."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rex_machine.documents import DOCUMENT_EXTENSIONS, extract_text
6
+
7
+
8
+ class TestDocumentExtensions:
9
+ def test_supported_formats(self):
10
+ assert ".docx" in DOCUMENT_EXTENSIONS
11
+ assert ".pptx" in DOCUMENT_EXTENSIONS
12
+ assert ".pdf" in DOCUMENT_EXTENSIONS
13
+
14
+ def test_old_formats_not_included(self):
15
+ assert ".doc" not in DOCUMENT_EXTENSIONS
16
+ assert ".ppt" not in DOCUMENT_EXTENSIONS
17
+ assert ".xls" not in DOCUMENT_EXTENSIONS
18
+
19
+
20
+ class TestExtractText:
21
+ def test_non_document_returns_none(self, tmp_path: Path):
22
+ f = tmp_path / "code.py"
23
+ f.write_text("print('hi')", encoding="utf-8")
24
+ assert extract_text(f) is None
25
+
26
+ def test_missing_library_returns_hint(self, tmp_path: Path, monkeypatch):
27
+ f = tmp_path / "test.docx"
28
+ f.write_bytes(b"fake docx content")
29
+
30
+ def _raise_import(filepath):
31
+ raise ImportError("No module named 'docx'")
32
+
33
+ monkeypatch.setattr("rex_machine.documents._extract_docx", _raise_import)
34
+ result = extract_text(f)
35
+ assert result is not None
36
+ assert "Cannot read .docx" in result
37
+ assert "pip install" in result
38
+
39
+ def test_corrupt_file_returns_error(self, tmp_path: Path):
40
+ f = tmp_path / "bad.pdf"
41
+ f.write_bytes(b"not a real pdf")
42
+ result = extract_text(f)
43
+ assert result is not None
44
+ assert "Error reading document" in result
45
+
46
+ def test_truncation(self, tmp_path: Path, monkeypatch):
47
+ long_text = "x" * 10000
48
+
49
+ monkeypatch.setattr("rex_machine.documents._extract_docx", lambda _: long_text)
50
+ f = tmp_path / "big.docx"
51
+ f.write_bytes(b"fake")
52
+ result = extract_text(f, max_chars=100)
53
+ assert result is not None
54
+ assert len(result) < 200
55
+ assert "[truncated]" in result
56
+
57
+ def test_empty_document(self, tmp_path: Path, monkeypatch):
58
+ monkeypatch.setattr("rex_machine.documents._extract_docx", lambda _: " ")
59
+ f = tmp_path / "empty.docx"
60
+ f.write_bytes(b"fake")
61
+ result = extract_text(f)
62
+ assert result == "(empty document)"
@@ -93,6 +93,21 @@ class TestIsBinaryFile:
93
93
  f.write_bytes(b"hello\x00world")
94
94
  assert _is_binary_file(f) is True
95
95
 
96
+ def test_docx_not_binary(self, tmp_path: Path):
97
+ f = tmp_path / "report.docx"
98
+ f.write_bytes(b"PK\x03\x04fake")
99
+ assert _is_binary_file(f) is False
100
+
101
+ def test_pptx_not_binary(self, tmp_path: Path):
102
+ f = tmp_path / "slides.pptx"
103
+ f.write_bytes(b"PK\x03\x04fake")
104
+ assert _is_binary_file(f) is False
105
+
106
+ def test_pdf_not_binary(self, tmp_path: Path):
107
+ f = tmp_path / "doc.pdf"
108
+ f.write_bytes(b"%PDF-1.4 fake")
109
+ assert _is_binary_file(f) is False
110
+
96
111
 
97
112
  # ─── scan_repo ───────────────────────────────────────────────────
98
113
 
@@ -135,6 +150,19 @@ class TestScanRepo:
135
150
  source_paths = {f.relative_path for f in repo_map.source_files}
136
151
  assert "image.png" not in source_paths
137
152
 
153
+ def test_document_files_detected(self, tmp_repo: Path):
154
+ (tmp_repo / "report.docx").write_bytes(b"PK\x03\x04fake")
155
+ (tmp_repo / "slides.pptx").write_bytes(b"PK\x03\x04fake")
156
+ repo_map = scan_repo(tmp_repo)
157
+ doc_files = {f.relative_path for f in repo_map.files if f.is_document}
158
+ assert "report.docx" in doc_files
159
+ assert "slides.pptx" in doc_files
160
+
161
+ def test_document_in_file_tree_tagged(self, tmp_repo: Path):
162
+ (tmp_repo / "notes.docx").write_bytes(b"PK\x03\x04fake")
163
+ repo_map = scan_repo(tmp_repo)
164
+ assert "[doc]" in repo_map.file_tree
165
+
138
166
 
139
167
  # ─── SKIP_DIRS consistency ───────────────────────────────────────
140
168
 
@@ -0,0 +1,50 @@
1
+ """Tests for tokens.py."""
2
+
3
+ from rex_machine.tokens import TokenTracker
4
+
5
+
6
+ class TestTokenTracker:
7
+ def test_initial_state(self):
8
+ t = TokenTracker()
9
+ assert t.input_tokens == 0
10
+ assert t.output_tokens == 0
11
+ assert t.total_tokens == 0
12
+ assert t.api_calls == 0
13
+ assert t.budget_exceeded is False
14
+
15
+ def test_record(self):
16
+ t = TokenTracker()
17
+ t.record(100, 50)
18
+ assert t.input_tokens == 100
19
+ assert t.output_tokens == 50
20
+ assert t.total_tokens == 150
21
+ assert t.api_calls == 1
22
+
23
+ def test_multiple_records(self):
24
+ t = TokenTracker()
25
+ t.record(100, 50)
26
+ t.record(200, 80)
27
+ assert t.input_tokens == 300
28
+ assert t.output_tokens == 130
29
+ assert t.total_tokens == 430
30
+ assert t.api_calls == 2
31
+
32
+ def test_budget_not_exceeded(self):
33
+ t = TokenTracker(max_tokens=1000)
34
+ t.record(200, 100)
35
+ assert t.budget_exceeded is False
36
+
37
+ def test_budget_exceeded(self):
38
+ t = TokenTracker(max_tokens=500)
39
+ t.record(300, 250)
40
+ assert t.budget_exceeded is True
41
+
42
+ def test_no_budget_never_exceeded(self):
43
+ t = TokenTracker()
44
+ t.record(999999, 999999)
45
+ assert t.budget_exceeded is False
46
+
47
+ def test_budget_exact_boundary(self):
48
+ t = TokenTracker(max_tokens=100)
49
+ t.record(50, 50)
50
+ assert t.budget_exceeded is True
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes