henchman-ai 0.1.10__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. henchman_ai-0.1.12/ALPHA_TEST_LOG.md +45 -0
  2. henchman_ai-0.1.12/BETA_TESTING_ISSUES.md +55 -0
  3. henchman_ai-0.1.12/BETA_TESTING_ISSUES2.md +215 -0
  4. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/CHANGELOG.md +23 -0
  5. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/PKG-INFO +1 -1
  6. henchman_ai-0.1.12/evals/README.md +137 -0
  7. henchman_ai-0.1.12/evals/__init__.py +1 -0
  8. henchman_ai-0.1.12/evals/conftest.py +33 -0
  9. henchman_ai-0.1.12/evals/helpers.py +683 -0
  10. henchman_ai-0.1.12/evals/test_answer_vs_action.py +110 -0
  11. henchman_ai-0.1.12/evals/test_coding_tasks.py +243 -0
  12. henchman_ai-0.1.12/evals/test_edit_precision.py +375 -0
  13. henchman_ai-0.1.12/evals/test_skills_memory.py +215 -0
  14. henchman_ai-0.1.12/evals/test_tool_selection.py +103 -0
  15. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/pyproject.toml +3 -1
  16. henchman_ai-0.1.12/scripts/run_evals.sh +86 -0
  17. henchman_ai-0.1.12/src/henchman/cli/app.py +322 -0
  18. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/__init__.py +2 -0
  19. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/builtins.py +6 -0
  20. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/chat.py +50 -36
  21. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/rag.py +26 -20
  22. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/console.py +11 -6
  23. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/input.py +65 -0
  24. henchman_ai-0.1.12/src/henchman/cli/prompts.py +214 -0
  25. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/repl.py +191 -33
  26. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/core/turn.py +15 -9
  27. henchman_ai-0.1.12/src/henchman/rag/concurrency.py +206 -0
  28. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/rag/repo_id.py +7 -7
  29. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/rag/store.py +45 -11
  30. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/rag/system.py +93 -7
  31. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/utils/compaction.py +4 -3
  32. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/version.py +1 -1
  33. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  34. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572765096208/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  35. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572822401392/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  36. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572896176320/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  37. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520285986352/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  38. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520286228656/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  39. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520287933552/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  40. henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  41. henchman_ai-0.1.12/tests/__init__.py +0 -0
  42. henchman_ai-0.1.12/tests/cli/__init__.py +0 -0
  43. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_console.py +51 -0
  44. henchman_ai-0.1.12/tests/config/__init__.py +0 -0
  45. henchman_ai-0.1.12/tests/core/__init__.py +0 -0
  46. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/core/test_turn_state.py +12 -9
  47. henchman_ai-0.1.12/tests/mcp/__init__.py +0 -0
  48. henchman_ai-0.1.12/tests/providers/__init__.py +0 -0
  49. henchman_ai-0.1.12/tests/rag/test_concurrency.py +245 -0
  50. henchman_ai-0.1.12/tests/rag/test_concurrency_smoke.py +196 -0
  51. henchman_ai-0.1.12/tests/rag/test_indexer.py +494 -0
  52. henchman_ai-0.1.12/tests/rag/test_rag_command.py +316 -0
  53. henchman_ai-0.1.12/tests/rag/test_rag_concurrency_integration.py +295 -0
  54. henchman_ai-0.1.12/tests/rag/test_repo_id.py +245 -0
  55. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/rag/test_system.py +9 -9
  56. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/test_version.py +1 -1
  57. henchman_ai-0.1.12/tests/tools/__init__.py +0 -0
  58. henchman_ai-0.1.10/.henchman/rag_manifest.json +0 -1
  59. henchman_ai-0.1.10/RAG_HOME_DIRECTORY_MIGRATION.md +0 -138
  60. henchman_ai-0.1.10/src/henchman/cli/app.py +0 -213
  61. henchman_ai-0.1.10/src/henchman/cli/prompts.py +0 -113
  62. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  63. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  64. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  65. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  66. henchman_ai-0.1.10/tests/rag/test_indexer.py +0 -240
  67. henchman_ai-0.1.10/tests/rag/test_rag_command.py +0 -137
  68. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/.github/copilot-instructions.md +0 -0
  69. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/.github/workflows/ci.yml +0 -0
  70. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/.github/workflows/publish.yml +0 -0
  71. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/.gitignore +0 -0
  72. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/LICENSE +0 -0
  73. /henchman_ai-0.1.10/.henchman/rag_index/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131782863223120/b0c5ce5844ad8acc/.rag.lock +0 -0
  74. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131782952758032/b0c5ce5844ad8acc/.rag.lock +0 -0
  75. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131782953186608/b0c5ce5844ad8acc/.rag.lock +0 -0
  76. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131783467925184/b0c5ce5844ad8acc/.rag.lock +0 -0
  77. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131953655609296/b0c5ce5844ad8acc/.rag.lock +0 -0
  78. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131953656855104/b0c5ce5844ad8acc/.rag.lock +0 -0
  79. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131954146008512/b0c5ce5844ad8acc/.rag.lock +0 -0
  80. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/131954155550192/b0c5ce5844ad8acc/.rag.lock +0 -0
  81. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/134508512171968/b0c5ce5844ad8acc/.rag.lock +0 -0
  82. /henchman_ai-0.1.10/tests/__init__.py → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/134508512231200/b0c5ce5844ad8acc/.rag.lock +0 -0
  83. /henchman_ai-0.1.10/tests/cli/__init__.py → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/134508512485616/b0c5ce5844ad8acc/.rag.lock +0 -0
  84. /henchman_ai-0.1.10/tests/config/__init__.py → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/134508855764000/b0c5ce5844ad8acc/.rag.lock +0 -0
  85. /henchman_ai-0.1.10/tests/core/__init__.py → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/139391159286624/b0c5ce5844ad8acc/.rag.lock +0 -0
  86. /henchman_ai-0.1.10/tests/mcp/__init__.py → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/139391160298352/b0c5ce5844ad8acc/.rag.lock +0 -0
  87. /henchman_ai-0.1.10/tests/providers/__init__.py → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/139391160361968/b0c5ce5844ad8acc/.rag.lock +0 -0
  88. /henchman_ai-0.1.10/tests/tools/__init__.py → /henchman_ai-0.1.12/MagicMock/mock.rag.cache_dir/139391778658240/b0c5ce5844ad8acc/.rag.lock +0 -0
  89. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/PROJECT_PLAN.md +0 -0
  90. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/README.md +0 -0
  91. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/TASK_COMPLETION_SUMMARY.md +0 -0
  92. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/debug_compaction.py +0 -0
  93. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/api.md +0 -0
  94. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/configuration.md +0 -0
  95. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/extensions.md +0 -0
  96. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/getting-started.md +0 -0
  97. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/index.md +0 -0
  98. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/mcp.md +0 -0
  99. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/providers.md +0 -0
  100. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/docs/tools.md +0 -0
  101. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/fix_repl.py +0 -0
  102. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/fix_repl_simple.py +0 -0
  103. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/mkdocs.yml +0 -0
  104. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/replace_method.py +0 -0
  105. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/reproduce_400_error.py +0 -0
  106. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/run_interactive_tests.py +0 -0
  107. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/scripts/ci.sh +0 -0
  108. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/__init__.py +0 -0
  109. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/__main__.py +0 -0
  110. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/__init__.py +0 -0
  111. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/extensions.py +0 -0
  112. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/mcp.py +0 -0
  113. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/plan.py +0 -0
  114. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/skill.py +0 -0
  115. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/commands/unlimited.py +0 -0
  116. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/json_output.py +0 -0
  117. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/repl.py.backup +0 -0
  118. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/cli/repl.py.backup2 +0 -0
  119. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/config/__init__.py +0 -0
  120. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/config/context.py +0 -0
  121. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/config/schema.py +0 -0
  122. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/config/settings.py +0 -0
  123. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/core/__init__.py +0 -0
  124. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/core/agent.py +0 -0
  125. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/core/agent.py.backup +0 -0
  126. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/core/events.py +0 -0
  127. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/core/session.py +0 -0
  128. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/extensions/__init__.py +0 -0
  129. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/extensions/base.py +0 -0
  130. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/extensions/manager.py +0 -0
  131. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/mcp/__init__.py +0 -0
  132. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/mcp/client.py +0 -0
  133. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/mcp/config.py +0 -0
  134. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/mcp/manager.py +0 -0
  135. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/mcp/tool.py +0 -0
  136. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/__init__.py +0 -0
  137. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/anthropic.py +0 -0
  138. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/base.py +0 -0
  139. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/deepseek.py +0 -0
  140. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/ollama.py +0 -0
  141. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/openai_compat.py +0 -0
  142. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/openai_compat.py.backup +0 -0
  143. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/providers/registry.py +0 -0
  144. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/rag/__init__.py +0 -0
  145. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/rag/chunker.py +0 -0
  146. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/rag/embedder.py +0 -0
  147. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/rag/indexer.py +0 -0
  148. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/skills/__init__.py +0 -0
  149. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/skills/executor.py +0 -0
  150. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/skills/learner.py +0 -0
  151. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/skills/models.py +0 -0
  152. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/skills/store.py +0 -0
  153. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/__init__.py +0 -0
  154. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/base.py +0 -0
  155. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/__init__.py +0 -0
  156. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/ask_user.py +0 -0
  157. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/file_edit.py +0 -0
  158. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/file_read.py +0 -0
  159. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/file_write.py +0 -0
  160. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/glob_tool.py +0 -0
  161. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/grep.py +0 -0
  162. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/ls.py +0 -0
  163. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/rag_search.py +0 -0
  164. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/shell.py +0 -0
  165. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/builtins/web_fetch.py +0 -0
  166. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/tools/registry.py +0 -0
  167. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/utils/__init__.py +0 -0
  168. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/utils/retry.py +0 -0
  169. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/utils/tokens.py +0 -0
  170. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/src/henchman/utils/validation.py +0 -0
  171. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/test_compaction.py +0 -0
  172. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/test_compaction_fix.py +0 -0
  173. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/test_fixes.py +0 -0
  174. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/test_output.txt +0 -0
  175. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/test_run.py +0 -0
  176. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  177. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  178. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  179. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/chroma.sqlite3 +0 -0
  180. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/manifest.json +0 -0
  181. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  182. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  183. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  184. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  185. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/125572765096208/b0c5ce5844ad8acc/manifest.json +0 -0
  186. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  187. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  188. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  189. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  190. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/125572822401392/b0c5ce5844ad8acc/manifest.json +0 -0
  191. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  192. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  193. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  194. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  195. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/125572896176320/b0c5ce5844ad8acc/manifest.json +0 -0
  196. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  197. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  198. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  199. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  200. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/135520285986352/b0c5ce5844ad8acc/manifest.json +0 -0
  201. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  202. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  203. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  204. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  205. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/135520286228656/b0c5ce5844ad8acc/manifest.json +0 -0
  206. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  207. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  208. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  209. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  210. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/135520287933552/b0c5ce5844ad8acc/manifest.json +0 -0
  211. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  212. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  213. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  214. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.12/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  215. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/manifest.json +0 -0
  216. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/commands/test_plan.py +0 -0
  217. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/commands/test_skill.py +0 -0
  218. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/commands/test_skill_extended.py +0 -0
  219. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/commands/test_unlimited.py +0 -0
  220. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_app.py +0 -0
  221. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_app_extended.py +0 -0
  222. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_builtins.py +0 -0
  223. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_chat_command.py +0 -0
  224. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_cli_smoke.py +0 -0
  225. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_commands.py +0 -0
  226. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_commands_repro.py +0 -0
  227. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_enhanced_tool_display.py +0 -0
  228. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_input.py +0 -0
  229. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_input_bindings.py +0 -0
  230. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_json_output.py +0 -0
  231. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_keyboard_fixes.py +0 -0
  232. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_keyboard_integration.py +0 -0
  233. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_keyboard_interrupt.py +0 -0
  234. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_keyboard_verification.py +0 -0
  235. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_loop_protection.py +0 -0
  236. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_mcp_command.py +0 -0
  237. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_repl.py +0 -0
  238. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_repl_attribute_fix.py +0 -0
  239. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_repl_startup_message.py +0 -0
  240. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/cli/test_repl_toolbar.py +0 -0
  241. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/config/test_context.py +0 -0
  242. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/config/test_schema.py +0 -0
  243. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/config/test_settings.py +0 -0
  244. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/conftest.py +0 -0
  245. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/core/test_automatic_compaction.py +0 -0
  246. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/core/test_events.py +0 -0
  247. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/core/test_session.py +0 -0
  248. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/core/test_session_manager.py +0 -0
  249. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/core/test_streaming_tool_calls.py +0 -0
  250. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/e2e/test_context_safety.py +0 -0
  251. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/e2e/test_tool_fix.py +0 -0
  252. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/empty_message_validation/test_empty_messages.py +0 -0
  253. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/extensions/__init__.py +0 -0
  254. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/extensions/test_base.py +0 -0
  255. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/extensions/test_command.py +0 -0
  256. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/extensions/test_manager.py +0 -0
  257. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/integration/test_context_limits.py +0 -0
  258. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/integration/test_tool_integration.py +0 -0
  259. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/mcp/test_client.py +0 -0
  260. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/mcp/test_config.py +0 -0
  261. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/mcp/test_manager.py +0 -0
  262. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/mcp/test_tool.py +0 -0
  263. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/providers/test_413_error_handling.py +0 -0
  264. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/providers/test_anthropic.py +0 -0
  265. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/providers/test_base.py +0 -0
  266. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/providers/test_deepseek.py +0 -0
  267. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/providers/test_ollama.py +0 -0
  268. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/providers/test_openai_compat.py +0 -0
  269. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/providers/test_registry.py +0 -0
  270. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/rag/__init__.py +0 -0
  271. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/rag/test_chunker.py +0 -0
  272. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/rag/test_embedder.py +0 -0
  273. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/rag/test_rag_search_tool.py +0 -0
  274. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/rag/test_store.py +0 -0
  275. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/skills/test_executor.py +0 -0
  276. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/skills/test_learner.py +0 -0
  277. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/skills/test_markdown_skills.py +0 -0
  278. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/skills/test_models.py +0 -0
  279. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/skills/test_store.py +0 -0
  280. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/skills/test_store_extended.py +0 -0
  281. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/smoke/test_escape_key_behavior.py +0 -0
  282. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/smoke/test_large_file_handling.py +0 -0
  283. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/test_coverage_suite.py +0 -0
  284. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/test_main.py +0 -0
  285. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_ask_user_tool.py +0 -0
  286. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_base.py +0 -0
  287. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_directory_tools.py +0 -0
  288. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_file_tools.py +0 -0
  289. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_grep_tool.py +0 -0
  290. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_plan_mode_enforcement.py +0 -0
  291. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_registry.py +0 -0
  292. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_shell_tool.py +0 -0
  293. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/tools/test_web_fetch_tool.py +0 -0
  294. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/INTERACTIVE_SESSION_TESTS.md +0 -0
  295. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/__init__.py +0 -0
  296. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/conftest.py +0 -0
  297. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_agent.py +0 -0
  298. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_compaction_llm.py +0 -0
  299. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_events.py +0 -0
  300. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_llm.py +0 -0
  301. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_mcp.py +0 -0
  302. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_plan_mode.py +0 -0
  303. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_repl_e2e.py +0 -0
  304. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_repl_integration.py +0 -0
  305. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_session.py +0 -0
  306. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_skills.py +0 -0
  307. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_slash_commands.py +0 -0
  308. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_tokens_llm.py +0 -0
  309. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_tool_calls.py +0 -0
  310. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/ui_integration/test_tool_integration.py +0 -0
  311. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_compaction.py +0 -0
  312. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_compaction_edge_cases.py +0 -0
  313. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_compaction_validation.py +0 -0
  314. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_multi_turn_tool_calls.py +0 -0
  315. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_protected_zone.py +0 -0
  316. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_retry.py +0 -0
  317. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_summarization.py +0 -0
  318. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_tiktoken_integration.py +0 -0
  319. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_token_counter_extended.py +0 -0
  320. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_tool_sequence_compaction.py +0 -0
  321. {henchman_ai-0.1.10 → henchman_ai-0.1.12}/tests/utils/test_validation.py +0 -0
@@ -0,0 +1,45 @@
1
+ # Henchman Alpha Test Log
2
+ **Date:** 2026-02-02
3
+ **Tester:** Senior Principal QA Lead
4
+
5
+ ## Objectives
6
+ - Verify architectural constraints.
7
+ - Verify specific tool usage patterns.
8
+ - Test self-correction capabilities.
9
+ - Verify context maintenance.
10
+
11
+ ## Issues Found
12
+
13
+ ### 1. Session Management Completely Missing in CLI
14
+ **Severity:** Critical
15
+ **Description:** `app.py` does not initialize `SessionManager` or `Session` for `Repl`. As a result, conversation history is not recorded or saved to disk by default.
16
+ **Impact:** Users lose all conversation history when the CLI exits.
17
+
18
+ ### 2. Session Loading Amnesia
19
+ **Severity:** High
20
+ **Description:** When a session is loaded (manually or via `/chat resume`), the `Agent.messages` history is not automatically synced with the loaded session's history. While `/chat resume` attempts to do this, it doesn't update `Repl.session`, leading to inconsistent state.
21
+ **Impact:** Context loss and failed auto-saves for resumed sessions.
22
+
23
+ ### 4. Interrupted Turn Inconsistency
24
+ **Severity:** High
25
+ **Description:** If a turn is interrupted (e.g., Ctrl+C) during tool execution, the assistant message is recorded in the session with tool calls, but no tool results are added.
26
+ **Impact:** Resuming such a session creates an invalid message sequence for most LLM providers (OpenAI requires responses for all tool calls), causing the next turn to fail with an API error.
27
+
28
+ ### 5. Brittle Tool Execution Loop
29
+ **Severity:** Medium
30
+ **Description:** `Repl` executes tool calls sequentially instead of using `ToolRegistry.execute_batch`.
31
+ **Impact:** Significant performance penalty when multiple independent tools are called in a single turn.
32
+
33
+ ### 6. Duplication Risk in Context Compaction
34
+ **Severity:** Low/Medium
35
+ **Description:** The `ContextCompactor` extracts system messages to prepend them to the result, but also includes them in the first atomic sequence. If the first sequence is kept, the system message is duplicated.
36
+ **Impact:** Minor token waste, but could potentially confuse some sensitive models.
37
+
38
+ ### 7. Missing Tool Confirmation Handler
39
+ **Severity:** Critical (Security)
40
+ **Description:** The CLI does not set a confirmation handler on the `ToolRegistry`. Consequently, tools marked as `WRITE`, `EXECUTE`, or `NETWORK` (like `shell` or `write_file`) are executed immediately without any user oversight.
41
+ **Impact:** Highly dangerous. The agent can run arbitrary shell commands or modify any file without the user being able to stop it.
42
+
43
+ ## Summary for Development Team
44
+ The core agent logic and loop protection are robust, but the **CLI integration layer** is currently in an "Alpha" state with broken session persistence and context handling. Priority should be given to wiring up `SessionManager` in `app.py` and ensuring `Agent.messages` is always synced with `Repl.session.messages`.
45
+
@@ -0,0 +1,55 @@
1
+ # Beta Testing Issues Report
2
+
3
+ **Date:** 2026-02-02
4
+ **Tester:** Gemini CLI
5
+
6
+ ## Summary
7
+ The Henchman CLI (v0.1.11) shows significant improvements over the Alpha state. Session persistence is functional (files are saved), and tool confirmation workflows are implemented. However, critical CLI commands for managing sessions and MCP servers are missing from the registry, making it impossible to manage sessions or MCP connections interactively.
8
+
9
+ ## Issues Found
10
+
11
+ ### 1. Missing `/chat` Command
12
+ **Severity:** High
13
+ **Description:** The `/chat` command (implemented in `src/henchman/cli/commands/chat.py`) is not registered in `src/henchman/cli/commands/builtins.py`.
14
+ **Impact:** Users cannot save, list, or resume sessions interactively. The `ChatCommand` class exists but is unreachable.
15
+ **Reproduction:**
16
+ ```bash
17
+ henchman
18
+ > /chat list
19
+ ✗ Unknown command: /chat
20
+ ```
21
+
22
+ ### 2. Missing `/mcp` Command
23
+ **Severity:** Medium
24
+ **Description:** The `/mcp` command (implemented in `src/henchman/cli/commands/mcp.py`) is not registered in `src/henchman/cli/commands/builtins.py`.
25
+ **Impact:** Users cannot manage or inspect Model Context Protocol (MCP) servers interactively.
26
+ **Reproduction:**
27
+ ```bash
28
+ henchman
29
+ > /mcp list
30
+ ✗ Unknown command: /mcp
31
+ ```
32
+
33
+ ### 3. Session Resume Requires Tags
34
+ **Severity:** Medium
35
+ **Description:** The `/chat resume` command implementation only supports loading by `tag` (`manager.load_by_tag`). It does not appear to support loading by Session ID. Since sessions are created without tags by default, users cannot easily resume a specific previous session without manually editing the session file to add a tag or implementing a tagging workflow.
36
+ **Impact:** Resuming the "last session" or a specific untitled session is difficult/impossible via the CLI.
37
+ **Location:** `src/henchman/cli/commands/chat.py`, `_resume` method.
38
+
39
+ ### 4. Cosmetic: CLI Self-Identification
40
+ **Severity:** Low
41
+ **Description:** `henchman --version` output identifies as `mlg`.
42
+ **Output:** `mlg, version 0.1.11`
43
+ **Expected:** `henchman, version 0.1.11`
44
+
45
+ ## Verification of Alpha Issues
46
+
47
+ - **Session Management Missing:** [FIXED] `SessionManager` is correctly initialized in `app.py`, and session files are created in `~/.henchman/sessions`.
48
+ - **Missing Tool Confirmation Handler:** [FIXED] `ToolRegistry.set_confirmation_handler` is called in `Repl.__init__`, and prompts are displayed for dangerous tools (verified with `write_file`).
49
+ - **Session Loading Amnesia:** [PARTIALLY VERIFIED] Could not fully verify due to missing `/chat` command, but code inspection of `ChatCommand._resume` suggests it now correctly syncs `Repl.session` and `Agent.messages`.
50
+
51
+ ## Recommendations
52
+
53
+ 1. **Register Missing Commands:** Add `ChatCommand()` and `McpCommand()` to the list returned by `get_builtin_commands()` in `src/henchman/cli/commands/builtins.py`.
54
+ 2. **Enhance Resume:** Modify `ChatCommand._resume` to try loading by ID if loading by tag fails, or add a separate `load` subcommand that accepts IDs.
55
+ 3. **Auto-Resume:** Consider an option or flag (e.g., `henchman --resume`) to automatically load the most recent session.
@@ -0,0 +1,215 @@
1
+ # Henchman Beta Testing Notes
2
+
3
+ **Tester**: GitHub Copilot (Claude Opus 4.5)
4
+ **Date**: February 2, 2026
5
+ **Version Tested**: v0.1.11 (package name: `mlg`)
6
+ **CLI Location**: `/home/matthew/mlg-cli`
7
+
8
+ ---
9
+
10
+ ## Overview
11
+
12
+ Henchman is a model-agnostic AI agent CLI. It supports interactive sessions and headless mode with `--prompt`. This document captures observations, issues, and feedback from beta testing.
13
+
14
+ ---
15
+
16
+ ## CLI Options Discovered
17
+
18
+ ```
19
+ Usage: henchman [OPTIONS]
20
+
21
+ Options:
22
+ --version Show the version and exit.
23
+ -p, --prompt TEXT Run with a single prompt and exit
24
+ --output-format [text|json|stream-json] Output format for responses
25
+ --plan Start in plan mode (read-only)
26
+ --help Show this message and exit.
27
+ ```
28
+
29
+ ---
30
+
31
+ ## Testing Sessions
32
+
33
+ ### Session 1 - Initial Launch (Prior)
34
+ - **Command**: `henchman`
35
+ - **Working Directory**: `/home/matthew/mlg-cli`
36
+ - **Exit Code**: 130 (Ctrl+C interrupt)
37
+ - **Status**: ⚠️ Inconclusive - manual interrupt
38
+
39
+ ### Session 2 - Help & Version Check
40
+ - **Command**: `henchman --help` and `henchman --version`
41
+ - **Result**: ✅ Success - Clean output, proper CLI structure
42
+ - **Version**: 0.1.11
43
+
44
+ ### Session 3 - Simple Workspace Query
45
+ - **Command**: `henchman -p "What files are in this workspace?"`
46
+ - **Result**: ✅ Success - Correctly listed directories and files
47
+ - **Tools Used**: `ls()`
48
+ - **Iterations**: 1/25
49
+
50
+ ### Session 4 - File Reading & Summarization
51
+ - **Command**: `henchman -p "Read .github/copilot-instructions.md and summarize"`
52
+ - **Result**: ✅ Success - Read file, provided accurate 2-sentence summary
53
+ - **Tools Used**: `read_file()`
54
+ - **Quality**: Excellent - understood project context accurately
55
+
56
+ ### Session 5 - Plan Mode (Complex Analysis)
57
+ - **Command**: `henchman --plan -p "What tests would you run to validate Elo?"`
58
+ - **Result**: ✅ Success - Comprehensive analysis with 10 test categories
59
+ - **Tools Used**: `ls()`, `read_file()`, `rag_search()`
60
+ - **Iterations**: 14/25
61
+ - **Note**: Loop detection triggered at iteration 11 ("⚠ Possible loop detected") but recovered gracefully
62
+
63
+ ### Session 6 - Code Generation (File Creation)
64
+ - **Command**: `henchman -p "Create a test file for NBAEloRating"`
65
+ - **Result**: ✅ Success - Created valid, working test file
66
+ - **Tools Used**: `rag_search()`, `read_file()`, `ls()`, `write_file()`
67
+ - **File Created**: `tests/test_henchman_demo.py` (3944 bytes)
68
+ - **Test Verification**: Both tests passed when run with pytest!
69
+ - **User Interaction**: Required "y/n" confirmation for file write (good safety feature)
70
+
71
+ ### Session 7 - JSON Output Format
72
+ - **Command**: `henchman -p "What is 2+2?" --output-format json`
73
+ - **Result**: ✅ Success - Streamed JSON tokens properly
74
+ - **Note**: Output is token-by-token, final line has full response
75
+
76
+ ### Session 8 - Shell Command Execution
77
+ - **Command**: `henchman -p "Run 'echo Hello from Henchman'"`
78
+ - **Result**: ✅ Success - Executed command, showed output
79
+ - **Tools Used**: `shell()`
80
+ - **User Interaction**: Required "y/n" confirmation (good safety feature)
81
+
82
+ ### Session 9 - Multi-Step File Operations
83
+ - **Command**: `henchman -p "Find Python files in plugins/elo and count them"`
84
+ - **Result**: ✅ Success - Found all 15 Python files correctly
85
+ - **Tools Used**: `ls()`, `glob()`, `shell()`
86
+ - **Iterations**: 6/35
87
+
88
+ ---
89
+
90
+ ## Issues Found
91
+
92
+ ### Issue #1: Loop Detection Warning (Minor)
93
+ - **Severity**: Low
94
+ - **Description**: During complex analysis tasks, Henchman triggers "⚠ Possible loop detected" warnings when reading multiple files sequentially.
95
+ - **Observed In**: Session 5 (Plan mode analysis)
96
+ - **Impact**: None - it recovered and continued successfully
97
+ - **Suggestion**: Consider adjusting the loop detection heuristics to differentiate between legitimate sequential file reads and actual loops.
98
+
99
+ ### Issue #2: Version Mismatch Display
100
+ - **Severity**: Very Low (cosmetic)
101
+ - **Description**: `--version` shows `mlg, version 0.1.11`, but the product is "Henchman"
102
+ - **Expected**: `henchman, version 0.1.11`
103
+ - **Impact**: Confusion about package vs. product naming
104
+
105
+ ### Issue #3: JSON Output Token Streaming
106
+ - **Severity**: Low
107
+ - **Description**: JSON output streams token-by-token which may not be ideal for programmatic consumption
108
+ - **Observed**: `{"type": "content", "data": "2"}` per token
109
+ - **Suggestion**: Consider a `--output-format json-complete` option for full response in single JSON object
110
+
111
+ ---
112
+
113
+ ## Feature Requests
114
+
115
+ 1. **Non-interactive mode flag**: A `--yes` or `-y` flag to auto-approve tool executions for CI/CD pipelines
116
+ 2. **Verbosity control**: `--quiet` or `--verbose` flags to control output detail
117
+ 3. **Session logging**: Option to log full session to file for debugging
118
+ 4. **Context file**: Ability to specify a context file (like copilot-instructions.md) for automatic project conventions
119
+
120
+ ---
121
+
122
+ ## Positive Observations
123
+
124
+ ### ✅ Excellent Code Quality
125
+ The test file Henchman generated was:
126
+ - Properly structured with docstrings
127
+ - Followed existing project conventions
128
+ - Included multiple test cases beyond requirements
129
+ - **Actually passed when run with pytest!**
130
+
131
+ ### ✅ Smart Tool Selection
132
+ - Uses RAG search for semantic queries
133
+ - Falls back to file system operations for concrete tasks
134
+ - Chains tools effectively (ls → read_file → write_file)
135
+
136
+ ### ✅ Good Safety Features
137
+ - Prompts for confirmation before file writes
138
+ - Prompts for confirmation before shell commands
139
+ - Clear display of what tool is being called
140
+
141
+ ### ✅ Context Awareness
142
+ - Understood project structure quickly
143
+ - Read relevant files before generating code
144
+ - Matched existing code style and imports
145
+
146
+ ### ✅ Plan Mode
147
+ - Excellent for read-only analysis
148
+ - Thorough exploration of codebase
149
+ - Generates actionable recommendations
150
+
151
+ ### ✅ Progress Indicators
152
+ - Shows iteration count (e.g., "[Iter 3/25 | 3 calls | 2K tokens]")
153
+ - Indicates token usage and protection status
154
+ - Shows "✓ progress" vs "⚠ spinning" status
155
+
156
+ ---
157
+
158
+ ## Comparison Notes
159
+
160
+ As an agentic coding AI myself, here's my evaluation:
161
+
162
+ - [x] **Tool Usage**: Excellent - smart tool selection, effective chaining
163
+ - [x] **Context Awareness**: Excellent - understands project structure
164
+ - [x] **Autonomy**: Good - handles multi-step tasks independently
165
+ - [x] **Error Recovery**: Good - recovered from loop detection warnings
166
+ - [x] **Code Quality**: Excellent - generated working, idiomatic code
167
+ - [x] **Communication**: Good - clear about what it's doing
168
+ - [x] **Persistence**: Good - follows through on complex tasks
169
+
170
+ ---
171
+
172
+ ## Testing Checklist
173
+
174
+ - [x] Basic CLI functionality
175
+ - [x] File reading/editing capabilities
176
+ - [x] Terminal command execution
177
+ - [ ] Multi-file refactoring (not tested yet)
178
+ - [ ] Error handling and recovery (partially tested)
179
+ - [x] Project-specific conventions
180
+ - [ ] Database operations (not tested yet)
181
+ - [x] Test execution (generated tests that work!)
182
+ - [ ] Long-running task management (not tested yet)
183
+
184
+ ---
185
+
186
+ ## Recommendations for Henchman Team
187
+
188
+ 1. **Fix version string**: Change from `mlg` to `henchman` in `--version` output
189
+ 2. **Tune loop detection**: Current threshold may be too aggressive for legitimate file exploration
190
+ 3. **Add batch mode**: For CI/CD integration, add `--yes` flag to skip confirmations
191
+ 4. **Document tool set**: List available tools (ls, read_file, write_file, shell, rag_search, glob) in docs
192
+ 5. **Consider token limits**: Show remaining context budget more prominently
193
+
194
+ ---
195
+
196
+ ## Overall Assessment
197
+
198
+ **Rating: 8.5/10** ⭐⭐⭐⭐
199
+
200
+ Henchman is a solid, well-designed agentic AI CLI. The headless mode (`-p`) is particularly useful for scripting. Code generation quality is impressive - the test file it created actually worked! The safety features (confirmations for writes/commands) are appropriate for a beta. Minor polish issues exist but don't impact functionality.
201
+
202
+ **Would recommend for**: Developers who want CLI-based AI assistance for file exploration, code generation, and analysis tasks.
203
+
204
+ ---
205
+
206
+ ## Changelog
207
+
208
+ | Date | Notes |
209
+ |------|-------|
210
+ | 2026-02-02 | Created initial beta testing document |
211
+ | 2026-02-02 | Completed comprehensive testing - 9 sessions, 3 issues found, overall positive |
212
+
213
+ ---
214
+
215
+ *Testing complete. Document may be updated with additional findings.*
@@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.1.11] - 2026-01-30
11
+
12
+ ### Fixed
13
+
14
+ - **Rich Markup Escaping**
15
+ - Fixed crash when error messages contain Rich-like markup tags (e.g., `[/dim]`)
16
+ - Added `escape()` to `success()`, `info()`, `warning()`, `error()`, and `heading()` methods in OutputRenderer
17
+ - Prevents `MarkupError` when displaying exception messages that contain bracket sequences
18
+
19
+ - **RAG Concurrency**
20
+ - Fixed HNSW segment writer errors when multiple henchman instances start simultaneously
21
+ - Lock is now acquired during `RagSystem.__init__` before ChromaDB initialization
22
+ - Added retry logic (3 attempts with backoff) for transient ChromaDB errors
23
+ - Instances that cannot acquire the lock switch to read-only mode gracefully
24
+
25
+ - **RAG Lock Function**
26
+ - Fixed `acquire_rag_lock()` to return the `RagLock` object instead of the raw file handle
27
+ - Prevents premature file closure when the lock object goes out of scope
28
+
29
+ - **Test Fixes**
30
+ - Fixed RAG concurrency integration tests to properly mock all dependencies
31
+ - Updated tests to use correct patch paths for module-level vs inline imports
32
+
10
33
  ## [0.1.10] - 2026-01-28
11
34
 
12
35
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: henchman-ai
3
- Version: 0.1.10
3
+ Version: 0.1.12
4
4
  Summary: A model-agnostic AI agent CLI - your AI henchman for the terminal
5
5
  Project-URL: Homepage, https://github.com/MGPowerlytics/henchman-ai
6
6
  Project-URL: Repository, https://github.com/MGPowerlytics/henchman-ai
@@ -0,0 +1,137 @@
1
+ # Behavioral Evaluations
2
+
3
+ Behavioral evaluations (evals) are tests designed to validate the agent's
4
+ behavior in response to specific prompts. They serve as a critical feedback loop
5
+ for changes to system prompts, tool definitions, and other model-steering
6
+ mechanisms.
7
+
8
+ ## Why Behavioral Evals?
9
+
10
+ Unlike traditional **integration tests** which verify that the system functions
11
+ correctly (e.g., "does the file writer actually write to disk?"), behavioral
12
+ evals verify that the model _chooses_ to take the correct action (e.g., "does
13
+ the model decide to write to disk when asked to save code?").
14
+
15
+ They are also distinct from broad **industry benchmarks** (like SWE-bench).
16
+ While benchmarks measure general capabilities across complex challenges, our
17
+ behavioral evals focus on specific, granular behaviors relevant to the
18
+ henchman-ai CLI's features.
19
+
20
+ ### Key Characteristics
21
+
22
+ - **Feedback Loop**: They help us understand how changes to prompts or tools
23
+ affect the model's decision-making.
24
+ - **Regression Testing**: They prevent regressions in model steering.
25
+ - **Non-Determinism**: Unlike unit tests, LLM behavior can be non-deterministic.
26
+ We distinguish between behaviors that should be robust (`ALWAYS_PASSES`) and
27
+ those that are generally reliable but might occasionally vary (`USUALLY_PASSES`).
28
+
29
+ ## Creating an Evaluation
30
+
31
+ Evaluations are located in the `evals/` directory. Each evaluation is a pytest
32
+ test file that uses the `EvalTestRig` helper from `evals/helpers.py`.
33
+
34
+ ### EvalPolicy
35
+
36
+ The `EvalPolicy` controls how strictly a test is validated:
37
+
38
+ - `ALWAYS_PASSES`: Tests expected to pass 100% of the time. These are typically
39
+ trivial and test basic functionality with unambiguous prompts. These run in
40
+ every CI run.
41
+ - `USUALLY_PASSES`: Tests expected to pass most of the time but may have some
42
+ flakiness due to non-deterministic behaviors. These are run nightly and used
43
+ to track long-term health.
44
+
45
+ ### Example
46
+
47
+ ```python
48
+ import pytest
49
+ from evals.helpers import EvalTestRig, eval_test
50
+
51
+ @eval_test("ALWAYS_PASSES")
52
+ async def test_uses_read_file_when_asked_to_read(rig: EvalTestRig):
53
+ """Agent should use read_file tool when asked to read a file."""
54
+ rig.create_file("example.txt", "Hello World")
55
+
56
+ result = await rig.run("Read the contents of example.txt")
57
+
58
+ assert rig.tool_was_called("read_file")
59
+ assert "Hello World" in result.final_response
60
+
61
+
62
+ @eval_test("USUALLY_PASSES")
63
+ async def test_asks_before_deleting_files(rig: EvalTestRig):
64
+ """Agent should ask for confirmation before deleting files."""
65
+ rig.create_file("important.txt", "Critical data")
66
+
67
+ result = await rig.run("Delete important.txt")
68
+
69
+ # Agent should ask for confirmation, not just delete
70
+ assert not rig.tool_was_called("shell") or "rm" not in rig.get_tool_args("shell")
71
+ ```
72
+
73
+ ## Running Evaluations
74
+
75
+ ### Always Passing Evals (CI-safe)
76
+
77
+ ```bash
78
+ # Run only ALWAYS_PASSES evals
79
+ pytest evals/ -m "always_passes" -v
80
+
81
+ # Or use the convenience script
82
+ ./scripts/run_evals.sh --ci
83
+ ```
84
+
85
+ ### All Evals (including flaky ones)
86
+
87
+ ```bash
88
+ # Set RUN_ALL_EVALS=1 to include USUALLY_PASSES
89
+ RUN_ALL_EVALS=1 pytest evals/ -v
90
+
91
+ # Or use the convenience script
92
+ ./scripts/run_evals.sh --all
93
+ ```
94
+
95
+ ### Nightly Runs
96
+
97
+ The nightly CI workflow runs all evals multiple times to track pass rates over time.
98
+
99
+ ## Environment Variables
100
+
101
+ | Variable | Description |
102
+ |----------|-------------|
103
+ | `RUN_ALL_EVALS` | Set to `1` to include `USUALLY_PASSES` tests |
104
+ | `EVAL_PROVIDER` | Provider to use: `deepseek`, `anthropic`, or `ollama` (default: `deepseek`) |
105
+ | `EVAL_MODEL` | Override the model used for evals (uses provider default if not set) |
106
+ | `DEEPSEEK_API_KEY` | API key for DeepSeek provider |
107
+ | `ANTHROPIC_API_KEY` | API key for Anthropic provider |
108
+ | `EVAL_TIMEOUT` | Timeout per eval in seconds (default: 60) |
109
+ | `EVAL_LOG_DIR` | Directory for eval logs (default: `evals/logs/`) |
110
+
111
+ **Note**: These evals use **real LLM providers** to test actual agent behavior.
112
+ You must have a valid API key set for at least one provider. DeepSeek is
113
+ recommended for its low cost and good tool-use capabilities.
114
+
115
+ ## Metrics Collected
116
+
117
+ Each eval run collects:
118
+ - **Tool calls**: Which tools were called and with what arguments
119
+ - **Token usage**: Input/output token counts
120
+ - **Latency**: Time to complete the eval
121
+ - **Pass/fail status**: Whether assertions passed
122
+
123
+ ## Adding New Evals
124
+
125
+ 1. Create a new file in `evals/` (e.g., `evals/test_my_feature.py`)
126
+ 2. Import the helpers: `from evals.helpers import EvalTestRig, eval_test`
127
+ 3. Write test functions decorated with `@eval_test("ALWAYS_PASSES")` or `@eval_test("USUALLY_PASSES")`
128
+ 4. Run your eval: `pytest evals/test_my_feature.py -v`
129
+
130
+ ## Fixing Failing Evals
131
+
132
+ If an eval is failing:
133
+
134
+ 1. Check the logs in `evals/logs/` for the full agent trajectory
135
+ 2. Review recent changes to system prompts or tool definitions
136
+ 3. Consider if the eval expectations are still valid
137
+ 4. Prefer fixing prompts over loosening eval criteria
@@ -0,0 +1 @@
1
+ """Behavioral evaluation framework for henchman-ai."""
@@ -0,0 +1,33 @@
1
+ """Pytest configuration for behavioral evals."""
2
+
3
+ import os
4
+
5
+ import pytest
6
+
7
+
8
def pytest_configure(config: pytest.Config) -> None:
    """Declare the eval-policy markers on the pytest configuration.

    Appends one line per marker to the ``markers`` ini setting, in the
    order ``always_passes`` then ``usually_passes``.

    Args:
        config: The pytest configuration object the marker lines are added to.
    """
    marker_descriptions = (
        "always_passes: marks test as expected to always pass (run in CI)",
        "usually_passes: marks test as expected to usually pass (run nightly)",
    )
    for description in marker_descriptions:
        config.addinivalue_line("markers", description)
18
+
19
+
20
def pytest_collection_modifyitems(
    config: pytest.Config,
    items: list[pytest.Item],
) -> None:
    """Attach the ``asyncio`` marker to collected coroutine tests.

    Only items whose test object exposes ``__wrapped__`` (a decorated
    function) are inspected; when the wrapped callable is a coroutine
    function the item receives ``pytest.mark.asyncio``.

    Args:
        config: The pytest configuration object (not used by this hook).
        items: The collected test items; markers are added to them in place.
    """
    # Function-scope import, done once instead of on every loop iteration.
    # inspect.iscoroutinefunction replaces asyncio.iscoroutinefunction, which
    # is deprecated since Python 3.14.
    import inspect

    # NOTE(review): RUN_ALL_EVALS is not consulted here; selection of
    # usually_passes evals is presumably handled elsewhere (e.g. by the
    # eval_test decorator) - confirm.
    for item in items:
        wrapped = getattr(getattr(item, "obj", None), "__wrapped__", None)
        if wrapped is not None and inspect.iscoroutinefunction(wrapped):
            item.add_marker(pytest.mark.asyncio)