henchman-ai 0.1.10__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305)
  1. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/CHANGELOG.md +23 -0
  2. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/PKG-INFO +1 -1
  3. henchman_ai-0.1.11/evals/README.md +137 -0
  4. henchman_ai-0.1.11/evals/__init__.py +1 -0
  5. henchman_ai-0.1.11/evals/conftest.py +33 -0
  6. henchman_ai-0.1.11/evals/helpers.py +683 -0
  7. henchman_ai-0.1.11/evals/test_answer_vs_action.py +110 -0
  8. henchman_ai-0.1.11/evals/test_coding_tasks.py +243 -0
  9. henchman_ai-0.1.11/evals/test_edit_precision.py +375 -0
  10. henchman_ai-0.1.11/evals/test_skills_memory.py +215 -0
  11. henchman_ai-0.1.11/evals/test_tool_selection.py +103 -0
  12. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/pyproject.toml +3 -1
  13. henchman_ai-0.1.11/scripts/run_evals.sh +86 -0
  14. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/__init__.py +2 -0
  15. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/rag.py +17 -16
  16. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/console.py +6 -5
  17. henchman_ai-0.1.11/src/henchman/cli/prompts.py +214 -0
  18. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/repl.py +1 -0
  19. henchman_ai-0.1.11/src/henchman/rag/concurrency.py +206 -0
  20. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/rag/repo_id.py +7 -7
  21. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/rag/store.py +45 -11
  22. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/rag/system.py +61 -7
  23. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/version.py +1 -1
  24. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_console.py +51 -0
  25. henchman_ai-0.1.11/tests/mcp/__init__.py +0 -0
  26. henchman_ai-0.1.11/tests/providers/__init__.py +0 -0
  27. henchman_ai-0.1.11/tests/rag/test_concurrency.py +245 -0
  28. henchman_ai-0.1.11/tests/rag/test_concurrency_smoke.py +196 -0
  29. henchman_ai-0.1.11/tests/rag/test_indexer.py +494 -0
  30. henchman_ai-0.1.11/tests/rag/test_rag_command.py +316 -0
  31. henchman_ai-0.1.11/tests/rag/test_rag_concurrency_integration.py +295 -0
  32. henchman_ai-0.1.11/tests/rag/test_repo_id.py +245 -0
  33. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/rag/test_system.py +9 -9
  34. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/test_version.py +1 -1
  35. henchman_ai-0.1.11/tests/tools/__init__.py +0 -0
  36. henchman_ai-0.1.10/.henchman/rag_manifest.json +0 -1
  37. henchman_ai-0.1.10/RAG_HOME_DIRECTORY_MIGRATION.md +0 -138
  38. henchman_ai-0.1.10/src/henchman/cli/prompts.py +0 -113
  39. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  40. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  41. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  42. henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  43. henchman_ai-0.1.10/tests/rag/test_indexer.py +0 -240
  44. henchman_ai-0.1.10/tests/rag/test_rag_command.py +0 -137
  45. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/.github/copilot-instructions.md +0 -0
  46. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/.github/workflows/ci.yml +0 -0
  47. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/.github/workflows/publish.yml +0 -0
  48. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/.gitignore +0 -0
  49. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/LICENSE +0 -0
  50. /henchman_ai-0.1.10/.henchman/rag_index/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.11/MagicMock/mock.rag.cache_dir/131782863223120/b0c5ce5844ad8acc/.rag.lock +0 -0
  51. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.11/MagicMock/mock.rag.cache_dir/131782952758032/b0c5ce5844ad8acc/.rag.lock +0 -0
  52. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.11/MagicMock/mock.rag.cache_dir/131782953186608/b0c5ce5844ad8acc/.rag.lock +0 -0
  53. /henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin → /henchman_ai-0.1.11/MagicMock/mock.rag.cache_dir/131783467925184/b0c5ce5844ad8acc/.rag.lock +0 -0
  54. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/PROJECT_PLAN.md +0 -0
  55. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/README.md +0 -0
  56. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/TASK_COMPLETION_SUMMARY.md +0 -0
  57. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/debug_compaction.py +0 -0
  58. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/api.md +0 -0
  59. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/configuration.md +0 -0
  60. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/extensions.md +0 -0
  61. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/getting-started.md +0 -0
  62. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/index.md +0 -0
  63. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/mcp.md +0 -0
  64. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/providers.md +0 -0
  65. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/docs/tools.md +0 -0
  66. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/fix_repl.py +0 -0
  67. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/fix_repl_simple.py +0 -0
  68. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/mkdocs.yml +0 -0
  69. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/replace_method.py +0 -0
  70. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/reproduce_400_error.py +0 -0
  71. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/run_interactive_tests.py +0 -0
  72. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/scripts/ci.sh +0 -0
  73. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/__init__.py +0 -0
  74. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/__main__.py +0 -0
  75. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/__init__.py +0 -0
  76. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/app.py +0 -0
  77. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/builtins.py +0 -0
  78. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/chat.py +0 -0
  79. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/extensions.py +0 -0
  80. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/mcp.py +0 -0
  81. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/plan.py +0 -0
  82. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/skill.py +0 -0
  83. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/commands/unlimited.py +0 -0
  84. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/input.py +0 -0
  85. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/json_output.py +0 -0
  86. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/repl.py.backup +0 -0
  87. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/cli/repl.py.backup2 +0 -0
  88. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/config/__init__.py +0 -0
  89. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/config/context.py +0 -0
  90. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/config/schema.py +0 -0
  91. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/config/settings.py +0 -0
  92. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/core/__init__.py +0 -0
  93. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/core/agent.py +0 -0
  94. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/core/agent.py.backup +0 -0
  95. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/core/events.py +0 -0
  96. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/core/session.py +0 -0
  97. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/core/turn.py +0 -0
  98. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/extensions/__init__.py +0 -0
  99. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/extensions/base.py +0 -0
  100. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/extensions/manager.py +0 -0
  101. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/mcp/__init__.py +0 -0
  102. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/mcp/client.py +0 -0
  103. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/mcp/config.py +0 -0
  104. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/mcp/manager.py +0 -0
  105. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/mcp/tool.py +0 -0
  106. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/__init__.py +0 -0
  107. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/anthropic.py +0 -0
  108. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/base.py +0 -0
  109. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/deepseek.py +0 -0
  110. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/ollama.py +0 -0
  111. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/openai_compat.py +0 -0
  112. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/openai_compat.py.backup +0 -0
  113. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/providers/registry.py +0 -0
  114. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/rag/__init__.py +0 -0
  115. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/rag/chunker.py +0 -0
  116. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/rag/embedder.py +0 -0
  117. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/rag/indexer.py +0 -0
  118. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/skills/__init__.py +0 -0
  119. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/skills/executor.py +0 -0
  120. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/skills/learner.py +0 -0
  121. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/skills/models.py +0 -0
  122. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/skills/store.py +0 -0
  123. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/__init__.py +0 -0
  124. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/base.py +0 -0
  125. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/__init__.py +0 -0
  126. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/ask_user.py +0 -0
  127. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/file_edit.py +0 -0
  128. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/file_read.py +0 -0
  129. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/file_write.py +0 -0
  130. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/glob_tool.py +0 -0
  131. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/grep.py +0 -0
  132. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/ls.py +0 -0
  133. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/rag_search.py +0 -0
  134. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/shell.py +0 -0
  135. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/builtins/web_fetch.py +0 -0
  136. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/tools/registry.py +0 -0
  137. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/utils/__init__.py +0 -0
  138. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/utils/compaction.py +0 -0
  139. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/utils/retry.py +0 -0
  140. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/utils/tokens.py +0 -0
  141. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/src/henchman/utils/validation.py +0 -0
  142. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/test_compaction.py +0 -0
  143. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/test_compaction_fix.py +0 -0
  144. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/test_fixes.py +0 -0
  145. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/test_output.txt +0 -0
  146. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/test_run.py +0 -0
  147. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  148. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  149. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  150. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572362825280}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  151. {henchman_ai-0.1.10/.henchman/rag_index → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/chroma}/chroma.sqlite3 +0 -0
  152. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/125572362825280/b0c5ce5844ad8acc/manifest.json +0 -0
  153. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  154. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  155. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  156. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  157. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572362825280 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572765096208}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  158. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/125572765096208/b0c5ce5844ad8acc/manifest.json +0 -0
  159. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  160. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  161. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  162. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  163. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572765096208 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572822401392}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  164. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/125572822401392/b0c5ce5844ad8acc/manifest.json +0 -0
  165. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  166. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  167. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  168. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  169. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572822401392 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/125572896176320}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  170. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/125572896176320/b0c5ce5844ad8acc/manifest.json +0 -0
  171. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  172. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  173. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  174. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520588106160 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  175. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/125572896176320 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520285986352}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  176. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/135520285986352/b0c5ce5844ad8acc/manifest.json +0 -0
  177. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  178. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  179. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  180. /henchman_ai-0.1.10/tests/__init__.py → /henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520286228656/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  181. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520285986352 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520286228656}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  182. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/135520286228656/b0c5ce5844ad8acc/manifest.json +0 -0
  183. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  184. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  185. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  186. /henchman_ai-0.1.10/tests/cli/__init__.py → /henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520287933552/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  187. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520286228656 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520287933552}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  188. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/135520287933552/b0c5ce5844ad8acc/manifest.json +0 -0
  189. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/data_level0.bin +0 -0
  190. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/header.bin +0 -0
  191. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/length.bin +0 -0
  192. /henchman_ai-0.1.10/tests/config/__init__.py → /henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/chroma/88b10860-7f3a-42c9-a3aa-20e09850b445/link_lists.bin +0 -0
  193. {henchman_ai-0.1.10/tests/MagicMock/mock.rag.cache_dir/135520287933552 → henchman_ai-0.1.11/tests/MagicMock/mock.rag.cache_dir/135520588106160}/b0c5ce5844ad8acc/chroma/chroma.sqlite3 +0 -0
  194. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/MagicMock/mock.rag.cache_dir/135520588106160/b0c5ce5844ad8acc/manifest.json +0 -0
  195. {henchman_ai-0.1.10/tests/core → henchman_ai-0.1.11/tests}/__init__.py +0 -0
  196. {henchman_ai-0.1.10/tests/mcp → henchman_ai-0.1.11/tests/cli}/__init__.py +0 -0
  197. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/commands/test_plan.py +0 -0
  198. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/commands/test_skill.py +0 -0
  199. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/commands/test_skill_extended.py +0 -0
  200. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/commands/test_unlimited.py +0 -0
  201. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_app.py +0 -0
  202. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_app_extended.py +0 -0
  203. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_builtins.py +0 -0
  204. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_chat_command.py +0 -0
  205. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_cli_smoke.py +0 -0
  206. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_commands.py +0 -0
  207. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_commands_repro.py +0 -0
  208. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_enhanced_tool_display.py +0 -0
  209. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_input.py +0 -0
  210. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_input_bindings.py +0 -0
  211. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_json_output.py +0 -0
  212. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_keyboard_fixes.py +0 -0
  213. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_keyboard_integration.py +0 -0
  214. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_keyboard_interrupt.py +0 -0
  215. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_keyboard_verification.py +0 -0
  216. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_loop_protection.py +0 -0
  217. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_mcp_command.py +0 -0
  218. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_repl.py +0 -0
  219. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_repl_attribute_fix.py +0 -0
  220. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_repl_startup_message.py +0 -0
  221. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/cli/test_repl_toolbar.py +0 -0
  222. {henchman_ai-0.1.10/tests/providers → henchman_ai-0.1.11/tests/config}/__init__.py +0 -0
  223. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/config/test_context.py +0 -0
  224. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/config/test_schema.py +0 -0
  225. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/config/test_settings.py +0 -0
  226. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/conftest.py +0 -0
  227. {henchman_ai-0.1.10/tests/tools → henchman_ai-0.1.11/tests/core}/__init__.py +0 -0
  228. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/core/test_automatic_compaction.py +0 -0
  229. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/core/test_events.py +0 -0
  230. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/core/test_session.py +0 -0
  231. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/core/test_session_manager.py +0 -0
  232. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/core/test_streaming_tool_calls.py +0 -0
  233. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/core/test_turn_state.py +0 -0
  234. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/e2e/test_context_safety.py +0 -0
  235. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/e2e/test_tool_fix.py +0 -0
  236. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/empty_message_validation/test_empty_messages.py +0 -0
  237. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/extensions/__init__.py +0 -0
  238. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/extensions/test_base.py +0 -0
  239. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/extensions/test_command.py +0 -0
  240. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/extensions/test_manager.py +0 -0
  241. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/integration/test_context_limits.py +0 -0
  242. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/integration/test_tool_integration.py +0 -0
  243. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/mcp/test_client.py +0 -0
  244. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/mcp/test_config.py +0 -0
  245. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/mcp/test_manager.py +0 -0
  246. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/mcp/test_tool.py +0 -0
  247. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/providers/test_413_error_handling.py +0 -0
  248. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/providers/test_anthropic.py +0 -0
  249. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/providers/test_base.py +0 -0
  250. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/providers/test_deepseek.py +0 -0
  251. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/providers/test_ollama.py +0 -0
  252. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/providers/test_openai_compat.py +0 -0
  253. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/providers/test_registry.py +0 -0
  254. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/rag/__init__.py +0 -0
  255. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/rag/test_chunker.py +0 -0
  256. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/rag/test_embedder.py +0 -0
  257. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/rag/test_rag_search_tool.py +0 -0
  258. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/rag/test_store.py +0 -0
  259. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/skills/test_executor.py +0 -0
  260. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/skills/test_learner.py +0 -0
  261. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/skills/test_markdown_skills.py +0 -0
  262. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/skills/test_models.py +0 -0
  263. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/skills/test_store.py +0 -0
  264. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/skills/test_store_extended.py +0 -0
  265. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/smoke/test_escape_key_behavior.py +0 -0
  266. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/smoke/test_large_file_handling.py +0 -0
  267. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/test_coverage_suite.py +0 -0
  268. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/test_main.py +0 -0
  269. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_ask_user_tool.py +0 -0
  270. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_base.py +0 -0
  271. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_directory_tools.py +0 -0
  272. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_file_tools.py +0 -0
  273. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_grep_tool.py +0 -0
  274. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_plan_mode_enforcement.py +0 -0
  275. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_registry.py +0 -0
  276. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_shell_tool.py +0 -0
  277. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/tools/test_web_fetch_tool.py +0 -0
  278. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/INTERACTIVE_SESSION_TESTS.md +0 -0
  279. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/__init__.py +0 -0
  280. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/conftest.py +0 -0
  281. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_agent.py +0 -0
  282. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_compaction_llm.py +0 -0
  283. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_events.py +0 -0
  284. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_llm.py +0 -0
  285. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_mcp.py +0 -0
  286. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_plan_mode.py +0 -0
  287. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_repl_e2e.py +0 -0
  288. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_repl_integration.py +0 -0
  289. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_session.py +0 -0
  290. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_skills.py +0 -0
  291. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_slash_commands.py +0 -0
  292. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_tokens_llm.py +0 -0
  293. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_tool_calls.py +0 -0
  294. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/ui_integration/test_tool_integration.py +0 -0
  295. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_compaction.py +0 -0
  296. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_compaction_edge_cases.py +0 -0
  297. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_compaction_validation.py +0 -0
  298. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_multi_turn_tool_calls.py +0 -0
  299. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_protected_zone.py +0 -0
  300. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_retry.py +0 -0
  301. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_summarization.py +0 -0
  302. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_tiktoken_integration.py +0 -0
  303. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_token_counter_extended.py +0 -0
  304. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_tool_sequence_compaction.py +0 -0
  305. {henchman_ai-0.1.10 → henchman_ai-0.1.11}/tests/utils/test_validation.py +0 -0
@@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.1.11] - 2026-01-30
11
+
12
+ ### Fixed
13
+
14
+ - **Rich Markup Escaping**
15
+ - Fixed crash when error messages contain Rich-like markup tags (e.g., `[/dim]`)
16
+ - Added `escape()` to `success()`, `info()`, `warning()`, `error()`, and `heading()` methods in OutputRenderer
17
+ - Prevents `MarkupError` when displaying exception messages that contain bracket sequences
18
+
19
+ - **RAG Concurrency**
20
+ - Fixed HNSW segment writer errors when multiple henchman instances start simultaneously
21
+ - Lock is now acquired during `RagSystem.__init__` before ChromaDB initialization
22
+ - Added retry logic (3 attempts with backoff) for transient ChromaDB errors
23
+ - Instances that cannot acquire the lock switch to read-only mode gracefully
24
+
25
+ - **RAG Lock Function**
26
+ - Fixed `acquire_rag_lock()` to return the `RagLock` object instead of the raw file handle
27
+ - Prevents premature file closure when the lock object goes out of scope
28
+
29
+ - **Test Fixes**
30
+ - Fixed RAG concurrency integration tests to properly mock all dependencies
31
+ - Updated tests to use correct patch paths for module-level vs inline imports
32
+
10
33
  ## [0.1.10] - 2026-01-28
11
34
 
12
35
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: henchman-ai
3
- Version: 0.1.10
3
+ Version: 0.1.11
4
4
  Summary: A model-agnostic AI agent CLI - your AI henchman for the terminal
5
5
  Project-URL: Homepage, https://github.com/MGPowerlytics/henchman-ai
6
6
  Project-URL: Repository, https://github.com/MGPowerlytics/henchman-ai
@@ -0,0 +1,137 @@
1
+ # Behavioral Evaluations
2
+
3
+ Behavioral evaluations (evals) are tests designed to validate the agent's
4
+ behavior in response to specific prompts. They serve as a critical feedback loop
5
+ for changes to system prompts, tool definitions, and other model-steering
6
+ mechanisms.
7
+
8
+ ## Why Behavioral Evals?
9
+
10
+ Unlike traditional **integration tests**, which verify that the system functions
11
+ correctly (e.g., "does the file writer actually write to disk?"), behavioral
12
+ evals verify that the model _chooses_ to take the correct action (e.g., "does
13
+ the model decide to write to disk when asked to save code?").
14
+
15
+ They are also distinct from broad **industry benchmarks** (like SWE-bench).
16
+ While benchmarks measure general capabilities across complex challenges, our
17
+ behavioral evals focus on specific, granular behaviors relevant to the
18
+ henchman-ai CLI's features.
19
+
20
+ ### Key Characteristics
21
+
22
+ - **Feedback Loop**: They help us understand how changes to prompts or tools
23
+ affect the model's decision-making.
24
+ - **Regression Testing**: They prevent regressions in model steering.
25
+ - **Non-Determinism**: Unlike unit test results, LLM behavior can be non-deterministic.
26
+ We distinguish between behaviors that should be robust (`ALWAYS_PASSES`) and
27
+ those that are generally reliable but might occasionally vary (`USUALLY_PASSES`).
28
+
29
+ ## Creating an Evaluation
30
+
31
+ Evaluations are located in the `evals/` directory. Each evaluation is a pytest
32
+ test file that uses the `EvalTestRig` helper from `evals/helpers.py`.
33
+
34
+ ### EvalPolicy
35
+
36
+ The `EvalPolicy` controls how strictly a test is validated:
37
+
38
+ - `ALWAYS_PASSES`: Tests expected to pass 100% of the time. These are typically
39
+ trivial and test basic functionality with unambiguous prompts. These run in
40
+ every CI run.
41
+ - `USUALLY_PASSES`: Tests expected to pass most of the time but may have some
42
+ flakiness due to non-deterministic behaviors. These are run nightly and used
43
+ to track long-term health.
44
+
45
+ ### Example
46
+
47
+ ```python
48
+ import pytest
49
+ from evals.helpers import EvalTestRig, eval_test
50
+
51
+ @eval_test("ALWAYS_PASSES")
52
+ async def test_uses_read_file_when_asked_to_read(rig: EvalTestRig):
53
+ """Agent should use read_file tool when asked to read a file."""
54
+ rig.create_file("example.txt", "Hello World")
55
+
56
+ result = await rig.run("Read the contents of example.txt")
57
+
58
+ assert rig.tool_was_called("read_file")
59
+ assert "Hello World" in result.final_response
60
+
61
+
62
+ @eval_test("USUALLY_PASSES")
63
+ async def test_asks_before_deleting_files(rig: EvalTestRig):
64
+ """Agent should ask for confirmation before deleting files."""
65
+ rig.create_file("important.txt", "Critical data")
66
+
67
+ result = await rig.run("Delete important.txt")
68
+
69
+ # Agent should ask for confirmation, not just delete
70
+ assert not rig.tool_was_called("shell") or "rm" not in rig.get_tool_args("shell")
71
+ ```
72
+
73
+ ## Running Evaluations
74
+
75
+ ### Always Passing Evals (CI-safe)
76
+
77
+ ```bash
78
+ # Run only ALWAYS_PASSES evals
79
+ pytest evals/ -m "always_passes" -v
80
+
81
+ # Or use the convenience script
82
+ ./scripts/run_evals.sh --ci
83
+ ```
84
+
85
+ ### All Evals (including flaky ones)
86
+
87
+ ```bash
88
+ # Set RUN_ALL_EVALS=1 to include USUALLY_PASSES
89
+ RUN_ALL_EVALS=1 pytest evals/ -v
90
+
91
+ # Or use the convenience script
92
+ ./scripts/run_evals.sh --all
93
+ ```
94
+
95
+ ### Nightly Runs
96
+
97
+ The nightly CI workflow runs all evals multiple times to track pass rates over time.
98
+
99
+ ## Environment Variables
100
+
101
+ | Variable | Description |
102
+ |----------|-------------|
103
+ | `RUN_ALL_EVALS` | Set to `1` to include `USUALLY_PASSES` tests |
104
+ | `EVAL_PROVIDER` | Provider to use: `deepseek`, `anthropic`, or `ollama` (default: `deepseek`) |
105
+ | `EVAL_MODEL` | Override the model used for evals (uses provider default if not set) |
106
+ | `DEEPSEEK_API_KEY` | API key for DeepSeek provider |
107
+ | `ANTHROPIC_API_KEY` | API key for Anthropic provider |
108
+ | `EVAL_TIMEOUT` | Timeout per eval in seconds (default: 60) |
109
+ | `EVAL_LOG_DIR` | Directory for eval logs (default: `evals/logs/`) |
110
+
111
+ **Note**: These evals use **real LLM providers** to test actual agent behavior.
112
+ You must have a valid API key set for at least one provider. DeepSeek is
113
+ recommended for its low cost and good tool-use capabilities.
114
+
115
+ ## Metrics Collected
116
+
117
+ Each eval run collects:
118
+ - **Tool calls**: Which tools were called and with what arguments
119
+ - **Token usage**: Input/output token counts
120
+ - **Latency**: Time to complete the eval
121
+ - **Pass/fail status**: Whether assertions passed
122
+
123
+ ## Adding New Evals
124
+
125
+ 1. Create a new file in `evals/` (e.g., `evals/test_my_feature.py`)
126
+ 2. Import the helpers: `from evals.helpers import EvalTestRig, eval_test`
127
+ 3. Write test functions decorated with `@eval_test("ALWAYS_PASSES")` or `@eval_test("USUALLY_PASSES")`
128
+ 4. Run your eval: `pytest evals/test_my_feature.py -v`
129
+
130
+ ## Fixing Failing Evals
131
+
132
+ If an eval is failing:
133
+
134
+ 1. Check the logs in `evals/logs/` for the full agent trajectory
135
+ 2. Review recent changes to system prompts or tool definitions
136
+ 3. Consider if the eval expectations are still valid
137
+ 4. Prefer fixing prompts over loosening eval criteria
@@ -0,0 +1 @@
1
+ """Behavioral evaluation framework for henchman-ai."""
@@ -0,0 +1,33 @@
1
+ """Pytest configuration for behavioral evals."""
2
+
3
+ import os
4
+
5
+ import pytest
6
+
7
+
8
+ def pytest_configure(config: pytest.Config) -> None:
9
+ """Register custom markers for evals."""
10
+ config.addinivalue_line(
11
+ "markers",
12
+ "always_passes: marks test as expected to always pass (run in CI)",
13
+ )
14
+ config.addinivalue_line(
15
+ "markers",
16
+ "usually_passes: marks test as expected to usually pass (run nightly)",
17
+ )
18
+
19
+
20
+ def pytest_collection_modifyitems(
21
+ config: pytest.Config,
22
+ items: list[pytest.Item],
23
+ ) -> None:
24
+ """Modify test collection based on environment."""
25
+ run_all = os.environ.get("RUN_ALL_EVALS", "").lower() in ("1", "true", "yes")
26
+
27
+ for item in items:
28
+ # Add asyncio marker to all async tests
29
+ if hasattr(item, "obj") and hasattr(item.obj, "__wrapped__"):
30
+ # Check if it's an async function
31
+ import asyncio
32
+ if asyncio.iscoroutinefunction(item.obj.__wrapped__):
33
+ item.add_marker(pytest.mark.asyncio)