@crownpeak/dqm-react-component-dev-mcp 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444)
  1. package/README.md +138 -0
  2. package/data/.env.example +22 -0
  3. package/data/.gitattributes +47 -0
  4. package/data/.glfrc.json +7 -0
  5. package/data/.husky/pre-commit +5 -0
  6. package/data/.nvmrc +1 -0
  7. package/data/CHANGELOG.md +75 -0
  8. package/data/CODE_OF_CONDUCT.md +129 -0
  9. package/data/CONTRIBUTING.md +203 -0
  10. package/data/DOCS-STRUCTURE.md +307 -0
  11. package/data/I18N.md +292 -0
  12. package/data/LICENSE +22 -0
  13. package/data/README.md +315 -0
  14. package/data/SECURITY.md +125 -0
  15. package/data/WIKI-DEPLOYMENT.md +348 -0
  16. package/data/docs/AI-FEATURES.md +610 -0
  17. package/data/docs/API-REFERENCE.md +1022 -0
  18. package/data/docs/AUTHENTICATION.md +301 -0
  19. package/data/docs/BACKEND-API.md +468 -0
  20. package/data/docs/DEVELOPMENT.md +375 -0
  21. package/data/docs/EXAMPLES.md +622 -0
  22. package/data/docs/MCP-SERVER.md +307 -0
  23. package/data/docs/MIGRATION-GUIDE.md +367 -0
  24. package/data/docs/NPM-PUBLISH.md +193 -0
  25. package/data/docs/QUICKSTART.md +206 -0
  26. package/data/docs/REDIS-SETUP.md +162 -0
  27. package/data/docs/SERVER.md +228 -0
  28. package/data/docs/TROUBLESHOOTING.md +657 -0
  29. package/data/docs/WIDGET-GUIDE.md +638 -0
  30. package/data/docs/WIKI-HOME.md +58 -0
  31. package/data/docs/WIKI-SIDEBAR.md +39 -0
  32. package/data/package.json +171 -0
  33. package/data/playwright.config.ts +64 -0
  34. package/data/probe/.cargo/config.toml +10 -0
  35. package/data/probe/.claude/commands/performance-review.md +15 -0
  36. package/data/probe/.clinerules +288 -0
  37. package/data/probe/.dockerignore +57 -0
  38. package/data/probe/.githooks/post-commit +11 -0
  39. package/data/probe/.githooks/pre-commit +99 -0
  40. package/data/probe/.githooks/pre-commit-vow +9 -0
  41. package/data/probe/.prompts/engineer.md +41 -0
  42. package/data/probe/.roomodes +28 -0
  43. package/data/probe/.windsurfrules +0 -0
  44. package/data/probe/BASH_TOOL_SUMMARY.md +148 -0
  45. package/data/probe/BENCHMARKING.md +256 -0
  46. package/data/probe/CLAUDE.md +226 -0
  47. package/data/probe/CODE_OF_CONDUCT.md +128 -0
  48. package/data/probe/CONTRIBUTING.md +193 -0
  49. package/data/probe/Cargo.toml +120 -0
  50. package/data/probe/Cross.toml +10 -0
  51. package/data/probe/DOCKER-README.md +224 -0
  52. package/data/probe/Dockerfile +32 -0
  53. package/data/probe/ENHANCED_DEBUG_TELEMETRY.md +188 -0
  54. package/data/probe/LICENSE +201 -0
  55. package/data/probe/Makefile +210 -0
  56. package/data/probe/README.md +824 -0
  57. package/data/probe/SECURITY.md +67 -0
  58. package/data/probe/WINDOWS-GUIDE.md +294 -0
  59. package/data/probe/benches/parsing_benchmarks.rs +370 -0
  60. package/data/probe/benches/search_benchmarks.rs +599 -0
  61. package/data/probe/benches/simd_benchmarks.rs +372 -0
  62. package/data/probe/benches/timing_benchmarks.rs +287 -0
  63. package/data/probe/build-windows.bat +229 -0
  64. package/data/probe/codex-config/config.toml +6 -0
  65. package/data/probe/docs/PERFORMANCE_OPTIMIZATION.md +161 -0
  66. package/data/probe/examples/cache_demo.rs +46 -0
  67. package/data/probe/examples/chat/.dockerignore +37 -0
  68. package/data/probe/examples/chat/ChatSessionManager.js +295 -0
  69. package/data/probe/examples/chat/Dockerfile +98 -0
  70. package/data/probe/examples/chat/LICENSE +201 -0
  71. package/data/probe/examples/chat/LOCAL_IMAGE_SUPPORT.md +195 -0
  72. package/data/probe/examples/chat/MCP_INTEGRATION.md +400 -0
  73. package/data/probe/examples/chat/README.md +338 -0
  74. package/data/probe/examples/chat/TRACING.md +226 -0
  75. package/data/probe/examples/chat/appTracer.js +968 -0
  76. package/data/probe/examples/chat/auth.js +76 -0
  77. package/data/probe/examples/chat/bin/probe-chat.js +13 -0
  78. package/data/probe/examples/chat/build.js +104 -0
  79. package/data/probe/examples/chat/cancelRequest.js +84 -0
  80. package/data/probe/examples/chat/demo-agentic-image-flow.js +88 -0
  81. package/data/probe/examples/chat/demo-local-images.js +128 -0
  82. package/data/probe/examples/chat/fileSpanExporter.js +181 -0
  83. package/data/probe/examples/chat/implement/README.md +228 -0
  84. package/data/probe/examples/chat/implement/backends/AiderBackend.js +750 -0
  85. package/data/probe/examples/chat/implement/backends/BaseBackend.js +276 -0
  86. package/data/probe/examples/chat/implement/backends/ClaudeCodeBackend.js +767 -0
  87. package/data/probe/examples/chat/implement/backends/MockBackend.js +237 -0
  88. package/data/probe/examples/chat/implement/backends/registry.js +85 -0
  89. package/data/probe/examples/chat/implement/core/BackendManager.js +567 -0
  90. package/data/probe/examples/chat/implement/core/ImplementTool.js +354 -0
  91. package/data/probe/examples/chat/implement/core/config.js +428 -0
  92. package/data/probe/examples/chat/implement/core/timeouts.js +58 -0
  93. package/data/probe/examples/chat/implement/core/utils.js +496 -0
  94. package/data/probe/examples/chat/implement/types/BackendTypes.js +126 -0
  95. package/data/probe/examples/chat/index.js +669 -0
  96. package/data/probe/examples/chat/mcpServer.js +341 -0
  97. package/data/probe/examples/chat/npm/LICENSE +15 -0
  98. package/data/probe/examples/chat/npm/README.md +168 -0
  99. package/data/probe/examples/chat/npm/bin/probe-chat.js +156 -0
  100. package/data/probe/examples/chat/npm/index.js +259 -0
  101. package/data/probe/examples/chat/npm/package.json +54 -0
  102. package/data/probe/examples/chat/package.json +102 -0
  103. package/data/probe/examples/chat/probeChat.js +456 -0
  104. package/data/probe/examples/chat/probeTool.js +491 -0
  105. package/data/probe/examples/chat/storage/JsonChatStorage.js +476 -0
  106. package/data/probe/examples/chat/telemetry.js +281 -0
  107. package/data/probe/examples/chat/test/integration/chatFlows.test.js +320 -0
  108. package/data/probe/examples/chat/test/integration/toolCalling.test.js +471 -0
  109. package/data/probe/examples/chat/test/mocks/mockLLMProvider.js +269 -0
  110. package/data/probe/examples/chat/test/test-backends.js +90 -0
  111. package/data/probe/examples/chat/test/testUtils.js +530 -0
  112. package/data/probe/examples/chat/test/unit/backendTimeout.test.js +161 -0
  113. package/data/probe/examples/chat/test/unit/packageFiles.test.js +120 -0
  114. package/data/probe/examples/chat/test/verify-tests.js +118 -0
  115. package/data/probe/examples/chat/test-agentic-image-loading.js +294 -0
  116. package/data/probe/examples/chat/test-ai-sdk-telemetry.js +204 -0
  117. package/data/probe/examples/chat/test-chat-tracing.js +38 -0
  118. package/data/probe/examples/chat/test-direct-function.js +49 -0
  119. package/data/probe/examples/chat/test-file-size-validation.js +103 -0
  120. package/data/probe/examples/chat/test-full-mcp-integration.js +258 -0
  121. package/data/probe/examples/chat/test-github-context.txt +12 -0
  122. package/data/probe/examples/chat/test-hierarchy.js +203 -0
  123. package/data/probe/examples/chat/test-image-spans.js +37 -0
  124. package/data/probe/examples/chat/test-local-image-reading.js +176 -0
  125. package/data/probe/examples/chat/test-mcp-integration.js +136 -0
  126. package/data/probe/examples/chat/test-mcp-probe-server.js +161 -0
  127. package/data/probe/examples/chat/test-mcp-with-ai.js +279 -0
  128. package/data/probe/examples/chat/test-multiple-allowed-dirs.js +111 -0
  129. package/data/probe/examples/chat/test-probe-mcp-server.js +110 -0
  130. package/data/probe/examples/chat/test-security-validation.js +145 -0
  131. package/data/probe/examples/chat/test-simple-tracing.js +32 -0
  132. package/data/probe/examples/chat/test-trace-verification.js +235 -0
  133. package/data/probe/examples/chat/test-tracing.js +114 -0
  134. package/data/probe/examples/chat/tokenCounter.js +419 -0
  135. package/data/probe/examples/chat/tokenUsageDisplay.js +134 -0
  136. package/data/probe/examples/chat/webServer.js +1103 -0
  137. package/data/probe/examples/reranker/Cargo.toml +33 -0
  138. package/data/probe/examples/reranker/DEBUG_OUTPUT_ANALYSIS.md +71 -0
  139. package/data/probe/examples/reranker/MODELS.md +66 -0
  140. package/data/probe/examples/reranker/MODEL_COMPARISON.md +60 -0
  141. package/data/probe/examples/reranker/MULTI_MODEL_ANALYSIS.md +176 -0
  142. package/data/probe/examples/reranker/PERFORMANCE_SUMMARY.md +156 -0
  143. package/data/probe/examples/reranker/README.md +347 -0
  144. package/data/probe/examples/reranker/RUST_BERT_COMPARISON.md +82 -0
  145. package/data/probe/examples/reranker/TOKENIZATION_GUIDE.md +120 -0
  146. package/data/probe/examples/reranker/check_rust_tokenizer.py +108 -0
  147. package/data/probe/examples/reranker/convert_to_torchscript.py +109 -0
  148. package/data/probe/examples/reranker/debug_scoring.py +189 -0
  149. package/data/probe/examples/reranker/debug_tokenization.py +154 -0
  150. package/data/probe/examples/reranker/download_models.sh +73 -0
  151. package/data/probe/examples/reranker/requirements.txt +13 -0
  152. package/data/probe/examples/reranker/run_comprehensive_benchmark.sh +83 -0
  153. package/data/probe/examples/reranker/rust_bert_test/Cargo.toml +12 -0
  154. package/data/probe/examples/reranker/rust_bert_test/README.md +54 -0
  155. package/data/probe/examples/reranker/simple_test.py +50 -0
  156. package/data/probe/examples/reranker/test_all_models.sh +63 -0
  157. package/data/probe/examples/reranker/test_bert_results.sh +44 -0
  158. package/data/probe/examples/reranker/test_cross_encoder.py +334 -0
  159. package/data/probe/examples/reranker/test_cross_encoder.sh +80 -0
  160. package/data/probe/examples/reranker/test_exact_comparison.py +151 -0
  161. package/data/probe/examples/reranker/test_parallel_performance.sh +56 -0
  162. package/data/probe/examples/reranker/test_scores.py +132 -0
  163. package/data/probe/install.ps1 +508 -0
  164. package/data/probe/install.sh +460 -0
  165. package/data/probe/npm/CLONE_METHOD_EXAMPLES.md +596 -0
  166. package/data/probe/npm/CONTEXT_COMPACTION.md +303 -0
  167. package/data/probe/npm/DELEGATE_TOOL_README.md +166 -0
  168. package/data/probe/npm/MAID_INTEGRATION.md +313 -0
  169. package/data/probe/npm/MCP_INTEGRATION_SUMMARY.md +241 -0
  170. package/data/probe/npm/README.md +824 -0
  171. package/data/probe/npm/bin/.gitignore +7 -0
  172. package/data/probe/npm/bin/.gitkeep +0 -0
  173. package/data/probe/npm/bin/README.md +12 -0
  174. package/data/probe/npm/bin/probe +167 -0
  175. package/data/probe/npm/docs/CLAUDE_CODE_INTEGRATION.md +414 -0
  176. package/data/probe/npm/docs/CODEX_INTEGRATION.md +502 -0
  177. package/data/probe/npm/docs/EDIT_CREATE_TOOLS.md +233 -0
  178. package/data/probe/npm/docs/RETRY_AND_FALLBACK.md +674 -0
  179. package/data/probe/npm/example-usage.js +335 -0
  180. package/data/probe/npm/examples/multi-engine-demo.js +117 -0
  181. package/data/probe/npm/examples/probe-agent-cli.js +113 -0
  182. package/data/probe/npm/examples/test-agent-edit.js +114 -0
  183. package/data/probe/npm/examples/test-edit-create.js +120 -0
  184. package/data/probe/npm/examples/test-edit-direct.js +114 -0
  185. package/data/probe/npm/index.d.ts +744 -0
  186. package/data/probe/npm/jest.config.js +52 -0
  187. package/data/probe/npm/package.json +117 -0
  188. package/data/probe/npm/scripts/build-agent.cjs +75 -0
  189. package/data/probe/npm/scripts/build-cjs.js +124 -0
  190. package/data/probe/npm/scripts/build-mcp.cjs +36 -0
  191. package/data/probe/npm/scripts/postinstall.js +216 -0
  192. package/data/probe/npm/test-codex-e2e.js +78 -0
  193. package/data/probe/npm/test-download-lock.js +109 -0
  194. package/data/probe/npm/test-grep-security.js +94 -0
  195. package/data/probe/npm/test-grep-simplified.js +63 -0
  196. package/data/probe/npm/test-grep.js +51 -0
  197. package/data/probe/npm/tests/README.md +96 -0
  198. package/data/probe/npm/tests/agent-compact-history.test.js +174 -0
  199. package/data/probe/npm/tests/allow-tests-default.test.js +151 -0
  200. package/data/probe/npm/tests/contextCompactor.test.js +498 -0
  201. package/data/probe/npm/tests/delegate-config.test.js +353 -0
  202. package/data/probe/npm/tests/delegate-integration.test.js +348 -0
  203. package/data/probe/npm/tests/extractor-integration.test.js +162 -0
  204. package/data/probe/npm/tests/extractor.test.js +317 -0
  205. package/data/probe/npm/tests/fixtures/sampleDiagrams.js +267 -0
  206. package/data/probe/npm/tests/integration/claude-code-auto-fallback.spec.js +148 -0
  207. package/data/probe/npm/tests/integration/claude-code-multi-step.spec.js +127 -0
  208. package/data/probe/npm/tests/integration/claude-code-tool-events.spec.js +163 -0
  209. package/data/probe/npm/tests/integration/codex-auto-fallback.spec.js +191 -0
  210. package/data/probe/npm/tests/integration/codex-tool-events.spec.js +147 -0
  211. package/data/probe/npm/tests/integration/examplesChatMcp.test.js +402 -0
  212. package/data/probe/npm/tests/integration/mcpDotenvSupport.test.js +174 -0
  213. package/data/probe/npm/tests/integration/mcpErrorHandling.test.js +566 -0
  214. package/data/probe/npm/tests/integration/mcpRobustness.test.js +564 -0
  215. package/data/probe/npm/tests/integration/mcpStdoutPurity.test.js +355 -0
  216. package/data/probe/npm/tests/integration/probeAgentMcp.test.js +398 -0
  217. package/data/probe/npm/tests/integration/retryFallback.test.js +368 -0
  218. package/data/probe/npm/tests/integration/schema-in-initial-message.test.js +318 -0
  219. package/data/probe/npm/tests/integration/schema-validation-loop-prevention.test.js +244 -0
  220. package/data/probe/npm/tests/integration/schemaRetryLogic.test.js +94 -0
  221. package/data/probe/npm/tests/integration/validationFlow.test.js +329 -0
  222. package/data/probe/npm/tests/manual/test-codex-basic.js +110 -0
  223. package/data/probe/npm/tests/mcp/mcpClientManager.test.js +614 -0
  224. package/data/probe/npm/tests/mcp/mcpConfig.test.js +359 -0
  225. package/data/probe/npm/tests/mcp/mcpXmlBridge.test.js +436 -0
  226. package/data/probe/npm/tests/mcp/mockMcpServer.js +510 -0
  227. package/data/probe/npm/tests/mcp-strict-syntax.test.js +319 -0
  228. package/data/probe/npm/tests/mermaidQuoteEscaping.test.js +214 -0
  229. package/data/probe/npm/tests/nestedQuoteFix.test.js +40 -0
  230. package/data/probe/npm/tests/setup.js +46 -0
  231. package/data/probe/npm/tests/unit/allowed-tools.test.js +513 -0
  232. package/data/probe/npm/tests/unit/attempt-completion-closing-tag-in-content.test.js +188 -0
  233. package/data/probe/npm/tests/unit/attemptCompletionJsonFix.test.js +238 -0
  234. package/data/probe/npm/tests/unit/attemptCompletionJsonIssue.test.js +128 -0
  235. package/data/probe/npm/tests/unit/backtickAutoFix.test.js +35 -0
  236. package/data/probe/npm/tests/unit/bash-probe-agent-integration.test.js +389 -0
  237. package/data/probe/npm/tests/unit/bash-simple-commands.test.js +324 -0
  238. package/data/probe/npm/tests/unit/bash-tool-comprehensive.test.js +371 -0
  239. package/data/probe/npm/tests/unit/bash-tool-integration.test.js +310 -0
  240. package/data/probe/npm/tests/unit/bash-tool.test.js +341 -0
  241. package/data/probe/npm/tests/unit/completion-prompt.test.js +379 -0
  242. package/data/probe/npm/tests/unit/cwd-path-options.test.js +287 -0
  243. package/data/probe/npm/tests/unit/delegate-limits.test.js +422 -0
  244. package/data/probe/npm/tests/unit/direct-content-attempt-completion.test.js +235 -0
  245. package/data/probe/npm/tests/unit/edit-create-tools.test.js +609 -0
  246. package/data/probe/npm/tests/unit/enhancedMermaidValidation.test.js +577 -0
  247. package/data/probe/npm/tests/unit/extract-content.test.js +83 -0
  248. package/data/probe/npm/tests/unit/extract-multiple-targets.test.js +89 -0
  249. package/data/probe/npm/tests/unit/fallbackManager.test.js +442 -0
  250. package/data/probe/npm/tests/unit/githubCompatibilityValidation.test.js +258 -0
  251. package/data/probe/npm/tests/unit/imageConfig.test.js +149 -0
  252. package/data/probe/npm/tests/unit/imagePathResolution.test.js +345 -0
  253. package/data/probe/npm/tests/unit/json-fixing-agent.test.js +238 -0
  254. package/data/probe/npm/tests/unit/json-validation-enhanced-errors.test.js +199 -0
  255. package/data/probe/npm/tests/unit/jsonValidationInfiniteLoopFix.test.js +228 -0
  256. package/data/probe/npm/tests/unit/maidIntegration.test.js +139 -0
  257. package/data/probe/npm/tests/unit/maxIterationsWarning.test.js +195 -0
  258. package/data/probe/npm/tests/unit/mermaidEdgeLabelFix.test.js +161 -0
  259. package/data/probe/npm/tests/unit/mermaidHtmlEntities.test.js +76 -0
  260. package/data/probe/npm/tests/unit/mermaidInfiniteLoopFix.test.js +64 -0
  261. package/data/probe/npm/tests/unit/mermaidValidation.test.js +723 -0
  262. package/data/probe/npm/tests/unit/mermaidValidationVisorExample.test.js +309 -0
  263. package/data/probe/npm/tests/unit/probe-agent-clone-realistic.test.js +643 -0
  264. package/data/probe/npm/tests/unit/probe-agent-clone.test.js +476 -0
  265. package/data/probe/npm/tests/unit/probe-agent-delegate.test.js +400 -0
  266. package/data/probe/npm/tests/unit/probe-agent-model-option.test.js +118 -0
  267. package/data/probe/npm/tests/unit/probeTool-security.test.js +283 -0
  268. package/data/probe/npm/tests/unit/readImageTool.test.js +418 -0
  269. package/data/probe/npm/tests/unit/retryManager.test.js +317 -0
  270. package/data/probe/npm/tests/unit/schema-aware-reminders.test.js +288 -0
  271. package/data/probe/npm/tests/unit/schemaDefinitionDetection.test.js +115 -0
  272. package/data/probe/npm/tests/unit/schemaUtils.test.js +1268 -0
  273. package/data/probe/npm/tests/unit/simpleTelemetry.test.js +282 -0
  274. package/data/probe/npm/tests/unit/simplified-attempt-completion.test.js +274 -0
  275. package/data/probe/npm/tests/unit/single-quote-json-bug.test.js +231 -0
  276. package/data/probe/npm/tests/unit/subgraphAutoFix.test.js +110 -0
  277. package/data/probe/npm/tests/unit/system-prompt.test.js +32 -0
  278. package/data/probe/npm/tests/unit/types-probe-agent-options.test.js +42 -0
  279. package/data/probe/npm/tests/unit/xmlParsing.test.js +720 -0
  280. package/data/probe/npm/tsconfig.json +21 -0
  281. package/data/probe/result1.txt +19 -0
  282. package/data/probe/result2.txt +26 -0
  283. package/data/probe/scripts/benchmark.sh +270 -0
  284. package/data/probe/scripts/cache_memory_analysis.rs +844 -0
  285. package/data/probe/scripts/claude-hook-wrapper.sh +56 -0
  286. package/data/probe/site/.env.example +10 -0
  287. package/data/probe/site/DEPLOYMENT.md +86 -0
  288. package/data/probe/site/README.md +183 -0
  289. package/data/probe/site/adding-languages.md +135 -0
  290. package/data/probe/site/ai-chat.md +427 -0
  291. package/data/probe/site/ai-integration.md +1488 -0
  292. package/data/probe/site/blog/agentic-flow-custom-xml-protocol.md +407 -0
  293. package/data/probe/site/blog/index.md +118 -0
  294. package/data/probe/site/blog/v0.6.0-release.md +426 -0
  295. package/data/probe/site/blog.md +8 -0
  296. package/data/probe/site/changelog.md +200 -0
  297. package/data/probe/site/cli-mode.md +437 -0
  298. package/data/probe/site/code-extraction.md +436 -0
  299. package/data/probe/site/contributing/README.md +9 -0
  300. package/data/probe/site/contributing/documentation-cross-references.md +215 -0
  301. package/data/probe/site/contributing/documentation-maintenance.md +275 -0
  302. package/data/probe/site/contributing/documentation-structure.md +75 -0
  303. package/data/probe/site/documentation-cross-references.md +215 -0
  304. package/data/probe/site/documentation-guide.md +132 -0
  305. package/data/probe/site/documentation-maintenance.md +275 -0
  306. package/data/probe/site/features.md +147 -0
  307. package/data/probe/site/how-it-works.md +118 -0
  308. package/data/probe/site/index.md +175 -0
  309. package/data/probe/site/index.md.bak +133 -0
  310. package/data/probe/site/installation.md +235 -0
  311. package/data/probe/site/integrations/docker.md +248 -0
  312. package/data/probe/site/integrations/github-actions.md +413 -0
  313. package/data/probe/site/language-support-overview.md +168 -0
  314. package/data/probe/site/mcp-integration.md +587 -0
  315. package/data/probe/site/mcp-server.md +304 -0
  316. package/data/probe/site/navigation-structure.md +76 -0
  317. package/data/probe/site/nodejs-sdk.md +798 -0
  318. package/data/probe/site/output-formats.md +625 -0
  319. package/data/probe/site/package.json +21 -0
  320. package/data/probe/site/public/_headers +28 -0
  321. package/data/probe/site/public/_redirects +11 -0
  322. package/data/probe/site/quick-start.md +289 -0
  323. package/data/probe/site/search-functionality.md +291 -0
  324. package/data/probe/site/search-reference.md +291 -0
  325. package/data/probe/site/supported-languages.md +215 -0
  326. package/data/probe/site/use-cases/README.md +8 -0
  327. package/data/probe/site/use-cases/advanced-cli.md +253 -0
  328. package/data/probe/site/use-cases/ai-code-editors.md +239 -0
  329. package/data/probe/site/use-cases/building-ai-tools.md +529 -0
  330. package/data/probe/site/use-cases/cli-ai-workflows.md +285 -0
  331. package/data/probe/site/use-cases/deploying-probe-web-interface.md +255 -0
  332. package/data/probe/site/use-cases/integrating-probe-into-ai-code-editors.md +161 -0
  333. package/data/probe/site/use-cases/nodejs-sdk.md +596 -0
  334. package/data/probe/site/use-cases/team-chat.md +350 -0
  335. package/data/probe/site/web-interface.md +434 -0
  336. package/data/probe/site/wrangler.toml +9 -0
  337. package/data/probe/test-api-key.sh +1 -0
  338. package/data/probe/test-probe-implementation/hello.js +7 -0
  339. package/data/probe/test_cases/demonstrate_early_termination_issues.sh +176 -0
  340. package/data/probe/test_cases/early_termination_issues.rs +533 -0
  341. package/data/probe/test_data/test_nested_struct.go +26 -0
  342. package/data/probe/tests/README.md +286 -0
  343. package/data/probe/tests/README_search_determinism_tests.md +116 -0
  344. package/data/probe/tests/adjacent_comment_test.rs +152 -0
  345. package/data/probe/tests/apostrophe_handling_tests.rs +132 -0
  346. package/data/probe/tests/block_filtering_with_ast_tests.rs +669 -0
  347. package/data/probe/tests/block_merging_tests.rs +396 -0
  348. package/data/probe/tests/c_outline_format_tests.rs +2179 -0
  349. package/data/probe/tests/cache_invalidation_issues.rs.disabled +682 -0
  350. package/data/probe/tests/cache_order_tests.rs +147 -0
  351. package/data/probe/tests/cache_query_scoping_tests.rs +221 -0
  352. package/data/probe/tests/cli_tests.rs +680 -0
  353. package/data/probe/tests/comment_context_integration_test.rs +240 -0
  354. package/data/probe/tests/common.rs +33 -0
  355. package/data/probe/tests/complex_block_merging_tests.rs +599 -0
  356. package/data/probe/tests/complex_query_block_filtering_tests.rs +422 -0
  357. package/data/probe/tests/control_flow_closing_braces_test.rs +91 -0
  358. package/data/probe/tests/cpp_outline_format_tests.rs +1507 -0
  359. package/data/probe/tests/csharp_outline_format_tests.rs +941 -0
  360. package/data/probe/tests/elastic_query_integration_tests.rs +922 -0
  361. package/data/probe/tests/extract_command_tests.rs +1848 -0
  362. package/data/probe/tests/extract_deduplication_tests.rs +146 -0
  363. package/data/probe/tests/extract_input_file_tests.rs +84 -0
  364. package/data/probe/tests/extract_prompt_tests.rs +102 -0
  365. package/data/probe/tests/filename_search_tests.rs +96 -0
  366. package/data/probe/tests/fixtures/user/AssemblyInfo.cs +3 -0
  367. package/data/probe/tests/github_extract_tests.rs +234 -0
  368. package/data/probe/tests/go_comment_test.rs +253 -0
  369. package/data/probe/tests/go_outline_format_tests.rs +2587 -0
  370. package/data/probe/tests/go_path_resolver_tests.rs +96 -0
  371. package/data/probe/tests/html_outline_format_tests.rs +637 -0
  372. package/data/probe/tests/integration_tests.rs +837 -0
  373. package/data/probe/tests/ip_whitelist_test.rs +148 -0
  374. package/data/probe/tests/java_outline_format_tests.rs +1611 -0
  375. package/data/probe/tests/javascript_extract_tests.rs +315 -0
  376. package/data/probe/tests/javascript_outline_format_tests.rs +1464 -0
  377. package/data/probe/tests/json_format_tests.rs +436 -0
  378. package/data/probe/tests/json_schema_validation_tests.rs +450 -0
  379. package/data/probe/tests/lib_usage.rs +60 -0
  380. package/data/probe/tests/line_comment_context_extension_test.rs +459 -0
  381. package/data/probe/tests/line_map_cache_tests.rs +114 -0
  382. package/data/probe/tests/markdown_integration_tests.rs +190 -0
  383. package/data/probe/tests/mocks/test_ip_whitelist.go +11 -0
  384. package/data/probe/tests/mocks/test_object.js +27 -0
  385. package/data/probe/tests/mocks/test_struct.go +50 -0
  386. package/data/probe/tests/multi_keyword_pattern_tests.rs +464 -0
  387. package/data/probe/tests/multi_language_syntax_integration_tests.rs +218 -0
  388. package/data/probe/tests/multiple_capture_groups_tests.rs +169 -0
  389. package/data/probe/tests/negative_compound_word_tests.rs +246 -0
  390. package/data/probe/tests/nested_symbol_extraction_tests.rs +99 -0
  391. package/data/probe/tests/outline_cross_file_interference_test.rs +335 -0
  392. package/data/probe/tests/outline_keyword_preservation_test.rs +67 -0
  393. package/data/probe/tests/output_format_edge_cases_tests.rs +693 -0
  394. package/data/probe/tests/parallel_extraction_tests.rs +178 -0
  395. package/data/probe/tests/parallel_search_tests.rs +355 -0
  396. package/data/probe/tests/path_resolver_tests.rs +698 -0
  397. package/data/probe/tests/php_outline_format_extended_tests.rs +928 -0
  398. package/data/probe/tests/php_outline_format_tests.rs +768 -0
  399. package/data/probe/tests/property_tests.proptest-regressions +9 -0
  400. package/data/probe/tests/property_tests.rs +118 -0
  401. package/data/probe/tests/python_outline_format_tests.rs +1538 -0
  402. package/data/probe/tests/query_command_json_tests.rs +438 -0
  403. package/data/probe/tests/query_command_tests.rs +232 -0
  404. package/data/probe/tests/query_command_xml_tests.rs +569 -0
  405. package/data/probe/tests/quoted_term_with_negative_keyword_tests.rs +216 -0
  406. package/data/probe/tests/required_terms_filename_tests.rs +116 -0
  407. package/data/probe/tests/ruby_outline_format_tests.rs +1011 -0
  408. package/data/probe/tests/rust_line_comment_context_test.rs +151 -0
  409. package/data/probe/tests/rust_outline_format_enhanced_tests.rs +725 -0
  410. package/data/probe/tests/rust_outline_format_tests.rs +843 -0
  411. package/data/probe/tests/schemas/xml_output_schema.xsd +38 -0
  412. package/data/probe/tests/search_determinism_tests.rs +451 -0
  413. package/data/probe/tests/search_hints_tests.rs +253 -0
  414. package/data/probe/tests/special_character_escaping_tests.rs +417 -0
  415. package/data/probe/tests/stemming_compound_word_filtering_tests.rs +535 -0
  416. package/data/probe/tests/strict_elastic_syntax_tests.rs +404 -0
  417. package/data/probe/tests/swift_outline_format_tests.rs +3319 -0
  418. package/data/probe/tests/symbols_tests.rs +166 -0
  419. package/data/probe/tests/test_file.rs +45 -0
  420. package/data/probe/tests/test_tokenize.rs +28 -0
  421. package/data/probe/tests/timeout_tests.rs +82 -0
  422. package/data/probe/tests/tokenization_tests.rs +195 -0
  423. package/data/probe/tests/tokenized_block_filtering_tests.rs +174 -0
  424. package/data/probe/tests/typescript_extract_tests.rs +214 -0
  425. package/data/probe/tests/typescript_outline_format_tests.rs +2188 -0
  426. package/data/probe/tests/xml_format_tests.rs +568 -0
  427. package/data/probe/tests/xml_schema_validation_tests.rs +497 -0
  428. package/data/scripts/postinstall.mjs +9 -0
  429. package/data/scripts/set-version.js +0 -0
  430. package/data/scripts/wiki-build.sh +111 -0
  431. package/data/scripts/wiki-deploy.sh +73 -0
  432. package/data/serve.json +12 -0
  433. package/data/test/demo-dynamic.html +134 -0
  434. package/data/test/demo-esm.html +105 -0
  435. package/data/test/demo-iife.html +78 -0
  436. package/data/tsconfig.json +7 -0
  437. package/data/vite.server.ts +483 -0
  438. package/data/vitest.config.ts +40 -0
  439. package/data/wiki/Home.md +58 -0
  440. package/data/wiki/_Sidebar.md +39 -0
  441. package/docs-mcp.config.json +20 -0
  442. package/package.json +56 -0
  443. package/src/config.js +111 -0
  444. package/src/index.js +395 -0
@@ -0,0 +1,347 @@
1
+ # BERT Reranker Example
2
+
3
+ A complete working Rust implementation of a BERT-based document reranker using the Candle framework. This example demonstrates how to use transformer models for document reranking tasks, specifically using the ms-marco-MiniLM-L-2-v2 model.
4
+
5
+ ## Overview
6
+
7
+ This implementation provides a cross-encoder based reranker that:
8
+ - Loads pre-trained BERT models from HuggingFace Hub
9
+ - Processes query-document pairs through the transformer
10
+ - Computes relevance scores for ranking
11
+ - Sorts documents by relevance to the query
12
+
13
+ ## Features
14
+
15
+ - **Pure Rust Implementation**: Uses the Candle framework for ML inference
16
+ - **HuggingFace Integration**: Automatic model and tokenizer downloading
17
+ - **Cross-Encoder Architecture**: Proper query-document pair encoding
18
+ - **Flexible Model Support**: Works with various BERT-based reranking models
19
+ - **Interactive Mode**: Test reranking with custom queries and documents
20
+ - **Command Line Interface**: Easy to use from command line or scripts
21
+
22
+ ## Installation and Setup
23
+
24
+ ### Prerequisites
25
+
26
+ - Rust 1.70 or later
27
+ - Internet connection (for model downloading)
28
+
29
+ ### Building the Project
30
+
31
+ ```bash
32
+ cd examples/reranker
33
+ cargo build --release
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ ### Basic Usage with Default Documents
39
+
40
+ ```bash
41
+ # Use the default ms-marco-MiniLM-L-2-v2 model
42
+ cargo run --release -- --query "machine learning"
43
+
44
+ # Or run the binary directly after building
45
+ ./target/release/reranker --query "rust programming"
46
+ ```
47
+
48
+ ### Using Custom Documents
49
+
50
+ ```bash
51
+ cargo run --release -- \
52
+ --query "natural language processing" \
53
+ --documents "BERT is a transformer model,Python is a programming language,NLP involves text processing,Rust is systems programming"
54
+ ```
55
+
56
+ ### Interactive Mode
57
+
58
+ ```bash
59
+ cargo run --release -- --query "your query here" --interactive
60
+ ```
61
+
62
+ This will prompt you to enter documents one by one, then rerank them.
63
+
64
+ ### Using Different Models
65
+
66
+ ```bash
67
+ # Use a different cross-encoder model
68
+ cargo run --release -- \
69
+ --model "cross-encoder/ms-marco-MiniLM-L-6-v2" \
70
+ --query "information retrieval"
71
+
72
+ # Use PyTorch weights instead of SafeTensors
73
+ cargo run --release -- \
74
+ --model "cross-encoder/ms-marco-MiniLM-L-2-v2" \
75
+ --use-pth \
76
+ --query "document ranking"
77
+ ```
78
+
79
+ ## Supported Models
80
+
81
+ This implementation works with cross-encoder models from HuggingFace Hub. Recommended models:
82
+
83
+ - `cross-encoder/ms-marco-MiniLM-L-2-v2` (default, fast and efficient)
84
+ - `cross-encoder/ms-marco-MiniLM-L-6-v2` (larger, potentially more accurate)
85
+ - `cross-encoder/ms-marco-MiniLM-L-12-v2` (largest, highest accuracy)
86
+
87
+ ## Command Line Options
88
+
89
+ - `--model, -m`: HuggingFace model ID (default: `cross-encoder/ms-marco-MiniLM-L-2-v2`)
90
+ - `--revision, -r`: Model revision/branch (default: `main`)
91
+ - `--use-pth`: Use PyTorch weights instead of SafeTensors
92
+ - `--query, -q`: Search query (required)
93
+ - `--documents, -d`: Comma-separated list of documents to rerank
94
+ - `--interactive, -i`: Run in interactive mode
95
+
96
+ ## Example Output
97
+
98
+ ```
99
+ Initializing BERT Reranker...
100
+ Model: cross-encoder/ms-marco-MiniLM-L-2-v2
101
+ Revision: main
102
+ Using PyTorch weights: false
103
+
104
+ === Example Usage ===
105
+ Query: machine learning
106
+ Documents to rerank:
107
+ 1. Rust is a systems programming language focused on safety and performance.
108
+ 2. Python is a high-level programming language known for its simplicity.
109
+ 3. Machine learning involves training algorithms on data to make predictions.
110
+ 4. BERT is a transformer-based model for natural language understanding.
111
+ 5. The Candle framework provides machine learning capabilities in Rust.
112
+ 6. Cross-encoders are used for reranking tasks in information retrieval.
113
+ 7. Tokenization is the process of breaking text into individual tokens.
114
+ 8. Neural networks consist of interconnected nodes that process information.
115
+
116
+ Loading BERT reranker model: cross-encoder/ms-marco-MiniLM-L-2-v2
117
+ Config file: "/Users/username/.cache/huggingface/hub/models--cross-encoder--ms-marco-MiniLM-L-2-v2/snapshots/main/config.json"
118
+ Tokenizer file: "/Users/username/.cache/huggingface/hub/models--cross-encoder--ms-marco-MiniLM-L-2-v2/snapshots/main/tokenizer.json"
119
+ Weights file: "/Users/username/.cache/huggingface/hub/models--cross-encoder--ms-marco-MiniLM-L-2-v2/snapshots/main/model.safetensors"
120
+ BERT model loaded successfully
121
+
122
+ Reranking 8 documents for query: 'machine learning'
123
+ Reranking completed
124
+
125
+ === Reranking Results ===
126
+ Documents ranked by relevance to query:
127
+ 1. #3: 2.8934 - Machine learning involves training algorithms on data to make predictions.
128
+ 2. #5: 2.1203 - The Candle framework provides machine learning capabilities in Rust.
129
+ 3. #4: 1.9876 - BERT is a transformer-based model for natural language understanding.
130
+ 4. #8: 1.7432 - Neural networks consist of interconnected nodes that process information.
131
+ 5. #6: 1.5621 - Cross-encoders are used for reranking tasks in information retrieval.
132
+ 6. #2: 0.9834 - Python is a high-level programming language known for its simplicity.
133
+ 7. #7: 0.8976 - Tokenization is the process of breaking text into individual tokens.
134
+ 8. #1: 0.7654 - Rust is a systems programming language focused on safety and performance.
135
+ ```
136
+
137
+ ## Architecture Details
138
+
139
+ ### Cross-Encoder Approach
140
+
141
+ This implementation uses a cross-encoder architecture where:
142
+ 1. Query and document are concatenated with a `[SEP]` token
143
+ 2. The combined text is tokenized and fed through BERT
144
+ 3. The `[CLS]` token embedding is used to compute a relevance score
145
+ 4. Documents are ranked by their scores
146
+
147
+ ### Model Components
148
+
149
+ - **Tokenizer**: Converts text to tokens using HuggingFace tokenizers
150
+ - **BERT Model**: Transformer encoder for processing text
151
+ - **Scoring**: Uses the CLS token embedding sum as relevance score
152
+
153
+ ### Performance Considerations
154
+
155
+ - **CPU Inference**: Runs on CPU by default (GPU support can be added)
156
+ - **Memory Usage**: Models are loaded once and reused for multiple queries
157
+ - **Caching**: HuggingFace Hub automatically caches downloaded models
158
+
159
+ ## Extending the Example
160
+
161
+ ### Adding GPU Support
162
+
163
+ To enable GPU acceleration, modify the device initialization:
164
+
165
+ ```rust
166
+ let device = Device::new_cuda(0)?; // Use GPU 0
167
+ // or
168
+ let device = Device::new_metal(0)?; // Use Metal on macOS
169
+ ```
170
+
171
+ ### Custom Scoring Functions
172
+
173
+ The current implementation uses a simple sum of CLS embeddings. For production use, consider:
174
+ - Adding a linear classification head
175
+ - Using cosine similarity between query and document embeddings
176
+ - Implementing attention-based scoring mechanisms
177
+
178
+ ### Batch Processing
179
+
180
+ For better performance with multiple documents, implement batch processing:
181
+
182
+ ```rust
183
+ // Process multiple query-document pairs simultaneously
184
+ fn batch_rerank(&self, query: &str, documents: &[&str]) -> Result<Vec<f32>> {
185
+ // Implementation for batch processing
186
+ }
187
+ ```
188
+
189
+ ## Troubleshooting
190
+
191
+ ### Common Issues
192
+
193
+ 1. **Model Download Failures**
194
+ - Check internet connection
195
+ - Verify model ID exists on HuggingFace Hub
196
+ - Try using `--use-pth` flag if SafeTensors download fails
197
+ - **For testing**: Use the demo version (`./target/release/demo`) which doesn't require model downloads
198
+
199
+ 2. **Memory Issues**
200
+ - Use smaller models (L-2 instead of L-12)
201
+ - Process documents in smaller batches
202
+ - Reduce sequence length in tokenizer
203
+
204
+ 3. **Performance Issues**
205
+ - Enable GPU support if available
206
+ - Use release builds (`cargo build --release`)
207
+ - Consider model quantization for faster inference
208
+
209
+ 4. **HuggingFace Hub API Issues**
210
+ - Some models may have download restrictions or require authentication
211
+ - The demo version provides the same interface without requiring model downloads
212
+ - Check HuggingFace Hub status if experiencing consistent download failures
213
+
214
+ ### Debug Mode
215
+
216
+ Enable debug logging by setting the `RUST_LOG` environment variable:
217
+
218
+ ```bash
219
+ RUST_LOG=debug cargo run --release -- --query "your query"
220
+ ```
221
+
222
+ ## Integration with Code Search
223
+
224
+ This reranker can be integrated with the main probe code search tool to improve result relevance:
225
+
226
+ ```rust
227
+ // Example integration
228
+ let search_results = probe::search("function authentication")?;
229
+ let documents: Vec<&str> = search_results.iter().map(|r| r.content.as_str()).collect();
230
+ let reranked = reranker.rerank("user authentication", &documents)?;
231
+ ```
232
+
233
+ ## Testing
234
+
235
+ ### Demo Version (No Model Download Required)
236
+
237
+ For quick testing without downloading models, use the demo version:
238
+
239
+ ```bash
240
+ # Build both the real and demo versions
241
+ cargo build --release
242
+
243
+ # Test the demo version with mock reranking
244
+ ./target/release/demo --query "machine learning"
245
+
246
+ # Test interactive mode
247
+ ./target/release/demo --query "neural networks" --interactive
248
+ ```
249
+
250
+ The demo version uses simple word overlap instead of BERT models and demonstrates the complete interface.
251
+
252
+ ### Testing with Real Models
253
+
254
+ Run the test suite:
255
+
256
+ ```bash
257
+ cargo test
258
+ ```
259
+
260
+ For integration tests with actual models:
261
+
262
+ ```bash
263
+ cargo test --release -- --ignored
264
+ ```
265
+
266
+ ## Contributing
267
+
268
+ This is an example implementation demonstrating BERT reranking with Candle. For production use, consider:
269
+ - Adding comprehensive error handling
270
+ - Implementing proper cross-encoder head architecture
271
+ - Adding support for different similarity metrics
272
+ - Optimizing for batch processing and GPU acceleration
273
+
274
+ ## Python Cross-Encoder Testing
275
+
276
+ For debugging and comparing Python vs Rust implementations, several Python testing tools are provided:
277
+
278
+ ### Comprehensive Testing Script
279
+
280
+ ```bash
281
+ # Run comprehensive cross-encoder testing
282
+ ./test_cross_encoder.sh
283
+
284
+ # Or run Python script directly (requires dependencies)
285
+ python3 test_cross_encoder.py
286
+ ```
287
+
288
+ This script:
289
+ - Tests both `transformers` and `sentence-transformers` libraries
290
+ - Shows detailed tokenization analysis (token IDs, attention masks, special tokens)
291
+ - Compares scores between relevant and irrelevant queries
292
+ - Saves results to JSON for further analysis
293
+ - Provides debugging recommendations for Rust implementation
294
+
295
+ ### Quick Debugging Script
296
+
297
+ ```bash
298
+ # Run focused debugging tests
299
+ python3 debug_scoring.py
300
+ ```
301
+
302
+ This minimal script:
303
+ - Tests hardcoded query-document pairs
304
+ - Shows raw logits and final scores
305
+ - Easy to modify for specific test cases
306
+ - Highlights score differences and discrimination quality
307
+
308
+ ### Dependencies
309
+
310
+ Install Python dependencies:
311
+
312
+ ```bash
313
+ pip3 install -r requirements.txt
314
+ ```
315
+
316
+ Required packages:
317
+ - `torch` - PyTorch for model inference
318
+ - `transformers` - HuggingFace transformers library
319
+ - `sentence-transformers` - Cross-encoder wrapper (optional but recommended)
320
+ - `numpy` - Numerical operations
321
+
322
+ ### Test Cases
323
+
324
+ Both scripts test these scenarios by default:
325
+ - **Relevant Query**: "how does authentication work"
326
+ - **Irrelevant Query**: "foobar random nonsense gibberish"
327
+ - **Document**: Authentication-related text snippet
328
+
329
+ Expected behavior:
330
+ - Relevant query should score >0.5 (high relevance)
331
+ - Irrelevant query should score <0.5 (low relevance)
332
+ - Score difference should be significant (>0.1)
333
+
334
+ ### Debugging Rust Implementation
335
+
336
+ Use these Python scripts to debug Rust cross-encoder issues:
337
+
338
+ 1. **Compare tokenization**: Check if token IDs match between Python and Rust
339
+ 2. **Compare raw logits**: Verify model outputs before activation functions
340
+ 3. **Compare final scores**: Check if score calculation methods are identical
341
+ 4. **Model configuration**: Ensure same model version and weights are loaded
342
+
343
+ The Python scripts provide detailed output to help identify where discrepancies occur.
344
+
345
+ ## License
346
+
347
+ This example follows the same license as the main probe project.
@@ -0,0 +1,82 @@
1
+ # Rust-BERT vs Candle for Cross-Encoders
2
+
3
+ ## Summary
4
+
5
+ After investigating rust-bert for cross-encoder support, here are the key findings:
6
+
7
+ ### rust-bert Limitations for Cross-Encoders
8
+
9
+ 1. **No Native Cross-Encoder Support**: rust-bert doesn't have a dedicated cross-encoder pipeline
10
+ 2. **Classification Focus**: The sequence classification pipeline expects discrete labels (POSITIVE/NEGATIVE), not continuous relevance scores
11
+ 3. **Model Format**: Requires TorchScript (.ot) format, not standard PyTorch .bin files
12
+ 4. **Architecture Mismatch**: Cross-encoders output a single score, but rust-bert's classification expects label probabilities
13
+
14
+ ### Our Candle Implementation Advantages
15
+
16
+ 1. **Direct PyTorch Support**: Loads .bin files directly from HuggingFace
17
+ 2. **Custom Architecture**: We implement the exact cross-encoder architecture
18
+ 3. **Raw Scores**: Returns raw logits for scoring, which is what cross-encoders need
19
+ 4. **Flexibility**: Full control over tokenization and model behavior
20
+
21
+ ## Model Availability
22
+
23
+ The MS-MARCO models on HuggingFace include:
24
+ - PyTorch formats: `pytorch_model.bin`, `model.safetensors`
25
+ - ONNX formats: Multiple optimized ONNX versions
26
+ - No TorchScript (.ot) versions available
27
+
28
+ ## Conversion Options
29
+
30
+ ### 1. PyTorch to TorchScript
31
+ ```python
32
+ # See convert_to_torchscript.py
33
+ traced_model = torch.jit.trace(model, example_inputs)
34
+ traced_model.save("rust_model.ot")
35
+ ```
36
+
37
+ ### 2. Use ONNX Runtime
38
+ Instead of rust-bert, consider using ONNX Runtime with Rust bindings:
39
+ ```toml
40
+ [dependencies]
41
+ ort = "1.16" # ONNX Runtime for Rust
42
+ ```
43
+
44
+ ### 3. Continue with Candle
45
+ Our current Candle implementation is actually well-suited for cross-encoders.
46
+
47
+ ## Recommendation
48
+
49
+ **Stay with Candle** for cross-encoder implementation because:
50
+
51
+ 1. It already works correctly with HuggingFace models
52
+ 2. No conversion needed
53
+ 3. Better control over the scoring pipeline
54
+ 4. The issue isn't with Candle - it's that TinyBERT (4M params) is too small
55
+
56
+ **To improve results:**
57
+ 1. Switch to a larger model (MiniLM-L-6-v2 with 85M params)
58
+ 2. Make the model configurable via CLI
59
+ 3. Consider adding ONNX support as an alternative backend
60
+
61
+ ## Code Comparison
62
+
63
+ ### rust-bert Approach (Would Require Modifications)
64
+ ```rust
65
+ // rust-bert expects classification, not scoring
66
+ let config = SequenceClassificationConfig { ... };
67
+ let model = SequenceClassificationModel::new(config)?;
68
+ let output = model.predict(&[text]); // Returns Label with probability
69
+ ```
70
+
71
+ ### Our Candle Approach (Current)
72
+ ```rust
73
+ // Direct cross-encoder implementation
74
+ let bert_outputs = self.bert.forward(&input_ids, &attention_mask, token_type_ids.as_ref())?;
75
+ let cls_output = bert_outputs.i((.., 0, ..))?;
76
+ let logits = self.classifier.forward(&cls_output)?;
77
+ let score = logits.i((0, 0))?.to_scalar::<f32>()?; // Raw relevance score
78
+ ```
79
+
80
+ ## Conclusion
81
+
82
+ rust-bert isn't suitable for cross-encoder models without significant modifications. Our Candle implementation is the right approach. The scoring issues are due to model size (TinyBERT), not the implementation framework.
@@ -0,0 +1,120 @@
1
+ # Cross-Encoder Tokenization Guide for Rust Implementation
2
+
3
+ ## Critical Points for Correct Implementation
4
+
5
+ ### 1. **Use `encode_pair()` NOT Manual Concatenation**
6
+
7
+ āŒ **WRONG** (Manual concatenation):
8
+ ```rust
9
+ let text = format!("{} [SEP] {}", query, document);
10
+ let encoding = tokenizer.encode(text, true)?;
11
+ ```
12
+
13
+ ✅ **CORRECT** (Tokenizer pair encoding):
14
+ ```rust
15
+ let encoding = tokenizer.encode((query, document), true)?;
16
+ ```
17
+
18
+ ### 2. **Why This Matters**
19
+
20
+ When you use `encode_pair()`, the tokenizer:
21
+ - Automatically adds [CLS] at the start
22
+ - Adds [SEP] after the query
23
+ - Adds [SEP] at the end (for BERT)
24
+ - **Correctly sets token_type_ids**: 0 for query, 1 for document
25
+ - Handles special tokens properly
26
+
27
+ Manual concatenation will:
28
+ - Add extra [SEP] tokens (you get [SEP] [SEP] in the middle)
29
+ - Set ALL token_type_ids to 0 (incorrect!)
30
+ - Produce different tokenization due to whitespace handling
31
+
32
+ ### 3. **Expected Token Structure**
33
+
34
+ For input:
35
+ - Query: "how does authentication work"
36
+ - Document: "Authentication is the process..."
37
+
38
+ The correct tokenization should be:
39
+ ```
40
+ [CLS] how does authentication work [SEP] authentication is the process ... [SEP]
41
+ 0 0 0 0 0 0 1 1 1 1 ... 1
42
+ ```
43
+
44
+ Token type IDs:
45
+ - 0 = Query segment (including [CLS] and first [SEP])
46
+ - 1 = Document segment (including final [SEP])
47
+
48
+ ### 4. **Special Token IDs (for BERT)**
49
+
50
+ ```
51
+ [CLS] = 101
52
+ [SEP] = 102
53
+ [PAD] = 0
54
+ ```
55
+
56
+ ### 5. **Verification in Rust**
57
+
58
+ ```rust
59
+ // After tokenization, check:
60
+ let token_ids = encoding.get_ids();
61
+ let type_ids = encoding.get_type_ids();
62
+
63
+ // First token should be [CLS] (101)
64
+ assert_eq!(token_ids[0], 101);
65
+
66
+ // Look for [SEP] tokens (102)
67
+ let sep_positions: Vec<_> = token_ids.iter()
68
+ .enumerate()
69
+ .filter(|(_, &id)| id == 102)
70
+ .map(|(i, _)| i)
71
+ .collect();
72
+
73
+ // Should have 2 [SEP] tokens for pair encoding
74
+ assert_eq!(sep_positions.len(), 2);
75
+
76
+ // Check token type IDs switch from 0 to 1 after first [SEP]
77
+ if let Some(first_sep) = sep_positions.first() {
78
+ // Tokens before first [SEP] should have type 0
79
+ // Tokens after should have type 1
80
+ }
81
+ ```
82
+
83
+ ### 6. **Common Issues**
84
+
85
+ 1. **Using wrong tokenizer**: Make sure you load tokenizer.json from the same model directory
86
+ 2. **Not using pair encoding**: Always use `encode_pair()` for cross-encoders
87
+ 3. **Missing token type IDs**: These are crucial for BERT to understand query vs document
88
+
89
+ ### 7. **Score Differences**
90
+
91
+ If you see different scores between Python and Rust:
92
+ 1. First check tokenization matches exactly (same token IDs)
93
+ 2. Check token type IDs are correct (0 for query, 1 for document)
94
+ 3. Verify attention masks are the same
95
+ 4. Ensure model weights loaded correctly
96
+
97
+ ### 8. **Debug Output**
98
+
99
+ Add this to your Rust code to debug:
100
+ ```rust
101
+ println!("Token IDs: {:?}", encoding.get_ids());
102
+ println!("Type IDs: {:?}", encoding.get_type_ids());
103
+ println!("Attention mask: {:?}", encoding.get_attention_mask());
104
+ ```
105
+
106
+ Compare with Python:
107
+ ```python
108
+ print(f"Token IDs: {encoding['input_ids'][0].tolist()}")
109
+ print(f"Type IDs: {encoding['token_type_ids'][0].tolist()}")
110
+ print(f"Attention: {encoding['attention_mask'][0].tolist()}")
111
+ ```
112
+
113
+ ## Summary
114
+
115
+ The key issue is likely that our Rust implementation was using manual concatenation instead of proper pair encoding. This would result in:
116
+ - Wrong token type IDs (all 0s instead of 0s and 1s)
117
+ - Extra [SEP] tokens
118
+ - Different tokenization
119
+
120
+ Fixing this should improve the model's ability to distinguish between query and document, leading to better discrimination between relevant and irrelevant queries.
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Check what tokenizer files our Rust implementation is using
4
+ and compare with Python tokenizer output.
5
+ """
6
+
7
+ import json
8
+ import os
9
+
10
+ print("="*80)
11
+ print("CHECKING RUST TOKENIZER CONFIGURATION")
12
+ print("="*80)
13
+
14
+ # Check what tokenizer files we have
15
+ tokenizer_path = "models/ms-marco-TinyBERT-L-2-v2/tokenizer.json"
16
+
17
+ if os.path.exists(tokenizer_path):
18
+     print(f"✓ Found tokenizer at: {tokenizer_path}")
19
+
20
+ # Load and inspect the tokenizer
21
+ with open(tokenizer_path, 'r') as f:
22
+ tokenizer_data = json.load(f)
23
+
24
+ print("\n--- TOKENIZER STRUCTURE ---")
25
+ print(f"Tokenizer type: {tokenizer_data.get('model', {}).get('type', 'Unknown')}")
26
+
27
+ # Check for special tokens
28
+ if 'added_tokens' in tokenizer_data:
29
+ print("\nSpecial tokens:")
30
+ for token in tokenizer_data['added_tokens'][:10]: # Show first 10
31
+ print(f" {token}")
32
+
33
+ # Check post-processor (important for BERT!)
34
+ if 'post_processor' in tokenizer_data:
35
+ post_proc = tokenizer_data['post_processor']
36
+ print(f"\nPost-processor type: {post_proc.get('type', 'Unknown')}")
37
+
38
+ # For BERT, should be TemplateProcessing
39
+ if post_proc.get('type') == 'TemplateProcessing':
40
+ if 'single' in post_proc:
41
+ print(f"Single sequence template: {post_proc['single']}")
42
+ if 'pair' in post_proc:
43
+ print(f"Pair sequence template: {post_proc['pair']}")
44
+ else:
45
+ print(f"āŒ Tokenizer not found at: {tokenizer_path}")
46
+
47
+ # Now let's create a test to verify Rust tokenization
48
+ print("\n" + "="*80)
49
+ print("RUST TOKENIZATION TEST CASES")
50
+ print("="*80)
51
+
52
+ # These should match Python exactly
53
+ test_cases = [
54
+ {
55
+ "name": "Simple pair",
56
+ "query": "how does authentication work",
57
+ "document": "Authentication is the process of verifying the identity of a user.",
58
+ "method": "pair" # tokenizer.encode_pair(query, document)
59
+ },
60
+ {
61
+ "name": "Manual concat (wrong)",
62
+ "text": "how does authentication work [SEP] Authentication is the process of verifying the identity of a user.",
63
+ "method": "single" # tokenizer.encode(text)
64
+ }
65
+ ]
66
+
67
+ print("\nExpected Rust code for correct tokenization:")
68
+ print("```rust")
69
+ print('// CORRECT: Use encode_pair for cross-encoder')
70
+ print('let encoding = tokenizer.encode((query, document), true)?;')
71
+ print('')
72
+ print('// WRONG: Do not manually concatenate')
73
+ print('let text = format!("{} [SEP] {}", query, document);')
74
+ print('let encoding = tokenizer.encode(text, true)?;')
75
+ print("```")
76
+
77
+ # Key differences to check
78
+ print("\n--- KEY THINGS TO VERIFY IN RUST ---")
79
+ print("1. Token IDs match exactly")
80
+ print("2. Token type IDs are generated correctly:")
81
+ print(" - 0 for query tokens (including [CLS])")
82
+ print(" - 0 for first [SEP]")
83
+ print(" - 1 for document tokens")
84
+ print(" - 1 for final [SEP] (if present)")
85
+ print("3. Special tokens are in the right positions")
86
+ print("4. Padding is handled correctly")
87
+
88
+ # Load Python results if available
89
+ if os.path.exists("tokenizer_debug_info.json"):
90
+ with open("tokenizer_debug_info.json", 'r') as f:
91
+ python_info = json.load(f)
92
+
93
+ print("\n--- PYTHON REFERENCE ---")
94
+ print(f"Query: '{python_info['test_case']['query']}'")
95
+ print(f"Document: '{python_info['test_case']['document']}'")
96
+ print(f"Correct score: {python_info['test_case']['correct_score']:.6f}")
97
+ print(f"Manual concat score: {python_info['test_case']['manual_concat_score']:.6f}")
98
+
99
+ # Show first 20 tokens
100
+ ids = python_info['test_case']['correct_input_ids'][:20]
101
+ types = python_info['test_case']['correct_token_types'][:20] if python_info['test_case']['correct_token_types'] else None
102
+
103
+ print("\nFirst 20 tokens (Python):")
104
+ print(f"IDs: {ids}")
105
+ if types:
106
+ print(f"Types: {types}")
107
+
108
+ print("\n✅ Your Rust implementation should produce these EXACT token IDs and types!")