@crownpeak/dqm-react-component-dev-mcp 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444) hide show
  1. package/README.md +138 -0
  2. package/data/.env.example +22 -0
  3. package/data/.gitattributes +47 -0
  4. package/data/.glfrc.json +7 -0
  5. package/data/.husky/pre-commit +5 -0
  6. package/data/.nvmrc +1 -0
  7. package/data/CHANGELOG.md +75 -0
  8. package/data/CODE_OF_CONDUCT.md +129 -0
  9. package/data/CONTRIBUTING.md +203 -0
  10. package/data/DOCS-STRUCTURE.md +307 -0
  11. package/data/I18N.md +292 -0
  12. package/data/LICENSE +22 -0
  13. package/data/README.md +315 -0
  14. package/data/SECURITY.md +125 -0
  15. package/data/WIKI-DEPLOYMENT.md +348 -0
  16. package/data/docs/AI-FEATURES.md +610 -0
  17. package/data/docs/API-REFERENCE.md +1022 -0
  18. package/data/docs/AUTHENTICATION.md +301 -0
  19. package/data/docs/BACKEND-API.md +468 -0
  20. package/data/docs/DEVELOPMENT.md +375 -0
  21. package/data/docs/EXAMPLES.md +622 -0
  22. package/data/docs/MCP-SERVER.md +307 -0
  23. package/data/docs/MIGRATION-GUIDE.md +367 -0
  24. package/data/docs/NPM-PUBLISH.md +193 -0
  25. package/data/docs/QUICKSTART.md +206 -0
  26. package/data/docs/REDIS-SETUP.md +162 -0
  27. package/data/docs/SERVER.md +228 -0
  28. package/data/docs/TROUBLESHOOTING.md +657 -0
  29. package/data/docs/WIDGET-GUIDE.md +638 -0
  30. package/data/docs/WIKI-HOME.md +58 -0
  31. package/data/docs/WIKI-SIDEBAR.md +39 -0
  32. package/data/package.json +171 -0
  33. package/data/playwright.config.ts +64 -0
  34. package/data/probe/.cargo/config.toml +10 -0
  35. package/data/probe/.claude/commands/performance-review.md +15 -0
  36. package/data/probe/.clinerules +288 -0
  37. package/data/probe/.dockerignore +57 -0
  38. package/data/probe/.githooks/post-commit +11 -0
  39. package/data/probe/.githooks/pre-commit +99 -0
  40. package/data/probe/.githooks/pre-commit-vow +9 -0
  41. package/data/probe/.prompts/engineer.md +41 -0
  42. package/data/probe/.roomodes +28 -0
  43. package/data/probe/.windsurfrules +0 -0
  44. package/data/probe/BASH_TOOL_SUMMARY.md +148 -0
  45. package/data/probe/BENCHMARKING.md +256 -0
  46. package/data/probe/CLAUDE.md +226 -0
  47. package/data/probe/CODE_OF_CONDUCT.md +128 -0
  48. package/data/probe/CONTRIBUTING.md +193 -0
  49. package/data/probe/Cargo.toml +120 -0
  50. package/data/probe/Cross.toml +10 -0
  51. package/data/probe/DOCKER-README.md +224 -0
  52. package/data/probe/Dockerfile +32 -0
  53. package/data/probe/ENHANCED_DEBUG_TELEMETRY.md +188 -0
  54. package/data/probe/LICENSE +201 -0
  55. package/data/probe/Makefile +210 -0
  56. package/data/probe/README.md +824 -0
  57. package/data/probe/SECURITY.md +67 -0
  58. package/data/probe/WINDOWS-GUIDE.md +294 -0
  59. package/data/probe/benches/parsing_benchmarks.rs +370 -0
  60. package/data/probe/benches/search_benchmarks.rs +599 -0
  61. package/data/probe/benches/simd_benchmarks.rs +372 -0
  62. package/data/probe/benches/timing_benchmarks.rs +287 -0
  63. package/data/probe/build-windows.bat +229 -0
  64. package/data/probe/codex-config/config.toml +6 -0
  65. package/data/probe/docs/PERFORMANCE_OPTIMIZATION.md +161 -0
  66. package/data/probe/examples/cache_demo.rs +46 -0
  67. package/data/probe/examples/chat/.dockerignore +37 -0
  68. package/data/probe/examples/chat/ChatSessionManager.js +295 -0
  69. package/data/probe/examples/chat/Dockerfile +98 -0
  70. package/data/probe/examples/chat/LICENSE +201 -0
  71. package/data/probe/examples/chat/LOCAL_IMAGE_SUPPORT.md +195 -0
  72. package/data/probe/examples/chat/MCP_INTEGRATION.md +400 -0
  73. package/data/probe/examples/chat/README.md +338 -0
  74. package/data/probe/examples/chat/TRACING.md +226 -0
  75. package/data/probe/examples/chat/appTracer.js +968 -0
  76. package/data/probe/examples/chat/auth.js +76 -0
  77. package/data/probe/examples/chat/bin/probe-chat.js +13 -0
  78. package/data/probe/examples/chat/build.js +104 -0
  79. package/data/probe/examples/chat/cancelRequest.js +84 -0
  80. package/data/probe/examples/chat/demo-agentic-image-flow.js +88 -0
  81. package/data/probe/examples/chat/demo-local-images.js +128 -0
  82. package/data/probe/examples/chat/fileSpanExporter.js +181 -0
  83. package/data/probe/examples/chat/implement/README.md +228 -0
  84. package/data/probe/examples/chat/implement/backends/AiderBackend.js +750 -0
  85. package/data/probe/examples/chat/implement/backends/BaseBackend.js +276 -0
  86. package/data/probe/examples/chat/implement/backends/ClaudeCodeBackend.js +767 -0
  87. package/data/probe/examples/chat/implement/backends/MockBackend.js +237 -0
  88. package/data/probe/examples/chat/implement/backends/registry.js +85 -0
  89. package/data/probe/examples/chat/implement/core/BackendManager.js +567 -0
  90. package/data/probe/examples/chat/implement/core/ImplementTool.js +354 -0
  91. package/data/probe/examples/chat/implement/core/config.js +428 -0
  92. package/data/probe/examples/chat/implement/core/timeouts.js +58 -0
  93. package/data/probe/examples/chat/implement/core/utils.js +496 -0
  94. package/data/probe/examples/chat/implement/types/BackendTypes.js +126 -0
  95. package/data/probe/examples/chat/index.js +669 -0
  96. package/data/probe/examples/chat/mcpServer.js +341 -0
  97. package/data/probe/examples/chat/npm/LICENSE +15 -0
  98. package/data/probe/examples/chat/npm/README.md +168 -0
  99. package/data/probe/examples/chat/npm/bin/probe-chat.js +156 -0
  100. package/data/probe/examples/chat/npm/index.js +259 -0
  101. package/data/probe/examples/chat/npm/package.json +54 -0
  102. package/data/probe/examples/chat/package.json +102 -0
  103. package/data/probe/examples/chat/probeChat.js +456 -0
  104. package/data/probe/examples/chat/probeTool.js +491 -0
  105. package/data/probe/examples/chat/storage/JsonChatStorage.js +476 -0
  106. package/data/probe/examples/chat/telemetry.js +281 -0
  107. package/data/probe/examples/chat/test/integration/chatFlows.test.js +320 -0
  108. package/data/probe/examples/chat/test/integration/toolCalling.test.js +471 -0
  109. package/data/probe/examples/chat/test/mocks/mockLLMProvider.js +269 -0
  110. package/data/probe/examples/chat/test/test-backends.js +90 -0
  111. package/data/probe/examples/chat/test/testUtils.js +530 -0
  112. package/data/probe/examples/chat/test/unit/backendTimeout.test.js +161 -0
  113. package/data/probe/examples/chat/test/unit/packageFiles.test.js +120 -0
  114. package/data/probe/examples/chat/test/verify-tests.js +118 -0
  115. package/data/probe/examples/chat/test-agentic-image-loading.js +294 -0
  116. package/data/probe/examples/chat/test-ai-sdk-telemetry.js +204 -0
  117. package/data/probe/examples/chat/test-chat-tracing.js +38 -0
  118. package/data/probe/examples/chat/test-direct-function.js +49 -0
  119. package/data/probe/examples/chat/test-file-size-validation.js +103 -0
  120. package/data/probe/examples/chat/test-full-mcp-integration.js +258 -0
  121. package/data/probe/examples/chat/test-github-context.txt +12 -0
  122. package/data/probe/examples/chat/test-hierarchy.js +203 -0
  123. package/data/probe/examples/chat/test-image-spans.js +37 -0
  124. package/data/probe/examples/chat/test-local-image-reading.js +176 -0
  125. package/data/probe/examples/chat/test-mcp-integration.js +136 -0
  126. package/data/probe/examples/chat/test-mcp-probe-server.js +161 -0
  127. package/data/probe/examples/chat/test-mcp-with-ai.js +279 -0
  128. package/data/probe/examples/chat/test-multiple-allowed-dirs.js +111 -0
  129. package/data/probe/examples/chat/test-probe-mcp-server.js +110 -0
  130. package/data/probe/examples/chat/test-security-validation.js +145 -0
  131. package/data/probe/examples/chat/test-simple-tracing.js +32 -0
  132. package/data/probe/examples/chat/test-trace-verification.js +235 -0
  133. package/data/probe/examples/chat/test-tracing.js +114 -0
  134. package/data/probe/examples/chat/tokenCounter.js +419 -0
  135. package/data/probe/examples/chat/tokenUsageDisplay.js +134 -0
  136. package/data/probe/examples/chat/webServer.js +1103 -0
  137. package/data/probe/examples/reranker/Cargo.toml +33 -0
  138. package/data/probe/examples/reranker/DEBUG_OUTPUT_ANALYSIS.md +71 -0
  139. package/data/probe/examples/reranker/MODELS.md +66 -0
  140. package/data/probe/examples/reranker/MODEL_COMPARISON.md +60 -0
  141. package/data/probe/examples/reranker/MULTI_MODEL_ANALYSIS.md +176 -0
  142. package/data/probe/examples/reranker/PERFORMANCE_SUMMARY.md +156 -0
  143. package/data/probe/examples/reranker/README.md +347 -0
  144. package/data/probe/examples/reranker/RUST_BERT_COMPARISON.md +82 -0
  145. package/data/probe/examples/reranker/TOKENIZATION_GUIDE.md +120 -0
  146. package/data/probe/examples/reranker/check_rust_tokenizer.py +108 -0
  147. package/data/probe/examples/reranker/convert_to_torchscript.py +109 -0
  148. package/data/probe/examples/reranker/debug_scoring.py +189 -0
  149. package/data/probe/examples/reranker/debug_tokenization.py +154 -0
  150. package/data/probe/examples/reranker/download_models.sh +73 -0
  151. package/data/probe/examples/reranker/requirements.txt +13 -0
  152. package/data/probe/examples/reranker/run_comprehensive_benchmark.sh +83 -0
  153. package/data/probe/examples/reranker/rust_bert_test/Cargo.toml +12 -0
  154. package/data/probe/examples/reranker/rust_bert_test/README.md +54 -0
  155. package/data/probe/examples/reranker/simple_test.py +50 -0
  156. package/data/probe/examples/reranker/test_all_models.sh +63 -0
  157. package/data/probe/examples/reranker/test_bert_results.sh +44 -0
  158. package/data/probe/examples/reranker/test_cross_encoder.py +334 -0
  159. package/data/probe/examples/reranker/test_cross_encoder.sh +80 -0
  160. package/data/probe/examples/reranker/test_exact_comparison.py +151 -0
  161. package/data/probe/examples/reranker/test_parallel_performance.sh +56 -0
  162. package/data/probe/examples/reranker/test_scores.py +132 -0
  163. package/data/probe/install.ps1 +508 -0
  164. package/data/probe/install.sh +460 -0
  165. package/data/probe/npm/CLONE_METHOD_EXAMPLES.md +596 -0
  166. package/data/probe/npm/CONTEXT_COMPACTION.md +303 -0
  167. package/data/probe/npm/DELEGATE_TOOL_README.md +166 -0
  168. package/data/probe/npm/MAID_INTEGRATION.md +313 -0
  169. package/data/probe/npm/MCP_INTEGRATION_SUMMARY.md +241 -0
  170. package/data/probe/npm/README.md +824 -0
  171. package/data/probe/npm/bin/.gitignore +7 -0
  172. package/data/probe/npm/bin/.gitkeep +0 -0
  173. package/data/probe/npm/bin/README.md +12 -0
  174. package/data/probe/npm/bin/probe +167 -0
  175. package/data/probe/npm/docs/CLAUDE_CODE_INTEGRATION.md +414 -0
  176. package/data/probe/npm/docs/CODEX_INTEGRATION.md +502 -0
  177. package/data/probe/npm/docs/EDIT_CREATE_TOOLS.md +233 -0
  178. package/data/probe/npm/docs/RETRY_AND_FALLBACK.md +674 -0
  179. package/data/probe/npm/example-usage.js +335 -0
  180. package/data/probe/npm/examples/multi-engine-demo.js +117 -0
  181. package/data/probe/npm/examples/probe-agent-cli.js +113 -0
  182. package/data/probe/npm/examples/test-agent-edit.js +114 -0
  183. package/data/probe/npm/examples/test-edit-create.js +120 -0
  184. package/data/probe/npm/examples/test-edit-direct.js +114 -0
  185. package/data/probe/npm/index.d.ts +744 -0
  186. package/data/probe/npm/jest.config.js +52 -0
  187. package/data/probe/npm/package.json +117 -0
  188. package/data/probe/npm/scripts/build-agent.cjs +75 -0
  189. package/data/probe/npm/scripts/build-cjs.js +124 -0
  190. package/data/probe/npm/scripts/build-mcp.cjs +36 -0
  191. package/data/probe/npm/scripts/postinstall.js +216 -0
  192. package/data/probe/npm/test-codex-e2e.js +78 -0
  193. package/data/probe/npm/test-download-lock.js +109 -0
  194. package/data/probe/npm/test-grep-security.js +94 -0
  195. package/data/probe/npm/test-grep-simplified.js +63 -0
  196. package/data/probe/npm/test-grep.js +51 -0
  197. package/data/probe/npm/tests/README.md +96 -0
  198. package/data/probe/npm/tests/agent-compact-history.test.js +174 -0
  199. package/data/probe/npm/tests/allow-tests-default.test.js +151 -0
  200. package/data/probe/npm/tests/contextCompactor.test.js +498 -0
  201. package/data/probe/npm/tests/delegate-config.test.js +353 -0
  202. package/data/probe/npm/tests/delegate-integration.test.js +348 -0
  203. package/data/probe/npm/tests/extractor-integration.test.js +162 -0
  204. package/data/probe/npm/tests/extractor.test.js +317 -0
  205. package/data/probe/npm/tests/fixtures/sampleDiagrams.js +267 -0
  206. package/data/probe/npm/tests/integration/claude-code-auto-fallback.spec.js +148 -0
  207. package/data/probe/npm/tests/integration/claude-code-multi-step.spec.js +127 -0
  208. package/data/probe/npm/tests/integration/claude-code-tool-events.spec.js +163 -0
  209. package/data/probe/npm/tests/integration/codex-auto-fallback.spec.js +191 -0
  210. package/data/probe/npm/tests/integration/codex-tool-events.spec.js +147 -0
  211. package/data/probe/npm/tests/integration/examplesChatMcp.test.js +402 -0
  212. package/data/probe/npm/tests/integration/mcpDotenvSupport.test.js +174 -0
  213. package/data/probe/npm/tests/integration/mcpErrorHandling.test.js +566 -0
  214. package/data/probe/npm/tests/integration/mcpRobustness.test.js +564 -0
  215. package/data/probe/npm/tests/integration/mcpStdoutPurity.test.js +355 -0
  216. package/data/probe/npm/tests/integration/probeAgentMcp.test.js +398 -0
  217. package/data/probe/npm/tests/integration/retryFallback.test.js +368 -0
  218. package/data/probe/npm/tests/integration/schema-in-initial-message.test.js +318 -0
  219. package/data/probe/npm/tests/integration/schema-validation-loop-prevention.test.js +244 -0
  220. package/data/probe/npm/tests/integration/schemaRetryLogic.test.js +94 -0
  221. package/data/probe/npm/tests/integration/validationFlow.test.js +329 -0
  222. package/data/probe/npm/tests/manual/test-codex-basic.js +110 -0
  223. package/data/probe/npm/tests/mcp/mcpClientManager.test.js +614 -0
  224. package/data/probe/npm/tests/mcp/mcpConfig.test.js +359 -0
  225. package/data/probe/npm/tests/mcp/mcpXmlBridge.test.js +436 -0
  226. package/data/probe/npm/tests/mcp/mockMcpServer.js +510 -0
  227. package/data/probe/npm/tests/mcp-strict-syntax.test.js +319 -0
  228. package/data/probe/npm/tests/mermaidQuoteEscaping.test.js +214 -0
  229. package/data/probe/npm/tests/nestedQuoteFix.test.js +40 -0
  230. package/data/probe/npm/tests/setup.js +46 -0
  231. package/data/probe/npm/tests/unit/allowed-tools.test.js +513 -0
  232. package/data/probe/npm/tests/unit/attempt-completion-closing-tag-in-content.test.js +188 -0
  233. package/data/probe/npm/tests/unit/attemptCompletionJsonFix.test.js +238 -0
  234. package/data/probe/npm/tests/unit/attemptCompletionJsonIssue.test.js +128 -0
  235. package/data/probe/npm/tests/unit/backtickAutoFix.test.js +35 -0
  236. package/data/probe/npm/tests/unit/bash-probe-agent-integration.test.js +389 -0
  237. package/data/probe/npm/tests/unit/bash-simple-commands.test.js +324 -0
  238. package/data/probe/npm/tests/unit/bash-tool-comprehensive.test.js +371 -0
  239. package/data/probe/npm/tests/unit/bash-tool-integration.test.js +310 -0
  240. package/data/probe/npm/tests/unit/bash-tool.test.js +341 -0
  241. package/data/probe/npm/tests/unit/completion-prompt.test.js +379 -0
  242. package/data/probe/npm/tests/unit/cwd-path-options.test.js +287 -0
  243. package/data/probe/npm/tests/unit/delegate-limits.test.js +422 -0
  244. package/data/probe/npm/tests/unit/direct-content-attempt-completion.test.js +235 -0
  245. package/data/probe/npm/tests/unit/edit-create-tools.test.js +609 -0
  246. package/data/probe/npm/tests/unit/enhancedMermaidValidation.test.js +577 -0
  247. package/data/probe/npm/tests/unit/extract-content.test.js +83 -0
  248. package/data/probe/npm/tests/unit/extract-multiple-targets.test.js +89 -0
  249. package/data/probe/npm/tests/unit/fallbackManager.test.js +442 -0
  250. package/data/probe/npm/tests/unit/githubCompatibilityValidation.test.js +258 -0
  251. package/data/probe/npm/tests/unit/imageConfig.test.js +149 -0
  252. package/data/probe/npm/tests/unit/imagePathResolution.test.js +345 -0
  253. package/data/probe/npm/tests/unit/json-fixing-agent.test.js +238 -0
  254. package/data/probe/npm/tests/unit/json-validation-enhanced-errors.test.js +199 -0
  255. package/data/probe/npm/tests/unit/jsonValidationInfiniteLoopFix.test.js +228 -0
  256. package/data/probe/npm/tests/unit/maidIntegration.test.js +139 -0
  257. package/data/probe/npm/tests/unit/maxIterationsWarning.test.js +195 -0
  258. package/data/probe/npm/tests/unit/mermaidEdgeLabelFix.test.js +161 -0
  259. package/data/probe/npm/tests/unit/mermaidHtmlEntities.test.js +76 -0
  260. package/data/probe/npm/tests/unit/mermaidInfiniteLoopFix.test.js +64 -0
  261. package/data/probe/npm/tests/unit/mermaidValidation.test.js +723 -0
  262. package/data/probe/npm/tests/unit/mermaidValidationVisorExample.test.js +309 -0
  263. package/data/probe/npm/tests/unit/probe-agent-clone-realistic.test.js +643 -0
  264. package/data/probe/npm/tests/unit/probe-agent-clone.test.js +476 -0
  265. package/data/probe/npm/tests/unit/probe-agent-delegate.test.js +400 -0
  266. package/data/probe/npm/tests/unit/probe-agent-model-option.test.js +118 -0
  267. package/data/probe/npm/tests/unit/probeTool-security.test.js +283 -0
  268. package/data/probe/npm/tests/unit/readImageTool.test.js +418 -0
  269. package/data/probe/npm/tests/unit/retryManager.test.js +317 -0
  270. package/data/probe/npm/tests/unit/schema-aware-reminders.test.js +288 -0
  271. package/data/probe/npm/tests/unit/schemaDefinitionDetection.test.js +115 -0
  272. package/data/probe/npm/tests/unit/schemaUtils.test.js +1268 -0
  273. package/data/probe/npm/tests/unit/simpleTelemetry.test.js +282 -0
  274. package/data/probe/npm/tests/unit/simplified-attempt-completion.test.js +274 -0
  275. package/data/probe/npm/tests/unit/single-quote-json-bug.test.js +231 -0
  276. package/data/probe/npm/tests/unit/subgraphAutoFix.test.js +110 -0
  277. package/data/probe/npm/tests/unit/system-prompt.test.js +32 -0
  278. package/data/probe/npm/tests/unit/types-probe-agent-options.test.js +42 -0
  279. package/data/probe/npm/tests/unit/xmlParsing.test.js +720 -0
  280. package/data/probe/npm/tsconfig.json +21 -0
  281. package/data/probe/result1.txt +19 -0
  282. package/data/probe/result2.txt +26 -0
  283. package/data/probe/scripts/benchmark.sh +270 -0
  284. package/data/probe/scripts/cache_memory_analysis.rs +844 -0
  285. package/data/probe/scripts/claude-hook-wrapper.sh +56 -0
  286. package/data/probe/site/.env.example +10 -0
  287. package/data/probe/site/DEPLOYMENT.md +86 -0
  288. package/data/probe/site/README.md +183 -0
  289. package/data/probe/site/adding-languages.md +135 -0
  290. package/data/probe/site/ai-chat.md +427 -0
  291. package/data/probe/site/ai-integration.md +1488 -0
  292. package/data/probe/site/blog/agentic-flow-custom-xml-protocol.md +407 -0
  293. package/data/probe/site/blog/index.md +118 -0
  294. package/data/probe/site/blog/v0.6.0-release.md +426 -0
  295. package/data/probe/site/blog.md +8 -0
  296. package/data/probe/site/changelog.md +200 -0
  297. package/data/probe/site/cli-mode.md +437 -0
  298. package/data/probe/site/code-extraction.md +436 -0
  299. package/data/probe/site/contributing/README.md +9 -0
  300. package/data/probe/site/contributing/documentation-cross-references.md +215 -0
  301. package/data/probe/site/contributing/documentation-maintenance.md +275 -0
  302. package/data/probe/site/contributing/documentation-structure.md +75 -0
  303. package/data/probe/site/documentation-cross-references.md +215 -0
  304. package/data/probe/site/documentation-guide.md +132 -0
  305. package/data/probe/site/documentation-maintenance.md +275 -0
  306. package/data/probe/site/features.md +147 -0
  307. package/data/probe/site/how-it-works.md +118 -0
  308. package/data/probe/site/index.md +175 -0
  309. package/data/probe/site/index.md.bak +133 -0
  310. package/data/probe/site/installation.md +235 -0
  311. package/data/probe/site/integrations/docker.md +248 -0
  312. package/data/probe/site/integrations/github-actions.md +413 -0
  313. package/data/probe/site/language-support-overview.md +168 -0
  314. package/data/probe/site/mcp-integration.md +587 -0
  315. package/data/probe/site/mcp-server.md +304 -0
  316. package/data/probe/site/navigation-structure.md +76 -0
  317. package/data/probe/site/nodejs-sdk.md +798 -0
  318. package/data/probe/site/output-formats.md +625 -0
  319. package/data/probe/site/package.json +21 -0
  320. package/data/probe/site/public/_headers +28 -0
  321. package/data/probe/site/public/_redirects +11 -0
  322. package/data/probe/site/quick-start.md +289 -0
  323. package/data/probe/site/search-functionality.md +291 -0
  324. package/data/probe/site/search-reference.md +291 -0
  325. package/data/probe/site/supported-languages.md +215 -0
  326. package/data/probe/site/use-cases/README.md +8 -0
  327. package/data/probe/site/use-cases/advanced-cli.md +253 -0
  328. package/data/probe/site/use-cases/ai-code-editors.md +239 -0
  329. package/data/probe/site/use-cases/building-ai-tools.md +529 -0
  330. package/data/probe/site/use-cases/cli-ai-workflows.md +285 -0
  331. package/data/probe/site/use-cases/deploying-probe-web-interface.md +255 -0
  332. package/data/probe/site/use-cases/integrating-probe-into-ai-code-editors.md +161 -0
  333. package/data/probe/site/use-cases/nodejs-sdk.md +596 -0
  334. package/data/probe/site/use-cases/team-chat.md +350 -0
  335. package/data/probe/site/web-interface.md +434 -0
  336. package/data/probe/site/wrangler.toml +9 -0
  337. package/data/probe/test-api-key.sh +1 -0
  338. package/data/probe/test-probe-implementation/hello.js +7 -0
  339. package/data/probe/test_cases/demonstrate_early_termination_issues.sh +176 -0
  340. package/data/probe/test_cases/early_termination_issues.rs +533 -0
  341. package/data/probe/test_data/test_nested_struct.go +26 -0
  342. package/data/probe/tests/README.md +286 -0
  343. package/data/probe/tests/README_search_determinism_tests.md +116 -0
  344. package/data/probe/tests/adjacent_comment_test.rs +152 -0
  345. package/data/probe/tests/apostrophe_handling_tests.rs +132 -0
  346. package/data/probe/tests/block_filtering_with_ast_tests.rs +669 -0
  347. package/data/probe/tests/block_merging_tests.rs +396 -0
  348. package/data/probe/tests/c_outline_format_tests.rs +2179 -0
  349. package/data/probe/tests/cache_invalidation_issues.rs.disabled +682 -0
  350. package/data/probe/tests/cache_order_tests.rs +147 -0
  351. package/data/probe/tests/cache_query_scoping_tests.rs +221 -0
  352. package/data/probe/tests/cli_tests.rs +680 -0
  353. package/data/probe/tests/comment_context_integration_test.rs +240 -0
  354. package/data/probe/tests/common.rs +33 -0
  355. package/data/probe/tests/complex_block_merging_tests.rs +599 -0
  356. package/data/probe/tests/complex_query_block_filtering_tests.rs +422 -0
  357. package/data/probe/tests/control_flow_closing_braces_test.rs +91 -0
  358. package/data/probe/tests/cpp_outline_format_tests.rs +1507 -0
  359. package/data/probe/tests/csharp_outline_format_tests.rs +941 -0
  360. package/data/probe/tests/elastic_query_integration_tests.rs +922 -0
  361. package/data/probe/tests/extract_command_tests.rs +1848 -0
  362. package/data/probe/tests/extract_deduplication_tests.rs +146 -0
  363. package/data/probe/tests/extract_input_file_tests.rs +84 -0
  364. package/data/probe/tests/extract_prompt_tests.rs +102 -0
  365. package/data/probe/tests/filename_search_tests.rs +96 -0
  366. package/data/probe/tests/fixtures/user/AssemblyInfo.cs +3 -0
  367. package/data/probe/tests/github_extract_tests.rs +234 -0
  368. package/data/probe/tests/go_comment_test.rs +253 -0
  369. package/data/probe/tests/go_outline_format_tests.rs +2587 -0
  370. package/data/probe/tests/go_path_resolver_tests.rs +96 -0
  371. package/data/probe/tests/html_outline_format_tests.rs +637 -0
  372. package/data/probe/tests/integration_tests.rs +837 -0
  373. package/data/probe/tests/ip_whitelist_test.rs +148 -0
  374. package/data/probe/tests/java_outline_format_tests.rs +1611 -0
  375. package/data/probe/tests/javascript_extract_tests.rs +315 -0
  376. package/data/probe/tests/javascript_outline_format_tests.rs +1464 -0
  377. package/data/probe/tests/json_format_tests.rs +436 -0
  378. package/data/probe/tests/json_schema_validation_tests.rs +450 -0
  379. package/data/probe/tests/lib_usage.rs +60 -0
  380. package/data/probe/tests/line_comment_context_extension_test.rs +459 -0
  381. package/data/probe/tests/line_map_cache_tests.rs +114 -0
  382. package/data/probe/tests/markdown_integration_tests.rs +190 -0
  383. package/data/probe/tests/mocks/test_ip_whitelist.go +11 -0
  384. package/data/probe/tests/mocks/test_object.js +27 -0
  385. package/data/probe/tests/mocks/test_struct.go +50 -0
  386. package/data/probe/tests/multi_keyword_pattern_tests.rs +464 -0
  387. package/data/probe/tests/multi_language_syntax_integration_tests.rs +218 -0
  388. package/data/probe/tests/multiple_capture_groups_tests.rs +169 -0
  389. package/data/probe/tests/negative_compound_word_tests.rs +246 -0
  390. package/data/probe/tests/nested_symbol_extraction_tests.rs +99 -0
  391. package/data/probe/tests/outline_cross_file_interference_test.rs +335 -0
  392. package/data/probe/tests/outline_keyword_preservation_test.rs +67 -0
  393. package/data/probe/tests/output_format_edge_cases_tests.rs +693 -0
  394. package/data/probe/tests/parallel_extraction_tests.rs +178 -0
  395. package/data/probe/tests/parallel_search_tests.rs +355 -0
  396. package/data/probe/tests/path_resolver_tests.rs +698 -0
  397. package/data/probe/tests/php_outline_format_extended_tests.rs +928 -0
  398. package/data/probe/tests/php_outline_format_tests.rs +768 -0
  399. package/data/probe/tests/property_tests.proptest-regressions +9 -0
  400. package/data/probe/tests/property_tests.rs +118 -0
  401. package/data/probe/tests/python_outline_format_tests.rs +1538 -0
  402. package/data/probe/tests/query_command_json_tests.rs +438 -0
  403. package/data/probe/tests/query_command_tests.rs +232 -0
  404. package/data/probe/tests/query_command_xml_tests.rs +569 -0
  405. package/data/probe/tests/quoted_term_with_negative_keyword_tests.rs +216 -0
  406. package/data/probe/tests/required_terms_filename_tests.rs +116 -0
  407. package/data/probe/tests/ruby_outline_format_tests.rs +1011 -0
  408. package/data/probe/tests/rust_line_comment_context_test.rs +151 -0
  409. package/data/probe/tests/rust_outline_format_enhanced_tests.rs +725 -0
  410. package/data/probe/tests/rust_outline_format_tests.rs +843 -0
  411. package/data/probe/tests/schemas/xml_output_schema.xsd +38 -0
  412. package/data/probe/tests/search_determinism_tests.rs +451 -0
  413. package/data/probe/tests/search_hints_tests.rs +253 -0
  414. package/data/probe/tests/special_character_escaping_tests.rs +417 -0
  415. package/data/probe/tests/stemming_compound_word_filtering_tests.rs +535 -0
  416. package/data/probe/tests/strict_elastic_syntax_tests.rs +404 -0
  417. package/data/probe/tests/swift_outline_format_tests.rs +3319 -0
  418. package/data/probe/tests/symbols_tests.rs +166 -0
  419. package/data/probe/tests/test_file.rs +45 -0
  420. package/data/probe/tests/test_tokenize.rs +28 -0
  421. package/data/probe/tests/timeout_tests.rs +82 -0
  422. package/data/probe/tests/tokenization_tests.rs +195 -0
  423. package/data/probe/tests/tokenized_block_filtering_tests.rs +174 -0
  424. package/data/probe/tests/typescript_extract_tests.rs +214 -0
  425. package/data/probe/tests/typescript_outline_format_tests.rs +2188 -0
  426. package/data/probe/tests/xml_format_tests.rs +568 -0
  427. package/data/probe/tests/xml_schema_validation_tests.rs +497 -0
  428. package/data/scripts/postinstall.mjs +9 -0
  429. package/data/scripts/set-version.js +0 -0
  430. package/data/scripts/wiki-build.sh +111 -0
  431. package/data/scripts/wiki-deploy.sh +73 -0
  432. package/data/serve.json +12 -0
  433. package/data/test/demo-dynamic.html +134 -0
  434. package/data/test/demo-esm.html +105 -0
  435. package/data/test/demo-iife.html +78 -0
  436. package/data/tsconfig.json +7 -0
  437. package/data/vite.server.ts +483 -0
  438. package/data/vitest.config.ts +40 -0
  439. package/data/wiki/Home.md +58 -0
  440. package/data/wiki/_Sidebar.md +39 -0
  441. package/docs-mcp.config.json +20 -0
  442. package/package.json +56 -0
  443. package/src/config.js +111 -0
  444. package/src/index.js +395 -0
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert MS-MARCO TinyBERT model to TorchScript format for rust-bert
4
+ """
5
+
6
+ import torch
7
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
8
+ import os
9
+ import sys
10
+
11
def convert_to_torchscript(model_name="cross-encoder/ms-marco-TinyBERT-L-2-v2", output_dir="models/ms-marco-TinyBERT-L-2-v2"):
    """Trace a sequence-classification model and save it as TorchScript.

    The model and tokenizer are loaded with ``from_pretrained``, the model is
    traced on a single dummy sentence, and the traced module is written to
    ``<output_dir>/rust_model.ot``. The tokenizer files are saved next to it.
    Finally the first logit of the original and traced models is compared and
    the difference is printed (a mismatch only produces a warning).

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        output_dir: Directory that receives the traced model and tokenizer
            files; created if it does not exist.

    Returns:
        ``False`` if both tracing attempts fail, ``True`` otherwise.
    """

    def _first_logit(output):
        """Return ``logits[0][0]`` as a float from any supported output type."""
        # A traced module built with strict=False returns a plain dict (or a
        # tuple), not a ModelOutput, so attribute access alone is not enough.
        if isinstance(output, torch.Tensor):
            logits = output
        elif hasattr(output, "logits"):
            logits = output.logits
        elif isinstance(output, dict):
            logits = output["logits"]
        else:
            logits = output[0]
        return logits[0][0].item()

    print(f"Converting {model_name} to TorchScript format...")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Load model and tokenizer
    print("Loading model...")
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Set to eval mode
    model.eval()

    # Create dummy inputs for tracing
    dummy_text = "What is machine learning?"
    dummy_inputs = tokenizer(
        dummy_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    # Get the input tensors
    input_ids = dummy_inputs["input_ids"]
    attention_mask = dummy_inputs["attention_mask"]
    token_type_ids = dummy_inputs.get("token_type_ids", torch.zeros_like(input_ids))

    print("Input shapes:")
    print(f" input_ids: {input_ids.shape}")
    print(f" attention_mask: {attention_mask.shape}")
    print(f" token_type_ids: {token_type_ids.shape}")

    # Trace the model
    print("\nTracing model...")
    # Remember which inputs the trace was built with: the traced module only
    # accepts that exact signature, so verification must reuse it.
    trace_inputs = (input_ids, attention_mask, token_type_ids)
    try:
        # Method 1: Trace with all inputs
        traced_model = torch.jit.trace(model, trace_inputs, strict=False)
        print("✓ Model traced successfully with all inputs")
    except Exception as e:
        print(f"Failed to trace with all inputs: {e}")
        # Method 2: Try with just input_ids
        trace_inputs = (input_ids,)
        try:
            # strict=False here as well: the model's dict-style output is
            # rejected by the tracer in strict mode.
            traced_model = torch.jit.trace(model, trace_inputs, strict=False)
            print("✓ Model traced with input_ids only")
        except Exception as e2:
            print(f"Failed to trace with input_ids only: {e2}")
            return False

    # Save the traced model
    output_path = os.path.join(output_dir, "rust_model.ot")
    traced_model.save(output_path)
    print(f"\n✓ Saved TorchScript model to: {output_path}")

    # Also save the tokenizer vocab
    tokenizer.save_pretrained(output_dir)
    print(f"✓ Saved tokenizer files to: {output_dir}")

    # Test the traced model against the original on the same inputs
    print("\nTesting traced model...")
    with torch.no_grad():
        original_output = model(*trace_inputs)
        traced_output = traced_model(*trace_inputs)

    orig_logits = _first_logit(original_output)
    traced_logits = _first_logit(traced_output)
    difference = abs(orig_logits - traced_logits)

    print(f"Original model logits: {orig_logits:.6f}")
    print(f"Traced model logits: {traced_logits:.6f}")
    print(f"Difference: {difference:.6f}")

    if difference < 1e-5:
        print("✓ Models produce identical results!")
    else:
        print("⚠ Warning: Models produce different results")

    return True
93
+
94
def main():
    """Run the default conversion and print the outcome.

    Exits the process with status 1 when the conversion reports failure;
    otherwise prints a success banner followed by rust-bert usage hints.
    """
    # Convert TinyBERT
    succeeded = convert_to_torchscript()
    if not succeeded:
        print("\nConversion failed!")
        sys.exit(1)

    banner = "=" * 60
    report_lines = (
        "\n" + banner,
        "Conversion successful!",
        banner,
        "\nTo use with rust-bert:",
        "1. Copy the rust_model.ot file to your rust project",
        "2. Use LocalResource to load the model",
        "3. The vocab files are also saved for tokenization",
    )
    for line in report_lines:
        print(line)


# Only run the conversion when executed as a script, not on import.
if __name__ == "__main__":
    main()
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Focused debugging script for cross-encoder scoring issues.
4
+
5
+ This script provides a minimal, easily modifiable test harness for debugging
6
+ specific query-document pairs and comparing with Rust implementation results.
7
+
8
+ Usage:
9
+ python debug_scoring.py
10
+
11
+ Or modify the test cases in the script and run again.
12
+ """
13
+
14
+ import sys
15
+ import torch
16
+ import numpy as np
17
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
18
+
19
# Configuration - MODIFY THESE FOR YOUR TESTS
MODEL_NAME = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
MAX_LENGTH = 512

# Passage shared by the built-in cases: the same document is scored against a
# matching query and a nonsense query.
_AUTH_DOCUMENT = "\n".join([
    "Authentication is the process of verifying the identity of a user, device, or system.",
    "In web applications, authentication typically involves checking credentials like usernames",
    "and passwords against a database. The authentication process usually follows these steps:",
    "- User provides credentials",
    "- System validates credentials against stored data",
    "- If valid, system grants access and creates a session",
])

# Test cases - MODIFY THESE FOR YOUR SPECIFIC DEBUGGING
TEST_CASES = [
    {
        "name": "Relevant Query",
        "query": "how does authentication work",
        "document": _AUTH_DOCUMENT,
    },
    {
        "name": "Irrelevant Query",
        "query": "foobar random nonsense gibberish",
        "document": _AUTH_DOCUMENT,
    },
]
46
+
47
def debug_single_case(tokenizer, model, query: str, document: str, case_name: str):
    """Debug a single query-document pair with detailed output.

    Tokenizes the (query, document) pair, prints tokenization details,
    runs the model without gradient tracking, prints the logits and reduces
    them to one score.

    Args:
        tokenizer: Called as ``tokenizer(query, document, ...)``; must also
            provide ``convert_ids_to_tokens``. If it exposes ``cls_token`` /
            ``sep_token`` those are used to locate special tokens, otherwise
            the BERT defaults ``[CLS]`` / ``[SEP]`` are assumed.
        model: Called as ``model(**encoded)``; the result must expose
            ``logits``. The model is switched to eval mode as a side effect.
        query: Query text (first segment of the pair).
        document: Document text (second segment of the pair).
        case_name: Label used in the printed report and the returned dict.

    Returns:
        dict with the keys ``case_name``, ``query``, ``document_preview``,
        ``num_tokens``, ``raw_logits`` and ``final_score``.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"DEBUGGING: {case_name}")
    print(banner)
    print(f"Query: '{query}'")
    print(f"Document: '{document[:100]}...'")

    # Tokenize query and document together as one sequence pair
    encoded = tokenizer(
        query,
        document,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=True
    )

    # Only one pair is encoded, so row 0 is the whole batch
    input_ids = encoded['input_ids'][0]
    attention_mask = encoded['attention_mask'][0]

    print("\nTokenization:")
    print(f" Input IDs shape: {input_ids.shape}")
    print(f" Number of tokens: {len(input_ids)}")
    print(f" Attention mask sum: {attention_mask.sum().item()}")

    # Show first few and last few tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    print(f" First 10 tokens: {tokens[:10]}")
    print(f" Last 10 tokens: {tokens[-10:]}")

    # Locate special tokens. Ask the tokenizer for its own markers so that
    # swapping MODEL_NAME for a non-BERT model still reports positions; fall
    # back to the BERT literals when the attributes are missing or unset.
    cls_token = getattr(tokenizer, 'cls_token', None) or '[CLS]'
    sep_token = getattr(tokenizer, 'sep_token', None) or '[SEP]'
    cls_positions = [i for i, token in enumerate(tokens) if token == cls_token]
    sep_positions = [i for i, token in enumerate(tokens) if token == sep_token]
    print(f" {cls_token} positions: {cls_positions}")
    print(f" {sep_token} positions: {sep_positions}")

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(**encoded)
        logits = outputs.logits

    print("\nModel Output:")
    print(f" Raw logits: {logits}")
    print(f" Logits shape: {logits.shape}")

    # Calculate different score interpretations
    if logits.shape[-1] == 1:
        # Single output - treat as regression and squash with a sigmoid
        raw_score = logits[0, 0].item()
        sigmoid_score = torch.sigmoid(logits[0, 0]).item()
        print(f" Raw score: {raw_score}")
        print(f" Sigmoid score: {sigmoid_score}")
        final_score = sigmoid_score
    else:
        # Multiple outputs - treat as classification
        probabilities = torch.softmax(logits, dim=-1)
        print(f" Softmax probabilities: {probabilities}")
        if logits.shape[-1] == 2:
            # Binary classification: class 1 is reported as the relevance score
            final_score = probabilities[0, 1].item()
            print(f" Relevance probability (class 1): {final_score}")
        else:
            final_score = probabilities[0, 0].item()
            print(f" First class probability: {final_score}")

    print(f"\nFINAL SCORE: {final_score:.6f}")

    # Return data for comparison
    return {
        'case_name': case_name,
        'query': query,
        'document_preview': document[:100] + '...',
        'num_tokens': len(input_ids),
        'raw_logits': logits.cpu().numpy().tolist(),
        'final_score': final_score
    }
129
+
130
def main():
    """Run focused debugging tests.

    Loads the tokenizer and model named by MODEL_NAME (exiting with status 1
    if loading fails), runs debug_single_case() on every entry of TEST_CASES,
    then prints a summary table and the score gap between the first two cases.
    """
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("Cross-Encoder Scoring Debug Tool")
    print(f"Model: {MODEL_NAME}")
    print(f"PyTorch device: {device_name}")

    # Load model and tokenizer
    print("\nLoading model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
        print("✓ Model loaded successfully")
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        sys.exit(1)

    # Run test cases
    results = [
        debug_single_case(tokenizer, model, case["query"], case["document"], case["name"])
        for case in TEST_CASES
    ]

    # Summary
    rule = '=' * 60
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)

    print(f"{'Case':<20} {'Tokens':<8} {'Score':<12} {'Expected':<12}")
    print("-" * 52)

    for row in results:
        name = row['case_name']
        score = row['final_score']
        # A case is expected to score high when its name contains "Relevant"
        # (case-sensitive, so "Irrelevant" does not match).
        is_relevant = "Relevant" in name
        scored_high = score > 0.5
        if is_relevant:
            expected = "HIGH (>0.5)"
        else:
            expected = "LOW (<0.5)"
        status = "✓" if scored_high == is_relevant else "❌"

        print(f"{name:<20} {row['num_tokens']:<8} {score:<12.6f} {expected:<12} {status}")

    # Score difference
    if len(results) >= 2:
        first, second = results[0], results[1]
        score_diff = abs(first['final_score'] - second['final_score'])
        print(f"\nScore difference: {score_diff:.6f}")
        if score_diff >= 0.1:
            print("✓ Good score separation between relevant and irrelevant queries")
        else:
            print("⚠️ WARNING: Score difference is very small - model may not be discriminating well")

    print("\nFor Rust debugging, compare:")
    for hint in (
        "1. Token IDs and their order",
        "2. Raw logits values",
        "3. Final score calculation method",
        "4. Model configuration and weights",
    ):
        print(hint)


if __name__ == "__main__":
    main()
@@ -0,0 +1,154 @@
1
#!/usr/bin/env python3
"""
Debug script to understand exactly how the Python implementation works
so we can ensure our Rust implementation matches it perfectly.

It tokenizes one query/document pair two ways (as a proper sequence pair and
as a manually concatenated string), compares token IDs and model scores, and
writes the reference values to tokenizer_debug_info.json.
"""

import os
# Must be set before the tokenizer library is imported
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import json

print("="*80)
print("TOKENIZATION AND MODEL LOADING DEBUG")
print("="*80)

# Load model and tokenizer
model_name = 'cross-encoder/ms-marco-TinyBERT-L-2-v2'
print(f"Loading model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Print tokenizer info
print("\n--- TOKENIZER INFO ---")
print(f"Tokenizer class: {type(tokenizer).__name__}")
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Model max length: {tokenizer.model_max_length}")
print(f"Padding token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
print(f"SEP token: '{tokenizer.sep_token}' (ID: {tokenizer.sep_token_id})")
print(f"CLS token: '{tokenizer.cls_token}' (ID: {tokenizer.cls_token_id})")

# Test inputs
query = "how does authentication work"
document = "Authentication is the process of verifying the identity of a user."

print(f"\nQuery: '{query}'")
print(f"Document: '{document}'")

# Method 1: Tokenize as pair (CORRECT for cross-encoder)
print("\n--- METHOD 1: Tokenize as pair (query, document) ---")
encoding = tokenizer(
    query,
    document,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

ids1 = encoding['input_ids'][0].tolist()

print(f"Keys in encoding: {list(encoding.keys())}")
print(f"Input IDs shape: {encoding['input_ids'].shape}")
print(f"Input IDs: {ids1}")

# Decode to see what was tokenized
decoded = tokenizer.decode(encoding['input_ids'][0])
print(f"\nDecoded text: '{decoded}'")

# Show token type IDs if present
token_types = None
if 'token_type_ids' in encoding:
    token_types = encoding['token_type_ids'][0].tolist()
    print(f"\nToken type IDs: {token_types}")
    # Per-token listing so the switch of token type from query to document
    # is visible; limited to the first 30 tokens.
    for i, (token_id, token_type) in enumerate(zip(ids1[:30], token_types[:30])):
        token_text = tokenizer.decode([token_id])
        print(f" [{i}] '{token_text}' (ID: {token_id}, Type: {token_type})")

# Method 2: Manual concatenation (WRONG - for comparison)
print("\n--- METHOD 2: Manual concatenation (WRONG approach) ---")
manual_text = f"{query} [SEP] {document}"
encoding2 = tokenizer(
    manual_text,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

ids2 = encoding2['input_ids'][0].tolist()

print(f"Input IDs: {ids2}")
if 'token_type_ids' in encoding2:
    print(f"Token type IDs: {encoding2['token_type_ids'][0].tolist()}")

# Compare the two methods
print("\n--- COMPARISON ---")

if ids1 == ids2:
    print("✓ Both methods produce SAME token IDs")
else:
    print("❌ Methods produce DIFFERENT token IDs!")
    print(f" Length difference: {len(ids1)} vs {len(ids2)}")
    # Find first difference
    for i, (t1, t2) in enumerate(zip(ids1, ids2)):
        if t1 != t2:
            print(f" First difference at position {i}: {t1} vs {t2}")
            break
    else:
        # zip() stops at the shorter list, so reaching this point means one
        # sequence is a strict prefix of the other.
        shared = min(len(ids1), len(ids2))
        print(f" First {shared} tokens are identical; sequences differ only in length")

# Test model forward pass
print("\n--- MODEL FORWARD PASS ---")
model.eval()

# Show model configuration
print("Model config:")
print(f" Hidden size: {model.config.hidden_size}")
print(f" Num labels: {model.config.num_labels}")
print(f" Problem type: {getattr(model.config, 'problem_type', 'Not specified')}")

# Test both encodings
with torch.no_grad():
    # Correct tokenization
    output1 = model(**encoding)
    logits1 = output1.logits[0][0].item()

    # Manual concatenation
    output2 = model(**encoding2)
    logits2 = output2.logits[0][0].item()

print("\nResults:")
print(f" Correct tokenization score: {logits1:.6f}")
print(f" Manual concatenation score: {logits2:.6f}")
print(f" Difference: {abs(logits1 - logits2):.6f}")

if abs(logits1 - logits2) > 0.01:
    print(" ⚠️ Significant difference! Tokenization method matters!")

# Save tokenizer info for Rust comparison
print("\n--- SAVING DEBUG INFO ---")
debug_info = {
    "model_name": model_name,
    "tokenizer_class": type(tokenizer).__name__,
    "vocab_size": tokenizer.vocab_size,
    "special_tokens": {
        "pad": {"token": tokenizer.pad_token, "id": tokenizer.pad_token_id},
        "sep": {"token": tokenizer.sep_token, "id": tokenizer.sep_token_id},
        "cls": {"token": tokenizer.cls_token, "id": tokenizer.cls_token_id},
    },
    "test_case": {
        "query": query,
        "document": document,
        "correct_input_ids": ids1,
        "correct_token_types": token_types,
        "correct_score": logits1,
        "manual_concat_score": logits2,
    }
}

# Explicit encoding keeps the output independent of the platform default
with open("tokenizer_debug_info.json", "w", encoding="utf-8") as f:
    json.dump(debug_info, f, indent=2)

print("Debug info saved to tokenizer_debug_info.json")
print("\n✅ Use this info to verify your Rust implementation matches exactly!")
@@ -0,0 +1,73 @@
1
#!/bin/bash

# Script to download MS-MARCO cross-encoder models for local use

set -e

echo "=== MS-MARCO Model Downloader ==="
echo

# Base directory for models
MODEL_DIR="models"
mkdir -p "$MODEL_DIR"

# download_model <hub-model-id> <local-subdir>
#
# Fetches the essential files of one Hugging Face model into
# $MODEL_DIR/<local-subdir>. Files that are already present are skipped.
# A file that cannot be fetched is reported and skipped, not treated as fatal.
download_model() {
    local model_name=$1
    local model_dir=$2
    local target_dir="$MODEL_DIR/$model_dir"
    local file

    echo "Downloading $model_name..."
    mkdir -p "$target_dir"

    # Download essential files
    local files=(
        "config.json"
        "tokenizer.json"
        "tokenizer_config.json"
        "vocab.txt"
        "pytorch_model.bin"
        "special_tokens_map.json"
    )

    for file in "${files[@]}"; do
        if [ -f "$target_dir/$file" ]; then
            echo " ✓ $file already exists"
        else
            echo " ⬇ Downloading $file..."
            # --fail makes curl exit non-zero on HTTP errors instead of saving
            # the error page as the model file. Downloading to a .part file
            # and renaming on success means an aborted or failed transfer is
            # never mistaken for an existing file on the next run.
            if curl --fail --location --silent \
                    --output "$target_dir/$file.part" \
                    "https://huggingface.co/$model_name/resolve/main/$file"; then
                mv "$target_dir/$file.part" "$target_dir/$file"
            else
                rm -f "$target_dir/$file.part"
                echo " ⚠ $file not found (might be optional)"
            fi
        fi
    done

    echo "✓ $model_name download complete"
    echo
}

# Download models
echo "Downloading cross-encoder models..."
echo

# TinyBERT (4M params) - already have this
if [ -d "$MODEL_DIR/ms-marco-TinyBERT-L-2-v2" ]; then
    echo "✓ TinyBERT model already exists"
else
    download_model "cross-encoder/ms-marco-TinyBERT-L-2-v2" "ms-marco-TinyBERT-L-2-v2"
fi

# MiniLM-L6 (22M params)
download_model "cross-encoder/ms-marco-MiniLM-L-6-v2" "ms-marco-MiniLM-L-6-v2"

# MiniLM-L12 (33M params)
download_model "cross-encoder/ms-marco-MiniLM-L-12-v2" "ms-marco-MiniLM-L-12-v2"

echo "=== Download Complete ==="
echo
echo "Models available in $MODEL_DIR/:"
ls -la "$MODEL_DIR/"
echo
echo "You can now use these rerankers:"
echo " --reranker ms-marco-tinybert (4M params, fastest)"
echo " --reranker ms-marco-minilm-l6 (22M params, balanced)"
echo " --reranker ms-marco-minilm-l12 (33M params, most accurate)"
@@ -0,0 +1,13 @@
1
+ # Requirements for cross-encoder testing and debugging
2
+ torch>=1.9.0
3
+ transformers>=4.20.0
4
+ sentence-transformers>=2.2.0
5
+ numpy>=1.21.0
6
+
7
+ # Optional but recommended for better performance
8
+ tokenizers>=0.13.0
9
+
10
+ # For additional debugging and analysis
11
+ matplotlib>=3.5.0
12
+ seaborn>=0.11.0
13
+ pandas>=1.3.0
@@ -0,0 +1,83 @@
1
#!/bin/bash

# Builds the release benchmark binary and runs it through a fixed matrix of
# scenarios: document count, query length, batch size and file-type filters.

BENCHMARK=./target/release/benchmark

# section <title>: print a section header followed by a blank line.
section() {
    echo "=== $1 ==="
    echo ""
}

# run_case <label> <query> <num-docs> <iterations> <batch-size> [extension...]
#
# Prints the label, runs one benchmark in demo mode, then prints a blank line.
# Each trailing argument is passed on as its own --extensions flag.
# A failing benchmark does not stop the remaining cases.
run_case() {
    local label=$1 query=$2 num_docs=$3 iterations=$4 batch_size=$5
    shift 5

    local ext_args=()
    local ext
    for ext in "$@"; do
        ext_args+=(--extensions "$ext")
    done

    echo "$label"
    "$BENCHMARK" --demo --query "$query" --num-docs "$num_docs" \
        --iterations "$iterations" --batch-size "$batch_size" "${ext_args[@]}"
    echo ""
}

echo "🚀 COMPREHENSIVE RERANKER PERFORMANCE ANALYSIS"
echo "=============================================="
echo ""

# Build first. Stop if the build fails: otherwise every case below would
# either fail or silently measure a stale binary from an earlier build.
echo "Building release binary..."
if ! cargo build --release; then
    echo "Build failed; aborting benchmark run." >&2
    exit 1
fi
echo ""

# Test different document counts
section "SCALABILITY ANALYSIS"
run_case "Testing with 100 documents:"  "search algorithm implementation" 100  5 20
run_case "Testing with 500 documents:"  "search algorithm implementation" 500  5 50
run_case "Testing with 1000 documents:" "search algorithm implementation" 1000 5 100
run_case "Testing with 2000 documents:" "search algorithm implementation" 2000 3 200

section "QUERY COMPLEXITY ANALYSIS"
run_case "Simple query (2 words):"  "rust async" 500 5 50
run_case "Medium query (4 words):"  "vector search embedding similarity" 500 5 50
run_case "Complex query (8 words):" "distributed search engine indexing algorithm optimization performance tuning" 500 5 50

section "BATCH SIZE OPTIMIZATION"
run_case "Batch size 10:"  "machine learning model" 500 3 10
run_case "Batch size 50:"  "machine learning model" 500 3 50
run_case "Batch size 100:" "machine learning model" 500 3 100
run_case "Batch size 250:" "machine learning model" 500 3 250

section "FILE TYPE ANALYSIS"
run_case "Only Rust files:" "struct impl trait" 200 5 40 rs
run_case "Only JavaScript/TypeScript files:" "async function promise" 200 5 40 js ts
run_case "Multiple file types:" "algorithm optimization" 200 5 40 rs js ts go py java

echo "=============================================="
echo "✅ COMPREHENSIVE BENCHMARK COMPLETE"
echo "=============================================="
@@ -0,0 +1,12 @@
1
+ [package]
2
+ name = "rust-bert-reranker-test"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ rust-bert = "0.21"
8
+ anyhow = "1.0"
9
+ tokio = { version = "1.0", features = ["full"] }
10
+
11
+ # PyTorch (libtorch) bindings; must match the tch version that rust-bert depends on
12
+ tch = "0.13.0"
@@ -0,0 +1,54 @@
1
+ # Rust-BERT Cross-Encoder Test
2
+
3
+ This example tests cross-encoder functionality using rust-bert to compare with our Candle implementation.
4
+
5
+ ## Setup
6
+
7
+ 1. Install libtorch (required by rust-bert):
8
+ - macOS: `brew install pytorch`
9
+ - Linux: Download from https://pytorch.org/get-started/locally/
10
+
11
+ 2. Set environment variables:
12
+ ```bash
13
+ export LIBTORCH=/usr/local/opt/pytorch # macOS with Homebrew on Intel; on Apple Silicon use /opt/homebrew/opt/pytorch
14
+ # or
15
+ export LIBTORCH=/path/to/libtorch # Linux/custom installation
16
+ ```
17
+
18
+ 3. Build and run:
19
+ ```bash
20
+ cargo run --release
21
+ ```
22
+
23
+ ## Model Conversion
24
+
25
+ To use the TinyBERT model with rust-bert, you need to convert it to the .ot format:
26
+
27
+ ```python
28
# convert_model.py
import torch
from transformers import AutoModelForSequenceClassification

# torchscript=True makes the model return plain tuples instead of a dict-like
# output object; torch.jit.trace rejects dict outputs unless strict=False.
model = AutoModelForSequenceClassification.from_pretrained(
    'cross-encoder/ms-marco-TinyBERT-L-2-v2',
    torchscript=True,
)
# Trace in inference mode so dropout is disabled in the recorded graph
model.eval()

traced = torch.jit.trace(model, (torch.zeros(1, 512, dtype=torch.long),))
traced.save("rust_model.ot")
35
+ ```
36
+
37
+ ## Notes
38
+
39
+ - rust-bert expects models in TorchScript format (.ot files)
40
+ - The sequence classification pipeline is designed for classification, not regression
41
+ - For true cross-encoder scoring, you may need to modify the pipeline
42
+ - This example demonstrates the approach but may not give identical results to Python
43
+
44
+ ## Comparison with Candle
45
+
46
+ Our Candle implementation:
47
+ - Loads PyTorch .bin files directly
48
+ - Implements cross-encoder architecture manually
49
+ - Returns raw logits for scoring
50
+
51
+ rust-bert approach:
52
+ - Uses TorchScript format
53
+ - Provides high-level pipelines
54
+ - Returns classification labels with confidence scores