@crownpeak/dqm-react-component-dev-mcp 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +138 -0
- package/data/.env.example +22 -0
- package/data/.gitattributes +47 -0
- package/data/.glfrc.json +7 -0
- package/data/.husky/pre-commit +5 -0
- package/data/.nvmrc +1 -0
- package/data/CHANGELOG.md +75 -0
- package/data/CODE_OF_CONDUCT.md +129 -0
- package/data/CONTRIBUTING.md +203 -0
- package/data/DOCS-STRUCTURE.md +307 -0
- package/data/I18N.md +292 -0
- package/data/LICENSE +22 -0
- package/data/README.md +315 -0
- package/data/SECURITY.md +125 -0
- package/data/WIKI-DEPLOYMENT.md +348 -0
- package/data/docs/AI-FEATURES.md +610 -0
- package/data/docs/API-REFERENCE.md +1022 -0
- package/data/docs/AUTHENTICATION.md +301 -0
- package/data/docs/BACKEND-API.md +468 -0
- package/data/docs/DEVELOPMENT.md +375 -0
- package/data/docs/EXAMPLES.md +622 -0
- package/data/docs/MCP-SERVER.md +307 -0
- package/data/docs/MIGRATION-GUIDE.md +367 -0
- package/data/docs/NPM-PUBLISH.md +193 -0
- package/data/docs/QUICKSTART.md +206 -0
- package/data/docs/REDIS-SETUP.md +162 -0
- package/data/docs/SERVER.md +228 -0
- package/data/docs/TROUBLESHOOTING.md +657 -0
- package/data/docs/WIDGET-GUIDE.md +638 -0
- package/data/docs/WIKI-HOME.md +58 -0
- package/data/docs/WIKI-SIDEBAR.md +39 -0
- package/data/package.json +171 -0
- package/data/playwright.config.ts +64 -0
- package/data/probe/.cargo/config.toml +10 -0
- package/data/probe/.claude/commands/performance-review.md +15 -0
- package/data/probe/.clinerules +288 -0
- package/data/probe/.dockerignore +57 -0
- package/data/probe/.githooks/post-commit +11 -0
- package/data/probe/.githooks/pre-commit +99 -0
- package/data/probe/.githooks/pre-commit-vow +9 -0
- package/data/probe/.prompts/engineer.md +41 -0
- package/data/probe/.roomodes +28 -0
- package/data/probe/.windsurfrules +0 -0
- package/data/probe/BASH_TOOL_SUMMARY.md +148 -0
- package/data/probe/BENCHMARKING.md +256 -0
- package/data/probe/CLAUDE.md +226 -0
- package/data/probe/CODE_OF_CONDUCT.md +128 -0
- package/data/probe/CONTRIBUTING.md +193 -0
- package/data/probe/Cargo.toml +120 -0
- package/data/probe/Cross.toml +10 -0
- package/data/probe/DOCKER-README.md +224 -0
- package/data/probe/Dockerfile +32 -0
- package/data/probe/ENHANCED_DEBUG_TELEMETRY.md +188 -0
- package/data/probe/LICENSE +201 -0
- package/data/probe/Makefile +210 -0
- package/data/probe/README.md +824 -0
- package/data/probe/SECURITY.md +67 -0
- package/data/probe/WINDOWS-GUIDE.md +294 -0
- package/data/probe/benches/parsing_benchmarks.rs +370 -0
- package/data/probe/benches/search_benchmarks.rs +599 -0
- package/data/probe/benches/simd_benchmarks.rs +372 -0
- package/data/probe/benches/timing_benchmarks.rs +287 -0
- package/data/probe/build-windows.bat +229 -0
- package/data/probe/codex-config/config.toml +6 -0
- package/data/probe/docs/PERFORMANCE_OPTIMIZATION.md +161 -0
- package/data/probe/examples/cache_demo.rs +46 -0
- package/data/probe/examples/chat/.dockerignore +37 -0
- package/data/probe/examples/chat/ChatSessionManager.js +295 -0
- package/data/probe/examples/chat/Dockerfile +98 -0
- package/data/probe/examples/chat/LICENSE +201 -0
- package/data/probe/examples/chat/LOCAL_IMAGE_SUPPORT.md +195 -0
- package/data/probe/examples/chat/MCP_INTEGRATION.md +400 -0
- package/data/probe/examples/chat/README.md +338 -0
- package/data/probe/examples/chat/TRACING.md +226 -0
- package/data/probe/examples/chat/appTracer.js +968 -0
- package/data/probe/examples/chat/auth.js +76 -0
- package/data/probe/examples/chat/bin/probe-chat.js +13 -0
- package/data/probe/examples/chat/build.js +104 -0
- package/data/probe/examples/chat/cancelRequest.js +84 -0
- package/data/probe/examples/chat/demo-agentic-image-flow.js +88 -0
- package/data/probe/examples/chat/demo-local-images.js +128 -0
- package/data/probe/examples/chat/fileSpanExporter.js +181 -0
- package/data/probe/examples/chat/implement/README.md +228 -0
- package/data/probe/examples/chat/implement/backends/AiderBackend.js +750 -0
- package/data/probe/examples/chat/implement/backends/BaseBackend.js +276 -0
- package/data/probe/examples/chat/implement/backends/ClaudeCodeBackend.js +767 -0
- package/data/probe/examples/chat/implement/backends/MockBackend.js +237 -0
- package/data/probe/examples/chat/implement/backends/registry.js +85 -0
- package/data/probe/examples/chat/implement/core/BackendManager.js +567 -0
- package/data/probe/examples/chat/implement/core/ImplementTool.js +354 -0
- package/data/probe/examples/chat/implement/core/config.js +428 -0
- package/data/probe/examples/chat/implement/core/timeouts.js +58 -0
- package/data/probe/examples/chat/implement/core/utils.js +496 -0
- package/data/probe/examples/chat/implement/types/BackendTypes.js +126 -0
- package/data/probe/examples/chat/index.js +669 -0
- package/data/probe/examples/chat/mcpServer.js +341 -0
- package/data/probe/examples/chat/npm/LICENSE +15 -0
- package/data/probe/examples/chat/npm/README.md +168 -0
- package/data/probe/examples/chat/npm/bin/probe-chat.js +156 -0
- package/data/probe/examples/chat/npm/index.js +259 -0
- package/data/probe/examples/chat/npm/package.json +54 -0
- package/data/probe/examples/chat/package.json +102 -0
- package/data/probe/examples/chat/probeChat.js +456 -0
- package/data/probe/examples/chat/probeTool.js +491 -0
- package/data/probe/examples/chat/storage/JsonChatStorage.js +476 -0
- package/data/probe/examples/chat/telemetry.js +281 -0
- package/data/probe/examples/chat/test/integration/chatFlows.test.js +320 -0
- package/data/probe/examples/chat/test/integration/toolCalling.test.js +471 -0
- package/data/probe/examples/chat/test/mocks/mockLLMProvider.js +269 -0
- package/data/probe/examples/chat/test/test-backends.js +90 -0
- package/data/probe/examples/chat/test/testUtils.js +530 -0
- package/data/probe/examples/chat/test/unit/backendTimeout.test.js +161 -0
- package/data/probe/examples/chat/test/unit/packageFiles.test.js +120 -0
- package/data/probe/examples/chat/test/verify-tests.js +118 -0
- package/data/probe/examples/chat/test-agentic-image-loading.js +294 -0
- package/data/probe/examples/chat/test-ai-sdk-telemetry.js +204 -0
- package/data/probe/examples/chat/test-chat-tracing.js +38 -0
- package/data/probe/examples/chat/test-direct-function.js +49 -0
- package/data/probe/examples/chat/test-file-size-validation.js +103 -0
- package/data/probe/examples/chat/test-full-mcp-integration.js +258 -0
- package/data/probe/examples/chat/test-github-context.txt +12 -0
- package/data/probe/examples/chat/test-hierarchy.js +203 -0
- package/data/probe/examples/chat/test-image-spans.js +37 -0
- package/data/probe/examples/chat/test-local-image-reading.js +176 -0
- package/data/probe/examples/chat/test-mcp-integration.js +136 -0
- package/data/probe/examples/chat/test-mcp-probe-server.js +161 -0
- package/data/probe/examples/chat/test-mcp-with-ai.js +279 -0
- package/data/probe/examples/chat/test-multiple-allowed-dirs.js +111 -0
- package/data/probe/examples/chat/test-probe-mcp-server.js +110 -0
- package/data/probe/examples/chat/test-security-validation.js +145 -0
- package/data/probe/examples/chat/test-simple-tracing.js +32 -0
- package/data/probe/examples/chat/test-trace-verification.js +235 -0
- package/data/probe/examples/chat/test-tracing.js +114 -0
- package/data/probe/examples/chat/tokenCounter.js +419 -0
- package/data/probe/examples/chat/tokenUsageDisplay.js +134 -0
- package/data/probe/examples/chat/webServer.js +1103 -0
- package/data/probe/examples/reranker/Cargo.toml +33 -0
- package/data/probe/examples/reranker/DEBUG_OUTPUT_ANALYSIS.md +71 -0
- package/data/probe/examples/reranker/MODELS.md +66 -0
- package/data/probe/examples/reranker/MODEL_COMPARISON.md +60 -0
- package/data/probe/examples/reranker/MULTI_MODEL_ANALYSIS.md +176 -0
- package/data/probe/examples/reranker/PERFORMANCE_SUMMARY.md +156 -0
- package/data/probe/examples/reranker/README.md +347 -0
- package/data/probe/examples/reranker/RUST_BERT_COMPARISON.md +82 -0
- package/data/probe/examples/reranker/TOKENIZATION_GUIDE.md +120 -0
- package/data/probe/examples/reranker/check_rust_tokenizer.py +108 -0
- package/data/probe/examples/reranker/convert_to_torchscript.py +109 -0
- package/data/probe/examples/reranker/debug_scoring.py +189 -0
- package/data/probe/examples/reranker/debug_tokenization.py +154 -0
- package/data/probe/examples/reranker/download_models.sh +73 -0
- package/data/probe/examples/reranker/requirements.txt +13 -0
- package/data/probe/examples/reranker/run_comprehensive_benchmark.sh +83 -0
- package/data/probe/examples/reranker/rust_bert_test/Cargo.toml +12 -0
- package/data/probe/examples/reranker/rust_bert_test/README.md +54 -0
- package/data/probe/examples/reranker/simple_test.py +50 -0
- package/data/probe/examples/reranker/test_all_models.sh +63 -0
- package/data/probe/examples/reranker/test_bert_results.sh +44 -0
- package/data/probe/examples/reranker/test_cross_encoder.py +334 -0
- package/data/probe/examples/reranker/test_cross_encoder.sh +80 -0
- package/data/probe/examples/reranker/test_exact_comparison.py +151 -0
- package/data/probe/examples/reranker/test_parallel_performance.sh +56 -0
- package/data/probe/examples/reranker/test_scores.py +132 -0
- package/data/probe/install.ps1 +508 -0
- package/data/probe/install.sh +460 -0
- package/data/probe/npm/CLONE_METHOD_EXAMPLES.md +596 -0
- package/data/probe/npm/CONTEXT_COMPACTION.md +303 -0
- package/data/probe/npm/DELEGATE_TOOL_README.md +166 -0
- package/data/probe/npm/MAID_INTEGRATION.md +313 -0
- package/data/probe/npm/MCP_INTEGRATION_SUMMARY.md +241 -0
- package/data/probe/npm/README.md +824 -0
- package/data/probe/npm/bin/.gitignore +7 -0
- package/data/probe/npm/bin/.gitkeep +0 -0
- package/data/probe/npm/bin/README.md +12 -0
- package/data/probe/npm/bin/probe +167 -0
- package/data/probe/npm/docs/CLAUDE_CODE_INTEGRATION.md +414 -0
- package/data/probe/npm/docs/CODEX_INTEGRATION.md +502 -0
- package/data/probe/npm/docs/EDIT_CREATE_TOOLS.md +233 -0
- package/data/probe/npm/docs/RETRY_AND_FALLBACK.md +674 -0
- package/data/probe/npm/example-usage.js +335 -0
- package/data/probe/npm/examples/multi-engine-demo.js +117 -0
- package/data/probe/npm/examples/probe-agent-cli.js +113 -0
- package/data/probe/npm/examples/test-agent-edit.js +114 -0
- package/data/probe/npm/examples/test-edit-create.js +120 -0
- package/data/probe/npm/examples/test-edit-direct.js +114 -0
- package/data/probe/npm/index.d.ts +744 -0
- package/data/probe/npm/jest.config.js +52 -0
- package/data/probe/npm/package.json +117 -0
- package/data/probe/npm/scripts/build-agent.cjs +75 -0
- package/data/probe/npm/scripts/build-cjs.js +124 -0
- package/data/probe/npm/scripts/build-mcp.cjs +36 -0
- package/data/probe/npm/scripts/postinstall.js +216 -0
- package/data/probe/npm/test-codex-e2e.js +78 -0
- package/data/probe/npm/test-download-lock.js +109 -0
- package/data/probe/npm/test-grep-security.js +94 -0
- package/data/probe/npm/test-grep-simplified.js +63 -0
- package/data/probe/npm/test-grep.js +51 -0
- package/data/probe/npm/tests/README.md +96 -0
- package/data/probe/npm/tests/agent-compact-history.test.js +174 -0
- package/data/probe/npm/tests/allow-tests-default.test.js +151 -0
- package/data/probe/npm/tests/contextCompactor.test.js +498 -0
- package/data/probe/npm/tests/delegate-config.test.js +353 -0
- package/data/probe/npm/tests/delegate-integration.test.js +348 -0
- package/data/probe/npm/tests/extractor-integration.test.js +162 -0
- package/data/probe/npm/tests/extractor.test.js +317 -0
- package/data/probe/npm/tests/fixtures/sampleDiagrams.js +267 -0
- package/data/probe/npm/tests/integration/claude-code-auto-fallback.spec.js +148 -0
- package/data/probe/npm/tests/integration/claude-code-multi-step.spec.js +127 -0
- package/data/probe/npm/tests/integration/claude-code-tool-events.spec.js +163 -0
- package/data/probe/npm/tests/integration/codex-auto-fallback.spec.js +191 -0
- package/data/probe/npm/tests/integration/codex-tool-events.spec.js +147 -0
- package/data/probe/npm/tests/integration/examplesChatMcp.test.js +402 -0
- package/data/probe/npm/tests/integration/mcpDotenvSupport.test.js +174 -0
- package/data/probe/npm/tests/integration/mcpErrorHandling.test.js +566 -0
- package/data/probe/npm/tests/integration/mcpRobustness.test.js +564 -0
- package/data/probe/npm/tests/integration/mcpStdoutPurity.test.js +355 -0
- package/data/probe/npm/tests/integration/probeAgentMcp.test.js +398 -0
- package/data/probe/npm/tests/integration/retryFallback.test.js +368 -0
- package/data/probe/npm/tests/integration/schema-in-initial-message.test.js +318 -0
- package/data/probe/npm/tests/integration/schema-validation-loop-prevention.test.js +244 -0
- package/data/probe/npm/tests/integration/schemaRetryLogic.test.js +94 -0
- package/data/probe/npm/tests/integration/validationFlow.test.js +329 -0
- package/data/probe/npm/tests/manual/test-codex-basic.js +110 -0
- package/data/probe/npm/tests/mcp/mcpClientManager.test.js +614 -0
- package/data/probe/npm/tests/mcp/mcpConfig.test.js +359 -0
- package/data/probe/npm/tests/mcp/mcpXmlBridge.test.js +436 -0
- package/data/probe/npm/tests/mcp/mockMcpServer.js +510 -0
- package/data/probe/npm/tests/mcp-strict-syntax.test.js +319 -0
- package/data/probe/npm/tests/mermaidQuoteEscaping.test.js +214 -0
- package/data/probe/npm/tests/nestedQuoteFix.test.js +40 -0
- package/data/probe/npm/tests/setup.js +46 -0
- package/data/probe/npm/tests/unit/allowed-tools.test.js +513 -0
- package/data/probe/npm/tests/unit/attempt-completion-closing-tag-in-content.test.js +188 -0
- package/data/probe/npm/tests/unit/attemptCompletionJsonFix.test.js +238 -0
- package/data/probe/npm/tests/unit/attemptCompletionJsonIssue.test.js +128 -0
- package/data/probe/npm/tests/unit/backtickAutoFix.test.js +35 -0
- package/data/probe/npm/tests/unit/bash-probe-agent-integration.test.js +389 -0
- package/data/probe/npm/tests/unit/bash-simple-commands.test.js +324 -0
- package/data/probe/npm/tests/unit/bash-tool-comprehensive.test.js +371 -0
- package/data/probe/npm/tests/unit/bash-tool-integration.test.js +310 -0
- package/data/probe/npm/tests/unit/bash-tool.test.js +341 -0
- package/data/probe/npm/tests/unit/completion-prompt.test.js +379 -0
- package/data/probe/npm/tests/unit/cwd-path-options.test.js +287 -0
- package/data/probe/npm/tests/unit/delegate-limits.test.js +422 -0
- package/data/probe/npm/tests/unit/direct-content-attempt-completion.test.js +235 -0
- package/data/probe/npm/tests/unit/edit-create-tools.test.js +609 -0
- package/data/probe/npm/tests/unit/enhancedMermaidValidation.test.js +577 -0
- package/data/probe/npm/tests/unit/extract-content.test.js +83 -0
- package/data/probe/npm/tests/unit/extract-multiple-targets.test.js +89 -0
- package/data/probe/npm/tests/unit/fallbackManager.test.js +442 -0
- package/data/probe/npm/tests/unit/githubCompatibilityValidation.test.js +258 -0
- package/data/probe/npm/tests/unit/imageConfig.test.js +149 -0
- package/data/probe/npm/tests/unit/imagePathResolution.test.js +345 -0
- package/data/probe/npm/tests/unit/json-fixing-agent.test.js +238 -0
- package/data/probe/npm/tests/unit/json-validation-enhanced-errors.test.js +199 -0
- package/data/probe/npm/tests/unit/jsonValidationInfiniteLoopFix.test.js +228 -0
- package/data/probe/npm/tests/unit/maidIntegration.test.js +139 -0
- package/data/probe/npm/tests/unit/maxIterationsWarning.test.js +195 -0
- package/data/probe/npm/tests/unit/mermaidEdgeLabelFix.test.js +161 -0
- package/data/probe/npm/tests/unit/mermaidHtmlEntities.test.js +76 -0
- package/data/probe/npm/tests/unit/mermaidInfiniteLoopFix.test.js +64 -0
- package/data/probe/npm/tests/unit/mermaidValidation.test.js +723 -0
- package/data/probe/npm/tests/unit/mermaidValidationVisorExample.test.js +309 -0
- package/data/probe/npm/tests/unit/probe-agent-clone-realistic.test.js +643 -0
- package/data/probe/npm/tests/unit/probe-agent-clone.test.js +476 -0
- package/data/probe/npm/tests/unit/probe-agent-delegate.test.js +400 -0
- package/data/probe/npm/tests/unit/probe-agent-model-option.test.js +118 -0
- package/data/probe/npm/tests/unit/probeTool-security.test.js +283 -0
- package/data/probe/npm/tests/unit/readImageTool.test.js +418 -0
- package/data/probe/npm/tests/unit/retryManager.test.js +317 -0
- package/data/probe/npm/tests/unit/schema-aware-reminders.test.js +288 -0
- package/data/probe/npm/tests/unit/schemaDefinitionDetection.test.js +115 -0
- package/data/probe/npm/tests/unit/schemaUtils.test.js +1268 -0
- package/data/probe/npm/tests/unit/simpleTelemetry.test.js +282 -0
- package/data/probe/npm/tests/unit/simplified-attempt-completion.test.js +274 -0
- package/data/probe/npm/tests/unit/single-quote-json-bug.test.js +231 -0
- package/data/probe/npm/tests/unit/subgraphAutoFix.test.js +110 -0
- package/data/probe/npm/tests/unit/system-prompt.test.js +32 -0
- package/data/probe/npm/tests/unit/types-probe-agent-options.test.js +42 -0
- package/data/probe/npm/tests/unit/xmlParsing.test.js +720 -0
- package/data/probe/npm/tsconfig.json +21 -0
- package/data/probe/result1.txt +19 -0
- package/data/probe/result2.txt +26 -0
- package/data/probe/scripts/benchmark.sh +270 -0
- package/data/probe/scripts/cache_memory_analysis.rs +844 -0
- package/data/probe/scripts/claude-hook-wrapper.sh +56 -0
- package/data/probe/site/.env.example +10 -0
- package/data/probe/site/DEPLOYMENT.md +86 -0
- package/data/probe/site/README.md +183 -0
- package/data/probe/site/adding-languages.md +135 -0
- package/data/probe/site/ai-chat.md +427 -0
- package/data/probe/site/ai-integration.md +1488 -0
- package/data/probe/site/blog/agentic-flow-custom-xml-protocol.md +407 -0
- package/data/probe/site/blog/index.md +118 -0
- package/data/probe/site/blog/v0.6.0-release.md +426 -0
- package/data/probe/site/blog.md +8 -0
- package/data/probe/site/changelog.md +200 -0
- package/data/probe/site/cli-mode.md +437 -0
- package/data/probe/site/code-extraction.md +436 -0
- package/data/probe/site/contributing/README.md +9 -0
- package/data/probe/site/contributing/documentation-cross-references.md +215 -0
- package/data/probe/site/contributing/documentation-maintenance.md +275 -0
- package/data/probe/site/contributing/documentation-structure.md +75 -0
- package/data/probe/site/documentation-cross-references.md +215 -0
- package/data/probe/site/documentation-guide.md +132 -0
- package/data/probe/site/documentation-maintenance.md +275 -0
- package/data/probe/site/features.md +147 -0
- package/data/probe/site/how-it-works.md +118 -0
- package/data/probe/site/index.md +175 -0
- package/data/probe/site/index.md.bak +133 -0
- package/data/probe/site/installation.md +235 -0
- package/data/probe/site/integrations/docker.md +248 -0
- package/data/probe/site/integrations/github-actions.md +413 -0
- package/data/probe/site/language-support-overview.md +168 -0
- package/data/probe/site/mcp-integration.md +587 -0
- package/data/probe/site/mcp-server.md +304 -0
- package/data/probe/site/navigation-structure.md +76 -0
- package/data/probe/site/nodejs-sdk.md +798 -0
- package/data/probe/site/output-formats.md +625 -0
- package/data/probe/site/package.json +21 -0
- package/data/probe/site/public/_headers +28 -0
- package/data/probe/site/public/_redirects +11 -0
- package/data/probe/site/quick-start.md +289 -0
- package/data/probe/site/search-functionality.md +291 -0
- package/data/probe/site/search-reference.md +291 -0
- package/data/probe/site/supported-languages.md +215 -0
- package/data/probe/site/use-cases/README.md +8 -0
- package/data/probe/site/use-cases/advanced-cli.md +253 -0
- package/data/probe/site/use-cases/ai-code-editors.md +239 -0
- package/data/probe/site/use-cases/building-ai-tools.md +529 -0
- package/data/probe/site/use-cases/cli-ai-workflows.md +285 -0
- package/data/probe/site/use-cases/deploying-probe-web-interface.md +255 -0
- package/data/probe/site/use-cases/integrating-probe-into-ai-code-editors.md +161 -0
- package/data/probe/site/use-cases/nodejs-sdk.md +596 -0
- package/data/probe/site/use-cases/team-chat.md +350 -0
- package/data/probe/site/web-interface.md +434 -0
- package/data/probe/site/wrangler.toml +9 -0
- package/data/probe/test-api-key.sh +1 -0
- package/data/probe/test-probe-implementation/hello.js +7 -0
- package/data/probe/test_cases/demonstrate_early_termination_issues.sh +176 -0
- package/data/probe/test_cases/early_termination_issues.rs +533 -0
- package/data/probe/test_data/test_nested_struct.go +26 -0
- package/data/probe/tests/README.md +286 -0
- package/data/probe/tests/README_search_determinism_tests.md +116 -0
- package/data/probe/tests/adjacent_comment_test.rs +152 -0
- package/data/probe/tests/apostrophe_handling_tests.rs +132 -0
- package/data/probe/tests/block_filtering_with_ast_tests.rs +669 -0
- package/data/probe/tests/block_merging_tests.rs +396 -0
- package/data/probe/tests/c_outline_format_tests.rs +2179 -0
- package/data/probe/tests/cache_invalidation_issues.rs.disabled +682 -0
- package/data/probe/tests/cache_order_tests.rs +147 -0
- package/data/probe/tests/cache_query_scoping_tests.rs +221 -0
- package/data/probe/tests/cli_tests.rs +680 -0
- package/data/probe/tests/comment_context_integration_test.rs +240 -0
- package/data/probe/tests/common.rs +33 -0
- package/data/probe/tests/complex_block_merging_tests.rs +599 -0
- package/data/probe/tests/complex_query_block_filtering_tests.rs +422 -0
- package/data/probe/tests/control_flow_closing_braces_test.rs +91 -0
- package/data/probe/tests/cpp_outline_format_tests.rs +1507 -0
- package/data/probe/tests/csharp_outline_format_tests.rs +941 -0
- package/data/probe/tests/elastic_query_integration_tests.rs +922 -0
- package/data/probe/tests/extract_command_tests.rs +1848 -0
- package/data/probe/tests/extract_deduplication_tests.rs +146 -0
- package/data/probe/tests/extract_input_file_tests.rs +84 -0
- package/data/probe/tests/extract_prompt_tests.rs +102 -0
- package/data/probe/tests/filename_search_tests.rs +96 -0
- package/data/probe/tests/fixtures/user/AssemblyInfo.cs +3 -0
- package/data/probe/tests/github_extract_tests.rs +234 -0
- package/data/probe/tests/go_comment_test.rs +253 -0
- package/data/probe/tests/go_outline_format_tests.rs +2587 -0
- package/data/probe/tests/go_path_resolver_tests.rs +96 -0
- package/data/probe/tests/html_outline_format_tests.rs +637 -0
- package/data/probe/tests/integration_tests.rs +837 -0
- package/data/probe/tests/ip_whitelist_test.rs +148 -0
- package/data/probe/tests/java_outline_format_tests.rs +1611 -0
- package/data/probe/tests/javascript_extract_tests.rs +315 -0
- package/data/probe/tests/javascript_outline_format_tests.rs +1464 -0
- package/data/probe/tests/json_format_tests.rs +436 -0
- package/data/probe/tests/json_schema_validation_tests.rs +450 -0
- package/data/probe/tests/lib_usage.rs +60 -0
- package/data/probe/tests/line_comment_context_extension_test.rs +459 -0
- package/data/probe/tests/line_map_cache_tests.rs +114 -0
- package/data/probe/tests/markdown_integration_tests.rs +190 -0
- package/data/probe/tests/mocks/test_ip_whitelist.go +11 -0
- package/data/probe/tests/mocks/test_object.js +27 -0
- package/data/probe/tests/mocks/test_struct.go +50 -0
- package/data/probe/tests/multi_keyword_pattern_tests.rs +464 -0
- package/data/probe/tests/multi_language_syntax_integration_tests.rs +218 -0
- package/data/probe/tests/multiple_capture_groups_tests.rs +169 -0
- package/data/probe/tests/negative_compound_word_tests.rs +246 -0
- package/data/probe/tests/nested_symbol_extraction_tests.rs +99 -0
- package/data/probe/tests/outline_cross_file_interference_test.rs +335 -0
- package/data/probe/tests/outline_keyword_preservation_test.rs +67 -0
- package/data/probe/tests/output_format_edge_cases_tests.rs +693 -0
- package/data/probe/tests/parallel_extraction_tests.rs +178 -0
- package/data/probe/tests/parallel_search_tests.rs +355 -0
- package/data/probe/tests/path_resolver_tests.rs +698 -0
- package/data/probe/tests/php_outline_format_extended_tests.rs +928 -0
- package/data/probe/tests/php_outline_format_tests.rs +768 -0
- package/data/probe/tests/property_tests.proptest-regressions +9 -0
- package/data/probe/tests/property_tests.rs +118 -0
- package/data/probe/tests/python_outline_format_tests.rs +1538 -0
- package/data/probe/tests/query_command_json_tests.rs +438 -0
- package/data/probe/tests/query_command_tests.rs +232 -0
- package/data/probe/tests/query_command_xml_tests.rs +569 -0
- package/data/probe/tests/quoted_term_with_negative_keyword_tests.rs +216 -0
- package/data/probe/tests/required_terms_filename_tests.rs +116 -0
- package/data/probe/tests/ruby_outline_format_tests.rs +1011 -0
- package/data/probe/tests/rust_line_comment_context_test.rs +151 -0
- package/data/probe/tests/rust_outline_format_enhanced_tests.rs +725 -0
- package/data/probe/tests/rust_outline_format_tests.rs +843 -0
- package/data/probe/tests/schemas/xml_output_schema.xsd +38 -0
- package/data/probe/tests/search_determinism_tests.rs +451 -0
- package/data/probe/tests/search_hints_tests.rs +253 -0
- package/data/probe/tests/special_character_escaping_tests.rs +417 -0
- package/data/probe/tests/stemming_compound_word_filtering_tests.rs +535 -0
- package/data/probe/tests/strict_elastic_syntax_tests.rs +404 -0
- package/data/probe/tests/swift_outline_format_tests.rs +3319 -0
- package/data/probe/tests/symbols_tests.rs +166 -0
- package/data/probe/tests/test_file.rs +45 -0
- package/data/probe/tests/test_tokenize.rs +28 -0
- package/data/probe/tests/timeout_tests.rs +82 -0
- package/data/probe/tests/tokenization_tests.rs +195 -0
- package/data/probe/tests/tokenized_block_filtering_tests.rs +174 -0
- package/data/probe/tests/typescript_extract_tests.rs +214 -0
- package/data/probe/tests/typescript_outline_format_tests.rs +2188 -0
- package/data/probe/tests/xml_format_tests.rs +568 -0
- package/data/probe/tests/xml_schema_validation_tests.rs +497 -0
- package/data/scripts/postinstall.mjs +9 -0
- package/data/scripts/set-version.js +0 -0
- package/data/scripts/wiki-build.sh +111 -0
- package/data/scripts/wiki-deploy.sh +73 -0
- package/data/serve.json +12 -0
- package/data/test/demo-dynamic.html +134 -0
- package/data/test/demo-esm.html +105 -0
- package/data/test/demo-iife.html +78 -0
- package/data/tsconfig.json +7 -0
- package/data/vite.server.ts +483 -0
- package/data/vitest.config.ts +40 -0
- package/data/wiki/Home.md +58 -0
- package/data/wiki/_Sidebar.md +39 -0
- package/docs-mcp.config.json +20 -0
- package/package.json +56 -0
- package/src/config.js +111 -0
- package/src/index.js +395 -0
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
# BERT Reranker Example
|
|
2
|
+
|
|
3
|
+
A complete working Rust implementation of a BERT-based document reranker using the Candle framework. This example demonstrates how to use transformer models for document reranking tasks, specifically using the ms-marco-MiniLM-L-2-v2 model.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
This implementation provides a cross-encoder based reranker that:
|
|
8
|
+
- Loads pre-trained BERT models from HuggingFace Hub
|
|
9
|
+
- Processes query-document pairs through the transformer
|
|
10
|
+
- Computes relevance scores for ranking
|
|
11
|
+
- Sorts documents by relevance to the query
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- **Pure Rust Implementation**: Uses the Candle framework for ML inference
|
|
16
|
+
- **HuggingFace Integration**: Automatic model and tokenizer downloading
|
|
17
|
+
- **Cross-Encoder Architecture**: Proper query-document pair encoding
|
|
18
|
+
- **Flexible Model Support**: Works with various BERT-based reranking models
|
|
19
|
+
- **Interactive Mode**: Test reranking with custom queries and documents
|
|
20
|
+
- **Command Line Interface**: Easy to use from command line or scripts
|
|
21
|
+
|
|
22
|
+
## Installation and Setup
|
|
23
|
+
|
|
24
|
+
### Prerequisites
|
|
25
|
+
|
|
26
|
+
- Rust 1.70 or later
|
|
27
|
+
- Internet connection (for model downloading)
|
|
28
|
+
|
|
29
|
+
### Building the Project
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
cd examples/reranker
|
|
33
|
+
cargo build --release
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
### Basic Usage with Default Documents
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Use the default ms-marco-MiniLM-L-2-v2 model
|
|
42
|
+
cargo run --release -- --query "machine learning"
|
|
43
|
+
|
|
44
|
+
# Or run the binary directly after building
|
|
45
|
+
./target/release/reranker --query "rust programming"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Using Custom Documents
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
cargo run --release -- \
|
|
52
|
+
--query "natural language processing" \
|
|
53
|
+
--documents "BERT is a transformer model,Python is a programming language,NLP involves text processing,Rust is systems programming"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Interactive Mode
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
cargo run --release -- --query "your query here" --interactive
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
This will prompt you to enter documents one by one, then rerank them.
|
|
63
|
+
|
|
64
|
+
### Using Different Models
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Use a different cross-encoder model
|
|
68
|
+
cargo run --release -- \
|
|
69
|
+
--model "cross-encoder/ms-marco-MiniLM-L-6-v2" \
|
|
70
|
+
--query "information retrieval"
|
|
71
|
+
|
|
72
|
+
# Use PyTorch weights instead of SafeTensors
|
|
73
|
+
cargo run --release -- \
|
|
74
|
+
--model "cross-encoder/ms-marco-MiniLM-L-2-v2" \
|
|
75
|
+
--use-pth \
|
|
76
|
+
--query "document ranking"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Supported Models
|
|
80
|
+
|
|
81
|
+
This implementation works with cross-encoder models from HuggingFace Hub. Recommended models:
|
|
82
|
+
|
|
83
|
+
- `cross-encoder/ms-marco-MiniLM-L-2-v2` (default, fast and efficient)
|
|
84
|
+
- `cross-encoder/ms-marco-MiniLM-L-6-v2` (larger, potentially more accurate)
|
|
85
|
+
- `cross-encoder/ms-marco-MiniLM-L-12-v2` (largest, highest accuracy)
|
|
86
|
+
|
|
87
|
+
## Command Line Options
|
|
88
|
+
|
|
89
|
+
- `--model, -m`: HuggingFace model ID (default: `cross-encoder/ms-marco-MiniLM-L-2-v2`)
|
|
90
|
+
- `--revision, -r`: Model revision/branch (default: `main`)
|
|
91
|
+
- `--use-pth`: Use PyTorch weights instead of SafeTensors
|
|
92
|
+
- `--query, -q`: Search query (required)
|
|
93
|
+
- `--documents, -d`: Comma-separated list of documents to rerank
|
|
94
|
+
- `--interactive, -i`: Run in interactive mode
|
|
95
|
+
|
|
96
|
+
## Example Output
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
Initializing BERT Reranker...
|
|
100
|
+
Model: cross-encoder/ms-marco-MiniLM-L-2-v2
|
|
101
|
+
Revision: main
|
|
102
|
+
Using PyTorch weights: false
|
|
103
|
+
|
|
104
|
+
=== Example Usage ===
|
|
105
|
+
Query: machine learning
|
|
106
|
+
Documents to rerank:
|
|
107
|
+
1. Rust is a systems programming language focused on safety and performance.
|
|
108
|
+
2. Python is a high-level programming language known for its simplicity.
|
|
109
|
+
3. Machine learning involves training algorithms on data to make predictions.
|
|
110
|
+
4. BERT is a transformer-based model for natural language understanding.
|
|
111
|
+
5. The Candle framework provides machine learning capabilities in Rust.
|
|
112
|
+
6. Cross-encoders are used for reranking tasks in information retrieval.
|
|
113
|
+
7. Tokenization is the process of breaking text into individual tokens.
|
|
114
|
+
8. Neural networks consist of interconnected nodes that process information.
|
|
115
|
+
|
|
116
|
+
Loading BERT reranker model: cross-encoder/ms-marco-MiniLM-L-2-v2
|
|
117
|
+
Config file: "/Users/username/.cache/huggingface/hub/models--cross-encoder--ms-marco-MiniLM-L-2-v2/snapshots/main/config.json"
|
|
118
|
+
Tokenizer file: "/Users/username/.cache/huggingface/hub/models--cross-encoder--ms-marco-MiniLM-L-2-v2/snapshots/main/tokenizer.json"
|
|
119
|
+
Weights file: "/Users/username/.cache/huggingface/hub/models--cross-encoder--ms-marco-MiniLM-L-2-v2/snapshots/main/model.safetensors"
|
|
120
|
+
BERT model loaded successfully
|
|
121
|
+
|
|
122
|
+
Reranking 8 documents for query: 'machine learning'
|
|
123
|
+
Reranking completed
|
|
124
|
+
|
|
125
|
+
=== Reranking Results ===
|
|
126
|
+
Documents ranked by relevance to query:
|
|
127
|
+
1. #3: 2.8934 - Machine learning involves training algorithms on data to make predictions.
|
|
128
|
+
2. #5: 2.1203 - The Candle framework provides machine learning capabilities in Rust.
|
|
129
|
+
3. #4: 1.9876 - BERT is a transformer-based model for natural language understanding.
|
|
130
|
+
4. #8: 1.7432 - Neural networks consist of interconnected nodes that process information.
|
|
131
|
+
5. #6: 1.5621 - Cross-encoders are used for reranking tasks in information retrieval.
|
|
132
|
+
6. #2: 0.9834 - Python is a high-level programming language known for its simplicity.
|
|
133
|
+
7. #7: 0.8976 - Tokenization is the process of breaking text into individual tokens.
|
|
134
|
+
8. #1: 0.7654 - Rust is a systems programming language focused on safety and performance.
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Architecture Details
|
|
138
|
+
|
|
139
|
+
### Cross-Encoder Approach
|
|
140
|
+
|
|
141
|
+
This implementation uses a cross-encoder architecture where:
|
|
142
|
+
1. Query and document are concatenated with a `[SEP]` token
|
|
143
|
+
2. The combined text is tokenized and fed through BERT
|
|
144
|
+
3. The `[CLS]` token embedding is used to compute a relevance score
|
|
145
|
+
4. Documents are ranked by their scores
|
|
146
|
+
|
|
147
|
+
### Model Components
|
|
148
|
+
|
|
149
|
+
- **Tokenizer**: Converts text to tokens using HuggingFace tokenizers
|
|
150
|
+
- **BERT Model**: Transformer encoder for processing text
|
|
151
|
+
- **Scoring**: Uses the CLS token embedding sum as the relevance score
|
|
152
|
+
|
|
153
|
+
### Performance Considerations
|
|
154
|
+
|
|
155
|
+
- **CPU Inference**: Runs on CPU by default (GPU support can be added)
|
|
156
|
+
- **Memory Usage**: Models are loaded once and reused for multiple queries
|
|
157
|
+
- **Caching**: HuggingFace Hub automatically caches downloaded models
|
|
158
|
+
|
|
159
|
+
## Extending the Example
|
|
160
|
+
|
|
161
|
+
### Adding GPU Support
|
|
162
|
+
|
|
163
|
+
To enable GPU acceleration, modify the device initialization:
|
|
164
|
+
|
|
165
|
+
```rust
|
|
166
|
+
let device = Device::new_cuda(0)?; // Use GPU 0
|
|
167
|
+
// or
|
|
168
|
+
let device = Device::new_metal(0)?; // Use Metal on macOS
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Custom Scoring Functions
|
|
172
|
+
|
|
173
|
+
The current implementation uses a simple sum of CLS embeddings. For production use, consider:
|
|
174
|
+
- Adding a linear classification head
|
|
175
|
+
- Using cosine similarity between query and document embeddings
|
|
176
|
+
- Implementing attention-based scoring mechanisms
|
|
177
|
+
|
|
178
|
+
### Batch Processing
|
|
179
|
+
|
|
180
|
+
For better performance with multiple documents, implement batch processing:
|
|
181
|
+
|
|
182
|
+
```rust
|
|
183
|
+
// Process multiple query-document pairs simultaneously
|
|
184
|
+
fn batch_rerank(&self, query: &str, documents: &[&str]) -> Result<Vec<f32>> {
|
|
185
|
+
// Implementation for batch processing
|
|
186
|
+
}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Troubleshooting
|
|
190
|
+
|
|
191
|
+
### Common Issues
|
|
192
|
+
|
|
193
|
+
1. **Model Download Failures**
|
|
194
|
+
- Check internet connection
|
|
195
|
+
- Verify model ID exists on HuggingFace Hub
|
|
196
|
+
- Try using `--use-pth` flag if SafeTensors download fails
|
|
197
|
+
- **For testing**: Use the demo version (`./target/release/demo`) which doesn't require model downloads
|
|
198
|
+
|
|
199
|
+
2. **Memory Issues**
|
|
200
|
+
- Use smaller models (L-2 instead of L-12)
|
|
201
|
+
- Process documents in smaller batches
|
|
202
|
+
- Reduce sequence length in tokenizer
|
|
203
|
+
|
|
204
|
+
3. **Performance Issues**
|
|
205
|
+
- Enable GPU support if available
|
|
206
|
+
- Use release builds (`cargo build --release`)
|
|
207
|
+
- Consider model quantization for faster inference
|
|
208
|
+
|
|
209
|
+
4. **HuggingFace Hub API Issues**
|
|
210
|
+
- Some models may have download restrictions or require authentication
|
|
211
|
+
- The demo version provides the same interface without requiring model downloads
|
|
212
|
+
- Check HuggingFace Hub status if experiencing consistent download failures
|
|
213
|
+
|
|
214
|
+
### Debug Mode
|
|
215
|
+
|
|
216
|
+
Enable debug logging by setting the `RUST_LOG` environment variable:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
RUST_LOG=debug cargo run --release -- --query "your query"
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Integration with Code Search
|
|
223
|
+
|
|
224
|
+
This reranker can be integrated with the main probe code search tool to improve result relevance:
|
|
225
|
+
|
|
226
|
+
```rust
|
|
227
|
+
// Example integration
|
|
228
|
+
let search_results = probe::search("function authentication")?;
|
|
229
|
+
let documents: Vec<&str> = search_results.iter().map(|r| r.content.as_str()).collect();
|
|
230
|
+
let reranked = reranker.rerank("user authentication", &documents)?;
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Testing
|
|
234
|
+
|
|
235
|
+
### Demo Version (No Model Download Required)
|
|
236
|
+
|
|
237
|
+
For quick testing without downloading models, use the demo version:
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
# Build both the real and demo versions
|
|
241
|
+
cargo build --release
|
|
242
|
+
|
|
243
|
+
# Test the demo version with mock reranking
|
|
244
|
+
./target/release/demo --query "machine learning"
|
|
245
|
+
|
|
246
|
+
# Test interactive mode
|
|
247
|
+
./target/release/demo --query "neural networks" --interactive
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
The demo version uses simple word overlap instead of BERT models and demonstrates the complete interface.
|
|
251
|
+
|
|
252
|
+
### Testing with Real Models
|
|
253
|
+
|
|
254
|
+
Run the test suite:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
cargo test
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
For integration tests with actual models:
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
cargo test --release -- --ignored
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## Contributing
|
|
267
|
+
|
|
268
|
+
This is an example implementation demonstrating BERT reranking with Candle. For production use, consider:
|
|
269
|
+
- Adding comprehensive error handling
|
|
270
|
+
- Implementing proper cross-encoder head architecture
|
|
271
|
+
- Adding support for different similarity metrics
|
|
272
|
+
- Optimizing for batch processing and GPU acceleration
|
|
273
|
+
|
|
274
|
+
## Python Cross-Encoder Testing
|
|
275
|
+
|
|
276
|
+
For debugging and comparing Python vs Rust implementations, several Python testing tools are provided:
|
|
277
|
+
|
|
278
|
+
### Comprehensive Testing Script
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
# Run comprehensive cross-encoder testing
|
|
282
|
+
./test_cross_encoder.sh
|
|
283
|
+
|
|
284
|
+
# Or run Python script directly (requires dependencies)
|
|
285
|
+
python3 test_cross_encoder.py
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
This script:
|
|
289
|
+
- Tests both `transformers` and `sentence-transformers` libraries
|
|
290
|
+
- Shows detailed tokenization analysis (token IDs, attention masks, special tokens)
|
|
291
|
+
- Compares scores between relevant and irrelevant queries
|
|
292
|
+
- Saves results to JSON for further analysis
|
|
293
|
+
- Provides debugging recommendations for Rust implementation
|
|
294
|
+
|
|
295
|
+
### Quick Debugging Script
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
# Run focused debugging tests
|
|
299
|
+
python3 debug_scoring.py
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
This minimal script:
|
|
303
|
+
- Tests hardcoded query-document pairs
|
|
304
|
+
- Shows raw logits and final scores
|
|
305
|
+
- Easy to modify for specific test cases
|
|
306
|
+
- Highlights score differences and discrimination quality
|
|
307
|
+
|
|
308
|
+
### Dependencies
|
|
309
|
+
|
|
310
|
+
Install Python dependencies:
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
pip3 install -r requirements.txt
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
Required packages:
|
|
317
|
+
- `torch` - PyTorch for model inference
|
|
318
|
+
- `transformers` - HuggingFace transformers library
|
|
319
|
+
- `sentence-transformers` - Cross-encoder wrapper (optional but recommended)
|
|
320
|
+
- `numpy` - Numerical operations
|
|
321
|
+
|
|
322
|
+
### Test Cases
|
|
323
|
+
|
|
324
|
+
Both scripts test these scenarios by default:
|
|
325
|
+
- **Relevant Query**: "how does authentication work"
|
|
326
|
+
- **Irrelevant Query**: "foobar random nonsense gibberish"
|
|
327
|
+
- **Document**: Authentication-related text snippet
|
|
328
|
+
|
|
329
|
+
Expected behavior:
|
|
330
|
+
- Relevant query should score >0.5 (high relevance)
|
|
331
|
+
- Irrelevant query should score <0.5 (low relevance)
|
|
332
|
+
- Score difference should be significant (>0.1)
|
|
333
|
+
|
|
334
|
+
### Debugging Rust Implementation
|
|
335
|
+
|
|
336
|
+
Use these Python scripts to debug Rust cross-encoder issues:
|
|
337
|
+
|
|
338
|
+
1. **Compare tokenization**: Check if token IDs match between Python and Rust
|
|
339
|
+
2. **Compare raw logits**: Verify model outputs before activation functions
|
|
340
|
+
3. **Compare final scores**: Check if score calculation methods are identical
|
|
341
|
+
4. **Model configuration**: Ensure same model version and weights are loaded
|
|
342
|
+
|
|
343
|
+
The Python scripts provide detailed output to help identify where discrepancies occur.
|
|
344
|
+
|
|
345
|
+
## License
|
|
346
|
+
|
|
347
|
+
This example follows the same license as the main probe project.
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Rust-BERT vs Candle for Cross-Encoders
|
|
2
|
+
|
|
3
|
+
## Summary
|
|
4
|
+
|
|
5
|
+
After investigating rust-bert for cross-encoder support, here are the key findings:
|
|
6
|
+
|
|
7
|
+
### rust-bert Limitations for Cross-Encoders
|
|
8
|
+
|
|
9
|
+
1. **No Native Cross-Encoder Support**: rust-bert doesn't have a dedicated cross-encoder pipeline
|
|
10
|
+
2. **Classification Focus**: The sequence classification pipeline expects discrete labels (POSITIVE/NEGATIVE), not continuous relevance scores
|
|
11
|
+
3. **Model Format**: Requires TorchScript (.ot) format, not standard PyTorch .bin files
|
|
12
|
+
4. **Architecture Mismatch**: Cross-encoders output a single score, but rust-bert's classification expects label probabilities
|
|
13
|
+
|
|
14
|
+
### Our Candle Implementation Advantages
|
|
15
|
+
|
|
16
|
+
1. **Direct PyTorch Support**: Loads .bin files directly from HuggingFace
|
|
17
|
+
2. **Custom Architecture**: We implement the exact cross-encoder architecture
|
|
18
|
+
3. **Raw Scores**: Returns raw logits for scoring, which is what cross-encoders need
|
|
19
|
+
4. **Flexibility**: Full control over tokenization and model behavior
|
|
20
|
+
|
|
21
|
+
## Model Availability
|
|
22
|
+
|
|
23
|
+
The MS-MARCO models on HuggingFace include:
|
|
24
|
+
- PyTorch formats: `pytorch_model.bin`, `model.safetensors`
|
|
25
|
+
- ONNX formats: Multiple optimized ONNX versions
|
|
26
|
+
- No TorchScript (.ot) versions available
|
|
27
|
+
|
|
28
|
+
## Conversion Options
|
|
29
|
+
|
|
30
|
+
### 1. PyTorch to TorchScript
|
|
31
|
+
```python
|
|
32
|
+
# See convert_to_torchscript.py
|
|
33
|
+
traced_model = torch.jit.trace(model, example_inputs)
|
|
34
|
+
traced_model.save("rust_model.ot")
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### 2. Use ONNX Runtime
|
|
38
|
+
Instead of rust-bert, consider using ONNX Runtime with Rust bindings:
|
|
39
|
+
```toml
|
|
40
|
+
[dependencies]
|
|
41
|
+
ort = "1.16" # ONNX Runtime for Rust
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### 3. Continue with Candle
|
|
45
|
+
Our current Candle implementation is actually well-suited for cross-encoders.
|
|
46
|
+
|
|
47
|
+
## Recommendation
|
|
48
|
+
|
|
49
|
+
**Stay with Candle** for cross-encoder implementation because:
|
|
50
|
+
|
|
51
|
+
1. It already works correctly with HuggingFace models
|
|
52
|
+
2. No conversion needed
|
|
53
|
+
3. Better control over the scoring pipeline
|
|
54
|
+
4. The issue isn't with Candle - it's that TinyBERT (4M params) is too small
|
|
55
|
+
|
|
56
|
+
**To improve results:**
|
|
57
|
+
1. Switch to a larger model (MiniLM-L-6-v2 with 85M params)
|
|
58
|
+
2. Make the model configurable via CLI
|
|
59
|
+
3. Consider adding ONNX support as an alternative backend
|
|
60
|
+
|
|
61
|
+
## Code Comparison
|
|
62
|
+
|
|
63
|
+
### rust-bert Approach (Would Require Modifications)
|
|
64
|
+
```rust
|
|
65
|
+
// rust-bert expects classification, not scoring
|
|
66
|
+
let config = SequenceClassificationConfig { ... };
|
|
67
|
+
let model = SequenceClassificationModel::new(config)?;
|
|
68
|
+
let output = model.predict(&[text]); // Returns Label with probability
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Our Candle Approach (Current)
|
|
72
|
+
```rust
|
|
73
|
+
// Direct cross-encoder implementation
|
|
74
|
+
let bert_outputs = self.bert.forward(&input_ids, &attention_mask, token_type_ids.as_ref())?;
|
|
75
|
+
let cls_output = bert_outputs.i((.., 0, ..))?;
|
|
76
|
+
let logits = self.classifier.forward(&cls_output)?;
|
|
77
|
+
let score = logits.i((0, 0))?.to_scalar::<f32>()?; // Raw relevance score
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Conclusion
|
|
81
|
+
|
|
82
|
+
rust-bert isn't suitable for cross-encoder models without significant modifications. Our Candle implementation is the right approach. The scoring issues are due to model size (TinyBERT), not the implementation framework.
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Cross-Encoder Tokenization Guide for Rust Implementation
|
|
2
|
+
|
|
3
|
+
## Critical Points for Correct Implementation
|
|
4
|
+
|
|
5
|
+
### 1. **Use `encode_pair()` NOT Manual Concatenation**
|
|
6
|
+
|
|
7
|
+
❌ **WRONG** (Manual concatenation):
|
|
8
|
+
```rust
|
|
9
|
+
let text = format!("{} [SEP] {}", query, document);
|
|
10
|
+
let encoding = tokenizer.encode(text, true)?;
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
✅ **CORRECT** (Tokenizer pair encoding):
|
|
14
|
+
```rust
|
|
15
|
+
let encoding = tokenizer.encode((query, document), true)?;
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
### 2. **Why This Matters**
|
|
19
|
+
|
|
20
|
+
When you use `encode_pair()`, the tokenizer:
|
|
21
|
+
- Automatically adds [CLS] at the start
|
|
22
|
+
- Adds [SEP] after the query
|
|
23
|
+
- Adds [SEP] at the end (for BERT)
|
|
24
|
+
- **Correctly sets token_type_ids**: 0 for query, 1 for document
|
|
25
|
+
- Handles special tokens properly
|
|
26
|
+
|
|
27
|
+
Manual concatenation will:
|
|
28
|
+
- Add extra [SEP] tokens (you get [SEP] [SEP] in the middle)
|
|
29
|
+
- Set ALL token_type_ids to 0 (incorrect!)
|
|
30
|
+
- Produce different tokenization due to whitespace handling
|
|
31
|
+
|
|
32
|
+
### 3. **Expected Token Structure**
|
|
33
|
+
|
|
34
|
+
For input:
|
|
35
|
+
- Query: "how does authentication work"
|
|
36
|
+
- Document: "Authentication is the process..."
|
|
37
|
+
|
|
38
|
+
The correct tokenization should be:
|
|
39
|
+
```
|
|
40
|
+
[CLS] how does authentication work [SEP] authentication is the process ... [SEP]
|
|
41
|
+
0 0 0 0 0 0 1 1 1 1 ... 1
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Token type IDs:
|
|
45
|
+
- 0 = Query segment (including [CLS] and first [SEP])
|
|
46
|
+
- 1 = Document segment (including final [SEP])
|
|
47
|
+
|
|
48
|
+
### 4. **Special Token IDs (for BERT)**
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
[CLS] = 101
|
|
52
|
+
[SEP] = 102
|
|
53
|
+
[PAD] = 0
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 5. **Verification in Rust**
|
|
57
|
+
|
|
58
|
+
```rust
|
|
59
|
+
// After tokenization, check:
|
|
60
|
+
let token_ids = encoding.get_ids();
|
|
61
|
+
let type_ids = encoding.get_type_ids();
|
|
62
|
+
|
|
63
|
+
// First token should be [CLS] (101)
|
|
64
|
+
assert_eq!(token_ids[0], 101);
|
|
65
|
+
|
|
66
|
+
// Look for [SEP] tokens (102)
|
|
67
|
+
let sep_positions: Vec<_> = token_ids.iter()
|
|
68
|
+
.enumerate()
|
|
69
|
+
.filter(|(_, &id)| id == 102)
|
|
70
|
+
.map(|(i, _)| i)
|
|
71
|
+
.collect();
|
|
72
|
+
|
|
73
|
+
// Should have 2 [SEP] tokens for pair encoding
|
|
74
|
+
assert_eq!(sep_positions.len(), 2);
|
|
75
|
+
|
|
76
|
+
// Check token type IDs switch from 0 to 1 after first [SEP]
|
|
77
|
+
if let Some(first_sep) = sep_positions.first() {
|
|
78
|
+
// Tokens before first [SEP] should have type 0
|
|
79
|
+
// Tokens after should have type 1
|
|
80
|
+
}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### 6. **Common Issues**
|
|
84
|
+
|
|
85
|
+
1. **Using wrong tokenizer**: Make sure you load tokenizer.json from the same model directory
|
|
86
|
+
2. **Not using pair encoding**: Always use the tokenizer's pair-encoding API (e.g. `tokenizer.encode((query, document), true)` in the Rust `tokenizers` crate) for cross-encoders
|
|
87
|
+
3. **Missing token type IDs**: These are crucial for BERT to understand query vs document
|
|
88
|
+
|
|
89
|
+
### 7. **Score Differences**
|
|
90
|
+
|
|
91
|
+
If you see different scores between Python and Rust:
|
|
92
|
+
1. First check tokenization matches exactly (same token IDs)
|
|
93
|
+
2. Check token type IDs are correct (0 for query, 1 for document)
|
|
94
|
+
3. Verify attention masks are the same
|
|
95
|
+
4. Ensure model weights loaded correctly
|
|
96
|
+
|
|
97
|
+
### 8. **Debug Output**
|
|
98
|
+
|
|
99
|
+
Add this to your Rust code to debug:
|
|
100
|
+
```rust
|
|
101
|
+
println!("Token IDs: {:?}", encoding.get_ids());
|
|
102
|
+
println!("Type IDs: {:?}", encoding.get_type_ids());
|
|
103
|
+
println!("Attention mask: {:?}", encoding.get_attention_mask());
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Compare with Python:
|
|
107
|
+
```python
|
|
108
|
+
print(f"Token IDs: {encoding['input_ids'][0].tolist()}")
|
|
109
|
+
print(f"Type IDs: {encoding['token_type_ids'][0].tolist()}")
|
|
110
|
+
print(f"Attention: {encoding['attention_mask'][0].tolist()}")
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Summary
|
|
114
|
+
|
|
115
|
+
The key issue is likely that our Rust implementation was using manual concatenation instead of proper pair encoding. This would result in:
|
|
116
|
+
- Wrong token type IDs (all 0s instead of 0s and 1s)
|
|
117
|
+
- Extra [SEP] tokens
|
|
118
|
+
- Different tokenization
|
|
119
|
+
|
|
120
|
+
Fixing this should improve the model's ability to distinguish between query and document, leading to better discrimination between relevant and irrelevant queries.
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Check what tokenizer files our Rust implementation is using
and compare with Python tokenizer output.
"""

import json
import os

print("=" * 80)
print("CHECKING RUST TOKENIZER CONFIGURATION")
print("=" * 80)

# Check what tokenizer files we have locally.
tokenizer_path = "models/ms-marco-TinyBERT-L-2-v2/tokenizer.json"

if os.path.exists(tokenizer_path):
    print(f"✅ Found tokenizer at: {tokenizer_path}")

    # Load and inspect the raw tokenizer.json (HuggingFace fast-tokenizer format).
    with open(tokenizer_path, 'r') as f:
        tokenizer_data = json.load(f)

    print("\n--- TOKENIZER STRUCTURE ---")
    print(f"Tokenizer type: {tokenizer_data.get('model', {}).get('type', 'Unknown')}")

    # Check for special tokens ([CLS], [SEP], [PAD], ...).
    if 'added_tokens' in tokenizer_data:
        print("\nSpecial tokens:")
        for token in tokenizer_data['added_tokens'][:10]:  # Show first 10
            print(f" {token}")

    # Check post-processor (important for BERT! — it inserts the special
    # tokens and assigns token_type_ids for single vs pair inputs).
    if 'post_processor' in tokenizer_data:
        post_proc = tokenizer_data['post_processor']
        print(f"\nPost-processor type: {post_proc.get('type', 'Unknown')}")

        # For BERT, should be TemplateProcessing
        if post_proc.get('type') == 'TemplateProcessing':
            if 'single' in post_proc:
                print(f"Single sequence template: {post_proc['single']}")
            if 'pair' in post_proc:
                print(f"Pair sequence template: {post_proc['pair']}")
else:
    print(f"❌ Tokenizer not found at: {tokenizer_path}")

# Now let's create a test to verify Rust tokenization
print("\n" + "=" * 80)
print("RUST TOKENIZATION TEST CASES")
print("=" * 80)

# These should match Python exactly
test_cases = [
    {
        "name": "Simple pair",
        "query": "how does authentication work",
        "document": "Authentication is the process of verifying the identity of a user.",
        "method": "pair",  # tokenizer.encode_pair(query, document)
    },
    {
        "name": "Manual concat (wrong)",
        "text": "how does authentication work [SEP] Authentication is the process of verifying the identity of a user.",
        "method": "single",  # tokenizer.encode(text)
    },
]

print("\nExpected Rust code for correct tokenization:")
print("```rust")
print('// CORRECT: Use encode_pair for cross-encoder')
print('let encoding = tokenizer.encode((query, document), true)?;')
print('')
print('// WRONG: Do not manually concatenate')
print('let text = format!("{} [SEP] {}", query, document);')
print('let encoding = tokenizer.encode(text, true)?;')
print("```")

# Key differences to check
print("\n--- KEY THINGS TO VERIFY IN RUST ---")
print("1. Token IDs match exactly")
print("2. Token type IDs are generated correctly:")
print(" - 0 for query tokens (including [CLS])")
print(" - 0 for first [SEP]")
print(" - 1 for document tokens")
print(" - 1 for final [SEP] (if present)")
print("3. Special tokens are in the right positions")
print("4. Padding is handled correctly")

# Load Python reference results, if the companion debug script has produced them.
if os.path.exists("tokenizer_debug_info.json"):
    with open("tokenizer_debug_info.json", 'r') as f:
        python_info = json.load(f)

    print("\n--- PYTHON REFERENCE ---")
    print(f"Query: '{python_info['test_case']['query']}'")
    print(f"Document: '{python_info['test_case']['document']}'")
    print(f"Correct score: {python_info['test_case']['correct_score']:.6f}")
    print(f"Manual concat score: {python_info['test_case']['manual_concat_score']:.6f}")

    # Show first 20 tokens
    ids = python_info['test_case']['correct_input_ids'][:20]
    types = python_info['test_case']['correct_token_types'][:20] if python_info['test_case']['correct_token_types'] else None

    print("\nFirst 20 tokens (Python):")
    print(f"IDs: {ids}")
    if types:
        print(f"Types: {types}")

print("\n✅ Your Rust implementation should produce these EXACT token IDs and types!")