gobby 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gobby/__init__.py +3 -0
- gobby/adapters/__init__.py +30 -0
- gobby/adapters/base.py +93 -0
- gobby/adapters/claude_code.py +276 -0
- gobby/adapters/codex.py +1292 -0
- gobby/adapters/gemini.py +343 -0
- gobby/agents/__init__.py +37 -0
- gobby/agents/codex_session.py +120 -0
- gobby/agents/constants.py +112 -0
- gobby/agents/context.py +362 -0
- gobby/agents/definitions.py +133 -0
- gobby/agents/gemini_session.py +111 -0
- gobby/agents/registry.py +618 -0
- gobby/agents/runner.py +968 -0
- gobby/agents/session.py +259 -0
- gobby/agents/spawn.py +916 -0
- gobby/agents/spawners/__init__.py +77 -0
- gobby/agents/spawners/base.py +142 -0
- gobby/agents/spawners/cross_platform.py +266 -0
- gobby/agents/spawners/embedded.py +225 -0
- gobby/agents/spawners/headless.py +226 -0
- gobby/agents/spawners/linux.py +125 -0
- gobby/agents/spawners/macos.py +277 -0
- gobby/agents/spawners/windows.py +308 -0
- gobby/agents/tty_config.py +319 -0
- gobby/autonomous/__init__.py +32 -0
- gobby/autonomous/progress_tracker.py +447 -0
- gobby/autonomous/stop_registry.py +269 -0
- gobby/autonomous/stuck_detector.py +383 -0
- gobby/cli/__init__.py +67 -0
- gobby/cli/__main__.py +8 -0
- gobby/cli/agents.py +529 -0
- gobby/cli/artifacts.py +266 -0
- gobby/cli/daemon.py +329 -0
- gobby/cli/extensions.py +526 -0
- gobby/cli/github.py +263 -0
- gobby/cli/init.py +53 -0
- gobby/cli/install.py +614 -0
- gobby/cli/installers/__init__.py +37 -0
- gobby/cli/installers/antigravity.py +65 -0
- gobby/cli/installers/claude.py +363 -0
- gobby/cli/installers/codex.py +192 -0
- gobby/cli/installers/gemini.py +294 -0
- gobby/cli/installers/git_hooks.py +377 -0
- gobby/cli/installers/shared.py +737 -0
- gobby/cli/linear.py +250 -0
- gobby/cli/mcp.py +30 -0
- gobby/cli/mcp_proxy.py +698 -0
- gobby/cli/memory.py +304 -0
- gobby/cli/merge.py +384 -0
- gobby/cli/projects.py +79 -0
- gobby/cli/sessions.py +622 -0
- gobby/cli/tasks/__init__.py +30 -0
- gobby/cli/tasks/_utils.py +658 -0
- gobby/cli/tasks/ai.py +1025 -0
- gobby/cli/tasks/commits.py +169 -0
- gobby/cli/tasks/crud.py +685 -0
- gobby/cli/tasks/deps.py +135 -0
- gobby/cli/tasks/labels.py +63 -0
- gobby/cli/tasks/main.py +273 -0
- gobby/cli/tasks/search.py +178 -0
- gobby/cli/tui.py +34 -0
- gobby/cli/utils.py +513 -0
- gobby/cli/workflows.py +927 -0
- gobby/cli/worktrees.py +481 -0
- gobby/config/__init__.py +129 -0
- gobby/config/app.py +551 -0
- gobby/config/extensions.py +167 -0
- gobby/config/features.py +472 -0
- gobby/config/llm_providers.py +98 -0
- gobby/config/logging.py +66 -0
- gobby/config/mcp.py +346 -0
- gobby/config/persistence.py +247 -0
- gobby/config/servers.py +141 -0
- gobby/config/sessions.py +250 -0
- gobby/config/tasks.py +784 -0
- gobby/hooks/__init__.py +104 -0
- gobby/hooks/artifact_capture.py +213 -0
- gobby/hooks/broadcaster.py +243 -0
- gobby/hooks/event_handlers.py +723 -0
- gobby/hooks/events.py +218 -0
- gobby/hooks/git.py +169 -0
- gobby/hooks/health_monitor.py +171 -0
- gobby/hooks/hook_manager.py +856 -0
- gobby/hooks/hook_types.py +575 -0
- gobby/hooks/plugins.py +813 -0
- gobby/hooks/session_coordinator.py +396 -0
- gobby/hooks/verification_runner.py +268 -0
- gobby/hooks/webhooks.py +339 -0
- gobby/install/claude/commands/gobby/bug.md +51 -0
- gobby/install/claude/commands/gobby/chore.md +51 -0
- gobby/install/claude/commands/gobby/epic.md +52 -0
- gobby/install/claude/commands/gobby/eval.md +235 -0
- gobby/install/claude/commands/gobby/feat.md +49 -0
- gobby/install/claude/commands/gobby/nit.md +52 -0
- gobby/install/claude/commands/gobby/ref.md +52 -0
- gobby/install/claude/hooks/HOOK_SCHEMAS.md +632 -0
- gobby/install/claude/hooks/hook_dispatcher.py +364 -0
- gobby/install/claude/hooks/validate_settings.py +102 -0
- gobby/install/claude/hooks-template.json +118 -0
- gobby/install/codex/hooks/hook_dispatcher.py +153 -0
- gobby/install/codex/prompts/forget.md +7 -0
- gobby/install/codex/prompts/memories.md +7 -0
- gobby/install/codex/prompts/recall.md +7 -0
- gobby/install/codex/prompts/remember.md +13 -0
- gobby/install/gemini/hooks/hook_dispatcher.py +268 -0
- gobby/install/gemini/hooks-template.json +138 -0
- gobby/install/shared/plugins/code_guardian.py +456 -0
- gobby/install/shared/plugins/example_notify.py +331 -0
- gobby/integrations/__init__.py +10 -0
- gobby/integrations/github.py +145 -0
- gobby/integrations/linear.py +145 -0
- gobby/llm/__init__.py +40 -0
- gobby/llm/base.py +120 -0
- gobby/llm/claude.py +578 -0
- gobby/llm/claude_executor.py +503 -0
- gobby/llm/codex.py +322 -0
- gobby/llm/codex_executor.py +513 -0
- gobby/llm/executor.py +316 -0
- gobby/llm/factory.py +34 -0
- gobby/llm/gemini.py +258 -0
- gobby/llm/gemini_executor.py +339 -0
- gobby/llm/litellm.py +287 -0
- gobby/llm/litellm_executor.py +303 -0
- gobby/llm/resolver.py +499 -0
- gobby/llm/service.py +236 -0
- gobby/mcp_proxy/__init__.py +29 -0
- gobby/mcp_proxy/actions.py +175 -0
- gobby/mcp_proxy/daemon_control.py +198 -0
- gobby/mcp_proxy/importer.py +436 -0
- gobby/mcp_proxy/lazy.py +325 -0
- gobby/mcp_proxy/manager.py +798 -0
- gobby/mcp_proxy/metrics.py +609 -0
- gobby/mcp_proxy/models.py +139 -0
- gobby/mcp_proxy/registries.py +215 -0
- gobby/mcp_proxy/schema_hash.py +381 -0
- gobby/mcp_proxy/semantic_search.py +706 -0
- gobby/mcp_proxy/server.py +549 -0
- gobby/mcp_proxy/services/__init__.py +0 -0
- gobby/mcp_proxy/services/fallback.py +306 -0
- gobby/mcp_proxy/services/recommendation.py +224 -0
- gobby/mcp_proxy/services/server_mgmt.py +214 -0
- gobby/mcp_proxy/services/system.py +72 -0
- gobby/mcp_proxy/services/tool_filter.py +231 -0
- gobby/mcp_proxy/services/tool_proxy.py +309 -0
- gobby/mcp_proxy/stdio.py +565 -0
- gobby/mcp_proxy/tools/__init__.py +27 -0
- gobby/mcp_proxy/tools/agents.py +1103 -0
- gobby/mcp_proxy/tools/artifacts.py +207 -0
- gobby/mcp_proxy/tools/hub.py +335 -0
- gobby/mcp_proxy/tools/internal.py +337 -0
- gobby/mcp_proxy/tools/memory.py +543 -0
- gobby/mcp_proxy/tools/merge.py +422 -0
- gobby/mcp_proxy/tools/metrics.py +283 -0
- gobby/mcp_proxy/tools/orchestration/__init__.py +23 -0
- gobby/mcp_proxy/tools/orchestration/cleanup.py +619 -0
- gobby/mcp_proxy/tools/orchestration/monitor.py +380 -0
- gobby/mcp_proxy/tools/orchestration/orchestrate.py +746 -0
- gobby/mcp_proxy/tools/orchestration/review.py +736 -0
- gobby/mcp_proxy/tools/orchestration/utils.py +16 -0
- gobby/mcp_proxy/tools/session_messages.py +1056 -0
- gobby/mcp_proxy/tools/task_dependencies.py +219 -0
- gobby/mcp_proxy/tools/task_expansion.py +591 -0
- gobby/mcp_proxy/tools/task_github.py +393 -0
- gobby/mcp_proxy/tools/task_linear.py +379 -0
- gobby/mcp_proxy/tools/task_orchestration.py +77 -0
- gobby/mcp_proxy/tools/task_readiness.py +522 -0
- gobby/mcp_proxy/tools/task_sync.py +351 -0
- gobby/mcp_proxy/tools/task_validation.py +843 -0
- gobby/mcp_proxy/tools/tasks/__init__.py +25 -0
- gobby/mcp_proxy/tools/tasks/_context.py +112 -0
- gobby/mcp_proxy/tools/tasks/_crud.py +516 -0
- gobby/mcp_proxy/tools/tasks/_factory.py +176 -0
- gobby/mcp_proxy/tools/tasks/_helpers.py +129 -0
- gobby/mcp_proxy/tools/tasks/_lifecycle.py +517 -0
- gobby/mcp_proxy/tools/tasks/_lifecycle_validation.py +301 -0
- gobby/mcp_proxy/tools/tasks/_resolution.py +55 -0
- gobby/mcp_proxy/tools/tasks/_search.py +215 -0
- gobby/mcp_proxy/tools/tasks/_session.py +125 -0
- gobby/mcp_proxy/tools/workflows.py +973 -0
- gobby/mcp_proxy/tools/worktrees.py +1264 -0
- gobby/mcp_proxy/transports/__init__.py +0 -0
- gobby/mcp_proxy/transports/base.py +95 -0
- gobby/mcp_proxy/transports/factory.py +44 -0
- gobby/mcp_proxy/transports/http.py +139 -0
- gobby/mcp_proxy/transports/stdio.py +213 -0
- gobby/mcp_proxy/transports/websocket.py +136 -0
- gobby/memory/backends/__init__.py +116 -0
- gobby/memory/backends/mem0.py +408 -0
- gobby/memory/backends/memu.py +485 -0
- gobby/memory/backends/null.py +111 -0
- gobby/memory/backends/openmemory.py +537 -0
- gobby/memory/backends/sqlite.py +304 -0
- gobby/memory/context.py +87 -0
- gobby/memory/manager.py +1001 -0
- gobby/memory/protocol.py +451 -0
- gobby/memory/search/__init__.py +66 -0
- gobby/memory/search/text.py +127 -0
- gobby/memory/viz.py +258 -0
- gobby/prompts/__init__.py +13 -0
- gobby/prompts/defaults/expansion/system.md +119 -0
- gobby/prompts/defaults/expansion/user.md +48 -0
- gobby/prompts/defaults/external_validation/agent.md +72 -0
- gobby/prompts/defaults/external_validation/external.md +63 -0
- gobby/prompts/defaults/external_validation/spawn.md +83 -0
- gobby/prompts/defaults/external_validation/system.md +6 -0
- gobby/prompts/defaults/features/import_mcp.md +22 -0
- gobby/prompts/defaults/features/import_mcp_github.md +17 -0
- gobby/prompts/defaults/features/import_mcp_search.md +16 -0
- gobby/prompts/defaults/features/recommend_tools.md +32 -0
- gobby/prompts/defaults/features/recommend_tools_hybrid.md +35 -0
- gobby/prompts/defaults/features/recommend_tools_llm.md +30 -0
- gobby/prompts/defaults/features/server_description.md +20 -0
- gobby/prompts/defaults/features/server_description_system.md +6 -0
- gobby/prompts/defaults/features/task_description.md +31 -0
- gobby/prompts/defaults/features/task_description_system.md +6 -0
- gobby/prompts/defaults/features/tool_summary.md +17 -0
- gobby/prompts/defaults/features/tool_summary_system.md +6 -0
- gobby/prompts/defaults/research/step.md +58 -0
- gobby/prompts/defaults/validation/criteria.md +47 -0
- gobby/prompts/defaults/validation/validate.md +38 -0
- gobby/prompts/loader.py +346 -0
- gobby/prompts/models.py +113 -0
- gobby/py.typed +0 -0
- gobby/runner.py +488 -0
- gobby/search/__init__.py +23 -0
- gobby/search/protocol.py +104 -0
- gobby/search/tfidf.py +232 -0
- gobby/servers/__init__.py +7 -0
- gobby/servers/http.py +636 -0
- gobby/servers/models.py +31 -0
- gobby/servers/routes/__init__.py +23 -0
- gobby/servers/routes/admin.py +416 -0
- gobby/servers/routes/dependencies.py +118 -0
- gobby/servers/routes/mcp/__init__.py +24 -0
- gobby/servers/routes/mcp/hooks.py +135 -0
- gobby/servers/routes/mcp/plugins.py +121 -0
- gobby/servers/routes/mcp/tools.py +1337 -0
- gobby/servers/routes/mcp/webhooks.py +159 -0
- gobby/servers/routes/sessions.py +582 -0
- gobby/servers/websocket.py +766 -0
- gobby/sessions/__init__.py +13 -0
- gobby/sessions/analyzer.py +322 -0
- gobby/sessions/lifecycle.py +240 -0
- gobby/sessions/manager.py +563 -0
- gobby/sessions/processor.py +225 -0
- gobby/sessions/summary.py +532 -0
- gobby/sessions/transcripts/__init__.py +41 -0
- gobby/sessions/transcripts/base.py +125 -0
- gobby/sessions/transcripts/claude.py +386 -0
- gobby/sessions/transcripts/codex.py +143 -0
- gobby/sessions/transcripts/gemini.py +195 -0
- gobby/storage/__init__.py +21 -0
- gobby/storage/agents.py +409 -0
- gobby/storage/artifact_classifier.py +341 -0
- gobby/storage/artifacts.py +285 -0
- gobby/storage/compaction.py +67 -0
- gobby/storage/database.py +357 -0
- gobby/storage/inter_session_messages.py +194 -0
- gobby/storage/mcp.py +680 -0
- gobby/storage/memories.py +562 -0
- gobby/storage/merge_resolutions.py +550 -0
- gobby/storage/migrations.py +860 -0
- gobby/storage/migrations_legacy.py +1359 -0
- gobby/storage/projects.py +166 -0
- gobby/storage/session_messages.py +251 -0
- gobby/storage/session_tasks.py +97 -0
- gobby/storage/sessions.py +817 -0
- gobby/storage/task_dependencies.py +223 -0
- gobby/storage/tasks/__init__.py +42 -0
- gobby/storage/tasks/_aggregates.py +180 -0
- gobby/storage/tasks/_crud.py +449 -0
- gobby/storage/tasks/_id.py +104 -0
- gobby/storage/tasks/_lifecycle.py +311 -0
- gobby/storage/tasks/_manager.py +889 -0
- gobby/storage/tasks/_models.py +300 -0
- gobby/storage/tasks/_ordering.py +119 -0
- gobby/storage/tasks/_path_cache.py +110 -0
- gobby/storage/tasks/_queries.py +343 -0
- gobby/storage/tasks/_search.py +143 -0
- gobby/storage/workflow_audit.py +393 -0
- gobby/storage/worktrees.py +547 -0
- gobby/sync/__init__.py +29 -0
- gobby/sync/github.py +333 -0
- gobby/sync/linear.py +304 -0
- gobby/sync/memories.py +284 -0
- gobby/sync/tasks.py +641 -0
- gobby/tasks/__init__.py +8 -0
- gobby/tasks/build_verification.py +193 -0
- gobby/tasks/commits.py +633 -0
- gobby/tasks/context.py +747 -0
- gobby/tasks/criteria.py +342 -0
- gobby/tasks/enhanced_validator.py +226 -0
- gobby/tasks/escalation.py +263 -0
- gobby/tasks/expansion.py +626 -0
- gobby/tasks/external_validator.py +764 -0
- gobby/tasks/issue_extraction.py +171 -0
- gobby/tasks/prompts/expand.py +327 -0
- gobby/tasks/research.py +421 -0
- gobby/tasks/tdd.py +352 -0
- gobby/tasks/tree_builder.py +263 -0
- gobby/tasks/validation.py +712 -0
- gobby/tasks/validation_history.py +357 -0
- gobby/tasks/validation_models.py +89 -0
- gobby/tools/__init__.py +0 -0
- gobby/tools/summarizer.py +170 -0
- gobby/tui/__init__.py +5 -0
- gobby/tui/api_client.py +281 -0
- gobby/tui/app.py +327 -0
- gobby/tui/screens/__init__.py +25 -0
- gobby/tui/screens/agents.py +333 -0
- gobby/tui/screens/chat.py +450 -0
- gobby/tui/screens/dashboard.py +377 -0
- gobby/tui/screens/memory.py +305 -0
- gobby/tui/screens/metrics.py +231 -0
- gobby/tui/screens/orchestrator.py +904 -0
- gobby/tui/screens/sessions.py +412 -0
- gobby/tui/screens/tasks.py +442 -0
- gobby/tui/screens/workflows.py +289 -0
- gobby/tui/screens/worktrees.py +174 -0
- gobby/tui/widgets/__init__.py +21 -0
- gobby/tui/widgets/chat.py +210 -0
- gobby/tui/widgets/conductor.py +104 -0
- gobby/tui/widgets/menu.py +132 -0
- gobby/tui/widgets/message_panel.py +160 -0
- gobby/tui/widgets/review_gate.py +224 -0
- gobby/tui/widgets/task_tree.py +99 -0
- gobby/tui/widgets/token_budget.py +166 -0
- gobby/tui/ws_client.py +258 -0
- gobby/utils/__init__.py +3 -0
- gobby/utils/daemon_client.py +235 -0
- gobby/utils/git.py +222 -0
- gobby/utils/id.py +38 -0
- gobby/utils/json_helpers.py +161 -0
- gobby/utils/logging.py +376 -0
- gobby/utils/machine_id.py +135 -0
- gobby/utils/metrics.py +589 -0
- gobby/utils/project_context.py +182 -0
- gobby/utils/project_init.py +263 -0
- gobby/utils/status.py +256 -0
- gobby/utils/validation.py +80 -0
- gobby/utils/version.py +23 -0
- gobby/workflows/__init__.py +4 -0
- gobby/workflows/actions.py +1310 -0
- gobby/workflows/approval_flow.py +138 -0
- gobby/workflows/artifact_actions.py +103 -0
- gobby/workflows/audit_helpers.py +110 -0
- gobby/workflows/autonomous_actions.py +286 -0
- gobby/workflows/context_actions.py +394 -0
- gobby/workflows/definitions.py +130 -0
- gobby/workflows/detection_helpers.py +208 -0
- gobby/workflows/engine.py +485 -0
- gobby/workflows/evaluator.py +669 -0
- gobby/workflows/git_utils.py +96 -0
- gobby/workflows/hooks.py +169 -0
- gobby/workflows/lifecycle_evaluator.py +613 -0
- gobby/workflows/llm_actions.py +70 -0
- gobby/workflows/loader.py +333 -0
- gobby/workflows/mcp_actions.py +60 -0
- gobby/workflows/memory_actions.py +272 -0
- gobby/workflows/premature_stop.py +164 -0
- gobby/workflows/session_actions.py +139 -0
- gobby/workflows/state_actions.py +123 -0
- gobby/workflows/state_manager.py +104 -0
- gobby/workflows/stop_signal_actions.py +163 -0
- gobby/workflows/summary_actions.py +344 -0
- gobby/workflows/task_actions.py +249 -0
- gobby/workflows/task_enforcement_actions.py +901 -0
- gobby/workflows/templates.py +52 -0
- gobby/workflows/todo_actions.py +84 -0
- gobby/workflows/webhook.py +223 -0
- gobby/workflows/webhook_executor.py +399 -0
- gobby/worktrees/__init__.py +5 -0
- gobby/worktrees/git.py +690 -0
- gobby/worktrees/merge/__init__.py +20 -0
- gobby/worktrees/merge/conflict_parser.py +177 -0
- gobby/worktrees/merge/resolver.py +485 -0
- gobby-0.2.5.dist-info/METADATA +351 -0
- gobby-0.2.5.dist-info/RECORD +383 -0
- gobby-0.2.5.dist-info/WHEEL +5 -0
- gobby-0.2.5.dist-info/entry_points.txt +2 -0
- gobby-0.2.5.dist-info/licenses/LICENSE.md +193 -0
- gobby-0.2.5.dist-info/top_level.txt +1 -0
gobby/search/tfidf.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TF-IDF based semantic search backend.
|
|
3
|
+
|
|
4
|
+
Provides local semantic search using scikit-learn's TfidfVectorizer.
|
|
5
|
+
No API calls required - works completely offline.
|
|
6
|
+
|
|
7
|
+
Requires: scikit-learn (pip install scikit-learn)
|
|
8
|
+
|
|
9
|
+
Features:
|
|
10
|
+
- Unigram + bigram matching for better phrase detection
|
|
11
|
+
- Cosine similarity ranking
|
|
12
|
+
- Fast sub-millisecond search for thousands of items
|
|
13
|
+
|
|
14
|
+
Note: Only full fit() is implemented. Incremental updates are tracked via
|
|
15
|
+
mark_update() and needs_refit() but require calling fit() to rebuild the index.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
from typing import TYPE_CHECKING, Any
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from scipy.sparse import csr_matrix
|
|
25
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class TFIDFSearcher:
    """
    TF-IDF based search backend using sklearn.

    This is the default search backend for memory recall and task search.
    It uses TF-IDF (Term Frequency-Inverse Document Frequency) vectorization
    with cosine similarity for ranking.

    Configuration options:
    - ngram_range: Tuple of (min, max) n-gram sizes (default: (1, 2))
    - max_features: Maximum vocabulary size (default: 10000)
    - min_df: Minimum document frequency for terms (default: 1)
    - stop_words: Language for stop words removal (default: "english")
    - refit_threshold: Pending updates tolerated before needs_refit()
      reports True (default: 10)

    The index is only ever rebuilt by an explicit fit() call; mark_update()
    and needs_refit() merely track when such a rebuild is due.

    Example:
        searcher = TFIDFSearcher()
        searcher.fit([("id1", "content1"), ("id2", "content2")])
        results = searcher.search("query", top_k=5)
    """

    def __init__(
        self,
        ngram_range: tuple[int, int] = (1, 2),
        max_features: int = 10000,
        min_df: int = 1,
        stop_words: str | None = "english",
        refit_threshold: int = 10,
    ):
        """
        Initialize TF-IDF searcher.

        No sklearn import happens here; the vectorizer is created lazily on
        first use so the class can be constructed without scikit-learn.

        Args:
            ngram_range: Min/max n-gram sizes for tokenization
            max_features: Maximum vocabulary size
            min_df: Minimum document frequency for inclusion
            stop_words: Language for stop words (None to disable)
            refit_threshold: Number of mark_update() calls after which
                needs_refit() returns True. Nothing is refit automatically;
                the caller is expected to call fit().
        """
        self._ngram_range = ngram_range
        self._max_features = max_features
        self._min_df = min_df
        self._stop_words = stop_words
        self._refit_threshold = refit_threshold

        # Lazy-loaded sklearn components
        self._vectorizer: TfidfVectorizer | None = None
        self._vectors: csr_matrix | None = None
        # Row i of self._vectors corresponds to self._item_ids[i].
        self._item_ids: list[str] = []
        self._fitted = False
        self._pending_updates = 0

    def _ensure_vectorizer(self) -> TfidfVectorizer:
        """
        Create (on first call) or return the TF-IDF vectorizer.

        Returns:
            The vectorizer configured with this searcher's options

        Raises:
            ImportError: If scikit-learn is not installed
        """
        if self._vectorizer is None:
            try:
                from sklearn.feature_extraction.text import (
                    TfidfVectorizer as SklearnTfidfVectorizer,
                )

                self._vectorizer = SklearnTfidfVectorizer(
                    ngram_range=self._ngram_range,
                    max_features=self._max_features,
                    min_df=self._min_df,
                    stop_words=self._stop_words,
                )
            except ImportError as e:
                raise ImportError(
                    "TF-IDF search requires scikit-learn. Install with: pip install scikit-learn"
                ) from e
        return self._vectorizer

    def fit(self, items: list[tuple[str, str]]) -> None:
        """
        Build TF-IDF index from all items.

        This should be called:
        - On startup to build initial index
        - After bulk item operations
        - When needs_refit() returns True

        An empty ``items`` list clears the index. If vectorization fails the
        index is left empty and unfitted, and the original exception is
        re-raised.

        Args:
            items: List of (item_id, content) tuples to index

        Raises:
            ImportError: If scikit-learn is not installed
            Exception: Whatever the vectorizer raises while fitting
        """
        if not items:
            self.clear()
            logger.debug("TF-IDF index cleared (no items)")
            return

        vectorizer = self._ensure_vectorizer()

        # Build into locals first so a failed fit cannot leave the ids and
        # the matrix describing two different item sets.
        item_ids = [item_id for item_id, _ in items]
        contents = [content for _, content in items]

        try:
            vectors = vectorizer.fit_transform(contents)
        except Exception as e:
            logger.error("Failed to build TF-IDF index: %s", e)
            # The vectorizer may have been partially refit, so the previous
            # matrix can no longer be trusted; drop it along with its ids.
            self._item_ids = []
            self._vectors = None
            self._fitted = False
            raise

        self._item_ids = item_ids
        self._vectors = vectors
        self._fitted = True
        self._pending_updates = 0
        logger.info("TF-IDF index built with %d items", len(items))

    def search(self, query: str, top_k: int = 10) -> list[tuple[str, float]]:
        """
        Search for items matching the query.

        Args:
            query: Search query text
            top_k: Maximum number of results to return. Zero or a negative
                value yields an empty list.

        Returns:
            List of (item_id, similarity_score) tuples, sorted by
            similarity descending. Only items with a similarity strictly
            greater than zero are included, so fewer than ``top_k`` results
            may be returned. An unfitted or empty index yields an empty list.

        Raises:
            ImportError: If numpy or scikit-learn is not installed
            Exception: Any error raised while vectorizing or scoring; it is
                logged and re-raised unchanged
        """
        # A negative k would turn the "[-k:]" slice below into a slice from
        # the front and return nearly every item, so reject it up front.
        if top_k <= 0:
            return []

        if not self._fitted or self._vectors is None or not self._item_ids:
            return []

        try:
            import numpy as np
            from sklearn.metrics.pairwise import cosine_similarity

            vectorizer = self._ensure_vectorizer()

            # Transform query using fitted vocabulary
            query_vec = vectorizer.transform([query])

            # One similarity per indexed item, in self._item_ids order
            similarities = cosine_similarity(query_vec, self._vectors)[0]

            # Never ask for more results than there are indexed items
            k = min(top_k, len(similarities))
            if k == 0:
                return []

            # argsort is ascending: take the last k and reverse for best-first
            top_indices = np.argsort(similarities)[-k:][::-1]

            # Drop items that share no terms with the query
            return [
                (self._item_ids[i], float(similarities[i]))
                for i in top_indices
                if similarities[i] > 0
            ]

        except ImportError as e:
            logger.error("TF-IDF search requires scikit-learn: %s", e, exc_info=True)
            raise
        except Exception as e:
            logger.error("TF-IDF search failed: %s", e, exc_info=True)
            raise

    def needs_refit(self) -> bool:
        """
        Check if the index needs rebuilding.

        Returns:
            True if the index is not fitted, or if the number of updates
            recorded via mark_update() has reached the refit threshold
        """
        return not self._fitted or self._pending_updates >= self._refit_threshold

    def mark_update(self) -> None:
        """
        Mark that an item update occurred.

        Call this after adding/updating/removing items to track
        when a refit is needed. The counter is reset by fit() and clear().
        """
        self._pending_updates += 1

    def get_stats(self) -> dict[str, Any]:
        """
        Get statistics about the TF-IDF index.

        Returns:
            Dict with ``fitted``, ``item_count``, ``pending_updates`` and
            ``refit_threshold``. When a vectorizer exists and the index is
            fitted, ``vocabulary_size``, ``ngram_range`` and ``max_features``
            are included as well.
        """
        stats: dict[str, Any] = {
            "fitted": self._fitted,
            "item_count": len(self._item_ids),
            "pending_updates": self._pending_updates,
            "refit_threshold": self._refit_threshold,
        }

        if self._vectorizer is not None and self._fitted:
            # vocabulary_ only exists on a fitted vectorizer; fall back to
            # an empty mapping rather than raising AttributeError.
            vocab = getattr(self._vectorizer, "vocabulary_", {})
            stats["vocabulary_size"] = len(vocab)
            stats["ngram_range"] = self._ngram_range
            stats["max_features"] = self._max_features

        return stats

    def clear(self) -> None:
        """Clear the search index (the vectorizer instance is kept for reuse)."""
        self._item_ids = []
        self._vectors = None
        self._fitted = False
        self._pending_updates = 0
|