htmlgraph 0.9.3__py3-none-any.whl → 0.27.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- htmlgraph/.htmlgraph/.session-warning-state.json +6 -0
- htmlgraph/.htmlgraph/agents.json +72 -0
- htmlgraph/.htmlgraph/htmlgraph.db +0 -0
- htmlgraph/__init__.py +173 -17
- htmlgraph/__init__.pyi +123 -0
- htmlgraph/agent_detection.py +127 -0
- htmlgraph/agent_registry.py +45 -30
- htmlgraph/agents.py +160 -107
- htmlgraph/analytics/__init__.py +9 -2
- htmlgraph/analytics/cli.py +190 -51
- htmlgraph/analytics/cost_analyzer.py +391 -0
- htmlgraph/analytics/cost_monitor.py +664 -0
- htmlgraph/analytics/cost_reporter.py +675 -0
- htmlgraph/analytics/cross_session.py +617 -0
- htmlgraph/analytics/dependency.py +192 -100
- htmlgraph/analytics/pattern_learning.py +771 -0
- htmlgraph/analytics/session_graph.py +707 -0
- htmlgraph/analytics/strategic/__init__.py +80 -0
- htmlgraph/analytics/strategic/cost_optimizer.py +611 -0
- htmlgraph/analytics/strategic/pattern_detector.py +876 -0
- htmlgraph/analytics/strategic/preference_manager.py +709 -0
- htmlgraph/analytics/strategic/suggestion_engine.py +747 -0
- htmlgraph/analytics/work_type.py +190 -14
- htmlgraph/analytics_index.py +135 -51
- htmlgraph/api/__init__.py +3 -0
- htmlgraph/api/cost_alerts_websocket.py +416 -0
- htmlgraph/api/main.py +2498 -0
- htmlgraph/api/static/htmx.min.js +1 -0
- htmlgraph/api/static/style-redesign.css +1344 -0
- htmlgraph/api/static/style.css +1079 -0
- htmlgraph/api/templates/dashboard-redesign.html +1366 -0
- htmlgraph/api/templates/dashboard.html +794 -0
- htmlgraph/api/templates/partials/activity-feed-hierarchical.html +326 -0
- htmlgraph/api/templates/partials/activity-feed.html +1100 -0
- htmlgraph/api/templates/partials/agents-redesign.html +317 -0
- htmlgraph/api/templates/partials/agents.html +317 -0
- htmlgraph/api/templates/partials/event-traces.html +373 -0
- htmlgraph/api/templates/partials/features-kanban-redesign.html +509 -0
- htmlgraph/api/templates/partials/features.html +578 -0
- htmlgraph/api/templates/partials/metrics-redesign.html +346 -0
- htmlgraph/api/templates/partials/metrics.html +346 -0
- htmlgraph/api/templates/partials/orchestration-redesign.html +443 -0
- htmlgraph/api/templates/partials/orchestration.html +198 -0
- htmlgraph/api/templates/partials/spawners.html +375 -0
- htmlgraph/api/templates/partials/work-items.html +613 -0
- htmlgraph/api/websocket.py +538 -0
- htmlgraph/archive/__init__.py +24 -0
- htmlgraph/archive/bloom.py +234 -0
- htmlgraph/archive/fts.py +297 -0
- htmlgraph/archive/manager.py +583 -0
- htmlgraph/archive/search.py +244 -0
- htmlgraph/atomic_ops.py +560 -0
- htmlgraph/attribute_index.py +208 -0
- htmlgraph/bounded_paths.py +539 -0
- htmlgraph/builders/__init__.py +14 -0
- htmlgraph/builders/base.py +118 -29
- htmlgraph/builders/bug.py +150 -0
- htmlgraph/builders/chore.py +119 -0
- htmlgraph/builders/epic.py +150 -0
- htmlgraph/builders/feature.py +31 -6
- htmlgraph/builders/insight.py +195 -0
- htmlgraph/builders/metric.py +217 -0
- htmlgraph/builders/pattern.py +202 -0
- htmlgraph/builders/phase.py +162 -0
- htmlgraph/builders/spike.py +52 -19
- htmlgraph/builders/track.py +148 -72
- htmlgraph/cigs/__init__.py +81 -0
- htmlgraph/cigs/autonomy.py +385 -0
- htmlgraph/cigs/cost.py +475 -0
- htmlgraph/cigs/messages_basic.py +472 -0
- htmlgraph/cigs/messaging.py +365 -0
- htmlgraph/cigs/models.py +771 -0
- htmlgraph/cigs/pattern_storage.py +427 -0
- htmlgraph/cigs/patterns.py +503 -0
- htmlgraph/cigs/posttool_analyzer.py +234 -0
- htmlgraph/cigs/reporter.py +818 -0
- htmlgraph/cigs/tracker.py +317 -0
- htmlgraph/cli/.htmlgraph/.session-warning-state.json +6 -0
- htmlgraph/cli/.htmlgraph/agents.json +72 -0
- htmlgraph/cli/.htmlgraph/htmlgraph.db +0 -0
- htmlgraph/cli/__init__.py +42 -0
- htmlgraph/cli/__main__.py +6 -0
- htmlgraph/cli/analytics.py +1424 -0
- htmlgraph/cli/base.py +685 -0
- htmlgraph/cli/constants.py +206 -0
- htmlgraph/cli/core.py +954 -0
- htmlgraph/cli/main.py +147 -0
- htmlgraph/cli/models.py +475 -0
- htmlgraph/cli/templates/__init__.py +1 -0
- htmlgraph/cli/templates/cost_dashboard.py +399 -0
- htmlgraph/cli/work/__init__.py +239 -0
- htmlgraph/cli/work/browse.py +115 -0
- htmlgraph/cli/work/features.py +568 -0
- htmlgraph/cli/work/orchestration.py +676 -0
- htmlgraph/cli/work/report.py +728 -0
- htmlgraph/cli/work/sessions.py +466 -0
- htmlgraph/cli/work/snapshot.py +559 -0
- htmlgraph/cli/work/tracks.py +486 -0
- htmlgraph/cli_commands/__init__.py +1 -0
- htmlgraph/cli_commands/feature.py +195 -0
- htmlgraph/cli_framework.py +115 -0
- htmlgraph/collections/__init__.py +18 -0
- htmlgraph/collections/base.py +415 -98
- htmlgraph/collections/bug.py +53 -0
- htmlgraph/collections/chore.py +53 -0
- htmlgraph/collections/epic.py +53 -0
- htmlgraph/collections/feature.py +12 -26
- htmlgraph/collections/insight.py +100 -0
- htmlgraph/collections/metric.py +92 -0
- htmlgraph/collections/pattern.py +97 -0
- htmlgraph/collections/phase.py +53 -0
- htmlgraph/collections/session.py +194 -0
- htmlgraph/collections/spike.py +56 -16
- htmlgraph/collections/task_delegation.py +241 -0
- htmlgraph/collections/todo.py +511 -0
- htmlgraph/collections/traces.py +487 -0
- htmlgraph/config/cost_models.json +56 -0
- htmlgraph/config.py +190 -0
- htmlgraph/context_analytics.py +344 -0
- htmlgraph/converter.py +216 -28
- htmlgraph/cost_analysis/__init__.py +5 -0
- htmlgraph/cost_analysis/analyzer.py +438 -0
- htmlgraph/dashboard.html +2406 -307
- htmlgraph/dashboard.html.backup +6592 -0
- htmlgraph/dashboard.html.bak +7181 -0
- htmlgraph/dashboard.html.bak2 +7231 -0
- htmlgraph/dashboard.html.bak3 +7232 -0
- htmlgraph/db/__init__.py +38 -0
- htmlgraph/db/queries.py +790 -0
- htmlgraph/db/schema.py +1788 -0
- htmlgraph/decorators.py +317 -0
- htmlgraph/dependency_models.py +19 -2
- htmlgraph/deploy.py +142 -125
- htmlgraph/deployment_models.py +474 -0
- htmlgraph/docs/API_REFERENCE.md +841 -0
- htmlgraph/docs/HTTP_API.md +750 -0
- htmlgraph/docs/INTEGRATION_GUIDE.md +752 -0
- htmlgraph/docs/ORCHESTRATION_PATTERNS.md +717 -0
- htmlgraph/docs/README.md +532 -0
- htmlgraph/docs/__init__.py +77 -0
- htmlgraph/docs/docs_version.py +55 -0
- htmlgraph/docs/metadata.py +93 -0
- htmlgraph/docs/migrations.py +232 -0
- htmlgraph/docs/template_engine.py +143 -0
- htmlgraph/docs/templates/_sections/cli_reference.md.j2 +52 -0
- htmlgraph/docs/templates/_sections/core_concepts.md.j2 +29 -0
- htmlgraph/docs/templates/_sections/sdk_basics.md.j2 +69 -0
- htmlgraph/docs/templates/base_agents.md.j2 +78 -0
- htmlgraph/docs/templates/example_user_override.md.j2 +47 -0
- htmlgraph/docs/version_check.py +163 -0
- htmlgraph/edge_index.py +182 -27
- htmlgraph/error_handler.py +544 -0
- htmlgraph/event_log.py +100 -52
- htmlgraph/event_migration.py +13 -4
- htmlgraph/exceptions.py +49 -0
- htmlgraph/file_watcher.py +101 -28
- htmlgraph/find_api.py +75 -63
- htmlgraph/git_events.py +145 -63
- htmlgraph/graph.py +1122 -106
- htmlgraph/hooks/.htmlgraph/.session-warning-state.json +6 -0
- htmlgraph/hooks/.htmlgraph/agents.json +72 -0
- htmlgraph/hooks/.htmlgraph/index.sqlite +0 -0
- htmlgraph/hooks/__init__.py +45 -0
- htmlgraph/hooks/bootstrap.py +169 -0
- htmlgraph/hooks/cigs_pretool_enforcer.py +354 -0
- htmlgraph/hooks/concurrent_sessions.py +208 -0
- htmlgraph/hooks/context.py +350 -0
- htmlgraph/hooks/drift_handler.py +525 -0
- htmlgraph/hooks/event_tracker.py +1314 -0
- htmlgraph/hooks/git_commands.py +175 -0
- htmlgraph/hooks/hooks-config.example.json +12 -0
- htmlgraph/hooks/installer.py +343 -0
- htmlgraph/hooks/orchestrator.py +674 -0
- htmlgraph/hooks/orchestrator_reflector.py +223 -0
- htmlgraph/hooks/post-checkout.sh +28 -0
- htmlgraph/hooks/post-commit.sh +24 -0
- htmlgraph/hooks/post-merge.sh +26 -0
- htmlgraph/hooks/post_tool_use_failure.py +273 -0
- htmlgraph/hooks/post_tool_use_handler.py +257 -0
- htmlgraph/hooks/posttooluse.py +408 -0
- htmlgraph/hooks/pre-commit.sh +94 -0
- htmlgraph/hooks/pre-push.sh +28 -0
- htmlgraph/hooks/pretooluse.py +819 -0
- htmlgraph/hooks/prompt_analyzer.py +637 -0
- htmlgraph/hooks/session_handler.py +668 -0
- htmlgraph/hooks/session_summary.py +395 -0
- htmlgraph/hooks/state_manager.py +504 -0
- htmlgraph/hooks/subagent_detection.py +202 -0
- htmlgraph/hooks/subagent_stop.py +369 -0
- htmlgraph/hooks/task_enforcer.py +255 -0
- htmlgraph/hooks/task_validator.py +177 -0
- htmlgraph/hooks/validator.py +628 -0
- htmlgraph/ids.py +41 -27
- htmlgraph/index.d.ts +286 -0
- htmlgraph/learning.py +767 -0
- htmlgraph/mcp_server.py +69 -23
- htmlgraph/models.py +1586 -87
- htmlgraph/operations/README.md +62 -0
- htmlgraph/operations/__init__.py +79 -0
- htmlgraph/operations/analytics.py +339 -0
- htmlgraph/operations/bootstrap.py +289 -0
- htmlgraph/operations/events.py +244 -0
- htmlgraph/operations/fastapi_server.py +231 -0
- htmlgraph/operations/hooks.py +350 -0
- htmlgraph/operations/initialization.py +597 -0
- htmlgraph/operations/initialization.py.backup +228 -0
- htmlgraph/operations/server.py +303 -0
- htmlgraph/orchestration/__init__.py +58 -0
- htmlgraph/orchestration/claude_launcher.py +179 -0
- htmlgraph/orchestration/command_builder.py +72 -0
- htmlgraph/orchestration/headless_spawner.py +281 -0
- htmlgraph/orchestration/live_events.py +377 -0
- htmlgraph/orchestration/model_selection.py +327 -0
- htmlgraph/orchestration/plugin_manager.py +140 -0
- htmlgraph/orchestration/prompts.py +137 -0
- htmlgraph/orchestration/spawner_event_tracker.py +383 -0
- htmlgraph/orchestration/spawners/__init__.py +16 -0
- htmlgraph/orchestration/spawners/base.py +194 -0
- htmlgraph/orchestration/spawners/claude.py +173 -0
- htmlgraph/orchestration/spawners/codex.py +435 -0
- htmlgraph/orchestration/spawners/copilot.py +294 -0
- htmlgraph/orchestration/spawners/gemini.py +471 -0
- htmlgraph/orchestration/subprocess_runner.py +36 -0
- htmlgraph/orchestration/task_coordination.py +343 -0
- htmlgraph/orchestration.md +563 -0
- htmlgraph/orchestrator-system-prompt-optimized.txt +863 -0
- htmlgraph/orchestrator.py +669 -0
- htmlgraph/orchestrator_config.py +357 -0
- htmlgraph/orchestrator_mode.py +328 -0
- htmlgraph/orchestrator_validator.py +133 -0
- htmlgraph/parallel.py +646 -0
- htmlgraph/parser.py +160 -35
- htmlgraph/path_query.py +608 -0
- htmlgraph/pattern_matcher.py +636 -0
- htmlgraph/planning.py +147 -52
- htmlgraph/pydantic_models.py +476 -0
- htmlgraph/quality_gates.py +350 -0
- htmlgraph/query_builder.py +109 -72
- htmlgraph/query_composer.py +509 -0
- htmlgraph/reflection.py +443 -0
- htmlgraph/refs.py +344 -0
- htmlgraph/repo_hash.py +512 -0
- htmlgraph/repositories/__init__.py +292 -0
- htmlgraph/repositories/analytics_repository.py +455 -0
- htmlgraph/repositories/analytics_repository_standard.py +628 -0
- htmlgraph/repositories/feature_repository.py +581 -0
- htmlgraph/repositories/feature_repository_htmlfile.py +668 -0
- htmlgraph/repositories/feature_repository_memory.py +607 -0
- htmlgraph/repositories/feature_repository_sqlite.py +858 -0
- htmlgraph/repositories/filter_service.py +620 -0
- htmlgraph/repositories/filter_service_standard.py +445 -0
- htmlgraph/repositories/shared_cache.py +621 -0
- htmlgraph/repositories/shared_cache_memory.py +395 -0
- htmlgraph/repositories/track_repository.py +552 -0
- htmlgraph/repositories/track_repository_htmlfile.py +619 -0
- htmlgraph/repositories/track_repository_memory.py +508 -0
- htmlgraph/repositories/track_repository_sqlite.py +711 -0
- htmlgraph/routing.py +8 -19
- htmlgraph/scripts/deploy.py +1 -2
- htmlgraph/sdk/__init__.py +398 -0
- htmlgraph/sdk/__init__.pyi +14 -0
- htmlgraph/sdk/analytics/__init__.py +19 -0
- htmlgraph/sdk/analytics/engine.py +155 -0
- htmlgraph/sdk/analytics/helpers.py +178 -0
- htmlgraph/sdk/analytics/registry.py +109 -0
- htmlgraph/sdk/base.py +484 -0
- htmlgraph/sdk/constants.py +216 -0
- htmlgraph/sdk/core.pyi +308 -0
- htmlgraph/sdk/discovery.py +120 -0
- htmlgraph/sdk/help/__init__.py +12 -0
- htmlgraph/sdk/help/mixin.py +699 -0
- htmlgraph/sdk/mixins/__init__.py +15 -0
- htmlgraph/sdk/mixins/attribution.py +113 -0
- htmlgraph/sdk/mixins/mixin.py +410 -0
- htmlgraph/sdk/operations/__init__.py +12 -0
- htmlgraph/sdk/operations/mixin.py +427 -0
- htmlgraph/sdk/orchestration/__init__.py +17 -0
- htmlgraph/sdk/orchestration/coordinator.py +203 -0
- htmlgraph/sdk/orchestration/spawner.py +204 -0
- htmlgraph/sdk/planning/__init__.py +19 -0
- htmlgraph/sdk/planning/bottlenecks.py +93 -0
- htmlgraph/sdk/planning/mixin.py +211 -0
- htmlgraph/sdk/planning/parallel.py +186 -0
- htmlgraph/sdk/planning/queue.py +210 -0
- htmlgraph/sdk/planning/recommendations.py +87 -0
- htmlgraph/sdk/planning/smart_planning.py +319 -0
- htmlgraph/sdk/session/__init__.py +19 -0
- htmlgraph/sdk/session/continuity.py +57 -0
- htmlgraph/sdk/session/handoff.py +110 -0
- htmlgraph/sdk/session/info.py +309 -0
- htmlgraph/sdk/session/manager.py +103 -0
- htmlgraph/sdk/strategic/__init__.py +26 -0
- htmlgraph/sdk/strategic/mixin.py +563 -0
- htmlgraph/server.py +685 -180
- htmlgraph/services/__init__.py +10 -0
- htmlgraph/services/claiming.py +199 -0
- htmlgraph/session_hooks.py +300 -0
- htmlgraph/session_manager.py +1392 -175
- htmlgraph/session_registry.py +587 -0
- htmlgraph/session_state.py +436 -0
- htmlgraph/session_warning.py +201 -0
- htmlgraph/sessions/__init__.py +23 -0
- htmlgraph/sessions/handoff.py +756 -0
- htmlgraph/setup.py +34 -17
- htmlgraph/spike_index.py +143 -0
- htmlgraph/sync_docs.py +12 -15
- htmlgraph/system_prompts.py +450 -0
- htmlgraph/templates/AGENTS.md.template +366 -0
- htmlgraph/templates/CLAUDE.md.template +97 -0
- htmlgraph/templates/GEMINI.md.template +87 -0
- htmlgraph/templates/orchestration-view.html +350 -0
- htmlgraph/track_builder.py +146 -15
- htmlgraph/track_manager.py +69 -21
- htmlgraph/transcript.py +890 -0
- htmlgraph/transcript_analytics.py +699 -0
- htmlgraph/types.py +323 -0
- htmlgraph/validation.py +115 -0
- htmlgraph/watch.py +8 -5
- htmlgraph/work_type_utils.py +3 -2
- {htmlgraph-0.9.3.data → htmlgraph-0.27.5.data}/data/htmlgraph/dashboard.html +2406 -307
- htmlgraph-0.27.5.data/data/htmlgraph/templates/AGENTS.md.template +366 -0
- htmlgraph-0.27.5.data/data/htmlgraph/templates/CLAUDE.md.template +97 -0
- htmlgraph-0.27.5.data/data/htmlgraph/templates/GEMINI.md.template +87 -0
- {htmlgraph-0.9.3.dist-info → htmlgraph-0.27.5.dist-info}/METADATA +97 -64
- htmlgraph-0.27.5.dist-info/RECORD +337 -0
- {htmlgraph-0.9.3.dist-info → htmlgraph-0.27.5.dist-info}/entry_points.txt +1 -1
- htmlgraph/cli.py +0 -2688
- htmlgraph/sdk.py +0 -709
- htmlgraph-0.9.3.dist-info/RECORD +0 -61
- {htmlgraph-0.9.3.data → htmlgraph-0.27.5.data}/data/htmlgraph/styles.css +0 -0
- {htmlgraph-0.9.3.dist-info → htmlgraph-0.27.5.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Archive search engine with three-tier optimization.
|
|
3
|
+
|
|
4
|
+
Tier 1: Bloom filters (skip 70-90% of archives)
|
|
5
|
+
Tier 2: SQLite FTS5 with BM25 ranking (O(log n) search)
|
|
6
|
+
Tier 3: Snippet extraction and highlighting
|
|
7
|
+
|
|
8
|
+
Target: 67x faster than naive multi-file search.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from functools import lru_cache
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from htmlgraph.archive.bloom import BloomFilter
|
|
17
|
+
from htmlgraph.archive.fts import ArchiveFTS5Index
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class SearchResult:
    """
    One hit produced by an archive search.

    Attributes:
        entity_id: Identifier of the matched entity.
        archive_file: Name of the archive file that holds the entity.
        entity_type: Kind of entity (feature, bug, etc.).
        status: Status recorded for the entity.
        title_snippet: Entity title with the matching terms highlighted.
        description_snippet: Entity description with the matching terms
            highlighted.
        rank: BM25 relevance score (lower is better).
    """

    # Field order is part of the positional constructor; keep it stable.
    entity_id: str
    archive_file: str
    entity_type: str
    status: str
    title_snippet: str
    description_snippet: str
    rank: float
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ArchiveSearchEngine:
|
|
45
|
+
"""
|
|
46
|
+
Orchestrates three-tier archive search.
|
|
47
|
+
|
|
48
|
+
Workflow:
|
|
49
|
+
1. Check Bloom filters to skip irrelevant archives (70-90% filtered)
|
|
50
|
+
2. Search remaining archives with FTS5 + BM25 ranking
|
|
51
|
+
3. Extract and highlight snippets for top results
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, archive_dir: Path, index_dir: Path) -> None:
|
|
55
|
+
"""
|
|
56
|
+
Initialize search engine.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
archive_dir: Directory containing archive HTML files
|
|
60
|
+
index_dir: Directory for Bloom filters and FTS5 index
|
|
61
|
+
"""
|
|
62
|
+
self.archive_dir = archive_dir
|
|
63
|
+
self.index_dir = index_dir
|
|
64
|
+
self.index_dir.mkdir(parents=True, exist_ok=True)
|
|
65
|
+
|
|
66
|
+
# Initialize FTS5 index
|
|
67
|
+
self.fts_index = ArchiveFTS5Index(index_dir / "archives.db")
|
|
68
|
+
|
|
69
|
+
# Cache for Bloom filters (avoid reloading)
|
|
70
|
+
self._bloom_cache: dict[str, BloomFilter] = {}
|
|
71
|
+
|
|
72
|
+
def _get_bloom_filter(self, archive_file: str) -> BloomFilter | None:
|
|
73
|
+
"""
|
|
74
|
+
Get Bloom filter for archive (with caching).
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
archive_file: Archive filename
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
BloomFilter or None if not indexed
|
|
81
|
+
"""
|
|
82
|
+
if archive_file in self._bloom_cache:
|
|
83
|
+
return self._bloom_cache[archive_file]
|
|
84
|
+
|
|
85
|
+
bloom_path = self.index_dir / f"{archive_file}.bloom"
|
|
86
|
+
if bloom_path.exists():
|
|
87
|
+
bloom = BloomFilter.load(bloom_path)
|
|
88
|
+
self._bloom_cache[archive_file] = bloom
|
|
89
|
+
return bloom
|
|
90
|
+
|
|
91
|
+
return None
|
|
92
|
+
|
|
93
|
+
def _filter_archives_with_bloom(
|
|
94
|
+
self, query: str, archive_files: list[str]
|
|
95
|
+
) -> list[str]:
|
|
96
|
+
"""
|
|
97
|
+
Filter archive files using Bloom filters.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
query: Search query
|
|
101
|
+
archive_files: List of all archive files
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
Filtered list of archives that might contain query
|
|
105
|
+
"""
|
|
106
|
+
# Tokenize query into words
|
|
107
|
+
query_tokens = query.lower().split()
|
|
108
|
+
|
|
109
|
+
candidates = []
|
|
110
|
+
|
|
111
|
+
for archive_file in archive_files:
|
|
112
|
+
bloom = self._get_bloom_filter(archive_file)
|
|
113
|
+
|
|
114
|
+
if bloom is None:
|
|
115
|
+
# No Bloom filter - include archive
|
|
116
|
+
candidates.append(archive_file)
|
|
117
|
+
continue
|
|
118
|
+
|
|
119
|
+
# Check if any query token might be in archive
|
|
120
|
+
might_match = any(bloom.might_contain(token) for token in query_tokens)
|
|
121
|
+
|
|
122
|
+
if might_match:
|
|
123
|
+
candidates.append(archive_file)
|
|
124
|
+
|
|
125
|
+
return candidates
|
|
126
|
+
|
|
127
|
+
@lru_cache(maxsize=100)
|
|
128
|
+
def _cached_search(self, query: str, limit: int) -> tuple[SearchResult, ...]:
|
|
129
|
+
"""
|
|
130
|
+
Cached search implementation.
|
|
131
|
+
|
|
132
|
+
Args:
|
|
133
|
+
query: Search query
|
|
134
|
+
limit: Maximum results
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
Tuple of SearchResult (immutable for caching)
|
|
138
|
+
"""
|
|
139
|
+
# Get all archive files
|
|
140
|
+
archive_files = [f.name for f in self.archive_dir.glob("*.html")]
|
|
141
|
+
|
|
142
|
+
if not archive_files:
|
|
143
|
+
return tuple()
|
|
144
|
+
|
|
145
|
+
# Tier 1: Filter with Bloom filters
|
|
146
|
+
candidate_archives = self._filter_archives_with_bloom(query, archive_files)
|
|
147
|
+
|
|
148
|
+
if not candidate_archives:
|
|
149
|
+
return tuple()
|
|
150
|
+
|
|
151
|
+
# Tier 2 & 3: Search with FTS5 (includes snippet highlighting)
|
|
152
|
+
results = self.fts_index.search(
|
|
153
|
+
query, limit=limit, archive_files=candidate_archives
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# Convert to SearchResult objects
|
|
157
|
+
search_results = [
|
|
158
|
+
SearchResult(
|
|
159
|
+
entity_id=r["entity_id"],
|
|
160
|
+
archive_file=r["archive_file"],
|
|
161
|
+
entity_type=r["entity_type"],
|
|
162
|
+
status=r["status"],
|
|
163
|
+
title_snippet=r["title_snippet"],
|
|
164
|
+
description_snippet=r["description_snippet"],
|
|
165
|
+
rank=r["rank"],
|
|
166
|
+
)
|
|
167
|
+
for r in results
|
|
168
|
+
]
|
|
169
|
+
|
|
170
|
+
return tuple(search_results)
|
|
171
|
+
|
|
172
|
+
def search(
|
|
173
|
+
self, query: str, include_archived: bool = True, limit: int = 10
|
|
174
|
+
) -> list[SearchResult]:
|
|
175
|
+
"""
|
|
176
|
+
Search archives with three-tier optimization.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
query: Search query
|
|
180
|
+
include_archived: Whether to search archives (future: also search active)
|
|
181
|
+
limit: Maximum results to return
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
List of SearchResult objects sorted by relevance
|
|
185
|
+
"""
|
|
186
|
+
if not include_archived:
|
|
187
|
+
return []
|
|
188
|
+
|
|
189
|
+
# Use cached search
|
|
190
|
+
results = self._cached_search(query, limit)
|
|
191
|
+
return list(results)
|
|
192
|
+
|
|
193
|
+
def get_search_stats(self, query: str) -> dict[str, Any]:
|
|
194
|
+
"""
|
|
195
|
+
Get statistics about a search query.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
query: Search query
|
|
199
|
+
|
|
200
|
+
Returns:
|
|
201
|
+
Dictionary with bloom_filtered_count, searched_count, etc.
|
|
202
|
+
"""
|
|
203
|
+
archive_files = [f.name for f in self.archive_dir.glob("*.html")]
|
|
204
|
+
total_archives = len(archive_files)
|
|
205
|
+
|
|
206
|
+
candidate_archives = self._filter_archives_with_bloom(query, archive_files)
|
|
207
|
+
searched_count = len(candidate_archives)
|
|
208
|
+
|
|
209
|
+
bloom_filtered = total_archives - searched_count
|
|
210
|
+
|
|
211
|
+
return {
|
|
212
|
+
"total_archives": total_archives,
|
|
213
|
+
"bloom_filtered": bloom_filtered,
|
|
214
|
+
"searched_count": searched_count,
|
|
215
|
+
"filter_rate": bloom_filtered / total_archives if total_archives > 0 else 0,
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
def rebuild_bloom_filters(self) -> None:
|
|
219
|
+
"""
|
|
220
|
+
Rebuild all Bloom filters from scratch.
|
|
221
|
+
|
|
222
|
+
Useful after archiving new entities.
|
|
223
|
+
"""
|
|
224
|
+
# This will be implemented by ArchiveManager when creating archives
|
|
225
|
+
# For now, this is a placeholder
|
|
226
|
+
pass
|
|
227
|
+
|
|
228
|
+
def clear_cache(self) -> None:
|
|
229
|
+
"""Clear the search cache."""
|
|
230
|
+
self._cached_search.cache_clear()
|
|
231
|
+
self._bloom_cache.clear()
|
|
232
|
+
|
|
233
|
+
def close(self) -> None:
|
|
234
|
+
"""Close all resources."""
|
|
235
|
+
self.fts_index.close()
|
|
236
|
+
self._bloom_cache.clear()
|
|
237
|
+
|
|
238
|
+
def __enter__(self) -> "ArchiveSearchEngine":
|
|
239
|
+
"""Context manager entry."""
|
|
240
|
+
return self
|
|
241
|
+
|
|
242
|
+
def __exit__(self, *args: Any) -> None:
|
|
243
|
+
"""Context manager exit."""
|
|
244
|
+
self.close()
|