htmlgraph 0.20.1__py3-none-any.whl → 0.27.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- htmlgraph/.htmlgraph/.session-warning-state.json +6 -0
- htmlgraph/.htmlgraph/agents.json +72 -0
- htmlgraph/.htmlgraph/htmlgraph.db +0 -0
- htmlgraph/__init__.py +51 -1
- htmlgraph/__init__.pyi +123 -0
- htmlgraph/agent_detection.py +26 -10
- htmlgraph/agent_registry.py +2 -1
- htmlgraph/analytics/__init__.py +8 -1
- htmlgraph/analytics/cli.py +86 -20
- htmlgraph/analytics/cost_analyzer.py +391 -0
- htmlgraph/analytics/cost_monitor.py +664 -0
- htmlgraph/analytics/cost_reporter.py +675 -0
- htmlgraph/analytics/cross_session.py +617 -0
- htmlgraph/analytics/dependency.py +10 -6
- htmlgraph/analytics/pattern_learning.py +771 -0
- htmlgraph/analytics/session_graph.py +707 -0
- htmlgraph/analytics/strategic/__init__.py +80 -0
- htmlgraph/analytics/strategic/cost_optimizer.py +611 -0
- htmlgraph/analytics/strategic/pattern_detector.py +876 -0
- htmlgraph/analytics/strategic/preference_manager.py +709 -0
- htmlgraph/analytics/strategic/suggestion_engine.py +747 -0
- htmlgraph/analytics/work_type.py +67 -27
- htmlgraph/analytics_index.py +53 -20
- htmlgraph/api/__init__.py +3 -0
- htmlgraph/api/cost_alerts_websocket.py +416 -0
- htmlgraph/api/main.py +2498 -0
- htmlgraph/api/static/htmx.min.js +1 -0
- htmlgraph/api/static/style-redesign.css +1344 -0
- htmlgraph/api/static/style.css +1079 -0
- htmlgraph/api/templates/dashboard-redesign.html +1366 -0
- htmlgraph/api/templates/dashboard.html +794 -0
- htmlgraph/api/templates/partials/activity-feed-hierarchical.html +326 -0
- htmlgraph/api/templates/partials/activity-feed.html +1100 -0
- htmlgraph/api/templates/partials/agents-redesign.html +317 -0
- htmlgraph/api/templates/partials/agents.html +317 -0
- htmlgraph/api/templates/partials/event-traces.html +373 -0
- htmlgraph/api/templates/partials/features-kanban-redesign.html +509 -0
- htmlgraph/api/templates/partials/features.html +578 -0
- htmlgraph/api/templates/partials/metrics-redesign.html +346 -0
- htmlgraph/api/templates/partials/metrics.html +346 -0
- htmlgraph/api/templates/partials/orchestration-redesign.html +443 -0
- htmlgraph/api/templates/partials/orchestration.html +198 -0
- htmlgraph/api/templates/partials/spawners.html +375 -0
- htmlgraph/api/templates/partials/work-items.html +613 -0
- htmlgraph/api/websocket.py +538 -0
- htmlgraph/archive/__init__.py +24 -0
- htmlgraph/archive/bloom.py +234 -0
- htmlgraph/archive/fts.py +297 -0
- htmlgraph/archive/manager.py +583 -0
- htmlgraph/archive/search.py +244 -0
- htmlgraph/atomic_ops.py +560 -0
- htmlgraph/attribute_index.py +2 -1
- htmlgraph/bounded_paths.py +539 -0
- htmlgraph/builders/base.py +57 -2
- htmlgraph/builders/bug.py +19 -3
- htmlgraph/builders/chore.py +19 -3
- htmlgraph/builders/epic.py +19 -3
- htmlgraph/builders/feature.py +27 -3
- htmlgraph/builders/insight.py +2 -1
- htmlgraph/builders/metric.py +2 -1
- htmlgraph/builders/pattern.py +2 -1
- htmlgraph/builders/phase.py +19 -3
- htmlgraph/builders/spike.py +29 -3
- htmlgraph/builders/track.py +42 -1
- htmlgraph/cigs/__init__.py +81 -0
- htmlgraph/cigs/autonomy.py +385 -0
- htmlgraph/cigs/cost.py +475 -0
- htmlgraph/cigs/messages_basic.py +472 -0
- htmlgraph/cigs/messaging.py +365 -0
- htmlgraph/cigs/models.py +771 -0
- htmlgraph/cigs/pattern_storage.py +427 -0
- htmlgraph/cigs/patterns.py +503 -0
- htmlgraph/cigs/posttool_analyzer.py +234 -0
- htmlgraph/cigs/reporter.py +818 -0
- htmlgraph/cigs/tracker.py +317 -0
- htmlgraph/cli/.htmlgraph/.session-warning-state.json +6 -0
- htmlgraph/cli/.htmlgraph/agents.json +72 -0
- htmlgraph/cli/.htmlgraph/htmlgraph.db +0 -0
- htmlgraph/cli/__init__.py +42 -0
- htmlgraph/cli/__main__.py +6 -0
- htmlgraph/cli/analytics.py +1424 -0
- htmlgraph/cli/base.py +685 -0
- htmlgraph/cli/constants.py +206 -0
- htmlgraph/cli/core.py +954 -0
- htmlgraph/cli/main.py +147 -0
- htmlgraph/cli/models.py +475 -0
- htmlgraph/cli/templates/__init__.py +1 -0
- htmlgraph/cli/templates/cost_dashboard.py +399 -0
- htmlgraph/cli/work/__init__.py +239 -0
- htmlgraph/cli/work/browse.py +115 -0
- htmlgraph/cli/work/features.py +568 -0
- htmlgraph/cli/work/orchestration.py +676 -0
- htmlgraph/cli/work/report.py +728 -0
- htmlgraph/cli/work/sessions.py +466 -0
- htmlgraph/cli/work/snapshot.py +559 -0
- htmlgraph/cli/work/tracks.py +486 -0
- htmlgraph/cli_commands/__init__.py +1 -0
- htmlgraph/cli_commands/feature.py +195 -0
- htmlgraph/cli_framework.py +115 -0
- htmlgraph/collections/__init__.py +2 -0
- htmlgraph/collections/base.py +197 -14
- htmlgraph/collections/bug.py +2 -1
- htmlgraph/collections/chore.py +2 -1
- htmlgraph/collections/epic.py +2 -1
- htmlgraph/collections/feature.py +2 -1
- htmlgraph/collections/insight.py +2 -1
- htmlgraph/collections/metric.py +2 -1
- htmlgraph/collections/pattern.py +2 -1
- htmlgraph/collections/phase.py +2 -1
- htmlgraph/collections/session.py +194 -0
- htmlgraph/collections/spike.py +13 -2
- htmlgraph/collections/task_delegation.py +241 -0
- htmlgraph/collections/todo.py +14 -1
- htmlgraph/collections/traces.py +487 -0
- htmlgraph/config/cost_models.json +56 -0
- htmlgraph/config.py +190 -0
- htmlgraph/context_analytics.py +2 -1
- htmlgraph/converter.py +116 -7
- htmlgraph/cost_analysis/__init__.py +5 -0
- htmlgraph/cost_analysis/analyzer.py +438 -0
- htmlgraph/dashboard.html +2246 -248
- htmlgraph/dashboard.html.backup +6592 -0
- htmlgraph/dashboard.html.bak +7181 -0
- htmlgraph/dashboard.html.bak2 +7231 -0
- htmlgraph/dashboard.html.bak3 +7232 -0
- htmlgraph/db/__init__.py +38 -0
- htmlgraph/db/queries.py +790 -0
- htmlgraph/db/schema.py +1788 -0
- htmlgraph/decorators.py +317 -0
- htmlgraph/dependency_models.py +2 -1
- htmlgraph/deploy.py +26 -27
- htmlgraph/docs/API_REFERENCE.md +841 -0
- htmlgraph/docs/HTTP_API.md +750 -0
- htmlgraph/docs/INTEGRATION_GUIDE.md +752 -0
- htmlgraph/docs/ORCHESTRATION_PATTERNS.md +717 -0
- htmlgraph/docs/README.md +532 -0
- htmlgraph/docs/__init__.py +77 -0
- htmlgraph/docs/docs_version.py +55 -0
- htmlgraph/docs/metadata.py +93 -0
- htmlgraph/docs/migrations.py +232 -0
- htmlgraph/docs/template_engine.py +143 -0
- htmlgraph/docs/templates/_sections/cli_reference.md.j2 +52 -0
- htmlgraph/docs/templates/_sections/core_concepts.md.j2 +29 -0
- htmlgraph/docs/templates/_sections/sdk_basics.md.j2 +69 -0
- htmlgraph/docs/templates/base_agents.md.j2 +78 -0
- htmlgraph/docs/templates/example_user_override.md.j2 +47 -0
- htmlgraph/docs/version_check.py +163 -0
- htmlgraph/edge_index.py +2 -1
- htmlgraph/error_handler.py +544 -0
- htmlgraph/event_log.py +86 -37
- htmlgraph/event_migration.py +2 -1
- htmlgraph/file_watcher.py +12 -8
- htmlgraph/find_api.py +2 -1
- htmlgraph/git_events.py +67 -9
- htmlgraph/hooks/.htmlgraph/.session-warning-state.json +6 -0
- htmlgraph/hooks/.htmlgraph/agents.json +72 -0
- htmlgraph/hooks/.htmlgraph/index.sqlite +0 -0
- htmlgraph/hooks/__init__.py +8 -0
- htmlgraph/hooks/bootstrap.py +169 -0
- htmlgraph/hooks/cigs_pretool_enforcer.py +354 -0
- htmlgraph/hooks/concurrent_sessions.py +208 -0
- htmlgraph/hooks/context.py +350 -0
- htmlgraph/hooks/drift_handler.py +525 -0
- htmlgraph/hooks/event_tracker.py +790 -99
- htmlgraph/hooks/git_commands.py +175 -0
- htmlgraph/hooks/installer.py +5 -1
- htmlgraph/hooks/orchestrator.py +327 -76
- htmlgraph/hooks/orchestrator_reflector.py +31 -4
- htmlgraph/hooks/post_tool_use_failure.py +32 -7
- htmlgraph/hooks/post_tool_use_handler.py +257 -0
- htmlgraph/hooks/posttooluse.py +92 -19
- htmlgraph/hooks/pretooluse.py +527 -7
- htmlgraph/hooks/prompt_analyzer.py +637 -0
- htmlgraph/hooks/session_handler.py +668 -0
- htmlgraph/hooks/session_summary.py +395 -0
- htmlgraph/hooks/state_manager.py +504 -0
- htmlgraph/hooks/subagent_detection.py +202 -0
- htmlgraph/hooks/subagent_stop.py +369 -0
- htmlgraph/hooks/task_enforcer.py +99 -4
- htmlgraph/hooks/validator.py +212 -91
- htmlgraph/ids.py +2 -1
- htmlgraph/learning.py +125 -100
- htmlgraph/mcp_server.py +2 -1
- htmlgraph/models.py +217 -18
- htmlgraph/operations/README.md +62 -0
- htmlgraph/operations/__init__.py +79 -0
- htmlgraph/operations/analytics.py +339 -0
- htmlgraph/operations/bootstrap.py +289 -0
- htmlgraph/operations/events.py +244 -0
- htmlgraph/operations/fastapi_server.py +231 -0
- htmlgraph/operations/hooks.py +350 -0
- htmlgraph/operations/initialization.py +597 -0
- htmlgraph/operations/initialization.py.backup +228 -0
- htmlgraph/operations/server.py +303 -0
- htmlgraph/orchestration/__init__.py +58 -0
- htmlgraph/orchestration/claude_launcher.py +179 -0
- htmlgraph/orchestration/command_builder.py +72 -0
- htmlgraph/orchestration/headless_spawner.py +281 -0
- htmlgraph/orchestration/live_events.py +377 -0
- htmlgraph/orchestration/model_selection.py +327 -0
- htmlgraph/orchestration/plugin_manager.py +140 -0
- htmlgraph/orchestration/prompts.py +137 -0
- htmlgraph/orchestration/spawner_event_tracker.py +383 -0
- htmlgraph/orchestration/spawners/__init__.py +16 -0
- htmlgraph/orchestration/spawners/base.py +194 -0
- htmlgraph/orchestration/spawners/claude.py +173 -0
- htmlgraph/orchestration/spawners/codex.py +435 -0
- htmlgraph/orchestration/spawners/copilot.py +294 -0
- htmlgraph/orchestration/spawners/gemini.py +471 -0
- htmlgraph/orchestration/subprocess_runner.py +36 -0
- htmlgraph/{orchestration.py → orchestration/task_coordination.py} +16 -8
- htmlgraph/orchestration.md +563 -0
- htmlgraph/orchestrator-system-prompt-optimized.txt +863 -0
- htmlgraph/orchestrator.py +2 -1
- htmlgraph/orchestrator_config.py +357 -0
- htmlgraph/orchestrator_mode.py +115 -4
- htmlgraph/parallel.py +2 -1
- htmlgraph/parser.py +86 -6
- htmlgraph/path_query.py +608 -0
- htmlgraph/pattern_matcher.py +636 -0
- htmlgraph/pydantic_models.py +476 -0
- htmlgraph/quality_gates.py +350 -0
- htmlgraph/query_builder.py +2 -1
- htmlgraph/query_composer.py +509 -0
- htmlgraph/reflection.py +443 -0
- htmlgraph/refs.py +344 -0
- htmlgraph/repo_hash.py +512 -0
- htmlgraph/repositories/__init__.py +292 -0
- htmlgraph/repositories/analytics_repository.py +455 -0
- htmlgraph/repositories/analytics_repository_standard.py +628 -0
- htmlgraph/repositories/feature_repository.py +581 -0
- htmlgraph/repositories/feature_repository_htmlfile.py +668 -0
- htmlgraph/repositories/feature_repository_memory.py +607 -0
- htmlgraph/repositories/feature_repository_sqlite.py +858 -0
- htmlgraph/repositories/filter_service.py +620 -0
- htmlgraph/repositories/filter_service_standard.py +445 -0
- htmlgraph/repositories/shared_cache.py +621 -0
- htmlgraph/repositories/shared_cache_memory.py +395 -0
- htmlgraph/repositories/track_repository.py +552 -0
- htmlgraph/repositories/track_repository_htmlfile.py +619 -0
- htmlgraph/repositories/track_repository_memory.py +508 -0
- htmlgraph/repositories/track_repository_sqlite.py +711 -0
- htmlgraph/sdk/__init__.py +398 -0
- htmlgraph/sdk/__init__.pyi +14 -0
- htmlgraph/sdk/analytics/__init__.py +19 -0
- htmlgraph/sdk/analytics/engine.py +155 -0
- htmlgraph/sdk/analytics/helpers.py +178 -0
- htmlgraph/sdk/analytics/registry.py +109 -0
- htmlgraph/sdk/base.py +484 -0
- htmlgraph/sdk/constants.py +216 -0
- htmlgraph/sdk/core.pyi +308 -0
- htmlgraph/sdk/discovery.py +120 -0
- htmlgraph/sdk/help/__init__.py +12 -0
- htmlgraph/sdk/help/mixin.py +699 -0
- htmlgraph/sdk/mixins/__init__.py +15 -0
- htmlgraph/sdk/mixins/attribution.py +113 -0
- htmlgraph/sdk/mixins/mixin.py +410 -0
- htmlgraph/sdk/operations/__init__.py +12 -0
- htmlgraph/sdk/operations/mixin.py +427 -0
- htmlgraph/sdk/orchestration/__init__.py +17 -0
- htmlgraph/sdk/orchestration/coordinator.py +203 -0
- htmlgraph/sdk/orchestration/spawner.py +204 -0
- htmlgraph/sdk/planning/__init__.py +19 -0
- htmlgraph/sdk/planning/bottlenecks.py +93 -0
- htmlgraph/sdk/planning/mixin.py +211 -0
- htmlgraph/sdk/planning/parallel.py +186 -0
- htmlgraph/sdk/planning/queue.py +210 -0
- htmlgraph/sdk/planning/recommendations.py +87 -0
- htmlgraph/sdk/planning/smart_planning.py +319 -0
- htmlgraph/sdk/session/__init__.py +19 -0
- htmlgraph/sdk/session/continuity.py +57 -0
- htmlgraph/sdk/session/handoff.py +110 -0
- htmlgraph/sdk/session/info.py +309 -0
- htmlgraph/sdk/session/manager.py +103 -0
- htmlgraph/sdk/strategic/__init__.py +26 -0
- htmlgraph/sdk/strategic/mixin.py +563 -0
- htmlgraph/server.py +295 -107
- htmlgraph/session_hooks.py +300 -0
- htmlgraph/session_manager.py +285 -3
- htmlgraph/session_registry.py +587 -0
- htmlgraph/session_state.py +436 -0
- htmlgraph/session_warning.py +2 -1
- htmlgraph/sessions/__init__.py +23 -0
- htmlgraph/sessions/handoff.py +756 -0
- htmlgraph/system_prompts.py +450 -0
- htmlgraph/templates/orchestration-view.html +350 -0
- htmlgraph/track_builder.py +33 -1
- htmlgraph/track_manager.py +38 -0
- htmlgraph/transcript.py +18 -5
- htmlgraph/validation.py +115 -0
- htmlgraph/watch.py +2 -1
- htmlgraph/work_type_utils.py +2 -1
- {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/dashboard.html +2246 -248
- {htmlgraph-0.20.1.dist-info → htmlgraph-0.27.5.dist-info}/METADATA +95 -64
- htmlgraph-0.27.5.dist-info/RECORD +337 -0
- {htmlgraph-0.20.1.dist-info → htmlgraph-0.27.5.dist-info}/entry_points.txt +1 -1
- htmlgraph/cli.py +0 -4839
- htmlgraph/sdk.py +0 -2359
- htmlgraph-0.20.1.dist-info/RECORD +0 -118
- {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/styles.css +0 -0
- {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/templates/AGENTS.md.template +0 -0
- {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/templates/CLAUDE.md.template +0 -0
- {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/templates/GEMINI.md.template +0 -0
- {htmlgraph-0.20.1.dist-info → htmlgraph-0.27.5.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Bloom filter implementation for archive search optimization.
|
|
3
|
+
|
|
4
|
+
Uses MurmurHash3 for 22x faster hashing with hardware optimizations.
|
|
5
|
+
Target: 32.8% latency reduction by skipping 70-90% of archives.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
import json
|
|
10
|
+
import math
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
try:
    import mmh3  # type: ignore

    HAS_MMH3 = True
except ImportError:
    HAS_MMH3 = False


class BloomFilter:
    """
    Space-efficient probabilistic set used to decide whether an archive can be skipped.

    ``might_contain`` never returns False for an item that was added with the
    same hash backend; it may return True for an item that was never added
    (a false positive).

    Hashing uses MurmurHash3 through the optional ``mmh3`` package when it is
    installed and falls back to SHA-256 from ``hashlib`` otherwise. The two
    backends set different bits, so the backend in use is stored on the
    instance (``using_mmh3``) and persisted by ``save``.
    """

    def __init__(
        self, expected_items: int = 1000, false_positive_rate: float = 0.01
    ) -> None:
        """
        Initialize Bloom filter.

        Args:
            expected_items: Expected number of items to add. Values below 1
                are sized as if a single item were expected, so an empty
                archive still yields a usable filter.
            false_positive_rate: Desired false positive rate (0.01 = 1%).
                Must be strictly between 0 and 1.

        Raises:
            ValueError: If false_positive_rate is not strictly between 0 and 1.
        """
        # A rate of 1 or more yields a zero/negative bit count (and later a
        # modulo-by-zero in _hash); a rate of 0 or less has no logarithm.
        if not 0.0 < false_positive_rate < 1.0:
            raise ValueError(
                "false_positive_rate must be between 0 and 1 (exclusive), "
                f"got {false_positive_rate!r}"
            )

        self.expected_items = expected_items
        self.false_positive_rate = false_positive_rate

        # Hash backend that sets/reads this filter's bits.
        self.using_mmh3: bool = HAS_MMH3

        # Calculate optimal bit array size
        self.bit_count = self._optimal_bit_count(expected_items, false_positive_rate)

        # Calculate optimal number of hash functions
        self.hash_count = self._optimal_hash_count(self.bit_count, expected_items)

        # Bit array stored as bytes, rounded up to a whole byte
        self.bit_array = bytearray((self.bit_count + 7) // 8)

        # Number of add() calls (duplicates are counted again)
        self.items_added = 0

    def _optimal_bit_count(self, n: int, p: float) -> int:
        """
        Calculate optimal bit array size.

        Formula: m = -(n * ln(p)) / (ln(2)^2), truncated to an int.
        ``n`` is clamped to at least 1 and the result to at least 1 bit so
        that hashing never takes a modulo by zero.
        """
        n = max(1, n)
        return max(1, int(-n * math.log(p) / (math.log(2) ** 2)))

    def _optimal_hash_count(self, m: int, n: int) -> int:
        """
        Calculate optimal number of hash functions.

        Formula: k = (m / n) * ln(2), truncated to an int and at least 1.
        ``n`` is clamped to at least 1 to avoid dividing by zero.
        """
        return max(1, int((m / max(1, n)) * math.log(2)))

    def _hash(self, item: str, seed: int) -> int:
        """
        Hash item with seed using this filter's backend.

        Args:
            item: Item to hash
            seed: Hash seed selecting one of the ``hash_count`` hash functions

        Returns:
            Bit position in the range [0, bit_count)
        """
        if self.using_mmh3:
            # mmh3.hash may return a negative value; Python's % with a
            # positive modulus still yields a non-negative position.
            hash_val: int = mmh3.hash(item, seed)  # type: ignore
            return hash_val % self.bit_count

        # Fallback: first 4 bytes of SHA-256 over the item text followed by the seed
        digest = hashlib.sha256(f"{item}{seed}".encode()).digest()
        return int.from_bytes(digest[:4], "big") % self.bit_count

    def _set_bit(self, position: int) -> None:
        """Set bit at position to 1."""
        byte_index, bit_index = divmod(position, 8)
        self.bit_array[byte_index] |= 1 << bit_index

    def _get_bit(self, position: int) -> bool:
        """Get bit value at position."""
        byte_index, bit_index = divmod(position, 8)
        return bool(self.bit_array[byte_index] & (1 << bit_index))

    def add(self, item: str) -> None:
        """
        Add item to Bloom filter.

        Args:
            item: String to add
        """
        for seed in range(self.hash_count):
            self._set_bit(self._hash(item, seed))

        self.items_added += 1

    def might_contain(self, item: str) -> bool:
        """
        Check if item might be in the set.

        The item is hashed exactly as given; ``build_for_archive`` stores
        title/description words lowercased, so word lookups must be
        lowercased by the caller.

        Args:
            item: String to check

        Returns:
            True if item might be present (or false positive)
            False if item is definitely not present
        """
        return all(
            self._get_bit(self._hash(item, seed)) for seed in range(self.hash_count)
        )

    def build_for_archive(self, entities: list[dict[str, Any]]) -> None:
        """
        Build Bloom filter from archive entities.

        Indexes:
        - Entity IDs (as given; entities without an ID contribute only words)
        - Titles (lowercased, split on whitespace)
        - Description text (lowercased, split on whitespace)

        Args:
            entities: List of entity dictionaries with id, title, description
        """
        for entity in entities:
            # A missing/empty id is skipped instead of raising KeyError,
            # matching the FTS indexer, which also tolerates id-less entities.
            entity_id = entity.get("id")
            if entity_id:
                self.add(str(entity_id))

            for field in ("title", "description"):
                text = entity.get(field)
                if text:
                    for word in text.lower().split():
                        self.add(word)

    def save(self, filepath: Path) -> None:
        """
        Save Bloom filter to disk as JSON.

        The hash backend is written alongside the bits so that ``load`` can
        refuse (or adapt to) a file produced with a different backend.

        Args:
            filepath: Path to save .bloom file
        """
        data = {
            "expected_items": self.expected_items,
            "false_positive_rate": self.false_positive_rate,
            "bit_count": self.bit_count,
            "hash_count": self.hash_count,
            "items_added": self.items_added,
            "using_mmh3": self.using_mmh3,
            "bit_array": list(self.bit_array),  # Convert bytearray to list for JSON
        }

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, filepath: Path) -> "BloomFilter":
        """
        Load Bloom filter from disk.

        Files written before the backend was recorded have no ``using_mmh3``
        key; they are assumed to match the current environment.

        Args:
            filepath: Path to .bloom file

        Returns:
            Loaded BloomFilter instance

        Raises:
            ValueError: If the stored sizes are inconsistent, or the file was
                built with mmh3 and mmh3 is not installed (lookups would
                silently miss items that were added).
        """
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        bit_count = data["bit_count"]
        hash_count = data["hash_count"]
        bit_array = bytearray(data["bit_array"])

        if bit_count < 1 or hash_count < 1 or len(bit_array) != (bit_count + 7) // 8:
            raise ValueError(f"Corrupt Bloom filter file: {filepath}")

        saved_with_mmh3 = bool(data.get("using_mmh3", HAS_MMH3))
        if saved_with_mmh3 and not HAS_MMH3:
            raise ValueError(
                f"Bloom filter {filepath} was built with mmh3, which is not "
                "installed; rebuild the filter or install mmh3"
            )

        # Create instance with saved parameters
        bloom = cls(
            expected_items=data["expected_items"],
            false_positive_rate=data["false_positive_rate"],
        )

        # Restore state
        bloom.bit_count = bit_count
        bloom.hash_count = hash_count
        bloom.items_added = data["items_added"]
        bloom.bit_array = bit_array
        # A file built with the hashlib fallback stays readable even when
        # mmh3 is available now, because hashlib is always present.
        bloom.using_mmh3 = saved_with_mmh3

        return bloom

    def get_stats(self) -> dict[str, Any]:
        """
        Get Bloom filter statistics.

        ``actual_fpr`` is the theoretical estimate (1 - e^(-k*n/m))^k computed
        from ``items_added``; repeated adds of the same item inflate it.

        Returns:
            Dictionary with stats (size, items, FPR, etc.)
        """
        if self.items_added > 0:
            fill_ratio = 1 - math.exp(
                -self.hash_count * self.items_added / self.bit_count
            )
            actual_fpr = fill_ratio**self.hash_count
        else:
            actual_fpr = 0.0

        utilization = (
            self.items_added / self.expected_items if self.expected_items > 0 else 0
        )

        return {
            "expected_items": self.expected_items,
            "items_added": self.items_added,
            "bit_count": self.bit_count,
            "hash_count": self.hash_count,
            "bytes_used": len(self.bit_array),
            "target_fpr": self.false_positive_rate,
            "actual_fpr": actual_fpr,
            "utilization": utilization,
            "using_mmh3": self.using_mmh3,
        }
|
htmlgraph/archive/fts.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SQLite FTS5 full-text search index for archive content.
|
|
3
|
+
|
|
4
|
+
Uses BM25 ranking for relevance scoring with O(log n) search performance.
|
|
5
|
+
Provides snippet extraction with highlighting for matched terms.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sqlite3
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ArchiveFTS5Index:
|
|
14
|
+
"""
|
|
15
|
+
Full-text search index using SQLite FTS5.
|
|
16
|
+
|
|
17
|
+
Features:
|
|
18
|
+
- Porter stemming for better matching
|
|
19
|
+
- Unicode61 tokenization for international text
|
|
20
|
+
- BM25 ranking for relevance scoring
|
|
21
|
+
- Snippet extraction with highlighting
|
|
22
|
+
- Metadata table for quick lookups
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
def __init__(self, db_path: Path) -> None:
|
|
26
|
+
"""
|
|
27
|
+
Initialize FTS5 index.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
db_path: Path to SQLite database file
|
|
31
|
+
"""
|
|
32
|
+
self.db_path = db_path
|
|
33
|
+
self.conn: sqlite3.Connection | None = None
|
|
34
|
+
self._ensure_schema()
|
|
35
|
+
|
|
36
|
+
def _ensure_schema(self) -> None:
|
|
37
|
+
"""Create FTS5 tables if they don't exist."""
|
|
38
|
+
conn = self._get_connection()
|
|
39
|
+
|
|
40
|
+
# Create FTS5 virtual table with porter stemming
|
|
41
|
+
conn.execute(
|
|
42
|
+
"""
|
|
43
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS archive_fts USING fts5(
|
|
44
|
+
entity_id UNINDEXED,
|
|
45
|
+
title,
|
|
46
|
+
description,
|
|
47
|
+
content,
|
|
48
|
+
tokenize='porter unicode61'
|
|
49
|
+
)
|
|
50
|
+
"""
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Create metadata table for quick lookups
|
|
54
|
+
conn.execute(
|
|
55
|
+
"""
|
|
56
|
+
CREATE TABLE IF NOT EXISTS archive_metadata (
|
|
57
|
+
entity_id TEXT PRIMARY KEY,
|
|
58
|
+
archive_file TEXT NOT NULL,
|
|
59
|
+
entity_type TEXT,
|
|
60
|
+
status TEXT,
|
|
61
|
+
created TEXT,
|
|
62
|
+
updated TEXT
|
|
63
|
+
)
|
|
64
|
+
"""
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# Create index on archive_file for filtering
|
|
68
|
+
conn.execute(
|
|
69
|
+
"""
|
|
70
|
+
CREATE INDEX IF NOT EXISTS idx_archive_file
|
|
71
|
+
ON archive_metadata(archive_file)
|
|
72
|
+
"""
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
conn.commit()
|
|
76
|
+
|
|
77
|
+
def _get_connection(self) -> sqlite3.Connection:
|
|
78
|
+
"""Get database connection (create if needed)."""
|
|
79
|
+
if self.conn is None:
|
|
80
|
+
self.conn = sqlite3.connect(str(self.db_path))
|
|
81
|
+
self.conn.row_factory = sqlite3.Row # Enable dict-like access
|
|
82
|
+
return self.conn
|
|
83
|
+
|
|
84
|
+
def index_archive(self, archive_file: str, entities: list[dict[str, Any]]) -> None:
|
|
85
|
+
"""
|
|
86
|
+
Index entities from an archive file.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
archive_file: Name of archive file (e.g., '2024-Q4-completed.html')
|
|
90
|
+
entities: List of entity dictionaries
|
|
91
|
+
"""
|
|
92
|
+
conn = self._get_connection()
|
|
93
|
+
|
|
94
|
+
for entity in entities:
|
|
95
|
+
entity_id = entity.get("id", "")
|
|
96
|
+
title = entity.get("title", "")
|
|
97
|
+
description = entity.get("description", "")
|
|
98
|
+
content = entity.get("content", "")
|
|
99
|
+
|
|
100
|
+
# Insert into FTS5 table
|
|
101
|
+
conn.execute(
|
|
102
|
+
"""
|
|
103
|
+
INSERT INTO archive_fts (entity_id, title, description, content)
|
|
104
|
+
VALUES (?, ?, ?, ?)
|
|
105
|
+
""",
|
|
106
|
+
(entity_id, title, description, content),
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Insert into metadata table
|
|
110
|
+
conn.execute(
|
|
111
|
+
"""
|
|
112
|
+
INSERT OR REPLACE INTO archive_metadata
|
|
113
|
+
(entity_id, archive_file, entity_type, status, created, updated)
|
|
114
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
115
|
+
""",
|
|
116
|
+
(
|
|
117
|
+
entity_id,
|
|
118
|
+
archive_file,
|
|
119
|
+
entity.get("type", ""),
|
|
120
|
+
entity.get("status", ""),
|
|
121
|
+
entity.get("created", ""),
|
|
122
|
+
entity.get("updated", ""),
|
|
123
|
+
),
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
conn.commit()
|
|
127
|
+
|
|
128
|
+
def search(
|
|
129
|
+
self,
|
|
130
|
+
query: str,
|
|
131
|
+
limit: int = 10,
|
|
132
|
+
archive_files: list[str] | None = None,
|
|
133
|
+
) -> list[dict[str, Any]]:
|
|
134
|
+
"""
|
|
135
|
+
Search indexed archives with BM25 ranking.
|
|
136
|
+
|
|
137
|
+
Args:
|
|
138
|
+
query: Search query
|
|
139
|
+
limit: Maximum number of results
|
|
140
|
+
archive_files: Optional list of archive files to search
|
|
141
|
+
|
|
142
|
+
Returns:
|
|
143
|
+
List of results with entity_id, title, rank, snippet, archive_file
|
|
144
|
+
"""
|
|
145
|
+
conn = self._get_connection()
|
|
146
|
+
|
|
147
|
+
# Build query with optional archive file filter
|
|
148
|
+
if archive_files:
|
|
149
|
+
placeholders = ",".join("?" * len(archive_files))
|
|
150
|
+
sql = f"""
|
|
151
|
+
SELECT
|
|
152
|
+
fts.entity_id,
|
|
153
|
+
meta.archive_file,
|
|
154
|
+
meta.entity_type,
|
|
155
|
+
meta.status,
|
|
156
|
+
snippet(archive_fts, 1, '<mark>', '</mark>', '...', 32) as title_snippet,
|
|
157
|
+
snippet(archive_fts, 2, '<mark>', '</mark>', '...', 64) as description_snippet,
|
|
158
|
+
bm25(archive_fts) as rank
|
|
159
|
+
FROM archive_fts fts
|
|
160
|
+
JOIN archive_metadata meta ON fts.entity_id = meta.entity_id
|
|
161
|
+
WHERE archive_fts MATCH ?
|
|
162
|
+
AND meta.archive_file IN ({placeholders})
|
|
163
|
+
ORDER BY rank
|
|
164
|
+
LIMIT ?
|
|
165
|
+
"""
|
|
166
|
+
params = [query] + archive_files + [limit]
|
|
167
|
+
else:
|
|
168
|
+
sql = """
|
|
169
|
+
SELECT
|
|
170
|
+
fts.entity_id,
|
|
171
|
+
meta.archive_file,
|
|
172
|
+
meta.entity_type,
|
|
173
|
+
meta.status,
|
|
174
|
+
snippet(archive_fts, 1, '<mark>', '</mark>', '...', 32) as title_snippet,
|
|
175
|
+
snippet(archive_fts, 2, '<mark>', '</mark>', '...', 64) as description_snippet,
|
|
176
|
+
bm25(archive_fts) as rank
|
|
177
|
+
FROM archive_fts fts
|
|
178
|
+
JOIN archive_metadata meta ON fts.entity_id = meta.entity_id
|
|
179
|
+
WHERE archive_fts MATCH ?
|
|
180
|
+
ORDER BY rank
|
|
181
|
+
LIMIT ?
|
|
182
|
+
"""
|
|
183
|
+
params = [query, limit]
|
|
184
|
+
|
|
185
|
+
cursor = conn.execute(sql, params)
|
|
186
|
+
|
|
187
|
+
results = []
|
|
188
|
+
for row in cursor:
|
|
189
|
+
results.append(
|
|
190
|
+
{
|
|
191
|
+
"entity_id": row["entity_id"],
|
|
192
|
+
"archive_file": row["archive_file"],
|
|
193
|
+
"entity_type": row["entity_type"],
|
|
194
|
+
"status": row["status"],
|
|
195
|
+
"title_snippet": row["title_snippet"],
|
|
196
|
+
"description_snippet": row["description_snippet"],
|
|
197
|
+
"rank": row["rank"],
|
|
198
|
+
}
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
return results
|
|
202
|
+
|
|
203
|
+
def get_entity_metadata(self, entity_id: str) -> dict[str, Any] | None:
|
|
204
|
+
"""
|
|
205
|
+
Get metadata for an entity.
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
entity_id: Entity identifier
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
Metadata dictionary or None if not found
|
|
212
|
+
"""
|
|
213
|
+
conn = self._get_connection()
|
|
214
|
+
|
|
215
|
+
cursor = conn.execute(
|
|
216
|
+
"""
|
|
217
|
+
SELECT entity_id, archive_file, entity_type, status, created, updated
|
|
218
|
+
FROM archive_metadata
|
|
219
|
+
WHERE entity_id = ?
|
|
220
|
+
""",
|
|
221
|
+
(entity_id,),
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
row = cursor.fetchone()
|
|
225
|
+
if row:
|
|
226
|
+
return dict(row)
|
|
227
|
+
return None
|
|
228
|
+
|
|
229
|
+
def remove_archive(self, archive_file: str) -> None:
|
|
230
|
+
"""
|
|
231
|
+
Remove all entities from a specific archive file.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
archive_file: Archive file to remove
|
|
235
|
+
"""
|
|
236
|
+
conn = self._get_connection()
|
|
237
|
+
|
|
238
|
+
# Get entity IDs to remove
|
|
239
|
+
cursor = conn.execute(
|
|
240
|
+
"SELECT entity_id FROM archive_metadata WHERE archive_file = ?",
|
|
241
|
+
(archive_file,),
|
|
242
|
+
)
|
|
243
|
+
entity_ids = [row["entity_id"] for row in cursor]
|
|
244
|
+
|
|
245
|
+
# Remove from FTS5
|
|
246
|
+
for entity_id in entity_ids:
|
|
247
|
+
conn.execute("DELETE FROM archive_fts WHERE entity_id = ?", (entity_id,))
|
|
248
|
+
|
|
249
|
+
# Remove from metadata
|
|
250
|
+
conn.execute(
|
|
251
|
+
"DELETE FROM archive_metadata WHERE archive_file = ?", (archive_file,)
|
|
252
|
+
)
|
|
253
|
+
|
|
254
|
+
conn.commit()
|
|
255
|
+
|
|
256
|
+
def get_stats(self) -> dict[str, Any]:
|
|
257
|
+
"""
|
|
258
|
+
Get index statistics.
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
Dictionary with entity count, archive count, etc.
|
|
262
|
+
"""
|
|
263
|
+
conn = self._get_connection()
|
|
264
|
+
|
|
265
|
+
# Count entities
|
|
266
|
+
cursor = conn.execute("SELECT COUNT(*) as count FROM archive_metadata")
|
|
267
|
+
entity_count = cursor.fetchone()["count"]
|
|
268
|
+
|
|
269
|
+
# Count archives
|
|
270
|
+
cursor = conn.execute(
|
|
271
|
+
"SELECT COUNT(DISTINCT archive_file) as count FROM archive_metadata"
|
|
272
|
+
)
|
|
273
|
+
archive_count = cursor.fetchone()["count"]
|
|
274
|
+
|
|
275
|
+
# Get database size
|
|
276
|
+
db_size = self.db_path.stat().st_size if self.db_path.exists() else 0
|
|
277
|
+
|
|
278
|
+
return {
|
|
279
|
+
"entity_count": entity_count,
|
|
280
|
+
"archive_count": archive_count,
|
|
281
|
+
"db_size_bytes": db_size,
|
|
282
|
+
"db_size_mb": db_size / (1024 * 1024),
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
def close(self) -> None:
|
|
286
|
+
"""Close database connection."""
|
|
287
|
+
if self.conn:
|
|
288
|
+
self.conn.close()
|
|
289
|
+
self.conn = None
|
|
290
|
+
|
|
291
|
+
def __enter__(self) -> "ArchiveFTS5Index":
|
|
292
|
+
"""Context manager entry."""
|
|
293
|
+
return self
|
|
294
|
+
|
|
295
|
+
def __exit__(self, *args: Any) -> None:
|
|
296
|
+
"""Context manager exit."""
|
|
297
|
+
self.close()
|