htmlgraph 0.20.1__py3-none-any.whl → 0.27.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (304) hide show
  1. htmlgraph/.htmlgraph/.session-warning-state.json +6 -0
  2. htmlgraph/.htmlgraph/agents.json +72 -0
  3. htmlgraph/.htmlgraph/htmlgraph.db +0 -0
  4. htmlgraph/__init__.py +51 -1
  5. htmlgraph/__init__.pyi +123 -0
  6. htmlgraph/agent_detection.py +26 -10
  7. htmlgraph/agent_registry.py +2 -1
  8. htmlgraph/analytics/__init__.py +8 -1
  9. htmlgraph/analytics/cli.py +86 -20
  10. htmlgraph/analytics/cost_analyzer.py +391 -0
  11. htmlgraph/analytics/cost_monitor.py +664 -0
  12. htmlgraph/analytics/cost_reporter.py +675 -0
  13. htmlgraph/analytics/cross_session.py +617 -0
  14. htmlgraph/analytics/dependency.py +10 -6
  15. htmlgraph/analytics/pattern_learning.py +771 -0
  16. htmlgraph/analytics/session_graph.py +707 -0
  17. htmlgraph/analytics/strategic/__init__.py +80 -0
  18. htmlgraph/analytics/strategic/cost_optimizer.py +611 -0
  19. htmlgraph/analytics/strategic/pattern_detector.py +876 -0
  20. htmlgraph/analytics/strategic/preference_manager.py +709 -0
  21. htmlgraph/analytics/strategic/suggestion_engine.py +747 -0
  22. htmlgraph/analytics/work_type.py +67 -27
  23. htmlgraph/analytics_index.py +53 -20
  24. htmlgraph/api/__init__.py +3 -0
  25. htmlgraph/api/cost_alerts_websocket.py +416 -0
  26. htmlgraph/api/main.py +2498 -0
  27. htmlgraph/api/static/htmx.min.js +1 -0
  28. htmlgraph/api/static/style-redesign.css +1344 -0
  29. htmlgraph/api/static/style.css +1079 -0
  30. htmlgraph/api/templates/dashboard-redesign.html +1366 -0
  31. htmlgraph/api/templates/dashboard.html +794 -0
  32. htmlgraph/api/templates/partials/activity-feed-hierarchical.html +326 -0
  33. htmlgraph/api/templates/partials/activity-feed.html +1100 -0
  34. htmlgraph/api/templates/partials/agents-redesign.html +317 -0
  35. htmlgraph/api/templates/partials/agents.html +317 -0
  36. htmlgraph/api/templates/partials/event-traces.html +373 -0
  37. htmlgraph/api/templates/partials/features-kanban-redesign.html +509 -0
  38. htmlgraph/api/templates/partials/features.html +578 -0
  39. htmlgraph/api/templates/partials/metrics-redesign.html +346 -0
  40. htmlgraph/api/templates/partials/metrics.html +346 -0
  41. htmlgraph/api/templates/partials/orchestration-redesign.html +443 -0
  42. htmlgraph/api/templates/partials/orchestration.html +198 -0
  43. htmlgraph/api/templates/partials/spawners.html +375 -0
  44. htmlgraph/api/templates/partials/work-items.html +613 -0
  45. htmlgraph/api/websocket.py +538 -0
  46. htmlgraph/archive/__init__.py +24 -0
  47. htmlgraph/archive/bloom.py +234 -0
  48. htmlgraph/archive/fts.py +297 -0
  49. htmlgraph/archive/manager.py +583 -0
  50. htmlgraph/archive/search.py +244 -0
  51. htmlgraph/atomic_ops.py +560 -0
  52. htmlgraph/attribute_index.py +2 -1
  53. htmlgraph/bounded_paths.py +539 -0
  54. htmlgraph/builders/base.py +57 -2
  55. htmlgraph/builders/bug.py +19 -3
  56. htmlgraph/builders/chore.py +19 -3
  57. htmlgraph/builders/epic.py +19 -3
  58. htmlgraph/builders/feature.py +27 -3
  59. htmlgraph/builders/insight.py +2 -1
  60. htmlgraph/builders/metric.py +2 -1
  61. htmlgraph/builders/pattern.py +2 -1
  62. htmlgraph/builders/phase.py +19 -3
  63. htmlgraph/builders/spike.py +29 -3
  64. htmlgraph/builders/track.py +42 -1
  65. htmlgraph/cigs/__init__.py +81 -0
  66. htmlgraph/cigs/autonomy.py +385 -0
  67. htmlgraph/cigs/cost.py +475 -0
  68. htmlgraph/cigs/messages_basic.py +472 -0
  69. htmlgraph/cigs/messaging.py +365 -0
  70. htmlgraph/cigs/models.py +771 -0
  71. htmlgraph/cigs/pattern_storage.py +427 -0
  72. htmlgraph/cigs/patterns.py +503 -0
  73. htmlgraph/cigs/posttool_analyzer.py +234 -0
  74. htmlgraph/cigs/reporter.py +818 -0
  75. htmlgraph/cigs/tracker.py +317 -0
  76. htmlgraph/cli/.htmlgraph/.session-warning-state.json +6 -0
  77. htmlgraph/cli/.htmlgraph/agents.json +72 -0
  78. htmlgraph/cli/.htmlgraph/htmlgraph.db +0 -0
  79. htmlgraph/cli/__init__.py +42 -0
  80. htmlgraph/cli/__main__.py +6 -0
  81. htmlgraph/cli/analytics.py +1424 -0
  82. htmlgraph/cli/base.py +685 -0
  83. htmlgraph/cli/constants.py +206 -0
  84. htmlgraph/cli/core.py +954 -0
  85. htmlgraph/cli/main.py +147 -0
  86. htmlgraph/cli/models.py +475 -0
  87. htmlgraph/cli/templates/__init__.py +1 -0
  88. htmlgraph/cli/templates/cost_dashboard.py +399 -0
  89. htmlgraph/cli/work/__init__.py +239 -0
  90. htmlgraph/cli/work/browse.py +115 -0
  91. htmlgraph/cli/work/features.py +568 -0
  92. htmlgraph/cli/work/orchestration.py +676 -0
  93. htmlgraph/cli/work/report.py +728 -0
  94. htmlgraph/cli/work/sessions.py +466 -0
  95. htmlgraph/cli/work/snapshot.py +559 -0
  96. htmlgraph/cli/work/tracks.py +486 -0
  97. htmlgraph/cli_commands/__init__.py +1 -0
  98. htmlgraph/cli_commands/feature.py +195 -0
  99. htmlgraph/cli_framework.py +115 -0
  100. htmlgraph/collections/__init__.py +2 -0
  101. htmlgraph/collections/base.py +197 -14
  102. htmlgraph/collections/bug.py +2 -1
  103. htmlgraph/collections/chore.py +2 -1
  104. htmlgraph/collections/epic.py +2 -1
  105. htmlgraph/collections/feature.py +2 -1
  106. htmlgraph/collections/insight.py +2 -1
  107. htmlgraph/collections/metric.py +2 -1
  108. htmlgraph/collections/pattern.py +2 -1
  109. htmlgraph/collections/phase.py +2 -1
  110. htmlgraph/collections/session.py +194 -0
  111. htmlgraph/collections/spike.py +13 -2
  112. htmlgraph/collections/task_delegation.py +241 -0
  113. htmlgraph/collections/todo.py +14 -1
  114. htmlgraph/collections/traces.py +487 -0
  115. htmlgraph/config/cost_models.json +56 -0
  116. htmlgraph/config.py +190 -0
  117. htmlgraph/context_analytics.py +2 -1
  118. htmlgraph/converter.py +116 -7
  119. htmlgraph/cost_analysis/__init__.py +5 -0
  120. htmlgraph/cost_analysis/analyzer.py +438 -0
  121. htmlgraph/dashboard.html +2246 -248
  122. htmlgraph/dashboard.html.backup +6592 -0
  123. htmlgraph/dashboard.html.bak +7181 -0
  124. htmlgraph/dashboard.html.bak2 +7231 -0
  125. htmlgraph/dashboard.html.bak3 +7232 -0
  126. htmlgraph/db/__init__.py +38 -0
  127. htmlgraph/db/queries.py +790 -0
  128. htmlgraph/db/schema.py +1788 -0
  129. htmlgraph/decorators.py +317 -0
  130. htmlgraph/dependency_models.py +2 -1
  131. htmlgraph/deploy.py +26 -27
  132. htmlgraph/docs/API_REFERENCE.md +841 -0
  133. htmlgraph/docs/HTTP_API.md +750 -0
  134. htmlgraph/docs/INTEGRATION_GUIDE.md +752 -0
  135. htmlgraph/docs/ORCHESTRATION_PATTERNS.md +717 -0
  136. htmlgraph/docs/README.md +532 -0
  137. htmlgraph/docs/__init__.py +77 -0
  138. htmlgraph/docs/docs_version.py +55 -0
  139. htmlgraph/docs/metadata.py +93 -0
  140. htmlgraph/docs/migrations.py +232 -0
  141. htmlgraph/docs/template_engine.py +143 -0
  142. htmlgraph/docs/templates/_sections/cli_reference.md.j2 +52 -0
  143. htmlgraph/docs/templates/_sections/core_concepts.md.j2 +29 -0
  144. htmlgraph/docs/templates/_sections/sdk_basics.md.j2 +69 -0
  145. htmlgraph/docs/templates/base_agents.md.j2 +78 -0
  146. htmlgraph/docs/templates/example_user_override.md.j2 +47 -0
  147. htmlgraph/docs/version_check.py +163 -0
  148. htmlgraph/edge_index.py +2 -1
  149. htmlgraph/error_handler.py +544 -0
  150. htmlgraph/event_log.py +86 -37
  151. htmlgraph/event_migration.py +2 -1
  152. htmlgraph/file_watcher.py +12 -8
  153. htmlgraph/find_api.py +2 -1
  154. htmlgraph/git_events.py +67 -9
  155. htmlgraph/hooks/.htmlgraph/.session-warning-state.json +6 -0
  156. htmlgraph/hooks/.htmlgraph/agents.json +72 -0
  157. htmlgraph/hooks/.htmlgraph/index.sqlite +0 -0
  158. htmlgraph/hooks/__init__.py +8 -0
  159. htmlgraph/hooks/bootstrap.py +169 -0
  160. htmlgraph/hooks/cigs_pretool_enforcer.py +354 -0
  161. htmlgraph/hooks/concurrent_sessions.py +208 -0
  162. htmlgraph/hooks/context.py +350 -0
  163. htmlgraph/hooks/drift_handler.py +525 -0
  164. htmlgraph/hooks/event_tracker.py +790 -99
  165. htmlgraph/hooks/git_commands.py +175 -0
  166. htmlgraph/hooks/installer.py +5 -1
  167. htmlgraph/hooks/orchestrator.py +327 -76
  168. htmlgraph/hooks/orchestrator_reflector.py +31 -4
  169. htmlgraph/hooks/post_tool_use_failure.py +32 -7
  170. htmlgraph/hooks/post_tool_use_handler.py +257 -0
  171. htmlgraph/hooks/posttooluse.py +92 -19
  172. htmlgraph/hooks/pretooluse.py +527 -7
  173. htmlgraph/hooks/prompt_analyzer.py +637 -0
  174. htmlgraph/hooks/session_handler.py +668 -0
  175. htmlgraph/hooks/session_summary.py +395 -0
  176. htmlgraph/hooks/state_manager.py +504 -0
  177. htmlgraph/hooks/subagent_detection.py +202 -0
  178. htmlgraph/hooks/subagent_stop.py +369 -0
  179. htmlgraph/hooks/task_enforcer.py +99 -4
  180. htmlgraph/hooks/validator.py +212 -91
  181. htmlgraph/ids.py +2 -1
  182. htmlgraph/learning.py +125 -100
  183. htmlgraph/mcp_server.py +2 -1
  184. htmlgraph/models.py +217 -18
  185. htmlgraph/operations/README.md +62 -0
  186. htmlgraph/operations/__init__.py +79 -0
  187. htmlgraph/operations/analytics.py +339 -0
  188. htmlgraph/operations/bootstrap.py +289 -0
  189. htmlgraph/operations/events.py +244 -0
  190. htmlgraph/operations/fastapi_server.py +231 -0
  191. htmlgraph/operations/hooks.py +350 -0
  192. htmlgraph/operations/initialization.py +597 -0
  193. htmlgraph/operations/initialization.py.backup +228 -0
  194. htmlgraph/operations/server.py +303 -0
  195. htmlgraph/orchestration/__init__.py +58 -0
  196. htmlgraph/orchestration/claude_launcher.py +179 -0
  197. htmlgraph/orchestration/command_builder.py +72 -0
  198. htmlgraph/orchestration/headless_spawner.py +281 -0
  199. htmlgraph/orchestration/live_events.py +377 -0
  200. htmlgraph/orchestration/model_selection.py +327 -0
  201. htmlgraph/orchestration/plugin_manager.py +140 -0
  202. htmlgraph/orchestration/prompts.py +137 -0
  203. htmlgraph/orchestration/spawner_event_tracker.py +383 -0
  204. htmlgraph/orchestration/spawners/__init__.py +16 -0
  205. htmlgraph/orchestration/spawners/base.py +194 -0
  206. htmlgraph/orchestration/spawners/claude.py +173 -0
  207. htmlgraph/orchestration/spawners/codex.py +435 -0
  208. htmlgraph/orchestration/spawners/copilot.py +294 -0
  209. htmlgraph/orchestration/spawners/gemini.py +471 -0
  210. htmlgraph/orchestration/subprocess_runner.py +36 -0
  211. htmlgraph/{orchestration.py → orchestration/task_coordination.py} +16 -8
  212. htmlgraph/orchestration.md +563 -0
  213. htmlgraph/orchestrator-system-prompt-optimized.txt +863 -0
  214. htmlgraph/orchestrator.py +2 -1
  215. htmlgraph/orchestrator_config.py +357 -0
  216. htmlgraph/orchestrator_mode.py +115 -4
  217. htmlgraph/parallel.py +2 -1
  218. htmlgraph/parser.py +86 -6
  219. htmlgraph/path_query.py +608 -0
  220. htmlgraph/pattern_matcher.py +636 -0
  221. htmlgraph/pydantic_models.py +476 -0
  222. htmlgraph/quality_gates.py +350 -0
  223. htmlgraph/query_builder.py +2 -1
  224. htmlgraph/query_composer.py +509 -0
  225. htmlgraph/reflection.py +443 -0
  226. htmlgraph/refs.py +344 -0
  227. htmlgraph/repo_hash.py +512 -0
  228. htmlgraph/repositories/__init__.py +292 -0
  229. htmlgraph/repositories/analytics_repository.py +455 -0
  230. htmlgraph/repositories/analytics_repository_standard.py +628 -0
  231. htmlgraph/repositories/feature_repository.py +581 -0
  232. htmlgraph/repositories/feature_repository_htmlfile.py +668 -0
  233. htmlgraph/repositories/feature_repository_memory.py +607 -0
  234. htmlgraph/repositories/feature_repository_sqlite.py +858 -0
  235. htmlgraph/repositories/filter_service.py +620 -0
  236. htmlgraph/repositories/filter_service_standard.py +445 -0
  237. htmlgraph/repositories/shared_cache.py +621 -0
  238. htmlgraph/repositories/shared_cache_memory.py +395 -0
  239. htmlgraph/repositories/track_repository.py +552 -0
  240. htmlgraph/repositories/track_repository_htmlfile.py +619 -0
  241. htmlgraph/repositories/track_repository_memory.py +508 -0
  242. htmlgraph/repositories/track_repository_sqlite.py +711 -0
  243. htmlgraph/sdk/__init__.py +398 -0
  244. htmlgraph/sdk/__init__.pyi +14 -0
  245. htmlgraph/sdk/analytics/__init__.py +19 -0
  246. htmlgraph/sdk/analytics/engine.py +155 -0
  247. htmlgraph/sdk/analytics/helpers.py +178 -0
  248. htmlgraph/sdk/analytics/registry.py +109 -0
  249. htmlgraph/sdk/base.py +484 -0
  250. htmlgraph/sdk/constants.py +216 -0
  251. htmlgraph/sdk/core.pyi +308 -0
  252. htmlgraph/sdk/discovery.py +120 -0
  253. htmlgraph/sdk/help/__init__.py +12 -0
  254. htmlgraph/sdk/help/mixin.py +699 -0
  255. htmlgraph/sdk/mixins/__init__.py +15 -0
  256. htmlgraph/sdk/mixins/attribution.py +113 -0
  257. htmlgraph/sdk/mixins/mixin.py +410 -0
  258. htmlgraph/sdk/operations/__init__.py +12 -0
  259. htmlgraph/sdk/operations/mixin.py +427 -0
  260. htmlgraph/sdk/orchestration/__init__.py +17 -0
  261. htmlgraph/sdk/orchestration/coordinator.py +203 -0
  262. htmlgraph/sdk/orchestration/spawner.py +204 -0
  263. htmlgraph/sdk/planning/__init__.py +19 -0
  264. htmlgraph/sdk/planning/bottlenecks.py +93 -0
  265. htmlgraph/sdk/planning/mixin.py +211 -0
  266. htmlgraph/sdk/planning/parallel.py +186 -0
  267. htmlgraph/sdk/planning/queue.py +210 -0
  268. htmlgraph/sdk/planning/recommendations.py +87 -0
  269. htmlgraph/sdk/planning/smart_planning.py +319 -0
  270. htmlgraph/sdk/session/__init__.py +19 -0
  271. htmlgraph/sdk/session/continuity.py +57 -0
  272. htmlgraph/sdk/session/handoff.py +110 -0
  273. htmlgraph/sdk/session/info.py +309 -0
  274. htmlgraph/sdk/session/manager.py +103 -0
  275. htmlgraph/sdk/strategic/__init__.py +26 -0
  276. htmlgraph/sdk/strategic/mixin.py +563 -0
  277. htmlgraph/server.py +295 -107
  278. htmlgraph/session_hooks.py +300 -0
  279. htmlgraph/session_manager.py +285 -3
  280. htmlgraph/session_registry.py +587 -0
  281. htmlgraph/session_state.py +436 -0
  282. htmlgraph/session_warning.py +2 -1
  283. htmlgraph/sessions/__init__.py +23 -0
  284. htmlgraph/sessions/handoff.py +756 -0
  285. htmlgraph/system_prompts.py +450 -0
  286. htmlgraph/templates/orchestration-view.html +350 -0
  287. htmlgraph/track_builder.py +33 -1
  288. htmlgraph/track_manager.py +38 -0
  289. htmlgraph/transcript.py +18 -5
  290. htmlgraph/validation.py +115 -0
  291. htmlgraph/watch.py +2 -1
  292. htmlgraph/work_type_utils.py +2 -1
  293. {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/dashboard.html +2246 -248
  294. {htmlgraph-0.20.1.dist-info → htmlgraph-0.27.5.dist-info}/METADATA +95 -64
  295. htmlgraph-0.27.5.dist-info/RECORD +337 -0
  296. {htmlgraph-0.20.1.dist-info → htmlgraph-0.27.5.dist-info}/entry_points.txt +1 -1
  297. htmlgraph/cli.py +0 -4839
  298. htmlgraph/sdk.py +0 -2359
  299. htmlgraph-0.20.1.dist-info/RECORD +0 -118
  300. {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/styles.css +0 -0
  301. {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/templates/AGENTS.md.template +0 -0
  302. {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/templates/CLAUDE.md.template +0 -0
  303. {htmlgraph-0.20.1.data → htmlgraph-0.27.5.data}/data/htmlgraph/templates/GEMINI.md.template +0 -0
  304. {htmlgraph-0.20.1.dist-info → htmlgraph-0.27.5.dist-info}/WHEEL +0 -0
@@ -0,0 +1,234 @@
1
+ """
2
+ Bloom filter implementation for archive search optimization.
3
+
4
+ Uses MurmurHash3 for 22x faster hashing with hardware optimizations.
5
+ Target: 32.8% latency reduction by skipping 70-90% of archives.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import math
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
try:
    import mmh3  # type: ignore

    HAS_MMH3 = True
except ImportError:
    HAS_MMH3 = False


class BloomFilter:
    """
    Space-efficient probabilistic set used to pre-filter archives.

    A lookup never yields a false negative for an item that was added with
    the same hash backend; it may yield a false positive at roughly the
    configured rate.

    Features:
    - MurmurHash3 (``mmh3``) hashing when the package is importable,
      SHA-256 otherwise
    - Configurable false positive rate (default 0.01)
    - Bit array stored in a ``bytearray``
    - JSON persistence that records which hash backend built the filter
    """

    # Hash backend used by this instance ("mmh3" or "sha256"). The class-level
    # default follows what is importable; ``load`` overrides it per instance so
    # a filter is always queried with the backend that populated it.
    hash_algorithm: str = "mmh3" if HAS_MMH3 else "sha256"

    def __init__(
        self, expected_items: int = 1000, false_positive_rate: float = 0.01
    ) -> None:
        """
        Initialize Bloom filter.

        Args:
            expected_items: Expected number of items to add. Values below 1
                are sized as if 1 item were expected.
            false_positive_rate: Desired false positive rate (0.01 = 1%).
                Must be strictly between 0 and 1.

        Raises:
            ValueError: If false_positive_rate is not in the open interval (0, 1).
        """
        if not 0.0 < false_positive_rate < 1.0:
            raise ValueError(
                "false_positive_rate must be between 0 and 1 (exclusive), "
                f"got {false_positive_rate!r}"
            )

        self.expected_items = expected_items
        self.false_positive_rate = false_positive_rate

        # Size the bit array and pick the number of hash functions.
        self.bit_count = self._optimal_bit_count(expected_items, false_positive_rate)
        self.hash_count = self._optimal_hash_count(self.bit_count, expected_items)

        # One bit per position, rounded up to whole bytes.
        self.bit_array = bytearray((self.bit_count + 7) // 8)

        self.items_added = 0

    def _optimal_bit_count(self, n: int, p: float) -> int:
        """
        Calculate optimal bit array size.

        Formula: m = -(n * ln(p)) / (ln(2)^2)

        The result is clamped to at least 1 so that the modulo in ``_hash``
        can never divide by zero.
        """
        n = max(1, n)
        return max(1, int(-n * math.log(p) / (math.log(2) ** 2)))

    def _optimal_hash_count(self, m: int, n: int) -> int:
        """
        Calculate optimal number of hash functions.

        Formula: k = (m / n) * ln(2), clamped to at least 1.
        """
        return max(1, int((m / max(1, n)) * math.log(2)))

    def _hash(self, item: str, seed: int) -> int:
        """
        Hash item with seed using the instance's hash backend.

        Args:
            item: Item to hash
            seed: Hash seed selecting one of the ``hash_count`` hash functions

        Returns:
            Bit position in the range [0, bit_count)
        """
        if self.hash_algorithm == "mmh3":
            hash_val: int = mmh3.hash(item, seed)  # type: ignore
            # Python's % with a positive modulus is non-negative even if the
            # hash value itself is negative.
            return hash_val % self.bit_count

        # SHA-256 fallback. The "{item}{seed}" input format is kept unchanged
        # so filters persisted by earlier versions stay readable.
        digest = hashlib.sha256(f"{item}{seed}".encode()).digest()
        return int.from_bytes(digest[:4], "big") % self.bit_count

    def _set_bit(self, position: int) -> None:
        """Set bit at position to 1."""
        byte_index, bit_index = divmod(position, 8)
        self.bit_array[byte_index] |= 1 << bit_index

    def _get_bit(self, position: int) -> bool:
        """Get bit value at position."""
        byte_index, bit_index = divmod(position, 8)
        return bool(self.bit_array[byte_index] & (1 << bit_index))

    def add(self, item: str) -> None:
        """
        Add item to Bloom filter.

        Args:
            item: String to add
        """
        for seed in range(self.hash_count):
            self._set_bit(self._hash(item, seed))

        # Counts calls, not distinct items: adding the same string twice
        # increments this twice.
        self.items_added += 1

    def might_contain(self, item: str) -> bool:
        """
        Check if item might be in the set.

        Args:
            item: String to check

        Returns:
            True if item might be present (or false positive)
            False if item is definitely not present
        """
        return all(
            self._get_bit(self._hash(item, seed)) for seed in range(self.hash_count)
        )

    def build_for_archive(self, entities: list[dict[str, Any]]) -> None:
        """
        Build Bloom filter from archive entities.

        Indexes:
        - Entity IDs (as given, not lowercased)
        - Titles (lowercased, split on whitespace)
        - Description text (lowercased, split on whitespace)

        Entities with a missing or empty ``id``, ``title`` or ``description``
        simply contribute nothing for that field.

        Args:
            entities: List of entity dictionaries with id, title, description
        """
        for entity in entities:
            entity_id = entity.get("id")
            if entity_id:
                self.add(str(entity_id))

            for field in ("title", "description"):
                text = entity.get(field)
                if not text:
                    continue
                for word in str(text).lower().split():
                    self.add(word)

    def save(self, filepath: Path) -> None:
        """
        Save Bloom filter to disk as JSON.

        Args:
            filepath: Path to save .bloom file
        """
        data = {
            "expected_items": self.expected_items,
            "false_positive_rate": self.false_positive_rate,
            "bit_count": self.bit_count,
            "hash_count": self.hash_count,
            "items_added": self.items_added,
            # Recorded so load() can query with the backend that set the bits.
            "hash_algorithm": self.hash_algorithm,
            "bit_array": list(self.bit_array),  # Convert bytearray to list for JSON
        }

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, filepath: Path) -> "BloomFilter":
        """
        Load Bloom filter from disk.

        Files written before ``hash_algorithm`` was recorded are loaded with
        the backend that is currently the default.

        Args:
            filepath: Path to .bloom file

        Returns:
            Loaded BloomFilter instance

        Raises:
            ValueError: If the stored bit array is too short for the stored
                bit count, or the file names a hash backend that is unknown
                or not importable here.
        """
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        bit_count = data["bit_count"]
        bit_array = bytearray(data["bit_array"])
        if bit_count < 1 or len(bit_array) < (bit_count + 7) // 8:
            raise ValueError(
                f"Corrupt Bloom filter file {filepath}: bit_array holds "
                f"{len(bit_array)} bytes but bit_count is {bit_count}"
            )

        algorithm = data.get("hash_algorithm")
        if algorithm is not None:
            if algorithm not in ("mmh3", "sha256"):
                raise ValueError(
                    f"Bloom filter file {filepath} uses unknown hash "
                    f"algorithm {algorithm!r}"
                )
            if algorithm == "mmh3" and not HAS_MMH3:
                # Querying with a different hash would give false negatives.
                raise ValueError(
                    f"Bloom filter file {filepath} was built with mmh3, "
                    "which is not installed; rebuild the filter"
                )

        # Create instance with saved parameters
        bloom = cls(
            expected_items=data["expected_items"],
            false_positive_rate=data["false_positive_rate"],
        )

        # Restore state
        bloom.bit_count = bit_count
        bloom.hash_count = data["hash_count"]
        bloom.items_added = data["items_added"]
        bloom.bit_array = bit_array
        if algorithm is not None:
            bloom.hash_algorithm = algorithm

        return bloom

    def get_stats(self) -> dict[str, Any]:
        """
        Get Bloom filter statistics.

        Returns:
            Dictionary with stats (size, items, FPR, etc.)
        """
        # Estimated false positive rate: (1 - e^(-k*n/m))^k
        if self.items_added > 0:
            fill = 1 - math.exp(-self.hash_count * self.items_added / self.bit_count)
            actual_fpr = fill**self.hash_count
        else:
            actual_fpr = 0

        return {
            "expected_items": self.expected_items,
            "items_added": self.items_added,
            "bit_count": self.bit_count,
            "hash_count": self.hash_count,
            "bytes_used": len(self.bit_array),
            "target_fpr": self.false_positive_rate,
            "actual_fpr": actual_fpr,
            "utilization": self.items_added / self.expected_items
            if self.expected_items > 0
            else 0,
            "using_mmh3": self.hash_algorithm == "mmh3",
        }
@@ -0,0 +1,297 @@
1
+ """
2
+ SQLite FTS5 full-text search index for archive content.
3
+
4
+ Uses BM25 ranking for relevance scoring with O(log n) search performance.
5
+ Provides snippet extraction with highlighting for matched terms.
6
+ """
7
+
8
+ import sqlite3
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
+ class ArchiveFTS5Index:
14
+ """
15
+ Full-text search index using SQLite FTS5.
16
+
17
+ Features:
18
+ - Porter stemming for better matching
19
+ - Unicode61 tokenization for international text
20
+ - BM25 ranking for relevance scoring
21
+ - Snippet extraction with highlighting
22
+ - Metadata table for quick lookups
23
+ """
24
+
25
+ def __init__(self, db_path: Path) -> None:
26
+ """
27
+ Initialize FTS5 index.
28
+
29
+ Args:
30
+ db_path: Path to SQLite database file
31
+ """
32
+ self.db_path = db_path
33
+ self.conn: sqlite3.Connection | None = None
34
+ self._ensure_schema()
35
+
36
+ def _ensure_schema(self) -> None:
37
+ """Create FTS5 tables if they don't exist."""
38
+ conn = self._get_connection()
39
+
40
+ # Create FTS5 virtual table with porter stemming
41
+ conn.execute(
42
+ """
43
+ CREATE VIRTUAL TABLE IF NOT EXISTS archive_fts USING fts5(
44
+ entity_id UNINDEXED,
45
+ title,
46
+ description,
47
+ content,
48
+ tokenize='porter unicode61'
49
+ )
50
+ """
51
+ )
52
+
53
+ # Create metadata table for quick lookups
54
+ conn.execute(
55
+ """
56
+ CREATE TABLE IF NOT EXISTS archive_metadata (
57
+ entity_id TEXT PRIMARY KEY,
58
+ archive_file TEXT NOT NULL,
59
+ entity_type TEXT,
60
+ status TEXT,
61
+ created TEXT,
62
+ updated TEXT
63
+ )
64
+ """
65
+ )
66
+
67
+ # Create index on archive_file for filtering
68
+ conn.execute(
69
+ """
70
+ CREATE INDEX IF NOT EXISTS idx_archive_file
71
+ ON archive_metadata(archive_file)
72
+ """
73
+ )
74
+
75
+ conn.commit()
76
+
77
+ def _get_connection(self) -> sqlite3.Connection:
78
+ """Get database connection (create if needed)."""
79
+ if self.conn is None:
80
+ self.conn = sqlite3.connect(str(self.db_path))
81
+ self.conn.row_factory = sqlite3.Row # Enable dict-like access
82
+ return self.conn
83
+
84
+ def index_archive(self, archive_file: str, entities: list[dict[str, Any]]) -> None:
85
+ """
86
+ Index entities from an archive file.
87
+
88
+ Args:
89
+ archive_file: Name of archive file (e.g., '2024-Q4-completed.html')
90
+ entities: List of entity dictionaries
91
+ """
92
+ conn = self._get_connection()
93
+
94
+ for entity in entities:
95
+ entity_id = entity.get("id", "")
96
+ title = entity.get("title", "")
97
+ description = entity.get("description", "")
98
+ content = entity.get("content", "")
99
+
100
+ # Insert into FTS5 table
101
+ conn.execute(
102
+ """
103
+ INSERT INTO archive_fts (entity_id, title, description, content)
104
+ VALUES (?, ?, ?, ?)
105
+ """,
106
+ (entity_id, title, description, content),
107
+ )
108
+
109
+ # Insert into metadata table
110
+ conn.execute(
111
+ """
112
+ INSERT OR REPLACE INTO archive_metadata
113
+ (entity_id, archive_file, entity_type, status, created, updated)
114
+ VALUES (?, ?, ?, ?, ?, ?)
115
+ """,
116
+ (
117
+ entity_id,
118
+ archive_file,
119
+ entity.get("type", ""),
120
+ entity.get("status", ""),
121
+ entity.get("created", ""),
122
+ entity.get("updated", ""),
123
+ ),
124
+ )
125
+
126
+ conn.commit()
127
+
128
+ def search(
129
+ self,
130
+ query: str,
131
+ limit: int = 10,
132
+ archive_files: list[str] | None = None,
133
+ ) -> list[dict[str, Any]]:
134
+ """
135
+ Search indexed archives with BM25 ranking.
136
+
137
+ Args:
138
+ query: Search query
139
+ limit: Maximum number of results
140
+ archive_files: Optional list of archive files to search
141
+
142
+ Returns:
143
+ List of results with entity_id, title, rank, snippet, archive_file
144
+ """
145
+ conn = self._get_connection()
146
+
147
+ # Build query with optional archive file filter
148
+ if archive_files:
149
+ placeholders = ",".join("?" * len(archive_files))
150
+ sql = f"""
151
+ SELECT
152
+ fts.entity_id,
153
+ meta.archive_file,
154
+ meta.entity_type,
155
+ meta.status,
156
+ snippet(archive_fts, 1, '<mark>', '</mark>', '...', 32) as title_snippet,
157
+ snippet(archive_fts, 2, '<mark>', '</mark>', '...', 64) as description_snippet,
158
+ bm25(archive_fts) as rank
159
+ FROM archive_fts fts
160
+ JOIN archive_metadata meta ON fts.entity_id = meta.entity_id
161
+ WHERE archive_fts MATCH ?
162
+ AND meta.archive_file IN ({placeholders})
163
+ ORDER BY rank
164
+ LIMIT ?
165
+ """
166
+ params = [query] + archive_files + [limit]
167
+ else:
168
+ sql = """
169
+ SELECT
170
+ fts.entity_id,
171
+ meta.archive_file,
172
+ meta.entity_type,
173
+ meta.status,
174
+ snippet(archive_fts, 1, '<mark>', '</mark>', '...', 32) as title_snippet,
175
+ snippet(archive_fts, 2, '<mark>', '</mark>', '...', 64) as description_snippet,
176
+ bm25(archive_fts) as rank
177
+ FROM archive_fts fts
178
+ JOIN archive_metadata meta ON fts.entity_id = meta.entity_id
179
+ WHERE archive_fts MATCH ?
180
+ ORDER BY rank
181
+ LIMIT ?
182
+ """
183
+ params = [query, limit]
184
+
185
+ cursor = conn.execute(sql, params)
186
+
187
+ results = []
188
+ for row in cursor:
189
+ results.append(
190
+ {
191
+ "entity_id": row["entity_id"],
192
+ "archive_file": row["archive_file"],
193
+ "entity_type": row["entity_type"],
194
+ "status": row["status"],
195
+ "title_snippet": row["title_snippet"],
196
+ "description_snippet": row["description_snippet"],
197
+ "rank": row["rank"],
198
+ }
199
+ )
200
+
201
+ return results
202
+
203
+ def get_entity_metadata(self, entity_id: str) -> dict[str, Any] | None:
204
+ """
205
+ Get metadata for an entity.
206
+
207
+ Args:
208
+ entity_id: Entity identifier
209
+
210
+ Returns:
211
+ Metadata dictionary or None if not found
212
+ """
213
+ conn = self._get_connection()
214
+
215
+ cursor = conn.execute(
216
+ """
217
+ SELECT entity_id, archive_file, entity_type, status, created, updated
218
+ FROM archive_metadata
219
+ WHERE entity_id = ?
220
+ """,
221
+ (entity_id,),
222
+ )
223
+
224
+ row = cursor.fetchone()
225
+ if row:
226
+ return dict(row)
227
+ return None
228
+
229
+ def remove_archive(self, archive_file: str) -> None:
230
+ """
231
+ Remove all entities from a specific archive file.
232
+
233
+ Args:
234
+ archive_file: Archive file to remove
235
+ """
236
+ conn = self._get_connection()
237
+
238
+ # Get entity IDs to remove
239
+ cursor = conn.execute(
240
+ "SELECT entity_id FROM archive_metadata WHERE archive_file = ?",
241
+ (archive_file,),
242
+ )
243
+ entity_ids = [row["entity_id"] for row in cursor]
244
+
245
+ # Remove from FTS5
246
+ for entity_id in entity_ids:
247
+ conn.execute("DELETE FROM archive_fts WHERE entity_id = ?", (entity_id,))
248
+
249
+ # Remove from metadata
250
+ conn.execute(
251
+ "DELETE FROM archive_metadata WHERE archive_file = ?", (archive_file,)
252
+ )
253
+
254
+ conn.commit()
255
+
256
+ def get_stats(self) -> dict[str, Any]:
257
+ """
258
+ Get index statistics.
259
+
260
+ Returns:
261
+ Dictionary with entity count, archive count, etc.
262
+ """
263
+ conn = self._get_connection()
264
+
265
+ # Count entities
266
+ cursor = conn.execute("SELECT COUNT(*) as count FROM archive_metadata")
267
+ entity_count = cursor.fetchone()["count"]
268
+
269
+ # Count archives
270
+ cursor = conn.execute(
271
+ "SELECT COUNT(DISTINCT archive_file) as count FROM archive_metadata"
272
+ )
273
+ archive_count = cursor.fetchone()["count"]
274
+
275
+ # Get database size
276
+ db_size = self.db_path.stat().st_size if self.db_path.exists() else 0
277
+
278
+ return {
279
+ "entity_count": entity_count,
280
+ "archive_count": archive_count,
281
+ "db_size_bytes": db_size,
282
+ "db_size_mb": db_size / (1024 * 1024),
283
+ }
284
+
285
+ def close(self) -> None:
286
+ """Close database connection."""
287
+ if self.conn:
288
+ self.conn.close()
289
+ self.conn = None
290
+
291
+ def __enter__(self) -> "ArchiveFTS5Index":
292
+ """Context manager entry."""
293
+ return self
294
+
295
+ def __exit__(self, *args: Any) -> None:
296
+ """Context manager exit."""
297
+ self.close()