htmlgraph 0.20.9__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff shows the changes between the two publicly released package versions as they appear in the public registry.
@@ -0,0 +1,244 @@
+ """
+ Archive search engine with three-tier optimization.
+
+ Tier 1: Bloom filters (skip 70-90% of archives)
+ Tier 2: SQLite FTS5 with BM25 ranking (O(log n) search)
+ Tier 3: Snippet extraction and highlighting
+
+ Target: 67x faster than naive multi-file search.
+ """
+
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ from htmlgraph.archive.bloom import BloomFilter
+ from htmlgraph.archive.fts import ArchiveFTS5Index
+
+
+ @dataclass
+ class SearchResult:
+     """
+     Search result from archive search.
+
+     Attributes:
+         entity_id: Entity identifier
+         archive_file: Archive file containing the entity
+         entity_type: Type of entity (feature, bug, etc.)
+         status: Entity status
+         title_snippet: Title with highlighted matches
+         description_snippet: Description with highlighted matches
+         rank: BM25 relevance score (lower is better)
+     """
+
+     entity_id: str
+     archive_file: str
+     entity_type: str
+     status: str
+     title_snippet: str
+     description_snippet: str
+     rank: float
+
+
+ class ArchiveSearchEngine:
+     """
+     Orchestrates three-tier archive search.
+
+     Workflow:
+         1. Check Bloom filters to skip irrelevant archives (70-90% filtered)
+         2. Search remaining archives with FTS5 + BM25 ranking
+         3. Extract and highlight snippets for top results
+     """
+
+     def __init__(self, archive_dir: Path, index_dir: Path) -> None:
+         """
+         Initialize the search engine.
+
+         Args:
+             archive_dir: Directory containing archive HTML files
+             index_dir: Directory for Bloom filters and the FTS5 index
+         """
+         self.archive_dir = archive_dir
+         self.index_dir = index_dir
+         self.index_dir.mkdir(parents=True, exist_ok=True)
+
+         # Initialize FTS5 index
+         self.fts_index = ArchiveFTS5Index(index_dir / "archives.db")
+
+         # Cache for Bloom filters (avoid reloading)
+         self._bloom_cache: dict[str, BloomFilter] = {}
+
+     def _get_bloom_filter(self, archive_file: str) -> BloomFilter | None:
+         """
+         Get the Bloom filter for an archive (with caching).
+
+         Args:
+             archive_file: Archive filename
+
+         Returns:
+             BloomFilter, or None if the archive is not indexed
+         """
+         if archive_file in self._bloom_cache:
+             return self._bloom_cache[archive_file]
+
+         bloom_path = self.index_dir / f"{archive_file}.bloom"
+         if bloom_path.exists():
+             bloom = BloomFilter.load(bloom_path)
+             self._bloom_cache[archive_file] = bloom
+             return bloom
+
+         return None
+
+     def _filter_archives_with_bloom(
+         self, query: str, archive_files: list[str]
+     ) -> list[str]:
+         """
+         Filter archive files using Bloom filters.
+
+         Args:
+             query: Search query
+             archive_files: List of all archive files
+
+         Returns:
+             Filtered list of archives that might contain the query
+         """
+         # Tokenize query into words
+         query_tokens = query.lower().split()
+
+         candidates = []
+
+         for archive_file in archive_files:
+             bloom = self._get_bloom_filter(archive_file)
+
+             if bloom is None:
+                 # No Bloom filter - include the archive
+                 candidates.append(archive_file)
+                 continue
+
+             # Check whether any query token might be in the archive
+             might_match = any(bloom.might_contain(token) for token in query_tokens)
+
+             if might_match:
+                 candidates.append(archive_file)
+
+         return candidates
+
+     @lru_cache(maxsize=100)
+     def _cached_search(self, query: str, limit: int) -> tuple[SearchResult, ...]:
+         """
+         Cached search implementation.
+
+         Args:
+             query: Search query
+             limit: Maximum results
+
+         Returns:
+             Tuple of SearchResult (immutable, so results can be cached)
+         """
+         # Get all archive files
+         archive_files = [f.name for f in self.archive_dir.glob("*.html")]
+
+         if not archive_files:
+             return tuple()
+
+         # Tier 1: Filter with Bloom filters
+         candidate_archives = self._filter_archives_with_bloom(query, archive_files)
+
+         if not candidate_archives:
+             return tuple()
+
+         # Tiers 2 & 3: Search with FTS5 (includes snippet highlighting)
+         results = self.fts_index.search(
+             query, limit=limit, archive_files=candidate_archives
+         )
+
+         # Convert to SearchResult objects
+         search_results = [
+             SearchResult(
+                 entity_id=r["entity_id"],
+                 archive_file=r["archive_file"],
+                 entity_type=r["entity_type"],
+                 status=r["status"],
+                 title_snippet=r["title_snippet"],
+                 description_snippet=r["description_snippet"],
+                 rank=r["rank"],
+             )
+             for r in results
+         ]
+
+         return tuple(search_results)
+
+     def search(
+         self, query: str, include_archived: bool = True, limit: int = 10
+     ) -> list[SearchResult]:
+         """
+         Search archives with three-tier optimization.
+
+         Args:
+             query: Search query
+             include_archived: Whether to search archives (future: also search active)
+             limit: Maximum results to return
+
+         Returns:
+             List of SearchResult objects sorted by relevance
+         """
+         if not include_archived:
+             return []
+
+         # Use the cached search
+         results = self._cached_search(query, limit)
+         return list(results)
+
+     def get_search_stats(self, query: str) -> dict[str, Any]:
+         """
+         Get statistics about a search query.
+
+         Args:
+             query: Search query
+
+         Returns:
+             Dictionary with total_archives, bloom_filtered, searched_count,
+             and filter_rate
+         """
+         archive_files = [f.name for f in self.archive_dir.glob("*.html")]
+         total_archives = len(archive_files)
+
+         candidate_archives = self._filter_archives_with_bloom(query, archive_files)
+         searched_count = len(candidate_archives)
+
+         bloom_filtered = total_archives - searched_count
+
+         return {
+             "total_archives": total_archives,
+             "bloom_filtered": bloom_filtered,
+             "searched_count": searched_count,
+             "filter_rate": bloom_filtered / total_archives if total_archives > 0 else 0,
+         }
+
+     def rebuild_bloom_filters(self) -> None:
+         """
+         Rebuild all Bloom filters from scratch.
+
+         Useful after archiving new entities.
+         """
+         # This will be implemented by ArchiveManager when creating archives.
+         # For now, this is a placeholder.
+         pass
+
+     def clear_cache(self) -> None:
+         """Clear the search cache."""
+         self._cached_search.cache_clear()
+         self._bloom_cache.clear()
+
+     def close(self) -> None:
+         """Close all resources."""
+         self.fts_index.close()
+         self._bloom_cache.clear()
+
+     def __enter__(self) -> "ArchiveSearchEngine":
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, *args: Any) -> None:
+         """Context manager exit."""
+         self.close()
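For orientation, a minimal usage sketch of the new engine. The import path (`htmlgraph.archive.search`) and the on-disk layout are assumptions — the diff names the class and its constructor arguments but not the file it lands in, and the CLI below reaches it through `ArchiveManager` rather than directly:

```python
from pathlib import Path

# Assumed import path; the diff does not show the new module's filename.
from htmlgraph.archive.search import ArchiveSearchEngine

graph_dir = Path(".htmlgraph")  # illustrative; the CLI derives this from --graph-dir

# The engine is a context manager, so the FTS5 connection is closed on exit.
with ArchiveSearchEngine(graph_dir / "archives", graph_dir / "indexes") as engine:
    for hit in engine.search("bloom filter", limit=5):
        print(f"{hit.entity_id}  rank={hit.rank:.2f}  {hit.title_snippet}")

    # How much work Tier 1 saved for this query:
    stats = engine.get_search_stats("bloom filter")
    print(
        f"Bloom filters skipped {stats['bloom_filtered']}/{stats['total_archives']} "
        f"archives ({stats['filter_rate']:.0%})"
    )
```

One design note: `_cached_search` is wrapped in `functools.lru_cache` on an instance method, so the cache holds a strong reference to the engine (and its Bloom filters) until `clear_cache()` is called or the process exits.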
htmlgraph/cli.py CHANGED
@@ -4326,6 +4326,87 @@ For more help: https://github.com/Shakes-tzd/htmlgraph
      "--format", "-f", choices=["text", "json"], default="text", help="Output format"
  )

+ # =========================================================================
+ # Archive Management
+ # =========================================================================
+
+ # archive (with subcommands)
+ archive_parser = subparsers.add_parser(
+     "archive", help="Archive management with optimized search"
+ )
+ archive_subparsers = archive_parser.add_subparsers(
+     dest="archive_command", help="Archive command"
+ )
+
+ # archive create
+ archive_create = archive_subparsers.add_parser(
+     "create", help="Create archive from old entities"
+ )
+ archive_create.add_argument(
+     "--older-than",
+     type=int,
+     default=90,
+     help="Archive entities older than N days (default: 90)",
+ )
+ archive_create.add_argument(
+     "--period",
+     choices=["quarter", "month", "year"],
+     default="quarter",
+     help="Archive grouping period (default: quarter)",
+ )
+ archive_create.add_argument(
+     "--dry-run",
+     action="store_true",
+     help="Preview what would be archived without making changes",
+ )
+ archive_create.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+
+ # archive search
+ archive_search = archive_subparsers.add_parser(
+     "search", help="Search archived entities"
+ )
+ archive_search.add_argument("query", help="Search query")
+ archive_search.add_argument(
+     "--limit", "-l", type=int, default=10, help="Maximum results (default: 10)"
+ )
+ archive_search.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+ archive_search.add_argument(
+     "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+ )
+
+ # archive stats
+ archive_stats = archive_subparsers.add_parser(
+     "stats", help="Show archive statistics"
+ )
+ archive_stats.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+ archive_stats.add_argument(
+     "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+ )
+
+ # archive restore
+ archive_restore = archive_subparsers.add_parser(
+     "restore", help="Restore archived entity"
+ )
+ archive_restore.add_argument("entity_id", help="Entity ID to restore")
+ archive_restore.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+
+ # archive list
+ archive_list = archive_subparsers.add_parser("list", help="List all archive files")
+ archive_list.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+ archive_list.add_argument(
+     "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+ )
+
  # =========================================================================
  # Analytics
  # =========================================================================
@@ -4700,6 +4781,21 @@ For more help: https://github.com/Shakes-tzd/htmlgraph
  else:
      track_parser.print_help()
      sys.exit(1)
+ elif args.command == "archive":
+     # Archive management
+     if args.archive_command == "create":
+         cmd_archive_create(args)
+     elif args.archive_command == "search":
+         cmd_archive_search(args)
+     elif args.archive_command == "stats":
+         cmd_archive_stats(args)
+     elif args.archive_command == "restore":
+         cmd_archive_restore(args)
+     elif args.archive_command == "list":
+         cmd_archive_list(args)
+     else:
+         archive_parser.print_help()
+         sys.exit(1)
  elif args.command == "work":
      # Work management with smart routing
      if args.work_command == "next":
@@ -4971,5 +5067,180 @@ def cmd_sync_docs(args: argparse.Namespace) -> int:
      return 1 if any("⚠️" in c or "❌" in c for c in changes) else 0


+ # =============================================================================
+ # Archive Management Commands
+ # =============================================================================
+
+
+ def cmd_archive_create(args: argparse.Namespace) -> None:
+     """Create archive from old entities."""
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Run the archive operation
+     result = manager.archive_entities(
+         older_than_days=args.older_than,
+         period=args.period,
+         dry_run=args.dry_run,
+     )
+
+     if result["dry_run"]:
+         print("\n🔍 DRY RUN - Preview (no changes made)\n")
+         print(f"Would archive: {result['would_archive']} entities")
+         print(f"Archive files: {len(result['archive_files'])}")
+         print("\nDetails:")
+         for archive_key, count in result["details"].items():
+             print(f"  {archive_key}: {count} entities")
+     else:
+         print(f"\n✅ Archived {result['archived_count']} entities")
+         print(f"Created {len(result['archive_files'])} archive file(s):")
+         for archive_file in result["archive_files"]:
+             count = result["details"].get(archive_file.replace(".html", ""), 0)
+             print(f"  - {archive_file} ({count} entities)")
+
+     manager.close()
+
+
+ def cmd_archive_search(args: argparse.Namespace) -> None:
+     """Search archived entities."""
+     import json
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Search archives
+     results = manager.search(args.query, limit=args.limit)
+
+     if args.format == "json":
+         print(json.dumps({"query": args.query, "results": results}, indent=2))
+     else:
+         print(f"\n🔍 Search results for: '{args.query}'\n")
+         print(f"Found {len(results)} result(s):\n")
+
+         for i, result in enumerate(results, 1):
+             print(f"{i}. {result['entity_id']} ({result['entity_type']})")
+             print(f"   Archive: {result['archive_file']}")
+             print(f"   Status: {result['status']}")
+             print(f"   Title: {result['title_snippet']}")
+             if result["description_snippet"]:
+                 print(f"   Description: {result['description_snippet']}")
+             print(f"   Relevance: {result['rank']:.2f}")
+             print()
+
+     manager.close()
+
+
+ def cmd_archive_stats(args: argparse.Namespace) -> None:
+     """Show archive statistics."""
+     import json
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Get statistics
+     stats = manager.get_archive_stats()
+
+     if args.format == "json":
+         print(json.dumps(stats, indent=2))
+     else:
+         print("\n📊 Archive Statistics\n")
+         print(f"Archive files: {stats['archive_count']}")
+         print(f"Archived entities: {stats['entity_count']}")
+         print(f"Total size: {stats['total_size_mb']:.2f} MB")
+         print(f"FTS5 index: {stats['fts_size_mb']:.2f} MB")
+         print(
+             f"Bloom filters: {stats['bloom_size_kb']:.2f} KB ({stats['bloom_count']} files)"
+         )
+
+     manager.close()
+
+
+ def cmd_archive_restore(args: argparse.Namespace) -> None:
+     """Restore an archived entity."""
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Restore the entity
+     success = manager.unarchive(args.entity_id)
+
+     if success:
+         print(f"✅ Restored {args.entity_id} from archive")
+     else:
+         print(f"❌ Entity not found in archives: {args.entity_id}", file=sys.stderr)
+         sys.exit(1)
+
+     manager.close()
+
+
+ def cmd_archive_list(args: argparse.Namespace) -> None:
+     """List all archive files."""
+     import json
+     from pathlib import Path
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     archive_dir = htmlgraph_dir / "archives"
+
+     if not archive_dir.exists():
+         print("No archives found")
+         return
+
+     archive_files = sorted(archive_dir.glob("*.html"))
+
+     if args.format == "json":
+         file_list = [
+             {
+                 "filename": f.name,
+                 "size_kb": f.stat().st_size / 1024,
+                 "modified": f.stat().st_mtime,
+             }
+             for f in archive_files
+         ]
+         print(json.dumps({"archives": file_list}, indent=2))
+     else:
+         print(f"\n📦 Archive Files ({len(archive_files)})\n")
+         for f in archive_files:
+             size_kb = f.stat().st_size / 1024
+             print(f"  - {f.name} ({size_kb:.1f} KB)")
+
+
  if __name__ == "__main__":
      main()
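The diff does not include `htmlgraph/archive/fts.py`, but Tiers 2 and 3 map directly onto SQLite's FTS5 extension, which CPython's bundled SQLite typically ships with. A self-contained sketch of the kind of query `ArchiveFTS5Index.search` presumably runs — the table name, columns, and highlight markers are assumptions, not the package's actual schema:

```python
import sqlite3

# In-memory stand-in for archives.db; the real schema is an assumption.
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE VIRTUAL TABLE entities USING fts5(entity_id, title, description)"
)
conn.execute(
    "INSERT INTO entities VALUES ('feat-001', 'Add bloom filters', "
    "'Skip archives that cannot contain the query')"
)

# bm25() returns a negative score where lower means more relevant, so
# ORDER BY rank ascending puts the best matches first (Tier 2), which is
# why SearchResult documents rank as "lower is better".
# snippet() extracts a highlighted fragment around the match (Tier 3).
rows = conn.execute(
    """
    SELECT entity_id,
           snippet(entities, 1, '<mark>', '</mark>', '...', 8) AS rank_snippet,
           bm25(entities) AS rank
    FROM entities
    WHERE entities MATCH ?
    ORDER BY rank
    LIMIT 10
    """,
    ("bloom",),
).fetchall()

for entity_id, title_snippet, rank in rows:
    print(entity_id, rank, title_snippet)
```

From the command line the same path is exercised by the new subcommand, presumably invoked as `htmlgraph archive search "bloom" --format json` (the console-script name is not shown in this diff).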
htmlgraph/converter.py CHANGED
@@ -548,6 +548,45 @@ def html_to_session(filepath: Path | str) -> Session:
      # Activity log in HTML is reversed (newest first), so reverse back
      data["activity_log"] = list(reversed(activity_log))

+     # Parse detected patterns from table (if present)
+     detected_patterns = []
+     for tr in parser.query("section[data-detected-patterns] table tbody tr"):
+         # Extract pattern data from the table row
+         pattern_type = tr.attrs.get("data-pattern-type", "neutral")
+
+         # Extract the sequence from the first <td class="sequence">
+         seq_td = tr.query_one("td.sequence")
+         sequence_str = seq_td.to_text().strip() if seq_td else ""
+         sequence = [s.strip() for s in sequence_str.split("→")] if sequence_str else []
+
+         # Extract the count from the third <td>
+         count_td = tr.query_all("td")[2] if len(tr.query_all("td")) > 2 else None
+         count_str = count_td.to_text().strip() if count_td else "0"
+         try:
+             count = int(count_str)
+         except (ValueError, TypeError):
+             count = 0
+
+         # Extract timestamps from the fourth <td>
+         time_td = tr.query_all("td")[3] if len(tr.query_all("td")) > 3 else None
+         time_str = time_td.to_text().strip() if time_td else ""
+         times = time_str.split(" / ")
+         first_detected = times[0].strip() if len(times) > 0 else ""
+         last_detected = times[1].strip() if len(times) > 1 else ""
+
+         if sequence:  # Only add if we have a valid sequence
+             detected_patterns.append(
+                 {
+                     "sequence": sequence,
+                     "pattern_type": pattern_type,
+                     "detection_count": count,
+                     "first_detected": first_detected,
+                     "last_detected": last_detected,
+                 }
+             )
+
+     data["detected_patterns"] = detected_patterns
+
      return Session(**data)

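The selectors above imply a specific table shape: a `section[data-detected-patterns]` wrapper, a `data-pattern-type` attribute on each `tr`, a `td.sequence` cell joined with "→", a count cell, and a "first / last" timestamp cell. A small sketch of the splitting rules applied to one such row — the sample values are invented, and the cells are shown as plain strings rather than parsed HTML:

```python
# Hypothetical row contents, reverse-engineered from the selectors above.
row_cells = {
    "data-pattern-type": "productive",               # <tr data-pattern-type="...">
    "sequence": "edit → test → commit",              # <td class="sequence">
    "count": "7",                                    # third <td>
    "times": "2024-01-05T10:00 / 2024-03-02T16:30",  # fourth <td>, "first / last"
}

# The same splitting rules the converter applies:
sequence = [s.strip() for s in row_cells["sequence"].split("→")]
first_detected, last_detected = (t.strip() for t in row_cells["times"].split(" / "))

pattern = {
    "sequence": sequence,  # ['edit', 'test', 'commit']
    "pattern_type": row_cells["data-pattern-type"],
    "detection_count": int(row_cells["count"]),
    "first_detected": first_detected,
    "last_detected": last_detected,
}
print(pattern)
```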