htmlgraph 0.20.9__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- htmlgraph/__init__.py +1 -1
- htmlgraph/analytics/__init__.py +3 -1
- htmlgraph/analytics/cross_session.py +612 -0
- htmlgraph/archive/__init__.py +24 -0
- htmlgraph/archive/bloom.py +234 -0
- htmlgraph/archive/fts.py +297 -0
- htmlgraph/archive/manager.py +583 -0
- htmlgraph/archive/search.py +244 -0
- htmlgraph/cli.py +271 -0
- htmlgraph/converter.py +39 -0
- htmlgraph/learning.py +121 -97
- htmlgraph/models.py +53 -1
- htmlgraph/sdk.py +4 -1
- {htmlgraph-0.20.9.dist-info → htmlgraph-0.21.0.dist-info}/METADATA +1 -1
- {htmlgraph-0.20.9.dist-info → htmlgraph-0.21.0.dist-info}/RECORD +22 -16
- {htmlgraph-0.20.9.data → htmlgraph-0.21.0.data}/data/htmlgraph/dashboard.html +0 -0
- {htmlgraph-0.20.9.data → htmlgraph-0.21.0.data}/data/htmlgraph/styles.css +0 -0
- {htmlgraph-0.20.9.data → htmlgraph-0.21.0.data}/data/htmlgraph/templates/AGENTS.md.template +0 -0
- {htmlgraph-0.20.9.data → htmlgraph-0.21.0.data}/data/htmlgraph/templates/CLAUDE.md.template +0 -0
- {htmlgraph-0.20.9.data → htmlgraph-0.21.0.data}/data/htmlgraph/templates/GEMINI.md.template +0 -0
- {htmlgraph-0.20.9.dist-info → htmlgraph-0.21.0.dist-info}/WHEEL +0 -0
- {htmlgraph-0.20.9.dist-info → htmlgraph-0.21.0.dist-info}/entry_points.txt +0 -0
htmlgraph/archive/search.py
ADDED
@@ -0,0 +1,244 @@
+"""
+Archive search engine with three-tier optimization.
+
+Tier 1: Bloom filters (skip 70-90% of archives)
+Tier 2: SQLite FTS5 with BM25 ranking (O(log n) search)
+Tier 3: Snippet extraction and highlighting
+
+Target: 67x faster than naive multi-file search.
+"""
+
+from dataclasses import dataclass
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+from htmlgraph.archive.bloom import BloomFilter
+from htmlgraph.archive.fts import ArchiveFTS5Index
+
+
+@dataclass
+class SearchResult:
+    """
+    Search result from archive search.
+
+    Attributes:
+        entity_id: Entity identifier
+        archive_file: Archive file containing entity
+        entity_type: Type of entity (feature, bug, etc.)
+        status: Entity status
+        title_snippet: Title with highlighted matches
+        description_snippet: Description with highlighted matches
+        rank: BM25 relevance score (lower is better)
+    """
+
+    entity_id: str
+    archive_file: str
+    entity_type: str
+    status: str
+    title_snippet: str
+    description_snippet: str
+    rank: float
+
+
+class ArchiveSearchEngine:
+    """
+    Orchestrates three-tier archive search.
+
+    Workflow:
+    1. Check Bloom filters to skip irrelevant archives (70-90% filtered)
+    2. Search remaining archives with FTS5 + BM25 ranking
+    3. Extract and highlight snippets for top results
+    """
+
+    def __init__(self, archive_dir: Path, index_dir: Path) -> None:
+        """
+        Initialize search engine.
+
+        Args:
+            archive_dir: Directory containing archive HTML files
+            index_dir: Directory for Bloom filters and FTS5 index
+        """
+        self.archive_dir = archive_dir
+        self.index_dir = index_dir
+        self.index_dir.mkdir(parents=True, exist_ok=True)
+
+        # Initialize FTS5 index
+        self.fts_index = ArchiveFTS5Index(index_dir / "archives.db")
+
+        # Cache for Bloom filters (avoid reloading)
+        self._bloom_cache: dict[str, BloomFilter] = {}
+
+    def _get_bloom_filter(self, archive_file: str) -> BloomFilter | None:
+        """
+        Get Bloom filter for archive (with caching).
+
+        Args:
+            archive_file: Archive filename
+
+        Returns:
+            BloomFilter or None if not indexed
+        """
+        if archive_file in self._bloom_cache:
+            return self._bloom_cache[archive_file]
+
+        bloom_path = self.index_dir / f"{archive_file}.bloom"
+        if bloom_path.exists():
+            bloom = BloomFilter.load(bloom_path)
+            self._bloom_cache[archive_file] = bloom
+            return bloom
+
+        return None
+
+    def _filter_archives_with_bloom(
+        self, query: str, archive_files: list[str]
+    ) -> list[str]:
+        """
+        Filter archive files using Bloom filters.
+
+        Args:
+            query: Search query
+            archive_files: List of all archive files
+
+        Returns:
+            Filtered list of archives that might contain query
+        """
+        # Tokenize query into words
+        query_tokens = query.lower().split()
+
+        candidates = []
+
+        for archive_file in archive_files:
+            bloom = self._get_bloom_filter(archive_file)
+
+            if bloom is None:
+                # No Bloom filter - include archive
+                candidates.append(archive_file)
+                continue
+
+            # Check if any query token might be in archive
+            might_match = any(bloom.might_contain(token) for token in query_tokens)
+
+            if might_match:
+                candidates.append(archive_file)
+
+        return candidates
+
+    @lru_cache(maxsize=100)
+    def _cached_search(self, query: str, limit: int) -> tuple[SearchResult, ...]:
+        """
+        Cached search implementation.
+
+        Args:
+            query: Search query
+            limit: Maximum results
+
+        Returns:
+            Tuple of SearchResult (immutable for caching)
+        """
+        # Get all archive files
+        archive_files = [f.name for f in self.archive_dir.glob("*.html")]
+
+        if not archive_files:
+            return tuple()
+
+        # Tier 1: Filter with Bloom filters
+        candidate_archives = self._filter_archives_with_bloom(query, archive_files)
+
+        if not candidate_archives:
+            return tuple()
+
+        # Tier 2 & 3: Search with FTS5 (includes snippet highlighting)
+        results = self.fts_index.search(
+            query, limit=limit, archive_files=candidate_archives
+        )
+
+        # Convert to SearchResult objects
+        search_results = [
+            SearchResult(
+                entity_id=r["entity_id"],
+                archive_file=r["archive_file"],
+                entity_type=r["entity_type"],
+                status=r["status"],
+                title_snippet=r["title_snippet"],
+                description_snippet=r["description_snippet"],
+                rank=r["rank"],
+            )
+            for r in results
+        ]
+
+        return tuple(search_results)
+
+    def search(
+        self, query: str, include_archived: bool = True, limit: int = 10
+    ) -> list[SearchResult]:
+        """
+        Search archives with three-tier optimization.
+
+        Args:
+            query: Search query
+            include_archived: Whether to search archives (future: also search active)
+            limit: Maximum results to return
+
+        Returns:
+            List of SearchResult objects sorted by relevance
+        """
+        if not include_archived:
+            return []
+
+        # Use cached search
+        results = self._cached_search(query, limit)
+        return list(results)
+
+    def get_search_stats(self, query: str) -> dict[str, Any]:
+        """
+        Get statistics about a search query.
+
+        Args:
+            query: Search query
+
+        Returns:
+            Dictionary with bloom_filtered_count, searched_count, etc.
+        """
+        archive_files = [f.name for f in self.archive_dir.glob("*.html")]
+        total_archives = len(archive_files)
+
+        candidate_archives = self._filter_archives_with_bloom(query, archive_files)
+        searched_count = len(candidate_archives)
+
+        bloom_filtered = total_archives - searched_count
+
+        return {
+            "total_archives": total_archives,
+            "bloom_filtered": bloom_filtered,
+            "searched_count": searched_count,
+            "filter_rate": bloom_filtered / total_archives if total_archives > 0 else 0,
+        }
+
+    def rebuild_bloom_filters(self) -> None:
+        """
+        Rebuild all Bloom filters from scratch.
+
+        Useful after archiving new entities.
+        """
+        # This will be implemented by ArchiveManager when creating archives
+        # For now, this is a placeholder
+        pass
+
+    def clear_cache(self) -> None:
+        """Clear the search cache."""
+        self._cached_search.cache_clear()
+        self._bloom_cache.clear()
+
+    def close(self) -> None:
+        """Close all resources."""
+        self.fts_index.close()
+        self._bloom_cache.clear()
+
+    def __enter__(self) -> "ArchiveSearchEngine":
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        """Context manager exit."""
+        self.close()
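For orientation, a minimal usage sketch of the new engine (not part of the package): the directory paths below are assumptions, and an FTS5 index plus Bloom filters are presumed to have been built already by ArchiveManager.

from pathlib import Path

from htmlgraph.archive.search import ArchiveSearchEngine

# Illustrative paths; ".htmlgraph/archives" mirrors the CLI code below,
# the index directory name is an assumption.
with ArchiveSearchEngine(
    archive_dir=Path(".htmlgraph/archives"),
    index_dir=Path(".htmlgraph/archive_index"),
) as engine:
    hits = engine.search("login timeout", limit=5)
    for hit in hits:
        print(hit.entity_id, hit.archive_file, hit.rank)
    print(engine.get_search_stats("login timeout"))  # total_archives, filter_rate, ...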
htmlgraph/cli.py
CHANGED
@@ -4326,6 +4326,87 @@ For more help: https://github.com/Shakes-tzd/htmlgraph
         "--format", "-f", choices=["text", "json"], default="text", help="Output format"
     )
 
+    # =========================================================================
+    # Archive Management
+    # =========================================================================
+
+    # archive (with subcommands)
+    archive_parser = subparsers.add_parser(
+        "archive", help="Archive management with optimized search"
+    )
+    archive_subparsers = archive_parser.add_subparsers(
+        dest="archive_command", help="Archive command"
+    )
+
+    # archive create
+    archive_create = archive_subparsers.add_parser(
+        "create", help="Create archive from old entities"
+    )
+    archive_create.add_argument(
+        "--older-than",
+        type=int,
+        default=90,
+        help="Archive entities older than N days (default: 90)",
+    )
+    archive_create.add_argument(
+        "--period",
+        choices=["quarter", "month", "year"],
+        default="quarter",
+        help="Archive grouping period (default: quarter)",
+    )
+    archive_create.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview what would be archived without making changes",
+    )
+    archive_create.add_argument(
+        "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+    )
+
+    # archive search
+    archive_search = archive_subparsers.add_parser(
+        "search", help="Search archived entities"
+    )
+    archive_search.add_argument("query", help="Search query")
+    archive_search.add_argument(
+        "--limit", "-l", type=int, default=10, help="Maximum results (default: 10)"
+    )
+    archive_search.add_argument(
+        "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+    )
+    archive_search.add_argument(
+        "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+    )
+
+    # archive stats
+    archive_stats = archive_subparsers.add_parser(
+        "stats", help="Show archive statistics"
+    )
+    archive_stats.add_argument(
+        "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+    )
+    archive_stats.add_argument(
+        "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+    )
+
+    # archive restore
+    archive_restore = archive_subparsers.add_parser(
+        "restore", help="Restore archived entity"
+    )
+    archive_restore.add_argument("entity_id", help="Entity ID to restore")
+    archive_restore.add_argument(
+        "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+    )
+
+    # archive list
+    archive_list = archive_subparsers.add_parser("list", help="List all archive files")
+    archive_list.add_argument(
+        "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+    )
+    archive_list.add_argument(
+        "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+    )
+
     # =========================================================================
     # Analytics
     # =========================================================================
@@ -4700,6 +4781,21 @@ For more help: https://github.com/Shakes-tzd/htmlgraph
         else:
             track_parser.print_help()
             sys.exit(1)
+    elif args.command == "archive":
+        # Archive management
+        if args.archive_command == "create":
+            cmd_archive_create(args)
+        elif args.archive_command == "search":
+            cmd_archive_search(args)
+        elif args.archive_command == "stats":
+            cmd_archive_stats(args)
+        elif args.archive_command == "restore":
+            cmd_archive_restore(args)
+        elif args.archive_command == "list":
+            cmd_archive_list(args)
+        else:
+            archive_parser.print_help()
+            sys.exit(1)
     elif args.command == "work":
         # Work management with smart routing
         if args.work_command == "next":
@@ -4971,5 +5067,180 @@ def cmd_sync_docs(args: argparse.Namespace) -> int:
     return 1 if any("⚠️" in c or "❌" in c for c in changes) else 0
 
 
+# =============================================================================
+# Archive Management Commands
+# =============================================================================
+
+
+def cmd_archive_create(args: argparse.Namespace) -> None:
+    """Create archive from old entities."""
+    from pathlib import Path
+
+    from htmlgraph.archive import ArchiveManager
+
+    htmlgraph_dir = Path(args.graph_dir).resolve()
+
+    if not htmlgraph_dir.exists():
+        print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    manager = ArchiveManager(htmlgraph_dir)
+
+    # Run archive operation
+    result = manager.archive_entities(
+        older_than_days=args.older_than,
+        period=args.period,
+        dry_run=args.dry_run,
+    )
+
+    if result["dry_run"]:
+        print("\n🔍 DRY RUN - Preview (no changes made)\n")
+        print(f"Would archive: {result['would_archive']} entities")
+        print(f"Archive files: {len(result['archive_files'])}")
+        print("\nDetails:")
+        for archive_key, count in result["details"].items():
+            print(f" {archive_key}: {count} entities")
+    else:
+        print(f"\n✅ Archived {result['archived_count']} entities")
+        print(f"Created {len(result['archive_files'])} archive file(s):")
+        for archive_file in result["archive_files"]:
+            count = result["details"].get(archive_file.replace(".html", ""), 0)
+            print(f" - {archive_file} ({count} entities)")
+
+    manager.close()
+
+
+def cmd_archive_search(args: argparse.Namespace) -> None:
+    """Search archived entities."""
+    import json
+    from pathlib import Path
+
+    from htmlgraph.archive import ArchiveManager
+
+    htmlgraph_dir = Path(args.graph_dir).resolve()
+
+    if not htmlgraph_dir.exists():
+        print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    manager = ArchiveManager(htmlgraph_dir)
+
+    # Search archives
+    results = manager.search(args.query, limit=args.limit)
+
+    if args.format == "json":
+        print(json.dumps({"query": args.query, "results": results}, indent=2))
+    else:
+        print(f"\n🔍 Search results for: '{args.query}'\n")
+        print(f"Found {len(results)} result(s):\n")
+
+        for i, result in enumerate(results, 1):
+            print(f"{i}. {result['entity_id']} ({result['entity_type']})")
+            print(f" Archive: {result['archive_file']}")
+            print(f" Status: {result['status']}")
+            print(f" Title: {result['title_snippet']}")
+            if result["description_snippet"]:
+                print(f" Description: {result['description_snippet']}")
+            print(f" Relevance: {result['rank']:.2f}")
+            print()
+
+    manager.close()
+
+
+def cmd_archive_stats(args: argparse.Namespace) -> None:
+    """Show archive statistics."""
+    import json
+    from pathlib import Path
+
+    from htmlgraph.archive import ArchiveManager
+
+    htmlgraph_dir = Path(args.graph_dir).resolve()
+
+    if not htmlgraph_dir.exists():
+        print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    manager = ArchiveManager(htmlgraph_dir)
+
+    # Get statistics
+    stats = manager.get_archive_stats()
+
+    if args.format == "json":
+        print(json.dumps(stats, indent=2))
+    else:
+        print("\n📊 Archive Statistics\n")
+        print(f"Archive files: {stats['archive_count']}")
+        print(f"Archived entities: {stats['entity_count']}")
+        print(f"Total size: {stats['total_size_mb']:.2f} MB")
+        print(f"FTS5 index: {stats['fts_size_mb']:.2f} MB")
+        print(
+            f"Bloom filters: {stats['bloom_size_kb']:.2f} KB ({stats['bloom_count']} files)"
+        )
+
+    manager.close()
+
+
+def cmd_archive_restore(args: argparse.Namespace) -> None:
+    """Restore archived entity."""
+    from pathlib import Path
+
+    from htmlgraph.archive import ArchiveManager
+
+    htmlgraph_dir = Path(args.graph_dir).resolve()
+
+    if not htmlgraph_dir.exists():
+        print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    manager = ArchiveManager(htmlgraph_dir)
+
+    # Restore entity
+    success = manager.unarchive(args.entity_id)
+
+    if success:
+        print(f"✅ Restored {args.entity_id} from archive")
+    else:
+        print(f"❌ Entity not found in archives: {args.entity_id}", file=sys.stderr)
+        sys.exit(1)
+
+    manager.close()
+
+
+def cmd_archive_list(args: argparse.Namespace) -> None:
+    """List all archive files."""
+    import json
+    from pathlib import Path
+
+    htmlgraph_dir = Path(args.graph_dir).resolve()
+
+    if not htmlgraph_dir.exists():
+        print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    archive_dir = htmlgraph_dir / "archives"
+
+    if not archive_dir.exists():
+        print("No archives found")
+        return
+
+    archive_files = sorted(archive_dir.glob("*.html"))
+
+    if args.format == "json":
+        file_list = [
+            {
+                "filename": f.name,
+                "size_kb": f.stat().st_size / 1024,
+                "modified": f.stat().st_mtime,
+            }
+            for f in archive_files
+        ]
+        print(json.dumps({"archives": file_list}, indent=2))
+    else:
+        print(f"\n📦 Archive Files ({len(archive_files)})\n")
+        for f in archive_files:
+            size_kb = f.stat().st_size / 1024
+            print(f" - {f.name} ({size_kb:.1f} KB)")
+
+
 if __name__ == "__main__":
     main()
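The CLI handlers above are thin wrappers around ArchiveManager; a condensed sketch of the same flow when calling it directly. Method names and result keys are taken from the handlers; the paths, example query, and entity id are illustrative.

from pathlib import Path

from htmlgraph.archive import ArchiveManager

manager = ArchiveManager(Path(".htmlgraph").resolve())

# Preview an archive pass (mirrors `archive create --dry-run`).
preview = manager.archive_entities(older_than_days=90, period="quarter", dry_run=True)
print(preview["would_archive"], "entities would be archived")

# Full-text search over archived entities (mirrors `archive search`).
for hit in manager.search("login timeout", limit=5):
    print(hit["entity_id"], hit["archive_file"], hit["rank"])

print(manager.get_archive_stats())  # archive_count, entity_count, index sizes
manager.unarchive("feature-123")    # entity id is illustrative
manager.close()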
htmlgraph/converter.py
CHANGED
@@ -548,6 +548,45 @@ def html_to_session(filepath: Path | str) -> Session:
     # Activity log in HTML is reversed (newest first), so reverse back
     data["activity_log"] = list(reversed(activity_log))
 
+    # Parse detected patterns from table (if present)
+    detected_patterns = []
+    for tr in parser.query("section[data-detected-patterns] table tbody tr"):
+        # Extract pattern data from table row
+        pattern_type = tr.attrs.get("data-pattern-type", "neutral")
+
+        # Extract sequence from first <td class="sequence">
+        seq_td = tr.query_one("td.sequence")
+        sequence_str = seq_td.to_text().strip() if seq_td else ""
+        sequence = [s.strip() for s in sequence_str.split("→")] if sequence_str else []
+
+        # Extract count from third <td>
+        count_td = tr.query_all("td")[2] if len(tr.query_all("td")) > 2 else None
+        count_str = count_td.to_text().strip() if count_td else "0"
+        try:
+            count = int(count_str)
+        except (ValueError, TypeError):
+            count = 0
+
+        # Extract timestamps from fourth <td>
+        time_td = tr.query_all("td")[3] if len(tr.query_all("td")) > 3 else None
+        time_str = time_td.to_text().strip() if time_td else ""
+        times = time_str.split(" / ")
+        first_detected = times[0].strip() if len(times) > 0 else ""
+        last_detected = times[1].strip() if len(times) > 1 else ""
+
+        if sequence:  # Only add if we have a valid sequence
+            detected_patterns.append(
+                {
+                    "sequence": sequence,
+                    "pattern_type": pattern_type,
+                    "detection_count": count,
+                    "first_detected": first_detected,
+                    "last_detected": last_detected,
+                }
+            )
+
+    data["detected_patterns"] = detected_patterns
+
     return Session(**data)
 
 
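A rough sketch of the markup this new parsing loop expects and the dict it produces. The row below is hypothetical: the data attributes, the td.sequence class, and the "→" and " / " separators come from the code above, while the second column's content and all values are invented for illustration.

# Hypothetical detected-patterns row matched by
# "section[data-detected-patterns] table tbody tr":
sample_row = """
<section data-detected-patterns>
  <table><tbody>
    <tr data-pattern-type="inefficient">
      <td class="sequence">Read → Edit → Read</td>
      <td>…</td>
      <td>3</td>
      <td>2025-01-02T10:00:00 / 2025-01-05T16:30:00</td>
    </tr>
  </tbody></table>
</section>
"""

# Dict the loop would append to data["detected_patterns"] for that row:
expected = {
    "sequence": ["Read", "Edit", "Read"],
    "pattern_type": "inefficient",
    "detection_count": 3,
    "first_detected": "2025-01-02T10:00:00",
    "last_detected": "2025-01-05T16:30:00",
}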