htmlgraph 0.20.9__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff shows the changes between the two publicly released package versions as they appear in the public registry.
@@ -0,0 +1,244 @@
+ """
+ Archive search engine with three-tier optimization.
+
+ Tier 1: Bloom filters (skip 70-90% of archives)
+ Tier 2: SQLite FTS5 with BM25 ranking (O(log n) search)
+ Tier 3: Snippet extraction and highlighting
+
+ Target: 67x faster than naive multi-file search.
+ """
+
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ from htmlgraph.archive.bloom import BloomFilter
+ from htmlgraph.archive.fts import ArchiveFTS5Index
+
+
+ @dataclass
+ class SearchResult:
+     """
+     Search result from archive search.
+
+     Attributes:
+         entity_id: Entity identifier
+         archive_file: Archive file containing the entity
+         entity_type: Type of entity (feature, bug, etc.)
+         status: Entity status
+         title_snippet: Title with highlighted matches
+         description_snippet: Description with highlighted matches
+         rank: BM25 relevance score (lower is better)
+     """
+
+     entity_id: str
+     archive_file: str
+     entity_type: str
+     status: str
+     title_snippet: str
+     description_snippet: str
+     rank: float
+
+
+ class ArchiveSearchEngine:
+     """
+     Orchestrates three-tier archive search.
+
+     Workflow:
+         1. Check Bloom filters to skip irrelevant archives (70-90% filtered)
+         2. Search remaining archives with FTS5 + BM25 ranking
+         3. Extract and highlight snippets for top results
+     """
+
+     def __init__(self, archive_dir: Path, index_dir: Path) -> None:
+         """
+         Initialize the search engine.
+
+         Args:
+             archive_dir: Directory containing archive HTML files
+             index_dir: Directory for Bloom filters and the FTS5 index
+         """
+         self.archive_dir = archive_dir
+         self.index_dir = index_dir
+         self.index_dir.mkdir(parents=True, exist_ok=True)
+
+         # Initialize FTS5 index
+         self.fts_index = ArchiveFTS5Index(index_dir / "archives.db")
+
+         # Cache for Bloom filters (avoid reloading)
+         self._bloom_cache: dict[str, BloomFilter] = {}
+
+     def _get_bloom_filter(self, archive_file: str) -> BloomFilter | None:
+         """
+         Get the Bloom filter for an archive (with caching).
+
+         Args:
+             archive_file: Archive filename
+
+         Returns:
+             BloomFilter, or None if the archive is not indexed
+         """
+         if archive_file in self._bloom_cache:
+             return self._bloom_cache[archive_file]
+
+         bloom_path = self.index_dir / f"{archive_file}.bloom"
+         if bloom_path.exists():
+             bloom = BloomFilter.load(bloom_path)
+             self._bloom_cache[archive_file] = bloom
+             return bloom
+
+         return None
+
+     def _filter_archives_with_bloom(
+         self, query: str, archive_files: list[str]
+     ) -> list[str]:
+         """
+         Filter archive files using Bloom filters.
+
+         Args:
+             query: Search query
+             archive_files: List of all archive files
+
+         Returns:
+             Filtered list of archives that might contain the query
+         """
+         # Tokenize query into words
+         query_tokens = query.lower().split()
+
+         candidates = []
+
+         for archive_file in archive_files:
+             bloom = self._get_bloom_filter(archive_file)
+
+             if bloom is None:
+                 # No Bloom filter - include the archive
+                 candidates.append(archive_file)
+                 continue
+
+             # Check whether any query token might be in the archive
+             might_match = any(bloom.might_contain(token) for token in query_tokens)
+
+             if might_match:
+                 candidates.append(archive_file)
+
+         return candidates
+
+     @lru_cache(maxsize=100)
+     def _cached_search(self, query: str, limit: int) -> tuple[SearchResult, ...]:
+         """
+         Cached search implementation.
+
+         Args:
+             query: Search query
+             limit: Maximum results
+
+         Returns:
+             Tuple of SearchResult (immutable, so results can be cached)
+         """
+         # Get all archive files
+         archive_files = [f.name for f in self.archive_dir.glob("*.html")]
+
+         if not archive_files:
+             return tuple()
+
+         # Tier 1: Filter with Bloom filters
+         candidate_archives = self._filter_archives_with_bloom(query, archive_files)
+
+         if not candidate_archives:
+             return tuple()
+
+         # Tiers 2 & 3: Search with FTS5 (includes snippet highlighting)
+         results = self.fts_index.search(
+             query, limit=limit, archive_files=candidate_archives
+         )
+
+         # Convert to SearchResult objects
+         search_results = [
+             SearchResult(
+                 entity_id=r["entity_id"],
+                 archive_file=r["archive_file"],
+                 entity_type=r["entity_type"],
+                 status=r["status"],
+                 title_snippet=r["title_snippet"],
+                 description_snippet=r["description_snippet"],
+                 rank=r["rank"],
+             )
+             for r in results
+         ]
+
+         return tuple(search_results)
+
+     def search(
+         self, query: str, include_archived: bool = True, limit: int = 10
+     ) -> list[SearchResult]:
+         """
+         Search archives with three-tier optimization.
+
+         Args:
+             query: Search query
+             include_archived: Whether to search archives (future: also search active)
+             limit: Maximum results to return
+
+         Returns:
+             List of SearchResult objects sorted by relevance
+         """
+         if not include_archived:
+             return []
+
+         # Use the cached search
+         results = self._cached_search(query, limit)
+         return list(results)
+
+     def get_search_stats(self, query: str) -> dict[str, Any]:
+         """
+         Get statistics about a search query.
+
+         Args:
+             query: Search query
+
+         Returns:
+             Dictionary with total_archives, bloom_filtered, searched_count,
+             and filter_rate
+         """
+         archive_files = [f.name for f in self.archive_dir.glob("*.html")]
+         total_archives = len(archive_files)
+
+         candidate_archives = self._filter_archives_with_bloom(query, archive_files)
+         searched_count = len(candidate_archives)
+
+         bloom_filtered = total_archives - searched_count
+
+         return {
+             "total_archives": total_archives,
+             "bloom_filtered": bloom_filtered,
+             "searched_count": searched_count,
+             "filter_rate": bloom_filtered / total_archives if total_archives > 0 else 0,
+         }
+
+     def rebuild_bloom_filters(self) -> None:
+         """
+         Rebuild all Bloom filters from scratch.
+
+         Useful after archiving new entities.
+         """
+         # This will be implemented by ArchiveManager when creating archives.
+         # For now, this is a placeholder.
+         pass
+
+     def clear_cache(self) -> None:
+         """Clear the search cache."""
+         self._cached_search.cache_clear()
+         self._bloom_cache.clear()
+
+     def close(self) -> None:
+         """Close all resources."""
+         self.fts_index.close()
+         self._bloom_cache.clear()
+
+     def __enter__(self) -> "ArchiveSearchEngine":
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, *args: Any) -> None:
+         """Context manager exit."""
+         self.close()
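For orientation, a minimal usage sketch of the new engine. The import path (`htmlgraph.archive.search`) and the on-disk layout are assumptions — the diff names the class and its constructor arguments but not the file it lands in, and the CLI below reaches it through `ArchiveManager` rather than directly:

```python
from pathlib import Path

# Assumed import path; the diff does not show the new module's filename.
from htmlgraph.archive.search import ArchiveSearchEngine

graph_dir = Path(".htmlgraph")  # illustrative; the CLI derives this from --graph-dir

# The engine is a context manager, so the FTS5 connection is closed on exit.
with ArchiveSearchEngine(graph_dir / "archives", graph_dir / "indexes") as engine:
    for hit in engine.search("bloom filter", limit=5):
        print(f"{hit.entity_id}  rank={hit.rank:.2f}  {hit.title_snippet}")

    # How much work Tier 1 saved for this query:
    stats = engine.get_search_stats("bloom filter")
    print(
        f"Bloom filters skipped {stats['bloom_filtered']}/{stats['total_archives']} "
        f"archives ({stats['filter_rate']:.0%})"
    )
```

One design note: `_cached_search` is wrapped in `functools.lru_cache` on an instance method, so the cache holds a strong reference to the engine (and its Bloom filters) until `clear_cache()` is called or the process exits.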
htmlgraph/cli.py CHANGED
@@ -4326,6 +4326,87 @@ For more help: https://github.com/Shakes-tzd/htmlgraph
      "--format", "-f", choices=["text", "json"], default="text", help="Output format"
  )

+ # =========================================================================
+ # Archive Management
+ # =========================================================================
+
+ # archive (with subcommands)
+ archive_parser = subparsers.add_parser(
+     "archive", help="Archive management with optimized search"
+ )
+ archive_subparsers = archive_parser.add_subparsers(
+     dest="archive_command", help="Archive command"
+ )
+
+ # archive create
+ archive_create = archive_subparsers.add_parser(
+     "create", help="Create archive from old entities"
+ )
+ archive_create.add_argument(
+     "--older-than",
+     type=int,
+     default=90,
+     help="Archive entities older than N days (default: 90)",
+ )
+ archive_create.add_argument(
+     "--period",
+     choices=["quarter", "month", "year"],
+     default="quarter",
+     help="Archive grouping period (default: quarter)",
+ )
+ archive_create.add_argument(
+     "--dry-run",
+     action="store_true",
+     help="Preview what would be archived without making changes",
+ )
+ archive_create.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+
+ # archive search
+ archive_search = archive_subparsers.add_parser(
+     "search", help="Search archived entities"
+ )
+ archive_search.add_argument("query", help="Search query")
+ archive_search.add_argument(
+     "--limit", "-l", type=int, default=10, help="Maximum results (default: 10)"
+ )
+ archive_search.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+ archive_search.add_argument(
+     "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+ )
+
+ # archive stats
+ archive_stats = archive_subparsers.add_parser(
+     "stats", help="Show archive statistics"
+ )
+ archive_stats.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+ archive_stats.add_argument(
+     "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+ )
+
+ # archive restore
+ archive_restore = archive_subparsers.add_parser(
+     "restore", help="Restore archived entity"
+ )
+ archive_restore.add_argument("entity_id", help="Entity ID to restore")
+ archive_restore.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+
+ # archive list
+ archive_list = archive_subparsers.add_parser("list", help="List all archive files")
+ archive_list.add_argument(
+     "--graph-dir", "-g", default=".htmlgraph", help="Graph directory"
+ )
+ archive_list.add_argument(
+     "--format", "-f", choices=["text", "json"], default="text", help="Output format"
+ )
+
  # =========================================================================
  # Analytics
  # =========================================================================
@@ -4700,6 +4781,21 @@ For more help: https://github.com/Shakes-tzd/htmlgraph
  else:
      track_parser.print_help()
      sys.exit(1)
+ elif args.command == "archive":
+     # Archive management
+     if args.archive_command == "create":
+         cmd_archive_create(args)
+     elif args.archive_command == "search":
+         cmd_archive_search(args)
+     elif args.archive_command == "stats":
+         cmd_archive_stats(args)
+     elif args.archive_command == "restore":
+         cmd_archive_restore(args)
+     elif args.archive_command == "list":
+         cmd_archive_list(args)
+     else:
+         archive_parser.print_help()
+         sys.exit(1)
  elif args.command == "work":
      # Work management with smart routing
      if args.work_command == "next":
@@ -4971,5 +5067,180 @@ def cmd_sync_docs(args: argparse.Namespace) -> int:
      return 1 if any("⚠️" in c or "❌" in c for c in changes) else 0


+ # =============================================================================
+ # Archive Management Commands
+ # =============================================================================
+
+
+ def cmd_archive_create(args: argparse.Namespace) -> None:
+     """Create archive from old entities."""
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Run the archive operation
+     result = manager.archive_entities(
+         older_than_days=args.older_than,
+         period=args.period,
+         dry_run=args.dry_run,
+     )
+
+     if result["dry_run"]:
+         print("\n🔍 DRY RUN - Preview (no changes made)\n")
+         print(f"Would archive: {result['would_archive']} entities")
+         print(f"Archive files: {len(result['archive_files'])}")
+         print("\nDetails:")
+         for archive_key, count in result["details"].items():
+             print(f"  {archive_key}: {count} entities")
+     else:
+         print(f"\n✅ Archived {result['archived_count']} entities")
+         print(f"Created {len(result['archive_files'])} archive file(s):")
+         for archive_file in result["archive_files"]:
+             count = result["details"].get(archive_file.replace(".html", ""), 0)
+             print(f"  - {archive_file} ({count} entities)")
+
+     manager.close()
+
+
+ def cmd_archive_search(args: argparse.Namespace) -> None:
+     """Search archived entities."""
+     import json
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Search archives
+     results = manager.search(args.query, limit=args.limit)
+
+     if args.format == "json":
+         print(json.dumps({"query": args.query, "results": results}, indent=2))
+     else:
+         print(f"\n🔍 Search results for: '{args.query}'\n")
+         print(f"Found {len(results)} result(s):\n")
+
+         for i, result in enumerate(results, 1):
+             print(f"{i}. {result['entity_id']} ({result['entity_type']})")
+             print(f"   Archive: {result['archive_file']}")
+             print(f"   Status: {result['status']}")
+             print(f"   Title: {result['title_snippet']}")
+             if result["description_snippet"]:
+                 print(f"   Description: {result['description_snippet']}")
+             print(f"   Relevance: {result['rank']:.2f}")
+             print()
+
+     manager.close()
+
+
+ def cmd_archive_stats(args: argparse.Namespace) -> None:
+     """Show archive statistics."""
+     import json
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Get statistics
+     stats = manager.get_archive_stats()
+
+     if args.format == "json":
+         print(json.dumps(stats, indent=2))
+     else:
+         print("\n📊 Archive Statistics\n")
+         print(f"Archive files: {stats['archive_count']}")
+         print(f"Archived entities: {stats['entity_count']}")
+         print(f"Total size: {stats['total_size_mb']:.2f} MB")
+         print(f"FTS5 index: {stats['fts_size_mb']:.2f} MB")
+         print(
+             f"Bloom filters: {stats['bloom_size_kb']:.2f} KB ({stats['bloom_count']} files)"
+         )
+
+     manager.close()
+
+
+ def cmd_archive_restore(args: argparse.Namespace) -> None:
+     """Restore an archived entity."""
+     from pathlib import Path
+
+     from htmlgraph.archive import ArchiveManager
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     manager = ArchiveManager(htmlgraph_dir)
+
+     # Restore the entity
+     success = manager.unarchive(args.entity_id)
+
+     if success:
+         print(f"✅ Restored {args.entity_id} from archive")
+     else:
+         print(f"❌ Entity not found in archives: {args.entity_id}", file=sys.stderr)
+         sys.exit(1)
+
+     manager.close()
+
+
+ def cmd_archive_list(args: argparse.Namespace) -> None:
+     """List all archive files."""
+     import json
+     from pathlib import Path
+
+     htmlgraph_dir = Path(args.graph_dir).resolve()
+
+     if not htmlgraph_dir.exists():
+         print(f"Error: Directory not found: {htmlgraph_dir}", file=sys.stderr)
+         sys.exit(1)
+
+     archive_dir = htmlgraph_dir / "archives"
+
+     if not archive_dir.exists():
+         print("No archives found")
+         return
+
+     archive_files = sorted(archive_dir.glob("*.html"))
+
+     if args.format == "json":
+         file_list = [
+             {
+                 "filename": f.name,
+                 "size_kb": f.stat().st_size / 1024,
+                 "modified": f.stat().st_mtime,
+             }
+             for f in archive_files
+         ]
+         print(json.dumps({"archives": file_list}, indent=2))
+     else:
+         print(f"\n📦 Archive Files ({len(archive_files)})\n")
+         for f in archive_files:
+             size_kb = f.stat().st_size / 1024
+             print(f"  - {f.name} ({size_kb:.1f} KB)")
+
+
  if __name__ == "__main__":
      main()
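The diff does not include `htmlgraph/archive/fts.py`, but Tiers 2 and 3 map directly onto SQLite's FTS5 extension, which CPython's bundled SQLite typically ships with. A self-contained sketch of the kind of query `ArchiveFTS5Index.search` presumably runs — the table name, columns, and highlight markers are assumptions, not the package's actual schema:

```python
import sqlite3

# In-memory stand-in for archives.db; the real schema is an assumption.
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE VIRTUAL TABLE entities USING fts5(entity_id, title, description)"
)
conn.execute(
    "INSERT INTO entities VALUES ('feat-001', 'Add bloom filters', "
    "'Skip archives that cannot contain the query')"
)

# bm25() returns a negative score where lower means more relevant, so
# ORDER BY rank ascending puts the best matches first (Tier 2), which is
# why SearchResult documents rank as "lower is better".
# snippet() extracts a highlighted fragment around the match (Tier 3).
rows = conn.execute(
    """
    SELECT entity_id,
           snippet(entities, 1, '<mark>', '</mark>', '...', 8) AS rank_snippet,
           bm25(entities) AS rank
    FROM entities
    WHERE entities MATCH ?
    ORDER BY rank
    LIMIT 10
    """,
    ("bloom",),
).fetchall()

for entity_id, title_snippet, rank in rows:
    print(entity_id, rank, title_snippet)
```

From the command line the same path is exercised by the new subcommand, presumably invoked as `htmlgraph archive search "bloom" --format json` (the console-script name is not shown in this diff).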
htmlgraph/converter.py CHANGED
@@ -548,6 +548,45 @@ def html_to_session(filepath: Path | str) -> Session:
      # Activity log in HTML is reversed (newest first), so reverse back
      data["activity_log"] = list(reversed(activity_log))

+     # Parse detected patterns from table (if present)
+     detected_patterns = []
+     for tr in parser.query("section[data-detected-patterns] table tbody tr"):
+         # Extract pattern data from the table row
+         pattern_type = tr.attrs.get("data-pattern-type", "neutral")
+
+         # Extract the sequence from the first <td class="sequence">
+         seq_td = tr.query_one("td.sequence")
+         sequence_str = seq_td.to_text().strip() if seq_td else ""
+         sequence = [s.strip() for s in sequence_str.split("→")] if sequence_str else []
+
+         # Extract the count from the third <td>
+         count_td = tr.query_all("td")[2] if len(tr.query_all("td")) > 2 else None
+         count_str = count_td.to_text().strip() if count_td else "0"
+         try:
+             count = int(count_str)
+         except (ValueError, TypeError):
+             count = 0
+
+         # Extract timestamps from the fourth <td>
+         time_td = tr.query_all("td")[3] if len(tr.query_all("td")) > 3 else None
+         time_str = time_td.to_text().strip() if time_td else ""
+         times = time_str.split(" / ")
+         first_detected = times[0].strip() if len(times) > 0 else ""
+         last_detected = times[1].strip() if len(times) > 1 else ""
+
+         if sequence:  # Only add if we have a valid sequence
+             detected_patterns.append(
+                 {
+                     "sequence": sequence,
+                     "pattern_type": pattern_type,
+                     "detection_count": count,
+                     "first_detected": first_detected,
+                     "last_detected": last_detected,
+                 }
+             )
+
+     data["detected_patterns"] = detected_patterns
+
      return Session(**data)

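The selectors above imply a specific table shape: a `section[data-detected-patterns]` wrapper, a `data-pattern-type` attribute on each `tr`, a `td.sequence` cell joined with "→", a count cell, and a "first / last" timestamp cell. A small sketch of the splitting rules applied to one such row — the sample values are invented, and the cells are shown as plain strings rather than parsed HTML:

```python
# Hypothetical row contents, reverse-engineered from the selectors above.
row_cells = {
    "data-pattern-type": "productive",               # <tr data-pattern-type="...">
    "sequence": "edit → test → commit",              # <td class="sequence">
    "count": "7",                                    # third <td>
    "times": "2024-01-05T10:00 / 2024-03-02T16:30",  # fourth <td>, "first / last"
}

# The same splitting rules the converter applies:
sequence = [s.strip() for s in row_cells["sequence"].split("→")]
first_detected, last_detected = (t.strip() for t in row_cells["times"].split(" / "))

pattern = {
    "sequence": sequence,  # ['edit', 'test', 'commit']
    "pattern_type": row_cells["data-pattern-type"],
    "detection_count": int(row_cells["count"]),
    "first_detected": first_detected,
    "last_detected": last_detected,
}
print(pattern)
```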