PyPI - footprinter-cli - Versions diffs - 1.0.0__py3-none-any.whl - Mend

footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

footprinter/__init__.py +8 -0
footprinter/access.py +444 -0
footprinter/api/__init__.py +1 -0
footprinter/api/db.py +61 -0
footprinter/api/entities.py +250 -0
footprinter/api/search.py +47 -0
footprinter/api/semantic.py +33 -0
footprinter/api/server.py +66 -0
footprinter/api/status.py +15 -0
footprinter/bundled/__init__.py +0 -0
footprinter/bundled/config.example.yaml +161 -0
footprinter/bundled/patterns/context_patterns.yaml +18 -0
footprinter/bundled/patterns/extensions.yaml +283 -0
footprinter/bundled/patterns/filename_patterns.yaml +61 -0
footprinter/bundled/patterns/mime_mappings.yaml +68 -0
footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
footprinter/bundled/patterns/security_patterns.yaml +27 -0
footprinter/cli/__init__.py +128 -0
footprinter/cli/__main__.py +6 -0
footprinter/cli/_common.py +332 -0
footprinter/cli/_policy_helpers.py +646 -0
footprinter/cli/_prompt.py +220 -0
footprinter/cli/api_cmd.py +32 -0
footprinter/cli/connect.py +591 -0
footprinter/cli/data.py +879 -0
footprinter/cli/delete.py +128 -0
footprinter/cli/ingest.py +579 -0
footprinter/cli/mcp_cmd.py +750 -0
footprinter/cli/mcp_setup.py +306 -0
footprinter/cli/search.py +393 -0
footprinter/cli/search_cmd.py +69 -0
footprinter/cli/setup.py +1836 -0
footprinter/cli/status.py +729 -0
footprinter/cli/status_cmd.py +104 -0
footprinter/cli/upsert.py +794 -0
footprinter/cli/vectorize_cmd.py +215 -0
footprinter/cli/view.py +322 -0
footprinter/connectors/__init__.py +171 -0
footprinter/connectors/config_utils.py +141 -0
footprinter/db/__init__.py +37 -0
footprinter/db/browser.py +198 -0
footprinter/db/chats.py +610 -0
footprinter/db/clients.py +307 -0
footprinter/db/emails.py +279 -0
footprinter/db/files.py +741 -0
footprinter/db/folders.py +659 -0
footprinter/db/messages.py +192 -0
footprinter/db/policies.py +151 -0
footprinter/db/projects.py +673 -0
footprinter/db/search.py +573 -0
footprinter/db/sql_utils.py +168 -0
footprinter/db/status.py +320 -0
footprinter/db/uploads.py +70 -0
footprinter/ingest/__init__.py +0 -0
footprinter/ingest/adapters/__init__.py +33 -0
footprinter/ingest/adapters/browser.py +54 -0
footprinter/ingest/adapters/chat.py +57 -0
footprinter/ingest/adapters/ingest.py +146 -0
footprinter/ingest/adapters/local_files.py +68 -0
footprinter/ingest/adapters/local_folders.py +52 -0
footprinter/ingest/adapters/protocol.py +174 -0
footprinter/ingest/browser_indexer.py +216 -0
footprinter/ingest/chat_dedup.py +156 -0
footprinter/ingest/chat_indexer.py +515 -0
footprinter/ingest/chat_parsers/__init__.py +8 -0
footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
footprinter/ingest/chat_parsers/claude_parser.py +161 -0
footprinter/ingest/cli.py +827 -0
footprinter/ingest/content_extractors.py +117 -0
footprinter/ingest/database.py +36 -0
footprinter/ingest/db/__init__.py +1 -0
footprinter/ingest/db/connector_schema.py +47 -0
footprinter/ingest/db/migration.py +328 -0
footprinter/ingest/db/schema.py +1043 -0
footprinter/ingest/db/security.py +6 -0
footprinter/ingest/file_indexer.py +261 -0
footprinter/ingest/file_scanner.py +277 -0
footprinter/ingest/folder_indexer.py +226 -0
footprinter/ingest/full_content_extractor.py +321 -0
footprinter/ingest/orchestrator.py +125 -0
footprinter/ingest/pipe_runner.py +217 -0
footprinter/ingest/processing.py +165 -0
footprinter/ingest/registry.py +201 -0
footprinter/ingest/run_record.py +91 -0
footprinter/ingest/status.py +346 -0
footprinter/mcp/__init__.py +0 -0
footprinter/mcp/__main__.py +5 -0
footprinter/mcp/db.py +57 -0
footprinter/mcp/errors.py +102 -0
footprinter/mcp/extraction.py +226 -0
footprinter/mcp/server.py +39 -0
footprinter/mcp/tools/__init__.py +0 -0
footprinter/mcp/tools/navigation.py +70 -0
footprinter/mcp/tools/read.py +75 -0
footprinter/mcp/tools/search.py +158 -0
footprinter/mcp/tools/semantic.py +79 -0
footprinter/mcp/tools/status.py +15 -0
footprinter/paths.py +91 -0
footprinter/permissions.py +1160 -0
footprinter/semantic/__init__.py +13 -0
footprinter/semantic/chunking.py +52 -0
footprinter/semantic/embeddings.py +23 -0
footprinter/semantic/hybrid_search.py +273 -0
footprinter/semantic/vector_store.py +471 -0
footprinter/services/__init__.py +49 -0
footprinter/services/access_service.py +342 -0
footprinter/services/chat_service.py +85 -0
footprinter/services/client_service.py +267 -0
footprinter/services/content_service.py +181 -0
footprinter/services/email_service.py +89 -0
footprinter/services/file_service.py +83 -0
footprinter/services/folder_service.py +122 -0
footprinter/services/includes.py +19 -0
footprinter/services/ingest_service.py +231 -0
footprinter/services/project_service.py +262 -0
footprinter/services/roles.py +25 -0
footprinter/services/search_service.py +177 -0
footprinter/services/semantic_service.py +360 -0
footprinter/services/status_service.py +18 -0
footprinter/services/visit_service.py +65 -0
footprinter/source_registry.py +194 -0
footprinter/utils/__init__.py +7 -0
footprinter/utils/hash_utils.py +59 -0
footprinter/utils/logging_config.py +68 -0
footprinter/utils/mime.py +30 -0
footprinter/utils/text.py +6 -0
footprinter/utils/time.py +11 -0
footprinter/visibility.py +1272 -0
footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
footprinter_cli-1.0.0.dist-info/METADATA +229 -0
footprinter_cli-1.0.0.dist-info/RECORD +134 -0
footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0

footprinter/ingest/browser_indexer.py ADDED Viewed

@@ -0,0 +1,216 @@
+"""
+Browser parsers for Safari and Chrome.
+"""
+import logging
+import platform
+import shutil
+import sqlite3
+import tempfile
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Dict, Generator
+from footprinter.utils.time import UTC_FMT
+logger = logging.getLogger(__name__)
+class BrowserParser:
+    """Base class for browser history parsing."""
+    def __init__(self, lookback_days: int = 14, since: datetime | None = None):
+        self.lookback_days = lookback_days
+        # Ensure cutoff is tz-aware UTC for comparison with tz-aware epoch constants
+        if since is not None:
+            self.cutoff_date = since.astimezone(timezone.utc) if since.tzinfo else since.replace(tzinfo=timezone.utc)
+        else:
+            self.cutoff_date = datetime.now(timezone.utc) - timedelta(days=lookback_days)
+    def parse(self) -> Generator[Dict, None, None]:
+        """Parse browser history. To be implemented by subclasses."""
+        raise NotImplementedError
class SafariParser(BrowserParser):
    """Parse Safari browser history.

    The history database is only looked for on macOS; on any other platform
    ``history_db_path`` is ``None`` and ``parse`` yields nothing.
    """

    def __init__(self, lookback_days: int = 14, since: datetime | None = None):
        super().__init__(lookback_days, since=since)
        if platform.system() != "Darwin":
            self.history_db_path = None
        else:
            self.history_db_path = Path.home() / "Library" / "Safari" / "History.db"

    def parse(self) -> Generator[Dict, None, None]:
        """Yield one record per Safari visit newer than ``self.cutoff_date``.

        The database is queried through a temporary copy. When a ``-wal``
        sidecar sits next to the database it is copied as well, so visits
        that were committed but not yet checkpointed into the main file are
        not lost.

        Yields:
            Dicts with ``url``, ``title``, ``visit_time`` (formatted with
            ``UTC_FMT``), ``browser`` (``"safari"``) and ``visit_count``
            (always 1), newest visit first.

        Exceptions raised while copying or querying are logged and end the
        iteration instead of propagating.
        """
        if self.history_db_path is None:
            logger.warning(
                "Safari history parsing skipped (unsupported platform: %s)",
                platform.system(),
            )
            return
        if not self.history_db_path.exists():
            logger.warning(f"Safari history not found at {self.history_db_path}")
            return

        # Safari's History.db may be locked, so copy it first
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp_file:
            tmp_path = tmp_file.name
        # SQLite looks for a write-ahead log at "<database path>-wal".
        source_wal = Path(f"{self.history_db_path}-wal")
        conn = None
        try:
            shutil.copy2(self.history_db_path, tmp_path)
            # Keep the log with the database copy; without it, transactions
            # that only exist in the log would be invisible to the query.
            if source_wal.exists():
                shutil.copy2(source_wal, f"{tmp_path}-wal")
            conn = sqlite3.connect(tmp_path)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            # Safari stores visit time as seconds since 2001-01-01 UTC (Core Data timestamp)
            core_data_epoch = datetime(2001, 1, 1, tzinfo=timezone.utc)
            cutoff_timestamp = (self.cutoff_date - core_data_epoch).total_seconds()
            query = """
                SELECT
                    hv.visit_time,
                    hi.url,
                    hi.title
                FROM history_visits hv
                JOIN history_items hi ON hv.history_item = hi.id
                WHERE hv.visit_time > ?
                ORDER BY hv.visit_time DESC
            """
            cursor.execute(query, (cutoff_timestamp,))
            for row in cursor:
                # Convert Safari's Core Data timestamp to datetime
                visit_time = core_data_epoch + timedelta(seconds=row["visit_time"])
                yield {
                    "url": row["url"],
                    "title": row["title"],
                    "visit_time": visit_time.strftime(UTC_FMT),
                    "browser": "safari",
                    "visit_count": 1,
                }
        except Exception as e:
            logger.error(f"Error parsing Safari history: {e}")
        finally:
            if conn is not None:
                conn.close()
            # Remove the copy and any -wal/-shm files that may sit beside it.
            for leftover in (tmp_path, f"{tmp_path}-wal", f"{tmp_path}-shm"):
                try:
                    Path(leftover).unlink()
                except OSError:
                    pass
class ChromeParser(BrowserParser):
    """History parser for Google Chrome's ``Default`` profile.

    The database location is known for macOS and Linux only; on other
    platforms ``history_db_path`` stays ``None`` and ``parse`` yields nothing.
    """

    def __init__(self, lookback_days: int = 14, since: datetime | None = None):
        super().__init__(lookback_days, since=since)
        os_name = platform.system()
        self.history_db_path = None
        if os_name == "Darwin":
            self.history_db_path = Path.home().joinpath(
                "Library", "Application Support", "Google", "Chrome", "Default", "History"
            )
        elif os_name == "Linux":
            self.history_db_path = Path.home().joinpath(".config", "google-chrome", "Default", "History")

    def parse(self) -> Generator[Dict, None, None]:
        """Yield one record per Chrome visit newer than ``self.cutoff_date``.

        Yields:
            Dicts with ``url``, ``title``, ``visit_time`` (formatted with
            ``UTC_FMT``), ``browser`` (``"chrome"``) and ``visit_count`` (the
            ``urls.visit_count`` column, or 1 when that value is falsy),
            newest visit first.

        Exceptions raised while copying or querying are logged and end the
        iteration instead of propagating.
        """
        if self.history_db_path is None:
            logger.warning(
                "Chrome history parsing skipped (unsupported platform: %s)",
                platform.system(),
            )
            return
        if not self.history_db_path.exists():
            logger.warning(f"Chrome history not found at {self.history_db_path}")
            return

        # Query a throwaway copy, since the live file may be locked.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as scratch:
            copy_path = scratch.name

        db_conn = None
        try:
            shutil.copy2(self.history_db_path, copy_path)
            db_conn = sqlite3.connect(copy_path)
            db_conn.row_factory = sqlite3.Row
            # Chrome timestamps count microseconds from 1601-01-01 UTC.
            epoch_1601 = datetime(1601, 1, 1, tzinfo=timezone.utc)
            elapsed = self.cutoff_date - epoch_1601
            min_visit_us = int(elapsed.total_seconds() * 1_000_000)
            sql = """
                SELECT
                    urls.url,
                    urls.title,
                    urls.visit_count,
                    visits.visit_time
                FROM urls
                LEFT JOIN visits ON urls.id = visits.url
                WHERE visits.visit_time > ?
                ORDER BY visits.visit_time DESC
            """
            for record in db_conn.execute(sql, (min_visit_us,)):
                visited_at = epoch_1601 + timedelta(microseconds=record["visit_time"])
                total_visits = record["visit_count"] or 1
                yield {
                    "url": record["url"],
                    "title": record["title"],
                    "visit_time": visited_at.strftime(UTC_FMT),
                    "browser": "chrome",
                    "visit_count": total_visits,
                }
        except Exception as e:
            logger.error(f"Error parsing Chrome history: {e}")
        finally:
            if db_conn:
                db_conn.close()
            # The copy is disposable; ignore failures while deleting it.
            try:
                Path(copy_path).unlink()
            except OSError:
                pass
+class BrowserManager:
+    """Manage parsing of multiple browsers."""
+    def __init__(self, config: Dict, since: datetime | None = None):
+        self.config = config
+        self.lookback_days = config.get("indexing", {}).get("lookback_days", 14)
+        self.browsers = config.get("browsers", [])
+        self.since = since
+    def parse_all(self) -> Generator[Dict, None, None]:
+        """Parse history from all configured browsers."""
+        for browser in self.browsers:
+            browser_lower = browser.lower()
+            if browser_lower == "safari":
+                parser = SafariParser(self.lookback_days, since=self.since)
+                logger.info("Parsing Safari history...")
+                yield from parser.parse()
+            elif browser_lower == "chrome":
+                parser = ChromeParser(self.lookback_days, since=self.since)
+                logger.info("Parsing Chrome history...")
+                yield from parser.parse()
+            else:
+                logger.warning(f"Unknown browser: {browser}")

footprinter/ingest/chat_dedup.py ADDED Viewed

@@ -0,0 +1,156 @@
+"""Chat dedup detection and merge.
+Orchestrates near-duplicate chat detection via db.chats and merges
+duplicates by combining unique messages from source into target,
+marking the source as status='merged', and updating vector embeddings.
+"""
+import hashlib
+import logging
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+from footprinter.db import chats as chats_db
+logger = logging.getLogger(__name__)
@dataclass
class DuplicateGroup:
    """Chats that were flagged together as potential duplicates.

    Attributes:
        reason: Which check matched: 'exact_title', 'fuzzy_title' or
            'message_overlap'.
        confidence: 'high' or 'medium'.
        chats: The chat dicts that make up the group.
        detail: Human-readable explanation; empty when none is given.
    """

    reason: str
    confidence: str
    chats: List[Dict]
    detail: str = ""
class ChatDedup:
    """Duplicate detection and merge for chats.

    Args:
        db: Database wrapper; only its open connection, ``db.conn``, is used.
    """

    def __init__(self, db):
        self.db = db
        # chat_id -> message content hashes, so repeated lookups skip the DB.
        self._hash_cache: Dict[int, List[str]] = {}

    def _get_hashes(self, chat_id: int) -> List[str]:
        """Return the message content hashes for ``chat_id``, cached per chat."""
        if chat_id not in self._hash_cache:
            self._hash_cache[chat_id] = chats_db.get_chat_message_hashes(self.db.conn, chat_id)
        return self._hash_cache[chat_id]

    def detect_duplicates(self) -> List[DuplicateGroup]:
        """Detect potential duplicate chats.

        Delegates to ``footprinter.db.chats.detect_duplicates`` and converts
        the plain dicts it returns into ``DuplicateGroup`` dataclasses. A
        group without a ``detail`` entry gets an empty detail string.
        """
        from footprinter.db.chats import detect_duplicates as _detect

        raw_groups = _detect(self.db.conn)
        return [
            DuplicateGroup(
                reason=g["reason"],
                confidence=g["confidence"],
                chats=g["chats"],
                # detail is optional on the dataclass, so tolerate its absence.
                detail=g.get("detail", ""),
            )
            for g in raw_groups
        ]

    def merge(
        self,
        target_id: int,
        source_id: int,
        vector_store: Optional[Any] = None,
    ) -> Dict:
        """Merge source chat into target.

        1. Validate both exist and aren't already merged
        2. Hash target's messages
        3. Identify unique messages in source
        4. Move unique messages to target
        5. Recount target's message_count
        6. Mark source as merged
        7. Update vectors if vector_store provided

        Steps 4-6 are committed together; if any of them fails, the
        connection is rolled back and the error is re-raised.

        Args:
            target_id: Chat that receives the unique messages.
            source_id: Chat whose messages are merged in.
            vector_store: Optional store; when given, the source chat's
                vectors are deleted via ``delete_by_metadata``.

        Returns:
            Dict with ``target_id``, ``source_id``, ``target_title``,
            ``source_title``, ``messages_moved``, ``duplicates_skipped``,
            ``new_message_count`` and ``vectors_updated``.

        Raises:
            ValueError: If both ids are equal, either chat is missing, or
                either chat already has status ``"merged"``.
        """
        if target_id == source_id:
            raise ValueError("Cannot merge a chat into itself")

        conn = self.db.conn
        target = chats_db.get_chat_by_id(conn, target_id)
        source = chats_db.get_chat_by_id(conn, source_id)
        if not target:
            raise ValueError(f"Target chat {target_id} not found")
        if not source:
            raise ValueError(f"Source chat {source_id} not found")
        if target.get("status") == "merged":
            raise ValueError(f"Target chat {target_id} is already merged")
        if source.get("status") == "merged":
            raise ValueError(f"Source chat {source_id} is already merged")

        # Hash target's messages to identify what's already there
        target_hashes = set(self._get_hashes(target_id))

        # Find unique messages in source (not already in target).
        # NOTE(review): assumes get_chat_message_hashes also uses SHA-256 of
        # the UTF-8 content - confirm against db.chats.
        source_messages = chats_db.get_chat_messages(conn, source_id)
        unique_message_ids = []
        duplicate_count = 0
        for msg in source_messages:
            content = msg["content"] or ""
            msg_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
            if msg_hash not in target_hashes:
                unique_message_ids.append(msg["id"])
            else:
                duplicate_count += 1

        try:
            # Move unique messages to target
            moved = 0
            if unique_message_ids:
                moved = chats_db.move_messages_to_chat(conn, source_id, target_id, unique_message_ids)
            # Recount target's messages
            new_count = chats_db.update_chat_message_count(conn, target_id)
            # Mark source as merged
            chats_db.mark_chat_merged(conn, source_id, target_id)
            # Commit the entire merge atomically (move + recount + mark)
            conn.commit()
        except Exception:
            # Without this, a half-finished merge would stay pending on the
            # connection and be committed by whatever commits next.
            conn.rollback()
            raise
        finally:
            # Invalidate hash cache
            self._hash_cache.pop(target_id, None)
            self._hash_cache.pop(source_id, None)

        # Update vectors if store provided. Compare against None so a store
        # object that happens to be falsy is still used.
        vectors_updated = False
        if vector_store is not None:
            try:
                # Delete source chat vectors
                vector_store.delete_by_metadata({"chat_id": source_id})
                # Re-index moved messages under target
                # (Caller is responsible for full re-vectorization)
                vectors_updated = True
            except Exception as e:
                logger.warning("Vector update failed (non-fatal): %s", e)

        result = {
            "target_id": target_id,
            "source_id": source_id,
            "target_title": target.get("title"),
            "source_title": source.get("title"),
            "messages_moved": moved,
            "duplicates_skipped": duplicate_count,
            "new_message_count": new_count,
            "vectors_updated": vectors_updated,
        }
        logger.info(
            "Merged chat %d into %d: %d messages moved, %d duplicates skipped",
            source_id,
            target_id,
            moved,
            duplicate_count,
        )
        return result