cosma_backend-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cosma_backend/__init__.py +14 -0
  2. cosma_backend/__main__.py +4 -0
  3. cosma_backend/api/__init__.py +29 -0
  4. cosma_backend/api/files.py +154 -0
  5. cosma_backend/api/index.py +114 -0
  6. cosma_backend/api/models.py +28 -0
  7. cosma_backend/api/search.py +166 -0
  8. cosma_backend/api/status.py +28 -0
  9. cosma_backend/api/updates.py +67 -0
  10. cosma_backend/api/watch.py +156 -0
  11. cosma_backend/app.py +192 -0
  12. cosma_backend/db/__init__.py +2 -0
  13. cosma_backend/db/database.py +638 -0
  14. cosma_backend/discoverer/__init__.py +1 -0
  15. cosma_backend/discoverer/discoverer.py +34 -0
  16. cosma_backend/embedder/__init__.py +1 -0
  17. cosma_backend/embedder/embedder.py +637 -0
  18. cosma_backend/logging.py +73 -0
  19. cosma_backend/models/__init__.py +3 -0
  20. cosma_backend/models/file.py +169 -0
  21. cosma_backend/models/status.py +10 -0
  22. cosma_backend/models/update.py +202 -0
  23. cosma_backend/models/watch.py +132 -0
  24. cosma_backend/pipeline/__init__.py +2 -0
  25. cosma_backend/pipeline/pipeline.py +222 -0
  26. cosma_backend/schema.sql +319 -0
  27. cosma_backend/searcher/__init__.py +1 -0
  28. cosma_backend/searcher/searcher.py +397 -0
  29. cosma_backend/summarizer/__init__.py +44 -0
  30. cosma_backend/summarizer/summarizer.py +1075 -0
  31. cosma_backend/utils/bundled.py +24 -0
  32. cosma_backend/utils/pubsub.py +31 -0
  33. cosma_backend/utils/sse.py +92 -0
  34. cosma_backend/watcher/__init__.py +1 -0
  35. cosma_backend/watcher/awatchdog.py +80 -0
  36. cosma_backend/watcher/watcher.py +257 -0
  37. cosma_backend-0.1.0.dist-info/METADATA +23 -0
  38. cosma_backend-0.1.0.dist-info/RECORD +39 -0
  39. cosma_backend-0.1.0.dist-info/WHEEL +4 -0
cosma_backend/pipeline/pipeline.py
@@ -0,0 +1,222 @@
+ from __future__ import annotations
+
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Iterator, Optional
+ import logging
+
+ from backend.db import Database
+ from backend.logging import sm
+ from backend.models import File
+ from backend.models.status import ProcessingStatus
+ from backend.models.update import Update
+ from backend.discoverer import Discoverer
+ from backend.parser import FileParser
+ from backend.summarizer import AutoSummarizer
+ from backend.embedder import AutoEmbedder
+ from backend.utils.pubsub import Hub
+
+ logger = logging.getLogger(__name__)
+
+
+ class PipelineResult:
+     """Results from processing a batch of files."""
+     def __init__(self):
+         self.discovered = 0
+         self.skipped = 0
+         self.parsed = 0
+         self.summarized = 0
+         self.embedded = 0
+         self.failed = 0
+         self.errors: list[tuple[str, str]] = []  # (file_path, error)
+
+
+ class Pipeline:
+     """
+     Main pipeline orchestrator. Processes files through:
+     Discovery → Parsing → Summarization → Embedding
+     """
+
+     def __init__(
+         self,
+         db: Database,
+         updates_hub: Optional[Hub] = None,
+         discoverer: Optional[Discoverer] = None,
+         parser: Optional[FileParser] = None,
+         summarizer: Optional[AutoSummarizer] = None,
+         embedder: Optional[AutoEmbedder] = None,
+     ):
+         self.db = db
+         self.updates_hub = updates_hub
+         self.discoverer = discoverer or Discoverer()
+         self.parser = parser or FileParser()
+         self.summarizer = summarizer or AutoSummarizer()
+         self.embedder = embedder or AutoEmbedder()
+
+     async def process_directory(self, path: str | Path):
+         """
+         Process all files in a directory through the full pipeline.
+         After processing, deletes any files from the database that weren't seen
+         (i.e., files that no longer exist in the filesystem).
+
+         Args:
+             path: Root directory to process
+
+         Returns:
+             PipelineResult with statistics
+         """
+         # result = PipelineResult()
+
+         # Publish directory processing started
+         self._publish_update(Update.directory_processing_started(str(path)))
+
+         started_processing = datetime.now(timezone.utc)
+
+         # Stage 1: Discovery
+         logger.info(f"Discovering files in {path}")
+         for file in self.discoverer.files_in(path):
+             # result.discovered += 1
+
+             try:
+                 # Update the timestamp to mark this file as still present in the filesystem
+                 await self.db.update_file_timestamp(file.file_path)
+
+                 # Check if file needs processing
+                 if await self._should_skip_file(file):
+                     logger.info(sm("Skipping processing file", file=file))
+                     self._publish_update(Update.file_skipped(
+                         file.file_path,
+                         file.filename,
+                         reason="already processed"
+                     ))
+                     # result.skipped += 1
+                     continue
+
+                 # Process the file through the pipeline
+                 await self.process_file(file)
+
+             except Exception:
+                 continue
+
+         try:
+             logger.info(sm("Deleting files no longer present in filesystem", started_processing=started_processing, path=str(path)))
+             rows = await self.db.delete_files_not_updated_since(started_processing, str(path))
+             logger.info(sm("Deleted unused files", count=len(rows)))
+         except Exception as e:
+             logger.error(sm("Error while deleting unused files", error=str(e)))
+
+         logger.info(sm("Completed processing directory", directory=str(path)))
+         self._publish_update(Update.directory_processing_completed(str(path)))
+
+
+     async def process_file(self, file: File):
+         """
+         Process a single file through the pipeline.
+
+         Args:
+             file_path: Path to the file
+
+         Returns:
+             Processed File or None if failed
+         """
+         # if result is None:
+         #     result = PipelineResult()
+
+         try:
+             # Stage 1: Parse
+             self._publish_update(Update.file_parsing(file.file_path, file.filename))
+             await self.parser.parse_file(file)
+             self._publish_update(Update.file_parsed(file.file_path, file.filename))
+
+             # Check if file hash is different before proceeding
+             if not await self._has_file_changed(file):
+                 logger.info(sm("Skipping processing file, hashed not changed", file=file))
+                 self._publish_update(Update.file_skipped(
+                     file.file_path,
+                     file.filename,
+                     reason="content not changed"
+                 ))
+                 return
+
+             # Stage 2: Summarize
+             self._publish_update(Update.file_summarizing(file.file_path, file.filename))
+             await self.summarizer.summarize(file)
+             await self._save_to_db(file)
+             self._publish_update(Update.file_summarized(file.file_path, file.filename))
+
+             # Stage 3: Embed (if embedder is available)
+             self._publish_update(Update.file_embedding(file.file_path, file.filename))
+             await self.embedder.embed(file)
+             # embeddings need special care when saving
+             await self._save_embeddings(file)
+             self._publish_update(Update.file_embedded(file.file_path, file.filename))
+
+             # Mark as complete
+             self._publish_update(Update.file_complete(file.file_path, file.filename))
+
+         except Exception as e:
+             # result.failed += 1
+             # result.errors.append((str(file_path), str(e)))
+             logger.error(sm("Pipeline failed for file", file=file, error=e))
+
+             # Publish failure update
+             self._publish_update(Update.file_failed(
+                 file.file_path,
+                 file.filename,
+                 error=str(e)
+             ))
+
+             # Save failed state to DB if we have file_data
+             file.status = ProcessingStatus.FAILED
+             file.processing_error = str(e)
+             await self._save_to_db(file)
+
+             raise e
+
+     async def is_supported(self, file: File) -> bool:
+         """Check if a file is supported for processing"""
+         return self.parser.is_supported(file)
+
+     async def _should_skip_file(self, file: File) -> bool:
+         """Check if file should be skipped based on DB state."""
+         if not await self.is_supported(file):
+             return False
+
+         saved_file = await self.db.get_file_by_path(file.file_path)
+
+         if not saved_file or saved_file.status not in (ProcessingStatus.COMPLETE, ProcessingStatus.FAILED):
+             logger.info(sm("Should skip", file=file, status=saved_file.status if saved_file else "No saved file"))
+             return False
+
+         saved_modified = saved_file.modified.replace(microsecond=0)
+         current_modified = file.modified.replace(microsecond=0)
+
+         logger.info(sm("Should skip", file=file, saved_modified=saved_modified, current_modified=current_modified))
+
+         return saved_modified == current_modified
+
+
+
+     async def _has_file_changed(self, file: File) -> bool:
+         """Check if file has been changed based on hash."""
+         saved_file = await self.db.get_file_by_path(file.file_path)
+
+         logger.info(sm("Saved file", saved_file=saved_file, status=saved_file.status if saved_file else "N/A"))
+
+         if not saved_file or saved_file.status is not ProcessingStatus.COMPLETE:
+             return True
+
+         return saved_file.content_hash != file.content_hash
+
+     async def _save_to_db(self, file: File) -> None:
+         """Save file data to database."""
+         await self.db.upsert_file(file)
+
+     async def _save_embeddings(self, file: File) -> None:
+         """Save file embeddings to database."""
+         await self._save_to_db(file)
+         await self.db.upsert_file_embeddings(file)
+
+     def _publish_update(self, update: Any):
+         if self.updates_hub:
+             self.updates_hub.publish(update)
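
For orientation, here is a minimal sketch of how this pipeline could be driven end to end. It relies only on the constructor and process_directory shown above; the Database construction (a plain path argument), the cosma_backend.* import paths (the module's own imports reference a backend namespace, so the effective path may differ), and the asyncio entry point are assumptions for illustration, not part of the published code.

    # Illustrative usage sketch -- not part of the package.
    # Assumes Database accepts a SQLite path and that the wheel exposes these
    # names under the cosma_backend namespace shown in the file list above.
    import asyncio

    from cosma_backend.db import Database
    from cosma_backend.pipeline import Pipeline
    from cosma_backend.utils.pubsub import Hub


    async def main() -> None:
        db = Database("files.db")   # assumption: constructor takes a database path
        hub = Hub()                 # optional pub/sub hub for progress updates
        pipeline = Pipeline(db, updates_hub=hub)

        # Parses, summarizes, and embeds new or changed files under the directory,
        # then prunes database rows for files that no longer exist on disk.
        await pipeline.process_directory("/path/to/documents")


    if __name__ == "__main__":
        asyncio.run(main())
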
cosma_backend/schema.sql
@@ -0,0 +1,319 @@
+ -- =============================================================================
+ -- File Processing and Organization Database Schema
+ -- =============================================================================
+ --
+
+ -- =============================================================================
+ -- Watched Directories Table
+ -- =============================================================================
+
+ -- Table for tracking directories that are being monitored for file changes
+ CREATE TABLE IF NOT EXISTS watched_directories (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     path TEXT NOT NULL UNIQUE,
+     is_active INTEGER DEFAULT 1 CHECK (is_active IN (0, 1)),
+     recursive INTEGER DEFAULT 1 CHECK (recursive IN (0, 1)),
+     file_pattern TEXT, -- Optional glob pattern for filtering files (e.g., "*.pdf")
+     last_scan INTEGER,
+     created_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL,
+     updated_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL
+ );
+
+ -- Index for watched directories
+ CREATE INDEX IF NOT EXISTS idx_watched_directories_is_active ON watched_directories(is_active);
+ CREATE INDEX IF NOT EXISTS idx_watched_directories_path ON watched_directories(path);
+
+ -- Trigger for updating watched_directories timestamp
+ CREATE TRIGGER IF NOT EXISTS update_watched_directories_timestamp
+ AFTER UPDATE ON watched_directories
+ FOR EACH ROW
+ BEGIN
+     UPDATE watched_directories SET updated_at = (strftime('%s', 'now')) WHERE id = NEW.id;
+ END;
+
+ -- =============================================================================
+
+ -- =============================================================================
+ -- Files Table
+ -- =============================================================================
+
+ -- Main files table with comprehensive metadata
+ CREATE TABLE IF NOT EXISTS files (
+     -- Primary key
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+
+     -- Stage 0: Discovery (file system metadata) - Required fields
+     file_path TEXT NOT NULL,
+     filename TEXT NOT NULL,
+     extension TEXT NOT NULL,
+     file_size INTEGER NOT NULL,
+     created INTEGER NOT NULL,
+     modified INTEGER NOT NULL,
+     accessed INTEGER NOT NULL,
+
+     -- Stage 1: Parsing (content extraction)
+     content_type TEXT,
+     content_hash TEXT,
+     parsed_at INTEGER,
+
+     -- Stage 2: Summarization (AI processing)
+     summary TEXT, -- AI-generated summary
+     title TEXT,
+     summarized_at INTEGER,
+
+     -- Stage 3: Embedding (vector representation)
+     embedded_at INTEGER,
+
+     -- Meta
+     status TEXT DEFAULT 'DISCOVERED' CHECK (status IN ('DISCOVERED', 'PARSED', 'SUMMARIZED', 'COMPLETE', 'FAILED')),
+     processing_error TEXT,
+
+     -- File owner and permissions (if available)
+     owner TEXT,
+     permissions TEXT,
+
+     -- System timestamps
+     created_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL,
+     updated_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL
+ );
+
+ -- =============================================================================
+ -- Vector Embeddings Table (using sqlite-vec)
+ -- =============================================================================
+
+ -- Virtual table for storing file embeddings
+ -- Note: Adjust the dimension (e.g., float[384], float[768], float[1536])
+ -- based on your embedding model's output size
+ CREATE VIRTUAL TABLE IF NOT EXISTS file_embeddings USING vec0(
+     file_id INTEGER PRIMARY KEY, -- Links to files.id
+     embedding_model TEXT,
+     embedding_dimensions INTEGER,
+     embedding float[1536]
+ );
+
+ CREATE TRIGGER IF NOT EXISTS delete_file_embeddings
+ AFTER DELETE ON files
+ BEGIN
+     DELETE FROM file_embeddings WHERE file_id = OLD.id;
+ END;
+
+ -- =============================================================================
+ -- Keywords Table
+ -- =============================================================================
+
+ -- Keywords table (many-to-many relationship with files)
+ CREATE TABLE IF NOT EXISTS file_keywords (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     file_id INTEGER NOT NULL,
+     keyword TEXT NOT NULL,
+
+     -- Indexes for performance
+     FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE,
+     UNIQUE(file_id, keyword) -- Prevent duplicate keywords per file
+ );
+
+ -- =============================================================================
+ -- Full-Text Search Table (using FTS5)
+ -- =============================================================================
+
+ -- Create a view that combines summary and keywords for each file
+ CREATE VIEW IF NOT EXISTS files_searchable AS
+ SELECT
+     f.id,
+     f.summary,
+     GROUP_CONCAT(fk.keyword, ' ') AS keywords
+ FROM files f
+ LEFT JOIN file_keywords fk ON f.id = fk.file_id
+ GROUP BY f.id;
+
+ -- Create the contentless FTS5 table
+ CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
+     file_path,
+     title,
+     summary,
+     keywords,
+     content='',
+     contentless_delete=1 -- Use this for UPDATE/DELETE support
+ );
+
+ -- Triggers to keep FTS index synchronized with your data
+ CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         new.id,
+         new.file_path,
+         new.title,
+         new.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM file_keywords fk
+     WHERE fk.file_id = new.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
+     DELETE FROM files_fts WHERE rowid = old.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
+     DELETE FROM files_fts WHERE rowid = old.id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         new.id,
+         new.file_path,
+         new.title,
+         new.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM file_keywords fk
+     WHERE fk.file_id = new.id;
+ END;
+
+ -- Trigger for keyword changes
+ CREATE TRIGGER IF NOT EXISTS file_keywords_ai AFTER INSERT ON file_keywords BEGIN
+     DELETE FROM files_fts WHERE rowid = new.file_id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         f.id,
+         f.file_path,
+         f.title,
+         f.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM files f
+     LEFT JOIN file_keywords fk ON f.id = fk.file_id
+     WHERE f.id = new.file_id
+     GROUP BY f.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS file_keywords_ad AFTER DELETE ON file_keywords BEGIN
+     DELETE FROM files_fts WHERE rowid = old.file_id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         f.id,
+         f.file_path,
+         f.title,
+         f.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM files f
+     LEFT JOIN file_keywords fk ON f.id = fk.file_id
+     WHERE f.id = old.file_id
+     GROUP BY f.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS file_keywords_au AFTER UPDATE ON file_keywords BEGIN
+     DELETE FROM files_fts WHERE rowid = old.file_id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         f.id,
+         f.file_path,
+         f.title,
+         f.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM files f
+     LEFT JOIN file_keywords fk ON f.id = fk.file_id
+     WHERE f.id = old.file_id
+     GROUP BY f.id;
+ END;
+
+ -- =============================================================================
+ -- Processing Statistics Table
+ -- =============================================================================
+
+ -- Processing statistics table
+ CREATE TABLE IF NOT EXISTS processing_stats (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     session_id TEXT NOT NULL, -- Processing session identifier
+     total_files INTEGER DEFAULT 0,
+     processed_files INTEGER DEFAULT 0,
+     failed_files INTEGER DEFAULT 0,
+     skipped_files INTEGER DEFAULT 0,
+     processing_time_seconds REAL,
+     started_at INTEGER NOT NULL,
+     completed_at INTEGER,
+     status TEXT DEFAULT 'running' CHECK (status IN ('running', 'completed', 'failed', 'cancelled'))
+ );
+
+ -- =============================================================================
+ -- Indexes for Performance
+ -- =============================================================================
+
+ -- Main files table indexes
+ CREATE INDEX IF NOT EXISTS idx_files_extension ON files(extension);
+ CREATE INDEX IF NOT EXISTS idx_files_content_hash ON files(content_hash);
+ CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);
+ CREATE INDEX IF NOT EXISTS idx_files_created_at ON files(created_at);
+ CREATE INDEX IF NOT EXISTS idx_files_file_path ON files(file_path);
+ CREATE INDEX IF NOT EXISTS idx_files_filename ON files(filename);
+
+ -- Keywords table indexes
+ CREATE INDEX IF NOT EXISTS idx_keywords_file_id ON file_keywords(file_id);
+ CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON file_keywords(keyword);
+
+ -- Processing stats indexes
+ CREATE INDEX IF NOT EXISTS idx_stats_session_id ON processing_stats(session_id);
+ CREATE INDEX IF NOT EXISTS idx_stats_started_at ON processing_stats(started_at);
+
+ -- =============================================================================
+ -- Triggers for Automatic Timestamp Updates
+ -- =============================================================================
+
+ -- Update the updated_at timestamp when files are modified
+ CREATE TRIGGER IF NOT EXISTS update_files_timestamp
+ AFTER UPDATE ON files
+ FOR EACH ROW
+ BEGIN
+     UPDATE files SET updated_at = (strftime('%s', 'now')) WHERE id = NEW.id;
+ END;
+
+ -- =============================================================================
+ -- Views for Common Queries
+ -- =============================================================================
+
+ -- View for files with their keywords
+ CREATE VIEW IF NOT EXISTS files_with_keywords AS
+ SELECT
+     f.*,
+     GROUP_CONCAT(fk.keyword, ', ') as keywords
+ FROM files f
+ LEFT JOIN file_keywords fk ON f.id = fk.file_id
+ GROUP BY f.id;
+
+ -- View for processing summary
+ CREATE VIEW IF NOT EXISTS processing_summary AS
+ SELECT
+     status,
+     COUNT(*) as count,
+     AVG(file_size) as avg_file_size,
+     SUM(file_size) as total_size
+ FROM files
+ WHERE status IS NOT NULL
+ GROUP BY status;
+
+ -- View for recent activity
+ CREATE VIEW IF NOT EXISTS recent_activity AS
+ SELECT
+     id,
+     filename,
+     extension,
+     status,
+     datetime(created_at, 'unixepoch') as created_date,
+     datetime(parsed_at, 'unixepoch') as parsed_date,
+     datetime(summarized_at, 'unixepoch') as summarized_date,
+     datetime(embedded_at, 'unixepoch') as embedded_date
+ FROM files
+ ORDER BY created_at DESC
+ LIMIT 100;
+
+ -- =============================================================================
+ -- Initial Data (Optional)
+ -- =============================================================================
+
+ -- Insert initial processing session if none exists
+ INSERT OR IGNORE INTO processing_stats (
+     id,
+     session_id,
+     started_at,
+     status
+ ) VALUES (
+     1,
+     'initial_session',
+     (strftime('%s', 'now')),
+     'completed'
+ );
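
The schema pairs a contentless FTS5 index (files_fts) with a sqlite-vec vec0 table (file_embeddings), so lexical and vector retrieval are two separate lookups joined back to files. Below is a minimal Python sketch of both query shapes; the sqlite_vec helper calls (load, serialize_float32), the database path handling, and the KNN "MATCH ... ORDER BY distance LIMIT k" form are assumptions about sqlite-vec's API, not queries shipped in this package.

    # Illustrative query sketch -- not shipped in the package.
    # Assumes the sqlite-vec loadable extension is available via the `sqlite_vec`
    # Python package and that the query embedding matches the float[1536] column.
    import sqlite3

    import sqlite_vec  # assumption: provides load() and serialize_float32()


    def search(db_path: str, fts_query: str, query_embedding: list[float]):
        conn = sqlite3.connect(db_path)
        conn.enable_load_extension(True)
        sqlite_vec.load(conn)   # register the vec0 virtual table module
        conn.enable_load_extension(False)

        # Lexical search: FTS5 match joined back to file metadata.
        lexical = conn.execute(
            """
            SELECT f.file_path, f.title, f.summary
            FROM files_fts
            JOIN files f ON f.id = files_fts.rowid
            WHERE files_fts MATCH ?
            ORDER BY rank
            LIMIT 20
            """,
            (fts_query,),
        ).fetchall()

        # Vector search: k nearest neighbours from the vec0 table,
        # with the query embedding bound as a float32 BLOB parameter.
        vector = conn.execute(
            """
            SELECT f.file_path, knn.distance
            FROM (
                SELECT file_id, distance
                FROM file_embeddings
                WHERE embedding MATCH ?
                ORDER BY distance
                LIMIT 20
            ) AS knn
            JOIN files f ON f.id = knn.file_id
            """,
            (sqlite_vec.serialize_float32(query_embedding),),
        ).fetchall()

        conn.close()
        return lexical, vector
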
cosma_backend/searcher/__init__.py
@@ -0,0 +1 @@
+ from .searcher import HybridSearcher as HybridSearcher
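This re-export means downstream code can import the searcher at the package level; a sketch assuming the wheel's cosma_backend namespace (the code's internal imports use a backend namespace, so the effective path may differ):

    # Illustrative import only -- HybridSearcher's constructor arguments are not
    # shown in this diff, so none are guessed here.
    from cosma_backend.searcher import HybridSearcher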