cosma-backend 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosma_backend/__init__.py +14 -0
- cosma_backend/__main__.py +4 -0
- cosma_backend/api/__init__.py +29 -0
- cosma_backend/api/files.py +154 -0
- cosma_backend/api/index.py +114 -0
- cosma_backend/api/models.py +28 -0
- cosma_backend/api/search.py +166 -0
- cosma_backend/api/status.py +28 -0
- cosma_backend/api/updates.py +67 -0
- cosma_backend/api/watch.py +156 -0
- cosma_backend/app.py +192 -0
- cosma_backend/db/__init__.py +2 -0
- cosma_backend/db/database.py +638 -0
- cosma_backend/discoverer/__init__.py +1 -0
- cosma_backend/discoverer/discoverer.py +34 -0
- cosma_backend/embedder/__init__.py +1 -0
- cosma_backend/embedder/embedder.py +637 -0
- cosma_backend/logging.py +73 -0
- cosma_backend/models/__init__.py +3 -0
- cosma_backend/models/file.py +169 -0
- cosma_backend/models/status.py +10 -0
- cosma_backend/models/update.py +202 -0
- cosma_backend/models/watch.py +132 -0
- cosma_backend/pipeline/__init__.py +2 -0
- cosma_backend/pipeline/pipeline.py +222 -0
- cosma_backend/schema.sql +319 -0
- cosma_backend/searcher/__init__.py +1 -0
- cosma_backend/searcher/searcher.py +397 -0
- cosma_backend/summarizer/__init__.py +44 -0
- cosma_backend/summarizer/summarizer.py +1075 -0
- cosma_backend/utils/bundled.py +24 -0
- cosma_backend/utils/pubsub.py +31 -0
- cosma_backend/utils/sse.py +92 -0
- cosma_backend/watcher/__init__.py +1 -0
- cosma_backend/watcher/awatchdog.py +80 -0
- cosma_backend/watcher/watcher.py +257 -0
- cosma_backend-0.1.0.dist-info/METADATA +23 -0
- cosma_backend-0.1.0.dist-info/RECORD +39 -0
- cosma_backend-0.1.0.dist-info/WHEEL +4 -0
cosma_backend/logging.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from datetime import datetime, date, time, timedelta
|
|
4
|
+
from decimal import Decimal
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from uuid import UUID
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Encoder(json.JSONEncoder):
    """JSON encoder for structured log payloads that never raises.

    Known non-JSON types get a sensible conversion; anything else falls
    back to ``str(o)``, so arbitrary kwargs can always be logged.
    """

    def default(self, o):
        # Sets are not JSON-serializable; emit as a JSON array.
        if isinstance(o, set):
            return tuple(o)

        # datetime/date/time all render via isoformat(). (datetime is a
        # date subclass, but both take the same branch, so one tuple
        # check is equivalent to the three separate checks.)
        if isinstance(o, (datetime, date, time)):
            return o.isoformat()
        if isinstance(o, timedelta):
            return o.total_seconds()

        # Handle numeric types
        if isinstance(o, Decimal):
            return float(o)

        # Handle UUID (common in databases)
        if isinstance(o, UUID):
            return str(o)

        # Handle Path objects
        if isinstance(o, Path):
            return str(o)

        # Handle bytes
        if isinstance(o, bytes):
            return o.decode('utf-8', errors='replace')

        # Handle numpy arrays: summarize instead of dumping the data,
        # keeping log lines small.
        if isinstance(o, np.ndarray):
            return f"<ndarray shape={o.shape} dtype={o.dtype}>"

        # Handle File model from backend.models. Duck-typed by class
        # name — presumably to avoid importing the model here; verify.
        if hasattr(o, '__class__') and o.__class__.__name__ == 'File':
            status = getattr(o, 'status', None)  # hoisted: was looked up three times
            return {
                'id': getattr(o, 'id', None),
                'filename': getattr(o, 'filename', None),
                'file_path': getattr(o, 'file_path', None),
                'status': status.name if hasattr(status, 'name') else str(status),
                'content_hash': getattr(o, 'content_hash', None)
            }

        # Catch-all. Fix: every object has __str__, so the previous
        # `if hasattr(o, '__str__')` guard was always true and the
        # trailing `return repr(o)` was unreachable dead code.
        return str(o)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class StructuredMessage:
    """A log message with keyword fields.

    ``str()`` renders the message followed by the kwargs JSON-encoded
    via ``Encoder``, so rendering is deferred until the log record is
    actually formatted.
    """

    def __init__(self, message, /, **kwargs):
        self.message = message
        self.kwargs = kwargs

    def __str__(self):
        return f'{self.message} {Encoder().encode(self.kwargs)}'


sm = StructuredMessage  # optional, to improve readability
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import sqlite3
|
|
5
|
+
from typing import Any, Optional, List, Self, TYPE_CHECKING
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from backend.models.status import ProcessingStatus
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from backend.api.models import FileResponse
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class File:
    """
    A unified file model that progresses through the pipeline stages.

    Each stage adds more data to the model: discovery fills filesystem
    metadata, parsing adds content fields, summarization adds the
    summary/title/keywords, and embedding adds the vector fields.
    """
    # Stage 0: Discovery (file system metadata)
    path: Path
    file_path: str  # str(path); kept alongside `path` for DB storage
    filename: str
    extension: str
    file_size: int
    created: datetime
    modified: datetime
    accessed: datetime

    # Stage 1: Parsing (content extraction)
    id: Optional[int] = None
    content_type: Optional[str] = None
    content: Optional[str] = None
    content_hash: Optional[str] = None
    parsed_at: Optional[datetime] = None

    # Stage 2: Summarization (AI processing)
    summary: Optional[str] = None
    title: Optional[str] = None
    keywords: Optional[List[str]] = None
    summarized_at: Optional[datetime] = None

    # Stage 3: Embedding (vector representation)
    embedding: Optional[np.ndarray] = None
    embedding_model: Optional[str] = None
    embedding_dimensions: Optional[int] = None
    embedded_at: Optional[datetime] = None

    # Meta
    status: ProcessingStatus = ProcessingStatus.DISCOVERED
    processing_error: Optional[str] = None

    @classmethod
    def from_path(cls, path: Path) -> Self:
        """
        Create a discovery-stage File from a filesystem path.

        Args:
            path: Path to an existing file; resolved to an absolute path.

        Returns:
            A File populated with filesystem metadata only.

        Raises:
            OSError: if the path cannot be stat()-ed.
        """
        path = path.resolve()
        file_stats = path.stat()

        return cls(
            path=path,
            file_path=str(path),
            filename=path.name,
            extension=path.suffix,
            file_size=file_stats.st_size,
            created=datetime.fromtimestamp(file_stats.st_ctime),
            modified=datetime.fromtimestamp(file_stats.st_mtime),
            accessed=datetime.fromtimestamp(file_stats.st_atime),
        )

    @classmethod
    def from_row(cls, row: sqlite3.Row) -> Self:
        """
        Create a File instance from a database row.

        Required columns: file_path, filename, extension, file_size,
        created, modified, accessed. All other columns are optional and
        default to None. Note: `content` and the embedding fields are
        never populated from the row by this method.

        Args:
            row: A database row (dict-like object with column names as keys)

        Returns:
            A File instance populated with data from the row
        """
        # Helper function to safely get a value from a Row object.
        # sqlite3.Row raises IndexError (not KeyError) for unknown
        # column names; tolerate both so optional columns may be absent.
        def get_value(key: str) -> Optional[Any]:
            try:
                return row[key]
            except (KeyError, IndexError):
                return None

        # Helper function to parse unix timestamps from database
        def parse_timestamp(value) -> Optional[datetime]:
            if not value:
                return None

            # If already a datetime object, return it
            if isinstance(value, datetime):
                return value

            # Parse unix timestamp
            try:
                return datetime.fromtimestamp(value)
            # Fix: fromtimestamp raises TypeError on non-numeric input
            # and OverflowError/OSError on out-of-range values; the
            # previous (ValueError, AttributeError) clause let those
            # crash from_row instead of degrading to None as intended.
            except (ValueError, TypeError, OverflowError, OSError, AttributeError):
                logger.warning(f"Failed to parse timestamp: {value}")
                return None

        # Parse status from string to enum. Fix: use get_value so a
        # missing status column falls back to DISCOVERED instead of
        # raising (previously row["status"] was indexed directly).
        status_value = get_value("status")
        status = ProcessingStatus[status_value] if status_value else ProcessingStatus.DISCOVERED

        # Parse timestamps (they're stored as UNIX timestamps in the database)
        created = parse_timestamp(row["created"])
        modified = parse_timestamp(row["modified"])
        accessed = parse_timestamp(row["accessed"])
        parsed_at = parse_timestamp(get_value("parsed_at"))
        summarized_at = parse_timestamp(get_value("summarized_at"))
        embedded_at = parse_timestamp(get_value("embedded_at"))

        # Parse keywords if present (stored as comma or || separated string)
        keywords = None
        keywords_value = get_value("keywords") or get_value("keywords_str")
        if keywords_value:
            # Handle both comma and || separators
            keywords = [k.strip() for k in keywords_value.replace("||", ",").split(",") if k.strip()]

        return cls(
            id=get_value("id"),
            path=Path(row["file_path"]),
            file_path=row["file_path"],
            filename=row["filename"],
            extension=row["extension"],
            file_size=row["file_size"],
            created=created,
            modified=modified,
            accessed=accessed,
            content_type=get_value("content_type"),
            content_hash=get_value("content_hash"),
            parsed_at=parsed_at,
            summary=get_value("summary"),
            title=get_value("title"),
            keywords=keywords,
            summarized_at=summarized_at,
            embedded_at=embedded_at,
            status=status,
            processing_error=get_value("processing_error"),
        )

    def to_response(self) -> "FileResponse":
        """
        Convert this File instance to a FileResponse for API serialization.

        Returns:
            A FileResponse instance with the relevant fields from this File
        """
        # Local import — presumably to avoid a circular import with
        # backend.api.models (mirrors the TYPE_CHECKING guard above).
        from backend.api.models import FileResponse

        return FileResponse(
            file_path=self.file_path,
            filename=self.filename,
            extension=self.extension,
            created=self.created,
            modified=self.modified,
            accessed=self.accessed,
            title=self.title,
            summary=self.summary,
        )
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Any, Dict, Self
|
|
3
|
+
import enum
|
|
4
|
+
|
|
5
|
+
from backend.utils.sse import ServerSentEvent
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class UpdateOpcode(enum.Enum):
    """
    Opcodes for different types of backend updates sent to the frontend via SSE.

    NOTE(review): each member's *value* is the wire string placed in the
    payload's "opcode" field by Update.to_dict(), so the frontend
    depends on these exact strings — renaming a value is a breaking
    protocol change. Member order is also the enum's iteration order.
    """
    # File processing updates — pipeline stage transitions for one file
    FILE_PARSING = "file_parsing"
    FILE_PARSED = "file_parsed"
    FILE_SUMMARIZING = "file_summarizing"
    FILE_SUMMARIZED = "file_summarized"
    FILE_EMBEDDING = "file_embedding"
    FILE_EMBEDDED = "file_embedded"
    FILE_COMPLETE = "file_complete"
    FILE_FAILED = "file_failed"
    FILE_SKIPPED = "file_skipped"

    # File system events (from watcher)
    FILE_CREATED = "file_created"
    FILE_MODIFIED = "file_modified"
    FILE_DELETED = "file_deleted"
    FILE_MOVED = "file_moved"

    # Watch directory updates
    WATCH_ADDED = "watch_added"
    WATCH_REMOVED = "watch_removed"
    WATCH_STARTED = "watch_started"

    # Directory processing updates
    DIRECTORY_PROCESSING_STARTED = "directory_processing_started"
    DIRECTORY_PROCESSING_COMPLETED = "directory_processing_completed"

    # General updates
    STATUS_UPDATE = "status_update"
    ERROR = "error"
    INFO = "info"

    # Sent when the backend begins shutting down
    SHUTTING_DOWN = "shutting_down"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class Update:
    """
    A model representing a backend update to be sent to the frontend via SSE.

    Each update has an opcode (message type) and optional data payload.
    Use the convenience classmethods (file_parsed, error, ...) rather
    than constructing instances directly; they fix the opcode and name
    the expected payload keys.
    """
    opcode: UpdateOpcode
    data: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def create(cls, opcode: UpdateOpcode, **kwargs) -> Self:
        """
        Create an Update instance with the given opcode and data.

        Args:
            opcode: The type of update (UpdateOpcode enum value)
            **kwargs: Arbitrary keyword arguments that will be stored in the data dict

        Returns:
            An Update instance

        Example:
            >>> update = Update.create(UpdateOpcode.FILE_CREATED, path="/docs/file.pdf", size=1024)
            >>> update.opcode
            <UpdateOpcode.FILE_CREATED: 'file_created'>
            >>> update.data
            {'path': '/docs/file.pdf', 'size': 1024}
        """
        # NOTE(review): the original example used UpdateOpcode.FILE_DISCOVERED,
        # which is not a member of the enum; corrected to FILE_CREATED.
        return cls(opcode=opcode, data=kwargs)

    @classmethod
    def file_parsing(cls, path: str, filename: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_PARSING update."""
        return cls.create(UpdateOpcode.FILE_PARSING, path=path, filename=filename, **kwargs)

    @classmethod
    def file_parsed(cls, path: str, filename: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_PARSED update."""
        return cls.create(UpdateOpcode.FILE_PARSED, path=path, filename=filename, **kwargs)

    @classmethod
    def file_summarizing(cls, path: str, filename: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_SUMMARIZING update."""
        return cls.create(UpdateOpcode.FILE_SUMMARIZING, path=path, filename=filename, **kwargs)

    @classmethod
    def file_summarized(cls, path: str, filename: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_SUMMARIZED update."""
        return cls.create(UpdateOpcode.FILE_SUMMARIZED, path=path, filename=filename, **kwargs)

    @classmethod
    def file_embedding(cls, path: str, filename: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_EMBEDDING update."""
        return cls.create(UpdateOpcode.FILE_EMBEDDING, path=path, filename=filename, **kwargs)

    @classmethod
    def file_embedded(cls, path: str, filename: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_EMBEDDED update."""
        return cls.create(UpdateOpcode.FILE_EMBEDDED, path=path, filename=filename, **kwargs)

    @classmethod
    def file_complete(cls, path: str, filename: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_COMPLETE update."""
        return cls.create(UpdateOpcode.FILE_COMPLETE, path=path, filename=filename, **kwargs)

    @classmethod
    def file_skipped(cls, path: str, filename: str, reason: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_SKIPPED update."""
        return cls.create(UpdateOpcode.FILE_SKIPPED, path=path, filename=filename, reason=reason, **kwargs)

    @classmethod
    def file_failed(cls, path: str, filename: str, error: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_FAILED update."""
        return cls.create(UpdateOpcode.FILE_FAILED, path=path, filename=filename, error=error, **kwargs)

    @classmethod
    def file_created(cls, path: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_CREATED update."""
        return cls.create(UpdateOpcode.FILE_CREATED, path=path, **kwargs)

    @classmethod
    def file_modified(cls, path: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_MODIFIED update."""
        return cls.create(UpdateOpcode.FILE_MODIFIED, path=path, **kwargs)

    @classmethod
    def file_deleted(cls, path: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_DELETED update."""
        return cls.create(UpdateOpcode.FILE_DELETED, path=path, **kwargs)

    @classmethod
    def file_moved(cls, src_path: str, dest_path: str, **kwargs) -> Self:
        """Convenience method for creating a FILE_MOVED update."""
        return cls.create(UpdateOpcode.FILE_MOVED, src_path=src_path, dest_path=dest_path, **kwargs)

    @classmethod
    def directory_processing_started(cls, path: str, **kwargs) -> Self:
        """Convenience method for creating a DIRECTORY_PROCESSING_STARTED update."""
        return cls.create(UpdateOpcode.DIRECTORY_PROCESSING_STARTED, path=path, **kwargs)

    @classmethod
    def directory_processing_completed(cls, path: str, **kwargs) -> Self:
        """Convenience method for creating a DIRECTORY_PROCESSING_COMPLETED update."""
        return cls.create(UpdateOpcode.DIRECTORY_PROCESSING_COMPLETED, path=path, **kwargs)

    @classmethod
    def error(cls, message: str, **kwargs) -> Self:
        """Convenience method for creating an ERROR update."""
        return cls.create(UpdateOpcode.ERROR, message=message, **kwargs)

    @classmethod
    def info(cls, message: str, **kwargs) -> Self:
        """Convenience method for creating an INFO update."""
        return cls.create(UpdateOpcode.INFO, message=message, **kwargs)

    @classmethod
    def shutting_down(cls, **kwargs) -> Self:
        """Convenience method for creating a SHUTTING_DOWN update."""
        return cls.create(UpdateOpcode.SHUTTING_DOWN, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the Update to a dictionary for serialization.

        Returns:
            A dictionary with 'opcode' and 'data' keys; 'opcode' is the
            enum's string value (the wire string the frontend matches on)
        """
        return {
            "opcode": self.opcode.value,
            "data": self.data
        }

    def to_sse(self, event_id: str | None = None) -> ServerSentEvent:
        """
        Convert the Update to a ServerSentEvent for SSE transmission.

        Args:
            event_id: Optional event ID for SSE reconnection support

        Returns:
            A ServerSentEvent instance ready to be encoded and sent

        Example:
            >>> update = Update.file_parsed("/docs/file.pdf", "file.pdf")
            >>> sse = update.to_sse()
            >>> message = sse.encode()  # Ready to send via SSE endpoint
        """
        return ServerSentEvent(
            data=self.to_dict(),
            event="update",  # All updates use the same event type
            id=event_id,
        )

    def __str__(self) -> str:
        """Return a string representation of the update."""
        return f"Update(opcode={self.opcode.value}, data={self.data})"
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import sqlite3
|
|
5
|
+
from typing import Any, Optional, Self
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
|
|
12
|
+
class WatchedDirectory:
|
|
13
|
+
"""
|
|
14
|
+
A model representing a directory being watched for file changes.
|
|
15
|
+
"""
|
|
16
|
+
# Core fields
|
|
17
|
+
path: Path
|
|
18
|
+
is_active: bool = True
|
|
19
|
+
recursive: bool = True
|
|
20
|
+
file_pattern: Optional[str] = None
|
|
21
|
+
|
|
22
|
+
# Database fields
|
|
23
|
+
id: Optional[int] = None
|
|
24
|
+
last_scan: Optional[datetime] = None
|
|
25
|
+
created_at: Optional[datetime] = None
|
|
26
|
+
updated_at: Optional[datetime] = None
|
|
27
|
+
|
|
28
|
+
@classmethod
|
|
29
|
+
def from_path(cls, path: Path, recursive: bool = True, file_pattern: Optional[str] = None) -> Self:
|
|
30
|
+
"""
|
|
31
|
+
Create a WatchedDirectory instance from a path.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
path: Path to the directory to watch
|
|
35
|
+
recursive: Whether to watch subdirectories recursively
|
|
36
|
+
file_pattern: Optional glob pattern for filtering files (e.g., "*.pdf")
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
A WatchedDirectory instance
|
|
40
|
+
"""
|
|
41
|
+
path = path.resolve()
|
|
42
|
+
|
|
43
|
+
if not path.exists():
|
|
44
|
+
raise ValueError(f"Path does not exist: {path}")
|
|
45
|
+
|
|
46
|
+
if not path.is_dir():
|
|
47
|
+
raise ValueError(f"Path is not a directory: {path}")
|
|
48
|
+
|
|
49
|
+
return cls(
|
|
50
|
+
path=path,
|
|
51
|
+
recursive=recursive,
|
|
52
|
+
file_pattern=file_pattern,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
@classmethod
|
|
56
|
+
def from_row(cls, row: sqlite3.Row) -> Self:
|
|
57
|
+
"""
|
|
58
|
+
Create a WatchedDirectory instance from a database row.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
row: A database row (dict-like object with column names as keys)
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
A WatchedDirectory instance populated with data from the row
|
|
65
|
+
"""
|
|
66
|
+
# Helper function to safely get a value from a Row object
|
|
67
|
+
def get_value(key: str) -> Optional[Any]:
|
|
68
|
+
try:
|
|
69
|
+
return row[key]
|
|
70
|
+
except (KeyError, IndexError):
|
|
71
|
+
return None
|
|
72
|
+
|
|
73
|
+
# Helper function to parse unix timestamps from database
|
|
74
|
+
def parse_timestamp(value) -> Optional[datetime]:
|
|
75
|
+
if not value:
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
# If already a datetime object, return it
|
|
79
|
+
if isinstance(value, datetime):
|
|
80
|
+
return value
|
|
81
|
+
|
|
82
|
+
# Parse unix timestamp
|
|
83
|
+
try:
|
|
84
|
+
return datetime.fromtimestamp(value)
|
|
85
|
+
except (ValueError, AttributeError):
|
|
86
|
+
logger.warning(f"Failed to parse timestamp: {value}")
|
|
87
|
+
return None
|
|
88
|
+
|
|
89
|
+
# Parse timestamps
|
|
90
|
+
last_scan = parse_timestamp(get_value("last_scan"))
|
|
91
|
+
created_at = parse_timestamp(get_value("created_at"))
|
|
92
|
+
updated_at = parse_timestamp(get_value("updated_at"))
|
|
93
|
+
|
|
94
|
+
return cls(
|
|
95
|
+
id=get_value("id"),
|
|
96
|
+
path=Path(row["path"]),
|
|
97
|
+
is_active=bool(row["is_active"]),
|
|
98
|
+
recursive=bool(row["recursive"]),
|
|
99
|
+
file_pattern=get_value("file_pattern"),
|
|
100
|
+
last_scan=last_scan,
|
|
101
|
+
created_at=created_at,
|
|
102
|
+
updated_at=updated_at,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
@property
|
|
106
|
+
def path_str(self) -> str:
|
|
107
|
+
"""Return the path as a string."""
|
|
108
|
+
return str(self.path)
|
|
109
|
+
|
|
110
|
+
def to_response(self) -> "JobResponse":
|
|
111
|
+
"""
|
|
112
|
+
Convert this WatchedDirectory instance to a JobResponse for API serialization.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
A JobResponse instance with the relevant fields from this WatchedDirectory
|
|
116
|
+
"""
|
|
117
|
+
from backend.api.models import JobResponse
|
|
118
|
+
|
|
119
|
+
return JobResponse(
|
|
120
|
+
id=self.id,
|
|
121
|
+
path=self.path_str,
|
|
122
|
+
is_active=self.is_active,
|
|
123
|
+
recursive=self.recursive,
|
|
124
|
+
file_pattern=self.file_pattern,
|
|
125
|
+
last_scan=self.last_scan,
|
|
126
|
+
created_at=self.created_at,
|
|
127
|
+
updated_at=self.updated_at,
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
def __str__(self) -> str:
|
|
131
|
+
"""Return a string representation of the watched directory."""
|
|
132
|
+
return f"WatchedDirectory(id={self.id}, path={self.path}, active={self.is_active})"
|