bot-knows 0.1.0 (bot_knows-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bot_knows/__init__.py +70 -0
- bot_knows/config.py +115 -0
- bot_knows/domain/__init__.py +5 -0
- bot_knows/domain/chat.py +62 -0
- bot_knows/domain/message.py +64 -0
- bot_knows/domain/relation.py +56 -0
- bot_knows/domain/topic.py +132 -0
- bot_knows/domain/topic_evidence.py +55 -0
- bot_knows/importers/__init__.py +12 -0
- bot_knows/importers/base.py +116 -0
- bot_knows/importers/chatgpt.py +154 -0
- bot_knows/importers/claude.py +172 -0
- bot_knows/importers/generic_json.py +272 -0
- bot_knows/importers/registry.py +125 -0
- bot_knows/infra/__init__.py +5 -0
- bot_knows/infra/llm/__init__.py +6 -0
- bot_knows/infra/llm/anthropic_provider.py +172 -0
- bot_knows/infra/llm/openai_provider.py +195 -0
- bot_knows/infra/mongo/__init__.py +5 -0
- bot_knows/infra/mongo/client.py +145 -0
- bot_knows/infra/mongo/repositories.py +348 -0
- bot_knows/infra/neo4j/__init__.py +5 -0
- bot_knows/infra/neo4j/client.py +152 -0
- bot_knows/infra/neo4j/graph_repository.py +329 -0
- bot_knows/infra/redis/__init__.py +6 -0
- bot_knows/infra/redis/cache.py +198 -0
- bot_knows/infra/redis/client.py +193 -0
- bot_knows/interfaces/__init__.py +18 -0
- bot_knows/interfaces/embedding.py +55 -0
- bot_knows/interfaces/graph.py +194 -0
- bot_knows/interfaces/llm.py +70 -0
- bot_knows/interfaces/recall.py +92 -0
- bot_knows/interfaces/storage.py +225 -0
- bot_knows/logging.py +101 -0
- bot_knows/models/__init__.py +22 -0
- bot_knows/models/chat.py +55 -0
- bot_knows/models/ingest.py +70 -0
- bot_knows/models/message.py +49 -0
- bot_knows/models/recall.py +58 -0
- bot_knows/models/topic.py +100 -0
- bot_knows/orchestrator.py +398 -0
- bot_knows/py.typed +0 -0
- bot_knows/services/__init__.py +24 -0
- bot_knows/services/chat_processing.py +182 -0
- bot_knows/services/dedup_service.py +161 -0
- bot_knows/services/graph_service.py +217 -0
- bot_knows/services/message_builder.py +135 -0
- bot_knows/services/recall_service.py +296 -0
- bot_knows/services/tasks.py +128 -0
- bot_knows/services/topic_extraction.py +199 -0
- bot_knows/utils/__init__.py +22 -0
- bot_knows/utils/hashing.py +126 -0
- bot_knows-0.1.0.dist-info/METADATA +294 -0
- bot_knows-0.1.0.dist-info/RECORD +56 -0
- bot_knows-0.1.0.dist-info/WHEEL +4 -0
- bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
bot_knows/interfaces/storage.py
ADDED
@@ -0,0 +1,225 @@
"""Storage interface for bot_knows.

This module defines the Protocol for persistent storage operations.
"""

from typing import ClassVar, Protocol, runtime_checkable

from bot_knows.models.chat import ChatDTO
from bot_knows.models.message import MessageDTO
from bot_knows.models.recall import TopicRecallStateDTO
from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO

__all__ = [
    "StorageInterface",
]


@runtime_checkable
class StorageInterface(Protocol):
    """Contract for persistent storage operations.

    Implementations should provide CRUD operations for
    chats, messages, topics, evidence, and recall state.
    """

    config_class: ClassVar[type | None] = None

    # Chat operations
    async def save_chat(self, chat: ChatDTO) -> str:
        """Save a chat to storage.

        Args:
            chat: Chat data to save

        Returns:
            Chat ID
        """
        ...

    async def get_chat(self, chat_id: str) -> ChatDTO | None:
        """Get a chat by ID.

        Args:
            chat_id: Chat ID to retrieve

        Returns:
            ChatDTO if found, None otherwise
        """
        ...

    async def chat_exists(self, chat_id: str) -> bool:
        """Check if a chat exists.

        Args:
            chat_id: Chat ID to check

        Returns:
            True if exists, False otherwise
        """
        ...

    async def find_chats_by_source(self, source: str) -> list[ChatDTO]:
        """Find all chats from a given source.

        Args:
            source: Import source to filter by

        Returns:
            List of matching chats
        """
        ...

    # Message operations
    async def save_message(self, message: MessageDTO) -> str:
        """Save a message to storage.

        Args:
            message: Message data to save

        Returns:
            Message ID
        """
        ...

    async def get_message(self, message_id: str) -> MessageDTO | None:
        """Get a message by ID.

        Args:
            message_id: Message ID to retrieve

        Returns:
            MessageDTO if found, None otherwise
        """
        ...

    async def get_messages_for_chat(self, chat_id: str) -> list[MessageDTO]:
        """Get all messages for a chat.

        Args:
            chat_id: Chat ID to query

        Returns:
            List of messages, ordered by timestamp
        """
        ...

    # Topic operations
    async def save_topic(self, topic: TopicDTO) -> str:
        """Save a topic to storage.

        Args:
            topic: Topic data to save

        Returns:
            Topic ID
        """
        ...

    async def get_topic(self, topic_id: str) -> TopicDTO | None:
        """Get a topic by ID.

        Args:
            topic_id: Topic ID to retrieve

        Returns:
            TopicDTO if found, None otherwise
        """
        ...

    async def update_topic(self, topic: TopicDTO) -> None:
        """Update an existing topic.

        Args:
            topic: Updated topic data
        """
        ...

    async def find_similar_topics(
        self,
        embedding: list[float],
        threshold: float,
    ) -> list[tuple[TopicDTO, float]]:
        """Find topics with similar embeddings.

        Args:
            embedding: Query embedding vector
            threshold: Minimum similarity threshold

        Returns:
            List of (TopicDTO, similarity) tuples, sorted by similarity desc
        """
        ...

    async def get_all_topics(self, limit: int = 1000) -> list[TopicDTO]:
        """Get all topics (for batch operations).

        Args:
            limit: Maximum number of topics to return

        Returns:
            List of topics
        """
        ...

    # Evidence operations
    async def append_evidence(self, evidence: TopicEvidenceDTO) -> str:
        """Append evidence record (never update or delete).

        Args:
            evidence: Evidence data to append

        Returns:
            Evidence ID
        """
        ...

    async def get_evidence_for_topic(self, topic_id: str) -> list[TopicEvidenceDTO]:
        """Get all evidence for a topic.

        Args:
            topic_id: Topic ID to query

        Returns:
            List of evidence records
        """
        ...

    # Recall state operations
    async def save_recall_state(self, state: TopicRecallStateDTO) -> None:
        """Save or update recall state for a topic.

        Args:
            state: Recall state to save
        """
        ...

    async def get_recall_state(self, topic_id: str) -> TopicRecallStateDTO | None:
        """Get recall state for a topic.

        Args:
            topic_id: Topic ID to query

        Returns:
            TopicRecallStateDTO if found, None otherwise
        """
        ...

    async def get_due_topics(self, threshold: float) -> list[TopicRecallStateDTO]:
        """Get topics due for recall review.

        Args:
            threshold: Strength threshold (topics below this are due)

        Returns:
            List of recall states for due topics
        """
        ...

    async def get_all_recall_states(self) -> list[TopicRecallStateDTO]:
        """Get all recall states (for batch decay updates).

        Returns:
            List of all recall states
        """
        ...
bot_knows/logging.py
ADDED
@@ -0,0 +1,101 @@
"""Structured logging for bot_knows.

This module provides a configured structlog logger with JSON output
for production and pretty console output for development.
"""

import logging
import sys
from typing import Any

import structlog

__all__ = [
    "configure_logging",
    "get_logger",
]


def configure_logging(
    level: int = logging.INFO,
    json_output: bool = False,
    add_timestamp: bool = True,
) -> None:
    """Configure structlog for the application.

    Args:
        level: Logging level (default: INFO)
        json_output: If True, output JSON; if False, pretty console output
        add_timestamp: If True, add ISO timestamp to log entries
    """
    # Common processors
    processors: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.UnicodeDecoder(),
    ]

    if add_timestamp:
        processors.insert(0, structlog.processors.TimeStamper(fmt="iso"))

    if json_output:
        processors.append(structlog.processors.JSONRenderer())
    else:
        processors.append(
            structlog.dev.ConsoleRenderer(
                colors=True,
                exception_formatter=structlog.dev.plain_traceback,
            )
        )

    # Configure structlog
    structlog.configure(
        processors=processors,
        wrapper_class=structlog.stdlib.BoundLogger,
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )

    # Configure standard library logging
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=level,
    )

    # Set levels for noisy third-party loggers
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)
    logging.getLogger("motor").setLevel(logging.WARNING)
    logging.getLogger("neo4j").setLevel(logging.WARNING)


def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Get a configured structlog logger.

    Args:
        name: Logger name (usually __name__ of the calling module)

    Returns:
        Configured structlog BoundLogger instance
    """
    return structlog.get_logger(name)


# Convenience: configure with defaults on import if not already configured
_configured = False


def _ensure_configured() -> None:
    """Ensure logging is configured with defaults."""
    global _configured
    if not _configured:
        configure_logging()
        _configured = True


_ensure_configured()
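A short usage sketch based on the module above (the event name and fields are illustrative). Note that importing bot_knows.logging already applies the console defaults via _ensure_configured(), so an explicit configure_logging call is only needed to change them:

```python
from bot_knows.logging import configure_logging, get_logger

configure_logging(json_output=True)  # switch from pretty console output to JSON logs
log = get_logger(__name__)
log.info("chat_imported", source="chatgpt", message_count=42)
```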
bot_knows/models/__init__.py
ADDED
@@ -0,0 +1,22 @@
"""Public DTO models for bot_knows.

This module exports all public data transfer objects.
"""

from bot_knows.models.chat import ChatCategory, ChatDTO
from bot_knows.models.ingest import ChatIngest, IngestMessage
from bot_knows.models.message import MessageDTO
from bot_knows.models.recall import RecallItemDTO, TopicRecallStateDTO
from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO

__all__ = [
    "ChatCategory",
    "ChatDTO",
    "ChatIngest",
    "IngestMessage",
    "MessageDTO",
    "RecallItemDTO",
    "TopicDTO",
    "TopicEvidenceDTO",
    "TopicRecallStateDTO",
]
bot_knows/models/chat.py
ADDED
@@ -0,0 +1,55 @@
"""Chat models for bot_knows.

These models represent processed chats in the knowledge base.
"""

from enum import StrEnum

from pydantic import BaseModel, Field

__all__ = [
    "ChatCategory",
    "ChatDTO",
]


class ChatCategory(StrEnum):
    """Categories for chat classification.

    Used by the LLM-based classifier to categorize chats
    based on their content and purpose.
    """

    CODING = "coding"
    RESEARCH = "research"
    WRITING = "writing"
    BRAINSTORMING = "brainstorming"
    DEBUGGING = "debugging"
    LEARNING = "learning"
    GENERAL = "general"
    OTHER = "other"


class ChatDTO(BaseModel, frozen=True):
    """Public Chat data transfer object.

    Represents a processed chat in the knowledge base.
    Chats contain metadata only - message content is stored separately.

    Attributes:
        id: Deterministic chat ID (SHA256 of title + source + timestamp)
        title: Chat title (resolved from import or first message)
        source: Import source identifier
        category: LLM-classified category
        tags: Free-form tags from classification
        created_on: Chat creation timestamp in epoch seconds
        schema_version: Schema version for forward compatibility
    """

    id: str = Field(description="SHA256 hash of title + source + timestamp")
    title: str
    source: str = Field(description="Import source (chatgpt, claude, etc.)")
    category: ChatCategory = Field(default=ChatCategory.GENERAL)
    tags: list[str] = Field(default_factory=list)
    created_on: int = Field(description="Epoch seconds")
    schema_version: int = Field(default=1)
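Constructing a ChatDTO by hand, purely as an illustration of the documented shape. The package's actual ID recipe lives in bot_knows/utils/hashing.py (not shown in this section), so the hashing below is an assumption, not the library's implementation:

```python
import hashlib

from bot_knows.models.chat import ChatCategory, ChatDTO

title, source, created_on = "Fix asyncio deadlock", "claude", 1_700_000_000
chat = ChatDTO(
    id=hashlib.sha256(f"{title}{source}{created_on}".encode()).hexdigest(),  # illustrative hash only
    title=title,
    source=source,
    category=ChatCategory.DEBUGGING,
    tags=["python", "asyncio"],
    created_on=created_on,
)
```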
bot_knows/models/ingest.py
ADDED
@@ -0,0 +1,70 @@
"""Ingestion boundary models for bot_knows.

These frozen Pydantic models define the contract between import adapters
and the domain processing layer. They are immutable and validated at creation.
"""

from typing import Literal

from pydantic import BaseModel, Field

__all__ = [
    "ChatIngest",
    "IngestMessage",
]


class IngestMessage(BaseModel, frozen=True):
    """Single message from import source.

    This is a frozen (immutable) model representing one message
    in its raw form from the import source.

    Attributes:
        role: Message author role (user, assistant, or system)
        content: Message text content
        timestamp: Message timestamp in epoch seconds
        chat_id: Provider's original chat/conversation identifier
        schema_version: Schema version for forward compatibility
    """

    role: Literal["user", "assistant", "system"]
    content: str
    timestamp: int = Field(description="Epoch seconds")
    chat_id: str = Field(description="Provider's chat identifier")
    schema_version: int = Field(default=1)


class ChatIngest(BaseModel, frozen=True):
    """Complete chat from import source.

    This is a frozen (immutable) model representing one complete chat
    conversation ready for domain processing.

    Attributes:
        source: Import source identifier (e.g., "chatgpt", "claude")
        imported_chat_timestamp: Chat creation/import timestamp in epoch seconds
        title: Chat title (may be None if not provided by source)
        messages: List of messages in the chat, ordered by timestamp
        provider: Original provider name (for provenance)
        conversation_id: Provider's original conversation ID
        schema_version: Schema version for forward compatibility
    """

    source: str = Field(description="Import source (chatgpt, claude, etc.)")
    imported_chat_timestamp: int = Field(description="Epoch seconds")
    title: str | None = Field(default=None)
    messages: list[IngestMessage] = Field(default_factory=list)
    provider: str | None = Field(default=None, description="Original provider")
    conversation_id: str | None = Field(default=None, description="Provider's conversation ID")
    schema_version: int = Field(default=1)

    @property
    def message_count(self) -> int:
        """Get the number of messages in this chat."""
        return len(self.messages)

    @property
    def has_messages(self) -> bool:
        """Check if this chat has any messages."""
        return len(self.messages) > 0
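An illustrative sketch of how an import adapter might hand one conversation to the domain layer using these boundary models; all values, including the conversation ID, are made up:

```python
from bot_knows.models.ingest import ChatIngest, IngestMessage

ingest = ChatIngest(
    source="chatgpt",
    imported_chat_timestamp=1_700_000_000,
    title="Vector search basics",
    conversation_id="abc-123",  # provider's own ID, fabricated for this example
    messages=[
        IngestMessage(role="user", content="What is a centroid embedding?",
                      timestamp=1_700_000_000, chat_id="abc-123"),
        IngestMessage(role="assistant", content="It is the running mean of the evidence vectors.",
                      timestamp=1_700_000_005, chat_id="abc-123"),
    ],
)
assert ingest.has_messages and ingest.message_count == 2
```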
bot_knows/models/message.py
ADDED
@@ -0,0 +1,49 @@
"""Message models for bot_knows.

These models represent processed messages in the knowledge base.
"""

from pydantic import BaseModel, Field

__all__ = [
    "MessageDTO",
]


class MessageDTO(BaseModel, frozen=True):
    """User-Assistant message pair.

    Messages are stored as user-assistant pairs rather than individual
    messages. This reflects the conversational nature of chat data
    and simplifies topic extraction.

    Attributes:
        message_id: Deterministic message ID (hash-based)
        chat_id: Parent chat ID
        user_content: User's message content (may be empty)
        assistant_content: Assistant's response content (may be empty)
        created_on: Message timestamp in epoch seconds
        schema_version: Schema version for forward compatibility
    """

    message_id: str = Field(description="Hash-based message ID")
    chat_id: str = Field(description="Parent chat ID")
    user_content: str = Field(default="", description="User's message")
    assistant_content: str = Field(default="", description="Assistant's response")
    created_on: int = Field(description="Epoch seconds")
    schema_version: int = Field(default=1)

    @property
    def combined_content(self) -> str:
        """Get combined user and assistant content for processing."""
        parts = []
        if self.user_content:
            parts.append(f"User: {self.user_content}")
        if self.assistant_content:
            parts.append(f"Assistant: {self.assistant_content}")
        return "\n\n".join(parts)

    @property
    def is_empty(self) -> bool:
        """Check if both user and assistant content are empty."""
        return not self.user_content and not self.assistant_content
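A small sketch of the pairing convention and the combined_content helper; the literal IDs stand in for the hash-based IDs the package actually generates:

```python
from bot_knows.models.message import MessageDTO

pair = MessageDTO(
    message_id="msg-1",   # real IDs are hash-based; literal used for brevity
    chat_id="chat-1",
    user_content="How does spaced repetition decay work?",
    assistant_content="Strength decays over time unless the topic is reviewed.",
    created_on=1_700_000_000,
)
print(pair.combined_content)
# User: How does spaced repetition decay work?
#
# Assistant: Strength decays over time unless the topic is reviewed.
```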
bot_knows/models/recall.py
ADDED
@@ -0,0 +1,58 @@
"""Recall models for bot_knows.

These models represent the recall/spaced repetition state
for topics in the knowledge base.
"""

from pydantic import BaseModel, Field

from bot_knows.models.topic import TopicDTO

__all__ = [
    "RecallItemDTO",
    "TopicRecallStateDTO",
]


class TopicRecallStateDTO(BaseModel, frozen=True):
    """Persisted recall state per topic.

    Tracks the spaced repetition state for a topic including
    strength decay and stability.

    Attributes:
        topic_id: Topic ID this state belongs to
        strength: Current recall strength (0.0 - 1.0)
        last_seen: Last time topic was accessed (epoch seconds)
        last_updated: Last time decay was applied (epoch seconds)
        stability: Decay rate factor (higher = slower decay)
        schema_version: Schema version for forward compatibility
    """

    topic_id: str = Field(description="Topic ID")
    strength: float = Field(default=0.0, ge=0.0, le=1.0, description="Recall strength")
    last_seen: int = Field(description="Epoch seconds")
    last_updated: int = Field(description="Epoch seconds")
    stability: float = Field(default=1.0, ge=0.0, description="Decay rate factor")
    schema_version: int = Field(default=1)


class RecallItemDTO(BaseModel, frozen=True):
    """Topic ready for recall/review.

    Represents a topic that is due for review along with
    its recall state and related topics.

    Attributes:
        topic: The topic to review
        recall_state: Current recall state
        due_score: Priority score for recall (higher = more due)
        related_topics: IDs of semantically related topics
        schema_version: Schema version for forward compatibility
    """

    topic: TopicDTO
    recall_state: TopicRecallStateDTO
    due_score: float = Field(ge=0.0, description="Priority for recall")
    related_topics: list[str] = Field(default_factory=list, description="Related topic IDs")
    schema_version: int = Field(default=1)
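The decay model itself lives in bot_knows/services/recall_service.py, which is not shown in this section. The sketch below is only one plausible reading of these fields (exponential decay scaled by stability), not the package's actual formula; decayed_strength and half_life_days are hypothetical:

```python
import math

from bot_knows.models.recall import TopicRecallStateDTO

state = TopicRecallStateDTO(topic_id="t-1", strength=0.8,
                            last_seen=1_700_000_000, last_updated=1_700_000_000)

def decayed_strength(s: TopicRecallStateDTO, now: int, half_life_days: float = 7.0) -> float:
    """Hypothetical: halve strength every half_life_days * stability days."""
    elapsed_days = (now - s.last_updated) / 86_400
    return s.strength * math.exp(-math.log(2) * elapsed_days / (half_life_days * s.stability))

# With the default stability of 1.0, two half-lives (14 days) leave about a quarter of the strength.
print(round(decayed_strength(state, now=1_700_000_000 + 14 * 86_400), 3))  # ~0.2
```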
bot_knows/models/topic.py
ADDED
@@ -0,0 +1,100 @@
"""Topic models for bot_knows.

These models represent canonical topics and their evidence
in the knowledge base.
"""

from pydantic import BaseModel, Field

__all__ = [
    "TopicDTO",
    "TopicEvidenceDTO",
]


class TopicDTO(BaseModel, frozen=True):
    """Canonical semantic topic.

    Topics are deduplicated semantic concepts extracted from messages.
    Each topic has a running centroid embedding for similarity matching.

    Attributes:
        topic_id: Deterministic topic ID
        canonical_name: Canonical/normalized topic name
        centroid_embedding: Running centroid of all evidence embeddings
        evidence_count: Number of evidence records (for centroid updates)
        importance: Topic importance score (0.0 - 1.0)
        recall_strength: Current recall strength (0.0 - 1.0)
        schema_version: Schema version for forward compatibility
    """

    topic_id: str = Field(description="Hash-based topic ID")
    canonical_name: str = Field(description="Canonical topic name")
    centroid_embedding: list[float] = Field(
        default_factory=list, description="Running centroid embedding"
    )
    evidence_count: int = Field(default=0, description="Number of evidence records")
    importance: float = Field(default=0.0, ge=0.0, le=1.0)
    recall_strength: float = Field(default=0.0, ge=0.0, le=1.0)
    schema_version: int = Field(default=1)

    def with_updated_centroid(
        self,
        new_embedding: list[float],
    ) -> "TopicDTO":
        """Create a new TopicDTO with updated centroid embedding.

        Uses incremental centroid update formula:
        new_centroid = (old_centroid * n + new_embedding) / (n + 1)

        Args:
            new_embedding: New embedding to incorporate

        Returns:
            New TopicDTO with updated centroid and evidence_count
        """
        n = self.evidence_count
        if n == 0:
            # First embedding becomes the centroid
            new_centroid = new_embedding
        else:
            # Incremental update
            new_centroid = [
                (old * n + new) / (n + 1)
                for old, new in zip(self.centroid_embedding, new_embedding, strict=False)
            ]

        return TopicDTO(
            topic_id=self.topic_id,
            canonical_name=self.canonical_name,
            centroid_embedding=new_centroid,
            evidence_count=n + 1,
            importance=self.importance,
            recall_strength=self.recall_strength,
            schema_version=self.schema_version,
        )


class TopicEvidenceDTO(BaseModel, frozen=True):
    """Append-only evidence linking extraction to topic.

    Evidence records are never modified or deleted. They provide
    a complete audit trail of topic extractions.

    Attributes:
        evidence_id: Deterministic evidence ID
        topic_id: Parent topic ID
        extracted_name: Raw extracted topic name (before normalization)
        source_message_id: ID of the message this was extracted from
        confidence: Extraction confidence score (0.0 - 1.0)
        timestamp: Extraction timestamp in epoch seconds
        schema_version: Schema version for forward compatibility
    """

    evidence_id: str = Field(description="Hash-based evidence ID")
    topic_id: str = Field(description="Parent topic ID")
    extracted_name: str = Field(description="Raw extracted name")
    source_message_id: str = Field(description="Source message ID")
    confidence: float = Field(ge=0.0, le=1.0, description="Extraction confidence")
    timestamp: int = Field(description="Epoch seconds")
    schema_version: int = Field(default=1)