PyPI - compair-core - Versions diffs - 0.3.12__tar.gz → 0.3.14__tar.gz - Mend

compair-core 0.3.12.tar.gz → 0.3.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of compair-core might be problematic. Click here for more details.

Files changed (43) hide show

{compair_core-0.3.12 → compair_core-0.3.14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compair-core
-Version: 0.3.12
+Version: 0.3.14
 Summary: Open-source foundation of the Compair collaboration platform.
 Author: RocketResearch, Inc.
 License: MIT
@@ -92,6 +92,8 @@ Key environment variables for the core edition:
 - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
 - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
 - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
+- `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
+- `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
 See `compair_core/server/settings.py` for the full settings surface.

{compair_core-0.3.12 → compair_core-0.3.14}/README.md RENAMED Viewed

@@ -57,6 +57,8 @@ Key environment variables for the core edition:
 - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
 - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
 - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
+- `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
+- `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
 See `compair_core/server/settings.py` for the full settings surface.

{compair_core-0.3.12 → compair_core-0.3.14}/compair_core/api.py RENAMED Viewed

@@ -1403,21 +1403,24 @@ def create_doc(
             datetime_modified=datetime.now(timezone.utc)
         )
         print('About to assign groups!')
-        print(groups.split(','))
-        if groups is not None:
-            group_ids = groups.split(',')
-            q = select(models.Group).filter(
-                models.Group.group_id.in_(group_ids)
-            )
-            groups = session.execute(q).fetchall()[0]
-            for group in groups:
-                document.groups.append(group)
+        target_group_ids = []
+        if groups:
+            target_group_ids = [gid.strip() for gid in groups.split(',') if gid.strip()]
+        if target_group_ids:
+            q = select(models.Group).filter(models.Group.group_id.in_(target_group_ids))
+            resolved_groups = session.execute(q).scalars().all()
+            if not resolved_groups:
+                raise HTTPException(status_code=404, detail="No matching groups found for provided IDs.")
+            document.groups = resolved_groups
         else:
-            q = select(models.Group).filter(
-                models.Group.name == current_user.username
-            )
-            group = session.execute(q).fetchone()[0]
-            document.groups = [group]
+            q = select(models.Group).filter(models.Group.name == current_user.username)
+            default_group = session.execute(q).scalars().first()
+            if default_group is None:
+                raise HTTPException(status_code=404, detail="Default group not found for user.")
+            document.groups = [default_group]
+        primary_group = document.groups[0]
         print(f'doc check!!! {document.content}')
         session.add(document)
@@ -1435,7 +1438,7 @@ def create_doc(
         log_activity(
             session=session,
             user_id=document.author_id,
-            group_id=group.group_id,
+            group_id=primary_group.group_id,
             action="create",
             object_id=document.document_id,
             object_name=document.title,

{compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/embeddings.py RENAMED Viewed

@@ -23,7 +23,17 @@ class Embedder:
         if self._cloud_impl is None:
             self.model = os.getenv("COMPAIR_LOCAL_EMBED_MODEL", "hash-embedding")
-            self.dimension = int(os.getenv("COMPAIR_LOCAL_EMBED_DIM", "384"))
+            default_dim = 1536 if self.edition == "cloud" else 384
+            dim_env = (
+                os.getenv("COMPAIR_EMBEDDING_DIM")
+                or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
+                or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
+                or str(default_dim)
+            )
+            try:
+                self.dimension = int(dim_env)
+            except ValueError:  # pragma: no cover - invalid configuration
+                self.dimension = default_dim
             base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
             route = os.getenv("COMPAIR_LOCAL_EMBED_ROUTE", "/embed")
             self.endpoint = f"{base_url.rstrip('/')}{route}"

{compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/main.py RENAMED Viewed

@@ -12,7 +12,17 @@ from sqlalchemy.orm import Session as SASession
 from .embeddings import create_embedding, Embedder
 from .feedback import get_feedback, Reviewer
-from .models import Chunk, Document, Feedback, Group, Note, Reference, User
+from .models import (
+    Chunk,
+    Document,
+    Feedback,
+    Group,
+    Note,
+    Reference,
+    User,
+    VECTOR_BACKEND,
+    cosine_similarity,
+)
 from .utils import chunk_text, log_activity
@@ -159,22 +169,41 @@ def process_text(
         Chunk.note_id == note_id,
     ).first()
+    references: list[Chunk] = []
     if generate_feedback and existing_chunk:
         doc_group_ids = [g.group_id for g in doc.groups]
-        references = (
-            session.query(Chunk)
-            .join(Chunk.document)
-            .join(Document.groups)
-            .filter(
-                Document.is_published.is_(True),
-                Document.document_id != doc.document_id,
-                Chunk.chunk_type == "document",
-                Group.group_id.in_(doc_group_ids),
+        target_embedding = existing_chunk.embedding
+        if target_embedding is not None:
+            base_query = (
+                session.query(Chunk)
+                .join(Chunk.document)
+                .join(Document.groups)
+                .filter(
+                    Document.is_published.is_(True),
+                    Document.document_id != doc.document_id,
+                    Chunk.chunk_type == "document",
+                    Group.group_id.in_(doc_group_ids),
+                )
             )
-            .order_by(Chunk.embedding.cosine_distance(existing_chunk.embedding))
-            .limit(3)
-            .all()
-        )
+            if VECTOR_BACKEND == "pgvector":
+                references = (
+                    base_query.order_by(
+                        Chunk.embedding.cosine_distance(existing_chunk.embedding)
+                    )
+                    .limit(3)
+                    .all()
+                )
+            else:
+                candidates = base_query.all()
+                scored: list[tuple[float, Chunk]] = []
+                for candidate in candidates:
+                    score = cosine_similarity(candidate.embedding, target_embedding)
+                    if score is not None:
+                        scored.append((score, candidate))
+                scored.sort(key=lambda item: item[0], reverse=True)
+                references = [chunk for _, chunk in scored[:3]]
         sql_references: list[Reference] = []
         for ref_chunk in references:

{compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/models.py RENAMED Viewed

@@ -5,9 +5,15 @@ import hashlib
 import os
 import secrets
 from datetime import datetime, timezone
+from math import sqrt
+from typing import Sequence
 from uuid import uuid4
-from pgvector.sqlalchemy import Vector
+try:  # Optional: only required when using pgvector backend
+    from pgvector.sqlalchemy import Vector
+except ImportError:  # pragma: no cover - optional dependency in core
+    Vector = None  # type: ignore[assignment]
 from sqlalchemy import (
     Boolean,
     Column,
@@ -15,6 +21,7 @@ from sqlalchemy import (
     ForeignKey,
     Identity,
     Integer,
+    JSON,
     String,
     Table,
     Text,
@@ -27,6 +34,65 @@ from sqlalchemy.orm import (
     relationship,
 )
+_EDITION = os.getenv("COMPAIR_EDITION", "core").lower()
+_DEFAULT_DIM = 1536 if _EDITION == "cloud" else 384
+_DIM_ENV = (
+    os.getenv("COMPAIR_EMBEDDING_DIM")
+    or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
+    or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
+    or str(_DEFAULT_DIM)
+)
+try:
+    EMBEDDING_DIMENSION = int(_DIM_ENV)
+except ValueError:  # pragma: no cover - invalid configuration
+    EMBEDDING_DIMENSION = _DEFAULT_DIM
+def _detect_vector_backend() -> str:
+    explicit = os.getenv("COMPAIR_VECTOR_BACKEND")
+    if explicit:
+        return explicit.lower()
+    db = os.getenv("DB")
+    db_user = os.getenv("DB_USER")
+    db_passw = os.getenv("DB_PASSW")
+    db_url = os.getenv("DB_URL")
+    database_url = os.getenv("DATABASE_URL", "")
+    if all([db, db_user, db_passw, db_url]):
+        return "pgvector"
+    if database_url.lower().startswith(("postgres://", "postgresql://")):
+        return "pgvector"
+    return "json"
+VECTOR_BACKEND = _detect_vector_backend()
+def _embedding_column():
+    if VECTOR_BACKEND == "pgvector":
+        if Vector is None:
+            raise RuntimeError(
+                "pgvector is required when COMPAIR_VECTOR_BACKEND is set to 'pgvector'."
+            )
+        return mapped_column(Vector(EMBEDDING_DIMENSION), nullable=True)
+    # Store embeddings as JSON arrays (works across SQLite/Postgres without pgvector)
+    return mapped_column(JSON, nullable=True)
+def cosine_similarity(vec1: Sequence[float] | None, vec2: Sequence[float] | None) -> float | None:
+    if not vec1 or not vec2:
+        return None
+    if len(vec1) != len(vec2):
+        return None
+    dot = sum(a * b for a, b in zip(vec1, vec2))
+    norm1 = sqrt(sum(a * a for a in vec1))
+    norm2 = sqrt(sum(b * b for b in vec2))
+    if norm1 == 0 or norm2 == 0:
+        return None
+    return dot / (norm1 * norm2)
 class Base(DeclarativeBase, MappedAsDataclass):
     pass
@@ -216,7 +282,7 @@ class Document(BaseObject):
     file_key: Mapped[str | None] = mapped_column(String, nullable=True, default=None)
     image_key: Mapped[str | None] = mapped_column(String, nullable=True, default=None)
     is_published: Mapped[bool] = mapped_column(Boolean, default=False)
-    embedding = mapped_column(Vector(1536))
+    embedding: Mapped[list[float] | None] = _embedding_column()
     user = relationship("User", back_populates="documents")
     groups = relationship("Group", secondary="document_to_group", back_populates="documents")
@@ -250,7 +316,7 @@ class Note(Base):
     group_id: Mapped[str | None] = mapped_column(ForeignKey("group.group_id", ondelete="CASCADE"), index=True, nullable=True)
     content: Mapped[str] = mapped_column(Text)
     datetime_created: Mapped[datetime] = mapped_column(default=datetime.now(timezone.utc))
-    embedding = mapped_column(Vector(1536))
+    embedding: Mapped[list[float] | None] = _embedding_column()
     document = relationship("Document", back_populates="notes")
     author = relationship("User", back_populates="notes")
@@ -279,7 +345,7 @@ class Chunk(Base):
     document_id: Mapped[str | None] = mapped_column(ForeignKey("document.document_id", ondelete="CASCADE"), index=True, nullable=True)
     note_id: Mapped[str | None] = mapped_column(ForeignKey("note.note_id", ondelete="CASCADE"), index=True, nullable=True)
     chunk_type: Mapped[str] = mapped_column(String(16), default="document")
-    embedding = mapped_column(Vector(1536))
+    embedding: Mapped[list[float] | None] = _embedding_column()
     document = relationship("Document", back_populates="chunks")
     note = relationship("Note", back_populates="chunks")

{compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/local_model/app.py RENAMED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
 import hashlib
+import os
 from typing import List
 from fastapi import FastAPI
@@ -9,7 +10,17 @@ from pydantic import BaseModel
 app = FastAPI(title="Compair Local Model", version="0.1.0")
-EMBED_DIMENSION = 384
+_DEFAULT_DIM = 384
+_DIM_ENV = (
+    os.getenv("COMPAIR_EMBEDDING_DIM")
+    or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
+    or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
+    or str(_DEFAULT_DIM)
+)
+try:
+    EMBED_DIMENSION = int(_DIM_ENV)
+except ValueError:  # pragma: no cover - invalid configuration
+    EMBED_DIMENSION = _DEFAULT_DIM
 def _hash_embedding(text: str, dimension: int = EMBED_DIMENSION) -> List[float]:

{compair_core-0.3.12 → compair_core-0.3.14}/compair_core.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compair-core
-Version: 0.3.12
+Version: 0.3.14
 Summary: Open-source foundation of the Compair collaboration platform.
 Author: RocketResearch, Inc.
 License: MIT
@@ -92,6 +92,8 @@ Key environment variables for the core edition:
 - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
 - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
 - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
+- `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
+- `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
 See `compair_core/server/settings.py` for the full settings surface.

{compair_core-0.3.12 → compair_core-0.3.14}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "compair-core"
-version = "0.3.12"
+version = "0.3.14"
 description = "Open-source foundation of the Compair collaboration platform."
 readme = "README.md"
 license = { text = "MIT" }