compair-core 0.3.12__tar.gz → 0.3.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of compair-core might be problematic. Click here for more details.

Files changed (43)
  1. {compair_core-0.3.12 → compair_core-0.3.14}/PKG-INFO +3 -1
  2. {compair_core-0.3.12 → compair_core-0.3.14}/README.md +2 -0
  3. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/api.py +18 -15
  4. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/embeddings.py +11 -1
  5. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/main.py +43 -14
  6. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/models.py +70 -4
  7. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/local_model/app.py +12 -1
  8. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core.egg-info/PKG-INFO +3 -1
  9. {compair_core-0.3.12 → compair_core-0.3.14}/pyproject.toml +1 -1
  10. {compair_core-0.3.12 → compair_core-0.3.14}/LICENSE +0 -0
  11. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/__init__.py +0 -0
  12. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/__init__.py +0 -0
  13. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/celery_app.py +0 -0
  14. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/default_groups.py +0 -0
  15. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/feedback.py +0 -0
  16. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/logger.py +0 -0
  17. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/schema.py +0 -0
  18. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/tasks.py +0 -0
  19. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair/utils.py +0 -0
  20. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair_email/__init__.py +0 -0
  21. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair_email/email.py +0 -0
  22. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair_email/email_core.py +0 -0
  23. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair_email/templates.py +0 -0
  24. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/compair_email/templates_core.py +0 -0
  25. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/__init__.py +0 -0
  26. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/app.py +0 -0
  27. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/deps.py +0 -0
  28. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/local_model/__init__.py +0 -0
  29. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/providers/__init__.py +0 -0
  30. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/providers/console_mailer.py +0 -0
  31. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/providers/contracts.py +0 -0
  32. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/providers/local_storage.py +0 -0
  33. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/providers/noop_analytics.py +0 -0
  34. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/providers/noop_billing.py +0 -0
  35. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/providers/noop_ocr.py +0 -0
  36. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/routers/__init__.py +0 -0
  37. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/routers/capabilities.py +0 -0
  38. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core/server/settings.py +0 -0
  39. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core.egg-info/SOURCES.txt +0 -0
  40. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core.egg-info/dependency_links.txt +0 -0
  41. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core.egg-info/requires.txt +0 -0
  42. {compair_core-0.3.12 → compair_core-0.3.14}/compair_core.egg-info/top_level.txt +0 -0
  43. {compair_core-0.3.12 → compair_core-0.3.14}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: compair-core
3
- Version: 0.3.12
3
+ Version: 0.3.14
4
4
  Summary: Open-source foundation of the Compair collaboration platform.
5
5
  Author: RocketResearch, Inc.
6
6
  License: MIT
@@ -92,6 +92,8 @@ Key environment variables for the core edition:
92
92
  - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
93
93
  - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
94
94
  - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
95
+ - `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
96
+ - `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
95
97
 
96
98
  See `compair_core/server/settings.py` for the full settings surface.
97
99
 
@@ -57,6 +57,8 @@ Key environment variables for the core edition:
57
57
  - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
58
58
  - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
59
59
  - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
60
+ - `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
61
+ - `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
60
62
 
61
63
  See `compair_core/server/settings.py` for the full settings surface.
62
64
 
@@ -1403,21 +1403,24 @@ def create_doc(
1403
1403
  datetime_modified=datetime.now(timezone.utc)
1404
1404
  )
1405
1405
  print('About to assign groups!')
1406
- print(groups.split(','))
1407
- if groups is not None:
1408
- group_ids = groups.split(',')
1409
- q = select(models.Group).filter(
1410
- models.Group.group_id.in_(group_ids)
1411
- )
1412
- groups = session.execute(q).fetchall()[0]
1413
- for group in groups:
1414
- document.groups.append(group)
1406
+ target_group_ids = []
1407
+ if groups:
1408
+ target_group_ids = [gid.strip() for gid in groups.split(',') if gid.strip()]
1409
+
1410
+ if target_group_ids:
1411
+ q = select(models.Group).filter(models.Group.group_id.in_(target_group_ids))
1412
+ resolved_groups = session.execute(q).scalars().all()
1413
+ if not resolved_groups:
1414
+ raise HTTPException(status_code=404, detail="No matching groups found for provided IDs.")
1415
+ document.groups = resolved_groups
1415
1416
  else:
1416
- q = select(models.Group).filter(
1417
- models.Group.name == current_user.username
1418
- )
1419
- group = session.execute(q).fetchone()[0]
1420
- document.groups = [group]
1417
+ q = select(models.Group).filter(models.Group.name == current_user.username)
1418
+ default_group = session.execute(q).scalars().first()
1419
+ if default_group is None:
1420
+ raise HTTPException(status_code=404, detail="Default group not found for user.")
1421
+ document.groups = [default_group]
1422
+
1423
+ primary_group = document.groups[0]
1421
1424
 
1422
1425
  print(f'doc check!!! {document.content}')
1423
1426
  session.add(document)
@@ -1435,7 +1438,7 @@ def create_doc(
1435
1438
  log_activity(
1436
1439
  session=session,
1437
1440
  user_id=document.author_id,
1438
- group_id=group.group_id,
1441
+ group_id=primary_group.group_id,
1439
1442
  action="create",
1440
1443
  object_id=document.document_id,
1441
1444
  object_name=document.title,
@@ -23,7 +23,17 @@ class Embedder:
23
23
 
24
24
  if self._cloud_impl is None:
25
25
  self.model = os.getenv("COMPAIR_LOCAL_EMBED_MODEL", "hash-embedding")
26
- self.dimension = int(os.getenv("COMPAIR_LOCAL_EMBED_DIM", "384"))
26
+ default_dim = 1536 if self.edition == "cloud" else 384
27
+ dim_env = (
28
+ os.getenv("COMPAIR_EMBEDDING_DIM")
29
+ or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
30
+ or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
31
+ or str(default_dim)
32
+ )
33
+ try:
34
+ self.dimension = int(dim_env)
35
+ except ValueError: # pragma: no cover - invalid configuration
36
+ self.dimension = default_dim
27
37
  base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
28
38
  route = os.getenv("COMPAIR_LOCAL_EMBED_ROUTE", "/embed")
29
39
  self.endpoint = f"{base_url.rstrip('/')}{route}"
@@ -12,7 +12,17 @@ from sqlalchemy.orm import Session as SASession
12
12
 
13
13
  from .embeddings import create_embedding, Embedder
14
14
  from .feedback import get_feedback, Reviewer
15
- from .models import Chunk, Document, Feedback, Group, Note, Reference, User
15
+ from .models import (
16
+ Chunk,
17
+ Document,
18
+ Feedback,
19
+ Group,
20
+ Note,
21
+ Reference,
22
+ User,
23
+ VECTOR_BACKEND,
24
+ cosine_similarity,
25
+ )
16
26
  from .utils import chunk_text, log_activity
17
27
 
18
28
 
@@ -159,22 +169,41 @@ def process_text(
159
169
  Chunk.note_id == note_id,
160
170
  ).first()
161
171
 
172
+ references: list[Chunk] = []
162
173
  if generate_feedback and existing_chunk:
163
174
  doc_group_ids = [g.group_id for g in doc.groups]
164
- references = (
165
- session.query(Chunk)
166
- .join(Chunk.document)
167
- .join(Document.groups)
168
- .filter(
169
- Document.is_published.is_(True),
170
- Document.document_id != doc.document_id,
171
- Chunk.chunk_type == "document",
172
- Group.group_id.in_(doc_group_ids),
175
+ target_embedding = existing_chunk.embedding
176
+
177
+ if target_embedding is not None:
178
+ base_query = (
179
+ session.query(Chunk)
180
+ .join(Chunk.document)
181
+ .join(Document.groups)
182
+ .filter(
183
+ Document.is_published.is_(True),
184
+ Document.document_id != doc.document_id,
185
+ Chunk.chunk_type == "document",
186
+ Group.group_id.in_(doc_group_ids),
187
+ )
173
188
  )
174
- .order_by(Chunk.embedding.cosine_distance(existing_chunk.embedding))
175
- .limit(3)
176
- .all()
177
- )
189
+
190
+ if VECTOR_BACKEND == "pgvector":
191
+ references = (
192
+ base_query.order_by(
193
+ Chunk.embedding.cosine_distance(existing_chunk.embedding)
194
+ )
195
+ .limit(3)
196
+ .all()
197
+ )
198
+ else:
199
+ candidates = base_query.all()
200
+ scored: list[tuple[float, Chunk]] = []
201
+ for candidate in candidates:
202
+ score = cosine_similarity(candidate.embedding, target_embedding)
203
+ if score is not None:
204
+ scored.append((score, candidate))
205
+ scored.sort(key=lambda item: item[0], reverse=True)
206
+ references = [chunk for _, chunk in scored[:3]]
178
207
 
179
208
  sql_references: list[Reference] = []
180
209
  for ref_chunk in references:
@@ -5,9 +5,15 @@ import hashlib
5
5
  import os
6
6
  import secrets
7
7
  from datetime import datetime, timezone
8
+ from math import sqrt
9
+ from typing import Sequence
8
10
  from uuid import uuid4
9
11
 
10
- from pgvector.sqlalchemy import Vector
12
+ try: # Optional: only required when using pgvector backend
13
+ from pgvector.sqlalchemy import Vector
14
+ except ImportError: # pragma: no cover - optional dependency in core
15
+ Vector = None # type: ignore[assignment]
16
+
11
17
  from sqlalchemy import (
12
18
  Boolean,
13
19
  Column,
@@ -15,6 +21,7 @@ from sqlalchemy import (
15
21
  ForeignKey,
16
22
  Identity,
17
23
  Integer,
24
+ JSON,
18
25
  String,
19
26
  Table,
20
27
  Text,
@@ -27,6 +34,65 @@ from sqlalchemy.orm import (
27
34
  relationship,
28
35
  )
29
36
 
37
+ _EDITION = os.getenv("COMPAIR_EDITION", "core").lower()
38
+ _DEFAULT_DIM = 1536 if _EDITION == "cloud" else 384
39
+ _DIM_ENV = (
40
+ os.getenv("COMPAIR_EMBEDDING_DIM")
41
+ or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
42
+ or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
43
+ or str(_DEFAULT_DIM)
44
+ )
45
+
46
+ try:
47
+ EMBEDDING_DIMENSION = int(_DIM_ENV)
48
+ except ValueError: # pragma: no cover - invalid configuration
49
+ EMBEDDING_DIMENSION = _DEFAULT_DIM
50
+
51
+
52
+ def _detect_vector_backend() -> str:
53
+ explicit = os.getenv("COMPAIR_VECTOR_BACKEND")
54
+ if explicit:
55
+ return explicit.lower()
56
+
57
+ db = os.getenv("DB")
58
+ db_user = os.getenv("DB_USER")
59
+ db_passw = os.getenv("DB_PASSW")
60
+ db_url = os.getenv("DB_URL")
61
+ database_url = os.getenv("DATABASE_URL", "")
62
+
63
+ if all([db, db_user, db_passw, db_url]):
64
+ return "pgvector"
65
+ if database_url.lower().startswith(("postgres://", "postgresql://")):
66
+ return "pgvector"
67
+ return "json"
68
+
69
+
70
+ VECTOR_BACKEND = _detect_vector_backend()
71
+
72
+
73
+ def _embedding_column():
74
+ if VECTOR_BACKEND == "pgvector":
75
+ if Vector is None:
76
+ raise RuntimeError(
77
+ "pgvector is required when COMPAIR_VECTOR_BACKEND is set to 'pgvector'."
78
+ )
79
+ return mapped_column(Vector(EMBEDDING_DIMENSION), nullable=True)
80
+ # Store embeddings as JSON arrays (works across SQLite/Postgres without pgvector)
81
+ return mapped_column(JSON, nullable=True)
82
+
83
+
84
+ def cosine_similarity(vec1: Sequence[float] | None, vec2: Sequence[float] | None) -> float | None:
85
+ if not vec1 or not vec2:
86
+ return None
87
+ if len(vec1) != len(vec2):
88
+ return None
89
+ dot = sum(a * b for a, b in zip(vec1, vec2))
90
+ norm1 = sqrt(sum(a * a for a in vec1))
91
+ norm2 = sqrt(sum(b * b for b in vec2))
92
+ if norm1 == 0 or norm2 == 0:
93
+ return None
94
+ return dot / (norm1 * norm2)
95
+
30
96
 
31
97
  class Base(DeclarativeBase, MappedAsDataclass):
32
98
  pass
@@ -216,7 +282,7 @@ class Document(BaseObject):
216
282
  file_key: Mapped[str | None] = mapped_column(String, nullable=True, default=None)
217
283
  image_key: Mapped[str | None] = mapped_column(String, nullable=True, default=None)
218
284
  is_published: Mapped[bool] = mapped_column(Boolean, default=False)
219
- embedding = mapped_column(Vector(1536))
285
+ embedding: Mapped[list[float] | None] = _embedding_column()
220
286
 
221
287
  user = relationship("User", back_populates="documents")
222
288
  groups = relationship("Group", secondary="document_to_group", back_populates="documents")
@@ -250,7 +316,7 @@ class Note(Base):
250
316
  group_id: Mapped[str | None] = mapped_column(ForeignKey("group.group_id", ondelete="CASCADE"), index=True, nullable=True)
251
317
  content: Mapped[str] = mapped_column(Text)
252
318
  datetime_created: Mapped[datetime] = mapped_column(default=datetime.now(timezone.utc))
253
- embedding = mapped_column(Vector(1536))
319
+ embedding: Mapped[list[float] | None] = _embedding_column()
254
320
 
255
321
  document = relationship("Document", back_populates="notes")
256
322
  author = relationship("User", back_populates="notes")
@@ -279,7 +345,7 @@ class Chunk(Base):
279
345
  document_id: Mapped[str | None] = mapped_column(ForeignKey("document.document_id", ondelete="CASCADE"), index=True, nullable=True)
280
346
  note_id: Mapped[str | None] = mapped_column(ForeignKey("note.note_id", ondelete="CASCADE"), index=True, nullable=True)
281
347
  chunk_type: Mapped[str] = mapped_column(String(16), default="document")
282
- embedding = mapped_column(Vector(1536))
348
+ embedding: Mapped[list[float] | None] = _embedding_column()
283
349
 
284
350
  document = relationship("Document", back_populates="chunks")
285
351
  note = relationship("Note", back_populates="chunks")
@@ -2,6 +2,7 @@
2
2
  from __future__ import annotations
3
3
 
4
4
  import hashlib
5
+ import os
5
6
  from typing import List
6
7
 
7
8
  from fastapi import FastAPI
@@ -9,7 +10,17 @@ from pydantic import BaseModel
9
10
 
10
11
  app = FastAPI(title="Compair Local Model", version="0.1.0")
11
12
 
12
- EMBED_DIMENSION = 384
13
+ _DEFAULT_DIM = 384
14
+ _DIM_ENV = (
15
+ os.getenv("COMPAIR_EMBEDDING_DIM")
16
+ or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
17
+ or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
18
+ or str(_DEFAULT_DIM)
19
+ )
20
+ try:
21
+ EMBED_DIMENSION = int(_DIM_ENV)
22
+ except ValueError: # pragma: no cover - invalid configuration
23
+ EMBED_DIMENSION = _DEFAULT_DIM
13
24
 
14
25
 
15
26
  def _hash_embedding(text: str, dimension: int = EMBED_DIMENSION) -> List[float]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: compair-core
3
- Version: 0.3.12
3
+ Version: 0.3.14
4
4
  Summary: Open-source foundation of the Compair collaboration platform.
5
5
  Author: RocketResearch, Inc.
6
6
  License: MIT
@@ -92,6 +92,8 @@ Key environment variables for the core edition:
92
92
  - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
93
93
  - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
94
94
  - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
95
+ - `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
96
+ - `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
95
97
 
96
98
  See `compair_core/server/settings.py` for the full settings surface.
97
99
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "compair-core"
7
- version = "0.3.12"
7
+ version = "0.3.14"
8
8
  description = "Open-source foundation of the Compair collaboration platform."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
File without changes
File without changes