compair-core 0.3.13.tar.gz → 0.3.15.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of compair-core has been flagged as potentially problematic.
- {compair_core-0.3.13 → compair_core-0.3.15}/PKG-INFO +5 -1
- {compair_core-0.3.13 → compair_core-0.3.15}/README.md +4 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/embeddings.py +11 -1
- compair_core-0.3.15/compair_core/compair/feedback.py +204 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/main.py +43 -14
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/models.py +74 -4
- compair_core-0.3.15/compair_core/server/local_model/app.py +87 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core.egg-info/PKG-INFO +5 -1
- {compair_core-0.3.13 → compair_core-0.3.15}/pyproject.toml +1 -1
- compair_core-0.3.13/compair_core/compair/feedback.py +0 -79
- compair_core-0.3.13/compair_core/server/local_model/app.py +0 -62
- {compair_core-0.3.13 → compair_core-0.3.15}/LICENSE +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/__init__.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/api.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/__init__.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/celery_app.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/default_groups.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/logger.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/schema.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/tasks.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/utils.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair_email/__init__.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair_email/email.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair_email/email_core.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair_email/templates.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair_email/templates_core.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/__init__.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/app.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/deps.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/local_model/__init__.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/providers/__init__.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/providers/console_mailer.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/providers/contracts.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/providers/local_storage.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/providers/noop_analytics.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/providers/noop_billing.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/providers/noop_ocr.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/routers/__init__.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/routers/capabilities.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core/server/settings.py +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core.egg-info/SOURCES.txt +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core.egg-info/dependency_links.txt +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core.egg-info/requires.txt +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/compair_core.egg-info/top_level.txt +0 -0
- {compair_core-0.3.13 → compair_core-0.3.15}/setup.cfg +0 -0
{compair_core-0.3.13 → compair_core-0.3.15}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compair-core
-Version: 0.3.13
+Version: 0.3.15
 Summary: Open-source foundation of the Compair collaboration platform.
 Author: RocketResearch, Inc.
 License: MIT

@@ -92,6 +92,10 @@ Key environment variables for the core edition:
 - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
 - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
 - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
+- `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
+- `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
+- `COMPAIR_GENERATION_PROVIDER` (`local`) – choose how feedback is produced. Options: `local` (call the bundled FastAPI service), `openai` (use ChatGPT-compatible APIs with an API key), or `fallback` (skip generation and surface similar references only).
+- `COMPAIR_OPENAI_API_KEY` / `COMPAIR_OPENAI_MODEL` – when using the OpenAI provider, supply your API key and optional model name (defaults to `gpt-4o-mini`). The fallback kicks in automatically if the key or SDK is unavailable.
 
 See `compair_core/server/settings.py` for the full settings surface.
{compair_core-0.3.13 → compair_core-0.3.15}/README.md

@@ -57,6 +57,10 @@ Key environment variables for the core edition:
 - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
 - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
 - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
+- `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
+- `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
+- `COMPAIR_GENERATION_PROVIDER` (`local`) – choose how feedback is produced. Options: `local` (call the bundled FastAPI service), `openai` (use ChatGPT-compatible APIs with an API key), or `fallback` (skip generation and surface similar references only).
+- `COMPAIR_OPENAI_API_KEY` / `COMPAIR_OPENAI_MODEL` – when using the OpenAI provider, supply your API key and optional model name (defaults to `gpt-4o-mini`). The fallback kicks in automatically if the key or SDK is unavailable.
 
 See `compair_core/server/settings.py` for the full settings surface.
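For illustration, a minimal single-user configuration using the new knobs could look like the sketch below. The values are assumptions chosen to match the documented defaults, not shipped configuration:

```python
# Hypothetical single-user setup: JSON vector storage plus the reference-only provider.
import os

os.environ["COMPAIR_VECTOR_BACKEND"] = "json"            # SQLite-friendly embedding storage
os.environ["COMPAIR_EMBEDDING_DIM"] = "384"              # must match the configured embedding model
os.environ["COMPAIR_GENERATION_PROVIDER"] = "fallback"   # skip generation, surface similar references

# For the OpenAI provider instead:
# os.environ["COMPAIR_GENERATION_PROVIDER"] = "openai"
# os.environ["COMPAIR_OPENAI_API_KEY"] = "sk-..."        # placeholder key
# os.environ["COMPAIR_OPENAI_MODEL"] = "gpt-4o-mini"
```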
{compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/embeddings.py

@@ -23,7 +23,17 @@ class Embedder:
 
         if self._cloud_impl is None:
             self.model = os.getenv("COMPAIR_LOCAL_EMBED_MODEL", "hash-embedding")
-            self.
+            default_dim = 1536 if self.edition == "cloud" else 384
+            dim_env = (
+                os.getenv("COMPAIR_EMBEDDING_DIM")
+                or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
+                or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
+                or str(default_dim)
+            )
+            try:
+                self.dimension = int(dim_env)
+            except ValueError:  # pragma: no cover - invalid configuration
+                self.dimension = default_dim
             base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
             route = os.getenv("COMPAIR_LOCAL_EMBED_ROUTE", "/embed")
             self.endpoint = f"{base_url.rstrip('/')}{route}"
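The dimension now resolves through a chain of environment variables before falling back to the edition default. A standalone sketch of that precedence (not package code, just the same rules as the diff above):

```python
import os

def resolve_dim(default_dim: int = 384) -> int:
    # First set variable wins; a malformed value falls back to the default.
    raw = (
        os.getenv("COMPAIR_EMBEDDING_DIM")
        or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
        or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
        or str(default_dim)
    )
    try:
        return int(raw)
    except ValueError:
        return default_dim

os.environ["COMPAIR_EMBEDDING_DIM"] = "768"
assert resolve_dim() == 768          # explicit override wins
os.environ["COMPAIR_EMBEDDING_DIM"] = "not-a-number"
assert resolve_dim() == 384          # invalid values degrade to the default
```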
compair_core-0.3.15/compair_core/compair/feedback.py (new file)

@@ -0,0 +1,204 @@
+from __future__ import annotations
+
+import os
+from typing import Any, Iterable, List
+
+import requests
+
+from .logger import log_event
+from .models import Document, User
+
+try:
+    import openai  # type: ignore
+except ImportError:  # pragma: no cover - optional dependency
+    openai = None  # type: ignore
+
+try:
+    from compair_cloud.feedback import Reviewer as CloudReviewer  # type: ignore
+    from compair_cloud.feedback import get_feedback as cloud_get_feedback  # type: ignore
+except (ImportError, ModuleNotFoundError):
+    CloudReviewer = None  # type: ignore
+    cloud_get_feedback = None  # type: ignore
+
+
+class Reviewer:
+    """Edition-aware wrapper that selects a feedback provider based on configuration."""
+
+    def __init__(self) -> None:
+        self.edition = os.getenv("COMPAIR_EDITION", "core").lower()
+        self.provider = os.getenv("COMPAIR_GENERATION_PROVIDER", "local").lower()
+        self.length_map = {
+            "Brief": "1–2 short sentences",
+            "Detailed": "A couple short paragraphs",
+            "Verbose": "As thorough as reasonably possible without repeating information",
+        }
+
+        self._cloud_impl = None
+        self._openai_client = None
+        self.openai_model = os.getenv("COMPAIR_OPENAI_MODEL", "gpt-4o-mini")
+
+        if self.edition == "cloud" and CloudReviewer is not None:
+            self._cloud_impl = CloudReviewer()
+            self.provider = "cloud"
+        else:
+            if self.provider == "openai":
+                api_key = os.getenv("COMPAIR_OPENAI_API_KEY")
+                if api_key and openai is not None:
+                    # Support both legacy (ChatCompletion) and new SDKs
+                    if hasattr(openai, "api_key"):
+                        openai.api_key = api_key  # type: ignore[assignment]
+                    if hasattr(openai, "OpenAI"):
+                        try:  # pragma: no cover - optional runtime dependency
+                            self._openai_client = openai.OpenAI(api_key=api_key)  # type: ignore[attr-defined]
+                        except Exception:  # pragma: no cover - if instantiation fails
+                            self._openai_client = None
+                if self._openai_client is None and not hasattr(openai, "ChatCompletion"):
+                    log_event("openai_feedback_unavailable", reason="openai_library_missing")
+                    self.provider = "fallback"
+            if self.provider == "local":
+                self.model = os.getenv("COMPAIR_LOCAL_GENERATION_MODEL", "local-feedback")
+                base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
+                route = os.getenv("COMPAIR_LOCAL_GENERATION_ROUTE", "/generate")
+                self.endpoint = f"{base_url.rstrip('/')}{route}"
+            else:
+                self.model = "external"
+                self.endpoint = None
+
+    @property
+    def is_cloud(self) -> bool:
+        return self._cloud_impl is not None
+
+
+def _reference_snippets(references: Iterable[Any], limit: int = 3) -> List[str]:
+    snippets: List[str] = []
+    for ref in references:
+        snippet = getattr(ref, "content", "") or ""
+        snippet = snippet.replace("\n", " ").strip()
+        if snippet:
+            snippets.append(snippet[:200])
+        if len(snippets) == limit:
+            break
+    return snippets
+
+
+def _fallback_feedback(text: str, references: list[Any]) -> str:
+    snippets = _reference_snippets(references)
+    if not snippets:
+        return "NONE"
+    joined = "; ".join(snippets)
+    return f"Consider aligning with these reference passages: {joined}"
+
+
+def _openai_feedback(
+    reviewer: Reviewer,
+    doc: Document,
+    text: str,
+    references: list[Any],
+    user: User,
+) -> str | None:
+    if openai is None:
+        return None
+    instruction = reviewer.length_map.get(user.preferred_feedback_length, "1–2 short sentences")
+    ref_text = "\n\n".join(_reference_snippets(references, limit=3))
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are Compair, an assistant that delivers concise, actionable feedback on a user's document. "
+                "Focus on clarity, cohesion, and usefulness."
+            ),
+        },
+        {
+            "role": "user",
+            "content": (
+                f"Document:\n{text}\n\nHelpful reference excerpts:\n{ref_text or 'None provided'}\n\n"
+                f"Respond with {instruction} that highlights the most valuable revision to make next."
+            ),
+        },
+    ]
+
+    try:
+        if reviewer._openai_client is not None and hasattr(reviewer._openai_client, "responses"):
+            response = reviewer._openai_client.responses.create(  # type: ignore[union-attr]
+                model=reviewer.openai_model,
+                input=messages,
+                max_output_tokens=256,
+            )
+            content = getattr(response, "output_text", None)
+            if not content and hasattr(response, "outputs"):
+                # Legacy compatibility: join content parts
+                parts = []
+                for item in getattr(response, "outputs", []):
+                    parts.extend(getattr(item, "content", []))
+                content = " ".join(getattr(part, "text", "") for part in parts)
+        elif hasattr(openai, "ChatCompletion"):
+            chat_response = openai.ChatCompletion.create(  # type: ignore[attr-defined]
+                model=reviewer.openai_model,
+                messages=messages,
+                temperature=0.3,
+                max_tokens=256,
+            )
+            content = (
+                chat_response["choices"][0]["message"]["content"].strip()  # type: ignore[index, assignment]
+            )
+        else:
+            content = None
+    except Exception as exc:  # pragma: no cover - network/API failure
+        log_event("openai_feedback_failed", error=str(exc))
+        content = None
+    if content:
+        content = content.strip()
+        if content:
+            return content
+    return None
+
+
+def _local_feedback(
+    reviewer: Reviewer,
+    text: str,
+    references: list[Any],
+    user: User,
+) -> str | None:
+    payload = {
+        "document": text,
+        "references": [getattr(ref, "content", "") for ref in references],
+        "length_instruction": reviewer.length_map.get(
+            user.preferred_feedback_length,
+            "1–2 short sentences",
+        ),
+    }
+
+    try:
+        response = requests.post(reviewer.endpoint, json=payload, timeout=30)
+        response.raise_for_status()
+        data = response.json()
+        feedback = data.get("feedback") or data.get("text")
+        if feedback:
+            return str(feedback).strip()
+    except Exception as exc:  # pragma: no cover - network failures stay graceful
+        log_event("local_feedback_failed", error=str(exc))
+
+    return None
+
+
+def get_feedback(
+    reviewer: Reviewer,
+    doc: Document,
+    text: str,
+    references: list[Any],
+    user: User,
+) -> str:
+    if reviewer.is_cloud and cloud_get_feedback is not None:
+        return cloud_get_feedback(reviewer._cloud_impl, doc, text, references, user)  # type: ignore[arg-type]
+
+    if reviewer.provider == "openai":
+        feedback = _openai_feedback(reviewer, doc, text, references, user)
+        if feedback:
+            return feedback
+
+    if reviewer.provider == "local" and getattr(reviewer, "endpoint", None):
+        feedback = _local_feedback(reviewer, text, references, user)
+        if feedback:
+            return feedback
+
+    return _fallback_feedback(text, references)
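Based on the code above, the new module can be exercised end to end with the `fallback` provider and duck-typed stand-ins. This is a minimal sketch; the `SimpleNamespace` objects are illustrative substitutes for the ORM models, not package fixtures:

```python
import os
os.environ["COMPAIR_GENERATION_PROVIDER"] = "fallback"  # no model calls

from types import SimpleNamespace
from compair_core.compair.feedback import Reviewer, get_feedback

reviewer = Reviewer()  # resolves to model="external", endpoint=None
doc = SimpleNamespace(document_id="doc-1")
ref = SimpleNamespace(content="State the research question in the first paragraph.")
user = SimpleNamespace(preferred_feedback_length="Brief")

print(get_feedback(reviewer, doc, "My draft introduction...", [ref], user))
# -> Consider aligning with these reference passages: State the research question ...
```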
{compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/main.py

@@ -12,7 +12,17 @@ from sqlalchemy.orm import Session as SASession
 
 from .embeddings import create_embedding, Embedder
 from .feedback import get_feedback, Reviewer
-from .models import
+from .models import (
+    Chunk,
+    Document,
+    Feedback,
+    Group,
+    Note,
+    Reference,
+    User,
+    VECTOR_BACKEND,
+    cosine_similarity,
+)
 from .utils import chunk_text, log_activity
 
 
@@ -159,22 +169,41 @@ def process_text(
         Chunk.note_id == note_id,
     ).first()
 
+    references: list[Chunk] = []
     if generate_feedback and existing_chunk:
         doc_group_ids = [g.group_id for g in doc.groups]
-
-
-
-
-
-
-            Document.
-
-
+        target_embedding = existing_chunk.embedding
+
+        if target_embedding is not None:
+            base_query = (
+                session.query(Chunk)
+                .join(Chunk.document)
+                .join(Document.groups)
+                .filter(
+                    Document.is_published.is_(True),
+                    Document.document_id != doc.document_id,
+                    Chunk.chunk_type == "document",
+                    Group.group_id.in_(doc_group_ids),
+                )
             )
-
-
-
-
+
+            if VECTOR_BACKEND == "pgvector":
+                references = (
+                    base_query.order_by(
+                        Chunk.embedding.cosine_distance(existing_chunk.embedding)
+                    )
+                    .limit(3)
+                    .all()
+                )
+            else:
+                candidates = base_query.all()
+                scored: list[tuple[float, Chunk]] = []
+                for candidate in candidates:
+                    score = cosine_similarity(candidate.embedding, target_embedding)
+                    if score is not None:
+                        scored.append((score, candidate))
+                scored.sort(key=lambda item: item[0], reverse=True)
+                references = [chunk for _, chunk in scored[:3]]
 
     sql_references: list[Reference] = []
     for ref_chunk in references:
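When the JSON backend is active, similar chunks are ranked in Python rather than in SQL. A small demonstration of that scoring loop using `cosine_similarity` from the models module (the candidate data here is made up for illustration):

```python
from compair_core.compair.models import cosine_similarity

target = [1.0, 0.0, 0.0]
candidates = {"a": [0.9, 0.1, 0.0], "b": [0.0, 1.0, 0.0], "c": None}

scored = []
for name, emb in candidates.items():
    score = cosine_similarity(emb, target)
    if score is not None:  # None covers missing or mismatched embeddings
        scored.append((score, name))
scored.sort(key=lambda item: item[0], reverse=True)

print([name for _, name in scored[:3]])  # ['a', 'b']; 'c' is dropped
```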
{compair_core-0.3.13 → compair_core-0.3.15}/compair_core/compair/models.py

@@ -5,9 +5,15 @@ import hashlib
 import os
 import secrets
 from datetime import datetime, timezone
+from math import sqrt
+from typing import Sequence
 from uuid import uuid4
 
-from pgvector.sqlalchemy import Vector
+try:  # Optional: only required when using pgvector backend
+    from pgvector.sqlalchemy import Vector
+except ImportError:  # pragma: no cover - optional dependency in core
+    Vector = None  # type: ignore[assignment]
+
 from sqlalchemy import (
     Boolean,
     Column,
@@ -15,6 +21,7 @@ from sqlalchemy import (
     ForeignKey,
     Identity,
     Integer,
+    JSON,
     String,
     Table,
     Text,
@@ -27,6 +34,69 @@ from sqlalchemy.orm import (
     relationship,
 )
 
+_EDITION = os.getenv("COMPAIR_EDITION", "core").lower()
+_DEFAULT_DIM = 1536 if _EDITION == "cloud" else 384
+_DIM_ENV = (
+    os.getenv("COMPAIR_EMBEDDING_DIM")
+    or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
+    or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
+    or str(_DEFAULT_DIM)
+)
+
+try:
+    EMBEDDING_DIMENSION = int(_DIM_ENV)
+except ValueError:  # pragma: no cover - invalid configuration
+    EMBEDDING_DIMENSION = _DEFAULT_DIM
+
+
+def _detect_vector_backend() -> str:
+    explicit = os.getenv("COMPAIR_VECTOR_BACKEND")
+    if explicit:
+        return explicit.lower()
+
+    db = os.getenv("DB")
+    db_user = os.getenv("DB_USER")
+    db_passw = os.getenv("DB_PASSW")
+    db_url = os.getenv("DB_URL")
+    database_url = os.getenv("DATABASE_URL", "")
+
+    if all([db, db_user, db_passw, db_url]):
+        return "pgvector"
+    if database_url.lower().startswith(("postgres://", "postgresql://")):
+        return "pgvector"
+    return "json"
+
+
+VECTOR_BACKEND = _detect_vector_backend()
+
+
+def _embedding_column():
+    if VECTOR_BACKEND == "pgvector":
+        if Vector is None:
+            raise RuntimeError(
+                "pgvector is required when COMPAIR_VECTOR_BACKEND is set to 'pgvector'."
+            )
+        return mapped_column(
+            Vector(EMBEDDING_DIMENSION),
+            nullable=True,
+            default=None,
+        )
+    # Store embeddings as JSON arrays (works across SQLite/Postgres without pgvector)
+    return mapped_column(JSON, nullable=True, default=None)
+
+
+def cosine_similarity(vec1: Sequence[float] | None, vec2: Sequence[float] | None) -> float | None:
+    if not vec1 or not vec2:
+        return None
+    if len(vec1) != len(vec2):
+        return None
+    dot = sum(a * b for a, b in zip(vec1, vec2))
+    norm1 = sqrt(sum(a * a for a in vec1))
+    norm2 = sqrt(sum(b * b for b in vec2))
+    if norm1 == 0 or norm2 == 0:
+        return None
+    return dot / (norm1 * norm2)
+
 
 class Base(DeclarativeBase, MappedAsDataclass):
     pass
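The detection rules above can be summarized in a standalone sketch (a re-implementation for illustration; the package itself uses the module-level `_detect_vector_backend` shown in the diff):

```python
def detect(env: dict[str, str]) -> str:
    # Explicit setting always wins.
    if env.get("COMPAIR_VECTOR_BACKEND"):
        return env["COMPAIR_VECTOR_BACKEND"].lower()
    # The legacy split variables imply a pgvector-capable PostgreSQL.
    if all(env.get(k) for k in ("DB", "DB_USER", "DB_PASSW", "DB_URL")):
        return "pgvector"
    # A PostgreSQL DATABASE_URL also selects pgvector.
    if env.get("DATABASE_URL", "").lower().startswith(("postgres://", "postgresql://")):
        return "pgvector"
    return "json"

assert detect({"DATABASE_URL": "postgresql://u:p@host/db"}) == "pgvector"
assert detect({"DATABASE_URL": "sqlite:///compair.db"}) == "json"
assert detect({"COMPAIR_VECTOR_BACKEND": "json", "DATABASE_URL": "postgresql://x"}) == "json"
```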
@@ -213,10 +283,10 @@ class Document(BaseObject):
     doc_type: Mapped[str]
     datetime_created: Mapped[datetime]
     datetime_modified: Mapped[datetime]
+    embedding: Mapped[list[float] | None] = _embedding_column()
     file_key: Mapped[str | None] = mapped_column(String, nullable=True, default=None)
     image_key: Mapped[str | None] = mapped_column(String, nullable=True, default=None)
     is_published: Mapped[bool] = mapped_column(Boolean, default=False)
-    embedding = mapped_column(Vector(1536))
 
     user = relationship("User", back_populates="documents")
     groups = relationship("Group", secondary="document_to_group", back_populates="documents")
@@ -249,8 +319,8 @@ class Note(Base):
     author_id: Mapped[str] = mapped_column(ForeignKey("user.user_id", ondelete="CASCADE"), index=True)
     group_id: Mapped[str | None] = mapped_column(ForeignKey("group.group_id", ondelete="CASCADE"), index=True, nullable=True)
     content: Mapped[str] = mapped_column(Text)
+    embedding: Mapped[list[float] | None] = _embedding_column()
     datetime_created: Mapped[datetime] = mapped_column(default=datetime.now(timezone.utc))
-    embedding = mapped_column(Vector(1536))
 
     document = relationship("Document", back_populates="notes")
     author = relationship("User", back_populates="notes")
@@ -279,7 +349,7 @@ class Chunk(Base):
     document_id: Mapped[str | None] = mapped_column(ForeignKey("document.document_id", ondelete="CASCADE"), index=True, nullable=True)
     note_id: Mapped[str | None] = mapped_column(ForeignKey("note.note_id", ondelete="CASCADE"), index=True, nullable=True)
     chunk_type: Mapped[str] = mapped_column(String(16), default="document")
-    embedding = mapped_column(Vector(1536))
+    embedding: Mapped[list[float] | None] = _embedding_column()
 
     document = relationship("Document", back_populates="chunks")
     note = relationship("Note", back_populates="chunks")
compair_core-0.3.15/compair_core/server/local_model/app.py (new file)

@@ -0,0 +1,87 @@
+"""Minimal FastAPI application serving local embedding and generation endpoints."""
+from __future__ import annotations
+
+import hashlib
+import os
+from typing import List
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+app = FastAPI(title="Compair Local Model", version="0.1.0")
+
+_DEFAULT_DIM = 384
+_DIM_ENV = (
+    os.getenv("COMPAIR_EMBEDDING_DIM")
+    or os.getenv("COMPAIR_EMBEDDING_DIMENSION")
+    or os.getenv("COMPAIR_LOCAL_EMBED_DIM")
+    or str(_DEFAULT_DIM)
+)
+try:
+    EMBED_DIMENSION = int(_DIM_ENV)
+except ValueError:  # pragma: no cover - invalid configuration
+    EMBED_DIMENSION = _DEFAULT_DIM
+
+
+def _hash_embedding(text: str, dimension: int = EMBED_DIMENSION) -> List[float]:
+    if not text:
+        text = " "
+    digest = hashlib.sha256(text.encode("utf-8", "ignore")).digest()
+    vector: List[float] = []
+    while len(vector) < dimension:
+        for byte in digest:
+            vector.append((byte / 255.0) * 2 - 1)
+            if len(vector) == dimension:
+                break
+        digest = hashlib.sha256(digest).digest()
+    return vector
+
+
+class EmbedRequest(BaseModel):
+    text: str
+
+
+class EmbedResponse(BaseModel):
+    embedding: List[float]
+
+
+class GenerateRequest(BaseModel):
+    # Legacy format used by the CLI shim
+    system: str | None = None
+    prompt: str | None = None
+    verbosity: str | None = None
+
+    # Core API payload (document + references)
+    document: str | None = None
+    references: List[str] | None = None
+    length_instruction: str | None = None
+
+
+class GenerateResponse(BaseModel):
+    feedback: str
+
+
+@app.post("/embed", response_model=EmbedResponse)
+def embed(request: EmbedRequest) -> EmbedResponse:
+    return EmbedResponse(embedding=_hash_embedding(request.text))
+
+
+@app.post("/generate", response_model=GenerateResponse)
+def generate(request: GenerateRequest) -> GenerateResponse:
+    # Determine the main text input (document or prompt)
+    text_input = request.document or request.prompt or ""
+    text_input = text_input.strip()
+
+    if not text_input:
+        return GenerateResponse(feedback="NONE")
+
+    first_sentence = text_input.split("\n", 1)[0][:200]
+    verbosity = request.length_instruction or request.verbosity or "brief response"
+    ref_snippet = ""
+    if request.references:
+        top_ref = (request.references[0] or "").strip()
+        if top_ref:
+            ref_snippet = f" Reference: {top_ref[:160]}"
+
+    feedback = f"[local-feedback] {verbosity}: {first_sentence}{ref_snippet}".strip()
+    return GenerateResponse(feedback=feedback or "NONE")
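Assuming the service is reachable at http://localhost:9000 (the in-code default is http://local-model:9000, a Docker-style hostname, so the localhost port mapping here is an assumption), the two endpoints can be smoke-tested like this:

```python
import requests

BASE = "http://localhost:9000"  # assumption: port mapping for the local-model container

emb = requests.post(f"{BASE}/embed", json={"text": "hello"}, timeout=10).json()
print(len(emb["embedding"]))  # EMBED_DIMENSION: 384 unless overridden via env

gen = requests.post(
    f"{BASE}/generate",
    json={"document": "First line of the draft.", "references": ["A model reference."]},
    timeout=10,
).json()
print(gen["feedback"])
# [local-feedback] brief response: First line of the draft. Reference: A model reference.
```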
{compair_core-0.3.13 → compair_core-0.3.15}/compair_core.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compair-core
-Version: 0.3.13
+Version: 0.3.15
 Summary: Open-source foundation of the Compair collaboration platform.
 Author: RocketResearch, Inc.
 License: MIT

@@ -92,6 +92,10 @@ Key environment variables for the core edition:
 - `COMPAIR_REQUIRE_AUTHENTICATION` (`true`) – set to `false` to run the API in single-user mode without login or account management. When disabled, Compair auto-provisions a local user, group, and long-lived session token so you can upload documents immediately.
 - `COMPAIR_SINGLE_USER_USERNAME` / `COMPAIR_SINGLE_USER_NAME` – override the email-style username and display name that are used for the auto-provisioned local user in single-user mode.
 - `COMPAIR_INCLUDE_LEGACY_ROUTES` (`false`) – opt-in to the full legacy API surface (used by the hosted product) when running the core edition. Leave unset to expose only the streamlined single-user endpoints in Swagger.
+- `COMPAIR_EMBEDDING_DIM` – force the embedding vector size stored in the database (defaults to 384 for core, 1536 for cloud). Keep this in sync with whichever embedding model you configure.
+- `COMPAIR_VECTOR_BACKEND` (`auto`) – set to `pgvector` when running against PostgreSQL with the pgvector extension, or `json` to store embeddings as JSON (the default for SQLite deployments).
+- `COMPAIR_GENERATION_PROVIDER` (`local`) – choose how feedback is produced. Options: `local` (call the bundled FastAPI service), `openai` (use ChatGPT-compatible APIs with an API key), or `fallback` (skip generation and surface similar references only).
+- `COMPAIR_OPENAI_API_KEY` / `COMPAIR_OPENAI_MODEL` – when using the OpenAI provider, supply your API key and optional model name (defaults to `gpt-4o-mini`). The fallback kicks in automatically if the key or SDK is unavailable.
 
 See `compair_core/server/settings.py` for the full settings surface.
compair_core-0.3.13/compair_core/compair/feedback.py (file removed)

@@ -1,79 +0,0 @@
-from __future__ import annotations
-
-import os
-import requests
-from typing import Any
-
-from .logger import log_event
-from .models import Document, User
-
-try:
-    from compair_cloud.feedback import Reviewer as CloudReviewer  # type: ignore
-    from compair_cloud.feedback import get_feedback as cloud_get_feedback  # type: ignore
-except (ImportError, ModuleNotFoundError):
-    CloudReviewer = None  # type: ignore
-    cloud_get_feedback = None  # type: ignore
-
-
-class Reviewer:
-    """Edition-aware wrapper that falls back to the local feedback endpoint."""
-
-    def __init__(self) -> None:
-        self.edition = os.getenv("COMPAIR_EDITION", "core").lower()
-        self._cloud_impl = None
-        if self.edition == "cloud" and CloudReviewer is not None:
-            self._cloud_impl = CloudReviewer()
-        else:
-            self.client = None
-            self.model = os.getenv("COMPAIR_LOCAL_GENERATION_MODEL", "local-feedback")
-            base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
-            route = os.getenv("COMPAIR_LOCAL_GENERATION_ROUTE", "/generate")
-            self.endpoint = f"{base_url.rstrip('/')}{route}"
-
-    @property
-    def is_cloud(self) -> bool:
-        return self._cloud_impl is not None
-
-
-def _fallback_feedback(text: str, references: list[Any]) -> str:
-    if not references:
-        return "NONE"
-    top_ref = references[0]
-    snippet = getattr(top_ref, "content", "") or ""
-    snippet = snippet.replace("\n", " ").strip()[:200]
-    if not snippet:
-        return "NONE"
-    return f"Check alignment with this reference: {snippet}"
-
-
-def get_feedback(
-    reviewer: Reviewer,
-    doc: Document,
-    text: str,
-    references: list[Any],
-    user: User,
-) -> str:
-    if reviewer.is_cloud and cloud_get_feedback is not None:
-        return cloud_get_feedback(reviewer._cloud_impl, doc, text, references, user)  # type: ignore[arg-type]
-
-    payload = {
-        "document": text,
-        "references": [getattr(ref, "content", "") for ref in references],
-        "length_instruction": {
-            "Brief": "1–2 short sentences",
-            "Detailed": "A couple short paragraphs",
-            "Verbose": "As thorough as reasonably possible without repeating information",
-        }.get(user.preferred_feedback_length, "1–2 short sentences"),
-    }
-
-    try:
-        response = requests.post(reviewer.endpoint, json=payload, timeout=30)
-        response.raise_for_status()
-        data = response.json()
-        feedback = data.get("feedback")
-        if feedback:
-            return feedback
-    except Exception as exc:  # pragma: no cover - network failures stay graceful
-        log_event("local_feedback_failed", error=str(exc))
-
-    return _fallback_feedback(text, references)
compair_core-0.3.13/compair_core/server/local_model/app.py (file removed)

@@ -1,62 +0,0 @@
-"""Minimal FastAPI application serving local embedding and generation endpoints."""
-from __future__ import annotations
-
-import hashlib
-from typing import List
-
-from fastapi import FastAPI
-from pydantic import BaseModel
-
-app = FastAPI(title="Compair Local Model", version="0.1.0")
-
-EMBED_DIMENSION = 384
-
-
-def _hash_embedding(text: str, dimension: int = EMBED_DIMENSION) -> List[float]:
-    if not text:
-        text = " "
-    digest = hashlib.sha256(text.encode("utf-8", "ignore")).digest()
-    vector: List[float] = []
-    while len(vector) < dimension:
-        for byte in digest:
-            vector.append((byte / 255.0) * 2 - 1)
-            if len(vector) == dimension:
-                break
-        digest = hashlib.sha256(digest).digest()
-    return vector
-
-
-class EmbedRequest(BaseModel):
-    text: str
-
-
-class EmbedResponse(BaseModel):
-    embedding: List[float]
-
-
-class GenerateRequest(BaseModel):
-    system: str | None = None
-    prompt: str
-    verbosity: str | None = None
-
-
-class GenerateResponse(BaseModel):
-    text: str
-
-
-@app.post("/embed", response_model=EmbedResponse)
-def embed(request: EmbedRequest) -> EmbedResponse:
-    return EmbedResponse(embedding=_hash_embedding(request.text))
-
-
-@app.post("/generate", response_model=GenerateResponse)
-def generate(request: GenerateRequest) -> GenerateResponse:
-    prompt = request.prompt.strip()
-    if not prompt:
-        return GenerateResponse(text="NONE")
-
-    first_sentence = prompt.split("\n", 1)[0][:200]
-    verbosity = request.verbosity or "default"
-    return GenerateResponse(
-        text=f"[local-{verbosity}] Key takeaway: {first_sentence}"
-    )