compair_core-0.4.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compair_core/__init__.py +8 -0
- compair_core/api.py +3598 -0
- compair_core/compair/__init__.py +57 -0
- compair_core/compair/celery_app.py +31 -0
- compair_core/compair/default_groups.py +14 -0
- compair_core/compair/embeddings.py +141 -0
- compair_core/compair/feedback.py +368 -0
- compair_core/compair/logger.py +29 -0
- compair_core/compair/main.py +276 -0
- compair_core/compair/models.py +453 -0
- compair_core/compair/schema.py +146 -0
- compair_core/compair/tasks.py +106 -0
- compair_core/compair/utils.py +42 -0
- compair_core/compair_email/__init__.py +0 -0
- compair_core/compair_email/email.py +6 -0
- compair_core/compair_email/email_core.py +15 -0
- compair_core/compair_email/templates.py +6 -0
- compair_core/compair_email/templates_core.py +32 -0
- compair_core/db.py +64 -0
- compair_core/server/__init__.py +0 -0
- compair_core/server/app.py +97 -0
- compair_core/server/deps.py +77 -0
- compair_core/server/local_model/__init__.py +1 -0
- compair_core/server/local_model/app.py +87 -0
- compair_core/server/local_model/ocr.py +107 -0
- compair_core/server/providers/__init__.py +0 -0
- compair_core/server/providers/console_mailer.py +9 -0
- compair_core/server/providers/contracts.py +66 -0
- compair_core/server/providers/http_ocr.py +60 -0
- compair_core/server/providers/local_storage.py +28 -0
- compair_core/server/providers/noop_analytics.py +7 -0
- compair_core/server/providers/noop_billing.py +30 -0
- compair_core/server/providers/noop_ocr.py +10 -0
- compair_core/server/routers/__init__.py +0 -0
- compair_core/server/routers/capabilities.py +46 -0
- compair_core/server/settings.py +66 -0
- compair_core-0.4.12.dist-info/METADATA +136 -0
- compair_core-0.4.12.dist-info/RECORD +41 -0
- compair_core-0.4.12.dist-info/WHEEL +5 -0
- compair_core-0.4.12.dist-info/licenses/LICENSE +674 -0
- compair_core-0.4.12.dist-info/top_level.txt +1 -0
compair_core/compair/main.py
@@ -0,0 +1,276 @@
from __future__ import annotations

import logging
import os
from datetime import datetime, timedelta, timezone
from typing import Mapping

import Levenshtein
from sqlalchemy import select
from sqlalchemy.orm.attributes import get_history
from sqlalchemy.orm import Session as SASession

from .embeddings import create_embedding, Embedder
from .feedback import get_feedback, Reviewer
from .models import (
    Chunk,
    Document,
    Feedback,
    Group,
    Note,
    Reference,
    User,
    VECTOR_BACKEND,
    cosine_similarity,
)
from .utils import chunk_text, log_activity


def process_document(
    user: User,
    session: SASession,
    embedder: Embedder,
    reviewer: Reviewer,
    doc: Document,
    generate_feedback: bool = True,
) -> Mapping[str, int]:
    new = False

    # Chunks from the previous revision, recovered from the ORM change history.
    prev_content = get_history(doc, "content").deleted
    prev_chunks: list[str] = []
    if prev_content:
        prev_chunks = chunk_text(prev_content[-1])

    # Optional cap on feedback generated per document per 24 hours.
    feedback_limit_env = os.getenv("COMPAIR_CORE_FEEDBACK_LIMIT")
    try:
        feedback_limit = int(feedback_limit_env) if feedback_limit_env else None
    except ValueError:
        feedback_limit = None
    time_cutoff = datetime.now(timezone.utc) - timedelta(hours=24)

    recent_feedback_count = session.query(Feedback).filter(
        Feedback.source_chunk_id.in_(
            session.query(Chunk.chunk_id).filter(Chunk.document_id == doc.document_id)
        ),
        Feedback.timestamp >= time_cutoff,
    ).count()

    content = doc.content
    chunks = chunk_text(content)
    new_chunks = list(set(chunks) - set(prev_chunks))

    prioritized_chunk_indices: list[int] = []
    if generate_feedback:
        prioritized_chunk_indices = detect_significant_edits(prev_chunks=prev_chunks, new_chunks=new_chunks)

    if feedback_limit is None:
        indices_to_generate_feedback = prioritized_chunk_indices
    else:
        num_chunks_can_generate_feedback = max((feedback_limit - recent_feedback_count), 0)
        indices_to_generate_feedback = prioritized_chunk_indices[:num_chunks_can_generate_feedback]

    for i, chunk in enumerate(new_chunks):
        should_generate_feedback = i in indices_to_generate_feedback
        process_text(
            session=session,
            embedder=embedder,
            reviewer=reviewer,
            doc=doc,
            text=chunk,
            generate_feedback=should_generate_feedback,
        )

    # Drop chunks that no longer appear in the current revision.
    remove_chunks = set(prev_chunks) - set(chunks)
    for chunk in remove_chunks:
        remove_text(session=session, text=chunk, document_id=doc.document_id)

    if doc.groups:
        log_activity(
            session=session,
            user_id=doc.author_id,
            group_id=doc.groups[0].group_id,
            action="update",
            object_id=doc.document_id,
            object_name=doc.title,
            object_type="document",
        )

    session.commit()
    return {"new": new}


def detect_significant_edits(
    prev_chunks: list[str],
    new_chunks: list[str],
    threshold: float = 0.5,
) -> list[int]:
    prev_set = set(prev_chunks)
    new_set = set(new_chunks)

    unchanged_chunks = prev_set & new_set
    new_chunks_to_check = new_set - unchanged_chunks

    # A chunk is "significant" when its best similarity to any previous chunk
    # falls below the threshold.
    process_chunks: list[str] = []
    for new_chunk in new_chunks_to_check:
        best_match = max((Levenshtein.ratio(new_chunk, prev_chunk) for prev_chunk in prev_chunks), default=0.0)
        if best_match < threshold:
            process_chunks.append(new_chunk)

    prioritized_chunks = prioritize_chunks(process_chunks)
    return [new_chunks.index(new_chunk) for new_chunk in prioritized_chunks if new_chunk in new_chunks]


def prioritize_chunks(chunks: list[str], limit: int | None = None) -> list[str]:
    # Longest chunks first; ties broken by original position.
    limit = limit or len(chunks)
    indexed_chunks = [(chunk, idx) for idx, chunk in enumerate(chunks)]
    indexed_chunks.sort(key=lambda x: (-len(x[0]), x[1]))
    return [chunk for chunk, _ in indexed_chunks[:limit]]


def process_text(
    session: SASession,
    embedder: Embedder,
    reviewer: Reviewer,
    doc: Document,
    text: str,
    generate_feedback: bool = True,
    note: Note | None = None,
) -> None:
    logger = logging.getLogger(__name__)
    chunk_hash = hash(text)

    chunk_type = "note" if note else "document"
    note_id = note.note_id if note else None

    existing_chunks = session.query(Chunk).filter(
        Chunk.hash == str(chunk_hash),
        Chunk.document_id == doc.document_id,
        Chunk.chunk_type == chunk_type,
        Chunk.note_id == note_id,
    )

    user = session.query(User).filter(User.user_id == doc.author_id).first()
    if existing_chunks.first():
        for chunk in existing_chunks:
            if chunk.embedding is None:
                embedding = create_embedding(embedder, text, user=user)
                existing_chunks.update({"embedding": embedding})
                session.commit()
    else:
        chunk = Chunk(
            hash=str(chunk_hash),  # stored as str so the lookups that compare against str(chunk_hash) can match
            document_id=doc.document_id,
            note_id=note_id,
            chunk_type=chunk_type,
            content=text,
        )
        embedding = create_embedding(embedder, text, user=user)
        chunk.embedding = embedding
        session.add(chunk)
        session.commit()
        existing_chunk = chunk
    existing_chunk = session.query(Chunk).filter(
        Chunk.document_id == doc.document_id,
        Chunk.hash == str(chunk_hash),
        Chunk.chunk_type == chunk_type,
        Chunk.note_id == note_id,
    ).first()

    references: list[Chunk] = []
    if generate_feedback and existing_chunk:
        doc_group_ids = [g.group_id for g in doc.groups]
        target_embedding = existing_chunk.embedding

        if target_embedding is not None:
            # Candidate chunks: published documents (other than this one) that
            # share at least one group with the current document.
            base_query = (
                session.query(Chunk)
                .join(Chunk.document)
                .join(Document.groups)
                .filter(
                    Document.is_published.is_(True),
                    Document.document_id != doc.document_id,
                    Chunk.chunk_type == "document",
                    Group.group_id.in_(doc_group_ids),
                )
            )

            if VECTOR_BACKEND == "pgvector":
                references = (
                    base_query.order_by(
                        Chunk.embedding.cosine_distance(existing_chunk.embedding)
                    )
                    .limit(3)
                    .all()
                )
            else:
                # Fallback: score candidates in Python when pgvector is unavailable.
                candidates = base_query.all()
                scored: list[tuple[float, Chunk]] = []
                for candidate in candidates:
                    score = cosine_similarity(candidate.embedding, target_embedding)
                    if score is not None:
                        scored.append((score, candidate))
                scored.sort(key=lambda item: item[0], reverse=True)
                references = [chunk for _, chunk in scored[:3]]

        sql_references: list[Reference] = []
        for ref_chunk in references:
            sql_references.append(
                Reference(
                    source_chunk_id=existing_chunk.chunk_id,
                    reference_type="document",
                    reference_document_id=ref_chunk.document_id,
                    reference_note_id=None,
                )
            )

        if sql_references:
            session.add_all(sql_references)
            session.commit()

        feedback = get_feedback(reviewer, doc, text, references, user)
        if feedback != "NONE":
            sql_feedback = Feedback(
                source_chunk_id=existing_chunk.chunk_id,
                feedback=feedback,
                model=reviewer.model,
            )
            session.add(sql_feedback)
            session.commit()


def remove_text(session: SASession, text: str, document_id: str) -> None:
    chunks = session.query(Chunk).filter(
        Chunk.document_id == document_id,
        Chunk.content == text,
    )
    chunks.delete(synchronize_session=False)
    session.commit()


def get_all_chunks_for_document(session: SASession, doc: Document) -> list[Chunk]:
    doc_chunks = session.query(Chunk).filter(Chunk.document_id == doc.document_id).all()
    note_chunks: list[Chunk] = []
    notes = session.query(Note).filter(Note.document_id == doc.document_id).all()
    for note in notes:
        note_text_chunks = chunk_text(note.content)
        for text in note_text_chunks:
            chunk_hash = hash(text)
            existing = session.query(Chunk).filter(
                Chunk.hash == str(chunk_hash),
                Chunk.document_id == doc.document_id,
                Chunk.content == text,
            ).first()
            if not existing:
                embedding = create_embedding(Embedder(), text, user=doc.author_id)
                note_chunk = Chunk(
                    hash=str(chunk_hash),
                    document_id=doc.document_id,
                    content=text,
                    embedding=embedding,
                )
                session.add(note_chunk)
                session.commit()
                note_chunks.append(note_chunk)
            else:
                note_chunks.append(existing)
    return doc_chunks + note_chunks