compair_core-0.4.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. compair_core/__init__.py +8 -0
  2. compair_core/api.py +3598 -0
  3. compair_core/compair/__init__.py +57 -0
  4. compair_core/compair/celery_app.py +31 -0
  5. compair_core/compair/default_groups.py +14 -0
  6. compair_core/compair/embeddings.py +141 -0
  7. compair_core/compair/feedback.py +368 -0
  8. compair_core/compair/logger.py +29 -0
  9. compair_core/compair/main.py +276 -0
  10. compair_core/compair/models.py +453 -0
  11. compair_core/compair/schema.py +146 -0
  12. compair_core/compair/tasks.py +106 -0
  13. compair_core/compair/utils.py +42 -0
  14. compair_core/compair_email/__init__.py +0 -0
  15. compair_core/compair_email/email.py +6 -0
  16. compair_core/compair_email/email_core.py +15 -0
  17. compair_core/compair_email/templates.py +6 -0
  18. compair_core/compair_email/templates_core.py +32 -0
  19. compair_core/db.py +64 -0
  20. compair_core/server/__init__.py +0 -0
  21. compair_core/server/app.py +97 -0
  22. compair_core/server/deps.py +77 -0
  23. compair_core/server/local_model/__init__.py +1 -0
  24. compair_core/server/local_model/app.py +87 -0
  25. compair_core/server/local_model/ocr.py +107 -0
  26. compair_core/server/providers/__init__.py +0 -0
  27. compair_core/server/providers/console_mailer.py +9 -0
  28. compair_core/server/providers/contracts.py +66 -0
  29. compair_core/server/providers/http_ocr.py +60 -0
  30. compair_core/server/providers/local_storage.py +28 -0
  31. compair_core/server/providers/noop_analytics.py +7 -0
  32. compair_core/server/providers/noop_billing.py +30 -0
  33. compair_core/server/providers/noop_ocr.py +10 -0
  34. compair_core/server/routers/__init__.py +0 -0
  35. compair_core/server/routers/capabilities.py +46 -0
  36. compair_core/server/settings.py +66 -0
  37. compair_core-0.4.12.dist-info/METADATA +136 -0
  38. compair_core-0.4.12.dist-info/RECORD +41 -0
  39. compair_core-0.4.12.dist-info/WHEEL +5 -0
  40. compair_core-0.4.12.dist-info/licenses/LICENSE +674 -0
  41. compair_core-0.4.12.dist-info/top_level.txt +1 -0
compair_core/compair/main.py
@@ -0,0 +1,276 @@
+ from __future__ import annotations
+
+ import logging
+ import os
+ from datetime import datetime, timedelta, timezone
+ from typing import Mapping
+
+ import Levenshtein
+ from sqlalchemy import select
+ from sqlalchemy.orm.attributes import get_history
+ from sqlalchemy.orm import Session as SASession
+
+ from .embeddings import create_embedding, Embedder
+ from .feedback import get_feedback, Reviewer
+ from .models import (
+     Chunk,
+     Document,
+     Feedback,
+     Group,
+     Note,
+     Reference,
+     User,
+     VECTOR_BACKEND,
+     cosine_similarity,
+ )
+ from .utils import chunk_text, log_activity
+
+
+ def process_document(
+     user: User,
+     session: SASession,
+     embedder: Embedder,
+     reviewer: Reviewer,
+     doc: Document,
+     generate_feedback: bool = True,
+ ) -> Mapping[str, int]:
+     new = False
+
+     prev_content = get_history(doc, "content").deleted
+     prev_chunks: list[str] = []
+     if prev_content:
+         prev_chunks = chunk_text(prev_content[-1])
+
+     feedback_limit_env = os.getenv("COMPAIR_CORE_FEEDBACK_LIMIT")
+     try:
+         feedback_limit = int(feedback_limit_env) if feedback_limit_env else None
+     except ValueError:
+         feedback_limit = None
+     time_cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
+
+     recent_feedback_count = session.query(Feedback).filter(
+         Feedback.source_chunk_id.in_(
+             session.query(Chunk.chunk_id).filter(Chunk.document_id == doc.document_id)
+         ),
+         Feedback.timestamp >= time_cutoff,
+     ).count()
+
+     content = doc.content
+     chunks = chunk_text(content)
+     new_chunks = list(set(chunks) - set(prev_chunks))
+
+     prioritized_chunk_indices: list[int] = []
+     if generate_feedback:
+         prioritized_chunk_indices = detect_significant_edits(prev_chunks=prev_chunks, new_chunks=new_chunks)
+
+     if feedback_limit is None:
+         indices_to_generate_feedback = prioritized_chunk_indices
+     else:
+         num_chunks_can_generate_feedback = max((feedback_limit - recent_feedback_count), 0)
+         indices_to_generate_feedback = prioritized_chunk_indices[:num_chunks_can_generate_feedback]
+
+     for i, chunk in enumerate(new_chunks):
+         should_generate_feedback = i in indices_to_generate_feedback
+         process_text(
+             session=session,
+             embedder=embedder,
+             reviewer=reviewer,
+             doc=doc,
+             text=chunk,
+             generate_feedback=should_generate_feedback,
+         )
+
+     remove_chunks = set(prev_chunks) - set(chunks)
+     for chunk in remove_chunks:
+         remove_text(session=session, text=chunk, document_id=doc.document_id)
+
+     if doc.groups:
+         log_activity(
+             session=session,
+             user_id=doc.author_id,
+             group_id=doc.groups[0].group_id,
+             action="update",
+             object_id=doc.document_id,
+             object_name=doc.title,
+             object_type="document",
+         )
+
+     session.commit()
+     return {"new": new}
+
+
+ def detect_significant_edits(
+     prev_chunks: list[str],
+     new_chunks: list[str],
+     threshold: float = 0.5,
+ ) -> list[int]:
+     prev_set = set(prev_chunks)
+     new_set = set(new_chunks)
+
+     unchanged_chunks = prev_set & new_set
+     new_chunks_to_check = new_set - unchanged_chunks
+
+     process_chunks: list[str] = []
+     for new_chunk in new_chunks_to_check:
+         best_match = max((Levenshtein.ratio(new_chunk, prev_chunk) for prev_chunk in prev_chunks), default=0.0)
+         if best_match < threshold:
+             process_chunks.append(new_chunk)
+
+     prioritized_chunks = prioritize_chunks(process_chunks)
+     return [new_chunks.index(new_chunk) for new_chunk in prioritized_chunks if new_chunk in new_chunks]
+
+
+ def prioritize_chunks(chunks: list[str], limit: int | None = None) -> list[str]:
+     limit = limit or len(chunks)
+     indexed_chunks = [(chunk, idx) for idx, chunk in enumerate(chunks)]
+     indexed_chunks.sort(key=lambda x: (-len(x[0]), x[1]))
+     return [chunk for chunk, _ in indexed_chunks[:limit]]
+
+
+ def process_text(
+     session: SASession,
+     embedder: Embedder,
+     reviewer: Reviewer,
+     doc: Document,
+     text: str,
+     generate_feedback: bool = True,
+     note: Note | None = None,
+ ) -> None:
+     logger = logging.getLogger(__name__)
+     chunk_hash = hash(text)
+
+     chunk_type = "note" if note else "document"
+     note_id = note.note_id if note else None
+
+     existing_chunks = session.query(Chunk).filter(
+         Chunk.hash == str(chunk_hash),
+         Chunk.document_id == doc.document_id,
+         Chunk.chunk_type == chunk_type,
+         Chunk.note_id == note_id,
+     )
+
+     user = session.query(User).filter(User.user_id == doc.author_id).first()
+     if existing_chunks.first():
+         for chunk in existing_chunks:
+             if chunk.embedding is None:
+                 embedding = create_embedding(embedder, text, user=user)
+                 existing_chunks.update({"embedding": embedding})
+                 session.commit()
+     else:
+         chunk = Chunk(
+             hash=chunk_hash,
+             document_id=doc.document_id,
+             note_id=note_id,
+             chunk_type=chunk_type,
+             content=text,
+         )
+         embedding = create_embedding(embedder, text, user=user)
+         chunk.embedding = embedding
+         session.add(chunk)
+         session.commit()
+         existing_chunk = chunk
+     existing_chunk = session.query(Chunk).filter(
+         Chunk.document_id == doc.document_id,
+         Chunk.hash == str(chunk_hash),
+         Chunk.chunk_type == chunk_type,
+         Chunk.note_id == note_id,
+     ).first()
+
+     references: list[Chunk] = []
+     if generate_feedback and existing_chunk:
+         doc_group_ids = [g.group_id for g in doc.groups]
+         target_embedding = existing_chunk.embedding
+
+         if target_embedding is not None:
+             base_query = (
+                 session.query(Chunk)
+                 .join(Chunk.document)
+                 .join(Document.groups)
+                 .filter(
+                     Document.is_published.is_(True),
+                     Document.document_id != doc.document_id,
+                     Chunk.chunk_type == "document",
+                     Group.group_id.in_(doc_group_ids),
+                 )
+             )
+
+             if VECTOR_BACKEND == "pgvector":
+                 references = (
+                     base_query.order_by(
+                         Chunk.embedding.cosine_distance(existing_chunk.embedding)
+                     )
+                     .limit(3)
+                     .all()
+                 )
+             else:
+                 candidates = base_query.all()
+                 scored: list[tuple[float, Chunk]] = []
+                 for candidate in candidates:
+                     score = cosine_similarity(candidate.embedding, target_embedding)
+                     if score is not None:
+                         scored.append((score, candidate))
+                 scored.sort(key=lambda item: item[0], reverse=True)
+                 references = [chunk for _, chunk in scored[:3]]
+
+         sql_references: list[Reference] = []
+         for ref_chunk in references:
+             sql_references.append(
+                 Reference(
+                     source_chunk_id=existing_chunk.chunk_id,
+                     reference_type="document",
+                     reference_document_id=ref_chunk.document_id,
+                     reference_note_id=None,
+                 )
+             )
+
+         if sql_references:
+             session.add_all(sql_references)
+             session.commit()
+
+         feedback = get_feedback(reviewer, doc, text, references, user)
+         if feedback != "NONE":
+             sql_feedback = Feedback(
+                 source_chunk_id=existing_chunk.chunk_id,
+                 feedback=feedback,
+                 model=reviewer.model,
+             )
+             session.add(sql_feedback)
+             session.commit()
+
+
+ def remove_text(session: SASession, text: str, document_id: str) -> None:
+     chunks = session.query(Chunk).filter(
+         Chunk.document_id == document_id,
+         Chunk.content == text,
+     )
+     chunks.delete(synchronize_session=False)
+     session.commit()
+
+
+ def get_all_chunks_for_document(session: SASession, doc: Document) -> list[Chunk]:
+     doc_chunks = session.query(Chunk).filter(Chunk.document_id == doc.document_id).all()
+     note_chunks: list[Chunk] = []
+     notes = session.query(Note).filter(Note.document_id == doc.document_id).all()
+     for note in notes:
+         note_text_chunks = chunk_text(note.content)
+         for text in note_text_chunks:
+             chunk_hash = hash(text)
+             existing = session.query(Chunk).filter(
+                 Chunk.hash == str(chunk_hash),
+                 Chunk.document_id == doc.document_id,
+                 Chunk.content == text,
+             ).first()
+             if not existing:
+                 embedding = create_embedding(Embedder(), text, user=doc.author_id)
+                 note_chunk = Chunk(
+                     hash=str(chunk_hash),
+                     document_id=doc.document_id,
+                     content=text,
+                     embedding=embedding,
+                 )
+                 session.add(note_chunk)
+                 session.commit()
+                 note_chunks.append(note_chunk)
+             else:
+                 note_chunks.append(existing)
+     return doc_chunks + note_chunks
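
For orientation, a minimal sketch of how the process_document() pipeline shown above might be driven from application code. It is not part of the package diff: the DSN, the use of a plain SQLAlchemy engine, and the no-argument Embedder()/Reviewer() constructors are assumptions, since none of that wiring appears in this file.

from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session

from compair_core.compair.embeddings import Embedder
from compair_core.compair.feedback import Reviewer
from compair_core.compair.main import process_document
from compair_core.compair.models import Document, User

# Assumed DSN; the package's own db.py / server settings wiring is not shown here.
engine = create_engine("postgresql+psycopg://compair:compair@localhost/compair")

with Session(engine) as session:
    doc = session.scalars(select(Document).limit(1)).first()  # any existing document
    author = session.get(User, doc.author_id)  # assumes user_id is the primary key
    result = process_document(
        user=author,
        session=session,
        embedder=Embedder(),     # assumption: no-argument construction
        reviewer=Reviewer(),     # assumption: no-argument construction
        doc=doc,
        generate_feedback=True,  # chunk, embed, and request feedback on significant edits
    )
    print(result)  # e.g. {"new": False}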