compair-core 0.3.0.tar.gz → 0.3.2.tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (43)
  1. {compair_core-0.3.0 → compair_core-0.3.2}/PKG-INFO +6 -4
  2. {compair_core-0.3.0 → compair_core-0.3.2}/README.md +5 -3
  3. compair_core-0.3.2/compair/__init__.py +87 -0
  4. compair_core-0.3.2/compair/celery_app.py +22 -0
  5. compair_core-0.3.2/compair/default_groups.py +14 -0
  6. compair_core-0.3.2/compair/embeddings.py +66 -0
  7. compair_core-0.3.2/compair/feedback.py +79 -0
  8. compair_core-0.3.2/compair/logger.py +29 -0
  9. compair_core-0.3.2/compair/main.py +240 -0
  10. compair_core-0.3.2/compair/models.py +355 -0
  11. compair_core-0.3.2/compair/schema.py +146 -0
  12. compair_core-0.3.2/compair/tasks.py +94 -0
  13. compair_core-0.3.2/compair/utils.py +61 -0
  14. {compair_core-0.3.0 → compair_core-0.3.2}/compair_core.egg-info/PKG-INFO +6 -4
  15. compair_core-0.3.2/compair_core.egg-info/SOURCES.txt +39 -0
  16. compair_core-0.3.2/compair_core.egg-info/top_level.txt +3 -0
  17. compair_core-0.3.2/compair_email/__init__.py +0 -0
  18. compair_core-0.3.2/compair_email/email.py +6 -0
  19. compair_core-0.3.2/compair_email/email_core.py +15 -0
  20. compair_core-0.3.2/compair_email/templates.py +6 -0
  21. compair_core-0.3.2/compair_email/templates_core.py +13 -0
  22. {compair_core-0.3.0 → compair_core-0.3.2}/pyproject.toml +11 -2
  23. compair_core-0.3.2/server/__init__.py +0 -0
  24. compair_core-0.3.2/server/app.py +90 -0
  25. compair_core-0.3.2/server/deps.py +67 -0
  26. compair_core-0.3.2/server/local_model/__init__.py +1 -0
  27. compair_core-0.3.2/server/local_model/app.py +62 -0
  28. compair_core-0.3.2/server/providers/__init__.py +0 -0
  29. compair_core-0.3.2/server/providers/console_mailer.py +9 -0
  30. compair_core-0.3.2/server/providers/contracts.py +66 -0
  31. compair_core-0.3.2/server/providers/local_storage.py +28 -0
  32. compair_core-0.3.2/server/providers/noop_analytics.py +7 -0
  33. compair_core-0.3.2/server/providers/noop_billing.py +30 -0
  34. compair_core-0.3.2/server/providers/noop_ocr.py +10 -0
  35. compair_core-0.3.2/server/routers/__init__.py +0 -0
  36. compair_core-0.3.2/server/routers/capabilities.py +38 -0
  37. compair_core-0.3.2/server/settings.py +51 -0
  38. compair_core-0.3.0/compair_core.egg-info/SOURCES.txt +0 -8
  39. compair_core-0.3.0/compair_core.egg-info/top_level.txt +0 -1
  40. {compair_core-0.3.0 → compair_core-0.3.2}/LICENSE +0 -0
  41. {compair_core-0.3.0 → compair_core-0.3.2}/compair_core.egg-info/dependency_links.txt +0 -0
  42. {compair_core-0.3.0 → compair_core-0.3.2}/compair_core.egg-info/requires.txt +0 -0
  43. {compair_core-0.3.0 → compair_core-0.3.2}/setup.cfg +0 -0
{compair_core-0.3.0 → compair_core-0.3.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compair-core
-Version: 0.3.0
+Version: 0.3.2
 Summary: Open-source foundation of the Compair collaboration platform.
 Author: RocketResearch, Inc.
 License: MIT
@@ -70,7 +70,7 @@ git clone https://github.com/RocketResearch-Inc/compair_core.git
 cd compair_core
 python -m venv .venv
 source .venv/bin/activate
-pip install -e .[dev]
+pip install -e ".[dev]"
 ```
 
 ## Containers
@@ -96,8 +96,8 @@ See `compair_core/server/settings.py` for the full settings surface.
 ```bash
 python -m venv .venv
 source .venv/bin/activate
-pip install -e .[dev]
-uvicorn compair_core.server.app:app --reload
+pip install -e ".[dev]"
+uvicorn compair.server.app:create_app --factory --reload
 ```
 
 The API will be available at http://127.0.0.1:8000 and supports the Swagger UI at `/docs`.
@@ -106,6 +106,8 @@ The API will be available at http://127.0.0.1:8000 and supports the Swagger UI a
 
 Core currently ships with a syntax sanity check (`python -m compileall ...`). You can add pytest or other tooling as needed.
 
+Release and packaging steps are documented in `docs/maintainers.md`.
+
 ## Reporting Issues
 
 Please open GitHub issues or PRs against this repository. If you are a Compair Cloud customer, reach out through your support channel for issues related to premium features.
{compair_core-0.3.0 → compair_core-0.3.2}/README.md
@@ -36,7 +36,7 @@ git clone https://github.com/RocketResearch-Inc/compair_core.git
 cd compair_core
 python -m venv .venv
 source .venv/bin/activate
-pip install -e .[dev]
+pip install -e ".[dev]"
 ```
 
 ## Containers
@@ -62,8 +62,8 @@ See `compair_core/server/settings.py` for the full settings surface.
 ```bash
 python -m venv .venv
 source .venv/bin/activate
-pip install -e .[dev]
-uvicorn compair_core.server.app:app --reload
+pip install -e ".[dev]"
+uvicorn compair.server.app:create_app --factory --reload
 ```
 
 The API will be available at http://127.0.0.1:8000 and supports the Swagger UI at `/docs`.
@@ -72,6 +72,8 @@ The API will be available at http://127.0.0.1:8000 and supports the Swagger UI a
 
 Core currently ships with a syntax sanity check (`python -m compileall ...`). You can add pytest or other tooling as needed.
 
+Release and packaging steps are documented in `docs/maintainers.md`.
+
 ## Reporting Issues
 
 Please open GitHub issues or PRs against this repository. If you are a Compair Cloud customer, reach out through your support channel for issues related to premium features.
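Both the PKG-INFO and README hunks above switch the dev-server command to uvicorn's `--factory` mode, which expects a callable returning the ASGI app instead of a module-level `app` object. A minimal sketch of the pattern that command assumes (the health route is hypothetical, for illustration only):

```python
# Pattern assumed by `uvicorn compair.server.app:create_app --factory`:
# uvicorn calls create_app() once at startup and serves the returned app.
from fastapi import FastAPI


def create_app() -> FastAPI:
    app = FastAPI(title="Compair Core")

    @app.get("/healthz")  # hypothetical route, not part of the package
    def healthz() -> dict[str, str]:
        return {"status": "ok"}

    return app
```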
compair_core-0.3.2/compair/__init__.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import logging
+import os
+
+from sqlalchemy import Engine, create_engine
+from sqlalchemy.orm import sessionmaker
+
+edition = os.getenv("COMPAIR_EDITION", "core").lower()
+
+_cloud_available = False
+if edition == "cloud":
+    try: # Import cloud overrides if the private package is installed
+        from compair_cloud import (
+            bootstrap as cloud_bootstrap,
+            celery_app as cloud_celery_app,
+            default_groups as cloud_default_groups,
+            embeddings as cloud_embeddings,
+            feedback as cloud_feedback,
+            logger as cloud_logger,
+            main as cloud_main,
+            models as cloud_models,
+            tasks as cloud_tasks,
+            utils as cloud_utils,
+        ) # type: ignore
+
+        _cloud_available = True
+    except ImportError:
+        _cloud_available = False
+
+if _cloud_available:
+    from compair_cloud.default_groups import initialize_default_groups # type: ignore
+    embeddings = cloud_embeddings
+    feedback = cloud_feedback
+    logger = cloud_logger
+    main = cloud_main
+    models = cloud_models
+    tasks = cloud_tasks
+    utils = cloud_utils
+    initialize_database_override = getattr(cloud_bootstrap, "initialize_database", None)
+else:
+    from . import embeddings, feedback, logger, main, models, tasks, utils
+    from .default_groups import initialize_default_groups
+    initialize_database_override = None
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+def _handle_engine() -> Engine:
+    db = os.getenv("DB")
+    db_user = os.getenv("DB_USER")
+    db_passw = os.getenv("DB_PASSW")
+    db_url = os.getenv("DB_URL")
+
+    if all([db, db_user, db_passw, db_url]):
+        return create_engine(
+            f"postgresql+psycopg2://{db_user}:{db_passw}@{db_url}/{db}",
+            pool_size=10,
+            max_overflow=0,
+        )
+
+    sqlite_dir = os.getenv("COMPAIR_SQLITE_DIR", "/data")
+    os.makedirs(sqlite_dir, exist_ok=True)
+    sqlite_path = os.path.join(sqlite_dir, os.getenv("COMPAIR_SQLITE_NAME", "compair.db"))
+    return create_engine(f"sqlite:///{sqlite_path}", connect_args={"check_same_thread": False})
+
+
+engine = _handle_engine()
+
+
+def initialize_database() -> None:
+    models.Base.metadata.create_all(engine)
+    if initialize_database_override:
+        initialize_database_override(engine)
+
+
+initialize_database()
+
+Session = sessionmaker(engine)
+embedder = embeddings.Embedder()
+reviewer = feedback.Reviewer()
+
+with Session() as session:
+    initialize_default_groups(session)
+
+__all__ = ["embeddings", "feedback", "main", "models", "utils", "Session"]
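The new package initializer selects cloud overrides from `COMPAIR_EDITION` at import time and falls back to a local SQLite engine when the Postgres variables (`DB`, `DB_USER`, `DB_PASSW`, `DB_URL`) are unset. A rough sketch of exercising the core path, assuming the package's dependencies are installed; the directory is illustrative:

```python
import os

# Selection happens at import time, so configure the environment first.
os.environ["COMPAIR_EDITION"] = "core"
os.environ["COMPAIR_SQLITE_DIR"] = "/tmp/compair"  # illustrative writable path

import compair  # creates the engine, tables, and default groups as a side effect

with compair.Session() as session:
    print(session.bind.url)  # sqlite:////tmp/compair/compair.db
```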
compair_core-0.3.2/compair/celery_app.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+try:
+    from compair_cloud.celery_app import celery_app # type: ignore
+except (ImportError, ModuleNotFoundError):
+
+    class _NoopCelery:
+        def __init__(self) -> None:
+            self.conf = SimpleNamespace(beat_schedule={})
+
+        def task(self, func=None, *args, **kwargs):
+            def decorator(fn):
+                return fn
+
+            return decorator(func) if func else decorator
+
+        def send_task(self, *args, **kwargs):
+            raise RuntimeError("Celery is not available in the Compair Core edition.")
+
+    celery_app = _NoopCelery()
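The `_NoopCelery` stand-in keeps `@celery_app.task` usable as a plain decorator in core builds, while `send_task` fails loudly. A quick sketch (the task body is made up):

```python
from compair.celery_app import celery_app  # no broker needed in core builds


@celery_app.task
def add(a: int, b: int) -> int:  # hypothetical task, for illustration
    return a + b


print(add(1, 2))  # 3: the decorated function runs synchronously, unchanged
# celery_app.send_task("add", args=(1, 2))  # raises RuntimeError in core builds
```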
compair_core-0.3.2/compair/default_groups.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from sqlalchemy.orm import Session
+
+try:
+    from compair_cloud.default_groups import initialize_default_groups as cloud_initialize_default_groups # type: ignore
+except (ImportError, ModuleNotFoundError):
+    cloud_initialize_default_groups = None
+
+
+def initialize_default_groups(session: Session) -> None:
+    """Core builds do not seed any default groups by default."""
+    if cloud_initialize_default_groups:
+        cloud_initialize_default_groups(session)
compair_core-0.3.2/compair/embeddings.py
@@ -0,0 +1,66 @@
+import hashlib
+import os
+from typing import List
+
+import requests
+
+from .logger import log_event
+
+try:
+    from compair_cloud.embeddings import Embedder as CloudEmbedder # type: ignore
+    from compair_cloud.embeddings import create_embedding as cloud_create_embedding # type: ignore
+except (ImportError, ModuleNotFoundError):
+    CloudEmbedder = None
+    cloud_create_embedding = None
+
+
+class Embedder:
+    def __init__(self) -> None:
+        self.edition = os.getenv("COMPAIR_EDITION", "core").lower()
+        self._cloud_impl = None
+        if self.edition == "cloud" and CloudEmbedder is not None:
+            self._cloud_impl = CloudEmbedder()
+
+        if self._cloud_impl is None:
+            self.model = os.getenv("COMPAIR_LOCAL_EMBED_MODEL", "hash-embedding")
+            self.dimension = int(os.getenv("COMPAIR_LOCAL_EMBED_DIM", "384"))
+            base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
+            route = os.getenv("COMPAIR_LOCAL_EMBED_ROUTE", "/embed")
+            self.endpoint = f"{base_url.rstrip('/')}{route}"
+
+    @property
+    def is_cloud(self) -> bool:
+        return self._cloud_impl is not None
+
+
+def _hash_embedding(text: str, dimension: int) -> List[float]:
+    """Generate a deterministic embedding using repeated SHA-256 hashing."""
+    if not text:
+        text = " "
+    digest = hashlib.sha256(text.encode("utf-8", "ignore")).digest()
+    vector: List[float] = []
+    while len(vector) < dimension:
+        for byte in digest:
+            vector.append((byte / 255.0) * 2 - 1)
+            if len(vector) == dimension:
+                break
+        digest = hashlib.sha256(digest).digest()
+    return vector
+
+
+def create_embedding(embedder: Embedder, text: str, user=None) -> list[float]:
+    if embedder.is_cloud and cloud_create_embedding is not None:
+        return cloud_create_embedding(embedder._cloud_impl, text, user=user)
+
+    # Local/core path
+    try:
+        response = requests.post(embedder.endpoint, json={"text": text}, timeout=15)
+        response.raise_for_status()
+        data = response.json()
+        embedding = data.get("embedding") or data.get("vector")
+        if embedding:
+            return embedding
+    except Exception as exc:
+        log_event("local_embedding_failed", error=str(exc))
+
+    return _hash_embedding(text, embedder.dimension)
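The hash fallback makes embeddings deterministic: identical text always yields the identical vector, with each SHA-256 byte scaled into [-1, 1]. A quick check, calling the private helper directly for illustration (note that importing `compair.embeddings` also runs the package's database bootstrap as a side effect):

```python
from compair.embeddings import _hash_embedding

v1 = _hash_embedding("hello world", dimension=8)
v2 = _hash_embedding("hello world", dimension=8)

assert v1 == v2                           # same text, same vector
assert len(v1) == 8                       # extended to the requested dimension
assert all(-1.0 <= x <= 1.0 for x in v1)  # each byte mapped via (b / 255) * 2 - 1
```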
compair_core-0.3.2/compair/feedback.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import os
+import requests
+from typing import Any
+
+from .logger import log_event
+from .models import Document, User
+
+try:
+    from compair_cloud.feedback import Reviewer as CloudReviewer # type: ignore
+    from compair_cloud.feedback import get_feedback as cloud_get_feedback # type: ignore
+except (ImportError, ModuleNotFoundError):
+    CloudReviewer = None # type: ignore
+    cloud_get_feedback = None # type: ignore
+
+
+class Reviewer:
+    """Edition-aware wrapper that falls back to the local feedback endpoint."""
+
+    def __init__(self) -> None:
+        self.edition = os.getenv("COMPAIR_EDITION", "core").lower()
+        self._cloud_impl = None
+        if self.edition == "cloud" and CloudReviewer is not None:
+            self._cloud_impl = CloudReviewer()
+        else:
+            self.client = None
+            self.model = os.getenv("COMPAIR_LOCAL_GENERATION_MODEL", "local-feedback")
+            base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
+            route = os.getenv("COMPAIR_LOCAL_GENERATION_ROUTE", "/generate")
+            self.endpoint = f"{base_url.rstrip('/')}{route}"
+
+    @property
+    def is_cloud(self) -> bool:
+        return self._cloud_impl is not None
+
+
+def _fallback_feedback(text: str, references: list[Any]) -> str:
+    if not references:
+        return "NONE"
+    top_ref = references[0]
+    snippet = getattr(top_ref, "content", "") or ""
+    snippet = snippet.replace("\n", " ").strip()[:200]
+    if not snippet:
+        return "NONE"
+    return f"Check alignment with this reference: {snippet}"
+
+
+def get_feedback(
+    reviewer: Reviewer,
+    doc: Document,
+    text: str,
+    references: list[Any],
+    user: User,
+) -> str:
+    if reviewer.is_cloud and cloud_get_feedback is not None:
+        return cloud_get_feedback(reviewer._cloud_impl, doc, text, references, user) # type: ignore[arg-type]
+
+    payload = {
+        "document": text,
+        "references": [getattr(ref, "content", "") for ref in references],
+        "length_instruction": {
+            "Brief": "1–2 short sentences",
+            "Detailed": "A couple short paragraphs",
+            "Verbose": "As thorough as reasonably possible without repeating information",
+        }.get(user.preferred_feedback_length, "1–2 short sentences"),
+    }
+
+    try:
+        response = requests.post(reviewer.endpoint, json=payload, timeout=30)
+        response.raise_for_status()
+        data = response.json()
+        feedback = data.get("feedback")
+        if feedback:
+            return feedback
+    except Exception as exc: # pragma: no cover - network failures stay graceful
+        log_event("local_feedback_failed", error=str(exc))
+
+    return _fallback_feedback(text, references)
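Because `get_feedback` only posts `document`, `references`, and `length_instruction` and reads back a `feedback` key, any HTTP service honoring that shape can serve as the local model. A minimal stand-in using FastAPI for illustration (the shipped `server/local_model/app.py` may look different):

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class GenerateRequest(BaseModel):
    document: str
    references: list[str]
    length_instruction: str


@app.post("/generate")
def generate(req: GenerateRequest) -> dict[str, str]:
    # Placeholder logic: a real model would condition on the references.
    note = f"Compared against {len(req.references)} reference(s)."
    return {"feedback": note}
```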
compair_core-0.3.2/compair/logger.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any, Callable
+
+try:
+    from compair_cloud.logger import log_event as cloud_log_event # type: ignore
+except (ImportError, ModuleNotFoundError):
+    cloud_log_event: Callable[..., None] | None = None
+
+
+if not logging.getLogger().handlers:
+    logging.basicConfig(level=logging.INFO)
+
+_LOGGER = logging.getLogger("compair.core")
+
+
+def log_event(message: str, **fields: Any) -> None:
+    """Emit a structured log entry for the core edition."""
+    if cloud_log_event:
+        cloud_log_event(message, **fields)
+        return
+
+    try:
+        payload = json.dumps({"message": message, **fields}, default=str)
+    except TypeError:
+        payload = json.dumps({"message": message}, default=str)
+    _LOGGER.info(payload)
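In a core install (no cloud override), `log_event` emits one JSON object per call through the standard logging module; with the default `basicConfig` format, a call renders roughly as sketched below:

```python
from compair.logger import log_event

log_event("document_processed", document_id="doc-123", chunks=4)
# Roughly: INFO:compair.core:{"message": "document_processed", "document_id": "doc-123", "chunks": 4}
```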
compair_core-0.3.2/compair/main.py
@@ -0,0 +1,240 @@
+from __future__ import annotations
+
+import logging
+import os
+from datetime import datetime, timedelta, timezone
+from typing import Mapping
+
+import Levenshtein
+from sqlalchemy import select
+from sqlalchemy.orm.attributes import get_history
+from sqlalchemy.orm import Session as SASession
+
+from .embeddings import create_embedding, Embedder
+from .feedback import get_feedback, Reviewer
+from .models import Chunk, Document, Feedback, Group, Note, Reference, User
+from .utils import chunk_text, log_activity
+
+
+def process_document(
+    user: User,
+    session: SASession,
+    embedder: Embedder,
+    reviewer: Reviewer,
+    doc: Document,
+    generate_feedback: bool = True,
+) -> Mapping[str, int]:
+    new = False
+
+    prev_content = get_history(doc, "content").deleted
+    prev_chunks: list[str] = []
+    if prev_content:
+        prev_chunks = chunk_text(prev_content[-1])
+
+    feedback_limit = int(os.getenv("COMPAIR_CORE_FEEDBACK_LIMIT", "5"))
+    time_cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
+
+    recent_feedback_count = session.query(Feedback).filter(
+        Feedback.source_chunk_id.in_(
+            session.query(Chunk.chunk_id).filter(Chunk.document_id == doc.document_id)
+        ),
+        Feedback.timestamp >= time_cutoff,
+    ).count()
+
+    content = doc.content
+    chunks = chunk_text(content)
+    new_chunks = list(set(chunks) - set(prev_chunks))
+
+    prioritized_chunk_indices: list[int] = []
+    if generate_feedback:
+        prioritized_chunk_indices = detect_significant_edits(prev_chunks=prev_chunks, new_chunks=new_chunks)
+
+    num_chunks_can_generate_feedback = max((feedback_limit - recent_feedback_count), 0)
+    indices_to_generate_feedback = prioritized_chunk_indices[:num_chunks_can_generate_feedback]
+
+    for i, chunk in enumerate(new_chunks):
+        should_generate_feedback = i in indices_to_generate_feedback
+        process_text(
+            session=session,
+            embedder=embedder,
+            reviewer=reviewer,
+            doc=doc,
+            text=chunk,
+            generate_feedback=should_generate_feedback,
+        )
+
+    remove_chunks = set(prev_chunks) - set(chunks)
+    for chunk in remove_chunks:
+        remove_text(session=session, text=chunk, document_id=doc.document_id)
+
+    if doc.groups:
+        log_activity(
+            session=session,
+            user_id=doc.author_id,
+            group_id=doc.groups[0].group_id,
+            action="update",
+            object_id=doc.document_id,
+            object_name=doc.title,
+            object_type="document",
+        )
+
+    session.commit()
+    return {"new": new}
+
+
+def detect_significant_edits(
+    prev_chunks: list[str],
+    new_chunks: list[str],
+    threshold: float = 0.5,
+) -> list[int]:
+    prev_set = set(prev_chunks)
+    new_set = set(new_chunks)
+
+    unchanged_chunks = prev_set & new_set
+    new_chunks_to_check = new_set - unchanged_chunks
+
+    process_chunks: list[str] = []
+    for new_chunk in new_chunks_to_check:
+        best_match = max((Levenshtein.ratio(new_chunk, prev_chunk) for prev_chunk in prev_chunks), default=0.0)
+        if best_match < threshold:
+            process_chunks.append(new_chunk)
+
+    prioritized_chunks = prioritize_chunks(process_chunks)
+    return [new_chunks.index(new_chunk) for new_chunk in prioritized_chunks if new_chunk in new_chunks]
+
+
+def prioritize_chunks(chunks: list[str], limit: int | None = None) -> list[str]:
+    limit = limit or len(chunks)
+    indexed_chunks = [(chunk, idx) for idx, chunk in enumerate(chunks)]
+    indexed_chunks.sort(key=lambda x: (-len(x[0]), x[1]))
+    return [chunk for chunk, _ in indexed_chunks[:limit]]
+
+
+def process_text(
+    session: SASession,
+    embedder: Embedder,
+    reviewer: Reviewer,
+    doc: Document,
+    text: str,
+    generate_feedback: bool = True,
+    note: Note | None = None,
+) -> None:
+    logger = logging.getLogger(__name__)
+    chunk_hash = hash(text)
+
+    chunk_type = "note" if note else "document"
+    note_id = note.note_id if note else None
+
+    existing_chunks = session.query(Chunk).filter(
+        Chunk.hash == str(chunk_hash),
+        Chunk.document_id == doc.document_id,
+        Chunk.chunk_type == chunk_type,
+        Chunk.note_id == note_id,
+    )
+
+    user = session.query(User).filter(User.user_id == doc.author_id).first()
+    if existing_chunks.first():
+        for chunk in existing_chunks:
+            if chunk.embedding is None:
+                embedding = create_embedding(embedder, text, user=user)
+                existing_chunks.update({"embedding": embedding})
+                session.commit()
+    else:
+        chunk = Chunk(
+            hash=chunk_hash,
+            document_id=doc.document_id,
+            note_id=note_id,
+            chunk_type=chunk_type,
+            content=text,
+        )
+        embedding = create_embedding(embedder, text, user=user)
+        chunk.embedding = embedding
+        session.add(chunk)
+        session.commit()
+        existing_chunk = chunk
+    existing_chunk = session.query(Chunk).filter(
+        Chunk.document_id == doc.document_id,
+        Chunk.hash == str(chunk_hash),
+        Chunk.chunk_type == chunk_type,
+        Chunk.note_id == note_id,
+    ).first()
+
+    if generate_feedback and existing_chunk:
+        doc_group_ids = [g.group_id for g in doc.groups]
+        references = (
+            session.query(Chunk)
+            .join(Chunk.document)
+            .join(Document.groups)
+            .filter(
+                Document.is_published.is_(True),
+                Document.document_id != doc.document_id,
+                Chunk.chunk_type == "document",
+                Group.group_id.in_(doc_group_ids),
+            )
+            .order_by(Chunk.embedding.cosine_distance(existing_chunk.embedding))
+            .limit(3)
+            .all()
+        )
+
+        sql_references: list[Reference] = []
+        for ref_chunk in references:
+            sql_references.append(
+                Reference(
+                    source_chunk_id=existing_chunk.chunk_id,
+                    reference_type="document",
+                    reference_document_id=ref_chunk.document_id,
+                    reference_note_id=None,
+                )
+            )
+
+        if sql_references:
+            session.add_all(sql_references)
+            session.commit()
+
+        feedback = get_feedback(reviewer, doc, text, references, user)
+        if feedback != "NONE":
+            sql_feedback = Feedback(
+                source_chunk_id=existing_chunk.chunk_id,
+                feedback=feedback,
+                model=reviewer.model,
+            )
+            session.add(sql_feedback)
+            session.commit()
+
+
+def remove_text(session: SASession, text: str, document_id: str) -> None:
+    chunks = session.query(Chunk).filter(
+        Chunk.document_id == document_id,
+        Chunk.content == text,
+    )
+    chunks.delete(synchronize_session=False)
+    session.commit()
+
+
+def get_all_chunks_for_document(session: SASession, doc: Document) -> list[Chunk]:
+    doc_chunks = session.query(Chunk).filter(Chunk.document_id == doc.document_id).all()
+    note_chunks: list[Chunk] = []
+    notes = session.query(Note).filter(Note.document_id == doc.document_id).all()
+    for note in notes:
+        note_text_chunks = chunk_text(note.content)
+        for text in note_text_chunks:
+            chunk_hash = hash(text)
+            existing = session.query(Chunk).filter(
+                Chunk.hash == str(chunk_hash),
+                Chunk.document_id == doc.document_id,
+                Chunk.content == text,
+            ).first()
+            if not existing:
+                embedding = create_embedding(Embedder(), text, user=doc.author_id)
+                note_chunk = Chunk(
+                    hash=str(chunk_hash),
+                    document_id=doc.document_id,
+                    content=text,
+                    embedding=embedding,
+                )
+                session.add(note_chunk)
+                session.commit()
+                note_chunks.append(note_chunk)
+            else:
+                note_chunks.append(existing)
+    return doc_chunks + note_chunks
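`detect_significant_edits` keeps only new chunks whose best Levenshtein ratio against every previous chunk falls below the threshold, then returns their indices in `new_chunks`, ordered longest-first by `prioritize_chunks`. A small worked example with made-up chunk strings:

```python
from compair.main import detect_significant_edits, prioritize_chunks

prev = ["the quick brown fox", "jumps over the lazy dog"]
new = ["the quick brown foxes", "an entirely different paragraph about embeddings"]

# Chunk 0 is ~0.95 similar to a previous chunk, so it is skipped;
# chunk 1 has no close match, so its index is returned.
print(detect_significant_edits(prev, new, threshold=0.5))  # [1]

print(prioritize_chunks(["short", "a much longer chunk"]))
# ['a much longer chunk', 'short']: longest first, ties broken by original order
```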