compair-core 0.3.1.tar.gz → 0.3.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of compair-core has been flagged as possibly problematic.
- {compair_core-0.3.1 → compair_core-0.3.3}/PKG-INFO +5 -3
- {compair_core-0.3.1 → compair_core-0.3.3}/README.md +4 -2
- compair_core-0.3.3/compair/__init__.py +87 -0
- compair_core-0.3.3/compair/celery_app.py +22 -0
- compair_core-0.3.3/compair/default_groups.py +14 -0
- compair_core-0.3.3/compair/embeddings.py +66 -0
- compair_core-0.3.3/compair/feedback.py +79 -0
- compair_core-0.3.3/compair/logger.py +29 -0
- compair_core-0.3.3/compair/main.py +240 -0
- compair_core-0.3.3/compair/models.py +355 -0
- compair_core-0.3.3/compair/schema.py +146 -0
- compair_core-0.3.3/compair/tasks.py +98 -0
- compair_core-0.3.3/compair/utils.py +61 -0
- {compair_core-0.3.1 → compair_core-0.3.3}/compair_core.egg-info/PKG-INFO +5 -3
- compair_core-0.3.3/compair_core.egg-info/SOURCES.txt +39 -0
- compair_core-0.3.3/compair_core.egg-info/top_level.txt +3 -0
- compair_core-0.3.3/compair_email/__init__.py +0 -0
- compair_core-0.3.3/compair_email/email.py +6 -0
- compair_core-0.3.3/compair_email/email_core.py +15 -0
- compair_core-0.3.3/compair_email/templates.py +6 -0
- compair_core-0.3.3/compair_email/templates_core.py +13 -0
- {compair_core-0.3.1 → compair_core-0.3.3}/pyproject.toml +11 -2
- compair_core-0.3.3/server/__init__.py +0 -0
- compair_core-0.3.3/server/app.py +90 -0
- compair_core-0.3.3/server/deps.py +67 -0
- compair_core-0.3.3/server/local_model/__init__.py +1 -0
- compair_core-0.3.3/server/local_model/app.py +62 -0
- compair_core-0.3.3/server/providers/__init__.py +0 -0
- compair_core-0.3.3/server/providers/console_mailer.py +9 -0
- compair_core-0.3.3/server/providers/contracts.py +66 -0
- compair_core-0.3.3/server/providers/local_storage.py +28 -0
- compair_core-0.3.3/server/providers/noop_analytics.py +7 -0
- compair_core-0.3.3/server/providers/noop_billing.py +30 -0
- compair_core-0.3.3/server/providers/noop_ocr.py +10 -0
- compair_core-0.3.3/server/routers/__init__.py +0 -0
- compair_core-0.3.3/server/routers/capabilities.py +38 -0
- compair_core-0.3.3/server/settings.py +51 -0
- compair_core-0.3.1/compair_core.egg-info/SOURCES.txt +0 -8
- compair_core-0.3.1/compair_core.egg-info/top_level.txt +0 -1
- {compair_core-0.3.1 → compair_core-0.3.3}/LICENSE +0 -0
- {compair_core-0.3.1 → compair_core-0.3.3}/compair_core.egg-info/dependency_links.txt +0 -0
- {compair_core-0.3.1 → compair_core-0.3.3}/compair_core.egg-info/requires.txt +0 -0
- {compair_core-0.3.1 → compair_core-0.3.3}/setup.cfg +0 -0
{compair_core-0.3.1 → compair_core-0.3.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compair-core
-Version: 0.3.1
+Version: 0.3.3
 Summary: Open-source foundation of the Compair collaboration platform.
 Author: RocketResearch, Inc.
 License: MIT
@@ -96,8 +96,8 @@ See `compair_core/server/settings.py` for the full settings surface.
 ```bash
 python -m venv .venv
 source .venv/bin/activate
-pip install -e .[dev]
-uvicorn
+pip install -e ".[dev]"
+uvicorn compair.server.app:create_app --factory --reload
 ```

 The API will be available at http://127.0.0.1:8000 and supports the Swagger UI at `/docs`.

@@ -106,6 +106,8 @@ The API will be available at http://127.0.0.1:8000 and supports the Swagger UI a

 Core currently ships with a syntax sanity check (`python -m compileall ...`). You can add pytest or other tooling as needed.

+Release and packaging steps are documented in `docs/maintainers.md`.
+
 ## Reporting Issues

 Please open GitHub issues or PRs against this repository. If you are a Compair Cloud customer, reach out through your support channel for issues related to premium features.
{compair_core-0.3.1 → compair_core-0.3.3}/README.md

@@ -62,8 +62,8 @@ See `compair_core/server/settings.py` for the full settings surface.
 ```bash
 python -m venv .venv
 source .venv/bin/activate
-pip install -e .[dev]
-uvicorn
+pip install -e ".[dev]"
+uvicorn compair.server.app:create_app --factory --reload
 ```

 The API will be available at http://127.0.0.1:8000 and supports the Swagger UI at `/docs`.

@@ -72,6 +72,8 @@ The API will be available at http://127.0.0.1:8000 and supports the Swagger UI a

 Core currently ships with a syntax sanity check (`python -m compileall ...`). You can add pytest or other tooling as needed.

+Release and packaging steps are documented in `docs/maintainers.md`.
+
 ## Reporting Issues

 Please open GitHub issues or PRs against this repository. If you are a Compair Cloud customer, reach out through your support channel for issues related to premium features.
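The new `uvicorn` command relies on `--factory`, which tells uvicorn the import string names a callable that builds the app rather than an already-constructed app object. A hypothetical sketch of the shape `create_app` would need (assuming FastAPI, which the Swagger UI at `/docs` suggests; the real factory is the new `server/app.py`, +90 lines in this release):

```python
# Hypothetical sketch only; the actual factory ships in server/app.py.
from fastapi import FastAPI


def create_app() -> FastAPI:
    app = FastAPI()  # routers, providers, and settings are wired here
    return app
```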
compair_core-0.3.3/compair/__init__.py (new file)
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import logging
+import os
+
+from sqlalchemy import Engine, create_engine
+from sqlalchemy.orm import sessionmaker
+
+edition = os.getenv("COMPAIR_EDITION", "core").lower()
+
+_cloud_available = False
+if edition == "cloud":
+    try:  # Import cloud overrides if the private package is installed
+        from compair_cloud import (
+            bootstrap as cloud_bootstrap,
+            celery_app as cloud_celery_app,
+            default_groups as cloud_default_groups,
+            embeddings as cloud_embeddings,
+            feedback as cloud_feedback,
+            logger as cloud_logger,
+            main as cloud_main,
+            models as cloud_models,
+            tasks as cloud_tasks,
+            utils as cloud_utils,
+        )  # type: ignore
+
+        _cloud_available = True
+    except ImportError:
+        _cloud_available = False
+
+if _cloud_available:
+    from compair_cloud.default_groups import initialize_default_groups  # type: ignore
+    embeddings = cloud_embeddings
+    feedback = cloud_feedback
+    logger = cloud_logger
+    main = cloud_main
+    models = cloud_models
+    tasks = cloud_tasks
+    utils = cloud_utils
+    initialize_database_override = getattr(cloud_bootstrap, "initialize_database", None)
+else:
+    from . import embeddings, feedback, logger, main, models, tasks, utils
+    from .default_groups import initialize_default_groups
+    initialize_database_override = None
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+def _handle_engine() -> Engine:
+    db = os.getenv("DB")
+    db_user = os.getenv("DB_USER")
+    db_passw = os.getenv("DB_PASSW")
+    db_url = os.getenv("DB_URL")
+
+    if all([db, db_user, db_passw, db_url]):
+        return create_engine(
+            f"postgresql+psycopg2://{db_user}:{db_passw}@{db_url}/{db}",
+            pool_size=10,
+            max_overflow=0,
+        )
+
+    sqlite_dir = os.getenv("COMPAIR_SQLITE_DIR", "/data")
+    os.makedirs(sqlite_dir, exist_ok=True)
+    sqlite_path = os.path.join(sqlite_dir, os.getenv("COMPAIR_SQLITE_NAME", "compair.db"))
+    return create_engine(f"sqlite:///{sqlite_path}", connect_args={"check_same_thread": False})
+
+
+engine = _handle_engine()
+
+
+def initialize_database() -> None:
+    models.Base.metadata.create_all(engine)
+    if initialize_database_override:
+        initialize_database_override(engine)
+
+
+initialize_database()
+
+Session = sessionmaker(engine)
+embedder = embeddings.Embedder()
+reviewer = feedback.Reviewer()
+
+with Session() as session:
+    initialize_default_groups(session)
+
+__all__ = ["embeddings", "feedback", "main", "models", "utils", "Session"]
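Note that `_handle_engine` runs at import time, so the database variables must be set before the first `import compair`. A minimal sketch of the two configurations it recognizes; the variable names are the ones the code reads, the values are illustrative:

```python
import os

# Postgres path: all four variables must be set, otherwise SQLite wins.
os.environ["DB"] = "compair"             # database name (example value)
os.environ["DB_USER"] = "compair"        # role (example value)
os.environ["DB_PASSW"] = "secret"        # password (example value)
os.environ["DB_URL"] = "localhost:5432"  # host[:port] (example value)

import compair  # builds postgresql+psycopg2://compair:secret@localhost:5432/compair

# With any of the four unset, the package falls back to
# sqlite:///$COMPAIR_SQLITE_DIR/$COMPAIR_SQLITE_NAME (default /data/compair.db).
```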
compair_core-0.3.3/compair/celery_app.py (new file)
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+try:
+    from compair_cloud.celery_app import celery_app  # type: ignore
+except (ImportError, ModuleNotFoundError):
+
+    class _NoopCelery:
+        def __init__(self) -> None:
+            self.conf = SimpleNamespace(beat_schedule={})
+
+        def task(self, func=None, *args, **kwargs):
+            def decorator(fn):
+                return fn
+
+            return decorator(func) if func else decorator
+
+        def send_task(self, *args, **kwargs):
+            raise RuntimeError("Celery is not available in the Compair Core edition.")
+
+    celery_app = _NoopCelery()
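In core builds the `_NoopCelery` stand-in keeps `@celery_app.task`-decorated functions importable and directly callable; only broker operations fail. A quick sketch:

```python
from compair.celery_app import celery_app


@celery_app.task  # identity decorator in core: returns the function unchanged
def add(a: int, b: int) -> int:
    return a + b


print(add(1, 2))  # 3, runs inline with no broker involved
# celery_app.send_task("add", args=(1, 2))  # raises RuntimeError in core
```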
compair_core-0.3.3/compair/default_groups.py (new file)
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from sqlalchemy.orm import Session
+
+try:
+    from compair_cloud.default_groups import initialize_default_groups as cloud_initialize_default_groups  # type: ignore
+except (ImportError, ModuleNotFoundError):
+    cloud_initialize_default_groups = None
+
+
+def initialize_default_groups(session: Session) -> None:
+    """Core builds do not seed any default groups by default."""
+    if cloud_initialize_default_groups:
+        cloud_initialize_default_groups(session)
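This is the same try-import pattern as in `__init__.py`: the core function is a no-op unless the private `compair_cloud` package is importable, in which case it delegates. A minimal sketch of the call site:

```python
from compair import Session
from compair.default_groups import initialize_default_groups

with Session() as session:
    initialize_default_groups(session)  # seeds nothing in a core-only install
```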
compair_core-0.3.3/compair/embeddings.py (new file)
@@ -0,0 +1,66 @@
+import hashlib
+import os
+from typing import List
+
+import requests
+
+from .logger import log_event
+
+try:
+    from compair_cloud.embeddings import Embedder as CloudEmbedder  # type: ignore
+    from compair_cloud.embeddings import create_embedding as cloud_create_embedding  # type: ignore
+except (ImportError, ModuleNotFoundError):
+    CloudEmbedder = None
+    cloud_create_embedding = None
+
+
+class Embedder:
+    def __init__(self) -> None:
+        self.edition = os.getenv("COMPAIR_EDITION", "core").lower()
+        self._cloud_impl = None
+        if self.edition == "cloud" and CloudEmbedder is not None:
+            self._cloud_impl = CloudEmbedder()
+
+        if self._cloud_impl is None:
+            self.model = os.getenv("COMPAIR_LOCAL_EMBED_MODEL", "hash-embedding")
+            self.dimension = int(os.getenv("COMPAIR_LOCAL_EMBED_DIM", "384"))
+            base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
+            route = os.getenv("COMPAIR_LOCAL_EMBED_ROUTE", "/embed")
+            self.endpoint = f"{base_url.rstrip('/')}{route}"
+
+    @property
+    def is_cloud(self) -> bool:
+        return self._cloud_impl is not None
+
+
+def _hash_embedding(text: str, dimension: int) -> List[float]:
+    """Generate a deterministic embedding using repeated SHA-256 hashing."""
+    if not text:
+        text = " "
+    digest = hashlib.sha256(text.encode("utf-8", "ignore")).digest()
+    vector: List[float] = []
+    while len(vector) < dimension:
+        for byte in digest:
+            vector.append((byte / 255.0) * 2 - 1)
+            if len(vector) == dimension:
+                break
+        digest = hashlib.sha256(digest).digest()
+    return vector
+
+
+def create_embedding(embedder: Embedder, text: str, user=None) -> list[float]:
+    if embedder.is_cloud and cloud_create_embedding is not None:
+        return cloud_create_embedding(embedder._cloud_impl, text, user=user)
+
+    # Local/core path
+    try:
+        response = requests.post(embedder.endpoint, json={"text": text}, timeout=15)
+        response.raise_for_status()
+        data = response.json()
+        embedding = data.get("embedding") or data.get("vector")
+        if embedding:
+            return embedding
+    except Exception as exc:
+        log_event("local_embedding_failed", error=str(exc))
+
+    return _hash_embedding(text, embedder.dimension)
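The SHA-256 fallback makes the core edition usable with no model endpoint at all: it is deterministic, needs no network, and always returns a vector of the configured dimension with components in [-1, 1]. For example:

```python
from compair.embeddings import _hash_embedding

v1 = _hash_embedding("hello world", 384)
v2 = _hash_embedding("hello world", 384)

assert v1 == v2              # same text, same vector, across processes
assert len(v1) == 384        # padded by re-hashing the digest as needed
assert all(-1.0 <= x <= 1.0 for x in v1)  # each byte scaled into [-1, 1]
```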
compair_core-0.3.3/compair/feedback.py (new file)
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import os
+import requests
+from typing import Any
+
+from .logger import log_event
+from .models import Document, User
+
+try:
+    from compair_cloud.feedback import Reviewer as CloudReviewer  # type: ignore
+    from compair_cloud.feedback import get_feedback as cloud_get_feedback  # type: ignore
+except (ImportError, ModuleNotFoundError):
+    CloudReviewer = None  # type: ignore
+    cloud_get_feedback = None  # type: ignore
+
+
+class Reviewer:
+    """Edition-aware wrapper that falls back to the local feedback endpoint."""
+
+    def __init__(self) -> None:
+        self.edition = os.getenv("COMPAIR_EDITION", "core").lower()
+        self._cloud_impl = None
+        if self.edition == "cloud" and CloudReviewer is not None:
+            self._cloud_impl = CloudReviewer()
+        else:
+            self.client = None
+            self.model = os.getenv("COMPAIR_LOCAL_GENERATION_MODEL", "local-feedback")
+            base_url = os.getenv("COMPAIR_LOCAL_MODEL_URL", "http://local-model:9000")
+            route = os.getenv("COMPAIR_LOCAL_GENERATION_ROUTE", "/generate")
+            self.endpoint = f"{base_url.rstrip('/')}{route}"
+
+    @property
+    def is_cloud(self) -> bool:
+        return self._cloud_impl is not None
+
+
+def _fallback_feedback(text: str, references: list[Any]) -> str:
+    if not references:
+        return "NONE"
+    top_ref = references[0]
+    snippet = getattr(top_ref, "content", "") or ""
+    snippet = snippet.replace("\n", " ").strip()[:200]
+    if not snippet:
+        return "NONE"
+    return f"Check alignment with this reference: {snippet}"
+
+
+def get_feedback(
+    reviewer: Reviewer,
+    doc: Document,
+    text: str,
+    references: list[Any],
+    user: User,
+) -> str:
+    if reviewer.is_cloud and cloud_get_feedback is not None:
+        return cloud_get_feedback(reviewer._cloud_impl, doc, text, references, user)  # type: ignore[arg-type]
+
+    payload = {
+        "document": text,
+        "references": [getattr(ref, "content", "") for ref in references],
+        "length_instruction": {
+            "Brief": "1–2 short sentences",
+            "Detailed": "A couple short paragraphs",
+            "Verbose": "As thorough as reasonably possible without repeating information",
+        }.get(user.preferred_feedback_length, "1–2 short sentences"),
+    }
+
+    try:
+        response = requests.post(reviewer.endpoint, json=payload, timeout=30)
+        response.raise_for_status()
+        data = response.json()
+        feedback = data.get("feedback")
+        if feedback:
+            return feedback
+    except Exception as exc:  # pragma: no cover - network failures stay graceful
+        log_event("local_feedback_failed", error=str(exc))
+
+    return _fallback_feedback(text, references)
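For anyone implementing the local endpoint, here is the request shape `get_feedback` sends and the response key it reads back; the bodies below are illustrative, not part of the package:

```python
# What POST $COMPAIR_LOCAL_MODEL_URL$COMPAIR_LOCAL_GENERATION_ROUTE receives:
request_body = {
    "document": "Chunk of the draft under review ...",
    "references": ["Content of the nearest published chunk", "..."],
    "length_instruction": "1–2 short sentences",  # from the user's preference
}

# What it must return for the feedback to be used:
response_body = {"feedback": "Consider reconciling this with the cited figures."}

# Any network error, non-2xx status, or missing "feedback" key drops through to
# _fallback_feedback(), which quotes up to 200 chars of the top reference or
# returns the sentinel "NONE" (which suppresses the Feedback row entirely).
```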
compair_core-0.3.3/compair/logger.py (new file)
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any, Callable
+
+try:
+    from compair_cloud.logger import log_event as cloud_log_event  # type: ignore
+except (ImportError, ModuleNotFoundError):
+    cloud_log_event: Callable[..., None] | None = None
+
+
+if not logging.getLogger().handlers:
+    logging.basicConfig(level=logging.INFO)
+
+_LOGGER = logging.getLogger("compair.core")
+
+
+def log_event(message: str, **fields: Any) -> None:
+    """Emit a structured log entry for the core edition."""
+    if cloud_log_event:
+        cloud_log_event(message, **fields)
+        return
+
+    try:
+        payload = json.dumps({"message": message, **fields}, default=str)
+    except TypeError:
+        payload = json.dumps({"message": message}, default=str)
+    _LOGGER.info(payload)
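In core, `log_event` renders one JSON object per event through the standard `logging` module; non-serializable field values are stringified via `default=str`. For instance:

```python
from compair.logger import log_event

log_event("local_feedback_failed", error="connection refused", attempt=2)
# Emits, with the default basicConfig formatting (exact prefix depends on the
# root logging configuration):
# INFO:compair.core:{"message": "local_feedback_failed", "error": "connection refused", "attempt": 2}
```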
compair_core-0.3.3/compair/main.py (new file)
@@ -0,0 +1,240 @@
+from __future__ import annotations
+
+import logging
+import os
+from datetime import datetime, timedelta, timezone
+from typing import Mapping
+
+import Levenshtein
+from sqlalchemy import select
+from sqlalchemy.orm.attributes import get_history
+from sqlalchemy.orm import Session as SASession
+
+from .embeddings import create_embedding, Embedder
+from .feedback import get_feedback, Reviewer
+from .models import Chunk, Document, Feedback, Group, Note, Reference, User
+from .utils import chunk_text, log_activity
+
+
+def process_document(
+    user: User,
+    session: SASession,
+    embedder: Embedder,
+    reviewer: Reviewer,
+    doc: Document,
+    generate_feedback: bool = True,
+) -> Mapping[str, int]:
+    new = False
+
+    prev_content = get_history(doc, "content").deleted
+    prev_chunks: list[str] = []
+    if prev_content:
+        prev_chunks = chunk_text(prev_content[-1])
+
+    feedback_limit = int(os.getenv("COMPAIR_CORE_FEEDBACK_LIMIT", "5"))
+    time_cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
+
+    recent_feedback_count = session.query(Feedback).filter(
+        Feedback.source_chunk_id.in_(
+            session.query(Chunk.chunk_id).filter(Chunk.document_id == doc.document_id)
+        ),
+        Feedback.timestamp >= time_cutoff,
+    ).count()
+
+    content = doc.content
+    chunks = chunk_text(content)
+    new_chunks = list(set(chunks) - set(prev_chunks))
+
+    prioritized_chunk_indices: list[int] = []
+    if generate_feedback:
+        prioritized_chunk_indices = detect_significant_edits(prev_chunks=prev_chunks, new_chunks=new_chunks)
+
+    num_chunks_can_generate_feedback = max((feedback_limit - recent_feedback_count), 0)
+    indices_to_generate_feedback = prioritized_chunk_indices[:num_chunks_can_generate_feedback]
+
+    for i, chunk in enumerate(new_chunks):
+        should_generate_feedback = i in indices_to_generate_feedback
+        process_text(
+            session=session,
+            embedder=embedder,
+            reviewer=reviewer,
+            doc=doc,
+            text=chunk,
+            generate_feedback=should_generate_feedback,
+        )
+
+    remove_chunks = set(prev_chunks) - set(chunks)
+    for chunk in remove_chunks:
+        remove_text(session=session, text=chunk, document_id=doc.document_id)
+
+    if doc.groups:
+        log_activity(
+            session=session,
+            user_id=doc.author_id,
+            group_id=doc.groups[0].group_id,
+            action="update",
+            object_id=doc.document_id,
+            object_name=doc.title,
+            object_type="document",
+        )
+
+    session.commit()
+    return {"new": new}
+
+
+def detect_significant_edits(
+    prev_chunks: list[str],
+    new_chunks: list[str],
+    threshold: float = 0.5,
+) -> list[int]:
+    prev_set = set(prev_chunks)
+    new_set = set(new_chunks)
+
+    unchanged_chunks = prev_set & new_set
+    new_chunks_to_check = new_set - unchanged_chunks
+
+    process_chunks: list[str] = []
+    for new_chunk in new_chunks_to_check:
+        best_match = max((Levenshtein.ratio(new_chunk, prev_chunk) for prev_chunk in prev_chunks), default=0.0)
+        if best_match < threshold:
+            process_chunks.append(new_chunk)
+
+    prioritized_chunks = prioritize_chunks(process_chunks)
+    return [new_chunks.index(new_chunk) for new_chunk in prioritized_chunks if new_chunk in new_chunks]
+
+
+def prioritize_chunks(chunks: list[str], limit: int | None = None) -> list[str]:
+    limit = limit or len(chunks)
+    indexed_chunks = [(chunk, idx) for idx, chunk in enumerate(chunks)]
+    indexed_chunks.sort(key=lambda x: (-len(x[0]), x[1]))
+    return [chunk for chunk, _ in indexed_chunks[:limit]]
+
+
+def process_text(
+    session: SASession,
+    embedder: Embedder,
+    reviewer: Reviewer,
+    doc: Document,
+    text: str,
+    generate_feedback: bool = True,
+    note: Note | None = None,
+) -> None:
+    logger = logging.getLogger(__name__)
+    chunk_hash = hash(text)
+
+    chunk_type = "note" if note else "document"
+    note_id = note.note_id if note else None
+
+    existing_chunks = session.query(Chunk).filter(
+        Chunk.hash == str(chunk_hash),
+        Chunk.document_id == doc.document_id,
+        Chunk.chunk_type == chunk_type,
+        Chunk.note_id == note_id,
+    )
+
+    user = session.query(User).filter(User.user_id == doc.author_id).first()
+    if existing_chunks.first():
+        for chunk in existing_chunks:
+            if chunk.embedding is None:
+                embedding = create_embedding(embedder, text, user=user)
+                existing_chunks.update({"embedding": embedding})
+                session.commit()
+    else:
+        chunk = Chunk(
+            hash=chunk_hash,
+            document_id=doc.document_id,
+            note_id=note_id,
+            chunk_type=chunk_type,
+            content=text,
+        )
+        embedding = create_embedding(embedder, text, user=user)
+        chunk.embedding = embedding
+        session.add(chunk)
+        session.commit()
+        existing_chunk = chunk
+    existing_chunk = session.query(Chunk).filter(
+        Chunk.document_id == doc.document_id,
+        Chunk.hash == str(chunk_hash),
+        Chunk.chunk_type == chunk_type,
+        Chunk.note_id == note_id,
+    ).first()
+
+    if generate_feedback and existing_chunk:
+        doc_group_ids = [g.group_id for g in doc.groups]
+        references = (
+            session.query(Chunk)
+            .join(Chunk.document)
+            .join(Document.groups)
+            .filter(
+                Document.is_published.is_(True),
+                Document.document_id != doc.document_id,
+                Chunk.chunk_type == "document",
+                Group.group_id.in_(doc_group_ids),
+            )
+            .order_by(Chunk.embedding.cosine_distance(existing_chunk.embedding))
+            .limit(3)
+            .all()
+        )
+
+        sql_references: list[Reference] = []
+        for ref_chunk in references:
+            sql_references.append(
+                Reference(
+                    source_chunk_id=existing_chunk.chunk_id,
+                    reference_type="document",
+                    reference_document_id=ref_chunk.document_id,
+                    reference_note_id=None,
+                )
+            )
+
+        if sql_references:
+            session.add_all(sql_references)
+            session.commit()
+
+        feedback = get_feedback(reviewer, doc, text, references, user)
+        if feedback != "NONE":
+            sql_feedback = Feedback(
+                source_chunk_id=existing_chunk.chunk_id,
+                feedback=feedback,
+                model=reviewer.model,
+            )
+            session.add(sql_feedback)
+            session.commit()
+
+
+def remove_text(session: SASession, text: str, document_id: str) -> None:
+    chunks = session.query(Chunk).filter(
+        Chunk.document_id == document_id,
+        Chunk.content == text,
+    )
+    chunks.delete(synchronize_session=False)
+    session.commit()
+
+
+def get_all_chunks_for_document(session: SASession, doc: Document) -> list[Chunk]:
+    doc_chunks = session.query(Chunk).filter(Chunk.document_id == doc.document_id).all()
+    note_chunks: list[Chunk] = []
+    notes = session.query(Note).filter(Note.document_id == doc.document_id).all()
+    for note in notes:
+        note_text_chunks = chunk_text(note.content)
+        for text in note_text_chunks:
+            chunk_hash = hash(text)
+            existing = session.query(Chunk).filter(
+                Chunk.hash == str(chunk_hash),
+                Chunk.document_id == doc.document_id,
+                Chunk.content == text,
+            ).first()
+            if not existing:
+                embedding = create_embedding(Embedder(), text, user=doc.author_id)
+                note_chunk = Chunk(
+                    hash=str(chunk_hash),
+                    document_id=doc.document_id,
+                    content=text,
+                    embedding=embedding,
+                )
+                session.add(note_chunk)
+                session.commit()
+                note_chunks.append(note_chunk)
+            else:
+                note_chunks.append(existing)
+    return doc_chunks + note_chunks
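The edit-significance gate is worth seeing with numbers: `detect_significant_edits` keeps a new chunk only when its best Levenshtein ratio against every previous chunk falls below the 0.5 threshold, and `prioritize_chunks` then orders the survivors longest-first. An illustrative run (the example strings and approximate ratios are not from the package):

```python
import Levenshtein  # the same library main.py uses

prev_chunks = ["The quick brown fox jumps over the lazy dog."]
new_chunks = [
    "The quick brown fox jumps over the lazy cat.",      # small edit
    "A new paragraph about embedding-based references.",  # genuinely new
]

for chunk in new_chunks:
    best = max(Levenshtein.ratio(chunk, p) for p in prev_chunks)
    print(round(best, 2), best < 0.5)
# ~0.93 False  -> near-duplicate, no feedback generated for this chunk
# ~0.3  True   -> significant edit, eligible for feedback (rate limit permitting)
```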