deja-cli 0.2.1__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deja_cli-0.2.1 → deja_cli-0.3.1}/PKG-INFO +1 -1
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/cloud.py +241 -49
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/__init__.py +219 -6
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/_helpers.py +18 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/model.py +82 -3
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/services/ranking.py +32 -9
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/services/search.py +26 -5
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/ingest/watchers/codex_cli.py +24 -3
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/ingest/watchers/gemini_cli.py +18 -2
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/_helpers.py +15 -4
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/backfill.py +55 -11
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/cloud.py +249 -19
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/maintenance.py +36 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/memory.py +5 -1
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/session.py +17 -6
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/setup.py +73 -7
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/mcp_server.py +14 -2
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/web.py +19 -3
- deja_cli-0.3.1/deja/llm/base.py +54 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/llm/embedding.py +10 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/llm/factory.py +38 -1
- deja_cli-0.3.1/deja/llm/providers/anthropic.py +47 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/hooks/deja-post-fail.sh +13 -1
- deja_cli-0.3.1/hooks/deja-precompact.sh +32 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/hooks/deja-recall.sh +19 -2
- {deja_cli-0.2.1 → deja_cli-0.3.1}/pyproject.toml +1 -1
- deja_cli-0.2.1/deja/llm/base.py +0 -34
- deja_cli-0.2.1/deja/llm/providers/anthropic.py +0 -21
- deja_cli-0.2.1/hooks/deja-precompact.sh +0 -20
- {deja_cli-0.2.1 → deja_cli-0.3.1}/.github/workflows/ci.yml +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/.gitignore +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/LICENSE +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/README.pypi.md +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/config/default.yaml +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/config.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/extractor.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/reflection.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/_schema.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/connection.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/policy.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/queries.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/repos/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/repos/memories.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/repos/observations.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/repos/reflection.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/services/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/services/load.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/services/maintenance.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/core/store/services/save.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/ingest/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/ingest/watchers/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/ingest/watchers/base.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/ingest/watchers/claude_code.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/transfer.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/cli/watch.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/interfaces/web_ui/index.html +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/llm/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/llm/providers/__init__.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/llm/providers/ollama.py +0 -0
- {deja_cli-0.2.1 → deja_cli-0.3.1}/deja/main.py +0 -0
|
@@ -12,13 +12,90 @@ import threading
|
|
|
12
12
|
import webbrowser
|
|
13
13
|
from http.server import BaseHTTPRequestHandler, HTTPServer
|
|
14
14
|
from pathlib import Path
|
|
15
|
-
from typing import Callable, Iterator, Optional
|
|
15
|
+
from typing import Callable, Iterator, Optional, Union
|
|
16
16
|
from urllib.parse import parse_qs, urlparse
|
|
17
17
|
|
|
18
18
|
import httpx
|
|
19
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
21
22
|
|
|
23
|
+
|
|
24
|
+
# ── Typed payloads (R8, 2026-04-22 review) ──────────────────────────────────
|
|
25
|
+
# Replace the previous ``dict → dict`` boundary at auth + push with typed
|
|
26
|
+
# Pydantic models. The bugs the prior pass caught (N5 trigger→triggerCmds
|
|
27
|
+
# silent drop; N6 endpoint not persisted in auth; theoretical AuthState
|
|
28
|
+
# typo wedging every command) all lived in dict access patterns where a
|
|
29
|
+
# missing or misspelled key was indistinguishable from "field not set."
|
|
30
|
+
# Pydantic models make construction the single point that enforces the
|
|
31
|
+
# field contract.
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AuthState(BaseModel):
    """Typed view of the credentials persisted in ``~/.deja/auth.json``.

    Only ``access_token`` (the PAT) is mandatory. ``endpoint`` records the
    host that issued the token (Bug N6, 2026-04-19) and travels with the
    credential: when set it overrides ``config.cloud.endpoint`` so a token
    issued by host A is never sent to host B. ``token_type`` and ``user_id``
    are extra OAuth-flow fields; they are optional so legacy auth files keep
    parsing.
    """

    # extra="allow": keys this model does not declare are tolerated instead
    # of failing validation (forward compatibility with newer auth files).
    model_config = ConfigDict(extra="allow")

    # The personal access token; validation fails when it is missing.
    access_token: str
    # Host the token was issued by, if recorded.
    endpoint: Optional[str] = None
    # Optional OAuth-flow metadata.
    token_type: Optional[str] = None
    user_id: Optional[str] = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class CloudPushPayload(BaseModel):
    """Request-body shape for ``POST /v1/memories`` and ``POST /v1/sync/push``.

    The cloud validates with ``forbidNonWhitelisted``, so any key it does
    not know is answered with a 400. The earlier string-allowlist filter
    dropped fields whose cloud-side name differs from the local schema — the
    best-known case being the local ``trigger`` comma string versus the
    cloud's ``triggerCmds`` list (Bug N5). Here each field is declared once,
    under its snake_case Python name, and ``serialization_alias`` supplies
    the camelCase wire name.

    Construct instances with the snake_case names; the aliases only apply
    when dumping (see :meth:`to_wire`).

    The field set and aliases must mirror the cloud DTO at
    ``apps/api/src/memories/dto/create-memory.dto.ts``. When the cloud starts
    accepting a new field, declare it here with the right alias and every
    save path that builds this model picks it up.
    """

    # NOTE(review): only serialization aliases are declared below, so
    # ``populate_by_name`` presumably changes nothing at construction time —
    # confirm before relying on it.
    # NOTE(review): no ``extra`` policy is set, so pydantic's default applies
    # to unknown constructor kwargs — confirm whether ``extra="forbid"`` is
    # wanted to surface misspelled field names.
    model_config = ConfigDict(populate_by_name=True)

    # ── Required identification ──────────────────────────────────────────
    local_id: str = Field(serialization_alias="localId")
    content: str
    type: str
    scope: str

    # ── Optional metadata accepted by the cloud ──────────────────────────
    # ``domain`` / ``source`` / ``entity_graph`` / ``embedding`` / the raw
    # ``trigger`` comma-string / timestamps other than lastConfirmed are
    # deliberately not declared: the cloud rejects unknown keys with a 400
    # under ``forbidNonWhitelisted``.
    project: Optional[str] = None
    confidence: Optional[float] = None
    category: Optional[str] = None
    trigger_cmds: Optional[list[str]] = Field(
        None, serialization_alias="triggerCmds",
    )
    last_confirmed: Optional[str] = Field(
        None, serialization_alias="lastConfirmed",
    )
    archived: Optional[bool] = None
    archived_at: Optional[str] = Field(
        None, serialization_alias="archivedAt",
    )

    def to_wire(self) -> dict:
        """Return the camelCase dict for the cloud DTO, omitting ``None`` fields."""
        return self.model_dump(exclude_none=True, by_alias=True)
|
|
98
|
+
|
|
22
99
|
CLI_REDIRECT_PORT = 51234
|
|
23
100
|
|
|
24
101
|
AUTH_FILE = Path.home() / ".deja" / "auth.json"
|
|
@@ -46,8 +123,8 @@ def _get_endpoint(config=None) -> str:
|
|
|
46
123
|
"where these specific creds belong."
|
|
47
124
|
"""
|
|
48
125
|
auth = load_auth()
|
|
49
|
-
if auth and auth.
|
|
50
|
-
return str(auth
|
|
126
|
+
if auth and auth.endpoint:
|
|
127
|
+
return str(auth.endpoint).rstrip("/")
|
|
51
128
|
if config is None:
|
|
52
129
|
return DEFAULT_ENDPOINT
|
|
53
130
|
cloud = getattr(config, "cloud", None)
|
|
@@ -59,13 +136,22 @@ def _get_endpoint(config=None) -> str:
|
|
|
59
136
|
# ── Token storage ─────────────────────────────────────────────────────
|
|
60
137
|
|
|
61
138
|
|
|
62
|
-
def load_auth() -> Optional[
|
|
139
|
+
def load_auth() -> Optional[AuthState]:
    """Read ``~/.deja/auth.json`` into a typed ``AuthState``.

    R8 (2026-04-22 review): this used to return ``Optional[dict]`` and every
    caller did ``auth.get("access_token")`` / ``auth.get("endpoint")``, so a
    file holding ``{"token": ...}`` instead of ``{"access_token": ...}``
    silently wedged every later command. Model validation now rejects that
    shape at the boundary.

    Returns:
        The parsed ``AuthState``, or ``None`` when the auth file is absent.

    Raises:
        json.JSONDecodeError: the file exists but is not valid JSON.
        pydantic.ValidationError: the JSON lacks the required fields.
    """
    # Read first and handle "missing" afterwards: checking ``exists()`` and
    # then reading leaves a window in which the file can disappear (e.g. a
    # concurrent logout) and surface as an unexpected FileNotFoundError.
    # NotADirectoryError covers a non-directory path component, which
    # ``Path.exists()`` also reported as "absent".
    try:
        text = AUTH_FILE.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return None
    return AuthState.model_validate(json.loads(text))
|
|
66
152
|
|
|
67
153
|
|
|
68
|
-
def save_auth(data: dict) -> None:
|
|
154
|
+
def save_auth(data: Union[AuthState, dict]) -> None:
|
|
69
155
|
# Bug Q2 (2026-04-19 pass 3): atomic rewrite. ``Path.write_text``
|
|
70
156
|
# truncates then writes, so a crash mid-write (Ctrl-C, OOM, kernel
|
|
71
157
|
# panic) leaves ``auth.json`` empty or half-written — ``load_auth``
|
|
@@ -75,8 +161,14 @@ def save_auth(data: dict) -> None:
|
|
|
75
161
|
# same directory + ``os.replace`` + cleanup on failure.
|
|
76
162
|
# Bug Q3 (2026-04-19 pass 3): 0700 on the parent so new installs
|
|
77
163
|
# don't create a world-readable ~/.deja.
|
|
164
|
+
# Accept either a typed ``AuthState`` or a raw dict (for legacy callers
|
|
165
|
+
# / tests). Normalize via model construction so a dict missing
|
|
166
|
+
# ``access_token`` raises a validation error here, not later when a
|
|
167
|
+
# caller tries to read the field.
|
|
168
|
+
if not isinstance(data, AuthState):
|
|
169
|
+
data = AuthState.model_validate(data)
|
|
78
170
|
AUTH_FILE.parent.mkdir(exist_ok=True, mode=0o700)
|
|
79
|
-
payload = json.dumps(data, indent=2)
|
|
171
|
+
payload = json.dumps(data.model_dump(exclude_none=True), indent=2)
|
|
80
172
|
fd, tmp_name = tempfile.mkstemp(
|
|
81
173
|
prefix=".auth.", suffix=".tmp", dir=AUTH_FILE.parent,
|
|
82
174
|
)
|
|
@@ -104,7 +196,7 @@ def get_token(config=None) -> Optional[str]:
|
|
|
104
196
|
auth = load_auth()
|
|
105
197
|
if not auth:
|
|
106
198
|
return None
|
|
107
|
-
return auth.
|
|
199
|
+
return auth.access_token
|
|
108
200
|
|
|
109
201
|
|
|
110
202
|
# ── Browser login flow ────────────────────────────────────────────────
|
|
@@ -190,9 +282,6 @@ def whoami(config=None) -> Optional[dict]:
|
|
|
190
282
|
# ── Save to cloud ─────────────────────────────────────────────────────
|
|
191
283
|
|
|
192
284
|
|
|
193
|
-
_PUSH_FIELDS = {"content", "type", "project", "confidence", "triggerCmds", "category"}
|
|
194
|
-
|
|
195
|
-
|
|
196
285
|
def push_memory(memory: dict, config=None) -> tuple[bool, Optional[str]]:
|
|
197
286
|
"""Push a single memory to cloud. Best-effort, never raises.
|
|
198
287
|
|
|
@@ -411,47 +500,57 @@ def save_stuck_ids(endpoint: str, stuck: dict[str, str]) -> None:
|
|
|
411
500
|
def _sanitize_for_push(memory: dict) -> dict:
    """Convert a local memory dict to the shape the cloud API accepts.

    The cloud uses ``forbidNonWhitelisted`` validation, so any key not on
    the DTO causes ``HTTP 400 "property X should not exist"``. R8
    (2026-04-22 review) replaced the previous string-allowlist filter with a
    typed :class:`CloudPushPayload`: only the fields passed to the model
    below can reach the wire, the model's serialization aliases perform the
    snake→camel rename (``id`` → ``localId``, ``last_confirmed`` →
    ``lastConfirmed``, ``archived_at`` → ``archivedAt``), and ``to_wire``
    drops fields that are ``None``.

    Bug N5 (2026-04-19): the local schema stores command-boundary triggers
    as a ``trigger`` comma-string (``"alembic upgrade, db migrate"``) while
    the cloud DTO expects ``triggerCmds: list[str]``. That conversion is a
    split, not a rename, so it happens here rather than in an alias. A
    caller that already supplies a ``triggerCmds`` list (the key the old
    allowlist passed through) is still honoured when ``trigger`` yields
    nothing, so such callers do not silently lose their triggers.

    Keep field set + aliases in sync with ``CreateMemoryDto`` in
    ``~/projects/deja_sh/apps/api/src/memories/dto/create-memory.dto.ts``.

    Args:
        memory: Local-schema memory row as a dict. Missing ``id`` /
            ``content`` / ``type`` are sent as empty strings.

    Returns:
        A camelCase dict ready to be JSON-encoded for the cloud.
    """
    trigger_cmds: Optional[list[str]] = None
    trigger_str = memory.get("trigger")
    if trigger_str:
        tokens = [t.strip() for t in trigger_str.split(",") if t.strip()]
        trigger_cmds = tokens or None
    if trigger_cmds is None:
        # Backward compatibility: accept an already-translated list.
        pretranslated = memory.get("triggerCmds")
        if isinstance(pretranslated, list):
            cleaned = [
                t.strip()
                for t in pretranslated
                if isinstance(t, str) and t.strip()
            ]
            trigger_cmds = cleaned or None

    archived_at = memory.get("archived_at")
    payload = CloudPushPayload(
        local_id=memory.get("id", ""),
        content=memory.get("content", ""),
        type=memory.get("type", ""),
        # Cloud-side scope is flat — the local "global" / "project:<name>"
        # encoding doesn't apply (the cloud derives scope from its own
        # ``project`` column).
        scope="global",
        project=memory.get("project"),
        confidence=memory.get("confidence"),
        category=memory.get("category"),
        trigger_cmds=trigger_cmds,
        last_confirmed=memory.get("last_confirmed"),
        # Send both the boolean and the timestamp so the cloud uses the
        # original archive time for LWW conflict resolution rather than
        # auto-stamping NOW() on receipt.
        archived=True if archived_at else None,
        archived_at=archived_at,
    )
    return payload.to_wire()
|
|
455
554
|
|
|
456
555
|
|
|
457
556
|
_PULL_RENAME = {
|
|
@@ -532,6 +631,27 @@ def _sanitize_for_pull(memory: dict) -> dict:
|
|
|
532
631
|
return out
|
|
533
632
|
|
|
534
633
|
|
|
634
|
+
class SyncPushPartialError(RuntimeError):
    """Raised by :func:`sync_push` when a transport failure aborts the
    push mid-stream.

    Carries the ``partial`` dict (``accepted``, ``skipped``, ``conflicts``,
    ``serverTime`` aggregated across the batches that DID land) so callers
    can persist what landed before the failure rather than re-pushing
    everything blind.

    Bug N1 (2026-05-01 review): the previous shape raised plain
    ``RuntimeError`` and discarded ``aggregated["conflicts"]`` — earlier-batch
    quota / content-too-long rejections never reached the user, who saw
    "sync push failed" with no list of which rows were already permanently
    rejected. With LWW upserts the next sync re-pushes everything (safe), but
    the operator still had no signal about the rejected subset.

    Args:
        message: Human-readable failure description (becomes ``str(exc)``).
        partial: Aggregated result of the batches that succeeded. Stored by
            reference, not copied.
    """

    def __init__(self, message: str, *, partial: dict) -> None:
        super().__init__(message)
        self.partial = partial

    def __reduce__(self):
        """Support ``pickle`` / ``copy`` despite the keyword-only argument.

        The default exception reduce recreates the object as ``cls(*args)``,
        which fails here with ``TypeError`` because ``partial`` is required
        and keyword-only. Rebuilding through ``functools.partial`` supplies
        it; the instance ``__dict__`` is passed as state so extra attributes
        survive too.
        """
        import functools

        return (
            functools.partial(type(self), partial=self.partial),
            self.args,
            vars(self).copy(),
        )
|
|
653
|
+
|
|
654
|
+
|
|
535
655
|
SYNC_PUSH_BATCH_SIZE = 50
|
|
536
656
|
"""Max rows per ``POST /v1/sync/push`` body.
|
|
537
657
|
|
|
@@ -593,9 +713,23 @@ def sync_push(memories: list[dict], config=None) -> dict:
|
|
|
593
713
|
chunk = sanitized[start : start + SYNC_PUSH_BATCH_SIZE]
|
|
594
714
|
resp = httpx.post(url, json={"memories": chunk}, headers=headers, timeout=60)
|
|
595
715
|
if not resp.is_success:
|
|
596
|
-
|
|
716
|
+
# N1 (2026-05-01 review): before raising, log any conflicts
|
|
717
|
+
# we accumulated from EARLIER successful batches so they're
|
|
718
|
+
# at least observable — and attach the full partial dict to
|
|
719
|
+
# the exception so callers that catch ``SyncPushPartialError``
|
|
720
|
+
# can persist it (e.g. into ``sync_state.json`` so the next
|
|
721
|
+
# sync knows which rows the cloud already rejected).
|
|
722
|
+
if aggregated["conflicts"]:
|
|
723
|
+
logger.warning(
|
|
724
|
+
"cloud sync push aborted with %d earlier-batch "
|
|
725
|
+
"rejection(s) before the transport error — see "
|
|
726
|
+
"exception.partial['conflicts']",
|
|
727
|
+
len(aggregated["conflicts"]),
|
|
728
|
+
)
|
|
729
|
+
raise SyncPushPartialError(
|
|
597
730
|
f"sync push failed ({resp.status_code}) after "
|
|
598
|
-
f"{aggregated['accepted']} accepted in earlier batches: {resp.text}"
|
|
731
|
+
f"{aggregated['accepted']} accepted in earlier batches: {resp.text}",
|
|
732
|
+
partial=aggregated,
|
|
599
733
|
)
|
|
600
734
|
body = resp.json()
|
|
601
735
|
aggregated["accepted"] += body.get("accepted", 0) or 0
|
|
@@ -642,3 +776,61 @@ def sync_pull(since: Optional[str] = None, config=None) -> dict:
|
|
|
642
776
|
)
|
|
643
777
|
resp.raise_for_status()
|
|
644
778
|
return resp.json()
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def get_memory_by_local_id(
    local_id: str, config=None
) -> Optional[list[dict]]:
    """Fetch the cloud's view of a row by its local id (Layer 2 verify).

    Hits ``GET /v1/memories/by-local-id/<local_id>`` (shipped 2026-05-04 on
    deja_sh) and returns the body — an array of cloud rows matching
    ``(user_id, local_id)``, sorted ``updatedAt DESC``. Per the cloud
    contract:

    - ``200`` with an array → success. ``len == 0`` means no row matches.
      ``len == 1`` is the normal case. ``len > 1`` is an anomaly the
      verify path is expected to surface (no UNIQUE on
      ``(user_id, local_id)`` in Postgres yet).
    - ``404`` → no row matches; surfaced as an empty list so callers can
      handle "missing" and "anomaly" with one branch.
    - Anything else (transport error, other status, non-JSON or non-list
      body) → ``None``. Best-effort: a transient verification failure must
      not be conflated with a divergence signal — that would spam
      ``_stuck`` on every flaky network.

    The call is intentionally one localId at a time; the divergence
    surface is rare (only fires on push-archive verification today) and
    the cloud endpoint is single-id by design.

    Args:
        local_id: The local primary-key id of the memory to look up.
        config: Optional config object forwarded to ``get_token`` and
            ``_get_endpoint``.

    Returns:
        A list of cloud rows (possibly empty), or ``None`` when the lookup
        could not be completed.

    Raises:
        RuntimeError: no token is available (user not logged in).
    """
    from urllib.parse import quote

    token = get_token(config)
    if not token:
        raise RuntimeError("Not logged in. Run `deja login`.")
    endpoint = _get_endpoint(config)
    # Percent-encode the id so a value containing "/", "?" or "#" cannot
    # change which path (or query) is requested.
    url = f"{endpoint}/v1/memories/by-local-id/{quote(str(local_id), safe='')}"
    try:
        resp = httpx.get(
            url,
            headers={"Authorization": f"Bearer {token}"},
            timeout=10,
        )
    except Exception as exc:  # deliberate best-effort boundary
        logger.warning("verify-by-local-id %s failed: %s", local_id, exc)
        return None
    if resp.status_code == 404:
        return []
    if not resp.is_success:
        logger.warning(
            "verify-by-local-id %s returned %d: %s",
            local_id, resp.status_code, resp.text[:120],
        )
        return None
    try:
        body = resp.json()
    except ValueError as exc:
        # A 2xx with an undecodable body (e.g. a proxy's HTML page) is a
        # failed verification, not a reason to crash the caller.
        logger.warning(
            "verify-by-local-id %s returned non-JSON body: %s",
            local_id, exc,
        )
        return None
    if isinstance(body, list):
        return body
    # Unexpected shape (cloud contract change?). Treat as best-effort
    # failure rather than asserting; logging gives the operator signal.
    logger.warning(
        "verify-by-local-id %s returned non-list body: %r",
        local_id, body,
    )
    return None
|
|
@@ -328,6 +328,25 @@ class MemoryStore:
|
|
|
328
328
|
db = await self._connection.get()
|
|
329
329
|
return await self._mem().without_embeddings(db, project)
|
|
330
330
|
|
|
331
|
+
async def fetch_one_existing_embedding(self) -> Optional[bytes]:
    """Return one stored embedding blob, or None if no row has one yet.

    Used by ``deja embed`` (N4, 2026-05-01 review) to detect dim
    mismatch between the vault and the configured embedding model
    BEFORE the backfill loop starts writing new rows at a different
    dimension. Cheap probe — single ``LIMIT 1`` SELECT restricted to rows
    that are neither archived nor invalidated.

    Returns:
        The ``embedding`` column of one matching row, or ``None`` when no
        such row exists.
    """
    db = await self._connection.get()
    cursor = await db.execute(
        "SELECT embedding FROM memories WHERE embedding IS NOT NULL "
        "AND archived_at IS NULL AND invalidated_at IS NULL LIMIT 1"
    )
    try:
        row = await cursor.fetchone()
    finally:
        # Close even when fetchone() raises so the cursor is not leaked.
        await cursor.close()
    return None if row is None else row["embedding"]
|
|
349
|
+
|
|
331
350
|
async def update_memory(self, memory_id: str, fields: dict) -> bool:
|
|
332
351
|
"""Update allowed metadata fields on an existing active memory.
|
|
333
352
|
|
|
@@ -489,11 +508,32 @@ class MemoryStore:
|
|
|
489
508
|
if not existing:
|
|
490
509
|
fields = list(memory.keys())
|
|
491
510
|
placeholders = ",".join("?" for _ in fields)
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
511
|
+
try:
|
|
512
|
+
await db.execute(
|
|
513
|
+
f"INSERT INTO memories ({','.join(fields)}) VALUES ({placeholders})",
|
|
514
|
+
[memory[f] for f in fields],
|
|
515
|
+
)
|
|
516
|
+
return "inserted"
|
|
517
|
+
except sqlite3.IntegrityError as e:
|
|
518
|
+
# Bug L4 (2026-05-04, cloud-sync-divergence doc §Layer 4):
|
|
519
|
+
# the partial UNIQUE index ``idx_memories_dedup_unique_active``
|
|
520
|
+
# rejects this INSERT because another *different-id* active
|
|
521
|
+
# row already holds the same (scope, COALESCE(project,''),
|
|
522
|
+
# type, content). Without this catch the IntegrityError
|
|
523
|
+
# propagates up, ``deja sync`` aborts mid-pull, the cursor
|
|
524
|
+
# never advances, and every subsequent sync re-fails on the
|
|
525
|
+
# same row — exactly the wedge observed and hand-recovered
|
|
526
|
+
# on 2026-05-04. Resolve via the same survivor algorithm
|
|
527
|
+
# as ``backfill_unique_active_dedup_index`` and let pull
|
|
528
|
+
# continue.
|
|
529
|
+
#
|
|
530
|
+
# Match strictly on the index name so the M2 PK-race path
|
|
531
|
+
# (``IntegrityError: NOT NULL/UNIQUE constraint failed:
|
|
532
|
+
# memories.id``) and any other constraint violation still
|
|
533
|
+
# propagate — we only know how to self-heal *this* one.
|
|
534
|
+
if "idx_memories_dedup_unique_active" not in str(e):
|
|
535
|
+
raise
|
|
536
|
+
return await self._resolve_dedup_conflict(db, memory)
|
|
497
537
|
if merge_strategy == "skip":
|
|
498
538
|
return "skipped"
|
|
499
539
|
if merge_strategy == "overwrite":
|
|
@@ -520,15 +560,188 @@ class MemoryStore:
|
|
|
520
560
|
new_confidence = min(
|
|
521
561
|
CONFIDENCE_MAX, existing["confidence"] + CONFIDENCE_BUMP
|
|
522
562
|
)
|
|
563
|
+
# Bug FB1 (2026-05-04, cloud-sync feedback loop —
|
|
564
|
+
# docs/sync-feedback-loop-2026-05-04.md): pre-fix this
|
|
565
|
+
# path stamped ``updated_at = now`` (local wall-clock).
|
|
566
|
+
# ``deja sync``'s push filter is ``updated_at > cursor``,
|
|
567
|
+
# and the cursor is set to the cloud's ``serverTime``
|
|
568
|
+
# which was captured *before* this upsert ran — so
|
|
569
|
+
# ``now > cursor`` always, and the pulled row sweeps
|
|
570
|
+
# straight back into the next push. Cloud's bulkUpsert
|
|
571
|
+
# then restamps the cloud-side ``updated_at`` on receipt,
|
|
572
|
+
# the next pull returns the row again, and ``deja sync``
|
|
573
|
+
# ships the entire mirrored corpus on every cycle
|
|
574
|
+
# forever (idempotent at the data layer, but turns
|
|
575
|
+
# every sync into O(corpus) instead of O(delta)).
|
|
576
|
+
#
|
|
577
|
+
# Fix: preserve the cloud-supplied ``updated_at`` —
|
|
578
|
+
# already in ``memory["updated_at"]`` after
|
|
579
|
+
# ``_sanitize_for_pull`` does the camelCase rename.
|
|
580
|
+
# ``last_confirmed`` is genuinely changing here (this
|
|
581
|
+
# IS a re-confirmation event), so it stamps to local
|
|
582
|
+
# now() unchanged.
|
|
583
|
+
cloud_updated_at = memory.get("updated_at") or now
|
|
523
584
|
await db.execute(
|
|
524
585
|
"UPDATE memories "
|
|
525
586
|
"SET confidence = ?, last_confirmed = ?, updated_at = ? "
|
|
526
587
|
"WHERE id = ?",
|
|
527
|
-
(new_confidence, now,
|
|
588
|
+
(new_confidence, now, cloud_updated_at, mem_id),
|
|
528
589
|
)
|
|
529
590
|
return "updated"
|
|
530
591
|
return "skipped"
|
|
531
592
|
|
|
593
|
+
async def _resolve_dedup_conflict(self, db, incoming: dict) -> str:
    """Self-heal a pull-side dedup conflict (Layer 4 of the 2026-05-04
    cloud-sync-divergence recovery plan).

    Called when ``upsert``'s INSERT trips
    ``idx_memories_dedup_unique_active``: an active local row already holds
    the incoming row's (scope, COALESCE(project,''), type, content) tuple
    under a *different* primary-key id — typically the same content saved on
    two machines and both copies pulled back from the cloud (see
    docs/cloud-sync-divergence-2026-05-04.md).

    Steps, kept symmetric with :func:`backfill_unique_active_dedup_index`:

    1. Elect a survivor by comparing ``(reuse_count, confidence, id)``
       tuples; the larger tuple wins, the existing row wins a tie.
    2. Merge stats for the survivor: summed ``reuse_count``, max
       ``confidence``, max ``updated_at`` (falling back to now), max
       ``last_confirmed`` (falling back to ``None``) and the trigger union
       from :func:`_merge_trigger_phrases`.
    3. Archive the loser with ``archived_at = updated_at = now`` — never
       delete it — so the archive transition is picked up by the next push.

    Issues its statements on the ``db`` handle it is given and opens no
    transaction itself; the caller is expected to provide one
    (``async with self._connection.transaction()`` per the original notes)
    so that a failure between the two writes rolls both back.

    Args:
        db: Open database handle supplied by the caller.
        incoming: The pulled row, keyed by column name; must contain
            ``scope``, ``type`` and ``content``.

    Returns:
        ``"merged_existing_archived"`` when the incoming row won (existing
        archived in place, incoming inserted active with merged stats), or
        ``"merged_incoming_archived"`` when the existing row won (its stats
        updated in place, incoming inserted as archived).

    Raises:
        sqlite3.IntegrityError: no active row matches the incoming content
            key, i.e. the index fired but the conflicting row is gone.
    """
    now = _now_iso()

    async def _insert_row(record: dict) -> None:
        # Column list follows the dict's key order, one placeholder each.
        columns = list(record.keys())
        marks = ",".join("?" for _ in columns)
        await db.execute(
            f"INSERT INTO memories ({','.join(columns)}) "
            f"VALUES ({marks})",
            [record[c] for c in columns],
        )

    lookup = await db.execute(
        "SELECT id, reuse_count, confidence, trigger, "
        " updated_at, last_confirmed "
        "FROM memories "
        "WHERE scope = ? AND COALESCE(project, '') = COALESCE(?, '') "
        " AND type = ? AND content = ? "
        " AND archived_at IS NULL AND invalidated_at IS NULL",
        (incoming["scope"], incoming.get("project"),
         incoming["type"], incoming["content"]),
    )
    matches = await lookup.fetchall()
    await lookup.close()
    if not matches:
        # Defensive: the partial UNIQUE fired but no matching active row is
        # visible (schema drift, or another writer archived it meanwhile).
        # Re-raise rather than guess so the caller can roll back.
        raise sqlite3.IntegrityError(
            "idx_memories_dedup_unique_active fired but no matching "
            "active row found — schema invariant broken"
        )
    # The partial UNIQUE index allows at most one active row per content
    # key, so the first match is the conflicting row.
    current = matches[0]

    current_reuse = current["reuse_count"] or 0
    current_confidence = current["confidence"] or 0.0
    incoming_reuse = incoming.get("reuse_count") or 0
    incoming_confidence = incoming.get("confidence") or 0.0

    # Survivor election: higher reuse_count, then higher confidence, then
    # lexicographically larger id.
    existing_wins = (
        current_reuse, current_confidence, current["id"] or "",
    ) >= (
        incoming_reuse, incoming_confidence, incoming.get("id") or "",
    )

    merged_reuse = current_reuse + incoming_reuse
    merged_confidence = max(current_confidence, incoming_confidence)
    merged_trigger = _merge_trigger_phrases(
        current["trigger"], incoming.get("trigger")
    )
    merged_updated_at = max(
        current["updated_at"] or "",
        incoming.get("updated_at") or "",
    ) or now
    merged_last_confirmed = max(
        current["last_confirmed"] or "",
        incoming.get("last_confirmed") or "",
    ) or None

    if not existing_wins:
        # Incoming wins. Archive the existing row first (removing it from
        # the index's active set), then insert incoming with merged stats.
        await db.execute(
            "UPDATE memories SET archived_at = ?, updated_at = ? WHERE id = ?",
            (now, now, current["id"]),
        )
        await _insert_row({
            **incoming,
            "reuse_count": merged_reuse,
            "confidence": merged_confidence,
            "trigger": merged_trigger,
            "updated_at": merged_updated_at,
            "last_confirmed": merged_last_confirmed,
        })
        return "merged_existing_archived"

    # Existing wins: fold the merged stats into it in place.
    await db.execute(
        "UPDATE memories "
        "SET reuse_count = ?, confidence = ?, trigger = ?, "
        " updated_at = ?, last_confirmed = ? "
        "WHERE id = ?",
        (merged_reuse, merged_confidence, merged_trigger,
         merged_updated_at, merged_last_confirmed,
         current["id"]),
    )
    # The losing incoming row is stored as archived so its id stays
    # resolvable; updated_at is bumped to now so the archive transition
    # propagates on the next push.
    await _insert_row({**incoming, "archived_at": now, "updated_at": now})
    return "merged_incoming_archived"
|
|
744
|
+
|
|
532
745
|
# ── observations + reflection meta (delegate to repos) ───────────────────
|
|
533
746
|
|
|
534
747
|
async def save_observation(self, project: Optional[str], content: str) -> str:
|
|
@@ -99,6 +99,24 @@ def _bytes_to_emb(data: bytes) -> list[float]:
|
|
|
99
99
|
|
|
100
100
|
|
|
101
101
|
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
102
|
+
"""Cosine similarity between two equal-length vectors.
|
|
103
|
+
|
|
104
|
+
Bug R20 (2026-04-22 review): raises ``ValueError`` on dim mismatch
|
|
105
|
+
instead of silently truncating to the shorter length via ``zip``.
|
|
106
|
+
Truncated cosine produces garbage scores and the user sees no
|
|
107
|
+
signal — the typical trigger is a vault that mixes embeddings from
|
|
108
|
+
two different models (e.g. ``nomic-embed-text`` 768-dim → switch to
|
|
109
|
+
``mxbai-embed-large`` 1024-dim without re-embedding old rows).
|
|
110
|
+
Callers in :mod:`deja.core.store.services.search` and
|
|
111
|
+
:mod:`deja.core.store.services.ranking` catch this and skip the
|
|
112
|
+
row, logging once per pass.
|
|
113
|
+
"""
|
|
114
|
+
if len(a) != len(b):
|
|
115
|
+
raise ValueError(
|
|
116
|
+
f"cosine_similarity dim mismatch: {len(a)} vs {len(b)}. "
|
|
117
|
+
"Likely mixed-dim embeddings — re-run `deja embed` with the "
|
|
118
|
+
"current embedding.model after changing it in ~/.deja/config.yaml."
|
|
119
|
+
)
|
|
102
120
|
dot = sum(x * y for x, y in zip(a, b))
|
|
103
121
|
mag_a = math.sqrt(sum(x * x for x in a))
|
|
104
122
|
mag_b = math.sqrt(sum(x * x for x in b))
|