cc-star 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cc_star/__init__.py +3 -0
- cc_star/cache/__init__.py +1 -0
- cc_star/cache/connection.py +100 -0
- cc_star/cache/policies.py +94 -0
- cc_star/cache/schema.py +100 -0
- cc_star/cache/skills.py +89 -0
- cc_star/cache/traces.py +163 -0
- cc_star/cache/vector.py +58 -0
- cc_star/cli.py +286 -0
- cc_star/config.py +146 -0
- cc_star/installer.py +311 -0
- cc_star/memos/__init__.py +1 -0
- cc_star/memos/id.py +79 -0
- cc_star/memos/types.py +148 -0
- cc_star/ov/__init__.py +1 -0
- cc_star/ov/client.py +184 -0
- cc_star/retrieval/__init__.py +1 -0
- cc_star/retrieval/ranker.py +112 -0
- cc_star/templates/compact.py +157 -0
- cc_star/templates/inject.py +169 -0
- cc_star/templates/session_start.py +66 -0
- cc_star/templates/store.py +197 -0
- cc_star/templates/summary.py +168 -0
- cc_star-0.1.0.dist-info/METADATA +10 -0
- cc_star-0.1.0.dist-info/RECORD +28 -0
- cc_star-0.1.0.dist-info/WHEEL +4 -0
- cc_star-0.1.0.dist-info/entry_points.txt +2 -0
- cc_star-0.1.0.dist-info/licenses/LICENSE +17 -0
cc_star/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""SQLite local cache layer."""
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""SQLite connection management with performance optimizations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
import threading
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Shared statement cache across threads
|
|
12
|
+
# NOTE(review): nothing in the visible source reads or writes _STMT_CACHE —
# confirm it is still needed before relying on it.
_STMT_CACHE: dict[str, sqlite3.Cursor] = {}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CacheConnection:
    """Per-thread SQLite connection manager with performance tuning.

    Every thread that uses the manager lazily gets its own
    ``sqlite3.Connection`` (kept in a ``threading.local``), opened in
    autocommit mode with ``sqlite3.Row`` as the row factory.

    Optimizations, applied to each connection before its first use:
    - WAL mode for concurrent reads (when ``wal_mode`` is true)
    - 64MB cache for hot data
    - memory-mapped I/O (256MB)
    - Lazy pragma initialization (deferred until first query)
    """

    def __init__(self, db_path: str, wal_mode: bool = True):
        """Record the database location and make sure its directory exists.

        Args:
            db_path: Path of the SQLite file; a leading ``~`` is expanded.
            wal_mode: Whether pragma setup switches the journal to WAL.
        """
        self._db_path = str(Path(db_path).expanduser())
        self._wal = wal_mode
        self._local = threading.local()
        self._lock = threading.Lock()
        # Legacy instance-wide flag, kept so existing attribute access keeps
        # working. Pragma state is tracked per thread on ``self._local``.
        self._initialized = False

        # Ensure parent directory exists
        Path(self._db_path).parent.mkdir(parents=True, exist_ok=True)

    def _ensure_init(self) -> None:
        """Apply performance pragmas once per connection.

        The "already done" marker lives on the thread-local next to the
        connection. A single shared flag would let only the first thread's
        connection receive settings such as ``foreign_keys`` and
        ``busy_timeout``.
        """
        conn = self._get_conn_raw()
        if getattr(self._local, "initialized", False):
            return
        if self._wal:
            conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript("""
            PRAGMA synchronous=NORMAL;
            PRAGMA foreign_keys=ON;
            PRAGMA cache_size=-65536;
            PRAGMA mmap_size=268435456;
            PRAGMA temp_store=MEMORY;
            PRAGMA busy_timeout=5000;
        """)
        self._local.initialized = True
        self._initialized = True

    def _get_conn_raw(self) -> sqlite3.Connection:
        """Return this thread's connection, creating it without pragma setup."""
        conn = getattr(self._local, "conn", None)
        if conn is None:
            conn = sqlite3.connect(
                self._db_path,
                check_same_thread=False,
                isolation_level=None,  # autocommit mode
            )
            conn.row_factory = sqlite3.Row
            self._local.conn = conn
            # A fresh connection has not had its pragmas applied yet.
            self._local.initialized = False
        return conn

    @property
    def conn(self) -> sqlite3.Connection:
        """This thread's connection, with pragmas already applied."""
        self._ensure_init()
        return self._get_conn_raw()

    def execute(self, sql: str, params: tuple = ()) -> sqlite3.Cursor:
        """Execute with automatic pragma init."""
        self._ensure_init()
        return self._get_conn_raw().execute(sql, params)

    def executemany(self, sql: str, params: list[tuple]) -> sqlite3.Cursor:
        """Batch execute with automatic pragma init."""
        self._ensure_init()
        return self._get_conn_raw().executemany(sql, params)

    def close(self) -> None:
        """Close the connection for the current thread.

        Does nothing if this thread has no open connection. The next use
        from this thread opens a new connection and re-applies the pragmas.
        """
        conn = getattr(self._local, "conn", None)
        if conn is None:
            return
        try:
            try:
                conn.execute("PRAGMA optimize")
            except sqlite3.Error:
                # Best-effort maintenance only; never block the close on it.
                pass
            conn.close()
        finally:
            self._local.conn = None
            self._local.initialized = False
            self._initialized = False

    def close_all(self) -> None:
        """Close the calling thread's connection while holding the lock.

        Despite the name, connections live in thread-local storage, so only
        the calling thread's connection can be reached from here.
        Connections owned by other threads are left untouched.
        """
        with self._lock:
            self.close()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Policy repository — local SQLite CRUD for policies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
from cc_star.cache.connection import CacheConnection
|
|
9
|
+
from cc_star.cache.schema import ensure_schema
|
|
10
|
+
from cc_star.memos.types import PolicyRow
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PolicyRepository:
    """Local SQLite store for policies: insert, lookup, listing and counters."""

    def __init__(self, cache: CacheConnection):
        """Keep a handle on *cache* and make sure the tables exist."""
        self._cache = cache
        ensure_schema(cache)

    def insert(self, policy: PolicyRow) -> None:
        """Write *policy* to the local cache, replacing any row with the same id.

        List and dict fields are stored as JSON text, and the row is always
        written with ``synced = 0``.
        """
        embedding_json = json.dumps(policy.embedding) if policy.embedding else None
        trace_ids_json = json.dumps(policy.source_trace_ids, ensure_ascii=False)
        metadata_json = json.dumps(policy.metadata, ensure_ascii=False, default=str)
        values = (
            policy.id,
            policy.name,
            policy.description,
            policy.trigger_pattern,
            policy.action_template,
            embedding_json,
            policy.confidence,
            policy.activation_count,
            trace_ids_json,
            metadata_json,
            policy.created_at,
            0,
        )
        statement = """
            INSERT OR REPLACE INTO policies
            (id, name, description, trigger_pattern, action_template,
            embedding, confidence, activation_count, source_trace_ids,
            metadata, created_at, synced)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """
        db = self._cache.conn
        db.execute(statement, values)
        db.commit()

    def get(self, policy_id: str) -> Optional[PolicyRow]:
        """Return the policy with id *policy_id*, or ``None`` if there is none."""
        cursor = self._cache.conn.execute(
            "SELECT * FROM policies WHERE id = ?", (policy_id,)
        )
        found = cursor.fetchone()
        if found is not None:
            return self._row_to_policy(found)
        return None

    def list_active(self, min_confidence: float = 0.3, limit: int = 20) -> list[PolicyRow]:
        """Return up to *limit* policies with confidence >= *min_confidence*.

        Results are ordered from highest to lowest confidence.
        """
        cursor = self._cache.conn.execute(
            "SELECT * FROM policies WHERE confidence >= ? ORDER BY confidence DESC LIMIT ?",
            (min_confidence, limit),
        )
        policies = []
        for record in cursor.fetchall():
            policies.append(self._row_to_policy(record))
        return policies

    def increment_activation(self, policy_id: str) -> None:
        """Add one to the activation counter of the policy *policy_id*."""
        self._cache.conn.execute(
            "UPDATE policies SET activation_count = activation_count + 1 WHERE id = ?",
            (policy_id,),
        )
        self._cache.conn.commit()

    def count(self) -> int:
        """Return how many policies are stored."""
        cursor = self._cache.conn.execute("SELECT COUNT(*) as cnt FROM policies")
        record = cursor.fetchone()
        if not record:
            return 0
        return record["cnt"]

    @staticmethod
    def _row_to_policy(row: Any) -> PolicyRow:
        """Build a ``PolicyRow`` from a result row, decoding the JSON columns."""
        raw_embedding = row["embedding"]
        raw_trace_ids = row["source_trace_ids"]
        raw_metadata = row["metadata"]

        # Non-string values (e.g. NULL) fall back to an empty container.
        if isinstance(raw_trace_ids, str):
            trace_ids = json.loads(raw_trace_ids)
        else:
            trace_ids = []
        if isinstance(raw_metadata, str):
            metadata = json.loads(raw_metadata)
        else:
            metadata = {}

        return PolicyRow(
            id=row["id"],
            name=row["name"],
            description=row["description"],
            trigger_pattern=row["trigger_pattern"],
            action_template=row["action_template"],
            embedding=json.loads(raw_embedding) if raw_embedding else None,
            confidence=row["confidence"],
            activation_count=row["activation_count"],
            source_trace_ids=trace_ids,
            metadata=metadata,
            created_at=row["created_at"],
        )
|
cc_star/cache/schema.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""SQLite schema — traces, policies, skills tables with FTS5."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from cc_star.cache.connection import CacheConnection
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def ensure_schema(conn_or_cache: CacheConnection) -> None:
    """Create every table, index, FTS table and trigger that is still missing.

    Accepts a ``CacheConnection`` (its ``conn`` is used) or any other object,
    which is then used directly as the connection. The script creates the
    ``traces``, ``policies`` and ``skills`` tables, their indexes, the
    ``traces_fts`` FTS5 table backed by ``traces``, and the three triggers
    that mirror inserts, deletes and updates on ``traces`` into it.
    """
    if isinstance(conn_or_cache, CacheConnection):
        db = conn_or_cache.conn
    else:
        db = conn_or_cache

    ddl_script = """
        CREATE TABLE IF NOT EXISTS traces (
            id TEXT PRIMARY KEY,
            session_id TEXT NOT NULL,
            turn_index INTEGER NOT NULL DEFAULT 0,
            user_content TEXT NOT NULL,
            assistant_content TEXT NOT NULL DEFAULT '',
            embedding BLOB,
            reward REAL NOT NULL DEFAULT 0.0,
            tags TEXT DEFAULT '',
            metadata TEXT DEFAULT '{}',
            created_at TEXT NOT NULL,
            synced INTEGER NOT NULL DEFAULT 0
        );

        CREATE INDEX IF NOT EXISTS idx_traces_session
            ON traces(session_id);
        CREATE INDEX IF NOT EXISTS idx_traces_created
            ON traces(created_at);
        CREATE INDEX IF NOT EXISTS idx_traces_synced
            ON traces(synced);

        CREATE TABLE IF NOT EXISTS policies (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL,
            description TEXT NOT NULL DEFAULT '',
            trigger_pattern TEXT NOT NULL DEFAULT '',
            action_template TEXT NOT NULL DEFAULT '',
            embedding BLOB,
            confidence REAL NOT NULL DEFAULT 0.0,
            activation_count INTEGER NOT NULL DEFAULT 0,
            source_trace_ids TEXT DEFAULT '[]',
            metadata TEXT DEFAULT '{}',
            created_at TEXT NOT NULL,
            synced INTEGER NOT NULL DEFAULT 0
        );

        CREATE INDEX IF NOT EXISTS idx_policies_confidence
            ON policies(confidence DESC);

        CREATE TABLE IF NOT EXISTS skills (
            name TEXT PRIMARY KEY,
            description TEXT NOT NULL DEFAULT '',
            usage_guide TEXT NOT NULL DEFAULT '',
            source_policy_ids TEXT DEFAULT '[]',
            version INTEGER NOT NULL DEFAULT 1,
            metadata TEXT DEFAULT '{}',
            created_at TEXT NOT NULL
        );

        CREATE VIRTUAL TABLE IF NOT EXISTS traces_fts
            USING fts5(
                user_content,
                assistant_content,
                tags,
                content='traces',
                content_rowid='rowid'
            );

        CREATE TRIGGER IF NOT EXISTS traces_ai AFTER INSERT ON traces BEGIN
            INSERT INTO traces_fts(rowid, user_content, assistant_content, tags)
            VALUES (new.rowid, new.user_content, new.assistant_content, new.tags);
        END;

        CREATE TRIGGER IF NOT EXISTS traces_ad AFTER DELETE ON traces BEGIN
            INSERT INTO traces_fts(traces_fts, rowid, user_content, assistant_content, tags)
            VALUES ('delete', old.rowid, old.user_content, old.assistant_content, old.tags);
        END;

        CREATE TRIGGER IF NOT EXISTS traces_au AFTER UPDATE ON traces BEGIN
            INSERT INTO traces_fts(traces_fts, rowid, user_content, assistant_content, tags)
            VALUES ('delete', old.rowid, old.user_content, old.assistant_content, old.tags);
            INSERT INTO traces_fts(rowid, user_content, assistant_content, tags)
            VALUES (new.rowid, new.user_content, new.assistant_content, new.tags);
        END;
    """
    db.executescript(ddl_script)
    db.commit()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def drop_schema(conn_or_cache: CacheConnection) -> None:
    """Remove the FTS table and the three data tables (intended for tests).

    Accepts a ``CacheConnection`` (its ``conn`` is used) or any other object,
    which is then used directly as the connection.
    """
    if isinstance(conn_or_cache, CacheConnection):
        db = conn_or_cache.conn
    else:
        db = conn_or_cache

    teardown_script = """
        DROP TABLE IF EXISTS traces_fts;
        DROP TABLE IF EXISTS skills;
        DROP TABLE IF EXISTS policies;
        DROP TABLE IF EXISTS traces;
    """
    db.executescript(teardown_script)
    db.commit()
|
cc_star/cache/skills.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Skill repository — local SQLite CRUD for skills."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
from cc_star.cache.connection import CacheConnection
|
|
9
|
+
from cc_star.cache.schema import ensure_schema
|
|
10
|
+
from cc_star.memos.types import SkillRow
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SkillRepository:
    """Persist and query skills locally."""

    def __init__(self, cache: CacheConnection):
        """Keep a handle on *cache* and make sure the tables exist."""
        self._cache = cache
        ensure_schema(cache)

    def insert(self, skill: SkillRow) -> None:
        """Insert a skill into local cache, replacing any row with the same name.

        ``source_policy_ids`` and ``metadata`` are stored as JSON text.
        """
        conn = self._cache.conn
        conn.execute(
            """
            INSERT OR REPLACE INTO skills
            (name, description, usage_guide, source_policy_ids,
            version, metadata, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                skill.name,
                skill.description,
                skill.usage_guide,
                json.dumps(skill.source_policy_ids, ensure_ascii=False),
                skill.version,
                json.dumps(skill.metadata, ensure_ascii=False, default=str),
                skill.created_at,
            ),
        )
        conn.commit()

    def get(self, name: str) -> Optional[SkillRow]:
        """Get a skill by name, or ``None`` when no such skill is stored."""
        row = self._cache.conn.execute(
            "SELECT * FROM skills WHERE name = ?", (name,)
        ).fetchone()
        if row is None:
            return None
        return self._row_to_skill(row)

    def list_all(self) -> list[SkillRow]:
        """List all skills, ordered by name."""
        rows = self._cache.conn.execute(
            "SELECT * FROM skills ORDER BY name ASC"
        ).fetchall()
        return [self._row_to_skill(r) for r in rows]

    def search(self, query: str, limit: int = 10) -> list[SkillRow]:
        """Search skills whose name or description contains *query*.

        The query is matched as a literal substring: ``%`` and ``_`` are
        escaped so they are not treated as ``LIKE`` wildcards. Otherwise a
        search for ``my_skill`` would also return ``myXskill``.

        Args:
            query: Text to look for.
            limit: Maximum number of rows to return.
        """
        like = f"%{self._escape_like(query)}%"
        rows = self._cache.conn.execute(
            "SELECT * FROM skills "
            "WHERE name LIKE ? ESCAPE '\\' OR description LIKE ? ESCAPE '\\' "
            "LIMIT ?",
            (like, like, limit),
        ).fetchall()
        return [self._row_to_skill(r) for r in rows]

    def delete(self, name: str) -> None:
        """Delete a skill by name."""
        self._cache.conn.execute("DELETE FROM skills WHERE name = ?", (name,))
        self._cache.conn.commit()

    def count(self) -> int:
        """Total skill count."""
        row = self._cache.conn.execute("SELECT COUNT(*) as cnt FROM skills").fetchone()
        return row["cnt"] if row else 0

    @staticmethod
    def _escape_like(text: str) -> str:
        """Backslash-escape the characters that are special in a LIKE pattern."""
        # The escape character itself must be handled first.
        return (
            text.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )

    @staticmethod
    def _row_to_skill(row: Any) -> SkillRow:
        """Build a ``SkillRow`` from a result row, decoding the JSON columns.

        Non-string JSON columns (e.g. NULL) fall back to an empty container.
        """
        return SkillRow(
            name=row["name"],
            description=row["description"],
            usage_guide=row["usage_guide"],
            source_policy_ids=json.loads(row["source_policy_ids"])
            if isinstance(row["source_policy_ids"], str)
            else [],
            version=row["version"],
            metadata=json.loads(row["metadata"]) if isinstance(row["metadata"], str) else {},
            created_at=row["created_at"],
        )
|
cc_star/cache/traces.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""Trace repository — local SQLite CRUD for traces with batch operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
from cc_star.cache.connection import CacheConnection
|
|
9
|
+
from cc_star.cache.schema import ensure_schema
|
|
10
|
+
from cc_star.memos.types import TraceRow
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TraceRepository:
    """Persist and query traces locally."""

    # Upsert instead of INSERT OR REPLACE. REPLACE removes the conflicting row
    # without firing DELETE triggers (unless recursive triggers are enabled),
    # so the old text would stay in traces_fts. DO UPDATE keeps the row and
    # fires the AFTER UPDATE trigger, which swaps the old FTS entry for the
    # new one.
    _INSERT_SQL = (
        "INSERT INTO traces "
        "(id, session_id, turn_index, user_content, assistant_content, "
        "embedding, reward, tags, metadata, created_at, synced) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) "
        "ON CONFLICT(id) DO UPDATE SET "
        "session_id = excluded.session_id, "
        "turn_index = excluded.turn_index, "
        "user_content = excluded.user_content, "
        "assistant_content = excluded.assistant_content, "
        "embedding = excluded.embedding, "
        "reward = excluded.reward, "
        "tags = excluded.tags, "
        "metadata = excluded.metadata, "
        "created_at = excluded.created_at, "
        "synced = excluded.synced"
    )

    def __init__(self, cache: CacheConnection):
        """Keep a handle on *cache* and make sure the tables exist."""
        self._cache = cache
        ensure_schema(cache)
        # Statement text shared by insert() and insert_batch()
        self._insert_sql = self._INSERT_SQL

    def insert(self, trace: TraceRow) -> None:
        """Insert a single trace, overwriting any stored trace with the same id."""
        self._cache.execute(
            self._insert_sql,
            self._trace_to_row(trace),
        )

    def insert_batch(self, traces: list[TraceRow]) -> None:
        """Batch insert multiple traces (same overwrite semantics as insert)."""
        rows = [self._trace_to_row(t) for t in traces]
        self._cache.executemany(self._insert_sql, rows)

    def get(self, trace_id: str) -> Optional[TraceRow]:
        """Get a trace by ID, or ``None`` when it is not stored."""
        row = self._cache.execute(
            "SELECT * FROM traces WHERE id = ?", (trace_id,)
        ).fetchone()
        if row is None:
            return None
        return self._row_to_trace(row)

    def list_by_session(self, session_id: str, limit: int = 50) -> list[TraceRow]:
        """List traces for a session, ordered by turn index."""
        rows = self._cache.execute(
            "SELECT * FROM traces WHERE session_id = ? ORDER BY turn_index ASC LIMIT ?",
            (session_id, limit),
        ).fetchall()
        return [self._row_to_trace(r) for r in rows]

    def search_fts(self, query: str, limit: int = 8) -> list[TraceRow]:
        """Full-text search on traces using FTS5.

        *query* is treated as plain text, not as FTS5 query syntax. Each
        whitespace-separated term is passed as a quoted string, so
        punctuation and words such as ``AND``/``OR`` cannot cause a syntax
        error. All terms must match.

        Args:
            query: Free text to search for.
            limit: Maximum number of traces to return.

        Returns:
            Matching traces ordered by FTS5 rank; ``[]`` when the query
            contains no searchable term.
        """
        # Strip surrogate characters and control chars that crash FTS5
        query = query.encode("utf-8", "surrogatepass").decode("utf-8", "replace")
        query = "".join(c for c in query if c.isprintable() or c in (" ", "\n", "\t"))

        # Terms without any letter or digit carry nothing to search for.
        terms = [t for t in query.split() if any(ch.isalnum() for ch in t)]
        if not terms:
            return []
        # Inside an FTS5 string, a literal double quote is written as two.
        match_expr = " ".join('"' + t.replace('"', '""') + '"' for t in terms)

        rows = self._cache.execute(
            """
            SELECT t.* FROM traces t
            JOIN traces_fts fts ON t.rowid = fts.rowid
            WHERE traces_fts MATCH ?
            ORDER BY rank
            LIMIT ?
            """,
            (match_expr, limit),
        ).fetchall()
        return [self._row_to_trace(r) for r in rows]

    def list_recent(self, limit: int = 20) -> list[TraceRow]:
        """List most recent traces (newest ``created_at`` first)."""
        rows = self._cache.execute(
            "SELECT * FROM traces ORDER BY created_at DESC LIMIT ?",
            (limit,),
        ).fetchall()
        return [self._row_to_trace(r) for r in rows]

    def count(self) -> int:
        """Total trace count."""
        row = self._cache.execute(
            "SELECT COUNT(*) as cnt FROM traces"
        ).fetchone()
        return row["cnt"] if row else 0

    def mark_synced(self, trace_id: str) -> None:
        """Mark a trace as synced to OpenViking."""
        self._cache.execute(
            "UPDATE traces SET synced = 1 WHERE id = ?", (trace_id,)
        )

    def mark_synced_batch(self, trace_ids: list[str]) -> None:
        """Batch mark multiple traces as synced."""
        rows = [(tid,) for tid in trace_ids]
        self._cache.executemany(
            "UPDATE traces SET synced = 1 WHERE id = ?", rows,
        )

    def get_unsynced(self, limit: int = 50) -> list[TraceRow]:
        """Get traces that haven't been synced to OpenViking yet (oldest first)."""
        rows = self._cache.execute(
            "SELECT * FROM traces WHERE synced = 0 ORDER BY created_at ASC LIMIT ?",
            (limit,),
        ).fetchall()
        return [self._row_to_trace(r) for r in rows]

    def delete_old(self, before_timestamp: str) -> int:
        """Delete traces older than a timestamp. Returns count deleted."""
        cursor = self._cache.execute(
            "DELETE FROM traces WHERE created_at < ?", (before_timestamp,)
        )
        return cursor.rowcount

    def get_all_embeddings(self, limit: int = 1000) -> list[tuple[str, list[float]]]:
        """Get (id, embedding) pairs for bulk similarity search.

        Rows whose embedding is empty or cannot be decoded as JSON are skipped.
        """
        rows = self._cache.execute(
            "SELECT id, embedding FROM traces WHERE embedding IS NOT NULL LIMIT ?",
            (limit,),
        ).fetchall()
        result = []
        for r in rows:
            if not r["embedding"]:
                continue
            try:
                emb = json.loads(r["embedding"])
            except (json.JSONDecodeError, TypeError):
                continue
            if emb:
                result.append((r["id"], emb))
        return result

    @staticmethod
    def _trace_to_row(trace: TraceRow) -> tuple:
        """Flatten *trace* into the parameter tuple expected by the insert SQL.

        List and dict fields are stored as JSON text, and the row is always
        written with ``synced = 0``.
        """
        return (
            trace.id,
            trace.session_id,
            trace.turn_index,
            trace.user_content,
            trace.assistant_content,
            json.dumps(trace.embedding) if trace.embedding else None,
            trace.reward,
            json.dumps(trace.tags, ensure_ascii=False),
            json.dumps(trace.metadata, ensure_ascii=False, default=str),
            trace.created_at,
            0,
        )

    @staticmethod
    def _row_to_trace(row: Any) -> TraceRow:
        """Build a ``TraceRow`` from a result row, decoding the JSON columns."""
        raw_tags = row["tags"]
        raw_metadata = row["metadata"]
        return TraceRow(
            id=row["id"],
            session_id=row["session_id"],
            turn_index=row["turn_index"],
            user_content=row["user_content"],
            assistant_content=row["assistant_content"],
            embedding=json.loads(row["embedding"]) if row["embedding"] else None,
            reward=row["reward"],
            # The column default for tags is '' (not valid JSON), so an empty
            # string is treated like a missing value.
            tags=json.loads(raw_tags) if isinstance(raw_tags, str) and raw_tags else [],
            metadata=json.loads(raw_metadata) if isinstance(raw_metadata, str) else {},
            created_at=row["created_at"],
        )
|
cc_star/cache/vector.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Local cosine similarity search using numpy."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine of the angle between vectors *a* and *b*.

    Both inputs are converted to float64 arrays. If either vector has a norm
    below 1e-12, the result is 0.0 instead of a division by (almost) zero.
    """
    vec_a, vec_b = (np.array(v, dtype=np.float64) for v in (a, b))
    len_a = np.linalg.norm(vec_a)
    len_b = np.linalg.norm(vec_b)

    # Degenerate (zero-length) vectors have no direction to compare.
    if len_a < 1e-12 or len_b < 1e-12:
        return 0.0

    similarity = np.dot(vec_a, vec_b) / (len_a * len_b)
    return float(similarity)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def search_by_embedding(
    query_embedding: list[float],
    candidates: list[tuple[str, list[float]]],
    k: int = 8,
) -> list[tuple[str, float]]:
    """Rank candidates by cosine similarity to the query and keep the top *k*.

    Args:
        query_embedding: Query vector.
        candidates: List of (id, embedding_vector) tuples; entries with an
            empty or missing embedding are ignored.
        k: Number of results to return.

    Returns:
        List of (id, score) tuples sorted by descending similarity.
    """
    if not candidates:
        return []

    scored: list[tuple[str, float]] = [
        (candidate_id, cosine_similarity(query_embedding, vector))
        for candidate_id, vector in candidates
        if vector and len(vector) > 0
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:k]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def compute_embedding(text: str, dim: int = 384) -> list[float]:
    """Return a deterministic pseudo-random unit vector derived from *text*.

    This is a lightweight fallback when the OpenViking embed API is unavailable.
    For production use, use OpenViking's native embedding instead.

    The vector is drawn from a random generator seeded by a SHA-256 digest of
    the text. Identical text therefore yields the identical vector in every
    process, so stored vectors stay comparable with vectors computed in a
    later run. The built-in ``hash()`` is randomised per process for strings
    and does not give that guarantee. The vector carries no semantic
    information: different texts get unrelated directions.

    Args:
        text: Input text.
        dim: Number of components in the returned vector.

    Returns:
        A list of *dim* floats with (approximately) unit Euclidean norm.
    """
    import hashlib  # local import so the module's import block stays untouched

    # surrogatepass: text containing lone surrogates must still be hashable.
    digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
    seed = int.from_bytes(digest[:4], "big")  # RandomState accepts 32-bit seeds

    rng = np.random.RandomState(seed)
    vec = rng.randn(dim)
    # The epsilon guards the division when the norm is zero (e.g. dim == 0).
    vec = vec / (np.linalg.norm(vec) + 1e-12)
    return vec.tolist()
|