ocp-server 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocp_server-0.1.0/.gitignore +19 -0
- ocp_server-0.1.0/PKG-INFO +22 -0
- ocp_server-0.1.0/ocp_server/__init__.py +2 -0
- ocp_server-0.1.0/ocp_server/auth.py +145 -0
- ocp_server-0.1.0/ocp_server/embedder.py +169 -0
- ocp_server-0.1.0/ocp_server/indexer.py +158 -0
- ocp_server-0.1.0/ocp_server/models.py +110 -0
- ocp_server-0.1.0/ocp_server/server.py +657 -0
- ocp_server-0.1.0/ocp_server/server_http.py +269 -0
- ocp_server-0.1.0/ocp_server/storage/__init__.py +3 -0
- ocp_server-0.1.0/ocp_server/storage/base.py +138 -0
- ocp_server-0.1.0/ocp_server/storage/postgres.py +735 -0
- ocp_server-0.1.0/ocp_server/storage/sqlite.py +769 -0
- ocp_server-0.1.0/ocp_server/tools/__init__.py +0 -0
- ocp_server-0.1.0/ocp_server/tools/coordination.py +116 -0
- ocp_server-0.1.0/ocp_server/tools/events.py +83 -0
- ocp_server-0.1.0/ocp_server/tools/retrieval.py +108 -0
- ocp_server-0.1.0/ocp_server/tools/state.py +124 -0
- ocp_server-0.1.0/ocp_server/tools/workspace.py +29 -0
- ocp_server-0.1.0/pyproject.toml +39 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocp-server
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Open Context Protocol — reference server implementation
|
|
5
|
+
Project-URL: Homepage, https://github.com/Rajesh1213/OCP
|
|
6
|
+
Project-URL: Repository, https://github.com/Rajesh1213/OCP
|
|
7
|
+
Project-URL: Documentation, https://github.com/Rajesh1213/OCP/blob/main/docs/integrations.md
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/Rajesh1213/OCP/issues
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
Requires-Python: >=3.11
|
|
11
|
+
Requires-Dist: aiofiles>=23.2
|
|
12
|
+
Requires-Dist: aiosqlite>=0.20
|
|
13
|
+
Requires-Dist: asyncpg>=0.29; extra == 'postgres'
|
|
14
|
+
Requires-Dist: mcp>=1.0
|
|
15
|
+
Requires-Dist: numpy>=1.26
|
|
16
|
+
Requires-Dist: pydantic>=2.7
|
|
17
|
+
Requires-Dist: starlette>=0.37
|
|
18
|
+
Requires-Dist: tiktoken>=0.7
|
|
19
|
+
Requires-Dist: uvicorn[standard]>=0.29
|
|
20
|
+
Requires-Dist: watchfiles>=0.22
|
|
21
|
+
Provides-Extra: postgres
|
|
22
|
+
Requires-Dist: asyncpg>=0.29; extra == 'postgres'
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""OCP authentication and workspace authorisation layer.
|
|
2
|
+
|
|
3
|
+
Design
|
|
4
|
+
------
|
|
5
|
+
Auth is opt-in. If OCP_API_KEYS is unset the server runs in open/dev mode and
|
|
6
|
+
every request is treated as fully-authorised. Set OCP_API_KEYS to enable.
|
|
7
|
+
|
|
8
|
+
Environment variables
|
|
9
|
+
---------------------
|
|
10
|
+
OCP_API_KEYS
|
|
11
|
+
Comma-separated list of valid API keys that have access to ALL workspaces.
|
|
12
|
+
Example: OCP_API_KEYS=key-prod-abc123,key-ci-def456
|
|
13
|
+
|
|
14
|
+
OCP_API_KEY_WORKSPACES
|
|
15
|
+
Semicolon-separated list of <key>:<ws_id1>,<ws_id2> pairs that restrict a
|
|
16
|
+
key to specific workspaces. Workspaces listed here override any entry in
|
|
17
|
+
OCP_API_KEYS for that key (OCP_API_KEY_WORKSPACES takes precedence).
|
|
18
|
+
Example: OCP_API_KEY_WORKSPACES=key-tenantA:ws_abc,ws_def;key-tenantB:ws_xyz
|
|
19
|
+
|
|
20
|
+
Usage in stdio mode
|
|
21
|
+
-------------------
|
|
22
|
+
Set OCP_API_KEY=<key> in the environment before spawning ocp-server. The
|
|
23
|
+
server reads OCP_API_KEY from the process environment during initialization.
|
|
24
|
+
|
|
25
|
+
Usage in HTTP mode
|
|
26
|
+
------------------
|
|
27
|
+
Pass the key as a Bearer token:
|
|
28
|
+
Authorization: Bearer <key>
|
|
29
|
+
|
|
30
|
+
The HTTP middleware extracts the token and stores it in the auth context var
|
|
31
|
+
for the duration of that request.
|
|
32
|
+
"""
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import contextvars
|
|
36
|
+
import os
|
|
37
|
+
from dataclasses import dataclass, field
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ------------------------------------------------------------------ #
|
|
41
|
+
# Context variable — one AuthContext per asyncio task (= per request) #
|
|
42
|
+
# ------------------------------------------------------------------ #
|
|
43
|
+
|
|
44
|
+
# Declared without a default value, so a bare ``_auth_ctx.get()`` raises
# LookupError until set_auth_context() has bound a context.
_auth_ctx: contextvars.ContextVar["AuthContext"] = contextvars.ContextVar("ocp_auth_ctx")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
|
|
48
|
+
class AuthContext:
|
|
49
|
+
"""Immutable auth state for one request / connection."""
|
|
50
|
+
|
|
51
|
+
api_key: str
|
|
52
|
+
# None means "all workspaces allowed"
|
|
53
|
+
allowed_workspaces: frozenset[str] | None = None
|
|
54
|
+
dev_mode: bool = False # True when auth is disabled globally
|
|
55
|
+
|
|
56
|
+
def can_access_workspace(self, workspace_id: str) -> bool:
|
|
57
|
+
if self.dev_mode:
|
|
58
|
+
return True
|
|
59
|
+
if self.allowed_workspaces is None:
|
|
60
|
+
return True
|
|
61
|
+
return workspace_id in self.allowed_workspaces
|
|
62
|
+
|
|
63
|
+
def assert_workspace(self, workspace_id: str) -> None:
|
|
64
|
+
"""Raise PermissionDeniedError if this context cannot access workspace_id."""
|
|
65
|
+
if not self.can_access_workspace(workspace_id):
|
|
66
|
+
raise PermissionDeniedError(
|
|
67
|
+
f"API key does not have access to workspace {workspace_id}"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# Sentinel used when the server is in dev mode (no auth configured)
|
|
72
|
+
# Shared instance: AuthConfig.validate_key returns it while auth is disabled,
# and get_auth_context falls back to it when no context has been bound.
_DEV_CONTEXT = AuthContext(api_key="__dev__", dev_mode=True)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_auth_context() -> AuthContext:
    """Fetch the AuthContext bound to the current execution context.

    When nothing has been bound (expected only in test code that bypasses
    the server wiring) the permissive dev context is returned instead.
    """
    try:
        return _auth_ctx.get()
    except LookupError:
        # The context variable has no default of its own, so an unbound
        # lookup raises; substitute the dev-mode sentinel.
        return _DEV_CONTEXT
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def set_auth_context(ctx: AuthContext) -> contextvars.Token:
    """Bind ``ctx`` as the current auth context.

    Returns the token that reset_auth_context() needs to undo the binding.
    """
    token = _auth_ctx.set(ctx)
    return token
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def reset_auth_context(token: contextvars.Token) -> None:
    """Restore the auth context that was active before ``token`` was issued."""
    _auth_ctx.reset(token)
    return None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ------------------------------------------------------------------ #
|
|
93
|
+
# Auth config #
|
|
94
|
+
# ------------------------------------------------------------------ #
|
|
95
|
+
|
|
96
|
+
@dataclass
class AuthConfig:
    """Auth settings parsed from the environment.

    ``enabled`` is False when no keys are configured; in that state every
    key validates to the shared dev-mode context.
    """

    # key → frozenset of workspace IDs, or None for unrestricted
    key_map: dict[str, frozenset[str] | None] = field(default_factory=dict)
    enabled: bool = False

    def validate_key(self, key: str) -> AuthContext | None:
        """Resolve ``key`` to an AuthContext, or None when it is not recognised."""
        if self.enabled:
            try:
                scope = self.key_map[key]
            except KeyError:
                return None
            return AuthContext(api_key=key, allowed_workspaces=scope)
        # Auth disabled: every caller gets the dev context.
        return _DEV_CONTEXT
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def load_auth_config() -> AuthConfig:
    """Build AuthConfig from OCP_API_KEYS and OCP_API_KEY_WORKSPACES.

    OCP_API_KEYS is a comma-separated list of unrestricted keys.
    OCP_API_KEY_WORKSPACES is a semicolon-separated list of
    ``<key>:<ws1>,<ws2>`` entries; it is parsed second, so a scoped entry
    replaces an unrestricted entry for the same key.

    Auth becomes enabled as soon as one key or one ``key:workspaces`` entry
    is present.  Entries without a ``:`` are ignored.  An entry whose key is
    empty still enables auth but is never registered, so an empty credential
    can never validate.
    """
    cfg = AuthConfig()

    for raw_key in os.environ.get("OCP_API_KEYS", "").split(","):
        key = raw_key.strip()
        if key:
            cfg.key_map[key] = None  # unrestricted
            cfg.enabled = True

    for entry in os.environ.get("OCP_API_KEY_WORKSPACES", "").split(";"):
        key, sep, workspaces_str = entry.partition(":")
        if not sep:
            # Blank or malformed entry (no ':' separator).
            continue
        # The operator attempted to configure scoped auth, so stay closed
        # even if this particular entry turns out to be unusable.
        cfg.enabled = True
        key = key.strip()
        if not key:
            # Registering "" would let an empty token authenticate.
            continue
        cfg.key_map[key] = frozenset(
            w.strip() for w in workspaces_str.split(",") if w.strip()
        )

    return cfg
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ------------------------------------------------------------------ #
|
|
136
|
+
# Errors #
|
|
137
|
+
# ------------------------------------------------------------------ #
|
|
138
|
+
|
|
139
|
+
class PermissionDeniedError(Exception):
    """Raised by AuthContext.assert_workspace when a key may not use a workspace."""

    # Error identifier; presumably surfaced to clients — confirm against the transport layer.
    code = "PERMISSION_DENIED"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class UnauthorisedError(Exception):
    """Raised at the transport level when no valid key is presented."""

    # Error identifier; presumably surfaced to clients — confirm against the transport layer.
    code = "UNAUTHORISED"
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Embedding backends for OCP.
|
|
2
|
+
|
|
3
|
+
Default: HashEmbedder — needs only numpy (no extra dependencies), works on any platform/Python.
|
|
4
|
+
Optional backends (set OCP_EMBEDDER env var):
|
|
5
|
+
- "hash" HashEmbedder (default)
|
|
6
|
+
- "fastembed" FastEmbedEmbedder (requires: pip install fastembed)
|
|
7
|
+
- "openai" OpenAIEmbedder (requires: pip install openai, OPENAI_API_KEY set)
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import hashlib
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
from typing import Any, Protocol, runtime_checkable
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@runtime_checkable
class EmbedderProtocol(Protocol):
    """Structural interface implemented by the embedding backends in this module.

    Being ``runtime_checkable`` it supports ``isinstance`` checks; those only
    verify that the members exist, not their signatures.
    """

    # Turn ``text`` into an embedding vector.
    async def embed(self, text: str) -> list[float]: ...
    # Length of the vectors produced by ``embed``.
    @property
    def dim(self) -> int: ...
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ------------------------------------------------------------------ #
|
|
26
|
+
# Hash n-gram embedder (built-in, no extra deps) #
|
|
27
|
+
# ------------------------------------------------------------------ #
|
|
28
|
+
|
|
29
|
+
class HashEmbedder:
    """
    Feature-hashing embedder built from character n-grams and words.

    The text is lower-cased and whitespace-collapsed, then every overlapping
    3-gram adds 1.0 and every whitespace-separated word adds 2.0 to one of
    `dim` MD5-selected buckets.  The resulting count vector is L2-normalised.
    Deterministic and needs only numpy.
    """

    def __init__(self, dim: int = 512) -> None:
        self._dim = dim

    @property
    def dim(self) -> int:
        """Number of buckets, i.e. the length of every returned vector."""
        return self._dim

    async def embed(self, text: str) -> list[float]:
        """Async wrapper; the computation itself runs inline."""
        return self._embed_sync(text)

    def _embed_sync(self, text: str) -> list[float]:
        """Return the normalised bucket-count vector for ``text``."""
        import numpy as np

        size = self._dim
        normalised = re.sub(r"\s+", " ", text.lower()).strip()
        counts = np.zeros(size, dtype=np.float32)

        def bump(token: str, weight: float) -> None:
            # The MD5 digest read as a big-endian integer selects the bucket.
            digest = hashlib.md5(token.encode()).digest()
            counts[int.from_bytes(digest, "big") % size] += weight

        # Character 3-grams
        for start in range(len(normalised) - 2):
            bump(normalised[start : start + 3], 1.0)

        # Word unigrams (extra signal for exact-keyword matches)
        for word in normalised.split():
            bump(word, 2.0)

        length = np.linalg.norm(counts)
        if length > 0:
            counts /= length
        return counts.tolist()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ------------------------------------------------------------------ #
|
|
72
|
+
# fastembed backend (optional) #
|
|
73
|
+
# ------------------------------------------------------------------ #
|
|
74
|
+
|
|
75
|
+
class FastEmbedEmbedder:
    """Uses fastembed (BAAI/bge-small-en-v1.5 by default).

    The model is loaded lazily on the first embedding call, and inference
    runs in the event loop's default executor so it does not block the loop.
    """

    def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5") -> None:
        self._model_name = model_name
        self._model: Any = None
        # Learned from the first embedding produced; None until then.
        self._dim_val: int | None = None

    @property
    def dim(self) -> int:
        """Vector length; reports 384 until a first embedding has been computed."""
        return self._dim_val or 384

    def _load(self) -> None:
        """Import fastembed and build the model on first use."""
        if self._model is None:
            from fastembed import TextEmbedding

            self._model = TextEmbedding(model_name=self._model_name)

    async def embed(self, text: str) -> list[float]:
        """Embed ``text`` on a worker thread and return the vector."""
        # get_running_loop() is the recommended call inside a coroutine;
        # get_event_loop() has legacy fallback behaviour that is not wanted here.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._embed_sync, text)

    def _embed_sync(self, text: str) -> list[float]:
        """Blocking embedding call; also records the observed vector length."""
        self._load()
        result = list(self._model.embed([text]))
        vec = result[0].tolist()
        self._dim_val = len(vec)
        return vec
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ------------------------------------------------------------------ #
|
|
105
|
+
# OpenAI backend (optional) #
|
|
106
|
+
# ------------------------------------------------------------------ #
|
|
107
|
+
|
|
108
|
+
class OpenAIEmbedder:
    """Uses the OpenAI embeddings API (text-embedding-3-small by default).

    The API client is created on first use and then reused, instead of
    building a fresh client (and connection pool) for every call.
    """

    # Default output sizes of the known models; anything else reports 1536
    # until a real response reveals the actual size.
    _KNOWN_DIMS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }

    def __init__(self, model: str = "text-embedding-3-small") -> None:
        self._model = model
        self._client: Any = None
        # Learned from the first response; None until then.
        self._dim_val: int | None = None

    @property
    def dim(self) -> int:
        """Vector length: observed size if known, else the model's default size."""
        if self._dim_val is not None:
            return self._dim_val
        return self._KNOWN_DIMS.get(self._model, 1536)

    async def embed(self, text: str) -> list[float]:
        """Request an embedding for ``text`` and return it."""
        if self._client is None:
            # Imported lazily so the optional dependency is only needed
            # when this backend is actually used.
            import openai

            self._client = openai.AsyncOpenAI()
        resp = await self._client.embeddings.create(input=text, model=self._model)
        vec = resp.data[0].embedding
        self._dim_val = len(vec)
        return vec
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ------------------------------------------------------------------ #
|
|
126
|
+
# Factory #
|
|
127
|
+
# ------------------------------------------------------------------ #
|
|
128
|
+
|
|
129
|
+
def make_embedder() -> EmbedderProtocol:
    """Instantiate the embedding backend selected by OCP_EMBEDDER.

    "fastembed" and "openai" take their model name from OCP_EMBED_MODEL.
    Any other value, including the default "hash", yields a HashEmbedder
    sized by OCP_EMBED_DIM (default 512).
    """
    env = os.environ
    choice = env.get("OCP_EMBEDDER", "hash").lower()

    if choice == "openai":
        return OpenAIEmbedder(env.get("OCP_EMBED_MODEL", "text-embedding-3-small"))
    if choice == "fastembed":
        return FastEmbedEmbedder(env.get("OCP_EMBED_MODEL", "BAAI/bge-small-en-v1.5"))

    dim_setting = env.get("OCP_EMBED_DIM", "512")
    return HashEmbedder(dim=int(dim_setting))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Convenience alias used by server.py
|
|
141
|
+
# Always the hash backend; make_embedder() is what honours OCP_EMBEDDER.
Embedder = HashEmbedder
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ------------------------------------------------------------------ #
|
|
145
|
+
# Tokenizer #
|
|
146
|
+
# ------------------------------------------------------------------ #
|
|
147
|
+
|
|
148
|
+
class Tokenizer:
    """Token counter.

    Uses tiktoken's ``cl100k_base`` encoding when it can be loaded, and
    otherwise falls back to a rough estimate of four characters per token.
    """

    def __init__(self) -> None:
        self._enc: Any = None
        # Flipped to False after a failed load so the import is not retried.
        self._use_tiktoken = True

    def _load(self) -> None:
        """Try once to obtain the tiktoken encoder; disable tiktoken on failure."""
        if self._enc is not None or not self._use_tiktoken:
            return
        try:
            import tiktoken

            self._enc = tiktoken.get_encoding("cl100k_base")
        except Exception as exc:  # noqa: BLE001 - deliberate best-effort fallback
            # ImportError when tiktoken is missing.  get_encoding() can also
            # fail for other reasons (presumably when its encoding data
            # cannot be obtained); counting must still work then.
            import logging

            logging.getLogger(__name__).warning(
                "tiktoken unavailable (%s); using length-based token estimate", exc
            )
            self._use_tiktoken = False

    def count(self, text: str) -> int:
        """Return the token count of ``text``.

        The fallback estimate never returns less than 1, even for an empty string.
        """
        self._load()
        if self._enc is not None:
            return len(self._enc.encode(text))
        # Fallback: ~4 chars per token heuristic
        return max(1, len(text) // 4)
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""File-system workspace indexer.
|
|
2
|
+
|
|
3
|
+
Changes:
|
|
4
|
+
§6.1 trigger 3 — detect hash-mismatch on reindex: stale old chunks and return
|
|
5
|
+
their IDs so the caller can emit chunk.invalidated.
|
|
6
|
+
§7.2 — accepts optional progress_cb for index.progress events.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import time
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Awaitable, Callable
|
|
14
|
+
|
|
15
|
+
from ocp_server.embedder import EmbedderProtocol as Embedder
|
|
16
|
+
from ocp_server.models import Chunk, ChunkSource, SourceRange, make_chunk_id
|
|
17
|
+
from ocp_server.storage.base import BaseStore
|
|
18
|
+
|
|
19
|
+
_TEXT_EXTENSIONS = {
|
|
20
|
+
".py", ".ts", ".tsx", ".js", ".jsx", ".go", ".rs", ".java", ".kt",
|
|
21
|
+
".rb", ".cpp", ".c", ".h", ".cs", ".swift", ".md", ".txt", ".yaml",
|
|
22
|
+
".yml", ".json", ".toml", ".html", ".css", ".sh", ".sql",
|
|
23
|
+
}
|
|
24
|
+
# Exclusive upper bound on a chunk's UTF-8 size: _chunk_file keeps adding
# whole lines only while the chunk stays strictly below this many bytes.
_MAX_CHUNK_BYTES = 4096
|
|
25
|
+
|
|
26
|
+
# Type alias: receives (progress 0..1) and returns None
|
|
27
|
+
# index_workspace awaits this once per candidate file, passing the fraction
# of files processed so far.
ProgressCallback = Callable[[float], Awaitable[None]]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def index_workspace(
    store: BaseStore,
    embedder: Embedder,
    workspace_id: str,
    root_uri: str,
    paths: list[str] | None,
    progress_cb: ProgressCallback | None = None,
) -> tuple[dict, list[str]]:
    """Index files in the workspace.

    Args:
        store: Chunk store; receives upserts and stale markings.
        embedder: Produces the embedding stored with each chunk.
        workspace_id: Workspace the chunks belong to.
        root_uri: Workspace root, optionally prefixed with ``file://``.
        paths: Files or directories relative to the root to (re)index.
            ``None`` or empty indexes the whole root.  Entries that resolve
            outside the root (e.g. ``../other``) are ignored.
        progress_cb: Optional coroutine awaited after each candidate file
            with the fraction processed so far (§7.2 index.progress).

    Returns:
        (result_dict, stale_chunk_ids).  ``result_dict`` holds ``indexed``
        (chunks upserted), ``skipped`` (files not indexed) and
        ``duration_ms``.  ``stale_chunk_ids`` contains IDs of
        previously-active chunks that this reindex no longer produced
        (§6.1 trigger 3).  A missing root yields all-zero counts.
    """
    root_path = Path(root_uri.removeprefix("file://"))

    if not root_path.exists():
        return {"indexed": 0, "skipped": 0, "duration_ms": 0}, []

    t0 = time.monotonic()

    if paths:
        # ``paths`` comes from the caller: refuse anything that escapes the
        # workspace root after resolving ``..`` segments and symlinks.
        resolved_root = root_path.resolve()
        targets = []
        for rel in paths:
            target = root_path / rel.lstrip("/")
            if target.resolve().is_relative_to(resolved_root):
                targets.append(target)
    else:
        targets = [root_path]

    # Collect all candidate files first so we can report progress
    all_files: list[Path] = []
    for target in targets:
        if target.is_file():
            all_files.append(target)
        else:
            all_files.extend(p for p in target.rglob("*") if p.is_file())

    total = len(all_files)
    indexed = 0
    skipped = 0
    stale_ids: list[str] = []

    for i, file_path in enumerate(all_files):
        text = _read_indexable_text(file_path)
        if text is None:
            skipped += 1
        else:
            uri = f"file://{file_path.resolve()}"

            # §6.1 trigger 3: snapshot active chunk IDs before reindexing this file
            prior_set = set(await store.get_active_chunk_ids_for_uri(workspace_id, uri))

            new_ids: set[str] = set()
            for chunk in _chunk_file(workspace_id, file_path, root_path, text):
                embedding = await embedder.embed(chunk.content)
                await store.upsert_chunk(chunk, embedding)
                new_ids.add(chunk.id)
                indexed += 1

            # Chunk IDs embed the content hash, so a prior ID missing from
            # the new set means that chunk's content or range changed.
            outdated = prior_set - new_ids
            if outdated:
                await store.mark_chunks_stale(list(outdated))
                stale_ids.extend(outdated)

        # Emit progress (§7.2 index.progress) for skipped files as well
        if progress_cb:
            await progress_cb((i + 1) / total)

    duration_ms = int((time.monotonic() - t0) * 1000)
    return {"indexed": indexed, "skipped": skipped, "duration_ms": duration_ms}, stale_ids


def _read_indexable_text(file_path: Path) -> str | None:
    """Return the file's text, or None if its suffix is not indexable or it cannot be read."""
    if file_path.suffix.lower() not in _TEXT_EXTENSIONS:
        return None
    try:
        return file_path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _chunk_file(workspace_id: str, file_path: Path, root: Path, text: str) -> list[Chunk]:
    """Split ``text`` into line-aligned chunks smaller than _MAX_CHUNK_BYTES.

    Whole lines are packed greedily while the chunk's UTF-8 size stays
    strictly below the limit.  Whitespace-only chunks are not emitted.  A
    single line that reaches the limit on its own cannot be placed in any
    chunk and is skipped.

    Args:
        workspace_id: Workspace the chunks belong to (part of the chunk ID).
        file_path: File the text came from; its resolved path forms the URI.
        root: Workspace root (currently unused).
        text: Decoded file content.

    Returns:
        Chunks in file order.  Line numbers are 0-based with an inclusive
        ``end_line``; byte offsets are measured on the UTF-8 encoding of
        ``text`` with an exclusive ``end_byte``.
    """
    # R1: absolute resolved URI — matches invalidation LIKE-patterns (includes symlink resolution)
    uri = f"file://{file_path.resolve()}"

    lines = text.splitlines(keepends=True)
    # Encode every line once so size and offset bookkeeping stays linear.
    line_sizes = [len(ln.encode()) for ln in lines]
    total_lines = len(lines)

    lang_map = {
        "py": "python", "ts": "typescript", "tsx": "typescript",
        "js": "javascript", "jsx": "javascript", "go": "go",
        "rs": "rust", "java": "java", "rb": "ruby", "md": "markdown",
    }
    # Lower-cased to agree with the indexer's case-insensitive suffix filter.
    language = lang_map.get(file_path.suffix.lstrip(".").lower())

    chunks: list[Chunk] = []
    start_line = 0
    start_byte = 0  # byte offset of lines[start_line] within the encoded text

    while start_line < total_lines:
        end_line = start_line
        size = 0
        while end_line < total_lines and size + line_sizes[end_line] < _MAX_CHUNK_BYTES:
            size += line_sizes[end_line]
            end_line += 1

        if end_line == start_line:
            # The line alone reaches the limit: skip it so the loop advances.
            start_byte += line_sizes[start_line]
            start_line += 1
            continue

        buf = "".join(lines[start_line:end_line])
        if buf.strip():
            content_hash = hashlib.sha256(buf.encode()).hexdigest()
            # The ID uses an exclusive end line; SourceRange below stores it inclusive.
            range_repr = f"{start_line}:{end_line}"
            chunk_id = make_chunk_id(workspace_id, uri, range_repr, content_hash)

            chunks.append(Chunk(
                id=chunk_id,
                workspace_id=workspace_id,
                source=ChunkSource(
                    uri=uri,
                    range=SourceRange(
                        start_line=start_line,
                        end_line=end_line - 1,
                        start_byte=start_byte,
                        end_byte=start_byte + size,
                    ),
                    content_hash=content_hash,
                ),
                kind="section",
                language=language,
                content=buf,
            ))

        # Resume at the first line that was not consumed.  Whitespace-only
        # buffers are dropped, but the line after them must still be chunked.
        start_byte += size
        start_line = end_line

    return chunks
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""OCP core data model — §3."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
import hashlib
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field, model_validator
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Scope(str, Enum):
    """Scope of a state entry: ``agent``, ``session`` or ``global``."""

    agent = "agent"
    session = "session"
    global_ = "global"  # trailing underscore because ``global`` is a keyword

    @classmethod
    def _missing_(cls, value: object) -> "Scope | None":
        """Lookup hook for unmatched values: map "global" to ``global_``, else None."""
        return cls.global_ if value == "global" else None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ConformanceLevel(str, Enum):
    """Named OCP conformance levels; members are also plain strings."""

    core = "core"
    core_coordination = "core+coordination"
    full = "full"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Location of a chunk inside its source; every field is optional.
# NOTE(review): the file indexer appears to fill these with 0-based lines
# (inclusive end_line) and an exclusive end_byte — confirm for other producers.
class SourceRange(BaseModel):
    start_byte: int | None = None
    end_byte: int | None = None
    start_line: int | None = None
    end_line: int | None = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Provenance of a chunk: where it came from and a hash of its content.
class ChunkSource(BaseModel):
    uri: str
    # Optional position within the resource identified by ``uri``.
    range: SourceRange | None = None
    content_hash: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# One indexed piece of content together with its provenance.
class Chunk(BaseModel):
    id: str
    workspace_id: str
    source: ChunkSource
    kind: str
    language: str | None = None
    symbol: str | None = None
    content: str
    # default_factory gives every instance its own dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
    version: int = 1
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Upper bound, in encoded bytes, on StateEntry.key (enforced by its validator).
_MAX_KEY_BYTES = 256
|
|
56
|
+
# Upper bound on the JSON-serialised size of StateEntry.value.
_MAX_VALUE_BYTES = 1 * 1024 * 1024  # 1 MiB
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# A scoped key/value record; the validator below enforces which IDs each
# scope requires and the §3.2 size limits.
class StateEntry(BaseModel):
    key: str
    value: Any
    scope: Scope
    workspace_id: str | None = None
    session_id: str | None = None
    agent_id: str | None = None
    ttl_seconds: int | None = None
    updated_at: str | None = None
    version: int = 1

    @model_validator(mode="after")
    def _check_scope_ids(self) -> "StateEntry":
        """Validate scope-dependent IDs, then the key and value size limits."""
        import json as _json

        scope = self.scope
        if scope == Scope.agent and not self.agent_id:
            raise ValueError("agent_id required for agent scope")
        if scope == Scope.session and not self.session_id:
            raise ValueError("session_id required for session scope")
        needs_workspace = scope in (Scope.session, Scope.global_)
        if needs_workspace and not self.workspace_id:
            raise ValueError("workspace_id required for session/global scope")

        # §3.2 — key: max 256 bytes; value: max 1 MiB
        key_size = len(self.key.encode())
        if key_size > _MAX_KEY_BYTES:
            raise ValueError(f"key exceeds {_MAX_KEY_BYTES} bytes")

        try:
            serialised = _json.dumps(self.value)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"value is not JSON-serialisable: {exc}") from exc

        value_size = len(serialised.encode())
        if value_size > _MAX_VALUE_BYTES:
            raise ValueError(f"value exceeds {_MAX_VALUE_BYTES} bytes (1 MiB)")
        return self
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# Common wrapper for emitted events: identification and routing fields plus a
# free-form payload.
class EventEnvelope(BaseModel):
    type: str
    event_id: str
    timestamp: str
    workspace_id: str
    subscription_id: str
    # default_factory gives every instance its own dict.
    payload: dict[str, Any] = Field(default_factory=dict)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# §3.4 — deterministic chunk ID
|
|
101
|
+
def make_chunk_id(workspace_id: str, uri: str, range_repr: str, content_hash: str) -> str:
    """Derive the deterministic 24-character chunk ID (§3.4).

    The four inputs are joined with NUL separators and hashed with SHA-256;
    the ID is the first 24 characters of the lower-cased base32 digest.
    """
    payload = f"{workspace_id}\x00{uri}\x00{range_repr}\x00{content_hash}"
    b32 = base64.b32encode(hashlib.sha256(payload.encode()).digest())
    # base32 output is pure ASCII, so slicing and lower-casing the bytes
    # before decoding gives the same text.
    return b32[:24].lower().decode()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# §3.3 — deterministic workspace ID
|
|
108
|
+
def make_workspace_id(root_uri: str) -> str:
    """Derive the deterministic workspace ID (§3.3).

    The ID is ``ws_`` followed by the first 16 hex characters of the SHA-256
    of ``root_uri``.
    """
    hex_digest = hashlib.sha256(root_uri.encode()).hexdigest()
    return "ws_" + hex_digest[:16]
|