ocp-server 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocp_server/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ """OCP reference server."""
2
+ __version__ = "0.1.0"
ocp_server/auth.py ADDED
@@ -0,0 +1,145 @@
1
+ """OCP authentication and workspace authorisation layer.
2
+
3
+ Design
4
+ ------
5
+ Auth is opt-in. If OCP_API_KEYS is unset the server runs in open/dev mode and
6
+ every request is treated as fully-authorised. Set OCP_API_KEYS to enable.
7
+
8
+ Environment variables
9
+ ---------------------
10
+ OCP_API_KEYS
11
+ Comma-separated list of valid API keys that have access to ALL workspaces.
12
+ Example: OCP_API_KEYS=key-prod-abc123,key-ci-def456
13
+
14
+ OCP_API_KEY_WORKSPACES
15
+ Semicolon-separated list of <key>:<ws_id1>,<ws_id2> pairs that restrict a
16
+ key to specific workspaces. Workspaces listed here override any entry in
17
+ OCP_API_KEYS for that key (OCP_API_KEY_WORKSPACES takes precedence).
18
+ Example: OCP_API_KEY_WORKSPACES=key-tenantA:ws_abc,ws_def;key-tenantB:ws_xyz
19
+
20
+ Usage in stdio mode
21
+ -------------------
22
+ Set OCP_API_KEY=<key> in the environment before spawning ocp-server. The
23
+ server reads OCP_API_KEY from the process environment during initialization.
24
+
25
+ Usage in HTTP mode
26
+ ------------------
27
+ Pass the key as a Bearer token:
28
+ Authorization: Bearer <key>
29
+
30
+ The HTTP middleware extracts the token and stores it in the auth context var
31
+ for the duration of that request.
32
+ """
33
+ from __future__ import annotations
34
+
35
+ import contextvars
36
+ import os
37
+ from dataclasses import dataclass, field
38
+
39
+
40
+ # ------------------------------------------------------------------ #
41
+ # Context variable — one AuthContext per asyncio task (= per request) #
42
+ # ------------------------------------------------------------------ #
43
+
44
+ _auth_ctx: contextvars.ContextVar["AuthContext"] = contextvars.ContextVar("ocp_auth_ctx")
45
+
46
+
47
+ @dataclass(frozen=True)
48
+ class AuthContext:
49
+ """Immutable auth state for one request / connection."""
50
+
51
+ api_key: str
52
+ # None means "all workspaces allowed"
53
+ allowed_workspaces: frozenset[str] | None = None
54
+ dev_mode: bool = False # True when auth is disabled globally
55
+
56
+ def can_access_workspace(self, workspace_id: str) -> bool:
57
+ if self.dev_mode:
58
+ return True
59
+ if self.allowed_workspaces is None:
60
+ return True
61
+ return workspace_id in self.allowed_workspaces
62
+
63
+ def assert_workspace(self, workspace_id: str) -> None:
64
+ """Raise PermissionDeniedError if this context cannot access workspace_id."""
65
+ if not self.can_access_workspace(workspace_id):
66
+ raise PermissionDeniedError(
67
+ f"API key does not have access to workspace {workspace_id}"
68
+ )
69
+
70
+
71
+ # Sentinel used when the server is in dev mode (no auth configured)
72
+ _DEV_CONTEXT = AuthContext(api_key="__dev__", dev_mode=True)
73
+
74
+
75
def get_auth_context() -> AuthContext:
    """Return the AuthContext bound to the current request.

    When no context has been set, the dev context is returned instead; this
    is only expected in test code that bypasses the server wiring.
    """
    try:
        return _auth_ctx.get()
    except LookupError:
        # The context variable has no value in this context.
        return _DEV_CONTEXT
82
+
83
+
84
def set_auth_context(ctx: AuthContext) -> contextvars.Token:
    """Store *ctx* in the auth context variable.

    Returns the token produced by ``ContextVar.set``, which can be passed to
    ``reset_auth_context`` to undo the assignment.
    """
    return _auth_ctx.set(ctx)
86
+
87
+
88
def reset_auth_context(token: contextvars.Token) -> None:
    """Restore the auth context variable to its state before the matching set.

    *token* is the value returned by ``set_auth_context``.
    """
    _auth_ctx.reset(token)
90
+
91
+
92
+ # ------------------------------------------------------------------ #
93
+ # Auth config #
94
+ # ------------------------------------------------------------------ #
95
+
96
@dataclass
class AuthConfig:
    """Auth settings parsed from the environment."""

    # key → frozenset of workspace IDs, or None for unrestricted
    key_map: dict[str, frozenset[str] | None] = field(default_factory=dict)
    enabled: bool = False

    def validate_key(self, key: str) -> AuthContext | None:
        """Look up *key* and build its AuthContext.

        Returns the shared dev context when auth is disabled, None when the
        key is unknown, and a fresh AuthContext carrying the key's workspace
        restriction otherwise.
        """
        if not self.enabled:
            return _DEV_CONTEXT
        try:
            workspaces = self.key_map[key]
        except KeyError:
            return None
        return AuthContext(api_key=key, allowed_workspaces=workspaces)
111
+
112
+
113
def load_auth_config() -> AuthConfig:
    """Build AuthConfig from environment variables.

    Reads ``OCP_API_KEYS`` (comma-separated unrestricted keys) and
    ``OCP_API_KEY_WORKSPACES`` (semicolon-separated ``<key>:<ws1>,<ws2>``
    entries). A scoped entry replaces an unrestricted entry for the same key.
    Auth is enabled as soon as at least one key has been registered.

    Malformed scoped entries (no ``:`` separator, or an empty key) are
    ignored. In particular an entry such as ``":ws_a"`` must not register the
    empty string as a valid key, otherwise an empty token would authenticate.
    """
    cfg = AuthConfig()

    # Unrestricted keys: every non-blank item grants access to all workspaces.
    for raw_key in os.environ.get("OCP_API_KEYS", "").split(","):
        key = raw_key.strip()
        if key:
            cfg.key_map[key] = None  # unrestricted

    # Scoped keys: "<key>:<ws_id1>,<ws_id2>"; these override unrestricted ones.
    for raw_entry in os.environ.get("OCP_API_KEY_WORKSPACES", "").split(";"):
        key, sep, workspaces_str = raw_entry.partition(":")
        key = key.strip()
        if not sep or not key:
            continue
        cfg.key_map[key] = frozenset(
            ws.strip() for ws in workspaces_str.split(",") if ws.strip()
        )

    # The config starts empty, so "any key registered" is the enable condition.
    cfg.enabled = bool(cfg.key_map)
    return cfg
133
+
134
+
135
+ # ------------------------------------------------------------------ #
136
+ # Errors #
137
+ # ------------------------------------------------------------------ #
138
+
139
class PermissionDeniedError(Exception):
    """Raised when an auth context is not allowed to access a workspace."""

    # Error code string exposed as a class attribute
    code = "PERMISSION_DENIED"
141
+
142
+
143
class UnauthorisedError(Exception):
    """Raised at the transport level when no valid key is presented."""

    # Error code string exposed as a class attribute
    code = "UNAUTHORISED"
ocp_server/embedder.py ADDED
@@ -0,0 +1,169 @@
1
+ """Embedding backends for OCP.
2
+
3
+ Default: HashEmbedder — needs only numpy (no other dependencies), works on any platform/Python.
4
+ Optional backends (set OCP_EMBEDDER env var):
5
+ - "hash" HashEmbedder (default)
6
+ - "fastembed" FastEmbedEmbedder (requires: pip install fastembed)
7
+ - "openai" OpenAIEmbedder (requires: pip install openai, OPENAI_API_KEY set)
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import hashlib
13
+ import os
14
+ import re
15
+ from typing import Any, Protocol, runtime_checkable
16
+
17
+
18
@runtime_checkable
class EmbedderProtocol(Protocol):
    """Structural interface for embedding backends.

    An implementation provides an async ``embed`` that maps text to a list of
    floats and a ``dim`` property that reports an integer dimension.
    """

    async def embed(self, text: str) -> list[float]: ...
    @property
    def dim(self) -> int: ...
23
+
24
+
25
+ # ------------------------------------------------------------------ #
26
+ # Hash n-gram embedder (built-in, no extra deps) #
27
+ # ------------------------------------------------------------------ #
28
+
29
class HashEmbedder:
    """
    Locality-sensitive hash embedding via character n-grams.

    Splits text into overlapping 3-grams, hashes each into one of `dim`
    buckets, and returns an L2-normalised count vector. Whole words are
    hashed as well, with a higher weight. Fast and deterministic; the only
    requirement is numpy.
    """

    def __init__(self, dim: int = 512) -> None:
        """Create an embedder producing vectors of length *dim*.

        Raises:
            ValueError: if *dim* is not positive (bucket selection uses
                ``% dim``, which would otherwise fail later at embed time).
        """
        if dim <= 0:
            raise ValueError(f"dim must be a positive integer, got {dim}")
        self._dim = dim

    @property
    def dim(self) -> int:
        """Length of the vectors returned by ``embed``."""
        return self._dim

    async def embed(self, text: str) -> list[float]:
        """Return the embedding of *text* (computed synchronously)."""
        return self._embed_sync(text)

    def _bucket(self, token: str) -> int:
        """Map *token* to a bucket index in ``[0, dim)`` via its MD5 digest."""
        # MD5 is used purely for bucketing, not for security; saying so keeps
        # it usable on FIPS-restricted builds. The big-endian integer of the
        # digest equals int(hexdigest, 16), so bucket assignments are stable.
        digest = hashlib.md5(token.encode(), usedforsecurity=False).digest()
        return int.from_bytes(digest, "big") % self._dim

    def _embed_sync(self, text: str) -> list[float]:
        """Build the normalised bucket-count vector for *text*."""
        import numpy as np

        # Case-fold and collapse whitespace runs so formatting does not matter.
        text = re.sub(r"\s+", " ", text.lower()).strip()
        vec = np.zeros(self._dim, dtype=np.float32)

        # Character 3-grams
        for i in range(len(text) - 2):
            vec[self._bucket(text[i : i + 3])] += 1.0

        # Word unigrams (extra signal for exact-keyword matches)
        for word in text.split():
            vec[self._bucket(word)] += 2.0

        # Leave an all-zero vector (empty text) untouched to avoid dividing by 0.
        norm = np.linalg.norm(vec)
        if norm > 0:
            vec /= norm
        return vec.tolist()
69
+
70
+
71
+ # ------------------------------------------------------------------ #
72
+ # fastembed backend (optional) #
73
+ # ------------------------------------------------------------------ #
74
+
75
class FastEmbedEmbedder:
    """Uses fastembed (BAAI/bge-small-en-v1.5 by default).

    The model is loaded lazily on the first embed call, so constructing the
    embedder does not import fastembed.
    """

    def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5") -> None:
        self._model_name = model_name
        self._model: Any = None
        self._dim_val: int | None = None

    @property
    def dim(self) -> int:
        """Vector length: 384 until a vector has been produced, then its length."""
        return self._dim_val or 384

    def _load(self) -> None:
        """Import fastembed and instantiate the model if not done yet."""
        if self._model is None:
            from fastembed import TextEmbedding
            self._model = TextEmbedding(model_name=self._model_name)

    async def embed(self, text: str) -> list[float]:
        """Embed *text* in the default executor so the event loop is not blocked."""
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._embed_sync, text)

    def _embed_sync(self, text: str) -> list[float]:
        """Run the model on *text* and record the resulting vector length."""
        self._load()
        result = list(self._model.embed([text]))
        vec = result[0].tolist()
        self._dim_val = len(vec)
        return vec
102
+
103
+
104
+ # ------------------------------------------------------------------ #
105
+ # OpenAI backend (optional) #
106
+ # ------------------------------------------------------------------ #
107
+
108
class OpenAIEmbedder:
    """Uses the OpenAI embeddings API (text-embedding-3-small by default)."""

    # Default output dimensionality of known OpenAI embedding models.
    _KNOWN_DIMS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }

    def __init__(self, model: str = "text-embedding-3-small") -> None:
        self._model = model
        # Client is created lazily and reused; see embed().
        self._client: Any = None
        self._client_loop: Any = None

    @property
    def dim(self) -> int:
        """Vector length for the configured model (1536 for unknown models)."""
        return self._KNOWN_DIMS.get(self._model, 1536)

    async def embed(self, text: str) -> list[float]:
        """Request an embedding for *text* from the OpenAI API."""
        # Reuse one client instead of building a new one (and a new HTTP
        # connection pool) on every call. The client is tied to the event
        # loop it was created under, so recreate it if the loop changes.
        loop = asyncio.get_running_loop()
        if self._client is None or self._client_loop is not loop:
            import openai
            self._client = openai.AsyncOpenAI()
            self._client_loop = loop
        resp = await self._client.embeddings.create(input=text, model=self._model)
        return resp.data[0].embedding
123
+
124
+
125
+ # ------------------------------------------------------------------ #
126
+ # Factory #
127
+ # ------------------------------------------------------------------ #
128
+
129
def make_embedder() -> EmbedderProtocol:
    """Instantiate the embedding backend selected by ``OCP_EMBEDDER``.

    ``fastembed`` and ``openai`` pick the optional backends, each taking its
    model name from ``OCP_EMBED_MODEL``. Any other value falls through to
    HashEmbedder, sized by ``OCP_EMBED_DIM`` (512 when unset).
    """
    env = os.environ
    choice = env.get("OCP_EMBEDDER", "hash").lower()

    if choice == "fastembed":
        return FastEmbedEmbedder(env.get("OCP_EMBED_MODEL", "BAAI/bge-small-en-v1.5"))
    if choice == "openai":
        return OpenAIEmbedder(env.get("OCP_EMBED_MODEL", "text-embedding-3-small"))

    hash_dim = int(env.get("OCP_EMBED_DIM", "512"))
    return HashEmbedder(dim=hash_dim)
138
+
139
+
140
+ # Convenience alias used by server.py
141
+ Embedder = HashEmbedder
142
+
143
+
144
+ # ------------------------------------------------------------------ #
145
+ # Tokenizer #
146
+ # ------------------------------------------------------------------ #
147
+
148
class Tokenizer:
    """Token counter.

    Uses tiktoken's ``cl100k_base`` encoding when it can be loaded, and
    otherwise falls back to a rough ~4-characters-per-token estimate.
    """

    def __init__(self) -> None:
        self._enc: Any = None
        # Flipped to False after a failed load so the attempt is not repeated.
        self._use_tiktoken = True

    def _load(self) -> None:
        """Load the tiktoken encoding once; disable tiktoken if that fails."""
        if self._enc is not None or not self._use_tiktoken:
            return
        try:
            import tiktoken
            self._enc = tiktoken.get_encoding("cl100k_base")
        except Exception:
            # Best-effort optional dependency: tiktoken may be missing, or its
            # encoding data may be unavailable (e.g. it cannot be fetched).
            # Either way, use the heuristic rather than failing every count().
            self._use_tiktoken = False

    def count(self, text: str) -> int:
        """Return the number of tokens in *text* (estimated in fallback mode)."""
        self._load()
        if self._enc is not None:
            # disallowed_special=() makes special-token strings such as
            # "<|endoftext|>" encode as plain text instead of raising.
            return len(self._enc.encode(text, disallowed_special=()))
        # Fallback: ~4 chars per token heuristic
        return max(1, len(text) // 4)
ocp_server/indexer.py ADDED
@@ -0,0 +1,158 @@
1
+ """File-system workspace indexer.
2
+
3
+ Changes:
4
+ §6.1 trigger 3 — detect hash-mismatch on reindex: stale old chunks and return
5
+ their IDs so the caller can emit chunk.invalidated.
6
+ §7.2 — accepts optional progress_cb for index.progress events.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import time
12
+ from pathlib import Path
13
+ from typing import Awaitable, Callable
14
+
15
+ from ocp_server.embedder import EmbedderProtocol as Embedder
16
+ from ocp_server.models import Chunk, ChunkSource, SourceRange, make_chunk_id
17
+ from ocp_server.storage.base import BaseStore
18
+
19
# File suffixes (lower-case, including the leading dot) that are indexed;
# files with any other suffix are counted as skipped.
_TEXT_EXTENSIONS = {
    ".py", ".ts", ".tsx", ".js", ".jsx", ".go", ".rs", ".java", ".kt",
    ".rb", ".cpp", ".c", ".h", ".cs", ".swift", ".md", ".txt", ".yaml",
    ".yml", ".json", ".toml", ".html", ".css", ".sh", ".sql",
}
# Size limit for one chunk, in UTF-8 encoded bytes.
_MAX_CHUNK_BYTES = 4096

# Type alias: receives (progress 0..1) and returns None
ProgressCallback = Callable[[float], Awaitable[None]]
28
+
29
+
30
async def index_workspace(
    store: BaseStore,
    embedder: Embedder,
    workspace_id: str,
    root_uri: str,
    paths: list[str] | None,
    progress_cb: ProgressCallback | None = None,
) -> tuple[dict, list[str]]:
    """Index files in the workspace.

    Args:
        store: chunk store; must provide ``get_active_chunk_ids_for_uri``,
            ``upsert_chunk`` and ``mark_chunks_stale``.
        embedder: backend used to embed each chunk's content.
        workspace_id: workspace the chunks belong to.
        root_uri: workspace root; a leading ``file://`` is stripped.
        paths: optional files/directories relative to the root. Entries that
            resolve outside the root are ignored. Falsy means "whole root".
        progress_cb: optional coroutine function called after each file with
            the completed fraction (§7.2 index.progress).

    Returns (result_dict, stale_chunk_ids).
    result_dict holds ``indexed`` (chunks stored), ``skipped`` (files not
    indexed) and ``duration_ms``. stale_chunk_ids contains IDs of
    previously-active chunks whose content_hash changed during this reindex
    (§6.1 trigger 3).
    """
    root_path = Path(root_uri.removeprefix("file://"))
    if not root_path.exists():
        return {"indexed": 0, "skipped": 0, "duration_ms": 0}, []

    t0 = time.monotonic()

    # Collect all candidate files first so we can report progress
    all_files = _collect_files(root_path, paths)
    total = len(all_files)
    indexed = 0
    skipped = 0
    stale_ids: list[str] = []

    for done, file_path in enumerate(all_files, start=1):
        # text stays None for non-text suffixes and for unreadable files.
        text: str | None = None
        if file_path.suffix.lower() in _TEXT_EXTENSIONS:
            try:
                text = file_path.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                text = None

        if text is None:
            skipped += 1
        else:
            uri = f"file://{file_path.resolve()}"

            # §6.1 trigger 3: snapshot active chunk IDs before reindexing this file
            prior_set = set(await store.get_active_chunk_ids_for_uri(workspace_id, uri))

            new_ids: set[str] = set()
            for chunk in _chunk_file(workspace_id, file_path, root_path, text):
                embedding = await embedder.embed(chunk.content)
                await store.upsert_chunk(chunk, embedding)
                new_ids.add(chunk.id)
                indexed += 1

            # Any prior active chunk not in the new set has a changed hash
            outdated = prior_set - new_ids
            if outdated:
                await store.mark_chunks_stale(list(outdated))
                stale_ids.extend(outdated)

        # Emit progress (§7.2 index.progress)
        if progress_cb:
            await progress_cb(done / total)

    duration_ms = int((time.monotonic() - t0) * 1000)
    return {"indexed": indexed, "skipped": skipped, "duration_ms": duration_ms}, stale_ids


def _collect_files(root_path: Path, paths: list[str] | None) -> list[Path]:
    """Return the regular files under the requested targets, confined to root_path."""
    if not paths:
        targets = [root_path]
    else:
        root_resolved = root_path.resolve()
        targets = []
        for rel in paths:
            candidate = root_path / rel.lstrip("/")
            # Reject targets such as "../x" (or symlinks) that escape the
            # workspace root, so a request cannot index arbitrary files.
            if candidate.resolve().is_relative_to(root_resolved):
                targets.append(candidate)

    files: list[Path] = []
    for target in targets:
        if target.is_file():
            files.append(target)
        else:
            files.extend(p for p in target.rglob("*") if p.is_file())
    return files
105
+
106
+
107
def _chunk_file(workspace_id: str, file_path: Path, root: Path, text: str) -> list[Chunk]:
    """Split *text* into line-aligned chunks below ``_MAX_CHUNK_BYTES``.

    Lines are accumulated greedily while the UTF-8 size of the buffer stays
    below the limit. Whitespace-only buffers produce no chunk. A single line
    that is itself at or above the limit cannot fit in any chunk and is
    skipped. Byte offsets are measured on the UTF-8 encoding of *text*.

    Args:
        workspace_id: workspace the chunks belong to (part of the chunk ID).
        file_path: file the text came from; determines URI and language.
        root: workspace root (currently unused here).
        text: decoded file content.

    Returns:
        Chunks in file order, with zero-based inclusive line ranges.
    """
    # R1: absolute resolved URI — matches invalidation LIKE-patterns (includes symlink resolution)
    uri = f"file://{file_path.resolve()}"

    lines = text.splitlines(keepends=True)
    chunks: list[Chunk] = []
    # Lower-cased so "X.PY" gets a language, matching the indexer's
    # case-insensitive suffix check.
    ext = file_path.suffix.lstrip(".").lower()
    lang_map = {
        "py": "python", "ts": "typescript", "tsx": "typescript",
        "js": "javascript", "jsx": "javascript", "go": "go",
        "rs": "rust", "java": "java", "rb": "ruby", "md": "markdown",
    }
    language = lang_map.get(ext)

    start_line = 0
    # Byte offset of lines[start_line]; kept as a running total instead of
    # re-encoding every preceding line for each chunk.
    offset = 0

    while start_line < len(lines):
        parts: list[str] = []
        size = 0
        end_line = start_line
        while end_line < len(lines):
            line_size = len(lines[end_line].encode())
            if size + line_size >= _MAX_CHUNK_BYTES:
                break
            parts.append(lines[end_line])
            size += line_size
            end_line += 1

        if end_line == start_line:
            # The line alone reaches the size limit: skip just that line.
            offset += len(lines[start_line].encode())
            start_line += 1
            continue

        buf = "".join(parts)
        # Whitespace-only buffers yield no chunk, but only the lines actually
        # consumed are passed over; the next line is examined normally.
        if buf.strip():
            content_hash = hashlib.sha256(buf.encode()).hexdigest()
            range_repr = f"{start_line}:{end_line}"
            chunk_id = make_chunk_id(workspace_id, uri, range_repr, content_hash)

            chunks.append(Chunk(
                id=chunk_id,
                workspace_id=workspace_id,
                source=ChunkSource(
                    uri=uri,
                    range=SourceRange(
                        start_line=start_line,
                        end_line=end_line - 1,
                        start_byte=offset,
                        end_byte=offset + size,
                    ),
                    content_hash=content_hash,
                ),
                kind="section",
                language=language,
                content=buf,
            ))

        offset += size
        start_line = end_line

    return chunks
ocp_server/models.py ADDED
@@ -0,0 +1,110 @@
1
+ """OCP core data model — §3."""
2
+ from __future__ import annotations
3
+
4
+ import base64
5
+ import hashlib
6
+ from enum import Enum
7
+ from typing import Any
8
+
9
+ from pydantic import BaseModel, Field, model_validator
10
+
11
+
12
class Scope(str, Enum):
    """String-valued enumeration of state scopes: agent, session and global."""

    agent = "agent"
    session = "session"
    # Trailing underscore because ``global`` is a reserved word in Python.
    global_ = "global"

    @classmethod
    def _missing_(cls, value: object) -> "Scope | None":
        # Enum hook invoked when a value lookup finds no member.
        # NOTE(review): "global" is already the value of global_, so the
        # normal lookup should resolve it before this hook runs — confirm
        # whether this branch is still needed.
        if value == "global":
            return cls.global_
        return None
22
+
23
+
24
class ConformanceLevel(str, Enum):
    """String-valued enumeration of conformance levels."""

    core = "core"
    core_coordination = "core+coordination"
    full = "full"
28
+
29
+
30
class SourceRange(BaseModel):
    """Optional byte and line bounds of a chunk within its source.

    Every field defaults to None, so a range may carry bytes, lines, both or
    neither.
    """

    start_byte: int | None = None
    end_byte: int | None = None
    start_line: int | None = None
    end_line: int | None = None
35
+
36
+
37
class ChunkSource(BaseModel):
    """Where a chunk came from: source URI, optional range, and content hash."""

    uri: str
    range: SourceRange | None = None
    content_hash: str
41
+
42
+
43
class Chunk(BaseModel):
    """A unit of indexed content together with its provenance."""

    id: str
    workspace_id: str
    source: ChunkSource
    kind: str
    language: str | None = None
    symbol: str | None = None
    content: str
    # Free-form extra data; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
    version: int = 1
53
+
54
+
55
+ _MAX_KEY_BYTES = 256
56
+ _MAX_VALUE_BYTES = 1 * 1024 * 1024 # 1 MiB
57
+
58
+
59
class StateEntry(BaseModel):
    """A keyed state value bound to an agent, session or global scope.

    Validation enforces the identifiers each scope needs and the §3.2 size
    limits on the key and on the JSON-encoded value.
    """

    key: str
    value: Any
    scope: Scope
    workspace_id: str | None = None
    session_id: str | None = None
    agent_id: str | None = None
    ttl_seconds: int | None = None
    updated_at: str | None = None
    version: int = 1

    @model_validator(mode="after")
    def _check_scope_ids(self) -> "StateEntry":
        """Check scope identifiers, then key and value sizes; return self."""
        scope = self.scope

        # Identifier requirements per scope
        if scope == Scope.agent and not self.agent_id:
            raise ValueError("agent_id required for agent scope")
        if scope == Scope.session and not self.session_id:
            raise ValueError("session_id required for session scope")
        needs_workspace = scope in (Scope.session, Scope.global_)
        if needs_workspace and not self.workspace_id:
            raise ValueError("workspace_id required for session/global scope")

        # §3.2 — key: max 256 bytes; value: max 1 MiB
        key_size = len(self.key.encode())
        if key_size > _MAX_KEY_BYTES:
            raise ValueError(f"key exceeds {_MAX_KEY_BYTES} bytes")

        import json as _json
        try:
            serialised = _json.dumps(self.value)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"value is not JSON-serialisable: {exc}") from exc
        value_size = len(serialised.encode())
        if value_size > _MAX_VALUE_BYTES:
            raise ValueError(f"value exceeds {_MAX_VALUE_BYTES} bytes (1 MiB)")
        return self
89
+
90
+
91
class EventEnvelope(BaseModel):
    """Envelope for an event: type, identifiers, timestamp and a payload dict."""

    type: str
    event_id: str
    timestamp: str
    workspace_id: str
    subscription_id: str
    # Event-specific data; each instance gets its own empty dict by default.
    payload: dict[str, Any] = Field(default_factory=dict)
98
+
99
+
100
+ # §3.4 — deterministic chunk ID
101
def make_chunk_id(workspace_id: str, uri: str, range_repr: str, content_hash: str) -> str:
    """Derive a deterministic 24-character chunk ID from the four inputs.

    The inputs are joined with NUL separators, hashed with SHA-256, and the
    digest is base32-encoded; the first 24 characters are returned lower-cased.
    """
    hasher = hashlib.sha256()
    hasher.update(f"{workspace_id}\x00{uri}\x00{range_repr}\x00{content_hash}".encode())
    encoded = base64.b32encode(hasher.digest()).decode()
    return encoded[:24].lower()
105
+
106
+
107
+ # §3.3 — deterministic workspace ID
108
def make_workspace_id(root_uri: str) -> str:
    """Derive a deterministic workspace ID: ``ws_`` plus 16 hex chars of SHA-256."""
    hex_digest = hashlib.sha256(root_uri.encode()).hexdigest()
    return "ws_" + hex_digest[:16]
+ return f"ws_{digest[:16]}"