buildlog 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
buildlog/embeddings.py ADDED
@@ -0,0 +1,392 @@
1
+ """Embedding backends for semantic similarity.
2
+
3
+ This module provides a pluggable interface for computing text embeddings.
4
+ The default is token-based (no dependencies). Optional backends include:
5
+
6
+ - sentence-transformers (local, offline): pip install buildlog[embeddings]
7
+ - OpenAI API: requires OPENAI_API_KEY
8
+ - Anthropic API: requires ANTHROPIC_API_KEY (future)
9
+
10
+ Usage:
11
+ from buildlog.embeddings import get_backend, similarity
12
+
13
+ # Default (token-based)
14
+ sim = similarity("run type checker", "tsc before commit")
15
+
16
+ # With embeddings
17
+ backend = get_backend("sentence-transformers")
18
+ sim = similarity("run type checker", "tsc before commit", backend=backend)
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ __all__ = [
24
+ "EmbeddingBackend",
25
+ "TokenBackend",
26
+ "SentenceTransformerBackend",
27
+ "OpenAIBackend",
28
+ "get_backend",
29
+ "similarity",
30
+ "compute_embeddings",
31
+ ]
32
+
33
+ import logging
34
+ import os
35
+ import re
36
+ import threading
37
+ from abc import ABC, abstractmethod
38
+ from typing import Final, Literal
39
+
40
+ import numpy as np
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
# Type alias for embedding vectors
# A plain list of floats so backends without numpy types can interoperate.
Embedding = list[float]
# Names accepted by get_backend(); keys of the _BACKENDS registry below.
BackendName = Literal["token", "sentence-transformers", "openai"]
47
+
48
# Stop words to filter in token-based approach
# Dropped by TokenBackend._tokenize before similarity is computed, so that
# high-frequency function words do not dominate the Jaccard overlap.
STOP_WORDS: Final[frozenset[str]] = frozenset({
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "shall", "can", "need", "dare",
    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
    "from", "as", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "too", "very", "just", "also", "now",
    "always", "never", "often", "still", "already", "ever",
    "it", "its", "this", "that", "these", "those", "i", "you", "he",
    "she", "we", "they", "what", "which", "who", "whom", "whose",
})
63
+
64
# Common synonyms for normalization
# Maps token -> canonical token; applied via SYNONYMS.get(token, token) in
# TokenBackend._tokenize. Identity entries (e.g. "commit": "commit") are
# functional no-ops because the .get() fallback already returns the token
# unchanged — presumably they document each group's canonical form.
SYNONYMS: Final[dict[str, str]] = {
    # Type checking
    "tsc": "typescript",
    "mypy": "typecheck",
    "pyright": "typecheck",
    "typechecker": "typecheck",
    "type-checker": "typecheck",
    "type_checker": "typecheck",
    # Version control
    "commit": "commit",
    "committing": "commit",
    "commits": "commit",
    "committed": "commit",
    "git": "git",
    "github": "git",
    # Testing
    "test": "test",
    "tests": "test",
    "testing": "test",
    "tested": "test",
    "unittest": "test",
    "pytest": "test",
    # Running
    "run": "run",
    "running": "run",
    "runs": "run",
    "execute": "run",
    "executing": "run",
    # Checking
    "check": "check",
    "checker": "check",
    "checking": "check",
    "checks": "check",
    "verify": "check",
    "verifying": "check",
    "validate": "check",
    "validating": "check",
    # Type checking
    "type": "type",
    "types": "type",
    "typed": "type",
    "typing": "type",
}
108
+
109
+
110
class EmbeddingBackend(ABC):
    """Interface that every embedding backend implements."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable identifier for this backend."""
        ...

    @abstractmethod
    def embed(self, texts: list[str]) -> list[Embedding]:
        """Embed each input string.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding vector per input, in the same order.
        """
        ...

    def similarity(self, a: str, b: str) -> float:
        """Similarity of two texts via cosine similarity of their embeddings.

        Subclasses may override this with a cheaper computation.
        """
        vec_a, vec_b = self.embed([a, b])
        return _cosine_similarity(vec_a, vec_b)
139
+
140
+
141
+ def _cosine_similarity(a: Embedding, b: Embedding) -> float:
142
+ """Compute cosine similarity between two vectors."""
143
+ a_arr = np.array(a)
144
+ b_arr = np.array(b)
145
+
146
+ dot = np.dot(a_arr, b_arr)
147
+ norm_a = np.linalg.norm(a_arr)
148
+ norm_b = np.linalg.norm(b_arr)
149
+
150
+ if norm_a == 0 or norm_b == 0:
151
+ return 0.0
152
+
153
+ return float(dot / (norm_a * norm_b))
154
+
155
+
156
class TokenBackend(EmbeddingBackend):
    """Dependency-free backend based on normalized token overlap.

    Text is lowercased, split on non-alphanumeric runs, stripped of stop
    words, folded through the synonym table, and compared with the
    Jaccard index. This is the default backend.
    """

    @property
    def name(self) -> str:
        return "token"

    def _tokenize(self, text: str) -> set[str]:
        """Lowercase, split, drop stop words, and apply synonym folding."""
        pieces = re.split(r"[^a-z0-9]+", text.lower())
        return {
            SYNONYMS.get(piece, piece)
            for piece in pieces
            if piece and piece not in STOP_WORDS
        }

    def embed(self, texts: list[str]) -> list[Embedding]:
        """Return binary bag-of-words vectors over a shared vocabulary.

        These are not learned embeddings — they exist only so this backend
        satisfies the EmbeddingBackend interface. The vocabulary (and thus
        the vector dimensionality) is rebuilt per call from the given texts.
        """
        token_sets = [self._tokenize(text) for text in texts]

        # Assign each distinct token a stable column index, in first-seen order.
        vocab: dict[str, int] = {}
        for token_set in token_sets:
            for token in token_set:
                vocab.setdefault(token, len(vocab))

        vectors: list[Embedding] = []
        for token_set in token_sets:
            row = [0.0] * len(vocab)
            for token in token_set:
                row[vocab[token]] = 1.0
            vectors.append(row)

        return vectors

    def similarity(self, a: str, b: str) -> float:
        """Jaccard similarity of the two normalized token sets."""
        set_a = self._tokenize(a)
        set_b = self._tokenize(b)

        # Either side empty after normalization -> nothing to compare.
        if not set_a or not set_b:
            return 0.0

        union = len(set_a | set_b)
        return len(set_a & set_b) / union if union > 0 else 0.0
220
+
221
+
222
class SentenceTransformerBackend(EmbeddingBackend):
    """Offline embedding backend backed by sentence-transformers.

    Requires: pip install buildlog[embeddings]
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember the model name; the model itself is loaded on first use.

        Args:
            model_name: HuggingFace model name. Default is a small, fast model.
        """
        self._model_name = model_name
        self._model = None

    @property
    def name(self) -> str:
        return f"sentence-transformers ({self._model_name})"

    def _get_model(self):
        """Import and instantiate the model on first call, then cache it."""
        if self._model is not None:
            return self._model

        # Import lazily so the package stays optional.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for local embeddings. "
                "Install with: pip install buildlog[embeddings]"
            ) from e

        self._model = SentenceTransformer(self._model_name)
        return self._model

    def embed(self, texts: list[str]) -> list[Embedding]:
        """Encode texts with the lazily loaded sentence-transformers model."""
        encoded = self._get_model().encode(texts, convert_to_numpy=True)
        return [vector.tolist() for vector in encoded]
261
+
262
+
263
class OpenAIBackend(EmbeddingBackend):
    """Embedding backend that calls the OpenAI embeddings API.

    Requires: OPENAI_API_KEY environment variable.
    """

    def __init__(self, model: str = "text-embedding-3-small"):
        """Validate credentials up front and remember the model name.

        Args:
            model: OpenAI embedding model name.

        Raises:
            ValueError: If OPENAI_API_KEY is not set.
        """
        # Fail fast at construction time rather than on the first API call.
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError(
                "OpenAI backend requires OPENAI_API_KEY environment variable. "
                "Set it with: export OPENAI_API_KEY=your-key"
            )
        self._model = model
        self._client = None

    @property
    def name(self) -> str:
        return f"openai ({self._model})"

    def _get_client(self):
        """Import openai and build the client on first use, then cache it."""
        if self._client is not None:
            return self._client

        # Import lazily so the package stays optional.
        try:
            import openai
        except ImportError as e:
            raise ImportError(
                "openai package is required for OpenAI embeddings. "
                "Install with: pip install openai"
            ) from e

        self._client = openai.OpenAI()
        return self._client

    def embed(self, texts: list[str]) -> list[Embedding]:
        """Request embeddings from the API; results preserve input order."""
        response = self._get_client().embeddings.create(
            input=texts, model=self._model
        )
        return [row.embedding for row in response.data]
310
+
311
+
312
# Backend registry
# Maps backend name -> class; get_backend() looks the name up here and
# instantiates the class with its **kwargs.
_BACKENDS: dict[BackendName, type[EmbeddingBackend]] = {
    "token": TokenBackend,
    "sentence-transformers": SentenceTransformerBackend,
    "openai": OpenAIBackend,
}
318
+
319
# Default backend instance (singleton) with thread safety
# Lazily created by get_default_backend(); writes are guarded by the lock.
_default_backend: EmbeddingBackend | None = None
_default_backend_lock = threading.Lock()
322
+
323
+
324
def get_backend(name: BackendName = "token", **kwargs) -> EmbeddingBackend:
    """Instantiate an embedding backend by registry name.

    Args:
        name: Backend name - "token", "sentence-transformers", or "openai".
        **kwargs: Additional arguments passed to backend constructor.

    Returns:
        EmbeddingBackend instance.

    Raises:
        ValueError: If backend name is not recognized.
    """
    backend_cls = _BACKENDS.get(name)
    if backend_cls is not None:
        return backend_cls(**kwargs)

    raise ValueError(
        f"Unknown embedding backend: {name}. "
        f"Available: {list(_BACKENDS.keys())}"
    )
345
+
346
+
347
def get_default_backend() -> EmbeddingBackend:
    """Return the process-wide default backend (token-based), creating it once.

    Thread-safe: uses double-checked locking so the common (already
    initialized) path avoids taking the lock.
    """
    global _default_backend
    if _default_backend is not None:
        return _default_backend

    with _default_backend_lock:
        # Re-check: another thread may have initialized it while we waited.
        if _default_backend is None:
            _default_backend = TokenBackend()

    return _default_backend
359
+
360
+
361
def similarity(a: str, b: str, backend: EmbeddingBackend | None = None) -> float:
    """Compute similarity between two texts.

    Args:
        a: First text.
        b: Second text.
        backend: Embedding backend to use. Defaults to token-based.

    Returns:
        Similarity score. The default token backend yields Jaccard scores
        in [0, 1]; cosine-based embedding backends may return values in
        [-1, 1].
    """
    if backend is None:
        backend = get_default_backend()
    return backend.similarity(a, b)
375
+
376
+
377
def compute_embeddings(
    texts: list[str],
    backend: EmbeddingBackend | None = None,
) -> list[Embedding]:
    """Compute embeddings for a list of texts.

    Args:
        texts: List of strings to embed.
        backend: Embedding backend to use. Defaults to token-based.

    Returns:
        List of embedding vectors.
    """
    chosen = get_default_backend() if backend is None else backend
    return chosen.embed(texts)
@@ -0,0 +1,15 @@
1
+ """MCP server for buildlog integration."""
2
+
3
+ from buildlog.mcp.tools import (
4
+ buildlog_diff,
5
+ buildlog_promote,
6
+ buildlog_reject,
7
+ buildlog_status,
8
+ )
9
+
10
+ __all__ = [
11
+ "buildlog_status",
12
+ "buildlog_promote",
13
+ "buildlog_reject",
14
+ "buildlog_diff",
15
+ ]
buildlog/mcp/server.py ADDED
@@ -0,0 +1,29 @@
1
+ """Buildlog MCP server for Claude Code integration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from mcp.server.fastmcp import FastMCP
6
+
7
+ from buildlog.mcp.tools import (
8
+ buildlog_diff,
9
+ buildlog_promote,
10
+ buildlog_reject,
11
+ buildlog_status,
12
+ )
13
+
14
# Module-level server instance; importing this module creates it.
mcp = FastMCP("buildlog")

# Register tools
# mcp.tool() returns a decorator, applied here in call form so the plain
# functions in buildlog.mcp.tools stay importable and testable on their own.
# NOTE(review): presumably FastMCP derives each tool's name/description from
# the function and its docstring — confirm against the MCP SDK docs.
mcp.tool()(buildlog_status)
mcp.tool()(buildlog_promote)
mcp.tool()(buildlog_reject)
mcp.tool()(buildlog_diff)
21
+
22
+
23
def main() -> None:
    """Console entry point: run the buildlog MCP server until it exits."""
    mcp.run()


if __name__ == "__main__":
    main()
buildlog/mcp/tools.py ADDED
@@ -0,0 +1,97 @@
1
+ """MCP tool implementations for buildlog.
2
+
3
+ These are thin wrappers around core operations.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import asdict
9
+ from pathlib import Path
10
+ from typing import Literal
11
+
12
+ from buildlog.core import diff, promote, reject, status
13
+
14
+
15
+ def _validate_skill_ids(skill_ids: list[str]) -> list[str]:
16
+ """Filter out invalid skill IDs (empty strings, None, whitespace)."""
17
+ return [sid for sid in skill_ids if sid and isinstance(sid, str) and sid.strip()]
18
+
19
+
20
def buildlog_status(
    buildlog_dir: str = "buildlog",
    min_confidence: Literal["low", "medium", "high"] = "low",
) -> dict:
    """Get current skills extracted from buildlog entries.

    Returns skills grouped by category with confidence scores.
    Use this to see what patterns have emerged from your work.

    Args:
        buildlog_dir: Path to buildlog directory (default: ./buildlog)
        min_confidence: Minimum confidence level to include

    Returns:
        Dictionary with skills by category and summary statistics
    """
    # Thin wrapper: delegate to core and serialize the dataclass result.
    return asdict(status(Path(buildlog_dir), min_confidence))
38
+
39
+
40
def buildlog_promote(
    skill_ids: list[str],
    target: Literal["claude_md", "settings_json"] = "claude_md",
    buildlog_dir: str = "buildlog",
) -> dict:
    """Promote skills to your agent's rules.

    Writes selected skills to CLAUDE.md or .claude/settings.json
    so your AI agent will follow these patterns.

    Args:
        skill_ids: List of skill IDs to promote (e.g., ["arch-b0fcb62a1e"])
        target: Where to write rules ("claude_md" or "settings_json")
        buildlog_dir: Path to buildlog directory

    Returns:
        Confirmation with promoted skills
    """
    # Drop malformed IDs before handing off to core.
    cleaned = _validate_skill_ids(skill_ids)
    return asdict(promote(Path(buildlog_dir), cleaned, target))
61
+
62
+
63
def buildlog_reject(
    skill_ids: list[str],
    buildlog_dir: str = "buildlog",
) -> dict:
    """Mark skills as rejected so they won't be suggested again.

    Rejected skills are stored in .buildlog/rejected.json

    Args:
        skill_ids: List of skill IDs to reject
        buildlog_dir: Path to buildlog directory

    Returns:
        Confirmation with rejected skill IDs
    """
    # Drop malformed IDs before handing off to core.
    cleaned = _validate_skill_ids(skill_ids)
    return asdict(reject(Path(buildlog_dir), cleaned))
81
+
82
+
83
def buildlog_diff(
    buildlog_dir: str = "buildlog",
) -> dict:
    """Show skills that haven't been promoted or rejected yet.

    Useful for seeing what's new since your last review.

    Args:
        buildlog_dir: Path to buildlog directory

    Returns:
        Dictionary with pending skills and counts
    """
    # Thin wrapper: delegate to core and serialize the dataclass result.
    return asdict(diff(Path(buildlog_dir)))
@@ -0,0 +1,41 @@
1
+ """Render adapters for different targets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Literal
7
+
8
+ from buildlog.render.base import RenderTarget
9
+ from buildlog.render.claude_md import ClaudeMdRenderer
10
+ from buildlog.render.settings_json import SettingsJsonRenderer
11
+
12
+ __all__ = [
13
+ "RenderTarget",
14
+ "ClaudeMdRenderer",
15
+ "SettingsJsonRenderer",
16
+ "get_renderer",
17
+ ]
18
+
19
+
20
def get_renderer(
    target: Literal["claude_md", "settings_json"],
    path: Path | None = None,
) -> ClaudeMdRenderer | SettingsJsonRenderer:
    """Get renderer for target.

    Args:
        target: Target format - "claude_md" or "settings_json".
        path: Optional custom path for the target file.

    Returns:
        Renderer instance.

    Raises:
        ValueError: If target is not recognized.
    """
    if target == "claude_md":
        return ClaudeMdRenderer(path=path)
    if target == "settings_json":
        return SettingsJsonRenderer(path=path)
    raise ValueError(f"Unknown render target: {target}. Must be 'claude_md' or 'settings_json'")
@@ -0,0 +1,23 @@
1
+ """Base protocol for render targets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Protocol
6
+
7
+ if TYPE_CHECKING:
8
+ from buildlog.skills import Skill
9
+
10
+
11
class RenderTarget(Protocol):
    """Protocol for rendering skills to different targets.

    Structural interface: any object with a matching ``render`` method
    conforms, no inheritance required.
    """

    def render(self, skills: list[Skill]) -> str:
        """Render skills and write to target.

        Args:
            skills: List of skills to render.

        Returns:
            Confirmation message describing what was written.
        """
        ...
+ ...