coderay 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coderay/core/utils.py ADDED
@@ -0,0 +1,35 @@
1
+ import hashlib
2
+ from pathlib import Path
3
+
4
+
5
def hash_content(content: str) -> str:
    """Return the SHA-256 digest of ``content`` as a hex string.

    The text is encoded as UTF-8 before it is hashed.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
8
+
9
+
10
def read_from_path(path: Path) -> str:
    """Return the contents of ``path`` decoded as UTF-8 text.

    Byte sequences that are not valid UTF-8 are substituted with the
    replacement character instead of raising a decode error.
    """
    with path.open(mode="r", encoding="utf-8", errors="replace") as handle:
        return handle.read()
13
+
14
+
15
def files_with_changed_content(
    repo: Path,
    paths: list[Path],
    file_hashes: dict[str, str],
) -> list[Path]:
    """Select the paths whose current content does not match ``file_hashes``.

    Args:
        repo: Directory that ``paths`` are made relative to for the lookup.
        paths: Candidate files to check.
        file_hashes: Known content hashes keyed by the path relative to
            ``repo`` (as produced by ``str(path.relative_to(repo))``).

    Returns:
        The subset of ``paths``, in input order, that count as changed. A
        path counts as changed when it is not located under ``repo``, when
        it cannot be read or hashed, or when its hash differs from (or is
        missing in) ``file_hashes``.
    """
    changed: list[Path] = []
    for candidate in paths:
        try:
            key = str(candidate.relative_to(repo))
        except ValueError:
            # Not under the repo: there is no key to compare against.
            changed.append(candidate)
            continue

        try:
            text = candidate.read_text(encoding="utf-8", errors="replace")
            unchanged = file_hashes.get(key) == hash_content(text)
        except Exception:
            # Any failure while reading/hashing is reported as "changed".
            unchanged = False

        if not unchanged:
            changed.append(candidate)
    return changed
File without changes
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any
6
+
7
+ from coderay.core.config import get_embedding_dimensions
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class Embedder(ABC):
    """Interface every embedder implements.

    A concrete embedder reports the size of its vectors through
    ``dimensions`` and converts a list of texts into vectors with ``embed``.
    """

    @property
    @abstractmethod
    def dimensions(self) -> int:
        """Return the vector dimension (e.g. 384 for all-MiniLM-L6-v2)."""

    @abstractmethod
    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` and return one vector per text."""
25
+
26
+
27
def load_embedder_from_config(config: dict[str, Any]) -> Embedder:
    """Create the embedder selected by ``config["embedder"]``.

    The ``embedder`` section may provide ``provider`` (``"local"`` or
    ``"openai"``, case-insensitive, defaulting to ``"local"``), ``model`` and,
    for OpenAI, ``api_key``. The vector size is taken from
    ``get_embedding_dimensions(config)``.

    Raises:
        ValueError: If the provider is neither ``local`` nor ``openai``.
        ImportError: If the module backing the chosen provider cannot be
            imported.
    """
    settings = config.get("embedder") or {}
    provider = (settings.get("provider") or "local").lower()

    # Reject unsupported providers up front; the two branches below are the
    # only supported ones.
    if provider not in ("local", "openai"):
        raise ValueError(
            f"Unknown embedder provider: {provider}. Supported: 'local', 'openai'."
        )

    if provider == "openai":
        try:
            from coderay.embedding.openai import OpenAIEmbedder
        except ImportError as exc:
            raise ImportError(
                "OpenAI provider requires the 'openai' package. "
                "Install with: pip install coderay[openai]"
            ) from exc
        openai_model = settings.get("model") or "text-embedding-3-small"
        return OpenAIEmbedder(
            model=openai_model,
            dimensions=get_embedding_dimensions(config),
            api_key=settings.get("api_key"),
        )

    try:
        from coderay.embedding.local import LocalEmbedder
    except ImportError as exc:
        raise ImportError(
            "Local provider requires 'fastembed'. Install with: pip install coderay"
        ) from exc
    local_model = settings.get("model") or "sentence-transformers/all-MiniLM-L6-v2"
    return LocalEmbedder(
        model=local_model,
        dimensions=get_embedding_dimensions(config),
    )
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+
6
+ from coderay.embedding.base import Embedder
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
11
+ DEFAULT_DIMENSIONS = 384
12
+
13
+ # all-MiniLM-L6-v2 max_seq_length is 256 tokens (~1200 chars of code).
14
+ # Truncating early avoids the tokenizer wasting time on text the model
15
+ # will discard anyway.
16
+ #
17
+ # TODO: symbols exceeding this limit lose tail information. Future options:
18
+ # - Split long chunks into overlapping windows and average embeddings
19
+ # - Use a model with a larger context (e.g. nomic-embed-text, 8192 tokens)
20
+ # - Embed a signature+docstring summary instead of raw code for large symbols
21
+ MAX_CHARS = 1500
22
+
23
+ # Worker count passed to fastembed as `parallel`, read from EMBED_WORKERS.
+ # Unset or 0 becomes None, i.e. no explicit worker count is requested.
24
+ _PARALLEL_WORKERS = int(os.environ.get("EMBED_WORKERS", 0)) or None
25
+
26
+
27
class LocalEmbedder(Embedder):
    """Embedder that runs a fastembed (ONNX Runtime) model in-process.

    The fastembed model is not created in the constructor; it is loaded the
    first time ``embed`` is called with a non-empty list.
    """

    def __init__(
        self,
        model: str = DEFAULT_MODEL,
        dimensions: int = DEFAULT_DIMENSIONS,
    ) -> None:
        """Remember the model name and vector dimensions; load nothing yet."""
        self._model_name = model
        self._dimensions = dimensions
        # Populated by _load_model() on first use.
        self._model = None

    def _load_model(self):
        """Create the fastembed ``TextEmbedding`` for the configured model."""
        # Imported here so fastembed is only needed once embedding happens.
        from fastembed import TextEmbedding

        logger.info("Loading local embedding model %s...", self._model_name)
        self._model = TextEmbedding(model_name=self._model_name)

    @property
    def dimensions(self) -> int:
        """Return the vector dimension given at construction time."""
        return self._dimensions

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` with fastembed and return one vector per text.

        Each text is cut to at most ``MAX_CHARS`` characters before being
        handed to the model. An empty input returns an empty list without
        loading the model.
        """
        if not texts:
            return []
        if self._model is None:
            self._load_model()

        # Slicing leaves texts that are already short enough unchanged.
        clipped = [text[:MAX_CHARS] for text in texts]
        logger.info("Embedding %d chunks...", len(clipped))
        vectors = self._model.embed(
            clipped,
            parallel=_PARALLEL_WORKERS,
            batch_size=256,
        )
        return [vector.tolist() for vector in vectors]
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import time
5
+
6
+ from coderay.embedding.base import Embedder
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ MAX_CHARS_PER_TEXT = 8000
11
+ MAX_RETRIES = 3
12
+ RETRY_BASE_DELAY = 1.0
13
+
14
+
15
class OpenAIEmbedder(Embedder):
    """Embedder that calls the OpenAI embeddings API."""

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        dimensions: int = 1536,
        api_key: str | None = None,
    ):
        """Create the OpenAI client.

        Args:
            model: Embedding model name sent with every request.
            dimensions: Vector size; forwarded as the ``dimensions`` request
                parameter and reported by the ``dimensions`` property.
            api_key: API key; falls back to the ``OPENAI_API_KEY``
                environment variable when not given.

        Raises:
            ValueError: If no API key is passed and none is set in the
                environment.
        """
        import os

        import openai

        self._model = model
        self._dimensions = dimensions
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not self._api_key:
            raise ValueError(
                "OpenAI API key required: set OPENAI_API_KEY or pass api_key"
            )
        self._client = openai.OpenAI(api_key=self._api_key)

    @property
    def dimensions(self) -> int:
        """Vector dimension (e.g. 1536 for text-embedding-3-small)."""
        return self._dimensions

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed texts via the OpenAI API; returns one vector per text.

        Each text is cut to ``MAX_CHARS_PER_TEXT`` characters and the texts
        are sent in batches of 100. An empty input returns an empty list
        without calling the API.
        """
        if not texts:
            return []

        # Slicing is a no-op for texts already within the limit.
        truncated = [t[:MAX_CHARS_PER_TEXT] for t in texts]

        batch_size = 100
        all_vectors: list[list[float]] = []
        for start in range(0, len(truncated), batch_size):
            batch = truncated[start : start + batch_size]
            all_vectors.extend(self._embed_with_retry(batch))
        return all_vectors

    def _embed_with_retry(self, texts: list[str]) -> list[list[float]]:
        """Call the API with exponential backoff on transient errors.

        Rate-limit, timeout and internal-server errors are retried until
        ``MAX_RETRIES`` attempts have failed, after which the last error is
        re-raised. Other exceptions propagate immediately.

        At least one request is always made, so this never returns an empty
        list for a non-empty batch (which would misalign vectors and texts
        in ``embed``).
        """
        import openai

        retryable = (
            openai.RateLimitError,
            openai.APITimeoutError,
            openai.InternalServerError,
        )

        failures = 0
        while True:
            try:
                resp = self._client.embeddings.create(
                    model=self._model,
                    input=texts,
                    dimensions=self._dimensions,
                )
            except retryable as exc:
                failures += 1
                if failures >= MAX_RETRIES:
                    raise
                # Delay doubles with each failed attempt: base, 2*base, ...
                delay = RETRY_BASE_DELAY * (2 ** (failures - 1))
                logger.warning(
                    "OpenAI embed attempt %d failed (%s), retrying in %.1fs...",
                    failures,
                    exc,
                    delay,
                )
                time.sleep(delay)
            else:
                return [item.embedding for item in resp.data]
@@ -0,0 +1,19 @@
1
+ from coderay.graph.builder import (
2
+ GRAPH_FILENAME,
3
+ build_and_save_graph,
4
+ build_graph,
5
+ load_graph,
6
+ save_graph,
7
+ )
8
+ from coderay.graph.code_graph import CodeGraph
9
+ from coderay.graph.extractor import GraphExtractor
10
+
11
+ __all__ = [
12
+ "GRAPH_FILENAME",
13
+ "CodeGraph",
14
+ "GraphExtractor",
15
+ "build_and_save_graph",
16
+ "build_graph",
17
+ "load_graph",
18
+ "save_graph",
19
+ ]
@@ -0,0 +1,128 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from coderay.graph.code_graph import CodeGraph
9
+ from coderay.graph.extractor import GraphExtractor
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ GRAPH_FILENAME = "graph.json"
14
+
15
+
16
def build_graph(
    repo_root: str | Path,
    file_paths_and_contents: list[tuple[str, str]],
    config: dict[str, Any] | None = None,
) -> CodeGraph:
    """Build a CodeGraph from already-loaded file contents.

    Args:
        repo_root: Accepted for callers' convenience; not used here.
        file_paths_and_contents: ``(file_path, content)`` pairs to extract
            nodes and edges from.
        config: Optional configuration handed to ``GraphExtractor``.

    Returns:
        The graph after all files were processed and ``resolve_edges()`` was
        run. Files whose extraction raises are logged and skipped.
    """
    extractor = GraphExtractor(config=config)
    graph = CodeGraph()

    for source_path, source_text in file_paths_and_contents:
        try:
            file_nodes, file_edges = extractor.extract_from_file(
                source_path, source_text
            )
            graph.add_nodes_and_edges(file_nodes, file_edges)
        except Exception as exc:
            # One bad file must not abort the whole build.
            logger.warning("Graph extraction failed for %s: %s", source_path, exc)

    resolved_count = graph.resolve_edges()
    logger.info(
        "Graph built: %d nodes, %d edges (%d call edges resolved)",
        graph.node_count,
        graph.edge_count,
        resolved_count,
    )
    return graph
42
+
43
+
44
def save_graph(graph: CodeGraph, index_dir: str | Path) -> Path:
    """Persist the graph to ``index_dir/graph.json``.

    The directory is created if needed. The graph is serialized from
    ``graph.to_dict()`` as indented JSON and written as UTF-8.

    Returns:
        Path of the written file.
    """
    path = Path(index_dir) / GRAPH_FILENAME
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(graph.to_dict(), indent=2)
    # Explicit encoding so the file does not depend on the platform default.
    path.write_text(serialized, encoding="utf-8")
    logger.info("Saved graph to %s", path)
    return path
51
+
52
+
53
def load_graph(index_dir: str | Path) -> CodeGraph | None:
    """Load the graph stored at ``index_dir/graph.json``.

    Returns:
        The graph rebuilt via ``CodeGraph.from_dict``, or ``None`` when the
        file does not exist or cannot be read, parsed or converted (the
        failure is logged as a warning).
    """
    path = Path(index_dir) / GRAPH_FILENAME
    if not path.is_file():
        return None
    try:
        # Explicit encoding so reading does not depend on the platform
        # default; JSON written with json.dumps' defaults is ASCII-only and
        # therefore decodes identically.
        data = json.loads(path.read_text(encoding="utf-8"))
        return CodeGraph.from_dict(data)
    except Exception as exc:
        # Best effort: an unusable graph file is treated as "no graph".
        logger.warning("Failed to load graph from %s: %s", path, exc)
        return None
64
+
65
+
66
def build_and_save_graph(
    repo_root: str | Path,
    index_dir: str | Path,
    changed_paths: list[str] | None = None,
) -> None:
    """Build or incrementally update the graph, then save it.

    Args:
        repo_root: Directory that the parsed paths are relative to.
        index_dir: Directory holding the config, state and ``graph.json``.
        changed_paths: Repo-relative paths to re-parse. ``None`` requests a
            full build over the paths in ``StateMachine(index_dir).file_hashes``
            that end with a supported extension. A list (including an empty
            one) requests an incremental update of the saved graph.

    When ``changed_paths`` is given and a saved graph can be loaded, the
    entries for those paths are removed from it and the paths that still
    exist are re-extracted. When no saved graph can be loaded, a new graph is
    built from ``changed_paths`` alone.
    """
    from coderay.core.config import load_config
    from coderay.state.machine import StateMachine

    repo = Path(repo_root)
    idx_dir = Path(index_dir)
    config = load_config(idx_dir)

    # Test against None, not truthiness: an empty change list must update
    # (i.e. keep) the saved graph instead of replacing it with an empty one.
    existing_graph = load_graph(idx_dir) if changed_paths is not None else None
    incremental = existing_graph is not None

    if changed_paths is not None:
        paths_to_parse = changed_paths
    else:
        from coderay.chunking.registry import get_supported_extensions

        # str.endswith accepts a tuple, so build it once for all paths.
        supported = tuple(get_supported_extensions())
        sm = StateMachine(idx_dir)
        paths_to_parse = [p for p in sm.file_hashes if p.endswith(supported)]

    files_with_content: list[tuple[str, str]] = []
    for p in paths_to_parse:
        full = repo / p
        if not full.is_file():
            # Missing paths contribute nothing; in incremental mode their
            # old entries are still removed below.
            continue
        try:
            content = full.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            logger.warning("Could not read %s for graph: %s", p, e)
        else:
            files_with_content.append((p, content))

    if incremental:
        extractor = GraphExtractor(config=config)
        # Drop stale entries for every listed path before re-adding the
        # ones that could be read.
        for fp in paths_to_parse:
            existing_graph.remove_file(fp)
        for fp, content in files_with_content:
            try:
                nodes, edges = extractor.extract_from_file(fp, content)
                existing_graph.add_nodes_and_edges(nodes, edges)
            except Exception as exc:
                logger.warning("Graph extraction failed for %s: %s", fp, exc)
        existing_graph.resolve_edges()
        graph = existing_graph
        logger.info(
            "Graph incremental update: re-parsed %d files",
            len(files_with_content),
        )
    else:
        graph = build_graph(repo_root, files_with_content, config=config)

    save_graph(graph, idx_dir)
    # NOTE(review): in incremental mode the "files" figure below is the node
    # count, not a file count; kept as-is pending a decision on what to report.
    logger.info(
        "Graph: saved %d nodes, %d edges from %d files",
        graph.node_count,
        graph.edge_count,
        len(files_with_content) if not incremental else graph.node_count,
    )