coderay 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderay/__init__.py +1 -0
- coderay/chunking/__init__.py +0 -0
- coderay/chunking/chunker.py +127 -0
- coderay/chunking/registry.py +190 -0
- coderay/cli/__init__.py +3 -0
- coderay/cli/commands.py +475 -0
- coderay/core/__init__.py +0 -0
- coderay/core/config.py +73 -0
- coderay/core/lock.py +36 -0
- coderay/core/models.py +71 -0
- coderay/core/timing.py +45 -0
- coderay/core/utils.py +35 -0
- coderay/embedding/__init__.py +0 -0
- coderay/embedding/base.py +60 -0
- coderay/embedding/local.py +68 -0
- coderay/embedding/openai.py +87 -0
- coderay/graph/__init__.py +19 -0
- coderay/graph/builder.py +128 -0
- coderay/graph/code_graph.py +311 -0
- coderay/graph/extractor.py +315 -0
- coderay/mcp_server/__init__.py +0 -0
- coderay/mcp_server/server.py +178 -0
- coderay/pipeline/__init__.py +0 -0
- coderay/pipeline/indexer.py +417 -0
- coderay/pipeline/watcher.py +318 -0
- coderay/retrieval/__init__.py +3 -0
- coderay/retrieval/boosting.py +80 -0
- coderay/retrieval/search.py +121 -0
- coderay/skeleton/__init__.py +0 -0
- coderay/skeleton/extractor.py +140 -0
- coderay/state/__init__.py +8 -0
- coderay/state/machine.py +242 -0
- coderay/state/version.py +47 -0
- coderay/storage/__init__.py +0 -0
- coderay/storage/lancedb.py +268 -0
- coderay/vcs/__init__.py +0 -0
- coderay/vcs/git.py +193 -0
- coderay-1.0.0.dist-info/METADATA +145 -0
- coderay-1.0.0.dist-info/RECORD +42 -0
- coderay-1.0.0.dist-info/WHEEL +5 -0
- coderay-1.0.0.dist-info/entry_points.txt +3 -0
- coderay-1.0.0.dist-info/top_level.txt +1 -0
coderay/core/utils.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def hash_content(content: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``content`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(content.encode("utf-8"))
    return digest.hexdigest()
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def read_from_path(path: Path) -> str:
    """Return the text of ``path`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are substituted with the replacement
    character instead of raising a decode error.
    """
    with path.open(encoding="utf-8", errors="replace") as handle:
        return handle.read()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def files_with_changed_content(
    repo: Path,
    paths: list[Path],
    file_hashes: dict[str, str],
) -> list[Path]:
    """Select the entries of ``paths`` that should be treated as changed.

    A path is kept when any of the following holds:

    * it is not located under ``repo`` (no relative key can be derived);
    * reading or hashing it raises;
    * its current content hash differs from ``file_hashes`` for its
      repo-relative path (a missing entry counts as different).

    The input order of ``paths`` is preserved in the result.
    """

    def _differs(candidate: Path) -> bool:
        # Keys in file_hashes are the string form of the repo-relative path.
        try:
            key = str(candidate.relative_to(repo))
        except ValueError:
            return True
        # Any failure while reading/hashing is reported as "changed".
        try:
            text = candidate.read_text(encoding="utf-8", errors="replace")
            return file_hashes.get(key) != hash_content(text)
        except Exception:
            return True

    return [candidate for candidate in paths if _differs(candidate)]
|
|
File without changes
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from coderay.core.config import get_embedding_dimensions
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Embedder(ABC):
    """Interface for anything that turns texts into fixed-size float vectors.

    Concrete subclasses must provide the ``dimensions`` property and the
    ``embed`` method; the base class cannot be instantiated.
    """

    @property
    @abstractmethod
    def dimensions(self) -> int:
        """Length of every vector this embedder returns (e.g. 384)."""

    @abstractmethod
    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one vector per entry of ``texts``."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_embedder_from_config(config: dict[str, Any]) -> Embedder:
    """Build an Embedder from a config dict.

    Reads the ``"embedder"`` section of ``config`` (a missing or empty
    section is treated as ``{}``). ``provider`` defaults to ``"local"`` and is
    matched case-insensitively; ``model`` falls back to a provider-specific
    default. Vector dimensions come from ``get_embedding_dimensions(config)``.

    Args:
        config: Parsed configuration mapping.

    Returns:
        A ``LocalEmbedder`` or ``OpenAIEmbedder`` instance.

    Raises:
        ImportError: If the provider's module or package cannot be imported.
        ValueError: If ``provider`` is not ``"local"`` or ``"openai"``.
    """
    emb = config.get("embedder") or {}
    provider = (emb.get("provider") or "local").lower()

    if provider == "local":
        try:
            from coderay.embedding.local import LocalEmbedder
        except ImportError as exc:
            raise ImportError(
                "Local provider requires 'fastembed'. Install with: pip install coderay"
            ) from exc
        return LocalEmbedder(
            model=emb.get("model") or "sentence-transformers/all-MiniLM-L6-v2",
            dimensions=get_embedding_dimensions(config),
        )

    if provider == "openai":
        model = emb.get("model") or "text-embedding-3-small"
        dimensions = get_embedding_dimensions(config)
        try:
            from coderay.embedding.openai import OpenAIEmbedder

            # The 'openai' package is imported inside OpenAIEmbedder.__init__,
            # not at module import time, so the constructor call must sit in
            # the guarded region for the install hint below to be reachable.
            return OpenAIEmbedder(
                model=model,
                dimensions=dimensions,
                api_key=emb.get("api_key"),
            )
        except ImportError as exc:
            raise ImportError(
                "OpenAI provider requires the 'openai' package. "
                "Install with: pip install coderay[openai]"
            ) from exc

    raise ValueError(
        f"Unknown embedder provider: {provider}. Supported: 'local', 'openai'."
    )
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from coderay.embedding.base import Embedder
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_DIMENSIONS = 384

# all-MiniLM-L6-v2 max_seq_length is 256 tokens (~1200 chars of code).
# Truncating early avoids the tokenizer wasting time on text the model
# will discard anyway.
#
# TODO: symbols exceeding this limit lose tail information. Future options:
#   - Split long chunks into overlapping windows and average embeddings
#   - Use a model with a larger context (e.g. nomic-embed-text, 8192 tokens)
#   - Embed a signature+docstring summary instead of raw code for large symbols
MAX_CHARS = 1500


def _read_parallel_workers(raw: str | None) -> int | None:
    """Parse the EMBED_WORKERS environment value.

    Args:
        raw: The raw environment string, or None when the variable is unset.

    Returns:
        The parsed integer, or None when the value is unset, blank, zero, or
        not an integer. A malformed value is logged and ignored so that merely
        importing this module cannot fail on a bad environment variable.
    """
    if raw is None or not raw.strip():
        return None
    try:
        workers = int(raw)
    except ValueError:
        logging.getLogger(__name__).warning(
            "Ignoring invalid EMBED_WORKERS=%r; expected an integer", raw
        )
        return None
    # 0 is normalised to None, same as leaving the variable unset.
    return workers or None


# Worker count forwarded as ``parallel=`` to the model's embed call.
# NOTE(review): per the fastembed docs, None means "no data-parallel workers,
# use default onnxruntime threading" -- confirm against the installed version.
_PARALLEL_WORKERS = _read_parallel_workers(os.environ.get("EMBED_WORKERS"))
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class LocalEmbedder(Embedder):
    """Embedder that runs a fastembed ``TextEmbedding`` model in-process.

    The model object is created on the first non-empty ``embed`` call rather
    than in the constructor.
    """

    def __init__(
        self,
        model: str = DEFAULT_MODEL,
        dimensions: int = DEFAULT_DIMENSIONS,
    ) -> None:
        """Record the model name and vector size; nothing is loaded yet."""
        self._model_name = model
        self._dimensions = dimensions
        self._model = None

    def _load_model(self):
        """Import fastembed and instantiate the configured model."""
        from fastembed import TextEmbedding

        logger.info("Loading local embedding model %s...", self._model_name)
        self._model = TextEmbedding(model_name=self._model_name)

    @property
    def dimensions(self) -> int:
        """Vector size given to the constructor (not queried from the model)."""
        return self._dimensions

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one vector per text, loading the model on first use.

        Each text is cut to at most ``MAX_CHARS`` characters before being
        handed to the model. An empty ``texts`` returns ``[]`` without
        loading anything.
        """
        if not texts:
            return []
        if self._model is None:
            self._load_model()

        # Slicing is a no-op for texts already within the limit.
        clipped = [text[:MAX_CHARS] for text in texts]
        logger.info("Embedding %d chunks...", len(clipped))
        vectors = self._model.embed(
            clipped,
            parallel=_PARALLEL_WORKERS,
            batch_size=256,
        )
        return [vector.tolist() for vector in vectors]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
from coderay.embedding.base import Embedder
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
# Each input text is cut to this many characters before it is sent.
MAX_CHARS_PER_TEXT = 8000
# Total number of attempts per batch (the last failure is re-raised).
MAX_RETRIES = 3
# Seconds waited after the first failed attempt; doubles on each later one.
RETRY_BASE_DELAY = 1.0
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class OpenAIEmbedder(Embedder):
    """Embedder backed by the OpenAI embeddings endpoint."""

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        dimensions: int = 1536,
        api_key: str | None = None,
    ):
        """Create the API client.

        The key is taken from ``api_key`` or, failing that, from the
        ``OPENAI_API_KEY`` environment variable.

        Raises:
            ValueError: If no API key is available from either source.
        """
        import os

        import openai

        self._model = model
        self._dimensions = dimensions
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if self._api_key:
            self._client = openai.OpenAI(api_key=self._api_key)
            return
        raise ValueError(
            "OpenAI API key required: set OPENAI_API_KEY or pass api_key"
        )

    @property
    def dimensions(self) -> int:
        """Vector size requested from the API (e.g. 1536)."""
        return self._dimensions

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one vector per text, calling the API in batches of 100.

        Texts are cut to ``MAX_CHARS_PER_TEXT`` characters first. An empty
        ``texts`` returns ``[]`` without contacting the API.
        """
        if not texts:
            return []

        # Slicing is a no-op for texts already within the limit.
        clipped = [text[:MAX_CHARS_PER_TEXT] for text in texts]

        chunk_size = 100
        vectors: list[list[float]] = []
        for start in range(0, len(clipped), chunk_size):
            vectors += self._embed_with_retry(clipped[start : start + chunk_size])
        return vectors

    def _embed_with_retry(self, texts: list[str]) -> list[list[float]]:
        """Embed one batch, retrying rate-limit, timeout and server errors.

        Up to ``MAX_RETRIES`` attempts are made; the wait between attempts is
        ``RETRY_BASE_DELAY * 2**attempt`` seconds. The error from the final
        attempt is re-raised.
        """
        import openai

        retryable = (
            openai.RateLimitError,
            openai.APITimeoutError,
            openai.InternalServerError,
        )
        final_attempt = MAX_RETRIES - 1
        for attempt in range(MAX_RETRIES):
            try:
                response = self._client.embeddings.create(
                    model=self._model,
                    input=texts,
                    dimensions=self._dimensions,
                )
            except retryable as exc:
                if attempt == final_attempt:
                    raise
                pause = RETRY_BASE_DELAY * (2**attempt)
                logger.warning(
                    "OpenAI embed attempt %d failed (%s), retrying in %.1fs...",
                    attempt + 1,
                    exc,
                    pause,
                )
                time.sleep(pause)
            else:
                return [item.embedding for item in response.data]
        # Only reached when MAX_RETRIES allows no attempts at all.
        return []
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from coderay.graph.builder import (
|
|
2
|
+
GRAPH_FILENAME,
|
|
3
|
+
build_and_save_graph,
|
|
4
|
+
build_graph,
|
|
5
|
+
load_graph,
|
|
6
|
+
save_graph,
|
|
7
|
+
)
|
|
8
|
+
from coderay.graph.code_graph import CodeGraph
|
|
9
|
+
from coderay.graph.extractor import GraphExtractor
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"GRAPH_FILENAME",
|
|
13
|
+
"CodeGraph",
|
|
14
|
+
"GraphExtractor",
|
|
15
|
+
"build_and_save_graph",
|
|
16
|
+
"build_graph",
|
|
17
|
+
"load_graph",
|
|
18
|
+
"save_graph",
|
|
19
|
+
]
|
coderay/graph/builder.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from coderay.graph.code_graph import CodeGraph
|
|
9
|
+
from coderay.graph.extractor import GraphExtractor
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# File name, inside the index directory, under which the graph is stored as JSON.
GRAPH_FILENAME = "graph.json"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_graph(
    repo_root: str | Path,
    file_paths_and_contents: list[tuple[str, str]],
    config: dict[str, Any] | None = None,
) -> CodeGraph:
    """Create a fresh CodeGraph from ``(path, content)`` pairs.

    Every file is run through one shared ``GraphExtractor``; a file whose
    extraction or insertion raises is logged and skipped. Edges are resolved
    once after all files have been processed.

    Args:
        repo_root: Not used by this function.
        file_paths_and_contents: Pairs of file path and file text.
        config: Forwarded to ``GraphExtractor``.

    Returns:
        The populated graph, after ``resolve_edges()`` has run.
    """
    extractor = GraphExtractor(config=config)
    code_graph = CodeGraph()

    for source_path, source_text in file_paths_and_contents:
        try:
            found_nodes, found_edges = extractor.extract_from_file(
                source_path, source_text
            )
            code_graph.add_nodes_and_edges(found_nodes, found_edges)
        except Exception as exc:
            logger.warning("Graph extraction failed for %s: %s", source_path, exc)

    resolved_count = code_graph.resolve_edges()
    logger.info(
        "Graph built: %d nodes, %d edges (%d call edges resolved)",
        code_graph.node_count,
        code_graph.edge_count,
        resolved_count,
    )
    return code_graph
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def save_graph(graph: CodeGraph, index_dir: str | Path) -> Path:
    """Write ``graph.to_dict()`` as indented JSON into ``index_dir``.

    The directory is created when missing. Returns the path of the file
    that was written.
    """
    target = Path(index_dir).joinpath(GRAPH_FILENAME)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(graph.to_dict(), indent=2)
    target.write_text(serialized)
    logger.info("Saved graph to %s", target)
    return target
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def load_graph(index_dir: str | Path) -> CodeGraph | None:
    """Read the saved graph from ``index_dir``.

    Returns None when the graph file is absent, and also when reading or
    decoding it fails (the failure is logged as a warning).
    """
    source = Path(index_dir).joinpath(GRAPH_FILENAME)
    if source.is_file():
        try:
            return CodeGraph.from_dict(json.loads(source.read_text()))
        except Exception as exc:
            logger.warning("Failed to load graph from %s: %s", source, exc)
    return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def build_and_save_graph(
    repo_root: str | Path,
    index_dir: str | Path,
    changed_paths: list[str] | None = None,
) -> None:
    """Build or incrementally update the graph, then save it to ``index_dir``.

    Args:
        repo_root: Directory that the relative file paths are resolved against.
        index_dir: Directory holding the config, the state and the saved graph.
        changed_paths: Repo-relative paths to re-parse. ``None`` requests a
            full build over the paths recorded in the state machine's
            ``file_hashes`` that end with a supported extension. A list
            (including an empty one) requests an incremental update of the
            previously saved graph.

    Paths that are no longer files are dropped from the graph in incremental
    mode and are otherwise skipped.
    """
    from coderay.core.config import load_config
    from coderay.state.machine import StateMachine

    repo = Path(repo_root)
    idx_dir = Path(index_dir)
    config = load_config(idx_dir)

    # Test against None rather than truthiness: an empty change list must
    # still load (and keep) the saved graph instead of replacing it with a
    # graph built from zero files.
    existing_graph = load_graph(idx_dir) if changed_paths is not None else None
    incremental = existing_graph is not None

    if changed_paths is not None:
        # NOTE(review): when no saved graph could be loaded, the graph below
        # is built from changed_paths only -- confirm callers expect that.
        paths_to_parse = changed_paths
    else:
        from coderay.chunking.registry import get_supported_extensions

        # str.endswith accepts a tuple of suffixes.
        supported = tuple(get_supported_extensions())
        sm = StateMachine(idx_dir)
        paths_to_parse = [p for p in sm.file_hashes if p.endswith(supported)]

    files_with_content: list[tuple[str, str]] = []
    for p in paths_to_parse:
        full = repo / p
        if full.is_file():
            try:
                content = full.read_text(encoding="utf-8", errors="replace")
                files_with_content.append((p, content))
            except Exception as e:
                logger.warning("Could not read %s for graph: %s", p, e)

    if incremental:
        extractor = GraphExtractor(config=config)
        # Remove every listed path first, including ones that no longer
        # exist on disk, so their stale entries leave the graph.
        for fp in paths_to_parse:
            existing_graph.remove_file(fp)
        for fp, content in files_with_content:
            try:
                nodes, edges = extractor.extract_from_file(fp, content)
                existing_graph.add_nodes_and_edges(nodes, edges)
            except Exception as exc:
                logger.warning("Graph extraction failed for %s: %s", fp, exc)
        existing_graph.resolve_edges()
        graph = existing_graph
        logger.info(
            "Graph incremental update: re-parsed %d files",
            len(files_with_content),
        )
    else:
        graph = build_graph(repo_root, files_with_content, config=config)

    save_graph(graph, idx_dir)
    # Report the number of files parsed in this run; the previous code put
    # the node count into the file-count slot for incremental updates.
    logger.info(
        "Graph: saved %d nodes, %d edges (%d files parsed)",
        graph.node_count,
        graph.edge_count,
        len(files_with_content),
    )
|