codegraph-cli 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_cli/__init__.py +1 -1
- codegraph_cli/agents.py +1 -1
- codegraph_cli/cli.py +6 -0
- codegraph_cli/cli_chat.py +9 -4
- codegraph_cli/cli_setup.py +158 -0
- codegraph_cli/config.py +6 -1
- codegraph_cli/config_manager.py +70 -20
- codegraph_cli/context_manager.py +1 -1
- codegraph_cli/crew_agents.py +6 -1
- codegraph_cli/crew_chat.py +5 -1
- codegraph_cli/crew_tools.py +9 -1
- codegraph_cli/embeddings.py +268 -100
- codegraph_cli/orchestrator.py +2 -2
- codegraph_cli/rag.py +3 -3
- {codegraph_cli-2.0.0.dist-info → codegraph_cli-2.1.1.dist-info}/METADATA +57 -11
- {codegraph_cli-2.0.0.dist-info → codegraph_cli-2.1.1.dist-info}/RECORD +20 -20
- {codegraph_cli-2.0.0.dist-info → codegraph_cli-2.1.1.dist-info}/WHEEL +0 -0
- {codegraph_cli-2.0.0.dist-info → codegraph_cli-2.1.1.dist-info}/entry_points.txt +0 -0
- {codegraph_cli-2.0.0.dist-info → codegraph_cli-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {codegraph_cli-2.0.0.dist-info → codegraph_cli-2.1.1.dist-info}/top_level.txt +0 -0
codegraph_cli/__init__.py
CHANGED
codegraph_cli/agents.py
CHANGED
|
@@ -6,7 +6,7 @@ from collections import deque
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Dict, List, Set
|
|
8
8
|
|
|
9
|
-
from .embeddings import HashEmbeddingModel
|
|
9
|
+
from .embeddings import HashEmbeddingModel, TransformerEmbedder
|
|
10
10
|
from .llm import LocalLLM
|
|
11
11
|
from .models import ImpactReport
|
|
12
12
|
from .parser import PythonGraphParser
|
codegraph_cli/cli.py
CHANGED
|
@@ -10,6 +10,7 @@ import typer
|
|
|
10
10
|
from . import __version__, config
|
|
11
11
|
from .cli_chat import chat_app
|
|
12
12
|
from .cli_setup import setup as setup_wizard, set_llm, unset_llm, show_llm
|
|
13
|
+
from .cli_setup import set_embedding, unset_embedding, show_embedding
|
|
13
14
|
from .cli_v2 import v2_app
|
|
14
15
|
from .graph_export import export_dot, export_html
|
|
15
16
|
from .orchestrator import MCPOrchestrator
|
|
@@ -35,6 +36,11 @@ app.command("set-llm")(set_llm)
|
|
|
35
36
|
app.command("unset-llm")(unset_llm)
|
|
36
37
|
app.command("show-llm")(show_llm)
|
|
37
38
|
|
|
39
|
+
# Register embedding management commands
|
|
40
|
+
app.command("set-embedding")(set_embedding)
|
|
41
|
+
app.command("unset-embedding")(unset_embedding)
|
|
42
|
+
app.command("show-embedding")(show_embedding)
|
|
43
|
+
|
|
38
44
|
|
|
39
45
|
def version_callback(value: bool):
|
|
40
46
|
"""Print version and exit."""
|
codegraph_cli/cli_chat.py
CHANGED
|
@@ -13,7 +13,6 @@ import typer
|
|
|
13
13
|
from . import config
|
|
14
14
|
from .chat_agent import ChatAgent
|
|
15
15
|
from .chat_session import SessionManager
|
|
16
|
-
from .crew_chat import CrewChatAgent
|
|
17
16
|
from .llm import LocalLLM
|
|
18
17
|
from .orchestrator import MCPOrchestrator
|
|
19
18
|
from .rag import RAGRetriever
|
|
@@ -281,7 +280,7 @@ def start_chat(
|
|
|
281
280
|
new_session: bool = typer.Option(False, "--new", "-n", help="Force start a new session"),
|
|
282
281
|
):
|
|
283
282
|
"""Start interactive chat session."""
|
|
284
|
-
from .embeddings import
|
|
283
|
+
from .embeddings import get_embedder
|
|
285
284
|
from .project_context import ProjectContext
|
|
286
285
|
|
|
287
286
|
pm = ProjectManager()
|
|
@@ -294,12 +293,18 @@ def start_chat(
|
|
|
294
293
|
|
|
295
294
|
# Initialize components
|
|
296
295
|
context = ProjectContext(project, pm)
|
|
297
|
-
embedding_model =
|
|
296
|
+
embedding_model = get_embedder()
|
|
298
297
|
llm = LocalLLM(model=llm_model, provider=llm_provider, api_key=llm_api_key, endpoint=llm_endpoint)
|
|
299
298
|
rag_retriever = RAGRetriever(context.store, embedding_model)
|
|
300
299
|
|
|
301
300
|
if use_crew:
|
|
302
|
-
|
|
301
|
+
try:
|
|
302
|
+
from .crew_chat import CrewChatAgent
|
|
303
|
+
except ImportError:
|
|
304
|
+
print(f"\n {C_RED}CrewAI is not installed.{C_RESET}")
|
|
305
|
+
print(f" {C_DIM}Install with: pip install codegraph-cli[crew]{C_RESET}\n")
|
|
306
|
+
raise typer.Exit(1)
|
|
307
|
+
print(f"\n {C_MAGENTA}Initializing CrewAI multi-agent system...{C_RESET}")
|
|
303
308
|
agent = CrewChatAgent(context, llm, rag_retriever)
|
|
304
309
|
else:
|
|
305
310
|
orchestrator = MCPOrchestrator(
|
codegraph_cli/cli_setup.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Optional
|
|
|
8
8
|
import typer
|
|
9
9
|
|
|
10
10
|
from . import config_manager
|
|
11
|
+
from .embeddings import EMBEDDING_MODELS
|
|
11
12
|
|
|
12
13
|
app = typer.Typer(help="Setup wizard for LLM provider configuration")
|
|
13
14
|
|
|
@@ -287,6 +288,12 @@ def setup():
|
|
|
287
288
|
print_error("Failed to save configuration!")
|
|
288
289
|
raise typer.Exit(code=1)
|
|
289
290
|
|
|
291
|
+
# Offer embedding setup
|
|
292
|
+
typer.echo("")
|
|
293
|
+
setup_emb = typer.confirm("Configure embedding model for semantic search?", default=True)
|
|
294
|
+
if setup_emb:
|
|
295
|
+
_interactive_embedding_setup()
|
|
296
|
+
|
|
290
297
|
|
|
291
298
|
def set_llm(
|
|
292
299
|
provider: str = typer.Argument(..., help="LLM provider: ollama, groq, openai, anthropic, gemini, openrouter"),
|
|
@@ -466,5 +473,156 @@ def show_llm():
|
|
|
466
473
|
typer.echo("")
|
|
467
474
|
|
|
468
475
|
|
|
476
|
+
# ===================================================================
|
|
477
|
+
# Embedding model commands
|
|
478
|
+
# ===================================================================
|
|
479
|
+
|
|
480
|
+
def _interactive_embedding_setup():
|
|
481
|
+
"""Interactive embedding model picker (called from setup wizard)."""
|
|
482
|
+
typer.echo("")
|
|
483
|
+
typer.echo(typer.style("╭──────────────────────────────────────────────╮", fg=typer.colors.CYAN))
|
|
484
|
+
typer.echo(typer.style("│", fg=typer.colors.CYAN) + typer.style(" Embedding Model Setup ", bold=True) + typer.style("│", fg=typer.colors.CYAN))
|
|
485
|
+
typer.echo(typer.style("╰──────────────────────────────────────────────╯", fg=typer.colors.CYAN))
|
|
486
|
+
typer.echo("")
|
|
487
|
+
typer.echo("Choose an embedding model for semantic code search:")
|
|
488
|
+
typer.echo("Larger models give better results but need more disk/RAM.\n")
|
|
489
|
+
|
|
490
|
+
# List models with numbers
|
|
491
|
+
model_keys = list(EMBEDDING_MODELS.keys())
|
|
492
|
+
for i, key in enumerate(model_keys, 1):
|
|
493
|
+
spec = EMBEDDING_MODELS[key]
|
|
494
|
+
name_col = f"{key}".ljust(12)
|
|
495
|
+
size_col = f"({spec['size']})".ljust(14)
|
|
496
|
+
desc = spec["description"]
|
|
497
|
+
typer.echo(f" {i}) {name_col} {size_col} {desc}")
|
|
498
|
+
|
|
499
|
+
typer.echo("")
|
|
500
|
+
|
|
501
|
+
while True:
|
|
502
|
+
choice = typer.prompt(f"Enter choice [1-{len(model_keys)}]", type=str)
|
|
503
|
+
try:
|
|
504
|
+
idx = int(choice)
|
|
505
|
+
if 1 <= idx <= len(model_keys):
|
|
506
|
+
selected = model_keys[idx - 1]
|
|
507
|
+
break
|
|
508
|
+
except ValueError:
|
|
509
|
+
# Accept model key directly
|
|
510
|
+
if choice.strip() in model_keys:
|
|
511
|
+
selected = choice.strip()
|
|
512
|
+
break
|
|
513
|
+
print_error(f"Invalid choice. Enter 1-{len(model_keys)} or a model key.")
|
|
514
|
+
|
|
515
|
+
spec = EMBEDDING_MODELS[selected]
|
|
516
|
+
|
|
517
|
+
if selected != "hash":
|
|
518
|
+
typer.echo(f"\n Model: {typer.style(spec['name'], fg=typer.colors.CYAN)}")
|
|
519
|
+
typer.echo(f" Download: {typer.style(spec['size'], fg=typer.colors.YELLOW)}")
|
|
520
|
+
typer.echo(f" Dim: {spec['dim']}")
|
|
521
|
+
print_info("Requires: pip install codegraph-cli[embeddings]")
|
|
522
|
+
else:
|
|
523
|
+
typer.echo(f"\n Model: {typer.style('Hash Embedding (zero-dependency)', fg=typer.colors.CYAN)}")
|
|
524
|
+
print_info("No download needed, but no semantic understanding.")
|
|
525
|
+
|
|
526
|
+
success = config_manager.save_embedding_config(selected)
|
|
527
|
+
if success:
|
|
528
|
+
print_success(f"Embedding model set to: {selected}")
|
|
529
|
+
if selected != "hash":
|
|
530
|
+
print_info(f"Model will be downloaded on first use (~{spec['size']}).")
|
|
531
|
+
print_info("Re-index your project after changing embeddings: cg index <path>")
|
|
532
|
+
else:
|
|
533
|
+
print_error("Failed to save embedding config!")
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def set_embedding(
|
|
537
|
+
model: str = typer.Argument(
|
|
538
|
+
...,
|
|
539
|
+
help="Embedding model key: qodo-1.5b, jina-code, bge-base, minilm, hash",
|
|
540
|
+
),
|
|
541
|
+
):
|
|
542
|
+
"""Set the embedding model for semantic code search.
|
|
543
|
+
|
|
544
|
+
Available models (smallest to largest):
|
|
545
|
+
|
|
546
|
+
hash 0 bytes No download, keyword-level only
|
|
547
|
+
minilm ~80 MB Tiny, fast, decent quality
|
|
548
|
+
bge-base ~440 MB Solid general-purpose
|
|
549
|
+
jina-code ~550 MB Code-aware, good quality
|
|
550
|
+
qodo-1.5b ~6.2 GB Best quality, code-optimized
|
|
551
|
+
|
|
552
|
+
Examples:
|
|
553
|
+
cg set-embedding minilm
|
|
554
|
+
cg set-embedding jina-code
|
|
555
|
+
cg set-embedding hash
|
|
556
|
+
"""
|
|
557
|
+
model = model.lower().strip()
|
|
558
|
+
|
|
559
|
+
if model not in EMBEDDING_MODELS:
|
|
560
|
+
print_error(
|
|
561
|
+
f"Unknown model '{model}'. "
|
|
562
|
+
f"Choose from: {', '.join(EMBEDDING_MODELS.keys())}"
|
|
563
|
+
)
|
|
564
|
+
raise typer.Exit(code=1)
|
|
565
|
+
|
|
566
|
+
spec = EMBEDDING_MODELS[model]
|
|
567
|
+
success = config_manager.save_embedding_config(model)
|
|
568
|
+
|
|
569
|
+
if success:
|
|
570
|
+
print_success(f"Embedding model set to: {model}")
|
|
571
|
+
typer.echo(f" Name: {typer.style(spec['name'], fg=typer.colors.CYAN)}")
|
|
572
|
+
typer.echo(f" Dim: {spec['dim']}")
|
|
573
|
+
if model != "hash":
|
|
574
|
+
typer.echo(f" Size: {spec['size']} (downloaded on first use)")
|
|
575
|
+
print_info("Re-index your project after changing: cg index <path>")
|
|
576
|
+
else:
|
|
577
|
+
print_error("Failed to save configuration!")
|
|
578
|
+
raise typer.Exit(code=1)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def unset_embedding():
|
|
582
|
+
"""Reset embedding model to default (hash — no download)."""
|
|
583
|
+
success = config_manager.clear_embedding_config()
|
|
584
|
+
if success:
|
|
585
|
+
print_success("Embedding model reset to default (hash).")
|
|
586
|
+
print_info("No neural model will be used. Re-index to apply.")
|
|
587
|
+
else:
|
|
588
|
+
print_error("Failed to reset embedding config!")
|
|
589
|
+
raise typer.Exit(code=1)
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def show_embedding():
|
|
593
|
+
"""Show current embedding model configuration."""
|
|
594
|
+
typer.echo("")
|
|
595
|
+
typer.echo(typer.style("╭──────────────────────────────────────────────╮", fg=typer.colors.CYAN))
|
|
596
|
+
typer.echo(typer.style("│", fg=typer.colors.CYAN) + typer.style(" Embedding Configuration ", bold=True) + typer.style("│", fg=typer.colors.CYAN))
|
|
597
|
+
typer.echo(typer.style("╰──────────────────────────────────────────────╯", fg=typer.colors.CYAN))
|
|
598
|
+
|
|
599
|
+
emb_cfg = config_manager.load_embedding_config()
|
|
600
|
+
current_key = emb_cfg.get("model", "hash")
|
|
601
|
+
spec = EMBEDDING_MODELS.get(current_key)
|
|
602
|
+
|
|
603
|
+
if spec is None:
|
|
604
|
+
typer.echo(f" Model {typer.style(current_key, fg=typer.colors.RED)} (unknown)")
|
|
605
|
+
else:
|
|
606
|
+
typer.echo(f" Model {typer.style(f' {current_key} ', bg=typer.colors.CYAN, fg=typer.colors.WHITE, bold=True)}")
|
|
607
|
+
typer.echo(f" Name {typer.style(spec['name'], bold=True)}")
|
|
608
|
+
typer.echo(f" Dim {spec['dim']}")
|
|
609
|
+
typer.echo(f" Size {spec['size']}")
|
|
610
|
+
typer.echo(f" Desc {spec['description']}")
|
|
611
|
+
|
|
612
|
+
typer.echo("")
|
|
613
|
+
typer.echo(typer.style(" Available Models", bold=True))
|
|
614
|
+
typer.echo(typer.style(" ─────────────────────────────────────────", dim=True))
|
|
615
|
+
for key, s in EMBEDDING_MODELS.items():
|
|
616
|
+
marker = typer.style(" *", fg=typer.colors.GREEN) if key == current_key else " "
|
|
617
|
+
typer.echo(f" {marker} {key.ljust(12)} {s['size'].ljust(12)} {s['description']}")
|
|
618
|
+
|
|
619
|
+
typer.echo("")
|
|
620
|
+
typer.echo(typer.style(" Quick Commands", bold=True))
|
|
621
|
+
typer.echo(typer.style(" ─────────────────────────────────────────", dim=True))
|
|
622
|
+
typer.echo(f" {typer.style('cg set-embedding <model>', fg=typer.colors.YELLOW)} Switch model")
|
|
623
|
+
typer.echo(f" {typer.style('cg unset-embedding', fg=typer.colors.YELLOW)} Reset to hash")
|
|
624
|
+
typer.echo("")
|
|
625
|
+
|
|
626
|
+
|
|
469
627
|
if __name__ == "__main__":
|
|
470
628
|
app()
|
codegraph_cli/config.py
CHANGED
|
@@ -13,10 +13,12 @@ SUPPORTED_EXTENSIONS = {".py"}
|
|
|
13
13
|
|
|
14
14
|
# Load configuration from TOML file (if available)
|
|
15
15
|
try:
|
|
16
|
-
from .config_manager import load_config
|
|
16
|
+
from .config_manager import load_config, load_embedding_config
|
|
17
17
|
_toml_config = load_config()
|
|
18
|
+
_emb_config = load_embedding_config()
|
|
18
19
|
except ImportError:
|
|
19
20
|
_toml_config = {}
|
|
21
|
+
_emb_config = {}
|
|
20
22
|
|
|
21
23
|
# LLM Provider Configuration — loaded from ~/.codegraph/config.toml (set via `cg setup` or `cg set-llm`)
|
|
22
24
|
LLM_PROVIDER = _toml_config.get("provider", "ollama")
|
|
@@ -24,6 +26,9 @@ LLM_API_KEY = _toml_config.get("api_key", "")
|
|
|
24
26
|
LLM_MODEL = _toml_config.get("model", "qwen2.5-coder:7b")
|
|
25
27
|
LLM_ENDPOINT = _toml_config.get("endpoint", "http://127.0.0.1:11434/api/generate")
|
|
26
28
|
|
|
29
|
+
# Embedding model — set via `cg set-embedding` (default: "hash" = no download)
|
|
30
|
+
EMBEDDING_MODEL = _emb_config.get("model", "hash")
|
|
31
|
+
|
|
27
32
|
|
|
28
33
|
def ensure_base_dirs() -> None:
|
|
29
34
|
"""Create base directories for local storage if needed."""
|
codegraph_cli/config_manager.py
CHANGED
|
@@ -78,11 +78,37 @@ def load_config() -> Dict[str, Any]:
|
|
|
78
78
|
return DEFAULT_CONFIGS["ollama"].copy()
|
|
79
79
|
|
|
80
80
|
|
|
81
|
+
def load_full_config() -> Dict[str, Any]:
|
|
82
|
+
"""Load the entire TOML config (all sections)."""
|
|
83
|
+
if not CONFIG_FILE.exists() or toml is None:
|
|
84
|
+
return {}
|
|
85
|
+
try:
|
|
86
|
+
with open(CONFIG_FILE, "r") as f:
|
|
87
|
+
return toml.load(f)
|
|
88
|
+
except Exception:
|
|
89
|
+
return {}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _save_full_config(config: Dict[str, Any]) -> bool:
|
|
93
|
+
"""Write entire config dict to TOML file, preserving all sections."""
|
|
94
|
+
if toml is None:
|
|
95
|
+
return False
|
|
96
|
+
BASE_DIR.mkdir(parents=True, exist_ok=True)
|
|
97
|
+
try:
|
|
98
|
+
with open(CONFIG_FILE, "w") as f:
|
|
99
|
+
toml.dump(config, f)
|
|
100
|
+
return True
|
|
101
|
+
except Exception:
|
|
102
|
+
return False
|
|
103
|
+
|
|
104
|
+
|
|
81
105
|
def save_config(provider: str, model: str, api_key: str = "", endpoint: str = "") -> bool:
|
|
82
106
|
"""Save LLM configuration to TOML file.
|
|
83
107
|
|
|
108
|
+
Preserves other sections (e.g. ``[embeddings]``) in the file.
|
|
109
|
+
|
|
84
110
|
Args:
|
|
85
|
-
provider: Provider name (ollama, groq, openai, anthropic)
|
|
111
|
+
provider: Provider name (ollama, groq, openai, anthropic, gemini, openrouter)
|
|
86
112
|
model: Model name
|
|
87
113
|
api_key: API key for cloud providers
|
|
88
114
|
endpoint: Custom endpoint (for Ollama)
|
|
@@ -90,32 +116,56 @@ def save_config(provider: str, model: str, api_key: str = "", endpoint: str = ""
|
|
|
90
116
|
Returns:
|
|
91
117
|
True if saved successfully, False otherwise
|
|
92
118
|
"""
|
|
93
|
-
|
|
94
|
-
return False
|
|
95
|
-
|
|
96
|
-
# Ensure directory exists
|
|
97
|
-
BASE_DIR.mkdir(parents=True, exist_ok=True)
|
|
119
|
+
config = load_full_config()
|
|
98
120
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
"
|
|
102
|
-
"provider": provider,
|
|
103
|
-
"model": model,
|
|
104
|
-
}
|
|
121
|
+
config["llm"] = {
|
|
122
|
+
"provider": provider,
|
|
123
|
+
"model": model,
|
|
105
124
|
}
|
|
106
|
-
|
|
107
125
|
if api_key:
|
|
108
126
|
config["llm"]["api_key"] = api_key
|
|
109
|
-
|
|
110
127
|
if endpoint:
|
|
111
128
|
config["llm"]["endpoint"] = endpoint
|
|
112
129
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
130
|
+
return _save_full_config(config)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ------------------------------------------------------------------
|
|
134
|
+
# Embedding configuration
|
|
135
|
+
# ------------------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
def load_embedding_config() -> Dict[str, Any]:
|
|
138
|
+
"""Load embedding configuration from ``[embeddings]`` section.
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
Dict with at least ``model`` key, or empty dict.
|
|
142
|
+
"""
|
|
143
|
+
full = load_full_config()
|
|
144
|
+
return full.get("embeddings", {})
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def save_embedding_config(model_key: str) -> bool:
|
|
148
|
+
"""Save embedding model choice to config TOML.
|
|
149
|
+
|
|
150
|
+
Preserves ``[llm]`` and other sections.
|
|
151
|
+
|
|
152
|
+
Args:
|
|
153
|
+
model_key: One of the keys from ``EMBEDDING_MODELS``
|
|
154
|
+
(e.g. ``"minilm"``, ``"jina-code"``, ``"hash"``).
|
|
155
|
+
|
|
156
|
+
Returns:
|
|
157
|
+
True if saved successfully.
|
|
158
|
+
"""
|
|
159
|
+
config = load_full_config()
|
|
160
|
+
config["embeddings"] = {"model": model_key}
|
|
161
|
+
return _save_full_config(config)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def clear_embedding_config() -> bool:
|
|
165
|
+
"""Remove ``[embeddings]`` section from config, resetting to default."""
|
|
166
|
+
config = load_full_config()
|
|
167
|
+
config.pop("embeddings", None)
|
|
168
|
+
return _save_full_config(config)
|
|
119
169
|
|
|
120
170
|
|
|
121
171
|
def get_provider_config(provider: str) -> Dict[str, Any]:
|
codegraph_cli/context_manager.py
CHANGED
codegraph_cli/crew_agents.py
CHANGED
|
@@ -4,7 +4,12 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import TYPE_CHECKING, List
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
try:
|
|
8
|
+
from crewai import Agent
|
|
9
|
+
CREWAI_AVAILABLE = True
|
|
10
|
+
except ImportError:
|
|
11
|
+
Agent = None # type: ignore
|
|
12
|
+
CREWAI_AVAILABLE = False
|
|
8
13
|
|
|
9
14
|
if TYPE_CHECKING:
|
|
10
15
|
from .crew_tools import create_tools
|
codegraph_cli/crew_chat.py
CHANGED
|
@@ -8,7 +8,11 @@ from typing import TYPE_CHECKING, Dict, List
|
|
|
8
8
|
|
|
9
9
|
from datetime import datetime
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
try:
|
|
12
|
+
from crewai import Agent, Crew, Task
|
|
13
|
+
CREWAI_AVAILABLE = True
|
|
14
|
+
except ImportError:
|
|
15
|
+
CREWAI_AVAILABLE = False
|
|
12
16
|
|
|
13
17
|
from .crew_agents import (
|
|
14
18
|
create_code_analysis_agent,
|
codegraph_cli/crew_tools.py
CHANGED
|
@@ -9,7 +9,15 @@ from datetime import datetime
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
try:
|
|
13
|
+
from crewai.tools import BaseTool
|
|
14
|
+
CREWAI_AVAILABLE = True
|
|
15
|
+
except ImportError:
|
|
16
|
+
# Provide a dummy base class so the module can still be imported
|
|
17
|
+
class BaseTool: # type: ignore
|
|
18
|
+
def __init_subclass__(cls, **kwargs): pass
|
|
19
|
+
def __init__(self, **kwargs): pass
|
|
20
|
+
CREWAI_AVAILABLE = False
|
|
13
21
|
from pydantic import BaseModel, Field, PrivateAttr
|
|
14
22
|
|
|
15
23
|
if TYPE_CHECKING:
|
codegraph_cli/embeddings.py
CHANGED
|
@@ -1,23 +1,32 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
1
|
+
"""Configurable code embedding engine with multiple model support.
|
|
2
|
+
|
|
3
|
+
Supported models (configure via ``cg set-embedding``):
|
|
4
|
+
|
|
5
|
+
========== ====================================== ========= ====== ======================
|
|
6
|
+
Key HuggingFace Model Download Dim Notes
|
|
7
|
+
========== ====================================== ========= ====== ======================
|
|
8
|
+
qodo-1.5b Qodo/Qodo-Embed-1-1.5B ~6.2 GB 1536 Best quality, code-optimized
|
|
9
|
+
jina-code jinaai/jina-embeddings-v2-base-code ~550 MB 768 Good quality, code-aware
|
|
10
|
+
bge-base BAAI/bge-base-en-v1.5 ~440 MB 768 Solid general-purpose
|
|
11
|
+
minilm sentence-transformers/all-MiniLM-L6-v2 ~80 MB 384 Tiny and fast
|
|
12
|
+
hash (none) 0 B 256 No ML, keyword-level only
|
|
13
|
+
========== ====================================== ========= ====== ======================
|
|
14
|
+
|
|
15
|
+
Architecture:
|
|
16
|
+
- Models downloaded once from HuggingFace and cached in ``~/.codegraph/models``.
|
|
17
|
+
- All inference runs on-device (CPU or GPU). No data leaves the machine.
|
|
18
|
+
- Uses raw ``transformers`` library only — no sentence-transformers, no flash_attn.
|
|
19
|
+
- Falls back to hash embeddings when ``torch``/``transformers`` are not installed.
|
|
10
20
|
"""
|
|
11
21
|
|
|
12
22
|
from __future__ import annotations
|
|
13
23
|
|
|
14
24
|
import logging
|
|
15
25
|
import math
|
|
16
|
-
import os
|
|
17
26
|
import re
|
|
18
27
|
from hashlib import blake2b
|
|
19
28
|
from pathlib import Path
|
|
20
|
-
from typing import Iterable, List, Optional, Union
|
|
29
|
+
from typing import Any, Dict, Iterable, List, Optional, Union
|
|
21
30
|
|
|
22
31
|
from .config import BASE_DIR
|
|
23
32
|
|
|
@@ -26,44 +35,115 @@ logger = logging.getLogger(__name__)
|
|
|
26
35
|
# Default local model cache directory
|
|
27
36
|
MODEL_CACHE_DIR: Path = BASE_DIR / "models"
|
|
28
37
|
|
|
29
|
-
# Preferred models in priority order
|
|
30
|
-
PREFERRED_MODELS: List[str] = [
|
|
31
|
-
"all-MiniLM-L6-v2",
|
|
32
|
-
"nomic-ai/nomic-embed-text-v1.5",
|
|
33
|
-
]
|
|
34
|
-
|
|
35
38
|
_TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")
|
|
36
39
|
|
|
37
40
|
|
|
38
41
|
# ===================================================================
|
|
39
|
-
#
|
|
42
|
+
# Model Registry
|
|
43
|
+
# ===================================================================
|
|
44
|
+
|
|
45
|
+
EMBEDDING_MODELS: Dict[str, Dict[str, Any]] = {
|
|
46
|
+
"qodo-1.5b": {
|
|
47
|
+
"name": "Qodo Embed 1.5B",
|
|
48
|
+
"hf_id": "Qodo/Qodo-Embed-1-1.5B",
|
|
49
|
+
"dim": 1536,
|
|
50
|
+
"max_tokens": 8192,
|
|
51
|
+
"size": "~6.2 GB",
|
|
52
|
+
"description": "Best quality, code-optimized (needs 8GB+ RAM)",
|
|
53
|
+
"pooling": "last_token",
|
|
54
|
+
"trust_remote_code": True,
|
|
55
|
+
},
|
|
56
|
+
"jina-code": {
|
|
57
|
+
"name": "Jina Embeddings v2 Code",
|
|
58
|
+
"hf_id": "jinaai/jina-embeddings-v2-base-code",
|
|
59
|
+
"dim": 768,
|
|
60
|
+
"max_tokens": 8192,
|
|
61
|
+
"size": "~550 MB",
|
|
62
|
+
"description": "Good quality, code-aware, lightweight",
|
|
63
|
+
"pooling": "mean",
|
|
64
|
+
"trust_remote_code": True,
|
|
65
|
+
},
|
|
66
|
+
"bge-base": {
|
|
67
|
+
"name": "BGE Base EN v1.5",
|
|
68
|
+
"hf_id": "BAAI/bge-base-en-v1.5",
|
|
69
|
+
"dim": 768,
|
|
70
|
+
"max_tokens": 512,
|
|
71
|
+
"size": "~440 MB",
|
|
72
|
+
"description": "Solid general-purpose, fast",
|
|
73
|
+
"pooling": "cls",
|
|
74
|
+
"trust_remote_code": False,
|
|
75
|
+
},
|
|
76
|
+
"minilm": {
|
|
77
|
+
"name": "MiniLM L6 v2",
|
|
78
|
+
"hf_id": "sentence-transformers/all-MiniLM-L6-v2",
|
|
79
|
+
"dim": 384,
|
|
80
|
+
"max_tokens": 256,
|
|
81
|
+
"size": "~80 MB",
|
|
82
|
+
"description": "Tiny and fast, decent quality",
|
|
83
|
+
"pooling": "mean",
|
|
84
|
+
"trust_remote_code": False,
|
|
85
|
+
},
|
|
86
|
+
"hash": {
|
|
87
|
+
"name": "Hash Embedding",
|
|
88
|
+
"hf_id": None,
|
|
89
|
+
"dim": 256,
|
|
90
|
+
"max_tokens": None,
|
|
91
|
+
"size": "0 bytes",
|
|
92
|
+
"description": "Zero-dependency fallback, no semantics",
|
|
93
|
+
"pooling": None,
|
|
94
|
+
"trust_remote_code": False,
|
|
95
|
+
},
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
DEFAULT_MODEL = "hash"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ===================================================================
|
|
102
|
+
# TransformerEmbedder (handles all HuggingFace models)
|
|
40
103
|
# ===================================================================
|
|
41
104
|
|
|
42
|
-
class
|
|
43
|
-
"""
|
|
105
|
+
class TransformerEmbedder:
|
|
106
|
+
"""Generic HuggingFace embedding engine with configurable pooling.
|
|
44
107
|
|
|
45
|
-
|
|
46
|
-
``~/.codegraph/models`` so that subsequent runs are fully offline.
|
|
47
|
-
All computation is local – **no data leaves the machine**.
|
|
108
|
+
Supports multiple pooling strategies:
|
|
48
109
|
|
|
49
|
-
|
|
110
|
+
- **last_token** — last non-padding token (Qodo models).
|
|
111
|
+
- **mean** — mean over non-padding tokens (Jina, MiniLM).
|
|
112
|
+
- **cls** — ``[CLS]`` first token (BGE models).
|
|
50
113
|
|
|
51
|
-
|
|
52
|
-
|
|
114
|
+
Model weights are downloaded on first use and cached in
|
|
115
|
+
``~/.codegraph/models/`` for offline subsequent runs.
|
|
53
116
|
"""
|
|
54
117
|
|
|
55
118
|
def __init__(
|
|
56
119
|
self,
|
|
57
|
-
|
|
120
|
+
model_key: str,
|
|
58
121
|
cache_dir: Optional[Path] = None,
|
|
59
122
|
device: str = "cpu",
|
|
60
123
|
) -> None:
|
|
61
|
-
|
|
124
|
+
if model_key not in EMBEDDING_MODELS:
|
|
125
|
+
raise ValueError(
|
|
126
|
+
f"Unknown model: '{model_key}'. "
|
|
127
|
+
f"Available: {', '.join(EMBEDDING_MODELS.keys())}"
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
spec = EMBEDDING_MODELS[model_key]
|
|
131
|
+
if spec["hf_id"] is None:
|
|
132
|
+
raise ValueError(
|
|
133
|
+
f"'{model_key}' has no transformer backend. Use HashEmbeddingModel."
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
self.model_key = model_key
|
|
137
|
+
self.hf_id: str = spec["hf_id"]
|
|
138
|
+
self.dim: int = spec["dim"]
|
|
139
|
+
self.max_length: int = spec["max_tokens"]
|
|
140
|
+
self.pooling: str = spec["pooling"]
|
|
141
|
+
self.trust_remote_code: bool = spec["trust_remote_code"]
|
|
62
142
|
self.cache_dir = cache_dir or MODEL_CACHE_DIR
|
|
63
143
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
64
144
|
self.device = device
|
|
65
|
-
self._model:
|
|
66
|
-
self.
|
|
145
|
+
self._model: Any = None
|
|
146
|
+
self._tokenizer: Any = None
|
|
67
147
|
|
|
68
148
|
# ------------------------------------------------------------------
|
|
69
149
|
# Lazy model loading
|
|
@@ -74,100 +154,154 @@ class NeuralEmbedder:
|
|
|
74
154
|
return
|
|
75
155
|
|
|
76
156
|
try:
|
|
77
|
-
|
|
157
|
+
import torch # noqa: F401
|
|
158
|
+
from transformers import AutoModel, AutoTokenizer
|
|
78
159
|
except ImportError:
|
|
79
160
|
raise ImportError(
|
|
80
|
-
"
|
|
81
|
-
"Install with: pip install
|
|
161
|
+
"torch and transformers are required for neural embeddings.\n"
|
|
162
|
+
"Install with: pip install codegraph-cli[embeddings]\n"
|
|
163
|
+
"For CPU-only (skip NVIDIA packages):\n"
|
|
164
|
+
" pip install torch --index-url https://download.pytorch.org/whl/cpu\n"
|
|
165
|
+
" pip install transformers"
|
|
82
166
|
)
|
|
83
167
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
168
|
+
logger.info(
|
|
169
|
+
"Loading embedding model '%s' (%s) — first run downloads %s...",
|
|
170
|
+
self.model_key, self.hf_id, EMBEDDING_MODELS[self.model_key]["size"],
|
|
87
171
|
)
|
|
88
172
|
|
|
89
173
|
try:
|
|
90
|
-
self.
|
|
91
|
-
self.
|
|
92
|
-
|
|
93
|
-
|
|
174
|
+
self._tokenizer = AutoTokenizer.from_pretrained(
|
|
175
|
+
self.hf_id,
|
|
176
|
+
cache_dir=str(self.cache_dir),
|
|
177
|
+
trust_remote_code=self.trust_remote_code,
|
|
94
178
|
)
|
|
95
|
-
self.
|
|
179
|
+
self._model = AutoModel.from_pretrained(
|
|
180
|
+
self.hf_id,
|
|
181
|
+
cache_dir=str(self.cache_dir),
|
|
182
|
+
trust_remote_code=self.trust_remote_code,
|
|
183
|
+
)
|
|
184
|
+
self._model.eval()
|
|
185
|
+
self._model.to(self.device)
|
|
96
186
|
logger.info(
|
|
97
|
-
"Loaded
|
|
98
|
-
self.
|
|
187
|
+
"Loaded '%s' (dim=%d, pooling=%s) on %s",
|
|
188
|
+
self.model_key, self.dim, self.pooling, self.device,
|
|
99
189
|
)
|
|
100
190
|
except Exception as exc:
|
|
101
191
|
raise RuntimeError(
|
|
102
|
-
f"Failed to load embedding model '{self.
|
|
192
|
+
f"Failed to load embedding model '{self.model_key}' "
|
|
193
|
+
f"({self.hf_id}): {exc}"
|
|
103
194
|
) from exc
|
|
104
195
|
|
|
105
196
|
# ------------------------------------------------------------------
|
|
106
|
-
#
|
|
197
|
+
# Pooling strategies
|
|
107
198
|
# ------------------------------------------------------------------
|
|
108
199
|
|
|
109
|
-
@
|
|
110
|
-
def
|
|
111
|
-
"""
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
200
|
+
@staticmethod
|
|
201
|
+
def _pool_last_token(last_hidden_states: Any, attention_mask: Any) -> Any:
|
|
202
|
+
"""Last non-padding token (Qodo style)."""
|
|
203
|
+
import torch
|
|
204
|
+
|
|
205
|
+
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
|
|
206
|
+
if left_padding:
|
|
207
|
+
return last_hidden_states[:, -1]
|
|
208
|
+
sequence_lengths = attention_mask.sum(dim=1) - 1
|
|
209
|
+
batch_size = last_hidden_states.shape[0]
|
|
210
|
+
return last_hidden_states[
|
|
211
|
+
torch.arange(batch_size, device=last_hidden_states.device),
|
|
212
|
+
sequence_lengths,
|
|
213
|
+
]
|
|
214
|
+
|
|
215
|
+
@staticmethod
|
|
216
|
+
def _pool_mean(last_hidden_states: Any, attention_mask: Any) -> Any:
|
|
217
|
+
"""Mean over non-padding tokens (Jina, MiniLM)."""
|
|
218
|
+
mask_expanded = attention_mask.unsqueeze(-1).expand(
|
|
219
|
+
last_hidden_states.size()
|
|
220
|
+
).float()
|
|
221
|
+
sum_embeddings = (last_hidden_states * mask_expanded).sum(dim=1)
|
|
222
|
+
sum_mask = mask_expanded.sum(dim=1).clamp(min=1e-9)
|
|
223
|
+
return sum_embeddings / sum_mask
|
|
224
|
+
|
|
225
|
+
@staticmethod
|
|
226
|
+
def _pool_cls(last_hidden_states: Any, attention_mask: Any) -> Any:
|
|
227
|
+
"""[CLS] first token (BGE)."""
|
|
228
|
+
return last_hidden_states[:, 0]
|
|
229
|
+
|
|
230
|
+
def _pool(self, last_hidden_states: Any, attention_mask: Any) -> Any:
|
|
231
|
+
"""Dispatch to the pooling strategy for this model."""
|
|
232
|
+
if self.pooling == "last_token":
|
|
233
|
+
return self._pool_last_token(last_hidden_states, attention_mask)
|
|
234
|
+
if self.pooling == "mean":
|
|
235
|
+
return self._pool_mean(last_hidden_states, attention_mask)
|
|
236
|
+
if self.pooling == "cls":
|
|
237
|
+
return self._pool_cls(last_hidden_states, attention_mask)
|
|
238
|
+
raise ValueError(f"Unknown pooling strategy: {self.pooling}")
|
|
239
|
+
|
|
240
|
+
# ------------------------------------------------------------------
|
|
241
|
+
# Encode
|
|
242
|
+
# ------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
def _encode(self, texts: List[str]) -> List[List[float]]:
|
|
245
|
+
"""Encode a batch of texts into L2-normalised embedding vectors."""
|
|
246
|
+
import torch
|
|
247
|
+
import torch.nn.functional as F
|
|
116
248
|
|
|
117
|
-
def embed_text(self, text: str) -> List[float]:
|
|
118
|
-
"""Embed a single text string and return a unit-norm vector."""
|
|
119
249
|
self._load_model()
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
250
|
+
|
|
251
|
+
batch_dict = self._tokenizer(
|
|
252
|
+
texts,
|
|
253
|
+
max_length=self.max_length,
|
|
254
|
+
padding=True,
|
|
255
|
+
truncation=True,
|
|
256
|
+
return_tensors="pt",
|
|
257
|
+
)
|
|
258
|
+
batch_dict = {k: v.to(self.device) for k, v in batch_dict.items()}
|
|
259
|
+
|
|
260
|
+
with torch.no_grad():
|
|
261
|
+
outputs = self._model(**batch_dict)
|
|
262
|
+
|
|
263
|
+
embeddings = self._pool(
|
|
264
|
+
outputs.last_hidden_state, batch_dict["attention_mask"],
|
|
126
265
|
)
|
|
127
|
-
|
|
266
|
+
embeddings = F.normalize(embeddings, p=2, dim=1)
|
|
267
|
+
return embeddings.cpu().tolist()
|
|
268
|
+
|
|
269
|
+
# ------------------------------------------------------------------
|
|
270
|
+
# Public API
|
|
271
|
+
# ------------------------------------------------------------------
|
|
272
|
+
|
|
273
|
+
def embed_text(self, text: str) -> List[float]:
    """Embed one string by encoding it as a single-item batch.

    Returns the first (and only) vector produced by ``_encode``.
    """
    single_item_batch = [text]
    vectors = self._encode(single_item_batch)
    return vectors[0]
|
|
128
276
|
|
|
129
277
|
def embed_documents(
    self,
    texts: List[str],
    batch_size: int = 16,
) -> List[List[float]]:
    """Embed multiple documents with batching.

    Args:
        texts: Text strings to embed.
        batch_size: Maximum number of texts passed to each ``_encode`` call.

    Returns:
        The vectors returned by ``_encode`` for each batch, concatenated
        batch by batch. An empty ``texts`` returns an empty list.

    Raises:
        ValueError: If ``texts`` is non-empty and ``batch_size`` is below 1.
    """
    if not texts:
        return []
    # A negative step makes ``range`` empty and would silently drop every
    # text; zero would surface as an opaque "range() arg 3" error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_embeddings: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        all_embeddings.extend(self._encode(texts[start : start + batch_size]))
    return all_embeddings
|
|
155
289
|
|
|
156
|
-
# Backward-compat alias used by legacy callers
|
|
157
290
|
def embed_many(self, texts: Iterable[str]) -> List[List[float]]:
    """Alias for :meth:`embed_documents` that accepts any iterable."""
    # Materialise first: ``embed_documents`` is given a concrete list.
    materialised = list(texts)
    return self.embed_documents(materialised)
|
|
160
293
|
|
|
161
294
|
|
|
162
295
|
# ===================================================================
|
|
163
|
-
# HashEmbeddingModel (
|
|
296
|
+
# HashEmbeddingModel (Zero-dependency fallback)
|
|
164
297
|
# ===================================================================
|
|
165
298
|
|
|
166
299
|
class HashEmbeddingModel:
|
|
167
|
-
"""Deterministic token-hashing embedder
|
|
300
|
+
"""Deterministic token-hashing embedder — no ML dependencies.
|
|
168
301
|
|
|
169
|
-
Provides basic keyword-level similarity.
|
|
170
|
-
|
|
302
|
+
Provides basic keyword-level similarity. Used as the default when
|
|
303
|
+
``torch``/``transformers`` are not installed or when ``hash`` is
|
|
304
|
+
selected via ``cg set-embedding hash``.
|
|
171
305
|
"""
|
|
172
306
|
|
|
173
307
|
def __init__(self, dim: int = 256) -> None:
|
|
@@ -189,7 +323,7 @@ class HashEmbeddingModel:
|
|
|
189
323
|
return [self.embed_text(text) for text in texts]
|
|
190
324
|
|
|
191
325
|
def embed_documents(self, texts: List[str]) -> List[List[float]]:
    """Delegate to :meth:`embed_many`, mirroring the TransformerEmbedder interface."""
    vectors = self.embed_many(texts)
    return vectors
|
|
194
328
|
|
|
195
329
|
|
|
@@ -198,27 +332,61 @@ class HashEmbeddingModel:
|
|
|
198
332
|
# ===================================================================
|
|
199
333
|
|
|
200
334
|
def get_embedder(
    model_key: Optional[str] = None,
    cache_dir: Optional[Path] = None,
    device: str = "cpu",
) -> Union[TransformerEmbedder, HashEmbeddingModel]:
    """Return the configured embedder.

    Resolution order:

    1. Explicit ``model_key`` argument.
    2. The ``model`` entry returned by ``load_embedding_config()``
       (``[embeddings].model`` from ``~/.codegraph/config.toml``).
    3. The module-level ``DEFAULT_MODEL``.

    A :class:`HashEmbeddingModel` is returned when the resolved key is
    ``"hash"``, is not present in ``EMBEDDING_MODELS``, or maps to a spec
    whose ``hf_id`` is ``None``. If a transformer model is configured but
    ``torch``/``transformers`` are missing, falls back to hash with a
    warning.

    Args:
        model_key: Key into ``EMBEDDING_MODELS``; ``None`` means "resolve
            from configuration".
        cache_dir: Forwarded to :class:`TransformerEmbedder`.
        device: Forwarded to :class:`TransformerEmbedder`.

    Returns:
        A :class:`TransformerEmbedder` or a :class:`HashEmbeddingModel`.
    """
    if model_key is None:
        try:
            from .config_manager import load_embedding_config

            model_key = load_embedding_config().get("model")
        except Exception:
            # Deliberately best-effort: a broken or missing config must not
            # prevent embedding, but record why the default is being used.
            logger.debug(
                "Could not load embedding config; using default model.",
                exc_info=True,
            )
            model_key = None

    # Default if nothing is configured
    if model_key is None:
        model_key = DEFAULT_MODEL

    # Hash path — no ML needed
    if model_key == "hash":
        return HashEmbeddingModel()

    # Unknown model guard
    if model_key not in EMBEDDING_MODELS:
        logger.warning(
            "Unknown embedding model '%s' — falling back to hash.", model_key,
        )
        return HashEmbeddingModel()

    spec = EMBEDDING_MODELS[model_key]
    if spec["hf_id"] is None:
        return HashEmbeddingModel()

    # Transformer path — check dependencies
    try:
        import torch  # noqa: F401
        import transformers  # noqa: F401
        return TransformerEmbedder(
            model_key=model_key, cache_dir=cache_dir, device=device,
        )
    except ImportError:
        logger.warning(
            "Embedding model '%s' requires torch + transformers. "
            "Falling back to hash embeddings. Install with: "
            "pip install codegraph-cli[embeddings]",
            model_key,
        )
        return HashEmbeddingModel()
|
|
224
392
|
|
codegraph_cli/orchestrator.py
CHANGED
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Dict, List
|
|
7
7
|
|
|
8
8
|
from .agents import GraphAgent, RAGAgent, SummarizationAgent
|
|
9
|
-
from .embeddings import
|
|
9
|
+
from .embeddings import get_embedder
|
|
10
10
|
from .llm import LocalLLM
|
|
11
11
|
from .models import ImpactReport, SearchResult
|
|
12
12
|
from .rag import RAGRetriever
|
|
@@ -25,7 +25,7 @@ class MCPOrchestrator:
|
|
|
25
25
|
llm_endpoint: str | None = None,
|
|
26
26
|
):
|
|
27
27
|
self.store = store
|
|
28
|
-
self.embedding_model =
|
|
28
|
+
self.embedding_model = get_embedder()
|
|
29
29
|
self.graph_agent = GraphAgent(store, self.embedding_model)
|
|
30
30
|
self.rag_agent = RAGAgent(RAGRetriever(store, self.embedding_model))
|
|
31
31
|
self.summarization_agent = SummarizationAgent(
|
codegraph_cli/rag.py
CHANGED
|
@@ -11,7 +11,7 @@ import json
|
|
|
11
11
|
import logging
|
|
12
12
|
from typing import Any, Dict, List, Optional, Union
|
|
13
13
|
|
|
14
|
-
from .embeddings import HashEmbeddingModel,
|
|
14
|
+
from .embeddings import HashEmbeddingModel, TransformerEmbedder, cosine_similarity
|
|
15
15
|
from .models import SearchResult
|
|
16
16
|
from .storage import GraphStore
|
|
17
17
|
|
|
@@ -29,14 +29,14 @@ class RAGRetriever:
|
|
|
29
29
|
cosine similarity in Python.
|
|
30
30
|
|
|
31
31
|
The ``embedding_model`` argument accepts either a
|
|
32
|
-
:class:`~codegraph_cli.embeddings.
|
|
32
|
+
:class:`~codegraph_cli.embeddings.TransformerEmbedder` or the lightweight
|
|
33
33
|
:class:`~codegraph_cli.embeddings.HashEmbeddingModel`.
|
|
34
34
|
"""
|
|
35
35
|
|
|
36
36
|
def __init__(
|
|
37
37
|
self,
|
|
38
38
|
store: GraphStore,
|
|
39
|
-
embedding_model: Union[
|
|
39
|
+
embedding_model: Union[TransformerEmbedder, HashEmbeddingModel, Any],
|
|
40
40
|
) -> None:
|
|
41
41
|
self.store = store
|
|
42
42
|
self.embedding_model = embedding_model
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codegraph-cli
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.1.1
|
|
4
4
|
Summary: AI-powered code intelligence CLI with multi-agent analysis, impact graphs, and conversational coding.
|
|
5
|
-
Author-email: Ali Nasir <
|
|
5
|
+
Author-email: Ali Nasir <muhammadalinasir00786@gmail.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/al1-nasir/codegraph-cli
|
|
8
8
|
Project-URL: Documentation, https://github.com/al1-nasir/codegraph-cli#readme
|
|
@@ -31,7 +31,6 @@ Requires-Dist: typer<1.0.0,>=0.12.0
|
|
|
31
31
|
Requires-Dist: toml>=0.10.2
|
|
32
32
|
Requires-Dist: lancedb>=0.4.0
|
|
33
33
|
Requires-Dist: pyarrow>=14.0.0
|
|
34
|
-
Requires-Dist: sentence-transformers>=2.2.0
|
|
35
34
|
Requires-Dist: tree-sitter>=0.24.0
|
|
36
35
|
Requires-Dist: tree-sitter-python>=0.23.0
|
|
37
36
|
Requires-Dist: tree-sitter-javascript>=0.23.0
|
|
@@ -45,9 +44,13 @@ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
|
45
44
|
Requires-Dist: pytest-mock>=3.11.0; extra == "dev"
|
|
46
45
|
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
47
46
|
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
47
|
+
Provides-Extra: embeddings
|
|
48
|
+
Requires-Dist: torch>=2.0.0; extra == "embeddings"
|
|
49
|
+
Requires-Dist: transformers<5.0.0,>=4.48.0; extra == "embeddings"
|
|
48
50
|
Provides-Extra: all
|
|
49
51
|
Requires-Dist: crewai>=0.80.0; extra == "all"
|
|
50
|
-
Requires-Dist:
|
|
52
|
+
Requires-Dist: torch>=2.0.0; extra == "all"
|
|
53
|
+
Requires-Dist: transformers<5.0.0,>=4.48.0; extra == "all"
|
|
51
54
|
Dynamic: license-file
|
|
52
55
|
|
|
53
56
|
# CodeGraph CLI
|
|
@@ -56,7 +59,7 @@ Dynamic: license-file
|
|
|
56
59
|
|
|
57
60
|
[](LICENSE)
|
|
58
61
|
[](https://www.python.org)
|
|
59
|
-
[](https://github.com/al1-nasir/codegraph-cli)
|
|
60
63
|
|
|
61
64
|
---
|
|
62
65
|
|
|
@@ -81,12 +84,24 @@ Core capabilities:
|
|
|
81
84
|
pip install codegraph-cli
|
|
82
85
|
```
|
|
83
86
|
|
|
87
|
+
With neural embedding models (semantic code search):
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install codegraph-cli[embeddings]
|
|
91
|
+
```
|
|
92
|
+
|
|
84
93
|
With CrewAI multi-agent support:
|
|
85
94
|
|
|
86
95
|
```bash
|
|
87
96
|
pip install codegraph-cli[crew]
|
|
88
97
|
```
|
|
89
98
|
|
|
99
|
+
Everything:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
pip install codegraph-cli[all]
|
|
103
|
+
```
|
|
104
|
+
|
|
90
105
|
For development:
|
|
91
106
|
|
|
92
107
|
```bash
|
|
@@ -153,6 +168,34 @@ cg unset-llm # reset to defaults
|
|
|
153
168
|
|
|
154
169
|
---
|
|
155
170
|
|
|
171
|
+
## Embedding Models
|
|
172
|
+
|
|
173
|
+
CodeGraph supports configurable embedding models for semantic code search. Choose based on your hardware and quality needs:
|
|
174
|
+
|
|
175
|
+
| Model | Download | Dim | Quality | Command |
|
|
176
|
+
|-------|----------|-----|---------|---------|
|
|
177
|
+
| hash | 0 bytes | 256 | Keyword-only | `cg set-embedding hash` |
|
|
178
|
+
| minilm | ~80 MB | 384 | Decent | `cg set-embedding minilm` |
|
|
179
|
+
| bge-base | ~440 MB | 768 | Good | `cg set-embedding bge-base` |
|
|
180
|
+
| jina-code | ~550 MB | 768 | Code-aware | `cg set-embedding jina-code` |
|
|
181
|
+
| qodo-1.5b | ~6.2 GB | 1536 | Best | `cg set-embedding qodo-1.5b` |
|
|
182
|
+
|
|
183
|
+
The default is `hash` (zero-dependency, no download). Neural models require the `[embeddings]` extra and are downloaded on first use from HuggingFace.
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
cg set-embedding jina-code # switch to a neural model
|
|
187
|
+
cg show-embedding # view current model and all options
|
|
188
|
+
cg unset-embedding # reset to hash default
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
After changing the embedding model, re-index your project:
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
cg index /path/to/project
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
156
199
|
## Commands
|
|
157
200
|
|
|
158
201
|
### Project Management
|
|
@@ -249,8 +292,9 @@ CLI Layer (Typer)
|
|
|
249
292
|
| | |
|
|
250
293
|
| +-- Parser (tree-sitter) +-- VectorStore (LanceDB)
|
|
251
294
|
| +-- RAGRetriever |
|
|
252
|
-
| +-- LLM Adapter +-- Embeddings
|
|
253
|
-
|
|
|
295
|
+
| +-- LLM Adapter +-- Embeddings (configurable)
|
|
296
|
+
| hash | minilm | bge-base
|
|
297
|
+
| jina-code | qodo-1.5b
|
|
254
298
|
+-- ChatAgent (standard mode)
|
|
255
299
|
|
|
|
256
300
|
+-- CrewChatAgent (--crew mode)
|
|
@@ -261,6 +305,8 @@ CLI Layer (Typer)
|
|
|
261
305
|
+-- Code Analysis Agent ---> 3 search/analysis tools
|
|
262
306
|
```
|
|
263
307
|
|
|
308
|
+
**Embeddings**: Five models are available via `cg set-embedding`, ranging from Hash (default, zero-dependency) to Qodo-Embed-1-1.5B (best quality, ~6.2 GB download). Neural models use raw `transformers` + `torch` — no sentence-transformers overhead. Models are cached in `~/.codegraph/models/`.
|
|
309
|
+
|
|
264
310
|
**Parser**: tree-sitter grammars for Python, JavaScript, and TypeScript. Extracts modules, classes, functions, imports, and call relationships into a directed graph.
|
|
265
311
|
|
|
266
312
|
**Storage**: SQLite for the code graph (nodes + edges), LanceDB for vector embeddings. All data stored under `~/.codegraph/`.
|
|
@@ -275,14 +321,14 @@ CLI Layer (Typer)
|
|
|
275
321
|
codegraph_cli/
|
|
276
322
|
cli.py # main Typer application, all top-level commands
|
|
277
323
|
cli_chat.py # interactive chat REPL with styled output
|
|
278
|
-
cli_setup.py # setup wizard, set-llm, unset-llm,
|
|
324
|
+
cli_setup.py # setup wizard, set-llm, unset-llm, set-embedding
|
|
279
325
|
cli_v2.py # v2 code generation commands
|
|
280
326
|
config.py # loads config from TOML
|
|
281
|
-
config_manager.py # TOML read/write, provider
|
|
327
|
+
config_manager.py # TOML read/write, provider and embedding config
|
|
282
328
|
llm.py # multi-provider LLM adapter
|
|
283
329
|
parser.py # tree-sitter AST parsing
|
|
284
330
|
storage.py # SQLite graph store
|
|
285
|
-
embeddings.py #
|
|
331
|
+
embeddings.py # configurable embedding engine (5 models)
|
|
286
332
|
rag.py # RAG retriever
|
|
287
333
|
vector_store.py # LanceDB vector store
|
|
288
334
|
orchestrator.py # coordinates parsing, search, impact
|
|
@@ -307,7 +353,7 @@ codegraph_cli/
|
|
|
307
353
|
git clone https://github.com/al1-nasir/codegraph-cli.git
|
|
308
354
|
cd codegraph-cli
|
|
309
355
|
python -m venv .venv && source .venv/bin/activate
|
|
310
|
-
pip install -e ".[dev,crew]"
|
|
356
|
+
pip install -e ".[dev,crew,embeddings]"
|
|
311
357
|
pytest
|
|
312
358
|
```
|
|
313
359
|
|
|
@@ -1,33 +1,33 @@
|
|
|
1
|
-
codegraph_cli/__init__.py,sha256=
|
|
2
|
-
codegraph_cli/agents.py,sha256=
|
|
1
|
+
codegraph_cli/__init__.py,sha256=qTFuIhMU-qKms6nhobwg3YUgDBKR0JenO_3Pq5VgHEk,78
|
|
2
|
+
codegraph_cli/agents.py,sha256=i4VpklF2WLgpS7bmCPcH5lAzohxErZLP5wvssmEK38w,7010
|
|
3
3
|
codegraph_cli/bug_detector.py,sha256=soT4luB5eQx6qrU5rgFCsG44rdo9jRpV0hn-b0f3LPo,16419
|
|
4
4
|
codegraph_cli/chat_agent.py,sha256=dbkEY3zaPJh0ztYaVkCwkTw5zSLGArHkChC_6JWOneg,13685
|
|
5
5
|
codegraph_cli/chat_session.py,sha256=GVey-hnfsa9fa6k2PY1sgy1wtrYSUHKE5cJDV2hG-tg,7038
|
|
6
|
-
codegraph_cli/cli.py,sha256=
|
|
7
|
-
codegraph_cli/cli_chat.py,sha256=
|
|
6
|
+
codegraph_cli/cli.py,sha256=eEzH4TOgyMAFJpVhh2hU0MD2oh61s1hBomeSFx3I3qE,11199
|
|
7
|
+
codegraph_cli/cli_chat.py,sha256=6BV6UADrInATgeywmzr0R7u0Ju4WuyRXgoWHq7lDbUA,14407
|
|
8
8
|
codegraph_cli/cli_diagnose.py,sha256=gT4qHayC_uWRMsr1Tf92BCFJfRcXAMq8XdEImatrSkU,4260
|
|
9
9
|
codegraph_cli/cli_refactor.py,sha256=_u5RvsF3-KV5C_QnErA4sowlkIAmlxSeLeWKBmSusCI,8176
|
|
10
|
-
codegraph_cli/cli_setup.py,sha256=
|
|
10
|
+
codegraph_cli/cli_setup.py,sha256=f8KdcE0Tf9HQ_ewQm1R_4OZ91bOmi0kuM8eQ05Vs7is,24749
|
|
11
11
|
codegraph_cli/cli_test.py,sha256=ZFPIRhbZ9YYIuSWJyPYLi9PEdHZAI9h8FkWXXRYfqcw,5561
|
|
12
12
|
codegraph_cli/cli_v2.py,sha256=iuw3h5gtvsTg5SdUFXSdLx1Ttiq-oUDM7ZugMqMfETg,9465
|
|
13
13
|
codegraph_cli/codegen_agent.py,sha256=F73YZIIVgE5pOvJsKBl0cv22VW3rP_SGj2viwZS-rqE,9193
|
|
14
|
-
codegraph_cli/config.py,sha256=
|
|
15
|
-
codegraph_cli/config_manager.py,sha256=
|
|
16
|
-
codegraph_cli/context_manager.py,sha256=
|
|
17
|
-
codegraph_cli/crew_agents.py,sha256=
|
|
18
|
-
codegraph_cli/crew_chat.py,sha256=
|
|
19
|
-
codegraph_cli/crew_tools.py,sha256=
|
|
14
|
+
codegraph_cli/config.py,sha256=rOq4lDvqmoly1pfEukzPeCUb76BMqK7cUbzDSFHhsC8,1291
|
|
15
|
+
codegraph_cli/config_manager.py,sha256=K81Ca7jHzHlwxoJsSeRezl8V-iGGJD_IEGE7ZWo3eG0,11422
|
|
16
|
+
codegraph_cli/context_manager.py,sha256=qEKjI7llcLX9y8NFTDs3aiHDm7nDF9jTbhu3tHHOk6w,16824
|
|
17
|
+
codegraph_cli/crew_agents.py,sha256=RWdx0H8G5UwIGGCOr6Z7WH04P7V0zedtEiD1236BD3U,6125
|
|
18
|
+
codegraph_cli/crew_chat.py,sha256=ZppRIp4D3RtcZItFPw6mFJxTJGGacbN_f_a1KmMMg-o,6235
|
|
19
|
+
codegraph_cli/crew_tools.py,sha256=wvYJn1w6nZIXPXyPMpiyqsl3kJ9kpR-sK6QOXcny6oM,19624
|
|
20
20
|
codegraph_cli/diff_engine.py,sha256=VGwPG_pZFVz8lGuVHZz_0nhrDocglugw6TumMmnHdTY,8968
|
|
21
|
-
codegraph_cli/embeddings.py,sha256=
|
|
21
|
+
codegraph_cli/embeddings.py,sha256=YoR6OjiIFC628EnLhNWbw2-_YWqtxSlL--tNWHGsKRk,14611
|
|
22
22
|
codegraph_cli/graph_export.py,sha256=gPyRrOc4_gnW-JaHmmp2pAD60PiZIj_uYA6b0xfU5O0,4562
|
|
23
23
|
codegraph_cli/llm.py,sha256=RpGjJKhUvejmtCHTb9FpGInwPtfaEkHBChBSBTwxUUo,23170
|
|
24
24
|
codegraph_cli/models.py,sha256=o6Wlu8TtWEPDWgq0AhB1xJtxzVfViBMQoCW_4AS29p0,794
|
|
25
25
|
codegraph_cli/models_v2.py,sha256=8zS16hT4SlIahMBwDZ7j4I8fdm3YyWv5qD0urJv1LsI,5521
|
|
26
|
-
codegraph_cli/orchestrator.py,sha256=
|
|
26
|
+
codegraph_cli/orchestrator.py,sha256=AguYRsZ-xu-biM3-uZMhRf6QaoQEqSlGxC7eL3fZqXE,1790
|
|
27
27
|
codegraph_cli/parser.py,sha256=vtKOwirs30O9UxJ6siHzvEWLx4-PxMn5dAfhb42QBG4,29193
|
|
28
28
|
codegraph_cli/performance_analyzer.py,sha256=f9PNMZQ_8jWvzs4osPYgTW2eOsvDytIRmfWWO5DuWCs,10090
|
|
29
29
|
codegraph_cli/project_context.py,sha256=9tSEDEPRmfEQfLcyWXjPa8IGFC1sZI1ysochoxrm4y0,7672
|
|
30
|
-
codegraph_cli/rag.py,sha256=
|
|
30
|
+
codegraph_cli/rag.py,sha256=DTijL8uZjdEeShQHWFtC_EkoINRgzT3Cr_oHuHHQcfA,7125
|
|
31
31
|
codegraph_cli/refactor_agent.py,sha256=ktQyhUn5YjhbXt7IVgKV7JgSZaT4AivWLLpMlZ7NLXw,16657
|
|
32
32
|
codegraph_cli/security_scanner.py,sha256=rPf8PcYMBllco4PkrxfILJEqKaj1UuEKqCupVycKpo8,15681
|
|
33
33
|
codegraph_cli/storage.py,sha256=XR_w6nJ_ge4r72bfxuuY8Zt8qi8CtHm4EE268EB5kBE,14340
|
|
@@ -35,9 +35,9 @@ codegraph_cli/testgen_agent.py,sha256=rqlKbLeEnjfzAZhQUXqLPwFKwRIpiHriTPxVgPCuR_
|
|
|
35
35
|
codegraph_cli/validation_engine.py,sha256=pzoRH_b06gWfiDZ5Yiecf0SWDWs4oJ66JokggGZZbaw,9029
|
|
36
36
|
codegraph_cli/vector_store.py,sha256=qbIBVDoNOha8JgZwrk7_Jdb7RMYUnBLphJfmqQdrVN4,9912
|
|
37
37
|
codegraph_cli/templates/graph_interactive.html,sha256=PFpU69DbY-Vkcu5UTiqOva_LrZjN2erdz7VXPgNSt6Q,7813
|
|
38
|
-
codegraph_cli-2.
|
|
39
|
-
codegraph_cli-2.
|
|
40
|
-
codegraph_cli-2.
|
|
41
|
-
codegraph_cli-2.
|
|
42
|
-
codegraph_cli-2.
|
|
43
|
-
codegraph_cli-2.
|
|
38
|
+
codegraph_cli-2.1.1.dist-info/licenses/LICENSE,sha256=3PiQTjpJW4DDJz8k5pk-WqX9TrVQD3fNrVNzbTEyW-A,1066
|
|
39
|
+
codegraph_cli-2.1.1.dist-info/METADATA,sha256=tooj5BPm3FdTkHR7en9n9Gp-zxqOEMpkEGt4yJWugMw,12829
|
|
40
|
+
codegraph_cli-2.1.1.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
41
|
+
codegraph_cli-2.1.1.dist-info/entry_points.txt,sha256=_p5CutxbiWjGVTx9GPeYJ30XOblccdf7SCCNtCkPnaA,45
|
|
42
|
+
codegraph_cli-2.1.1.dist-info/top_level.txt,sha256=XKmdlLsrhdgVW-pN4vzdo-ZTl-9_Rk94SXcM2YRAmHk,14
|
|
43
|
+
codegraph_cli-2.1.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|