contextly 0.1.0__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contextly-0.1.0 → contextly-0.1.4}/PKG-INFO +1 -1
- {contextly-0.1.0 → contextly-0.1.4}/pyproject.toml +1 -1
- contextly-0.1.4/src/contextly/app.py +111 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/core/analyzer.py +9 -2
- contextly-0.1.4/src/contextly/core/embeddings.py +129 -0
- contextly-0.1.4/src/contextly/llm/__init__.py +13 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/llm/models.py +24 -15
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/llm/openai.py +3 -1
- {contextly-0.1.0 → contextly-0.1.4}/tests/test_core.py +0 -1
- contextly-0.1.0/src/contextly/app.py +0 -85
- contextly-0.1.0/src/contextly/core/embeddings.py +0 -109
- {contextly-0.1.0 → contextly-0.1.4}/.gitignore +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/LICENSE +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/README.md +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/__init__.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/cli.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/core/sync.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/llm/base.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/llm/manager.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/llm/ollama.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/parsers/base.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/parsers/config.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/parsers/javascript.py +0 -0
- {contextly-0.1.0 → contextly-0.1.4}/src/contextly/parsers/python.py +0 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core business logic for Contextly commands.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, Any, Optional
|
|
7
|
+
|
|
8
|
+
from .core.analyzer import CodeAnalyzer
|
|
9
|
+
from .core.embeddings import EmbeddingEngine
|
|
10
|
+
from .core.sync import RepoSync
|
|
11
|
+
from .llm import LLMManager
|
|
12
|
+
|
|
13
|
+
class Contextly:
|
|
14
|
+
"""Main class coordinating Contextly's functionality."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, repo_path: Optional[Path] = None, model: Optional[str] = None):
|
|
17
|
+
self.repo_path = repo_path or Path.cwd()
|
|
18
|
+
self.llm_manager = LLMManager(model)
|
|
19
|
+
self.analyzer = CodeAnalyzer(self.repo_path)
|
|
20
|
+
self.embedding_engine = EmbeddingEngine(self.repo_path)
|
|
21
|
+
self.repo_sync = RepoSync(self.repo_path)
|
|
22
|
+
|
|
23
|
+
def ask(self, question: str) -> Dict[str, Any]:
|
|
24
|
+
"""Answer questions about the codebase."""
|
|
25
|
+
try:
|
|
26
|
+
# Search for relevant code
|
|
27
|
+
search_results = self.embedding_engine.search(question)
|
|
28
|
+
|
|
29
|
+
# Build context from search results
|
|
30
|
+
context = []
|
|
31
|
+
for result in search_results['results']:
|
|
32
|
+
context.append(f"From {result['file']}:\n{result['content']}\n")
|
|
33
|
+
|
|
34
|
+
if not context:
|
|
35
|
+
return {
|
|
36
|
+
'answer': 'No relevant code found to answer the question.',
|
|
37
|
+
'context': search_results
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
# Generate answer using LLM
|
|
41
|
+
context_str = '\n'.join(context)
|
|
42
|
+
context_dict = {
|
|
43
|
+
'question': question,
|
|
44
|
+
'code_snippets': context,
|
|
45
|
+
'files': [r['file'] for r in search_results['results']]
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
answer = self.llm_manager.explain_code(context_str, context_dict)
|
|
49
|
+
|
|
50
|
+
return {
|
|
51
|
+
'answer': answer,
|
|
52
|
+
'context': search_results
|
|
53
|
+
}
|
|
54
|
+
except Exception as e:
|
|
55
|
+
return {
|
|
56
|
+
'error': f'Failed to process question: {str(e)}',
|
|
57
|
+
'context': None
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
def explain(self, location: str) -> str:
|
|
61
|
+
"""Explain code at specific location."""
|
|
62
|
+
try:
|
|
63
|
+
if ':' not in location:
|
|
64
|
+
return "Invalid location format. Use 'file:line_number'"
|
|
65
|
+
file_path, line_number = location.split(':')
|
|
66
|
+
return self.analyzer.explain_code(file_path, int(line_number))
|
|
67
|
+
except ValueError:
|
|
68
|
+
return f"Invalid line number in location: {location}"
|
|
69
|
+
except Exception as e:
|
|
70
|
+
return f"Error explaining code: {str(e)}"
|
|
71
|
+
|
|
72
|
+
def search(self, term: str) -> Dict[str, Any]:
|
|
73
|
+
"""Semantic search across codebase."""
|
|
74
|
+
try:
|
|
75
|
+
results = self.embedding_engine.search(term)
|
|
76
|
+
return {
|
|
77
|
+
'query': results['query'],
|
|
78
|
+
'results': results['results']
|
|
79
|
+
}
|
|
80
|
+
except Exception as e:
|
|
81
|
+
return {
|
|
82
|
+
'query': term,
|
|
83
|
+
'error': str(e),
|
|
84
|
+
'results': []
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
def diff(self, path1: Path, path2: Path) -> Dict[str, Any]:
|
|
88
|
+
"""Compare configuration files."""
|
|
89
|
+
return self.analyzer.compare_configs(path1, path2)
|
|
90
|
+
|
|
91
|
+
def sync(self) -> None:
|
|
92
|
+
"""Build or rebuild the embedding index."""
|
|
93
|
+
# Scan repository
|
|
94
|
+
files = list(self.repo_sync.scan_files())
|
|
95
|
+
|
|
96
|
+
# Build index
|
|
97
|
+
index = self.repo_sync.index_repository()
|
|
98
|
+
|
|
99
|
+
# Convert index to list of documents
|
|
100
|
+
docs = []
|
|
101
|
+
for file_path, file_data in index.items():
|
|
102
|
+
docs.extend(file_data.get('chunks', []))
|
|
103
|
+
|
|
104
|
+
# Build or update the embedding index with the documents
|
|
105
|
+
try:
|
|
106
|
+
if docs:
|
|
107
|
+
self.embedding_engine.embed_documents(docs)
|
|
108
|
+
else:
|
|
109
|
+
print("No documents found to index")
|
|
110
|
+
except Exception as e:
|
|
111
|
+
raise RuntimeError(f"Failed to update search index: {str(e)}")
|
|
@@ -5,7 +5,9 @@ Core functionality for code parsing and analysis.
|
|
|
5
5
|
import difflib
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import List, Dict, Any, Optional
|
|
8
|
+
from typing import Type
|
|
8
9
|
from ..llm import LLMManager
|
|
10
|
+
from ..parsers.base import BaseParser
|
|
9
11
|
from ..parsers.python import PythonParser
|
|
10
12
|
from ..parsers.javascript import JavaScriptParser
|
|
11
13
|
from ..parsers.config import ConfigParser
|
|
@@ -66,8 +68,13 @@ class CodeAnalyzer:
|
|
|
66
68
|
return f"No code found at line {line_number}"
|
|
67
69
|
|
|
68
70
|
# Get explanation from LLM
|
|
69
|
-
|
|
70
|
-
|
|
71
|
+
code = target_chunk['content']
|
|
72
|
+
context = {
|
|
73
|
+
'file_type': result['file_type'],
|
|
74
|
+
'file_path': file_path,
|
|
75
|
+
'line_number': line_number
|
|
76
|
+
}
|
|
77
|
+
explanation = self.llm.explain_code(code, context)
|
|
71
78
|
|
|
72
79
|
return explanation
|
|
73
80
|
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Vector embedding and similarity search functionality.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import os
|
|
7
|
+
from typing import List, Dict, Any, Optional
|
|
8
|
+
import chromadb
|
|
9
|
+
from chromadb.config import Settings
|
|
10
|
+
from sentence_transformers import SentenceTransformer
|
|
11
|
+
|
|
12
|
+
class EmbeddingEngine:
|
|
13
|
+
"""Handles document embedding and similarity search."""
|
|
14
|
+
|
|
15
|
+
def __init__(self, repo_path: Path):
|
|
16
|
+
"""Initialize the embedding engine with the given repository path."""
|
|
17
|
+
self.repo_path = repo_path
|
|
18
|
+
self.model: Optional[SentenceTransformer] = None
|
|
19
|
+
self.db = None
|
|
20
|
+
self.collection = None
|
|
21
|
+
self.data_dir = self.repo_path / '.contextly' / 'embeddings'
|
|
22
|
+
self.collection_name = "code_embeddings"
|
|
23
|
+
self.data_dir.mkdir(parents=True, exist_ok=True)
|
|
24
|
+
|
|
25
|
+
def _initialize(self) -> None:
|
|
26
|
+
"""Initialize the model and database if not already initialized."""
|
|
27
|
+
if self.model is None:
|
|
28
|
+
try:
|
|
29
|
+
self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
30
|
+
except Exception as e:
|
|
31
|
+
raise RuntimeError(f"Failed to initialize embedding model: {str(e)}")
|
|
32
|
+
|
|
33
|
+
if self.db is None:
|
|
34
|
+
try:
|
|
35
|
+
self.db = chromadb.PersistentClient(path=str(self.data_dir))
|
|
36
|
+
self.collection = self.db.get_or_create_collection(
|
|
37
|
+
name=self.collection_name,
|
|
38
|
+
metadata={"repo_path": str(self.repo_path)}
|
|
39
|
+
)
|
|
40
|
+
except Exception as e:
|
|
41
|
+
raise RuntimeError(f"Failed to initialize ChromaDB: {str(e)}")
|
|
42
|
+
|
|
43
|
+
def _ensure_initialized(self) -> None:
|
|
44
|
+
"""Ensure both model and database are initialized."""
|
|
45
|
+
if self.model is None or self.db is None or self.collection is None:
|
|
46
|
+
self._initialize()
|
|
47
|
+
"""Initialize the embedding model and vector store if not already initialized."""
|
|
48
|
+
self._ensure_initialized()
|
|
49
|
+
|
|
50
|
+
def embed_documents(self, documents: List[Dict[str, Any]]):
|
|
51
|
+
"""Convert documents into vector embeddings and store them."""
|
|
52
|
+
if not documents:
|
|
53
|
+
return
|
|
54
|
+
|
|
55
|
+
# Ensure initialization
|
|
56
|
+
self._ensure_initialized()
|
|
57
|
+
|
|
58
|
+
# Prepare documents for ChromaDB
|
|
59
|
+
texts = []
|
|
60
|
+
metadatas = []
|
|
61
|
+
ids = []
|
|
62
|
+
|
|
63
|
+
for idx, doc in enumerate(documents):
|
|
64
|
+
# Create a meaningful document summary
|
|
65
|
+
doc_text = f"{doc.get('type', 'unknown')} - {doc.get('name', '')}:\n{doc.get('content', '')}"
|
|
66
|
+
texts.append(doc_text)
|
|
67
|
+
|
|
68
|
+
# Store metadata
|
|
69
|
+
metadatas.append({
|
|
70
|
+
"file_path": str(doc.get("file_path", "")),
|
|
71
|
+
"type": doc.get("type", "unknown"),
|
|
72
|
+
"name": doc.get("name", ""),
|
|
73
|
+
"start_line": doc.get("start_line", 0),
|
|
74
|
+
"end_line": doc.get("end_line", 0),
|
|
75
|
+
})
|
|
76
|
+
|
|
77
|
+
# Generate unique ID
|
|
78
|
+
doc_id = f"doc_{idx}_{hash(doc_text) & 0xFFFFFFFF}"
|
|
79
|
+
ids.append(doc_id)
|
|
80
|
+
|
|
81
|
+
# Ensure collection is initialized
|
|
82
|
+
if self.collection is None:
|
|
83
|
+
raise RuntimeError("Collection is not initialized.")
|
|
84
|
+
# Add documents to the collection
|
|
85
|
+
self.collection.add(
|
|
86
|
+
documents=texts,
|
|
87
|
+
metadatas=metadatas,
|
|
88
|
+
ids=ids
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
def search(self, query: str, top_k: int = 5) -> Dict[str, Any]:
|
|
92
|
+
"""Search for most similar documents to query."""
|
|
93
|
+
self._ensure_initialized()
|
|
94
|
+
|
|
95
|
+
# Ensure collection is initialized
|
|
96
|
+
if self.collection is None:
|
|
97
|
+
raise RuntimeError("Collection is not initialized.")
|
|
98
|
+
|
|
99
|
+
# Query the collection
|
|
100
|
+
results = self.collection.query(
|
|
101
|
+
query_texts=[query],
|
|
102
|
+
n_results=top_k
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Format results
|
|
106
|
+
hits = []
|
|
107
|
+
documents = results.get('documents')
|
|
108
|
+
metadatas = results.get('metadatas')
|
|
109
|
+
distances = results.get('distances')
|
|
110
|
+
ids = results.get('ids')
|
|
111
|
+
|
|
112
|
+
if documents is not None and metadatas is not None and ids is not None:
|
|
113
|
+
for i in range(len(ids[0])):
|
|
114
|
+
hits.append({
|
|
115
|
+
'content': documents[0][i] if documents[0] is not None else None,
|
|
116
|
+
'metadata': metadatas[0][i] if metadatas[0] is not None else None,
|
|
117
|
+
'score': distances[0][i] if distances and distances[0] is not None else None,
|
|
118
|
+
'file': metadatas[0][i]['file_path'] if metadatas[0] is not None else None
|
|
119
|
+
})
|
|
120
|
+
|
|
121
|
+
return {
|
|
122
|
+
'query': query,
|
|
123
|
+
'results': hits
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
def clear(self):
|
|
127
|
+
"""Clear all embeddings from storage."""
|
|
128
|
+
if self.collection:
|
|
129
|
+
self.collection.delete()
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""LLM package for Contextly."""
|
|
2
|
+
|
|
3
|
+
from .manager import LLMManager
|
|
4
|
+
from .base import LLMProvider
|
|
5
|
+
from .models import ModelManager, ModelRegistry, ModelProvider
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
'LLMManager',
|
|
9
|
+
'LLMProvider',
|
|
10
|
+
'ModelManager',
|
|
11
|
+
'ModelRegistry',
|
|
12
|
+
'ModelProvider'
|
|
13
|
+
]
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
Model management system for various LLM providers.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from dataclasses import dataclass
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
6
|
from enum import Enum
|
|
7
7
|
from typing import Dict, Any, Optional, List
|
|
8
8
|
import requests
|
|
@@ -25,7 +25,7 @@ class ModelInfo:
|
|
|
25
25
|
description: str
|
|
26
26
|
context_length: int
|
|
27
27
|
quantization: Optional[str] = None
|
|
28
|
-
metadata: Dict[str, Any] =
|
|
28
|
+
metadata: Dict[str, Any] = field(default_factory=dict)
|
|
29
29
|
|
|
30
30
|
class ModelRegistry:
|
|
31
31
|
"""Registry of available models and their capabilities."""
|
|
@@ -37,19 +37,28 @@ class ModelRegistry:
|
|
|
37
37
|
|
|
38
38
|
def _load_models(self) -> None:
|
|
39
39
|
"""Load model registry from config file."""
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
40
|
+
try:
|
|
41
|
+
if self.config_path.exists():
|
|
42
|
+
with open(self.config_path, 'r') as f:
|
|
43
|
+
try:
|
|
44
|
+
data = json.load(f)
|
|
45
|
+
for model_data in data.get('models', []):
|
|
46
|
+
try:
|
|
47
|
+
info = ModelInfo(
|
|
48
|
+
name=model_data['name'],
|
|
49
|
+
provider=ModelProvider(model_data['provider']),
|
|
50
|
+
description=model_data.get('description', ''),
|
|
51
|
+
context_length=model_data.get('context_length', 4096),
|
|
52
|
+
quantization=model_data.get('quantization'),
|
|
53
|
+
metadata=model_data.get('metadata', {})
|
|
54
|
+
)
|
|
55
|
+
self.models[model_data['name']] = info
|
|
56
|
+
except (KeyError, ValueError) as e:
|
|
57
|
+
print(f"Warning: Skipping invalid model data: {e}")
|
|
58
|
+
except Exception as e:
|
|
59
|
+
print(f"Error loading models from config: {e}")
|
|
60
|
+
except Exception as e:
|
|
61
|
+
print(f"Error accessing config file: {e}")
|
|
53
62
|
|
|
54
63
|
def _save_models(self) -> None:
|
|
55
64
|
"""Save model registry to config file."""
|
|
@@ -10,7 +10,7 @@ from .base import LLMProvider
|
|
|
10
10
|
class OpenAIProvider(LLMProvider):
|
|
11
11
|
"""LLM provider using OpenAI API."""
|
|
12
12
|
|
|
13
|
-
DEFAULT_MODEL = "gpt-
|
|
13
|
+
DEFAULT_MODEL = "gpt-3.5-turbo"
|
|
14
14
|
|
|
15
15
|
def __init__(self, model: str = DEFAULT_MODEL, api_key: Optional[str] = None):
|
|
16
16
|
self.model = model
|
|
@@ -27,6 +27,8 @@ class OpenAIProvider(LLMProvider):
|
|
|
27
27
|
raise RuntimeError("OpenAI API is not configured")
|
|
28
28
|
|
|
29
29
|
try:
|
|
30
|
+
if self.client is None:
|
|
31
|
+
raise RuntimeError("OpenAI client is not initialized. Please provide a valid API key.")
|
|
30
32
|
response = self.client.chat.completions.create(
|
|
31
33
|
model=self.model,
|
|
32
34
|
messages=[{"role": "user", "content": prompt}],
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Core business logic for Contextly commands.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Dict, Any, Optional
|
|
7
|
-
|
|
8
|
-
from .core.analyzer import CodeAnalyzer
|
|
9
|
-
from .core.embeddings import EmbeddingEngine
|
|
10
|
-
from .core.sync import RepoSync
|
|
11
|
-
from .llm.manager import LLMManager
|
|
12
|
-
|
|
13
|
-
class Contextly:
|
|
14
|
-
"""Main class coordinating Contextly's functionality."""
|
|
15
|
-
|
|
16
|
-
def __init__(self, repo_path: Optional[Path] = None, model: Optional[str] = None):
|
|
17
|
-
self.repo_path = repo_path or Path.cwd()
|
|
18
|
-
self.llm_manager = LLMManager(model)
|
|
19
|
-
self.analyzer = CodeAnalyzer(self.repo_path)
|
|
20
|
-
self.embedding_engine = EmbeddingEngine(self.repo_path)
|
|
21
|
-
self.repo_sync = RepoSync(self.repo_path)
|
|
22
|
-
|
|
23
|
-
def ask(self, question: str) -> Dict[str, Any]:
|
|
24
|
-
"""Answer questions about the codebase."""
|
|
25
|
-
# Initialize if needed
|
|
26
|
-
if not hasattr(self, '_initialized'):
|
|
27
|
-
self.embedding_engine.initialize()
|
|
28
|
-
self._initialized = True
|
|
29
|
-
|
|
30
|
-
# Search for relevant code
|
|
31
|
-
search_results = self.embedding_engine.search(question)
|
|
32
|
-
|
|
33
|
-
# Build context from search results
|
|
34
|
-
context = []
|
|
35
|
-
for result in search_results['results']:
|
|
36
|
-
context.append(f"From {result['file']}:\n{result['content']}\n")
|
|
37
|
-
|
|
38
|
-
# Generate answer using LLM
|
|
39
|
-
prompt = f"Question: {question}\n\nContext from codebase:\n{''.join(context)}\n\nAnswer:"
|
|
40
|
-
answer = self.analyzer.llm.generate(prompt)
|
|
41
|
-
|
|
42
|
-
return {
|
|
43
|
-
'answer': answer,
|
|
44
|
-
'context': search_results
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
def explain(self, location: str) -> str:
|
|
48
|
-
"""Explain code at specific location."""
|
|
49
|
-
file_path, line_number = location.split(':')
|
|
50
|
-
return self.analyzer.explain_code(file_path, int(line_number))
|
|
51
|
-
|
|
52
|
-
def search(self, term: str) -> Dict[str, Any]:
|
|
53
|
-
"""Semantic search across codebase."""
|
|
54
|
-
if not hasattr(self, '_initialized'):
|
|
55
|
-
self.embedding_engine.initialize()
|
|
56
|
-
self._initialized = True
|
|
57
|
-
|
|
58
|
-
results = self.embedding_engine.search(term)
|
|
59
|
-
return {
|
|
60
|
-
'query': results['query'],
|
|
61
|
-
'results': results['results']
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
def diff(self, path1: Path, path2: Path) -> Dict[str, Any]:
|
|
65
|
-
"""Compare configuration files."""
|
|
66
|
-
return self.analyzer.compare_configs(path1, path2)
|
|
67
|
-
|
|
68
|
-
def sync(self) -> None:
|
|
69
|
-
"""Build or rebuild the embedding index."""
|
|
70
|
-
# Scan repository
|
|
71
|
-
files = list(self.repo_sync.scan_files())
|
|
72
|
-
|
|
73
|
-
# Build index
|
|
74
|
-
index = self.repo_sync.index_repository()
|
|
75
|
-
|
|
76
|
-
# Initialize embedding engine
|
|
77
|
-
self.embedding_engine.initialize()
|
|
78
|
-
|
|
79
|
-
# Convert index to list of documents
|
|
80
|
-
docs = []
|
|
81
|
-
for file_path, file_data in index.items():
|
|
82
|
-
docs.extend(file_data.get('chunks', []))
|
|
83
|
-
|
|
84
|
-
# Embed documents
|
|
85
|
-
self.embedding_engine.embed_documents(docs)
|
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Vector embedding and similarity search functionality.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import os
|
|
7
|
-
from typing import List, Dict, Any, Optional
|
|
8
|
-
import chromadb
|
|
9
|
-
from chromadb.config import Settings
|
|
10
|
-
from sentence_transformers import SentenceTransformer
|
|
11
|
-
|
|
12
|
-
class EmbeddingEngine:
|
|
13
|
-
"""Handles document embedding and similarity search."""
|
|
14
|
-
|
|
15
|
-
def __init__(self, repo_path: Path):
|
|
16
|
-
self.repo_path = repo_path
|
|
17
|
-
self.model = None # Lazy load the model
|
|
18
|
-
self.db = None # Initialize ChromaDB client
|
|
19
|
-
self.collection = None
|
|
20
|
-
|
|
21
|
-
def initialize(self):
|
|
22
|
-
"""Initialize the embedding model and vector store."""
|
|
23
|
-
if self.model is None:
|
|
24
|
-
self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
25
|
-
|
|
26
|
-
if self.db is None:
|
|
27
|
-
# Create .contextly directory if it doesn't exist
|
|
28
|
-
db_path = self.repo_path / '.contextly'
|
|
29
|
-
os.makedirs(db_path, exist_ok=True)
|
|
30
|
-
|
|
31
|
-
# Initialize ChromaDB with persistent storage
|
|
32
|
-
self.db = chromadb.Client(Settings(
|
|
33
|
-
persist_directory=str(db_path),
|
|
34
|
-
anonymized_telemetry=False
|
|
35
|
-
))
|
|
36
|
-
|
|
37
|
-
# Get or create collection for this repo
|
|
38
|
-
self.collection = self.db.get_or_create_collection(
|
|
39
|
-
name="contextly_docs",
|
|
40
|
-
metadata={"repo_path": str(self.repo_path)}
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
def embed_documents(self, documents: List[Dict[str, Any]]):
|
|
44
|
-
"""Convert documents into vector embeddings and store them."""
|
|
45
|
-
if not documents:
|
|
46
|
-
return
|
|
47
|
-
|
|
48
|
-
# Ensure initialization
|
|
49
|
-
self.initialize()
|
|
50
|
-
|
|
51
|
-
# Prepare documents for ChromaDB
|
|
52
|
-
texts = []
|
|
53
|
-
metadatas = []
|
|
54
|
-
ids = []
|
|
55
|
-
|
|
56
|
-
for idx, doc in enumerate(documents):
|
|
57
|
-
# Create a meaningful document summary
|
|
58
|
-
doc_text = f"{doc.get('type', 'unknown')} - {doc.get('name', '')}:\n{doc.get('content', '')}"
|
|
59
|
-
texts.append(doc_text)
|
|
60
|
-
|
|
61
|
-
# Store metadata
|
|
62
|
-
metadatas.append({
|
|
63
|
-
"file_path": str(doc.get("file_path", "")),
|
|
64
|
-
"type": doc.get("type", "unknown"),
|
|
65
|
-
"name": doc.get("name", ""),
|
|
66
|
-
"start_line": doc.get("start_line", 0),
|
|
67
|
-
"end_line": doc.get("end_line", 0),
|
|
68
|
-
})
|
|
69
|
-
|
|
70
|
-
# Generate unique ID
|
|
71
|
-
doc_id = f"doc_{idx}_{hash(doc_text) & 0xFFFFFFFF}"
|
|
72
|
-
ids.append(doc_id)
|
|
73
|
-
|
|
74
|
-
# Add documents to the collection
|
|
75
|
-
self.collection.add(
|
|
76
|
-
documents=texts,
|
|
77
|
-
metadatas=metadatas,
|
|
78
|
-
ids=ids
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
def search(self, query: str, top_k: int = 5) -> Dict[str, Any]:
|
|
82
|
-
"""Search for most similar documents to query."""
|
|
83
|
-
self.initialize()
|
|
84
|
-
|
|
85
|
-
# Query the collection
|
|
86
|
-
results = self.collection.query(
|
|
87
|
-
query_texts=[query],
|
|
88
|
-
n_results=top_k
|
|
89
|
-
)
|
|
90
|
-
|
|
91
|
-
# Format results
|
|
92
|
-
hits = []
|
|
93
|
-
for i in range(len(results['ids'][0])):
|
|
94
|
-
hits.append({
|
|
95
|
-
'content': results['documents'][0][i],
|
|
96
|
-
'metadata': results['metadatas'][0][i],
|
|
97
|
-
'score': results['distances'][0][i] if 'distances' in results else None,
|
|
98
|
-
'file': results['metadatas'][0][i]['file_path']
|
|
99
|
-
})
|
|
100
|
-
|
|
101
|
-
return {
|
|
102
|
-
'query': query,
|
|
103
|
-
'results': hits
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
def clear(self):
|
|
107
|
-
"""Clear all embeddings from storage."""
|
|
108
|
-
if self.collection:
|
|
109
|
-
self.collection.delete()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|