code-explore 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_explore/__init__.py +3 -0
- code_explore/analyzer/__init__.py +13 -0
- code_explore/analyzer/dependencies.py +328 -0
- code_explore/analyzer/language.py +240 -0
- code_explore/analyzer/metrics.py +144 -0
- code_explore/analyzer/patterns.py +371 -0
- code_explore/api/__init__.py +1 -0
- code_explore/api/main.py +197 -0
- code_explore/cli/__init__.py +1 -0
- code_explore/cli/main.py +557 -0
- code_explore/database.py +207 -0
- code_explore/indexer/__init__.py +1 -0
- code_explore/indexer/embeddings.py +181 -0
- code_explore/models.py +106 -0
- code_explore/scanner/__init__.py +1 -0
- code_explore/scanner/git_info.py +94 -0
- code_explore/scanner/local.py +70 -0
- code_explore/scanner/readme.py +70 -0
- code_explore/search/__init__.py +1 -0
- code_explore/search/fulltext.py +137 -0
- code_explore/search/hybrid.py +92 -0
- code_explore/search/semantic.py +76 -0
- code_explore/summarizer/__init__.py +1 -0
- code_explore/summarizer/ollama.py +130 -0
- code_explore-0.1.0.dist-info/METADATA +67 -0
- code_explore-0.1.0.dist-info/RECORD +28 -0
- code_explore-0.1.0.dist-info/WHEEL +4 -0
- code_explore-0.1.0.dist-info/entry_points.txt +3 -0
code_explore/database.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""SQLite database with FTS5 for Code Explore."""
|
|
2
|
+
|
|
3
|
+
import sqlite3
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from code_explore.models import Project
|
|
8
|
+
|
|
9
|
+
DEFAULT_DB_PATH = Path.home() / ".code-explore" / "code-explore.db"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_db_path() -> Path:
    """Return the default database file path, creating its parent directory."""
    db_file = DEFAULT_DB_PATH
    db_file.parent.mkdir(parents=True, exist_ok=True)
    return db_file
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_connection(db_path: Path | None = None) -> sqlite3.Connection:
|
|
19
|
+
path = db_path or get_db_path()
|
|
20
|
+
conn = sqlite3.connect(str(path))
|
|
21
|
+
conn.row_factory = sqlite3.Row
|
|
22
|
+
conn.execute("PRAGMA journal_mode=WAL")
|
|
23
|
+
conn.execute("PRAGMA foreign_keys=ON")
|
|
24
|
+
return conn
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _migrate_db(conn: sqlite3.Connection) -> None:
    """Add columns/tables that may not exist in older databases.

    Called before the main schema DDL in init_db.  Upgrades in place:
    adds the readme_snippet / git_head columns if missing, and drops an
    outdated FTS table (plus its sync triggers) so init_db can recreate
    it with the current column set.
    """
    # Inspect the current column set; PRAGMA table_info returns no rows
    # when the table does not exist at all.
    cursor = conn.execute("PRAGMA table_info(projects)")
    columns = {row["name"] for row in cursor.fetchall()}

    if not columns:
        # Table doesn't exist yet, nothing to migrate
        return

    if "readme_snippet" not in columns:
        conn.execute("ALTER TABLE projects ADD COLUMN readme_snippet TEXT")
    if "git_head" not in columns:
        conn.execute("ALTER TABLE projects ADD COLUMN git_head TEXT")

    # Rebuild FTS table to include readme_snippet if needed
    try:
        # Probe the FTS table's shape: selecting the new column raises
        # OperationalError when the table predates it (or is absent).
        conn.execute("SELECT readme_snippet FROM projects_fts LIMIT 1")
    except sqlite3.OperationalError:
        # Drop table and triggers together — stale triggers referencing a
        # recreated FTS table would write the wrong columns.
        conn.execute("DROP TABLE IF EXISTS projects_fts")
        conn.execute("DROP TRIGGER IF EXISTS projects_ai")
        conn.execute("DROP TRIGGER IF EXISTS projects_ad")
        conn.execute("DROP TRIGGER IF EXISTS projects_au")
    conn.commit()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def init_db(db_path: Path | None = None) -> None:
    """Create or upgrade the projects schema, FTS5 index, and sync triggers.

    Safe to call repeatedly: all DDL uses IF NOT EXISTS, and _migrate_db
    upgrades older databases in place.  After the schema exists, the
    external-content FTS index is rebuilt from the projects table so rows
    that existed before a migration (which may have dropped projects_fts)
    become searchable again — previously those rows were silently lost
    from full-text search because triggers only fire on new writes.
    """
    conn = get_connection(db_path)
    try:
        _migrate_db(conn)
        conn.executescript("""
        CREATE TABLE IF NOT EXISTS projects (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL,
            path TEXT,
            remote_url TEXT,
            source TEXT DEFAULT 'local',
            status TEXT DEFAULT 'pending',
            data JSON NOT NULL,
            summary TEXT,
            tags TEXT,
            readme_snippet TEXT,
            git_head TEXT,
            scanned_at TEXT,
            analyzed_at TEXT,
            indexed_at TEXT,
            created_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT DEFAULT (datetime('now'))
        );

        CREATE INDEX IF NOT EXISTS idx_projects_name ON projects(name);
        CREATE INDEX IF NOT EXISTS idx_projects_source ON projects(source);
        CREATE INDEX IF NOT EXISTS idx_projects_status ON projects(status);

        CREATE VIRTUAL TABLE IF NOT EXISTS projects_fts USING fts5(
            name,
            summary,
            tags,
            readme_snippet,
            content='projects',
            content_rowid='rowid',
            tokenize='porter unicode61'
        );

        CREATE TRIGGER IF NOT EXISTS projects_ai AFTER INSERT ON projects BEGIN
            INSERT INTO projects_fts(rowid, name, summary, tags, readme_snippet)
            VALUES (new.rowid, new.name, new.summary, new.tags, new.readme_snippet);
        END;

        CREATE TRIGGER IF NOT EXISTS projects_ad AFTER DELETE ON projects BEGIN
            INSERT INTO projects_fts(projects_fts, rowid, name, summary, tags, readme_snippet)
            VALUES ('delete', old.rowid, old.name, old.summary, old.tags, old.readme_snippet);
        END;

        CREATE TRIGGER IF NOT EXISTS projects_au AFTER UPDATE ON projects BEGIN
            INSERT INTO projects_fts(projects_fts, rowid, name, summary, tags, readme_snippet)
            VALUES ('delete', old.rowid, old.name, old.summary, old.tags, old.readme_snippet);
            INSERT INTO projects_fts(rowid, name, summary, tags, readme_snippet)
            VALUES (new.rowid, new.name, new.summary, new.tags, new.readme_snippet);
        END;
        """)
        # Repopulate the external-content index from the projects table
        # (FTS5 'rebuild' special command).  Idempotent; required after
        # _migrate_db dropped and recreated projects_fts.
        conn.execute("INSERT INTO projects_fts(projects_fts) VALUES('rebuild')")
        conn.commit()
    finally:
        conn.close()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def save_project(project: Project, db_path: Path | None = None) -> None:
    """Insert or replace a project row; the FTS index stays in sync via triggers.

    The full model is stored as JSON in the `data` column; several fields
    are denormalized into their own columns for indexing and filtering.
    The readme snippet is truncated to 2000 chars to keep the FTS index
    compact.  The connection is closed even if the write fails (the
    original leaked it on exception).
    """
    conn = get_connection(db_path)
    now = datetime.now().isoformat()
    tags_str = ", ".join(project.tags) if project.tags else ""
    readme_str = (project.readme_snippet or "")[:2000]

    try:
        conn.execute(
            """
            INSERT OR REPLACE INTO projects
            (id, name, path, remote_url, source, status, data, summary, tags,
             readme_snippet, git_head, scanned_at, analyzed_at, indexed_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                project.id, project.name, project.path, project.remote_url,
                project.source.value, project.status.value,
                project.model_dump_json(),
                project.summary, tags_str,
                readme_str, project.git_head,
                project.scanned_at.isoformat() if project.scanned_at else None,
                project.analyzed_at.isoformat() if project.analyzed_at else None,
                project.indexed_at.isoformat() if project.indexed_at else None,
                now,
            ),
        )
        conn.commit()
    finally:
        conn.close()
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def get_project(project_id: str, db_path: Path | None = None) -> Project | None:
    """Load a single project by id from its stored JSON, or None if absent.

    Uses try/finally so the connection is closed even if the query raises
    (the original leaked it on exception).
    """
    conn = get_connection(db_path)
    try:
        row = conn.execute(
            "SELECT data FROM projects WHERE id = ?", (project_id,)
        ).fetchone()
    finally:
        conn.close()
    if row:
        return Project.model_validate_json(row["data"])
    return None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_all_projects(db_path: Path | None = None) -> list[Project]:
    """Return every stored project, ordered by name.

    Connection is closed in a finally block so a query failure cannot
    leak it (the original leaked on exception).
    """
    conn = get_connection(db_path)
    try:
        rows = conn.execute("SELECT data FROM projects ORDER BY name").fetchall()
    finally:
        conn.close()
    return [Project.model_validate_json(row["data"]) for row in rows]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def search_fulltext(
    query: str, limit: int = 20, db_path: Path | None = None
) -> list[tuple[str, float]]:
    """Full-text search over project name/summary/tags/readme via FTS5.

    Returns (project_id, rank) tuples ordered best-first (FTS5 rank is
    lower-is-better).  The raw query is tried first; if it is not valid
    FTS5 syntax (special characters), it is retried with every whitespace
    token wrapped in double quotes.  Returns [] when neither form parses.

    The original duplicated the SQL for the fallback and leaked the
    connection on the exception paths; both are fixed here.
    """
    sql = """
        SELECT p.id, fts.rank
        FROM projects_fts fts
        JOIN projects p ON p.rowid = fts.rowid
        WHERE projects_fts MATCH ?
        ORDER BY fts.rank
        LIMIT ?
    """
    # Fallback form: quoting each token neutralizes FTS5 operators.
    escaped = " ".join(f'"{t}"' for t in query.split() if t)

    conn = get_connection(db_path)
    try:
        for candidate in (query, escaped):
            try:
                rows = conn.execute(sql, (candidate, limit)).fetchall()
            except sqlite3.OperationalError:
                continue
            return [(row["id"], row["rank"]) for row in rows]
        return []
    finally:
        conn.close()
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def delete_project(project_id: str, db_path: Path | None = None) -> None:
    """Delete a project row; the FTS index is updated by the delete trigger.

    Connection is closed in a finally block (the original leaked it if
    the delete raised).  Deleting a missing id is a no-op.
    """
    conn = get_connection(db_path)
    try:
        conn.execute("DELETE FROM projects WHERE id = ?", (project_id,))
        conn.commit()
    finally:
        conn.close()
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def get_project_count(db_path: Path | None = None) -> int:
    """Return the number of projects stored in the database.

    Connection is closed in a finally block (the original leaked it if
    the query raised).
    """
    conn = get_connection(db_path)
    try:
        return conn.execute("SELECT COUNT(*) as cnt FROM projects").fetchone()["cnt"]
    finally:
        conn.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Embedding generation and vector indexing."""
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""Generate and store embeddings using Ollama + LanceDB."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import lancedb
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
|
|
10
|
+
from code_explore.models import Project
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
OLLAMA_BASE_URL = "http://localhost:11434"
|
|
15
|
+
EMBEDDING_MODEL = "qwen3-embedding:8b"
|
|
16
|
+
EMBEDDING_DIM = 4096
|
|
17
|
+
VECTOR_DB_PATH = Path.home() / ".code-explore" / "vectors"
|
|
18
|
+
TABLE_NAME = "project_embeddings"
|
|
19
|
+
|
|
20
|
+
SCHEMA = pa.schema([
|
|
21
|
+
pa.field("id", pa.string()),
|
|
22
|
+
pa.field("text", pa.string()),
|
|
23
|
+
pa.field("vector", pa.list_(pa.float32(), EMBEDDING_DIM)),
|
|
24
|
+
])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _ollama_available() -> bool:
    """Return True if the local Ollama server answers its /api/tags endpoint."""
    try:
        response = httpx.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5.0)
    except (httpx.ConnectError, httpx.TimeoutException):
        return False
    return response.status_code == 200
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def generate_embedding(text: str) -> list[float] | None:
    """Request an embedding vector for *text* from the local Ollama server.

    Returns None (after logging) when the server is unreachable, the call
    times out, the response is an HTTP error, or the reply lacks an
    "embedding" field.
    """
    payload = {"model": EMBEDDING_MODEL, "prompt": text}
    try:
        response = httpx.post(
            f"{OLLAMA_BASE_URL}/api/embeddings",
            json=payload,
            timeout=30.0,
        )
        response.raise_for_status()
        return response.json()["embedding"]
    except (httpx.ConnectError, httpx.TimeoutException):
        logger.warning("Ollama is not running at %s. Skipping embedding generation.", OLLAMA_BASE_URL)
        return None
    except (httpx.HTTPStatusError, KeyError) as e:
        logger.error("Failed to generate embedding: %s", e)
        return None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _generate_embeddings_batch(texts: list[str]) -> list[list[float] | None]:
    """Embed each text sequentially; failed items come back as None."""
    return [generate_embedding(text) for text in texts]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _project_to_text(project: Project) -> str:
    """Flatten a project's searchable metadata into one embedding document.

    Sections are emitted in a fixed order; the project name appears both
    alone and alongside the summary so it carries extra weight in the
    embedding.
    """
    doc: list[str] = [project.name]

    if project.summary:
        doc.append(f"{project.name} - {project.summary}")

    # README text is the richest natural-language signal.
    if project.readme_snippet:
        doc.append(f"README: {project.readme_snippet}")

    if project.summary:
        doc.append(f"Summary: {project.summary}")

    # Dependency names (versions omitted) help tech-stack discovery.
    dep_names = [dep.name for dep in project.dependencies]
    if dep_names:
        doc.append(f"Dependencies: {', '.join(dep_names)}")

    if project.tags:
        doc.append(f"Tags: {', '.join(project.tags)}")
    if project.concepts:
        doc.append(f"Concepts: {', '.join(project.concepts)}")

    lang_names = [lang.name for lang in project.languages]
    if lang_names:
        doc.append(f"Languages: {', '.join(lang_names)}")

    if project.frameworks:
        doc.append(f"Frameworks: {', '.join(project.frameworks)}")

    pattern_names = [pattern.name for pattern in project.patterns]
    if pattern_names:
        doc.append(f"Patterns: {', '.join(pattern_names)}")

    if project.key_files:
        doc.append(f"Files: {', '.join(project.key_files)}")

    # The remote URL often encodes the canonical project name.
    remote = project.remote_url or (project.git.remote_url if project.git else None)
    if remote:
        doc.append(f"Remote: {remote}")

    return "\n".join(doc)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _get_table(db: lancedb.DBConnection) -> lancedb.table.Table:
    """Open the embeddings table, creating it with SCHEMA on first use."""
    if TABLE_NAME not in db.table_names():
        return db.create_table(TABLE_NAME, schema=SCHEMA)
    return db.open_table(TABLE_NAME)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def index_project(project: Project) -> None:
    """Embed a single project and upsert it into the LanceDB vector store.

    No-op (with a warning) when Ollama is unreachable; silently skips
    when the embedding itself fails (generate_embedding already logged).
    """
    if not _ollama_available():
        logger.warning("Ollama is not available. Skipping indexing for project '%s'.", project.name)
        return

    text = _project_to_text(project)
    vector = generate_embedding(text)
    if vector is None:
        return

    VECTOR_DB_PATH.mkdir(parents=True, exist_ok=True)
    db = lancedb.connect(str(VECTOR_DB_PATH))
    table = _get_table(db)

    data = [{"id": project.id, "text": text, "vector": vector}]

    # The id is interpolated into a LanceDB SQL-style filter string, so
    # single quotes must be doubled — an unescaped quote would break (or
    # subvert) the filter expression.
    safe_id = project.id.replace("'", "''")
    existing = table.search().where(f"id = '{safe_id}'", prefilter=True).limit(1).to_list()
    if existing:
        table.delete(f"id = '{safe_id}'")

    table.add(data)
    logger.info("Indexed project '%s' in vector store.", project.name)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def index_all_projects(projects: list[Project]) -> None:
    """Embed and upsert a batch of projects into the vector store.

    Skips entirely when Ollama is down; projects whose embedding fails
    are dropped from the batch individually.  Existing rows for the same
    ids are deleted first so the add acts as an upsert.
    """
    if not projects:
        return

    if not _ollama_available():
        logger.warning(
            "Ollama is not available. Skipping vector indexing for %d projects.", len(projects)
        )
        return

    texts = [_project_to_text(p) for p in projects]
    vectors = _generate_embeddings_batch(texts)

    data = [
        {"id": project.id, "text": text, "vector": vector}
        for project, text, vector in zip(projects, texts, vectors)
        if vector is not None
    ]

    if not data:
        logger.warning("No embeddings generated. Skipping vector store update.")
        return

    VECTOR_DB_PATH.mkdir(parents=True, exist_ok=True)
    db = lancedb.connect(str(VECTOR_DB_PATH))
    table = _get_table(db)

    for pid in {item["id"] for item in data}:
        # Double single quotes before interpolating into the LanceDB filter
        # expression, so ids containing quotes cannot break the query.
        safe_pid = pid.replace("'", "''")
        try:
            found = table.search().where(f"id = '{safe_pid}'", prefilter=True).limit(1).to_list()
            if found:
                table.delete(f"id = '{safe_pid}'")
        except Exception:
            # Best-effort dedupe: a failed lookup must not abort the batch.
            pass

    table.add(data)
    logger.info("Indexed %d/%d projects in vector store.", len(data), len(projects))
|
code_explore/models.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Data models for Code Explore."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ProjectSource(str, Enum):
    """Where a project was discovered (stored as its string value)."""

    LOCAL = "local"
    GITHUB = "github"
    GITLAB = "gitlab"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ProjectStatus(str, Enum):
    """Lifecycle stage of a project in the scan/analyze/index pipeline."""

    PENDING = "pending"    # registered but not yet processed
    SCANNING = "scanning"  # scan in progress
    ANALYZED = "analyzed"  # analysis complete
    INDEXED = "indexed"    # searchable in the vector/FTS indexes
    ERROR = "error"        # processing failed
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LanguageInfo(BaseModel):
    """Per-language file and line counts for a project."""

    name: str
    files: int = 0   # number of files in this language
    lines: int = 0   # total lines across those files
    percentage: float = 0.0  # share of the project — presumably by lines; TODO confirm
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DependencyInfo(BaseModel):
    """One declared dependency of a project."""

    name: str
    version: str | None = None  # version spec if declared, else None
    dev: bool = False           # True for dev/test-only dependencies
    source: str = ""  # origin of the declaration — presumably the manifest file; TODO confirm
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class PatternInfo(BaseModel):
    """A detected design/architecture pattern with supporting evidence."""

    name: str
    category: str
    confidence: float = 0.0  # detector confidence score
    evidence: list[str] = Field(default_factory=list)  # findings supporting the match
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class QualityMetrics(BaseModel):
    """Coarse project-health indicators gathered during analysis."""

    # Size metrics
    total_files: int = 0
    total_lines: int = 0
    avg_file_size: float = 0.0
    max_file_size: int = 0
    # Hygiene flags: presence of common good-practice artifacts
    has_tests: bool = False
    has_ci: bool = False
    has_docs: bool = False
    has_readme: bool = False
    has_license: bool = False
    has_gitignore: bool = False
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class GitInfo(BaseModel):
    """Git metadata extracted from a repository (all-defaults when not a repo)."""

    remote_url: str | None = None
    default_branch: str | None = None
    total_commits: int = 0
    last_commit_date: datetime | None = None
    last_commit_message: str | None = None
    contributors: list[str] = Field(default_factory=list)  # author names, first-seen order
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class Project(BaseModel):
    """Aggregate record for one scanned code project.

    Persisted whole as JSON (model_dump_json) in the database's `data`
    column; several fields are additionally denormalized into their own
    columns for search and filtering.
    """

    # Identity and location
    id: str
    name: str
    path: str | None = None
    remote_url: str | None = None
    source: ProjectSource = ProjectSource.LOCAL
    status: ProjectStatus = ProjectStatus.PENDING

    # Analysis results
    languages: list[LanguageInfo] = Field(default_factory=list)
    primary_language: str | None = None
    frameworks: list[str] = Field(default_factory=list)
    dependencies: list[DependencyInfo] = Field(default_factory=list)
    patterns: list[PatternInfo] = Field(default_factory=list)
    quality: QualityMetrics = Field(default_factory=QualityMetrics)
    git: GitInfo = Field(default_factory=GitInfo)

    # Summarization output (filled by a later pipeline stage)
    summary: str | None = None
    tags: list[str] = Field(default_factory=list)
    concepts: list[str] = Field(default_factory=list)

    # Raw extracted artifacts
    readme_snippet: str | None = None
    key_files: list[str] = Field(default_factory=list)
    git_head: str | None = None  # HEAD commit SHA at scan time

    # Pipeline stage timestamps
    scanned_at: datetime | None = None
    analyzed_at: datetime | None = None
    indexed_at: datetime | None = None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class SearchResult(BaseModel):
    """A single search hit: the matched project plus scoring metadata."""

    project: Project
    score: float     # relevance score; scale depends on the search backend
    match_type: str  # presumably which backend matched (fulltext/semantic/hybrid) — TODO confirm
    highlights: list[str] = Field(default_factory=list)  # matched snippets, if any
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class SearchQuery(BaseModel):
    """Parameters of a project search request."""

    query: str
    mode: str = "hybrid"  # search strategy; "hybrid" presumably combines fulltext + semantic
    limit: int = 20       # maximum number of results
    filters: dict = Field(default_factory=dict)  # optional field-level filters
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Project scanning - local filesystem and git repos."""
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Extract git metadata from a local repository."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from git import InvalidGitRepositoryError, Repo
|
|
6
|
+
|
|
7
|
+
from code_explore.models import GitInfo
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_git_head(repo_path: str | Path) -> str | None:
    """Return the HEAD commit SHA for the repository, or None on error."""
    resolved = Path(repo_path).expanduser().resolve()
    try:
        # ValueError covers an unborn HEAD (repo with no commits yet).
        return Repo(resolved).head.commit.hexsha
    except (InvalidGitRepositoryError, ValueError, TypeError):
        return None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def extract_git_info(repo_path: str | Path) -> GitInfo:
    """Read git metadata from the repository at repo_path and return a GitInfo model."""
    resolved = Path(repo_path).expanduser().resolve()

    try:
        repo = Repo(resolved)
    except InvalidGitRepositoryError:
        # Not a git repository: return an all-defaults record, don't raise.
        return GitInfo()

    commits, last_date, last_message, authors = _get_commit_info(repo)
    return GitInfo(
        remote_url=_get_remote_url(repo),
        default_branch=_get_default_branch(repo),
        total_commits=commits,
        last_commit_date=last_date,
        last_commit_message=last_message,
        contributors=authors,
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _get_remote_url(repo: Repo) -> str | None:
    """Return the URL of 'origin', falling back to the first remote, else None."""
    if not repo.remotes:
        return None
    origin = getattr(repo.remotes, "origin", None)
    if origin is not None:
        return origin.url
    # No remote named "origin" — use whichever remote comes first.
    return repo.remotes[0].url
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _get_default_branch(repo: Repo) -> str | None:
    """Best-effort guess at the repository's default branch name.

    Order matters: bare repos only have their symbolic HEAD to go on;
    otherwise conventional names are preferred over the checked-out
    branch.  Returns None when no branch can be determined (e.g.
    detached HEAD, which raises TypeError in GitPython).
    """
    if repo.bare:
        # Bare repo: no working tree, so the HEAD reference is the only hint.
        try:
            return repo.head.reference.name
        except TypeError:
            return None

    # Prefer conventional default-branch names when they exist locally.
    for name in ("main", "master", "develop"):
        if name in repo.heads:
            return name

    # Fall back to the currently checked-out branch.
    try:
        return repo.active_branch.name
    except TypeError:
        return None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _get_commit_info(
    repo: Repo,
) -> tuple[int, "datetime | None", str | None, list[str]]:
    """Return (total commits, last commit date, last commit message, author names).

    Author names are unique, in first-seen order, drawn from the 500 most
    recent commits only.
    """
    from datetime import datetime, timezone

    try:
        head = repo.head.commit
    except ValueError:
        # Unborn HEAD: fresh repository with no commits yet.
        return 0, None, None, []

    commit_total = head.count()
    committed_at = datetime.fromtimestamp(head.committed_date, tz=timezone.utc)
    message = head.message.strip()

    # dict preserves insertion order, giving ordered de-duplication.
    authors: dict[str, None] = {}
    for commit in repo.iter_commits(max_count=500):
        author = commit.author.name
        if author:
            authors.setdefault(author, None)

    return commit_total, committed_at, message, list(authors)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Scan local directories to discover git repositories."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
console = Console()
|
|
10
|
+
|
|
11
|
+
SKIP_DIRS = frozenset({
|
|
12
|
+
".git",
|
|
13
|
+
"node_modules",
|
|
14
|
+
"venv",
|
|
15
|
+
".venv",
|
|
16
|
+
"__pycache__",
|
|
17
|
+
".cache",
|
|
18
|
+
"vendor",
|
|
19
|
+
"dist",
|
|
20
|
+
"build",
|
|
21
|
+
"target",
|
|
22
|
+
})
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def scan_local_repos(
    root: str | Path,
    max_depth: int = 4,
) -> list[Path]:
    """Walk root directory recursively and return paths of discovered git repositories."""
    base = Path(root).expanduser().resolve()
    if not base.is_dir():
        console.print(f"[red]Root path does not exist: {base}[/red]")
        return []

    console.print(f"[blue]Scanning[/blue] {base} (max depth {max_depth})")
    # The filesystem walk is blocking; run it off the event loop.
    found: list[Path] = await asyncio.to_thread(_walk_for_repos, base, max_depth)
    console.print(f"[green]Found {len(found)} repositories[/green]")
    return found
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _walk_for_repos(root: Path, max_depth: int) -> list[Path]:
|
|
42
|
+
repos: list[Path] = []
|
|
43
|
+
stack: list[tuple[Path, int]] = [(root, 0)]
|
|
44
|
+
|
|
45
|
+
while stack:
|
|
46
|
+
current, depth = stack.pop()
|
|
47
|
+
|
|
48
|
+
if (current / ".git").is_dir():
|
|
49
|
+
repos.append(current)
|
|
50
|
+
continue
|
|
51
|
+
|
|
52
|
+
if depth >= max_depth:
|
|
53
|
+
continue
|
|
54
|
+
|
|
55
|
+
try:
|
|
56
|
+
entries = sorted(current.iterdir())
|
|
57
|
+
except PermissionError:
|
|
58
|
+
continue
|
|
59
|
+
|
|
60
|
+
for entry in entries:
|
|
61
|
+
if entry.is_symlink() or not entry.is_dir():
|
|
62
|
+
continue
|
|
63
|
+
if entry.name in SKIP_DIRS:
|
|
64
|
+
continue
|
|
65
|
+
stack.append((entry, depth + 1))
|
|
66
|
+
|
|
67
|
+
if len(repos) % 50 == 0 and repos:
|
|
68
|
+
console.print(f" [dim]...discovered {len(repos)} repos so far[/dim]")
|
|
69
|
+
|
|
70
|
+
return repos
|