cortexcode 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ """Semantic search over symbols — find symbols by meaning, not just name."""
2
+
3
+ import json
4
+ import math
5
+ import re
6
+ from collections import Counter
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase word tokens.

    Handles camelCase, PascalCase, acronym runs (e.g. "HTTPServer" ->
    "http", "server"), snake_case, kebab-case, path separators and dots.
    Alphabetic runs shorter than 2 characters are dropped, as are digits.
    """
    # Split an acronym run followed by a capitalized word: "HTTPServer" -> "HTTP Server".
    # The original only handled the lower->upper boundary, leaving acronyms fused.
    text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', text)
    # Split the ordinary camelCase boundary: "getUser" -> "get User"
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    # Normalize snake_case, kebab-case, path separators and dots to spaces
    text = text.replace("_", " ").replace("-", " ").replace("/", " ").replace("\\", " ").replace(".", " ")
    # Keep alphabetic runs of length >= 2, lowercased
    return [t.lower() for t in re.findall(r'[a-zA-Z]{2,}', text)]
20
+
21
+
22
+ # Common programming synonyms for semantic expansion
23
# Common programming synonyms for semantic query expansion.
# Maps a query token to related terms so that e.g. a query for "auth" also
# matches symbols mentioning "login" or "jwt". Lookup is one-directional and
# single-hop (see expand_query): entries need not be symmetric, and synonyms
# of synonyms are NOT expanded. Keys and values must be lowercase to match
# the output of tokenize().
_SYNONYMS = {
    "auth": ["authentication", "authorize", "login", "signin", "credentials", "session", "token", "jwt"],
    "authentication": ["auth", "login", "signin", "credentials"],
    "login": ["auth", "signin", "authentication", "credentials"],
    "handler": ["handle", "controller", "action", "endpoint", "route", "api"],
    "controller": ["handler", "endpoint", "route"],
    "database": ["db", "model", "entity", "schema", "orm", "query", "repository", "store"],
    "model": ["entity", "schema", "database", "db"],
    "user": ["account", "profile", "member", "customer"],
    "create": ["add", "new", "insert", "post", "register", "save"],
    "delete": ["remove", "destroy", "drop"],
    "update": ["edit", "modify", "patch", "put", "save"],
    "get": ["fetch", "read", "find", "query", "retrieve", "list", "load"],
    "list": ["get", "fetch", "all", "index", "browse"],
    "component": ["widget", "ui", "view", "page", "screen"],
    "page": ["screen", "view", "route", "component"],
    "api": ["endpoint", "route", "handler", "rest"],
    "route": ["endpoint", "api", "path", "handler"],
    "test": ["spec", "assert", "expect", "mock"],
    "error": ["exception", "catch", "throw", "fail"],
    "config": ["configuration", "settings", "options", "env"],
    "nav": ["navigation", "menu", "sidebar", "header"],
    "button": ["btn", "click", "action"],
    "submit": ["send", "post", "save", "confirm"],
    "validate": ["check", "verify", "assert", "sanitize"],
    "search": ["find", "query", "filter", "lookup"],
    "file": ["upload", "download", "document", "attachment"],
    "notification": ["alert", "message", "toast", "notify"],
    "schedule": ["calendar", "booking", "appointment", "time"],
    # Domain-specific terms below (recruiting/hiring vocabulary) — presumably
    # tuned for a particular target codebase; verify they are still wanted.
    "interview": ["meeting", "call", "session", "conversation"],
    "candidate": ["applicant", "user", "profile"],
    "job": ["position", "role", "posting", "vacancy"],
}
56
+
57
+
58
def expand_query(query_tokens: list[str]) -> list[str]:
    """Return *query_tokens* followed by their synonyms.

    Synonyms come from the single-hop _SYNONYMS table; duplicates are
    skipped while the original token order is preserved.
    """
    seen = set(query_tokens)
    expanded = list(query_tokens)
    for token in query_tokens:
        for candidate in _SYNONYMS.get(token, []):
            if candidate not in seen:
                seen.add(candidate)
                expanded.append(candidate)
    return expanded
67
+
68
+
69
def build_symbol_documents(index: dict) -> list[dict]:
    """Build searchable documents from the symbols recorded in *index*.

    Each document echoes the symbol's metadata (name, type, file, line,
    params, calls, doc, framework) and adds a tokenized bag-of-words built
    from the name, params, callees, optional metadata fields, and the file
    path, for TF-IDF matching.

    Note: the original implementation also read index["call_graph"] into an
    unused local; that dead read is removed here.
    """
    docs = []
    for rel_path, file_data in index.get("files", {}).items():
        if not isinstance(file_data, dict):
            continue  # tolerate malformed index entries

        for sym in file_data.get("symbols", []):
            name = sym.get("name", "")
            # Rich text representation: name + params + callees first ...
            parts = [name]
            parts.extend(sym.get("params", []))
            parts.extend(sym.get("calls", []))
            # ... then optional string metadata, in a fixed order
            for key in ("doc", "class", "framework", "type"):
                if sym.get(key):
                    parts.append(sym[key])
            # File path supplies directory/module context
            # ("auth/handlers.py" -> "auth handlers py" after tokenize)
            parts.append(rel_path.replace("/", " ").replace("\\", " "))

            text = " ".join(parts)
            docs.append({
                "name": name,
                "type": sym.get("type"),
                "file": rel_path,
                "line": sym.get("line"),
                "params": sym.get("params", []),
                "calls": sym.get("calls", []),
                "doc": sym.get("doc"),
                "framework": sym.get("framework"),
                "tokens": tokenize(text),
                "text": text,
            })

    return docs
114
+
115
+
116
class TFIDFSearcher:
    """Simple TF-IDF based semantic search (no external dependencies).

    Documents are dicts as produced by build_symbol_documents: each must
    carry a "tokens" list plus the symbol metadata echoed back in results.
    """

    def __init__(self, documents: list[dict]):
        self.documents = documents
        # token -> inverse document frequency, filled by _build_idf
        self.idf: dict[str, float] = {}
        self._build_idf()

    def _build_idf(self) -> None:
        """Compute inverse document frequency for all terms.

        IDF is clamped at 0: raw log(n / (1 + df)) goes NEGATIVE for a term
        present in roughly half the documents or more, which made shared
        common terms actively push similarity DOWN. Clamping gives such
        ubiquitous terms zero weight instead.
        """
        n = len(self.documents)
        if n == 0:
            return

        doc_freq: dict[str, int] = Counter()
        for doc in self.documents:
            # Count each term once per document
            for token in set(doc["tokens"]):
                doc_freq[token] += 1

        for token, df in doc_freq.items():
            self.idf[token] = max(math.log(n / (1 + df)), 0.0)

    def _tfidf_vector(self, tokens: list[str]) -> dict[str, float]:
        """Compute a sparse TF-IDF vector (token -> weight) for a token list."""
        tf = Counter(tokens)
        total = len(tokens) or 1  # guard div-by-zero on empty input
        vector = {}
        for token, count in tf.items():
            # Unknown tokens (not in any indexed document) get weight 0
            vector[token] = (count / total) * self.idf.get(token, 0)
        return vector

    def _cosine_similarity(self, vec_a: dict[str, float], vec_b: dict[str, float]) -> float:
        """Cosine similarity between two sparse vectors; 0.0 when disjoint or degenerate."""
        common = set(vec_a.keys()) & set(vec_b.keys())
        if not common:
            return 0.0

        dot = sum(vec_a[k] * vec_b[k] for k in common)
        mag_a = math.sqrt(sum(v ** 2 for v in vec_a.values()))
        mag_b = math.sqrt(sum(v ** 2 for v in vec_b.values()))

        if mag_a == 0 or mag_b == 0:
            return 0.0

        return dot / (mag_a * mag_b)

    def search(self, query: str, limit: int = 10) -> list[dict]:
        """Search documents by semantic similarity to *query*.

        Cosine similarity over synonym-expanded query tokens, plus
        heuristic boosts: +0.5 for an exact name-token match, +0.2 for a
        synonym-only name match, +0.15 when a query token appears as a
        substring of the docstring. Scores <= 0.01 are dropped; at most
        *limit* results are returned, best first.
        """
        query_tokens = tokenize(query)
        if not query_tokens:
            return []

        # Expand with synonyms for better recall
        expanded_tokens = expand_query(query_tokens)
        query_vec = self._tfidf_vector(expanded_tokens)

        scored = []
        for doc in self.documents:
            score = self._cosine_similarity(query_vec, self._tfidf_vector(doc["tokens"]))

            # Boost exact name matches above synonym-only matches
            name_tokens = tokenize(doc.get("name", ""))
            if any(qt in name_tokens for qt in query_tokens):
                score += 0.5
            elif any(qt in name_tokens for qt in expanded_tokens):
                score += 0.2

            # Boost docstring mentions (lowercased substring match)
            if doc.get("doc"):
                doc_lower = doc["doc"].lower()
                if any(qt in doc_lower for qt in query_tokens):
                    score += 0.15

            if score > 0.01:
                scored.append((score, doc))

        # key= avoids comparing the doc dicts on score ties
        scored.sort(key=lambda x: x[0], reverse=True)

        return [
            {
                "name": doc["name"],
                "type": doc["type"],
                "file": doc["file"],
                "line": doc["line"],
                "params": doc["params"],
                "calls": doc["calls"][:5] if doc["calls"] else [],
                "doc": doc.get("doc"),
                "framework": doc.get("framework"),
                "score": round(score, 3),
            }
            for score, doc in scored[:limit]
        ]
214
+
215
+
216
def semantic_search(index_path: Path, query: str, limit: int = 10) -> dict[str, Any]:
    """Run semantic search over a saved index.

    Args:
        index_path: Path to index.json.
        query: Natural language query (e.g. "authentication handler",
            "database models").
        limit: Maximum number of results.

    Returns:
        Dict with the original query, the ranked results, and the total
        number of indexed symbols searched.
    """
    raw = index_path.read_text(encoding="utf-8")
    documents = build_symbol_documents(json.loads(raw))
    matches = TFIDFSearcher(documents).search(query, limit)
    return {
        "query": query,
        "results": matches,
        "total_symbols": len(documents),
    }
@@ -0,0 +1,241 @@
1
+ """Dependency vulnerability scanning — check for known vulnerable packages."""
2
+
3
+ import json
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+
9
def scan_dependencies(root: Path) -> dict[str, Any]:
    """Scan a project for dependency manifests and flag common issues.

    Looks for package.json, requirements.txt, pyproject.toml, go.mod and
    Cargo.toml directly under *root*, then runs project-wide checks
    (.env exposure, missing lockfile).

    Returns a dict with "scanned_files", "dependencies", "warnings",
    "total_dependencies" and "total_warnings".
    """
    root = Path(root).resolve()
    results: dict[str, Any] = {
        "scanned_files": [],
        "dependencies": [],
        "warnings": [],
    }

    # Manifest filename -> scanner; each scanner appends into `results`.
    scanners = (
        ("package.json", _scan_package_json),
        ("requirements.txt", _scan_requirements_txt),
        ("pyproject.toml", _scan_pyproject_toml),
        ("go.mod", _scan_go_mod),
        ("Cargo.toml", _scan_cargo_toml),
    )
    for filename, scanner in scanners:
        manifest = root / filename
        if manifest.exists():
            scanner(manifest, results)

    # Project-level checks independent of any single manifest
    _check_common_issues(root, results)

    results["total_dependencies"] = len(results["dependencies"])
    results["total_warnings"] = len(results["warnings"])
    return results
53
+
54
+
55
def _scan_package_json(path: Path, results: dict) -> None:
    """Parse package.json and record its (dev)dependencies into *results*.

    Unparseable or unreadable files are skipped silently (best-effort scan).
    """
    try:
        manifest = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        return
    results["scanned_files"].append(str(path.name))

    for section in ("dependencies", "devDependencies"):
        for name, version in manifest.get(section, {}).items():
            results["dependencies"].append({
                "name": name,
                "version": version,
                "source": "package.json",
                "dev": section == "devDependencies",
            })

            # Flag fully unpinned specifiers
            if version in ("*", "latest", ""):
                results["warnings"].append({
                    "package": name,
                    "severity": "medium",
                    "message": f"Unpinned version '{version}' — use a specific version range",
                })

            # Flag known risky specifier patterns (insecure URLs etc.)
            _check_npm_warnings(name, version, results)
84
+
85
+
86
+ def _scan_requirements_txt(path: Path, results: dict) -> None:
87
+ """Scan requirements.txt."""
88
+ try:
89
+ results["scanned_files"].append(str(path.name))
90
+ for line in path.read_text(encoding="utf-8").splitlines():
91
+ line = line.strip()
92
+ if not line or line.startswith("#") or line.startswith("-"):
93
+ continue
94
+
95
+ # Parse name==version or name>=version
96
+ match = re.match(r'^([a-zA-Z0-9_-]+)\s*([><=!~]+)?\s*(.*)$', line)
97
+ if match:
98
+ name = match.group(1)
99
+ op = match.group(2) or ""
100
+ version = match.group(3) or "unpinned"
101
+
102
+ results["dependencies"].append({
103
+ "name": name,
104
+ "version": f"{op}{version}" if op else version,
105
+ "source": "requirements.txt",
106
+ "dev": False,
107
+ })
108
+
109
+ if not op:
110
+ results["warnings"].append({
111
+ "package": name,
112
+ "severity": "medium",
113
+ "message": "No version constraint — pin to a specific version",
114
+ })
115
+ except OSError:
116
+ pass
117
+
118
+
119
+ def _scan_pyproject_toml(path: Path, results: dict) -> None:
120
+ """Scan pyproject.toml for dependencies."""
121
+ try:
122
+ content = path.read_text(encoding="utf-8")
123
+ results["scanned_files"].append(str(path.name))
124
+
125
+ # Simple TOML parsing for dependencies array
126
+ in_deps = False
127
+ for line in content.splitlines():
128
+ stripped = line.strip()
129
+ if stripped.startswith("dependencies") and "=" in stripped:
130
+ in_deps = True
131
+ continue
132
+ if in_deps:
133
+ if stripped == "]":
134
+ in_deps = False
135
+ continue
136
+ # Parse "package>=version"
137
+ match = re.search(r'"([^"]+)"', stripped)
138
+ if match:
139
+ dep_str = match.group(1)
140
+ dep_match = re.match(r'^([a-zA-Z0-9_-]+)\s*([><=!~]+)?\s*(.*)$', dep_str)
141
+ if dep_match:
142
+ results["dependencies"].append({
143
+ "name": dep_match.group(1),
144
+ "version": f"{dep_match.group(2) or ''}{dep_match.group(3) or 'unpinned'}",
145
+ "source": "pyproject.toml",
146
+ "dev": False,
147
+ })
148
+ except OSError:
149
+ pass
150
+
151
+
152
+ def _scan_go_mod(path: Path, results: dict) -> None:
153
+ """Scan go.mod."""
154
+ try:
155
+ content = path.read_text(encoding="utf-8")
156
+ results["scanned_files"].append("go.mod")
157
+
158
+ for line in content.splitlines():
159
+ line = line.strip()
160
+ if line.startswith("require") or line.startswith(")") or line.startswith("("):
161
+ continue
162
+ parts = line.split()
163
+ if len(parts) >= 2 and "/" in parts[0]:
164
+ results["dependencies"].append({
165
+ "name": parts[0],
166
+ "version": parts[1],
167
+ "source": "go.mod",
168
+ "dev": False,
169
+ })
170
+ except OSError:
171
+ pass
172
+
173
+
174
+ def _scan_cargo_toml(path: Path, results: dict) -> None:
175
+ """Scan Cargo.toml."""
176
+ try:
177
+ content = path.read_text(encoding="utf-8")
178
+ results["scanned_files"].append("Cargo.toml")
179
+
180
+ in_deps = False
181
+ for line in content.splitlines():
182
+ stripped = line.strip()
183
+ if stripped == "[dependencies]":
184
+ in_deps = True
185
+ continue
186
+ elif stripped.startswith("["):
187
+ in_deps = False
188
+ continue
189
+ if in_deps and "=" in stripped:
190
+ parts = stripped.split("=", 1)
191
+ name = parts[0].strip()
192
+ version = parts[1].strip().strip('"')
193
+ results["dependencies"].append({
194
+ "name": name,
195
+ "version": version,
196
+ "source": "Cargo.toml",
197
+ "dev": False,
198
+ })
199
+ except OSError:
200
+ pass
201
+
202
+
203
+ def _check_npm_warnings(name: str, version: str, results: dict) -> None:
204
+ """Check for commonly known risky npm patterns."""
205
+ # Check for http:// or git:// protocol in version
206
+ if version.startswith("http://") or version.startswith("git://"):
207
+ results["warnings"].append({
208
+ "package": name,
209
+ "severity": "high",
210
+ "message": "Insecure protocol in dependency URL",
211
+ })
212
+
213
+
214
+ def _check_common_issues(root: Path, results: dict) -> None:
215
+ """Check for common security issues in the project."""
216
+ # .env file committed
217
+ env_file = root / ".env"
218
+ if env_file.exists():
219
+ gitignore = root / ".gitignore"
220
+ if gitignore.exists():
221
+ gi_content = gitignore.read_text(encoding="utf-8", errors="ignore")
222
+ if ".env" not in gi_content:
223
+ results["warnings"].append({
224
+ "package": ".env",
225
+ "severity": "high",
226
+ "message": ".env file exists but is not in .gitignore — secrets may be exposed",
227
+ })
228
+ else:
229
+ results["warnings"].append({
230
+ "package": ".env",
231
+ "severity": "high",
232
+ "message": ".env file exists with no .gitignore — secrets may be exposed",
233
+ })
234
+
235
+ # package-lock.json missing
236
+ if (root / "package.json").exists() and not (root / "package-lock.json").exists() and not (root / "yarn.lock").exists():
237
+ results["warnings"].append({
238
+ "package": "lockfile",
239
+ "severity": "medium",
240
+ "message": "No lockfile (package-lock.json or yarn.lock) — builds may not be reproducible",
241
+ })
cortexcode/watcher.py ADDED
@@ -0,0 +1,122 @@
1
+ """File Watcher - Auto-reindex on file changes."""
2
+
3
+ import time
4
+ from pathlib import Path
5
+
6
+ from watchdog.events import FileSystemEventHandler, FileSystemEvent
7
+ from watchdog.observers import Observer
8
+
9
+ from cortexcode import indexer
10
+
11
+
12
class IndexEventHandler(FileSystemEventHandler):
    """Watchdog handler that re-indexes the project when source files change.

    Events are debounced: at most one reindex per `debounce_seconds`.
    NOTE(review): there is no timer — events arriving inside the debounce
    window only accumulate in `pending_files` and are processed when a
    LATER event arrives outside the window. A final burst of changes with
    no follow-up event may therefore never trigger a reindex.
    """

    def __init__(self, root_path: Path, debounce_seconds: float = 1.0):
        # Project root being watched; the index lives under .cortexcode/
        self.root_path = root_path
        self.index_path = root_path / ".cortexcode" / "index.json"
        # Minimum interval between reindex runs, in seconds
        self.debounce_seconds = debounce_seconds
        # time.time() of the last successful reindex (0.0 = never)
        self.last_index_time = 0.0
        # Changed paths seen since the last reindex; cleared before each run
        self.pending_files: set[str] = set()
        # When True, print a timestamped line per reindex instead of a dot
        self.verbose = False

    def on_modified(self, event: FileSystemEvent) -> None:
        """Queue a modified file and maybe trigger a debounced reindex."""
        if event.is_directory:
            return
        if not self._should_index(event.src_path):
            return

        self.pending_files.add(event.src_path)
        self._maybe_reindex()

    def on_created(self, event: FileSystemEvent) -> None:
        """Queue a newly created file and maybe trigger a debounced reindex."""
        if event.is_directory:
            return
        if not self._should_index(event.src_path):
            return

        self.pending_files.add(event.src_path)
        self._maybe_reindex()

    def on_deleted(self, event: FileSystemEvent) -> None:
        """On deletion, attempt a reindex without queueing the removed path.

        NOTE(review): deletions do not add to pending_files, so if nothing
        else is pending, _maybe_reindex returns early and the deleted file
        stays in the index until the next change elsewhere.
        """
        if event.is_directory:
            return
        self._maybe_reindex()

    def _should_index(self, path: str) -> bool:
        """Return True if *path* has a supported extension and is not in an ignored directory."""
        path_obj = Path(path)
        ext = path_obj.suffix.lower()

        # CodeIndexer is imported at the BOTTOM of this module; that import
        # has run by the time watchdog delivers events, but it is fragile —
        # consider moving it to the top-of-file import block.
        if ext not in CodeIndexer.SUPPORTED_EXTENSIONS:
            return False

        ignore_patterns = {
            "__pycache__", ".git", ".venv", "venv", "node_modules",
            ".pytest_cache", ".mypy_cache", ".ruff_cache", ".cortexcode"
        }

        # Substring match against the whole path — a directory named e.g.
        # "my.git" would also be skipped; acceptable for this heuristic.
        path_str = str(path_obj)
        return not any(pattern in path_str for pattern in ignore_patterns)

    def _maybe_reindex(self) -> None:
        """Run a reindex unless one happened within the debounce window."""
        now = time.time()

        # Inside the window: keep accumulating, reindex on a later event
        if now - self.last_index_time < self.debounce_seconds:
            return

        if not self.pending_files:
            return

        # Clear BEFORE reindexing so events arriving during the (full)
        # reindex are queued for the next run
        self.pending_files.clear()
        self._reindex()

    def _reindex(self) -> None:
        """Re-run the full project index and persist it to disk."""
        try:
            # Full reindex of the whole tree — pending_files is only used
            # as a trigger, not to do incremental work
            index_data = indexer.index_directory(self.root_path)
            indexer.save_index(index_data, self.index_path)

            if self.verbose:
                print(f"[CortexCode] Re-indexed at {time.strftime('%H:%M:%S')}")
            else:
                print(".", end="", flush=True)

            # Only advance the debounce clock on success, so a failed run
            # is retried on the next event
            self.last_index_time = time.time()
        except Exception as e:
            # Best-effort: report and keep watching rather than crash the watcher
            print(f"\n[CortexCode] Error re-indexing: {e}")
89
+
90
+
91
def start_watcher(root_path: Path, verbose: bool = False) -> None:
    """Watch *root_path* recursively and keep its index up to date.

    Requires an existing index (created by 'cortexcode index'); prints a
    message and returns otherwise. Blocks until interrupted with Ctrl-C.

    Args:
        root_path: Directory to watch.
        verbose: Print a timestamped line per reindex instead of a dot.
    """
    root_path = Path(root_path).resolve()
    if not (root_path / ".cortexcode" / "index.json").exists():
        print("[CortexCode] No index found. Run 'cortexcode index' first.")
        return

    handler = IndexEventHandler(root_path)
    handler.verbose = verbose

    watcher = Observer()
    watcher.schedule(handler, str(root_path), recursive=True)
    watcher.start()

    try:
        # Main thread just sleeps; the observer works on its own thread
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()

    watcher.join()
119
+
120
+
121
+ # Import SUPPORTED_EXTENSIONS from indexer
122
+ from cortexcode.indexer import CodeIndexer