codebase-retrieval-context-engine 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
- corbell/__init__.py +6 -0
- corbell/cli/__init__.py +1 -0
- corbell/cli/commands/__init__.py +1 -0
- corbell/cli/commands/index.py +86 -0
- corbell/cli/commands/query.py +71 -0
- corbell/cli/main.py +57 -0
- corbell/core/__init__.py +1 -0
- corbell/core/constants.py +52 -0
- corbell/core/embeddings/__init__.py +6 -0
- corbell/core/embeddings/base.py +68 -0
- corbell/core/embeddings/extractor.py +201 -0
- corbell/core/embeddings/factory.py +48 -0
- corbell/core/embeddings/model.py +401 -0
- corbell/core/embeddings/search_cache.py +95 -0
- corbell/core/embeddings/sqlite_store.py +271 -0
- corbell/core/gitignore.py +76 -0
- corbell/core/graph/__init__.py +1 -0
- corbell/core/graph/builder.py +696 -0
- corbell/core/graph/method_graph.py +1077 -0
- corbell/core/graph/providers/__init__.py +6 -0
- corbell/core/graph/providers/aws_patterns.py +62 -0
- corbell/core/graph/providers/azure_patterns.py +64 -0
- corbell/core/graph/providers/gcp_patterns.py +59 -0
- corbell/core/graph/schema.py +175 -0
- corbell/core/graph/sqlite_store.py +500 -0
- corbell/core/indexing/__init__.py +1 -0
- corbell/core/indexing/builder.py +608 -0
- corbell/core/indexing/lock.py +150 -0
- corbell/core/indexing/tracker.py +245 -0
- corbell/core/llm_client.py +677 -0
- corbell/core/mcp/__init__.py +1 -0
- corbell/core/mcp/server.py +214 -0
- corbell/core/query/__init__.py +1 -0
- corbell/core/query/diagnostics.py +38 -0
- corbell/core/query/engine.py +321 -0
- corbell/core/query/enhancer.py +102 -0
- corbell/core/query/formatter.py +98 -0
- corbell/core/query/graph_expander.py +284 -0
- corbell/core/query/merger.py +171 -0
- corbell/core/query/reranker.py +131 -0
- corbell/core/workspace.py +408 -0
|
@@ -0,0 +1,696 @@
|
|
|
1
|
+
"""Service-level graph builder.
|
|
2
|
+
|
|
3
|
+
Scans local repos and builds a service dependency graph.
|
|
4
|
+
Scans local repos, detects service boundaries, DB/queue deps, and HTTP calls.
|
|
5
|
+
No Neo4j dependency — uses the pluggable GraphStore interface.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
import pathspec
|
|
15
|
+
|
|
16
|
+
from corbell.core.graph.schema import (
|
|
17
|
+
DataStoreNode,
|
|
18
|
+
DependencyEdge,
|
|
19
|
+
GraphStore,
|
|
20
|
+
QueueNode,
|
|
21
|
+
ServiceNode,
|
|
22
|
+
)
|
|
23
|
+
from corbell.core.gitignore import load_gitignore
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Service pattern detection rules
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
# Service-detection rules for Python sources. Each rule pairs a literal source
# snippet ("pattern") with the service "type" and "framework" labels that the
# snippet is associated with.
_PYTHON_SERVICE_PATTERNS = [
    {"pattern": "FastAPI(", "type": "api", "framework": "fastapi"},
    {"pattern": "Flask(__name__)", "type": "api", "framework": "flask"},
    {"pattern": "@app.route", "type": "api", "framework": "flask"},
    {"pattern": "@celery.task", "type": "worker", "framework": "celery"},
    {"pattern": "@app.task", "type": "worker", "framework": "celery"},
    {"pattern": "@click.command", "type": "cli", "framework": "click"},
    {"pattern": "argparse.ArgumentParser", "type": "cli", "framework": "argparse"},
    {"pattern": "typer.Typer(", "type": "cli", "framework": "typer"},
    # Single-quoted form only; a double-quoted "__main__" guard is a different literal.
    {"pattern": "if __name__ == '__main__':", "type": "service", "framework": "stdlib"},
]

# Rules shared by JavaScript and TypeScript sources.
_JS_SERVICE_PATTERNS = [
    {"pattern": "express()", "type": "api", "framework": "express"},
    {"pattern": "app.listen(", "type": "api", "framework": "express"},
    {"pattern": "@Controller(", "type": "api", "framework": "nestjs"},
]

# Rules for Java sources.
_JAVA_SERVICE_PATTERNS = [
    {"pattern": "@RestController", "type": "api", "framework": "spring"},
    {"pattern": "@Controller", "type": "api", "framework": "spring"},
    {"pattern": "public static void main(", "type": "service", "framework": "stdlib"},
]

# Rules for Go sources.
_GO_SERVICE_PATTERNS = [
    {"pattern": "http.ListenAndServe", "type": "api", "framework": "net/http"},
    {"pattern": "gin.Default()", "type": "api", "framework": "gin"},
    {"pattern": "func main()", "type": "service", "framework": "stdlib"},
]

# Language name -> service-detection rules ("javascript" and "typescript"
# point at the same list).
# NOTE(review): no reader of this table is visible in this part of the module —
# confirm it is consumed elsewhere.
_LANG_SERVICE_PATTERNS = {
    "python": _PYTHON_SERVICE_PATTERNS,
    "javascript": _JS_SERVICE_PATTERNS,
    "typescript": _JS_SERVICE_PATTERNS,
    "java": _JAVA_SERVICE_PATTERNS,
    "go": _GO_SERVICE_PATTERNS,
}
|
|
66
|
+
|
|
67
|
+
# Datastore-detection rules for Python sources. Each rule pairs a literal
# snippet ("pattern") with the "db_type" label used to build the datastore node
# id. ``_detect_db_deps`` looks the snippet up with ``str.find`` and evaluates
# every rule for every file, so several rules may fire for the same file.
_PYTHON_DB_PATTERNS = [
    {"pattern": "psycopg2.connect", "db_type": "postgres"},
    # NOTE(review): presumably SQLAlchemy's create_engine, which is not tied to
    # one database — confirm that labelling it "postgres" is intended.
    {"pattern": "create_engine(", "db_type": "postgres"},
    {"pattern": "asyncpg.create_pool", "db_type": "postgres"},
    {"pattern": "MongoClient(", "db_type": "mongodb"},
    {"pattern": "redis.Redis(", "db_type": "redis"},
    {"pattern": "redis.StrictRedis(", "db_type": "redis"},
    # This rule contains a quoted argument, so whether it can match depends on
    # how the scanner treats string literals in the text it searches.
    {"pattern": "boto3.resource('dynamodb')", "db_type": "dynamodb"},
    {"pattern": "sqlite3.connect", "db_type": "sqlite"},
    {"pattern": "chromadb.PersistentClient", "db_type": "chromadb"},
    {"pattern": "GraphDatabase.driver", "db_type": "neo4j"},
]

# Datastore-detection rules shared by JavaScript and TypeScript sources.
_JS_DB_PATTERNS = [
    {"pattern": "pg.Pool(", "db_type": "postgres"},
    {"pattern": "new Pool(", "db_type": "postgres"},
    {"pattern": "createPool(", "db_type": "mysql"},
    {"pattern": "mongoose.connect(", "db_type": "mongodb"},
    {"pattern": "new MongoClient(", "db_type": "mongodb"},
    {"pattern": "redis.createClient(", "db_type": "redis"},
    {"pattern": "new Redis(", "db_type": "redis"},
    # Overlaps with the "createClient({ url" rule below: any text containing
    # that snippet also contains this one, so both labels are produced.
    {"pattern": "createClient({", "db_type": "redis"},
    {"pattern": "new Sequelize(", "db_type": "postgres"},
    {"pattern": "DynamoDBClient(", "db_type": "dynamodb"},
    # NOTE(review): labelled "supabase"; verify this call shape really
    # identifies Supabase rather than another client taking ``{ url ... }``.
    {"pattern": "createClient({ url", "db_type": "supabase"},
    {"pattern": "PrismaClient", "db_type": "postgres"},
    {"pattern": "knex(", "db_type": "postgres"},
]

# Datastore-detection rules for Go sources.
_GO_DB_PATTERNS = [
    {"pattern": "sql.Open(", "db_type": "postgres"},
    {"pattern": "pgx.Connect(", "db_type": "postgres"},
    {"pattern": "gorm.Open(", "db_type": "postgres"},
    {"pattern": "mongo.Connect(", "db_type": "mongodb"},
    {"pattern": "redis.NewClient(", "db_type": "redis"},
    {"pattern": "dynamodb.New(", "db_type": "dynamodb"},
    # NOTE(review): "bolt.Open(" is labelled "sqlite" — confirm this is deliberate.
    {"pattern": "bolt.Open(", "db_type": "sqlite"},
    {"pattern": "neo4j.NewDriver(", "db_type": "neo4j"},
]

# Datastore-detection rules for Java sources.
_JAVA_DB_PATTERNS = [
    {"pattern": "DriverManager.getConnection(", "db_type": "postgres"},
    {"pattern": "@Repository", "db_type": "postgres"},
    {"pattern": "JdbcTemplate", "db_type": "postgres"},
    {"pattern": "new MongoClient(", "db_type": "mongodb"},
    {"pattern": "MongoClients.create(", "db_type": "mongodb"},
    {"pattern": "JedisPool(", "db_type": "redis"},
    {"pattern": "RedisConnectionFactory", "db_type": "redis"},
    {"pattern": "EntityManager", "db_type": "postgres"},
]

# Language name -> datastore rules. "ruby" is present with no rules; languages
# missing from the table are handled by callers via ``.get(lang, [])``.
_LANG_DB_PATTERNS: Dict[str, List] = {
    "python": _PYTHON_DB_PATTERNS,
    "javascript": _JS_DB_PATTERNS,
    "typescript": _JS_DB_PATTERNS,
    "java": _JAVA_DB_PATTERNS,
    "go": _GO_DB_PATTERNS,
    "ruby": [],
}
|
|
126
|
+
|
|
127
|
+
# Queue-detection rules for Python sources. Each rule pairs a literal snippet
# ("pattern") with the "queue_type" label used to build the queue node id.
# ``_detect_queue_deps`` looks the snippet up with ``str.find``.
_PYTHON_QUEUE_PATTERNS = [
    # This rule contains a quoted argument, so whether it can match depends on
    # how the scanner treats string literals in the text it searches.
    {"pattern": "boto3.client('sqs')", "queue_type": "sqs"},
    {"pattern": "pika.BlockingConnection", "queue_type": "rabbitmq"},
    {"pattern": "KafkaProducer(", "queue_type": "kafka"},
    {"pattern": "KafkaConsumer(", "queue_type": "kafka"},
]

# Queue-detection rules shared by JavaScript and TypeScript sources.
_JS_QUEUE_PATTERNS = [
    {"pattern": "new Kafka(", "queue_type": "kafka"},
    {"pattern": "kafkajs", "queue_type": "kafka"},
    {"pattern": "amqplib.connect(", "queue_type": "rabbitmq"},
    {"pattern": "new SQSClient(", "queue_type": "sqs"},
    {"pattern": "new Bull(", "queue_type": "redis"},
    # NOTE(review): "new Queue(" is a generic constructor name labelled
    # "redis" — confirm the label is intended for every queue library.
    {"pattern": "new Queue(", "queue_type": "redis"},
    {"pattern": "PubSub(", "queue_type": "pubsub"},
]

# Queue-detection rules for Go sources.
_GO_QUEUE_PATTERNS = [
    {"pattern": "kafka.NewWriter(", "queue_type": "kafka"},
    {"pattern": "sarama.NewClient(", "queue_type": "kafka"},
    {"pattern": "amqp.Dial(", "queue_type": "rabbitmq"},
    {"pattern": "sqs.New(", "queue_type": "sqs"},
    {"pattern": "pubsub.NewClient(", "queue_type": "pubsub"},
]

# Queue-detection rules for Java sources.
_JAVA_QUEUE_PATTERNS = [
    {"pattern": "KafkaProducer(", "queue_type": "kafka"},
    {"pattern": "@KafkaListener", "queue_type": "kafka"},
    {"pattern": "RabbitTemplate", "queue_type": "rabbitmq"},
    {"pattern": "@RabbitListener", "queue_type": "rabbitmq"},
    {"pattern": "AmazonSQS", "queue_type": "sqs"},
    {"pattern": "@SqsListener", "queue_type": "sqs"},
]

# Language name -> queue rules. "ruby" is present with no rules.
_LANG_QUEUE_PATTERNS: Dict[str, List] = {
    "python": _PYTHON_QUEUE_PATTERNS,
    "javascript": _JS_QUEUE_PATTERNS,
    "typescript": _JS_QUEUE_PATTERNS,
    "java": _JAVA_QUEUE_PATTERNS,
    "go": _GO_QUEUE_PATTERNS,
    "ruby": [],
}
|
|
169
|
+
|
|
170
|
+
# HTTP-client markers for Python sources. ``_detect_http_calls`` only tests
# whether any "pattern" occurs in a file (substring test); the "call_type"
# value is not read there.
_PYTHON_HTTP_PATTERNS = [
    {"pattern": "requests.", "call_type": "http_call"},
    {"pattern": "httpx.", "call_type": "http_call"},
    # Already covered by the "httpx." rule above (it is a prefix of this one).
    {"pattern": "httpx.AsyncClient", "call_type": "http_call"},
    {"pattern": "aiohttp.ClientSession", "call_type": "http_call"},
    {"pattern": "urllib.request", "call_type": "http_call"},
]

# HTTP-client markers shared by JavaScript and TypeScript sources.
_JS_HTTP_PATTERNS = [
    {"pattern": "fetch(", "call_type": "http_call"},
    {"pattern": "axios.get(", "call_type": "http_call"},
    {"pattern": "axios.post(", "call_type": "http_call"},
    {"pattern": "axios.request(", "call_type": "http_call"},
    {"pattern": "axios.create(", "call_type": "http_call"},
    {"pattern": "http.get(", "call_type": "http_call"},
    {"pattern": "got.get(", "call_type": "http_call"},
    {"pattern": "superagent.get(", "call_type": "http_call"},
]

# HTTP-client markers for Go sources.
_GO_HTTP_PATTERNS = [
    {"pattern": "http.Get(", "call_type": "http_call"},
    {"pattern": "http.Post(", "call_type": "http_call"},
    {"pattern": "http.NewRequest(", "call_type": "http_call"},
    {"pattern": "client.Do(", "call_type": "http_call"},
]

# HTTP-client markers for Java sources.
_JAVA_HTTP_PATTERNS = [
    {"pattern": "HttpClient", "call_type": "http_call"},
    {"pattern": "RestTemplate", "call_type": "http_call"},
    {"pattern": "WebClient", "call_type": "http_call"},
    {"pattern": "HttpURLConnection", "call_type": "http_call"},
    # Already covered by the "HttpClient" rule above (it is a substring of this one).
    {"pattern": "OkHttpClient", "call_type": "http_call"},
]

# Language name -> HTTP-client markers. "ruby" is present with no markers.
_LANG_HTTP_PATTERNS: Dict[str, List] = {
    "python": _PYTHON_HTTP_PATTERNS,
    "javascript": _JS_HTTP_PATTERNS,
    "typescript": _JS_HTTP_PATTERNS,
    "java": _JAVA_HTTP_PATTERNS,
    "go": _GO_HTTP_PATTERNS,
    "ruby": [],
}
|
|
212
|
+
|
|
213
|
+
# Env-var patterns that indicate a URL is looked up from config (any language).
# These are plain substrings tested against raw file text; note that
# "os.environ.get(" is not listed although the env-var regex in
# ``_detect_http_calls`` recognises it.
_ENV_URL_PATTERNS = [
    "process.env.", "os.getenv(", "os.environ[",
    "System.getenv(", "os.Getenv(",
]

# Path components that exclude a file from scanning. ``_should_skip`` compares
# every component of the path it is given against this set.
_SKIP_DIRS = {
    ".git", "__pycache__", "node_modules", "venv", "env", ".venv", "tests",
    ".pytest_cache", "dist", "build", ".next", ".nuxt", "target", "bin",
    "obj", "coverage", ".tox",
}

# File suffix -> language name. ``_iter_files`` uses membership in this map to
# decide which source files are scannable.
_EXTENSION_LANG = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".jsx": "javascript",
    ".java": "java",
    ".go": "go",
    ".rb": "ruby",
}
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
class ServiceGraphBuilder:
|
|
238
|
+
"""Build a service-level dependency graph by scanning local repositories."""
|
|
239
|
+
|
|
240
|
+
def __init__(self, graph_store: GraphStore) -> None:
    """Initialize with any GraphStore backend.

    Args:
        graph_store: Instance of :class:`~corbell.core.graph.schema.GraphStore`.
            It is kept on ``self.store``; the other methods of this class call
            ``clear``, ``upsert_node``, ``upsert_edge`` and
            ``get_all_nodes_summary`` on it.
    """
    # Only the reference is stored; the store itself is not touched here.
    self.store = graph_store
|
|
247
|
+
|
|
248
|
+
def build_from_workspace(
    self,
    services: List[Dict[str, Any]],
    clear_existing: bool = True,
    method_level: bool = False,
) -> Dict[str, Any]:
    """Scan every configured service repo and write the result to the store.

    The build runs in phases: register one service node per existing repo,
    attach datastore and queue dependencies, link services to each other
    (HTTP/RPC calls and library dependencies) and, optionally, build the
    method-level call graph.

    Args:
        services: Service descriptions. ``id`` is required; the repo location
            is taken from ``resolved_path`` when set, otherwise from ``repo``.
            ``language`` defaults to ``"python"`` and ``tags`` to ``[]``.
        clear_existing: When true, ``self.store.clear()`` is called first.
        method_level: When true, ``MethodGraphBuilder`` is run for every
            discovered service.

    Returns:
        The dict produced by ``self.store.get_all_nodes_summary()``, with a
        ``"service_diagnostics"`` entry added when the method-level phase
        produced any results.
    """
    if clear_existing:
        self.store.clear()

    # Phase 1: one ServiceNode per repo that exists on disk.
    discovered: List[Dict] = []
    for entry in services:
        service_id = entry["id"]
        root = Path(entry.get("resolved_path") or entry["repo"])
        lang = entry.get("language", "python")
        service_tags = entry.get("tags", [])

        if not root.exists():
            # Repos that are not checked out locally are left out entirely.
            continue

        # The file list is collected once and reused by the later phases.
        ignore_spec = load_gitignore(root)
        scanned = list(self._iter_files(root, lang, ignore_spec))
        kind = self._detect_service_type(scanned, lang)

        self.store.upsert_node(
            ServiceNode(
                id=service_id,
                name=service_id,
                repo=str(root),
                language=lang,
                tags=service_tags,
                service_type=kind,
            )
        )
        discovered.append(
            {"id": service_id, "repo_path": root, "language": lang, "files": scanned}
        )

    # Phase 2: datastore and queue dependencies. The id sets are shared across
    # services so each datastore/queue node is created only once.
    seen_datastores: set = set()
    seen_queues: set = set()
    for record in discovered:
        self._detect_db_deps(record, seen_datastores)
        self._detect_queue_deps(record, seen_queues)

    # Phase 3: inter-service links (best-effort heuristics).
    known_ids = {record["id"] for record in discovered}
    for record in discovered:
        self._detect_http_calls(record, discovered)
        self._detect_library_deps(record, known_ids)

    # Phase 4: optional method-level call graph.
    service_diagnostics: Dict[str, Any] = {}
    if method_level:
        # Imported lazily so the method-graph module is only loaded on demand.
        from corbell.core.graph.method_graph import MethodGraphBuilder

        method_builder = MethodGraphBuilder(self.store)
        for record in discovered:
            service_diagnostics[record["id"]] = method_builder.build_for_service(
                record["id"], record["repo_path"]
            )

    summary = self.store.get_all_nodes_summary()
    if service_diagnostics:
        summary["service_diagnostics"] = service_diagnostics
    return summary
|
|
334
|
+
|
|
335
|
+
# ------------------------------------------------------------------ #
|
|
336
|
+
# Internal scanning helpers #
|
|
337
|
+
# ------------------------------------------------------------------ #
|
|
338
|
+
|
|
339
|
+
def _detect_service_type(self, files: List[Path], language: str) -> str:
|
|
340
|
+
"""Heuristically detect if a service is an infrastructure repo (CDK, Pulumi, TF, etc.)."""
|
|
341
|
+
if language in ("typescript", "javascript"):
|
|
342
|
+
for fp in files:
|
|
343
|
+
if fp.name == "package.json":
|
|
344
|
+
content = self._read(fp)
|
|
345
|
+
infra_deps = [
|
|
346
|
+
"aws-cdk", "aws-cdk-lib", "@aws-cdk/core",
|
|
347
|
+
"cdktf", "@pulumi/pulumi", "serverless", "sst"
|
|
348
|
+
]
|
|
349
|
+
if any(dep in content for dep in infra_deps):
|
|
350
|
+
return "infrastructure"
|
|
351
|
+
|
|
352
|
+
elif language == "python":
|
|
353
|
+
for fp in files:
|
|
354
|
+
if fp.name in ("requirements.txt", "Pipfile", "pyproject.toml"):
|
|
355
|
+
content = self._read(fp)
|
|
356
|
+
infra_deps = ["aws-cdk-lib", "pulumi", "cdktf"]
|
|
357
|
+
if any(dep in content for dep in infra_deps):
|
|
358
|
+
return "infrastructure"
|
|
359
|
+
|
|
360
|
+
elif language == "go":
|
|
361
|
+
for fp in files:
|
|
362
|
+
if fp.name == "go.mod":
|
|
363
|
+
content = self._read(fp)
|
|
364
|
+
infra_deps = ["github.com/pulumi/pulumi", "github.com/aws/aws-cdk-go", "github.com/hashicorp/terraform-cdk-go"]
|
|
365
|
+
if any(dep in content for dep in infra_deps):
|
|
366
|
+
return "infrastructure"
|
|
367
|
+
|
|
368
|
+
# If we see terraform files directly, we can safely assume it's infra
|
|
369
|
+
for fp in files:
|
|
370
|
+
if fp.suffix in (".tf", ".tfvars"):
|
|
371
|
+
return "infrastructure"
|
|
372
|
+
|
|
373
|
+
return "service"
|
|
374
|
+
|
|
375
|
+
def _iter_files(
    self,
    repo_path: Path,
    language: str,
    gitignore_spec: Optional[pathspec.PathSpec] = None,
):
    """Yield all scannable files in a repo.

    A file is yielded when it is a dependency manifest, a Terraform file, or
    has a suffix listed in ``_EXTENSION_LANG``. Files rejected by
    ``_should_skip`` or matched by the gitignore spec are left out.

    Args:
        repo_path: Root directory of the repository.
        language: Declared language of the service. It does not narrow the
            result: every known source suffix is yielded, so repos that mix
            languages are scanned in full.
        gitignore_spec: Pre-loaded ignore rules; loaded from ``repo_path``
            when omitted.

    Yields:
        Paths of the files to scan, as produced by ``repo_path.rglob``.
    """
    if gitignore_spec is None:
        gitignore_spec = load_gitignore(repo_path)

    # Pipfile / pyproject.toml and Terraform files are included because
    # _detect_service_type looks for them in the list produced here.
    manifests = {
        "package.json", "requirements.txt", "go.mod", "pom.xml", "build.gradle",
        "Pipfile", "pyproject.toml",
    }
    terraform_suffixes = (".tf", ".tfvars")

    for fp in repo_path.rglob("*"):
        if not fp.is_file():
            continue
        rel = fp.relative_to(repo_path)
        # Judge the repo-relative path: a checkout that lives below a directory
        # called e.g. "build" or "env" must not have all of its files skipped.
        if self._should_skip(rel):
            continue
        if gitignore_spec.match_file(str(rel).replace("\\", "/")):
            continue
        if (
            fp.name in manifests
            or fp.suffix in terraform_suffixes
            or fp.suffix in _EXTENSION_LANG
        ):
            yield fp
|
|
398
|
+
|
|
399
|
+
def _should_skip(self, fp: Path) -> bool:
    """Return True when ``fp`` should be excluded from scanning.

    A path is excluded when any of its components is listed in ``_SKIP_DIRS``
    or when its file name starts with ``test_`` or ends with ``_test.py``.
    """
    for component in fp.parts:
        if component in _SKIP_DIRS:
            return True
    filename = fp.name
    return filename.startswith("test_") or filename.endswith("_test.py")
|
|
406
|
+
|
|
407
|
+
def _read(self, fp: Path) -> str:
|
|
408
|
+
try:
|
|
409
|
+
return fp.read_text(encoding="utf-8", errors="ignore")
|
|
410
|
+
except Exception:
|
|
411
|
+
return ""
|
|
412
|
+
|
|
413
|
+
def _strip_comments_and_strings(self, content: str) -> str:
|
|
414
|
+
import re
|
|
415
|
+
content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*"', '""', content)
|
|
416
|
+
content = re.sub(r"(?<!\\)'(?:[^'\\]|\\.)*'", "''", content)
|
|
417
|
+
content = re.sub(r'(?<!\\)`(?:[^`\\]|\\.)*`', '``', content)
|
|
418
|
+
content = re.sub(r'//.*', '', content)
|
|
419
|
+
content = re.sub(r'#.*', '', content)
|
|
420
|
+
content = re.sub(r'/\*.*?\*/', '', content, flags=re.DOTALL)
|
|
421
|
+
return content
|
|
422
|
+
|
|
423
|
+
def _extract_db_name(self, content: str, db_type: str) -> Optional[str]:
|
|
424
|
+
"""Try to extract a database name from connection strings."""
|
|
425
|
+
if db_type == 'sqlite':
|
|
426
|
+
match = re.search(r'sqlite3\.connect\([\'"]([^\'"]+)[\'"]\)', content)
|
|
427
|
+
if match:
|
|
428
|
+
return Path(match.group(1)).name
|
|
429
|
+
elif db_type == 'chromadb':
|
|
430
|
+
match = re.search(r'path=[\'"]([^\'"]+)[\'"]', content)
|
|
431
|
+
if match:
|
|
432
|
+
return Path(match.group(1)).name
|
|
433
|
+
elif db_type == 'postgres':
|
|
434
|
+
match = re.search(r'dbname=([\'"]?)(\w+)\1', content)
|
|
435
|
+
if match:
|
|
436
|
+
return match.group(2) if match.lastindex and match.lastindex >= 2 else match.group(1)
|
|
437
|
+
match = re.search(r'database=([\'"]?)(\w+)\1', content)
|
|
438
|
+
if match:
|
|
439
|
+
return match.group(2) if match.lastindex and match.lastindex >= 2 else match.group(1)
|
|
440
|
+
elif db_type == 'mongodb':
|
|
441
|
+
match = re.search(r'/(\w+)\?', content)
|
|
442
|
+
if match:
|
|
443
|
+
return match.group(1)
|
|
444
|
+
return None
|
|
445
|
+
|
|
446
|
+
def _extract_queue_name(self, content: str, queue_type: str) -> Optional[str]:
|
|
447
|
+
"""Try to extract queue name from common patterns."""
|
|
448
|
+
patterns = [
|
|
449
|
+
r'QueueUrl\s*=\s*([\'"])([^\'"]+)\1',
|
|
450
|
+
r'queue_url\s*=\s*([\'"])([^\'"]+)\1',
|
|
451
|
+
r'queue_name\s*=\s*([\'"])([^\'"]+)\1',
|
|
452
|
+
r'queue\s*=\s*([\'"])([^\'"]+)\1',
|
|
453
|
+
r'https?://sqs\.[^/]+/\d+/([a-zA-Z0-9_-]+)',
|
|
454
|
+
]
|
|
455
|
+
for pattern in patterns:
|
|
456
|
+
match = re.search(pattern, content)
|
|
457
|
+
if match and match.lastindex is not None:
|
|
458
|
+
ref = match.group(match.lastindex)
|
|
459
|
+
return ref.split('/')[-1] if '/' in ref else ref
|
|
460
|
+
return None
|
|
461
|
+
|
|
462
|
+
def _classify_io_direction(self, content: str, conn_idx: int) -> str:
|
|
463
|
+
"""Heuristically determine if the connection is mostly read or write."""
|
|
464
|
+
# Check a tiny window of text after the connection
|
|
465
|
+
window = content[conn_idx:conn_idx + 2000].lower()
|
|
466
|
+
writes = sum(window.count(w) for w in ["insert", "update", ".save", ".create", "publish", "send"])
|
|
467
|
+
reads = sum(window.count(w) for w in ["select", "find", ".get", "query", "receive", "consume"])
|
|
468
|
+
return "write" if writes > reads else "read"
|
|
469
|
+
|
|
470
|
+
def _detect_db_deps(self, svc: Dict, datastore_ids: set) -> None:
    """Record datastore nodes and ``db_read``/``db_write`` edges for a service.

    Every scanned file is checked against the datastore rules of the service's
    language. Each rule that matches yields one datastore node (created once
    per id) and one edge from the service to it.

    Args:
        svc: Discovered-service record with ``id``, ``files`` and optionally
            ``language`` (defaults to ``"python"``).
        datastore_ids: Ids of datastore nodes already created; updated in place.
    """
    svc_id = svc["id"]
    patterns = _LANG_DB_PATTERNS.get(svc.get("language", "python"), [])
    if not patterns:
        # Nothing to look for; avoid reading every file for no result.
        return

    for fp in svc["files"]:
        raw_content = self._read(fp)
        content = self._strip_comments_and_strings(raw_content)
        # Rules that contain a quoted argument (``boto3.resource('dynamodb')``)
        # can never match text whose string literals were blanked, so they are
        # matched against the raw text with both quote styles folded to "'".
        quoted_view = raw_content.replace('"', "'")

        for pdef in patterns:
            needle = pdef["pattern"]
            if "'" in needle or '"' in needle:
                haystack = quoted_view
                needle = needle.replace('"', "'")
            else:
                haystack = content
            idx = haystack.find(needle)
            if idx == -1:
                continue

            db_type = pdef["db_type"]
            # Use the name found in the file, else one shared node per db type.
            db_name = self._extract_db_name(raw_content, db_type) or f"shared_{db_type}_db"

            ds_id = f"datastore:{db_type}:{db_name}"
            if ds_id not in datastore_ids:
                datastore_ids.add(ds_id)
                self.store.upsert_node(DataStoreNode(id=ds_id, kind=db_type, name=db_name))

            # idx is an offset into ``haystack``, so classify on the same text.
            direction = self._classify_io_direction(haystack, idx)
            self.store.upsert_edge(
                DependencyEdge(
                    source_id=svc_id,
                    target_id=ds_id,
                    kind=f"db_{direction}",
                    metadata={"file": str(fp.name)},
                )
            )
|
|
500
|
+
|
|
501
|
+
def _detect_queue_deps(self, svc: Dict, queue_ids: set) -> None:
    """Record queue nodes and publish/consume edges for a service.

    Every scanned file is checked against the queue rules of the service's
    language. Each rule that matches yields one queue node (created once per
    id) and one ``queue_publish`` or ``queue_consume`` edge.

    Args:
        svc: Discovered-service record with ``id``, ``files`` and optionally
            ``language`` (defaults to ``"python"``).
        queue_ids: Ids of queue nodes already created; updated in place.
    """
    svc_id = svc["id"]
    patterns = _LANG_QUEUE_PATTERNS.get(svc.get("language", "python"), [])
    if not patterns:
        # Nothing to look for; avoid reading every file for no result.
        return

    for fp in svc["files"]:
        raw_content = self._read(fp)
        content = self._strip_comments_and_strings(raw_content)
        # Rules that contain a quoted argument (``boto3.client('sqs')``) can
        # never match text whose string literals were blanked, so they are
        # matched against the raw text with both quote styles folded to "'".
        quoted_view = raw_content.replace('"', "'")

        for pdef in patterns:
            needle = pdef["pattern"]
            if "'" in needle or '"' in needle:
                haystack = quoted_view
                needle = needle.replace('"', "'")
            else:
                haystack = content
            idx = haystack.find(needle)
            if idx == -1:
                continue

            q_type = pdef["queue_type"]
            # Use the name found in the file, else one shared node per queue type.
            q_name = self._extract_queue_name(raw_content, q_type) or f"shared_{q_type}_queue"

            q_id = f"queue:{q_type}:{q_name}"
            if q_id not in queue_ids:
                queue_ids.add(q_id)
                self.store.upsert_node(QueueNode(id=q_id, kind=q_type, name=q_name))

            # idx is an offset into ``haystack``, so classify on the same text.
            direction = self._classify_io_direction(haystack, idx)
            edge_kind = "queue_publish" if direction == "write" else "queue_consume"
            self.store.upsert_edge(
                DependencyEdge(
                    source_id=svc_id,
                    target_id=q_id,
                    kind=edge_kind,
                    metadata={"file": str(fp.name)},
                )
            )
|
|
531
|
+
|
|
532
|
+
def _detect_http_calls(self, svc: Dict, all_services: List[Dict]) -> None:
    """Record ``http_call`` / ``rpc_call`` edges from a service to its peers.

    Three best-effort heuristics run on every file that shows an HTTP client
    or an environment-variable lookup:

    1. hard-coded URL literals whose host contains another service's name;
    2. URL-like environment variables, mapped to another service by name when
       possible and to ``external:env_url`` otherwise;
    3. edge-function / RPC invocations whose name matches a file or directory
       of another service.

    Args:
        svc: Discovered-service record with ``id``, ``files`` and optionally
            ``language`` (defaults to ``"python"``).
        all_services: All discovered-service records, including ``svc``.
    """
    svc_id = svc["id"]
    # Keep workspace order (and drop duplicates) so that "first matching
    # service wins" gives the same answer on every run.
    other_services = [s for s in all_services if s["id"] != svc_id]
    other_ids = list(dict.fromkeys(s["id"] for s in other_services))
    patterns = _LANG_HTTP_PATTERNS.get(svc.get("language", "python"), [])

    env_var_re = re.compile(
        r'(?:process\.env\.|os\.getenv\(|os\.environ\[|os\.environ\.get\(|System\.getenv\(|os\.Getenv\(|envvar=)\s*'
        r'["\']?([A-Z_][A-Z0-9_]*)["\']?'
    )

    for fp in svc["files"]:
        raw_content = self._read(fp)
        stripped_content = self._strip_comments_and_strings(raw_content)
        has_http_client = any(p["pattern"] in stripped_content for p in patterns)
        # Extract the variable names once per file, in order of first
        # appearance. This also covers ``os.environ.get(`` and ``envvar=``,
        # which _ENV_URL_PATTERNS does not list.
        env_vars = list(dict.fromkeys(env_var_re.findall(raw_content)))
        has_env_url = bool(env_vars) or any(p in raw_content for p in _ENV_URL_PATTERNS)
        if not has_http_client and not has_env_url:
            continue

        file_name = str(fp.name)
        self._link_hardcoded_urls(svc_id, other_ids, raw_content, file_name)
        self._link_env_var_urls(svc_id, other_ids, env_vars, file_name)
        self._link_rpc_calls(svc_id, other_services, raw_content, file_name)

def _link_hardcoded_urls(
    self, svc_id: str, other_ids: List[str], raw_content: str, file_name: str
) -> None:
    """Add an ``http_call`` edge for each URL literal whose host names another service."""
    slugs = [(oid, oid.replace("-", "").replace("_", "").lower()) for oid in other_ids]
    for url_host in re.findall(r'["\']https?://([^"\'/:]+)', raw_content):
        url_clean = url_host.replace("-", "").replace("_", "").lower()
        for other_id, svc_slug in slugs:
            # An empty slug would be "contained" in every host name.
            if svc_slug and svc_slug in url_clean:
                self.store.upsert_edge(
                    DependencyEdge(
                        source_id=svc_id,
                        target_id=other_id,
                        kind="http_call",
                        metadata={"url": url_host, "file": file_name},
                    )
                )

def _link_env_var_urls(
    self, svc_id: str, other_ids: List[str], env_vars: List[str], file_name: str
) -> None:
    """Add an ``http_call`` edge for each URL-like environment variable."""
    url_keywords = ("URL", "HOST", "ENDPOINT", "BASE", "API", "SERVER")
    slugs = [(oid, oid.replace("_", "").replace("-", "").lower()) for oid in other_ids]

    for var in env_vars:
        if not any(kw in var for kw in url_keywords):
            continue

        # Reduce e.g. PAYMENTS_API_URL to "payments" before comparing names.
        clean_var = (
            var.replace("_URL", "").replace("_HOST", "").replace("_API", "")
            .replace("_", "").lower()
        )

        mapped_svc_id = None
        # An empty remainder (e.g. from "_URL") would match every service.
        if clean_var:
            for other_id, clean_other in slugs:
                if clean_other and (clean_other in clean_var or clean_var in clean_other):
                    mapped_svc_id = other_id
                    break

        metadata = {"env_var": var, "file": file_name}
        if mapped_svc_id is not None:
            metadata["note"] = "resolved via env-var heuristic name matching"
        self.store.upsert_edge(
            DependencyEdge(
                source_id=svc_id,
                # Variables that match no service point at a shared external node.
                target_id=mapped_svc_id if mapped_svc_id is not None else "external:env_url",
                kind="http_call",
                metadata=metadata,
            )
        )

def _link_rpc_calls(
    self, svc_id: str, other_services: List[Dict], raw_content: str, file_name: str
) -> None:
    """Add an ``rpc_call`` edge for each edge-function call hosted by another service."""
    # Matches: call_edge_function("name"), functions.invoke("name")
    rpc_funcs = re.findall(
        r'(?:call_edge_function|functions\.invoke)\s*\(\s*["\']([^"\']+)["\']',
        raw_content,
    )
    for fn_name in dict.fromkeys(rpc_funcs):
        # The first service with a path component or file stem equal to the
        # function name is taken to host it.
        mapped_svc_id = next(
            (
                other["id"]
                for other in other_services
                if any(
                    fn_name in ofp.parts or ofp.stem == fn_name
                    for ofp in other.get("files", [])
                )
            ),
            None,
        )
        if mapped_svc_id:
            self.store.upsert_edge(
                DependencyEdge(
                    source_id=svc_id,
                    target_id=mapped_svc_id,
                    kind="rpc_call",
                    metadata={
                        "rpc_method": fn_name,
                        "file": file_name,
                        "note": "resolved via RPC directory match in target repo",
                    },
                )
            )
|
|
645
|
+
|
|
646
|
+
def _detect_library_deps(self, svc: Dict, all_service_ids: set) -> None:
    """Record ``library_dependency`` edges to other workspace services.

    Two sources are inspected: dependency manifests, where another service's
    name appears as a quoted name or as a requirements-style entry, and source
    files, where another service is imported or required.

    Service names are escaped before being placed in a regular expression and
    are matched as whole names, so ``api`` is not found inside ``apiclient``.

    Args:
        svc: Discovered-service record with ``id`` and ``files``.
        all_service_ids: Ids of every discovered service, including ``svc``.
    """
    svc_id = svc["id"]
    manifest_names = ("package.json", "requirements.txt", "go.mod", "pom.xml", "build.gradle")
    source_suffixes = (".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".java")

    # Name variants -> service id. Exact ids are registered first so that a
    # dashed variant of one id can never shadow another service's real name.
    other_ids = sorted(sid for sid in all_service_ids if sid != svc_id and sid)
    exact_to_id = {sid: sid for sid in other_ids}
    for sid in other_ids:
        exact_to_id.setdefault(sid.replace("_", "-"), sid)

    # Compile the per-name expressions once instead of once per scanned file.
    matchers = []
    for exact_name, target_id in exact_to_id.items():
        escaped = re.escape(exact_name)
        escaped_py = re.escape(exact_name.replace("-", "_"))
        import_re = re.compile(
            # Python: ``import name`` / ``from name ...``
            rf"\b(?:from|import)\s+{escaped_py}\b"
            # JS/Go/Java: the name as a whole token later on an import/require line
            rf"|\b(?:import|require)\b[^\n]*(?<![\w-]){escaped}(?![\w-])"
        )
        # requirements.txt entry: the name at the start of a line, optionally
        # followed by extras and a version specifier, marker, URL or comment.
        requirement_re = re.compile(
            rf"^[ \t]*{escaped}[ \t]*(?:\[[^\]]*\][ \t]*)?(?:[=<>!~]=|[<>]|;|@|#|$)",
            re.MULTILINE,
        )
        matchers.append((exact_name, target_id, import_re, requirement_re))

    for fp in svc["files"]:
        name = fp.name

        # Simple heuristic: scan manifests for matching repo/project names.
        if name in manifest_names:
            content = self._read(fp)
            for exact_name, target_id, _import_re, requirement_re in matchers:
                if (
                    f'"{exact_name}"' in content
                    or f"'{exact_name}'" in content
                    or f" {exact_name}==" in content
                    or (name == "requirements.txt" and requirement_re.search(content))
                ):
                    self.store.upsert_edge(
                        DependencyEdge(
                            source_id=svc_id,
                            target_id=target_id,
                            kind="library_dependency",
                            metadata={"file": name, "note": "manifest dependency"},
                        )
                    )

        # Source import tracing (e.g. `import specgen_local` or
        # `require('specgen-local')`).
        elif fp.suffix in source_suffixes:
            content = self._read(fp)
            for _exact_name, target_id, import_re, _requirement_re in matchers:
                if import_re.search(content):
                    self.store.upsert_edge(
                        DependencyEdge(
                            source_id=svc_id,
                            target_id=target_id,
                            kind="library_dependency",
                            metadata={"file": str(fp.name), "note": "source import"},
                        )
                    )
|
|
696
|
+
|