codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,696 @@
1
+ """Service-level graph builder.
2
+
3
+ Scans local repos and builds a service dependency graph.
4
+ Detects service boundaries, DB/queue dependencies, and HTTP calls between services.
5
+ No Neo4j dependency — uses the pluggable GraphStore interface.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ import pathspec
15
+
16
+ from corbell.core.graph.schema import (
17
+ DataStoreNode,
18
+ DependencyEdge,
19
+ GraphStore,
20
+ QueueNode,
21
+ ServiceNode,
22
+ )
23
+ from corbell.core.gitignore import load_gitignore
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Service pattern detection rules
27
+ # ---------------------------------------------------------------------------
28
+
29
+ _PYTHON_SERVICE_PATTERNS = [
30
+ {"pattern": "FastAPI(", "type": "api", "framework": "fastapi"},
31
+ {"pattern": "Flask(__name__)", "type": "api", "framework": "flask"},
32
+ {"pattern": "@app.route", "type": "api", "framework": "flask"},
33
+ {"pattern": "@celery.task", "type": "worker", "framework": "celery"},
34
+ {"pattern": "@app.task", "type": "worker", "framework": "celery"},
35
+ {"pattern": "@click.command", "type": "cli", "framework": "click"},
36
+ {"pattern": "argparse.ArgumentParser", "type": "cli", "framework": "argparse"},
37
+ {"pattern": "typer.Typer(", "type": "cli", "framework": "typer"},
38
+ {"pattern": "if __name__ == '__main__':", "type": "service", "framework": "stdlib"},
39
+ ]
40
+
41
+ _JS_SERVICE_PATTERNS = [
42
+ {"pattern": "express()", "type": "api", "framework": "express"},
43
+ {"pattern": "app.listen(", "type": "api", "framework": "express"},
44
+ {"pattern": "@Controller(", "type": "api", "framework": "nestjs"},
45
+ ]
46
+
47
+ _JAVA_SERVICE_PATTERNS = [
48
+ {"pattern": "@RestController", "type": "api", "framework": "spring"},
49
+ {"pattern": "@Controller", "type": "api", "framework": "spring"},
50
+ {"pattern": "public static void main(", "type": "service", "framework": "stdlib"},
51
+ ]
52
+
53
+ _GO_SERVICE_PATTERNS = [
54
+ {"pattern": "http.ListenAndServe", "type": "api", "framework": "net/http"},
55
+ {"pattern": "gin.Default()", "type": "api", "framework": "gin"},
56
+ {"pattern": "func main()", "type": "service", "framework": "stdlib"},
57
+ ]
58
+
59
+ _LANG_SERVICE_PATTERNS = {
60
+ "python": _PYTHON_SERVICE_PATTERNS,
61
+ "javascript": _JS_SERVICE_PATTERNS,
62
+ "typescript": _JS_SERVICE_PATTERNS,
63
+ "java": _JAVA_SERVICE_PATTERNS,
64
+ "go": _GO_SERVICE_PATTERNS,
65
+ }
66
+
67
+ _PYTHON_DB_PATTERNS = [
68
+ {"pattern": "psycopg2.connect", "db_type": "postgres"},
69
+ {"pattern": "create_engine(", "db_type": "postgres"},
70
+ {"pattern": "asyncpg.create_pool", "db_type": "postgres"},
71
+ {"pattern": "MongoClient(", "db_type": "mongodb"},
72
+ {"pattern": "redis.Redis(", "db_type": "redis"},
73
+ {"pattern": "redis.StrictRedis(", "db_type": "redis"},
74
+ {"pattern": "boto3.resource('dynamodb')", "db_type": "dynamodb"},
75
+ {"pattern": "sqlite3.connect", "db_type": "sqlite"},
76
+ {"pattern": "chromadb.PersistentClient", "db_type": "chromadb"},
77
+ {"pattern": "GraphDatabase.driver", "db_type": "neo4j"},
78
+ ]
79
+
80
+ _JS_DB_PATTERNS = [
81
+ {"pattern": "pg.Pool(", "db_type": "postgres"},
82
+ {"pattern": "new Pool(", "db_type": "postgres"},
83
+ {"pattern": "createPool(", "db_type": "mysql"},
84
+ {"pattern": "mongoose.connect(", "db_type": "mongodb"},
85
+ {"pattern": "new MongoClient(", "db_type": "mongodb"},
86
+ {"pattern": "redis.createClient(", "db_type": "redis"},
87
+ {"pattern": "new Redis(", "db_type": "redis"},
88
+ {"pattern": "createClient({", "db_type": "redis"},
89
+ {"pattern": "new Sequelize(", "db_type": "postgres"},
90
+ {"pattern": "DynamoDBClient(", "db_type": "dynamodb"},
91
+ {"pattern": "createClient({ url", "db_type": "supabase"},
92
+ {"pattern": "PrismaClient", "db_type": "postgres"},
93
+ {"pattern": "knex(", "db_type": "postgres"},
94
+ ]
95
+
96
+ _GO_DB_PATTERNS = [
97
+ {"pattern": "sql.Open(", "db_type": "postgres"},
98
+ {"pattern": "pgx.Connect(", "db_type": "postgres"},
99
+ {"pattern": "gorm.Open(", "db_type": "postgres"},
100
+ {"pattern": "mongo.Connect(", "db_type": "mongodb"},
101
+ {"pattern": "redis.NewClient(", "db_type": "redis"},
102
+ {"pattern": "dynamodb.New(", "db_type": "dynamodb"},
103
+ {"pattern": "bolt.Open(", "db_type": "sqlite"},
104
+ {"pattern": "neo4j.NewDriver(", "db_type": "neo4j"},
105
+ ]
106
+
107
+ _JAVA_DB_PATTERNS = [
108
+ {"pattern": "DriverManager.getConnection(", "db_type": "postgres"},
109
+ {"pattern": "@Repository", "db_type": "postgres"},
110
+ {"pattern": "JdbcTemplate", "db_type": "postgres"},
111
+ {"pattern": "new MongoClient(", "db_type": "mongodb"},
112
+ {"pattern": "MongoClients.create(", "db_type": "mongodb"},
113
+ {"pattern": "JedisPool(", "db_type": "redis"},
114
+ {"pattern": "RedisConnectionFactory", "db_type": "redis"},
115
+ {"pattern": "EntityManager", "db_type": "postgres"},
116
+ ]
117
+
118
+ _LANG_DB_PATTERNS: Dict[str, List] = {
119
+ "python": _PYTHON_DB_PATTERNS,
120
+ "javascript": _JS_DB_PATTERNS,
121
+ "typescript": _JS_DB_PATTERNS,
122
+ "java": _JAVA_DB_PATTERNS,
123
+ "go": _GO_DB_PATTERNS,
124
+ "ruby": [],
125
+ }
126
+
127
+ _PYTHON_QUEUE_PATTERNS = [
128
+ {"pattern": "boto3.client('sqs')", "queue_type": "sqs"},
129
+ {"pattern": "pika.BlockingConnection", "queue_type": "rabbitmq"},
130
+ {"pattern": "KafkaProducer(", "queue_type": "kafka"},
131
+ {"pattern": "KafkaConsumer(", "queue_type": "kafka"},
132
+ ]
133
+
134
+ _JS_QUEUE_PATTERNS = [
135
+ {"pattern": "new Kafka(", "queue_type": "kafka"},
136
+ {"pattern": "kafkajs", "queue_type": "kafka"},
137
+ {"pattern": "amqplib.connect(", "queue_type": "rabbitmq"},
138
+ {"pattern": "new SQSClient(", "queue_type": "sqs"},
139
+ {"pattern": "new Bull(", "queue_type": "redis"},
140
+ {"pattern": "new Queue(", "queue_type": "redis"},
141
+ {"pattern": "PubSub(", "queue_type": "pubsub"},
142
+ ]
143
+
144
+ _GO_QUEUE_PATTERNS = [
145
+ {"pattern": "kafka.NewWriter(", "queue_type": "kafka"},
146
+ {"pattern": "sarama.NewClient(", "queue_type": "kafka"},
147
+ {"pattern": "amqp.Dial(", "queue_type": "rabbitmq"},
148
+ {"pattern": "sqs.New(", "queue_type": "sqs"},
149
+ {"pattern": "pubsub.NewClient(", "queue_type": "pubsub"},
150
+ ]
151
+
152
+ _JAVA_QUEUE_PATTERNS = [
153
+ {"pattern": "KafkaProducer(", "queue_type": "kafka"},
154
+ {"pattern": "@KafkaListener", "queue_type": "kafka"},
155
+ {"pattern": "RabbitTemplate", "queue_type": "rabbitmq"},
156
+ {"pattern": "@RabbitListener", "queue_type": "rabbitmq"},
157
+ {"pattern": "AmazonSQS", "queue_type": "sqs"},
158
+ {"pattern": "@SqsListener", "queue_type": "sqs"},
159
+ ]
160
+
161
+ _LANG_QUEUE_PATTERNS: Dict[str, List] = {
162
+ "python": _PYTHON_QUEUE_PATTERNS,
163
+ "javascript": _JS_QUEUE_PATTERNS,
164
+ "typescript": _JS_QUEUE_PATTERNS,
165
+ "java": _JAVA_QUEUE_PATTERNS,
166
+ "go": _GO_QUEUE_PATTERNS,
167
+ "ruby": [],
168
+ }
169
+
170
+ _PYTHON_HTTP_PATTERNS = [
171
+ {"pattern": "requests.", "call_type": "http_call"},
172
+ {"pattern": "httpx.", "call_type": "http_call"},
173
+ {"pattern": "httpx.AsyncClient", "call_type": "http_call"},
174
+ {"pattern": "aiohttp.ClientSession", "call_type": "http_call"},
175
+ {"pattern": "urllib.request", "call_type": "http_call"},
176
+ ]
177
+
178
+ _JS_HTTP_PATTERNS = [
179
+ {"pattern": "fetch(", "call_type": "http_call"},
180
+ {"pattern": "axios.get(", "call_type": "http_call"},
181
+ {"pattern": "axios.post(", "call_type": "http_call"},
182
+ {"pattern": "axios.request(", "call_type": "http_call"},
183
+ {"pattern": "axios.create(", "call_type": "http_call"},
184
+ {"pattern": "http.get(", "call_type": "http_call"},
185
+ {"pattern": "got.get(", "call_type": "http_call"},
186
+ {"pattern": "superagent.get(", "call_type": "http_call"},
187
+ ]
188
+
189
+ _GO_HTTP_PATTERNS = [
190
+ {"pattern": "http.Get(", "call_type": "http_call"},
191
+ {"pattern": "http.Post(", "call_type": "http_call"},
192
+ {"pattern": "http.NewRequest(", "call_type": "http_call"},
193
+ {"pattern": "client.Do(", "call_type": "http_call"},
194
+ ]
195
+
196
+ _JAVA_HTTP_PATTERNS = [
197
+ {"pattern": "HttpClient", "call_type": "http_call"},
198
+ {"pattern": "RestTemplate", "call_type": "http_call"},
199
+ {"pattern": "WebClient", "call_type": "http_call"},
200
+ {"pattern": "HttpURLConnection", "call_type": "http_call"},
201
+ {"pattern": "OkHttpClient", "call_type": "http_call"},
202
+ ]
203
+
204
+ _LANG_HTTP_PATTERNS: Dict[str, List] = {
205
+ "python": _PYTHON_HTTP_PATTERNS,
206
+ "javascript": _JS_HTTP_PATTERNS,
207
+ "typescript": _JS_HTTP_PATTERNS,
208
+ "java": _JAVA_HTTP_PATTERNS,
209
+ "go": _GO_HTTP_PATTERNS,
210
+ "ruby": [],
211
+ }
212
+
213
+ # Env-var patterns that indicate a URL is looked up from config (any language)
214
+ _ENV_URL_PATTERNS = [
215
+ "process.env.", "os.getenv(", "os.environ[",
216
+ "System.getenv(", "os.Getenv(",
217
+ ]
218
+
219
+ _SKIP_DIRS = {
220
+ ".git", "__pycache__", "node_modules", "venv", "env", ".venv", "tests",
221
+ ".pytest_cache", "dist", "build", ".next", ".nuxt", "target", "bin",
222
+ "obj", "coverage", ".tox",
223
+ }
224
+
225
+ _EXTENSION_LANG = {
226
+ ".py": "python",
227
+ ".js": "javascript",
228
+ ".ts": "typescript",
229
+ ".tsx": "typescript",
230
+ ".jsx": "javascript",
231
+ ".java": "java",
232
+ ".go": "go",
233
+ ".rb": "ruby",
234
+ }
235
+
236
+
237
+ class ServiceGraphBuilder:
238
+ """Build a service-level dependency graph by scanning local repositories."""
239
+
240
+ def __init__(self, graph_store: GraphStore):
241
+ """Initialize with any GraphStore backend.
242
+
243
+ Args:
244
+ graph_store: Instance of :class:`~corbell.core.graph.schema.GraphStore`.
245
+ """
246
+ self.store = graph_store
247
+
248
+ def build_from_workspace(
249
+ self,
250
+ services: List[Dict[str, Any]],
251
+ clear_existing: bool = True,
252
+ method_level: bool = False,
253
+ ) -> Dict[str, Any]:
254
+ """Scan all service repos and populate the graph.
255
+
256
+ Args:
257
+ services: List of dicts with keys ``id``, ``repo`` (resolved path),
258
+ ``language``, ``tags``.
259
+ clear_existing: Clear the store before building.
260
+ method_level: If True, also build method-call edges.
261
+
262
+ Returns:
263
+ Summary dict with counts of services, datastores, queues, methods.
264
+ """
265
+ if clear_existing:
266
+ self.store.clear()
267
+
268
+ discovered: List[Dict] = []
269
+
270
+ for svc in services:
271
+ svc_id = svc["id"]
272
+ repo_path = Path(svc.get("resolved_path") or svc["repo"])
273
+ language = svc.get("language", "python")
274
+ tags = svc.get("tags", [])
275
+
276
+ if not repo_path.exists():
277
+ continue
278
+
279
+ # Gather all relevant files first so we can sniff the service type
280
+ gitignore_spec = load_gitignore(repo_path)
281
+ files = list(self._iter_files(repo_path, language, gitignore_spec))
282
+ service_type = self._detect_service_type(files, language)
283
+
284
+ node = ServiceNode(
285
+ id=svc_id,
286
+ name=svc_id,
287
+ repo=str(repo_path),
288
+ language=language,
289
+ tags=tags,
290
+ service_type=service_type,
291
+ )
292
+ self.store.upsert_node(node)
293
+ discovered.append(
294
+ {
295
+ "id": svc_id,
296
+ "repo_path": repo_path,
297
+ "language": language,
298
+ "files": files,
299
+ }
300
+ )
301
+
302
+ # Phase 2: deps, HTTP calls
303
+ datastore_ids: set = set()
304
+ queue_ids: set = set()
305
+
306
+ for svc in discovered:
307
+ self._detect_db_deps(svc, datastore_ids)
308
+ self._detect_queue_deps(svc, queue_ids)
309
+
310
+ # Phase 3: inter-service HTTP calls (best-effort heuristic)
311
+ all_service_ids = {s["id"] for s in discovered}
312
+ for svc in discovered:
313
+ self._detect_http_calls(svc, discovered)
314
+ self._detect_library_deps(svc, all_service_ids)
315
+
316
+ # Phase 4: method-level graph
317
+ service_diagnostics: Dict[str, Any] = {}
318
+ if method_level:
319
+ from corbell.core.graph.method_graph import MethodGraphBuilder
320
+
321
+ mgb = MethodGraphBuilder(self.store)
322
+
323
+ for svc in discovered:
324
+ svc_id = svc["id"]
325
+
326
+ # Build method-level call graph
327
+ result = mgb.build_for_service(svc_id, svc["repo_path"])
328
+ service_diagnostics[svc_id] = result
329
+
330
+ summary = self.store.get_all_nodes_summary()
331
+ if service_diagnostics:
332
+ summary["service_diagnostics"] = service_diagnostics
333
+ return summary
334
+
335
+ # ------------------------------------------------------------------ #
336
+ # Internal scanning helpers #
337
+ # ------------------------------------------------------------------ #
338
+
339
+ def _detect_service_type(self, files: List[Path], language: str) -> str:
340
+ """Heuristically detect if a service is an infrastructure repo (CDK, Pulumi, TF, etc.)."""
341
+ if language in ("typescript", "javascript"):
342
+ for fp in files:
343
+ if fp.name == "package.json":
344
+ content = self._read(fp)
345
+ infra_deps = [
346
+ "aws-cdk", "aws-cdk-lib", "@aws-cdk/core",
347
+ "cdktf", "@pulumi/pulumi", "serverless", "sst"
348
+ ]
349
+ if any(dep in content for dep in infra_deps):
350
+ return "infrastructure"
351
+
352
+ elif language == "python":
353
+ for fp in files:
354
+ if fp.name in ("requirements.txt", "Pipfile", "pyproject.toml"):
355
+ content = self._read(fp)
356
+ infra_deps = ["aws-cdk-lib", "pulumi", "cdktf"]
357
+ if any(dep in content for dep in infra_deps):
358
+ return "infrastructure"
359
+
360
+ elif language == "go":
361
+ for fp in files:
362
+ if fp.name == "go.mod":
363
+ content = self._read(fp)
364
+ infra_deps = ["github.com/pulumi/pulumi", "github.com/aws/aws-cdk-go", "github.com/hashicorp/terraform-cdk-go"]
365
+ if any(dep in content for dep in infra_deps):
366
+ return "infrastructure"
367
+
368
+ # If we see terraform files directly, we can safely assume it's infra
369
+ for fp in files:
370
+ if fp.suffix in (".tf", ".tfvars"):
371
+ return "infrastructure"
372
+
373
+ return "service"
374
+
375
def _iter_files(
    self,
    repo_path: Path,
    language: str,
    gitignore_spec: Optional[pathspec.PathSpec] = None,
):
    """Yield every file of a repo that the detectors should look at.

    A file is yielded when it is a known dependency manifest, a Terraform
    file, or a source file whose suffix is listed in ``_EXTENSION_LANG``.
    Files rejected by ``_should_skip`` or matched by the gitignore spec are
    left out.

    Args:
        repo_path: Root directory of the repository.
        language: Declared language of the service. Source files of every
            known language are yielded regardless of this value (the
            original condition had the same effect), so the argument is
            kept only for interface compatibility.
        gitignore_spec: Pre-loaded ignore rules; loaded via
            ``load_gitignore(repo_path)`` when omitted.

    Yields:
        Absolute or relative ``Path`` objects, exactly as produced by
        ``repo_path.rglob``.
    """
    if gitignore_spec is None:
        gitignore_spec = load_gitignore(repo_path)

    # Pipfile / pyproject.toml and Terraform files are included because
    # _detect_service_type inspects them; without them those checks could
    # never fire.
    manifests = {
        "package.json",
        "requirements.txt",
        "Pipfile",
        "pyproject.toml",
        "go.mod",
        "pom.xml",
        "build.gradle",
    }
    infra_suffixes = {".tf", ".tfvars"}

    for fp in repo_path.rglob("*"):
        if not fp.is_file():
            continue

        # Skip rules are applied to the repo-relative path only. Using the
        # absolute path would discard the whole repo whenever one of its
        # ancestor directories happens to be called "build", "env", etc.
        rel = fp.relative_to(repo_path)
        if self._should_skip(rel):
            continue
        if gitignore_spec.match_file(rel.as_posix()):
            continue

        if (
            fp.name in manifests
            or fp.suffix in infra_suffixes
            or fp.suffix in _EXTENSION_LANG
        ):
            yield fp
398
+
399
def _should_skip(self, fp: Path) -> bool:
    """Tell whether a path must be excluded from scanning.

    A path is excluded when any component of it, as given, is listed in
    ``_SKIP_DIRS``, or when its file name starts with ``test_`` or ends
    with ``_test.py``.
    """
    for component in fp.parts:
        if component in _SKIP_DIRS:
            return True
    filename = fp.name
    return filename.startswith("test_") or filename.endswith("_test.py")
406
+
407
+ def _read(self, fp: Path) -> str:
408
+ try:
409
+ return fp.read_text(encoding="utf-8", errors="ignore")
410
+ except Exception:
411
+ return ""
412
+
413
+ def _strip_comments_and_strings(self, content: str) -> str:
414
+ import re
415
+ content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*"', '""', content)
416
+ content = re.sub(r"(?<!\\)'(?:[^'\\]|\\.)*'", "''", content)
417
+ content = re.sub(r'(?<!\\)`(?:[^`\\]|\\.)*`', '``', content)
418
+ content = re.sub(r'//.*', '', content)
419
+ content = re.sub(r'#.*', '', content)
420
+ content = re.sub(r'/\*.*?\*/', '', content, flags=re.DOTALL)
421
+ return content
422
+
423
+ def _extract_db_name(self, content: str, db_type: str) -> Optional[str]:
424
+ """Try to extract a database name from connection strings."""
425
+ if db_type == 'sqlite':
426
+ match = re.search(r'sqlite3\.connect\([\'"]([^\'"]+)[\'"]\)', content)
427
+ if match:
428
+ return Path(match.group(1)).name
429
+ elif db_type == 'chromadb':
430
+ match = re.search(r'path=[\'"]([^\'"]+)[\'"]', content)
431
+ if match:
432
+ return Path(match.group(1)).name
433
+ elif db_type == 'postgres':
434
+ match = re.search(r'dbname=([\'"]?)(\w+)\1', content)
435
+ if match:
436
+ return match.group(2) if match.lastindex and match.lastindex >= 2 else match.group(1)
437
+ match = re.search(r'database=([\'"]?)(\w+)\1', content)
438
+ if match:
439
+ return match.group(2) if match.lastindex and match.lastindex >= 2 else match.group(1)
440
+ elif db_type == 'mongodb':
441
+ match = re.search(r'/(\w+)\?', content)
442
+ if match:
443
+ return match.group(1)
444
+ return None
445
+
446
+ def _extract_queue_name(self, content: str, queue_type: str) -> Optional[str]:
447
+ """Try to extract queue name from common patterns."""
448
+ patterns = [
449
+ r'QueueUrl\s*=\s*([\'"])([^\'"]+)\1',
450
+ r'queue_url\s*=\s*([\'"])([^\'"]+)\1',
451
+ r'queue_name\s*=\s*([\'"])([^\'"]+)\1',
452
+ r'queue\s*=\s*([\'"])([^\'"]+)\1',
453
+ r'https?://sqs\.[^/]+/\d+/([a-zA-Z0-9_-]+)',
454
+ ]
455
+ for pattern in patterns:
456
+ match = re.search(pattern, content)
457
+ if match and match.lastindex is not None:
458
+ ref = match.group(match.lastindex)
459
+ return ref.split('/')[-1] if '/' in ref else ref
460
+ return None
461
+
462
+ def _classify_io_direction(self, content: str, conn_idx: int) -> str:
463
+ """Heuristically determine if the connection is mostly read or write."""
464
+ # Check a tiny window of text after the connection
465
+ window = content[conn_idx:conn_idx + 2000].lower()
466
+ writes = sum(window.count(w) for w in ["insert", "update", ".save", ".create", "publish", "send"])
467
+ reads = sum(window.count(w) for w in ["select", "find", ".get", "query", "receive", "consume"])
468
+ return "write" if writes > reads else "read"
469
+
470
def _detect_db_deps(self, svc: Dict, datastore_ids: set) -> None:
    """Record datastore nodes and service -> datastore edges for one service.

    Every file of the service is searched for the datastore patterns of the
    service's language. For each pattern found, a datastore node is
    upserted once per distinct id and a ``db_read`` / ``db_write`` edge is
    upserted from the service to it.

    Args:
        svc: Discovered-service record with the keys ``id``, ``files`` and
            optionally ``language`` (defaults to ``"python"``).
        datastore_ids: Ids of datastore nodes already upserted during this
            build; updated in place.
    """
    svc_id = svc["id"]
    lang = svc.get("language", "python")
    patterns = _LANG_DB_PATTERNS.get(lang, [])
    if not patterns:
        # No rules for this language: skip reading the files altogether.
        return

    for fp in svc["files"]:
        raw_content = self._read(fp)
        content = self._strip_comments_and_strings(raw_content)
        for pdef in patterns:
            needle = pdef["pattern"]
            # Patterns that contain a quoted literal (e.g.
            # boto3.resource('dynamodb')) cannot occur in the stripped
            # text, where every string body has been emptied, so they are
            # looked up in the raw text instead.
            if "'" in needle or '"' in needle:
                haystack = raw_content
            else:
                haystack = content
            idx = haystack.find(needle)
            if idx == -1:
                continue

            db_type = pdef["db_type"]
            # Use the name found in the connection code, or one shared
            # placeholder per datastore kind when none can be extracted.
            db_name = self._extract_db_name(raw_content, db_type) or f"shared_{db_type}_db"

            ds_id = f"datastore:{db_type}:{db_name}"
            if ds_id not in datastore_ids:
                datastore_ids.add(ds_id)
                self.store.upsert_node(DataStoreNode(id=ds_id, kind=db_type, name=db_name))

            # The offset is only meaningful in the text it was found in.
            direction = self._classify_io_direction(haystack, idx)
            self.store.upsert_edge(
                DependencyEdge(
                    source_id=svc_id,
                    target_id=ds_id,
                    kind=f"db_{direction}",
                    metadata={"file": str(fp.name)},
                )
            )
500
+
501
def _detect_queue_deps(self, svc: Dict, queue_ids: set) -> None:
    """Record queue nodes and service -> queue edges for one service.

    Every file of the service is searched for the queue patterns of the
    service's language. For each pattern found, a queue node is upserted
    once per distinct id and a ``queue_publish`` / ``queue_consume`` edge
    is upserted from the service to it.

    Args:
        svc: Discovered-service record with the keys ``id``, ``files`` and
            optionally ``language`` (defaults to ``"python"``).
        queue_ids: Ids of queue nodes already upserted during this build;
            updated in place.
    """
    svc_id = svc["id"]
    lang = svc.get("language", "python")
    patterns = _LANG_QUEUE_PATTERNS.get(lang, [])
    if not patterns:
        # No rules for this language: skip reading the files altogether.
        return

    for fp in svc["files"]:
        raw_content = self._read(fp)
        content = self._strip_comments_and_strings(raw_content)
        for pdef in patterns:
            needle = pdef["pattern"]
            # Patterns that contain a quoted literal (e.g.
            # boto3.client('sqs')) cannot occur in the stripped text, where
            # every string body has been emptied, so they are looked up in
            # the raw text instead.
            if "'" in needle or '"' in needle:
                haystack = raw_content
            else:
                haystack = content
            idx = haystack.find(needle)
            if idx == -1:
                continue

            q_type = pdef["queue_type"]
            # Use the name found in the code, or one shared placeholder per
            # broker kind when none can be extracted.
            q_name = self._extract_queue_name(raw_content, q_type) or f"shared_{q_type}_queue"

            q_id = f"queue:{q_type}:{q_name}"
            if q_id not in queue_ids:
                queue_ids.add(q_id)
                self.store.upsert_node(QueueNode(id=q_id, kind=q_type, name=q_name))

            # The offset is only meaningful in the text it was found in.
            direction = self._classify_io_direction(haystack, idx)
            edge_kind = "queue_publish" if direction == "write" else "queue_consume"
            self.store.upsert_edge(
                DependencyEdge(
                    source_id=svc_id,
                    target_id=q_id,
                    kind=edge_kind,
                    metadata={"file": str(fp.name)},
                )
            )
531
+
532
def _detect_http_calls(self, svc: Dict, all_services: List[Dict]) -> None:
    """Record best-effort call edges from one service to its peers.

    Files that neither use a known HTTP client nor read an environment
    variable are ignored. For the remaining files three heuristics run:

    1. Hard-coded ``http(s)://`` hosts whose normalised text contains the
       normalised id of another service produce an ``http_call`` edge.
    2. Environment variables whose name contains URL, HOST, ENDPOINT, BASE,
       API or SERVER produce an ``http_call`` edge, either to the service
       whose id matches the variable name or to ``external:env_url``.
    3. ``call_edge_function("x")`` / ``functions.invoke("x")`` calls produce
       an ``rpc_call`` edge to the first other service that has a file or
       directory named ``x``.

    Args:
        svc: Discovered-service record (``id``, ``files``, optional
            ``language``).
        all_services: Records of every discovered service, including
            ``svc`` itself.
    """
    svc_id = svc["id"]
    # Sorted so that "first match wins" below picks the same service on
    # every run instead of depending on set iteration order.
    other_ids = sorted({s["id"] for s in all_services} - {svc_id})
    lang = svc.get("language", "python")
    patterns = _LANG_HTTP_PATTERNS.get(lang, [])

    # Compiled once per call rather than once per file.
    url_host_re = re.compile(r'["\']https?://([^"\'/:]+)')
    env_var_re = re.compile(
        r'(?:process\.env\.|os\.getenv\(|os\.environ\[|os\.environ\.get\(|System\.getenv\(|os\.Getenv\(|envvar=)\s*'
        r'["\']?([A-Z_][A-Z0-9_]*)["\']?'
    )
    rpc_re = re.compile(
        r'(?:call_edge_function|functions\.invoke)\s*\(\s*["\']([^"\']+)["\']'
    )
    url_keywords = ("URL", "HOST", "ENDPOINT", "BASE", "API", "SERVER")

    for fp in svc["files"]:
        raw_content = self._read(fp)
        stripped_content = self._strip_comments_and_strings(raw_content)
        has_http_client = any(p["pattern"] in stripped_content for p in patterns)

        # The variable names are extracted once per file, de-duplicated in
        # first-seen order. The regex also understands os.environ.get( and
        # envvar=, which _ENV_URL_PATTERNS does not list, so a regex hit
        # counts as an env lookup as well.
        env_vars = list(dict.fromkeys(env_var_re.findall(raw_content)))
        has_env_url = bool(env_vars) or any(
            p in raw_content for p in _ENV_URL_PATTERNS
        )
        if not has_http_client and not has_env_url:
            continue

        file_name = str(fp.name)

        # 1. Hard-coded URL matching: service name inside the host part.
        for url_host in url_host_re.findall(raw_content):
            url_clean = url_host.replace("-", "").replace("_", "").lower()
            for other_id in other_ids:
                svc_slug = other_id.replace("-", "").replace("_", "").lower()
                # An empty slug would be "contained" in every host.
                if svc_slug and svc_slug in url_clean:
                    self.store.upsert_edge(
                        DependencyEdge(
                            source_id=svc_id,
                            target_id=other_id,
                            kind="http_call",
                            metadata={"url": url_host, "file": file_name},
                        )
                    )

        # 2. Env-var URL references (dynamic target resolution).
        for var in env_vars:
            if not any(kw in var for kw in url_keywords):
                continue

            # Try to map the variable name onto another workspace service.
            clean_var = (
                var.replace("_URL", "")
                .replace("_HOST", "")
                .replace("_API", "")
                .replace("_", "")
                .lower()
            )
            mapped_svc_id = None
            if clean_var:  # "" would match every service
                for other_id in other_ids:
                    clean_other = other_id.replace("_", "").replace("-", "").lower()
                    if clean_other and (
                        clean_other in clean_var or clean_var in clean_other
                    ):
                        mapped_svc_id = other_id
                        break

            if mapped_svc_id:
                self.store.upsert_edge(
                    DependencyEdge(
                        source_id=svc_id,
                        target_id=mapped_svc_id,
                        kind="http_call",
                        metadata={
                            "env_var": var,
                            "file": file_name,
                            "note": "resolved via env-var heuristic name matching",
                        },
                    )
                )
            else:
                self.store.upsert_edge(
                    DependencyEdge(
                        source_id=svc_id,
                        target_id="external:env_url",
                        kind="http_call",
                        metadata={
                            "env_var": var,
                            "file": file_name,
                        },
                    )
                )

        # 3. Generic RPC / edge-function calls (e.g. Supabase Edge Functions).
        for fn_name in rpc_re.findall(raw_content):
            mapped_svc_id = None
            # The first other service that has a path component or file
            # stem equal to the RPC name is taken as the target.
            for other_svc in all_services:
                if other_svc["id"] == svc_id:
                    continue
                if any(
                    fn_name in ofp.parts or ofp.stem == fn_name
                    for ofp in other_svc.get("files", [])
                ):
                    mapped_svc_id = other_svc["id"]
                    break

            if mapped_svc_id:
                self.store.upsert_edge(
                    DependencyEdge(
                        source_id=svc_id,
                        target_id=mapped_svc_id,
                        kind="rpc_call",
                        metadata={
                            "rpc_method": fn_name,
                            "file": file_name,
                            "note": "resolved via RPC directory match in target repo",
                        },
                    )
                )
645
+
646
def _detect_library_deps(self, svc: Dict, all_service_ids: set) -> None:
    """Record ``library_dependency`` edges from one service to its peers.

    Dependency manifests are searched for the names of the other services,
    and source files are searched for import statements that mention them.
    Each other service is looked up under its id and under the id with
    underscores replaced by hyphens.

    Args:
        svc: Discovered-service record with the keys ``id`` and ``files``.
        all_service_ids: Ids of every discovered service.
    """
    svc_id = svc["id"]

    # Lookup name -> service id, for every service except this one.
    exact_to_id: Dict[str, Any] = {}
    for sid in sorted(all_service_ids):
        if sid == svc_id:
            continue
        exact_to_id[sid] = sid
        exact_to_id[sid.replace("_", "-")] = sid
    if not exact_to_id:
        return

    # The expressions depend only on the service names, so they are built
    # once here instead of once per file. Names are escaped and must not
    # be glued to further identifier characters, so that service "api"
    # does not match an import of "apiclient".
    matchers = []
    for exact_name, target_id in exact_to_id.items():
        name_re = re.escape(exact_name)
        module_re = re.escape(exact_name.replace("-", "_"))
        matchers.append(
            (
                exact_name,
                target_id,
                # requirements.txt entry: the name at the start of a line,
                # followed by a version specifier, extras, marker or EOL.
                re.compile(rf"(?m)^\s*{name_re}\s*(?:[=<>~!\[;@]|$)"),
                # e.g. `import specgen_local` / `from specgen_local import x`
                re.compile(rf"(?:from|import)\s+{module_re}(?!\w)"),
                # e.g. `require('specgen-local')` / `import x from "specgen-local"`
                re.compile(rf"(?:import|require).*(?<![\w-]){name_re}(?![\w-])"),
            )
        )

    manifest_names = ("package.json", "requirements.txt", "go.mod", "pom.xml", "build.gradle")
    source_suffixes = (".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".java")

    for fp in svc["files"]:
        name = fp.name
        is_manifest = name in manifest_names
        if not is_manifest and fp.suffix not in source_suffixes:
            continue

        content = self._read(fp)
        # Both spellings of a service name map to the same target; emit at
        # most one edge per target and file.
        linked = set()

        for exact_name, target_id, req_line_re, py_import_re, js_import_re in matchers:
            if target_id in linked:
                continue

            if is_manifest:
                # Simple heuristic: the manifest names the other project.
                found = (
                    f'"{exact_name}"' in content
                    or f"'{exact_name}'" in content
                    or f" {exact_name}==" in content
                    or (
                        name == "requirements.txt"
                        and req_line_re.search(content) is not None
                    )
                )
                note = "manifest dependency"
            else:
                # Source import tracing.
                found = (
                    py_import_re.search(content) is not None
                    or js_import_re.search(content) is not None
                )
                note = "source import"

            if not found:
                continue

            linked.add(target_id)
            self.store.upsert_edge(
                DependencyEdge(
                    source_id=svc_id,
                    target_id=target_id,
                    kind="library_dependency",
                    metadata={"file": str(name), "note": note},
                )
            )
696
+