codebase-retrieval-context-engine 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
- corbell/__init__.py +6 -0
- corbell/cli/__init__.py +1 -0
- corbell/cli/commands/__init__.py +1 -0
- corbell/cli/commands/index.py +86 -0
- corbell/cli/commands/query.py +71 -0
- corbell/cli/main.py +57 -0
- corbell/core/__init__.py +1 -0
- corbell/core/constants.py +52 -0
- corbell/core/embeddings/__init__.py +6 -0
- corbell/core/embeddings/base.py +68 -0
- corbell/core/embeddings/extractor.py +201 -0
- corbell/core/embeddings/factory.py +48 -0
- corbell/core/embeddings/model.py +401 -0
- corbell/core/embeddings/search_cache.py +95 -0
- corbell/core/embeddings/sqlite_store.py +271 -0
- corbell/core/gitignore.py +76 -0
- corbell/core/graph/__init__.py +1 -0
- corbell/core/graph/builder.py +696 -0
- corbell/core/graph/method_graph.py +1077 -0
- corbell/core/graph/providers/__init__.py +6 -0
- corbell/core/graph/providers/aws_patterns.py +62 -0
- corbell/core/graph/providers/azure_patterns.py +64 -0
- corbell/core/graph/providers/gcp_patterns.py +59 -0
- corbell/core/graph/schema.py +175 -0
- corbell/core/graph/sqlite_store.py +500 -0
- corbell/core/indexing/__init__.py +1 -0
- corbell/core/indexing/builder.py +608 -0
- corbell/core/indexing/lock.py +150 -0
- corbell/core/indexing/tracker.py +245 -0
- corbell/core/llm_client.py +677 -0
- corbell/core/mcp/__init__.py +1 -0
- corbell/core/mcp/server.py +214 -0
- corbell/core/query/__init__.py +1 -0
- corbell/core/query/diagnostics.py +38 -0
- corbell/core/query/engine.py +321 -0
- corbell/core/query/enhancer.py +102 -0
- corbell/core/query/formatter.py +98 -0
- corbell/core/query/graph_expander.py +284 -0
- corbell/core/query/merger.py +171 -0
- corbell/core/query/reranker.py +131 -0
- corbell/core/workspace.py +408 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""AWS infrastructure pattern definitions.
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
- Terraform (``aws_*`` resource types)
|
|
5
|
+
- AWS CDK for TypeScript / Python (``new rds.DatabaseInstance``, etc.)
|
|
6
|
+
- CDKTF (``Rds.DatabaseInstance(``))
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
# Terraform resource type → (node_class_label, kind)
|
|
13
|
+
# node_class_label: "datastore" or "queue"
|
|
14
|
+
# ---------------------------------------------------------------------------
|
|
15
|
+
# Each row names a (node_class_label, kind) pair once, followed by every
# Terraform resource type classified that way. Rows are flattened in order,
# so the resulting key order follows the row order below.
TF_RESOURCE_MAP: dict[str, tuple[str, str]] = {
    resource_type: (node_class, kind)
    for node_class, kind, resource_types in (
        # RDS / relational databases
        (
            "datastore",
            "rds",
            ["aws_db_instance", "aws_rds_cluster", "aws_rds_cluster_instance"],
        ),
        # DynamoDB
        ("datastore", "dynamodb", ["aws_dynamodb_table"]),
        # S3
        ("datastore", "s3", ["aws_s3_bucket"]),
        # ElastiCache (Redis / Memcached): both are recorded with kind "redis"
        (
            "datastore",
            "redis",
            ["aws_elasticache_cluster", "aws_elasticache_replication_group"],
        ),
        # SQS
        ("queue", "sqs", ["aws_sqs_queue"]),
        # SNS
        ("queue", "sns", ["aws_sns_topic"]),
        # MSK (managed Kafka)
        ("queue", "kafka", ["aws_msk_cluster", "aws_msk_serverless_cluster"]),
        # OpenSearch / Elasticsearch: both are recorded with kind "opensearch"
        (
            "datastore",
            "opensearch",
            ["aws_opensearch_domain", "aws_elasticsearch_domain"],
        ),
    )
    for resource_type in resource_types
}
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# CDK / CDKTF patterns for TypeScript and Python source files.
|
|
41
|
+
# Each entry: (pattern_substring, node_class_label, kind)
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Rows pair a (node_class_label, kind) classification with the source
# substrings that indicate it; they are flattened, in order, into
# (pattern_substring, node_class_label, kind) triples.
# NOTE(review): the Python section has no counterpart for the TypeScript
# "rds.DatabaseCluster(" and "elasticache.CfnCacheCluster(" entries --
# confirm against the consumer whether that is intentional.
CDK_PATTERNS: list[tuple[str, str, str]] = [
    (needle, node_class, kind)
    for node_class, kind, needles in (
        # TypeScript CDK
        (
            "datastore",
            "rds",
            [
                "new rds.DatabaseInstance(",
                "new rds.CfnDBInstance(",
                "new rds.DatabaseCluster(",
            ],
        ),
        ("datastore", "dynamodb", ["new dynamodb.Table("]),
        ("datastore", "s3", ["new s3.Bucket("]),
        ("datastore", "redis", ["new elasticache.CfnCacheCluster("]),
        ("queue", "sqs", ["new sqs.Queue("]),
        ("queue", "sns", ["new sns.Topic("]),
        ("queue", "kafka", ["new msk.Cluster("]),
        # Python CDK / CDKTF
        ("datastore", "rds", ["rds.DatabaseInstance(", "rds.CfnDBInstance("]),
        ("datastore", "dynamodb", ["dynamodb.Table("]),
        ("datastore", "s3", ["s3.Bucket("]),
        ("queue", "sqs", ["sqs.Queue("]),
        ("queue", "sns", ["sns.Topic("]),
        ("queue", "kafka", ["msk.Cluster("]),
    )
    for needle in needles
]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Azure infrastructure pattern definitions.
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
- Terraform AzureRM provider (``azurerm_*`` resource types)
|
|
5
|
+
- CDKTF / Pulumi for TypeScript / Python (``@cdktf/provider-azurerm``, ``azure-native``)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
# Terraform resource type → (node_class_label, kind)
|
|
12
|
+
# ---------------------------------------------------------------------------
|
|
13
|
+
# Each row names a (node_class_label, kind) pair once, followed by every
# AzureRM resource type classified that way. Rows are flattened in order,
# so the resulting key order follows the row order below.
TF_RESOURCE_MAP: dict[str, tuple[str, str]] = {
    resource_type: (node_class, kind)
    for node_class, kind, resource_types in (
        # SQL / relational: every engine is recorded with kind "azure_sql"
        (
            "datastore",
            "azure_sql",
            [
                "azurerm_sql_database",
                "azurerm_sql_server",
                "azurerm_mssql_database",
                "azurerm_postgresql_server",
                "azurerm_postgresql_flexible_server",
                "azurerm_mysql_server",
                "azurerm_mysql_flexible_server",
            ],
        ),
        # CosmosDB (NoSQL / multi-model)
        (
            "datastore",
            "cosmosdb",
            ["azurerm_cosmosdb_account", "azurerm_cosmosdb_sql_database"],
        ),
        # Blob / data lake storage
        (
            "datastore",
            "azure_blob",
            ["azurerm_storage_account", "azurerm_storage_container"],
        ),
        # Redis Cache
        ("datastore", "redis", ["azurerm_redis_cache"]),
        # Service Bus (queues and topics)
        (
            "queue",
            "servicebus",
            [
                "azurerm_servicebus_namespace",
                "azurerm_servicebus_queue",
                "azurerm_servicebus_topic",
            ],
        ),
        # Event Hub (streaming / Kafka-compatible)
        ("queue", "eventhub", ["azurerm_eventhub_namespace", "azurerm_eventhub"]),
        # Event Grid
        ("queue", "eventgrid", ["azurerm_eventgrid_topic", "azurerm_eventgrid_domain"]),
        # Azure Data Explorer (Kusto)
        ("datastore", "kusto", ["azurerm_kusto_cluster"]),
    )
    for resource_type in resource_types
}
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# CDK / pulumi-azure / azure-native patterns for TypeScript and Python.
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# Rows pair a (node_class_label, kind) classification with the source
# substrings that indicate it; they are flattened, in order, into
# (pattern_substring, node_class_label, kind) triples.
CDK_PATTERNS: list[tuple[str, str, str]] = [
    (needle, node_class, kind)
    for node_class, kind, needles in (
        # TypeScript (azure-native SDK / Pulumi Azure)
        ("datastore", "azure_sql", ["new sql.Database(", "new postgres.Server("]),
        ("datastore", "cosmosdb", ["new cosmos.DatabaseAccount("]),
        ("datastore", "azure_blob", ["new storage.StorageAccount("]),
        ("datastore", "redis", ["new redis.Redis("]),
        ("queue", "servicebus", ["new servicebus.Queue(", "new servicebus.Topic("]),
        ("queue", "eventhub", ["new eventhub.EventHub("]),
        ("queue", "eventgrid", ["new eventgrid.Topic("]),
        # Python (pulumi-azure or azure-native)
        ("datastore", "azure_sql", ["sql.Database("]),
        ("datastore", "cosmosdb", ["cosmos.DatabaseAccount("]),
        ("datastore", "azure_blob", ["storage.StorageAccount("]),
        ("queue", "servicebus", ["servicebus.Queue("]),
        ("queue", "eventhub", ["eventhub.EventHub("]),
    )
    for needle in needles
]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""GCP infrastructure pattern definitions.
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
- Terraform Google provider (``google_*`` resource types)
|
|
5
|
+
- GCP CDK for TypeScript / Python (``@cdktf/provider-google``, Pulumi GCP)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
# Terraform resource type → (node_class_label, kind)
|
|
12
|
+
# ---------------------------------------------------------------------------
|
|
13
|
+
# Each row names a (node_class_label, kind) pair once, followed by every
# Google provider resource type classified that way. Rows are flattened in
# order, so the resulting key order follows the row order below.
TF_RESOURCE_MAP: dict[str, tuple[str, str]] = {
    resource_type: (node_class, kind)
    for node_class, kind, resource_types in (
        # Cloud SQL
        (
            "datastore",
            "cloud_sql",
            ["google_sql_database_instance", "google_sql_database"],
        ),
        # Bigtable
        ("datastore", "bigtable", ["google_bigtable_instance", "google_bigtable_table"]),
        # Firestore / Datastore: both are recorded with kind "firestore"
        (
            "datastore",
            "firestore",
            ["google_firestore_document", "google_datastore_index"],
        ),
        # Spanner
        ("datastore", "spanner", ["google_spanner_instance", "google_spanner_database"]),
        # Memorystore (Redis)
        ("datastore", "redis", ["google_redis_instance"]),
        # GCS (Cloud Storage)
        ("datastore", "gcs", ["google_storage_bucket"]),
        # Pub/Sub
        ("queue", "pubsub", ["google_pubsub_topic", "google_pubsub_subscription"]),
        # BigQuery (analytical)
        ("datastore", "bigquery", ["google_bigquery_dataset", "google_bigquery_table"]),
    )
    for resource_type in resource_types
}
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# CDK / Pulumi GCP patterns for TypeScript and Python.
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Rows pair a (node_class_label, kind) classification with the source
# substrings that indicate it; they are flattened, in order, into
# (pattern_substring, node_class_label, kind) triples.
CDK_PATTERNS: list[tuple[str, str, str]] = [
    (needle, node_class, kind)
    for node_class, kind, needles in (
        # TypeScript (Pulumi GCP / CDKTF google)
        ("datastore", "cloud_sql", ["new sql.DatabaseInstance("]),
        ("datastore", "bigtable", ["new bigtable.Instance("]),
        ("datastore", "firestore", ["new firestore.Document("]),
        ("datastore", "spanner", ["new spanner.Instance("]),
        ("datastore", "redis", ["new redis.Instance("]),
        ("datastore", "gcs", ["new storage.Bucket("]),
        ("queue", "pubsub", ["new pubsub.Topic(", "new pubsub.Subscription("]),
        ("datastore", "bigquery", ["new bigquery.Dataset("]),
        # Python
        ("datastore", "cloud_sql", ["sql.DatabaseInstance("]),
        ("datastore", "bigtable", ["bigtable.Instance("]),
        ("datastore", "gcs", ["storage.Bucket("]),
        ("queue", "pubsub", ["pubsub.Topic(", "pubsub.Subscription("]),
        ("datastore", "bigquery", ["bigquery.Dataset("]),
    )
    for needle in needles
]
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Graph schema: nodes, edges, and the GraphStore abstract interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class ServiceNode:
    """Graph node for a discovered service (microservice, API, worker, etc.).

    Attributes:
        id: Node identifier.
        name: Service name.
        repo: Repository value associated with the service.
        language: Defaults to ``"python"``.
        tags: List of string tags; each instance gets its own empty list.
        service_type: One of ``api``, ``worker``, ``cron``, ``cli``,
            ``service`` or ``infrastructure``. Defaults to ``"api"``.
    """

    # Required fields (positional order matters to callers).
    id: str
    name: str
    repo: str

    # Optional fields with defaults.
    language: str = "python"
    tags: List[str] = field(default_factory=list)
    service_type: str = "api"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DataStoreNode:
    """Graph node for a data store (DB, cache, object store).

    Attributes:
        id: Node identifier.
        kind: Store technology, e.g. ``postgres``, ``redis``, ``s3``,
            ``kafka``, ``dynamodb``, ``sqlite``, ``mongodb``, ``neo4j``,
            ``chromadb``.
        name: Store name.
    """

    id: str
    kind: str
    name: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class QueueNode:
    """Graph node for a message queue.

    Attributes:
        id: Node identifier.
        kind: Queue technology, e.g. ``sqs``, ``rabbitmq``, ``kafka``,
            ``pubsub``.
        name: Queue name.
    """

    id: str
    kind: str
    name: str
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class MethodNode:
    """Graph node for a function or method extracted from source code.

    Node ID format: ``{repo}::{file_path}::{class_name}::{method_name}``

    Attributes:
        class_name: Optional; may be ``None``.
        docstring: Optional; may be ``None``.
        line_start: Integer start line of the definition.
        line_end: Integer end line of the definition.
        service_id: ID of the service associated with the method.
        typed_signature: Optional typed form of the signature, e.g.
            ``"validate(token: str) -> bool"``. Defaults to ``None``.
    """

    # Identity and location.
    id: str
    repo: str
    file_path: str
    class_name: Optional[str]
    method_name: str

    # Extracted source details.
    signature: str
    docstring: Optional[str]
    line_start: int
    line_end: int

    service_id: str
    typed_signature: Optional[str] = None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class FlowNode:
    """Graph node for a named execution flow detected from an entry point.

    A flow is the BFS traversal of the call graph starting at an entry point
    (HTTP handler, CLI command, worker). It captures which methods take part
    and their order, so that Linear tasks can refer to, for example,
    "step 3 of LoginFlow".

    Node ID format: ``flow::{service_id}::{flow_name}``

    Attributes:
        id: Node identifier.
        name: Flow name, e.g. ``"LoginFlow"``.
        service_id: ID of the service associated with the flow.
        entry_method_id: Method ID of the detected entry point.
        step_count: Integer step count. Defaults to ``0``.
    """

    id: str
    name: str
    service_id: str
    entry_method_id: str
    step_count: int = 0
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class DependencyEdge:
    """Directed relationship from one graph node to another.

    Attributes:
        source_id: ID of the node the edge starts at.
        target_id: ID of the node the edge points to.
        kind: Relationship type: ``http_call``, ``grpc_call``, ``db_read``,
            ``db_write``, ``queue_publish``, ``queue_consume``, ``import``
            or ``method_call``.
        metadata: Free-form dictionary; each instance gets its own empty
            dict.
    """

    source_id: str
    target_id: str
    kind: str
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class GraphStore(ABC):
    """Abstract interface that every architecture graph store implements.

    All methods are abstract; a concrete subclass must override each of them
    before it can be instantiated.
    """

    # -- writes -------------------------------------------------------------

    @abstractmethod
    def upsert_node(self, node: ServiceNode | DataStoreNode | QueueNode | MethodNode | FlowNode) -> None:
        """Insert the node into the graph, or update it."""

    @abstractmethod
    def upsert_edge(self, edge: DependencyEdge) -> None:
        """Insert the edge into the graph, or update it."""

    # -- service-level reads ------------------------------------------------

    @abstractmethod
    def get_service(self, service_id: str) -> Optional[ServiceNode]:
        """Look up a service node by its ID."""

    @abstractmethod
    def get_all_services(self) -> List[ServiceNode]:
        """Return every service node."""

    @abstractmethod
    def get_dependencies(self, service_id: str) -> List[DependencyEdge]:
        """Return every edge whose ``source_id`` equals ``service_id``."""

    @abstractmethod
    def get_dependents(self, service_id: str) -> List[DependencyEdge]:
        """Return every edge whose ``target_id`` equals ``service_id``."""

    # -- method-level reads -------------------------------------------------

    @abstractmethod
    def get_method(self, method_id: str) -> Optional[MethodNode]:
        """Look up a method node by its ID."""

    @abstractmethod
    def get_call_path(
        self,
        from_method_id: str,
        to_method_id: str,
        max_depth: int = 5,
    ) -> List[List[str]]:
        """Return all call paths (BFS) leading from one method to another."""

    @abstractmethod
    def get_methods_for_service(self, service_id: str) -> List[MethodNode]:
        """Return every method node that belongs to the given service."""

    @abstractmethod
    def get_callers_of_method(self, method_id: str) -> List[MethodNode]:
        """Return every MethodNode that calls the given method.

        This is the reverse call lookup: it relies on ``method_call`` edges
        in the store whose ``target_id == method_id``. It works for all
        languages provided the call edges were built by
        :class:`~corbell.core.graph.method_graph.MethodGraphBuilder`.
        """

    @abstractmethod
    def get_flows_for_method(self, method_id: str) -> List[Dict[str, Any]]:
        """Return the flows in which the given method appears as a step.

        Each item is a dict with the keys ``flow_id``, ``flow_name``,
        ``step`` (1-based position in the flow) and ``entry_method_id``.
        """

    # -- whole-graph views --------------------------------------------------

    @abstractmethod
    def get_all_nodes_summary(self) -> Dict[str, Any]:
        """Return a display-oriented summary of all nodes and edges."""

    @abstractmethod
    def to_mermaid(self) -> str:
        """Return the graph rendered as Mermaid."""

    @abstractmethod
    def to_json(self) -> str:
        """Return the service graph rendered as JSON."""

    @abstractmethod
    def clear(self) -> None:
        """Delete all data held by the store."""
|