codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,6 @@
1
+ """Cloud provider pattern modules for infrastructure scanning.
2
+
3
+ Each sub-module exposes:
4
+ - ``TF_RESOURCE_MAP``: mapping of Terraform resource type → (node_class, kind).
5
+ - ``CDK_PATTERNS``: list of (pattern_substring, node_class, kind) tuples for CDK detection in TypeScript/Python sources.
6
+ """
@@ -0,0 +1,62 @@
1
+ """AWS infrastructure pattern definitions.
2
+
3
+ Covers:
4
+ - Terraform (``aws_*`` resource types)
5
+ - AWS CDK for TypeScript / Python (``new rds.DatabaseInstance``, etc.)
6
+ - CDKTF (``Rds.DatabaseInstance(``))
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Terraform resource type → (node_class_label, kind)
13
+ # node_class_label: "datastore" or "queue"
14
+ # ---------------------------------------------------------------------------
15
+ TF_RESOURCE_MAP: dict[str, tuple[str, str]] = {
16
+ # RDS / relational databases
17
+ "aws_db_instance": ("datastore", "rds"),
18
+ "aws_rds_cluster": ("datastore", "rds"),
19
+ "aws_rds_cluster_instance": ("datastore", "rds"),
20
+ # DynamoDB
21
+ "aws_dynamodb_table": ("datastore", "dynamodb"),
22
+ # S3
23
+ "aws_s3_bucket": ("datastore", "s3"),
24
+ # ElastiCache (Redis / Memcached)
25
+ "aws_elasticache_cluster": ("datastore", "redis"),
26
+ "aws_elasticache_replication_group": ("datastore", "redis"),
27
+ # SQS
28
+ "aws_sqs_queue": ("queue", "sqs"),
29
+ # SNS
30
+ "aws_sns_topic": ("queue", "sns"),
31
+ # MSK (managed Kafka)
32
+ "aws_msk_cluster": ("queue", "kafka"),
33
+ "aws_msk_serverless_cluster": ("queue", "kafka"),
34
+ # OpenSearch / Elasticsearch
35
+ "aws_opensearch_domain": ("datastore", "opensearch"),
36
+ "aws_elasticsearch_domain": ("datastore", "opensearch"),
37
+ }
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # CDK / CDKTF patterns for TypeScript and Python source files.
41
+ # Each entry: (pattern_substring, node_class_label, kind)
42
+ # ---------------------------------------------------------------------------
43
+ CDK_PATTERNS: list[tuple[str, str, str]] = [
44
+ # TypeScript CDK
45
+ ("new rds.DatabaseInstance(", "datastore", "rds"),
46
+ ("new rds.CfnDBInstance(", "datastore", "rds"),
47
+ ("new rds.DatabaseCluster(", "datastore", "rds"),
48
+ ("new dynamodb.Table(", "datastore", "dynamodb"),
49
+ ("new s3.Bucket(", "datastore", "s3"),
50
+ ("new elasticache.CfnCacheCluster(", "datastore", "redis"),
51
+ ("new sqs.Queue(", "queue", "sqs"),
52
+ ("new sns.Topic(", "queue", "sns"),
53
+ ("new msk.Cluster(", "queue", "kafka"),
54
+ # Python CDK / CDKTF
55
+ ("rds.DatabaseInstance(", "datastore", "rds"),
56
+ ("rds.CfnDBInstance(", "datastore", "rds"),
57
+ ("dynamodb.Table(", "datastore", "dynamodb"),
58
+ ("s3.Bucket(", "datastore", "s3"),
59
+ ("sqs.Queue(", "queue", "sqs"),
60
+ ("sns.Topic(", "queue", "sns"),
61
+ ("msk.Cluster(", "queue", "kafka"),
62
+ ]
@@ -0,0 +1,64 @@
1
+ """Azure infrastructure pattern definitions.
2
+
3
+ Covers:
4
+ - Terraform AzureRM provider (``azurerm_*`` resource types)
5
+ - Azure CDK for TypeScript / Python (``@cdktf/provider-azurerm``, ``azure-native``)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # Terraform resource type → (node_class_label, kind)
12
+ # ---------------------------------------------------------------------------
13
+ TF_RESOURCE_MAP: dict[str, tuple[str, str]] = {
14
+ # SQL / Relational
15
+ "azurerm_sql_database": ("datastore", "azure_sql"),
16
+ "azurerm_sql_server": ("datastore", "azure_sql"),
17
+ "azurerm_mssql_database": ("datastore", "azure_sql"),
18
+ "azurerm_postgresql_server": ("datastore", "azure_sql"),
19
+ "azurerm_postgresql_flexible_server": ("datastore", "azure_sql"),
20
+ "azurerm_mysql_server": ("datastore", "azure_sql"),
21
+ "azurerm_mysql_flexible_server": ("datastore", "azure_sql"),
22
+ # CosmosDB (NoSQL / multi-model)
23
+ "azurerm_cosmosdb_account": ("datastore", "cosmosdb"),
24
+ "azurerm_cosmosdb_sql_database": ("datastore", "cosmosdb"),
25
+ # Blob / data lake storage
26
+ "azurerm_storage_account": ("datastore", "azure_blob"),
27
+ "azurerm_storage_container": ("datastore", "azure_blob"),
28
+ # Redis Cache
29
+ "azurerm_redis_cache": ("datastore", "redis"),
30
+ # Service Bus (queues and topics)
31
+ "azurerm_servicebus_namespace": ("queue", "servicebus"),
32
+ "azurerm_servicebus_queue": ("queue", "servicebus"),
33
+ "azurerm_servicebus_topic": ("queue", "servicebus"),
34
+ # Event Hub (streaming / Kafka-compatible)
35
+ "azurerm_eventhub_namespace": ("queue", "eventhub"),
36
+ "azurerm_eventhub": ("queue", "eventhub"),
37
+ # Event Grid
38
+ "azurerm_eventgrid_topic": ("queue", "eventgrid"),
39
+ "azurerm_eventgrid_domain": ("queue", "eventgrid"),
40
+ # Azure Data Explorer (Kusto)
41
+ "azurerm_kusto_cluster": ("datastore", "kusto"),
42
+ }
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # CDK / pulumi-azure / azure-native patterns for TypeScript and Python.
46
+ # ---------------------------------------------------------------------------
47
+ CDK_PATTERNS: list[tuple[str, str, str]] = [
48
+ # TypeScript (azure-native SDK / Pulumi Azure)
49
+ ("new sql.Database(", "datastore", "azure_sql"),
50
+ ("new postgres.Server(", "datastore", "azure_sql"),
51
+ ("new cosmos.DatabaseAccount(", "datastore", "cosmosdb"),
52
+ ("new storage.StorageAccount(", "datastore", "azure_blob"),
53
+ ("new redis.Redis(", "datastore", "redis"),
54
+ ("new servicebus.Queue(", "queue", "servicebus"),
55
+ ("new servicebus.Topic(", "queue", "servicebus"),
56
+ ("new eventhub.EventHub(", "queue", "eventhub"),
57
+ ("new eventgrid.Topic(", "queue", "eventgrid"),
58
+ # Python (pulumi-azure or azure-native)
59
+ ("sql.Database(", "datastore", "azure_sql"),
60
+ ("cosmos.DatabaseAccount(", "datastore", "cosmosdb"),
61
+ ("storage.StorageAccount(", "datastore", "azure_blob"),
62
+ ("servicebus.Queue(", "queue", "servicebus"),
63
+ ("eventhub.EventHub(", "queue", "eventhub"),
64
+ ]
@@ -0,0 +1,59 @@
1
+ """GCP infrastructure pattern definitions.
2
+
3
+ Covers:
4
+ - Terraform Google provider (``google_*`` resource types)
5
+ - GCP CDK for TypeScript / Python (``@cdktf/provider-google``, Pulumi GCP)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # Terraform resource type → (node_class_label, kind)
12
+ # ---------------------------------------------------------------------------
13
+ TF_RESOURCE_MAP: dict[str, tuple[str, str]] = {
14
+ # Cloud SQL
15
+ "google_sql_database_instance": ("datastore", "cloud_sql"),
16
+ "google_sql_database": ("datastore", "cloud_sql"),
17
+ # Bigtable
18
+ "google_bigtable_instance": ("datastore", "bigtable"),
19
+ "google_bigtable_table": ("datastore", "bigtable"),
20
+ # Firestore / Datastore
21
+ "google_firestore_document": ("datastore", "firestore"),
22
+ "google_datastore_index": ("datastore", "firestore"),
23
+ # Spanner
24
+ "google_spanner_instance": ("datastore", "spanner"),
25
+ "google_spanner_database": ("datastore", "spanner"),
26
+ # Memorystore (Redis)
27
+ "google_redis_instance": ("datastore", "redis"),
28
+ # GCS (Cloud Storage)
29
+ "google_storage_bucket": ("datastore", "gcs"),
30
+ # Pub/Sub
31
+ "google_pubsub_topic": ("queue", "pubsub"),
32
+ "google_pubsub_subscription": ("queue", "pubsub"),
33
+ # BigQuery (analytical)
34
+ "google_bigquery_dataset": ("datastore", "bigquery"),
35
+ "google_bigquery_table": ("datastore", "bigquery"),
36
+ }
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # CDK / Pulumi GCP patterns for TypeScript and Python.
40
+ # ---------------------------------------------------------------------------
41
+ CDK_PATTERNS: list[tuple[str, str, str]] = [
42
+ # TypeScript (Pulumi GCP / CDKTF google)
43
+ ("new sql.DatabaseInstance(", "datastore", "cloud_sql"),
44
+ ("new bigtable.Instance(", "datastore", "bigtable"),
45
+ ("new firestore.Document(", "datastore", "firestore"),
46
+ ("new spanner.Instance(", "datastore", "spanner"),
47
+ ("new redis.Instance(", "datastore", "redis"),
48
+ ("new storage.Bucket(", "datastore", "gcs"),
49
+ ("new pubsub.Topic(", "queue", "pubsub"),
50
+ ("new pubsub.Subscription(", "queue", "pubsub"),
51
+ ("new bigquery.Dataset(", "datastore", "bigquery"),
52
+ # Python
53
+ ("sql.DatabaseInstance(", "datastore", "cloud_sql"),
54
+ ("bigtable.Instance(", "datastore", "bigtable"),
55
+ ("storage.Bucket(", "datastore", "gcs"),
56
+ ("pubsub.Topic(", "queue", "pubsub"),
57
+ ("pubsub.Subscription(", "queue", "pubsub"),
58
+ ("bigquery.Dataset(", "datastore", "bigquery"),
59
+ ]
@@ -0,0 +1,175 @@
1
+ """Graph schema: nodes, edges, and the GraphStore abstract interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+ from typing import Any, Dict, List, Optional
8
+
9
+
10
+ @dataclass
11
+ class ServiceNode:
12
+ """Represents a discovered service (microservice, API, worker, etc.)."""
13
+
14
+ id: str
15
+ name: str
16
+ repo: str
17
+ language: str = "python"
18
+ tags: List[str] = field(default_factory=list)
19
+ service_type: str = "api" # api | worker | cron | cli | service | infrastructure
20
+
21
+
22
+ @dataclass
23
+ class DataStoreNode:
24
+ """Represents a data store (DB, cache, object store)."""
25
+
26
+ id: str
27
+ kind: str # postgres | redis | s3 | kafka | dynamodb | sqlite | mongodb | neo4j | chromadb
28
+ name: str
29
+
30
+
31
+ @dataclass
32
+ class QueueNode:
33
+ """Represents a message queue."""
34
+
35
+ id: str
36
+ kind: str # sqs | rabbitmq | kafka | pubsub
37
+ name: str
38
+
39
+
40
+ @dataclass
41
+ class MethodNode:
42
+ """Represents a function or method extracted from source code.
43
+
44
+ Node ID format: ``{repo}::{file_path}::{class_name}::{method_name}``
45
+ """
46
+
47
+ id: str
48
+ repo: str
49
+ file_path: str
50
+ class_name: Optional[str]
51
+ method_name: str
52
+ signature: str
53
+ docstring: Optional[str]
54
+ line_start: int
55
+ line_end: int
56
+ service_id: str
57
+ typed_signature: Optional[str] = None # e.g. "validate(token: str) -> bool"
58
+
59
+
60
+ @dataclass
61
+ class FlowNode:
62
+ """Represents a named execution flow detected from entry points.
63
+
64
+ A flow is a BFS-traversal from an entry point (HTTP handler, CLI command,
65
+ worker) through the call graph. It records which methods participate and
66
+ in which order, enabling Linear tasks to say "step 3 of LoginFlow".
67
+
68
+ Node ID format: ``flow::{service_id}::{flow_name}``
69
+ """
70
+
71
+ id: str
72
+ name: str # e.g. "LoginFlow"
73
+ service_id: str
74
+ entry_method_id: str # method ID of the detected entry point
75
+ step_count: int = 0
76
+
77
+
78
+ @dataclass
79
+ class DependencyEdge:
80
+ """A directed relationship between two graph nodes."""
81
+
82
+ source_id: str
83
+ target_id: str
84
+ kind: str # http_call | grpc_call | db_read | db_write | queue_publish | queue_consume | import | method_call
85
+ metadata: Dict[str, Any] = field(default_factory=dict)
86
+
87
+
88
+ class GraphStore(ABC):
89
+ """Abstract interface for the architecture graph store."""
90
+
91
+ @abstractmethod
92
+ def upsert_node(self, node: ServiceNode | DataStoreNode | QueueNode | MethodNode | FlowNode) -> None:
93
+ """Insert or update a node in the graph."""
94
+ ...
95
+
96
+ @abstractmethod
97
+ def upsert_edge(self, edge: DependencyEdge) -> None:
98
+ """Insert or update an edge in the graph."""
99
+ ...
100
+
101
+ @abstractmethod
102
+ def get_service(self, service_id: str) -> Optional[ServiceNode]:
103
+ """Retrieve a service node by ID."""
104
+ ...
105
+
106
+ @abstractmethod
107
+ def get_all_services(self) -> List[ServiceNode]:
108
+ """Return all service nodes."""
109
+ ...
110
+
111
+ @abstractmethod
112
+ def get_dependencies(self, service_id: str) -> List[DependencyEdge]:
113
+ """Return all edges where source_id == service_id."""
114
+ ...
115
+
116
+ @abstractmethod
117
+ def get_dependents(self, service_id: str) -> List[DependencyEdge]:
118
+ """Return all edges where target_id == service_id."""
119
+ ...
120
+
121
+ @abstractmethod
122
+ def get_method(self, method_id: str) -> Optional[MethodNode]:
123
+ """Retrieve a method node by ID."""
124
+ ...
125
+
126
+ @abstractmethod
127
+ def get_call_path(
128
+ self, from_method_id: str, to_method_id: str, max_depth: int = 5
129
+ ) -> List[List[str]]:
130
+ """Return all call paths from one method to another (BFS)."""
131
+ ...
132
+
133
+ @abstractmethod
134
+ def get_methods_for_service(self, service_id: str) -> List[MethodNode]:
135
+ """Return all method nodes belonging to a service."""
136
+ ...
137
+
138
+ @abstractmethod
139
+ def get_callers_of_method(self, method_id: str) -> List[MethodNode]:
140
+ """Return all MethodNodes that call the given method (reverse call lookup).
141
+
142
+ Uses ``method_call`` edges in the store where ``target_id == method_id``.
143
+ Works for all languages as long as call edges were built by
144
+ :class:`~corbell.core.graph.method_graph.MethodGraphBuilder`.
145
+ """
146
+ ...
147
+
148
+ @abstractmethod
149
+ def get_flows_for_method(self, method_id: str) -> List[Dict[str, Any]]:
150
+ """Return flows that include the given method as a step.
151
+
152
+ Returns a list of dicts with keys ``flow_id``, ``flow_name``,
153
+ ``step`` (1-based position in the flow), and ``entry_method_id``.
154
+ """
155
+ ...
156
+
157
+ @abstractmethod
158
+ def get_all_nodes_summary(self) -> Dict[str, Any]:
159
+ """Return a summary of all nodes and edges for display."""
160
+ ...
161
+
162
+ @abstractmethod
163
+ def to_mermaid(self) -> str:
164
+ """Return a Mermaid representation of the graph."""
165
+ ...
166
+
167
+ @abstractmethod
168
+ def to_json(self) -> str:
169
+ """Return the JSON representation of the service graph."""
170
+ ...
171
+
172
+ @abstractmethod
173
+ def clear(self) -> None:
174
+ """Remove all data from the store."""
175
+ ...