agentic-data-contracts 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/CHANGELOG.md +17 -0
  2. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/PKG-INFO +30 -1
  3. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/README.md +29 -0
  4. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/examples/revenue_agent/contract.yml +4 -1
  5. agentic_data_contracts-0.2.6/examples/revenue_agent/semantic.yml +22 -0
  6. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/pyproject.toml +1 -1
  7. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/core/contract.py +61 -12
  8. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/semantic/base.py +8 -0
  9. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/semantic/cube.py +8 -1
  10. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/semantic/dbt.py +8 -1
  11. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/semantic/yaml_source.py +16 -1
  12. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/tools/factory.py +29 -6
  13. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/fixtures/semantic_source.yml +21 -0
  14. agentic_data_contracts-0.2.6/tests/test_core/test_scalability.py +144 -0
  15. agentic_data_contracts-0.2.6/tests/test_semantic/test_relationships.py +83 -0
  16. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_semantic/test_yaml_source.py +1 -1
  17. agentic_data_contracts-0.2.6/tests/test_tools/test_pagination.py +80 -0
  18. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/uv.lock +1 -1
  19. agentic_data_contracts-0.2.4/examples/revenue_agent/semantic.yml +0 -51
  20. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/.github/dependabot.yml +0 -0
  21. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/.github/workflows/ci.yml +0 -0
  22. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/.gitignore +0 -0
  23. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/.pre-commit-config.yaml +0 -0
  24. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/.python-version +0 -0
  25. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/CLAUDE.md +0 -0
  26. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/LICENSE +0 -0
  27. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/docs/architecture.md +0 -0
  28. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/examples/revenue_agent/agent.py +0 -0
  29. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/examples/revenue_agent/setup_db.py +0 -0
  30. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/__init__.py +0 -0
  31. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/adapters/__init__.py +0 -0
  32. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/adapters/base.py +0 -0
  33. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/adapters/duckdb.py +0 -0
  34. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/bridge/__init__.py +0 -0
  35. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/bridge/compiler.py +0 -0
  36. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/core/__init__.py +0 -0
  37. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/core/schema.py +0 -0
  38. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/core/session.py +0 -0
  39. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/py.typed +0 -0
  40. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/semantic/__init__.py +0 -0
  41. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/tools/__init__.py +0 -0
  42. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/tools/middleware.py +0 -0
  43. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/tools/sdk.py +0 -0
  44. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/validation/__init__.py +0 -0
  45. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/validation/checkers.py +0 -0
  46. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/validation/explain.py +0 -0
  47. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/src/agentic_data_contracts/validation/validator.py +0 -0
  48. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/__init__.py +0 -0
  49. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/conftest.py +0 -0
  50. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/fixtures/minimal_contract.yml +0 -0
  51. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/fixtures/sample_cube_schema.yml +0 -0
  52. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/fixtures/sample_dbt_manifest.json +0 -0
  53. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/fixtures/valid_contract.yml +0 -0
  54. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_adapters/__init__.py +0 -0
  55. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_adapters/test_duckdb.py +0 -0
  56. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_bridge/__init__.py +0 -0
  57. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_bridge/test_compiler.py +0 -0
  58. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/__init__.py +0 -0
  59. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/test_contract.py +0 -0
  60. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/test_load_semantic_source.py +0 -0
  61. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/test_schema.py +0 -0
  62. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/test_sdk_config.py +0 -0
  63. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/test_session.py +0 -0
  64. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/test_system_prompt_metrics.py +0 -0
  65. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_core/test_wildcard_tables.py +0 -0
  66. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_public_api.py +0 -0
  67. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_semantic/__init__.py +0 -0
  68. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_semantic/test_cube.py +0 -0
  69. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_semantic/test_dbt.py +0 -0
  70. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_semantic/test_search.py +0 -0
  71. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_tools/__init__.py +0 -0
  72. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_tools/test_auto_load.py +0 -0
  73. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_tools/test_factory.py +0 -0
  74. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_tools/test_middleware.py +0 -0
  75. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_tools/test_sdk.py +0 -0
  76. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_tools/test_semantic_tools.py +0 -0
  77. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_tools/test_wildcard_tools.py +0 -0
  78. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_validation/__init__.py +0 -0
  79. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_validation/test_checkers.py +0 -0
  80. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_validation/test_explain.py +0 -0
  81. {agentic_data_contracts-0.2.4 → agentic_data_contracts-0.2.6}/tests/test_validation/test_validator.py +0 -0
@@ -2,6 +2,23 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file.
4
4
 
5
+ ## [0.2.6] - 2026-03-29
6
+
7
+ ### Changed
8
+
9
+ - **Compact system prompt at scale**: When metrics exceed 20, the system prompt shows domain names with counts (e.g., "acquisition (45)") instead of listing every metric. Reduces prompt from ~6K to ~100 tokens for large metric sets.
10
+ - **Paginated `list_tables`**: Added `limit` (default 50) and `offset` parameters for handling schemas with many tables. Response includes `total` count and `next_offset` for pagination.
11
+ - **Cached wildcard resolution**: `resolve_tables()` is now idempotent — subsequent calls are no-ops, avoiding redundant database queries.
12
+
13
+ ## [0.2.5] - 2026-03-29
14
+
15
+ ### Added
16
+
17
+ - **Table relationship metadata**: `Relationship` dataclass and `get_relationships()` on `SemanticSource` protocol for declaring join paths between tables (from/to column + relationship type)
18
+ - **Relationships in system prompt**: `to_system_prompt()` includes join paths so the agent knows how to combine tables correctly
19
+ - **YamlSource relationships**: Parsed from `relationships` section in semantic YAML files
20
+ - DbtSource and CubeSource return empty relationships (ready for future parsing of native join metadata)
21
+
5
22
  ## [0.2.4] - 2026-03-29
6
23
 
7
24
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentic-data-contracts
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: YAML-first data contract governance for AI agents
5
5
  Project-URL: Homepage, https://github.com/flyersworder/agentic-data-contracts
6
6
  Project-URL: Repository, https://github.com/flyersworder/agentic-data-contracts
@@ -277,6 +277,23 @@ semantic:
277
277
  path: "./cube/schema.yml"
278
278
  ```
279
279
 
280
+ ## Table Relationships
281
+
282
+ Define join paths so the agent knows how to combine tables correctly:
283
+
284
+ ```yaml
285
+ # semantic.yml
286
+ relationships:
287
+ - from: analytics.orders.customer_id
288
+ to: analytics.customers.id
289
+ type: many_to_one
290
+ - from: analytics.orders.product_id
291
+ to: analytics.products.id
292
+ type: many_to_one
293
+ ```
294
+
295
+ The agent sees these in its system prompt and uses them to write correct JOINs instead of guessing from column names.
296
+
280
297
  ## Scalable Metric Discovery
281
298
 
282
299
  For large data lakes with hundreds of KPIs, group metrics by domain and let the agent discover them efficiently:
@@ -297,6 +314,18 @@ lookup_metric("acquisition cost") → fuzzy match, returns [CAC, CPA] as candi
297
314
  list_metrics(domain="retention") → only retention metrics
298
315
  ```
299
316
 
317
+ ## Scaling to Large Organizations
318
+
319
+ Tested for 200+ tables, 300+ metrics, 50+ relationships across multiple schemas.
320
+
321
+ | Concern | How it scales |
322
+ |---|---|
323
+ | **System prompt size** | >20 metrics: auto-switches to compact domain counts (`acquisition (45)`) instead of listing every metric |
324
+ | **Table discovery** | `list_tables` is paginated (default 50, with offset). Use `schema` filter for targeted browsing |
325
+ | **Wildcard schemas** | `tables: ["*"]` discovers tables from the database. Resolution is cached — no repeated queries |
326
+ | **Metric lookup** | Fuzzy search via `thefuzz` (C++ backed) — sub-millisecond even with 1000+ metrics |
327
+ | **SQL validation** | Set-based allowlist check — O(1) per table reference regardless of allowlist size |
328
+
300
329
  ## Resource Limits
301
330
 
302
331
  ```yaml
@@ -224,6 +224,23 @@ semantic:
224
224
  path: "./cube/schema.yml"
225
225
  ```
226
226
 
227
+ ## Table Relationships
228
+
229
+ Define join paths so the agent knows how to combine tables correctly:
230
+
231
+ ```yaml
232
+ # semantic.yml
233
+ relationships:
234
+ - from: analytics.orders.customer_id
235
+ to: analytics.customers.id
236
+ type: many_to_one
237
+ - from: analytics.orders.product_id
238
+ to: analytics.products.id
239
+ type: many_to_one
240
+ ```
241
+
242
+ The agent sees these in its system prompt and uses them to write correct JOINs instead of guessing from column names.
243
+
227
244
  ## Scalable Metric Discovery
228
245
 
229
246
  For large data lakes with hundreds of KPIs, group metrics by domain and let the agent discover them efficiently:
@@ -244,6 +261,18 @@ lookup_metric("acquisition cost") → fuzzy match, returns [CAC, CPA] as candi
244
261
  list_metrics(domain="retention") → only retention metrics
245
262
  ```
246
263
 
264
+ ## Scaling to Large Organizations
265
+
266
+ Tested for 200+ tables, 300+ metrics, 50+ relationships across multiple schemas.
267
+
268
+ | Concern | How it scales |
269
+ |---|---|
270
+ | **System prompt size** | >20 metrics: auto-switches to compact domain counts (`acquisition (45)`) instead of listing every metric |
271
+ | **Table discovery** | `list_tables` is paginated (default 50, with offset). Use `schema` filter for targeted browsing |
272
+ | **Wildcard schemas** | `tables: ["*"]` discovers tables from the database. Resolution is cached — no repeated queries |
273
+ | **Metric lookup** | Fuzzy search via `thefuzz` (C++ backed) — sub-millisecond even with 1000+ metrics |
274
+ | **SQL validation** | Set-based allowlist check — O(1) per table reference regardless of allowlist size |
275
+
247
276
  ## Resource Limits
248
277
 
249
278
  ```yaml
@@ -9,12 +9,15 @@ semantic:
9
9
  - schema: analytics
10
10
  tables: [orders, customers, subscriptions]
11
11
  forbidden_operations: [DELETE, DROP, TRUNCATE, UPDATE, INSERT]
12
+ domains:
13
+ revenue: [total_revenue, revenue_by_region]
12
14
  rules:
13
15
  - name: tenant_isolation
14
16
  description: "All queries must filter by tenant_id"
15
17
  enforcement: block
18
+ filter_column: tenant_id
16
19
  - name: use_semantic_revenue
17
- description: "Revenue calculations must use the dbt metric definition"
20
+ description: "Revenue calculations must use the metric definitions"
18
21
  enforcement: warn
19
22
  - name: no_select_star
20
23
  description: "Must specify explicit columns"
@@ -0,0 +1,22 @@
1
+ # Semantic source — define only what the database can't tell the agent.
2
+ # Table columns are discovered at runtime via the describe_table tool.
3
+
4
+ metrics:
5
+ - name: total_revenue
6
+ description: "Total revenue from completed orders"
7
+ sql_expression: "SUM(amount) FILTER (WHERE status = 'completed')"
8
+ source_model: analytics.orders
9
+ filters:
10
+ - "status = 'completed'"
11
+
12
+ - name: revenue_by_region
13
+ description: "Revenue broken down by customer region"
14
+ sql_expression: "SUM(o.amount) GROUP BY c.region"
15
+ source_model: analytics.orders
16
+ filters:
17
+ - "o.status = 'completed'"
18
+
19
+ relationships:
20
+ - from: analytics.orders.customer_id
21
+ to: analytics.customers.id
22
+ type: many_to_one
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "agentic-data-contracts"
3
- version = "0.2.4"
3
+ version = "0.2.6"
4
4
  description = "YAML-first data contract governance for AI agents"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -23,6 +23,7 @@ class DataContract:
23
23
 
24
24
  def __init__(self, schema: DataContractSchema) -> None:
25
25
  self.schema = schema
26
+ self._tables_resolved: bool = False
26
27
 
27
28
  @property
28
29
  def name(self) -> str:
@@ -43,16 +44,18 @@ class DataContract:
43
44
  """Check if any schema uses wildcard ('*') for tables."""
44
45
  return any("*" in entry.tables for entry in self.schema.semantic.allowed_tables)
45
46
 
46
- def resolve_tables(self, adapter: DatabaseAdapter) -> None:
47
+ def resolve_tables(self, adapter: DatabaseAdapter, *, force: bool = False) -> None:
47
48
  """Expand wildcard tables using the database adapter.
48
49
 
49
50
  Replaces ["*"] entries with actual table names from the database.
50
- Call this once after creating the adapter. Results are cached
51
- on the schema object.
51
+ Results are cached — subsequent calls are no-ops unless force=True.
52
52
  """
53
+ if self._tables_resolved and not force:
54
+ return
53
55
  for entry in self.schema.semantic.allowed_tables:
54
56
  if "*" in entry.tables:
55
57
  entry.tables = adapter.list_tables(entry.schema_)
58
+ self._tables_resolved = True
56
59
 
57
60
  def allowed_table_names(self) -> list[str]:
58
61
  names: list[str] = []
@@ -171,6 +174,17 @@ class DataContract:
171
174
  )
172
175
  sections.append(line)
173
176
 
177
+ # Table relationships
178
+ if semantic_source is not None:
179
+ rels = semantic_source.get_relationships()
180
+ if rels:
181
+ sections.append(
182
+ "\n### Table Relationships\n"
183
+ "Use these join paths when combining tables:"
184
+ )
185
+ for r in rels:
186
+ sections.append(f"- {r.from_} \u2192 {r.to} ({r.type})")
187
+
174
188
  # Resource limits
175
189
  res = self.schema.resources
176
190
  if res:
@@ -193,6 +207,10 @@ class DataContract:
193
207
 
194
208
  return "\n".join(sections)
195
209
 
210
+ # Max metrics to list individually in system prompt before switching
211
+ # to compact domain-only summaries.
212
+ METRIC_DETAIL_THRESHOLD = 20
213
+
196
214
  def _build_metrics_section(
197
215
  self, semantic_source: SemanticSource | None
198
216
  ) -> str | None:
@@ -205,11 +223,27 @@ class DataContract:
205
223
 
206
224
  domains = self.schema.semantic.domains
207
225
  lines: list[str] = []
208
- lines.append(
209
- "\n### Available Metrics (use lookup_metric for full SQL definitions)"
210
- )
226
+ compact = len(metrics) > self.METRIC_DETAIL_THRESHOLD
211
227
 
212
- if domains:
228
+ if compact and domains:
229
+ # Large metric set with domains — show counts only
230
+ lines.append("\n### Available Metrics")
231
+ metric_names = {m.name for m in metrics}
232
+ domain_parts = []
233
+ for domain, names in domains.items():
234
+ count = sum(1 for n in names if n in metric_names)
235
+ if count:
236
+ domain_parts.append(f"{domain} ({count})")
237
+ lines.append(f"Domains: {', '.join(domain_parts)}")
238
+ lines.append(
239
+ '\nUse list_metrics(domain="...") to browse,'
240
+ ' lookup_metric("...") to get SQL definitions.'
241
+ )
242
+ elif domains:
243
+ # Small metric set with domains — list with descriptions
244
+ lines.append(
245
+ "\n### Available Metrics (use lookup_metric for full SQL definitions)"
246
+ )
213
247
  metric_map = {m.name: m for m in metrics}
214
248
  for domain, names in domains.items():
215
249
  entries = []
@@ -219,12 +253,27 @@ class DataContract:
219
253
  entries.append(f"{m.name} \u2014 {m.description}")
220
254
  if entries:
221
255
  lines.append(f"**{domain}:** {', '.join(entries)}")
256
+ lines.append(
257
+ "\nUse the lookup_metric tool to get the SQL definition"
258
+ " before computing any KPI."
259
+ )
260
+ elif compact:
261
+ # Large metric set without domains — just show count
262
+ lines.append("\n### Available Metrics")
263
+ lines.append(f"{len(metrics)} metrics available.")
264
+ lines.append(
265
+ "\nUse list_metrics() to browse,"
266
+ ' lookup_metric("...") to get SQL definitions.'
267
+ )
222
268
  else:
269
+ # Small metric set without domains — list all
270
+ lines.append(
271
+ "\n### Available Metrics (use lookup_metric for full SQL definitions)"
272
+ )
223
273
  for m in metrics:
224
274
  lines.append(f"- {m.name} \u2014 {m.description}")
225
-
226
- lines.append(
227
- "\nUse the lookup_metric tool to get the SQL definition"
228
- " before computing any KPI."
229
- )
275
+ lines.append(
276
+ "\nUse the lookup_metric tool to get the SQL definition"
277
+ " before computing any KPI."
278
+ )
230
279
  return "\n".join(lines)
@@ -20,12 +20,20 @@ class MetricDefinition:
20
20
  filters: list[str] = field(default_factory=list)
21
21
 
22
22
 
23
+ @dataclass
24
+ class Relationship:
25
+ from_: str # "schema.table.column"
26
+ to: str # "schema.table.column"
27
+ type: str = "many_to_one" # many_to_one | one_to_one | many_to_many
28
+
29
+
23
30
  @runtime_checkable
24
31
  class SemanticSource(Protocol):
25
32
  def get_metrics(self) -> list[MetricDefinition]: ...
26
33
  def get_metric(self, name: str) -> MetricDefinition | None: ...
27
34
  def get_table_schema(self, schema: str, table: str) -> TableSchema | None: ...
28
35
  def search_metrics(self, query: str) -> list[MetricDefinition]: ...
36
+ def get_relationships(self) -> list[Relationship]: ...
29
37
 
30
38
 
31
39
  def fuzzy_search_metrics(
@@ -7,7 +7,11 @@ from pathlib import Path
7
7
  import yaml
8
8
 
9
9
  from agentic_data_contracts.adapters.base import Column, TableSchema
10
- from agentic_data_contracts.semantic.base import MetricDefinition, fuzzy_search_metrics
10
+ from agentic_data_contracts.semantic.base import (
11
+ MetricDefinition,
12
+ Relationship,
13
+ fuzzy_search_metrics,
14
+ )
11
15
 
12
16
 
13
17
  class CubeSource:
@@ -54,5 +58,8 @@ class CubeSource:
54
58
  def search_metrics(self, query: str) -> list[MetricDefinition]:
55
59
  return fuzzy_search_metrics(self._metrics, self.get_metric, query)
56
60
 
61
+ def get_relationships(self) -> list[Relationship]:
62
+ return [] # TODO: parse from Cube joins config
63
+
57
64
  def get_table_schema(self, schema: str, table: str) -> TableSchema | None:
58
65
  return self._tables.get(f"{schema}.{table}")
@@ -7,7 +7,11 @@ from pathlib import Path
7
7
  from typing import Any
8
8
 
9
9
  from agentic_data_contracts.adapters.base import Column, TableSchema
10
- from agentic_data_contracts.semantic.base import MetricDefinition, fuzzy_search_metrics
10
+ from agentic_data_contracts.semantic.base import (
11
+ MetricDefinition,
12
+ Relationship,
13
+ fuzzy_search_metrics,
14
+ )
11
15
 
12
16
 
13
17
  class DbtSource:
@@ -77,5 +81,8 @@ class DbtSource:
77
81
  def search_metrics(self, query: str) -> list[MetricDefinition]:
78
82
  return fuzzy_search_metrics(self._metrics, self.get_metric, query)
79
83
 
84
+ def get_relationships(self) -> list[Relationship]:
85
+ return [] # TODO: parse from dbt manifest relationships/refs
86
+
80
87
  def get_table_schema(self, schema: str, table: str) -> TableSchema | None:
81
88
  return self._tables.get(f"{schema}.{table}")
@@ -7,7 +7,11 @@ from pathlib import Path
7
7
  import yaml
8
8
 
9
9
  from agentic_data_contracts.adapters.base import Column, TableSchema
10
- from agentic_data_contracts.semantic.base import MetricDefinition, fuzzy_search_metrics
10
+ from agentic_data_contracts.semantic.base import (
11
+ MetricDefinition,
12
+ Relationship,
13
+ fuzzy_search_metrics,
14
+ )
11
15
 
12
16
 
13
17
  class YamlSource:
@@ -38,6 +42,14 @@ class YamlSource:
38
42
  for c in t.get("columns", [])
39
43
  ]
40
44
  )
45
+ self._relationships = [
46
+ Relationship(
47
+ from_=r["from"],
48
+ to=r["to"],
49
+ type=r.get("type", "many_to_one"),
50
+ )
51
+ for r in raw.get("relationships", [])
52
+ ]
41
53
 
42
54
  def get_metrics(self) -> list[MetricDefinition]:
43
55
  return list(self._metrics)
@@ -51,5 +63,8 @@ class YamlSource:
51
63
  def search_metrics(self, query: str) -> list[MetricDefinition]:
52
64
  return fuzzy_search_metrics(self._metrics, self.get_metric, query)
53
65
 
66
+ def get_relationships(self) -> list[Relationship]:
67
+ return list(self._relationships)
68
+
54
69
  def get_table_schema(self, schema: str, table: str) -> TableSchema | None:
55
70
  return self._tables.get(f"{schema}.{table}")
@@ -60,14 +60,23 @@ def create_tools(
60
60
  # ── Tool 2: list_tables ───────────────────────────────────────────────────
61
61
  async def list_tables(args: dict[str, Any]) -> dict[str, Any]:
62
62
  schema_filter = args.get("schema")
63
- tables: list[dict[str, Any]] = []
63
+ try:
64
+ limit = max(1, int(args.get("limit", 50)))
65
+ except (ValueError, TypeError):
66
+ limit = 50
67
+ try:
68
+ offset = max(0, int(args.get("offset", 0)))
69
+ except (ValueError, TypeError):
70
+ offset = 0
71
+ all_tables: list[dict[str, Any]] = []
64
72
  for entry in contract.schema.semantic.allowed_tables:
65
73
  if schema_filter and entry.schema_ != schema_filter:
66
74
  continue
67
75
  if "*" in entry.tables:
68
76
  return _text_response(
69
77
  f"Schema '{entry.schema_}' uses wildcard tables"
70
- " but no database adapter is available to resolve them."
78
+ " but no database adapter is available"
79
+ " to resolve them."
71
80
  )
72
81
  for table in entry.tables:
73
82
  info: dict[str, Any] = {
@@ -78,8 +87,13 @@ def create_tools(
78
87
  ts = semantic_source.get_table_schema(entry.schema_, table)
79
88
  if ts is not None:
80
89
  info["columns"] = [c.name for c in ts.columns]
81
- tables.append(info)
82
- return _text_response(json.dumps({"tables": tables}))
90
+ all_tables.append(info)
91
+ total = len(all_tables)
92
+ page = all_tables[offset : offset + limit]
93
+ result: dict[str, Any] = {"tables": page, "total": total}
94
+ if offset + limit < total:
95
+ result["next_offset"] = offset + limit
96
+ return _text_response(json.dumps(result))
83
97
 
84
98
  # ── Tool 3: describe_table ────────────────────────────────────────────────
85
99
  async def describe_table(args: dict[str, Any]) -> dict[str, Any]:
@@ -321,7 +335,8 @@ def create_tools(
321
335
  name="list_tables",
322
336
  description=(
323
337
  "List allowed tables, optionally filtered by schema. "
324
- "Includes column names when semantic source is available."
338
+ "Includes column names when semantic source is available. "
339
+ "Paginated \u2014 use limit/offset for large schemas."
325
340
  ),
326
341
  input_schema={
327
342
  "type": "object",
@@ -329,7 +344,15 @@ def create_tools(
329
344
  "schema": {
330
345
  "type": "string",
331
346
  "description": "Optional schema name to filter by",
332
- }
347
+ },
348
+ "limit": {
349
+ "type": "integer",
350
+ "description": "Max tables to return (default 50)",
351
+ },
352
+ "offset": {
353
+ "type": "integer",
354
+ "description": "Skip first N tables (default 0)",
355
+ },
333
356
  },
334
357
  "required": [],
335
358
  },
@@ -29,3 +29,24 @@ tables:
29
29
  - name: status
30
30
  type: VARCHAR
31
31
  description: "Order status: pending, completed, cancelled"
32
+ - name: customer_id
33
+ type: INTEGER
34
+ description: "FK to customers"
35
+
36
+ - schema: analytics
37
+ table: customers
38
+ columns:
39
+ - name: id
40
+ type: INTEGER
41
+ description: "Primary key"
42
+ - name: name
43
+ type: VARCHAR
44
+ description: "Customer name"
45
+ - name: region
46
+ type: VARCHAR
47
+ description: "Geographic region"
48
+
49
+ relationships:
50
+ - from: analytics.orders.customer_id
51
+ to: analytics.customers.id
52
+ type: many_to_one
@@ -0,0 +1,144 @@
1
+ """Tests for scalability improvements: compact prompt, pagination, caching."""
2
+
3
+ from unittest.mock import MagicMock
4
+
5
+ from agentic_data_contracts.adapters.base import DatabaseAdapter
6
+ from agentic_data_contracts.core.contract import DataContract
7
+ from agentic_data_contracts.core.schema import (
8
+ AllowedTable,
9
+ DataContractSchema,
10
+ SemanticConfig,
11
+ )
12
+ from agentic_data_contracts.semantic.base import MetricDefinition, Relationship
13
+
14
+
15
+ class FakeSemanticSource:
16
+ """Fake source with configurable metric count."""
17
+
18
+ def __init__(self, count: int) -> None:
19
+ self._metrics = [
20
+ MetricDefinition(
21
+ name=f"metric_{i}",
22
+ description=f"Description for metric {i}",
23
+ sql_expression=f"SUM(col_{i})",
24
+ )
25
+ for i in range(count)
26
+ ]
27
+
28
+ def get_metrics(self) -> list[MetricDefinition]:
29
+ return list(self._metrics)
30
+
31
+ def get_metric(self, name: str) -> MetricDefinition | None:
32
+ for m in self._metrics:
33
+ if m.name == name:
34
+ return m
35
+ return None
36
+
37
+ def get_table_schema(self, schema: str, table: str): # noqa: ANN201
38
+ return None
39
+
40
+ def search_metrics(self, query: str) -> list[MetricDefinition]:
41
+ return []
42
+
43
+ def get_relationships(self) -> list[Relationship]:
44
+ return []
45
+
46
+
47
+ def _make_contract_with_domains(
48
+ metric_names: list[str],
49
+ ) -> DataContract:
50
+ domains = {
51
+ "domain_a": metric_names[: len(metric_names) // 2],
52
+ "domain_b": metric_names[len(metric_names) // 2 :],
53
+ }
54
+ schema = DataContractSchema(
55
+ name="test",
56
+ semantic=SemanticConfig(
57
+ allowed_tables=[
58
+ AllowedTable.model_validate({"schema": "public", "tables": ["t"]}),
59
+ ],
60
+ domains=domains,
61
+ ),
62
+ )
63
+ return DataContract(schema)
64
+
65
+
66
+ class TestCompactMetricPrompt:
67
+ def test_small_set_lists_all_metrics(self) -> None:
68
+ source = FakeSemanticSource(5)
69
+ dc = _make_contract_with_domains([f"metric_{i}" for i in range(5)])
70
+ prompt = dc.to_system_prompt(semantic_source=source)
71
+ # Should list individual metric descriptions
72
+ assert "metric_0 \u2014" in prompt
73
+ assert "metric_4 \u2014" in prompt
74
+
75
+ def test_large_set_shows_domain_counts(self) -> None:
76
+ source = FakeSemanticSource(30)
77
+ dc = _make_contract_with_domains([f"metric_{i}" for i in range(30)])
78
+ prompt = dc.to_system_prompt(semantic_source=source)
79
+ # Should NOT list individual metrics
80
+ assert "metric_0 \u2014" not in prompt
81
+ # Should show domain counts
82
+ assert "domain_a (15)" in prompt
83
+ assert "domain_b (15)" in prompt
84
+ assert "list_metrics" in prompt
85
+
86
+ def test_large_set_no_domains_shows_count(self) -> None:
87
+ source = FakeSemanticSource(30)
88
+ schema = DataContractSchema(
89
+ name="test",
90
+ semantic=SemanticConfig(
91
+ allowed_tables=[
92
+ AllowedTable.model_validate({"schema": "public", "tables": ["t"]}),
93
+ ],
94
+ ),
95
+ )
96
+ dc = DataContract(schema)
97
+ prompt = dc.to_system_prompt(semantic_source=source)
98
+ assert "30 metrics available" in prompt
99
+ assert "metric_0 \u2014" not in prompt
100
+
101
+ def test_threshold_boundary(self) -> None:
102
+ # Exactly at threshold — should still list individually
103
+ source = FakeSemanticSource(20)
104
+ schema = DataContractSchema(
105
+ name="test",
106
+ semantic=SemanticConfig(
107
+ allowed_tables=[
108
+ AllowedTable.model_validate({"schema": "public", "tables": ["t"]}),
109
+ ],
110
+ ),
111
+ )
112
+ dc = DataContract(schema)
113
+ prompt = dc.to_system_prompt(semantic_source=source)
114
+ assert "metric_0 \u2014" in prompt
115
+
116
+ # One above threshold — compact mode
117
+ source = FakeSemanticSource(21)
118
+ prompt = dc.to_system_prompt(semantic_source=source)
119
+ assert "metric_0 \u2014" not in prompt
120
+ assert "21 metrics available" in prompt
121
+
122
+
123
+ class TestWildcardCaching:
124
+ def test_resolve_tables_caches(self) -> None:
125
+ dc = DataContract(
126
+ DataContractSchema(
127
+ name="test",
128
+ semantic=SemanticConfig(
129
+ allowed_tables=[
130
+ AllowedTable.model_validate({"schema": "s", "tables": ["*"]}),
131
+ ],
132
+ ),
133
+ )
134
+ )
135
+ mock_adapter = MagicMock(spec=DatabaseAdapter)
136
+ mock_adapter.list_tables.return_value = ["t1", "t2"]
137
+
138
+ dc.resolve_tables(mock_adapter)
139
+ assert "s.t1" in dc.allowed_table_names()
140
+ assert mock_adapter.list_tables.call_count == 1
141
+
142
+ # Second call should be a no-op
143
+ dc.resolve_tables(mock_adapter)
144
+ assert mock_adapter.list_tables.call_count == 1
@@ -0,0 +1,83 @@
1
+ """Tests for table relationship metadata."""
2
+
3
+ from pathlib import Path
4
+
5
+ from agentic_data_contracts.core.contract import DataContract
6
+ from agentic_data_contracts.core.schema import (
7
+ AllowedTable,
8
+ DataContractSchema,
9
+ SemanticConfig,
10
+ )
11
+ from agentic_data_contracts.semantic.cube import CubeSource
12
+ from agentic_data_contracts.semantic.dbt import DbtSource
13
+ from agentic_data_contracts.semantic.yaml_source import YamlSource
14
+
15
+
16
def test_yaml_source_loads_relationships(fixtures_dir: Path) -> None:
    """``semantic_source.yml`` yields one many-to-one orders -> customers link."""
    yaml_source = YamlSource(fixtures_dir / "semantic_source.yml")
    relationships = yaml_source.get_relationships()

    assert len(relationships) == 1

    relationship = relationships[0]
    assert relationship.from_ == "analytics.orders.customer_id"
    assert relationship.to == "analytics.customers.id"
    assert relationship.type == "many_to_one"
23
+
24
+
25
def test_yaml_source_no_relationships(tmp_path: Path) -> None:
    """A YAML file containing only ``metrics: []`` reports no relationships."""
    yaml_file = tmp_path / "empty.yml"
    yaml_file.write_text("metrics: []")

    relationships = YamlSource(yaml_file).get_relationships()

    assert relationships == []
29
+
30
+
31
def test_dbt_source_returns_empty_relationships(
    fixtures_dir: Path,
) -> None:
    """``DbtSource.get_relationships`` returns ``[]`` for the sample manifest."""
    manifest_path = fixtures_dir / "sample_dbt_manifest.json"
    relationships = DbtSource(manifest_path).get_relationships()
    assert relationships == []
36
+
37
+
38
def test_cube_source_returns_empty_relationships(
    fixtures_dir: Path,
) -> None:
    """``CubeSource.get_relationships`` returns ``[]`` for the sample schema."""
    cube_schema_path = fixtures_dir / "sample_cube_schema.yml"
    relationships = CubeSource(cube_schema_path).get_relationships()
    assert relationships == []
43
+
44
+
45
def test_system_prompt_includes_relationships(
    fixtures_dir: Path,
) -> None:
    """Relationships from the YAML source appear in the system prompt.

    The prompt must contain the "Table Relationships" heading, both endpoint
    column paths, and the relationship type.
    """
    allowed = AllowedTable.model_validate(
        {"schema": "analytics", "tables": ["orders", "customers"]}
    )
    contract = DataContract(
        DataContractSchema(
            name="test",
            semantic=SemanticConfig(allowed_tables=[allowed]),
        )
    )
    yaml_source = YamlSource(fixtures_dir / "semantic_source.yml")

    rendered = contract.to_system_prompt(semantic_source=yaml_source)

    assert "Table Relationships" in rendered
    assert "analytics.orders.customer_id" in rendered
    assert "analytics.customers.id" in rendered
    assert "many_to_one" in rendered
65
+
66
+
67
def test_system_prompt_no_relationships_when_empty(
    fixtures_dir: Path,
) -> None:
    """No "Table Relationships" text appears when the dbt sample source is used."""
    allowed = AllowedTable.model_validate(
        {"schema": "analytics", "tables": ["orders"]}
    )
    contract = DataContract(
        DataContractSchema(
            name="test",
            semantic=SemanticConfig(allowed_tables=[allowed]),
        )
    )
    dbt_source = DbtSource(fixtures_dir / "sample_dbt_manifest.json")

    rendered = contract.to_system_prompt(semantic_source=dbt_source)

    assert "Table Relationships" not in rendered
@@ -39,7 +39,7 @@ def test_get_metric_not_found(source: YamlSource) -> None:
39
39
  def test_get_table_schema(source: YamlSource) -> None:
40
40
  schema = source.get_table_schema("analytics", "orders")
41
41
  assert schema is not None
42
- assert len(schema.columns) == 4
42
+ assert len(schema.columns) == 5
43
43
  col_names = [c.name for c in schema.columns]
44
44
  assert "id" in col_names
45
45
  assert "amount" in col_names
@@ -0,0 +1,80 @@
1
+ """Tests for list_tables pagination."""
2
+
3
+ import json
4
+
5
+ import pytest
6
+
7
+ from agentic_data_contracts.core.contract import DataContract
8
+ from agentic_data_contracts.core.schema import (
9
+ AllowedTable,
10
+ DataContractSchema,
11
+ SemanticConfig,
12
+ )
13
+ from agentic_data_contracts.tools.factory import create_tools
14
+
15
+
16
@pytest.fixture
def large_contract() -> DataContract:
    """Build a contract allowing 60 ``analytics`` tables, for pagination tests."""
    table_names = [f"table_{index}" for index in range(60)]
    allowed = AllowedTable.model_validate(
        {"schema": "analytics", "tables": table_names}
    )
    contract_schema = DataContractSchema(
        name="test",
        semantic=SemanticConfig(allowed_tables=[allowed]),
    )
    return DataContract(contract_schema)
29
+
30
+
31
@pytest.mark.asyncio
async def test_list_tables_default_limit(
    large_contract: DataContract,
) -> None:
    """With no arguments, ``list_tables`` returns 50 of 60 and ``next_offset`` 50."""
    list_tables = next(
        candidate
        for candidate in create_tools(large_contract)
        if candidate.name == "list_tables"
    )

    response = await list_tables.callable({})
    payload = json.loads(response["content"][0]["text"])

    assert len(payload["tables"]) == 50  # page size when no limit is passed
    assert payload["total"] == 60
    assert payload["next_offset"] == 50
42
+
43
+
44
@pytest.mark.asyncio
async def test_list_tables_custom_limit(
    large_contract: DataContract,
) -> None:
    """``limit=10`` returns 10 tables, total 60, and ``next_offset`` 10."""
    list_tables = next(
        candidate
        for candidate in create_tools(large_contract)
        if candidate.name == "list_tables"
    )

    response = await list_tables.callable({"limit": 10})
    payload = json.loads(response["content"][0]["text"])

    assert len(payload["tables"]) == 10
    assert payload["total"] == 60
    assert payload["next_offset"] == 10
55
+
56
+
57
@pytest.mark.asyncio
async def test_list_tables_with_offset(
    large_contract: DataContract,
) -> None:
    """``limit=10, offset=50`` returns the final 10 tables with no ``next_offset``."""
    list_tables = next(
        candidate
        for candidate in create_tools(large_contract)
        if candidate.name == "list_tables"
    )

    response = await list_tables.callable({"limit": 10, "offset": 50})
    payload = json.loads(response["content"][0]["text"])

    assert len(payload["tables"]) == 10
    assert payload["total"] == 60
    # Offset 50 + 10 rows reaches the total of 60, so there is no further page.
    assert "next_offset" not in payload
68
+
69
+
70
@pytest.mark.asyncio
async def test_list_tables_small_set_no_next(
    fixtures_dir,
) -> None:
    """The minimal contract lists a total of 1 table and omits ``next_offset``."""
    contract = DataContract.from_yaml(fixtures_dir / "minimal_contract.yml")
    list_tables = next(
        candidate
        for candidate in create_tools(contract)
        if candidate.name == "list_tables"
    )

    response = await list_tables.callable({})
    payload = json.loads(response["content"][0]["text"])

    assert payload["total"] == 1
    assert "next_offset" not in payload
@@ -9,7 +9,7 @@ resolution-markers = [
9
9
 
10
10
  [[package]]
11
11
  name = "agentic-data-contracts"
12
- version = "0.2.4"
12
+ version = "0.2.6"
13
13
  source = { editable = "." }
14
14
  dependencies = [
15
15
  { name = "pydantic" },
@@ -1,51 +0,0 @@
1
- metrics:
2
- - name: total_revenue
3
- description: "Total revenue from completed orders"
4
- sql_expression: "SUM(amount) FILTER (WHERE status = 'completed')"
5
- source_model: analytics.orders
6
- filters:
7
- - "status = 'completed'"
8
- - name: revenue_by_region
9
- description: "Revenue broken down by customer region"
10
- sql_expression: "SUM(o.amount) GROUP BY c.region"
11
- source_model: analytics.orders
12
- filters:
13
- - "o.status = 'completed'"
14
-
15
- tables:
16
- - schema: analytics
17
- table: orders
18
- columns:
19
- - name: id
20
- type: INTEGER
21
- description: "Order ID"
22
- - name: customer_id
23
- type: INTEGER
24
- description: "FK to customers"
25
- - name: amount
26
- type: DECIMAL
27
- description: "Order total in USD"
28
- - name: status
29
- type: VARCHAR
30
- description: "pending, completed, cancelled"
31
- - name: tenant_id
32
- type: VARCHAR
33
- description: "Tenant identifier"
34
- - name: created_at
35
- type: DATE
36
- description: "Order date"
37
- - schema: analytics
38
- table: customers
39
- columns:
40
- - name: id
41
- type: INTEGER
42
- description: "Customer ID"
43
- - name: name
44
- type: VARCHAR
45
- description: "Customer name"
46
- - name: region
47
- type: VARCHAR
48
- description: "Geographic region"
49
- - name: tenant_id
50
- type: VARCHAR
51
- description: "Tenant identifier"