create-runcontext 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eric Kittelson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # create-runcontext
2
+
3
+ Scaffold a new [RunContext](https://github.com/RunContext/runcontext) project — one command to start building an AI-ready data product from your database.
4
+
5
+ ## Usage
6
+
7
+ ```bash
8
+ npx create-runcontext my-project
9
+ cd my-project
10
+ context setup
11
+ ```
12
+
13
+ The setup wizard opens in your browser. Fill out a Context Brief (name, owner, sensitivity, database connection), and the pipeline builds your semantic plane automatically.
14
+
15
+ ## What it creates
16
+
17
+ ```
18
+ my-project/
19
+ ├── context/
20
+ │ ├── models/ # Semantic models (OSI YAML)
21
+ │ ├── governance/ # Ownership, trust, security, semantic roles
22
+ │ ├── rules/ # Golden queries, business rules, guardrails
23
+ │ ├── lineage/ # Upstream/downstream lineage
24
+ │ ├── glossary/ # Business term definitions
25
+ │ └── owners/ # Team ownership records
26
+ ├── runcontext.config.yaml
27
+ └── package.json
28
+ ```
29
+
30
+ ## What happens next
31
+
32
+ 1. **`context setup`** — Browser wizard guides you through the Context Brief
33
+ 2. **Pipeline runs** — Introspect database → scaffold Bronze → enrich to Silver
34
+ 3. **`context dev --studio`** — Visual editor to curate metadata to Gold tier
35
+ 4. **`context serve`** — MCP server live, AI agents get full context
36
+
37
+ ## Part of RunContext
38
+
39
+ See the [RunContext repository](https://github.com/RunContext/runcontext) for full documentation.
40
+
41
+ ## License
42
+
43
+ MIT
package/dist/index.js ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/index.ts
4
+ import fs from "fs";
5
+ import path from "path";
6
+ import { fileURLToPath } from "url";
7
+ var __dirname = path.dirname(fileURLToPath(import.meta.url));
8
+ var templatesDir = path.resolve(__dirname, "..", "templates", "minimal");
9
+ function toKebabCase(input) {
10
+ return input.replace(/([a-z])([A-Z])/g, "$1-$2").replace(/[\s_]+/g, "-").toLowerCase();
11
+ }
12
+ function toTitleCase(input) {
13
+ return input.replace(/[-_]+/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
14
+ }
15
+ function copyDir(src, dest, createdFiles, projectDir, projectName, displayName) {
16
+ const entries = fs.readdirSync(src, { withFileTypes: true });
17
+ for (const entry of entries) {
18
+ const srcPath = path.join(src, entry.name);
19
+ const destPath = path.join(dest, entry.name);
20
+ if (entry.isDirectory()) {
21
+ fs.mkdirSync(destPath, { recursive: true });
22
+ copyDir(srcPath, destPath, createdFiles, projectDir, projectName, displayName);
23
+ } else if (entry.name.endsWith(".template")) {
24
+ let content = fs.readFileSync(srcPath, "utf-8");
25
+ content = content.replace(/\{\{PROJECT_NAME\}\}/g, projectName);
26
+ content = content.replace(/\{\{PROJECT_DISPLAY_NAME\}\}/g, displayName);
27
+ const finalPath = destPath.replace(/\.template$/, "");
28
+ fs.writeFileSync(finalPath, content, "utf-8");
29
+ createdFiles.push(path.relative(projectDir, finalPath));
30
+ } else {
31
+ fs.copyFileSync(srcPath, destPath);
32
+ createdFiles.push(path.relative(projectDir, destPath));
33
+ }
34
+ }
35
+ }
36
+ function main() {
37
+ const projectNameArg = process.argv[2];
38
+ if (!projectNameArg) {
39
+ console.log("Usage: create-runcontext <project-name>");
40
+ console.log("");
41
+ console.log("Example:");
42
+ console.log(" npm create runcontext my-project");
43
+ console.log(" pnpm create runcontext my-project");
44
+ process.exit(1);
45
+ }
46
+ const projectName = toKebabCase(projectNameArg);
47
+ const displayName = toTitleCase(projectName);
48
+ const projectDir = path.resolve(process.cwd(), projectName);
49
+ if (fs.existsSync(projectDir)) {
50
+ console.error(`Error: Directory "${projectName}" already exists. Please choose a different name or remove the existing directory.`);
51
+ process.exit(1);
52
+ }
53
+ console.log(`Creating RunContext project: ${projectName}`);
54
+ console.log("");
55
+ fs.mkdirSync(projectDir, { recursive: true });
56
+ const createdFiles = [];
57
+ copyDir(templatesDir, projectDir, createdFiles, projectDir, projectName, displayName);
58
+ for (const file of createdFiles) {
59
+ console.log(` Created ${file}`);
60
+ }
61
+ console.log("");
62
+ console.log("Done! Next steps:");
63
+ console.log(` cd ${projectName}`);
64
+ console.log(" pnpm add -D @runcontext/cli");
65
+ console.log(" npx context lint");
66
+ console.log(" npx context tier");
67
+ }
68
+ main();
package/package.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "create-runcontext",
3
+ "version": "0.5.3",
4
+ "description": "Scaffold a new RunContext data product. Run npx create-runcontext, fill out a Context Brief in your browser, and the pipeline builds your semantic plane automatically.",
5
+ "license": "MIT",
6
+ "author": "Eric Kittelson",
7
+ "homepage": "https://github.com/RunContext/runcontext",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "https://github.com/RunContext/runcontext.git",
11
+ "directory": "create-runcontext"
12
+ },
13
+ "bugs": "https://github.com/RunContext/runcontext/issues",
14
+ "engines": {
15
+ "node": ">=18.0.0"
16
+ },
17
+ "keywords": [
18
+ "create",
19
+ "scaffold",
20
+ "semantic-plane",
21
+ "semantic-layer",
22
+ "runcontext",
23
+ "data-product",
24
+ "ai-ready-data",
25
+ "npx",
26
+ "project-generator",
27
+ "mcp",
28
+ "data-catalog"
29
+ ],
30
+ "type": "module",
31
+ "bin": {
32
+ "create-runcontext": "./dist/index.js"
33
+ },
34
+ "files": [
35
+ "dist",
36
+ "templates"
37
+ ],
38
+ "devDependencies": {
39
+ "@types/node": "^25.3.3",
40
+ "tsup": "^8.4.0",
41
+ "typescript": "^5.7.0"
42
+ },
43
+ "scripts": {
44
+ "build": "tsup",
45
+ "clean": "rm -rf dist"
46
+ }
47
+ }
@@ -0,0 +1,270 @@
1
+ # RunContext Agent Instructions
2
+
3
+ You have two MCP servers: **duckdb** (query data) and **runcontext** (query metadata).
4
+
5
+ ## The Cardinal Rule: Never Fabricate Metadata
6
+
7
+ **Every piece of metadata you write must be grounded in evidence from the actual data.**
8
+
9
+ - NEVER invent owner names, emails, team names, or contact info
10
+ - NEVER write a field description that is just the column name repeated
11
+ - NEVER assign a semantic_role without first querying the column's actual values
12
+ - NEVER mark a field as additive without understanding what summing it means
13
+ - NEVER write lineage entries without knowing the actual data sources
14
+ - NEVER write a business_context narrative you can't justify from the data
15
+ - NEVER create a glossary definition that is just "Definition for X"
16
+
17
+ If you don't know something, say so. Leave it as a TODO with a note about what you'd need to determine the answer. An honest TODO is infinitely better than fabricated metadata that looks plausible but is wrong.
18
+
19
+ ## On Session Start
20
+
21
+ 1. Run `context_tier` to check the current metadata tier (Bronze/Silver/Gold)
22
+ 2. Report the current tier and list failing checks
23
+ 3. Ask the user what they'd like to work on — don't start changing files unprompted
24
+
25
+ ## When Asked to Reach Gold
26
+
27
+ Work through ALL failing Gold checks iteratively until `context tier` reports Gold:
28
+
29
+ 1. Run `context_tier` and collect every failing check
30
+ 2. For each failing check, query the database to gather evidence, then fix the metadata
31
+ 3. Run `context_tier` again
32
+ 4. If checks still fail, go back to step 2
33
+ 5. **Do NOT stop until every Gold check passes** or you hit something that genuinely requires human input (like real owner contact info)
34
+ 6. For checks you cannot fix (e.g., owner email), leave a clear TODO explaining what a human needs to provide
35
+
36
+ You must iterate — a single pass is never enough. Each `context tier` run may reveal new failures after earlier ones are fixed.
37
+
38
+ ## How to Curate Metadata (the right way)
39
+
40
+ ### Before writing ANY metadata, query the database first
41
+
42
+ For every field you're about to describe or classify:
43
+
44
+ ```sql
45
+ -- What type of values does this column contain?
46
+ SELECT DISTINCT column_name FROM table LIMIT 20;
47
+
48
+ -- For numeric columns: is this a metric or dimension?
49
+ SELECT MIN(col), MAX(col), AVG(col), COUNT(DISTINCT col) FROM table;
50
+
51
+ -- For potential metrics: does SUM make sense?
52
+ -- If SUM produces a meaningful business number → additive: true
53
+ -- If SUM is meaningless (e.g., summing percentages, scores, ratings) → additive: false
54
+ ```
55
+
56
+ ### Semantic Role Decision Tree
57
+
58
+ Query the column first, then apply this logic:
59
+
60
+ 1. **Is it a primary key or foreign key?** → `identifier`
61
+ 2. **Is it a date or timestamp?** → `date`
62
+ 3. **Is it numeric AND does aggregation make business sense?**
63
+ - Does SUM make sense? (counts, amounts, quantities) → `metric`, `additive: true`
64
+ - Does only AVG/MIN/MAX make sense? (rates, percentages, scores, ratings) → `metric`, `additive: false`
65
+ 4. **Everything else** → `dimension`
66
+
67
+ Common mistakes to avoid:
68
+ - `stars` (ratings) → metric with AVG, NOT additive (summing star ratings is meaningless)
69
+ - `_per_10k_people` (rates) → metric with AVG, NOT additive
70
+ - `_score` (composite scores) → metric with AVG, NOT additive
71
+ - `useful/funny/cool` (vote counts) → metric with SUM, additive
72
+ - `_count` fields → metric with SUM, additive (usually)
73
+
74
+ ### Field Descriptions
75
+
76
+ Write descriptions that help someone who has never seen this database understand what the column contains. Include:
77
+ - What the value represents
78
+ - Units or scale (if applicable)
79
+ - Where the data comes from (if known)
80
+ - Any known quirks or caveats
81
+
82
+ Bad: `description: total_population`
83
+ Good: `description: Total resident population of the census tract from American Community Survey 5-year estimates`
84
+
85
+ ### Lineage
86
+
87
+ Upstream sources are the EXTERNAL systems that feed data into this warehouse. They are NOT the tables in the warehouse itself.
88
+
89
+ Ask yourself: "Where did this data originally come from before it was loaded here?"
90
+
91
+ ### Owner Files
92
+
93
+ Do NOT create fake owner identities. If the real owner is unknown:
94
+ - Keep the existing owner file as-is
95
+ - Note in the file that contact info needs to be filled in by a real person
96
+ - NEVER invent email addresses like `analytics@example.com`
97
+
98
+ ### Business Context
99
+
100
+ Write business_context entries that describe real analytical use cases you can verify from the data. Query the data to understand what questions it can answer before writing narratives.
101
+
102
+ ### Golden Queries
103
+
104
+ Every golden query MUST be tested against the actual database before you write it. Run the SQL, verify it returns sensible results, then document it.
105
+
106
+ ### Data Quality
107
+
108
+ When you discover data quality issues (null values, broken joins, missing data), FLAG THEM — don't hide them. Add notes in governance or report them to the user.
109
+
110
+ ## MCP Tools
111
+
112
+ | Tool | Parameters | What it does |
113
+ |------|-----------|-------------|
114
+ | `context_search` | `query` | Find models, datasets, fields, terms by keyword |
115
+ | `context_explain` | `model` | Full model details — governance, rules, lineage, tier |
116
+ | `context_validate` | — | Run linter, get errors and warnings |
117
+ | `context_tier` | `model` | Tier scorecard with all check results |
118
+ | `context_golden_query` | `question` | Find pre-validated SQL for a question |
119
+ | `context_guardrails` | `tables[]` | Get required WHERE clauses for tables |
120
+
121
+ ## Tier Checks Quick Reference
122
+
123
+ **Bronze (7):** descriptions, owner, security, grain, table_type
124
+ **Silver (+6):** trust, 2+ tags, glossary linked, lineage, refresh, 2+ sample_values
125
+ **Gold (+24):** semantic_role on ALL fields, metric aggregation/additive, 1+ guardrail, 3+ golden queries, 1+ business rule, 1+ hierarchy, 1+ default_filter, trust=endorsed, contactable owner, 1+ relationship, description >=50 chars, ai_context (no TODO), 1+ business_context, version, field descriptions not lazy, glossary definitions substantive, lineage references real sources, grain statements specific, ai_context filled in, 3+ relationships (models with 3+ datasets), 1+ computed metric, 3+ glossary terms (models with 5+ datasets)
126
+
127
+ ## How to Reach Gold: Curation Recipes
128
+
129
+ ### Metrics (gold/metrics-defined)
130
+
131
+ Inspect computed views in the database. Any calculated column is a candidate metric.
132
+
133
+ ```sql
134
+ -- Find computed columns in views
135
+ SELECT column_name, data_type
136
+ FROM information_schema.columns
137
+ WHERE table_name LIKE 'vw_%' AND data_type IN ('DOUBLE', 'FLOAT', 'INTEGER', 'BIGINT', 'DECIMAL');
138
+ ```
139
+
140
+ For each computed column (e.g., `opportunity_score`, `shops_per_10k`, `demand_signal_pct`):
141
+ 1. Query it to understand what it measures
142
+ 2. Add it to the model's `metrics[]` array in the OSI YAML
143
+ 3. Include the SQL expression, aggregation type (SUM/AVG), and a human description
144
+ 4. Mark whether it's additive (can be summed across dimensions)
145
+
146
+ Example:
147
+ ```yaml
148
+ metrics:
149
+ - name: opportunity_score
150
+ expression:
151
+ dialects:
152
+ - dialect: DuckDB
153
+ expression: "(population/10000)*2 + (income/50000)*2 + (10-shops_per_10k)*3 + transit*1.5 + demand*0.5"
154
+ description: Composite score ranking census tracts for coffee shop viability
155
+ aggregation: AVG
156
+ additive: false
157
+ ```
158
+
159
+ ### Glossary Terms (gold/glossary-coverage)
160
+
161
+ For each key business concept your model measures, create a glossary term file.
162
+
163
+ Think about the terms a new analyst would need defined:
164
+ - What is "supply saturation"? (> 5.0 shops per 10k people)
165
+ - What is a "demand signal"? (review mentioning wait/line/crowded/busy)
166
+ - What is "opportunity score"? (composite ranking formula)
167
+
168
+ For each term, create `context/glossary/<term-name>.term.yaml`:
169
+ ```yaml
170
+ term: supply-saturation
171
+ definition: >
172
+ A measure of coffee shop density per census tract. Calculated as
173
+ shops per 10,000 residents. Tracts with > 5.0 are considered saturated.
174
+ owner: analytics-team
175
+ tags: [coffee-analytics]
176
+ ```
177
+
178
+ Models with 5+ datasets need at least 3 glossary terms linked by shared tags or owner.
179
+
180
+ ### Relationships (gold/relationships-coverage)
181
+
182
+ For each join in the SQL views, define a relationship in the OSI model.
183
+
184
+ ```sql
185
+ -- Find joins by examining view definitions
186
+ -- Look for patterns: ON table_a.col = table_b.col
187
+ -- Or spatial joins: ABS(a.lat - b.lat) < threshold
188
+ ```
189
+
190
+ For each join:
191
+ ```yaml
192
+ relationships:
193
+ - name: business-to-tract
194
+ left_dataset: yelp_business
195
+ right_dataset: census_tract
196
+ join_type: spatial
197
+ cardinality: many-to-one
198
+ description: Businesses assigned to nearest census tract within 0.02 degrees (~1 mile)
199
+ ```
200
+
201
+ Models with 3+ datasets need at least 3 relationships.
202
+
203
+ ### Golden Queries
204
+
205
+ Write 3-5 SQL queries answering common business questions. **Test each query first!**
206
+
207
+ ```sql
208
+ -- Run the query, verify it returns sensible results, then document:
209
+ SELECT geoid, tract_name, opportunity_score
210
+ FROM vw_candidate_zones ORDER BY opportunity_score DESC LIMIT 10;
211
+ ```
212
+
213
+ ## YAML Formats
214
+
215
+ **Governance** (`context/governance/*.governance.yaml`):
216
+ ```yaml
217
+ model: my-model
218
+ owner: team-name
219
+ version: "1.0.0"
220
+ trust: endorsed
221
+ security: internal
222
+ tags: [domain-tag-1, domain-tag-2]
223
+ business_context:
224
+ - name: Use Case Name
225
+ description: What analytical question this data answers and for whom.
226
+ datasets:
227
+ my_table:
228
+ grain: "One row per [entity] identified by [key]"
229
+ table_type: fact # fact | dimension | event | view
230
+ refresh: daily
231
+ fields:
232
+ dataset.field:
233
+ semantic_role: metric # metric | dimension | identifier | date
234
+ default_aggregation: SUM # SUM | AVG | COUNT | COUNT_DISTINCT | MIN | MAX
235
+ additive: true # can this metric be summed across dimensions?
236
+ default_filter: "is_open = 1"
237
+ sample_values: ["val1", "val2"]
238
+ ```
239
+
240
+ **Rules** (`context/rules/*.rules.yaml`):
241
+ ```yaml
242
+ model: my-model
243
+ golden_queries:
244
+ - question: What are the top items by count?
245
+ sql: SELECT name, count FROM my_table ORDER BY count DESC LIMIT 10
246
+ intent: Identify top performers by volume
247
+ caveats: Filters to active records only
248
+ business_rules:
249
+ - name: valid-ratings
250
+ definition: All ratings must be between 1 and 5
251
+ guardrail_filters:
252
+ - name: active-only
253
+ filter: "status = 'active'"
254
+ reason: Exclude inactive records from analytics
255
+ tables: [my_table]
256
+ hierarchies:
257
+ - name: geography
258
+ levels: [state, city, postal_code]
259
+ dataset: my_table
260
+ ```
261
+
262
+ ## CLI Commands
263
+
264
+ ```bash
265
+ context tier # Check scorecard
266
+ context verify --db <path> # Validate against live data
267
+ context fix --db <path> # Auto-fix data warnings
268
+ context setup # Interactive setup wizard
269
+ context dev # Watch mode for live editing
270
+ ```
@@ -0,0 +1,5 @@
1
+ id: revenue
2
+ definition: "Total value of completed orders before refunds"
3
+ synonyms: [sales, order revenue]
4
+ owner: example-team
5
+ tags: [finance]
@@ -0,0 +1,4 @@
1
+ id: example-team
2
+ display_name: Example Team
3
+ email: team@example.com
4
+ description: "Responsible for data and analytics"
@@ -0,0 +1,8 @@
1
+ product_name: example-product
2
+ description: "{{PROJECT_DISPLAY_NAME}} — describe your data product here."
3
+ owner:
4
+ name: Your Name
5
+ team: Your Team
6
+ email: you@example.com
7
+ sensitivity: internal
8
+ docs: []
@@ -0,0 +1,23 @@
1
+ model: example-model
2
+ owner: example-team
3
+ trust: endorsed
4
+ security: internal
5
+ tags: [orders, revenue]
6
+
7
+ datasets:
8
+ orders:
9
+ grain: "One row per order"
10
+ refresh: daily
11
+ table_type: fact
12
+
13
+ fields:
14
+ orders.amount:
15
+ semantic_role: metric
16
+ default_aggregation: SUM
17
+ additive: true
18
+ orders.order_id:
19
+ semantic_role: identifier
20
+ orders.customer_id:
21
+ semantic_role: identifier
22
+ orders.order_date:
23
+ semantic_role: date
@@ -0,0 +1,12 @@
1
+ model: example-model
2
+
3
+ upstream:
4
+ - source: source-system.raw-data
5
+ type: pipeline
6
+ refresh: daily
7
+ notes: Raw data ingested daily via ETL pipeline
8
+
9
+ downstream:
10
+ - target: analytics-dashboard
11
+ type: dashboard
12
+ notes: Executive KPI dashboard
@@ -0,0 +1,51 @@
1
+ version: "1.0"
2
+
3
+ semantic_model:
4
+ - name: example-model
5
+ description: Example orders analytics model
6
+ ai_context:
7
+ instructions: "Use for order and revenue analysis"
8
+ synonyms: ["orders model"]
9
+
10
+ datasets:
11
+ - name: orders
12
+ source: warehouse.public.orders
13
+ primary_key: [order_id]
14
+ description: "Orders fact table"
15
+ fields:
16
+ - name: order_id
17
+ expression:
18
+ dialects:
19
+ - dialect: ANSI_SQL
20
+ expression: order_id
21
+ description: Unique order identifier
22
+ - name: customer_id
23
+ expression:
24
+ dialects:
25
+ - dialect: ANSI_SQL
26
+ expression: customer_id
27
+ description: Foreign key to customers
28
+ dimension:
29
+ is_time: false
30
+ - name: amount
31
+ expression:
32
+ dialects:
33
+ - dialect: ANSI_SQL
34
+ expression: amount
35
+ description: Order amount in USD
36
+ - name: order_date
37
+ expression:
38
+ dialects:
39
+ - dialect: ANSI_SQL
40
+ expression: order_date
41
+ description: Date the order was placed
42
+ dimension:
43
+ is_time: true
44
+
45
+ metrics:
46
+ - name: total_revenue
47
+ expression:
48
+ dialects:
49
+ - dialect: ANSI_SQL
50
+ expression: "SUM(orders.amount)"
51
+ description: Total revenue across all orders
@@ -0,0 +1,19 @@
1
+ model: example-model
2
+
3
+ golden_queries:
4
+ - question: "What is total revenue?"
5
+ sql: |
6
+ SELECT SUM(amount) AS total_revenue
7
+ FROM orders
8
+ dialect: ANSI_SQL
9
+ tags: [revenue]
10
+
11
+ business_rules:
12
+ - name: positive-amounts
13
+ definition: "Only include orders with positive amounts in revenue calculations"
14
+ enforcement:
15
+ - "Filter to amount > 0 when calculating revenue"
16
+ avoid:
17
+ - "Do not include negative or zero amounts in revenue totals"
18
+ tables: [orders]
19
+ applied_always: true
@@ -0,0 +1,6 @@
1
+ context_dir: .
2
+ output_dir: dist
3
+ products:
4
+ - example-product
5
+ glossary_dir: glossary
6
+ owners_dir: owners