@rishildi/ldi-process-skills 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/server.d.ts +4 -0
- package/build/server.d.ts.map +1 -1
- package/build/server.js +45 -20
- package/build/skills/embedded.d.ts +1 -0
- package/build/skills/embedded.d.ts.map +1 -1
- package/build/skills/embedded.js +90 -68
- package/build/skills/registry.d.ts +11 -2
- package/build/skills/registry.d.ts.map +1 -1
- package/build/skills/registry.js +17 -2
- package/package.json +2 -2
- package/README.md +0 -133
package/build/skills/embedded.js
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
// AUTO-GENERATED by scripts/embed-skills.ts — do not edit
|
|
2
|
-
// Generated at: 2026-04-
|
|
2
|
+
// Generated at: 2026-04-04T20:21:46.094Z
|
|
3
3
|
export const EMBEDDED_SKILLS = [
|
|
4
4
|
{
|
|
5
5
|
name: "create-fabric-lakehouses",
|
|
6
|
+
category: "fabric",
|
|
6
7
|
files: [
|
|
7
8
|
{
|
|
8
9
|
relativePath: "SKILL.md",
|
|
9
|
-
content: "---\nname: create-fabric-lakehouse\ndescription: >\n Use this skill when asked to create, provision, or set up one or more\n Lakehouse items in existing Microsoft Fabric workspaces. Triggers on:\n \"create a lakehouse\", \"provision lakehouses\", \"set up a Fabric lakehouse\",\n \"create lakehouse in Fabric\", \"new lakehouse\", \"create lakehouses across\n workspaces\". Does NOT trigger for: creating workspaces (use\n generate-fabric-workspace), querying lakehouse data, managing tables,\n uploading files, creating shortcuts, or general Fabric workspace management.\nlicense: MIT\ncompatibility: Fabric CLI (fab) installed and authenticated; Python 3.10+ for notebook approach\n---\n\n# Create Fabric Lakehouse\n\nProvisions one or more empty Lakehouse items across one or more existing\nMicrosoft Fabric workspaces, using a user-chosen approach, and produces an\naudit-trail definition file.\n\n**Companion skills:** Workspace creation is handled by the\n`generate-fabric-workspace` skill. Shortcut creation between lakehouses is\na separate skill / manual step. This skill assumes target workspaces already\nexist.\n\n## Prerequisites\n\nBefore starting, verify:\n\n```bash\nfab auth status # Must show authenticated\nfab ls # Must return workspace list\n```\n\nIf not authenticated, ask the user to run `fab auth login` first.\n\n## Workflow\n\nExecute these steps in order. Use the `ask_user_input` tool for all\nquestions where discrete choices are available.\n\n### Step 1 — Choose Provisioning Approach\n\nAsk the user which approach they want to follow:\n\n| Approach | Description | Best for |\n|----------|-------------|----------|\n| **A — PySpark Notebook** | Generates a `.py` notebook script that installs `ms-fabric-cli` and uses `!fab` commands. Output for the user to run in their Fabric workspace. | Users who want a reusable notebook artefact in Fabric |\n| **B — PowerShell Script** | Generates a PowerShell script containing `fab` CLI commands. 
Output for user validation before execution. | Users who prefer a single script to review and run locally |\n| **C — Interactive CLI** | Runs `fab` commands one-by-one in the terminal, pausing for user validation after each step. | Users who want maximum control and visibility |\n\n### Step 2 — Collect Workspace & Lakehouse Definitions (Sequential)\n\nCollect definitions **one workspace at a time**. For each workspace, gather:\n\n#### 2a — Target Workspace\n\n- [ ] **Workspace name** — must already exist. Verify with:\n ```bash\n fab exists \"<WorkspaceName>.Workspace\"\n ```\n If the workspace does not exist, inform the user and suggest they run\n the `generate-fabric-workspace` skill first. Do not proceed for that\n workspace until it exists.\n\n#### 2b — Naming Convention\n\nSuggest the default naming pattern: `{Prefix}_{CoreName}_{Suffix}`\n\n| Component | Description | Default | Example |\n|-----------|-------------|---------|---------|\n| **Prefix** | Item type indicator | `LH` | `LH` |\n| **CoreName** | Business/project name | *(user provides)* | `LANDONREVENUE` |\n| **Suffix** | Medallion layer or purpose | `BRONZE`, `SILVER`, `GOLD` | `BRONZE` |\n| **Separator** | Character between components | `_` | `_` |\n\nExample result: `LH_LANDONREVENUE_BRONZE`\n\nPresent the suggested defaults and **ask the user to confirm or override**\neach component. The user may change any component or use fully custom names\nthat don't follow the pattern at all.\n\n#### 2c — Lakehouse Definitions\n\nFor each lakehouse in this workspace, collect:\n\n- [ ] **Name** — generated from the naming convention, or custom\n- [ ] **Description** — optional text describing the lakehouse's purpose\n- [ ] **Schema-enabled** — yes/no (default: no). See\n `references/schema-enabled.md` for guidance.\n\n#### 2d — More Workspaces?\n\nAfter finishing one workspace, ask:\n\n> \"Do you have another workspace to provision lakehouses in, or are we done?\"\n\nIf yes, loop back to Step 2a. 
If done, proceed to Step 3.\n\n### Step 3 — Validate Inputs\n\nBefore generating anything, validate **all** lakehouse definitions:\n\n1. For each workspace, confirm it exists:\n ```bash\n fab exists \"<WorkspaceName>.Workspace\"\n ```\n — if it does not exist, stop and direct the user to create it first\n\n2. For each lakehouse, check it doesn't already exist:\n ```bash\n fab exists \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n ```\n — if it already exists, warn the user and ask whether to **skip** or\n **rename**\n\n3. Validate lakehouse names against naming constraints (see Gotchas)\n\n### Step 4 — Generate & Execute\n\nBranch by the approach chosen in Step 1. Process workspaces sequentially.\n\n**Maintain an audit log** throughout execution — record every command run and\nits outcome. This log feeds into the definition file in Step 6.\n\n#### Approach A — PySpark Notebook\n\n1. Generate a PySpark notebook using the template in\n `assets/notebook-template.py`\n2. The notebook pattern is:\n - Install `ms-fabric-cli` via `%pip install ms-fabric-cli -q`\n - Authenticate using `notebookutils.credentials.getToken('pbi')` and set\n `FAB_TOKEN`, `FAB_TOKEN_ONELAKE`, `FAB_TOKEN_AZURE` environment variables\n - Add pip's scripts directory to `PATH` so `!fab` works\n - Use `!fab mkdir` shell commands for standard lakehouses\n - Use `!fab api` with REST payload for schema-enabled lakehouses\n3. The notebook must include:\n - A configuration cell with all workspace/lakehouse definitions\n - Existence checks before each creation\n - A summary cell at the end\n4. Save to `/home/claude/<workspace>_create_lakehouses.py`\n5. Present to user for review\n6. Optionally upload:\n ```bash\n fab import \"<Workspace>.Workspace/<Name>.Notebook\" -i <path> --format py -f\n ```\n\n#### Approach B — PowerShell Script\n\n1. Generate a PowerShell script using the template in\n `assets/powershell-template.ps1`\n2. 
The script must:\n - Use `fab mkdir` for standard lakehouses\n - Handle schema-enabled lakehouses via the Fabric REST API\n (`fab api` wrapper — see `references/fabric-api-lakehouse.md`)\n - Include `fab exists` checks before each creation\n - Track created items for potential rollback\n - Include error handling and summary output\n3. Save to `/home/claude/create_lakehouses.ps1`\n4. Present the script and **wait for explicit approval** before running\n\n#### Approach C — Interactive CLI\n\nExecute commands one-by-one per workspace, pausing after each:\n\n1. **For each lakehouse** — check then create:\n ```bash\n fab exists \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n ```\n — if not exists, create. For standard lakehouses:\n ```bash\n fab mkdir \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n ```\n — for schema-enabled lakehouses, use the REST API:\n ```bash\n WS_ID=$(fab get \"<WorkspaceName>.Workspace\" -q \"id\" | tr -d '\"')\n fab api \"workspaces/$WS_ID/lakehouses\" -X post \\\n -i '{\"displayName\":\"<Name>\",\"description\":\"<Desc>\",\"creationPayload\":{\"enableSchemas\":true}}'\n ```\n — wait for user confirmation after each\n\n2. **Verification** after all lakehouses in a workspace:\n ```bash\n fab ls \"<WorkspaceName>.Workspace\" -l\n ```\n\n3. Move to next workspace or proceed to Step 5.\n\n### Step 4a — Failure Handling\n\nIf any lakehouse creation fails during execution:\n\n1. **Stop immediately** — do not proceed to the next lakehouse\n2. **Report** what succeeded and what failed\n3. 
**Ask the user** how to proceed:\n\n| Option | Action |\n|--------|--------|\n| **Retry** | Re-attempt the failed lakehouse creation |\n| **Skip** | Skip the failed item and continue with remaining |\n| **Rollback & Abort** | Delete all lakehouses created *in this run*, then stop |\n| **Abort (keep)** | Stop but leave already-created lakehouses in place |\n\nIf the user chooses **Rollback & Abort**:\n```bash\nfab rm \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\" -f\n```\n— for each lakehouse created in this run (tracked in the audit log).\nConfirm each deletion with the user before executing.\n\n### Step 5 — Verify Creation\n\nRegardless of approach, verify every lakehouse across all workspaces:\n\n```bash\nfab exists \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n```\n\nCollect the lakehouse ID for each:\n```bash\nfab get \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\" -q \"id\"\n```\n\nIf any verification fails, report and ask the user how to proceed (same\noptions as Step 4a).\n\n### Step 6 — Generate Definition File\n\nAfter all lakehouses are verified, generate a Lakehouse Definition markdown\nfile using the template in `assets/definition-template.md`.\n\nThe definition file must include:\n\n- **Per workspace:** name, ID\n- **Per lakehouse:** name, ID, description, schema-enabled status, naming\n convention used, creation timestamp\n- **Overall:** approach used, naming convention applied, full audit trail of\n commands/API calls executed, any warnings, skipped items, or rollback actions\n\nSave to `/home/claude/lakehouse_definition.md` and present to user.\n\n## Naming Convention Reference\n\nThe default pattern is: `{Prefix}_{CoreName}_{Suffix}`\n\n| Component | Default | Notes |\n|-----------|---------|-------|\n| Prefix | `LH` | Item type indicator |\n| CoreName | *(user provides)* | Business/project name, UPPERCASE |\n| Suffix | Medallion layer | `BRONZE`, `SILVER`, `GOLD`, or custom |\n| Separator | `_` | Underscore by default 
|\n\nExamples:\n- `LH_LANDONREVENUE_BRONZE`\n- `LH_LANDONREVENUE_SILVER`\n- `LH_LANDONREVENUE_GOLD`\n- `LH_SYNERGY_RAW` (custom suffix)\n\nThe agent should **suggest** names using this convention but always let the\nuser override with fully custom names.\n\n## Gotchas\n\n- `fab mkdir` creates a standard lakehouse but does NOT support the\n `enableSchemas` property. To create a schema-enabled lakehouse, use\n the Fabric REST API: `POST workspaces/{workspaceId}/lakehouses` with\n `{\"displayName\":\"<n>\",\"creationPayload\":{\"enableSchemas\":true}}`\n- Always use `-f` flag with `fab` commands in scripts to avoid interactive\n prompts that block execution\n- Lakehouse names must be unique within a workspace\n- Workspace names are case-sensitive in `fab` paths\n- Always quote paths containing spaces: `\"My Workspace.Workspace\"`\n- The Fabric REST API requires workspace ID (GUID), not display name —\n extract with `fab get \"<n>.Workspace\" -q \"id\"`\n- In notebooks, `ms-fabric-cli` must be installed via `%pip install` and\n the scripts directory added to `PATH` before `!fab` commands work\n- Token audience for notebook auth is `'pbi'`, and all three env vars must\n be set: `FAB_TOKEN`, `FAB_TOKEN_ONELAKE`, `FAB_TOKEN_AZURE`\n- `fab auth status` must show a valid token before any operations; tokens\n expire and may need refresh\n- Lakehouse names cannot contain: `/`, `\\`, `#`, `%`, `?` or\n leading/trailing spaces. 
Max length: 256 characters\n- When rolling back, always confirm each deletion with the user — `fab rm`\n with `-f` is irreversible\n- This skill does NOT create workspaces — if a workspace is missing, direct\n the user to the `generate-fabric-workspace` skill\n- This skill does NOT create shortcuts between lakehouses — that is a\n separate step\n\n## Output Format\n\nSee `assets/definition-template.md` for the full template.\n\n## Available Assets\n\n- **`assets/notebook-template.py`** — PySpark notebook template for Approach A\n- **`assets/powershell-template.ps1`** — PowerShell script template for Approach B\n- **`assets/definition-template.md`** — Lakehouse definition output template\n\n## Available References\n\n- **`references/schema-enabled.md`** — How schema-enabled lakehouses work\n- **`references/fabric-api-lakehouse.md`** — Fabric REST API reference for\n lakehouse creation\n",
|
|
10
|
+
content: "---\nname: create-fabric-lakehouses\ndescription: >\n Use this skill when asked to create, provision, or set up one or more\n Lakehouse items in existing Microsoft Fabric workspaces. Triggers on:\n \"create a lakehouse\", \"provision lakehouses\", \"set up a Fabric lakehouse\",\n \"create lakehouse in Fabric\", \"new lakehouse\", \"create lakehouses across\n workspaces\". Does NOT trigger for: creating workspaces (use\n generate-fabric-workspace), querying lakehouse data, managing tables,\n uploading files, creating shortcuts, or general Fabric workspace management.\nlicense: MIT\ncompatibility: Fabric CLI (fab) installed and authenticated; Python 3.10+ for notebook approach\n---\n\n# Create Fabric Lakehouse\n\n> ⚠️ **GOVERNANCE**: This skill produces notebooks and scripts for the operator to\n> review and run — it never executes commands directly against a live Fabric environment.\n> Present each generated artefact to the operator before they run it.\n\nProvisions one or more empty Lakehouse items across one or more existing\nMicrosoft Fabric workspaces, using a user-chosen approach, and produces an\naudit-trail definition file.\n\n**Companion skills:** Workspace creation is handled by the\n`generate-fabric-workspace` skill. Shortcut creation between lakehouses is\na separate skill / manual step. This skill assumes target workspaces already\nexist.\n\n## Prerequisites\n\nBefore starting, ask the operator to run the following and share the output:\n\n```bash\nfab auth status # Must show authenticated\nfab ls # Must return workspace list\n```\n\nIf not authenticated, ask the operator to run `fab auth login` first.\n\n## Workflow\n\nExecute these steps in order.\n\n### Step 1 — Choose Provisioning Approach\n\nAsk the user which approach they want to follow:\n\n| Approach | Description | Best for |\n|----------|-------------|----------|\n| **A — PySpark Notebook** | Generates a `.py` notebook script that installs `ms-fabric-cli` and uses `!fab` commands. 
Output for the user to run in their Fabric workspace. | Users who want a reusable notebook artefact in Fabric |\n| **B — PowerShell Script** | Generates a PowerShell script containing `fab` CLI commands. Output for user validation before execution. | Users who prefer a single script to review and run locally |\n| **C — Interactive CLI** | Runs `fab` commands one-by-one in the terminal, pausing for user validation after each step. | Users who want maximum control and visibility |\n\n### Step 2 — Collect Workspace & Lakehouse Definitions (Sequential)\n\nCollect definitions **one workspace at a time**. For each workspace, gather:\n\n#### 2a — Target Workspace\n\n- [ ] **Workspace name** — must already exist. Verify with:\n ```bash\n fab exists \"<WorkspaceName>.Workspace\"\n ```\n If the workspace does not exist, inform the user and suggest they run\n the `generate-fabric-workspace` skill first. Do not proceed for that\n workspace until it exists.\n\n#### 2b — Naming Convention\n\nSuggest the default naming pattern: `{Prefix}_{CoreName}_{Suffix}`\n\n| Component | Description | Default | Example |\n|-----------|-------------|---------|---------|\n| **Prefix** | Item type indicator | `LH` | `LH` |\n| **CoreName** | Business/project name | *(user provides)* | `LANDONREVENUE` |\n| **Suffix** | Medallion layer or purpose | `BRONZE`, `SILVER`, `GOLD` | `BRONZE` |\n| **Separator** | Character between components | `_` | `_` |\n\nExample result: `LH_LANDONREVENUE_BRONZE`\n\nPresent the suggested defaults and **ask the user to confirm or override**\neach component. The user may change any component or use fully custom names\nthat don't follow the pattern at all.\n\n#### 2c — Lakehouse Definitions\n\nFor each lakehouse in this workspace, collect:\n\n- [ ] **Name** — generated from the naming convention, or custom\n- [ ] **Description** — optional text describing the lakehouse's purpose\n- [ ] **Schema-enabled** — yes/no (default: no). 
See\n `references/schema-enabled.md` for guidance.\n\n#### 2d — More Workspaces?\n\nAfter finishing one workspace, ask:\n\n> \"Do you have another workspace to provision lakehouses in, or are we done?\"\n\nIf yes, loop back to Step 2a. If done, proceed to Step 3.\n\n### Step 3 — Validate Inputs\n\nBefore generating anything, validate **all** lakehouse definitions:\n\n1. For each workspace, confirm it exists:\n ```bash\n fab exists \"<WorkspaceName>.Workspace\"\n ```\n — if it does not exist, stop and direct the user to create it first\n\n2. For each lakehouse, check it doesn't already exist:\n ```bash\n fab exists \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n ```\n — if it already exists, warn the user and ask whether to **skip** or\n **rename**\n\n3. Validate lakehouse names against naming constraints (see Gotchas)\n\n### Step 4 — Generate & Execute\n\nBranch by the approach chosen in Step 1. Process workspaces sequentially.\n\n**Maintain an audit log** throughout execution — record every command run and\nits outcome. This log feeds into the definition file in Step 6.\n\n#### Approach A — PySpark Notebook\n\n1. Generate a PySpark notebook using the template in\n `references/notebook-template.py`\n2. The notebook pattern is:\n - Install `ms-fabric-cli` via `%pip install ms-fabric-cli -q`\n - Authenticate using `notebookutils.credentials.getToken('pbi')` for `FAB_TOKEN`\n and `FAB_TOKEN_AZURE`, and `notebookutils.credentials.getToken('storage')` for\n `FAB_TOKEN_ONELAKE` (OneLake requires the storage-scope token)\n - Add pip's scripts directory to `PATH` so `!fab` works\n - Use `!fab mkdir` shell commands for standard lakehouses\n - Use `!fab api` with REST payload for schema-enabled lakehouses\n3. The notebook must include:\n - A configuration cell with all workspace/lakehouse definitions\n - Existence checks before each creation\n - A summary cell at the end\n4. Save to `/home/claude/<workspace>_create_lakehouses.py`\n5. Present to user for review\n6. 
Optionally upload:\n ```bash\n fab import \"<Workspace>.Workspace/<Name>.Notebook\" -i <path> --format py -f\n ```\n\n#### Approach B — PowerShell Script\n\n1. Generate a PowerShell script with the following structure:\n2. The script must:\n - Use `fab mkdir` for standard lakehouses\n - Handle schema-enabled lakehouses via the Fabric REST API\n (`fab api` wrapper — see `references/fabric-api-lakehouse.md`)\n - Include `fab exists` checks before each creation\n - Track created items for potential rollback\n - Include error handling and summary output\n3. Save to `/home/claude/create_lakehouses.ps1`\n4. Present the script and **wait for explicit approval** before running\n\n#### Approach C — Interactive CLI\n\nExecute commands one-by-one per workspace, pausing after each:\n\n1. **For each lakehouse** — check then create:\n ```bash\n fab exists \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n ```\n — if not exists, create. For standard lakehouses:\n ```bash\n fab mkdir \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n ```\n — for schema-enabled lakehouses, use the REST API:\n ```bash\n WS_ID=$(fab get \"<WorkspaceName>.Workspace\" -q \"id\" | tr -d '\"')\n fab api \"workspaces/$WS_ID/lakehouses\" -X post \\\n -i '{\"displayName\":\"<Name>\",\"description\":\"<Desc>\",\"creationPayload\":{\"enableSchemas\":true}}'\n ```\n — wait for user confirmation after each\n\n2. **Verification** after all lakehouses in a workspace:\n ```bash\n fab ls \"<WorkspaceName>.Workspace\" -l\n ```\n\n3. Move to next workspace or proceed to Step 5.\n\n### Step 4a — Failure Handling\n\nIf any lakehouse creation fails during execution:\n\n1. **Stop immediately** — do not proceed to the next lakehouse\n2. **Report** what succeeded and what failed\n3. 
**Ask the user** how to proceed:\n\n| Option | Action |\n|--------|--------|\n| **Retry** | Re-attempt the failed lakehouse creation |\n| **Skip** | Skip the failed item and continue with remaining |\n| **Rollback & Abort** | Delete all lakehouses created *in this run*, then stop |\n| **Abort (keep)** | Stop but leave already-created lakehouses in place |\n\nIf the user chooses **Rollback & Abort**:\n```bash\nfab rm \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\" -f\n```\n— for each lakehouse created in this run (tracked in the audit log).\nConfirm each deletion with the user before executing.\n\n### Step 5 — Verify Creation\n\nRegardless of approach, verify every lakehouse across all workspaces:\n\n```bash\nfab exists \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\"\n```\n\nCollect the lakehouse ID for each:\n```bash\nfab get \"<WorkspaceName>.Workspace/<LakehouseName>.Lakehouse\" -q \"id\"\n```\n\nIf any verification fails, report and ask the user how to proceed (same\noptions as Step 4a).\n\n### Step 6 — Generate Definition File\n\nAfter all lakehouses are verified, generate a Lakehouse Definition markdown\nfile using the template in `references/definition-template.md`.\n\nThe definition file must include:\n\n- **Per workspace:** name, ID\n- **Per lakehouse:** name, ID, description, schema-enabled status, naming\n convention used, creation timestamp\n- **Overall:** approach used, naming convention applied, full audit trail of\n commands/API calls executed, any warnings, skipped items, or rollback actions\n\nSave to `/home/claude/lakehouse_definition.md` and present to user.\n\n## Gotchas\n\n- `fab mkdir` creates a standard lakehouse but does NOT support the\n `enableSchemas` property. 
To create a schema-enabled lakehouse, use\n the Fabric REST API: `POST workspaces/{workspaceId}/lakehouses` with\n `{\"displayName\":\"<n>\",\"creationPayload\":{\"enableSchemas\":true}}`\n- Always use `-f` flag with `fab` commands in scripts to avoid interactive\n prompts that block execution\n- Lakehouse names must be unique within a workspace\n- Workspace names are case-sensitive in `fab` paths\n- Always quote paths containing spaces: `\"My Workspace.Workspace\"`\n- The Fabric REST API requires workspace ID (GUID), not display name —\n extract with `fab get \"<n>.Workspace\" -q \"id\"`\n- In notebooks, `ms-fabric-cli` must be installed via `%pip install` and\n the scripts directory added to `PATH` before `!fab` commands work\n- Token audiences for notebook auth: `'pbi'` for `FAB_TOKEN` and `FAB_TOKEN_AZURE`,\n `'storage'` for `FAB_TOKEN_ONELAKE` (OneLake requires the storage-scope token)\n- `fab auth status` must show a valid token before any operations; tokens\n expire and may need refresh\n- Lakehouse names cannot contain: `/`, `\\`, `#`, `%`, `?` or\n leading/trailing spaces. Max length: 256 characters\n- When rolling back, always confirm each deletion with the user — `fab rm`\n with `-f` is irreversible\n- This skill does NOT create workspaces — if a workspace is missing, direct\n the user to the `generate-fabric-workspace` skill\n- This skill does NOT create shortcuts between lakehouses — that is a\n separate step\n\n## Output Format\n\nSee `references/definition-template.md` for the full template.\n\n## Available References\n\n- **`references/notebook-template.py`** — PySpark notebook template for Approach A\n- **`references/definition-template.md`** — Lakehouse definition output template\n- **`references/schema-enabled.md`** — How schema-enabled lakehouses work\n- **`references/fabric-api-lakehouse.md`** — Fabric REST API reference for\n lakehouse creation\n",
|
|
10
11
|
},
|
|
11
12
|
{
|
|
12
13
|
relativePath: "references/agent.md",
|
|
@@ -18,11 +19,11 @@ export const EMBEDDED_SKILLS = [
|
|
|
18
19
|
},
|
|
19
20
|
{
|
|
20
21
|
relativePath: "references/fabric-api-lakehouse.md",
|
|
21
|
-
content: "# Fabric CLI & REST API — Lakehouse Reference\n\n## Create Lakehouse via CLI\n\nThe preferred method for lakehouse creation is `fab create`:\n\n```bash\n# Standard lakehouse (no schemas)\nfab create \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\"\n\n# Schema-enabled lakehouse\nfab create \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\" -P enableSchemas=true\n\n# Explicitly disable schemas (default)\nfab create \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\" -P enableSchemas=false\n```\n\n**Note:** Use `fab create` (not `fab mkdir`) for lakehouse creation — it\nsupports the `-P enableSchemas=true` property directly.\n\n## Create Lakehouse via REST API\n\n**Endpoint:** `POST https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/lakehouses`\n\n### Standard Lakehouse\n\n```json\n{ \"displayName\": \"MyLakehouse\" }\n```\n\n### Schema-Enabled Lakehouse\n\n```json\n{\n \"displayName\": \"MyLakehouse\",\n \"creationPayload\": { \"enableSchemas\": true }\n}\n```\n\n### With Description\n\n```json\n{\n \"displayName\": \"MyLakehouse\",\n \"description\": \"Bronze layer — raw ingestion\",\n \"creationPayload\": { \"enableSchemas\": true }\n}\n```\n\n### Response Codes\n\n| Code | Meaning |\n|------|---------|\n| 201 | Created successfully |\n| 202 | Accepted — long-running operation; poll Location header |\n| 400 | Invalid request (duplicate name, bad characters) |\n| 401 | Token expired or missing |\n| 403 | Insufficient permissions |\n| 404 | Workspace not found |\n| 429 | Rate limited — honour Retry-After header |\n\n## Using fab CLI inside a PySpark Notebook\n\nInstall and authenticate (pip install MUST be in a separate cell — kernel restarts):\n\n```python\n# Cell 1 — Install (kernel restarts after this)\n%pip install ms-fabric-cli -q --no-warn-conflicts\n```\n\n```python\n# Cell 2 — Authenticate\nimport os, sysconfig\n\nscripts_dir = sysconfig.get_path('scripts')\nos.environ['PATH'] = scripts_dir + os.pathsep + os.environ.get('PATH', '')\n\ntoken = 
notebookutils.credentials.getToken('pbi')\nos.environ['FAB_TOKEN'] = token\nos.environ['FAB_TOKEN_ONELAKE'] =
|
|
22
|
+
content: "# Fabric CLI & REST API — Lakehouse Reference\n\n## Create Lakehouse via CLI\n\nThe preferred method for lakehouse creation is `fab create`:\n\n```bash\n# Standard lakehouse (no schemas)\nfab create \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\"\n\n# Schema-enabled lakehouse\nfab create \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\" -P enableSchemas=true\n\n# Explicitly disable schemas (default)\nfab create \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\" -P enableSchemas=false\n```\n\n**Note:** Use `fab create` (not `fab mkdir`) for lakehouse creation — it\nsupports the `-P enableSchemas=true` property directly.\n\n## Create Lakehouse via REST API\n\n**Endpoint:** `POST https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/lakehouses`\n\n### Standard Lakehouse\n\n```json\n{ \"displayName\": \"MyLakehouse\" }\n```\n\n### Schema-Enabled Lakehouse\n\n```json\n{\n \"displayName\": \"MyLakehouse\",\n \"creationPayload\": { \"enableSchemas\": true }\n}\n```\n\n### With Description\n\n```json\n{\n \"displayName\": \"MyLakehouse\",\n \"description\": \"Bronze layer — raw ingestion\",\n \"creationPayload\": { \"enableSchemas\": true }\n}\n```\n\n### Response Codes\n\n| Code | Meaning |\n|------|---------|\n| 201 | Created successfully |\n| 202 | Accepted — long-running operation; poll Location header |\n| 400 | Invalid request (duplicate name, bad characters) |\n| 401 | Token expired or missing |\n| 403 | Insufficient permissions |\n| 404 | Workspace not found |\n| 429 | Rate limited — honour Retry-After header |\n\n## Using fab CLI inside a PySpark Notebook\n\nInstall and authenticate (pip install MUST be in a separate cell — kernel restarts):\n\n```python\n# Cell 1 — Install (kernel restarts after this)\n%pip install ms-fabric-cli -q --no-warn-conflicts\n```\n\n```python\n# Cell 2 — Authenticate\nimport os, sysconfig\n\nscripts_dir = sysconfig.get_path('scripts')\nos.environ['PATH'] = scripts_dir + os.pathsep + os.environ.get('PATH', '')\n\ntoken = 
notebookutils.credentials.getToken('pbi')\nstorage_token = notebookutils.credentials.getToken('storage')\nos.environ['FAB_TOKEN'] = token\nos.environ['FAB_TOKEN_ONELAKE'] = storage_token # OneLake needs storage scope\nos.environ['FAB_TOKEN_AZURE'] = token\n\n!fab auth status\n```\n\nThen use shell commands:\n\n```python\n!fab create \"MyWorkspace.Workspace/MyLakehouse.Lakehouse\" -P enableSchemas=true\n```\n\nOr subprocess for programmatic control:\n\n```python\nimport subprocess\nresult = subprocess.run(\n 'fab create \"MyWorkspace.Workspace/MyLakehouse.Lakehouse\" -P enableSchemas=true',\n shell=True, capture_output=True, text=True\n)\nprint(result.stdout)\n```\n\n## Other Lakehouse Operations\n\n```bash\n# Check if lakehouse exists\nfab exists \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\"\n\n# Get lakehouse details / ID\nfab get \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\"\nfab get \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\" -q \"id\"\n\n# List lakehouses in workspace\nfab ls \"<Workspace>.Workspace\" -l\n\n# Delete lakehouse\nfab rm \"<Workspace>.Workspace/<Lakehouse>.Lakehouse\" -f\n```\n\n## Naming Constraints\n\n- Names must be unique within a workspace\n- Maximum length: 256 characters\n- Cannot contain: `/`, `\\`, `#`, `%`, `?`, leading/trailing spaces\n- Cannot be empty\n",
|
|
22
23
|
},
|
|
23
24
|
{
|
|
24
25
|
relativePath: "references/notebook-template.py",
|
|
25
|
-
content: "# Fabric notebook source\n\n# METADATA ********************\n\n# META {\n# META \"kernel_info\": {\n# META \"name\": \"synapse_pyspark\"\n# META },\n# META \"dependencies\": {}\n# META }\n\n# MARKDOWN ********************\n\n# # Create Lakehouses\n#\n# Auto-generated notebook to provision lakehouse(s) in existing Fabric workspaces.\n#\n# **Approach:** PySpark Notebook (Approach A)\n#\n# **Prerequisites:** Target workspaces must already exist. Use the\n# `generate-fabric-workspace` skill to create them first if needed.\n#\n# **Usage:** Run this notebook inside a Fabric workspace with Spark compute.\n\n# CELL ********************\n\n# ── Step 1: Install Fabric CLI ─────────────────────────────────────────\n# This MUST be in its own cell. The kernel restarts after %pip install,\n# so any code after it in the same cell will not execute.\n%pip install ms-fabric-cli -q --no-warn-conflicts\n\n# CELL ********************\n\n# ── Step 2: Authenticate ───────────────────────────────────────────────\nimport os\nimport sysconfig\n\n# Add pip's scripts directory to PATH so `!fab` works\nscripts_dir = sysconfig.get_path('scripts')\nos.environ['PATH'] = scripts_dir + os.pathsep + os.environ.get('PATH', '')\n\n# Authenticate using the notebook user's identity\ntoken = notebookutils.credentials.getToken('pbi')\nos.environ['FAB_TOKEN'] = token\nos.environ['FAB_TOKEN_ONELAKE'] =
|
|
26
|
+
content: "# Fabric notebook source\n\n# METADATA ********************\n\n# META {\n# META \"kernel_info\": {\n# META \"name\": \"synapse_pyspark\"\n# META },\n# META \"dependencies\": {}\n# META }\n\n# MARKDOWN ********************\n\n# # Create Lakehouses\n#\n# Auto-generated notebook to provision lakehouse(s) in existing Fabric workspaces.\n#\n# **Approach:** PySpark Notebook (Approach A)\n#\n# **Prerequisites:** Target workspaces must already exist. Use the\n# `generate-fabric-workspace` skill to create them first if needed.\n#\n# **Usage:** Run this notebook inside a Fabric workspace with Spark compute.\n\n# CELL ********************\n\n# ── Step 1: Install Fabric CLI ─────────────────────────────────────────\n# This MUST be in its own cell. The kernel restarts after %pip install,\n# so any code after it in the same cell will not execute.\n%pip install ms-fabric-cli -q --no-warn-conflicts\n\n# CELL ********************\n\n# ── Step 2: Authenticate ───────────────────────────────────────────────\nimport os\nimport sysconfig\n\n# Add pip's scripts directory to PATH so `!fab` works\nscripts_dir = sysconfig.get_path('scripts')\nos.environ['PATH'] = scripts_dir + os.pathsep + os.environ.get('PATH', '')\n\n# Authenticate using the notebook user's identity\ntoken = notebookutils.credentials.getToken('pbi')\nstorage_token = notebookutils.credentials.getToken('storage')\nos.environ['FAB_TOKEN'] = token\nos.environ['FAB_TOKEN_ONELAKE'] = storage_token # OneLake needs storage scope\nos.environ['FAB_TOKEN_AZURE'] = token\n\n# Verify authentication\n!fab auth status\n\n# CELL ********************\n\n# ── Step 3: Configuration ──────────────────────────────────────────────\n# Edit these definitions to match your environment.\n#\n# Naming convention: {Prefix}_{CoreName}_{Suffix}\n# Example: LH_LANDONREVENUE_BRONZE\n#\n# NOTE: Target workspaces must already exist.\n\nWORKSPACES = [\n {{WORKSPACE_DEFINITIONS}}\n # Example:\n # {\n # \"name\": \"Landon Finance Month End\",\n # 
\"lakehouses\": [\n # {\"name\": \"LH_LANDONREVENUE_BRONZE\", \"description\": \"Raw ingested data\", \"enable_schemas\": False},\n # {\"name\": \"LH_LANDONREVENUE_SILVER\", \"description\": \"Cleansed and conformed\", \"enable_schemas\": True},\n # {\"name\": \"LH_LANDONREVENUE_GOLD\", \"description\": \"Business-ready aggregates\", \"enable_schemas\": True},\n # ],\n # },\n]\n\n# CELL ********************\n\n# ── Step 4: Create Lakehouses ──────────────────────────────────────────\nimport subprocess\nimport json\nfrom datetime import datetime\n\naudit_log = []\ncreated_items = [] # Track for potential rollback\n\ndef run_fab(cmd, capture=True):\n \"\"\"Run a fab command and return (success, output).\"\"\"\n full_cmd = f\"fab {cmd}\"\n audit_log.append({\"timestamp\": datetime.utcnow().isoformat(), \"command\": full_cmd})\n result = subprocess.run(full_cmd, shell=True, capture_output=capture, text=True)\n success = result.returncode == 0\n output = (result.stdout or \"\").strip()\n audit_log[-1][\"success\"] = success\n audit_log[-1][\"output\"] = output[:500]\n if not success and result.stderr:\n audit_log[-1][\"error\"] = result.stderr.strip()[:500]\n return success, output\n\ndef fab_exists(path):\n ok, _ = run_fab(f'exists \"{path}\"')\n return ok\n\ndef fab_get_id(path):\n ok, output = run_fab(f'get \"{path}\" -q \"id\"')\n return output.strip('\"') if ok else None\n\nresults = []\nabort = False\n\nfor ws_def in WORKSPACES:\n if abort:\n break\n\n ws_name = ws_def[\"name\"]\n\n print(f\"\\n{'='*60}\")\n print(f\"WORKSPACE: {ws_name}\")\n print(f\"{'='*60}\")\n\n # Verify workspace exists\n if not fab_exists(f\"{ws_name}.Workspace\"):\n print(f\" ERROR: Workspace '{ws_name}' does not exist.\")\n print(f\" Create it first using the generate-fabric-workspace skill.\")\n results.append({\"workspace\": ws_name, \"name\": \"-\", \"status\": \"ws_not_found\"})\n continue\n\n print(f\" Workspace verified.\")\n ws_id = fab_get_id(f\"{ws_name}.Workspace\")\n print(f\" 
Workspace ID: {ws_id}\")\n\n # Create lakehouses\n for lh_def in ws_def.get(\"lakehouses\", []):\n if abort:\n break\n\n lh_name = lh_def[\"name\"]\n lh_desc = lh_def.get(\"description\", \"\")\n schemas = lh_def.get(\"enable_schemas\", False)\n schema_tag = \" [schema-enabled]\" if schemas else \"\"\n lh_path = f\"{ws_name}.Workspace/{lh_name}.Lakehouse\"\n\n # Check if already exists\n if fab_exists(lh_path):\n existing_id = fab_get_id(lh_path)\n print(f\" SKIP '{lh_name}' — already exists (ID: {existing_id})\")\n results.append({\n \"workspace\": ws_name, \"name\": lh_name, \"id\": existing_id,\n \"status\": \"skipped\", \"enable_schemas\": schemas,\n \"description\": lh_desc,\n })\n continue\n\n print(f\" CREATE '{lh_name}'{schema_tag} ... \", end=\"\")\n\n if schemas:\n # Schema-enabled: must use REST API via fab api\n body = json.dumps({\n \"displayName\": lh_name,\n \"description\": lh_desc,\n \"creationPayload\": {\"enableSchemas\": True},\n })\n ok, output = run_fab(f\"api \\\"workspaces/{ws_id}/lakehouses\\\" -X post -i '{body}'\")\n else:\n # Standard lakehouse: use fab mkdir\n ok, output = run_fab(f'mkdir \"{lh_path}\" -f')\n\n if ok:\n lh_id = fab_get_id(lh_path) or \"pending\"\n print(f\"OK (ID: {lh_id})\")\n created_items.append(lh_path)\n results.append({\n \"workspace\": ws_name, \"name\": lh_name, \"id\": lh_id,\n \"status\": \"created\", \"enable_schemas\": schemas,\n \"description\": lh_desc,\n })\n else:\n print(f\"FAILED\")\n results.append({\n \"workspace\": ws_name, \"name\": lh_name, \"id\": None,\n \"status\": \"failed\", \"enable_schemas\": schemas,\n \"description\": lh_desc,\n })\n print(f\"\\n ⚠ FAILURE on '{lh_name}'. 
Review the error above.\")\n print(f\" Created so far in this run: {len(created_items)} item(s)\")\n print(f\" To rollback, delete these paths: {created_items}\")\n abort = True # Stop processing — user decides next steps\n\n# CELL ********************\n\n# ── Step 5: Verification ──────────────────────────────────────────────\nprint(\"\\n\" + \"=\" * 60)\nprint(\"VERIFICATION\")\nprint(\"=\" * 60)\n\nfor r in results:\n if r[\"status\"] in (\"created\", \"skipped\") and r.get(\"name\") != \"-\":\n path = f\"{r['workspace']}.Workspace/{r['name']}.Lakehouse\"\n exists = fab_exists(path)\n verified = \"VERIFIED\" if exists else \"NOT FOUND\"\n r[\"verified\"] = exists\n print(f\" {verified:12s} {r['name']} ({r['workspace']})\")\n\n# CELL ********************\n\n# ── Step 6: Summary ───────────────────────────────────────────────────\nprint(\"\\n\" + \"=\" * 60)\nprint(\"CREATION SUMMARY\")\nprint(\"=\" * 60)\n\nfor ws_def in WORKSPACES:\n ws_name = ws_def[\"name\"]\n ws_results = [r for r in results if r.get(\"workspace\") == ws_name and r.get(\"name\") != \"-\"]\n if not ws_results:\n ws_err = [r for r in results if r.get(\"workspace\") == ws_name and r.get(\"status\") == \"ws_not_found\"]\n if ws_err:\n print(f\"\\nWorkspace: {ws_name} — NOT FOUND (skipped)\")\n continue\n print(f\"\\nWorkspace: {ws_name}\")\n print(f\" Created : {sum(1 for r in ws_results if r['status'] == 'created')}\")\n print(f\" Skipped : {sum(1 for r in ws_results if r['status'] == 'skipped')}\")\n print(f\" Failed : {sum(1 for r in ws_results if r['status'] == 'failed')}\")\n print(f\" {'-'*50}\")\n for r in ws_results:\n schema_tag = \" [schema-enabled]\" if r.get(\"enable_schemas\") else \"\"\n desc_tag = f\" — {r['description']}\" if r.get(\"description\") else \"\"\n print(f\" {r['status'].upper():10s} {r['name']}{schema_tag}{desc_tag}\")\n if r.get(\"id\"):\n print(f\" ID: {r['id']}\")\n\ntotal = [r for r in results if r.get(\"name\") != \"-\"]\nprint(f\"\\n{'='*60}\")\nprint(f\"Total: 
{len(total)} | \"\n f\"Created: {sum(1 for r in total if r['status']=='created')} | \"\n f\"Skipped: {sum(1 for r in total if r['status']=='skipped')} | \"\n f\"Failed: {sum(1 for r in total if r['status']=='failed')}\")\n\n# CELL ********************\n\n# ── Audit Log ─────────────────────────────────────────────────────────\nprint(\"\\nAUDIT LOG:\")\nfor entry in audit_log:\n status = \"OK\" if entry.get(\"success\") else \"FAIL\"\n print(f\" [{entry['timestamp']}] [{status}] {entry['command']}\")\n\n# To export: audit_log and results are available as Python objects\n# for downstream processing or definition file generation.\n",
|
|
26
27
|
},
|
|
27
28
|
{
|
|
28
29
|
relativePath: "references/schema-enabled.md",
|
|
@@ -31,115 +32,128 @@ export const EMBEDDED_SKILLS = [
|
|
|
31
32
|
],
|
|
32
33
|
},
|
|
33
34
|
{
|
|
34
|
-
name: "create-
|
|
35
|
+
name: "create-fabric-process-skill",
|
|
36
|
+
category: "fabric",
|
|
35
37
|
files: [
|
|
36
38
|
{
|
|
37
|
-
relativePath: "
|
|
38
|
-
content: "
|
|
39
|
+
relativePath: ".specstory/.project.json",
|
|
40
|
+
content: "{\n \"workspace_id\": \"eead-92c0-d58e-8139\",\n \"workspace_id_at\": \"2026-03-29T18:46:25Z\",\n \"project_name\": \"create-process-skill\"\n}",
|
|
39
41
|
},
|
|
40
42
|
{
|
|
41
|
-
relativePath: "
|
|
42
|
-
content: "#
|
|
43
|
+
relativePath: ".specstory/cli/config.toml",
|
|
44
|
+
content: "# SpecStory CLI Configuration\n#\n# This is the project-level config file for SpecStory CLI.\n# All settings here apply to this project unless overridden by CLI flags.\n#\n# Uncomment (remove the #) the line and edit any setting below to change the default behavior.\n# For more information, see: https://docs.specstory.com/integrations/terminal-coding-agents/usage\n\n[local_sync]\n# Write markdown files locally. (default: true)\n# enabled = false # equivalent to --only-cloud-sync\n\n# Custom output directory for markdown files.\n# Default: ./.specstory/history (relative to the project directory)\n# output_dir = \"~/.specstory/history\" # equivalent to --output-dir \"~/.specstory/history\"\n\n# Use local timezone for file name and content timestamps (default: false, UTC)\n# local_time_zone = true # equivalent to --local-time-zone\n\n[cloud_sync]\n# Sync session data to SpecStory Cloud. (default: true, when logged in to SpecStory Cloud)\n# enabled = false # equivalent to --no-cloud-sync\n\n[logging]\n# Write logs to .specstory/debug/debug.log (default: false)\n# log = true # equivalent to --log \n\n# Debug-level output, requires console or log (default: false)\n# debug = true # equivalent to --debug \n\n# Custom output directory for debug data.\n# Default: ./.specstory/debug (relative to the project directory)\n# debug_dir = \"~/.specstory/debug\" # equivalent to --debug-dir \"~/.specstory/debug\"\n\n# Error/warn/info output to stdout (default: false)\n# console = true # equivalent to --console\n\n# Suppress all non-error output (default: false)\n# silent = true\t# equivalent to --silent\n\n[version_check]\n# Check for new versions of the CLI on startup.\n# Default: true\n# enabled = false # equivalent to --no-version-check\n\n[analytics]\n# Send anonymous product usage analytics to help improve SpecStory.\n# Default: true\n# enabled = false # equivalent to --no-usage-analytics\n\n[telemetry]\n# OTLP gRPC collector endpoint (e.g., \"localhost:4317\" or 
\"http://localhost:4317\")\n# endpoint = \"localhost:4317\"\n\n# Override the default service name (default: \"specstory-cli\")\n# service_name = \"my-service-name\"\n\n# Include user prompt text in telemetry spans (default: true)\n# prompts = false\n\n[providers]\n# Agent execution commands by provider (used by specstory run)\n# Pass custom flags (e.g. claude_cmd = \"claude --allow-dangerously-skip-permissions\")\n# Use of these is equivalent to -c \"custom command\"\n\n# Claude Code command\n# claude_cmd = \"claude\"\n\n# Codex CLI command\n# codex_cmd = \"codex\"\n\n# Cursor CLI command\n# cursor_cmd = \"cursor-agent\"\n\n# Droid CLI command\n# droid_cmd = \"droid\"\n\n# Gemini CLI command\n# gemini_cmd = \"gemini\"\n",
|
|
43
45
|
},
|
|
44
|
-
],
|
|
45
|
-
},
|
|
46
|
-
{
|
|
47
|
-
name: "create-materialised-lakeview-scripts",
|
|
48
|
-
files: [
|
|
49
46
|
{
|
|
50
47
|
relativePath: "SKILL.md",
|
|
51
|
-
content: "---\nname: fabric-lakehouse-mlv\ndescription: >\n Use this skill when asked to generate Spark SQL Materialized Lake View (MLV)\n scripts for Microsoft Fabric Lakehouse transformations. Triggers on: \"generate\n MLV\", \"create silver layer\", \"create gold layer\", \"bronze to silver\", \"silver\n to gold\", \"star schema\", \"lakehouse transformation\", \"materialized lake view\".\n Supports two layers (bronze→silver, silver→gold) and two approaches each\n (schema-driven with source+target CSVs, or pattern-driven with source-only CSVs).\n Does NOT trigger for general SQL writing, Power BI semantic model creation,\n notebook authoring, or Fabric workspace/lakehouse provisioning.\nlicense: MIT\ncompatibility: Python 3.8+ with pandas (for profiling script)\n---\n\n# Fabric Lakehouse MLV Generator\n\nGenerates `CREATE OR REPLACE MATERIALIZED LAKE VIEW` scripts that transform data\nbetween lakehouse layers in Microsoft Fabric. Supports bronze→silver (cleaning,\nconforming, restructuring) and silver→gold (Power BI-optimised star schema).\n\n## Inputs\n\n| Parameter | Description | Example |\n|---|---|---|\n| Layer | Bronze→Silver or Silver→Gold | \"bronze to silver\" |\n| Approach | Schema-driven (source+target CSVs) or Pattern-driven (source CSVs only) | \"schema-driven\" |\n| Source CSVs | CSV exports of the source layer tables | `/mnt/user-data/uploads/*.csv` |\n| Target CSVs | (Schema-driven only) CSV exports of the target layer tables | `/mnt/user-data/uploads/silver_*.csv` |\n| Source schema | Schema name for source tables in SQL | `bronze` |\n| Target schema | Schema name for target views in SQL | `silver` or `gold` |\n| Fiscal year start | (Gold layer only) Month number 1–12 | `3` (March) |\n| Currency code | (Gold layer only) Base currency for measure suffixes | `GBP` |\n\n## Workflow\n\n### Phase 1 — Route the request\n\n- [ ] **1.1** Ask the user: **What layer transformation is this?**\n - Bronze → Silver\n - Silver → Gold\n\n- [ ] **1.2** Ask the 
user: **Which approach?**\n - **Schema-driven** — \"I have both source and target CSV files\"\n - **Pattern-driven** — \"I only have source CSV files; suggest transformations\"\n\n- [ ] **1.3** Based on answers, load the appropriate reference file:\n\n| Layer | Approach | Reference to load |\n|---|---|---|\n| Bronze → Silver | Schema-driven | `references/bronze-to-silver-schema-driven.md` |\n| Bronze → Silver | Pattern-driven | `references/bronze-to-silver-pattern-driven.md` |\n| Silver → Gold | Schema-driven | `references/silver-to-gold-schema-driven.md` |\n| Silver → Gold | Pattern-driven | `references/silver-to-gold-pattern-driven.md` |\n\nRead the full reference file with the `view` tool before proceeding. The reference\ncontains the detailed transformation catalogue, SQL patterns, and validation rules\nfor this specific layer+approach combination.\n\n- [ ] **1.4** Ask the user to confirm:\n - Source schema name (default: `bronze` for B→S, `silver` for S→G)\n - Target schema name (default: `silver` for B→S, `gold` for S→G)\n - If Silver→Gold: fiscal year start month and base currency code\n\n### Phase 2 — Inventory and profile\n\n- [ ] **2.1** List all CSV files in `/mnt/user-data/uploads/`.\n\n- [ ] **2.2** Ask the user to identify which CSVs are **source** and which (if\n schema-driven) are **target**. If file naming makes this obvious, propose the\n split and ask for confirmation.\n\n- [ ] **2.3** Run the profiler against every CSV:\n\n```bash\npython scripts/profile_csvs.py --dir /mnt/user-data/uploads/ --files <file1.csv> <file2.csv> ...\n```\n\nThe profiler outputs a JSON report per file with: column names, inferred dtypes,\nrow count, unique counts, null counts, sample values, and pattern flags (dates,\ncurrency, booleans, commas-in-numbers, whitespace). Store this output for use in\nsubsequent steps.\n\n- [ ] **2.4** If schema-driven: profile both source and target CSVs. Map each\n target file to its source file(s) by column overlap. 
Present the mapping and\n ask the user to confirm.\n\n- [ ] **2.5** If pattern-driven: classify each source file by archetype (see\n reference file for the classification table). Present the classification and\n ask the user to confirm.\n\n### Phase 3 — Detect and plan transformations\n\nFollow the reference file's Step 3 (schema-driven) or Step 3 + Step 4\n(pattern-driven) exactly. The reference contains the full transformation detection\nlogic and catalogue.\n\n- [ ] **3.1** For each source→target pair (schema-driven) or each source file\n (pattern-driven), detect all applicable transformations.\n\n- [ ] **3.2** Present a **transformation plan** to the user — a table showing\n each output view, its sources, the transformations that will be applied, and\n any assumptions.\n\n- [ ] **3.3** If Silver→Gold: run the **anti-pattern check** from the reference:\n - No table mixes dimensions and measures\n - No dimension references another dimension via FK (no snowflaking)\n - Consistent grain within each fact\n - Degenerate dimensions stay in facts\n - Flag junk dimension candidates\n\n- [ ] **3.4** Wait for user confirmation before generating SQL.\n\n### Phase 4 — Generate the SQL\n\nFollow the reference file's SQL generation step exactly (Step 4 or Step 5,\ndepending on reference). Key rules that apply to ALL layer+approach combinations:\n\n**File structure:**\n1. `CREATE SCHEMA IF NOT EXISTS <target_schema>;`\n2. Comment header with assumptions (layer, approach, fiscal year, currency, grain)\n3. Views ordered by dependency (dimensions/independent views first, then dependents)\n4. 
Each view: `CREATE OR REPLACE MATERIALIZED LAKE VIEW <schema>.<view_name> AS`\n\n**CTE conventions:**\n- `cleaned` — trims, casts, date parsing\n- `unpivoted` — if LATERAL VIEW STACK applies\n- `expanded` — if temporal/date-range expansion applies\n- `aggregated` — if GROUP BY applies\n- `normalized` — date_trunc, column selection (gold facts)\n- `category_lookup` — surrogate key resolution (gold facts)\n\n**Naming conventions:**\n- View names: `<schema>.<descriptive_snake_case>`\n- Column aliases: `PascalCase` (e.g., `HotelID`, `MonthStart`, `RevenueAmountGBP`)\n- CTE names: `snake_case`\n\n**Type casting:**\n- IDs and counts → `INT`\n- Monetary values → `DECIMAL(18,2)` with `CAST(ROUND(expr, 2) AS DECIMAL(18,2))`\n- Rates → `DECIMAL(10,2)`\n- Quantities → `BIGINT` (gold layer)\n- Dates → `DATE` via `to_date()` with explicit format\n\n**Spark SQL syntax (non-obvious):**\n- Backtick reserved words: `` `Year` ``, `` `Month` ``, `` `Order` ``\n- `LATERAL VIEW STACK(n, 'label1', col1, ...)` for unpivot\n- `LATERAL VIEW explode(sequence(...))` for temporal expansion\n- `date_trunc('month', col)` for month normalisation\n- `regexp_replace()` — double-escape backslashes: `'\\\\$'` not `'\\$'`\n- `dayofweek()` returns 1=Sunday, 7=Saturday in Spark\n- `DENSE_RANK() OVER (ORDER BY col)` for deterministic surrogate keys\n- `COALESCE(fk, 'UNKNOWN')` in final SELECT, never in JOIN ON\n\n- [ ] **4.1** Write the SQL to `/home/claude/mlv_output.sql`.\n\n### Phase 5 — Validate\n\n- [ ] **5.1** Run the **data validation** from the reference file's validation\n step. 
Load source (and target, if schema-driven) CSVs in pandas and verify:\n - Column names match the target / expected output\n - Row counts are within tolerance (exact for dims, ±5% for facts)\n - Numeric columns: values within tolerance\n - Date columns: all parse correctly\n\n- [ ] **5.2** If Silver→Gold, run the **star schema structural checklist**:\n - [ ] Every table is clearly a dimension or a fact\n - [ ] Every fact has FKs to all related dimensions\n - [ ] Every dimension has a unique primary key\n - [ ] A date dimension exists spanning the full fact date range\n - [ ] Date dimension has display + sort column pairs for Power BI\n - [ ] Every dimension has an unknown/unassigned member row\n - [ ] No snowflaking (no dim-to-dim FK references)\n - [ ] No fact embeds descriptive attributes belonging in a dimension\n - [ ] Consistent grain within each fact table\n - [ ] Consistent naming: `dim_` for dimensions, `fact_` for facts\n - [ ] Surrogate key DENSE_RANK ORDER BY identical in dim views and fact CTEs\n - [ ] Role-playing dimensions documented\n - [ ] Degenerate dimensions remain in facts\n\n- [ ] **5.3** Fix any issues found. 
Re-validate until clean.\n\n### Phase 6 — Deliver\n\n- [ ] **6.1** Copy the validated SQL to `/mnt/user-data/outputs/` with a\n descriptive filename:\n - Bronze→Silver: `silver_layer_mlv.sql`\n - Silver→Gold: `gold_layer_mlv.sql`\n\n- [ ] **6.2** Present the file to the user.\n\n- [ ] **6.3** Summarise:\n - Number of views created\n - Key transformation patterns applied\n - (Gold) Number of dimensions vs facts, fiscal year config, currency\n - Any warnings or assumptions\n\n## Output Format\n\n```sql\n-- <Layer> layer Spark SQL MLV definitions\n-- Generated by fabric-lakehouse-mlv skill\n-- Source schema: <source_schema> | Target schema: <target_schema>\n-- Assumptions: <fiscal year, currency, grain, etc.>\n\nCREATE SCHEMA IF NOT EXISTS <target_schema>;\n\n-- <View description>\nCREATE OR REPLACE MATERIALIZED LAKE VIEW <target_schema>.<view_name> AS\nWITH cleaned AS (\n ...\n)\nSELECT ...\nFROM cleaned;\n```\n\n## Gotchas\n\n- **BOM characters**: Bronze/silver CSVs often have UTF-8 BOM. Always use\n `encoding='utf-8-sig'` in pandas.\n- **Date format ambiguity**: If all day values ≤ 12, `dd/MM/yyyy` vs `MM/dd/yyyy`\n is ambiguous. Default to `dd/MM/yyyy` for UK/EU data. Ask the user if unsure.\n- **Unpivot STACK count**: The integer N in `LATERAL VIEW STACK(N, ...)` must\n exactly match the number of column pairs. Off-by-one causes silent data loss.\n- **Surrogate key determinism**: `DENSE_RANK(ORDER BY col)` in a gold dimension\n and the matching CTE in a fact MUST use the exact same ORDER BY or keys diverge.\n- **SCD fan-out**: Overlapping date ranges in SCD tables duplicate fact rows.\n Validate non-overlap in silver before building gold.\n- **COALESCE placement**: Apply in the final SELECT of gold facts, never in the\n JOIN condition. Joining `ON fk = 'UNKNOWN'` would incorrectly match the\n unknown dimension row.\n- **Revenue-weighted allocation**: Only use when a revenue table exists. 
Fall back\n to equal split (`amount / 12.0`) when revenue is zero for a period.\n- **Power BI sort columns**: In the gold date dimension, always pair display\n columns (MonthName, DayOfWeekName, FiscalPeriodLabel) with numeric sort\n columns (MonthNumber, DayOfWeekNumber, FiscalPeriodNumber). Without these,\n months sort alphabetically in Power BI.\n- **No snowflaking in gold**: Flatten all dimension attributes. `dim_hotel`\n should contain City and Country directly, not reference a `dim_geography`.\n- **dayofweek() in Spark**: Returns 1=Sunday, 7=Saturday. Weekend = `IN (1,7)`.\n- **Fiscal year formula**: `((month + (12 - start_month)) % 12) + 1`. Test at\n January and at the start month for off-by-one errors.\n\n## Available Scripts\n\n- **`scripts/profile_csvs.py`** — Profiles uploaded CSV files and outputs a JSON\n report with column metadata, type flags, and pattern detection.\n Run: `python scripts/profile_csvs.py --help`\n",
|
|
48
|
+
content: "---\r\nname: create-fabric-process-skill\r\ndescription: >\r\n Use this skill when asked to create, build, or generate a reusable agent skill\r\n for a specific Microsoft Fabric technical process. Triggers on: \"create a skill for\r\n [process]\", \"build a Fabric process skill\", \"generate a SKILL.md\", \"turn this\r\n workflow into a skill\", \"make a repeatable agent skill for Fabric\". Produces a\r\n complete, concise skill package with parameterized scripts for deterministic,\r\n repeatable execution. Does NOT trigger for general coding tasks, one-off scripts,\r\n non-Fabric processes, or requests unrelated to creating agent skills.\r\nlicense: MIT\r\ncompatibility: Python 3.8+ required for scripts/validate_skill.py\r\n---\r\n\r\n# Create Fabric Process Skill\r\n\r\n> ⚠️ **GOVERNANCE**: This skill produces skill packages (SKILL.md, scripts, templates)\r\n> for the operator to review — it never executes commands or deploys artefacts\r\n> autonomously. Present all generated files to the operator before they use them.\r\n\r\nGuides the user through creating a well-structured, deterministic skill package\r\nfor a Microsoft Fabric technical process. Output follows the agentskills.io\r\nspecification with parameterized scripts for repeatability.\r\n\r\n## Workflow\r\n\r\nExecute these steps in order. Do not skip steps.\r\n\r\n> ⚠️ **ALWAYS ASK FIRST**: Never draft the skill without completing Steps 1 and 2.\r\n> Even when invoked by an orchestrating agent with full context — pause and collect\r\n> user input first. The user's answers determine parameters, edge cases, and output\r\n> format. Skipping this produces a skill that reflects the agent's assumptions, not\r\n> the user's intent.\r\n\r\n### Step 1 — Gather Process Description\r\n\r\nAsk the user:\r\n\r\n> \"Please describe the process you want to turn into a skill. 
Include:\r\n> - What the process does and its goal\r\n> - Typical inputs and what the output should look like\r\n> - The high-level steps involved\"\r\n\r\n### Step 2 — Probe for Missing Information\r\n\r\nBefore drafting, assess gaps. Ask **one question at a time**. Stop when you\r\nhave enough to draft confidently. Do not ask about things with obvious answers.\r\n\r\nMandatory coverage (use `references/probing-questions.md` for guidance):\r\n\r\n- [ ] What varies between runs? (these become script parameters)\r\n- [ ] What must stay identical every run? (fixed steps, exact commands)\r\n- [ ] Are there known edge cases or failure modes?\r\n- [ ] What tools/languages/runtimes are available in the target environment?\r\n- [ ] What does a correct output look like? (drives the output template)\r\n- [ ] Are there validation checks that confirm success?\r\n- [ ] Does the user reference any other skills or tools from a skills marketplace? (see standalone rule in Step 3)\r\n\r\n### Step 3 — Draft the Skill Package\r\n\r\nUse `assets/output-template.md` as the structural template for the generated\r\n`SKILL.md`. 
Write all files to `<skill-name>/` in the current directory.\r\n\r\n**Directory structure to produce:**\r\n\r\n```\r\n<skill-name>/\r\n├── SKILL.md # Core instructions (≤500 lines / ≤5,000 tokens)\r\n├── scripts/ # Parameterized scripts (if logic repeats across runs)\r\n├── assets/ # Output templates used by the skill\r\n└── references/ # Supporting docs loaded on demand\r\n```\r\n\r\n**SKILL.md authoring rules:**\r\n\r\n- Use imperative phrasing: \"Run…\", \"Check…\", \"If X, do Y\"\r\n- Parameterize all variable inputs as `--flag <VALUE>` or `$PARAM`\r\n- Include a checklist for multi-step workflows (agents track progress)\r\n- Include a validation loop: do work → validate → fix → re-validate\r\n- Write only what the agent wouldn't know on its own (no generic advice)\r\n- Include a Gotchas section for non-obvious facts, edge cases, naming quirks\r\n- Include an output format template inline (or in `assets/` if >30 lines)\r\n- Move content >50 lines to `references/` or `assets/`; tell the agent when to load each file\r\n\r\n**Standalone rule — the generated skill must be fully self-contained:**\r\n\r\nThe end user running the generated process skill will NOT have access to the\r\nsame skills marketplace or directory that was used during its creation. Therefore:\r\n\r\n- **Do not reference external skills by name** (e.g., `skills-for-fabric`,\r\n `azure-prepare`, `azure-ai`). 
The user may not have them installed.\r\n- **Extract and inline** any relevant instructions, patterns, API conventions,\r\n or scripts from referenced skills directly into the process skill's own files\r\n (`SKILL.md`, `scripts/`, `references/`, `assets/`).\r\n- If a referenced skill contains a reusable script, copy and adapt it into\r\n `scripts/` rather than calling the external skill's script path.\r\n- If a referenced skill contains useful reference material, distil the relevant\r\n parts into `references/` rather than pointing to the external source.\r\n- The test: a user with only the generated skill directory should be able to\r\n run the process end-to-end without installing anything else from a skills\r\n marketplace.\r\n\r\n**Fabric notebook rule:** When a generated skill produces Fabric notebook cells, prefer appending cells to an existing notebook from a prior step (where there are no ordering dependencies that prevent it) rather than creating a new notebook, since each notebook requires its own Spark session, pip install, and `fab` authentication.\r\n\r\n**Script authoring rules (when scripts are warranted):**\r\n\r\n- Use Python with PEP 723 inline dependencies (`# /// script`)\r\n- Run with: `uv run scripts/<name>.py` (preferred) or `python scripts/<name>.py`\r\n- Accept all inputs via `--flag` CLI args — no interactive prompts\r\n- Output structured JSON to stdout; diagnostics/warnings to stderr\r\n- Implement `--help` with: description, all flags, and a usage example\r\n- Return clear error messages: what went wrong, what was expected, what to try\r\n\r\n**What to put in scripts vs SKILL.md:**\r\n\r\nMove logic to a script when:\r\n- The same command/transformation runs on every execution\r\n- The logic is complex enough to get wrong by free-generation\r\n- Consistent output format is critical\r\n\r\nKeep in SKILL.md when:\r\n- The agent needs to make judgment calls\r\n- Steps vary significantly based on input\r\n- A one-off CLI tool (uvx/npx) 
already handles it\r\n\r\n### Step 4 — Validate the Draft\r\n\r\nRun the validation script against the generated skill:\r\n\r\n```bash\r\npython scripts/validate_skill.py <skill-name>/SKILL.md\r\n```\r\n\r\nFix any reported issues before proceeding.\r\n\r\n### Step 5 — Create Test Agent\r\n\r\nAfter validation passes, create `agent.md` in the current directory using\r\n`assets/agent-template.md`. Tell the user:\r\n\r\n> \"I've created the skill at `<skill-name>/` and a test agent at `agent.md`.\r\n> Run the agent with a sample set of inputs and share the output so we can\r\n> refine it.\"\r\n\r\n### Step 6 — Iterate on Feedback\r\n\r\nWhen the user shares results:\r\n\r\n1. Classify the issue: missing instruction / wrong instruction / wrong detail level / wrong output format\r\n2. Make the minimal targeted fix — do not rewrite unnecessarily\r\n3. Re-run `python scripts/validate_skill.py <skill-name>/SKILL.md`\r\n4. Confirm the fix and invite the next test round\r\n\r\n## Gotchas\r\n\r\n- **User consultation is mandatory, not optional.** Even when the agent has full\r\n knowledge of the process (from prior conversation, orchestration context, or\r\n attached files), it must still ask the user to describe the process and confirm\r\n how the skill should work. The user may want different parameters, edge-case\r\n handling, or output formats than what the agent would assume. Skipping this\r\n produces skills that reflect the agent's perspective, not the user's intent.\r\n- Keep the generated `SKILL.md` under 500 lines. 
Move overflow to `references/`.\r\n- Scripts must never block on interactive input — the agent runs in a non-interactive shell.\r\n- The `name` field must match the directory name exactly (lowercase, hyphens only).\r\n- Do not add domain knowledge the agent already has — only what it would get wrong without guidance.\r\n- When the process has conditional branches, prefer a checklist over prose so the agent can track state.\r\n- **Do not leave dangling skill references.** If the user described the process using another skill\r\n (e.g., \"use the fabric-spark skill to load the data\"), do not carry that reference into the output.\r\n Extract the relevant knowledge from that skill and embed it directly. The generated skill must work\r\n without access to any external skills directory or marketplace.\r\n\r\n## Progress Report Format\r\n\r\nAfter each draft or iteration, output:\r\n\r\n```\r\n✅ Skill: <skill-name>/SKILL.md (<N> lines)\r\n📁 Scripts: <list or \"none\">\r\n📁 References: <list or \"none\">\r\n📁 Assets: <list or \"none\">\r\n🧪 Agent: agent.md\r\n```\r\n",
|
|
52
49
|
},
|
|
53
50
|
{
|
|
54
|
-
relativePath: "agent.md",
|
|
55
|
-
content: "# Test Agent:
|
|
51
|
+
relativePath: "assets/agent-template.md",
|
|
52
|
+
content: "# Test Agent: <skill-name>\r\n\r\n## Instructions\r\n\r\nYou are a test agent. Your sole instruction is to follow the skill at\r\n`<skill-name>/SKILL.md` exactly as written.\r\n\r\nDo not deviate from the skill instructions. Do not use knowledge or approaches\r\nnot described in the skill. If the skill is ambiguous, note the ambiguity rather\r\nthan making assumptions.\r\n\r\n## How to Use\r\n\r\nProvide a set of process inputs/parameters and this agent will execute the skill\r\nagainst them. After each run, note:\r\n\r\n- Did the output match the expected format?\r\n- Were any steps unclear or skipped?\r\n- Did the skill handle edge cases correctly?\r\n\r\nShare this feedback so the skill can be refined.\r\n",
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
relativePath: "assets/output-template.md",
|
|
56
|
+
content: "---\r\nname: <skill-name>\r\ndescription: >\r\n Use this skill when [user intent / trigger phrases]. Triggers on: \"[phrase 1]\",\r\n \"[phrase 2]\", \"[phrase 3]\". Does NOT trigger for [adjacent tasks to exclude].\r\nlicense: MIT\r\ncompatibility: [e.g. \"Python 3.8+, Node.js 18+\" — omit if no special requirements]\r\n---\r\n\r\n# <Skill Title>\r\n\r\nOne-sentence summary of what this skill does and its goal.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `--param1` | What it controls | `\"value\"` |\r\n| `--param2` | What it controls | `\"value\"` |\r\n\r\n## Workflow\r\n\r\n- [ ] Step 1: [Action] — `scripts/step1.py --param1 <VALUE>`\r\n- [ ] Step 2: [Action] — describe what to do\r\n- [ ] Step 3: Validate — `scripts/validate.py output/`\r\n - If validation fails: fix the issue, re-run validation\r\n - Only proceed when validation passes\r\n- [ ] Step 4: [Final action]\r\n\r\n## Output Format\r\n\r\n```\r\n[Paste a concrete example of what correct output looks like]\r\n```\r\n\r\nOr reference a template: see `assets/output-template.md`\r\n\r\n## Gotchas\r\n\r\n- [Non-obvious fact that the agent would get wrong without being told]\r\n- [Naming inconsistency, soft-delete pattern, environment quirk, etc.]\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/<name>.py`** — [What it does]. Run: `uv run scripts/<name>.py --help`\r\n",
|
|
56
57
|
},
|
|
57
58
|
{
|
|
58
|
-
relativePath: "
|
|
59
|
-
content: "# Approach 2 — Pattern-Driven Silver Layer Generation\n\nGenerate Spark SQL Materialized Lake View (MLV) scripts from bronze CSV files alone.\nThe agent profiles every file, detects structural and data-quality patterns, and\napplies a standardised catalogue of silver-layer transformations without needing\na target schema.\n\n---\n\n## When to use\n\nThe user supplies **only bronze CSV files** — no target silver schema is provided.\nThe goal is to propose and generate a best-practice silver layer that cleans,\nconforms, and restructures the raw data for downstream analytics.\n\n---\n\n## Prerequisites\n\n| Item | Detail |\n|---|---|\n| Bronze CSVs | Uploaded to `/mnt/user-data/uploads/` or provided in context |\n| SQL dialect | Spark SQL (Fabric Lakehouse MLV syntax) |\n| Output path | `/mnt/user-data/outputs/silver_layer_mlv.sql` |\n\n---\n\n## Workflow\n\nExecute every step. Do not skip.\n\n### Step 1 — Inventory and classify the files\n\n1. List all uploaded CSVs.\n2. For each file, run the profiler from Step 2 below.\n3. 
Classify each file into one of these **table archetypes**:\n\n| Archetype | Detection signals |\n|---|---|\n| **Dimension / master data** | Low row count (<500), high cardinality ID column, descriptive text columns, no date-series pattern |\n| **Fact / transactional** | High row count, date column, numeric measure columns, foreign key IDs |\n| **Periodic snapshot** | One row per entity per period, numeric columns that look like cumulative or period totals |\n| **SCD / history** | Start/end date pairs, \"is current\" flag, same entity ID appears multiple times |\n| **Wide / pivoted** | Many similarly-typed numeric columns whose names follow a pattern (days, months, categories) |\n| **Event log** | Start date (and optional end date), attendance/count columns, descriptive text |\n| **Forecast / budget** | Quarter/period labels, multiple scenario columns (budget, forecast, actual) |\n| **Lookup / bridge** | Two or more ID columns, few or no measure columns, maps one entity to another |\n\nPresent the classification to the user and ask for confirmation before proceeding.\n\n### Step 2 — Profile every CSV\n\nFor each CSV, capture:\n\n```python\nimport pandas as pd, re\n\ndef profile(path):\n df = pd.read_csv(path, encoding='utf-8-sig', nrows=500)\n result = {}\n for col in df.columns:\n s = df[col].dropna().astype(str)\n result[col] = {\n \"dtype_inferred\": str(df[col].dtype),\n \"n_unique\": int(df[col].nunique()),\n \"n_null\": int(df[col].isna().sum()),\n \"sample\": s.head(5).tolist(),\n \"has_leading_trailing_whitespace\": bool(s.str.strip().ne(s).any()),\n \"looks_like_date\": bool(s.str.match(\n r'^\\d{1,4}[/\\-\\.]\\d{1,2}[/\\-\\.]\\d{1,4}$'\n ).all()) if len(s) > 0 else False,\n \"looks_like_currency\": bool(s.str.match(\n r'^[\\s]*[\\$£€]?[\\s]*[\\d,]+\\.?\\d*[\\s]*$'\n ).all()) if len(s) > 0 else False,\n \"looks_like_month_year\": bool(s.str.match(\n r'^[A-Za-z]{3}[\\s\\-]\\d{2,4}$'\n ).all()) if len(s) > 0 else False,\n \"looks_like_boolean\": 
set(s.str.upper().unique()).issubset(\n {'YES','NO','TRUE','FALSE','1','0','Y','N',''}\n ),\n \"has_commas_in_numbers\": bool(s.str.match(\n r'^[\\d,]+$'\n ).any()) and not bool(s.str.match(r'^[A-Za-z]').any()),\n \"max_len\": int(s.str.len().max()) if len(s) > 0 else 0,\n \"empty_string_count\": int((s == '').sum()),\n }\n return result\n```\n\n### Step 3 — Apply the transformation catalogue\n\nFor each file, walk through the catalogue below **in order**. Every rule that\nmatches produces a SQL fragment. Combine all matching rules into a single MLV\ndefinition for that file.\n\n---\n\n#### CATALOGUE OF STANDARD TRANSFORMATIONS\n\n##### T01 — Trim all strings\n\n| Condition | Always — apply to every `string`/`object` column |\n|---|---|\n| SQL | `trim(ColumnName) AS ColumnAlias` |\n| Notes | First transformation applied; all subsequent rules operate on trimmed values |\n\n##### T02 — Rename columns to PascalCase\n\n| Condition | Column names contain underscores, spaces, or inconsistent casing |\n|---|---|\n| SQL | Use `AS PascalCaseAlias` in the SELECT |\n| Rules | `Hotel_ID` → `HotelID`, `No_of_Rooms` → `RoomCount`, `Month_Year` → `MonthYear`. Strip redundant prefixes/suffixes. Prefer semantic names: `Number_of_X` → `XCount`, `Is_X` → `IsX`. |\n\n##### T03 — Parse dates\n\n| Condition | Column flagged `looks_like_date` or `looks_like_month_year` in profile |\n|---|---|\n| SQL patterns | |\n\n| Source format | SQL |\n|---|---|\n| `dd/MM/yyyy` | `to_date(col, 'dd/MM/yyyy')` |\n| `yyyy-MM-dd` | `to_date(col, 'yyyy-MM-dd')` |\n| `MM/dd/yyyy` | `to_date(col, 'MM/dd/yyyy')` — only if day values never exceed 12; ask user if ambiguous |\n| `Mon-YY` (e.g. `Jan-09`) | `to_date(concat('01-', regexp_replace(col, ' ', '-')), 'dd-MMM-yy')` |\n| `QUARTER N` | Not a date — handle in T12 (forecast / quarter expansion) |\n\nAlways output as `DATE` type. 
Alias date columns as `XDate` or `MonthStart` depending on semantics.\n\n##### T04 — Null sentinel for optional end-dates\n\n| Condition | Column looks like a date AND has empty strings or NULLs AND is paired with a start-date column |\n|---|---|\n| SQL | `coalesce(to_date(NULLIF(trim(col), ''), 'dd/MM/yyyy'), make_date(2099, 12, 31)) AS EndDate` |\n| Notes | The sentinel `2099-12-31` enables open-ended range joins (`BETWEEN StartDate AND EndDate`) |\n\n##### T05 — Boolean normalisation\n\n| Condition | Column flagged `looks_like_boolean` in profile |\n|---|---|\n| SQL | `CASE WHEN upper(trim(col)) IN ('YES','TRUE','Y','1') THEN 1 ELSE 0 END AS IsX` |\n| Output type | `INT` (0 or 1) |\n\n##### T06 — Cast numeric types\n\n| Condition | Column is numeric in intent but stored as string |\n|---|---|\n| SQL by subtype | |\n\n| Subtype | Detection | SQL |\n|---|---|---|\n| Integer (IDs, counts, quantities) | All values are whole numbers, or column name contains `ID`, `Count`, `Qty`, `Quantity`, `Number` | `CAST(col AS INT)` |\n| Monetary | `looks_like_currency` or column name contains `Revenue`, `Profit`, `Cost`, `Amount`, `Price`, `Total`, `Spend` | `CAST(ROUND(col, 2) AS DECIMAL(18,2))` |\n| Rate / ratio | Column name contains `Rate`, `Ratio`, `Percentage`, `Share` | `CAST(ROUND(col, 2) AS DECIMAL(10,2))` |\n\n##### T07 — Strip currency symbols and comma-formatted numbers\n\n| Condition | `looks_like_currency` or `has_commas_in_numbers` in profile |\n|---|---|\n| SQL | `CAST(regexp_replace(regexp_replace(col, '\\\\$|£|€', ''), ',', '') AS DECIMAL(18,2))` |\n| Notes | Apply BEFORE T06 casting. Handle space-padded values: `trim()` first. 
|\n\n##### T08 — Deduplication\n\n| Condition | Table archetype is **Dimension** AND duplicate rows exist (row count > distinct row count on key columns) |\n|---|---|\n| SQL | Wrap the entire SELECT in `SELECT DISTINCT` |\n\n##### T09 — Unpivot wide tables\n\n| Condition | Table archetype is **Wide / pivoted** — many similarly-typed columns whose names represent categories or time periods |\n|---|---|\n| SQL | `LATERAL VIEW STACK(N, 'Col1', Col1, 'Col2', Col2, …) AS CategoryColumn, ValueColumn` |\n| Steps | 1. Identify the set of columns to unpivot (the \"wide\" columns). 2. Identify the columns to keep (the \"anchor\" columns — usually IDs or names). 3. Generate the STACK expression. 4. Name the new categorical column descriptively (e.g., `DayOfWeek`, `ExpenseCategory`). 5. Name the value column descriptively (e.g., `RateGBP`, `AnnualAmount`). |\n\n**Detection heuristic**: If 4+ columns share the same data type and their names\nform a recognisable set (days of week, months, expense types, room types), they\nare candidates for unpivoting.\n\n##### T10 — Temporal expansion (annual → monthly)\n\n| Condition | Table has a `Year` column (or similar period column) and numeric columns that represent **annual** totals, and the desired grain is monthly |\n|---|---|\n| SQL | `explode(sequence(make_date(YearCol, 1, 1), make_date(YearCol, 12, 1), interval 1 month)) AS MonthStart` |\n| Notes | After expansion, the annual amount must be **allocated** to each month. Default: equal split (`AnnualAmount / 12.0`). If a revenue table exists, prefer proportional allocation — see T13. 
|\n\n##### T11 — Date-range expansion (event start/end → daily/monthly)\n\n| Condition | Table has `StartDate` and `EndDate` columns representing a span |\n|---|---|\n| SQL | `explode(sequence(StartDate, coalesce(EndDate, StartDate), interval 1 day)) AS EventDay` |\n| Follow-up | Calculate `DailyValue = TotalValue / greatest(datediff(EndDate, StartDate) + 1, 1)` then aggregate to monthly: `GROUP BY date_trunc('month', EventDay)` with `SUM(DailyValue)` |\n\n##### T12 — Forecast / quarter expansion (quarterly → monthly)\n\n| Condition | Table archetype is **Forecast / budget** with quarter labels |\n|---|---|\n| SQL | Use `LATERAL VIEW explode(array(1, 2, 3)) AS MonthNumberWithinQuarter` then `make_date(Year, ((QuarterNumber - 1) * 3) + MonthNumberWithinQuarter, 1) AS MonthStart` |\n| Notes | Quarterly totals are split across 3 months. Default: equal third. If revenue data exists, use revenue-weighted allocation — see T13. |\n\n##### T13 — Proportional allocation (revenue-weighted)\n\n| Condition | A temporal expansion (T10, T11, T12) is being applied AND a revenue/transaction table exists that can provide weights |\n|---|---|\n| SQL pattern | |\n\n```sql\n-- Calculate weight per period\nMonthRevenue / NULLIF(SUM(MonthRevenue) OVER (PARTITION BY EntityID, Year), 0) AS MonthWeight\n-- Apply weight\nROUND(AnnualAmount * MonthWeight, 2) AS MonthlyAmount\n-- Fallback when revenue is zero\nCASE WHEN AnnualTotal > 0\n THEN Amount * (MonthValue / AnnualTotal)\n ELSE Amount / 12.0\nEND\n```\n\n| Notes | Always include a zero-revenue fallback (equal split). Use `NULLIF` or `greatest()` to avoid division by zero. |\n\n##### T14 — Enrichment joins (dimension lookups)\n\n| Condition | A fact/transactional table has an ID column that matches a dimension table's key |\n|---|---|\n| SQL | `LEFT JOIN silver.dimension_view d ON f.KeyCol = d.KeyCol` |\n| Rules | 1. Always use `LEFT JOIN` from fact to dimension (never lose fact rows). 2. 
For SCD joins, add `AND f.DateCol BETWEEN d.StartDate AND d.EndDate`. 3. Pull through only the columns needed (manager name, city, country — not the whole dimension). |\n\n##### T15 — Fuzzy / normalised name joins\n\n| Condition | Two tables need joining on a name column but values don't match exactly (abbreviations, prefixes, case differences) |\n|---|---|\n| SQL | `regexp_replace(lower(trim(col)), '^common_prefix\\\\s+', '') AS NormalizedName` on both sides, then join on the normalised column |\n| Notes | Common prefixes to strip: brand names, `The `, `Hotel `, etc. Ask the user to confirm the normalisation logic if not obvious. |\n\n##### T16 — Aggregation\n\n| Condition | Silver grain should be coarser than bronze (e.g., order-level → monthly/category level) |\n|---|---|\n| SQL | `GROUP BY entity_id, date_trunc('month', DateCol), CategoryCol` with `SUM(measure)`, `COUNT(*)`, etc. |\n| Notes | Always `CAST(ROUND(SUM(col), 2) AS DECIMAL(18,2))` for monetary aggregations. |\n\n---\n\n### Step 4 — Propose the silver layer\n\nBefore generating SQL, present a **transformation plan** to the user:\n\n```\nBronze file | Silver view name | Archetype | Transformations applied\n-------------------------|--------------------------------|---------------|------------------------\nraw_hotels.csv | silver.hotel_metadata | Dimension | T01, T02, T03, T06, T08\nraw_managers.csv | silver.manager_assignments_scd | SCD | T01, T02, T03, T04, T05\nraw_revenue.csv | silver.revenue_monthly | Fact | T01, T02, T03, T06, T14\nraw_expenses.csv | silver.expenses_monthly | Periodic | T01, T02, T06, T09, T10, T13\nraw_rates.csv | silver.room_rates | Wide | T01, T02, T06, T09, T14\nraw_events.csv | silver.events_city_monthly | Event log | T01, T02, T03, T06, T07, T11, T14\nraw_forecast.csv | silver.forecast_monthly | Forecast | T01, T02, T07, T12, T13\nraw_orders.csv | silver.property_orders | Fact | T01, T02, T03, T06, T14, T16\n```\n\nInclude for each view:\n- Proposed column list with 
types\n- Join dependencies (which other silver views it references)\n- Any assumptions or ambiguities to resolve\n\n**Wait for user confirmation** before generating SQL.\n\n### Step 5 — Generate the SQL script\n\nProduce one `.sql` file. Follow these structural rules:\n\n**File structure:**\n1. `CREATE SCHEMA IF NOT EXISTS silver;`\n2. Dimension/master views first (no dependencies)\n3. SCD views second (may depend on dimensions)\n4. Fact views next (may depend on dimensions and SCDs)\n5. Derived/allocated views last (depend on other silver views)\n\n**CTE conventions:**\n- `cleaned` — first CTE: trims, casts, parses dates, strips currencies\n- `unpivoted` — if T09 applies\n- `expanded` — if T10/T11/T12 applies\n- `aggregated` — if T16 applies\n- `enriched` or `annotated` — if T13/T14 applies\n- Final SELECT pulls from the last CTE\n\n**Naming conventions:**\n- View names: `silver.<descriptive_snake_case>`\n- Column aliases: `PascalCase`\n- CTE names: `snake_case`\n\n**Type casting rules:**\n- IDs and counts → `INT`\n- Monetary values → `DECIMAL(18,2)`\n- Rates → `DECIMAL(10,2)`\n- Intermediate calculations → `DECIMAL(18,4)` to avoid premature rounding\n- Final monetary output → `CAST(ROUND(expr, 2) AS DECIMAL(18,2))`\n- Dates → `DATE`\n- Strings → trimmed, no explicit cast needed\n\n**Spark SQL syntax reminders:**\n- Backtick reserved words: `` `Year` ``, `` `Month` ``, `` `Order` ``\n- `LATERAL VIEW STACK(n, ...)` for unpivot\n- `LATERAL VIEW explode(...)` for expansion\n- `date_trunc('month', col)` for month normalisation\n- `make_date(y, m, d)` for date construction\n- `regexp_replace()` — double-escape backslashes: `'\\\\$'` not `'\\$'`\n- `sequence(start, end, interval 1 month)` for date sequences\n- `greatest(expr, 1)` to guard against zero division\n- `coalesce()` for null handling\n- `NULLIF(trim(col), '')` to convert empty strings to NULL\n\n### Step 6 — Validate and present\n\n1. 
For each generated view, run a quick sanity check in Python:\n - Load the bronze CSV, apply the transformations in pandas\n - Confirm the output column names, types, and approximate row counts look correct\n2. Write to `/mnt/user-data/outputs/silver_layer_mlv.sql`\n3. Present the file to the user\n4. Summarise: number of views, transformations applied, any assumptions made\n\n---\n\n## Gotchas\n\n- **BOM characters**: Use `encoding='utf-8-sig'` when reading CSVs.\n- **Date format ambiguity**: If all day values are ≤ 12, the format is ambiguous (`dd/MM` vs `MM/dd`). Ask the user. Default to `dd/MM/yyyy` for UK/EU data.\n- **Unpivot column count**: Double-check the STACK count `N` matches the actual number of column pairs. Off-by-one here causes silent data loss.\n- **Revenue-weighted allocation**: Only apply T13 if a revenue or transaction table exists. If not, fall back to equal split and note the assumption.\n- **Cross-view dependencies**: If the expenses view uses revenue weights, it depends on the revenue view. The revenue view must be created first.\n- **Fuzzy joins are fragile**: Always show the user the normalised values from both sides and ask them to confirm the join produces the expected number of matches.\n- **Empty strings**: Many CSVs use empty strings instead of NULL. Always convert with `NULLIF(trim(col), '')` before date parsing or numeric casting to avoid Spark parse errors.\n- **Comma-formatted numbers**: Attendance, currency, and large counts often have commas. Always strip before casting.\n- **Multiple bronze files for one silver view**: Some silver views need data from 2+ bronze files (e.g., orders + hotel metadata). Detect this when a bronze fact table has an ID that matches a dimension table's key.\n- **Proportional allocation fallback**: When the weighting denominator is zero (no revenue in a period), always fall back to equal split. Never produce NULL or zero-divided amounts.\n",
|
|
59
|
+
relativePath: "references/probing-questions.md",
|
|
60
|
+
content: "# Probing Questions Reference\r\n\r\nUse these questions to fill gaps before drafting a skill. Ask **one at a time**.\r\nStop when you have enough to draft confidently. Skip questions with obvious answers.\r\n\r\n## External Skills & Dependencies\r\n\r\n- Does the user reference any other agent skills (e.g., skills-for-fabric, azure-prepare) as part of this process?\r\n- If yes: what specific steps or outputs from that skill are needed? (extract only what's relevant)\r\n- Are there API conventions, authentication patterns, or script logic from those skills that must be replicated?\r\n- Can all required behaviour be embedded in this skill's own files, without assuming the end user has those skills installed?\r\n\r\n## Parameters & Variability\r\n\r\n- What inputs change between different runs of this process?\r\n- Are any of those inputs user-provided, or fetched from a system/file?\r\n- What would break if the wrong value was used? (helps identify critical params)\r\n- Are there default values that apply when an input isn't specified?\r\n\r\n## Fixed Steps & Determinism\r\n\r\n- Are there steps that must always run in exactly the same way? (candidates for scripts)\r\n- Are there specific commands, API calls, or tools that must be used (not alternatives)?\r\n- Is there a specific order that steps must follow, or can some run in parallel?\r\n\r\n## Edge Cases & Failure Modes\r\n\r\n- What are the most common ways this process goes wrong?\r\n- Are there preconditions that must be met before the process can start?\r\n- What should happen if a step fails mid-process?\r\n- Are there data/input values that require special handling?\r\n\r\n## Environment & Tools\r\n\r\n- What language/runtime is available? 
(Python, Node.js, PowerShell, bash, etc.)\r\n- Are there existing libraries or internal tools that should be used?\r\n- Does this process need network access, database connections, or file system access?\r\n- Are there permissions or credentials required?\r\n\r\n## Output & Validation\r\n\r\n- What does a correct output look like? (request an example if possible)\r\n- How does the user currently verify that the process succeeded?\r\n- Should the output be a file, stdout, a structured format (JSON/CSV), or a report?\r\n- Are there downstream systems that consume the output? (affects format constraints)\r\n\r\n## Reusability & Scope\r\n\r\n- Is this skill for one team/project or should it work across different contexts?\r\n- Are there variations of this process that the skill should NOT handle? (scope boundary)\r\n- Should the skill handle cleanup/rollback if something goes wrong?\r\n",
|
|
60
61
|
},
|
|
61
62
|
{
|
|
62
|
-
relativePath: "
|
|
63
|
-
content: "#
|
|
63
|
+
relativePath: "scripts/validate_skill.py",
|
|
64
|
+
content: "#!/usr/bin/env python3\r\n# /// script\r\n# requires-python = \">=3.8\"\r\n# dependencies = []\r\n# ///\r\n\"\"\"\r\nValidate a SKILL.md file against the agentskills.io specification.\r\n\r\nUsage:\r\n uv run scripts/validate_skill.py <path/to/SKILL.md>\r\n python scripts/validate_skill.py <path/to/SKILL.md>\r\n\r\nOptions:\r\n --strict Treat warnings as errors\r\n --help Show this message and exit\r\n\r\nExamples:\r\n uv run scripts/validate_skill.py my-skill/SKILL.md\r\n uv run scripts/validate_skill.py my-skill/SKILL.md --strict\r\n\"\"\"\r\n\r\nimport argparse\r\nimport json\r\nimport re\r\nimport sys\r\nfrom pathlib import Path\r\n\r\n\r\ndef parse_frontmatter(text: str) -> tuple[dict, str]:\r\n \"\"\"Extract YAML frontmatter and body from a SKILL.md string.\"\"\"\r\n if not text.startswith(\"---\"):\r\n return {}, text\r\n end = text.find(\"\\n---\", 3)\r\n if end == -1:\r\n return {}, text\r\n fm_raw = text[4:end].strip()\r\n body = text[end + 4:].strip()\r\n fields = {}\r\n current_key = None\r\n current_val_lines = []\r\n for line in fm_raw.splitlines():\r\n if re.match(r\"^\\S.*:\", line) and not line.startswith(\" \"):\r\n if current_key:\r\n fields[current_key] = \" \".join(current_val_lines).strip()\r\n parts = line.split(\":\", 1)\r\n current_key = parts[0].strip()\r\n current_val_lines = [parts[1].strip()] if len(parts) > 1 else []\r\n elif line.startswith(\" \") and current_key:\r\n current_val_lines.append(line.strip())\r\n else:\r\n if current_key:\r\n fields[current_key] = \" \".join(current_val_lines).strip()\r\n current_key = None\r\n current_val_lines = []\r\n if current_key:\r\n fields[current_key] = \" \".join(current_val_lines).strip()\r\n return fields, body\r\n\r\n\r\ndef validate(skill_path: Path, strict: bool) -> list[dict]:\r\n issues = []\r\n\r\n def error(code: str, msg: str):\r\n issues.append({\"level\": \"error\", \"code\": code, \"message\": msg})\r\n\r\n def warn(code: str, msg: str):\r\n issues.append({\"level\": 
\"warning\" if not strict else \"error\", \"code\": code, \"message\": msg})\r\n\r\n if not skill_path.exists():\r\n error(\"FILE_NOT_FOUND\", f\"File not found: {skill_path}\")\r\n return issues\r\n\r\n text = skill_path.read_text(encoding=\"utf-8\")\r\n fields, body = parse_frontmatter(text)\r\n\r\n # --- name ---\r\n name = fields.get(\"name\", \"\").strip(\">\").strip()\r\n if not name:\r\n error(\"NAME_MISSING\", \"Frontmatter field 'name' is required.\")\r\n else:\r\n if len(name) > 64:\r\n error(\"NAME_TOO_LONG\", f\"'name' must be ≤64 characters (got {len(name)}).\")\r\n if not re.match(r\"^[a-z0-9][a-z0-9-]*[a-z0-9]$\", name) and name not in [c for c in \"abcdefghijklmnopqrstuvwxyz0123456789\"]:\r\n if not re.match(r\"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$\", name):\r\n error(\"NAME_INVALID_CHARS\", f\"'name' may only contain lowercase letters, numbers, and hyphens, and must not start/end with a hyphen. Got: '{name}'\")\r\n if \"--\" in name:\r\n error(\"NAME_CONSECUTIVE_HYPHENS\", f\"'name' must not contain consecutive hyphens. Got: '{name}'\")\r\n parent_dir = skill_path.parent.name\r\n if parent_dir != name:\r\n warn(\"NAME_DIR_MISMATCH\", f\"'name' field ('{name}') must match parent directory name ('{parent_dir}').\")\r\n\r\n # --- description ---\r\n desc = fields.get(\"description\", \"\").replace(\">\", \"\").replace(\"\\n\", \" \").strip()\r\n if not desc:\r\n error(\"DESC_MISSING\", \"Frontmatter field 'description' is required.\")\r\n else:\r\n if len(desc) > 1024:\r\n error(\"DESC_TOO_LONG\", f\"'description' must be ≤1024 characters (got {len(desc)}).\")\r\n if len(desc) < 20:\r\n warn(\"DESC_TOO_SHORT\", f\"'description' is very short ({len(desc)} chars). 
It should describe what the skill does and when to trigger it.\")\r\n\r\n # --- body ---\r\n if not body.strip():\r\n error(\"BODY_EMPTY\", \"SKILL.md body (after frontmatter) must not be empty.\")\r\n\r\n line_count = len(text.splitlines())\r\n if line_count > 500:\r\n warn(\"FILE_TOO_LONG\", f\"SKILL.md is {line_count} lines. Recommended maximum is 500 lines. Move overflow to references/ or assets/.\")\r\n\r\n token_estimate = len(text) // 4\r\n if token_estimate > 5000:\r\n warn(\"TOO_MANY_TOKENS\", f\"Estimated token count ~{token_estimate}. Recommended maximum is 5,000. Consider moving content to references/.\")\r\n\r\n return issues\r\n\r\n\r\ndef main():\r\n parser = argparse.ArgumentParser(\r\n description=\"Validate a SKILL.md file against the agentskills.io specification.\",\r\n formatter_class=argparse.RawDescriptionHelpFormatter,\r\n epilog=__doc__,\r\n )\r\n parser.add_argument(\"skill_path\", type=Path, help=\"Path to SKILL.md file\")\r\n parser.add_argument(\"--strict\", action=\"store_true\", help=\"Treat warnings as errors\")\r\n args = parser.parse_args()\r\n\r\n issues = validate(args.skill_path, strict=args.strict)\r\n\r\n errors = [i for i in issues if i[\"level\"] == \"error\"]\r\n warnings = [i for i in issues if i[\"level\"] == \"warning\"]\r\n\r\n for issue in issues:\r\n prefix = \"❌ ERROR \" if issue[\"level\"] == \"error\" else \"⚠️ WARN \"\r\n print(f\"{prefix}[{issue['code']}] {issue['message']}\", file=sys.stderr)\r\n\r\n if not issues:\r\n print(json.dumps({\"status\": \"ok\", \"errors\": 0, \"warnings\": 0}))\r\n print(\"✅ SKILL.md is valid.\")\r\n else:\r\n print(json.dumps({\"status\": \"fail\" if errors else \"warn\", \"errors\": len(errors), \"warnings\": len(warnings)}))\r\n\r\n sys.exit(1 if errors else 0)\r\n\r\n\r\nif __name__ == \"__main__\":\r\n main()\r\n",
|
|
64
65
|
},
|
|
66
|
+
],
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
name: "create-fabric-process-workflow-agent",
|
|
70
|
+
category: "fabric",
|
|
71
|
+
files: [
|
|
65
72
|
{
|
|
66
|
-
relativePath: "
|
|
67
|
-
content: "
|
|
73
|
+
relativePath: "SKILL.md",
|
|
74
|
+
content: "---\nname: create-fabric-process-workflow-agent\ndescription: >\n Use this skill to create an orchestration agent definition (agent.md) for any\n Microsoft Fabric technical process. The user describes what they want to automate;\n the skill produces a self-contained agent.md. When run, the agent maps the process\n to available Fabric process skills, automatically creates any missing skills using\n create-fabric-process-skill, logs all changes to an audit trail, and orchestrates\n the full process end-to-end. The process skills library grows with every run.\n Triggers on: \"create a process workflow agent\", \"build an orchestration agent\n for [process]\", \"create an agent that automates [process]\", \"orchestrate\n [process] into an agent\". Does NOT trigger for creating individual process\n skills, running an agent, writing code, or one-off analysis.\nlicense: MIT\ncompatibility: Python 3.8+ required for scripts/\n---\n\n# Create Fabric Process Workflow Agent\n\nCreates a concise, self-contained `agent.md` that defines an orchestration agent\nfor a Microsoft Fabric technical process. No process skills need to exist upfront.\nWhen run, the agent maps requirements to available skills, creates any that are\nmissing, and builds up the process skills library over time.\n\n## Core Governance Rules\n\nThese rules are non-negotiable. They must be embedded verbatim in every generated\n`agent.md` so they are active at runtime.\n\n- **RULE 1 — Never execute autonomously.** Never run terminal commands, API calls,\n or scripts directly. Present every command in a fenced code block with the\n insert-into-terminal icon. The user runs it and reports back before proceeding.\n- **RULE 2 — Pre-empt; don't react.** Before any step, ask pointed questions about\n permissions, tooling, and dependencies. 
Do not collect parameters and then\n discover blockers mid-execution.\n- **RULE 3 — No silent approach changes.** If a blocker is found with the chosen\n approach, surface it and present alternatives. Let the user decide. Never switch\n silently.\n- **RULE 4 — No inference from context.** Collect all parameters from the user or\n the current prompt. Do not pre-populate from prior chat history, previous runs,\n or attached files not explicitly part of the current request.\n- **RULE 5 — Respect the user's skill level and environment.** Do not steer toward\n an approach the agent finds easier to generate. Match the user's comfort level,\n installed tooling, and stated preferences.\n- **RULE 6 — Stay within skill boundaries.** Generate only what skill definitions\n describe. On any failure: explain the cause from the error, offer the simplest\n manual or UI fallback, ask whether to skip.\n- **RULE 7 — Append to CHANGE_LOG.md after every step.** Include: step number,\n what was done, outcome (success/failure/skipped), and any notable decisions.\n\n## Inputs\n\n| Parameter | Description | Example |\n|-----------|-------------|---------|\n| `PROCESS_NAME` | Short name for the process (lowercase, hyphens) | `monthly-budget-consolidation` |\n| `REQUIREMENTS` | Full description of the process and each of its steps | `\"1) Collect data from five Excel files... 2) Summarise by category...\"` |\n| `SECTIONS` | Sub-agent sections to include (default: all four) | `impl-plan, biz-process, architecture, governance` |\n| `USERNAME` | Used in output folder naming | `rishi` |\n\n## Workflow\n\n- [ ] **Collect** — If `PROCESS_NAME`, `REQUIREMENTS`, or `USERNAME` are missing, ask for them.\n\n- [ ] **Analyse discovery questions** — Read the requirements and identify the\n environment-specific questions that determine which approaches are viable. 
For each question:\n - Name the specific activity that needs the permission or tool\n - Offer concrete options (not yes/no)\n - State what the agent does differently based on the answer\n Group questions by domain (permissions, tooling, execution preferences, data access,\n existing infrastructure). Ask only about domains the requirements actually need.\n Embed the questionnaire as **Sub-Agent 0: Environment Discovery** in the generated agent.md.\n\n- [ ] **Confirm sections** — Present the four standard sections with descriptions\n (see `references/section-descriptions.md`). Ask which to include. Default: all four.\n Wait for explicit confirmation before drafting.\n\n- [ ] **Draft agent.md** — Use `assets/agent-template.md` as the base.\n - Substitute `{PROCESS_NAME}` and a ≤3-sentence `{REQUIREMENTS_SUMMARY}`.\n - Remove excluded sections. Keep each sub-agent block ≤25 lines.\n - Do not name any specific process skill or technology — all resolved at runtime.\n - Do not hardcode company names, specific values, or environment paths.\n\n- [ ] **Validate** — Present the draft. Ask: *\"Does this accurately reflect the process? 
Anything unclear?\"*\n Refine until the user confirms.\n\n- [ ] **Scaffold** — Run `python scripts/scaffold_output.py --process-name $PROCESS_NAME --username $USERNAME --sections $SECTIONS`.\n Write the confirmed agent.md to the returned `agent_md_path`.\n\n- [ ] **Confirm** — Report the output root path and list all created subfolders.\n\n## Output Format\n\n```\noutputs/\n└── {process-name}_{YYYY-MM-DD_HH-MM}_{username}/\n ├── agent.md ← self-contained orchestration agent definition\n ├── CHANGE_LOG.md ← audit trail; updated as agent runs\n ├── 01-implementation-plan/ ← empty; populated when agent runs\n ├── 02-business-process/ ← empty; populated when agent runs\n ├── 03-solution-architecture/ ← empty; populated when agent runs\n ├── 04-governance/ ← empty; populated when agent runs\n └── NN-step-name/ ← additional subfolders for execution steps\n ├── generate_thing.py ← intermediate (generator script)\n └── thing.ipynb ← final deliverable (generated notebook)\n```\n\n`CHANGE_LOG.md` is initialised empty and updated by the agent each time it runs.\n\n### Intermediate vs. 
final artefacts\n\n| Classification | Description | Examples |\n|----------------|-------------|----------|\n| **Final** | The deliverable the user runs or deploys | `.ipynb` notebooks, `.sql` scripts, `.md` documentation |\n| **Intermediate** | Scripts that generate the final artefacts | `generate_*.py`, `generate_*.ps1` |\n\n- Intermediate artefacts live alongside their final outputs (same subfolder).\n- Label both types clearly when presenting outputs to the user.\n- Intermediate scripts must be deterministic and re-runnable.\n\n### Sub-agents in the generated agent.md\n\n| # | Section | Output document |\n|---|---------|-----------------|\n| 0 | Environment Discovery | `00-environment-discovery/environment-profile.md` |\n| 1 | Implementation Plan | `01-implementation-plan/implementation-plan.md` |\n| 2 | Business Process Mapping | `02-business-process/sop.md` |\n| 3 | Solution Architecture | `03-solution-architecture/specification.md` |\n| 4 | Security, Testing & Governance | `04-governance/governance-plan.md` |\n\n## Gotchas\n\n- **Do not check for or create process skills during skill execution.** All skill\n discovery, creation of missing skills, and audit logging happen inside Sub-Agent 2\n when the generated agent.md is run.\n- **Do not execute sub-agents** during skill execution — `agent.md` is a definition only.\n- Do not name specific tools, technologies, or process skills in the generated agent.md.\n- **Environment discovery must be contextual, not generic.** Derive questions from the\n requirements. If the process doesn't involve workspaces, don't ask about workspace\n creation permissions. 
The questionnaire should read like a knowledgeable consultant\n scoping a project, not a bureaucratic form.\n- Confirm sections **before** drafting, not after.\n- Keep each sub-agent block ≤25 lines to avoid context overload when the agent runs.\n\n## Available Scripts\n\n- **`scripts/scaffold_output.py`** — Creates the dated output folder structure including\n an empty `CHANGE_LOG.md`. Run: `python scripts/scaffold_output.py --help`\n",
|
|
68
75
|
},
|
|
69
76
|
{
|
|
70
|
-
relativePath: "
|
|
71
|
-
content: "#!/usr/bin/env python3\n# /// script\n# requires-python = \">=3.8\"\n# dependencies = [\"pandas>=1.5\"]\n# ///\n\"\"\"\nProfile CSV files for the fabric-lakehouse-mlv skill.\n\nAnalyses each CSV and outputs a JSON report with column metadata, type\ninference, and pattern flags used by the transformation detection logic.\n\nUsage:\n python scripts/profile_csvs.py --files file1.csv file2.csv\n python scripts/profile_csvs.py --dir /mnt/user-data/uploads/ --files hotels.csv revenue.csv\n\nOptions:\n --dir DIR Base directory for CSV files (default: current directory)\n --files FILE ... One or more CSV filenames to profile\n --max-rows N Maximum rows to sample for profiling (default: 500)\n --output PATH Write JSON report to file instead of stdout\n --help Show this message and exit\n\nExamples:\n python scripts/profile_csvs.py --dir /mnt/user-data/uploads/ --files Landon_hotels.csv Landon_hotel_revenue_data.csv\n python scripts/profile_csvs.py --files silver_hotel_metadata.csv silver_revenue_monthly.csv --max-rows 1000\n\"\"\"\n\nimport argparse\nimport json\nimport re\nimport sys\nfrom pathlib import Path\n\ntry:\n import pandas as pd\nexcept ImportError:\n print(\n json.dumps({\"error\": \"pandas is required. 
Install with: pip install pandas --break-system-packages\"}),\n file=sys.stderr,\n )\n sys.exit(1)\n\n\nDATE_PATTERNS = [\n (r\"^\\d{1,2}[/\\-\\.]\\d{1,2}[/\\-\\.]\\d{2,4}$\", \"dd/MM/yyyy or MM/dd/yyyy\"),\n (r\"^\\d{4}[/\\-\\.]\\d{1,2}[/\\-\\.]\\d{1,2}\", \"yyyy-MM-dd\"),\n (r\"^\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}\", \"yyyy-MM-dd HH:mm:ss\"),\n]\n\nMONTH_YEAR_PATTERN = re.compile(r\"^[A-Za-z]{3}[\\s\\-]\\d{2,4}$\")\nCURRENCY_PATTERN = re.compile(r\"^[\\s]*[\\$£€]?[\\s]*[\\-]?[\\d,]+\\.?\\d*[\\s]*$\")\nCURRENCY_SYMBOL_PATTERN = re.compile(r\"[\\$£€]\")\nCOMMA_NUMBER_PATTERN = re.compile(r\"^\\d{1,3}(,\\d{3})+(\\.\\d+)?$\")\nBOOLEAN_VALUES = {\"YES\", \"NO\", \"TRUE\", \"FALSE\", \"1\", \"0\", \"Y\", \"N\", \"\"}\nQUARTER_PATTERN = re.compile(r\"^(Q|QUARTER)\\s*\\d$\", re.IGNORECASE)\n\nMEASURE_KEYWORDS = [\n \"amount\", \"revenue\", \"profit\", \"cost\", \"spend\", \"total\", \"price\",\n \"quantity\", \"qty\", \"attendance\", \"rate\", \"budget\", \"forecast\",\n \"bookings\", \"payroll\", \"insurance\", \"maintenance\", \"utilities\",\n \"expenses\", \"tax\", \"fee\", \"salary\", \"wage\",\n]\n\nID_KEYWORDS = [\"id\", \"key\", \"code\", \"number\", \"no\"]\n\n\ndef looks_like_id(col_name: str) -> bool:\n lower = col_name.lower().replace(\"_\", \"\")\n return any(lower.endswith(kw) or lower.startswith(kw) for kw in ID_KEYWORDS)\n\n\ndef looks_like_measure(col_name: str) -> bool:\n lower = col_name.lower()\n return any(kw in lower for kw in MEASURE_KEYWORDS)\n\n\ndef detect_date_format(series: \"pd.Series\") -> str | None:\n s = series.dropna().astype(str).str.strip()\n if len(s) == 0:\n return None\n sample = s.head(50)\n for pattern, label in DATE_PATTERNS:\n if sample.str.match(pattern).all():\n return label\n return None\n\n\ndef can_disambiguate_date(series: \"pd.Series\") -> dict:\n \"\"\"Check if any day-part value > 12, which disambiguates dd/MM vs MM/dd.\"\"\"\n s = series.dropna().astype(str).str.strip()\n parts = s.str.split(r\"[/\\-\\.]\", 
expand=True)\n if parts.shape[1] < 3:\n return {\"ambiguous\": True}\n try:\n first_part = parts[0].astype(int)\n if (first_part > 12).any():\n return {\"ambiguous\": False, \"likely_format\": \"dd/MM/yyyy\"}\n second_part = parts[1].astype(int)\n if (second_part > 12).any():\n return {\"ambiguous\": False, \"likely_format\": \"MM/dd/yyyy\"}\n except (ValueError, TypeError):\n pass\n return {\"ambiguous\": True, \"note\": \"All day/month values ≤12; ask user\"}\n\n\ndef profile_column(col_name: str, series: \"pd.Series\") -> dict:\n s_str = series.dropna().astype(str)\n s_stripped = s_str.str.strip()\n n_total = len(series)\n n_null = int(series.isna().sum())\n n_unique = int(series.nunique())\n empty_string_count = int((s_str == \"\").sum())\n\n result = {\n \"dtype_inferred\": str(series.dtype),\n \"n_total\": n_total,\n \"n_unique\": n_unique,\n \"n_null\": n_null,\n \"empty_string_count\": empty_string_count,\n \"sample_values\": s_stripped.head(5).tolist(),\n \"has_leading_trailing_whitespace\": bool(s_stripped.ne(s_str).any()),\n \"looks_like_id\": looks_like_id(col_name),\n \"looks_like_measure\": looks_like_measure(col_name),\n }\n\n # Date detection\n date_fmt = detect_date_format(series)\n result[\"looks_like_date\"] = date_fmt is not None\n if date_fmt:\n result[\"detected_date_format\"] = date_fmt\n if \"dd/MM\" in date_fmt or \"MM/dd\" in date_fmt:\n result[\"date_disambiguation\"] = can_disambiguate_date(series)\n\n # Month-year detection (e.g., \"Jan-09\")\n if len(s_stripped) > 0:\n result[\"looks_like_month_year\"] = bool(\n s_stripped.head(50).str.match(MONTH_YEAR_PATTERN.pattern).all()\n )\n else:\n result[\"looks_like_month_year\"] = False\n\n # Currency detection\n if len(s_stripped) > 0 and series.dtype == object:\n result[\"looks_like_currency\"] = bool(\n s_stripped.head(50).str.match(CURRENCY_PATTERN.pattern).all()\n and not s_stripped.head(50).str.match(r\"^[A-Za-z]\").any()\n )\n else:\n result[\"looks_like_currency\"] = False\n\n # 
Comma-in-numbers detection\n result[\"has_commas_in_numbers\"] = bool(\n s_stripped.str.match(COMMA_NUMBER_PATTERN.pattern).any()\n ) if len(s_stripped) > 0 else False\n\n # Boolean detection\n result[\"looks_like_boolean\"] = (\n set(s_stripped.str.upper().unique()).issubset(BOOLEAN_VALUES)\n and n_unique <= 3\n and n_unique > 0\n )\n\n # Quarter label detection\n if len(s_stripped) > 0:\n result[\"looks_like_quarter\"] = bool(\n s_stripped.head(20).str.match(QUARTER_PATTERN.pattern).all()\n )\n else:\n result[\"looks_like_quarter\"] = False\n\n # Max string length\n result[\"max_string_length\"] = int(s_str.str.len().max()) if len(s_str) > 0 else 0\n\n return result\n\n\ndef classify_table(profile_data: dict) -> str:\n \"\"\"Classify a table by archetype based on its column profiles.\"\"\"\n columns = profile_data[\"columns\"]\n row_count = profile_data[\"row_count\"]\n\n has_date_col = any(c[\"looks_like_date\"] or c[\"looks_like_month_year\"] for c in columns.values())\n has_measure = any(c[\"looks_like_measure\"] for c in columns.values())\n has_id = any(c[\"looks_like_id\"] for c in columns.values())\n n_boolean = sum(1 for c in columns.values() if c[\"looks_like_boolean\"])\n has_quarter = any(c[\"looks_like_quarter\"] for c in columns.values())\n has_currency = any(c[\"looks_like_currency\"] for c in columns.values())\n\n # Check for SCD pattern (start/end date pairs)\n col_names_lower = [n.lower() for n in columns.keys()]\n has_start_end = (\n any(\"start\" in n for n in col_names_lower)\n and any(\"end\" in n for n in col_names_lower)\n )\n has_is_current = any(\"current\" in n for n in col_names_lower)\n\n # Check for wide/pivoted (many similarly-typed columns)\n numeric_cols = [\n n for n, c in columns.items()\n if c[\"dtype_inferred\"] in (\"int64\", \"float64\")\n and not c[\"looks_like_id\"]\n ]\n # Days of week or months pattern\n day_names = {\"monday\", \"tuesday\", \"wednesday\", \"thursday\", \"friday\", \"saturday\", \"sunday\"}\n col_names_set 
= {n.lower() for n in columns.keys()}\n is_wide_days = len(day_names & col_names_set) >= 5\n\n if has_start_end and has_is_current:\n return \"scd\"\n if has_quarter and (has_currency or has_measure):\n return \"forecast\"\n if is_wide_days or (len(numeric_cols) >= 6 and not has_date_col):\n return \"wide_pivoted\"\n if has_date_col and has_measure and row_count > 50:\n return \"fact\"\n if row_count < 500 and has_id and not has_measure:\n return \"dimension\"\n # Small reference tables with rates/amounts but no date series (e.g., room rates)\n if row_count < 200 and has_id and has_measure and not has_date_col:\n return \"reference\"\n if has_date_col and any(\"attend\" in n.lower() for n in columns.keys()):\n return \"event_log\"\n if has_id and not has_measure and len(columns) <= 4:\n return \"lookup\"\n return \"unknown\"\n\n\ndef profile_file(filepath: Path, max_rows: int) -> dict:\n try:\n df = pd.read_csv(filepath, encoding=\"utf-8-sig\", nrows=max_rows)\n except Exception as e:\n return {\"error\": str(e), \"file\": str(filepath)}\n\n columns = {}\n for col in df.columns:\n columns[col] = profile_column(col, df[col])\n\n result = {\n \"file\": filepath.name,\n \"row_count\": len(df),\n \"column_count\": len(df.columns),\n \"column_names\": list(df.columns),\n \"columns\": columns,\n }\n\n result[\"archetype\"] = classify_table(result)\n\n return result\n\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"Profile CSV files for fabric-lakehouse-mlv skill.\",\n formatter_class=argparse.RawDescriptionHelpFormatter,\n epilog=__doc__,\n )\n parser.add_argument(\"--dir\", type=str, default=\".\", help=\"Base directory for CSV files\")\n parser.add_argument(\"--files\", nargs=\"+\", required=True, help=\"CSV filenames to profile\")\n parser.add_argument(\"--max-rows\", type=int, default=500, help=\"Max rows to sample (default: 500)\")\n parser.add_argument(\"--output\", type=str, default=None, help=\"Write JSON to file instead of stdout\")\n args = 
parser.parse_args()\n\n base = Path(args.dir)\n results = []\n\n for fname in args.files:\n fpath = base / fname\n if not fpath.exists():\n print(f\"⚠️ File not found: {fpath}\", file=sys.stderr)\n results.append({\"file\": fname, \"error\": f\"File not found: {fpath}\"})\n continue\n print(f\"Profiling {fname}...\", file=sys.stderr)\n results.append(profile_file(fpath, args.max_rows))\n\n output = json.dumps(results, indent=2, default=str)\n\n if args.output:\n Path(args.output).write_text(output, encoding=\"utf-8\")\n print(f\"Report written to {args.output}\", file=sys.stderr)\n else:\n print(output)\n\n\nif __name__ == \"__main__\":\n main()\n",
|
|
77
|
+
relativePath: "assets/agent-template.md",
|
|
78
|
+
content: "# Orchestration Agent: {PROCESS_NAME}\r\n\r\n## Context\r\n\r\n**Process**: {PROCESS_NAME}\r\n**Requirements**: {REQUIREMENTS_SUMMARY}\r\n\r\n---\r\n\r\n## How to Run This Agent\r\n\r\n**Start with Sub-Agent 0 (Environment Discovery).** This gathers the user's\r\npermissions, tooling, and preferences so that every subsequent sub-agent produces\r\nplans tailored to their actual environment. Do not skip this step.\r\n\r\nThen execute each remaining sub-agent in sequence:\r\n\r\n1. Use only the inputs and instructions provided in this file.\r\n2. Produce the specified output document in the designated subfolder.\r\n3. Present the output to the user; ask clarifying questions if anything is unclear.\r\n4. Refine until the user explicitly confirms the output.\r\n5. Append a timestamped entry to `CHANGE_LOG.md` recording what was produced or decided.\r\n6. Pass the confirmed output as the primary input to the next sub-agent.\r\n **Every sub-agent must also read `00-environment-discovery/environment-profile.md`**\r\n and respect the path decisions recorded there.\r\n\r\n**Do not proceed to the next sub-agent without explicit user confirmation.**\r\n**Do not produce code, scripts, or data artefacts not described in each sub-agent below.**\r\n\r\n### Notebook Documentation Standard\r\n\r\nEvery Fabric notebook produced by any skill **must** include a numbered markdown cell\r\nimmediately above each code cell. Each markdown cell must:\r\n\r\n1. State the cell number and a short title (e.g. `## Cell 1 — Install dependencies`).\r\n2. Explain **what** the code cell does in 1–2 sentences.\r\n3. Explain **how to use it**: variables to change, flags to toggle, prerequisites.\r\n\r\nAll transformation logic and design rationale must be **embedded as markdown cells inside\r\nthe notebook** — not maintained as separate documentation files. The notebook is the single\r\nsource of truth. 
A reader must be able to understand what each cell does, why the logic was\r\nchosen, and how to run it without opening any other file.\r\n\r\n### Output Conventions\r\n\r\n- Each sub-agent writes to its own **numbered subfolder** (`01-implementation-plan/`,\r\n `02-business-process/`, etc.). Execution steps continue the numbering (e.g.,\r\n `05-execution/`, `06-gold-layer/`).\r\n- Within each subfolder, distinguish **final deliverables** (notebooks, SQL scripts,\r\n documentation the user runs or deploys) from **intermediate artefacts** (generator\r\n scripts that produce the deliverables). When presenting outputs, label each file.\r\n- All transformation logic and design rationale must be **embedded as markdown cells\r\n inside notebooks** — not maintained as separate documentation files. The notebook\r\n is the single source of truth.\r\n\r\n---\r\n\r\n## Sub-Agent 0: Environment Discovery\r\n\r\n**Input**: Requirements above\r\n**Output**: `00-environment-discovery/environment-profile.md`\r\n\r\nThis sub-agent runs **before anything is planned or built**. Its purpose is to\r\nunderstand the user's environment, permissions, installed tooling, and preferences\r\nso that every subsequent sub-agent produces plans tailored to what is actually\r\npossible and practical.\r\n\r\n### How it works\r\n\r\n1. **Derive questions from the requirements.** Read the requirements and identify\r\n which environment factors will determine which approaches are viable. Group\r\n questions into the relevant discovery domains (see below). Do not ask about\r\n things the requirements don't need — if a process doesn't create workspaces,\r\n don't ask about workspace creation permissions.\r\n\r\n2. **Present the questionnaire.** Show all questions at once, grouped by domain.\r\n Aim for **5–7 questions** — enough to cover the critical unknowns without\r\n overwhelming the user. 
Prioritise by impact: if an answer could change the\r\n entire approach, ask it; if it's a nice-to-have detail, skip it.\r\n Each question must:\r\n - State **why** the answer matters (what it unlocks or blocks).\r\n - Offer concrete options where applicable (e.g., checkboxes, multiple choice).\r\n - Explain what the agent will do differently depending on the answer.\r\n\r\n3. **Confirm understanding.** After the user answers, present a brief summary:\r\n > \"Based on your answers, here's my understanding of your environment: [2–4\r\n > sentence summary of key decisions]. Is this accurate, or anything to correct\r\n > before I proceed to planning?\"\r\n Wait for explicit confirmation. If new gaps surface, ask only the follow-up\r\n questions needed to resolve them — do not re-ask the full questionnaire.\r\n\r\n4. **Record the answers.** Save the complete environment profile as\r\n `00-environment-discovery/environment-profile.md`. This file is the primary\r\n input for Sub-Agent 1 (Implementation Plan) and is referenced by all\r\n subsequent sub-agents.\r\n\r\n### Discovery domains\r\n\r\nSelect only the domains relevant to the requirements. **Every question must\r\nexplain why it is being asked** — what activity needs the permission or tool,\r\nand what the agent will do differently based on the answer.\r\n\r\n#### Permissions & roles\r\n\r\nProbe platform admin rights, resource creation permissions, role assignments,\r\nand domain management. Frame each question around the **specific activity** that\r\nneeds the permission.\r\n\r\nExample — workspace role assignment with Entra groups (a real technical constraint):\r\n\r\n> **Can you assign Entra security groups to Fabric workspace roles?**\r\n>\r\n> _Why this matters:_ The SOP assigns groups to workspace roles for RBAC. The\r\n> Fabric REST API and CLI require **Entra group Object IDs** — display names\r\n> are not accepted. 
The Fabric UI allows searching by name but is manual.\r\n>\r\n> Pick the option that best fits your situation:\r\n>\r\n> - **A) I can look up group Object IDs myself** (e.g., from Entra portal or\r\n> from my admin) → Agent will ask you for the Object IDs and script the\r\n> assignments via Fabric CLI.\r\n> - **B) I have Azure CLI (`az`) installed and can query Entra** → Agent will\r\n> generate `az ad group list --display-name \"...\"` commands so you can\r\n> retrieve Object IDs yourself, then script the assignments.\r\n> - **C) I have PowerShell with the Microsoft.Graph module** → Agent will\r\n> generate `Get-MgGroup -Filter \"displayName eq '...'\"` commands instead.\r\n> - **D) I only have access to the Fabric UI** → Agent will provide step-by-step\r\n> UI instructions with screenshot guidance. Role assignment becomes a manual\r\n> step in the SOP.\r\n> - **E) I'm not sure / I need to check** → Agent will provide a quick check\r\n> command (`az ad group list --display-name \"YourGroupName\" --query \"[].id\"`)\r\n> and pause until you confirm.\r\n\r\nOther permission questions follow the same pattern — always state the activity,\r\nthe constraint, and the options:\r\n\r\n- \"Can you **create workspaces** in Fabric? _(Step 1 needs this. If not, the\r\n agent will produce a workspace specification for your admin to create.)_\"\r\n- \"Can you **create or manage domains** and assign workspaces to them? _(The SOP\r\n organises workspaces under a domain. If you lack domain-admin rights, the agent\r\n will produce a domain-assignment request instead.)_\"\r\n- \"Can you **create lakehouses** in the target workspaces? _(Steps 3-5 provision\r\n lakehouses. If you only have Viewer access, the agent will produce\r\n creation requests for a workspace admin.)_\"\r\n\r\n#### Installed tooling\r\n\r\nProbe CLI tools, SDKs, and runtimes — but only the ones the requirements\r\nactually need. 
**Tell the user what each tool is used for** so they can make\r\nan informed decision about whether to install it.\r\n\r\n- \"Is the **Fabric CLI (`fab`)** installed and authenticated? _(Used for:\r\n creating workspaces, uploading files, creating shortcuts, listing resources.\r\n If not installed, the agent will provide notebook-based alternatives or guide\r\n you through installation.)_\"\r\n- \"Is **Azure CLI (`az`)** available? _(Used for: querying Entra group/user\r\n Object IDs when assigning roles. Not needed if you can supply Object IDs\r\n directly or prefer PowerShell.)_\"\r\n- \"Do you have **Python 3.10+**? _(Used for: running generator scripts that\r\n produce notebooks and SQL. If not available, the agent can provide pre-built\r\n notebooks instead.)_\"\r\n\r\n#### Execution preferences\r\n\r\nGive the user agency over *how* the process is delivered:\r\n\r\n- \"How do you prefer to **run commands**? _(Terminal / Notebook cells / Fabric UI\r\n — the agent will format all instructions accordingly.)_\"\r\n- \"Do you want the agent to **execute commands directly** or **produce scripts\r\n for you to review and run**? _(Direct execution is faster; review-first gives\r\n you more control.)_\"\r\n\r\n#### Data access & connectivity\r\n\r\nOnly ask when the requirements involve data ingestion or movement:\r\n\r\n- \"Where is the **source data**? _(Local files / SharePoint / Azure Storage /\r\n API / already in OneLake — determines upload method and whether shortcuts\r\n can replace copies.)_\"\r\n- \"Can notebooks in your Fabric workspace **access the source location**?\r\n _(Network restrictions or firewall rules may block runtime access. If blocked,\r\n the agent will add a local-upload step.)_\"\r\n\r\n#### Capacity & licensing\r\n\r\nOnly ask when relevant to compute or feature availability:\r\n\r\n- \"What **Fabric capacity SKU** are you on? _(F2/F4 have lower parallelism\r\n limits — the agent will adjust batch sizes. 
Trial capacities have time and\r\n feature limits the agent will flag.)_\"\r\n\r\n#### Existing infrastructure\r\n\r\nOnly ask when the requirements could reuse existing resources:\r\n\r\n- \"Are there **existing workspaces or lakehouses** the process should reuse\r\n rather than create? _(If so, the agent will skip creation steps and wire up\r\n shortcuts to existing resources.)_\"\r\n\r\n#### Team & handoff\r\n\r\nOnly ask when multi-user or governance concerns apply:\r\n\r\n- \"Will **other team members** run or maintain this pipeline? _(If yes, the\r\n agent will add role-assignment steps, document naming conventions, and\r\n produce a handoff checklist.)_\"\r\n\r\n### Path table\r\n\r\nOnce answers are collected, produce a **path table** summarising how the answers\r\nshape the approach. **Each row links an answer back to the specific step it\r\naffects**, so the user can see exactly how their environment shapes the plan:\r\n\r\n```markdown\r\n## Path Decisions\r\n\r\n| # | Question | Your answer | What this means for the plan |\r\n|---|----------|-------------|------------------------------|\r\n| 1 | Workspace creation rights | Admin on capacity | Steps 1-2: Agent will create workspaces directly via `fab workspace create` |\r\n| 2 | Workspace creation rights | No admin rights | Steps 1-2: Agent will produce a workspace spec document; you hand it to your admin. SOP marks this as a manual gate. |\r\n| 3 | Entra group role assignment | Option B — has Azure CLI | Step 2: Agent will generate `az ad group list` commands to fetch Object IDs, then script `fab workspace role assign` |\r\n| 4 | Entra group role assignment | Option D — UI only | Step 2: Agent will provide click-by-click UI instructions. Role assignment becomes a manual SOP step. 
|\r\n| 5 | Fabric CLI installed | Yes, authenticated | All CLI steps presented as `fab ...` terminal commands |\r\n| 6 | Fabric CLI installed | Not installed | Agent will either (a) guide installation, or (b) provide notebook `!pip install` + `!fab` alternatives — your choice |\r\n```\r\n\r\n### Rules for question design\r\n\r\n- **Contextual, not generic.** Every question must name the activity it enables\r\n and the step(s) it affects. A questionnaire that reads like a bureaucratic\r\n intake form is wrong — it should read like a knowledgeable consultant scoping\r\n a project.\r\n- **Explain technical constraints in plain language.** When a platform limitation\r\n exists (e.g., \"the API requires Object IDs, not display names\"), say so — then\r\n immediately offer the user multiple ways to work around it. The user should\r\n never feel blocked; they should feel informed and in control.\r\n- **Give power to the user.** Options should not be \"yes/no you can or can't do\r\n this.\" They should be \"here are 3-4 ways to achieve this — which fits your\r\n situation?\" Even a user with limited permissions should see a viable path.\r\n- **Offer verification commands.** If the user isn't sure about an answer, give\r\n them a one-liner they can run to find out (e.g., \"Run `fab ls` — if it\r\n returns workspace names, you're authenticated.\").\r\n- **Do not guess or assume.** If the answer matters to the plan, ask. If the\r\n user says \"I'm not sure,\" help them check — don't default silently.\r\n- **Keep it proportional.** Target 5–7 questions. A simple 3-step process may\r\n need only 3–4; a complex multi-workspace pipeline might need 7. Beyond 7,\r\n split into a first wave and ask follow-ups only if gaps emerge. 
Never pad\r\n with irrelevant questions to look thorough.\r\n- The environment profile is a **living document** — if a later sub-agent\r\n discovers a new constraint, append it and re-confirm with the user.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 0 complete — environment-profile.md produced. [N] path decisions recorded.`\r\n- **Confirm the environment profile with the user before proceeding to Sub-Agent 1.**\r\n\r\n---\r\n\r\n## Sub-Agent 1: Implementation Plan\r\n\r\n**Input**: Confirmed output of Sub-Agent 0 (`00-environment-discovery/environment-profile.md`) + Requirements above\r\n**Output**: `01-implementation-plan/implementation-plan.md`\r\n\r\nProduce a phased implementation plan using the structure below. Keep ≤50 lines.\r\nUpdate the RAID log whenever a later sub-agent raises a new risk or dependency.\r\n\r\n```markdown\r\n---\r\ngoal: {PROCESS_NAME} — Implementation Plan\r\nstatus: Planned\r\ndate_created: {DATE}\r\n---\r\n\r\n# Implementation Plan: {PROCESS_NAME}\r\n\r\n## Requirements & Constraints\r\n- REQ-001: [Requirement drawn from the context above]\r\n- CON-001: [Key constraint]\r\n\r\n## Phases\r\n\r\n### Phase 1: [Phase name]\r\n| Task | Description | Status |\r\n|----------|-------------|---------|\r\n| TASK-001 | [Task] | Planned |\r\n| TASK-002 | [Task] | Planned |\r\n\r\n### Phase 2: [Phase name]\r\n| Task | Description | Status |\r\n|----------|-------------|---------|\r\n| TASK-003 | [Task] | Planned |\r\n\r\n## RAID Log\r\n| Type | ID | Description | Mitigation / Action | Status |\r\n|------------|-------|--------------|---------------------|--------|\r\n| Risk | R-001 | [Risk] | [Mitigation] | Open |\r\n| Assumption | A-001 | [Assumption] | [Validation] | Open |\r\n| Issue | I-001 | [Issue] | [Resolution] | Open |\r\n| Dependency | D-001 | [Dependency] | [Owner] | Open |\r\n```\r\n\r\nRules:\r\n- Use REQ-, CON-, TASK-, R-, A-, I-, D- prefixes consistently.\r\n- Task status values: Planned / In Progress / Done.\r\n- Do not include implementation code or scripts.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] 
Sub-Agent 1 complete — implementation-plan.md produced.`\r\n- **Confirm with user before proceeding to Sub-Agent 2.**\r\n\r\n---\r\n\r\n## Sub-Agent 2: Business Process Mapping\r\n\r\n**Input**: Confirmed output of Sub-Agent 1 + Requirements above\r\n**Output**: `02-business-process/sop.md`\r\n\r\nThis sub-agent maps requirements to process skills, creates any that are missing,\r\nand produces a Standard Operating Procedure. Work through the three steps below.\r\n\r\n### Step 1 — Decompose requirements into process steps\r\n\r\nRead the requirements and break them into discrete, ordered steps. For each step,\r\nwrite a one-line description of what it needs to do and what its output is.\r\n\r\n### Step 2 — Map each step to a process skill\r\n\r\nFor each step, search the skills directory for a matching process skill\r\n(a skill whose description covers the same action and output).\r\n\r\nFor every step, one of three outcomes applies:\r\n\r\n**A — Skill found**: Read the skill's `SKILL.md`. Note its inputs, outputs, and\r\nany parameters it needs from earlier steps. Mark the step as covered.\r\n\r\n**B — Skill not found**: Determine the deterministic logic needed to automate\r\nthis step (the specific inputs, the repeatable actions, and the expected output).\r\nInvoke `create-fabric-process-skill` to create a new skill definition for this step.\r\nOnce created, read its `SKILL.md` and mark the step as covered.\r\nAppend to `CHANGE_LOG.md`:\r\n`[{DATETIME}] New skill created: [skill-name] — [one-line description of what it does].`\r\nAdd the new skill as a dependency in the RAID log from Sub-Agent 1.\r\n\r\n**C — Step must be manual**: If the step cannot be automated (e.g. 
requires human\r\njudgement or a physical action), document it as a manual step with exact operator\r\ninstructions and mark it accordingly.\r\n\r\nRepeat until every step is either covered by a skill or accepted as manual.\r\nAsk the user to confirm the skill list before proceeding to Step 3.\r\n\r\n### Step 3 — Produce the SOP\r\n\r\n```markdown\r\n# SOP: {PROCESS_NAME}\r\n\r\n## Step Sequence\r\n| Step | Skill / Action | Input Parameters | Output | Manual? |\r\n|------|---------------------|--------------------|-------------------|---------|\r\n| 1 | [skill-name] | param=value | [output artefact] | No |\r\n| 2 | [skill-name] | output from step 1 | [output artefact] | No |\r\n| 3 | [Manual: action] | — | — | Yes |\r\n\r\n## Shared Parameters\r\n| Parameter | Source | Passed to steps |\r\n|-----------|------------|-----------------|\r\n| [param] | User input | 1, 3 |\r\n\r\n## Newly Created Skills\r\n| Skill name | Step | Description |\r\n|--------------|------|------------------------------------|\r\n| [skill-name] | 2 | [What it does — one line] |\r\n\r\n## Manual Steps\r\n- MANUAL-001: [Step] — [Reason] — [Exact operator instructions]\r\n```\r\n\r\nRules:\r\n- If requirements are unclear for any step, ask a targeted question and update\r\n requirements before continuing.\r\n- New skills created in this sub-agent are a permanent addition to the skills\r\n library and will be available for future agents.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 2 complete — sop.md produced. [N] new skills created.`\r\n- **Confirm with user before proceeding to Sub-Agent 3.**\r\n\r\n---\r\n\r\n## Sub-Agent 3: Solution Architecture\r\n\r\n**Input**: Confirmed output of Sub-Agent 2\r\n**Output**: `03-solution-architecture/specification.md`\r\n\r\nProduce a plain-language specification. 
Keep total length ≤50 lines.\r\nWrite for a non-technical reader — no code, no implementation detail.\r\n\r\n```markdown\r\n---\r\ntitle: {PROCESS_NAME} — Solution Specification\r\nstatus: Draft\r\ndate_created: {DATE}\r\n---\r\n\r\n# Specification: {PROCESS_NAME}\r\n\r\n## Purpose\r\n[One paragraph: what this solution does and what problem it solves.]\r\n\r\n## Scope\r\n[What is included and what is explicitly excluded.]\r\n\r\n## How It Works\r\n| Step | What happens | Automated? | Notes |\r\n|------|-------------------------------|------------|-----------------|\r\n| 1 | [Plain-language description] | Yes | |\r\n| 2 | [Plain-language description] | No | See MANUAL-001 |\r\n\r\n## Manual Steps\r\n- MANUAL-001: [Step] — [Reason] — [Exact operator instructions]\r\n\r\n## Acceptance Criteria\r\n- AC-001: Given [context], when [action], then [expected outcome].\r\n\r\n## Dependencies\r\n- DEP-001: [External system, file, or service] — [Purpose]\r\n```\r\n\r\nRules:\r\n- Write for a non-technical reader. No jargon without explanation.\r\n- Every manual step must include exact operator instructions.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 3 complete — specification.md produced.`\r\n- **Confirm with user before proceeding to Sub-Agent 4.**\r\n\r\n---\r\n\r\n## Sub-Agent 4: Security, Testing and Governance\r\n\r\n**Input**: Confirmed output of Sub-Agent 3\r\n**Output**: `04-governance/governance-plan.md`\r\n\r\nProduce a governance and deployment plan. 
Keep total length ≤45 lines.\r\n\r\n```markdown\r\n---\r\ntitle: {PROCESS_NAME} — Governance Plan\r\ndate_created: {DATE}\r\n---\r\n\r\n# Governance Plan: {PROCESS_NAME}\r\n\r\n## Agent Boundaries\r\n| Boundary | Rule |\r\n|-------------------------|--------------------------------------------|\r\n| Allowed actions | [Permitted operations] |\r\n| Blocked actions | [Prohibited operations] |\r\n| Requires human approval | [Steps needing explicit sign-off] |\r\n\r\n## Testing Checklist\r\n- [ ] Validate each sub-agent output before passing it to the next\r\n- [ ] Test all manual steps with a real operator before production use\r\n- [ ] Run against a minimal test dataset before using real data\r\n- [ ] Review CHANGE_LOG.md to confirm all new skills are correct\r\n- [ ] Verify the output folder structure after scaffolding\r\n\r\n## Microsoft Responsible AI Alignment\r\n| Principle | How Applied |\r\n|----------------|--------------------------------------------------------|\r\n| Fairness | [How bias is avoided in outputs and decisions] |\r\n| Reliability | [Validation steps, error handling, new skill review] |\r\n| Privacy | [Data handling — no PII retained in output files] |\r\n| Inclusiveness | [Plain language; no domain assumptions made] |\r\n| Transparency | [User validates every sub-agent output; CHANGE_LOG] |\r\n| Accountability | [Human sign-off required before production execution] |\r\n\r\n## Deployment Guidance\r\n- Review `CHANGE_LOG.md` to verify all newly created skills before first run.\r\n- Store `agent.md`, all outputs, and new skills in version control.\r\n- Review the RAID log from Sub-Agent 1 before each new run.\r\n- Human sign-off required before running against production systems.\r\n```\r\n\r\nRules:\r\n- Every RAI principle row must be completed — state explicitly if not applicable and why.\r\n- Human approval must be required for any step that modifies production systems.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 4 complete — 
governance-plan.md produced. Agent definition finalised.`\r\n- **Confirm with user before finalising.**\r\n",
|
|
72
79
|
},
|
|
73
80
|
{
|
|
74
|
-
relativePath: "
|
|
75
|
-
content: "# Approach 2 — Pattern-Driven Gold Star-Schema Generation\n\nGenerate Spark SQL Materialized Lake View (MLV) scripts for a **Power BI-optimised\nstar schema** from silver CSV files alone. The agent profiles every file, classifies\nit as a dimension source or fact source, applies a standardised catalogue of\ngold-layer transformations, and validates the result against star schema design\nprinciples before presenting it.\n\n---\n\n## When to use\n\nThe user supplies **only silver CSV files** — no target gold schema is provided.\nThe goal is to propose and generate a best-practice star schema gold layer\noptimised for Power BI semantic model consumption.\n\n---\n\n## Prerequisites\n\n| Item | Detail |\n|---|---|\n| Silver CSVs | Uploaded or provided in context |\n| SQL dialect | Spark SQL (Fabric Lakehouse MLV syntax) |\n| Output path | `/mnt/user-data/outputs/gold_layer_mlv.sql` |\n\n---\n\n## Star Schema Design Principles\n\nThese principles govern every decision in the workflow. The agent must validate\nevery proposed table against these rules before generating SQL.\n\n### Core model structure\n\nA star schema organises data into two table types connected by relationships:\n\n- **Dimension tables** (the \"one\" side): contain descriptive attributes for\n filtering, slicing, and grouping. Each dimension has a unique key column\n (one row per entity). Name using singular nouns (`Hotel`, `Manager`, `Date`).\n- **Fact tables** (the \"many\" side): contain measurable, quantitative data at a\n consistent grain plus foreign keys to every related dimension. Name using\n business-process nouns (`Revenue`, `Expenses`, `Orders`).\n\n### Key design rules\n\n1. **Separate dimensions from facts.** Never embed descriptive attributes in a\n fact table when they belong in a dimension. A wide denormalised table is an\n anti-pattern — split it.\n2. **Consistent grain.** Every row in a fact table represents the same thing\n (e.g., one hotel × one month). 
Never mix grains in one table.\n3. **Surrogate keys.** Add integer surrogate keys (via `DENSE_RANK`) when the\n source lacks clean unique identifiers or when the natural key is a long string.\n Surrogate keys improve join performance and enable unknown-member rows.\n4. **Date dimension.** Always create a dedicated date table. Include fiscal\n periods if relevant. In Power BI this table will be marked as the date table.\n5. **Flatten dimensions.** Do not snowflake (normalise) dimensions. `dim_hotel`\n should include city and country directly, not point to a separate `dim_city`.\n Extra joins hurt Power BI performance.\n6. **Unknown / unassigned member rows.** Every dimension must have a fallback row\n so that fact records with NULL foreign keys still resolve to a valid dimension\n member. This prevents blank rows in Power BI visuals.\n\n### Special dimension types\n\n| Type | Description | When to use |\n|---|---|---|\n| **Role-playing** | Same dimension used via different relationships (e.g., OrderDate vs ShipDate → dim_date) | Fact has 2+ date/entity references of the same type. Power BI: inactive relationships + `USERELATIONSHIP` in DAX, or duplicate the dimension. |\n| **Slowly Changing (Type 2)** | Tracks historical changes with StartDate/EndDate/IsCurrent | Silver SCD table exists. Gold dim holds only distinct/current attributes; SCD join happens in fact views. |\n| **Junk** | Combines multiple low-cardinality flags into one dimension | Fact has 3+ boolean/flag columns (IsRush, IsGift, IsOnline). Combine all permutations into a single dimension. |\n| **Degenerate** | Transaction identifiers kept directly in the fact, not in a dimension | The only attribute is the ID itself with no descriptive columns. Do not create a dimension for it. 
|\n\n### Anti-patterns the agent must prevent\n\n| Anti-pattern | How the agent detects it | Resolution |\n|---|---|---|\n| **Wide denormalised table** | A proposed gold table has both descriptive text and numeric measures | Split into dimension + fact |\n| **Snowflaked dimensions** | A proposed dimension references another dimension via FK | Flatten — bring attributes directly into the parent dimension |\n| **Many-to-many without bridge** | Two dimensions related through a fact with no single FK path | Add bridge/junction table or restructure |\n| **Mixed-grain fact** | Proposed fact has rows at different granularities | Separate into distinct fact tables per grain |\n| **Dimension without unique key** | Proposed dimension has duplicates on the PK column | Add DISTINCT or investigate missing SCD pattern |\n| **Fact with embedded attributes** | Proposed fact includes descriptive text alongside measures | Promote text to a dimension; replace with FK |\n| **Pointless one-column dimension** | A dimension that contains only an ID and no descriptive attributes | Keep the ID as a degenerate dimension in the fact |\n\n---\n\n## Workflow\n\nExecute every step. Do not skip.\n\n### Step 1 — Inventory and classify the silver tables\n\n1. List all uploaded CSVs.\n2. For each file, run the profiler from Step 2.\n3. Classify each silver table:\n\n| Role | Detection signals |\n|---|---|\n| **Dimension source** | Low row count (<500), descriptive text columns, a natural key (ID column), no date-series, no monetary measures |\n| **Fact source** | Higher row count, has `MonthStart`/date column, numeric measures (revenue, amount, spend, attendance, quantity), FK-like ID columns |\n| **SCD source** | StartDate/EndDate pairs, IsCurrent flag, same entity ID repeats with different date ranges |\n| **Bridge / mapping** | Two+ ID columns, no measures, maps one entity to another |\n\n4. Identify the **grain** of each fact source by listing its non-measure columns.\n5. 
Check for **categorical text columns in fact sources** (low cardinality, not IDs) — these are candidates for surrogate-key dimensions.\n6. Check for **multiple boolean/flag columns in fact sources** — these are junk dimension candidates.\n7. Check for **multiple date columns in fact sources** — these signal role-playing dimension needs.\n8. Present classification to the user for confirmation.\n\n### Step 2 — Profile every CSV\n\nFor each CSV, capture: columns, inferred dtypes, row count, unique counts, null\ncounts, sample values, whether columns look like IDs, measures, dates, or booleans.\nUse `encoding='utf-8-sig'` for BOM handling.\n\n### Step 3 — Apply the transformation catalogue\n\nWalk through the catalogue below **in order**. For each silver table, determine\nwhich gold views it feeds and which transformations apply.\n\n---\n\n#### CATALOGUE OF GOLD-LAYER TRANSFORMATIONS\n\n##### G01 — Generate a date dimension\n\n| Condition | At least one fact source has a date/month column |\n|---|---|\n| Trigger | Always generate `gold.dim_date` |\n\n**Steps:**\n1. Scan all fact source tables; collect every distinct date/month column.\n2. Determine date range: `min(all dates)` to last day of `max(all dates)` month.\n3. Generate daily calendar: `explode(sequence(MinDate, MaxDate, interval 1 day))`.\n4. Ask the user for the **fiscal year start month** (default: January).\n5. 
Include **display + sort column pairs** for Power BI (critical for Sort by Column):\n\n| Display column | Sort column | Power BI usage |\n|---|---|---|\n| `CalendarMonthName` (MMMM) | `CalendarMonthNumber` (1–12) | Sort month names chronologically |\n| `CalendarMonthNameShort` (MMM) | `CalendarMonthNumber` | Same |\n| `DayOfWeekName` (EEEE) | `DayOfWeekNumber` (1–7) | Sort day names in weekday order |\n| `CalendarQuarter` (Q1–Q4) | `CalendarQuarterNumber` (1–4) | Sort quarters |\n| `FiscalPeriodLabel` (FP01–FP12) | `FiscalPeriodNumber` (1–12) | Sort fiscal periods |\n\n**Standard date dimension columns:**\n\n| Column | SQL |\n|---|---|\n| `DateKey` | `CalendarDate` (PK) |\n| `DateKeyInt` | `year * 10000 + month * 100 + day` |\n| `MonthStartDate` | `date_trunc('month', CalendarDate)` |\n| `MonthEndDate` | `last_day(CalendarDate)` |\n| `CalendarYear` | `year(CalendarDate)` |\n| `CalendarMonthNumber` | `month(CalendarDate)` |\n| `CalendarMonthName` | `date_format(CalendarDate, 'MMMM')` |\n| `CalendarMonthNameShort` | `date_format(CalendarDate, 'MMM')` |\n| `CalendarQuarter` | `concat('Q', quarter(CalendarDate))` |\n| `CalendarQuarterNumber` | `quarter(CalendarDate)` |\n| `CalendarDayNumber` | `day(CalendarDate)` |\n| `WeekOfYear` | `weekofyear(CalendarDate)` |\n| `DayOfWeekNumber` | `dayofweek(CalendarDate)` — 1=Sun, 7=Sat |\n| `DayOfWeekName` | `date_format(CalendarDate, 'EEEE')` |\n| `DayOfWeekNameShort` | `date_format(CalendarDate, 'E')` |\n| `IsWeekend` | `CASE WHEN dayofweek(CalendarDate) IN (1,7) THEN 1 ELSE 0 END` |\n| `IsMonthStart` | `CASE WHEN CalendarDate = date_trunc('month', CalendarDate) THEN 1 ELSE 0 END` |\n| `IsMonthEnd` | `CASE WHEN CalendarDate = last_day(CalendarDate) THEN 1 ELSE 0 END` |\n| `FiscalYear` | See G02 |\n| `FiscalPeriodNumber` | See G02 |\n| `FiscalPeriodLabel` | See G02 |\n\n##### G02 — Add fiscal year logic\n\n| Condition | User confirms a fiscal year start month (or data context suggests one) |\n|---|---|\n| Parameter | 
`FISCAL_START_MONTH` (integer 1–12) |\n\n**Formulas:**\n\n```sql\nCASE WHEN month(CalendarDate) >= ${FISCAL_START_MONTH}\n THEN year(CalendarDate)\n ELSE year(CalendarDate) - 1\nEND AS FiscalYear,\n\n(((month(CalendarDate) + (12 - ${FISCAL_START_MONTH})) % 12) + 1) AS FiscalPeriodNumber,\n\nconcat('FP', lpad(\n (((month(CalendarDate) + (12 - ${FISCAL_START_MONTH})) % 12) + 1),\n 2, '0'\n)) AS FiscalPeriodLabel\n```\n\nIf fiscal year = calendar year: set `FISCAL_START_MONTH = 1` and simplify.\n\n##### G03 — Build pass-through dimensions (flatten, no snowflaking)\n\n| Condition | Silver table classified as **Dimension source** |\n|---|---|\n| Action | Create `gold.dim_<entity>` with selected columns + unknown member row |\n\n**Steps:**\n1. Select only columns needed by downstream facts (key + descriptive attributes).\n Drop operational columns not useful for analysis.\n2. **Flatten**: If any attributes could be split into a sub-dimension (city, country,\n geography), keep them directly in this dimension. Do not create a separate\n `dim_city` or `dim_country`. Snowflaking is an anti-pattern in Power BI.\n3. Add an unknown member row via `UNION ALL`:\n - String keys: `'UNKNOWN'` as key, `'Unknown <Entity>'` for name, `NULL`/`'Unknown'` for attributes.\n - Integer keys: `0` or `-1` as key.\n4. If duplicates exist on the natural key, wrap in `SELECT DISTINCT`.\n\n##### G04 — Build SCD-sourced dimensions\n\n| Condition | Silver table classified as **SCD source** |\n|---|---|\n| Action | Create `gold.dim_<role>` with distinct entity attributes + unassigned row |\n\n**Steps:**\n1. `SELECT DISTINCT` the entity ID and name columns only (drop StartDate/EndDate/IsCurrent — SCD join happens in facts).\n2. `COALESCE(EntityID, 'UNASSIGNED')` for null IDs.\n3. Add unassigned row: `UNION SELECT 'UNASSIGNED', 'Unassigned <Role>'`.\n4. 
Use `UNION` (not `UNION ALL`) to deduplicate if COALESCE creates a match.\n\n##### G05 — Build surrogate-key dimensions (from fact categorical columns)\n\n| Condition | A fact source has a categorical text column (low cardinality, not an ID) that should become a dimension FK |\n|---|---|\n| Action | Create `gold.dim_<category>` with integer surrogate key |\n\n**Detection**: Look for columns in fact sources that are text/string, low\ncardinality (<100 distinct), not ending in `ID`, and descriptive (category names,\ntypes, labels).\n\n**Steps:**\n1. `SELECT DISTINCT CategoryColumn FROM silver.fact_source`.\n2. `DENSE_RANK() OVER (ORDER BY CategoryColumn) AS CategoryID`.\n3. Add unknown row: `UNION ALL SELECT 0, 'Unknown <Category>'`.\n\n**When NOT to create a surrogate-key dimension:**\n- If the column has only 2–4 values and no descriptive attributes beyond the name itself, it may be better kept as a **degenerate dimension** in the fact. Ask the user.\n- If the column is a transaction identifier (OrderNumber, InvoiceID), it is a degenerate dimension — keep in the fact.\n\n##### G06 — Detect and handle junk dimension candidates\n\n| Condition | A fact source has 3+ boolean/flag columns (0/1, Yes/No) |\n|---|---|\n\n**Steps:**\n1. Identify all boolean/flag columns in the fact source.\n2. If 3 or more exist, propose a **junk dimension** that contains all permutations:\n\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_flags AS\nWITH permutations AS (\n SELECT DISTINCT Flag1, Flag2, Flag3 FROM silver.fact_source\n)\nSELECT\n DENSE_RANK() OVER (ORDER BY Flag1, Flag2, Flag3) AS FlagGroupID,\n Flag1, Flag2, Flag3\nFROM permutations\nUNION ALL\nSELECT 0, NULL, NULL, NULL;\n```\n\n3. Replace the 3 flag columns in the fact with a single `FlagGroupID` FK.\n4. 
Present this as a suggestion — the user may prefer keeping flags in the fact.\n\n##### G07 — Detect role-playing dimensions\n\n| Condition | A fact source has 2+ date columns (e.g., OrderDate, ShipDate, DueDate) or 2+ FK columns of the same dimension type |\n|---|---|\n\n**Steps:**\n1. Identify the multiple columns.\n2. Generate only ONE dimension (e.g., `gold.dim_date`).\n3. In the fact, create separate FK columns: `OrderDateKey`, `ShipDateKey`.\n4. Add a SQL comment:\n\n```sql\n-- Role-playing dimension: OrderDateKey and ShipDateKey both reference gold.dim_date.\n-- In Power BI, set one relationship as active and use USERELATIONSHIP() in DAX\n-- for the inactive relationship, or duplicate the date table as a query reference.\n```\n\n##### G08 — Conform fact date keys\n\n| Condition | Every fact source |\n|---|---|\n| Action | Standardise date column to `DateKey` |\n\n```sql\nCAST(date_trunc('month', SourceDateColumn) AS DATE) AS DateKey\n```\n\n`date_trunc` returns a TIMESTAMP in Spark SQL; the cast keeps `DateKey` a DATE so it matches the date dimension key.\n\nDefault grain: **monthly** (first of month). If data is daily and user wants daily\ngrain, use the raw date. Ask: \"Should fact tables use monthly or daily date key grain?\"\n\n##### G09 — COALESCE foreign keys to unknown/unassigned members\n\n| Condition | Every FK column in a fact that could contain NULLs |\n|---|---|\n\n| FK type | COALESCE value | Matches dimension |\n|---|---|---|\n| String entity key | `'UNKNOWN'` | Unknown row in dim |\n| Manager/role key | `'UNASSIGNED'` | Unassigned row in dim |\n| Integer surrogate | `0` | Unknown row (ID=0) |\n\nApply in the final SELECT, **not** before any JOINs.\n\n##### G10 — SCD point-in-time join for role assignment\n\n| Condition | A fact needs a manager/owner but has no assignment column, AND an SCD table exists |\n|---|---|\n\n```sql\nLEFT JOIN silver.scd_table ma\n ON fact.EntityID = ma.EntityID\n AND fact.MonthStart BETWEEN ma.StartDate AND ma.EndDate\n```\n\nAlways LEFT JOIN. 
COALESCE the result to `'UNASSIGNED'`.\n\n##### G11 — Surrogate key lookup in facts\n\n| Condition | A fact references a surrogate-key dimension (G05) |\n|---|---|\n\n*Approach A — join to gold dim view:*\n```sql\nLEFT JOIN gold.dim_category dc ON lower(dc.CategoryName) = lower(fact.CategoryCol)\n```\n\n*Approach B — inline CTE with matching DENSE_RANK:*\n```sql\nWITH category_lookup AS (\n SELECT CategoryCol, DENSE_RANK() OVER (ORDER BY CategoryCol) AS CategoryID\n FROM (SELECT DISTINCT CategoryCol FROM silver.source) c\n)\nLEFT JOIN category_lookup cl ON lower(cl.CategoryCol) = lower(fact.CategoryCol)\n```\n\nUse `COALESCE(resolved_id, 0)` as fallback. Be consistent across all facts.\n\n##### G12 — Rename and round measures\n\n| Condition | Every numeric measure column in a fact |\n|---|---|\n\n**Naming**: `<Metric><Currency>` — e.g., `RevenueAmountGBP`, `TotalSpendGBP`.\nAsk the user for the **base currency code** if not obvious.\n\n**Rounding**: `CAST(ROUND(col, 2) AS DECIMAL(18,2))` for monetary.\n`CAST(col AS BIGINT)` for counts/quantities.\n\n##### G13 — Prune operational columns from facts\n\n| Condition | Fact source has descriptive/categorical columns already covered by a dimension |\n|---|---|\n\nA well-formed fact contains only:\n- `DateKey` (FK to dim_date)\n- Entity FKs (HotelID, ManagerID, CategoryID, etc.)\n- Numeric measures\n\nRemove any text column that has been promoted to a dimension (G05) and replace\nwith its surrogate FK.\n\n**Exception — degenerate dimensions**: Keep transaction IDs (OrderNumber, InvoiceID)\nand low-value categorical columns (≤4 values, no attributes) directly in the fact.\nDo not create a pointless dimension for them. 
Ask the user if borderline.\n\n---\n\n### Step 4 — Propose the gold layer (with anti-pattern check)\n\nBefore generating SQL, present a **star schema plan** to the user.\n\n**Dimensions:**\n```\nGold dimension | Source | Key type | Special type | Transforms\n---------------------------|----------------------------|-----------|-----------------|------------\ngold.dim_date | All fact date ranges | DateKey | Generated | G01, G02\ngold.dim_hotel | silver.hotel_metadata | HotelID | Pass-through | G03\ngold.dim_manager | silver.manager_scd | ManagerID | SCD-sourced | G04\ngold.dim_expense_category | silver.expenses (distinct) | Surrogate | Surrogate-key | G05\n```\n\n**Facts:**\n```\nGold fact | Source | Grain | Transforms\n---------------------------|----------------------------|---------------------|-------------\ngold.fact_revenue | silver.revenue_monthly | Hotel × Month | G08, G09, G12\ngold.fact_expenses | silver.expenses_monthly | Hotel × Month × Cat | G08-G13\n```\n\n**Anti-pattern validation** (run before presenting):\n- [ ] No proposed table mixes dimensions and measures\n- [ ] No dimension references another dimension via FK\n- [ ] Each fact has a single consistent grain\n- [ ] Degenerate dimensions stay in facts\n- [ ] Junk dimension candidates are flagged if applicable\n\nInclude for each table: column list, FK mappings, assumptions, ambiguities.\n\n**Wait for user confirmation** before generating SQL.\n\n### Step 5 — Generate the SQL script\n\n**File structure (dependency order):**\n1. `CREATE SCHEMA IF NOT EXISTS gold;`\n2. Comment header (fiscal year, currency, grain, Power BI considerations)\n3. `gold.dim_date` (generated)\n4. Pass-through dimensions\n5. SCD-sourced dimensions\n6. Surrogate-key dimensions (and junk dimensions if applicable)\n7. 
Fact views\n\n**CTE conventions within fact views:**\n- `normalized` — date_trunc, column selection\n- `category_lookup` — surrogate key resolution\n- Final SELECT: COALESCE all FKs, ROUND all measures, alias to standard names\n\n**Naming conventions:**\n- Dimensions: `gold.dim_<entity>` — singular noun, snake_case\n- Facts: `gold.fact_<process>` — process noun, snake_case\n- Surrogate keys: `<Entity>ID` (PascalCase, INT)\n- Natural keys: unchanged from silver\n- DateKey: DATE\n- Measures: `<Metric><Currency>`\n- Unknown: `0` (int), `'UNKNOWN'` (string)\n- Unassigned: `'UNASSIGNED'` (role-based)\n\n**Spark SQL reminders:**\n`DENSE_RANK()`, `COALESCE()`, `UNION ALL` (unknown rows) / `UNION` (SCD dims),\n`date_trunc('month')`, `sequence() + explode()`, `date_format('MMMM'/'EEEE')`,\n`dayofweek()` (1=Sun, 7=Sat), `lpad()`, `last_day()`,\n`CAST(ROUND(x,2) AS DECIMAL(18,2))`, `CAST(x AS BIGINT)`.\n\n### Step 6 — Validate against star schema checklist\n\n**Data validation (Python):**\n- Dimensions: row count = distinct key count + 1 (unknown row)\n- Surrogate keys: gap-free sequence starting at 1 (0 for unknown)\n- Facts: FK values all exist in corresponding dimension\n- Date dimension: full coverage, fiscal logic correct at boundaries\n\n**Star schema structural checklist:**\n\n- [ ] Every gold table is clearly a dimension or a fact — no hybrids\n- [ ] Every fact has FKs to all related dimensions\n- [ ] Every dimension has a unique primary key (no duplicates)\n- [ ] A date dimension exists and spans the full fact date range\n- [ ] Date dimension has display + sort column pairs for Power BI\n- [ ] Every dimension has an unknown/unassigned member row\n- [ ] No snowflaking — no dimension has FKs to other dimensions\n- [ ] No fact embeds descriptive attributes belonging in a dimension\n- [ ] Consistent grain within each fact table\n- [ ] No circular relationship paths\n- [ ] Consistent naming: `dim_` for dimensions, `fact_` for facts\n- [ ] Surrogate key DENSE_RANK ORDER BY 
identical in dim views and fact CTEs\n- [ ] Role-playing dimensions documented (Power BI inactive relationships)\n- [ ] Degenerate dimensions remain in facts (not split into pointless dims)\n- [ ] Junk dimension candidates addressed (combined or deliberately kept separate)\n\nReport failures as warnings.\n\n### Step 7 — Write the final file\n\n1. Write to `/mnt/user-data/outputs/gold_layer_mlv.sql`.\n2. Present the file.\n3. Summarise: dimension count, fact count, total measures, fiscal year config.\n4. Note role-playing / junk / degenerate dimension decisions.\n5. Include Power BI modelling notes (mark date table, Sort by Column setup, USERELATIONSHIP for role-playing dims).\n\n---\n\n## Gotchas\n\n- **Surrogate key determinism**: `DENSE_RANK(ORDER BY col)` needs a tiebreaker if ties are possible.\n- **UNION ALL vs UNION**: `UNION ALL` for unknown rows. `UNION` for SCD dims where COALESCE may duplicate the unassigned row.\n- **SCD fan-out**: Overlapping SCD date ranges duplicate fact rows. Validate non-overlap.\n- **Surrogate key consistency**: DENSE_RANK ORDER BY must match between dim view and fact CTE.\n- **COALESCE placement**: Apply in final SELECT, never in JOIN ON clause.\n- **Date dimension bounds**: Earliest fact date to last day of latest fact month.\n- **Fiscal year off-by-one**: Test the modular arithmetic at January and at the fiscal start month.\n- **dayofweek()**: Spark returns 1=Sunday, 7=Saturday. Weekend = `IN (1, 7)`.\n- **Power BI sort columns**: Always pair display names with numeric sort columns. Without this, months sort alphabetically (April, August, December…) instead of chronologically.\n- **No snowflaking**: Flatten all attributes into the dimension. If `dim_hotel` has City and Country, do NOT create `dim_geography`. Power BI performs best with flat dimensions.\n- **Degenerate vs real dimensions**: A column needs a dimension only if it has descriptive attributes beyond the key itself. 
OrderNumber with no other attributes is degenerate — keep in fact.\n- **Junk dimensions**: Combining 3+ boolean flags into a single dimension reduces model complexity but adds a join. Propose it; let the user decide.\n- **Multiple facts, conformed dimensions**: All facts referencing `dim_hotel` must use the same `HotelID` key and `'UNKNOWN'` fallback. Consistency is critical for drill-across.\n- **BOM characters**: `encoding='utf-8-sig'` when reading CSVs.\n",
|
|
81
|
+
relativePath: "references/section-descriptions.md",
|
|
82
|
+
content: "# Section Descriptions\r\n\r\nShow this to the user during Step 3 (section confirmation).\r\nPresent each section with its description, then ask which to include.\r\n\r\n---\r\n\r\n## Standard Sections\r\n\r\n**0. Environment Discovery** (`discovery`)\r\nRuns first — before any planning. Asks the user contextual questions about\r\ntheir permissions, installed tooling, execution preferences, and existing\r\ninfrastructure. Each question explains *why* it is being asked (which\r\nactivity needs it, what the technical constraint is) and offers multiple\r\noptions so the user chooses how to proceed. Produces an environment profile\r\nand a path-decisions table that all subsequent sub-agents must respect.\r\nOutputs: `00-environment-discovery/environment-profile.md`.\r\n**Always included — cannot be deselected.**\r\n\r\n**1. Implementation Plan** (`impl-plan`)\r\nProduces a phased task plan with completion criteria and a RAID log\r\n(Risks, Assumptions, Issues, Dependencies). Updated continuously as\r\nother sub-agents progress. Outputs: `implementation-plan.md`.\r\n\r\n**2. Business Process Mapping** (`biz-process`)\r\nMaps the requirements and process skills into a Standard Operating\r\nProcedure (SOP) — a sequenced table of steps, skill names, parameters,\r\nand whether each step is automated or manual. Identifies gaps and asks\r\nclarifying questions if requirements are insufficient.\r\nOutputs: `sop.md`.\r\n\r\n**3. Solution Architecture** (`architecture`)\r\nPresents the solution design in plain, non-technical language: which\r\nplatform capabilities are used, what can be automated vs. done manually,\r\nand clear instructions for any manual steps. Produces a concise\r\nspecification document. Outputs: `specification.md`.\r\n\r\n**4. 
Security, Testing and Governance** (`governance`)\r\nOutlines safety controls, human-approval requirements, testing steps,\r\nand responsible AI alignment (Microsoft RAI framework: Fairness,\r\nReliability, Privacy, Inclusiveness, Transparency, Accountability).\r\nProduces a governance and deployment plan. Outputs: `governance-plan.md`.\r\n\r\n---\r\n\r\n## Confirmation Prompt\r\n\r\nAsk the user:\r\n\r\n> \"The orchestration agent can include the following sections. Which would\r\n> you like to include?\"\r\n>\r\n> 1. **Implementation Plan** — phased task plan + RAID log\r\n> 2. **Business Process Mapping** — SOP with sequenced steps and parameters\r\n> 3. **Solution Architecture** — plain-language specification\r\n> 4. **Security, Testing and Governance** — safety controls + RAI alignment\r\n>\r\n> Reply with the numbers you want (e.g. \"1,2,3,4\" for all) or describe\r\n> any changes.\r\n\r\nDefault: all four optional sections (1–4) are included. Environment Discovery (section 0) always runs and is not offered as a choice.\r\n",
|
|
76
83
|
},
|
|
77
84
|
{
|
|
78
|
-
relativePath: "
|
|
79
|
-
content: "# Approach 1 — Schema-Driven Gold Star-Schema Generation\n\nGenerate Spark SQL Materialized Lake View (MLV) scripts for a **Power BI-optimised\nstar schema** by comparing silver input CSVs against target gold output CSVs. The\nagent infers every dimension, fact, surrogate key, and join pattern by diffing\nschemas and sampling data — then validates the result against star schema design\nprinciples.\n\n---\n\n## When to use\n\nThe user supplies **both**:\n\n- One or more **silver CSV** files (conformed, cleaned data)\n- One or more **gold CSV** files (desired star-schema output — dimensions and facts)\n\nThe goal is to produce a `.sql` script of `CREATE OR REPLACE MATERIALIZED LAKE VIEW`\nstatements that, when executed against the silver tables in a Fabric Lakehouse,\nreproduce the gold outputs as a star schema ready for Power BI consumption.\n\n---\n\n## Prerequisites\n\n| Item | Detail |\n|---|---|\n| Silver CSVs | Uploaded or provided in context |\n| Gold CSVs | Uploaded or provided in context |\n| SQL dialect | Spark SQL (Fabric Lakehouse MLV syntax) |\n| Output path | `/mnt/user-data/outputs/gold_layer_mlv.sql` |\n\n---\n\n## Star Schema Design Principles\n\nThese principles govern every decision in the workflow. Refer back to them during\ngeneration and validation.\n\n### Core model structure\n\nA star schema organises data into two table types connected by relationships:\n\n- **Dimension tables** (the \"one\" side): contain descriptive attributes for\n filtering, slicing, and grouping. Each dimension has a unique key column\n (one row per entity). Name using singular nouns (`Hotel`, `Manager`, `Date`).\n- **Fact tables** (the \"many\" side): contain measurable, quantitative data at a\n consistent grain plus foreign keys to every related dimension. Name using\n business-process nouns (`Revenue`, `Expenses`, `Orders`).\n\n### Key design rules\n\n1. 
**Separate dimensions from facts.** Never embed descriptive attributes in a\n fact table when they belong in a dimension. A wide, denormalised table is an\n anti-pattern — split it.\n2. **Consistent grain.** Every row in a fact table represents the same thing\n (e.g., one hotel × one month). Never mix grains in one table.\n3. **Surrogate keys.** Add integer surrogate keys (via `DENSE_RANK`) when the\n source lacks clean unique identifiers, or when the natural key is a long\n string. Surrogate keys improve join performance and enable unknown-member rows.\n4. **Date dimension.** Always create a dedicated date table. Include fiscal\n periods if relevant. In Power BI, this table will be marked as the date table.\n5. **Flatten dimensions.** Do not snowflake (normalise) dimensions. A\n `dim_hotel` should include city and country directly, not point to a separate\n `dim_city`. Extra joins hurt Power BI performance.\n6. **Unknown / unassigned member rows.** Every dimension must have a fallback row\n so that fact records with NULL foreign keys still resolve to a valid dimension\n member. This prevents blank rows in Power BI visuals.\n\n### Special dimension types to detect\n\n| Type | Description | When to use |\n|---|---|---|\n| **Role-playing** | Same dimension used multiple times via different relationships (e.g., OrderDate vs ShipDate both pointing to dim_date) | When a fact has 2+ date columns or 2+ entity references of the same type. In Power BI, handle with inactive relationships + `USERELATIONSHIP` in DAX, or by duplicating the dimension table. |\n| **Slowly Changing (Type 2)** | Tracks historical changes with StartDate/EndDate/IsCurrent columns | When a silver SCD table exists. The gold dimension holds only current/distinct attributes; the SCD join happens in the fact view via point-in-time lookup. 
|\n| **Junk** | Combines multiple low-cardinality flags/indicators into a single dimension | When a fact has several boolean or 2–3 value categorical columns (IsRush, IsGift, IsOnline). Instead of one dimension per flag, combine into one junk dimension with all combinations. |\n| **Degenerate** | Transaction identifiers (OrderNumber, InvoiceID) kept directly in the fact table, not in a dimension | When the only attribute is the ID itself and there are no descriptive columns to justify a dimension. Keep as a column in the fact. |\n\n### Anti-patterns to detect and reject\n\n| Anti-pattern | Detection signal | Resolution |\n|---|---|---|\n| **Wide denormalised table** | A single gold table has both descriptive text columns and numeric measures with no clear key/FK structure | Split into dimension + fact |\n| **Snowflaked dimensions** | A gold dimension has an FK to another gold dimension (e.g., dim_city referenced by dim_hotel) | Flatten — bring attributes directly into the parent dimension |\n| **Many-to-many without bridge** | Two dimensions related through a fact with no single FK path | Add a bridge/junction table or restructure |\n| **Mixed-grain fact** | A fact table has rows at different granularities (some monthly, some daily; or some at line level, some at header level) | Separate into distinct fact tables, one per grain |\n| **Dimension without unique key** | A gold dimension has duplicate values on the primary key column | Add DISTINCT, or investigate for a missing SCD pattern |\n| **Fact with embedded dimension attributes** | A gold fact table has descriptive text columns alongside measures | Move to a dimension; replace with FK in the fact |\n\n---\n\n## Workflow\n\nExecute every step. Do not skip.\n\n### Step 1 — Inventory and classify the files\n\n1. List all uploaded CSVs.\n2. Ask the user to confirm which files are **silver** (source) and which are **gold** (target).\n3. 
Ask the user to confirm the **silver schema name** (default `silver`) and **gold schema name** (default `gold`).\n4. Classify every gold CSV:\n\n| Classification | Detection signals |\n|---|---|\n| **Dimension** | Filename contains `dim_`; low row count; unique key column; descriptive attributes; no monetary measures |\n| **Fact** | Filename contains `fact_`; higher row count; has `DateKey`; FK columns matching dimension keys; numeric measures |\n| **Generated dimension** | Dimension whose rows don't map 1:1 from any silver file (date table, surrogate-key dimension from `SELECT DISTINCT`) |\n\n5. For each gold file, identify the silver source(s).\n6. **Anti-pattern check**: Verify every gold table is cleanly dimension or fact. Flag hybrids.\n\n### Step 2 — Profile every CSV\n\nFor each CSV (silver and gold), capture columns, dtypes, row counts, unique\ncounts, null counts, sample values, and min/max for numerics. Use\n`encoding='utf-8-sig'` to handle BOM.\n\n### Step 3 — Detect transformations by diffing silver → gold\n\n#### 3a — Dimension detection patterns\n\n| Signal in gold dimension | Transformation |\n|---|---|\n| Sequential integer column (1, 2, 3…) not in silver | **Surrogate key** — `DENSE_RANK() OVER (ORDER BY NaturalKey)` |\n| Extra row with ID = 0 or `'UNKNOWN'` | **Unknown member row** — `UNION ALL SELECT 0/'UNKNOWN', 'Unknown …'` |\n| Gold is a column subset of silver | **Attribute pruning** (keep only what Power BI needs) |\n| Fewer rows than silver, same key | **Deduplication** — `SELECT DISTINCT` |\n| `COALESCE` pattern (`UNASSIGNED` for NULLs) | **Null-safe key** |\n| Row count = distinct count + 1 | **Distinct + unknown row** |\n\n#### 3b — Generated date dimension detection\n\n1. Check if gold date range spans min-to-max of all fact `DateKey` columns.\n2. Identify fiscal year start by comparing `FiscalYear` vs `CalendarYear` at boundaries.\n3. Map derived columns to Spark SQL expressions.\n4. 
Verify display + sort column pairs exist (for Power BI Sort by Column):\n `CalendarMonthName` + `CalendarMonthNumber`, `DayOfWeekName` + `DayOfWeekNumber`,\n `FiscalPeriodLabel` + `FiscalPeriodNumber`.\n\n#### 3c — Special dimension type detection\n\n| Signal | Type | Action |\n|---|---|---|\n| Fact has 2+ date FK columns | **Role-playing** | Generate one dim_date; add comment for Power BI inactive relationships |\n| Silver has StartDate/EndDate/IsCurrent; gold dim has only distinct attributes | **SCD** | Dim holds distinct entities; SCD join in facts |\n| Fact has 3+ boolean columns (0/1) | **Junk dimension candidate** | Flag for user — combine or keep separate? |\n| Fact retains a text ID column with no matching dimension | **Degenerate** | Keep in fact |\n\n#### 3d — Fact detection patterns\n\n| Signal | Transformation |\n|---|---|\n| `DateKey` = `date_trunc('month', silver.MonthStart)` | Date key alignment |\n| `'UNKNOWN'` / `'UNASSIGNED'` where silver has NULLs | COALESCE FK |\n| Integer FK not in silver source | Surrogate key lookup join |\n| ManagerID via SCD but not in silver fact | SCD point-in-time join |\n| Measures with currency suffix, DECIMAL(18,2) | Rename + round |\n| Row count matches silver → pass-through; less → GROUP BY | Grain check |\n\n#### 3e — Cross-view dependencies and anti-pattern scan\n\n1. Map all table dependencies (topological sort for MLV creation order).\n2. Verify:\n - Dimensions have unique PKs.\n - Facts contain only keys + measures (no embedded descriptive text).\n - No mixed grain within any single fact.\n - No snowflaked dimensions.\n3. Alert the user on any failure before generating SQL.\n\n### Step 4 — Detect fiscal year logic\n\n1. Sample boundary rows. Identify fiscal year start month.\n2. Derive: `FiscalPeriodNumber = ((MonthNumber + (12 - StartMonth)) % 12) + 1`.\n3. Confirm with user.\n\n### Step 5 — Generate the SQL script\n\n**File structure (dependency order):**\n1. `CREATE SCHEMA IF NOT EXISTS gold;`\n2. 
Comment header (fiscal year, currency, grain, Power BI target)\n3. Generated dimensions (date table)\n4. Pass-through dimensions (flattened — no snowflaking)\n5. SCD-sourced dimensions\n6. Surrogate-key dimensions\n7. Fact views\n\n**Dimension templates:**\n\n*Pass-through with unknown row (flattened):*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_entity AS\nSELECT EntityID, EntityName, City, Country\nFROM silver.entity_table\nUNION ALL\nSELECT 'UNKNOWN', 'Unknown Entity', 'Unknown', 'Unknown';\n```\n\n*SCD-sourced with unassigned row:*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_role AS\nSELECT DISTINCT\n COALESCE(RoleID, 'UNASSIGNED') AS RoleID,\n COALESCE(RoleName, 'Unassigned Role') AS RoleName\nFROM silver.role_scd\nUNION\nSELECT 'UNASSIGNED', 'Unassigned Role';\n```\n\n*Surrogate-key with unknown row:*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_category AS\nWITH categories AS (SELECT DISTINCT CategoryName FROM silver.source_table)\nSELECT DENSE_RANK() OVER (ORDER BY CategoryName) AS CategoryID, CategoryName\nFROM categories\nUNION ALL\nSELECT 0, 'Unknown Category';\n```\n\n*Generated date dimension (with fiscal logic):*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_date AS\nWITH fact_months AS (\n SELECT date_trunc('month', DateCol) AS MonthStart FROM silver.fact1\n UNION ALL SELECT date_trunc('month', DateCol) FROM silver.fact2\n),\nbounds AS (\n SELECT min(MonthStart) AS MinDate,\n date_sub(add_months(max(MonthStart), 1), 1) AS MaxDate\n FROM fact_months\n),\ncalendar AS (\n SELECT explode(sequence(b.MinDate, b.MaxDate, interval 1 day)) AS CalendarDate\n FROM bounds b\n)\nSELECT\n c.CalendarDate AS DateKey,\n year(c.CalendarDate) AS CalendarYear,\n month(c.CalendarDate) AS CalendarMonthNumber, -- sort column\n date_format(c.CalendarDate, 'MMMM') AS CalendarMonthName, -- display column\n -- Fiscal (parameterised by start month)\n CASE WHEN month(c.CalendarDate) >= ${FISCAL_START_MONTH}\n THEN 
year(c.CalendarDate)\n ELSE year(c.CalendarDate) - 1 END AS FiscalYear,\n (((month(c.CalendarDate) + (12 - ${FISCAL_START_MONTH})) % 12) + 1) AS FiscalPeriodNumber,\n ...\nFROM calendar c;\n```\n\n**Fact templates:**\n\n*Simple fact (keys + measures only):*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.fact_measure AS\nSELECT\n date_trunc('month', s.MonthStart) AS DateKey,\n COALESCE(s.EntityID, 'UNKNOWN') AS EntityID,\n COALESCE(s.RoleID, 'UNASSIGNED') AS RoleID,\n CAST(ROUND(s.Amount, 2) AS DECIMAL(18,2)) AS AmountGBP\nFROM silver.source_table s;\n```\n\n*With SCD join:*\n```sql\nLEFT JOIN silver.role_scd ma\n ON s.EntityID = ma.EntityID\n AND s.MonthStart BETWEEN ma.StartDate AND ma.EndDate\n```\n\n*With surrogate key lookup:*\n```sql\nWITH category_lookup AS (\n SELECT CategoryName, DENSE_RANK() OVER (ORDER BY CategoryName) AS CategoryID\n FROM (SELECT DISTINCT CategoryName FROM silver.source_table) c\n)\nLEFT JOIN category_lookup cl ON lower(cl.CategoryName) = lower(s.CategoryName)\n```\n\n**Naming conventions:**\n- Dimensions: `gold.dim_<entity>` — singular noun, snake_case\n- Facts: `gold.fact_<process>` — process noun, snake_case\n- Surrogate keys: `<Entity>ID` (PascalCase, INT)\n- Natural keys: unchanged from silver\n- DateKey: DATE type\n- Measures: `<Metric><Currency>` (e.g., `RevenueAmountGBP`)\n- Unknown: `0` (int surrogates), `'UNKNOWN'` (string keys)\n- Unassigned: `'UNASSIGNED'` (role-based)\n\n**Spark SQL reminders:**\n`DENSE_RANK()`, `COALESCE()`, `UNION ALL`, `date_trunc('month')`,\n`sequence() + explode()`, `date_format('MMMM'/'EEEE')`, `dayofweek()` (1=Sun, 7=Sat),\n`lpad()`, `last_day()`, `CAST(ROUND(x,2) AS DECIMAL(18,2))`, `CAST(x AS BIGINT)`.\n\n### Step 6 — Validate against star schema checklist\n\n**Data validation (Python):** Compare generated logic against gold target CSVs\n(column names, row counts, surrogate ranges, measure tolerance ±0.5%).\n\n**Star schema structural checklist:**\n\n- [ ] Every gold table is clearly a 
dimension or a fact — no hybrids\n- [ ] Every fact has FKs to all related dimensions\n- [ ] Every dimension has a unique primary key (no duplicate rows on the key)\n- [ ] A date dimension exists and spans the full fact date range\n- [ ] Date dimension has display + sort column pairs for Power BI (e.g., MonthName + MonthNumber)\n- [ ] Every dimension has an unknown/unassigned member row\n- [ ] No snowflaking — no dimension references another dimension via FK\n- [ ] No fact embeds descriptive attributes that belong in a dimension\n- [ ] Consistent grain within each fact table\n- [ ] No circular relationship paths\n- [ ] Consistent naming: `dim_` for dimensions, `fact_` for facts\n- [ ] Surrogate key DENSE_RANK ORDER BY is identical in dimension views and fact CTEs\n- [ ] Role-playing dimensions documented (for Power BI inactive relationships)\n- [ ] Degenerate dimensions (transaction IDs) remain in facts, not split needlessly\n- [ ] Junk dimension candidates flagged if 3+ boolean columns in a single fact\n\nReport failures as warnings before writing the file.\n\n### Step 7 — Write the final file\n\n1. Write to `/mnt/user-data/outputs/gold_layer_mlv.sql`.\n2. Present the file.\n3. Summarise: dimension count, fact count, surrogate key strategy, fiscal year, warnings.\n4. Note any role-playing dimensions and Power BI `USERELATIONSHIP` requirements.\n\n---\n\n## Gotchas\n\n- **Surrogate key determinism**: `DENSE_RANK(ORDER BY col)` needs a tiebreaker if ties are possible.\n- **UNION ALL vs UNION**: `UNION ALL` for unknown rows (key is distinct). `UNION` for SCD dims where COALESCE may duplicate the unassigned row.\n- **SCD fan-out**: Overlapping SCD date ranges duplicate fact rows. 
Validate non-overlap.\n- **Surrogate key consistency**: DENSE_RANK ORDER BY must match between dim view and fact CTE.\n- **COALESCE placement**: Apply in final SELECT, never in JOIN ON clause.\n- **Date dimension bounds**: Earliest fact date to last day of latest fact month.\n- **Fiscal year formula**: Test at January and at the fiscal start month for off-by-one.\n- **dayofweek()**: Spark returns 1=Sunday, 7=Saturday. Weekend = `IN (1, 7)`.\n- **Power BI sort columns**: Pair every display name column with a numeric sort column.\n- **No snowflaking**: Flatten all attributes into the dimension. No dim-to-dim FKs.\n- **Degenerate dimensions**: Keep transaction IDs in the fact. Do not create pointless one-column dimensions.\n- **Junk dimensions**: Suggest combining 3+ boolean flags into a single junk dimension.\n- **BOM characters**: `encoding='utf-8-sig'` when reading CSVs.\n",
|
|
85
|
+
relativePath: "scripts/scaffold_output.py",
|
|
86
|
+
content: "#!/usr/bin/env python3\r\n# /// script\r\n# requires-python = \">=3.8\"\r\n# dependencies = []\r\n# ///\r\n\"\"\"\r\nScaffold the output folder structure for an orchestration agent run.\r\n\r\nUsage:\r\n python scripts/scaffold_output.py --process-name <NAME> --username <USER> [OPTIONS]\r\n\r\nOptions:\r\n --process-name NAME Short process name (lowercase, hyphens; used in folder name)\r\n --username USER Username for folder naming\r\n --output-dir PATH Parent directory for outputs (default: ./outputs)\r\n --sections SECTIONS Comma-separated section keys to include.\r\n Valid values: impl-plan, biz-process, architecture, governance\r\n Default: all four\r\n --help Show this message and exit\r\n\r\nExamples:\r\n python scripts/scaffold_output.py --process-name fabric-lakehouse --username rishi\r\n python scripts/scaffold_output.py --process-name my-process --username alice --sections impl-plan,biz-process\r\n\"\"\"\r\n\r\nimport argparse\r\nimport json\r\nimport sys\r\nfrom datetime import datetime\r\nfrom pathlib import Path\r\n\r\nSECTION_MAP = {\r\n \"impl-plan\": (\"01-implementation-plan\", \"Implementation Plan\"),\r\n \"biz-process\": (\"02-business-process\", \"Business Process Mapping\"),\r\n \"architecture\": (\"03-solution-architecture\",\"Solution Architecture\"),\r\n \"governance\": (\"04-governance\", \"Security, Testing and Governance\"),\r\n}\r\n\r\nALL_SECTIONS = list(SECTION_MAP.keys())\r\n\r\n\r\ndef main():\r\n parser = argparse.ArgumentParser(\r\n description=\"Scaffold the output folder structure for an orchestration agent run.\",\r\n formatter_class=argparse.RawDescriptionHelpFormatter,\r\n epilog=__doc__,\r\n )\r\n parser.add_argument(\"--process-name\", required=True,\r\n help=\"Short process name (lowercase, hyphens)\")\r\n parser.add_argument(\"--username\", required=True,\r\n help=\"Username for folder naming\")\r\n parser.add_argument(\"--output-dir\", type=Path, default=Path(\"./outputs\"),\r\n help=\"Parent directory for 
outputs (default: ./outputs)\")\r\n parser.add_argument(\"--sections\", default=\",\".join(ALL_SECTIONS),\r\n help=\"Comma-separated section keys to include\")\r\n args = parser.parse_args()\r\n\r\n # Validate process name\r\n import re\r\n if not re.match(r\"^[a-z0-9][a-z0-9-]*[a-z0-9]$\", args.process_name):\r\n print(json.dumps({\r\n \"status\": \"error\",\r\n \"message\": \"process-name must be lowercase letters, numbers and hyphens only.\"\r\n }))\r\n sys.exit(1)\r\n\r\n sections = [s.strip() for s in args.sections.split(\",\") if s.strip()]\r\n invalid = [s for s in sections if s not in SECTION_MAP]\r\n if invalid:\r\n print(json.dumps({\r\n \"status\": \"error\",\r\n \"message\": f\"Unknown section(s): {invalid}. Valid: {ALL_SECTIONS}\"\r\n }))\r\n sys.exit(1)\r\n\r\n timestamp = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\r\n folder_name = f\"{args.process_name}_{timestamp}_{args.username}\"\r\n root = args.output_dir / folder_name\r\n root.mkdir(parents=True, exist_ok=True)\r\n\r\n created_subfolders = []\r\n for key in sections:\r\n dirname, label = SECTION_MAP[key]\r\n subfolder = root / dirname\r\n subfolder.mkdir(exist_ok=True)\r\n created_subfolders.append({\"key\": key, \"label\": label, \"path\": str(subfolder)})\r\n\r\n # Initialise the audit trail\r\n change_log = root / \"CHANGE_LOG.md\"\r\n change_log.write_text(\r\n f\"# Change Log: {args.process_name}\\n\\n\"\r\n f\"Created: {datetime.now().isoformat(timespec='seconds')} by {args.username}\\n\\n\"\r\n \"---\\n\\n\"\r\n \"_Entries are appended here by each sub-agent as the orchestration runs._\\n\",\r\n encoding=\"utf-8\",\r\n )\r\n\r\n output = {\r\n \"status\": \"ok\",\r\n \"root\": str(root),\r\n \"agent_md_path\": str(root / \"agent.md\"),\r\n \"change_log_path\": str(change_log),\r\n \"subfolders\": created_subfolders\r\n }\r\n\r\n print(json.dumps(output, indent=2))\r\n\r\n\r\nif __name__ == \"__main__\":\r\n main()\r\n",
|
|
80
87
|
},
|
|
81
88
|
],
|
|
82
89
|
},
|
|
83
90
|
{
|
|
84
|
-
name: "create-
|
|
91
|
+
name: "create-lakehouse-schemas-and-shortcuts",
|
|
92
|
+
category: "fabric",
|
|
85
93
|
files: [
|
|
86
94
|
{
|
|
87
|
-
relativePath: ".
|
|
88
|
-
content: "
|
|
95
|
+
relativePath: "SKILL.md",
|
|
96
|
+
content: "---\r\nname: create-lakehouse-schemas-and-shortcuts\r\ndescription: >\r\n Use this skill to create schemas in schema-enabled Microsoft Fabric lakehouses\r\n and create cross-lakehouse table shortcuts using the Fabric CLI. Triggers on:\r\n \"create lakehouse shortcuts\", \"create schema in lakehouse\", \"shortcut tables\r\n between lakehouses\", \"cross-lakehouse shortcuts\", \"surface bronze tables in\r\n silver\". Does NOT trigger for: creating lakehouses (use create-fabric-lakehouse),\r\n uploading files, creating delta tables from CSV/PDF, or generating MLV scripts.\r\nlicense: MIT\r\ncompatibility: Python 3.8+ for scripts/. Fabric CLI (fab) installed and authenticated.\r\n---\r\n\r\n# Create Lakehouse Schemas and Shortcuts\r\n\r\nCreates schemas in schema-enabled Fabric lakehouses and creates cross-lakehouse\r\ntable shortcuts using `fab ln --type oneLake`. Schemas and shortcuts are\r\ncreated in the same run. Source and target lakehouses must already exist.\r\n\r\n> **GOVERNANCE**: This skill generates commands — it does not execute them.\r\n> All `fab` commands are presented for the operator to review and run.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `--source-workspace` | Source Fabric workspace name (exact, case-sensitive) | `\"LANDON_TEST_20260402_HUB\"` |\r\n| `--source-lakehouse` | Source lakehouse name (exact, case-sensitive) | `\"LANDON_FINANCE_BRONZE\"` |\r\n| `--source-schema` | Schema in source lakehouse. 
Use `dbo` for non-schema-enabled | `\"dbo\"` |\r\n| `--target-workspace` | Target Fabric workspace name (exact, case-sensitive) | `\"LANDON_TEST_20260402_FINANCE_SPOKE\"` |\r\n| `--target-lakehouse` | Target lakehouse name (exact, case-sensitive) | `\"LANDON_FINANCE_SILVER\"` |\r\n| `--target-schema` | Schema to create in target and place shortcuts into | `\"bronze\"` |\r\n| `--tables` | Comma-separated table names, or output of `fab ls` | `\"bookings,events\"` |\r\n\r\n## Workflow\r\n\r\n- [ ] **Step 1 — Collect parameters**: Ask the user for all inputs listed above.\r\n If source and target are in the same workspace, both workspace parameters will\r\n be the same value.\r\n\r\n- [ ] **Step 2 — Discover tables**: Ask the user to either:\r\n - Provide an explicit comma-separated list of table names, **or**\r\n - Run this command and share the output:\r\n ```\r\n fab ls \"<SOURCE_WORKSPACE>.Workspace/<SOURCE_LAKEHOUSE>.Lakehouse/Tables/\" -l\r\n ```\r\n Parse table names from the output or list. 
Present them back and confirm.\r\n\r\n- [ ] **Step 3 — Generate commands**: Run the script:\r\n ```\r\n python scripts/generate_schema_shortcut_commands.py \\\r\n --source-workspace \"<SOURCE_WORKSPACE>\" \\\r\n --source-lakehouse \"<SOURCE_LAKEHOUSE>\" \\\r\n --source-schema \"<SOURCE_SCHEMA>\" \\\r\n --target-workspace \"<TARGET_WORKSPACE>\" \\\r\n --target-lakehouse \"<TARGET_LAKEHOUSE>\" \\\r\n --target-schema \"<TARGET_SCHEMA>\" \\\r\n --tables \"<TABLE1>,<TABLE2>,...\"\r\n ```\r\n The script outputs JSON to stdout with sections: `schema_sql`,\r\n `schema_shortcut_test`, `shortcut_commands`, and `validation_command`.\r\n\r\n- [ ] **Step 4 — (Optional) Test schema-level shortcut**: Before creating\r\n individual table shortcuts, optionally test whether a single schema-level\r\n shortcut captures all tables (see \"Schema-Level Shortcut Hypothesis\" below).\r\n Use the `schema_shortcut_test` command from the script output.\r\n If the test succeeds and all tables appear, skip Step 5.\r\n\r\n- [ ] **Step 5 — Choose deployment approach**: Present these options:\r\n\r\n **Option A — Notebook Cells (Recommended for pipeline integration)**\r\n Append two cells to an existing notebook attached to the target lakehouse:\r\n 1. **Spark SQL cell**: Contains `CREATE SCHEMA IF NOT EXISTS <schema>;`\r\n from the `schema_sql` output.\r\n 2. **Code cell**: Contains each command from `shortcut_commands` prefixed\r\n with `!` (one per line).\r\n If no existing notebook is available, create a new one and note that it\r\n will need its own Spark session and `fab` authentication.\r\n\r\n **Option B — PowerShell Script**\r\n Write the `fab ln` commands from `shortcut_commands` to a `.ps1` file.\r\n Add a comment at the top reminding the user to create the schema first\r\n via a Spark SQL notebook cell (`fab` CLI cannot create schemas).\r\n\r\n **Option C — Interactive Terminal**\r\n Present each command one at a time for the operator to run. 
Start with the\r\n schema creation SQL (must run in a notebook), then present `fab ln` commands.\r\n\r\n- [ ] **Step 6 — Validate**: Ask the user to run:\r\n ```\r\n fab ls \"<TARGET_WORKSPACE>.Workspace/<TARGET_LAKEHOUSE>.Lakehouse/Tables/\" -l\r\n ```\r\n Confirm the expected shortcuts appear under the target schema.\r\n\r\n## Schema-Level Shortcut Hypothesis\r\n\r\nWhen creating shortcuts through the Fabric **UI**, connecting to a schema\r\nautomatically surfaces all tables in that schema as shortcuts. It is unknown\r\nwhether this works programmatically via `fab ln`. To test, use the\r\n`schema_shortcut_test` command from the script output, e.g.:\r\n\r\n```\r\nfab ln \"<TARGET_WS>.Workspace/<TARGET_LH>.Lakehouse/Tables/<TARGET_SCHEMA>/Shortcut\" \\\r\n --type oneLake \\\r\n --target ../../<SOURCE_WS_URL>.Workspace/<SOURCE_LH>.Lakehouse/Tables -f\r\n```\r\n\r\nIf this succeeds and all source tables appear in the target schema, use this\r\none-command approach instead of individual table shortcuts. Document the result\r\nfor future runs.\r\n\r\nIf the source is non-schema-enabled, test with `Tables` as the target path\r\n(no schema segment). If schema-enabled, use `Tables/<source_schema>`.\r\n\r\n## fab ln Syntax Reference\r\n\r\n### Shortcut naming convention (FIXED)\r\n\r\nShortcuts in schema-enabled lakehouses use **slash notation** for the schema path:\r\n```\r\nTables/<Schema>/<table_name>.Shortcut\r\n```\r\nExample: `Tables/Bronze/revenue_raw.Shortcut`\r\n\r\n**Periods (`.`) are FORBIDDEN in shortcut names.** Dot notation like\r\n`Tables/bronze.revenue_raw.Shortcut` will fail with:\r\n`[InvalidPath] Invalid shortcut name. 
The name should not include any of the following characters: [\"\\:|<>*?.%+]`\r\n\r\n### Cross-lakehouse: non-schema source → schema-enabled target\r\n\r\n```\r\nfab ln \"<TARGET_WS>.Workspace/<TARGET_LH>.Lakehouse/Tables/<TARGET_SCHEMA>/<TABLE>.Shortcut\" \\\r\n --type oneLake \\\r\n --target ../../<SOURCE_WS_URL>.Workspace/<SOURCE_LH>.Lakehouse/Tables/<TABLE> -f\r\n```\r\n\r\n### Cross-lakehouse: schema-enabled source → schema-enabled target\r\n\r\n```\r\nfab ln \"<TARGET_WS>.Workspace/<TARGET_LH>.Lakehouse/Tables/<TARGET_SCHEMA>/<TABLE>.Shortcut\" \\\r\n --type oneLake \\\r\n --target ../../<SOURCE_WS_URL>.Workspace/<SOURCE_LH>.Lakehouse/Tables/<SOURCE_SCHEMA>/<TABLE> -f\r\n```\r\n\r\n### Key rules\r\n\r\n- **Type**: Always `--type oneLake` for cross-lakehouse table shortcuts.\r\n Valid `fab ln` types are: `adlsGen2`, `amazonS3`, `dataverse`, `googleCloudStorage`,\r\n `oneLake`, `s3Compatible`. There is no `lakehouseTable` type.\r\n- **Slash notation**: Shortcut path uses `Tables/<Schema>/<table>.Shortcut` (slash, NOT dot)\r\n- **Periods forbidden**: `.` is not allowed in shortcut names — will error with `[InvalidPath]`\r\n- **`-f` flag**: Always include `-f` to skip the \"Are you sure?\" confirmation prompt\r\n (terminals that don't support CPR will hang without it)\r\n- **Source path**: Schema-enabled sources use `Tables/<schema>/<table>` (slash).\r\n Non-schema sources use `Tables/<table>` (no schema segment)\r\n- **URL encoding**: Workspace names with spaces use `%20` in the `--target` path\r\n- **`../../` prefix**: Required for cross-workspace targets to navigate to OneLake root\r\n- **Display names**: Shortcut destination path uses plain workspace/lakehouse names\r\n (no URL encoding); only the `--target` path is URL-encoded\r\n\r\n## Gotchas\r\n\r\n- **Slash NOT dot in shortcut paths**: The shortcut destination uses slash notation\r\n (`Tables/Bronze/revenue_raw.Shortcut`), NOT dot notation. 
Periods (`.`) are\r\n **forbidden** in shortcut names and will cause `[InvalidPath]` errors.\r\n- **Always use `-f` flag**: Without `-f`, `fab ln` prompts \"Are you sure? (Y/n)\".\r\n Terminals that don't support cursor position requests (CPR) will hang. Always\r\n append `-f` to force creation without confirmation.\r\n- **`--type oneLake` not `--type lakehouseTable`**: Cross-lakehouse table shortcuts\r\n require `--type oneLake`. The type `lakehouseTable` does not exist in the `fab ln`\r\n CLI. Valid types are: `adlsGen2`, `amazonS3`, `dataverse`, `googleCloudStorage`,\r\n `oneLake`, `s3Compatible`.\r\n- **Schema creation requires Spark SQL**: The `fab` CLI cannot create schemas.\r\n Schemas must be created via `CREATE SCHEMA IF NOT EXISTS <name>` in a Spark SQL\r\n cell in a notebook attached to the target lakehouse.\r\n- **Schema names are case-sensitive** in Fabric. Use exact casing consistently.\r\n- **Viewer access required**: Cross-workspace shortcuts require at least Viewer\r\n access on the source workspace.\r\n- **Existing shortcuts fail**: If a shortcut with the same name already exists,\r\n `fab ln` will error. Skip or delete existing ones before rerunning.\r\n- **Same-workspace shortcuts**: When source and target are in the same workspace,\r\n the `../../` prefix and URL encoding still apply in the `--target` path.\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/generate_schema_shortcut_commands.py`** — Generates structured JSON\r\n containing schema SQL, `fab ln` shortcut commands, a schema-level shortcut test\r\n command, and a validation command.\r\n Run: `python scripts/generate_schema_shortcut_commands.py --help`\r\n",
|
|
89
97
|
},
|
|
90
98
|
{
|
|
91
|
-
relativePath: "
|
|
92
|
-
content: "#
|
|
99
|
+
relativePath: "scripts/generate_schema_shortcut_commands.py",
|
|
100
|
+
content: "# /// script\r\n# requires-python = \">=3.8\"\r\n# dependencies = []\r\n# ///\r\n\"\"\"\r\nGenerate structured JSON containing schema creation SQL, fab ln shortcut\r\ncommands, a schema-level shortcut test command, and a validation command\r\nfor cross-lakehouse Fabric shortcuts.\r\n\r\nOutputs JSON to stdout; diagnostics to stderr.\r\n\r\nUsage:\r\n python scripts/generate_schema_shortcut_commands.py \\\r\n --source-workspace \"LANDON_TEST_20260402_HUB\" \\\r\n --source-lakehouse \"LANDON_FINANCE_BRONZE\" \\\r\n --source-schema \"dbo\" \\\r\n --target-workspace \"LANDON_TEST_20260402_FINANCE_SPOKE\" \\\r\n --target-lakehouse \"LANDON_FINANCE_SILVER\" \\\r\n --target-schema \"bronze\" \\\r\n --tables \"bookings,booking_details,events\"\r\n\"\"\"\r\nimport argparse\r\nimport json\r\nimport sys\r\nfrom urllib.parse import quote\r\n\r\n\r\ndef url_encode(name: str) -> str:\r\n \"\"\"URL-encode a path segment (spaces -> %20).\"\"\"\r\n return quote(name, safe=\".\")\r\n\r\n\r\ndef main():\r\n parser = argparse.ArgumentParser(\r\n description=(\r\n \"Generate schema SQL and fab ln shortcut commands for cross-lakehouse \"\r\n \"Fabric shortcuts. 
Outputs structured JSON to stdout.\"\r\n ),\r\n formatter_class=argparse.RawDescriptionHelpFormatter,\r\n epilog=(\r\n \"Example:\\n\"\r\n ' python scripts/generate_schema_shortcut_commands.py \\\\\\n'\r\n ' --source-workspace \"LANDON_TEST_20260402_HUB\" \\\\\\n'\r\n ' --source-lakehouse \"LANDON_FINANCE_BRONZE\" \\\\\\n'\r\n ' --source-schema \"dbo\" \\\\\\n'\r\n ' --target-workspace \"LANDON_TEST_20260402_FINANCE_SPOKE\" \\\\\\n'\r\n ' --target-lakehouse \"LANDON_FINANCE_SILVER\" \\\\\\n'\r\n ' --target-schema \"bronze\" \\\\\\n'\r\n ' --tables \"bookings,booking_details\"\\n'\r\n ),\r\n )\r\n parser.add_argument(\r\n \"--source-workspace\", required=True,\r\n help=\"Source Fabric workspace name (exact, case-sensitive).\",\r\n )\r\n parser.add_argument(\r\n \"--source-lakehouse\", required=True,\r\n help=\"Source lakehouse name (exact, case-sensitive).\",\r\n )\r\n parser.add_argument(\r\n \"--source-schema\", default=\"dbo\",\r\n help='Source schema. Use \"dbo\" for non-schema-enabled lakehouses (default: dbo).',\r\n )\r\n parser.add_argument(\r\n \"--target-workspace\", required=True,\r\n help=\"Target Fabric workspace name (exact, case-sensitive).\",\r\n )\r\n parser.add_argument(\r\n \"--target-lakehouse\", required=True,\r\n help=\"Target lakehouse name (exact, case-sensitive).\",\r\n )\r\n parser.add_argument(\r\n \"--target-schema\", required=True,\r\n help=\"Target schema name where shortcuts will be placed.\",\r\n )\r\n parser.add_argument(\r\n \"--tables\", required=True,\r\n help=\"Comma-separated list of table names to create shortcuts for.\",\r\n )\r\n args = parser.parse_args()\r\n\r\n tables = [t.strip() for t in args.tables.split(\",\") if t.strip()]\r\n if not tables:\r\n print(\"ERROR: No table names provided.\", file=sys.stderr)\r\n sys.exit(1)\r\n\r\n source_ws = args.source_workspace\r\n source_lh = args.source_lakehouse\r\n source_schema = args.source_schema\r\n target_ws = args.target_workspace\r\n target_lh = args.target_lakehouse\r\n 
target_schema = args.target_schema\r\n\r\n source_ws_url = url_encode(source_ws)\r\n\r\n # Build source table path — omit schema segment for non-schema-enabled (dbo)\r\n if source_schema.lower() == \"dbo\":\r\n source_table_fmt = (\r\n f\"../../{source_ws_url}.Workspace/\"\r\n f\"{source_lh}.Lakehouse/Tables/{{table}}\"\r\n )\r\n else:\r\n source_table_fmt = (\r\n f\"../../{source_ws_url}.Workspace/\"\r\n f\"{source_lh}.Lakehouse/Tables/{source_schema}/{{table}}\"\r\n )\r\n\r\n # Schema SQL\r\n schema_sql = [f\"CREATE SCHEMA IF NOT EXISTS {target_schema};\"]\r\n\r\n # Shortcut commands — slash notation for schema-enabled destination, oneLake type\r\n # Path: Tables/<Schema>/<table>.Shortcut (dot notation is INVALID — periods\r\n # are forbidden in shortcut names)\r\n shortcut_commands = []\r\n for table in tables:\r\n dest = (\r\n f\"{target_ws}.Workspace/\"\r\n f\"{target_lh}.Lakehouse/\"\r\n f\"Tables/{target_schema}/{table}.Shortcut\"\r\n )\r\n target_path = source_table_fmt.format(table=table)\r\n cmd = f'fab ln \"{dest}\" --type oneLake --target {target_path} -f'\r\n shortcut_commands.append(cmd)\r\n\r\n # Schema-level shortcut test command (hypothesis: one shortcut gets all tables)\r\n if source_schema.lower() == \"dbo\":\r\n schema_target_path = (\r\n f\"../../{source_ws_url}.Workspace/\"\r\n f\"{source_lh}.Lakehouse/Tables\"\r\n )\r\n else:\r\n schema_target_path = (\r\n f\"../../{source_ws_url}.Workspace/\"\r\n f\"{source_lh}.Lakehouse/Tables/{source_schema}\"\r\n )\r\n schema_shortcut_test = (\r\n f'fab ln \"{target_ws}.Workspace/{target_lh}.Lakehouse/'\r\n f'Tables/{target_schema}/Shortcut\" '\r\n f'--type oneLake --target {schema_target_path} -f'\r\n )\r\n\r\n # Validation command\r\n validation_command = (\r\n f'fab ls \"{target_ws}.Workspace/'\r\n f'{target_lh}.Lakehouse/Tables/\" -l'\r\n )\r\n\r\n result = {\r\n \"source\": {\r\n \"workspace\": source_ws,\r\n \"lakehouse\": source_lh,\r\n \"schema\": source_schema,\r\n },\r\n \"target\": {\r\n 
\"workspace\": target_ws,\r\n \"lakehouse\": target_lh,\r\n \"schema\": target_schema,\r\n },\r\n \"tables\": tables,\r\n \"schema_sql\": schema_sql,\r\n \"schema_shortcut_test\": schema_shortcut_test,\r\n \"shortcut_commands\": shortcut_commands,\r\n \"validation_command\": validation_command,\r\n }\r\n\r\n json.dump(result, sys.stdout, indent=2)\r\n print(file=sys.stdout) # trailing newline\r\n\r\n print(f\"{len(tables)} shortcut command(s) generated.\", file=sys.stderr)\r\n print(f\"Schema SQL: {schema_sql[0]}\", file=sys.stderr)\r\n\r\n\r\nif __name__ == \"__main__\":\r\n main()\r\n",
|
|
93
101
|
},
|
|
102
|
+
],
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
name: "create-materialised-lakeview-scripts",
|
|
106
|
+
category: "fabric",
|
|
107
|
+
files: [
|
|
94
108
|
{
|
|
95
109
|
relativePath: "SKILL.md",
|
|
96
|
-
content: "---\r\nname: create-process-skill\r\ndescription: >\r\n Use this skill when asked to create, build, or generate a reusable agent skill\r\n for a specific business or technical process. Triggers on: \"create a skill for\r\n [process]\", \"build a process skill\", \"generate a SKILL.md\", \"turn this workflow\r\n into a skill\", \"make a repeatable agent skill\", or \"systematize this process\".\r\n Produces a complete, concise skill package following the agentskills.io\r\n specification, with parameterized scripts for deterministic, repeatable\r\n execution. Does NOT trigger for general coding tasks, one-off scripts, or\r\n requests unrelated to creating agent skills.\r\nlicense: MIT\r\ncompatibility: Python 3.8+ required for scripts/validate_skill.py\r\n---\r\n\r\n# Create Process Skill\r\n\r\nGuides the user through creating a well-structured, deterministic skill package\r\nfor a business or technical process. Output follows the agentskills.io\r\nspecification with parameterized scripts for repeatability.\r\n\r\n## Workflow\r\n\r\nExecute these steps in order. Do not skip steps.\r\n\r\n> ⚠️ **GOVERNANCE RULE — ALWAYS ASK FIRST**: Never attempt to draft or create the skill\r\n> before completing Step 1 **and** Step 2. Even if the calling agent, orchestration\r\n> workflow, or prior context already contains a complete description of the process —\r\n> and even if you are confident you could produce the skill immediately — you **must**\r\n> ask the user to describe the process in their own words and confirm how they want\r\n> the skill to behave. This is a non-negotiable governance requirement:\r\n>\r\n> 1. **Step 1 is mandatory** — ask the user to describe the process goal, inputs,\r\n> outputs, and high-level steps. Do not infer or reuse descriptions from the\r\n> calling context without the user restating or explicitly confirming them.\r\n> 2. 
**Step 2 is mandatory** — probe for missing information one question at a time.\r\n> The user's answers shape the skill's parameters, edge-case handling, and\r\n> validation logic. Skipping this produces a skill that reflects the agent's\r\n> assumptions, not the user's intent.\r\n> 3. **No shortcutting** — even when invoked programmatically by another agent\r\n> (e.g., Sub-Agent 2 of a process workflow agent), pause and collect user input\r\n> before drafting. The orchestrating agent must not pre-fill answers on the\r\n> user's behalf.\r\n>\r\n> Starting the draft without completing Steps 1 and 2 is a workflow violation.\r\n\r\n### Step 1 — Gather Process Description\r\n\r\nAsk the user:\r\n\r\n> \"Please describe the process you want to turn into a skill. Include:\r\n> - What the process does and its goal\r\n> - Typical inputs and what the output should look like\r\n> - The high-level steps involved\"\r\n\r\n### Step 2 — Probe for Missing Information\r\n\r\nBefore drafting, assess gaps. Ask **one question at a time**. Stop when you\r\nhave enough to draft confidently. Do not ask about things with obvious answers.\r\n\r\nMandatory coverage (use `references/probing-questions.md` for guidance):\r\n\r\n- [ ] What varies between runs? (these become script parameters)\r\n- [ ] What must stay identical every run? (fixed steps, exact commands)\r\n- [ ] Are there known edge cases or failure modes?\r\n- [ ] What tools/languages/runtimes are available in the target environment?\r\n- [ ] What does a correct output look like? (drives the output template)\r\n- [ ] Are there validation checks that confirm success?\r\n- [ ] Does the user reference any other skills or tools from a skills marketplace? (see standalone rule in Step 3)\r\n\r\n### Step 3 — Draft the Skill Package\r\n\r\nUse `assets/output-template.md` as the structural template for the generated\r\n`SKILL.md`. 
Write all files to `<skill-name>/` in the current directory.\r\n\r\n**Directory structure to produce:**\r\n\r\n```\r\n<skill-name>/\r\n├── SKILL.md # Core instructions (≤500 lines / ≤5,000 tokens)\r\n├── scripts/ # Parameterized scripts (if logic repeats across runs)\r\n├── assets/ # Output templates used by the skill\r\n└── references/ # Supporting docs loaded on demand\r\n```\r\n\r\n**SKILL.md authoring rules:**\r\n\r\n- Use imperative phrasing: \"Run…\", \"Check…\", \"If X, do Y\"\r\n- Parameterize all variable inputs as `--flag <VALUE>` or `$PARAM`\r\n- Include a checklist for multi-step workflows (agents track progress)\r\n- Include a validation loop: do work → validate → fix → re-validate\r\n- Write only what the agent wouldn't know on its own (no generic advice)\r\n- Include a Gotchas section for non-obvious facts, edge cases, naming quirks\r\n- Include an output format template inline (or in `assets/` if >30 lines)\r\n- Move content >50 lines to `references/` or `assets/`; tell the agent when to load each file\r\n\r\n**Standalone rule — the generated skill must be fully self-contained:**\r\n\r\nThe end user running the generated process skill will NOT have access to the\r\nsame skills marketplace or directory that was used during its creation. Therefore:\r\n\r\n- **Do not reference external skills by name** (e.g., `skills-for-fabric`,\r\n `azure-prepare`, `azure-ai`). 
The user may not have them installed.\r\n- **Extract and inline** any relevant instructions, patterns, API conventions,\r\n or scripts from referenced skills directly into the process skill's own files\r\n (`SKILL.md`, `scripts/`, `references/`, `assets/`).\r\n- If a referenced skill contains a reusable script, copy and adapt it into\r\n `scripts/` rather than calling the external skill's script path.\r\n- If a referenced skill contains useful reference material, distil the relevant\r\n parts into `references/` rather than pointing to the external source.\r\n- The test: a user with only the generated skill directory should be able to\r\n run the process end-to-end without installing anything else from a skills\r\n marketplace.\r\n\r\n**Fabric notebook rule:** When a generated skill produces Fabric notebook cells, prefer appending cells to an existing notebook from a prior step (where there are no ordering dependencies that prevent it) rather than creating a new notebook, since each notebook requires its own Spark session, pip install, and `fab` authentication.\r\n\r\n**Script authoring rules (when scripts are warranted):**\r\n\r\n- Use Python with PEP 723 inline dependencies (`# /// script`)\r\n- Run with: `uv run scripts/<name>.py` (preferred) or `python scripts/<name>.py`\r\n- Accept all inputs via `--flag` CLI args — no interactive prompts\r\n- Output structured JSON to stdout; diagnostics/warnings to stderr\r\n- Implement `--help` with: description, all flags, and a usage example\r\n- Return clear error messages: what went wrong, what was expected, what to try\r\n\r\n**What to put in scripts vs SKILL.md:**\r\n\r\nMove logic to a script when:\r\n- The same command/transformation runs on every execution\r\n- The logic is complex enough to get wrong by free-generation\r\n- Consistent output format is critical\r\n\r\nKeep in SKILL.md when:\r\n- The agent needs to make judgment calls\r\n- Steps vary significantly based on input\r\n- A one-off CLI tool (uvx/npx) 
already handles it\r\n\r\n### Step 4 — Validate the Draft\r\n\r\nRun the validation script against the generated skill:\r\n\r\n```bash\r\npython scripts/validate_skill.py <skill-name>/SKILL.md\r\n```\r\n\r\nFix any reported issues before proceeding.\r\n\r\n### Step 5 — Create Test Agent\r\n\r\nAfter validation passes, create `agent.md` in the current directory using\r\n`assets/agent-template.md`. Tell the user:\r\n\r\n> \"I've created the skill at `<skill-name>/` and a test agent at `agent.md`.\r\n> Run the agent with a sample set of inputs and share the output so we can\r\n> refine it.\"\r\n\r\n### Step 6 — Iterate on Feedback\r\n\r\nWhen the user shares results:\r\n\r\n1. Classify the issue: missing instruction / wrong instruction / wrong detail level / wrong output format\r\n2. Make the minimal targeted fix — do not rewrite unnecessarily\r\n3. Re-run `python scripts/validate_skill.py <skill-name>/SKILL.md`\r\n4. Confirm the fix and invite the next test round\r\n\r\n## Gotchas\r\n\r\n- **User consultation is mandatory, not optional.** Even when the agent has full\r\n knowledge of the process (from prior conversation, orchestration context, or\r\n attached files), it must still ask the user to describe the process and confirm\r\n how the skill should work. The user may want different parameters, edge-case\r\n handling, or output formats than what the agent would assume. Skipping this\r\n produces skills that reflect the agent's perspective, not the user's intent.\r\n- Keep the generated `SKILL.md` under 500 lines. 
Move overflow to `references/`.\r\n- Scripts must never block on interactive input — the agent runs in a non-interactive shell.\r\n- The `name` field must match the directory name exactly (lowercase, hyphens only).\r\n- Do not add domain knowledge the agent already has — only what it would get wrong without guidance.\r\n- When the process has conditional branches, prefer a checklist over prose so the agent can track state.\r\n- **Do not leave dangling skill references.** If the user described the process using another skill\r\n (e.g., \"use the fabric-spark skill to load the data\"), do not carry that reference into the output.\r\n Extract the relevant knowledge from that skill and embed it directly. The generated skill must work\r\n without access to any external skills directory or marketplace.\r\n\r\n## Progress Report Format\r\n\r\nAfter each draft or iteration, output:\r\n\r\n```\r\n✅ Skill: <skill-name>/SKILL.md (<N> lines)\r\n📁 Scripts: <list or \"none\">\r\n📁 References: <list or \"none\">\r\n📁 Assets: <list or \"none\">\r\n🧪 Agent: agent.md\r\n```\r\n",
|
|
110
|
+
content: "---\nname: create-materialised-lakeview-scripts\ndescription: >\n Use this skill when asked to generate Spark SQL Materialized Lake View (MLV)\n scripts for Microsoft Fabric Lakehouse transformations. Triggers on: \"generate\n MLV\", \"create silver layer\", \"create gold layer\", \"bronze to silver\", \"silver\n to gold\", \"star schema\", \"lakehouse transformation\", \"materialized lake view\".\n Supports two layers (bronze→silver, silver→gold) and two approaches each\n (schema-driven with source+target CSVs, or pattern-driven with source-only CSVs).\n Does NOT trigger for general SQL writing, Power BI semantic model creation,\n notebook authoring, or Fabric workspace/lakehouse provisioning.\nlicense: MIT\ncompatibility: Python 3.8+ with pandas (for profiling script)\n---\n\n# Fabric Lakehouse MLV Generator\n\n> ⚠️ **GOVERNANCE**: This skill produces Spark SQL notebooks and scripts for the\n> operator to review and run — it never executes queries or deploys notebooks\n> autonomously. Present each generated artefact to the operator before they run it.\n\nGenerates `CREATE OR REPLACE MATERIALIZED LAKE VIEW` scripts that transform data\nbetween lakehouse layers in Microsoft Fabric. 
Supports bronze→silver (cleaning,\nconforming, restructuring) and silver→gold (Power BI-optimised star schema).\n\n## Inputs\n\n| Parameter | Description | Example |\n|---|---|---|\n| Layer | Bronze→Silver or Silver→Gold | \"bronze to silver\" |\n| Approach | Schema-driven (source+target CSVs) or Pattern-driven (source CSVs only) | \"schema-driven\" |\n| Source CSVs | CSV exports of the source layer tables | `/mnt/user-data/uploads/*.csv` |\n| Target CSVs | (Schema-driven only) CSV exports of the target layer tables | `/mnt/user-data/uploads/silver_*.csv` |\n| Source schema | Schema name for source tables in SQL | `bronze` |\n| Target schema | Schema name for target views in SQL | `silver` or `gold` |\n| Fiscal year start | (Gold layer only) Month number 1–12 | `3` (March) |\n| Currency code | (Gold layer only) Base currency for measure suffixes | `GBP` |\n\n## Workflow\n\n### Phase 1 — Route the request\n\n- [ ] **1.1** Ask the user: **What layer transformation is this?**\n - Bronze → Silver\n - Silver → Gold\n\n- [ ] **1.2** Ask the user: **Which approach?**\n - **Schema-driven** — \"I have both source and target CSV files\"\n - **Pattern-driven** — \"I only have source CSV files; suggest transformations\"\n\n- [ ] **1.3** Based on answers, load the appropriate reference file:\n\n| Layer | Approach | Reference to load |\n|---|---|---|\n| Bronze → Silver | Schema-driven | `references/bronze-to-silver-schema-driven.md` |\n| Bronze → Silver | Pattern-driven | `references/bronze-to-silver-pattern-driven.md` |\n| Silver → Gold | Schema-driven | `references/silver-to-gold-schema-driven.md` |\n| Silver → Gold | Pattern-driven | `references/silver-to-gold-pattern-driven.md` |\n\nRead the full reference file with the `view` tool before proceeding. 
The reference\ncontains the detailed transformation catalogue, SQL patterns, and validation rules\nfor this specific layer+approach combination.\n\n- [ ] **1.4** Ask the user to confirm:\n - Source schema name (default: `bronze` for B→S, `silver` for S→G)\n - Target schema name (default: `silver` for B→S, `gold` for S→G)\n - If Silver→Gold: fiscal year start month and base currency code\n\n### Phase 2 — Inventory and profile\n\n- [ ] **2.1** List all CSV files in `/mnt/user-data/uploads/`.\n\n- [ ] **2.2** Ask the user to identify which CSVs are **source** and which (if\n schema-driven) are **target**. If file naming makes this obvious, propose the\n split and ask for confirmation.\n\n- [ ] **2.3** Run the profiler against every CSV:\n\n```bash\npython references/profile_csvs.py --dir /mnt/user-data/uploads/ --files <file1.csv> <file2.csv> ...\n```\n\nThe profiler outputs a JSON report per file with: column names, inferred dtypes,\nrow count, unique counts, null counts, sample values, and pattern flags (dates,\ncurrency, booleans, commas-in-numbers, whitespace). Store this output for use in\nsubsequent steps.\n\n> **Column naming in Fabric delta tables:** When CSVs are loaded into Fabric\n> Lakehouse delta tables (e.g., via the `csv-to-bronze-delta-tables` skill), a\n> `clean_columns()` function is applied that lowercases all column names and\n> replaces spaces and special characters with underscores. For example,\n> `Hotel ID` becomes `hotel_id` and `No_of_Rooms` becomes `no_of_rooms`.\n> PDF-extracted tables (from the `pdf-to-bronze-delta-tables` skill) may have\n> **entirely different column schemas** since fields are AI-extracted strings.\n> Always verify actual delta table column names — do NOT assume they match the\n> original CSV file headers.\n\n- [ ] **2.4** If schema-driven: profile both source and target CSVs. Map each\n target file to its source file(s) by column overlap. 
Present the mapping and\n ask the user to confirm.\n\n- [ ] **2.5** If pattern-driven: classify each source file by archetype (see\n reference file for the classification table). Present the classification and\n ask the user to confirm.\n\n### Phase 3 — Detect and plan transformations\n\nFollow the reference file's Step 3 (schema-driven) or Step 3 + Step 4\n(pattern-driven) exactly. The reference contains the full transformation detection\nlogic and catalogue.\n\n- [ ] **3.1** For each source→target pair (schema-driven) or each source file\n (pattern-driven), detect all applicable transformations.\n\n- [ ] **3.2** Present a **transformation plan** to the user — a table showing\n each output view, its sources, the transformations that will be applied, and\n any assumptions.\n\n- [ ] **3.3** If Silver→Gold: run the **anti-pattern check** from the reference:\n - No table mixes dimensions and measures\n - No dimension references another dimension via FK (no snowflaking)\n - Consistent grain within each fact\n - Degenerate dimensions stay in facts\n - Flag junk dimension candidates\n\n- [ ] **3.4** Wait for user confirmation before generating SQL.\n\n### Phase 4 — Generate the SQL\n\nFollow the reference file's SQL generation step exactly (Step 4 or Step 5,\ndepending on reference). Key rules that apply to ALL layer+approach combinations:\n\n**File structure:**\n1. `CREATE SCHEMA IF NOT EXISTS <target_schema>;`\n2. Comment header with assumptions (layer, approach, fiscal year, currency, grain)\n3. Views ordered by dependency (dimensions/independent views first, then dependents)\n4. 
Each view: `CREATE OR REPLACE MATERIALIZED LAKE VIEW <schema>.<view_name> AS`\n\n**Notebook documentation (when delivering as .ipynb):**\nLoad `references/notebook-standard.md` for the required markdown cell structure.\nWhen delivering as a notebook, the per-view markdown cells replace the separate\nlogic file — the notebook is the single source of truth.\n\n**MLV-to-MLV dependency pattern:**\nMaterialized Lake Views in Fabric can reference other Materialized Lake Views.\nThis is the **standard layered pattern** — build dimensions and independent\nfacts first, then create dependent views that JOIN to them. For example:\n- `silver.room_rate` joins to `silver.hotel_dim` via a fuzzy/normalised key\n- `silver.forecast_monthly` reads from `silver.revenue_monthly` for weight calculation\n- `silver.expenses_monthly` reads from `silver.revenue_monthly` for proportional allocation\n\nAlways order views by dependency: independent views first, dependent views last.\n\nLoad `references/sql-conventions.md` for naming conventions, CTE patterns,\ntype casting rules, and non-obvious Spark SQL syntax before writing any SQL.\n\n- [ ] **4.1** Write the SQL to `/home/claude/mlv_output.sql`.\n\n### Phase 4a — Generate T-SQL validation queries\n\nBefore converting to MLV format, generate a set of plain `SELECT` queries that\nthe user can run against the Fabric SQL Analytics Endpoint to validate the\ntransformation logic independently.\n\n- [ ] **4a.1** For each MLV definition, extract the CTE + SELECT logic and wrap\n it as a standalone `SELECT` statement (removing the `CREATE OR REPLACE\n MATERIALIZED LAKE VIEW` wrapper).\n\n- [ ] **4a.2** Write the validation queries to a separate file:\n - Bronze→Silver: `bronze_to_silver_validation.sql`\n - Silver→Gold: `silver_to_gold_validation.sql`\n\n- [ ] **4a.3** For each query, add `TOP 20` to the final `SELECT` (T-SQL does not\n support `LIMIT`) and a `-- Expected: ...`\n comment indicating the expected row count and key column values.\n\n- [ ] **4a.4** Present the validation file to 
the user. The user can run these\n queries in the Fabric SQL Analytics Endpoint (T-SQL mode) to inspect outputs\n before committing to the MLV definitions.\n\n> **Why T-SQL first?** MLV creation is an all-or-nothing operation. If a column\n> name is wrong or a date format doesn't parse, the entire MLV fails. Running\n> validation SELECTs first catches these issues with clear error messages and\n> lets the user inspect sample data before committing.\n\n### Phase 5 — Validate\n\n- [ ] **5.1** Run the **data validation** from the reference file's validation\n step. Load source (and target, if schema-driven) CSVs in pandas and verify:\n - Column names match the target / expected output\n - Row counts are within tolerance (exact for dims, ±5% for facts)\n - Numeric columns: values within tolerance\n - Date columns: all parse correctly\n\n- [ ] **5.2** If Silver→Gold, run the **star schema structural checklist**:\n - [ ] Every table is clearly a dimension or a fact\n - [ ] Every fact has FKs to all related dimensions\n - [ ] Every dimension has a unique primary key\n - [ ] A date dimension exists spanning the full fact date range\n - [ ] Date dimension has display + sort column pairs for Power BI\n - [ ] Every dimension has an unknown/unassigned member row\n - [ ] No snowflaking (no dim-to-dim FK references)\n - [ ] No fact embeds descriptive attributes belonging in a dimension\n - [ ] Consistent grain within each fact table\n - [ ] Consistent naming: `dim_` for dimensions, `fact_` for facts\n - [ ] Surrogate key DENSE_RANK ORDER BY identical in dim views and fact CTEs\n - [ ] Role-playing dimensions documented\n - [ ] Degenerate dimensions remain in facts\n\n- [ ] **5.3** Fix any issues found. 
Re-validate until clean.\n\n### Phase 6 — Deliver\n\n- [ ] **6.1** Copy the validated SQL to `/mnt/user-data/outputs/` with a\n descriptive filename:\n - Bronze→Silver: `bronze_to_silver_mlv.sql`\n - Silver→Gold: `silver_to_gold_mlv.sql`\n\n- [ ] **6.2** Generate a **transformation logic document** alongside the SQL:\n - Bronze→Silver: `silver_logic.md`\n - Silver→Gold: `gold_logic.md`\n\n This file MUST contain:\n - **Per-view section** with: source table(s), transformations applied (reference\n T-codes), column mapping (bronze name → silver alias + type), any data quality\n issues detected (nulls, artifacts, dirty data, ambiguous formats) and how they\n were handled.\n - **Cross-view dependencies**: which MLVs reference other MLVs and why.\n - **Dropped/excluded data**: columns or rows removed, with rationale.\n - **Domain context**: any business-domain knowledge that informed the design\n (e.g., location hierarchies, currency conventions, fiscal calendars).\n - **Assumptions**: anything not explicitly confirmed by the user.\n\n If delivering as a notebook (`.ipynb`), the per-view markdown cells serve as\n the inline documentation — no separate logic file is needed, since the same\n information is embedded directly in the notebook.\n\n- [ ] **6.3** Present both files to the user.\n\n- [ ] **6.4** Summarise:\n - Number of views created\n - Key transformation patterns applied\n - (Gold) Number of dimensions vs facts, fiscal year config, currency\n - Any warnings or assumptions\n\n## Output Format\n\n```sql\n-- <Layer> layer Spark SQL MLV definitions\n-- Generated by fabric-lakehouse-mlv skill\n-- Source schema: <source_schema> | Target schema: <target_schema>\n-- Assumptions: <fiscal year, currency, grain, etc.>\n\nCREATE SCHEMA IF NOT EXISTS <target_schema>;\n\n-- <View description>\nCREATE OR REPLACE MATERIALIZED LAKE VIEW <target_schema>.<view_name> AS\nWITH cleaned AS (\n ...\n)\nSELECT ...\nFROM cleaned;\n```\n\n## Gotchas\n\n- **BOM characters**: 
Bronze/silver CSVs often have UTF-8 BOM. Always use\n `encoding='utf-8-sig'` in pandas.\n- **Date format ambiguity**: If all day values ≤ 12, `dd/MM/yyyy` vs `MM/dd/yyyy`\n is ambiguous. Default to `dd/MM/yyyy` for UK/EU data. Ask the user if unsure.\n- **Unpivot STACK count**: The integer N in `LATERAL VIEW STACK(N, ...)` must\n exactly match the number of column pairs. Off-by-one causes silent data loss.\n- **Surrogate key determinism**: `DENSE_RANK(ORDER BY col)` in a gold dimension\n and the matching CTE in a fact MUST use the exact same ORDER BY or keys diverge.\n- **SCD fan-out**: Overlapping date ranges in SCD tables duplicate fact rows.\n Validate non-overlap in silver before building gold.\n- **COALESCE placement**: Apply in the final SELECT of gold facts, never in the\n JOIN condition. Joining `ON fk = 'UNKNOWN'` would incorrectly match the\n unknown dimension row.\n- **Revenue-weighted allocation**: Only use when a revenue table exists. Fall back\n to equal split (`amount / 12.0`) when revenue is zero for a period.\n- **Power BI sort columns**: In the gold date dimension, always pair display\n columns (MonthName, DayOfWeekName, FiscalPeriodLabel) with numeric sort\n columns (MonthNumber, DayOfWeekNumber, FiscalPeriodNumber). Without these,\n months sort alphabetically in Power BI.\n- **No snowflaking in gold**: Flatten all dimension attributes. `dim_hotel`\n should contain City and Country directly, not reference a `dim_geography`.\n- **dayofweek() in Spark**: Returns 1=Sunday, 7=Saturday. Weekend = `IN (1,7)`.\n- **Fiscal year formula**: `((month + (12 - start_month)) % 12) + 1`. Test at\n January and at the start month for off-by-one errors.\n- **MLV-to-MLV references**: Materialized Lake Views in Fabric CAN reference\n other Materialized Lake Views. This is the preferred layered pattern. 
Always\n create referenced views before referencing views (dependency ordering).\n Use `silver.view_name` (not `bronze.view_name`) when joining to a silver\n view from another silver view.\n- **Column naming mismatch**: Bronze delta table columns may differ from the\n original CSV file headers. The `csv-to-bronze-delta-tables` skill applies\n `clean_columns()` which lowercases all names and replaces spaces/special\n characters with underscores (e.g., `Hotel ID` → `hotel_id`). PDF-extracted\n tables (from `pdf-to-bronze-delta-tables`) have AI-determined field names\n that may not match any CSV. Always verify actual lakehouse column names\n before writing SQL.\n\n## Available References\n\n- **`references/profile_csvs.py`** — Profiles uploaded CSV files and outputs a JSON\n report with column metadata, type flags, and pattern detection.\n Run: `python references/profile_csvs.py --help`\n- **`references/sql-conventions.md`** — Naming, CTE patterns, type casting, and Spark SQL syntax. Load during Phase 4.\n- **`references/notebook-standard.md`** — Required markdown cell structure when delivering output as a `.ipynb` notebook. Load when user requests notebook output.\n- **`references/bronze-to-silver-schema-driven.md`** — Transformation catalogue for bronze→silver schema-driven approach.\n- **`references/bronze-to-silver-pattern-driven.md`** — Transformation catalogue for bronze→silver pattern-driven approach.\n- **`references/silver-to-gold-schema-driven.md`** — Transformation catalogue for silver→gold schema-driven approach.\n- **`references/silver-to-gold-pattern-driven.md`** — Transformation catalogue for silver→gold pattern-driven approach.\n- **`references/output-template.sql`** — SQL output template.\n",
|
|
97
111
|
},
|
|
98
112
|
{
|
|
99
|
-
relativePath: "
|
|
100
|
-
content: "# Test Agent:
|
|
113
|
+
relativePath: "references/agent.md",
|
|
114
|
+
content: "# Test Agent: fabric-lakehouse-mlv\n\n## Instructions\n\nYou are a test agent. Your sole instruction is to follow the skill at\n`fabric-lakehouse-mlv/SKILL.md` exactly as written.\n\nDo not deviate from the skill instructions. Do not use knowledge or approaches\nnot described in the skill. If the skill is ambiguous, note the ambiguity rather\nthan making assumptions.\n\nWhen the skill tells you to load a reference file, use the `view` tool to read\nit from `fabric-lakehouse-mlv/references/`. When the skill tells you to run a\nscript, execute it with `python fabric-lakehouse-mlv/references/profile_csvs.py`.\n\n## How to Use\n\nProvide a set of CSV files (uploaded to `/mnt/user-data/uploads/`) and this\nagent will:\n\n1. Ask you the layer (bronze→silver or silver→gold)\n2. Ask you the approach (schema-driven or pattern-driven)\n3. Profile the CSVs\n4. Detect transformations\n5. Present a plan for your approval\n6. Generate a `.sql` file with MLV definitions\n7. Validate and deliver\n\nAfter each run, note:\n\n- Did the output match the expected SQL format?\n- Were the transformation detections correct?\n- Did the skill handle edge cases (BOM, date ambiguity, nulls)?\n- Were any steps unclear or skipped?\n- (Gold layer) Did the star schema checklist pass?\n\nShare this feedback so the skill can be refined.\n\n## Test Scenarios\n\n### Scenario 1: Bronze → Silver (Schema-Driven)\nUpload both bronze CSVs and corresponding silver CSVs. Select \"bronze to silver\"\nand \"schema-driven\". Verify the generated SQL reproduces the silver outputs.\n\n### Scenario 2: Bronze → Silver (Pattern-Driven)\nUpload only bronze CSVs. Select \"bronze to silver\" and \"pattern-driven\". Verify\nthe suggested transformations are reasonable and the SQL is valid Spark SQL.\n\n### Scenario 3: Silver → Gold (Schema-Driven)\nUpload silver CSVs and gold CSVs. 
Select \"silver to gold\" and \"schema-driven\".\nVerify the star schema is correct and the checklist passes.\n\n### Scenario 4: Silver → Gold (Pattern-Driven)\nUpload only silver CSVs. Select \"silver to gold\" and \"pattern-driven\". Verify\ndimensions and facts are correctly identified and the star schema checklist passes.\n",
|
|
101
115
|
},
|
|
102
116
|
{
|
|
103
|
-
relativePath: "
|
|
104
|
-
content: "---\r\nname: <skill-name>\r\ndescription: >\r\n Use this skill when [user intent / trigger phrases]. Triggers on: \"[phrase 1]\",\r\n \"[phrase 2]\", \"[phrase 3]\". Does NOT trigger for [adjacent tasks to exclude].\r\nlicense: MIT\r\ncompatibility: [e.g. \"Python 3.8+, Node.js 18+\" — omit if no special requirements]\r\n---\r\n\r\n# <Skill Title>\r\n\r\nOne-sentence summary of what this skill does and its goal.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `--param1` | What it controls | `\"value\"` |\r\n| `--param2` | What it controls | `\"value\"` |\r\n\r\n## Workflow\r\n\r\n- [ ] Step 1: [Action] — `scripts/step1.py --param1 <VALUE>`\r\n- [ ] Step 2: [Action] — describe what to do\r\n- [ ] Step 3: Validate — `scripts/validate.py output/`\r\n - If validation fails: fix the issue, re-run validation\r\n - Only proceed when validation passes\r\n- [ ] Step 4: [Final action]\r\n\r\n## Output Format\r\n\r\n```\r\n[Paste a concrete example of what correct output looks like]\r\n```\r\n\r\nOr reference a template: see `assets/output-template.md`\r\n\r\n## Gotchas\r\n\r\n- [Non-obvious fact that the agent would get wrong without being told]\r\n- [Naming inconsistency, soft-delete pattern, environment quirk, etc.]\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/<name>.py`** — [What it does]. Run: `uv run scripts/<name>.py --help`\r\n",
|
|
117
|
+
relativePath: "references/bronze-to-silver-pattern-driven.md",
|
|
118
|
+
content: "# Approach 2 — Pattern-Driven Silver Layer Generation\n\nGenerate Spark SQL Materialized Lake View (MLV) scripts from bronze CSV files alone.\nThe agent profiles every file, detects structural and data-quality patterns, and\napplies a standardised catalogue of silver-layer transformations without needing\na target schema.\n\n---\n\n## When to use\n\nThe user supplies **only bronze CSV files** — no target silver schema is provided.\nThe goal is to propose and generate a best-practice silver layer that cleans,\nconforms, and restructures the raw data for downstream analytics.\n\n---\n\n## Prerequisites\n\n| Item | Detail |\n|---|---|\n| Bronze CSVs | Uploaded to `/mnt/user-data/uploads/` or provided in context |\n| SQL dialect | Spark SQL (Fabric Lakehouse MLV syntax) |\n| Output path | `/mnt/user-data/outputs/silver_layer_mlv.sql` |\n\n---\n\n## Workflow\n\nExecute every step. Do not skip.\n\n### Step 1 — Inventory and classify the files\n\n1. List all uploaded CSVs.\n2. For each file, run the profiler from Step 2 below.\n3. 
Classify each file into one of these **table archetypes**:\n\n| Archetype | Detection signals |\n|---|---|\n| **Dimension / master data** | Low row count (<500), high cardinality ID column, descriptive text columns, no date-series pattern |\n| **Fact / transactional** | High row count, date column, numeric measure columns, foreign key IDs |\n| **Periodic snapshot** | One row per entity per period, numeric columns that look like cumulative or period totals |\n| **SCD / history** | Start/end date pairs, \"is current\" flag, same entity ID appears multiple times |\n| **Wide / pivoted** | Many similarly-typed numeric columns whose names follow a pattern (days, months, categories) |\n| **Event log** | Start date (and optional end date), attendance/count columns, descriptive text |\n| **Forecast / budget** | Quarter/period labels, multiple scenario columns (budget, forecast, actual) |\n| **Lookup / bridge** | Two or more ID columns, few or no measure columns, maps one entity to another |\n\nPresent the classification to the user and ask for confirmation before proceeding.\n\n### Step 2 — Profile every CSV\n\n> **Column naming in Fabric delta tables:** CSV column names with spaces and\n> special characters are converted to lowercase with underscores when loaded\n> into Fabric Lakehouse delta tables (e.g., `Hotel ID` → `hotel_id`). PDF-extracted\n> tables may have entirely different column schemas. 
Always verify the actual\n> delta table column names before writing SQL — do NOT assume they match the\n> original CSV file headers.\n\nFor each CSV, capture:\n\n```python\nimport pandas as pd, re\n\ndef profile(path):\n df = pd.read_csv(path, encoding='utf-8-sig', nrows=500)\n result = {}\n for col in df.columns:\n s = df[col].dropna().astype(str)\n result[col] = {\n \"dtype_inferred\": str(df[col].dtype),\n \"n_unique\": int(df[col].nunique()),\n \"n_null\": int(df[col].isna().sum()),\n \"sample\": s.head(5).tolist(),\n \"has_leading_trailing_whitespace\": bool(s.str.strip().ne(s).any()),\n \"looks_like_date\": bool(s.str.match(\n r'^\\d{1,4}[/\\-\\.]\\d{1,2}[/\\-\\.]\\d{1,4}$'\n ).all()) if len(s) > 0 else False,\n \"looks_like_currency\": bool(s.str.match(\n r'^[\\s]*[\\$£€]?[\\s]*[\\d,]+\\.?\\d*[\\s]*$'\n ).all()) if len(s) > 0 else False,\n \"looks_like_month_year\": bool(s.str.match(\n r'^[A-Za-z]{3}[\\s\\-]\\d{2,4}$'\n ).all()) if len(s) > 0 else False,\n \"looks_like_boolean\": set(s.str.upper().unique()).issubset(\n {'YES','NO','TRUE','FALSE','1','0','Y','N',''}\n ),\n \"has_commas_in_numbers\": bool(s.str.match(\n r'^[\\d,]+$'\n ).any()) and not bool(s.str.match(r'^[A-Za-z]').any()),\n \"max_len\": int(s.str.len().max()) if len(s) > 0 else 0,\n \"empty_string_count\": int((s == '').sum()),\n }\n return result\n```\n\n### Step 3 — Apply the transformation catalogue\n\nFor each file, walk through the catalogue below **in order**. Every rule that\nmatches produces a SQL fragment. 
Combine all matching rules into a single MLV\ndefinition for that file.\n\n---\n\n#### CATALOGUE OF STANDARD TRANSFORMATIONS\n\n##### T01 — Trim all strings\n\n| Condition | Always — apply to every `string`/`object` column |\n|---|---|\n| SQL | `trim(ColumnName) AS ColumnAlias` |\n| Notes | First transformation applied; all subsequent rules operate on trimmed values |\n\n##### T02 — Rename columns to PascalCase\n\n| Condition | Column names contain underscores, spaces, or inconsistent casing |\n|---|---|\n| SQL | Use `AS PascalCaseAlias` in the SELECT |\n| Rules | `Hotel_ID` → `HotelID`, `No_of_Rooms` → `RoomCount`, `Month_Year` → `MonthYear`. Strip redundant prefixes/suffixes. Prefer semantic names: `Number_of_X` → `XCount`, `Is_X` → `IsX`. |\n\n##### T03 — Parse dates\n\n| Condition | Column flagged `looks_like_date` or `looks_like_month_year` in profile |\n|---|---|\n| SQL patterns | |\n\n| Source format | SQL |\n|---|---|\n| `dd/MM/yyyy` | `to_date(col, 'dd/MM/yyyy')` |\n| `yyyy-MM-dd` | `to_date(col, 'yyyy-MM-dd')` |\n| `MM/dd/yyyy` | `to_date(col, 'MM/dd/yyyy')` — only if day values never exceed 12; ask user if ambiguous |\n| `Mon-YY` (e.g. `Jan-09`) | `to_date(concat('01-', regexp_replace(col, ' ', '-')), 'dd-MMM-yy')` |\n| `QUARTER N` | Not a date — handle in T12 (forecast / quarter expansion) |\n\nAlways output as `DATE` type. 
Alias date columns as `XDate` or `MonthStart` depending on semantics.\n\n##### T04 — Null sentinel for optional end-dates\n\n| Condition | Column looks like a date AND has empty strings or NULLs AND is paired with a start-date column |\n|---|---|\n| SQL | `coalesce(to_date(NULLIF(trim(col), ''), 'dd/MM/yyyy'), make_date(2099, 12, 31)) AS EndDate` |\n| Notes | The sentinel `2099-12-31` enables open-ended range joins (`BETWEEN StartDate AND EndDate`) |\n\n##### T05 — Boolean normalisation\n\n| Condition | Column flagged `looks_like_boolean` in profile |\n|---|---|\n| SQL | `CASE WHEN upper(trim(col)) IN ('YES','TRUE','Y','1') THEN 1 ELSE 0 END AS IsX` |\n| Output type | `INT` (0 or 1) |\n\n##### T06 — Cast numeric types\n\n| Condition | Column is numeric in intent but stored as string |\n|---|---|\n| SQL by subtype | |\n\n| Subtype | Detection | SQL |\n|---|---|---|\n| Integer (IDs, counts, quantities) | All values are whole numbers, or column name contains `ID`, `Count`, `Qty`, `Quantity`, `Number` | `CAST(col AS INT)` |\n| Monetary | `looks_like_currency` or column name contains `Revenue`, `Profit`, `Cost`, `Amount`, `Price`, `Total`, `Spend` | `CAST(ROUND(col, 2) AS DECIMAL(18,2))` |\n| Rate / ratio | Column name contains `Rate`, `Ratio`, `Percentage`, `Share` | `CAST(ROUND(col, 2) AS DECIMAL(10,2))` |\n\n##### T07 — Strip currency symbols and comma-formatted numbers\n\n| Condition | `looks_like_currency` or `has_commas_in_numbers` in profile |\n|---|---|\n| SQL | `CAST(regexp_replace(regexp_replace(col, '\\\\$|£|€', ''), ',', '') AS DECIMAL(18,2))` |\n| Notes | Apply BEFORE T06 casting. Handle space-padded values: `trim()` first. 
|\n\n##### T08 — Deduplication\n\n| Condition | Table archetype is **Dimension** AND duplicate rows exist (row count > distinct row count on key columns) |\n|---|---|\n| SQL | Wrap the entire SELECT in `SELECT DISTINCT` |\n\n##### T09 — Unpivot wide tables\n\n| Condition | Table archetype is **Wide / pivoted** — many similarly-typed columns whose names represent categories or time periods |\n|---|---|\n| SQL | `LATERAL VIEW STACK(N, 'Col1', Col1, 'Col2', Col2, …) AS CategoryColumn, ValueColumn` |\n| Steps | 1. Identify the set of columns to unpivot (the \"wide\" columns). 2. Identify the columns to keep (the \"anchor\" columns — usually IDs or names). 3. Generate the STACK expression. 4. Name the new categorical column descriptively (e.g., `DayOfWeek`, `ExpenseCategory`). 5. Name the value column descriptively (e.g., `RateGBP`, `AnnualAmount`). |\n\n**Detection heuristic**: If 4+ columns share the same data type and their names\nform a recognisable set (days of week, months, expense types, room types), they\nare candidates for unpivoting.\n\n##### T10 — Temporal expansion (annual → monthly)\n\n| Condition | Table has a `Year` column (or similar period column) and numeric columns that represent **annual** totals, and the desired grain is monthly |\n|---|---|\n| SQL | `explode(sequence(make_date(YearCol, 1, 1), make_date(YearCol, 12, 1), interval 1 month)) AS MonthStart` |\n| Notes | After expansion, the annual amount must be **allocated** to each month. Default: equal split (`AnnualAmount / 12.0`). If a revenue table exists, prefer proportional allocation — see T13. 
|\n\n##### T11 — Date-range expansion (event start/end → daily/monthly)\n\n| Condition | Table has `StartDate` and `EndDate` columns representing a span |\n|---|---|\n| SQL | `explode(sequence(StartDate, coalesce(EndDate, StartDate), interval 1 day)) AS EventDay` |\n| Follow-up | Calculate `DailyValue = TotalValue / greatest(datediff(EndDate, StartDate) + 1, 1)` then aggregate to monthly: `GROUP BY date_trunc('month', EventDay)` with `SUM(DailyValue)` |\n\n##### T12 — Forecast / quarter expansion (quarterly → monthly)\n\n| Condition | Table archetype is **Forecast / budget** with quarter labels |\n|---|---|\n| SQL | Use `LATERAL VIEW explode(array(1, 2, 3)) AS MonthNumberWithinQuarter` then `make_date(Year, ((QuarterNumber - 1) * 3) + MonthNumberWithinQuarter, 1) AS MonthStart` |\n| Notes | Quarterly totals are split across 3 months. Default: equal third. If revenue data exists, use revenue-weighted allocation — see T13. |\n\n##### T13 — Proportional allocation (revenue-weighted)\n\n| Condition | A temporal expansion (T10, T11, T12) is being applied AND a revenue/transaction table exists that can provide weights |\n|---|---|\n| SQL pattern | |\n\n```sql\n-- Calculate weight per period\nMonthRevenue / NULLIF(SUM(MonthRevenue) OVER (PARTITION BY EntityID, Year), 0) AS MonthWeight\n-- Apply weight\nROUND(AnnualAmount * MonthWeight, 2) AS MonthlyAmount\n-- Fallback when revenue is zero\nCASE WHEN AnnualTotal > 0\n THEN Amount * (MonthValue / AnnualTotal)\n ELSE Amount / 12.0\nEND\n```\n\n| Notes | Always include a zero-revenue fallback (equal split). Use `NULLIF` or `greatest()` to avoid division by zero. |\n\n##### T14 — Enrichment joins (dimension lookups)\n\n| Condition | A fact/transactional table has an ID column that matches a dimension table's key |\n|---|---|\n| SQL | `LEFT JOIN silver.dimension_view d ON f.KeyCol = d.KeyCol` |\n| Rules | 1. Always use `LEFT JOIN` from fact to dimension (never lose fact rows). 2. 
For SCD joins, add `AND f.DateCol BETWEEN d.StartDate AND d.EndDate`. 3. Pull through only the columns needed (manager name, city, country — not the whole dimension). |\n\n##### T15 — Fuzzy / normalised name joins\n\n| Condition | Two tables need joining on a name column but values don't match exactly (abbreviations, prefixes, case differences) |\n|---|---|\n| SQL | `regexp_replace(lower(trim(col)), '^common_prefix\\\\s+', '') AS NormalizedName` on both sides, then join on the normalised column |\n| Notes | Common prefixes to strip: brand names, `The `, `Hotel `, etc. Ask the user to confirm the normalisation logic if not obvious. |\n\n##### T16 — Aggregation\n\n| Condition | Silver grain should be coarser than bronze (e.g., order-level → monthly/category level) |\n|---|---|\n| SQL | `GROUP BY entity_id, date_trunc('month', DateCol), CategoryCol` with `SUM(measure)`, `COUNT(*)`, etc. |\n| Notes | Always `CAST(ROUND(SUM(col), 2) AS DECIMAL(18,2))` for monetary aggregations. |\n\n---\n\n### Step 4 — Propose the silver layer\n\nBefore generating SQL, present a **transformation plan** to the user:\n\n```\nBronze file | Silver view name | Archetype | Transformations applied\n-------------------------|--------------------------------|---------------|------------------------\nraw_hotels.csv | silver.hotel_metadata | Dimension | T01, T02, T03, T06, T08\nraw_managers.csv | silver.manager_assignments_scd | SCD | T01, T02, T03, T04, T05\nraw_revenue.csv | silver.revenue_monthly | Fact | T01, T02, T03, T06, T14\nraw_expenses.csv | silver.expenses_monthly | Periodic | T01, T02, T06, T09, T10, T13\nraw_rates.csv | silver.room_rates | Wide | T01, T02, T06, T09, T14\nraw_events.csv | silver.events_city_monthly | Event log | T01, T02, T03, T06, T07, T11, T14\nraw_forecast.csv | silver.forecast_monthly | Forecast | T01, T02, T07, T12, T13\nraw_orders.csv | silver.property_orders | Fact | T01, T02, T03, T06, T14, T16\n```\n\nInclude for each view:\n- Proposed column list with 
types\n- Join dependencies (which other silver views it references)\n- Any assumptions or ambiguities to resolve\n\n**Wait for user confirmation** before generating SQL.\n\n### Step 5 — Generate the SQL script\n\nProduce one `.sql` file. Follow these structural rules:\n\n**File structure:**\n1. `CREATE SCHEMA IF NOT EXISTS silver;`\n2. Dimension/master views first (no dependencies)\n3. SCD views second (may depend on dimensions)\n4. Fact views next (may depend on dimensions and SCDs)\n5. Derived/allocated views last (depend on other silver views)\n\n**MLV-to-MLV cross-references:**\nMaterialized Lake Views in Fabric can reference other Materialized Lake Views.\nThis is the **standard layered pattern**. Dependent views should JOIN to\nearlier silver views using `silver.<view_name>` syntax. Examples:\n- A fact view joining to a dimension view: `LEFT JOIN silver.hotel_dim`\n- An allocated view reading weights from a fact view: `FROM silver.revenue_monthly`\n- A fuzzy-join enrichment: `LEFT JOIN silver.hotel_dim h ON normalised_key = h.normalised_key`\n\nOrder matters: referenced views MUST appear earlier in the script than the views\nthat reference them.\n\n**Notebook documentation (when delivering as .ipynb):**\nWhen delivering as a notebook, each MLV must have a detailed markdown cell before\nits code cell containing: (1) what the view does, (2) why the logic was chosen,\n(3) data quality notes, (4) cross-view dependencies, (5) columns dropped with\nrationale. 
The notebook header must explain MLVs, list all transformation types\napplied, and provide execution order guidance.\n\n**CTE conventions:**\n- `cleaned` — first CTE: trims, casts, parses dates, strips currencies\n- `unpivoted` — if T09 applies\n- `expanded` — if T10/T11/T12 applies\n- `aggregated` — if T16 applies\n- `enriched` or `annotated` — if T13/T14 applies\n- Final SELECT pulls from the last CTE\n\n**Naming conventions:**\n- View names: `silver.<descriptive_snake_case>`\n- Column aliases: `PascalCase`\n- CTE names: `snake_case`\n\n**Type casting rules:**\n- IDs and counts → `INT`\n- Monetary values → `DECIMAL(18,2)`\n- Rates → `DECIMAL(10,2)`\n- Intermediate calculations → `DECIMAL(18,4)` to avoid premature rounding\n- Final monetary output → `CAST(ROUND(expr, 2) AS DECIMAL(18,2))`\n- Dates → `DATE`\n- Strings → trimmed, no explicit cast needed\n\n**Spark SQL syntax reminders:**\n- Backtick reserved words: `` `Year` ``, `` `Month` ``, `` `Order` ``\n- `LATERAL VIEW STACK(n, ...)` for unpivot\n- `LATERAL VIEW explode(...)` for expansion\n- `date_trunc('month', col)` for month normalisation\n- `make_date(y, m, d)` for date construction\n- `regexp_replace()` — double-escape backslashes: `'\\\\$'` not `'\\$'`\n- `sequence(start, end, interval 1 month)` for date sequences\n- `greatest(expr, 1)` to guard against zero division\n- `coalesce()` for null handling\n- `NULLIF(trim(col), '')` to convert empty strings to NULL\n\n### Step 5a — Generate T-SQL validation queries\n\nBefore committing to MLV definitions, generate standalone `SELECT` queries that\nthe user can run against the Fabric SQL Analytics Endpoint to validate each\ntransformation independently.\n\n1. For each MLV, extract the CTE + SELECT logic into a plain `SELECT` statement.\n2. Add `TOP 20` to the outer `SELECT` (T-SQL has no `LIMIT` clause) and a `-- Expected: N rows, key columns: [...]` comment.\n3. Write to `bronze_to_silver_validation.sql`.\n4. 
Present to the user for execution in the SQL Analytics Endpoint.\n\n> **Why validate first?** MLV creation is all-or-nothing. Running validation\n> SELECTs catches column name errors, date format mismatches, and join issues\n> before the irreversible create operation.\n\n### Step 6 — Validate and present\n\n1. For each generated view, run a quick sanity check in Python:\n - Load the bronze CSV, apply the transformations in pandas\n - Confirm the output column names, types, and approximate row counts look correct\n2. Write SQL to `/mnt/user-data/outputs/bronze_to_silver_mlv.sql`\n3. Write a **silver_logic.md** document alongside the SQL. This file MUST contain:\n - **Per-view section**: source table(s), transformations applied (T-codes),\n column mapping (bronze name → silver alias + type), data quality issues\n detected (nulls, artifacts, dirty data, ambiguous formats) and how they\n were resolved.\n - **Cross-view dependencies**: which MLVs reference other MLVs, with the\n join logic and why.\n - **Dropped/excluded data**: columns or rows removed, with rationale.\n - **Domain context**: business-domain knowledge that informed the design\n (e.g., location hierarchies, currency conventions, fiscal calendars).\n - **Assumptions**: anything not explicitly confirmed by the user.\n4. Present both files to the user\n5. Summarise: number of views, transformations applied, any assumptions made\n\n---\n\n## Gotchas\n\n- **BOM characters**: Use `encoding='utf-8-sig'` when reading CSVs.\n- **Date format ambiguity**: If all day values are ≤ 12, the format is ambiguous (`dd/MM` vs `MM/dd`). Ask the user. Default to `dd/MM/yyyy` for UK/EU data.\n- **Unpivot column count**: Double-check the STACK count `N` matches the actual number of column pairs. Off-by-one here causes silent data loss.\n- **Revenue-weighted allocation**: Only apply T13 if a revenue or transaction table exists. 
If not, fall back to equal split and note the assumption.\n- **Cross-view dependencies**: If the expenses view uses revenue weights, it depends on the revenue view. The revenue view must be created first.\n- **MLV-to-MLV references are supported**: Materialized Lake Views in Fabric CAN\n reference other Materialized Lake Views. This is the preferred layered pattern.\n Always emit dependent views AFTER the views they reference.\n- **Fuzzy joins are fragile**: Always show the user the normalised values from both sides and ask them to confirm the join produces the expected number of matches.\n- **Empty strings**: Many CSVs use empty strings instead of NULL. Always convert with `NULLIF(trim(col), '')` before date parsing or numeric casting to avoid Spark parse errors.\n- **Comma-formatted numbers**: Attendance, currency, and large counts often have commas. Always strip before casting.\n- **Multiple bronze files for one silver view**: Some silver views need data from 2+ bronze files (e.g., orders + hotel metadata). Detect this when a bronze fact table has an ID that matches a dimension table's key.\n- **Proportional allocation fallback**: When the weighting denominator is zero (no revenue in a period), always fall back to equal split. Never produce NULL or zero-divided amounts.\n",
|
|
105
119
|
},
|
|
106
120
|
{
|
|
107
|
-
relativePath: "references/
|
|
108
|
-
content: "# Probing Questions Reference\r\n\r\nUse these questions to fill gaps before drafting a skill. Ask **one at a time**.\r\nStop when you have enough to draft confidently. Skip questions with obvious answers.\r\n\r\n## External Skills & Dependencies\r\n\r\n- Does the user reference any other agent skills (e.g., skills-for-fabric, azure-prepare) as part of this process?\r\n- If yes: what specific steps or outputs from that skill are needed? (extract only what's relevant)\r\n- Are there API conventions, authentication patterns, or script logic from those skills that must be replicated?\r\n- Can all required behaviour be embedded in this skill's own files, without assuming the end user has those skills installed?\r\n\r\n## Parameters & Variability\r\n\r\n- What inputs change between different runs of this process?\r\n- Are any of those inputs user-provided, or fetched from a system/file?\r\n- What would break if the wrong value was used? (helps identify critical params)\r\n- Are there default values that apply when an input isn't specified?\r\n\r\n## Fixed Steps & Determinism\r\n\r\n- Are there steps that must always run in exactly the same way? (candidates for scripts)\r\n- Are there specific commands, API calls, or tools that must be used (not alternatives)?\r\n- Is there a specific order that steps must follow, or can some run in parallel?\r\n\r\n## Edge Cases & Failure Modes\r\n\r\n- What are the most common ways this process goes wrong?\r\n- Are there preconditions that must be met before the process can start?\r\n- What should happen if a step fails mid-process?\r\n- Are there data/input values that require special handling?\r\n\r\n## Environment & Tools\r\n\r\n- What language/runtime is available? 
(Python, Node.js, PowerShell, bash, etc.)\r\n- Are there existing libraries or internal tools that should be used?\r\n- Does this process need network access, database connections, or file system access?\r\n- Are there permissions or credentials required?\r\n\r\n## Output & Validation\r\n\r\n- What does a correct output look like? (request an example if possible)\r\n- How does the user currently verify that the process succeeded?\r\n- Should the output be a file, stdout, a structured format (JSON/CSV), or a report?\r\n- Are there downstream systems that consume the output? (affects format constraints)\r\n\r\n## Reusability & Scope\r\n\r\n- Is this skill for one team/project or should it work across different contexts?\r\n- Are there variations of this process that the skill should NOT handle? (scope boundary)\r\n- Should the skill handle cleanup/rollback if something goes wrong?\r\n",
|
|
121
|
+
relativePath: "references/bronze-to-silver-schema-driven.md",
|
|
122
|
+
content: "# Approach 1 — Schema-Driven Silver Layer Generation\n\nGenerate Spark SQL Materialized Lake View (MLV) scripts by comparing bronze input\nCSVs against target silver output CSVs. The agent infers every transformation by\ndiffing schemas and sampling data — no domain knowledge required.\n\n---\n\n## When to use\n\nThe user supplies **both**:\n\n- One or more **bronze CSV** files (raw source data)\n- One or more **silver CSV** files (desired output after transformation)\n\nThe goal is to produce a `.sql` script of `CREATE OR REPLACE MATERIALIZED LAKE VIEW`\nstatements that, when executed against the bronze tables in a Fabric Lakehouse,\nreproduce the silver outputs.\n\n---\n\n## Prerequisites\n\n| Item | Detail |\n|---|---|\n| Bronze CSVs | Uploaded to `/mnt/user-data/uploads/` or provided in context |\n| Silver CSVs | Uploaded to `/mnt/user-data/uploads/` or provided in context |\n| SQL dialect | Spark SQL (Fabric Lakehouse MLV syntax) |\n| Output path | `/mnt/user-data/outputs/silver_layer_mlv.sql` |\n\n---\n\n## Workflow\n\nExecute every step. Do not skip.\n\n### Step 1 — Inventory the files\n\n1. List all uploaded CSVs.\n2. Ask the user to confirm which files are **bronze** (raw) and which are **silver** (target).\n3. Ask the user to confirm the **bronze schema name** (default `bronze`) and **silver schema name** (default `silver`).\n4. 
For each silver CSV, ask the user to confirm which bronze CSV(s) feed into it.\n If the mapping is obvious from column overlap, propose it and ask for confirmation.\n\nRecord the mapping as a table:\n\n```\nSilver view | Bronze source(s) | Join/lookup sources\n-------------------------|------------------------------|--------------------\nsilver.hotel_metadata | bronze.hotels_raw | —\nsilver.revenue_monthly | bronze.revenue_raw | silver.manager_assignments_scd\n```\n\n### Step 2 — Profile every CSV\n\n> **Column naming in Fabric delta tables:** CSV column names with spaces and\n> special characters are converted to lowercase with underscores when loaded\n> into Fabric Lakehouse delta tables (e.g., `Hotel ID` → `hotel_id`). PDF-extracted\n> tables may have entirely different column schemas. Always verify the actual\n> delta table column names before writing SQL — do NOT assume they match the\n> original CSV file headers.\n\nFor each CSV (bronze and silver), run a Python profiling pass that captures:\n\n```python\nimport pandas as pd, json, sys\n\ndef profile(path):\n df = pd.read_csv(path, encoding='utf-8-sig', nrows=200)\n info = {\n \"columns\": list(df.columns),\n \"dtypes\": {c: str(df[c].dtype) for c in df.columns},\n \"row_count\": len(df),\n \"sample_values\": {c: df[c].dropna().head(5).tolist() for c in df.columns},\n \"nulls\": {c: int(df[c].isna().sum()) for c in df.columns},\n \"unique_counts\": {c: int(df[c].nunique()) for c in df.columns}\n }\n return info\n```\n\nStore results per file for comparison in subsequent steps.\n\n### Step 3 — Detect transformations by diffing bronze → silver\n\nFor each bronze→silver pair, systematically compare profiles and classify every\ncolumn in the silver output into one or more of the categories below.\n\n#### 3a — Column mapping\n\n| Detection method | Transformation category |\n|---|---|\n| Silver column name exists in bronze (exact or case-insensitive) | **Direct pass-through** (may still need trim/cast) |\n| Silver 
column name is a PascalCase or camelCase variant of a bronze column | **Rename** |\n| Silver column does not exist in any single bronze file but appears after a JOIN | **Enrichment join** |\n| Silver has fewer columns than bronze | **Column pruning / SELECT subset** |\n\n#### 3b — Type and format changes\n\n| Bronze sample → Silver sample | Transformation |\n|---|---|\n| `\" London \"` → `London` | `trim()` |\n| `\"187\"` (string) → `187` (int) | `CAST(col AS INT)` |\n| `\"15/03/2009\"` → `2009-03-15` | `to_date(col, 'dd/MM/yyyy')` |\n| `\"Jan-09\"` → `2009-01-01` | `to_date(concat('01-', …), 'dd-MMM-yy')` — compound date parse |\n| `\"$1,234.00\"` → `1234.00` | Currency strip: `CAST(regexp_replace(regexp_replace(col, '\\\\$', ''), ',', '') AS DECIMAL)` |\n| `\"YES\"/\"NO\"` → `1`/`0` | Boolean normalisation: `CASE WHEN upper(col) = 'YES' THEN 1 ELSE 0 END` |\n| Empty string → `2099-12-31` | Null sentinel: `coalesce(to_date(NULLIF(trim(col),''), fmt), make_date(2099,12,31))` |\n| Numeric with commas `\"6,500\"` → `6500` | `CAST(regexp_replace(col, ',', '') AS DECIMAL)` |\n\n#### 3c — Structural transformations\n\n| Signal in silver that is absent in bronze | Transformation |\n|---|---|\n| Silver has many more rows than bronze, with a new categorical column whose values match bronze column **names** | **Unpivot** — use `LATERAL VIEW STACK(N, 'ColName1', ColName1, …)` |\n| Silver has a `MonthStart` column and row count ≈ bronze rows × 12 | **Temporal expansion** — `explode(sequence(start, end, interval 1 month))` |\n| Silver has daily/monthly granularity from date-range bronze rows | **Date-range expansion** — `explode(sequence(StartDate, EndDate, interval 1 day))` then `date_trunc('month', …)` |\n| Silver numeric values are fractions of bronze annual totals | **Proportional allocation** — weight by revenue share or equal split (`amount * (month_value / annual_value)`) |\n| Silver has columns from multiple bronze sources | **JOIN enrichment** — identify join keys by 
matching ID/name columns |\n| Silver row count < bronze row count with same columns | **Aggregation** — `GROUP BY` with `SUM/COUNT/AVG` |\n| Silver has `DISTINCT` fewer rows | **Deduplication** — `SELECT DISTINCT` |\n| Bronze has a name column, silver has an ID + name, and a fuzzy match is needed | **Normalised join** — `regexp_replace(lower(trim(col)), pattern, '')` on both sides |\n\n#### 3d — Cross-table dependencies\n\nCheck whether any silver view references **another silver view** (not just bronze).\nDetect this when:\n- A silver output contains columns that exist in another silver output but not in any bronze file.\n- A silver output's row count or granularity only makes sense after joining to another silver view.\n\nRecord these as dependencies and order the MLV creation statements accordingly.\n\n### Step 4 — Generate the SQL script\n\nProduce one `.sql` file containing all MLV statements. Follow these rules:\n\n**Structural rules:**\n1. Begin with `CREATE SCHEMA IF NOT EXISTS silver;`\n2. Order statements so that dependencies are created first (topological sort).\n3. Each view is a `CREATE OR REPLACE MATERIALIZED LAKE VIEW silver.<view_name> AS`.\n4. Use CTEs (`WITH`) to keep logic readable — one CTE per logical step.\n\n**Notebook documentation (when delivering as .ipynb):**\nWhen delivering as a notebook, each MLV must have a detailed markdown cell before\nits code cell containing: (1) what the view does, (2) why the logic was chosen,\n(3) data quality notes, (4) cross-view dependencies, (5) columns dropped with\nrationale. 
The notebook header must explain MLVs, list all transformation types\napplied, and provide execution order guidance.\n\n**Naming conventions:**\n- Silver view names: `snake_case`, descriptive (e.g., `revenue_monthly`, `expenses_monthly_unpivoted`).\n- Column aliases: `PascalCase` (e.g., `HotelID`, `MonthStart`, `TotalSpend`).\n- CTE names: `snake_case` (e.g., `cleaned`, `unpivoted`, `aggregated`).\n\n**Type casting rules:**\n- IDs and counts → `INT`\n- Monetary values → `DECIMAL(18,2)`\n- Rates and ratios → `DECIMAL(10,2)` or `DECIMAL(18,4)` for intermediate calculations\n- Dates → `DATE` via `to_date()` with explicit format string\n- Strings → `trim()` always applied on ingestion\n- Final `CAST(ROUND(expr, 2) AS DECIMAL(18,2))` for all monetary outputs\n\n**Spark SQL syntax reminders:**\n- Backtick-quote reserved words used as column names: `` `Year` ``, `` `Month` ``\n- `LATERAL VIEW STACK(n, 'label1', col1, 'label2', col2, …) AS alias1, alias2` for unpivot\n- `LATERAL VIEW explode(array(…))` or `explode(sequence(…))` for expansion\n- `date_trunc('month', col)` to normalise to month start\n- `make_date(year, month, 1)` to construct dates\n- `regexp_replace()` for string cleaning — double-escape backslashes in patterns\n- `coalesce()` for null handling\n- `greatest(expr, 1)` to avoid division by zero\n\n### Step 5 — Validate the output\n\n> Before running the full validation, generate T-SQL validation queries (standalone\n> `SELECT` statements with `TOP 20`, since T-SQL has no `LIMIT` clause) that the user can run against the Fabric SQL\n> Analytics Endpoint to spot-check each transformation. Write these to\n> `bronze_to_silver_validation.sql`.\n\nRun a Python validation that:\n\n1. Loads each bronze CSV into a pandas DataFrame.\n2. Applies the *intent* of each SQL transformation in pandas (trim, cast, pivot, join, aggregate).\n3. Loads the corresponding silver CSV.\n4. 
Compares:\n - Column names match exactly\n - Row counts are within 5% tolerance (date/rounding edge cases)\n - Numeric columns: mean absolute difference < 1% of column mean\n - Date columns: all values parse correctly\n5. Report mismatches as warnings, not hard failures (the SQL will run on Spark, not pandas).\n\n### Step 6 — Write the final file\n\n1. Write the validated SQL to `/mnt/user-data/outputs/silver_layer_mlv.sql`.\n2. Present the file to the user.\n3. Summarise: number of views created, key transformation patterns detected, any warnings.\n\n---\n\n## Gotchas\n\n- **BOM characters**: Bronze CSVs may have UTF-8 BOM (`\\ufeff`). Use `encoding='utf-8-sig'` in pandas.\n- **Carriage returns**: Windows line endings in CSVs — strip `\\r` when comparing.\n- **Date ambiguity**: If dates could be `dd/MM/yyyy` or `MM/dd/yyyy`, check whether any day value > 12 to disambiguate. Ask the user if still ambiguous.\n- **Unpivot detection**: When silver has a column whose values are bronze column names, that is the strongest signal for `LATERAL VIEW STACK`. Count the distinct values — they should match the number of pivoted columns.\n- **Proportional allocation**: If monthly amounts in silver don't simply equal annual ÷ 12, check for revenue-weighted distribution. Compare against equal-split as a baseline.\n- **Cross-view dependencies**: The expenses view may depend on the revenue view for weighting. Always check for this pattern and order MLVs correctly.\n- **Fuzzy name joins**: When bronze has short names and silver has full names (or vice versa), normalise both sides by lowercasing, trimming, and stripping common prefixes.\n- **Empty strings vs NULLs**: Bronze CSVs often use empty strings where NULLs are intended. Detect with `trim(col) = ''` and convert to NULL before further processing.\n- **Comma-formatted numbers**: Attendance, currency, and large integers may have commas in the CSV. 
Always `regexp_replace(col, ',', '')` before casting to numeric.\n- **Column naming mismatch**: Bronze delta table columns may differ from the\n original CSV file headers. The `csv-to-bronze-delta-tables` skill applies\n `clean_columns()` which lowercases all names and replaces spaces/special\n characters with underscores. Always verify actual lakehouse column names.\n",
|
|
109
123
|
},
|
|
110
124
|
{
|
|
111
|
-
relativePath: "
|
|
112
|
-
content: "
|
|
125
|
+
relativePath: "references/notebook-standard.md",
|
|
126
|
+
content: "# Notebook Documentation Standard for MLV Notebooks\n\nLoad this file when delivering MLV output as a Fabric `.ipynb` notebook.\n\n## Per-view markdown cell (required above each code cell)\n\nEach MLV must include a detailed markdown cell immediately before its SQL cell containing:\n\n1. **What this view does** — one-paragraph plain-language summary\n2. **Why this logic was chosen** — explain each major transformation, why it was needed,\n and what alternatives were considered\n3. **Data quality notes** — any issues found in source data (nulls, artefacts, ambiguous\n formats) and how they were handled\n4. **Cross-view dependencies** — which other MLVs this view references and why\n5. **Columns dropped or excluded** — any source columns excluded, with rationale\n\n## Header markdown cell (required at notebook start)\n\nThe notebook must start with a header markdown cell containing:\n\n- A title and one-paragraph overview of what the notebook does\n- An explanation of what Materialised Lake Views are (for non-technical readers)\n- A summary of all transformation types applied across the notebook\n- Execution order guidance (run top to bottom; dependencies are pre-ordered)\n\n## Principle\n\nWhen delivering as a notebook, the per-view markdown cells serve as the inline\ndocumentation — no separate logic file is needed. The notebook is the single source\nof truth: transformation logic, design rationale, and data quality notes are all\nembedded directly inside it.\n",
|
|
113
127
|
},
|
|
114
|
-
],
|
|
115
|
-
},
|
|
116
|
-
{
|
|
117
|
-
name: "create-process-workflow-agent",
|
|
118
|
-
files: [
|
|
119
128
|
{
|
|
120
|
-
relativePath: "
|
|
121
|
-
content: "---\r\nname: create-process-workflow-agent\r\ndescription: >\r\n Use this skill to create an orchestration agent definition (agent.md) for any\r\n business or technical process. The user describes what they want to automate;\r\n the skill produces a self-contained agent.md. When run, the agent maps the\r\n process to available process skills, automatically creates any missing skills\r\n using create-process-skill, logs all changes to an audit trail, and orchestrates\r\n the full process end-to-end. The process skills library grows with every run.\r\n Triggers on: \"create a process workflow agent\", \"build an orchestration agent\r\n for [process]\", \"create an agent that automates [process]\", \"orchestrate\r\n [process] into an agent\". Does NOT trigger for creating individual process\r\n skills, running an agent, writing code, or one-off analysis.\r\nlicense: MIT\r\ncompatibility: Python 3.8+ required for scripts/\r\n---\r\n\r\n# Create Process Workflow Agent\r\n\r\nCreates a concise, self-contained `agent.md` that defines an orchestration agent\r\nfor any business or technical process — data pipelines, spreadsheet workflows,\r\ndocument processing, reporting, or anything else. No process skills need to exist\r\nupfront. When run, the agent maps requirements to available skills, creates any\r\nthat are missing, and builds up the process skills library over time.\r\n\r\n## Core Governance Rules\r\n\r\nThese rules define what this agent is — and what it is not. They apply at\r\n**every step** of both the skill (creating the agent.md) and the generated\r\nagent (running sub-agents and executing skills). They are non-negotiable and\r\noverride any implicit pressure to optimise for speed or efficiency.\r\n\r\n**These rules must be held constant in the context window for the entire\r\nagent run.** If context limits are reached, evict skill definitions and\r\nreference material first — the agent can re-read those on demand. 
The\r\ngovernance rules must never be evicted.\r\n\r\n### Where the agent's intelligence belongs\r\n\r\nThe agent's value is in **reasoning, not execution**. Specifically:\r\n\r\n- **Pre-empting problems** — before any step, reason about what could go\r\n wrong (permissions, tooling, dependencies) and ask the user pointed\r\n questions upfront to determine the right path.\r\n- **Choosing the correct path** — given the user's environment, permissions,\r\n installed tooling, and comfort level, determine which parts of the solution\r\n can run in an automated fashion (commands the user pastes) vs which need to\r\n be done manually (UI steps the user follows).\r\n- **Diagnosing failures** — when a command fails, identify the cause from the\r\n error/status code and offer the most pragmatic resolution for the user.\r\n\r\nThe agent's intelligence is **not** for deciding what code to run and\r\nautonomously executing it.\r\n\r\n### The rules\r\n\r\n> **RULE 1 — User-driven, not agent-driven.** The agent assists; the user\r\n> decides. Every deployment option, parameter choice, and execution step is the\r\n> user's call. The agent surfaces options, explains trade-offs and limitations\r\n> in context, and waits for the user to choose. Never make a decision on the\r\n> user's behalf — even when the \"correct\" answer seems obvious.\r\n> **Never autonomously execute terminal commands, API calls, or scripts.**\r\n> Present each command in a fenced code block for the user to validate. Instruct\r\n> the user to insert the command into the terminal using the insert-into-terminal\r\n> icon (visible when hovering over the code block in chat) and to respond back\r\n> with the status or outcome before the agent proceeds.\r\n>\r\n> **RULE 2 — Pre-empt; don't react.** Before starting any step, assess what\r\n> the user needs to have in place: permissions (Fabric Admin? Entra group\r\n> read access?), installed tooling (Fabric CLI? Azure CLI? 
Python?), network\r\n> access, and role assignments. Ask pointed questions upfront to establish\r\n> what's available. Don't collect parameters and then discover blockers.\r\n> Frame questions based on the approach the user has already chosen and the\r\n> permissions/dependencies that approach requires.\r\n>\r\n> **RULE 3 — Never silently switch approach.** If a limitation is discovered\r\n> with the user's chosen approach, highlight it and present alternatives. Let\r\n> the user decide whether to switch, work around it, or defer. Do not change\r\n> the deployment option, execution method, or parameter strategy without\r\n> explicit user consent.\r\n>\r\n> **RULE 4 — No inference from external context.** Collect all parameters from\r\n> the user or from the current prompt's explicit instructions. Pre-populating\r\n> values that are implicit in the original requirements is acceptable (with\r\n> user confirmation), but never infer from prior chat history, previous agent\r\n> runs, or attached reference files that were not part of the original request.\r\n>\r\n> **RULE 5 — Respect the user's skill level.** The user may prefer notebooks\r\n> because they understand Python, or terminal because they're comfortable with\r\n> CLIs, or PowerShell because that's what their team uses. Do not assume\r\n> technical proficiency or steer toward an approach the agent finds easier to\r\n> generate.\r\n>\r\n> **RULE 6 — Stay within skill boundaries; pragmatic fallbacks on failure.**\r\n> The agent must not generate code, scripts, or workarounds that are not\r\n> defined in its skill definitions. When **any** command fails — for any\r\n> reason — the agent must: (1) identify the cause from the error or status\r\n> code, (2) explain the failure clearly, (3) offer the simplest manual or UI\r\n> fallback the user can do themselves, and (4) ask if the user wants to skip\r\n> the step entirely. 
The goal is always what's easiest and most pragmatic\r\n> for the user — not for the agent.\r\n>\r\n> **RULE 7 — Update the change log after every step.** After each SOP step\r\n> completes (or fails), append a timestamped entry to `CHANGE_LOG.md` in the\r\n> output root. Each entry must include: the SOP step number, what was done,\r\n> the outcome (success/failure/skipped), and any notable decisions or\r\n> deviations. This is non-negotiable — the change log is the audit trail.\r\n\r\nThe generated `agent.md` must include these rules verbatim in a\r\n\"Core Governance Rules\" section so they are active at runtime.\r\n\r\n## Notebook Documentation Standard\r\n\r\nEvery Fabric notebook produced by any skill — whether generated by a script or\r\nwritten inline — **must** include a numbered markdown cell immediately above\r\neach code cell. Each markdown cell must:\r\n\r\n1. State the cell number and a short title (e.g. `## Cell 1 — Install dependencies`).\r\n2. Explain **what** the code cell does in 1–2 sentences.\r\n3. Explain **how to use it**: any variables the user should change, any flags\r\n to toggle (e.g. `Set TEST_MODE = False after the test run succeeds`), any\r\n prerequisites that must be true before running.\r\n\r\nThis standard ensures notebooks are self-documenting and executable by any\r\nteam member — not just the person who generated them. The generated `agent.md`\r\nmust include this standard so it propagates to every skill that produces\r\nnotebooks at runtime.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `PROCESS_NAME` | Short name for the process (lowercase, hyphens) | `monthly-budget-consolidation` |\r\n| `REQUIREMENTS` | Full description of the process and each of its steps | `\"1) Collect data from five Excel files... 
2) Summarise by category...\"` |\r\n| `SECTIONS` | Sub-agent sections to include (default: all four) | `impl-plan, biz-process, architecture, governance` |\r\n| `USERNAME` | Used in output folder naming | `rishi` |\r\n\r\n## Workflow\r\n\r\n- [ ] **Collect** — If `PROCESS_NAME`, `REQUIREMENTS`, or `USERNAME` are missing, ask for them.\r\n\r\n- [ ] **Confirm sections** — Present the four standard sections with descriptions\r\n (see `references/section-descriptions.md`). Ask which to include. Default: all four.\r\n Wait for explicit confirmation before drafting.\r\n\r\n- [ ] **Draft agent.md** — Use `assets/agent-template.md` as the base.\r\n - Substitute `{PROCESS_NAME}` and a ≤3-sentence `{REQUIREMENTS_SUMMARY}`.\r\n - Remove excluded sections. Keep each sub-agent block ≤25 lines.\r\n - Do not name any specific process skill or technology — all of this is resolved\r\n at runtime when the agent runs against real requirements.\r\n - Do not hardcode company names, specific values, or environment paths.\r\n\r\n- [ ] **Validate** — Present the draft. Ask: *\"Does this accurately reflect the process? 
Anything unclear?\"*\r\n Refine until the user confirms.\r\n\r\n- [ ] **Scaffold** — Run `python scripts/scaffold_output.py --process-name $PROCESS_NAME --username $USERNAME --sections $SECTIONS`.\r\n Write the confirmed agent.md to the returned `agent_md_path`.\r\n\r\n- [ ] **Confirm** — Report the output root path and list all created subfolders.\r\n\r\n## Output Format\r\n\r\n```\r\noutputs/\r\n└── {process-name}_{YYYY-MM-DD_HH-MM}_{username}/\r\n ├── agent.md ← self-contained orchestration agent definition\r\n ├── CHANGE_LOG.md ← audit trail; updated as agent runs\r\n ├── 01-implementation-plan/ ← empty; populated when agent runs\r\n ├── 02-business-process/ ← empty; populated when agent runs\r\n ├── 03-solution-architecture/ ← empty; populated when agent runs\r\n └── 04-governance/ ← empty; populated when agent runs\r\n```\r\n\r\n`CHANGE_LOG.md` is initialised empty and updated by the agent each time it runs:\r\nnew skills created, outputs produced, decisions made, and user confirmations received\r\nare all appended in chronological order. 
Nothing is populated during skill execution.\r\n\r\n### Sub-agents in the generated agent.md\r\n\r\n| # | Section | Output document | Notes |\r\n|---|---------|-----------------|-------|\r\n| 1 | Implementation Plan | `implementation-plan.md` + RAID log | Fully inlined |\r\n| 2 | Business Process Mapping | `sop.md` | Maps requirements → skills; creates missing skills; logs to audit trail |\r\n| 3 | Solution Architecture | `specification.md` | Fully inlined |\r\n| 4 | Security, Testing & Governance | `governance-plan.md` | Fully inlined |\r\n\r\n## Gotchas\r\n\r\n- Rules 1–7 in Core Governance Rules above are the primary guardrails.\r\n Re-read them before every step — they override efficiency considerations.\r\n- **Do not check for or create process skills during skill execution.** All skill\r\n discovery, creation of missing skills, and audit logging happen inside Sub-Agent 2\r\n when the generated agent.md is run.\r\n- **Do not execute sub-agents** during skill execution — `agent.md` is a definition only.\r\n- Do not name specific tools, technologies, or process skills in the generated agent.md —\r\n the agent is generic and works for any process.\r\n- Confirm sections **before** drafting, not after.\r\n- Keep each sub-agent block ≤25 lines to avoid context overload when the agent runs.\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/scaffold_output.py`** — Creates the dated output folder structure including\r\n an empty `CHANGE_LOG.md`. Run: `python scripts/scaffold_output.py --help`\r\n",
|
|
129
|
+
relativePath: "references/output-template.sql",
|
|
130
|
+
content: "-- =============================================================================\n-- ${LAYER} layer Spark SQL MLV definitions\n-- Generated by fabric-lakehouse-mlv skill\n-- =============================================================================\n-- Source schema: ${SOURCE_SCHEMA}\n-- Target schema: ${TARGET_SCHEMA}\n-- Approach: ${APPROACH}\n-- Assumptions: ${ASSUMPTIONS}\n-- =============================================================================\n\nCREATE SCHEMA IF NOT EXISTS ${TARGET_SCHEMA};\n\n-- ---------------------------------------------------------------------------\n-- Dimension / Independent views (create first — no dependencies)\n-- ---------------------------------------------------------------------------\n\n-- ${DIM_VIEW_DESCRIPTION}\nCREATE OR REPLACE MATERIALIZED LAKE VIEW ${TARGET_SCHEMA}.${VIEW_NAME} AS\nWITH cleaned AS (\n SELECT\n trim(Column1) AS Column1Alias,\n CAST(Column2 AS INT) AS Column2Alias,\n to_date(Column3, 'dd/MM/yyyy') AS DateColumn\n FROM ${SOURCE_SCHEMA}.${SOURCE_TABLE}\n)\nSELECT * FROM cleaned;\n\n-- ---------------------------------------------------------------------------\n-- Fact / Dependent views (create last — may reference dimensions or other views)\n-- ---------------------------------------------------------------------------\n\n-- ${FACT_VIEW_DESCRIPTION}\nCREATE OR REPLACE MATERIALIZED LAKE VIEW ${TARGET_SCHEMA}.${VIEW_NAME} AS\nWITH normalized AS (\n SELECT\n date_trunc('month', s.MonthStart) AS DateKey,\n s.EntityID,\n CAST(ROUND(s.Amount, 2) AS DECIMAL(18,2)) AS AmountGBP\n FROM ${SOURCE_SCHEMA}.${SOURCE_TABLE} s\n)\nSELECT\n n.DateKey,\n COALESCE(n.EntityID, 'UNKNOWN') AS EntityID,\n n.AmountGBP\nFROM normalized n;\n",
|
|
122
131
|
},
|
|
123
132
|
{
|
|
124
|
-
relativePath: "
|
|
125
|
-
content: "# Orchestration Agent: {PROCESS_NAME}\r\n\r\n## Context\r\n\r\n**Process**: {PROCESS_NAME}\r\n**Requirements**: {REQUIREMENTS_SUMMARY}\r\n\r\n---\r\n\r\n## How to Run This Agent\r\n\r\nExecute each sub-agent below in sequence:\r\n\r\n1. Use only the inputs and instructions provided in this file.\r\n2. Produce the specified output document in the designated subfolder.\r\n3. Present the output to the user; ask clarifying questions if anything is unclear.\r\n4. Refine until the user explicitly confirms the output.\r\n5. Append a timestamped entry to `CHANGE_LOG.md` recording what was produced or decided.\r\n6. Pass the confirmed output as the primary input to the next sub-agent.\r\n\r\n**Do not proceed to the next sub-agent without explicit user confirmation.**\r\n**Do not produce code, scripts, or data artefacts not described in each sub-agent below.**\r\n\r\n---\r\n\r\n## Sub-Agent 1: Implementation Plan\r\n\r\n**Input**: Requirements above\r\n**Output**: `01-implementation-plan/implementation-plan.md`\r\n\r\nProduce a phased implementation plan using the structure below. 
Keep ≤50 lines.\r\nUpdate the RAID log whenever a later sub-agent raises a new risk or dependency.\r\n\r\n```markdown\r\n---\r\ngoal: {PROCESS_NAME} — Implementation Plan\r\nstatus: Planned\r\ndate_created: {DATE}\r\n---\r\n\r\n# Implementation Plan: {PROCESS_NAME}\r\n\r\n## Requirements & Constraints\r\n- REQ-001: [Requirement drawn from the context above]\r\n- CON-001: [Key constraint]\r\n\r\n## Phases\r\n\r\n### Phase 1: [Phase name]\r\n| Task | Description | Status |\r\n|----------|-------------|---------|\r\n| TASK-001 | [Task] | Planned |\r\n| TASK-002 | [Task] | Planned |\r\n\r\n### Phase 2: [Phase name]\r\n| Task | Description | Status |\r\n|----------|-------------|---------|\r\n| TASK-003 | [Task] | Planned |\r\n\r\n## RAID Log\r\n| Type | ID | Description | Mitigation / Action | Status |\r\n|------------|-------|--------------|---------------------|--------|\r\n| Risk | R-001 | [Risk] | [Mitigation] | Open |\r\n| Assumption | A-001 | [Assumption] | [Validation] | Open |\r\n| Issue | I-001 | [Issue] | [Resolution] | Open |\r\n| Dependency | D-001 | [Dependency] | [Owner] | Open |\r\n```\r\n\r\nRules:\r\n- Use REQ-, CON-, TASK-, R-, A-, I-, D- prefixes consistently.\r\n- Task status values: Planned / In Progress / Done.\r\n- Do not include implementation code or scripts.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 1 complete — implementation-plan.md produced.`\r\n- **Confirm with user before proceeding to Sub-Agent 2.**\r\n\r\n---\r\n\r\n## Sub-Agent 2: Business Process Mapping\r\n\r\n**Input**: Confirmed output of Sub-Agent 1 + Requirements above\r\n**Output**: `02-business-process/sop.md`\r\n\r\nThis sub-agent maps requirements to process skills, creates any that are missing,\r\nand produces a Standard Operating Procedure. Work through the three steps below.\r\n\r\n### Step 1 — Decompose requirements into process steps\r\n\r\nRead the requirements and break them into discrete, ordered steps. 
For each step,\r\nwrite a one-line description of what it needs to do and what its output is.\r\n\r\n### Step 2 — Map each step to a process skill\r\n\r\nFor each step, search the skills directory for a matching process skill\r\n(a skill whose description covers the same action and output).\r\n\r\nFor every step, one of three outcomes applies:\r\n\r\n**A — Skill found**: Read the skill's `SKILL.md`. Note its inputs, outputs, and\r\nany parameters it needs from earlier steps. Mark the step as covered.\r\n\r\n**B — Skill not found**: Determine the deterministic logic needed to automate\r\nthis step (the specific inputs, the repeatable actions, and the expected output).\r\nInvoke `create-process-skill` to create a new skill definition for this step.\r\nOnce created, read its `SKILL.md` and mark the step as covered.\r\nAppend to `CHANGE_LOG.md`:\r\n`[{DATETIME}] New skill created: [skill-name] — [one-line description of what it does].`\r\nAdd the new skill as a dependency in the RAID log from Sub-Agent 1.\r\n\r\n**C — Step must be manual**: If the step cannot be automated (e.g. requires human\r\njudgement or a physical action), document it as a manual step with exact operator\r\ninstructions and mark it accordingly.\r\n\r\nRepeat until every step is either covered by a skill or accepted as manual.\r\nAsk the user to confirm the skill list before proceeding to Step 3.\r\n\r\n### Step 3 — Produce the SOP\r\n\r\n```markdown\r\n# SOP: {PROCESS_NAME}\r\n\r\n## Step Sequence\r\n| Step | Skill / Action | Input Parameters | Output | Manual? 
|\r\n|------|---------------------|--------------------|-------------------|---------|\r\n| 1 | [skill-name] | param=value | [output artefact] | No |\r\n| 2 | [skill-name] | output from step 1 | [output artefact] | No |\r\n| 3 | [Manual: action] | — | — | Yes |\r\n\r\n## Shared Parameters\r\n| Parameter | Source | Passed to steps |\r\n|-----------|------------|-----------------|\r\n| [param] | User input | 1, 3 |\r\n\r\n## Newly Created Skills\r\n| Skill name | Step | Description |\r\n|--------------|------|------------------------------------|\r\n| [skill-name] | 2 | [What it does — one line] |\r\n\r\n## Manual Steps\r\n- MANUAL-001: [Step] — [Reason] — [Exact operator instructions]\r\n```\r\n\r\nRules:\r\n- If requirements are unclear for any step, ask a targeted question and update\r\n requirements before continuing.\r\n- New skills created in this sub-agent are a permanent addition to the skills\r\n library and will be available for future agents.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 2 complete — sop.md produced. [N] new skills created.`\r\n- **Confirm with user before proceeding to Sub-Agent 3.**\r\n\r\n---\r\n\r\n## Sub-Agent 3: Solution Architecture\r\n\r\n**Input**: Confirmed output of Sub-Agent 2\r\n**Output**: `03-solution-architecture/specification.md`\r\n\r\nProduce a plain-language specification. Keep total length ≤50 lines.\r\nWrite for a non-technical reader — no code, no implementation detail.\r\n\r\n```markdown\r\n---\r\ntitle: {PROCESS_NAME} — Solution Specification\r\nstatus: Draft\r\ndate_created: {DATE}\r\n---\r\n\r\n# Specification: {PROCESS_NAME}\r\n\r\n## Purpose\r\n[One paragraph: what this solution does and what problem it solves.]\r\n\r\n## Scope\r\n[What is included and what is explicitly excluded.]\r\n\r\n## How It Works\r\n| Step | What happens | Automated? 
| Notes |\r\n|------|-------------------------------|------------|-----------------|\r\n| 1 | [Plain-language description] | Yes | |\r\n| 2 | [Plain-language description] | No | See MANUAL-001 |\r\n\r\n## Manual Steps\r\n- MANUAL-001: [Step] — [Reason] — [Exact operator instructions]\r\n\r\n## Acceptance Criteria\r\n- AC-001: Given [context], when [action], then [expected outcome].\r\n\r\n## Dependencies\r\n- DEP-001: [External system, file, or service] — [Purpose]\r\n```\r\n\r\nRules:\r\n- Write for a non-technical reader. No jargon without explanation.\r\n- Every manual step must include exact operator instructions.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 3 complete — specification.md produced.`\r\n- **Confirm with user before proceeding to Sub-Agent 4.**\r\n\r\n---\r\n\r\n## Sub-Agent 4: Security, Testing and Governance\r\n\r\n**Input**: Confirmed output of Sub-Agent 3\r\n**Output**: `04-governance/governance-plan.md`\r\n\r\nProduce a governance and deployment plan. Keep total length ≤45 lines.\r\n\r\n```markdown\r\n---\r\ntitle: {PROCESS_NAME} — Governance Plan\r\ndate_created: {DATE}\r\n---\r\n\r\n# Governance Plan: {PROCESS_NAME}\r\n\r\n## Agent Boundaries\r\n| Boundary | Rule |\r\n|-------------------------|--------------------------------------------|\r\n| Allowed actions | [Permitted operations] |\r\n| Blocked actions | [Prohibited operations] |\r\n| Requires human approval | [Steps needing explicit sign-off] |\r\n\r\n## Testing Checklist\r\n- [ ] Validate each sub-agent output before passing it to the next\r\n- [ ] Test all manual steps with a real operator before production use\r\n- [ ] Run against a minimal test dataset before using real data\r\n- [ ] Review CHANGE_LOG.md to confirm all new skills are correct\r\n- [ ] Verify the output folder structure after scaffolding\r\n\r\n## Microsoft Responsible AI Alignment\r\n| Principle | How Applied |\r\n|----------------|--------------------------------------------------------|\r\n| Fairness | 
[How bias is avoided in outputs and decisions] |\r\n| Reliability | [Validation steps, error handling, new skill review] |\r\n| Privacy | [Data handling — no PII retained in output files] |\r\n| Inclusiveness | [Plain language; no domain assumptions made] |\r\n| Transparency | [User validates every sub-agent output; CHANGE_LOG] |\r\n| Accountability | [Human sign-off required before production execution] |\r\n\r\n## Deployment Guidance\r\n- Review `CHANGE_LOG.md` to verify all newly created skills before first run.\r\n- Store `agent.md`, all outputs, and new skills in version control.\r\n- Review the RAID log from Sub-Agent 1 before each new run.\r\n- Human sign-off required before running against production systems.\r\n```\r\n\r\nRules:\r\n- Every RAI principle row must be completed — state explicitly if not applicable and why.\r\n- Human approval must be required for any step that modifies production systems.\r\n- Append to `CHANGE_LOG.md`: `[{DATETIME}] Sub-Agent 4 complete — governance-plan.md produced. Agent definition finalised.`\r\n- **Confirm with user before finalising.**\r\n",
|
|
133
|
+
relativePath: "references/profile_csvs.py",
|
|
134
|
+
content: "#!/usr/bin/env python3\n# /// script\n# requires-python = \">=3.8\"\n# dependencies = [\"pandas>=1.5\"]\n# ///\n\"\"\"\nProfile CSV files for the fabric-lakehouse-mlv skill.\n\nAnalyses each CSV and outputs a JSON report with column metadata, type\ninference, and pattern flags used by the transformation detection logic.\n\nUsage:\n python scripts/profile_csvs.py --files file1.csv file2.csv\n python scripts/profile_csvs.py --dir /mnt/user-data/uploads/ --files hotels.csv revenue.csv\n\nOptions:\n --dir DIR Base directory for CSV files (default: current directory)\n --files FILE ... One or more CSV filenames to profile\n --max-rows N Maximum rows to sample for profiling (default: 500)\n --output PATH Write JSON report to file instead of stdout\n --help Show this message and exit\n\nExamples:\n python scripts/profile_csvs.py --dir /mnt/user-data/uploads/ --files Landon_hotels.csv Landon_hotel_revenue_data.csv\n python scripts/profile_csvs.py --files silver_hotel_metadata.csv silver_revenue_monthly.csv --max-rows 1000\n\"\"\"\n\nimport argparse\nimport json\nimport re\nimport sys\nfrom pathlib import Path\n\ntry:\n import pandas as pd\nexcept ImportError:\n print(\n json.dumps({\"error\": \"pandas is required. 
Install with: pip install pandas --break-system-packages\"}),\n file=sys.stderr,\n )\n sys.exit(1)\n\n\nDATE_PATTERNS = [\n (r\"^\\d{1,2}[/\\-\\.]\\d{1,2}[/\\-\\.]\\d{2,4}$\", \"dd/MM/yyyy or MM/dd/yyyy\"),\n (r\"^\\d{4}[/\\-\\.]\\d{1,2}[/\\-\\.]\\d{1,2}\", \"yyyy-MM-dd\"),\n (r\"^\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}\", \"yyyy-MM-dd HH:mm:ss\"),\n]\n\nMONTH_YEAR_PATTERN = re.compile(r\"^[A-Za-z]{3}[\\s\\-]\\d{2,4}$\")\nCURRENCY_PATTERN = re.compile(r\"^[\\s]*[\\$£€]?[\\s]*[\\-]?[\\d,]+\\.?\\d*[\\s]*$\")\nCURRENCY_SYMBOL_PATTERN = re.compile(r\"[\\$£€]\")\nCOMMA_NUMBER_PATTERN = re.compile(r\"^\\d{1,3}(,\\d{3})+(\\.\\d+)?$\")\nBOOLEAN_VALUES = {\"YES\", \"NO\", \"TRUE\", \"FALSE\", \"1\", \"0\", \"Y\", \"N\", \"\"}\nQUARTER_PATTERN = re.compile(r\"^(Q|QUARTER)\\s*\\d$\", re.IGNORECASE)\n\nMEASURE_KEYWORDS = [\n \"amount\", \"revenue\", \"profit\", \"cost\", \"spend\", \"total\", \"price\",\n \"quantity\", \"qty\", \"attendance\", \"rate\", \"budget\", \"forecast\",\n \"bookings\", \"payroll\", \"insurance\", \"maintenance\", \"utilities\",\n \"expenses\", \"tax\", \"fee\", \"salary\", \"wage\",\n]\n\nID_KEYWORDS = [\"id\", \"key\", \"code\", \"number\", \"no\"]\n\n\ndef looks_like_id(col_name: str) -> bool:\n lower = col_name.lower().replace(\"_\", \"\")\n return any(lower.endswith(kw) or lower.startswith(kw) for kw in ID_KEYWORDS)\n\n\ndef looks_like_measure(col_name: str) -> bool:\n lower = col_name.lower()\n return any(kw in lower for kw in MEASURE_KEYWORDS)\n\n\ndef detect_date_format(series: \"pd.Series\") -> str | None:\n s = series.dropna().astype(str).str.strip()\n if len(s) == 0:\n return None\n sample = s.head(50)\n for pattern, label in DATE_PATTERNS:\n if sample.str.match(pattern).all():\n return label\n return None\n\n\ndef can_disambiguate_date(series: \"pd.Series\") -> dict:\n \"\"\"Check if any day-part value > 12, which disambiguates dd/MM vs MM/dd.\"\"\"\n s = series.dropna().astype(str).str.strip()\n parts = s.str.split(r\"[/\\-\\.]\", 
expand=True)\n if parts.shape[1] < 3:\n return {\"ambiguous\": True}\n try:\n first_part = parts[0].astype(int)\n if (first_part > 12).any():\n return {\"ambiguous\": False, \"likely_format\": \"dd/MM/yyyy\"}\n second_part = parts[1].astype(int)\n if (second_part > 12).any():\n return {\"ambiguous\": False, \"likely_format\": \"MM/dd/yyyy\"}\n except (ValueError, TypeError):\n pass\n return {\"ambiguous\": True, \"note\": \"All day/month values ≤12; ask user\"}\n\n\ndef profile_column(col_name: str, series: \"pd.Series\") -> dict:\n s_str = series.dropna().astype(str)\n s_stripped = s_str.str.strip()\n n_total = len(series)\n n_null = int(series.isna().sum())\n n_unique = int(series.nunique())\n empty_string_count = int((s_str == \"\").sum())\n\n result = {\n \"dtype_inferred\": str(series.dtype),\n \"n_total\": n_total,\n \"n_unique\": n_unique,\n \"n_null\": n_null,\n \"empty_string_count\": empty_string_count,\n \"sample_values\": s_stripped.head(5).tolist(),\n \"has_leading_trailing_whitespace\": bool(s_stripped.ne(s_str).any()),\n \"looks_like_id\": looks_like_id(col_name),\n \"looks_like_measure\": looks_like_measure(col_name),\n }\n\n # Date detection\n date_fmt = detect_date_format(series)\n result[\"looks_like_date\"] = date_fmt is not None\n if date_fmt:\n result[\"detected_date_format\"] = date_fmt\n if \"dd/MM\" in date_fmt or \"MM/dd\" in date_fmt:\n result[\"date_disambiguation\"] = can_disambiguate_date(series)\n\n # Month-year detection (e.g., \"Jan-09\")\n if len(s_stripped) > 0:\n result[\"looks_like_month_year\"] = bool(\n s_stripped.head(50).str.match(MONTH_YEAR_PATTERN.pattern).all()\n )\n else:\n result[\"looks_like_month_year\"] = False\n\n # Currency detection\n if len(s_stripped) > 0 and series.dtype == object:\n result[\"looks_like_currency\"] = bool(\n s_stripped.head(50).str.match(CURRENCY_PATTERN.pattern).all()\n and not s_stripped.head(50).str.match(r\"^[A-Za-z]\").any()\n )\n else:\n result[\"looks_like_currency\"] = False\n\n # 
Comma-in-numbers detection\n result[\"has_commas_in_numbers\"] = bool(\n s_stripped.str.match(COMMA_NUMBER_PATTERN.pattern).any()\n ) if len(s_stripped) > 0 else False\n\n # Boolean detection\n result[\"looks_like_boolean\"] = (\n set(s_stripped.str.upper().unique()).issubset(BOOLEAN_VALUES)\n and n_unique <= 3\n and n_unique > 0\n )\n\n # Quarter label detection\n if len(s_stripped) > 0:\n result[\"looks_like_quarter\"] = bool(\n s_stripped.head(20).str.match(QUARTER_PATTERN.pattern).all()\n )\n else:\n result[\"looks_like_quarter\"] = False\n\n # Max string length\n result[\"max_string_length\"] = int(s_str.str.len().max()) if len(s_str) > 0 else 0\n\n return result\n\n\ndef classify_table(profile_data: dict) -> str:\n \"\"\"Classify a table by archetype based on its column profiles.\"\"\"\n columns = profile_data[\"columns\"]\n row_count = profile_data[\"row_count\"]\n\n has_date_col = any(c[\"looks_like_date\"] or c[\"looks_like_month_year\"] for c in columns.values())\n has_measure = any(c[\"looks_like_measure\"] for c in columns.values())\n has_id = any(c[\"looks_like_id\"] for c in columns.values())\n n_boolean = sum(1 for c in columns.values() if c[\"looks_like_boolean\"])\n has_quarter = any(c[\"looks_like_quarter\"] for c in columns.values())\n has_currency = any(c[\"looks_like_currency\"] for c in columns.values())\n\n # Check for SCD pattern (start/end date pairs)\n col_names_lower = [n.lower() for n in columns.keys()]\n has_start_end = (\n any(\"start\" in n for n in col_names_lower)\n and any(\"end\" in n for n in col_names_lower)\n )\n has_is_current = any(\"current\" in n for n in col_names_lower)\n\n # Check for wide/pivoted (many similarly-typed columns)\n numeric_cols = [\n n for n, c in columns.items()\n if c[\"dtype_inferred\"] in (\"int64\", \"float64\")\n and not c[\"looks_like_id\"]\n ]\n # Days of week or months pattern\n day_names = {\"monday\", \"tuesday\", \"wednesday\", \"thursday\", \"friday\", \"saturday\", \"sunday\"}\n col_names_set 
= {n.lower() for n in columns.keys()}\n is_wide_days = len(day_names & col_names_set) >= 5\n\n if has_start_end and has_is_current:\n return \"scd\"\n if has_quarter and (has_currency or has_measure):\n return \"forecast\"\n if is_wide_days or (len(numeric_cols) >= 6 and not has_date_col):\n return \"wide_pivoted\"\n if has_date_col and has_measure and row_count > 50:\n return \"fact\"\n if row_count < 500 and has_id and not has_measure:\n return \"dimension\"\n # Small reference tables with rates/amounts but no date series (e.g., room rates)\n if row_count < 200 and has_id and has_measure and not has_date_col:\n return \"reference\"\n if has_date_col and any(\"attend\" in n.lower() for n in columns.keys()):\n return \"event_log\"\n if has_id and not has_measure and len(columns) <= 4:\n return \"lookup\"\n return \"unknown\"\n\n\ndef profile_file(filepath: Path, max_rows: int) -> dict:\n try:\n df = pd.read_csv(filepath, encoding=\"utf-8-sig\", nrows=max_rows)\n except Exception as e:\n return {\"error\": str(e), \"file\": str(filepath)}\n\n columns = {}\n for col in df.columns:\n columns[col] = profile_column(col, df[col])\n\n result = {\n \"file\": filepath.name,\n \"row_count\": len(df),\n \"column_count\": len(df.columns),\n \"column_names\": list(df.columns),\n \"columns\": columns,\n }\n\n result[\"archetype\"] = classify_table(result)\n\n return result\n\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"Profile CSV files for fabric-lakehouse-mlv skill.\",\n formatter_class=argparse.RawDescriptionHelpFormatter,\n epilog=__doc__,\n )\n parser.add_argument(\"--dir\", type=str, default=\".\", help=\"Base directory for CSV files\")\n parser.add_argument(\"--files\", nargs=\"+\", required=True, help=\"CSV filenames to profile\")\n parser.add_argument(\"--max-rows\", type=int, default=500, help=\"Max rows to sample (default: 500)\")\n parser.add_argument(\"--output\", type=str, default=None, help=\"Write JSON to file instead of stdout\")\n args = 
parser.parse_args()\n\n base = Path(args.dir)\n results = []\n\n for fname in args.files:\n fpath = base / fname\n if not fpath.exists():\n print(f\"⚠️ File not found: {fpath}\", file=sys.stderr)\n results.append({\"file\": fname, \"error\": f\"File not found: {fpath}\"})\n continue\n print(f\"Profiling {fname}...\", file=sys.stderr)\n results.append(profile_file(fpath, args.max_rows))\n\n output = json.dumps(results, indent=2, default=str)\n\n if args.output:\n Path(args.output).write_text(output, encoding=\"utf-8\")\n print(f\"Report written to {args.output}\", file=sys.stderr)\n else:\n print(output)\n\n\nif __name__ == \"__main__\":\n main()\n",
|
|
126
135
|
},
|
|
127
136
|
{
|
|
128
|
-
relativePath: "references/
|
|
129
|
-
content: "# Section Descriptions\r\n\r\nShow this to the user during the **Confirm sections** workflow step.\r\nPresent each section with its description, then ask which to include.\r\n\r\n---\r\n\r\n## Standard Sections\r\n\r\n**1. Implementation Plan** (`impl-plan`)\r\nProduces a phased task plan with completion criteria and a RAID log\r\n(Risks, Assumptions, Issues, Dependencies). Updated continuously as\r\nother sub-agents progress. Outputs: `implementation-plan.md`.\r\n\r\n**2. Business Process Mapping** (`biz-process`)\r\nMaps the requirements and process skills into a Standard Operating\r\nProcedure (SOP) — a sequenced table of steps, skill names, parameters,\r\nand whether each step is automated or manual. Identifies gaps and asks\r\nclarifying questions if requirements are insufficient.\r\nOutputs: `sop.md`.\r\n\r\n**3. Solution Architecture** (`architecture`)\r\nPresents the solution design in plain, non-technical language: which\r\nplatform capabilities are used, what can be automated vs. done manually,\r\nand clear instructions for any manual steps. Produces a concise\r\nspecification document. Outputs: `specification.md`.\r\n\r\n**4. Security, Testing and Governance** (`governance`)\r\nOutlines safety controls, human-approval requirements, testing steps,\r\nand responsible AI alignment (Microsoft RAI framework: Fairness,\r\nReliability, Privacy, Inclusiveness, Transparency, Accountability).\r\nProduces a governance and deployment plan. Outputs: `governance-plan.md`.\r\n\r\n---\r\n\r\n## Confirmation Prompt\r\n\r\nAsk the user:\r\n\r\n> \"The orchestration agent can include the following sections. Which would\r\n> you like to include?\"\r\n>\r\n> 1. **Implementation Plan** — phased task plan + RAID log\r\n> 2. **Business Process Mapping** — SOP with sequenced steps and parameters\r\n> 3. **Solution Architecture** — plain-language specification\r\n> 4. 
**Security, Testing and Governance** — safety controls + RAI alignment\r\n>\r\n> Reply with the numbers you want (e.g. \"1,2,3,4\" for all) or describe\r\n> any changes.\r\n\r\nDefault: all four sections are included.\r\n",
|
|
137
|
+
relativePath: "references/silver-to-gold-pattern-driven.md",
|
|
138
|
+
content: "# Approach 2 — Pattern-Driven Gold Star-Schema Generation\n\nGenerate Spark SQL Materialized Lake View (MLV) scripts for a **Power BI-optimised\nstar schema** from silver CSV files alone. The agent profiles every file, classifies\nit as a dimension source or fact source, applies a standardised catalogue of\ngold-layer transformations, and validates the result against star schema design\nprinciples before presenting it.\n\n---\n\n## When to use\n\nThe user supplies **only silver CSV files** — no target gold schema is provided.\nThe goal is to propose and generate a best-practice star schema gold layer\noptimised for Power BI semantic model consumption.\n\n---\n\n## Prerequisites\n\n| Item | Detail |\n|---|---|\n| Silver CSVs | Uploaded or provided in context |\n| SQL dialect | Spark SQL (Fabric Lakehouse MLV syntax) |\n| Output path | `/mnt/user-data/outputs/gold_layer_mlv.sql` |\n\n---\n\n## Star Schema Design Principles\n\nThese principles govern every decision in the workflow. The agent must validate\nevery proposed table against these rules before generating SQL.\n\n### Core model structure\n\nA star schema organises data into two table types connected by relationships:\n\n- **Dimension tables** (the \"one\" side): contain descriptive attributes for\n filtering, slicing, and grouping. Each dimension has a unique key column\n (one row per entity). Name using singular nouns (`Hotel`, `Manager`, `Date`).\n- **Fact tables** (the \"many\" side): contain measurable, quantitative data at a\n consistent grain plus foreign keys to every related dimension. Name using\n business-process nouns (`Revenue`, `Expenses`, `Orders`).\n\n### Key design rules\n\n1. **Separate dimensions from facts.** Never embed descriptive attributes in a\n fact table when they belong in a dimension. A wide denormalised table is an\n anti-pattern — split it.\n2. **Consistent grain.** Every row in a fact table represents the same thing\n (e.g., one hotel × one month). 
Never mix grains in one table.\n3. **Surrogate keys.** Add integer surrogate keys (via `DENSE_RANK`) when the\n source lacks clean unique identifiers or when the natural key is a long string.\n Surrogate keys improve join performance and enable unknown-member rows.\n4. **Date dimension.** Always create a dedicated date table. Include fiscal\n periods if relevant. In Power BI this table will be marked as the date table.\n5. **Flatten dimensions.** Do not snowflake (normalise) dimensions. `dim_hotel`\n should include city and country directly, not point to a separate `dim_city`.\n Extra joins hurt Power BI performance.\n6. **Unknown / unassigned member rows.** Every dimension must have a fallback row\n so that fact records with NULL foreign keys still resolve to a valid dimension\n member. This prevents blank rows in Power BI visuals.\n\n### Special dimension types\n\n| Type | Description | When to use |\n|---|---|---|\n| **Role-playing** | Same dimension used via different relationships (e.g., OrderDate vs ShipDate → dim_date) | Fact has 2+ date/entity references of the same type. Power BI: inactive relationships + `USERELATIONSHIP` in DAX, or duplicate the dimension. |\n| **Slowly Changing (Type 2)** | Tracks historical changes with StartDate/EndDate/IsCurrent | Silver SCD table exists. Gold dim holds only distinct/current attributes; SCD join happens in fact views. |\n| **Junk** | Combines multiple low-cardinality flags into one dimension | Fact has 3+ boolean/flag columns (IsRush, IsGift, IsOnline). Combine all permutations into a single dimension. |\n| **Degenerate** | Transaction identifiers kept directly in the fact, not in a dimension | The only attribute is the ID itself with no descriptive columns. Do not create a dimension for it. 
|\n\n### Anti-patterns the agent must prevent\n\n| Anti-pattern | How the agent detects it | Resolution |\n|---|---|---|\n| **Wide denormalised table** | A proposed gold table has both descriptive text and numeric measures | Split into dimension + fact |\n| **Snowflaked dimensions** | A proposed dimension references another dimension via FK | Flatten — bring attributes directly into the parent dimension |\n| **Many-to-many without bridge** | Two dimensions related through a fact with no single FK path | Add bridge/junction table or restructure |\n| **Mixed-grain fact** | Proposed fact has rows at different granularities | Separate into distinct fact tables per grain |\n| **Dimension without unique key** | Proposed dimension has duplicates on the PK column | Add DISTINCT or investigate missing SCD pattern |\n| **Fact with embedded attributes** | Proposed fact includes descriptive text alongside measures | Promote text to a dimension; replace with FK |\n| **Pointless one-column dimension** | A dimension that contains only an ID and no descriptive attributes | Keep the ID as a degenerate dimension in the fact |\n\n---\n\n## Workflow\n\nExecute every step. Do not skip.\n\n### Step 1 — Inventory and classify the silver tables\n\n1. List all uploaded CSVs.\n2. For each file, run the profiler from Step 2.\n3. Classify each silver table:\n\n| Role | Detection signals |\n|---|---|\n| **Dimension source** | Low row count (<500), descriptive text columns, a natural key (ID column), no date-series, no monetary measures |\n| **Fact source** | Higher row count, has `MonthStart`/date column, numeric measures (revenue, amount, spend, attendance, quantity), FK-like ID columns |\n| **SCD source** | StartDate/EndDate pairs, IsCurrent flag, same entity ID repeats with different date ranges |\n| **Bridge / mapping** | Two+ ID columns, no measures, maps one entity to another |\n\n4. Identify the **grain** of each fact source by listing its non-measure columns.\n5. 
Check for **categorical text columns in fact sources** (low cardinality, not IDs) — these are candidates for surrogate-key dimensions.\n6. Check for **multiple boolean/flag columns in fact sources** — these are junk dimension candidates.\n7. Check for **multiple date columns in fact sources** — these signal role-playing dimension needs.\n8. Present classification to the user for confirmation.\n\n### Step 2 — Profile every CSV\n\n> **Column naming note:** Silver CSV columns may not match the actual silver delta\n> table columns if the CSVs were manually created or exported before transformations.\n> Always verify actual Fabric Lakehouse column names before writing SQL.\n\nFor each CSV, capture: columns, inferred dtypes, row count, unique counts, null\ncounts, sample values, whether columns look like IDs, measures, dates, or booleans.\nUse `encoding='utf-8-sig'` for BOM handling.\n\n### Step 3 — Apply the transformation catalogue\n\nWalk through the catalogue below **in order**. For each silver table, determine\nwhich gold views it feeds and which transformations apply.\n\n---\n\n#### CATALOGUE OF GOLD-LAYER TRANSFORMATIONS\n\n##### G01 — Generate a date dimension\n\n| Condition | At least one fact source has a date/month column |\n|---|---|\n| Trigger | Always generate `gold.dim_date` |\n\n**Steps:**\n1. Scan all fact source tables; collect every distinct date/month column.\n2. Determine date range: `min(all dates)` to last day of `max(all dates)` month.\n3. Generate daily calendar: `explode(sequence(MinDate, MaxDate, interval 1 day))`.\n4. Ask the user for the **fiscal year start month** (default: January).\n5. 
Include **display + sort column pairs** for Power BI (critical for Sort by Column):\n\n| Display column | Sort column | Power BI usage |\n|---|---|---|\n| `CalendarMonthName` (MMMM) | `CalendarMonthNumber` (1–12) | Sort month names chronologically |\n| `CalendarMonthNameShort` (MMM) | `CalendarMonthNumber` | Same |\n| `DayOfWeekName` (EEEE) | `DayOfWeekNumber` (1–7) | Sort day names in weekday order |\n| `CalendarQuarter` (Q1–Q4) | `CalendarQuarterNumber` (1–4) | Sort quarters |\n| `FiscalPeriodLabel` (FP01–FP12) | `FiscalPeriodNumber` (1–12) | Sort fiscal periods |\n\n**Standard date dimension columns:**\n\n| Column | SQL |\n|---|---|\n| `DateKey` | `CalendarDate` (PK) |\n| `DateKeyInt` | `year * 10000 + month * 100 + day` |\n| `MonthStartDate` | `date_trunc('month', CalendarDate)` |\n| `MonthEndDate` | `last_day(CalendarDate)` |\n| `CalendarYear` | `year(CalendarDate)` |\n| `CalendarMonthNumber` | `month(CalendarDate)` |\n| `CalendarMonthName` | `date_format(CalendarDate, 'MMMM')` |\n| `CalendarMonthNameShort` | `date_format(CalendarDate, 'MMM')` |\n| `CalendarQuarter` | `concat('Q', quarter(CalendarDate))` |\n| `CalendarQuarterNumber` | `quarter(CalendarDate)` |\n| `CalendarDayNumber` | `day(CalendarDate)` |\n| `WeekOfYear` | `weekofyear(CalendarDate)` |\n| `DayOfWeekNumber` | `dayofweek(CalendarDate)` — 1=Sun, 7=Sat |\n| `DayOfWeekName` | `date_format(CalendarDate, 'EEEE')` |\n| `DayOfWeekNameShort` | `date_format(CalendarDate, 'E')` |\n| `IsWeekend` | `CASE WHEN dayofweek(CalendarDate) IN (1,7) THEN 1 ELSE 0 END` |\n| `IsMonthStart` | `CASE WHEN CalendarDate = date_trunc('month', CalendarDate) THEN 1 ELSE 0 END` |\n| `IsMonthEnd` | `CASE WHEN CalendarDate = last_day(CalendarDate) THEN 1 ELSE 0 END` |\n| `FiscalYear` | See G02 |\n| `FiscalPeriodNumber` | See G02 |\n| `FiscalPeriodLabel` | See G02 |\n\n##### G02 — Add fiscal year logic\n\n| Condition | User confirms a fiscal year start month (or data context suggests one) |\n|---|---|\n| Parameter | 
`FISCAL_START_MONTH` (integer 1–12) |\n\n**Formulas:**\n\n```sql\nCASE WHEN month(CalendarDate) >= ${FISCAL_START_MONTH}\n THEN year(CalendarDate)\n ELSE year(CalendarDate) - 1\nEND AS FiscalYear,\n\n(((month(CalendarDate) + (12 - ${FISCAL_START_MONTH})) % 12) + 1) AS FiscalPeriodNumber,\n\nconcat('FP', lpad(\n (((month(CalendarDate) + (12 - ${FISCAL_START_MONTH})) % 12) + 1),\n 2, '0'\n)) AS FiscalPeriodLabel\n```\n\nIf fiscal year = calendar year: set `FISCAL_START_MONTH = 1` and simplify.\n\n##### G03 — Build pass-through dimensions (flatten, no snowflaking)\n\n| Condition | Silver table classified as **Dimension source** |\n|---|---|\n| Action | Create `gold.dim_<entity>` with selected columns + unknown member row |\n\n**Steps:**\n1. Select only columns needed by downstream facts (key + descriptive attributes).\n Drop operational columns not useful for analysis.\n2. **Flatten**: If any attributes could be split into a sub-dimension (city, country,\n geography), keep them directly in this dimension. Do not create a separate\n `dim_city` or `dim_country`. Snowflaking is an anti-pattern in Power BI.\n3. Add an unknown member row via `UNION ALL`:\n - String keys: `'UNKNOWN'` as key, `'Unknown <Entity>'` for name, `NULL`/`'Unknown'` for attributes.\n - Integer keys: `0` as key (the same value G09 uses when it COALESCEs NULL integer FKs).\n4. If duplicates exist on the natural key, wrap in `SELECT DISTINCT`.\n\n##### G04 — Build SCD-sourced dimensions\n\n| Condition | Silver table classified as **SCD source** |\n|---|---|\n| Action | Create `gold.dim_<role>` with distinct entity attributes + unassigned row |\n\n**Steps:**\n1. `SELECT DISTINCT` the entity ID and name columns only (drop StartDate/EndDate/IsCurrent — SCD join happens in facts).\n2. `COALESCE(EntityID, 'UNASSIGNED')` for null IDs.\n3. Add unassigned row: `UNION SELECT 'UNASSIGNED', 'Unassigned <Role>'`.\n4. 
Use `UNION` (not `UNION ALL`) to deduplicate if COALESCE creates a match.\n\n##### G05 — Build surrogate-key dimensions (from fact categorical columns)\n\n| Condition | A fact source has a categorical text column (low cardinality, not an ID) that should become a dimension FK |\n|---|---|\n| Action | Create `gold.dim_<category>` with integer surrogate key |\n\n**Detection**: Look for columns in fact sources that are text/string, low\ncardinality (<100 distinct), not ending in `ID`, and descriptive (category names,\ntypes, labels).\n\n**Steps:**\n1. `SELECT DISTINCT CategoryColumn FROM silver.fact_source`.\n2. `DENSE_RANK() OVER (ORDER BY CategoryColumn) AS CategoryID`.\n3. Add unknown row: `UNION ALL SELECT 0, 'Unknown <Category>'`.\n\n**When NOT to create a surrogate-key dimension:**\n- If the column has only 2–4 values and no descriptive attributes beyond the name itself, it may be better kept as a **degenerate dimension** in the fact. Ask the user.\n- If the column is a transaction identifier (OrderNumber, InvoiceID), it is a degenerate dimension — keep in the fact.\n\n##### G06 — Detect and handle junk dimension candidates\n\n| Condition | A fact source has 3+ boolean/flag columns (0/1, Yes/No) |\n|---|---|\n\n**Steps:**\n1. Identify all boolean/flag columns in the fact source.\n2. If 3 or more exist, propose a **junk dimension** that contains all permutations:\n\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_flags AS\nWITH permutations AS (\n SELECT DISTINCT Flag1, Flag2, Flag3 FROM silver.fact_source\n)\nSELECT\n DENSE_RANK() OVER (ORDER BY Flag1, Flag2, Flag3) AS FlagGroupID,\n Flag1, Flag2, Flag3\nFROM permutations\nUNION ALL\nSELECT 0, NULL, NULL, NULL;\n```\n\n3. Replace the 3 flag columns in the fact with a single `FlagGroupID` FK.\n4. 
Present this as a suggestion — the user may prefer keeping flags in the fact.\n\n##### G07 — Detect role-playing dimensions\n\n| Condition | A fact source has 2+ date columns (e.g., OrderDate, ShipDate, DueDate) or 2+ FK columns of the same dimension type |\n|---|---|\n\n**Steps:**\n1. Identify the multiple columns.\n2. Generate only ONE dimension (e.g., `gold.dim_date`).\n3. In the fact, create separate FK columns: `OrderDateKey`, `ShipDateKey`.\n4. Add a SQL comment:\n\n```sql\n-- Role-playing dimension: OrderDateKey and ShipDateKey both reference gold.dim_date.\n-- In Power BI, set one relationship as active and use USERELATIONSHIP() in DAX\n-- for the inactive relationship, or duplicate the date table as a query reference.\n```\n\n##### G08 — Conform fact date keys\n\n| Condition | Every fact source |\n|---|---|\n| Action | Standardise date column to `DateKey` |\n\n```sql\ndate_trunc('month', SourceDateColumn) AS DateKey\n```\n\nDefault grain: **monthly** (first of month). If data is daily and user wants daily\ngrain, use the raw date. Ask: \"Should fact tables use monthly or daily date key grain?\"\n\n##### G09 — COALESCE foreign keys to unknown/unassigned members\n\n| Condition | Every FK column in a fact that could contain NULLs |\n|---|---|\n\n| FK type | COALESCE value | Matches dimension |\n|---|---|---|\n| String entity key | `'UNKNOWN'` | Unknown row in dim |\n| Manager/role key | `'UNASSIGNED'` | Unassigned row in dim |\n| Integer surrogate | `0` | Unknown row (ID=0) |\n\nApply in the final SELECT, **not** before any JOINs.\n\n##### G10 — SCD point-in-time join for role assignment\n\n| Condition | A fact needs a manager/owner but has no assignment column, AND an SCD table exists |\n|---|---|\n\n```sql\nLEFT JOIN silver.scd_table ma\n ON fact.EntityID = ma.EntityID\n AND fact.MonthStart BETWEEN ma.StartDate AND ma.EndDate\n```\n\nAlways LEFT JOIN. 
COALESCE the result to `'UNASSIGNED'`.\n\n##### G11 — Surrogate key lookup in facts\n\n| Condition | A fact references a surrogate-key dimension (G05) |\n|---|---|\n\n*Approach A — join to gold dim view:*\n```sql\nLEFT JOIN gold.dim_category dc ON lower(dc.CategoryName) = lower(fact.CategoryCol)\n```\n\n*Approach B — inline CTE with matching DENSE_RANK:*\n```sql\nWITH category_lookup AS (\n SELECT CategoryCol, DENSE_RANK() OVER (ORDER BY CategoryCol) AS CategoryID\n FROM (SELECT DISTINCT CategoryCol FROM silver.source) c\n)\nLEFT JOIN category_lookup cl ON lower(cl.CategoryCol) = lower(fact.CategoryCol)\n```\n\nUse `COALESCE(resolved_id, 0)` as fallback. Be consistent across all facts.\n\n##### G12 — Rename and round measures\n\n| Condition | Every numeric measure column in a fact |\n|---|---|\n\n**Naming**: `<Metric><Currency>` — e.g., `RevenueAmountGBP`, `TotalSpendGBP`.\nAsk the user for the **base currency code** if not obvious.\n\n**Rounding**: `CAST(ROUND(col, 2) AS DECIMAL(18,2))` for monetary.\n`CAST(col AS BIGINT)` for counts/quantities.\n\n##### G13 — Prune operational columns from facts\n\n| Condition | Fact source has descriptive/categorical columns already covered by a dimension |\n|---|---|\n\nA well-formed fact contains only:\n- `DateKey` (FK to dim_date)\n- Entity FKs (HotelID, ManagerID, CategoryID, etc.)\n- Numeric measures\n\nRemove any text column that has been promoted to a dimension (G05) and replace\nwith its surrogate FK.\n\n**Exception — degenerate dimensions**: Keep transaction IDs (OrderNumber, InvoiceID)\nand low-value categorical columns (≤4 values, no attributes) directly in the fact.\nDo not create a pointless dimension for them. 
Ask the user if borderline.\n\n---\n\n### Step 4 — Propose the gold layer (with anti-pattern check)\n\nBefore generating SQL, present a **star schema plan** to the user.\n\n**Dimensions:**\n```\nGold dimension | Source | Key type | Special type | Transforms\n---------------------------|----------------------------|-----------|-----------------|------------\ngold.dim_date | All fact date ranges | DateKey | Generated | G01, G02\ngold.dim_hotel | silver.hotel_metadata | HotelID | Pass-through | G03\ngold.dim_manager | silver.manager_scd | ManagerID | SCD-sourced | G04\ngold.dim_expense_category | silver.expenses (distinct) | Surrogate | Surrogate-key | G05\n```\n\n**Facts:**\n```\nGold fact | Source | Grain | Transforms\n---------------------------|----------------------------|---------------------|-------------\ngold.fact_revenue | silver.revenue_monthly | Hotel × Month | G08, G09, G12\ngold.fact_expenses | silver.expenses_monthly | Hotel × Month × Cat | G08-G13\n```\n\n**Anti-pattern validation** (run before presenting):\n- [ ] No proposed table mixes dimensions and measures\n- [ ] No dimension references another dimension via FK\n- [ ] Each fact has a single consistent grain\n- [ ] Degenerate dimensions stay in facts\n- [ ] Junk dimension candidates are flagged if applicable\n\nInclude for each table: column list, FK mappings, assumptions, ambiguities.\n\n**Wait for user confirmation** before generating SQL.\n\n### Step 5 — Generate the SQL script\n\n> Before committing to MLV definitions, generate standalone `SELECT` validation\n> queries (with `LIMIT 20`) that the user can run against the Fabric SQL Analytics\n> Endpoint. Write to `silver_to_gold_validation.sql`.\n\n**File structure (dependency order):**\n1. `CREATE SCHEMA IF NOT EXISTS gold;`\n2. Comment header (fiscal year, currency, grain, Power BI considerations)\n3. `gold.dim_date` (generated)\n4. Pass-through dimensions\n5. SCD-sourced dimensions\n6. 
Surrogate-key dimensions (and junk dimensions if applicable)\n7. Fact views\n\n**CTE conventions within fact views:**\n- `normalized` — date_trunc, column selection\n- `category_lookup` — surrogate key resolution\n- Final SELECT: COALESCE all FKs, ROUND all measures, alias to standard names\n\n**Naming conventions:**\n- Dimensions: `gold.dim_<entity>` — singular noun, snake_case\n- Facts: `gold.fact_<process>` — process noun, snake_case\n- Surrogate keys: `<Entity>ID` (PascalCase, INT)\n- Natural keys: unchanged from silver\n- DateKey: DATE\n- Measures: `<Metric><Currency>`\n- Unknown: `0` (int), `'UNKNOWN'` (string)\n- Unassigned: `'UNASSIGNED'` (role-based)\n\n**Spark SQL reminders:**\n`DENSE_RANK()`, `COALESCE()`, `UNION ALL` (unknown rows) / `UNION` (SCD dims),\n`date_trunc('month')`, `sequence() + explode()`, `date_format('MMMM'/'EEEE')`,\n`dayofweek()` (1=Sun, 7=Sat), `lpad()`, `last_day()`,\n`CAST(ROUND(x,2) AS DECIMAL(18,2))`, `CAST(x AS BIGINT)`.\n\n**Notebook documentation (when delivering as .ipynb):**\nWhen delivering as a notebook, each MLV must have a detailed markdown cell before\nits code cell containing: (1) what the view does, (2) star schema role (dim/fact),\n(3) surrogate key logic, (4) cross-view dependencies, (5) Power BI usage notes.\nThe notebook header must explain the star schema structure, list all dimensions and\nfacts, and provide execution order guidance.\n\n### Step 6 — Validate against star schema checklist\n\n**Data validation (Python):**\n- Dimensions: row count = distinct key count + 1 (unknown row)\n- Surrogate keys: gap-free sequence starting at 1 (0 for unknown)\n- Facts: FK values all exist in corresponding dimension\n- Date dimension: full coverage, fiscal logic correct at boundaries\n\n**Star schema structural checklist:**\n\n- [ ] Every gold table is clearly a dimension or a fact — no hybrids\n- [ ] Every fact has FKs to all related dimensions\n- [ ] Every dimension has a unique primary key (no duplicates)\n- [ ] A date 
dimension exists and spans the full fact date range\n- [ ] Date dimension has display + sort column pairs for Power BI\n- [ ] Every dimension has an unknown/unassigned member row\n- [ ] No snowflaking — no dimension has FKs to other dimensions\n- [ ] No fact embeds descriptive attributes belonging in a dimension\n- [ ] Consistent grain within each fact table\n- [ ] No circular relationship paths\n- [ ] Consistent naming: `dim_` for dimensions, `fact_` for facts\n- [ ] Surrogate key DENSE_RANK ORDER BY identical in dim views and fact CTEs\n- [ ] Role-playing dimensions documented (Power BI inactive relationships)\n- [ ] Degenerate dimensions remain in facts (not split into pointless dims)\n- [ ] Junk dimension candidates addressed (combined or deliberately kept separate)\n\nReport failures as warnings.\n\n### Step 7 — Write the final file\n\n1. Write to `/mnt/user-data/outputs/gold_layer_mlv.sql`.\n2. Present the file.\n3. Summarise: dimension count, fact count, total measures, fiscal year config.\n4. Note role-playing / junk / degenerate dimension decisions.\n5. Include Power BI modelling notes (mark date table, Sort by Column setup, USERELATIONSHIP for role-playing dims).\n\n---\n\n## Gotchas\n\n- **Surrogate key determinism**: `DENSE_RANK() OVER (ORDER BY col)` gives tied values the same ID — rank a `SELECT DISTINCT` list, and add a tiebreaker column to the ORDER BY if ties are still possible.\n- **UNION ALL vs UNION**: `UNION ALL` for unknown rows. `UNION` for SCD dims where COALESCE may duplicate the unassigned row.\n- **SCD fan-out**: Overlapping SCD date ranges duplicate fact rows. Validate non-overlap.\n- **Surrogate key consistency**: DENSE_RANK ORDER BY must match between dim view and fact CTE.\n- **COALESCE placement**: Apply in final SELECT, never in JOIN ON clause.\n- **Date dimension bounds**: Earliest fact date to last day of latest fact month.\n- **Fiscal year off-by-one**: Test the modular arithmetic at January and at the fiscal start month.\n- **dayofweek()**: Spark returns 1=Sunday, 7=Saturday. 
Weekend = `IN (1, 7)`.\n- **Power BI sort columns**: Always pair display names with numeric sort columns. Without this, months sort alphabetically (April, August, December…) instead of chronologically.\n- **No snowflaking**: Flatten all attributes into the dimension. If `dim_hotel` has City and Country, do NOT create `dim_geography`. Power BI performs best with flat dimensions.\n- **Degenerate vs real dimensions**: A column needs a dimension only if it has descriptive attributes beyond the key itself. OrderNumber with no other attributes is degenerate — keep in fact.\n- **Junk dimensions**: Combining 3+ boolean flags into a single dimension reduces model complexity but adds a join. Propose it; let the user decide.\n- **Multiple facts, conformed dimensions**: All facts referencing `dim_hotel` must use the same `HotelID` key and `'UNKNOWN'` fallback. Consistency is critical for drill-across.\n- **BOM characters**: `encoding='utf-8-sig'` when reading CSVs.\n- **Column naming mismatch**: Verify actual lakehouse column names match what the\n SQL references. Silver delta table columns are PascalCase aliases from the\n bronze→silver transformation, not the original CSV headers.\n",
|
|
130
139
|
},
|
|
131
140
|
{
|
|
132
|
-
relativePath: "
|
|
133
|
-
content: "#!/usr/bin/env python3\r\n# /// script\r\n# requires-python = \">=3.8\"\r\n# dependencies = []\r\n# ///\r\n\"\"\"\r\nScaffold the output folder structure for an orchestration agent run.\r\n\r\nUsage:\r\n python scripts/scaffold_output.py --process-name <NAME> --username <USER> [OPTIONS]\r\n\r\nOptions:\r\n --process-name NAME Short process name (lowercase, hyphens; used in folder name)\r\n --username USER Username for folder naming\r\n --output-dir PATH Parent directory for outputs (default: ./outputs)\r\n --sections SECTIONS Comma-separated section keys to include.\r\n Valid values: impl-plan, biz-process, architecture, governance\r\n Default: all four\r\n --help Show this message and exit\r\n\r\nExamples:\r\n python scripts/scaffold_output.py --process-name fabric-lakehouse --username rishi\r\n python scripts/scaffold_output.py --process-name my-process --username alice --sections impl-plan,biz-process\r\n\"\"\"\r\n\r\nimport argparse\r\nimport json\r\nimport sys\r\nfrom datetime import datetime\r\nfrom pathlib import Path\r\n\r\nSECTION_MAP = {\r\n \"impl-plan\": (\"01-implementation-plan\", \"Implementation Plan\"),\r\n \"biz-process\": (\"02-business-process\", \"Business Process Mapping\"),\r\n \"architecture\": (\"03-solution-architecture\",\"Solution Architecture\"),\r\n \"governance\": (\"04-governance\", \"Security, Testing and Governance\"),\r\n}\r\n\r\nALL_SECTIONS = list(SECTION_MAP.keys())\r\n\r\n\r\ndef main():\r\n parser = argparse.ArgumentParser(\r\n description=\"Scaffold the output folder structure for an orchestration agent run.\",\r\n formatter_class=argparse.RawDescriptionHelpFormatter,\r\n epilog=__doc__,\r\n )\r\n parser.add_argument(\"--process-name\", required=True,\r\n help=\"Short process name (lowercase, hyphens)\")\r\n parser.add_argument(\"--username\", required=True,\r\n help=\"Username for folder naming\")\r\n parser.add_argument(\"--output-dir\", type=Path, default=Path(\"./outputs\"),\r\n help=\"Parent directory for 
outputs (default: ./outputs)\")\r\n parser.add_argument(\"--sections\", default=\",\".join(ALL_SECTIONS),\r\n help=\"Comma-separated section keys to include\")\r\n args = parser.parse_args()\r\n\r\n # Validate process name\r\n import re\r\n if not re.match(r\"^[a-z0-9][a-z0-9-]*[a-z0-9]$\", args.process_name):\r\n print(json.dumps({\r\n \"status\": \"error\",\r\n \"message\": \"process-name must be lowercase letters, numbers and hyphens only.\"\r\n }))\r\n sys.exit(1)\r\n\r\n sections = [s.strip() for s in args.sections.split(\",\") if s.strip()]\r\n invalid = [s for s in sections if s not in SECTION_MAP]\r\n if invalid:\r\n print(json.dumps({\r\n \"status\": \"error\",\r\n \"message\": f\"Unknown section(s): {invalid}. Valid: {ALL_SECTIONS}\"\r\n }))\r\n sys.exit(1)\r\n\r\n timestamp = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\r\n folder_name = f\"{args.process_name}_{timestamp}_{args.username}\"\r\n root = args.output_dir / folder_name\r\n root.mkdir(parents=True, exist_ok=True)\r\n\r\n created_subfolders = []\r\n for key in sections:\r\n dirname, label = SECTION_MAP[key]\r\n subfolder = root / dirname\r\n subfolder.mkdir(exist_ok=True)\r\n created_subfolders.append({\"key\": key, \"label\": label, \"path\": str(subfolder)})\r\n\r\n # Initialise the audit trail\r\n change_log = root / \"CHANGE_LOG.md\"\r\n change_log.write_text(\r\n f\"# Change Log: {args.process_name}\\n\\n\"\r\n f\"Created: {datetime.now().isoformat(timespec='seconds')} by {args.username}\\n\\n\"\r\n \"---\\n\\n\"\r\n \"_Entries are appended here by each sub-agent as the orchestration runs._\\n\",\r\n encoding=\"utf-8\",\r\n )\r\n\r\n output = {\r\n \"status\": \"ok\",\r\n \"root\": str(root),\r\n \"agent_md_path\": str(root / \"agent.md\"),\r\n \"change_log_path\": str(change_log),\r\n \"subfolders\": created_subfolders\r\n }\r\n\r\n print(json.dumps(output, indent=2))\r\n\r\n\r\nif __name__ == \"__main__\":\r\n main()\r\n",
|
|
141
|
+
relativePath: "references/silver-to-gold-schema-driven.md",
|
|
142
|
+
content: "# Approach 1 — Schema-Driven Gold Star-Schema Generation\n\nGenerate Spark SQL Materialized Lake View (MLV) scripts for a **Power BI-optimised\nstar schema** by comparing silver input CSVs against target gold output CSVs. The\nagent infers every dimension, fact, surrogate key, and join pattern by diffing\nschemas and sampling data — then validates the result against star schema design\nprinciples.\n\n---\n\n## When to use\n\nThe user supplies **both**:\n\n- One or more **silver CSV** files (conformed, cleaned data)\n- One or more **gold CSV** files (desired star-schema output — dimensions and facts)\n\nThe goal is to produce a `.sql` script of `CREATE OR REPLACE MATERIALIZED LAKE VIEW`\nstatements that, when executed against the silver tables in a Fabric Lakehouse,\nreproduce the gold outputs as a star schema ready for Power BI consumption.\n\n---\n\n## Prerequisites\n\n| Item | Detail |\n|---|---|\n| Silver CSVs | Uploaded or provided in context |\n| Gold CSVs | Uploaded or provided in context |\n| SQL dialect | Spark SQL (Fabric Lakehouse MLV syntax) |\n| Output path | `/mnt/user-data/outputs/gold_layer_mlv.sql` |\n\n---\n\n## Star Schema Design Principles\n\nThese principles govern every decision in the workflow. Refer back to them during\ngeneration and validation.\n\n### Core model structure\n\nA star schema organises data into two table types connected by relationships:\n\n- **Dimension tables** (the \"one\" side): contain descriptive attributes for\n filtering, slicing, and grouping. Each dimension has a unique key column\n (one row per entity). Name using singular nouns (`Hotel`, `Manager`, `Date`).\n- **Fact tables** (the \"many\" side): contain measurable, quantitative data at a\n consistent grain plus foreign keys to every related dimension. Name using\n business-process nouns (`Revenue`, `Expenses`, `Orders`).\n\n### Key design rules\n\n1. 
**Separate dimensions from facts.** Never embed descriptive attributes in a\n fact table when they belong in a dimension. A wide, denormalised table is an\n anti-pattern — split it.\n2. **Consistent grain.** Every row in a fact table represents the same thing\n (e.g., one hotel × one month). Never mix grains in one table.\n3. **Surrogate keys.** Add integer surrogate keys (via `DENSE_RANK`) when the\n source lacks clean unique identifiers, or when the natural key is a long\n string. Surrogate keys improve join performance and enable unknown-member rows.\n4. **Date dimension.** Always create a dedicated date table. Include fiscal\n periods if relevant. In Power BI, this table will be marked as the date table.\n5. **Flatten dimensions.** Do not snowflake (normalise) dimensions. A\n `dim_hotel` should include city and country directly, not point to a separate\n `dim_city`. Extra joins hurt Power BI performance.\n6. **Unknown / unassigned member rows.** Every dimension must have a fallback row\n so that fact records with NULL foreign keys still resolve to a valid dimension\n member. This prevents blank rows in Power BI visuals.\n\n### Special dimension types to detect\n\n| Type | Description | When to use |\n|---|---|---|\n| **Role-playing** | Same dimension used multiple times via different relationships (e.g., OrderDate vs ShipDate both pointing to dim_date) | When a fact has 2+ date columns or 2+ entity references of the same type. In Power BI, handle with inactive relationships + `USERELATIONSHIP` in DAX, or by duplicating the dimension table. |\n| **Slowly Changing (Type 2)** | Tracks historical changes with StartDate/EndDate/IsCurrent columns | When a silver SCD table exists. The gold dimension holds only current/distinct attributes; the SCD join happens in the fact view via point-in-time lookup. 
|\n| **Junk** | Combines multiple low-cardinality flags/indicators into a single dimension | When a fact has several boolean or 2–3 value categorical columns (IsRush, IsGift, IsOnline). Instead of one dimension per flag, combine into one junk dimension with all combinations. |\n| **Degenerate** | Transaction identifiers (OrderNumber, InvoiceID) kept directly in the fact table, not in a dimension | When the only attribute is the ID itself and there are no descriptive columns to justify a dimension. Keep as a column in the fact. |\n\n### Anti-patterns to detect and reject\n\n| Anti-pattern | Detection signal | Resolution |\n|---|---|---|\n| **Wide denormalised table** | A single gold table has both descriptive text columns and numeric measures with no clear key/FK structure | Split into dimension + fact |\n| **Snowflaked dimensions** | A gold dimension has an FK to another gold dimension (e.g., dim_city referenced by dim_hotel) | Flatten — bring attributes directly into the parent dimension |\n| **Many-to-many without bridge** | Two dimensions related through a fact with no single FK path | Add a bridge/junction table or restructure |\n| **Mixed-grain fact** | A fact table has rows at different granularities (some monthly, some daily; or some at line level, some at header level) | Separate into distinct fact tables, one per grain |\n| **Dimension without unique key** | A gold dimension has duplicate values on the primary key column | Add DISTINCT, or investigate for a missing SCD pattern |\n| **Fact with embedded dimension attributes** | A gold fact table has descriptive text columns alongside measures | Move to a dimension; replace with FK in the fact |\n\n---\n\n## Workflow\n\nExecute every step. Do not skip.\n\n### Step 1 — Inventory and classify the files\n\n1. List all uploaded CSVs.\n2. Ask the user to confirm which files are **silver** (source) and which are **gold** (target).\n3. 
Ask the user to confirm the **silver schema name** (default `silver`) and **gold schema name** (default `gold`).\n4. Classify every gold CSV:\n\n| Classification | Detection signals |\n|---|---|\n| **Dimension** | Filename contains `dim_`; low row count; unique key column; descriptive attributes; no monetary measures |\n| **Fact** | Filename contains `fact_`; higher row count; has `DateKey`; FK columns matching dimension keys; numeric measures |\n| **Generated dimension** | Dimension whose rows don't map 1:1 from any silver file (date table, surrogate-key dimension from `SELECT DISTINCT`) |\n\n5. For each gold file, identify the silver source(s).\n6. **Anti-pattern check**: Verify every gold table is cleanly dimension or fact. Flag hybrids.\n\n### Step 2 — Profile every CSV\n\n> **Column naming note:** Silver CSV columns may not match the actual silver delta\n> table columns if the CSVs were manually created or exported separately. Always\n> verify actual Fabric Lakehouse column names before writing SQL.\n\nFor each CSV (silver and gold), capture columns, dtypes, row counts, unique\ncounts, null counts, sample values, and min/max for numerics. Use\n`encoding='utf-8-sig'` to handle BOM.\n\n### Step 3 — Detect transformations by diffing silver → gold\n\n#### 3a — Dimension detection patterns\n\n| Signal in gold dimension | Transformation |\n|---|---|\n| Sequential integer column (1, 2, 3…) not in silver | **Surrogate key** — `DENSE_RANK() OVER (ORDER BY NaturalKey)` |\n| Extra row with ID = 0 or `'UNKNOWN'` | **Unknown member row** — `UNION ALL SELECT 0/'UNKNOWN', 'Unknown …'` |\n| Gold is a column subset of silver | **Attribute pruning** (keep only what Power BI needs) |\n| Fewer rows than silver, same key | **Deduplication** — `SELECT DISTINCT` |\n| `COALESCE` pattern (`UNASSIGNED` for NULLs) | **Null-safe key** |\n| Row count = distinct count + 1 | **Distinct + unknown row** |\n\n#### 3b — Generated date dimension detection\n\n1. 
Check if gold date range spans min-to-max of all fact `DateKey` columns.\n2. Identify fiscal year start by comparing `FiscalYear` vs `CalendarYear` at boundaries.\n3. Map derived columns to Spark SQL expressions.\n4. Verify display + sort column pairs exist (for Power BI Sort by Column):\n `CalendarMonthName` + `CalendarMonthNumber`, `DayOfWeekName` + `DayOfWeekNumber`,\n `FiscalPeriodLabel` + `FiscalPeriodNumber`.\n\n#### 3c — Special dimension type detection\n\n| Signal | Type | Action |\n|---|---|---|\n| Fact has 2+ date FK columns | **Role-playing** | Generate one dim_date; add comment for Power BI inactive relationships |\n| Silver has StartDate/EndDate/IsCurrent; gold dim has only distinct attributes | **SCD** | Dim holds distinct entities; SCD join in facts |\n| Fact has 3+ boolean columns (0/1) | **Junk dimension candidate** | Flag for user — combine or keep separate? |\n| Fact retains a text ID column with no matching dimension | **Degenerate** | Keep in fact |\n\n#### 3d — Fact detection patterns\n\n| Signal | Transformation |\n|---|---|\n| `DateKey` = `date_trunc('month', silver.MonthStart)` | Date key alignment |\n| `'UNKNOWN'` / `'UNASSIGNED'` where silver has NULLs | COALESCE FK |\n| Integer FK not in silver source | Surrogate key lookup join |\n| Gold fact has a ManagerID that the silver fact lacks (a silver SCD table supplies it) | SCD point-in-time join |\n| Measures with currency suffix, DECIMAL(18,2) | Rename + round |\n| Row count matches silver → pass-through; fewer rows → GROUP BY | Grain check |\n\n#### 3e — Cross-view dependencies and anti-pattern scan\n\n1. Map all table dependencies (topological sort for MLV creation order).\n2. Verify:\n - Dimensions have unique PKs.\n - Facts contain only keys + measures (no embedded descriptive text).\n - No mixed grain within any single fact.\n - No snowflaked dimensions.\n3. Alert the user on any failure before generating SQL.\n\n### Step 4 — Detect fiscal year logic\n\n1. Sample boundary rows. Identify fiscal year start month.\n2. 
Derive: `FiscalPeriodNumber = ((MonthNumber + (12 - StartMonth)) % 12) + 1`.\n3. Confirm with user.\n\n### Step 5 — Generate the SQL script\n\n> Before committing to MLV definitions, generate standalone `SELECT` validation\n> queries (with `LIMIT 20`) that the user can run against the Fabric SQL Analytics\n> Endpoint. Write to `silver_to_gold_validation.sql`.\n\n**File structure (dependency order):**\n1. `CREATE SCHEMA IF NOT EXISTS gold;`\n2. Comment header (fiscal year, currency, grain, Power BI target)\n3. Generated dimensions (date table)\n4. Pass-through dimensions (flattened — no snowflaking)\n5. SCD-sourced dimensions\n6. Surrogate-key dimensions\n7. Fact views\n\n**Dimension templates:**\n\n*Pass-through with unknown row (flattened):*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_entity AS\nSELECT EntityID, EntityName, City, Country\nFROM silver.entity_table\nUNION ALL\nSELECT 'UNKNOWN', 'Unknown Entity', 'Unknown', 'Unknown';\n```\n\n*SCD-sourced with unassigned row:*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_role AS\nSELECT DISTINCT\n COALESCE(RoleID, 'UNASSIGNED') AS RoleID,\n COALESCE(RoleName, 'Unassigned Role') AS RoleName\nFROM silver.role_scd\nUNION\nSELECT 'UNASSIGNED', 'Unassigned Role';\n```\n\n*Surrogate-key with unknown row:*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_category AS\nWITH categories AS (SELECT DISTINCT CategoryName FROM silver.source_table)\nSELECT DENSE_RANK() OVER (ORDER BY CategoryName) AS CategoryID, CategoryName\nFROM categories\nUNION ALL\nSELECT 0, 'Unknown Category';\n```\n\n*Generated date dimension (with fiscal logic):*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.dim_date AS\nWITH fact_months AS (\n SELECT date_trunc('month', DateCol) AS MonthStart FROM silver.fact1\n UNION ALL SELECT date_trunc('month', DateCol) FROM silver.fact2\n),\nbounds AS (\n SELECT min(MonthStart) AS MinDate,\n date_sub(add_months(max(MonthStart), 1), 1) AS MaxDate\n FROM 
fact_months\n),\ncalendar AS (\n SELECT explode(sequence(b.MinDate, b.MaxDate, interval 1 day)) AS CalendarDate\n FROM bounds b\n)\nSELECT\n c.CalendarDate AS DateKey,\n year(c.CalendarDate) AS CalendarYear,\n month(c.CalendarDate) AS CalendarMonthNumber, -- sort column\n date_format(c.CalendarDate, 'MMMM') AS CalendarMonthName, -- display column\n -- Fiscal (parameterised by start month)\n CASE WHEN month(c.CalendarDate) >= ${FISCAL_START_MONTH}\n THEN year(c.CalendarDate)\n ELSE year(c.CalendarDate) - 1 END AS FiscalYear,\n (((month(c.CalendarDate) + (12 - ${FISCAL_START_MONTH})) % 12) + 1) AS FiscalPeriodNumber,\n ...\nFROM calendar c;\n```\n\n**Fact templates:**\n\n*Simple fact (keys + measures only):*\n```sql\nCREATE OR REPLACE MATERIALIZED LAKE VIEW gold.fact_measure AS\nSELECT\n date_trunc('month', s.MonthStart) AS DateKey,\n COALESCE(s.EntityID, 'UNKNOWN') AS EntityID,\n COALESCE(s.RoleID, 'UNASSIGNED') AS RoleID,\n CAST(ROUND(s.Amount, 2) AS DECIMAL(18,2)) AS AmountGBP\nFROM silver.source_table s;\n```\n\n*With SCD join:*\n```sql\nLEFT JOIN silver.role_scd ma\n ON s.EntityID = ma.EntityID\n AND s.MonthStart BETWEEN ma.StartDate AND ma.EndDate\n```\n\n*With surrogate key lookup:*\n```sql\nWITH category_lookup AS (\n SELECT CategoryName, DENSE_RANK() OVER (ORDER BY CategoryName) AS CategoryID\n FROM (SELECT DISTINCT CategoryName FROM silver.source_table) c\n)\nLEFT JOIN category_lookup cl ON lower(cl.CategoryName) = lower(s.CategoryName)\n```\n\n**Naming conventions:**\n- Dimensions: `gold.dim_<entity>` — singular noun, snake_case\n- Facts: `gold.fact_<process>` — process noun, snake_case\n- Surrogate keys: `<Entity>ID` (PascalCase, INT)\n- Natural keys: unchanged from silver\n- DateKey: DATE type\n- Measures: `<Metric><Currency>` (e.g., `RevenueAmountGBP`)\n- Unknown: `0` (int surrogates), `'UNKNOWN'` (string keys)\n- Unassigned: `'UNASSIGNED'` (role-based)\n\n**Spark SQL reminders:**\n`DENSE_RANK()`, `COALESCE()`, `UNION ALL`, 
`date_trunc('month')`,\n`sequence() + explode()`, `date_format('MMMM'/'EEEE')`, `dayofweek()` (1=Sun, 7=Sat),\n`lpad()`, `last_day()`, `CAST(ROUND(x,2) AS DECIMAL(18,2))`, `CAST(x AS BIGINT)`.\n\n**Notebook documentation (when delivering as .ipynb):**\nWhen delivering as a notebook, each MLV must have a detailed markdown cell before\nits code cell containing: (1) what the view does, (2) star schema role (dim/fact),\n(3) surrogate key logic, (4) cross-view dependencies, (5) Power BI usage notes.\nThe notebook header must explain the star schema structure, list all dimensions and\nfacts, and provide execution order guidance.\n\n### Step 6 — Validate against star schema checklist\n\n**Data validation (Python):** Compare generated logic against gold target CSVs\n(column names, row counts, surrogate ranges, measure tolerance ±0.5%).\n\n**Star schema structural checklist:**\n\n- [ ] Every gold table is clearly a dimension or a fact — no hybrids\n- [ ] Every fact has FKs to all related dimensions\n- [ ] Every dimension has a unique primary key (no duplicate rows on the key)\n- [ ] A date dimension exists and spans the full fact date range\n- [ ] Date dimension has display + sort column pairs for Power BI (e.g., MonthName + MonthNumber)\n- [ ] Every dimension has an unknown/unassigned member row\n- [ ] No snowflaking — no dimension references another dimension via FK\n- [ ] No fact embeds descriptive attributes that belong in a dimension\n- [ ] Consistent grain within each fact table\n- [ ] No circular relationship paths\n- [ ] Consistent naming: `dim_` for dimensions, `fact_` for facts\n- [ ] Surrogate key DENSE_RANK ORDER BY is identical in dimension views and fact CTEs\n- [ ] Role-playing dimensions documented (for Power BI inactive relationships)\n- [ ] Degenerate dimensions (transaction IDs) remain in facts, not split needlessly\n- [ ] Junk dimension candidates flagged if 3+ boolean columns in a single fact\n\nReport failures as warnings before writing the 
file.\n\n### Step 7 — Write the final file\n\n1. Write to `/mnt/user-data/outputs/gold_layer_mlv.sql`.\n2. Present the file.\n3. Summarise: dimension count, fact count, surrogate key strategy, fiscal year, warnings.\n4. Note any role-playing dimensions and Power BI `USERELATIONSHIP` requirements.\n\n---\n\n## Gotchas\n\n- **Surrogate key determinism**: `DENSE_RANK(ORDER BY col)` needs a tiebreaker if ties are possible.\n- **UNION ALL vs UNION**: `UNION ALL` for unknown rows (key is distinct). `UNION` for SCD dims where COALESCE may duplicate the unassigned row.\n- **SCD fan-out**: Overlapping SCD date ranges duplicate fact rows. Validate non-overlap.\n- **Surrogate key consistency**: DENSE_RANK ORDER BY must match between dim view and fact CTE.\n- **COALESCE placement**: Apply in final SELECT, never in JOIN ON clause.\n- **Date dimension bounds**: Earliest fact date to last day of latest fact month.\n- **Fiscal year formula**: Test at January and at the fiscal start month for off-by-one.\n- **dayofweek()**: Spark returns 1=Sunday, 7=Saturday. Weekend = `IN (1, 7)`.\n- **Power BI sort columns**: Pair every display name column with a numeric sort column.\n- **No snowflaking**: Flatten all attributes into the dimension. No dim-to-dim FKs.\n- **Degenerate dimensions**: Keep transaction IDs in the fact. Do not create pointless one-column dimensions.\n- **Junk dimensions**: Suggest combining 3+ boolean flags into a single junk dimension.\n- **BOM characters**: `encoding='utf-8-sig'` when reading CSVs.\n- **Column naming mismatch**: Verify actual lakehouse column names match what the\n SQL references. Silver delta table columns are PascalCase aliases from the\n bronze→silver transformation, not the original CSV headers.\n",
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
relativePath: "references/sql-conventions.md",
|
|
146
|
+
content: "# Spark SQL Conventions for MLV Generation\n\nLoad this file during Phase 4 (Generate the SQL) when writing MLV definitions.\n\n## Naming Conventions\n\n- View names: `<schema>.<descriptive_snake_case>`\n- Column aliases: `PascalCase` (e.g., `HotelID`, `MonthStart`, `RevenueAmountGBP`)\n- CTE names: `snake_case`\n\n## CTE Conventions\n\n- `cleaned` — trims, casts, date parsing\n- `unpivoted` — if LATERAL VIEW STACK applies\n- `expanded` — if temporal/date-range expansion applies\n- `aggregated` — if GROUP BY applies\n- `normalized` — date_trunc, column selection (gold facts)\n- `category_lookup` — surrogate key resolution (gold facts)\n\n## Type Casting\n\n- IDs and counts → `INT`\n- Monetary values → `DECIMAL(18,2)` with `CAST(ROUND(expr, 2) AS DECIMAL(18,2))`\n- Rates → `DECIMAL(10,2)`\n- Quantities → `BIGINT` (gold layer)\n- Dates → `DATE` via `to_date()` with explicit format\n\n## Spark SQL Syntax (Non-Obvious)\n\n- Backtick reserved words: `` `Year` ``, `` `Month` ``, `` `Order` ``\n- `LATERAL VIEW STACK(n, 'label1', col1, ...)` for unpivot\n- `LATERAL VIEW explode(sequence(...))` for temporal expansion\n- `date_trunc('month', col)` for month normalisation\n- `regexp_replace()` — double-escape backslashes: `'\\\\$'` not `'\\$'`\n- `dayofweek()` returns 1=Sunday, 7=Saturday in Spark\n- `DENSE_RANK() OVER (ORDER BY col)` for deterministic surrogate keys\n- `COALESCE(fk, 'UNKNOWN')` in final SELECT, never in JOIN ON\n",
|
|
134
147
|
},
|
|
135
148
|
],
|
|
136
149
|
},
|
|
137
150
|
{
|
|
138
151
|
name: "csv-to-bronze-delta-tables",
|
|
152
|
+
category: "fabric",
|
|
139
153
|
files: [
|
|
140
154
|
{
|
|
141
155
|
relativePath: "SKILL.md",
|
|
142
|
-
content: "---\r\nname: csv-to-bronze-delta-tables\r\ndescription: >\r\n Use this skill to upload CSV files from a local machine into a Microsoft Fabric\r\n bronze lakehouse and convert them to delta tables. Triggers on: \"create delta\r\n tables from CSV files\", \"load CSVs into bronze lakehouse\", \"upload CSV to Fabric\r\n and create tables\", \"ingest CSV files to delta format in Fabric\", \"create bronze\r\n tables from local CSV\". Does NOT trigger for creating lakehouses, transforming\r\n existing delta tables, or non-Fabric storage targets.\r\nlicense: MIT\r\ncompatibility: Python 3.8+ required for scripts/. Fabric CLI (fab) must be installed for the CLI upload option.\r\n---\r\n\r\n# CSV to Bronze Delta Tables\r\n\r\nUploads CSV files from an operator's local machine to a Microsoft Fabric bronze\r\nlakehouse and converts them to delta tables. The lakehouse must already exist.\r\n\r\n> ⚠️ **GOVERNANCE RULE**: This skill **never executes `fab` CLI commands directly**.\r\n> All `fab cp`, `fab ln`, and `fab ls` commands are presented to the operator as\r\n> script blocks for them to run. 
The agent only generates and presents commands.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `WORKSPACE_NAME` | Fabric workspace name (exact, case-sensitive) | `\"Landon Finance Month End\"` |\r\n| `LAKEHOUSE_NAME` | Bronze lakehouse name (exact, case-sensitive) | `\"Lh_landon_finance_bronze\"` |\r\n| `LOCAL_CSV_FOLDER` | Relative path to local folder containing CSV files (CLI upload only) | `\"./Data\"` |\r\n| `LAKEHOUSE_FILES_FOLDER` | Folder name under the Files section of the lakehouse | `\"raw\"` |\r\n\r\n## Workflow\r\n\r\n- [ ] **Collect parameters** — If `WORKSPACE_NAME` or `LAKEHOUSE_NAME` are not\r\n provided, ask the operator for them before proceeding.\r\n\r\n- [ ] **Upload CSV files** — Present these three options and ask the operator to\r\n choose one:\r\n\r\n **Option 1 — OneLake File Explorer (Manual)**\r\n Open the OneLake File Explorer desktop app and drag-and-drop the CSV files into\r\n the target folder under the lakehouse Files section. No agent action required.\r\n\r\n **Option 2 — Fabric UI (Manual)**\r\n In the Fabric browser UI navigate to the lakehouse → Files section, open or create\r\n the target folder, click **Upload** and select the CSV files. No agent action required.\r\n\r\n **Option 3 — Fabric CLI (Automated)**\r\n > ⚠️ **Performance note**: The CLI uploads files one at a time. For large\r\n > batches (50+ files) this is significantly slower than Options 1 or 2.\r\n > Recommend Options 1 or 2 for bulk uploads.\r\n\r\n Ask for `LOCAL_CSV_FOLDER` as the **exact absolute path** to the local folder\r\n
|
|
156
|
+
content: "---\r\nname: csv-to-bronze-delta-tables\r\ndescription: >\r\n Use this skill to upload CSV files from a local machine into a Microsoft Fabric\r\n bronze lakehouse and convert them to delta tables. Triggers on: \"create delta\r\n tables from CSV files\", \"load CSVs into bronze lakehouse\", \"upload CSV to Fabric\r\n and create tables\", \"ingest CSV files to delta format in Fabric\", \"create bronze\r\n tables from local CSV\". Does NOT trigger for creating lakehouses, transforming\r\n existing delta tables, or non-Fabric storage targets.\r\nlicense: MIT\r\ncompatibility: Python 3.8+ required for scripts/. Fabric CLI (fab) must be installed for the CLI upload option.\r\n---\r\n\r\n# CSV to Bronze Delta Tables\r\n\r\nUploads CSV files from an operator's local machine to a Microsoft Fabric bronze\r\nlakehouse and converts them to delta tables. The lakehouse must already exist.\r\n\r\n> ⚠️ **GOVERNANCE RULE**: This skill **never executes `fab` CLI commands directly**.\r\n> All `fab cp`, `fab ln`, and `fab ls` commands are presented to the operator as\r\n> script blocks for them to run. 
The agent only generates and presents commands.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `WORKSPACE_NAME` | Fabric workspace name (exact, case-sensitive) | `\"Landon Finance Month End\"` |\r\n| `LAKEHOUSE_NAME` | Bronze lakehouse name (exact, case-sensitive) | `\"Lh_landon_finance_bronze\"` |\r\n| `LOCAL_CSV_FOLDER` | Relative path to local folder containing CSV files (CLI upload only) | `\"./Data\"` |\r\n| `LAKEHOUSE_FILES_FOLDER` | Folder name under the Files section of the lakehouse | `\"raw\"` |\r\n\r\n## Workflow\r\n\r\n- [ ] **Collect parameters** — If `WORKSPACE_NAME` or `LAKEHOUSE_NAME` are not\r\n provided, ask the operator for them before proceeding.\r\n\r\n- [ ] **Upload CSV files** — Present these three options and ask the operator to\r\n choose one:\r\n\r\n **Option 1 — OneLake File Explorer (Manual)**\r\n Open the OneLake File Explorer desktop app and drag-and-drop the CSV files into\r\n the target folder under the lakehouse Files section. No agent action required.\r\n\r\n **Option 2 — Fabric UI (Manual)**\r\n In the Fabric browser UI navigate to the lakehouse → Files section, open or create\r\n the target folder, click **Upload** and select the CSV files. No agent action required.\r\n\r\n **Option 3 — Fabric CLI (Automated)**\r\n > ⚠️ **Requires PowerShell** — generates a `.ps1` script. PowerShell is available\r\n > on Windows natively and on Mac/Linux via `brew install powershell`. If PowerShell\r\n > is not available and the operator does not want to install it, use Option 1 or 2.\r\n > Do not substitute a bash or shell script.\r\n >\r\n > ⚠️ **Performance note**: The CLI uploads files one at a time. 
For large\r\n > batches (50+ files) this is significantly slower than Options 1 or 2.\r\n > Recommend Options 1 or 2 for bulk uploads.\r\n\r\n Ask for `LOCAL_CSV_FOLDER` as the **exact absolute path** to the local folder\r\n and `LAKEHOUSE_FILES_FOLDER` (the destination folder name under Files). Then run:\r\n ```\r\n python scripts/generate_upload_commands.py \\\r\n --local-folder \"<LOCAL_CSV_FOLDER>\" \\\r\n --workspace \"<WORKSPACE_NAME>\" \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --output-script \"<OUTPUT_FOLDER>/upload_csv_files.ps1\"\r\n ```\r\n The script generates a PowerShell `.ps1` file saved directly to the outputs folder.\r\n Present the script path to the operator and ask them to run it with `pwsh upload_csv_files.ps1`.\r\n\r\n## Output Folder\r\n\r\nBefore beginning the workflow, create the output folder:\r\n```\r\noutputs/csv-to-bronze-delta-tables_{YYYY-MM-DD_HH-MM}_{USERNAME}/\r\n```\r\nAll scripts produced during the run are saved here.\r\n\r\n- [ ] **Confirm upload** — Ask the operator to confirm the CSV files are visible\r\n in the Files section of the lakehouse before proceeding.\r\n\r\n- [ ] **Create delta tables** — If `LAKEHOUSE_FILES_FOLDER` was not already\r\n captured above, ask for it now. Present these two options:\r\n\r\n **Option 1 — Fabric UI (Manual)**\r\n > Quick and easy — recommended for most users.\r\n In the Fabric browser UI navigate to the lakehouse → Files →\r\n `<LAKEHOUSE_FILES_FOLDER>`. For each CSV file: click the three-dot menu →\r\n **Load to Tables** → **New Table**. Accept the suggested table name (Fabric\r\n applies it automatically). 
No agent action required.\r\n\r\n **Option 2 — PySpark notebook (Automated)**\r\n Run:\r\n ```\r\n python scripts/generate_notebook.py \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --output-notebook \"<OUTPUT_FOLDER>\\csv_to_delta_tables.ipynb\"\r\n ```\r\n This writes a ready-to-run `.ipynb` file to the outputs folder. Tell the operator:\r\n 1. In the Fabric UI go to the workspace → **New** → **Import notebook**\r\n 2. Select `csv_to_delta_tables.ipynb` from the outputs folder\r\n 3. Click **Run All** — the notebook attaches the lakehouse automatically\r\n **Validate**: confirm every cell printed `✅ Created table: <table_name>` with\r\n no errors. If any `❌` lines appear, report the error message to the operator.\r\n\r\n## Table Naming Convention\r\n\r\nCSV filename → delta table name:\r\n- Strip `.csv` extension\r\n- Convert to lowercase\r\n- Replace any non-alphanumeric characters (spaces, hyphens, dots) with underscores\r\n- Strip leading/trailing underscores\r\n\r\nExamples:\r\n| CSV filename | Delta table name |\r\n|---|---|\r\n| `Revenue Data.csv` | `revenue_data` |\r\n| `Landon hotel revenue data.csv` | `landon_hotel_revenue_data` |\r\n| `Q1-Sales.csv` | `q1_sales` |\r\n\r\n## Column Naming Convention\r\n\r\nWhen CSVs are loaded into delta tables via the PySpark notebook (Option 2 of\r\ndelta table creation), a `clean_columns()` function transforms every column name:\r\n\r\n- Convert to lowercase\r\n- Replace spaces, hyphens, and other non-alphanumeric characters with underscores\r\n- Strip leading/trailing underscores\r\n\r\n| CSV column header | Delta table column name |\r\n|---|---|\r\n| `Hotel ID` | `hotel_id` |\r\n| `No_of_Rooms` | `no_of_rooms` |\r\n| `Total Revenue (GBP)` | `total_revenue_gbp` |\r\n| `First Name` | `first_name` |\r\n\r\n> **Important for downstream skills:** When writing SQL queries against bronze\r\n> delta tables (e.g., in the `create-materialised-lakeview-scripts` 
skill),\r\n> always use the cleaned column names — not the original CSV headers.\r\n\r\n## Output Format\r\n\r\nDelta tables appear under the **Tables** section of the bronze lakehouse in the\r\nFabric UI, named according to the convention above. Each table is queryable via\r\nthe lakehouse SQL endpoint and PySpark.\r\n\r\n## Gotchas\r\n\r\n- `fab cp` uses the path prefix to identify local vs OneLake paths. **Absolute\r\n Windows paths (`C:\\...`) are not recognised as local** and cause a\r\n `[NotSupported] Source and destination must be of the same type` error. Always\r\n use `Push-Location` into the source folder and `./filename` (forward slash,\r\n not backslash) syntax — confirmed working pattern.\r\n- **The destination folder must exist before running `fab cp`.** Always run\r\n `fab mkdir \"{WORKSPACE}.Workspace/{LAKEHOUSE}.Lakehouse/Files/{FOLDER}\"` first.\r\n Running `fab mkdir` on an already-existing folder is safe and does not error.\r\n- `WORKSPACE_NAME` and `LAKEHOUSE_NAME` are case-sensitive and must exactly match\r\n what appears in the Fabric UI.\r\n- Shortcuts (Option 1 for delta table creation) use Fabric's automatic schema\r\n inference. They may fail if column names contain spaces or if data types are\r\n inconsistent. 
Switch to Option 2 (PySpark notebook) in those cases.\r\n- The PySpark notebook attaches the lakehouse automatically via `%%configure` in\r\n Cell 1 — no manual attachment needed before running.\r\n- When using the Fabric CLI, run all commands from the directory that\r\n `LOCAL_CSV_FOLDER` is relative to (typically the project root).\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/generate_upload_commands.py`** — Scans a local CSV folder and outputs\r\n `fab cp` commands to upload each file to the lakehouse Files section.\r\n Run: `python scripts/generate_upload_commands.py --help`\r\n- **`scripts/generate_notebook.py`** — Generates a Fabric-compatible `.ipynb`\r\n notebook pre-configured with the correct lakehouse and `FILES_FOLDER`. The\r\n notebook attaches the lakehouse automatically via `%%configure`. Import into\r\n Fabric via **New → Import notebook**.\r\n Run: `python scripts/generate_notebook.py --help`\r\n",
|
|
143
157
|
},
|
|
144
158
|
{
|
|
145
159
|
relativePath: "assets/pyspark_notebook_template.py",
|
|
@@ -164,19 +178,30 @@ export const EMBEDDED_SKILLS = [
|
|
|
164
178
|
],
|
|
165
179
|
},
|
|
166
180
|
{
|
|
167
|
-
name: "
|
|
181
|
+
name: "fabric-process-discovery",
|
|
182
|
+
category: "fabric",
|
|
168
183
|
files: [
|
|
169
184
|
{
|
|
170
|
-
relativePath: ".
|
|
171
|
-
content: "\u0000\u0000\u0000\u0001Bud1\u0000\u0000\u0010\u0000\u0000\u0000\b\u0000\u0000\u0000\u0010\u0000\u0000\u0000\u0002\t\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\b\u0000\u0000\u0000\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000\f\u0000\u0000\u0000\u0001\u0000\u0000\u0010\u0000\u0000t\u0000slg1Scomp\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00
00\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\f\u0000\u0000\u0000\u0006\u0000a\u0000s\u0000s\u0000e\u0000t\u0000slg1Scomp\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0006\u0000a\u0000s\u0000s\u0000e\u0000t\u0000smoDDblob\u0000\u0000\u0000\b\u0000\u0000��\u0006��A\u0000\u0000\u0000\u0006\u0000a\u0000s\u0000s\u0000e\u0000t\u0000smodDblob\u0000\u0000\u0000\b\u0000\u0000��\u0006��A\u0000\u0000\u0000\u0006\u0000a\u0000s\u0000s\u0000e\u0000t\u0000sph1Scomp\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\n\u0000r\u0000e\u0000f\u0000e\u0000r\u0000e\u0000n\u0000c\u0000e\u0000slg1Scomp\u0000\u0000\u0000\u0000\u0000\u0000\u0015\u0007\u0000\u0000\u0000\n\u0000r\u0000e\u0000f\u0000e\u0000r\u0000e\u0000n\u0000c\u0000e\u0000smoDDblob\u0000\u0000\u0000\b\u0000\u0000��\u0006��A\u0000\u0000\u0000\n\u0000r\u0000e\u0000f\u0000e\u0000r\u0000e\u0000n\u0000c\u0000e\u0000smodDblob\u0000\u0000\u0000\b\u0000\u0000��\u0006��A\u0000\u0000\u0000\n\u0000r\u0000e
\u0000f\u0000e\u0000r\u0000e\u0000n\u0000c\u0000e\u0000sph1Scomp\u0000\u0000\u0000\u0000\u0000\u0000 \u0000\u0000\u0000\u0000\u0007\u0000s\u0000c\u0000r\u0000i\u0000p\u0000t\u0000slg1Scomp\u0000\u0000\u0000\u0000\u0000\u0000k�\u0000\u0000\u0000\u0007\u0000s\u0000c\u0000r\u0000i\u0000p\u0000t\u0000smoDDblob\u0000\u0000\u0000\b\u0000\u0000��\u0006��A\u0000\u0000\u0000\u0007\u0000s\u0000c\u0000r\u0000i\u0000p\u0000t\u0000smodDblob\u0000\u0000\u0000\b\u0000\u0000��\u0006��A\u0000\u0000\u0000\u0007\u0000s\u0000c\u0000r\u0000i\u0000p\u0000t\u0000sph1Scomp\u0000\u0000\u0000\u0000\u0000\u0000�\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00
00\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000
\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u
0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0000\u0000\u0000\u0000\u0000\b\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u
0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00
00\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000
\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0000\u0000 \u0000\u0000\u0000\u0001\u0000\u0000\u0000@\u0000\u0000\u0000\u0001\u0000\u0000\u0000�\u0000\u0000\u0000\u0001\u0000\u0000\u0001\u0000\u0000\u0000\u0000\u0001\u0000\u0000\u0002\u0000\u0000\u0000\u0000\u0001\u0000\u0000\u0004\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0000\u0010\u0000\u0000\u0000\u0000\u0001\u0000\u0000 
\u0000\u0000\u0000\u0000\u0001\u0000\u0000@\u0000\u0000\u0000\u0000\u0001\u0000\u0000�\u0000\u0000\u0000\u0000\u0001\u0000\u0001\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0002\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0004\u0000\u0000\u0000\u0000\u0000\u0001\u0000\b\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0010\u0000\u0000\u0000\u0000\u0000\u0001\u0000 \u0000\u0000\u0000\u0000\u0000\u0001\u0000@\u0000\u0000\u0000\u0000\u0000\u0001\u0000�\u0000\u0000\u0000\u0000\u0000\u0001\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0004\u0000\u0000\u0000\u0000\u0000\u0000\u0001\b\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0010\u0000\u0000\u0000\u0000\u0000\u0000\u0001 \u0000\u0000\u0000\u0000\u0000\u0000\u0001@\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0
000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u000
0\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u0010\u000b\u0000\u0000\u0000E\u0000\u0000\u0002\t\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00
00\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000
\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u
0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0004DSDB\u0000\u0000\u0000\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0002\u0000\u0000\u0000 
\u0000\u0000\u0000`\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0000\u0000�\u0000\u0000\u0000\u0001\u0000\u0000\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0000\u0004\u0000\u0000\u0000\u0000\u0002\u0000\u0000\b\u0000\u0000\u0000\u0018\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0000 \u0000\u0000\u0000\u0000\u0001\u0000\u0000@\u0000\u0000\u0000\u0000\u0001\u0000\u0000�\u0000\u0000\u0000\u0000\u0001\u0000\u0001\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0002\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0004\u0000\u0000\u0000\u0000\u0000\u0001\u0000\b\u0000\u0000\u0000\u0000\u0000\u0001\u0000\u0010\u0000\u0000\u0000\u0000\u0000\u0001\u0000 \u0000\u0000\u0000\u0000\u0000\u0001\u0000@\u0000\u0000\u0000\u0000\u0000\u0001\u0000�\u0000\u0000\u0000\u0000\u0000\u0001\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0004\u0000\u0000\u0000\u0000\u0000\u0000\u0001\b\u0000\u0000\u0000\u0000\u0000\u0000\u0001\u0010\u0000\u0000\u0000\u0000\u0000\u0000\u0001 
\u0000\u0000\u0000\u0000\u0000\u0000\u0001@\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\
u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0
000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
|
185
|
+
relativePath: "SKILL.md",
|
|
186
|
+
content: "---\nname: fabric-process-discovery\ndescription: >\n Use this skill to conduct the initial environment discovery conversation for any\n Microsoft Fabric process workflow. Collects workspace access, deployment approach,\n access control preferences, capacity, and data location through an adaptive,\n one-question-at-a-time conversation grounded in what the downstream Fabric skills\n actually require. Output is a structured environment profile used by the\n orchestrating agent to plan execution. Triggers as Sub-Agent 0 in any Fabric\n process workflow agent.\nlicense: MIT\ncompatibility: Works in any Claude context — no external tools required at this stage.\n---\n\n# Fabric Process Discovery\n\n> ⚠️ **GOVERNANCE**: This skill only gathers context — it never executes commands or\n> creates resources. All collected information feeds into the execution plan which the\n> operator reviews and confirms before anything runs.\n\n## Workflow\n\n1. Read the process requirements and identify which domains below are relevant.\n2. Ask one question at a time, branching adaptively based on each answer.\n3. Collect all path decisions and any parameter values the operator has available.\n4. Present a confirmation summary and wait for explicit approval.\n5. Write the environment profile and append to `CHANGE_LOG.md`.\n\nRuns a structured, adaptive discovery conversation before any Fabric work begins.\nAsk **one question at a time**. Branch based on each answer before deciding what to\nask next. Every question must explain why it matters. Never leave the user blocked.\n\n## Principles\n\nThese are not scripts to follow — they are the reasoning the model should apply\nwhen deriving and sequencing questions.\n\n**1. Read requirements first.**\nBefore asking anything, read the process requirements. Identify which domains below\nare relevant. Only ask about what the requirements actually need — do not run\nthrough all domains for every process.\n\n**2. 
Ask one question at a time.**\nNever present multiple questions in one turn. Ask the most important unresolved\nquestion, wait for the answer, then decide what to ask next based on that answer.\nThis produces cleaner answers and better branching.\n\n**3. Always explain why.**\nEvery question must briefly state what it unlocks or what it blocks. Users answer\nbetter when they understand the purpose.\n\n**4. Always offer a way forward.**\nEvery question should include an option to provide the answer later (placeholder),\nor to skip the step if it is optional. For questions requiring specific values the\nuser may not have ready (names, IDs, capacity names), offer a command or\ninstruction that helps them find it. Never leave the user stuck.\n\n**5. Distinguish path decisions from parameter values.**\n- **Path decisions** (can you create workspaces? what deployment approach?) determine\n the plan structure — always collect these during discovery.\n- **Parameter values** (exact workspace names, group Object IDs, capacity name) are\n needed before execution — collect them now if the user has them, or flag them as\n \"required before running\" if not.\n\n**6. Trust the model's intelligence.**\nThe domains below describe what to establish and the technical context needed to\nask good questions. Do not read them as scripts. 
Derive clear, natural questions\nfrom the requirements and the conversation so far.\n\n---\n\n## Domains\n\nCover only the domains relevant to the process requirements:\n\n| Process involves | Domains to cover |\n|---|---|\n| Creating workspaces | A, B, C, D, F |\n| Creating lakehouses | A, D, F |\n| Ingesting files (CSV/PDF) | D, E |\n| Running notebooks/scripts | D, F |\n| Full pipeline | All domains |\n\n---\n\n### Domain A — Workspace access\n\n**What to establish:**\n- Can the operator create new workspaces, or must they use existing ones?\n- If creating: what names do they want?\n- If using existing: what are the exact names?\n\n**Technical context:**\n- Workspace names are case-sensitive in `fab` paths. Always confirm exact casing.\n- If the operator is unsure whether they have create rights, `fab ls` will show\n workspaces they already have access to. Command requires `fab` CLI installed first:\n `pip install ms-fabric-cli` → `fab auth login` → `fab ls`.\n- Read the requirements to determine how many workspaces are needed (e.g. hub and\n spoke, or a single workspace) before asking.\n\n**Branch:**\n- Can create → collect intended workspace names (or use placeholder if not decided)\n- Cannot create → collect exact names of existing workspaces to use\n- Unsure → offer the `fab ls` command to check; proceed once confirmed\n\n---\n\n### Domain B — Domain assignment\n\n**What to establish:**\n- Does the operator want to assign the workspace(s) to a Fabric domain?\n- If yes: assign to existing domain, or create a new one?\n- If creating a new domain: do they have Fabric Admin rights?\n\n**Technical context:**\n- Domain assignment is optional. 
Many teams skip it and add it later.\n- Assigning to an existing domain requires no special rights beyond workspace access.\n- **Creating a new domain requires Fabric Administrator rights — this is a\n tenant-level permission, not workspace-level.** If the operator is unsure, default\n to assigning an existing domain or skipping. Do not assume they have these rights.\n- Domain assignment can always be done later via the Fabric portal.\n\n**Branch:**\n- Assign to existing domain → collect domain name\n- Create new domain → confirm Fabric Admin rights; if uncertain or no → mark as\n manual gate, note the intended domain name for the plan\n- Skip → no domain parameters needed\n\n---\n\n### Domain C — Access control\n\n**What to establish:**\n- Beyond the workspace creator (automatically assigned as Admin), should additional\n users or security groups be assigned workspace roles?\n- If groups: how will the Object IDs be obtained?\n\n**Technical context:**\n- **The Fabric REST API requires Entra group Object IDs (GUIDs) — display names are\n not accepted programmatically.** This is a hard API requirement.\n- Individual users can be identified by email address (UPN) — no Object ID needed.\n- Object IDs can be found via:\n - Azure portal: Microsoft Entra ID (formerly Azure Active Directory) → Groups → select group → Object ID field\n - Azure CLI: `az ad group show --group \"Display Name\" --query id -o tsv`\n - PowerShell: `Get-MgGroup -Filter \"displayName eq 'Name'\" | Select-Object Id`\n- **If the deployment approach is a PySpark notebook AND security groups are involved:\n `notebookutils` inside a Fabric notebook cannot query Microsoft Graph.** The\n notebook cannot resolve group display names to Object IDs at runtime. 
Options:\n (a) operator provides Object IDs directly before running, (b) IDs are resolved via\n Azure CLI or PowerShell before the notebook is run, (c) switch to PowerShell or\n terminal deployment for the role assignment step.\n\n**Branch:**\n- No additional access → skip role collection\n- Users only → collect email addresses and intended roles\n- Security groups → ask if the operator can see the groups in the Azure portal:\n - Yes → ask if they will provide Object IDs directly, or want the agent to\n generate Azure CLI lookup commands to retrieve them automatically\n - No / unsure → mark group role assignment as manual; provide portal instructions\n- Mix of users and groups → handle each type appropriately\n\n**Roles available:** Admin, Member, Contributor, Viewer\n\n---\n\n### Domain D — Deployment approach\n\n**What to establish:**\n- How does the operator want to run the generated scripts or notebooks?\n\n**Technical context:**\n- **All three approaches use the Fabric CLI (`fab`) internally.** This is not a\n question about whether to use the CLI — it is about how the operator runs the\n generated artefacts.\n- **PySpark notebook:** imported into a Fabric workspace and run cell-by-cell in the\n Fabric UI. Authentication is automatic via `notebookutils`. Best for operators\n who prefer working inside Fabric and want step-by-step visibility.\n- **PowerShell script:** a `.ps1` file the operator reviews and runs locally.\n Requires `fab` CLI installed locally (`pip install ms-fabric-cli`) and PowerShell.\n- **Terminal commands:** individual `fab` commands run one at a time in a terminal.\n Requires `fab` CLI installed locally. 
Best for operators who want full control\n and visibility at each step.\n- If the operator chooses notebook AND has Entra group role assignments, flag the\n group Object ID lookup constraint from Domain C (the notebook cannot query Microsoft\n Graph) before proceeding.\n\n---\n\n### Domain E — Source data\n\n*Only ask if the process involves ingesting files.*\n\n**What to establish:**\n- Where are the source files (CSVs, PDFs, etc.)?\n\n**Technical context:**\n- Local files require an upload step before they can be referenced in Fabric.\n- Files already in OneLake can be referenced by path directly — no upload needed.\n- Files in SharePoint or Azure Blob Storage can be connected via Fabric shortcuts,\n avoiding the need to copy data.\n\n**Branch:**\n- Local machine → include an upload step in the plan\n- Already in OneLake → collect the OneLake path; skip upload\n- Cloud storage (SharePoint / Azure Blob) → collect source URL; include shortcut\n creation step\n\n---\n\n### Domain F — Capacity\n\n*Ask whenever workspaces are being created.*\n\n**What to establish:**\n- What Fabric capacity will the workspace(s) be assigned to?\n\n**Technical context:**\n- Every Fabric workspace must be assigned to an active capacity at creation time.\n- The capacity must be in Active state — if it is paused, the operator must resume\n it in the Azure portal before running workspace creation.\n- The operator may not know the exact name. 
Options:\n - Run `fab ls .capacities` — lists the capacities the operator has access to\n - Check the Fabric Admin portal under Capacities\n- If the operator does not have the name yet, use the placeholder `[CAPACITY_NAME]`\n and flag it as required before the notebook or script is run.\n\n---\n\n## What to Collect\n\nBy the end of discovery, the environment profile must include:\n\n**Path decisions** (always required — these determine the shape of the plan):\n- Workspace approach: creating new / using existing\n- Domain approach: new (manual if no admin rights) / existing / skipped\n- Access control: none / users only / groups / manual\n- Deployment approach: notebook / PowerShell / terminal\n- Group ID resolution method (if groups involved): direct / CLI lookup / manual\n\n**Parameter values** (collect if available; flag as required before run if not):\n- Workspace name(s) — exact, case-preserved\n- Capacity name\n- Domain name (if assigning)\n- Security group display names and intended roles\n- Group Object IDs (if the operator has them; otherwise flag as needed before run)\n- Existing workspace names (verbatim, if using existing)\n\n---\n\n## Confirmation\n\nBefore writing the environment profile, present a concise summary table of all path\ndecisions and collected parameters. Ask the operator to confirm accuracy. 
If anything\nis missing or unclear, ask only the targeted follow-up needed — do not restart from\nthe beginning.\n\nExample format:\n\n```\n| # | Question | Your answer | What this means |\n|---|-----------------------|------------------------------------|----------------------------------------------------|\n| A | Workspace creation | Creating new | Agent will create hub + spoke workspaces |\n| B | Domain assignment | New domain (manual gate) | Domain creation flagged manual — admin rights needed |\n| C | Access control | Security groups — IDs to be provided | Role assignment scripted; IDs needed before run |\n| D | Deployment approach | PySpark notebook | Agent generates .ipynb for import into Fabric |\n| F | Capacity | ldifabricdev | Embedded in notebook |\n```\n\n---\n\n## Output\n\nSave the confirmed profile as `00-environment-discovery/environment-profile.md`.\n\nInclude:\n- All path decisions\n- All collected parameter values\n- Parameters flagged as required before execution, with instructions for obtaining them\n- Manual gates — steps the operator must perform themselves, and why\n- Deployment prerequisites (e.g. `pip install ms-fabric-cli` if PowerShell or terminal)\n\nAppend to `CHANGE_LOG.md`:\n`[{DATETIME}] Sub-Agent 0 complete — environment-profile.md produced. [N] path decisions recorded. Manual gates: [list or none]. Parameters still needed: [list or none].`\n\n---\n\n## Gotchas\n\n- **Never frame deployment as CLI vs no-CLI.** All three approaches use `fab`. The\n question is only about how the operator runs the generated artefacts.\n- **Workspace names are case-sensitive in `fab` paths.** Always confirm exact casing.\n- **Entra group Object IDs are GUIDs, not display names.** The Fabric REST API will\n reject display names. 
If the user provides a name, generate a lookup command rather\n than scripting the assignment directly.\n- **`notebookutils` does not support Microsoft Graph.** A Fabric notebook cannot\n resolve group display names to Object IDs at runtime with its built-in credentials.\n Either the operator provides IDs directly, or resolution must happen outside the\n notebook.\n- **Domain creation requires tenant-level Fabric Administrator rights.** Workspace\n Admin rights are not sufficient. Default to assigning an existing domain or skipping\n if there is any doubt about the operator's rights.\n- **Never leave the user blocked.** If a step requires permissions they don't have,\n always offer: (a) skip and mark as manual, (b) produce a spec for their admin, or\n (c) substitute a UI-based workaround.\n",
|
|
172
187
|
},
|
|
188
|
+
],
|
|
189
|
+
},
|
|
190
|
+
{
|
|
191
|
+
name: "generate-fabric-workspace",
|
|
192
|
+
category: "fabric",
|
|
193
|
+
files: [
|
|
173
194
|
{
|
|
174
195
|
relativePath: "SKILL.md",
|
|
175
|
-
content: "---\r\nname: generate-fabric-workspace\r\ndescription: >\r\n Use this skill when asked to create, provision, or set up a Microsoft Fabric\r\n workspace. Triggers on: \"create a Fabric workspace\", \"provision a workspace\r\n in Fabric\", \"set up a new Fabric workspace\", \"generate a workspace with\r\n capacity and permissions\", \"create workspace and assign roles in Fabric\".\r\n Collects workspace name, capacity, principals/roles, and optional domain\r\n settings, then creates the workspace using one of three approaches: PySpark\r\n notebook, PowerShell script, or interactive terminal commands. Produces a\r\n workspace definition markdown as a creation audit record. Does NOT trigger\r\n for general Fabric questions, item creation within a workspace, or\r\n workspace deletion tasks.\r\nlicense: MIT\r\ncompatibility: >\r\n ms-fabric-cli required (pip install ms-fabric-cli). Approach 1 requires a\r\n Fabric notebook environment. Approaches 2 and 3 require fab CLI installed\r\n locally with network access to Microsoft Fabric.\r\n---\r\n\r\n# Generate Fabric Workspace\r\n\r\nCreates a Microsoft Fabric workspace assigned to a specified capacity, with\r\naccess roles and optional domain assignment. If the workspace already exists,\r\ncreation is skipped and roles/domain are updated. Outputs a workspace\r\ndefinition markdown as an audit trail.\r\n\r\n## Step 1 — Choose Approach\r\n\r\nAsk the user:\r\n\r\n> \"Which approach would you like to use?\r\n> 1. **PySpark Notebook** — generates a notebook to run inside Fabric\r\n> (authenticated automatically via the notebook environment)\r\n> 2. **PowerShell Script** — generates a `.ps1` for your review before execution\r\n> (requires fab CLI installed locally)\r\n> 3. 
**Interactive Terminal** — runs fab CLI commands one by one in the terminal,\r\n> with your confirmation at each step (requires fab CLI installed locally)\"\r\n\r\n### Authentication by approach\r\n\r\n| Approach | Authentication |\r\n|---|---|\r\n| PySpark Notebook | Auto via `notebookutils.credentials.getToken('pbi')` inside Fabric |\r\n| PowerShell / Terminal | `fab auth login` (browser pop-up) or set `$env:FAB_TOKEN` / `FAB_TOKEN` |\r\n\r\n## Step 2 — Domain Handling\r\n\r\nAsk the user:\r\n\r\n> \"Would you like to:\r\n> A. **Create a new domain** and assign the workspace to it\r\n> ⚠️ Requires **Fabric Admin** tenant-level permissions.\r\n> You will also need to specify an **Entra group** that will be allowed to\r\n> add/remove workspaces from this domain (the domain contributor group).\r\n> B. **Assign the workspace to an existing domain**\r\n> C. **Skip domain assignment**\"\r\n\r\n- If **A**: collect `DOMAIN_NAME` and `DOMAIN_CONTRIBUTOR_GROUP` (the Entra\r\n group display name allowed to add/remove workspaces from the domain). Confirm\r\n the user has Fabric Admin rights.\r\n- If **B**: collect `DOMAIN_NAME` only.\r\n- If **C**: no domain parameters needed.\r\n\r\n## Step 3 — Collect Parameters\r\n\r\nCollect these values from the user:\r\n\r\n| Parameter | Required | Description |\r\n|---|---|---|\r\n| `WORKSPACE_NAME` | Yes | Display name for the workspace |\r\n| `CAPACITY_NAME` | Yes | Exact name of the Fabric capacity to assign |\r\n| `DOMAIN_NAME` | If A or B | Name of the domain (new or existing) |\r\n| `DOMAIN_CONTRIBUTOR_GROUP` | If A | Display name of the Entra group that manages the domain |\r\n| `WORKSPACE_ROLES` | Conditional | Additional principals + roles (see approach-specific guidance below) |\r\n\r\n### Workspace roles — approach-specific guidance\r\n\r\nThe workspace creator is **automatically assigned as Admin**. Before collecting\r\nadditional roles, ask:\r\n\r\n> \"You (the creator) will be automatically assigned as workspace Admin. 
Do you\r\n> want to assign additional roles to other users or groups?\"\r\n\r\nIf **no**, skip role collection entirely.\r\n\r\nIf **yes**, tailor the guidance based on the deployment approach chosen in Step 1:\r\n\r\n**PySpark Notebook approach:**\r\n> \"With the notebook approach, you can assign roles to **individual users by email\r\n> address**. Assigning roles to **Entra groups** requires an app registration\r\n> (Service Principal) with `Group.Read.All` and `User.Read.All` application\r\n> permissions and admin consent to resolve group names to Object IDs.\r\n>\r\n> Do you have an existing app registration with these permissions? If not, you\r\n> can either:\r\n> - Assign roles to individual users only (by email), or\r\n> - Switch to the PowerShell/Terminal approach for group support, or\r\n> - Set up an app registration first.\"\r\n\r\nIf the user wants groups and has no SP, let them choose — do not silently switch.\r\n\r\n**PowerShell / Terminal approach:**\r\n> \"With this approach, you can assign roles to both individual users and Entra\r\n> groups. Group resolution requires:\r\n> - Azure CLI installed and logged in (`az login`)\r\n> - Permission to read Entra group details (`az ad group show`)\r\n> - The Entra groups must already exist\r\n>\r\n> Do you have Azure CLI access with these permissions?\"\r\n\r\nFor each additional principal, collect:\r\n- User **email address (UPN)** or Entra **group display name** — do NOT ask for Object IDs\r\n- Principal type: `User` or `Group` (or `ServicePrincipal`)\r\n- Role: `Admin`, `Member`, `Contributor`, or `Viewer`\r\n\r\n**Principal ID resolution:** `fab acl set` requires Entra Object IDs. 
The scripts\r\nand notebook resolve human-readable names/emails automatically before calling the CLI:\r\n- **Notebook** — uses the Microsoft Graph API via a Service Principal\r\n- **PowerShell / Terminal** — uses `az ad user show` / `az ad group show`\r\n (requires Azure CLI installed and `az login` completed)\r\n\r\n## Step 4 — Execute\r\n\r\n### Approach 1: PySpark Notebook\r\n\r\n**Prerequisite:** `notebookutils.credentials.getToken()` does not support Microsoft\r\nGraph API tokens in Fabric notebooks. An Entra **app registration** (Service Principal)\r\nis required to resolve group/user names to Object IDs. Before collecting parameters, ask:\r\n\r\n> \"Do you have an Entra app registration with `Group.Read.All` and `User.Read.All`\r\n> application permissions (with admin consent)? If not, ask your Entra admin to create\r\n> one, or use the PowerShell or Interactive Terminal approach instead.\"\r\n\r\nCollect additionally: `TENANT_ID`, `CLIENT_ID`, `CLIENT_SECRET` for the app registration.\r\nThese are entered directly into Cell 1 of the generated notebook (not passed to the script).\r\n\r\nRun `scripts/generate_notebook.py` with the collected parameters:\r\n\r\n```bash\r\npython scripts/generate_notebook.py \\\r\n --workspace-name \"WORKSPACE_NAME\" \\\r\n --capacity-name \"CAPACITY_NAME\" \\\r\n --roles \"user@corp.com:User:Admin,Finance Team:Group:Member\" \\\r\n [--domain-name \"DOMAIN_NAME\"] \\\r\n [--create-domain] \\\r\n [--domain-contributor-group \"DOMAIN_CONTRIBUTOR_GROUP\"] \\\r\n --output workspace_setup.ipynb\r\n```\r\n\r\nPresent the generated `workspace_setup.ipynb` to the user and instruct them to:\r\n1. Upload to any Fabric workspace as a notebook\r\n2. Run each cell **one at a time**, reading the output before proceeding\r\n3. ✅ Verification cells are clearly marked — confirm output before moving on\r\n4. 
Share the output of Cell 7 (`fab ls`) and Cell 9 (`fab acl ls`)\r\n\r\n### Approach 2: PowerShell Script\r\n\r\nRun `scripts/generate_ps1.py` with the collected parameters:\r\n\r\n```bash\r\npython scripts/generate_ps1.py \\\r\n --workspace-name \"WORKSPACE_NAME\" \\\r\n --capacity-name \"CAPACITY_NAME\" \\\r\n --roles \"user@corp.com:User:Admin,Finance Team:Group:Member\" \\\r\n [--domain-name \"DOMAIN_NAME\"] \\\r\n [--create-domain] \\\r\n [--domain-contributor-group \"DOMAIN_CONTRIBUTOR_GROUP\"] \\\r\n --output workspace_setup.ps1\r\n```\r\n\r\nShow `workspace_setup.ps1` to the user for review. **Do not execute until the\r\nuser confirms.** Then run:\r\n\r\n```powershell\r\n.\\workspace_setup.ps1\r\n```\r\n\r\n### Approach 3: Interactive Terminal\r\n\r\nRun these commands in sequence. Show output after each and ask the user to\r\nconfirm before continuing.\r\n\r\n**Install and authenticate:**\r\n```bash\r\npip install ms-fabric-cli\r\nfab auth login\r\n```\r\n\r\n**Check if workspace already exists:**\r\n```bash\r\nfab exists \"WORKSPACE_NAME.Workspace\"\r\n```\r\n- Exit code 0 → workspace exists → skip creation, go to role assignment\r\n- Non-zero → proceed to create\r\n\r\n**Create workspace:**\r\n```bash\r\nfab mkdir \"WORKSPACE_NAME.Workspace\" -P capacityName=CAPACITY_NAME\r\n```\r\n\r\n**Verify creation:**\r\n```bash\r\nfab exists \"WORKSPACE_NAME.Workspace\"\r\nfab ls \"WORKSPACE_NAME.Workspace\"\r\n```\r\n\r\n**Resolve principal IDs** (before assigning roles — repeat for each principal):\r\n```bash\r\n# For a user (by UPN / email):\r\naz ad user show --id user@corp.com --query id -o tsv\r\n\r\n# For a group (by display name):\r\naz ad group show --group \"Finance Team\" --query id -o tsv\r\n\r\n# For a service principal (by display name or app ID):\r\naz ad sp show --id \"My App Name\" --query id -o tsv\r\n```\r\n\r\n**Assign roles** (use the resolved Object ID, role in lowercase):\r\n```bash\r\nfab acl set \"WORKSPACE_NAME.Workspace\" -I 
<RESOLVED_OBJECT_ID> -R role\r\n```\r\n\r\n**Verify roles:**\r\n```bash\r\nfab acl ls \"WORKSPACE_NAME.Workspace\"\r\n```\r\n\r\n**Create domain** (if Step 2 = A):\r\n```bash\r\n# Resolve domain contributor group ID:\r\naz ad group show --group \"DOMAIN_CONTRIBUTOR_GROUP\" --query id -o tsv\r\n\r\nfab mkdir \"DOMAIN_NAME.domain\"\r\nfab acl set \".domains/DOMAIN_NAME.Domain\" -I <RESOLVED_GROUP_ID> -R contributor\r\n```\r\n\r\n**Assign workspace to domain** (if Step 2 = A or B):\r\n```bash\r\nfab assign \".domains/DOMAIN_NAME.Domain\" -W \"WORKSPACE_NAME.Workspace\"\r\n```\r\n\r\n## Step 5 — Generate Workspace Definition\r\n\r\nCollect from the command output (or ask the user):\r\n- Workspace ID (appears in `fab ls` output)\r\n- Tenant name or tenant ID\r\n- Confirmed principals and roles\r\n- Domain name (if assigned)\r\n\r\nRun `scripts/generate_definition.py`:\r\n\r\n```bash\r\npython scripts/generate_definition.py \\\r\n --workspace-name \"WORKSPACE_NAME\" \\\r\n --workspace-id \"WORKSPACE_ID\" \\\r\n --capacity-name \"CAPACITY_NAME\" \\\r\n --tenant \"TENANT_NAME\" \\\r\n --roles \"user@corp.com:User:Admin,Finance Team:Group:Member\" \\\r\n [--domain-name \"DOMAIN_NAME\"] \\\r\n --approach \"notebook|powershell|terminal\" \\\r\n --output workspace_definition.md\r\n```\r\n\r\nPresent `workspace_definition.md` to the user.\r\n\r\n## Gotchas\r\n\r\n- Workspace path format is `WorkspaceName.Workspace` — the `.Workspace` suffix is required.\r\n- The capacity must be **Active** before `fab mkdir`. If you see `CapacityNotInActiveState`,\r\n ask the user to resume the capacity in the Azure portal before retrying.\r\n- `notebookutils.credentials.getToken()` in Fabric notebooks **does not support Microsoft Graph**.\r\n The notebook approach requires a Service Principal with `Group.Read.All` + `User.Read.All`\r\n application permissions and admin consent. The SP credentials are entered in Cell 1 of\r\n the generated notebook. 
If the user doesn't have an SP, direct them to the PowerShell\r\n or Interactive Terminal approach instead.\r\n- Domain creation requires Fabric Administrator tenant-level rights. If the user cannot\r\n create a domain, fall back to assigning an existing one or skipping.\r\n- `fab exists` uses exit code (0 = exists, non-zero = not found) — do not rely on stdout text alone.\r\n- In the notebook approach, `notebookutils` is only available inside a Fabric notebook.\r\n The generated script must not be run as a plain Python script outside Fabric.\r\n- The `.domain` suffix (lowercase) is used in `fab mkdir`; `.Domain` (capitalised) is\r\n used in `fab assign` and `fab acl set` — these are different and both matter.\r\n- Role values passed to `fab acl set` must be **lowercase** (`admin`, `member`, `contributor`, `viewer`).\r\n The scripts handle this conversion automatically.\r\n- For PowerShell/terminal approaches, `az login` must be completed before `az ad user/group show` will work.\r\n This is separate from `fab auth login` — both are required.\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/generate_notebook.py`** — Generates PySpark notebook. Run: `python scripts/generate_notebook.py --help`\r\n- **`scripts/generate_ps1.py`** — Generates PowerShell script. Run: `python scripts/generate_ps1.py --help`\r\n- **`scripts/generate_definition.py`** — Generates workspace definition markdown. Run: `python scripts/generate_definition.py --help`\r\n",
|
|
196
|
+
content: "---\r\nname: generate-fabric-workspace\r\ndescription: >\r\n Use this skill when asked to create, provision, or set up a Microsoft Fabric\r\n workspace. Triggers on: \"create a Fabric workspace\", \"provision a workspace\r\n in Fabric\", \"set up a new Fabric workspace\", \"generate a workspace with\r\n capacity and permissions\", \"create workspace and assign roles in Fabric\".\r\n Collects workspace name, capacity, principals/roles, and optional domain\r\n settings, then creates the workspace using one of three approaches: PySpark\r\n notebook, PowerShell script, or interactive terminal commands. Produces a\r\n workspace definition markdown as a creation audit record. Does NOT trigger\r\n for general Fabric questions, item creation within a workspace, or\r\n workspace deletion tasks.\r\nlicense: MIT\r\ncompatibility: >\r\n ms-fabric-cli required (pip install ms-fabric-cli). Approach 1 requires a\r\n Fabric notebook environment. Approaches 2 and 3 require fab CLI installed\r\n locally with network access to Microsoft Fabric.\r\n---\r\n\r\n# Generate Fabric Workspace\r\n\r\n> ⚠️ **GOVERNANCE**: This skill produces notebooks and scripts for the operator to\r\n> review and run — it never executes commands directly against a live Fabric environment.\r\n> Present each generated artefact to the operator before they run it.\r\n\r\nCreates a Microsoft Fabric workspace assigned to a specified capacity, with\r\naccess roles and optional domain assignment. If the workspace already exists,\r\ncreation is skipped and roles/domain are updated. Outputs a workspace\r\ndefinition markdown as an audit trail.\r\n\r\n## Step 1 — Choose Approach\r\n\r\nAsk the user:\r\n\r\n> \"Which approach would you like to use?\r\n> 1. **PySpark Notebook** — generates a notebook to run inside Fabric\r\n> (authenticated automatically via the notebook environment)\r\n> 2. 
**PowerShell Script** — generates a `.ps1` for your review before execution\r\n> (requires fab CLI installed locally)\r\n> 3. **Interactive Terminal** — runs fab CLI commands one by one in the terminal,\r\n> with your confirmation at each step (requires fab CLI installed locally)\"\r\n\r\n### Authentication by approach\r\n\r\n| Approach | Authentication |\r\n|---|---|\r\n| PySpark Notebook | Auto via `notebookutils.credentials.getToken('pbi')` inside Fabric |\r\n| PowerShell / Terminal | `fab auth login` (browser pop-up) or set `$env:FAB_TOKEN` / `FAB_TOKEN` |\r\n\r\n## Step 2 — Domain Handling\r\n\r\nAsk the user:\r\n\r\n> \"Would you like to:\r\n> A. **Create a new domain** and assign the workspace to it\r\n> ⚠️ Requires **Fabric Admin** tenant-level permissions.\r\n> You will also need to specify an **Entra group** that will be allowed to\r\n> add/remove workspaces from this domain (the domain contributor group).\r\n> B. **Assign the workspace to an existing domain**\r\n> C. **Skip domain assignment**\"\r\n\r\n- If **A**: collect `DOMAIN_NAME` and `DOMAIN_CONTRIBUTOR_GROUP` (the Entra\r\n group display name allowed to add/remove workspaces from the domain). Confirm\r\n the user has Fabric Admin rights.\r\n- If **B**: collect `DOMAIN_NAME` only.\r\n- If **C**: no domain parameters needed.\r\n\r\n## Step 3 — Collect Parameters\r\n\r\nCollect these values from the user:\r\n\r\n| Parameter | Required | Description |\r\n|---|---|---|\r\n| `WORKSPACE_NAME` | Yes | Display name for the workspace |\r\n| `CAPACITY_NAME` | Yes | Exact name of the Fabric capacity to assign |\r\n| `DOMAIN_NAME` | If A or B | Name of the domain (new or existing) |\r\n| `DOMAIN_CONTRIBUTOR_GROUP` | If A | Display name of the Entra group that manages the domain |\r\n| `WORKSPACE_ROLES` | Conditional | Additional principals + roles (see approach-specific guidance below) |\r\n\r\n### Workspace roles — approach-specific guidance\r\n\r\nThe workspace creator is **automatically assigned as Admin**. 
Before collecting\r\nadditional roles, ask:\r\n\r\n> \"You (the creator) will be automatically assigned as workspace Admin. Do you\r\n> want to assign additional roles to other users or groups?\"\r\n\r\nIf **no**, skip role collection entirely. If **yes**, load\r\n`references/role-assignment.md` for approach-specific guidance on collecting\r\nprincipals, group resolution requirements, and Service Principal prerequisites.\r\n\r\nFor each additional principal, collect:\r\n- User **email address (UPN)** or Entra **group display name** — do NOT ask for Object IDs\r\n- Principal type: `User` or `Group` (or `ServicePrincipal`)\r\n- Role: `Admin`, `Member`, `Contributor`, or `Viewer`\r\n\r\n## Step 4 — Execute\r\n\r\n### Approach 1: PySpark Notebook\r\n\r\nIf role assignment includes Entra groups, `TENANT_ID`, `CLIENT_ID`, and `CLIENT_SECRET`\r\nare required — entered directly into Cell 1 of the generated notebook. See\r\n`references/role-assignment.md` for prerequisite details.\r\n\r\nRun `scripts/generate_notebook.py` with the collected parameters:\r\n\r\n```bash\r\npython scripts/generate_notebook.py \\\r\n --workspace-name \"WORKSPACE_NAME\" \\\r\n --capacity-name \"CAPACITY_NAME\" \\\r\n --roles \"user@corp.com:User:Admin,Finance Team:Group:Member\" \\\r\n [--domain-name \"DOMAIN_NAME\"] \\\r\n [--create-domain] \\\r\n [--domain-contributor-group \"DOMAIN_CONTRIBUTOR_GROUP\"] \\\r\n --output workspace_setup.ipynb\r\n```\r\n\r\nPresent the generated `workspace_setup.ipynb` to the user and instruct them to:\r\n1. Upload to any Fabric workspace as a notebook\r\n2. Run each cell **one at a time**, reading the output before proceeding\r\n3. ✅ Verification cells are clearly marked — confirm output before moving on\r\n4. 
Share the output of Cell 7 (`fab ls`) and Cell 9 (`fab acl ls`)\r\n\r\n### Approach 2: PowerShell Script\r\n\r\nRun `scripts/generate_ps1.py` with the collected parameters:\r\n\r\n```bash\r\npython scripts/generate_ps1.py \\\r\n --workspace-name \"WORKSPACE_NAME\" \\\r\n --capacity-name \"CAPACITY_NAME\" \\\r\n --roles \"user@corp.com:User:Admin,Finance Team:Group:Member\" \\\r\n [--domain-name \"DOMAIN_NAME\"] \\\r\n [--create-domain] \\\r\n [--domain-contributor-group \"DOMAIN_CONTRIBUTOR_GROUP\"] \\\r\n --output workspace_setup.ps1\r\n```\r\n\r\nShow `workspace_setup.ps1` to the user for review. **Do not execute until the\r\nuser confirms.** Then run:\r\n\r\n```powershell\r\n.\\workspace_setup.ps1\r\n```\r\n\r\n### Approach 3: Interactive Terminal\r\n\r\nRun these commands in sequence. Show output after each and ask the user to\r\nconfirm before continuing.\r\n\r\n**Install and authenticate:**\r\n```bash\r\npip install ms-fabric-cli\r\nfab auth login\r\n```\r\n\r\n**Check if workspace already exists:**\r\n```bash\r\nfab exists \"WORKSPACE_NAME.Workspace\"\r\n```\r\n- Exit code 0 → workspace exists → skip creation, go to role assignment\r\n- Non-zero → proceed to create\r\n\r\n**Create workspace:**\r\n```bash\r\nfab mkdir \"WORKSPACE_NAME.Workspace\" -P capacityName=CAPACITY_NAME\r\n```\r\n\r\n**Verify creation:**\r\n```bash\r\nfab exists \"WORKSPACE_NAME.Workspace\"\r\nfab ls \"WORKSPACE_NAME.Workspace\"\r\n```\r\n\r\n**Resolve principal IDs** (before assigning roles — repeat for each principal):\r\n```bash\r\n# For a user (by UPN / email):\r\naz ad user show --id user@corp.com --query id -o tsv\r\n\r\n# For a group (by display name):\r\naz ad group show --group \"Finance Team\" --query id -o tsv\r\n\r\n# For a service principal (by display name or app ID):\r\naz ad sp show --id \"My App Name\" --query id -o tsv\r\n```\r\n\r\n**Assign roles** (use the resolved Object ID, role in lowercase):\r\n```bash\r\nfab acl set \"WORKSPACE_NAME.Workspace\" -I 
<RESOLVED_OBJECT_ID> -R role\r\n```\r\n\r\n**Verify roles:**\r\n```bash\r\nfab acl ls \"WORKSPACE_NAME.Workspace\"\r\n```\r\n\r\n**Create domain** (if Step 2 = A):\r\n```bash\r\n# Resolve domain contributor group ID:\r\naz ad group show --group \"DOMAIN_CONTRIBUTOR_GROUP\" --query id -o tsv\r\n\r\nfab mkdir \"DOMAIN_NAME.domain\"\r\nfab acl set \".domains/DOMAIN_NAME.Domain\" -I <RESOLVED_GROUP_ID> -R contributor\r\n```\r\n\r\n**Assign workspace to domain** (if Step 2 = A or B):\r\n```bash\r\nfab assign \".domains/DOMAIN_NAME.Domain\" -W \"WORKSPACE_NAME.Workspace\"\r\n```\r\n\r\n## Step 5 — Generate Workspace Definition\r\n\r\nCollect from the command output (or ask the user):\r\n- Workspace ID (appears in `fab ls` output)\r\n- Tenant name or tenant ID\r\n- Confirmed principals and roles\r\n- Domain name (if assigned)\r\n\r\nRun `scripts/generate_definition.py`:\r\n\r\n```bash\r\npython scripts/generate_definition.py \\\r\n --workspace-name \"WORKSPACE_NAME\" \\\r\n --workspace-id \"WORKSPACE_ID\" \\\r\n --capacity-name \"CAPACITY_NAME\" \\\r\n --tenant \"TENANT_NAME\" \\\r\n --roles \"user@corp.com:User:Admin,Finance Team:Group:Member\" \\\r\n [--domain-name \"DOMAIN_NAME\"] \\\r\n --approach \"notebook|powershell|terminal\" \\\r\n --output workspace_definition.md\r\n```\r\n\r\nPresent `workspace_definition.md` to the user.\r\n\r\n## Gotchas\r\n\r\n- Workspace path format is `WorkspaceName.Workspace` — the `.Workspace` suffix is required.\r\n- The capacity must be **Active** before `fab mkdir`. If you see `CapacityNotInActiveState`,\r\n ask the user to resume the capacity in the Azure portal before retrying.\r\n- `notebookutils.credentials.getToken()` in Fabric notebooks **does not support Microsoft Graph**.\r\n The notebook approach requires a Service Principal with `Group.Read.All` + `User.Read.All`\r\n application permissions and admin consent. The SP credentials are entered in Cell 1 of\r\n the generated notebook. 
If the user doesn't have an SP, direct them to the PowerShell\r\n or Interactive Terminal approach instead.\r\n- Domain creation requires Fabric Administrator tenant-level rights. If the user cannot\r\n create a domain, fall back to assigning an existing one or skipping.\r\n- `fab exists` uses exit code (0 = exists, non-zero = not found) — do not rely on stdout text alone.\r\n- In the notebook approach, `notebookutils` is only available inside a Fabric notebook.\r\n The generated script must not be run as a plain Python script outside Fabric.\r\n- The `.domain` suffix (lowercase) is used in `fab mkdir`; `.Domain` (capitalised) is\r\n used in `fab assign` and `fab acl set` — these are different and both matter.\r\n- Role values passed to `fab acl set` must be **lowercase** (`admin`, `member`, `contributor`, `viewer`).\r\n The scripts handle this conversion automatically.\r\n- For PowerShell/terminal approaches, `az login` must be completed before `az ad user/group show` will work.\r\n This is separate from `fab auth login` — both are required.\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/generate_notebook.py`** — Generates PySpark notebook. Run: `python scripts/generate_notebook.py --help`\r\n- **`scripts/generate_ps1.py`** — Generates PowerShell script. Run: `python scripts/generate_ps1.py --help`\r\n- **`scripts/generate_definition.py`** — Generates workspace definition markdown. Run: `python scripts/generate_definition.py --help`\r\n\r\n## Available References\r\n\r\n- **`references/role-assignment.md`** — Approach-specific guidance for assigning roles to users and Entra groups. Load when user wants to assign additional workspace roles.\r\n- **`references/fabric-cli-reference.md`** — Fabric CLI command reference.\r\n",
|
|
176
197
|
},
|
|
177
198
|
{
|
|
178
199
|
relativePath: "references/fabric-cli-reference.md",
|
|
179
|
-
content: "# Fabric CLI Quick Reference\r\n\r\nLoad this file if you need to look up fabric-cli syntax during execution.\r\n\r\n## Installation\r\n\r\n```bash\r\npip install ms-fabric-cli # local / PowerShell\r\n%pip install ms-fabric-cli -q # inside Fabric notebook\r\n```\r\n\r\n## Authentication\r\n\r\n```bash\r\nfab auth login # browser pop-up (local)\r\n# OR set environment variables:\r\nexport FAB_TOKEN=<pbi_token>\r\nexport FAB_TOKEN_ONELAKE=<
|
|
200
|
+
content: "# Fabric CLI Quick Reference\r\n\r\nLoad this file if you need to look up fabric-cli syntax during execution.\r\n\r\n## Installation\r\n\r\n```bash\r\npip install ms-fabric-cli # local / PowerShell\r\n%pip install ms-fabric-cli -q # inside Fabric notebook\r\n```\r\n\r\n## Authentication\r\n\r\n```bash\r\nfab auth login # browser pop-up (local)\r\n# OR set environment variables:\r\nexport FAB_TOKEN=<pbi_token>\r\nexport FAB_TOKEN_ONELAKE=<storage_token> # OneLake needs storage scope\r\nexport FAB_TOKEN_AZURE=<pbi_token>\r\n```\r\n\r\nInside a Fabric notebook (pip install MUST be in a separate cell — kernel restarts):\r\n```python\r\n# Cell 1 — Install (kernel restarts after this)\r\n%pip install ms-fabric-cli -q --no-warn-conflicts\r\n```\r\n\r\n```python\r\n# Cell 2 — Authenticate\r\nimport os, sysconfig\r\n\r\nscripts_dir = sysconfig.get_path('scripts')\r\nos.environ['PATH'] = scripts_dir + os.pathsep + os.environ.get('PATH', '')\r\n\r\ntoken = notebookutils.credentials.getToken(\"pbi\")\r\nstorage_token = notebookutils.credentials.getToken(\"storage\")\r\nos.environ[\"FAB_TOKEN\"] = token\r\nos.environ[\"FAB_TOKEN_ONELAKE\"] = storage_token # OneLake needs storage scope\r\nos.environ[\"FAB_TOKEN_AZURE\"] = token\r\n\r\n!fab auth status\r\n```\r\n\r\n## Workspace Commands\r\n\r\n| Action | Command |\r\n|---|---|\r\n| Check exists | `fab exists \"WorkspaceName.Workspace\"` |\r\n| Create | `fab mkdir \"WorkspaceName.Workspace\" -P capacityName=CAPACITY_NAME` |\r\n| List contents | `fab ls \"WorkspaceName.Workspace\"` |\r\n\r\n**Note:** Path format is `DisplayName.Workspace` — the `.Workspace` suffix is required.\r\n\r\n## Resolving Entra Names/Emails to Object IDs\r\n\r\n`fab acl set` requires Entra Object IDs. 
Use these methods to resolve names first.\r\n\r\n### In a Fabric Notebook (Graph API)\r\n\r\n```python\r\nimport requests\r\n\r\ndef resolve_principal_id(token: str, name_or_email: str, principal_type: str) -> str:\r\n headers = {\"Authorization\": f\"Bearer {token}\", \"ConsistencyLevel\": \"eventual\"}\r\n if principal_type == \"User\":\r\n resp = requests.get(f\"https://graph.microsoft.com/v1.0/users/{name_or_email}\", headers=headers)\r\n elif principal_type == \"Group\":\r\n resp = requests.get(\r\n f\"https://graph.microsoft.com/v1.0/groups?$filter=displayName eq '{name_or_email}'&$select=id\",\r\n headers=headers)\r\n return resp.json()[\"value\"][0][\"id\"]\r\n resp.raise_for_status()\r\n return resp.json()[\"id\"]\r\n\r\n# Get Graph token inside notebook:\r\ngraph_token = notebookutils.credentials.getToken(\"https://graph.microsoft.com\")\r\n```\r\n\r\n### Locally (Azure CLI)\r\n\r\n```bash\r\n# Resolve user by email / UPN:\r\naz ad user show --id user@corp.com --query id -o tsv\r\n\r\n# Resolve group by display name:\r\naz ad group show --group \"Finance Team\" --query id -o tsv\r\n\r\n# Resolve service principal by display name:\r\naz ad sp show --id \"My App Name\" --query id -o tsv\r\n```\r\n\r\nRequires `az login` before use (separate from `fab auth login`).\r\n\r\n\r\n\r\n## Workspace Role Assignment\r\n\r\nUse the Power BI REST API via `fab api` — **not** `fab acl set` — for workspace membership roles.\r\n\r\n```bash\r\n# Get workspace ID first\r\nWS_ID=$(fab get \"WorkspaceName.Workspace\" -q \"id\" | tr -d '\"')\r\n\r\n# Add a user by email\r\nfab api -A powerbi \"groups/$WS_ID/users\" -X post \\\r\n -i '{\"emailAddress\": \"user@corp.com\", \"groupUserAccessRight\": \"Member\"}'\r\n\r\n# Add a group by Object ID\r\nfab api -A powerbi \"groups/$WS_ID/users\" -X post \\\r\n -i '{\"identifier\": \"<group-object-id>\", \"groupUserAccessRight\": \"Member\", \"principalType\": \"Group\"}'\r\n\r\n# List current users\r\nfab api -A powerbi 
\"groups/$WS_ID/users\"\r\n\r\n# Remove a user\r\nfab api -A powerbi \"groups/$WS_ID/users/user@corp.com\" -X delete\r\n```\r\n\r\nValid `groupUserAccessRight` values (case-sensitive): `Admin`, `Member`, `Contributor`, `Viewer`\r\n\r\n> ⚠️ `fab acl set` is for OneLake/item-level ACLs, **not** workspace membership roles.\r\n\r\n## Domain Commands\r\n\r\n| Action | Command |\r\n|---|---|\r\n| Create domain | `fab create \".domains/DomainName.Domain\"` _(requires Fabric Admin)_ |\r\n| List domains | `fab ls .domains` |\r\n| Get domain details | `fab get \".domains/DomainName.Domain\"` |\r\n| Set contributor scope | `fab set \".domains/DomainName.Domain\" -q contributorsScope -i \"SpecificUsersAndGroups\"` |\r\n| ~~Set domain contributor~~ | ~~`fab acl set \".domains/DomainName.Domain\" -I GROUP_OBJECT_ID -R contributor -f`~~ ⚠️ **Not supported** — `acl set` does not work on `.domains/` paths. Set domain contributors manually via the Fabric Admin portal: admin.powerbi.com → Domains → select domain → Manage contributors. |\r\n| Assign workspace to domain | `fab assign \".domains/DomainName.Domain\" -W \"WorkspaceName.Workspace\"` |\r\n| Unassign workspace | `fab unassign \".domains/DomainName.Domain\" -W \"WorkspaceName.Workspace\"` |\r\n| Remove domain | `fab rm \".domains/DomainName.Domain\" -f` |\r\n\r\n**Path format:** Always `.domains/DisplayName.Domain` — the `.domains/` prefix and `.Domain` suffix (capital D) are required for all domain operations including creation.\r\n\r\n**⚠️ `fab exists` is unreliable for conditional logic** — it returns exit code 0 regardless of whether the resource exists. 
Use `fab ls` output or just attempt the operation directly.\r\n\r\n## Common Errors\r\n\r\n| Error | Cause | Fix |\r\n|---|---|---|\r\n| `CapacityNotInActiveState` | Capacity is paused | Resume in Azure portal |\r\n| `ItemAlreadyExists` | Workspace name taken | Skip creation, update roles |\r\n| `Unauthorized` | Token expired or missing | Re-authenticate |\r\n| Domain creation fails | Not Fabric Admin | Assign existing domain or skip |\r\n",
|
|
201
|
+
},
|
|
202
|
+
{
|
|
203
|
+
relativePath: "references/role-assignment.md",
|
|
204
|
+
content: "# Role Assignment — Approach-Specific Guidance\n\nLoad this file when the user wants to assign additional workspace roles.\n\n## PySpark Notebook approach\n\n> \"With the notebook approach, you can assign roles to **individual users by email\n> address**. Assigning roles to **Entra groups** requires an app registration\n> (Service Principal) with `Group.Read.All` and `User.Read.All` application\n> permissions and admin consent to resolve group names to Object IDs.\n>\n> Do you have an existing app registration with these permissions? If not, you can either:\n> - Assign roles to individual users only (by email), or\n> - Switch to the PowerShell/Terminal approach for group support, or\n> - Set up an app registration first.\"\n\nIf the user wants groups and has no SP, let them choose — do not silently switch.\n\nCollect additionally: `TENANT_ID`, `CLIENT_ID`, `CLIENT_SECRET` for the app\nregistration. These are entered directly into Cell 1 of the generated notebook.\n\n## PowerShell / Terminal approach\n\n> \"With this approach, you can assign roles to both individual users and Entra\n> groups. Group resolution requires:\n> - Azure CLI installed and logged in (`az login`)\n> - Permission to read Entra group details (`az ad group show`)\n> - The Entra groups must already exist\n>\n> Do you have Azure CLI access with these permissions?\"\n\n## Principal ID resolution\n\n`fab acl set` requires Entra Object IDs. The scripts and notebook resolve\nhuman-readable names/emails automatically before calling the CLI:\n- **Notebook** — uses the Microsoft Graph API via a Service Principal\n- **PowerShell / Terminal** — uses `az ad user show` / `az ad group show`\n (requires Azure CLI installed and `az login` completed)\n\nRole values passed to `fab acl set` must be **lowercase** (`admin`, `member`,\n`contributor`, `viewer`). The scripts handle this conversion automatically.\n",
|
|
180
205
|
},
|
|
181
206
|
{
|
|
182
207
|
relativePath: "scripts/generate_definition.py",
|
|
@@ -184,7 +209,7 @@ export const EMBEDDED_SKILLS = [
|
|
|
184
209
|
},
|
|
185
210
|
{
|
|
186
211
|
relativePath: "scripts/generate_notebook.py",
|
|
187
|
-
content: "#!/usr/bin/env python3\r\n# /// script\r\n# requires-python = \">=3.8\"\r\n# dependencies = []\r\n# ///\r\n\"\"\"\r\nGenerate a Jupyter (.ipynb) notebook for creating a Microsoft Fabric workspace.\r\nEach step is a separate cell so it can be run one at a time inside Fabric.\r\n\r\nPasses names/emails directly to `fab acl set -I`. This works for users (UPN/email)\r\nand may work for groups depending on your tenant configuration. If a group assignment\r\nfails, the cell reports the error and continues — re-run that assignment with the\r\ngroup's Entra Object ID instead.\r\n\r\nUsage:\r\n python scripts/generate_notebook.py --workspace-name NAME --capacity-name CAP --roles ROLES [OPTIONS]\r\n\r\nOptions:\r\n --workspace-name TEXT Workspace display name (required)\r\n --capacity-name TEXT Fabric capacity name (required)\r\n --roles TEXT Comma-separated EMAIL:ROLE list (required)\r\n e.g. \"alice@corp.com:Admin,bob@corp.com:Member\"\r\n Roles: Admin, Member, Contributor, Viewer\r\n Note: user UPNs (email addresses) only.\r\n For group assignments, use the PowerShell approach instead.\r\n --domain-name TEXT Domain name to create or assign (optional)\r\n --create-domain Create the domain before assigning (optional)\r\n --domain-contributor-group TEXT Display name or email of Entra group that manages the domain\r\n (required if --create-domain)\r\n --output TEXT Output file path (default: workspace_setup.ipynb)\r\n --help Show this message and exit\r\n\r\nExamples:\r\n python scripts/generate_notebook.py \\\\\r\n --workspace-name \"Finance-Reporting\" \\\\\r\n --capacity-name \"fabriccapacity01\" \\\\\r\n --roles \"alice@corp.com:User:Admin,Finance Team:Group:Member\" \\\\\r\n --output workspace_setup.ipynb\r\n\"\"\"\r\n\r\nimport argparse\r\nimport json\r\nimport sys\r\nfrom datetime import datetime, timezone\r\n\r\n\r\ndef parse_roles(roles_str: str) -> list[dict]:\r\n entries = []\r\n for entry in roles_str.split(\",\"):\r\n parts = 
entry.strip().split(\":\")\r\n if len(parts) != 2:\r\n print(f\"Error: '{entry}' must be EMAIL:ROLE\", file=sys.stderr)\r\n print(\" e.g. alice@corp.com:Admin\", file=sys.stderr)\r\n sys.exit(1)\r\n email, role = parts[0].strip(), parts[1].strip()\r\n if role not in {\"Admin\", \"Member\", \"Contributor\", \"Viewer\"}:\r\n print(f\"Error: role '{role}' must be Admin, Member, Contributor, or Viewer\", file=sys.stderr)\r\n sys.exit(1)\r\n entries.append({\"email\": email, \"role\": role})\r\n return entries\r\n\r\n\r\ndef md_cell(lines: list[str]) -> dict:\r\n return {\"cell_type\": \"markdown\", \"metadata\": {}, \"source\": [l + \"\\n\" for l in lines]}\r\n\r\n\r\ndef code_cell(lines: list[str]) -> dict:\r\n return {\"cell_type\": \"code\", \"execution_count\": None, \"metadata\": {}, \"outputs\": [], \"source\": [l + \"\\n\" for l in lines]}\r\n\r\n\r\ndef build_notebook(ws_name: str, cap_name: str, roles: list[dict],\r\n domain_name: str | None, create_domain: bool,\r\n domain_contributor_group: str | None) -> dict:\r\n ws_path = f\"{ws_name}.Workspace\"\r\n now = datetime.now(timezone.utc).strftime(\"%Y-%m-%d %H:%M UTC\")\r\n\r\n cells = []\r\n\r\n # ── Title ──────────────────────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n f\"# Fabric Workspace Setup: {ws_name}\",\r\n f\"_Generated: {now}_\",\r\n \"\",\r\n \"**Run each cell one at a time. Read the output before running the next cell.**\",\r\n \"\",\r\n f\"**Prerequisite:** Capacity `{cap_name}` must be in **Active** state.\",\r\n ]))\r\n\r\n # ── Cell 1: Install ────────────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 1 — Install Fabric CLI\",\r\n \"\",\r\n \"⚠️ **The kernel restarts after `%pip install`.** Run this cell first,\",\r\n \"then continue from Cell 2. 
**Skip if `ms-fabric-cli` is already installed.**\",\r\n ]))\r\n cells.append(code_cell([\r\n \"%pip install ms-fabric-cli -q --no-warn-conflicts\",\r\n ]))\r\n\r\n # ── Cell 2: Authenticate + Parameters ─────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 2 — Authenticate & Set Parameters\",\r\n \"\",\r\n \"Sets Fabric CLI tokens using the notebook user's identity and defines workspace parameters.\",\r\n \"**Start here if you skipped Cell 1.**\",\r\n ]))\r\n cells.append(code_cell([\r\n \"import os, sysconfig, json\",\r\n \"\",\r\n \"# ── Parameters ──────────────────────────────────────────────\",\r\n f'ws_name = \"{ws_name}\"',\r\n f'ws_path = \"{ws_path}\"',\r\n f'cap_name = \"{cap_name}\"',\r\n \"\",\r\n \"# ── Auth ────────────────────────────────────────────────────\",\r\n \"scripts_dir = sysconfig.get_path('scripts')\",\r\n \"os.environ['PATH'] = scripts_dir + os.pathsep + os.environ.get('PATH', '')\",\r\n \"\",\r\n \"token = notebookutils.credentials.getToken('pbi')\",\r\n \"os.environ['FAB_TOKEN'] = token\",\r\n \"os.environ['FAB_TOKEN_ONELAKE'] = token\",\r\n \"os.environ['FAB_TOKEN_AZURE'] = token\",\r\n \"print(f'Authenticated. 
Workspace: {ws_name} Capacity: {cap_name}')\",\r\n ]))\r\n\r\n # ── Cell 3: Create workspace ───────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 3 — Create Workspace\",\r\n \"\",\r\n f\"Creates `{ws_name}` on capacity `{cap_name}`.\",\r\n \"If the workspace already exists, fab will report it — that is fine, continue to Cell 4.\",\r\n f\"> ⚠️ If you see `CapacityNotInActiveState`, resume `{cap_name}` in the Azure portal first.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'print(f\"=== Creating workspace: {{ws_name}} ===\")',\r\n f'!fab mkdir \"{{ws_path}}\" -P capacityName={{cap_name}}',\r\n ]))\r\n\r\n # ── Cell 4: Verify workspace + capture WS_ID ──────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 4 — Verify Workspace\",\r\n \"\",\r\n \"✅ Confirm the workspace was created. `WS_ID` is captured for use in later cells.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'print(f\"=== Workspace details: {{ws_name}} ===\")',\r\n f'!fab get \"{{ws_path}}\"',\r\n \"\",\r\n \"ws_id_out = !fab get \\\"{ws_path}\\\" -q \\\"id\\\"\",\r\n \"WS_ID = ws_id_out[0].strip('\\\"')\",\r\n 'print(f\"\\\\nWorkspace ID: {WS_ID}\")',\r\n ]))\r\n\r\n # ── Cell 5: Assign roles ───────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 5 — Assign Workspace Roles\",\r\n \"\",\r\n \"Writes each role payload to a temp file then calls `fab api` to POST it.\",\r\n \"Valid roles: `Admin`, `Member`, `Contributor`, `Viewer`.\",\r\n ]))\r\n role_lines = [\r\n 'print(\"=== Assigning workspace roles ===\")',\r\n \"\",\r\n \"roles = [\",\r\n ]\r\n for r in roles:\r\n role_lines.append(f' (\"{r[\"email\"]}\", \"{r[\"role\"]}\"),')\r\n role_lines += [\r\n \"]\",\r\n \"\",\r\n \"for email, role in roles:\",\r\n \" payload = json.dumps({'emailAddress': email, 'groupUserAccessRight': role})\",\r\n \" tmp = f\\\"/tmp/role_{email.replace('@','_').replace('.','_')}.json\\\"\",\r\n \" with open(tmp, 'w') as f:\",\r\n \" 
f.write(payload)\",\r\n \" print(f\\\" {role} -> {email}\\\")\",\r\n \" !fab api -A powerbi \\\"groups/{WS_ID}/users\\\" -X post -i {tmp}\",\r\n \"\",\r\n 'print(\"Done.\")',\r\n ]\r\n cells.append(code_cell(role_lines))\r\n\r\n # ── Cell 6: Verify roles ───────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 6 — Verify Roles\",\r\n \"\",\r\n \"✅ Confirm all expected users and roles appear.\",\r\n ]))\r\n cells.append(code_cell([\r\n 'print(f\"=== Users in workspace: {ws_name} ===\")',\r\n '!fab api -A powerbi \"groups/{WS_ID}/users\"',\r\n ]))\r\n\r\n # ── Domain cells (optional) ────────────────────────────────────────────────\r\n cell_num = 7\r\n if domain_name:\r\n domain_path = f\".domains/{domain_name}.Domain\"\r\n\r\n if create_domain:\r\n cells.append(md_cell([\r\n f\"## Cell {cell_num} — Create Domain\",\r\n \"\",\r\n f\"Creates domain `{domain_name}`.\",\r\n \"⚠️ Requires Fabric Admin role.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'domain_path = \"{domain_path}\"',\r\n f'print(f\"=== Creating domain: {domain_name} ===\")',\r\n f'!fab create \"{{domain_path}}\"',\r\n \"\",\r\n \"domain_id_out = !fab get \\\"{domain_path}\\\" -q \\\"id\\\"\",\r\n \"DOMAIN_ID = domain_id_out[0].strip('\\\"')\",\r\n 'print(f\"Domain ID: {DOMAIN_ID}\")',\r\n ]))\r\n cell_num += 1\r\n\r\n cells.append(md_cell([\r\n f\"## Cell {cell_num} — Assign Workspace to Domain\",\r\n \"\",\r\n f\"Links `{ws_name}` to domain `{domain_name}` using the Fabric admin API.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'print(f\"=== Assigning {{ws_name}} to domain: {domain_name} ===\")',\r\n \"payload = json.dumps({'workspacesIds': [WS_ID]})\",\r\n \"with open('/tmp/domain_assign.json', 'w') as f:\",\r\n \" f.write(payload)\",\r\n '!fab api -X post \"admin/domains/{DOMAIN_ID}/assignWorkspaces\" -i /tmp/domain_assign.json',\r\n 'print(\"✅ Done.\")',\r\n ]))\r\n cell_num += 1\r\n\r\n # ── Final cell 
─────────────────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n f\"## Cell {cell_num} — Setup Complete\",\r\n \"\",\r\n \"✅ Workspace setup complete.\",\r\n ]))\r\n\r\n return {\r\n \"nbformat\": 4,\r\n \"nbformat_minor\": 5,\r\n \"metadata\": {\r\n \"kernelspec\": {\"display_name\": \"Python 3\", \"language\": \"python\", \"name\": \"python3\"},\r\n \"language_info\": {\"name\": \"python\"},\r\n },\r\n \"cells\": cells,\r\n }\r\n\r\n\r\ndef main():\r\n parser = argparse.ArgumentParser(\r\n description=\"Generate a Jupyter notebook (.ipynb) for Fabric workspace creation.\",\r\n formatter_class=argparse.RawDescriptionHelpFormatter,\r\n epilog=__doc__,\r\n )\r\n parser.add_argument(\"--workspace-name\", required=True)\r\n parser.add_argument(\"--capacity-name\", required=True)\r\n parser.add_argument(\"--roles\", required=True)\r\n parser.add_argument(\"--domain-name\", default=None)\r\n parser.add_argument(\"--create-domain\", action=\"store_true\")\r\n parser.add_argument(\"--domain-contributor-group\", default=None)\r\n parser.add_argument(\"--output\", default=\"workspace_setup.ipynb\")\r\n args = parser.parse_args()\r\n\r\n if args.create_domain and not args.domain_contributor_group:\r\n print(\"Error: --domain-contributor-group is required when --create-domain is set.\", file=sys.stderr)\r\n sys.exit(1)\r\n\r\n roles = parse_roles(args.roles)\r\n notebook = build_notebook(\r\n args.workspace_name, args.capacity_name, roles,\r\n args.domain_name, args.create_domain, args.domain_contributor_group,\r\n )\r\n\r\n output = args.output if args.output.endswith(\".ipynb\") else args.output.replace(\".py\", \".ipynb\")\r\n with open(output, \"w\", encoding=\"utf-8\") as f:\r\n json.dump(notebook, f, indent=2)\r\n\r\n print(f'{{\"status\": \"ok\", \"output\": \"{output}\", \"cells\": {len(notebook[\"cells\"])}, \"roles\": {len(roles)}}}')\r\n print(f\"✅ Notebook written to: {output}\", file=sys.stderr)\r\n\r\n\r\nif __name__ == 
\"__main__\":\r\n main()\r\n",
|
|
212
|
+
content: "#!/usr/bin/env python3\r\n# /// script\r\n# requires-python = \">=3.8\"\r\n# dependencies = []\r\n# ///\r\n\"\"\"\r\nGenerate a Jupyter (.ipynb) notebook for creating a Microsoft Fabric workspace.\r\nEach step is a separate cell so it can be run one at a time inside Fabric.\r\n\r\nPasses names/emails directly to `fab acl set -I`. This works for users (UPN/email)\r\nand may work for groups depending on your tenant configuration. If a group assignment\r\nfails, the cell reports the error and continues — re-run that assignment with the\r\ngroup's Entra Object ID instead.\r\n\r\nUsage:\r\n python scripts/generate_notebook.py --workspace-name NAME --capacity-name CAP --roles ROLES [OPTIONS]\r\n\r\nOptions:\r\n --workspace-name TEXT Workspace display name (required)\r\n --capacity-name TEXT Fabric capacity name (required)\r\n --roles TEXT Comma-separated EMAIL:ROLE list (required)\r\n e.g. \"alice@corp.com:Admin,bob@corp.com:Member\"\r\n Roles: Admin, Member, Contributor, Viewer\r\n Note: user UPNs (email addresses) only.\r\n For group assignments, use the PowerShell approach instead.\r\n --domain-name TEXT Domain name to create or assign (optional)\r\n --create-domain Create the domain before assigning (optional)\r\n --domain-contributor-group TEXT Display name or email of Entra group that manages the domain\r\n (required if --create-domain)\r\n --output TEXT Output file path (default: workspace_setup.ipynb)\r\n --help Show this message and exit\r\n\r\nExamples:\r\n python scripts/generate_notebook.py \\\\\r\n --workspace-name \"Finance-Reporting\" \\\\\r\n --capacity-name \"fabriccapacity01\" \\\\\r\n --roles \"alice@corp.com:Admin,bob@corp.com:Member\" \\\\\r\n --output workspace_setup.ipynb\r\n\"\"\"\r\n\r\nimport argparse\r\nimport json\r\nimport sys\r\nfrom datetime import datetime, timezone\r\n\r\n\r\ndef parse_roles(roles_str: str) -> list[dict]:\r\n entries = []\r\n for entry in roles_str.split(\",\"):\r\n parts = 
entry.strip().split(\":\")\r\n if len(parts) != 2:\r\n print(f\"Error: '{entry}' must be EMAIL:ROLE\", file=sys.stderr)\r\n print(\" e.g. alice@corp.com:Admin\", file=sys.stderr)\r\n sys.exit(1)\r\n email, role = parts[0].strip(), parts[1].strip()\r\n if role not in {\"Admin\", \"Member\", \"Contributor\", \"Viewer\"}:\r\n print(f\"Error: role '{role}' must be Admin, Member, Contributor, or Viewer\", file=sys.stderr)\r\n sys.exit(1)\r\n entries.append({\"email\": email, \"role\": role})\r\n return entries\r\n\r\n\r\ndef md_cell(lines: list[str]) -> dict:\r\n return {\"cell_type\": \"markdown\", \"metadata\": {}, \"source\": [l + \"\\n\" for l in lines]}\r\n\r\n\r\ndef code_cell(lines: list[str]) -> dict:\r\n return {\"cell_type\": \"code\", \"execution_count\": None, \"metadata\": {}, \"outputs\": [], \"source\": [l + \"\\n\" for l in lines]}\r\n\r\n\r\ndef build_notebook(ws_name: str, cap_name: str, roles: list[dict],\r\n domain_name: str | None, create_domain: bool,\r\n domain_contributor_group: str | None) -> dict:\r\n ws_path = f\"{ws_name}.Workspace\"\r\n now = datetime.now(timezone.utc).strftime(\"%Y-%m-%d %H:%M UTC\")\r\n\r\n cells = []\r\n\r\n # ── Title ──────────────────────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n f\"# Fabric Workspace Setup: {ws_name}\",\r\n f\"_Generated: {now}_\",\r\n \"\",\r\n \"**Run each cell one at a time. Read the output before running the next cell.**\",\r\n \"\",\r\n f\"**Prerequisite:** Capacity `{cap_name}` must be in **Active** state.\",\r\n ]))\r\n\r\n # ── Cell 1: Install ────────────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 1 — Install Fabric CLI\",\r\n \"\",\r\n \"⚠️ **The kernel restarts after `%pip install`.** Run this cell first,\",\r\n \"then continue from Cell 2. 
**Skip if `ms-fabric-cli` is already installed.**\",\r\n ]))\r\n cells.append(code_cell([\r\n \"%pip install ms-fabric-cli -q --no-warn-conflicts\",\r\n ]))\r\n\r\n # ── Cell 2: Authenticate + Parameters ─────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 2 — Authenticate & Set Parameters\",\r\n \"\",\r\n \"Sets Fabric CLI tokens using the notebook user's identity and defines workspace parameters.\",\r\n \"**Start here if you skipped Cell 1.**\",\r\n ]))\r\n cells.append(code_cell([\r\n \"import os, sysconfig, json\",\r\n \"\",\r\n \"# ── Parameters ──────────────────────────────────────────────\",\r\n f'ws_name = \"{ws_name}\"',\r\n f'ws_path = \"{ws_path}\"',\r\n f'cap_name = \"{cap_name}\"',\r\n \"\",\r\n \"# ── Auth ────────────────────────────────────────────────────\",\r\n \"scripts_dir = sysconfig.get_path('scripts')\",\r\n \"os.environ['PATH'] = scripts_dir + os.pathsep + os.environ.get('PATH', '')\",\r\n \"\",\r\n \"token = notebookutils.credentials.getToken('pbi')\",\r\n \"storage_token = notebookutils.credentials.getToken('storage')\",\r\n \"os.environ['FAB_TOKEN'] = token\",\r\n \"os.environ['FAB_TOKEN_ONELAKE'] = storage_token # OneLake needs storage scope\",\r\n \"os.environ['FAB_TOKEN_AZURE'] = token\",\r\n \"print(f'Authenticated. 
Workspace: {ws_name} Capacity: {cap_name}')\",\r\n ]))\r\n\r\n # ── Cell 3: Create workspace ───────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 3 — Create Workspace\",\r\n \"\",\r\n f\"Creates `{ws_name}` on capacity `{cap_name}`.\",\r\n \"If the workspace already exists, fab will report it — that is fine, continue to Cell 4.\",\r\n f\"> ⚠️ If you see `CapacityNotInActiveState`, resume `{cap_name}` in the Azure portal first.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'print(f\"=== Creating workspace: {{ws_name}} ===\")',\r\n f'!fab mkdir \"{{ws_path}}\" -P capacityName={{cap_name}}',\r\n ]))\r\n\r\n # ── Cell 4: Verify workspace + capture WS_ID ──────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 4 — Verify Workspace\",\r\n \"\",\r\n \"✅ Confirm the workspace was created. `WS_ID` is captured for use in later cells.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'print(f\"=== Workspace details: {{ws_name}} ===\")',\r\n f'!fab get \"{{ws_path}}\"',\r\n \"\",\r\n \"ws_id_out = !fab get \\\"{ws_path}\\\" -q \\\"id\\\"\",\r\n \"WS_ID = ws_id_out[0].strip('\\\"')\",\r\n 'print(f\"\\\\nWorkspace ID: {WS_ID}\")',\r\n ]))\r\n\r\n # ── Cell 5: Assign roles ───────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 5 — Assign Workspace Roles\",\r\n \"\",\r\n \"Writes each role payload to a temp file then calls `fab api` to POST it.\",\r\n \"Valid roles: `Admin`, `Member`, `Contributor`, `Viewer`.\",\r\n ]))\r\n role_lines = [\r\n 'print(\"=== Assigning workspace roles ===\")',\r\n \"\",\r\n \"roles = [\",\r\n ]\r\n for r in roles:\r\n role_lines.append(f' (\"{r[\"email\"]}\", \"{r[\"role\"]}\"),')\r\n role_lines += [\r\n \"]\",\r\n \"\",\r\n \"for email, role in roles:\",\r\n \" payload = json.dumps({'emailAddress': email, 'groupUserAccessRight': role})\",\r\n \" tmp = f\\\"/tmp/role_{email.replace('@','_').replace('.','_')}.json\\\"\",\r\n \" with open(tmp, 'w') as f:\",\r\n \" 
f.write(payload)\",\r\n \" print(f\\\" {role} -> {email}\\\")\",\r\n \" !fab api -A powerbi \\\"groups/{WS_ID}/users\\\" -X post -i {tmp}\",\r\n \"\",\r\n 'print(\"Done.\")',\r\n ]\r\n cells.append(code_cell(role_lines))\r\n\r\n # ── Cell 6: Verify roles ───────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n \"## Cell 6 — Verify Roles\",\r\n \"\",\r\n \"✅ Confirm all expected users and roles appear.\",\r\n ]))\r\n cells.append(code_cell([\r\n 'print(f\"=== Users in workspace: {ws_name} ===\")',\r\n '!fab api -A powerbi \"groups/{WS_ID}/users\"',\r\n ]))\r\n\r\n # ── Domain cells (optional) ────────────────────────────────────────────────\r\n cell_num = 7\r\n if domain_name:\r\n domain_path = f\".domains/{domain_name}.Domain\"\r\n\r\n if create_domain:\r\n cells.append(md_cell([\r\n f\"## Cell {cell_num} — Create Domain\",\r\n \"\",\r\n f\"Creates domain `{domain_name}`.\",\r\n \"⚠️ Requires Fabric Admin role.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'domain_path = \"{domain_path}\"',\r\n f'print(f\"=== Creating domain: {domain_name} ===\")',\r\n f'!fab create \"{{domain_path}}\"',\r\n \"\",\r\n \"domain_id_out = !fab get \\\"{domain_path}\\\" -q \\\"id\\\"\",\r\n \"DOMAIN_ID = domain_id_out[0].strip('\\\"')\",\r\n 'print(f\"Domain ID: {DOMAIN_ID}\")',\r\n ]))\r\n cell_num += 1\r\n\r\n cells.append(md_cell([\r\n f\"## Cell {cell_num} — Assign Workspace to Domain\",\r\n \"\",\r\n f\"Links `{ws_name}` to domain `{domain_name}` using the Fabric admin API.\",\r\n ]))\r\n cells.append(code_cell([\r\n f'print(f\"=== Assigning {{ws_name}} to domain: {domain_name} ===\")',\r\n \"payload = json.dumps({'workspacesIds': [WS_ID]})\",\r\n \"with open('/tmp/domain_assign.json', 'w') as f:\",\r\n \" f.write(payload)\",\r\n '!fab api -X post \"admin/domains/{DOMAIN_ID}/assignWorkspaces\" -i /tmp/domain_assign.json',\r\n 'print(\"✅ Done.\")',\r\n ]))\r\n cell_num += 1\r\n\r\n # ── Final cell 
─────────────────────────────────────────────────────────────\r\n cells.append(md_cell([\r\n f\"## Cell {cell_num} — Setup Complete\",\r\n \"\",\r\n \"✅ Workspace setup complete.\",\r\n ]))\r\n\r\n return {\r\n \"nbformat\": 4,\r\n \"nbformat_minor\": 5,\r\n \"metadata\": {\r\n \"kernelspec\": {\"display_name\": \"Python 3\", \"language\": \"python\", \"name\": \"python3\"},\r\n \"language_info\": {\"name\": \"python\"},\r\n },\r\n \"cells\": cells,\r\n }\r\n\r\n\r\ndef main():\r\n parser = argparse.ArgumentParser(\r\n description=\"Generate a Jupyter notebook (.ipynb) for Fabric workspace creation.\",\r\n formatter_class=argparse.RawDescriptionHelpFormatter,\r\n epilog=__doc__,\r\n )\r\n parser.add_argument(\"--workspace-name\", required=True)\r\n parser.add_argument(\"--capacity-name\", required=True)\r\n parser.add_argument(\"--roles\", required=True)\r\n parser.add_argument(\"--domain-name\", default=None)\r\n parser.add_argument(\"--create-domain\", action=\"store_true\")\r\n parser.add_argument(\"--domain-contributor-group\", default=None)\r\n parser.add_argument(\"--output\", default=\"workspace_setup.ipynb\")\r\n args = parser.parse_args()\r\n\r\n if args.create_domain and not args.domain_contributor_group:\r\n print(\"Error: --domain-contributor-group is required when --create-domain is set.\", file=sys.stderr)\r\n sys.exit(1)\r\n\r\n roles = parse_roles(args.roles)\r\n notebook = build_notebook(\r\n args.workspace_name, args.capacity_name, roles,\r\n args.domain_name, args.create_domain, args.domain_contributor_group,\r\n )\r\n\r\n output = args.output if args.output.endswith(\".ipynb\") else args.output.replace(\".py\", \".ipynb\")\r\n with open(output, \"w\", encoding=\"utf-8\") as f:\r\n json.dump(notebook, f, indent=2)\r\n\r\n print(f'{{\"status\": \"ok\", \"output\": \"{output}\", \"cells\": {len(notebook[\"cells\"])}, \"roles\": {len(roles)}}}')\r\n print(f\"✅ Notebook written to: {output}\", file=sys.stderr)\r\n\r\n\r\nif __name__ == 
\"__main__\":\r\n main()\r\n",
|
|
188
213
|
},
|
|
189
214
|
{
|
|
190
215
|
relativePath: "scripts/generate_ps1.py",
|
|
@@ -194,19 +219,16 @@ export const EMBEDDED_SKILLS = [
|
|
|
194
219
|
},
|
|
195
220
|
{
|
|
196
221
|
name: "pdf-to-bronze-delta-tables",
|
|
222
|
+
category: "fabric",
|
|
197
223
|
files: [
|
|
198
224
|
{
|
|
199
225
|
relativePath: "SKILL.md",
|
|
200
|
-
content: "---\r\nname: pdf-to-bronze-delta-tables\r\ndescription: >\r\n Use this skill to extract structured data from PDF files on an operator's\r\n local machine, upload them to a Microsoft Fabric bronze lakehouse, and convert\r\n them to a delta table using AI-powered field extraction. Triggers on: \"create\r\n delta tables from PDFs\", \"extract data from PDF invoices to Fabric\", \"load\r\n PDFs into bronze lakehouse\", \"parse PDF documents to delta format\", \"ingest\r\n PDF files to Fabric tables\". Does NOT trigger for CSV/Excel ingestion,\r\n transforming existing delta tables, or non-Fabric storage targets.\r\nlicense: MIT\r\ncompatibility: >\r\n Python 3.8+ for scripts/. Fabric CLI (fab) for CLI upload option.\r\n Fabric notebook runtime 1.3 required (for synapse.ml.aifunc).\r\n---\r\n\r\n# PDF to Bronze Delta Tables\r\n\r\nUploads PDF files from a local machine to a Microsoft Fabric bronze lakehouse\r\nand converts each PDF into a row in a delta table using AI field extraction.\r\nThe lakehouse must already exist.\r\n\r\n> ⚠️ **GOVERNANCE RULE**: This skill **never executes `fab` CLI commands directly**.\r\n> All `fab` commands are written to a PowerShell script for the operator to run.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `WORKSPACE_NAME` | Fabric workspace name (exact, case-sensitive) | `\"Landon Finance Month End\"` |\r\n| `LAKEHOUSE_NAME` | Bronze lakehouse name (exact, case-sensitive) | `\"Lh_landon_finance_bronze\"` |\r\n| `LAKEHOUSE_FILES_FOLDER` | Folder name under lakehouse Files section | `\"Booking PDFs\"` |\r\n| `TABLE_NAME` | Target delta table name (snake_case) | `\"booking_invoices\"` |\r\n| `LOCAL_PDF_FOLDER` | Exact absolute path to local PDF folder (CLI upload only) | `\"C:\\Users\\rishi\\Data\\Booking PDFs\"` |\r\n| `FIELDS` | Fields to extract from each PDF — collected in Step 2 | See workflow |\r\n\r\n## Workflow\r\n\r\n- [ ] **Collect parameters** — If 
`WORKSPACE_NAME` or `LAKEHOUSE_NAME` are not\r\n provided, ask the operator for them before proceeding.\r\n\r\n- [ ] **Suggest and confirm extraction fields** — Before asking the operator to\r\n define fields from scratch, the agent should **read a sample PDF** to understand\r\n the document structure and proactively suggest fields:\r\n\r\n 1. Use `pdfplumber` (or equivalent) to extract text from 1–2 sample PDFs in\r\n `LOCAL_PDF_FOLDER`. If a second PDF is from a different sub-group (e.g.\r\n different property/entity), include it to confirm layout consistency.\r\n 2. Identify all extractable fields from the document structure (headers, labels,\r\n line items, totals, payment details, etc.).\r\n 3. Present the suggested fields to the operator in a table format, split into:\r\n - **Header-level fields** (one row per PDF) — for the main table\r\n - **Line-item fields** (multiple rows per PDF) — for the detail table, if\r\n the document contains repeating line items\r\n 4. For each field, show: `snake_case` name, extraction hint for the AI, and an\r\n example value from the sample PDF.\r\n 5. Ask the operator:\r\n - \"Do these fields look right? Anything to add, remove, or rename?\"\r\n - \"What should the main delta table be named?\" → `TABLE_NAME`\r\n - \"Do you want a second table for line/detail items?\" If yes:\r\n → `LINE_ITEMS_TABLE_NAME` and confirm the line-item fields\r\n - \"What folder name will the PDFs be stored in under the lakehouse Files\r\n section?\" → `LAKEHOUSE_FILES_FOLDER`\r\n 6. 
**Do not proceed until the operator confirms the fields.**\r\n\r\n Build `FIELDS` as a JSON array: `[{\"name\": \"...\", \"description\": \"...\"}, ...]`\r\n\r\n If the operator confirmed a second line-items table, build `LINE_ITEMS_FIELDS`\r\n as a JSON array: `[{\"name\": \"...\", \"description\": \"...\"}, ...]`\r\n\r\n- [ ] **Upload PDFs** — Present these three options and ask the operator to choose:\r\n\r\n **Option 1 — OneLake File Explorer (Manual)**\r\n Drag-and-drop the PDFs into the target folder under the lakehouse Files section\r\n using the OneLake File Explorer desktop app. No agent action required.\r\n\r\n **Option 2 — Fabric UI (Manual)**\r\n In the Fabric browser UI navigate to the lakehouse → Files section → open or\r\n create the `LAKEHOUSE_FILES_FOLDER` folder → click **Upload** and select the\r\n PDF files. No agent action required.\r\n\r\n **Option 3 — Fabric CLI (Automated)**\r\n > ⚠️ **Performance note**: The CLI uploads files one at a time. For large\r\n > batches (50+ files) this is significantly slower than Options 1 or 2.\r\n > Recommend Options 1 or 2 for bulk uploads.\r\n\r\n Ask for `LOCAL_PDF_FOLDER` (exact absolute path). 
Then run:\r\n ```\r\n python scripts/generate_upload_commands.py \\\r\n --local-folder \"<LOCAL_PDF_FOLDER>\" \\\r\n --workspace \"<WORKSPACE_NAME>\" \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --output-script \"<OUTPUT_FOLDER>\\upload_pdf_files.ps1\"\r\n ```\r\n Present the script path to the operator and ask them to run it.\r\n\r\n## Output Folder\r\n\r\nBefore beginning, create the output folder:\r\n```\r\noutputs/pdf-to-bronze-delta-tables_{YYYY-MM-DD_HH-MM}_{USERNAME}/\r\n```\r\nAll generated scripts and notebooks for this run are saved here.\r\n\r\n- [ ] **Confirm upload** — Ask the operator to confirm all PDFs are visible in the\r\n lakehouse Files section before proceeding.\r\n\r\n- [ ] **Generate TEST notebook** — Run:\r\n ```\r\n python scripts/generate_notebook.py \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --table-name \"<TABLE_NAME>\" \\\r\n --fields-json \"<FIELDS_JSON>\" \\\r\n [--line-items-table-name \"<LINE_ITEMS_TABLE_NAME>\"] \\\r\n [--line-items-fields-json \"<LINE_ITEMS_FIELDS_JSON>\"] \\\r\n --test-mode \\\r\n --output-notebook \"<OUTPUT_FOLDER>\\pdf_to_delta_TEST.ipynb\"\r\n ```\r\n Where `<FIELDS_JSON>` is the JSON array built from `FIELDS` above, as a\r\n single-line string (e.g. `'[{\"name\":\"invoice_number\",\"description\":\"...\"}]'`).\r\n Include `--line-items-table-name` and `--line-items-fields-json` if a second\r\n line-items table was requested — both must be provided together.\r\n\r\n Tell the operator:\r\n 1. Go to the workspace → **New** → **Import notebook**\r\n 2. Select `pdf_to_delta_TEST.ipynb`\r\n 3. Click **Run All** — the notebook attaches the lakehouse automatically and\r\n processes **one PDF only**\r\n 4. 
Share the output row displayed at the end of the notebook\r\n\r\n- [ ] **Validate and iterate** — Review the output row the operator shares:\r\n - Check each field has a value and it looks correct\r\n - If a field is missing or wrong: update its description in `FIELDS_JSON`,\r\n regenerate the TEST notebook, and ask the operator to re-run it\r\n - Repeat until all fields are correct\r\n - **Do not proceed to full run until the test row is confirmed correct**\r\n\r\n- [ ] **Generate FULL notebook** — Once test output is confirmed, run the same\r\n command **without** `--test-mode`:\r\n ```\r\n python scripts/generate_notebook.py \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --table-name \"<TABLE_NAME>\" \\\r\n --fields-json \"<FIELDS_JSON>\" \\\r\n [--line-items-table-name \"<LINE_ITEMS_TABLE_NAME>\"] \\\r\n [--line-items-fields-json \"<LINE_ITEMS_FIELDS_JSON>\"] \\\r\n --output-notebook \"<OUTPUT_FOLDER>\\pdf_to_delta_FULL.ipynb\"\r\n ```\r\n Tell the operator to import and run `pdf_to_delta_FULL.ipynb`. This processes\r\n all PDFs in the folder.\r\n\r\n- [ ] **Validate final table** — Ask the operator to confirm:\r\n - Delta table `<TABLE_NAME>` appears in the Tables section of the lakehouse\r\n - Row count matches the number of PDFs uploaded\r\n - Spot-check a few rows for data quality\r\n\r\n## Table Naming\r\n\r\n- Use a descriptive `snake_case` name based on the document type, not the filename\r\n- PDFs are individual records — do not derive table name from filenames\r\n- Ask the operator to confirm the table name before generating any notebook\r\n\r\n## Gotchas\r\n\r\n- **AI features must be enabled on the capacity.** `synapse.ml.aifunc` uses Fabric's\r\n built-in AI endpoint — no Azure OpenAI key needed. 
Prerequisites: (1) paid Fabric\r\n capacity F2 or higher, (2) tenant admin must enable \"Copilot and other features\r\n powered by Azure OpenAI\" in Admin portal → Tenant settings, (3) if capacity is\r\n outside an Azure OpenAI region, also enable the cross-geo processing toggle.\r\n- **Default model is `gpt-4.1-mini`.** If the notebook throws `DeploymentConfigNotFound`,\r\n the `MODEL_DEPLOYMENT_NAME` in the configuration cell doesn't match a model on\r\n the built-in endpoint. Check supported models at\r\n https://learn.microsoft.com/en-us/fabric/data-science/ai-services/ai-services-overview\r\n- `fab cp` requires `./filename` (forward slash) syntax.Absolute Windows paths\r\n (`C:\\...`) cause `[NotSupported]` errors. The generated script uses `Push-Location`\r\n to work around this — do not modify this pattern.\r\n- **Destination folder must exist before uploading.** The script runs `fab mkdir` first.\r\n Running `fab mkdir` on an existing folder is safe.\r\n- `WORKSPACE_NAME` and `LAKEHOUSE_NAME` are case-sensitive.\r\n- The notebook uses `synapse.ml.aifunc` which requires Fabric **runtime 1.3**.\r\n If the operator sees import errors, check runtime version in notebook settings.\r\n- The `%%configure` cell attaches the lakehouse automatically — no manual\r\n attachment needed before clicking Run All.\r\n- AI extraction temperature is set to `0.0` for consistency, but it is still\r\n non-deterministic across different PDF layouts. Always validate with TEST mode first.\r\n- All extracted fields are written as strings. If the operator needs typed columns\r\n (dates, numbers), add a post-processing step after confirming extraction is correct.\r\n- The notebook installs `openai` and `pymupdf4llm` at runtime. 
The `synapse.ml.aifunc`\r\n package is pre-installed in Fabric Runtime 1.3+.\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/generate_upload_commands.py`** — Scans a local folder for PDFs and\r\n writes a PowerShell script of `fab cp` upload commands.\r\n Run: `python scripts/generate_upload_commands.py --help`\r\n- **`scripts/generate_notebook.py`** — Generates a Fabric-compatible `.ipynb`\r\n notebook with the AI extraction prompt pre-populated from the supplied fields.\r\n Supports `--test-mode` for single-PDF validation runs.\r\n Run: `python scripts/generate_notebook.py --help`\r\n",
|
|
226
|
+
content: "---\r\nname: pdf-to-bronze-delta-tables\r\ndescription: >\r\n Use this skill to extract structured data from PDF files on an operator's\r\n local machine, upload them to a Microsoft Fabric bronze lakehouse, and convert\r\n them to a delta table using AI-powered field extraction. Triggers on: \"create\r\n delta tables from PDFs\", \"extract data from PDF invoices to Fabric\", \"load\r\n PDFs into bronze lakehouse\", \"parse PDF documents to delta format\", \"ingest\r\n PDF files to Fabric tables\". Does NOT trigger for CSV/Excel ingestion,\r\n transforming existing delta tables, or non-Fabric storage targets.\r\nlicense: MIT\r\ncompatibility: >\r\n Python 3.8+ for scripts/. Fabric CLI (fab) for CLI upload option.\r\n Fabric notebook runtime 1.3 required (for synapse.ml.aifunc).\r\n---\r\n\r\n# PDF to Bronze Delta Tables\r\n\r\nUploads PDF files from a local machine to a Microsoft Fabric bronze lakehouse\r\nand converts each PDF into a row in a delta table using AI field extraction.\r\nThe lakehouse must already exist.\r\n\r\n> ⚠️ **GOVERNANCE RULE**: This skill **never executes `fab` CLI commands directly**.\r\n> All `fab` commands are written to a PowerShell script for the operator to run.\r\n\r\n## Inputs\r\n\r\n| Parameter | Description | Example |\r\n|-----------|-------------|---------|\r\n| `WORKSPACE_NAME` | Fabric workspace name (exact, case-sensitive) | `\"Landon Finance Month End\"` |\r\n| `LAKEHOUSE_NAME` | Bronze lakehouse name (exact, case-sensitive) | `\"Lh_landon_finance_bronze\"` |\r\n| `LAKEHOUSE_FILES_FOLDER` | Folder name under lakehouse Files section | `\"Booking PDFs\"` |\r\n| `TABLE_NAME` | Target delta table name (snake_case) | `\"booking_invoices\"` |\r\n| `LOCAL_PDF_FOLDER` | Exact absolute path to local PDF folder (CLI upload only) | `\"C:\\Users\\rishi\\Data\\Booking PDFs\"` |\r\n| `FIELDS` | Fields to extract from each PDF — collected in Step 2 | See workflow |\r\n\r\n## Workflow\r\n\r\n- [ ] **Collect parameters** — If 
`WORKSPACE_NAME` or `LAKEHOUSE_NAME` are not\r\n provided, ask the operator for them before proceeding.\r\n\r\n- [ ] **Suggest and confirm extraction fields** — Before asking the operator to\r\n define fields from scratch, the agent should **read a sample PDF** to understand\r\n the document structure and proactively suggest fields:\r\n\r\n 1. Use `pdfplumber` (or equivalent) to extract text from 1–2 sample PDFs in\r\n `LOCAL_PDF_FOLDER`. If a second PDF is from a different sub-group (e.g.\r\n different property/entity), include it to confirm layout consistency.\r\n 2. Identify all extractable fields from the document structure (headers, labels,\r\n line items, totals, payment details, etc.).\r\n 3. Present the suggested fields to the operator in a table format, split into:\r\n - **Header-level fields** (one row per PDF) — for the main table\r\n - **Line-item fields** (multiple rows per PDF) — for the detail table, if\r\n the document contains repeating line items\r\n 4. For each field, show: `snake_case` name, extraction hint for the AI, and an\r\n example value from the sample PDF.\r\n 5. Ask the operator:\r\n - \"Do these fields look right? Anything to add, remove, or rename?\"\r\n - \"What should the main delta table be named?\" → `TABLE_NAME`\r\n - \"Do you want a second table for line/detail items?\" If yes:\r\n → `LINE_ITEMS_TABLE_NAME` and confirm the line-item fields\r\n - \"What folder name will the PDFs be stored in under the lakehouse Files\r\n section?\" → `LAKEHOUSE_FILES_FOLDER`\r\n 6. 
**Do not proceed until the operator confirms the fields.**\r\n\r\n Build `FIELDS` as a JSON array: `[{\"name\": \"...\", \"description\": \"...\"}, ...]`\r\n\r\n If the operator confirmed a second line-items table, build `LINE_ITEMS_FIELDS`\r\n as a JSON array: `[{\"name\": \"...\", \"description\": \"...\"}, ...]`\r\n\r\n- [ ] **Upload PDFs** — Present these three options and ask the operator to choose:\r\n\r\n **Option 1 — OneLake File Explorer (Manual)**\r\n Drag-and-drop the PDFs into the target folder under the lakehouse Files section\r\n using the OneLake File Explorer desktop app. No agent action required.\r\n\r\n **Option 2 — Fabric UI (Manual)**\r\n In the Fabric browser UI navigate to the lakehouse → Files section → open or\r\n create the `LAKEHOUSE_FILES_FOLDER` folder → click **Upload** and select the\r\n PDF files. No agent action required.\r\n\r\n **Option 3 — Fabric CLI (Automated)**\r\n > ⚠️ **Requires PowerShell** — generates a `.ps1` script. PowerShell is available\r\n > on Windows natively and on Mac/Linux via `brew install powershell`. If PowerShell\r\n > is not available and the operator does not want to install it, use Option 1 or 2.\r\n > Do not substitute a bash or shell script.\r\n >\r\n > ⚠️ **Performance note**: The CLI uploads files one at a time. For large\r\n > batches (50+ files) this is significantly slower than Options 1 or 2.\r\n > Recommend Options 1 or 2 for bulk uploads.\r\n\r\n Ask for `LOCAL_PDF_FOLDER` (exact absolute path). 
Then run:\r\n ```\r\n python scripts/generate_upload_commands.py \\\r\n --local-folder \"<LOCAL_PDF_FOLDER>\" \\\r\n --workspace \"<WORKSPACE_NAME>\" \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --output-script \"<OUTPUT_FOLDER>/upload_pdf_files.ps1\"\r\n ```\r\n Present the script path to the operator and ask them to run it with `pwsh upload_pdf_files.ps1`.\r\n\r\n## Output Folder\r\n\r\nBefore beginning, create the output folder:\r\n```\r\noutputs/pdf-to-bronze-delta-tables_{YYYY-MM-DD_HH-MM}_{USERNAME}/\r\n```\r\nAll generated scripts and notebooks for this run are saved here.\r\n\r\n- [ ] **Confirm upload** — Ask the operator to confirm all PDFs are visible in the\r\n lakehouse Files section before proceeding.\r\n\r\n- [ ] **Generate TEST notebook** — Run:\r\n ```\r\n python scripts/generate_notebook.py \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --table-name \"<TABLE_NAME>\" \\\r\n --fields-json \"<FIELDS_JSON>\" \\\r\n [--line-items-table-name \"<LINE_ITEMS_TABLE_NAME>\"] \\\r\n [--line-items-fields-json \"<LINE_ITEMS_FIELDS_JSON>\"] \\\r\n --test-mode \\\r\n --output-notebook \"<OUTPUT_FOLDER>/pdf_to_delta_TEST.ipynb\"\r\n ```\r\n Where `<FIELDS_JSON>` is the JSON array built from `FIELDS` above, as a\r\n single-line string (e.g. `'[{\"name\":\"invoice_number\",\"description\":\"...\"}]'`).\r\n Include `--line-items-table-name` and `--line-items-fields-json` if a second\r\n line-items table was requested — both must be provided together.\r\n\r\n Tell the operator:\r\n 1. Go to the workspace → **New** → **Import notebook**\r\n 2. Select `pdf_to_delta_TEST.ipynb`\r\n 3. Click **Run All** — the notebook attaches the lakehouse automatically and\r\n processes **one PDF only**\r\n 4. 
Share the output row displayed at the end of the notebook\r\n\r\n- [ ] **Validate and iterate** — Review the output row the operator shares:\r\n - Check each field has a value and it looks correct\r\n - If a field is missing or wrong: update its description in `FIELDS_JSON`,\r\n regenerate the TEST notebook, and ask the operator to re-run it\r\n - Repeat until all fields are correct\r\n - **Do not proceed to full run until the test row is confirmed correct**\r\n\r\n- [ ] **Generate FULL notebook** — Once test output is confirmed, run the same\r\n command **without** `--test-mode`:\r\n ```\r\n python scripts/generate_notebook.py \\\r\n --lakehouse \"<LAKEHOUSE_NAME>\" \\\r\n --lakehouse-folder \"<LAKEHOUSE_FILES_FOLDER>\" \\\r\n --table-name \"<TABLE_NAME>\" \\\r\n --fields-json \"<FIELDS_JSON>\" \\\r\n [--line-items-table-name \"<LINE_ITEMS_TABLE_NAME>\"] \\\r\n [--line-items-fields-json \"<LINE_ITEMS_FIELDS_JSON>\"] \\\r\n --output-notebook \"<OUTPUT_FOLDER>/pdf_to_delta_FULL.ipynb\"\r\n ```\r\n Tell the operator to import and run `pdf_to_delta_FULL.ipynb`. This processes\r\n all PDFs in the folder.\r\n\r\n- [ ] **Validate final table** — Ask the operator to confirm:\r\n - Delta table `<TABLE_NAME>` appears in the Tables section of the lakehouse\r\n - Row count matches the number of PDFs uploaded\r\n - Spot-check a few rows for data quality\r\n\r\n## Table Naming\r\n\r\n- Use a descriptive `snake_case` name based on the document type, not the filename\r\n- PDFs are individual records — do not derive table name from filenames\r\n- Ask the operator to confirm the table name before generating any notebook\r\n\r\n## Gotchas\r\n\r\n- **AI features must be enabled on the capacity.** `synapse.ml.aifunc` uses Fabric's\r\n built-in AI endpoint — no Azure OpenAI key needed. 
Prerequisites: (1) paid Fabric\r\n capacity F2 or higher, (2) tenant admin must enable \"Copilot and other features\r\n powered by Azure OpenAI\" in Admin portal → Tenant settings, (3) if capacity is\r\n outside an Azure OpenAI region, also enable the cross-geo processing toggle.\r\n- **Default model is `gpt-4.1-mini`.** If the notebook throws `DeploymentConfigNotFound`,\r\n the `MODEL_DEPLOYMENT_NAME` in the configuration cell doesn't match a model on\r\n the built-in endpoint. Check supported models at\r\n https://learn.microsoft.com/en-us/fabric/data-science/ai-services/ai-services-overview\r\n- `fab cp` requires `./filename` (forward slash) syntax. Absolute Windows paths\r\n (`C:\\...`) cause `[NotSupported]` errors. The generated script uses `Push-Location`\r\n to work around this — do not modify this pattern.\r\n- **Destination folder must exist before uploading.** The script runs `fab mkdir` first.\r\n Running `fab mkdir` on an existing folder is safe.\r\n- `WORKSPACE_NAME` and `LAKEHOUSE_NAME` are case-sensitive.\r\n- The notebook uses `synapse.ml.aifunc` which requires Fabric **runtime 1.3**.\r\n If the operator sees import errors, check runtime version in notebook settings.\r\n- The `%%configure` cell attaches the lakehouse automatically — no manual\r\n attachment needed before clicking Run All.\r\n- AI extraction temperature is set to `0.0` for consistency, but it is still\r\n non-deterministic across different PDF layouts. Always validate with TEST mode first.\r\n- All extracted fields are written as strings. If the operator needs typed columns\r\n (dates, numbers), add a post-processing step after confirming extraction is correct.\r\n- **Column names come from AI extraction.** The delta table column names match\r\n the `name` field in the `FIELDS` JSON array provided during setup. 
These are\r\n `snake_case` names chosen by the operator (e.g., `invoice_number`, `hotel_name`).\r\n They do NOT follow the same `clean_columns()` convention used by the\r\n `csv-to-bronze-delta-tables` skill. Downstream skills (e.g.,\r\n `create-materialised-lakeview-scripts`) must verify actual delta table column\r\n names rather than assuming any naming convention.\r\n- The notebook installs `openai` and `pymupdf4llm` at runtime. The `synapse.ml.aifunc`\r\n package is pre-installed in Fabric Runtime 1.3+.\r\n\r\n## Available Scripts\r\n\r\n- **`scripts/generate_upload_commands.py`** — Scans a local folder for PDFs and\r\n writes a PowerShell script of `fab cp` upload commands.\r\n Run: `python scripts/generate_upload_commands.py --help`\r\n- **`scripts/generate_notebook.py`** — Generates a Fabric-compatible `.ipynb`\r\n notebook with the AI extraction prompt pre-populated from the supplied fields.\r\n Supports `--test-mode` for single-PDF validation runs.\r\n Run: `python scripts/generate_notebook.py --help`\r\n",
|
|
201
227
|
},
|
|
202
228
|
{
|
|
203
229
|
relativePath: "references/notebook-cells-reference.md",
|
|
204
230
|
content: "# Notebook Cells Reference\r\n\r\nReference for the PDF-to-delta notebook structure, sourced from\r\n`NB_ConvertPDFToDelta.ipynb`. Use this when debugging notebook failures or\r\nadapting the generated notebook for more complex extraction requirements.\r\n\r\n## Cell Order\r\n\r\n| # | Type | Purpose |\r\n|---|------|---------|\r\n| 1 | Code (`%%configure`) | Attach default lakehouse for the session |\r\n| 2 | Markdown | Header and mode indicator |\r\n| 3 | Code | pip installs (openai + pymupdf4llm) |\r\n| 4 | Code | Imports |\r\n| 5 | Code | `create_mkd()` — PDF → markdown via pymupdf4llm |\r\n| 6 | Code | `process_pdfs()` — list PDFs, call create_mkd for each |\r\n| 7 | Code | Configuration (`LAKEHOUSE_FILES_FOLDER`, `TABLE_NAME`, `TEST_MODE`) |\r\n| 8 | Code | Load PDFs, display filename/error summary |\r\n| 9 | Code | `EXTRACTION_PROMPT` — JSON template with field descriptions |\r\n| 10 | Code | Run AI extraction via `ai.generate_response()` |\r\n| 11 | Code | `parse_output()` — parse JSON from LLM output, concat to DataFrame |\r\n| 12 | Code | Display results + test mode validation prompt |\r\n| 13 | Code | Write to delta table via `saveAsTable()` |\r\n\r\n## pip Install Versions\r\n\r\nFor Fabric **runtime 1.3+**, `synapse.ml.aifunc` is pre-installed.\r\nOnly `openai` and `pymupdf4llm` need to be installed at runtime:\r\n\r\n```python\r\n%pip install -q openai pymupdf4llm 2>/dev/null\r\n```\r\n\r\n## AI Extraction API\r\n\r\nThe notebook uses `synapse.ml.aifunc` which is Fabric's native AI function\r\nlibrary. It calls the built-in Fabric AI endpoint automatically — no API key\r\nconfiguration needed. 
Requires a paid capacity (F2+) with the Copilot tenant\r\nsetting enabled.\r\n\r\n```python\r\nimport synapse.ml.aifunc as aifunc\r\nfrom synapse.ml.aifunc import Conf\r\n\r\npdf_df[\"output\"] = pdf_df[[\"mkdown_text\"]].ai.generate_response(\r\n EXTRACTION_PROMPT,\r\n conf=Conf(model_deployment_name=\"gpt-4.1-mini\", temperature=0.0, top_p=1.0, concurrency=25)\r\n)\r\n```\r\n\r\nKey notes:\r\n- Input must be a **DataFrame** (double brackets `[[col]]`), not a Series\r\n- `temperature=0.0` + `top_p=1.0` maximises consistency across identical inputs\r\n- `seed` parameter is **not supported** by the OpenAI Responses API — do not use it\r\n- `concurrency=25` parallelises extraction across PDFs (default is 200, tune as needed)\r\n- Default model is `gpt-4.1-mini`. Other options: `gpt-4.1`, `gpt-4o`, `gpt-5`\r\n- Output is a string column containing the raw LLM response\r\n\r\n## Prompt Structure\r\n\r\nThe extraction prompt must contain a JSON template with field names as keys\r\nand extraction hints as values. The LLM returns a JSON object matching this\r\ntemplate. Example:\r\n\r\n```\r\nExtract the following fields from this document and return ONLY a valid JSON object.\r\nNo explanation, no markdown fences, no additional text.\r\n\r\n{\r\n \"invoice_number\": \"invoice number after 'no.' e.g. 3105-0060\",\r\n \"invoice_date\": \"invoice date as YYYY-MM-DD\",\r\n \"total_amount\": \"total amount as a number, no currency symbol\"\r\n}\r\n\r\nDocument:\r\n```\r\n\r\nTips for improving extraction accuracy:\r\n- Be specific: `\"date as YYYY-MM-DD\"` not just `\"date\"`\r\n- Provide examples from real PDFs: `\"e.g. 
3105-0060\"`\r\n- For numeric fields: `\"as a number, no currency symbol\"`\r\n- For optional fields: `\"or null if not present\"`\r\n\r\n## File Path Convention\r\n\r\nPDFs are read via their local mount path inside the Fabric notebook:\r\n```python\r\nlocal_path = \"/lakehouse/default/Files\" + f.path.split(\"Files\")[1]\r\n```\r\n\r\n`mssparkutils.fs.ls(\"Files/<folder>\")` returns OneLake paths; splitting on\r\n`\"Files\"` extracts the suffix which is then appended to the local mount root.\r\n\r\n## Common Failure Modes\r\n\r\n| Symptom | Likely cause | Fix |\r\n|---------|-------------|-----|\r\n| `ModuleNotFoundError: synapse.ml` | Wrong runtime version | Set notebook runtime to 1.3 |\r\n| `AttributeError: 'Series' has no attribute 'ai'` | Single brackets used | Use `df[[\"col\"]]` not `df[\"col\"]` |\r\n| Fields all `null` / empty JSON `{}` | Prompt not matching PDF layout | Review `mkdown_text` column, refine field descriptions |\r\n| `[NotSupported] Source and destination must be of the same type` | fab cp path issue | Use `./filename` with Push-Location (see upload script) |\r\n| Lakehouse not found | %%configure name mismatch | Check exact lakehouse name in Fabric UI |\r\n",
|
|
205
231
|
},
|
|
206
|
-
{
|
|
207
|
-
relativePath: "scripts/__pycache__/generate_notebook.cpython-313.pyc",
|
|
208
|
-
content: "�\r\r\n\u0000\u0000\u0000\u0000d\u0007�i�R\u0000\u0000�\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0011\u0000\u0000\u0000\u0000\u0000\u0000\u0000�\u0000\u0000\u0000�\u0000S\u0000r\u0000S\u0001S\u0002K\u0001r\u0001S\u0001S\u0002K\u0002r\u0002S\u0001S\u0002K\u0003r\u0003S\u0001S\u0002K\u0004r\u0004S\u0012S\u0003\\\u0005S\u0004\\\u0005S\u0005\\\u00064\u0006S\u0006\u001a\u0000j\u0004j\u0001r\u0007S\u0013S\u0007\\\bS\b\\\bS\u0005\\\u00054\u0006S\t\u001a\u0000j\u0004j\u0001r\t\u001e\u0000\u001e\u0000S\u0014S\n\\\u0005S\u000b\\\u0005S\f\\\u0005S\u0007\\\bS\r\\\nS\u000e\\\u0005S\b\\\bS\u0005\\\u00064\u0010S\u000f\u001a\u0000j\u0004j\u0001r\u000bS\u0010\u001a\u0000r\f\\\rS\u0011:X\u0000\u0000a\b\u0000\u0000\\\f\"\u00005\u0000\u0000\u0000\u0000\u0000\u0000\u0000 \u0000g\u0002g\u0002)\u0015a�\u0005\u0000\u0000\nGenerate a Fabric-compatible PySpark notebook (.ipynb) that reads PDF files\nfrom a lakehouse Files section, extracts structured fields using AI, and writes\nthe results to one or two delta tables (header + optional line items).\n\nThe notebook follows the structure of NB_ConvertPDFToDelta.ipynb:\n %%configure -> pip installs -> imports/helpers -> config -> load PDFs ->\n AI prompt -> run extraction -> parse output -> display -> write delta table(s)\n\nSingle-table usage:\n python scripts/generate_notebook.py --lakehouse \"Lh_landon_finance_bronze\" --lakehouse-folder \"Booking PDFs\" --table-name \"booking_invoices\" --fields-json '[{\"name\":\"invoice_number\",\"description\":\"invoice number after no.\"}]' --test-mode --output-notebook \"outputs/my-run/pdf_to_delta_TEST.ipynb\"\n\nTwo-table usage (header + line items):\n python scripts/generate_notebook.py --lakehouse \"Lh_landon_finance_bronze\" --lakehouse-folder \"Booking PDFs\" --table-name \"booking_invoices\" --fields-json '[{\"name\":\"invoice_number\",\"description\":\"invoice number after no.\"}]' --line-items-table-name \"booking_invoice_line_items\" 
--line-items-fields-json '[{\"name\":\"item_date\",\"description\":\"date as YYYY-MM-DD\"},{\"name\":\"description\",\"description\":\"line item description\"},{\"name\":\"charge_gbp\",\"description\":\"charge as float, null if empty\"}]' --test-mode --output-notebook \"outputs/my-run/pdf_to_delta_TEST.ipynb\"\n�\u0000\u0000\u0000\u0000N�\u0006source�\tcell_type�\u0006returnc\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0006\u0000\u0000\u0000\u0003\u0000\u0000\u0000�\u0000\u0000\u0000�\u0000U\u0000R\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00015\u0001\u0000\u0000\u0000\u0000\u0000\u0000n\u0002U\u0002S\u0002S\u0003\u0004\u0000\u0013\u0000V\u0003s\u0002/\u0000s\u0002\u0013\u0000H\u0007\u0000\u0000o3S\u0001-\u0000\u0000\u0000P\u0002M\t\u0000\u0000\u000b\u0000 \u0000s\u0002n\u0003U\u0002S\u0003\u0005\u0000\u0000\u0000/\u0001-\u0000\u0000\u0000n\u0004U\u00010\u0000U\u0004/\u0000S\u0002S\u0004.\u0005n\u0005U\u0001S\u0005:X\u0000\u0000a\u0006\u0000\u0000U\u0005S\u0006\t\u0000U\u0005S\u0007\t\u0000U\u0005$\u0000s\u0002 \u0000s\u0002n\u0003f\u0000)\bz4Build a notebook cell dict from a multi-line string.�\u0001\nN�����)\u0005r\u0004\u0000\u0000\u0000�\bmetadatar\u0003\u0000\u0000\u0000�\u0007outputs�\u000fexecution_count�\bmarkdownr\n\u0000\u0000\u0000r\u000b\u0000\u0000\u0000)\u0001�\u0005split)\u0006r\u0003\u0000\u0000\u0000r\u0004\u0000\u0000\u0000�\u0005lines�\u0004line�\u000bsource_list�\u0001cs\u0006\u0000\u0000\u0000 �MC:\\Users\\rishi\\source\\pdf-to-bronze-delta-tables\\scripts\\generate_notebook.py�\u0004cellr\u0013\u0000\u0000\u0000(\u0000\u0000\u0000sq\u0000\u0000\u0000�\u0000�\f\u0012�L�L�\u0014�\f\u001e�E�+0�\u0013�\"�:�\u00126�:�4�$�;�:�\u00126�%�\u0002�)�\u001b�\u0012D�K�\u0015\u001e�\u0014\u0016�\u0012\u001d�\u0013\u0015�\u001b\u001f�\u000b\u0006\t\u0006�A�\u000e\u0000\b\u0011�J�\u0007\u001e�\f\r�i�L�\f\r�\u000e\u001f�\f 
�\u000b\f�H��\u0017\u0000\u00137s\u0005\u0000\u0000\u0000�\u000eA\u0007\u0004�\u0006fields�\u0011line_items_fieldsc\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\t\u0000\u0000\u0000\u0003\u0000\u0000\u0000�\u0018\u0002\u0000\u0000�\u0000S\u0001/\u0001n\u0002[\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0013\u0000HB\u0000\u0000u\u0002\u0000\u0000p4U\u0003[\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000S\u0002-\n\u0000\u0000:\u0012\u0000\u0000d\u0007\u0000\u0000U\u0001(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0002\u0000\u0000S\u0003O\u0001S\u0004n\u0005U\u0002R\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0005U\u0004S\u0006\u0005\u0000\u0000\u0000\u000e\u0000S\u0007U\u0004S\b\u0005\u0000\u0000\u0000\u000e\u0000S\tU\u0005\u000e\u00003\u00065\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000MD\u0000\u0000\u000b\u0000 \u0000U\u0001(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a�\u0000\u0000U\u0002R\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\n5\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0002R\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u000b5\u0001\u0000\u0000\u0000\u0000\u0000\u0000 
\u0000[\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00015\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0013\u0000H;\u0000\u0000u\u0002\u0000\u0000p4U\u0003[\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00015\u0001\u0000\u0000\u0000\u0000\u0000\u0000S\u0002-\n\u0000\u0000:\u0012\u0000\u0000a\u0002\u0000\u0000S\u0003O\u0001S\u0004n\u0005U\u0002R\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\fU\u0004S\u0006\u0005\u0000\u0000\u0000\u000e\u0000S\u0007U\u0004S\b\u0005\u0000\u0000\u0000\u000e\u0000S\tU\u0005\u000e\u00003\u00065\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000M=\u0000\u0000\u000b\u0000 \u0000U\u0002R\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\r5\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0002R\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u000e5\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0002R\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u000f5\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000S\u0010R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00025\u0001\u0000\u0000\u0000\u0000\u0000\u0000$\u0000)\u0011z;Build the JSON template block for the AI extraction prompt.�\u0001{�\u0001\u0000\u0000\u0000�\u0001,�\u0000z\u0003 \"�\u0004namez\u0004\": \"�\u000bdescription�\u0001\"z\u0011 \"line_items\": [z\u0005 {z\u0007 \"z\u0005 }z\u0003 ]�\u0001}r\u0007\u0000\u0000\u0000)\u0004�\tenumerate�\u0003len�\u0006append�\u0004join)\u0006r\u0014\u0000\u0000\u0000r\u0015\u0000\u0000\u0000r\u000e\u0000\u0000\u0000�\u0001i�\u0001f�\u0005commas\u0006\u0000\u0000\u0000 
r\u0012\u0000\u0000\u0000�\u0011build_prompt_jsonr&\u0000\u0000\u00009\u0000\u0000\u0000s\u0003\u0001\u0000\u0000�\u0000�\r\u0010�E�E�\u0010\u0019�&�\u0010!�\u0004�\u0001�\u0018\u0019�C�\u0006�K�!�O�\u0018+�/@�\u0003�r�\u0005�\b\r�\f�\f�s�1�V�9�+�T�!�M�*:�);�1�U�G�\u0015D�\bE�\u0005\u0000\u0011\"�\u0006\u0000\b\u0019�\b\r�\f�\f�\u0015(�\b)�\b\r�\f�\f�W�\b\u001d�\u0014\u001d�\u001e/�\u00140�D�A�\u001b\u001c�s�#4�\u001f5�\u0001�\u001f9�\u001b9�C�r�E�\f\u0011�L�L�7�1�V�9�+�T�!�M�2B�1C�1�U�G�\u0019L�\fM�\u0005\u0000\u00151�\u0006\u0000\t\u000e�\f�\f�W�\b\u001d�\b\r�\f�\f�U�\b\u001b�\u0004\t�L�L�\u0013�\u0004\u0015�\u000b\u000f�9�9�U�\u000b\u001b�\u0004\u001b�\u0000\u0000\u0000\u0000�\u000elakehouse_name�\u0010lakehouse_folder�\ntable_name�\ttest_mode�\u0015line_items_table_namec\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u000b\u0000\u0000\u0000\u0003\u0000\u0000\u0000�,\u0005\u0000\u0000�\u0000[\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0005=\u0001(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0002\u0000\u0000 \u0000U\u00065\u0001\u0000\u0000\u0000\u0000\u0000\u0000n\u0007U\u0004(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0002\u0000\u0000S\u0001O\u0001S\u0002n\bU\u0004(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0002\u0000\u0000S\u0003O\u0001S\u0004n\t[\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000X7(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0002\u0000\u0000U\u0006O\u0001S\u00005\u0002\u0000\u0000\u0000\u0000\u0000\u0000n\nU\u0007(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\t\u0000\u0000S\u0005U\u0002\u000e\u0000S\u0006U\u0005\u000e\u0000S\u00073\u0005O\u0005S\bU\u0002\u000e\u0000S\t3\u0003n\u000b/\u0000n\fU\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\nU\u0000\u000e\u0000S\u000b3\u00035\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 
\u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\fU\u0001\u000e\u0000S\rU\u000b\u000e\u0000S\u000eU\b\u000e\u0000S\u000f3\u0007S\u0010S\u00119\u00025\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00125\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00135\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00145\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00155\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0007(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\n\u0000\u0000S\u0016U\u0002\u000e\u0000S\u0017U\u0005\u000e\u0000S\u00183\u0005n\rO\u0006S\u0019U\u0002\u000e\u0000S\u001a3\u0003n\rU\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001bU\u0001\u000e\u0000S\u001cU\r\u000e\u0000S\u001dU\t\u000e\u0000S\u001e3\u00075\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 
\u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001f5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S U\n\u000e\u0000S!3\u00035\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\"5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0007(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u001b\u0000\u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S#5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000O\u001aU\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S$5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0007(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u001b\u0000\u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S%5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 
\u0000O\u001aU\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S&5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0007(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u001a\u0000\u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S'5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0007(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u001b\u0000\u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S(5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000O\u001aU\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S)5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0007(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u001a\u0000\u0000U\fR\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S*5\u0001\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000S+S,S-S.S/S0.\u0003S1S.0\u0001S20\u00000\u0001S3.\u0003U\fS4.\u0004$\u0000)5Nu\u0013\u0000\u0000\u0000TEST — 1 PDF onlyu\u0011\u0000\u0000\u0000FULL — all PDFs�\u0004True�\u0005Falsez\u001eWrites **two** delta tables: `z\u0010` (header) and `z\u000f` (line items).z\u0016Writes delta table **`z\u0004`**.z3%%configure\n{\n \"defaultLakehouse\": {\n \"name\": \"z\u0007\"\n }\n}zN## PDF to Bronze Delta Tables\n\nExtracts structured 
fields from PDFs in `Files/z\u000e` using AI. \nz\r\n\n**Mode**: `zL` \nTo switch mode, change `TEST_MODE` in the configuration cell and re-run.r\f\u0000\u0000\u0000)\u0001r\u0004\u0000\u0000\u0000a�\u0001\u0000\u0000%pip install -q pymupdf4llm markitdown[pdf] dateparser deepdiff openai==1.30 > /dev/null 2>&1\n%pip install -q --force-reinstall httpx==0.27.0 > /dev/null 2>&1\n%pip install -q --force-reinstall https://mmlspark.blob.core.windows.net/pip/1.0.9/synapseml_core-1.0.9-py2.py3-none-any.whl > /dev/null 2>&1\n%pip install -q --force-reinstall https://mmlspark.blob.core.windows.net/pip/1.0.10.0-spark3.4-5-a5d50c90-SNAPSHOT/synapseml_internal-1.0.10.0.dev1-py2.py3-none-any.whl > /dev/null 2>&1ziimport re\nimport json\nimport pandas as pd\nfrom notebookutils import fs\nfrom synapse.ml.aifunc import Confz�def create_mkd(path: str):\n \"\"\"Convert a PDF file to markdown text using pymupdf4llm.\"\"\"\n try:\n import pymupdf4llm\n return pymupdf4llm.to_markdown(path), None\n except Exception as e:\n return None, str(e)u�\u0003\u0000\u0000def process_pdfs(files_folder: str, test_mode: bool = False):\n \"\"\"List PDFs from the lakehouse Files section and convert each to markdown.\"\"\"\n files = fs.ls(files_folder)\n pdf_files = [f for f in files if f.name.lower().endswith(\".pdf\")]\n if not pdf_files:\n raise ValueError(f\"No PDF files found in '{files_folder}'. 
\"\n \"Check LAKEHOUSE_FILES_FOLDER and upload step.\")\n if test_mode:\n pdf_files = pdf_files[:1]\n print(f\"TEST MODE: processing 1 PDF — {pdf_files[0].name}\")\n data = []\n for f in pdf_files:\n local_path = \"/lakehouse/default/Files\" + f.path.split(\"Files\")[1]\n mkdown_text, error = create_mkd(local_path)\n if error:\n print(f\"⚠️ Failed to convert {f.name}: {error}\")\n data.append((f.name, local_path, mkdown_text, error))\n return pd.DataFrame(data, columns=[\"filename\", \"file_path\", \"mkdown_text\", \"error\"])z\u001aHEADER_TABLE_NAME = \"z>\" # header delta table\nLINE_ITEMS_TABLE_NAME = \"z\u001c\" # line items delta tablez\u001aTABLE_NAME = \"z!\" # delta table nameu�\u0000\u0000\u0000# ── CONFIGURE ────────────────────────────────────────────────────\nLAKEHOUSE_FILES_FOLDER = \"z)\" # folder under Files/ containing PDFs\nz\u001a\nTEST_MODE = u\u0014\u0001\u0000\u0000 # True = process 1 PDF only\n# ─────────────────────────────────────────────────────────────────────────────z�pdf_df = process_pdfs(f\"Files/{LAKEHOUSE_FILES_FOLDER}\", test_mode=TEST_MODE)\nprint(f\"Loaded {len(pdf_df)} PDF(s) from Files/{LAKEHOUSE_FILES_FOLDER}\")\ndisplay(pdf_df[[\"filename\", \"error\"]])z�# Edit field descriptions below to tune extraction for your PDFs\nEXTRACTION_PROMPT = \"\"\"\nExtract the following fields from this document and return ONLY a valid JSON object.\nNo explanation, no markdown fences, no additional text.\n\nz\u000f\n\nDocument:\n\"\"\"z�pdf_df[\"output\"] = pdf_df[[\"mkdown_text\"]].ai.generate_response(\n EXTRACTION_PROMPT,\n conf=Conf(temperature=0.0, seed=0, max_concurrency=25)\n)\nprint(\"AI extraction complete.\")aV\u0004\u0000\u0000def parse_output(df, json_column):\n \"\"\"Parse LLM JSON output into header_df and line_items_df.\"\"\"\n def parse_row(val):\n if not isinstance(val, str):\n return {}\n try:\n cleaned = val.strip().replace(\"```json\", \"\").replace(\"```\", \"\").strip()\n return json.loads(cleaned)\n except Exception:\n 
return {}\n\n header_rows, all_line_items = [], []\n for val, filename in zip(df[json_column], df[\"filename\"]):\n row = parse_row(val)\n line_items = row.pop(\"line_items\", None) or []\n invoice_number = row.get(\"invoice_number\", \"\")\n header_rows.append({\"source_filename\": filename, **row})\n for item in line_items:\n all_line_items.append({\"source_filename\": filename,\n \"invoice_number\": invoice_number,\n **item})\n\n header_df = pd.DataFrame(header_rows)\n line_items_df = pd.DataFrame(all_line_items) if all_line_items else pd.DataFrame()\n return header_df, line_items_df\n\nheader_df, line_items_df = parse_output(pdf_df, \"output\")a)\u0002\u0000\u0000def parse_output(df, json_column):\n def parse_row(val):\n if not isinstance(val, str):\n return {}\n try:\n cleaned = val.strip().replace(\"```json\", \"\").replace(\"```\", \"\").strip()\n return json.loads(cleaned)\n except Exception:\n return {}\n extracted = [pd.json_normalize(parse_row(v)) for v in df[json_column]]\n result = pd.concat(extracted, ignore_index=True)\n result.insert(0, \"source_filename\", df[\"filename\"].values)\n return result\n\nfinal_df = parse_output(pdf_df, \"output\")u\u001d\u0001\u0000\u0000print(f\"Header rows: {len(header_df)}\")\nif TEST_MODE:\n print(\"✅ TEST MODE — review header row below.\")\n print(\" If all fields look correct, scroll down to check line items too.\")\n print(\" If any field is wrong, update EXTRACTION_PROMPT and re-run.\")\ndisplay(header_df)u4\u0001\u0000\u0000print(f\"Extracted {len(final_df)} row(s) from {len(pdf_df)} PDF(s).\")\nif TEST_MODE:\n print(\"✅ TEST MODE — review the row below.\")\n print(\" If all fields look correct, set TEST_MODE = False and re-run.\")\n print(\" If any field is wrong, update EXTRACTION_PROMPT and re-run.\")\ndisplay(final_df)u�\u0000\u0000\u0000print(f\"Line item rows: {len(line_items_df)}\")\nif TEST_MODE:\n print(\"✅ TEST MODE — review line items below.\")\n print(\" If correct, set TEST_MODE = False and re-run 
for all PDFs.\")\ndisplay(line_items_df)u�\u0000\u0000\u0000spark_df = spark.createDataFrame(header_df.astype(str).fillna(\"\"))\nspark_df.write.format(\"delta\").mode(\"overwrite\").saveAsTable(HEADER_TABLE_NAME)\nprint(f\"✅ Written {len(header_df)} row(s) to delta table: {HEADER_TABLE_NAME}\")u�\u0000\u0000\u0000spark_df = spark.createDataFrame(final_df.astype(str).fillna(\"\"))\nspark_df.write.format(\"delta\").mode(\"overwrite\").saveAsTable(TABLE_NAME)\nprint(f\"✅ Written {len(final_df)} row(s) to delta table: {TABLE_NAME}\")u�\u0001\u0000\u0000if not line_items_df.empty:\n spark_li = spark.createDataFrame(line_items_df.astype(str).fillna(\"\"))\n spark_li.write.format(\"delta\").mode(\"overwrite\").saveAsTable(LINE_ITEMS_TABLE_NAME)\n print(f\"✅ Written {len(line_items_df)} row(s) to delta table: {LINE_ITEMS_TABLE_NAME}\")\nelse:\n print(\"⚠️ No line items extracted — check the line_items field in EXTRACTION_PROMPT.\")�\u0004\u0000\u0000\u0000�\u0005\u0000\u0000\u0000�\u0007PySpark�\u0006python�\u000fsynapse_pyspark)\u0003�\fdisplay_name�\blanguager\u001b\u0000\u0000\u0000r\u001b\u0000\u0000\u0000�\tlakehouse)\u0003�\nkernelspec�\rlanguage_info�\u0007trident)\u0004�\bnbformat�\u000enbformat_minorr\t\u0000\u0000\u0000�\u0005cells)\u0004�\u0004boolr&\u0000\u0000\u0000r!\u0000\u0000\u0000r\u0013\u0000\u0000\u0000)\u000er(\u0000\u0000\u0000r)\u0000\u0000\u0000r*\u0000\u0000\u0000r\u0014\u0000\u0000\u0000r+\u0000\u0000\u0000r,\u0000\u0000\u0000r\u0015\u0000\u0000\u0000�\ntwo_tables�\nmode_label�\rtest_mode_str�\u000bprompt_json�\u000btables_noter=\u0000\u0000\u0000�\ftable_configs\u000e\u0000\u0000\u0000 
r\u0012\u0000\u0000\u0000�\u000ebuild_notebookrE\u0000\u0000\u0000K\u0000\u0000\u0000s?\u0003\u0000\u0000�\u0000�\u0012\u0000\u0012\u0016�\u0016+�\u0016A�0A�\u0011B�J�*3�\u0011&�9L�J�\u001e'�F�W�M�\u0012#�F�\u001a�,=�QU�\u0012V�K�\b\u0000\f\u0016�\u0005\u0000\u000b)�\u001a�\f�\u0000\u00015\f�\f!�\u000b\"�/�\u0003\u0001\t3�\u0006\u0000\u0010&�j�\\�\u0014�\r6�\t\u0000\u0005\u0010�\u000e\u0000\r\u000f�E�\u0006\u0000\u0005\n�L�L�\u0014�\u0002\u0003\t\u0018�\u0006\u0000\u0019'�\u0017'�\u0000\u0002(\f�\u0007\u0005\t\f�\u0003\u0007\u0012\u0006�\u0000\u0007\u0005\u0007�\u0014\u0000\u0005\n�L�L�\u0014�\u0002\u0002\u000b:�:J�9K�?�\u000b\u0016�-�\u0000\u0002\u0018\u0016�\u0016 �\\�\u0000\u0001\"S\u0001�\u000b\u0006\tT\u0001�\u000e\u0000\u0013\u001d�\u0011\t\u0012\u0006�\u0000\t\u0005\u0007�\u001a\u0000\u0005\n�L�L�\u0014�\u0002\u0003\tD\u0003�\u0003\u0005\u0012\u0006�\u0000\u0005\u0005\u0007�\u0010\u0000\u0005\n�L�L�\u0014�\u0002\u0004\t-�\u0003\u0006\u0012\u0006�\u0000\u0006\u0005\u0007�\u0012\u0000\u0005\n�L�L�\u0014�\u0002\u0006\t&�\u0003\b\u0012\u0006�\u0000\b\u0005\u0007�\u0016\u0000\u0005\n�L�L�\u0014�\u0002\u0011\tc\u0001�\u0003\u0013\u0012\u0006�\u0000\u0013\u0005\u0007�,\u0000\b\u0012�\u000e(�\u001a�\f�\u0000\u00015)�)>�(?�?[�\u0003\u0001\r]\u0001�\u0003\u0000\t\u0015�\n\u0000\u001a4�J�<�?`�\u0017a�\f�\u0004\t�L�L�\u0014�\u0002\u0001\t%�%5�$6�6`�\u000b\u0017�.�\u0000\u0001\u0019$�$1�?�\u0000\u00013[\u0007�\u0007\u0004\t[\u0007�\u0003\u0006\u0012\u0006�\u0000\u0006\u0005\u0007�\u0012\u0000\u0005\n�L�L�\u0014�\u0002\u0002\t1�\u0003\u0004\u0012\u0006�\u0000\u0004\u0005\u0007�\u000e\u0000\u0005\n�L�L�\u0014�\u0002\u0004\t\r�\n\u0000\f\u0017�-�\u0000\u0003\u0018\u000e�\u000b\b\t\u000e�\u0003\n\u0012\u0006�\u0000\n\u0005\u0007�\u001a\u0000\u0005\n�L�L�\u0014�\u0002\u0004\t+�\u0003\u0006\u0012\u0006�\u0000\u0006\u0005\u0007�\u0012\u0000\b\u0012�\b\r�\f�\f�T�\u0002\u001a\rH\u0001�\u0003\u001c\u0016\n�\u0000\u001c\t\u000b�<\u0000\t\u000e�\f�\f�T�\u0002\u000e\r8�\u0003\u0010\u0
016\n�\u0000\u0010\t\u000b�&\u0000\b\u0012�\b\r�\f�\f�T�\u0002\u0005\r!�\u0003\u0007\u0016\n�\u0000\u0007\t\u000b�\u0012\u0000\t\u000e�\f�\f�T�\u0002\u0005\r �\u0003\u0007\u0016\n�\u0000\u0007\t\u000b�\u0014\u0000\b\u0012�\b\r�\f�\f�T�\u0002\u0004\r%�\u0003\u0006\u0016\n�\u0000\u0006\t\u000b�\u0012\u0000\b\u0012�\b\r�\f�\f�T�\u0002\u0002\rc\u0001�\u0003\u0004\u0016\n�\u0000\u0004\t\u000b�\f\u0000\t\u000e�\f�\f�T�\u0002\u0002\r[\u0001�\u0003\u0004\u0016\n�\u0000\u0004\t\u000b�\u000e\u0000\b\u0012�\b\r�\f�\f�T�\u0002\u0005\rx\u0001�\u0003\u0007\u0016\n�\u0000\u0007\t\u000b�\u0014\u0000\u0015\u0016�\u001a\u001b�\u0006\u0000!*�\u001c$�\u0018)�\u0007\u0004\u001b\u000e�\n\u0000\u001f%�h�\u001d/�\u0018#�R�\u0017(�\u000f\b\u0015\n�\u0012\u0000\u0012\u0017�\u0019\r\f\u0006�\u0000\r\u0005\u0006r'\u0000\u0000\u0000c\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\n\u0000\u0000\u0000\u0003\u0000\u0000\u0000��\u0005\u0000\u0000�\u0000[\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\"\u0000S\u0001[\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0004\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0002S\u00039\u0003n\u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0004S\u0005S\u0006S\u00079\u0003 \u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\bS\u0005S\tS\u00079\u0003 \u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\nS\u0005S\u000bS\u00079\u0003 \u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\fS\u0005S\rS\u00079\u0003 
\u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u000eS\u0000S\u000fS\u00109\u0003 \u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0011S\u0000S\u0012S\u00109\u0003 \u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0013S\u0014S\u0015S\u00169\u0003 \u0000U\u0000R\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0017S\u0005S\u0018S\u00079\u0003 \u0000U\u0000R\t\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00005\u0000\u0000\u0000\u0000\u0000\u0000\u0000n\u0001S\u0019\u001a\u0000n\u0002U\u0002\"\u0000U\u0001R\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\f5\u0002\u0000\u0000\u0000\u0000\u0000\u0000n\u0003U\u0001R\f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0013\u0000\u0000U\u0002\"\u0000U\u0001R\f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00115\u0002\u0000\u0000\u0000\u0000\u0000\u0000O\u0001S\u0000n\u0004[\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0001R\u0010\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000[\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00045\u0001\u0000\u0000\u0000\u0000\u0000\u0000:w\u0000\u0000a.\u0000\u0000[\u0013\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001a[\u0014\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0016\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\
u0000\u0000S\u001b9\u0002 \u0000[\u0014\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0018\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\"\u0000S\u001c5\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000[\u001b\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0001R\u001c\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0001R\u001e\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0001R \u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0003U\u0001R\"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0001R\u0010\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0004S\u001d9\u0007n\u0005[$\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R&\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R)\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0001R*\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000n\u0006[$\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R,\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\"\u0000[$\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R&\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R/\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00065\u0001\u0000\u0000\u0000\u0000\u0000\u0000S\u0005S\u001e9\u0002 \u0000[1\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0006S\u001fS 
S!9\u0003\u0002\u0000n\u0007[2\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R4\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\"\u0000XWS\"S#S$9\u0004 \u0000S\u0000S\u0000S\u00005\u0002\u0000\u0000\u0000\u0000\u0000\u0000 \u0000[\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u0001R\u0010\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000n\bU\u0001R\"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0002\u0000\u0000S%O\u0001S&n\t[\u0013\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S'U\u0006\u000e\u00003\u0002[\u0014\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0016\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001b9\u0002 \u0000[\u0013\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S(U\t\u000e\u0000S)U\b(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0002\u0000\u0000S*O\u0001S+\u000e\u00003\u0004[\u0014\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0016\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001b9\u0002 \u0000[\u0013\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S,[7\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00035\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u000e\u00003\u0002U\u0004(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u000e\u0000\u0000S-[7\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000U\u00045\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u000e\u00003\u0002O\u0001S.-\u0000\u0000\u0000[\u0014\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0016\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001b9\u0002 
\u0000[\u0013\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S/[\u0014\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0016\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001b9\u0002 \u0000[\u0013\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S0[\u0014\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0016\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u001b9\u0002 \u0000g\u0000!\u0000,\u0000(\u0000\u0000\u0000\u0000\u0000\u0000\u0000d\u0001\u0000\u0000f\u0002 \u0000\u001f\u0000 \u0000 \u0000N�=\u0003\u001f\u0000f\u0001)1Nz�Generate a Fabric-compatible PySpark notebook (.ipynb) that extracts structured fields from PDFs using AI and writes one or two delta tables.aP\u0001\u0000\u0000Single-table example:\n python scripts/generate_notebook.py \\\n --lakehouse \"Lh_landon_finance_bronze\" \\\n --lakehouse-folder \"Booking PDFs\" \\\n --table-name \"booking_invoices\" \\\n --fields-json '[{\"name\":\"invoice_number\",\"description\":\"...\"}]' \\\n --test-mode \\\n --output-notebook \"outputs/my-run/pdf_to_delta_TEST.ipynb\"\n)\u0003r\u001c\u0000\u0000\u0000�\u000fformatter_class�\u0006epilogz\u000b--lakehouseTz'Name of the bronze lakehouse to attach.)\u0002�\brequired�\u0004helpz\u0012--lakehouse-folderz5Folder under Files/ in the lakehouse containing PDFs.z\f--table-namez&Delta table name for header/main rows.z\r--fields-jsonz8JSON array: [{\"name\": \"...\", \"description\": \"...\"}, ...]z\u0017--line-items-table-namez0(Optional) Delta table name for line items rows.)\u0002�\u0007defaultrJ\u0000\u0000\u0000z\u0018--line-items-fields-jsonz5(Optional) JSON array of line item field definitions.z\u000b--test-mode�\nstore_truez.If set, notebook processes only the first PDF.)\u0002�\u0006actionrJ\u0000\u0000\u0000z\u0011--output-notebookz+Path where the .ipynb file should be 
saved.c\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0007\u0000\u0000\u0000\u0013\u0000\u0000\u0000�\u0001\u0000\u0000�\u0000\u001e\u0000[\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\"\u0000U\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000n\u0002[\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000W\u0002[\u0010\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u00005\u0002\u0000\u0000\u0000\u0000\u0000\u0000(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0017\u0000\u0000[\u0013\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0005\u001a\u0000U\u0002\u0013\u00005\u0000\u0000\u0000\u0000\u0000\u0000\u00005\u0001\u0000\u0000\u0000\u0000\u0000\u0000(\u0000\u0000\u0000\u0000\u0000\u0000\u0000d2\u0000\u0000[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0001U\u0001\u000e\u0000S\u00063\u0003[\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00039\u0002 \u0000[\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\"\u0000S\u00045\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000U\u0002$\u0000!\u0000[\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\u0004\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0007\u0000a>\u0000\u0000n\u0003[\u0007\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u0001U\u0001\u000e\u0000S\u0002U\u0003\u000e\u00003\u0004[\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000S\u00039\u0002 
\u0000[\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000R\f\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\"\u0000S\u00045\u0001\u0000\u0000\u0000\u0000\u0000\u0000 \u0000\u001f\u0000S\u0000n\u0003A\u0003N�S\u0000n\u0003A\u0003f\u0001f\u0000=\u0003\u001f\u0000f\u0001)\u0007Nz\u0007ERROR: z\u0014 is not valid JSON: �\u0001�\u0004filer\u0018\u0000\u0000\u0000c\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0003\u0000\u0000\u00003\u0000\u0000\u0000�D\u0000\u0000\u0000#\u0000 \u0000�\u0000U\u0000\u0013\u0000H\u0016\u0000\u0000n\u0001S\u0000U\u0001;\u0000\u0000\u0000=\u0001(\u0000\u0000\u0000\u0000\u0000\u0000\u0000a\u0005\u0000\u0000 \u0000S\u0001U\u0001;\u0000\u0000\u0000v\u0000�\u0005 \u0000M\u0018\u0000\u0000\u000b\u0000 \u0000g\u00027\u0003f\u0001)\u0003r\u001b\u0000\u0000\u0000r\u001c\u0000\u0000\u0000N�\u0000)\u0002�\u0002.0r$\u0000\u0000\u0000s\u0002\u0000\u0000\u0000 r\u0012\u0000\u0000\u0000�\t<genexpr>�-main.<locals>.parse_fields.<locals>.<genexpr>{\u0001\u0000\u0000s$\u0000\u0000\u0000�\u0000�\u0000�\u0000\u00023\n�8>�1�F�a�K�\f.�M�Q�\u001c.�\f.�\u0006�s\u0004\u0000\u0000\u0000�\u001e \u0001zA must be a JSON array of {\"name\":..., \"description\":...} objects.)\n�\u0004json�\u0005loads�\u000fJSONDecodeError�\u0005print�\u0003sys�\u0006stderr�\u0004exit�\nisinstance�\u0004list�\u0003all)\u0004�\u0003raw�\u0005labelr\u0014\u0000\u0000\u0000�\u0001es\u0004\u0000\u0000\u0000 
r\u0012\u0000\u0000\u0000�\fparse_fields�\u001amain.<locals>.parse_fieldsu\u0001\u0000\u0000s�\u0000\u0000\u0000�\u0000�\u0002\u0004\t\u0018�\u0015\u0019�Z�Z�\u0003�_�F�\b\u0000\u0010\u001a�&�$�\u000f'�\u000f'�s�\u0000\u00023\n�8>�\u0003\u00023\n�\u0000\u00020\n�\u0000\u00020\n�\u0006\u0000\r\u0012�G�E�7�\"e�\u0012f�\u0017\u001a�z�z�\u0003\u0001\r#�\f\u000f�H�H�Q�K�\u000f\u0015�\r��\u0013\u0000\u0010\u0014�\u000f#�\u000f#�\u0000\u0002\t\u0018�\f\u0011�G�E�7�\"6�q�c�\u0012:�\u0013�\u001a�\u001a�\fL�\f\u000f�H�H�Q�K�K��\u0005\u0002\t\u0018�s\u0017\u0000\u0000\u0000�\u0016A8\u0000�8\u0014C\n\u0003�\f4C\u0005\u0003�\u0005\u0005C\n\u0003z[ERROR: --line-items-table-name and --line-items-fields-json must both be provided together.rO\u0000\u0000\u0000r\u0018\u0000\u0000\u0000)\u0007r(\u0000\u0000\u0000r)\u0000\u0000\u0000r*\u0000\u0000\u0000r\u0014\u0000\u0000\u0000r+\u0000\u0000\u0000r,\u0000\u0000\u0000r\u0015\u0000\u0000\u0000)\u0001�\bexist_ok�\u0001wz\u0005utf-8)\u0001�\bencoding�\u0002\u0000\u0000\u0000F)\u0002�\u0006indent�\fensure_asciiz\fTEST (1 PDF)z\u000fFULL (all PDFs)z\u0015Notebook written to: z\u0006Mode: z\u000b | Tables: z\u00172 (header + line items)�\u00011z\u000fHeader fields: z\u0015 | Line item fields: r\u001a\u0000\u0000\u0000zSImport into Fabric: Workspace -> New -> Import notebook -> select this .ipynb file.zJLakehouse is attached automatically via %%configure -- just click Run All.)\u001c�\bargparse�\u000eArgumentParser�\u001bRawDescriptionHelpFormatter�\fadd_argument�\nparse_args�\u000bfields_json�\u0016line_items_fields_jsonr>\u0000\u0000\u0000r,\u0000\u0000\u0000rY\u0000\u0000\u0000rZ\u0000\u0000\u0000r[\u0000\u0000\u0000r\\\u0000\u0000\u0000rE\u0000\u0000\u0000r7\u0000\u0000\u0000r)\u0000\u0000\u0000r*\u0000\u0000\u0000r+\u0000\u0000\u0000�\u0002os�\u0004path�\u0007abspath�\u000foutput_notebook�\bmakedirs�\u0007dirname�\u0004openrV\u0000\u0000\u0000�\u0004dumpr 
\u0000\u0000\u0000)\n�\u0006parser�\u0004argsrc\u0000\u0000\u0000r\u0014\u0000\u0000\u0000r\u0015\u0000\u0000\u0000�\bnotebook�\bout_pathr$\u0000\u0000\u0000r?\u0000\u0000\u0000r@\u0000\u0000\u0000s\n\u0000\u0000\u0000 r\u0012\u0000\u0000\u0000�\u0004mainr\u0000\u0000\u0000Q\u0001\u0000\u0000s�\u0002\u0000\u0000�\u0000�\r\u0015�\r$�\r$�\u0004\u0001\rW\u0001�\u0006\u0000\u0019!�\u0018<�\u0018<�\u0004\u0007\rO\u0001�\u000f\u0010\u000e\u0006�F�\"\u0000\u0005\u000b�\u0004\u0017�\u0004\u0017�\r�\u0004�\u001dF�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005H\u0001�\u0004\n�\u0004\u0017�\u0004\u0017�\u0018,�t�\u001dT�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005V\u0001�\u0004\n�\u0004\u0017�\u0004\u0017�\u000e�\u0014�\u001dE�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005G\u0001�\u0004\n�\u0004\u0017�\u0004\u0017�\u000f�$�\u001dW�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005Y\u0001�\u0004\n�\u0004\u0017�\u0004\u0017�\u00181�4�\u001dO�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005Q\u0001�\u0004\n�\u0004\u0017�\u0004\u0017�\u00182�D�\u001dT�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005V\u0001�\u0004\n�\u0004\u0017�\u0004\u0017�\r�l�\u001dM�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005O\u0001�\u0004\n�\u0004\u0017�\u0004\u0017�\u0018+�d�\u001dJ�\u0003\u0000\u0005\u0018�\u0000\u0001\u0005L\u0001�\u000b\u0011�\u000b\u001c�\u000b\u001c�\u000b\u001e�D�\u0004\f\u0005\u0016�\u001c\u0000\u000e\u001a�$�\u001a*�\u001a*�O�\r<�F�\u0006\u0000\f\u0010�\u000b&�\u000b&�\u0003\u0000\t\u0015�T�\u00150�\u00150�2L�\bM�,0�\u0005\u0000\u0005\u0016�\n\u0000\b\f�D�\f&�\f&�\u0007'�4�0A�+B�\u0007B�\b\r�\u000ek�\u0013\u0016�:�:�\u0003\u0001\t\u001f�\b\u000b�\b�\b�\u0011�\u000b�\u000f\u001d�\u0017\u001b�~�~�\u0019\u001d�\u0019.�\u0019.�\u0013\u0017�?�?�\u000f\u0015�\u0012\u0016�.�.�\u001e\"�\u001e8�\u001e8�\u001a+�\u000f\b\u0010\u0006�H�\u0014\u0000\u0010\u0012�w�w���t�\u001f3�\u001f3�\u000f4�H�\u0004\u0006�K�K�\u0002�\u0007�\u0007�\u000f�\u000f�\b�\u0010)�D�\u00049�\t\r�h�\u0003�g�\t.�!�\b\f�\t�\t�(�a�e�\b<�\u0003\u0000
\n/�\u0006\u0000\u0012\u0016�d�\u00160�\u00160�\u00111�J�#'�>�>�\u001e�7H�J�\u0004\t�\f!�(�\u001a�\n,�3�:�:�\u0004>�\u0004\t�F�:�,�k�z�*C�WZ�)[�\n\\�cf�cm�cm�\u0004n�\u0004\t�O�C�\u0006�K�=�\n)�ar�/D�S�IZ�E[�D\\�-]�xz�\n{�\u0000\u0000C\u0002F\u0002�\u0000\u0000C\u0002M\u0002�\u0000\u0000C\u0002M\u0002�\u0000\u0000\u0005N\u0002�\u0004\t�\n_�fi�fp�fp�\u0004q�\u0004\t�\nV�]`�]g�]g�\u0004h�\u0013\u0000\n/�\t.�s\f\u0000\u0000\u0000�0\u0017K(\u0003�(\nK6\u0007�\b__main__)\u0001�\u0004code)\u0001N)\u0002NN)\u000e�\u0007__doc__rl\u0000\u0000\u0000rV\u0000\u0000\u0000rs\u0000\u0000\u0000rZ\u0000\u0000\u0000�\u0003str�\u0004dictr\u0013\u0000\u0000\u0000r^\u0000\u0000\u0000r&\u0000\u0000\u0000r>\u0000\u0000\u0000rE\u0000\u0000\u0000r\u0000\u0000\u0000�\b__name__rR\u0000\u0000\u0000r'\u0000\u0000\u0000r\u0012\u0000\u0000\u0000�\b<module>r�\u0000\u0000\u0000\u0001\u0000\u0000\u0000s�\u0000\u0000\u0000�\u0003\u0001\u0001\u0001�\n\u001c\u0001\u0004�:\u0000\u0001\u0010�\u0000\u000b�\u0000\t�\u0000\n�\u0006\u000e\u0001\r�\u0013�\u0000\u000e\u0001\r�\u0013�\u0000\u000e\u0001\r�$�\u0000\u000e\u0001\r�\"\u000f\u0001\u001c�d�\u0000\u000f\u0001\u001c�t�\u0000\u000f\u0001\u001c�s�\u0000\u000f\u0001\u001c�0\u0000\"&�\u001e\"�\u000fC\u0004\u0001\u0006�\u0014\u0017�\u0003C\u0004\u0001\u0006�\u0016\u0019�\u0005C\u0004\u0001\u0006�\u0006\u0000\u0011\u0014�\u0007C\u0004\u0001\u0006�\b\u0000\r\u0011�\tC\u0004\u0001\u0006�\n\u0000\u0010\u0014�\u000bC\u0004\u0001\u0006�\f\u0000\u001c\u001f�\rC\u0004\u0001\u0006�\u000e\u0000\u0018\u001c�\u000fC\u0004\u0001\u0006�\u0010\u0000\u0006\n�\u0011C\u0004\u0001\u0006�L\bR\u0001\u0001i\u0001�j\u0002\u0000\u0004\f�z�\u0003\u0019�\u0004\b�F�\u0003\u0000\u0004\u001ar'\u0000\u0000\u0000",
|
|
209
|
-
},
|
|
210
232
|
{
|
|
211
233
|
relativePath: "scripts/generate_notebook.py",
|
|
212
234
|
content: "# /// script\r\n# requires-python = \">=3.8\"\r\n# dependencies = []\r\n# ///\r\n\"\"\"\r\nGenerate a Fabric-compatible PySpark notebook (.ipynb) that reads PDF files\r\nfrom a lakehouse Files section, extracts structured fields using AI, and writes\r\nthe results to one or two delta tables (header + optional line items).\r\n\r\nThe notebook follows the structure of NB_ConvertPDFToDelta.ipynb:\r\n %%configure -> pip installs -> imports/helpers -> config -> load PDFs ->\r\n AI prompt -> run extraction -> parse output -> display -> write delta table(s)\r\n\r\nSingle-table usage:\r\n python scripts/generate_notebook.py \\\r\n --lakehouse \"Lh_landon_finance_bronze\" \\\r\n --lakehouse-folder \"Booking PDFs\" \\\r\n --table-name \"booking_invoices\" \\\r\n --fields-json '[{\"name\":\"invoice_number\",\"description\":\"invoice number after no.\"}]' \\\r\n --test-mode \\\r\n --output-notebook \"outputs/my-run/pdf_to_delta_TEST.ipynb\"\r\n\r\nTwo-table usage (header + line items):\r\n python scripts/generate_notebook.py \\\r\n --lakehouse \"Lh_landon_finance_bronze\" \\\r\n --lakehouse-folder \"Booking PDFs\" \\\r\n --table-name \"booking_invoices\" \\\r\n --fields-json '[{\"name\":\"invoice_number\",\"description\":\"invoice number after no.\"}]' \\\r\n --line-items-table-name \"booking_invoice_line_items\" \\\r\n --line-items-fields-json '[{\"name\":\"item_date\",\"description\":\"date as YYYY-MM-DD\"},{\"name\":\"description\",\"description\":\"line item description\"},{\"name\":\"charge_gbp\",\"description\":\"charge as float, null if empty\"}]' \\\r\n --test-mode \\\r\n --output-notebook \"outputs/my-run/pdf_to_delta_TEST.ipynb\"\r\n\"\"\"\r\nimport argparse\r\nimport json\r\nimport os\r\nimport sys\r\n\r\n\r\ndef cell(source: str, cell_type: str = \"code\") -> dict:\r\n \"\"\"Build a notebook cell dict from a multi-line string.\"\"\"\r\n lines = source.split(\"\\n\")\r\n source_list = [line + \"\\n\" for line in lines[:-1]] + [lines[-1]]\r\n c = {\r\n 
\"cell_type\": cell_type,\r\n \"metadata\": {},\r\n \"source\": source_list,\r\n \"outputs\": [],\r\n \"execution_count\": None,\r\n }\r\n if cell_type == \"markdown\":\r\n del c[\"outputs\"]\r\n del c[\"execution_count\"]\r\n return c\r\n\r\n\r\ndef build_prompt_json(fields: list, line_items_fields: list = None) -> str:\r\n \"\"\"Build the JSON template block for the AI extraction prompt.\"\"\"\r\n lines = [\"{\"]\r\n for i, f in enumerate(fields):\r\n comma = \",\" if (i < len(fields) - 1 or line_items_fields) else \"\"\r\n lines.append(f' \"{f[\"name\"]}\": \"{f[\"description\"]}\"{comma}')\r\n if line_items_fields:\r\n lines.append(' \"line_items\": [')\r\n lines.append(\" {\")\r\n for i, f in enumerate(line_items_fields):\r\n comma = \",\" if i < len(line_items_fields) - 1 else \"\"\r\n lines.append(f' \"{f[\"name\"]}\": \"{f[\"description\"]}\"{comma}')\r\n lines.append(\" }\")\r\n lines.append(\" ]\")\r\n lines.append(\"}\")\r\n return \"\\n\".join(lines)\r\n\r\n\r\ndef build_notebook(\r\n lakehouse_name: str,\r\n lakehouse_folder: str,\r\n table_name: str,\r\n fields: list,\r\n test_mode: bool,\r\n line_items_table_name: str = None,\r\n line_items_fields: list = None,\r\n) -> dict:\r\n two_tables = bool(line_items_table_name and line_items_fields)\r\n mode_label = \"TEST — 1 PDF only\" if test_mode else \"FULL — all PDFs\"\r\n test_mode_str = \"True\" if test_mode else \"False\"\r\n prompt_json = build_prompt_json(fields, line_items_fields if two_tables else None)\r\n tables_note = (\r\n f\"Writes **two** delta tables: `{table_name}` (header) and \"\r\n f\"`{line_items_table_name}` (line items).\"\r\n if two_tables\r\n else f\"Writes delta table **`{table_name}`**.\"\r\n )\r\n\r\n cells = []\r\n\r\n # ── Cell 1: manual lakehouse attachment instructions ─────────────────────\r\n cells.append(cell(\r\n f'## ⚠️ Before Running: Setup Steps Required\\n'\r\n f'\\n'\r\n f'### Step 1 — Attach the Lakehouse\\n'\r\n f'1. 
In the left panel, click **Add data items** (database icon)\\n'\r\n f'2. Click **Add lakehouse** → **Existing lakehouse**\\n'\r\n f'3. Choose **{lakehouse_name}** → **Confirm**\\n'\r\n f'\\n'\r\n f'### Step 2 — AI Features\\n'\r\n f'`synapse.ml.aifunc` uses Fabric\\'s built-in AI — no Azure OpenAI key or workspace\\n'\r\n f'settings change needed. It works automatically on capacities with AI/Copilot features\\n'\r\n f'enabled (F64+ or a trial with AI enabled).\\n'\r\n f'\\n'\r\n f'If you see `AuthenticationError: Authentication failed for all authenticators`,\\n'\r\n f'the workspace is on a capacity without AI features. Move the notebook to a workspace\\n'\r\n f'on an AI-enabled capacity and re-run.',\r\n cell_type=\"markdown\",\r\n ))\r\n\r\n # ── Cell 2: markdown header ──────────────────────────────────────────────\r\n cells.append(cell(\r\n f'## PDF to Bronze Delta Tables\\n'\r\n f'\\n'\r\n f'Extracts structured fields from PDFs in `Files/{lakehouse_folder}` using AI. \\n'\r\n f'{tables_note}\\n'\r\n f'\\n'\r\n f'**Mode**: `{mode_label}` \\n'\r\n f'To switch mode, change `TEST_MODE` in the configuration cell and re-run.',\r\n cell_type=\"markdown\",\r\n ))\r\n # ── Cell 3: pip installs ─────────────────────────────────────────────────\r\n # Per https://learn.microsoft.com/en-us/fabric/data-science/ai-functions/overview\r\n # Pandas on PySpark runtime requires openai package. 
pymupdf4llm for PDF→markdown.\r\n cells.append(cell(\r\n '%pip install -q openai pymupdf4llm 2>/dev/null'\r\n ))\r\n\r\n # ── Cell 4: imports ──────────────────────────────────────────────────────\r\n cells.append(cell(\r\n 'import re\\n'\r\n 'import json\\n'\r\n 'import pandas as pd\\n'\r\n 'import synapse.ml.aifunc as aifunc\\n'\r\n 'from synapse.ml.aifunc import Conf\\n'\r\n 'from notebookutils import fs'\r\n ))\r\n\r\n # ── Cell 5: helper — PDF to markdown ────────────────────────────────────\r\n cells.append(cell(\r\n 'def create_mkd(path: str):\\n'\r\n ' \"\"\"Convert a PDF file to markdown text using pymupdf4llm.\"\"\"\\n'\r\n ' try:\\n'\r\n ' import pymupdf4llm\\n'\r\n ' return pymupdf4llm.to_markdown(path), None\\n'\r\n ' except Exception as e:\\n'\r\n ' return None, str(e)'\r\n ))\r\n\r\n # ── Cell 6: helper — load all PDFs from lakehouse ────────────────────────\r\n cells.append(cell(\r\n 'def process_pdfs(files_folder: str, test_mode: bool = False):\\n'\r\n ' \"\"\"List PDFs from the lakehouse Files section and convert each to markdown.\"\"\"\\n'\r\n ' files = fs.ls(files_folder)\\n'\r\n ' pdf_files = [f for f in files if f.name.lower().endswith(\".pdf\")]\\n'\r\n ' if not pdf_files:\\n'\r\n ' raise ValueError(f\"No PDF files found in \\'{files_folder}\\'. 
\"\\n'\r\n ' \"Check LAKEHOUSE_FILES_FOLDER and upload step.\")\\n'\r\n ' if test_mode:\\n'\r\n ' pdf_files = pdf_files[:1]\\n'\r\n ' print(f\"TEST MODE: processing 1 PDF \\u2014 {pdf_files[0].name}\")\\n'\r\n ' data = []\\n'\r\n ' for f in pdf_files:\\n'\r\n ' local_path = \"/lakehouse/default/Files\" + f.path.split(\"Files\")[1]\\n'\r\n ' mkdown_text, error = create_mkd(local_path)\\n'\r\n ' if error:\\n'\r\n ' print(f\"\\u26a0\\ufe0f Failed to convert {f.name}: {error}\")\\n'\r\n ' data.append((f.name, local_path, mkdown_text, error))\\n'\r\n ' return pd.DataFrame(data, columns=[\"filename\", \"file_path\", \"mkdown_text\", \"error\"])'\r\n ))\r\n\r\n # ── Cell 7: configuration ────────────────────────────────────────────────\r\n if two_tables:\r\n table_config = (\r\n f'HEADER_TABLE_NAME = \"{table_name}\" # header delta table\\n'\r\n f'LINE_ITEMS_TABLE_NAME = \"{line_items_table_name}\" # line items delta table'\r\n )\r\n else:\r\n table_config = f'TABLE_NAME = \"{table_name}\" # delta table name'\r\n\r\n cells.append(cell(\r\n '# \\u2500\\u2500 CONFIGURE \\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\n'\r\n f'LAKEHOUSE_FILES_FOLDER = \"{lakehouse_folder}\" # folder under Files/ containing PDFs\\n'\r\n f'{table_config}\\n'\r\n f'TEST_MODE = {test_mode_str} # True = process 1 PDF only\\n'\r\n '# AI model deployment — must match a model supported by the Fabric built-in AI endpoint.\\n'\r\n '# Default: \"gpt-4.1-mini\". 
Other options: \"gpt-4.1\", \"gpt-4o\", \"gpt-5\"\\n'\r\n 'MODEL_DEPLOYMENT_NAME = \"gpt-4.1-mini\" # update if DeploymentNotFound error\\n'\r\n '# \\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500\\u2500'\r\n ))\r\n\r\n # ── Cell 8: load PDFs and convert to markdown ────────────────────────────\r\n cells.append(cell(\r\n 'pdf_df = process_pdfs(f\"Files/{LAKEHOUSE_FILES_FOLDER}\", test_mode=TEST_MODE)\\n'\r\n 'print(f\"Loaded {len(pdf_df)} PDF(s) from Files/{LAKEHOUSE_FILES_FOLDER}\")\\n'\r\n 'display(pdf_df[[\"filename\", \"error\"]])'\r\n ))\r\n\r\n # ── Cell 9: AI extraction prompt ─────────────────────────────────────────\r\n cells.append(cell(\r\n '# Edit field descriptions below to tune extraction for your PDFs\\n'\r\n 'EXTRACTION_PROMPT = \"\"\"\\n'\r\n 'Extract the following fields from this document and return ONLY a valid JSON object.\\n'\r\n 'No explanation, no markdown fences, no additional text.\\n'\r\n '\\n'\r\n f'{prompt_json}\\n'\r\n '\\n'\r\n 'Document:\\n'\r\n '\"\"\"'\r\n ))\r\n\r\n # ── Cell 10: run AI extraction ───────────────────────────────────────────\r\n cells.append(cell(\r\n 'pdf_df[\"output\"] = pdf_df[[\"mkdown_text\"]].ai.generate_response(\\n'\r\n ' EXTRACTION_PROMPT,\\n'\r\n ' conf=Conf(model_deployment_name=MODEL_DEPLOYMENT_NAME, temperature=0.0, top_p=1.0, concurrency=25)\\n'\r\n ')\\n'\r\n 'print(\"AI extraction complete.\")'\r\n ))\r\n\r\n # ── Cell 11: parse output ────────────────────────────────────────────────\r\n if two_tables:\r\n 
cells.append(cell(\r\n 'def parse_output(df, json_column):\\n'\r\n ' \"\"\"Parse LLM JSON output into header_df and line_items_df.\"\"\"\\n'\r\n ' def parse_row(val):\\n'\r\n ' if not isinstance(val, str):\\n'\r\n ' return {}\\n'\r\n ' try:\\n'\r\n ' cleaned = val.strip().replace(\"```json\", \"\").replace(\"```\", \"\").strip()\\n'\r\n ' return json.loads(cleaned)\\n'\r\n ' except Exception:\\n'\r\n ' return {}\\n'\r\n '\\n'\r\n ' header_rows, all_line_items = [], []\\n'\r\n ' for val, filename in zip(df[json_column], df[\"filename\"]):\\n'\r\n ' row = parse_row(val)\\n'\r\n ' line_items = row.pop(\"line_items\", None) or []\\n'\r\n ' invoice_number = row.get(\"invoice_number\", \"\")\\n'\r\n ' header_rows.append({\"source_filename\": filename, **row})\\n'\r\n ' for item in line_items:\\n'\r\n ' all_line_items.append({\"source_filename\": filename,\\n'\r\n ' \"invoice_number\": invoice_number,\\n'\r\n ' **item})\\n'\r\n '\\n'\r\n ' header_df = pd.DataFrame(header_rows)\\n'\r\n ' line_items_df = pd.DataFrame(all_line_items) if all_line_items else pd.DataFrame()\\n'\r\n ' return header_df, line_items_df\\n'\r\n '\\n'\r\n 'header_df, line_items_df = parse_output(pdf_df, \"output\")'\r\n ))\r\n else:\r\n cells.append(cell(\r\n 'def parse_output(df, json_column):\\n'\r\n ' def parse_row(val):\\n'\r\n ' if not isinstance(val, str):\\n'\r\n ' return {}\\n'\r\n ' try:\\n'\r\n ' cleaned = val.strip().replace(\"```json\", \"\").replace(\"```\", \"\").strip()\\n'\r\n ' return json.loads(cleaned)\\n'\r\n ' except Exception:\\n'\r\n ' return {}\\n'\r\n ' extracted = [pd.json_normalize(parse_row(v)) for v in df[json_column]]\\n'\r\n ' result = pd.concat(extracted, ignore_index=True)\\n'\r\n ' result.insert(0, \"source_filename\", df[\"filename\"].values)\\n'\r\n ' return result\\n'\r\n '\\n'\r\n 'final_df = parse_output(pdf_df, \"output\")'\r\n ))\r\n\r\n # ── Cell 12: display header results ─────────────────────────────────────\r\n if two_tables:\r\n cells.append(cell(\r\n 
'print(f\"Header rows: {len(header_df)}\")\\n'\r\n 'if TEST_MODE:\\n'\r\n ' print(\"\\u2705 TEST MODE \\u2014 review header row below.\")\\n'\r\n ' print(\" If all fields look correct, scroll down to check line items too.\")\\n'\r\n ' print(\" If any field is wrong, update EXTRACTION_PROMPT and re-run.\")\\n'\r\n 'display(header_df)'\r\n ))\r\n else:\r\n cells.append(cell(\r\n 'print(f\"Extracted {len(final_df)} row(s) from {len(pdf_df)} PDF(s).\")\\n'\r\n 'if TEST_MODE:\\n'\r\n ' print(\"\\u2705 TEST MODE \\u2014 review the row below.\")\\n'\r\n ' print(\" If all fields look correct, set TEST_MODE = False and re-run.\")\\n'\r\n ' print(\" If any field is wrong, update EXTRACTION_PROMPT and re-run.\")\\n'\r\n 'display(final_df)'\r\n ))\r\n\r\n # ── Cell 13: display line items (two-table mode only) ───────────────────\r\n if two_tables:\r\n cells.append(cell(\r\n 'print(f\"Line item rows: {len(line_items_df)}\")\\n'\r\n 'if TEST_MODE:\\n'\r\n ' print(\"\\u2705 TEST MODE \\u2014 review line items below.\")\\n'\r\n ' print(\" If correct, set TEST_MODE = False and re-run for all PDFs.\")\\n'\r\n 'display(line_items_df)'\r\n ))\r\n\r\n # ── Cell 14: write header table ──────────────────────────────────────────\r\n if two_tables:\r\n cells.append(cell(\r\n 'spark_df = spark.createDataFrame(header_df.astype(str).fillna(\"\"))\\n'\r\n 'spark_df.write.format(\"delta\").mode(\"overwrite\").saveAsTable(HEADER_TABLE_NAME)\\n'\r\n 'print(f\"\\u2705 Written {len(header_df)} row(s) to delta table: {HEADER_TABLE_NAME}\")'\r\n ))\r\n else:\r\n cells.append(cell(\r\n 'spark_df = spark.createDataFrame(final_df.astype(str).fillna(\"\"))\\n'\r\n 'spark_df.write.format(\"delta\").mode(\"overwrite\").saveAsTable(TABLE_NAME)\\n'\r\n 'print(f\"\\u2705 Written {len(final_df)} row(s) to delta table: {TABLE_NAME}\")'\r\n ))\r\n\r\n # ── Cell 15: write line items table (two-table mode only) ────────────────\r\n if two_tables:\r\n cells.append(cell(\r\n 'if not line_items_df.empty:\\n'\r\n ' 
spark_li = spark.createDataFrame(line_items_df.astype(str).fillna(\"\"))\\n'\r\n ' spark_li.write.format(\"delta\").mode(\"overwrite\").saveAsTable(LINE_ITEMS_TABLE_NAME)\\n'\r\n ' print(f\"\\u2705 Written {len(line_items_df)} row(s) to delta table: {LINE_ITEMS_TABLE_NAME}\")\\n'\r\n 'else:\\n'\r\n ' print(\"\\u26a0\\ufe0f No line items extracted \\u2014 check the line_items field in EXTRACTION_PROMPT.\")'\r\n ))\r\n\r\n return {\r\n \"nbformat\": 4,\r\n \"nbformat_minor\": 5,\r\n \"metadata\": {\r\n \"kernelspec\": {\r\n \"display_name\": \"synapse_pyspark\",\r\n \"language\": \"python\",\r\n \"name\": \"synapse_pyspark\",\r\n },\r\n \"language_info\": {\"name\": \"python\"},\r\n \"trident\": {\"lakehouse\": {}},\r\n },\r\n \"cells\": cells,\r\n }\r\n\r\n\r\ndef main():\r\n parser = argparse.ArgumentParser(\r\n description=(\r\n \"Generate a Fabric-compatible PySpark notebook (.ipynb) that extracts \"\r\n \"structured fields from PDFs using AI and writes one or two delta tables.\"\r\n ),\r\n formatter_class=argparse.RawDescriptionHelpFormatter,\r\n epilog=(\r\n \"Single-table example:\\n\"\r\n \" python scripts/generate_notebook.py \\\\\\n\"\r\n ' --lakehouse \"Lh_landon_finance_bronze\" \\\\\\n'\r\n ' --lakehouse-folder \"Booking PDFs\" \\\\\\n'\r\n ' --table-name \"booking_invoices\" \\\\\\n'\r\n \" --fields-json '[{\\\"name\\\":\\\"invoice_number\\\",\\\"description\\\":\\\"...\\\"}]' \\\\\\n\"\r\n \" --test-mode \\\\\\n\"\r\n ' --output-notebook \"outputs/my-run/pdf_to_delta_TEST.ipynb\"\\n'\r\n ),\r\n )\r\n parser.add_argument(\"--lakehouse\", required=True,\r\n help=\"Name of the bronze lakehouse to attach.\")\r\n parser.add_argument(\"--lakehouse-folder\", required=True,\r\n help=\"Folder under Files/ in the lakehouse containing PDFs.\")\r\n parser.add_argument(\"--table-name\", required=True,\r\n help=\"Delta table name for header/main rows.\")\r\n parser.add_argument(\"--fields-json\", required=True,\r\n help='JSON array: [{\"name\": \"...\", 
\"description\": \"...\"}, ...]')\r\n parser.add_argument(\"--line-items-table-name\", default=None,\r\n help=\"(Optional) Delta table name for line items rows.\")\r\n parser.add_argument(\"--line-items-fields-json\", default=None,\r\n help=\"(Optional) JSON array of line item field definitions.\")\r\n parser.add_argument(\"--test-mode\", action=\"store_true\",\r\n help=\"If set, notebook processes only the first PDF.\")\r\n parser.add_argument(\"--output-notebook\", required=True,\r\n help=\"Path where the .ipynb file should be saved.\")\r\n args = parser.parse_args()\r\n\r\n def parse_fields(raw, label):\r\n try:\r\n fields = json.loads(raw)\r\n except json.JSONDecodeError as e:\r\n print(f\"ERROR: {label} is not valid JSON: {e}\", file=sys.stderr)\r\n sys.exit(1)\r\n if not isinstance(fields, list) or not all(\r\n \"name\" in f and \"description\" in f for f in fields\r\n ):\r\n print(f'ERROR: {label} must be a JSON array of {{\"name\":..., \"description\":...}} objects.',\r\n file=sys.stderr)\r\n sys.exit(1)\r\n return fields\r\n\r\n fields = parse_fields(args.fields_json, \"--fields-json\")\r\n line_items_fields = (\r\n parse_fields(args.line_items_fields_json, \"--line-items-fields-json\")\r\n if args.line_items_fields_json else None\r\n )\r\n\r\n if bool(args.line_items_table_name) != bool(line_items_fields):\r\n print(\"ERROR: --line-items-table-name and --line-items-fields-json must both be provided together.\",\r\n file=sys.stderr)\r\n sys.exit(1)\r\n\r\n notebook = build_notebook(\r\n lakehouse_name=args.lakehouse,\r\n lakehouse_folder=args.lakehouse_folder,\r\n table_name=args.table_name,\r\n fields=fields,\r\n test_mode=args.test_mode,\r\n line_items_table_name=args.line_items_table_name,\r\n line_items_fields=line_items_fields,\r\n )\r\n\r\n out_path = os.path.abspath(args.output_notebook)\r\n os.makedirs(os.path.dirname(out_path), exist_ok=True)\r\n with open(out_path, \"w\", encoding=\"utf-8\") as f:\r\n json.dump(notebook, f, indent=2, 
ensure_ascii=False)\r\n\r\n two_tables = bool(args.line_items_table_name)\r\n mode_label = \"TEST (1 PDF)\" if args.test_mode else \"FULL (all PDFs)\"\r\n print(f\"Notebook written to: {out_path}\", file=sys.stderr)\r\n print(f\"Mode: {mode_label} | Tables: {'2 (header + line items)' if two_tables else '1'}\", file=sys.stderr)\r\n print(f\"Header fields: {len(fields)}\" + (f\" | Line item fields: {len(line_items_fields)}\" if line_items_fields else \"\"), file=sys.stderr)\r\n print(\"Import into Fabric: Workspace -> New -> Import notebook -> select this .ipynb file.\", file=sys.stderr)\r\n print(\"Lakehouse is attached automatically via %%configure -- just click Run All.\", file=sys.stderr)\r\n\r\n\r\nif __name__ == \"__main__\":\r\n main()\r\n",
|