infotracker-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infotracker-0.1.0/.gitignore +42 -0
- infotracker-0.1.0/PKG-INFO +108 -0
- infotracker-0.1.0/ProjectDescription.md +82 -0
- infotracker-0.1.0/README.md +79 -0
- infotracker-0.1.0/docs/adapters.md +117 -0
- infotracker-0.1.0/docs/agentic_workflow.md +134 -0
- infotracker-0.1.0/docs/algorithm.md +109 -0
- infotracker-0.1.0/docs/architecture.md +35 -0
- infotracker-0.1.0/docs/breaking_changes.md +291 -0
- infotracker-0.1.0/docs/cli_usage.md +133 -0
- infotracker-0.1.0/docs/configuration.md +38 -0
- infotracker-0.1.0/docs/dbt_integration.md +144 -0
- infotracker-0.1.0/docs/edge_cases.md +102 -0
- infotracker-0.1.0/docs/example_dataset.md +128 -0
- infotracker-0.1.0/docs/faq.md +23 -0
- infotracker-0.1.0/docs/lineage_concepts.md +306 -0
- infotracker-0.1.0/docs/openlineage_mapping.md +45 -0
- infotracker-0.1.0/docs/overview.md +123 -0
- infotracker-0.1.0/examples/warehouse/lineage/01_customers.json +25 -0
- infotracker-0.1.0/examples/warehouse/lineage/02_orders.json +25 -0
- infotracker-0.1.0/examples/warehouse/lineage/03_products.json +25 -0
- infotracker-0.1.0/examples/warehouse/lineage/04_order_items.json +26 -0
- infotracker-0.1.0/examples/warehouse/lineage/10_stg_orders.json +43 -0
- infotracker-0.1.0/examples/warehouse/lineage/11_stg_order_items.json +56 -0
- infotracker-0.1.0/examples/warehouse/lineage/12_stg_customers.json +48 -0
- infotracker-0.1.0/examples/warehouse/lineage/20_vw_recent_orders.json +38 -0
- infotracker-0.1.0/examples/warehouse/lineage/30_dim_customer.json +43 -0
- infotracker-0.1.0/examples/warehouse/lineage/31_dim_product.json +43 -0
- infotracker-0.1.0/examples/warehouse/lineage/40_fct_sales.json +59 -0
- infotracker-0.1.0/examples/warehouse/lineage/41_agg_sales_by_day.json +39 -0
- infotracker-0.1.0/examples/warehouse/lineage/50_vw_orders_all.json +25 -0
- infotracker-0.1.0/examples/warehouse/lineage/51_vw_orders_all_enriched.json +26 -0
- infotracker-0.1.0/examples/warehouse/lineage/52_vw_order_details_star.json +31 -0
- infotracker-0.1.0/examples/warehouse/lineage/53_vw_products_all.json +25 -0
- infotracker-0.1.0/examples/warehouse/lineage/54_vw_recent_orders_star_cte.json +25 -0
- infotracker-0.1.0/examples/warehouse/lineage/55_vw_orders_shipped_or_delivered.json +25 -0
- infotracker-0.1.0/examples/warehouse/lineage/56_vw_orders_union_star.json +40 -0
- infotracker-0.1.0/examples/warehouse/lineage/60_vw_customer_order_analysis.json +149 -0
- infotracker-0.1.0/examples/warehouse/lineage/90_usp_refresh_sales_with_temp.json +57 -0
- infotracker-0.1.0/examples/warehouse/lineage/91_usp_snapshot_recent_orders_star.json +26 -0
- infotracker-0.1.0/examples/warehouse/lineage/92_usp_rebuild_recent_sales_with_vars.json +52 -0
- infotracker-0.1.0/examples/warehouse/lineage/93_usp_top_products_since_var.json +42 -0
- infotracker-0.1.0/examples/warehouse/sql/01_customers.sql +6 -0
- infotracker-0.1.0/examples/warehouse/sql/02_orders.sql +6 -0
- infotracker-0.1.0/examples/warehouse/sql/03_products.sql +6 -0
- infotracker-0.1.0/examples/warehouse/sql/04_order_items.sql +7 -0
- infotracker-0.1.0/examples/warehouse/sql/10_stg_orders.sql +7 -0
- infotracker-0.1.0/examples/warehouse/sql/11_stg_order_items.sql +9 -0
- infotracker-0.1.0/examples/warehouse/sql/12_stg_customers.sql +8 -0
- infotracker-0.1.0/examples/warehouse/sql/20_vw_recent_orders.sql +14 -0
- infotracker-0.1.0/examples/warehouse/sql/30_dim_customer.sql +7 -0
- infotracker-0.1.0/examples/warehouse/sql/31_dim_product.sql +7 -0
- infotracker-0.1.0/examples/warehouse/sql/40_fct_sales.sql +12 -0
- infotracker-0.1.0/examples/warehouse/sql/41_agg_sales_by_day.sql +9 -0
- infotracker-0.1.0/examples/warehouse/sql/50_vw_orders_all.sql +3 -0
- infotracker-0.1.0/examples/warehouse/sql/51_vw_orders_all_enriched.sql +5 -0
- infotracker-0.1.0/examples/warehouse/sql/52_vw_order_details_star.sql +9 -0
- infotracker-0.1.0/examples/warehouse/sql/53_vw_products_all.sql +3 -0
- infotracker-0.1.0/examples/warehouse/sql/54_vw_recent_orders_star_cte.sql +7 -0
- infotracker-0.1.0/examples/warehouse/sql/55_vw_orders_shipped_or_delivered.sql +4 -0
- infotracker-0.1.0/examples/warehouse/sql/56_vw_orders_union_star.sql +4 -0
- infotracker-0.1.0/examples/warehouse/sql/60_vw_customer_order_analysis.sql +12 -0
- infotracker-0.1.0/examples/warehouse/sql/60_vw_customer_order_ranking.sql +11 -0
- infotracker-0.1.0/examples/warehouse/sql/61_vw_sales_analytics.sql +11 -0
- infotracker-0.1.0/examples/warehouse/sql/90_usp_refresh_sales_with_temp.sql +53 -0
- infotracker-0.1.0/examples/warehouse/sql/91_usp_snapshot_recent_orders_star.sql +23 -0
- infotracker-0.1.0/examples/warehouse/sql/92_usp_rebuild_recent_sales_with_vars.sql +48 -0
- infotracker-0.1.0/examples/warehouse/sql/93_usp_top_products_since_var.sql +41 -0
- infotracker-0.1.0/infotracker.yml +45 -0
- infotracker-0.1.0/pyproject.toml +73 -0
- infotracker-0.1.0/requirements.txt +11 -0
- infotracker-0.1.0/src/infotracker/__init__.py +6 -0
- infotracker-0.1.0/src/infotracker/__main__.py +6 -0
- infotracker-0.1.0/src/infotracker/adapters.py +65 -0
- infotracker-0.1.0/src/infotracker/cli.py +150 -0
- infotracker-0.1.0/src/infotracker/config.py +57 -0
- infotracker-0.1.0/src/infotracker/diff.py +291 -0
- infotracker-0.1.0/src/infotracker/engine.py +340 -0
- infotracker-0.1.0/src/infotracker/lineage.py +122 -0
- infotracker-0.1.0/src/infotracker/models.py +302 -0
- infotracker-0.1.0/src/infotracker/parser.py +807 -0
- infotracker-0.1.0/tests/__init__.py +21 -0
- infotracker-0.1.0/tests/conftest.py +60 -0
- infotracker-0.1.0/tests/test_adapter.py +150 -0
- infotracker-0.1.0/tests/test_expected_outputs.py +229 -0
- infotracker-0.1.0/tests/test_integration.py +148 -0
- infotracker-0.1.0/tests/test_parser.py +145 -0

infotracker-0.1.0/.gitignore
@@ -0,0 +1,42 @@
# Virtual environments
.venv/
infotracker-env/

# Build files and directories
build/
dist/
*.egg-info/

# Python cache
__pycache__/
*.pyc
*.pyo
*.pyd

# System files
.DS_Store
Thumbs.db

# IDE configuration files
.vscode/
.idea/

# Logs
*.log

# Dumps and temporary files
*.tmp
*.temp

# Auto-generated files
docs/_build/

# Ignore Python/Markdown files with 'test' or 'phase' in the name (gitignore uses globs; regex flags like (?i) are not supported)
*test*.py
*phase*.py
*test*.md
*phase*.md
InfoTracker/PHASE2_COMPLETE.md
InfoTracker/PHASE3_PLAN.md
InfoTracker/test_phase2.py
InfoTracker/test_comprehensive.py

infotracker-0.1.0/PKG-INFO
@@ -0,0 +1,108 @@
Metadata-Version: 2.4
Name: InfoTracker
Version: 0.1.0
Summary: Column-level SQL lineage, impact analysis, and breaking-change detection (MS SQL first)
Project-URL: homepage, https://example.com/infotracker
Project-URL: documentation, https://example.com/infotracker/docs
Author: InfoTracker Authors
License: MIT
Keywords: data-lineage,impact-analysis,lineage,mssql,openlineage,sql
Classifier: Environment :: Console
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Database
Classifier: Topic :: Software Development :: Libraries
Requires-Python: <3.13,>=3.10
Requires-Dist: click<9.0.0,>=8.1.3
Requires-Dist: networkx>=3.3
Requires-Dist: packaging>=24.0
Requires-Dist: pydantic>=2.8.2
Requires-Dist: pyyaml>=6.0.1
Requires-Dist: sqlglot>=23.0.0
Requires-Dist: typer[all]==0.12.3
Provides-Extra: dev
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
Requires-Dist: pytest>=7.4.0; extra == 'dev'
Description-Content-Type: text/markdown

### InfoTracker

InfoTracker is a Python CLI that extracts column-level lineage from SQL, runs impact analysis, and detects breaking changes. The first adapter targets MS SQL.

#### For Students
Start with a simple command: `infotracker extract --sql-dir examples/warehouse/sql --out-dir build/lineage`. This analyzes all SQL files in the directory.

#### Setup & Installation
```bash
# Activate virtual environment first (REQUIRED)
source infotracker-env/bin/activate  # or your venv path

# Install dependencies
pip install -e .

# Verify installation
infotracker --help
```

#### Quickstart
```bash
# IMPORTANT: Always run InfoTracker commands in the activated virtual environment

# Extract lineage from all SQL files
infotracker extract --sql-dir examples/warehouse/sql --out-dir build/lineage

# Impact analysis (downstream dependencies)
infotracker impact -s dbo.fct_sales.Revenue+

# Impact analysis (upstream sources)
infotracker impact -s +dbo.Orders.OrderID

# Branch diff for breaking changes
infotracker diff --base main --head feature/x --sql-dir examples/warehouse/sql
```

#### Configuration
InfoTracker follows this configuration precedence:
1. **CLI flags** (highest priority) - override everything
2. **infotracker.yml** config file - project defaults
3. **Built-in defaults** (lowest priority) - fallback values

Create an `infotracker.yml` file in your project root:
```yaml
default_adapter: mssql
sql_dir: examples/warehouse/sql
out_dir: build/lineage
include: ["*.sql"]
exclude: ["*_wip.sql"]
severity_threshold: BREAKING
```

#### Documentation
- `docs/overview.md` — what it is, goals, scope
- `docs/algorithm.md` — how extraction works
- `docs/lineage_concepts.md` — core concepts with visuals
- `docs/cli_usage.md` — commands and options
- `docs/breaking_changes.md` — definition and detection
- `docs/edge_cases.md` — SELECT *, UNION, temp tables, etc.
- `docs/adapters.md` — interface and MSSQL specifics
- `docs/architecture.md` — system and sequence diagrams
- `docs/configuration.md` — configuration reference
- `docs/openlineage_mapping.md` — how outputs map to OpenLineage
- `docs/faq.md` — common questions
- `docs/dbt_integration.md` — how to use with dbt projects

#### Requirements
- Python 3.10+
- Virtual environment (activated)
- Basic SQL knowledge
- Git and shell

#### Troubleshooting
- **Error tracebacks on help commands**: Make sure you're running in an activated virtual environment
- **Command not found**: Activate your virtual environment first
- **Import errors**: Ensure all dependencies are installed with `pip install -e .`

#### License
MIT (or your team's preferred license)

infotracker-0.1.0/ProjectDescription.md
@@ -0,0 +1,82 @@
### InfoTracker — Student Brief

#### Welcome to the Data Dungeon (friendly, no actual monsters)
You are the hero. Your quest: teach a tool to read SQL scrolls and tell true stories about columns. You'll map where data comes from, spot traps (breaking changes), and keep the kingdom's dashboards happy.

- Your gear: a CLI, example SQLs, and clear docs
- Your allies: adapters, lineage graphs, and CI checks
- Your enemies: sneaky `SELECT *`, UNION goblins, and 3 a.m. alerts
- Goal: green checks, clear diffs, and no broken charts

If you get stuck, that's normal. Take a sip of tea, re-read the step, and try a smaller example.

### For Beginners
If you're new to SQL, try this free tutorial: [Khan Academy SQL](https://www.khanacademy.org/computing/computer-programming/sql). It's in simple English and has exercises.

### Action plan (read and build in this order)
1) Understand the goal and scope (1 hour)
- Overview: [docs/overview.md](docs/overview.md)
- What you're building, supported features, and what's out of scope. Keep this open as your north star.

2) Learn column-level lineage basics (1–2 hours)
- Concepts: [docs/lineage_concepts.md](docs/lineage_concepts.md)
- Visual examples showing how each output column maps back to inputs (joins, transforms, aggregations). This informs how your extractor must reason.

3) Explore the example dataset (your training corpus)
- Dataset map: [docs/example_dataset.md](docs/example_dataset.md)
- Where the SQL files live, what each file represents (tables, views, CTEs, procs), and the matching OpenLineage JSON expectations you must reproduce.

4) Implement the algorithm incrementally
- Algorithm: [docs/algorithm.md](docs/algorithm.md)
- Steps: parse → object graph → schema resolution (expand `*` late) → column lineage extraction → impact graph → outputs.
- Aim for correctness on simple files first, then progress to joins and aggregations.

5) Handle edge cases early enough to avoid rewrites
- SELECT-star and star expansion: [docs/edge_cases.md](docs/edge_cases.md)
- Requires object-level lineage first, then star expansion. Also watch UNION ordinals and SELECT INTO schema inference.

6) Decide on architecture and adapter boundaries
- Adapters & extensibility: [docs/adapters.md](docs/adapters.md)
- Define a clear adapter interface and implement MS SQL first (temp tables, variables, SELECT INTO, T-SQL functions). Keep the core engine adapter-agnostic.

7) Wire up the agentic workflow and regression tests
- Agentic workflow: [docs/agentic_workflow.md](docs/agentic_workflow.md)
- Loop the agent on the example corpus until the generated lineage matches the gold JSON. Add CI to auto-run on any SQL/lineage change.

8) Expose the CLI and iterate to parity
- CLI usage: [docs/cli_usage.md](docs/cli_usage.md)
- Implement `extract`, `impact`, and `diff`. The CLI is your acceptance surface; keep behavior stable and well-documented.

9) Implement breaking-change detection and reporting
- Breaking changes: [docs/breaking_changes.md](docs/breaking_changes.md)
- Compare base vs head branches: diff schemas/expressions, classify severity, compute downstream impacts, and emit machine- and human-readable reports.

10) Optional: integrate with dbt
- dbt integration: [docs/dbt_integration.md](docs/dbt_integration.md)

### Milestones (suggested timebox)
- Day 1–2: read docs, install CLI, run extract on examples
- Day 3–5: implement simple lineage (no joins), pass gold files
- Day 6–8: add joins and aggregations, handle star expansion
- Day 9–10: wire warn-only diff in CI, polish docs

### Acceptance checklist
- Lineage matches gold JSONs in `examples/warehouse/lineage`
- Impact queries return correct columns for sample selectors
- Diff runs in CI (warn-only) and shows helpful messages
- Docs updated where needed; examples run without errors

### Quick-start CLI (target behavior)
Simple example: to test one file, run `infotracker extract --sql-dir examples/warehouse/sql/01_customers.sql --out-dir build/lineage`

### Tips (pro-level, easy to follow)
- Start small: one view, then a join, then an aggregate
- Be explicit: avoid `SELECT *` while testing
- Commit often: small steps are easy to undo
- Use the example JSONs as your "gold" truth

### If stuck (quick help)
- Re-read the related doc step (linked above)
- Run the CLI with `--log-level debug` to see more info
- Create a tiny SQL file with just the failing pattern and test that first
- Write down the expected lineage for one column, then match it in code

infotracker-0.1.0/README.md
@@ -0,0 +1,79 @@
### InfoTracker

InfoTracker is a Python CLI that extracts column-level lineage from SQL, runs impact analysis, and detects breaking changes. The first adapter targets MS SQL.

#### For Students
Start with a simple command: `infotracker extract --sql-dir examples/warehouse/sql --out-dir build/lineage`. This analyzes all SQL files in the directory.

#### Setup & Installation
```bash
# Activate virtual environment first (REQUIRED)
source infotracker-env/bin/activate  # or your venv path

# Install dependencies
pip install -e .

# Verify installation
infotracker --help
```

#### Quickstart
```bash
# IMPORTANT: Always run InfoTracker commands in the activated virtual environment

# Extract lineage from all SQL files
infotracker extract --sql-dir examples/warehouse/sql --out-dir build/lineage

# Impact analysis (downstream dependencies)
infotracker impact -s dbo.fct_sales.Revenue+

# Impact analysis (upstream sources)
infotracker impact -s +dbo.Orders.OrderID

# Branch diff for breaking changes
infotracker diff --base main --head feature/x --sql-dir examples/warehouse/sql
```

#### Configuration
InfoTracker follows this configuration precedence:
1. **CLI flags** (highest priority) - override everything
2. **infotracker.yml** config file - project defaults
3. **Built-in defaults** (lowest priority) - fallback values

Create an `infotracker.yml` file in your project root:
```yaml
default_adapter: mssql
sql_dir: examples/warehouse/sql
out_dir: build/lineage
include: ["*.sql"]
exclude: ["*_wip.sql"]
severity_threshold: BREAKING
```
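
As a rough illustration of that precedence, a loader might merge the three layers like this (a minimal sketch; `DEFAULTS` and the helper name are assumptions, not the package's actual internals):

```python
# Sketch: merge built-in defaults < infotracker.yml < CLI flags (highest wins).
from pathlib import Path
import yaml  # pyyaml is already a project dependency

DEFAULTS = {"default_adapter": "mssql", "out_dir": "build/lineage"}  # illustrative

def load_settings(cli_flags: dict, config_path: str = "infotracker.yml") -> dict:
    settings = dict(DEFAULTS)                                    # lowest priority
    cfg = Path(config_path)
    if cfg.exists():
        settings.update(yaml.safe_load(cfg.read_text()) or {})  # project defaults
    settings.update({k: v for k, v in cli_flags.items() if v is not None})
    return settings                                              # CLI flags win
```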

#### Documentation
- `docs/overview.md` — what it is, goals, scope
- `docs/algorithm.md` — how extraction works
- `docs/lineage_concepts.md` — core concepts with visuals
- `docs/cli_usage.md` — commands and options
- `docs/breaking_changes.md` — definition and detection
- `docs/edge_cases.md` — SELECT *, UNION, temp tables, etc.
- `docs/adapters.md` — interface and MSSQL specifics
- `docs/architecture.md` — system and sequence diagrams
- `docs/configuration.md` — configuration reference
- `docs/openlineage_mapping.md` — how outputs map to OpenLineage
- `docs/faq.md` — common questions
- `docs/dbt_integration.md` — how to use with dbt projects

#### Requirements
- Python 3.10+
- Virtual environment (activated)
- Basic SQL knowledge
- Git and shell

#### Troubleshooting
- **Error tracebacks on help commands**: Make sure you're running in an activated virtual environment
- **Command not found**: Activate your virtual environment first
- **Import errors**: Ensure all dependencies are installed with `pip install -e .`

#### License
MIT (or your team's preferred license)

infotracker-0.1.0/docs/adapters.md
@@ -0,0 +1,117 @@
### Adapters and extensibility

#### Forge your adapter (smithing for data heroes)
In the forge of Integration Keep, you'll craft adapters that turn raw SQL into neatly qualified lineage. Sparks may fly; that's normal.

- Materials: `parse`, `qualify`, `resolve`, `to_openlineage`
- Armor enchantments: case-normalization, bracket taming, and dialect charms
- Future artifacts: Snowflake blade, BigQuery bow, Postgres shield

If an imp named "Case Insensitivity" throws a tantrum, feed it brackets: `[like_this]`.

#### Audience & prerequisites
- Audience: engineers implementing or extending dialect adapters (Level 2: after the basics)
- Prerequisites: Python; SQL basics; familiarity with SQLGlot or a similar parser

Define an adapter interface:
- parse(sql) → AST
- qualify(ast) → fully qualified refs (db.schema.object)
- resolve(ast, catalog) → output schema + expressions
- to_openlineage(object) → columnLineage facet

MS SQL adapter (first):
- Use `SQLGlot`/`sqllineage` for parsing/lineage hints
- Handle T-SQL specifics: temp tables, SELECT INTO, variables, functions
- Normalize identifiers (brackets vs quotes), case-insensitivity

Future adapters: Snowflake, BigQuery, Postgres, etc.

### Adapter interface (pseudocode)
```python
from typing import Protocol

class Adapter(Protocol):
    name: str
    dialect: str

    def parse(self, sql: str) -> AST: ...
    def qualify(self, ast: AST, default_db: str | None) -> AST: ...
    def resolve(self, ast: AST, catalog: Catalog) -> tuple[Schema, ColumnLineage]: ...
    def to_openlineage(self, obj_name: str, schema: Schema, lineage: ColumnLineage) -> dict: ...
```

### MS SQL specifics
- Case-insensitive identifiers; bracket quoting `[name]`
- Temp tables (`#t`) live in tempdb; scope to procedure; support SELECT INTO schema inference
- Variables (`@v`) and their use in filters/windows; capture expressions for context
- GETDATE/DATEADD and common built-ins; treat as CONSTANT/ARITHMETIC transformations
- JOINs default to INNER; OUTER joins affect nullability
- Parser: prefer SQLGlot for the AST; use sqllineage as an optional hint only

### Mini example (very small, illustrative)
```python
import sqlglot

class MssqlAdapter(Adapter):
    name = "mssql"
    dialect = "tsql"

    def parse(self, sql: str) -> AST:
        return sqlglot.parse_one(sql, read=self.dialect)

    def qualify(self, ast: AST, default_db: str | None) -> AST:
        # apply name normalization and database/schema defaults
        return qualify_identifiers(ast, default_db)

    def resolve(self, ast: AST, catalog: Catalog) -> tuple[Schema, ColumnLineage]:
        schema = infer_schema(ast, catalog)
        lineage = extract_column_lineage(ast, catalog)
        return schema, lineage

    def to_openlineage(self, obj_name: str, schema: Schema, lineage: ColumnLineage) -> dict:
        return build_openlineage_payload(obj_name, schema, lineage)
```

### How to Test Your Adapter
Create a `test.py`:
```python
# assumes MssqlAdapter (and its helpers) from the mini example are importable
adapter = MssqlAdapter()
ast = adapter.parse("SELECT OrderID FROM dbo.Orders")
print(ast)
```
Run it with `python test.py`.

### Adding a new adapter
1. Implement the interface; configure the SQLGlot dialect
2. Provide normalization rules (case, quoting, name resolution)
3. Add adapter-specific tests using a small example corpus
4. Document limitations and differences

### Adapter testing template
- Create 3 SQL files: simple select, join with alias, aggregation with group by
- Write expected schema (columns, types, nullability)
- Write expected lineage (inputs per output column)
- Run extraction and compare to expected JSON in CI (see the sketch below)
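
A hedged pytest sketch of that comparison (the paths follow this repo's layout; invoking the CLI via `subprocess` is an assumption, not the project's prescribed harness):

```python
# Sketch: regression-test generated lineage against the gold JSON files.
import json
import pathlib
import subprocess

def test_lineage_matches_gold(tmp_path):
    # hypothetical invocation; adjust to however your CI calls the CLI
    subprocess.run(
        ["infotracker", "extract",
         "--sql-dir", "examples/warehouse/sql",
         "--out-dir", str(tmp_path)],
        check=True,
    )
    for gold in sorted(pathlib.Path("examples/warehouse/lineage").glob("*.json")):
        generated = json.loads((tmp_path / gold.name).read_text())
        expected = json.loads(gold.read_text())
        assert generated == expected, f"lineage mismatch for {gold.name}"
```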

### Adapter selection and registry
```python
ADAPTERS: dict[str, Adapter] = {
    "mssql": MssqlAdapter(),
}

def get_adapter(name: str) -> Adapter:
    return ADAPTERS[name]
```

### Catalog handling
- Accept a `catalog.yml` with known schemas for external refs
- Use the catalog to resolve `*`, disambiguate references, and provide types when DDL is missing
- Warn on unknown objects; continue best-effort (a minimal sketch follows)
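
For illustration, a catalog entry and a best-effort lookup could look like this (the `catalog.yml` layout shown is an assumption; the docs don't fix a schema for it):

```python
# Sketch: load a hypothetical catalog.yml and look up schemas best-effort.
import yaml

CATALOG_YML = """
objects:
  dbo.Customers:
    columns:
      - {name: CustomerID, type: int, nullable: false}
      - {name: Region, type: "nvarchar(50)", nullable: true}
"""

CATALOG = yaml.safe_load(CATALOG_YML)["objects"]

def lookup_columns(name: str) -> list[dict] | None:
    obj = CATALOG.get(name)
    if obj is None:
        print(f"warning: unknown object {name}; continuing best-effort")
        return None
    return obj["columns"]
```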

### Common pitfalls
- Case-insensitive matching; normalize but preserve display casing
- Bracket/quoted identifiers: `[Name]` vs `"Name"`
- Temp table scoping and lifetime
- SELECT INTO column ordinals and inferred types
- Variables used in expressions and filters

### See also
- `docs/algorithm.md`
- `docs/cli_usage.md`
- `docs/overview.md`

infotracker-0.1.0/docs/agentic_workflow.md
@@ -0,0 +1,134 @@
### Agentic workflow and regression tests

#### Train your lineage familiar (it learns by fetching JSON)
Summon your agent, toss it SQL scrolls, and reward it when it returns with matching OpenLineage scrolls. Repeat until it purrs (tests pass).

- The loop: cast → compare → tweak → repeat
- The arena: `examples/warehouse/{sql,lineage}`
- Victory condition: exact matches, zero diffs, tests pass (green)

Remember: agents love clear acceptance criteria more than tuna.

#### Audience & prerequisites
- Audience: engineers using agents/CI to iterate on lineage extractors
- Prerequisites: basic SQL; Python; familiarity with CI and diff workflows

### Gold files (recap)
- Gold files = expected JSON lineage in `examples/warehouse/lineage`
- Your extractor must match them exactly (order and content)

### Fixing diffs (common)
- If a column mismatches: compare expressions; check alias qualification and star expansion timing
- If there are extra/missing columns: check join resolution and GROUP BY; ensure inputs are resolved before expansion
- If ordering differs: make outputs and diagnostics deterministic

### Workflow summary
- Prepare the training set: SQL files + expected OpenLineage JSONs
- Loop (Cursor AI/CLI/web agents):
  1) Generate lineage → 2) Compare with expected → 3) Adjust prompts/code → 4) Repeat until pass
- CI: on any change under `examples/warehouse/{sql,lineage}`, run extraction and compare; fail on diffs
- Track coverage and edge cases (SELECT *, temp tables, UNION, variables)

### Setup
- Install the Cursor CLI and authenticate
- Organize the repo with `examples/warehouse/{sql,lineage}` and a `build/` output folder

### Agent loop
1. The prompt template includes: adapter target (MS SQL), acceptance criteria (must match the gold JSON), and allowed libraries (SQLGlot)
2. The agent writes code to `src/` and runs `infotracker extract` on the SQL corpus
3. Compare `build/lineage/*.json` to `examples/warehouse/lineage/*.json` (see the driver sketch after this list)
4. If a diff exists, the agent refines parsing/resolution rules and retries
5. Stop condition: all files match; record a commit checkpoint
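
A minimal driver for one iteration of that loop might look like this (paths follow this repo's layout; calling the CLI via `subprocess` and comparing files byte-for-byte with `filecmp` are assumptions):

```python
# Sketch: one iteration of the agent loop; an empty return means green (stop).
import filecmp
import pathlib
import subprocess

GOLD_DIR = pathlib.Path("examples/warehouse/lineage")
OUT_DIR = pathlib.Path("build/lineage")

def one_iteration() -> list[str]:
    subprocess.run(
        ["infotracker", "extract",
         "--sql-dir", "examples/warehouse/sql",
         "--out-dir", str(OUT_DIR)],
        check=True,
    )
    mismatches = []
    for gold in sorted(GOLD_DIR.glob("*.json")):
        generated = OUT_DIR / gold.name
        if not generated.exists() or not filecmp.cmp(gold, generated, shallow=False):
            mismatches.append(gold.name)  # feed these back to the agent
    return mismatches
```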

### Loop diagram
```mermaid
flowchart LR
  G[Generate lineage] --> C[Compare to gold JSONs]
  C -->|diffs| R[Refine rules/prompts]
  R --> G
  C -->|no diffs| S["Stop (green)"]
```

### Artifacts
- Inputs: SQL corpus under `examples/warehouse/sql`, optional catalog
- Outputs: `build/lineage/*.json`, diff logs, warnings
- CI artifacts: upload generated lineage for review

### Stop criteria
- All gold JSONs match exactly (order and content)
- No warnings if using `--fail-on-warn`

### CI integration
- GitHub Actions (example): on push/PR, run extraction and `git diff --no-index` against the gold lineage; fail on differences
- Cache Python deps and AST caches for speed
- Upload generated `build/lineage/*.json` as CI artifacts for review

### Evaluation metrics
- Exact-match rate across files
- Column coverage (percentage of outputs with lineage)
- Warning/error counts should trend down across iterations

### Updating gold files
- Intentional changes: regenerate lineage and review diffs; update the gold JSON with a PR describing the change

### See also
- `docs/example_dataset.md`
- `docs/algorithm.md`
- `docs/cli_usage.md`
- `docs/dbt_integration.md`

### Modus Operandi: Continuous Improvement with Cursor Agents
To extend the agentic workflow for 24/7/365 improvement of InfoTracker, integrate Cursor's web-based agents (like Background Agents) with GitHub. This builds on the regression testing and CI loops above, enabling automated code suggestions, bug fixes, and PR reviews. See the [Cursor Changelog](https://cursor.com/changelog) for details.

#### Step 1: Set Up Cursor Web Agents for Continuous Improvement
1. **Enable Background Agents:** In Cursor, use Background Agents, which run remotely and can be triggered via the web (e.g., GitHub or Slack).
2. **Integrate with GitHub for 24/7 Operation:** Use GitHub Actions to schedule agent runs, combined with Cursor's API for AI tasks.
   - Create a workflow: `.github/workflows/cursor-improve.yml` to run daily.
   - Use Cursor features like tagging @Cursor in issues for automated suggestions.
3. **Example Workflow for Scheduled Improvements:**
```yaml
name: Cursor AI Improvement
on:
  schedule:
    - cron: '0 0 * * *'  # Daily at midnight
jobs:
  improve:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run Cursor Agent
        env:
          CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}  # Add in GitHub Secrets
        run: |
          # Script to call the Cursor API or simulate an agent for code analysis
          python cursor_improve.py  # Prompt the agent to suggest repo improvements
```
4. **Script (cursor_improve.py):** Use Cursor's API to analyze code and open issues/PRs with suggestions.

#### Step 2: Set Up Cursor Agents for PR Reviews
1. **GitHub PR Integration:** In GitHub, tag @Cursor in PR comments to trigger a Background Agent for reviews and fixes.
   - Example: in a PR, comment "@Cursor review this for bugs" and it will analyze and suggest changes.
2. **Automate with a Workflow:** Trigger on PR events to auto-invoke Cursor.
```yaml
name: Cursor PR Review
on: pull_request
jobs:
  review:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Invoke Cursor Agent
        env:
          GH_TOKEN: ${{ github.token }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Use the GitHub CLI to comment "@Cursor review" on the PR
          gh pr comment "$PR_NUMBER" --body "@Cursor review this PR for improvements"
```
3. **How It Works:** Cursor's Background Agent reads the PR, applies fixes if needed, and pushes commits (per the changelog features).

#### Safeguards to Prevent Breaking Working Code
Constant improvement is great, but we must avoid breaking things that already work. Here's how to build safety into the process:

- **Regression Testing:** Always run tests against the gold standards (e.g., the example JSONs in `examples/warehouse/lineage`). If an agent's suggestion changes outputs, reject it unless you are intentionally updating the gold.
- **CI/CD Pipelines:** Set up automated tests in GitHub Actions to run on every PR or scheduled run. Fail the build if tests break, catching issues early.
- **Human Oversight:** Agents suggest changes; review and approve them manually before merging. Use PR reviews to double-check.
- **Modular Changes:** Limit agent tasks to small, isolated improvements (e.g., one file at a time) to minimize risk.
- **Monitoring and Rollback:** Track metrics like test pass rates. Use Git for easy rollbacks if something breaks.

Integrate these into workflows: add test steps to the example YAML files, and prompt agents with "Suggest improvements without changing existing correct behavior."

infotracker-0.1.0/docs/algorithm.md
@@ -0,0 +1,109 @@
### High-level algorithm

#### Map the labyrinth (bring string, not MINUS)
To escape the SQL maze, you'll build maps: object graphs, schemas, and column lineage webs. One step at a time, no special math needed.

- Scout terrain: parse files → objects → dependencies
- Place torches: resolve schemas, expand stars where safe
- Trace footprints: build a column graph for impact and diff

If you see a star `*`, don't panic—just expand it after you know what's upstream.

Plain: compute inputs first, then dependents (this is a topological sort).

#### Audience & prerequisites
- Audience: data engineers, analytics engineers, platform engineers
- Prerequisites: basic SQL; comfortable with CLI and git; Python 3.10+; AST familiarity helpful but optional (an AST, Abstract Syntax Tree, is a structured breakdown of code).

1. Discover SQL assets and parse to AST (normalize identifiers)
2. Build the object-level dependency graph (views, CTEs, procs/temp tables)
3. Resolve schemas topologically; expand `*` after inputs are known
4. Extract column-level lineage per output column expression
5. Build a bidirectional column graph for impact analysis
6. Detect breaking changes by diffing base vs head graphs/schemas/expressions
7. Output OpenLineage JSON + CLI reports

### Data structures
- ObjectGraph: nodes = objects {name, type, statements}, edges = dependencies
- SchemaRegistry: map object -> [columns {name, type, nullable, ordinal}]
- ColumnGraph: nodes = fully qualified columns, edges = lineage relations (sketched in code below)
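
In Python terms, these shapes might look like the following (a sketch; any field beyond those listed above is an assumption):

```python
# Sketch of the three structures described above.
from dataclasses import dataclass, field

@dataclass
class Column:
    name: str
    type: str
    nullable: bool
    ordinal: int

@dataclass
class ObjectNode:
    name: str                                   # e.g. "dbo.stg_orders"
    type: str                                   # table | view | proc | temp
    depends_on: set[str] = field(default_factory=set)

ObjectGraph = dict[str, ObjectNode]             # object name -> node
SchemaRegistry = dict[str, list[Column]]        # object name -> ordered columns
ColumnGraph = dict[str, set[str]]               # output column -> input columns
```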

### Pseudocode (high-level)
```
files = load_sql(dir)
objects = parse(files)              # AST per object
objGraph = build_object_graph(objects)
order = topo_sort(objGraph)
for obj in order:
    schema_in = schemas_of_inputs(obj)
    schema_out, lineage = resolve(obj.AST, schema_in)
    SchemaRegistry[obj] = schema_out
    ColumnGraph.add(lineage)
```

### Resolve() essentials
- Name resolution: qualify identifiers using input schemas and aliases
- Star expansion: replace `*` with ordered columns from the resolved input
- Expression lineage: walk the AST; collect input column refs per output column
- Type/nullable inference: derive from operations (e.g., CAST types, SUM numeric, CASE nullability = union of branches)
- Join semantics: track how the join type affects column nullability
- Set ops: ensure column counts/types align; union the lineage inputs

### Type/nullable rules (examples)
- `CAST(x AS T)` → type T
- `a + b` → numeric promotion; nullable if a or b is nullable
- `CASE WHEN p THEN x ELSE y` → type = LUB(type(x), type(y)); nullable if either branch is nullable or there is no ELSE
- `SUM(x)` → numeric; nullable unless GROUP BY is present and engine semantics dictate otherwise (two of these rules are sketched below)
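
Two of the rules, written out as code (a deliberately simplified sketch; real inference would also need type-promotion tables):

```python
# Sketch: nullability rules for `a + b` and CASE from the list above.
def add_nullable(a_nullable: bool, b_nullable: bool) -> bool:
    # a + b is nullable if either operand is nullable
    return a_nullable or b_nullable

def case_nullable(branch_nullables: list[bool], has_else: bool) -> bool:
    # CASE is nullable if any branch is nullable, or if ELSE is missing
    return any(branch_nullables) or not has_else
```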

### Error handling and diagnostics
- On unresolved identifiers: record an error with its location; skip column lineage for the affected outputs
- On unsupported syntax: emit a warning; continue best-effort resolution
- Deterministic ordering of outputs and diagnostics for stable diffs
- Don't crash the whole run for one bad file; continue and report

### Performance notes
- Cache parsed ASTs and resolved schemas by file hash
- Short-circuit lineage for unchanged objects between branches

### Impact search (recursive)
- Upstream: walk edges from the selected column to its input columns until sources are reached
- Downstream: walk edges from the selected column to outputs until targets are reached
- Stop conditions: max-depth (if given), visited-set to avoid cycles (see the sketch below)
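
A small sketch of the upstream direction as a breadth-first walk over the column graph (the `dict[str, set[str]]` shape matches the data-structures sketch above; downstream is the same walk over reversed edges):

```python
# Sketch: upstream impact search with max-depth and a visited set.
from collections import deque

def upstream(graph: dict[str, set[str]], start: str,
             max_depth: int | None = None) -> set[str]:
    seen = {start}
    queue = deque([(start, 0)])
    while queue:
        col, depth = queue.popleft()
        if max_depth is not None and depth >= max_depth:
            continue
        for src in graph.get(col, ()):
            if src not in seen:        # visited set avoids cycles
                seen.add(src)
                queue.append((src, depth + 1))
    return seen - {start}
```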

### Flowchart
```mermaid
flowchart TD
  A[Discover SQL files] --> B[Parse to AST]
  B --> C[Build object graph]
  C --> D[Topological order]
  D --> E[Resolve input schemas]
  E --> F[Expand * after inputs known]
  F --> G[Extract column lineage per output]
  G --> H[Build column graph]
  H --> I[Emit OpenLineage + reports]
```

### Star expansion worked example
```sql
CREATE VIEW dbo.vw_orders_all_enriched AS
SELECT o.*, c.Region
FROM dbo.Orders o
JOIN dbo.Customers c ON o.CustomerID = c.CustomerID;
```
- Resolve the `Orders` and `Customers` schemas first
- Expand `o.*` by ordinal from `Orders`; append `Region`
- Track the nullability impact of the JOIN type (a sketch of the expansion step follows)
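
The expansion step itself is mechanical once input schemas are resolved; roughly (a sketch using plain strings instead of real AST nodes):

```python
# Sketch: expand "alias.*" into that alias's columns, in ordinal order.
def expand_star(select_items: list[str],
                schemas: dict[str, list[str]]) -> list[str]:
    out: list[str] = []
    for item in select_items:
        if item.endswith(".*"):
            alias = item[:-2]
            out.extend(f"{alias}.{col}" for col in schemas[alias])
        else:
            out.append(item)
    return out

# expand_star(["o.*", "c.Region"],
#             {"o": ["OrderID", "CustomerID", "OrderDate"]})
# -> ["o.OrderID", "o.CustomerID", "o.OrderDate", "c.Region"]
```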

### Determinism requirements
- Deterministic output column ordering and schema serialization
- Deterministic diagnostics (stable file/line ordering)
- Stable JSON field ordering to make diffs meaningful

### Simple Example Walkthrough
Take this SQL: `SELECT id AS student_id FROM students;`

1. Parse to AST: break it into parts like 'SELECT', 'id', 'AS student_id'.
2. Build the object graph: note the dependency on 'students'.
3. Resolve schemas: the output has 'student_id' from 'students.id'.
4. Extract lineage: student_id comes from students.id (see the SQLGlot snippet below).
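
The same walkthrough, made concrete with SQLGlot (a project dependency); this only illustrates the parse step, with the alias and source table read off the AST:

```python
# Sketch: parse the walkthrough query and inspect the pieces with sqlglot.
import sqlglot
from sqlglot import exp

ast = sqlglot.parse_one("SELECT id AS student_id FROM students", read="tsql")
projection = ast.selects[0]
print(projection.alias)           # student_id
print(projection.this.sql())      # id
print(ast.find(exp.Table).name)   # students
```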