stata-code 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. stata_code-0.3.0/.gitignore +224 -0
  2. stata_code-0.3.0/CHANGELOG.md +205 -0
  3. stata_code-0.3.0/LICENSE +21 -0
  4. stata_code-0.3.0/LICENSE-POLICY.md +125 -0
  5. stata_code-0.3.0/PKG-INFO +389 -0
  6. stata_code-0.3.0/PUBLISHING.md +122 -0
  7. stata_code-0.3.0/README.md +351 -0
  8. stata_code-0.3.0/SCHEMA.md +593 -0
  9. stata_code-0.3.0/docs/design/hard_timeout.md +161 -0
  10. stata_code-0.3.0/examples/01-basic-regression.md +170 -0
  11. stata_code-0.3.0/examples/02-did-card-krueger.md +307 -0
  12. stata_code-0.3.0/examples/03-graphs.md +155 -0
  13. stata_code-0.3.0/examples/04-multi-session.md +191 -0
  14. stata_code-0.3.0/examples/05-large-matrix.md +140 -0
  15. stata_code-0.3.0/examples/README.md +15 -0
  16. stata_code-0.3.0/pyproject.toml +97 -0
  17. stata_code-0.3.0/schema/run_result.schema.json +726 -0
  18. stata_code-0.3.0/scripts/export_schema.py +82 -0
  19. stata_code-0.3.0/stata_code/__init__.py +100 -0
  20. stata_code-0.3.0/stata_code/core/__init__.py +73 -0
  21. stata_code-0.3.0/stata_code/core/_pool.py +808 -0
  22. stata_code-0.3.0/stata_code/core/_refs.py +97 -0
  23. stata_code-0.3.0/stata_code/core/_runtime.py +179 -0
  24. stata_code-0.3.0/stata_code/core/errors.py +447 -0
  25. stata_code-0.3.0/stata_code/core/runner.py +1092 -0
  26. stata_code-0.3.0/stata_code/core/schema.py +317 -0
  27. stata_code-0.3.0/stata_code/kernel/__init__.py +5 -0
  28. stata_code-0.3.0/stata_code/kernel/__main__.py +6 -0
  29. stata_code-0.3.0/stata_code/kernel/kernel.py +331 -0
  30. stata_code-0.3.0/stata_code/mcp/__init__.py +3 -0
  31. stata_code-0.3.0/stata_code/mcp/__main__.py +6 -0
  32. stata_code-0.3.0/stata_code/mcp/server.py +360 -0
  33. stata_code-0.3.0/tests/__init__.py +1 -0
  34. stata_code-0.3.0/tests/fixtures/.gitkeep +0 -0
  35. stata_code-0.3.0/tests/test_cancel.py +133 -0
  36. stata_code-0.3.0/tests/test_errors.py +231 -0
  37. stata_code-0.3.0/tests/test_kernel.py +267 -0
  38. stata_code-0.3.0/tests/test_mcp.py +249 -0
  39. stata_code-0.3.0/tests/test_pool.py +365 -0
  40. stata_code-0.3.0/tests/test_runner.py +777 -0
  41. stata_code-0.3.0/tests/test_schema.py +561 -0
  42. stata_code-0.3.0/tests/test_schema_artifact.py +61 -0
@@ -0,0 +1,224 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
219
+
220
+ # Stata-specific
221
+ *.gph
222
+ *.smcl
223
+ *.dta
224
+ !tests/fixtures/*.dta
@@ -0,0 +1,205 @@
1
+ # Changelog
2
+
3
+ All notable changes to `stata-code` are documented here. The format follows
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); the result schema
5
+ is versioned with semver-style `major.minor` numbers (see `SCHEMA.md` §6).
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.3.0] — 2026-05-07
10
+
11
+ ### Changed
12
+
13
+ - **PyPI distribution renamed to `stata-code`.** Previously published as
14
+ `stata_code`. Install with `pip install stata-code` going forward; the
15
+ Python import name remains `stata_code` (Python identifier rules — same
16
+ pattern as `scikit-learn` → `import sklearn`). Existing installs made with
17
+ `pip install stata_code` will keep working until that PyPI project
18
+ stops receiving new versions, but users should migrate.
19
+ - **Project URLs in `pyproject.toml` corrected** to
20
+ `github.com/brycewang-stanford/stata-code` (the actual repository
21
+ URL — the previous metadata had `stata_code`).
22
+ - **MCP server announces itself as `stata-code`** (was `stata_code`).
23
+ This is the protocol-level server name; tool ids
24
+ (`stata_run`, `get_log`, etc.) are unchanged.
25
+ - **VSCode extension display name unified to `stata-code`** in the
26
+ Marketplace, activity-bar tile, command-palette `category`, output
27
+ channel, all toast messages, and webview title. Code identifiers
28
+ (`stataCode.*` command / view / setting ids; npm `name`
29
+ `stata-code-vscode`) are unchanged so existing keybindings keep
30
+ working.
31
+ - **Version aligned across surfaces.** `pyproject.toml`,
32
+ `stata_code/__init__.py`, `stata_code/mcp/server.py`,
33
+ `vscode/package.json`, and the VSCode MCP-client handshake all
34
+ declare `0.3.0`.
35
+
36
+ ### Added
37
+
38
+ - **VSCode extension v0.3 — full UI surface** (`vscode/`). Beyond the
39
+ v0.1 "run from command palette" scaffold, the extension now ships
40
+ every common GUI affordance, so users who don't drive Stata through
41
+ Claude Code / Cursor can still operate the same MCP server from the
42
+ editor:
43
+ - **Editor title-bar ▶ button** (`editor/title/run` menu) and
44
+ editor right-click menu entries (`Run Selection` / `Run Active File`).
45
+ - **Status bar item** showing the current session; click for a
46
+ QuickPick (`Switch session…` / `Cancel` / `Reset`). The icon
47
+ swaps to a spinner during runs and the run progress notification
48
+ now has a Cancel button (cooperative cancellation through the
49
+ MCP `cancel_session` tool).
50
+ - **Activity-bar sidebar** with four views: live `Sessions` (with
51
+ inline Cancel/Reset/Close per item — `main` is non-closable;
52
+ locally-known but not-yet-started sessions persist via
53
+ `workspaceState`), `Last Result` (collapsible
54
+ `r()` / `e()` / warnings / dataset / log / graphs), `Graphs`
55
+ history (click-to-open + per-item Save…), and `Logs`
56
+ history (click-to-open + per-item Save…). Section-header buttons
57
+ for Clear (logs / graphs) and New / Refresh (sessions).
58
+ - **Inline error decorations.** Failed runs now publish a
59
+ `DiagnosticCollection` entry on the failing file/line, complete
60
+ with the typed error message, failing snippet, and any
61
+ suggestions surfaced in `runResult.error.suggestions`. Hover
62
+ shows the full text; the Problems panel lists the entry under
63
+ `source: stata-code, code: <error.kind>`.
64
+ - **Code-lens "Run Cell" support.** Lines starting with `* %%`
65
+ get an inline `▶ Run Cell` lens; clicking submits the code
66
+ between markers. Cell ranges map back to the original file
67
+ lines so error squigglies still anchor correctly.
68
+ - **Graph webview action buttons.** The webview now uses a strict
69
+ nonce-based CSP and exposes `Save as…`, `Open externally`, and
70
+ `Refresh` per-graph and panel-level buttons. PNG/SVG/PDF bytes
71
+ still flow lazily through `get_graph(ref)`.
72
+ - Bumped the extension version to `0.3.0`.
73
+
74
+ - **Matrix size cap + `get_matrix(ref)`.** Matrices larger than
75
+ `MATRIX_INLINE_CELL_CAP` (default 10,000 cells) now drop their
76
+ `values` from the envelope and surface a `matrix://<request_id>/<r|e>/
77
+ <name>` ref instead. Callers fetch the values via `get_matrix(ref)`,
78
+ which mirrors the existing `get_log` / `get_graph` pattern. The MCP
79
+ server gains a seventh tool, `get_matrix`, returning JSON
80
+ `{rows, cols, values}`. Closes the last open §3.4 todo from
81
+ SCHEMA.md and prevents pathological commands (e.g., `correlate` over
82
+ hundreds of variables) from blowing up the result envelope.
83
+
84
+ - **VSCode extension scaffold** (`vscode/`). TypeScript extension that
85
+ spawns `stata-code-mcp` over stdio and registers four commands
86
+ (`Run Selection`, `Run Active File`, `Show Graphs`, `Show Last
87
+ Result`). Hand-rolled TypeScript types in
88
+ `vscode/src/types/runResult.ts` mirror the Pydantic envelope;
89
+ `npm run gen-types` regenerates a full copy from
90
+ `schema/run_result.schema.json` for cross-checking. Source-only —
91
+ build with `npm install && npm run compile`.
92
+
93
+ - **VSCode graph webview** (`vscode/src/graphPanel.ts`). Successful
94
+ runs that capture graphs auto-open a side-by-side webview that
95
+ renders PNG / SVG / PDF inline. The webview lazily fetches each
96
+ graph's bytes via `get_graph(ref)` rather than embedding them in
97
+ the original `RunResult`, so token economy is preserved end-to-end
98
+ (an agent driving the same MCP server pays nothing extra for
99
+ inlining). Strict CSP (`default-src 'none'`, no scripts).
100
+ Marketplace publishing still deferred.
101
+
102
+ - **`stata_required` pytest marker.** Integration tests against a
103
+ real Stata installation are now tagged with the marker; CI runs
104
+ `pytest -m "not stata_required"`, completing in ~1.5s instead of
105
+ ~19s. When run locally without Stata, the same tests still skip cleanly.
106
+
107
+ - **Cooperative cancellation.** New `cancel(session_id)` /
108
+ `clear_cancel(session_id)` / `is_cancel_pending(session_id)` Python
109
+ API plus the MCP `cancel_session` tool (eighth tool). A pending
110
+ cancel short-circuits the next `execute()` call for that session
111
+ and returns a `RunResult` with `ok=false`, `rc=-3` (synthetic),
112
+ `error.kind="cancelled"`. The flag is one-shot per cancel, isolated
113
+ per session, and thread-safe. Note: this is *cooperative* — it does
114
+ not interrupt code that is currently mid-`stata.run()` (pystata is
115
+ in-process and has no clean cancel primitive). Hard interruption
116
+ remains deferred to the subprocess-based runtime planned for v0.3+.
117
+
118
+ ### Changed
119
+
120
+ - **MCP server tool count is now 8** (added `get_matrix`,
121
+ `cancel_session`).
122
+
123
+ ## [0.2.0] — 2026-05-07
124
+
125
+ The first release that actually ships an end-to-end Stata pipeline. The v1.0
126
+ result schema is the load-bearing artifact; everything below is implemented
127
+ against it and end-to-end-tested on Stata 18 MP.
128
+
129
+ ### Added
130
+
131
+ - **`SCHEMA.md` v1.0** — normative result-envelope contract: `ok` / `rc`,
132
+ typed `error` (32 `kind` values), structured `r()` / `e()` (scalars,
133
+ macros, matrices), `dataset` snapshot with variable list, log
134
+ head+tail+ref, graph refs with PNG/SVG/PDF support, multi-session id,
135
+ forward-compat clauses.
136
+ - **`stata_code.run()`** (= `execute()`) — the real-Stata pipeline. Uses
137
+ pystata in-process; collects native-typed return values via `sfi`;
138
+ builds a `RunResult` end to end.
139
+ - **`get_log` / `get_graph` / `list_sessions` / `reset_session`** —
140
+ auxiliary tools per `SCHEMA.md` §5.
141
+ - **MCP server** (`stata_code.mcp.server`) — six tools: `stata_run`,
142
+ `stata_info`, `get_log`, `get_graph`, `list_sessions`,
143
+ `reset_session`. Console script: `stata-code-mcp`. Module entry:
144
+ `python -m stata_code.mcp`.
145
+ - **Jupyter kernel** (`stata_code.kernel`) rewired to the v1.0 pipeline.
146
+ Defaults tuned for notebooks (`include_full_log=True`,
147
+ `include_graphs="inline"`). Console script: `stata-code-kernel`.
148
+ Module entry: `python -m stata_code.kernel`.
149
+ - **Multi-session via Stata frames**. `session_id="main"` maps to the
150
+ default frame; other ids create/route to same-named frames.
151
+ - **Per-line error attribution** — `error.line`, `commands_executed`,
152
+ and `context.{before, failing, after}` are populated by parsing
153
+ pystata's multi-line transcript.
154
+ - **Warning extraction** — five built-in patterns
155
+ (`omitted_collinear`, `convergence`, `singular`, `boundary`, generic
156
+ `note`) + dedup.
157
+ - **Graph capture pipeline** — `graph dir` snapshot delta + `graph
158
+ display` + `graph export`; PNG `width`/`height` parsed from IHDR;
159
+ bytes stored under a `_refs` LRU.
160
+ - **`_refs` LRU eviction** — bounded ref store (default 256 entries)
161
+ to keep long-running MCP processes from growing unboundedly.
162
+ - **`LICENSE-POLICY.md`** — clean-room policy that forbids opening
163
+ AGPL/GPL Stata project source.
164
+ - **138 tests** covering schema, runner integration, MCP, kernel,
165
+ `_refs`, and error helpers. Real-Stata tests run against Stata 18 MP
166
+ when available.
167
+
168
+ ### Changed
169
+
170
+ - Top-level `stata_code.run()` now returns the new `RunResult` (Pydantic
171
+ v2). The legacy `StataResult` dataclass and the `capture_graphs`/
172
+ `capture_log`/`timeout` keyword arguments are gone.
173
+ - Wheel build now ships **all** of `stata_code` (`core`, `mcp`,
174
+ `kernel`). Previously the wheel only contained `core`.
175
+
176
+ ### Removed
177
+
178
+ - **Legacy modules** — `core/pystata_adapter.py`, `core/console_fallback.py`,
179
+ `core/result.py`, `core/version.py`. Their behavior is now provided by
180
+ `core/runner.py`, `core/_runtime.py`, `core/schema.py`.
181
+ - **Legacy tests** — `tests/test_result.py`, `tests/test_version.py`,
182
+ `tests/test_integration.py`. Coverage moved to `tests/test_runner.py`,
183
+ `tests/test_schema.py`, and `tests/test_mcp.py`.
184
+
185
+ ### Migration notes
186
+
187
+ | Before (v0.1) | After (v0.2) |
188
+ | --- | --- |
189
+ | `from stata_code import run` returns `StataResult` | Returns `RunResult` |
190
+ | `result.log` (string) | `result.log.head` / `result.log.tail` (and `get_log(ref)` for full) |
191
+ | `result.results["r(mean)"]` | `result.results.r.scalars["mean"]` (native float) |
192
+ | `result.error` (string) | `result.error.kind` (typed) + `result.error.message` |
193
+ | `result.graphs[0].data` (bytes) | `result.graphs[0].ref` + `get_graph(ref)` |
194
+ | `run(code, capture_graphs=True)` | `run(code, include_graphs="ref" \| "inline" \| "none")` |
195
+ | `run(code, timeout=120)` | `run(code, timeout_ms=120_000)` |
196
+
197
+ `pystata` is no longer declared as a runtime dependency in
198
+ `pyproject.toml` — it is sourced from your local Stata install per the
199
+ documented `_runtime` discovery path.
200
+
201
+ ## [0.1.0] — 2026-04
202
+
203
+ Initial scaffolding. `pystata_adapter`, `console_fallback`, basic kernel
204
+ and MCP server, `References-tools.md` survey, project vision in
205
+ `README.md`. Largely superseded by 0.2.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 brycew6m
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,125 @@
1
+ # License Policy
2
+
3
+ `stata_code` is released under the **MIT License**. To keep the codebase legally clean and freely usable downstream (including by commercial and closed-source projects), this repository follows a strict **protocol-first, clean-room** development policy. This document is the binding policy; contributors must read it before opening a pull request.
4
+
5
+ ---
6
+
7
+ ## 1. Project license
8
+
9
+ - **License:** MIT (see `LICENSE`).
10
+ - **Goal:** Anyone — including commercial and closed-source projects — can integrate, fork, or redistribute `stata_code` without copyleft obligations.
11
+
12
+ This goal is incompatible with deriving from AGPL-3.0 / GPL-3.0 source code. The rules below exist to prevent that.
13
+
14
+ ---
15
+
16
+ ## 2. The three categories of references
17
+
18
+ Every external project relevant to `stata_code` falls into one of three buckets:
19
+
20
+ ### 2.1 Open standards & vendor docs (always allowed)
21
+
22
+ These define **public protocols and APIs**. Reading them, citing them, and implementing against them does not contaminate our code.
23
+
24
+ - **Anthropic MCP specification** — protocol shape, message formats, tool registration semantics.
25
+ - **Jupyter kernel protocol** — `kernel_info`, `execute_request`, message routing.
26
+ - **Language Server Protocol (LSP)** — for any future LSP work.
27
+ - **StataCorp pystata documentation** — official Python API surface.
28
+ - **StataCorp Stata documentation** (`help`, manuals) — `r()`, `e()`, `_rc`, system values.
29
+ - **Stata `.dta` file format documentation** — published by StataCorp.
30
+ - **Anthropic / OpenAI tool-use docs** — function-calling shapes.
31
+
32
+ ### 2.2 Permissively-licensed projects (allowed with attribution if reused)
33
+
34
+ MIT, BSD, Apache 2.0, ISC. Reading source is allowed; copying must follow the license terms (preserve copyright notice, etc.). Even when allowed, we **prefer independent implementation** to keep authorship clean.
35
+
36
+ - `kylebarron/stata-enhanced` — MIT (TextMate grammar; we do not reuse it).
37
+ - `kylebarron/stata-exec` — MIT (Atom; not reused).
38
+ - `kylebarron/language-stata` — MIT (Atom grammar; not reused).
39
+ - `hanlulong/stata-mcp` — MIT (we do not consult its source; see §3).
40
+ - `lbraglia/RStata` — design reference only.
41
+ - `euglevi/stata-language-server` — MIT.
42
+
43
+ ### 2.3 Copyleft projects (source code forbidden)
44
+
45
+ **Source code of these projects must not be read by anyone contributing to `stata_code`.** Their READMEs, public issues, demos, screenshots, and documentation describing user-facing behavior are fine — copyright protects expression, not ideas. But the source itself contaminates.
46
+
47
+ - `SepineTam/stata-mcp` — AGPL-3.0
48
+ - `tmonk/mcp-stata` — AGPL-3.0
49
+ - `tmonk/stata-workbench` — AGPL-3.0
50
+ - `kylebarron/stata_kernel` — GPL-3.0
51
+ - `hugetim/nbstata` — GPL-3.0
52
+
53
+ If new copyleft Stata projects appear, add them here in the same PR that first references them.
54
+
55
+ ---
56
+
57
+ ## 3. The clean-room rule
58
+
59
+ When designing or implementing any feature that overlaps with a copyleft project's behavior:
60
+
61
+ 1. **Do not open the copyleft project's source files.** Not in a browser, not in `git clone`, not in an IDE.
62
+ 2. **You may** read its README, feature list, screenshots, public issues, blog posts, and conference talks describing what it does.
63
+ 3. **You may** read the underlying public protocol or API spec (MCP, pystata, etc.) and implement against that.
64
+ 4. **You may** look at the inputs and outputs (call its tools, observe responses) — black-box behavioral observation is fine.
65
+ 5. **Design from first principles.** Our schema (`SCHEMA.md`) was designed from agent-token-economy principles and the public pystata API. It was not derived by simplifying or rearranging anyone else's schema.
66
+
67
+ If you find yourself thinking *"how does project X handle Y?"*, the answer is: read its docs and observe its behavior. Do not open its source.
68
+
69
+ ---
70
+
71
+ ## 4. If you accidentally read forbidden source
72
+
73
+ It happens. Honesty is the only safe response.
74
+
75
+ 1. **Stop reading immediately.** Close the file.
76
+ 2. **Disclose in the PR or issue.** Note what you read and approximately how much.
77
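+ 3. **Wait at least 30 days** before contributing code in the affected area. If the affected area is small (e.g., one function), a different contributor who has not read the forbidden source implements it instead. If the affected area is broad, the contributor who read the source sits out that area indefinitely.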
78
+ 4. **Do not** quote, paraphrase, or rewrite from memory.
79
+
80
+ This is the same posture used by clean-room reverse-engineering teams. It is conservative on purpose.
81
+
82
+ ---
83
+
84
+ ## 5. Adding a new reference
85
+
86
+ When introducing any new external project to documentation, code, or discussion:
87
+
88
+ 1. Add it to one of the three lists in §2 of this file in the same PR.
89
+ 2. State its license explicitly (check `LICENSE` file, not `package.json`/`README` — those drift).
90
+ 3. If the project is copyleft, the PR must not include any code; it may contain only the new entry in the §2.3 list.
91
+
92
+ Reviewers should reject PRs that mention an external project without classifying it.
93
+
94
+ ---
95
+
96
+ ## 6. Dependencies vs. derivation
97
+
98
+ Note the difference:
99
+
100
+ - **Depending** on an MIT/BSD/Apache library at runtime is fine and does not contaminate.
101
+ - **Depending** on a GPL/AGPL library at runtime *does* contaminate the distributed package; we don't do that for any package we ship under MIT.
102
+ - **Depending** on a GPL/AGPL library only in a separate, GPL-licensed sub-package (e.g., `stata-code-jupyter-glue`) is acceptable as long as the MIT core does not import it. Any such split must be called out at the top of the README and in `pyproject.toml`.
103
+
104
+ ---
105
+
106
+ ## 7. Why this matters
107
+
108
+ Stata is a small ecosystem with active and vigilant maintainers, several of whom have publicly enforced their AGPL terms. A clean license posture:
109
+
110
+ - Keeps `stata_code` usable by any downstream — universities, central banks, commercial vendors.
111
+ - Prevents "rip-off" accusations that have already been levied at fork-style projects in the space.
112
+ - Makes future fundraising, hiring, and acquisitions trivial on the IP side.
113
+ - Protects contributors personally — clean-room compliance is auditable.
114
+
115
+ The cost of this policy is small (some independent design work). The cost of getting it wrong is irreversible: a contaminated codebase cannot be "scrubbed" of AGPL after the fact; only rewritten from scratch by uncontaminated authors.
116
+
117
+ ---
118
+
119
+ ## 8. Acknowledgement on first contribution
120
+
121
+ Every first-time contributor to `stata_code` adds the following line to their first PR description:
122
+
123
+ > I have read `LICENSE-POLICY.md` and confirm I have not consulted source code from the copyleft projects listed therein for the purposes of this contribution.
124
+
125
+ Maintainers may decline contributions without this acknowledgement.