stata-code 0.7.0__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stata_code-0.7.0 → stata_code-0.7.2}/.gitignore +2 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/CHANGELOG.md +60 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/PKG-INFO +11 -8
- {stata_code-0.7.0 → stata_code-0.7.2}/README.md +10 -7
- {stata_code-0.7.0 → stata_code-0.7.2}/pyproject.toml +1 -1
- stata_code-0.7.2/scripts/build_skill_zip.py +105 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/__init__.py +1 -1
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/runner.py +141 -10
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/kernel/kernel.py +32 -2
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/mcp/server.py +376 -1
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_kernel.py +113 -0
- stata_code-0.7.2/tests/test_new_tools.py +472 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_runner.py +61 -0
- stata_code-0.7.2/tests/test_skill_package.py +65 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/LICENSE +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/LICENSE-POLICY.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/PUBLISHING.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/SCHEMA.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/docs/design/hard_timeout.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/examples/01-basic-regression.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/examples/02-did-card-krueger.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/examples/03-graphs.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/examples/04-multi-session.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/examples/05-large-matrix.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/examples/README.md +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/schema/run_result.schema.json +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/scripts/check_versions.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/scripts/export_schema.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/__init__.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/_pool.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/_refs.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/_runtime.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/errors.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/log_artifacts.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/notebook.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/run_index.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/core/schema.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/kernel/__init__.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/kernel/__main__.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/kernel/assets/logo-32x32.png +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/kernel/assets/logo-64x64.png +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/kernel/assets/logo-svg.svg +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/mcp/__init__.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/stata_code/mcp/__main__.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/__init__.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/conftest.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/fixtures/.gitkeep +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_cancel.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_errors.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_log_artifacts.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_mcp.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_mcp_stdio.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_notebook.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_notebook_phase2.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_pool.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_public_api.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_release_versions.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_run_index.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_runtime_discovery.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_schema.py +0 -0
- {stata_code-0.7.0 → stata_code-0.7.2}/tests/test_schema_artifact.py +0 -0
|
@@ -6,6 +6,66 @@ to semver-major.minor for the result schema (see `SCHEMA.md` §6).
|
|
|
6
6
|
|
|
7
7
|
## Unreleased
|
|
8
8
|
|
|
9
|
+
## 0.7.2 — 2026-06-20
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- **Three convenience MCP tools** raise the tool surface from 15 to 18:
|
|
14
|
+
- `install_package(name, source?, url?, replace?, session_id?)` — installs a
|
|
15
|
+
community package via `ssc install` / `net install` without the agent
|
|
16
|
+
having to remember the syntax, then verifies it resolves with `which`.
|
|
17
|
+
Package names and URLs are validated so unsafe input stays out of the generated
|
|
18
|
+
command line; failures surface the typed `error` block (e.g. `network`).
|
|
19
|
+
- `search_log(ref, pattern, is_regex?, ignore_case?, context?, max_matches?)`
|
|
20
|
+
— greps within a truncated `log://` payload and returns only the matching
|
|
21
|
+
lines (with optional context), so a long log can be inspected without
|
|
22
|
+
pulling the whole transcript back through `get_log`.
|
|
23
|
+
- `inspect_data(varlist?, detail?, session_id?)` — runs `describe` +
|
|
24
|
+
`codebook` and returns the structured `dataset` block plus the codebook
|
|
25
|
+
log: a one-call "what's in this dataset" the agent doesn't have to spell out.
|
|
26
|
+
- **On-demand Stata reference library** under `skills/stata-code/references/`
|
|
27
|
+
(~4,200 lines): topic files for core syntax, data management, econometrics,
|
|
28
|
+
causal inference, panel/time series, graphics, and table export; load-bearing
|
|
29
|
+
`error-codes.md` (the full `rc → kind → fix` table + self-repair loop, aligned
|
|
30
|
+
with the typed-error taxonomy) and `defensive-coding.md`; and per-package notes
|
|
31
|
+
for `reghdfe`, `coefplot`, `estout`, and `gtools`. `SKILL.md` gained a routing
|
|
32
|
+
table (read 1–3 files on demand) and a live-vs-offline execution-mode section.
|
|
33
|
+
- **`scripts/build_skill_zip.py`** packages the skill into a deterministic
|
|
34
|
+
`build/stata-code-skill.zip` for upload as Claude.ai project knowledge.
|
|
35
|
+
|
|
36
|
+
## 0.7.1 — 2026-06-19
|
|
37
|
+
|
|
38
|
+
### Fixed
|
|
39
|
+
|
|
40
|
+
- **Jupyter kernel: graphs after the first cell now display.** Graph capture
|
|
41
|
+
detected new graphs by diffing in-memory graph names before/after a run.
|
|
42
|
+
Because Stata keeps only one graph per name and every unnamed graph command
|
|
43
|
+
overwrites the default `Graph` in place, the second and later cells of a
|
|
44
|
+
persistent session produced no name delta and their graphs were silently
|
|
45
|
+
dropped — only the first cell's graph ever rendered. Capture now also
|
|
46
|
+
re-exports any graph the cell's own source shows it (re)drew (every
|
|
47
|
+
`name(...)` target, plus the default `Graph` for any unnamed graph command),
|
|
48
|
+
so in-place redraws surface every time. The same fix covers repeated MCP
|
|
49
|
+
`stata_run` calls in one session. The graph-command detector was tightened
|
|
50
|
+
to distinguish drawing commands from `graph` utility subcommands (`export`,
|
|
51
|
+
`display`, `dir`, `drop`, …) so a utility-only cell no longer re-surfaces a
|
|
52
|
+
stale graph.
|
|
53
|
+
- **Jupyter kernel: no more duplicated code echo in cell output.** pystata
|
|
54
|
+
runs a multi-line cell as a temporary do-file, and Stata echoes every
|
|
55
|
+
submitted command (`. cmd` / `> continuation`) regardless of `echo=False`
|
|
56
|
+
(which only suppresses echo for a single inline command). For a cell with no
|
|
57
|
+
textual output (e.g. a graph) that echo was the *only* thing shown — a
|
|
58
|
+
useless repeat of the source already visible in the input cell. The kernel
|
|
59
|
+
now strips command-echo lines before streaming, keeping genuine command
|
|
60
|
+
output. The full log (with echo) is unchanged in `RunResult.log` for MCP /
|
|
61
|
+
agent consumers.
|
|
62
|
+
|
|
63
|
+
### Changed
|
|
64
|
+
|
|
65
|
+
- **VS Code extension now ships a Marketplace icon** (coef-plot mark, Anthropic
|
|
66
|
+
palette on white) so the listing and Extensions sidebar render branded
|
|
67
|
+
artwork instead of the default placeholder.
|
|
68
|
+
|
|
9
69
|
## 0.7.0 — 2026-05-30
|
|
10
70
|
|
|
11
71
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stata-code
|
|
3
|
-
Version: 0.7.
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: Agent-native Stata bridge — one core, multiple frontends (MCP, Jupyter, VSCode)
|
|
5
5
|
Project-URL: Homepage, https://github.com/brycewang-stanford/stata-code
|
|
6
6
|
Project-URL: Repository, https://github.com/brycewang-stanford/stata-code
|
|
@@ -84,9 +84,9 @@ Description-Content-Type: text/markdown
|
|
|
84
84
|
└─────────────┘ └────────────┘ └─────────────────┘
|
|
85
85
|
```
|
|
86
86
|
|
|
87
|
-
**Status: v0.
|
|
87
|
+
**Status: v0.7 (May 2026)** — the core, MCP server, Jupyter kernel, and VS Code extension work end-to-end against Stata 18 MP. The test suite covers schema, runner, MCP, kernel, notebook, run-index, subprocess-pool, and VS Code modules; CI also checks linting, type safety, schema generation, package metadata, and VSIX packaging. License: **MIT**.
|
|
88
88
|
|
|
89
|
-
Two workflows
|
|
89
|
+
Two workflows the current release explicitly supports for end users:
|
|
90
90
|
|
|
91
91
|
- **Run Stata code from a Jupyter notebook.** `pip install "stata-code[kernel]"` + `stata-code-kernel install --user` registers a **Stata** kernel that the Jupyter Notebook UI, JupyterLab, and the VS Code Jupyter extension all pick up by name. Cells render Stata logs, graphs, and warnings inline (the kernel logo bundled since v0.5 makes it appear in VS Code's kernel picker too). See [As a Jupyter Kernel](#as-a-jupyter-kernel).
|
|
92
92
|
- **Optional agent "fix and rerun" loop.** `stata_run` returns typed `error.kind/line/context` plus `suggestions` on every failure. By default Claude Code only reports diagnostics — but if you explicitly say "fix this and rerun until it passes", the agent uses the same fields to edit your `.do` file and re-call `stata_run` until the run is green. The repair loop is **opt-in**: failed runs are diagnostics first, not automatic rewrite permission. See [Error Recovery in Agent Workflows](#error-recovery-in-agent-workflows).
|
|
@@ -188,7 +188,7 @@ claude mcp add stata-code --scope local -- stata-code-mcp
|
|
|
188
188
|
claude mcp add stata-code --scope project -- stata-code-mcp
|
|
189
189
|
```
|
|
190
190
|
|
|
191
|
-
Then launch `claude` and type `/mcp` to confirm `stata-code` shows up with its
|
|
191
|
+
Then launch `claude` and type `/mcp` to confirm `stata-code` shows up with its 18 tools (`stata_run`, `stata_info`, `get_log`, `search_log`, `get_graph`, `get_matrix`, `inspect_data`, `install_package`, `list_sessions`, `cancel_session`, `reset_session`, `notebook_outline`, `notebook_get_cell`, `notebook_locate`, `notebook_edit_cell`, `notebook_insert_cell`, `notebook_delete_cell`, `list_runs`).
|
|
192
192
|
|
|
193
193
|
#### Error Recovery in Agent Workflows
|
|
194
194
|
|
|
@@ -276,15 +276,18 @@ If an OpenAI-backed client reports `API Error: 400 Invalid schema for function
|
|
|
276
276
|
upgrade to `stata-code>=0.6.5`, then restart the MCP client. Older server
|
|
277
277
|
processes keep advertising the stale schema until they are restarted.
|
|
278
278
|
|
|
279
|
-
The MCP server registers
|
|
279
|
+
The MCP server registers 18 tools:
|
|
280
280
|
|
|
281
281
|
| Tool | Purpose |
|
|
282
282
|
| --- | --- |
|
|
283
283
|
| `stata_run` | Execute Stata code and return a v1.0 RunResult JSON |
|
|
284
284
|
| `stata_info` | Report Stata edition, version, and capabilities |
|
|
285
285
|
| `get_log` | Fetch the full log behind a `log://` ref |
|
|
286
|
+
| `search_log` | Search matching lines inside a stored `log://` payload |
|
|
286
287
|
| `get_graph` | Fetch graph bytes behind a `graph://` ref (`ImageContent`) |
|
|
287
288
|
| `get_matrix` | Fetch matrix payloads behind a `matrix://` ref |
|
|
289
|
+
| `inspect_data` | Run `describe` + `codebook` and return compact dataset metadata |
|
|
290
|
+
| `install_package` | Install an SSC or explicit `net install` package and verify it resolves |
|
|
288
291
|
| `list_sessions` | Enumerate live sessions |
|
|
289
292
|
| `cancel_session` | Cancel a session; the subprocess-backed path terminates in-flight runs and short-circuits pending ones |
|
|
290
293
|
| `reset_session` | Drop a session's data |
|
|
@@ -416,7 +419,7 @@ stata_code/
|
|
|
416
419
|
│ ├── runner.py # in-process execute(); collects everything via sfi
|
|
417
420
|
│ └── _pool.py # subprocess workers for public API / MCP hard timeouts
|
|
418
421
|
├── mcp/
|
|
419
|
-
│ └── server.py # MCP server (
|
|
422
|
+
│ └── server.py # MCP server (18 tools)
|
|
420
423
|
└── kernel/
|
|
421
424
|
└── kernel.py # Jupyter kernel
|
|
422
425
|
```
|
|
@@ -444,7 +447,7 @@ stata_code/
|
|
|
444
447
|
|
|
445
448
|
## Roadmap
|
|
446
449
|
|
|
447
|
-
### Done (through v0.
|
|
450
|
+
### Done (through v0.7 — May 2026)
|
|
448
451
|
|
|
449
452
|
- v1.0 result schema ([SCHEMA.md](SCHEMA.md))
|
|
450
453
|
- `pystata`-based runner with native-typed `r()`, `e()`, and matrices
|
|
@@ -454,7 +457,7 @@ stata_code/
|
|
|
454
457
|
- Log truncation with ref store
|
|
455
458
|
- Warning extraction: 5 categories + generic notes
|
|
456
459
|
- 32-kind error taxonomy with canonical suggestions
|
|
457
|
-
- MCP server:
|
|
460
|
+
- MCP server: 18 tools, including notebook navigation / search / atomic edits, the run-bundle index (`list_runs`), log grep (`search_log`), dataset inspection (`inspect_data`), and package installation (`install_package`)
|
|
458
461
|
- Jupyter kernel: rewired to the v1.0 pipeline, kernel logos bundled
|
|
459
462
|
- Matrix size cap + `get_matrix(ref)` for large matrices (>10k cells)
|
|
460
463
|
- Subprocess-backed hard timeout and cancellation for the public Python API and MCP server: `timeout_ms`, `cancel(session_id)`, and MCP `cancel_session`
|
|
@@ -45,9 +45,9 @@
|
|
|
45
45
|
└─────────────┘ └────────────┘ └─────────────────┘
|
|
46
46
|
```
|
|
47
47
|
|
|
48
|
-
**Status: v0.
|
|
48
|
+
**Status: v0.7 (May 2026)** — the core, MCP server, Jupyter kernel, and VS Code extension work end-to-end against Stata 18 MP. The test suite covers schema, runner, MCP, kernel, notebook, run-index, subprocess-pool, and VS Code modules; CI also checks linting, type safety, schema generation, package metadata, and VSIX packaging. License: **MIT**.
|
|
49
49
|
|
|
50
|
-
Two workflows
|
|
50
|
+
Two workflows the current release explicitly supports for end users:
|
|
51
51
|
|
|
52
52
|
- **Run Stata code from a Jupyter notebook.** `pip install "stata-code[kernel]"` + `stata-code-kernel install --user` registers a **Stata** kernel that the Jupyter Notebook UI, JupyterLab, and the VS Code Jupyter extension all pick up by name. Cells render Stata logs, graphs, and warnings inline (the kernel logo bundled since v0.5 makes it appear in VS Code's kernel picker too). See [As a Jupyter Kernel](#as-a-jupyter-kernel).
|
|
53
53
|
- **Optional agent "fix and rerun" loop.** `stata_run` returns typed `error.kind/line/context` plus `suggestions` on every failure. By default Claude Code only reports diagnostics — but if you explicitly say "fix this and rerun until it passes", the agent uses the same fields to edit your `.do` file and re-call `stata_run` until the run is green. The repair loop is **opt-in**: failed runs are diagnostics first, not automatic rewrite permission. See [Error Recovery in Agent Workflows](#error-recovery-in-agent-workflows).
|
|
@@ -149,7 +149,7 @@ claude mcp add stata-code --scope local -- stata-code-mcp
|
|
|
149
149
|
claude mcp add stata-code --scope project -- stata-code-mcp
|
|
150
150
|
```
|
|
151
151
|
|
|
152
|
-
Then launch `claude` and type `/mcp` to confirm `stata-code` shows up with its
|
|
152
|
+
Then launch `claude` and type `/mcp` to confirm `stata-code` shows up with its 18 tools (`stata_run`, `stata_info`, `get_log`, `search_log`, `get_graph`, `get_matrix`, `inspect_data`, `install_package`, `list_sessions`, `cancel_session`, `reset_session`, `notebook_outline`, `notebook_get_cell`, `notebook_locate`, `notebook_edit_cell`, `notebook_insert_cell`, `notebook_delete_cell`, `list_runs`).
|
|
153
153
|
|
|
154
154
|
#### Error Recovery in Agent Workflows
|
|
155
155
|
|
|
@@ -237,15 +237,18 @@ If an OpenAI-backed client reports `API Error: 400 Invalid schema for function
|
|
|
237
237
|
upgrade to `stata-code>=0.6.5`, then restart the MCP client. Older server
|
|
238
238
|
processes keep advertising the stale schema until they are restarted.
|
|
239
239
|
|
|
240
|
-
The MCP server registers
|
|
240
|
+
The MCP server registers 18 tools:
|
|
241
241
|
|
|
242
242
|
| Tool | Purpose |
|
|
243
243
|
| --- | --- |
|
|
244
244
|
| `stata_run` | Execute Stata code and return a v1.0 RunResult JSON |
|
|
245
245
|
| `stata_info` | Report Stata edition, version, and capabilities |
|
|
246
246
|
| `get_log` | Fetch the full log behind a `log://` ref |
|
|
247
|
+
| `search_log` | Search matching lines inside a stored `log://` payload |
|
|
247
248
|
| `get_graph` | Fetch graph bytes behind a `graph://` ref (`ImageContent`) |
|
|
248
249
|
| `get_matrix` | Fetch matrix payloads behind a `matrix://` ref |
|
|
250
|
+
| `inspect_data` | Run `describe` + `codebook` and return compact dataset metadata |
|
|
251
|
+
| `install_package` | Install an SSC or explicit `net install` package and verify it resolves |
|
|
249
252
|
| `list_sessions` | Enumerate live sessions |
|
|
250
253
|
| `cancel_session` | Cancel a session; the subprocess-backed path terminates in-flight runs and short-circuits pending ones |
|
|
251
254
|
| `reset_session` | Drop a session's data |
|
|
@@ -377,7 +380,7 @@ stata_code/
|
|
|
377
380
|
│ ├── runner.py # in-process execute(); collects everything via sfi
|
|
378
381
|
│ └── _pool.py # subprocess workers for public API / MCP hard timeouts
|
|
379
382
|
├── mcp/
|
|
380
|
-
│ └── server.py # MCP server (
|
|
383
|
+
│ └── server.py # MCP server (18 tools)
|
|
381
384
|
└── kernel/
|
|
382
385
|
└── kernel.py # Jupyter kernel
|
|
383
386
|
```
|
|
@@ -405,7 +408,7 @@ stata_code/
|
|
|
405
408
|
|
|
406
409
|
## Roadmap
|
|
407
410
|
|
|
408
|
-
### Done (through v0.
|
|
411
|
+
### Done (through v0.7 — May 2026)
|
|
409
412
|
|
|
410
413
|
- v1.0 result schema ([SCHEMA.md](SCHEMA.md))
|
|
411
414
|
- `pystata`-based runner with native-typed `r()`, `e()`, and matrices
|
|
@@ -415,7 +418,7 @@ stata_code/
|
|
|
415
418
|
- Log truncation with ref store
|
|
416
419
|
- Warning extraction: 5 categories + generic notes
|
|
417
420
|
- 32-kind error taxonomy with canonical suggestions
|
|
418
|
-
- MCP server:
|
|
421
|
+
- MCP server: 18 tools, including notebook navigation / search / atomic edits, the run-bundle index (`list_runs`), log grep (`search_log`), dataset inspection (`inspect_data`), and package installation (`install_package`)
|
|
419
422
|
- Jupyter kernel: rewired to the v1.0 pipeline, kernel logos bundled
|
|
420
423
|
- Matrix size cap + `get_matrix(ref)` for large matrices (>10k cells)
|
|
421
424
|
- Subprocess-backed hard timeout and cancellation for the public Python API and MCP server: `timeout_ms`, `cancel(session_id)`, and MCP `cancel_session`
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Package the ``stata-code`` skill into a single uploadable ``.zip``.
|
|
2
|
+
|
|
3
|
+
The skill (``skills/stata-code/SKILL.md`` + the ``references/`` library) is
|
|
4
|
+
consumed two ways:
|
|
5
|
+
|
|
6
|
+
* In-repo / Claude Code — read straight from ``skills/stata-code/``.
|
|
7
|
+
* Claude.ai project knowledge — uploaded as a ``.zip``. This script builds
|
|
8
|
+
that archive.
|
|
9
|
+
|
|
10
|
+
The archive contains a single top-level ``stata-code/`` folder so it extracts
|
|
11
|
+
cleanly::
|
|
12
|
+
|
|
13
|
+
stata-code/SKILL.md
|
|
14
|
+
stata-code/references/econometrics.md
|
|
15
|
+
stata-code/references/packages/reghdfe.md
|
|
16
|
+
...
|
|
17
|
+
|
|
18
|
+
Run::
|
|
19
|
+
|
|
20
|
+
python scripts/build_skill_zip.py # -> build/stata-code-skill.zip
|
|
21
|
+
python scripts/build_skill_zip.py -o /tmp/out.zip # custom destination
|
|
22
|
+
|
|
23
|
+
The build is deterministic (sorted entries, fixed timestamps) so re-running it
|
|
24
|
+
on unchanged inputs produces a byte-identical archive.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import argparse
|
|
30
|
+
import sys
|
|
31
|
+
import zipfile
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
|
|
34
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
35
|
+
SKILL_DIR = REPO_ROOT / "skills" / "stata-code"
|
|
36
|
+
DEFAULT_OUTPUT = REPO_ROOT / "build" / "stata-code-skill.zip"
|
|
37
|
+
ARCHIVE_PREFIX = "stata-code"
|
|
38
|
+
|
|
39
|
+
# Fixed timestamp for reproducible archives (zip epoch starts at 1980).
|
|
40
|
+
_FIXED_DATE_TIME = (1980, 1, 1, 0, 0, 0)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def collect_files(skill_dir: Path = SKILL_DIR) -> list[Path]:
    """Return every shippable skill file under ``skill_dir`` in a stable order.

    Walks ``skill_dir`` recursively and keeps regular files, leaving out
    ``.DS_Store`` files and any path that has a ``__pycache__`` component.

    The result is ordered by each file's path components relative to
    ``skill_dir``, compared as plain (case-sensitive) strings. Sorting the
    ``Path`` objects directly is case-insensitive on Windows, which would make
    the archive entry order depend on the platform the build ran on.

    Raises ``FileNotFoundError`` when ``skill_dir`` is not an existing
    directory.
    """
    if not skill_dir.is_dir():
        raise FileNotFoundError(f"skill directory not found: {skill_dir}")
    skip = {".DS_Store"}
    files = [
        p
        for p in skill_dir.rglob("*")
        if p.is_file() and p.name not in skip and "__pycache__" not in p.parts
    ]
    # Tuple-of-str keys compare identically on every OS; on POSIX this yields
    # the same order as sorting the Path objects themselves.
    return sorted(files, key=lambda p: p.relative_to(skill_dir).parts)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def build_zip(
    dest: Path = DEFAULT_OUTPUT,
    skill_dir: Path = SKILL_DIR,
) -> list[str]:
    """Write the skill archive to ``dest``; return the arcnames included.

    Every file returned by ``collect_files(skill_dir)`` is stored under the
    ``ARCHIVE_PREFIX/`` folder using its POSIX-style path relative to
    ``skill_dir``. Missing parent directories of ``dest`` are created.

    Each entry is written from an explicit ``ZipInfo`` carrying the fixed
    timestamp, a fixed ``rw-r--r--`` mode and a fixed "created on Unix" marker,
    so the entry metadata does not depend on file mtimes or on the operating
    system running the build.

    Raises ``FileNotFoundError`` when ``skill_dir`` is missing (propagated
    from ``collect_files``) or contains no shippable files.
    """
    files = collect_files(skill_dir)
    if not files:
        raise FileNotFoundError(f"no skill files under {skill_dir}")

    dest.parent.mkdir(parents=True, exist_ok=True)
    arcnames: list[str] = []
    with zipfile.ZipFile(dest, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for path in files:
            rel = path.relative_to(skill_dir).as_posix()
            arcname = f"{ARCHIVE_PREFIX}/{rel}"
            info = zipfile.ZipInfo(arcname, date_time=_FIXED_DATE_TIME)
            info.compress_type = zipfile.ZIP_DEFLATED
            # ZipInfo picks create_system from the host OS (0 on Windows,
            # 3 elsewhere). Pin it to 3 (Unix) so the header bytes are the
            # same everywhere and the mode bits below are read as Unix perms.
            info.create_system = 3
            info.external_attr = 0o644 << 16  # permission bits rw-r--r--
            zf.writestr(info, path.read_bytes())
            arcnames.append(arcname)
    return arcnames
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def main() -> int:
    """Command-line entry point: build the archive and report what was written.

    Accepts ``-o/--output`` to override the destination (default
    ``DEFAULT_OUTPUT``). Returns ``0`` on success, or ``1`` after printing the
    message to stderr when ``build_zip`` raises ``FileNotFoundError``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains bullet lists and literal blocks; the
        # default formatter would re-flow them into one paragraph in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help=f"Destination .zip (default: {DEFAULT_OUTPUT.relative_to(REPO_ROOT)}).",
    )
    args = parser.parse_args()

    try:
        arcnames = build_zip(args.output)
    except FileNotFoundError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    size = args.output.stat().st_size
    print(f"wrote: {args.output} ({len(arcnames)} files, {size:,} bytes)")
    return 0
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
|
|
105
|
+
sys.exit(main())
|
|
@@ -218,6 +218,107 @@ def get_log(ref: str) -> dict[str, Any]:
|
|
|
218
218
|
}
|
|
219
219
|
|
|
220
220
|
|
|
221
|
+
def search_log(
    ref: str,
    pattern: str,
    *,
    is_regex: bool = False,
    ignore_case: bool = True,
    context: int = 0,
    max_matches: int = 50,
) -> dict[str, Any]:
    """Auxiliary tool: find matching lines inside a stored ``log://`` payload.

    Lets a caller inspect a long log that was returned by reference without
    fetching the whole text through :func:`get_log`.

    Parameters
    ----------
    ref : str
        Ref whose stored payload is a dict with a string ``text`` and a
        ``lines_total`` entry.
    pattern : str
        Non-empty substring, or a Python regular expression when
        ``is_regex=True``, tested against each line.
    is_regex : bool
        Compile ``pattern`` with :mod:`re`; a pattern that fails to compile
        raises :class:`ValueError`.
    ignore_case : bool
        Match case-insensitively (default ``True``).
    context : int
        Number of neighbouring lines to attach on each side of a match,
        clamped to ``0..10``. ``before`` / ``after`` are left out when empty.
    max_matches : int
        Maximum number of matches to return, clamped to ``1..1000``.

    Returns
    -------
    dict
        ``{ref, pattern, is_regex, lines_total, match_count, truncated,
        matches: [{line_no, text, before?, after?}]}`` with 1-based
        ``line_no``. ``truncated`` is ``True`` when a further match was found
        after the cap had been reached.

    Raises
    ------
    RefNotFound
        The ref does not resolve to a log-shaped payload.
    ValueError
        ``pattern`` is empty, or is an invalid regex when ``is_regex=True``.
    """
    stored = _refs.get(ref)
    is_log_payload = (
        isinstance(stored, dict)
        and isinstance(stored.get("text"), str)
        and "lines_total" in stored
    )
    if not is_log_payload:
        raise RefNotFound(ref, kind="unknown_log_ref")
    if not pattern:
        raise ValueError("pattern must be a non-empty string")

    # Clamp caller-supplied sizes into their supported ranges.
    window = min(max(int(context), 0), 10)
    limit = min(max(int(max_matches), 1), 1000)

    if is_regex:
        try:
            compiled = re.compile(pattern, re.IGNORECASE if ignore_case else 0)
        except re.error as exc:
            raise ValueError(f"invalid regex: {exc}") from exc

        def is_match(candidate: str) -> bool:
            return compiled.search(candidate) is not None

    elif ignore_case:
        folded = pattern.lower()

        def is_match(candidate: str) -> bool:
            return folded in candidate.lower()

    else:

        def is_match(candidate: str) -> bool:
            return pattern in candidate

    all_lines = stored["text"].split("\n")
    hits: list[dict[str, Any]] = []
    overflow = False
    for line_no, content in enumerate(all_lines, start=1):
        if not is_match(content):
            continue
        if len(hits) >= limit:
            # One more match exists beyond the cap: report it and stop.
            overflow = True
            break
        record: dict[str, Any] = {"line_no": line_no, "text": content}
        if window:
            pos = line_no - 1
            leading = all_lines[max(0, pos - window):pos]
            trailing = all_lines[line_no:line_no + window]
            if leading:
                record["before"] = leading
            if trailing:
                record["after"] = trailing
        hits.append(record)

    return {
        "ref": ref,
        "pattern": pattern,
        "is_regex": is_regex,
        "lines_total": stored["lines_total"],
        "match_count": len(hits),
        "truncated": overflow,
        "matches": hits,
    }
|
|
320
|
+
|
|
321
|
+
|
|
221
322
|
def cancel(session_id: str = "main") -> bool:
|
|
222
323
|
"""Request cancellation of the next ``execute()`` call for ``session_id``.
|
|
223
324
|
|
|
@@ -1195,11 +1296,20 @@ def _extract_warnings(log: str) -> list: # list[StataWarning]
|
|
|
1195
1296
|
|
|
1196
1297
|
|
|
1197
1298
|
_GRAPH_NAME_RE = re.compile(r"\bname\(\s*([A-Za-z_][A-Za-z0-9_]*)", re.IGNORECASE)
|
|
1299
|
+
# Stata's default in-memory graph name, (re)used by any graph command that
|
|
1300
|
+
# omits an explicit `name(...)` option. Capture/redraw detection keys off this.
|
|
1301
|
+
_DEFAULT_GRAPH_NAME = "Graph"
|
|
1302
|
+
# Commands that actually *draw* a graph (and thus create/overwrite an
|
|
1303
|
+
# in-memory graph). Deliberately excludes the `graph` utility subcommands
|
|
1304
|
+
# (export, display, dir, drop, describe, save, use, rename, set, copy, query,
|
|
1305
|
+
# replay) — those operate on existing graphs and must not be mistaken for a
|
|
1306
|
+
# redraw, or a bare `graph export` cell would spuriously re-surface a stale
|
|
1307
|
+
# graph.
|
|
1198
1308
|
_GRAPH_COMMAND_RE = re.compile(
|
|
1199
1309
|
r"^\s*(?:"
|
|
1200
|
-
r"graph\s
|
|
1201
|
-
r"twoway|scatter|line|connected|histogram|kdensity|lowess|
|
|
1202
|
-
r"coefplot|binscatter"
|
|
1310
|
+
r"graph\s+(?:bar|hbar|box|hbox|dot|pie|twoway|matrix|combine)\b|"
|
|
1311
|
+
r"twoway|scatter|line|connected|histogram|hist|kdensity|lpoly|lowess|"
|
|
1312
|
+
r"lfit|qfit|coefplot|binscatter|marginsplot"
|
|
1203
1313
|
r")\b",
|
|
1204
1314
|
re.IGNORECASE,
|
|
1205
1315
|
)
|
|
@@ -1262,19 +1372,40 @@ def _collect_graphs(
|
|
|
1262
1372
|
source_hints: dict[str, tuple[str, int]] | None = None,
|
|
1263
1373
|
unnamed_source_hints: list[tuple[str, int]] | None = None,
|
|
1264
1374
|
) -> list[GraphInfo]:
|
|
1265
|
-
"""Capture graphs that user code newly created.
|
|
1375
|
+
"""Capture graphs that user code newly created or redrew.
|
|
1266
1376
|
|
|
1267
1377
|
Strategy: snapshot graph names before user code (`pre_existing`), call
|
|
1268
|
-
after to find the post-existing list
|
|
1269
|
-
new
|
|
1270
|
-
|
|
1378
|
+
after to find the post-existing list. Capture a graph when its name is
|
|
1379
|
+
genuinely new *or* when this cell's source shows it (re)drew that name.
|
|
1380
|
+
|
|
1381
|
+
The redraw case matters because Stata keeps only one in-memory graph per
|
|
1382
|
+
name, so a command that overwrites an existing name (most commonly the
|
|
1383
|
+
default ``Graph``, produced by any unnamed graph command) leaves the
|
|
1384
|
+
``graph dir`` name set unchanged. A pure set-difference against
|
|
1385
|
+
`pre_existing` therefore misses it — which is why, in a persistent session
|
|
1386
|
+
(Jupyter cell 2+, repeated MCP runs), only the first graph ever surfaced.
|
|
1387
|
+
|
|
1388
|
+
For each captured graph: `graph display <name>` (makes it active),
|
|
1389
|
+
`graph export` to a tmpfile, read bytes, store under a ref. Tmpfile is
|
|
1390
|
+
deleted after.
|
|
1271
1391
|
"""
|
|
1272
1392
|
after_names = _list_graph_names(rt)
|
|
1273
|
-
new_names = [n for n in after_names if n not in pre_existing]
|
|
1274
|
-
if not new_names:
|
|
1275
|
-
return []
|
|
1276
1393
|
source_hints = source_hints or {}
|
|
1277
1394
|
unnamed_source_hints = unnamed_source_hints or []
|
|
1395
|
+
|
|
1396
|
+
# Names this cell explicitly drew, inferred from its source: every
|
|
1397
|
+
# `name(...)` option, plus the default graph when any unnamed graph
|
|
1398
|
+
# command ran. These are re-captured even if they already existed, so an
|
|
1399
|
+
# in-place redraw is not dropped.
|
|
1400
|
+
redrawn = set(source_hints)
|
|
1401
|
+
if unnamed_source_hints:
|
|
1402
|
+
redrawn.add(_DEFAULT_GRAPH_NAME)
|
|
1403
|
+
|
|
1404
|
+
new_names = [
|
|
1405
|
+
n for n in after_names if n not in pre_existing or n in redrawn
|
|
1406
|
+
]
|
|
1407
|
+
if not new_names:
|
|
1408
|
+
return []
|
|
1278
1409
|
unattributed_names = [n for n in new_names if n not in source_hints]
|
|
1279
1410
|
unnamed_by_graph: dict[str, tuple[str, int]] = {}
|
|
1280
1411
|
if len(unattributed_names) == len(unnamed_source_hints):
|
|
@@ -102,6 +102,35 @@ def _word_at_cursor(code: str, cursor_pos: int) -> tuple[str, int, int]:
|
|
|
102
102
|
return code[start:end], start, end
|
|
103
103
|
|
|
104
104
|
|
|
105
|
+
def _strip_command_echo(log_text: str) -> str:
    """Drop Stata's do-file command echo from a captured cell log.

    Per the 0.7.1 changelog, a multi-line cell runs as a temporary do-file and
    Stata echoes each submitted command (``. cmd``, then ``> ...`` for
    continuation lines). The notebook input cell already shows that source,
    so the echo is removed before the log is streamed.

    Processing, line by line (split on ``"\\n"``):

    * lines starting at column 0 with ``". "`` or ``"> "`` are removed;
    * blank lines are dropped when they would lead the result or directly
      follow another blank line, so removed echoes do not leave gaps;
    * trailing blank lines are trimmed.

    Returns the remaining lines joined with ``"\\n"`` (no trailing newline);
    an input consisting only of echo and blank lines yields ``""``.

    NOTE(review): Stata may also prefix wrapped long *output* lines with
    ``"> "``; if so, those continuation lines are removed here too. Confirm
    that this is acceptable.
    """
    echo_prefixes = (". ", "> ")
    kept: list[str] = []
    for line in log_text.split("\n"):
        if line.startswith(echo_prefixes):
            continue
        # Collapse leading and consecutive blank lines left by removed echoes.
        if not line.strip() and (not kept or not kept[-1].strip()):
            continue
        kept.append(line)
    while kept and not kept[-1].strip():
        kept.pop()
    return "\n".join(kept)
|
|
132
|
+
|
|
133
|
+
|
|
105
134
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
106
135
|
# Kernel
|
|
107
136
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -155,8 +184,9 @@ class StataKernel(_KernelBase):
|
|
|
155
184
|
self._last_result = result
|
|
156
185
|
|
|
157
186
|
if not silent:
|
|
158
|
-
if result.log.head
|
|
159
|
-
|
|
187
|
+
log_text = _strip_command_echo(result.log.head) if result.log.head else ""
|
|
188
|
+
if log_text:
|
|
189
|
+
self._stream("stdout", log_text + "\n")
|
|
160
190
|
if result.warnings:
|
|
161
191
|
for w in result.warnings:
|
|
162
192
|
self._stream("stderr", f"[{w.kind}] {w.message}\n")
|