pubrun 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. pubrun-0.1.1/.gitignore +56 -0
  2. pubrun-0.1.1/CHANGELOG.md +93 -0
  3. pubrun-0.1.1/CITATION.cff +21 -0
  4. pubrun-0.1.1/LICENSE +29 -0
  5. pubrun-0.1.1/PKG-INFO +250 -0
  6. pubrun-0.1.1/README.md +217 -0
  7. pubrun-0.1.1/docs/api.md +192 -0
  8. pubrun-0.1.1/docs/architecture.md +254 -0
  9. pubrun-0.1.1/docs/cli.md +186 -0
  10. pubrun-0.1.1/docs/configuration.md +223 -0
  11. pubrun-0.1.1/docs/functional_spec.md +692 -0
  12. pubrun-0.1.1/docs/manifest.md +336 -0
  13. pubrun-0.1.1/examples/00_auto_start.py +23 -0
  14. pubrun-0.1.1/examples/01_minimal_start_stop.py +35 -0
  15. pubrun-0.1.1/examples/02_context_manager.py +32 -0
  16. pubrun-0.1.1/examples/03_audit_decorator.py +57 -0
  17. pubrun-0.1.1/examples/04_annotate_events.py +39 -0
  18. pubrun-0.1.1/examples/05_execution_phases.py +44 -0
  19. pubrun-0.1.1/examples/06_subprocess_spy.py +31 -0
  20. pubrun-0.1.1/examples/07_file_capture.py +45 -0
  21. pubrun-0.1.1/examples/08_diff_engine.py +47 -0
  22. pubrun-0.1.1/examples/09_cli_methods.py +38 -0
  23. pubrun-0.1.1/examples/10_hardware_probe.py +34 -0
  24. pubrun-0.1.1/examples/11_cli_report.py +32 -0
  25. pubrun-0.1.1/examples/verify_all.py +82 -0
  26. pubrun-0.1.1/pyproject.toml +63 -0
  27. pubrun-0.1.1/schemas/manifest.schema.json +1306 -0
  28. pubrun-0.1.1/src/pubrun/__init__.py +305 -0
  29. pubrun-0.1.1/src/pubrun/__main__.py +478 -0
  30. pubrun-0.1.1/src/pubrun/analysis/__init__.py +1 -0
  31. pubrun-0.1.1/src/pubrun/analysis/diff.py +149 -0
  32. pubrun-0.1.1/src/pubrun/analysis/render.py +129 -0
  33. pubrun-0.1.1/src/pubrun/capture/console.py +151 -0
  34. pubrun-0.1.1/src/pubrun/capture/environment.py +27 -0
  35. pubrun-0.1.1/src/pubrun/capture/git.py +67 -0
  36. pubrun-0.1.1/src/pubrun/capture/hardware.py +175 -0
  37. pubrun-0.1.1/src/pubrun/capture/host.py +29 -0
  38. pubrun-0.1.1/src/pubrun/capture/invocation.py +184 -0
  39. pubrun-0.1.1/src/pubrun/capture/packages.py +71 -0
  40. pubrun-0.1.1/src/pubrun/capture/process.py +35 -0
  41. pubrun-0.1.1/src/pubrun/capture/python_runtime.py +24 -0
  42. pubrun-0.1.1/src/pubrun/capture/redaction.py +191 -0
  43. pubrun-0.1.1/src/pubrun/capture/resources.py +148 -0
  44. pubrun-0.1.1/src/pubrun/capture/subprocesses.py +190 -0
  45. pubrun-0.1.1/src/pubrun/config.py +123 -0
  46. pubrun-0.1.1/src/pubrun/events.py +82 -0
  47. pubrun-0.1.1/src/pubrun/py.typed +1 -0
  48. pubrun-0.1.1/src/pubrun/report/__init__.py +1 -0
  49. pubrun-0.1.1/src/pubrun/report/diagnostics.py +199 -0
  50. pubrun-0.1.1/src/pubrun/report/meta_snapshot.py +72 -0
  51. pubrun-0.1.1/src/pubrun/report/methods.py +70 -0
  52. pubrun-0.1.1/src/pubrun/report/templates.py +46 -0
  53. pubrun-0.1.1/src/pubrun/report/utils.py +85 -0
  54. pubrun-0.1.1/src/pubrun/resources/default.toml +241 -0
  55. pubrun-0.1.1/src/pubrun/tracker.py +325 -0
  56. pubrun-0.1.1/src/pubrun/writer.py +63 -0
  57. pubrun-0.1.1/tests/conftest.py +44 -0
  58. pubrun-0.1.1/tests/fixtures/sample_manifest.json +140 -0
  59. pubrun-0.1.1/tests/scripts/hpc_node.py +11 -0
  60. pubrun-0.1.1/tests/test_api.py +317 -0
  61. pubrun-0.1.1/tests/test_capture.py +166 -0
  62. pubrun-0.1.1/tests/test_capture_engines.py +236 -0
  63. pubrun-0.1.1/tests/test_cli.py +308 -0
  64. pubrun-0.1.1/tests/test_config.py +208 -0
  65. pubrun-0.1.1/tests/test_events.py +152 -0
  66. pubrun-0.1.1/tests/test_hardware.py +46 -0
  67. pubrun-0.1.1/tests/test_quality.py +253 -0
  68. pubrun-0.1.1/tests/test_redaction.py +123 -0
  69. pubrun-0.1.1/tests/test_reports.py +164 -0
  70. pubrun-0.1.1/tests/test_resources.py +50 -0
  71. pubrun-0.1.1/tests/test_tracker.py +52 -0
  72. pubrun-0.1.1/tox.ini +10 -0
@@ -0,0 +1,56 @@
1
+ # These are some examples of commonly ignored file patterns.
2
+ # You should customize this list as applicable to your project.
3
+ # Learn more about .gitignore:
4
+ # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
5
+
6
+ # Build, cache, and run-output artifacts (Node and Python)
7
+ node_modules/
8
+ dist/
9
+ tmp/
10
+ runs/
11
+ __pycache__/
12
+ *.pyc
13
+ p3*/
14
+
15
+ # Compiled Java class files
16
+ *.class
17
+
18
+ # Compiled Python bytecode
19
+ *.py[cod]
20
+ .pytest_cache/
21
+
22
+ # Log files
23
+ *.log
24
+
25
+ # Package files
26
+ *.jar
27
+
28
+ # Maven
29
+ target/
30
+ dist/
31
+
32
+ # JetBrains IDE
33
+ .idea/
34
+
35
+ # Unit test reports
36
+ TEST*.xml
37
+
38
+ # Generated by MacOS
39
+ .DS_Store
40
+
41
+ # Generated by Windows
42
+ Thumbs.db
43
+
44
+ # Applications
45
+ *.app
46
+ *.exe
47
+ *.war
48
+
49
+ # Large media files
50
+ *.mp4
51
+ *.tiff
52
+ *.avi
53
+ *.flv
54
+ *.mov
55
+ *.wmv
56
+
@@ -0,0 +1,93 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
6
+
7
+ ## [0.1.1] - 2026-05-09
8
+
9
+ ### Added
10
+
11
+ - **`--version` CLI flag**: `pubrun --version` now prints the installed version.
12
+ - **`PUBRUN_PROFILE` environment variable**: Overrides `[core].profile` at runtime without a config file.
13
+ - **`__all__` declaration**: `__init__.py` now declares a formal public API surface.
14
+ - **Console tee documentation**: README Quick Start now documents the stdout/stderr tee behavior and how to disable it for high-output scripts.
15
+
16
+ ### Changed
17
+
18
+ - **Single-source versioning**: `__version__` is now read from installed package metadata via `importlib.metadata` instead of being hard-coded.
19
+ - **CLI `prog` name**: Help output now shows `pubrun` instead of `__main__.py`.
20
+ - **CLI help text**: All help strings rewritten for conciseness. Removed verbose and non-technical phrasing.
21
+ - **Error messages**: Standardized CLI error messages to use consistent `Error: Failed to ...` format.
22
+ - **`default.toml` comments**: Full rewrite. All configuration comments now use concise, technical phrasing.
23
+ - **Module docstring**: `__init__.py` docstring rewritten for clarity.
24
+ - **`CITATION.cff`**: Version bumped to 0.1.1; abstract cleaned.
25
+
26
+ ### Improved
27
+
28
+ - **Docstring audit**: All docstrings and inline comments across 24 source files cleaned to remove non-technical prose.
29
+ - **`_handle_inactive()` extraction**: Duplicate inactive-run handling from `annotate()` and `phase.__enter__()` consolidated into a shared helper.
30
+
31
+ ### Security
32
+
33
+ - **Subprocess argv redaction**: CLI arguments in `sys.argv` and subprocess records matching sensitive patterns (passwords, tokens, API keys, etc.) are now redacted before being written to the manifest. Configurable via `[redaction].argv_enabled` in `.pubrun.toml`.
34
+ - **Expanded secret detection**: Added patterns for `private`, `conn_str`, `connection_string`, `database_url`, `dsn`, `signing`, and `bearer` to the default sensitive key regex.
35
+ - **Thread-safe subprocess tracking**: `SubprocessSpy._records` mutations are now protected by `threading.Lock` to prevent race conditions in parallelized HPC array jobs.
36
+
37
+ ### Fixed
38
+
39
+ - **Double-finalization guard**: `_finalize_state()` is now idempotent, preventing redundant cleanup calls in writer or exit hooks.
40
+ - **Ghost mode stability**: Tracker now initializes `_outcome` and `_finalized` attributes even on failure (e.g., read-only filesystem), preventing `AttributeError` on `stop()`.
41
+ - **Event stream race condition**: Consolidated double-lock into a single atomic lock covering both budget check and file I/O.
42
+ - **Config merge leakage**: `_deep_merge()` now uses `copy.deepcopy()` to prevent nested dictionary reference sharing across run instances.
43
+ - **Diagnostics field names**: Subprocess reports now use `argv`/`exit_code` to match the actual manifest schema (was `command`/`return_code`).
44
+ - **Mutable default argument**: Fixed `get_invocation(config: Dict = {})` to `Optional[Dict] = None`.
45
+ - **OS detection**: Methods report now uses `host.os_name` from the manifest instead of the Windows-only `OS` environment variable.
46
+ - **Import-time safety**: Boot sequence wrapped in `try/except` to prevent corrupt configs from crashing `import pubrun`.
47
+ - **shlex fallback**: `SubprocessSpy` now handles unterminated quotes in command strings without crashing.
48
+
49
+ ### Improved
50
+
51
+ - **Console performance**: `TqdmSafeTee.write()` rewritten from O(n²) char-by-char to O(n) split-based processing.
52
+ - **File hashing**: SHA-256 generation uses chunked 8KB reads instead of full-file read, preventing memory spikes on large inputs.
53
+ - **Redaction configurability**: Env var and argv redaction are independently toggleable via `[redaction].env_enabled` and `[redaction].argv_enabled`. Both default to on.
54
+ - **Git safety**: `_run_git()` calls wrapped in `disable_spy()` to prevent circular subprocess logging.
55
+
56
+ ### Removed
57
+
58
+ - 224 redundant `pass # for auto-indentation` statements across 18 source files.
59
+
60
+ ### Tests
61
+
62
+ - Test suite expanded from 13 to 232 tests across 12 test files.
63
+ - New coverage: manifest schema contract, all 6 capture engines, public API contracts (start/stop/annotate/phase/tracked_run/audit_run/diff), config file discovery and precedence, diff normalization and PATH splitting, report generation (markdown/LaTeX), manifest hydration, and exhaustive CLI dispatch for every subcommand.
64
+ - Added `tests/fixtures/sample_manifest.json` golden fixture for deterministic contract tests.
65
+
66
+ ## [0.1.0] - 2026-04-03
67
+
68
+ ### Added
69
+
70
+ - **Core library**: Singleton `Run` tracker with automatic provenance capture via `start()`, `stop()`, `annotate()`, `phase()`, `tracked_run()`, `audit_run()`, and `diff()`.
71
+ - **Manifest-first design**: Each run produces a `manifest.json` (schema v1.0), `config.resolved.json`, and optional `events.jsonl`, console logs, and `methods.md`/`.tex`.
72
+ - **Capture engines**: Modular sub-engines for hardware, environment, packages, git, invocation, subprocesses, console (tee-style with tqdm-safe carriage return squashing), resources (background thread sampling), and process metadata.
73
+ - **Configuration system**: Hierarchical TOML configuration (`default.toml` -> user-global -> project-local -> API overrides) with per-category depth controls (`off`, `basic`, `standard`, `deep`).
74
+ - **Redaction engine**: Regex-based detection and destructive redaction of sensitive environment variables.
75
+ - **CLI commands**: `pubrun report`, `pubrun methods`, `pubrun rerun`, `pubrun diff`, `pubrun meta`, `pubrun cite`.
76
+ - **CLI utilities**: `--create-config`, `--show-config`, `--info`, `--run-tests`.
77
+ - **HPC support**: Parent-child manifest hydration via `PUBRUN_META_REF` for array job provenance.
78
+ - **Ghost mode**: Silent failure if filesystem operations fail, preventing the library from crashing the host application.
79
+ - **Subprocess spy**: Transparent monkey-patching of `subprocess.Popen` and `os.system` to capture spawned processes.
80
+ - **Schema**: Formal JSON Schema (`schemas/manifest.schema.json`) for manifest validation.
81
+ - **Documentation**: Architecture spec, functional spec, CLI reference, API reference.
82
+ - **Test suite**: pytest-based tests covering tracker lifecycle, config resolution, event streaming, hardware capture, resource monitoring, subprocess interception, and console tee.
83
+
84
+ ### Design Decisions
85
+
86
+ - **Timestamps as POSIX epoch floats**: All timestamps use `time.time()` for sub-second precision, timezone-agnostic storage, trivial arithmetic, and deterministic serialization.
87
+ - **Zero dependencies**: No runtime dependencies except `tomli` for Python < 3.11.
88
+ - **Auto-start**: Configurable import-time activation via `auto_start = true` in config.
89
+
90
+ ### Notes
91
+
92
+ - Tests require `pytest >= 7.0` for `pythonpath` support in `pyproject.toml`.
93
+ - `tox.ini` targets Python 3.10 and 3.11.
@@ -0,0 +1,21 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use this software in your research, please cite it."
3
+ authors:
4
+ - family-names: "Fariello"
5
+ given-names: "Gabriele"
6
+ orcid: "https://orcid.org/0000-0002-0326-4752"
7
+ title: "pubrun: Lightweight native execution provenance and reproducibility tracking"
8
+ version: 0.1.1
9
+ date-released: 2026-05-09
10
+ url: "https://github.com/gfariello/pubrun"
11
+ repository-code: "https://github.com/gfariello/pubrun"
12
+ abstract: >-
13
+ pubrun is a zero-dependency, manifest-first Python framework for tracking
14
+ computational provenance, environment state, and execution metadata without
15
+ requiring complex virtualization overhead.
16
+ keywords:
17
+ - reproducibility
18
+ - provenance
19
+ - metadata
20
+ - research-software
21
+ license: "BSD-3-Clause"
pubrun-0.1.1/LICENSE ADDED
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2007-2026 Gabriele Fariello
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pubrun-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,250 @@
1
+ Metadata-Version: 2.4
2
+ Name: pubrun
3
+ Version: 0.1.1
4
+ Summary: A lightweight Python library for capturing execution context.
5
+ Project-URL: Homepage, https://github.com/gfariello/pubrun
6
+ Project-URL: Repository, https://github.com/gfariello/pubrun
7
+ Project-URL: Issues, https://github.com/gfariello/pubrun/issues
8
+ Author: Gabriele Fariello
9
+ License: BSD-3-Clause
10
+ License-File: LICENSE
11
+ Keywords: execution-tracking,metadata,provenance,reproducibility,research-software,telemetry
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: BSD License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering
23
+ Classifier: Topic :: System :: Logging
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.8
26
+ Requires-Dist: tomli>=1.1.0; python_version < '3.11'
27
+ Provides-Extra: dev
28
+ Requires-Dist: mypy>=1.0; extra == 'dev'
29
+ Requires-Dist: pytest-cov; extra == 'dev'
30
+ Requires-Dist: pytest>=7.0; extra == 'dev'
31
+ Requires-Dist: tox>=4.0; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ [README](README.md) | [Architecture](docs/architecture.md) | [Functional Spec](docs/functional_spec.md) | [API](docs/api.md) | [CLI](docs/cli.md) | [Configuration](docs/configuration.md) | [Manifest](docs/manifest.md)
35
+
36
+ # pubrun
37
+
38
+ > **Let your code monitor itself and write its own Methods section while you go to the pub.**
39
+
40
+ `pubrun` is a stupidly simple, zero-dependency Python library that eliminates the boilerplate of documenting methodology, tracking versions, recording inputs, and monitoring resources — making it dramatically easier to publish, share, and reproduce your models and research. If you're feeling formal, you can think of "publication-ready runner" as the meaning of the name.
41
+
42
+ ## Quick Start
43
+
44
+ ```python
45
+ import pubrun # That's it 90% of the time!
46
+ ```
47
+ or
48
+ ```bash
49
+ pubrun -h # Lots of info here.
50
+ ```
51
+ That's it. No frameworks, no heavy integrations, no syntax hijacking.
52
+ When the script exits, `pubrun` silently generates a structured, lightweight footprint in your local `./runs/` directory.
53
+
54
+ > [!NOTE]
55
+ > **Console capture**: By default, `pubrun` tees `stdout` and `stderr` to log files in the run directory. Your terminal output is unchanged, but a copy is saved alongside the manifest. If your script produces very high output volume, you can disable this with `capture_mode = "off"` in `.pubrun.toml` or via `pubrun.start(console={"capture_mode": "off"})`. See [Configuration](docs/configuration.md) for details.
56
+
57
+ See [CLI Reference](docs/cli.md) and [API Reference](docs/api.md) for full details.
58
+
59
+ ## Features
60
+
61
+ - **Automatic Execution Tracing** — Captures environment variables, hardware specs, and dependency graphs without manual configuration.
62
+ - **Publication-Ready Output** — Generates LaTeX/Markdown methodology blocks ready for academic papers.
63
+ - **Semantic Diffing** — Compares execution footprints to identify subtle but critical differences between runs.
64
+ - **Secret Redaction** — Automatically detects and redacts passwords, tokens, and API keys in environment variables and CLI arguments.
65
+ - **Codebase Drift Detection** — Compares current code state against the execution snapshot to highlight changes.
66
+ - **Cross-Platform Reproducibility** — Extracts initialization commands for seamless environment replication.
67
+ - **HPC Optimized** — Supports global parent-child manifest hydration to minimize overhead on massive clusters.
68
+
69
+ ## The Problem
70
+
71
+ Modern scientific workflows rely on implicit state. When it's time to publish a paper or ship a model, researchers are forced to retroactively piece together their methodology — PyTorch versions, OS constraints, hardware parameters — from memory.
72
+
73
+ ## The Solution
74
+
75
+ `pubrun` permanently ends this friction.
76
+
77
+ With a single `import pubrun`, the library quietly traces your script execution, hashes your environment dependencies, detects codebase drift, and compiles publication-ready **Computational Methodology** LaTeX/Markdown blocks so your run is instantly citable.
78
+
79
+ ### Lazy Initialization (Explicit Tracking)
80
+
81
+ By default, simply importing `pubrun` spins up an invisible tracer. If you want to import `pubrun` but defer generating a footprint until you explicitly call `pubrun.start()`, set this in your `.pubrun.toml`:
82
+
83
+ ```toml
84
+ [core]
85
+ auto_start = false
86
+ ```
87
+
88
+ Or set the environment variable before import:
89
+
90
+ ```python
91
+ import os
92
+ os.environ["PUBRUN_AUTO_START"] = "false"
93
+
94
+ import pubrun
95
+ # No directory is generated until you say so.
96
+
97
+ pubrun.start(output_dir="./custom_storage", profile="deep")
98
+ ```
99
+
100
+ Now extract your method paragraph for your paper:
101
+
102
+ ```bash
103
+ pubrun methods --format latex
104
+ ```
105
+
106
+ ### Sample Output
107
+
108
+ > Computational experiments were executed on a machine running Linux (5.15.0-91-generic) equipped with an Intel(R) Core(TM) i7-12700H and 32.0 GB of RAM. The execution environment relied on Python 3.10.12 (CPython). Key dependencies tracked include torch (v2.0.1) and numpy (v1.24.3). To guarantee computational reproducibility, the exact state of the source code was anchored at Git commit `a1b2c3d4`. Environment and execution provenance were tracked using the `pubrun` library [1].
109
+
110
+ > [!NOTE]
111
+ > **Windows support**: `pubrun` works on Windows, but some capture engines have reduced functionality. Process `uid`/`gid` fields are not available, and `os.system` interception uses shell-string parsing rather than structured argument lists. All other features work identically.
112
+
113
+ ---
114
+
115
+ ## CLI Reference
116
+
117
+ The `pubrun` CLI provides six commands and four diagnostic flags, all designed to work equally well on a developer laptop or across a Slurm array of thousands of HPC jobs.
118
+
119
+ ### `pubrun cite`
120
+ Generates the bibliographic citation for crediting this library in your paper.
121
+ ```bash
122
+ pubrun cite --style bibtex
123
+ ```
124
+
125
+ ### `pubrun methods`
126
+ Translates raw JSON diagnostic payloads into publication-ready methodology paragraphs.
127
+ ```bash
128
+ pubrun methods [RUN_DIR] --format markdown|latex
129
+ ```
130
+
131
+ ### `pubrun report`
132
+ A diagnostic viewer that surfaces execution timing, hardware, dependencies, and codebase drift. Accepts multiple run directories for sequential evaluation.
133
+ ```bash
134
+ pubrun report ./runs/pubrun-A ./runs/pubrun-B --deep
135
+ ```
136
+
137
+ ### `pubrun rerun`
138
+ Extracts the exact shell command needed to reproduce a run.
139
+ ```bash
140
+ pubrun rerun ./runs/pubrun-A
141
+ ```
142
+
143
+ ### `pubrun diff`
144
+ Generates a semantic side-by-side comparison between two execution traces, filtering volatile noise (timestamps, PIDs) by default.
145
+ ```bash
146
+ pubrun diff ./runs/pubrun-A ./runs/pubrun-B --same --basic --wrap
147
+ ```
148
+
149
+ ### `pubrun meta`
150
+ Generates a standalone environment snapshot for HPC parent-child hydration.
151
+ ```bash
152
+ pubrun meta --out ./runs/meta.json --deep
153
+ ```
154
+
155
+ ### Diagnostic Flags
156
+
157
+ | Flag | Description |
158
+ |---|---|
159
+ | `--create-config` | Bootstrap a fully commented `.pubrun.toml` file |
160
+ | `--show-config` | Print the default configuration to the terminal |
161
+ | `--info` | Display system capabilities and pubrun version |
162
+ | `--run-tests` | Execute the built-in self-test suite |
163
+
164
+ See [CLI Reference](docs/cli.md) for full details and examples.
165
+
166
+ ---
167
+
168
+ ## Advanced HPC Ecosystems (Global Hydration)
169
+
170
+ If you run thousands of array jobs across a cluster, you don't want each child run wasting time and disk space logging identical dependency graphs. `pubrun` supports **parent-child manifest hydration**.
171
+
172
+ #### Step 1: Snap the Parent Cluster
173
+ On the head node, snapshot the global environment:
174
+ ```bash
175
+ pubrun meta --out ./runs/meta.json --deep
176
+ ```
177
+ This generates a deep metadata map of hardware, environment variables, and the full Python package tree.
178
+
179
+ #### Step 2: Hydrate Children
180
+ In your Slurm script, reference the parent snapshot:
181
+ ```bash
182
+ export PUBRUN_META_REF=meta.json
183
+ python minimal_script.py
184
+ ```
185
+
186
+ Child scripts automatically skip heavy footprint tracking. When you run `pubrun report` or `pubrun methods`, the orchestrator detects the `PUBRUN_META_REF`, pulls in the parent `meta.json` context, and stitches the complete hardware and dependency picture back together. It also compares script timestamps against the parent snapshot and warns you if **environmental drift** has been detected.
187
+
188
+ ---
189
+
190
+ ## Configuration
191
+
192
+ `pubrun` supports a hierarchical configuration system (highest to lowest precedence):
193
+
194
+ 1. **API overrides** — `pubrun.start(profile="deep")`
195
+ 2. **Environment variables** — `PUBRUN_AUTO_START=false`
196
+ 3. **Local project config** — `.pubrun.toml` or `.config/pubrun/config.toml`
197
+ 4. **User home config** — `~/.config/pubrun/config.toml`
198
+ 5. **Built-in defaults** — `default.toml` (shipped with the library)
199
+
200
+ ### Generate a Configuration File
201
+ ```bash
202
+ pubrun --create-config
203
+ ```
204
+
205
+ See [Configuration Reference](docs/configuration.md) for all settings and examples.
206
+
207
+ ---
208
+
209
+ ## Security & Redaction
210
+
211
+ `pubrun` automatically detects and redacts sensitive values (passwords, tokens, API keys, credentials) in both environment variables and CLI arguments before writing them to the manifest. Redaction is **destructive by default** — raw values are replaced with `{"representation": "redacted"}`, and no hashes are generated, to prevent brute-force attacks.
212
+
213
+ Both environment variable and argv redaction are independently configurable:
214
+
215
+ ```toml
216
+ [redaction]
217
+ env_enabled = true # Redact matching environment variable values
218
+ argv_enabled = true # Redact matching CLI argument values
219
+ ```
220
+
221
+ See [Configuration Reference](docs/configuration.md) for the full redaction policy and regex pattern.
222
+
223
+ ---
224
+
225
+ ## Roadmap
226
+
227
+ ### Pre-v0.2
228
+
229
+ 1. **Sphinx / MkDocs integration** — Generate hosted API documentation from docstrings.
230
+ 2. **Subprocess argv redaction refinement** — The current regex-based approach may over-match legitimate scientific arguments (e.g., `--output=secret_findings.csv`). Community input welcome on the best policy.
231
+ 3. **Coverage reporting** — Integrate `pytest-cov` into CI for coverage tracking.
232
+ 4. **Plugin / extension model** — Formal extension points for custom capture engines.
233
+ 5. **Artifact registration API** — `register_artifact()` for tracking user-produced output files.
234
+ 6. **Custom metadata API** — `register_metadata()` for injecting structured data into the manifest.
235
+ 7. **Determinism tracking** — `register_seed()` and the `[capture.determinism]` engine for recording pseudorandom seeds.
236
+ 8. **Combined console log** — Interleaved `combined.log` output alongside `stdout.log` and `stderr.log`.
237
+
238
+ ---
239
+
240
+ ## Acknowledgements
241
+
242
+ `pubrun` was redesigned and rewritten from pre-existing custom libraries, code fragments, scripts, and ideas spanning almost two decades, with the assistance of Google Antigravity for its official release.
243
+
244
+ ## License
245
+
246
+ Released under the BSD 3-Clause License. Copyright (c) 2007-2026 Gabriele Fariello. See the [LICENSE](LICENSE) file for full terms.
247
+
248
+ ---
249
+
250
+ [README](README.md) | [Architecture](docs/architecture.md) | [Functional Spec](docs/functional_spec.md) | [API](docs/api.md) | [CLI](docs/cli.md) | [Configuration](docs/configuration.md) | [Manifest](docs/manifest.md)
pubrun-0.1.1/README.md ADDED
@@ -0,0 +1,217 @@
1
+ [README](README.md) | [Architecture](docs/architecture.md) | [Functional Spec](docs/functional_spec.md) | [API](docs/api.md) | [CLI](docs/cli.md) | [Configuration](docs/configuration.md) | [Manifest](docs/manifest.md)
2
+
3
+ # pubrun
4
+
5
+ > **Let your code monitor itself and write its own Methods section while you go to the pub.**
6
+
7
+ `pubrun` is a stupidly simple, zero-dependency Python library that eliminates the boilerplate of documenting methodology, tracking versions, recording inputs, and monitoring resources — making it dramatically easier to publish, share, and reproduce your models and research. If you're feeling formal, you can think of "publication-ready runner" as the meaning of the name.
8
+
9
+ ## Quick Start
10
+
11
+ ```python
12
+ import pubrun # That's it 90% of the time!
13
+ ```
14
+ or
15
+ ```bash
16
+ pubrun -h # Lots of info here.
17
+ ```
18
+ That's it. No frameworks, no heavy integrations, no syntax hijacking.
19
+ When the script exits, `pubrun` silently generates a structured, lightweight footprint in your local `./runs/` directory.
20
+
21
+ > [!NOTE]
22
+ > **Console capture**: By default, `pubrun` tees `stdout` and `stderr` to log files in the run directory. Your terminal output is unchanged, but a copy is saved alongside the manifest. If your script produces very high output volume, you can disable this with `capture_mode = "off"` in `.pubrun.toml` or via `pubrun.start(console={"capture_mode": "off"})`. See [Configuration](docs/configuration.md) for details.
23
+
24
+ See [CLI Reference](docs/cli.md) and [API Reference](docs/api.md) for full details.
25
+
26
+ ## Features
27
+
28
+ - **Automatic Execution Tracing** — Captures environment variables, hardware specs, and dependency graphs without manual configuration.
29
+ - **Publication-Ready Output** — Generates LaTeX/Markdown methodology blocks ready for academic papers.
30
+ - **Semantic Diffing** — Compares execution footprints to identify subtle but critical differences between runs.
31
+ - **Secret Redaction** — Automatically detects and redacts passwords, tokens, and API keys in environment variables and CLI arguments.
32
+ - **Codebase Drift Detection** — Compares current code state against the execution snapshot to highlight changes.
33
+ - **Cross-Platform Reproducibility** — Extracts initialization commands for seamless environment replication.
34
+ - **HPC Optimized** — Supports global parent-child manifest hydration to minimize overhead on massive clusters.
35
+
36
+ ## The Problem
37
+
38
+ Modern scientific workflows rely on implicit state. When it's time to publish a paper or ship a model, researchers are forced to retroactively piece together their methodology — PyTorch versions, OS constraints, hardware parameters — from memory.
39
+
40
+ ## The Solution
41
+
42
+ `pubrun` permanently ends this friction.
43
+
44
+ With a single `import pubrun`, the library quietly traces your script execution, hashes your environment dependencies, detects codebase drift, and compiles publication-ready **Computational Methodology** LaTeX/Markdown blocks so your run is instantly citable.
45
+
46
+ ### Lazy Initialization (Explicit Tracking)
47
+
48
+ By default, simply importing `pubrun` spins up an invisible tracer. If you would rather import `pubrun` and defer footprint generation until you explicitly call `pubrun.start()`, set this in your `.pubrun.toml`:
49
+
50
+ ```toml
51
+ [core]
52
+ auto_start = false
53
+ ```
54
+
55
+ Or set the environment variable before import:
56
+
57
+ ```python
58
+ import os
59
+ os.environ["PUBRUN_AUTO_START"] = "false"
60
+
61
+ import pubrun
62
+ # No directory is generated until you say so.
63
+
64
+ pubrun.start(output_dir="./custom_storage", profile="deep")
65
+ ```
66
+
67
+ Now extract the Methods paragraph for your paper:
68
+
69
+ ```bash
70
+ pubrun methods --format latex
71
+ ```
72
+
73
+ ### Sample Output
74
+
75
+ > Computational experiments were executed on a machine running Linux (5.15.0-91-generic) equipped with an Intel(R) Core(TM) i7-12700H and 32.0 GB of RAM. The execution environment relied on Python 3.10.12 (CPython). Key dependencies tracked include torch (v2.0.1) and numpy (v1.24.3). To guarantee computational reproducibility, the exact state of the source code was anchored at Git commit `a1b2c3d4`. Environment and execution provenance were tracked using the `pubrun` library [1].
76
+
77
+ > [!NOTE]
78
+ > **Windows support**: `pubrun` works on Windows, but some capture engines have reduced functionality. Process `uid`/`gid` fields are not available, and `os.system` interception uses shell-string parsing rather than structured argument lists. All other features work identically.
79
+
80
+ ---
81
+
82
+ ## CLI Reference
83
+
84
+ The `pubrun` CLI provides six commands and four diagnostic flags, all designed to work equally well on a developer laptop or across a Slurm array of thousands of HPC jobs.
85
+
86
+ ### `pubrun cite`
87
+ Generates the bibliographic citation for crediting this library in your paper.
88
+ ```bash
89
+ pubrun cite --style bibtex
90
+ ```
91
+
92
+ ### `pubrun methods`
93
+ Translates raw JSON diagnostic payloads into publication-ready methodology paragraphs.
94
+ ```bash
95
+ pubrun methods [RUN_DIR] --format markdown|latex
96
+ ```
97
+
98
+ ### `pubrun report`
99
+ A diagnostic viewer that surfaces execution timing, hardware, dependencies, and codebase drift. Accepts multiple run directories for sequential evaluation.
100
+ ```bash
101
+ pubrun report ./runs/pubrun-A ./runs/pubrun-B --deep
102
+ ```
103
+
104
+ ### `pubrun rerun`
105
+ Extracts the exact shell command needed to reproduce a run.
106
+ ```bash
107
+ pubrun rerun ./runs/pubrun-A
108
+ ```
109
+
110
+ ### `pubrun diff`
111
+ Generates a semantic side-by-side comparison between two execution traces, filtering volatile noise (timestamps, PIDs) by default.
112
+ ```bash
113
+ pubrun diff ./runs/pubrun-A ./runs/pubrun-B --same --basic --wrap
114
+ ```
115
+
116
+ ### `pubrun meta`
117
+ Generates a standalone environment snapshot for HPC parent-child hydration.
118
+ ```bash
119
+ pubrun meta --out ./runs/meta.json --deep
120
+ ```
121
+
122
+ ### Diagnostic Flags
123
+
124
+ | Flag | Description |
125
+ |---|---|
126
+ | `--create-config` | Bootstrap a fully commented `.pubrun.toml` file |
127
+ | `--show-config` | Print the default configuration to the terminal |
128
+ | `--info` | Display system capabilities and pubrun version |
129
+ | `--run-tests` | Execute the built-in self-test suite |
130
+
131
+ See [CLI Reference](docs/cli.md) for full details and examples.
132
+
133
+ ---
134
+
135
+ ## Advanced HPC Ecosystems (Global Hydration)
136
+
137
+ If you run thousands of array jobs across a cluster, you don't want each child run wasting time and disk space logging identical dependency graphs. `pubrun` supports **parent-child manifest hydration**.
138
+
139
+ #### Step 1: Snap the Parent Cluster
140
+ On the head node, snapshot the global environment:
141
+ ```bash
142
+ pubrun meta --out ./runs/meta.json --deep
143
+ ```
144
+ This generates a deep metadata map of hardware, environment variables, and the full Python package tree.
145
+
146
+ #### Step 2: Hydrate Children
147
+ In your Slurm script, reference the parent snapshot:
148
+ ```bash
149
+ export PUBRUN_META_REF=meta.json
150
+ python minimal_script.py
151
+ ```
152
+
153
+ Child scripts automatically skip heavy footprint tracking. When you run `pubrun report` or `pubrun methods`, the orchestrator detects the `PUBRUN_META_REF`, pulls in the parent `meta.json` context, and stitches the complete hardware and dependency picture back together. It also compares script timestamps against the parent snapshot and warns you if **environmental drift** has been detected.
154
+
155
+ ---
156
+
157
+ ## Configuration
158
+
159
+ `pubrun` supports a hierarchical configuration system (highest to lowest precedence):
160
+
161
+ 1. **API overrides** — `pubrun.start(profile="deep")`
162
+ 2. **Environment variables** — `PUBRUN_AUTO_START=false`
163
+ 3. **Local project config** — `.pubrun.toml` or `.config/pubrun/config.toml`
164
+ 4. **User home config** — `~/.config/pubrun/config.toml`
165
+ 5. **Built-in defaults** — `default.toml` (shipped with the library)
166
+
167
+ ### Generate a Configuration File
168
+ ```bash
169
+ pubrun --create-config
170
+ ```
171
+
172
+ See [Configuration Reference](docs/configuration.md) for all settings and examples.
173
+
174
+ ---
175
+
176
+ ## Security & Redaction
177
+
178
+ `pubrun` automatically detects and redacts sensitive values (passwords, tokens, API keys, credentials) in both environment variables and CLI arguments before writing them to the manifest. Redaction is **destructive by default** — raw values are replaced with `{"representation": "redacted"}`, and no hashes of the original values are generated, so redacted secrets cannot be recovered through brute-force attacks.
179
+
180
+ Both environment variable and argv redaction are independently configurable:
181
+
182
+ ```toml
183
+ [redaction]
184
+ env_enabled = true # Redact matching environment variable values
185
+ argv_enabled = true # Redact matching CLI argument values
186
+ ```
187
+
188
+ See [Configuration Reference](docs/configuration.md) for the full redaction policy and regex pattern.
189
+
190
+ ---
191
+
192
+ ## Roadmap
193
+
194
+ ### Pre-v0.2
195
+
196
+ 1. **Sphinx / MkDocs integration** — Generate hosted API documentation from docstrings.
197
+ 2. **Subprocess argv redaction refinement** — The current regex-based approach may over-match legitimate scientific arguments (e.g., `--output=secret_findings.csv`). Community input welcome on the best policy.
198
+ 3. **Coverage reporting** — Integrate `pytest-cov` into CI for coverage tracking.
199
+ 4. **Plugin / extension model** — Formal extension points for custom capture engines.
200
+ 5. **Artifact registration API** — `register_artifact()` for tracking user-produced output files.
201
+ 6. **Custom metadata API** — `register_metadata()` for injecting structured data into the manifest.
202
+ 7. **Determinism tracking** — `register_seed()` and the `[capture.determinism]` engine for recording pseudorandom seeds.
203
+ 8. **Combined console log** — Interleaved `combined.log` output alongside `stdout.log` and `stderr.log`.
204
+
205
+ ---
206
+
207
+ ## Acknowledgements
208
+
209
+ For its official release, `pubrun` was redesigned and rewritten, with the assistance of Google Antigravity, from pre-existing custom libraries, code fragments, scripts, and ideas spanning almost two decades.
210
+
211
+ ## License
212
+
213
+ Released under the BSD 3-Clause License. Copyright (c) 2007-2026 Gabriele Fariello. See the [LICENSE](LICENSE) file for full terms.
214
+
215
+ ---
216
+
217
+ [README](README.md) | [Architecture](docs/architecture.md) | [Functional Spec](docs/functional_spec.md) | [API](docs/api.md) | [CLI](docs/cli.md) | [Configuration](docs/configuration.md) | [Manifest](docs/manifest.md)