@xcraftmind/mastermind 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/bin/mastermind.js +4 -0
- package/package.json +9 -8
- package/share/agents/mastermind-auditor.md +205 -0
- package/share/agents/mastermind-critic.md +222 -0
- package/share/agents/mastermind-prompt-refiner.md +70 -0
- package/share/agents/mastermind-release.md +442 -0
- package/share/agents/mastermind-researcher.md +167 -0
- package/share/agents/mastermind-task-executor.md +86 -0
- package/share/skills/doc-stub-sync/SKILL.md +187 -0
- package/share/skills/doc-stub-sync/references/error-handling.md +79 -0
- package/share/skills/doc-stub-sync/references/url-patterns.md +83 -0
- package/share/skills/doc-stub-sync/scripts/doc_update.py +285 -0
- package/share/skills/doc-stub-sync/scripts/requirements.txt +2 -0
- package/share/skills/flaky-finder/SKILL.md +75 -0
- package/share/skills/mastermind-incident-response/SKILL.md +157 -0
- package/share/skills/mastermind-incident-response/references/investigation-playbook.md +173 -0
- package/share/skills/mastermind-incident-response/references/postmortem-template.md +184 -0
- package/share/skills/mastermind-incident-response/references/triage-checklist.md +117 -0
- package/share/skills/mastermind-prompt-refiner/SKILL.md +157 -0
- package/share/skills/mastermind-prompt-refiner/references/refining-checklist.md +89 -0
- package/share/skills/mastermind-prompt-refiner/references/techniques.md +143 -0
- package/share/skills/mastermind-task-executor/SKILL.md +154 -0
- package/share/skills/mastermind-task-planning/SKILL.md +337 -0
- package/share/skills/mastermind-task-planning/references/spec-template.md +286 -0
- package/share/skills/pr-review/SKILL.md +89 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: doc-stub-sync
|
|
3
|
+
description: 'Sync local documentation stubs with their current online versions — finds files matching a stub pattern (default `Fetch live documentation: <URL>`), compares content hashes, refetches changed pages, reports diffs. Use when the user says "update docs", "sync docs with online sources", "refresh local docs", or has a folder of stub files pointing at upstream URLs.'
|
|
4
|
+
metadata:
|
|
5
|
+
version: 0.1.0
|
|
6
|
+
authors:
|
|
7
|
+
- mastermind
|
|
8
|
+
tags:
|
|
9
|
+
- docs
|
|
10
|
+
- automation
|
|
11
|
+
- sync
|
|
12
|
+
model: sonnet
|
|
13
|
+
requires:
|
|
14
|
+
- Bash
|
|
15
|
+
- Python 3.10+ (for the bundled script), OR any HTTP-capable runtime if doing it manually
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
# Documentation Stub Sync
|
|
19
|
+
|
|
20
|
+
Keeps a tree of local documentation files in sync with their online sources. Works on the **stub-link pattern**: each local file contains a marker line like `Fetch live documentation: https://...` pointing at the canonical upstream URL. The skill finds those stubs, checks whether the upstream changed, and refetches only what's stale.
|
|
21
|
+
|
|
22
|
+
This is for the specific workflow where you maintain a local mirror of upstream docs (vendor reference, framework docs, internal wiki snapshots) as part of your personal or team knowledge base. **Not** a general "fix all my docs" tool.
|
|
23
|
+
|
|
24
|
+
## When to use
|
|
25
|
+
|
|
26
|
+
- User says "update docs", "sync docs with online sources", "refresh local docs"
|
|
27
|
+
- User points at a folder and says "the stubs in here are stale"
|
|
28
|
+
- A local docs tree has files matching the stub pattern and the user wants them current
|
|
29
|
+
- Do NOT use to *write* new documentation. Use a writing-focused skill for that.
|
|
30
|
+
- Do NOT use on files that don't have the stub marker — the skill will skip them, but you shouldn't even point it there.
|
|
31
|
+
|
|
32
|
+
## Prerequisites
|
|
33
|
+
|
|
34
|
+
- A local directory containing markdown files with the stub pattern
|
|
35
|
+
- Bash + Python 3.10+ if using the bundled script (`scripts/doc_update.py`)
|
|
36
|
+
- Network access for the URLs in the stubs
|
|
37
|
+
- Optional: configurable stub pattern (default `Fetch live documentation: <URL>`)
|
|
38
|
+
|
|
39
|
+
## Steps
|
|
40
|
+
|
|
41
|
+
### 1. Inventory
|
|
42
|
+
|
|
43
|
+
Find every file matching the stub pattern under the target directory:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
grep -rl "Fetch live documentation:" <target-dir> --include="*.md"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Then for each matched file, extract the URL. Report: total count, list of unique URLs, files-per-URL if any URL repeats. Show this to the user **before** any network requests — confirm scope.
|
|
50
|
+
|
|
51
|
+
### 2. Confirm with the user
|
|
52
|
+
|
|
53
|
+
Show the inventory and ask before proceeding. For >10 files, also note expected time (rule of thumb: ~1.5s/file with rate limiting). For >100 files, suggest breaking into batches by subdirectory.
|
|
54
|
+
|
|
55
|
+
### 3. Detect what changed
|
|
56
|
+
|
|
57
|
+
For each stub file:
|
|
58
|
+
- Fetch the upstream URL (with timeout, single retry on transient failure)
|
|
59
|
+
- Extract the main content region (`<main>`, `<article>`, or the largest content block)
|
|
60
|
+
- Compute a hash (SHA-256 of normalized text — strip whitespace at line ends, collapse runs of blank lines)
|
|
61
|
+
- Compute the same hash on the local file's content body (excluding the stub line)
|
|
62
|
+
- If hashes match: **skip** (already current)
|
|
63
|
+
- If hashes differ: **mark for update**
|
|
64
|
+
- If URL returns ≥400 or times out: **mark unreachable**
|
|
65
|
+
|
|
66
|
+
Rate limit: minimum 1 second between requests to the same host. Use the bundled script — `scripts/doc_update.py` — for the actual implementation. The script handles per-host rate limiting and the single retry; it processes stubs sequentially, one request at a time.
|
|
67
|
+
|
|
68
|
+
### 4. Update changed files
|
|
69
|
+
|
|
70
|
+
For each "mark for update" file:
|
|
71
|
+
- Build the new body: extracted content + the stub line preserved at the bottom
|
|
72
|
+
- Write atomically (tmp file + rename) to avoid partial writes
|
|
73
|
+
- Log: old hash → new hash, byte delta
|
|
74
|
+
|
|
75
|
+
Never delete files. Never modify files that don't have the stub marker. Never touch files outside the target directory.
|
|
76
|
+
|
|
77
|
+
### 5. Report
|
|
78
|
+
|
|
79
|
+
Output a two-part report:
|
|
80
|
+
|
|
81
|
+
**Machine-readable JSON** (for downstream tooling):
|
|
82
|
+
```json
|
|
83
|
+
{
|
|
84
|
+
"summary": {
|
|
85
|
+
"scanned": 96,
|
|
86
|
+
"updated": 12,
|
|
87
|
+
"skipped_current": 82,
|
|
88
|
+
"unreachable": 2,
|
|
89
|
+
"errors": 0,
|
|
90
|
+
"duration_seconds": 187
|
|
91
|
+
},
|
|
92
|
+
"updated": [
|
|
93
|
+
{"path": "anthropic/configuration/settings.md", "url": "...", "old_hash": "a1b2...", "new_hash": "c3d4...", "delta_bytes": 412}
|
|
94
|
+
],
|
|
95
|
+
"unreachable": [
|
|
96
|
+
{"path": "...", "url": "...", "reason": "404 Not Found"}
|
|
97
|
+
],
|
|
98
|
+
"errors": []
|
|
99
|
+
}
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Human summary** (one paragraph + the deltas):
|
|
103
|
+
```
|
|
104
|
+
Synced 96 stubs in 3 minutes. Updated 12 files (largest delta: configuration/settings.md +412 B). 82 already current. 2 unreachable (404): see report. No write errors.
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Outputs
|
|
108
|
+
|
|
109
|
+
- The JSON + human summary shown above
|
|
110
|
+
- Optionally write the JSON to `<target-dir>/.doc-stub-sync-report.json` if the user asks for it
|
|
111
|
+
- Only file modifications: the stub files themselves, atomically rewritten
|
|
112
|
+
|
|
113
|
+
## ALWAYS / NEVER
|
|
114
|
+
|
|
115
|
+
**ALWAYS**
|
|
116
|
+
- Rate-limit (≥1s between requests to the same host)
|
|
117
|
+
- Show inventory and ask for confirmation before any network requests
|
|
118
|
+
- Preserve the stub marker line in the rewritten file
|
|
119
|
+
- Atomic writes (tmp + rename)
|
|
120
|
+
- Continue on individual file failures; never abort the whole batch on one error
|
|
121
|
+
- Report unreachable URLs in the final summary
|
|
122
|
+
|
|
123
|
+
**NEVER**
|
|
124
|
+
- Modify files outside the target directory
|
|
125
|
+
- Modify files that don't contain the stub marker
|
|
126
|
+
- Delete files
|
|
127
|
+
- Overwrite files where the hash is unchanged (it's a no-op, but skip the write — preserves mtime for downstream tools that watch it)
|
|
128
|
+
- Fire requests at the same host faster than the rate limit, even on retries
|
|
129
|
+
- Trust the upstream content blindly — strip scripts, styles, and tracking elements before hashing/storing
|
|
130
|
+
|
|
131
|
+
## Examples
|
|
132
|
+
|
|
133
|
+
### Full sync
|
|
134
|
+
|
|
135
|
+
**User:** "Sync everything under `~/Documents/kb/anthropic/`."
|
|
136
|
+
|
|
137
|
+
**Agent:**
|
|
138
|
+
```
|
|
139
|
+
Found 96 stub files under ~/Documents/kb/anthropic/. They reference 96 unique URLs across 3 hosts (docs.claude.com, www.anthropic.com, github.com/anthropics).
|
|
140
|
+
|
|
141
|
+
Estimated time: ~3 minutes with rate limiting.
|
|
142
|
+
|
|
143
|
+
Proceed? [yes / no / dry-run]
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
After confirmation, runs `scripts/doc_update.py` and emits the JSON + summary report.
|
|
147
|
+
|
|
148
|
+
### Targeted
|
|
149
|
+
|
|
150
|
+
**User:** "Just the configuration subdirectory."
|
|
151
|
+
|
|
152
|
+
**Agent:**
|
|
153
|
+
```
|
|
154
|
+
Found 5 stub files under ~/Documents/kb/anthropic/configuration/:
|
|
155
|
+
- settings.md → docs.claude.com/.../settings
|
|
156
|
+
- model-config.md → docs.claude.com/.../model-config
|
|
157
|
+
- terminal-config.md → docs.claude.com/.../terminal
|
|
158
|
+
- memory.md → docs.claude.com/.../memory
|
|
159
|
+
- statusline.md → docs.claude.com/.../statusline
|
|
160
|
+
|
|
161
|
+
Estimated time: ~10 seconds. Proceed?
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Dry-run
|
|
165
|
+
|
|
166
|
+
**User:** "What would change if I ran it now?"
|
|
167
|
+
|
|
168
|
+
**Agent:** Runs the hash comparison only, skipping the update step. Reports which files *would* change and which would stay.
|
|
169
|
+
|
|
170
|
+
## References
|
|
171
|
+
|
|
172
|
+
- [`references/url-patterns.md`](references/url-patterns.md) — recognized stub formats and how to add a new one
|
|
173
|
+
- [`references/error-handling.md`](references/error-handling.md) — HTTP error codes and how the skill responds to each
|
|
174
|
+
- [`scripts/doc_update.py`](scripts/doc_update.py) — bundled implementation (Python 3.10+)
|
|
175
|
+
- [`scripts/requirements.txt`](scripts/requirements.txt) — Python deps for the script
|
|
176
|
+
|
|
177
|
+
## Customizing the stub pattern
|
|
178
|
+
|
|
179
|
+
Default: `Fetch live documentation: <URL>` (case-sensitive; the URL runs from `http://` or `https://` up to the next whitespace character).
|
|
180
|
+
|
|
181
|
+
To use a different pattern, pass `--stub-pattern` to the script with a regex containing one capture group for the URL. Example for a different convention:
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
python scripts/doc_update.py --stub-pattern 'Source: (https?://\S+)' <target-dir>
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
The skill body assumes the default pattern unless the user tells you otherwise. If you see they're using a different convention, ask first before assuming.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Error handling — HTTP responses and how the skill reacts
|
|
2
|
+
|
|
3
|
+
Reference for the [`doc-stub-sync`](../SKILL.md) skill. The script behavior is in [`../scripts/doc_update.py`](../scripts/doc_update.py).
|
|
4
|
+
|
|
5
|
+
## How errors are classified
|
|
6
|
+
|
|
7
|
+
Every stub processed ends up in one of four buckets in the report:
|
|
8
|
+
|
|
9
|
+
- `updated` — content changed, file rewritten
|
|
10
|
+
- `current` — content unchanged, file untouched
|
|
11
|
+
- `unreachable` — request failed or returned ≥400, file untouched
|
|
12
|
+
- `error` — local issue (permissions, disk, encoding), file untouched
|
|
13
|
+
|
|
14
|
+
A stub never silently fails. Every problem lands in one bucket and is reported.
|
|
15
|
+
|
|
16
|
+
## HTTP response handling
|
|
17
|
+
|
|
18
|
+
| Status | Bucket | Skill action |
|
|
19
|
+
|---|---|---|
|
|
20
|
+
| 2xx | `current` / `updated` | Parse, hash, compare, maybe rewrite |
|
|
21
|
+
| 301, 302, 307, 308 | (followed) | The HTTP client follows redirects automatically; the *final* response is what the skill evaluates. If the final response is ≥400, treat as unreachable. |
|
|
22
|
+
| 304 Not Modified | `current` | Treated as no-change. (Note: the bundled script doesn't send `If-Modified-Since` yet — 304 in practice means the server volunteered it.) |
|
|
23
|
+
| 401, 403 | `unreachable` | Auth required or forbidden. Don't retry with credentials — surface to user and stop. |
|
|
24
|
+
| 404, 410 | `unreachable` | Upstream page is gone. The user has to decide: update the stub's URL, delete the file, or accept the staleness. The skill does NOT auto-delete. |
|
|
25
|
+
| 408, 429, 503, 504 | `unreachable` | Usually transient, but the server did answer, so the script does not retry within the run (see "Retry policy" below). Re-run later, or raise `--rate-limit` if you are seeing 429s. |
|
|
26
|
+
| 5xx (other) | `unreachable` | Same handling: reported as unreachable without a retry. |
|
|
27
|
+
| Network timeout | `unreachable` (after one retry) | Single retry with backoff, then give up for this run. |
|
|
28
|
+
| DNS / connection refused | `unreachable` (after one retry) | Treated like any other network-level failure: one retry after a 2-second pause, then unreachable. |
|
|
29
|
+
|
|
30
|
+
## Retry policy
|
|
31
|
+
|
|
32
|
+
The script retries **once** on a network-level failure (timeout, connection error, DNS). It does **not** retry on HTTP status codes ≥400 — those represent a definitive answer from the server, not a transient transport issue.
|
|
33
|
+
|
|
34
|
+
Why one retry, not more:
|
|
35
|
+
- Most transient failures are resolved by waiting 1-2 seconds
|
|
36
|
+
- More retries on a doc-sync job means slowing down the whole batch by minutes
|
|
37
|
+
- 429 (rate limit) deserves a different response: respect the limit, don't hammer
|
|
38
|
+
|
|
39
|
+
If the user is hitting 429 repeatedly, the fix is `--rate-limit` (raise it), not more retries.
|
|
40
|
+
|
|
41
|
+
## Rate limiting
|
|
42
|
+
|
|
43
|
+
The script enforces a per-host minimum interval between requests. Default: 1.0 second.
|
|
44
|
+
|
|
45
|
+
This applies *per host*, not globally: syncing 50 stubs from `docs.claude.com` and 30 from `github.com` runs them in parallel as far as the rate limiter is concerned. Within each host, requests are serialized with the minimum gap.
|
|
46
|
+
|
|
47
|
+
Raise `--rate-limit` to 2.0 or higher if the upstream is sensitive (small docs site, single-server) or you've seen 429s.
|
|
48
|
+
|
|
49
|
+
## Local errors
|
|
50
|
+
|
|
51
|
+
| Situation | Bucket | What to do |
|
|
52
|
+
|---|---|---|
|
|
53
|
+
| File not writable (permissions) | `error` | Surface to user. They fix the perms and re-run. |
|
|
54
|
+
| Disk full | `error` | Surface. Don't keep trying. |
|
|
55
|
+
| File contains invalid UTF-8 | (skipped at discovery) | These files aren't even inventoried. If the user expects them to be stubs, they have a separate problem. |
|
|
56
|
+
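| Atomic rename fails (race, cross-device tmp) | `error` | Surface the underlying OS error verbatim. The script creates its temp file in the same directory as the target (`tempfile.mkstemp(dir=path.parent)`), so `TMPDIR` is not involved; a failure here usually points at directory permissions, a read-only mount, or another process touching the file at the same time. |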
|
|
57
|
+
|
|
58
|
+
## What the user sees on error
|
|
59
|
+
|
|
60
|
+
Every error appears in two places:
|
|
61
|
+
1. The JSON report (`errors` array, with `path`, `url`, `reason`)
|
|
62
|
+
2. The human summary (count + first 3 reasons inline, full list pointer)
|
|
63
|
+
|
|
64
|
+
Example human summary on errors:
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
Synced 96 stubs in 4 minutes. Updated 8. 80 already current. 6 unreachable (4×404, 2×timeout): see report. 2 errors (permission denied): see report.
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
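If the run recorded any local `error` results, the script exits with code 1 — pipelines and cron jobs that depend on it can detect this. Unreachable URLs alone do not change the exit code (it stays 0), and a `target_dir` that is not a directory exits with code 2.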
|
|
71
|
+
|
|
72
|
+
## What the agent does with the report
|
|
73
|
+
|
|
74
|
+
The skill's job is to **report**, not to **act on** errors. The user decides:
|
|
75
|
+
- A 404 on a doc → maybe the upstream renamed it. Update the stub URL or delete the local file.
|
|
76
|
+
- 401/403 → maybe the doc moved behind auth. Out of scope for this skill.
|
|
77
|
+
- A permissions error → the user fixes the OS-level perms and re-runs.
|
|
78
|
+
|
|
79
|
+
The skill does **not** edit stub URLs, delete files, or modify other files in response to errors. Those are destructive operations and they belong to the user (or to a separate, explicitly-invoked tool).
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Stub URL patterns
|
|
2
|
+
|
|
3
|
+
This reference covers stub formats the [`doc-stub-sync`](../SKILL.md) skill recognizes, and how to add new ones.
|
|
4
|
+
|
|
5
|
+
## The default pattern
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
Fetch live documentation: https://docs.example.com/path/to/page
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Regex: `Fetch live documentation:\s*(https?://\S+)`
|
|
12
|
+
|
|
13
|
+
This is the format the bundled `scripts/doc_update.py` looks for unless overridden.
|
|
14
|
+
|
|
15
|
+
### Why this pattern
|
|
16
|
+
|
|
17
|
+
- **Easy to grep**: single grep across the whole tree finds every stub
|
|
18
|
+
- **Self-documenting**: a human reading the file knows where the canonical source is
|
|
19
|
+
- **One URL per file**: forces one-doc-per-stub-file, which simplifies sync logic
|
|
20
|
+
- **Tool-agnostic prefix**: the words `Fetch live documentation:` are unlikely to appear in normal prose, so false matches are rare
|
|
21
|
+
|
|
22
|
+
## Where the pattern goes inside the file
|
|
23
|
+
|
|
24
|
+
The skill doesn't care about position — it greps the whole file. By convention:
|
|
25
|
+
|
|
26
|
+
- **At the end of the file**, after a `---` separator, as a "source" footer
|
|
27
|
+
- **In a frontmatter field** (e.g., `source: https://...`) — for that, switch to a frontmatter-aware pattern (see below)
|
|
28
|
+
|
|
29
|
+
## Common alternative patterns
|
|
30
|
+
|
|
31
|
+
If your knowledge base uses a different convention, pass `--stub-pattern` to the script. The pattern must be a Python regex with **exactly one capture group** that captures the URL.
|
|
32
|
+
|
|
33
|
+
### Source-line convention
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
Source: https://docs.example.com/page
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
python scripts/doc_update.py --stub-pattern 'Source:\s*(https?://\S+)' ./docs
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### HTML-comment marker
|
|
44
|
+
|
|
45
|
+
```html
|
|
46
|
+
<!-- mirror: https://docs.example.com/page -->
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python scripts/doc_update.py --stub-pattern '<!--\s*mirror:\s*(https?://\S+)\s*-->' ./docs
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Frontmatter `source:` field
|
|
54
|
+
|
|
55
|
+
```yaml
|
|
56
|
+
---
|
|
57
|
+
title: API Reference
|
|
58
|
+
source: https://docs.example.com/api
|
|
59
|
+
---
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
python scripts/doc_update.py --stub-pattern '^source:\s*(https?://\S+)$' ./docs
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
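Frontmatter matching needs the multiline flag: without it, `^` and `$` only anchor at the very start and end of the file, so a field on the second or a later line never matches. Prefix the pattern with `(?m)`: `'(?m)^source:\s*(https?://\S+)$'`.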
|
|
67
|
+
|
|
68
|
+
## Adding a new pattern to the standard
|
|
69
|
+
|
|
70
|
+
If you find yourself using a new pattern across multiple projects, send a PR adding:
|
|
71
|
+
|
|
72
|
+
1. An entry to this file under "Common alternative patterns"
|
|
73
|
+
2. An example command line
|
|
74
|
+
3. A short note on when this pattern is preferable
|
|
75
|
+
|
|
76
|
+
Don't add a flag to the script for every pattern — the regex flag is general enough. Only the default in `DEFAULT_STUB_PATTERN` is special, and we don't change the default casually (it would break every existing local KB using this skill).
|
|
77
|
+
|
|
78
|
+
## Pattern gotchas
|
|
79
|
+
|
|
80
|
+
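- **Greediness**: `https?://.+` captures everything up to the end of the line, including trailing text such as a closing `-->`. Use `\S+` to stop at the first whitespace. (`\S+` still includes punctuation attached directly to the URL, so keep the URL followed by whitespace or the end of the line.)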
|
|
81
|
+
- **Multiline**: stubs that wrap across lines aren't supported — keep the URL on the same line as the marker.
|
|
82
|
+
- **Query strings and fragments**: included by default. If you want to strip `?utm_*` and `#fragments` for hashing, do it in the script, not the regex.
|
|
83
|
+
- **Multiple stubs per file**: only the *first* match wins. If a file points at multiple URLs, it's not a stub file — it's a hand-edited doc, leave it alone.
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Sync local documentation stubs with their current online versions.
|
|
4
|
+
|
|
5
|
+
See ../SKILL.md for the workflow this script implements.
|
|
6
|
+
Run with --help for CLI usage.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import dataclasses
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
import tempfile
|
|
19
|
+
import time
|
|
20
|
+
from collections import defaultdict
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Optional
|
|
23
|
+
from urllib.parse import urlparse
|
|
24
|
+
|
|
25
|
+
import requests
|
|
26
|
+
from bs4 import BeautifulSoup
|
|
27
|
+
|
|
28
|
+
# Regex that locates the stub marker; its capture group is the upstream URL.
DEFAULT_STUB_PATTERN = r"Fetch live documentation:\s*(https?://\S+)"

# Per-request timeout, in seconds, used when --timeout is not given.
DEFAULT_TIMEOUT_SECONDS = 15

# Minimum gap, in seconds, between two requests to the same host.
DEFAULT_RATE_LIMIT_SECONDS = 1.0

# Sent as the User-Agent header on every upstream request.
USER_AGENT = "mastermind-doc-stub-sync/0.1 (+https://github.com/aglumova/mastermind)"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclasses.dataclass
class StubFile:
    """A local markdown file paired with the upstream URL found in its stub marker."""

    # Location of the stub file on disk.
    path: Path
    # URL captured by the stub pattern's first group.
    url: str
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclasses.dataclass
class UpdateResult:
    """Outcome of processing one stub file; serialized into the JSON report."""

    # Stub file path, stored as a string so the record is JSON-serializable.
    path: str
    # Upstream URL the stub points at.
    url: str
    # One of: "updated" | "current" | "unreachable" | "error"
    status: str
    # Hash of the local body before the sync (set when a comparison was made).
    old_hash: Optional[str] = None
    # Hash of the freshly built body (set when a comparison was made).
    new_hash: Optional[str] = None
    # Size difference in UTF-8 bytes between new and old file content.
    delta_bytes: int = 0
    # Human-readable failure description for "unreachable" / "error" results.
    reason: Optional[str] = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def find_stubs(root: Path, pattern: re.Pattern) -> list[StubFile]:
    """Collect every ``*.md`` file under *root* that contains a stub marker.

    Args:
        root: Directory searched recursively.
        pattern: Compiled stub regex; group 1 must capture the upstream URL.

    Returns:
        One ``StubFile`` per matching file, built from the first match in the
        file, in sorted path order so repeated runs process files in the same
        sequence. Entries that cannot be read or decoded as UTF-8 are skipped.
    """
    stubs: list[StubFile] = []
    for path in sorted(root.rglob("*.md")):
        try:
            text = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # OSError covers permission problems as well as directories that
            # happen to be named ``*.md`` and dangling symlinks; one unreadable
            # entry must not abort the whole inventory.
            continue
        match = pattern.search(text)
        if match:
            stubs.append(StubFile(path=path, url=match.group(1).strip()))
    return stubs
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def normalize_text(text: str) -> str:
    """Return *text* in a canonical form so equal content hashes equally.

    Trailing whitespace is removed from every line, each run of blank lines is
    reduced to a single blank line, leading/trailing whitespace of the whole
    text is trimmed, and the result ends with exactly one newline.
    """
    stripped = [raw.rstrip() for raw in text.splitlines()]
    # Keep a blank line only when it is the first line or follows a non-blank one.
    kept = [
        current
        for index, current in enumerate(stripped)
        if current or index == 0 or stripped[index - 1]
    ]
    return "\n".join(kept).strip() + "\n"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def content_hash(text: str) -> str:
    """Return the hex SHA-256 digest of *text* after ``normalize_text``."""
    canonical = normalize_text(text)
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def local_body(text: str, pattern: re.Pattern) -> str:
    """Return *text* with every stub-marker match deleted and outer whitespace trimmed.

    Only the matched text is removed; anything else on the marker's line and
    the ``---`` separator above it stay in the result.
    """
    without_marker = re.sub(pattern, "", text)
    return without_marker.strip()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def extract_main(html: str) -> tuple[str, str]:
    """Pull a title and the main text out of an upstream HTML page.

    Returns:
        ``(title, text)``. *text* is the text of the first ``<main>``, else the
        first ``<article>``, else ``<body>``, else the whole document, with one
        text node per line. *title* is the part of ``<title>`` before the first
        ``|``; when there is no ``<title>``, it is the last path segment of the
        canonical link, or ``""`` when that is missing too.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements before any text is read.
    for unwanted in soup.find_all(["script", "style", "noscript", "iframe", "nav", "footer"]):
        unwanted.decompose()

    # First content container that exists wins; fall back to the whole document.
    container = soup
    for tag_name in ("main", "article", "body"):
        candidate = soup.find(tag_name)
        if candidate:
            container = candidate
            break
    text = container.get_text(separator="\n")

    title_tag = soup.find("title")
    if title_tag:
        title = title_tag.get_text().split("|")[0].strip()
    else:
        canonical = soup.find("link", rel="canonical")
        if canonical:
            canonical_path = urlparse(canonical.get("href", "")).path
            title = canonical_path.rstrip("/").rsplit("/", 1)[-1]
        else:
            title = ""
    return title, text
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def fetch(url: str, timeout: float) -> requests.Response:
    """Issue a single GET for *url* with the script's User-Agent header; no retry."""
    request_headers = {"User-Agent": USER_AGENT}
    return requests.get(url, headers=request_headers, timeout=timeout)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def fetch_with_retry(url: str, timeout: float) -> requests.Response:
    """Fetch *url*, retrying once if the first attempt raises a requests exception.

    HTTP error statuses are returned as normal responses and are not retried.
    An exception raised by the second attempt propagates to the caller.
    """
    try:
        first_response = fetch(url, timeout)
    except requests.exceptions.RequestException:
        # Short fixed pause before the single retry.
        time.sleep(2)
        return fetch(url, timeout)
    return first_response
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def build_updated_content(title: str, body_text: str, url: str) -> str:
    """Assemble the file content: optional ``# title`` heading, body, then the stub footer.

    The footer is a ``---`` separator followed by the default
    ``Fetch live documentation: <url>`` marker line.
    """
    pieces = []
    if title:
        pieces.append(f"# {title}\n\n")
    pieces.append(f"{body_text.strip()}\n\n---\n\nFetch live documentation: {url}\n")
    return "".join(pieces)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def write_atomic(path: Path, content: str) -> None:
    """Replace *path* with *content* without ever exposing a half-written file.

    The content is written to a temp file in the same directory as *path* and
    then moved over it with ``os.replace``.

    Args:
        path: Destination file; it may or may not exist yet.
        content: Text to write, encoded as UTF-8.

    Raises:
        OSError: If writing or renaming fails. The temp file is removed first.
    """
    tmp_fd, tmp_path = tempfile.mkstemp(dir=path.parent, prefix=".tmp-", suffix=".md")
    try:
        with os.fdopen(tmp_fd, "w", encoding="utf-8") as f:
            f.write(content)
        # mkstemp creates the file owner-only (0600). Carry the existing
        # file's permission bits over so a sync does not silently change who
        # can read the document.
        try:
            existing_mode = os.stat(path).st_mode & 0o7777
        except FileNotFoundError:
            existing_mode = None
        if existing_mode is not None:
            os.chmod(tmp_path, existing_mode)
        os.replace(tmp_path, path)
    except BaseException:
        # Also runs on KeyboardInterrupt: a leftover ``.tmp-*.md`` file would
        # otherwise stay in the docs tree.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def process_stub(
    stub: StubFile,
    pattern: re.Pattern,
    timeout: float,
    dry_run: bool,
) -> UpdateResult:
    """Fetch one stub's upstream page, compare it with the local file, and rewrite the file if it changed.

    Args:
        stub: The local file and the URL found in its marker.
        pattern: Compiled stub regex, used to strip the marker before hashing
            and to recover the file's own marker text.
        timeout: Per-request timeout in seconds.
        dry_run: When true, detect changes but do not write.

    Returns:
        An ``UpdateResult`` whose status is:
        - "unreachable": the request raised, or the final response was not 2xx;
        - "current": the hashes match, or the server answered 304;
        - "updated": the content differs (the file is written unless *dry_run*);
        - "error": the local file could not be read or written.
    """
    path_str = str(stub.path)

    try:
        response = fetch_with_retry(stub.url, timeout)
    except requests.exceptions.RequestException as exc:
        return UpdateResult(
            path=path_str,
            url=stub.url,
            status="unreachable",
            reason=f"request failed: {exc}",
        )

    if response.status_code == 304:
        # Not Modified has no body; building content from it would replace the
        # local copy with an empty document.
        return UpdateResult(path=path_str, url=stub.url, status="current")

    if not 200 <= response.status_code < 300:
        # Covers >=400 and any other non-success status: never rewrite a file
        # from a response that is not a successful page.
        return UpdateResult(
            path=path_str,
            url=stub.url,
            status="unreachable",
            reason=f"{response.status_code} {response.reason}",
        )

    try:
        old_text = stub.path.read_text(encoding="utf-8")
    except Exception as exc:
        return UpdateResult(path=path_str, url=stub.url, status="error", reason=str(exc))

    title, upstream_text = extract_main(response.text)
    new_content = build_updated_content(title, upstream_text, stub.url)

    # build_updated_content always ends with the default marker line. When the
    # file uses a different marker (custom --stub-pattern), put the file's own
    # marker text back; otherwise the rewritten file would stop matching the
    # pattern and the hashes below could never agree.
    default_marker_line = f"Fetch live documentation: {stub.url}\n"
    original_marker = pattern.search(old_text)
    if original_marker is not None and new_content.endswith(default_marker_line):
        new_content = (
            new_content[: -len(default_marker_line)]
            + original_marker.group(0).strip()
            + "\n"
        )

    # Both sides are hashed with the marker removed so only the body is compared.
    old_hash = content_hash(local_body(old_text, pattern))
    cmp_hash = content_hash(local_body(new_content, pattern))

    if old_hash == cmp_hash:
        return UpdateResult(
            path=path_str,
            url=stub.url,
            status="current",
            old_hash=old_hash,
            new_hash=cmp_hash,
        )

    delta = len(new_content.encode("utf-8")) - len(old_text.encode("utf-8"))
    if not dry_run:
        try:
            write_atomic(stub.path, new_content)
        except Exception as exc:
            return UpdateResult(path=path_str, url=stub.url, status="error", reason=str(exc))
    return UpdateResult(
        path=path_str,
        url=stub.url,
        status="updated",
        old_hash=old_hash,
        new_hash=cmp_hash,
        delta_bytes=delta,
    )
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def run(
    root: Path,
    stub_pattern: str,
    rate_limit_seconds: float,
    timeout_seconds: float,
    dry_run: bool,
) -> dict:
    """Inventory the stubs under *root*, process each one, and build the report.

    Stubs are processed one at a time. Before each request the function waits
    until at least *rate_limit_seconds* have passed since the previous request
    to the same host finished.

    Args:
        root: Directory searched recursively for stub files.
        stub_pattern: Regex source whose first capture group is the URL.
        rate_limit_seconds: Minimum gap between requests to one host.
        timeout_seconds: Per-request timeout.
        dry_run: Detect changes without writing files.

    Returns:
        A dict with a ``summary`` of counts plus the ``updated``,
        ``unreachable`` and ``errors`` result lists. The shape is the same
        when no stubs are found (all counts zero, empty lists).
    """
    pattern = re.compile(stub_pattern)
    stubs = find_stubs(root, pattern)

    started = time.monotonic()
    last_finished_per_host: dict[str, float] = {}
    results: list[UpdateResult] = []

    for stub in stubs:
        host = urlparse(stub.url).netloc
        previous = last_finished_per_host.get(host)
        if previous is not None:
            wait = (previous + rate_limit_seconds) - time.monotonic()
            if wait > 0:
                time.sleep(wait)
        results.append(process_stub(stub, pattern, timeout_seconds, dry_run))
        # Stamp after the call: process_stub may have retried, and the gap has
        # to be measured from the last request actually sent to this host.
        last_finished_per_host[host] = time.monotonic()

    def _with_status(status: str) -> list[UpdateResult]:
        return [r for r in results if r.status == status]

    updated = _with_status("updated")
    unreachable = _with_status("unreachable")
    errors = _with_status("error")

    summary = {
        "scanned": len(results),
        "updated": len(updated),
        "skipped_current": len(_with_status("current")),
        "unreachable": len(unreachable),
        "errors": len(errors),
        "duration_seconds": round(time.monotonic() - started, 1),
        "dry_run": dry_run,
    }
    return {
        "summary": summary,
        "updated": [dataclasses.asdict(r) for r in updated],
        "unreachable": [dataclasses.asdict(r) for r in unreachable],
        "errors": [dataclasses.asdict(r) for r in errors],
    }
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def cli(argv: list[str]) -> int:
    """Parse *argv*, run the sync, print the JSON report, and return the exit code.

    Returns:
        0 when no local errors were recorded, 1 when at least one was, and
        2 when ``target_dir`` is not a directory. An invalid ``--stub-pattern``
        ends the process through ``parser.error`` (argparse usage error, exit 2).
    """
    parser = argparse.ArgumentParser(description="Sync local doc stubs with online sources.")
    parser.add_argument("target_dir", type=Path, help="Root directory containing stub markdown files.")
    parser.add_argument("--stub-pattern", default=DEFAULT_STUB_PATTERN,
                        help="Regex with one capture group for the URL. Default: %(default)r")
    parser.add_argument("--rate-limit", type=float, default=DEFAULT_RATE_LIMIT_SECONDS,
                        help="Minimum seconds between requests to the same host. Default: %(default)s")
    parser.add_argument("--timeout", type=float, default=DEFAULT_TIMEOUT_SECONDS,
                        help="Per-request timeout in seconds. Default: %(default)s")
    parser.add_argument("--dry-run", action="store_true",
                        help="Detect changes but do not write files. Report what would change.")
    parser.add_argument("--report-file", type=Path,
                        help="If set, write the JSON report to this path.")
    args = parser.parse_args(argv)

    if not args.target_dir.is_dir():
        print(f"error: target_dir is not a directory: {args.target_dir}", file=sys.stderr)
        return 2

    # Reject an unusable pattern up front. Otherwise a bad regex, or one with
    # no capture group, only shows up as a traceback once the run has started.
    try:
        compiled = re.compile(args.stub_pattern)
    except re.error as exc:
        parser.error(f"--stub-pattern is not a valid regex: {exc}")
    if compiled.groups < 1:
        parser.error("--stub-pattern must contain a capture group for the URL")

    report = run(
        root=args.target_dir,
        stub_pattern=args.stub_pattern,
        rate_limit_seconds=args.rate_limit,
        timeout_seconds=args.timeout,
        dry_run=args.dry_run,
    )

    output_json = json.dumps(report, indent=2)
    print(output_json)
    if args.report_file:
        args.report_file.write_text(output_json + "\n", encoding="utf-8")

    return 0 if report["summary"]["errors"] == 0 else 1
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
if __name__ == "__main__":
    # Hand cli()'s return value to the interpreter as the process exit status.
    sys.exit(cli(sys.argv[1:]))
|