elliot-stack 1.0.29 → 1.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -21
  2. package/README.md +5 -0
  3. package/bin/install.cjs +981 -950
  4. package/hooks/repo-search-nudge.js +32 -32
  5. package/package.json +1 -1
  6. package/skills/estack-active-learning-tutor/SKILL.md +339 -339
  7. package/skills/estack-better-title/SKILL.md +64 -64
  8. package/skills/estack-better-title/scripts/rename.sh +55 -55
  9. package/skills/estack-chris-voss/SKILL.md +80 -80
  10. package/skills/estack-chris-voss/references/elliot-notes.md +120 -120
  11. package/skills/estack-chris-voss/references/voss-principles.md +210 -210
  12. package/skills/estack-customer-discovery/SKILL.md +60 -60
  13. package/skills/estack-flight-planner/SKILL.md +332 -332
  14. package/skills/estack-flight-planner/references/config_schema.md +156 -156
  15. package/skills/estack-flight-planner/references/flight_history_schema.md +97 -97
  16. package/skills/estack-flight-planner/references/shuttle_schedules.md +98 -98
  17. package/skills/estack-flight-planner/scripts/check_setup.sh +89 -89
  18. package/skills/estack-flight-planner/scripts/fetch_flights.py +99 -99
  19. package/skills/estack-flight-planner/scripts/filter_flights.py +265 -265
  20. package/skills/estack-flight-planner/scripts/pair_shuttles.py +173 -173
  21. package/skills/estack-github-issue-tracker/SKILL.md +322 -322
  22. package/skills/estack-github-issue-tracker/bin/tracker-tools.cjs +1358 -1358
  23. package/skills/estack-github-issue-tracker/references/gh-cli-patterns.md +124 -124
  24. package/skills/estack-github-issue-tracker/references/result-file-schema.md +156 -156
  25. package/skills/estack-github-issue-tracker/references/tracker-schema.md +96 -96
  26. package/skills/estack-github-issue-tracker/tracker-template.md +58 -58
  27. package/skills/estack-leadership-coach/SKILL.md +235 -0
  28. package/skills/estack-leadership-coach/adding-references.md +280 -0
  29. package/skills/estack-leadership-coach/frameworks/delegation/flows/post-mortem.md +120 -0
  30. package/skills/estack-leadership-coach/frameworks/delegation/flows/pre-delegation.md +138 -0
  31. package/skills/estack-leadership-coach/frameworks/delegation/phases/1-intake.md +145 -0
  32. package/skills/estack-leadership-coach/frameworks/delegation/phases/2-trm-assessment.md +119 -0
  33. package/skills/estack-leadership-coach/frameworks/delegation/phases/3-enrollment.md +132 -0
  34. package/skills/estack-leadership-coach/frameworks/delegation/phases/4-build-brief.md +171 -0
  35. package/skills/estack-leadership-coach/frameworks/delegation/phases/5-monitoring.md +134 -0
  36. package/skills/estack-leadership-coach/frameworks/delegation/phases/6-reverse-delegation.md +118 -0
  37. package/skills/estack-leadership-coach/frameworks/delegation/phases/7-diagnose.md +200 -0
  38. package/skills/estack-leadership-coach/references/.source-files/deci-ryan_self-determination-theory__deci-olafsen-ryan-2017-self-determination-theory-in-work-organizations.md +1881 -0
  39. package/skills/estack-leadership-coach/references/.source-files/deci-ryan_self-determination-theory__gagne-deci-2005-self-determination-theory-and-work-motivation.md +2058 -0
  40. package/skills/estack-leadership-coach/references/.source-files/deci-ryan_self-determination-theory__selfdeterminationtheory-org-theory-overview-page.md +61 -0
  41. package/skills/estack-leadership-coach/references/.source-files/gallup_engagement-research__gallup-3-key-insights-into-the-global-workplace-2024.md +57 -0
  42. package/skills/estack-leadership-coach/references/.source-files/gallup_engagement-research__gallup-managers-account-for-70-percent-of-variance-in-employee-engagement-2015.md +40 -0
  43. package/skills/estack-leadership-coach/references/.source-files/gallup_engagement-research__gallup-state-of-the-global-workplace-2026-global-data-summary.md +73 -0
  44. package/skills/estack-leadership-coach/references/.source-files/gallup_engagement-research__gallup-state-of-the-global-workplace-2026-report-landing.md +42 -0
  45. package/skills/estack-leadership-coach/references/.source-files/hormozi-leila_4-stages__leila-hormozi-the-art-of-delegation-blog-post.md +91 -0
  46. package/skills/estack-leadership-coach/references/.source-files/oncken-wass_monkeys-hbr-1974__oncken-wass-management-time-whos-got-the-monkey-hbr-classic-1974.md +969 -0
  47. package/skills/estack-leadership-coach/references/.source-files/sanchez_main-street-millionaire__codie-sanchez-afford-anything-podcast-ep-565-show-notes.md +89 -0
  48. package/skills/estack-leadership-coach/references/.source-files/sullivan_who-not-how__dan-sullivan-impact-filter-tool-and-guide-booklet.md +565 -0
  49. package/skills/estack-leadership-coach/references/.source-files/van-edwards_cues__vanessa-van-edwards-lewis-howes-school-of-greatness-ep-1231-show-notes.md +122 -0
  50. package/skills/estack-leadership-coach/references/.source-files/van-edwards_cues__vanessa-van-edwards-roger-dooley-cues-interview.md +194 -0
  51. package/skills/estack-leadership-coach/references/deci-ryan_self-determination-theory.md +166 -0
  52. package/skills/estack-leadership-coach/references/doerr_measure-what-matters.md +154 -0
  53. package/skills/estack-leadership-coach/references/ferriss_4hww.md +189 -0
  54. package/skills/estack-leadership-coach/references/gallup_engagement-research.md +105 -0
  55. package/skills/estack-leadership-coach/references/gerber_e-myth-revisited.md +118 -0
  56. package/skills/estack-leadership-coach/references/grove_high-output-management.md +95 -0
  57. package/skills/estack-leadership-coach/references/hormozi-alex_followthrough.md +152 -0
  58. package/skills/estack-leadership-coach/references/hormozi-leila_4-stages.md +146 -0
  59. package/skills/estack-leadership-coach/references/oncken-wass_monkeys-hbr-1974.md +128 -0
  60. package/skills/estack-leadership-coach/references/sanchez_main-street-millionaire.md +196 -0
  61. package/skills/estack-leadership-coach/references/sullivan_who-not-how.md +137 -0
  62. package/skills/estack-leadership-coach/references/van-edwards_cues.md +189 -0
  63. package/skills/estack-migrate-claude-session-history/SKILL.md +226 -0
  64. package/skills/estack-migrate-claude-session-history/references/path-encoding.md +55 -0
  65. package/skills/estack-migrate-claude-session-history/references/troubleshooting.md +96 -0
  66. package/skills/estack-migrate-claude-session-history/scripts/migrate-claude-history.js +1123 -0
  67. package/skills/estack-migrate-claude-session-history/scripts/test-append-note.js +48 -0
  68. package/skills/estack-migrate-claude-session-history/scripts/test-validate-migration.py +326 -0
  69. package/skills/estack-migrate-claude-session-history/scripts/validate-migration.py +493 -0
  70. package/skills/estack-pdf-to-md/SKILL.md +180 -0
  71. package/skills/estack-pdf-to-md/scripts/pdf_to_md.py +596 -0
  72. package/skills/estack-productivity-prioritization-coach/SKILL.md +124 -0
  73. package/skills/estack-productivity-prioritization-coach/sources/01-tony-robbins-rpm.md +39 -0
  74. package/skills/estack-productivity-prioritization-coach/sources/02-justin-sung-task-prioritization.md +34 -0
  75. package/skills/estack-prompt-builder-coach/SKILL.md +81 -81
  76. package/skills/estack-prompt-builder-coach/definition-of-done-generator.md +42 -42
  77. package/skills/estack-prompt-builder-coach/prompt-builder.md +37 -37
  78. package/skills/estack-prompt-builder-coach/task-shaper.md +36 -36
  79. package/skills/estack-prompt-builder-coach/vague-ask-auditor.md +37 -37
  80. package/skills/estack-read-claude-session-history/SKILL.md +204 -204
  81. package/skills/estack-read-claude-session-history/references/jsonl-schema.md +126 -126
  82. package/skills/estack-read-claude-session-history/references/modes.md +423 -423
  83. package/skills/estack-read-claude-session-history/references/recipes.md +271 -271
  84. package/skills/estack-read-claude-session-history/scripts/lib/__init__.py +1 -1
  85. package/skills/estack-read-claude-session-history/scripts/lib/parser.py +460 -460
  86. package/skills/estack-read-claude-session-history/scripts/lib/paths.py +234 -234
  87. package/skills/estack-read-claude-session-history/scripts/lib/search.py +179 -179
  88. package/skills/estack-read-claude-session-history/scripts/lib/subagents.py +88 -88
  89. package/skills/estack-read-claude-session-history/scripts/lib/tools.py +144 -144
  90. package/skills/estack-read-claude-session-history/scripts/read_transcript.py +1776 -1776
  91. package/skills/estack-read-claude-session-history/scripts/tests/conftest.py +40 -40
  92. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/README.md +20 -20
  93. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/all-noise.jsonl +4 -4
  94. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/basic-session.jsonl +2 -2
  95. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/engagement-gaps.jsonl +9 -9
  96. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/engagement-noise.jsonl +7 -7
  97. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/engagement-parallel-a.jsonl +3 -3
  98. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/engagement-parallel-b.jsonl +3 -3
  99. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/engagement-waiting.jsonl +5 -5
  100. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/interrupted.jsonl +2 -2
  101. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/multi-compact.jsonl +8 -8
  102. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/pending-user.jsonl +2 -2
  103. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/subagent-no-meta/subagents/agent-aaa.jsonl +2 -2
  104. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/subagent-no-meta.jsonl +2 -2
  105. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/subagent-parent/subagents/agent-xyz123.jsonl +2 -2
  106. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/subagent-parent/subagents/agent-xyz123.meta.json +1 -1
  107. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/subagent-parent.jsonl +4 -4
  108. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/time-spread.jsonl +6 -6
  109. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/timeline-day-test.jsonl +5 -5
  110. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/tool-zoo.jsonl +10 -10
  111. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/truncated.jsonl +2 -2
  112. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/unicode.jsonl +2 -2
  113. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/with-advisor.jsonl +3 -3
  114. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/with-compact.jsonl +5 -5
  115. package/skills/estack-read-claude-session-history/scripts/tests/fixtures/with-thinking.jsonl +2 -2
  116. package/skills/estack-read-claude-session-history/scripts/tests/test_backup_roots.py +56 -56
  117. package/skills/estack-read-claude-session-history/scripts/tests/test_engagement.py +239 -239
  118. package/skills/estack-read-claude-session-history/scripts/tests/test_json_format.py +201 -201
  119. package/skills/estack-read-claude-session-history/scripts/tests/test_modes.py +199 -199
  120. package/skills/estack-read-claude-session-history/scripts/tests/test_parser.py +195 -195
  121. package/skills/estack-read-claude-session-history/scripts/tests/test_paths.py +133 -133
  122. package/skills/estack-read-claude-session-history/scripts/tests/test_search.py +78 -78
  123. package/skills/estack-read-claude-session-history/scripts/tests/test_subagents.py +43 -43
  124. package/skills/estack-read-claude-session-history/scripts/tests/test_timeline.py +179 -179
  125. package/skills/estack-read-claude-session-history/scripts/tests/test_timezone_and_project.py +212 -212
  126. package/skills/estack-read-claude-session-history/scripts/tests/test_tools.py +80 -80
  127. package/skills/estack-repo-search/SKILL.md +65 -65
  128. package/skills/estack-vscode-file-recovery/SKILL.md +188 -0
@@ -0,0 +1,596 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ pdf_to_md.py — Convert a PDF to Markdown (or .txt) using the RunPulse API.
4
+
5
+ Usage:
6
+ python pdf_to_md.py <pdf_path> [options]
7
+
8
+ Options:
9
+ --batch-size N Pages per API call (default: 10)
10
+ --output-dir PATH Where to write the output file (default: same dir as PDF)
11
+ --format md|txt Output file extension (default: md)
12
+ --no-separator Join batches with a plain newline instead of a page marker
13
+ --min-chars N Skip pages with fewer than N non-whitespace chars of locally-
14
+ extracted text (default: 20). Catches blank pages and pages
15
+ that are entirely an image, since pypdf can't extract their
16
+ text. Set to 0 to send every page to RunPulse.
17
+ --no-skip Alias for --min-chars 0. Useful for scanned PDFs where
18
+ RunPulse's OCR is the whole point.
19
+ --quality fast|high fast (default): RunPulse 'default' model, no refinement,
20
+ full parallelism. Cheap and quick.
21
+ high: 'pulse-ultra-2' vision-language model + full refinement
22
+ pass (tables, text, formatting), chart-to-table extraction,
23
+ figure descriptions, footnote linking. Slower, more expensive,
24
+ throttled by RunPulse to 2 concurrent / 5 per minute / 20 per
25
+ hour. Use for tables, math, charts, scanned pages, or sloppy
26
+ formatting.
27
+
28
+ Requires:
29
+ pip install requests pypdf
30
+ PULSE_API_KEY env var (already set in your user environment), or a PULSE_API_KEY=<key> entry in the skill's .env file
31
+ """
32
+
33
+ import argparse
34
+ import json
35
+ import os
36
+ import sys
37
+ import tempfile
38
+ import time
39
+ from concurrent.futures import ThreadPoolExecutor, as_completed
40
+ from pathlib import Path
41
+
42
+ try:
43
+ import requests
44
+ except ImportError:
45
+ sys.exit("Missing dependency: pip install requests")
46
+
47
+ try:
48
+ from pypdf import PdfReader, PdfWriter
49
+ except ImportError:
50
+ sys.exit("Missing dependency: pip install pypdf")
51
+
52
def _load_env_key() -> str:
    """Look for PULSE_API_KEY in a .env file co-located with the skill or script.

    The candidate files are checked in order: the skill root (the parent of
    this script's directory), then the two per-user skill locations under
    ``~/.claude/skills``. The first ``PULSE_API_KEY=...`` assignment found
    wins. Blank lines, ``#`` comments and lines without ``=`` are ignored; an
    optional leading ``export `` is tolerated; surrounding quotes are removed
    from the value.

    This is a best-effort lookup: a file that is missing, unreadable or not
    valid UTF-8 is skipped rather than treated as an error.

    Returns:
        The key string, or ``""`` when no candidate file defines it.
    """
    candidates = [
        Path(__file__).parent.parent / ".env",  # skill root: ~/.agents/skills/estack-pdf-to-md/.env
        Path.home() / ".claude" / "skills" / "estack-pdf-to-md" / ".env",
        Path.home() / ".claude" / "skills" / "pdf-to-md" / ".env",  # legacy location
    ]
    for env_file in candidates:
        # EAFP: just try to read. Only I/O and decoding problems are expected
        # here; anything else is a genuine bug and should surface.
        try:
            content = env_file.read_text(encoding="utf-8")
        except (OSError, UnicodeError):
            continue
        for raw in content.splitlines():
            line = raw.strip()
            if line.startswith("export "):
                # Shell-style .env files often write `export KEY=value`.
                line = line[len("export "):].lstrip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            if key.strip() == "PULSE_API_KEY":
                return value.strip().strip('"').strip("'")
    return ""
78
+
79
+
80
# The process environment takes precedence; the .env lookup is only a fallback.
API_KEY = os.environ.get("PULSE_API_KEY", "") or _load_env_key()
BASE_URL = "https://api.runpulse.com"

# Polling cadence and limits (all in seconds).
POLL_INTERVAL = 2    # pause between job-status checks
POLL_TIMEOUT = 600   # give up on a job after this long (raised for refine pass)

# Retry budgets for the upload call.
MAX_429_RETRIES = 5  # exponential backoff: 5s, 10s, 20s, 40s, 80s
# Transient gateway errors get fewer retries than 429 because a 5xx often
# signals a real problem rather than throttling.
MAX_5XX_RETRIES = 3

RETRYABLE_5XX = {500, 502, 503, 504}

# Extra form options sent only with the "high" preset.
_HIGH_QUALITY_OPTIONS = {
    "refine": True,
    "refine_options": {
        "tables": True,
        "text": True,
        "formatting": True,
    },
    "extract_figure": True,
    "figure_description": True,
    "figure_processing": {
        "description": True,
    },
    "extensions": {
        "footnote_references": True,
    },
}

QUALITY_PRESETS = {
    "fast": {
        "model": "default",
        # None means "one worker per batch", i.e. full parallelism.
        "max_workers": None,
        "extra_options": {},
    },
    "high": {
        "model": "pulse-ultra-2",
        # Ultra 2 caps at 2 concurrent extractions per API key; exceeding that
        # triggers 429s, so the worker pool is capped to match.
        "max_workers": 2,
        "extra_options": _HIGH_QUALITY_OPTIONS,
    },
}
119
+
120
+
121
def _ensure_decrypted(pdf_path: Path) -> tuple[Path, Path | None]:
    """If `pdf_path` is encrypted, write an unencrypted temp copy and return it.

    Many publisher-restricted PDFs are owner-locked but have no user password,
    so `decrypt('')` succeeds and they can be unlocked transparently. If that
    fails (real user-password protection), the process exits with workaround
    guidance.

    Args:
        pdf_path: Path to the source PDF.

    Returns:
        ``(path_to_use, cleanup_path_or_None)``. For an unencrypted PDF this is
        ``(pdf_path, None)``. Otherwise both elements are the path of the temp
        copy, which the caller is responsible for deleting.

    Raises:
        SystemExit: if the PDF cannot be decrypted with an empty password.
    """
    reader = PdfReader(pdf_path)
    if not reader.is_encrypted:
        return pdf_path, None

    if not reader.decrypt(""):
        sys.exit(
            f"PDF is password-protected: {pdf_path.name}.\n"
            f" Workarounds:\n"
            f" 1. Open in Chrome and print to PDF (strips most publisher locks)\n"
            f" 2. qpdf --decrypt --password=<pwd> in.pdf out.pdf\n"
            f" Then rerun on the new file."
        )

    # delete=False + immediate close: the file is reopened by name below,
    # which an open NamedTemporaryFile handle would prevent on Windows.
    tmp = tempfile.NamedTemporaryFile(
        suffix=".pdf",
        prefix=f"{pdf_path.stem}_decrypted_",
        delete=False,
    )
    tmp.close()
    tmp_path = Path(tmp.name)
    try:
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)
        with open(tmp_path, "wb") as f:
            writer.write(f)
    except BaseException:
        # The caller only learns about the temp file through the return value,
        # so a failure here must not leave it behind.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
    print(f" {pdf_path.name} was owner-locked; decrypted with empty password to temp copy.")
    return tmp_path, tmp_path
156
+
157
+
158
def analyze_pages(pdf_path: Path, min_chars: int) -> tuple[int, list[int], list[int]]:
    """Classify every page of the PDF as worth converting or not.

    A page is kept when pypdf can locally extract at least `min_chars`
    non-whitespace characters from it. Blank pages and pages that are purely a
    rasterized image yield no text (pypdf does not OCR), so both are skipped,
    which avoids paying RunPulse for pages with nothing useful on them. A page
    whose text extraction raises is treated as having no text.

    Returns:
        ``(total_pages, pages_to_convert, pages_skipped)``; page numbers are
        1-indexed.
    """
    document = PdfReader(pdf_path)
    page_total = len(document.pages)
    kept: list[int] = []
    dropped: list[int] = []
    for number, page in enumerate(document.pages, start=1):
        try:
            extracted = page.extract_text() or ""
        except Exception:
            extracted = ""
        # Count only characters that are not whitespace.
        visible = len(extracted) - sum(map(str.isspace, extracted))
        bucket = kept if visible >= min_chars else dropped
        bucket.append(number)
    return page_total, kept, dropped
182
+
183
+
184
def build_ranges(pages: list[int], max_per_range: int) -> list[tuple[int, int]]:
    """Collapse 1-indexed page numbers into inclusive ``(start, end)`` ranges.

    Pages are de-duplicated and sorted first. Consecutive pages share a range
    until it holds `max_per_range` pages, at which point a new range begins so
    each API call stays bounded. An empty input yields an empty list.
    """
    if not pages:
        return []
    ordered = sorted(set(pages))
    spans: list[tuple[int, int]] = []
    first = last = ordered[0]
    for page in ordered[1:]:
        contiguous = page == last + 1
        has_room = (last - first + 1) < max_per_range
        if not (contiguous and has_room):
            # Close the current span and start a fresh one at this page.
            spans.append((first, last))
            first = page
        last = page
    spans.append((first, last))
    return spans
201
+
202
+
203
def _format_page_list(pages: list[int], max_show: int = 30) -> str:
    """Render pages compactly, e.g. 1,2,3,7,8,9 -> '1-3, 7-9'.

    Duplicates are ignored and order does not matter. At most `max_show`
    groups are listed; any remainder is summarised as ``... (N more)``.
    An empty input gives an empty string.
    """
    if not pages:
        return ""

    def describe(first: int, last: int) -> str:
        # A run of one page is shown as a bare number.
        return f"{first}" if first == last else f"{first}-{last}"

    ordered = sorted(set(pages))
    labels: list[str] = []
    run_start = run_end = ordered[0]
    for page in ordered[1:]:
        if page != run_end + 1:
            labels.append(describe(run_start, run_end))
            run_start = page
        run_end = page
    labels.append(describe(run_start, run_end))

    if len(labels) <= max_show:
        return ", ".join(labels)
    hidden = len(labels) - max_show
    return ", ".join(labels[:max_show]) + f", ... ({hidden} more)"
220
+
221
+
222
def _form_value(v):
    """Coerce a Python value into a form-field-friendly string.

    Nested containers (dicts, lists, tuples) are JSON-encoded so that booleans
    and strings inside them come out as valid JSON rather than a Python repr.
    Top-level booleans become 'true'/'false'. Everything else is stringified
    with ``str()``. RunPulse's multipart endpoint accepts nested option blocks
    as JSON-stringified form fields.
    """
    if isinstance(v, (dict, list, tuple)):
        return json.dumps(v)
    if isinstance(v, bool):
        return "true" if v else "false"
    return str(v)
234
+
235
+
236
def _resolve_result(payload: dict, label: str) -> str:
    """Return markdown from a result payload, fetching from URL for large results.

    RunPulse returns `is_url: true` plus a one-time `url` when the result
    exceeds ~5MB or ~70 pages. The download is made with the same auth header.
    The body is either raw markdown or a small JSON wrapper around it; if a
    JSON body has no usable markdown field, the raw response text is returned.

    Args:
        payload: Result object from the extract or job endpoint.
        label: Human-readable page label used in progress and error messages.

    Returns:
        The markdown text; ``""`` when an inline payload has no markdown
        (missing or null).

    Raises:
        RuntimeError: if `is_url` is set but no `url` is present.
        requests.HTTPError: if the download responds with an error status.
    """
    if not payload.get("is_url"):
        # `or ""` also covers an explicit null, which callers would otherwise
        # try to `.strip()`.
        return payload.get("markdown") or ""

    url = payload.get("url")
    if not url:
        raise RuntimeError(f"{label}: is_url=true but no url in payload: {payload}")
    print(f" {label}: fetching large result from URL...")
    resp = requests.get(url, headers={"x-api-key": API_KEY}, timeout=180)
    resp.raise_for_status()
    ctype = resp.headers.get("content-type", "")
    if "json" in ctype:
        body = resp.json()
        # The wrapper shape is not guaranteed: tolerate non-dict bodies and a
        # null/absent "result" instead of failing with AttributeError.
        if isinstance(body, dict):
            nested = body.get("result")
            md = body.get("markdown")
            if not md and isinstance(nested, dict):
                md = nested.get("markdown")
            if md:
                return md
    return resp.text
259
+
260
+
261
def extract_pages(pdf_path: Path, start: int, end: int, quality: str = "fast") -> str:
    """Upload the PDF and extract one inclusive page range as markdown.

    The form fields combine the page range, async mode and the preset's model
    with the preset's extra options (each coerced to a form string). The
    response is either a job to poll or an immediate result.

    Raises:
        KeyError: if `quality` is not a known preset.
        RuntimeError: if the response is neither a job nor a result.
    """
    preset = QUALITY_PRESETS[quality]
    auth = {"x-api-key": API_KEY}

    form_fields = {
        "pages": f"{start}" if start == end else f"{start}-{end}",
        "async": "true",
        "model": preset["model"],
    }
    form_fields.update(
        (name, _form_value(option)) for name, option in preset["extra_options"].items()
    )

    response = _post_with_retry(pdf_path, auth, form_fields, start, end)

    if "job_id" in response:
        return _poll(response["job_id"], start, end)
    if "markdown" not in response and not response.get("is_url"):
        raise RuntimeError(f"Unexpected response for pages {start}-{end}: {response}")
    return _resolve_result(response, f"pages {start}-{end}")
283
+
284
+
285
def _post_with_retry(pdf_path: Path, headers: dict, data: dict, start: int, end: int) -> dict:
    """POST the PDF to /extract, retrying on 429 and on transient 5xx responses.

    Each kind of failure has its own retry budget and its own delay, starting
    at 5 seconds and doubling after every retry of that kind. The file is
    reopened for every attempt so each upload starts from the beginning. Once
    a budget is spent, or for any other error status, the HTTP error is raised.

    Returns:
        The decoded JSON body of the successful response.
    """
    endpoint = f"{BASE_URL}/extract"
    throttle_wait = 5
    server_wait = 5
    throttle_retries = 0
    server_retries = 0
    while True:
        with open(pdf_path, "rb") as handle:
            resp = requests.post(
                endpoint,
                headers=headers,
                files={"file": (pdf_path.name, handle, "application/pdf")},
                data=data,
                timeout=120,
            )
        code = resp.status_code
        throttled = code == 429 and throttle_retries < MAX_429_RETRIES
        if throttled:
            print(f" pages {start}-{end}: 429 rate-limited, sleeping {throttle_wait}s before retry...")
            time.sleep(throttle_wait)
            throttle_wait *= 2
            throttle_retries += 1
        elif code in RETRYABLE_5XX and server_retries < MAX_5XX_RETRIES:
            print(f" pages {start}-{end}: {resp.status_code} from RunPulse, sleeping {server_wait}s before retry...")
            time.sleep(server_wait)
            server_wait *= 2
            server_retries += 1
        else:
            resp.raise_for_status()
            return resp.json()
314
+
315
+
316
def _poll(job_id: str, start: int, end: int) -> str:
    """Block until the async job completes and return its markdown.

    The job endpoint is polled every POLL_INTERVAL seconds. A 429 is retried
    with a doubling delay capped at 60 seconds. Transient 5xx responses are
    retried up to MAX_5XX_RETRIES times with a doubling delay. Any other error
    status raises immediately.

    Args:
        job_id: Identifier returned by the extract call.
        start: First page of the batch (used in messages only).
        end: Last page of the batch (used in messages only).

    Raises:
        RuntimeError: if the job reports status 'failed' or 'canceled'.
        TimeoutError: if the job has not finished within POLL_TIMEOUT seconds.
    """
    headers = {"x-api-key": API_KEY}
    # Monotonic clock: the timeout must not shrink or stretch when the system
    # wall clock is adjusted while we wait.
    deadline = time.monotonic() + POLL_TIMEOUT
    backoff_429 = 5
    backoff_5xx = 5
    attempts_5xx = 0

    while time.monotonic() < deadline:
        resp = requests.get(f"{BASE_URL}/job/{job_id}", headers=headers, timeout=30)
        if resp.status_code == 429:
            # No attempt cap here; the overall deadline bounds the retries.
            print(f" pages {start}-{end}: 429 during poll, sleeping {backoff_429}s...")
            time.sleep(backoff_429)
            backoff_429 = min(backoff_429 * 2, 60)
            continue
        if resp.status_code in RETRYABLE_5XX and attempts_5xx < MAX_5XX_RETRIES:
            print(f" pages {start}-{end}: {resp.status_code} during poll, sleeping {backoff_5xx}s...")
            time.sleep(backoff_5xx)
            backoff_5xx *= 2
            attempts_5xx += 1
            continue
        resp.raise_for_status()
        data = resp.json()
        status = data.get("status")

        if status == "completed":
            # `or {}` guards against an explicit null result.
            result = data.get("result", {}) or {}
            return _resolve_result(result, f"pages {start}-{end}")
        if status in ("failed", "canceled"):
            raise RuntimeError(f"Job {job_id} ended with status '{status}': {data}")

        time.sleep(POLL_INTERVAL)

    raise TimeoutError(f"Job {job_id} did not finish within {POLL_TIMEOUT}s")
350
+
351
+
352
def _parse_page_range(spec: str, total: int) -> set[int]:
    """Turn a spec such as '5-10,12,20-22' into a set of 1-indexed page numbers.

    Upper bounds are clamped to `total`; empty comma-separated parts are
    ignored. Malformed input (non-integers, reversed ranges, a range missing
    one side, non-positive numbers) and a spec that selects no page at all
    terminate the process with an explanatory message rather than a bare
    ValueError.
    """
    selected: set[int] = set()
    for chunk in spec.split(","):
        token = chunk.strip()
        if not token:
            continue
        if "-" not in token:
            try:
                first = last = int(token)
            except ValueError:
                sys.exit(f"Bad --pages: '{token}' is not an integer.")
        else:
            # Split on the first dash only, so "5--3" is read as 5 and -3.
            left, _, right = token.partition("-")
            left, right = left.strip(), right.strip()
            if not (left and right):
                sys.exit(f"Bad --pages: range '{token}' is missing a number on one side. Use e.g. '5-10' or just '5'.")
            try:
                first, last = int(left), int(right)
            except ValueError:
                sys.exit(f"Bad --pages: '{token}' contains a non-integer.")
            if first > last:
                sys.exit(f"Bad --pages: range '{token}' is reversed ({first} > {last}).")
        if first < 1:
            sys.exit(f"Bad --pages: '{token}' contains a non-positive page number.")
        selected.update(range(first, min(total, last) + 1))
    if not selected:
        sys.exit(f"Bad --pages: '{spec}' resolved to no valid pages (PDF has {total}).")
    return selected
385
+
386
+
387
def convert_pdf(
    pdf_path: Path,
    batch_size: int = 10,
    output_dir: Path | None = None,
    fmt: str = "md",
    separator: bool = True,
    min_chars: int = 20,
    quality: str = "fast",
    pages_filter: str | None = None,
) -> Path:
    """Convert one PDF and return the path of the written output file.

    This is the public entry point. It checks that an API key is available and
    that the PDF exists, transparently unlocks owner-locked PDFs into a temp
    copy, delegates the conversion, and removes the temp copy afterwards
    whether or not the conversion succeeded.

    Raises:
        SystemExit: if the API key is missing or the file does not exist.
    """
    if not API_KEY:
        sys.exit(
            "PULSE_API_KEY is not set. Open a new terminal so the user env var is loaded, "
            "or set it manually: $env:PULSE_API_KEY = '...'"
        )

    pdf_path = pdf_path.resolve()
    if not pdf_path.exists():
        sys.exit(f"File not found: {pdf_path}")

    # `temp_copy` is None unless a decrypted copy had to be created.
    working_pdf, temp_copy = _ensure_decrypted(pdf_path)
    options = {
        "original_pdf": pdf_path,
        "effective_pdf": working_pdf,
        "batch_size": batch_size,
        "output_dir": output_dir,
        "fmt": fmt,
        "separator": separator,
        "min_chars": min_chars,
        "quality": quality,
        "pages_filter": pages_filter,
    }
    try:
        return _convert_pdf_impl(**options)
    finally:
        # Best-effort cleanup; a leftover temp file is not worth failing over.
        if temp_copy is not None and temp_copy.exists():
            try:
                temp_copy.unlink()
            except OSError:
                pass
426
+
427
+
428
def _convert_pdf_impl(
    *,
    original_pdf: Path,
    effective_pdf: Path,
    batch_size: int,
    output_dir: Path | None,
    fmt: str,
    separator: bool,
    min_chars: int,
    quality: str,
    pages_filter: str | None,
) -> Path:
    """Run the conversion pipeline and write the combined output file.

    Steps: pick the pages to send (local text analysis, or the explicit
    `pages_filter` which overrides it), group them into batches, extract the
    batches in parallel, stitch the results together in page order, and write
    them next to the original PDF or into `output_dir`.

    Args:
        original_pdf: The user's PDF; used for messages and the output name.
        effective_pdf: The file actually uploaded (a decrypted copy if needed).
        batch_size: Maximum pages per API call.
        output_dir: Destination directory, or None for the PDF's directory.
        fmt: Output file extension.
        separator: Whether to insert HTML page-marker comments between batches.
        min_chars: Minimum locally extractable characters for a page to be kept.
        quality: Key into QUALITY_PRESETS.
        pages_filter: Optional page spec such as '5-10,12'.

    Returns:
        Path of the written output file.

    Raises:
        SystemExit: if no pages are selected or `quality` is unknown.
        Exception: the first batch failure is re-raised after queued batches
            have been cancelled.
    """
    page_count, pages_to_convert, pages_skipped = analyze_pages(effective_pdf, min_chars)
    print(f"{original_pdf.name}: {page_count} pages total")

    skip_reason = "blank or image-only"
    if pages_filter:
        # An explicit request replaces the blank/image-only heuristic entirely.
        requested = _parse_page_range(pages_filter, page_count)
        pages_to_convert = sorted(requested)
        pages_skipped = [p for p in range(1, page_count + 1) if p not in requested]
        skip_reason = "excluded by --pages filter"
        print(f" --pages filter active: only processing {_format_page_list(pages_to_convert)}")
    elif pages_skipped:
        print(
            f" Skipping {len(pages_skipped)} page(s) with <{min_chars} chars of "
            f"extractable text (blank or image-only): {_format_page_list(pages_skipped)}"
        )
        print(" Override with --no-skip if you want every page sent to RunPulse.")

    if not pages_to_convert:
        sys.exit(
            "No pages contain extractable text above the threshold. If this PDF is a "
            "scan where RunPulse OCR is exactly what you need, rerun with --no-skip."
        )

    if quality not in QUALITY_PRESETS:
        sys.exit(f"Unknown quality preset: {quality!r}. Choose 'fast' or 'high'.")
    preset = QUALITY_PRESETS[quality]

    ranges = build_ranges(pages_to_convert, batch_size)
    total_batches = len(ranges)
    pages_being_sent = len(pages_to_convert)
    # A preset without a worker cap runs every batch concurrently.
    max_workers = preset["max_workers"] or total_batches
    max_workers = min(max_workers, total_batches)
    print(
        f" Sending {pages_being_sent} page(s) in {total_batches} batch(es) "
        f"(max {batch_size} pages each) via quality='{quality}' (model={preset['model']})"
    )

    results: dict[int, str] = {}

    def _process_batch(idx: int, start: int, end: int) -> tuple[int, str]:
        text = extract_pages(effective_pdf, start, end, quality=quality)
        print(f" [{idx}/{total_batches}] pages {start}-{end} done ({len(text):,} chars)")
        return idx, text.strip()

    print(f"Submitting {total_batches} batch(es) with up to {max_workers} in parallel...")
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_process_batch, i, start, end): (i, start, end)
            for i, (start, end) in enumerate(ranges, 1)
        }
        for future in as_completed(futures):
            i, start, end = futures[future]
            try:
                idx, text = future.result()
            except Exception as exc:
                print(f" [{i}/{total_batches}] pages {start}-{end} FAILED: {exc}")
                # The run is lost already: drop batches that have not started
                # so they are not uploaded (and billed) while the pool shuts
                # down. cancel() is a no-op for running or finished futures.
                for pending in futures:
                    pending.cancel()
                raise
            results[idx] = text

    if separator:
        chunks: list[str] = []
        skipped_any = bool(pages_skipped)
        prev_end = 0
        for i, (start, end) in enumerate(ranges, 1):
            if skipped_any:
                # Note any pages left out between the previous batch and this one.
                gap_start = prev_end + 1
                if gap_start < start:
                    chunks.append(
                        f"<!-- pages {gap_start}-{start - 1} skipped ({skip_reason}) -->"
                    )
                chunks.append(f"<!-- pages {start}-{end} -->\n\n{results[i]}")
            else:
                # With nothing skipped, the first batch carries no marker.
                if i == 1:
                    chunks.append(results[i])
                else:
                    chunks.append(f"<!-- pages {start}-{end} -->\n\n{results[i]}")
            prev_end = end
        if skipped_any and prev_end < page_count:
            chunks.append(
                f"<!-- pages {prev_end + 1}-{page_count} skipped ({skip_reason}) -->"
            )
        full_text = "\n\n".join(chunks)
    else:
        full_text = "\n\n".join(results[i] for i in range(1, total_batches + 1))

    dest_dir = output_dir or original_pdf.parent
    dest_dir.mkdir(parents=True, exist_ok=True)
    out_path = dest_dir / f"{original_pdf.stem}.{fmt}"
    if out_path.exists():
        print(f" WARNING: overwriting existing file: {out_path}")
    out_path.write_text(full_text, encoding="utf-8")

    print(f"\nSaved -> {out_path}")
    return out_path
534
+
535
+
536
def main() -> None:
    """Parse command-line arguments and convert the requested PDF.

    `--no-skip` is translated to a `min_chars` of 0, and `--no-separator` is
    inverted into the `separator` flag. A `--batch-size` below 1 is rejected
    with a usage error.
    """
    parser = argparse.ArgumentParser(
        description="Convert a PDF to Markdown using RunPulse."
    )
    parser.add_argument("pdf_path", help="Path to the PDF file")
    parser.add_argument(
        "--batch-size", type=int, default=10, metavar="N",
        help="Pages per API call (default: 10)"
    )
    parser.add_argument(
        "--output-dir", metavar="PATH",
        help="Output directory (default: same directory as the PDF)"
    )
    parser.add_argument(
        "--format", choices=["md", "txt"], default="md",
        help="Output file extension (default: md)"
    )
    parser.add_argument(
        "--no-separator", action="store_true",
        help="Join batches without page-marker comments"
    )
    parser.add_argument(
        "--min-chars", type=int, default=20, metavar="N",
        help="Skip pages with fewer than N non-whitespace chars of locally-extracted "
        "text — catches blank and image-only pages (default: 20)"
    )
    parser.add_argument(
        "--no-skip", action="store_true",
        help="Send every page to RunPulse (equivalent to --min-chars 0). Use for "
        "scanned PDFs where OCR is the whole point."
    )
    parser.add_argument(
        "--quality", choices=["fast", "high"], default="fast",
        help="fast (default): 'default' model, full parallelism, cheap. "
        "high: 'pulse-ultra-2' + refinement + figure extraction; throttled to "
        "2 concurrent. Use for tables, math, charts, scans, or sloppy formatting."
    )
    parser.add_argument(
        "--pages", metavar="RANGE",
        help="Restrict to a specific 1-indexed page range, e.g. '5-10'. Useful for "
        "spot-testing on a single page. Overrides the blank/image-only filter "
        "for pages explicitly requested."
    )
    args = parser.parse_args()

    # Zero or negative sizes would silently degrade to one page per call while
    # the progress output reports a nonsensical maximum.
    if args.batch_size < 1:
        parser.error("--batch-size must be at least 1")

    min_chars = 0 if args.no_skip else args.min_chars

    convert_pdf(
        pdf_path=Path(args.pdf_path),
        batch_size=args.batch_size,
        output_dir=Path(args.output_dir) if args.output_dir else None,
        fmt=args.format,
        separator=not args.no_separator,
        min_chars=min_chars,
        quality=args.quality,
        pages_filter=args.pages,
    )


if __name__ == "__main__":
    main()