ethan-agent-skills 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +140 -0
  3. package/agents/skills/openspec-apply-change/SKILL.md +156 -0
  4. package/agents/skills/openspec-apply-change/skill.json +5 -0
  5. package/agents/skills/openspec-archive-change/SKILL.md +114 -0
  6. package/agents/skills/openspec-archive-change/skill.json +5 -0
  7. package/agents/skills/openspec-explore/SKILL.md +288 -0
  8. package/agents/skills/openspec-explore/skill.json +5 -0
  9. package/agents/skills/openspec-propose/SKILL.md +110 -0
  10. package/agents/skills/openspec-propose/skill.json +5 -0
  11. package/agents/skills/source-command-opsx-apply/SKILL.md +156 -0
  12. package/agents/skills/source-command-opsx-apply/skill.json +5 -0
  13. package/agents/skills/source-command-opsx-archive/SKILL.md +161 -0
  14. package/agents/skills/source-command-opsx-archive/skill.json +5 -0
  15. package/agents/skills/source-command-opsx-explore/SKILL.md +177 -0
  16. package/agents/skills/source-command-opsx-explore/skill.json +5 -0
  17. package/agents/skills/source-command-opsx-propose/SKILL.md +110 -0
  18. package/agents/skills/source-command-opsx-propose/skill.json +5 -0
  19. package/bin/skills.mjs +38 -0
  20. package/claude/commands/opsx/apply.md +152 -0
  21. package/claude/commands/opsx/archive.md +157 -0
  22. package/claude/commands/opsx/explore.md +173 -0
  23. package/claude/commands/opsx/propose.md +106 -0
  24. package/claude/skills/openspec-apply-change/SKILL.md +156 -0
  25. package/claude/skills/openspec-apply-change/skill.json +5 -0
  26. package/claude/skills/openspec-archive-change/SKILL.md +114 -0
  27. package/claude/skills/openspec-archive-change/skill.json +5 -0
  28. package/claude/skills/openspec-explore/SKILL.md +288 -0
  29. package/claude/skills/openspec-explore/skill.json +5 -0
  30. package/claude/skills/openspec-propose/SKILL.md +110 -0
  31. package/claude/skills/openspec-propose/skill.json +5 -0
  32. package/codex/skills/openspec-apply-change/SKILL.md +156 -0
  33. package/codex/skills/openspec-apply-change/skill.json +5 -0
  34. package/codex/skills/openspec-archive-change/SKILL.md +114 -0
  35. package/codex/skills/openspec-archive-change/skill.json +5 -0
  36. package/codex/skills/openspec-explore/SKILL.md +288 -0
  37. package/codex/skills/openspec-explore/skill.json +5 -0
  38. package/codex/skills/openspec-propose/SKILL.md +110 -0
  39. package/codex/skills/openspec-propose/skill.json +5 -0
  40. package/lib/discover.mjs +60 -0
  41. package/lib/list.mjs +22 -0
  42. package/lib/update.mjs +144 -0
  43. package/lib/utils.mjs +120 -0
  44. package/package.json +46 -0
  45. package/skills/pdf-extract/SKILL.md +214 -0
  46. package/skills/pdf-extract/scripts/pdf_extract.py +112 -0
  47. package/skills/pdf-extract/skill.json +5 -0
package/lib/update.mjs ADDED
@@ -0,0 +1,144 @@
1
+ import fs from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import kleur from 'kleur';
4
+ import { discoverBundledSkills } from './discover.mjs';
5
+ import {
6
+ CLIENT_DIRS,
7
+ copyDir,
8
+ LOCK_FILE,
9
+ PACKAGE_NAME,
10
+ pathExists,
11
+ readJSON,
12
+ resolveTarget,
13
+ rmDir,
14
+ writeJSON,
15
+ } from './utils.mjs';
16
+
17
/**
 * Sync the bundled skills into every target directory selected by `opts`.
 *
 * Targets are handled sequentially, in the order `resolveTargets` returns
 * them; for each one the bundled skills of its groups are discovered
 * (de-duplicated) and handed to `syncToTarget`.
 *
 * @param {object} [opts] - Options forwarded to `resolveTargets` and `syncToTarget`.
 * @returns {Promise<void>}
 */
export async function update(opts = {}) {
  for (const { groups, path: targetDir } of resolveTargets(opts)) {
    const bundled = await discoverBundledSkills(groups, { dedupe: true });
    await syncToTarget(targetDir, bundled, opts);
  }
}
25
+
26
/**
 * Translate CLI options into the list of sync targets.
 *
 * `opts.client` defaults to "claude" and is compared case-insensitively.
 * An explicit `opts.target` always yields a single "custom" target whose
 * groups come from the selected client; otherwise the built-in directory of
 * each selected client is used.
 *
 * @param {object} opts - Options; reads `client` and `target`.
 * @returns {Array<{name: string, path: string, groups: string[]}>}
 * @throws {Error} When the client is not claude, codex, or all.
 */
function resolveTargets(opts) {
  const client = String(opts.client || 'claude').toLowerCase();
  const builtIn = (name) => ({
    name,
    path: CLIENT_DIRS[name],
    groups: groupsForClient(name),
  });

  if (opts.target) {
    const custom = {
      name: 'custom',
      path: resolveTarget(opts.target),
      groups: groupsForClient(client),
    };
    return [custom];
  }

  switch (client) {
    case 'claude':
    case 'codex':
      return [builtIn(client)];
    case 'all':
      return [builtIn('claude'), builtIn('codex')];
    default:
      throw new Error(`Unsupported client "${opts.client}". Use claude, codex, or all.`);
  }
}
56
+
57
/**
 * Return the source groups whose skills are installed for a client.
 *
 * Every supported client receives the "common" group plus its own group;
 * "all" receives the common, claude and codex groups.
 *
 * @param {string} client - Already lower-cased client name.
 * @returns {string[]} Group names in lookup order.
 * @throws {Error} When the client is not claude, codex, or all.
 */
function groupsForClient(client) {
  switch (client) {
    case 'claude':
      return ['common', 'claude'];
    case 'codex':
      return ['common', 'codex'];
    case 'all':
      return ['common', 'claude', 'codex'];
    default:
      throw new Error(`Unsupported client "${client}". Use claude, codex, or all.`);
  }
}
63
+
64
/**
 * Install or refresh the given skills inside one target directory.
 *
 * For every skill the lock file decides what happens:
 * - a destination that exists but is not recorded as managed by this package
 *   is reported as a conflict and left untouched;
 * - a managed destination already at the bundled version is skipped unless
 *   `opts.force` is set;
 * - anything else is (re)installed, or only reported when `opts.dryRun` is set.
 *
 * Each install is staged: the skill is copied to a temporary sibling
 * directory first and swapped in only after the copy succeeded, so a failed
 * copy never destroys the previously installed version. The lock file is
 * written in a `finally` block so skills installed before a failure stay
 * recorded as managed.
 *
 * @param {string} targetDir - Directory that receives one sub-directory per skill.
 * @param {Array<{name: string, version: string, sourceDir: string, group: string}>} skills
 *   Bundled skills to sync.
 * @param {object} opts - Options; reads `dryRun` and `force`.
 * @returns {Promise<void>}
 */
async function syncToTarget(targetDir, skills, opts) {
  console.log(kleur.cyan(`\n-> Syncing ${skills.length} skill(s) to ${targetDir}`));

  if (skills.length === 0) {
    console.log(kleur.yellow(' No bundled skills found for this target.'));
    return;
  }

  const lockPath = path.join(targetDir, LOCK_FILE);
  const hadLock = await pathExists(lockPath);
  const lock = normalizeLock(await readJSON(lockPath, {}));
  // Decisions are made against `lock`; changes accumulate in the copy.
  const nextLock = structuredClone(lock);
  let updated = 0;
  let skipped = 0;
  let conflicts = 0;

  if (!opts.dryRun) {
    await fs.mkdir(targetDir, { recursive: true });
  }

  try {
    for (const skill of skills) {
      const destination = path.join(targetDir, skill.name);
      const lockEntry = lock.skills[skill.name];
      const exists = await pathExists(destination);

      if (exists && !isManaged(lockEntry)) {
        console.log(` ${kleur.red('!')} ${skill.name} ${kleur.red('(unmanaged destination exists; skipped)')}`);
        conflicts += 1;
        continue;
      }

      if (!opts.force && exists && lockEntry?.version === skill.version) {
        console.log(` ${kleur.gray('=')} ${skill.name} ${kleur.gray(`(${skill.version}, up-to-date)`)}`);
        skipped += 1;
        continue;
      }

      const oldVersion = lockEntry?.version ?? 'none';

      if (opts.dryRun) {
        const marker = exists ? '~' : '+';
        console.log(` ${kleur.yellow(marker)} ${skill.name} ${oldVersion} -> ${skill.version} ${kleur.gray('(dry-run)')}`);
        updated += 1;
        continue;
      }

      // Copy into a staging directory next to the destination (same parent,
      // so the final rename stays on one filesystem) and swap it in only
      // once the copy is complete.
      const staging = path.join(targetDir, `.${skill.name}.${process.pid}.tmp`);
      await rmDir(staging);
      try {
        await copyDir(skill.sourceDir, staging);
        await rmDir(destination);
        await fs.rename(staging, destination);
      } catch (err) {
        await rmDir(staging);
        throw err;
      }

      nextLock.skills[skill.name] = {
        name: skill.name,
        version: skill.version,
        sourcePackage: PACKAGE_NAME,
        sourceGroup: skill.group,
        updatedAt: new Date().toISOString(),
      };
      updated += 1;
      console.log(` ${kleur.green('✓')} ${skill.name} ${kleur.gray(oldVersion)} -> ${kleur.green(skill.version)}`);
    }
  } finally {
    // Persist progress even when a later skill failed; otherwise the skills
    // installed so far would look unmanaged on the next run.
    if (!opts.dryRun && (updated > 0 || hadLock)) {
      nextLock.package = PACKAGE_NAME;
      nextLock.lockfileVersion = 1;
      nextLock.updatedAt = new Date().toISOString();
      await writeJSON(lockPath, nextLock);
    }
  }

  console.log(kleur.gray(` ${updated} changed, ${skipped} skipped, ${conflicts} conflict(s).`));
}
132
+
133
/**
 * Coerce whatever was read from the lock file into the expected shape.
 *
 * Missing or falsy fields fall back to defaults. `skills` is kept only when
 * it is a plain (non-array) object: entries are stored under string keys,
 * and string-keyed properties on an array are dropped by `JSON.stringify`,
 * which would silently lose every recorded skill on the next write.
 *
 * @param {unknown} lock - Parsed lock file content (possibly malformed).
 * @returns {{package: string, lockfileVersion: number, updatedAt: (string|null), skills: object}}
 */
function normalizeLock(lock) {
  const skills = lock?.skills;
  const hasSkillMap = Boolean(skills) && typeof skills === 'object' && !Array.isArray(skills);

  return {
    package: lock?.package || PACKAGE_NAME,
    lockfileVersion: lock?.lockfileVersion || 1,
    updatedAt: lock?.updatedAt || null,
    skills: hasSkillMap ? skills : {},
  };
}
141
+
142
/**
 * Tell whether a lock entry marks a skill as installed by this package.
 *
 * @param {object|undefined} lockEntry - Entry from the lock file's `skills` map, if any.
 * @returns {boolean} True only when the entry's `sourcePackage` is this package.
 */
function isManaged(lockEntry) {
  const owner = lockEntry?.sourcePackage;
  return owner === PACKAGE_NAME;
}
package/lib/utils.mjs ADDED
@@ -0,0 +1,120 @@
1
+ import fs from 'node:fs/promises';
2
+ import os from 'node:os';
3
+ import path from 'node:path';
4
+ import { fileURLToPath } from 'node:url';
5
+
6
+ export const PACKAGE_NAME = 'ethan-agent-skills';
7
+ export const LOCK_FILE = '.skills-lock.json';
8
+ export const PACKAGE_ROOT = path.resolve(
9
+ path.dirname(fileURLToPath(import.meta.url)),
10
+ '..',
11
+ );
12
+
13
+ export const CLIENT_DIRS = {
14
+ claude: path.join(os.homedir(), '.claude', 'skills'),
15
+ codex: path.join(os.homedir(), '.codex', 'skills'),
16
+ };
17
+
18
+ export const SOURCE_GROUPS = {
19
+ common: path.join(PACKAGE_ROOT, 'skills'),
20
+ claude: path.join(PACKAGE_ROOT, 'claude', 'skills'),
21
+ codex: path.join(PACKAGE_ROOT, 'codex', 'skills'),
22
+ agents: path.join(PACKAGE_ROOT, 'agents', 'skills'),
23
+ };
24
+
25
/**
 * Check whether a path is accessible.
 *
 * @param {string} targetPath - Path to probe with `fs.access`.
 * @returns {Promise<boolean>} True when the access check succeeds, false on any failure.
 */
export async function pathExists(targetPath) {
  return fs.access(targetPath).then(
    () => true,
    () => false,
  );
}
33
+
34
/**
 * Read and parse a JSON file, tolerating every failure.
 *
 * @param {string} file - Path of the JSON file.
 * @param {*} [fallback=null] - Value returned when the file cannot be read or parsed.
 * @returns {Promise<*>} The parsed content, or `fallback`.
 */
export async function readJSON(file, fallback = null) {
  let result = fallback;
  try {
    const raw = await fs.readFile(file, 'utf8');
    result = JSON.parse(raw);
  } catch {
    // Unreadable or malformed file: keep the fallback.
  }
  return result;
}
41
+
42
/**
 * Write `data` as pretty-printed JSON, replacing `file` via a temp-file rename.
 *
 * The parent directory is created when missing. The content is written to
 * `<file>.<pid>.tmp` and then renamed over `file`, so readers never observe a
 * half-written file. When the write or the rename fails, the temp file is
 * removed before the error is re-thrown so failures leave no debris behind.
 *
 * @param {string} file - Destination path.
 * @param {*} data - Value serialized with `JSON.stringify(data, null, 2)`.
 * @returns {Promise<void>}
 * @throws The original filesystem error when writing or renaming fails.
 */
export async function writeJSON(file, data) {
  await fs.mkdir(path.dirname(file), { recursive: true });
  const tempFile = `${file}.${process.pid}.tmp`;

  try {
    await fs.writeFile(tempFile, `${JSON.stringify(data, null, 2)}\n`);
    await fs.rename(tempFile, file);
  } catch (err) {
    // Best-effort cleanup; the original failure is the error worth reporting.
    await fs.rm(tempFile, { force: true }).catch(() => {});
    throw err;
  }
}
48
+
49
/**
 * Recursively copy the contents of `src` into `dest`.
 *
 * `dest` is created when missing. Sub-directories are copied recursively;
 * regular files and symbolic links are copied with `fs.copyFile`; any other
 * entry type is ignored.
 *
 * @param {string} src - Directory to copy from.
 * @param {string} dest - Directory to copy into.
 * @returns {Promise<void>}
 */
export async function copyDir(src, dest) {
  await fs.mkdir(dest, { recursive: true });

  const children = await fs.readdir(src, { withFileTypes: true });
  for (const child of children) {
    const from = path.join(src, child.name);
    const to = path.join(dest, child.name);

    if (child.isDirectory()) {
      await copyDir(from, to);
    } else if (child.isFile() || child.isSymbolicLink()) {
      await fs.copyFile(from, to);
    }
  }
}
66
+
67
/**
 * Remove a directory tree; a path that does not exist is not an error.
 *
 * @param {string} dir - Path to delete recursively.
 * @returns {Promise<void>}
 */
export async function rmDir(dir) {
  const removal = { recursive: true, force: true };
  await fs.rm(dir, removal);
}
70
+
71
/**
 * Determine the version of a bundled skill.
 *
 * Lookup order: a truthy `version` in `skill.json`, then a `version:` line in
 * the front matter at the start of `SKILL.md`, then the baseline `'0.0.0'`.
 *
 * The front-matter value is taken from the `version:` line only: whitespace
 * after the key never crosses a line break (an empty `version:` must not pick
 * up the following line), one pair of matching surrounding quotes is
 * removed, and an empty value falls through to the baseline.
 *
 * @param {string} skillDir - Directory containing `skill.json` and/or `SKILL.md`.
 * @returns {Promise<string>} The resolved version string.
 */
export async function readSkillVersion(skillDir) {
  const meta = await readJSON(path.join(skillDir, 'skill.json'));
  if (meta?.version) return String(meta.version);

  try {
    const md = await fs.readFile(path.join(skillDir, 'SKILL.md'), 'utf8');
    const frontmatter = md.match(/^---\s*\n([\s\S]*?)\n---/);
    const line = frontmatter?.[1]?.match(/^version:[ \t]*(.*)$/m);
    const raw = line ? line[1].trim() : '';
    const quoted = raw.match(/^(['"])(.*)\1$/);
    const version = (quoted ? quoted[2] : raw).trim();
    if (version) return version;
  } catch {
    // Missing or unreadable metadata falls back to the package baseline.
  }

  return '0.0.0';
}
86
+
87
/**
 * Determine the human-readable description of a bundled skill.
 *
 * Lookup order: a truthy `description` in `skill.json`, then a `description:`
 * line in the front matter at the start of `SKILL.md`, then `''`.
 *
 * The whole remainder of the `description:` line is used, so descriptions
 * that contain quotes or apostrophes in the middle of the sentence are
 * returned intact; only one pair of matching surrounding quotes is removed.
 * Whitespace after the key never crosses a line break, so an empty
 * `description:` does not pick up the following line.
 *
 * @param {string} skillDir - Directory containing `skill.json` and/or `SKILL.md`.
 * @returns {Promise<string>} The description, or an empty string when none is found.
 */
export async function readSkillDescription(skillDir) {
  const meta = await readJSON(path.join(skillDir, 'skill.json'));
  if (meta?.description) return String(meta.description);

  try {
    const md = await fs.readFile(path.join(skillDir, 'SKILL.md'), 'utf8');
    const frontmatter = md.match(/^---\s*\n([\s\S]*?)\n---/);
    const line = frontmatter?.[1]?.match(/^description:[ \t]*(.*)$/m);
    const raw = line ? line[1].trim() : '';
    const quoted = raw.match(/^(['"])(.*)\1$/);
    const description = (quoted ? quoted[2] : raw).trim();
    if (description) return description;
  } catch {
    // Description is optional.
  }

  return '';
}
102
+
103
/**
 * Validate and canonicalize a client name.
 *
 * A falsy value selects "claude"; matching is case-insensitive.
 *
 * @param {string} [client] - Requested client.
 * @returns {'claude'|'codex'|'all'} The lower-cased client name.
 * @throws {Error} When the client is not claude, codex, or all.
 */
export function normalizeClient(client) {
  const supported = ['claude', 'codex', 'all'];
  const value = String(client || 'claude').toLowerCase();

  if (!supported.includes(value)) {
    throw new Error(`Unsupported client "${client}". Use claude, codex, or all.`);
  }

  return value;
}
111
+
112
/**
 * Expand a leading `~` into the current user's home directory.
 *
 * Only the exact string `~` and paths starting with `~/` are expanded; every
 * other value (including null/undefined) is returned unchanged.
 *
 * @param {string} inputPath - Path as typed by the user.
 * @returns {string} The expanded path, or the input itself.
 */
export function expandHome(inputPath) {
  if (inputPath === '~') {
    return os.homedir();
  }

  const isUnderHome = inputPath?.startsWith('~/');
  return isUnderHome ? path.join(os.homedir(), inputPath.slice(2)) : inputPath;
}
117
+
118
/**
 * Turn a user-supplied target into an absolute path.
 *
 * A leading `~` is expanded first, then the result is resolved with
 * `path.resolve`.
 *
 * @param {string} inputPath - Target path as typed by the user.
 * @returns {string} Absolute path.
 */
export function resolveTarget(inputPath) {
  const expanded = expandHome(inputPath);
  return path.resolve(expanded);
}
package/package.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "name": "ethan-agent-skills",
3
+ "version": "0.1.0",
4
+ "description": "Agent skills published from my_agent_skill",
5
+ "type": "module",
6
+ "bin": {
7
+ "skills": "bin/skills.mjs"
8
+ },
9
+ "files": [
10
+ "bin/",
11
+ "lib/",
12
+ "skills/",
13
+ "codex/",
14
+ "claude/",
15
+ "agents/",
16
+ "README.md",
17
+ "LICENSE"
18
+ ],
19
+ "engines": {
20
+ "node": ">=18"
21
+ },
22
+ "scripts": {
23
+ "test:local": "rm -rf /tmp/my-agent-skill-test-local && node bin/skills.mjs --help && node bin/skills.mjs --version && node bin/skills.mjs list && node bin/skills.mjs update --dry-run --target /tmp/my-agent-skill-test-local && node bin/skills.mjs update --target /tmp/my-agent-skill-test-local",
24
+ "pack:check": "npm pack --dry-run"
25
+ },
26
+ "dependencies": {
27
+ "commander": "^12.1.0",
28
+ "kleur": "^4.1.5"
29
+ },
30
+ "repository": {
31
+ "type": "git",
32
+ "url": "git+https://github.com/EthenZhang/my_agent_skill.git"
33
+ },
34
+ "keywords": [
35
+ "agent",
36
+ "claude",
37
+ "claude-code",
38
+ "codex",
39
+ "skills",
40
+ "cli"
41
+ ],
42
+ "license": "MIT",
43
+ "publishConfig": {
44
+ "access": "public"
45
+ }
46
+ }
@@ -0,0 +1,214 @@
1
+ ---
2
+ name: pdf-extract
3
+ description: Extract text from PDF files and save as clean markdown documents. Use this skill whenever a PDF file needs to be converted to readable text, especially for insurance policies, financial documents, scanned PDFs, or mixed text/image PDFs. Trigger on requests like "extract this PDF", "convert PDF to markdown", "read this PDF", "process this PDF", "OCR this PDF", or when the user drops a PDF and asks for its contents. This skill handles both text-based PDFs and scanned/image-based PDFs using macOS-native tools (no poppler or tesseract needed).
4
+ ---
5
+
6
+ # PDF Extract Skill
7
+
8
+ Extract complete text from PDF files and save as clean markdown. Handles text-based PDFs, scanned/image pages, and mixed documents — all using macOS native frameworks (no external dependencies beyond pyobjc, which comes with the system Python on macOS).
9
+
10
+ ## When to Use
11
+
12
+ - A user uploads a PDF and wants its content extracted to text or markdown
13
+ - The PDF contains a mix of text and scanned pages (e.g., insurance policy booklets)
14
+ - Scanned pages are embedded within otherwise text-based PDFs
15
+ - Large PDFs where the extracted text exceeds normal read limits
16
+ - Financial documents, insurance policies, contracts, or reports that need structured extraction
17
+
18
+ ## Workflow Overview
19
+
20
+ 1. **Try text extraction first** — use PDFKit to get native text
21
+ 2. **Detect scanned pages** — pages with little or no extracted text are likely scans
22
+ 3. **Render scanned pages as images** — convert them to PNGs at high resolution
23
+ 4. **Extract text from images** — use multimodal vision to OCR the rendered pages
24
+ 5. **Crop if needed** — isolate specific regions (tables, signatures) from page images
25
+ 6. **Assemble and save** — combine all extracted text into a clean markdown document
26
+
27
+ ## Step-by-Step Instructions
28
+
29
+ ### Step 1: Inspect the PDF
30
+
31
+ First, check the PDF file path and try basic text extraction on the first page to gauge quality:
32
+
33
+ ```python
34
import sys

import Quartz

pdf_path = "/path/to/file.pdf"
url = Quartz.NSURL.fileURLWithPath_(pdf_path)
# Open the document from the NSURL built on the previous line.
doc = Quartz.PDFDocument.alloc().initWithURL_(url)

if doc is None:
    # Cannot read PDF
    sys.exit(f"Cannot read PDF: {pdf_path}")

page_count = doc.pageCount()
print(f"Pages: {page_count}")

# Quick quality check on first page.
# page.string() may be None when the page has no text layer.
p1 = doc.pageAtIndex_(0)
text = (p1.string() if p1 else None) or ""
print(f"First page text length: {len(text)}")
if len(text) < 50:
    print("WARNING: Low text extraction — likely a scanned PDF")
55
+ ```
56
+
57
+ ### Step 2: Extract All Text-Based Pages
58
+
59
+ Attempt to extract text from every page. Pages with meaningful text (>50 chars) are text-based; pages with very little text (<10 chars, often just whitespace or decorative elements) are scanned images.
60
+
61
+ ```python
62
+ import sys, Quartz
63
+
64
+ pdf_path = "/path/to/file.pdf"
65
+ url = Quartz.NSURL.fileURLWithPath_(pdf_path)
66
+ doc = Quartz.PDFDocument.alloc().initWithURL_(url)
67
+
68
+ pages_text = []
69
+ scanned_pages = []
70
+
71
+ for i in range(doc.pageCount()):
72
+ page = doc.pageAtIndex_(i)
73
+ text = page.string() if page else ""
74
+ if len(text.strip()) < 10:
75
+ scanned_pages.append(i)
76
+ else:
77
+ pages_text.append({"page": i + 1, "text": text})
78
+
79
+ print(f"Text pages: {len(pages_text)}, Scanned pages: {scanned_pages}")
80
+ ```
81
+
82
+ ### Step 3: Render Scanned Pages as PNG Images
83
+
84
+ For each scanned page, render it to a PNG image at 6x scale (300+ DPI equivalent) for clarity. Lower scales (3x-4x) may not be readable for dense financial tables.
85
+
86
+ ```python
87
import os

import Quartz

pdf_path = "/path/to/file.pdf"
url = Quartz.NSURL.fileURLWithPath_(pdf_path)
doc = Quartz.PDFDocument.alloc().initWithURL_(url)
output_dir = "/tmp/pdf_rendered/"
os.makedirs(output_dir, exist_ok=True)

scale = 6.0  # 6x scale for dense tables

# `scanned_pages` is the list of 0-based page indices collected in Step 2.
for page_idx in scanned_pages:
    page = doc.pageAtIndex_(page_idx)
    media_box = page.boundsForBox_(Quartz.kCGPDFMediaBox)

    pw = int(media_box.size.width * scale)
    ph = int(media_box.size.height * scale)

    cs = Quartz.CGColorSpaceCreateDeviceRGB()
    ctx = Quartz.CGBitmapContextCreate(
        None, pw, ph, 8, pw * 4, cs,
        Quartz.kCGImageAlphaPremultipliedLast
    )

    # White background
    Quartz.CGContextSetRGBFillColor(ctx, 1.0, 1.0, 1.0, 1.0)
    Quartz.CGContextFillRect(ctx, Quartz.CGRectMake(0, 0, pw, ph))
    Quartz.CGContextScaleCTM(ctx, scale, scale)
    page.drawWithBox_toContext_(Quartz.kCGPDFMediaBox, ctx)

    cg_img = Quartz.CGBitmapContextCreateImage(ctx)

    # Create ONE destination, add the image, then finalize it; nothing is
    # written to disk until CGImageDestinationFinalize is called.
    out_path = f"{output_dir}page_{page_idx+1}.png"
    dest = Quartz.CGImageDestinationCreateWithURL(
        Quartz.NSURL.fileURLWithPath_(out_path),
        Quartz.kUTTypePNG, 1, None
    )
    Quartz.CGImageDestinationAddImage(dest, cg_img, None)
    Quartz.CGImageDestinationFinalize(dest)

    print(f"Rendered page {page_idx+1} to {out_path} ({pw}x{ph})")
133
+ ```
134
+
135
+ > **Scale reference:** 3x works for simple text, 4x for most documents, 5x for dense forms, 6x for financial tables with small fonts. Start at 4x and go up if text isn't legible.
136
+
137
+ ### Step 4: Crop Specific Regions (Optional)
138
+
139
+ When only part of a page is relevant (e.g., a specific table), crop the rendered image to that region. The crop rectangle is expressed in the pixel coordinates of the rendered image, whose origin (0,0) is at the **top-left** — unlike PDF page space, whose origin is at the bottom-left.
140
+
141
+ ```python
142
+ import sys, Quartz
143
+
144
+ # After rendering to cg_img (before saving), crop a region:
145
+ # crop_rect = Quartz.CGRectMake(x * scale, y * scale, width * scale, height * scale)
146
+ # cg_img = Quartz.CGImageCreateWithImageInRect(cg_img, crop_rect)
147
+ ```
148
+
149
+ > **Finding coordinates:** Render the full page first, view the image, then estimate the crop rectangle. The page bounds are in PDF points (1 point = 1/72 inch). A standard A4 page is roughly 595x842 points. Multiply by `scale` for the pixel-space crop rectangle.
150
+
151
+ ### Step 5: Extract Text from Rendered Images
152
+
153
+ Use multimodal vision to extract text from the rendered PNG images. This works best with the Read tool on the image files.
154
+
155
+ ### Step 6: Assemble and Save as Markdown
156
+
157
+ Combine all extracted content into a structured markdown document. Follow the Obsidian wiki schema if the target vault uses it:
158
+
159
+ ```markdown
160
+ ---
161
+ type: source
162
+ tags: [category, subcategory]
163
+ date: YYYY-MM-DD
164
+ source_count: 1
165
+ sources: [[original-filename.pdf]]
166
+ status: active
167
+ ---
168
+
169
+ # Document Title
170
+
171
+ > Brief description of the document
172
+
173
+ ## Key Data
174
+
175
+ [Extracted tables as markdown tables]
176
+
177
+ ## Main Content
178
+
179
+ [Extracted body text, organized by sections]
180
+
181
+ ## Important Notes
182
+
183
+ [Any specific details, warnings, or risks mentioned]
184
+ ```
185
+
186
+ ## Handling Large PDFs
187
+
188
+ When the extracted text is very long and would exceed output limits:
189
+
190
+ - Process pages in batches of 5-10
191
+ - Save intermediate results to a temporary file
192
+ - For text-based extraction, use chunked reading with offset/limit if reading from a file:
193
+
194
+ ```python
195
+ # Write extracted text to a file first, then read in chunks
196
+ with open("/tmp/extracted.txt", "w") as f:
197
+ for page_data in pages_text:
198
+ f.write(f"\n--- Page {page_data['page']} ---\n")
199
+ f.write(page_data["text"])
200
+ ```
201
+
202
+ ## Common Pitfalls
203
+
204
+ - **Low resolution renders:** If OCR quality is poor, increase the scale from 4x to 6x. Dense tables with small fonts almost always need 6x.
205
+ - **Page orientation:** Some PDFs have rotated pages. Check `media_box` dimensions to detect landscape pages.
206
+ - **Watermarks/overlays:** Background watermarks can interfere with OCR. If pages have heavy watermarks, try cropping to the content region.
207
+ - **Mixed content pages:** A page might have both text and scanned elements. The `< 10 chars` threshold detects pure scans, but pages with partial text need manual review.
208
+ - **pyobjc availability:** On macOS, pyobjc is pre-installed with the system Python. Use `python3` from the system, not a Homebrew Python that may lack the Quartz bindings.
209
+
210
+ ## Dependencies
211
+
212
+ - macOS (required — uses Quartz/PDFKit frameworks)
213
+ - pyobjc (pre-installed on macOS with system Python)
214
+ - No additional packages needed (no poppler, no tesseract, no PIL)
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ pdf_extract.py — Extract text from PDF files on macOS using PDFKit + Quartz.
4
+
5
+ Usage:
6
+ python3 pdf_extract.py <pdf_path> [--render-scanned] [--output-dir DIR] [--scale N]
7
+
8
+ Options:
9
+ --render-scanned Render scanned/blank pages as PNG images for OCR
10
+ --output-dir DIR Directory for rendered page images (default: /tmp/pdf_rendered/)
11
+ --scale N Render scale multiplier (default: 6.0)
12
+
13
+ Output:
14
+ - Prints extracted text to stdout, page by page
15
+ - If --render-scanned is set, saves scanned pages as PNGs to output_dir
16
+ - Prints a JSON summary to stderr with page counts and scanned page indices
17
+ """
18
+
19
+ import sys
20
+ import os
21
+ import json
22
+ import argparse
23
+
24
+ import Quartz
25
+
26
+
27
def extract_text(pdf_path, render_scanned=False, output_dir=None, scale=6.0):
    """Extract the text of every page of a PDF and report scanned pages.

    Pages whose stripped text is shorter than 10 characters are treated as
    scanned. Text pages are printed to stdout; a human-readable summary line
    followed by a JSON summary is printed to stderr.

    Args:
        pdf_path: Path of the PDF file to open.
        render_scanned: When true, every scanned page is rendered to a PNG
            via ``_render_page``.
        output_dir: Directory for rendered PNGs; defaults to
            ``/tmp/pdf_rendered/`` and is created on first use.
        scale: Render scale multiplier passed to ``_render_page``.

    Returns:
        dict with ``total_pages``, ``text_pages``, ``scanned_pages``
        (0-based indices) and ``scanned_count``.

    Exits the process with status 1 when the PDF cannot be opened.
    """
    url = Quartz.NSURL.fileURLWithPath_(pdf_path)
    doc = Quartz.PDFDocument.alloc().initWithURL_(url)

    if doc is None:
        print(f"ERROR: Cannot open PDF: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    page_count = doc.pageCount()
    pages_text = []
    scanned_pages = []

    for i in range(page_count):
        page = doc.pageAtIndex_(i)
        # page.string() may be None for a page without a text layer, which is
        # exactly the scanned case; normalize it to "" instead of crashing.
        text = (page.string() if page else None) or ""

        if len(text.strip()) < 10:
            scanned_pages.append(i)
            # A missing page object cannot be rendered; it is still reported.
            if render_scanned and page:
                if output_dir is None:
                    output_dir = "/tmp/pdf_rendered/"
                os.makedirs(output_dir, exist_ok=True)
                _render_page(page, i, output_dir, scale)
        else:
            pages_text.append({"page": i + 1, "text": text})

    # Print extracted text
    for entry in pages_text:
        print(f"\n--- Page {entry['page']} ---")
        print(entry["text"])

    # Summary to stderr
    summary = {
        "total_pages": page_count,
        "text_pages": len(pages_text),
        "scanned_pages": scanned_pages,
        "scanned_count": len(scanned_pages),
    }
    print(f"\n[Summary] {summary['text_pages']} text pages, {summary['scanned_count']} scanned pages: {scanned_pages}", file=sys.stderr)
    json.dump(summary, sys.stderr)
    print("", file=sys.stderr)

    return summary
70
+
71
+
72
def _render_page(page, page_idx, output_dir, scale):
    """Render one PDF page to ``<output_dir>/page_<n>.png`` on a white background.

    The bitmap is sized from the page's media box multiplied by ``scale``.
    Progress and failures are reported on stderr; a failure to create the
    bitmap, create the image destination, or finalize the PNG is reported
    instead of being announced as a successful render.

    Args:
        page: PDFPage to draw.
        page_idx: 0-based page index; the file name uses ``page_idx + 1``.
        output_dir: Existing directory that receives the PNG.
        scale: Multiplier applied to the media-box size.
    """
    media_box = page.boundsForBox_(Quartz.kCGPDFMediaBox)
    pw = int(media_box.size.width * scale)
    ph = int(media_box.size.height * scale)

    cs = Quartz.CGColorSpaceCreateDeviceRGB()
    ctx = Quartz.CGBitmapContextCreate(
        None, pw, ph, 8, pw * 4, cs,
        Quartz.kCGImageAlphaPremultipliedLast
    )
    if ctx is None:
        print(f" [Render failed] page {page_idx + 1}: cannot create a {pw}x{ph} bitmap", file=sys.stderr)
        return

    # Fill with white first so transparent page areas do not come out black.
    Quartz.CGContextSetRGBFillColor(ctx, 1.0, 1.0, 1.0, 1.0)
    Quartz.CGContextFillRect(ctx, Quartz.CGRectMake(0, 0, pw, ph))
    Quartz.CGContextScaleCTM(ctx, scale, scale)
    page.drawWithBox_toContext_(Quartz.kCGPDFMediaBox, ctx)

    cg_img = Quartz.CGBitmapContextCreateImage(ctx)
    out_path = os.path.join(output_dir, f"page_{page_idx + 1}.png")

    dest = Quartz.CGImageDestinationCreateWithURL(
        Quartz.NSURL.fileURLWithPath_(out_path),
        Quartz.kUTTypePNG, 1, None
    )
    if dest is None:
        print(f" [Render failed] page {page_idx + 1}: cannot create {out_path}", file=sys.stderr)
        return

    Quartz.CGImageDestinationAddImage(dest, cg_img, None)
    # Finalize performs the actual write and reports whether it succeeded.
    if not Quartz.CGImageDestinationFinalize(dest):
        print(f" [Render failed] page {page_idx + 1}: could not write {out_path}", file=sys.stderr)
        return

    print(f" [Rendered] page {page_idx + 1} -> {out_path} ({pw}x{ph})", file=sys.stderr)
99
+
100
+
101
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the extraction.
    cli = argparse.ArgumentParser(description="Extract text from PDF on macOS")
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument(
        "--render-scanned",
        action="store_true",
        help="Render scanned/blank pages as PNG images",
    )
    cli.add_argument(
        "--output-dir",
        default="/tmp/pdf_rendered/",
        help="Output directory for rendered images",
    )
    cli.add_argument(
        "--scale",
        type=float,
        default=6.0,
        help="Scale multiplier for rendering (default: 6.0)",
    )
    options = cli.parse_args()

    extract_text(
        options.pdf_path,
        options.render_scanned,
        options.output_dir,
        options.scale,
    )
@@ -0,0 +1,5 @@
1
+ {
2
+ "name": "pdf-extract",
3
+ "version": "0.1.0",
4
+ "description": ""
5
+ }