ethan-agent-skills 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +140 -0
- package/agents/skills/openspec-apply-change/SKILL.md +156 -0
- package/agents/skills/openspec-apply-change/skill.json +5 -0
- package/agents/skills/openspec-archive-change/SKILL.md +114 -0
- package/agents/skills/openspec-archive-change/skill.json +5 -0
- package/agents/skills/openspec-explore/SKILL.md +288 -0
- package/agents/skills/openspec-explore/skill.json +5 -0
- package/agents/skills/openspec-propose/SKILL.md +110 -0
- package/agents/skills/openspec-propose/skill.json +5 -0
- package/agents/skills/source-command-opsx-apply/SKILL.md +156 -0
- package/agents/skills/source-command-opsx-apply/skill.json +5 -0
- package/agents/skills/source-command-opsx-archive/SKILL.md +161 -0
- package/agents/skills/source-command-opsx-archive/skill.json +5 -0
- package/agents/skills/source-command-opsx-explore/SKILL.md +177 -0
- package/agents/skills/source-command-opsx-explore/skill.json +5 -0
- package/agents/skills/source-command-opsx-propose/SKILL.md +110 -0
- package/agents/skills/source-command-opsx-propose/skill.json +5 -0
- package/bin/skills.mjs +38 -0
- package/claude/commands/opsx/apply.md +152 -0
- package/claude/commands/opsx/archive.md +157 -0
- package/claude/commands/opsx/explore.md +173 -0
- package/claude/commands/opsx/propose.md +106 -0
- package/claude/skills/openspec-apply-change/SKILL.md +156 -0
- package/claude/skills/openspec-apply-change/skill.json +5 -0
- package/claude/skills/openspec-archive-change/SKILL.md +114 -0
- package/claude/skills/openspec-archive-change/skill.json +5 -0
- package/claude/skills/openspec-explore/SKILL.md +288 -0
- package/claude/skills/openspec-explore/skill.json +5 -0
- package/claude/skills/openspec-propose/SKILL.md +110 -0
- package/claude/skills/openspec-propose/skill.json +5 -0
- package/codex/skills/openspec-apply-change/SKILL.md +156 -0
- package/codex/skills/openspec-apply-change/skill.json +5 -0
- package/codex/skills/openspec-archive-change/SKILL.md +114 -0
- package/codex/skills/openspec-archive-change/skill.json +5 -0
- package/codex/skills/openspec-explore/SKILL.md +288 -0
- package/codex/skills/openspec-explore/skill.json +5 -0
- package/codex/skills/openspec-propose/SKILL.md +110 -0
- package/codex/skills/openspec-propose/skill.json +5 -0
- package/lib/discover.mjs +60 -0
- package/lib/list.mjs +22 -0
- package/lib/update.mjs +144 -0
- package/lib/utils.mjs +120 -0
- package/package.json +46 -0
- package/skills/pdf-extract/SKILL.md +214 -0
- package/skills/pdf-extract/scripts/pdf_extract.py +112 -0
- package/skills/pdf-extract/skill.json +5 -0
package/lib/update.mjs
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import kleur from 'kleur';
|
|
4
|
+
import { discoverBundledSkills } from './discover.mjs';
|
|
5
|
+
import {
|
|
6
|
+
CLIENT_DIRS,
|
|
7
|
+
copyDir,
|
|
8
|
+
LOCK_FILE,
|
|
9
|
+
PACKAGE_NAME,
|
|
10
|
+
pathExists,
|
|
11
|
+
readJSON,
|
|
12
|
+
resolveTarget,
|
|
13
|
+
rmDir,
|
|
14
|
+
writeJSON,
|
|
15
|
+
} from './utils.mjs';
|
|
16
|
+
|
|
17
|
+
/**
 * Sync the bundled skills into every target directory selected by `opts`.
 *
 * Targets come from `resolveTargets(opts)` and are processed sequentially,
 * one after another, in the order that function returns them.
 *
 * @param {object} [opts] - Options, forwarded unchanged to `resolveTargets`
 *   and `syncToTarget`.
 * @returns {Promise<void>}
 */
export async function update(opts = {}) {
  for (const { path: targetDir, groups } of resolveTargets(opts)) {
    // Discover per target: each target may use a different set of groups.
    const bundled = await discoverBundledSkills(groups, { dedupe: true });
    await syncToTarget(targetDir, bundled, opts);
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Turn the CLI options into the list of directories to sync.
 *
 * An explicit `opts.target` always wins and produces a single "custom"
 * target whose skill groups are still chosen by the client name. Otherwise
 * the client name selects one or both default client directories.
 *
 * @param {object} opts - Options; reads `opts.client` (defaults to
 *   "claude", compared case-insensitively) and `opts.target`.
 * @returns {Array<{name: string, path: string, groups: string[]}>}
 * @throws {Error} When the client is not claude, codex, or all.
 */
function resolveTargets(opts) {
  const client = String(opts.client || 'claude').toLowerCase();

  // Builds the descriptor for one of the default client directories.
  const defaultTarget = (name) => ({
    name,
    path: CLIENT_DIRS[name],
    groups: groupsForClient(name),
  });

  if (opts.target) {
    const custom = {
      name: 'custom',
      path: resolveTarget(opts.target),
      groups: groupsForClient(client),
    };
    return [custom];
  }

  switch (client) {
    case 'claude':
    case 'codex':
      return [defaultTarget(client)];
    case 'all':
      return [defaultTarget('claude'), defaultTarget('codex')];
    default:
      // Report the value exactly as the caller supplied it.
      throw new Error(`Unsupported client "${opts.client}". Use claude, codex, or all.`);
  }
}
|
|
56
|
+
|
|
57
|
+
/**
 * List the bundled skill groups that apply to a client.
 *
 * Every supported client receives the "common" group plus its own group;
 * "all" receives common, claude, and codex.
 *
 * @param {string} client - Client name; must already be lower-case.
 * @returns {string[]} A fresh array of group names.
 * @throws {Error} When the client is not claude, codex, or all.
 */
function groupsForClient(client) {
  switch (client) {
    case 'claude':
      return ['common', 'claude'];
    case 'codex':
      return ['common', 'codex'];
    case 'all':
      return ['common', 'claude', 'codex'];
    default:
      throw new Error(`Unsupported client "${client}". Use claude, codex, or all.`);
  }
}
|
|
63
|
+
|
|
64
|
+
/**
 * Copy each bundled skill into `targetDir` and record it in the lock file.
 *
 * For every skill, in order:
 *  - a destination that exists without a lock entry owned by this package is
 *    reported as a conflict and left untouched;
 *  - a destination whose locked version equals the bundled version is
 *    skipped unless `opts.force` is set;
 *  - otherwise the destination is removed and re-copied, and the lock entry
 *    is refreshed. With `opts.dryRun` only the intended change is printed.
 *
 * The lock file is rewritten at the end of a real (non-dry) run when at
 * least one skill changed or a lock file already existed.
 *
 * @param {string} targetDir - Directory that receives one folder per skill.
 * @param {Array<{name: string, version: string, group: string, sourceDir: string}>} skills
 *   Bundled skills to sync.
 * @param {object} opts - Options; reads `opts.dryRun` and `opts.force`.
 * @returns {Promise<void>}
 */
async function syncToTarget(targetDir, skills, opts) {
  console.log(kleur.cyan(`\n-> Syncing ${skills.length} skill(s) to ${targetDir}`));

  if (skills.length === 0) {
    console.log(kleur.yellow(' No bundled skills found for this target.'));
    return;
  }

  const lockPath = path.join(targetDir, LOCK_FILE);
  const hadLock = await pathExists(lockPath);
  const lock = normalizeLock(await readJSON(lockPath, {}));
  // Decisions are made against `lock`; changes accumulate in the copy.
  const nextLock = structuredClone(lock);
  const tally = { updated: 0, skipped: 0, conflicts: 0 };

  if (!opts.dryRun) {
    await fs.mkdir(targetDir, { recursive: true });
  }

  for (const { name, version, group, sourceDir } of skills) {
    const destination = path.join(targetDir, name);
    const lockEntry = lock.skills[name];
    const exists = await pathExists(destination);

    // Never overwrite a folder this package did not install.
    if (exists && !isManaged(lockEntry)) {
      console.log(` ${kleur.red('!')} ${name} ${kleur.red('(unmanaged destination exists; skipped)')}`);
      tally.conflicts += 1;
      continue;
    }

    const upToDate = exists && lockEntry?.version === version;
    if (!opts.force && upToDate) {
      console.log(` ${kleur.gray('=')} ${name} ${kleur.gray(`(${version}, up-to-date)`)}`);
      tally.skipped += 1;
      continue;
    }

    const oldVersion = lockEntry?.version ?? 'none';

    if (opts.dryRun) {
      // "~" marks a replacement, "+" a fresh install.
      const marker = exists ? '~' : '+';
      console.log(` ${kleur.yellow(marker)} ${name} ${oldVersion} -> ${version} ${kleur.gray('(dry-run)')}`);
      tally.updated += 1;
      continue;
    }

    // Replace wholesale so files removed upstream do not linger.
    await rmDir(destination);
    await copyDir(sourceDir, destination);
    nextLock.skills[name] = {
      name,
      version,
      sourcePackage: PACKAGE_NAME,
      sourceGroup: group,
      updatedAt: new Date().toISOString(),
    };
    tally.updated += 1;
    console.log(` ${kleur.green('✓')} ${name} ${kleur.gray(oldVersion)} -> ${kleur.green(version)}`);
  }

  const shouldWriteLock = !opts.dryRun && (tally.updated > 0 || hadLock);
  if (shouldWriteLock) {
    nextLock.package = PACKAGE_NAME;
    nextLock.lockfileVersion = 1;
    nextLock.updatedAt = new Date().toISOString();
    await writeJSON(lockPath, nextLock);
  }

  console.log(kleur.gray(` ${tally.updated} changed, ${tally.skipped} skipped, ${tally.conflicts} conflict(s).`));
}
|
|
132
|
+
|
|
133
|
+
/**
 * Coerce whatever was read from the lock file into a well-formed lock object.
 *
 * Missing or falsy top-level fields get defaults. `skills` is kept only when
 * it is a plain keyed object: an array is rejected too, because entries
 * assigned to an array by skill name are silently dropped by
 * `JSON.stringify`, which would lose every lock entry on the next write.
 *
 * @param {unknown} lock - Parsed lock file content (may be null/undefined).
 * @returns {{package: string, lockfileVersion: number, updatedAt: (string|null), skills: object}}
 */
function normalizeLock(lock) {
  const skills = lock?.skills;
  const isSkillMap = skills !== null && typeof skills === 'object' && !Array.isArray(skills);

  return {
    package: lock?.package || PACKAGE_NAME,
    lockfileVersion: lock?.lockfileVersion || 1,
    updatedAt: lock?.updatedAt || null,
    skills: isSkillMap ? skills : {},
  };
}
|
|
141
|
+
|
|
142
|
+
/**
 * Tell whether a lock entry was written by this package.
 *
 * @param {object|null|undefined} lockEntry - Entry from the lock's `skills` map.
 * @returns {boolean} True only when `sourcePackage` equals this package's name.
 */
function isManaged(lockEntry) {
  const owner = lockEntry?.sourcePackage;
  return owner === PACKAGE_NAME;
}
|
package/lib/utils.mjs
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import os from 'node:os';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import { fileURLToPath } from 'node:url';
|
|
5
|
+
|
|
6
|
+
// Name of this package.
export const PACKAGE_NAME = 'ethan-agent-skills';

// File name of the lock file.
export const LOCK_FILE = '.skills-lock.json';

// The package root is the parent of the directory that contains this module.
const moduleDir = path.dirname(fileURLToPath(import.meta.url));
export const PACKAGE_ROOT = path.resolve(moduleDir, '..');

// <home>/<dotDir>/skills
const homeSkillsDir = (dotDir) => path.join(os.homedir(), dotDir, 'skills');

// Default skills directory for each supported client.
export const CLIENT_DIRS = {
  claude: homeSkillsDir('.claude'),
  codex: homeSkillsDir('.codex'),
};

// <package root>/<segments...>
const inPackage = (...segments) => path.join(PACKAGE_ROOT, ...segments);

// Directory inside the package that holds each group of bundled skills.
export const SOURCE_GROUPS = {
  common: inPackage('skills'),
  claude: inPackage('claude', 'skills'),
  codex: inPackage('codex', 'skills'),
  agents: inPackage('agents', 'skills'),
};
|
|
24
|
+
|
|
25
|
+
/**
 * Check whether a path can be accessed.
 *
 * Any failure of `fs.access` (missing path, permission error, ...) is
 * reported as `false`.
 *
 * @param {string} targetPath - File or directory path to probe.
 * @returns {Promise<boolean>}
 */
export async function pathExists(targetPath) {
  let accessible = true;

  try {
    await fs.access(targetPath);
  } catch {
    accessible = false;
  }

  return accessible;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Read and parse a JSON file, best-effort.
 *
 * @param {string} file - Path of the JSON file.
 * @param {*} [fallback=null] - Value returned when the file cannot be read
 *   or does not contain valid JSON.
 * @returns {Promise<*>} The parsed content, or `fallback`.
 */
export async function readJSON(file, fallback = null) {
  let parsed;

  try {
    const raw = await fs.readFile(file, 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    // Unreadable and malformed files are treated alike.
    return fallback;
  }

  return parsed;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Write `data` as pretty-printed JSON (2-space indent, trailing newline).
 *
 * The content is written to a sibling temp file first and then renamed over
 * `file`, so readers never observe a half-written file. The parent directory
 * is created when missing. If the write or the rename fails, the temp file
 * is removed again before the error is rethrown.
 *
 * @param {string} file - Destination path.
 * @param {*} data - JSON-serializable value.
 * @returns {Promise<void>}
 * @throws Whatever `JSON.stringify`, `fs.writeFile` or `fs.rename` throws.
 */
export async function writeJSON(file, data) {
  await fs.mkdir(path.dirname(file), { recursive: true });
  const tempFile = `${file}.${process.pid}.tmp`;

  try {
    await fs.writeFile(tempFile, `${JSON.stringify(data, null, 2)}\n`);
    await fs.rename(tempFile, file);
  } catch (err) {
    // Best-effort cleanup; a cleanup failure must not mask the real error.
    await fs.rm(tempFile, { force: true }).catch(() => {});
    throw err;
  }
}
|
|
48
|
+
|
|
49
|
+
/**
 * Recursively copy the directory `src` into `dest`.
 *
 * `dest` (and any missing parents) is created first. Sub-directories are
 * copied recursively; regular files and symbolic links are copied with
 * `fs.copyFile`; any other entry type is ignored. Entries are processed one
 * at a time.
 *
 * @param {string} src - Existing source directory.
 * @param {string} dest - Destination directory.
 * @returns {Promise<void>}
 */
export async function copyDir(src, dest) {
  await fs.mkdir(dest, { recursive: true });

  const entries = await fs.readdir(src, { withFileTypes: true });
  for (const dirent of entries) {
    const from = path.join(src, dirent.name);
    const to = path.join(dest, dirent.name);

    if (dirent.isDirectory()) {
      await copyDir(from, to);
    } else if (dirent.isFile() || dirent.isSymbolicLink()) {
      await fs.copyFile(from, to);
    }
  }
}
|
|
66
|
+
|
|
67
|
+
/**
 * Remove a directory and everything below it.
 *
 * Uses `force`, so a path that does not exist is not an error.
 *
 * @param {string} dir - Directory to remove.
 * @returns {Promise<void>}
 */
export async function rmDir(dir) {
  const removeOptions = { recursive: true, force: true };
  await fs.rm(dir, removeOptions);
}
|
|
70
|
+
|
|
71
|
+
/**
 * Determine the version of the skill stored in `skillDir`.
 *
 * Lookup order:
 *  1. the `version` field of `skill.json`;
 *  2. a `version:` line in the front matter of `SKILL.md`;
 *  3. the baseline "0.0.0".
 *
 * @param {string} skillDir - Directory of a single skill.
 * @returns {Promise<string>} The version string, never empty.
 */
export async function readSkillVersion(skillDir) {
  const meta = await readJSON(path.join(skillDir, 'skill.json'));
  if (meta?.version) return String(meta.version);

  try {
    const md = await fs.readFile(path.join(skillDir, 'SKILL.md'), 'utf8');
    const frontmatter = md.match(/^---\s*\n([\s\S]*?)\n---/);
    // Only spaces/tabs may follow the colon: `\s*` would also match the line
    // break, so an empty `version:` would pick up the next line as its value.
    const match = frontmatter?.[1]?.match(/^version:[ \t]*['"]?([^'"\n]+)['"]?$/m);
    const version = match?.[1].trim();
    if (version) return version;
  } catch {
    // Missing or unreadable metadata falls back to the package baseline.
  }

  return '0.0.0';
}
|
|
86
|
+
|
|
87
|
+
/**
 * Determine the description of the skill stored in `skillDir`.
 *
 * Lookup order:
 *  1. the `description` field of `skill.json`;
 *  2. a `description:` line in the front matter of `SKILL.md`;
 *  3. the empty string.
 *
 * @param {string} skillDir - Directory of a single skill.
 * @returns {Promise<string>} The description, or '' when none is found.
 */
export async function readSkillDescription(skillDir) {
  const meta = await readJSON(path.join(skillDir, 'skill.json'));
  if (meta?.description) return String(meta.description);

  try {
    const md = await fs.readFile(path.join(skillDir, 'SKILL.md'), 'utf8');
    const frontmatter = md.match(/^---\s*\n([\s\S]*?)\n---/)?.[1];

    // Plain or simply-quoted value without inner quotes. Only spaces/tabs may
    // follow the colon so an empty value cannot capture the next line.
    const simple = frontmatter?.match(/^description:[ \t]*['"]?([^'"\n]+)['"]?$/m);
    if (simple) return simple[1].trim();

    // Values that contain quotes themselves (e.g. `like "extract this PDF"`)
    // never match the pattern above; take the rest of the line instead and
    // drop one matching pair of surrounding quotes, if present.
    const wholeLine = frontmatter?.match(/^description:[ \t]*(.*)$/m);
    if (wholeLine) {
      const raw = wholeLine[1].trim();
      const wrapped = raw.match(/^(['"])(.*)\1$/);
      return wrapped ? wrapped[2].trim() : raw;
    }
  } catch {
    // Description is optional.
  }

  return '';
}
|
|
102
|
+
|
|
103
|
+
/**
 * Validate a client name and return it in lower case.
 *
 * A falsy `client` defaults to "claude".
 *
 * @param {*} client - Client name as supplied by the caller.
 * @returns {'claude'|'codex'|'all'}
 * @throws {Error} When the name is not claude, codex, or all.
 */
export function normalizeClient(client) {
  const supported = ['claude', 'codex', 'all'];
  const value = String(client || 'claude').toLowerCase();

  if (!supported.includes(value)) {
    // Report the value exactly as the caller supplied it.
    throw new Error(`Unsupported client "${client}". Use claude, codex, or all.`);
  }

  return value;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Expand a leading "~" to the current user's home directory.
 *
 * Only the exact string "~" and paths starting with "~/" are expanded;
 * every other value (including null/undefined) is returned unchanged.
 *
 * @param {string|null|undefined} inputPath - Path that may start with "~".
 * @returns {string|null|undefined} The expanded path, or the input itself.
 */
export function expandHome(inputPath) {
  if (inputPath === '~') {
    return os.homedir();
  }

  const hasHomePrefix = inputPath?.startsWith('~/');
  return hasHomePrefix ? path.join(os.homedir(), inputPath.slice(2)) : inputPath;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Convert a user-supplied target into an absolute path.
 *
 * The input is first passed through `expandHome`, then resolved with
 * `path.resolve`.
 *
 * @param {string} inputPath - Target path as given by the caller.
 * @returns {string} Absolute path.
 */
export function resolveTarget(inputPath) {
  const expanded = expandHome(inputPath);
  return path.resolve(expanded);
}
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ethan-agent-skills",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Agent skills published from my_agent_skill",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"skills": "bin/skills.mjs"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"bin/",
|
|
11
|
+
"lib/",
|
|
12
|
+
"skills/",
|
|
13
|
+
"codex/",
|
|
14
|
+
"claude/",
|
|
15
|
+
"agents/",
|
|
16
|
+
"README.md",
|
|
17
|
+
"LICENSE"
|
|
18
|
+
],
|
|
19
|
+
"engines": {
|
|
20
|
+
"node": ">=18"
|
|
21
|
+
},
|
|
22
|
+
"scripts": {
|
|
23
|
+
"test:local": "rm -rf /tmp/my-agent-skill-test-local && node bin/skills.mjs --help && node bin/skills.mjs --version && node bin/skills.mjs list && node bin/skills.mjs update --dry-run --target /tmp/my-agent-skill-test-local && node bin/skills.mjs update --target /tmp/my-agent-skill-test-local",
|
|
24
|
+
"pack:check": "npm pack --dry-run"
|
|
25
|
+
},
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"commander": "^12.1.0",
|
|
28
|
+
"kleur": "^4.1.5"
|
|
29
|
+
},
|
|
30
|
+
"repository": {
|
|
31
|
+
"type": "git",
|
|
32
|
+
"url": "git+https://github.com/EthenZhang/my_agent_skill.git"
|
|
33
|
+
},
|
|
34
|
+
"keywords": [
|
|
35
|
+
"agent",
|
|
36
|
+
"claude",
|
|
37
|
+
"claude-code",
|
|
38
|
+
"codex",
|
|
39
|
+
"skills",
|
|
40
|
+
"cli"
|
|
41
|
+
],
|
|
42
|
+
"license": "MIT",
|
|
43
|
+
"publishConfig": {
|
|
44
|
+
"access": "public"
|
|
45
|
+
}
|
|
46
|
+
}
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: pdf-extract
|
|
3
|
+
description: Extract text from PDF files and save as clean markdown documents. Use this skill whenever a PDF file needs to be converted to readable text, especially for insurance policies, financial documents, scanned PDFs, or mixed text/image PDFs. Trigger on requests like "extract this PDF", "convert PDF to markdown", "read this PDF", "process this PDF", "OCR this PDF", or when the user drops a PDF and asks for its contents. This skill handles both text-based PDFs and scanned/image-based PDFs using macOS-native tools (no poppler or tesseract needed).
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# PDF Extract Skill
|
|
7
|
+
|
|
8
|
+
Extract complete text from PDF files and save as clean markdown. Handles text-based PDFs, scanned/image pages, and mixed documents — all using macOS native frameworks (no external dependencies beyond pyobjc, which comes with the system Python on macOS).
|
|
9
|
+
|
|
10
|
+
## When to Use
|
|
11
|
+
|
|
12
|
+
- A user uploads a PDF and wants its content extracted to text or markdown
|
|
13
|
+
- The PDF contains a mix of text and scanned pages (e.g., insurance policy booklets)
|
|
14
|
+
- Scanned pages are embedded within otherwise text-based PDFs
|
|
15
|
+
- Large PDFs where the extracted text exceeds normal read limits
|
|
16
|
+
- Financial documents, insurance policies, contracts, or reports that need structured extraction
|
|
17
|
+
|
|
18
|
+
## Workflow Overview
|
|
19
|
+
|
|
20
|
+
1. **Try text extraction first** — use PDFKit to get native text
|
|
21
|
+
2. **Detect scanned pages** — pages with little or no extracted text are likely scans
|
|
22
|
+
3. **Render scanned pages as images** — convert them to PNGs at high resolution
|
|
23
|
+
4. **Extract text from images** — use multimodal vision to OCR the rendered pages
|
|
24
|
+
5. **Crop if needed** — isolate specific regions (tables, signatures) from page images
|
|
25
|
+
6. **Assemble and save** — combine all extracted text into a clean markdown document
|
|
26
|
+
|
|
27
|
+
## Step-by-Step Instructions
|
|
28
|
+
|
|
29
|
+
### Step 1: Inspect the PDF
|
|
30
|
+
|
|
31
|
+
First, check the PDF file path and try basic text extraction on the first page to gauge quality:
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import sys, Quartz, os
|
|
35
|
+
sys.path.insert(0, '/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC')
|
|
36
|
+
import objc
|
|
37
|
+
|
|
38
|
+
pdf_path = "/path/to/file.pdf"
|
|
39
|
+
url = Quartz.NSURL.fileURLWithPath_(pdf_path)
|
|
40
|
+
doc = Quartz.PDFDocument.alloc().initWithURL_(url)
|
|
41
|
+
|
|
42
|
+
if doc is None:
|
|
43
|
+
# Cannot read PDF
|
|
44
|
+
exit()
|
|
45
|
+
|
|
46
|
+
page_count = doc.pageCount()
|
|
47
|
+
print(f"Pages: {page_count}")
|
|
48
|
+
|
|
49
|
+
# Quick quality check on first page
|
|
50
|
+
p1 = doc.pageAtIndex_(0)
|
|
51
|
+
text = p1.string() if p1 else ""
|
|
52
|
+
print(f"First page text length: {len(text)}")
|
|
53
|
+
if len(text) < 50:
|
|
54
|
+
print("WARNING: Low text extraction — likely a scanned PDF")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Step 2: Extract All Text-Based Pages
|
|
58
|
+
|
|
59
|
+
Attempt to extract text from every page. Pages with meaningful text (>50 chars) are text-based; pages with very little text (<10 chars, often just whitespace or decorative elements) are scanned images.
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import sys, Quartz
|
|
63
|
+
|
|
64
|
+
pdf_path = "/path/to/file.pdf"
|
|
65
|
+
url = Quartz.NSURL.fileURLWithPath_(pdf_path)
|
|
66
|
+
doc = Quartz.PDFDocument.alloc().initWithURL_(url)
|
|
67
|
+
|
|
68
|
+
pages_text = []
|
|
69
|
+
scanned_pages = []
|
|
70
|
+
|
|
71
|
+
for i in range(doc.pageCount()):
|
|
72
|
+
page = doc.pageAtIndex_(i)
|
|
73
|
+
text = page.string() if page else ""
|
|
74
|
+
if len(text.strip()) < 10:
|
|
75
|
+
scanned_pages.append(i)
|
|
76
|
+
else:
|
|
77
|
+
pages_text.append({"page": i + 1, "text": text})
|
|
78
|
+
|
|
79
|
+
print(f"Text pages: {len(pages_text)}, Scanned pages: {scanned_pages}")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Step 3: Render Scanned Pages as PNG Images
|
|
83
|
+
|
|
84
|
+
For each scanned page, render it to a PNG image at 6x scale (300+ DPI equivalent) for clarity. Lower scales (3x-4x) may not be readable for dense financial tables.
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
import sys, Quartz, os
|
|
88
|
+
|
|
89
|
+
pdf_path = "/path/to/file.pdf"
|
|
90
|
+
url = Quartz.NSURL.fileURLWithPath_(pdf_path)
|
|
91
|
+
doc = Quartz.PDFDocument.alloc().initWithURL_(url)
|
|
92
|
+
output_dir = "/tmp/pdf_rendered/"
|
|
93
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
94
|
+
|
|
95
|
+
scale = 6.0 # 6x scale for dense tables
|
|
96
|
+
|
|
97
|
+
for page_idx in scanned_pages:
|
|
98
|
+
page = doc.pageAtIndex_(page_idx)
|
|
99
|
+
media_box = page.boundsForBox_(Quartz.kCGPDFMediaBox)
|
|
100
|
+
|
|
101
|
+
pw = int(media_box.size.width * scale)
|
|
102
|
+
ph = int(media_box.size.height * scale)
|
|
103
|
+
|
|
104
|
+
cs = Quartz.CGColorSpaceCreateDeviceRGB()
|
|
105
|
+
ctx = Quartz.CGBitmapContextCreate(
|
|
106
|
+
None, pw, ph, 8, pw * 4, cs,
|
|
107
|
+
Quartz.kCGImageAlphaPremultipliedLast
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# White background
|
|
111
|
+
Quartz.CGContextSetRGBFillColor(ctx, 1.0, 1.0, 1.0, 1.0)
|
|
112
|
+
Quartz.CGContextFillRect(ctx, Quartz.CGRectMake(0, 0, pw, ph))
|
|
113
|
+
Quartz.CGContextScaleCTM(ctx, scale, scale)
|
|
114
|
+
page.drawWithBox_toContext_(Quartz.kCGPDFMediaBox, ctx)
|
|
115
|
+
|
|
116
|
+
cg_img = Quartz.CGBitmapContextCreateImage(ctx)
|
|
117
|
+
Quartz.CGImageDestinationAddImage(
|
|
118
|
+
Quartz.CGImageDestinationCreateWithURL(
|
|
119
|
+
Quartz.NSURL.fileURLWithPath_(f"{output_dir}page_{page_idx+1}.png"),
|
|
120
|
+
Quartz.kUTTypePNG, 1, None
|
|
121
|
+
),
|
|
122
|
+
cg_img, None
|
|
123
|
+
)
|
|
124
|
+
# Finalize the destination
|
|
125
|
+
dest = Quartz.CGImageDestinationCreateWithURL(
|
|
126
|
+
Quartz.NSURL.fileURLWithPath_(f"{output_dir}page_{page_idx+1}.png"),
|
|
127
|
+
Quartz.kUTTypePNG, 1, None
|
|
128
|
+
)
|
|
129
|
+
Quartz.CGImageDestinationAddImage(dest, cg_img, None)
|
|
130
|
+
Quartz.CGImageDestinationFinalize(dest)
|
|
131
|
+
|
|
132
|
+
print(f"Rendered page {page_idx+1} to {output_dir}page_{page_idx+1}.png ({pw}x{ph})")
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
> **Scale reference:** 3x works for simple text, 4x for most documents, 5x for dense forms, 6x for financial tables with small fonts. Start at 4x and go up if text isn't legible.
|
|
136
|
+
|
|
137
|
+
### Step 4: Crop Specific Regions (Optional)
|
|
138
|
+
|
|
139
|
+
When only part of a page is relevant (e.g., a specific table), crop the rendered image to that region. The crop rectangle is given in the rendered image's pixel space, where the origin (0,0) is at the **top-left** of the image (PDF page space itself has its origin at the bottom-left).
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
import sys, Quartz
|
|
143
|
+
|
|
144
|
+
# After rendering to cg_img (before saving), crop a region:
|
|
145
|
+
# crop_rect = Quartz.CGRectMake(x * scale, y * scale, width * scale, height * scale)
|
|
146
|
+
# cg_img = Quartz.CGImageCreateWithImageInRect(cg_img, crop_rect)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
> **Finding coordinates:** Render the full page first, view the image, then estimate the crop rectangle. The page bounds are in PDF points (1 point = 1/72 inch). A standard A4 page is roughly 595x842 points. Multiply by `scale` for the pixel-space crop rectangle.
|
|
150
|
+
|
|
151
|
+
### Step 5: Extract Text from Rendered Images
|
|
152
|
+
|
|
153
|
+
Use multimodal vision to extract text from the rendered PNG images. This works best with the Read tool on the image files.
|
|
154
|
+
|
|
155
|
+
### Step 6: Assemble and Save as Markdown
|
|
156
|
+
|
|
157
|
+
Combine all extracted content into a structured markdown document. Follow the Obsidian wiki schema if the target vault uses it:
|
|
158
|
+
|
|
159
|
+
```markdown
|
|
160
|
+
---
|
|
161
|
+
type: source
|
|
162
|
+
tags: [category, subcategory]
|
|
163
|
+
date: YYYY-MM-DD
|
|
164
|
+
source_count: 1
|
|
165
|
+
sources: [[original-filename.pdf]]
|
|
166
|
+
status: active
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
# Document Title
|
|
170
|
+
|
|
171
|
+
> Brief description of the document
|
|
172
|
+
|
|
173
|
+
## Key Data
|
|
174
|
+
|
|
175
|
+
[Extracted tables as markdown tables]
|
|
176
|
+
|
|
177
|
+
## Main Content
|
|
178
|
+
|
|
179
|
+
[Extracted body text, organized by sections]
|
|
180
|
+
|
|
181
|
+
## Important Notes
|
|
182
|
+
|
|
183
|
+
[Any specific details, warnings, or risks mentioned]
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Handling Large PDFs
|
|
187
|
+
|
|
188
|
+
When the extracted text is very long and would exceed output limits:
|
|
189
|
+
|
|
190
|
+
- Process pages in batches of 5-10
|
|
191
|
+
- Save intermediate results to a temporary file
|
|
192
|
+
- For text-based extraction, use chunked reading with offset/limit if reading from a file:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
# Write extracted text to a file first, then read in chunks
|
|
196
|
+
with open("/tmp/extracted.txt", "w") as f:
|
|
197
|
+
for page_data in pages_text:
|
|
198
|
+
f.write(f"\n--- Page {page_data['page']} ---\n")
|
|
199
|
+
f.write(page_data["text"])
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Common Pitfalls
|
|
203
|
+
|
|
204
|
+
- **Low resolution renders:** If OCR quality is poor, increase the scale from 4x to 6x. Dense tables with small fonts almost always need 6x.
|
|
205
|
+
- **Page orientation:** Some PDFs have rotated pages. Check `media_box` dimensions to detect landscape pages.
|
|
206
|
+
- **Watermarks/overlays:** Background watermarks can interfere with OCR. If pages have heavy watermarks, try cropping to the content region.
|
|
207
|
+
- **Mixed content pages:** A page might have both text and scanned elements. The `< 10 chars` threshold detects pure scans, but pages with partial text need manual review.
|
|
208
|
+
- **pyobjc availability:** On macOS, pyobjc is pre-installed with the system Python. Use `python3` from the system, not a Homebrew Python that may lack the Quartz bindings.
|
|
209
|
+
|
|
210
|
+
## Dependencies
|
|
211
|
+
|
|
212
|
+
- macOS (required — uses Quartz/PDFKit frameworks)
|
|
213
|
+
- pyobjc (pre-installed on macOS with system Python)
|
|
214
|
+
- No additional packages needed (no poppler, no tesseract, no PIL)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
pdf_extract.py — Extract text from PDF files on macOS using PDFKit + Quartz.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python3 pdf_extract.py <pdf_path> [--render-scanned] [--output-dir DIR] [--scale N]
|
|
7
|
+
|
|
8
|
+
Options:
|
|
9
|
+
--render-scanned Render scanned/blank pages as PNG images for OCR
|
|
10
|
+
--output-dir DIR Directory for rendered page images (default: /tmp/pdf_rendered/)
|
|
11
|
+
--scale N Render scale multiplier (default: 6.0)
|
|
12
|
+
|
|
13
|
+
Output:
|
|
14
|
+
- Prints extracted text to stdout, page by page
|
|
15
|
+
- If --render-scanned is set, saves scanned pages as PNGs to output_dir
|
|
16
|
+
- Prints a JSON summary to stderr with page counts and scanned page indices
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
import os
|
|
21
|
+
import json
|
|
22
|
+
import argparse
|
|
23
|
+
|
|
24
|
+
import Quartz
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def extract_text(pdf_path, render_scanned=False, output_dir=None, scale=6.0):
    """Extract the text of every page of a PDF and report scanned pages.

    A page whose stripped text is shorter than 10 characters is treated as
    scanned: its 0-based index is recorded and, when ``render_scanned`` is
    set, the page is rendered to a PNG via ``_render_page``. All other pages
    are printed to stdout under a ``--- Page N ---`` header (N is 1-based).
    A human-readable summary line and the same summary as JSON are written
    to stderr.

    Args:
        pdf_path: Path of the PDF file to open.
        render_scanned: Render scanned pages as PNG images when true.
        output_dir: Directory for rendered images; defaults to
            ``/tmp/pdf_rendered/``. It is created when the first scanned
            page is rendered.
        scale: Render scale multiplier passed to ``_render_page``.

    Returns:
        dict with ``total_pages``, ``text_pages``, ``scanned_pages``
        (0-based indices) and ``scanned_count``.

    Exits the process with status 1 when the PDF cannot be opened.
    """
    url = Quartz.NSURL.fileURLWithPath_(pdf_path)
    doc = Quartz.PDFDocument.alloc().initWithURL_(url)

    if doc is None:
        print(f"ERROR: Cannot open PDF: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    page_count = doc.pageCount()
    pages_text = []
    scanned_pages = []

    for i in range(page_count):
        page = doc.pageAtIndex_(i)
        # PDFPage.string() is nullable: a page without any text layer can
        # yield None, which must count as "no text" instead of crashing.
        text = (page.string() if page else None) or ""

        if len(text.strip()) < 10:
            scanned_pages.append(i)
            # A missing page object cannot be drawn; it is only recorded.
            if render_scanned and page:
                if output_dir is None:
                    output_dir = "/tmp/pdf_rendered/"
                os.makedirs(output_dir, exist_ok=True)
                _render_page(page, i, output_dir, scale)
        else:
            pages_text.append({"page": i + 1, "text": text})

    # Print extracted text
    for entry in pages_text:
        print(f"\n--- Page {entry['page']} ---")
        print(entry["text"])

    # Summary to stderr
    summary = {
        "total_pages": page_count,
        "text_pages": len(pages_text),
        "scanned_pages": scanned_pages,
        "scanned_count": len(scanned_pages),
    }
    print(f"\n[Summary] {summary['text_pages']} text pages, {summary['scanned_count']} scanned pages: {scanned_pages}", file=sys.stderr)
    json.dump(summary, sys.stderr)
    print("", file=sys.stderr)

    return summary
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _render_page(page, page_idx, output_dir, scale):
|
|
73
|
+
media_box = page.boundsForBox_(Quartz.kCGPDFMediaBox)
|
|
74
|
+
pw = int(media_box.size.width * scale)
|
|
75
|
+
ph = int(media_box.size.height * scale)
|
|
76
|
+
|
|
77
|
+
cs = Quartz.CGColorSpaceCreateDeviceRGB()
|
|
78
|
+
ctx = Quartz.CGBitmapContextCreate(
|
|
79
|
+
None, pw, ph, 8, pw * 4, cs,
|
|
80
|
+
Quartz.kCGImageAlphaPremultipliedLast
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
Quartz.CGContextSetRGBFillColor(ctx, 1.0, 1.0, 1.0, 1.0)
|
|
84
|
+
Quartz.CGContextFillRect(ctx, Quartz.CGRectMake(0, 0, pw, ph))
|
|
85
|
+
Quartz.CGContextScaleCTM(ctx, scale, scale)
|
|
86
|
+
page.drawWithBox_toContext_(Quartz.kCGPDFMediaBox, ctx)
|
|
87
|
+
|
|
88
|
+
cg_img = Quartz.CGBitmapContextCreateImage(ctx)
|
|
89
|
+
out_path = os.path.join(output_dir, f"page_{page_idx + 1}.png")
|
|
90
|
+
|
|
91
|
+
dest = Quartz.CGImageDestinationCreateWithURL(
|
|
92
|
+
Quartz.NSURL.fileURLWithPath_(out_path),
|
|
93
|
+
Quartz.kUTTypePNG, 1, None
|
|
94
|
+
)
|
|
95
|
+
Quartz.CGImageDestinationAddImage(dest, cg_img, None)
|
|
96
|
+
Quartz.CGImageDestinationFinalize(dest)
|
|
97
|
+
|
|
98
|
+
print(f" [Rendered] page {page_idx + 1} -> {out_path} ({pw}x{ph})", file=sys.stderr)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, then run the extraction.
    cli = argparse.ArgumentParser(description="Extract text from PDF on macOS")
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument(
        "--render-scanned",
        action="store_true",
        help="Render scanned/blank pages as PNG images",
    )
    cli.add_argument(
        "--output-dir",
        default="/tmp/pdf_rendered/",
        help="Output directory for rendered images",
    )
    cli.add_argument(
        "--scale",
        type=float,
        default=6.0,
        help="Scale multiplier for rendering (default: 6.0)",
    )
    options = cli.parse_args()

    extract_text(
        options.pdf_path,
        render_scanned=options.render_scanned,
        output_dir=options.output_dir,
        scale=options.scale,
    )
|