sophhub 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/compact-context/skill.json +20 -0
- package/skills/compact-context/src/SKILL.md +133 -0
- package/skills/compact-context/src/scripts/check.sh +381 -0
- package/skills/compact-context/src/scripts/set-keep-recent.mjs +1337 -0
- package/skills/compact-context/src/scripts/setup.sh +96 -0
- package/skills/feishu-notes-assistant-universal/skill.json +20 -0
- package/skills/feishu-notes-assistant-universal/src/README.md +55 -0
- package/skills/feishu-notes-assistant-universal/src/SKILL.md +159 -0
- package/skills/feishu-notes-assistant-universal/src/bin/linux-amd64/lark-cli-openclaw +0 -0
- package/skills/feishu-notes-assistant-universal/src/bin/linux-arm64/lark-cli-openclaw +0 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/_resolve_lark_cli.py +58 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/openclaw_meeting_minutes.py +462 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/openclaw_notes_crud.py +547 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/openclaw_notes_crud_test.py +181 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/run_meeting_minutes.py +80 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/run_meeting_minutes.sh +5 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/run_note_crud.py +32 -0
- package/skills/feishu-notes-assistant-universal/src/scripts/run_note_crud.sh +5 -0
- package/skills/image-classify/skill.json +5 -5
- package/skills/image-classify/src/SKILL.md +60 -67
- package/skills/image-classify/src/scripts/face_search.py +400 -15
- package/skills/image-classify/src/scripts/send_dm_message.py +332 -0
- package/skills/md2pdf-converter/skill.json +20 -0
- package/skills/md2pdf-converter/src/SKILL.md +244 -0
- package/skills/md2pdf-converter/src/_meta.json +6 -0
- package/skills/md2pdf-converter/src/scripts/generate_emoji_mapping.py +74 -0
- package/skills/md2pdf-converter/src/scripts/md2pdf-local.sh +291 -0
- package/skills/sophnet-bot-client/skill.json +20 -0
- package/skills/sophnet-bot-client/src/SKILL.md +255 -0
- package/skills/sophnet-bot-client/src/pyproject.toml +13 -0
- package/skills/sophnet-bot-client/src/scripts/__init__.py +0 -0
- package/skills/sophnet-bot-client/src/scripts/bot_client_proxy.py +165 -0
- package/skills/sophnet-bot-client/src/scripts/bot_client_safe.sh +29 -0
- package/skills/sophnet-bot-client/src/scripts/bot_client_setup.py +502 -0
- package/skills/sophnet-bot-client/src/tests/__init__.py +0 -0
- package/skills/sophnet-bot-client/src/tests/test_bot_client_proxy.py +255 -0
- package/skills/sophnet-bot-client/src/tests/test_bot_client_setup.py +679 -0
- package/skills/sophnet-bot-client/src/uv.lock +8 -0
- package/skills/sophnet-docx/skill.json +20 -0
- package/skills/sophnet-docx/src/SKILL.md +463 -0
- package/skills/sophnet-docx/src/package-lock.json +208 -0
- package/skills/sophnet-docx/src/package.json +16 -0
- package/skills/sophnet-docx/src/pyproject.toml +11 -0
- package/skills/sophnet-docx/src/scripts/__init__.py +1 -0
- package/skills/sophnet-docx/src/scripts/accept_changes.py +135 -0
- package/skills/sophnet-docx/src/scripts/comment.py +318 -0
- package/skills/sophnet-docx/src/scripts/ensure_uv_env.sh +68 -0
- package/skills/sophnet-docx/src/scripts/office/helpers/__init__.py +0 -0
- package/skills/sophnet-docx/src/scripts/office/helpers/merge_runs.py +199 -0
- package/skills/sophnet-docx/src/scripts/office/helpers/simplify_redlines.py +197 -0
- package/skills/sophnet-docx/src/scripts/office/pack.py +159 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/mce/mc.xsd +75 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/sophnet-docx/src/scripts/office/soffice.py +183 -0
- package/skills/sophnet-docx/src/scripts/office/unpack.py +132 -0
- package/skills/sophnet-docx/src/scripts/office/validate.py +111 -0
- package/skills/sophnet-docx/src/scripts/office/validators/__init__.py +15 -0
- package/skills/sophnet-docx/src/scripts/office/validators/base.py +847 -0
- package/skills/sophnet-docx/src/scripts/office/validators/docx.py +446 -0
- package/skills/sophnet-docx/src/scripts/office/validators/pptx.py +275 -0
- package/skills/sophnet-docx/src/scripts/office/validators/redlining.py +247 -0
- package/skills/sophnet-docx/src/scripts/templates/comments.xml +3 -0
- package/skills/sophnet-docx/src/scripts/templates/commentsExtended.xml +3 -0
- package/skills/sophnet-docx/src/scripts/templates/commentsExtensible.xml +3 -0
- package/skills/sophnet-docx/src/scripts/templates/commentsIds.xml +3 -0
- package/skills/sophnet-docx/src/scripts/templates/people.xml +3 -0
- package/skills/sophnet-docx/src/scripts/upload_file.sh +96 -0
- package/skills/sophnet-docx/src/uv.lock +320 -0
- package/skills/sophnet-pdf/skill.json +20 -0
- package/skills/sophnet-pdf/src/SKILL.md +413 -0
- package/skills/sophnet-pdf/src/forms.md +297 -0
- package/skills/sophnet-pdf/src/pyproject.toml +14 -0
- package/skills/sophnet-pdf/src/reference.md +612 -0
- package/skills/sophnet-pdf/src/scripts/check_bounding_boxes.py +65 -0
- package/skills/sophnet-pdf/src/scripts/check_fillable_fields.py +11 -0
- package/skills/sophnet-pdf/src/scripts/convert_pdf_to_images.py +33 -0
- package/skills/sophnet-pdf/src/scripts/create_validation_image.py +37 -0
- package/skills/sophnet-pdf/src/scripts/enhance_tutorial.py +558 -0
- package/skills/sophnet-pdf/src/scripts/ensure_uv_env.sh +68 -0
- package/skills/sophnet-pdf/src/scripts/extract_form_field_info.py +122 -0
- package/skills/sophnet-pdf/src/scripts/extract_form_structure.py +115 -0
- package/skills/sophnet-pdf/src/scripts/extract_pdf_content.py +35 -0
- package/skills/sophnet-pdf/src/scripts/fill_fillable_fields.py +98 -0
- package/skills/sophnet-pdf/src/scripts/fill_pdf_form_with_annotations.py +107 -0
- package/skills/sophnet-pdf/src/scripts/upload_file.sh +88 -0
- package/skills/sophnet-pdf/src/uv.lock +537 -0
- package/skills/sophnet-xlsx/skill.json +20 -0
- package/skills/sophnet-xlsx/src/SKILL.md +399 -0
- package/skills/sophnet-xlsx/src/pyproject.toml +11 -0
- package/skills/sophnet-xlsx/src/scripts/ensure_uv_env.sh +68 -0
- package/skills/sophnet-xlsx/src/scripts/office/helpers/__init__.py +0 -0
- package/skills/sophnet-xlsx/src/scripts/office/helpers/merge_runs.py +199 -0
- package/skills/sophnet-xlsx/src/scripts/office/helpers/simplify_redlines.py +197 -0
- package/skills/sophnet-xlsx/src/scripts/office/pack.py +159 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/mce/mc.xsd +75 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/sophnet-xlsx/src/scripts/office/soffice.py +183 -0
- package/skills/sophnet-xlsx/src/scripts/office/unpack.py +132 -0
- package/skills/sophnet-xlsx/src/scripts/office/validate.py +111 -0
- package/skills/sophnet-xlsx/src/scripts/office/validators/__init__.py +15 -0
- package/skills/sophnet-xlsx/src/scripts/office/validators/base.py +847 -0
- package/skills/sophnet-xlsx/src/scripts/office/validators/docx.py +446 -0
- package/skills/sophnet-xlsx/src/scripts/office/validators/pptx.py +275 -0
- package/skills/sophnet-xlsx/src/scripts/office/validators/redlining.py +247 -0
- package/skills/sophnet-xlsx/src/scripts/recalc.py +184 -0
- package/skills/sophnet-xlsx/src/scripts/upload_file.sh +96 -0
- package/skills/sophnet-xlsx/src/uv.lock +319 -0
- package/skills/wechat-article-publisher/skill.json +20 -0
- package/skills/wechat-article-publisher/src/SKILL.md +60 -0
- package/skills/wechat-article-publisher/src/config.json +7 -0
- package/skills/wechat-article-publisher/src/pyproject.toml +12 -0
- package/skills/wechat-article-publisher/src/scripts/publish_wechat.py +825 -0
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sophnet-pdf
|
|
3
|
+
description: Comprehensive PDF processing for creating, editing, and analyzing PDF documents. Use when the user mentions PDF files, needs to create new PDFs (reports, invoices, certificates), edit existing PDFs (text replacement, image operations, watermarks), extract content, merge/split files, or perform OCR on scanned documents.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# PDF Processing Guide
|
|
7
|
+
|
|
8
|
+
## MANDATORY: Working Directory
|
|
9
|
+
|
|
10
|
+
**EVERY command in this skill MUST be executed from THIS skill's directory.** Before running ANY command — Python or bash script — you MUST `cd` into this skill's directory first. Determine the absolute path of this `SKILL.md` file and use its parent directory.
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
SKILL_DIR="<absolute-path-to-this-skills-sophnet-pdf-directory>"
|
|
14
|
+
cd "$SKILL_DIR"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
**NEVER run commands from the repository root or any other directory.** If you do, `uv run` won't find `pyproject.toml`, and `python` won't have access to required packages.
|
|
18
|
+
|
|
19
|
+
## Overview
|
|
20
|
+
|
|
21
|
+
This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see `reference.md`. If you need to fill out a PDF form, read `forms.md` and follow its instructions.
|
|
22
|
+
|
|
23
|
+
### ⚠️ CJK Text in PDFs — READ FIRST
|
|
24
|
+
|
|
25
|
+
**If the PDF will contain ANY Chinese/Japanese/Korean text** (including titles, body text, table cells, or watermarks), you MUST handle CJK fonts correctly:
|
|
26
|
+
|
|
27
|
+
- **reportlab**: Default fonts (`Helvetica`, `Times-Roman`) render CJK as **black boxes (█)**. You MUST register `STSong-Light` and call `addMapping()` before creating any content. See the "reportlab - Create PDFs" section for the exact setup code.
|
|
28
|
+
- **PyMuPDF (fitz)**: Use `fontname="china-s"` for `insert_text()` / `insert_textbox()`. This is built-in and requires no extra setup.
|
|
29
|
+
- **Rule of thumb**: When in doubt, always use CJK-safe fonts. `STSong-Light` (reportlab) and `china-s` (PyMuPDF) both handle mixed CJK+Latin text correctly.
|
|
30
|
+
|
|
31
|
+
## Python Runtime (uv)
|
|
32
|
+
|
|
33
|
+
**CRITICAL: All Python execution in this skill MUST use `uv run --project .` from the skill directory. NEVER use bare `python3`, `python`, or `pip install` directly — the required packages (pdfplumber, reportlab, pypdf, etc.) are ONLY available inside the uv virtual environment defined by this skill's `pyproject.toml`. Direct `python3` will fail with ModuleNotFoundError.**
|
|
34
|
+
|
|
35
|
+
**IMPORTANT: Only use libraries available in this skill's pyproject.toml (reportlab, pypdf, pdfplumber, pillow, pdf2image, pymupdf). Do NOT import matplotlib, numpy, pandas, or other packages not listed — they will cause ModuleNotFoundError. For charts/graphs, use `reportlab.graphics.charts` (VerticalBarChart, HorizontalBarChart, Pie, etc.) instead of matplotlib.**
|
|
36
|
+
|
|
37
|
+
First, ensure the environment is set up (run once per session):
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
cd "$SKILL_DIR"
|
|
41
|
+
bash scripts/ensure_uv_env.sh
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Then ALL Python commands must use this prefix:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
cd "$SKILL_DIR" && uv run --project . python <script-or-module>
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
This applies to both the provided scripts AND any inline Python code you write. For inline code, use:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
cd "$SKILL_DIR" && uv run --project . python -c "import pdfplumber; ..."
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Delivery
|
|
57
|
+
|
|
58
|
+
Local PDF creation/editing/analysis does not require any Sophnet API key.
|
|
59
|
+
|
|
60
|
+
**IMPORTANT: After creating or modifying a PDF, ALWAYS upload it and return the download URL to the user.** This is the default behavior — do not skip the upload step.
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
bash scripts/upload_file.sh --file <absolute-path-to-pdf>
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Upload command output contract:
|
|
67
|
+
|
|
68
|
+
- `FILE_PATH=<absolute-path>`
|
|
69
|
+
- `UPLOAD_STATUS=uploaded|skipped`
|
|
70
|
+
- `DOWNLOAD_URL=<https://...>` (present only when uploaded)
|
|
71
|
+
|
|
72
|
+
Delivery rules:
|
|
73
|
+
|
|
74
|
+
- **ALWAYS call `scripts/upload_file.sh` after producing a PDF file**, then return the `DOWNLOAD_URL` to the user.
|
|
75
|
+
- If `UPLOAD_STATUS=uploaded`, return the exact `DOWNLOAD_URL` value to the user as a clickable link.
|
|
76
|
+
- If `UPLOAD_STATUS=skipped` (missing API key), return `FILE_PATH` instead of failing the whole task.
|
|
77
|
+
- Keep URL output logic independent inside `sophnet-pdf/scripts`. Do not call other skills' upload scripts.
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from pypdf import PdfReader, PdfWriter
|
|
83
|
+
|
|
84
|
+
# Read a PDF
|
|
85
|
+
reader = PdfReader("document.pdf")
|
|
86
|
+
print(f"Pages: {len(reader.pages)}")
|
|
87
|
+
|
|
88
|
+
# Extract text
|
|
89
|
+
text = ""
|
|
90
|
+
for page in reader.pages:
|
|
91
|
+
text += page.extract_text()
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Python Libraries
|
|
95
|
+
|
|
96
|
+
### PyMuPDF (fitz) - Text Search & Replace
|
|
97
|
+
|
|
98
|
+
**This is the ONLY method for replacing text in existing PDFs. Do NOT fall back to nano-pdf, pdftotext, tesseract, or any external CLI tool — use PyMuPDF exclusively.**
|
|
99
|
+
|
|
100
|
+
**CRITICAL FONT RULES:**
|
|
101
|
+
- NEVER use the original PDF's embedded font names (e.g. `Unnamed-T3`, `NotoSansCJKsc-Regular`, `LiberationSans`) with `insert_text()` — they are embedded fonts and will cause `Exception: need font file or buffer`.
|
|
102
|
+
- Always use `page.search_for()` to locate text (it works across spans, unlike iterating `get_text("dict")` spans which may split multi-word terms).
|
|
103
|
+
- Use the **fallback font picker** below to choose a suitable built-in font.
|
|
104
|
+
|
|
105
|
+
**Built-in font aliases (Latin):** `helv` (Helvetica), `hebo` (Helvetica-Bold), `heit` (Helvetica-Oblique), `tiro` (Times-Roman), `tibo` (Times-Bold), `cour` (Courier), `cobo` (Courier-Bold).
|
|
106
|
+
|
|
107
|
+
**Built-in CJK font aliases:** `china-s` (Simplified Chinese), `china-t` (Traditional Chinese), `japan` (Japanese), `korea` (Korean). These also render Latin characters correctly.
|
|
108
|
+
|
|
109
|
+
#### Find & Replace Text in PDF (Complete Example)
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import fitz # PyMuPDF
|
|
113
|
+
|
|
114
|
+
# --- Font fallback logic ---
|
|
115
|
+
BUILTIN_FONTS = {"helv","hebo","heit","hebi","tiro","tibo","tiit","tibi","cour","cobo","coit","cobi","symb","zadb"}
|
|
116
|
+
CJK_FONTS = {"china-s","china-ss","china-t","china-ts","japan","japan-s","korea","korea-s"}
|
|
117
|
+
|
|
118
|
+
def has_cjk(text):
|
|
119
|
+
"""Check if text contains CJK characters."""
|
|
120
|
+
return any("\u4e00" <= c <= "\u9fff" or "\u3000" <= c <= "\u303f" for c in text)
|
|
121
|
+
|
|
122
|
+
def pick_font(original_font, text):
|
|
123
|
+
"""Pick a built-in font. Falls back from the original embedded font name."""
|
|
124
|
+
if original_font in BUILTIN_FONTS or original_font in CJK_FONTS:
|
|
125
|
+
return original_font
|
|
126
|
+
if has_cjk(text):
|
|
127
|
+
return "china-s" # handles both CJK and Latin
|
|
128
|
+
lower = original_font.lower()
|
|
129
|
+
if "bold" in lower:
|
|
130
|
+
return "hebo"  # Helvetica-Bold ("heit" is Helvetica-Oblique)
|
|
131
|
+
if "mono" in lower or "courier" in lower or "code" in lower:
|
|
132
|
+
return "cour"
|
|
133
|
+
return "helv"
|
|
134
|
+
|
|
135
|
+
def get_span_info(page, rect):
|
|
136
|
+
"""Get font size, font name, and color from the span overlapping rect."""
|
|
137
|
+
for block in page.get_text("dict")["blocks"]:
|
|
138
|
+
if "lines" not in block:
|
|
139
|
+
continue
|
|
140
|
+
for line in block["lines"]:
|
|
141
|
+
for span in line["spans"]:
|
|
142
|
+
if fitz.Rect(span["bbox"]).intersects(rect):
|
|
143
|
+
return span["size"], span["font"], span.get("color", 0)
|
|
144
|
+
return 12, "helv", 0
|
|
145
|
+
|
|
146
|
+
# --- Main replacement logic ---
|
|
147
|
+
search_terms = ["OldText", "Old Text"] # all variants to search for
|
|
148
|
+
replacement = "NewText"
|
|
149
|
+
|
|
150
|
+
doc = fitz.open("input.pdf")
|
|
151
|
+
|
|
152
|
+
for page in doc:
|
|
153
|
+
# Step 1: Find all instances using search_for (works across spans)
|
|
154
|
+
hits = []
|
|
155
|
+
for term in search_terms:
|
|
156
|
+
for rect in page.search_for(term):
|
|
157
|
+
size, font, color = get_span_info(page, rect)
|
|
158
|
+
hits.append({"rect": rect, "size": size, "font": font, "color": color})
|
|
159
|
+
|
|
160
|
+
# Step 2: Redact (erase) old text
|
|
161
|
+
for h in hits:
|
|
162
|
+
page.add_redact_annot(h["rect"], fill=(1, 1, 1))
|
|
163
|
+
page.apply_redactions()
|
|
164
|
+
|
|
165
|
+
# Step 3: Insert replacement text with fallback font
|
|
166
|
+
for h in hits:
|
|
167
|
+
font = pick_font(h["font"], replacement)
|
|
168
|
+
c = h["color"]
|
|
169
|
+
rgb = (((c>>16)&0xFF)/255, ((c>>8)&0xFF)/255, (c&0xFF)/255) if isinstance(c, int) else (0,0,0)
|
|
170
|
+
page.insert_text(
|
|
171
|
+
(h["rect"].x0, h["rect"].y0 + h["size"] * 0.85),
|
|
172
|
+
replacement,
|
|
173
|
+
fontname=font,
|
|
174
|
+
fontsize=h["size"],
|
|
175
|
+
color=rgb,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
doc.save("output.pdf", garbage=4, deflate=True)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
**After replacement, always verify** by extracting text from the output PDF:
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
doc2 = fitz.open("output.pdf")
|
|
185
|
+
for i, p in enumerate(doc2):
|
|
186
|
+
text = p.get_text()
|
|
187
|
+
old_count = text.lower().count("oldtext")
|
|
188
|
+
new_count = text.lower().count("newtext")
|
|
189
|
+
print(f"Page {i+1}: OldText={old_count}, NewText={new_count}")
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Notes:**
|
|
193
|
+
- PDF text replacement is inherently imperfect — PDFs are presentation-focused, not text-edit-focused.
|
|
194
|
+
- The replacement font may look slightly different from the original embedded font. This is expected.
|
|
195
|
+
- `china-s` is the best fallback for mixed CJK+Latin text; `helv` for Latin-only text.
|
|
196
|
+
|
|
197
|
+
### PyMuPDF (fitz) - Image Operations
|
|
198
|
+
|
|
199
|
+
**Use PyMuPDF for ALL image operations in existing PDFs.** Key APIs:
|
|
200
|
+
- `page.insert_image(rect, filename=...)` — insert a new image
|
|
201
|
+
- `page.replace_image(xref, filename=...)` — replace an existing image in-place (keeps original rect)
|
|
202
|
+
- `page.delete_image(xref)` — remove an image (replaces with 1×1 transparent pixel)
|
|
203
|
+
- `page.get_images(full=True)` — list images on a page (returns xref, etc.)
|
|
204
|
+
- `page.get_image_info(xrefs=True)` — list images with bounding box info
|
|
205
|
+
|
|
206
|
+
**CRITICAL: Always use `page.get_image_info(xrefs=True)` (not `get_images`) when you need the on-page bounding box of each image.**
|
|
207
|
+
|
|
208
|
+
#### Insert Image into Existing PDF
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
import fitz
|
|
212
|
+
|
|
213
|
+
doc = fitz.open("input.pdf")
|
|
214
|
+
page = doc[0]
|
|
215
|
+
|
|
216
|
+
# Define target rectangle (x0, y0, x1, y1) — image will be scaled to fit
|
|
217
|
+
# To center horizontally with 50% page width:
|
|
218
|
+
pw = page.rect.width
|
|
219
|
+
img_w = pw * 0.5
|
|
220
|
+
# Load image to get aspect ratio
|
|
221
|
+
img = fitz.Pixmap("image.png")
|
|
222
|
+
aspect = img.width / img.height
|
|
223
|
+
img_h = img_w / aspect
|
|
224
|
+
x0 = (pw - img_w) / 2
|
|
225
|
+
y0 = 700 # vertical position
|
|
226
|
+
|
|
227
|
+
rect = fitz.Rect(x0, y0, x0 + img_w, y0 + img_h)
|
|
228
|
+
page.insert_image(rect, filename="image.png")
|
|
229
|
+
doc.save("output.pdf")
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
#### Replace an Existing Image
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
import fitz
|
|
236
|
+
|
|
237
|
+
doc = fitz.open("input.pdf")
|
|
238
|
+
page = doc[0]
|
|
239
|
+
|
|
240
|
+
# Find the image to replace
|
|
241
|
+
images = page.get_images(full=True)
|
|
242
|
+
target_xref = images[0][0] # xref of the first image
|
|
243
|
+
|
|
244
|
+
# Replace it — the new image fills the SAME rect as the original
|
|
245
|
+
page.replace_image(target_xref, filename="new_image.png")
|
|
246
|
+
doc.save("output.pdf")
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
**Note:** `replace_image` keeps the original bounding box. The new image is scaled to fit.
|
|
250
|
+
|
|
251
|
+
#### Modify Image Layout (Reposition / Resize)
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
import fitz
|
|
255
|
+
|
|
256
|
+
doc = fitz.open("input.pdf")
|
|
257
|
+
page = doc[0]
|
|
258
|
+
|
|
259
|
+
# Get image placements with bounding boxes
|
|
260
|
+
img_info = page.get_image_info(xrefs=True)
|
|
261
|
+
target = img_info[0] # first image
|
|
262
|
+
xref = target["xref"]
|
|
263
|
+
orig_rect = fitz.Rect(target["bbox"])
|
|
264
|
+
|
|
265
|
+
# Extract original image bytes
|
|
266
|
+
pix = fitz.Pixmap(doc, xref)
|
|
267
|
+
img_bytes = pix.tobytes("png")
|
|
268
|
+
|
|
269
|
+
# Delete the original (replaces with 1x1 transparent pixel)
|
|
270
|
+
page.delete_image(xref)
|
|
271
|
+
|
|
272
|
+
# Re-insert at new position/size (e.g. 70% size, centered)
|
|
273
|
+
pw = page.rect.width
|
|
274
|
+
new_w = orig_rect.width * 0.7
|
|
275
|
+
new_h = orig_rect.height * 0.7
|
|
276
|
+
new_x0 = (pw - new_w) / 2
|
|
277
|
+
new_rect = fitz.Rect(new_x0, orig_rect.y0, new_x0 + new_w, orig_rect.y0 + new_h)
|
|
278
|
+
page.insert_image(new_rect, stream=img_bytes)
|
|
279
|
+
|
|
280
|
+
doc.save("output.pdf", garbage=4, deflate=True)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
**Notes on image operations:**
|
|
284
|
+
- `delete_image` replaces the image data with a 1×1 transparent pixel (the page reference remains but is invisible). This is normal PyMuPDF behavior.
|
|
285
|
+
- For layout changes: extract → delete → re-insert at new rect. Do NOT try `doc.update_image()` (does not exist).
|
|
286
|
+
- `insert_image` accepts `filename=` (file path) or `stream=` (bytes). Use `stream=` when re-inserting extracted images.
|
|
287
|
+
- Use `garbage=4, deflate=True` in `doc.save()` to clean up unused objects and compress.
|
|
288
|
+
|
|
289
|
+
### pypdf - Basic Operations
|
|
290
|
+
|
|
291
|
+
#### Merge PDFs
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
from pypdf import PdfWriter, PdfReader
|
|
295
|
+
|
|
296
|
+
writer = PdfWriter()
|
|
297
|
+
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
|
|
298
|
+
reader = PdfReader(pdf_file)
|
|
299
|
+
for page in reader.pages:
|
|
300
|
+
writer.add_page(page)
|
|
301
|
+
|
|
302
|
+
with open("merged.pdf", "wb") as output:
|
|
303
|
+
writer.write(output)
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
#### Split PDF
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
reader = PdfReader("input.pdf")
|
|
310
|
+
for i, page in enumerate(reader.pages):
|
|
311
|
+
writer = PdfWriter()
|
|
312
|
+
writer.add_page(page)
|
|
313
|
+
with open(f"page_{i+1}.pdf", "wb") as output:
|
|
314
|
+
writer.write(output)
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
#### Extract Metadata
|
|
318
|
+
|
|
319
|
+
```python
|
|
320
|
+
reader = PdfReader("document.pdf")
|
|
321
|
+
meta = reader.metadata
|
|
322
|
+
print(f"Title: {meta.title}")
|
|
323
|
+
print(f"Author: {meta.author}")
|
|
324
|
+
print(f"Subject: {meta.subject}")
|
|
325
|
+
print(f"Creator: {meta.creator}")
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
#### Rotate Pages
|
|
329
|
+
|
|
330
|
+
```python
|
|
331
|
+
reader = PdfReader("input.pdf")
|
|
332
|
+
writer = PdfWriter()
|
|
333
|
+
|
|
334
|
+
page = reader.pages[0]
|
|
335
|
+
page.rotate(90) # Rotate 90 degrees clockwise
|
|
336
|
+
writer.add_page(page)
|
|
337
|
+
|
|
338
|
+
with open("rotated.pdf", "wb") as output:
|
|
339
|
+
writer.write(output)
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
### pdfplumber - Text and Table Extraction
|
|
343
|
+
|
|
344
|
+
#### Extract Text with Layout
|
|
345
|
+
|
|
346
|
+
```python
|
|
347
|
+
import pdfplumber
|
|
348
|
+
|
|
349
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
350
|
+
for page in pdf.pages:
|
|
351
|
+
text = page.extract_text()
|
|
352
|
+
print(text)
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
#### Extract Tables
|
|
356
|
+
|
|
357
|
+
```python
|
|
358
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
359
|
+
for i, page in enumerate(pdf.pages):
|
|
360
|
+
tables = page.extract_tables()
|
|
361
|
+
for j, table in enumerate(tables):
|
|
362
|
+
print(f"Table {j+1} on page {i+1}:")
|
|
363
|
+
for row in table:
|
|
364
|
+
print(row)
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
#### Advanced Table Extraction
|
|
368
|
+
|
|
369
|
+
```python
|
|
370
|
+
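import pandas as pd  # WARNING: pandas is not in this skill's pyproject.toml (nor is openpyxl, which to_excel needs); this example raises ModuleNotFoundError in the skill's uv environment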
|
|
371
|
+
|
|
372
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
373
|
+
all_tables = []
|
|
374
|
+
for page in pdf.pages:
|
|
375
|
+
tables = page.extract_tables()
|
|
376
|
+
for table in tables:
|
|
377
|
+
if table: # Check if table is not empty
|
|
378
|
+
df = pd.DataFrame(table[1:], columns=table[0])
|
|
379
|
+
all_tables.append(df)
|
|
380
|
+
|
|
381
|
+
# Combine all tables
|
|
382
|
+
if all_tables:
|
|
383
|
+
combined_df = pd.concat(all_tables, ignore_index=True)
|
|
384
|
+
combined_df.to_excel("extracted_tables.xlsx", index=False)
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
### reportlab - Create PDFs
|
|
388
|
+
|
|
389
|
+
> ⚠️ **If the PDF contains ANY Chinese/Japanese/Korean text, you MUST follow the CJK rules below. Skipping them causes black boxes (█) or crash errors.**
|
|
390
|
+
|
|
391
|
+
**FORBIDDEN patterns (these ALL break CJK rendering):**
|
|
392
|
+
- ❌ Using `Helvetica`, `Times-Roman`, or `Courier` fonts for CJK text → renders as **black boxes (█)**
|
|
393
|
+
- ❌ Registering `STSong-Light` without calling `addMapping()` → `Paragraph()` crashes with `ValueError: Can't map determine family/bold/italic for stsong-light`
|
|
394
|
+
- ❌ Using `<font face="STSong-Light">` in Paragraph XML without `addMapping()` → same crash
|
|
395
|
+
- ❌ Setting `style.fontName = "STSong-Light"` without `addMapping()` → same crash
|
|
396
|
+
|
|
397
|
+
**CJK font setup (MUST be at the top of EVERY script with CJK text):**
|
|
398
|
+
|
|
399
|
+
```python
|
|
400
|
+
from reportlab.pdfbase import pdfmetrics
|
|
401
|
+
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
|
|
402
|
+
from reportlab.lib.fonts import addMapping
|
|
403
|
+
|
|
404
|
+
# Step 1: Register the CJK font
|
|
405
|
+
pdfmetrics.registerFont(UnicodeCIDFont("STSong-Light"))
|
|
406
|
+
# Step 2: MANDATORY — register font family mapping (without this Paragraph() crashes)
|
|
407
|
+
addMapping("STSong-Light", 0, 0, "STSong-Light") # normal
|
|
408
|
+
addMapping("STSong-Light", 1, 0, "STSong-Light") # bold
|
|
409
|
+
addMapping("STSong-Light", 0, 1, "STSong-Light") # italic
|
|
410
|
+
addMapping("STSong-Light", 1, 1, "STSong-Light") # bold+italic
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
**After setup, use `"STSong-L
|