sophhub 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. package/package.json +1 -1
  2. package/skills/compact-context/skill.json +20 -0
  3. package/skills/compact-context/src/SKILL.md +133 -0
  4. package/skills/compact-context/src/scripts/check.sh +381 -0
  5. package/skills/compact-context/src/scripts/set-keep-recent.mjs +1337 -0
  6. package/skills/compact-context/src/scripts/setup.sh +96 -0
  7. package/skills/feishu-notes-assistant-universal/skill.json +20 -0
  8. package/skills/feishu-notes-assistant-universal/src/README.md +55 -0
  9. package/skills/feishu-notes-assistant-universal/src/SKILL.md +159 -0
  10. package/skills/feishu-notes-assistant-universal/src/bin/linux-amd64/lark-cli-openclaw +0 -0
  11. package/skills/feishu-notes-assistant-universal/src/bin/linux-arm64/lark-cli-openclaw +0 -0
  12. package/skills/feishu-notes-assistant-universal/src/scripts/_resolve_lark_cli.py +58 -0
  13. package/skills/feishu-notes-assistant-universal/src/scripts/openclaw_meeting_minutes.py +462 -0
  14. package/skills/feishu-notes-assistant-universal/src/scripts/openclaw_notes_crud.py +547 -0
  15. package/skills/feishu-notes-assistant-universal/src/scripts/openclaw_notes_crud_test.py +181 -0
  16. package/skills/feishu-notes-assistant-universal/src/scripts/run_meeting_minutes.py +80 -0
  17. package/skills/feishu-notes-assistant-universal/src/scripts/run_meeting_minutes.sh +5 -0
  18. package/skills/feishu-notes-assistant-universal/src/scripts/run_note_crud.py +32 -0
  19. package/skills/feishu-notes-assistant-universal/src/scripts/run_note_crud.sh +5 -0
  20. package/skills/image-classify/skill.json +5 -5
  21. package/skills/image-classify/src/SKILL.md +60 -67
  22. package/skills/image-classify/src/scripts/face_search.py +400 -15
  23. package/skills/image-classify/src/scripts/send_dm_message.py +332 -0
  24. package/skills/md2pdf-converter/skill.json +20 -0
  25. package/skills/md2pdf-converter/src/SKILL.md +244 -0
  26. package/skills/md2pdf-converter/src/_meta.json +6 -0
  27. package/skills/md2pdf-converter/src/scripts/generate_emoji_mapping.py +74 -0
  28. package/skills/md2pdf-converter/src/scripts/md2pdf-local.sh +291 -0
  29. package/skills/sophnet-bot-client/skill.json +20 -0
  30. package/skills/sophnet-bot-client/src/SKILL.md +255 -0
  31. package/skills/sophnet-bot-client/src/pyproject.toml +13 -0
  32. package/skills/sophnet-bot-client/src/scripts/__init__.py +0 -0
  33. package/skills/sophnet-bot-client/src/scripts/bot_client_proxy.py +165 -0
  34. package/skills/sophnet-bot-client/src/scripts/bot_client_safe.sh +29 -0
  35. package/skills/sophnet-bot-client/src/scripts/bot_client_setup.py +502 -0
  36. package/skills/sophnet-bot-client/src/tests/__init__.py +0 -0
  37. package/skills/sophnet-bot-client/src/tests/test_bot_client_proxy.py +255 -0
  38. package/skills/sophnet-bot-client/src/tests/test_bot_client_setup.py +679 -0
  39. package/skills/sophnet-bot-client/src/uv.lock +8 -0
  40. package/skills/sophnet-docx/skill.json +20 -0
  41. package/skills/sophnet-docx/src/SKILL.md +463 -0
  42. package/skills/sophnet-docx/src/package-lock.json +208 -0
  43. package/skills/sophnet-docx/src/package.json +16 -0
  44. package/skills/sophnet-docx/src/pyproject.toml +11 -0
  45. package/skills/sophnet-docx/src/scripts/__init__.py +1 -0
  46. package/skills/sophnet-docx/src/scripts/accept_changes.py +135 -0
  47. package/skills/sophnet-docx/src/scripts/comment.py +318 -0
  48. package/skills/sophnet-docx/src/scripts/ensure_uv_env.sh +68 -0
  49. package/skills/sophnet-docx/src/scripts/office/helpers/__init__.py +0 -0
  50. package/skills/sophnet-docx/src/scripts/office/helpers/merge_runs.py +199 -0
  51. package/skills/sophnet-docx/src/scripts/office/helpers/simplify_redlines.py +197 -0
  52. package/skills/sophnet-docx/src/scripts/office/pack.py +159 -0
  53. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  54. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  55. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  56. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  57. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  58. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  59. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  60. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  61. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  62. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  63. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  64. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  65. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  66. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  67. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  68. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  69. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  70. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  71. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  72. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  73. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  74. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  75. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  76. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  77. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  78. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  79. package/skills/sophnet-docx/src/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  80. package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  81. package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  82. package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  83. package/skills/sophnet-docx/src/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  84. package/skills/sophnet-docx/src/scripts/office/schemas/mce/mc.xsd +75 -0
  85. package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  86. package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  87. package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  88. package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  89. package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  90. package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  91. package/skills/sophnet-docx/src/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  92. package/skills/sophnet-docx/src/scripts/office/soffice.py +183 -0
  93. package/skills/sophnet-docx/src/scripts/office/unpack.py +132 -0
  94. package/skills/sophnet-docx/src/scripts/office/validate.py +111 -0
  95. package/skills/sophnet-docx/src/scripts/office/validators/__init__.py +15 -0
  96. package/skills/sophnet-docx/src/scripts/office/validators/base.py +847 -0
  97. package/skills/sophnet-docx/src/scripts/office/validators/docx.py +446 -0
  98. package/skills/sophnet-docx/src/scripts/office/validators/pptx.py +275 -0
  99. package/skills/sophnet-docx/src/scripts/office/validators/redlining.py +247 -0
  100. package/skills/sophnet-docx/src/scripts/templates/comments.xml +3 -0
  101. package/skills/sophnet-docx/src/scripts/templates/commentsExtended.xml +3 -0
  102. package/skills/sophnet-docx/src/scripts/templates/commentsExtensible.xml +3 -0
  103. package/skills/sophnet-docx/src/scripts/templates/commentsIds.xml +3 -0
  104. package/skills/sophnet-docx/src/scripts/templates/people.xml +3 -0
  105. package/skills/sophnet-docx/src/scripts/upload_file.sh +96 -0
  106. package/skills/sophnet-docx/src/uv.lock +320 -0
  107. package/skills/sophnet-pdf/skill.json +20 -0
  108. package/skills/sophnet-pdf/src/SKILL.md +413 -0
  109. package/skills/sophnet-pdf/src/forms.md +297 -0
  110. package/skills/sophnet-pdf/src/pyproject.toml +14 -0
  111. package/skills/sophnet-pdf/src/reference.md +612 -0
  112. package/skills/sophnet-pdf/src/scripts/check_bounding_boxes.py +65 -0
  113. package/skills/sophnet-pdf/src/scripts/check_fillable_fields.py +11 -0
  114. package/skills/sophnet-pdf/src/scripts/convert_pdf_to_images.py +33 -0
  115. package/skills/sophnet-pdf/src/scripts/create_validation_image.py +37 -0
  116. package/skills/sophnet-pdf/src/scripts/enhance_tutorial.py +558 -0
  117. package/skills/sophnet-pdf/src/scripts/ensure_uv_env.sh +68 -0
  118. package/skills/sophnet-pdf/src/scripts/extract_form_field_info.py +122 -0
  119. package/skills/sophnet-pdf/src/scripts/extract_form_structure.py +115 -0
  120. package/skills/sophnet-pdf/src/scripts/extract_pdf_content.py +35 -0
  121. package/skills/sophnet-pdf/src/scripts/fill_fillable_fields.py +98 -0
  122. package/skills/sophnet-pdf/src/scripts/fill_pdf_form_with_annotations.py +107 -0
  123. package/skills/sophnet-pdf/src/scripts/upload_file.sh +88 -0
  124. package/skills/sophnet-pdf/src/uv.lock +537 -0
  125. package/skills/sophnet-xlsx/skill.json +20 -0
  126. package/skills/sophnet-xlsx/src/SKILL.md +399 -0
  127. package/skills/sophnet-xlsx/src/pyproject.toml +11 -0
  128. package/skills/sophnet-xlsx/src/scripts/ensure_uv_env.sh +68 -0
  129. package/skills/sophnet-xlsx/src/scripts/office/helpers/__init__.py +0 -0
  130. package/skills/sophnet-xlsx/src/scripts/office/helpers/merge_runs.py +199 -0
  131. package/skills/sophnet-xlsx/src/scripts/office/helpers/simplify_redlines.py +197 -0
  132. package/skills/sophnet-xlsx/src/scripts/office/pack.py +159 -0
  133. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  134. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  135. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  136. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  137. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  138. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  139. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  140. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  141. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  142. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  143. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  144. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  145. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  146. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  147. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  148. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  149. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  150. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  151. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  152. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  153. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  154. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  155. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  156. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  157. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  158. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  159. package/skills/sophnet-xlsx/src/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  160. package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  161. package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  162. package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  163. package/skills/sophnet-xlsx/src/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  164. package/skills/sophnet-xlsx/src/scripts/office/schemas/mce/mc.xsd +75 -0
  165. package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  166. package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  167. package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  168. package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  169. package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  170. package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  171. package/skills/sophnet-xlsx/src/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  172. package/skills/sophnet-xlsx/src/scripts/office/soffice.py +183 -0
  173. package/skills/sophnet-xlsx/src/scripts/office/unpack.py +132 -0
  174. package/skills/sophnet-xlsx/src/scripts/office/validate.py +111 -0
  175. package/skills/sophnet-xlsx/src/scripts/office/validators/__init__.py +15 -0
  176. package/skills/sophnet-xlsx/src/scripts/office/validators/base.py +847 -0
  177. package/skills/sophnet-xlsx/src/scripts/office/validators/docx.py +446 -0
  178. package/skills/sophnet-xlsx/src/scripts/office/validators/pptx.py +275 -0
  179. package/skills/sophnet-xlsx/src/scripts/office/validators/redlining.py +247 -0
  180. package/skills/sophnet-xlsx/src/scripts/recalc.py +184 -0
  181. package/skills/sophnet-xlsx/src/scripts/upload_file.sh +96 -0
  182. package/skills/sophnet-xlsx/src/uv.lock +319 -0
  183. package/skills/wechat-article-publisher/skill.json +20 -0
  184. package/skills/wechat-article-publisher/src/SKILL.md +60 -0
  185. package/skills/wechat-article-publisher/src/config.json +7 -0
  186. package/skills/wechat-article-publisher/src/pyproject.toml +12 -0
  187. package/skills/wechat-article-publisher/src/scripts/publish_wechat.py +825 -0
@@ -0,0 +1,413 @@
1
+ ---
2
+ name: sophnet-pdf
3
+ description: Comprehensive PDF processing for creating, editing, and analyzing PDF documents. Use when the user mentions PDF files, needs to create new PDFs (reports, invoices, certificates), edit existing PDFs (text replacement, image operations, watermarks), extract content, merge/split files, or perform OCR on scanned documents.
4
+ ---
5
+
6
+ # PDF Processing Guide
7
+
8
+ ## MANDATORY: Working Directory
9
+
10
+ **EVERY command in this skill MUST be executed from THIS skill's directory.** Before running ANY command — Python or bash script — you MUST `cd` into this skill's directory first. Determine the absolute path of this `SKILL.md` file and use its parent directory.
11
+
12
+ ```bash
13
+ SKILL_DIR="<absolute-path-to-this-skills-sophnet-pdf-directory>"
14
+ cd "$SKILL_DIR"
15
+ ```
16
+
17
+ **NEVER run commands from the repository root or any other directory.** If you do, `uv run` won't find `pyproject.toml`, and `python` won't have access to required packages.
18
+
19
+ ## Overview
20
+
21
+ This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see `reference.md`. If you need to fill out a PDF form, read `forms.md` and follow its instructions.
22
+
23
+ ### ⚠️ CJK Text in PDFs — READ FIRST
24
+
25
+ **If the PDF will contain ANY Chinese/Japanese/Korean text** (including titles, body text, table cells, or watermarks), you MUST handle CJK fonts correctly:
26
+
27
+ - **reportlab**: Default fonts (`Helvetica`, `Times-Roman`) render CJK as **black boxes (█)**. You MUST register `STSong-Light` and call `addMapping()` before creating any content. See the "reportlab - Create PDFs" section for the exact setup code.
28
+ - **PyMuPDF (fitz)**: Use `fontname="china-s"` for `insert_text()` / `insert_textbox()`. This is built-in and requires no extra setup.
29
+ - **Rule of thumb**: When in doubt, always use CJK-safe fonts. `STSong-Light` (reportlab) and `china-s` (PyMuPDF) both handle mixed CJK+Latin text correctly.
30
+
31
+ ## Python Runtime (uv)
32
+
33
+ **CRITICAL: All Python execution in this skill MUST use `uv run --project .` from the skill directory. NEVER use bare `python3`, `python`, or `pip install` directly — the required packages (pdfplumber, reportlab, pypdf, etc.) are ONLY available inside the uv virtual environment defined by this skill's `pyproject.toml`. Direct `python3` will fail with ModuleNotFoundError.**
34
+
35
+ **IMPORTANT: Only use libraries available in this skill's pyproject.toml (reportlab, pypdf, pdfplumber, pillow, pdf2image, pymupdf). Do NOT import matplotlib, numpy, pandas, or other packages not listed — they will cause ModuleNotFoundError. For charts/graphs, use `reportlab.graphics.charts` (VerticalBarChart, HorizontalBarChart, Pie, etc.) instead of matplotlib.**
36
+
37
+ First, ensure the environment is set up (run once per session):
38
+
39
+ ```bash
40
+ cd "$SKILL_DIR"
41
+ bash scripts/ensure_uv_env.sh
42
+ ```
43
+
44
+ Then ALL Python commands must use this prefix:
45
+
46
+ ```bash
47
+ cd "$SKILL_DIR" && uv run --project . python <script-or-module>
48
+ ```
49
+
50
+ This applies to both the provided scripts AND any inline Python code you write. For inline code, use:
51
+
52
+ ```bash
53
+ cd "$SKILL_DIR" && uv run --project . python -c "import pdfplumber; ..."
54
+ ```
55
+
56
+ ## Delivery
57
+
58
+ Local PDF creation/editing/analysis does not require any Sophnet API key.
59
+
60
+ **IMPORTANT: After creating or modifying a PDF, ALWAYS upload it and return the download URL to the user.** This is the default behavior — do not skip the upload step.
61
+
62
+ ```bash
63
+ bash scripts/upload_file.sh --file <absolute-path-to-pdf>
64
+ ```
65
+
66
+ Upload command output contract:
67
+
68
+ - `FILE_PATH=<absolute-path>`
69
+ - `UPLOAD_STATUS=uploaded|skipped`
70
+ - `DOWNLOAD_URL=<https://...>` (present only when uploaded)
71
+
72
+ Delivery rules:
73
+
74
+ - **ALWAYS call `scripts/upload_file.sh` after producing a PDF file**, then return the `DOWNLOAD_URL` to the user.
75
+ - If `UPLOAD_STATUS=uploaded`, return the exact `DOWNLOAD_URL` value to the user as a clickable link.
76
+ - If `UPLOAD_STATUS=skipped` (missing API key), return `FILE_PATH` instead of failing the whole task.
77
+ - Keep URL output logic independent inside `sophnet-pdf/scripts`. Do not call other skills' upload scripts.
78
+
79
+ ## Quick Start
80
+
81
+ ```python
82
+ from pypdf import PdfReader, PdfWriter
83
+
84
+ # Read a PDF
85
+ reader = PdfReader("document.pdf")
86
+ print(f"Pages: {len(reader.pages)}")
87
+
88
+ # Extract text
89
+ text = ""
90
+ for page in reader.pages:
91
+ text += page.extract_text()
92
+ ```
93
+
94
+ ## Python Libraries
95
+
96
+ ### PyMuPDF (fitz) - Text Search & Replace
97
+
98
+ **This is the ONLY method for replacing text in existing PDFs. Do NOT fall back to nano-pdf, pdftotext, tesseract, or any external CLI tool — use PyMuPDF exclusively.**
99
+
100
+ **CRITICAL FONT RULES:**
101
+ - NEVER use the original PDF's embedded font names (e.g. `Unnamed-T3`, `NotoSansCJKsc-Regular`, `LiberationSans`) with `insert_text()` — they are embedded fonts and will cause `Exception: need font file or buffer`.
102
+ - Always use `page.search_for()` to locate text (it works across spans, unlike iterating `get_text("dict")` spans which may split multi-word terms).
103
+ - Use the **fallback font picker** below to choose a suitable built-in font.
104
+
105
+ **Built-in font aliases (Latin):** `helv` (Helvetica), `hebo` (Helvetica-Bold), `tiro` (Times-Roman), `tibo` (Times-Bold), `cour` (Courier), `cobo` (Courier-Bold).
106
+
107
+ **Built-in CJK font aliases:** `china-s` (Simplified Chinese), `china-t` (Traditional Chinese), `japan` (Japanese), `korea` (Korean). These also render Latin characters correctly.
108
+
109
+ #### Find & Replace Text in PDF (Complete Example)
110
+
111
+ ```python
112
+ import fitz # PyMuPDF
113
+
114
+ # --- Font fallback logic ---
115
+ BUILTIN_FONTS = {"helv","heit","hebo","hebi","tiro","tiit","tibo","tibi","cour","coit","cobo","cobi","symb","zadb"}
116
+ CJK_FONTS = {"china-s","china-ss","china-t","china-ts","japan","japan-s","korea","korea-s"}
117
+
118
+ def has_cjk(text):
119
+ """Check if text contains CJK characters."""
120
+ return any("\u4e00" <= c <= "\u9fff" or "\u3000" <= c <= "\u303f" for c in text)
121
+
122
+ def pick_font(original_font, text):
123
+ """Pick a built-in font. Falls back from the original embedded font name."""
124
+ if original_font in BUILTIN_FONTS or original_font in CJK_FONTS:
125
+ return original_font
126
+ if has_cjk(text):
127
+ return "china-s" # handles both CJK and Latin
128
+ lower = original_font.lower()
129
+ if "bold" in lower:
130
+ return "hebo"
131
+ if "mono" in lower or "courier" in lower or "code" in lower:
132
+ return "cour"
133
+ return "helv"
134
+
135
+ def get_span_info(page, rect):
136
+ """Get font size, font name, and color from the span overlapping rect."""
137
+ for block in page.get_text("dict")["blocks"]:
138
+ if "lines" not in block:
139
+ continue
140
+ for line in block["lines"]:
141
+ for span in line["spans"]:
142
+ if fitz.Rect(span["bbox"]).intersects(rect):
143
+ return span["size"], span["font"], span.get("color", 0)
144
+ return 12, "helv", 0
145
+
146
+ # --- Main replacement logic ---
147
+ search_terms = ["OldText", "Old Text"] # all variants to search for
148
+ replacement = "NewText"
149
+
150
+ doc = fitz.open("input.pdf")
151
+
152
+ for page in doc:
153
+ # Step 1: Find all instances using search_for (works across spans)
154
+ hits = []
155
+ for term in search_terms:
156
+ for rect in page.search_for(term):
157
+ size, font, color = get_span_info(page, rect)
158
+ hits.append({"rect": rect, "size": size, "font": font, "color": color})
159
+
160
+ # Step 2: Redact (erase) old text
161
+ for h in hits:
162
+ page.add_redact_annot(h["rect"], fill=(1, 1, 1))
163
+ page.apply_redactions()
164
+
165
+ # Step 3: Insert replacement text with fallback font
166
+ for h in hits:
167
+ font = pick_font(h["font"], replacement)
168
+ c = h["color"]
169
+ rgb = (((c>>16)&0xFF)/255, ((c>>8)&0xFF)/255, (c&0xFF)/255) if isinstance(c, int) else (0,0,0)
170
+ page.insert_text(
171
+ (h["rect"].x0, h["rect"].y0 + h["size"] * 0.85),
172
+ replacement,
173
+ fontname=font,
174
+ fontsize=h["size"],
175
+ color=rgb,
176
+ )
177
+
178
+ doc.save("output.pdf", garbage=4, deflate=True)
179
+ ```
180
+
181
+ **After replacement, always verify** by extracting text from the output PDF:
182
+
183
+ ```python
184
+ doc2 = fitz.open("output.pdf")
185
+ for i, p in enumerate(doc2):
186
+ text = p.get_text()
187
+ old_count = text.lower().count("oldtext")
188
+ new_count = text.lower().count("newtext")
189
+ print(f"Page {i+1}: OldText={old_count}, NewText={new_count}")
190
+ ```
191
+
192
+ **Notes:**
193
+ - PDF text replacement is inherently imperfect — PDFs are presentation-focused, not text-edit-focused.
194
+ - The replacement font may look slightly different from the original embedded font. This is expected.
195
+ - `china-s` is the best fallback for mixed CJK+Latin text; `helv` for Latin-only text.
196
+
197
+ ### PyMuPDF (fitz) - Image Operations
198
+
199
+ **Use PyMuPDF for ALL image operations in existing PDFs.** Key APIs:
200
+ - `page.insert_image(rect, filename=...)` — insert a new image
201
+ - `page.replace_image(xref, filename=...)` — replace an existing image in-place (keeps original rect)
202
+ - `page.delete_image(xref)` — remove an image (replaces with 1×1 transparent pixel)
203
+ - `page.get_images(full=True)` — list images on a page (returns xref, etc.)
204
+ - `page.get_image_info(xrefs=True)` — list images with bounding box info
205
+
206
+ **CRITICAL: Always use `page.get_image_info(xrefs=True)` (not `get_images`) when you need the on-page bounding box of each image.**
207
+
208
+ #### Insert Image into Existing PDF
209
+
210
+ ```python
211
+ import fitz
212
+
213
+ doc = fitz.open("input.pdf")
214
+ page = doc[0]
215
+
216
+ # Define target rectangle (x0, y0, x1, y1) — image will be scaled to fit
217
+ # To center horizontally with 50% page width:
218
+ pw = page.rect.width
219
+ img_w = pw * 0.5
220
+ # Load image to get aspect ratio
221
+ img = fitz.Pixmap("image.png")
222
+ aspect = img.width / img.height
223
+ img_h = img_w / aspect
224
+ x0 = (pw - img_w) / 2
225
+ y0 = 700 # vertical position
226
+
227
+ rect = fitz.Rect(x0, y0, x0 + img_w, y0 + img_h)
228
+ page.insert_image(rect, filename="image.png")
229
+ doc.save("output.pdf")
230
+ ```
231
+
232
+ #### Replace an Existing Image
233
+
234
+ ```python
235
+ import fitz
236
+
237
+ doc = fitz.open("input.pdf")
238
+ page = doc[0]
239
+
240
+ # Find the image to replace
241
+ images = page.get_images(full=True)
242
+ target_xref = images[0][0] # xref of the first image
243
+
244
+ # Replace it — the new image fills the SAME rect as the original
245
+ page.replace_image(target_xref, filename="new_image.png")
246
+ doc.save("output.pdf")
247
+ ```
248
+
249
+ **Note:** `replace_image` keeps the original bounding box. The new image is scaled to fit.
250
+
251
+ #### Modify Image Layout (Reposition / Resize)
252
+
253
+ ```python
254
+ import fitz
255
+
256
+ doc = fitz.open("input.pdf")
257
+ page = doc[0]
258
+
259
+ # Get image placements with bounding boxes
260
+ img_info = page.get_image_info(xrefs=True)
261
+ target = img_info[0] # first image
262
+ xref = target["xref"]
263
+ orig_rect = fitz.Rect(target["bbox"])
264
+
265
+ # Extract original image bytes
266
+ pix = fitz.Pixmap(doc, xref)
267
+ img_bytes = pix.tobytes("png")
268
+
269
+ # Delete the original (replaces with 1x1 transparent pixel)
270
+ page.delete_image(xref)
271
+
272
+ # Re-insert at new position/size (e.g. 70% size, centered)
273
+ pw = page.rect.width
274
+ new_w = orig_rect.width * 0.7
275
+ new_h = orig_rect.height * 0.7
276
+ new_x0 = (pw - new_w) / 2
277
+ new_rect = fitz.Rect(new_x0, orig_rect.y0, new_x0 + new_w, orig_rect.y0 + new_h)
278
+ page.insert_image(new_rect, stream=img_bytes)
279
+
280
+ doc.save("output.pdf", garbage=4, deflate=True)
281
+ ```
282
+
283
+ **Notes on image operations:**
284
+ - `delete_image` replaces the image data with a 1×1 transparent pixel (the page reference remains but is invisible). This is normal PyMuPDF behavior.
285
+ - For layout changes: extract → delete → re-insert at new rect. Do NOT try `doc.update_image()` (does not exist).
286
+ - `insert_image` accepts `filename=` (file path) or `stream=` (bytes). Use `stream=` when re-inserting extracted images.
287
+ - Use `garbage=4, deflate=True` in `doc.save()` to clean up unused objects and compress.
288
+
289
+ ### pypdf - Basic Operations
290
+
291
+ #### Merge PDFs
292
+
293
+ ```python
294
+ from pypdf import PdfWriter, PdfReader
295
+
296
+ writer = PdfWriter()
297
+ for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
298
+ reader = PdfReader(pdf_file)
299
+ for page in reader.pages:
300
+ writer.add_page(page)
301
+
302
+ with open("merged.pdf", "wb") as output:
303
+ writer.write(output)
304
+ ```
305
+
306
+ #### Split PDF
307
+
308
+ ```python
309
+ reader = PdfReader("input.pdf")
310
+ for i, page in enumerate(reader.pages):
311
+ writer = PdfWriter()
312
+ writer.add_page(page)
313
+ with open(f"page_{i+1}.pdf", "wb") as output:
314
+ writer.write(output)
315
+ ```
316
+
317
+ #### Extract Metadata
318
+
319
+ ```python
320
+ reader = PdfReader("document.pdf")
321
+ meta = reader.metadata
322
+ print(f"Title: {meta.title}")
323
+ print(f"Author: {meta.author}")
324
+ print(f"Subject: {meta.subject}")
325
+ print(f"Creator: {meta.creator}")
326
+ ```
327
+
328
+ #### Rotate Pages
329
+
330
+ ```python
331
+ reader = PdfReader("input.pdf")
332
+ writer = PdfWriter()
333
+
334
+ page = reader.pages[0]
335
+ page.rotate(90) # Rotate 90 degrees clockwise
336
+ writer.add_page(page)
337
+
338
+ with open("rotated.pdf", "wb") as output:
339
+ writer.write(output)
340
+ ```
341
+
342
+ ### pdfplumber - Text and Table Extraction
343
+
344
+ #### Extract Text with Layout
345
+
346
+ ```python
347
+ import pdfplumber
348
+
349
+ with pdfplumber.open("document.pdf") as pdf:
350
+ for page in pdf.pages:
351
+ text = page.extract_text()
352
+ print(text)
353
+ ```
354
+
355
+ #### Extract Tables
356
+
357
+ ```python
358
+ with pdfplumber.open("document.pdf") as pdf:
359
+ for i, page in enumerate(pdf.pages):
360
+ tables = page.extract_tables()
361
+ for j, table in enumerate(tables):
362
+ print(f"Table {j+1} on page {i+1}:")
363
+ for row in table:
364
+ print(row)
365
+ ```
366
+
367
+ #### Advanced Table Extraction (requires pandas — not in this skill's pyproject.toml)
368
+
369
+ ```python
370
+ import pandas as pd
371
+
372
+ with pdfplumber.open("document.pdf") as pdf:
373
+ all_tables = []
374
+ for page in pdf.pages:
375
+ tables = page.extract_tables()
376
+ for table in tables:
377
+ if table: # Check if table is not empty
378
+ df = pd.DataFrame(table[1:], columns=table[0])
379
+ all_tables.append(df)
380
+
381
+ # Combine all tables
382
+ if all_tables:
383
+ combined_df = pd.concat(all_tables, ignore_index=True)
384
+ combined_df.to_excel("extracted_tables.xlsx", index=False)
385
+ ```
386
+
387
+ ### reportlab - Create PDFs
388
+
389
+ > ⚠️ **If the PDF contains ANY Chinese/Japanese/Korean text, you MUST follow the CJK rules below. Skipping them causes black boxes (█) or crash errors.**
390
+
391
+ **FORBIDDEN patterns (these ALL break CJK rendering):**
392
+ - ❌ Using `Helvetica`, `Times-Roman`, or `Courier` fonts for CJK text → renders as **black boxes (█)**
393
+ - ❌ Registering `STSong-Light` without calling `addMapping()` → `Paragraph()` crashes with `ValueError: Can't map determine family/bold/italic for stsong-light`
394
+ - ❌ Using `<font face="STSong-Light">` in Paragraph XML without `addMapping()` → same crash
395
+ - ❌ Setting `style.fontName = "STSong-Light"` without `addMapping()` → same crash
396
+
397
+ **CJK font setup (MUST be at the top of EVERY script with CJK text):**
398
+
399
+ ```python
400
+ from reportlab.pdfbase import pdfmetrics
401
+ from reportlab.pdfbase.cidfonts import UnicodeCIDFont
402
+ from reportlab.lib.fonts import addMapping
403
+
404
+ # Step 1: Register the CJK font
405
+ pdfmetrics.registerFont(UnicodeCIDFont("STSong-Light"))
406
+ # Step 2: MANDATORY — register font family mapping (without this Paragraph() crashes)
407
+ addMapping("STSong-Light", 0, 0, "STSong-Light") # normal
408
+ addMapping("STSong-Light", 1, 0, "STSong-Light") # bold
409
+ addMapping("STSong-Light", 0, 1, "STSong-Light") # italic
410
+ addMapping("STSong-Light", 1, 1, "STSong-Light") # bold+italic
411
+ ```
412
+
413
+ **After setup, use `"STSong-L