@kortix/sandbox 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. package/config/customize.sh +143 -0
  2. package/config/kortix-env-setup.sh +25 -0
  3. package/kortix-master/package.json +22 -0
  4. package/kortix-master/src/config.ts +22 -0
  5. package/kortix-master/src/index.ts +44 -0
  6. package/kortix-master/src/routes/env.ts +65 -0
  7. package/kortix-master/src/routes/proxy.ts +108 -0
  8. package/kortix-master/src/routes/update.ts +185 -0
  9. package/kortix-master/src/services/proxy.ts +43 -0
  10. package/kortix-master/src/services/secret-store.ts +156 -0
  11. package/kortix-master/tsconfig.json +14 -0
  12. package/opencode/agents/kortix-browser.md +142 -0
  13. package/opencode/agents/kortix-build.md +62 -0
  14. package/opencode/agents/kortix-explore.md +66 -0
  15. package/opencode/agents/kortix-image-gen.md +33 -0
  16. package/opencode/agents/kortix-main.md +450 -0
  17. package/opencode/agents/kortix-plan.md +100 -0
  18. package/opencode/agents/kortix-research.md +84 -0
  19. package/opencode/agents/kortix-sheets.md +61 -0
  20. package/opencode/agents/kortix-slides.md +64 -0
  21. package/opencode/agents/kortix-web-dev.md +572 -0
  22. package/opencode/commands/email.md +36 -0
  23. package/opencode/commands/init.md +43 -0
  24. package/opencode/commands/journal.md +44 -0
  25. package/opencode/commands/memory-init.md +81 -0
  26. package/opencode/commands/memory-search.md +50 -0
  27. package/opencode/commands/memory-status.md +56 -0
  28. package/opencode/commands/research.md +36 -0
  29. package/opencode/commands/search.md +38 -0
  30. package/opencode/commands/slides.md +32 -0
  31. package/opencode/commands/spreadsheet.md +30 -0
  32. package/opencode/memory.json +37 -0
  33. package/opencode/ocx.jsonc +10 -0
  34. package/opencode/opencode.jsonc +103 -0
  35. package/opencode/package.json +25 -0
  36. package/opencode/patches/apply.sh +19 -0
  37. package/opencode/patches/opencode-pty-spawn.txt +49 -0
  38. package/opencode/plugin/background-agents.ts.disabled +483 -0
  39. package/opencode/plugin/kdco-primitives/get-project-id.ts +172 -0
  40. package/opencode/plugin/kdco-primitives/index.ts +26 -0
  41. package/opencode/plugin/kdco-primitives/log-warn.ts +51 -0
  42. package/opencode/plugin/kdco-primitives/mutex.ts +122 -0
  43. package/opencode/plugin/kdco-primitives/shell.ts +138 -0
  44. package/opencode/plugin/kdco-primitives/temp.ts +36 -0
  45. package/opencode/plugin/kdco-primitives/terminal-detect.ts +34 -0
  46. package/opencode/plugin/kdco-primitives/types.ts +13 -0
  47. package/opencode/plugin/kdco-primitives/with-timeout.ts +84 -0
  48. package/opencode/plugin/memory.ts +306 -0
  49. package/opencode/plugin/worktree/state.ts +412 -0
  50. package/opencode/plugin/worktree/terminal.ts +1002 -0
  51. package/opencode/plugin/worktree.ts +861 -0
  52. package/opencode/skills/KORTIX-browser/SKILL.md +478 -0
  53. package/opencode/skills/KORTIX-cron-triggers/SKILL.md +173 -0
  54. package/opencode/skills/KORTIX-deep-research/SKILL.md +278 -0
  55. package/opencode/skills/KORTIX-docx/SKILL.md +398 -0
  56. package/opencode/skills/KORTIX-docx/scripts/__init__.py +1 -0
  57. package/opencode/skills/KORTIX-docx/scripts/accept_changes.py +104 -0
  58. package/opencode/skills/KORTIX-docx/scripts/comment.py +244 -0
  59. package/opencode/skills/KORTIX-docx/scripts/office/helpers/__init__.py +0 -0
  60. package/opencode/skills/KORTIX-docx/scripts/office/helpers/merge_runs.py +199 -0
  61. package/opencode/skills/KORTIX-docx/scripts/office/helpers/simplify_redlines.py +197 -0
  62. package/opencode/skills/KORTIX-docx/scripts/office/pack.py +159 -0
  63. package/opencode/skills/KORTIX-docx/scripts/office/soffice.py +183 -0
  64. package/opencode/skills/KORTIX-docx/scripts/office/unpack.py +132 -0
  65. package/opencode/skills/KORTIX-docx/scripts/office/validate.py +111 -0
  66. package/opencode/skills/KORTIX-docx/scripts/office/validators/__init__.py +15 -0
  67. package/opencode/skills/KORTIX-docx/scripts/office/validators/base.py +847 -0
  68. package/opencode/skills/KORTIX-docx/scripts/office/validators/docx.py +446 -0
  69. package/opencode/skills/KORTIX-docx/scripts/office/validators/pptx.py +275 -0
  70. package/opencode/skills/KORTIX-docx/scripts/office/validators/redlining.py +247 -0
  71. package/opencode/skills/KORTIX-docx/scripts/render_docx.py +179 -0
  72. package/opencode/skills/KORTIX-docx/scripts/templates/comments.xml +3 -0
  73. package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtended.xml +3 -0
  74. package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtensible.xml +3 -0
  75. package/opencode/skills/KORTIX-docx/scripts/templates/commentsIds.xml +3 -0
  76. package/opencode/skills/KORTIX-docx/scripts/templates/people.xml +3 -0
  77. package/opencode/skills/KORTIX-domain-research/SKILL.md +96 -0
  78. package/opencode/skills/KORTIX-domain-research/scripts/domain-lookup.py +810 -0
  79. package/opencode/skills/KORTIX-elevenlabs/SKILL.md +230 -0
  80. package/opencode/skills/KORTIX-elevenlabs/scripts/tts.py +389 -0
  81. package/opencode/skills/KORTIX-email/SKILL.md +145 -0
  82. package/opencode/skills/KORTIX-legal-writer/SKILL.md +409 -0
  83. package/opencode/skills/KORTIX-legal-writer/references/bluebook.md +152 -0
  84. package/opencode/skills/KORTIX-legal-writer/references/document-types.md +416 -0
  85. package/opencode/skills/KORTIX-legal-writer/scripts/courtlistener.py +291 -0
  86. package/opencode/skills/KORTIX-legal-writer/scripts/ecfr_lookup.py +299 -0
  87. package/opencode/skills/KORTIX-legal-writer/scripts/verify-legal.py +507 -0
  88. package/opencode/skills/KORTIX-logo-creator/SKILL.md +293 -0
  89. package/opencode/skills/KORTIX-logo-creator/references/prompt-patterns.md +134 -0
  90. package/opencode/skills/KORTIX-logo-creator/scripts/compose_logo.py +406 -0
  91. package/opencode/skills/KORTIX-logo-creator/scripts/create_logo_sheet.py +258 -0
  92. package/opencode/skills/KORTIX-logo-creator/scripts/remove_bg.py +96 -0
  93. package/opencode/skills/KORTIX-memory/SKILL.md +261 -0
  94. package/opencode/skills/KORTIX-memory/scripts/export-sessions.py +409 -0
  95. package/opencode/skills/KORTIX-paper-creator/SKILL.md +549 -0
  96. package/opencode/skills/KORTIX-paper-creator/assets/template.tex +101 -0
  97. package/opencode/skills/KORTIX-paper-creator/scripts/compile.sh +177 -0
  98. package/opencode/skills/KORTIX-paper-creator/scripts/openalex_to_bibtex.py +220 -0
  99. package/opencode/skills/KORTIX-paper-creator/scripts/verify.sh +354 -0
  100. package/opencode/skills/KORTIX-paper-search/SKILL.md +418 -0
  101. package/opencode/skills/KORTIX-pdf/SKILL.md +232 -0
  102. package/opencode/skills/KORTIX-pdf/forms.md +36 -0
  103. package/opencode/skills/KORTIX-pdf/reference.md +105 -0
  104. package/opencode/skills/KORTIX-pdf/scripts/check_bounding_boxes.py +65 -0
  105. package/opencode/skills/KORTIX-pdf/scripts/check_fillable_fields.py +11 -0
  106. package/opencode/skills/KORTIX-pdf/scripts/convert_pdf_to_images.py +33 -0
  107. package/opencode/skills/KORTIX-pdf/scripts/create_validation_image.py +37 -0
  108. package/opencode/skills/KORTIX-pdf/scripts/extract_form_field_info.py +122 -0
  109. package/opencode/skills/KORTIX-pdf/scripts/extract_form_structure.py +115 -0
  110. package/opencode/skills/KORTIX-pdf/scripts/fill_fillable_fields.py +98 -0
  111. package/opencode/skills/KORTIX-pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
  112. package/opencode/skills/KORTIX-plan/SKILL.md +228 -0
  113. package/opencode/skills/KORTIX-presentation-viewer/SKILL.md +87 -0
  114. package/opencode/skills/KORTIX-presentation-viewer/serve.ts +136 -0
  115. package/opencode/skills/KORTIX-presentation-viewer/viewer.html +559 -0
  116. package/opencode/skills/KORTIX-presentations/SKILL.md +344 -0
  117. package/opencode/skills/KORTIX-remotion/SKILL.md +56 -0
  118. package/opencode/skills/KORTIX-remotion/rules/3d.md +86 -0
  119. package/opencode/skills/KORTIX-remotion/rules/animations.md +29 -0
  120. package/opencode/skills/KORTIX-remotion/rules/assets.md +78 -0
  121. package/opencode/skills/KORTIX-remotion/rules/audio-visualization.md +198 -0
  122. package/opencode/skills/KORTIX-remotion/rules/audio.md +169 -0
  123. package/opencode/skills/KORTIX-remotion/rules/calculate-metadata.md +104 -0
  124. package/opencode/skills/KORTIX-remotion/rules/can-decode.md +75 -0
  125. package/opencode/skills/KORTIX-remotion/rules/charts.md +120 -0
  126. package/opencode/skills/KORTIX-remotion/rules/compositions.md +141 -0
  127. package/opencode/skills/KORTIX-remotion/rules/display-captions.md +184 -0
  128. package/opencode/skills/KORTIX-remotion/rules/extract-frames.md +229 -0
  129. package/opencode/skills/KORTIX-remotion/rules/ffmpeg.md +38 -0
  130. package/opencode/skills/KORTIX-remotion/rules/fonts.md +152 -0
  131. package/opencode/skills/KORTIX-remotion/rules/get-audio-duration.md +58 -0
  132. package/opencode/skills/KORTIX-remotion/rules/get-video-dimensions.md +68 -0
  133. package/opencode/skills/KORTIX-remotion/rules/get-video-duration.md +58 -0
  134. package/opencode/skills/KORTIX-remotion/rules/gifs.md +141 -0
  135. package/opencode/skills/KORTIX-remotion/rules/images.md +130 -0
  136. package/opencode/skills/KORTIX-remotion/rules/import-srt-captions.md +69 -0
  137. package/opencode/skills/KORTIX-remotion/rules/light-leaks.md +73 -0
  138. package/opencode/skills/KORTIX-remotion/rules/lottie.md +68 -0
  139. package/opencode/skills/KORTIX-remotion/rules/maps.md +401 -0
  140. package/opencode/skills/KORTIX-remotion/rules/measuring-dom-nodes.md +35 -0
  141. package/opencode/skills/KORTIX-remotion/rules/measuring-text.md +143 -0
  142. package/opencode/skills/KORTIX-remotion/rules/parameters.md +98 -0
  143. package/opencode/skills/KORTIX-remotion/rules/sequencing.md +118 -0
  144. package/opencode/skills/KORTIX-remotion/rules/subtitles.md +36 -0
  145. package/opencode/skills/KORTIX-remotion/rules/tailwind.md +11 -0
  146. package/opencode/skills/KORTIX-remotion/rules/text-animations.md +20 -0
  147. package/opencode/skills/KORTIX-remotion/rules/timing.md +179 -0
  148. package/opencode/skills/KORTIX-remotion/rules/transcribe-captions.md +70 -0
  149. package/opencode/skills/KORTIX-remotion/rules/transitions.md +197 -0
  150. package/opencode/skills/KORTIX-remotion/rules/transparent-videos.md +106 -0
  151. package/opencode/skills/KORTIX-remotion/rules/trimming.md +53 -0
  152. package/opencode/skills/KORTIX-remotion/rules/videos.md +171 -0
  153. package/opencode/skills/KORTIX-secrets/SKILL.md +280 -0
  154. package/opencode/skills/KORTIX-semantic-search/SKILL.md +213 -0
  155. package/opencode/skills/KORTIX-session-search/SKILL.md +807 -0
  156. package/opencode/skills/KORTIX-session-search/Untitled +1 -0
  157. package/opencode/skills/KORTIX-skill-creator/SKILL.md +163 -0
  158. package/opencode/skills/KORTIX-web-research/SKILL.md +69 -0
  159. package/opencode/skills/KORTIX-xlsx/LICENSE.txt +30 -0
  160. package/opencode/skills/KORTIX-xlsx/SKILL.md +549 -0
  161. package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/__init__.py +0 -0
  162. package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/merge_runs.py +199 -0
  163. package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
  164. package/opencode/skills/KORTIX-xlsx/scripts/office/pack.py +159 -0
  165. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  166. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  167. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  168. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  169. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  170. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  171. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  172. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  173. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  174. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  175. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  176. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  177. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  178. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  179. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  180. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  181. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  182. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  183. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  184. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  185. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  186. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  187. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  188. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  189. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  190. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  191. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  192. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  193. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  194. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  195. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  196. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
  197. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  198. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  199. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  200. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  201. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  202. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  203. package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  204. package/opencode/skills/KORTIX-xlsx/scripts/office/soffice.py +183 -0
  205. package/opencode/skills/KORTIX-xlsx/scripts/office/unpack.py +132 -0
  206. package/opencode/skills/KORTIX-xlsx/scripts/office/validate.py +111 -0
  207. package/opencode/skills/KORTIX-xlsx/scripts/office/validators/__init__.py +15 -0
  208. package/opencode/skills/KORTIX-xlsx/scripts/office/validators/base.py +847 -0
  209. package/opencode/skills/KORTIX-xlsx/scripts/office/validators/docx.py +446 -0
  210. package/opencode/skills/KORTIX-xlsx/scripts/office/validators/pptx.py +275 -0
  211. package/opencode/skills/KORTIX-xlsx/scripts/office/validators/redlining.py +247 -0
  212. package/opencode/skills/KORTIX-xlsx/scripts/recalc.py +184 -0
  213. package/opencode/tools/image-gen.ts +342 -0
  214. package/opencode/tools/image-search.ts +190 -0
  215. package/opencode/tools/memory-get.ts +168 -0
  216. package/opencode/tools/memory-search.ts +247 -0
  217. package/opencode/tools/presentation-gen.ts +723 -0
  218. package/opencode/tools/scrape-webpage.ts +115 -0
  219. package/opencode/tools/scripts/.python-version +1 -0
  220. package/opencode/tools/scripts/convert_pdf.py +184 -0
  221. package/opencode/tools/scripts/convert_pptx.py +562 -0
  222. package/opencode/tools/scripts/pyproject.toml +11 -0
  223. package/opencode/tools/scripts/uv.lock +287 -0
  224. package/opencode/tools/scripts/validate_slide.py +74 -0
  225. package/opencode/tools/show-user.ts +217 -0
  226. package/opencode/tools/tests/e2e-presentation-fix.ts +277 -0
  227. package/opencode/tools/tests/image-gen.test.ts +215 -0
  228. package/opencode/tools/tests/image-search.test.ts +125 -0
  229. package/opencode/tools/tests/memory-system-benchmark.ts +1076 -0
  230. package/opencode/tools/tests/presentation-gen.test.ts +389 -0
  231. package/opencode/tools/tests/scrape-webpage.test.ts +74 -0
  232. package/opencode/tools/tests/show-user.test.ts +241 -0
  233. package/opencode/tools/tests/video-gen.test.ts +110 -0
  234. package/opencode/tools/tests/web-search.test.ts +106 -0
  235. package/opencode/tools/video-gen.ts +200 -0
  236. package/opencode/tools/web-search.ts +153 -0
  237. package/opencode/tsconfig.json +29 -0
  238. package/package.json +36 -0
  239. package/patch-agent-browser.js +100 -0
  240. package/postinstall.sh +88 -0
  241. package/services/KORTIX-presentation-viewer/run +37 -0
  242. package/services/agent-browser-viewer/run +48 -0
  243. package/services/kortix-master/run +16 -0
  244. package/services/lss-sync/run +22 -0
  245. package/services/opencode-serve/run +25 -0
  246. package/services/opencode-web/run +21 -0
@@ -0,0 +1,418 @@
1
+ ---
2
+ name: kortix-paper-search
3
+ description: "Academic paper search powered by OpenAlex -- the free, open catalog of 240M+ scholarly works. Use when the user needs to find academic papers, research articles, literature for a topic, citation data, author publications, or any scholarly source. Triggers on: 'find papers on', 'academic research about', 'what studies exist', 'literature review', 'find citations', 'scholarly articles about', 'who published on', 'papers by [author]', 'highly cited papers on', any request for peer-reviewed or academic sources. Also use during deep research when you need to ground findings in academic literature. Do NOT use for general web searches -- use web-search for that."
4
+ ---
5
+
6
+ # Academic Paper Search (OpenAlex)
7
+
8
+ Search 240M+ scholarly works using the OpenAlex API -- completely free, no API key required, no SDK needed. All you need is `curl` (run from `bash`) and a correctly constructed request URL.
9
+
10
+ **Full docs:** https://docs.openalex.org
11
+
12
+ ## Quick Start
13
+
14
+ OpenAlex is a REST API. You query it by constructing URLs and fetching them with `curl`. All responses are JSON.
15
+
16
+ ```bash
17
+ # Search for papers about "transformer architecture"
18
+ curl -s "https://api.openalex.org/works?search=transformer+architecture&per_page=5&mailto=agent@kortix.ai" | python3 -m json.tool
19
+ ```
20
+
21
+ **Important:** Always include `mailto=agent@kortix.ai` (or any valid email) in every request. Without it, you're limited to 1 request/second. With it, you get 10 requests/second (the "polite pool").
22
+
23
+ ## Core Concepts
24
+
25
+ ### Entities
26
+
27
+ OpenAlex has these entity types (all queryable):
28
+
29
+ | Entity | Endpoint | Count | Description |
30
+ |--------|----------|-------|-------------|
31
+ | **Works** | `/works` | 240M+ | Papers, articles, books, datasets, theses |
32
+ | **Authors** | `/authors` | 90M+ | People who create works |
33
+ | **Sources** | `/sources` | 250K+ | Journals, repositories, conferences |
34
+ | **Institutions** | `/institutions` | 110K+ | Universities, research orgs |
35
+ | **Topics** | `/topics` | 4K+ | Research topics (hierarchical) |
36
+
37
+ ### Work Object -- Key Fields
38
+
39
+ When you fetch a work, these are the most useful fields:
40
+
41
+ ```
42
+ id OpenAlex ID (e.g., "https://openalex.org/W2741809807")
43
+ doi DOI URL
44
+ title / display_name Paper title
45
+ publication_year Year published
46
+ publication_date Full date (YYYY-MM-DD)
47
+ cited_by_count Number of incoming citations
48
+ fwci Field-Weighted Citation Impact (normalized)
49
+ type article, preprint, review, book, dataset, etc.
50
+ language ISO 639-1 code (e.g., "en")
51
+ is_retracted Boolean
52
+ open_access.is_oa Boolean -- is it freely accessible?
53
+ open_access.oa_url Direct URL to free version
54
+ authorships List of authors with names, institutions, ORCIDs
55
+ abstract_inverted_index Abstract as inverted index (needs reconstruction)
56
+ referenced_works List of OpenAlex IDs this work cites (outgoing)
57
+ related_works Algorithmically related works
58
+ cited_by_api_url API URL to get works that cite this one (incoming)
59
+ topics Assigned research topics with scores
60
+ keywords Extracted keywords with scores
61
+ primary_location Where the work is published (journal, repo)
62
+ best_oa_location Best open access location with PDF link
63
+ ```
64
+
65
+ ### Reconstructing Abstracts
66
+
67
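+ OpenAlex stores abstracts as inverted indexes for legal reasons: each word maps to the list of positions where it occurs. To get plaintext, reconstruct it by placing every word at each of its positions and joining the result: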
68
+
69
+ ```python
70
+ import json, sys
71
+ # Read the abstract_inverted_index from a work object
72
+ inv_idx = work["abstract_inverted_index"]
73
+ if inv_idx:
74
+     words = [""] * (max(max(positions) for positions in inv_idx.values()) + 1)
75
+     for word, positions in inv_idx.items():
76
+         for pos in positions:
77
+             words[pos] = word
78
+     abstract = " ".join(words)
79
+ ```
80
+
81
+ Or in bash with `python3 -c`:
82
+ ```bash
83
+ # Pipe a work JSON into this to extract the abstract
84
+ echo "$WORK_JSON" | python3 -c "
85
+ import json,sys
86
+ w=json.load(sys.stdin)
87
+ idx=w.get('abstract_inverted_index',{})
88
+ if idx:
89
+     words=['']*( max(max(p) for p in idx.values())+1 )
90
+     for word,positions in idx.items():
91
+         for pos in positions: words[pos]=word
92
+     print(' '.join(words))
93
+ "
94
+ ```
95
+
96
+ ## Searching for Papers
97
+
98
+ ### Basic Keyword Search
99
+
100
+ Searches across titles, abstracts, and fulltext. Uses stemming and stop-word removal.
101
+
102
+ ```bash
103
+ # Simple search
104
+ curl -s "https://api.openalex.org/works?search=large+language+models&mailto=agent@kortix.ai"
105
+
106
+ # With per_page limit
107
+ curl -s "https://api.openalex.org/works?search=CRISPR+gene+editing&per_page=10&mailto=agent@kortix.ai"
108
+ ```
109
+
110
+ ### Boolean Search
111
+
112
+ Use uppercase `AND`, `OR`, `NOT` with parentheses and quoted phrases:
113
+
114
+ ```bash
115
+ # Complex boolean query
116
+ curl -s "https://api.openalex.org/works?search=(reinforcement+learning+AND+%22robot+control%22)+NOT+simulation&mailto=agent@kortix.ai"
117
+
118
+ # Exact phrase match (use double quotes, URL-encoded as %22)
119
+ curl -s "https://api.openalex.org/works?search=%22attention+is+all+you+need%22&mailto=agent@kortix.ai"
120
+ ```
121
+
122
+ ### Search Specific Fields
123
+
124
+ ```bash
125
+ # Title only
126
+ curl -s "https://api.openalex.org/works?filter=title.search:transformer&mailto=agent@kortix.ai"
127
+
128
+ # Abstract only
129
+ curl -s "https://api.openalex.org/works?filter=abstract.search:protein+folding&mailto=agent@kortix.ai"
130
+
131
+ # Title and abstract combined
132
+ curl -s "https://api.openalex.org/works?filter=title_and_abstract.search:neural+scaling+laws&mailto=agent@kortix.ai"
133
+
134
+ # Fulltext search (subset of works)
135
+ curl -s "https://api.openalex.org/works?filter=fulltext.search:climate+tipping+points&mailto=agent@kortix.ai"
136
+ ```
137
+
138
+ ## Filtering
139
+
140
+ Filters are the most powerful feature. Combine them with commas (AND) or pipes (OR).
141
+
142
+ ### Most Useful Filters
143
+
144
+ ```bash
145
+ # By publication year
146
+ ?filter=publication_year:2024
147
+ ?filter=publication_year:2020-2024
148
+ ?filter=publication_year:>2022
149
+
150
+ # By citation count
151
+ ?filter=cited_by_count:>100 # highly cited
152
+ ?filter=cited_by_count:>1000 # landmark papers
153
+
154
+ # By open access
155
+ ?filter=is_oa:true # only open access
156
+ ?filter=oa_status:gold # gold OA only
157
+
158
+ # By type
159
+ ?filter=type:article # journal articles
160
+ ?filter=type:preprint # preprints
161
+ ?filter=type:review # review articles
162
+
163
+ # By language
164
+ ?filter=language:en # English only
165
+
166
+ # Not retracted
167
+ ?filter=is_retracted:false
168
+
169
+ # Has abstract
170
+ ?filter=has_abstract:true
171
+
172
+ # Has downloadable PDF
173
+ ?filter=has_content.pdf:true
174
+
175
+ # By author (OpenAlex ID)
176
+ ?filter=author.id:A5023888391
177
+
178
+ # By institution (OpenAlex ID)
179
+ ?filter=institutions.id:I27837315 # e.g., University of Michigan
180
+
181
+ # By DOI
182
+ ?filter=doi:https://doi.org/10.1038/s41586-021-03819-2
183
+
184
+ # By indexed source
185
+ ?filter=indexed_in:arxiv # arXiv papers
186
+ ?filter=indexed_in:pubmed # PubMed papers
187
+ ?filter=indexed_in:crossref # Crossref papers
188
+ ```
189
+
190
+ ### Combining Filters
191
+
192
+ ```bash
193
+ # AND: comma-separated
194
+ ?filter=publication_year:>2022,cited_by_count:>50,is_oa:true,type:article
195
+
196
+ # OR: pipe-separated within a filter
197
+ ?filter=publication_year:2023|2024
198
+
199
+ # NOT: prefix with !
200
+ ?filter=type:!preprint
201
+
202
+ # Combined example: highly-cited OA works matching "machine learning" from 2023-2024, excluding preprints
203
+ curl -s "https://api.openalex.org/works?filter=publication_year:2023-2024,cited_by_count:>50,is_oa:true,type:!preprint&search=machine+learning&per_page=10&mailto=agent@kortix.ai"
204
+ ```
205
+
206
+ ## Sorting
207
+
208
+ ```bash
209
+ # Most cited first
210
+ ?sort=cited_by_count:desc
211
+
212
+ # Most recent first
213
+ ?sort=publication_date:desc
214
+
215
+ # Most relevant first (only when using search)
216
+ ?sort=relevance_score:desc
217
+
218
+ # Multiple sort keys
219
+ ?sort=publication_year:desc,cited_by_count:desc
220
+ ```
221
+
222
+ ## Pagination
223
+
224
+ Two modes: **basic paging** (for browsing) and **cursor paging** (for collecting all results).
225
+
226
+ ```bash
227
+ # Basic paging (limited to 10,000 results)
228
+ ?page=1&per_page=25
229
+ ?page=2&per_page=25
230
+
231
+ # Cursor paging (unlimited, for collecting everything)
232
+ ?per_page=100&cursor=* # first page
233
+ ?per_page=100&cursor=IlsxNjk0ODc... # next page (cursor from previous response meta)
234
+ ```
235
+
236
+ The cursor for the next page is in `response.meta.next_cursor`. When it's `null`, you've reached the end.
237
+
238
+ ## Select Fields
239
+
240
+ Reduce response size by selecting only the fields you need:
241
+
242
+ ```bash
243
+ # Only get IDs, titles, citation counts, and DOIs
244
+ ?select=id,display_name,cited_by_count,doi,publication_year
245
+
246
+ # Minimal metadata for scanning
247
+ ?select=id,display_name,publication_year,cited_by_count,open_access
248
+ ```
249
+
250
+ ## Citation Graph Traversal
251
+
252
+ ### Find what a paper cites (outgoing references)
253
+
254
+ ```bash
255
+ # Get works cited BY a specific paper
256
+ curl -s "https://api.openalex.org/works?filter=cited_by:W2741809807&per_page=25&mailto=agent@kortix.ai"
257
+ ```
258
+
259
+ ### Find what cites a paper (incoming citations)
260
+
261
+ ```bash
262
+ # Get works that CITE a specific paper
263
+ curl -s "https://api.openalex.org/works?filter=cites:W2741809807&sort=cited_by_count:desc&per_page=25&mailto=agent@kortix.ai"
264
+ ```
265
+
266
+ ### Find related works
267
+
268
+ ```bash
269
+ # Get related works (algorithmic, based on shared concepts)
270
+ curl -s "https://api.openalex.org/works?filter=related_to:W2741809807&per_page=25&mailto=agent@kortix.ai"
271
+ ```
272
+
273
+ ### Citation chain: follow the references
274
+
275
+ 1. Get a seminal paper by DOI
276
+ 2. Find its `referenced_works` (what it cites)
277
+ 3. Find who cites it (`filter=cites:WORK_ID`)
278
+ 4. Repeat steps 2-3 for the most-cited of those citing papers
279
+
280
+ This is how you build a literature graph around a topic.
281
+
282
+ ## Author Lookup
283
+
284
+ ```bash
285
+ # Search for an author
286
+ curl -s "https://api.openalex.org/authors?search=Yann+LeCun&mailto=agent@kortix.ai"
287
+
288
+ # Get an author's works (by OpenAlex author ID)
289
+ curl -s "https://api.openalex.org/works?filter=author.id:A5064850633&sort=cited_by_count:desc&per_page=10&mailto=agent@kortix.ai"
290
+
291
+ # Get an author by ORCID
292
+ curl -s "https://api.openalex.org/authors/orcid:0000-0001-6187-6610?mailto=agent@kortix.ai"
293
+ ```
294
+
295
+ ## Lookup by External ID
296
+
297
+ ```bash
298
+ # By DOI
299
+ curl -s "https://api.openalex.org/works/doi:10.1038/s41586-021-03819-2?mailto=agent@kortix.ai"
300
+
301
+ # By PubMed ID
302
+ curl -s "https://api.openalex.org/works/pmid:14907713?mailto=agent@kortix.ai"
303
+
304
+ # By arXiv ID (via DOI)
305
+ curl -s "https://api.openalex.org/works/doi:10.48550/arXiv.2303.08774?mailto=agent@kortix.ai"
306
+
307
+ # Batch lookup: up to 50 IDs at once
308
+ curl -s "https://api.openalex.org/works?filter=doi:https://doi.org/10.1234/a|https://doi.org/10.1234/b|https://doi.org/10.1234/c&mailto=agent@kortix.ai"
309
+ ```
310
+
311
+ ## Open Access & PDF Access
312
+
313
+ ```bash
314
+ # Find OA papers with direct PDF links
315
+ curl -s "https://api.openalex.org/works?search=quantum+computing&filter=is_oa:true,has_content.pdf:true&select=id,display_name,open_access,best_oa_location&per_page=5&mailto=agent@kortix.ai"
316
+ ```
317
+
318
+ The `best_oa_location.pdf_url` field gives a direct PDF link when available. The `open_access.oa_url` gives the best available OA landing page or PDF.
319
+
320
+ ## Practical Workflows
321
+
322
+ ### Literature Survey on a Topic
323
+
324
+ ```bash
325
+ # 1. Find the most-cited papers on a topic
326
+ curl -s "https://api.openalex.org/works?search=retrieval+augmented+generation&sort=cited_by_count:desc&filter=publication_year:>2020,type:article,has_abstract:true&per_page=20&select=id,display_name,publication_year,cited_by_count,doi,authorships,abstract_inverted_index&mailto=agent@kortix.ai"
327
+
328
+ # 2. For the top papers, explore their citation graphs
329
+ curl -s "https://api.openalex.org/works?filter=cites:W4285719527&sort=cited_by_count:desc&per_page=10&select=id,display_name,publication_year,cited_by_count,doi&mailto=agent@kortix.ai"
330
+
331
+ # 3. Find recent papers building on this work
332
+ curl -s "https://api.openalex.org/works?filter=cites:W4285719527,publication_year:>2023&sort=publication_date:desc&per_page=10&mailto=agent@kortix.ai"
333
+ ```
334
+
335
+ ### Find Landmark/Seminal Papers
336
+
337
+ ```bash
338
+ # Highly cited + search term
339
+ curl -s "https://api.openalex.org/works?search=attention+mechanism+neural+networks&filter=cited_by_count:>500,type:article&sort=cited_by_count:desc&per_page=10&select=id,display_name,publication_year,cited_by_count,doi&mailto=agent@kortix.ai"
340
+ ```
341
+
342
+ ### Find Recent Preprints
343
+
344
+ ```bash
345
+ # Latest preprints on a topic
346
+ curl -s "https://api.openalex.org/works?search=multimodal+large+language+models&filter=type:preprint,publication_year:2025&sort=publication_date:desc&per_page=15&mailto=agent@kortix.ai"
347
+ ```
348
+
349
+ ### Find Review Articles
350
+
351
+ ```bash
352
+ # Review/survey papers on a topic
353
+ curl -s "https://api.openalex.org/works?search=federated+learning&filter=type:review,cited_by_count:>20&sort=cited_by_count:desc&per_page=10&mailto=agent@kortix.ai"
354
+ ```
355
+
356
+ ### Author Analysis
357
+
358
+ ```bash
359
+ # 1. Find the author
360
+ curl -s "https://api.openalex.org/authors?search=Geoffrey+Hinton&select=id,display_name,works_count,cited_by_count,last_known_institutions&mailto=agent@kortix.ai"
361
+
362
+ # 2. Get their most influential papers
363
+ curl -s "https://api.openalex.org/works?filter=author.id:A5068082743&sort=cited_by_count:desc&per_page=10&select=id,display_name,publication_year,cited_by_count,doi&mailto=agent@kortix.ai"
364
+
365
+ # 3. Get their recent work
366
+ curl -s "https://api.openalex.org/works?filter=author.id:A5068082743,publication_year:>2023&sort=publication_date:desc&per_page=10&mailto=agent@kortix.ai"
367
+ ```
368
+
369
+ ## Saving Results to Disk
370
+
371
+ When doing deep research, save paper data to disk for later processing:
372
+
373
+ ```bash
374
+ # Save search results as JSON
375
+ curl -s "https://api.openalex.org/works?search=topic&per_page=50&mailto=agent@kortix.ai" > research/papers/topic-search.json
376
+
377
+ # Extract and save a clean summary
378
+ curl -s "https://api.openalex.org/works?search=topic&per_page=50&select=id,display_name,publication_year,cited_by_count,doi,authorships&mailto=agent@kortix.ai" | python3 -c "
379
+ import json, sys
380
+ data = json.load(sys.stdin)
381
+ for w in data.get('results', []):
382
+ authors = ', '.join(a['author']['display_name'] for a in w.get('authorships', [])[:3])
383
+ if len(w.get('authorships', [])) > 3: authors += ' et al.'
384
+ print(f\"[{w.get('cited_by_count',0)} cites] {w['display_name']} ({w.get('publication_year','?')}) - {authors}\")
385
+ if w.get('doi'): print(f\" DOI: {w['doi']}\")
386
+ print()
387
+ " > research/papers/topic-summary.txt
388
+ ```
389
+
390
+ For deep research, save individual paper metadata to your `sources-index.md` and raw data to `sources/`:
391
+
392
+ ```bash
393
+ # Save a paper's full metadata
394
+ curl -s "https://api.openalex.org/works/W2741809807?mailto=agent@kortix.ai" > research/sources/001-paper-title.json
395
+ ```
396
+
397
+ ## Rate Limits
398
+
399
+ | Pool | Rate | How to get it |
400
+ |------|------|---------------|
401
+ | Common | 1 req/sec | No email provided |
402
+ | Polite | 10 req/sec | Add `mailto=your@email.com` to requests |
403
+ | Premium | Higher | Paid API key via `api_key` param |
404
+
405
+ **Always use the polite pool.** Add `&mailto=agent@kortix.ai` to every request.
406
+
407
+ ## Tips
408
+
409
+ - **Use `select` aggressively** to reduce response size and speed up requests
410
+ - **Use `per_page=100`** (the API allows up to 200 per page) when collecting lots of results to minimize request count
411
+ - **Use cursor paging** (`cursor=*`) when you need more than 10,000 results
412
+ - **Batch DOI lookups** with OR syntax: `filter=doi:DOI1|DOI2|DOI3` (up to 50)
413
+ - **Reconstruct abstracts** using the inverted index -- don't skip this, abstracts are gold
414
+ - **Follow citation chains** to find seminal works and recent developments
415
+ - **Filter by `has_abstract:true`** when you need abstracts (not all works have them)
416
+ - **Filter by `indexed_in:arxiv`** or `indexed_in:pubmed` to target specific repositories
417
+ - **Sort by `cited_by_count:desc`** to find the most influential papers first
418
+ - **Combine search + filters** for precise results: search gives relevance, filters give precision
@@ -0,0 +1,232 @@
1
+ ---
2
+ name: kortix-pdf
3
+ description: "Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill."
4
+ ---
5
+
6
+ # PDF Processing Guide
7
+
8
+ ## Overview
9
+
10
+ Essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see reference.md. For filling PDF forms, read forms.md and follow its instructions.
11
+
12
+ ## Quick Start
13
+
14
+ ```python
15
+ from pypdf import PdfReader, PdfWriter
16
+
17
+ reader = PdfReader("document.pdf")
18
+ print(f"Pages: {len(reader.pages)}")
19
+
20
+ text = ""
21
+ for page in reader.pages:
22
+ text += page.extract_text()
23
+ ```
24
+
25
+ ## Python Libraries
26
+
27
+ ### pypdf - Basic Operations
28
+
29
+ #### Merge PDFs
30
+ ```python
31
+ from pypdf import PdfWriter, PdfReader
32
+
33
+ writer = PdfWriter()
34
+ for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
35
+ reader = PdfReader(pdf_file)
36
+ for page in reader.pages:
37
+ writer.add_page(page)
38
+
39
+ with open("merged.pdf", "wb") as output:
40
+ writer.write(output)
41
+ ```
42
+
43
+ #### Split PDF
44
+ ```python
45
+ reader = PdfReader("input.pdf")
46
+ for i, page in enumerate(reader.pages):
47
+ writer = PdfWriter()
48
+ writer.add_page(page)
49
+ with open(f"page_{i+1}.pdf", "wb") as output:
50
+ writer.write(output)
51
+ ```
52
+
53
+ #### Extract Metadata
54
+ ```python
55
+ reader = PdfReader("document.pdf")
56
+ meta = reader.metadata
57
+ print(f"Title: {meta.title}")
58
+ print(f"Author: {meta.author}")
59
+ ```
60
+
61
+ #### Rotate Pages
62
+ ```python
63
+ reader = PdfReader("input.pdf")
64
+ writer = PdfWriter()
65
+ page = reader.pages[0]
66
+ page.rotate(90)
67
+ writer.add_page(page)
68
+ with open("rotated.pdf", "wb") as output:
69
+ writer.write(output)
70
+ ```
71
+
72
+ ### pdfplumber - Text and Table Extraction
73
+
74
+ #### Extract Text with Layout
75
+ ```python
76
+ import pdfplumber
77
+
78
+ with pdfplumber.open("document.pdf") as pdf:
79
+ for page in pdf.pages:
80
+ text = page.extract_text()
81
+ print(text)
82
+ ```
83
+
84
+ #### Extract Tables
85
+ ```python
86
+ import pandas as pd
87
+
88
+ with pdfplumber.open("document.pdf") as pdf:
89
+ all_tables = []
90
+ for page in pdf.pages:
91
+ tables = page.extract_tables()
92
+ for table in tables:
93
+ if table:
94
+ df = pd.DataFrame(table[1:], columns=table[0])
95
+ all_tables.append(df)
96
+
97
+ if all_tables:
98
+ combined_df = pd.concat(all_tables, ignore_index=True)
99
+ combined_df.to_excel("extracted_tables.xlsx", index=False)
100
+ ```
101
+
102
+ ### reportlab - Create PDFs
103
+
104
+ #### Basic PDF Creation
105
+ ```python
106
+ from reportlab.lib.pagesizes import letter
107
+ from reportlab.pdfgen import canvas
108
+
109
+ c = canvas.Canvas("hello.pdf", pagesize=letter)
110
+ width, height = letter
111
+ c.drawString(100, height - 100, "Hello World!")
112
+ c.line(100, height - 140, 400, height - 140)
113
+ c.save()
114
+ ```
115
+
116
+ #### Multi-Page with Platypus
117
+ ```python
118
+ from reportlab.lib.pagesizes import letter
119
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
120
+ from reportlab.lib.styles import getSampleStyleSheet
121
+
122
+ doc = SimpleDocTemplate("report.pdf", pagesize=letter)
123
+ styles = getSampleStyleSheet()
124
+ story = []
125
+
126
+ story.append(Paragraph("Report Title", styles['Title']))
127
+ story.append(Spacer(1, 12))
128
+ story.append(Paragraph("This is the body of the report. " * 20, styles['Normal']))
129
+ story.append(PageBreak())
130
+ story.append(Paragraph("Page 2", styles['Heading1']))
131
+
132
+ doc.build(story)
133
+ ```
134
+
135
+ #### Subscripts and Superscripts
136
+
137
+ **IMPORTANT**: Never use Unicode subscript/superscript characters in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
138
+
139
+ Use ReportLab's XML markup tags in Paragraph objects:
140
+ ```python
141
+ from reportlab.platypus import Paragraph
142
+ from reportlab.lib.styles import getSampleStyleSheet
143
+ styles = getSampleStyleSheet()
144
+
145
+ chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
146
+ squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
147
+ ```
148
+
149
+ ## Command-Line Tools
150
+
151
+ ### pdftotext (poppler-utils)
152
+ ```bash
153
+ pdftotext input.pdf output.txt
154
+ pdftotext -layout input.pdf output.txt
155
+ pdftotext -f 1 -l 5 input.pdf output.txt
156
+ ```
157
+
158
+ ### qpdf
159
+ ```bash
160
+ qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
161
+ qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
162
+ qpdf input.pdf output.pdf --rotate=+90:1
163
+ qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
164
+ ```
165
+
166
+ ## Common Tasks
167
+
168
+ ### OCR Scanned PDFs
169
+ ```python
170
+ import pytesseract
171
+ from pdf2image import convert_from_path
172
+
173
+ images = convert_from_path('scanned.pdf')
174
+ text = ""
175
+ for i, image in enumerate(images):
176
+ text += f"Page {i+1}:\n"
177
+ text += pytesseract.image_to_string(image)
178
+ text += "\n\n"
179
+ ```
180
+
181
+ ### Add Watermark
182
+ ```python
183
+ from pypdf import PdfReader, PdfWriter
184
+
185
+ watermark = PdfReader("watermark.pdf").pages[0]
186
+ reader = PdfReader("document.pdf")
187
+ writer = PdfWriter()
188
+
189
+ for page in reader.pages:
190
+ page.merge_page(watermark)
191
+ writer.add_page(page)
192
+
193
+ with open("watermarked.pdf", "wb") as output:
194
+ writer.write(output)
195
+ ```
196
+
197
+ ### Extract Images
198
+ ```bash
199
+ pdfimages -j input.pdf output_prefix
200
+ ```
201
+
202
+ ### Password Protection
203
+ ```python
204
+ from pypdf import PdfReader, PdfWriter
205
+
206
+ reader = PdfReader("input.pdf")
207
+ writer = PdfWriter()
208
+ for page in reader.pages:
209
+ writer.add_page(page)
210
+ writer.encrypt("userpassword", "ownerpassword")
211
+ with open("encrypted.pdf", "wb") as output:
212
+ writer.write(output)
213
+ ```
214
+
215
+ ## Quick Reference
216
+
217
+ | Task | Best Tool | Command/Code |
218
+ |------|-----------|--------------|
219
+ | Merge PDFs | pypdf | `writer.add_page(page)` |
220
+ | Split PDFs | pypdf | One page per file |
221
+ | Extract text | pdfplumber | `page.extract_text()` |
222
+ | Extract tables | pdfplumber | `page.extract_tables()` |
223
+ | Create PDFs | reportlab | Canvas or Platypus |
224
+ | Command line merge | qpdf | `qpdf --empty --pages ...` |
225
+ | OCR scanned PDFs | pytesseract | Convert to image first |
226
+ | Fill PDF forms | pypdf or pdf-lib | See forms.md |
227
+
228
+ ## Next Steps
229
+
230
+ - For advanced pypdfium2 usage, see reference.md
231
+ - For JavaScript libraries (pdf-lib), see reference.md
232
+ - For filling PDF forms, follow instructions in forms.md
@@ -0,0 +1,36 @@
1
+ **CRITICAL: Complete these steps in order. Do not skip ahead to writing code.**
2
+
3
+ First check if the PDF has fillable form fields. Run:
4
+ `python scripts/check_fillable_fields.py <file.pdf>`, then go to "Fillable fields" or "Non-fillable fields".
5
+
6
+ # Fillable fields
7
+ If the PDF has fillable form fields:
8
+ - Run: `python scripts/extract_form_field_info.py <input.pdf> <field_info.json>` to get field list as JSON.
9
+ - Convert PDF to PNGs: `python scripts/convert_pdf_to_images.py <file.pdf> <output_directory>`
10
+ - Analyze images to determine each field's purpose.
11
+ - Create `field_values.json` with values for each field.
12
+ - Run: `python scripts/fill_fillable_fields.py <input.pdf> <field_values.json> <output.pdf>`
13
+
14
+ # Non-fillable fields
15
+ If the PDF has no fillable form fields, add text annotations instead.
16
+
17
+ ## Step 1: Try Structure Extraction First
18
+ Run: `python scripts/extract_form_structure.py <input.pdf> form_structure.json`
19
+
20
+ If meaningful labels are found, use **Approach A**. If the PDF is scanned/image-based, use **Approach B**.
21
+
22
+ ## Approach A: Structure-Based Coordinates (Preferred)
23
+ 1. Analyze form_structure.json for label groups, row structure, field columns, checkboxes
24
+ 2. Create fields.json with PDF coordinates using `pdf_width`/`pdf_height`
25
+ 3. Validate: `python scripts/check_bounding_boxes.py fields.json`
26
+
27
+ ## Approach B: Visual Estimation (Fallback)
28
+ 1. Convert to images: `python scripts/convert_pdf_to_images.py <input.pdf> <images_dir/>`
29
+ 2. Identify fields with rough estimates
30
+ 3. Zoom and refine with ImageMagick crops
31
+ 4. Create fields.json with image coordinates using `image_width`/`image_height`
32
+ 5. Validate: `python scripts/check_bounding_boxes.py fields.json`
33
+
34
+ ## Fill and Verify
35
+ - Fill: `python scripts/fill_pdf_form_with_annotations.py <input.pdf> fields.json <output.pdf>`
36
+ - Verify: `python scripts/convert_pdf_to_images.py <output.pdf> <verify_images/>`