@kortix/sandbox 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/customize.sh +143 -0
- package/config/kortix-env-setup.sh +25 -0
- package/kortix-master/package.json +22 -0
- package/kortix-master/src/config.ts +22 -0
- package/kortix-master/src/index.ts +44 -0
- package/kortix-master/src/routes/env.ts +65 -0
- package/kortix-master/src/routes/proxy.ts +108 -0
- package/kortix-master/src/routes/update.ts +185 -0
- package/kortix-master/src/services/proxy.ts +43 -0
- package/kortix-master/src/services/secret-store.ts +156 -0
- package/kortix-master/tsconfig.json +14 -0
- package/opencode/agents/kortix-browser.md +142 -0
- package/opencode/agents/kortix-build.md +62 -0
- package/opencode/agents/kortix-explore.md +66 -0
- package/opencode/agents/kortix-image-gen.md +33 -0
- package/opencode/agents/kortix-main.md +450 -0
- package/opencode/agents/kortix-plan.md +100 -0
- package/opencode/agents/kortix-research.md +84 -0
- package/opencode/agents/kortix-sheets.md +61 -0
- package/opencode/agents/kortix-slides.md +64 -0
- package/opencode/agents/kortix-web-dev.md +572 -0
- package/opencode/commands/email.md +36 -0
- package/opencode/commands/init.md +43 -0
- package/opencode/commands/journal.md +44 -0
- package/opencode/commands/memory-init.md +81 -0
- package/opencode/commands/memory-search.md +50 -0
- package/opencode/commands/memory-status.md +56 -0
- package/opencode/commands/research.md +36 -0
- package/opencode/commands/search.md +38 -0
- package/opencode/commands/slides.md +32 -0
- package/opencode/commands/spreadsheet.md +30 -0
- package/opencode/memory.json +37 -0
- package/opencode/ocx.jsonc +10 -0
- package/opencode/opencode.jsonc +103 -0
- package/opencode/package.json +25 -0
- package/opencode/patches/apply.sh +19 -0
- package/opencode/patches/opencode-pty-spawn.txt +49 -0
- package/opencode/plugin/background-agents.ts.disabled +483 -0
- package/opencode/plugin/kdco-primitives/get-project-id.ts +172 -0
- package/opencode/plugin/kdco-primitives/index.ts +26 -0
- package/opencode/plugin/kdco-primitives/log-warn.ts +51 -0
- package/opencode/plugin/kdco-primitives/mutex.ts +122 -0
- package/opencode/plugin/kdco-primitives/shell.ts +138 -0
- package/opencode/plugin/kdco-primitives/temp.ts +36 -0
- package/opencode/plugin/kdco-primitives/terminal-detect.ts +34 -0
- package/opencode/plugin/kdco-primitives/types.ts +13 -0
- package/opencode/plugin/kdco-primitives/with-timeout.ts +84 -0
- package/opencode/plugin/memory.ts +306 -0
- package/opencode/plugin/worktree/state.ts +412 -0
- package/opencode/plugin/worktree/terminal.ts +1002 -0
- package/opencode/plugin/worktree.ts +861 -0
- package/opencode/skills/KORTIX-browser/SKILL.md +478 -0
- package/opencode/skills/KORTIX-cron-triggers/SKILL.md +173 -0
- package/opencode/skills/KORTIX-deep-research/SKILL.md +278 -0
- package/opencode/skills/KORTIX-docx/SKILL.md +398 -0
- package/opencode/skills/KORTIX-docx/scripts/__init__.py +1 -0
- package/opencode/skills/KORTIX-docx/scripts/accept_changes.py +104 -0
- package/opencode/skills/KORTIX-docx/scripts/comment.py +244 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-docx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-docx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-docx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-docx/scripts/render_docx.py +179 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/comments.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtended.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtensible.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsIds.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/people.xml +3 -0
- package/opencode/skills/KORTIX-domain-research/SKILL.md +96 -0
- package/opencode/skills/KORTIX-domain-research/scripts/domain-lookup.py +810 -0
- package/opencode/skills/KORTIX-elevenlabs/SKILL.md +230 -0
- package/opencode/skills/KORTIX-elevenlabs/scripts/tts.py +389 -0
- package/opencode/skills/KORTIX-email/SKILL.md +145 -0
- package/opencode/skills/KORTIX-legal-writer/SKILL.md +409 -0
- package/opencode/skills/KORTIX-legal-writer/references/bluebook.md +152 -0
- package/opencode/skills/KORTIX-legal-writer/references/document-types.md +416 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/courtlistener.py +291 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/ecfr_lookup.py +299 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/verify-legal.py +507 -0
- package/opencode/skills/KORTIX-logo-creator/SKILL.md +293 -0
- package/opencode/skills/KORTIX-logo-creator/references/prompt-patterns.md +134 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/compose_logo.py +406 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/create_logo_sheet.py +258 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/remove_bg.py +96 -0
- package/opencode/skills/KORTIX-memory/SKILL.md +261 -0
- package/opencode/skills/KORTIX-memory/scripts/export-sessions.py +409 -0
- package/opencode/skills/KORTIX-paper-creator/SKILL.md +549 -0
- package/opencode/skills/KORTIX-paper-creator/assets/template.tex +101 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/compile.sh +177 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/openalex_to_bibtex.py +220 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/verify.sh +354 -0
- package/opencode/skills/KORTIX-paper-search/SKILL.md +418 -0
- package/opencode/skills/KORTIX-pdf/SKILL.md +232 -0
- package/opencode/skills/KORTIX-pdf/forms.md +36 -0
- package/opencode/skills/KORTIX-pdf/reference.md +105 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_bounding_boxes.py +65 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_fillable_fields.py +11 -0
- package/opencode/skills/KORTIX-pdf/scripts/convert_pdf_to_images.py +33 -0
- package/opencode/skills/KORTIX-pdf/scripts/create_validation_image.py +37 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_field_info.py +122 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_structure.py +115 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_fillable_fields.py +98 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
- package/opencode/skills/KORTIX-plan/SKILL.md +228 -0
- package/opencode/skills/KORTIX-presentation-viewer/SKILL.md +87 -0
- package/opencode/skills/KORTIX-presentation-viewer/serve.ts +136 -0
- package/opencode/skills/KORTIX-presentation-viewer/viewer.html +559 -0
- package/opencode/skills/KORTIX-presentations/SKILL.md +344 -0
- package/opencode/skills/KORTIX-remotion/SKILL.md +56 -0
- package/opencode/skills/KORTIX-remotion/rules/3d.md +86 -0
- package/opencode/skills/KORTIX-remotion/rules/animations.md +29 -0
- package/opencode/skills/KORTIX-remotion/rules/assets.md +78 -0
- package/opencode/skills/KORTIX-remotion/rules/audio-visualization.md +198 -0
- package/opencode/skills/KORTIX-remotion/rules/audio.md +169 -0
- package/opencode/skills/KORTIX-remotion/rules/calculate-metadata.md +104 -0
- package/opencode/skills/KORTIX-remotion/rules/can-decode.md +75 -0
- package/opencode/skills/KORTIX-remotion/rules/charts.md +120 -0
- package/opencode/skills/KORTIX-remotion/rules/compositions.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/display-captions.md +184 -0
- package/opencode/skills/KORTIX-remotion/rules/extract-frames.md +229 -0
- package/opencode/skills/KORTIX-remotion/rules/ffmpeg.md +38 -0
- package/opencode/skills/KORTIX-remotion/rules/fonts.md +152 -0
- package/opencode/skills/KORTIX-remotion/rules/get-audio-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-dimensions.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/gifs.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/images.md +130 -0
- package/opencode/skills/KORTIX-remotion/rules/import-srt-captions.md +69 -0
- package/opencode/skills/KORTIX-remotion/rules/light-leaks.md +73 -0
- package/opencode/skills/KORTIX-remotion/rules/lottie.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/maps.md +401 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-dom-nodes.md +35 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-text.md +143 -0
- package/opencode/skills/KORTIX-remotion/rules/parameters.md +98 -0
- package/opencode/skills/KORTIX-remotion/rules/sequencing.md +118 -0
- package/opencode/skills/KORTIX-remotion/rules/subtitles.md +36 -0
- package/opencode/skills/KORTIX-remotion/rules/tailwind.md +11 -0
- package/opencode/skills/KORTIX-remotion/rules/text-animations.md +20 -0
- package/opencode/skills/KORTIX-remotion/rules/timing.md +179 -0
- package/opencode/skills/KORTIX-remotion/rules/transcribe-captions.md +70 -0
- package/opencode/skills/KORTIX-remotion/rules/transitions.md +197 -0
- package/opencode/skills/KORTIX-remotion/rules/transparent-videos.md +106 -0
- package/opencode/skills/KORTIX-remotion/rules/trimming.md +53 -0
- package/opencode/skills/KORTIX-remotion/rules/videos.md +171 -0
- package/opencode/skills/KORTIX-secrets/SKILL.md +280 -0
- package/opencode/skills/KORTIX-semantic-search/SKILL.md +213 -0
- package/opencode/skills/KORTIX-session-search/SKILL.md +807 -0
- package/opencode/skills/KORTIX-session-search/Untitled +1 -0
- package/opencode/skills/KORTIX-skill-creator/SKILL.md +163 -0
- package/opencode/skills/KORTIX-web-research/SKILL.md +69 -0
- package/opencode/skills/KORTIX-xlsx/LICENSE.txt +30 -0
- package/opencode/skills/KORTIX-xlsx/SKILL.md +549 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-xlsx/scripts/recalc.py +184 -0
- package/opencode/tools/image-gen.ts +342 -0
- package/opencode/tools/image-search.ts +190 -0
- package/opencode/tools/memory-get.ts +168 -0
- package/opencode/tools/memory-search.ts +247 -0
- package/opencode/tools/presentation-gen.ts +723 -0
- package/opencode/tools/scrape-webpage.ts +115 -0
- package/opencode/tools/scripts/.python-version +1 -0
- package/opencode/tools/scripts/convert_pdf.py +184 -0
- package/opencode/tools/scripts/convert_pptx.py +562 -0
- package/opencode/tools/scripts/pyproject.toml +11 -0
- package/opencode/tools/scripts/uv.lock +287 -0
- package/opencode/tools/scripts/validate_slide.py +74 -0
- package/opencode/tools/show-user.ts +217 -0
- package/opencode/tools/tests/e2e-presentation-fix.ts +277 -0
- package/opencode/tools/tests/image-gen.test.ts +215 -0
- package/opencode/tools/tests/image-search.test.ts +125 -0
- package/opencode/tools/tests/memory-system-benchmark.ts +1076 -0
- package/opencode/tools/tests/presentation-gen.test.ts +389 -0
- package/opencode/tools/tests/scrape-webpage.test.ts +74 -0
- package/opencode/tools/tests/show-user.test.ts +241 -0
- package/opencode/tools/tests/video-gen.test.ts +110 -0
- package/opencode/tools/tests/web-search.test.ts +106 -0
- package/opencode/tools/video-gen.ts +200 -0
- package/opencode/tools/web-search.ts +153 -0
- package/opencode/tsconfig.json +29 -0
- package/package.json +36 -0
- package/patch-agent-browser.js +100 -0
- package/postinstall.sh +88 -0
- package/services/KORTIX-presentation-viewer/run +37 -0
- package/services/agent-browser-viewer/run +48 -0
- package/services/kortix-master/run +16 -0
- package/services/lss-sync/run +22 -0
- package/services/opencode-serve/run +25 -0
- package/services/opencode-web/run +21 -0
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: kortix-paper-search
|
|
3
|
+
description: "Academic paper search powered by OpenAlex -- the free, open catalog of 240M+ scholarly works. Use when the user needs to find academic papers, research articles, literature for a topic, citation data, author publications, or any scholarly source. Triggers on: 'find papers on', 'academic research about', 'what studies exist', 'literature review', 'find citations', 'scholarly articles about', 'who published on', 'papers by [author]', 'highly cited papers on', any request for peer-reviewed or academic sources. Also use during deep research when you need to ground findings in academic literature. Do NOT use for general web searches -- use web-search for that."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Academic Paper Search (OpenAlex)
|
|
7
|
+
|
|
8
|
+
Search 240M+ scholarly works using the OpenAlex API -- completely free, no API key required, no SDK needed. All you need is `curl` (or any other HTTP client) and a correctly constructed URL.
|
|
9
|
+
|
|
10
|
+
**Full docs:** https://docs.openalex.org
|
|
11
|
+
|
|
12
|
+
## Quick Start
|
|
13
|
+
|
|
14
|
+
OpenAlex is a REST API. You query it by constructing URLs and fetching them with `curl`. All responses are JSON.
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# Search for papers about "transformer architecture"
|
|
18
|
+
curl -s "https://api.openalex.org/works?search=transformer+architecture&per_page=5&mailto=agent@kortix.ai" | python3 -m json.tool
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
**Important:** Always include `mailto=agent@kortix.ai` (or any valid email) in every request. Without it, you're limited to 1 request/second. With it, you get 10 requests/second (the "polite pool").
|
|
22
|
+
|
|
23
|
+
## Core Concepts
|
|
24
|
+
|
|
25
|
+
### Entities
|
|
26
|
+
|
|
27
|
+
OpenAlex has these entity types (all queryable):
|
|
28
|
+
|
|
29
|
+
| Entity | Endpoint | Count | Description |
|
|
30
|
+
|--------|----------|-------|-------------|
|
|
31
|
+
| **Works** | `/works` | 240M+ | Papers, articles, books, datasets, theses |
|
|
32
|
+
| **Authors** | `/authors` | 90M+ | People who create works |
|
|
33
|
+
| **Sources** | `/sources` | 250K+ | Journals, repositories, conferences |
|
|
34
|
+
| **Institutions** | `/institutions` | 110K+ | Universities, research orgs |
|
|
35
|
+
| **Topics** | `/topics` | 4K+ | Research topics (hierarchical) |
|
|
36
|
+
|
|
37
|
+
### Work Object -- Key Fields
|
|
38
|
+
|
|
39
|
+
When you fetch a work, these are the most useful fields:
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
id OpenAlex ID (e.g., "https://openalex.org/W2741809807")
|
|
43
|
+
doi DOI URL
|
|
44
|
+
title / display_name Paper title
|
|
45
|
+
publication_year Year published
|
|
46
|
+
publication_date Full date (YYYY-MM-DD)
|
|
47
|
+
cited_by_count Number of incoming citations
|
|
48
|
+
fwci Field-Weighted Citation Impact (normalized)
|
|
49
|
+
type article, preprint, review, book, dataset, etc.
|
|
50
|
+
language ISO 639-1 code (e.g., "en")
|
|
51
|
+
is_retracted Boolean
|
|
52
|
+
open_access.is_oa Boolean -- is it freely accessible?
|
|
53
|
+
open_access.oa_url Direct URL to free version
|
|
54
|
+
authorships List of authors with names, institutions, ORCIDs
|
|
55
|
+
abstract_inverted_index Abstract as inverted index (needs reconstruction)
|
|
56
|
+
referenced_works List of OpenAlex IDs this work cites (outgoing)
|
|
57
|
+
related_works Algorithmically related works
|
|
58
|
+
cited_by_api_url API URL to get works that cite this one (incoming)
|
|
59
|
+
topics Assigned research topics with scores
|
|
60
|
+
keywords Extracted keywords with scores
|
|
61
|
+
primary_location Where the work is published (journal, repo)
|
|
62
|
+
best_oa_location Best open access location with PDF link
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Reconstructing Abstracts
|
|
66
|
+
|
|
67
|
+
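OpenAlex stores abstracts as inverted indexes for legal reasons: `abstract_inverted_index` maps each word to the list of positions where it occurs in the abstract. To get plaintext, place every word at each of its positions and join the result with spaces: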
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import json, sys
|
|
71
|
+
# Read the abstract_inverted_index from a work object
|
|
72
|
+
inv_idx = work["abstract_inverted_index"]
|
|
73
|
+
if inv_idx:
|
|
74
|
+
    words = [""] * (max(max(positions) for positions in inv_idx.values()) + 1)
|
|
75
|
+
    for word, positions in inv_idx.items():
|
|
76
|
+
        for pos in positions:
|
|
77
|
+
            words[pos] = word
|
|
78
|
+
    abstract = " ".join(words)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Or in bash with `python3 -c`:
|
|
82
|
+
```bash
|
|
83
|
+
# Pipe a work JSON into this to extract the abstract
|
|
84
|
+
echo "$WORK_JSON" | python3 -c "
|
|
85
|
+
import json,sys
|
|
86
|
+
w=json.load(sys.stdin)
|
|
87
|
+
idx=w.get('abstract_inverted_index',{})
|
|
88
|
+
if idx:
|
|
89
|
+
    words=['']*( max(max(p) for p in idx.values())+1 )
|
|
90
|
+
    for word,positions in idx.items():
|
|
91
|
+
        for pos in positions: words[pos]=word
|
|
92
|
+
    print(' '.join(words))
|
|
93
|
+
"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Searching for Papers
|
|
97
|
+
|
|
98
|
+
### Basic Keyword Search
|
|
99
|
+
|
|
100
|
+
Searches across titles, abstracts, and fulltext. Uses stemming and stop-word removal.
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
# Simple search
|
|
104
|
+
curl -s "https://api.openalex.org/works?search=large+language+models&mailto=agent@kortix.ai"
|
|
105
|
+
|
|
106
|
+
# With per_page limit
|
|
107
|
+
curl -s "https://api.openalex.org/works?search=CRISPR+gene+editing&per_page=10&mailto=agent@kortix.ai"
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Boolean Search
|
|
111
|
+
|
|
112
|
+
Use uppercase `AND`, `OR`, `NOT` with parentheses and quoted phrases:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Complex boolean query
|
|
116
|
+
curl -s "https://api.openalex.org/works?search=(reinforcement+learning+AND+%22robot+control%22)+NOT+simulation&mailto=agent@kortix.ai"
|
|
117
|
+
|
|
118
|
+
# Exact phrase match (use double quotes, URL-encoded as %22)
|
|
119
|
+
curl -s "https://api.openalex.org/works?search=%22attention+is+all+you+need%22&mailto=agent@kortix.ai"
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Search Specific Fields
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Title only
|
|
126
|
+
curl -s "https://api.openalex.org/works?filter=title.search:transformer&mailto=agent@kortix.ai"
|
|
127
|
+
|
|
128
|
+
# Abstract only
|
|
129
|
+
curl -s "https://api.openalex.org/works?filter=abstract.search:protein+folding&mailto=agent@kortix.ai"
|
|
130
|
+
|
|
131
|
+
# Title and abstract combined
|
|
132
|
+
curl -s "https://api.openalex.org/works?filter=title_and_abstract.search:neural+scaling+laws&mailto=agent@kortix.ai"
|
|
133
|
+
|
|
134
|
+
# Fulltext search (subset of works)
|
|
135
|
+
curl -s "https://api.openalex.org/works?filter=fulltext.search:climate+tipping+points&mailto=agent@kortix.ai"
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Filtering
|
|
139
|
+
|
|
140
|
+
Filters are the most powerful feature. Combine them with commas (AND) or pipes (OR).
|
|
141
|
+
|
|
142
|
+
### Most Useful Filters
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
# By publication year
|
|
146
|
+
?filter=publication_year:2024
|
|
147
|
+
?filter=publication_year:2020-2024
|
|
148
|
+
?filter=publication_year:>2022
|
|
149
|
+
|
|
150
|
+
# By citation count
|
|
151
|
+
?filter=cited_by_count:>100 # highly cited
|
|
152
|
+
?filter=cited_by_count:>1000 # landmark papers
|
|
153
|
+
|
|
154
|
+
# By open access
|
|
155
|
+
?filter=is_oa:true # only open access
|
|
156
|
+
?filter=oa_status:gold # gold OA only
|
|
157
|
+
|
|
158
|
+
# By type
|
|
159
|
+
?filter=type:article # journal articles
|
|
160
|
+
?filter=type:preprint # preprints
|
|
161
|
+
?filter=type:review # review articles
|
|
162
|
+
|
|
163
|
+
# By language
|
|
164
|
+
?filter=language:en # English only
|
|
165
|
+
|
|
166
|
+
# Not retracted
|
|
167
|
+
?filter=is_retracted:false
|
|
168
|
+
|
|
169
|
+
# Has abstract
|
|
170
|
+
?filter=has_abstract:true
|
|
171
|
+
|
|
172
|
+
# Has downloadable PDF
|
|
173
|
+
?filter=has_content.pdf:true
|
|
174
|
+
|
|
175
|
+
# By author (OpenAlex ID)
|
|
176
|
+
?filter=author.id:A5023888391
|
|
177
|
+
|
|
178
|
+
# By institution (OpenAlex ID)
|
|
179
|
+
?filter=institutions.id:I27837315 # e.g., University of Michigan
|
|
180
|
+
|
|
181
|
+
# By DOI
|
|
182
|
+
?filter=doi:https://doi.org/10.1038/s41586-021-03819-2
|
|
183
|
+
|
|
184
|
+
# By indexed source
|
|
185
|
+
?filter=indexed_in:arxiv # arXiv papers
|
|
186
|
+
?filter=indexed_in:pubmed # PubMed papers
|
|
187
|
+
?filter=indexed_in:crossref # Crossref papers
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Combining Filters
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# AND: comma-separated
|
|
194
|
+
?filter=publication_year:>2022,cited_by_count:>50,is_oa:true,type:article
|
|
195
|
+
|
|
196
|
+
# OR: pipe-separated within a filter
|
|
197
|
+
?filter=publication_year:2023|2024
|
|
198
|
+
|
|
199
|
+
# NOT: prefix with !
|
|
200
|
+
?filter=type:!preprint
|
|
201
|
+
|
|
202
|
+
# Combined example: highly-cited OA articles from 2023-2024, not preprints
|
|
203
|
+
curl -s "https://api.openalex.org/works?filter=publication_year:2023-2024,cited_by_count:>50,is_oa:true,type:!preprint&search=machine+learning&per_page=10&mailto=agent@kortix.ai"
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Sorting
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
# Most cited first
|
|
210
|
+
?sort=cited_by_count:desc
|
|
211
|
+
|
|
212
|
+
# Most recent first
|
|
213
|
+
?sort=publication_date:desc
|
|
214
|
+
|
|
215
|
+
# Most relevant first (only when using search)
|
|
216
|
+
?sort=relevance_score:desc
|
|
217
|
+
|
|
218
|
+
# Multiple sort keys
|
|
219
|
+
?sort=publication_year:desc,cited_by_count:desc
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Pagination
|
|
223
|
+
|
|
224
|
+
Two modes: **basic paging** (for browsing) and **cursor paging** (for collecting all results).
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
# Basic paging (limited to 10,000 results)
|
|
228
|
+
?page=1&per_page=25
|
|
229
|
+
?page=2&per_page=25
|
|
230
|
+
|
|
231
|
+
# Cursor paging (unlimited, for collecting everything)
|
|
232
|
+
?per_page=100&cursor=* # first page
|
|
233
|
+
?per_page=100&cursor=IlsxNjk0ODc... # next page (cursor from previous response meta)
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
The cursor for the next page is in `response.meta.next_cursor`. When it's `null`, you've reached the end.
|
|
237
|
+
|
|
238
|
+
## Select Fields
|
|
239
|
+
|
|
240
|
+
Reduce response size by selecting only the fields you need:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
# Only get IDs, titles, citation counts, and DOIs
|
|
244
|
+
?select=id,display_name,cited_by_count,doi,publication_year
|
|
245
|
+
|
|
246
|
+
# Minimal metadata for scanning
|
|
247
|
+
?select=id,display_name,publication_year,cited_by_count,open_access
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Citation Graph Traversal
|
|
251
|
+
|
|
252
|
+
### Find what a paper cites (outgoing references)
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
# Get works cited BY a specific paper
|
|
256
|
+
curl -s "https://api.openalex.org/works?filter=cited_by:W2741809807&per_page=25&mailto=agent@kortix.ai"
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Find what cites a paper (incoming citations)
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
# Get works that CITE a specific paper
|
|
263
|
+
curl -s "https://api.openalex.org/works?filter=cites:W2741809807&sort=cited_by_count:desc&per_page=25&mailto=agent@kortix.ai"
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Find related works
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
# Get related works (algorithmic, based on shared concepts)
|
|
270
|
+
curl -s "https://api.openalex.org/works?filter=related_to:W2741809807&per_page=25&mailto=agent@kortix.ai"
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
### Citation chain: follow the references
|
|
274
|
+
|
|
275
|
+
1. Get a seminal paper by DOI
|
|
276
|
+
2. Find its `referenced_works` (what it cites)
|
|
277
|
+
3. Find who cites it (`filter=cites:WORK_ID`)
|
|
278
|
+
4. For the most cited citers, repeat
|
|
279
|
+
|
|
280
|
+
This is how you build a literature graph around a topic.
|
|
281
|
+
|
|
282
|
+
## Author Lookup
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
# Search for an author
|
|
286
|
+
curl -s "https://api.openalex.org/authors?search=Yann+LeCun&mailto=agent@kortix.ai"
|
|
287
|
+
|
|
288
|
+
# Get an author's works (by OpenAlex author ID)
|
|
289
|
+
curl -s "https://api.openalex.org/works?filter=author.id:A5064850633&sort=cited_by_count:desc&per_page=10&mailto=agent@kortix.ai"
|
|
290
|
+
|
|
291
|
+
# Get an author by ORCID
|
|
292
|
+
curl -s "https://api.openalex.org/authors/orcid:0000-0001-6187-6610?mailto=agent@kortix.ai"
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
## Lookup by External ID
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
# By DOI
|
|
299
|
+
curl -s "https://api.openalex.org/works/doi:10.1038/s41586-021-03819-2?mailto=agent@kortix.ai"
|
|
300
|
+
|
|
301
|
+
# By PubMed ID
|
|
302
|
+
curl -s "https://api.openalex.org/works/pmid:14907713?mailto=agent@kortix.ai"
|
|
303
|
+
|
|
304
|
+
# By arXiv ID (via DOI)
|
|
305
|
+
curl -s "https://api.openalex.org/works/doi:10.48550/arXiv.2303.08774?mailto=agent@kortix.ai"
|
|
306
|
+
|
|
307
|
+
# Batch lookup: up to 50 IDs at once
|
|
308
|
+
curl -s "https://api.openalex.org/works?filter=doi:https://doi.org/10.1234/a|https://doi.org/10.1234/b|https://doi.org/10.1234/c&mailto=agent@kortix.ai"
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## Open Access & PDF Access
|
|
312
|
+
|
|
313
|
+
```bash
|
|
314
|
+
# Find OA papers with direct PDF links
|
|
315
|
+
curl -s "https://api.openalex.org/works?search=quantum+computing&filter=is_oa:true,has_content.pdf:true&select=id,display_name,open_access,best_oa_location&per_page=5&mailto=agent@kortix.ai"
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
The `best_oa_location.pdf_url` field gives a direct PDF link when available. The `open_access.oa_url` gives the best available OA landing page or PDF.
|
|
319
|
+
|
|
320
|
+
## Practical Workflows
|
|
321
|
+
|
|
322
|
+
### Literature Survey on a Topic
|
|
323
|
+
|
|
324
|
+
```bash
|
|
325
|
+
# 1. Find the most-cited papers on a topic
|
|
326
|
+
curl -s "https://api.openalex.org/works?search=retrieval+augmented+generation&sort=cited_by_count:desc&filter=publication_year:>2020,type:article,has_abstract:true&per_page=20&select=id,display_name,publication_year,cited_by_count,doi,authorships,abstract_inverted_index&mailto=agent@kortix.ai"
|
|
327
|
+
|
|
328
|
+
# 2. For the top papers, explore their citation graphs
|
|
329
|
+
curl -s "https://api.openalex.org/works?filter=cites:W4285719527&sort=cited_by_count:desc&per_page=10&select=id,display_name,publication_year,cited_by_count,doi&mailto=agent@kortix.ai"
|
|
330
|
+
|
|
331
|
+
# 3. Find recent papers building on this work
|
|
332
|
+
curl -s "https://api.openalex.org/works?filter=cites:W4285719527,publication_year:>2023&sort=publication_date:desc&per_page=10&mailto=agent@kortix.ai"
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
### Find Landmark/Seminal Papers
|
|
336
|
+
|
|
337
|
+
```bash
|
|
338
|
+
# Highly cited + search term
|
|
339
|
+
curl -s "https://api.openalex.org/works?search=attention+mechanism+neural+networks&filter=cited_by_count:>500,type:article&sort=cited_by_count:desc&per_page=10&select=id,display_name,publication_year,cited_by_count,doi&mailto=agent@kortix.ai"
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
### Find Recent Preprints
|
|
343
|
+
|
|
344
|
+
```bash
|
|
345
|
+
# Latest preprints on a topic
|
|
346
|
+
curl -s "https://api.openalex.org/works?search=multimodal+large+language+models&filter=type:preprint,publication_year:2025&sort=publication_date:desc&per_page=15&mailto=agent@kortix.ai"
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Find Review Articles
|
|
350
|
+
|
|
351
|
+
```bash
|
|
352
|
+
# Review/survey papers on a topic
|
|
353
|
+
curl -s "https://api.openalex.org/works?search=federated+learning&filter=type:review,cited_by_count:>20&sort=cited_by_count:desc&per_page=10&mailto=agent@kortix.ai"
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Author Analysis
|
|
357
|
+
|
|
358
|
+
```bash
|
|
359
|
+
# 1. Find the author
|
|
360
|
+
curl -s "https://api.openalex.org/authors?search=Geoffrey+Hinton&select=id,display_name,works_count,cited_by_count,last_known_institutions&mailto=agent@kortix.ai"
|
|
361
|
+
|
|
362
|
+
# 2. Get their most influential papers
|
|
363
|
+
curl -s "https://api.openalex.org/works?filter=author.id:A5068082743&sort=cited_by_count:desc&per_page=10&select=id,display_name,publication_year,cited_by_count,doi&mailto=agent@kortix.ai"
|
|
364
|
+
|
|
365
|
+
# 3. Get their recent work
|
|
366
|
+
curl -s "https://api.openalex.org/works?filter=author.id:A5068082743,publication_year:>2023&sort=publication_date:desc&per_page=10&mailto=agent@kortix.ai"
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
## Saving Results to Disk
|
|
370
|
+
|
|
371
|
+
When doing deep research, save paper data to disk for later processing:
|
|
372
|
+
|
|
373
|
+
```bash
|
|
374
|
+
# Save search results as JSON
|
|
375
|
+
curl -s "https://api.openalex.org/works?search=topic&per_page=50&mailto=agent@kortix.ai" > research/papers/topic-search.json
|
|
376
|
+
|
|
377
|
+
# Extract and save a clean summary
|
|
378
|
+
curl -s "https://api.openalex.org/works?search=topic&per_page=50&select=id,display_name,publication_year,cited_by_count,doi,authorships&mailto=agent@kortix.ai" | python3 -c "
|
|
379
|
+
import json, sys
|
|
380
|
+
data = json.load(sys.stdin)
|
|
381
|
+
for w in data.get('results', []):
|
|
382
|
+
authors = ', '.join(a['author']['display_name'] for a in w.get('authorships', [])[:3])
|
|
383
|
+
if len(w.get('authorships', [])) > 3: authors += ' et al.'
|
|
384
|
+
print(f\"[{w.get('cited_by_count',0)} cites] {w['display_name']} ({w.get('publication_year','?')}) - {authors}\")
|
|
385
|
+
if w.get('doi'): print(f\" DOI: {w['doi']}\")
|
|
386
|
+
print()
|
|
387
|
+
" > research/papers/topic-summary.txt
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
For deep research, save individual paper metadata to your `sources-index.md` and raw data to `sources/`:
|
|
391
|
+
|
|
392
|
+
```bash
|
|
393
|
+
# Save a paper's full metadata
|
|
394
|
+
curl -s "https://api.openalex.org/works/W2741809807?mailto=agent@kortix.ai" > research/sources/001-paper-title.json
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
## Rate Limits
|
|
398
|
+
|
|
399
|
+
| Pool | Rate | How to get it |
|
|
400
|
+
|------|------|---------------|
|
|
401
|
+
| Common | 1 req/sec | No email provided |
|
|
402
|
+
| Polite | 10 req/sec | Add `mailto=your@email.com` to requests |
|
|
403
|
+
| Premium | Higher | Paid API key via `api_key` param |
|
|
404
|
+
|
|
405
|
+
**Always use the polite pool.** Add `&mailto=agent@kortix.ai` to every request.
|
|
406
|
+
|
|
407
|
+
## Tips
|
|
408
|
+
|
|
409
|
+
- **Use `select` aggressively** to reduce response size and speed up requests
|
|
410
|
+
- **Use `per_page=200`** (max) when collecting lots of results to minimize request count
|
|
411
|
+
- **Use cursor paging** (`cursor=*`) when you need more than 10,000 results
|
|
412
|
+
- **Batch DOI lookups** with OR syntax: `filter=doi:DOI1|DOI2|DOI3` (up to 50)
|
|
413
|
+
- **Reconstruct abstracts** using the inverted index -- don't skip this, abstracts are gold
|
|
414
|
+
- **Follow citation chains** to find seminal works and recent developments
|
|
415
|
+
- **Filter by `has_abstract:true`** when you need abstracts (not all works have them)
|
|
416
|
+
- **Filter by `indexed_in:arxiv` or `indexed_in:pubmed`** to target specific repositories
|
|
417
|
+
- **Sort by `cited_by_count:desc`** to find the most influential papers first
|
|
418
|
+
- **Combine search + filters** for precise results: search gives relevance, filters give precision
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: kortix-pdf
|
|
3
|
+
description: "Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# PDF Processing Guide
|
|
7
|
+
|
|
8
|
+
## Overview
|
|
9
|
+
|
|
10
|
+
Essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see reference.md. For filling PDF forms, read forms.md and follow its instructions.
|
|
11
|
+
|
|
12
|
+
## Quick Start
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
from pypdf import PdfReader, PdfWriter
|
|
16
|
+
|
|
17
|
+
reader = PdfReader("document.pdf")
|
|
18
|
+
print(f"Pages: {len(reader.pages)}")
|
|
19
|
+
|
|
20
|
+
text = ""
|
|
21
|
+
for page in reader.pages:
|
|
22
|
+
text += page.extract_text()
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Python Libraries
|
|
26
|
+
|
|
27
|
+
### pypdf - Basic Operations
|
|
28
|
+
|
|
29
|
+
#### Merge PDFs
|
|
30
|
+
```python
|
|
31
|
+
from pypdf import PdfWriter, PdfReader
|
|
32
|
+
|
|
33
|
+
writer = PdfWriter()
|
|
34
|
+
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
|
|
35
|
+
reader = PdfReader(pdf_file)
|
|
36
|
+
for page in reader.pages:
|
|
37
|
+
writer.add_page(page)
|
|
38
|
+
|
|
39
|
+
with open("merged.pdf", "wb") as output:
|
|
40
|
+
writer.write(output)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
#### Split PDF
|
|
44
|
+
```python
|
|
45
|
+
reader = PdfReader("input.pdf")
|
|
46
|
+
for i, page in enumerate(reader.pages):
|
|
47
|
+
writer = PdfWriter()
|
|
48
|
+
writer.add_page(page)
|
|
49
|
+
with open(f"page_{i+1}.pdf", "wb") as output:
|
|
50
|
+
writer.write(output)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
#### Extract Metadata
|
|
54
|
+
```python
|
|
55
|
+
reader = PdfReader("document.pdf")
|
|
56
|
+
meta = reader.metadata
|
|
57
|
+
print(f"Title: {meta.title}")
|
|
58
|
+
print(f"Author: {meta.author}")
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
#### Rotate Pages
|
|
62
|
+
```python
|
|
63
|
+
reader = PdfReader("input.pdf")
|
|
64
|
+
writer = PdfWriter()
|
|
65
|
+
page = reader.pages[0]
|
|
66
|
+
page.rotate(90)
|
|
67
|
+
writer.add_page(page)
|
|
68
|
+
with open("rotated.pdf", "wb") as output:
|
|
69
|
+
writer.write(output)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### pdfplumber - Text and Table Extraction
|
|
73
|
+
|
|
74
|
+
#### Extract Text with Layout
|
|
75
|
+
```python
|
|
76
|
+
import pdfplumber
|
|
77
|
+
|
|
78
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
79
|
+
for page in pdf.pages:
|
|
80
|
+
text = page.extract_text()
|
|
81
|
+
print(text)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
#### Extract Tables
|
|
85
|
+
```python
|
|
86
|
+
import pandas as pd
|
|
87
|
+
|
|
88
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
89
|
+
all_tables = []
|
|
90
|
+
for page in pdf.pages:
|
|
91
|
+
tables = page.extract_tables()
|
|
92
|
+
for table in tables:
|
|
93
|
+
if table:
|
|
94
|
+
df = pd.DataFrame(table[1:], columns=table[0])
|
|
95
|
+
all_tables.append(df)
|
|
96
|
+
|
|
97
|
+
if all_tables:
|
|
98
|
+
combined_df = pd.concat(all_tables, ignore_index=True)
|
|
99
|
+
combined_df.to_excel("extracted_tables.xlsx", index=False)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### reportlab - Create PDFs
|
|
103
|
+
|
|
104
|
+
#### Basic PDF Creation
|
|
105
|
+
```python
|
|
106
|
+
from reportlab.lib.pagesizes import letter
|
|
107
|
+
from reportlab.pdfgen import canvas
|
|
108
|
+
|
|
109
|
+
c = canvas.Canvas("hello.pdf", pagesize=letter)
|
|
110
|
+
width, height = letter
|
|
111
|
+
c.drawString(100, height - 100, "Hello World!")
|
|
112
|
+
c.line(100, height - 140, 400, height - 140)
|
|
113
|
+
c.save()
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
#### Multi-Page with Platypus
|
|
117
|
+
```python
|
|
118
|
+
from reportlab.lib.pagesizes import letter
|
|
119
|
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
|
|
120
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
121
|
+
|
|
122
|
+
doc = SimpleDocTemplate("report.pdf", pagesize=letter)
|
|
123
|
+
styles = getSampleStyleSheet()
|
|
124
|
+
story = []
|
|
125
|
+
|
|
126
|
+
story.append(Paragraph("Report Title", styles['Title']))
|
|
127
|
+
story.append(Spacer(1, 12))
|
|
128
|
+
story.append(Paragraph("This is the body of the report. " * 20, styles['Normal']))
|
|
129
|
+
story.append(PageBreak())
|
|
130
|
+
story.append(Paragraph("Page 2", styles['Heading1']))
|
|
131
|
+
|
|
132
|
+
doc.build(story)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
#### Subscripts and Superscripts
|
|
136
|
+
|
|
137
|
+
**IMPORTANT**: Never use Unicode subscript/superscript characters in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
|
|
138
|
+
|
|
139
|
+
Use ReportLab's XML markup tags in Paragraph objects:
|
|
140
|
+
```python
|
|
141
|
+
from reportlab.platypus import Paragraph
|
|
142
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
143
|
+
styles = getSampleStyleSheet()
|
|
144
|
+
|
|
145
|
+
chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
|
|
146
|
+
squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Command-Line Tools
|
|
150
|
+
|
|
151
|
+
### pdftotext (poppler-utils)
|
|
152
|
+
```bash
|
|
153
|
+
pdftotext input.pdf output.txt
|
|
154
|
+
pdftotext -layout input.pdf output.txt
|
|
155
|
+
pdftotext -f 1 -l 5 input.pdf output.txt
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### qpdf
|
|
159
|
+
```bash
|
|
160
|
+
qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
|
|
161
|
+
qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
|
|
162
|
+
qpdf input.pdf output.pdf --rotate=+90:1
|
|
163
|
+
qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Common Tasks
|
|
167
|
+
|
|
168
|
+
### OCR Scanned PDFs
|
|
169
|
+
```python
|
|
170
|
+
import pytesseract
|
|
171
|
+
from pdf2image import convert_from_path
|
|
172
|
+
|
|
173
|
+
images = convert_from_path('scanned.pdf')
|
|
174
|
+
text = ""
|
|
175
|
+
for i, image in enumerate(images):
|
|
176
|
+
text += f"Page {i+1}:\n"
|
|
177
|
+
text += pytesseract.image_to_string(image)
|
|
178
|
+
text += "\n\n"
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Add Watermark
|
|
182
|
+
```python
|
|
183
|
+
from pypdf import PdfReader, PdfWriter
|
|
184
|
+
|
|
185
|
+
watermark = PdfReader("watermark.pdf").pages[0]
|
|
186
|
+
reader = PdfReader("document.pdf")
|
|
187
|
+
writer = PdfWriter()
|
|
188
|
+
|
|
189
|
+
for page in reader.pages:
|
|
190
|
+
page.merge_page(watermark)
|
|
191
|
+
writer.add_page(page)
|
|
192
|
+
|
|
193
|
+
with open("watermarked.pdf", "wb") as output:
|
|
194
|
+
writer.write(output)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Extract Images
|
|
198
|
+
```bash
|
|
199
|
+
pdfimages -j input.pdf output_prefix
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Password Protection
|
|
203
|
+
```python
|
|
204
|
+
from pypdf import PdfReader, PdfWriter
|
|
205
|
+
|
|
206
|
+
reader = PdfReader("input.pdf")
|
|
207
|
+
writer = PdfWriter()
|
|
208
|
+
for page in reader.pages:
|
|
209
|
+
writer.add_page(page)
|
|
210
|
+
writer.encrypt("userpassword", "ownerpassword")
|
|
211
|
+
with open("encrypted.pdf", "wb") as output:
|
|
212
|
+
writer.write(output)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Quick Reference
|
|
216
|
+
|
|
217
|
+
| Task | Best Tool | Command/Code |
|
|
218
|
+
|------|-----------|--------------|
|
|
219
|
+
| Merge PDFs | pypdf | `writer.add_page(page)` |
|
|
220
|
+
| Split PDFs | pypdf | One page per file |
|
|
221
|
+
| Extract text | pdfplumber | `page.extract_text()` |
|
|
222
|
+
| Extract tables | pdfplumber | `page.extract_tables()` |
|
|
223
|
+
| Create PDFs | reportlab | Canvas or Platypus |
|
|
224
|
+
| Command line merge | qpdf | `qpdf --empty --pages ...` |
|
|
225
|
+
| OCR scanned PDFs | pytesseract | Convert to image first |
|
|
226
|
+
| Fill PDF forms | pypdf or pdf-lib | See forms.md |
|
|
227
|
+
|
|
228
|
+
## Next Steps
|
|
229
|
+
|
|
230
|
+
- For advanced pypdfium2 usage, see reference.md
|
|
231
|
+
- For JavaScript libraries (pdf-lib), see reference.md
|
|
232
|
+
- For filling PDF forms, follow instructions in forms.md
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
**CRITICAL: Complete these steps in order. Do not skip ahead to writing code.**
|
|
2
|
+
|
|
3
|
+
First check if the PDF has fillable form fields. Run:
|
|
4
|
+
`python scripts/check_fillable_fields.py <file.pdf>`, then go to "Fillable fields" or "Non-fillable fields".
|
|
5
|
+
|
|
6
|
+
# Fillable fields
|
|
7
|
+
If the PDF has fillable form fields:
|
|
8
|
+
- Run: `python scripts/extract_form_field_info.py <input.pdf> <field_info.json>` to get field list as JSON.
|
|
9
|
+
- Convert PDF to PNGs: `python scripts/convert_pdf_to_images.py <file.pdf> <output_directory>`
|
|
10
|
+
- Analyze images to determine each field's purpose.
|
|
11
|
+
- Create `field_values.json` with values for each field.
|
|
12
|
+
- Run: `python scripts/fill_fillable_fields.py <input.pdf> <field_values.json> <output.pdf>`
|
|
13
|
+
|
|
14
|
+
# Non-fillable fields
|
|
15
|
+
If the PDF has no fillable form fields, add text annotations instead.
|
|
16
|
+
|
|
17
|
+
## Step 1: Try Structure Extraction First
|
|
18
|
+
Run: `python scripts/extract_form_structure.py <input.pdf> form_structure.json`
|
|
19
|
+
|
|
20
|
+
If meaningful labels are found, use **Approach A**. If the PDF is scanned/image-based, use **Approach B**.
|
|
21
|
+
|
|
22
|
+
## Approach A: Structure-Based Coordinates (Preferred)
|
|
23
|
+
1. Analyze form_structure.json for label groups, row structure, field columns, checkboxes
|
|
24
|
+
2. Create fields.json with PDF coordinates using `pdf_width`/`pdf_height`
|
|
25
|
+
3. Validate: `python scripts/check_bounding_boxes.py fields.json`
|
|
26
|
+
|
|
27
|
+
## Approach B: Visual Estimation (Fallback)
|
|
28
|
+
1. Convert to images: `python scripts/convert_pdf_to_images.py <input.pdf> <images_dir/>`
|
|
29
|
+
2. Identify fields with rough estimates
|
|
30
|
+
3. Zoom and refine with ImageMagick crops
|
|
31
|
+
4. Create fields.json with image coordinates using `image_width`/`image_height`
|
|
32
|
+
5. Validate: `python scripts/check_bounding_boxes.py fields.json`
|
|
33
|
+
|
|
34
|
+
## Fill and Verify
|
|
35
|
+
- Fill: `python scripts/fill_pdf_form_with_annotations.py <input.pdf> fields.json <output.pdf>`
|
|
36
|
+
- Verify: `python scripts/convert_pdf_to_images.py <output.pdf> <verify_images/>`
|