@bgicli/bgicli 2.2.7 → 2.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/skills/anthropic-algorithmic-art/SKILL.md +405 -0
- package/data/skills/anthropic-canvas-design/SKILL.md +130 -0
- package/data/skills/anthropic-claude-api/SKILL.md +243 -0
- package/data/skills/anthropic-doc-coauthoring/SKILL.md +375 -0
- package/data/skills/anthropic-docx/SKILL.md +590 -0
- package/data/skills/anthropic-frontend-design/SKILL.md +42 -0
- package/data/skills/anthropic-internal-comms/SKILL.md +32 -0
- package/data/skills/anthropic-mcp-builder/SKILL.md +236 -0
- package/data/skills/anthropic-pdf/SKILL.md +314 -0
- package/data/skills/anthropic-pptx/SKILL.md +232 -0
- package/data/skills/anthropic-skill-creator/SKILL.md +485 -0
- package/data/skills/anthropic-webapp-testing/SKILL.md +96 -0
- package/data/skills/anthropic-xlsx/SKILL.md +292 -0
- package/data/skills/arxiv-database/SKILL.md +362 -0
- package/data/skills/astropy/SKILL.md +329 -0
- package/data/skills/ctx-advanced-evaluation/SKILL.md +402 -0
- package/data/skills/ctx-bdi-mental-states/SKILL.md +311 -0
- package/data/skills/ctx-context-compression/SKILL.md +272 -0
- package/data/skills/ctx-context-degradation/SKILL.md +206 -0
- package/data/skills/ctx-context-fundamentals/SKILL.md +201 -0
- package/data/skills/ctx-context-optimization/SKILL.md +195 -0
- package/data/skills/ctx-evaluation/SKILL.md +251 -0
- package/data/skills/ctx-filesystem-context/SKILL.md +287 -0
- package/data/skills/ctx-hosted-agents/SKILL.md +260 -0
- package/data/skills/ctx-memory-systems/SKILL.md +225 -0
- package/data/skills/ctx-multi-agent-patterns/SKILL.md +257 -0
- package/data/skills/ctx-project-development/SKILL.md +291 -0
- package/data/skills/ctx-tool-design/SKILL.md +271 -0
- package/data/skills/dhdna-profiler/SKILL.md +162 -0
- package/data/skills/generate-image/SKILL.md +183 -0
- package/data/skills/geomaster/SKILL.md +365 -0
- package/data/skills/get-available-resources/SKILL.md +275 -0
- package/data/skills/hamelsmu-build-review-interface/SKILL.md +96 -0
- package/data/skills/hamelsmu-error-analysis/SKILL.md +164 -0
- package/data/skills/hamelsmu-eval-audit/SKILL.md +183 -0
- package/data/skills/hamelsmu-evaluate-rag/SKILL.md +177 -0
- package/data/skills/hamelsmu-generate-synthetic-data/SKILL.md +131 -0
- package/data/skills/hamelsmu-validate-evaluator/SKILL.md +212 -0
- package/data/skills/hamelsmu-write-judge-prompt/SKILL.md +144 -0
- package/data/skills/hf-cli/SKILL.md +174 -0
- package/data/skills/hf-mcp/SKILL.md +178 -0
- package/data/skills/hugging-face-dataset-viewer/SKILL.md +121 -0
- package/data/skills/hugging-face-datasets/SKILL.md +542 -0
- package/data/skills/hugging-face-evaluation/SKILL.md +651 -0
- package/data/skills/hugging-face-jobs/SKILL.md +1042 -0
- package/data/skills/hugging-face-model-trainer/SKILL.md +717 -0
- package/data/skills/hugging-face-paper-pages/SKILL.md +239 -0
- package/data/skills/hugging-face-paper-publisher/SKILL.md +624 -0
- package/data/skills/hugging-face-tool-builder/SKILL.md +110 -0
- package/data/skills/hugging-face-trackio/SKILL.md +115 -0
- package/data/skills/hugging-face-vision-trainer/SKILL.md +593 -0
- package/data/skills/huggingface-gradio/SKILL.md +245 -0
- package/data/skills/matlab/SKILL.md +376 -0
- package/data/skills/modal/SKILL.md +381 -0
- package/data/skills/openai-cloudflare-deploy/SKILL.md +224 -0
- package/data/skills/openai-develop-web-game/SKILL.md +149 -0
- package/data/skills/openai-doc/SKILL.md +80 -0
- package/data/skills/openai-figma/SKILL.md +42 -0
- package/data/skills/openai-figma-implement-design/SKILL.md +264 -0
- package/data/skills/openai-gh-address-comments/SKILL.md +25 -0
- package/data/skills/openai-gh-fix-ci/SKILL.md +69 -0
- package/data/skills/openai-imagegen/SKILL.md +174 -0
- package/data/skills/openai-jupyter-notebook/SKILL.md +107 -0
- package/data/skills/openai-linear/SKILL.md +87 -0
- package/data/skills/openai-netlify-deploy/SKILL.md +247 -0
- package/data/skills/openai-notion-knowledge-capture/SKILL.md +56 -0
- package/data/skills/openai-notion-meeting-intelligence/SKILL.md +60 -0
- package/data/skills/openai-notion-research-documentation/SKILL.md +59 -0
- package/data/skills/openai-notion-spec-to-implementation/SKILL.md +58 -0
- package/data/skills/openai-openai-docs/SKILL.md +69 -0
- package/data/skills/openai-pdf/SKILL.md +67 -0
- package/data/skills/openai-playwright/SKILL.md +147 -0
- package/data/skills/openai-render-deploy/SKILL.md +479 -0
- package/data/skills/openai-screenshot/SKILL.md +267 -0
- package/data/skills/openai-security-best-practices/SKILL.md +86 -0
- package/data/skills/openai-security-ownership-map/SKILL.md +206 -0
- package/data/skills/openai-security-threat-model/SKILL.md +81 -0
- package/data/skills/openai-sentry/SKILL.md +123 -0
- package/data/skills/openai-sora/SKILL.md +178 -0
- package/data/skills/openai-speech/SKILL.md +144 -0
- package/data/skills/openai-spreadsheet/SKILL.md +145 -0
- package/data/skills/openai-transcribe/SKILL.md +81 -0
- package/data/skills/openai-vercel-deploy/SKILL.md +77 -0
- package/data/skills/openai-yeet/SKILL.md +28 -0
- package/data/skills/pennylane/SKILL.md +224 -0
- package/data/skills/polars-bio/SKILL.md +374 -0
- package/data/skills/primekg/SKILL.md +97 -0
- package/data/skills/pymatgen/SKILL.md +689 -0
- package/data/skills/qiskit/SKILL.md +273 -0
- package/data/skills/qutip/SKILL.md +316 -0
- package/data/skills/recursive-decomposition/SKILL.md +185 -0
- package/data/skills/rowan/SKILL.md +427 -0
- package/data/skills/scholar-evaluation/SKILL.md +298 -0
- package/data/skills/sentry-create-alert/SKILL.md +210 -0
- package/data/skills/sentry-fix-issues/SKILL.md +126 -0
- package/data/skills/sentry-pr-code-review/SKILL.md +105 -0
- package/data/skills/sentry-python-sdk/SKILL.md +317 -0
- package/data/skills/sentry-setup-ai-monitoring/SKILL.md +217 -0
- package/data/skills/stable-baselines3/SKILL.md +297 -0
- package/data/skills/sympy/SKILL.md +498 -0
- package/data/skills/trailofbits-ask-questions-if-underspecified/SKILL.md +85 -0
- package/data/skills/trailofbits-audit-context-building/SKILL.md +302 -0
- package/data/skills/trailofbits-differential-review/SKILL.md +220 -0
- package/data/skills/trailofbits-insecure-defaults/SKILL.md +117 -0
- package/data/skills/trailofbits-modern-python/SKILL.md +333 -0
- package/data/skills/trailofbits-property-based-testing/SKILL.md +123 -0
- package/data/skills/trailofbits-semgrep-rule-creator/SKILL.md +172 -0
- package/data/skills/trailofbits-sharp-edges/SKILL.md +292 -0
- package/data/skills/trailofbits-variant-analysis/SKILL.md +142 -0
- package/data/skills/transformers.js/SKILL.md +637 -0
- package/data/skills/writing/SKILL.md +419 -0
- package/data/workflows/survival-analysis-clinical/SKILL.md +348 -0
- package/data/workflows/survival-analysis-clinical/scripts/full_workflow.R +95 -0
- package/data/workflows/survival-analysis-clinical/scripts/load_example_data.R +65 -0
- package/data/workflows/survival-analysis-clinical/scripts/plot_forest.R +46 -0
- package/dist/bgi.js +1608 -233
- package/package.json +45 -45
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: mcp-builder
|
|
3
|
+
description: Guide for creating high-quality MCP (Model Context Protocol) servers that enable LLMs to interact with external services through well-designed tools. Use when building MCP servers to integrate external APIs or services, whether in Python (FastMCP) or Node/TypeScript (MCP SDK).
|
|
4
|
+
license: Complete terms in LICENSE.txt
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# MCP Server Development Guide
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
Create MCP (Model Context Protocol) servers that enable LLMs to interact with external services through well-designed tools. The quality of an MCP server is measured by how well it enables LLMs to accomplish real-world tasks.
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
# Process
|
|
16
|
+
|
|
17
|
+
## 🚀 High-Level Workflow
|
|
18
|
+
|
|
19
|
+
Creating a high-quality MCP server involves four main phases:
|
|
20
|
+
|
|
21
|
+
### Phase 1: Deep Research and Planning
|
|
22
|
+
|
|
23
|
+
#### 1.1 Understand Modern MCP Design
|
|
24
|
+
|
|
25
|
+
**API Coverage vs. Workflow Tools:**
|
|
26
|
+
Balance comprehensive API endpoint coverage with specialized workflow tools. Workflow tools can be more convenient for specific tasks, while comprehensive coverage gives agents flexibility to compose operations. Performance varies by client—some clients benefit from code execution that combines basic tools, while others work better with higher-level workflows. When uncertain, prioritize comprehensive API coverage.
|
|
27
|
+
|
|
28
|
+
**Tool Naming and Discoverability:**
|
|
29
|
+
Clear, descriptive tool names help agents find the right tools quickly. Use consistent prefixes (e.g., `github_create_issue`, `github_list_repos`) and action-oriented naming.
|
|
30
|
+
|
|
31
|
+
**Context Management:**
|
|
32
|
+
Agents benefit from concise tool descriptions and the ability to filter/paginate results. Design tools that return focused, relevant data. Some clients support code execution which can help agents filter and process data efficiently.
|
|
33
|
+
|
|
34
|
+
**Actionable Error Messages:**
|
|
35
|
+
Error messages should guide agents toward solutions with specific suggestions and next steps.
|
|
36
|
+
|
|
37
|
+
#### 1.2 Study MCP Protocol Documentation
|
|
38
|
+
|
|
39
|
+
**Navigate the MCP specification:**
|
|
40
|
+
|
|
41
|
+
Start with the sitemap to find relevant pages: `https://modelcontextprotocol.io/sitemap.xml`
|
|
42
|
+
|
|
43
|
+
Then fetch specific pages with `.md` suffix for markdown format (e.g., `https://modelcontextprotocol.io/specification/draft.md`).
|
|
44
|
+
|
|
45
|
+
Key pages to review:
|
|
46
|
+
- Specification overview and architecture
|
|
47
|
+
- Transport mechanisms (streamable HTTP, stdio)
|
|
48
|
+
- Tool, resource, and prompt definitions
|
|
49
|
+
|
|
50
|
+
#### 1.3 Study Framework Documentation
|
|
51
|
+
|
|
52
|
+
**Recommended stack:**
|
|
53
|
+
- **Language**: TypeScript (high-quality SDK support and good compatibility with many execution environments, e.g. MCPB; AI models are also good at generating TypeScript code, thanks to its broad usage, static typing, and good linting tools)
|
|
54
|
+
- **Transport**: Streamable HTTP for remote servers, using stateless JSON (simpler to scale and maintain than stateful sessions and streaming responses); stdio for local servers.
|
|
55
|
+
|
|
56
|
+
**Load framework documentation:**
|
|
57
|
+
|
|
58
|
+
- **MCP Best Practices**: [📋 View Best Practices](./reference/mcp_best_practices.md) - Core guidelines
|
|
59
|
+
|
|
60
|
+
**For TypeScript (recommended):**
|
|
61
|
+
- **TypeScript SDK**: Use WebFetch to load `https://raw.githubusercontent.com/modelcontextprotocol/typescript-sdk/main/README.md`
|
|
62
|
+
- [⚡ TypeScript Guide](./reference/node_mcp_server.md) - TypeScript patterns and examples
|
|
63
|
+
|
|
64
|
+
**For Python:**
|
|
65
|
+
- **Python SDK**: Use WebFetch to load `https://raw.githubusercontent.com/modelcontextprotocol/python-sdk/main/README.md`
|
|
66
|
+
- [🐍 Python Guide](./reference/python_mcp_server.md) - Python patterns and examples
|
|
67
|
+
|
|
68
|
+
#### 1.4 Plan Your Implementation
|
|
69
|
+
|
|
70
|
+
**Understand the API:**
|
|
71
|
+
Review the service's API documentation to identify key endpoints, authentication requirements, and data models. Use web search and WebFetch as needed.
|
|
72
|
+
|
|
73
|
+
**Tool Selection:**
|
|
74
|
+
Prioritize comprehensive API coverage. List endpoints to implement, starting with the most common operations.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
### Phase 2: Implementation
|
|
79
|
+
|
|
80
|
+
#### 2.1 Set Up Project Structure
|
|
81
|
+
|
|
82
|
+
See language-specific guides for project setup:
|
|
83
|
+
- [⚡ TypeScript Guide](./reference/node_mcp_server.md) - Project structure, package.json, tsconfig.json
|
|
84
|
+
- [🐍 Python Guide](./reference/python_mcp_server.md) - Module organization, dependencies
|
|
85
|
+
|
|
86
|
+
#### 2.2 Implement Core Infrastructure
|
|
87
|
+
|
|
88
|
+
Create shared utilities:
|
|
89
|
+
- API client with authentication
|
|
90
|
+
- Error handling helpers
|
|
91
|
+
- Response formatting (JSON/Markdown)
|
|
92
|
+
- Pagination support
|
|
93
|
+
|
|
94
|
+
#### 2.3 Implement Tools
|
|
95
|
+
|
|
96
|
+
For each tool:
|
|
97
|
+
|
|
98
|
+
**Input Schema:**
|
|
99
|
+
- Use Zod (TypeScript) or Pydantic (Python)
|
|
100
|
+
- Include constraints and clear descriptions
|
|
101
|
+
- Add examples in field descriptions
|
|
102
|
+
|
|
103
|
+
**Output Schema:**
|
|
104
|
+
- Define `outputSchema` where possible for structured data
|
|
105
|
+
- Use `structuredContent` in tool responses (TypeScript SDK feature)
|
|
106
|
+
- Helps clients understand and process tool outputs
|
|
107
|
+
|
|
108
|
+
**Tool Description:**
|
|
109
|
+
- Concise summary of functionality
|
|
110
|
+
- Parameter descriptions
|
|
111
|
+
- Return type schema
|
|
112
|
+
|
|
113
|
+
**Implementation:**
|
|
114
|
+
- Async/await for I/O operations
|
|
115
|
+
- Proper error handling with actionable messages
|
|
116
|
+
- Support pagination where applicable
|
|
117
|
+
- Return both text content and structured data when using modern SDKs
|
|
118
|
+
|
|
119
|
+
**Annotations:**
|
|
120
|
+
- `readOnlyHint`: true/false
|
|
121
|
+
- `destructiveHint`: true/false
|
|
122
|
+
- `idempotentHint`: true/false
|
|
123
|
+
- `openWorldHint`: true/false
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
### Phase 3: Review and Test
|
|
128
|
+
|
|
129
|
+
#### 3.1 Code Quality
|
|
130
|
+
|
|
131
|
+
Review for:
|
|
132
|
+
- No duplicated code (DRY principle)
|
|
133
|
+
- Consistent error handling
|
|
134
|
+
- Full type coverage
|
|
135
|
+
- Clear tool descriptions
|
|
136
|
+
|
|
137
|
+
#### 3.2 Build and Test
|
|
138
|
+
|
|
139
|
+
**TypeScript:**
|
|
140
|
+
- Run `npm run build` to verify compilation
|
|
141
|
+
- Test with MCP Inspector: `npx @modelcontextprotocol/inspector`
|
|
142
|
+
|
|
143
|
+
**Python:**
|
|
144
|
+
- Verify syntax: `python -m py_compile your_server.py`
|
|
145
|
+
- Test with MCP Inspector
|
|
146
|
+
|
|
147
|
+
See language-specific guides for detailed testing approaches and quality checklists.
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
### Phase 4: Create Evaluations
|
|
152
|
+
|
|
153
|
+
After implementing your MCP server, create comprehensive evaluations to test its effectiveness.
|
|
154
|
+
|
|
155
|
+
**Load [✅ Evaluation Guide](./reference/evaluation.md) for complete evaluation guidelines.**
|
|
156
|
+
|
|
157
|
+
#### 4.1 Understand Evaluation Purpose
|
|
158
|
+
|
|
159
|
+
Use evaluations to test whether LLMs can effectively use your MCP server to answer realistic, complex questions.
|
|
160
|
+
|
|
161
|
+
#### 4.2 Create 10 Evaluation Questions
|
|
162
|
+
|
|
163
|
+
To create effective evaluations, follow the process outlined in the evaluation guide:
|
|
164
|
+
|
|
165
|
+
1. **Tool Inspection**: List available tools and understand their capabilities
|
|
166
|
+
2. **Content Exploration**: Use READ-ONLY operations to explore available data
|
|
167
|
+
3. **Question Generation**: Create 10 complex, realistic questions
|
|
168
|
+
4. **Answer Verification**: Solve each question yourself to verify answers
|
|
169
|
+
|
|
170
|
+
#### 4.3 Evaluation Requirements
|
|
171
|
+
|
|
172
|
+
Ensure each question is:
|
|
173
|
+
- **Independent**: Not dependent on other questions
|
|
174
|
+
- **Read-only**: Only non-destructive operations required
|
|
175
|
+
- **Complex**: Requiring multiple tool calls and deep exploration
|
|
176
|
+
- **Realistic**: Based on real use cases humans would care about
|
|
177
|
+
- **Verifiable**: Single, clear answer that can be verified by string comparison
|
|
178
|
+
- **Stable**: Answer won't change over time
|
|
179
|
+
|
|
180
|
+
#### 4.4 Output Format
|
|
181
|
+
|
|
182
|
+
Create an XML file with this structure:
|
|
183
|
+
|
|
184
|
+
```xml
|
|
185
|
+
<evaluation>
|
|
186
|
+
<qa_pair>
|
|
187
|
+
<question>Find discussions about AI model launches with animal codenames. One model needed a specific safety designation that uses the format ASL-X. What number X was being determined for the model named after a spotted wild cat?</question>
|
|
188
|
+
<answer>3</answer>
|
|
189
|
+
</qa_pair>
|
|
190
|
+
<!-- More qa_pairs... -->
|
|
191
|
+
</evaluation>
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
# Reference Files
|
|
197
|
+
|
|
198
|
+
## 📚 Documentation Library
|
|
199
|
+
|
|
200
|
+
Load these resources as needed during development:
|
|
201
|
+
|
|
202
|
+
### Core MCP Documentation (Load First)
|
|
203
|
+
- **MCP Protocol**: Start with sitemap at `https://modelcontextprotocol.io/sitemap.xml`, then fetch specific pages with `.md` suffix
|
|
204
|
+
- [📋 MCP Best Practices](./reference/mcp_best_practices.md) - Universal MCP guidelines including:
|
|
205
|
+
- Server and tool naming conventions
|
|
206
|
+
- Response format guidelines (JSON vs Markdown)
|
|
207
|
+
- Pagination best practices
|
|
208
|
+
- Transport selection (streamable HTTP vs stdio)
|
|
209
|
+
- Security and error handling standards
|
|
210
|
+
|
|
211
|
+
### SDK Documentation (Load During Phase 1/2)
|
|
212
|
+
- **Python SDK**: Fetch from `https://raw.githubusercontent.com/modelcontextprotocol/python-sdk/main/README.md`
|
|
213
|
+
- **TypeScript SDK**: Fetch from `https://raw.githubusercontent.com/modelcontextprotocol/typescript-sdk/main/README.md`
|
|
214
|
+
|
|
215
|
+
### Language-Specific Implementation Guides (Load During Phase 2)
|
|
216
|
+
- [🐍 Python Implementation Guide](./reference/python_mcp_server.md) - Complete Python/FastMCP guide with:
|
|
217
|
+
- Server initialization patterns
|
|
218
|
+
- Pydantic model examples
|
|
219
|
+
- Tool registration with `@mcp.tool`
|
|
220
|
+
- Complete working examples
|
|
221
|
+
- Quality checklist
|
|
222
|
+
|
|
223
|
+
- [⚡ TypeScript Implementation Guide](./reference/node_mcp_server.md) - Complete TypeScript guide with:
|
|
224
|
+
- Project structure
|
|
225
|
+
- Zod schema patterns
|
|
226
|
+
- Tool registration with `server.registerTool`
|
|
227
|
+
- Complete working examples
|
|
228
|
+
- Quality checklist
|
|
229
|
+
|
|
230
|
+
### Evaluation Guide (Load During Phase 4)
|
|
231
|
+
- [✅ Evaluation Guide](./reference/evaluation.md) - Complete evaluation creation guide with:
|
|
232
|
+
- Question creation guidelines
|
|
233
|
+
- Answer verification strategies
|
|
234
|
+
- XML format specifications
|
|
235
|
+
- Example questions and answers
|
|
236
|
+
- Running an evaluation with the provided scripts
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: pdf
|
|
3
|
+
description: Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill.
|
|
4
|
+
license: Proprietary. LICENSE.txt has complete terms
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# PDF Processing Guide
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see REFERENCE.md. If you need to fill out a PDF form, read FORMS.md and follow its instructions.
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from pypdf import PdfReader, PdfWriter
|
|
17
|
+
|
|
18
|
+
# Read a PDF
|
|
19
|
+
reader = PdfReader("document.pdf")
|
|
20
|
+
print(f"Pages: {len(reader.pages)}")
|
|
21
|
+
|
|
22
|
+
# Extract text
|
|
23
|
+
text = ""
|
|
24
|
+
for page in reader.pages:
|
|
25
|
+
text += page.extract_text()
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Python Libraries
|
|
29
|
+
|
|
30
|
+
### pypdf - Basic Operations
|
|
31
|
+
|
|
32
|
+
#### Merge PDFs
|
|
33
|
+
```python
|
|
34
|
+
from pypdf import PdfWriter, PdfReader
|
|
35
|
+
|
|
36
|
+
writer = PdfWriter()
|
|
37
|
+
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
|
|
38
|
+
reader = PdfReader(pdf_file)
|
|
39
|
+
for page in reader.pages:
|
|
40
|
+
writer.add_page(page)
|
|
41
|
+
|
|
42
|
+
with open("merged.pdf", "wb") as output:
|
|
43
|
+
writer.write(output)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
#### Split PDF
|
|
47
|
+
```python
|
|
48
|
+
reader = PdfReader("input.pdf")
|
|
49
|
+
for i, page in enumerate(reader.pages):
|
|
50
|
+
writer = PdfWriter()
|
|
51
|
+
writer.add_page(page)
|
|
52
|
+
with open(f"page_{i+1}.pdf", "wb") as output:
|
|
53
|
+
writer.write(output)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
#### Extract Metadata
|
|
57
|
+
```python
|
|
58
|
+
reader = PdfReader("document.pdf")
|
|
59
|
+
meta = reader.metadata
|
|
60
|
+
print(f"Title: {meta.title}")
|
|
61
|
+
print(f"Author: {meta.author}")
|
|
62
|
+
print(f"Subject: {meta.subject}")
|
|
63
|
+
print(f"Creator: {meta.creator}")
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
#### Rotate Pages
|
|
67
|
+
```python
|
|
68
|
+
reader = PdfReader("input.pdf")
|
|
69
|
+
writer = PdfWriter()
|
|
70
|
+
|
|
71
|
+
page = reader.pages[0]
|
|
72
|
+
page.rotate(90) # Rotate 90 degrees clockwise
|
|
73
|
+
writer.add_page(page)
|
|
74
|
+
|
|
75
|
+
with open("rotated.pdf", "wb") as output:
|
|
76
|
+
writer.write(output)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### pdfplumber - Text and Table Extraction
|
|
80
|
+
|
|
81
|
+
#### Extract Text with Layout
|
|
82
|
+
```python
|
|
83
|
+
import pdfplumber
|
|
84
|
+
|
|
85
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
86
|
+
for page in pdf.pages:
|
|
87
|
+
text = page.extract_text()
|
|
88
|
+
print(text)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
#### Extract Tables
|
|
92
|
+
```python
|
|
93
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
94
|
+
for i, page in enumerate(pdf.pages):
|
|
95
|
+
tables = page.extract_tables()
|
|
96
|
+
for j, table in enumerate(tables):
|
|
97
|
+
print(f"Table {j+1} on page {i+1}:")
|
|
98
|
+
for row in table:
|
|
99
|
+
print(row)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
#### Advanced Table Extraction
|
|
103
|
+
```python
|
|
104
|
+
import pandas as pd
|
|
105
|
+
|
|
106
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
107
|
+
all_tables = []
|
|
108
|
+
for page in pdf.pages:
|
|
109
|
+
tables = page.extract_tables()
|
|
110
|
+
for table in tables:
|
|
111
|
+
if table: # Check if table is not empty
|
|
112
|
+
df = pd.DataFrame(table[1:], columns=table[0])
|
|
113
|
+
all_tables.append(df)
|
|
114
|
+
|
|
115
|
+
# Combine all tables
|
|
116
|
+
if all_tables:
|
|
117
|
+
combined_df = pd.concat(all_tables, ignore_index=True)
|
|
118
|
+
combined_df.to_excel("extracted_tables.xlsx", index=False)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### reportlab - Create PDFs
|
|
122
|
+
|
|
123
|
+
#### Basic PDF Creation
|
|
124
|
+
```python
|
|
125
|
+
from reportlab.lib.pagesizes import letter
|
|
126
|
+
from reportlab.pdfgen import canvas
|
|
127
|
+
|
|
128
|
+
c = canvas.Canvas("hello.pdf", pagesize=letter)
|
|
129
|
+
width, height = letter
|
|
130
|
+
|
|
131
|
+
# Add text
|
|
132
|
+
c.drawString(100, height - 100, "Hello World!")
|
|
133
|
+
c.drawString(100, height - 120, "This is a PDF created with reportlab")
|
|
134
|
+
|
|
135
|
+
# Add a line
|
|
136
|
+
c.line(100, height - 140, 400, height - 140)
|
|
137
|
+
|
|
138
|
+
# Save
|
|
139
|
+
c.save()
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
#### Create PDF with Multiple Pages
|
|
143
|
+
```python
|
|
144
|
+
from reportlab.lib.pagesizes import letter
|
|
145
|
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
|
|
146
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
147
|
+
|
|
148
|
+
doc = SimpleDocTemplate("report.pdf", pagesize=letter)
|
|
149
|
+
styles = getSampleStyleSheet()
|
|
150
|
+
story = []
|
|
151
|
+
|
|
152
|
+
# Add content
|
|
153
|
+
title = Paragraph("Report Title", styles['Title'])
|
|
154
|
+
story.append(title)
|
|
155
|
+
story.append(Spacer(1, 12))
|
|
156
|
+
|
|
157
|
+
body = Paragraph("This is the body of the report. " * 20, styles['Normal'])
|
|
158
|
+
story.append(body)
|
|
159
|
+
story.append(PageBreak())
|
|
160
|
+
|
|
161
|
+
# Page 2
|
|
162
|
+
story.append(Paragraph("Page 2", styles['Heading1']))
|
|
163
|
+
story.append(Paragraph("Content for page 2", styles['Normal']))
|
|
164
|
+
|
|
165
|
+
# Build PDF
|
|
166
|
+
doc.build(story)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
#### Subscripts and Superscripts
|
|
170
|
+
|
|
171
|
+
**IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
|
|
172
|
+
|
|
173
|
+
Instead, use ReportLab's XML markup tags in Paragraph objects:
|
|
174
|
+
```python
|
|
175
|
+
from reportlab.platypus import Paragraph
|
|
176
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
177
|
+
|
|
178
|
+
styles = getSampleStyleSheet()
|
|
179
|
+
|
|
180
|
+
# Subscripts: use <sub> tag
|
|
181
|
+
chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
|
|
182
|
+
|
|
183
|
+
# Superscripts: use <super> tag
|
|
184
|
+
squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
For canvas-drawn text (not Paragraph objects), manually adjust the font size and position rather than using Unicode subscripts/superscripts.
|
|
188
|
+
|
|
189
|
+
## Command-Line Tools
|
|
190
|
+
|
|
191
|
+
### pdftotext (poppler-utils)
|
|
192
|
+
```bash
|
|
193
|
+
# Extract text
|
|
194
|
+
pdftotext input.pdf output.txt
|
|
195
|
+
|
|
196
|
+
# Extract text preserving layout
|
|
197
|
+
pdftotext -layout input.pdf output.txt
|
|
198
|
+
|
|
199
|
+
# Extract specific pages
|
|
200
|
+
pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### qpdf
|
|
204
|
+
```bash
|
|
205
|
+
# Merge PDFs
|
|
206
|
+
qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
|
|
207
|
+
|
|
208
|
+
# Split pages
|
|
209
|
+
qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
|
|
210
|
+
qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
|
|
211
|
+
|
|
212
|
+
# Rotate pages
|
|
213
|
+
qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees
|
|
214
|
+
|
|
215
|
+
# Remove password
|
|
216
|
+
qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### pdftk (if available)
|
|
220
|
+
```bash
|
|
221
|
+
# Merge
|
|
222
|
+
pdftk file1.pdf file2.pdf cat output merged.pdf
|
|
223
|
+
|
|
224
|
+
# Split
|
|
225
|
+
pdftk input.pdf burst
|
|
226
|
+
|
|
227
|
+
# Rotate
|
|
228
|
+
pdftk input.pdf rotate 1east output rotated.pdf
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## Common Tasks
|
|
232
|
+
|
|
233
|
+
### Extract Text from Scanned PDFs
|
|
234
|
+
```python
|
|
235
|
+
# Requires: pip install pytesseract pdf2image
|
|
236
|
+
import pytesseract
|
|
237
|
+
from pdf2image import convert_from_path
|
|
238
|
+
|
|
239
|
+
# Convert PDF to images
|
|
240
|
+
images = convert_from_path('scanned.pdf')
|
|
241
|
+
|
|
242
|
+
# OCR each page
|
|
243
|
+
text = ""
|
|
244
|
+
for i, image in enumerate(images):
|
|
245
|
+
text += f"Page {i+1}:\n"
|
|
246
|
+
text += pytesseract.image_to_string(image)
|
|
247
|
+
text += "\n\n"
|
|
248
|
+
|
|
249
|
+
print(text)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Add Watermark
|
|
253
|
+
```python
|
|
254
|
+
from pypdf import PdfReader, PdfWriter
|
|
255
|
+
|
|
256
|
+
# Create watermark (or load existing)
|
|
257
|
+
watermark = PdfReader("watermark.pdf").pages[0]
|
|
258
|
+
|
|
259
|
+
# Apply to all pages
|
|
260
|
+
reader = PdfReader("document.pdf")
|
|
261
|
+
writer = PdfWriter()
|
|
262
|
+
|
|
263
|
+
for page in reader.pages:
|
|
264
|
+
page.merge_page(watermark)
|
|
265
|
+
writer.add_page(page)
|
|
266
|
+
|
|
267
|
+
with open("watermarked.pdf", "wb") as output:
|
|
268
|
+
writer.write(output)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### Extract Images
|
|
272
|
+
```bash
|
|
273
|
+
# Using pdfimages (poppler-utils)
|
|
274
|
+
pdfimages -j input.pdf output_prefix
|
|
275
|
+
|
|
276
|
+
# This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc.
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
### Password Protection
|
|
280
|
+
```python
|
|
281
|
+
from pypdf import PdfReader, PdfWriter
|
|
282
|
+
|
|
283
|
+
reader = PdfReader("input.pdf")
|
|
284
|
+
writer = PdfWriter()
|
|
285
|
+
|
|
286
|
+
for page in reader.pages:
|
|
287
|
+
writer.add_page(page)
|
|
288
|
+
|
|
289
|
+
# Add password
|
|
290
|
+
writer.encrypt("userpassword", "ownerpassword")
|
|
291
|
+
|
|
292
|
+
with open("encrypted.pdf", "wb") as output:
|
|
293
|
+
writer.write(output)
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## Quick Reference
|
|
297
|
+
|
|
298
|
+
| Task | Best Tool | Command/Code |
|
|
299
|
+
|------|-----------|--------------|
|
|
300
|
+
| Merge PDFs | pypdf | `writer.add_page(page)` |
|
|
301
|
+
| Split PDFs | pypdf | One page per file |
|
|
302
|
+
| Extract text | pdfplumber | `page.extract_text()` |
|
|
303
|
+
| Extract tables | pdfplumber | `page.extract_tables()` |
|
|
304
|
+
| Create PDFs | reportlab | Canvas or Platypus |
|
|
305
|
+
| Command line merge | qpdf | `qpdf --empty --pages ...` |
|
|
306
|
+
| OCR scanned PDFs | pytesseract | Convert to image first |
|
|
307
|
+
| Fill PDF forms | pdf-lib or pypdf (see FORMS.md) | See FORMS.md |
|
|
308
|
+
|
|
309
|
+
## Next Steps
|
|
310
|
+
|
|
311
|
+
- For advanced pypdfium2 usage, see REFERENCE.md
|
|
312
|
+
- For JavaScript libraries (pdf-lib), see REFERENCE.md
|
|
313
|
+
- If you need to fill out a PDF form, follow the instructions in FORMS.md
|
|
314
|
+
- For troubleshooting guides, see REFERENCE.md
|