@shuji-bonji/pdf-reader-mcp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/CHANGELOG.md +44 -0
  2. package/LICENSE +21 -0
  3. package/README.ja.md +190 -0
  4. package/README.md +206 -0
  5. package/dist/constants.d.ts +22 -0
  6. package/dist/constants.d.ts.map +1 -0
  7. package/dist/constants.js +23 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/index.d.ts +9 -0
  10. package/dist/index.d.ts.map +1 -0
  11. package/dist/index.js +35 -0
  12. package/dist/index.js.map +1 -0
  13. package/dist/schemas/common.d.ts +14 -0
  14. package/dist/schemas/common.d.ts.map +1 -0
  15. package/dist/schemas/common.js +26 -0
  16. package/dist/schemas/common.js.map +1 -0
  17. package/dist/schemas/tier1.d.ts +104 -0
  18. package/dist/schemas/tier1.d.ts.map +1 -0
  19. package/dist/schemas/tier1.js +77 -0
  20. package/dist/schemas/tier1.js.map +1 -0
  21. package/dist/schemas/tier2.d.ts +68 -0
  22. package/dist/schemas/tier2.d.ts.map +1 -0
  23. package/dist/schemas/tier2.js +42 -0
  24. package/dist/schemas/tier2.js.map +1 -0
  25. package/dist/schemas/tier3.d.ts +44 -0
  26. package/dist/schemas/tier3.d.ts.map +1 -0
  27. package/dist/schemas/tier3.js +28 -0
  28. package/dist/schemas/tier3.js.map +1 -0
  29. package/dist/services/pdfjs-service.d.ts +65 -0
  30. package/dist/services/pdfjs-service.d.ts.map +1 -0
  31. package/dist/services/pdfjs-service.js +520 -0
  32. package/dist/services/pdfjs-service.js.map +1 -0
  33. package/dist/services/pdflib-service.d.ts +35 -0
  34. package/dist/services/pdflib-service.d.ts.map +1 -0
  35. package/dist/services/pdflib-service.js +318 -0
  36. package/dist/services/pdflib-service.js.map +1 -0
  37. package/dist/services/url-fetcher.d.ts +8 -0
  38. package/dist/services/url-fetcher.d.ts.map +1 -0
  39. package/dist/services/url-fetcher.js +40 -0
  40. package/dist/services/url-fetcher.js.map +1 -0
  41. package/dist/services/validation-service.d.ts +49 -0
  42. package/dist/services/validation-service.d.ts.map +1 -0
  43. package/dist/services/validation-service.js +670 -0
  44. package/dist/services/validation-service.js.map +1 -0
  45. package/dist/tools/index.d.ts +10 -0
  46. package/dist/tools/index.d.ts.map +1 -0
  47. package/dist/tools/index.js +46 -0
  48. package/dist/tools/index.js.map +1 -0
  49. package/dist/tools/tier1/get-metadata.d.ts +6 -0
  50. package/dist/tools/tier1/get-metadata.d.ts.map +1 -0
  51. package/dist/tools/tier1/get-metadata.js +49 -0
  52. package/dist/tools/tier1/get-metadata.js.map +1 -0
  53. package/dist/tools/tier1/get-page-count.d.ts +6 -0
  54. package/dist/tools/tier1/get-page-count.d.ts.map +1 -0
  55. package/dist/tools/tier1/get-page-count.js +50 -0
  56. package/dist/tools/tier1/get-page-count.js.map +1 -0
  57. package/dist/tools/tier1/read-images.d.ts +6 -0
  58. package/dist/tools/tier1/read-images.d.ts.map +1 -0
  59. package/dist/tools/tier1/read-images.js +79 -0
  60. package/dist/tools/tier1/read-images.js.map +1 -0
  61. package/dist/tools/tier1/read-text.d.ts +6 -0
  62. package/dist/tools/tier1/read-text.d.ts.map +1 -0
  63. package/dist/tools/tier1/read-text.js +57 -0
  64. package/dist/tools/tier1/read-text.js.map +1 -0
  65. package/dist/tools/tier1/read-url.d.ts +6 -0
  66. package/dist/tools/tier1/read-url.d.ts.map +1 -0
  67. package/dist/tools/tier1/read-url.js +64 -0
  68. package/dist/tools/tier1/read-url.js.map +1 -0
  69. package/dist/tools/tier1/search-text.d.ts +6 -0
  70. package/dist/tools/tier1/search-text.d.ts.map +1 -0
  71. package/dist/tools/tier1/search-text.js +62 -0
  72. package/dist/tools/tier1/search-text.js.map +1 -0
  73. package/dist/tools/tier1/summarize.d.ts +6 -0
  74. package/dist/tools/tier1/summarize.d.ts.map +1 -0
  75. package/dist/tools/tier1/summarize.js +70 -0
  76. package/dist/tools/tier1/summarize.js.map +1 -0
  77. package/dist/tools/tier2/inspect-annotations.d.ts +6 -0
  78. package/dist/tools/tier2/inspect-annotations.d.ts.map +1 -0
  79. package/dist/tools/tier2/inspect-annotations.js +47 -0
  80. package/dist/tools/tier2/inspect-annotations.js.map +1 -0
  81. package/dist/tools/tier2/inspect-fonts.d.ts +6 -0
  82. package/dist/tools/tier2/inspect-fonts.d.ts.map +1 -0
  83. package/dist/tools/tier2/inspect-fonts.js +54 -0
  84. package/dist/tools/tier2/inspect-fonts.js.map +1 -0
  85. package/dist/tools/tier2/inspect-signatures.d.ts +6 -0
  86. package/dist/tools/tier2/inspect-signatures.d.ts.map +1 -0
  87. package/dist/tools/tier2/inspect-signatures.js +48 -0
  88. package/dist/tools/tier2/inspect-signatures.js.map +1 -0
  89. package/dist/tools/tier2/inspect-structure.d.ts +6 -0
  90. package/dist/tools/tier2/inspect-structure.d.ts.map +1 -0
  91. package/dist/tools/tier2/inspect-structure.js +46 -0
  92. package/dist/tools/tier2/inspect-structure.js.map +1 -0
  93. package/dist/tools/tier2/inspect-tags.d.ts +6 -0
  94. package/dist/tools/tier2/inspect-tags.d.ts.map +1 -0
  95. package/dist/tools/tier2/inspect-tags.js +46 -0
  96. package/dist/tools/tier2/inspect-tags.js.map +1 -0
  97. package/dist/tools/tier3/compare-structure.d.ts +6 -0
  98. package/dist/tools/tier3/compare-structure.d.ts.map +1 -0
  99. package/dist/tools/tier3/compare-structure.js +47 -0
  100. package/dist/tools/tier3/compare-structure.js.map +1 -0
  101. package/dist/tools/tier3/validate-metadata.d.ts +6 -0
  102. package/dist/tools/tier3/validate-metadata.d.ts.map +1 -0
  103. package/dist/tools/tier3/validate-metadata.js +57 -0
  104. package/dist/tools/tier3/validate-metadata.js.map +1 -0
  105. package/dist/tools/tier3/validate-tagged.d.ts +6 -0
  106. package/dist/tools/tier3/validate-tagged.d.ts.map +1 -0
  107. package/dist/tools/tier3/validate-tagged.js +56 -0
  108. package/dist/tools/tier3/validate-tagged.js.map +1 -0
  109. package/dist/types.d.ts +226 -0
  110. package/dist/types.d.ts.map +1 -0
  111. package/dist/types.js +5 -0
  112. package/dist/types.js.map +1 -0
  113. package/dist/utils/batch-processor.d.ts +60 -0
  114. package/dist/utils/batch-processor.d.ts.map +1 -0
  115. package/dist/utils/batch-processor.js +72 -0
  116. package/dist/utils/batch-processor.js.map +1 -0
  117. package/dist/utils/error-handler.d.ts +23 -0
  118. package/dist/utils/error-handler.d.ts.map +1 -0
  119. package/dist/utils/error-handler.js +76 -0
  120. package/dist/utils/error-handler.js.map +1 -0
  121. package/dist/utils/formatter.d.ts +64 -0
  122. package/dist/utils/formatter.d.ts.map +1 -0
  123. package/dist/utils/formatter.js +379 -0
  124. package/dist/utils/formatter.js.map +1 -0
  125. package/dist/utils/pdf-helpers.d.ts +22 -0
  126. package/dist/utils/pdf-helpers.d.ts.map +1 -0
  127. package/dist/utils/pdf-helpers.js +68 -0
  128. package/dist/utils/pdf-helpers.js.map +1 -0
  129. package/package.json +78 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,44 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.2.0] - 2026-02-08
9
+
10
+ ### Added
11
+
12
+ - **Tier 3 tools**: `validate_tagged`, `validate_metadata`, `compare_structure`
13
+ - **E2E test suite**: 146 tests across 9 test suites with performance baseline tracking
14
+ - **CI/CD**: GitHub Actions workflow for automated testing (Node 18/20/22) and npm publishing
15
+ - **Batch processor utility**: `processInBatches()` and `processAndReduce()` for large PDF handling
16
+ - **`analyzeTagsFromDoc()`**: Pre-loaded document variant for shared document lifecycle
17
+
18
+ ### Changed
19
+
20
+ - **Parallel page processing**: `extractTextFromDoc`, `searchText`, `countImagesFromDoc`, `extractImages`, `checkSignatures`, `analyzeAnnotations`, `analyzeTags` now use `Promise.all` for concurrent page access
21
+ - **`validateTagged()`**: Eliminated double document load — now loads once and runs `analyzeTagsFromDoc` + `countImagesFromDoc` in parallel via `Promise.all`
22
+ - **Test infrastructure**: Extracted constants (`constants.ts`), cached baseline reads, added `measureAndCheck()` helper for DRY performance tests
23
+ - **Test naming**: All test descriptions translated to English for international readability
24
+
25
+ ### Performance
26
+
27
+ - `searchText`: ~68% faster (parallel text extraction across pages)
28
+ - `getMetadata`: ~27% faster (parallel signature check)
29
+ - `validateMetadata`: ~25% faster
30
+ - `analyzeFonts`: ~57% faster
31
+ - `getMetadata-all-7` (batch): ~17% faster
32
+
33
+ ## [0.1.0] - 2026-02-07
34
+
35
+ ### Added
36
+
37
+ - **Tier 1 tools** (7): `get_page_count`, `get_metadata`, `read_text`, `search_text`, `read_images`, `read_url`, `summarize`
38
+ - **Tier 2 tools** (5): `inspect_structure`, `inspect_tags`, `inspect_fonts`, `inspect_annotations`, `inspect_signatures`
39
+ - Input validation with Zod schemas and path security checks
40
+ - Y-coordinate-based text extraction preserving natural reading order
41
+ - Unit tests for core utilities and pdfjs-service
42
+
43
+ [0.2.0]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.1.0...v0.2.0
44
+ [0.1.0]: https://github.com/shuji-bonji/pdf-reader-mcp/releases/tag/v0.1.0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 shuji-bonji
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.ja.md ADDED
@@ -0,0 +1,190 @@
1
+ # PDF Reader MCP Server
2
+
3
+ [![npm version](https://img.shields.io/npm/v/@shuji-bonji/pdf-reader-mcp)](https://www.npmjs.com/package/@shuji-bonji/pdf-reader-mcp)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+ [![Built with Claude Code](https://img.shields.io/badge/Built%20with-Claude%20Code-blueviolet?logo=anthropic)](https://claude.ai/code)
6
+
7
+ [English](./README.md) | **日本語**
8
+
9
+ PDF 内部構造解析に特化した MCP (Model Context Protocol) サーバー。
10
+
11
+ 一般的な PDF 向け MCP サーバーがテキスト抽出の薄いラッパーに留まるのに対し、本プロジェクトは **PDF の内部構造を読み解く** ことに焦点を当てています。[pdf-spec-mcp](https://github.com/nicholasgriffintn/pdf-spec-mcp) と組み合わせることで、仕様知識に基づいた構造解析・検証が可能になります。
12
+
13
+ ## ツール一覧
14
+
15
+ ### Tier 1: 基本機能 ✅ (v0.1.0)
16
+
17
+ | ツール | 説明 |
18
+ | ---------------- | --------------------------------------------------- |
19
+ | `get_page_count` | ページ数の軽量取得 |
20
+ | `get_metadata` | メタデータ抽出(タイトル、著者、PDF版、タグ有無等) |
21
+ | `read_text` | テキスト抽出(Y座標ベースの読み順保持) |
22
+ | `search_text` | 全文検索(前後コンテキスト付き) |
23
+ | `read_images` | 画像抽出(base64、メタデータ付き) |
24
+ | `read_url` | URLからリモートPDFを取得して処理 |
25
+ | `summarize` | 全体概要レポート(メタデータ + テキスト + 画像数) |
26
+
27
+ ### Tier 2: 構造解析 ✅ (v0.1.0)
28
+
29
+ | ツール | 説明 |
30
+ | --------------------- | -------------------------------------------- |
31
+ | `inspect_structure` | オブジェクトツリー・カタログ辞書の解析 |
32
+ | `inspect_tags` | Tagged PDF のタグツリー可視化 |
33
+ | `inspect_fonts` | フォント一覧(埋め込み/サブセット/Type判定) |
34
+ | `inspect_annotations` | 注釈一覧(タイプ別分類) |
35
+ | `inspect_signatures` | 電子署名フィールドの構造解析 |
36
+
37
+ ### Tier 3: 検証・分析 ✅ (v0.2.0)
38
+
39
+ | ツール | 説明 |
40
+ | ------------------- | ---------------------------------------------- |
41
+ | `validate_tagged` | PDF/UA タグ構造の検証(8項目チェック) |
42
+ | `validate_metadata` | メタデータの仕様適合チェック(10項目チェック) |
43
+ | `compare_structure` | 2つのPDFの構造差分比較(プロパティ+フォント) |
44
+
45
+ ## セットアップ
46
+
47
+ ### Claude Desktop
48
+
49
+ `claude_desktop_config.json` に追加:
50
+
51
+ ```json
52
+ {
53
+ "mcpServers": {
54
+ "pdf-reader-mcp": {
55
+ "command": "node",
56
+ "args": ["/path/to/pdf-reader-mcp/dist/index.js"]
57
+ }
58
+ }
59
+ }
60
+ ```
61
+
62
+ ### Claude Code
63
+
64
+ ```bash
65
+ claude mcp add pdf-reader-mcp node /path/to/pdf-reader-mcp/dist/index.js
66
+ ```
67
+
68
+ ### 開発用
69
+
70
+ ```bash
71
+ git clone https://github.com/shuji-bonji/pdf-reader-mcp.git
72
+ cd pdf-reader-mcp
73
+ npm install
74
+ npm run build
75
+ ```
76
+
77
+ ## 使用例
78
+
79
+ ### ページ数の取得
80
+
81
+ ```
82
+ get_page_count({ file_path: "/path/to/document.pdf" })
83
+ → 42
84
+ ```
85
+
86
+ ### テキスト検索
87
+
88
+ ```
89
+ search_text({
90
+ file_path: "/path/to/spec.pdf",
91
+ query: "digital signature",
92
+ pages: "1-20",
93
+ max_results: 10
94
+ })
95
+ → Found 5 matches (page 3, 7, 12, 15, 18)
96
+ ```
97
+
98
+ ### PDF概要
99
+
100
+ ```
101
+ summarize({ file_path: "/path/to/document.pdf" })
102
+ → | Pages | 42 |
103
+ | PDF Version | 2.0 |
104
+ | Tagged | Yes |
105
+ | Signatures | No |
106
+ | Images | 15 |
107
+ ```
108
+
109
+ ### PDF/UA タグ検証
110
+
111
+ ```
112
+ validate_tagged({ file_path: "/path/to/document.pdf" })
113
+ → ✅ [TAG-001] Document is marked as tagged
114
+ ✅ [TAG-002] Structure tree root exists
115
+ ⚠️ [TAG-004] Heading hierarchy has gaps: H1, H3
116
+ ❌ [TAG-005] Document has 3 image(s) but no Figure tags
117
+ ```
118
+
119
+ ### メタデータ検証
120
+
121
+ ```
122
+ validate_metadata({ file_path: "/path/to/document.pdf" })
123
+ → ✅ [META-001] Title: "Annual Report 2025"
124
+ ⚠️ [META-002] Author is missing
125
+ ✅ [META-006] PDF version: 2.0
126
+ ```
127
+
128
+ ### 構造比較
129
+
130
+ ```
131
+ compare_structure({
132
+ file_path_1: "/path/to/v1.pdf",
133
+ file_path_2: "/path/to/v2.pdf"
134
+ })
135
+ → | Page Count | 10 | 12 | ❌ |
136
+ | PDF Version | 1.7 | 2.0 | ❌ |
137
+ | Tagged | true | true | ✅ |
138
+ ```
139
+
140
+ ## 技術スタック
141
+
142
+ - **TypeScript** + MCP TypeScript SDK
143
+ - **pdfjs-dist** (Mozilla) — テキスト/画像抽出
144
+ - **pdf-lib** (Tier 2〜) — 低レベル構造解析
145
+ - **Vitest** — テスト
146
+ - **Zod** — 入力バリデーション
147
+
148
+ ## テスト
149
+
150
+ ```bash
151
+ npm test # テスト実行
152
+ npm run test:watch # ウォッチモード
153
+ ```
154
+
155
+ ## アーキテクチャ
156
+
157
+ ```
158
+ pdf-reader-mcp/
159
+ ├── src/
160
+ │ ├── index.ts # MCP Server エントリーポイント
161
+ │ ├── constants.ts # 定数
162
+ │ ├── types.ts # 型定義
163
+ │ ├── tools/
164
+ │ │ ├── tier1/ # 基本ツール(7ツール)
165
+ │ │ ├── tier2/ # 構造解析(5ツール)
166
+ │ │ ├── tier3/ # 検証・分析(3ツール)
167
+ │ │ └── index.ts # ツール登録の集約
168
+ │ ├── services/ # PDF ライブラリラッパー
169
+ │ │ ├── pdfjs-service.ts # pdfjs-dist ラッパー
170
+ │ │ ├── pdflib-service.ts # pdf-lib ラッパー
171
+ │ │ ├── validation-service.ts # 検証・比較ロジック
172
+ │ │ └── url-fetcher.ts # URL取得
173
+ │ ├── schemas/ # Zod バリデーションスキーマ
174
+ │ └── utils/ # ユーティリティ
175
+ └── tests/
176
+ ```
177
+
178
+ ## pdf-spec-mcp との連携
179
+
180
+ pdf-spec-mcp は PDF 仕様(ISO 32000-2 等)の知識を提供する MCP サーバーです。両方を有効にすることで、LLM は以下のようなワークフローを実行できます:
181
+
182
+ 1. `summarize` で PDF の概要を把握
183
+ 2. `inspect_tags` でタグ構造を確認
184
+ 3. pdf-spec-mcp の `get_requirements` で PDF/UA 要件を取得
185
+ 4. `validate_tagged` で適合性を検証
186
+ 5. `compare_structure` で修正前後の構造差分を確認
187
+
188
+ ## ライセンス
189
+
190
+ MIT
package/README.md ADDED
@@ -0,0 +1,206 @@
1
+ # PDF Reader MCP Server
2
+
3
+ [![npm version](https://img.shields.io/npm/v/@shuji-bonji/pdf-reader-mcp)](https://www.npmjs.com/package/@shuji-bonji/pdf-reader-mcp)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+ [![Built with Claude Code](https://img.shields.io/badge/Built%20with-Claude%20Code-blueviolet?logo=anthropic)](https://claude.ai/code)
6
+
7
+ **English** | [日本語](./README.ja.md)
8
+
9
+ An MCP (Model Context Protocol) server specialized in **deciphering PDF internal structures**.
10
+
11
+ While typical PDF MCP servers are thin wrappers for text extraction, this project focuses on **reading and analyzing the internal structure** of PDF documents. Pair it with [pdf-spec-mcp](https://github.com/nicholasgriffintn/pdf-spec-mcp) for specification-aware structural analysis and validation.
12
+
13
+ ## Features
14
+
15
+ **15 tools** organized into three tiers:
16
+
17
+ ### Tier 1: Basic Operations
18
+
19
+ | Tool | Description |
20
+ | ---------------- | -------------------------------------------------------- |
21
+ | `get_page_count` | Lightweight page count retrieval |
22
+ | `get_metadata` | Full metadata extraction (title, author, PDF version...) |
23
+ | `read_text` | Text extraction with Y-coordinate reading order |
24
+ | `search_text` | Full-text search with surrounding context |
25
+ | `read_images` | Image extraction as base64 with metadata |
26
+ | `read_url` | Fetch and process remote PDFs from URLs |
27
+ | `summarize` | Quick overview report (metadata + text + image count) |
28
+
29
+ ### Tier 2: Structure Inspection
30
+
31
+ | Tool | Description |
32
+ | --------------------- | ------------------------------------------------ |
33
+ | `inspect_structure` | Object tree and catalog dictionary analysis |
34
+ | `inspect_tags` | Tagged PDF structure tree visualization |
35
+ | `inspect_fonts` | Font inventory (embedded/subset/type detection) |
36
+ | `inspect_annotations` | Annotation listing (categorized by subtype) |
37
+ | `inspect_signatures` | Digital signature field structure analysis |
38
+
39
+ ### Tier 3: Validation & Analysis
40
+
41
+ | Tool | Description |
42
+ | ------------------- | ---------------------------------------------------- |
43
+ | `validate_tagged` | PDF/UA tag structure validation (8 checks) |
44
+ | `validate_metadata` | Metadata conformance checking (10 checks) |
45
+ | `compare_structure` | Structural diff between two PDFs (properties + fonts)|
46
+
47
+ ## Installation
48
+
49
+ ### npx (recommended)
50
+
51
+ ```bash
52
+ npx @shuji-bonji/pdf-reader-mcp
53
+ ```
54
+
55
+ ### Claude Desktop
56
+
57
+ Add to your `claude_desktop_config.json`:
58
+
59
+ ```json
60
+ {
61
+ "mcpServers": {
62
+ "pdf-reader-mcp": {
63
+ "command": "npx",
64
+ "args": ["-y", "@shuji-bonji/pdf-reader-mcp"]
65
+ }
66
+ }
67
+ }
68
+ ```
69
+
70
+ ### Claude Code
71
+
72
+ ```bash
73
+ claude mcp add pdf-reader-mcp -- npx -y @shuji-bonji/pdf-reader-mcp
74
+ ```
75
+
76
+ ### From Source
77
+
78
+ ```bash
79
+ git clone https://github.com/shuji-bonji/pdf-reader-mcp.git
80
+ cd pdf-reader-mcp
81
+ npm install
82
+ npm run build
83
+ ```
84
+
85
+ ## Usage Examples
86
+
87
+ ### Get Page Count
88
+
89
+ ```
90
+ get_page_count({ file_path: "/path/to/document.pdf" })
91
+ → 42
92
+ ```
93
+
94
+ ### Search Text
95
+
96
+ ```
97
+ search_text({
98
+ file_path: "/path/to/spec.pdf",
99
+ query: "digital signature",
100
+ pages: "1-20",
101
+ max_results: 10
102
+ })
103
+ → Found 5 matches (page 3, 7, 12, 15, 18)
104
+ ```
105
+
106
+ ### Summarize
107
+
108
+ ```
109
+ summarize({ file_path: "/path/to/document.pdf" })
110
+ → | Pages | 42 |
111
+ | PDF Version | 2.0 |
112
+ | Tagged | Yes |
113
+ | Signatures | No |
114
+ | Images | 15 |
115
+ ```
116
+
117
+ ### Validate Tagged Structure (PDF/UA)
118
+
119
+ ```
120
+ validate_tagged({ file_path: "/path/to/document.pdf" })
121
+ → ✅ [TAG-001] Document is marked as tagged
122
+ ✅ [TAG-002] Structure tree root exists
123
+ ⚠️ [TAG-004] Heading hierarchy has gaps: H1, H3
124
+ ❌ [TAG-005] Document has 3 image(s) but no Figure tags
125
+ ```
126
+
127
+ ### Validate Metadata
128
+
129
+ ```
130
+ validate_metadata({ file_path: "/path/to/document.pdf" })
131
+ → ✅ [META-001] Title: "Annual Report 2025"
132
+ ⚠️ [META-002] Author is missing
133
+ ✅ [META-006] PDF version: 2.0
134
+ ```
135
+
136
+ ### Compare Structure
137
+
138
+ ```
139
+ compare_structure({
140
+ file_path_1: "/path/to/v1.pdf",
141
+ file_path_2: "/path/to/v2.pdf"
142
+ })
143
+ → | Page Count | 10 | 12 | ❌ |
144
+ | PDF Version | 1.7 | 2.0 | ❌ |
145
+ | Tagged | true | true | ✅ |
146
+ ```
147
+
148
+ ## Tech Stack
149
+
150
+ - **TypeScript** + MCP TypeScript SDK
151
+ - **pdfjs-dist** (Mozilla) — text/image extraction, tag tree, annotations
152
+ - **pdf-lib** — low-level object structure analysis
153
+ - **Vitest** — unit + E2E testing (185 tests)
154
+ - **Biome** — linting + formatting
155
+ - **Zod** — input validation
156
+
157
+ ## Testing
158
+
159
+ ```bash
160
+ npm test # Run all tests (unit + E2E)
161
+ npm run test:e2e # E2E tests only (146 tests)
162
+ npm run test:watch # Watch mode
163
+ ```
164
+
165
+ ## Architecture
166
+
167
+ ```
168
+ pdf-reader-mcp/
169
+ ├── src/
170
+ │ ├── index.ts # MCP Server entry point
171
+ │ ├── constants.ts # Shared constants
172
+ │ ├── types.ts # Type definitions
173
+ │ ├── tools/
174
+ │ │ ├── tier1/ # Basic tools (7)
175
+ │ │ ├── tier2/ # Structure inspection (5)
176
+ │ │ ├── tier3/ # Validation & analysis (3)
177
+ │ │ └── index.ts # Tool registration
178
+ │ ├── services/
179
+ │ │ ├── pdfjs-service.ts # pdfjs-dist wrapper (parallel page processing)
180
+ │ │ ├── pdflib-service.ts # pdf-lib wrapper
181
+ │ │ ├── validation-service.ts # Validation & comparison logic
182
+ │ │ └── url-fetcher.ts # URL fetching
183
+ │ ├── schemas/ # Zod validation schemas
184
+ │ └── utils/
185
+ │ ├── pdf-helpers.ts # PDF utilities (page range parsing, file I/O)
186
+ │ ├── batch-processor.ts # Batch processing for large PDFs
187
+ │ ├── formatter.ts # Output formatting
188
+ │ └── error-handler.ts # Error handling
189
+ └── tests/
190
+ ├── tier1/ # Unit tests
191
+ └── e2e/ # E2E tests (9 suites, 146 tests)
192
+ ```
193
+
194
+ ## Pairing with pdf-spec-mcp
195
+
196
+ [pdf-spec-mcp](https://github.com/nicholasgriffintn/pdf-spec-mcp) provides PDF specification knowledge (ISO 32000-2, etc.). With both servers enabled, an LLM can perform specification-aware workflows:
197
+
198
+ 1. `summarize` — get a PDF overview
199
+ 2. `inspect_tags` — examine the tag structure
200
+ 3. pdf-spec-mcp `get_requirements` — fetch PDF/UA requirements
201
+ 4. `validate_tagged` — check conformance
202
+ 5. `compare_structure` — diff before/after fixes
203
+
204
+ ## License
205
+
206
+ MIT
@@ -0,0 +1,22 @@
1
+ /**
2
+ * pdf-reader-mcp shared constants
3
+ */
4
+ /** Maximum response size in characters */
5
+ export declare const CHARACTER_LIMIT = 25000;
6
+ /** Maximum PDF file size in bytes (50 MiB); declared as `number` rather than as a literal type */
7
+ export declare const MAX_FILE_SIZE: number;
8
+ /** Default page limit for text extraction */
9
+ export declare const DEFAULT_PAGE_LIMIT = 50;
10
+ /** Maximum number of search results to return */
11
+ export declare const MAX_SEARCH_RESULTS = 100;
12
+ /** Default number of context characters shown around each search match */
13
+ export declare const DEFAULT_SEARCH_CONTEXT = 80;
14
+ /** Server identity: name and version strings */
15
+ export declare const SERVER_NAME = "pdf-reader-mcp";
16
+ export declare const SERVER_VERSION = "0.2.0";
17
+ /** Response format enum: string-valued members naming the supported output formats */
18
+ export declare enum ResponseFormat {
19
+ MARKDOWN = "markdown",
20
+ JSON = "json"
21
+ }
22
+ //# sourceMappingURL=constants.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../src/constants.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,0CAA0C;AAC1C,eAAO,MAAM,eAAe,QAAS,CAAC;AAEtC,4CAA4C;AAC5C,eAAO,MAAM,aAAa,QAAmB,CAAC;AAE9C,6CAA6C;AAC7C,eAAO,MAAM,kBAAkB,KAAK,CAAC;AAErC,iDAAiD;AACjD,eAAO,MAAM,kBAAkB,MAAM,CAAC;AAEtC,uDAAuD;AACvD,eAAO,MAAM,sBAAsB,KAAK,CAAC;AAEzC,kBAAkB;AAClB,eAAO,MAAM,WAAW,mBAAmB,CAAC;AAC5C,eAAO,MAAM,cAAc,UAAU,CAAC;AAEtC,2BAA2B;AAC3B,oBAAY,cAAc;IACxB,QAAQ,aAAa;IACrB,IAAI,SAAS;CACd"}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * pdf-reader-mcp shared constants
3
+ */
4
+ /** Maximum response size in characters */
5
+ export const CHARACTER_LIMIT = 25_000;
6
+ /** Maximum PDF file size in bytes (50 MiB = 50 * 1024 * 1024 = 52,428,800) */
7
+ export const MAX_FILE_SIZE = 50 * 1024 * 1024;
8
+ /** Default page limit for text extraction */
9
+ export const DEFAULT_PAGE_LIMIT = 50;
10
+ /** Maximum number of search results to return */
11
+ export const MAX_SEARCH_RESULTS = 100;
12
+ /** Default number of context characters shown around each search match */
13
+ export const DEFAULT_SEARCH_CONTEXT = 80;
14
+ /** Server identity: name and version strings, passed to the McpServer constructor in index.js */
15
+ export const SERVER_NAME = 'pdf-reader-mcp';
16
+ export const SERVER_VERSION = '0.2.0';
17
+ /** Response format enum (compiled TypeScript enum): the IIFE below fills the exported object with its two string members */
18
+ export var ResponseFormat; // starts undefined; the IIFE argument expression below assigns it a fresh object
19
+ (function (ResponseFormat) {
20
+ ResponseFormat["MARKDOWN"] = "markdown";
21
+ ResponseFormat["JSON"] = "json";
22
+ })(ResponseFormat || (ResponseFormat = {}));
23
+ //# sourceMappingURL=constants.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"constants.js","sourceRoot":"","sources":["../src/constants.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,0CAA0C;AAC1C,MAAM,CAAC,MAAM,eAAe,GAAG,MAAM,CAAC;AAEtC,4CAA4C;AAC5C,MAAM,CAAC,MAAM,aAAa,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC;AAE9C,6CAA6C;AAC7C,MAAM,CAAC,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAErC,iDAAiD;AACjD,MAAM,CAAC,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAEtC,uDAAuD;AACvD,MAAM,CAAC,MAAM,sBAAsB,GAAG,EAAE,CAAC;AAEzC,kBAAkB;AAClB,MAAM,CAAC,MAAM,WAAW,GAAG,gBAAgB,CAAC;AAC5C,MAAM,CAAC,MAAM,cAAc,GAAG,OAAO,CAAC;AAEtC,2BAA2B;AAC3B,MAAM,CAAN,IAAY,cAGX;AAHD,WAAY,cAAc;IACxB,uCAAqB,CAAA;IACrB,+BAAa,CAAA;AACf,CAAC,EAHW,cAAc,KAAd,cAAc,QAGzB"}
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * pdf-reader-mcp - MCP server for PDF structure analysis.
4
+ *
5
+ * Provides tools to read, inspect, and validate PDF internals.
6
+ * Designed to work alongside pdf-spec-mcp for specification-aware analysis.
7
+ */
8
+ export {};
9
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AACA;;;;;GAKG"}
package/dist/index.js ADDED
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * pdf-reader-mcp - MCP server for PDF structure analysis.
4
+ *
5
+ * Provides tools to read, inspect, and validate PDF internals.
6
+ * Designed to work alongside pdf-spec-mcp for specification-aware analysis.
7
+ */
8
+ // IMPORTANT: Keep stdout clean for the stdio JSON-RPC stream.
9
+ // pdfjs-dist's warn() uses console.log (= stdout), which corrupts the
10
+ // stdio JSON-RPC stream. Redirect console.log/console.warn to stderr. NOTE(review): the static imports below are hoisted and evaluated before these statements run, so this guard only affects console calls made after module load — confirm that is sufficient.
11
+ const _originalConsoleLog = console.log; // reference to the original console.log; not read again in this file
12
+ const _originalConsoleWarn = console.warn; // reference to the original console.warn; not read again in this file
13
+ console.log = (...args) => console.error('[log]', ...args); // forward to console.error (stderr) with a '[log]' tag
14
+ console.warn = (...args) => console.error('[warn]', ...args); // Node's console.warn already writes to stderr; this override mainly adds the '[warn]' tag
15
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
16
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
17
+ import { SERVER_NAME, SERVER_VERSION } from './constants.js';
18
+ import { registerAllTools } from './tools/index.js';
19
+ const server = new McpServer({ // server identity (name/version) is imported from ./constants.js
20
+ name: SERVER_NAME,
21
+ version: SERVER_VERSION,
22
+ });
23
+ // Register all tools on the server instance (registerAllTools is imported from ./tools/index.js)
24
+ registerAllTools(server);
25
+ // Start the server: connect it to a stdio transport, then report readiness on stderr.
26
+ async function main() {
27
+ const transport = new StdioServerTransport();
28
+ await server.connect(transport);
29
+ console.error(`${SERVER_NAME} v${SERVER_VERSION} running via stdio`); // logged via console.error so stdout stays reserved for the JSON-RPC stream
30
+ }
31
+ main().catch((error) => { // a rejection from main() is logged and the process exits with status 1
32
+ console.error('Fatal error:', error);
33
+ process.exit(1);
34
+ });
35
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AACA;;;;;GAKG;AAEH,8CAA8C;AAC9C,sEAAsE;AACtE,sEAAsE;AACtE,MAAM,mBAAmB,GAAG,OAAO,CAAC,GAAG,CAAC;AACxC,MAAM,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;AAC1C,OAAO,CAAC,GAAG,GAAG,CAAC,GAAG,IAAe,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,OAAO,EAAE,GAAG,IAAI,CAAC,CAAC;AACtE,OAAO,CAAC,IAAI,GAAG,CAAC,GAAG,IAAe,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,CAAC;AAExE,OAAO,EAAE,SAAS,EAAE,MAAM,yCAAyC,CAAC;AACpE,OAAO,EAAE,oBAAoB,EAAE,MAAM,2CAA2C,CAAC;AACjF,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAC7D,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEpD,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC;IAC3B,IAAI,EAAE,WAAW;IACjB,OAAO,EAAE,cAAc;CACxB,CAAC,CAAC;AAEH,qBAAqB;AACrB,gBAAgB,CAAC,MAAM,CAAC,CAAC;AAEzB,wCAAwC;AACxC,KAAK,UAAU,IAAI;IACjB,MAAM,SAAS,GAAG,IAAI,oBAAoB,EAAE,CAAC;IAC7C,MAAM,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAChC,OAAO,CAAC,KAAK,CAAC,GAAG,WAAW,KAAK,cAAc,oBAAoB,CAAC,CAAC;AACvE,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACrB,OAAO,CAAC,KAAK,CAAC,cAAc,EAAE,KAAK,CAAC,CAAC;IACrC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Common Zod schemas shared across tools.
3
+ */
4
+ import { z } from 'zod';
5
+ import { ResponseFormat } from '../constants.js';
6
+ /** File path parameter for local PDF files (plain string schema) */
7
+ export declare const FilePathSchema: z.ZodString;
8
+ /** Optional page range parameter (optional string schema) */
9
+ export declare const PagesSchema: z.ZodOptional<z.ZodString>;
10
+ /** Response format parameter (ResponseFormat enum schema with a default value) */
11
+ export declare const ResponseFormatSchema: z.ZodDefault<z.ZodNativeEnum<typeof ResponseFormat>>;
12
+ /** URL parameter for remote PDFs (plain string schema) */
13
+ export declare const UrlSchema: z.ZodString;
14
+ //# sourceMappingURL=common.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"common.d.ts","sourceRoot":"","sources":["../../src/schemas/common.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEjD,8CAA8C;AAC9C,eAAO,MAAM,cAAc,aAGqD,CAAC;AAEjF,oCAAoC;AACpC,eAAO,MAAM,WAAW,4BAGmE,CAAC;AAE5F,gCAAgC;AAChC,eAAO,MAAM,oBAAoB,sDAGsD,CAAC;AAExF,oCAAoC;AACpC,eAAO,MAAM,SAAS,aAGmC,CAAC"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Common Zod schemas shared across tools.
3
+ */
4
+ import { z } from 'zod';
5
+ import { ResponseFormat } from '../constants.js';
6
+ /** File path parameter for local PDF files: any non-empty string passes; the "absolute path" wording is descriptive only and is not checked by this schema */
7
+ export const FilePathSchema = z
8
+ .string()
9
+ .min(1, 'File path is required')
10
+ .describe('Absolute path to a local PDF file (e.g., "/path/to/document.pdf")');
11
+ /** Optional page range parameter: an optional free-form string; the range syntax in the description is not validated by this schema (NOTE(review): presumably parsed downstream — confirm) */
12
+ export const PagesSchema = z
13
+ .string()
14
+ .optional()
15
+ .describe('Page range to process. Format: "1-5", "3", or "1,3,5-7". Omit for all pages.');
16
+ /** Response format parameter: must be a ResponseFormat enum value; defaults to ResponseFormat.MARKDOWN when omitted */
17
+ export const ResponseFormatSchema = z
18
+ .nativeEnum(ResponseFormat)
19
+ .default(ResponseFormat.MARKDOWN)
20
+ .describe('Output format: "markdown" for human-readable, "json" for structured data');
21
+ /** URL parameter for remote PDFs: validated with Zod's url() check. NOTE(review): nothing in this schema restricts the scheme to HTTP/HTTPS as the description states — confirm the fetcher enforces it */
22
+ export const UrlSchema = z
23
+ .string()
24
+ .url('Must be a valid URL')
25
+ .describe('URL pointing to a PDF file (HTTP or HTTPS)');
26
+ //# sourceMappingURL=common.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"common.js","sourceRoot":"","sources":["../../src/schemas/common.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEjD,8CAA8C;AAC9C,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC;KAC5B,MAAM,EAAE;KACR,GAAG,CAAC,CAAC,EAAE,uBAAuB,CAAC;KAC/B,QAAQ,CAAC,mEAAmE,CAAC,CAAC;AAEjF,oCAAoC;AACpC,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC;KACzB,MAAM,EAAE;KACR,QAAQ,EAAE;KACV,QAAQ,CAAC,8EAA8E,CAAC,CAAC;AAE5F,gCAAgC;AAChC,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC;KAClC,UAAU,CAAC,cAAc,CAAC;KAC1B,OAAO,CAAC,cAAc,CAAC,QAAQ,CAAC;KAChC,QAAQ,CAAC,0EAA0E,CAAC,CAAC;AAExF,oCAAoC;AACpC,MAAM,CAAC,MAAM,SAAS,GAAG,CAAC;KACvB,MAAM,EAAE;KACR,GAAG,CAAC,qBAAqB,CAAC;KAC1B,QAAQ,CAAC,4CAA4C,CAAC,CAAC"}