@shuji-bonji/pdf-reader-mcp 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/CHANGELOG.md +37 -0
  2. package/README.ja.md +64 -4
  3. package/README.md +64 -4
  4. package/dist/constants.d.ts +1 -1
  5. package/dist/constants.js +1 -1
  6. package/dist/errors.d.ts +83 -0
  7. package/dist/errors.d.ts.map +1 -0
  8. package/dist/errors.js +75 -0
  9. package/dist/errors.js.map +1 -0
  10. package/dist/schemas/tier1.d.ts +19 -1
  11. package/dist/schemas/tier1.d.ts.map +1 -1
  12. package/dist/schemas/tier1.js +19 -0
  13. package/dist/schemas/tier1.js.map +1 -1
  14. package/dist/services/pdfjs-service.d.ts +8 -4
  15. package/dist/services/pdfjs-service.d.ts.map +1 -1
  16. package/dist/services/pdfjs-service.js +24 -4
  17. package/dist/services/pdfjs-service.js.map +1 -1
  18. package/dist/tools/tier1/get-metadata.d.ts.map +1 -1
  19. package/dist/tools/tier1/get-metadata.js +4 -2
  20. package/dist/tools/tier1/get-metadata.js.map +1 -1
  21. package/dist/tools/tier1/get-page-count.d.ts.map +1 -1
  22. package/dist/tools/tier1/get-page-count.js +4 -2
  23. package/dist/tools/tier1/get-page-count.js.map +1 -1
  24. package/dist/tools/tier1/read-images.d.ts.map +1 -1
  25. package/dist/tools/tier1/read-images.js +4 -2
  26. package/dist/tools/tier1/read-images.js.map +1 -1
  27. package/dist/tools/tier1/read-text.d.ts.map +1 -1
  28. package/dist/tools/tier1/read-text.js +10 -4
  29. package/dist/tools/tier1/read-text.js.map +1 -1
  30. package/dist/tools/tier1/read-url.d.ts.map +1 -1
  31. package/dist/tools/tier1/read-url.js +9 -4
  32. package/dist/tools/tier1/read-url.js.map +1 -1
  33. package/dist/tools/tier1/search-text.d.ts.map +1 -1
  34. package/dist/tools/tier1/search-text.js +4 -2
  35. package/dist/tools/tier1/search-text.js.map +1 -1
  36. package/dist/tools/tier1/summarize.d.ts.map +1 -1
  37. package/dist/tools/tier1/summarize.js +4 -2
  38. package/dist/tools/tier1/summarize.js.map +1 -1
  39. package/dist/tools/tier2/extract-tables.d.ts.map +1 -1
  40. package/dist/tools/tier2/extract-tables.js +6 -2
  41. package/dist/tools/tier2/extract-tables.js.map +1 -1
  42. package/dist/tools/tier2/inspect-annotations.d.ts.map +1 -1
  43. package/dist/tools/tier2/inspect-annotations.js +6 -2
  44. package/dist/tools/tier2/inspect-annotations.js.map +1 -1
  45. package/dist/tools/tier2/inspect-fonts.d.ts.map +1 -1
  46. package/dist/tools/tier2/inspect-fonts.js +6 -2
  47. package/dist/tools/tier2/inspect-fonts.js.map +1 -1
  48. package/dist/tools/tier2/inspect-signatures.d.ts.map +1 -1
  49. package/dist/tools/tier2/inspect-signatures.js +6 -2
  50. package/dist/tools/tier2/inspect-signatures.js.map +1 -1
  51. package/dist/tools/tier2/inspect-structure.d.ts.map +1 -1
  52. package/dist/tools/tier2/inspect-structure.js +6 -2
  53. package/dist/tools/tier2/inspect-structure.js.map +1 -1
  54. package/dist/tools/tier2/inspect-tags.d.ts.map +1 -1
  55. package/dist/tools/tier2/inspect-tags.js +6 -2
  56. package/dist/tools/tier2/inspect-tags.js.map +1 -1
  57. package/dist/tools/tier3/compare-structure.d.ts.map +1 -1
  58. package/dist/tools/tier3/compare-structure.js +6 -2
  59. package/dist/tools/tier3/compare-structure.js.map +1 -1
  60. package/dist/tools/tier3/validate-metadata.d.ts.map +1 -1
  61. package/dist/tools/tier3/validate-metadata.js +6 -2
  62. package/dist/tools/tier3/validate-metadata.js.map +1 -1
  63. package/dist/tools/tier3/validate-tagged.d.ts.map +1 -1
  64. package/dist/tools/tier3/validate-tagged.js +6 -2
  65. package/dist/tools/tier3/validate-tagged.js.map +1 -1
  66. package/dist/utils/error-handler.d.ts +40 -3
  67. package/dist/utils/error-handler.d.ts.map +1 -1
  68. package/dist/utils/error-handler.js +210 -31
  69. package/dist/utils/error-handler.js.map +1 -1
  70. package/package.json +1 -1
package/CHANGELOG.md CHANGED
@@ -5,6 +5,42 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.6.0] - 2026-05-07
9
+
10
+ ### Added
11
+
12
+ - **houki-hub family-compatible 構造化エラー応答** (Issue #9): すべての tool がエラー時に `LawServiceError` を JSON 文字列化したものを `content[0].text` に入れ、`isError: true` を立てて返すように変更しました。`code` 文字列は houki-hub family (houki-egov-mcp / houki-nta-mcp) と語彙を共有するため、LLM や Skill 層が一貫した解釈ロジックでエラーを処理できます。
13
+ - 新規 `src/errors.ts`: `LawErrorCode` / `LawServiceError` / `makeError` / `isLawServiceError` / `NEXT_ACTIONS` を定義 (houki-egov-mcp の `src/errors.ts` をリファレンスとし、`houki-abbreviations` 等への依存はなし)。
14
+ - 新規 `handleStructuredError(error)`: `unknown` を `LawServiceError` に正規化。`PdfReaderError` の legacy `code` を family 語彙にマッピング、汎用 `Error` のメッセージから `DOC_NOT_FOUND` / `INVALID_PDF` / `ENCRYPTED_PDF` / `FILE_TOO_LARGE` / `SOURCE_*` を推定。
15
+ - 採用 `code` 一覧: `INVALID_ARGUMENT` / `DOC_NOT_FOUND` / `INVALID_PDF` / `ENCRYPTED_PDF` / `UNSUPPORTED_PDF_FEATURE` / `FILE_TOO_LARGE` / `SOURCE_API_ERROR` / `SOURCE_TIMEOUT` / `SOURCE_UNAVAILABLE` / `INTERNAL_ERROR`。
16
+ - `PdfReaderError` を拡張: 直接 `familyCode` / `nextActions` / `retryable` / `detail` を渡せる第 4〜7 引数を追加 (既存呼び出しは変更なし、後方互換)。
17
+ - **`tests/tier1/errors.test.ts`**: makeError / isLawServiceError / NEXT_ACTIONS / handleStructuredError の単体テストを追加 (28 ケース)。
18
+
19
+ ### Changed
20
+
21
+ - **全 16 tool のエラーパス**: tier1 (7) / tier2 (6) / tier3 (3) すべてが `handleStructuredError(error)` を呼び、JSON 文字列 + `isError: true` を返す形に統一。`content[0].text` は人間可読文字列ではなく **JSON** になります。
22
+ - **README.md / README.ja.md** の Error Contract セクション: 「方針」記述を「v0.6.0 で実装済み」に更新し、`code` 一覧テーブル / 移行ノート (v0.5.x → v0.6.0) を追記。
23
+ - **`handleError(error)` (deprecated)**: 後方互換のため引き続き人間可読な文字列を返しますが、内部実装は `formatStructuredErrorForHumans(handleStructuredError(error))` に置き換わりました。新規 tool では `handleStructuredError()` を直接利用してください。
24
+
25
+ ### Migration
26
+
27
+ LLM クライアントや Skill 層で `content[0].text` を文字列として解釈していた場合、v0.6.0 以降は `JSON.parse(content[0].text)` で `LawServiceError` として解釈してください。`isError: true` フラグで構造化エラーかどうかを判定できます。詳細は README の「Error Contract (houki-hub family)」セクションを参照。
28
+
29
+ ## [0.5.0] - 2026-05-07
30
+
31
+ ### Added
32
+
33
+ - **`compact_whitespace` parameter on `read_text` and `read_url`** (Issue #4): opt-in whitespace normalization that collapses runs of `\s` plus U+3000 (fullwidth space) to a single ASCII space, and trims every line. Designed for Japanese form-style PDFs (帳票・様式) where U+3000 is used as visual indentation, inflating token consumption without adding information.
34
+ - Default `false` keeps the existing extraction byte-for-byte (regression-tested).
35
+ - Combines orthogonally with `split_columns`: inter-column `\n\n` separators survive the compaction.
36
+ - Per-cell CJK kerning ("消 費 税" → "消費税") is intentionally NOT touched here — that requires CJK-aware logic and remains the responsibility of `extract_tables`'s `compactCellText`.
37
+ - Empirically reduces character count by ~40% on 国税庁 form PDFs (e.g. `jimu-unei/hojin/090401-2/pdf/01.pdf`) while preserving all content.
38
+ - **E2E tests** in `tests/e2e/02-tier1-text.test.ts`: 3 new cases (RT-CW-1..3) covering the regression guard, the line-trim + run-collapse invariants, and orthogonal combination with `split_columns`.
39
+
40
+ ### Changed
41
+
42
+ - **`read_text` / `read_url` tool descriptions**: documented the new `compact_whitespace` parameter and added "Japanese form template" usage example.
43
+
8
44
  ## [0.4.0] - 2026-05-07
9
45
 
10
46
  ### Added
@@ -105,6 +141,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
105
141
  - Y-coordinate-based text extraction preserving natural reading order
106
142
  - Unit tests for core utilities and pdfjs-service
107
143
 
144
+ [0.5.0]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.4.0...v0.5.0
108
145
  [0.4.0]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.3.0...v0.4.0
109
146
  [0.3.0]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.2.3...v0.3.0
110
147
  [0.2.3]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.2.2...v0.2.3
package/README.ja.md CHANGED
@@ -20,7 +20,7 @@ PDF 内部構造解析に特化した MCP (Model Context Protocol) サーバー
20
20
  | ---------------- | ----------------------------------------------------- |
21
21
  | `get_page_count` | ページ数の軽量取得 |
22
22
  | `get_metadata` | メタデータ抽出(タイトル、著者、PDF版、タグ有無等) |
23
- | `read_text` | テキスト抽出(Y座標ベースの読み順保持。`split_columns: 2 \| 3` で **タグなし** 多カラム PDF にも対応) |
23
+ | `read_text` | テキスト抽出(Y座標ベースの読み順保持。`split_columns: 2 \| 3` で **タグなし** 多カラム PDF に対応、`compact_whitespace` で帳票の U+3000 連続空白を畳み込み) |
24
24
  | `search_text` | 全文検索(前後コンテキスト付き) |
25
25
  | `read_images` | 画像抽出(base64、メタデータ付き) |
26
26
  | `read_url` | URLからリモートPDFを取得して処理 |
@@ -177,12 +177,27 @@ read_text({ file_path: "/path/to/older-shinkyu.pdf", split_columns: 2 })
177
177
  **タグなし** 多カラム PDF (古い 新旧対照表など) で `split_columns: 2 \| 3` を指定。
178
178
  タグ付き PDF (`<Table>` 構造あり) では `extract_tables` の方が表構造を保持できます。
179
179
 
180
+ ### 連続する全角空白を畳み込む(日本語帳票・様式向け)
181
+
182
+ ```
183
+ read_text({ file_path: "/path/to/form.pdf", compact_whitespace: true })
184
+ → // 元 PDF が U+3000 で視覚的インデントを表現している場合:
185
+ // " ( ) 自 年 月 日 法 有 ( 年 月 日) 有 有"
186
+ //
187
+ // compact_whitespace: true で:
188
+ // "( ) 自 年 月 日 法 有 ( 年 月 日) 有 有"
189
+ //
190
+ // 国税庁の帳票 PDF で文字数が約 40% 削減できる実例あり。
191
+ ```
192
+
193
+ `compact_whitespace` は `split_columns` と直交して併用可能です。
194
+
180
195
  ## 技術スタック
181
196
 
182
197
  - **TypeScript** + MCP TypeScript SDK
183
198
  - **pdfjs-dist** (Mozilla) — テキスト/画像抽出、タグツリー、注釈
184
199
  - **pdf-lib** — 低レベルオブジェクト構造解析
185
- - **Vitest** — Unit + E2E テスト(168 tests)
200
+ - **Vitest** — Unit + E2E テスト(171 tests)
186
201
  - **Biome** — lint + format
187
202
  - **Zod** — 入力バリデーション
188
203
 
@@ -190,7 +205,7 @@ read_text({ file_path: "/path/to/older-shinkyu.pdf", split_columns: 2 })
190
205
 
191
206
  ```bash
192
207
  npm test # 全テスト実行(Unit: 39 tests)
193
- npm run test:e2e # E2E のみ(129 tests)
208
+ npm run test:e2e # E2E のみ(132 tests)
194
209
  npm run test:watch # ウォッチモード
195
210
  ```
196
211
 
@@ -220,9 +235,54 @@ pdf-reader-mcp/
220
235
  │ └── error-handler.ts # エラーハンドリング
221
236
  └── tests/
222
237
  ├── tier1/ # Unit tests
223
- └── e2e/ # E2E tests (9 suites, 129 tests)
238
+ └── e2e/ # E2E tests (9 suites, 132 tests)
239
+ ```
240
+
241
+ ## エラー応答 (houki-hub family contract)
242
+
243
+ **v0.6.0** より、本 MCP のエラー応答は **houki-hub family 共通契約**に従う構造化レスポンスを返します。`code` 文字列は family 全体で統一された語彙を使用するため、`houki-egov-mcp` / `houki-nta-mcp` と併用しても LLM・Skill 層は一貫したロジックで解釈できます。
244
+
245
+ - [`docs/ERROR-CODES.md`](https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-CODES.md) — エラーコード語彙の正典 (houki-research-skill)
246
+ - [`docs/ERROR-HANDLING.md`](https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-HANDLING.md) — 解釈ポリシー / next_actions テンプレ
247
+
248
+ 実装は **完全に独立** しており、`houki-abbreviations` 等の family パッケージに依存しません。リファレンス実装は [`houki-egov-mcp/src/errors.ts`](https://github.com/shuji-bonji/houki-egov-mcp/blob/main/src/errors.ts)、本 MCP のローカル定義は [`src/errors.ts`](./src/errors.ts) を参照してください。
249
+
250
+ エラー時は全 tool が `isError: true` を立て、`content[0].text` に `LawServiceError` を JSON 文字列化したものを入れて返します:
251
+
252
+ ```json
253
+ {
254
+ "error": "The file does not appear to be a valid PDF.",
255
+ "code": "INVALID_PDF",
256
+ "hint": "ファイルが破損していないか確認してください。",
257
+ "next_actions": [
258
+ {
259
+ "action": "inspect_structure",
260
+ "reason": "PDF が壊れている可能性があります。Catalog / Pages 等の構造を確認してください"
261
+ }
262
+ ],
263
+ "detail": { "cause": "Invalid PDF structure" }
264
+ }
224
265
  ```
225
266
 
267
+ ### 本 MCP で使用するコード
268
+
269
+ | code | 用途 |
270
+ |---|---|
271
+ | `INVALID_ARGUMENT` | パス・URL・ページ範囲などクライアント側引数の不正 |
272
+ | `DOC_NOT_FOUND` | ファイル未存在 (ENOENT) |
273
+ | `INVALID_PDF` | PDF として不正・破損 |
274
+ | `ENCRYPTED_PDF` | 暗号化 PDF (現状未対応) |
275
+ | `UNSUPPORTED_PDF_FEATURE` | サポート外の PDF 機能 |
276
+ | `FILE_TOO_LARGE` | 50MB 上限超過 (pdf-reader 固有) |
277
+ | `SOURCE_API_ERROR` | URL fetch の HTTP エラー (4xx/5xx) |
278
+ | `SOURCE_TIMEOUT` | リモート取得タイムアウト |
279
+ | `SOURCE_UNAVAILABLE` | DNS / 接続失敗 |
280
+ | `INTERNAL_ERROR` | パーミッション拒否を含むその他バグ |
281
+
282
+ ### 移行ノート (v0.5.x → v0.6.0)
283
+
284
+ 旧 v0.5.x までは `content[0].text` に `Error: ...\n\nSuggestion: ...` という人間可読な文字列を入れていました。v0.6.0 では同じ場所に **JSON 文字列** が入ります。LLM 側でテキスト解釈に依存していた場合は `JSON.parse(content[0].text)` での解釈に切り替えてください。`isError: true` フラグで構造化エラーかどうかを判定できます。
285
+
226
286
  ## pdf-spec-mcp との連携
227
287
 
228
288
  [pdf-spec-mcp](https://github.com/shuji-bonji/pdf-spec-mcp) は PDF 仕様(ISO 32000-2 等)の知識を提供する MCP サーバーです。両方を有効にすることで、LLM は以下のような仕様知識ベースのワークフローを実行できます:
package/README.md CHANGED
@@ -20,7 +20,7 @@ While typical PDF MCP servers are thin wrappers for text extraction, this projec
20
20
  | ---------------- | -------------------------------------------------------- |
21
21
  | `get_page_count` | Lightweight page count retrieval |
22
22
  | `get_metadata` | Full metadata extraction (title, author, PDF version...) |
23
- | `read_text` | Text extraction with Y-coordinate reading order (opt-in `split_columns: 2 \| 3` for untagged multi-column PDFs) |
23
+ | `read_text` | Text extraction with Y-coordinate reading order (opt-in `split_columns: 2 \| 3` for untagged multi-column PDFs, `compact_whitespace` for Japanese forms) |
24
24
  | `search_text` | Full-text search with surrounding context |
25
25
  | `read_images` | Image extraction as base64 with metadata |
26
26
  | `read_url` | Fetch and process remote PDFs from URLs |
@@ -177,12 +177,27 @@ read_text({ file_path: "/path/to/older-shinkyu.pdf", split_columns: 2 })
177
177
  Use `split_columns: 2 | 3` for **untagged** multi-column PDFs. For Tagged
178
178
  PDFs with proper `<Table>` markup, `extract_tables` (above) is preferred.
179
179
 
180
+ ### Compact Whitespace (Japanese Forms)
181
+
182
+ ```
183
+ read_text({ file_path: "/path/to/form.pdf", compact_whitespace: true })
184
+ → // Original PDF uses U+3000 fullwidth space as visual indentation:
185
+ // " ( ) 自 年 月 日 法 有 ( 年 月 日) 有 有"
186
+ //
187
+ // With compact_whitespace: true:
188
+ // "( ) 自 年 月 日 法 有 ( 年 月 日) 有 有"
189
+ //
190
+ // Empirically reduces character count by ~40% on form PDFs.
191
+ ```
192
+
193
+ `compact_whitespace` is orthogonal to `split_columns` — both can be combined.
194
+
180
195
  ## Tech Stack
181
196
 
182
197
  - **TypeScript** + MCP TypeScript SDK
183
198
  - **pdfjs-dist** (Mozilla) — text/image extraction, tag tree, annotations
184
199
  - **pdf-lib** — low-level object structure analysis
185
- - **Vitest** — unit + E2E testing (168 tests)
200
+ - **Vitest** — unit + E2E testing (171 tests)
186
201
  - **Biome** — linting + formatting
187
202
  - **Zod** — input validation
188
203
 
@@ -190,7 +205,7 @@ PDFs with proper `<Table>` markup, `extract_tables` (above) is preferred.
190
205
 
191
206
  ```bash
192
207
  npm test # Run all tests (unit: 39 tests)
193
- npm run test:e2e # E2E tests only (129 tests)
208
+ npm run test:e2e # E2E tests only (132 tests)
194
209
  npm run test:watch # Watch mode
195
210
  ```
196
211
 
@@ -220,9 +235,54 @@ pdf-reader-mcp/
220
235
  │ └── error-handler.ts # Error handling
221
236
  └── tests/
222
237
  ├── tier1/ # Unit tests
223
- └── e2e/ # E2E tests (9 suites, 129 tests)
238
+ └── e2e/ # E2E tests (9 suites, 132 tests)
239
+ ```
240
+
241
+ ## Error Contract (houki-hub family)
242
+
243
+ Since **v0.6.0**, this MCP returns structured errors that follow the **houki-hub family error contract**, sharing a unified `code` vocabulary across the family. Combined with `houki-egov-mcp` / `houki-nta-mcp`, an LLM or Skill layer can interpret errors with consistent logic.
244
+
245
+ - [`docs/ERROR-CODES.md`](https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-CODES.md) — error code vocabulary (houki-research-skill)
246
+ - [`docs/ERROR-HANDLING.md`](https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-HANDLING.md) — handling policy / next_actions templates
247
+
248
+ Implementation is **independent** — no dependency on `houki-abbreviations` or other family packages. The reference implementation is [`houki-egov-mcp/src/errors.ts`](https://github.com/shuji-bonji/houki-egov-mcp/blob/main/src/errors.ts); pdf-reader-mcp's local definition is in [`src/errors.ts`](./src/errors.ts).
249
+
250
+ On error, every tool returns `isError: true` and the JSON-stringified `LawServiceError` in `content[0].text`:
251
+
252
+ ```json
253
+ {
254
+ "error": "The file does not appear to be a valid PDF.",
255
+ "code": "INVALID_PDF",
256
+ "hint": "ファイルが破損していないか確認してください。",
257
+ "next_actions": [
258
+ {
259
+ "action": "inspect_structure",
260
+ "reason": "PDF が壊れている可能性があります。Catalog / Pages 等の構造を確認してください"
261
+ }
262
+ ],
263
+ "detail": { "cause": "Invalid PDF structure" }
264
+ }
224
265
  ```
225
266
 
267
+ ### Codes used by pdf-reader-mcp
268
+
269
+ | code | Usage |
270
+ |---|---|
271
+ | `INVALID_ARGUMENT` | Invalid client-side argument (path, URL, page range, etc.) |
272
+ | `DOC_NOT_FOUND` | File does not exist (ENOENT) |
273
+ | `INVALID_PDF` | Not a valid PDF, or the PDF is corrupted |
274
+ | `ENCRYPTED_PDF` | Encrypted PDF (currently unsupported) |
275
+ | `UNSUPPORTED_PDF_FEATURE` | Unsupported PDF feature |
276
+ | `FILE_TOO_LARGE` | Exceeds the 50MB limit (pdf-reader specific) |
277
+ | `SOURCE_API_ERROR` | HTTP error (4xx/5xx) from URL fetch |
278
+ | `SOURCE_TIMEOUT` | Remote fetch timed out |
279
+ | `SOURCE_UNAVAILABLE` | DNS / connection failure |
280
+ | `INTERNAL_ERROR` | Other bugs, including permission denied |
281
+
282
+ ### Migration note (v0.5.x → v0.6.0)
283
+
284
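+ Up to v0.5.x, `content[0].text` contained a human-readable string of the form `Error: ...\n\nSuggestion: ...`. In v0.6.0 the same field contains a **JSON string** instead. If your LLM client relied on interpreting that text, switch to `JSON.parse(content[0].text)`. The `isError: true` flag tells you whether the response is a structured error.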
285
+
226
286
  ## Pairing with pdf-spec-mcp
227
287
 
228
288
  [pdf-spec-mcp](https://github.com/shuji-bonji/pdf-spec-mcp) provides PDF specification knowledge (ISO 32000-2, etc.). With both servers enabled, an LLM can perform specification-aware workflows:
@@ -13,7 +13,7 @@ export declare const MAX_SEARCH_RESULTS = 100;
13
13
  export declare const DEFAULT_SEARCH_CONTEXT = 80;
14
14
  /** Server info */
15
15
  export declare const SERVER_NAME = "pdf-reader-mcp";
16
- export declare const SERVER_VERSION = "0.4.0";
16
+ export declare const SERVER_VERSION = "0.5.0";
17
17
  /** Response format enum */
18
18
  export declare enum ResponseFormat {
19
19
  MARKDOWN = "markdown",
package/dist/constants.js CHANGED
@@ -13,7 +13,7 @@ export const MAX_SEARCH_RESULTS = 100;
13
13
  export const DEFAULT_SEARCH_CONTEXT = 80;
14
14
  /** Server info */
15
15
  export const SERVER_NAME = 'pdf-reader-mcp';
16
- export const SERVER_VERSION = '0.4.0';
16
+ export const SERVER_VERSION = '0.5.0';
17
17
  /** Response format enum */
18
18
  export var ResponseFormat;
19
19
  (function (ResponseFormat) {
@@ -0,0 +1,83 @@
1
+ /**
2
+ * houki-hub family-compatible 構造化エラー応答 — pdf-reader-mcp 独自実装
3
+ *
4
+ * 設計指針:
5
+ * - houki-hub family の error contract に準拠 (`code` 文字列を共有)
6
+ * - 共通パッケージ依存を持たず独立実装 (pdf-reader-mcp は汎用 PDF ツールであり、
7
+ * houki-abbreviations 等の家族専用パッケージへ依存させない)
8
+ * - houki-egov-mcp / houki-nta-mcp の `src/errors.ts` をリファレンスとし、
9
+ * PDF 固有のコード (INVALID_PDF / ENCRYPTED_PDF 等) を追加
10
+ *
11
+ * family contract 仕様:
12
+ * @see https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-CODES.md
13
+ * @see https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-HANDLING.md
14
+ */
15
+ /**
16
+ * family 共通エラーコード — pdf-reader-mcp で使用する部分集合 + PDF 固有拡張。
17
+ *
18
+ * - 引数・入力 (クライアント側責任): `INVALID_ARGUMENT`
19
+ * - リソース未発見: `DOC_NOT_FOUND`
20
+ * - PDF コンテンツ: `INVALID_PDF` / `ENCRYPTED_PDF` / `UNSUPPORTED_PDF_FEATURE`
21
+ * - 外部ソース由来: `SOURCE_API_ERROR` / `SOURCE_TIMEOUT` / `SOURCE_UNAVAILABLE`
22
+ * - PDF 固有: `FILE_TOO_LARGE` (50MB 制限超過)
23
+ * - システム: `INTERNAL_ERROR`
24
+ */
25
+ export type LawErrorCode = 'INVALID_ARGUMENT' | 'DOC_NOT_FOUND' | 'INVALID_PDF' | 'ENCRYPTED_PDF' | 'UNSUPPORTED_PDF_FEATURE' | 'SOURCE_API_ERROR' | 'SOURCE_TIMEOUT' | 'SOURCE_UNAVAILABLE' | 'FILE_TOO_LARGE' | 'INTERNAL_ERROR';
26
+ /**
27
+ * 次に取るべきアクションの提案。
28
+ * LLM がこれを読んで自律的に次のツールを呼ぶことを想定。
29
+ */
30
+ export interface NextAction {
31
+ /** 推奨アクション (tool 名 or 自然言語) */
32
+ action: string;
33
+ /** どんなときに有効か */
34
+ reason: string;
35
+ /** 具体的な引数例 (任意) */
36
+ example?: Record<string, unknown>;
37
+ }
38
+ /**
39
+ * family-compatible 共通エラー応答。
40
+ */
41
+ export interface LawServiceError {
42
+ /** 1文の人間可読メッセージ (LLM もここを読む) */
43
+ error: string;
44
+ /** プログラム判定用の安定したコード */
45
+ code: LawErrorCode;
46
+ /** 追加情報 (任意) */
47
+ hint?: string;
48
+ /** LLM が次に呼ぶべき tool / 取るべき手段の候補 */
49
+ next_actions?: NextAction[];
50
+ /** 一時的エラーかどうか (true なら時間をおいて再試行可) */
51
+ retryable?: boolean;
52
+ /** 元のエラー詳細 (debug 用) */
53
+ detail?: {
54
+ status?: number;
55
+ url?: string;
56
+ cause?: string;
57
+ };
58
+ }
59
+ /**
60
+ * エラーレスポンスを構築するヘルパー。
61
+ * 必須フィールド (error, code) と任意フィールドを安全に組み立てる。
62
+ */
63
+ export declare function makeError(code: LawErrorCode, message: string, options?: {
64
+ hint?: string;
65
+ next_actions?: NextAction[];
66
+ retryable?: boolean;
67
+ detail?: LawServiceError['detail'];
68
+ }): LawServiceError;
69
+ /** オブジェクトが LawServiceError かどうかの type guard */
70
+ export declare function isLawServiceError(value: unknown): value is LawServiceError;
71
+ /**
72
+ * よく使う next_actions のプリセット。
73
+ * pdf-reader-mcp 固有の tool 名を含む。
74
+ */
75
+ export declare const NEXT_ACTIONS: {
76
+ readonly checkFilePath: (filePath?: string) => NextAction;
77
+ readonly useGetPageCount: (filePath?: string) => NextAction;
78
+ readonly useInspectStructure: (filePath?: string) => NextAction;
79
+ readonly splitPdf: () => NextAction;
80
+ readonly retryLater: () => NextAction;
81
+ readonly checkUrl: (url?: string) => NextAction;
82
+ };
83
+ //# sourceMappingURL=errors.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"errors.d.ts","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH;;;;;;;;;GASG;AACH,MAAM,MAAM,YAAY,GAEpB,kBAAkB,GAElB,eAAe,GAEf,aAAa,GACb,eAAe,GACf,yBAAyB,GAEzB,kBAAkB,GAClB,gBAAgB,GAChB,oBAAoB,GAEpB,gBAAgB,GAEhB,gBAAgB,CAAC;AAErB;;;GAGG;AACH,MAAM,WAAW,UAAU;IACzB,+BAA+B;IAC/B,MAAM,EAAE,MAAM,CAAC;IACf,gBAAgB;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,mBAAmB;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,gCAAgC;IAChC,KAAK,EAAE,MAAM,CAAC;IACd,uBAAuB;IACvB,IAAI,EAAE,YAAY,CAAC;IACnB,gBAAgB;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,mCAAmC;IACnC,YAAY,CAAC,EAAE,UAAU,EAAE,CAAC;IAC5B,qCAAqC;IACrC,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,wBAAwB;IACxB,MAAM,CAAC,EAAE;QACP,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC;CACH;AAED;;;GAGG;AACH,wBAAgB,SAAS,CACvB,IAAI,EAAE,YAAY,EAClB,OAAO,EAAE,MAAM,EACf,OAAO,GAAE;IACP,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,UAAU,EAAE,CAAC;IAC5B,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,MAAM,CAAC,EAAE,eAAe,CAAC,QAAQ,CAAC,CAAC;CAC/B,GACL,eAAe,CASjB;AAED,+CAA+C;AAC/C,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,eAAe,CAS1E;AAED;;;GAGG;AACH,eAAO,MAAM,YAAY;wCACI,MAAM,KAAG,UAAU;0CAKjB,MAAM,KAAG,UAAU;8CAKf,MAAM,KAAG,UAAU;6BAKtC,UAAU;+BAIR,UAAU;8BAIT,MAAM,KAAG,UAAU;CAK5B,CAAC"}
package/dist/errors.js ADDED
@@ -0,0 +1,75 @@
1
+ /**
2
+ * houki-hub family-compatible 構造化エラー応答 — pdf-reader-mcp 独自実装
3
+ *
4
+ * 設計指針:
5
+ * - houki-hub family の error contract に準拠 (`code` 文字列を共有)
6
+ * - 共通パッケージ依存を持たず独立実装 (pdf-reader-mcp は汎用 PDF ツールであり、
7
+ * houki-abbreviations 等の家族専用パッケージへ依存させない)
8
+ * - houki-egov-mcp / houki-nta-mcp の `src/errors.ts` をリファレンスとし、
9
+ * PDF 固有のコード (INVALID_PDF / ENCRYPTED_PDF 等) を追加
10
+ *
11
+ * family contract 仕様:
12
+ * @see https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-CODES.md
13
+ * @see https://github.com/shuji-bonji/houki-research-skill/blob/main/docs/ERROR-HANDLING.md
14
+ */
15
+ /**
16
+ * エラーレスポンスを構築するヘルパー。
17
+ * 必須フィールド (error, code) と任意フィールドを安全に組み立てる。
18
+ */
19
+ export function makeError(code, message, options = {}) {
20
+ const err = { error: message, code };
21
+ if (options.hint)
22
+ err.hint = options.hint;
23
+ if (options.next_actions && options.next_actions.length > 0) {
24
+ err.next_actions = options.next_actions;
25
+ }
26
+ if (options.retryable !== undefined)
27
+ err.retryable = options.retryable;
28
+ if (options.detail)
29
+ err.detail = options.detail;
30
+ return err;
31
+ }
32
+ /** オブジェクトが LawServiceError かどうかの type guard */
33
+ export function isLawServiceError(value) {
34
+ return (typeof value === 'object' &&
35
+ value !== null &&
36
+ 'error' in value &&
37
+ 'code' in value &&
38
+ typeof value.error === 'string' &&
39
+ typeof value.code === 'string');
40
+ }
41
+ /**
42
+ * よく使う next_actions のプリセット。
43
+ * pdf-reader-mcp 固有の tool 名を含む。
44
+ */
45
+ export const NEXT_ACTIONS = {
46
+ checkFilePath: (filePath) => ({
47
+ action: 'verify_file_path',
48
+ reason: 'ファイルパスが正しいか、絶対パスで指定されているか確認してください',
49
+ example: filePath ? { file_path: filePath } : undefined,
50
+ }),
51
+ useGetPageCount: (filePath) => ({
52
+ action: 'get_page_count',
53
+ reason: '総ページ数を確認してから pages を指定してください',
54
+ example: filePath ? { file_path: filePath } : undefined,
55
+ }),
56
+ useInspectStructure: (filePath) => ({
57
+ action: 'inspect_structure',
58
+ reason: 'PDF が壊れている可能性があります。Catalog / Pages 等の構造を確認してください',
59
+ example: filePath ? { file_path: filePath } : undefined,
60
+ }),
61
+ splitPdf: () => ({
62
+ action: 'split_pdf',
63
+ reason: 'ファイルサイズが 50MB を超えています。分割してから再試行してください',
64
+ }),
65
+ retryLater: () => ({
66
+ action: 'retry_later',
67
+ reason: '一時的なネットワークエラーの可能性があります。30秒〜数分後に再試行してください',
68
+ }),
69
+ checkUrl: (url) => ({
70
+ action: 'verify_url',
71
+ reason: 'URL が正しいか・HTTP/HTTPS であるか確認してください',
72
+ example: url ? { url } : undefined,
73
+ }),
74
+ };
75
+ //# sourceMappingURL=errors.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"errors.js","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAiEH;;;GAGG;AACH,MAAM,UAAU,SAAS,CACvB,IAAkB,EAClB,OAAe,EACf,UAKI,EAAE;IAEN,MAAM,GAAG,GAAoB,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IACtD,IAAI,OAAO,CAAC,IAAI;QAAE,GAAG,CAAC,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAC1C,IAAI,OAAO,CAAC,YAAY,IAAI,OAAO,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5D,GAAG,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;IAC1C,CAAC;IACD,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS;QAAE,GAAG,CAAC,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;IACvE,IAAI,OAAO,CAAC,MAAM;QAAE,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAChD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,+CAA+C;AAC/C,MAAM,UAAU,iBAAiB,CAAC,KAAc;IAC9C,OAAO,CACL,OAAO,KAAK,KAAK,QAAQ;QACzB,KAAK,KAAK,IAAI;QACd,OAAO,IAAI,KAAK;QAChB,MAAM,IAAI,KAAK;QACf,OAAQ,KAA4B,CAAC,KAAK,KAAK,QAAQ;QACvD,OAAQ,KAA2B,CAAC,IAAI,KAAK,QAAQ,CACtD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG;IAC1B,aAAa,EAAE,CAAC,QAAiB,EAAc,EAAE,CAAC,CAAC;QACjD,MAAM,EAAE,kBAAkB;QAC1B,MAAM,EAAE,mCAAmC;QAC3C,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,SAAS;KACxD,CAAC;IACF,eAAe,EAAE,CAAC,QAAiB,EAAc,EAAE,CAAC,CAAC;QACnD,MAAM,EAAE,gBAAgB;QACxB,MAAM,EAAE,8BAA8B;QACtC,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,SAAS;KACxD,CAAC;IACF,mBAAmB,EAAE,CAAC,QAAiB,EAAc,EAAE,CAAC,CAAC;QACvD,MAAM,EAAE,mBAAmB;QAC3B,MAAM,EAAE,kDAAkD;QAC1D,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,SAAS;KACxD,CAAC;IACF,QAAQ,EAAE,GAAe,EAAE,CAAC,CAAC;QAC3B,MAAM,EAAE,WAAW;QACnB,MAAM,EAAE,uCAAuC;KAChD,CAAC;IACF,UAAU,EAAE,GAAe,EAAE,CAAC,CAAC;QAC7B,MAAM,EAAE,aAAa;QACrB,MAAM,EAAE,0CAA0C;KACnD,CAAC;IACF,QAAQ,EAAE,CAAC,GAAY,EAAc,EAAE,CAAC,CAAC;QACvC,MAAM,EAAE,YAAY;QACpB,MAAM,EAAE,mCAAmC;QAC3C,OAAO,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,SAAS;KACnC,CAAC;CACM,CAAC"}
@@ -32,22 +32,37 @@ export declare const GetMetadataSchema: z.ZodObject<{
32
32
  * use `extract_tables` instead — `split_columns` is for untagged cases.
33
33
  */
34
34
  export declare const SplitColumnsSchema: z.ZodOptional<z.ZodNumber>;
35
+ /**
36
+ * `compact_whitespace` — Issue #4: collapse runs of ASCII / fullwidth
37
+ * whitespace down to a single ASCII space, and trim each line.
38
+ *
39
+ * Default `false` keeps the existing extraction byte-for-byte. When set to
40
+ * `true`, every run of `\s` plus U+3000 (fullwidth space) is replaced with one ASCII space
41
+ * and trailing/leading whitespace is removed line by line. This typically
42
+ * cuts 20–40% of token consumption for Japanese form-style PDFs without
43
+ * losing any content. Inter-column blank-line separators (from
44
+ * `split_columns`) are preserved.
45
+ */
46
+ export declare const CompactWhitespaceSchema: z.ZodOptional<z.ZodBoolean>;
35
47
  /** read_text */
36
48
  export declare const ReadTextSchema: z.ZodObject<{
37
49
  file_path: z.ZodString;
38
50
  pages: z.ZodOptional<z.ZodString>;
39
51
  response_format: z.ZodDefault<z.ZodNativeEnum<typeof import("../constants.js").ResponseFormat>>;
40
52
  split_columns: z.ZodOptional<z.ZodNumber>;
53
+ compact_whitespace: z.ZodOptional<z.ZodBoolean>;
41
54
  }, "strict", z.ZodTypeAny, {
42
55
  file_path: string;
43
56
  response_format: import("../constants.js").ResponseFormat;
44
57
  pages?: string | undefined;
45
58
  split_columns?: number | undefined;
59
+ compact_whitespace?: boolean | undefined;
46
60
  }, {
47
61
  file_path: string;
48
62
  response_format?: import("../constants.js").ResponseFormat | undefined;
49
63
  pages?: string | undefined;
50
64
  split_columns?: number | undefined;
65
+ compact_whitespace?: boolean | undefined;
51
66
  }>;
52
67
  /** search_text */
53
68
  export declare const SearchTextSchema: z.ZodObject<{
@@ -89,16 +104,19 @@ export declare const ReadUrlSchema: z.ZodObject<{
89
104
  pages: z.ZodOptional<z.ZodString>;
90
105
  response_format: z.ZodDefault<z.ZodNativeEnum<typeof import("../constants.js").ResponseFormat>>;
91
106
  split_columns: z.ZodOptional<z.ZodNumber>;
107
+ compact_whitespace: z.ZodOptional<z.ZodBoolean>;
92
108
  }, "strict", z.ZodTypeAny, {
93
- response_format: import("../constants.js").ResponseFormat;
94
109
  url: string;
110
+ response_format: import("../constants.js").ResponseFormat;
95
111
  pages?: string | undefined;
96
112
  split_columns?: number | undefined;
113
+ compact_whitespace?: boolean | undefined;
97
114
  }, {
98
115
  url: string;
99
116
  response_format?: import("../constants.js").ResponseFormat | undefined;
100
117
  pages?: string | undefined;
101
118
  split_columns?: number | undefined;
119
+ compact_whitespace?: boolean | undefined;
102
120
  }>;
103
121
  /** summarize */
104
122
  export declare const SummarizeSchema: z.ZodObject<{
@@ -1 +1 @@
1
- {"version":3,"file":"tier1.d.ts","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,qBAAqB;AACrB,eAAO,MAAM,kBAAkB;;;;;;EAIpB,CAAC;AAEZ,mBAAmB;AACnB,eAAO,MAAM,iBAAiB;;;;;;;;;EAKnB,CAAC;AAEZ;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,4BAW5B,CAAC;AAEJ,gBAAgB;AAChB,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;EAOhB,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;EAyBlB,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;EAKlB,CAAC;AAEZ,eAAe;AACf,eAAO,MAAM,aAAa;;;;;;;;;;;;;;;EAOf,CAAC;AAEZ,gBAAgB;AAChB,eAAO,MAAM,eAAe;;;;;;;;;EAKjB,CAAC;AAGZ,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AACjE,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAC3D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,YAAY,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,aAAa,CAAC,CAAC;AACzD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC"}
1
+ {"version":3,"file":"tier1.d.ts","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,qBAAqB;AACrB,eAAO,MAAM,kBAAkB;;;;;;EAIpB,CAAC;AAEZ,mBAAmB;AACnB,eAAO,MAAM,iBAAiB;;;;;;;;;EAKnB,CAAC;AAEZ;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,4BAW5B,CAAC;AAEJ;;;;;;;;;;GAUG;AACH,eAAO,MAAM,uBAAuB,6BAOjC,CAAC;AAEJ,gBAAgB;AAChB,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;EAQhB,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;EAyBlB,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;EAKlB,CAAC;AAEZ,eAAe;AACf,eAAO,MAAM,aAAa;;;;;;;;;;;;;;;;;;EAQf,CAAC;AAEZ,gBAAgB;AAChB,eAAO,MAAM,eAAe;;;;;;;;;EAKjB,CAAC;AAGZ,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AACjE,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAC3D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,YAAY,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,aAAa,CAAC,CAAC;AACzD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC"}
@@ -37,6 +37,23 @@ export const SplitColumnsSchema = z
37
37
  '2 or 3 = bucket by X-coordinate left-to-right. Use for untagged 新旧対照表 / ' +
38
38
  'two-column PDFs where Y-sort would interleave columns. Tagged PDFs with proper ' +
39
39
  '<Table> markup should use extract_tables instead.');
40
+ /**
41
+ * `compact_whitespace` — Issue #4: collapse runs of ASCII / fullwidth
42
+ * whitespace down to a single ASCII space, and trim each line.
43
+ *
44
+ * Default `false` keeps the existing extraction byte-for-byte. When set to
45
+ * `true`, every run of `\s` plus U+3000 (fullwidth space) is replaced with one ASCII space
46
+ * and trailing/leading whitespace is removed line by line. This typically
47
+ * cuts 20–40% of token consumption for Japanese form-style PDFs without
48
+ * losing any content. Inter-column blank-line separators (from
49
+ * `split_columns`) are preserved.
50
+ */
51
+ export const CompactWhitespaceSchema = z
52
+ .boolean()
53
+ .optional()
54
+ .describe('When true, collapse runs of whitespace (incl. fullwidth space U+3000) ' +
55
+ 'to a single ASCII space and trim each line. Reduces token consumption ' +
56
+ 'on Japanese form-style PDFs. Default: false (no whitespace normalization).');
40
57
  /** read_text */
41
58
  export const ReadTextSchema = z
42
59
  .object({
@@ -44,6 +61,7 @@ export const ReadTextSchema = z
44
61
  pages: PagesSchema,
45
62
  response_format: ResponseFormatSchema,
46
63
  split_columns: SplitColumnsSchema,
64
+ compact_whitespace: CompactWhitespaceSchema,
47
65
  })
48
66
  .strict();
49
67
  /** search_text */
@@ -87,6 +105,7 @@ export const ReadUrlSchema = z
87
105
  pages: PagesSchema,
88
106
  response_format: ResponseFormatSchema,
89
107
  split_columns: SplitColumnsSchema,
108
+ compact_whitespace: CompactWhitespaceSchema,
90
109
  })
91
110
  .strict();
92
111
  /** summarize */
@@ -1 +1 @@
1
- {"version":3,"file":"tier1.js","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,sBAAsB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAC7E,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,oBAAoB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE3F,qBAAqB;AACrB,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;CAC1B,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,mBAAmB;AACnB,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC;KAC/B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,EAAE;KACR,GAAG,EAAE;KACL,GAAG,CAAC,CAAC,CAAC;KACN,GAAG,CAAC,CAAC,CAAC;KACN,QAAQ,EAAE;KACV,QAAQ,CACP,gFAAgF;IAC9E,0EAA0E;IAC1E,iFAAiF;IACjF,mDAAmD,CACtD,CAAC;AAEJ,gBAAgB;AAChB,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC;KAC5B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;IACrC,aAAa,EAAE,kBAAkB;CAClC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,CAAC;SACL,MAAM,EAAE;SACR,GAAG,CAAC,CAAC,EAAE,0BAA0B,CAAC;SAClC,GAAG,CAAC,GAAG,EAAE,sCAAsC,CAAC;SAChD,QAAQ,CAAC,uCAAuC,CAAC;IACpD,KAAK,EAAE,WAAW;IAClB,aAAa,EAAE,CAAC;SACb,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,GAAG,CAAC;SACR,OAAO,CAAC,sBAAsB,CAAC;SAC/B,QAAQ,CAAC,0DAA0D,CAAC;IACvE,WAAW,EAAE,CAAC;SACX,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,kBAAkB,CAAC;SACvB,OAAO,CAAC,EAAE,CAAC;SACX,QAAQ,CAAC,qCAAqC,CAAC;IAClD,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;CACnB,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,eAAe;AACf,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC;KAC3B,MAAM,CAAC;IACN,GAAG,EAAE,SAAS;IACd,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;IACrC,aAAa,EAAE,kBAAkB;CAClC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,gBAAgB;AAChB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC;KAC7B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CA
AC;KACD,MAAM,EAAE,CAAC"}
1
+ {"version":3,"file":"tier1.js","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,sBAAsB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAC7E,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,oBAAoB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE3F,qBAAqB;AACrB,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;CAC1B,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,mBAAmB;AACnB,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC;KAC/B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,EAAE;KACR,GAAG,EAAE;KACL,GAAG,CAAC,CAAC,CAAC;KACN,GAAG,CAAC,CAAC,CAAC;KACN,QAAQ,EAAE;KACV,QAAQ,CACP,gFAAgF;IAC9E,0EAA0E;IAC1E,iFAAiF;IACjF,mDAAmD,CACtD,CAAC;AAEJ;;;;;;;;;;GAUG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,CAAC;KACrC,OAAO,EAAE;KACT,QAAQ,EAAE;KACV,QAAQ,CACP,wEAAwE;IACtE,wEAAwE;IACxE,4EAA4E,CAC/E,CAAC;AAEJ,gBAAgB;AAChB,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC;KAC5B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;IACrC,aAAa,EAAE,kBAAkB;IACjC,kBAAkB,EAAE,uBAAuB;CAC5C,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,CAAC;SACL,MAAM,EAAE;SACR,GAAG,CAAC,CAAC,EAAE,0BAA0B,CAAC;SAClC,GAAG,CAAC,GAAG,EAAE,sCAAsC,CAAC;SAChD,QAAQ,CAAC,uCAAuC,CAAC;IACpD,KAAK,EAAE,WAAW;IAClB,aAAa,EAAE,CAAC;SACb,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,GAAG,CAAC;SACR,OAAO,CAAC,sBAAsB,CAAC;SAC/B,QAAQ,CAAC,0DAA0D,CAAC;IACvE,WAAW,EAAE,CAAC;SACX,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,kBAAkB,CAAC;SACvB,OAAO,CAAC,EAAE,CAAC;SACX,QAAQ,CAAC,qCAAqC,CAAC;IAClD,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;CACnB,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,eAAe;AACf,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC;KAC3B,MAAM,CAAC;IACN,GAAG,EAAE,SAAS;IACd,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAo
B;IACrC,aAAa,EAAE,kBAAkB;IACjC,kBAAkB,EAAE,uBAAuB;CAC5C,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,gBAAgB;AAChB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC;KAC7B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC"}
@@ -25,13 +25,17 @@ export declare function getMetadataFromDoc(doc: PDFDocumentProxy, filePath: stri
25
25
  /**
26
26
  * Options for text extraction.
27
27
  *
28
- * `splitColumns` controls Issue #3 column-aware reordering. When `>= 2`,
29
- * text items are bucketed into N equal-width columns by X-coordinate and
30
- * concatenated left-to-right. `1` (default / undefined) preserves the
31
- * existing single-column Y-sort behaviour.
28
+ * - `splitColumns` controls Issue #3 column-aware reordering. When `>= 2`,
29
+ * text items are bucketed into N equal-width columns by X-coordinate and
30
+ * concatenated left-to-right. `1` (default / undefined) preserves the
31
+ * existing single-column Y-sort behaviour.
32
+ * - `compactWhitespace` controls Issue #4 whitespace normalization. When
33
+ * `true`, runs of `\s` plus U+3000 collapse to one ASCII space and each
34
+ * line is trimmed. Default `false` preserves original spacing.
32
35
  */
33
36
  export interface ExtractTextOptions {
34
37
  splitColumns?: number;
38
+ compactWhitespace?: boolean;
35
39
  }
36
40
  /**
37
41
  * Extract text from a pre-loaded PDFDocumentProxy.
@@ -1 +1 @@
1
- {"version":3,"file":"pdfjs-service.d.ts","sourceRoot":"","sources":["../../src/services/pdfjs-service.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAGL,KAAK,gBAAgB,EAEtB,MAAM,iCAAiC,CAAC;AAGzC,OAAO,KAAK,EAEV,mBAAmB,EAGnB,qBAAqB,EACrB,QAAQ,EACR,WAAW,EACX,WAAW,EAGX,sBAAsB,EAEtB,YAAY,EACb,MAAM,aAAa,CAAC;AAWrB;;GAEG;AACH,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAI9E;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAGtF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAOxE;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,WAAW,CAAC,CA6BtB;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,kBAAkB;IACjC,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,GAAE,kBAAuB,GAC/B,OAAO,CAAC,QAAQ,EAAE,CAAC,CAarB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,GAAE,kBAAuB,GAC/B,OAAO,CAAC,QAAQ,EAAE,CAAC,CAQrB;AAED;;GAEG;AACH,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,YAAY,GAAE,MAA+B,EAC7C,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,WAAW,EAAE,CAAC,CAsDxB;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAmB/F;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAQnF;AAED;;;GAGG;AACH,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,qBAAqB,CAAC,CAoEhC;AAyID;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAAC,YAAY,CAAC,CAmErF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAOzE;AAID;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,oBAAoB,CACxC,GAAG,EAAE,gBAAgB,EACrB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,sBAAsB,CAAC,CA8CjC;AAED,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,sBAAsB,CAAC,CAOjC;AA2KD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,mBAAmB,CAAC,CAmG9B"}
1
+ {"version":3,"file":"pdfjs-service.d.ts","sourceRoot":"","sources":["../../src/services/pdfjs-service.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAGL,KAAK,gBAAgB,EAEtB,MAAM,iCAAiC,CAAC;AAGzC,OAAO,KAAK,EAEV,mBAAmB,EAGnB,qBAAqB,EACrB,QAAQ,EACR,WAAW,EACX,WAAW,EAGX,sBAAsB,EAEtB,YAAY,EACb,MAAM,aAAa,CAAC;AAWrB;;GAEG;AACH,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAI9E;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAGtF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAOxE;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,WAAW,CAAC,CA6BtB;AAED;;;;;;;;;;GAUG;AACH,MAAM,WAAW,kBAAkB;IACjC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,GAAE,kBAAuB,GAC/B,OAAO,CAAC,QAAQ,EAAE,CAAC,CAarB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,GAAE,kBAAuB,GAC/B,OAAO,CAAC,QAAQ,EAAE,CAAC,CAQrB;AAED;;GAEG;AACH,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,YAAY,GAAE,MAA+B,EAC7C,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,WAAW,EAAE,CAAC,CAsDxB;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAmB/F;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAQnF;AAED;;;GAGG;AACH,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,qBAAqB,CAAC,CAoEhC;AA8JD;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAAC,YAAY,CAAC,CAmErF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAOzE;AAID;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,oBAAoB,CACxC,GAAG,EAAE,gBAAgB,EACrB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,sBAAsB,CAAC,CA8CjC;AAED,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,sBAAsB,CAAC,CAOjC;AA2KD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OA
AO,CAAC,mBAAmB,CAAC,CAmG9B"}