npm - @shuji-bonji/pdf-reader-mcp - Versions diffs - 0.2.3 → 0.4.0 - Mend

@shuji-bonji/pdf-reader-mcp 0.2.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/CHANGELOG.md +35 -0
package/README.ja.md +45 -13
package/README.md +45 -13
package/dist/constants.d.ts +1 -1
package/dist/constants.js +1 -1
package/dist/schemas/tier1.d.ts +17 -0
package/dist/schemas/tier1.d.ts.map +1 -1
package/dist/schemas/tier1.js +22 -0
package/dist/schemas/tier1.js.map +1 -1
package/dist/schemas/tier2.d.ts +15 -0
package/dist/schemas/tier2.d.ts.map +1 -1
package/dist/schemas/tier2.js +8 -0
package/dist/schemas/tier2.js.map +1 -1
package/dist/services/pdfjs-service.d.ts +35 -3
package/dist/services/pdfjs-service.d.ts.map +1 -1
package/dist/services/pdfjs-service.js +257 -9
package/dist/services/pdfjs-service.js.map +1 -1
package/dist/tools/index.d.ts.map +1 -1
package/dist/tools/index.js +2 -0
package/dist/tools/index.js.map +1 -1
package/dist/tools/tier1/read-text.d.ts.map +1 -1
package/dist/tools/tier1/read-text.js +8 -3
package/dist/tools/tier1/read-text.js.map +1 -1
package/dist/tools/tier1/read-url.d.ts.map +1 -1
package/dist/tools/tier1/read-url.js +7 -2
package/dist/tools/tier1/read-url.js.map +1 -1
package/dist/tools/tier2/extract-tables.d.ts +11 -0
package/dist/tools/tier2/extract-tables.d.ts.map +1 -0
package/dist/tools/tier2/extract-tables.js +66 -0
package/dist/tools/tier2/extract-tables.js.map +1 -0
package/dist/types.d.ts +41 -0
package/dist/types.d.ts.map +1 -1
package/dist/utils/formatter.d.ts +8 -1
package/dist/utils/formatter.d.ts.map +1 -1
package/dist/utils/formatter.js +52 -0
package/dist/utils/formatter.js.map +1 -1
package/package.json +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -5,6 +5,39 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.4.0] - 2026-05-07
+### Added
+- **`split_columns` parameter on `read_text` and `read_url`** (Issue #3): opt-in column-aware reordering for **untagged** multi-column PDFs. When `split_columns: 2` (or `3`) is passed, text items are first bucketed by X-coordinate into N equal-width columns, then each bucket is Y-sorted independently and concatenated left-to-right with a blank-line separator. Designed for older 新旧対照表 PDFs and similar two-column documents that lack a structure tree (Tagged PDFs with proper `<Table>` markup should use `extract_tables` instead).
+  - `split_columns: 1` (default / undefined) is unchanged — existing single-column Y-sort behaviour is preserved as a regression-tested baseline.
+  - `extractText` / `extractTextFromDoc` now accept an `ExtractTextOptions` object with `splitColumns?: number`. Internally, line-grouping logic is factored out into `itemsToText` so the column path reuses the same Y-sort.
+- **Test fixture `tests/fixtures/two-column.pdf`**: 1-page A4 untagged PDF with `LEFT-1..4` and `RIGHT-1..4` placed at paired Y-coordinates so plain Y-sort interleaves them — the regression target for `split_columns`.
+- **E2E tests** in `tests/e2e/02-tier1-text.test.ts`: 4 new cases (RT-SC-1..4) covering the failure mode without `split_columns`, the success mode with `split_columns: 2`, the `split_columns: 1` regression guard, and a sanity check that `split_columns: 2` on a single-column PDF preserves all content.
+### Changed
+- **`read_text` / `read_url` tool descriptions**: documented the new `split_columns` parameter with guidance to prefer `extract_tables` for Tagged PDFs.
+## [0.3.0] - 2026-05-06
+### Added
+- **`extract_tables` (Tier 2)**: New tool that walks a Tagged PDF's structure tree and emits every `<Table>` subtree as a Markdown table or a JSON document. Designed for documents whose meaning depends on multi-column layout — e.g. 国税庁 新旧対照表 (kaisei tsutatsu) PDFs where reading-order extraction merges 改正後 / 改正前 columns into ambiguous text. Internals:
+  - `extractTables(filePath, pages?)` / `extractTablesFromDoc(doc, pages?)` in `services/pdfjs-service.ts`. Walks the StructTree, identifies `<Table>` → `<THead> | <TBody> | <TFoot>` → `<TR>` → `<TH> | <TD>`, then resolves cell text by mapping each leaf node's marked-content `id` (e.g. `p715R_mc4`) to the corresponding `beginMarkedContentProps` boundary in `getTextContent({ includeMarkedContent: true })`.
+  - Cell text post-processing: collapses whitespace runs (incl. U+3000), folds per-character kerning runs ("消 費 税 法" → "消費税法") while preserving natural inter-word spacing ("事業者 法人番号"), escapes Markdown table delimiters.
+  - Untagged PDFs return `isTagged: false`, an empty `tables` array, and a `note` recommending column-aware extraction (planned in Issue #3) as the fallback.
+  - colspan / rowspan and nested tables are skipped in this initial release; cells appear in source order.
+- **Types**: `TableCell`, `TableRow`, `ExtractedTable`, `TablesExtractionResult` in `types.ts`.
+- **Schema**: `ExtractTablesSchema` in `schemas/tier2.ts` (file_path + pages + response_format).
+- **Markdown formatter**: `formatTablesMarkdown` in `utils/formatter.ts` renders results as `# Extracted Tables` summary block followed by `## Page N — Table M` GFM tables.
+- **E2E tests**: 5 new tests in `tests/e2e/04-tier2-structure.test.ts` covering untagged → note path, tagged-but-empty path, formatter shape, and pages filter.
+### Changed
+- **Tool count**: 15 → 16 tools (Tier 2 now has 6).
+- README / README.ja.md tool tables and architecture diagram updated accordingly.
 ## [0.2.3] - 2026-05-06
 ### Fixed
@@ -72,6 +105,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Y-coordinate-based text extraction preserving natural reading order
 - Unit tests for core utilities and pdfjs-service
+[0.4.0]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.3.0...v0.4.0
+[0.3.0]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.2.3...v0.3.0
 [0.2.3]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.2.2...v0.2.3
 [0.2.2]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.2.1...v0.2.2
 [0.2.1]: https://github.com/shuji-bonji/pdf-reader-mcp/compare/v0.2.0...v0.2.1

package/README.ja.md CHANGED Viewed

@@ -12,7 +12,7 @@ PDF 内部構造解析に特化した MCP (Model Context Protocol) サーバー
 ## 機能
-**15 ツール** を 3 層構成で提供します。
+**16 ツール** を 3 層構成で提供します。
 ### Tier 1: 基本機能
@@ -20,7 +20,7 @@ PDF 内部構造解析に特化した MCP (Model Context Protocol) サーバー
 | ---------------- | ----------------------------------------------------- |
 | `get_page_count` | ページ数の軽量取得                                    |
 | `get_metadata`   | メタデータ抽出（タイトル、著者、PDF版、タグ有無等）   |
-| `read_text`      | テキスト抽出（Y座標ベースの読み順保持）               |
+| `read_text`      | テキスト抽出（Y座標ベースの読み順保持。`split_columns: 2 \| 3` で **タグなし** 多カラム PDF にも対応） |
 | `search_text`    | 全文検索（前後コンテキスト付き）                      |
 | `read_images`    | 画像抽出（base64、メタデータ付き）                    |
 | `read_url`       | URLからリモートPDFを取得して処理                      |
@@ -28,13 +28,14 @@ PDF 内部構造解析に特化した MCP (Model Context Protocol) サーバー
 ### Tier 2: 構造解析
-| ツール                | 説明                                         |
-| --------------------- | -------------------------------------------- |
-| `inspect_structure`   | オブジェクトツリー・カタログ辞書の解析       |
-| `inspect_tags`        | Tagged PDF のタグツリー可視化                |
-| `inspect_fonts`       | フォント一覧（埋め込み/サブセット/Type判定） |
-| `inspect_annotations` | 注釈一覧（タイプ別分類）                     |
-| `inspect_signatures`  | 電子署名フィールドの構造解析                 |
+| ツール                | 説明                                                     |
+| --------------------- | -------------------------------------------------------- |
+| `inspect_structure`   | オブジェクトツリー・カタログ辞書の解析                   |
+| `inspect_tags`        | Tagged PDF のタグツリー可視化                            |
+| `inspect_fonts`       | フォント一覧（埋め込み/サブセット/Type判定）             |
+| `inspect_annotations` | 注釈一覧（タイプ別分類）                                 |
+| `inspect_signatures`  | 電子署名フィールドの構造解析                             |
+| `extract_tables`      | Tagged PDF の `<Table>` を Markdown テーブルとして抽出   |
 ### Tier 3: 検証・分析
@@ -145,12 +146,43 @@ compare_structure({
   | Tagged      | true | true | ✅ |
 ```
+### Tagged PDF からテーブル抽出
+```
+extract_tables({ file_path: "/path/to/kaisei-tsutatsu.pdf", pages: "1" })
+→ # Extracted Tables
+  - **Tagged**: Yes / **Pages Scanned**: 1 / **Tables Found**: 1
+  ## Page 1 — Table 1
+  | 改正後 | 改正前 |
+  | --- | --- |
+  | …第２条第 16 項《定義》… | …第２条第 15 項《定義》… |
+```
+タグなし PDF では空結果と note を返し、下記の column-aware 抽出への
+フォールバックを推奨します。
+### タグなし多カラム PDF をカラム単位で読む
+```
+read_text({ file_path: "/path/to/older-shinkyu.pdf", split_columns: 2 })
+→ // 通常の Y ソートだとカラムが交互連結:
+//   "改正後セル1   改正前セル1\n 改正後セル2   改正前セル2..."
+//
+// split_columns: 2 では左カラム → 空行 → 右カラムの順:
+//   "改正後セル1\n改正後セル2\n…\n\n改正前セル1\n改正前セル2\n…"
+```
+**タグなし** 多カラム PDF (古い 新旧対照表など) で `split_columns: 2 \| 3` を指定。
+タグ付き PDF (`<Table>` 構造あり) では `extract_tables` の方が表構造を保持できます。
 ## 技術スタック
 - **TypeScript** + MCP TypeScript SDK
 - **pdfjs-dist** (Mozilla) — テキスト/画像抽出、タグツリー、注釈
 - **pdf-lib** — 低レベルオブジェクト構造解析
-- **Vitest** — Unit + E2E テスト（159 tests）
+- **Vitest** — Unit + E2E テスト（168 tests）
 - **Biome** — lint + format
 - **Zod** — 入力バリデーション
@@ -158,7 +190,7 @@ compare_structure({
 ```bash
 npm test              # 全テスト実行（Unit: 39 tests）
-npm run test:e2e      # E2E のみ（120 tests）
+npm run test:e2e      # E2E のみ（129 tests）
 npm run test:watch    # ウォッチモード
 ```
@@ -172,7 +204,7 @@ pdf-reader-mcp/
 │   ├── types.ts              # 型定義
 │   ├── tools/
 │   │   ├── tier1/            # 基本ツール (7)
-│   │   ├── tier2/            # 構造解析 (5)
+│   │   ├── tier2/            # 構造解析 (6)
 │   │   ├── tier3/            # 検証・分析 (3)
 │   │   └── index.ts          # ツール登録
 │   ├── services/
@@ -188,7 +220,7 @@ pdf-reader-mcp/
 │       └── error-handler.ts  # エラーハンドリング
 └── tests/
     ├── tier1/                # Unit tests
-    └── e2e/                  # E2E tests (9 suites, 120 tests)
+    └── e2e/                  # E2E tests (9 suites, 129 tests)
 ```
 ## pdf-spec-mcp との連携

package/README.md CHANGED Viewed

@@ -12,7 +12,7 @@ While typical PDF MCP servers are thin wrappers for text extraction, this projec
 ## Features
-**15 tools** organized into three tiers:
+**16 tools** organized into three tiers:
 ### Tier 1: Basic Operations
@@ -20,7 +20,7 @@ While typical PDF MCP servers are thin wrappers for text extraction, this projec
 | ---------------- | -------------------------------------------------------- |
 | `get_page_count` | Lightweight page count retrieval                         |
 | `get_metadata`   | Full metadata extraction (title, author, PDF version...) |
-| `read_text`      | Text extraction with Y-coordinate reading order          |
+| `read_text`      | Text extraction with Y-coordinate reading order (opt-in `split_columns: 2 \| 3` for untagged multi-column PDFs) |
 | `search_text`    | Full-text search with surrounding context                |
 | `read_images`    | Image extraction as base64 with metadata                 |
 | `read_url`       | Fetch and process remote PDFs from URLs                  |
@@ -28,13 +28,14 @@ While typical PDF MCP servers are thin wrappers for text extraction, this projec
 ### Tier 2: Structure Inspection
-| Tool                  | Description                                      |
-| --------------------- | ------------------------------------------------ |
-| `inspect_structure`   | Object tree and catalog dictionary analysis      |
-| `inspect_tags`        | Tagged PDF structure tree visualization          |
-| `inspect_fonts`       | Font inventory (embedded/subset/type detection)  |
-| `inspect_annotations` | Annotation listing (categorized by subtype)      |
-| `inspect_signatures`  | Digital signature field structure analysis        |
+| Tool                  | Description                                                         |
+| --------------------- | ------------------------------------------------------------------- |
+| `inspect_structure`   | Object tree and catalog dictionary analysis                         |
+| `inspect_tags`        | Tagged PDF structure tree visualization                             |
+| `inspect_fonts`       | Font inventory (embedded/subset/type detection)                     |
+| `inspect_annotations` | Annotation listing (categorized by subtype)                         |
+| `inspect_signatures`  | Digital signature field structure analysis                          |
+| `extract_tables`      | Tagged PDF `<Table>` subtree → Markdown table (preserves columns)   |
 ### Tier 3: Validation & Analysis
@@ -145,12 +146,43 @@ compare_structure({
   | Tagged      | true | true | ✅ |
 ```
+### Extract Tables (Tagged PDF)
+```
+extract_tables({ file_path: "/path/to/kaisei-tsutatsu.pdf", pages: "1" })
+→ # Extracted Tables
+  - **Tagged**: Yes / **Pages Scanned**: 1 / **Tables Found**: 1
+  ## Page 1 — Table 1
+  | 改正後 | 改正前 |
+  | --- | --- |
+  | …第２条第 16 項《定義》… | …第２条第 15 項《定義》… |
+```
+Untagged PDFs return an empty result with a `note` recommending the
+column-aware fallback below.
+### Read Untagged Multi-Column PDF
+```
+read_text({ file_path: "/path/to/older-shinkyu.pdf", split_columns: 2 })
+→ // Plain Y-sort would interleave columns:
+//   "改正後セル1   改正前セル1\n 改正後セル2   改正前セル2..."
+//
+// With split_columns: 2 the left column is emitted first, then the right:
+//   "改正後セル1\n改正後セル2\n…\n\n改正前セル1\n改正前セル2\n…"
+```
+Use `split_columns: 2 | 3` for **untagged** multi-column PDFs. For Tagged
+PDFs with proper `<Table>` markup, `extract_tables` (above) is preferred.
 ## Tech Stack
 - **TypeScript** + MCP TypeScript SDK
 - **pdfjs-dist** (Mozilla) — text/image extraction, tag tree, annotations
 - **pdf-lib** — low-level object structure analysis
-- **Vitest** — unit + E2E testing (159 tests)
+- **Vitest** — unit + E2E testing (168 tests)
 - **Biome** — linting + formatting
 - **Zod** — input validation
@@ -158,7 +190,7 @@ compare_structure({
 ```bash
 npm test              # Run all tests (unit: 39 tests)
-npm run test:e2e      # E2E tests only (120 tests)
+npm run test:e2e      # E2E tests only (129 tests)
 npm run test:watch    # Watch mode
 ```
@@ -172,7 +204,7 @@ pdf-reader-mcp/
 │   ├── types.ts              # Type definitions
 │   ├── tools/
 │   │   ├── tier1/            # Basic tools (7)
-│   │   ├── tier2/            # Structure inspection (5)
+│   │   ├── tier2/            # Structure inspection (6)
 │   │   ├── tier3/            # Validation & analysis (3)
 │   │   └── index.ts          # Tool registration
 │   ├── services/
@@ -188,7 +220,7 @@ pdf-reader-mcp/
 │       └── error-handler.ts  # Error handling
 └── tests/
     ├── tier1/                # Unit tests
-    └── e2e/                  # E2E tests (9 suites, 120 tests)
+    └── e2e/                  # E2E tests (9 suites, 129 tests)
 ```
 ## Pairing with pdf-spec-mcp

package/dist/constants.d.ts CHANGED Viewed

@@ -13,7 +13,7 @@ export declare const MAX_SEARCH_RESULTS = 100;
 export declare const DEFAULT_SEARCH_CONTEXT = 80;
 /** Server info */
 export declare const SERVER_NAME = "pdf-reader-mcp";
-export declare const SERVER_VERSION = "0.2.3";
+export declare const SERVER_VERSION = "0.4.0";
 /** Response format enum */
 export declare enum ResponseFormat {
     MARKDOWN = "markdown",

package/dist/constants.js CHANGED Viewed

@@ -13,7 +13,7 @@ export const MAX_SEARCH_RESULTS = 100;
 export const DEFAULT_SEARCH_CONTEXT = 80;
 /** Server info */
 export const SERVER_NAME = 'pdf-reader-mcp';
-export const SERVER_VERSION = '0.2.3';
+export const SERVER_VERSION = '0.4.0';
 /** Response format enum */
 export var ResponseFormat;
 (function (ResponseFormat) {

package/dist/schemas/tier1.d.ts CHANGED Viewed

@@ -21,19 +21,33 @@ export declare const GetMetadataSchema: z.ZodObject<{
     file_path: string;
     response_format?: import("../constants.js").ResponseFormat | undefined;
 }>;
+/**
+ * `split_columns` — Issue #3: column-aware extraction for untagged
+ * multi-column PDFs.
+ *
+ * Acts as an opt-in override for the default reading-order strategy. When
+ * `>= 2`, items are bucketed by X-coordinate into N equal-width columns and
+ * the buckets are concatenated left-to-right. `1` (default) preserves the
+ * existing Y-sort behaviour. Tagged PDFs with proper `<Table>` markup should
+ * use `extract_tables` instead — `split_columns` is for untagged cases.
+ */
+export declare const SplitColumnsSchema: z.ZodOptional<z.ZodNumber>;
 /** read_text */
 export declare const ReadTextSchema: z.ZodObject<{
     file_path: z.ZodString;
     pages: z.ZodOptional<z.ZodString>;
     response_format: z.ZodDefault<z.ZodNativeEnum<typeof import("../constants.js").ResponseFormat>>;
+    split_columns: z.ZodOptional<z.ZodNumber>;
 }, "strict", z.ZodTypeAny, {
     file_path: string;
     response_format: import("../constants.js").ResponseFormat;
     pages?: string | undefined;
+    split_columns?: number | undefined;
 }, {
     file_path: string;
     response_format?: import("../constants.js").ResponseFormat | undefined;
     pages?: string | undefined;
+    split_columns?: number | undefined;
 }>;
 /** search_text */
 export declare const SearchTextSchema: z.ZodObject<{
@@ -74,14 +88,17 @@ export declare const ReadUrlSchema: z.ZodObject<{
     url: z.ZodString;
     pages: z.ZodOptional<z.ZodString>;
     response_format: z.ZodDefault<z.ZodNativeEnum<typeof import("../constants.js").ResponseFormat>>;
+    split_columns: z.ZodOptional<z.ZodNumber>;
 }, "strict", z.ZodTypeAny, {
     response_format: import("../constants.js").ResponseFormat;
     url: string;
     pages?: string | undefined;
+    split_columns?: number | undefined;
 }, {
     url: string;
     response_format?: import("../constants.js").ResponseFormat | undefined;
     pages?: string | undefined;
+    split_columns?: number | undefined;
 }>;
 /** summarize */
 export declare const SummarizeSchema: z.ZodObject<{

package/dist/schemas/tier1.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"tier1.d.ts","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,qBAAqB;AACrB,eAAO,MAAM,kBAAkB;;;;;;EAIpB,CAAC;AAEZ,mBAAmB;AACnB,eAAO,MAAM,iBAAiB;;;;;;;;;EAKnB,CAAC;AAEZ,gBAAgB;AAChB,eAAO,MAAM,cAAc~~;;;;;;;;;;;;EAMhB~~,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;EAyBlB,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;EAKlB,CAAC;AAEZ,eAAe;AACf,eAAO,MAAM,aAAa~~;;;;;;;;;;;;EAMf~~,CAAC;AAEZ,gBAAgB;AAChB,eAAO,MAAM,eAAe;;;;;;;;;EAKjB,CAAC;AAGZ,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AACjE,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAC3D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,YAAY,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,aAAa,CAAC,CAAC;AACzD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC"}
1	+ {"version":3,"file":"tier1.d.ts","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,qBAAqB;AACrB,eAAO,MAAM,kBAAkB;;;;;;EAIpB,CAAC;AAEZ,mBAAmB;AACnB,eAAO,MAAM,iBAAiB;;;;;;;;;EAKnB,CAAC;AAEZ;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,4BAW5B,CAAC;AAEJ,gBAAgB;AAChB,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;EAOhB,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;EAyBlB,CAAC;AAEZ,kBAAkB;AAClB,eAAO,MAAM,gBAAgB;;;;;;;;;EAKlB,CAAC;AAEZ,eAAe;AACf,eAAO,MAAM,aAAa;;;;;;;;;;;;;;;EAOf,CAAC;AAEZ,gBAAgB;AAChB,eAAO,MAAM,eAAe;;;;;;;;;EAKjB,CAAC;AAGZ,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AACjE,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAC3D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAC/D,MAAM,MAAM,YAAY,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,aAAa,CAAC,CAAC;AACzD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC"}

package/dist/schemas/tier1.js CHANGED Viewed

@@ -17,12 +17,33 @@ export const GetMetadataSchema = z
     response_format: ResponseFormatSchema,
 })
     .strict();
+/**
+ * `split_columns` — Issue #3: column-aware extraction for untagged
+ * multi-column PDFs.
+ *
+ * Acts as an opt-in override for the default reading-order strategy. When
+ * `>= 2`, items are bucketed by X-coordinate into N equal-width columns and
+ * the buckets are concatenated left-to-right. `1` (default) preserves the
+ * existing Y-sort behaviour. Tagged PDFs with proper `<Table>` markup should
+ * use `extract_tables` instead — `split_columns` is for untagged cases.
+ */
+export const SplitColumnsSchema = z
+    .number()
+    .int()
+    .min(1)
+    .max(3)
+    .optional()
+    .describe('Number of columns to use when reordering text. 1 (default) = existing Y-sort. ' +
+    '2 or 3 = bucket by X-coordinate left-to-right. Use for untagged 新旧対照表 / ' +
+    'two-column PDFs where Y-sort would interleave columns. Tagged PDFs with proper ' +
+    '<Table> markup should use extract_tables instead.');
 /** read_text */
 export const ReadTextSchema = z
     .object({
     file_path: FilePathSchema,
     pages: PagesSchema,
     response_format: ResponseFormatSchema,
+    split_columns: SplitColumnsSchema,
 })
     .strict();
 /** search_text */
@@ -65,6 +86,7 @@ export const ReadUrlSchema = z
     url: UrlSchema,
     pages: PagesSchema,
     response_format: ResponseFormatSchema,
+    split_columns: SplitColumnsSchema,
 })
     .strict();
 /** summarize */

package/dist/schemas/tier1.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"tier1.js","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,sBAAsB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAC7E,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,oBAAoB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE3F,qBAAqB;AACrB,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;CAC1B,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,mBAAmB;AACnB,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC;KAC/B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,gBAAgB;AAChB,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC;KAC5B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;~~CACtC~~,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,CAAC;SACL,MAAM,EAAE;SACR,GAAG,CAAC,CAAC,EAAE,0BAA0B,CAAC;SAClC,GAAG,CAAC,GAAG,EAAE,sCAAsC,CAAC;SAChD,QAAQ,CAAC,uCAAuC,CAAC;IACpD,KAAK,EAAE,WAAW;IAClB,aAAa,EAAE,CAAC;SACb,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,GAAG,CAAC;SACR,OAAO,CAAC,sBAAsB,CAAC;SAC/B,QAAQ,CAAC,0DAA0D,CAAC;IACvE,WAAW,EAAE,CAAC;SACX,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,kBAAkB,CAAC;SACvB,OAAO,CAAC,EAAE,CAAC;SACX,QAAQ,CAAC,qCAAqC,CAAC;IAClD,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;CACnB,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,eAAe;AACf,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC;KAC3B,MAAM,CAAC;IACN,GAAG,EAAE,SAAS;IACd,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;~~CACtC~~,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,gBAAgB;AAChB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC;KAC7B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC"}
1	+ {"version":3,"file":"tier1.js","sourceRoot":"","sources":["../../src/schemas/tier1.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,sBAAsB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAC7E,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,oBAAoB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE3F,qBAAqB;AACrB,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;CAC1B,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,mBAAmB;AACnB,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC;KAC/B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,EAAE;KACR,GAAG,EAAE;KACL,GAAG,CAAC,CAAC,CAAC;KACN,GAAG,CAAC,CAAC,CAAC;KACN,QAAQ,EAAE;KACV,QAAQ,CACP,gFAAgF;IAC9E,0EAA0E;IAC1E,iFAAiF;IACjF,mDAAmD,CACtD,CAAC;AAEJ,gBAAgB;AAChB,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC;KAC5B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;IACrC,aAAa,EAAE,kBAAkB;CAClC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,CAAC;SACL,MAAM,EAAE;SACR,GAAG,CAAC,CAAC,EAAE,0BAA0B,CAAC;SAClC,GAAG,CAAC,GAAG,EAAE,sCAAsC,CAAC;SAChD,QAAQ,CAAC,uCAAuC,CAAC;IACpD,KAAK,EAAE,WAAW;IAClB,aAAa,EAAE,CAAC;SACb,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,GAAG,CAAC;SACR,OAAO,CAAC,sBAAsB,CAAC;SAC/B,QAAQ,CAAC,0DAA0D,CAAC;IACvE,WAAW,EAAE,CAAC;SACX,MAAM,EAAE;SACR,GAAG,EAAE;SACL,GAAG,CAAC,CAAC,CAAC;SACN,GAAG,CAAC,kBAAkB,CAAC;SACvB,OAAO,CAAC,EAAE,CAAC;SACX,QAAQ,CAAC,qCAAqC,CAAC;IAClD,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,kBAAkB;AAClB,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC;KAC9B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;CACnB,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,eAAe;AACf,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC;KAC3B,MAAM,CAAC;IACN,GAAG,EAAE,SAAS;IACd,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;IACrC,aAAa,EAAE,kBAAkB;CAClC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,gBAAgB;AAChB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC;KAC7B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,
CAAC;KACD,MAAM,EAAE,CAAC"}

package/dist/schemas/tier2.d.ts CHANGED Viewed

@@ -60,9 +60,24 @@ export declare const InspectSignaturesSchema: z.ZodObject<{
     file_path: string;
     response_format?: import("../constants.js").ResponseFormat | undefined;
 }>;
+/** extract_tables — Tagged PDF Table → Markdown */
+export declare const ExtractTablesSchema: z.ZodObject<{
+    file_path: z.ZodString;
+    pages: z.ZodOptional<z.ZodString>;
+    response_format: z.ZodDefault<z.ZodNativeEnum<typeof import("../constants.js").ResponseFormat>>;
+}, "strict", z.ZodTypeAny, {
+    file_path: string;
+    response_format: import("../constants.js").ResponseFormat;
+    pages?: string | undefined;
+}, {
+    file_path: string;
+    response_format?: import("../constants.js").ResponseFormat | undefined;
+    pages?: string | undefined;
+}>;
 export type InspectStructureInput = z.infer<typeof InspectStructureSchema>;
 export type InspectTagsInput = z.infer<typeof InspectTagsSchema>;
 export type InspectFontsInput = z.infer<typeof InspectFontsSchema>;
 export type InspectAnnotationsInput = z.infer<typeof InspectAnnotationsSchema>;
 export type InspectSignaturesInput = z.infer<typeof InspectSignaturesSchema>;
+export type ExtractTablesInput = z.infer<typeof ExtractTablesSchema>;
 //# sourceMappingURL=tier2.d.ts.map

package/dist/schemas/tier2.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"tier2.d.ts","sourceRoot":"","sources":["../../src/schemas/tier2.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,wBAAwB;AACxB,eAAO,MAAM,sBAAsB;;;;;;;;;EAKxB,CAAC;AAEZ,mBAAmB;AACnB,eAAO,MAAM,iBAAiB;;;;;;;;;EAKnB,CAAC;AAEZ,oBAAoB;AACpB,eAAO,MAAM,kBAAkB;;;;;;;;;EAKpB,CAAC;AAEZ,0BAA0B;AAC1B,eAAO,MAAM,wBAAwB;;;;;;;;;;;;EAM1B,CAAC;AAEZ,yBAAyB;AACzB,eAAO,MAAM,uBAAuB;;;;;;;;;EAKzB,CAAC;AAGZ,MAAM,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAC3E,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AACjE,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,uBAAuB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,wBAAwB,CAAC,CAAC;AAC/E,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,uBAAuB,CAAC,CAAC"}
1	+ {"version":3,"file":"tier2.d.ts","sourceRoot":"","sources":["../../src/schemas/tier2.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,wBAAwB;AACxB,eAAO,MAAM,sBAAsB;;;;;;;;;EAKxB,CAAC;AAEZ,mBAAmB;AACnB,eAAO,MAAM,iBAAiB;;;;;;;;;EAKnB,CAAC;AAEZ,oBAAoB;AACpB,eAAO,MAAM,kBAAkB;;;;;;;;;EAKpB,CAAC;AAEZ,0BAA0B;AAC1B,eAAO,MAAM,wBAAwB;;;;;;;;;;;;EAM1B,CAAC;AAEZ,yBAAyB;AACzB,eAAO,MAAM,uBAAuB;;;;;;;;;EAKzB,CAAC;AAEZ,mDAAmD;AACnD,eAAO,MAAM,mBAAmB;;;;;;;;;;;;EAMrB,CAAC;AAGZ,MAAM,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAC3E,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AACjE,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,uBAAuB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,wBAAwB,CAAC,CAAC;AAC/E,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,uBAAuB,CAAC,CAAC;AAC7E,MAAM,MAAM,kBAAkB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC"}

package/dist/schemas/tier2.js CHANGED Viewed

@@ -39,4 +39,12 @@ export const InspectSignaturesSchema = z
     response_format: ResponseFormatSchema,
 })
     .strict();
+/** extract_tables — Tagged PDF Table → Markdown */
+export const ExtractTablesSchema = z
+    .object({
+    file_path: FilePathSchema,
+    pages: PagesSchema,
+    response_format: ResponseFormatSchema,
+})
+    .strict();
 //# sourceMappingURL=tier2.js.map

package/dist/schemas/tier2.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"tier2.js","sourceRoot":"","sources":["../../src/schemas/tier2.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAEhF,wBAAwB;AACxB,MAAM,CAAC,MAAM,sBAAsB,GAAG,CAAC;KACpC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,mBAAmB;AACnB,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC;KAC/B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,oBAAoB;AACpB,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,0BAA0B;AAC1B,MAAM,CAAC,MAAM,wBAAwB,GAAG,CAAC;KACtC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,yBAAyB;AACzB,MAAM,CAAC,MAAM,uBAAuB,GAAG,CAAC;KACrC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC"}
1	+ {"version":3,"file":"tier2.js","sourceRoot":"","sources":["../../src/schemas/tier2.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAEhF,wBAAwB;AACxB,MAAM,CAAC,MAAM,sBAAsB,GAAG,CAAC;KACpC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,mBAAmB;AACnB,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC;KAC/B,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,oBAAoB;AACpB,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC;KAChC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,0BAA0B;AAC1B,MAAM,CAAC,MAAM,wBAAwB,GAAG,CAAC;KACtC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,yBAAyB;AACzB,MAAM,CAAC,MAAM,uBAAuB,GAAG,CAAC;KACrC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,mDAAmD;AACnD,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC;KACjC,MAAM,CAAC;IACN,SAAS,EAAE,cAAc;IACzB,KAAK,EAAE,WAAW;IAClB,eAAe,EAAE,oBAAoB;CACtC,CAAC;KACD,MAAM,EAAE,CAAC"}

package/dist/services/pdfjs-service.d.ts CHANGED Viewed

@@ -4,7 +4,7 @@
  * Centralizes all pdfjs-dist interactions for reuse across tools.
  */
 import { type PDFDocumentProxy } from 'pdfjs-dist/legacy/build/pdf.mjs';
-import type { AnnotationsAnalysis, ImageExtractionResult, PageText, PdfMetadata, SearchMatch, TagsAnalysis } from '../types.js';
+import type { AnnotationsAnalysis, ImageExtractionResult, PageText, PdfMetadata, SearchMatch, TablesExtractionResult, TagsAnalysis } from '../types.js';
 /**
  * Load a PDF document from a file path.
  */
@@ -22,15 +22,26 @@ export declare function getMetadata(filePath: string): Promise<PdfMetadata>;
  * Does NOT destroy the document — caller is responsible for lifecycle.
  */
 export declare function getMetadataFromDoc(doc: PDFDocumentProxy, filePath: string): Promise<PdfMetadata>;
+/**
+ * Options for text extraction.
+ *
+ * `splitColumns` controls Issue #3 column-aware reordering. When `>= 2`,
+ * text items are bucketed into N equal-width columns by X-coordinate and
+ * concatenated left-to-right. `1` (default / undefined) preserves the
+ * existing single-column Y-sort behaviour.
+ */
+export interface ExtractTextOptions {
+    splitColumns?: number;
+}
 /**
  * Extract text from a pre-loaded PDFDocumentProxy.
  * Does NOT destroy the document — caller is responsible for lifecycle.
  */
-export declare function extractTextFromDoc(doc: PDFDocumentProxy, pages?: string): Promise<PageText[]>;
+export declare function extractTextFromDoc(doc: PDFDocumentProxy, pages?: string, options?: ExtractTextOptions): Promise<PageText[]>;
 /**
  * Extract text from specified pages (1-based).
  */
-export declare function extractText(filePath: string, pages?: string): Promise<PageText[]>;
+export declare function extractText(filePath: string, pages?: string, options?: ExtractTextOptions): Promise<PageText[]>;
 /**
  * Search for text across all pages.
  */
@@ -58,6 +69,27 @@ export declare function analyzeTagsFromDoc(doc: PDFDocumentProxy): Promise<TagsA
  * Analyze Tagged PDF structure tree.
  */
 export declare function analyzeTags(filePath: string): Promise<TagsAnalysis>;
+/**
+ * Extract tables from a Tagged PDF as structured rows/cells.
+ *
+ * The strategy is: for each page, walk the StructTree, identify `<Table>`
+ * subtrees, then walk down `<THead>/<TBody>/<TFoot>` → `<TR>` → `<TH>/<TD>`.
+ * Cell text is reconstructed by mapping each Span/P/Lbl/LBody leaf node's
+ * `id` (e.g. `p715R_mc4`) onto the corresponding `beginMarkedContentProps`
+ * boundary in `getTextContent({ includeMarkedContent: true })`.
+ *
+ * Untagged PDFs return `isTagged: false`, an empty `tables` array, and a `note`
+ * recommending column-aware extraction (the `splitColumns` text option) as
+ * the fallback for two-column layouts without a structure tree.
+ *
+ * Cell text is post-processed:
+ *   - Newlines (`hasEOL`) become single spaces.
+ *   - Repeated whitespace runs (including U+3000 fullwidth space) collapse to one.
+ *   - Per-character kerning spaces (e.g. `"消 費 税 法"`) are folded
+ *     by removing single ASCII spaces between two CJK characters.
+ */
+export declare function extractTablesFromDoc(doc: PDFDocumentProxy, pages?: string): Promise<TablesExtractionResult>;
+export declare function extractTables(filePath: string, pages?: string): Promise<TablesExtractionResult>;
 /**
  * Analyze annotations across all pages.
  */

package/dist/services/pdfjs-service.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"pdfjs-service.d.ts","sourceRoot":"","sources":["../../src/services/pdfjs-service.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAGL,KAAK,gBAAgB,EAEtB,MAAM,iCAAiC,CAAC;AAGzC,OAAO,KAAK,EAEV,mBAAmB,~~EAEnB~~,qBAAqB,EACrB,QAAQ,EACR,WAAW,EACX,WAAW,~~EAEX~~,YAAY,EACb,MAAM,aAAa,CAAC;AAWrB;;GAEG;AACH,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAI9E;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAGtF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAOxE;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,WAAW,CAAC,CA6BtB;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,KAAK,CAAC,EAAE,MAAM,~~GACb~~,OAAO,CAAC,QAAQ,EAAE,CAAC,CAarB;AAED;;GAEG;AACH,wBAAsB,WAAW,~~CAAC~~,QAAQ,EAAE,MAAM,~~EAAE~~,KAAK,CAAC,EAAE,MAAM,~~GAAG~~,OAAO,CAAC,QAAQ,EAAE,CAAC,~~CAQvF~~;AAED;;GAEG;AACH,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,YAAY,GAAE,MAA+B,EAC7C,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,WAAW,EAAE,CAAC,CAsDxB;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAmB/F;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAQnF;AAED;;;GAGG;AACH,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,qBAAqB,CAAC,CAoEhC;~~AAgGD~~;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAAC,YAAY,CAAC,CAmErF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAOzE;AAED;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,mBAAmB,CAAC,CAmG9B"}
1	+ {"version":3,"file":"pdfjs-service.d.ts","sourceRoot":"","sources":["../../src/services/pdfjs-service.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAGL,KAAK,gBAAgB,EAEtB,MAAM,iCAAiC,CAAC;AAGzC,OAAO,KAAK,EAEV,mBAAmB,EAGnB,qBAAqB,EACrB,QAAQ,EACR,WAAW,EACX,WAAW,EAGX,sBAAsB,EAEtB,YAAY,EACb,MAAM,aAAa,CAAC;AAWrB;;GAEG;AACH,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAI9E;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAGtF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAOxE;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,WAAW,CAAC,CA6BtB;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,kBAAkB;IACjC,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,GAAG,EAAE,gBAAgB,EACrB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,GAAE,kBAAuB,GAC/B,OAAO,CAAC,QAAQ,EAAE,CAAC,CAarB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,GAAE,kBAAuB,GAC/B,OAAO,CAAC,QAAQ,EAAE,CAAC,CAQrB;AAED;;GAEG;AACH,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,YAAY,GAAE,MAA+B,EAC7C,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,WAAW,EAAE,CAAC,CAsDxB;AAED;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAmB/F;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAQnF;AAED;;;GAGG;AACH,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,qBAAqB,CAAC,CAoEhC;AAyID;;;GAGG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAAC,YAAY,CAAC,CAmErF;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAOzE;AAID;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,oBAAoB,CACxC,GAAG,EAAE,gBAAgB,EACrB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,sBAAsB,CAAC,CA8CjC;AAED,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,sBAAsB,CAAC,CAOjC;AA2KD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,OAAO,CAAC,mBAAmB,CAAC,CAmG9B"}