pi-docparser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,17 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project follows semantic versioning.
6
+
7
+ ## [1.0.0] - 2026-03-20
8
+
9
+ Initial public release.
10
+
11
+ ### Added
12
+
13
+ - `document_parse` pi extension powered by LiteParse for PDFs, Office documents, spreadsheets, CSV files, and common images
14
+ - OCR support, text or JSON output, page targeting, and optional PDF screenshot extraction
15
+ - `/docparser-doctor` command for host dependency checks and guided setup hints
16
+ - `parse-document` skill, package docs, third-party notices, and preview assets
17
+ - Bun-based validation scripts for formatting with `oxfmt`, linting with `oxlint`, and TypeScript type checks
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 MaxedApps GmbH / Maximilian Schwarzmüller
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,219 @@
1
+ # pi-docparser
2
+
3
+ A standalone [pi](https://shittycodingagent.ai/) package that adds a `document_parse` tool plus a companion `parse-document` skill for extracting content from local documents.
4
+
5
+ It wraps [`@llamaindex/liteparse`](https://github.com/run-llama/liteparse) so pi can parse PDFs, Office documents, spreadsheets, CSV files, and common image formats through a dedicated tool instead of ad-hoc shell commands. It can also perform OCR.
6
+
7
+ When required host tools such as LibreOffice, ImageMagick, or Ghostscript are missing, the tool surfaces actionable install guidance instead of generic conversion failures and points users to `/docparser-doctor` for guided setup.
8
+
9
+ ## What this package provides
10
+
11
+ ### Extension
12
+
13
+ Registers a `document_parse` tool that can:
14
+
15
+ - parse local documents to `text` or `json`
16
+ - use OCR for scanned or image-based documents
17
+ - accept single-language OCR via `ocrLanguage` or multilingual OCR via `ocrLanguages`
18
+ - limit parsing to selected page ranges
19
+ - preserve layout alignment across page boundaries when needed
20
+ - generate PNG screenshots for PDF pages, including `screenshotPages: "all"`
21
+ - save full parsed output to temporary files for follow-up inspection with pi's `read` tool
22
+
23
+ ### Skill
24
+
25
+ Ships a `parse-document` skill that helps pi use the tool efficiently:
26
+
27
+ - prefer text output unless coordinates matter
28
+ - use OCR deliberately
29
+ - use screenshots only when visual layout matters
30
+ - keep large parsed outputs out of the model context until needed
31
+
32
+ ## Supported inputs
33
+
34
+ This package uses LiteParse and therefore supports the formats LiteParse supports locally, including:
35
+
36
+ - PDF
37
+ - DOC / DOCX / ODT / RTF
38
+ - PPT / PPTX / ODP
39
+ - XLS / XLSX / XLSM / ODS
40
+ - CSV / TSV
41
+ - PNG / JPG / JPEG / GIF / BMP / TIFF / WebP / SVG
42
+
43
+ Support for non-PDF formats may depend on host tools such as LibreOffice or ImageMagick. See [Host dependencies](#host-dependencies).
44
+
45
+ ## Requirements
46
+
47
+ - pi installed and working
48
+ - Node.js 18+
49
+ - local machine access to the files you want to parse
50
+
51
+ ## Installation
52
+
53
+ ### npm install
54
+
55
+ ```bash
56
+ pi install npm:pi-docparser
57
+ ```
58
+
59
+ ### Install from GitHub
60
+
61
+ ```bash
62
+ pi install git:github.com/maxedapps/pi-docparser
63
+ ```
64
+
65
+ ## Example model tool calls
66
+
67
+ These are representative `document_parse` calls pi may make internally, depending on the user's request:
68
+
69
+ ### 1) Extract plain text from a PDF
70
+
71
+ ```text
72
+ document_parse({
73
+ path: "./docs/contract.pdf"
74
+ })
75
+ ```
76
+
77
+ Useful when the user wants the document summarized, searched, or quoted and layout coordinates are not needed.
78
+
79
+ ### 2) OCR a scanned image or photo
80
+
81
+ ```text
82
+ document_parse({
83
+ path: "./scans/receipt.jpg",
84
+ ocr: "auto",
85
+ ocrLanguage: "eng"
86
+ })
87
+ ```
88
+
89
+ Useful when the source is an image or scanned document and text must be recognized first.
90
+
91
+ ### 3) Extract structured JSON and PDF screenshots for selected pages
92
+
93
+ ```text
94
+ document_parse({
95
+ path: "./reports/financial-report.pdf",
96
+ format: "json",
97
+ targetPages: "1-3",
98
+ screenshotPages: "1-2"
99
+ })
100
+ ```
101
+
102
+ Useful when the user cares about page layout, bounding boxes, or wants visual follow-up on specific PDF pages.
103
+
104
+ ## Tool behavior notes
105
+
106
+ ### OCR notes
107
+
108
+ LiteParse's built-in Tesseract OCR is used by default when OCR is enabled and no `ocrServerUrl` is provided.
109
+
110
+ Important details:
111
+
112
+ - the first OCR run may download Tesseract language/model data
113
+ - built-in Tesseract typically uses ISO 639-3 language codes such as `eng`, `deu`, `fra`, `jpn`
114
+ - many HTTP OCR servers instead expect ISO 639-1 codes such as `en`, `de`, `fr`, `ja`
115
+ - `ocrLanguages` is joined into a multilingual language string for built-in Tesseract
116
+ - when `ocrServerUrl` is used, only the first entry from `ocrLanguages` is forwarded
117
+
118
+ ### Screenshots
119
+
120
+ - screenshot rendering is PDF-only
121
+ - screenshot output is PNG-only in this package
122
+ - use `screenshotPages: "all"` to render every PDF page
123
+ - use page selections like `"1-3,8"` to limit screenshot work
124
+
125
+ ## Host dependencies
126
+
127
+ This package relies on LiteParse for local parsing and conversion. Depending on the input format, you may need additional host tools installed.
128
+
129
+ The tool performs a lightweight preflight check for the most common host dependencies and also forwards LiteParse's original error messages when conversion fails.
130
+
131
+ ### LibreOffice
132
+
133
+ Needed for many Office document and spreadsheet conversion paths.
134
+
135
+ Examples from LiteParse documentation:
136
+
137
+ ```bash
138
+ # macOS
139
+ brew install --cask libreoffice
140
+
141
+ # Ubuntu / Debian
142
+ apt-get install libreoffice
143
+
144
+ # Windows
145
+ choco install libreoffice-fresh
146
+ ```
147
+
148
+ ### ImageMagick
149
+
150
+ Needed for image-to-PDF conversion paths.
151
+
152
+ ```bash
153
+ # macOS
154
+ brew install imagemagick
155
+
156
+ # Ubuntu / Debian
157
+ apt-get install imagemagick
158
+
159
+ # Windows
160
+ choco install imagemagick.app
161
+ ```
162
+
163
+ ### Ghostscript
164
+
165
+ Some image or vector conversion paths may also require Ghostscript.
166
+
167
+ ## Doctor command
168
+
169
+ If parsing fails because a host dependency is missing, the extension points users to:
170
+
171
+ ```text
172
+ /docparser-doctor
173
+ ```
174
+
175
+ Run it inside pi to:
176
+
177
+ - detect the current operating system
178
+ - check whether LibreOffice, ImageMagick, and Ghostscript are available
179
+ - optionally focus the check on a specific file path
180
+ - suggest the most appropriate install commands for the current machine
181
+ - optionally attempt those install commands after user confirmation when that looks safe to automate
182
+
183
+ Examples:
184
+
185
+ ```text
186
+ /docparser-doctor
187
+ /docparser-doctor @./slides.pptx
188
+ ```
189
+
190
+ ## Known limitations
191
+
192
+ - screenshot rendering is PDF-only and PNG-only
193
+ - OCR quality depends on scan quality, page layout, and the chosen OCR language
194
+ - some conversion paths depend on external host tools
195
+ - output is written to temporary files, not directly into your repository
196
+
197
+ ## Third-party dependency: LiteParse
198
+
199
+ This package depends on:
200
+
201
+ - [`@llamaindex/liteparse`](https://github.com/run-llama/liteparse)
202
+ - license: Apache-2.0
203
+ - purpose: local document parsing, OCR, screenshots, and conversion support
204
+
205
+ LiteParse itself documents its own upstream dependencies and platform requirements. See:
206
+
207
+ - repository: https://github.com/run-llama/liteparse
208
+ - npm package: https://www.npmjs.com/package/@llamaindex/liteparse
209
+ - docs: https://developers.llamaindex.ai/liteparse/
210
+
211
+ Additional attribution details are listed in [THIRD_PARTY_NOTICES.md](./THIRD_PARTY_NOTICES.md).
212
+
213
+ ## Changelog
214
+
215
+ See [CHANGELOG.md](./CHANGELOG.md).
216
+
217
+ ## License
218
+
219
+ This package is licensed under the MIT License. See [LICENSE](./LICENSE).
package/THIRD_PARTY_NOTICES.md ADDED
@@ -0,0 +1,29 @@
1
+ # Third-Party Notices
2
+
3
+ ## LiteParse
4
+
5
+ This package depends on the following third-party library at runtime:
6
+
7
+ - **Package:** `@llamaindex/liteparse`
8
+ - **Version used by this package:** `1.0.0`
9
+ - **Repository:** https://github.com/run-llama/liteparse
10
+ - **License:** Apache-2.0
11
+ - **Local license copy:** [`./licenses/LiteParse-APACHE-2.0.txt`](./licenses/LiteParse-APACHE-2.0.txt)
12
+ - **Upstream license file:** https://github.com/run-llama/liteparse/blob/main/LICENSE
13
+
14
+ ### Usage in this package
15
+
16
+ `pi-docparser` uses LiteParse as an npm dependency to provide:
17
+
18
+ - local document parsing
19
+ - OCR support
20
+ - PDF screenshot generation
21
+ - conversion support for Office and image inputs
22
+
23
+ This package does **not** vendor LiteParse source code.
24
+ It relies on the installed npm dependency at runtime.
25
+
26
+ ### Upstream attribution
27
+
28
+ LiteParse is developed by LlamaIndex and distributed under the Apache License 2.0.
29
+ Please review the upstream repository and license for full details.
Binary file
@@ -0,0 +1,71 @@
1
+ import { cpus } from "node:os";
2
+
3
+ export const PREVIEW_MAX_LINES = 20; // Line cap for previews; presumably applied to parsed output shown inline - confirm at call sites.
4
+ export const PREVIEW_MAX_BYTES = 2 * 1024; // 2048 bytes (2 KiB); presumably the byte cap paired with PREVIEW_MAX_LINES - confirm at call sites.
5
+ export const DEFAULT_MAX_PAGES = 10000; // Default page limit. NOTE(review): how this is handed to the parser is not visible in this file.
6
+ export const DEFAULT_DPI = 150; // Default dots-per-inch value; presumably used for page rendering/screenshots - confirm at call sites.
7
+ export const DEFAULT_NUM_WORKERS = Math.max(1, cpus().length - 1); // One fewer than the number of CPU entries reported by os.cpus(), clamped so it is never below 1.
8
+ export const INSTALL_COMMAND_TIMEOUT_MS = 30 * 60 * 1000; // 30 minutes expressed in milliseconds (1,800,000).
9
+
10
+ export const OFFICE_EXTENSIONS = new Set([ // Dot-prefixed, lowercase extensions for word-processing and presentation files. NOTE(review): lookups presumably need a lowercased extension - confirm at call sites.
11
+ ".doc",
12
+ ".docx",
13
+ ".docm",
14
+ ".dot",
15
+ ".dotm",
16
+ ".dotx",
17
+ ".odt",
18
+ ".ott",
19
+ ".ppt",
20
+ ".pptx",
21
+ ".pptm",
22
+ ".pot",
23
+ ".potm",
24
+ ".potx",
25
+ ".odp",
26
+ ".otp",
27
+ ".rtf",
28
+ ".pages",
29
+ ".key",
30
+ ]);
31
+
32
+ export const SPREADSHEET_EXTENSIONS = new Set([ // Dot-prefixed, lowercase extensions for spreadsheet and delimited-text files (includes .csv and .tsv). NOTE(review): lookups presumably need a lowercased extension - confirm at call sites.
33
+ ".xls",
34
+ ".xlsx",
35
+ ".xlsm",
36
+ ".xlsb",
37
+ ".ods",
38
+ ".ots",
39
+ ".csv",
40
+ ".tsv",
41
+ ".numbers",
42
+ ]);
43
+
44
+ export const IMAGE_EXTENSIONS = new Set([ // Dot-prefixed, lowercase extensions for image inputs; the last four (.svg, .eps, .ps, .ai) are also listed in GHOSTSCRIPT_REQUIRED_EXTENSIONS.
45
+ ".jpg",
46
+ ".jpeg",
47
+ ".png",
48
+ ".gif",
49
+ ".bmp",
50
+ ".tiff",
51
+ ".tif",
52
+ ".webp",
53
+ ".svg",
54
+ ".eps",
55
+ ".ps",
56
+ ".ai",
57
+ ]);
58
+
59
+ export const GHOSTSCRIPT_REQUIRED_EXTENSIONS = new Set([".svg", ".eps", ".ps", ".ai"]); // Subset of IMAGE_EXTENSIONS; presumably the inputs whose conversion needs Ghostscript on the host - confirm at call sites.
60
+
61
+ export const LIBREOFFICE_MISSING_MESSAGE = // Text reporting a missing LibreOffice install, with per-OS install commands. NOTE(review): presumably compared against or surfaced alongside upstream errors - keep the wording exact and confirm at call sites.
62
+ "LibreOffice is not installed. Please install LibreOffice to convert office documents. On macOS: brew install --cask libreoffice, On Ubuntu: apt-get install libreoffice, On Windows: choco install libreoffice-fresh";
63
+
64
+ export const IMAGEMAGICK_MISSING_MESSAGE = // Text reporting a missing ImageMagick install, with per-OS install commands. NOTE(review): same caveat as LIBREOFFICE_MISSING_MESSAGE - confirm how it is consumed.
65
+ "ImageMagick is not installed. Please install ImageMagick to convert images. On macOS: brew install imagemagick, On Ubuntu: apt-get install imagemagick, On Windows: choco install imagemagick.app";
66
+
67
+ export const GHOSTSCRIPT_MISSING_MESSAGE = // Text reporting a missing Ghostscript install; contains a literal "%s" placeholder. NOTE(review): presumably the consumer substitutes the file type for "%s" - verify, since nothing in this file performs the substitution.
68
+ "Ghostscript is required to convert %s files but is not installed. On macOS: brew install ghostscript, On Ubuntu: apt-get install ghostscript, On Windows: choco install ghostscript";
69
+
70
+ export const DOCTOR_COMMAND_NAME = "docparser-doctor"; // Bare command name, without the leading slash.
71
+ export const DOCTOR_COMMAND = `/${DOCTOR_COMMAND_NAME}`; // Slash-prefixed form built from DOCTOR_COMMAND_NAME, i.e. "/docparser-doctor".