hazo_pdf 1.6.7 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -0
- package/dist/{chunk-TJBBE34D.js → chunk-4JJOUQ62.js} +7 -6
- package/dist/{chunk-TJBBE34D.js.map → chunk-4JJOUQ62.js.map} +1 -1
- package/dist/chunk-KHB3VZJQ.js +157 -0
- package/dist/chunk-KHB3VZJQ.js.map +1 -0
- package/dist/index.d.ts +178 -1
- package/dist/index.js +1545 -3
- package/dist/index.js.map +1 -1
- package/dist/{pdf_viewer-KMV7W3DA.js → pdf_viewer-B6S5PJJB.js} +2 -2
- package/dist/server/index.d.ts +164 -1
- package/dist/server/index.js +439 -1
- package/dist/server/index.js.map +1 -1
- package/dist/server/text_search-2OZOVUIP.js +154 -0
- package/dist/server/text_search-2OZOVUIP.js.map +1 -0
- package/dist/styles/full.css +184 -0
- package/dist/styles/full.css.map +1 -1
- package/dist/styles/index.css +136 -0
- package/dist/styles/index.css.map +1 -1
- package/dist/text_search-I2KZ7DTW.js +11 -0
- package/package.json +7 -3
- package/dist/chunk-FXOJ3DPX.js +0 -71
- package/dist/chunk-FXOJ3DPX.js.map +0 -1
- package/dist/text_search-GW2VYMU6.js +0 -9
- /package/dist/{pdf_viewer-KMV7W3DA.js.map → pdf_viewer-B6S5PJJB.js.map} +0 -0
- /package/dist/{text_search-GW2VYMU6.js.map → text_search-I2KZ7DTW.js.map} +0 -0
package/README.md
CHANGED
|
@@ -21,6 +21,7 @@ A React component library for viewing and annotating PDF documents with support
|
|
|
21
21
|
- ☁️ **Remote Storage** - Load and save PDFs from Google Drive, Dropbox, or local storage (via hazo_files)
|
|
22
22
|
- 🖼️ **Dialog Component** - Ready-to-use modal dialog wrapper (`PdfViewerDialog`)
|
|
23
23
|
- 🔧 **Server Utilities** - Server-side extraction utilities via `hazo_pdf/server` entry point
|
|
24
|
+
- 🔎 **Text Snippet Extraction** - Server-side: find text in a PDF, highlight it, and return a cropped image snippet
|
|
24
25
|
|
|
25
26
|
## Installation
|
|
26
27
|
|
|
@@ -54,6 +55,14 @@ npm install hazo_llm_api
|
|
|
54
55
|
|
|
55
56
|
The `hazo_llm_api` package is an optional peer dependency. When installed, it enables server-side document data extraction via the `hazo_pdf/server` entry point. See [Server-Side Extraction](#server-side-extraction) for details.
|
|
56
57
|
|
|
58
|
+
**@napi-rs/canvas** (optional): For server-side text snippet extraction
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
npm install @napi-rs/canvas
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Required only for the `extract_text_snippet()` server utility. It is usually already installed as a transitive dependency of `pdfjs-dist`. See [Text Snippet Extraction](#text-snippet-extraction) for details.
|
|
65
|
+
|
|
57
66
|
## CSS Import Options
|
|
58
67
|
|
|
59
68
|
The library provides two CSS files to choose from:
|
|
@@ -2461,6 +2470,80 @@ interface ExtractDocumentResult {
|
|
|
2461
2470
|
|
|
2462
2471
|
---
|
|
2463
2472
|
|
|
2473
|
+
## Text Snippet Extraction
|
|
2474
|
+
|
|
2475
|
+
Extract a cropped image snippet from a PDF with the search text highlighted. Useful for attaching contextual evidence from documents.
|
|
2476
|
+
|
|
2477
|
+
### Basic Usage
|
|
2478
|
+
|
|
2479
|
+
```typescript
|
|
2480
|
+
import { extract_text_snippet } from 'hazo_pdf/server';
|
|
2481
|
+
|
|
2482
|
+
const result = await extract_text_snippet(
|
|
2483
|
+
{ file_path: '/path/to/document.pdf' },
|
|
2484
|
+
{
|
|
2485
|
+
search_text: 'invoice total',
|
|
2486
|
+
snippet_size: 'half', // 'full' | 'half' | 'quarter'
|
|
2487
|
+
match_mode: 'first', // 'first' | 'all'
|
|
2488
|
+
}
|
|
2489
|
+
);
|
|
2490
|
+
|
|
2491
|
+
if (result.success) {
|
|
2492
|
+
for (const snippet of result.snippets) {
|
|
2493
|
+
// snippet.image_buffer - PNG Buffer (for saving to file or API response)
|
|
2494
|
+
// snippet.image_base64 - Base64 PNG (for embedding in HTML/JSON)
|
|
2495
|
+
// snippet.page_index - Which page (0-based)
|
|
2496
|
+
// snippet.matches - Array of match positions
|
|
2497
|
+
// snippet.highlight_approximate - true for image-based PDFs
|
|
2498
|
+
}
|
|
2499
|
+
}
|
|
2500
|
+
```
|
|
2501
|
+
|
|
2502
|
+
### Snippet Sizes
|
|
2503
|
+
|
|
2504
|
+
All snippet sizes use the full page width. Only the height varies:
|
|
2505
|
+
|
|
2506
|
+
| Size | Dimensions |
|
|
2507
|
+
|------|-----------|
|
|
2508
|
+
| `full` | Full page (W x H) |
|
|
2509
|
+
| `half` | Full width, half height (W x H/2) |
|
|
2510
|
+
| `quarter` | Full width, quarter height (W x H/4) |
|
|
2511
|
+
|
|
2512
|
+
The snippet is centered on the matched text. If multiple matches are found on the same page, the snippet expands to cover all of them.
|
|
2513
|
+
|
|
2514
|
+
### Match Modes
|
|
2515
|
+
|
|
2516
|
+
| Mode | Behavior |
|
|
2517
|
+
|------|----------|
|
|
2518
|
+
| `first` | Returns a single snippet from the first page where text is found |
|
|
2519
|
+
| `all` | Returns one snippet per page that contains the text |
|
|
2520
|
+
|
|
2521
|
+
### Options
|
|
2522
|
+
|
|
2523
|
+
| Option | Type | Default | Description |
|
|
2524
|
+
|--------|------|---------|-------------|
|
|
2525
|
+
| `search_text` | `string` | Required | Text to search for |
|
|
2526
|
+
| `page_index` | `number` | `0` | Page to start searching from (0-based) |
|
|
2527
|
+
| `snippet_size` | `SnippetSize` | `'half'` | Crop size relative to page |
|
|
2528
|
+
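| `match_mode` | `SnippetMatchMode` | `'first'` | `'first'` returns a single snippet from the first page where the text is found; `'all'` returns one snippet per page that contains the text |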
|
|
2529
|
+
| `render_scale` | `number` | `2.0` | Rendering quality (higher = sharper, larger file) |
|
|
2530
|
+
| `highlight_color` | `string` | `'rgba(255, 255, 0, 0.35)'` | Highlight color (CSS color string) |
|
|
2531
|
+
| `use_llm_for_image_pdf` | `boolean` | `false` | Use LLM vision for image-based PDFs |
|
|
2532
|
+
|
|
2533
|
+
### Image-Based PDFs
|
|
2534
|
+
|
|
2535
|
+
For scanned/image PDFs where text cannot be extracted from the text layer:
|
|
2536
|
+
|
|
2537
|
+
- **Without LLM** (default): Returns full page as the snippet, no highlighting
|
|
2538
|
+
- **With LLM** (`use_llm_for_image_pdf: true`): Uses `hazo_llm_api` vision to find approximate text location and draw a rough highlight
|
|
2539
|
+
|
|
2540
|
+
### Requirements
|
|
2541
|
+
|
|
2542
|
+
- **`@napi-rs/canvas`**: Required for server-side canvas rendering. Usually already installed as a transitive dependency of `pdfjs-dist`.
|
|
2543
|
+
- **Server-only**: This utility runs in Node.js only (not in the browser).
|
|
2544
|
+
|
|
2545
|
+
---
|
|
2546
|
+
|
|
2464
2547
|
## Development
|
|
2465
2548
|
|
|
2466
2549
|
### Setup
|
|
@@ -2390,8 +2390,8 @@ async function load_pdf_config_async(config_file) {
|
|
|
2390
2390
|
return load_config_browser(config_file);
|
|
2391
2391
|
}
|
|
2392
2392
|
try {
|
|
2393
|
-
const {
|
|
2394
|
-
const hazo_config = new
|
|
2393
|
+
const { AppConfig } = __require("hazo_config");
|
|
2394
|
+
const hazo_config = new AppConfig({ filePath: config_file });
|
|
2395
2395
|
logger.debug(`Using hazo_config to load: ${config_file}`);
|
|
2396
2396
|
const get_value = (section, key) => {
|
|
2397
2397
|
return hazo_config.get(section, key);
|
|
@@ -3000,8 +3000,8 @@ function load_pdf_config(config_file) {
|
|
|
3000
3000
|
return default_config;
|
|
3001
3001
|
}
|
|
3002
3002
|
try {
|
|
3003
|
-
const {
|
|
3004
|
-
const hazo_config = new
|
|
3003
|
+
const { AppConfig } = __require("hazo_config");
|
|
3004
|
+
const hazo_config = new AppConfig({ filePath: config_file });
|
|
3005
3005
|
const get_value = (section, key) => {
|
|
3006
3006
|
return hazo_config.get(section, key);
|
|
3007
3007
|
};
|
|
@@ -5334,7 +5334,7 @@ ${suffix_line}`;
|
|
|
5334
5334
|
});
|
|
5335
5335
|
setAutoHighlightIds(/* @__PURE__ */ new Set());
|
|
5336
5336
|
const perform_auto_highlights = async () => {
|
|
5337
|
-
const { find_text_in_pdf } = await import("./text_search-GW2VYMU6.js");
|
|
5337
|
+
const { find_text_in_pdf } = await import("./text_search-I2KZ7DTW.js");
|
|
5338
5338
|
const new_ids = /* @__PURE__ */ new Set();
|
|
5339
5339
|
const auto_config = config_ref.current?.auto_highlight || default_config.auto_highlight;
|
|
5340
5340
|
const search_opts = {
|
|
@@ -6365,6 +6365,7 @@ PdfViewer.displayName = "PdfViewer";
|
|
|
6365
6365
|
var pdf_viewer_default = PdfViewer;
|
|
6366
6366
|
|
|
6367
6367
|
export {
|
|
6368
|
+
cn,
|
|
6368
6369
|
load_pdf_document,
|
|
6369
6370
|
create_coordinate_mapper,
|
|
6370
6371
|
get_viewport_dimensions,
|
|
@@ -6400,4 +6401,4 @@ export {
|
|
|
6400
6401
|
PdfViewer,
|
|
6401
6402
|
pdf_viewer_default
|
|
6402
6403
|
};
|
|
6403
|
-
//# sourceMappingURL=chunk-TJBBE34D.js.map
|
|
6404
|
+
//# sourceMappingURL=chunk-4JJOUQ62.js.map
|