@happyvertical/documents 0.74.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENT.md +32 -0
- package/LICENSE +7 -0
- package/README.md +145 -0
- package/dist/cli/claude-context.d.ts +3 -0
- package/dist/cli/claude-context.d.ts.map +1 -0
- package/dist/cli/claude-context.js +21 -0
- package/dist/cli/claude-context.js.map +1 -0
- package/dist/document.d.ts +88 -0
- package/dist/document.d.ts.map +1 -0
- package/dist/factory.d.ts +40 -0
- package/dist/factory.d.ts.map +1 -0
- package/dist/index.d.ts +29 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +320 -0
- package/dist/index.js.map +1 -0
- package/dist/processors/pdf.d.ts +45 -0
- package/dist/processors/pdf.d.ts.map +1 -0
- package/dist/types.d.ts +199 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/utils.d.ts +25 -0
- package/dist/utils.d.ts.map +1 -0
- package/metadata.json +35 -0
- package/package.json +67 -0
package/AGENT.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# @happyvertical/documents
|
|
2
|
+
|
|
3
|
+
<!-- BEGIN AGENT:GENERATED -->
|
|
4
|
+
## Purpose
|
|
5
|
+
Multi-part document processing with support for PDF, HTML, and Markdown
|
|
6
|
+
|
|
7
|
+
## Package Map
|
|
8
|
+
- Package: `@happyvertical/documents`
|
|
9
|
+
- Hierarchy path: `@happyvertical/sdk > packages > documents`
|
|
10
|
+
- Workspace position: `8 of 30` local packages
|
|
11
|
+
- Internal dependencies: `@happyvertical/files`, `@happyvertical/utils`
|
|
12
|
+
- Internal dependents: none
|
|
13
|
+
- Knowledge graph files: `AGENT.md`, `metadata.json`, `ecosystem-manifest.json`
|
|
14
|
+
|
|
15
|
+
## Build & Test
|
|
16
|
+
```bash
|
|
17
|
+
pnpm --filter @happyvertical/documents build
|
|
18
|
+
pnpm --filter @happyvertical/documents test
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Agent Correction Loops
|
|
22
|
+
- If module resolution or export errors mention a workspace dependency, build the dependency first (`pnpm --filter @happyvertical/files build`, `pnpm --filter @happyvertical/utils build`) and then rerun `pnpm --filter @happyvertical/documents build`.
|
|
23
|
+
- If a change only affects runtime behavior, rerun `pnpm --filter @happyvertical/documents test` after rebuilding the package to confirm the failure is local.
|
|
24
|
+
- If failures span multiple packages or Turborepo ordering looks wrong, run `pnpm build` and `pnpm typecheck` from the repo root before retrying package-scoped commands.
|
|
25
|
+
|
|
26
|
+
## Ecosystem Relationships
|
|
27
|
+
- Provides: Multi-part document processing with support for PDF, HTML, and Markdown
|
|
28
|
+
- Implements: none
|
|
29
|
+
- Requires: @happyvertical/files, @happyvertical/utils, @happyvertical/ocr, @happyvertical/pdf, @happyvertical/spider, uuid
|
|
30
|
+
- Stability: stable (Primary package surface is described as implemented and production-oriented.)
|
|
31
|
+
<!-- END AGENT:GENERATED -->
|
|
32
|
+
|
package/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2025 Happy Vertical Corporation
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# @happyvertical/documents
|
|
2
|
+
|
|
3
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
4
|
+
|
|
5
|
+
Document processing with hierarchical structure. Currently supports PDF documents with text extraction, automatic document management system detection (WordPress Download Manager, CivicWeb, DocuShare), and file caching. Uses `@happyvertical/spider` for web page analysis and `@happyvertical/pdf` for PDF text extraction.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
Install from the public npm registry:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install @happyvertical/documents
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Anonymous installs also require the package's external `@happyvertical/ocr`, `@happyvertical/pdf`, and `@happyvertical/spider` dependencies to be available on public npm.
|
|
16
|
+
|
|
17
|
+
## Quick Start
|
|
18
|
+
|
|
19
|
+
```typescript
|
|
20
|
+
import { fetchDocument } from '@happyvertical/documents';
|
|
21
|
+
|
|
22
|
+
// Process a local PDF
|
|
23
|
+
const doc = await fetchDocument('file:///path/to/report.pdf');
|
|
24
|
+
|
|
25
|
+
for (const part of doc.parts) {
|
|
26
|
+
console.log(part.title);
|
|
27
|
+
console.log(part.content);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
// Fetch a remote PDF (auto-detected from URL extension)
|
|
31
|
+
const remote = await fetchDocument('https://example.com/report.pdf');
|
|
32
|
+
console.log(remote.parts[0].content);
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
### Document Management System Detection
|
|
38
|
+
|
|
39
|
+
When fetching web URLs, the package uses `@happyvertical/spider` to detect document management systems and extract direct PDF links:
|
|
40
|
+
|
|
41
|
+
```typescript
|
|
42
|
+
// WordPress Download Manager URL — spider detects the PDF link automatically
|
|
43
|
+
const doc = await fetchDocument(
|
|
44
|
+
'https://example.com/download/meeting-minutes/',
|
|
45
|
+
{ scraper: 'basic', spider: 'dom' }
|
|
46
|
+
);
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Override MIME Type
|
|
50
|
+
|
|
51
|
+
```typescript
|
|
52
|
+
const doc = await fetchDocument('https://example.com/download?id=123', {
|
|
53
|
+
type: 'application/pdf',
|
|
54
|
+
});
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Cache Control
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
const doc = await fetchDocument('https://example.com/report.pdf', {
|
|
61
|
+
cacheDir: './my-cache',
|
|
62
|
+
cache: true,
|
|
63
|
+
cacheExpiry: 600_000, // 10 minutes
|
|
64
|
+
});
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## API Reference
|
|
68
|
+
|
|
69
|
+
### `fetchDocument(url, options?)`
|
|
70
|
+
|
|
71
|
+
Main factory function. Detects document format, selects the appropriate processor, and returns structured content.
|
|
72
|
+
|
|
73
|
+
- **url** `string` — Document URL or file path (`file://`, `http://`, `https://`)
|
|
74
|
+
- **options** `FetchDocumentOptions` — See below
|
|
75
|
+
- **Returns** `Promise<Document>`
|
|
76
|
+
- **Throws** if no processor is available for the detected MIME type
|
|
77
|
+
|
|
78
|
+
### `FetchDocumentOptions`
|
|
79
|
+
|
|
80
|
+
| Option | Type | Default | Description |
|
|
81
|
+
|--------|------|---------|-------------|
|
|
82
|
+
| `type` | `string` | auto-detected | Override MIME type detection |
|
|
83
|
+
| `extractImages` | `boolean` | `true` | Extract images from document (stub — currently returns `[]`) |
|
|
84
|
+
| `runOcr` | `boolean` | `true` for PDFs | Run OCR on extracted images (stub) |
|
|
85
|
+
| `cacheDir` | `string` | OS temp dir | Directory for caching downloaded files |
|
|
86
|
+
| `cache` | `boolean` | `true` | Enable/disable spider fetch caching |
|
|
87
|
+
| `cacheExpiry` | `number` | `300000` | Cache expiry in milliseconds |
|
|
88
|
+
| `scraper` | `'basic' \| 'crawlee'` | `'basic'` | Scraper type for content extraction |
|
|
89
|
+
| `spider` | `'simple' \| 'dom' \| 'crawlee'` | `'dom'` | Spider adapter for fetching web pages |
|
|
90
|
+
| `headers` | `Record<string, string>` | — | Custom HTTP headers for spider requests |
|
|
91
|
+
| `timeout` | `number` | `30000` | Request timeout in milliseconds |
|
|
92
|
+
| `maxDuration` | `number` | — | Max scraping time in milliseconds |
|
|
93
|
+
| `maxInteractions` | `number` | — | Max interactions for advanced scrapers |
|
|
94
|
+
|
|
95
|
+
### `Document` (class)
|
|
96
|
+
|
|
97
|
+
Base document handler. Manages downloading, caching, and local file path resolution. Used internally by processors; can also be used directly via `Document.create(url, options)`.
|
|
98
|
+
|
|
99
|
+
### `PDFProcessor`
|
|
100
|
+
|
|
101
|
+
Implements `DocumentProcessor`. Extracts text from PDF files, validates PDF headers (detects HTML cache poisoning), and caches processed results.
|
|
102
|
+
|
|
103
|
+
### `getTitleFromUrl(url, defaultTitle?)`
|
|
104
|
+
|
|
105
|
+
Extracts a human-readable title from a URL by parsing the filename, removing extensions, and decoding URL-encoded characters.
|
|
106
|
+
|
|
107
|
+
### Types
|
|
108
|
+
|
|
109
|
+
```typescript
|
|
110
|
+
interface Document {
|
|
111
|
+
url: string;
|
|
112
|
+
type: string;
|
|
113
|
+
parts: DocumentPart[];
|
|
114
|
+
metadata?: Record<string, any>;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
interface DocumentPart {
|
|
118
|
+
id: string;
|
|
119
|
+
title: string;
|
|
120
|
+
content: string;
|
|
121
|
+
type: 'text' | 'html' | 'markdown';
|
|
122
|
+
images?: DocumentImage[];
|
|
123
|
+
metadata?: Record<string, any>;
|
|
124
|
+
parts?: DocumentPart[];
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
interface DocumentImage {
|
|
128
|
+
id: string;
|
|
129
|
+
url: string;
|
|
130
|
+
localPath?: string;
|
|
131
|
+
altText?: string;
|
|
132
|
+
ocrText?: string;
|
|
133
|
+
position?: number;
|
|
134
|
+
metadata?: { width?: number; height?: number; format?: string };
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
interface DocumentProcessor {
|
|
138
|
+
process(url: string, options?: FetchDocumentOptions): Promise<Document>;
|
|
139
|
+
supports(type: string): boolean;
|
|
140
|
+
}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claude-context.d.ts","sourceRoot":"","sources":["../../src/cli/claude-context.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { existsSync, mkdirSync, copyFileSync } from "node:fs";
|
|
3
|
+
import { dirname, join } from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
// Locate the package root (two directories above this script) and the
// `.claude` directory inside the directory the command was started from.
const scriptDir = dirname(fileURLToPath(import.meta.url));
const packageRoot = join(scriptDir, "../..");
const claudeDir = join(process.cwd(), ".claude");
const pkgName = "documents";

// Returns the path of `preferred` inside the package root when that file
// exists, otherwise the path of the legacy `fallback` file name.
const pickSource = (preferred, fallback) => {
  const preferredPath = join(packageRoot, preferred);
  return existsSync(preferredPath) ? preferredPath : join(packageRoot, fallback);
};

// Copies `source` into the `.claude` directory under `targetName`,
// silently skipping sources that are not present.
const copyIfPresent = (source, targetName) => {
  if (existsSync(source)) {
    copyFileSync(source, join(claudeDir, targetName));
  }
};

if (!existsSync(claudeDir)) {
  mkdirSync(claudeDir, { recursive: true });
}

copyIfPresent(pickSource("AGENT.md", "CLAUDE.md"), `have-${pkgName}.md`);
copyIfPresent(
  pickSource("metadata.json", ".claude-meta.json"),
  `have-${pkgName}.meta.json`
);

console.log(`✓ Installed @happyvertical/${pkgName} context to .claude/`);
|
|
21
|
+
//# sourceMappingURL=claude-context.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claude-context.js","sources":["../../src/cli/claude-context.ts"],"sourcesContent":["#!/usr/bin/env node\n/**\n * CLI script to install agent context for @happyvertical/documents\n * Run the published context installer binary for this package.\n */\nimport { copyFileSync, existsSync, mkdirSync } from 'node:fs';\nimport { dirname, join } from 'node:path';\nimport { fileURLToPath } from 'node:url';\n\nconst Dirname = dirname(fileURLToPath(import.meta.url));\nconst pkgRoot = join(Dirname, '../..');\nconst targetDir = join(process.cwd(), '.claude');\n\nif (!existsSync(targetDir)) {\n mkdirSync(targetDir, { recursive: true });\n}\n\nconst pkgName = 'documents';\nconst agentMdSrc = existsSync(join(pkgRoot, 'AGENT.md'))\n ? join(pkgRoot, 'AGENT.md')\n : join(pkgRoot, 'CLAUDE.md');\nconst metaSrc = existsSync(join(pkgRoot, 'metadata.json'))\n ? join(pkgRoot, 'metadata.json')\n : join(pkgRoot, '.claude-meta.json');\n\nif (existsSync(agentMdSrc)) {\n copyFileSync(agentMdSrc, join(targetDir, `have-${pkgName}.md`));\n}\n\nif (existsSync(metaSrc)) {\n copyFileSync(metaSrc, join(targetDir, `have-${pkgName}.meta.json`));\n}\n\nconsole.log(`✓ Installed @happyvertical/${pkgName} context to .claude/`);\n"],"names":[],"mappings":";;;;AASA,MAAM,UAAU,QAAQ,cAAc,YAAY,GAAG,CAAC;AACtD,MAAM,UAAU,KAAK,SAAS,OAAO;AACrC,MAAM,YAAY,KAAK,QAAQ,IAAA,GAAO,SAAS;AAE/C,IAAI,CAAC,WAAW,SAAS,GAAG;AAC1B,YAAU,WAAW,EAAE,WAAW,KAAA,CAAM;AAC1C;AAEA,MAAM,UAAU;AAChB,MAAM,aAAa,WAAW,KAAK,SAAS,UAAU,CAAC,IACnD,KAAK,SAAS,UAAU,IACxB,KAAK,SAAS,WAAW;AAC7B,MAAM,UAAU,WAAW,KAAK,SAAS,eAAe,CAAC,IACrD,KAAK,SAAS,eAAe,IAC7B,KAAK,SAAS,mBAAmB;AAErC,IAAI,WAAW,UAAU,GAAG;AAC1B,eAAa,YAAY,KAAK,WAAW,QAAQ,OAAO,KAAK,CAAC;AAChE;AAEA,IAAI,WAAW,OAAO,GAAG;AACvB,eAAa,SAAS,KAAK,WAAW,QAAQ,OAAO,YAAY,CAAC;AACpE;AAEA,QAAQ,IAAI,8BAA8B,OAAO,sBAAsB;"}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { URL } from 'node:url';
|
|
2
|
+
import { DocumentPart, Document as DocumentType, FetchDocumentOptions } from './types';
|
|
3
|
+
/**
 * Base handler for a single document.
 *
 * Resolves where the document lives on disk, downloads remote documents into
 * a cache directory, and holds the hierarchical parts and metadata produced
 * for it. Format-specific extraction is performed by separate processors.
 */
export declare class Document {
    /** `true` when the document is addressed by an HTTP(S) URL. */
    protected isRemote: boolean;
    /** Options supplied when the instance was constructed. */
    protected options: FetchDocumentOptions;
    /** Backing field for {@link Document.localPath}. */
    private _localPath;
    /** Backing field for {@link Document.cacheDir}. */
    private _cacheDir;
    /** Parsed document URL. */
    url: URL;
    /** MIME type of the document. */
    type: string;
    /** Parts of the document; parts may themselves contain nested parts. */
    parts: DocumentPart[];
    /** Metadata that applies to the document as a whole. */
    metadata: Record<string, any>;
    /** Path on the local file system where the document is (or will be) stored. */
    get localPath(): string;
    /** Directory in which downloaded files are cached. */
    get cacheDir(): string;
    /**
     * Builds a Document without performing any I/O.
     *
     * @param url - Absolute document URL (`file://`, `http://` or `https://`)
     * @param options - Document configuration options
     */
    constructor(url: string, options?: FetchDocumentOptions);
    /**
     * Constructs a Document and runs {@link Document.initialize} on it.
     *
     * @param url - Absolute document URL (`file://`, `http://` or `https://`)
     * @param options - Document configuration options
     * @returns Promise resolving to the initialized Document
     */
    static create(url: string, options?: FetchDocumentOptions): Promise<Document>;
    /**
     * Downloads the document into the cache when it is remote; local
     * documents need no preparation.
     *
     * @returns Promise that resolves once the document is available locally
     */
    initialize(): Promise<void>;
    /**
     * Reports whether the document can be read directly as text, judged by
     * its MIME type or the extension of its local path.
     *
     * @returns `true` for text-based documents
     */
    isTextFile(): boolean;
    /**
     * Produces the plain-object representation of this document.
     *
     * @returns Object carrying the URL string, type, parts and metadata
     */
    toDocument(): DocumentType;
}
|
|
87
|
+
export default Document;
|
|
88
|
+
//# sourceMappingURL=document.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../src/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAG/B,OAAO,KAAK,EACV,YAAY,EACZ,QAAQ,IAAI,YAAY,EACxB,oBAAoB,EACrB,MAAM,SAAS,CAAC;AAEjB;;;;;;GAMG;AACH,qBAAa,QAAQ;IACnB;;OAEG;IACH,SAAS,CAAC,QAAQ,UAAS;IAE3B;;OAEG;IACH,SAAS,CAAC,OAAO,EAAE,oBAAoB,CAAC;IAExC;;OAEG;IACH,OAAO,CAAC,UAAU,CAAM;IAExB;;OAEG;IACH,OAAO,CAAC,SAAS,CAAM;IAEvB;;OAEG;IACI,GAAG,EAAE,GAAG,CAAC;IAEhB;;OAEG;IACI,IAAI,EAAE,MAAM,CAAC;IAEpB;;OAEG;IACI,KAAK,EAAE,YAAY,EAAE,CAAM;IAElC;;OAEG;IACI,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAM;IAE1C;;OAEG;IACH,IAAW,SAAS,IAAI,MAAM,CAE7B;IAED;;OAEG;IACH,IAAW,QAAQ,IAAI,MAAM,CAE5B;IAED;;;;;OAKG;gBACS,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB;IA+C3D;;;;;;;;OAQG;WACU,MAAM,CACjB,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,oBAAyB,GACjC,OAAO,CAAC,QAAQ,CAAC;IAMpB;;;;OAIG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IASjC;;;;OAIG;IACI,UAAU,IAAI,OAAO;IAwB5B;;;;OAIG;IACI,UAAU,IAAI,YAAY;CAQlC;AAED,eAAe,QAAQ,CAAC"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { Document, FetchDocumentOptions } from './types';
|
|
2
|
+
/**
 * Fetch a document and turn it into structured parts.
 *
 * The factory:
 * 1. For web URLs without an explicit `options.type`, analyses the page with
 *    `@happyvertical/spider` and follows a detected direct PDF link
 * 2. Determines the document type from `options.type` or the URL
 * 3. Picks the first registered processor that supports that type
 * 4. Returns the Document produced by that processor
 *
 * Only a PDF processor is registered at present, and image extraction is a
 * stub, so `part.images` is currently empty even when requested.
 *
 * @param url - Document URL or file path (file://, http://, https://)
 * @param options - Fetch and processing options
 * @returns Promise resolving to structured Document
 * @throws Error when no processor supports the detected document type
 *
 * @example
 * ```typescript
 * // Fetch a PDF, requesting image extraction and OCR
 * const doc = await fetchDocument('https://example.com/report.pdf', {
 *   extractImages: true,
 *   runOcr: true
 * });
 *
 * // Walk the document parts
 * for (const part of doc.parts) {
 *   console.log(part.title);
 *   console.log(part.content);
 *
 *   // Images are optional on every part
 *   if (part.images) {
 *     for (const image of part.images) {
 *       console.log(image.url);
 *       console.log(image.ocrText); // Text extracted via OCR
 *     }
 *   }
 * }
 * ```
 */
export declare function fetchDocument(url: string, options?: FetchDocumentOptions): Promise<Document>;
|
|
39
|
+
export default fetchDocument;
|
|
40
|
+
//# sourceMappingURL=factory.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"factory.d.ts","sourceRoot":"","sources":["../src/factory.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,SAAS,CAAC;AAO9D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AACH,wBAAsB,aAAa,CACjC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,oBAAyB,GACjC,OAAO,CAAC,QAAQ,CAAC,CAyEnB;AAED,eAAe,aAAa,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @happyvertical/documents - Document processing with multi-part structure
|
|
3
|
+
*
|
|
4
|
+
* Provides document processing for PDFs with support for:
|
|
5
|
+
* - Hierarchical document parts
|
|
6
|
+
* - Automatic format detection from URL or MIME type
|
|
7
|
+
* - Document management system detection (WordPress, CivicWeb, DocuShare)
|
|
8
|
+
* - File caching for performance
|
|
9
|
+
*
|
|
10
|
+
* @example
|
|
11
|
+
* ```typescript
|
|
12
|
+
* import { fetchDocument } from '@happyvertical/documents';
|
|
13
|
+
*
|
|
14
|
+
* const doc = await fetchDocument('https://example.com/report.pdf');
|
|
15
|
+
*
|
|
16
|
+
* for (const part of doc.parts) {
|
|
17
|
+
* console.log(part.title);
|
|
18
|
+
* console.log(part.content);
|
|
19
|
+
* }
|
|
20
|
+
* ```
|
|
21
|
+
*/
|
|
22
|
+
export { Document } from './document';
|
|
23
|
+
export { fetchDocument } from './factory';
|
|
24
|
+
export { PDFProcessor } from './processors/pdf';
|
|
25
|
+
export type { Document as DocumentType, DocumentImage, DocumentPart, DocumentProcessor, FetchDocumentOptions, } from './types';
|
|
26
|
+
export { getTitleFromUrl } from './utils';
|
|
27
|
+
/** @internal */
|
|
28
|
+
export declare const PACKAGE_VERSION_INITIALIZED = true;
|
|
29
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAGH,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAEtC,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAG1C,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAEhD,YAAY,EACV,QAAQ,IAAI,YAAY,EACxB,aAAa,EACb,YAAY,EACZ,iBAAiB,EACjB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAEjB,OAAO,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE1C,gBAAgB;AAChB,eAAO,MAAM,2BAA2B,OAAO,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
import os from "node:os";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { URL as URL$1, fileURLToPath } from "node:url";
|
|
4
|
+
import { getMimeType, downloadFileWithCache, getCached, setCached } from "@happyvertical/files";
|
|
5
|
+
import { makeSlug } from "@happyvertical/utils";
|
|
6
|
+
import { scrapeDocument } from "@happyvertical/spider";
|
|
7
|
+
import { promises } from "node:fs";
|
|
8
|
+
import { getPDFReader } from "@happyvertical/pdf";
|
|
9
|
+
import { v4 } from "uuid";
|
|
10
|
+
/**
 * Base document handler with multi-part support.
 *
 * Resolves the on-disk location of a document, downloads remote documents
 * into a cache directory, and carries the parts/metadata produced for it.
 */
class Document {
  /**
   * Flag indicating if document is from a remote (HTTP/HTTPS) source
   */
  isRemote = false;
  /**
   * Configuration options (stored by reference, not copied)
   */
  options;
  /**
   * Local file path where document is stored
   */
  _localPath = "";
  /**
   * Directory used for caching files
   */
  _cacheDir = "";
  /**
   * Document URL (parsed)
   */
  url;
  /**
   * Document MIME type
   */
  type;
  /**
   * Document parts (hierarchical structure)
   */
  parts = [];
  /**
   * Document-level metadata
   */
  metadata = {};
  /**
   * Get the local file path where document is stored
   */
  get localPath() {
    return this._localPath;
  }
  /**
   * Get the directory used for caching files
   */
  get cacheDir() {
    return this._cacheDir;
  }
  /**
   * Creates a new Document instance. Performs no I/O.
   *
   * @param url - Absolute document URL (`file:`, `http:` or `https:`)
   * @param options - Document configuration options
   * @throws TypeError if `url` cannot be parsed as an absolute URL
   */
  constructor(url, options = {}) {
    this.url = new URL$1(url);
    this.options = options;
    // An explicit type wins; otherwise guess from the URL, then fall back to plain text.
    this.type = options.type || getMimeType(this.url.toString()) || "text/plain";
    this._cacheDir = options.cacheDir || path.resolve(os.tmpdir(), ".cache", "have-sdk", "documents");
    if (this.url.protocol.startsWith("file")) {
      // fileURLToPath handles platform specifics (Windows drive letters, UNC
      // hosts) that a decoded `pathname` gets wrong. If it rejects the URL,
      // keep the previous behaviour so such inputs still resolve as before.
      try {
        this._localPath = fileURLToPath(this.url);
      } catch {
        this._localPath = decodeURIComponent(this.url.pathname);
      }
      this.isRemote = false;
    } else if (this.url.protocol.startsWith("http")) {
      let pathname = this.url.pathname;
      if (pathname.endsWith("/")) {
        pathname = pathname.slice(0, -1);
      }
      // Extension-less PDF URLs get a ".pdf" suffix for the cached file name.
      const hasExtension = /\.[a-z0-9]+$/i.test(pathname);
      if (!hasExtension && this.type === "application/pdf") {
        pathname += ".pdf";
      }
      // NOTE(review): the cache path is built from host and pathname only, so
      // URLs that differ just in their query string share one local file —
      // confirm that downloadFileWithCache accounts for this.
      this._localPath = path.join(
        this._cacheDir,
        makeSlug(this.url.hostname),
        pathname
      );
      this.isRemote = true;
    }
  }
  /**
   * Creates and initializes a Document instance
   *
   * Downloads remote files and prepares the document for processing.
   *
   * @param url - Document URL or file path
   * @param options - Document configuration options
   * @returns Promise resolving to the initialized Document
   */
  static async create(url, options = {}) {
    const document = new Document(url, options);
    await document.initialize();
    return document;
  }
  /**
   * Initializes the document, downloading it if it's remote.
   * Local documents require no work.
   *
   * @returns Promise that resolves when initialization is complete
   * @throws Error if the document is remote but its URL has been cleared
   */
  async initialize() {
    if (!this.isRemote) {
      return;
    }
    if (!this.url) {
      throw new Error("Cannot initialize remote document: URL is required");
    }
    await downloadFileWithCache(this.url.toString(), this._localPath);
  }
  /**
   * Checks if the document is a text-based file that can be read directly,
   * based on its MIME type or the extension of its local path.
   *
   * @returns Boolean indicating if the file is text-based
   */
  isTextFile() {
    if (!this.type) return false;
    const textMimeTypes = [
      "application/json",
      "application/xml",
      "application/javascript",
      "application/typescript"
    ];
    if (this.type.startsWith("text/") || textMimeTypes.includes(this.type)) {
      return true;
    }
    const textExtensions = [
      ".txt",
      ".md",
      ".json",
      ".xml",
      ".html",
      ".css",
      ".js",
      ".ts",
      ".yaml",
      ".yml"
    ];
    const lowerPath = this.localPath.toLowerCase();
    return textExtensions.some((ext) => lowerPath.endsWith(ext));
  }
  /**
   * Converts the document to the standard Document interface
   *
   * @returns Document object with URL, type, parts, and metadata
   */
  toDocument() {
    return {
      url: this.url.toString(),
      type: this.type,
      parts: this.parts,
      metadata: this.metadata
    };
  }
}
|
|
148
|
+
/**
 * Derives a human-readable title from the last path segment of a URL.
 *
 * The segment is URL-decoded, a trailing `.pdf`/`.htm`/`.html`/`.md`/`.txt`
 * extension is stripped, and dashes/underscores become spaces.
 *
 * @param url - Absolute URL to derive the title from
 * @param defaultTitle - Title returned when no usable name can be derived
 * @returns The derived title, or `defaultTitle` when the URL is invalid, has
 *   no final path segment, or the cleaned-up name is empty
 */
function getTitleFromUrl(url, defaultTitle = "Document") {
  try {
    const { pathname } = new URL(url);
    const filename = pathname.split("/").pop() || defaultTitle;
    const title = decodeURIComponent(filename)
      .replace(/\.(pdf|html?|md|txt)$/i, "")
      .replace(/[-_]/g, " ")
      .trim();
    // Names such as ".pdf" or "---.pdf" clean up to nothing; use the default
    // instead of returning an empty title.
    return title || defaultTitle;
  } catch {
    // Invalid URL or malformed percent-encoding.
    return defaultTitle;
  }
}
|
|
159
|
+
/**
 * Document processor for PDF files: validates the file header, extracts the
 * text and caches the processed result.
 */
class PDFProcessor {
  /**
   * Check if this processor supports the given MIME type or extension.
   * Accepts `'application/pdf'`, `'.pdf'`, or `'pdf'` (case-insensitive).
   *
   * @param type - MIME type or file extension to check
   * @returns `true` if this processor can handle the given type
   */
  supports(type) {
    if (typeof type !== "string") {
      return false;
    }
    // Normalise once so every comparison is case-insensitive, as documented.
    const normalized = type.trim().toLowerCase();
    return normalized === "application/pdf" || normalized.endsWith(".pdf") || normalized === "pdf";
  }
  /**
   * Process a PDF document
   *
   * Extracts text and optionally images/OCR from the PDF, structuring
   * it into hierarchical document parts. A previously cached result for the
   * same local path is returned when it is available and well-formed.
   *
   * @param url - PDF URL or file path
   * @param options - Processing options
   * @returns Promise resolving to structured Document
   * @throws Error if the file does not start with the `%PDF-` magic bytes
   */
  async process(url, options = {}) {
    const baseDoc = await Document.create(url, options);
    const cacheKey = `${baseDoc.localPath}.processed_pdf`;
    const cached = await getCached(cacheKey);
    if (cached) {
      try {
        const parsed = JSON.parse(cached);
        // Only trust cache entries that actually carry a parts array.
        if (Array.isArray(parsed?.parts)) {
          return {
            url: baseDoc.url.toString(),
            type: baseDoc.type,
            parts: parsed.parts,
            metadata: parsed.metadata || {}
          };
        }
        console.warn("Cached PDF data corrupted, reprocessing");
      } catch (error) {
        console.warn("Cached PDF data corrupted, reprocessing", error);
      }
    }
    const fileBuffer = await promises.readFile(baseDoc.localPath);
    const header = fileBuffer.subarray(0, 5).toString("utf-8");
    if (header !== "%PDF-") {
      // Only files downloaded into the cache may be deleted. For `file:`
      // documents `localPath` is the caller's own file and must be kept.
      const isCachedDownload = baseDoc.url.protocol.startsWith("http");
      let removed = false;
      if (isCachedDownload) {
        try {
          await promises.unlink(baseDoc.localPath);
          removed = true;
        } catch (unlinkError) {
          console.warn(
            `Failed to delete poisoned cache file: ${baseDoc.localPath}`,
            unlinkError
          );
        }
      }
      // Sniff the first bytes to tell an HTML response apart from other junk.
      const sample = fileBuffer.toString("utf-8", 0, Math.min(1e3, fileBuffer.length)).toLowerCase();
      const looksLikeHtml = sample.includes("<!doctype html") || sample.includes("<html");
      let cleanupNote;
      if (!isCachedDownload) {
        cleanupNote = "The local file was left untouched.";
      } else if (removed) {
        cleanupNote = looksLikeHtml ? "The poisoned cache file has been removed - please try again." : "The invalid cache file has been removed - please try again.";
      } else {
        cleanupNote = `The cache file could not be removed - delete ${baseDoc.localPath} manually and try again.`;
      }
      if (looksLikeHtml) {
        throw new Error(
          `Downloaded file is HTML, not PDF. The server returned HTML content for ${url}. This commonly occurs with WordPress Download Manager URLs that return tracking pages. Expected PDF magic bytes (%PDF-) but got: ${header}. ${cleanupNote}`
        );
      }
      throw new Error(
        `Downloaded file is not a valid PDF. Expected %PDF- magic bytes but got: ${header}. ${cleanupNote}`
      );
    }
    const reader = await getPDFReader();
    const extractedText = await reader.extractText(baseDoc.localPath);
    const mainPart = {
      id: v4(),
      title: getTitleFromUrl(url, "PDF Document"),
      content: extractedText || "",
      type: "text",
      metadata: {
        source: "pdf",
        filePath: baseDoc.localPath
      }
    };
    // Images are extracted only on explicit request; OCR runs unless disabled.
    if (options.extractImages === true) {
      mainPart.images = await this.extractImages(
        baseDoc.localPath,
        options.runOcr !== false
      );
    }
    const document = {
      url: baseDoc.url.toString(),
      type: baseDoc.type,
      parts: [mainPart],
      metadata: {
        processor: "pdf",
        extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
        hasImages: (mainPart.images?.length || 0) > 0
      }
    };
    await setCached(cacheKey, JSON.stringify(document));
    return document;
  }
  /**
   * Extract images from PDF
   *
   * This is a placeholder for future image extraction functionality and
   * currently always resolves to an empty array.
   *
   * @param _filePath - Local PDF file path (unused)
   * @param _runOcr - Whether to run OCR on extracted images (unused)
   * @returns Promise resolving to array of DocumentImages
   */
  async extractImages(_filePath, _runOcr) {
    return [];
  }
}
|
|
268
|
+
// Registered document processors, consulted in order.
const processors = [new PDFProcessor()];
/**
 * Fetch a document from a URL and process it into structured parts.
 *
 * For HTTP(S) URLs without an explicit `options.type`, the page is first
 * analysed with the spider; when it reports a direct PDF link from a known
 * document management system, that link is fetched as a PDF instead. The
 * caller's `options` object is never modified.
 *
 * @param url - Document URL or file path (file://, http://, https://)
 * @param options - Fetch and processing options
 * @returns Promise resolving to the structured Document
 * @throws Error if no registered processor supports the detected type
 */
async function fetchDocument(url, options = {}) {
  let targetUrl = url;
  // Replaced by a copy if a type override is needed, so the caller's object
  // stays untouched and can be reused for other URLs.
  let effectiveOptions = options;
  const isWebUrl = targetUrl.startsWith("http://") || targetUrl.startsWith("https://");
  if (isWebUrl && !options.type) {
    try {
      const scraped = await scrapeDocument(targetUrl, {
        scraper: options.scraper || "basic",
        spider: options.spider || "dom",
        cache: options.cache,
        cacheExpiry: options.cacheExpiry,
        headers: options.headers,
        timeout: options.timeout,
        maxDuration: options.maxDuration,
        maxInteractions: options.maxInteractions
      });
      const docLinkStrategies = [
        "wordpress-pdf-link",
        "civicweb-pdf-link",
        "docushare-pdf-link"
      ];
      const hasDocLink = docLinkStrategies.includes(scraped.metadata.strategy);
      if (hasDocLink && scraped.metadata.isPdf && !scraped.metadata.complete) {
        targetUrl = scraped.url;
        effectiveOptions = { ...options, type: "application/pdf" };
      }
    } catch (error) {
      // Detection is best-effort: fall through and treat the URL as given.
      console.warn(
        `Spider detection failed for ${targetUrl}, falling back to direct download:`,
        error
      );
    }
  }
  let type = effectiveOptions.type;
  if (!type) {
    const urlLower = targetUrl.toLowerCase();
    const looksLikePdf = urlLower.endsWith(".pdf") || urlLower.includes(".pdf?") || urlLower.includes(".pdf#");
    type = looksLikePdf ? "application/pdf" : getMimeType(targetUrl) || "";
  }
  const processor = processors.find((p) => p.supports(type));
  if (!processor) {
    throw new Error(
      `No processor available for document type: ${type}. Supported types: PDF (.pdf, application/pdf)`
    );
  }
  return processor.process(targetUrl, effectiveOptions);
}
|
|
312
|
+
/** @internal */
const PACKAGE_VERSION_INITIALIZED = true;
|
|
313
|
+
export {
|
|
314
|
+
Document,
|
|
315
|
+
PACKAGE_VERSION_INITIALIZED,
|
|
316
|
+
PDFProcessor,
|
|
317
|
+
fetchDocument,
|
|
318
|
+
getTitleFromUrl
|
|
319
|
+
};
|
|
320
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sources":["../src/document.ts","../src/utils.ts","../src/processors/pdf.ts","../src/factory.ts","../src/index.ts"],"sourcesContent":["import os from 'node:os';\nimport path from 'node:path';\nimport { URL } from 'node:url';\nimport { downloadFileWithCache, getMimeType } from '@happyvertical/files';\nimport { makeSlug } from '@happyvertical/utils';\nimport type {\n DocumentPart,\n Document as DocumentType,\n FetchDocumentOptions,\n} from './types';\n\n/**\n * Base document handler with multi-part support\n *\n * Provides functionality for downloading, caching, and structuring documents\n * into hierarchical parts. Specific format processing (PDF, HTML, Markdown)\n * is handled by specialized processors.\n */\nexport class Document {\n /**\n * Flag indicating if document is from a remote source\n */\n protected isRemote = false;\n\n /**\n * Configuration options\n */\n protected options: FetchDocumentOptions;\n\n /**\n * Local file path where document is stored\n */\n private _localPath = '';\n\n /**\n * Directory used for caching files\n */\n private _cacheDir = '';\n\n /**\n * Document URL\n */\n public url: URL;\n\n /**\n * Document MIME type\n */\n public type: string;\n\n /**\n * Document parts (hierarchical structure)\n */\n public parts: DocumentPart[] = [];\n\n /**\n * Document-level metadata\n */\n public metadata: Record<string, any> = {};\n\n /**\n * Get the local file path where document is stored\n */\n public get localPath(): string {\n return this._localPath;\n }\n\n /**\n * Get the directory used for caching files\n */\n public get cacheDir(): string {\n return this._cacheDir;\n }\n\n /**\n * Creates a new Document instance\n *\n * @param url - Document URL or file path\n * @param options - Document configuration options\n */\n constructor(url: string, options: FetchDocumentOptions = {}) {\n this.url = new URL(url);\n this.options = options;\n this.type =\n options.type || getMimeType(this.url.toString()) || 
'text/plain';\n\n this._cacheDir =\n options.cacheDir ||\n path.resolve(os.tmpdir(), '.cache', 'have-sdk', 'documents');\n\n if (this.url.protocol.startsWith('file')) {\n // Decode URL-encoded characters in the pathname only (e.g., %20 -> space).\n // Note: Query parameters and hash fragments are not decoded here.\n this._localPath = decodeURIComponent(this.url.pathname);\n this.isRemote = false;\n } else if (this.url.protocol.startsWith('http')) {\n // Generate cache path from URL pathname\n // Query parameters (?) and fragments (#) are automatically excluded from url.pathname\n let pathname = this.url.pathname;\n\n // Remove trailing slash (directory-style URLs)\n if (pathname.endsWith('/')) {\n pathname = pathname.slice(0, -1);\n }\n\n // Add file extension if missing and we know the type\n // This is crucial for URLs like /download/file/?wpdmdl=123 which have no extension\n if (!pathname.match(/\\.[a-z0-9]+$/i)) {\n // Add appropriate extension based on MIME type\n if (\n this.type === 'application/pdf' ||\n options.type === 'application/pdf'\n ) {\n pathname += '.pdf';\n }\n // Future: Add other common extensions (html, json, etc.)\n }\n\n this._localPath = path.join(\n this._cacheDir,\n makeSlug(this.url.hostname),\n pathname,\n );\n this.isRemote = true;\n }\n }\n\n /**\n * Creates and initializes a Document instance\n *\n * Downloads remote files and prepares the document for processing.\n *\n * @param url - Document URL or file path\n * @param options - Document configuration options\n * @returns Promise resolving to the initialized Document\n */\n static async create(\n url: string,\n options: FetchDocumentOptions = {},\n ): Promise<Document> {\n const document = new Document(url, options);\n await document.initialize();\n return document;\n }\n\n /**\n * Initializes the document, downloading it if it's remote\n *\n * @returns Promise that resolves when initialization is complete\n */\n async initialize(): Promise<void> {\n if (this.isRemote) {\n if 
(!this.url) {\n throw new Error('Cannot initialize remote document: URL is required');\n }\n await downloadFileWithCache(this.url.toString(), this._localPath);\n }\n }\n\n /**\n * Checks if the document is a text-based file that can be read directly\n *\n * @returns Boolean indicating if the file is text-based\n */\n public isTextFile(): boolean {\n if (!this.type) return false;\n\n return (\n this.type.startsWith('text/') ||\n this.type === 'application/json' ||\n this.type === 'application/xml' ||\n this.type === 'application/javascript' ||\n this.type === 'application/typescript' ||\n [\n '.txt',\n '.md',\n '.json',\n '.xml',\n '.html',\n '.css',\n '.js',\n '.ts',\n '.yaml',\n '.yml',\n ].some((ext) => this.localPath.toLowerCase().endsWith(ext))\n );\n }\n\n /**\n * Converts the document to the standard Document interface\n *\n * @returns Document object with URL, type, parts, and metadata\n */\n public toDocument(): DocumentType {\n return {\n url: this.url.toString(),\n type: this.type,\n parts: this.parts,\n metadata: this.metadata,\n };\n }\n}\n\nexport default Document;\n","/**\n * Utility functions for document processing\n */\n\n/**\n * Extract a human-readable title from a URL\n *\n * Takes a URL and extracts the filename from the pathname, then formats it\n * into a readable title by removing the extension and converting separators\n * to spaces. 
Also decodes URL-encoded characters like %20.\n *\n * @param url - URL string to extract title from\n * @param defaultTitle - Default title to use if extraction fails\n * @returns Formatted title string\n *\n * @example\n * ```typescript\n * getTitleFromUrl('file:///path/to/My%20Document.pdf')\n * // Returns: 'My Document'\n *\n * getTitleFromUrl('https://example.com/research_paper.pdf')\n * // Returns: 'research paper'\n * ```\n */\nexport function getTitleFromUrl(\n url: string,\n defaultTitle = 'Document',\n): string {\n try {\n const urlObj = new URL(url);\n const pathname = urlObj.pathname;\n const filename = pathname.split('/').pop() || defaultTitle;\n\n // Decode URL-encoded characters (e.g., %20 -> space)\n const decodedFilename = decodeURIComponent(filename);\n\n // Remove extension and convert separators to spaces\n return decodedFilename\n .replace(/\\.(pdf|html?|md|txt)$/i, '')\n .replace(/[-_]/g, ' ')\n .trim();\n } catch {\n return defaultTitle;\n }\n}\n","import { promises as fs } from 'node:fs';\nimport { getCached, setCached } from '@happyvertical/files';\nimport { getPDFReader } from '@happyvertical/pdf';\nimport { v4 as uuidv4 } from 'uuid';\nimport { Document as BaseDocument } from '../document';\nimport type {\n Document,\n DocumentImage,\n DocumentPart,\n DocumentProcessor,\n FetchDocumentOptions,\n} from '../types';\nimport { getTitleFromUrl } from '../utils';\n\n/**\n * PDF Document Processor\n *\n * Handles PDF documents with support for:\n * - Text extraction from PDF content via `@happyvertical/pdf`\n * - PDF header validation (detects HTML cache poisoning from document management systems)\n * - Processed document caching via `@happyvertical/files`\n *\n * Image extraction and OCR are stubbed for future implementation.\n */\nexport class PDFProcessor implements DocumentProcessor {\n /**\n * Check if this processor supports the given MIME type or extension.\n * Accepts `'application/pdf'`, `'.pdf'`, or `'pdf'` (case-insensitive).\n *\n * 
@param type - MIME type or file extension to check\n * @returns `true` if this processor can handle the given type\n */\n supports(type: string): boolean {\n return (\n type === 'application/pdf' ||\n type.endsWith('.pdf') ||\n type.toLowerCase() === 'pdf'\n );\n }\n\n /**\n * Process a PDF document\n *\n * Extracts text and optionally images/OCR from the PDF, structuring\n * it into hierarchical document parts.\n *\n * @param url - PDF URL or file path\n * @param options - Processing options\n * @returns Promise resolving to structured Document\n */\n async process(\n url: string,\n options: FetchDocumentOptions = {},\n ): Promise<Document> {\n // Create and initialize base document\n const baseDoc = await BaseDocument.create(url, options);\n\n // Check cache for processed document\n const cacheKey = `${baseDoc.localPath}.processed_pdf`;\n const cached = await getCached(cacheKey);\n if (cached) {\n try {\n const parsed = JSON.parse(cached);\n return {\n url: baseDoc.url.toString(),\n type: baseDoc.type,\n parts: parsed.parts,\n metadata: parsed.metadata || {},\n };\n } catch (error) {\n // Cache corrupted, continue with fresh processing\n console.warn('Cached PDF data corrupted, reprocessing', error);\n }\n }\n\n // Validate that the downloaded file is actually a PDF (issue #460, #463)\n // WordPress Download Manager and some other servers may return HTML\n // with Content-Type: application/pdf, causing PDF extraction to fail\n const fileBuffer = await fs.readFile(baseDoc.localPath);\n const header = fileBuffer.subarray(0, 5).toString('utf-8');\n\n if (header !== '%PDF-') {\n // File is not a valid PDF - delete poisoned cache file (issue #463)\n try {\n await fs.unlink(baseDoc.localPath);\n } catch (unlinkError) {\n console.warn(\n `Failed to delete poisoned cache file: ${baseDoc.localPath}`,\n unlinkError,\n );\n }\n\n // Check if it's HTML to provide helpful error message\n const content = fileBuffer.toString(\n 'utf-8',\n 0,\n Math.min(1000, 
fileBuffer.length),\n );\n if (content.includes('<!DOCTYPE html>') || content.includes('<html')) {\n throw new Error(\n `Downloaded file is HTML, not PDF. The server returned HTML content for ${url}. ` +\n 'This commonly occurs with WordPress Download Manager URLs that return tracking pages. ' +\n `Expected PDF magic bytes (%PDF-) but got: ${header}. ` +\n 'The poisoned cache file has been removed - please try again.',\n );\n } else {\n throw new Error(\n `Downloaded file is not a valid PDF. Expected %PDF- magic bytes but got: ${header}. ` +\n 'The invalid cache file has been removed - please try again.',\n );\n }\n }\n\n // Get PDF reader and extract content\n const reader = await getPDFReader();\n const extractedText = await reader.extractText(baseDoc.localPath);\n\n // Create main document part\n const mainPart: DocumentPart = {\n id: uuidv4(),\n title: getTitleFromUrl(url, 'PDF Document'),\n content: extractedText || '',\n type: 'text',\n metadata: {\n source: 'pdf',\n filePath: baseDoc.localPath,\n },\n };\n\n // Extract images if enabled\n if (options.extractImages === true) {\n mainPart.images = await this.extractImages(\n baseDoc.localPath,\n options.runOcr !== false,\n );\n }\n\n const document: Document = {\n url: baseDoc.url.toString(),\n type: baseDoc.type,\n parts: [mainPart],\n metadata: {\n processor: 'pdf',\n extractedAt: new Date().toISOString(),\n hasImages: (mainPart.images?.length || 0) > 0,\n },\n };\n\n // Cache the processed document\n await setCached(cacheKey, JSON.stringify(document));\n\n return document;\n }\n\n /**\n * Extract images from PDF\n *\n * This is a placeholder for future image extraction functionality.\n * Will use @happyvertical/pdf's image extraction capabilities when available.\n *\n * @param filePath - Local PDF file path\n * @param runOcr - Whether to run OCR on extracted images\n * @returns Promise resolving to array of DocumentImages\n */\n private async extractImages(\n _filePath: string,\n _runOcr: boolean,\n ): 
Promise<DocumentImage[]> {\n // TODO: Implement image extraction using @happyvertical/pdf\n // For now, return empty array as placeholder\n\n // Future implementation will:\n // 1. Use getPDFReader() to extract images from PDF\n // 2. Save images to cache directory\n // 3. If runOcr is true, use @happyvertical/ocr to extract text from images\n // 4. Return array of DocumentImage objects with metadata\n\n return [];\n }\n}\n\nexport default PDFProcessor;\n","import { getMimeType } from '@happyvertical/files';\nimport { scrapeDocument } from '@happyvertical/spider';\nimport { PDFProcessor } from './processors/pdf';\nimport type { Document, FetchDocumentOptions } from './types';\n\n/**\n * Available document processors\n */\nconst processors = [new PDFProcessor()];\n\n/**\n * Fetch a document from a URL with automatic format detection\n *\n * This factory function:\n * 1. Detects the document format (PDF, HTML, Markdown, etc.)\n * 2. Selects the appropriate processor\n * 3. Processes the document into structured parts\n * 4. 
Returns a Document object with hierarchical content\n *\n * @param url - Document URL or file path (file://, http://, https://)\n * @param options - Fetch and processing options\n * @returns Promise resolving to structured Document\n *\n * @example\n * ```typescript\n * // Fetch a PDF with image extraction and OCR\n * const doc = await fetchDocument('https://example.com/report.pdf', {\n * extractImages: true,\n * runOcr: true\n * });\n *\n * // Access document parts\n * for (const part of doc.parts) {\n * console.log(part.title);\n * console.log(part.content);\n *\n * // Check for images\n * if (part.images) {\n * for (const image of part.images) {\n * console.log(image.url);\n * console.log(image.ocrText); // Text extracted via OCR\n * }\n * }\n * }\n * ```\n */\nexport async function fetchDocument(\n url: string,\n options: FetchDocumentOptions = {},\n): Promise<Document> {\n // For web URLs (http/https), use spider package to detect special cases\n // (WordPress Download Manager, CivicWeb, DocuShare, etc.)\n const isWebUrl = url.startsWith('http://') || url.startsWith('https://');\n\n if (isWebUrl && !options.type) {\n try {\n // Use spider to detect WordPress, CivicWeb, DocuShare, and other document management systems\n const scraped = await scrapeDocument(url, {\n scraper: options.scraper || 'basic',\n spider: options.spider || 'dom',\n cache: options.cache,\n cacheExpiry: options.cacheExpiry,\n headers: options.headers,\n timeout: options.timeout,\n maxDuration: options.maxDuration,\n maxInteractions: options.maxInteractions,\n });\n\n // Check if spider detected a document management system with PDF link\n const hasDocLink =\n scraped.metadata.strategy === 'wordpress-pdf-link' ||\n scraped.metadata.strategy === 'civicweb-pdf-link' ||\n scraped.metadata.strategy === 'docushare-pdf-link';\n\n if (hasDocLink && scraped.metadata.isPdf && !scraped.metadata.complete) {\n // Spider detected a document management page and extracted the PDF URL\n // Use the extracted 
URL for PDF processing\n url = scraped.url;\n options.type = 'application/pdf';\n }\n } catch (error) {\n // If spider fails, continue with direct download\n // This ensures backward compatibility\n console.warn(\n `Spider detection failed for ${url}, falling back to direct download:`,\n error,\n );\n }\n }\n\n // Determine type - check URL extension first, then MIME type\n // This handles servers that return incorrect Content-Type headers (e.g., application/octet-stream for PDFs)\n let type = options.type;\n\n if (!type) {\n // Extract file extension from URL\n const urlLower = url.toLowerCase();\n\n // Check for common document extensions in URL\n if (\n urlLower.endsWith('.pdf') ||\n urlLower.includes('.pdf?') ||\n urlLower.includes('.pdf#')\n ) {\n type = 'application/pdf';\n } else {\n // Fall back to MIME type detection\n type = getMimeType(url) || '';\n }\n }\n\n // Find appropriate processor\n const processor = processors.find((p) => p.supports(type));\n\n if (!processor) {\n throw new Error(\n `No processor available for document type: ${type}. 
Supported types: PDF (.pdf, application/pdf)`,\n );\n }\n\n // Process document\n return processor.process(url, options);\n}\n\nexport default fetchDocument;\n","/**\n * @happyvertical/documents - Document processing with multi-part structure\n *\n * Provides document processing for PDFs with support for:\n * - Hierarchical document parts\n * - Automatic format detection from URL or MIME type\n * - Document management system detection (WordPress, CivicWeb, DocuShare)\n * - File caching for performance\n *\n * @example\n * ```typescript\n * import { fetchDocument } from '@happyvertical/documents';\n *\n * const doc = await fetchDocument('https://example.com/report.pdf');\n *\n * for (const part of doc.parts) {\n * console.log(part.title);\n * console.log(part.content);\n * }\n * ```\n */\n\n// Base classes\nexport { Document } from './document';\n// Main factory function\nexport { fetchDocument } from './factory';\n\n// Processors\nexport { PDFProcessor } from './processors/pdf';\n// Types\nexport type {\n Document as DocumentType,\n DocumentImage,\n DocumentPart,\n DocumentProcessor,\n FetchDocumentOptions,\n} from './types';\n// Utilities\nexport { getTitleFromUrl } from './utils';\n\n/** @internal */\nexport const PACKAGE_VERSION_INITIALIZED = 
true;\n"],"names":["URL","BaseDocument","fs","uuidv4"],"mappings":";;;;;;;;;AAkBO,MAAM,SAAS;AAAA;AAAA;AAAA;AAAA,EAIV,WAAW;AAAA;AAAA;AAAA;AAAA,EAKX;AAAA;AAAA;AAAA;AAAA,EAKF,aAAa;AAAA;AAAA;AAAA;AAAA,EAKb,YAAY;AAAA;AAAA;AAAA;AAAA,EAKb;AAAA;AAAA;AAAA;AAAA,EAKA;AAAA;AAAA;AAAA;AAAA,EAKA,QAAwB,CAAA;AAAA;AAAA;AAAA;AAAA,EAKxB,WAAgC,CAAA;AAAA;AAAA;AAAA;AAAA,EAKvC,IAAW,YAAoB;AAC7B,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA,EAKA,IAAW,WAAmB;AAC5B,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQA,YAAY,KAAa,UAAgC,IAAI;AAC3D,SAAK,MAAM,IAAIA,MAAI,GAAG;AACtB,SAAK,UAAU;AACf,SAAK,OACH,QAAQ,QAAQ,YAAY,KAAK,IAAI,SAAA,CAAU,KAAK;AAEtD,SAAK,YACH,QAAQ,YACR,KAAK,QAAQ,GAAG,OAAA,GAAU,UAAU,YAAY,WAAW;AAE7D,QAAI,KAAK,IAAI,SAAS,WAAW,MAAM,GAAG;AAGxC,WAAK,aAAa,mBAAmB,KAAK,IAAI,QAAQ;AACtD,WAAK,WAAW;AAAA,IAClB,WAAW,KAAK,IAAI,SAAS,WAAW,MAAM,GAAG;AAG/C,UAAI,WAAW,KAAK,IAAI;AAGxB,UAAI,SAAS,SAAS,GAAG,GAAG;AAC1B,mBAAW,SAAS,MAAM,GAAG,EAAE;AAAA,MACjC;AAIA,UAAI,CAAC,SAAS,MAAM,eAAe,GAAG;AAEpC,YACE,KAAK,SAAS,qBACd,QAAQ,SAAS,mBACjB;AACA,sBAAY;AAAA,QACd;AAAA,MAEF;AAEA,WAAK,aAAa,KAAK;AAAA,QACrB,KAAK;AAAA,QACL,SAAS,KAAK,IAAI,QAAQ;AAAA,QAC1B;AAAA,MAAA;AAEF,WAAK,WAAW;AAAA,IAClB;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAWA,aAAa,OACX,KACA,UAAgC,IACb;AACnB,UAAM,WAAW,IAAI,SAAS,KAAK,OAAO;AAC1C,UAAM,SAAS,WAAA;AACf,WAAO;AAAA,EACT;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,MAAM,aAA4B;AAChC,QAAI,KAAK,UAAU;AACjB,UAAI,CAAC,KAAK,KAAK;AACb,cAAM,IAAI,MAAM,oDAAoD;AAAA,MACtE;AACA,YAAM,sBAAsB,KAAK,IAAI,SAAA,GAAY,KAAK,UAAU;AAAA,IAClE;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOO,aAAsB;AAC3B,QAAI,CAAC,KAAK,KAAM,QAAO;AAEvB,WACE,KAAK,KAAK,WAAW,OAAO,KAC5B,KAAK,SAAS,sBACd,KAAK,SAAS,qBACd,KAAK,SAAS,4BACd,KAAK,SAAS,4BACd;AAAA,MACE;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IAAA,EACA,KAAK,CAAC,QAAQ,KAAK,UAAU,YAAA,EAAc,SAAS,GAAG,CAAC;AAAA,EAE9D;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOO,aAA2B;AAChC,WAAO;AAAA,MACL,KAAK,KAAK,IAAI,SAAA;AAAA,MACd,MAAM,KAAK;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAA
U,KAAK;AAAA,IAAA;AAAA,EAEnB;AACF;AChLO,SAAS,gBACd,KACA,eAAe,YACP;AACR,MAAI;AACF,UAAM,SAAS,IAAI,IAAI,GAAG;AAC1B,UAAM,WAAW,OAAO;AACxB,UAAM,WAAW,SAAS,MAAM,GAAG,EAAE,SAAS;AAG9C,UAAM,kBAAkB,mBAAmB,QAAQ;AAGnD,WAAO,gBACJ,QAAQ,0BAA0B,EAAE,EACpC,QAAQ,SAAS,GAAG,EACpB,KAAA;AAAA,EACL,QAAQ;AACN,WAAO;AAAA,EACT;AACF;ACpBO,MAAM,aAA0C;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQrD,SAAS,MAAuB;AAC9B,WACE,SAAS,qBACT,KAAK,SAAS,MAAM,KACpB,KAAK,kBAAkB;AAAA,EAE3B;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAYA,MAAM,QACJ,KACA,UAAgC,IACb;AAEnB,UAAM,UAAU,MAAMC,SAAa,OAAO,KAAK,OAAO;AAGtD,UAAM,WAAW,GAAG,QAAQ,SAAS;AACrC,UAAM,SAAS,MAAM,UAAU,QAAQ;AACvC,QAAI,QAAQ;AACV,UAAI;AACF,cAAM,SAAS,KAAK,MAAM,MAAM;AAChC,eAAO;AAAA,UACL,KAAK,QAAQ,IAAI,SAAA;AAAA,UACjB,MAAM,QAAQ;AAAA,UACd,OAAO,OAAO;AAAA,UACd,UAAU,OAAO,YAAY,CAAA;AAAA,QAAC;AAAA,MAElC,SAAS,OAAO;AAEd,gBAAQ,KAAK,2CAA2C,KAAK;AAAA,MAC/D;AAAA,IACF;AAKA,UAAM,aAAa,MAAMC,SAAG,SAAS,QAAQ,SAAS;AACtD,UAAM,SAAS,WAAW,SAAS,GAAG,CAAC,EAAE,SAAS,OAAO;AAEzD,QAAI,WAAW,SAAS;AAEtB,UAAI;AACF,cAAMA,SAAG,OAAO,QAAQ,SAAS;AAAA,MACnC,SAAS,aAAa;AACpB,gBAAQ;AAAA,UACN,yCAAyC,QAAQ,SAAS;AAAA,UAC1D;AAAA,QAAA;AAAA,MAEJ;AAGA,YAAM,UAAU,WAAW;AAAA,QACzB;AAAA,QACA;AAAA,QACA,KAAK,IAAI,KAAM,WAAW,MAAM;AAAA,MAAA;AAElC,UAAI,QAAQ,SAAS,iBAAiB,KAAK,QAAQ,SAAS,OAAO,GAAG;AACpE,cAAM,IAAI;AAAA,UACR,0EAA0E,GAAG,qIAE9B,MAAM;AAAA,QAAA;AAAA,MAGzD,OAAO;AACL,cAAM,IAAI;AAAA,UACR,2EAA2E,MAAM;AAAA,QAAA;AAAA,MAGrF;AAAA,IACF;AAGA,UAAM,SAAS,MAAM,aAAA;AACrB,UAAM,gBAAgB,MAAM,OAAO,YAAY,QAAQ,SAAS;AAGhE,UAAM,WAAyB;AAAA,MAC7B,IAAIC,GAAA;AAAA,MACJ,OAAO,gBAAgB,KAAK,cAAc;AAAA,MAC1C,SAAS,iBAAiB;AAAA,MAC1B,MAAM;AAAA,MACN,UAAU;AAAA,QACR,QAAQ;AAAA,QACR,UAAU,QAAQ;AAAA,MAAA;AAAA,IACpB;AAIF,QAAI,QAAQ,kBAAkB,MAAM;AAClC,eAAS,SAAS,MAAM,KAAK;AAAA,QAC3B,QAAQ;AAAA,QACR,QAAQ,WAAW;AAAA,MAAA;AAAA,IAEvB;AAEA,UAAM,WAAqB;AAAA,MACzB,KAAK,QAAQ,IAAI,SAAA;AAAA,MACjB,MAAM,QAAQ;AAAA,MACd,OAAO,CAAC,QAAQ;AAAA,MAChB,UAAU;AAAA,QACR,WAAW;AAAA,QACX,cAAa,oBAAI,KAAA,GAAO,YAAA;AAAA,QACxB,YAAY,SAAS,QAAQ,UAAU,KAAK;AAAA,MAAA;AAAA,IAC9C;AAIF,UAAM,UAAU
,UAAU,KAAK,UAAU,QAAQ,CAAC;AAElD,WAAO;AAAA,EACT;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAYA,MAAc,cACZ,WACA,SAC0B;AAU1B,WAAO,CAAA;AAAA,EACT;AACF;AC3KA,MAAM,aAAa,CAAC,IAAI,cAAc;AAsCtC,eAAsB,cACpB,KACA,UAAgC,IACb;AAGnB,QAAM,WAAW,IAAI,WAAW,SAAS,KAAK,IAAI,WAAW,UAAU;AAEvE,MAAI,YAAY,CAAC,QAAQ,MAAM;AAC7B,QAAI;AAEF,YAAM,UAAU,MAAM,eAAe,KAAK;AAAA,QACxC,SAAS,QAAQ,WAAW;AAAA,QAC5B,QAAQ,QAAQ,UAAU;AAAA,QAC1B,OAAO,QAAQ;AAAA,QACf,aAAa,QAAQ;AAAA,QACrB,SAAS,QAAQ;AAAA,QACjB,SAAS,QAAQ;AAAA,QACjB,aAAa,QAAQ;AAAA,QACrB,iBAAiB,QAAQ;AAAA,MAAA,CAC1B;AAGD,YAAM,aACJ,QAAQ,SAAS,aAAa,wBAC9B,QAAQ,SAAS,aAAa,uBAC9B,QAAQ,SAAS,aAAa;AAEhC,UAAI,cAAc,QAAQ,SAAS,SAAS,CAAC,QAAQ,SAAS,UAAU;AAGtE,cAAM,QAAQ;AACd,gBAAQ,OAAO;AAAA,MACjB;AAAA,IACF,SAAS,OAAO;AAGd,cAAQ;AAAA,QACN,+BAA+B,GAAG;AAAA,QAClC;AAAA,MAAA;AAAA,IAEJ;AAAA,EACF;AAIA,MAAI,OAAO,QAAQ;AAEnB,MAAI,CAAC,MAAM;AAET,UAAM,WAAW,IAAI,YAAA;AAGrB,QACE,SAAS,SAAS,MAAM,KACxB,SAAS,SAAS,OAAO,KACzB,SAAS,SAAS,OAAO,GACzB;AACA,aAAO;AAAA,IACT,OAAO;AAEL,aAAO,YAAY,GAAG,KAAK;AAAA,IAC7B;AAAA,EACF;AAGA,QAAM,YAAY,WAAW,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI,CAAC;AAEzD,MAAI,CAAC,WAAW;AACd,UAAM,IAAI;AAAA,MACR,6CAA6C,IAAI;AAAA,IAAA;AAAA,EAErD;AAGA,SAAO,UAAU,QAAQ,KAAK,OAAO;AACvC;ACjFO,MAAM,8BAA8B;"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { Document, DocumentProcessor, FetchDocumentOptions } from '../types';
|
|
2
|
+
/**
|
|
3
|
+
* PDF Document Processor
|
|
4
|
+
*
|
|
5
|
+
* Handles PDF documents with support for:
|
|
6
|
+
* - Text extraction from PDF content via `@happyvertical/pdf`
|
|
7
|
+
* - PDF header validation (detects HTML cache poisoning from document management systems)
|
|
8
|
+
* - Processed document caching via `@happyvertical/files`
|
|
9
|
+
*
|
|
10
|
+
* Image extraction and OCR are stubbed for future implementation.
|
|
11
|
+
*/
|
|
12
|
+
export declare class PDFProcessor implements DocumentProcessor {
|
|
13
|
+
/**
|
|
14
|
+
* Check if this processor supports the given MIME type or extension.
|
|
15
|
+
* Accepts the exact MIME type `'application/pdf'`, any string ending in `'.pdf'`, or the bare extension `'pdf'` (only this last form is matched case-insensitively).
|
|
16
|
+
*
|
|
17
|
+
* @param type - MIME type or file extension to check
|
|
18
|
+
* @returns `true` if this processor can handle the given type
|
|
19
|
+
*/
|
|
20
|
+
supports(type: string): boolean;
|
|
21
|
+
/**
|
|
22
|
+
* Process a PDF document
|
|
23
|
+
*
|
|
24
|
+
* Extracts text and optionally images/OCR from the PDF, structuring
|
|
25
|
+
* it into hierarchical document parts.
|
|
26
|
+
*
|
|
27
|
+
* @param url - PDF URL or file path
|
|
28
|
+
* @param options - Processing options
|
|
29
|
+
* @returns Promise resolving to structured Document
|
|
30
|
+
*/
|
|
31
|
+
process(url: string, options?: FetchDocumentOptions): Promise<Document>;
|
|
32
|
+
/**
|
|
33
|
+
* Extract images from PDF
|
|
34
|
+
*
|
|
35
|
+
* This is a placeholder for future image extraction functionality.
|
|
36
|
+
* Will use @happyvertical/pdf's image extraction capabilities when available.
|
|
37
|
+
*
|
|
38
|
+
* @param filePath - Local PDF file path
|
|
39
|
+
* @param runOcr - Whether to run OCR on extracted images
|
|
40
|
+
* @returns Promise resolving to array of DocumentImages
|
|
41
|
+
*/
|
|
42
|
+
private extractImages;
|
|
43
|
+
}
|
|
44
|
+
export default PDFProcessor;
|
|
45
|
+
//# sourceMappingURL=pdf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/processors/pdf.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EACV,QAAQ,EAGR,iBAAiB,EACjB,oBAAoB,EACrB,MAAM,UAAU,CAAC;AAGlB;;;;;;;;;GASG;AACH,qBAAa,YAAa,YAAW,iBAAiB;IACpD;;;;;;OAMG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAQ/B;;;;;;;;;OASG;IACG,OAAO,CACX,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,oBAAyB,GACjC,OAAO,CAAC,QAAQ,CAAC;IAqGpB;;;;;;;;;OASG;YACW,aAAa;CAe5B;AAED,eAAe,YAAY,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions for the @happyvertical/documents package
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Image extracted from a document
|
|
6
|
+
*
|
|
7
|
+
* Represents an image found in a document (PDF, HTML, etc.)
|
|
8
|
+
* with optional OCR text extraction for scanned images.
|
|
9
|
+
*/
|
|
10
|
+
export interface DocumentImage {
|
|
11
|
+
/**
|
|
12
|
+
* Unique identifier for the image
|
|
13
|
+
*/
|
|
14
|
+
id: string;
|
|
15
|
+
/**
|
|
16
|
+
* URL or reference to the image
|
|
17
|
+
*/
|
|
18
|
+
url: string;
|
|
19
|
+
/**
|
|
20
|
+
* Local filesystem path if image has been downloaded
|
|
21
|
+
*/
|
|
22
|
+
localPath?: string;
|
|
23
|
+
/**
|
|
24
|
+
* Alt text from HTML or PDF metadata
|
|
25
|
+
*/
|
|
26
|
+
altText?: string;
|
|
27
|
+
/**
|
|
28
|
+
* Text extracted from image via OCR
|
|
29
|
+
* Useful for scanned documents or images with text
|
|
30
|
+
*/
|
|
31
|
+
ocrText?: string;
|
|
32
|
+
/**
|
|
33
|
+
* Position/order of image in the document
|
|
34
|
+
*/
|
|
35
|
+
position?: number;
|
|
36
|
+
/**
|
|
37
|
+
* Image metadata (dimensions, format, etc.)
|
|
38
|
+
*/
|
|
39
|
+
metadata?: {
|
|
40
|
+
width?: number;
|
|
41
|
+
height?: number;
|
|
42
|
+
format?: string;
|
|
43
|
+
[key: string]: any;
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* A part or section of a document
|
|
48
|
+
*
|
|
49
|
+
* Documents can be hierarchical with nested parts (e.g., sections, chapters).
|
|
50
|
+
* Each part contains content, optional images, and can have child parts.
|
|
51
|
+
*/
|
|
52
|
+
export interface DocumentPart {
|
|
53
|
+
/**
|
|
54
|
+
* Unique identifier for this part
|
|
55
|
+
*/
|
|
56
|
+
id: string;
|
|
57
|
+
/**
|
|
58
|
+
* Title or heading for this part
|
|
59
|
+
*/
|
|
60
|
+
title: string;
|
|
61
|
+
/**
|
|
62
|
+
* Text content of this part
|
|
63
|
+
*/
|
|
64
|
+
content: string;
|
|
65
|
+
/**
|
|
66
|
+
* Content type for this part
|
|
67
|
+
*/
|
|
68
|
+
type: 'text' | 'html' | 'markdown';
|
|
69
|
+
/**
|
|
70
|
+
* Images contained in this part
|
|
71
|
+
*/
|
|
72
|
+
images?: DocumentImage[];
|
|
73
|
+
/**
|
|
74
|
+
* Additional metadata for this part
|
|
75
|
+
*/
|
|
76
|
+
metadata?: Record<string, any>;
|
|
77
|
+
/**
|
|
78
|
+
* Nested child parts (for hierarchical documents)
|
|
79
|
+
*/
|
|
80
|
+
parts?: DocumentPart[];
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Complete document with all parts and metadata
|
|
84
|
+
*/
|
|
85
|
+
export interface Document {
|
|
86
|
+
/**
|
|
87
|
+
* Source URL of the document
|
|
88
|
+
*/
|
|
89
|
+
url: string;
|
|
90
|
+
/**
|
|
91
|
+
* MIME type or document type
|
|
92
|
+
*/
|
|
93
|
+
type: string;
|
|
94
|
+
/**
|
|
95
|
+
* Document parts (can be hierarchical)
|
|
96
|
+
*/
|
|
97
|
+
parts: DocumentPart[];
|
|
98
|
+
/**
|
|
99
|
+
* Document-level metadata
|
|
100
|
+
*/
|
|
101
|
+
metadata?: Record<string, any>;
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Options for fetching a document
|
|
105
|
+
*/
|
|
106
|
+
export interface FetchDocumentOptions {
|
|
107
|
+
/**
|
|
108
|
+
* Directory for caching downloaded files
|
|
109
|
+
* @default os.tmpdir()/.cache/have-sdk/documents
|
|
110
|
+
*/
|
|
111
|
+
cacheDir?: string;
|
|
112
|
+
/**
|
|
113
|
+
* Whether to extract images from the document
|
|
114
|
+
* @default false (images are only extracted when this is explicitly `true`)
|
|
115
|
+
*/
|
|
116
|
+
extractImages?: boolean;
|
|
117
|
+
/**
|
|
118
|
+
* Whether to run OCR on images (PDF scans, etc.)
|
|
119
|
+
* @default true for PDFs, false for HTML/Markdown
|
|
120
|
+
*/
|
|
121
|
+
runOcr?: boolean;
|
|
122
|
+
/**
|
|
123
|
+
* Scraper type to use for content extraction
|
|
124
|
+
* - 'basic': Fast, static HTML scraping (default)
|
|
125
|
+
* - 'crawlee': Full browser with JavaScript execution
|
|
126
|
+
* @default 'basic'
|
|
127
|
+
*/
|
|
128
|
+
scraper?: 'basic' | 'crawlee';
|
|
129
|
+
/**
|
|
130
|
+
* Spider adapter to use for fetching web pages
|
|
131
|
+
* - 'simple': Basic HTTP fetch
|
|
132
|
+
* - 'dom': HTML parsing with happy-dom
|
|
133
|
+
* - 'crawlee': Headless browser (requires scraper: 'crawlee')
|
|
134
|
+
* @default 'dom'
|
|
135
|
+
*/
|
|
136
|
+
spider?: 'simple' | 'dom' | 'crawlee';
|
|
137
|
+
/**
|
|
138
|
+
* Spider adapter to use for HTML fetching (deprecated, use 'spider' instead)
|
|
139
|
+
* @default 'simple'
|
|
140
|
+
* @deprecated Use 'spider' instead
|
|
141
|
+
*/
|
|
142
|
+
spiderAdapter?: 'simple' | 'dom' | 'crawlee';
|
|
143
|
+
/**
|
|
144
|
+
* Whether to use cache for spider fetching
|
|
145
|
+
* @default true
|
|
146
|
+
*/
|
|
147
|
+
cache?: boolean;
|
|
148
|
+
/**
|
|
149
|
+
* Cache expiry time in milliseconds for spider fetching
|
|
150
|
+
* @default 300000 (5 minutes)
|
|
151
|
+
*/
|
|
152
|
+
cacheExpiry?: number;
|
|
153
|
+
/**
|
|
154
|
+
* Custom HTTP headers for spider requests
|
|
155
|
+
*/
|
|
156
|
+
headers?: Record<string, string>;
|
|
157
|
+
/**
|
|
158
|
+
* Request timeout in milliseconds for spider fetching
|
|
159
|
+
* @default 30000 (30 seconds)
|
|
160
|
+
*/
|
|
161
|
+
timeout?: number;
|
|
162
|
+
/**
|
|
163
|
+
* Maximum time to spend scraping in milliseconds
|
|
164
|
+
* Used by advanced scrapers (tree, pagination, etc.)
|
|
165
|
+
*/
|
|
166
|
+
maxDuration?: number;
|
|
167
|
+
/**
|
|
168
|
+
* Maximum number of interactions to perform
|
|
169
|
+
* Used by advanced scrapers (clicking, scrolling, etc.)
|
|
170
|
+
*/
|
|
171
|
+
maxInteractions?: number;
|
|
172
|
+
/**
|
|
173
|
+
* Override MIME type detection
|
|
174
|
+
*/
|
|
175
|
+
type?: string;
|
|
176
|
+
}
|
|
177
|
+
/**
|
|
178
|
+
* Interface for document processors
|
|
179
|
+
*
|
|
180
|
+
* Each processor handles a specific document format (PDF, HTML, Markdown, etc.)
|
|
181
|
+
*/
|
|
182
|
+
export interface DocumentProcessor {
|
|
183
|
+
/**
|
|
184
|
+
* Process a document and return structured parts
|
|
185
|
+
*
|
|
186
|
+
* @param url - Source URL or file path
|
|
187
|
+
* @param options - Processing options
|
|
188
|
+
* @returns Promise resolving to Document with parts
|
|
189
|
+
*/
|
|
190
|
+
process(url: string, options?: FetchDocumentOptions): Promise<Document>;
|
|
191
|
+
/**
|
|
192
|
+
* Check if this processor can handle the given type
|
|
193
|
+
*
|
|
194
|
+
* @param type - MIME type or file extension
|
|
195
|
+
* @returns True if processor supports this type
|
|
196
|
+
*/
|
|
197
|
+
supports(type: string): boolean;
|
|
198
|
+
}
|
|
199
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;;;GAKG;AACH,MAAM,WAAW,aAAa;IAC5B;;OAEG;IACH,EAAE,EAAE,MAAM,CAAC;IAEX;;OAEG;IACH,GAAG,EAAE,MAAM,CAAC;IAEZ;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;OAEG;IACH,QAAQ,CAAC,EAAE;QACT,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;KACpB,CAAC;CACH;AAED;;;;;GAKG;AACH,MAAM,WAAW,YAAY;IAC3B;;OAEG;IACH,EAAE,EAAE,MAAM,CAAC;IAEX;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,UAAU,CAAC;IAEnC;;OAEG;IACH,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC;IAEzB;;OAEG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAE/B;;OAEG;IACH,KAAK,CAAC,EAAE,YAAY,EAAE,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB;;OAEG;IACH,GAAG,EAAE,MAAM,CAAC;IAEZ;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IAEb;;OAEG;IACH,KAAK,EAAE,YAAY,EAAE,CAAC;IAEtB;;OAEG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IAExB;;;OAGG;IACH,MAAM,CAAC,EAAE,OAAO,CAAC;IAEjB;;;;;OAKG;IACH,OAAO,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IAE9B;;;;;;OAMG;IACH,MAAM,CAAC,EAAE,QAAQ,GAAG,KAAK,GAAG,SAAS,CAAC;IAEtC;;;;OAIG;IACH,aAAa,CAAC,EAAE,QAAQ,GAAG,KAAK,GAAG,SAAS,CAAC;IAE7C;;;OAGG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAEjC;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;;;GAIG;AACH,MAAM,WAAW,iBAAiB;IAChC;;;;;;OAMG;IACH,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAExE;;;;;OAKG;IACH,QAAQ,CAAC,IAA
I,EAAE,MAAM,GAAG,OAAO,CAAC;CACjC"}
|
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions for document processing
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Extract a human-readable title from a URL
|
|
6
|
+
*
|
|
7
|
+
* Takes a URL and extracts the filename from the pathname, then formats it
|
|
8
|
+
* into a readable title by removing the extension and converting separators
|
|
9
|
+
* to spaces. Also decodes URL-encoded characters like %20.
|
|
10
|
+
*
|
|
11
|
+
* @param url - URL string to extract title from
|
|
12
|
+
* @param defaultTitle - Default title to use if extraction fails
|
|
13
|
+
* @returns Formatted title string
|
|
14
|
+
*
|
|
15
|
+
* @example
|
|
16
|
+
* ```typescript
|
|
17
|
+
* getTitleFromUrl('file:///path/to/My%20Document.pdf')
|
|
18
|
+
* // Returns: 'My Document'
|
|
19
|
+
*
|
|
20
|
+
* getTitleFromUrl('https://example.com/research_paper.pdf')
|
|
21
|
+
* // Returns: 'research paper'
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
export declare function getTitleFromUrl(url: string, defaultTitle?: string): string;
|
|
25
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,eAAe,CAC7B,GAAG,EAAE,MAAM,EACX,YAAY,SAAa,GACxB,MAAM,CAiBR"}
|
package/metadata.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@happyvertical/documents",
|
|
3
|
+
"path": "packages/documents",
|
|
4
|
+
"position": {
|
|
5
|
+
"index": 8,
|
|
6
|
+
"count": 30
|
|
7
|
+
},
|
|
8
|
+
"description": "Multi-part document processing with support for PDF, HTML, and Markdown",
|
|
9
|
+
"provides": [
|
|
10
|
+
"Multi-part document processing with support for PDF, HTML, and Markdown"
|
|
11
|
+
],
|
|
12
|
+
"implements": [],
|
|
13
|
+
"requires": {
|
|
14
|
+
"workspace": [
|
|
15
|
+
"@happyvertical/files",
|
|
16
|
+
"@happyvertical/utils"
|
|
17
|
+
],
|
|
18
|
+
"externalHappyVertical": [
|
|
19
|
+
"@happyvertical/ocr",
|
|
20
|
+
"@happyvertical/pdf",
|
|
21
|
+
"@happyvertical/spider"
|
|
22
|
+
],
|
|
23
|
+
"external": [
|
|
24
|
+
"uuid"
|
|
25
|
+
]
|
|
26
|
+
},
|
|
27
|
+
"dependents": [],
|
|
28
|
+
"stability": {
|
|
29
|
+
"level": "stable",
|
|
30
|
+
"reason": "Primary package surface is described as implemented and production-oriented."
|
|
31
|
+
},
|
|
32
|
+
"keywords": [
|
|
33
|
+
"documents"
|
|
34
|
+
]
|
|
35
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@happyvertical/documents",
|
|
3
|
+
"version": "0.74.9",
|
|
4
|
+
"description": "Multi-part document processing with support for PDF, HTML, and Markdown",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"bin": {
|
|
9
|
+
"have-documents-context": "./dist/cli/claude-context.js"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"dist",
|
|
13
|
+
"README.md",
|
|
14
|
+
"LICENSE",
|
|
15
|
+
"AGENT.md",
|
|
16
|
+
"metadata.json"
|
|
17
|
+
],
|
|
18
|
+
"publishConfig": {
|
|
19
|
+
"registry": "https://registry.npmjs.org",
|
|
20
|
+
"access": "public"
|
|
21
|
+
},
|
|
22
|
+
"repository": {
|
|
23
|
+
"type": "git",
|
|
24
|
+
"url": "https://github.com/happyvertical/sdk.git",
|
|
25
|
+
"directory": "packages/documents"
|
|
26
|
+
},
|
|
27
|
+
"bugs": {
|
|
28
|
+
"url": "https://github.com/happyvertical/sdk/issues"
|
|
29
|
+
},
|
|
30
|
+
"homepage": "https://github.com/happyvertical/sdk/tree/main/packages/documents#readme",
|
|
31
|
+
"license": "MIT",
|
|
32
|
+
"exports": {
|
|
33
|
+
".": {
|
|
34
|
+
"types": "./dist/index.d.ts",
|
|
35
|
+
"import": "./dist/index.js"
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"dependencies": {
|
|
39
|
+
"@happyvertical/ocr": "^0.60.39",
|
|
40
|
+
"@happyvertical/pdf": "^0.62.25",
|
|
41
|
+
"@happyvertical/spider": "^0.60.10",
|
|
42
|
+
"uuid": "^13.0.0",
|
|
43
|
+
"@happyvertical/files": "0.74.9",
|
|
44
|
+
"@happyvertical/utils": "0.74.9"
|
|
45
|
+
},
|
|
46
|
+
"devDependencies": {
|
|
47
|
+
"@types/node": "25.0.10",
|
|
48
|
+
"typescript": "^5.9.3",
|
|
49
|
+
"vite": "7.3.2",
|
|
50
|
+
"vitest": "^4.1.5"
|
|
51
|
+
},
|
|
52
|
+
"keywords": [
|
|
53
|
+
"documents",
|
|
54
|
+
"pdf",
|
|
55
|
+
"html",
|
|
56
|
+
"markdown",
|
|
57
|
+
"ocr",
|
|
58
|
+
"content-extraction"
|
|
59
|
+
],
|
|
60
|
+
"scripts": {
|
|
61
|
+
"build": "vite build",
|
|
62
|
+
"build:watch": "vite build --watch",
|
|
63
|
+
"dev": "vite --port 3004",
|
|
64
|
+
"test": "vitest run",
|
|
65
|
+
"test:watch": "vitest watch"
|
|
66
|
+
}
|
|
67
|
+
}
|