@happyvertical/documents 0.74.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENT.md ADDED
@@ -0,0 +1,32 @@
1
+ # @happyvertical/documents
2
+
3
+ <!-- BEGIN AGENT:GENERATED -->
4
+ ## Purpose
5
+ Multi-part document processing with support for PDF, HTML, and Markdown
6
+
7
+ ## Package Map
8
+ - Package: `@happyvertical/documents`
9
+ - Hierarchy path: `@happyvertical/sdk > packages > documents`
10
+ - Workspace position: `8 of 30` local packages
11
+ - Internal dependencies: `@happyvertical/files`, `@happyvertical/utils`
12
+ - Internal dependents: none
13
+ - Knowledge graph files: `AGENT.md`, `metadata.json`, `ecosystem-manifest.json`
14
+
15
+ ## Build & Test
16
+ ```bash
17
+ pnpm --filter @happyvertical/documents build
18
+ pnpm --filter @happyvertical/documents test
19
+ ```
20
+
21
+ ## Agent Correction Loops
22
+ - If module resolution or export errors mention a workspace dependency, build the dependency first (`pnpm --filter @happyvertical/files build`, `pnpm --filter @happyvertical/utils build`) and then rerun `pnpm --filter @happyvertical/documents build`.
23
+ - If a change only affects runtime behavior, rerun `pnpm --filter @happyvertical/documents test` after rebuilding the package to confirm the failure is local.
24
+ - If failures span multiple packages or Turborepo ordering looks wrong, run `pnpm build` and `pnpm typecheck` from the repo root before retrying package-scoped commands.
25
+
26
+ ## Ecosystem Relationships
27
+ - Provides: Multi-part document processing with support for PDF, HTML, and Markdown
28
+ - Implements: none
29
+ - Requires: @happyvertical/files, @happyvertical/utils, @happyvertical/ocr, @happyvertical/pdf, @happyvertical/spider, uuid
30
+ - Stability: stable (Primary package surface is described as implemented and production-oriented.)
31
+ <!-- END AGENT:GENERATED -->
32
+
package/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright <2025> <Happy Vertical Corporation>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,145 @@
1
+ # @happyvertical/documents
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+
5
+ Document processing with hierarchical structure. Currently supports PDF documents with text extraction, automatic document management system detection (WordPress Download Manager, CivicWeb, DocuShare), and file caching. Uses `@happyvertical/spider` for web page analysis and `@happyvertical/pdf` for PDF text extraction.
6
+
7
+ ## Installation
8
+
9
+ Install from the public npm registry:
10
+
11
+ ```bash
12
+ npm install @happyvertical/documents
13
+ ```
14
+
15
+ Anonymous installs also require the package's external `@happyvertical/ocr`, `@happyvertical/pdf`, and `@happyvertical/spider` dependencies to be available on public npm.
16
+
17
+ ## Quick Start
18
+
19
+ ```typescript
20
+ import { fetchDocument } from '@happyvertical/documents';
21
+
22
+ // Process a local PDF
23
+ const doc = await fetchDocument('file:///path/to/report.pdf');
24
+
25
+ for (const part of doc.parts) {
26
+ console.log(part.title);
27
+ console.log(part.content);
28
+ }
29
+
30
+ // Fetch a remote PDF (auto-detected from URL extension)
31
+ const remote = await fetchDocument('https://example.com/report.pdf');
32
+ console.log(remote.parts[0].content);
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### Document Management System Detection
38
+
39
+ When fetching web URLs, the package uses `@happyvertical/spider` to detect document management systems and extract direct PDF links:
40
+
41
+ ```typescript
42
+ // WordPress Download Manager URL — spider detects the PDF link automatically
43
+ const doc = await fetchDocument(
44
+ 'https://example.com/download/meeting-minutes/',
45
+ { scraper: 'basic', spider: 'dom' }
46
+ );
47
+ ```
48
+
49
+ ### Override MIME Type
50
+
51
+ ```typescript
52
+ const doc = await fetchDocument('https://example.com/download?id=123', {
53
+ type: 'application/pdf',
54
+ });
55
+ ```
56
+
57
+ ### Cache Control
58
+
59
+ ```typescript
60
+ const doc = await fetchDocument('https://example.com/report.pdf', {
61
+ cacheDir: './my-cache',
62
+ cache: true,
63
+ cacheExpiry: 600_000, // 10 minutes
64
+ });
65
+ ```
66
+
67
+ ## API Reference
68
+
69
+ ### `fetchDocument(url, options?)`
70
+
71
+ Main factory function. Detects document format, selects the appropriate processor, and returns structured content.
72
+
73
+ - **url** `string` — Document URL or file path (`file://`, `http://`, `https://`)
74
+ - **options** `FetchDocumentOptions` — See below
75
+ - **Returns** `Promise<Document>`
76
+ - **Throws** if no processor is available for the detected MIME type
77
+
78
+ ### `FetchDocumentOptions`
79
+
80
+ | Option | Type | Default | Description |
81
+ |--------|------|---------|-------------|
82
+ | `type` | `string` | auto-detected | Override MIME type detection |
83
+ | `extractImages` | `boolean` | `false` | Extract images from document; only runs when explicitly set to `true` (stub — currently returns `[]`) |
84
+ | `runOcr` | `boolean` | `true` for PDFs | Run OCR on extracted images (stub) |
85
+ | `cacheDir` | `string` | OS temp dir | Directory for caching downloaded files |
86
+ | `cache` | `boolean` | `true` | Enable/disable spider fetch caching |
87
+ | `cacheExpiry` | `number` | `300000` | Cache expiry in milliseconds |
88
+ | `scraper` | `'basic' \| 'crawlee'` | `'basic'` | Scraper type for content extraction |
89
+ | `spider` | `'simple' \| 'dom' \| 'crawlee'` | `'dom'` | Spider adapter for fetching web pages |
90
+ | `headers` | `Record<string, string>` | — | Custom HTTP headers for spider requests |
91
+ | `timeout` | `number` | `30000` | Request timeout in milliseconds |
92
+ | `maxDuration` | `number` | — | Max scraping time in milliseconds |
93
+ | `maxInteractions` | `number` | — | Max interactions for advanced scrapers |
94
+
95
+ ### `Document` (class)
96
+
97
+ Base document handler. Manages downloading, caching, and local file path resolution. Used internally by processors; can also be used directly via `Document.create(url, options)`.
98
+
99
+ ### `PDFProcessor`
100
+
101
+ Implements `DocumentProcessor`. Extracts text from PDF files, validates PDF headers (detects HTML cache poisoning), and caches processed results.
102
+
103
+ ### `getTitleFromUrl(url, defaultTitle?)`
104
+
105
+ Extracts a human-readable title from a URL by parsing the filename, removing extensions, and decoding URL-encoded characters.
106
+
107
+ ### Types
108
+
109
+ ```typescript
110
+ interface Document {
111
+ url: string;
112
+ type: string;
113
+ parts: DocumentPart[];
114
+ metadata?: Record<string, any>;
115
+ }
116
+
117
+ interface DocumentPart {
118
+ id: string;
119
+ title: string;
120
+ content: string;
121
+ type: 'text' | 'html' | 'markdown';
122
+ images?: DocumentImage[];
123
+ metadata?: Record<string, any>;
124
+ parts?: DocumentPart[];
125
+ }
126
+
127
+ interface DocumentImage {
128
+ id: string;
129
+ url: string;
130
+ localPath?: string;
131
+ altText?: string;
132
+ ocrText?: string;
133
+ position?: number;
134
+ metadata?: { width?: number; height?: number; format?: string };
135
+ }
136
+
137
+ interface DocumentProcessor {
138
+ process(url: string, options?: FetchDocumentOptions): Promise<Document>;
139
+ supports(type: string): boolean;
140
+ }
141
+ ```
142
+
143
+ ## License
144
+
145
+ MIT
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ export {};
3
+ //# sourceMappingURL=claude-context.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"claude-context.d.ts","sourceRoot":"","sources":["../../src/cli/claude-context.ts"],"names":[],"mappings":""}
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env node
2
+ import { existsSync, mkdirSync, copyFileSync } from "node:fs";
3
+ import { dirname, join } from "node:path";
4
+ import { fileURLToPath } from "node:url";
5
// Directory of this compiled script; the package root is two levels above it.
const Dirname = dirname(fileURLToPath(import.meta.url));
const pkgRoot = join(Dirname, "../..");
// Context files are installed into `.claude/` under the current working directory.
const targetDir = join(process.cwd(), ".claude");
const pkgName = "documents";

/**
 * Returns the first candidate path that exists on disk, or `undefined`
 * when none of them do.
 */
const firstExisting = (...candidates) =>
  candidates.find((candidate) => existsSync(candidate));

// Prefer the current file names and fall back to the alternate names.
const agentMdSrc = firstExisting(
  join(pkgRoot, "AGENT.md"),
  join(pkgRoot, "CLAUDE.md"),
);
const metaSrc = firstExisting(
  join(pkgRoot, "metadata.json"),
  join(pkgRoot, ".claude-meta.json"),
);

// `recursive: true` makes this a no-op when the directory already exists,
// so no separate existence check is needed.
mkdirSync(targetDir, { recursive: true });

let installedCount = 0;
if (agentMdSrc) {
  copyFileSync(agentMdSrc, join(targetDir, `have-${pkgName}.md`));
  installedCount += 1;
}
if (metaSrc) {
  copyFileSync(metaSrc, join(targetDir, `have-${pkgName}.meta.json`));
  installedCount += 1;
}

// Only claim success when at least one file was actually copied.
if (installedCount > 0) {
  console.log(`✓ Installed @happyvertical/${pkgName} context to .claude/`);
} else {
  console.warn(
    `No context files found for @happyvertical/${pkgName} in ${pkgRoot}; nothing was installed.`,
  );
}
21
+ //# sourceMappingURL=claude-context.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"claude-context.js","sources":["../../src/cli/claude-context.ts"],"sourcesContent":["#!/usr/bin/env node\n/**\n * CLI script to install agent context for @happyvertical/documents\n * Run the published context installer binary for this package.\n */\nimport { copyFileSync, existsSync, mkdirSync } from 'node:fs';\nimport { dirname, join } from 'node:path';\nimport { fileURLToPath } from 'node:url';\n\nconst Dirname = dirname(fileURLToPath(import.meta.url));\nconst pkgRoot = join(Dirname, '../..');\nconst targetDir = join(process.cwd(), '.claude');\n\nif (!existsSync(targetDir)) {\n mkdirSync(targetDir, { recursive: true });\n}\n\nconst pkgName = 'documents';\nconst agentMdSrc = existsSync(join(pkgRoot, 'AGENT.md'))\n ? join(pkgRoot, 'AGENT.md')\n : join(pkgRoot, 'CLAUDE.md');\nconst metaSrc = existsSync(join(pkgRoot, 'metadata.json'))\n ? join(pkgRoot, 'metadata.json')\n : join(pkgRoot, '.claude-meta.json');\n\nif (existsSync(agentMdSrc)) {\n copyFileSync(agentMdSrc, join(targetDir, `have-${pkgName}.md`));\n}\n\nif (existsSync(metaSrc)) {\n copyFileSync(metaSrc, join(targetDir, `have-${pkgName}.meta.json`));\n}\n\nconsole.log(`✓ Installed @happyvertical/${pkgName} context to .claude/`);\n"],"names":[],"mappings":";;;;AASA,MAAM,UAAU,QAAQ,cAAc,YAAY,GAAG,CAAC;AACtD,MAAM,UAAU,KAAK,SAAS,OAAO;AACrC,MAAM,YAAY,KAAK,QAAQ,IAAA,GAAO,SAAS;AAE/C,IAAI,CAAC,WAAW,SAAS,GAAG;AAC1B,YAAU,WAAW,EAAE,WAAW,KAAA,CAAM;AAC1C;AAEA,MAAM,UAAU;AAChB,MAAM,aAAa,WAAW,KAAK,SAAS,UAAU,CAAC,IACnD,KAAK,SAAS,UAAU,IACxB,KAAK,SAAS,WAAW;AAC7B,MAAM,UAAU,WAAW,KAAK,SAAS,eAAe,CAAC,IACrD,KAAK,SAAS,eAAe,IAC7B,KAAK,SAAS,mBAAmB;AAErC,IAAI,WAAW,UAAU,GAAG;AAC1B,eAAa,YAAY,KAAK,WAAW,QAAQ,OAAO,KAAK,CAAC;AAChE;AAEA,IAAI,WAAW,OAAO,GAAG;AACvB,eAAa,SAAS,KAAK,WAAW,QAAQ,OAAO,YAAY,CAAC;AACpE;AAEA,QAAQ,IAAI,8BAA8B,OAAO,sBAAsB;"}
@@ -0,0 +1,88 @@
1
+ import { URL } from 'node:url';
2
+ import { DocumentPart, Document as DocumentType, FetchDocumentOptions } from './types';
3
+ /**
4
+ * Base document handler with multi-part support
5
+ *
6
+ * Provides functionality for downloading, caching, and structuring documents
7
+ * into hierarchical parts. Specific format processing (PDF, HTML, Markdown)
8
+ * is handled by specialized processors.
9
+ */
10
+ export declare class Document {
11
+ /**
12
+ * Flag indicating if document is from a remote (`http`/`https`) source
13
+ */
14
+ protected isRemote: boolean;
15
+ /**
16
+ * Configuration options passed to the constructor
17
+ */
18
+ protected options: FetchDocumentOptions;
19
+ /**
20
+ * Local file path: the decoded path for `file:` URLs, or the cache location for `http(s)` URLs
21
+ */
22
+ private _localPath;
23
+ /**
24
+ * Directory used for caching files (`options.cacheDir` or a folder under the OS temp dir)
25
+ */
26
+ private _cacheDir;
27
+ /**
28
+ * Document URL (parsed)
29
+ */
30
+ url: URL;
31
+ /**
32
+ * Document MIME type (`options.type`, else detected from the URL, else `text/plain`)
33
+ */
34
+ type: string;
35
+ /**
36
+ * Document parts (hierarchical structure)
37
+ */
38
+ parts: DocumentPart[];
39
+ /**
40
+ * Document-level metadata
41
+ */
42
+ metadata: Record<string, any>;
43
+ /**
44
+ * Get the local file path (decoded `file:` path, or cache location for `http(s)` URLs)
45
+ */
46
+ get localPath(): string;
47
+ /**
48
+ * Get the directory used for caching files
49
+ */
50
+ get cacheDir(): string;
51
+ /**
52
+ * Creates a new Document instance without downloading anything
53
+ *
54
+ * @param url - Absolute URL string parseable by `new URL` (`file://`, `http://` or `https://`)
55
+ * @param options - Document configuration options
56
+ */
57
+ constructor(url: string, options?: FetchDocumentOptions);
58
+ /**
59
+ * Creates and initializes a Document instance
60
+ *
61
+ * Downloads remote files and prepares the document for processing.
62
+ *
63
+ * @param url - Absolute URL string parseable by `new URL` (`file://`, `http://` or `https://`)
64
+ * @param options - Document configuration options
65
+ * @returns Promise resolving to the initialized Document
66
+ */
67
+ static create(url: string, options?: FetchDocumentOptions): Promise<Document>;
68
+ /**
69
+ * Initializes the document, downloading it to `localPath` if it's remote
70
+ *
71
+ * @returns Promise that resolves when initialization is complete
72
+ */
73
+ initialize(): Promise<void>;
74
+ /**
75
+ * Checks, by MIME type or local file extension, whether the document is text-based
76
+ *
77
+ * @returns Boolean indicating if the file is text-based
78
+ */
79
+ isTextFile(): boolean;
80
+ /**
81
+ * Converts the document to the standard Document interface
82
+ *
83
+ * @returns Document object with URL (as a string), type, parts, and metadata
84
+ */
85
+ toDocument(): DocumentType;
86
+ }
87
+ export default Document;
88
+ //# sourceMappingURL=document.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../src/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAG/B,OAAO,KAAK,EACV,YAAY,EACZ,QAAQ,IAAI,YAAY,EACxB,oBAAoB,EACrB,MAAM,SAAS,CAAC;AAEjB;;;;;;GAMG;AACH,qBAAa,QAAQ;IACnB;;OAEG;IACH,SAAS,CAAC,QAAQ,UAAS;IAE3B;;OAEG;IACH,SAAS,CAAC,OAAO,EAAE,oBAAoB,CAAC;IAExC;;OAEG;IACH,OAAO,CAAC,UAAU,CAAM;IAExB;;OAEG;IACH,OAAO,CAAC,SAAS,CAAM;IAEvB;;OAEG;IACI,GAAG,EAAE,GAAG,CAAC;IAEhB;;OAEG;IACI,IAAI,EAAE,MAAM,CAAC;IAEpB;;OAEG;IACI,KAAK,EAAE,YAAY,EAAE,CAAM;IAElC;;OAEG;IACI,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAM;IAE1C;;OAEG;IACH,IAAW,SAAS,IAAI,MAAM,CAE7B;IAED;;OAEG;IACH,IAAW,QAAQ,IAAI,MAAM,CAE5B;IAED;;;;;OAKG;gBACS,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB;IA+C3D;;;;;;;;OAQG;WACU,MAAM,CACjB,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,oBAAyB,GACjC,OAAO,CAAC,QAAQ,CAAC;IAMpB;;;;OAIG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IASjC;;;;OAIG;IACI,UAAU,IAAI,OAAO;IAwB5B;;;;OAIG;IACI,UAAU,IAAI,YAAY;CAQlC;AAED,eAAe,QAAQ,CAAC"}
@@ -0,0 +1,40 @@
1
+ import { Document, FetchDocumentOptions } from './types';
2
+ /**
3
+ * Fetch a document from a URL with automatic format detection
4
+ *
5
+ * This factory function:
6
+ * 1. Detects the document format (only PDF currently has a registered processor)
7
+ * 2. Selects the appropriate processor
8
+ * 3. Processes the document into structured parts
9
+ * 4. Returns a Document object with hierarchical content
10
+ *
11
+ * @param url - Document URL or file path (file://, http://, https://)
12
+ * @param options - Fetch and processing options
13
+ * @returns Promise resolving to structured Document
14
+ * @throws Error when no registered processor supports the detected type
15
+ * @example
16
+ * ```typescript
17
+ * // Fetch a PDF (image extraction/OCR are currently stubs, so `images` stays empty)
18
+ * const doc = await fetchDocument('https://example.com/report.pdf', {
19
+ * extractImages: true,
20
+ * runOcr: true
21
+ * });
22
+ *
23
+ * // Access document parts
24
+ * for (const part of doc.parts) {
25
+ * console.log(part.title);
26
+ * console.log(part.content);
27
+ *
28
+ * // Check for images
29
+ * if (part.images) {
30
+ * for (const image of part.images) {
31
+ * console.log(image.url);
32
+ * console.log(image.ocrText); // Text extracted via OCR
33
+ * }
34
+ * }
35
+ * }
36
+ * ```
37
+ */
38
+ export declare function fetchDocument(url: string, options?: FetchDocumentOptions): Promise<Document>;
39
+ export default fetchDocument;
40
+ //# sourceMappingURL=factory.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"factory.d.ts","sourceRoot":"","sources":["../src/factory.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,SAAS,CAAC;AAO9D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AACH,wBAAsB,aAAa,CACjC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,oBAAyB,GACjC,OAAO,CAAC,QAAQ,CAAC,CAyEnB;AAED,eAAe,aAAa,CAAC"}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * @happyvertical/documents - Document processing with multi-part structure
3
+ *
4
+ * Provides document processing for PDFs with support for:
5
+ * - Hierarchical document parts
6
+ * - Automatic format detection from URL or MIME type
7
+ * - Document management system detection (WordPress, CivicWeb, DocuShare)
8
+ * - File caching for performance
9
+ *
10
+ * @example
11
+ * ```typescript
12
+ * import { fetchDocument } from '@happyvertical/documents';
13
+ *
14
+ * const doc = await fetchDocument('https://example.com/report.pdf');
15
+ *
16
+ * for (const part of doc.parts) {
17
+ * console.log(part.title);
18
+ * console.log(part.content);
19
+ * }
20
+ * ```
21
+ */
22
+ export { Document } from './document';
23
+ export { fetchDocument } from './factory';
24
+ export { PDFProcessor } from './processors/pdf';
25
+ export type { Document as DocumentType, DocumentImage, DocumentPart, DocumentProcessor, FetchDocumentOptions, } from './types';
26
+ export { getTitleFromUrl } from './utils';
27
+ /** @internal */
28
+ export declare const PACKAGE_VERSION_INITIALIZED = true;
29
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAGH,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAEtC,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAG1C,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAEhD,YAAY,EACV,QAAQ,IAAI,YAAY,EACxB,aAAa,EACb,YAAY,EACZ,iBAAiB,EACjB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAEjB,OAAO,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE1C,gBAAgB;AAChB,eAAO,MAAM,2BAA2B,OAAO,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,320 @@
1
import { createHash } from "node:crypto";
import { promises } from "node:fs";
import os from "node:os";
import path from "node:path";
import { URL as URL$1, fileURLToPath } from "node:url";
import { getMimeType, downloadFileWithCache, getCached, setCached } from "@happyvertical/files";
import { getPDFReader } from "@happyvertical/pdf";
import { scrapeDocument } from "@happyvertical/spider";
import { makeSlug } from "@happyvertical/utils";
import { v4 } from "uuid";
10
/**
 * Builds the on-disk cache location for a remote (`http`/`https`) document.
 *
 * The path is `<cacheDir>/<slug of hostname>/<url pathname>`, with:
 * - a trailing slash removed (directory-style URLs),
 * - `.pdf` appended when the pathname has no extension and the type is PDF,
 * - a short hash of the query string inserted before the extension, because
 *   `url.pathname` excludes the query and URLs such as `/download?id=1` and
 *   `/download?id=2` would otherwise share one cache file.
 *
 * @param cacheDir - Root cache directory
 * @param url - Parsed document URL
 * @param type - Document MIME type
 * @returns Local cache file path for the URL
 */
function cachePathForUrl(cacheDir, url, type) {
  let pathname = url.pathname;
  if (pathname.endsWith("/")) {
    pathname = pathname.slice(0, -1);
  }
  const extension = pathname.match(/\.[a-z0-9]+$/i)?.[0] ?? "";
  let stem = extension ? pathname.slice(0, -extension.length) : pathname;
  if (url.search) {
    const queryHash = createHash("sha256")
      .update(url.search)
      .digest("hex")
      .slice(0, 12);
    stem += `-${queryHash}`;
  }
  const suffix = extension || (type === "application/pdf" ? ".pdf" : "");
  return path.join(cacheDir, makeSlug(url.hostname), `${stem}${suffix}`);
}

/**
 * Base document handler.
 *
 * Parses the document URL, decides where the document lives locally
 * (the file itself for `file:` URLs, a cache location for `http(s)` URLs)
 * and downloads remote documents on `initialize()`.
 */
class Document {
  /**
   * Flag indicating if document is from a remote (`http`/`https`) source
   */
  isRemote = false;
  /**
   * Configuration options passed to the constructor
   */
  options;
  /**
   * Local file path where document is stored
   */
  _localPath = "";
  /**
   * Directory used for caching files
   */
  _cacheDir = "";
  /**
   * Document URL (parsed)
   */
  url;
  /**
   * Document MIME type
   */
  type;
  /**
   * Document parts (hierarchical structure)
   */
  parts = [];
  /**
   * Document-level metadata
   */
  metadata = {};
  /**
   * Get the local file path where document is stored
   */
  get localPath() {
    return this._localPath;
  }
  /**
   * Get the directory used for caching files
   */
  get cacheDir() {
    return this._cacheDir;
  }
  /**
   * Creates a new Document instance. Nothing is downloaded here; call
   * `initialize()` (or use `Document.create`) for that.
   *
   * @param url - Absolute URL string parseable by `new URL`
   *   (`file://`, `http://` or `https://`)
   * @param options - Document configuration options
   * @throws TypeError if `url` cannot be parsed, or if a `file:` URL cannot
   *   be converted to a local path
   */
  constructor(url, options = {}) {
    this.url = new URL$1(url);
    this.options = options;
    this.type = options.type || getMimeType(this.url.toString()) || "text/plain";
    this._cacheDir = options.cacheDir || path.resolve(os.tmpdir(), ".cache", "have-sdk", "documents");
    if (this.url.protocol === "file:") {
      // fileURLToPath decodes percent-escapes and produces a platform-correct
      // path (e.g. `C:\dir\file.pdf` on Windows instead of `/C:/dir/file.pdf`).
      this._localPath = fileURLToPath(this.url);
      this.isRemote = false;
    } else if (this.url.protocol.startsWith("http")) {
      this._localPath = cachePathForUrl(this._cacheDir, this.url, this.type);
      this.isRemote = true;
    }
  }
  /**
   * Creates and initializes a Document instance
   *
   * Downloads remote files and prepares the document for processing.
   *
   * @param url - Absolute URL string (`file://`, `http://` or `https://`)
   * @param options - Document configuration options
   * @returns Promise resolving to the initialized Document
   */
  static async create(url, options = {}) {
    const document = new Document(url, options);
    await document.initialize();
    return document;
  }
  /**
   * Initializes the document, downloading it to `localPath` if it's remote.
   * Local (`file:`) documents need no work.
   *
   * @returns Promise that resolves when initialization is complete
   * @throws Error if the document is remote but `url` has been cleared
   */
  async initialize() {
    if (!this.isRemote) {
      return;
    }
    if (!this.url) {
      throw new Error("Cannot initialize remote document: URL is required");
    }
    await downloadFileWithCache(this.url.toString(), this._localPath);
  }
  /**
   * Checks if the document is a text-based file that can be read directly,
   * judged by its MIME type or, failing that, its local file extension.
   *
   * @returns Boolean indicating if the file is text-based
   */
  isTextFile() {
    if (!this.type) return false;
    const textMimeTypes = [
      "application/json",
      "application/xml",
      "application/javascript",
      "application/typescript"
    ];
    if (this.type.startsWith("text/") || textMimeTypes.includes(this.type)) {
      return true;
    }
    const textExtensions = [
      ".txt",
      ".md",
      ".json",
      ".xml",
      ".html",
      ".css",
      ".js",
      ".ts",
      ".yaml",
      ".yml"
    ];
    const lowerPath = this.localPath.toLowerCase();
    return textExtensions.some((ext) => lowerPath.endsWith(ext));
  }
  /**
   * Converts the document to the standard Document interface
   *
   * @returns Document object with URL (as a string), type, parts, and metadata
   */
  toDocument() {
    return {
      url: this.url.toString(),
      type: this.type,
      parts: this.parts,
      metadata: this.metadata
    };
  }
}
148
/**
 * Extract a human-readable title from a URL.
 *
 * Takes the last path segment, decodes percent-escapes, strips a trailing
 * `.pdf`/`.htm`/`.html`/`.md`/`.txt` extension and turns `-`/`_` into spaces.
 *
 * @param url - URL string to extract the title from
 * @param defaultTitle - Title returned when the URL cannot be parsed or
 *   decoded, has no filename, or the filename reduces to an empty string
 * @returns Formatted title string (never empty unless `defaultTitle` is)
 */
function getTitleFromUrl(url, defaultTitle = "Document") {
  try {
    const { pathname } = new URL(url);
    const filename = pathname.split("/").pop() || defaultTitle;
    const title = decodeURIComponent(filename)
      .replace(/\.(pdf|html?|md|txt)$/i, "")
      .replace(/[-_]/g, " ")
      .trim();
    // A name such as ".pdf" or "_.pdf" leaves nothing behind; fall back
    // rather than returning an empty title.
    return title || defaultTitle;
  } catch {
    // Invalid URL or malformed percent-encoding.
    return defaultTitle;
  }
}
159
/**
 * PDF document processor.
 *
 * Extracts text from a PDF via the PDF reader, validates the file header
 * before parsing, and caches the processed result keyed by local path.
 * Image extraction and OCR are placeholders.
 */
class PDFProcessor {
  /**
   * Check if this processor supports the given MIME type or extension.
   * Accepts `'application/pdf'` (optionally followed by parameters such as
   * `; charset=binary`), any string ending in `'.pdf'`, or `'pdf'` — all
   * case-insensitively. Non-string input is never supported.
   *
   * @param type - MIME type or file extension to check
   * @returns `true` if this processor can handle the given type
   */
  supports(type) {
    if (typeof type !== "string") {
      return false;
    }
    const normalized = type.trim().toLowerCase();
    const mimeType = normalized.split(";", 1)[0].trim();
    return mimeType === "application/pdf" || normalized.endsWith(".pdf") || normalized === "pdf";
  }
  /**
   * Process a PDF document
   *
   * Extracts text (and, only when `options.extractImages === true`, calls
   * the image-extraction placeholder) and wraps the result in a single
   * document part.
   *
   * @param url - PDF URL (`file://`, `http://` or `https://`)
   * @param options - Processing options
   * @returns Promise resolving to structured Document
   * @throws Error if the file does not start with the `%PDF-` magic bytes
   */
  async process(url, options = {}) {
    const baseDoc = await Document.create(url, options);
    const cacheKey = `${baseDoc.localPath}.processed_pdf`;
    const cached = await getCached(cacheKey);
    if (cached) {
      try {
        const parsed = JSON.parse(cached);
        return {
          url: baseDoc.url.toString(),
          type: baseDoc.type,
          parts: parsed.parts,
          metadata: parsed.metadata || {}
        };
      } catch (error) {
        console.warn("Cached PDF data corrupted, reprocessing", error);
      }
    }
    const fileBuffer = await promises.readFile(baseDoc.localPath);
    const header = fileBuffer.subarray(0, 5).toString("utf-8");
    if (header !== "%PDF-") {
      // Only a downloaded copy is a disposable cache file. For `file:` URLs
      // `localPath` is the caller's own file and must never be deleted.
      const isCachedDownload = baseDoc.url.protocol.startsWith("http");
      if (isCachedDownload) {
        try {
          await promises.unlink(baseDoc.localPath);
        } catch (unlinkError) {
          console.warn(
            `Failed to delete poisoned cache file: ${baseDoc.localPath}`,
            unlinkError
          );
        }
      }
      // Sniff the first bytes case-insensitively (`<HTML>`, `<!doctype html>`).
      const preview = fileBuffer.toString(
        "utf-8",
        0,
        Math.min(1e3, fileBuffer.length)
      ).toLowerCase();
      const looksLikeHtml = preview.includes("<!doctype html") || preview.includes("<html");
      if (looksLikeHtml) {
        throw new Error(
          isCachedDownload ? `Downloaded file is HTML, not PDF. The server returned HTML content for ${url}. This commonly occurs with WordPress Download Manager URLs that return tracking pages. Expected PDF magic bytes (%PDF-) but got: ${header}. The poisoned cache file has been removed - please try again.` : `File is HTML, not PDF: ${url}. Expected PDF magic bytes (%PDF-) but got: ${header}. The local file was left untouched.`
        );
      }
      throw new Error(
        isCachedDownload ? `Downloaded file is not a valid PDF. Expected %PDF- magic bytes but got: ${header}. The invalid cache file has been removed - please try again.` : `File is not a valid PDF: ${url}. Expected %PDF- magic bytes but got: ${header}. The local file was left untouched.`
      );
    }
    const reader = await getPDFReader();
    const extractedText = await reader.extractText(baseDoc.localPath);
    const mainPart = {
      id: v4(),
      title: getTitleFromUrl(url, "PDF Document"),
      content: extractedText || "",
      type: "text",
      metadata: {
        source: "pdf",
        filePath: baseDoc.localPath
      }
    };
    if (options.extractImages === true) {
      mainPart.images = await this.extractImages(
        baseDoc.localPath,
        options.runOcr !== false
      );
    }
    const document = {
      url: baseDoc.url.toString(),
      type: baseDoc.type,
      parts: [mainPart],
      metadata: {
        processor: "pdf",
        extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
        hasImages: (mainPart.images?.length || 0) > 0
      }
    };
    await setCached(cacheKey, JSON.stringify(document));
    return document;
  }
  /**
   * Extract images from PDF
   *
   * Placeholder: currently ignores both arguments and resolves to an
   * empty array.
   *
   * @param _filePath - Local PDF file path (unused)
   * @param _runOcr - Whether to run OCR on extracted images (unused)
   * @returns Promise resolving to array of DocumentImages (always empty for now)
   */
  async extractImages(_filePath, _runOcr) {
    return [];
  }
}
268
+ const processors = [new PDFProcessor()];
269
/** Spider strategies whose result URL is treated as a direct PDF link. */
const PDF_LINK_STRATEGIES = new Set([
  "wordpress-pdf-link",
  "civicweb-pdf-link",
  "docushare-pdf-link"
]);

/**
 * Fetch a document from a URL with automatic format detection.
 *
 * For `http(s)` URLs without an explicit `options.type`, the page is first
 * scraped; if the scraper reports one of the known PDF-link strategies, the
 * discovered URL is fetched as a PDF instead. The type is then taken from
 * `options.type`, a `.pdf` suffix in the URL, or `getMimeType`, and the first
 * registered processor supporting it handles the document.
 *
 * The caller's `options` object is never modified.
 *
 * @param url - Document URL (`file://`, `http://`, `https://`)
 * @param options - Fetch and processing options
 * @returns Promise resolving to the processed document
 * @throws Error if no processor is available for the detected type
 */
async function fetchDocument(url, options = {}) {
  // Shallow copy: a detected type must not leak into an options object the
  // caller may reuse for another URL.
  const resolvedOptions = { ...options };
  let targetUrl = url;
  const isWebUrl = url.startsWith("http://") || url.startsWith("https://");
  if (isWebUrl && !resolvedOptions.type) {
    try {
      const scraped = await scrapeDocument(url, {
        scraper: options.scraper || "basic",
        spider: options.spider || "dom",
        cache: options.cache,
        cacheExpiry: options.cacheExpiry,
        headers: options.headers,
        timeout: options.timeout,
        maxDuration: options.maxDuration,
        maxInteractions: options.maxInteractions
      });
      const hasDocLink = PDF_LINK_STRATEGIES.has(scraped.metadata.strategy);
      // NOTE(review): `complete` presumably means the scraper already holds
      // the full content — confirm against the spider package.
      if (hasDocLink && scraped.metadata.isPdf && !scraped.metadata.complete) {
        targetUrl = scraped.url;
        resolvedOptions.type = "application/pdf";
      }
    } catch (error) {
      // Best effort: detection problems must not block a direct download.
      console.warn(
        `Spider detection failed for ${url}, falling back to direct download:`,
        error
      );
    }
  }
  let type = resolvedOptions.type;
  if (!type) {
    const urlLower = targetUrl.toLowerCase();
    if (urlLower.endsWith(".pdf") || urlLower.includes(".pdf?") || urlLower.includes(".pdf#")) {
      type = "application/pdf";
    } else {
      type = getMimeType(targetUrl) || "";
    }
  }
  const processor = processors.find((p) => p.supports(type));
  if (!processor) {
    throw new Error(
      `No processor available for document type: ${type}. Supported types: PDF (.pdf, application/pdf)`
    );
  }
  return processor.process(targetUrl, resolvedOptions);
}
312
+ const PACKAGE_VERSION_INITIALIZED = true;
313
+ export {
314
+ Document,
315
+ PACKAGE_VERSION_INITIALIZED,
316
+ PDFProcessor,
317
+ fetchDocument,
318
+ getTitleFromUrl
319
+ };
320
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sources":["../src/document.ts","../src/utils.ts","../src/processors/pdf.ts","../src/factory.ts","../src/index.ts"],"sourcesContent":["import os from 'node:os';\nimport path from 'node:path';\nimport { URL } from 'node:url';\nimport { downloadFileWithCache, getMimeType } from '@happyvertical/files';\nimport { makeSlug } from '@happyvertical/utils';\nimport type {\n DocumentPart,\n Document as DocumentType,\n FetchDocumentOptions,\n} from './types';\n\n/**\n * Base document handler with multi-part support\n *\n * Provides functionality for downloading, caching, and structuring documents\n * into hierarchical parts. Specific format processing (PDF, HTML, Markdown)\n * is handled by specialized processors.\n */\nexport class Document {\n /**\n * Flag indicating if document is from a remote source\n */\n protected isRemote = false;\n\n /**\n * Configuration options\n */\n protected options: FetchDocumentOptions;\n\n /**\n * Local file path where document is stored\n */\n private _localPath = '';\n\n /**\n * Directory used for caching files\n */\n private _cacheDir = '';\n\n /**\n * Document URL\n */\n public url: URL;\n\n /**\n * Document MIME type\n */\n public type: string;\n\n /**\n * Document parts (hierarchical structure)\n */\n public parts: DocumentPart[] = [];\n\n /**\n * Document-level metadata\n */\n public metadata: Record<string, any> = {};\n\n /**\n * Get the local file path where document is stored\n */\n public get localPath(): string {\n return this._localPath;\n }\n\n /**\n * Get the directory used for caching files\n */\n public get cacheDir(): string {\n return this._cacheDir;\n }\n\n /**\n * Creates a new Document instance\n *\n * @param url - Document URL or file path\n * @param options - Document configuration options\n */\n constructor(url: string, options: FetchDocumentOptions = {}) {\n this.url = new URL(url);\n this.options = options;\n this.type =\n options.type || getMimeType(this.url.toString()) || 
'text/plain';\n\n this._cacheDir =\n options.cacheDir ||\n path.resolve(os.tmpdir(), '.cache', 'have-sdk', 'documents');\n\n if (this.url.protocol.startsWith('file')) {\n // Decode URL-encoded characters in the pathname only (e.g., %20 -> space).\n // Note: Query parameters and hash fragments are not decoded here.\n this._localPath = decodeURIComponent(this.url.pathname);\n this.isRemote = false;\n } else if (this.url.protocol.startsWith('http')) {\n // Generate cache path from URL pathname\n // Query parameters (?) and fragments (#) are automatically excluded from url.pathname\n let pathname = this.url.pathname;\n\n // Remove trailing slash (directory-style URLs)\n if (pathname.endsWith('/')) {\n pathname = pathname.slice(0, -1);\n }\n\n // Add file extension if missing and we know the type\n // This is crucial for URLs like /download/file/?wpdmdl=123 which have no extension\n if (!pathname.match(/\\.[a-z0-9]+$/i)) {\n // Add appropriate extension based on MIME type\n if (\n this.type === 'application/pdf' ||\n options.type === 'application/pdf'\n ) {\n pathname += '.pdf';\n }\n // Future: Add other common extensions (html, json, etc.)\n }\n\n this._localPath = path.join(\n this._cacheDir,\n makeSlug(this.url.hostname),\n pathname,\n );\n this.isRemote = true;\n }\n }\n\n /**\n * Creates and initializes a Document instance\n *\n * Downloads remote files and prepares the document for processing.\n *\n * @param url - Document URL or file path\n * @param options - Document configuration options\n * @returns Promise resolving to the initialized Document\n */\n static async create(\n url: string,\n options: FetchDocumentOptions = {},\n ): Promise<Document> {\n const document = new Document(url, options);\n await document.initialize();\n return document;\n }\n\n /**\n * Initializes the document, downloading it if it's remote\n *\n * @returns Promise that resolves when initialization is complete\n */\n async initialize(): Promise<void> {\n if (this.isRemote) {\n if 
(!this.url) {\n throw new Error('Cannot initialize remote document: URL is required');\n }\n await downloadFileWithCache(this.url.toString(), this._localPath);\n }\n }\n\n /**\n * Checks if the document is a text-based file that can be read directly\n *\n * @returns Boolean indicating if the file is text-based\n */\n public isTextFile(): boolean {\n if (!this.type) return false;\n\n return (\n this.type.startsWith('text/') ||\n this.type === 'application/json' ||\n this.type === 'application/xml' ||\n this.type === 'application/javascript' ||\n this.type === 'application/typescript' ||\n [\n '.txt',\n '.md',\n '.json',\n '.xml',\n '.html',\n '.css',\n '.js',\n '.ts',\n '.yaml',\n '.yml',\n ].some((ext) => this.localPath.toLowerCase().endsWith(ext))\n );\n }\n\n /**\n * Converts the document to the standard Document interface\n *\n * @returns Document object with URL, type, parts, and metadata\n */\n public toDocument(): DocumentType {\n return {\n url: this.url.toString(),\n type: this.type,\n parts: this.parts,\n metadata: this.metadata,\n };\n }\n}\n\nexport default Document;\n","/**\n * Utility functions for document processing\n */\n\n/**\n * Extract a human-readable title from a URL\n *\n * Takes a URL and extracts the filename from the pathname, then formats it\n * into a readable title by removing the extension and converting separators\n * to spaces. 
Also decodes URL-encoded characters like %20.\n *\n * @param url - URL string to extract title from\n * @param defaultTitle - Default title to use if extraction fails\n * @returns Formatted title string\n *\n * @example\n * ```typescript\n * getTitleFromUrl('file:///path/to/My%20Document.pdf')\n * // Returns: 'My Document'\n *\n * getTitleFromUrl('https://example.com/research_paper.pdf')\n * // Returns: 'research paper'\n * ```\n */\nexport function getTitleFromUrl(\n url: string,\n defaultTitle = 'Document',\n): string {\n try {\n const urlObj = new URL(url);\n const pathname = urlObj.pathname;\n const filename = pathname.split('/').pop() || defaultTitle;\n\n // Decode URL-encoded characters (e.g., %20 -> space)\n const decodedFilename = decodeURIComponent(filename);\n\n // Remove extension and convert separators to spaces\n return decodedFilename\n .replace(/\\.(pdf|html?|md|txt)$/i, '')\n .replace(/[-_]/g, ' ')\n .trim();\n } catch {\n return defaultTitle;\n }\n}\n","import { promises as fs } from 'node:fs';\nimport { getCached, setCached } from '@happyvertical/files';\nimport { getPDFReader } from '@happyvertical/pdf';\nimport { v4 as uuidv4 } from 'uuid';\nimport { Document as BaseDocument } from '../document';\nimport type {\n Document,\n DocumentImage,\n DocumentPart,\n DocumentProcessor,\n FetchDocumentOptions,\n} from '../types';\nimport { getTitleFromUrl } from '../utils';\n\n/**\n * PDF Document Processor\n *\n * Handles PDF documents with support for:\n * - Text extraction from PDF content via `@happyvertical/pdf`\n * - PDF header validation (detects HTML cache poisoning from document management systems)\n * - Processed document caching via `@happyvertical/files`\n *\n * Image extraction and OCR are stubbed for future implementation.\n */\nexport class PDFProcessor implements DocumentProcessor {\n /**\n * Check if this processor supports the given MIME type or extension.\n * Accepts `'application/pdf'`, `'.pdf'`, or `'pdf'` (case-insensitive).\n *\n * 
@param type - MIME type or file extension to check\n * @returns `true` if this processor can handle the given type\n */\n supports(type: string): boolean {\n return (\n type === 'application/pdf' ||\n type.endsWith('.pdf') ||\n type.toLowerCase() === 'pdf'\n );\n }\n\n /**\n * Process a PDF document\n *\n * Extracts text and optionally images/OCR from the PDF, structuring\n * it into hierarchical document parts.\n *\n * @param url - PDF URL or file path\n * @param options - Processing options\n * @returns Promise resolving to structured Document\n */\n async process(\n url: string,\n options: FetchDocumentOptions = {},\n ): Promise<Document> {\n // Create and initialize base document\n const baseDoc = await BaseDocument.create(url, options);\n\n // Check cache for processed document\n const cacheKey = `${baseDoc.localPath}.processed_pdf`;\n const cached = await getCached(cacheKey);\n if (cached) {\n try {\n const parsed = JSON.parse(cached);\n return {\n url: baseDoc.url.toString(),\n type: baseDoc.type,\n parts: parsed.parts,\n metadata: parsed.metadata || {},\n };\n } catch (error) {\n // Cache corrupted, continue with fresh processing\n console.warn('Cached PDF data corrupted, reprocessing', error);\n }\n }\n\n // Validate that the downloaded file is actually a PDF (issue #460, #463)\n // WordPress Download Manager and some other servers may return HTML\n // with Content-Type: application/pdf, causing PDF extraction to fail\n const fileBuffer = await fs.readFile(baseDoc.localPath);\n const header = fileBuffer.subarray(0, 5).toString('utf-8');\n\n if (header !== '%PDF-') {\n // File is not a valid PDF - delete poisoned cache file (issue #463)\n try {\n await fs.unlink(baseDoc.localPath);\n } catch (unlinkError) {\n console.warn(\n `Failed to delete poisoned cache file: ${baseDoc.localPath}`,\n unlinkError,\n );\n }\n\n // Check if it's HTML to provide helpful error message\n const content = fileBuffer.toString(\n 'utf-8',\n 0,\n Math.min(1000, 
fileBuffer.length),\n );\n if (content.includes('<!DOCTYPE html>') || content.includes('<html')) {\n throw new Error(\n `Downloaded file is HTML, not PDF. The server returned HTML content for ${url}. ` +\n 'This commonly occurs with WordPress Download Manager URLs that return tracking pages. ' +\n `Expected PDF magic bytes (%PDF-) but got: ${header}. ` +\n 'The poisoned cache file has been removed - please try again.',\n );\n } else {\n throw new Error(\n `Downloaded file is not a valid PDF. Expected %PDF- magic bytes but got: ${header}. ` +\n 'The invalid cache file has been removed - please try again.',\n );\n }\n }\n\n // Get PDF reader and extract content\n const reader = await getPDFReader();\n const extractedText = await reader.extractText(baseDoc.localPath);\n\n // Create main document part\n const mainPart: DocumentPart = {\n id: uuidv4(),\n title: getTitleFromUrl(url, 'PDF Document'),\n content: extractedText || '',\n type: 'text',\n metadata: {\n source: 'pdf',\n filePath: baseDoc.localPath,\n },\n };\n\n // Extract images if enabled\n if (options.extractImages === true) {\n mainPart.images = await this.extractImages(\n baseDoc.localPath,\n options.runOcr !== false,\n );\n }\n\n const document: Document = {\n url: baseDoc.url.toString(),\n type: baseDoc.type,\n parts: [mainPart],\n metadata: {\n processor: 'pdf',\n extractedAt: new Date().toISOString(),\n hasImages: (mainPart.images?.length || 0) > 0,\n },\n };\n\n // Cache the processed document\n await setCached(cacheKey, JSON.stringify(document));\n\n return document;\n }\n\n /**\n * Extract images from PDF\n *\n * This is a placeholder for future image extraction functionality.\n * Will use @happyvertical/pdf's image extraction capabilities when available.\n *\n * @param filePath - Local PDF file path\n * @param runOcr - Whether to run OCR on extracted images\n * @returns Promise resolving to array of DocumentImages\n */\n private async extractImages(\n _filePath: string,\n _runOcr: boolean,\n ): 
Promise<DocumentImage[]> {\n // TODO: Implement image extraction using @happyvertical/pdf\n // For now, return empty array as placeholder\n\n // Future implementation will:\n // 1. Use getPDFReader() to extract images from PDF\n // 2. Save images to cache directory\n // 3. If runOcr is true, use @happyvertical/ocr to extract text from images\n // 4. Return array of DocumentImage objects with metadata\n\n return [];\n }\n}\n\nexport default PDFProcessor;\n","import { getMimeType } from '@happyvertical/files';\nimport { scrapeDocument } from '@happyvertical/spider';\nimport { PDFProcessor } from './processors/pdf';\nimport type { Document, FetchDocumentOptions } from './types';\n\n/**\n * Available document processors\n */\nconst processors = [new PDFProcessor()];\n\n/**\n * Fetch a document from a URL with automatic format detection\n *\n * This factory function:\n * 1. Detects the document format (PDF, HTML, Markdown, etc.)\n * 2. Selects the appropriate processor\n * 3. Processes the document into structured parts\n * 4. 
Returns a Document object with hierarchical content\n *\n * @param url - Document URL or file path (file://, http://, https://)\n * @param options - Fetch and processing options\n * @returns Promise resolving to structured Document\n *\n * @example\n * ```typescript\n * // Fetch a PDF with image extraction and OCR\n * const doc = await fetchDocument('https://example.com/report.pdf', {\n * extractImages: true,\n * runOcr: true\n * });\n *\n * // Access document parts\n * for (const part of doc.parts) {\n * console.log(part.title);\n * console.log(part.content);\n *\n * // Check for images\n * if (part.images) {\n * for (const image of part.images) {\n * console.log(image.url);\n * console.log(image.ocrText); // Text extracted via OCR\n * }\n * }\n * }\n * ```\n */\nexport async function fetchDocument(\n url: string,\n options: FetchDocumentOptions = {},\n): Promise<Document> {\n // For web URLs (http/https), use spider package to detect special cases\n // (WordPress Download Manager, CivicWeb, DocuShare, etc.)\n const isWebUrl = url.startsWith('http://') || url.startsWith('https://');\n\n if (isWebUrl && !options.type) {\n try {\n // Use spider to detect WordPress, CivicWeb, DocuShare, and other document management systems\n const scraped = await scrapeDocument(url, {\n scraper: options.scraper || 'basic',\n spider: options.spider || 'dom',\n cache: options.cache,\n cacheExpiry: options.cacheExpiry,\n headers: options.headers,\n timeout: options.timeout,\n maxDuration: options.maxDuration,\n maxInteractions: options.maxInteractions,\n });\n\n // Check if spider detected a document management system with PDF link\n const hasDocLink =\n scraped.metadata.strategy === 'wordpress-pdf-link' ||\n scraped.metadata.strategy === 'civicweb-pdf-link' ||\n scraped.metadata.strategy === 'docushare-pdf-link';\n\n if (hasDocLink && scraped.metadata.isPdf && !scraped.metadata.complete) {\n // Spider detected a document management page and extracted the PDF URL\n // Use the extracted 
URL for PDF processing\n url = scraped.url;\n options.type = 'application/pdf';\n }\n } catch (error) {\n // If spider fails, continue with direct download\n // This ensures backward compatibility\n console.warn(\n `Spider detection failed for ${url}, falling back to direct download:`,\n error,\n );\n }\n }\n\n // Determine type - check URL extension first, then MIME type\n // This handles servers that return incorrect Content-Type headers (e.g., application/octet-stream for PDFs)\n let type = options.type;\n\n if (!type) {\n // Extract file extension from URL\n const urlLower = url.toLowerCase();\n\n // Check for common document extensions in URL\n if (\n urlLower.endsWith('.pdf') ||\n urlLower.includes('.pdf?') ||\n urlLower.includes('.pdf#')\n ) {\n type = 'application/pdf';\n } else {\n // Fall back to MIME type detection\n type = getMimeType(url) || '';\n }\n }\n\n // Find appropriate processor\n const processor = processors.find((p) => p.supports(type));\n\n if (!processor) {\n throw new Error(\n `No processor available for document type: ${type}. 
Supported types: PDF (.pdf, application/pdf)`,\n );\n }\n\n // Process document\n return processor.process(url, options);\n}\n\nexport default fetchDocument;\n","/**\n * @happyvertical/documents - Document processing with multi-part structure\n *\n * Provides document processing for PDFs with support for:\n * - Hierarchical document parts\n * - Automatic format detection from URL or MIME type\n * - Document management system detection (WordPress, CivicWeb, DocuShare)\n * - File caching for performance\n *\n * @example\n * ```typescript\n * import { fetchDocument } from '@happyvertical/documents';\n *\n * const doc = await fetchDocument('https://example.com/report.pdf');\n *\n * for (const part of doc.parts) {\n * console.log(part.title);\n * console.log(part.content);\n * }\n * ```\n */\n\n// Base classes\nexport { Document } from './document';\n// Main factory function\nexport { fetchDocument } from './factory';\n\n// Processors\nexport { PDFProcessor } from './processors/pdf';\n// Types\nexport type {\n Document as DocumentType,\n DocumentImage,\n DocumentPart,\n DocumentProcessor,\n FetchDocumentOptions,\n} from './types';\n// Utilities\nexport { getTitleFromUrl } from './utils';\n\n/** @internal */\nexport const PACKAGE_VERSION_INITIALIZED = 
true;\n"],"names":["URL","BaseDocument","fs","uuidv4"],"mappings":";;;;;;;;;AAkBO,MAAM,SAAS;AAAA;AAAA;AAAA;AAAA,EAIV,WAAW;AAAA;AAAA;AAAA;AAAA,EAKX;AAAA;AAAA;AAAA;AAAA,EAKF,aAAa;AAAA;AAAA;AAAA;AAAA,EAKb,YAAY;AAAA;AAAA;AAAA;AAAA,EAKb;AAAA;AAAA;AAAA;AAAA,EAKA;AAAA;AAAA;AAAA;AAAA,EAKA,QAAwB,CAAA;AAAA;AAAA;AAAA;AAAA,EAKxB,WAAgC,CAAA;AAAA;AAAA;AAAA;AAAA,EAKvC,IAAW,YAAoB;AAC7B,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA,EAKA,IAAW,WAAmB;AAC5B,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQA,YAAY,KAAa,UAAgC,IAAI;AAC3D,SAAK,MAAM,IAAIA,MAAI,GAAG;AACtB,SAAK,UAAU;AACf,SAAK,OACH,QAAQ,QAAQ,YAAY,KAAK,IAAI,SAAA,CAAU,KAAK;AAEtD,SAAK,YACH,QAAQ,YACR,KAAK,QAAQ,GAAG,OAAA,GAAU,UAAU,YAAY,WAAW;AAE7D,QAAI,KAAK,IAAI,SAAS,WAAW,MAAM,GAAG;AAGxC,WAAK,aAAa,mBAAmB,KAAK,IAAI,QAAQ;AACtD,WAAK,WAAW;AAAA,IAClB,WAAW,KAAK,IAAI,SAAS,WAAW,MAAM,GAAG;AAG/C,UAAI,WAAW,KAAK,IAAI;AAGxB,UAAI,SAAS,SAAS,GAAG,GAAG;AAC1B,mBAAW,SAAS,MAAM,GAAG,EAAE;AAAA,MACjC;AAIA,UAAI,CAAC,SAAS,MAAM,eAAe,GAAG;AAEpC,YACE,KAAK,SAAS,qBACd,QAAQ,SAAS,mBACjB;AACA,sBAAY;AAAA,QACd;AAAA,MAEF;AAEA,WAAK,aAAa,KAAK;AAAA,QACrB,KAAK;AAAA,QACL,SAAS,KAAK,IAAI,QAAQ;AAAA,QAC1B;AAAA,MAAA;AAEF,WAAK,WAAW;AAAA,IAClB;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAWA,aAAa,OACX,KACA,UAAgC,IACb;AACnB,UAAM,WAAW,IAAI,SAAS,KAAK,OAAO;AAC1C,UAAM,SAAS,WAAA;AACf,WAAO;AAAA,EACT;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,MAAM,aAA4B;AAChC,QAAI,KAAK,UAAU;AACjB,UAAI,CAAC,KAAK,KAAK;AACb,cAAM,IAAI,MAAM,oDAAoD;AAAA,MACtE;AACA,YAAM,sBAAsB,KAAK,IAAI,SAAA,GAAY,KAAK,UAAU;AAAA,IAClE;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOO,aAAsB;AAC3B,QAAI,CAAC,KAAK,KAAM,QAAO;AAEvB,WACE,KAAK,KAAK,WAAW,OAAO,KAC5B,KAAK,SAAS,sBACd,KAAK,SAAS,qBACd,KAAK,SAAS,4BACd,KAAK,SAAS,4BACd;AAAA,MACE;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IAAA,EACA,KAAK,CAAC,QAAQ,KAAK,UAAU,YAAA,EAAc,SAAS,GAAG,CAAC;AAAA,EAE9D;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOO,aAA2B;AAChC,WAAO;AAAA,MACL,KAAK,KAAK,IAAI,SAAA;AAAA,MACd,MAAM,KAAK;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAA
U,KAAK;AAAA,IAAA;AAAA,EAEnB;AACF;AChLO,SAAS,gBACd,KACA,eAAe,YACP;AACR,MAAI;AACF,UAAM,SAAS,IAAI,IAAI,GAAG;AAC1B,UAAM,WAAW,OAAO;AACxB,UAAM,WAAW,SAAS,MAAM,GAAG,EAAE,SAAS;AAG9C,UAAM,kBAAkB,mBAAmB,QAAQ;AAGnD,WAAO,gBACJ,QAAQ,0BAA0B,EAAE,EACpC,QAAQ,SAAS,GAAG,EACpB,KAAA;AAAA,EACL,QAAQ;AACN,WAAO;AAAA,EACT;AACF;ACpBO,MAAM,aAA0C;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAQrD,SAAS,MAAuB;AAC9B,WACE,SAAS,qBACT,KAAK,SAAS,MAAM,KACpB,KAAK,kBAAkB;AAAA,EAE3B;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAYA,MAAM,QACJ,KACA,UAAgC,IACb;AAEnB,UAAM,UAAU,MAAMC,SAAa,OAAO,KAAK,OAAO;AAGtD,UAAM,WAAW,GAAG,QAAQ,SAAS;AACrC,UAAM,SAAS,MAAM,UAAU,QAAQ;AACvC,QAAI,QAAQ;AACV,UAAI;AACF,cAAM,SAAS,KAAK,MAAM,MAAM;AAChC,eAAO;AAAA,UACL,KAAK,QAAQ,IAAI,SAAA;AAAA,UACjB,MAAM,QAAQ;AAAA,UACd,OAAO,OAAO;AAAA,UACd,UAAU,OAAO,YAAY,CAAA;AAAA,QAAC;AAAA,MAElC,SAAS,OAAO;AAEd,gBAAQ,KAAK,2CAA2C,KAAK;AAAA,MAC/D;AAAA,IACF;AAKA,UAAM,aAAa,MAAMC,SAAG,SAAS,QAAQ,SAAS;AACtD,UAAM,SAAS,WAAW,SAAS,GAAG,CAAC,EAAE,SAAS,OAAO;AAEzD,QAAI,WAAW,SAAS;AAEtB,UAAI;AACF,cAAMA,SAAG,OAAO,QAAQ,SAAS;AAAA,MACnC,SAAS,aAAa;AACpB,gBAAQ;AAAA,UACN,yCAAyC,QAAQ,SAAS;AAAA,UAC1D;AAAA,QAAA;AAAA,MAEJ;AAGA,YAAM,UAAU,WAAW;AAAA,QACzB;AAAA,QACA;AAAA,QACA,KAAK,IAAI,KAAM,WAAW,MAAM;AAAA,MAAA;AAElC,UAAI,QAAQ,SAAS,iBAAiB,KAAK,QAAQ,SAAS,OAAO,GAAG;AACpE,cAAM,IAAI;AAAA,UACR,0EAA0E,GAAG,qIAE9B,MAAM;AAAA,QAAA;AAAA,MAGzD,OAAO;AACL,cAAM,IAAI;AAAA,UACR,2EAA2E,MAAM;AAAA,QAAA;AAAA,MAGrF;AAAA,IACF;AAGA,UAAM,SAAS,MAAM,aAAA;AACrB,UAAM,gBAAgB,MAAM,OAAO,YAAY,QAAQ,SAAS;AAGhE,UAAM,WAAyB;AAAA,MAC7B,IAAIC,GAAA;AAAA,MACJ,OAAO,gBAAgB,KAAK,cAAc;AAAA,MAC1C,SAAS,iBAAiB;AAAA,MAC1B,MAAM;AAAA,MACN,UAAU;AAAA,QACR,QAAQ;AAAA,QACR,UAAU,QAAQ;AAAA,MAAA;AAAA,IACpB;AAIF,QAAI,QAAQ,kBAAkB,MAAM;AAClC,eAAS,SAAS,MAAM,KAAK;AAAA,QAC3B,QAAQ;AAAA,QACR,QAAQ,WAAW;AAAA,MAAA;AAAA,IAEvB;AAEA,UAAM,WAAqB;AAAA,MACzB,KAAK,QAAQ,IAAI,SAAA;AAAA,MACjB,MAAM,QAAQ;AAAA,MACd,OAAO,CAAC,QAAQ;AAAA,MAChB,UAAU;AAAA,QACR,WAAW;AAAA,QACX,cAAa,oBAAI,KAAA,GAAO,YAAA;AAAA,QACxB,YAAY,SAAS,QAAQ,UAAU,KAAK;AAAA,MAAA;AAAA,IAC9C;AAIF,UAAM,UAAU
,UAAU,KAAK,UAAU,QAAQ,CAAC;AAElD,WAAO;AAAA,EACT;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAYA,MAAc,cACZ,WACA,SAC0B;AAU1B,WAAO,CAAA;AAAA,EACT;AACF;AC3KA,MAAM,aAAa,CAAC,IAAI,cAAc;AAsCtC,eAAsB,cACpB,KACA,UAAgC,IACb;AAGnB,QAAM,WAAW,IAAI,WAAW,SAAS,KAAK,IAAI,WAAW,UAAU;AAEvE,MAAI,YAAY,CAAC,QAAQ,MAAM;AAC7B,QAAI;AAEF,YAAM,UAAU,MAAM,eAAe,KAAK;AAAA,QACxC,SAAS,QAAQ,WAAW;AAAA,QAC5B,QAAQ,QAAQ,UAAU;AAAA,QAC1B,OAAO,QAAQ;AAAA,QACf,aAAa,QAAQ;AAAA,QACrB,SAAS,QAAQ;AAAA,QACjB,SAAS,QAAQ;AAAA,QACjB,aAAa,QAAQ;AAAA,QACrB,iBAAiB,QAAQ;AAAA,MAAA,CAC1B;AAGD,YAAM,aACJ,QAAQ,SAAS,aAAa,wBAC9B,QAAQ,SAAS,aAAa,uBAC9B,QAAQ,SAAS,aAAa;AAEhC,UAAI,cAAc,QAAQ,SAAS,SAAS,CAAC,QAAQ,SAAS,UAAU;AAGtE,cAAM,QAAQ;AACd,gBAAQ,OAAO;AAAA,MACjB;AAAA,IACF,SAAS,OAAO;AAGd,cAAQ;AAAA,QACN,+BAA+B,GAAG;AAAA,QAClC;AAAA,MAAA;AAAA,IAEJ;AAAA,EACF;AAIA,MAAI,OAAO,QAAQ;AAEnB,MAAI,CAAC,MAAM;AAET,UAAM,WAAW,IAAI,YAAA;AAGrB,QACE,SAAS,SAAS,MAAM,KACxB,SAAS,SAAS,OAAO,KACzB,SAAS,SAAS,OAAO,GACzB;AACA,aAAO;AAAA,IACT,OAAO;AAEL,aAAO,YAAY,GAAG,KAAK;AAAA,IAC7B;AAAA,EACF;AAGA,QAAM,YAAY,WAAW,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI,CAAC;AAEzD,MAAI,CAAC,WAAW;AACd,UAAM,IAAI;AAAA,MACR,6CAA6C,IAAI;AAAA,IAAA;AAAA,EAErD;AAGA,SAAO,UAAU,QAAQ,KAAK,OAAO;AACvC;ACjFO,MAAM,8BAA8B;"}
@@ -0,0 +1,45 @@
1
+ import { Document, DocumentProcessor, FetchDocumentOptions } from '../types';
2
+ /**
3
+ * PDF Document Processor
4
+ *
5
+ * Handles PDF documents with support for:
6
+ * - Text extraction from PDF content via `@happyvertical/pdf`
7
+ * - PDF header validation (detects HTML cache poisoning from document management systems)
8
+ * - Processed document caching via `@happyvertical/files`
9
+ *
10
+ * Image extraction and OCR are stubbed for future implementation.
11
+ */
12
+ export declare class PDFProcessor implements DocumentProcessor {
13
+ /**
14
+ * Check if this processor supports the given MIME type or extension.
15
+ * Accepts exactly `'application/pdf'`, any string ending in `'.pdf'` (both case-sensitive), or `'pdf'` in any letter case.
16
+ *
17
+ * @param type - MIME type or file extension to check
18
+ * @returns `true` if this processor can handle the given type
19
+ */
20
+ supports(type: string): boolean;
21
+ /**
22
+ * Process a PDF document
23
+ *
24
+ * Extracts text from the PDF and structures it into a single top-level document part.
25
+ * Image extraction and OCR can be requested via options but are currently stubbed and yield no images.
26
+ *
27
+ * @param url - PDF URL (`http://`, `https://`, or `file://`); it is parsed with `new URL`, so a bare filesystem path is rejected
28
+ * @param options - Processing options
29
+ * @returns Promise resolving to structured Document
30
+ */
31
+ process(url: string, options?: FetchDocumentOptions): Promise<Document>;
32
+ /**
33
+ * Extract images from PDF
34
+ *
35
+ * This is a placeholder for future image extraction functionality.
36
+ * Will use @happyvertical/pdf's image extraction capabilities when available.
37
+ *
38
+ * @param filePath - Local PDF file path
39
+ * @param runOcr - Whether to run OCR on extracted images
40
+ * @returns Promise resolving to array of DocumentImages
41
+ */
42
+ private extractImages;
43
+ }
44
+ export default PDFProcessor;
45
+ //# sourceMappingURL=pdf.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/processors/pdf.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EACV,QAAQ,EAGR,iBAAiB,EACjB,oBAAoB,EACrB,MAAM,UAAU,CAAC;AAGlB;;;;;;;;;GASG;AACH,qBAAa,YAAa,YAAW,iBAAiB;IACpD;;;;;;OAMG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAQ/B;;;;;;;;;OASG;IACG,OAAO,CACX,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,oBAAyB,GACjC,OAAO,CAAC,QAAQ,CAAC;IAqGpB;;;;;;;;;OASG;YACW,aAAa;CAe5B;AAED,eAAe,YAAY,CAAC"}
@@ -0,0 +1,199 @@
1
+ /**
2
+ * Type definitions for the @happyvertical/documents package
3
+ */
4
+ /**
5
+ * Image extracted from a document
6
+ *
7
+ * Represents an image found in a document (PDF, HTML, etc.)
8
+ * with optional OCR text extraction for scanned images.
9
+ */
10
+ export interface DocumentImage {
11
+ /**
12
+ * Unique identifier for the image
13
+ */
14
+ id: string;
15
+ /**
16
+ * URL or reference to the image
17
+ */
18
+ url: string;
19
+ /**
20
+ * Local filesystem path if image has been downloaded
21
+ */
22
+ localPath?: string;
23
+ /**
24
+ * Alt text from HTML or PDF metadata
25
+ */
26
+ altText?: string;
27
+ /**
28
+ * Text extracted from image via OCR
29
+ * Useful for scanned documents or images with text
30
+ */
31
+ ocrText?: string;
32
+ /**
33
+ * Position/order of image in the document
34
+ */
35
+ position?: number;
36
+ /**
37
+ * Image metadata (dimensions, format, etc.)
38
+ */
39
+ metadata?: {
40
+ width?: number;
41
+ height?: number;
42
+ format?: string;
43
+ [key: string]: any;
44
+ };
45
+ }
46
+ /**
47
+ * A part or section of a document
48
+ *
49
+ * Documents can be hierarchical with nested parts (e.g., sections, chapters).
50
+ * Each part contains content, optional images, and can have child parts.
51
+ */
52
+ export interface DocumentPart {
53
+ /**
54
+ * Unique identifier for this part
55
+ */
56
+ id: string;
57
+ /**
58
+ * Title or heading for this part
59
+ */
60
+ title: string;
61
+ /**
62
+ * Text content of this part
63
+ */
64
+ content: string;
65
+ /**
66
+ * Content type for this part
67
+ */
68
+ type: 'text' | 'html' | 'markdown';
69
+ /**
70
+ * Images contained in this part
71
+ */
72
+ images?: DocumentImage[];
73
+ /**
74
+ * Additional metadata for this part
75
+ */
76
+ metadata?: Record<string, any>;
77
+ /**
78
+ * Nested child parts (for hierarchical documents)
79
+ */
80
+ parts?: DocumentPart[];
81
+ }
82
+ /**
83
+ * Complete document with all parts and metadata
84
+ */
85
+ export interface Document {
86
+ /**
87
+ * Source URL of the document
88
+ */
89
+ url: string;
90
+ /**
91
+ * MIME type or document type
92
+ */
93
+ type: string;
94
+ /**
95
+ * Document parts (can be hierarchical)
96
+ */
97
+ parts: DocumentPart[];
98
+ /**
99
+ * Document-level metadata
100
+ */
101
+ metadata?: Record<string, any>;
102
+ }
103
+ /**
104
+ * Options for fetching a document
105
+ */
106
+ export interface FetchDocumentOptions {
107
+ /**
108
+ * Directory for caching downloaded files
109
+ * @default os.tmpdir()/.cache/have-sdk/documents
110
+ */
111
+ cacheDir?: string;
112
+ /**
113
+ * Whether to extract images from the document
114
+ * @default false (the PDF processor extracts images only when this is explicitly `true`)
115
+ */
116
+ extractImages?: boolean;
117
+ /**
118
+ * Whether to run OCR on images (PDF scans, etc.)
119
+ * @default true for PDFs, false for HTML/Markdown
120
+ */
121
+ runOcr?: boolean;
122
+ /**
123
+ * Scraper type to use for content extraction
124
+ * - 'basic': Fast, static HTML scraping (default)
125
+ * - 'crawlee': Full browser with JavaScript execution
126
+ * @default 'basic'
127
+ */
128
+ scraper?: 'basic' | 'crawlee';
129
+ /**
130
+ * Spider adapter to use for fetching web pages
131
+ * - 'simple': Basic HTTP fetch
132
+ * - 'dom': HTML parsing with happy-dom
133
+ * - 'crawlee': Headless browser (requires scraper: 'crawlee')
134
+ * @default 'dom'
135
+ */
136
+ spider?: 'simple' | 'dom' | 'crawlee';
137
+ /**
138
+ * Spider adapter to use for HTML fetching (deprecated, use 'spider' instead; `fetchDocument` forwards only 'spider')
139
+ * @default 'simple'
140
+ * @deprecated Use 'spider' instead
141
+ */
142
+ spiderAdapter?: 'simple' | 'dom' | 'crawlee';
143
+ /**
144
+ * Whether to use cache for spider fetching
145
+ * @default true
146
+ */
147
+ cache?: boolean;
148
+ /**
149
+ * Cache expiry time in milliseconds for spider fetching
150
+ * @default 300000 (5 minutes)
151
+ */
152
+ cacheExpiry?: number;
153
+ /**
154
+ * Custom HTTP headers for spider requests
155
+ */
156
+ headers?: Record<string, string>;
157
+ /**
158
+ * Request timeout in milliseconds for spider fetching
159
+ * @default 30000 (30 seconds)
160
+ */
161
+ timeout?: number;
162
+ /**
163
+ * Maximum time to spend scraping in milliseconds
164
+ * Used by advanced scrapers (tree, pagination, etc.)
165
+ */
166
+ maxDuration?: number;
167
+ /**
168
+ * Maximum number of interactions to perform
169
+ * Used by advanced scrapers (clicking, scrolling, etc.)
170
+ */
171
+ maxInteractions?: number;
172
+ /**
173
+ * Override MIME type detection
174
+ */
175
+ type?: string;
176
+ }
177
+ /**
178
+ * Interface for document processors
179
+ *
180
+ * Each processor handles a specific document format (PDF, HTML, Markdown, etc.)
181
+ */
182
+ export interface DocumentProcessor {
183
+ /**
184
+ * Process a document and return structured parts
185
+ *
186
+ * @param url - Source URL or file path
187
+ * @param options - Processing options
188
+ * @returns Promise resolving to Document with parts
189
+ */
190
+ process(url: string, options?: FetchDocumentOptions): Promise<Document>;
191
+ /**
192
+ * Check if this processor can handle the given type
193
+ *
194
+ * @param type - MIME type or file extension
195
+ * @returns True if processor supports this type
196
+ */
197
+ supports(type: string): boolean;
198
+ }
199
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;;;GAKG;AACH,MAAM,WAAW,aAAa;IAC5B;;OAEG;IACH,EAAE,EAAE,MAAM,CAAC;IAEX;;OAEG;IACH,GAAG,EAAE,MAAM,CAAC;IAEZ;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;OAEG;IACH,QAAQ,CAAC,EAAE;QACT,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;KACpB,CAAC;CACH;AAED;;;;;GAKG;AACH,MAAM,WAAW,YAAY;IAC3B;;OAEG;IACH,EAAE,EAAE,MAAM,CAAC;IAEX;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,UAAU,CAAC;IAEnC;;OAEG;IACH,MAAM,CAAC,EAAE,aAAa,EAAE,CAAC;IAEzB;;OAEG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAE/B;;OAEG;IACH,KAAK,CAAC,EAAE,YAAY,EAAE,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB;;OAEG;IACH,GAAG,EAAE,MAAM,CAAC;IAEZ;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IAEb;;OAEG;IACH,KAAK,EAAE,YAAY,EAAE,CAAC;IAEtB;;OAEG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IAExB;;;OAGG;IACH,MAAM,CAAC,EAAE,OAAO,CAAC;IAEjB;;;;;OAKG;IACH,OAAO,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IAE9B;;;;;;OAMG;IACH,MAAM,CAAC,EAAE,QAAQ,GAAG,KAAK,GAAG,SAAS,CAAC;IAEtC;;;;OAIG;IACH,aAAa,CAAC,EAAE,QAAQ,GAAG,KAAK,GAAG,SAAS,CAAC;IAE7C;;;OAGG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAEjC;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;;;GAIG;AACH,MAAM,WAAW,iBAAiB;IAChC;;;;;;OAMG;IACH,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAExE;;;;;OAKG;IACH,QAAQ,CAAC,I
AAI,EAAE,MAAM,GAAG,OAAO,CAAC;CACjC"}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Utility functions for document processing
3
+ */
4
+ /**
5
+ * Extract a human-readable title from a URL
6
+ *
7
+ * Takes a URL and extracts the filename from the pathname, then formats it
8
+ * into a readable title by removing a known extension (.pdf, .htm, .html, .md, .txt) and converting `-`/`_` separators
9
+ * to spaces. Also decodes URL-encoded characters like %20.
10
+ *
11
+ * @param url - URL string to extract title from
12
+ * @param defaultTitle - Default title to use if extraction fails
13
+ * @returns Formatted title string
14
+ *
15
+ * @example
16
+ * ```typescript
17
+ * getTitleFromUrl('file:///path/to/My%20Document.pdf')
18
+ * // Returns: 'My Document'
19
+ *
20
+ * getTitleFromUrl('https://example.com/research_paper.pdf')
21
+ * // Returns: 'research paper'
22
+ * ```
23
+ */
24
+ export declare function getTitleFromUrl(url: string, defaultTitle?: string): string;
25
+ //# sourceMappingURL=utils.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,eAAe,CAC7B,GAAG,EAAE,MAAM,EACX,YAAY,SAAa,GACxB,MAAM,CAiBR"}
package/metadata.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "name": "@happyvertical/documents",
3
+ "path": "packages/documents",
4
+ "position": {
5
+ "index": 8,
6
+ "count": 30
7
+ },
8
+ "description": "Multi-part document processing with support for PDF, HTML, and Markdown",
9
+ "provides": [
10
+ "Multi-part document processing with support for PDF, HTML, and Markdown"
11
+ ],
12
+ "implements": [],
13
+ "requires": {
14
+ "workspace": [
15
+ "@happyvertical/files",
16
+ "@happyvertical/utils"
17
+ ],
18
+ "externalHappyVertical": [
19
+ "@happyvertical/ocr",
20
+ "@happyvertical/pdf",
21
+ "@happyvertical/spider"
22
+ ],
23
+ "external": [
24
+ "uuid"
25
+ ]
26
+ },
27
+ "dependents": [],
28
+ "stability": {
29
+ "level": "stable",
30
+ "reason": "Primary package surface is described as implemented and production-oriented."
31
+ },
32
+ "keywords": [
33
+ "documents"
34
+ ]
35
+ }
package/package.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "@happyvertical/documents",
3
+ "version": "0.74.9",
4
+ "description": "Multi-part document processing with support for PDF, HTML, and Markdown",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "bin": {
9
+ "have-documents-context": "./dist/cli/claude-context.js"
10
+ },
11
+ "files": [
12
+ "dist",
13
+ "README.md",
14
+ "LICENSE",
15
+ "AGENT.md",
16
+ "metadata.json"
17
+ ],
18
+ "publishConfig": {
19
+ "registry": "https://registry.npmjs.org",
20
+ "access": "public"
21
+ },
22
+ "repository": {
23
+ "type": "git",
24
+ "url": "https://github.com/happyvertical/sdk.git",
25
+ "directory": "packages/documents"
26
+ },
27
+ "bugs": {
28
+ "url": "https://github.com/happyvertical/sdk/issues"
29
+ },
30
+ "homepage": "https://github.com/happyvertical/sdk/tree/main/packages/documents#readme",
31
+ "license": "MIT",
32
+ "exports": {
33
+ ".": {
34
+ "types": "./dist/index.d.ts",
35
+ "import": "./dist/index.js"
36
+ }
37
+ },
38
+ "dependencies": {
39
+ "@happyvertical/ocr": "^0.60.39",
40
+ "@happyvertical/pdf": "^0.62.25",
41
+ "@happyvertical/spider": "^0.60.10",
42
+ "uuid": "^13.0.0",
43
+ "@happyvertical/files": "0.74.9",
44
+ "@happyvertical/utils": "0.74.9"
45
+ },
46
+ "devDependencies": {
47
+ "@types/node": "25.0.10",
48
+ "typescript": "^5.9.3",
49
+ "vite": "7.3.2",
50
+ "vitest": "^4.1.5"
51
+ },
52
+ "keywords": [
53
+ "documents",
54
+ "pdf",
55
+ "html",
56
+ "markdown",
57
+ "ocr",
58
+ "content-extraction"
59
+ ],
60
+ "scripts": {
61
+ "build": "vite build",
62
+ "build:watch": "vite build --watch",
63
+ "dev": "vite --port 3004",
64
+ "test": "vitest run",
65
+ "test:watch": "vitest watch"
66
+ }
67
+ }