hazo_pdf 1.7.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SETUP_CHECKLIST.md +693 -0
- package/config/hazo_pdf_config.ini.sample +42 -0
- package/db_setup_postgres.sql +17 -0
- package/db_setup_sqlite.sql +13 -0
- package/dist/{chunk-NQ6KUJWG.js → chunk-7M53O3HF.js} +14 -4
- package/dist/chunk-7M53O3HF.js.map +1 -0
- package/dist/{chunk-4JJOUQ62.js → chunk-KDOQ3FIO.js} +176 -87
- package/dist/chunk-KDOQ3FIO.js.map +1 -0
- package/dist/{chunk-KHB3VZJQ.js → chunk-LFFCPDWC.js} +14 -3
- package/dist/chunk-LFFCPDWC.js.map +1 -0
- package/dist/{chunk-264BTVJT.js → chunk-TZJ5S57X.js} +18 -31
- package/dist/chunk-TZJ5S57X.js.map +1 -0
- package/dist/index.d.ts +9 -5
- package/dist/index.js +35 -16
- package/dist/index.js.map +1 -1
- package/dist/{pdf_saver-7FA4DAXI.js → pdf_saver-T6SEDYEE.js} +3 -3
- package/dist/{pdf_viewer-B6S5PJJB.js → pdf_viewer-TFCSUGWU.js} +3 -3
- package/dist/server/index.d.ts +5 -1
- package/dist/server/index.js +219 -81
- package/dist/server/index.js.map +1 -1
- package/dist/server/{text_search-2OZOVUIP.js → text_search-PVDG5Y6I.js} +14 -3
- package/dist/server/text_search-PVDG5Y6I.js.map +1 -0
- package/dist/styles/full.css +5879 -7264
- package/dist/styles/full.css.map +1 -1
- package/dist/styles/index.css +4845 -3955
- package/dist/styles/index.css.map +1 -1
- package/dist/{text_search-I2KZ7DTW.js → text_search-SO4ZOMIZ.js} +2 -2
- package/package.json +51 -36
- package/dist/chunk-264BTVJT.js.map +0 -1
- package/dist/chunk-4JJOUQ62.js.map +0 -1
- package/dist/chunk-KHB3VZJQ.js.map +0 -1
- package/dist/chunk-NQ6KUJWG.js.map +0 -1
- package/dist/server/text_search-2OZOVUIP.js.map +0 -1
- /package/dist/{pdf_saver-7FA4DAXI.js.map → pdf_saver-T6SEDYEE.js.map} +0 -0
- /package/dist/{pdf_viewer-B6S5PJJB.js.map → pdf_viewer-TFCSUGWU.js.map} +0 -0
- /package/dist/{text_search-I2KZ7DTW.js.map → text_search-SO4ZOMIZ.js.map} +0 -0
package/dist/server/index.js
CHANGED
|
@@ -1,40 +1,55 @@
|
|
|
1
|
+
// src/server/index.ts
|
|
2
|
+
import { HazoInternalError } from "hazo_core/errors";
|
|
3
|
+
|
|
4
|
+
// src/server/extract.ts
|
|
5
|
+
import {
|
|
6
|
+
generateRequestId,
|
|
7
|
+
getCorrelationId,
|
|
8
|
+
optional_import,
|
|
9
|
+
withContext
|
|
10
|
+
} from "hazo_core";
|
|
11
|
+
import {
|
|
12
|
+
HazoExternalError,
|
|
13
|
+
HazoNotFoundError,
|
|
14
|
+
HazoUnavailableError,
|
|
15
|
+
HazoValidationError
|
|
16
|
+
} from "hazo_core/errors";
|
|
17
|
+
|
|
1
18
|
// src/utils/logger.ts
|
|
19
|
+
import { createLogger } from "hazo_core";
|
|
2
20
|
var console_logger = {
|
|
3
|
-
info: (message, data) => {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
console.log(`[hazo_pdf] ${message}`);
|
|
8
|
-
}
|
|
9
|
-
},
|
|
10
|
-
debug: (message, data) => {
|
|
11
|
-
if (data) {
|
|
12
|
-
console.debug(`[hazo_pdf] ${message}`, data);
|
|
13
|
-
} else {
|
|
14
|
-
console.debug(`[hazo_pdf] ${message}`);
|
|
15
|
-
}
|
|
16
|
-
},
|
|
17
|
-
warn: (message, data) => {
|
|
18
|
-
if (data) {
|
|
19
|
-
console.warn(`[hazo_pdf] ${message}`, data);
|
|
20
|
-
} else {
|
|
21
|
-
console.warn(`[hazo_pdf] ${message}`);
|
|
22
|
-
}
|
|
23
|
-
},
|
|
24
|
-
error: (message, data) => {
|
|
25
|
-
if (data) {
|
|
26
|
-
console.error(`[hazo_pdf] ${message}`, data);
|
|
27
|
-
} else {
|
|
28
|
-
console.error(`[hazo_pdf] ${message}`);
|
|
29
|
-
}
|
|
30
|
-
}
|
|
21
|
+
info: (message, data) => data ? console.log(`[hazo_pdf] ${message}`, data) : console.log(`[hazo_pdf] ${message}`),
|
|
22
|
+
debug: (message, data) => data ? console.debug(`[hazo_pdf] ${message}`, data) : console.debug(`[hazo_pdf] ${message}`),
|
|
23
|
+
warn: (message, data) => data ? console.warn(`[hazo_pdf] ${message}`, data) : console.warn(`[hazo_pdf] ${message}`),
|
|
24
|
+
error: (message, data) => data ? console.error(`[hazo_pdf] ${message}`, data) : console.error(`[hazo_pdf] ${message}`)
|
|
31
25
|
};
|
|
32
|
-
|
|
26
|
+
function build_default_logger() {
|
|
27
|
+
try {
|
|
28
|
+
return createLogger("hazo_pdf");
|
|
29
|
+
} catch {
|
|
30
|
+
return console_logger;
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
var current_logger = null;
|
|
33
34
|
function get_logger() {
|
|
35
|
+
if (!current_logger) {
|
|
36
|
+
current_logger = build_default_logger();
|
|
37
|
+
}
|
|
34
38
|
return current_logger;
|
|
35
39
|
}
|
|
36
40
|
|
|
37
41
|
// src/server/extract.ts
|
|
42
|
+
async function require_module(pkg) {
|
|
43
|
+
const mod = await optional_import(pkg);
|
|
44
|
+
if (!mod) {
|
|
45
|
+
throw new HazoUnavailableError({
|
|
46
|
+
code: "HAZO_PDF_OPTIONAL_DEP_MISSING",
|
|
47
|
+
pkg: "hazo_pdf",
|
|
48
|
+
message: `Required optional peer "${pkg}" is not installed`
|
|
49
|
+
});
|
|
50
|
+
}
|
|
51
|
+
return mod;
|
|
52
|
+
}
|
|
38
53
|
var is_initialized = false;
|
|
39
54
|
var hazo_files_adapter = null;
|
|
40
55
|
var llm_initialized = false;
|
|
@@ -48,12 +63,14 @@ async function ensure_initialized(sqlite_path, logger = get_logger()) {
|
|
|
48
63
|
return;
|
|
49
64
|
}
|
|
50
65
|
try {
|
|
51
|
-
const
|
|
52
|
-
const
|
|
66
|
+
const llm_mod = await require_module("hazo_llm_api/server");
|
|
67
|
+
const connect_mod = await require_module("hazo_connect/server");
|
|
68
|
+
const { initialize_llm_api, get_current_config } = llm_mod;
|
|
69
|
+
const { SqliteAdapter } = connect_mod;
|
|
53
70
|
if (!llm_initialized) {
|
|
54
71
|
await initialize_llm_api({ logger });
|
|
55
72
|
llm_initialized = true;
|
|
56
|
-
logger.
|
|
73
|
+
logger.debug("extract.llm_api.initialized");
|
|
57
74
|
}
|
|
58
75
|
const config = get_current_config();
|
|
59
76
|
const db_path = sqlite_path || config?.sqlite_path || "prompt_library.sqlite";
|
|
@@ -61,16 +78,17 @@ async function ensure_initialized(sqlite_path, logger = get_logger()) {
|
|
|
61
78
|
type: "sqlite",
|
|
62
79
|
database_path: db_path
|
|
63
80
|
});
|
|
64
|
-
const
|
|
81
|
+
const files_mod = await require_module("hazo_files");
|
|
82
|
+
const { HAZO_FILES_TABLE_SCHEMA } = files_mod;
|
|
65
83
|
const adapter = hazo_files_adapter;
|
|
66
84
|
await adapter.rawQuery(HAZO_FILES_TABLE_SCHEMA.sqlite.ddl);
|
|
67
85
|
for (const idx of HAZO_FILES_TABLE_SCHEMA.sqlite.indexes) {
|
|
68
86
|
await adapter.rawQuery(idx);
|
|
69
87
|
}
|
|
70
|
-
logger.debug("hazo_files
|
|
88
|
+
logger.debug("extract.hazo_files.table_initialized", { sqlite_path: db_path });
|
|
71
89
|
is_initialized = true;
|
|
72
90
|
} catch (error) {
|
|
73
|
-
logger.error("
|
|
91
|
+
logger.error("extract.initialize.failed", {
|
|
74
92
|
error: error instanceof Error ? error.message : String(error)
|
|
75
93
|
});
|
|
76
94
|
throw error;
|
|
@@ -78,21 +96,30 @@ async function ensure_initialized(sqlite_path, logger = get_logger()) {
|
|
|
78
96
|
}
|
|
79
97
|
async function load_document_as_base64(file_path, logger = get_logger()) {
|
|
80
98
|
if (file_path.startsWith("http://") || file_path.startsWith("https://")) {
|
|
81
|
-
logger.debug("
|
|
99
|
+
logger.debug("extract.document.load_from_url", { url: file_path });
|
|
82
100
|
const response = await fetch(file_path);
|
|
83
101
|
if (!response.ok) {
|
|
84
|
-
throw new
|
|
102
|
+
throw new HazoExternalError({
|
|
103
|
+
code: "HAZO_PDF_EXTERNAL_FETCH_FAILED",
|
|
104
|
+
pkg: "hazo_pdf",
|
|
105
|
+
message: `Failed to fetch document: ${response.status} ${response.statusText}`,
|
|
106
|
+
httpStatus: 502
|
|
107
|
+
});
|
|
85
108
|
}
|
|
86
109
|
const buffer2 = await response.arrayBuffer();
|
|
87
110
|
const base642 = Buffer.from(buffer2).toString("base64");
|
|
88
111
|
const content_type = response.headers.get("content-type") || "application/pdf";
|
|
89
112
|
return { base64: base642, mime_type: content_type };
|
|
90
113
|
}
|
|
91
|
-
logger.debug("
|
|
114
|
+
logger.debug("extract.document.load_from_fs", { path: file_path });
|
|
92
115
|
const fs = await import("fs");
|
|
93
116
|
const path = await import("path");
|
|
94
117
|
if (!fs.existsSync(file_path)) {
|
|
95
|
-
throw new
|
|
118
|
+
throw new HazoNotFoundError({
|
|
119
|
+
code: "HAZO_PDF_FILE_NOT_FOUND",
|
|
120
|
+
pkg: "hazo_pdf",
|
|
121
|
+
message: `File not found: ${file_path}`
|
|
122
|
+
});
|
|
96
123
|
}
|
|
97
124
|
const buffer = fs.readFileSync(file_path);
|
|
98
125
|
const base64 = buffer.toString("base64");
|
|
@@ -109,10 +136,16 @@ async function load_document_as_base64(file_path, logger = get_logger()) {
|
|
|
109
136
|
return { base64, mime_type };
|
|
110
137
|
}
|
|
111
138
|
async function load_document_by_file_id(file_id, _storage_type, file_manager, logger = get_logger()) {
|
|
112
|
-
const
|
|
113
|
-
const
|
|
139
|
+
const files_mod = await require_module("hazo_files");
|
|
140
|
+
const connect_mod = await require_module("hazo_connect/server");
|
|
141
|
+
const { HAZO_FILES_TABLE_SCHEMA } = files_mod;
|
|
142
|
+
const { createCrudService } = connect_mod;
|
|
114
143
|
if (!hazo_files_adapter) {
|
|
115
|
-
throw new
|
|
144
|
+
throw new HazoUnavailableError({
|
|
145
|
+
code: "HAZO_PDF_ADAPTER_UNINITIALIZED",
|
|
146
|
+
pkg: "hazo_pdf",
|
|
147
|
+
message: "hazo_files adapter not initialized \u2014 call ensure_initialized() first"
|
|
148
|
+
});
|
|
116
149
|
}
|
|
117
150
|
const crudService = createCrudService(
|
|
118
151
|
hazo_files_adapter,
|
|
@@ -122,23 +155,40 @@ async function load_document_by_file_id(file_id, _storage_type, file_manager, lo
|
|
|
122
155
|
where: { id: file_id }
|
|
123
156
|
});
|
|
124
157
|
if (!files || files.length === 0) {
|
|
125
|
-
throw new
|
|
158
|
+
throw new HazoNotFoundError({
|
|
159
|
+
code: "HAZO_PDF_FILE_RECORD_NOT_FOUND",
|
|
160
|
+
pkg: "hazo_pdf",
|
|
161
|
+
message: `File record not found: ${file_id}`
|
|
162
|
+
});
|
|
126
163
|
}
|
|
127
164
|
const file_record = files[0];
|
|
128
165
|
const file_path = file_record.file_path;
|
|
129
166
|
const mime_type = file_record.file_type || "application/pdf";
|
|
130
|
-
logger.debug("
|
|
167
|
+
logger.debug("extract.file_record.found", { file_id, file_path, storage_type: file_record.storage_type });
|
|
131
168
|
if (file_record.storage_type === "google_drive") {
|
|
132
169
|
if (!file_manager) {
|
|
133
|
-
throw new
|
|
170
|
+
throw new HazoValidationError({
|
|
171
|
+
code: "HAZO_PDF_FILE_MANAGER_REQUIRED",
|
|
172
|
+
pkg: "hazo_pdf",
|
|
173
|
+
message: "file_manager is required for Google Drive files"
|
|
174
|
+
});
|
|
134
175
|
}
|
|
135
176
|
if (!file_manager.isInitialized()) {
|
|
136
|
-
throw new
|
|
177
|
+
throw new HazoUnavailableError({
|
|
178
|
+
code: "HAZO_PDF_FILE_MANAGER_UNINITIALIZED",
|
|
179
|
+
pkg: "hazo_pdf",
|
|
180
|
+
message: "file_manager is not initialized"
|
|
181
|
+
});
|
|
137
182
|
}
|
|
138
|
-
logger.debug("
|
|
183
|
+
logger.debug("extract.google_drive.download", { file_path });
|
|
139
184
|
const result = await file_manager.downloadFile(file_path);
|
|
140
185
|
if (!result.success || !result.data) {
|
|
141
|
-
throw new
|
|
186
|
+
throw new HazoExternalError({
|
|
187
|
+
code: "HAZO_PDF_EXTERNAL_DOWNLOAD_FAILED",
|
|
188
|
+
pkg: "hazo_pdf",
|
|
189
|
+
message: result.error || "Failed to download file from Google Drive",
|
|
190
|
+
httpStatus: 502
|
|
191
|
+
});
|
|
142
192
|
}
|
|
143
193
|
const buffer = Buffer.from(result.data);
|
|
144
194
|
const base64 = buffer.toString("base64");
|
|
@@ -150,6 +200,10 @@ async function load_document_by_file_id(file_id, _storage_type, file_manager, lo
|
|
|
150
200
|
};
|
|
151
201
|
}
|
|
152
202
|
async function extract_document_data(source, options) {
|
|
203
|
+
const correlationId = getCorrelationId() ?? generateRequestId();
|
|
204
|
+
return withContext({ correlationId }, () => _extract_document_data(source, options));
|
|
205
|
+
}
|
|
206
|
+
async function _extract_document_data(source, options) {
|
|
153
207
|
const logger = options.logger || get_logger();
|
|
154
208
|
const storage_type = options.storage_type || "local";
|
|
155
209
|
const save_to_hazo_files = options.save_to_hazo_files !== false;
|
|
@@ -192,12 +246,13 @@ async function extract_document_data(source, options) {
|
|
|
192
246
|
size_kb: doc_size_kb,
|
|
193
247
|
prompt: `${options.prompt_area}/${options.prompt_key}`
|
|
194
248
|
});
|
|
249
|
+
const llm_mod = await require_module("hazo_llm_api/server");
|
|
195
250
|
const {
|
|
196
251
|
hazo_llm_dynamic_data_extract,
|
|
197
252
|
get_database,
|
|
198
253
|
get_prompt_by_area_and_key,
|
|
199
254
|
default_logger
|
|
200
|
-
} =
|
|
255
|
+
} = llm_mod;
|
|
201
256
|
const db = get_database();
|
|
202
257
|
if (!db) {
|
|
203
258
|
return {
|
|
@@ -249,8 +304,10 @@ async function extract_document_data(source, options) {
|
|
|
249
304
|
const storage_file_path = options.original_file_path || resolved_file_path;
|
|
250
305
|
if (save_to_hazo_files && hazo_files_adapter) {
|
|
251
306
|
try {
|
|
252
|
-
const
|
|
253
|
-
const
|
|
307
|
+
const files_mod = await require_module("hazo_files");
|
|
308
|
+
const connect_mod = await require_module("hazo_connect/server");
|
|
309
|
+
const { createFileMetadataService, HAZO_FILES_TABLE_SCHEMA } = files_mod;
|
|
310
|
+
const { createCrudService } = connect_mod;
|
|
254
311
|
const crudService = createCrudService(
|
|
255
312
|
hazo_files_adapter,
|
|
256
313
|
HAZO_FILES_TABLE_SCHEMA.tableName
|
|
@@ -326,7 +383,34 @@ async function extract_document_data(source, options) {
|
|
|
326
383
|
}
|
|
327
384
|
|
|
328
385
|
// src/server/snippet.ts
|
|
386
|
+
import {
|
|
387
|
+
generateRequestId as generateRequestId2,
|
|
388
|
+
getCorrelationId as getCorrelationId2,
|
|
389
|
+
optional_import as optional_import2,
|
|
390
|
+
withContext as withContext2
|
|
391
|
+
} from "hazo_core";
|
|
392
|
+
import {
|
|
393
|
+
HazoExternalError as HazoExternalError2,
|
|
394
|
+
HazoNotFoundError as HazoNotFoundError2,
|
|
395
|
+
HazoUnavailableError as HazoUnavailableError2,
|
|
396
|
+
HazoValidationError as HazoValidationError2
|
|
397
|
+
} from "hazo_core/errors";
|
|
398
|
+
async function require_module2(pkg) {
|
|
399
|
+
const mod = await optional_import2(pkg);
|
|
400
|
+
if (!mod) {
|
|
401
|
+
throw new HazoUnavailableError2({
|
|
402
|
+
code: "HAZO_PDF_OPTIONAL_DEP_MISSING",
|
|
403
|
+
pkg: "hazo_pdf",
|
|
404
|
+
message: `Required optional peer "${pkg}" is not installed`
|
|
405
|
+
});
|
|
406
|
+
}
|
|
407
|
+
return mod;
|
|
408
|
+
}
|
|
329
409
|
async function extract_text_snippet(source, options) {
|
|
410
|
+
const correlationId = getCorrelationId2() ?? generateRequestId2();
|
|
411
|
+
return withContext2({ correlationId }, () => _extract_text_snippet(source, options));
|
|
412
|
+
}
|
|
413
|
+
async function _extract_text_snippet(source, options) {
|
|
330
414
|
const {
|
|
331
415
|
search_text,
|
|
332
416
|
page_index = 0,
|
|
@@ -356,7 +440,7 @@ async function extract_text_snippet(source, options) {
|
|
|
356
440
|
standardFontDataUrl: standard_font_data_url,
|
|
357
441
|
verbosity: 0
|
|
358
442
|
}).promise;
|
|
359
|
-
const { find_all_text_in_pdf } = await import("./text_search-
|
|
443
|
+
const { find_all_text_in_pdf } = await import("./text_search-PVDG5Y6I.js");
|
|
360
444
|
const total_pages = pdf.numPages;
|
|
361
445
|
const snippets = [];
|
|
362
446
|
if (match_mode === "first") {
|
|
@@ -564,20 +648,33 @@ async function load_pdf_bytes(source) {
|
|
|
564
648
|
return source.pdf_bytes;
|
|
565
649
|
}
|
|
566
650
|
if (!source.file_path) {
|
|
567
|
-
throw new
|
|
651
|
+
throw new HazoValidationError2({
|
|
652
|
+
code: "HAZO_PDF_VALIDATION_ERROR",
|
|
653
|
+
pkg: "hazo_pdf",
|
|
654
|
+
message: "Either file_path or pdf_bytes is required"
|
|
655
|
+
});
|
|
568
656
|
}
|
|
569
657
|
const file_path = source.file_path;
|
|
570
658
|
if (file_path.startsWith("http://") || file_path.startsWith("https://")) {
|
|
571
659
|
const response = await fetch(file_path);
|
|
572
660
|
if (!response.ok) {
|
|
573
|
-
throw new
|
|
661
|
+
throw new HazoExternalError2({
|
|
662
|
+
code: "HAZO_PDF_EXTERNAL_FETCH_FAILED",
|
|
663
|
+
pkg: "hazo_pdf",
|
|
664
|
+
message: `Failed to fetch PDF: ${response.status} ${response.statusText}`,
|
|
665
|
+
httpStatus: 502
|
|
666
|
+
});
|
|
574
667
|
}
|
|
575
668
|
const buffer2 = await response.arrayBuffer();
|
|
576
669
|
return new Uint8Array(buffer2);
|
|
577
670
|
}
|
|
578
671
|
const fs = await import("fs");
|
|
579
672
|
if (!fs.existsSync(file_path)) {
|
|
580
|
-
throw new
|
|
673
|
+
throw new HazoNotFoundError2({
|
|
674
|
+
code: "HAZO_PDF_FILE_NOT_FOUND",
|
|
675
|
+
pkg: "hazo_pdf",
|
|
676
|
+
message: `File not found: ${file_path}`
|
|
677
|
+
});
|
|
581
678
|
}
|
|
582
679
|
const buffer = fs.readFileSync(file_path);
|
|
583
680
|
return new Uint8Array(buffer);
|
|
@@ -595,7 +692,8 @@ async function find_text_with_llm(page, viewport, search_text, _render_scale) {
|
|
|
595
692
|
}).promise;
|
|
596
693
|
const image_buffer = canvas.toBuffer("image/png");
|
|
597
694
|
const image_base64 = image_buffer.toString("base64");
|
|
598
|
-
const
|
|
695
|
+
const llm_mod = await require_module2("hazo_llm_api/server");
|
|
696
|
+
const { hazo_llm_image_text } = llm_mod;
|
|
599
697
|
const prompt = `Find the text "${search_text}" in this document image.
|
|
600
698
|
If found, return ONLY a JSON object with the approximate bounding box as percentage coordinates (0-100):
|
|
601
699
|
{"found": true, "x_pct": <left edge %>, "y_pct": <top edge %>, "width_pct": <width %>, "height_pct": <height %>}
|
|
@@ -642,6 +740,15 @@ Return ONLY the JSON object, nothing else.`;
|
|
|
642
740
|
}
|
|
643
741
|
|
|
644
742
|
// src/server/split.ts
|
|
743
|
+
import {
|
|
744
|
+
generateRequestId as generateRequestId3,
|
|
745
|
+
getCorrelationId as getCorrelationId3,
|
|
746
|
+
withContext as withContext3
|
|
747
|
+
} from "hazo_core";
|
|
748
|
+
import {
|
|
749
|
+
HazoExternalError as HazoExternalError3,
|
|
750
|
+
HazoValidationError as HazoValidationError3
|
|
751
|
+
} from "hazo_core/errors";
|
|
645
752
|
function sanitize_label(label) {
|
|
646
753
|
return label.toLowerCase().replace(/[^a-z0-9]+/g, "_").replace(/^_+|_+$/g, "").slice(0, 60);
|
|
647
754
|
}
|
|
@@ -653,7 +760,12 @@ function generate_filename(label, pages) {
|
|
|
653
760
|
}
|
|
654
761
|
function to_uint8array(data) {
|
|
655
762
|
if (!data) {
|
|
656
|
-
throw new
|
|
763
|
+
throw new HazoExternalError3({
|
|
764
|
+
code: "HAZO_PDF_EXTERNAL_DOWNLOAD_EMPTY",
|
|
765
|
+
pkg: "hazo_pdf",
|
|
766
|
+
message: "No data received from downloadFile",
|
|
767
|
+
httpStatus: 502
|
|
768
|
+
});
|
|
657
769
|
}
|
|
658
770
|
if (data instanceof Uint8Array) {
|
|
659
771
|
return data;
|
|
@@ -664,52 +776,73 @@ function to_uint8array(data) {
|
|
|
664
776
|
if (typeof Buffer !== "undefined" && Buffer.isBuffer(data)) {
|
|
665
777
|
return new Uint8Array(data.buffer, data.byteOffset, data.byteLength);
|
|
666
778
|
}
|
|
667
|
-
throw new
|
|
779
|
+
throw new HazoValidationError3({
|
|
780
|
+
code: "HAZO_PDF_UNSUPPORTED_TYPE",
|
|
781
|
+
pkg: "hazo_pdf",
|
|
782
|
+
message: "Unsupported data type from downloadFile"
|
|
783
|
+
});
|
|
668
784
|
}
|
|
669
785
|
async function split_pdf(request, file_manager) {
|
|
786
|
+
const correlationId = getCorrelationId3() ?? generateRequestId3();
|
|
787
|
+
return withContext3({ correlationId }, () => _split_pdf(request, file_manager));
|
|
788
|
+
}
|
|
789
|
+
async function _split_pdf(request, file_manager) {
|
|
670
790
|
const logger = get_logger();
|
|
671
|
-
logger.info("
|
|
791
|
+
logger.info("pdf.split.start", {
|
|
672
792
|
source_file_id: request.source_file_id,
|
|
673
793
|
split_count: request.splits.length,
|
|
674
794
|
output_folder: request.output_folder
|
|
675
795
|
});
|
|
676
796
|
const { PDFDocument } = await import("pdf-lib");
|
|
677
|
-
logger.debug("
|
|
797
|
+
logger.debug("split.source.download", { file_id: request.source_file_id });
|
|
678
798
|
const download_result = await file_manager.downloadFile(request.source_file_id);
|
|
679
799
|
if (!download_result.success || !download_result.data) {
|
|
680
|
-
throw new
|
|
681
|
-
|
|
682
|
-
|
|
800
|
+
throw new HazoExternalError3({
|
|
801
|
+
code: "HAZO_PDF_EXTERNAL_DOWNLOAD_FAILED",
|
|
802
|
+
pkg: "hazo_pdf",
|
|
803
|
+
message: download_result.error || `Failed to download source file: ${request.source_file_id}`,
|
|
804
|
+
httpStatus: 502
|
|
805
|
+
});
|
|
683
806
|
}
|
|
684
807
|
const source_bytes = to_uint8array(download_result.data);
|
|
685
|
-
logger.debug("
|
|
808
|
+
logger.debug("split.source.load", { byte_size: source_bytes.byteLength });
|
|
686
809
|
const source_pdf = await PDFDocument.load(source_bytes);
|
|
687
810
|
const total_pages = source_pdf.getPageCount();
|
|
688
|
-
logger.
|
|
811
|
+
logger.debug("split.source.loaded", { total_pages });
|
|
689
812
|
const page_counts = new Array(total_pages + 1).fill(0);
|
|
690
813
|
for (const split of request.splits) {
|
|
691
814
|
for (const page of split.pages) {
|
|
692
815
|
if (page < 1 || page > total_pages) {
|
|
693
|
-
throw new
|
|
694
|
-
|
|
695
|
-
|
|
816
|
+
throw new HazoValidationError3({
|
|
817
|
+
code: "HAZO_PDF_SPLIT_PAGE_OUT_OF_RANGE",
|
|
818
|
+
pkg: "hazo_pdf",
|
|
819
|
+
message: `Split "${split.label}" references page ${page} which is out of range (1-${total_pages})`
|
|
820
|
+
});
|
|
696
821
|
}
|
|
697
822
|
page_counts[page]++;
|
|
698
823
|
}
|
|
699
824
|
}
|
|
700
825
|
for (let p = 1; p <= total_pages; p++) {
|
|
701
826
|
if (page_counts[p] === 0) {
|
|
702
|
-
throw new
|
|
827
|
+
throw new HazoValidationError3({
|
|
828
|
+
code: "HAZO_PDF_SPLIT_PAGE_MISSING",
|
|
829
|
+
pkg: "hazo_pdf",
|
|
830
|
+
message: `Page ${p} is not included in any split`
|
|
831
|
+
});
|
|
703
832
|
}
|
|
704
833
|
if (page_counts[p] > 1) {
|
|
705
|
-
throw new
|
|
834
|
+
throw new HazoValidationError3({
|
|
835
|
+
code: "HAZO_PDF_SPLIT_PAGE_DUPLICATED",
|
|
836
|
+
pkg: "hazo_pdf",
|
|
837
|
+
message: `Page ${p} appears in multiple splits`
|
|
838
|
+
});
|
|
706
839
|
}
|
|
707
840
|
}
|
|
708
|
-
logger.debug("
|
|
841
|
+
logger.debug("split.validation.passed", { total_pages, split_count: request.splits.length });
|
|
709
842
|
const outputs = [];
|
|
710
843
|
const output_folder = request.output_folder || "";
|
|
711
844
|
for (const instruction of request.splits) {
|
|
712
|
-
logger.debug("
|
|
845
|
+
logger.debug("split.instruction.start", {
|
|
713
846
|
split_id: instruction.split_id,
|
|
714
847
|
label: instruction.label,
|
|
715
848
|
pages: instruction.pages
|
|
@@ -724,14 +857,17 @@ async function split_pdf(request, file_manager) {
|
|
|
724
857
|
const byte_size = pdf_bytes.byteLength;
|
|
725
858
|
const file_name = instruction.output_filename || generate_filename(instruction.label, instruction.pages);
|
|
726
859
|
const file_path = output_folder ? `${output_folder.replace(/\/$/, "")}/${file_name}` : file_name;
|
|
727
|
-
logger.debug("
|
|
860
|
+
logger.debug("split.upload", { file_path, byte_size });
|
|
728
861
|
const upload_result = await file_manager.uploadFile(new Uint8Array(pdf_bytes), file_path, {
|
|
729
862
|
overwrite: true
|
|
730
863
|
});
|
|
731
864
|
if (!upload_result.success) {
|
|
732
|
-
throw new
|
|
733
|
-
|
|
734
|
-
|
|
865
|
+
throw new HazoExternalError3({
|
|
866
|
+
code: "HAZO_PDF_EXTERNAL_UPLOAD_FAILED",
|
|
867
|
+
pkg: "hazo_pdf",
|
|
868
|
+
message: upload_result.error || `Failed to upload split "${instruction.label}" to ${file_path}`,
|
|
869
|
+
httpStatus: 502
|
|
870
|
+
});
|
|
735
871
|
}
|
|
736
872
|
outputs.push({
|
|
737
873
|
split_id: instruction.split_id,
|
|
@@ -743,7 +879,7 @@ async function split_pdf(request, file_manager) {
|
|
|
743
879
|
page_count: instruction.pages.length,
|
|
744
880
|
byte_size
|
|
745
881
|
});
|
|
746
|
-
logger.
|
|
882
|
+
logger.debug("split.upload.done", {
|
|
747
883
|
split_id: instruction.split_id,
|
|
748
884
|
label: instruction.label,
|
|
749
885
|
file_path,
|
|
@@ -751,7 +887,7 @@ async function split_pdf(request, file_manager) {
|
|
|
751
887
|
byte_size
|
|
752
888
|
});
|
|
753
889
|
}
|
|
754
|
-
logger.info("
|
|
890
|
+
logger.info("pdf.split.complete", {
|
|
755
891
|
source_file_id: request.source_file_id,
|
|
756
892
|
output_count: outputs.length
|
|
757
893
|
});
|
|
@@ -763,9 +899,11 @@ async function split_pdf(request, file_manager) {
|
|
|
763
899
|
|
|
764
900
|
// src/server/index.ts
|
|
765
901
|
if (typeof window !== "undefined") {
|
|
766
|
-
throw new
|
|
767
|
-
|
|
768
|
-
|
|
902
|
+
throw new HazoInternalError({
|
|
903
|
+
code: "HAZO_PDF_SERVER_ONLY",
|
|
904
|
+
pkg: "hazo_pdf",
|
|
905
|
+
message: "hazo_pdf/server cannot be imported in the browser. This module is server-only and requires Node.js runtime."
|
|
906
|
+
});
|
|
769
907
|
}
|
|
770
908
|
export {
|
|
771
909
|
extract_document_data,
|