@uploadista/flow-documents-nodes 0.0.16-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +16 -0
- package/LICENSE +21 -0
- package/README.md +57 -0
- package/dist/index.d.mts +1177 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +396 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +32 -0
- package/src/convert-to-markdown-node.ts +156 -0
- package/src/describe-document-node.ts +92 -0
- package/src/extract-text-node.ts +90 -0
- package/src/index.ts +27 -0
- package/src/merge-pdf-node.ts +144 -0
- package/src/ocr-node.ts +111 -0
- package/src/split-pdf-node.ts +176 -0
- package/tsconfig.json +14 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import { UploadistaError } from "@uploadista/core/errors";
|
|
2
|
+
import {
|
|
3
|
+
completeNodeExecution,
|
|
4
|
+
createFlowNode,
|
|
5
|
+
DocumentAiPlugin,
|
|
6
|
+
DocumentPlugin,
|
|
7
|
+
NodeType,
|
|
8
|
+
resolveUploadMetadata,
|
|
9
|
+
} from "@uploadista/core/flow";
|
|
10
|
+
import { uploadFileSchema } from "@uploadista/core/types";
|
|
11
|
+
import { UploadServer } from "@uploadista/core/upload";
|
|
12
|
+
import { Effect, Either } from "effect";
|
|
13
|
+
|
|
14
|
+
/**
 * Options accepted by {@link createConvertToMarkdownNode}.
 *
 * Both fields only matter when the node falls back to OCR.
 */
export type ConvertToMarkdownNodeParams = {
  /** Resolution forwarded to the OCR call; the node uses "gundam" when omitted. */
  resolution?: "tiny" | "small" | "base" | "gundam" | "large";
  /** Credential identifier placed in the context passed to the Document AI plugin. */
  credentialId?: string;
};
|
|
18
|
+
|
|
19
|
+
/**
 * Creates a flow node that converts a document to Markdown.
 *
 * The node first tries to extract text through the document plugin. When that
 * yields non-blank text, the text is normalised into paragraphs separated by a
 * single blank line. Otherwise (extraction failed or produced only whitespace)
 * the node falls back to the Document AI plugin's OCR with the
 * "convertToMarkdown" task type, which requires the file to have a URL.
 *
 * The result is written to the file metadata as `markdown`, together with
 * `markdownSource` ("text" or "ocr").
 *
 * The node fails with a FLOW_NODE_ERROR when the OCR fallback is needed but the
 * file has no URL, or when the OCR call itself fails.
 *
 * @param id - Node identifier; also reported as `flow.nodeId` on the output.
 * @param params - Optional OCR settings (credential and resolution).
 * @returns An Effect that resolves DocumentPlugin, DocumentAiPlugin and
 *   UploadServer and yields the node built by `createFlowNode`.
 */
export function createConvertToMarkdownNode(
  id: string,
  params: ConvertToMarkdownNodeParams = {},
) {
  return Effect.gen(function* () {
    const documentService = yield* DocumentPlugin;
    const documentAiService = yield* DocumentAiPlugin;
    const uploadServer = yield* UploadServer;

    return yield* createFlowNode({
      id,
      name: "Convert to Markdown",
      description:
        "Convert documents to Markdown format (intelligently uses OCR if needed)",
      type: NodeType.process,
      inputSchema: uploadFileSchema,
      outputSchema: uploadFileSchema,
      run: ({ data: file, flowId, jobId, clientId }) => {
        return Effect.gen(function* () {
          const flow = {
            flowId,
            nodeId: id,
            jobId,
          };

          yield* Effect.logInfo(`Converting file ${file.id} to Markdown`);

          // Read file bytes from upload server
          const fileBytes = yield* uploadServer.read(file.id, clientId);

          // Try to extract text first (for searchable PDFs). Effect.either turns
          // a failure into a value so that we can fall back to OCR instead of
          // failing the node.
          const extractResult = yield* documentService
            .extractText(fileBytes)
            .pipe(Effect.either);

          let markdown: string;
          let markdownSource: "text" | "ocr";

          if (
            Either.isRight(extractResult) &&
            extractResult.right.trim().length > 0
          ) {
            // Successfully extracted text from searchable PDF
            const text = extractResult.right;

            yield* Effect.logInfo(
              `Successfully extracted ${text.length} characters from searchable PDF`,
            );

            // Simple text-to-markdown conversion: trim each paragraph and drop
            // the empty ones so paragraphs end up separated by one blank line.
            markdown = text
              .split("\n\n")
              .map((para: string) => para.trim())
              .filter((para: string) => para.length > 0)
              .join("\n\n");

            markdownSource = "text";

            yield* Effect.logInfo(
              `Converted text to Markdown (${markdown.length} characters)`,
            );
          } else {
            // Surface the reason for a failed extraction; without this the
            // underlying error would be discarded by the fallback.
            if (Either.isLeft(extractResult)) {
              yield* Effect.logWarning(
                "Text extraction failed",
                extractResult.left,
              );
            }

            // Text extraction failed or returned empty - use OCR
            yield* Effect.logInfo(
              "Text extraction failed or returned empty, falling back to OCR",
            );

            const fileUrl = file.url;

            if (!fileUrl) {
              return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
                cause: "URL is required for OCR-based markdown conversion",
              }).toEffect();
            }

            // Build context for DocumentAI plugin
            const context = {
              clientId,
              credentialId: params.credentialId,
            };

            // Perform OCR with markdown conversion
            const ocrResult = yield* documentAiService
              .performOCR(
                fileUrl,
                {
                  taskType: "convertToMarkdown",
                  // `??` so that only a missing value triggers the default
                  resolution: params.resolution ?? "gundam",
                },
                context,
              )
              .pipe(
                Effect.catchAll((error) =>
                  Effect.gen(function* () {
                    yield* Effect.logError("Failed to perform OCR", error);
                    return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
                      cause:
                        error instanceof Error
                          ? error.message
                          : "Failed to perform OCR for markdown conversion",
                    }).toEffect();
                  }),
                ),
              );

            markdown = ocrResult.extractedText;
            markdownSource = "ocr";

            yield* Effect.logInfo(
              `Successfully converted scanned document to Markdown using OCR (${markdown.length} characters)`,
            );
          }

          const { metadata } = resolveUploadMetadata(file.metadata);

          // Add markdown to metadata
          const newMetadata = {
            ...file.metadata,
            ...metadata,
            markdown,
            markdownSource,
          };

          yield* Effect.logInfo(
            `Successfully converted file ${file.id} to Markdown via ${markdownSource}`,
          );

          return completeNodeExecution({
            ...file,
            metadata: newMetadata,
            flow,
          });
        });
      },
    });
  });
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { UploadistaError } from "@uploadista/core/errors";
|
|
2
|
+
import {
|
|
3
|
+
completeNodeExecution,
|
|
4
|
+
createFlowNode,
|
|
5
|
+
DocumentPlugin,
|
|
6
|
+
NodeType,
|
|
7
|
+
resolveUploadMetadata,
|
|
8
|
+
} from "@uploadista/core/flow";
|
|
9
|
+
import { uploadFileSchema } from "@uploadista/core/types";
|
|
10
|
+
import { UploadServer } from "@uploadista/core/upload";
|
|
11
|
+
import { Effect } from "effect";
|
|
12
|
+
|
|
13
|
+
/**
 * Parameters for {@link createDescribeDocumentNode}.
 *
 * The node takes no options; the type only admits an object without properties.
 */
export type DescribeDocumentNodeParams = Record<string, never>;
|
|
14
|
+
|
|
15
|
+
/**
 * Creates the "Describe Document" flow node.
 *
 * The node reads the file bytes from the upload server, asks the document
 * plugin for the document metadata and merges it into the file metadata:
 * `pageCount`, `format` and `fileSize` are always written, while `author`,
 * `title`, `subject`, `creator`, `creationDate` and `modifiedDate` are written
 * only when they are truthy.
 *
 * A failure of the metadata lookup is logged and re-raised as FLOW_NODE_ERROR.
 *
 * @param id - Node identifier; also reported as `flow.nodeId` on the output.
 * @param _params - Unused; the node has no options.
 * @returns An Effect that resolves DocumentPlugin and UploadServer and yields
 *   the node built by `createFlowNode`.
 */
export function createDescribeDocumentNode(
  id: string,
  _params: DescribeDocumentNodeParams = {},
) {
  return Effect.gen(function* () {
    const documentService = yield* DocumentPlugin;
    const uploadServer = yield* UploadServer;

    // Logs the underlying failure, then fails with a FLOW_NODE_ERROR whose
    // cause is the original message when one is available.
    const failMetadataLookup = (error: unknown) =>
      Effect.gen(function* () {
        yield* Effect.logError("Failed to extract metadata", error);

        let reason = "Failed to extract metadata";
        if (error instanceof Error) {
          reason = error.message;
        }

        return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
          cause: reason,
        }).toEffect();
      });

    return yield* createFlowNode({
      id,
      name: "Describe Document",
      description: "Extract metadata from PDF documents",
      type: NodeType.process,
      inputSchema: uploadFileSchema,
      outputSchema: uploadFileSchema,
      run: ({ data: file, flowId, jobId, clientId }) =>
        Effect.gen(function* () {
          yield* Effect.logInfo(
            `Extracting metadata from PDF file ${file.id}`,
          );

          // Fetch the raw document from the upload server
          const pdfBytes = yield* uploadServer.read(file.id, clientId);

          const info = yield* documentService
            .getMetadata(pdfBytes)
            .pipe(Effect.catchAll(failMetadataLookup));

          const resolved = resolveUploadMetadata(file.metadata);

          const {
            author,
            title,
            subject,
            creator,
            creationDate,
            modifiedDate,
          } = info;

          // Optional fields are spread conditionally so falsy values never
          // reach the output metadata.
          const enrichedMetadata = {
            ...file.metadata,
            ...resolved.metadata,
            pageCount: info.pageCount,
            format: info.format,
            ...(author && { author }),
            ...(title && { title }),
            ...(subject && { subject }),
            ...(creator && { creator }),
            ...(creationDate && { creationDate }),
            ...(modifiedDate && { modifiedDate }),
            fileSize: info.fileSize,
          };

          yield* Effect.logInfo(
            `Successfully extracted metadata from file ${file.id}: ${info.pageCount} pages`,
          );

          return completeNodeExecution({
            ...file,
            metadata: enrichedMetadata,
            flow: { flowId, nodeId: id, jobId },
          });
        }),
    });
  });
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { UploadistaError } from "@uploadista/core/errors";
|
|
2
|
+
import {
|
|
3
|
+
completeNodeExecution,
|
|
4
|
+
createFlowNode,
|
|
5
|
+
DocumentPlugin,
|
|
6
|
+
NodeType,
|
|
7
|
+
resolveUploadMetadata,
|
|
8
|
+
} from "@uploadista/core/flow";
|
|
9
|
+
import { uploadFileSchema } from "@uploadista/core/types";
|
|
10
|
+
import { UploadServer } from "@uploadista/core/upload";
|
|
11
|
+
import { Effect } from "effect";
|
|
12
|
+
|
|
13
|
+
/**
 * Parameters for {@link createExtractTextNode}.
 *
 * The node takes no options; the type only admits an object without properties.
 */
export type ExtractTextNodeParams = Record<string, never>;
|
|
14
|
+
|
|
15
|
+
/**
 * Creates the "Extract Text" flow node.
 *
 * The node reads the file bytes from the upload server, extracts the text via
 * the document plugin and stores it as `extractedText` in the file metadata.
 * An empty or whitespace-only result is not an error: the node logs a warning
 * suggesting the OCR node and still completes.
 *
 * A failure of the extraction itself is logged and re-raised as
 * FLOW_NODE_ERROR.
 *
 * @param id - Node identifier; also reported as `flow.nodeId` on the output.
 * @param _params - Unused; the node has no options.
 * @returns An Effect that resolves DocumentPlugin and UploadServer and yields
 *   the node built by `createFlowNode`.
 */
export function createExtractTextNode(
  id: string,
  _params: ExtractTextNodeParams = {},
) {
  return Effect.gen(function* () {
    const documentService = yield* DocumentPlugin;
    const uploadServer = yield* UploadServer;

    // Logs the underlying failure, then fails with a FLOW_NODE_ERROR whose
    // cause is the original message when one is available.
    const failExtraction = (error: unknown) =>
      Effect.gen(function* () {
        yield* Effect.logError("Failed to extract text", error);

        let reason = "Failed to extract text";
        if (error instanceof Error) {
          reason = error.message;
        }

        return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
          cause: reason,
        }).toEffect();
      });

    return yield* createFlowNode({
      id,
      name: "Extract Text",
      description: "Extract text from searchable PDF documents",
      type: NodeType.process,
      inputSchema: uploadFileSchema,
      outputSchema: uploadFileSchema,
      run: ({ data: file, flowId, jobId, clientId }) =>
        Effect.gen(function* () {
          yield* Effect.logInfo(`Extracting text from PDF file ${file.id}`);

          // Fetch the raw document from the upload server
          const pdfBytes = yield* uploadServer.read(file.id, clientId);

          const extractedText = yield* documentService
            .extractText(pdfBytes)
            .pipe(Effect.catchAll(failExtraction));

          const resolved = resolveUploadMetadata(file.metadata);

          // The text is stored even when it is empty
          const enrichedMetadata = {
            ...file.metadata,
            ...resolved.metadata,
            extractedText,
          };

          if (extractedText && extractedText.trim().length > 0) {
            yield* Effect.logInfo(
              `Successfully extracted ${extractedText.length} characters from file ${file.id}`,
            );
          } else {
            yield* Effect.logWarning(
              `No text extracted from file ${file.id}. This might be a scanned document. Consider using the OCR node instead.`,
            );
          }

          return completeNodeExecution({
            ...file,
            metadata: enrichedMetadata,
            flow: { flowId, nodeId: id, jobId },
          });
        }),
    });
  });
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
// Document processing nodes: each node factory is re-exported together with
// its parameter type.

export { createConvertToMarkdownNode } from "./convert-to-markdown-node";
export type { ConvertToMarkdownNodeParams } from "./convert-to-markdown-node";

export { createDescribeDocumentNode } from "./describe-document-node";
export type { DescribeDocumentNodeParams } from "./describe-document-node";

export { createExtractTextNode } from "./extract-text-node";
export type { ExtractTextNodeParams } from "./extract-text-node";

export { createMergePdfNode } from "./merge-pdf-node";
export type { MergePdfNodeParams } from "./merge-pdf-node";

export { createOcrNode } from "./ocr-node";
export type { OcrNodeParams } from "./ocr-node";

export { createSplitPdfNode } from "./split-pdf-node";
export type { SplitPdfNodeParams } from "./split-pdf-node";
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import { UploadistaError } from "@uploadista/core/errors";
|
|
2
|
+
import {
|
|
3
|
+
completeNodeExecution,
|
|
4
|
+
createFlowNode,
|
|
5
|
+
DocumentPlugin,
|
|
6
|
+
NodeType,
|
|
7
|
+
resolveUploadMetadata,
|
|
8
|
+
} from "@uploadista/core/flow";
|
|
9
|
+
import { uploadFileSchema } from "@uploadista/core/types";
|
|
10
|
+
import { UploadServer } from "@uploadista/core/upload";
|
|
11
|
+
import { Effect } from "effect";
|
|
12
|
+
import { z } from "zod";
|
|
13
|
+
|
|
14
|
+
/**
 * Options accepted by {@link createMergePdfNode}.
 */
export type MergePdfNodeParams = {
  /**
   * NOTE(review): createMergePdfNode does not read this value; presumably it
   * is the expected number of input files — confirm against callers.
   */
  inputCount?: number;
};
|
|
17
|
+
|
|
18
|
+
/** Input schema of the Merge PDFs node: an array of upload files. */
const multipleFilesSchema = z.array(uploadFileSchema);
|
|
20
|
+
|
|
21
|
+
/**
 * Creates the "Merge PDFs" flow node.
 *
 * The node reads every input file from the upload server, merges the bytes
 * with the document plugin and uploads the result to the storage of the first
 * input file as `merged-<n>-documents.pdf`. The output metadata is based on the
 * first file's metadata, extended with `pageCount`, `mergedFrom` and
 * `fileName`.
 *
 * `pageCount` is the sum of the inputs' `pageCount` metadata when every input
 * carries one. When at least one input lacks it, that sum would undercount, so
 * the count is read from the merged document instead; if that lookup fails the
 * partial sum is kept and a warning is logged.
 *
 * The node fails with a FLOW_NODE_ERROR when the input is not an array, when
 * it is empty, or when merging fails.
 *
 * @param id - Node identifier; also reported as `flow.nodeId` on the upload.
 * @param _params - Currently unused.
 * @returns An Effect that resolves DocumentPlugin and UploadServer and yields
 *   the node built by `createFlowNode`.
 */
export function createMergePdfNode(
  id: string,
  _params: MergePdfNodeParams = {},
) {
  return Effect.gen(function* () {
    const documentService = yield* DocumentPlugin;
    const uploadServer = yield* UploadServer;

    return yield* createFlowNode({
      id,
      name: "Merge PDFs",
      description: "Merge multiple PDF documents into one",
      type: NodeType.process,
      inputSchema: multipleFilesSchema,
      outputSchema: uploadFileSchema,
      run: ({ data: files, flowId, jobId, clientId }) => {
        return Effect.gen(function* () {
          const flow = {
            flowId,
            nodeId: id,
            jobId,
          };

          // Validate that we have an array of files
          if (!Array.isArray(files)) {
            return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
              cause: "Merge PDF node requires an array of files from a Merge utility node",
            }).toEffect();
          }

          if (files.length === 0) {
            return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
              cause: "At least one PDF file is required for merging",
            }).toEffect();
          }

          // Read buffers from all files, in input order
          const pdfBuffers: Uint8Array[] = [];
          let knownPages = 0;
          let filesWithoutPageCount = 0;

          for (const file of files) {
            // Read file bytes from upload server
            const fileBytes = yield* uploadServer.read(file.id, clientId);
            pdfBuffers.push(fileBytes);

            // Sum up page counts, remembering inputs that do not provide one
            const fileMetadata = resolveUploadMetadata(file.metadata).metadata;
            const pageCount = fileMetadata?.pageCount;
            if (typeof pageCount === "number" && pageCount > 0) {
              knownPages += pageCount;
            } else {
              filesWithoutPageCount += 1;
            }
          }

          yield* Effect.logInfo(
            `Merging ${files.length} PDF files`,
          );

          // Merge PDFs with error handling
          const mergedPdf = yield* documentService
            .mergePdfs({ pdfs: pdfBuffers })
            .pipe(
              Effect.catchAll((error) =>
                Effect.gen(function* () {
                  yield* Effect.logError("Failed to merge PDFs", error);
                  return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
                    cause:
                      error instanceof Error
                        ? error.message
                        : "Failed to merge PDFs",
                  }).toEffect();
                }),
              ),
            );

          // The metadata sum is only trustworthy when every input had a page
          // count; otherwise ask the document plugin about the merged file.
          let totalPages = knownPages;
          if (filesWithoutPageCount > 0) {
            const measuredPageCount = yield* documentService
              .getMetadata(mergedPdf)
              .pipe(
                Effect.map((info) => info.pageCount),
                Effect.catchAll((error) =>
                  Effect.logWarning(
                    "Failed to read page count of merged PDF",
                    error,
                  ).pipe(Effect.as(undefined)),
                ),
              );

            if (typeof measuredPageCount === "number") {
              totalPages = measuredPageCount;
            } else {
              yield* Effect.logWarning(
                `Page count of merged PDF is incomplete: ${filesWithoutPageCount} of ${files.length} input files have no pageCount metadata`,
              );
            }
          }

          // Use metadata from first file as base
          const firstFile = files[0];
          const { metadata } = resolveUploadMetadata(firstFile.metadata);

          const mergedFileName = `merged-${files.length}-documents.pdf`;
          const mergedMetadata = {
            ...metadata,
            pageCount: totalPages,
            mergedFrom: files.length,
          };

          // Create a single-chunk stream from the merged PDF bytes
          const stream = new ReadableStream({
            start(controller) {
              controller.enqueue(mergedPdf);
              controller.close();
            },
          });

          // Upload the merged PDF back to the upload server
          const result = yield* uploadServer.upload(
            {
              storageId: firstFile.storage.id,
              size: mergedPdf.byteLength,
              type: "application/pdf",
              fileName: mergedFileName,
              lastModified: 0,
              metadata: JSON.stringify(mergedMetadata),
              flow,
            },
            clientId,
            stream,
          );

          const newMetadata = {
            ...mergedMetadata,
            fileName: mergedFileName,
          };

          yield* Effect.logInfo(
            `Successfully merged ${files.length} PDFs into one document with ${totalPages} pages`,
          );

          return completeNodeExecution({
            ...result,
            metadata: newMetadata,
          });
        });
      },
    });
  });
}
|
package/src/ocr-node.ts
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { UploadistaError } from "@uploadista/core/errors";
|
|
2
|
+
import {
|
|
3
|
+
completeNodeExecution,
|
|
4
|
+
createFlowNode,
|
|
5
|
+
DocumentAiPlugin,
|
|
6
|
+
NodeType,
|
|
7
|
+
type OcrTaskType,
|
|
8
|
+
type OcrResolution,
|
|
9
|
+
resolveUploadMetadata,
|
|
10
|
+
} from "@uploadista/core/flow";
|
|
11
|
+
import { uploadFileSchema } from "@uploadista/core/types";
|
|
12
|
+
import { Effect } from "effect";
|
|
13
|
+
|
|
14
|
+
/**
 * Options accepted by {@link createOcrNode}.
 */
export type OcrNodeParams = {
  /** OCR task to run; passed to the OCR call and recorded as `ocrTaskType` in the output metadata. */
  taskType: OcrTaskType;
  /** Optional resolution, passed to the OCR call unchanged. */
  resolution?: OcrResolution;
  /** Optional reference text, passed to the OCR call unchanged. */
  referenceText?: string;
  /** Optional credential identifier placed in the context passed to the Document AI plugin. */
  credentialId?: string;
};
|
|
20
|
+
|
|
21
|
+
/**
 * Creates the "OCR" flow node.
 *
 * The node passes the file URL to the Document AI plugin's `performOCR` with
 * the configured task type, resolution and reference text, and stores the
 * outcome in the file metadata as `ocrText`, `ocrFormat` and `ocrTaskType`.
 *
 * The node fails with a FLOW_NODE_ERROR when the file has no URL or when the
 * OCR call fails (the underlying error is logged first).
 *
 * @param id - Node identifier; also reported as `flow.nodeId` on the output.
 * @param params - OCR settings; `taskType` is required.
 * @returns An Effect that resolves DocumentAiPlugin and yields the node built
 *   by `createFlowNode`.
 */
export function createOcrNode(
  id: string,
  params: OcrNodeParams,
) {
  return Effect.gen(function* () {
    const documentAiService = yield* DocumentAiPlugin;

    // Logs the underlying failure, then fails with a FLOW_NODE_ERROR whose
    // cause is the original message when one is available.
    const failOcr = (error: unknown) =>
      Effect.gen(function* () {
        yield* Effect.logError("Failed to perform OCR", error);

        let reason = "Failed to perform OCR";
        if (error instanceof Error) {
          reason = error.message;
        }

        return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
          cause: reason,
        }).toEffect();
      });

    return yield* createFlowNode({
      id,
      name: "OCR",
      description: "Extract text from scanned documents using AI",
      type: NodeType.process,
      inputSchema: uploadFileSchema,
      outputSchema: uploadFileSchema,
      run: ({ data: file, flowId, jobId, clientId }) =>
        Effect.gen(function* () {
          // The OCR call works on a URL, so a file without one cannot be processed
          const sourceUrl = file.url;
          if (!sourceUrl) {
            return yield* UploadistaError.fromCode("FLOW_NODE_ERROR", {
              cause: "URL is required for OCR operation",
            }).toEffect();
          }

          yield* Effect.logInfo(
            `Starting OCR for file ${file.id} with task type: ${params.taskType}`,
          );

          const { taskType, resolution, referenceText, credentialId } = params;

          const recognized = yield* documentAiService
            .performOCR(
              sourceUrl,
              { taskType, resolution, referenceText },
              // Context for the Document AI plugin
              { clientId, credentialId },
            )
            .pipe(Effect.catchAll(failOcr));

          const resolved = resolveUploadMetadata(file.metadata);

          // Attach the OCR outcome to the existing metadata
          const enrichedMetadata = {
            ...file.metadata,
            ...resolved.metadata,
            ocrText: recognized.extractedText,
            ocrFormat: recognized.format,
            ocrTaskType: params.taskType,
          };

          yield* Effect.logInfo(
            `Successfully completed OCR for file ${file.id}, extracted ${recognized.extractedText.length} characters`,
          );

          return completeNodeExecution({
            ...file,
            metadata: enrichedMetadata,
            flow: { flowId, nodeId: id, jobId },
          });
        }),
    });
  });
}
|