@pdfvector/instance-contract 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.tsc/lib/index.d.ts +5 -0
- package/.tsc/lib/index.js +2 -0
- package/.tsc/lib/router/admin/index.d.ts +3 -0
- package/.tsc/lib/router/admin/index.js +3 -0
- package/.tsc/lib/router/admin/set-domain.d.ts +6 -0
- package/.tsc/lib/router/admin/set-domain.js +20 -0
- package/.tsc/lib/router/admin/set-environment.d.ts +6 -0
- package/.tsc/lib/router/admin/set-environment.js +33 -0
- package/.tsc/lib/router/admin/set-version.d.ts +6 -0
- package/.tsc/lib/router/admin/set-version.js +21 -0
- package/.tsc/lib/router/document/ask.d.ts +22 -0
- package/.tsc/lib/router/document/ask.js +107 -0
- package/.tsc/lib/router/document/extract.d.ts +23 -0
- package/.tsc/lib/router/document/extract.js +124 -0
- package/.tsc/lib/router/document/get-default-spec.d.ts +1 -0
- package/.tsc/lib/router/document/get-default-spec.js +10 -0
- package/.tsc/lib/router/document/index.d.ts +3 -0
- package/.tsc/lib/router/document/index.js +3 -0
- package/.tsc/lib/router/document/parse.d.ts +22 -0
- package/.tsc/lib/router/document/parse.js +110 -0
- package/.tsc/lib/router/index.d.ts +2 -0
- package/.tsc/lib/router/index.js +2 -0
- package/CHANGELOG.md +67 -0
- package/package.json +14 -0
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { InferContractRouterInputs, InferContractRouterOutputs } from "@orpc/contract";
|
|
2
|
+
import * as contract from "./router";
|
|
3
|
+
export { contract };
|
|
4
|
+
/** Input types for every procedure of the exported `contract` router, as inferred by `InferContractRouterInputs`. */
export type ContractInputs = InferContractRouterInputs<typeof contract>;
|
|
5
|
+
/** Output types for every procedure of the exported `contract` router, as inferred by `InferContractRouterOutputs`. */
export type ContractOutputs = InferContractRouterOutputs<typeof contract>;
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Type declaration for the `setDomain` contract procedure.
 *
 * Input schema: object with a string `domain`.
 * Output schema: object with a string `message`.
 * The remaining two type arguments are empty records.
 */
export declare const setDomain: import("@orpc/contract").ContractProcedureBuilderWithInputOutput<
  z.ZodObject<{ domain: z.ZodString }, z.core.$strip>,
  z.ZodObject<{ message: z.ZodString }, z.core.$strip>,
  Record<never, never>,
  Record<never, never>
>;
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { oc } from "@orpc/contract";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
// Output schema used by `setDomain`: an object carrying a string `message`.
const adminOutputSchema = z.object({ message: z.string() });
|
|
4
|
+
// One or more dot-terminated labels (alphanumeric, hyphens allowed only in the
// middle of a label) followed by a final label of at least two letters.
const hostnamePattern = /^([a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}$/;

// Input schema for `setDomain`: `domain` must be a string matching `hostnamePattern`.
const setDomainInputSchema = z.object({
  domain: z.string().regex(hostnamePattern, "Must be a valid hostname (e.g. app.example.com)"),
});
|
|
9
|
+
/**
 * Contract procedure for setting the instance server's domain.
 *
 * Input is validated with `setDomainInputSchema`, output with
 * `adminOutputSchema`. The `spec` callback sets the operation's `security`
 * to a single `bearerAuth` requirement and returns the same operation object.
 */
export const setDomain = oc
  .route({
    summary: "Set instance server's domain",
    description: "Update the instance server's domain name",
    tags: ["Admin"],
    spec: (op) => Object.assign(op, { security: [{ bearerAuth: [] }] }),
  })
  .input(setDomainInputSchema)
  .output(adminOutputSchema);
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Type declaration for the `setEnvironment` contract procedure.
 *
 * Input schema: object with `environment`, a record of string keys to string values.
 * Output schema: object with a string `message`.
 * The remaining two type arguments are empty records.
 */
export declare const setEnvironment: import("@orpc/contract").ContractProcedureBuilderWithInputOutput<
  z.ZodObject<{ environment: z.ZodRecord<z.ZodString, z.ZodString> }, z.core.$strip>,
  z.ZodObject<{ message: z.ZodString }, z.core.$strip>,
  Record<never, never>,
  Record<never, never>
>;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { oc } from "@orpc/contract";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
// Environment variable names that `setEnvironment` refuses to accept.
// Order matters: the validation message lists them in insertion order.
const reservedKeyNames = [
  "NODE_ENV",
  "PDFVECTOR_LOCAL",
  "PDFVECTOR_RELEASES_REGION",
  "PDFVECTOR_RELEASES_S3_BUCKET",
  "PDFVECTOR_RELEASES_DYNAMODB_TABLE",
  "PDFVECTOR_INSTANCE_SERVER_DOMAIN",
];
const reservedKeys = new Set(reservedKeyNames);
|
|
11
|
+
// Keys: upper-case letters, digits and underscores, not starting with a digit.
const environmentKeySchema = z
  .string()
  .regex(/^[A-Z_][A-Z0-9_]*$/, "Keys must be valid env var names (e.g. MY_VAR)");

// Values: any string without carriage returns or line feeds.
const environmentValueSchema = z
  .string()
  .regex(/^[^\n\r]*$/, "Values must not contain newlines");

// The record must be non-empty and must not contain any key from `reservedKeys`.
const environmentSchema = z
  .record(environmentKeySchema, environmentValueSchema)
  .refine((env) => Object.keys(env).length > 0, {
    message: "At least one environment variable is required",
  })
  .refine((env) => Object.keys(env).every((key) => !reservedKeys.has(key)), {
    message: `Cannot set reserved keys: ${Array.from(reservedKeys).join(", ")}`,
  });

/**
 * Contract procedure for setting the instance server's environment variables.
 *
 * Input is an object whose `environment` field is validated by
 * `environmentSchema`; output is an object with a string `message`. The `spec`
 * callback sets the operation's `security` to a single `bearerAuth`
 * requirement and returns the same operation object.
 */
export const setEnvironment = oc
  .route({
    summary: "Set instance server's environment",
    description: "Update the instance server's environment variables",
    tags: ["Admin"],
    spec: (op) => Object.assign(op, { security: [{ bearerAuth: [] }] }),
  })
  .input(z.object({ environment: environmentSchema }))
  .output(z.object({ message: z.string() }));
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Type declaration for the `setVersion` contract procedure.
 *
 * Input schema: object with `version`, a union of a string and the literal `"LATEST"`.
 * Output schema: object with a string `message`.
 * The remaining two type arguments are empty records.
 */
export declare const setVersion: import("@orpc/contract").ContractProcedureBuilderWithInputOutput<
  z.ZodObject<
    { version: z.ZodUnion<readonly [z.ZodString, z.ZodLiteral<"LATEST">]> },
    z.core.$strip
  >,
  z.ZodObject<{ message: z.ZodString }, z.core.$strip>,
  Record<never, never>,
  Record<never, never>
>;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { oc } from "@orpc/contract";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
// Three dot-separated groups of digits, e.g. "1.2.3".
const numericVersionSchema = z
  .string()
  .regex(/^\d+\.\d+\.\d+$/, "Must be a valid semver (e.g. 1.2.3)");

// Input schema for `setVersion`: a numeric version string or the literal "LATEST".
const setVersionInputSchema = z.object({
  version: z.union([numericVersionSchema, z.literal("LATEST")]),
});
|
|
9
|
+
// Output schema used by `setVersion`: an object carrying a string `message`.
const adminOutputSchema = z.object({ message: z.string() });
|
|
10
|
+
/**
 * Contract procedure for setting the instance server's version.
 *
 * Input is validated with `setVersionInputSchema`, output with
 * `adminOutputSchema`. The `spec` callback sets the operation's `security`
 * to a single `bearerAuth` requirement and returns the same operation object.
 */
export const setVersion = oc
  .route({
    summary: "Set instance server's version",
    description: "Update the instance server to a specific version",
    tags: ["Admin"],
    spec: (op) => Object.assign(op, { security: [{ bearerAuth: [] }] }),
  })
  .input(setVersionInputSchema)
  .output(adminOutputSchema);
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Type declaration for the `ask` contract procedure.
 *
 * Input schema: optional `url`, `file` and `base64`, a required string
 * `question`, and a `model` enum ("auto" | "nano" | "mini" | "pro" | "max")
 * that is optional with a default.
 * Output schema: string `markdown` and a `model` enum without "auto".
 * The remaining two type arguments are empty records.
 */
export declare const ask: import("@orpc/contract").ContractProcedureBuilderWithInputOutput<
  z.ZodObject<
    {
      url: z.ZodOptional<z.ZodURL>;
      file: z.ZodOptional<z.ZodFile>;
      base64: z.ZodOptional<z.ZodString>;
      question: z.ZodString;
      model: z.ZodDefault<
        z.ZodOptional<
          z.ZodEnum<{
            auto: "auto";
            nano: "nano";
            mini: "mini";
            pro: "pro";
            max: "max";
          }>
        >
      >;
    },
    z.core.$strip
  >,
  z.ZodObject<
    {
      markdown: z.ZodString;
      model: z.ZodEnum<{
        nano: "nano";
        mini: "mini";
        pro: "pro";
        max: "max";
      }>;
    },
    z.core.$strip
  >,
  Record<never, never>,
  Record<never, never>
>;
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { oc } from "@orpc/contract";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { getDefaultSpec } from "./get-default-spec";
|
|
4
|
+
// MIME types accepted by the `file` field of the ask input.
const askFileMimeTypes = [
  "application/pdf",
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  "text/csv",
  "application/csv",
  "image/png",
  "image/jpeg",
];

// Description of the `model` field; the sentences are joined with single spaces.
const askModelDescription = [
  "Model tier for answering the question.",
  "'auto' (default): Automatically selects the best tier based on document page count and document complexity.",
  "'nano': Uses lightweight models (gpt-4o-mini, gemini-2.5-flash, claude-haiku-3-5, ...). Supports PDF, Word, Excel, CSV.",
  "'mini': Uses mid-range models (gpt-5-mini, gemini-3-flash, claude-haiku-4-5, ...). Supports PDF, Word, Excel, CSV.",
  "'pro': Uses capable models (gpt-5.2, gemini-3.1-pro, claude-sonnet-4-6, ...). Supports PDF, Word, Excel, CSV, Image.",
  "'max': Uses the most powerful models (o3, claude-opus-4-6, ...). Supports PDF, Word, Excel, CSV, Image.",
].join(" ");

/**
 * Input schema for `ask`.
 *
 * `url`, `file` and `base64` are each optional; this schema itself does not
 * require that one of them is present. `question` is a string of at least
 * four characters. `model` falls back to "auto" when omitted.
 */
const askInputSchema = z.object({
  url: z.url().optional().describe("URL of the document file to fetch and parse"),
  file: z
    .file()
    .mime(askFileMimeTypes)
    .optional()
    .describe("Document file upload via multipart form-data (PDF, DOCX, XLSX, CSV, PNG, JPG)"),
  base64: z.string().optional().describe("Base64-encoded document file content"),
  question: z.string().min(4).describe("The question to answer about the document"),
  model: z
    .enum(["auto", "nano", "mini", "pro", "max"])
    .optional()
    .default("auto")
    .describe(askModelDescription),
});
|
|
41
|
+
// Description of the output `model` field; the sentences are joined with single spaces.
const askUsedModelDescription = [
  "Model tier used to answer the question.",
  "'nano': Supports PDF, Word, Excel, CSV.",
  "'mini': Supports PDF, Word, Excel, CSV.",
  "'pro': Supports PDF, Word, Excel, CSV, Image.",
  "'max': Supports PDF, Word, Excel, CSV, Image.",
].join(" ");

/**
 * Output schema for `ask`: the answer as `markdown` plus the `model` tier
 * that was used. One example value is attached as schema metadata.
 */
const askOutputSchema = z
  .object({
    markdown: z.string().describe("The answer to the question"),
    model: z.enum(["nano", "mini", "pro", "max"]).describe(askUsedModelDescription),
  })
  .meta({
    examples: [
      {
        markdown: "The study found that viral shedding peaked during the first week of symptoms, with the highest viral loads detected in throat swabs.",
        model: "mini",
      },
    ],
  });
|
|
60
|
+
const requestExamples = {
|
|
61
|
+
"Ask from URL": {
|
|
62
|
+
summary: "Ask from URL",
|
|
63
|
+
value: {
|
|
64
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
65
|
+
question: "What are the main findings of this study?",
|
|
66
|
+
},
|
|
67
|
+
},
|
|
68
|
+
"Ask from base64": {
|
|
69
|
+
summary: "Ask from base64",
|
|
70
|
+
value: {
|
|
71
|
+
base64: "JVBERi0xLjAKMSAwIG9iajw8L1R5cGUvQ2F0YWxvZy9QYWdlcyAyIDAgUj4+ZW5kb2JqIDIgMCBvYmo8PC9UeXBlL1BhZ2VzL0tpZHNbMyAwIFJdL0NvdW50IDE+PmVuZG9iaiAzIDAgb2JqPDwvVHlwZS9QYWdlL01lZGlhQm94WzAgMCAzIDNdL1BhcmVudCAyIDAgUj4+ZW5kb2JqCnhyZWYKMCA0CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDAwOSAwMDAwMCBuIAowMDAwMDAwMDU4IDAwMDAwIG4gCjAwMDAwMDAxMTUgMDAwMDAgbiAKdHJhaWxlcjw8L1NpemUgNC9Sb290IDEgMCBSPj4Kc3RhcnR4cmVmCjE5MAolJUVPRg==",
|
|
72
|
+
question: "What is the content of this document?",
|
|
73
|
+
},
|
|
74
|
+
},
|
|
75
|
+
"Ask from file upload": {
|
|
76
|
+
summary: "Ask from file upload",
|
|
77
|
+
value: {
|
|
78
|
+
file: "(binary)",
|
|
79
|
+
question: "Summarize this document",
|
|
80
|
+
},
|
|
81
|
+
},
|
|
82
|
+
"Ask with lightweight models (nano)": {
|
|
83
|
+
summary: "Ask with lightweight models (nano)",
|
|
84
|
+
value: {
|
|
85
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
86
|
+
question: "What is the title of this paper?",
|
|
87
|
+
model: "nano",
|
|
88
|
+
},
|
|
89
|
+
},
|
|
90
|
+
"Ask with powerful models (max)": {
|
|
91
|
+
summary: "Ask with powerful models (max)",
|
|
92
|
+
value: {
|
|
93
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
94
|
+
question: "Provide a detailed analysis of the methodology used in this study.",
|
|
95
|
+
model: "max",
|
|
96
|
+
},
|
|
97
|
+
},
|
|
98
|
+
};
|
|
99
|
+
/**
 * Contract procedure for asking a question about a document.
 *
 * Input is validated with `askInputSchema`, output with `askOutputSchema`.
 * The `spec` callback delegates to `getDefaultSpec` with this module's
 * `requestExamples`.
 */
export const ask = oc
  .route({
    summary: "Ask a question about a document",
    description: "Parse a document and answer a question about its content using AI. Supports PDF, Word (.docx), Excel (.xlsx), CSV, and Image (.png, .jpg) files. Provide the document via file upload, a public URL, or a base64-encoded string.",
    tags: ["Document"],
    spec(op) {
      return getDefaultSpec(op, requestExamples);
    },
  })
  .input(askInputSchema)
  .output(askOutputSchema);
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Type declaration for the `extract` contract procedure.
 *
 * Input schema: optional `url`, `file` and `base64`, a required string
 * `prompt`, a `schema` record of string keys to unknown values, and a `model`
 * enum ("auto" | "nano" | "mini" | "pro" | "max") that is optional with a default.
 * Output schema: `data` of unknown type and a `model` enum without "auto".
 * The remaining two type arguments are empty records.
 */
export declare const extract: import("@orpc/contract").ContractProcedureBuilderWithInputOutput<
  z.ZodObject<
    {
      url: z.ZodOptional<z.ZodURL>;
      file: z.ZodOptional<z.ZodFile>;
      base64: z.ZodOptional<z.ZodString>;
      prompt: z.ZodString;
      schema: z.ZodRecord<z.ZodString, z.ZodUnknown>;
      model: z.ZodDefault<
        z.ZodOptional<
          z.ZodEnum<{
            auto: "auto";
            nano: "nano";
            mini: "mini";
            pro: "pro";
            max: "max";
          }>
        >
      >;
    },
    z.core.$strip
  >,
  z.ZodObject<
    {
      data: z.ZodUnknown;
      model: z.ZodEnum<{
        nano: "nano";
        mini: "mini";
        pro: "pro";
        max: "max";
      }>;
    },
    z.core.$strip
  >,
  Record<never, never>,
  Record<never, never>
>;
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { oc } from "@orpc/contract";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { getDefaultSpec } from "./get-default-spec";
|
|
4
|
+
// MIME types accepted by the `file` field of the extract input.
const extractFileMimeTypes = [
  "application/pdf",
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  "text/csv",
  "application/csv",
  "image/png",
  "image/jpeg",
];

// Description of the `model` field; the sentences are joined with single spaces.
const extractModelDescription = [
  "Model tier for extracting structured data.",
  "'auto' (default): Automatically selects the best tier based on document page count and document complexity.",
  "'nano': Uses lightweight models (gpt-4o-mini, gemini-2.5-flash, claude-haiku-3-5, ...). Supports PDF, Word, Excel, CSV.",
  "'mini': Uses mid-range models (gpt-5-mini, gemini-3-flash, claude-haiku-4-5, ...). Supports PDF, Word, Excel, CSV.",
  "'pro': Uses capable models (gpt-5.2, gemini-3.1-pro, claude-sonnet-4-6, ...). Supports PDF, Word, Excel, CSV, Image.",
  "'max': Uses the most powerful models (o3, claude-opus-4-6, ...). Supports PDF, Word, Excel, CSV, Image.",
].join(" ");

/**
 * Input schema for `extract`.
 *
 * `url`, `file` and `base64` are each optional; this schema itself does not
 * require that one of them is present. `prompt` is a string of at least four
 * characters, `schema` is a record with string keys and unvalidated values,
 * and `model` falls back to "auto" when omitted.
 */
const extractInputSchema = z.object({
  url: z.url().optional().describe("URL of the document file to fetch and parse"),
  file: z
    .file()
    .mime(extractFileMimeTypes)
    .optional()
    .describe("Document file upload via multipart form-data (PDF, DOCX, XLSX, CSV, PNG, JPG)"),
  base64: z.string().optional().describe("Base64-encoded document file content"),
  prompt: z
    .string()
    .min(4)
    .describe("The prompt instructing the AI how to extract data from the document"),
  schema: z
    .record(z.string(), z.unknown())
    .describe("JSON Schema describing the structure of the data to extract from the document"),
  model: z
    .enum(["auto", "nano", "mini", "pro", "max"])
    .optional()
    .default("auto")
    .describe(extractModelDescription),
});
|
|
44
|
+
// Description of the output `model` field; the sentences are joined with single spaces.
const extractUsedModelDescription = [
  "Model tier used to extract the data.",
  "'nano': Supports PDF, Word, Excel, CSV.",
  "'mini': Supports PDF, Word, Excel, CSV.",
  "'pro': Supports PDF, Word, Excel, CSV, Image.",
  "'max': Supports PDF, Word, Excel, CSV, Image.",
].join(" ");

/**
 * Output schema for `extract`: the extracted `data` (not validated by this
 * schema) plus the `model` tier that was used. One example value is attached
 * as schema metadata.
 */
const extractOutputSchema = z
  .object({
    data: z.unknown().describe("Extracted structured data matching the provided JSON Schema"),
    model: z.enum(["nano", "mini", "pro", "max"]).describe(extractUsedModelDescription),
  })
  .meta({
    examples: [
      {
        data: {
          title: "Virological assessment of hospitalized patients with COVID-2019",
          authors: ["Roman Wölfel", "Victor M. Corman"],
          year: 2020,
        },
        model: "mini",
      },
    ],
  });
|
|
69
|
+
const requestExamples = {
|
|
70
|
+
"Extract from URL": {
|
|
71
|
+
summary: "Extract from URL",
|
|
72
|
+
value: {
|
|
73
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
74
|
+
prompt: "Extract the title, authors, and publication year from this research paper",
|
|
75
|
+
schema: {
|
|
76
|
+
type: "object",
|
|
77
|
+
properties: {
|
|
78
|
+
title: { type: "string" },
|
|
79
|
+
authors: { type: "array", items: { type: "string" } },
|
|
80
|
+
year: { type: "number" },
|
|
81
|
+
},
|
|
82
|
+
required: ["title", "authors", "year"],
|
|
83
|
+
},
|
|
84
|
+
},
|
|
85
|
+
},
|
|
86
|
+
"Extract from base64": {
|
|
87
|
+
summary: "Extract from base64",
|
|
88
|
+
value: {
|
|
89
|
+
base64: "JVBERi0xLjAKMSAwIG9iajw8L1R5cGUvQ2F0YWxvZy9QYWdlcyAyIDAgUj4+ZW5kb2JqIDIgMCBvYmo8PC9UeXBlL1BhZ2VzL0tpZHNbMyAwIFJdL0NvdW50IDE+PmVuZG9iaiAzIDAgb2JqPDwvVHlwZS9QYWdlL01lZGlhQm94WzAgMCAzIDNdL1BhcmVudCAyIDAgUj4+ZW5kb2JqCnhyZWYKMCA0CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDAwOSAwMDAwMCBuIAowMDAwMDAwMDU4IDAwMDAwIG4gCjAwMDAwMDAxMTUgMDAwMDAgbiAKdHJhaWxlcjw8L1NpemUgNC9Sb290IDEgMCBSPj4Kc3RhcnR4cmVmCjE5MAolJUVPRg==",
|
|
90
|
+
prompt: "Extract the main content from this document",
|
|
91
|
+
schema: {
|
|
92
|
+
type: "object",
|
|
93
|
+
properties: {
|
|
94
|
+
content: { type: "string" },
|
|
95
|
+
},
|
|
96
|
+
required: ["content"],
|
|
97
|
+
},
|
|
98
|
+
},
|
|
99
|
+
},
|
|
100
|
+
"Extract from file upload": {
|
|
101
|
+
summary: "Extract from file upload",
|
|
102
|
+
value: {
|
|
103
|
+
file: "(binary)",
|
|
104
|
+
prompt: "Extract the title and summary from this document",
|
|
105
|
+
schema: {
|
|
106
|
+
type: "object",
|
|
107
|
+
properties: {
|
|
108
|
+
title: { type: "string" },
|
|
109
|
+
summary: { type: "string" },
|
|
110
|
+
},
|
|
111
|
+
required: ["title", "summary"],
|
|
112
|
+
},
|
|
113
|
+
},
|
|
114
|
+
},
|
|
115
|
+
};
|
|
116
|
+
/**
 * Contract procedure for extracting structured data from a document.
 *
 * Input is validated with `extractInputSchema`, output with
 * `extractOutputSchema`. The `spec` callback delegates to `getDefaultSpec`
 * with this module's `requestExamples`.
 */
export const extract = oc
  .route({
    summary: "Extract structured data from a document",
    description: "Parse a document and extract structured data matching a provided JSON Schema using AI. Supports PDF, Word (.docx), Excel (.xlsx), CSV, and Image (.png, .jpg) files. Provide the document via file upload, a public URL, or a base64-encoded string.",
    tags: ["Document"],
    spec(op) {
      return getDefaultSpec(op, requestExamples);
    },
  })
  .input(extractInputSchema)
  .output(extractOutputSchema);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/** Type declaration for `getDefaultSpec`: takes an operation record and a record of request examples, and returns a record. */
export declare function getDefaultSpec(op: Record<string, unknown>, requestExamples: Record<string, unknown>): Record<string, unknown>;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export function getDefaultSpec(op, requestExamples) {
|
|
2
|
+
op.security = [{ bearerAuth: [] }];
|
|
3
|
+
const reqBody = op.requestBody;
|
|
4
|
+
if (reqBody?.content) {
|
|
5
|
+
for (const mediaType of Object.values(reqBody.content)) {
|
|
6
|
+
mediaType.examples = requestExamples;
|
|
7
|
+
}
|
|
8
|
+
}
|
|
9
|
+
return op;
|
|
10
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Type declaration for the `parse` contract procedure.
 *
 * Input schema: optional `url`, `file` and `base64`, and a `model` enum
 * ("auto" | "nano" | "mini" | "pro" | "max") that is optional with a default.
 * Output schema: string `markdown`, numeric `pageCount` and a `model` enum
 * without "auto".
 * The remaining two type arguments are empty records.
 */
export declare const parse: import("@orpc/contract").ContractProcedureBuilderWithInputOutput<
  z.ZodObject<
    {
      url: z.ZodOptional<z.ZodURL>;
      file: z.ZodOptional<z.ZodFile>;
      base64: z.ZodOptional<z.ZodString>;
      model: z.ZodDefault<
        z.ZodOptional<
          z.ZodEnum<{
            auto: "auto";
            nano: "nano";
            mini: "mini";
            pro: "pro";
            max: "max";
          }>
        >
      >;
    },
    z.core.$strip
  >,
  z.ZodObject<
    {
      markdown: z.ZodString;
      pageCount: z.ZodNumber;
      model: z.ZodEnum<{
        nano: "nano";
        mini: "mini";
        pro: "pro";
        max: "max";
      }>;
    },
    z.core.$strip
  >,
  Record<never, never>,
  Record<never, never>
>;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { oc } from "@orpc/contract";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
import { getDefaultSpec } from "./get-default-spec";
|
|
4
|
+
// MIME types accepted by the `file` field of the parse input.
const parseFileMimeTypes = [
  "application/pdf",
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  "text/csv",
  "application/csv",
  "image/png",
  "image/jpeg",
];

// Description of the `model` field; the sentences are joined with single spaces.
const parseModelDescription = [
  "Model tier for parsing.",
  "'auto' (default): Automatically selects the best parsing strategy with intelligent fallback.",
  "'nano': For simple documents with plain text content. Supports PDF, Word, Excel, CSV.",
  "'mini': For documents with tables and structured content. Supports PDF, Word, Excel, CSV.",
  "'pro': For documents up to ~50 pages with tables, handwritten text, figures, math, and Arabic. Supports PDF, Word, Excel, CSV, Image.",
  "'max': For large documents up to ~100MB / ~1000 pages with full Pro capabilities plus enhanced multilingual support. Supports PDF, Word, Excel, CSV, Image.",
].join(" ");

/**
 * Input schema for `parse`.
 *
 * `url`, `file` and `base64` are each optional; this schema itself does not
 * require that one of them is present. `model` falls back to "auto" when
 * omitted.
 */
const parseInputSchema = z.object({
  url: z.url().optional().describe("URL of the document file to fetch and parse"),
  file: z
    .file()
    .mime(parseFileMimeTypes)
    .optional()
    .describe("Document file upload via multipart form-data (PDF, DOCX, XLSX, CSV, PNG, JPG)"),
  base64: z.string().optional().describe("Base64-encoded document file content"),
  model: z
    .enum(["auto", "nano", "mini", "pro", "max"])
    .optional()
    .default("auto")
    .describe(parseModelDescription),
});
|
|
37
|
+
// Description of the output `model` field; the sentences are joined with single spaces.
const parseUsedModelDescription = [
  "Model tier used to parse the document.",
  "'nano': Supports PDF, Word, Excel, CSV.",
  "'mini': Supports PDF, Word, Excel, CSV.",
  "'pro': Supports PDF, Word, Excel, CSV, Image.",
  "'max': Supports PDF, Word, Excel, CSV, Image.",
].join(" ");

/**
 * Output schema for `parse`: the extracted text as `markdown`, an integer
 * `pageCount`, and the `model` tier that was used. One example value is
 * attached as schema metadata.
 */
const parseOutputSchema = z
  .object({
    markdown: z.string().describe("Extracted text content from the document"),
    pageCount: z.number().int().describe("Total number of pages in the document"),
    model: z.enum(["nano", "mini", "pro", "max"]).describe(parseUsedModelDescription),
  })
  .meta({
    examples: [
      {
        markdown: "Nature | Vol 581 | 28 May 2020 | 465\nArticle\nVirological assessment of hospitalized patients with COVID-2019...",
        pageCount: 12,
        model: "nano",
      },
    ],
  });
|
|
61
|
+
const requestExamples = {
|
|
62
|
+
"Parse from URL": {
|
|
63
|
+
summary: "Parse from URL",
|
|
64
|
+
value: {
|
|
65
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
66
|
+
},
|
|
67
|
+
},
|
|
68
|
+
"Parse from base64": {
|
|
69
|
+
summary: "Parse from base64",
|
|
70
|
+
value: {
|
|
71
|
+
base64: "JVBERi0xLjAKMSAwIG9iajw8L1R5cGUvQ2F0YWxvZy9QYWdlcyAyIDAgUj4+ZW5kb2JqIDIgMCBvYmo8PC9UeXBlL1BhZ2VzL0tpZHNbMyAwIFJdL0NvdW50IDE+PmVuZG9iaiAzIDAgb2JqPDwvVHlwZS9QYWdlL01lZGlhQm94WzAgMCAzIDNdL1BhcmVudCAyIDAgUj4+ZW5kb2JqCnhyZWYKMCA0CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDAwOSAwMDAwMCBuIAowMDAwMDAwMDU4IDAwMDAwIG4gCjAwMDAwMDAxMTUgMDAwMDAgbiAKdHJhaWxlcjw8L1NpemUgNC9Sb290IDEgMCBSPj4Kc3RhcnR4cmVmCjE5MAolJUVPRg==",
|
|
72
|
+
},
|
|
73
|
+
},
|
|
74
|
+
"Parse from file upload": {
|
|
75
|
+
summary: "Parse from file upload",
|
|
76
|
+
value: {
|
|
77
|
+
file: "(binary)",
|
|
78
|
+
},
|
|
79
|
+
},
|
|
80
|
+
"Parse with simple PDF (nano)": {
|
|
81
|
+
summary: "Parse with simple PDF (nano)",
|
|
82
|
+
value: {
|
|
83
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
84
|
+
model: "nano",
|
|
85
|
+
},
|
|
86
|
+
},
|
|
87
|
+
"Parse with small complex documents (pro)": {
|
|
88
|
+
summary: "Parse with small complex documents (pro)",
|
|
89
|
+
value: {
|
|
90
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
91
|
+
model: "pro",
|
|
92
|
+
},
|
|
93
|
+
},
|
|
94
|
+
"Parse with large complex documents (max)": {
|
|
95
|
+
summary: "Parse with large complex documents (max)",
|
|
96
|
+
value: {
|
|
97
|
+
url: "https://www.nature.com/articles/s41586-020-2196-x.pdf",
|
|
98
|
+
model: "max",
|
|
99
|
+
},
|
|
100
|
+
},
|
|
101
|
+
};
|
|
102
|
+
/**
 * Contract procedure for parsing a document.
 *
 * Input is validated with `parseInputSchema`, output with
 * `parseOutputSchema`. The `spec` callback delegates to `getDefaultSpec`
 * with this module's `requestExamples`.
 */
export const parse = oc
  .route({
    summary: "Parse a document",
    description: "Extract text and page count from a document. Supports PDF, Word (.docx), Excel (.xlsx), CSV, and Image (.png, .jpg) files. Provide the document via file upload, a public URL, or a base64-encoded string.",
    tags: ["Document"],
    spec(op) {
      return getDefaultSpec(op, requestExamples);
    },
  })
  .input(parseInputSchema)
  .output(parseOutputSchema);
|
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# @pdfvector/instance-contract
|
|
2
|
+
|
|
3
|
+
## 0.0.7
|
|
4
|
+
### Patch Changes
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
- [#54](https://github.com/phuctm97/pdfvector/pull/54) [`6c00753`](https://github.com/phuctm97/pdfvector/commit/6c0075399cd90d42c38017bc5dbcd8df142fefbd) Thanks [@phuctm97](https://github.com/phuctm97)! - Re-release all packages
|
|
9
|
+
|
|
10
|
+
## 0.0.6
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
- [#52](https://github.com/phuctm97/pdfvector/pull/52) [`fb83b42`](https://github.com/phuctm97/pdfvector/commit/fb83b42ca2ed186d3e5822677b945db100ec7877) Thanks [@phuctm97](https://github.com/phuctm97)! - Transpile with Bun instead of Node.js
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
- [#51](https://github.com/phuctm97/pdfvector/pull/51) [`02e4d4f`](https://github.com/phuctm97/pdfvector/commit/02e4d4f8b08cd2dc86f588c1520bbb95a0e1c5ed) Thanks [@phuctm97](https://github.com/phuctm97)! - Rename all instance stuff to instance-contract, instance-server, instance-client, etc.
|
|
20
|
+
|
|
21
|
+
## 0.0.5
|
|
22
|
+
### Patch Changes
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
- [#48](https://github.com/phuctm97/pdfvector/pull/48) [`50de78b`](https://github.com/phuctm97/pdfvector/commit/50de78bf39eabc2edd8d3f1288d1d7a44e5d6a75) Thanks [@phuctm97](https://github.com/phuctm97)! - Use tsc-alias to remove alias before publishing
|
|
27
|
+
|
|
28
|
+
## 0.0.4
|
|
29
|
+
### Patch Changes
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
- [#47](https://github.com/phuctm97/pdfvector/pull/47) [`102afff`](https://github.com/phuctm97/pdfvector/commit/102afff7ddf8041cab363c816997bcda005df8c1) Thanks [@phuctm97](https://github.com/phuctm97)! - Publish to NPM
|
|
34
|
+
|
|
35
|
+
## 0.0.3
|
|
36
|
+
### Patch Changes
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
- [#37](https://github.com/phuctm97/pdfvector/pull/37) [`0060814`](https://github.com/phuctm97/pdfvector/commit/00608145bae35456c8a44e8a9cfd123b5f7ead82) Thanks [@phuctm97](https://github.com/phuctm97)! - Allow update instance to latest version
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
- [#36](https://github.com/phuctm97/pdfvector/pull/36) [`9abf80a`](https://github.com/phuctm97/pdfvector/commit/9abf80a9ecc353850e7552c8e1da6763074947d2) Thanks [@khanhduyvt0101](https://github.com/khanhduyvt0101)! - Add ask api
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
- [#40](https://github.com/phuctm97/pdfvector/pull/40) [`c23a769`](https://github.com/phuctm97/pdfvector/commit/c23a7691ae121894fbd65d4e57e21e9be3a91c3d) Thanks [@khanhduyvt0101](https://github.com/khanhduyvt0101)! - Add extract api
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
- [#38](https://github.com/phuctm97/pdfvector/pull/38) [`9171706`](https://github.com/phuctm97/pdfvector/commit/91717065aac7b23694c0be52fa571a80fe182c77) Thanks [@khanhduyvt0101](https://github.com/khanhduyvt0101)! - Add ask API and support many file types
|
|
53
|
+
|
|
54
|
+
## 0.0.2
|
|
55
|
+
### Patch Changes
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
- [#32](https://github.com/phuctm97/pdfvector/pull/32) [`bd59fe4`](https://github.com/phuctm97/pdfvector/commit/bd59fe4d72cf674805fad9fefa47285f3868e583) Thanks [@khanhduyvt0101](https://github.com/khanhduyvt0101)! - Update model for parse api, auth and orpc contract
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
- [#35](https://github.com/phuctm97/pdfvector/pull/35) [`ea0dce1`](https://github.com/phuctm97/pdfvector/commit/ea0dce17fe39ccb9d3829cefbd7fcf48d1093344) Thanks [@phuctm97](https://github.com/phuctm97)! - Rename PDFVECTOR_RELEASES env vars
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
- [#33](https://github.com/phuctm97/pdfvector/pull/33) [`83a3de1`](https://github.com/phuctm97/pdfvector/commit/83a3de151b17e9dcf6fb2881c8a0a00799c8f90a) Thanks [@khanhduyvt0101](https://github.com/khanhduyvt0101)! - Update to the new router folder structure
|
package/package.json
ADDED