@rpcbase/server 0.367.0 → 0.369.0-finalizeupload.0
- package/package.json +2 -2
- package/src/client/client_router.js +4 -5
- package/src/files/constants.ts +9 -0
- package/src/files/finalize_file_upload.ts +2 -8
- package/src/files/helpers/get_grid_fs_bucket.ts +4 -2
- package/src/files/tasks/finalize_file_upload/apply_img_preview.ts +49 -0
- package/src/files/tasks/finalize_file_upload/constants.ts +23 -0
- package/src/files/tasks/finalize_file_upload/download_file.ts +98 -0
- package/src/files/tasks/finalize_file_upload/get_text_vectors.ts +13 -0
- package/src/files/tasks/finalize_file_upload/helpers/convert_pdf_to_png.ts +34 -0
- package/src/files/tasks/finalize_file_upload/helpers/exec.ts +5 -0
- package/src/files/tasks/finalize_file_upload/helpers/get_metadata.ts +18 -0
- package/src/files/tasks/finalize_file_upload/index.ts +53 -0
- package/src/files/tasks/finalize_file_upload/run_ocr.ts +42 -0
- package/src/files/upload_chunk.ts +3 -2
- package/src/helpers/sim_test_inject.ts +1 -1
- package/src/files/tasks/finalize_file_upload.ts +0 -5
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@rpcbase/server",
-  "version": "0.367.0",
+  "version": "0.369.0-finalizeupload.0",
   "license": "SSPL-1.0",
   "main": "./index.js",
   "scripts": {
@@ -72,11 +72,11 @@
     "express": "4.19.2",
     "express-session": "1.18.0",
     "firebase-admin": "12.3.0",
-    "glob": "11.0.0",
     "lodash": "4.17.21",
     "mkdirp": "3.0.1",
     "mongoose": "8.5.2",
     "openai": "4.56.0",
+    "pdf2pic": "3.1.1",
    "picocolors": "1.0.1",
     "postmark": "4.0.4",
     "redis": "4.7.0",

package/src/client/client_router.js
CHANGED

@@ -2,7 +2,7 @@
 const assert = require("assert")
 const fs = require("fs")
 const path = require("path")
-const glob = require("glob")
+const glob = require("glob11")
 const Sentry = require("@sentry/node")
 
 
@@ -21,7 +21,7 @@ const has_sentry = !!SENTRY_DSN
 const is_development = NODE_ENV === "development"
 
 
-const src_path = path.join(process.cwd(), "./src/")
+// const src_path = path.join(process.cwd(), "./src/")
 const build_dir = path.join(process.cwd(), "build/")
 const client_build_dir = path.join(build_dir, "./client")
 
@@ -50,7 +50,7 @@ const serve_file = (req, res, full_path) => {
     headers: {
       "Cache-Control": `public, max-age=1800`, // 30 mins
       "Content-Type": content_type,
-    }
+    },
   })
 }
 
@@ -59,7 +59,6 @@ const client_router = (app) => {
   const client_routes = get_client_routes()
 
 
-
   const index_file_path = path.join(client_build_dir, "./index.html")
   if (fs.existsSync(index_file_path)) {
    const index_file_buffer = fs.readFileSync(index_file_path)
@@ -79,7 +78,7 @@
   } else {
     // TODO: should handle 404 here
     res.writeHead(200, {
-      "Content-Type": "text/html"
+      "Content-Type": "text/html",
     })
     res.end(index_file_buffer)
   }
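
A note on the glob change: plain "glob" was removed from dependencies above, yet this file (and run_ocr.ts below) now imports "glob11". Presumably "glob11" is an npm alias for glob v11 declared in a part of package.json not visible in this diff; this is an assumption, not something the diff confirms:

// Assumption: "glob11" resolves via an npm alias such as
//   "glob11": "npm:glob@11.0.0"
// in a dependencies block not shown above.
const glob = require("glob11")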

package/src/files/finalize_file_upload.ts
CHANGED

@@ -1,18 +1,12 @@
 import Promise from "bluebird"
 
 import queue from "../../queue"
+import {ProcessFilePayload} from "./constants"
 
 
-export type ProcessFilePayload = {
-  hash: string;
-  ocr?: boolean;
-  vectors?: boolean;
-  img_preview?: boolean;
-}
-
 const DEFAULTS = {
   ocr: false,
-  vectors: false,
+  text_vectors: false,
   img_preview: false,
 }
 
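
The ProcessFilePayload type now lives in package/src/files/constants.ts (+9 -0 in the file list, not expanded in this diff). A sketch of its presumed shape, reconstructed from the removed inline type with `vectors` renamed to `text_vectors` to match the new DEFAULTS key — an inference, not the file's confirmed contents:

// Presumed contents of package/src/files/constants.ts (an assumption):
export type ProcessFilePayload = {
  hash: string;           // content hash identifying the upload
  ocr?: boolean;          // run tesseract over the rendered pages
  text_vectors?: boolean; // embed the OCR text (renamed from `vectors`)
  img_preview?: boolean;  // render the document to PNG pages
}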

package/src/files/helpers/get_grid_fs_bucket.ts
CHANGED

@@ -4,8 +4,10 @@ import mongoose from "../../../mongoose"
 
 const CHUNK_SIZE = 1024 * 1024
 
-
-
+export const get_grid_fs_bucket = (
+  bucket_name: string,
+  chunk_size: number = CHUNK_SIZE,
+) => {
   assert(chunk_size === CHUNK_SIZE, "chunk_size must match default CHUNK_SIZE")
 
   const {db} = mongoose.connection
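
The helper now takes the bucket name explicitly and asserts that any caller-supplied chunk size equals the 1 MiB default. A minimal usage sketch, mirroring the upload_chunk.ts call at the end of this diff (import paths as used from package/src/files/):

import {get_grid_fs_bucket} from "./helpers/get_grid_fs_bucket"
import {UPLOAD_BUCKET_NAME} from "./constants"

const bucket = get_grid_fs_bucket(UPLOAD_BUCKET_NAME)              // 1 MiB default
const same = get_grid_fs_bucket(UPLOAD_BUCKET_NAME, 1024 * 1024)   // passes the assert
// any other chunk_size throws: "chunk_size must match default CHUNK_SIZE"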

package/src/files/tasks/finalize_file_upload/apply_img_preview.ts
ADDED

@@ -0,0 +1,49 @@
+import {DOC_MIME_TYPES} from "./constants"
+import {exec} from "./helpers/exec"
+import {convert_pdf_to_png} from "./helpers/convert_pdf_to_png"
+
+
+const convert_pdf = async({tmp_wd, full_path, metadata}) => {
+  console.log("converting from:", metadata.mime_type, "to: pdf")
+  const cmd = `libreoffice --headless --convert-to pdf:writer_pdf_Export --outdir ${tmp_wd} ${full_path}`
+  const result = await exec(cmd)
+
+  if (result.stderr) {
+    console.warn(result.stderr)
+  }
+
+  if (result.stdout) {
+    console.log(result.stdout)
+  }
+}
+
+
+export const apply_img_preview = async({tmp_wd, full_path, metadata}): Promise<any> => {
+
+  let is_pdf_converted = false
+
+  // DOC
+  if (DOC_MIME_TYPES.includes(metadata.mime_type)) {
+    await convert_pdf({tmp_wd, full_path, metadata})
+    is_pdf_converted = true
+  }
+  // PDF
+  else if (metadata.mime_type === "application/pdf") {
+    // file is already a pdf, do nothing
+    is_pdf_converted = true
+  } else {
+    console.log("apply img, unknown mime type, not proceeding")
+  }
+
+  if (is_pdf_converted) {
+    const pdf_path = `${full_path}.pdf`
+    await convert_pdf_to_png({tmp_wd, pdf_path})
+  }
+
+  // TODO: convert spreadsheets to csv ?
+  // convert powerpoints to pdf also (they should work just the same as docs)
+
+  return {
+    is_pdf_converted,
+  }
+}
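
One observation on the branches above: full_path carries no extension (uploads are stored under their content hash), so LibreOffice writes its output to `${full_path}.pdf`, which matches the converted-document branch. For inputs that are already PDFs, however, no `${full_path}.pdf` is ever created, so convert_pdf_to_png would receive a path that may not exist — possibly a latent issue, or handled somewhere not shown here. A hypothetical helper (not part of the package) that would make the resolution explicit:

// Hypothetical: pick the PDF location for both branches, assuming
// hash-named files with no extension.
const resolve_pdf_path = (full_path: string, mime_type: string): string =>
  mime_type === "application/pdf"
    ? full_path          // the upload itself is the PDF
    : `${full_path}.pdf` // LibreOffice output: <basename>.pdf alongside it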

package/src/files/tasks/finalize_file_upload/constants.ts
ADDED

@@ -0,0 +1,23 @@
+export const PDF_IMG_DIR = "img"
+
+export const DOC_MIME_TYPES = [
+  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+  "application/msword",
+  "application/vnd.oasis.opendocument.text",
+  "application/rtf",
+]
+
+
+export const IMAGES_MIME_TYPES = [
+  "image/jpeg",
+  "image/png",
+  "image/gif",
+  "image/bmp",
+  "image/tiff",
+  "image/webp",
+  "image/svg+xml",
+  "image/vnd.microsoft.icon",
+  "image/heif",
+  "image/heic",
+  "image/vnd.adobe.photoshop",
+]

package/src/files/tasks/finalize_file_upload/download_file.ts
ADDED

@@ -0,0 +1,98 @@
+import fs from "fs/promises"
+import {createReadStream, createWriteStream} from "fs"
+import path from "path"
+import {createBrotliDecompress} from "zlib"
+import Promise from "bluebird"
+
+import mongoose from "../../../../mongoose"
+import {UPLOAD_BUCKET_NAME} from "../../constants"
+import {get_grid_fs_bucket} from "../../helpers/get_grid_fs_bucket"
+
+
+const DL_CHUNK_CONCURRENCY = 8
+
+const download_chunk =
+  (bucket: mongoose.mongo.GridFSBucket, tmp_wd: string, metadata: any) =>
+    (chunk_num: number) =>
+      new Promise((resolve, reject) => {
+        const filename = `${metadata.hash}.${chunk_num}`
+
+        const dl_stream = bucket.openDownloadStreamByName(filename)
+
+        const file_path = path.join(tmp_wd, `./${filename}`)
+
+        const write_stream = createWriteStream(file_path)
+
+        if (metadata.is_compressed) {
+          const decompress_stream = createBrotliDecompress()
+          dl_stream.pipe(decompress_stream).pipe(write_stream)
+        } else {
+          dl_stream.pipe(write_stream)
+        }
+
+        dl_stream.on("error", (error) => {
+          console.error("Error downloading file:", error)
+          reject(error)
+        })
+
+        write_stream.on("error", (error) => {
+          console.error("Error writing file:", error)
+          reject(error)
+        })
+
+        write_stream.on("finish", () => {
+          console.log("File downloaded and written successfully", file_path)
+          resolve()
+        })
+      })
+
+
+const rebuild_file_from_chunks = async(
+  tmp_wd: string,
+  metadata: any,
+  chunks: Array<number>,
+) => {
+  const full_file_path = path.join(tmp_wd, `./${metadata.hash}`)
+  const output_stream = createWriteStream(full_file_path)
+
+  const append_chunk = (chunk_num: number) => new Promise<void>((resolve, reject) => {
+    const chunk_path = path.join(tmp_wd, `./${metadata.hash}.${chunk_num}`)
+
+    const chunk_read_stream = createReadStream(chunk_path)
+
+    chunk_read_stream.pipe(output_stream, {end: false})
+
+    chunk_read_stream.on("end", async() => {
+      await fs.rm(chunk_path)
+      resolve()
+    })
+    chunk_read_stream.on("error", reject)
+  })
+
+  for (const chunk_num of chunks) {
+    await append_chunk(chunk_num)
+  }
+
+  output_stream.end()
+
+  return full_file_path
+}
+
+// downloads all chunks, reconstruct if necessary
+export const download_file = async({tmp_wd, metadata}:
+  { tmp_wd: string, metadata: any }): Promise<string> => {
+  const bucket = get_grid_fs_bucket(UPLOAD_BUCKET_NAME)
+
+  const chunks = Array.from({length: metadata.total_chunks}, (_, i) => i)
+
+  // download all file chunks
+  await Promise.map(
+    chunks,
+    download_chunk(bucket, tmp_wd, metadata),
+    {concurrency: DL_CHUNK_CONCURRENCY},
+  )
+
+  const full_path = await rebuild_file_from_chunks(tmp_wd, metadata, chunks)
+
+  return full_path
+}
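
Chunks download with concurrency 8, then are appended strictly in order (the for-await loop with {end: false} keeps the output stream open across pages). A usage sketch with an illustrative metadata object — field values here are assumptions; the real metadata comes from get_metadata below:

const full_path = await download_file({
  tmp_wd: "/tmp/rb_uploads/tenant_a/abc123",  // hypothetical working dir
  metadata: {hash: "abc123", total_chunks: 4, is_compressed: true},
})
// full_path would be "/tmp/rb_uploads/tenant_a/abc123/abc123"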

package/src/files/tasks/finalize_file_upload/helpers/convert_pdf_to_png.ts
ADDED

@@ -0,0 +1,34 @@
+import path from "path"
+import {mkdirp} from "mkdirp"
+import {fromPath} from "pdf2pic"
+
+import {PDF_IMG_DIR} from "../constants"
+
+
+export const convert_pdf_to_png = async(
+  {tmp_wd, pdf_path}:
+  { tmp_wd: string, pdf_path: string }
+) =>
+  new Promise(async(resolve, reject) => {
+    const out_dir = path.join(tmp_wd, `./${PDF_IMG_DIR}/`)
+
+    await mkdirp(out_dir)
+
+    const options = {
+      density: 220,
+      saveFilename: "page",
+      savePath: out_dir,
+      format: "png",
+      // width
+      // height
+      preserveAspectRatio: true,
+    }
+
+    const converter = fromPath(pdf_path, options)
+
+    converter.bulk(-1, {responseType: "image"})
+      .then((pages) => {
+        resolve(pages)
+      })
+
+  })
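
The async executor above never calls reject, so failures from mkdirp or bulk() are swallowed rather than propagated to the caller. An equivalent shape without that pattern — a sketch under the same imports, not the package's code:

export const convert_pdf_to_png_strict = async(
  {tmp_wd, pdf_path}: { tmp_wd: string, pdf_path: string },
) => {
  const out_dir = path.join(tmp_wd, `./${PDF_IMG_DIR}/`)
  await mkdirp(out_dir)

  const converter = fromPath(pdf_path, {
    density: 220, saveFilename: "page", savePath: out_dir,
    format: "png", preserveAspectRatio: true,
  })

  // rejections from bulk() now surface to the caller
  return converter.bulk(-1, {responseType: "image"})
}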

package/src/files/tasks/finalize_file_upload/helpers/get_metadata.ts
ADDED

@@ -0,0 +1,18 @@
+import mongoose from "../../../../../mongoose"
+
+import {UPLOAD_BUCKET_NAME} from "../../../constants"
+
+
+export const get_metadata = async(hash: string) => {
+  const files_coll = mongoose.connection.db.collection(
+    `${UPLOAD_BUCKET_NAME}.files`,
+  )
+
+  const first_chunk_name = `${hash}.0`
+
+  const file_metadata = await files_coll.findOne({
+    filename: first_chunk_name,
+  })
+
+  return file_metadata.metadata
+}
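
Metadata is read off the first chunk's GridFS document. The fields the pipeline reads elsewhere in this diff (hash, mime_type, total_chunks, chunk_size, is_compressed, chunk_index) suggest a shape roughly like the following — an inference, not a type the package defines:

type UploadMetadata = {
  hash: string           // content hash; chunks are named `${hash}.${n}`
  mime_type: string      // drives the convert/OCR branches
  total_chunks: number   // how many GridFS files to download
  chunk_size: number     // must equal the 1 MiB CHUNK_SIZE default
  chunk_index: number    // set per request in upload_chunk.ts
  is_compressed: boolean // whether chunks were Brotli-compressed
}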

package/src/files/tasks/finalize_file_upload/index.ts
ADDED

@@ -0,0 +1,53 @@
+import path from "path"
+import os from "os"
+import {mkdirp} from "mkdirp"
+
+import {TMP_UPLOADS_DIR} from "../../constants"
+
+import {apply_img_preview} from "./apply_img_preview"
+import {download_file} from "./download_file"
+import {run_ocr} from "./run_ocr"
+import {get_metadata} from "./helpers/get_metadata"
+import {get_text_vectors} from "./get_text_vectors"
+
+import {ProcessFilePayload} from "../../constants"
+
+
+const {RB_TENANT_ID} = process.env
+
+
+export const finalize_file_upload = async(payload: ProcessFilePayload) => {
+  const {hash, img_preview, ocr, text_vectors} = payload
+
+  const tmp_wd = path.join(
+    os.tmpdir(),
+    `./${TMP_UPLOADS_DIR}/${RB_TENANT_ID}/${hash}`,
+  )
+
+  await mkdirp(tmp_wd)
+  console.log("created dir", tmp_wd)
+
+  const metadata = await get_metadata(hash)
+
+  const full_path = await download_file({tmp_wd, metadata})
+
+  if (img_preview) {
+    await apply_img_preview({tmp_wd, full_path, metadata})
+  }
+
+  if (ocr) {
+    if (!img_preview) {
+      throw new Error("img_preview cannot be false when ocr is set to true")
+    }
+
+    await run_ocr({tmp_wd, metadata})
+  }
+
+  if (text_vectors) {
+    if (!ocr) {
+      throw new Error("text_vectors requires ocr to be set to true")
+    }
+
+    await get_text_vectors({tmp_wd, metadata})
+  }
+}
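
Given the flag dependencies enforced above (ocr requires img_preview, and text_vectors requires ocr), a hypothetical full-pipeline invocation looks like this; the hash value is illustrative:

await finalize_file_upload({
  hash: "abc123",      // illustrative content hash
  img_preview: true,   // render pages to PNG
  ocr: true,           // requires img_preview
  text_vectors: true,  // requires ocr
})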

package/src/files/tasks/finalize_file_upload/run_ocr.ts
ADDED

@@ -0,0 +1,42 @@
+import path from "path"
+import Promise from "bluebird"
+import {glob} from "glob11"
+
+import {exec} from "./helpers/exec"
+
+import {PDF_IMG_DIR} from "./constants"
+
+
+const get_input_files = async(tmp_wd: string): Promise<Array<string>> => {
+  const input_glob = path.join(tmp_wd, `./${PDF_IMG_DIR}/*.png`)
+
+  const input_files = await glob(input_glob)
+
+  const sorted = input_files.sort((a, b) => {
+    // Extract the page numbers from the file names
+    const page_a = parseInt(a.match(/page\.(\d+)\.png/)?.[1] || "0", 10)
+    const page_b = parseInt(b.match(/page\.(\d+)\.png/)?.[1] || "0", 10)
+
+    return page_a - page_b
+  })
+
+  return sorted
+}
+
+
+export const run_ocr = async({tmp_wd, metadata}: { tmp_wd: string, metadata: any }) => {
+
+  const input_files = await get_input_files(tmp_wd)
+
+  const run_ocr_file = async(file_path: string) => {
+    const wd = path.dirname(file_path)
+    const basename = path.basename(file_path, ".png")
+
+    const cmd = `tesseract ${basename}.png ${basename} -l eng --oem 1 --psm 11`
+    const out = await exec(cmd, {cwd: wd})
+    console.log("OUTTT", out)
+  }
+
+  await Promise.map(input_files, run_ocr_file, {concurrency: 4})
+
+}
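
tesseract with an output base of `basename` writes `basename.txt` next to each input, so after run_ocr the img/ directory holds page.N.png and page.N.txt pairs in page order. A sketch of how a later step (presumably get_text_vectors, whose diff is not expanded here) could collect the text:

import fs from "fs/promises"

// Hypothetical: read the OCR text for the pages found by get_input_files.
const read_page_texts = async(png_paths: Array<string>): Promise<Array<string>> =>
  Promise.all(png_paths.map((p) => fs.readFile(p.replace(/\.png$/, ".txt"), "utf8")))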

package/src/files/upload_chunk.ts
CHANGED

@@ -3,11 +3,12 @@ import fs from "fs"
 import {formidable, File} from "formidable"
 
 import {get_grid_fs_bucket} from "./helpers/get_grid_fs_bucket"
-import {
+import {sim_test_inject} from "../helpers/sim_test_inject"
+import {UPLOAD_BUCKET_NAME} from "./constants"
 
 
 const upload_file_to_bucket = async(file: File, metadata): Promise<void> => {
-  const bucket = get_grid_fs_bucket(
+  const bucket = get_grid_fs_bucket(UPLOAD_BUCKET_NAME, metadata.chunk_size)
 
   const chunk_filename = `${metadata.hash}.${metadata.chunk_index}`
 