@rpcbase/server 0.368.0 → 0.369.0-finalizeupload.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@rpcbase/server",
3
- "version": "0.368.0",
3
+ "version": "0.369.0-finalizeupload.1",
4
4
  "license": "SSPL-1.0",
5
5
  "main": "./index.js",
6
6
  "scripts": {
@@ -72,11 +72,11 @@
72
72
  "express": "4.19.2",
73
73
  "express-session": "1.18.0",
74
74
  "firebase-admin": "12.3.0",
75
- "glob": "11.0.0",
76
75
  "lodash": "4.17.21",
77
76
  "mkdirp": "3.0.1",
78
77
  "mongoose": "8.5.2",
79
78
  "openai": "4.56.0",
79
+ "pdf2pic": "3.1.1",
80
80
  "picocolors": "1.0.1",
81
81
  "postmark": "4.0.4",
82
82
  "redis": "4.7.0",
@@ -2,7 +2,7 @@
2
2
  const assert = require("assert")
3
3
  const fs = require("fs")
4
4
  const path = require("path")
5
- const glob = require("glob")
5
+ const glob = require("glob11")
6
6
  const Sentry = require("@sentry/node")
7
7
 
8
8
 
@@ -21,7 +21,6 @@ const has_sentry = !!SENTRY_DSN
21
21
  const is_development = NODE_ENV === "development"
22
22
 
23
23
 
24
- const src_path = path.join(process.cwd(), "./src/")
25
24
  const build_dir = path.join(process.cwd(), "build/")
26
25
  const client_build_dir = path.join(build_dir, "./client")
27
26
 
@@ -50,7 +49,7 @@ const serve_file = (req, res, full_path) => {
50
49
  headers: {
51
50
  "Cache-Control": `public, max-age=1800`, // 30 mins
52
51
  "Content-Type": content_type,
53
- }
52
+ },
54
53
  })
55
54
  }
56
55
 
@@ -59,7 +58,6 @@ const client_router = (app) => {
59
58
  const client_routes = get_client_routes()
60
59
 
61
60
 
62
-
63
61
  const index_file_path = path.join(client_build_dir, "./index.html")
64
62
  if (fs.existsSync(index_file_path)) {
65
63
  const index_file_buffer = fs.readFileSync(index_file_path)
@@ -79,7 +77,7 @@ const client_router = (app) => {
79
77
  } else {
80
78
  // TODO: should handle 404 here
81
79
  res.writeHead(200, {
82
- "Content-Type": "text/html"
80
+ "Content-Type": "text/html",
83
81
  })
84
82
  res.end(index_file_buffer)
85
83
  }
@@ -1,2 +1,9 @@
1
1
  export const UPLOAD_BUCKET_NAME = "file-uploads"
2
2
  export const TMP_UPLOADS_DIR = "rb-file-uploads"
3
+
4
/**
 * Options describing how an uploaded file should be post-processed.
 *
 * `finalize_file_upload` rejects `ocr` without `img_preview`, and
 * `text_vectors` without `ocr`, so the flags are expected to be enabled
 * in that order.
 */
export type ProcessFilePayload = {
  /** Hash of the upload; its metadata is looked up under `${hash}.0`. */
  hash: string
  /** Request page images for the file. */
  img_preview?: boolean
  /** Request OCR over the page images. */
  ocr?: boolean
  /** Request text vectors. */
  text_vectors?: boolean
}
@@ -1,18 +1,12 @@
1
1
  import Promise from "bluebird"
2
2
 
3
3
  import queue from "../../queue"
4
+ import {ProcessFilePayload} from "./constants"
4
5
 
5
6
 
6
- export type ProcessFilePayload = {
7
- hash: string;
8
- ocr?: boolean;
9
- vectors?: boolean;
10
- img_preview?: boolean;
11
- }
12
-
13
7
  const DEFAULTS = {
14
8
  ocr: false,
15
- vectors: false,
9
+ text_vectors: false,
16
10
  img_preview: false,
17
11
  }
18
12
 
@@ -1,25 +1,49 @@
1
- import {exec} from "child_process"
2
- import {promisify} from "util"
1
+ import {DOC_MIME_TYPES} from "./constants"
2
+ import {exec} from "./helpers/exec"
3
+ import {convert_pdf_to_png} from "./helpers/convert_pdf_to_png"
3
4
 
4
5
 
5
- const execAsync = promisify(exec)
6
/**
 * Wraps a value in single quotes so a POSIX shell passes it through as one
 * literal argument; embedded single quotes are closed, escaped and reopened.
 *
 * NOTE(review): assumes the command runs under a POSIX `sh` — confirm if
 * this ever needs to run on Windows.
 */
const shell_quote = (value: string): string => `'${value.replace(/'/g, "'\\''")}'`

/**
 * Converts a word-processor document to PDF with headless LibreOffice,
 * writing the result into `tmp_wd`.
 *
 * Rejects when the command fails; anything LibreOffice prints is forwarded
 * to the console.
 */
const convert_pdf = async(
  {tmp_wd, full_path, metadata}:
  { tmp_wd: string, full_path: string, metadata: { mime_type: string } }
): Promise<void> => {
  console.log("converting from:", metadata.mime_type, "to: pdf")

  // both paths are quoted: unquoted, whitespace or shell metacharacters in
  // either one would split or rewrite the command
  const cmd = [
    "libreoffice --headless --convert-to pdf:writer_pdf_Export",
    "--outdir",
    shell_quote(tmp_wd),
    shell_quote(full_path),
  ].join(" ")

  const result = await exec(cmd)

  if (result.stderr) {
    console.warn(result.stderr)
  }

  if (result.stdout) {
    console.log(result.stdout)
  }
}

/**
 * Renders page images for an uploaded file.
 *
 * - Types listed in `DOC_MIME_TYPES` are first converted to PDF.
 * - `application/pdf` files are rendered directly.
 * - Any other mime type is logged and left untouched.
 *
 * @param tmp_wd working directory of the job; images are written below it
 * @param full_path path of the downloaded file
 * @param metadata upload metadata; only `mime_type` is read here
 * @returns `is_pdf_converted` is true when a PDF was available and rendered
 */
export const apply_img_preview = async(
  {tmp_wd, full_path, metadata}:
  { tmp_wd: string, full_path: string, metadata: { mime_type: string } }
): Promise<{ is_pdf_converted: boolean }> => {

  let pdf_path: string | null = null

  // DOC
  if (DOC_MIME_TYPES.includes(metadata.mime_type)) {
    await convert_pdf({tmp_wd, full_path, metadata})
    // NOTE(review): keeps the existing assumption that LibreOffice's output
    // ends up at `${full_path}.pdf` (full_path inside tmp_wd, no extension)
    // — confirm against download_file
    pdf_path = `${full_path}.pdf`
  }
  // PDF
  else if (metadata.mime_type === "application/pdf") {
    // the download already is the pdf: render it as-is, there is no
    // `${full_path}.pdf` sibling to read
    pdf_path = full_path
  } else {
    console.log("apply img, unknown mime type, not proceeding")
  }

  const is_pdf_converted = pdf_path !== null

  if (pdf_path !== null) {
    await convert_pdf_to_png({tmp_wd, pdf_path})
  }

  // TODO: convert spreadsheets to csv ?
  // convert powerpoints to pdf also (they should work just the same as docs)

  return {
    is_pdf_converted,
  }
}
@@ -0,0 +1,23 @@
1
/**
 * Name of the sub-directory, inside a job's temporary working directory,
 * that holds the PNG pages rendered from a PDF.
 */
export const PDF_IMG_DIR = "img"

/**
 * Mime types of word-processor documents; `apply_img_preview` converts
 * these to PDF before rendering page images.
 */
export const DOC_MIME_TYPES: string[] = [
  // .docx
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  // .doc
  "application/msword",
  // .odt
  "application/vnd.oasis.opendocument.text",
  // .rtf
  "application/rtf",
]

/** Mime types treated as images. */
export const IMAGES_MIME_TYPES: string[] = [
  "image/jpeg",
  "image/png",
  "image/gif",
  "image/bmp",
  "image/tiff",
  "image/webp",
  "image/svg+xml",
  "image/vnd.microsoft.icon",
  "image/heif",
  "image/heic",
  "image/vnd.adobe.photoshop",
]
@@ -11,20 +11,6 @@ import {get_grid_fs_bucket} from "../../helpers/get_grid_fs_bucket"
11
11
 
12
12
  const DL_CHUNK_CONCURRENCY = 8
13
13
 
14
- const get_metadata = async(hash: string) => {
15
- const files_coll = mongoose.connection.db.collection(
16
- `${UPLOAD_BUCKET_NAME}.files`,
17
- )
18
-
19
- const first_chunk_name = `${hash}.0`
20
-
21
- const file_metadata = await files_coll.findOne({
22
- filename: first_chunk_name,
23
- })
24
-
25
- return file_metadata.metadata
26
- }
27
-
28
14
  const download_chunk =
29
15
  (bucket: mongoose.mongo.GridFSBucket, tmp_wd: string, metadata: any) =>
30
16
  (chunk_num: number) =>
@@ -93,11 +79,10 @@ const rebuild_file_from_chunks = async(
93
79
  }
94
80
 
95
81
  // downloads all chunks, reconstruct if necessary
96
- export const download_file = async(tmp_wd: string, hash: string): Promise<string> => {
82
+ export const download_file = async({tmp_wd, metadata}:
83
+ { tmp_wd: string, metadata: any }): Promise<string> => {
97
84
  const bucket = get_grid_fs_bucket(UPLOAD_BUCKET_NAME)
98
85
 
99
- const metadata = await get_metadata(hash)
100
-
101
86
  const chunks = Array.from({length: metadata.total_chunks}, (_, i) => i)
102
87
 
103
88
  // download all file chunks
@@ -0,0 +1,13 @@
1
+
2
+
3
/**
 * Placeholder for the text-vector step: it currently only logs that the
 * feature is not yet implemented and resolves without a value.
 *
 * @param _args working directory and upload metadata of the job (both unused for now)
 */
export const get_text_vectors = async(
  _args: { tmp_wd: string, metadata: any }
): Promise<void> => {
  console.log("text vectors NYI")
}
@@ -0,0 +1,34 @@
1
+ import path from "path"
2
+ import {mkdirp} from "mkdirp"
3
+ import {fromPath} from "pdf2pic"
4
+
5
+ import {PDF_IMG_DIR} from "../constants"
6
+
7
+
8
/**
 * Renders every page of a PDF to a PNG file below `tmp_wd`.
 *
 * Images are written to `<tmp_wd>/<PDF_IMG_DIR>/`, which is created when
 * missing, using "page" as the pdf2pic `saveFilename`.
 *
 * @param tmp_wd working directory of the job
 * @param pdf_path path of the PDF to render
 * @returns the per-page results reported by pdf2pic
 * @throws rejects when the output directory cannot be created or the
 *   conversion fails
 */
export const convert_pdf_to_png = async(
  {tmp_wd, pdf_path}:
  { tmp_wd: string, pdf_path: string }
) => {
  const out_dir = path.join(tmp_wd, `./${PDF_IMG_DIR}/`)

  await mkdirp(out_dir)

  const options = {
    density: 220,
    saveFilename: "page",
    savePath: out_dir,
    format: "png",
    // width
    // height
    preserveAspectRatio: true,
  }

  const converter = fromPath(pdf_path, options)

  // -1 asks pdf2pic for all pages. The promise is returned directly so a
  // failed conversion rejects instead of leaving the caller waiting forever.
  return converter.bulk(-1, {responseType: "image"})
}
@@ -0,0 +1,5 @@
1
+ import {exec as _exec} from "child_process"
2
+ import {promisify} from "util"
3
+
4
+
5
/**
 * Promise-based form of `child_process.exec`, produced with
 * `util.promisify`: runs a command in a shell and resolves with its
 * `stdout` / `stderr`, or rejects when the command fails.
 */
export const exec = promisify(_exec)
@@ -0,0 +1,18 @@
1
+ import mongoose from "../../../../../mongoose"
2
+
3
+ import {UPLOAD_BUCKET_NAME} from "../../../constants"
4
+
5
+
6
/**
 * Loads the metadata stored with an upload.
 *
 * The lookup targets the `<UPLOAD_BUCKET_NAME>.files` collection and reads
 * the `metadata` field of the entry whose filename is `<hash>.0`, i.e. the
 * upload's first chunk.
 *
 * @param hash hash identifying the upload
 * @returns the `metadata` field of the first chunk's file entry
 * @throws Error when no entry exists for the first chunk of `hash`
 */
export const get_metadata = async(hash: string) => {
  const files_coll = mongoose.connection.db.collection(
    `${UPLOAD_BUCKET_NAME}.files`,
  )

  const first_chunk_name = `${hash}.0`

  const file_metadata = await files_coll.findOne({
    filename: first_chunk_name,
  })

  // findOne resolves to null when nothing matches; fail with the hash in
  // the message rather than a TypeError on `null.metadata`
  if (!file_metadata) {
    throw new Error(
      `no upload found for hash "${hash}" (missing ${first_chunk_name} in ${UPLOAD_BUCKET_NAME}.files)`,
    )
  }

  return file_metadata.metadata
}
@@ -2,17 +2,22 @@ import path from "path"
2
2
  import os from "os"
3
3
  import {mkdirp} from "mkdirp"
4
4
 
5
+ import {TMP_UPLOADS_DIR} from "../../constants"
6
+
7
+ import {apply_img_preview} from "./apply_img_preview"
5
8
  import {download_file} from "./download_file"
9
+ import {run_ocr} from "./run_ocr"
10
+ import {get_metadata} from "./helpers/get_metadata"
11
+ import {get_text_vectors} from "./get_text_vectors"
6
12
 
7
- import {TMP_UPLOADS_DIR} from "../../constants"
13
+ import {ProcessFilePayload} from "../../constants"
8
14
 
9
15
 
10
16
  const {RB_TENANT_ID} = process.env
11
17
 
12
- export const finalize_file_upload = async(payload) => {
13
- console.log("TASK:FINALIZE FILE UPLOAD")
14
18
 
15
- const {hash, img_preview} = payload
19
+ export const finalize_file_upload = async(payload: ProcessFilePayload) => {
20
+ const {hash, img_preview, ocr, text_vectors} = payload
16
21
 
17
22
  const tmp_wd = path.join(
18
23
  os.tmpdir(),
@@ -22,11 +27,27 @@ export const finalize_file_upload = async(payload) => {
22
27
  await mkdirp(tmp_wd)
23
28
  console.log("created dir", tmp_wd)
24
29
 
25
- const full_path = await download_file(tmp_wd, hash)
30
+ const metadata = await get_metadata(hash)
26
31
 
27
- console.log("got full path", full_path)
32
+ const full_path = await download_file({tmp_wd, metadata})
28
33
 
29
34
  if (img_preview) {
30
- console.log("RUN img preview")
35
+ await apply_img_preview({tmp_wd, full_path, metadata})
36
+ }
37
+
38
+ if (ocr) {
39
+ if (!img_preview) {
40
+ throw new Error("img_preview cannot be false when ocr is set to true")
41
+ }
42
+
43
+ await run_ocr({tmp_wd, metadata})
44
+ }
45
+
46
+ if (text_vectors) {
47
+ if (!ocr) {
48
+ throw new Error("text_vectors requires ocr to be set to true")
49
+ }
50
+
51
+ await get_text_vectors({tmp_wd, metadata})
31
52
  }
32
53
  }
@@ -0,0 +1,42 @@
1
+ import path from "path"
2
+ import Promise from "bluebird"
3
+ import {glob} from "glob11"
4
+
5
+ import {exec} from "./helpers/exec"
6
+
7
+ import {PDF_IMG_DIR} from "./constants"
8
+
9
+
10
// Captures the page number from file names shaped like "page.<n>.png".
const PAGE_NUMBER_RE = /page\.(\d+)\.png/

/**
 * Lists the PNG page images under `<tmp_wd>/<PDF_IMG_DIR>/`, ordered by
 * page number.
 *
 * Ordering is numeric on the `<n>` of "page.<n>.png", so page 10 comes
 * after page 2; names without a page number sort as page 0.
 *
 * The return type is left to inference on purpose: this module imports
 * bluebird under the name `Promise`, and an async function may only be
 * annotated with the global Promise type.
 *
 * @param tmp_wd working directory of the job
 * @returns matching file paths in page order
 */
const get_input_files = async(tmp_wd: string) => {
  const input_glob = path.join(tmp_wd, `./${PDF_IMG_DIR}/*.png`)

  const input_files: string[] = await glob(input_glob)

  const page_number = (file_path: string): number =>
    parseInt(file_path.match(PAGE_NUMBER_RE)?.[1] || "0", 10)

  // sort a copy so the array handed back by glob is left untouched
  return [...input_files].sort((a, b) => page_number(a) - page_number(b))
}
25
+
26
+
27
/**
 * Runs Tesseract over every page image found for the job.
 *
 * Each image is processed from inside its own directory, with the image's
 * base name (without ".png") passed as Tesseract's output base. At most
 * four Tesseract processes run at a time.
 *
 * @param tmp_wd working directory of the job
 * @param metadata upload metadata (currently unused)
 */
export const run_ocr = async({tmp_wd, metadata}: { tmp_wd: string, metadata: any }) => {
  const page_images = await get_input_files(tmp_wd)

  await Promise.map(
    page_images,
    async(image_path: string) => {
      const image_dir = path.dirname(image_path)
      const stem = path.basename(image_path, ".png")

      // relative names are used because the command runs with cwd set to
      // the image's directory
      const result = await exec(
        `tesseract ${stem}.png ${stem} -l eng --oem 1 --psm 11`,
        {cwd: image_dir},
      )
      console.log("OUTTT", result)
    },
    {concurrency: 4},
  )
}