@mdgf11/filesystem-lib 2.0.7 → 2.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ import { PartialJurisprudenciaDocument } from "@stjiris/jurisprudencia-document";
2
+ import { ContentType, Date_Area_Section, FilesystemDocument, Retrievable_Metadata, Sharepoint_Metadata } from "./types.js";
3
/**
 * Writes the document's metadata JSON and its original content files to disk.
 * Does nothing when the document has no `content`.
 */
export declare function writeFilesystemDocument(filesystem_document: FilesystemDocument): void;

/**
 * Reads a metadata JSON file and rebuilds a `FilesystemDocument` from it.
 * Dates are revived into `Date` objects; content entries keep their extension
 * but carry an empty buffer.
 */
export declare function loadFilesystemDocument(jsonPath: string): FilesystemDocument;

/**
 * Builds a jurisprudência document from the extracted text of `contents` and
 * the supplied metadata. Rejects when `retrievable_Metadata` is missing or when
 * no content entry has a supported extension.
 */
export declare function createJurisprudenciaDocument(
    retrievable_Metadata: Retrievable_Metadata,
    contents: ContentType[],
    date_area_section: Date_Area_Section,
    sharepoint_metadata?: Sharepoint_Metadata,
): Promise<PartialJurisprudenciaDocument>;

/**
 * Resolves to `true` when one of the first three pages of the PDF contains
 * non-blank text, and to `false` otherwise (including when the PDF cannot be read).
 */
export declare function hasSelectableText(buffer: Buffer): Promise<boolean>;

/**
 * Builds the relative storage path `/<area>/<year>/<month>/<day>/<process number>`.
 */
export declare function generateFilePath(
    date_area_section: Date_Area_Section,
    retrievable_metadata: Retrievable_Metadata,
): string;
@@ -0,0 +1,184 @@
1
+ import { calculateHASH, calculateUUID } from "@stjiris/jurisprudencia-document";
2
+ import fs from "fs";
3
+ import mammoth from "mammoth";
4
+ import { DETAILS_NAME, FILESYSTEM_PATH, ORIGINAL_NAME, ROOT_PATH, SHAREPOINT_COPY_PATH } from "./types.js";
5
+ import { DescritorOficial } from "./descritores.js";
6
+ import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
7
/**
 * Persists a filesystem document to disk.
 *
 * When the document has a `file_path`, the metadata JSON and every original
 * content file are written under `ROOT_PATH + FILESYSTEM_PATH + file_path`, and
 * (if SharePoint metadata is present) a metadata-only copy is written under the
 * SharePoint mirror directory. Without a `file_path`, both the metadata and the
 * content files go to the SharePoint mirror directory instead.
 *
 * Documents without `content` are ignored.
 *
 * @param filesystem_document Document to write.
 */
export function writeFilesystemDocument(filesystem_document) {
    const { content, file_path, sharepoint_metadata } = filesystem_document;
    if (!content) {
        return;
    }

    // The metadata JSON never embeds the binary payloads, only their extensions.
    const safe = {
        ...filesystem_document,
        content: content.map(({ extension }) => ({ extension })),
    };

    // Creates `dir` (and parents) and writes the metadata JSON inside it.
    const writeDetails = (dir) => {
        fs.mkdirSync(dir, { recursive: true });
        fs.writeFileSync(`${dir}/${DETAILS_NAME}.json`, JSON.stringify(safe, null, 2), { encoding: "utf-8" });
    };

    // Writes one `Original.<extension>` file per content entry into `dir`.
    const writeOriginals = (dir) => {
        for (const { extension, data } of content) {
            fs.writeFileSync(`${dir}/${ORIGINAL_NAME}.${extension}`, data, { encoding: "utf-8" });
        }
    };

    const sharepointDir = sharepoint_metadata
        ? `${ROOT_PATH}${SHAREPOINT_COPY_PATH}${sharepoint_metadata.sharepoint_path_rel}`
        : undefined;

    if (file_path) {
        const filesystemDir = `${ROOT_PATH}${FILESYSTEM_PATH}${file_path}`;
        writeDetails(filesystemDir);
        writeOriginals(filesystemDir);
        // The SharePoint mirror only receives a copy of the metadata in this case.
        if (sharepoint_metadata) {
            writeDetails(sharepointDir);
        }
        return;
    }

    if (sharepoint_metadata) {
        writeDetails(sharepointDir);
        writeOriginals(sharepointDir);
    }
}
46
/**
 * Loads a document's metadata JSON from disk.
 *
 * `creation_date` and `last_update_date` are converted to `Date` objects. Each
 * stored content entry is restored with its extension and an empty buffer,
 * since the binary payload is not part of the metadata file.
 *
 * @param jsonPath Path of the metadata JSON file.
 * @returns The reconstructed document.
 */
export function loadFilesystemDocument(jsonPath) {
    const stored = JSON.parse(fs.readFileSync(jsonPath, 'utf-8'));
    const document = { ...stored };
    document.creation_date = new Date(stored.creation_date);
    document.last_update_date = new Date(stored.last_update_date);
    document.content = stored.content?.map(({ extension }) => ({
        extension,
        data: Buffer.from([]),
    }));
    return document;
}
59
/**
 * Builds a jurisprudência document from raw content and its metadata.
 *
 * The text lines are extracted from the first supported content entry, wrapped
 * into HTML paragraphs and stored both in the `Original` record and in `Texto`.
 * Optional metadata (descriptors, area, section, judge, procedural means,
 * decision) is only added when it is non-empty. `HASH` and `UUID` are computed
 * last, from the assembled document.
 *
 * @param retrievable_Metadata Process number, judge, decision, descriptors, etc.
 * @param contents Candidate content files; see `extractContent`.
 * @param date_area_section File date, area and section.
 * @param sharepoint_metadata Optional; provides the document URL when present.
 * @returns The assembled document.
 * @throws Error (as a rejected promise) when `retrievable_Metadata` is missing.
 */
export async function createJurisprudenciaDocument(retrievable_Metadata, contents, date_area_section, sharepoint_metadata) {
    if (!retrievable_Metadata) {
        throw new Error("Missing metadata.");
    }
    const lines = await extractContent(contents);

    // Wraps a single value into the Index/Original/Show record used by the document.
    const asField = (value) => ({ Index: [value], Original: [value], Show: [value] });
    const isFilled = (value) => Boolean(value && value.length > 0);

    // NOTE(review): lines are interpolated into HTML without escaping — confirm this is intended.
    const htmlText = lines.map(line => `<p><font>${line}</font><br>`).join('');
    const url = sharepoint_metadata ? sharepoint_metadata.sharepoint_url : "";
    const processNumber = retrievable_Metadata.process_number;
    const formattedDate = Intl.DateTimeFormat("pt-PT").format(date_area_section.file_date);
    const source = "STJ (Sharepoint)";

    const original = {
        "Decisão Texto Integral": htmlText,
        "Data": formattedDate,
        "Número de Processo": processNumber,
        "Fonte": source,
        "URL": url,
        "Jurisprudência": "Simples",
    };

    const document = {
        "Original": original,
        "CONTENT": lines,
        "Data": formattedDate,
        "Número de Processo": processNumber,
        "Fonte": source,
        "URL": url,
        "Jurisprudência": asField("Simples"),
        "STATE": "importação",
        "Sumário": "",
        "Texto": htmlText,
    };

    const { descriptors, judge, process_mean, decision } = retrievable_Metadata;
    if (isFilled(descriptors)) {
        // Index and Show hold the DescritorOficial lookup of each raw descriptor.
        const toOfficial = () => descriptors.map(desc => DescritorOficial[desc]);
        document["Descritores"] = {
            Index: toOfficial(),
            Original: descriptors,
            Show: toOfficial(),
        };
    }
    if (isFilled(date_area_section.area)) {
        document["Área"] = asField(date_area_section.area);
    }
    if (isFilled(date_area_section.section)) {
        document["Secção"] = asField(date_area_section.section);
    }
    if (isFilled(judge)) {
        document["Relator Nome Profissional"] = asField(judge);
    }
    if (isFilled(process_mean)) {
        document["Meio Processual"] = asField(process_mean);
    }
    if (isFilled(decision)) {
        document["Decisão"] = asField(decision);
    }

    document["HASH"] = calculateHASH({
        ...document,
        Original: document["Original"],
        "Número de Processo": document["Número de Processo"] || "",
        Sumário: document["Sumário"] || "",
        Texto: document["Texto"] || "",
    });
    document["UUID"] = calculateUUID(document["HASH"]);
    return document;
}
120
/**
 * Checks whether a PDF contains selectable (non-blank) text.
 *
 * Only the first three pages (or fewer, for shorter documents) are inspected.
 * Any failure while reading the PDF is logged and reported as `false`.
 * The PDF.js loading task is always destroyed afterwards so the parsed
 * document and its worker resources are released.
 *
 * @param buffer Raw PDF bytes.
 * @returns `true` when text is found on an inspected page, otherwise `false`.
 */
export async function hasSelectableText(buffer) {
    let loadingTask;
    try {
        const uint8Array = new Uint8Array(buffer);
        loadingTask = getDocument({
            data: uint8Array,
            standardFontDataUrl: 'node_modules/pdfjs-dist/standard_fonts/',
        });
        const pdf = await loadingTask.promise;
        const pagesToCheck = Math.min(3, pdf.numPages);
        for (let i = 1; i <= pagesToCheck; i++) {
            const page = await pdf.getPage(i);
            const textContent = await page.getTextContent();
            // Items without a `str` (marked-content entries) never count as text.
            if (textContent.items.some((item) => item.str?.trim().length > 0)) {
                return true;
            }
        }
        return false;
    }
    catch (error) {
        console.error('Error reading PDF:', error);
        return false;
    }
    finally {
        if (loadingTask) {
            try {
                await loadingTask.destroy();
            }
            catch {
                // Best-effort cleanup: a failed destroy must not change the result.
            }
        }
    }
}
143
/**
 * Builds the relative storage path of a document:
 * `/<area>/<year>/<month>/<day>/<process number>`.
 *
 * Month and day are taken from the local-time fields of `file_date` and are
 * not zero-padded. Every `/` in the process number is replaced by `-` so the
 * process number always maps to a single path segment.
 *
 * @param date_area_section Provides `area` and `file_date`.
 * @param retrievable_metadata Provides `process_number`.
 * @returns The relative path, starting with `/`.
 */
export function generateFilePath(date_area_section, retrievable_metadata) {
    const { area, file_date } = date_area_section;
    // A string pattern would only replace the first slash; the global regex replaces all.
    const processSegment = retrievable_metadata.process_number.replace(/\//g, "-");
    return `/${area}/${file_date.getFullYear()}/${file_date.getMonth() + 1}/${file_date.getDate()}/${processSegment}`;
}
146
/**
 * Extracts the non-blank text lines from the first content entry whose
 * extension is supported ("txt", "pdf" or "docx"). Entries with any other
 * extension are skipped.
 *
 * @param contents Content entries, inspected in order.
 * @returns The text lines of the first supported entry.
 * @throws Error when no entry has a supported extension.
 */
async function extractContent(contents) {
    for (const { extension, data } of contents) {
        switch (extension) {
            case "txt":
                return data
                    .toString('utf-8')
                    .split(/\r?\n/)
                    .filter(line => line.trim().length > 0);
            case "pdf":
                return await pdfToLines(data);
            case "docx":
                return await docxToLines(data);
            default:
                continue;
        }
    }
    throw new Error("Contents are not a supported format.");
}
160
/**
 * Extracts the non-blank text lines of every page of a PDF.
 *
 * The text items of each page are joined with newlines and split again, so an
 * item containing line breaks contributes several lines. The PDF.js loading
 * task is destroyed once extraction finishes (or fails) to release the parsed
 * document and its worker resources.
 *
 * @param buffer Raw PDF bytes.
 * @returns All non-blank lines, in page order.
 */
async function pdfToLines(buffer) {
    const uint8Array = new Uint8Array(buffer);
    const loadingTask = getDocument({ data: uint8Array, verbosity: 0 });
    try {
        const pdf = await loadingTask.promise;
        const allLines = [];
        for (let i = 1; i <= pdf.numPages; i++) {
            const page = await pdf.getPage(i);
            const textContent = await page.getTextContent();
            const pageText = textContent.items
                .map((item) => item.str)
                .join('\n');
            // Push one by one: spreading a very large page into push() can exceed the argument limit.
            for (const line of pageText.split('\n')) {
                if (line.trim().length > 0) {
                    allLines.push(line);
                }
            }
        }
        return allLines;
    }
    finally {
        try {
            await loadingTask.destroy();
        }
        catch {
            // Best-effort cleanup: never mask the extraction result or its error.
        }
    }
}
176
/**
 * Extracts the raw text of a DOCX file and returns its trimmed, non-empty lines.
 *
 * @param buffer Raw DOCX bytes.
 * @returns Trimmed lines, in document order.
 */
async function docxToLines(buffer) {
    const { value } = await mammoth.extractRawText({ buffer });
    const lines = [];
    for (const rawLine of (value || "").split(/\r?\n/)) {
        const trimmed = rawLine.trim();
        if (trimmed) {
            lines.push(trimmed);
        }
    }
    return lines;
}
@@ -0,0 +1,5 @@
1
+ import { FilesystemDocument, FilesystemUpdate } from "./types.js";
2
/**
 * Records the document's `file_path` in `update.created` and increments
 * `update.created_num`. Throws when the document has no `file_path`.
 */
export declare function addFileToUpdate(update: FilesystemUpdate, filesystem_document: FilesystemDocument): void;

/**
 * Stamps `update.date_end` with the current time, removes previous log files
 * from the update directory and writes the update as JSON, both there and in
 * its `All` subdirectory.
 */
export declare function writeFilesystemUpdate(update: FilesystemUpdate): void;

/** Appends `err` to `update.file_errors`. */
export declare function logDocumentProcessingError(update: FilesystemUpdate, err: string): void;

/**
 * Returns the update stored in the first log file found in the update
 * directory, or a fresh empty update when there is none.
 */
export declare function loadLastFilesystemUpdate(): FilesystemUpdate;
@@ -0,0 +1,65 @@
1
+ import path from "path";
2
+ import fs from "fs";
3
+ import { UPDATE_DIR } from "./types.js";
4
/**
 * Registers a newly created document in an update record.
 *
 * `update.created` and `update.created_num` are initialised on first use; the
 * document's `file_path` is then appended and the counter incremented.
 *
 * @param update Update record, mutated in place.
 * @param filesystem_document Document that was created.
 * @throws Error when the document has no `file_path`.
 */
export function addFileToUpdate(update, filesystem_document) {
    const { file_path } = filesystem_document;
    if (!file_path) {
        throw new Error("File to be added to update doesn't have a system path.");
    }
    const createdPaths = update.created || (update.created = []);
    update.created_num = (update.created_num || 0) + 1;
    createdPaths.push(file_path);
}
17
/**
 * Finalises an update record and writes it to disk.
 *
 * `update.date_end` is set to the current time. Existing log files directly
 * inside `UPDATE_DIR` are deleted, then the update is written as
 * `log_<timestamp>.json` both in `UPDATE_DIR/All` and in `UPDATE_DIR` itself.
 *
 * @param update Update record; its `date_end` is mutated.
 */
export function writeFilesystemUpdate(update) {
    update.date_end = new Date();
    const logFileName = `log_${formatUpdateDate(update.date_end)}.json`;
    const archiveDir = `${UPDATE_DIR}/All`;

    fs.mkdirSync(UPDATE_DIR, { recursive: true });
    fs.mkdirSync(archiveDir, { recursive: true });

    // Only the top-level directory is cleaned; files under "All" are left in place.
    removeOldUpdate(UPDATE_DIR);

    const serialized = JSON.stringify(update, null, 2);
    fs.writeFileSync(`${archiveDir}/${logFileName}`, serialized, { encoding: "utf-8" });
    fs.writeFileSync(`${UPDATE_DIR}/${logFileName}`, serialized, { encoding: "utf-8" });
}
28
/**
 * Appends a processing error message to an update record.
 *
 * `update.file_errors` is created on first use, so update records that lack
 * the field (for example ones returned unvalidated from `JSON.parse`) do not
 * cause a `TypeError`.
 *
 * @param update Update record, mutated in place.
 * @param err Error message to record.
 */
export function logDocumentProcessingError(update, err) {
    if (!update.file_errors) {
        update.file_errors = [];
    }
    update.file_errors.push(err);
}
31
/**
 * Loads the previously written update record.
 *
 * The first regular file in `UPDATE_DIR` whose name contains "log"
 * (case-insensitive) is parsed and returned. `date_start` and `date_end` are
 * revived from their serialized string form into `Date` objects so the result
 * matches the shape of a freshly created update. When the directory does not
 * exist or holds no log file, a new empty update starting now is returned.
 *
 * @returns The stored update, or an empty one.
 */
export function loadLastFilesystemUpdate() {
    const empty_update = {
        updateSource: "STJ (Sharepoint)",
        file_errors: [],
        date_start: new Date()
    };
    if (!fs.existsSync(UPDATE_DIR))
        return empty_update;
    const files = fs.readdirSync(UPDATE_DIR);
    for (const file of files) {
        const fullPath = path.join(UPDATE_DIR, file);
        if (fs.statSync(fullPath).isFile() && file.toLowerCase().includes("log")) {
            const jsonString = fs.readFileSync(fullPath, 'utf-8');
            const parsed = JSON.parse(jsonString);
            if (parsed && typeof parsed === "object") {
                // JSON has no date type: dates are stored as strings and must be revived.
                for (const dateKey of ["date_start", "date_end"]) {
                    if (typeof parsed[dateKey] === "string") {
                        parsed[dateKey] = new Date(parsed[dateKey]);
                    }
                }
            }
            return parsed;
        }
    }
    return empty_update;
}
50
/**
 * Formats a date as `YYYY-MM-DD_HH-mm-ss` using its local-time fields.
 *
 * @param d Date to format; defaults to the current time.
 * @returns The formatted timestamp.
 */
function formatUpdateDate(d = new Date()) {
    const twoDigits = (n) => String(n).padStart(2, "0");
    const datePart = [d.getFullYear(), twoDigits(d.getMonth() + 1), twoDigits(d.getDate())].join("-");
    const timePart = [d.getHours(), d.getMinutes(), d.getSeconds()].map((n) => twoDigits(n)).join("-");
    return `${datePart}_${timePart}`;
}
54
/**
 * Deletes every regular file directly inside `folderPath` whose name contains
 * "log" (case-insensitive) and logs each deletion. Subdirectories are left
 * untouched. Does nothing when the folder does not exist.
 *
 * @param folderPath Directory to clean.
 */
function removeOldUpdate(folderPath) {
    if (!fs.existsSync(folderPath)) {
        return;
    }
    for (const entryName of fs.readdirSync(folderPath)) {
        const entryPath = path.join(folderPath, entryName);
        const isLogFile = fs.statSync(entryPath).isFile() && entryName.toLowerCase().includes("log");
        if (!isLogFile) {
            continue;
        }
        fs.unlinkSync(entryPath);
        console.log(`Deleted: ${entryPath} `);
    }
}
@@ -0,0 +1,3 @@
1
+ export * from "./filesystemDocumentMethods.js";
2
+ export * from "./filesystemUpdateMethods.js";
3
+ export * from "./types.js";
package/dist/index.js ADDED
@@ -0,0 +1,3 @@
1
+ export * from "./filesystemDocumentMethods.js";
2
+ export * from "./filesystemUpdateMethods.js";
3
+ export * from "./types.js";
@@ -0,0 +1,62 @@
1
/** Names of the sources an update can originate from. */
export declare const UpdateSources: readonly ["STJ (Sharepoint)", "Juris"];

/** Union of the names listed in `UpdateSources`. */
export type SupportedUpdateSources = (typeof UpdateSources)[number];
3
+ import { PartialJurisprudenciaDocument } from '@stjiris/jurisprudencia-document';
4
/** Root directory for everything the library writes. */
export declare const ROOT_PATH: string;
/** Subdirectory of `ROOT_PATH` that holds documents stored by `file_path`. */
export declare const FILESYSTEM_PATH = "/FileSystem";
/** Subdirectory of `ROOT_PATH` that holds files stored by their SharePoint relative path. */
export declare const SHAREPOINT_COPY_PATH = "/Sharepoint";
/** Base name (without extension) of a document's metadata JSON file. */
export declare const DETAILS_NAME = "Detalhes";
/** Base name (without extension) of a document's original content files. */
export declare const ORIGINAL_NAME = "Original";
/** Subdirectory of `ROOT_PATH` that holds update logs. */
export declare const LOGS_PATH = "/Updates";
/** Directory where update logs are written. */
export declare const UPDATE_DIR: string;

/** Record of one update run. */
export type FilesystemUpdate = {
    updateSource: SupportedUpdateSources;
    date_start: Date;
    /** Error messages collected while processing documents. */
    file_errors: string[];
    /** Set when the update is written to disk. */
    date_end?: Date;
    /** Number of entries added to `created`. */
    created_num?: number;
    /** `file_path` of every document registered as created. */
    created?: string[];
    deleted_num?: number;
    deleted?: string[];
    updated_num?: number;
    updated?: string[];
    next_link?: string;
    delta_link?: string;
};

/** Metadata describing where a document lives on SharePoint. */
export type Sharepoint_Metadata = {
    drive_name: string;
    drive_id: string;
    sharepoint_id: string;
    parent_sharepoint_id: string;
    sharepoint_path: string;
    /** Relative path; appended to the local SharePoint copy directory when writing. */
    sharepoint_path_rel: string;
    /** Used as the URL of the generated jurisprudência document. */
    sharepoint_url: string;
    extensions: Supported_Content_Extensions[];
    xor_hash?: string;
};

/** Metadata of a decision that feeds the generated jurisprudência document. */
export type Retrievable_Metadata = {
    /** Process number; also used as the last segment of the generated file path. */
    process_number: string;
    judge: string;
    process_mean: string;
    decision: string;
    descriptors?: string[];
};

/** Date, area and section of a document; date and area also drive the generated file path. */
export type Date_Area_Section = {
    file_date: Date;
    area: string;
    section: string;
};

/** File extensions whose content can be turned into text lines. */
export declare const SUPPORTED_EXTENSIONS: readonly ["txt", "pdf", "docx"];

/** Union of the extensions listed in `SUPPORTED_EXTENSIONS`. */
export type Supported_Content_Extensions = (typeof SUPPORTED_EXTENSIONS)[number];

/** One original file of a document: its extension and its raw bytes. */
export type ContentType = {
    extension: Supported_Content_Extensions;
    data: Buffer;
};

/** A document as stored on the local filesystem. */
export type FilesystemDocument = {
    creation_date: Date;
    last_update_date: Date;
    jurisprudencia_document: PartialJurisprudenciaDocument;
    /** Path relative to the filesystem storage directory. */
    file_path: string;
    sharepoint_metadata?: Sharepoint_Metadata;
    /** Original files; only their extensions are kept in the metadata JSON. */
    content?: ContentType[];
};

/** Type guard: whether `ext` is one of `SUPPORTED_EXTENSIONS`. */
export declare function isSupportedExtension(ext: string): ext is Supported_Content_Extensions;
package/dist/types.js ADDED
@@ -0,0 +1,14 @@
1
import dotenv from 'dotenv';

// Load variables from a .env file into process.env before ROOT_PATH is read.
dotenv.config();

/** Names of the sources an update can originate from. */
export const UpdateSources = ["STJ (Sharepoint)", "Juris"];

/** Root output directory: the LOCAL_ROOT environment variable, or "results" when it is unset or empty. */
export const ROOT_PATH = process.env['LOCAL_ROOT'] || 'results';

/** Subdirectory of ROOT_PATH that holds documents stored by their file path. */
export const FILESYSTEM_PATH = "/FileSystem";

/** Subdirectory of ROOT_PATH that holds files stored by their SharePoint relative path. */
export const SHAREPOINT_COPY_PATH = "/Sharepoint";

/** Base name (without extension) of a document's metadata JSON file. */
export const DETAILS_NAME = "Detalhes";

/** Base name (without extension) of a document's original content files. */
export const ORIGINAL_NAME = "Original";

/** Subdirectory of ROOT_PATH that holds update logs. */
export const LOGS_PATH = "/Updates";

/** Directory where update logs are written. */
export const UPDATE_DIR = ROOT_PATH + LOGS_PATH;

/** File extensions whose content can be turned into text lines. */
export const SUPPORTED_EXTENSIONS = ["txt", "pdf", "docx"];
12
/**
 * Tells whether `ext` is one of the entries of `SUPPORTED_EXTENSIONS`.
 * The comparison is exact (case-sensitive, no leading dot).
 *
 * @param ext Extension to test.
 * @returns `true` when the extension is supported.
 */
export function isSupportedExtension(ext) {
    return SUPPORTED_EXTENSIONS.some((supported) => supported === ext);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mdgf11/filesystem-lib",
3
- "version": "2.0.7",
3
+ "version": "2.0.8",
4
4
  "description": "Library to extend usage of jurisprudencia-document",
5
5
  "license": "ISC",
6
6
  "author": "Miguel Fonseca",