npm - @pellux/goodvibes-sdk - Versions diffs - 0.27.2 → 0.27.3 - Mend

@pellux/goodvibes-sdk 0.27.2 → 0.27.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

package/dist/_internal/platform/knowledge/pdf-extractor.js ADDED Viewed

@@ -0,0 +1,346 @@
+import { constants, inflateSync } from 'node:zlib';
+// Upper bound, in characters (128 * 1024 = 131,072), applied by searchTextPayload() when it truncates text.
+const MAX_STRUCTURE_SEARCH_TEXT_CHARS = 128 * 1024;
+function cleanText(value) {
+    return value
+        .replace(/\u0000/g, ' ')
+        .replace(/\r\n/g, '\n')
+        .replace(/\r/g, '\n')
+        .replace(/[ \t]+\n/g, '\n')
+        .replace(/\n{3,}/g, '\n\n')
+        .replace(/[ \t]{2,}/g, ' ')
+        .trim();
+}
+function searchTextPayload(value) {
+    const cleaned = cleanText(value);
+    if (!cleaned || looksBinaryLike(cleaned) || looksLikeRawPdfPayload(cleaned))
+        return undefined;
+    return cleaned.length <= MAX_STRUCTURE_SEARCH_TEXT_CHARS
+        ? cleaned
+        : cleaned.slice(0, MAX_STRUCTURE_SEARCH_TEXT_CHARS);
+}
+function estimateTokens(...chunks) {
+    const total = chunks
+        .filter((value) => typeof value === 'string')
+        .reduce((sum, value) => sum + value.length, 0);
+    return Math.max(1, Math.ceil(total / 4));
+}
+function firstNonEmptyLine(value) {
+    return value
+        .split(/\n+/)
+        .map((line) => line.trim())
+        .find(Boolean);
+}
+function summarizeText(text, maxLength = 320) {
+    const cleaned = cleanText(text);
+    if (!cleaned || looksBinaryLike(cleaned) || looksLikeRawPdfPayload(cleaned))
+        return undefined;
+    if (cleaned.length <= maxLength)
+        return cleaned;
+    const sentence = cleaned.match(/^(.{0,320}?[.!?])(?:\s|$)/)?.[1]?.trim();
+    return sentence && sentence.length >= 40 ? sentence : `${cleaned.slice(0, maxLength - 1).trim()}...`;
+}
+function excerptText(text, maxLength = 480) {
+    const cleaned = cleanText(text);
+    if (!cleaned || looksBinaryLike(cleaned) || looksLikeRawPdfPayload(cleaned))
+        return undefined;
+    return cleaned.length <= maxLength ? cleaned : `${cleaned.slice(0, maxLength - 1).trim()}...`;
+}
+function uniqueStrings(values, limit = 24) {
+    const seen = new Set();
+    const result = [];
+    for (const value of values) {
+        const trimmed = cleanText(value);
+        if (!trimmed || seen.has(trimmed) || !isReadablePdfText(trimmed))
+            continue;
+        seen.add(trimmed);
+        result.push(trimmed);
+        if (result.length >= limit)
+            break;
+    }
+    return result;
+}
+export async function extractPdf(buffer) {
+    const parsed = await extractPdfWithPdfJs(buffer);
+    if (parsed)
+        return parsed;
+    return extractPdfRawStreams(buffer);
+}
+async function extractPdfWithPdfJs(buffer) {
+    try {
+        const pdfjs = await import('pdfjs-dist/legacy/build/pdf.mjs');
+        const loadingTask = pdfjs.getDocument({
+            data: new Uint8Array(buffer),
+            useSystemFonts: true,
+        });
+        const document = await loadingTask.promise;
+        const pageCount = document.numPages;
+        const pageTexts = [];
+        for (let pageNumber = 1; pageNumber <= pageCount; pageNumber += 1) {
+            const page = await document.getPage(pageNumber);
+            const content = await page.getTextContent();
+            const lines = textContentItemsToLines(content.items);
+            if (lines.length > 0)
+                pageTexts.push(lines.join('\n'));
+            page.cleanup();
+        }
+        await document.destroy();
+        const text = cleanText(pageTexts.join('\n\n'));
+        if (!text)
+            return undefined;
+        const searchText = searchTextPayload(text);
+        return {
+            extractorId: 'pdfjs',
+            format: 'pdf',
+            title: firstNonEmptyLine(text) ?? 'PDF document',
+            summary: summarizeText(text) ?? 'PDF document.',
+            excerpt: excerptText(text),
+            sections: uniqueStrings(text.split(/\n+/), 24),
+            links: uniqueStrings(Array.from(text.matchAll(/\bhttps?:\/\/[^\s)]+/g), (match) => match[0]), 50),
+            estimatedTokens: estimateTokens(text),
+            structure: {
+                pageCount,
+                extractedTextChars: text.length,
+                ...(searchText ? { searchText } : {}),
+            },
+            metadata: {
+                limitations: ['PDF text extraction does not perform OCR for scanned images.'],
+            },
+        };
+    }
+    catch {
+        return undefined;
+    }
+}
+function textContentItemsToLines(items) {
+    const lines = [];
+    let current = '';
+    for (const item of items) {
+        const record = unknownRecord(item);
+        const text = typeof record.str === 'string' ? cleanText(record.str) : '';
+        if (text)
+            current = current ? `${current} ${text}` : text;
+        if (record.hasEOL === true && current) {
+            lines.push(current);
+            current = '';
+        }
+    }
+    if (current)
+        lines.push(current);
+    return lines;
+}
+function unknownRecord(value) {
+    return value && typeof value === 'object' ? value : {};
+}
+function extractPdfRawStreams(buffer) {
+    const body = buffer.toString('latin1');
+    const texts = [];
+    const streamRe = /(<<[\s\S]{0,4096}?>>)\s*stream\r?\n([\s\S]*?)\r?\nendstream/g;
+    let match;
+    while ((match = streamRe.exec(body)) !== null) {
+        const dictionary = match[1] ?? '';
+        const rawChunk = match[2] ?? '';
+        const chunk = decodePdfStreamChunk(dictionary, rawChunk);
+        for (const text of extractPdfTextStrings(chunk)) {
+            if (isReadablePdfText(text))
+                texts.push(text);
+        }
+    }
+    const combined = uniqueStrings(texts, 64).join('\n');
+    const searchable = uniqueStrings(texts, 512).join('\n');
+    const searchText = searchTextPayload(searchable);
+    return {
+        extractorId: 'pdf',
+        format: 'pdf',
+        title: firstNonEmptyLine(combined) ?? 'PDF document',
+        summary: summarizeText(combined) ?? 'PDF extraction produced limited text; OCR is not used in-core.',
+        excerpt: excerptText(combined),
+        sections: uniqueStrings(combined.split(/\n+/), 8),
+        links: uniqueStrings(Array.from(combined.matchAll(/\bhttps?:\/\/[^\s)]+/g), (linkMatch) => linkMatch[0]), 50),
+        estimatedTokens: estimateTokens(combined),
+        structure: {
+            extractedStringCount: texts.length,
+            ...(searchText ? { searchText } : {}),
+        },
+        metadata: {
+            limitations: texts.length === 0
+                ? ['No readable text streams were found. Complex PDFs need OCR or a dedicated provider.']
+                : ['PDF extraction is best-effort and does not use OCR.'],
+        },
+    };
+}
+function decodePdfStreamChunk(dictionary, rawChunk) {
+    if (!/\/FlateDecode\b/i.test(dictionary))
+        return rawChunk;
+    try {
+        return inflateSync(Buffer.from(rawChunk, 'latin1')).toString('latin1');
+    }
+    catch {
+        return '';
+    }
+}
+function extractPdfTextStrings(chunk) {
+    return [
+        ...extractLiteralStrings(chunk),
+        ...extractHexStrings(chunk),
+    ];
+}
+function extractLiteralStrings(chunk) {
+    const values = [];
+    let index = 0;
+    while (index < chunk.length) {
+        if (chunk[index] !== '(') {
+            index += 1;
+            continue;
+        }
+        const parsed = readPdfLiteralString(chunk, index + 1);
+        if (parsed) {
+            values.push(cleanText(parsed.value));
+            index = parsed.nextIndex;
+        }
+        else {
+            index += 1;
+        }
+    }
+    return values;
+}
+function readPdfLiteralString(chunk, start) {
+    let depth = 1;
+    let escaped = false;
+    let value = '';
+    for (let index = start; index < chunk.length; index += 1) {
+        const char = chunk[index];
+        if (escaped) {
+            const decoded = decodePdfEscape(char, chunk.slice(index + 1, index + 3));
+            value += decoded.value;
+            index += decoded.consumed;
+            escaped = false;
+            continue;
+        }
+        if (char === '\\') {
+            escaped = true;
+            continue;
+        }
+        if (char === '(') {
+            depth += 1;
+            value += char;
+            continue;
+        }
+        if (char === ')') {
+            depth -= 1;
+            if (depth === 0)
+                return { value, nextIndex: index + 1 };
+            value += char;
+            continue;
+        }
+        value += char;
+    }
+    return undefined;
+}
+function decodePdfEscape(char, following) {
+    switch (char) {
+        case 'n':
+            return { value: '\n', consumed: 0 };
+        case 'r':
+            return { value: '\r', consumed: 0 };
+        case 't':
+            return { value: '\t', consumed: 0 };
+        case 'b':
+            return { value: '\b', consumed: 0 };
+        case 'f':
+            return { value: '\f', consumed: 0 };
+        case '(':
+        case ')':
+        case '\\':
+            return { value: char, consumed: 0 };
+        default:
+            if (/[0-7]/.test(char)) {
+                const octal = `${char}${(following.match(/^[0-7]{0,2}/)?.[0] ?? '')}`;
+                return { value: String.fromCharCode(Number.parseInt(octal, 8)), consumed: octal.length - 1 };
+            }
+            return { value: char, consumed: 0 };
+    }
+}
+function extractHexStrings(chunk) {
+    const values = [];
+    const hexRe = /<([0-9A-Fa-f\s]{4,})>/g;
+    let match;
+    while ((match = hexRe.exec(chunk)) !== null) {
+        const text = decodeHexPdfString(match[1] ?? '');
+        if (text)
+            values.push(cleanText(text));
+    }
+    return values;
+}
+function decodeHexPdfString(value) {
+    const hex = value.replace(/\s+/g, '');
+    if (hex.length < 4 || hex.length % 2 !== 0)
+        return undefined;
+    const bytes = Buffer.from(hex, 'hex');
+    if (bytes.length >= 2 && bytes[0] === 0xfe && bytes[1] === 0xff) {
+        return decodeUtf16Be(bytes.subarray(2));
+    }
+    const mostlyUtf16 = bytes.length >= 4 && bytes.filter((byte, index) => index % 2 === 0 && byte === 0).length >= Math.floor(bytes.length / 4);
+    if (mostlyUtf16)
+        return decodeUtf16Be(bytes);
+    return bytes.toString('latin1');
+}
+function decodeUtf16Be(bytes) {
+    const swapped = Buffer.alloc(bytes.length);
+    for (let index = 0; index + 1 < bytes.length; index += 2) {
+        swapped[index] = bytes[index + 1];
+        swapped[index + 1] = bytes[index];
+    }
+    return swapped.toString('utf16le');
+}
+function isReadablePdfText(value) {
+    const text = cleanText(value);
+    if (text.length < 2 || looksLikeRawPdfPayload(text) || looksBinaryLike(text))
+        return false;
+    const sample = text.slice(0, 512);
+    let lettersOrDigits = 0;
+    let whitespace = 0;
+    for (const char of sample) {
+        if (/[a-z0-9]/i.test(char))
+            lettersOrDigits += 1;
+        if (/\s/.test(char))
+            whitespace += 1;
+    }
+    return (lettersOrDigits + whitespace) / sample.length >= 0.55;
+}
+function looksLikeRawPdfPayload(value) {
+    const lower = value.toLowerCase();
+    return lower.includes('%pdf')
+        || /\b\d+\s+\d+\s+obj\b/.test(lower)
+        || (lower.includes(' endobj') && lower.includes(' stream'))
+        || (lower.includes('/filter') && lower.includes('/flatedecode'));
+}
+function looksBinaryLike(value) {
+    const sample = value.slice(0, 4_096);
+    if (sample.length < 120)
+        return false;
+    let control = 0;
+    let extended = 0;
+    let letters = 0;
+    let whitespace = 0;
+    let punctuation = 0;
+    for (const char of sample) {
+        const code = char.charCodeAt(0);
+        if ((code < 32 && char !== '\n' && char !== '\r' && char !== '\t') || code === 65533)
+            control += 1;
+        if (code > 126)
+            extended += 1;
+        if (/[a-z0-9]/i.test(char))
+            letters += 1;
+        if (/\s/.test(char))
+            whitespace += 1;
+        if (/[^a-z0-9\s]/i.test(char))
+            punctuation += 1;
+    }
+    const length = sample.length;
+    const extendedRatio = extended / length;
+    const usefulRatio = (letters + whitespace) / length;
+    const punctuationRatio = punctuation / length;
+    return control > 0
+        || (extendedRatio > 0.18 && usefulRatio < 0.78)
+        || (punctuationRatio > 0.42 && whitespace / length < 0.08);
+}

package/dist/_internal/platform/version.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import { readFileSync } from 'node:fs';
 import { join } from 'node:path';
-let version = '0.27.2';
+let version = '0.27.3';
 try {
     const pkg = JSON.parse(readFileSync(join(import.meta.dir, '..', '..', 'package.json'), 'utf-8'));
     version = pkg.version ?? version;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pellux/goodvibes-sdk",
-  "version": "0.27.2",
+  "version": "0.27.3",
   "repository": {
     "type": "git",
     "url": "git+https://github.com/mgd34msu/goodvibes-sdk.git"