filemayor 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/categories.js +235 -0
- package/core/cleaner.js +527 -0
- package/core/config.js +562 -0
- package/core/index.js +79 -0
- package/core/organizer.js +528 -0
- package/core/reporter.js +572 -0
- package/core/scanner.js +436 -0
- package/core/security.js +317 -0
- package/core/sop-parser.js +565 -0
- package/core/watcher.js +478 -0
- package/index.js +536 -0
- package/package.json +55 -0
|
@@ -0,0 +1,565 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* ═══════════════════════════════════════════════════════════════════
|
|
5
|
+
* FILEMAYOR — SOP PARSER
|
|
6
|
+
* Parses Standard Operating Procedure documents and converts them
|
|
7
|
+
* into deterministic .filemayor.yml organization rules.
|
|
8
|
+
*
|
|
9
|
+
* Two engines:
|
|
10
|
+
* 1. Rule-Based (offline, always available) — pattern matching
|
|
11
|
+
* 2. Gemini AI (online, enhanced) — natural language understanding
|
|
12
|
+
*
|
|
13
|
+
* Copyright (c) 2024-2026 FileMayor. All rights reserved.
|
|
14
|
+
* ═══════════════════════════════════════════════════════════════════
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
'use strict';
|
|
18
|
+
|
|
19
|
+
const fs = require('fs');
|
|
20
|
+
const path = require('path');
|
|
21
|
+
|
|
22
|
+
// ─── Text Extraction ──────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
/**
 * Extract plain text from a document file.
 *
 * Supported directly: .txt, .md, .csv, .json, .yaml, .yml.
 * .pdf and .docx are routed through the lightweight built-in extractors
 * (extractPDFText / extractDOCXText). Unknown extensions are decoded as
 * UTF-8 plain text as a best effort.
 *
 * @param {string} filePath - Path to the document
 * @returns {Promise<string>} Extracted text
 * @throws {Error} If the file cannot be read, or a .json file contains
 *   invalid JSON.
 */
async function extractText(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  // FIX: use the promise-based read instead of readFileSync — this is an
  // async function and a synchronous read would block the event loop on
  // large documents.
  const buffer = await fs.promises.readFile(filePath);

  switch (ext) {
    case '.txt':
    case '.md':
    case '.csv':
    case '.yaml':
    case '.yml':
      return buffer.toString('utf-8');

    case '.json':
      // Round-trip through JSON to normalize formatting (throws on invalid JSON).
      return JSON.stringify(JSON.parse(buffer.toString('utf-8')), null, 2);

    case '.pdf':
      return extractPDFText(buffer);

    case '.docx':
      return extractDOCXText(buffer);

    default:
      // Best effort: treat anything else as plain text.
      return buffer.toString('utf-8');
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Minimal PDF text extraction with no external dependencies.
 *
 * Scans the raw byte stream (decoded as latin1) for BT…ET text objects
 * and collects the string operands of Tj and TJ show-text operators.
 * When no operators are found (e.g. compressed content streams), falls
 * back to stripping non-printable bytes and returning up to 50 kB of
 * whatever readable text remains.
 */
function extractPDFText(buffer) {
  const raw = buffer.toString('latin1');
  const pieces = [];

  // Each BT…ET pair delimits one text object in the content stream.
  for (const [, block] of raw.matchAll(/BT\s*([\s\S]*?)\s*ET/g)) {
    // "(string) Tj" — show a single string.
    for (const [, str] of block.matchAll(/\(([^)]*)\)\s*Tj/g)) {
      pieces.push(str);
    }
    // "[(a) -120 (b)] TJ" — show an array of strings with kerning numbers.
    for (const [, arrayBody] of block.matchAll(/\[([^\]]*)\]\s*TJ/g)) {
      for (const [, str] of arrayBody.matchAll(/\(([^)]*)\)/g)) {
        pieces.push(str);
      }
    }
  }

  if (pieces.length === 0) {
    // Fallback: keep printable ASCII, collapse large gaps into newlines,
    // and cap the output size.
    return raw
      .replace(/[^\x20-\x7E\n\r\t]/g, ' ')
      .replace(/\s{3,}/g, '\n')
      .trim()
      .slice(0, 50000);
  }

  return pieces.join(' ').replace(/\\n/g, '\n').trim();
}
|
|
100
|
+
|
|
101
|
+
/**
 * Minimal DOCX text extraction with no external dependencies.
 *
 * A DOCX file is a ZIP archive; rather than inflating it, this scans the
 * raw bytes for the word/document.xml entry name, grabs the span between
 * the following `<?xml` declaration and the closing `</w:document>` tag,
 * and joins the text of every `<w:t>` run. Works only when the XML part
 * is stored uncompressed; otherwise a bracketed diagnostic is returned.
 */
function extractDOCXText(buffer) {
  try {
    const raw = buffer.toString('binary');

    const entryIdx = raw.indexOf('word/document.xml');
    if (entryIdx === -1) return '[Could not parse DOCX — missing document.xml]';

    // Slice out the XML payload following the entry name.
    const xmlStart = raw.indexOf('<?xml', entryIdx);
    const xmlEnd = raw.indexOf('</w:document>', xmlStart);
    if (xmlStart === -1 || xmlEnd === -1) {
      return '[Could not parse DOCX XML content]';
    }
    const xml = raw.slice(xmlStart, xmlEnd + '</w:document>'.length);

    // Every visible text run lives in a <w:t> element.
    const textParts = [];
    for (const [, run] of xml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)) {
      textParts.push(run);
    }

    return textParts.join(' ').trim() || '[No text content found in DOCX]';
  } catch {
    return '[DOCX parsing error — try converting to .txt]';
  }
}
|
|
138
|
+
|
|
139
|
+
// ─── Rule-Based Parser (Offline) ──────────────────────────────────
|
|
140
|
+
|
|
141
|
+
/**
 * Pattern library for detecting organization rules in natural language.
 *
 * NOTE: every regex carries the /g flag and these objects are shared at
 * module level, so .exec/.test advance a stateful lastIndex. Callers
 * (see parseRuleBased) must reset lastIndex to 0 before each use.
 *
 * The ["""] character classes accept ASCII and curly quotation marks
 * around captured names; captures stop at a quote, period, or newline.
 */
const RULE_PATTERNS = {
  // Directory structure patterns — "create a folder called X",
  // "directory: X", "organize into X". Capture 1 = folder name.
  directory: [
    /(?:create|make|set up|establish)\s+(?:a\s+)?(?:folder|directory|dir)\s+(?:called|named|for)\s+["""]?([^""".\n]+)["""]?/gi,
    /(?:folder|directory)\s*:\s*["""]?([^""".\n]+)["""]?/gi,
    /(?:organize|sort|file|move)\s+(?:into|to|under)\s+["""]?([^""".\n]+)["""]?/gi,
  ],

  // File type to folder mapping — "move all pdf files to X",
  // ".jpg -> Photos", "invoices files: Accounting".
  // Capture 1 = file type word, capture 2 = destination folder.
  fileTypeMapping: [
    /(?:move|place|put|file|store|organize)\s+(?:all\s+)?\.?(\w+)\s+(?:files?\s+)?(?:to|into|under|in)\s+["""]?([^""".\n]+)["""]?/gi,
    /\.(\w+)\s+(?:files?\s+)?(?:→|->|=>|goes?\s+(?:to|into)|should\s+(?:go|be\s+(?:in|placed)))\s+["""]?([^""".\n]+)["""]?/gi,
    /(\w+)\s+files?\s*(?:\([^)]*\))?\s*(?:→|->|=>|:)\s*["""]?([^""".\n]+)["""]?/gi,
  ],

  // Naming convention patterns — the captured phrase is later keyword-
  // matched ("date", "category", "clean", …) to pick a convention.
  naming: [
    /(?:name|rename|naming)\s+(?:convention|format|scheme|pattern)\s*:\s*["""]?([^""".\n]+)["""]?/gi,
    /(?:files?\s+)?(?:should\s+be|must\s+be|are)\s+(?:named|renamed)\s+(?:as|with|using)\s+["""]?([^""".\n]+)["""]?/gi,
    /prefix\s+(?:with|using|by)\s+["""]?([^""".\n]+)["""]?/gi,
  ],

  // Date-based rules — presence-only checks (no captures), e.g.
  // "organize by month", "use quarterly folders".
  dateBased: [
    /(?:organize|sort|group|arrange)\s+by\s+(?:date|year|month|quarter)/gi,
    /(?:create|use)\s+(?:date|year|month|quarterly)\s+(?:folders?|directories?)/gi,
  ],

  // Retention / cleanup rules — capture 1 = amount, capture 2 = unit.
  // The third pattern (archive|compress) marks archival rather than
  // deletion; the leading verb distinguishes the action downstream.
  cleanup: [
    /(?:delete|remove|clean|purge)\s+(?:files?\s+)?(?:older\s+than|after)\s+(\d+)\s+(days?|weeks?|months?|years?)/gi,
    /(?:retention|keep)\s+(?:period|policy)\s*:\s*(\d+)\s+(days?|weeks?|months?|years?)/gi,
    /(?:archive|compress)\s+(?:files?\s+)?(?:older\s+than|after)\s+(\d+)\s+(days?|weeks?|months?|years?)/gi,
  ],

  // Ignore patterns — "don't touch X", "protected folder: X".
  // Capture 1 = the path/glob to leave untouched.
  ignore: [
    /(?:ignore|skip|exclude|don't\s+(?:touch|move|organize))\s+["""]?([^""".\n]+)["""]?/gi,
    /(?:protected|locked|system)\s+(?:folder|directory|files?)\s*:\s*["""]?([^""".\n]+)["""]?/gi,
  ],
};
|
|
185
|
+
|
|
186
|
+
/**
 * Common file-type aliases mapped to lists of file extensions.
 *
 * Singular and plural alias forms are both present so SOP wording like
 * "move images" and "move every image" resolve identically. Each alias
 * owns its own array copy, so mutating one entry never affects a
 * sibling alias.
 */
const FILE_TYPE_ALIASES = (() => {
  // [alias names] → shared extension list; expanded below so that every
  // alias gets an independent copy, in the same insertion order as the
  // original hand-written table.
  const groups = [
    [['pdf', 'pdfs'], ['.pdf']],
    [['document', 'documents'], ['.pdf', '.doc', '.docx', '.txt', '.rtf', '.odt']],
    [['image', 'images'], ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.bmp']],
    [['photo', 'photos'], ['.jpg', '.jpeg', '.png', '.raw', '.heic']],
    [['video', 'videos'], ['.mp4', '.mkv', '.avi', '.mov', '.wmv', '.webm']],
    [['audio', 'music'], ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a']],
    [['spreadsheet', 'spreadsheets'], ['.xls', '.xlsx', '.csv', '.ods']],
    [['presentation', 'presentations'], ['.ppt', '.pptx', '.key', '.odp']],
    [['archive', 'archives'], ['.zip', '.rar', '.7z', '.tar', '.gz']],
    [['code'], ['.js', '.ts', '.py', '.java', '.cpp', '.go', '.rs', '.rb']],
    [['executable', 'executables'], ['.exe', '.msi', '.app', '.dmg', '.deb']],
    [['font', 'fonts'], ['.ttf', '.otf', '.woff', '.woff2']],
    [['design'], ['.psd', '.ai', '.sketch', '.fig', '.xd']],
  ];

  const table = {};
  for (const [names, exts] of groups) {
    for (const name of names) {
      table[name] = [...exts];
    }
  }
  return table;
})();
|
|
215
|
+
|
|
216
|
+
/**
 * Parse SOP text using rule-based pattern matching (offline engine).
 *
 * Runs every regex in RULE_PATTERNS against the text and accumulates
 * organization rules. The shared /g regexes have their lastIndex reset
 * before each use so repeated calls stay deterministic.
 *
 * @param {string} text - SOP document text
 * @returns {Object} Parsed rules: { directories, fileTypeMappings,
 *   namingConvention, dateBased, cleanup, ignore, confidence, method }
 */
function parseRuleBased(text) {
  const rules = {
    directories: [],        // folder names to create
    fileTypeMappings: [],   // { extensions, folder, original }
    namingConvention: null, // 'date_prefix' | 'category_prefix' | 'clean' | 'original'
    dateBased: false,       // organize into date folders
    cleanup: [],            // { action, amount, unit, original }
    ignore: [],             // patterns to leave untouched
    confidence: 0,          // 0-100 heuristic score
    method: 'rule-based',
  };

  let matchCount = 0;

  // Directory creation rules ("create a folder called X").
  for (const pattern of RULE_PATTERNS.directory) {
    pattern.lastIndex = 0;
    for (const match of text.matchAll(pattern)) {
      const dir = match[1].trim();
      // Length cap avoids swallowing whole sentences on a loose match.
      if (dir && dir.length < 100 && !rules.directories.includes(dir)) {
        rules.directories.push(dir);
        matchCount++;
      }
    }
  }

  // File type → folder mappings (".pdf files go to Documents").
  for (const pattern of RULE_PATTERNS.fileTypeMapping) {
    pattern.lastIndex = 0;
    for (const match of text.matchAll(pattern)) {
      const fileType = match[1].trim().toLowerCase();
      const folder = match[2].trim();
      if (fileType && folder && folder.length < 100) {
        // Expand known aliases (e.g. "images"); unknown words become a
        // literal extension.
        const extensions = FILE_TYPE_ALIASES[fileType] || [`.${fileType}`];
        rules.fileTypeMappings.push({
          extensions,
          folder,
          original: match[0].trim(),
        });
        matchCount++;
      }
    }
  }

  // Naming conventions — first match per pattern only; later patterns may
  // overwrite earlier ones (kept as-is for backward compatibility).
  for (const pattern of RULE_PATTERNS.naming) {
    pattern.lastIndex = 0;
    const match = pattern.exec(text);
    if (match !== null) {
      const convention = match[1].trim().toLowerCase();
      if (convention.includes('date') || convention.includes('yyyy')) {
        rules.namingConvention = 'date_prefix';
      } else if (convention.includes('category') || convention.includes('type')) {
        rules.namingConvention = 'category_prefix';
      } else if (convention.includes('clean') || convention.includes('title')) {
        rules.namingConvention = 'clean';
      } else {
        rules.namingConvention = 'original';
      }
      matchCount++;
    }
  }

  // Date-based organization ("organize by month").
  for (const pattern of RULE_PATTERNS.dateBased) {
    pattern.lastIndex = 0;
    if (pattern.test(text)) {
      rules.dateBased = true;
      matchCount++;
    }
  }

  // Retention / cleanup rules ("delete files older than 30 days").
  for (const pattern of RULE_PATTERNS.cleanup) {
    pattern.lastIndex = 0;
    for (const match of text.matchAll(pattern)) {
      const verb = match[0].toLowerCase();
      rules.cleanup.push({
        // FIX: the archival pattern matches both "archive" and "compress",
        // but only "archive" was recognized before, so "compress files
        // older than N days" was wrongly classified as a delete rule.
        action: verb.startsWith('archive') || verb.startsWith('compress')
          ? 'archive'
          : 'delete',
        amount: parseInt(match[1], 10),
        unit: match[2].replace(/s$/, ''), // normalize "days" → "day" etc.
        original: match[0].trim(),
      });
      matchCount++;
    }
  }

  // Ignore patterns ("don't touch node_modules").
  for (const pattern of RULE_PATTERNS.ignore) {
    pattern.lastIndex = 0;
    for (const match of text.matchAll(pattern)) {
      const ignore = match[1].trim();
      if (ignore && !rules.ignore.includes(ignore)) {
        rules.ignore.push(ignore);
        matchCount++;
      }
    }
  }

  // Confidence: each matched rule adds 15 points, capped at 100.
  rules.confidence = Math.min(100, Math.round(matchCount * 15));

  return rules;
}
|
|
328
|
+
|
|
329
|
+
// ─── Gemini AI Parser (Online, Enhanced) ──────────────────────────
|
|
330
|
+
|
|
331
|
+
// Model name and v1beta REST endpoint of Google's Generative Language API.
const GEMINI_MODEL = 'gemini-2.0-flash';
const GEMINI_ENDPOINT = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}:generateContent`;

// Instruction block prepended to the SOP text on every request. It pins the
// exact JSON schema the model must return so the reply can be parsed
// mechanically (see parseWithGemini / rulesToConfig).
const GEMINI_SYSTEM_PROMPT = `You are an expert file organization consultant. You analyze Standard Operating Procedure (SOP) documents and extract precise file organization rules.

Your output MUST be a valid JSON object with this exact structure:
{
  "directories": ["list", "of", "folder", "names", "to", "create"],
  "fileTypeMappings": [
    {"extensions": [".pdf", ".doc"], "folder": "Documents"},
    {"extensions": [".jpg", ".png"], "folder": "Images"}
  ],
  "namingConvention": "original" | "date_prefix" | "category_prefix" | "clean",
  "dateBased": true | false,
  "cleanup": [
    {"action": "delete" | "archive", "amount": 30, "unit": "day"}
  ],
  "ignore": ["patterns", "to", "ignore"],
  "watchRules": [
    {"match": "*.pdf", "action": "move", "dest": "Documents/PDFs"}
  ]
}

Rules:
- Extract EVERY organization rule from the SOP, no matter how implicit
- Map file types to well-known extensions (e.g., "spreadsheets" → [".xls", ".xlsx", ".csv"])
- If the SOP mentions date-based organization, set dateBased to true
- If retention periods are mentioned, add cleanup rules
- Output ONLY the JSON object, no markdown or explanation`;
|
|
360
|
+
|
|
361
|
+
/**
 * Parse SOP text using Google Gemini AI (online engine).
 *
 * Sends the (possibly truncated) document text plus the schema-pinning
 * system prompt to the generateContent endpoint, then extracts and
 * parses the JSON rules object from the model reply.
 *
 * @param {string} text - SOP document text
 * @param {string} apiKey - Gemini API key
 * @returns {Promise<Object>} AI-parsed rules with confidence/method stamped on
 * @throws {Error} On missing key, rate limit (429), invalid key (403),
 *   other HTTP errors, or an unparseable model response.
 */
async function parseWithGemini(text, apiKey) {
  if (!apiKey) {
    throw new Error(
      'Gemini API key required. Set GEMINI_API_KEY environment variable.\n' +
      'Get a free key at: https://aistudio.google.com/apikey'
    );
  }

  // Truncate very long documents to stay within token limits.
  const truncated = text.length > 30000 ? text.slice(0, 30000) + '\n\n[...truncated]' : text;

  const body = {
    contents: [{
      parts: [{
        text: `${GEMINI_SYSTEM_PROMPT}\n\n--- SOP DOCUMENT START ---\n${truncated}\n--- SOP DOCUMENT END ---\n\nExtract ALL organization rules as JSON:`
      }]
    }],
    generationConfig: {
      temperature: 0.1, // near-deterministic extraction
      topP: 0.8,
      maxOutputTokens: 4096,
    }
  };

  // SECURITY FIX: send the key via the x-goog-api-key header instead of a
  // `?key=` query parameter, so it cannot leak through URL logging,
  // proxies, or referrer headers. (Header auth is the documented
  // alternative for the Gemini REST API.)
  // Uses native fetch (Node 18+).
  const response = await fetch(GEMINI_ENDPOINT, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-goog-api-key': apiKey,
    },
    body: JSON.stringify(body),
  });

  if (!response.ok) {
    const err = await response.text();
    if (response.status === 429) {
      throw new Error('Gemini rate limit exceeded. Wait a moment and try again.');
    }
    if (response.status === 403) {
      throw new Error('Invalid Gemini API key. Get a free key at: https://aistudio.google.com/apikey');
    }
    throw new Error(`Gemini API error (${response.status}): ${err.slice(0, 200)}`);
  }

  const data = await response.json();
  const rawText = data?.candidates?.[0]?.content?.parts?.[0]?.text || '';

  // Extract JSON from response (Gemini may wrap it in markdown fences).
  const jsonMatch = rawText.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    throw new Error('Gemini did not return valid JSON. Using rule-based fallback.');
  }

  try {
    const parsed = JSON.parse(jsonMatch[0]);
    parsed.confidence = 90; // AI extraction is assumed high-confidence
    parsed.method = 'gemini-ai';
    return parsed;
  } catch (err) {
    // Preserve the underlying SyntaxError for debugging via `cause`.
    throw new Error('Failed to parse Gemini JSON response. Using rule-based fallback.', { cause: err });
  }
}
|
|
429
|
+
|
|
430
|
+
// ─── Rules → Config Converter ─────────────────────────────────────
|
|
431
|
+
|
|
432
|
+
/**
 * Convert parsed rules into a .filemayor.yml config string.
 *
 * Emits a provenance header, the `organize` section (naming, duplicate
 * policy, optional date folders, custom categories, ignore list), then
 * optional `clean` and `watch` sections, and finally the suggested
 * directories as comments.
 *
 * @param {Object} rules - Parsed rules from either engine
 * @returns {string} YAML config text
 */
function rulesToConfig(rules) {
  const out = [];
  const emit = (line = '') => out.push(line);

  // Provenance header.
  emit('# FileMayor Configuration');
  emit(`# Generated from SOP (${rules.method || 'unknown'}, confidence: ${rules.confidence || 0}%)`);
  emit(`# Generated on: ${new Date().toISOString()}`);
  emit();
  emit('version: 1');
  emit();

  // Organize section.
  emit('organize:');
  emit(`  naming: ${rules.namingConvention || 'original'}`);
  emit('  duplicates: rename');
  if (rules.dateBased) {
    emit('  date_folders: true');
  }

  // Custom categories derived from file-type mappings.
  const mappings = rules.fileTypeMappings || [];
  if (mappings.length > 0) {
    emit('  categories:');
    for (const { folder, extensions } of mappings) {
      // Sanitize the folder name into a YAML-safe key.
      const key = folder.toLowerCase().replace(/[^a-z0-9]/g, '_');
      const exts = extensions.map((e) => (e.startsWith('.') ? e : `.${e}`));
      emit(`    ${key}: [${exts.join(', ')}]`);
    }
  }

  // Ignore patterns.
  const ignoreList = rules.ignore || [];
  if (ignoreList.length > 0) {
    emit(`  ignore: [${ignoreList.join(', ')}]`);
  }

  // Clean section (cleanup rules are emitted as comments only).
  const cleanupRules = rules.cleanup || [];
  if (cleanupRules.length > 0) {
    emit();
    emit('clean:');
    emit('  auto: true');
    cleanupRules.forEach((rule) =>
      emit(`  # ${rule.action} files older than ${rule.amount} ${rule.unit}(s)`)
    );
  }

  // Watch rules.
  const watchRules = rules.watchRules || [];
  if (watchRules.length > 0) {
    emit();
    emit('watch:');
    emit('  rules:');
    for (const { match, action, dest } of watchRules) {
      emit(`    - match: "${match}"`);
      emit(`      action: ${action}`);
      emit(`      dest: ${dest}`);
    }
  }

  // Suggested directories, as comments for the user to act on.
  const dirs = rules.directories || [];
  if (dirs.length > 0) {
    emit();
    emit('# Directories to create:');
    dirs.forEach((dir) => emit(`# - ${dir}`));
  }

  emit();
  return out.join('\n');
}
|
|
504
|
+
|
|
505
|
+
// ─── Main SOP Parser ──────────────────────────────────────────────
|
|
506
|
+
|
|
507
|
+
/**
 * Parse an SOP document and generate FileMayor rules.
 *
 * Extracts text from the document, runs the Gemini engine when a key is
 * available (falling back to the offline rule-based parser on any AI
 * failure), and renders the resulting rules as a .filemayor.yml string.
 *
 * @param {string} filePath - Path to SOP document
 * @param {Object} options
 * @param {boolean} options.useAI - Use Gemini AI (default: true if key available)
 * @param {string} options.apiKey - Gemini API key (or GEMINI_API_KEY env var)
 * @returns {Promise<Object>} { rules, config, text, method, documentPath, documentSize }
 * @throws {Error} When no meaningful text can be extracted.
 */
async function parseSOP(filePath, options = {}) {
  const text = await extractText(filePath);

  // Guard: refuse to proceed on empty or near-empty extractions.
  if (!text || text.trim().length < 10) {
    throw new Error('Could not extract meaningful text from the document');
  }

  const apiKey = options.apiKey || process.env.GEMINI_API_KEY || '';
  const aiEnabled = options.useAI !== false && apiKey.length > 0;

  let rules;
  let fallbackUsed = false;

  if (!aiEnabled) {
    rules = parseRuleBased(text);
  } else {
    try {
      rules = await parseWithGemini(text, apiKey);
    } catch (err) {
      // Any AI failure (network, quota, bad JSON) degrades gracefully to
      // the offline engine.
      console.warn(`[SOP] Gemini AI failed: ${err.message}`);
      console.warn('[SOP] Falling back to rule-based parser');
      rules = parseRuleBased(text);
      fallbackUsed = true;
    }
  }

  const config = rulesToConfig(rules);

  // Only a short preview of the source text is returned to the caller.
  const preview = text.length > 500 ? `${text.slice(0, 500)}...` : text;

  return {
    rules,
    config,
    text: preview,
    method: fallbackUsed ? 'rule-based (AI fallback)' : rules.method,
    documentPath: filePath,
    documentSize: text.length,
  };
}
|
|
554
|
+
|
|
555
|
+
// ─── Exports ──────────────────────────────────────────────────────
|
|
556
|
+
|
|
557
|
+
// Public API. parseSOP is the high-level entry point; the remaining
// exports are surfaced for testing and custom pipelines.
module.exports = {
  parseSOP,          // document path → { rules, config, text, method, ... }
  extractText,       // document path → extracted plain text
  parseRuleBased,    // offline pattern-matching engine
  parseWithGemini,   // online Gemini AI engine
  rulesToConfig,     // rules object → .filemayor.yml string
  FILE_TYPE_ALIASES, // alias word → extension list table
  RULE_PATTERNS,     // regex library used by the offline engine
};
|