PyPI - kssrag - Versions diffs - 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

kssrag 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

kssrag/cli.py +58 -13
kssrag/config.py +15 -1
kssrag/core/agents.py +62 -10
kssrag/core/chunkers.py +95 -1
kssrag/core/vectorstores.py +95 -3
kssrag/models/openrouter.py +78 -16
kssrag/server.py +66 -4
kssrag/utils/document_loaders.py +80 -2
kssrag/utils/helpers.py +38 -25
kssrag/utils/ocr.py +48 -0
kssrag/utils/ocr_loader.py +151 -0
kssrag-0.2.1.dist-info/METADATA +840 -0
kssrag-0.2.1.dist-info/RECORD +33 -0
tests/test_bm25s.py +74 -0
tests/test_config.py +42 -0
tests/test_image_chunker.py +17 -0
tests/test_integration.py +35 -0
tests/test_ocr.py +142 -0
tests/test_streaming.py +41 -0
kssrag-0.1.2.dist-info/METADATA +0 -407
kssrag-0.1.2.dist-info/RECORD +0 -25
{kssrag-0.1.2.dist-info → kssrag-0.2.1.dist-info}/WHEEL +0 -0
{kssrag-0.1.2.dist-info → kssrag-0.2.1.dist-info}/entry_points.txt +0 -0
{kssrag-0.1.2.dist-info → kssrag-0.2.1.dist-info}/top_level.txt +0 -0

kssrag/server.py CHANGED Viewed

@@ -1,8 +1,12 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from typing import Dict, Any, Optional, List
 import uuid
+import json
+from kssrag.models.openrouter import OpenRouterLLM
 from .core.agents import RAGAgent
 from .utils.helpers import logger
@@ -12,6 +16,10 @@ class QueryRequest(BaseModel):
     query: str
     session_id: Optional[str] = None
+class StreamResponse(BaseModel):
+    """Schema for one streamed event: a text chunk plus a completion flag."""
+    # Text fragment carried by this event.
+    chunk: str
+    # False unless explicitly set; marks the last event of a stream.
+    done: bool = False
 class ServerConfig(BaseModel):
     """Configuration for the FastAPI server"""
     host: str = config.SERVER_HOST
@@ -20,9 +28,9 @@ class ServerConfig(BaseModel):
     cors_allow_credentials: bool = config.CORS_ALLOW_CREDENTIALS
     cors_allow_methods: List[str] = config.CORS_ALLOW_METHODS
     cors_allow_headers: List[str] = config.CORS_ALLOW_HEADERS
-    title: str = "KSS RAG API"
-    description: str = "A Retrieval-Augmented Generation API by Ksschkw"
-    version: str = "0.1.0"
+    title: str = "KSSSwagger"
+    description: str = "[kssrag](https://github.com/Ksschkw/kssrag)"
+    version: str = "0.2.0"
 def create_app(rag_agent: RAGAgent, server_config: Optional[ServerConfig] = None):
     """Create a FastAPI app for the RAG agent with configurable CORS"""
@@ -80,6 +88,60 @@ def create_app(rag_agent: RAGAgent, server_config: Optional[ServerConfig] = None
             logger.error(f"Error handling query: {str(e)}")
             raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+    @app.post("/stream")
+    async def stream_query(request: QueryRequest):
+        """Stream the answer to a query as Server-Sent Events.
+
+        Every event is a ``data: <json>`` line followed by a blank line.
+        Regular events carry ``{"chunk": <text>, "done": false}`` and the
+        stream ends with ``{"chunk": "", "done": true}``. If generation fails
+        after streaming has begun, a final ``{"error": <message>, "done": true}``
+        event is emitted instead.
+
+        Raises:
+            HTTPException: 400 when the query is blank; 500 when the session
+                or the streaming response cannot be set up.
+        """
+        query = request.query
+        session_id = request.session_id or str(uuid.uuid4())
+        if not query.strip():
+            raise HTTPException(status_code=400, detail="Query cannot be empty")
+        try:
+            # One RAGAgent per session id; retriever, LLM and system prompt
+            # are shared with the agent the app was created with.
+            if session_id not in sessions:
+                logger.info(f"Creating new streaming session: {session_id}")
+                sessions[session_id] = RAGAgent(
+                    retriever=rag_agent.retriever,
+                    llm=rag_agent.llm,  # Use the same LLM instance
+                    system_prompt=rag_agent.system_prompt
+                )
+            agent = sessions[session_id]
+
+            # query_stream receives the raw query and top_k itself, so no
+            # context or message list is pre-built here (the previously
+            # pre-built values were never read).
+            async def generate():
+                try:
+                    for chunk in agent.query_stream(query, top_k=5):
+                        yield f"data: {json.dumps({'chunk': chunk, 'done': False})}\n\n"
+                    yield f"data: {json.dumps({'chunk': '', 'done': True})}\n\n"
+                except Exception as e:
+                    logger.error(f"Streaming error: {str(e)}")
+                    yield f"data: {json.dumps({'error': str(e), 'done': True})}\n\n"
+
+            return StreamingResponse(
+                generate(),
+                # SSE clients only parse "data:" frames served as event-stream.
+                media_type="text/event-stream",
+                headers={
+                    "Cache-Control": "no-cache",
+                    "Connection": "keep-alive",
+                }
+            )
+        except Exception as e:
+            logger.error(f"Streaming query failed: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Streaming error: {str(e)}")
     @app.get("/health")
     async def health_check():
         """Health check endpoint"""
@@ -107,7 +169,7 @@ def create_app(rag_agent: RAGAgent, server_config: Optional[ServerConfig] = None
     async def root():
         """Root endpoint with API information"""
         return {
-            "message": "Welcome to KSS RAG API",
+            "message": "Welcome to KSSRAG API",
             "version": server_config.version,
             "docs": "/docs",
             "health": "/health"

kssrag/utils/document_loaders.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
+import os
 from typing import List, Dict, Any, Optional
 from ..utils.helpers import logger
@@ -20,15 +21,92 @@ def load_json_file(file_path: str) -> Any:
         logger.error(f"Failed to load JSON file: {str(e)}")
         raise
+def load_docx_file(file_path: str) -> str:
+    """Return the text of a DOCX file: paragraphs first, then table cells.
+
+    Blank paragraphs and blank cells are skipped, each kept item is placed on
+    its own line, and the combined text is stripped.
+
+    Raises:
+        ImportError: python-docx is not installed.
+        Exception: any other failure is logged and re-raised unchanged.
+    """
+    try:
+        from docx import Document
+        document = Document(file_path)
+        pieces = [
+            para.text + "\n"
+            for para in document.paragraphs
+            if para.text.strip()
+        ]
+        # Table cells come after the paragraphs, walked row by row.
+        for table in document.tables:
+            for row in table.rows:
+                pieces.extend(
+                    cell.text + "\n" for cell in row.cells if cell.text.strip()
+                )
+        return "".join(pieces).strip()
+    except ImportError:
+        raise ImportError("python-docx is required for DOCX support. Install with: pip install kssrag[office]")
+    except Exception as e:
+        logger.error(f"Failed to load DOCX file: {str(e)}")
+        raise
+def load_excel_file(file_path: str) -> str:
+    """Return the text of an Excel workbook, one section per sheet.
+
+    Each sheet contributes a ``Sheet: <name>`` header followed by its rows,
+    with cell values joined by `` | `` and ``None`` cells rendered as empty
+    strings. Rows that render as whitespace only are dropped.
+
+    Raises:
+        ImportError: openpyxl is not installed.
+        Exception: any other failure is logged and re-raised unchanged.
+    """
+    try:
+        import openpyxl
+        book = openpyxl.load_workbook(file_path)
+        pieces = []
+        for name in book.sheetnames:
+            pieces.append(f"Sheet: {name}\n")
+            for values in book[name].iter_rows(values_only=True):
+                rendered = " | ".join(
+                    "" if value is None else str(value) for value in values
+                )
+                if rendered.strip():
+                    pieces.append(rendered + "\n")
+            # Blank line between sheets.
+            pieces.append("\n")
+        return "".join(pieces).strip()
+    except ImportError:
+        raise ImportError("openpyxl is required for Excel support. Install with: pip install kssrag[office]")
+    except Exception as e:
+        logger.error(f"Failed to load Excel file: {str(e)}")
+        raise
+def load_pptx_file(file_path: str) -> str:
+    """Return the text of a PowerPoint deck, one section per slide.
+
+    Each slide contributes a ``Slide <n>:`` header (numbered from 1) followed
+    by the text of every shape that exposes a non-blank ``text`` attribute.
+
+    Raises:
+        ImportError: python-pptx is not installed.
+        Exception: any other failure is logged and re-raised unchanged.
+    """
+    try:
+        from pptx import Presentation
+        deck = Presentation(file_path)
+        pieces = []
+        for index, slide in enumerate(deck.slides, start=1):
+            pieces.append(f"Slide {index}:\n")
+            pieces.extend(
+                shape.text + "\n"
+                for shape in slide.shapes
+                if hasattr(shape, "text") and shape.text.strip()
+            )
+            # Blank line between slides.
+            pieces.append("\n")
+        return "".join(pieces).strip()
+    except ImportError:
+        raise ImportError("python-pptx is required for PowerPoint support. Install with: pip install kssrag[office]")
+    except Exception as e:
+        logger.error(f"Failed to load PowerPoint file: {str(e)}")
+        raise
 def load_document(file_path: str) -> str:
-    """Load document from file (supports .txt)"""
+    """Load document from file with auto-format detection"""
     if file_path.endswith('.txt'):
         return load_txt_file(file_path)
+    elif file_path.endswith('.docx'):
+        return load_docx_file(file_path)
+    elif file_path.endswith(('.xlsx', '.xls')):
+        return load_excel_file(file_path)
+    elif file_path.endswith('.pptx'):
+        return load_pptx_file(file_path)
     else:
         raise ValueError(f"Unsupported file type: {file_path}")
 def load_json_documents(file_path: str, metadata_field: str = "name") -> List[Dict[str, Any]]:
-    """Load documents from JSON file (like your drug data)"""
+    """Load documents from JSON file"""
     data = load_json_file(file_path)
     # Apply limit for testing if specified

kssrag/utils/helpers.py CHANGED Viewed

@@ -8,15 +8,23 @@ logging.basicConfig(
 )
 logger = logging.getLogger("KSSRAG")
+# Initialize as None - will be set when actually needed
+FAISS_AVAILABLE = None
+FAISS_AVX_TYPE = None
-def setup_faiss():
-    """Handle FAISS initialization with proper error handling and fallbacks"""
+def setup_faiss(vector_store_type: str = None):
+    """Handle FAISS initialization - only when explicitly called"""
+    global FAISS_AVAILABLE, FAISS_AVX_TYPE
+    # If already initialized, return cached values
+    if FAISS_AVAILABLE is not None:
+        return FAISS_AVAILABLE, FAISS_AVX_TYPE
     faiss_available = False
-    faiss_avx_type = "standard"
+    faiss_avx_type = "not_loaded"
-    # Only try to import FAISS if it's actually needed
-    from ..config import config
-    if config.VECTOR_STORE_TYPE in ["faiss", "hybrid_online"]:
+    # Only load FAISS if explicitly using FAISS-based stores
+    if vector_store_type in ["faiss", "hybrid_online"]:
         try:
             # Try different FAISS versions in order of preference
             faiss_import_attempts = [
@@ -29,8 +37,6 @@ def setup_faiss():
             for avx_type, import_path in faiss_import_attempts:
                 try:
                     logger.info(f"Loading faiss with {avx_type} support.")
-                    # Dynamic import
-                    import importlib
                     faiss_module = importlib.import_module(import_path)
                     # Make the FAISS symbols available globally
                     globals().update({name: getattr(faiss_module, name) for name in dir(faiss_module) if not name.startswith('_')})
@@ -41,7 +47,7 @@ def setup_faiss():
                     break
                 except ImportError as e:
-                    logger.info(f"Could not load library with {avx_type} support due to: {repr(e)}")
+                    logger.debug(f"Could not load library with {avx_type} support: {e}")
                     continue
             if not faiss_available:
@@ -50,30 +56,34 @@ def setup_faiss():
         except Exception as e:
             logger.error(f"Failed to initialize FAISS: {str(e)}")
             faiss_available = False
+    else:
+        # Not using FAISS, don't load it
+        logger.debug(f"Skipping FAISS initialization for vector store: {vector_store_type}")
+    # Cache the results
+    FAISS_AVAILABLE = faiss_available
+    FAISS_AVX_TYPE = faiss_avx_type
     return faiss_available, faiss_avx_type
-# Initialize FAISS only when needed
-FAISS_AVAILABLE, FAISS_AVX_TYPE = setup_faiss()
+def validate_config():
+    """Warn when OPENROUTER_API_KEY is unset; always return True.
+
+    FAISS is deliberately not loaded here. An ImportError while reading the
+    config is ignored so validation never blocks start-up.
+    """
+    try:
+        from ..config import config
+        api_key_missing = not config.OPENROUTER_API_KEY
+        if api_key_missing:
+            logger.warning("OPENROUTER_API_KEY not set. LLM functionality will not work.")
+    except ImportError:
+        # Config not available, continue anyway
+        pass
+    return True
 # Your signature in the code
 def kss_signature():
     return "Built with HATE by Ksschkw (github.com/Ksschkw)"
-def validate_config():
-    """Validate the configuration"""
-    from ..config import config
-    if not config.OPENROUTER_API_KEY:
-        logger.warning("OPENROUTER_API_KEY not set. LLM functionality will not work.")
-    if config.VECTOR_STORE_TYPE in ["faiss", "hybrid_online"] and not FAISS_AVAILABLE:
-        logger.warning(f"FAISS not available. Falling back to HYBRID_OFFLINE vector store.")
-        config.VECTOR_STORE_TYPE = "hybrid_offline"
-    return True
 def import_custom_component(import_path: str):
     """Import a custom component from a string path"""
     try:
@@ -83,3 +93,6 @@ def import_custom_component(import_path: str):
     except (ImportError, AttributeError, ValueError) as e:
         logger.error(f"Failed to import custom component {import_path}: {str(e)}")
         raise
+# Remove the auto-initialization at module level
+# FAISS will now only load when explicitly called by vector stores that need it

kssrag/utils/ocr.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""
+OCR utilities for KSS RAG.
+Requires extra dependencies: `paddleocr`, `paddlepaddle`, `pytesseract`, `Pillow`.
+Install via: pip install kssrag[ocr]
+"""
+# Import every OCR backend up front so that a missing optional dependency
+# surfaces as one ImportError carrying the install hint; the original
+# error is kept as the cause.
+try:
+    import pytesseract
+    from paddleocr import PaddleOCR
+    from PIL import Image
+except ImportError as e:
+    raise ImportError(
+        "OCR functionality requires extra dependencies. "
+        "Install with: pip install kssrag[ocr]"
+    ) from e
+# Initialize PaddleOCR (handwritten text)
+# Created once when this module is imported and reused by ocr_paddle().
+_paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en")
+def ocr_tesseract(image_path: str) -> str:
+    """OCR for typed text using Tesseract.
+
+    Args:
+        image_path: Path of the image file to read.
+
+    Returns:
+        The recognized text with surrounding whitespace stripped.
+    """
+    # The context manager closes the file handle opened by Image.open once
+    # Tesseract has consumed the image.
+    with Image.open(image_path) as img:
+        text = pytesseract.image_to_string(img)
+    return text.strip()
+def ocr_paddle(image_path: str) -> str:
+    """OCR for handwritten text using PaddleOCR.
+
+    Args:
+        image_path: Path of the image file to read.
+
+    Returns:
+        The recognized fragments joined by single spaces, stripped; an empty
+        string when nothing was recognized.
+    """
+    results = _paddle_ocr.ocr(image_path, cls=True)
+    fragments = []
+    for page in results or []:
+        # NOTE(review): presumably an image without detections yields a
+        # None/empty page, as guarded in OCRLoader.ocr_paddle; skip it
+        # instead of failing on iteration.
+        if not page:
+            continue
+        for _, (txt, _) in page:
+            fragments.append(txt)
+    return " ".join(fragments).strip()
+def extract_text_from_image(image_path: str, mode: str = "typed") -> str:
+    """
+    Run OCR on an image with the engine selected by ``mode``.
+    'typed' uses Tesseract, 'handwritten' uses PaddleOCR; any other value
+    raises ValueError.
+    """
+    if mode not in ("typed", "handwritten"):
+        raise ValueError("Invalid OCR mode. Choose 'typed' or 'handwritten'.")
+    engine = ocr_paddle if mode == "handwritten" else ocr_tesseract
+    return engine(image_path)

kssrag/utils/ocr_loader.py ADDED Viewed

@@ -0,0 +1,151 @@
+import os
+import cv2
+import pytesseract
+from paddleocr import PaddleOCR
+from pathlib import Path
+from PIL import Image
+from .helpers import logger
+class OCRLoader:
+    """Production OCR handler with PaddleOCR (handwritten) and Tesseract (typed)"""
+
+    def __init__(self):
+        # Stays None when PaddleOCR cannot be initialized; ocr_paddle()
+        # reports that case, and ocr_tesseract() does not use this attribute.
+        self.paddle_ocr = None
+        self._initialize_paddle_ocr()
+
+    def _initialize_paddle_ocr(self):
+        """Initialize PaddleOCR, preferring bundled models over the defaults.
+
+        The bundled detection/recognition model directories are used only
+        when both contain an ``inference.yml``. Otherwise, or when that
+        initialization fails, the default PaddleOCR setup is tried once. If
+        that fails too, the error is logged and ``self.paddle_ocr`` stays
+        ``None`` so the loader can still be constructed.
+        """
+        models_root = Path(__file__).parent.parent / 'paddle_models' / 'models'
+        det_model_dir = str(models_root / 'ppocrv5_server_det')
+        rec_model_dir = str(models_root / 'ppocrv5_server_rec')
+        # Only probe for the model files; nothing is created inside the
+        # installed package.
+        custom_dirs_valid = (
+            os.path.exists(os.path.join(det_model_dir, 'inference.yml')) and
+            os.path.exists(os.path.join(rec_model_dir, 'inference.yml'))
+        )
+        if custom_dirs_valid:
+            try:
+                self.paddle_ocr = PaddleOCR(
+                    det_model_dir=det_model_dir,
+                    rec_model_dir=rec_model_dir,
+                    use_angle_cls=True,
+                    lang="en"
+                )
+                logger.info("PaddleOCR initialized successfully with custom model directories")
+                return
+            except Exception as e:
+                logger.warning(f"PaddleOCR initialization failed: {str(e)}. Using default initialization.")
+        else:
+            logger.info("Custom model directories not found, using default PaddleOCR initialization")
+        try:
+            self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en")
+            logger.info("PaddleOCR initialized successfully with default directories")
+        except Exception as e:
+            # Retrying the same default call cannot help; leave the engine
+            # unset and let ocr_paddle() raise when it is actually needed.
+            logger.error(f"PaddleOCR initialization failed: {str(e)}")
+            self.paddle_ocr = None
+
+    def ocr_tesseract(self, image_path: str) -> str:
+        """OCR for typed text using Tesseract with error handling.
+
+        Returns:
+            The recognized text, stripped (possibly empty; a warning is
+            logged in that case).
+
+        Raises:
+            FileNotFoundError: ``image_path`` does not exist.
+            RuntimeError: any other failure while opening or reading the image.
+        """
+        try:
+            if not os.path.exists(image_path):
+                raise FileNotFoundError(f"Image file not found: {image_path}")
+            # Context manager closes the file handle opened by Image.open.
+            with Image.open(image_path) as img:
+                text = pytesseract.image_to_string(img)
+            if not text.strip():
+                logger.warning(f"Tesseract extracted no text from {image_path}")
+            return text.strip()
+        except FileNotFoundError:
+            # Re-raise FileNotFoundError directly
+            raise
+        except Exception as e:
+            logger.error(f"Tesseract OCR failed for {image_path}: {str(e)}")
+            raise RuntimeError(f"Tesseract OCR failed: {str(e)}") from e
+
+    def ocr_paddle(self, image_path: str) -> str:
+        """OCR for handwritten text using PaddleOCR with error handling.
+
+        Returns:
+            The recognized fragments joined by single spaces (possibly empty;
+            a warning is logged in that case).
+
+        Raises:
+            FileNotFoundError: ``image_path`` does not exist.
+            RuntimeError: PaddleOCR is not initialized, the image cannot be
+                read, or recognition fails.
+        """
+        if self.paddle_ocr is None:
+            raise RuntimeError("PaddleOCR not initialized. OCR functionality unavailable.")
+        try:
+            if not os.path.exists(image_path):
+                raise FileNotFoundError(f"Image file not found: {image_path}")
+            img = cv2.imread(image_path)
+            if img is None:
+                raise ValueError(f"Could not read image at {image_path}")
+            result = self.paddle_ocr.ocr(img, cls=True)
+            lines = []
+            # Only the first result page is read; it may be missing or empty.
+            if result and result[0]:
+                for line in result[0]:
+                    if not line or len(line) < 2:
+                        continue
+                    # line[1] is expected to be a sequence whose first item
+                    # is the text; anything else is ignored.
+                    payload = line[1]
+                    text_content = payload[0] if isinstance(payload, (list, tuple)) and len(payload) > 0 else ""
+                    if text_content:
+                        lines.append(text_content)
+            extracted_text = " ".join(lines).strip()
+            if not extracted_text:
+                logger.warning(f"PaddleOCR extracted no text from {image_path}")
+            return extracted_text
+        except FileNotFoundError:
+            # Re-raise FileNotFoundError directly
+            raise
+        except Exception as e:
+            logger.error(f"PaddleOCR failed for {image_path}: {str(e)}")
+            raise RuntimeError(f"PaddleOCR failed: {str(e)}") from e
+
+    def extract_text(self, image_path: str, mode: str = "typed") -> str:
+        """Extract text from image using specified OCR engine.
+
+        Raises:
+            ValueError: ``mode`` is neither 'typed' nor 'handwritten'.
+        """
+        if mode not in ["typed", "handwritten"]:
+            raise ValueError(f"Invalid OCR mode: {mode}. Must be 'typed' or 'handwritten'")
+        if mode == "handwritten":
+            return self.ocr_paddle(image_path)
+        return self.ocr_tesseract(image_path)

kssrag 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

kssrag 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl