PyPI - ara-cli - Versions diffs - 0.1.10.5__py3-none-any.whl → 0.1.14.0__py3-none-any.whl - Mend

ara-cli 0.1.10.5__py3-none-any.whl → 0.1.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (151) hide show

ara_cli/file_loaders/loaders/text_file_loader.py ADDED Viewed

@@ -0,0 +1,47 @@
+import os
+import re
+import base64
+import tempfile
+from typing import Optional, Tuple
+import requests
+from charset_normalizer import from_path
+from ara_cli.file_loaders.file_loader import FileLoader
+from ara_cli.file_loaders.readers.markdown_reader import MarkdownReader
+class TextFileLoader(FileLoader):
+    """Loads text files"""
+    def load(
+        self,
+        file_path: str,
+        prefix: str = "",
+        suffix: str = "",
+        block_delimiter: str = "",
+        extract_images: bool = False,
+        **kwargs,
+    ) -> bool:
+        """Load text file with optional markdown image extraction"""
+        is_md_file = file_path.lower().endswith(".md")
+        if is_md_file and extract_images:
+            reader = MarkdownReader(file_path)
+            file_content = reader.read(extract_images=True).replace("\r\n", "\n")
+        else:
+            # Use charset-normalizer to detect encoding
+            encoded_content = from_path(file_path).best()
+            if not encoded_content:
+                print(f"Failed to detect encoding for {file_path}")
+                return False
+            file_content = str(encoded_content).replace("\r\n", "\n")
+        if block_delimiter:
+            file_content = f"{block_delimiter}\n{file_content}\n{block_delimiter}"
+        write_content = f"{prefix}{file_content}{suffix}\n"
+        with open(self.chat.chat_name, "a", encoding="utf-8") as chat_file:
+            chat_file.write(write_content)
+        return True

ara_cli/file_loaders/readers/__init__.py ADDED Viewed

File without changes

ara_cli/file_loaders/readers/docx_reader.py ADDED Viewed

@@ -0,0 +1,49 @@
+from ara_cli.file_loaders.document_reader import DocumentReader
+class DocxReader(DocumentReader):
+    """Reader for DOCX files"""
+    def read(self, extract_images: bool = False) -> str:
+        import docx
+        doc = docx.Document(self.file_path)
+        text_content = '\n'.join(para.text for para in doc.paragraphs)
+        if not extract_images:
+            return text_content
+        from PIL import Image
+        import io
+        # Create data directory for images
+        images_dir = self.create_image_data_dir("docx")
+        # Extract and process images
+        image_descriptions = []
+        image_counter = 1
+        for rel in doc.part.rels.values():
+            if "image" in rel.reltype:
+                image_data = rel.target_part.blob
+                # Determine image format
+                image = Image.open(io.BytesIO(image_data))
+                image_format = image.format.lower()
+                # Save and describe image
+                relative_path, description = self.save_and_describe_image(
+                    image_data, image_format, images_dir, image_counter
+                )
+                # Add formatted description to list
+                image_description = f"\nImage: {relative_path}\n[{description}]\n"
+                image_descriptions.append(image_description)
+                image_counter += 1
+        # Combine text content with image descriptions
+        if image_descriptions:
+            text_content += "\n\n### Extracted Images\n" + \
+                "\n".join(image_descriptions)
+        return text_content

ara_cli/file_loaders/readers/excel_reader.py ADDED Viewed

@@ -0,0 +1,27 @@
+from ara_cli.file_loaders.document_reader import DocumentReader
+class ExcelReader(DocumentReader):
+    """Reader for Excel files"""
+    def read(self, extract_images: bool = False) -> str:
+        import pandas as pd
+        try:
+            # Read all sheets
+            sheets_dict = pd.read_excel(self.file_path, sheet_name=None)
+            markdown_output = []
+            for sheet_name, df in sheets_dict.items():
+                markdown_output.append(f"### Sheet: {sheet_name}")
+                if df.empty:
+                    markdown_output.append("_Empty Sheet_")
+                else:
+                    # Convert to markdown, managing NaN values
+                    markdown_table = df.fillna("").to_markdown(index=False)
+                    markdown_output.append(markdown_table)
+                markdown_output.append("")  # Add empty line between sheets
+            return "\n".join(markdown_output)
+        except Exception as e:
+            return f"Error reading Excel file: {str(e)}"

ara_cli/file_loaders/{markdown_reader.py → readers/markdown_reader.py} RENAMED Viewed

@@ -2,7 +2,7 @@ import os
 import re
 from typing import Optional
 from charset_normalizer import from_path
-from ara_cli.file_loaders.image_processor import ImageProcessor
+from ara_cli.file_loaders.tools.image_processor import ImageProcessor
 class MarkdownReader:

ara_cli/file_loaders/readers/odt_reader.py ADDED Viewed

@@ -0,0 +1,59 @@
+from ara_cli.file_loaders.document_reader import DocumentReader
+class OdtReader(DocumentReader):
+    """Reader for ODT files"""
+    def read(self, extract_images: bool = False) -> str:
+        import pymupdf4llm
+        if not extract_images:
+            return pymupdf4llm.to_markdown(self.file_path, write_images=False)
+        import zipfile
+        from PIL import Image
+        import io
+        # Create data directory for images
+        images_dir = self.create_image_data_dir("odt")
+        # Get text content
+        text_content = pymupdf4llm.to_markdown(
+            self.file_path, write_images=False)
+        # Extract and process images from ODT
+        image_descriptions = []
+        image_counter = 1
+        try:
+            with zipfile.ZipFile(self.file_path, 'r') as odt_zip:
+                # List all files in the Pictures directory
+                picture_files = [
+                    f for f in odt_zip.namelist() if f.startswith('Pictures/')]
+                for picture_file in picture_files:
+                    # Extract image data
+                    image_data = odt_zip.read(picture_file)
+                    # Determine image format
+                    image = Image.open(io.BytesIO(image_data))
+                    image_format = image.format.lower()
+                    # Save and describe image
+                    relative_path, description = self.save_and_describe_image(
+                        image_data, image_format, images_dir, image_counter
+                    )
+                    # Add formatted description to list
+                    image_description = f"\nImage: {relative_path}\n[{description}]\n"
+                    image_descriptions.append(image_description)
+                    image_counter += 1
+        except Exception as e:
+            print(f"Warning: Could not extract images from ODT: {e}")
+        # Combine text content with image descriptions
+        if image_descriptions:
+            text_content += "\n\n### Extracted Images\n" + \
+                "\n".join(image_descriptions)
+        return text_content

ara_cli/file_loaders/readers/pdf_reader.py ADDED Viewed

@@ -0,0 +1,54 @@
+from ara_cli.file_loaders.document_reader import DocumentReader
+class PdfReader(DocumentReader):
+    """Reader for PDF files"""
+    def read(self, extract_images: bool = False) -> str:
+        import pymupdf4llm
+        if not extract_images:
+            return pymupdf4llm.to_markdown(self.file_path, write_images=False)
+        import fitz  # PyMuPDF
+        # Create images directory
+        images_dir = self.create_image_data_dir("pdf")
+        # Extract text without images first
+        text_content = pymupdf4llm.to_markdown(
+            self.file_path, write_images=False)
+        # Extract and process images
+        doc = fitz.open(self.file_path)
+        image_descriptions = []
+        image_counter = 1
+        for page_num, page in enumerate(doc):
+            image_list = page.get_images()
+            for img_index, img in enumerate(image_list):
+                # Extract image
+                xref = img[0]
+                base_image = doc.extract_image(xref)
+                image_bytes = base_image["image"]
+                image_ext = base_image["ext"]
+                # Save and describe image
+                relative_path, description = self.save_and_describe_image(
+                    image_bytes, image_ext, images_dir, image_counter
+                )
+                # Add formatted description to list
+                image_description = f"\nImage: {relative_path}\n[{description}]\n"
+                image_descriptions.append(image_description)
+                image_counter += 1
+        doc.close()
+        # Combine text content with image descriptions
+        if image_descriptions:
+            text_content += "\n\n### Extracted Images\n" + \
+                "\n".join(image_descriptions)
+        return text_content

ara_cli/file_loaders/readers/pptx_reader.py ADDED Viewed

@@ -0,0 +1,104 @@
+from ara_cli.file_loaders.document_reader import DocumentReader
+class PptxReader(DocumentReader):
+    """Reader for PowerPoint files"""
+    @staticmethod
+    def _getActionImage(shape, MSO_SHAPE_TYPE):
+        try:
+            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                return shape.image.blob, shape.image.ext
+            elif shape.is_placeholder and hasattr(shape, "image"):
+                return shape.image.blob, shape.image.ext
+        except Exception:
+            pass
+        return None, None
+    @staticmethod
+    def _get_shape_text(shape, slide):
+        if not shape.has_text_frame:
+            return []
+        lines = []
+        is_title = False
+        try:
+            if shape == slide.shapes.title:
+                is_title = True
+        except AttributeError:
+            pass
+        text_frame = shape.text_frame
+        if is_title:
+            lines.append(f"### {text_frame.text}")
+        else:
+            for paragraph in text_frame.paragraphs:
+                text = paragraph.text.strip()
+                if text:
+                    lines.append(f"- {text}")
+        return lines
+    def read(self, extract_images: bool = False) -> str:
+        from pptx import Presentation
+        from pptx.enum.shapes import MSO_SHAPE_TYPE
+        import io
+        try:
+            prs = Presentation(self.file_path)
+            md_lines = []
+            # Prepare image extraction if requested
+            images_dir = None
+            image_counter = 1
+            image_descriptions = []
+            if extract_images:
+                images_dir = self.create_image_data_dir("pptx")
+            def process_shape(shape):
+                # Recursive function to handle groups and extract images
+                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    for sub_shape in shape.shapes:
+                        process_shape(sub_shape)
+                    return
+                # Text extraction
+                md_lines.extend(self._get_shape_text(shape, slide))
+                # Image extraction
+                if extract_images:
+                    blob, ext = self._getActionImage(shape, MSO_SHAPE_TYPE)
+                    if blob and ext:
+                        try:
+                            nonlocal image_counter
+                            relative_path, description = self.save_and_describe_image(
+                                blob, ext, images_dir, image_counter
+                            )
+                            image_desc_text = (
+                                f"\nImage: {relative_path}\n[{description}]\n"
+                            )
+                            md_lines.append(image_desc_text)
+                            image_descriptions.append(image_desc_text)
+                            image_counter += 1
+                        except Exception as img_err:
+                            print(
+                                f"Warning: Failed to extract image from slide {index+1}: {img_err}"
+                            )
+            for index, slide in enumerate(prs.slides):
+                md_lines.append(f"## Slide {index + 1}")
+                # Collect shapes and sort by top position
+                shapes = sorted(
+                    [s for s in slide.shapes], key=lambda x: (x.top or 0, x.left or 0)
+                )
+                for shape in shapes:
+                    process_shape(shape)
+                md_lines.append("\n---\n")
+            return "\n".join(md_lines)
+        except Exception as e:
+            return f"Error reading PowerPoint file: {str(e)}"

ara_cli/file_loaders/tools/__init__.py ADDED Viewed

File without changes

ara_cli/llm_utils.py ADDED Viewed

@@ -0,0 +1,58 @@
+from ara_cli.ara_config import ConfigManager
+from pydantic_ai import Agent
+FALLBACK_MODEL = "anthropic:claude-4-sonnet-20250514"
+def get_configured_conversion_llm_model() -> str:
+    """
+    Retrieves the configured conversion LLM model string, adapted for pydantic_ai.
+    Falls back to a default model if configuration is missing or invalid.
+    """
+    model_name = FALLBACK_MODEL
+    try:
+        config = ConfigManager.get_config()
+        conversion_llm_key = config.conversion_llm
+        if conversion_llm_key and conversion_llm_key in config.llm_config:
+            llm_config_item = config.llm_config[conversion_llm_key]
+            raw_model_name = llm_config_item.model
+            # Adapt LiteLLM model string to PydanticAI format
+            # LiteLLM: provider/model-name (e.g. openai/gpt-4o)
+            # PydanticAI: provider:model-name (e.g. openai:gpt-4o)
+            if "/" in raw_model_name and ":" not in raw_model_name:
+                parts = raw_model_name.split("/", 1)
+                if len(parts) == 2:
+                    model_name = f"{parts[0]}:{parts[1]}"
+                else:
+                    model_name = raw_model_name
+            else:
+                model_name = raw_model_name
+        else:
+            print(
+                f"Warning: Conversion LLM configuration issue. Using fallback model: {FALLBACK_MODEL}"
+            )
+    except Exception as e:
+        print(
+            f"Warning: Error resolving LLM config ({e}). Using fallback model: {FALLBACK_MODEL}"
+        )
+        model_name = FALLBACK_MODEL
+    return model_name
+def create_pydantic_ai_agent(
+    output_type, model_name: str = None, instrument: bool = True
+) -> Agent:
+    """
+    Creates a pydantic_ai Agent with the specified or configured model.
+    """
+    if not model_name:
+        model_name = get_configured_conversion_llm_model()
+    return Agent(
+        model=model_name,
+        output_type=output_type,
+        instrument=instrument,
+    )

ara_cli/output_suppressor.py CHANGED Viewed

@@ -15,3 +15,56 @@ def suppress_stdout(suppress=False):
                 sys.stdout = old_stdout
     else:
         yield
+@contextmanager
+def suppress_stderr():
+    """Suppress stderr output - useful for hiding library debug/error messages."""
+    with open(os.devnull, "w", encoding="utf-8") as devnull:
+        old_stderr = sys.stderr
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            sys.stderr = old_stderr
+class FilteredStdout:
+    """A stdout wrapper that filters out specific unwanted messages."""
+    FILTERED_PATTERNS = [
+        "Provider List: https://docs.litellm.ai/docs/providers",
+    ]
+    def __init__(self, original_stdout):
+        self.original_stdout = original_stdout
+    def write(self, text):
+        # Check if text contains any filtered patterns
+        for pattern in self.FILTERED_PATTERNS:
+            if pattern in text:
+                return  # Suppress this output
+        self.original_stdout.write(text)
+    def flush(self):
+        self.original_stdout.flush()
+    def __getattr__(self, name):
+        return getattr(self.original_stdout, name)
+@contextmanager
+def filter_unwanted_output():
+    """Filter out unwanted stdout messages and suppress stderr."""
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr
+    sys.stdout = FilteredStdout(old_stdout)
+    with open(os.devnull, "w", encoding="utf-8") as devnull:
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr

ara_cli/prompt_chat.py CHANGED Viewed

@@ -6,9 +6,18 @@ from ara_cli.update_config_prompt import update_artefact_config_prompt_files
 from ara_cli.output_suppressor import suppress_stdout
-def initialize_prompt_chat_mode(classifier, param, chat_name, reset=None, output_mode=False, append_strings=[], restricted=False):
+def initialize_prompt_chat_mode(
+    classifier,
+    param,
+    chat_name,
+    reset=None,
+    output_mode=False,
+    append_strings=[],
+    restricted=False,
+):
     sub_directory = Classifier.get_sub_directory(classifier)
-    artefact_data_path = os.path.join("ara", sub_directory, f"{param}.data") # f"ara/{sub_directory}/{parameter}.data"
+    # f"ara/{sub_directory}/{parameter}.data"
+    artefact_data_path = os.path.join("ara", sub_directory, f"{param}.data")
     if chat_name is None:
         chat_name = classifier
@@ -17,11 +26,18 @@ def initialize_prompt_chat_mode(classifier, param, chat_name, reset=None, output
         update_artefact_config_prompt_files(classifier, param, automatic_update=True)
     classifier_chat_file = os.path.join(artefact_data_path, f"{chat_name}")
-    start_chat_session(classifier_chat_file, reset, output_mode, append_strings, restricted)
+    start_chat_session(
+        classifier_chat_file, reset, output_mode, append_strings, restricted
+    )
 def start_chat_session(chat_file, reset, output_mode, append_strings, restricted):
     with suppress_stdout(suppress=output_mode):
-        chat = Chat(chat_file, reset=reset) if not restricted else Chat(chat_file, reset=reset, enable_commands=whitelisted_commands)
+        chat = (
+            Chat(chat_file, reset=reset)
+            if not restricted
+            else Chat(chat_file, reset=reset, enable_commands=whitelisted_commands)
+        )
     if append_strings:
         chat.append_strings(append_strings)
     if output_mode:

ara-cli 0.1.10.5__py3-none-any.whl → 0.1.14.0__py3-none-any.whl

ara-cli 0.1.10.5__py3-none-any.whl → 0.1.14.0__py3-none-any.whl