chunkr-ai 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/base.py +34 -122
 - chunkr_ai/api/chunkr.py +63 -4
 - chunkr_ai/api/chunkr_async.py +43 -4
 - chunkr_ai/api/config.py +24 -24
 - chunkr_ai/api/misc.py +106 -0
 - chunkr_ai/api/task.py +65 -11
 - chunkr_ai/main.py +12 -0
 - chunkr_ai/models.py +0 -1
 - {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/METADATA +1 -1
 - chunkr_ai-0.0.8.dist-info/RECORD +18 -0
 - chunkr_ai-0.0.6.dist-info/RECORD +0 -17
 - {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/LICENSE +0 -0
 - {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/WHEEL +0 -0
 - {chunkr_ai-0.0.6.dist-info → chunkr_ai-0.0.8.dist-info}/top_level.txt +0 -0
 
    
        chunkr_ai/api/base.py
    CHANGED
    
    | 
         @@ -3,13 +3,10 @@ from .task import TaskResponse 
     | 
|
| 
       3 
3 
     | 
    
         
             
            from .auth import HeadersMixin
         
     | 
| 
       4 
4 
     | 
    
         
             
            from abc import abstractmethod
         
     | 
| 
       5 
5 
     | 
    
         
             
            from dotenv import load_dotenv
         
     | 
| 
       6 
     | 
    
         
            -
            import io
         
     | 
| 
       7 
     | 
    
         
            -
            import json
         
     | 
| 
       8 
6 
     | 
    
         
             
            import os
         
     | 
| 
       9 
7 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
       10 
8 
     | 
    
         
             
            from PIL import Image
         
     | 
| 
       11 
     | 
    
         
            -
            import requests
     | 
| 
       12 
     | 
    
         
            -
            from typing import BinaryIO, Tuple, Union
         
     | 
| 
      
 9 
     | 
    
         
            +
            from typing import BinaryIO, Union
         
     | 
| 
       13 
10 
     | 
    
         | 
| 
       14 
11 
     | 
    
         
             
            class ChunkrBase(HeadersMixin):
         
     | 
| 
       15 
12 
     | 
    
         
             
                """Base class with shared functionality for Chunkr API clients."""
         
     | 
| 
         @@ -30,140 +27,38 @@ class ChunkrBase(HeadersMixin): 
     | 
|
| 
       30 
27 
     | 
    
         | 
| 
       31 
28 
     | 
    
         
             
                    self.url = self.url.rstrip("/")
         
     | 
| 
       32 
29 
     | 
    
         | 
| 
       33 
     | 
    
         
            -
                 
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
       35 
     | 
    
         
            -
                    file 
     | 
| 
       36 
     | 
    
         
            -
                ) -> Tuple[str, BinaryIO]:
         
     | 
| 
       37 
     | 
    
         
            -
                    """Convert various file types into a tuple of (filename, file-like object).
         
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
                    Args:
         
     | 
| 
       40 
     | 
    
         
            -
                        file: Input file, can be:
         
     | 
| 
       41 
     | 
    
         
            -
                            - String or Path to a file
         
     | 
| 
       42 
     | 
    
         
            -
                            - URL string starting with http:// or https://
         
     | 
| 
       43 
     | 
    
         
            -
                            - Base64 string
         
     | 
| 
       44 
     | 
    
         
            -
                            - Opened binary file (mode='rb')
         
     | 
| 
       45 
     | 
    
         
            -
                            - PIL/Pillow Image object
         
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
       47 
     | 
    
         
            -
                    Returns:
         
     | 
| 
       48 
     | 
    
         
            -
                        Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
         
     | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
       50 
     | 
    
         
            -
                    Raises:
         
     | 
| 
       51 
     | 
    
         
            -
                        FileNotFoundError: If the file path doesn't exist
         
     | 
| 
       52 
     | 
    
         
            -
                        TypeError: If the file type is not supported
         
     | 
| 
       53 
     | 
    
         
            -
                        ValueError: If the URL is invalid or unreachable
         
     | 
| 
       54 
     | 
    
         
            -
                        ValueError: If the MIME type is unsupported
         
     | 
| 
       55 
     | 
    
         
            -
                    """
         
     | 
| 
       56 
     | 
    
         
            -
                    # Handle URLs
         
     | 
| 
       57 
     | 
    
         
            -
                    if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
         
     | 
| 
       58 
     | 
    
         
            -
                        response = requests.get(file)
         
     | 
| 
       59 
     | 
    
         
            -
                        response.raise_for_status()
         
     | 
| 
       60 
     | 
    
         
            -
                        file_obj = io.BytesIO(response.content)
         
     | 
| 
       61 
     | 
    
         
            -
                        filename = Path(file.split('/')[-1]).name or 'downloaded_file'
         
     | 
| 
       62 
     | 
    
         
            -
                        return filename, file_obj
         
     | 
| 
       63 
     | 
    
         
            -
             
     | 
| 
       64 
     | 
    
         
            -
                    # Handle base64 strings
         
     | 
| 
       65 
     | 
    
         
            -
                    if isinstance(file, str) and ',' in file and ';base64,' in file:
         
     | 
| 
       66 
     | 
    
         
            -
                        try:
         
     | 
| 
       67 
     | 
    
         
            -
                            # Split header and data
         
     | 
| 
       68 
     | 
    
         
            -
                            header, base64_data = file.split(',', 1)
         
     | 
| 
       69 
     | 
    
         
            -
                            import base64
         
     | 
| 
       70 
     | 
    
         
            -
                            file_bytes = base64.b64decode(base64_data)
         
     | 
| 
       71 
     | 
    
         
            -
                            file_obj = io.BytesIO(file_bytes)
         
     | 
| 
       72 
     | 
    
         
            -
                            
         
     | 
| 
       73 
     | 
    
         
            -
                            # Try to determine format from header
         
     | 
| 
       74 
     | 
    
         
            -
                            format = 'bin'
         
     | 
| 
       75 
     | 
    
         
            -
                            mime_type = header.split(':')[-1].split(';')[0].lower()
         
     | 
| 
       76 
     | 
    
         
            -
                            
         
     | 
| 
       77 
     | 
    
         
            -
                            # Map MIME types to file extensions
         
     | 
| 
       78 
     | 
    
         
            -
                            mime_to_ext = {
         
     | 
| 
       79 
     | 
    
         
            -
                                'application/pdf': 'pdf',
         
     | 
| 
       80 
     | 
    
         
            -
                                'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
         
     | 
| 
       81 
     | 
    
         
            -
                                'application/msword': 'doc',
         
     | 
| 
       82 
     | 
    
         
            -
                                'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
         
     | 
| 
       83 
     | 
    
         
            -
                                'application/vnd.ms-powerpoint': 'ppt',
         
     | 
| 
       84 
     | 
    
         
            -
                                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
         
     | 
| 
       85 
     | 
    
         
            -
                                'application/vnd.ms-excel': 'xls',
         
     | 
| 
       86 
     | 
    
         
            -
                                'image/jpeg': 'jpg',
         
     | 
| 
       87 
     | 
    
         
            -
                                'image/png': 'png',
         
     | 
| 
       88 
     | 
    
         
            -
                                'image/jpg': 'jpg'
         
     | 
| 
       89 
     | 
    
         
            -
                            }
         
     | 
| 
       90 
     | 
    
         
            -
                            
         
     | 
| 
       91 
     | 
    
         
            -
                            if mime_type in mime_to_ext:
         
     | 
| 
       92 
     | 
    
         
            -
                                format = mime_to_ext[mime_type]
         
     | 
| 
       93 
     | 
    
         
            -
                            else:
         
     | 
| 
       94 
     | 
    
         
            -
                                raise ValueError(f"Unsupported MIME type: {mime_type}")
         
     | 
| 
       95 
     | 
    
         
            -
             
     | 
| 
       96 
     | 
    
         
            -
                            return f"file.{format}", file_obj
         
     | 
| 
       97 
     | 
    
         
            -
                        except Exception as e:
         
     | 
| 
       98 
     | 
    
         
            -
                            raise ValueError(f"Invalid base64 string: {str(e)}")
         
     | 
| 
       99 
     | 
    
         
            -
             
     | 
| 
       100 
     | 
    
         
            -
                    # Handle file paths
         
     | 
| 
       101 
     | 
    
         
            -
                    if isinstance(file, (str, Path)):
         
     | 
| 
       102 
     | 
    
         
            -
                        path = Path(file).resolve()
         
     | 
| 
       103 
     | 
    
         
            -
                        if not path.exists():
         
     | 
| 
       104 
     | 
    
         
            -
                            raise FileNotFoundError(f"File not found: {file}")
         
     | 
| 
       105 
     | 
    
         
            -
                        return path.name, open(path, 'rb')
         
     | 
| 
       106 
     | 
    
         
            -
             
     | 
| 
       107 
     | 
    
         
            -
                    # Handle PIL Images
         
     | 
| 
       108 
     | 
    
         
            -
                    if isinstance(file, Image.Image):
         
     | 
| 
       109 
     | 
    
         
            -
                        img_byte_arr = io.BytesIO()
         
     | 
| 
       110 
     | 
    
         
            -
                        format = file.format or 'PNG'
         
     | 
| 
       111 
     | 
    
         
            -
                        file.save(img_byte_arr, format=format)
         
     | 
| 
       112 
     | 
    
         
            -
                        img_byte_arr.seek(0)
         
     | 
| 
       113 
     | 
    
         
            -
                        return f"image.{format.lower()}", img_byte_arr
         
     | 
| 
       114 
     | 
    
         
            -
             
     | 
| 
       115 
     | 
    
         
            -
                    # Handle file-like objects
         
     | 
| 
       116 
     | 
    
         
            -
                    if hasattr(file, 'read') and hasattr(file, 'seek'):
         
     | 
| 
       117 
     | 
    
         
            -
                        # Try to get the filename from the file object if possible
         
     | 
| 
       118 
     | 
    
         
            -
                        name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
         
     | 
| 
       119 
     | 
    
         
            -
                        return Path(name).name, file
         
     | 
| 
       120 
     | 
    
         
            -
             
     | 
| 
       121 
     | 
    
         
            -
                    raise TypeError(f"Unsupported file type: {type(file)}")
         
     | 
| 
       122 
     | 
    
         
            -
             
     | 
| 
       123 
     | 
    
         
            -
                def _prepare_upload_data(
         
     | 
| 
       124 
     | 
    
         
            -
                    self,
         
     | 
| 
       125 
     | 
    
         
            -
                    file: Union[str, Path, BinaryIO, Image.Image],
         
     | 
| 
       126 
     | 
    
         
            -
                    config: Configuration = None
         
     | 
| 
       127 
     | 
    
         
            -
                ) -> Tuple[dict, dict]:
         
     | 
| 
       128 
     | 
    
         
            -
                    """Prepare files and data dictionaries for upload.
         
     | 
| 
      
 30 
     | 
    
         
            +
                @abstractmethod
         
     | 
| 
      
 31 
     | 
    
         
            +
                def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
         
     | 
| 
      
 32 
     | 
    
         
            +
                    """Upload a file and wait for processing to complete.
         
     | 
| 
       129 
33 
     | 
    
         | 
| 
       130 
     | 
    
         
            -
                     
     | 
| 
       131 
     | 
    
         
            -
                        file: The file to upload
         
     | 
| 
       132 
     | 
    
         
            -
                        config: Optional configuration settings
         
     | 
| 
       133 
     | 
    
         
            -
                        
         
     | 
| 
       134 
     | 
    
         
            -
                    Returns:
         
     | 
| 
       135 
     | 
    
         
            -
                        Tuple[dict, dict]: (files dict, data dict) ready for upload
         
     | 
| 
      
 34 
     | 
    
         
            +
                    Must be implemented by subclasses.
         
     | 
| 
       136 
35 
     | 
    
         
             
                    """
         
     | 
| 
       137 
     | 
    
         
            -
                     
     | 
| 
       138 
     | 
    
         
            -
                    files = {"file": (filename, file_obj)}
         
     | 
| 
       139 
     | 
    
         
            -
                    data = {}
         
     | 
| 
       140 
     | 
    
         
            -
                    
         
     | 
| 
       141 
     | 
    
         
            -
                    if config:
         
     | 
| 
       142 
     | 
    
         
            -
                        config_dict = config.model_dump(mode="json", exclude_none=True)
         
     | 
| 
       143 
     | 
    
         
            -
                        for key, value in config_dict.items():
         
     | 
| 
       144 
     | 
    
         
            -
                            if isinstance(value, dict):
         
     | 
| 
       145 
     | 
    
         
            -
                                files[key] = (None, json.dumps(value), 'application/json')
         
     | 
| 
       146 
     | 
    
         
            -
                            else:
         
     | 
| 
       147 
     | 
    
         
            -
                                data[key] = value
         
     | 
| 
       148 
     | 
    
         
            -
                                
         
     | 
| 
       149 
     | 
    
         
            -
                    return files, data
         
     | 
| 
      
 36 
     | 
    
         
            +
                    pass
         
     | 
| 
       150 
37 
     | 
    
         | 
| 
       151 
38 
     | 
    
         
             
                @abstractmethod
         
     | 
| 
       152 
     | 
    
         
            -
                def  
     | 
| 
       153 
     | 
    
         
            -
                    """ 
     | 
| 
      
 39 
     | 
    
         
            +
                def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
         
     | 
| 
      
 40 
     | 
    
         
            +
                    """Update a task by its ID.
         
     | 
| 
       154 
41 
     | 
    
         | 
| 
       155 
42 
     | 
    
         
             
                    Must be implemented by subclasses.
         
     | 
| 
       156 
43 
     | 
    
         
             
                    """
         
     | 
| 
       157 
44 
     | 
    
         
             
                    pass
         
     | 
| 
       158 
45 
     | 
    
         | 
| 
       159 
46 
     | 
    
         
             
                @abstractmethod
         
     | 
| 
       160 
     | 
    
         
            -
                def  
     | 
| 
      
 47 
     | 
    
         
            +
                def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
         
     | 
| 
       161 
48 
     | 
    
         
             
                    """Upload a file for processing and immediately return the task response.
         
     | 
| 
       162 
49 
     | 
    
         | 
| 
       163 
50 
     | 
    
         
             
                    Must be implemented by subclasses.
         
     | 
| 
       164 
51 
     | 
    
         
             
                    """
         
     | 
| 
       165 
52 
     | 
    
         
             
                    pass
         
     | 
| 
       166 
53 
     | 
    
         | 
| 
      
 54 
     | 
    
         
            +
                @abstractmethod
         
     | 
| 
      
 55 
     | 
    
         
            +
                def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
         
     | 
| 
      
 56 
     | 
    
         
            +
                    """Update a task by its ID.
         
     | 
| 
      
 57 
     | 
    
         
            +
                    
         
     | 
| 
      
 58 
     | 
    
         
            +
                    Must be implemented by subclasses.
         
     | 
| 
      
 59 
     | 
    
         
            +
                    """
         
     | 
| 
      
 60 
     | 
    
         
            +
                    pass
         
     | 
| 
      
 61 
     | 
    
         
            +
                
         
     | 
| 
       167 
62 
     | 
    
         
             
                @abstractmethod
         
     | 
| 
       168 
63 
     | 
    
         
             
                def get_task(self, task_id: str) -> TaskResponse:
         
     | 
| 
       169 
64 
     | 
    
         
             
                    """Get a task response by its ID.
         
     | 
| 
         @@ -171,3 +66,20 @@ class ChunkrBase(HeadersMixin): 
     | 
|
| 
       171 
66 
     | 
    
         
             
                    Must be implemented by subclasses.
         
     | 
| 
       172 
67 
     | 
    
         
             
                    """
         
     | 
| 
       173 
68 
     | 
    
         
             
                    pass
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
                @abstractmethod
         
     | 
| 
      
 71 
     | 
    
         
            +
                def delete_task(self, task_id: str) -> None:
         
     | 
| 
      
 72 
     | 
    
         
            +
                    """Delete a task by its ID.
         
     | 
| 
      
 73 
     | 
    
         
            +
                    
         
     | 
| 
      
 74 
     | 
    
         
            +
                    Must be implemented by subclasses.
         
     | 
| 
      
 75 
     | 
    
         
            +
                    """
         
     | 
| 
      
 76 
     | 
    
         
            +
                    pass
         
     | 
| 
      
 77 
     | 
    
         
            +
                
         
     | 
| 
      
 78 
     | 
    
         
            +
                @abstractmethod
         
     | 
| 
      
 79 
     | 
    
         
            +
                def cancel_task(self, task_id: str) -> None:
         
     | 
| 
      
 80 
     | 
    
         
            +
                    """Cancel a task by its ID.
         
     | 
| 
      
 81 
     | 
    
         
            +
                    
         
     | 
| 
      
 82 
     | 
    
         
            +
                    Must be implemented by subclasses.
         
     | 
| 
      
 83 
     | 
    
         
            +
                    """
         
     | 
| 
      
 84 
     | 
    
         
            +
                    pass
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
    
        chunkr_ai/api/chunkr.py
    CHANGED
    
    | 
         @@ -5,6 +5,7 @@ from pathlib import Path 
     | 
|
| 
       5 
5 
     | 
    
         
             
            from PIL import Image
         
     | 
| 
       6 
6 
     | 
    
         
             
            import requests
         
     | 
| 
       7 
7 
     | 
    
         
             
            from typing import Union, BinaryIO
         
     | 
| 
      
 8 
     | 
    
         
            +
            from .misc import prepare_upload_data
         
     | 
| 
       8 
9 
     | 
    
         | 
| 
       9 
10 
     | 
    
         
             
            class Chunkr(ChunkrBase):
         
     | 
| 
       10 
11 
     | 
    
         
             
                """Chunkr API client"""
         
     | 
| 
         @@ -43,10 +44,23 @@ class Chunkr(ChunkrBase): 
     | 
|
| 
       43 
44 
     | 
    
         
             
                    Returns:
         
     | 
| 
       44 
45 
     | 
    
         
             
                        TaskResponse: The completed task response
         
     | 
| 
       45 
46 
     | 
    
         
             
                    """
         
     | 
| 
       46 
     | 
    
         
            -
                    task = self. 
     | 
| 
      
 47 
     | 
    
         
            +
                    task = self.create_task(file, config)
         
     | 
| 
       47 
48 
     | 
    
         
             
                    return task.poll()
         
     | 
| 
      
 49 
     | 
    
         
            +
                
         
     | 
| 
      
 50 
     | 
    
         
            +
                def update(self, task_id: str, config: Configuration) -> TaskResponse:
         
     | 
| 
      
 51 
     | 
    
         
            +
                    """Update a task by its ID and wait for processing to complete.
         
     | 
| 
      
 52 
     | 
    
         
            +
                    
         
     | 
| 
      
 53 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 54 
     | 
    
         
            +
                        task_id: The ID of the task to update
         
     | 
| 
      
 55 
     | 
    
         
            +
                        config: Configuration options for processing. Optional.
         
     | 
| 
       48 
56 
     | 
    
         | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
      
 57 
     | 
    
         
            +
                    Returns:
         
     | 
| 
      
 58 
     | 
    
         
            +
                        TaskResponse: The updated task response
         
     | 
| 
      
 59 
     | 
    
         
            +
                    """
         
     | 
| 
      
 60 
     | 
    
         
            +
                    task = self.update_task(task_id, config)
         
     | 
| 
      
 61 
     | 
    
         
            +
                    return task.poll()
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
                def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
         
     | 
| 
       50 
64 
     | 
    
         
             
                    """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
         
     | 
| 
       51 
65 
     | 
    
         | 
| 
       52 
66 
     | 
    
         
             
                    Args:
         
     | 
| 
         @@ -80,16 +94,35 @@ class Chunkr(ChunkrBase): 
     | 
|
| 
       80 
94 
     | 
    
         
             
                    Returns:
         
     | 
| 
       81 
95 
     | 
    
         
             
                        TaskResponse: The initial task response
         
     | 
| 
       82 
96 
     | 
    
         
             
                    """
         
     | 
| 
       83 
     | 
    
         
            -
                    files 
     | 
| 
      
 97 
     | 
    
         
            +
                    files= prepare_upload_data(file, config)
         
     | 
| 
       84 
98 
     | 
    
         
             
                    r = self._session.post(
         
     | 
| 
       85 
99 
     | 
    
         
             
                        f"{self.url}/api/v1/task",
         
     | 
| 
       86 
100 
     | 
    
         
             
                        files=files,
         
     | 
| 
       87 
     | 
    
         
            -
                        data=data,  
         
     | 
| 
       88 
101 
     | 
    
         
             
                        headers=self._headers()
         
     | 
| 
       89 
102 
     | 
    
         
             
                    )
         
     | 
| 
       90 
103 
     | 
    
         
             
                    r.raise_for_status()
         
     | 
| 
       91 
104 
     | 
    
         
             
                    return TaskResponse(**r.json()).with_client(self)
         
     | 
| 
      
 105 
     | 
    
         
            +
                
         
     | 
| 
      
 106 
     | 
    
         
            +
                def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
         
     | 
| 
      
 107 
     | 
    
         
            +
                    """Update a task by its ID.
         
     | 
| 
      
 108 
     | 
    
         
            +
                    
         
     | 
| 
      
 109 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 110 
     | 
    
         
            +
                        task_id: The ID of the task to update
         
     | 
| 
      
 111 
     | 
    
         
            +
                        config: The new configuration to use
         
     | 
| 
       92 
112 
     | 
    
         | 
| 
      
 113 
     | 
    
         
            +
                    Returns:
         
     | 
| 
      
 114 
     | 
    
         
            +
                        TaskResponse: The updated task response
         
     | 
| 
      
 115 
     | 
    
         
            +
                    """
         
     | 
| 
      
 116 
     | 
    
         
            +
                    files = prepare_upload_data(None, config)
         
     | 
| 
      
 117 
     | 
    
         
            +
                    r = self._session.patch(
         
     | 
| 
      
 118 
     | 
    
         
            +
                        f"{self.url}/api/v1/task/{task_id}",
         
     | 
| 
      
 119 
     | 
    
         
            +
                        files=files,
         
     | 
| 
      
 120 
     | 
    
         
            +
                        headers=self._headers()
         
     | 
| 
      
 121 
     | 
    
         
            +
                    )
         
     | 
| 
      
 122 
     | 
    
         
            +
              
         
     | 
| 
      
 123 
     | 
    
         
            +
                    r.raise_for_status()
         
     | 
| 
      
 124 
     | 
    
         
            +
                    return TaskResponse(**r.json()).with_client(self)
         
     | 
| 
      
 125 
     | 
    
         
            +
                
         
     | 
| 
       93 
126 
     | 
    
         
             
                def get_task(self, task_id: str) -> TaskResponse:
         
     | 
| 
       94 
127 
     | 
    
         
             
                    """Get a task response by its ID.
         
     | 
| 
       95 
128 
     | 
    
         | 
| 
         @@ -106,3 +139,29 @@ class Chunkr(ChunkrBase): 
     | 
|
| 
       106 
139 
     | 
    
         
             
                    r.raise_for_status()
         
     | 
| 
       107 
140 
     | 
    
         
             
                    return TaskResponse(**r.json()).with_client(self)
         
     | 
| 
       108 
141 
     | 
    
         | 
| 
      
 142 
     | 
    
         
            +
             
     | 
| 
      
 143 
     | 
    
         
            +
                def delete_task(self, task_id: str) -> None:
                    """Remove a task on the server.

                    Sends a DELETE request for the task's endpoint and then calls
                    `raise_for_status()` on the response, so any error it raises
                    propagates to the caller. Nothing is returned.

                    Args:
                        task_id: The ID of the task to delete
                    """
                    endpoint = f"{self.url}/api/v1/task/{task_id}"
                    response = self._session.delete(endpoint, headers=self._headers())
                    response.raise_for_status()
         
     | 
| 
      
 154 
     | 
    
         
            +
             
     | 
| 
      
 155 
     | 
    
         
            +
                def cancel_task(self, task_id: str) -> None:
                    """Request cancellation of a task.

                    Sends a GET request to the task's `/cancel` endpoint and then calls
                    `raise_for_status()` on the response, so any error it raises
                    propagates to the caller. Nothing is returned.

                    Args:
                        task_id: The ID of the task to cancel
                    """
                    endpoint = f"{self.url}/api/v1/task/{task_id}/cancel"
                    response = self._session.get(endpoint, headers=self._headers())
                    response.raise_for_status()
         
     | 
| 
      
 166 
     | 
    
         
            +
             
     | 
| 
      
 167 
     | 
    
         
            +
              
         
     | 
    
        chunkr_ai/api/chunkr_async.py
    CHANGED
    
    | 
         @@ -5,6 +5,7 @@ import httpx 
     | 
|
| 
       5 
5 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
       6 
6 
     | 
    
         
             
            from PIL import Image
         
     | 
| 
       7 
7 
     | 
    
         
             
            from typing import Union, BinaryIO
         
     | 
| 
      
 8 
     | 
    
         
            +
            from .misc import prepare_upload_data
         
     | 
| 
       8 
9 
     | 
    
         | 
| 
       9 
10 
     | 
    
         
             
            class ChunkrAsync(ChunkrBase):
         
     | 
| 
       10 
11 
     | 
    
         
             
                """Asynchronous Chunkr API client"""
         
     | 
| 
         @@ -43,10 +44,23 @@ class ChunkrAsync(ChunkrBase): 
     | 
|
| 
       43 
44 
     | 
    
         
             
                    Returns:
         
     | 
| 
       44 
45 
     | 
    
         
             
                        TaskResponse: The completed task response
         
     | 
| 
       45 
46 
     | 
    
         
             
                    """
         
     | 
| 
       46 
     | 
    
         
            -
                    task = await self. 
     | 
| 
      
 47 
     | 
    
         
            +
                    task = await self.create_task(file, config)
         
     | 
| 
       47 
48 
     | 
    
         
             
                    return await task.poll_async()
         
     | 
| 
      
 49 
     | 
    
         
            +
                
         
     | 
| 
      
 50 
     | 
    
         
            +
                async def update(self, task_id: str, config: Configuration) -> TaskResponse:
                    """Apply a configuration to an existing task, then poll it.

                    This is `update_task` followed by `poll_async()` on the task it returns.

                    Args:
                        task_id: The ID of the task to update
                        config: Configuration handed through to `update_task`

                    Returns:
                        TaskResponse: Whatever `poll_async()` resolves to for the updated task
                    """
                    pending = await self.update_task(task_id, config)
                    finished = await pending.poll_async()
                    return finished
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
                async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
         
     | 
| 
       50 
64 
     | 
    
         
             
                    """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.
         
     | 
| 
       51 
65 
     | 
    
         | 
| 
       52 
66 
     | 
    
         
             
                    Args:
         
     | 
| 
         @@ -80,16 +94,26 @@ class ChunkrAsync(ChunkrBase): 
     | 
|
| 
       80 
94 
     | 
    
         
             
                    Returns:
         
     | 
| 
       81 
95 
     | 
    
         
             
                        TaskResponse: The initial task response
         
     | 
| 
       82 
96 
     | 
    
         
             
                    """
         
     | 
| 
       83 
     | 
    
         
            -
                    files 
     | 
| 
      
 97 
     | 
    
         
            +
                    files = prepare_upload_data(file, config)
         
     | 
| 
       84 
98 
     | 
    
         
             
                    r = await self._client.post(
         
     | 
| 
       85 
99 
     | 
    
         
             
                        f"{self.url}/api/v1/task",
         
     | 
| 
       86 
100 
     | 
    
         
             
                        files=files,
         
     | 
| 
       87 
     | 
    
         
            -
                        json=config.model_dump() if config else {},
         
     | 
| 
       88 
101 
     | 
    
         
             
                        headers=self._headers()
         
     | 
| 
       89 
102 
     | 
    
         
             
                    )
         
     | 
| 
       90 
103 
     | 
    
         
             
                    r.raise_for_status()
         
     | 
| 
       91 
104 
     | 
    
         
             
                    return TaskResponse(**r.json()).with_client(self)
         
     | 
| 
       92 
105 
     | 
    
         | 
| 
      
 106 
     | 
    
         
            +
                async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
                    """Send a configuration for an existing task and return the response immediately.

                    No polling happens here; use `update` (or `poll_async()` on the
                    result) to wait for the task.

                    Args:
                        task_id: The ID of the task to update
                        config: Configuration passed to `prepare_upload_data` (with no file)
                            to build the request's `files` payload

                    Returns:
                        TaskResponse: Built from the JSON body of the PATCH response and
                            bound to this client via `with_client`
                    """
                    form_parts = prepare_upload_data(None, config)
                    endpoint = f"{self.url}/api/v1/task/{task_id}"
                    response = await self._client.patch(
                        endpoint,
                        files=form_parts,
                        headers=self._headers(),
                    )

                    response.raise_for_status()
                    payload = response.json()
                    return TaskResponse(**payload).with_client(self)
         
     | 
| 
      
 116 
     | 
    
         
            +
                
         
     | 
| 
       93 
117 
     | 
    
         
             
                async def get_task(self, task_id: str) -> TaskResponse:
         
     | 
| 
       94 
118 
     | 
    
         
             
                    r = await self._client.get(
         
     | 
| 
       95 
119 
     | 
    
         
             
                        f"{self.url}/api/v1/task/{task_id}",
         
     | 
| 
         @@ -97,7 +121,22 @@ class ChunkrAsync(ChunkrBase): 
     | 
|
| 
       97 
121 
     | 
    
         
             
                    )
         
     | 
| 
       98 
122 
     | 
    
         
             
                    r.raise_for_status()
         
     | 
| 
       99 
123 
     | 
    
         
             
                    return TaskResponse(**r.json()).with_client(self)
         
     | 
| 
      
 124 
     | 
    
         
            +
                
         
     | 
| 
      
 125 
     | 
    
         
            +
                async def delete_task(self, task_id: str) -> None:
                    """Remove a task on the server.

                    Awaits a DELETE request for the task's endpoint and then calls
                    `raise_for_status()` on the response, so any error it raises
                    propagates to the caller. Nothing is returned.

                    Args:
                        task_id: The ID of the task to delete
                    """
                    endpoint = f"{self.url}/api/v1/task/{task_id}"
                    response = await self._client.delete(endpoint, headers=self._headers())
                    response.raise_for_status()
         
     | 
| 
      
 131 
     | 
    
         
            +
                
         
     | 
| 
      
 132 
     | 
    
         
            +
                async def cancel_task(self, task_id: str) -> None:
                    """Request cancellation of a task.

                    Awaits a GET request to the task's `/cancel` endpoint and then calls
                    `raise_for_status()` on the response, so any error it raises
                    propagates to the caller. Nothing is returned.

                    Args:
                        task_id: The ID of the task to cancel
                    """
                    endpoint = f"{self.url}/api/v1/task/{task_id}/cancel"
                    response = await self._client.get(endpoint, headers=self._headers())
                    response.raise_for_status()
         
     | 
| 
       100 
138 
     | 
    
         | 
| 
      
 139 
     | 
    
         
            +
                
         
     | 
| 
       101 
140 
     | 
    
         
             
                async def __aenter__(self):
                    """Enter the `async with` block; the client itself is the bound value."""
                    return self
         
     | 
| 
       103 
142 
     | 
    
         | 
    
        chunkr_ai/api/config.py
    CHANGED
    
    | 
         @@ -1,4 +1,4 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            from pydantic import BaseModel, Field, model_validator
         
     | 
| 
      
 1 
     | 
    
         
            +
            from pydantic import BaseModel, Field, model_validator, ConfigDict
         
     | 
| 
       2 
2 
     | 
    
         
             
            from enum import Enum
         
     | 
| 
       3 
3 
     | 
    
         
             
            from typing import Optional, List, Dict
         
     | 
| 
       4 
4 
     | 
    
         | 
| 
         @@ -10,30 +10,30 @@ class CroppingStrategy(str, Enum): 
     | 
|
| 
       10 
10 
     | 
    
         
             
                ALL = "All" 
         
     | 
| 
       11 
11 
     | 
    
         
             
                AUTO = "Auto"
         
     | 
| 
       12 
12 
     | 
    
         | 
| 
       13 
     | 
    
         
            -
            class LlmConfig(BaseModel):
         
     | 
| 
       14 
     | 
    
         
            -
                model: str
         
     | 
| 
       15 
     | 
    
         
            -
                prompt: str
         
     | 
| 
       16 
     | 
    
         
            -
                temperature: float = 0.0
         
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
13 
     | 
    
         
             
            class GenerationConfig(BaseModel):
                # Every field is optional and defaults to None.
                # Strategy used for HTML generation.
                html: Optional[GenerationStrategy] = None
                # Plain string; NOTE(review): presumably an LLM prompt/instruction — confirm against the API.
                llm: Optional[str] = None
                # Strategy used for markdown generation.
                markdown: Optional[GenerationStrategy] = None
                # Cropping strategy for the segment image.
                crop_image: Optional[CroppingStrategy] = None
         
     | 
| 
       23 
18 
     | 
    
         | 
| 
       24 
19 
     | 
    
         
             
            class SegmentProcessing(BaseModel):
                # One optional GenerationConfig per field; each field is addressed by its
                # snake_case name in Python and carries an explicit PascalCase alias.
                # populate_by_name lets the model be populated by field name as well as alias.
                # NOTE(review): every field below declares its own alias, so
                # alias_generator=str.title looks redundant here (str.title would also
                # yield "Section_Header", not "SectionHeader") — confirm before relying on it.
                model_config = ConfigDict(
                    populate_by_name=True,
                    alias_generator=str.title
                )

                title: Optional[GenerationConfig] = Field(default=None, alias="Title")
                section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
                text: Optional[GenerationConfig] = Field(default=None, alias="Text")
                list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
                table: Optional[GenerationConfig] = Field(default=None, alias="Table")
                picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
                caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
                formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
                footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
                page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
                page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
                page: Optional[GenerationConfig] = Field(default=None, alias="Page")
         
     | 
| 
       37 
37 
     | 
    
         | 
| 
       38 
38 
     | 
    
         
             
            class ChunkProcessing(BaseModel):
         
     | 
| 
       39 
39 
     | 
    
         
             
                target_length: Optional[int] = None
         
     | 
| 
         @@ -86,9 +86,9 @@ class Segment(BaseModel): 
     | 
|
| 
       86 
86 
     | 
    
         
             
                bbox: BoundingBox
         
     | 
| 
       87 
87 
     | 
    
         
             
                content: str
         
     | 
| 
       88 
88 
     | 
    
         
             
                page_height: float
         
     | 
| 
       89 
     | 
    
         
            -
                html: Optional[str]
         
     | 
| 
       90 
     | 
    
         
            -
                image: Optional[str]
         
     | 
| 
       91 
     | 
    
         
            -
                markdown: Optional[str]
         
     | 
| 
      
 89 
     | 
    
         
            +
                html: Optional[str] = None
         
     | 
| 
      
 90 
     | 
    
         
            +
                image: Optional[str] = None
         
     | 
| 
      
 91 
     | 
    
         
            +
                markdown: Optional[str] = None
         
     | 
| 
       92 
92 
     | 
    
         
             
                ocr: List[OCRResult]
         
     | 
| 
       93 
93 
     | 
    
         
             
                page_number: int
         
     | 
| 
       94 
94 
     | 
    
         
             
                page_width: float
         
     | 
| 
         @@ -104,8 +104,8 @@ class ExtractedJson(BaseModel): 
     | 
|
| 
       104 
104 
     | 
    
         
             
                data: Dict
         
     | 
| 
       105 
105 
     | 
    
         | 
| 
       106 
106 
     | 
    
         
             
            class OutputResponse(BaseModel):
                # Required list of chunks.
                chunks: List[Chunk]
                # Optional; defaults to None when absent from the input.
                extracted_json: Optional[ExtractedJson] = Field(default=None)
         
     | 
| 
       109 
109 
     | 
    
         | 
| 
       110 
110 
     | 
    
         
             
            class Model(str, Enum):
         
     | 
| 
       111 
111 
     | 
    
         
             
                FAST = "Fast"
         
     | 
    
        chunkr_ai/api/misc.py
    ADDED
    
    | 
         @@ -0,0 +1,106 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            import io
         
     | 
| 
      
 2 
     | 
    
         
            +
            import json
         
     | 
| 
      
 3 
     | 
    
         
            +
            from pathlib import Path
         
     | 
| 
      
 4 
     | 
    
         
            +
            from PIL import Image
         
     | 
| 
      
 5 
     | 
    
         
            +
            import requests
         
     | 
| 
      
 6 
     | 
    
         
            +
            from typing import Union, Tuple, BinaryIO, Optional
         
     | 
| 
      
 7 
     | 
    
         
            +
            from .config import Configuration
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            def prepare_file(
                file: Union[str, Path, BinaryIO, Image.Image]
            ) -> Tuple[str, BinaryIO]:
                """Convert various file types into a tuple of (filename, file-like object).

                Inputs are tried in this order:

                1. ``http://`` / ``https://`` URL string: downloaded with ``requests`` and
                   returned as an in-memory buffer. The filename is the last segment of
                   the URL *path* (query string and fragment are ignored), or
                   ``'downloaded_file'`` when that segment is empty.
                2. String containing ``;base64,`` (a data URI): decoded into an in-memory
                   buffer named ``file.<ext>``, with the extension looked up from the
                   MIME type in the header.
                3. ``str`` / ``Path`` filesystem path: opened in binary mode. The handle is
                   not closed by this function.
                4. PIL image: saved into an in-memory buffer using the image's own format,
                   or PNG when it has none.
                5. Object with ``read`` and ``seek``: returned as-is, named after its
                   ``name`` attribute when that is a string or ``Path``, else ``'document'``.

                Args:
                    file: The input to normalise.

                Returns:
                    A ``(filename, file_obj)`` tuple.

                Raises:
                    ValueError: The data URI has an unsupported MIME type or its payload
                        cannot be base64-decoded.
                    FileNotFoundError: A path was given that does not exist.
                    TypeError: ``file`` matches none of the supported kinds.
                """
                # Handle URLs
                if isinstance(file, str) and file.startswith(('http://', 'https://')):
                    from urllib.parse import urlsplit

                    # NOTE(review): no timeout is passed, so a stalled server can block
                    # this call indefinitely; consider adding one.
                    response = requests.get(file)
                    response.raise_for_status()
                    # Use only the URL path so "report.pdf?token=..." does not leak the
                    # query string or fragment into the filename.
                    last_segment = urlsplit(file).path.rsplit('/', 1)[-1]
                    filename = Path(last_segment).name or 'downloaded_file'
                    return filename, io.BytesIO(response.content)

                # Handle base64 strings
                if isinstance(file, str) and ';base64,' in file:
                    import base64

                    # Split header and data
                    header, base64_data = file.split(',', 1)
                    mime_type = header.split(':')[-1].split(';')[0].lower()

                    # Map MIME types to file extensions
                    mime_to_ext = {
                        'application/pdf': 'pdf',
                        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
                        'application/msword': 'doc',
                        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
                        'application/vnd.ms-powerpoint': 'ppt',
                        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
                        'application/vnd.ms-excel': 'xls',
                        'image/jpeg': 'jpg',
                        'image/png': 'png',
                        'image/jpg': 'jpg',
                    }

                    # Reject unknown MIME types before decoding, and outside the try
                    # below, so this error is not re-labelled as a base64 failure.
                    extension = mime_to_ext.get(mime_type)
                    if extension is None:
                        raise ValueError(f"Unsupported MIME type: {mime_type}")

                    try:
                        file_bytes = base64.b64decode(base64_data)
                    except ValueError as e:  # binascii.Error is a ValueError subclass
                        raise ValueError(f"Invalid base64 string: {str(e)}") from e

                    return f"file.{extension}", io.BytesIO(file_bytes)

                # Handle file paths
                if isinstance(file, (str, Path)):
                    path = Path(file).resolve()
                    if not path.exists():
                        raise FileNotFoundError(f"File not found: {file}")
                    # The open handle is handed to the caller; it is not closed here.
                    return path.name, open(path, 'rb')

                # Handle PIL Images
                if isinstance(file, Image.Image):
                    img_byte_arr = io.BytesIO()
                    image_format = file.format or 'PNG'
                    file.save(img_byte_arr, format=image_format)
                    # Rewind so the consumer reads from the start of the encoded image.
                    img_byte_arr.seek(0)
                    return f"image.{image_format.lower()}", img_byte_arr

                # Handle file-like objects
                if hasattr(file, 'read') and hasattr(file, 'seek'):
                    # Try to get the filename from the file object if possible. Some
                    # file objects expose a non-path name (e.g. an integer descriptor),
                    # which Path() cannot accept, so fall back to a generic name.
                    name = getattr(file, 'name', None)
                    if not isinstance(name, (str, Path)):
                        name = 'document'
                    return Path(name).name, file

                raise TypeError(f"Unsupported file type: {type(file)}")
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
             
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
      
 83 
     | 
    
         
            +
            def prepare_upload_data(
                file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
                config: Optional[Configuration] = None
            ) -> dict:
                """Build the multipart ``files`` mapping for a task request.

                Args:
                    file: The file to upload. When truthy it is normalised with
                        ``prepare_file`` and stored under the ``"file"`` key.
                    config: Optional configuration settings. When truthy, each top-level
                        entry of its JSON-mode dump (``None`` values excluded) becomes
                        its own part, JSON-encoded and tagged ``application/json``.

                Returns:
                    dict: Mapping of part name to part tuple; empty when neither
                        argument is supplied.
                """
                upload_parts = {}

                if file:
                    upload_name, stream = prepare_file(file)
                    upload_parts = {"file": (upload_name, stream)}

                if config:
                    dumped = config.model_dump(mode="json", exclude_none=True)
                    # A part tuple with filename None is sent as a plain form field.
                    upload_parts.update(
                        (field, (None, json.dumps(payload), 'application/json'))
                        for field, payload in dumped.items()
                    )

                return upload_parts
         
     | 
    
        chunkr_ai/api/task.py
    CHANGED
    
    | 
         @@ -1,5 +1,6 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            from .protocol import ChunkrClientProtocol
         
     | 
| 
       2 
2 
     | 
    
         
             
            from .config import Configuration, OutputResponse
         
     | 
| 
      
 3 
     | 
    
         
            +
            from .misc import prepare_upload_data
         
     | 
| 
       3 
4 
     | 
    
         
             
            import asyncio
         
     | 
| 
       4 
5 
     | 
    
         
             
            from datetime import datetime
         
     | 
| 
       5 
6 
     | 
    
         
             
            from enum import Enum
         
     | 
| 
         @@ -12,22 +13,23 @@ class Status(str, Enum): 
     | 
|
| 
       12 
13 
     | 
    
         
             
                PROCESSING = "Processing"
         
     | 
| 
       13 
14 
     | 
    
         
             
                SUCCEEDED = "Succeeded"
         
     | 
| 
       14 
15 
     | 
    
         
             
                FAILED = "Failed"
         
     | 
| 
      
 16 
     | 
    
         
            +
                CANCELLED = "Cancelled"
         
     | 
| 
       15 
17 
     | 
    
         | 
| 
       16 
18 
     | 
    
         
             
            class TaskResponse(BaseModel):
         
     | 
| 
       17 
19 
     | 
    
         
             
                configuration: Configuration
         
     | 
| 
       18 
20 
     | 
    
         
             
                created_at: datetime
         
     | 
| 
       19 
     | 
    
         
            -
                expires_at: Optional[datetime]
         
     | 
| 
       20 
     | 
    
         
            -
                file_name: Optional[str]
         
     | 
| 
       21 
     | 
    
         
            -
                finished_at: Optional[datetime]
         
     | 
| 
       22 
     | 
    
         
            -
                input_file_url: Optional[str]
         
     | 
| 
      
 21 
     | 
    
         
            +
                expires_at: Optional[datetime] = None
         
     | 
| 
      
 22 
     | 
    
         
            +
                file_name: Optional[str] = None
         
     | 
| 
      
 23 
     | 
    
         
            +
                finished_at: Optional[datetime] = None
         
     | 
| 
      
 24 
     | 
    
         
            +
                input_file_url: Optional[str] = None
         
     | 
| 
       23 
25 
     | 
    
         
             
                message: str
         
     | 
| 
       24 
     | 
    
         
            -
                output: Optional[OutputResponse]
         
     | 
| 
       25 
     | 
    
         
            -
                page_count: Optional[int]
         
     | 
| 
       26 
     | 
    
         
            -
                pdf_url: Optional[str]
         
     | 
| 
       27 
     | 
    
         
            -
                started_at: Optional[datetime]
         
     | 
| 
      
 26 
     | 
    
         
            +
                output: Optional[OutputResponse] = None
         
     | 
| 
      
 27 
     | 
    
         
            +
                page_count: Optional[int] = None
         
     | 
| 
      
 28 
     | 
    
         
            +
                pdf_url: Optional[str] = None
         
     | 
| 
      
 29 
     | 
    
         
            +
                started_at: Optional[datetime] = None
         
     | 
| 
       28 
30 
     | 
    
         
             
                status: Status
         
     | 
| 
       29 
31 
     | 
    
         
             
                task_id: str
         
     | 
| 
       30 
     | 
    
         
            -
                task_url: Optional[str]
         
     | 
| 
      
 32 
     | 
    
         
            +
                task_url: Optional[str] = None
         
     | 
| 
       31 
33 
     | 
    
         
             
                _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
         
     | 
| 
       32 
34 
     | 
    
         | 
| 
       33 
35 
     | 
    
         
             
                def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
         
     | 
| 
         @@ -79,7 +81,8 @@ class TaskResponse(BaseModel): 
     | 
|
| 
       79 
81 
     | 
    
         
             
                    """Poll the task for completion."""
         
     | 
| 
       80 
82 
     | 
    
         
             
                    while True:
         
     | 
| 
       81 
83 
     | 
    
         
             
                        response = self._poll_request_sync()
         
     | 
| 
       82 
     | 
    
         
            -
                         
     | 
| 
      
 84 
     | 
    
         
            +
                        updated_task = TaskResponse(**response).with_client(self._client)
         
     | 
| 
      
 85 
     | 
    
         
            +
                        self.__dict__.update(updated_task.__dict__)
         
     | 
| 
       83 
86 
     | 
    
         | 
| 
       84 
87 
     | 
    
         
             
                        if result := self._check_status():
         
     | 
| 
       85 
88 
     | 
    
         
             
                            return result
         
     | 
| 
         @@ -90,7 +93,8 @@ class TaskResponse(BaseModel): 
     | 
|
| 
       90 
93 
     | 
    
         
             
                    """Poll the task for completion asynchronously."""
         
     | 
| 
       91 
94 
     | 
    
         
             
                    while True:
         
     | 
| 
       92 
95 
     | 
    
         
             
                        response = await self._poll_request_async()
         
     | 
| 
       93 
     | 
    
         
            -
                         
     | 
| 
      
 96 
     | 
    
         
            +
                        updated_task = TaskResponse(**response).with_client(self._client)
         
     | 
| 
      
 97 
     | 
    
         
            +
                        self.__dict__.update(updated_task.__dict__)
         
     | 
| 
       94 
98 
     | 
    
         | 
| 
       95 
99 
     | 
    
         
             
                        if result := self._check_status():
         
     | 
| 
       96 
100 
     | 
    
         
             
                            return result
         
     | 
| 
         @@ -108,6 +112,56 @@ class TaskResponse(BaseModel): 
     | 
|
| 
       108 
112 
     | 
    
         
             
                            if content:
         
     | 
| 
       109 
113 
     | 
    
         
             
                                parts.append(content)
         
     | 
| 
       110 
114 
     | 
    
         
             
                    return "\n".join(parts)
         
     | 
| 
      
 115 
     | 
    
         
            +
                
         
     | 
| 
      
 116 
     | 
    
         
            +
                def update(self, config: Configuration) -> 'TaskResponse':
         
     | 
| 
      
 117 
     | 
    
         
            +
                    files = prepare_upload_data(None, config)
         
     | 
| 
      
 118 
     | 
    
         
            +
                    r = self._client._session.patch(
         
     | 
| 
      
 119 
     | 
    
         
            +
                        f"{self.task_url}",
         
     | 
| 
      
 120 
     | 
    
         
            +
                        files=files,
         
     | 
| 
      
 121 
     | 
    
         
            +
                        headers=self._client._headers()
         
     | 
| 
      
 122 
     | 
    
         
            +
                    )
         
     | 
| 
      
 123 
     | 
    
         
            +
                    r.raise_for_status()
         
     | 
| 
      
 124 
     | 
    
         
            +
                    return TaskResponse(**r.json()).with_client(self._client)
         
     | 
| 
      
 125 
     | 
    
         
            +
                
         
     | 
| 
      
 126 
     | 
    
         
            +
                async def update_async(self, config: Configuration) -> 'TaskResponse':
         
     | 
| 
      
 127 
     | 
    
         
            +
                    files = prepare_upload_data(None, config)
         
     | 
| 
      
 128 
     | 
    
         
            +
                    r = await self._client._client.patch(
         
     | 
| 
      
 129 
     | 
    
         
            +
                        f"{self.task_url}",
         
     | 
| 
      
 130 
     | 
    
         
            +
                        files=files,
         
     | 
| 
      
 131 
     | 
    
         
            +
                        headers=self._client._headers()
         
     | 
| 
      
 132 
     | 
    
         
            +
                    )   
         
     | 
| 
      
 133 
     | 
    
         
            +
                    r.raise_for_status()
         
     | 
| 
      
 134 
     | 
    
         
            +
                    return TaskResponse(**r.json()).with_client(self._client)
         
     | 
| 
      
 135 
     | 
    
         
            +
                
         
     | 
| 
      
 136 
     | 
    
         
            +
                def cancel(self):
         
     | 
| 
      
 137 
     | 
    
         
            +
                    r = self._client._session.get(
         
     | 
| 
      
 138 
     | 
    
         
            +
                        f"{self.task_url}/cancel",
         
     | 
| 
      
 139 
     | 
    
         
            +
                        headers=self._client._headers()
         
     | 
| 
      
 140 
     | 
    
         
            +
                    )
         
     | 
| 
      
 141 
     | 
    
         
            +
                    r.raise_for_status()
         
     | 
| 
      
 142 
     | 
    
         
            +
                    self.poll()
         
     | 
| 
      
 143 
     | 
    
         
            +
                
         
     | 
| 
      
 144 
     | 
    
         
            +
                async def cancel_async(self):
         
     | 
| 
      
 145 
     | 
    
         
            +
                    r = await self._client._client.get(
         
     | 
| 
      
 146 
     | 
    
         
            +
                        f"{self.task_url}/cancel",
         
     | 
| 
      
 147 
     | 
    
         
            +
                        headers=self._client._headers()
         
     | 
| 
      
 148 
     | 
    
         
            +
                    )
         
     | 
| 
      
 149 
     | 
    
         
            +
                    r.raise_for_status()
         
     | 
| 
      
 150 
     | 
    
         
            +
                    await self.poll_async()
         
     | 
| 
      
 151 
     | 
    
         
            +
             
     | 
| 
      
 152 
     | 
    
         
            +
                def delete(self):
         
     | 
| 
      
 153 
     | 
    
         
            +
                    r = self._client._session.delete(
         
     | 
| 
      
 154 
     | 
    
         
            +
                        f"{self.task_url}",
         
     | 
| 
      
 155 
     | 
    
         
            +
                        headers=self._client._headers()
         
     | 
| 
      
 156 
     | 
    
         
            +
                    )
         
     | 
| 
      
 157 
     | 
    
         
            +
                    r.raise_for_status()
         
     | 
| 
      
 158 
     | 
    
         
            +
                
         
     | 
| 
      
 159 
     | 
    
         
            +
                async def delete_async(self):
         
     | 
| 
      
 160 
     | 
    
         
            +
                    r = await self._client._client.delete(
         
     | 
| 
      
 161 
     | 
    
         
            +
                        f"{self.task_url}",
         
     | 
| 
      
 162 
     | 
    
         
            +
                        headers=self._client._headers()
         
     | 
| 
      
 163 
     | 
    
         
            +
                    )
         
     | 
| 
      
 164 
     | 
    
         
            +
                    r.raise_for_status()
         
     | 
| 
       111 
165 
     | 
    
         | 
| 
       112 
166 
     | 
    
         
             
                def html(self) -> str:
         
     | 
| 
       113 
167 
     | 
    
         
             
                    """Get full HTML for the task"""
         
     | 
    
        chunkr_ai/main.py
    CHANGED
    
    | 
         @@ -0,0 +1,12 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from chunkr_ai.api.chunkr import Chunkr
         
     | 
| 
      
 2 
     | 
    
         
            +
            from chunkr_ai.models import Configuration
         
     | 
| 
      
 3 
     | 
    
         
            +
            from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            if __name__ == "__main__":
         
     | 
| 
      
 6 
     | 
    
         
            +
                chunkr = Chunkr()
         
     | 
| 
      
 7 
     | 
    
         
            +
                task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
         
     | 
| 
      
 8 
     | 
    
         
            +
                    chunk_processing=ChunkProcessing(
         
     | 
| 
      
 9 
     | 
    
         
            +
                        target_length=1000
         
     | 
| 
      
 10 
     | 
    
         
            +
                    )
         
     | 
| 
      
 11 
     | 
    
         
            +
                ))
         
     | 
| 
      
 12 
     | 
    
         
            +
                print(task)
         
     | 
    
        chunkr_ai/models.py
    CHANGED
    
    
| 
         @@ -0,0 +1,18 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
         
     | 
| 
      
 2 
     | 
    
         
            +
            chunkr_ai/main.py,sha256=_MT1lcnNiXjVW9ZkZYl28SB_f6M9g_IOgZxvhodTzAo,394
         
     | 
| 
      
 3 
     | 
    
         
            +
            chunkr_ai/models.py,sha256=T8_F-Y1US21ZJVzLIaroqp-Hd0_ZFbdkbEOxr63-PNE,827
         
     | 
| 
      
 4 
     | 
    
         
            +
            chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
      
 5 
     | 
    
         
            +
            chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
      
 6 
     | 
    
         
            +
            chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
         
     | 
| 
      
 7 
     | 
    
         
            +
            chunkr_ai/api/base.py,sha256=IYO0pmoL02GchIggj6_Q5nvtAUoOvYAAvT7VLFU6scY,2506
         
     | 
| 
      
 8 
     | 
    
         
            +
            chunkr_ai/api/chunkr.py,sha256=PmrK37HbK2T1KUPitKnt4wZqIujL61Jo12qW9DEpNMI,5186
         
     | 
| 
      
 9 
     | 
    
         
            +
            chunkr_ai/api/chunkr_async.py,sha256=2yYyAO9-j2xKQYH0fJb2S6gL26hgbtL4QyqlG9l0QBY,4893
         
     | 
| 
      
 10 
     | 
    
         
            +
            chunkr_ai/api/config.py,sha256=XIqXZ_8q7U_BEmY5wyIC9mbQGZBw1956EN9yhC4svD0,4235
         
     | 
| 
      
 11 
     | 
    
         
            +
            chunkr_ai/api/misc.py,sha256=tScsUUcrqeVh_bZv1YlbmjGkQSTDQN8NyKxoNwAG6XA,3792
         
     | 
| 
      
 12 
     | 
    
         
            +
            chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
         
     | 
| 
      
 13 
     | 
    
         
            +
            chunkr_ai/api/task.py,sha256=EB6RK8ms7EaNj57tNJZoNgNMHGWKXFhkQ1WC7gk5ht4,6059
         
     | 
| 
      
 14 
     | 
    
         
            +
            chunkr_ai-0.0.8.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
      
 15 
     | 
    
         
            +
            chunkr_ai-0.0.8.dist-info/METADATA,sha256=tL3OZfFIRsgfIKoDYWAS89bZw48_0C8cdqHJ6_GrT7A,4844
         
     | 
| 
      
 16 
     | 
    
         
            +
            chunkr_ai-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
         
     | 
| 
      
 17 
     | 
    
         
            +
            chunkr_ai-0.0.8.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
         
     | 
| 
      
 18 
     | 
    
         
            +
            chunkr_ai-0.0.8.dist-info/RECORD,,
         
     | 
    
        chunkr_ai-0.0.6.dist-info/RECORD
    DELETED
    
    | 
         @@ -1,17 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
         
     | 
| 
       2 
     | 
    
         
            -
            chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
       3 
     | 
    
         
            -
            chunkr_ai/models.py,sha256=kNeYtBO4TFvQWKFCent7tLEQjyKlVUieKNiuTt3u564,842
         
     | 
| 
       4 
     | 
    
         
            -
            chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
       5 
     | 
    
         
            -
            chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
       6 
     | 
    
         
            -
            chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
         
     | 
| 
       7 
     | 
    
         
            -
            chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
         
     | 
| 
       8 
     | 
    
         
            -
            chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
         
     | 
| 
       9 
     | 
    
         
            -
            chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
         
     | 
| 
       10 
     | 
    
         
            -
            chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
         
     | 
| 
       11 
     | 
    
         
            -
            chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
         
     | 
| 
       12 
     | 
    
         
            -
            chunkr_ai/api/task.py,sha256=_WOGRirlLEow_wS9kJB_dNYb2RvYE9nlu7Spq16AhME,4172
         
     | 
| 
       13 
     | 
    
         
            -
            chunkr_ai-0.0.6.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         
     | 
| 
       14 
     | 
    
         
            -
            chunkr_ai-0.0.6.dist-info/METADATA,sha256=TuBBU6n1g7kdLVky2vAx94TFWZVyu8PqQ_47vi6tN5E,4844
         
     | 
| 
       15 
     | 
    
         
            -
            chunkr_ai-0.0.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
         
     | 
| 
       16 
     | 
    
         
            -
            chunkr_ai-0.0.6.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
         
     | 
| 
       17 
     | 
    
         
            -
            chunkr_ai-0.0.6.dist-info/RECORD,,
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     |