PyPI - chunkr-ai - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.0a1__py3-none-any.whl - Mend

chunkr-ai 0.1.0__py3-none-any.whl → 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

chunkr_ai/__init__.py +89 -2
chunkr_ai/_base_client.py +1995 -0
chunkr_ai/_client.py +402 -0
chunkr_ai/_compat.py +219 -0
chunkr_ai/_constants.py +14 -0
chunkr_ai/_exceptions.py +108 -0
chunkr_ai/_files.py +123 -0
chunkr_ai/_models.py +829 -0
chunkr_ai/_qs.py +150 -0
chunkr_ai/_resource.py +43 -0
chunkr_ai/_response.py +830 -0
chunkr_ai/_streaming.py +333 -0
chunkr_ai/_types.py +219 -0
chunkr_ai/_utils/__init__.py +57 -0
chunkr_ai/_utils/_logs.py +25 -0
chunkr_ai/_utils/_proxy.py +65 -0
chunkr_ai/_utils/_reflection.py +42 -0
chunkr_ai/_utils/_resources_proxy.py +24 -0
chunkr_ai/_utils/_streams.py +12 -0
chunkr_ai/_utils/_sync.py +86 -0
chunkr_ai/_utils/_transform.py +447 -0
chunkr_ai/_utils/_typing.py +151 -0
chunkr_ai/_utils/_utils.py +422 -0
chunkr_ai/_version.py +4 -0
chunkr_ai/lib/.keep +4 -0
chunkr_ai/pagination.py +71 -0
chunkr_ai/resources/__init__.py +33 -0
chunkr_ai/resources/health.py +136 -0
chunkr_ai/resources/task.py +1166 -0
chunkr_ai/types/__init__.py +27 -0
chunkr_ai/types/auto_generation_config.py +39 -0
chunkr_ai/types/auto_generation_config_param.py +39 -0
chunkr_ai/types/bounding_box.py +19 -0
chunkr_ai/types/chunk_processing.py +40 -0
chunkr_ai/types/chunk_processing_param.py +42 -0
chunkr_ai/types/health_check_response.py +7 -0
chunkr_ai/types/ignore_generation_config.py +39 -0
chunkr_ai/types/ignore_generation_config_param.py +39 -0
chunkr_ai/types/llm_generation_config.py +39 -0
chunkr_ai/types/llm_generation_config_param.py +39 -0
chunkr_ai/types/llm_processing.py +36 -0
chunkr_ai/types/llm_processing_param.py +36 -0
chunkr_ai/types/picture_generation_config.py +39 -0
chunkr_ai/types/picture_generation_config_param.py +39 -0
chunkr_ai/types/segment_processing.py +280 -0
chunkr_ai/types/segment_processing_param.py +281 -0
chunkr_ai/types/table_generation_config.py +39 -0
chunkr_ai/types/table_generation_config_param.py +39 -0
chunkr_ai/types/task.py +379 -0
chunkr_ai/types/task_get_params.py +18 -0
chunkr_ai/types/task_list_params.py +37 -0
chunkr_ai/types/task_parse_params.py +90 -0
chunkr_ai/types/task_update_params.py +90 -0
chunkr_ai-0.1.0a1.dist-info/METADATA +504 -0
chunkr_ai-0.1.0a1.dist-info/RECORD +58 -0
{chunkr_ai-0.1.0.dist-info → chunkr_ai-0.1.0a1.dist-info}/WHEEL +1 -2
chunkr_ai-0.1.0a1.dist-info/licenses/LICENSE +201 -0
chunkr_ai/api/auth.py +0 -13
chunkr_ai/api/chunkr.py +0 -103
chunkr_ai/api/chunkr_base.py +0 -185
chunkr_ai/api/configuration.py +0 -313
chunkr_ai/api/decorators.py +0 -101
chunkr_ai/api/misc.py +0 -139
chunkr_ai/api/protocol.py +0 -14
chunkr_ai/api/task_response.py +0 -208
chunkr_ai/models.py +0 -55
chunkr_ai-0.1.0.dist-info/METADATA +0 -268
chunkr_ai-0.1.0.dist-info/RECORD +0 -16
chunkr_ai-0.1.0.dist-info/licenses/LICENSE +0 -21
chunkr_ai-0.1.0.dist-info/top_level.txt +0 -1
/chunkr_ai/{api/__init__.py → py.typed} +0 -0

chunkr_ai/api/configuration.py DELETED Viewed

@@ -1,313 +0,0 @@
-from pydantic import BaseModel, Field, ConfigDict
-from enum import Enum
-from typing import Any, List, Optional, Union
-from pydantic import field_validator, field_serializer
-class CroppingStrategy(str, Enum):
-    ALL = "All"
-    AUTO = "Auto"
-class SegmentFormat(str, Enum):
-    HTML = "Html"
-    MARKDOWN = "Markdown"
-class EmbedSource(str, Enum):
-    CONTENT = "Content"
-    HTML = "HTML"  # Deprecated
-    MARKDOWN = "Markdown"  # Deprecated
-    LLM = "LLM"
-class GenerationStrategy(str, Enum):
-    LLM = "LLM"
-    AUTO = "Auto"
-class GenerationConfig(BaseModel):
-    format: Optional[SegmentFormat] = None
-    strategy: Optional[GenerationStrategy] = None
-    llm: Optional[str] = None
-    crop_image: Optional[CroppingStrategy] = None
-    embed_sources: Optional[List[EmbedSource]] = None
-    extended_context: Optional[bool] = None
-    # Deprecated fields for backwards compatibility
-    html: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.HTML and strategy instead
-    markdown: Optional[GenerationStrategy] = None  # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
-class SegmentProcessing(BaseModel):
-    model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
-    caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
-    footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
-    formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
-    list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
-    page: Optional[GenerationConfig] = Field(default=None, alias="Page")
-    page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
-    page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
-    picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
-    section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
-    table: Optional[GenerationConfig] = Field(default=None, alias="Table")
-    text: Optional[GenerationConfig] = Field(default=None, alias="Text")
-    title: Optional[GenerationConfig] = Field(default=None, alias="Title")
-class Tokenizer(str, Enum):
-    WORD = "Word"
-    CL100K_BASE = "Cl100kBase"
-    XLM_ROBERTA_BASE = "XlmRobertaBase"
-    BERT_BASE_UNCASED = "BertBaseUncased"
-class TokenizerType(BaseModel):
-    enum_value: Optional[Tokenizer] = None
-    string_value: Optional[str] = None
-    @classmethod
-    def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
-        return cls(enum_value=enum_value)
-    @classmethod
-    def from_string(cls, string_value: str) -> "TokenizerType":
-        return cls(string_value=string_value)
-    def __str__(self) -> str:
-        if self.enum_value is not None:
-            return f"enum:{self.enum_value.value}"
-        elif self.string_value is not None:
-            return f"string:{self.string_value}"
-        return ""
-    model_config = ConfigDict()
-    def model_dump(self, **kwargs):
-        if self.enum_value is not None:
-            return {"Enum": self.enum_value.value}
-        elif self.string_value is not None:
-            return {"String": self.string_value}
-        return {}
-class ChunkProcessing(BaseModel):
-    ignore_headers_and_footers: Optional[bool] = True
-    target_length: Optional[int] = None
-    tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
-    model_config = ConfigDict(
-        arbitrary_types_allowed=True,
-    )
-    @field_serializer('tokenizer')
-    def serialize_tokenizer(self, tokenizer: Optional[TokenizerType], _info):
-        if tokenizer is None:
-            return None
-        return tokenizer.model_dump()
-    @field_validator('tokenizer', mode='before')
-    def validate_tokenizer(cls, v):
-        if v is None:
-            return None
-        if isinstance(v, TokenizerType):
-            return v
-        if isinstance(v, Tokenizer):
-            return TokenizerType(enum_value=v)
-        if isinstance(v, dict):
-            if "Enum" in v:
-                try:
-                    return TokenizerType(enum_value=Tokenizer(v["Enum"]))
-                except ValueError:
-                    return TokenizerType(string_value=v["Enum"])
-            elif "String" in v:
-                return TokenizerType(string_value=v["String"])
-        if isinstance(v, str):
-            try:
-                return TokenizerType(enum_value=Tokenizer(v))
-            except ValueError:
-                return TokenizerType(string_value=v)
-        raise ValueError(f"Cannot convert {v} to TokenizerType")
-class OcrStrategy(str, Enum):
-    ALL = "All"
-    AUTO = "Auto"
-class SegmentationStrategy(str, Enum):
-    LAYOUT_ANALYSIS = "LayoutAnalysis"
-    PAGE = "Page"
-class ErrorHandlingStrategy(str, Enum):
-    FAIL = "Fail"
-    CONTINUE = "Continue"
-class FallbackStrategy(BaseModel):
-    type: str
-    model_id: Optional[str] = None
-    @classmethod
-    def none(cls) -> "FallbackStrategy":
-        return cls(type="None")
-    @classmethod
-    def default(cls) -> "FallbackStrategy":
-        return cls(type="Default")
-    @classmethod
-    def model(cls, model_id: str) -> "FallbackStrategy":
-        return cls(type="Model", model_id=model_id)
-    def __str__(self) -> str:
-        if self.type == "Model":
-            return f"Model({self.model_id})"
-        return self.type
-    def model_dump(self, **kwargs):
-        if self.type == "Model":
-            return {"Model": self.model_id}
-        return self.type
-    @field_validator('type')
-    def validate_type(cls, v):
-        if v not in ["None", "Default", "Model"]:
-            raise ValueError(f"Invalid fallback strategy: {v}")
-        return v
-    model_config = ConfigDict()
-    @classmethod
-    def model_validate(cls, obj):
-        # Handle string values like "None" or "Default"
-        if isinstance(obj, str):
-            if obj in ["None", "Default"]:
-                return cls(type=obj)
-            # Try to parse as Enum value if it's not a direct match
-            try:
-                return cls(type=obj)
-            except ValueError:
-                pass  # Let it fall through to normal validation
-        # Handle dictionary format like {"Model": "model-id"}
-        elif isinstance(obj, dict) and len(obj) == 1:
-            if "Model" in obj:
-                return cls(type="Model", model_id=obj["Model"])
-        # Fall back to normal validation
-        return super().model_validate(obj)
-class LlmProcessing(BaseModel):
-    model_id: Optional[str] = None
-    fallback_strategy: FallbackStrategy = Field(default_factory=FallbackStrategy.default)
-    max_completion_tokens: Optional[int] = None
-    temperature: float = 0.0
-    model_config = ConfigDict()
-    @field_serializer('fallback_strategy')
-    def serialize_fallback_strategy(self, fallback_strategy: FallbackStrategy, _info):
-        return fallback_strategy.model_dump()
-    @field_validator('fallback_strategy', mode='before')
-    def validate_fallback_strategy(cls, v):
-        if isinstance(v, str):
-            if v == "None":
-                return FallbackStrategy.none()
-            elif v == "Default":
-                return FallbackStrategy.default()
-            # Try to parse as a model ID if it's not None or Default
-            try:
-                return FallbackStrategy.model(v)
-            except ValueError:
-                pass  # Let it fall through to normal validation
-        # Handle dictionary format like {"Model": "model-id"}
-        elif isinstance(v, dict) and len(v) == 1:
-            if "Model" in v:
-                return FallbackStrategy.model(v["Model"])
-            elif "None" in v or v.get("None") is None:
-                return FallbackStrategy.none()
-            elif "Default" in v or v.get("Default") is None:
-                return FallbackStrategy.default()
-        return v
-class BoundingBox(BaseModel):
-    left: float
-    top: float
-    width: float
-    height: float
-class OCRResult(BaseModel):
-    bbox: BoundingBox
-    text: str
-    confidence: Optional[float]
-class SegmentType(str, Enum):
-    CAPTION = "Caption"
-    FOOTNOTE = "Footnote"
-    FORMULA = "Formula"
-    LIST_ITEM = "ListItem"
-    PAGE = "Page"
-    PAGE_FOOTER = "PageFooter"
-    PAGE_HEADER = "PageHeader"
-    PICTURE = "Picture"
-    SECTION_HEADER = "SectionHeader"
-    TABLE = "Table"
-    TEXT = "Text"
-    TITLE = "Title"
-class Segment(BaseModel):
-    bbox: BoundingBox
-    content: str = ""
-    page_height: float
-    llm: Optional[str] = None
-    html: Optional[str] = None
-    image: Optional[str] = None
-    markdown: Optional[str] = None
-    ocr: Optional[List[OCRResult]] = Field(default_factory=list)
-    page_number: int
-    page_width: float
-    segment_id: str
-    segment_type: SegmentType
-    confidence: Optional[float]
-    text: str = ""
-class Chunk(BaseModel):
-    chunk_id: str
-    chunk_length: int
-    segments: List[Segment]
-    embed: Optional[str] = None
-class OutputResponse(BaseModel):
-    chunks: List[Chunk]
-    file_name: Optional[str]
-    page_count: Optional[int]
-    pdf_url: Optional[str]
-class Model(str, Enum):
-    FAST = "Fast"
-    HIGH_QUALITY = "HighQuality"
-class Pipeline(str, Enum):
-    AZURE = "Azure"
-    CHUNKR = "Chunkr"
-class Configuration(BaseModel):
-    chunk_processing: Optional[ChunkProcessing] = None
-    expires_in: Optional[int] = None
-    error_handling: Optional[ErrorHandlingStrategy] = None
-    high_resolution: Optional[bool] = None
-    ocr_strategy: Optional[OcrStrategy] = None
-    segment_processing: Optional[SegmentProcessing] = None
-    segmentation_strategy: Optional[SegmentationStrategy] = None
-    pipeline: Optional[Pipeline] = None
-    llm_processing: Optional[LlmProcessing] = None
-class OutputConfiguration(Configuration):
-    input_file_url: Optional[str] = None
-    # Deprecated
-    json_schema: Optional[Any] = None
-    model: Optional[Model] = None
-    target_chunk_length: Optional[int] = None
-class Status(str, Enum):
-    STARTING = "Starting"
-    PROCESSING = "Processing"
-    SUCCEEDED = "Succeeded"
-    FAILED = "Failed"
-    CANCELLED = "Cancelled"

chunkr_ai/api/decorators.py DELETED Viewed

@@ -1,101 +0,0 @@
-import asyncio
-import functools
-import httpx
-import nest_asyncio
-from typing import Callable, Any, TypeVar, Awaitable, Union, overload
-try:
-    from typing import ParamSpec
-except ImportError:
-    from typing_extensions import ParamSpec
-T = TypeVar('T')
-P = ParamSpec('P')
-_sync_loop = None
-def anywhere() -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Union[Awaitable[T], T]]]:
-    """Decorator that allows an async function to run anywhere - sync or async context."""
-    def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Union[Awaitable[T], T]]:
-        @functools.wraps(async_func)
-        def wrapper(*args: P.args, **kwargs: P.kwargs) -> Union[Awaitable[T], T]:
-            global _sync_loop
-            try:
-                nest_asyncio.apply()
-            except ImportError:
-                pass
-            try:
-                asyncio.get_running_loop()
-                return async_func(*args, **kwargs)
-            except RuntimeError:
-                if _sync_loop is None:
-                    _sync_loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(_sync_loop)
-                try:
-                    return _sync_loop.run_until_complete(async_func(*args, **kwargs))
-                finally:
-                    asyncio.set_event_loop(None)
-        return wrapper
-    return decorator
-def ensure_client() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
-    """Decorator that ensures a valid httpx.AsyncClient exists before executing the method"""
-    def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
-        @functools.wraps(async_func)
-        async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
-            if not self._client or self._client.is_closed:
-                self._client = httpx.AsyncClient()
-            return await async_func(self, *args, **kwargs)
-        return wrapper
-    return decorator
-def require_task() -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
-    """Decorator that ensures task has required attributes and valid client before execution"""
-    def decorator(async_func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
-        @functools.wraps(async_func)
-        async def wrapper(self: Any, *args: Any, **kwargs: Any) -> T:
-            if not self.task_url:
-                raise ValueError("Task URL not found")
-            if not self._client:
-                raise ValueError("Client not found")
-            if not self._client._client or self._client._client.is_closed:
-                self._client._client = httpx.AsyncClient()
-            return await async_func(self, *args, **kwargs)
-        return wrapper
-    return decorator
-def retry_on_429(max_retries: int = 3, initial_delay: float = 0.5) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
-    """Decorator that retries the request when encountering 429 Too Many Requests errors.
-    Args:
-        max_retries: Maximum number of retry attempts (default: 3)
-        initial_delay: Initial delay in seconds, will be exponentially increased with jitter (default: 0.5)
-    """
-    def decorator(async_func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
-        @functools.wraps(async_func)
-        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
-            import random
-            retries = 0
-            while True:
-                try:
-                    return await async_func(*args, **kwargs)
-                except httpx.HTTPStatusError as e:
-                    if e.response.status_code != 429:
-                        raise e
-                    if retries >= max_retries:
-                        print("Max retries reached")
-                        raise e
-                    retries += 1
-                    delay = initial_delay * (2 ** retries)
-                    # Use Retry-After header if available
-                    retry_after = e.response.headers.get('Retry-After')
-                    if retry_after:
-                        try:
-                            delay = float(retry_after)
-                        except (ValueError, TypeError):
-                            pass
-                    jitter = random.uniform(0, 0.25) * delay
-                    await asyncio.sleep(delay + jitter)
-        return wrapper
-    return decorator

chunkr_ai/api/misc.py DELETED Viewed

@@ -1,139 +0,0 @@
-from .configuration import Configuration
-import base64
-import io
-from pathlib import Path
-from PIL import Image
-from typing import Union, Tuple, BinaryIO, Optional, Any
-async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
-    """Convert various file types into a tuple of (filename, file content).
-    Args:
-        file: Input file, can be:
-            - URL string starting with http:// or https://
-            - Base64 string
-            - Local file path (will be converted to base64)
-            - Opened binary file (will be converted to base64)
-            - PIL/Pillow Image object (will be converted to base64)
-            - Bytes object (will be converted to base64)
-    Returns:
-        Tuple[Optional[str], str]: (filename, content) where content is either a URL or base64 string
-        The filename may be None for URLs, base64 strings, and PIL Images
-    Raises:
-        FileNotFoundError: If the file path doesn't exist
-        TypeError: If the file type is not supported
-        ValueError: If the URL is invalid or unreachable
-        ValueError: If the MIME type is unsupported
-    """
-    # Handle bytes-like objects
-    if isinstance(file, (bytes, bytearray, memoryview)):
-        # Convert to bytes first if it's not already
-        file_bytes = bytes(file)
-        # Check if this might be an already-encoded base64 string in bytes form
-        try:
-            # Try to decode the bytes to a string and see if it's valid base64
-            potential_base64 = file_bytes.decode('utf-8', errors='strict')
-            base64.b64decode(potential_base64)
-            # If we get here, it was a valid base64 string in bytes form
-            return None, potential_base64
-        except:
-            # Not a base64 string in bytes form, encode it as base64
-            base64_str = base64.b64encode(file_bytes).decode()
-            return None, base64_str
-    # Handle strings - urls or paths or base64
-    if isinstance(file, str):
-        # Handle URLs
-        if file.startswith(('http://', 'https://')):
-            return None, file
-        # Handle data URLs
-        if file.startswith('data:'):
-            return None, file
-        # Try to handle as a file path
-        try:
-            path = Path(file)
-            if path.exists():
-                # It's a valid file path, convert to Path object and continue processing
-                file = path
-            else:
-                # If not a valid file path, try treating as base64
-                try:
-                    # Just test if it's valid base64, don't store the result
-                    base64.b64decode(file)
-                    return None, file
-                except:
-                    raise ValueError(f"File not found: {file} and it's not a valid base64 string")
-        except Exception as e:
-            # If string can't be converted to Path or decoded as base64, it might still be a base64 string
-            try:
-                base64.b64decode(file)
-                return None, file
-            except:
-                raise ValueError(f"Unable to process file: {e}")
-    # Handle file paths - convert to base64
-    if isinstance(file, Path):
-        path = Path(file).resolve()
-        if not path.exists():
-            raise FileNotFoundError(f"File not found: {file}")
-        with open(path, "rb") as f:
-            file_content = f.read()
-            file_ext = path.suffix.lower().lstrip('.')
-            if not file_ext:
-                raise ValueError("File must have an extension")
-            base64_str = base64.b64encode(file_content).decode()
-            return path.name, base64_str
-    # Handle PIL Images - convert to base64
-    if isinstance(file, Image.Image):
-        img_byte_arr = io.BytesIO()
-        format = file.format or "PNG"
-        file.save(img_byte_arr, format=format)
-        img_byte_arr.seek(0)
-        base64_str = base64.b64encode(img_byte_arr.getvalue()).decode()
-        return None, base64_str
-    # Handle file-like objects - convert to base64
-    if hasattr(file, "read") and hasattr(file, "seek"):
-        file.seek(0)
-        file_content = file.read()
-        name = getattr(file, "name", "document")
-        if not name or not isinstance(name, str):
-            name = None
-        base64_str = base64.b64encode(file_content).decode()
-        return name, base64_str
-    raise TypeError(f"Unsupported file type: {type(file)}")
-async def prepare_upload_data(
-    file: Optional[Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]] = None,
-    filename: Optional[str] = None,
-    config: Optional[Configuration] = None,
-) -> dict:
-    """Prepare data dictionary for upload.
-    Args:
-        file: The file to upload
-        filename: Optional filename to use (overrides any filename from the file)
-        config: Optional configuration settings
-    Returns:
-        dict: JSON-serializable data dictionary ready for upload
-    """
-    data = {}
-    if file:
-        processed_filename, processed_file = await prepare_file(file)
-        data["file"] = processed_file
-        data["file_name"] = filename or processed_filename
-    if config:
-        data.update(config.model_dump(mode="json", exclude_none=True))
-    return data

chunkr_ai/api/protocol.py DELETED Viewed

@@ -1,14 +0,0 @@
-from typing import Optional, runtime_checkable, Protocol
-from httpx import AsyncClient
-@runtime_checkable
-class ChunkrClientProtocol(Protocol):
-    """Protocol defining the interface for Chunkr clients"""
-    raise_on_failure: bool = True
-    _client: Optional[AsyncClient] = None
-    def _headers(self) -> dict:
-        """Return headers required for API requests"""
-        ...

chunkr-ai 0.1.0__py3-none-any.whl → 0.1.0a1__py3-none-any.whl

chunkr-ai 0.1.0__py3-none-any.whl → 0.1.0a1__py3-none-any.whl