PyPI - chunkr-ai - Versions diffs - 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl - Mend

chunkr-ai 0.1.0a5 (py3-none-any.whl) → 0.1.0a7 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

chunkr_ai/__init__.py +2 -0
chunkr_ai/_client.py +31 -3
chunkr_ai/_constants.py +5 -5
chunkr_ai/_exceptions.py +4 -0
chunkr_ai/_models.py +1 -1
chunkr_ai/_types.py +35 -1
chunkr_ai/_utils/__init__.py +1 -0
chunkr_ai/_utils/_typing.py +5 -0
chunkr_ai/_version.py +1 -1
chunkr_ai/resources/__init__.py +14 -0
chunkr_ai/resources/files.py +3 -3
chunkr_ai/resources/tasks/__init__.py +14 -0
chunkr_ai/resources/tasks/extract.py +409 -0
chunkr_ai/resources/tasks/parse.py +102 -346
chunkr_ai/resources/tasks/tasks.py +62 -14
chunkr_ai/resources/webhooks.py +193 -0
chunkr_ai/types/__init__.py +27 -1
chunkr_ai/types/bounding_box.py +19 -0
chunkr_ai/types/cell.py +39 -0
chunkr_ai/types/cell_style.py +28 -0
chunkr_ai/types/chunk.py +40 -0
chunkr_ai/types/chunk_processing.py +40 -0
chunkr_ai/types/chunk_processing_param.py +42 -0
chunkr_ai/types/extract_configuration.py +24 -0
chunkr_ai/types/extract_output_response.py +19 -0
chunkr_ai/types/file_create_params.py +2 -1
chunkr_ai/types/file_info.py +21 -0
chunkr_ai/types/generation_config.py +29 -0
chunkr_ai/types/generation_config_param.py +29 -0
chunkr_ai/types/llm_processing.py +36 -0
chunkr_ai/types/llm_processing_param.py +36 -0
chunkr_ai/types/ocr_result.py +28 -0
chunkr_ai/types/page.py +27 -0
chunkr_ai/types/parse_configuration.py +64 -0
chunkr_ai/types/parse_configuration_param.py +65 -0
chunkr_ai/types/parse_output_response.py +29 -0
chunkr_ai/types/segment.py +109 -0
chunkr_ai/types/segment_processing.py +228 -0
chunkr_ai/types/segment_processing_param.py +229 -0
chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
chunkr_ai/types/task_list_params.py +7 -1
chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
chunkr_ai/types/task_response.py +68 -0
chunkr_ai/types/tasks/__init__.py +7 -1
chunkr_ai/types/tasks/extract_create_params.py +47 -0
chunkr_ai/types/tasks/extract_create_response.py +214 -0
chunkr_ai/types/tasks/extract_get_params.py +21 -0
chunkr_ai/types/tasks/extract_get_response.py +214 -0
chunkr_ai/types/tasks/parse_create_params.py +25 -805
chunkr_ai/types/tasks/parse_create_response.py +55 -0
chunkr_ai/types/tasks/parse_get_params.py +21 -0
chunkr_ai/types/tasks/parse_get_response.py +55 -0
chunkr_ai/types/unwrap_webhook_event.py +11 -0
chunkr_ai/types/version_info.py +31 -0
chunkr_ai/types/webhook_url_response.py +9 -0
{chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/METADATA +14 -13
chunkr_ai-0.1.0a7.dist-info/RECORD +86 -0
chunkr_ai/types/task.py +0 -1225
chunkr_ai/types/tasks/parse_update_params.py +0 -857
chunkr_ai-0.1.0a5.dist-info/RECORD +0 -52
{chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/WHEEL +0 -0
{chunkr_ai-0.1.0a5.dist-info → chunkr_ai-0.1.0a7.dist-info}/licenses/LICENSE +0 -0

chunkr_ai/resources/tasks/parse.py CHANGED Viewed

@@ -17,9 +17,13 @@ from ..._response import (
     async_to_raw_response_wrapper,
     async_to_streamed_response_wrapper,
 )
-from ...types.task import Task
-from ...types.tasks import parse_create_params, parse_update_params
+from ...types.tasks import parse_get_params, parse_create_params
 from ..._base_client import make_request_options
+from ...types.llm_processing_param import LlmProcessingParam
+from ...types.chunk_processing_param import ChunkProcessingParam
+from ...types.segment_processing_param import SegmentProcessingParam
+from ...types.tasks.parse_get_response import ParseGetResponse
+from ...types.tasks.parse_create_response import ParseCreateResponse
 __all__ = ["ParseResource", "AsyncParseResource"]
@@ -48,18 +52,15 @@ class ParseResource(SyncAPIResource):
         self,
         *,
         file: str,
-        base64_urls: bool | NotGiven = NOT_GIVEN,
-        include_chunks: bool | NotGiven = NOT_GIVEN,
-        wait_for_completion: bool | NotGiven = NOT_GIVEN,
-        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
-        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
+        chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
+        error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
         expires_in: Optional[int] | NotGiven = NOT_GIVEN,
         file_name: Optional[str] | NotGiven = NOT_GIVEN,
-        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
-        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
-        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
-        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
-        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
+        llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
+        ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
+        pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
+        segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
+        segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -67,33 +68,23 @@ class ParseResource(SyncAPIResource):
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
         idempotency_key: str | None = None,
-    ) -> Task:
+    ) -> ParseCreateResponse:
         """
         Queues a document for processing and returns a `TaskResponse` with the assigned
         `task_id`, initial configuration, file metadata, and timestamps. The initial
         status is `Starting`.
-        If `wait_for_completion=true` is provided, the server waits briefly for
-        completion. If the task completes within that window, a 200 response with the
-        final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
-        with retry guidance and a body describing how long to wait before retrying.
+        Creates a parse task and returns its metadata immediately.
         Args:
           file:
-              The file to be uploaded. Supported inputs:
+              The file to be parsed. Supported inputs:
               - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
                 API
               - `http(s)://...`: Remote URL to fetch
               - `data:*;base64,...` or raw base64 string
-          base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
-          include_chunks: Whether to include chunks in the output response
-          wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
-              408/409 with Retry-After headers
           chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
           error_handling:
@@ -106,7 +97,7 @@ class ParseResource(SyncAPIResource):
           expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
               updated, polled or accessed via web interface.
-          file_name: The name of the file to be uploaded. If not set a name will be generated.
+          file_name: The name of the file to be parsed. If not set a name will be generated.
           llm_processing: Controls the LLM used for the task.
@@ -117,41 +108,26 @@ class ParseResource(SyncAPIResource):
                 text. When text layer is present the bounding boxes from the text layer are
                 used.
-          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
-              output will be unified to the Chunkr `output` format.
-          segment_processing: Defines how each segment type is handled when generating the final output.
-              Each segment uses one of three strategies. The chosen strategy controls:
-              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
-              - How the content is produced (rule-based vs. LLM).
-              - The output format (`Html` or `Markdown`).
+          segment_processing: Configuration for how each document segment is processed and formatted.
-              Optional flags such as image **cropping**, **extended context**, and
-              **descriptions** further refine behaviour.
+              Each segment has sensible defaults, but you can override specific settings:
-              **Default strategy per segment**
+              - `format`: Output as `Html` or `Markdown`
+              - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
+              - `crop_image`: Whether to crop images to segment bounds
+              - `extended_context`: Use full page as context for LLM processing
+              - `description`: Generate descriptions for segments
-              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
-                (Markdown, description off)
-              - `Table` → **LLM** (HTML, description on)
-              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
-              - `Formula`, `Page` → **LLM** (Markdown, description off)
-              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
+              **Defaults per segment type:** Check the documentation for more details.
-              **Strategy reference**
-              - **Auto** – rule-based content generation.
-              - **LLM** – generate content with an LLM.
-              - **Ignore** – exclude the segment entirely.
+              Only specify the fields you want to change - everything else uses the defaults.
           segmentation_strategy:
               Controls the segmentation strategy:
               - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                 `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
-                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
+                segmentation and better chunking.
               - `Page`: Treats each page as a single segment. Faster processing, but without
                 layout element detection and only simple chunking.
@@ -188,126 +164,47 @@ class ParseResource(SyncAPIResource):
                 extra_body=extra_body,
                 timeout=timeout,
                 idempotency_key=idempotency_key,
-                query=maybe_transform(
-                    {
-                        "base64_urls": base64_urls,
-                        "include_chunks": include_chunks,
-                        "wait_for_completion": wait_for_completion,
-                    },
-                    parse_create_params.ParseCreateParams,
-                ),
             ),
-            cast_to=Task,
+            cast_to=ParseCreateResponse,
         )
-    def update(
+    def get(
         self,
-        task_id: str,
+        task_id: Optional[str],
         *,
         base64_urls: bool | NotGiven = NOT_GIVEN,
         include_chunks: bool | NotGiven = NOT_GIVEN,
         wait_for_completion: bool | NotGiven = NOT_GIVEN,
-        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
-        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
-        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
-        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
-        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
-        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
-        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
-        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
-        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
-        idempotency_key: str | None = None,
-    ) -> Task:
-        """Updates an existing task's configuration and reprocesses the document.
-        The
-        current configuration is used as the base; only provided fields are changed.
+    ) -> ParseGetResponse:
+        """
+        Retrieves the current state of a parse task and, when requested, can wait for
+        completion.
-        Requirements:
+        Returns task details such as processing status, configuration, output (when
+        available), file metadata, and timestamps. If `wait_for_completion=true` is
+        provided, the server will hold the request briefly. If the task does not reach a
+        terminal state during that window, the response will indicate a retry with
+        appropriate headers.
-        - Task must be in a terminal state (`Succeeded` or `Failed`).
-        - The new configuration must differ from the current configuration.
+        Typical uses:
-        If `wait_for_completion=true` is provided, the server waits briefly for
-        completion. If the task completes within that window, a 200 response with the
-        final `TaskResponse` is returned. Otherwise, the server returns a 408 with retry
-        guidance and a body describing how long to wait before retrying.
+        - Poll a task during processing
+        - Retrieve the final output once processing is complete
+        - Access task metadata and configuration
         Args:
-          base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
+          base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
+              presigned URLs.
           include_chunks: Whether to include chunks in the output response
-          wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
-              408/409 with Retry-After headers
-          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
-          error_handling:
-              Controls how errors are handled during processing:
-              - `Fail`: Stops processing and fails the task when any error occurs
-              - `Continue`: Attempts to continue processing despite non-critical errors (eg.
-                LLM refusals etc.)
-          expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
-              updated, polled or accessed via web interface.
-          high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
-              penalty: ~7 seconds per page)
-          llm_processing: Controls the LLM used for the task.
-          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
-              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
-              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
-                text. When text layer is present the bounding boxes from the text layer are
-                used.
-          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
-              output will be unified to the Chunkr `output` format.
-          segment_processing: Defines how each segment type is handled when generating the final output.
-              Each segment uses one of three strategies. The chosen strategy controls:
-              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
-              - How the content is produced (rule-based vs. LLM).
-              - The output format (`Html` or `Markdown`).
-              Optional flags such as image **cropping**, **extended context**, and
-              **descriptions** further refine behaviour.
-              **Default strategy per segment**
-              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
-                (Markdown, description off)
-              - `Table` → **LLM** (HTML, description on)
-              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
-              - `Formula`, `Page` → **LLM** (Markdown, description off)
-              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
-              **Strategy reference**
-              - **Auto** – rule-based content generation.
-              - **LLM** – generate content with an LLM.
-              - **Ignore** – exclude the segment entirely.
-          segmentation_strategy:
-              Controls the segmentation strategy:
-              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
-                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
-                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
-              - `Page`: Treats each page as a single segment. Faster processing, but without
-                layout element detection and only simple chunking.
+          wait_for_completion: Whether to wait for the task to complete
           extra_headers: Send extra headers
@@ -316,43 +213,26 @@ class ParseResource(SyncAPIResource):
           extra_body: Add additional JSON properties to the request
           timeout: Override the client-level default timeout for this request, in seconds
-          idempotency_key: Specify a custom idempotency key for this request
         """
         if not task_id:
             raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
-        return self._patch(
-            f"/tasks/parse/{task_id}",
-            body=maybe_transform(
-                {
-                    "chunk_processing": chunk_processing,
-                    "error_handling": error_handling,
-                    "expires_in": expires_in,
-                    "high_resolution": high_resolution,
-                    "llm_processing": llm_processing,
-                    "ocr_strategy": ocr_strategy,
-                    "pipeline": pipeline,
-                    "segment_processing": segment_processing,
-                    "segmentation_strategy": segmentation_strategy,
-                },
-                parse_update_params.ParseUpdateParams,
-            ),
+        return self._get(
+            f"/tasks/{task_id}/parse",
             options=make_request_options(
                 extra_headers=extra_headers,
                 extra_query=extra_query,
                 extra_body=extra_body,
                 timeout=timeout,
-                idempotency_key=idempotency_key,
                 query=maybe_transform(
                     {
                         "base64_urls": base64_urls,
                         "include_chunks": include_chunks,
                         "wait_for_completion": wait_for_completion,
                     },
-                    parse_update_params.ParseUpdateParams,
+                    parse_get_params.ParseGetParams,
                 ),
             ),
-            cast_to=Task,
+            cast_to=ParseGetResponse,
         )
@@ -380,18 +260,15 @@ class AsyncParseResource(AsyncAPIResource):
         self,
         *,
         file: str,
-        base64_urls: bool | NotGiven = NOT_GIVEN,
-        include_chunks: bool | NotGiven = NOT_GIVEN,
-        wait_for_completion: bool | NotGiven = NOT_GIVEN,
-        chunk_processing: Optional[parse_create_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
-        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
+        chunk_processing: ChunkProcessingParam | NotGiven = NOT_GIVEN,
+        error_handling: Literal["Fail", "Continue"] | NotGiven = NOT_GIVEN,
         expires_in: Optional[int] | NotGiven = NOT_GIVEN,
         file_name: Optional[str] | NotGiven = NOT_GIVEN,
-        llm_processing: Optional[parse_create_params.LlmProcessing] | NotGiven = NOT_GIVEN,
-        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
-        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
-        segment_processing: Optional[parse_create_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
-        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
+        llm_processing: LlmProcessingParam | NotGiven = NOT_GIVEN,
+        ocr_strategy: Literal["All", "Auto"] | NotGiven = NOT_GIVEN,
+        pipeline: Literal["Azure", "Chunkr"] | NotGiven = NOT_GIVEN,
+        segment_processing: Optional[SegmentProcessingParam] | NotGiven = NOT_GIVEN,
+        segmentation_strategy: Literal["LayoutAnalysis", "Page"] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -399,33 +276,23 @@ class AsyncParseResource(AsyncAPIResource):
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
         idempotency_key: str | None = None,
-    ) -> Task:
+    ) -> ParseCreateResponse:
         """
         Queues a document for processing and returns a `TaskResponse` with the assigned
         `task_id`, initial configuration, file metadata, and timestamps. The initial
         status is `Starting`.
-        If `wait_for_completion=true` is provided, the server waits briefly for
-        completion. If the task completes within that window, a 200 response with the
-        final `TaskResponse` is returned. Otherwise, the server returns a 408 or 409
-        with retry guidance and a body describing how long to wait before retrying.
+        Creates a parse task and returns its metadata immediately.
         Args:
           file:
-              The file to be uploaded. Supported inputs:
+              The file to be parsed. Supported inputs:
               - `ch://files/{file_id}`: Reference to an existing file. Upload via the Files
                 API
               - `http(s)://...`: Remote URL to fetch
               - `data:*;base64,...` or raw base64 string
-          base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
-          include_chunks: Whether to include chunks in the output response
-          wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
-              408/409 with Retry-After headers
           chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
           error_handling:
@@ -438,7 +305,7 @@ class AsyncParseResource(AsyncAPIResource):
           expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
               updated, polled or accessed via web interface.
-          file_name: The name of the file to be uploaded. If not set a name will be generated.
+          file_name: The name of the file to be parsed. If not set a name will be generated.
           llm_processing: Controls the LLM used for the task.
@@ -449,41 +316,26 @@ class AsyncParseResource(AsyncAPIResource):
                 text. When text layer is present the bounding boxes from the text layer are
                 used.
-          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
-              output will be unified to the Chunkr `output` format.
-          segment_processing: Defines how each segment type is handled when generating the final output.
-              Each segment uses one of three strategies. The chosen strategy controls:
-              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
-              - How the content is produced (rule-based vs. LLM).
-              - The output format (`Html` or `Markdown`).
+          segment_processing: Configuration for how each document segment is processed and formatted.
-              Optional flags such as image **cropping**, **extended context**, and
-              **descriptions** further refine behaviour.
+              Each segment has sensible defaults, but you can override specific settings:
-              **Default strategy per segment**
+              - `format`: Output as `Html` or `Markdown`
+              - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
+              - `crop_image`: Whether to crop images to segment bounds
+              - `extended_context`: Use full page as context for LLM processing
+              - `description`: Generate descriptions for segments
-              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
-                (Markdown, description off)
-              - `Table` → **LLM** (HTML, description on)
-              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
-              - `Formula`, `Page` → **LLM** (Markdown, description off)
-              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
+              **Defaults per segment type:** Check the documentation for more details.
-              **Strategy reference**
-              - **Auto** – rule-based content generation.
-              - **LLM** – generate content with an LLM.
-              - **Ignore** – exclude the segment entirely.
+              Only specify the fields you want to change - everything else uses the defaults.
           segmentation_strategy:
               Controls the segmentation strategy:
               - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
                 `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
-                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
+                segmentation and better chunking.
               - `Page`: Treats each page as a single segment. Faster processing, but without
                 layout element detection and only simple chunking.
@@ -520,126 +372,47 @@ class AsyncParseResource(AsyncAPIResource):
                 extra_body=extra_body,
                 timeout=timeout,
                 idempotency_key=idempotency_key,
-                query=await async_maybe_transform(
-                    {
-                        "base64_urls": base64_urls,
-                        "include_chunks": include_chunks,
-                        "wait_for_completion": wait_for_completion,
-                    },
-                    parse_create_params.ParseCreateParams,
-                ),
             ),
-            cast_to=Task,
+            cast_to=ParseCreateResponse,
         )
-    async def update(
+    async def get(
         self,
-        task_id: str,
+        task_id: Optional[str],
         *,
         base64_urls: bool | NotGiven = NOT_GIVEN,
         include_chunks: bool | NotGiven = NOT_GIVEN,
         wait_for_completion: bool | NotGiven = NOT_GIVEN,
-        chunk_processing: Optional[parse_update_params.ChunkProcessing] | NotGiven = NOT_GIVEN,
-        error_handling: Optional[Literal["Fail", "Continue"]] | NotGiven = NOT_GIVEN,
-        expires_in: Optional[int] | NotGiven = NOT_GIVEN,
-        high_resolution: Optional[bool] | NotGiven = NOT_GIVEN,
-        llm_processing: Optional[parse_update_params.LlmProcessing] | NotGiven = NOT_GIVEN,
-        ocr_strategy: Optional[Literal["All", "Auto"]] | NotGiven = NOT_GIVEN,
-        pipeline: Optional[Literal["Azure", "Chunkr"]] | NotGiven = NOT_GIVEN,
-        segment_processing: Optional[parse_update_params.SegmentProcessing] | NotGiven = NOT_GIVEN,
-        segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
-        idempotency_key: str | None = None,
-    ) -> Task:
-        """Updates an existing task's configuration and reprocesses the document.
-        The
-        current configuration is used as the base; only provided fields are changed.
+    ) -> ParseGetResponse:
+        """
+        Retrieves the current state of a parse task and, when requested, can wait for
+        completion.
-        Requirements:
+        Returns task details such as processing status, configuration, output (when
+        available), file metadata, and timestamps. If `wait_for_completion=true` is
+        provided, the server will hold the request briefly. If the task does not reach a
+        terminal state during that window, the response will indicate a retry with
+        appropriate headers.
-        - Task must be in a terminal state (`Succeeded` or `Failed`).
-        - The new configuration must differ from the current configuration.
+        Typical uses:
-        If `wait_for_completion=true` is provided, the server waits briefly for
-        completion. If the task completes within that window, a 200 response with the
-        final `TaskResponse` is returned. Otherwise, the server returns a 408 with retry
-        guidance and a body describing how long to wait before retrying.
+        - Poll a task during processing
+        - Retrieve the final output once processing is complete
+        - Access task metadata and configuration
         Args:
-          base64_urls: Whether to return base64 encoded URLs. If false, presigned URLs are returned.
+          base64_urls: Whether to return base64 encoded URLs. If false, the URLs will be returned as
+              presigned URLs.
           include_chunks: Whether to include chunks in the output response
-          wait_for_completion: If true, server holds briefly and may return 200 when done; otherwise returns
-              408/409 with Retry-After headers
-          chunk_processing: Controls the setting for the chunking and post-processing of each chunk.
-          error_handling:
-              Controls how errors are handled during processing:
-              - `Fail`: Stops processing and fails the task when any error occurs
-              - `Continue`: Attempts to continue processing despite non-critical errors (eg.
-                LLM refusals etc.)
-          expires_in: The number of seconds until task is deleted. Expired tasks can **not** be
-              updated, polled or accessed via web interface.
-          high_resolution: Whether to use high-resolution images for cropping and post-processing. (Latency
-              penalty: ~7 seconds per page)
-          llm_processing: Controls the LLM used for the task.
-          ocr_strategy: Controls the Optical Character Recognition (OCR) strategy.
-              - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
-              - `Auto`: Selectively applies OCR only to pages with missing or low-quality
-                text. When text layer is present the bounding boxes from the text layer are
-                used.
-          pipeline: Choose the provider whose models will be used for segmentation and OCR. The
-              output will be unified to the Chunkr `output` format.
-          segment_processing: Defines how each segment type is handled when generating the final output.
-              Each segment uses one of three strategies. The chosen strategy controls:
-              - Whether the segment is kept (`Auto`, `LLM`) or skipped (`Ignore`).
-              - How the content is produced (rule-based vs. LLM).
-              - The output format (`Html` or `Markdown`).
-              Optional flags such as image **cropping**, **extended context**, and
-              **descriptions** further refine behaviour.
-              **Default strategy per segment**
-              - `Title`, `SectionHeader`, `Text`, `ListItem`, `Caption`, `Footnote` → **Auto**
-                (Markdown, description off)
-              - `Table` → **LLM** (HTML, description on)
-              - `Picture` → **LLM** (Markdown, description off, cropping _All_)
-              - `Formula`, `Page` → **LLM** (Markdown, description off)
-              - `PageHeader`, `PageFooter` → **Ignore** (removed from output)
-              **Strategy reference**
-              - **Auto** – rule-based content generation.
-              - **LLM** – generate content with an LLM.
-              - **Ignore** – exclude the segment entirely.
-          segmentation_strategy:
-              Controls the segmentation strategy:
-              - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
-                `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
-                segmentation and better chunking. (Latency penalty: ~TBD seconds per page).
-              - `Page`: Treats each page as a single segment. Faster processing, but without
-                layout element detection and only simple chunking.
+          wait_for_completion: Whether to wait for the task to complete
           extra_headers: Send extra headers
@@ -648,43 +421,26 @@ class AsyncParseResource(AsyncAPIResource):
           extra_body: Add additional JSON properties to the request
           timeout: Override the client-level default timeout for this request, in seconds
-          idempotency_key: Specify a custom idempotency key for this request
         """
         if not task_id:
             raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}")
-        return await self._patch(
-            f"/tasks/parse/{task_id}",
-            body=await async_maybe_transform(
-                {
-                    "chunk_processing": chunk_processing,
-                    "error_handling": error_handling,
-                    "expires_in": expires_in,
-                    "high_resolution": high_resolution,
-                    "llm_processing": llm_processing,
-                    "ocr_strategy": ocr_strategy,
-                    "pipeline": pipeline,
-                    "segment_processing": segment_processing,
-                    "segmentation_strategy": segmentation_strategy,
-                },
-                parse_update_params.ParseUpdateParams,
-            ),
+        return await self._get(
+            f"/tasks/{task_id}/parse",
             options=make_request_options(
                 extra_headers=extra_headers,
                 extra_query=extra_query,
                 extra_body=extra_body,
                 timeout=timeout,
-                idempotency_key=idempotency_key,
                 query=await async_maybe_transform(
                     {
                         "base64_urls": base64_urls,
                         "include_chunks": include_chunks,
                         "wait_for_completion": wait_for_completion,
                     },
-                    parse_update_params.ParseUpdateParams,
+                    parse_get_params.ParseGetParams,
                 ),
             ),
-            cast_to=Task,
+            cast_to=ParseGetResponse,
         )
@@ -695,8 +451,8 @@ class ParseResourceWithRawResponse:
         self.create = to_raw_response_wrapper(
             parse.create,
         )
-        self.update = to_raw_response_wrapper(
-            parse.update,
+        self.get = to_raw_response_wrapper(
+            parse.get,
         )
@@ -707,8 +463,8 @@ class AsyncParseResourceWithRawResponse:
         self.create = async_to_raw_response_wrapper(
             parse.create,
         )
-        self.update = async_to_raw_response_wrapper(
-            parse.update,
+        self.get = async_to_raw_response_wrapper(
+            parse.get,
         )
@@ -719,8 +475,8 @@ class ParseResourceWithStreamingResponse:
         self.create = to_streamed_response_wrapper(
             parse.create,
         )
-        self.update = to_streamed_response_wrapper(
-            parse.update,
+        self.get = to_streamed_response_wrapper(
+            parse.get,
         )
@@ -731,6 +487,6 @@ class AsyncParseResourceWithStreamingResponse:
         self.create = async_to_streamed_response_wrapper(
             parse.create,
         )
-        self.update = async_to_streamed_response_wrapper(
-            parse.update,
+        self.get = async_to_streamed_response_wrapper(
+            parse.get,
         )

chunkr-ai 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl

chunkr-ai 0.1.0a5__py3-none-any.whl → 0.1.0a7__py3-none-any.whl