chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. chunkr_ai/__init__.py +2 -0
  2. chunkr_ai/_base_client.py +3 -3
  3. chunkr_ai/_client.py +31 -3
  4. chunkr_ai/_compat.py +48 -48
  5. chunkr_ai/_constants.py +5 -5
  6. chunkr_ai/_exceptions.py +4 -0
  7. chunkr_ai/_models.py +41 -41
  8. chunkr_ai/_types.py +35 -1
  9. chunkr_ai/_utils/__init__.py +9 -2
  10. chunkr_ai/_utils/_compat.py +45 -0
  11. chunkr_ai/_utils/_datetime_parse.py +136 -0
  12. chunkr_ai/_utils/_transform.py +11 -1
  13. chunkr_ai/_utils/_typing.py +6 -1
  14. chunkr_ai/_utils/_utils.py +0 -1
  15. chunkr_ai/_version.py +1 -1
  16. chunkr_ai/resources/__init__.py +14 -0
  17. chunkr_ai/resources/files.py +3 -3
  18. chunkr_ai/resources/tasks/__init__.py +14 -0
  19. chunkr_ai/resources/tasks/extract.py +393 -0
  20. chunkr_ai/resources/tasks/parse.py +110 -286
  21. chunkr_ai/resources/tasks/tasks.py +64 -32
  22. chunkr_ai/resources/webhooks.py +193 -0
  23. chunkr_ai/types/__init__.py +27 -1
  24. chunkr_ai/types/bounding_box.py +19 -0
  25. chunkr_ai/types/cell.py +39 -0
  26. chunkr_ai/types/cell_style.py +28 -0
  27. chunkr_ai/types/chunk.py +40 -0
  28. chunkr_ai/types/chunk_processing.py +40 -0
  29. chunkr_ai/types/chunk_processing_param.py +42 -0
  30. chunkr_ai/types/extract_configuration.py +24 -0
  31. chunkr_ai/types/extract_output_response.py +62 -0
  32. chunkr_ai/types/file_create_params.py +2 -1
  33. chunkr_ai/types/file_info.py +21 -0
  34. chunkr_ai/types/generation_config.py +29 -0
  35. chunkr_ai/types/generation_config_param.py +29 -0
  36. chunkr_ai/types/llm_processing.py +36 -0
  37. chunkr_ai/types/llm_processing_param.py +36 -0
  38. chunkr_ai/types/ocr_result.py +28 -0
  39. chunkr_ai/types/page.py +27 -0
  40. chunkr_ai/types/parse_configuration.py +64 -0
  41. chunkr_ai/types/parse_configuration_param.py +65 -0
  42. chunkr_ai/types/parse_output_response.py +29 -0
  43. chunkr_ai/types/segment.py +109 -0
  44. chunkr_ai/types/segment_processing.py +228 -0
  45. chunkr_ai/types/segment_processing_param.py +229 -0
  46. chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
  47. chunkr_ai/types/task_get_params.py +0 -3
  48. chunkr_ai/types/task_list_params.py +7 -1
  49. chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
  50. chunkr_ai/types/task_response.py +68 -0
  51. chunkr_ai/types/tasks/__init__.py +7 -1
  52. chunkr_ai/types/tasks/extract_create_params.py +47 -0
  53. chunkr_ai/types/tasks/extract_create_response.py +67 -0
  54. chunkr_ai/types/tasks/extract_get_params.py +18 -0
  55. chunkr_ai/types/tasks/extract_get_response.py +67 -0
  56. chunkr_ai/types/tasks/parse_create_params.py +25 -793
  57. chunkr_ai/types/tasks/parse_create_response.py +55 -0
  58. chunkr_ai/types/tasks/parse_get_params.py +18 -0
  59. chunkr_ai/types/tasks/parse_get_response.py +55 -0
  60. chunkr_ai/types/unwrap_webhook_event.py +11 -0
  61. chunkr_ai/types/version_info.py +31 -0
  62. chunkr_ai/types/webhook_url_response.py +9 -0
  63. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +14 -13
  64. chunkr_ai-0.1.0a8.dist-info/RECORD +88 -0
  65. chunkr_ai/types/task.py +0 -1225
  66. chunkr_ai/types/tasks/parse_update_params.py +0 -845
  67. chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
  68. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
  69. {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,40 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Union, Optional
4
+ from typing_extensions import Literal, TypeAlias
5
+
6
+ from pydantic import Field as FieldInfo
7
+
8
+ from .._models import BaseModel
9
+
10
+ __all__ = ["ChunkProcessing", "Tokenizer", "TokenizerEnum", "TokenizerString"]
11
+
12
+
13
+ class TokenizerEnum(BaseModel):
14
+ enum: Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"] = FieldInfo(alias="Enum")
15
+ """Use one of the predefined tokenizer types"""
16
+
17
+
18
+ class TokenizerString(BaseModel):
19
+ string: str = FieldInfo(alias="String")
20
+ """
21
+ Use any Hugging Face tokenizer by specifying its model ID Examples:
22
+ "Qwen/Qwen-tokenizer", "facebook/bart-large"
23
+ """
24
+
25
+
26
+ Tokenizer: TypeAlias = Union[TokenizerEnum, TokenizerString]
27
+
28
+
29
+ class ChunkProcessing(BaseModel):
30
+ ignore_headers_and_footers: Optional[bool] = None
31
+ """DEPRECATED: use `segment_processing.ignore` instead"""
32
+
33
+ target_length: Optional[int] = None
34
+ """The target number of words in each chunk.
35
+
36
+ If 0, each chunk will contain a single segment.
37
+ """
38
+
39
+ tokenizer: Optional[Tokenizer] = None
40
+ """The tokenizer to use for the chunking process."""
@@ -0,0 +1,42 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Union, Optional
6
+ from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
7
+
8
+ from .._utils import PropertyInfo
9
+
10
+ __all__ = ["ChunkProcessingParam", "Tokenizer", "TokenizerEnum", "TokenizerString"]
11
+
12
+
13
+ class TokenizerEnum(TypedDict, total=False):
14
+ enum: Required[
15
+ Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
16
+ ]
17
+ """Use one of the predefined tokenizer types"""
18
+
19
+
20
+ class TokenizerString(TypedDict, total=False):
21
+ string: Required[Annotated[str, PropertyInfo(alias="String")]]
22
+ """
23
+ Use any Hugging Face tokenizer by specifying its model ID Examples:
24
+ "Qwen/Qwen-tokenizer", "facebook/bart-large"
25
+ """
26
+
27
+
28
+ Tokenizer: TypeAlias = Union[TokenizerEnum, TokenizerString]
29
+
30
+
31
+ class ChunkProcessingParam(TypedDict, total=False):
32
+ ignore_headers_and_footers: Optional[bool]
33
+ """DEPRECATED: use `segment_processing.ignore` instead"""
34
+
35
+ target_length: int
36
+ """The target number of words in each chunk.
37
+
38
+ If 0, each chunk will contain a single segment.
39
+ """
40
+
41
+ tokenizer: Tokenizer
42
+ """The tokenizer to use for the chunking process."""
@@ -0,0 +1,24 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+
5
+ from pydantic import Field as FieldInfo
6
+
7
+ from .._models import BaseModel
8
+ from .parse_configuration import ParseConfiguration
9
+
10
+ __all__ = ["ExtractConfiguration"]
11
+
12
+
13
+ class ExtractConfiguration(BaseModel):
14
+ schema_: object = FieldInfo(alias="schema")
15
+ """The schema to be used for the extraction."""
16
+
17
+ parse_configuration: Optional[ParseConfiguration] = None
18
+ """
19
+ Optional configuration for the `parse` task. Can not be used if `file` is a
20
+ `task_id`.
21
+ """
22
+
23
+ system_prompt: Optional[str] = None
24
+ """The system prompt to be used for the extraction."""
@@ -0,0 +1,62 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from .._models import BaseModel
4
+
5
+ __all__ = ["ExtractOutputResponse"]
6
+
7
+
8
+ class ExtractOutputResponse(BaseModel):
9
+ citations: object
10
+ """Mirror of `results`; leaves are `Vec<Citation>` for the corresponding field
11
+
12
+ Example:
13
+
14
+ ```json
15
+ {
16
+ "field_name": [
17
+ {
18
+ "citation_id": "abc1234",
19
+ "citation_type": "Segment",
20
+ "bboxes": [
21
+ {
22
+ "left": 10,
23
+ "top": 20,
24
+ "width": 100,
25
+ "height": 18
26
+ }
27
+ ],
28
+ "content": "Example content",
29
+ "segment_id": "seg_001",
30
+ "segment_type": "Text",
31
+ "page_number": 1,
32
+ "page_height": 297,
33
+ "page_width": 210,
34
+ "ss_ranges": ["A1:C10"],
35
+ "ss_sheet_name": "Sheet1"
36
+ }
37
+ ]
38
+ }
39
+ ```
40
+ """
41
+
42
+ metrics: object
43
+ """
44
+ Mirror of `results`; leaves contain a `Metrics` object for the corresponding
45
+ field
46
+
47
+ Example:
48
+
49
+ ```json
50
+ { "field_name": { "confidence": "High" } }
51
+ ```
52
+ """
53
+
54
+ results: object
55
+ """JSON data that matches the provided schema
56
+
57
+ Example:
58
+
59
+ ```json
60
+ { "field_name": "value" }
61
+ ```
62
+ """
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from typing import Optional
5
6
  from typing_extensions import Required, TypedDict
6
7
 
7
8
  from .._types import FileTypes
@@ -13,5 +14,5 @@ class FileCreateParams(TypedDict, total=False):
13
14
  file: Required[FileTypes]
14
15
  """The file to upload"""
15
16
 
16
- file_metadata: Required[str]
17
+ file_metadata: Optional[str]
17
18
  """Arbitrary JSON metadata associated with the file."""
@@ -0,0 +1,21 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+
5
+ from .._models import BaseModel
6
+
7
+ __all__ = ["FileInfo"]
8
+
9
+
10
+ class FileInfo(BaseModel):
11
+ url: str
12
+ """The presigned URL/Base64 encoded URL of the input file."""
13
+
14
+ mime_type: Optional[str] = None
15
+ """The MIME type of the file."""
16
+
17
+ name: Optional[str] = None
18
+ """The name of the file."""
19
+
20
+ page_count: Optional[int] = None
21
+ """The number of pages in the file."""
@@ -0,0 +1,29 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+ from typing_extensions import Literal
5
+
6
+ from .._models import BaseModel
7
+
8
+ __all__ = ["GenerationConfig"]
9
+
10
+
11
+ class GenerationConfig(BaseModel):
12
+ crop_image: Optional[Literal["All", "Auto"]] = None
13
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
14
+
15
+ - `All` crops all images in the item
16
+ - `Auto` crops images only if required for post-processing
17
+ """
18
+
19
+ description: Optional[bool] = None
20
+ """Generate LLM descriptions for this segment"""
21
+
22
+ extended_context: Optional[bool] = None
23
+ """Use the full page image as context for LLM generation"""
24
+
25
+ format: Optional[Literal["Html", "Markdown"]] = None
26
+ """The format for the `content` field of a segment."""
27
+
28
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
29
+ """The strategy for generating the `content` field of a segment."""
@@ -0,0 +1,29 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+ from typing_extensions import Literal, TypedDict
7
+
8
+ __all__ = ["GenerationConfigParam"]
9
+
10
+
11
+ class GenerationConfigParam(TypedDict, total=False):
12
+ crop_image: Optional[Literal["All", "Auto"]]
13
+ """Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
14
+
15
+ - `All` crops all images in the item
16
+ - `Auto` crops images only if required for post-processing
17
+ """
18
+
19
+ description: Optional[bool]
20
+ """Generate LLM descriptions for this segment"""
21
+
22
+ extended_context: Optional[bool]
23
+ """Use the full page image as context for LLM generation"""
24
+
25
+ format: Optional[Literal["Html", "Markdown"]]
26
+ """The format for the `content` field of a segment."""
27
+
28
+ strategy: Optional[Literal["LLM", "Auto", "Ignore"]]
29
+ """The strategy for generating the `content` field of a segment."""
@@ -0,0 +1,36 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Union, Optional
4
+ from typing_extensions import Literal, TypeAlias
5
+
6
+ from pydantic import Field as FieldInfo
7
+
8
+ from .._models import BaseModel
9
+
10
+ __all__ = ["LlmProcessing", "FallbackStrategy", "FallbackStrategyModel"]
11
+
12
+
13
+ class FallbackStrategyModel(BaseModel):
14
+ model: str = FieldInfo(alias="Model")
15
+ """Use a specific model as fallback"""
16
+
17
+
18
+ FallbackStrategy: TypeAlias = Union[Literal["None", "Default"], FallbackStrategyModel]
19
+
20
+
21
+ class LlmProcessing(BaseModel):
22
+ fallback_strategy: Optional[FallbackStrategy] = None
23
+ """The fallback strategy to use for the LLMs in the task."""
24
+
25
+ llm_model_id: Optional[str] = None
26
+ """The ID of the model to use for the task.
27
+
28
+ If not provided, the default model will be used. Please check the documentation
29
+ for the model you want to use.
30
+ """
31
+
32
+ max_completion_tokens: Optional[int] = None
33
+ """The maximum number of tokens to generate."""
34
+
35
+ temperature: Optional[float] = None
36
+ """The temperature to use for the LLM."""
@@ -0,0 +1,36 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Union, Optional
6
+ from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
7
+
8
+ from .._utils import PropertyInfo
9
+
10
+ __all__ = ["LlmProcessingParam", "FallbackStrategy", "FallbackStrategyModel"]
11
+
12
+
13
+ class FallbackStrategyModel(TypedDict, total=False):
14
+ model: Required[Annotated[str, PropertyInfo(alias="Model")]]
15
+ """Use a specific model as fallback"""
16
+
17
+
18
+ FallbackStrategy: TypeAlias = Union[Literal["None", "Default"], FallbackStrategyModel]
19
+
20
+
21
+ class LlmProcessingParam(TypedDict, total=False):
22
+ fallback_strategy: FallbackStrategy
23
+ """The fallback strategy to use for the LLMs in the task."""
24
+
25
+ llm_model_id: Optional[str]
26
+ """The ID of the model to use for the task.
27
+
28
+ If not provided, the default model will be used. Please check the documentation
29
+ for the model you want to use.
30
+ """
31
+
32
+ max_completion_tokens: Optional[int]
33
+ """The maximum number of tokens to generate."""
34
+
35
+ temperature: float
36
+ """The temperature to use for the LLM."""
@@ -0,0 +1,28 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+
5
+ from .._models import BaseModel
6
+ from .bounding_box import BoundingBox
7
+
8
+ __all__ = ["OcrResult"]
9
+
10
+
11
+ class OcrResult(BaseModel):
12
+ bbox: BoundingBox
13
+ """Bounding box for an item. It is used for segments and OCR results."""
14
+
15
+ text: str
16
+ """The recognized text of the OCR result."""
17
+
18
+ cell_ref: Optional[str] = None
19
+ """
20
+ Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
21
+ spreadsheet cell
22
+ """
23
+
24
+ confidence: Optional[float] = None
25
+ """The confidence score of the recognized text."""
26
+
27
+ ocr_id: Optional[str] = None
28
+ """The unique identifier for the OCR result."""
@@ -0,0 +1,27 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+
5
+ from .._models import BaseModel
6
+
7
+ __all__ = ["Page"]
8
+
9
+
10
+ class Page(BaseModel):
11
+ image: str
12
+ """The presigned URL of the page/sheet image."""
13
+
14
+ page_height: float
15
+ """The number of pages in the file."""
16
+
17
+ page_number: int
18
+ """The number of pages in the file."""
19
+
20
+ page_width: float
21
+ """The number of pages in the file."""
22
+
23
+ dpi: Optional[float] = None
24
+ """DPI of the page/sheet. All cropped images are scaled to this DPI."""
25
+
26
+ ss_sheet_name: Optional[str] = None
27
+ """The name of the sheet containing the page. Only used for Spreadsheets."""
@@ -0,0 +1,64 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import Optional
4
+ from typing_extensions import Literal
5
+
6
+ from .._models import BaseModel
7
+ from .llm_processing import LlmProcessing
8
+ from .chunk_processing import ChunkProcessing
9
+ from .segment_processing import SegmentProcessing
10
+
11
+ __all__ = ["ParseConfiguration"]
12
+
13
+
14
+ class ParseConfiguration(BaseModel):
15
+ chunk_processing: Optional[ChunkProcessing] = None
16
+ """Controls the setting for the chunking and post-processing of each chunk."""
17
+
18
+ error_handling: Optional[Literal["Fail", "Continue"]] = None
19
+ """Controls how errors are handled during processing:
20
+
21
+ - `Fail`: Stops processing and fails the task when any error occurs
22
+ - `Continue`: Attempts to continue processing despite non-critical errors (eg.
23
+ LLM refusals etc.)
24
+ """
25
+
26
+ llm_processing: Optional[LlmProcessing] = None
27
+ """Controls the LLM used for the task."""
28
+
29
+ ocr_strategy: Optional[Literal["All", "Auto"]] = None
30
+ """Controls the Optical Character Recognition (OCR) strategy.
31
+
32
+ - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
33
+ - `Auto`: Selectively applies OCR only to pages with missing or low-quality
34
+ text. When text layer is present the bounding boxes from the text layer are
35
+ used.
36
+ """
37
+
38
+ pipeline: Optional[Literal["Azure", "Chunkr"]] = None
39
+
40
+ segment_processing: Optional[SegmentProcessing] = None
41
+ """Configuration for how each document segment is processed and formatted.
42
+
43
+ Each segment has sensible defaults, but you can override specific settings:
44
+
45
+ - `format`: Output as `Html` or `Markdown`
46
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
47
+ - `crop_image`: Whether to crop images to segment bounds
48
+ - `extended_context`: Use full page as context for LLM processing
49
+ - `description`: Generate descriptions for segments
50
+
51
+ **Defaults per segment type:** Check the documentation for more details.
52
+
53
+ Only specify the fields you want to change - everything else uses the defaults.
54
+ """
55
+
56
+ segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] = None
57
+ """Controls the segmentation strategy:
58
+
59
+ - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
60
+ `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
61
+ segmentation and better chunking.
62
+ - `Page`: Treats each page as a single segment. Faster processing, but without
63
+ layout element detection and only simple chunking.
64
+ """
@@ -0,0 +1,65 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+ from typing_extensions import Literal, TypedDict
7
+
8
+ from .llm_processing_param import LlmProcessingParam
9
+ from .chunk_processing_param import ChunkProcessingParam
10
+ from .segment_processing_param import SegmentProcessingParam
11
+
12
+ __all__ = ["ParseConfigurationParam"]
13
+
14
+
15
+ class ParseConfigurationParam(TypedDict, total=False):
16
+ chunk_processing: ChunkProcessingParam
17
+ """Controls the setting for the chunking and post-processing of each chunk."""
18
+
19
+ error_handling: Literal["Fail", "Continue"]
20
+ """Controls how errors are handled during processing:
21
+
22
+ - `Fail`: Stops processing and fails the task when any error occurs
23
+ - `Continue`: Attempts to continue processing despite non-critical errors (eg.
24
+ LLM refusals etc.)
25
+ """
26
+
27
+ llm_processing: LlmProcessingParam
28
+ """Controls the LLM used for the task."""
29
+
30
+ ocr_strategy: Literal["All", "Auto"]
31
+ """Controls the Optical Character Recognition (OCR) strategy.
32
+
33
+ - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
34
+ - `Auto`: Selectively applies OCR only to pages with missing or low-quality
35
+ text. When text layer is present the bounding boxes from the text layer are
36
+ used.
37
+ """
38
+
39
+ pipeline: Literal["Azure", "Chunkr"]
40
+
41
+ segment_processing: Optional[SegmentProcessingParam]
42
+ """Configuration for how each document segment is processed and formatted.
43
+
44
+ Each segment has sensible defaults, but you can override specific settings:
45
+
46
+ - `format`: Output as `Html` or `Markdown`
47
+ - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
48
+ - `crop_image`: Whether to crop images to segment bounds
49
+ - `extended_context`: Use full page as context for LLM processing
50
+ - `description`: Generate descriptions for segments
51
+
52
+ **Defaults per segment type:** Check the documentation for more details.
53
+
54
+ Only specify the fields you want to change - everything else uses the defaults.
55
+ """
56
+
57
+ segmentation_strategy: Literal["LayoutAnalysis", "Page"]
58
+ """Controls the segmentation strategy:
59
+
60
+ - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
61
+ `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
62
+ segmentation and better chunking.
63
+ - `Page`: Treats each page as a single segment. Faster processing, but without
64
+ layout element detection and only simple chunking.
65
+ """
@@ -0,0 +1,29 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import List, Optional
4
+
5
+ from .page import Page
6
+ from .chunk import Chunk
7
+ from .._models import BaseModel
8
+
9
+ __all__ = ["ParseOutputResponse"]
10
+
11
+
12
+ class ParseOutputResponse(BaseModel):
13
+ chunks: List[Chunk]
14
+ """Collection of document chunks, where each chunk contains one or more segments"""
15
+
16
+ file_name: Optional[str] = None
17
+ """The name of the file. Deprecated use `file_info.name` instead."""
18
+
19
+ mime_type: Optional[str] = None
20
+ """The MIME type of the file. Deprecated use `file_info.mime_type` instead."""
21
+
22
+ page_count: Optional[int] = None
23
+ """The number of pages in the file. Deprecated use `file_info.page_count` instead."""
24
+
25
+ pages: Optional[List[Page]] = None
26
+ """The pages of the file. Includes the image and metadata for each page."""
27
+
28
+ pdf_url: Optional[str] = None
29
+ """The presigned URL of the PDF file."""
@@ -0,0 +1,109 @@
1
+ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ from typing import List, Optional
4
+ from typing_extensions import Literal
5
+
6
+ from .cell import Cell
7
+ from .._models import BaseModel
8
+ from .ocr_result import OcrResult
9
+ from .bounding_box import BoundingBox
10
+
11
+ __all__ = ["Segment"]
12
+
13
+
14
+ class Segment(BaseModel):
15
+ bbox: BoundingBox
16
+ """Bounding box for an item. It is used for segments and OCR results."""
17
+
18
+ page_height: float
19
+ """Height of the page/sheet containing the segment."""
20
+
21
+ page_number: int
22
+ """Page number/Sheet number of the segment."""
23
+
24
+ page_width: float
25
+ """Width of the page/sheet containing the segment."""
26
+
27
+ segment_id: str
28
+ """Unique identifier for the segment."""
29
+
30
+ segment_type: Literal[
31
+ "Caption",
32
+ "Footnote",
33
+ "Formula",
34
+ "ListItem",
35
+ "Page",
36
+ "PageFooter",
37
+ "PageHeader",
38
+ "Picture",
39
+ "SectionHeader",
40
+ "Table",
41
+ "Text",
42
+ "Title",
43
+ ]
44
+ """
45
+ All the possible types for a segment. Note: Different configurations will
46
+ produce different types. Please refer to the documentation for more information.
47
+ """
48
+
49
+ confidence: Optional[float] = None
50
+ """Confidence score of the layout analysis model"""
51
+
52
+ content: Optional[str] = None
53
+ """
54
+ Content of the segment, will be either HTML or Markdown, depending on format
55
+ chosen.
56
+ """
57
+
58
+ description: Optional[str] = None
59
+ """Description of the segment, generated by the LLM."""
60
+
61
+ embed: Optional[str] = None
62
+ """Embeddable content of the segment."""
63
+
64
+ image: Optional[str] = None
65
+ """Presigned URL to the image of the segment."""
66
+
67
+ ocr: Optional[List[OcrResult]] = None
68
+ """OCR results for the segment."""
69
+
70
+ segment_length: Optional[int] = None
71
+ """Length of the segment in tokens."""
72
+
73
+ ss_cells: Optional[List[Cell]] = None
74
+ """Cells of the segment. Only used for Spreadsheets."""
75
+
76
+ ss_header_bbox: Optional[BoundingBox] = None
77
+ """Bounding box of the header of the segment, if found.
78
+
79
+ Only used for Spreadsheets.
80
+ """
81
+
82
+ ss_header_ocr: Optional[List[OcrResult]] = None
83
+ """OCR results of the header of the segment, if found. Only used for Spreadsheets."""
84
+
85
+ ss_header_range: Optional[str] = None
86
+ """
87
+ Header range of the segment, if found. The header can have overlap with the
88
+ `segment.range` if the table contains the header, if the header is located in a
89
+ different sheet, the header range will have no overlap with the `segment.range`.
90
+ Only used for Spreadsheets.
91
+ """
92
+
93
+ ss_header_text: Optional[str] = None
94
+ """Text content of the header of the segment, if found.
95
+
96
+ Only used for Spreadsheets.
97
+ """
98
+
99
+ ss_range: Optional[str] = None
100
+ """Range of the segment in Excel notation (e.g., A1:B5).
101
+
102
+ Only used for Spreadsheets.
103
+ """
104
+
105
+ ss_sheet_name: Optional[str] = None
106
+ """Name of the sheet containing the segment. Only used for Spreadsheets."""
107
+
108
+ text: Optional[str] = None
109
+ """Text content of the segment. Calculated by the OCR results."""