chunkr-ai 0.1.0a6__py3-none-any.whl → 0.1.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +2 -0
- chunkr_ai/_base_client.py +3 -3
- chunkr_ai/_client.py +31 -3
- chunkr_ai/_compat.py +48 -48
- chunkr_ai/_constants.py +5 -5
- chunkr_ai/_exceptions.py +4 -0
- chunkr_ai/_models.py +41 -41
- chunkr_ai/_types.py +35 -1
- chunkr_ai/_utils/__init__.py +9 -2
- chunkr_ai/_utils/_compat.py +45 -0
- chunkr_ai/_utils/_datetime_parse.py +136 -0
- chunkr_ai/_utils/_transform.py +11 -1
- chunkr_ai/_utils/_typing.py +6 -1
- chunkr_ai/_utils/_utils.py +0 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/resources/__init__.py +14 -0
- chunkr_ai/resources/files.py +3 -3
- chunkr_ai/resources/tasks/__init__.py +14 -0
- chunkr_ai/resources/tasks/extract.py +393 -0
- chunkr_ai/resources/tasks/parse.py +110 -286
- chunkr_ai/resources/tasks/tasks.py +64 -32
- chunkr_ai/resources/webhooks.py +193 -0
- chunkr_ai/types/__init__.py +27 -1
- chunkr_ai/types/bounding_box.py +19 -0
- chunkr_ai/types/cell.py +39 -0
- chunkr_ai/types/cell_style.py +28 -0
- chunkr_ai/types/chunk.py +40 -0
- chunkr_ai/types/chunk_processing.py +40 -0
- chunkr_ai/types/chunk_processing_param.py +42 -0
- chunkr_ai/types/extract_configuration.py +24 -0
- chunkr_ai/types/extract_output_response.py +62 -0
- chunkr_ai/types/file_create_params.py +2 -1
- chunkr_ai/types/file_info.py +21 -0
- chunkr_ai/types/generation_config.py +29 -0
- chunkr_ai/types/generation_config_param.py +29 -0
- chunkr_ai/types/llm_processing.py +36 -0
- chunkr_ai/types/llm_processing_param.py +36 -0
- chunkr_ai/types/ocr_result.py +28 -0
- chunkr_ai/types/page.py +27 -0
- chunkr_ai/types/parse_configuration.py +64 -0
- chunkr_ai/types/parse_configuration_param.py +65 -0
- chunkr_ai/types/parse_output_response.py +29 -0
- chunkr_ai/types/segment.py +109 -0
- chunkr_ai/types/segment_processing.py +228 -0
- chunkr_ai/types/segment_processing_param.py +229 -0
- chunkr_ai/types/task_extract_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_get_params.py +0 -3
- chunkr_ai/types/task_list_params.py +7 -1
- chunkr_ai/types/task_parse_updated_webhook_event.py +22 -0
- chunkr_ai/types/task_response.py +68 -0
- chunkr_ai/types/tasks/__init__.py +7 -1
- chunkr_ai/types/tasks/extract_create_params.py +47 -0
- chunkr_ai/types/tasks/extract_create_response.py +67 -0
- chunkr_ai/types/tasks/extract_get_params.py +18 -0
- chunkr_ai/types/tasks/extract_get_response.py +67 -0
- chunkr_ai/types/tasks/parse_create_params.py +25 -793
- chunkr_ai/types/tasks/parse_create_response.py +55 -0
- chunkr_ai/types/tasks/parse_get_params.py +18 -0
- chunkr_ai/types/tasks/parse_get_response.py +55 -0
- chunkr_ai/types/unwrap_webhook_event.py +11 -0
- chunkr_ai/types/version_info.py +31 -0
- chunkr_ai/types/webhook_url_response.py +9 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/METADATA +14 -13
- chunkr_ai-0.1.0a8.dist-info/RECORD +88 -0
- chunkr_ai/types/task.py +0 -1225
- chunkr_ai/types/tasks/parse_update_params.py +0 -845
- chunkr_ai-0.1.0a6.dist-info/RECORD +0 -52
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a6.dist-info → chunkr_ai-0.1.0a8.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Union, Optional
|
4
|
+
from typing_extensions import Literal, TypeAlias
|
5
|
+
|
6
|
+
from pydantic import Field as FieldInfo
|
7
|
+
|
8
|
+
from .._models import BaseModel
|
9
|
+
|
10
|
+
__all__ = ["ChunkProcessing", "Tokenizer", "TokenizerEnum", "TokenizerString"]
|
11
|
+
|
12
|
+
|
13
|
+
class TokenizerEnum(BaseModel):
|
14
|
+
enum: Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"] = FieldInfo(alias="Enum")
|
15
|
+
"""Use one of the predefined tokenizer types"""
|
16
|
+
|
17
|
+
|
18
|
+
class TokenizerString(BaseModel):
|
19
|
+
string: str = FieldInfo(alias="String")
|
20
|
+
"""
|
21
|
+
Use any Hugging Face tokenizer by specifying its model ID. Examples:
|
22
|
+
"Qwen/Qwen-tokenizer", "facebook/bart-large"
|
23
|
+
"""
|
24
|
+
|
25
|
+
|
26
|
+
Tokenizer: TypeAlias = Union[TokenizerEnum, TokenizerString]
|
27
|
+
|
28
|
+
|
29
|
+
class ChunkProcessing(BaseModel):
|
30
|
+
ignore_headers_and_footers: Optional[bool] = None
|
31
|
+
"""DEPRECATED: use `segment_processing.ignore` instead"""
|
32
|
+
|
33
|
+
target_length: Optional[int] = None
|
34
|
+
"""The target number of words in each chunk.
|
35
|
+
|
36
|
+
If 0, each chunk will contain a single segment.
|
37
|
+
"""
|
38
|
+
|
39
|
+
tokenizer: Optional[Tokenizer] = None
|
40
|
+
"""The tokenizer to use for the chunking process."""
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Union, Optional
|
6
|
+
from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
|
7
|
+
|
8
|
+
from .._utils import PropertyInfo
|
9
|
+
|
10
|
+
__all__ = ["ChunkProcessingParam", "Tokenizer", "TokenizerEnum", "TokenizerString"]
|
11
|
+
|
12
|
+
|
13
|
+
class TokenizerEnum(TypedDict, total=False):
|
14
|
+
enum: Required[
|
15
|
+
Annotated[Literal["Word", "Cl100kBase", "XlmRobertaBase", "BertBaseUncased"], PropertyInfo(alias="Enum")]
|
16
|
+
]
|
17
|
+
"""Use one of the predefined tokenizer types"""
|
18
|
+
|
19
|
+
|
20
|
+
class TokenizerString(TypedDict, total=False):
|
21
|
+
string: Required[Annotated[str, PropertyInfo(alias="String")]]
|
22
|
+
"""
|
23
|
+
Use any Hugging Face tokenizer by specifying its model ID. Examples:
|
24
|
+
"Qwen/Qwen-tokenizer", "facebook/bart-large"
|
25
|
+
"""
|
26
|
+
|
27
|
+
|
28
|
+
Tokenizer: TypeAlias = Union[TokenizerEnum, TokenizerString]
|
29
|
+
|
30
|
+
|
31
|
+
class ChunkProcessingParam(TypedDict, total=False):
|
32
|
+
ignore_headers_and_footers: Optional[bool]
|
33
|
+
"""DEPRECATED: use `segment_processing.ignore` instead"""
|
34
|
+
|
35
|
+
target_length: int
|
36
|
+
"""The target number of words in each chunk.
|
37
|
+
|
38
|
+
If 0, each chunk will contain a single segment.
|
39
|
+
"""
|
40
|
+
|
41
|
+
tokenizer: Tokenizer
|
42
|
+
"""The tokenizer to use for the chunking process."""
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from pydantic import Field as FieldInfo
|
6
|
+
|
7
|
+
from .._models import BaseModel
|
8
|
+
from .parse_configuration import ParseConfiguration
|
9
|
+
|
10
|
+
__all__ = ["ExtractConfiguration"]
|
11
|
+
|
12
|
+
|
13
|
+
class ExtractConfiguration(BaseModel):
|
14
|
+
schema_: object = FieldInfo(alias="schema")
|
15
|
+
"""The schema to be used for the extraction."""
|
16
|
+
|
17
|
+
parse_configuration: Optional[ParseConfiguration] = None
|
18
|
+
"""
|
19
|
+
Optional configuration for the `parse` task. Cannot be used if `file` is a
|
20
|
+
`task_id`.
|
21
|
+
"""
|
22
|
+
|
23
|
+
system_prompt: Optional[str] = None
|
24
|
+
"""The system prompt to be used for the extraction."""
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from .._models import BaseModel
|
4
|
+
|
5
|
+
__all__ = ["ExtractOutputResponse"]
|
6
|
+
|
7
|
+
|
8
|
+
class ExtractOutputResponse(BaseModel):
|
9
|
+
citations: object
|
10
|
+
"""Mirror of `results`; leaves are `Vec<Citation>` for the corresponding field
|
11
|
+
|
12
|
+
Example:
|
13
|
+
|
14
|
+
```json
|
15
|
+
{
|
16
|
+
"field_name": [
|
17
|
+
{
|
18
|
+
"citation_id": "abc1234",
|
19
|
+
"citation_type": "Segment",
|
20
|
+
"bboxes": [
|
21
|
+
{
|
22
|
+
"left": 10,
|
23
|
+
"top": 20,
|
24
|
+
"width": 100,
|
25
|
+
"height": 18
|
26
|
+
}
|
27
|
+
],
|
28
|
+
"content": "Example content",
|
29
|
+
"segment_id": "seg_001",
|
30
|
+
"segment_type": "Text",
|
31
|
+
"page_number": 1,
|
32
|
+
"page_height": 297,
|
33
|
+
"page_width": 210,
|
34
|
+
"ss_ranges": ["A1:C10"],
|
35
|
+
"ss_sheet_name": "Sheet1"
|
36
|
+
}
|
37
|
+
]
|
38
|
+
}
|
39
|
+
```
|
40
|
+
"""
|
41
|
+
|
42
|
+
metrics: object
|
43
|
+
"""
|
44
|
+
Mirror of `results`; leaves contain a `Metrics` object for the corresponding
|
45
|
+
field
|
46
|
+
|
47
|
+
Example:
|
48
|
+
|
49
|
+
```json
|
50
|
+
{ "field_name": { "confidence": "High" } }
|
51
|
+
```
|
52
|
+
"""
|
53
|
+
|
54
|
+
results: object
|
55
|
+
"""JSON data that matches the provided schema
|
56
|
+
|
57
|
+
Example:
|
58
|
+
|
59
|
+
```json
|
60
|
+
{ "field_name": "value" }
|
61
|
+
```
|
62
|
+
"""
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
+
from typing import Optional
|
5
6
|
from typing_extensions import Required, TypedDict
|
6
7
|
|
7
8
|
from .._types import FileTypes
|
@@ -13,5 +14,5 @@ class FileCreateParams(TypedDict, total=False):
|
|
13
14
|
file: Required[FileTypes]
|
14
15
|
"""The file to upload"""
|
15
16
|
|
16
|
-
file_metadata:
|
17
|
+
file_metadata: Optional[str]
|
17
18
|
"""Arbitrary JSON metadata associated with the file."""
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from .._models import BaseModel
|
6
|
+
|
7
|
+
__all__ = ["FileInfo"]
|
8
|
+
|
9
|
+
|
10
|
+
class FileInfo(BaseModel):
|
11
|
+
url: str
|
12
|
+
"""The presigned URL/Base64 encoded URL of the input file."""
|
13
|
+
|
14
|
+
mime_type: Optional[str] = None
|
15
|
+
"""The MIME type of the file."""
|
16
|
+
|
17
|
+
name: Optional[str] = None
|
18
|
+
"""The name of the file."""
|
19
|
+
|
20
|
+
page_count: Optional[int] = None
|
21
|
+
"""The number of pages in the file."""
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
from typing_extensions import Literal
|
5
|
+
|
6
|
+
from .._models import BaseModel
|
7
|
+
|
8
|
+
__all__ = ["GenerationConfig"]
|
9
|
+
|
10
|
+
|
11
|
+
class GenerationConfig(BaseModel):
|
12
|
+
crop_image: Optional[Literal["All", "Auto"]] = None
|
13
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
14
|
+
|
15
|
+
- `All` crops all images in the item
|
16
|
+
- `Auto` crops images only if required for post-processing
|
17
|
+
"""
|
18
|
+
|
19
|
+
description: Optional[bool] = None
|
20
|
+
"""Generate LLM descriptions for this segment"""
|
21
|
+
|
22
|
+
extended_context: Optional[bool] = None
|
23
|
+
"""Use the full page image as context for LLM generation"""
|
24
|
+
|
25
|
+
format: Optional[Literal["Html", "Markdown"]] = None
|
26
|
+
"""The format for the `content` field of a segment."""
|
27
|
+
|
28
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]] = None
|
29
|
+
"""The strategy for generating the `content` field of a segment."""
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Optional
|
6
|
+
from typing_extensions import Literal, TypedDict
|
7
|
+
|
8
|
+
__all__ = ["GenerationConfigParam"]
|
9
|
+
|
10
|
+
|
11
|
+
class GenerationConfigParam(TypedDict, total=False):
|
12
|
+
crop_image: Optional[Literal["All", "Auto"]]
|
13
|
+
"""Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
|
14
|
+
|
15
|
+
- `All` crops all images in the item
|
16
|
+
- `Auto` crops images only if required for post-processing
|
17
|
+
"""
|
18
|
+
|
19
|
+
description: Optional[bool]
|
20
|
+
"""Generate LLM descriptions for this segment"""
|
21
|
+
|
22
|
+
extended_context: Optional[bool]
|
23
|
+
"""Use the full page image as context for LLM generation"""
|
24
|
+
|
25
|
+
format: Optional[Literal["Html", "Markdown"]]
|
26
|
+
"""The format for the `content` field of a segment."""
|
27
|
+
|
28
|
+
strategy: Optional[Literal["LLM", "Auto", "Ignore"]]
|
29
|
+
"""The strategy for generating the `content` field of a segment."""
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Union, Optional
|
4
|
+
from typing_extensions import Literal, TypeAlias
|
5
|
+
|
6
|
+
from pydantic import Field as FieldInfo
|
7
|
+
|
8
|
+
from .._models import BaseModel
|
9
|
+
|
10
|
+
__all__ = ["LlmProcessing", "FallbackStrategy", "FallbackStrategyModel"]
|
11
|
+
|
12
|
+
|
13
|
+
class FallbackStrategyModel(BaseModel):
|
14
|
+
model: str = FieldInfo(alias="Model")
|
15
|
+
"""Use a specific model as fallback"""
|
16
|
+
|
17
|
+
|
18
|
+
FallbackStrategy: TypeAlias = Union[Literal["None", "Default"], FallbackStrategyModel]
|
19
|
+
|
20
|
+
|
21
|
+
class LlmProcessing(BaseModel):
|
22
|
+
fallback_strategy: Optional[FallbackStrategy] = None
|
23
|
+
"""The fallback strategy to use for the LLMs in the task."""
|
24
|
+
|
25
|
+
llm_model_id: Optional[str] = None
|
26
|
+
"""The ID of the model to use for the task.
|
27
|
+
|
28
|
+
If not provided, the default model will be used. Please check the documentation
|
29
|
+
for the model you want to use.
|
30
|
+
"""
|
31
|
+
|
32
|
+
max_completion_tokens: Optional[int] = None
|
33
|
+
"""The maximum number of tokens to generate."""
|
34
|
+
|
35
|
+
temperature: Optional[float] = None
|
36
|
+
"""The temperature to use for the LLM."""
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Union, Optional
|
6
|
+
from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict
|
7
|
+
|
8
|
+
from .._utils import PropertyInfo
|
9
|
+
|
10
|
+
__all__ = ["LlmProcessingParam", "FallbackStrategy", "FallbackStrategyModel"]
|
11
|
+
|
12
|
+
|
13
|
+
class FallbackStrategyModel(TypedDict, total=False):
|
14
|
+
model: Required[Annotated[str, PropertyInfo(alias="Model")]]
|
15
|
+
"""Use a specific model as fallback"""
|
16
|
+
|
17
|
+
|
18
|
+
FallbackStrategy: TypeAlias = Union[Literal["None", "Default"], FallbackStrategyModel]
|
19
|
+
|
20
|
+
|
21
|
+
class LlmProcessingParam(TypedDict, total=False):
|
22
|
+
fallback_strategy: FallbackStrategy
|
23
|
+
"""The fallback strategy to use for the LLMs in the task."""
|
24
|
+
|
25
|
+
llm_model_id: Optional[str]
|
26
|
+
"""The ID of the model to use for the task.
|
27
|
+
|
28
|
+
If not provided, the default model will be used. Please check the documentation
|
29
|
+
for the model you want to use.
|
30
|
+
"""
|
31
|
+
|
32
|
+
max_completion_tokens: Optional[int]
|
33
|
+
"""The maximum number of tokens to generate."""
|
34
|
+
|
35
|
+
temperature: float
|
36
|
+
"""The temperature to use for the LLM."""
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from .._models import BaseModel
|
6
|
+
from .bounding_box import BoundingBox
|
7
|
+
|
8
|
+
__all__ = ["OcrResult"]
|
9
|
+
|
10
|
+
|
11
|
+
class OcrResult(BaseModel):
|
12
|
+
bbox: BoundingBox
|
13
|
+
"""Bounding box for an item. It is used for segments and OCR results."""
|
14
|
+
|
15
|
+
text: str
|
16
|
+
"""The recognized text of the OCR result."""
|
17
|
+
|
18
|
+
cell_ref: Optional[str] = None
|
19
|
+
"""
|
20
|
+
Excel-style cell reference (e.g., "A1" or "A1:B2") when OCR originates from a
|
21
|
+
spreadsheet cell
|
22
|
+
"""
|
23
|
+
|
24
|
+
confidence: Optional[float] = None
|
25
|
+
"""The confidence score of the recognized text."""
|
26
|
+
|
27
|
+
ocr_id: Optional[str] = None
|
28
|
+
"""The unique identifier for the OCR result."""
|
chunkr_ai/types/page.py
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from .._models import BaseModel
|
6
|
+
|
7
|
+
__all__ = ["Page"]
|
8
|
+
|
9
|
+
|
10
|
+
class Page(BaseModel):
|
11
|
+
image: str
|
12
|
+
"""The presigned URL of the page/sheet image."""
|
13
|
+
|
14
|
+
page_height: float
|
15
|
+
"""The height of the page/sheet."""
|
16
|
+
|
17
|
+
page_number: int
|
18
|
+
"""The page number/sheet number of this page within the file."""
|
19
|
+
|
20
|
+
page_width: float
|
21
|
+
"""The width of the page/sheet."""
|
22
|
+
|
23
|
+
dpi: Optional[float] = None
|
24
|
+
"""DPI of the page/sheet. All cropped images are scaled to this DPI."""
|
25
|
+
|
26
|
+
ss_sheet_name: Optional[str] = None
|
27
|
+
"""The name of the sheet containing the page. Only used for Spreadsheets."""
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
from typing_extensions import Literal
|
5
|
+
|
6
|
+
from .._models import BaseModel
|
7
|
+
from .llm_processing import LlmProcessing
|
8
|
+
from .chunk_processing import ChunkProcessing
|
9
|
+
from .segment_processing import SegmentProcessing
|
10
|
+
|
11
|
+
__all__ = ["ParseConfiguration"]
|
12
|
+
|
13
|
+
|
14
|
+
class ParseConfiguration(BaseModel):
|
15
|
+
chunk_processing: Optional[ChunkProcessing] = None
|
16
|
+
"""Controls the settings for the chunking and post-processing of each chunk."""
|
17
|
+
|
18
|
+
error_handling: Optional[Literal["Fail", "Continue"]] = None
|
19
|
+
"""Controls how errors are handled during processing:
|
20
|
+
|
21
|
+
- `Fail`: Stops processing and fails the task when any error occurs
|
22
|
+
- `Continue`: Attempts to continue processing despite non-critical errors (e.g.
|
23
|
+
LLM refusals etc.)
|
24
|
+
"""
|
25
|
+
|
26
|
+
llm_processing: Optional[LlmProcessing] = None
|
27
|
+
"""Controls the LLM used for the task."""
|
28
|
+
|
29
|
+
ocr_strategy: Optional[Literal["All", "Auto"]] = None
|
30
|
+
"""Controls the Optical Character Recognition (OCR) strategy.
|
31
|
+
|
32
|
+
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
33
|
+
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
34
|
+
text. When text layer is present the bounding boxes from the text layer are
|
35
|
+
used.
|
36
|
+
"""
|
37
|
+
|
38
|
+
pipeline: Optional[Literal["Azure", "Chunkr"]] = None
|
39
|
+
|
40
|
+
segment_processing: Optional[SegmentProcessing] = None
|
41
|
+
"""Configuration for how each document segment is processed and formatted.
|
42
|
+
|
43
|
+
Each segment has sensible defaults, but you can override specific settings:
|
44
|
+
|
45
|
+
- `format`: Output as `Html` or `Markdown`
|
46
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
47
|
+
- `crop_image`: Whether to crop images to segment bounds
|
48
|
+
- `extended_context`: Use full page as context for LLM processing
|
49
|
+
- `description`: Generate descriptions for segments
|
50
|
+
|
51
|
+
**Defaults per segment type:** Check the documentation for more details.
|
52
|
+
|
53
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
54
|
+
"""
|
55
|
+
|
56
|
+
segmentation_strategy: Optional[Literal["LayoutAnalysis", "Page"]] = None
|
57
|
+
"""Controls the segmentation strategy:
|
58
|
+
|
59
|
+
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
60
|
+
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
61
|
+
segmentation and better chunking.
|
62
|
+
- `Page`: Treats each page as a single segment. Faster processing, but without
|
63
|
+
layout element detection and only simple chunking.
|
64
|
+
"""
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Optional
|
6
|
+
from typing_extensions import Literal, TypedDict
|
7
|
+
|
8
|
+
from .llm_processing_param import LlmProcessingParam
|
9
|
+
from .chunk_processing_param import ChunkProcessingParam
|
10
|
+
from .segment_processing_param import SegmentProcessingParam
|
11
|
+
|
12
|
+
__all__ = ["ParseConfigurationParam"]
|
13
|
+
|
14
|
+
|
15
|
+
class ParseConfigurationParam(TypedDict, total=False):
|
16
|
+
chunk_processing: ChunkProcessingParam
|
17
|
+
"""Controls the settings for the chunking and post-processing of each chunk."""
|
18
|
+
|
19
|
+
error_handling: Literal["Fail", "Continue"]
|
20
|
+
"""Controls how errors are handled during processing:
|
21
|
+
|
22
|
+
- `Fail`: Stops processing and fails the task when any error occurs
|
23
|
+
- `Continue`: Attempts to continue processing despite non-critical errors (e.g.
|
24
|
+
LLM refusals etc.)
|
25
|
+
"""
|
26
|
+
|
27
|
+
llm_processing: LlmProcessingParam
|
28
|
+
"""Controls the LLM used for the task."""
|
29
|
+
|
30
|
+
ocr_strategy: Literal["All", "Auto"]
|
31
|
+
"""Controls the Optical Character Recognition (OCR) strategy.
|
32
|
+
|
33
|
+
- `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds per page)
|
34
|
+
- `Auto`: Selectively applies OCR only to pages with missing or low-quality
|
35
|
+
text. When text layer is present the bounding boxes from the text layer are
|
36
|
+
used.
|
37
|
+
"""
|
38
|
+
|
39
|
+
pipeline: Literal["Azure", "Chunkr"]
|
40
|
+
|
41
|
+
segment_processing: Optional[SegmentProcessingParam]
|
42
|
+
"""Configuration for how each document segment is processed and formatted.
|
43
|
+
|
44
|
+
Each segment has sensible defaults, but you can override specific settings:
|
45
|
+
|
46
|
+
- `format`: Output as `Html` or `Markdown`
|
47
|
+
- `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore` (skip)
|
48
|
+
- `crop_image`: Whether to crop images to segment bounds
|
49
|
+
- `extended_context`: Use full page as context for LLM processing
|
50
|
+
- `description`: Generate descriptions for segments
|
51
|
+
|
52
|
+
**Defaults per segment type:** Check the documentation for more details.
|
53
|
+
|
54
|
+
Only specify the fields you want to change - everything else uses the defaults.
|
55
|
+
"""
|
56
|
+
|
57
|
+
segmentation_strategy: Literal["LayoutAnalysis", "Page"]
|
58
|
+
"""Controls the segmentation strategy:
|
59
|
+
|
60
|
+
- `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
|
61
|
+
`Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
|
62
|
+
segmentation and better chunking.
|
63
|
+
- `Page`: Treats each page as a single segment. Faster processing, but without
|
64
|
+
layout element detection and only simple chunking.
|
65
|
+
"""
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import List, Optional
|
4
|
+
|
5
|
+
from .page import Page
|
6
|
+
from .chunk import Chunk
|
7
|
+
from .._models import BaseModel
|
8
|
+
|
9
|
+
__all__ = ["ParseOutputResponse"]
|
10
|
+
|
11
|
+
|
12
|
+
class ParseOutputResponse(BaseModel):
|
13
|
+
chunks: List[Chunk]
|
14
|
+
"""Collection of document chunks, where each chunk contains one or more segments"""
|
15
|
+
|
16
|
+
file_name: Optional[str] = None
|
17
|
+
"""The name of the file. Deprecated: use `file_info.name` instead."""
|
18
|
+
|
19
|
+
mime_type: Optional[str] = None
|
20
|
+
"""The MIME type of the file. Deprecated: use `file_info.mime_type` instead."""
|
21
|
+
|
22
|
+
page_count: Optional[int] = None
|
23
|
+
"""The number of pages in the file. Deprecated: use `file_info.page_count` instead."""
|
24
|
+
|
25
|
+
pages: Optional[List[Page]] = None
|
26
|
+
"""The pages of the file. Includes the image and metadata for each page."""
|
27
|
+
|
28
|
+
pdf_url: Optional[str] = None
|
29
|
+
"""The presigned URL of the PDF file."""
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import List, Optional
|
4
|
+
from typing_extensions import Literal
|
5
|
+
|
6
|
+
from .cell import Cell
|
7
|
+
from .._models import BaseModel
|
8
|
+
from .ocr_result import OcrResult
|
9
|
+
from .bounding_box import BoundingBox
|
10
|
+
|
11
|
+
__all__ = ["Segment"]
|
12
|
+
|
13
|
+
|
14
|
+
class Segment(BaseModel):
|
15
|
+
bbox: BoundingBox
|
16
|
+
"""Bounding box for an item. It is used for segments and OCR results."""
|
17
|
+
|
18
|
+
page_height: float
|
19
|
+
"""Height of the page/sheet containing the segment."""
|
20
|
+
|
21
|
+
page_number: int
|
22
|
+
"""Page number/Sheet number of the segment."""
|
23
|
+
|
24
|
+
page_width: float
|
25
|
+
"""Width of the page/sheet containing the segment."""
|
26
|
+
|
27
|
+
segment_id: str
|
28
|
+
"""Unique identifier for the segment."""
|
29
|
+
|
30
|
+
segment_type: Literal[
|
31
|
+
"Caption",
|
32
|
+
"Footnote",
|
33
|
+
"Formula",
|
34
|
+
"ListItem",
|
35
|
+
"Page",
|
36
|
+
"PageFooter",
|
37
|
+
"PageHeader",
|
38
|
+
"Picture",
|
39
|
+
"SectionHeader",
|
40
|
+
"Table",
|
41
|
+
"Text",
|
42
|
+
"Title",
|
43
|
+
]
|
44
|
+
"""
|
45
|
+
All the possible types for a segment. Note: Different configurations will
|
46
|
+
produce different types. Please refer to the documentation for more information.
|
47
|
+
"""
|
48
|
+
|
49
|
+
confidence: Optional[float] = None
|
50
|
+
"""Confidence score of the layout analysis model"""
|
51
|
+
|
52
|
+
content: Optional[str] = None
|
53
|
+
"""
|
54
|
+
Content of the segment, will be either HTML or Markdown, depending on format
|
55
|
+
chosen.
|
56
|
+
"""
|
57
|
+
|
58
|
+
description: Optional[str] = None
|
59
|
+
"""Description of the segment, generated by the LLM."""
|
60
|
+
|
61
|
+
embed: Optional[str] = None
|
62
|
+
"""Embeddable content of the segment."""
|
63
|
+
|
64
|
+
image: Optional[str] = None
|
65
|
+
"""Presigned URL to the image of the segment."""
|
66
|
+
|
67
|
+
ocr: Optional[List[OcrResult]] = None
|
68
|
+
"""OCR results for the segment."""
|
69
|
+
|
70
|
+
segment_length: Optional[int] = None
|
71
|
+
"""Length of the segment in tokens."""
|
72
|
+
|
73
|
+
ss_cells: Optional[List[Cell]] = None
|
74
|
+
"""Cells of the segment. Only used for Spreadsheets."""
|
75
|
+
|
76
|
+
ss_header_bbox: Optional[BoundingBox] = None
|
77
|
+
"""Bounding box of the header of the segment, if found.
|
78
|
+
|
79
|
+
Only used for Spreadsheets.
|
80
|
+
"""
|
81
|
+
|
82
|
+
ss_header_ocr: Optional[List[OcrResult]] = None
|
83
|
+
"""OCR results of the header of the segment, if found. Only used for Spreadsheets."""
|
84
|
+
|
85
|
+
ss_header_range: Optional[str] = None
|
86
|
+
"""
|
87
|
+
Header range of the segment, if found. The header can have overlap with the
|
88
|
+
`segment.range` if the table contains the header; if the header is located in a
|
89
|
+
different sheet, the header range will have no overlap with the `segment.range`.
|
90
|
+
Only used for Spreadsheets.
|
91
|
+
"""
|
92
|
+
|
93
|
+
ss_header_text: Optional[str] = None
|
94
|
+
"""Text content of the header of the segment, if found.
|
95
|
+
|
96
|
+
Only used for Spreadsheets.
|
97
|
+
"""
|
98
|
+
|
99
|
+
ss_range: Optional[str] = None
|
100
|
+
"""Range of the segment in Excel notation (e.g., A1:B5).
|
101
|
+
|
102
|
+
Only used for Spreadsheets.
|
103
|
+
"""
|
104
|
+
|
105
|
+
ss_sheet_name: Optional[str] = None
|
106
|
+
"""Name of the sheet containing the segment. Only used for Spreadsheets."""
|
107
|
+
|
108
|
+
text: Optional[str] = None
|
109
|
+
"""Text content of the segment. Calculated by the OCR results."""
|