chunkr-ai 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- chunkr_ai/__init__.py +3 -0
- chunkr_ai/_client.py +18 -10
- chunkr_ai/_files.py +1 -1
- chunkr_ai/_version.py +1 -1
- chunkr_ai/lib/tasks_poll.py +122 -0
- chunkr_ai/pagination.py +61 -1
- chunkr_ai/resources/__init__.py +27 -13
- chunkr_ai/resources/files.py +712 -0
- chunkr_ai/resources/{task → tasks}/__init__.py +14 -14
- chunkr_ai/resources/{task → tasks}/parse.py +48 -52
- chunkr_ai/resources/{task/task.py → tasks/tasks.py} +58 -126
- chunkr_ai/types/__init__.py +7 -0
- chunkr_ai/types/delete.py +10 -0
- chunkr_ai/types/file.py +30 -0
- chunkr_ai/types/file_create_params.py +17 -0
- chunkr_ai/types/file_list_params.py +28 -0
- chunkr_ai/types/file_url.py +15 -0
- chunkr_ai/types/file_url_params.py +15 -0
- chunkr_ai/types/files_list_response.py +20 -0
- chunkr_ai/types/{task/task.py → task.py} +55 -23
- chunkr_ai/types/{task → tasks}/__init__.py +0 -1
- chunkr_ai/types/{task → tasks}/parse_create_params.py +61 -23
- chunkr_ai/types/{task → tasks}/parse_update_params.py +54 -22
- {chunkr_ai-0.1.0a2.dist-info → chunkr_ai-0.1.0a4.dist-info}/METADATA +38 -20
- chunkr_ai-0.1.0a4.dist-info/RECORD +53 -0
- chunkr_ai-0.1.0a2.dist-info/RECORD +0 -44
- {chunkr_ai-0.1.0a2.dist-info → chunkr_ai-0.1.0a4.dist-info}/WHEEL +0 -0
- {chunkr_ai-0.1.0a2.dist-info → chunkr_ai-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Union
|
6
|
+
from datetime import datetime
|
7
|
+
from typing_extensions import Literal, Annotated, TypedDict
|
8
|
+
|
9
|
+
from .._utils import PropertyInfo
|
10
|
+
|
11
|
+
__all__ = ["FileListParams"]
|
12
|
+
|
13
|
+
|
14
|
+
class FileListParams(TypedDict, total=False):
|
15
|
+
cursor: Annotated[Union[str, datetime], PropertyInfo(format="iso8601")]
|
16
|
+
"""Cursor for pagination (created_at)"""
|
17
|
+
|
18
|
+
end: Annotated[Union[str, datetime], PropertyInfo(format="iso8601")]
|
19
|
+
"""End date"""
|
20
|
+
|
21
|
+
limit: int
|
22
|
+
"""Number of files per page"""
|
23
|
+
|
24
|
+
sort: Literal["asc", "desc"]
|
25
|
+
"""Sort order: 'asc' for ascending, 'desc' for descending (default)"""
|
26
|
+
|
27
|
+
start: Annotated[Union[str, datetime], PropertyInfo(format="iso8601")]
|
28
|
+
"""Start date"""
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from .._models import BaseModel
|
6
|
+
|
7
|
+
__all__ = ["FileURL"]
|
8
|
+
|
9
|
+
|
10
|
+
class FileURL(BaseModel):
|
11
|
+
url: str
|
12
|
+
"""The presigned URL or base64 data (if base64_urls=true)"""
|
13
|
+
|
14
|
+
expires_in: Optional[int] = None
|
15
|
+
"""Expiry in seconds (omitted when base64)"""
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing_extensions import TypedDict
|
6
|
+
|
7
|
+
__all__ = ["FileURLParams"]
|
8
|
+
|
9
|
+
|
10
|
+
class FileURLParams(TypedDict, total=False):
|
11
|
+
base64_urls: bool
|
12
|
+
"""If true, returns base64 data instead of a presigned URL"""
|
13
|
+
|
14
|
+
expires_in: int
|
15
|
+
"""Expiry in seconds for the presigned URL (default 3600)"""
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
2
|
+
|
3
|
+
from typing import List, Optional
|
4
|
+
from datetime import datetime
|
5
|
+
|
6
|
+
from .file import File
|
7
|
+
from .._models import BaseModel
|
8
|
+
|
9
|
+
__all__ = ["FilesListResponse"]
|
10
|
+
|
11
|
+
|
12
|
+
class FilesListResponse(BaseModel):
|
13
|
+
files: List[File]
|
14
|
+
"""List of files"""
|
15
|
+
|
16
|
+
has_more: bool
|
17
|
+
"""Whether there are more files to fetch"""
|
18
|
+
|
19
|
+
next_cursor: Optional[datetime] = None
|
20
|
+
"""Cursor for pagination (timestamp) e.g. 2025-01-01T00:00:00Z"""
|
@@ -6,7 +6,7 @@ from typing_extensions import Literal, TypeAlias
|
|
6
6
|
|
7
7
|
from pydantic import Field as FieldInfo
|
8
8
|
|
9
|
-
from ..._models import BaseModel
|
9
|
+
from .._models import BaseModel
|
10
10
|
|
11
11
|
__all__ = [
|
12
12
|
"Task",
|
@@ -494,7 +494,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
494
494
|
- `Auto`: Process content automatically
|
495
495
|
- `LLM`: Use large language models for processing
|
496
496
|
- `Ignore`: Exclude segments from final output
|
497
|
-
- `description` enables LLM-generated descriptions for segments
|
497
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
498
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
499
|
+
configuration.
|
500
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
498
501
|
|
499
502
|
**Deprecated fields (for backwards compatibility):**
|
500
503
|
|
@@ -516,7 +519,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
516
519
|
- `Auto`: Process content automatically
|
517
520
|
- `LLM`: Use large language models for processing
|
518
521
|
- `Ignore`: Exclude segments from final output
|
519
|
-
- `description` enables LLM-generated descriptions for segments
|
522
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
523
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
524
|
+
configuration.
|
525
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
520
526
|
|
521
527
|
**Deprecated fields (for backwards compatibility):**
|
522
528
|
|
@@ -538,7 +544,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
538
544
|
- `Auto`: Process content automatically
|
539
545
|
- `LLM`: Use large language models for processing
|
540
546
|
- `Ignore`: Exclude segments from final output
|
541
|
-
- `description` enables LLM-generated descriptions for segments
|
547
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
548
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
549
|
+
configuration.
|
550
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
542
551
|
|
543
552
|
**Deprecated fields (for backwards compatibility):**
|
544
553
|
|
@@ -560,7 +569,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
560
569
|
- `Auto`: Process content automatically
|
561
570
|
- `LLM`: Use large language models for processing
|
562
571
|
- `Ignore`: Exclude segments from final output
|
563
|
-
- `description` enables LLM-generated descriptions for segments
|
572
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
573
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
574
|
+
configuration.
|
575
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
564
576
|
|
565
577
|
**Deprecated fields (for backwards compatibility):**
|
566
578
|
|
@@ -582,7 +594,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
582
594
|
- `Auto`: Process content automatically
|
583
595
|
- `LLM`: Use large language models for processing
|
584
596
|
- `Ignore`: Exclude segments from final output
|
585
|
-
- `description` enables LLM-generated descriptions for segments
|
597
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
598
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
599
|
+
configuration.
|
600
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
586
601
|
|
587
602
|
**Deprecated fields (for backwards compatibility):**
|
588
603
|
|
@@ -604,7 +619,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
604
619
|
- `Auto`: Process content automatically
|
605
620
|
- `LLM`: Use large language models for processing
|
606
621
|
- `Ignore`: Exclude segments from final output
|
607
|
-
- `description` enables LLM-generated descriptions for segments
|
622
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
623
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
624
|
+
configuration.
|
625
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
608
626
|
|
609
627
|
**Deprecated fields (for backwards compatibility):**
|
610
628
|
|
@@ -626,7 +644,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
626
644
|
- `Auto`: Process content automatically
|
627
645
|
- `LLM`: Use large language models for processing
|
628
646
|
- `Ignore`: Exclude segments from final output
|
629
|
-
- `description` enables LLM-generated descriptions for segments
|
647
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
648
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
649
|
+
configuration.
|
650
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
630
651
|
|
631
652
|
**Deprecated fields (for backwards compatibility):**
|
632
653
|
|
@@ -648,7 +669,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
648
669
|
- `Auto`: Process content automatically
|
649
670
|
- `LLM`: Use large language models for processing
|
650
671
|
- `Ignore`: Exclude segments from final output
|
651
|
-
- `description` enables LLM-generated descriptions for segments
|
672
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
673
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
674
|
+
configuration.
|
675
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
652
676
|
|
653
677
|
**Deprecated fields (for backwards compatibility):**
|
654
678
|
|
@@ -672,7 +696,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
672
696
|
- `Auto`: Process content automatically
|
673
697
|
- `LLM`: Use large language models for processing
|
674
698
|
- `Ignore`: Exclude segments from final output
|
675
|
-
- `description` enables LLM-generated descriptions for segments
|
699
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
700
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
701
|
+
configuration.
|
702
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
676
703
|
|
677
704
|
**Deprecated fields (for backwards compatibility):**
|
678
705
|
|
@@ -694,7 +721,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
694
721
|
- `Auto`: Process content automatically
|
695
722
|
- `LLM`: Use large language models for processing
|
696
723
|
- `Ignore`: Exclude segments from final output
|
697
|
-
- `description` enables LLM-generated descriptions for segments
|
724
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
725
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
726
|
+
configuration.
|
727
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
698
728
|
|
699
729
|
**Deprecated fields (for backwards compatibility):**
|
700
730
|
|
@@ -716,7 +746,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
716
746
|
- `Auto`: Process content automatically
|
717
747
|
- `LLM`: Use large language models for processing
|
718
748
|
- `Ignore`: Exclude segments from final output
|
719
|
-
- `description` enables LLM-generated descriptions for segments
|
749
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
750
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
751
|
+
configuration.
|
752
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
720
753
|
|
721
754
|
**Deprecated fields (for backwards compatibility):**
|
722
755
|
|
@@ -738,7 +771,10 @@ class ConfigurationSegmentProcessing(BaseModel):
|
|
738
771
|
- `Auto`: Process content automatically
|
739
772
|
- `LLM`: Use large language models for processing
|
740
773
|
- `Ignore`: Exclude segments from final output
|
741
|
-
- `description` enables LLM-generated descriptions for segments
|
774
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
775
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
776
|
+
configuration.
|
777
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
742
778
|
|
743
779
|
**Deprecated fields (for backwards compatibility):**
|
744
780
|
|
@@ -796,18 +832,14 @@ class Configuration(BaseModel):
|
|
796
832
|
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
797
833
|
`Markdown`).
|
798
834
|
|
799
|
-
Optional flags such as image **cropping**, **extended context**, and
|
800
|
-
descriptions** further refine behaviour.
|
801
|
-
|
802
|
-
---
|
835
|
+
Optional flags such as image **cropping**, **extended context**, and
|
836
|
+
**descriptions** further refine behaviour.
|
803
837
|
|
804
838
|
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
805
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM**
|
806
|
-
description on) • `Picture` → **LLM** (Markdown, description off,
|
807
|
-
_All_) • `Formula`, `Page` → **LLM** (Markdown) •
|
808
|
-
**Ignore** (removed from output)
|
809
|
-
|
810
|
-
---
|
839
|
+
`Caption`, `Footnote` → **Auto** (Markdown, description off) • `Table` → **LLM**
|
840
|
+
(HTML, description on) • `Picture` → **LLM** (Markdown, description off,
|
841
|
+
cropping _All_) • `Formula`, `Page` → **LLM** (Markdown, description off) •
|
842
|
+
`PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
811
843
|
|
812
844
|
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
813
845
|
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
@@ -34,7 +34,13 @@ __all__ = [
|
|
34
34
|
|
35
35
|
class ParseCreateParams(TypedDict, total=False):
|
36
36
|
file: Required[str]
|
37
|
-
"""The file to be uploaded.
|
37
|
+
"""The file to be uploaded. Supported inputs:
|
38
|
+
|
39
|
+
- `ch://files/{file_id}`: References a previously uploaded file you own
|
40
|
+
(authorization enforced)
|
41
|
+
- `http(s)://...`: Remote URL to fetch
|
42
|
+
- `data:*;base64,...` or raw base64 string
|
43
|
+
"""
|
38
44
|
|
39
45
|
chunk_processing: Optional[ChunkProcessing]
|
40
46
|
"""Controls the setting for the chunking and post-processing of each chunk."""
|
@@ -82,18 +88,14 @@ class ParseCreateParams(TypedDict, total=False):
|
|
82
88
|
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
83
89
|
`Markdown`).
|
84
90
|
|
85
|
-
Optional flags such as image **cropping**, **extended context**, and
|
86
|
-
descriptions** further refine behaviour.
|
87
|
-
|
88
|
-
---
|
91
|
+
Optional flags such as image **cropping**, **extended context**, and
|
92
|
+
**descriptions** further refine behaviour.
|
89
93
|
|
90
94
|
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
91
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM**
|
92
|
-
description on) • `Picture` → **LLM** (Markdown, description off,
|
93
|
-
_All_) • `Formula`, `Page` → **LLM** (Markdown) •
|
94
|
-
**Ignore** (removed from output)
|
95
|
-
|
96
|
-
---
|
95
|
+
`Caption`, `Footnote` → **Auto** (Markdown, description off) • `Table` → **LLM**
|
96
|
+
(HTML, description on) • `Picture` → **LLM** (Markdown, description off,
|
97
|
+
cropping _All_) • `Formula`, `Page` → **LLM** (Markdown, description off) •
|
98
|
+
`PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
97
99
|
|
98
100
|
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
99
101
|
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
@@ -553,7 +555,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
553
555
|
- `Auto`: Process content automatically
|
554
556
|
- `LLM`: Use large language models for processing
|
555
557
|
- `Ignore`: Exclude segments from final output
|
556
|
-
- `description` enables LLM-generated descriptions for segments
|
558
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
559
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
560
|
+
configuration.
|
561
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
557
562
|
|
558
563
|
**Deprecated fields (for backwards compatibility):**
|
559
564
|
|
@@ -575,7 +580,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
575
580
|
- `Auto`: Process content automatically
|
576
581
|
- `LLM`: Use large language models for processing
|
577
582
|
- `Ignore`: Exclude segments from final output
|
578
|
-
- `description` enables LLM-generated descriptions for segments
|
583
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
584
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
585
|
+
configuration.
|
586
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
579
587
|
|
580
588
|
**Deprecated fields (for backwards compatibility):**
|
581
589
|
|
@@ -597,7 +605,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
597
605
|
- `Auto`: Process content automatically
|
598
606
|
- `LLM`: Use large language models for processing
|
599
607
|
- `Ignore`: Exclude segments from final output
|
600
|
-
- `description` enables LLM-generated descriptions for segments
|
608
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
609
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
610
|
+
configuration.
|
611
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
601
612
|
|
602
613
|
**Deprecated fields (for backwards compatibility):**
|
603
614
|
|
@@ -619,7 +630,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
619
630
|
- `Auto`: Process content automatically
|
620
631
|
- `LLM`: Use large language models for processing
|
621
632
|
- `Ignore`: Exclude segments from final output
|
622
|
-
- `description` enables LLM-generated descriptions for segments
|
633
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
634
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
635
|
+
configuration.
|
636
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
623
637
|
|
624
638
|
**Deprecated fields (for backwards compatibility):**
|
625
639
|
|
@@ -641,7 +655,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
641
655
|
- `Auto`: Process content automatically
|
642
656
|
- `LLM`: Use large language models for processing
|
643
657
|
- `Ignore`: Exclude segments from final output
|
644
|
-
- `description` enables LLM-generated descriptions for segments
|
658
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
659
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
660
|
+
configuration.
|
661
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
645
662
|
|
646
663
|
**Deprecated fields (for backwards compatibility):**
|
647
664
|
|
@@ -663,7 +680,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
663
680
|
- `Auto`: Process content automatically
|
664
681
|
- `LLM`: Use large language models for processing
|
665
682
|
- `Ignore`: Exclude segments from final output
|
666
|
-
- `description` enables LLM-generated descriptions for segments
|
683
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
684
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
685
|
+
configuration.
|
686
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
667
687
|
|
668
688
|
**Deprecated fields (for backwards compatibility):**
|
669
689
|
|
@@ -685,7 +705,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
685
705
|
- `Auto`: Process content automatically
|
686
706
|
- `LLM`: Use large language models for processing
|
687
707
|
- `Ignore`: Exclude segments from final output
|
688
|
-
- `description` enables LLM-generated descriptions for segments
|
708
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
709
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
710
|
+
configuration.
|
711
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
689
712
|
|
690
713
|
**Deprecated fields (for backwards compatibility):**
|
691
714
|
|
@@ -707,7 +730,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
707
730
|
- `Auto`: Process content automatically
|
708
731
|
- `LLM`: Use large language models for processing
|
709
732
|
- `Ignore`: Exclude segments from final output
|
710
|
-
- `description` enables LLM-generated descriptions for segments
|
733
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
734
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
735
|
+
configuration.
|
736
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
711
737
|
|
712
738
|
**Deprecated fields (for backwards compatibility):**
|
713
739
|
|
@@ -729,7 +755,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
729
755
|
- `Auto`: Process content automatically
|
730
756
|
- `LLM`: Use large language models for processing
|
731
757
|
- `Ignore`: Exclude segments from final output
|
732
|
-
- `description` enables LLM-generated descriptions for segments
|
758
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
759
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
760
|
+
configuration.
|
761
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
733
762
|
|
734
763
|
**Deprecated fields (for backwards compatibility):**
|
735
764
|
|
@@ -751,7 +780,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
751
780
|
- `Auto`: Process content automatically
|
752
781
|
- `LLM`: Use large language models for processing
|
753
782
|
- `Ignore`: Exclude segments from final output
|
754
|
-
- `description` enables LLM-generated descriptions for segments
|
783
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
784
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
785
|
+
configuration.
|
786
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
755
787
|
|
756
788
|
**Deprecated fields (for backwards compatibility):**
|
757
789
|
|
@@ -773,7 +805,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
773
805
|
- `Auto`: Process content automatically
|
774
806
|
- `LLM`: Use large language models for processing
|
775
807
|
- `Ignore`: Exclude segments from final output
|
776
|
-
- `description` enables LLM-generated descriptions for segments
|
808
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
809
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
810
|
+
configuration.
|
811
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
777
812
|
|
778
813
|
**Deprecated fields (for backwards compatibility):**
|
779
814
|
|
@@ -795,7 +830,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
795
830
|
- `Auto`: Process content automatically
|
796
831
|
- `LLM`: Use large language models for processing
|
797
832
|
- `Ignore`: Exclude segments from final output
|
798
|
-
- `description` enables LLM-generated descriptions for segments
|
833
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
834
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
835
|
+
configuration.
|
836
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
799
837
|
|
800
838
|
**Deprecated fields (for backwards compatibility):**
|
801
839
|
|
@@ -82,18 +82,14 @@ class ParseUpdateParams(TypedDict, total=False):
|
|
82
82
|
content is produced (rule-based vs. LLM). • The output format (`Html` or
|
83
83
|
`Markdown`).
|
84
84
|
|
85
|
-
Optional flags such as image **cropping**, **extended context**, and
|
86
|
-
descriptions** further refine behaviour.
|
87
|
-
|
88
|
-
---
|
85
|
+
Optional flags such as image **cropping**, **extended context**, and
|
86
|
+
**descriptions** further refine behaviour.
|
89
87
|
|
90
88
|
**Default strategy per segment** • `Title`, `SectionHeader`, `Text`, `ListItem`,
|
91
|
-
`Caption`, `Footnote` → **Auto** (Markdown) • `Table` → **LLM**
|
92
|
-
description on) • `Picture` → **LLM** (Markdown, description off,
|
93
|
-
_All_) • `Formula`, `Page` → **LLM** (Markdown) •
|
94
|
-
**Ignore** (removed from output)
|
95
|
-
|
96
|
-
---
|
89
|
+
`Caption`, `Footnote` → **Auto** (Markdown, description off) • `Table` → **LLM**
|
90
|
+
(HTML, description on) • `Picture` → **LLM** (Markdown, description off,
|
91
|
+
cropping _All_) • `Formula`, `Page` → **LLM** (Markdown, description off) •
|
92
|
+
`PageHeader`, `PageFooter` → **Ignore** (removed from output)
|
97
93
|
|
98
94
|
**Strategy reference** • **Auto** – rule-based content generation. • **LLM** –
|
99
95
|
generate content with an LLM. • **Ignore** – exclude the segment entirely.
|
@@ -553,7 +549,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
553
549
|
- `Auto`: Process content automatically
|
554
550
|
- `LLM`: Use large language models for processing
|
555
551
|
- `Ignore`: Exclude segments from final output
|
556
|
-
- `description` enables LLM-generated descriptions for segments
|
552
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
553
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
554
|
+
configuration.
|
555
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
557
556
|
|
558
557
|
**Deprecated fields (for backwards compatibility):**
|
559
558
|
|
@@ -575,7 +574,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
575
574
|
- `Auto`: Process content automatically
|
576
575
|
- `LLM`: Use large language models for processing
|
577
576
|
- `Ignore`: Exclude segments from final output
|
578
|
-
- `description` enables LLM-generated descriptions for segments
|
577
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
578
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
579
|
+
configuration.
|
580
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
579
581
|
|
580
582
|
**Deprecated fields (for backwards compatibility):**
|
581
583
|
|
@@ -597,7 +599,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
597
599
|
- `Auto`: Process content automatically
|
598
600
|
- `LLM`: Use large language models for processing
|
599
601
|
- `Ignore`: Exclude segments from final output
|
600
|
-
- `description` enables LLM-generated descriptions for segments
|
602
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
603
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
604
|
+
configuration.
|
605
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
601
606
|
|
602
607
|
**Deprecated fields (for backwards compatibility):**
|
603
608
|
|
@@ -619,7 +624,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
619
624
|
- `Auto`: Process content automatically
|
620
625
|
- `LLM`: Use large language models for processing
|
621
626
|
- `Ignore`: Exclude segments from final output
|
622
|
-
- `description` enables LLM-generated descriptions for segments
|
627
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
628
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
629
|
+
configuration.
|
630
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
623
631
|
|
624
632
|
**Deprecated fields (for backwards compatibility):**
|
625
633
|
|
@@ -641,7 +649,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
641
649
|
- `Auto`: Process content automatically
|
642
650
|
- `LLM`: Use large language models for processing
|
643
651
|
- `Ignore`: Exclude segments from final output
|
644
|
-
- `description` enables LLM-generated descriptions for segments
|
652
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
653
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
654
|
+
configuration.
|
655
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
645
656
|
|
646
657
|
**Deprecated fields (for backwards compatibility):**
|
647
658
|
|
@@ -663,7 +674,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
663
674
|
- `Auto`: Process content automatically
|
664
675
|
- `LLM`: Use large language models for processing
|
665
676
|
- `Ignore`: Exclude segments from final output
|
666
|
-
- `description` enables LLM-generated descriptions for segments
|
677
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
678
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
679
|
+
configuration.
|
680
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
667
681
|
|
668
682
|
**Deprecated fields (for backwards compatibility):**
|
669
683
|
|
@@ -685,7 +699,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
685
699
|
- `Auto`: Process content automatically
|
686
700
|
- `LLM`: Use large language models for processing
|
687
701
|
- `Ignore`: Exclude segments from final output
|
688
|
-
- `description` enables LLM-generated descriptions for segments
|
702
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
703
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
704
|
+
configuration.
|
705
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
689
706
|
|
690
707
|
**Deprecated fields (for backwards compatibility):**
|
691
708
|
|
@@ -707,7 +724,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
707
724
|
- `Auto`: Process content automatically
|
708
725
|
- `LLM`: Use large language models for processing
|
709
726
|
- `Ignore`: Exclude segments from final output
|
710
|
-
- `description` enables LLM-generated descriptions for segments
|
727
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
728
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
729
|
+
configuration.
|
730
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
711
731
|
|
712
732
|
**Deprecated fields (for backwards compatibility):**
|
713
733
|
|
@@ -729,7 +749,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
729
749
|
- `Auto`: Process content automatically
|
730
750
|
- `LLM`: Use large language models for processing
|
731
751
|
- `Ignore`: Exclude segments from final output
|
732
|
-
- `description` enables LLM-generated descriptions for segments
|
752
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
753
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
754
|
+
configuration.
|
755
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
733
756
|
|
734
757
|
**Deprecated fields (for backwards compatibility):**
|
735
758
|
|
@@ -751,7 +774,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
751
774
|
- `Auto`: Process content automatically
|
752
775
|
- `LLM`: Use large language models for processing
|
753
776
|
- `Ignore`: Exclude segments from final output
|
754
|
-
- `description` enables LLM-generated descriptions for segments
|
777
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
778
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
779
|
+
configuration.
|
780
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
755
781
|
|
756
782
|
**Deprecated fields (for backwards compatibility):**
|
757
783
|
|
@@ -773,7 +799,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
773
799
|
- `Auto`: Process content automatically
|
774
800
|
- `LLM`: Use large language models for processing
|
775
801
|
- `Ignore`: Exclude segments from final output
|
776
|
-
- `description` enables LLM-generated descriptions for segments
|
802
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
803
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
804
|
+
configuration.
|
805
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
777
806
|
|
778
807
|
**Deprecated fields (for backwards compatibility):**
|
779
808
|
|
@@ -795,7 +824,10 @@ class SegmentProcessing(TypedDict, total=False):
|
|
795
824
|
- `Auto`: Process content automatically
|
796
825
|
- `LLM`: Use large language models for processing
|
797
826
|
- `Ignore`: Exclude segments from final output
|
798
|
-
- `description` enables LLM-generated descriptions for segments
|
827
|
+
- `description` enables LLM-generated descriptions for segments. **Note:** This
|
828
|
+
uses chunkr's own VLM models and is not configurable via LLM processing
|
829
|
+
configuration.
|
830
|
+
- `extended_context` uses the full page image as context for LLM generation.
|
799
831
|
|
800
832
|
**Deprecated fields (for backwards compatibility):**
|
801
833
|
|