chunkr-ai 0.0.43__py3-none-any.whl → 0.0.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from pydantic import BaseModel, Field, ConfigDict
2
2
  from enum import Enum
3
- from typing import Any, List, Optional
3
+ from typing import Any, List, Optional, Union
4
+ from pydantic import field_validator
4
5
 
5
6
  class GenerationStrategy(str, Enum):
6
7
  LLM = "LLM"
@@ -10,11 +11,18 @@ class CroppingStrategy(str, Enum):
10
11
  ALL = "All"
11
12
  AUTO = "Auto"
12
13
 
14
+ class EmbedSource(str, Enum):
15
+ HTML = "HTML"
16
+ MARKDOWN = "Markdown"
17
+ LLM = "LLM"
18
+ CONTENT = "Content"
19
+
13
20
  class GenerationConfig(BaseModel):
14
21
  html: Optional[GenerationStrategy] = None
15
22
  llm: Optional[str] = None
16
23
  markdown: Optional[GenerationStrategy] = None
17
24
  crop_image: Optional[CroppingStrategy] = None
25
+ embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
18
26
 
19
27
  class SegmentProcessing(BaseModel):
20
28
  model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
@@ -32,9 +40,83 @@ class SegmentProcessing(BaseModel):
32
40
  text: Optional[GenerationConfig] = Field(default=None, alias="Text")
33
41
  title: Optional[GenerationConfig] = Field(default=None, alias="Title")
34
42
 
43
+ class Tokenizer(str, Enum):
44
+ WORD = "Word"
45
+ CL100K_BASE = "Cl100kBase"
46
+ XLM_ROBERTA_BASE = "XlmRobertaBase"
47
+ BERT_BASE_UNCASED = "BertBaseUncased"
48
+
49
+ class TokenizerType(BaseModel):
50
+ enum_value: Optional[Tokenizer] = None
51
+ string_value: Optional[str] = None
52
+
53
+ @classmethod
54
+ def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
55
+ return cls(enum_value=enum_value)
56
+
57
+ @classmethod
58
+ def from_string(cls, string_value: str) -> "TokenizerType":
59
+ return cls(string_value=string_value)
60
+
61
+ def __str__(self) -> str:
62
+ if self.enum_value is not None:
63
+ return f"enum:{self.enum_value.value}"
64
+ elif self.string_value is not None:
65
+ return f"string:{self.string_value}"
66
+ return ""
67
+
68
+ model_config = ConfigDict(
69
+ json_encoders={
70
+ 'TokenizerType': lambda v: v.model_dump()
71
+ }
72
+ )
73
+
74
+ def model_dump(self, **kwargs):
75
+ if self.enum_value is not None:
76
+ return {"Enum": self.enum_value.value}
77
+ elif self.string_value is not None:
78
+ return {"String": self.string_value}
79
+ return {}
80
+
35
81
  class ChunkProcessing(BaseModel):
36
- ignore_headers_and_footers: Optional[bool] = None
82
+ ignore_headers_and_footers: Optional[bool] = True
37
83
  target_length: Optional[int] = None
84
+ tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
85
+
86
+ model_config = ConfigDict(
87
+ arbitrary_types_allowed=True,
88
+ json_encoders={
89
+ TokenizerType: lambda v: v.model_dump()
90
+ }
91
+ )
92
+
93
+ @field_validator('tokenizer', mode='before')
94
+ def validate_tokenizer(cls, v):
95
+ if v is None:
96
+ return None
97
+
98
+ if isinstance(v, TokenizerType):
99
+ return v
100
+
101
+ if isinstance(v, Tokenizer):
102
+ return TokenizerType(enum_value=v)
103
+
104
+ if isinstance(v, dict):
105
+ if "Enum" in v:
106
+ try:
107
+ return TokenizerType(enum_value=Tokenizer(v["Enum"]))
108
+ except ValueError:
109
+ return TokenizerType(string_value=v["Enum"])
110
+ elif "String" in v:
111
+ return TokenizerType(string_value=v["String"])
112
+
113
+ if isinstance(v, str):
114
+ try:
115
+ return TokenizerType(enum_value=Tokenizer(v))
116
+ except ValueError:
117
+ return TokenizerType(string_value=v)
118
+
119
+ raise ValueError(f"Cannot convert {v} to TokenizerType")
38
120
 
39
121
  class OcrStrategy(str, Enum):
40
122
  ALL = "All"
@@ -44,6 +126,10 @@ class SegmentationStrategy(str, Enum):
44
126
  LAYOUT_ANALYSIS = "LayoutAnalysis"
45
127
  PAGE = "Page"
46
128
 
129
+ class ErrorHandlingStrategy(str, Enum):
130
+ FAIL = "Fail"
131
+ CONTINUE = "Continue"
132
+
47
133
  class BoundingBox(BaseModel):
48
134
  left: float
49
135
  top: float
@@ -107,6 +193,7 @@ class Pipeline(str, Enum):
107
193
  class Configuration(BaseModel):
108
194
  chunk_processing: Optional[ChunkProcessing] = None
109
195
  expires_in: Optional[int] = None
196
+ error_handling: Optional[ErrorHandlingStrategy] = None
110
197
  high_resolution: Optional[bool] = None
111
198
  ocr_strategy: Optional[OcrStrategy] = None
112
199
  segment_processing: Optional[SegmentProcessing] = None
chunkr_ai/models.py CHANGED
@@ -4,6 +4,8 @@ from .api.configuration import (
4
4
  ChunkProcessing,
5
5
  Configuration,
6
6
  CroppingStrategy,
7
+ EmbedSource,
8
+ ErrorHandlingStrategy,
7
9
  GenerationStrategy,
8
10
  GenerationConfig,
9
11
  Model,
@@ -16,6 +18,8 @@ from .api.configuration import (
16
18
  SegmentationStrategy,
17
19
  Status,
18
20
  Pipeline,
21
+ Tokenizer,
22
+ TokenizerType,
19
23
  )
20
24
  from .api.task_response import TaskResponse
21
25
 
@@ -25,6 +29,8 @@ __all__ = [
25
29
  "ChunkProcessing",
26
30
  "Configuration",
27
31
  "CroppingStrategy",
32
+ "EmbedSource",
33
+ "ErrorHandlingStrategy",
28
34
  "GenerationConfig",
29
35
  "GenerationStrategy",
30
36
  "Model",
@@ -38,5 +44,6 @@ __all__ = [
38
44
  "Status",
39
45
  "TaskResponse",
40
46
  "Pipeline",
47
+ "Tokenizer",
48
+ "TokenizerType",
41
49
  ]
42
-
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.0.43
3
+ Version: 0.0.45
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -1,16 +1,16 @@
1
1
  chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
- chunkr_ai/models.py,sha256=tOI7ylkhyeFfCLMisk96EPsH4UEcjBx1Mcisxc_AYXI,757
2
+ chunkr_ai/models.py,sha256=m3_3sUm4zm2LTqmxFWJtAGMqJbksmzCF2X2wGCAgSLY,920
3
3
  chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
5
5
  chunkr_ai/api/chunkr.py,sha256=BzwcKNCuLfVR-HzgY8tKStsW4pIDVVjBgnEqPLyUUMM,3292
6
6
  chunkr_ai/api/chunkr_base.py,sha256=FDl0Ew8eOY4hur5FFqPENZiq9YQy0G3XWEqcKPeCO-U,6130
7
- chunkr_ai/api/configuration.py,sha256=Kkqxco8M-xgijUMsmtL8rJpMxrnSEnsQY8dUP8Cg5oc,3947
7
+ chunkr_ai/api/configuration.py,sha256=KHOt1MiRvDbIvAdFK8I5j5uA4O561WeCBd1J1megQ7Y,6671
8
8
  chunkr_ai/api/decorators.py,sha256=VJX4qGBIL00K2zY8bh5KAMWv7SltJ38TvPJH06FnFss,4415
9
9
  chunkr_ai/api/misc.py,sha256=QN-2YWQ8e3VvvK63Ua-e8jsx6gxVxkO88Z96yWOofu0,3653
10
10
  chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
11
11
  chunkr_ai/api/task_response.py,sha256=6kk9g2f7OZB3PAsmp4Or5A42r1dXTAzWAHEIVtLQ9sA,6545
12
- chunkr_ai-0.0.43.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
13
- chunkr_ai-0.0.43.dist-info/METADATA,sha256=EUBjfees5n8KRCpn65HFwsvzmSZYJ7wl5rkkbADwkd0,7053
14
- chunkr_ai-0.0.43.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
15
- chunkr_ai-0.0.43.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
16
- chunkr_ai-0.0.43.dist-info/RECORD,,
12
+ chunkr_ai-0.0.45.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
13
+ chunkr_ai-0.0.45.dist-info/METADATA,sha256=owMljpIYmTevXkW9V-Nly0dBILzmyN8MfKeXpXctcVU,7053
14
+ chunkr_ai-0.0.45.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
15
+ chunkr_ai-0.0.45.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
16
+ chunkr_ai-0.0.45.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (77.0.3)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5