chunkr-ai 0.0.43__py3-none-any.whl → 0.0.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/configuration.py +89 -2
- chunkr_ai/models.py +8 -1
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.45.dist-info}/METADATA +1 -1
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.45.dist-info}/RECORD +7 -7
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.45.dist-info}/WHEEL +1 -1
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.45.dist-info}/licenses/LICENSE +0 -0
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.45.dist-info}/top_level.txt +0 -0
chunkr_ai/api/configuration.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from pydantic import BaseModel, Field, ConfigDict
|
2
2
|
from enum import Enum
|
3
|
-
from typing import Any, List, Optional
|
3
|
+
from typing import Any, List, Optional, Union
|
4
|
+
from pydantic import field_validator
|
4
5
|
|
5
6
|
class GenerationStrategy(str, Enum):
|
6
7
|
LLM = "LLM"
|
@@ -10,11 +11,18 @@ class CroppingStrategy(str, Enum):
|
|
10
11
|
ALL = "All"
|
11
12
|
AUTO = "Auto"
|
12
13
|
|
14
|
+
class EmbedSource(str, Enum):
|
15
|
+
HTML = "HTML"
|
16
|
+
MARKDOWN = "Markdown"
|
17
|
+
LLM = "LLM"
|
18
|
+
CONTENT = "Content"
|
19
|
+
|
13
20
|
class GenerationConfig(BaseModel):
|
14
21
|
html: Optional[GenerationStrategy] = None
|
15
22
|
llm: Optional[str] = None
|
16
23
|
markdown: Optional[GenerationStrategy] = None
|
17
24
|
crop_image: Optional[CroppingStrategy] = None
|
25
|
+
embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
|
18
26
|
|
19
27
|
class SegmentProcessing(BaseModel):
|
20
28
|
model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
|
@@ -32,9 +40,83 @@ class SegmentProcessing(BaseModel):
|
|
32
40
|
text: Optional[GenerationConfig] = Field(default=None, alias="Text")
|
33
41
|
title: Optional[GenerationConfig] = Field(default=None, alias="Title")
|
34
42
|
|
43
|
+
class Tokenizer(str, Enum):
|
44
|
+
WORD = "Word"
|
45
|
+
CL100K_BASE = "Cl100kBase"
|
46
|
+
XLM_ROBERTA_BASE = "XlmRobertaBase"
|
47
|
+
BERT_BASE_UNCASED = "BertBaseUncased"
|
48
|
+
|
49
|
+
class TokenizerType(BaseModel):
|
50
|
+
enum_value: Optional[Tokenizer] = None
|
51
|
+
string_value: Optional[str] = None
|
52
|
+
|
53
|
+
@classmethod
|
54
|
+
def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
|
55
|
+
return cls(enum_value=enum_value)
|
56
|
+
|
57
|
+
@classmethod
|
58
|
+
def from_string(cls, string_value: str) -> "TokenizerType":
|
59
|
+
return cls(string_value=string_value)
|
60
|
+
|
61
|
+
def __str__(self) -> str:
|
62
|
+
if self.enum_value is not None:
|
63
|
+
return f"enum:{self.enum_value.value}"
|
64
|
+
elif self.string_value is not None:
|
65
|
+
return f"string:{self.string_value}"
|
66
|
+
return ""
|
67
|
+
|
68
|
+
model_config = ConfigDict(
|
69
|
+
json_encoders={
|
70
|
+
'TokenizerType': lambda v: v.model_dump()
|
71
|
+
}
|
72
|
+
)
|
73
|
+
|
74
|
+
def model_dump(self, **kwargs):
|
75
|
+
if self.enum_value is not None:
|
76
|
+
return {"Enum": self.enum_value.value}
|
77
|
+
elif self.string_value is not None:
|
78
|
+
return {"String": self.string_value}
|
79
|
+
return {}
|
80
|
+
|
35
81
|
class ChunkProcessing(BaseModel):
|
36
|
-
ignore_headers_and_footers: Optional[bool] =
|
82
|
+
ignore_headers_and_footers: Optional[bool] = True
|
37
83
|
target_length: Optional[int] = None
|
84
|
+
tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
|
85
|
+
|
86
|
+
model_config = ConfigDict(
|
87
|
+
arbitrary_types_allowed=True,
|
88
|
+
json_encoders={
|
89
|
+
TokenizerType: lambda v: v.model_dump()
|
90
|
+
}
|
91
|
+
)
|
92
|
+
|
93
|
+
@field_validator('tokenizer', mode='before')
|
94
|
+
def validate_tokenizer(cls, v):
|
95
|
+
if v is None:
|
96
|
+
return None
|
97
|
+
|
98
|
+
if isinstance(v, TokenizerType):
|
99
|
+
return v
|
100
|
+
|
101
|
+
if isinstance(v, Tokenizer):
|
102
|
+
return TokenizerType(enum_value=v)
|
103
|
+
|
104
|
+
if isinstance(v, dict):
|
105
|
+
if "Enum" in v:
|
106
|
+
try:
|
107
|
+
return TokenizerType(enum_value=Tokenizer(v["Enum"]))
|
108
|
+
except ValueError:
|
109
|
+
return TokenizerType(string_value=v["Enum"])
|
110
|
+
elif "String" in v:
|
111
|
+
return TokenizerType(string_value=v["String"])
|
112
|
+
|
113
|
+
if isinstance(v, str):
|
114
|
+
try:
|
115
|
+
return TokenizerType(enum_value=Tokenizer(v))
|
116
|
+
except ValueError:
|
117
|
+
return TokenizerType(string_value=v)
|
118
|
+
|
119
|
+
raise ValueError(f"Cannot convert {v} to TokenizerType")
|
38
120
|
|
39
121
|
class OcrStrategy(str, Enum):
|
40
122
|
ALL = "All"
|
@@ -44,6 +126,10 @@ class SegmentationStrategy(str, Enum):
|
|
44
126
|
LAYOUT_ANALYSIS = "LayoutAnalysis"
|
45
127
|
PAGE = "Page"
|
46
128
|
|
129
|
+
class ErrorHandlingStrategy(str, Enum):
|
130
|
+
FAIL = "Fail"
|
131
|
+
CONTINUE = "Continue"
|
132
|
+
|
47
133
|
class BoundingBox(BaseModel):
|
48
134
|
left: float
|
49
135
|
top: float
|
@@ -107,6 +193,7 @@ class Pipeline(str, Enum):
|
|
107
193
|
class Configuration(BaseModel):
|
108
194
|
chunk_processing: Optional[ChunkProcessing] = None
|
109
195
|
expires_in: Optional[int] = None
|
196
|
+
error_handling: Optional[ErrorHandlingStrategy] = None
|
110
197
|
high_resolution: Optional[bool] = None
|
111
198
|
ocr_strategy: Optional[OcrStrategy] = None
|
112
199
|
segment_processing: Optional[SegmentProcessing] = None
|
chunkr_ai/models.py
CHANGED
@@ -4,6 +4,8 @@ from .api.configuration import (
|
|
4
4
|
ChunkProcessing,
|
5
5
|
Configuration,
|
6
6
|
CroppingStrategy,
|
7
|
+
EmbedSource,
|
8
|
+
ErrorHandlingStrategy,
|
7
9
|
GenerationStrategy,
|
8
10
|
GenerationConfig,
|
9
11
|
Model,
|
@@ -16,6 +18,8 @@ from .api.configuration import (
|
|
16
18
|
SegmentationStrategy,
|
17
19
|
Status,
|
18
20
|
Pipeline,
|
21
|
+
Tokenizer,
|
22
|
+
TokenizerType,
|
19
23
|
)
|
20
24
|
from .api.task_response import TaskResponse
|
21
25
|
|
@@ -25,6 +29,8 @@ __all__ = [
|
|
25
29
|
"ChunkProcessing",
|
26
30
|
"Configuration",
|
27
31
|
"CroppingStrategy",
|
32
|
+
"EmbedSource",
|
33
|
+
"ErrorHandlingStrategy",
|
28
34
|
"GenerationConfig",
|
29
35
|
"GenerationStrategy",
|
30
36
|
"Model",
|
@@ -38,5 +44,6 @@ __all__ = [
|
|
38
44
|
"Status",
|
39
45
|
"TaskResponse",
|
40
46
|
"Pipeline",
|
47
|
+
"Tokenizer",
|
48
|
+
"TokenizerType",
|
41
49
|
]
|
42
|
-
|
@@ -1,16 +1,16 @@
|
|
1
1
|
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
-
chunkr_ai/models.py,sha256=
|
2
|
+
chunkr_ai/models.py,sha256=m3_3sUm4zm2LTqmxFWJtAGMqJbksmzCF2X2wGCAgSLY,920
|
3
3
|
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
5
5
|
chunkr_ai/api/chunkr.py,sha256=BzwcKNCuLfVR-HzgY8tKStsW4pIDVVjBgnEqPLyUUMM,3292
|
6
6
|
chunkr_ai/api/chunkr_base.py,sha256=FDl0Ew8eOY4hur5FFqPENZiq9YQy0G3XWEqcKPeCO-U,6130
|
7
|
-
chunkr_ai/api/configuration.py,sha256=
|
7
|
+
chunkr_ai/api/configuration.py,sha256=KHOt1MiRvDbIvAdFK8I5j5uA4O561WeCBd1J1megQ7Y,6671
|
8
8
|
chunkr_ai/api/decorators.py,sha256=VJX4qGBIL00K2zY8bh5KAMWv7SltJ38TvPJH06FnFss,4415
|
9
9
|
chunkr_ai/api/misc.py,sha256=QN-2YWQ8e3VvvK63Ua-e8jsx6gxVxkO88Z96yWOofu0,3653
|
10
10
|
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
11
11
|
chunkr_ai/api/task_response.py,sha256=6kk9g2f7OZB3PAsmp4Or5A42r1dXTAzWAHEIVtLQ9sA,6545
|
12
|
-
chunkr_ai-0.0.
|
13
|
-
chunkr_ai-0.0.
|
14
|
-
chunkr_ai-0.0.
|
15
|
-
chunkr_ai-0.0.
|
16
|
-
chunkr_ai-0.0.
|
12
|
+
chunkr_ai-0.0.45.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
13
|
+
chunkr_ai-0.0.45.dist-info/METADATA,sha256=owMljpIYmTevXkW9V-Nly0dBILzmyN8MfKeXpXctcVU,7053
|
14
|
+
chunkr_ai-0.0.45.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
15
|
+
chunkr_ai-0.0.45.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
16
|
+
chunkr_ai-0.0.45.dist-info/RECORD,,
|
File without changes
|
File without changes
|