chunkr-ai 0.0.43__py3-none-any.whl → 0.0.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkr_ai/api/configuration.py +84 -2
- chunkr_ai/models.py +6 -1
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.44.dist-info}/METADATA +1 -1
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.44.dist-info}/RECORD +7 -7
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.44.dist-info}/WHEEL +1 -1
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.44.dist-info}/licenses/LICENSE +0 -0
- {chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.44.dist-info}/top_level.txt +0 -0
chunkr_ai/api/configuration.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from pydantic import BaseModel, Field, ConfigDict
|
2
2
|
from enum import Enum
|
3
|
-
from typing import Any, List, Optional
|
3
|
+
from typing import Any, List, Optional, Union
|
4
|
+
from pydantic import field_validator
|
4
5
|
|
5
6
|
class GenerationStrategy(str, Enum):
|
6
7
|
LLM = "LLM"
|
@@ -10,11 +11,18 @@ class CroppingStrategy(str, Enum):
|
|
10
11
|
ALL = "All"
|
11
12
|
AUTO = "Auto"
|
12
13
|
|
14
|
+
class EmbedSource(str, Enum):
|
15
|
+
HTML = "HTML"
|
16
|
+
MARKDOWN = "Markdown"
|
17
|
+
LLM = "LLM"
|
18
|
+
CONTENT = "Content"
|
19
|
+
|
13
20
|
class GenerationConfig(BaseModel):
|
14
21
|
html: Optional[GenerationStrategy] = None
|
15
22
|
llm: Optional[str] = None
|
16
23
|
markdown: Optional[GenerationStrategy] = None
|
17
24
|
crop_image: Optional[CroppingStrategy] = None
|
25
|
+
embed_sources: Optional[List[EmbedSource]] = Field(default_factory=lambda: [EmbedSource.MARKDOWN])
|
18
26
|
|
19
27
|
class SegmentProcessing(BaseModel):
|
20
28
|
model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
|
@@ -32,9 +40,83 @@ class SegmentProcessing(BaseModel):
|
|
32
40
|
text: Optional[GenerationConfig] = Field(default=None, alias="Text")
|
33
41
|
title: Optional[GenerationConfig] = Field(default=None, alias="Title")
|
34
42
|
|
43
|
+
class Tokenizer(str, Enum):
|
44
|
+
WORD = "Word"
|
45
|
+
CL100K_BASE = "Cl100kBase"
|
46
|
+
XLM_ROBERTA_BASE = "XlmRobertaBase"
|
47
|
+
BERT_BASE_UNCASED = "BertBaseUncased"
|
48
|
+
|
49
|
+
class TokenizerType(BaseModel):
|
50
|
+
enum_value: Optional[Tokenizer] = None
|
51
|
+
string_value: Optional[str] = None
|
52
|
+
|
53
|
+
@classmethod
|
54
|
+
def from_enum(cls, enum_value: Tokenizer) -> "TokenizerType":
|
55
|
+
return cls(enum_value=enum_value)
|
56
|
+
|
57
|
+
@classmethod
|
58
|
+
def from_string(cls, string_value: str) -> "TokenizerType":
|
59
|
+
return cls(string_value=string_value)
|
60
|
+
|
61
|
+
def __str__(self) -> str:
|
62
|
+
if self.enum_value is not None:
|
63
|
+
return f"enum:{self.enum_value.value}"
|
64
|
+
elif self.string_value is not None:
|
65
|
+
return f"string:{self.string_value}"
|
66
|
+
return ""
|
67
|
+
|
68
|
+
model_config = ConfigDict(
|
69
|
+
json_encoders={
|
70
|
+
'TokenizerType': lambda v: v.model_dump()
|
71
|
+
}
|
72
|
+
)
|
73
|
+
|
74
|
+
def model_dump(self, **kwargs):
|
75
|
+
if self.enum_value is not None:
|
76
|
+
return {"Enum": self.enum_value.value}
|
77
|
+
elif self.string_value is not None:
|
78
|
+
return {"String": self.string_value}
|
79
|
+
return {}
|
80
|
+
|
35
81
|
class ChunkProcessing(BaseModel):
|
36
|
-
ignore_headers_and_footers: Optional[bool] =
|
82
|
+
ignore_headers_and_footers: Optional[bool] = True
|
37
83
|
target_length: Optional[int] = None
|
84
|
+
tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
|
85
|
+
|
86
|
+
model_config = ConfigDict(
|
87
|
+
arbitrary_types_allowed=True,
|
88
|
+
json_encoders={
|
89
|
+
TokenizerType: lambda v: v.model_dump()
|
90
|
+
}
|
91
|
+
)
|
92
|
+
|
93
|
+
@field_validator('tokenizer', mode='before')
|
94
|
+
def validate_tokenizer(cls, v):
|
95
|
+
if v is None:
|
96
|
+
return None
|
97
|
+
|
98
|
+
if isinstance(v, TokenizerType):
|
99
|
+
return v
|
100
|
+
|
101
|
+
if isinstance(v, Tokenizer):
|
102
|
+
return TokenizerType(enum_value=v)
|
103
|
+
|
104
|
+
if isinstance(v, dict):
|
105
|
+
if "Enum" in v:
|
106
|
+
try:
|
107
|
+
return TokenizerType(enum_value=Tokenizer(v["Enum"]))
|
108
|
+
except ValueError:
|
109
|
+
return TokenizerType(string_value=v["Enum"])
|
110
|
+
elif "String" in v:
|
111
|
+
return TokenizerType(string_value=v["String"])
|
112
|
+
|
113
|
+
if isinstance(v, str):
|
114
|
+
try:
|
115
|
+
return TokenizerType(enum_value=Tokenizer(v))
|
116
|
+
except ValueError:
|
117
|
+
return TokenizerType(string_value=v)
|
118
|
+
|
119
|
+
raise ValueError(f"Cannot convert {v} to TokenizerType")
|
38
120
|
|
39
121
|
class OcrStrategy(str, Enum):
|
40
122
|
ALL = "All"
|
chunkr_ai/models.py
CHANGED
@@ -4,6 +4,7 @@ from .api.configuration import (
|
|
4
4
|
ChunkProcessing,
|
5
5
|
Configuration,
|
6
6
|
CroppingStrategy,
|
7
|
+
EmbedSource,
|
7
8
|
GenerationStrategy,
|
8
9
|
GenerationConfig,
|
9
10
|
Model,
|
@@ -16,6 +17,8 @@ from .api.configuration import (
|
|
16
17
|
SegmentationStrategy,
|
17
18
|
Status,
|
18
19
|
Pipeline,
|
20
|
+
Tokenizer,
|
21
|
+
TokenizerType,
|
19
22
|
)
|
20
23
|
from .api.task_response import TaskResponse
|
21
24
|
|
@@ -25,6 +28,7 @@ __all__ = [
|
|
25
28
|
"ChunkProcessing",
|
26
29
|
"Configuration",
|
27
30
|
"CroppingStrategy",
|
31
|
+
"EmbedSource",
|
28
32
|
"GenerationConfig",
|
29
33
|
"GenerationStrategy",
|
30
34
|
"Model",
|
@@ -38,5 +42,6 @@ __all__ = [
|
|
38
42
|
"Status",
|
39
43
|
"TaskResponse",
|
40
44
|
"Pipeline",
|
45
|
+
"Tokenizer",
|
46
|
+
"TokenizerType",
|
41
47
|
]
|
42
|
-
|
{chunkr_ai-0.0.43.dist-info → chunkr_ai-0.0.44.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
|
2
|
-
chunkr_ai/models.py,sha256=
|
2
|
+
chunkr_ai/models.py,sha256=Pfr8S0qbC5GSgI3zCE63bnBCWOOjiExBXIrSRPyLhkc,864
|
3
3
|
chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
chunkr_ai/api/auth.py,sha256=hlv0GiUmlsbFO1wLL9sslqOnsBSoBqkL_6Mk2SDvxgE,413
|
5
5
|
chunkr_ai/api/chunkr.py,sha256=BzwcKNCuLfVR-HzgY8tKStsW4pIDVVjBgnEqPLyUUMM,3292
|
6
6
|
chunkr_ai/api/chunkr_base.py,sha256=FDl0Ew8eOY4hur5FFqPENZiq9YQy0G3XWEqcKPeCO-U,6130
|
7
|
-
chunkr_ai/api/configuration.py,sha256=
|
7
|
+
chunkr_ai/api/configuration.py,sha256=jHEAz3H9uRh22jpSqnGyCdT4VbkCE_L_1fm0uVlv_1U,6527
|
8
8
|
chunkr_ai/api/decorators.py,sha256=VJX4qGBIL00K2zY8bh5KAMWv7SltJ38TvPJH06FnFss,4415
|
9
9
|
chunkr_ai/api/misc.py,sha256=QN-2YWQ8e3VvvK63Ua-e8jsx6gxVxkO88Z96yWOofu0,3653
|
10
10
|
chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
|
11
11
|
chunkr_ai/api/task_response.py,sha256=6kk9g2f7OZB3PAsmp4Or5A42r1dXTAzWAHEIVtLQ9sA,6545
|
12
|
-
chunkr_ai-0.0.
|
13
|
-
chunkr_ai-0.0.
|
14
|
-
chunkr_ai-0.0.
|
15
|
-
chunkr_ai-0.0.
|
16
|
-
chunkr_ai-0.0.
|
12
|
+
chunkr_ai-0.0.44.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
|
13
|
+
chunkr_ai-0.0.44.dist-info/METADATA,sha256=Gk-DiU78MVBXlYk3BjcKCVGfy1JL-SB0wj8p-ooovYs,7053
|
14
|
+
chunkr_ai-0.0.44.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
15
|
+
chunkr_ai-0.0.44.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
|
16
|
+
chunkr_ai-0.0.44.dist-info/RECORD,,
|
File without changes
|
File without changes
|