sharedkernel 2.7.0__tar.gz → 2.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/PKG-INFO +3 -1
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/README.md +2 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/setup.py +3 -1
- sharedkernel-2.8.0/sharedkernel/chunker/chunk_rule.py +84 -0
- sharedkernel-2.8.0/sharedkernel/chunker/text_chunker.py +107 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/PKG-INFO +3 -1
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/SOURCES.txt +2 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/setup.cfg +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/common.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/config.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/data_format_converter.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/__init__.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/audit_model.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/cache/__init__.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/cache/cache_repository.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/cache/redis_generic_cache.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/distributed_cache.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/mongo_generic_audit_repository.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/mongo_generic_repository.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/mongo_health_checker.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/pagination_response_dto.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/date_converter.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/diff_utils.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/__init__.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/error_code.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/redis_mode_enum.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/sort_order.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/exception/__init__.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/exception/exception.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/exception/exception_handlers.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/file_validation.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/ip_session_service.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/jwt_service.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_decorator.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_dto.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_enums.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_info.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_middlewares.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/logger_service.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/multipart_upload.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/__init__.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/number_normalizer.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/phone_number_normalizer.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/string_normalizer.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/__init__.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/base_document.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/json_string_model.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/jwt_model.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/result.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/user_info.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/regex_masking.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/s3_uploader.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/string_extentions.py +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/dependency_links.txt +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/requires.txt +0 -0
- {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sharedkernel
|
|
3
|
-
Version: 2.7.0
|
|
3
|
+
Version: 2.8.0
|
|
4
4
|
Summary: sharekernel is a shared package between all python projects
|
|
5
5
|
Author: Smilinno
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -31,6 +31,8 @@ Dynamic: summary
|
|
|
31
31
|
this is a shared kernel package
|
|
32
32
|
|
|
33
33
|
# Change Log
|
|
34
|
+
### Version 2.8.0
|
|
35
|
+
- Add chunker for tts
|
|
34
36
|
### Version 2.7.0
|
|
35
37
|
- Add Redis and cache repository
|
|
36
38
|
### Version 2.6.4
|
|
@@ -19,6 +19,8 @@ setup(
|
|
|
19
19
|
"sharedkernel.objects",
|
|
20
20
|
"sharedkernel.normalizer",
|
|
21
21
|
"sharedkernel.logger",
|
|
22
|
+
"sharedkernel.chunker",
|
|
23
|
+
|
|
22
24
|
],
|
|
23
25
|
# Needed for dependencies
|
|
24
26
|
install_requires=[
|
|
@@ -41,7 +43,7 @@ setup(
|
|
|
41
43
|
"redis==8.0.0",
|
|
42
44
|
],
|
|
43
45
|
# *strongly* suggested for sharing
|
|
44
|
-
version="2.7.0",
|
|
46
|
+
version="2.8.0",
|
|
45
47
|
description="sharekernel is a shared package between all python projects",
|
|
46
48
|
long_description=long_description,
|
|
47
49
|
long_description_content_type="text/markdown",
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import re
|
|
3
|
+
from typing import List, Optional, Protocol, Tuple
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SplitRule(Protocol):
    """Structural interface implemented by every chunk split rule.

    A rule looks at one segment of text and proposes where the current
    chunk should end.
    """

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Propose a split position for ``segment``.

        Args:
            segment: The piece of text to inspect.
            offset: Amount added to an index found inside ``segment`` to
                form the returned split index.

        Returns:
            A ``(split_index, phrase, method)`` triple, or
            ``(None, None, None)`` when the rule has nothing to propose.
        """
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PunctuationRule(SplitRule):
    """Split right after the first punctuation character in a segment."""

    def __init__(self, punctuations: str = ".,;:!?"):
        # A set gives constant-time membership tests while scanning.
        self.punctuations = set(punctuations)

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Locate the first configured punctuation character.

        Args:
            segment: Text to scan from left to right.
            offset: Amount added to the index found inside ``segment``.

        Returns:
            ``(offset + index + 1, character, "punctuation")`` for the first
            hit, so the split falls just after the character; otherwise
            ``(None, None, None)``.
        """
        hit = next(
            (
                (index, char)
                for index, char in enumerate(segment)
                if char in self.punctuations
            ),
            None,
        )
        if hit is None:
            return None, None, None

        index, char = hit
        return offset + index + 1, char, "punctuation"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class RegexRule(SplitRule):
    """Split right after the first match of a regular expression."""

    def __init__(self, pattern: str):
        # An empty/falsy pattern disables the rule instead of compiling it.
        if pattern:
            self.regex = re.compile(pattern)
        else:
            self.regex = None

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Search ``segment`` for the configured pattern.

        Args:
            segment: Text to search.
            offset: Amount added to the match end found inside ``segment``.

        Returns:
            ``(offset + match_end, matched_text, "regex")`` for the first
            match; ``(None, None, None)`` when the rule is disabled or the
            pattern does not match.
        """
        found = self.regex.search(segment) if self.regex else None
        if found is None:
            return None, None, None

        return offset + found.end(), found.group(0), "regex"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class CutWordRule(SplitRule):
    """Split after the first occurrence of any configured word/phrase.

    Words are tried in the order they were configured; the first word that
    occurs anywhere in the segment wins, even if a later word occurs
    earlier in the text. Matching is case-insensitive and substring-based.
    """

    def __init__(self, cut_words: List[str]):
        # Empty entries are dropped: "" is found at index 0 of every
        # segment, which would produce a zero-length split and prevent
        # any later word from ever being tried.
        self.cut_words = [w.lower() for w in cut_words if w]

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Find the first configured word present in ``segment``.

        Args:
            segment: Text to search (compared in lower case).
            offset: Amount added to the index found inside ``segment``.

        Returns:
            ``(offset + end_of_word, phrase, "cut_word")`` where ``phrase``
            is the matched text with its original casing; otherwise
            ``(None, None, None)``.
        """
        # NOTE(review): str.lower() can change the length of a few
        # characters (e.g. U+0130), in which case indices into `lowered`
        # no longer line up with `segment` — confirm whether such input
        # can reach this rule.
        lowered = segment.lower()
        for word in self.cut_words:
            idx = lowered.find(word)
            if idx != -1:
                end = idx + len(word)
                return offset + end, segment[idx:end], "cut_word"
        return None, None, None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class SpaceFallbackRule(SplitRule):
    """Split on a space near the centre of the segment; otherwise hard-cut.

    The last space before the centre is preferred; failing that, the first
    space at or after the centre is used. When the segment offers no usable
    space the cut falls at ``max_len`` characters, limited to the length of
    the segment.
    """

    def __init__(self, max_len: int):
        self.max_len = max_len

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Pick a split position; this rule always proposes one.

        Args:
            segment: Text to split.
            offset: Amount added to the index chosen inside ``segment``.

        Returns:
            ``(offset + split, None, "space_fallback")``. When a space is
            used, ``split`` is the index of that space, so the space itself
            starts the remainder rather than ending the chunk.
        """
        centre = len(segment) // 2
        left = segment.rfind(" ", 0, centre)
        right = segment.find(" ", centre)

        # A space at index 0 would give a zero-length chunk, so it is not
        # treated as a usable split point on either side.
        if left > 0:
            split = left
        elif right > 0:
            split = right
        else:
            # Never point past the end of the segment when max_len is
            # larger than the text actually supplied.
            split = min(self.max_len, len(segment))

        return offset + split, None, "space_fallback"
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from .chunk_rule import (
|
|
5
|
+
SplitRule,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Chunk:
    """One piece of text emitted by TextChunker, with its provenance."""

    # The chunk's text, i.e. the source text sliced as [begin:end].
    text: str
    # Sequence number of the chunk; TextChunker starts counting at 1.
    number: int
    # Index in the source text where the chunk starts.
    begin: int
    # Index in the source text where the chunk stops (exclusive).
    end: int
    # Label of what produced the split: the label reported by the rule,
    # or "final" / "hard_cut" when set by TextChunker itself.
    method: str
    # Text reported by the rule that caused the split, if any.
    phrase: Optional[str]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TextChunker:
    """Split text into chunks based on ordered split rules.

    The text is consumed left to right. At each step a window of at most
    ``max_len`` characters is offered to the rules in order; the first rule
    whose proposed split yields a chunk of between ``min_len`` and
    ``max_len`` characters decides where the chunk ends. If no rule
    qualifies, the chunk is cut at exactly ``max_len`` characters. Whatever
    remains once it fits in ``max_len`` is emitted as the last chunk.
    """

    def __init__(self, min_len: int, max_len: int, rules: List[SplitRule]):
        """Store the chunking limits and the ordered rules.

        Args:
            min_len: Smallest chunk length a rule may produce (>= 1).
            max_len: Largest chunk length (>= ``min_len``).
            rules: Split rules, tried in the given order.

        Raises:
            ValueError: If ``min_len < 1`` or ``max_len < min_len``.
        """
        if min_len < 1:
            raise ValueError("min_len must be >= 1")
        if max_len < min_len:
            raise ValueError("max_len must be >= min_len")

        self.min_len = min_len
        self.max_len = max_len
        self.rules = rules

    def chunk(self, text: str) -> List[Chunk]:
        """Return all chunks of ``text`` as a list (empty for empty text)."""
        # Single source of truth: the lazy generator holds the algorithm.
        return list(self.chunks(text))

    def chunks(self, text: str):
        """Yield chunks lazily, using the same logic as `chunk`."""
        if not text:
            return

        total = len(text)
        pos = 0
        number = 1

        while pos < total:
            if total - pos <= self.max_len:
                # The remainder fits in one chunk; emit it and stop.
                yield Chunk(text[pos:], number, pos, total, "final", None)
                break

            window = text[pos : pos + self.max_len]
            split_idx, phrase, method = self._find_split(window, pos)

            yield Chunk(text[pos:split_idx], number, pos, split_idx, method, phrase)

            pos = split_idx
            number += 1

    def _find_split(self, window: str, pos: int):
        """Return ``(split_idx, phrase, method)`` for the window at ``pos``.

        Each rule is asked once. A proposal is accepted only when the
        resulting chunk length lies in ``[min_len, max_len]``; a proposal
        outside that range is discarded and the next rule is tried (the
        same rule is not asked for a later match). With no acceptable
        proposal the window is cut at ``max_len`` with method "hard_cut".
        """
        for rule in self.rules:
            candidate, cand_phrase, cand_method = rule.apply(window, pos)
            if candidate is None:
                continue
            # The upper bound keeps a rule from pushing the chunk past
            # the window, which would exceed max_len and could place the
            # chunk end beyond the text.
            if self.min_len <= candidate - pos <= self.max_len:
                return candidate, cand_phrase, cand_method

        return pos + self.max_len, None, "hard_cut"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sharedkernel
|
|
3
|
-
Version: 2.7.0
|
|
3
|
+
Version: 2.8.0
|
|
4
4
|
Summary: sharekernel is a shared package between all python projects
|
|
5
5
|
Author: Smilinno
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -31,6 +31,8 @@ Dynamic: summary
|
|
|
31
31
|
this is a shared kernel package
|
|
32
32
|
|
|
33
33
|
# Change Log
|
|
34
|
+
### Version 2.8.0
|
|
35
|
+
- Add chunker for tts
|
|
34
36
|
### Version 2.7.0
|
|
35
37
|
- Add Redis and cache repository
|
|
36
38
|
### Version 2.6.4
|
|
@@ -17,6 +17,8 @@ sharedkernel.egg-info/SOURCES.txt
|
|
|
17
17
|
sharedkernel.egg-info/dependency_links.txt
|
|
18
18
|
sharedkernel.egg-info/requires.txt
|
|
19
19
|
sharedkernel.egg-info/top_level.txt
|
|
20
|
+
sharedkernel/chunker/chunk_rule.py
|
|
21
|
+
sharedkernel/chunker/text_chunker.py
|
|
20
22
|
sharedkernel/database/__init__.py
|
|
21
23
|
sharedkernel/database/audit_model.py
|
|
22
24
|
sharedkernel/database/distributed_cache.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/cache/redis_generic_cache.py
RENAMED
|
File without changes
|
|
File without changes
|
{sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/mongo_generic_audit_repository.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/phone_number_normalizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|