sharedkernel 2.7.0__tar.gz → 2.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/PKG-INFO +3 -1
  2. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/README.md +2 -0
  3. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/setup.py +3 -1
  4. sharedkernel-2.8.0/sharedkernel/chunker/chunk_rule.py +84 -0
  5. sharedkernel-2.8.0/sharedkernel/chunker/text_chunker.py +107 -0
  6. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/PKG-INFO +3 -1
  7. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/SOURCES.txt +2 -0
  8. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/setup.cfg +0 -0
  9. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/common.py +0 -0
  10. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/config.py +0 -0
  11. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/data_format_converter.py +0 -0
  12. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/__init__.py +0 -0
  13. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/audit_model.py +0 -0
  14. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/cache/__init__.py +0 -0
  15. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/cache/cache_repository.py +0 -0
  16. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/cache/redis_generic_cache.py +0 -0
  17. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/distributed_cache.py +0 -0
  18. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/mongo_generic_audit_repository.py +0 -0
  19. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/mongo_generic_repository.py +0 -0
  20. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/mongo_health_checker.py +0 -0
  21. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/database/pagination_response_dto.py +0 -0
  22. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/date_converter.py +0 -0
  23. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/diff_utils.py +0 -0
  24. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/__init__.py +0 -0
  25. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/error_code.py +0 -0
  26. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/redis_mode_enum.py +0 -0
  27. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/enum/sort_order.py +0 -0
  28. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/exception/__init__.py +0 -0
  29. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/exception/exception.py +0 -0
  30. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/exception/exception_handlers.py +0 -0
  31. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/file_validation.py +0 -0
  32. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/ip_session_service.py +0 -0
  33. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/jwt_service.py +0 -0
  34. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_decorator.py +0 -0
  35. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_dto.py +0 -0
  36. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_enums.py +0 -0
  37. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_info.py +0 -0
  38. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/log_middlewares.py +0 -0
  39. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/logger/logger_service.py +0 -0
  40. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/multipart_upload.py +0 -0
  41. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/__init__.py +0 -0
  42. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/number_normalizer.py +0 -0
  43. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/phone_number_normalizer.py +0 -0
  44. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/normalizer/string_normalizer.py +0 -0
  45. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/__init__.py +0 -0
  46. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/base_document.py +0 -0
  47. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/json_string_model.py +0 -0
  48. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/jwt_model.py +0 -0
  49. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/result.py +0 -0
  50. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/objects/user_info.py +0 -0
  51. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/regex_masking.py +0 -0
  52. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/s3_uploader.py +0 -0
  53. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel/string_extentions.py +0 -0
  54. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/dependency_links.txt +0 -0
  55. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/requires.txt +0 -0
  56. {sharedkernel-2.7.0 → sharedkernel-2.8.0}/sharedkernel.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sharedkernel
3
- Version: 2.7.0
3
+ Version: 2.8.0
4
4
  Summary: sharekernel is a shared package between all python projects
5
5
  Author: Smilinno
6
6
  Description-Content-Type: text/markdown
@@ -31,6 +31,8 @@ Dynamic: summary
31
31
  this is a shared kernel package
32
32
 
33
33
  # Change Log
34
+ ### Version 2.8.0
35
+ - Add chunker for tts
34
36
  ### Version 2.7.0
35
37
  - Add Redis and cache repository
36
38
  ### Version 2.6.4
@@ -2,6 +2,8 @@
2
2
  this is a shared kernel package
3
3
 
4
4
  # Change Log
5
+ ### Version 2.8.0
6
+ - Add chunker for tts
5
7
  ### Version 2.7.0
6
8
  - Add Redis and cache repository
7
9
  ### Version 2.6.4
@@ -19,6 +19,8 @@ setup(
19
19
  "sharedkernel.objects",
20
20
  "sharedkernel.normalizer",
21
21
  "sharedkernel.logger",
22
+ "sharedkernel.chunker",
23
+
22
24
  ],
23
25
  # Needed for dependencies
24
26
  install_requires=[
@@ -41,7 +43,7 @@ setup(
41
43
  "redis==8.0.0",
42
44
  ],
43
45
  # *strongly* suggested for sharing
44
- version="2.7.0",
46
+ version="2.8.0",
45
47
  description="sharekernel is a shared package between all python projects",
46
48
  long_description=long_description,
47
49
  long_description_content_type="text/markdown",
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+ import re
3
+ from typing import List, Optional, Protocol, Tuple
4
+
5
+
6
class SplitRule(Protocol):
    """Structural interface that every chunk split rule satisfies.

    A rule looks at one text segment and proposes where to cut it.
    """

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Propose a split point for ``segment``.

        Args:
            segment: The piece of text to inspect.
            offset: Value the concrete rules in this module add to their
                segment-relative index to build the returned split index.

        Returns:
            A ``(split_index, phrase, method)`` triple. The concrete rules
            in this module return ``(None, None, None)`` when they find no
            split point.
        """
        ...
12
+
13
+
14
class PunctuationRule(SplitRule):
    """Rule that cuts directly behind the earliest punctuation mark."""

    def __init__(self, punctuations: str = ".,;:!?"):
        # Each character of the string is one accepted punctuation mark;
        # a set keeps the per-character membership test cheap.
        self.punctuations = set(punctuations)

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Locate the first character of ``segment`` that is a punctuation mark.

        Returns:
            ``(offset + index + 1, character, "punctuation")`` so the mark
            stays at the end of the leading part, or ``(None, None, None)``
            when the segment contains none of the configured marks.
        """
        hit = next(
            (
                (index, char)
                for index, char in enumerate(segment)
                if char in self.punctuations
            ),
            None,
        )
        if hit is None:
            return None, None, None

        index, char = hit
        return offset + index + 1, char, "punctuation"
27
+
28
+
29
class RegexRule(SplitRule):
    """Rule that cuts directly behind the first match of a regular expression."""

    def __init__(self, pattern: str):
        # An empty/falsy pattern disables the rule instead of compiling a
        # regex that would match everywhere.
        self.regex = None
        if pattern:
            self.regex = re.compile(pattern)

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Search ``segment`` for the configured pattern.

        Returns:
            ``(offset + match_end, matched_text, "regex")`` for the first
            match, or ``(None, None, None)`` when the rule is disabled or
            nothing matches.
        """
        found = self.regex.search(segment) if self.regex else None
        if not found:
            return None, None, None
        return offset + found.end(), found.group(0), "regex"
44
+
45
+
46
class CutWordRule(SplitRule):
    """Split after the earliest occurrence of any configured word/phrase.

    Matching is case-insensitive and purely substring based (no word
    boundaries are enforced).
    """

    def __init__(self, cut_words: List[str]):
        # Stored lower-cased so they can be searched in the lower-cased
        # segment.
        self.cut_words = [w.lower() for w in cut_words]

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Find the cut word that occurs first in ``segment``.

        Args:
            segment: Text to search.
            offset: Added to the segment-relative end of the match.

        Returns:
            ``(offset + match_end, matched_text, "cut_word")`` where
            ``matched_text`` is taken from ``segment`` with its original
            casing, or ``(None, None, None)`` when no cut word occurs.
            When two words start at the same position, the one listed
            first in ``cut_words`` wins.
        """
        lowered = segment.lower()

        best_start = -1
        best_end = -1
        for word in self.cut_words:
            if not word:
                # "" is found at index 0 of every string and would shadow
                # every real cut word with a zero-length match.
                continue
            idx = lowered.find(word)
            if idx != -1 and (best_start == -1 or idx < best_start):
                best_start, best_end = idx, idx + len(word)

        if best_start == -1:
            return None, None, None

        if len(lowered) != len(segment):
            # Lower-casing changed the length (e.g. U+0130 becomes two code
            # points), so indices in `lowered` do not line up with `segment`.
            best_start, best_end = self._to_original_span(
                segment, best_start, best_end
            )

        return offset + best_end, segment[best_start:best_end], "cut_word"

    @staticmethod
    def _to_original_span(segment: str, start: int, end: int) -> Tuple[int, int]:
        """Map a ``[start, end)`` span of ``segment.lower()`` back onto ``segment``.

        A span that begins or ends inside the multi-character lowering of a
        single original character is widened to include that character.
        """
        orig_start = orig_end = len(segment)
        start_found = False
        lowered_pos = 0
        for index, char in enumerate(segment):
            next_pos = lowered_pos + len(char.lower())
            if not start_found and start < next_pos:
                orig_start = index
                start_found = True
            if end <= next_pos:
                orig_end = index + 1
                break
            lowered_pos = next_pos
        return orig_start, orig_end
62
+
63
+
64
class SpaceFallbackRule(SplitRule):
    """Split on a space near the centre; otherwise hard-cut at ``max_len``.

    Unlike the other rules in this module, this one always returns a split
    index.
    """

    def __init__(self, max_len: int):
        self.max_len = max_len

    def apply(
        self, segment: str, offset: int
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """Pick a split index for ``segment``.

        The closest space to the left of the centre is preferred, then the
        first space at or right of the centre. The returned index is the
        index of the space itself, so the space begins the following part.
        When the segment has no space, the cut is made at ``max_len``,
        clamped to the segment length so the index never points past the
        text that was inspected.

        Returns:
            ``(offset + split, None, "space_fallback")``.
        """
        centre = len(segment) // 2

        left = segment.rfind(" ", 0, centre)
        if left != -1:
            split = left
        else:
            right = segment.find(" ", centre)
            if right != -1:
                split = right
            else:
                # No space anywhere: hard cut, but never beyond the segment.
                split = min(self.max_len, len(segment))

        return offset + split, None, "space_fallback"
@@ -0,0 +1,107 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+
4
+ from .chunk_rule import (
5
+ SplitRule,
6
+ )
7
+
8
+
9
@dataclass
class Chunk:
    """One slice of the input text as emitted by TextChunker.

    The field order below is also the positional order of the generated
    constructor.
    """

    # The chunk's characters, i.e. the input text sliced as [begin:end].
    text: str
    # Sequence number of the chunk; TextChunker starts counting at 1.
    number: int
    # Index in the input text where the chunk starts (inclusive).
    begin: int
    # Index in the input text where the chunk stops (exclusive).
    end: int
    # Label of how the split was chosen: the label returned by the rule
    # that matched, or "final" / "hard_cut" when set by TextChunker.
    method: str
    # Text reported by the matching rule, or None when there is none.
    phrase: Optional[str]
19
+
20
+
21
class TextChunker:
    """Split text into chunks based on ordered split rules.

    For every window of at most ``max_len`` characters the rules are tried
    in order; the first rule proposing an acceptable split wins. If none
    does, the window is cut at ``max_len`` ("hard_cut"). The remaining tail
    that fits into ``max_len`` is emitted as the "final" chunk.
    """

    def __init__(self, min_len: int, max_len: int, rules: List[SplitRule]):
        """Configure the chunker.

        Args:
            min_len: Smallest chunk length a rule-based split may produce.
            max_len: Largest chunk length; also the size of the window that
                is handed to the rules.
            rules: Split rules, tried in list order.

        Raises:
            ValueError: If ``min_len < 1`` or ``max_len < min_len``.
        """
        if min_len < 1:
            raise ValueError("min_len must be >= 1")
        if max_len < min_len:
            raise ValueError("max_len must be >= min_len")

        self.min_len = min_len
        self.max_len = max_len
        self.rules = rules

    def chunk(self, text: str) -> List[Chunk]:
        """Return all chunks of ``text`` as a list (empty for empty text)."""
        # Single source of truth: the eager API is the lazy one, materialised.
        return list(self.chunks(text))

    def chunks(self, text: str):
        """Yield the chunks of ``text`` lazily, in order."""
        if not text:
            return

        total = len(text)
        pos = 0
        number = 1

        while pos < total:
            if total - pos <= self.max_len:
                # The rest fits into one chunk; no rule is consulted.
                yield Chunk(
                    text=text[pos:],
                    number=number,
                    begin=pos,
                    end=total,
                    method="final",
                    phrase=None,
                )
                return

            window_end = pos + self.max_len
            split_idx, phrase, method = self._pick_split(
                text[pos:window_end], pos, window_end
            )

            yield Chunk(
                text=text[pos:split_idx],
                number=number,
                begin=pos,
                end=split_idx,
                method=method,
                phrase=phrase,
            )
            pos = split_idx
            number += 1

    def _pick_split(self, window: str, pos: int, window_end: int):
        """Return ``(split_idx, phrase, method)`` for the window starting at ``pos``.

        A rule's candidate is accepted only if the resulting chunk is at
        least ``min_len`` long and does not reach past ``window_end``;
        otherwise the next rule is tried.
        """
        for rule in self.rules:
            candidate, phrase, method = rule.apply(window, pos)
            if candidate is None:
                continue
            # The upper bound guards against rules that answer with an index
            # outside the window they were shown, which would yield chunks
            # longer than max_len and an end index past the text.
            if pos + self.min_len <= candidate <= window_end:
                return candidate, phrase, method

        return window_end, None, "hard_cut"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sharedkernel
3
- Version: 2.7.0
3
+ Version: 2.8.0
4
4
  Summary: sharekernel is a shared package between all python projects
5
5
  Author: Smilinno
6
6
  Description-Content-Type: text/markdown
@@ -31,6 +31,8 @@ Dynamic: summary
31
31
  this is a shared kernel package
32
32
 
33
33
  # Change Log
34
+ ### Version 2.8.0
35
+ - Add chunker for tts
34
36
  ### Version 2.7.0
35
37
  - Add Redis and cache repository
36
38
  ### Version 2.6.4
@@ -17,6 +17,8 @@ sharedkernel.egg-info/SOURCES.txt
17
17
  sharedkernel.egg-info/dependency_links.txt
18
18
  sharedkernel.egg-info/requires.txt
19
19
  sharedkernel.egg-info/top_level.txt
20
+ sharedkernel/chunker/chunk_rule.py
21
+ sharedkernel/chunker/text_chunker.py
20
22
  sharedkernel/database/__init__.py
21
23
  sharedkernel/database/audit_model.py
22
24
  sharedkernel/database/distributed_cache.py
File without changes