livekit-plugins-nltk 0.5.0__py3-none-any.whl → 0.5.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,11 +13,12 @@
13
13
  # limitations under the License.
14
14
 
15
15
 
16
- from .sentence_tokenizer import SentenceTokenizer
16
+ from .sentence_tokenizer import SentenceStream, SentenceTokenizer
17
17
  from .version import __version__
18
18
 
19
19
  __all__ = [
20
20
  "SentenceTokenizer",
21
+ "SentenceStream",
21
22
  "__version__",
22
23
  ]
23
24
 
@@ -1,13 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import dataclasses
4
- import functools
5
5
  from dataclasses import dataclass
6
+ from typing import List, Optional
6
7
 
7
8
  from livekit import agents
8
9
 
9
10
  import nltk # type: ignore
10
11
 
12
+ from .log import logger
13
+
11
14
  # nltk is using the punkt tokenizer
12
15
  # https://www.nltk.org/_modules/nltk/tokenize/punkt.html
13
16
  # this code is using a whitespace to concatenate small sentences together
@@ -15,7 +18,7 @@ import nltk # type: ignore
15
18
 
16
19
 
17
20
  @dataclass
18
- class _TokenizerOptions:
21
+ class TokenizerOptions:
19
22
  language: str
20
23
  min_sentence_len: int
21
24
  stream_context_len: int
@@ -24,25 +27,26 @@ class _TokenizerOptions:
24
27
  class SentenceTokenizer(agents.tokenize.SentenceTokenizer):
25
28
  def __init__(
26
29
  self,
27
- *,
28
30
  language: str = "english",
29
31
  min_sentence_len: int = 20,
30
32
  stream_context_len: int = 10,
31
33
  ) -> None:
32
34
  super().__init__()
33
- self._config = _TokenizerOptions(
35
+ self._config = TokenizerOptions(
34
36
  language=language,
35
37
  min_sentence_len=min_sentence_len,
36
38
  stream_context_len=stream_context_len,
37
39
  )
38
40
 
39
- def _sanitize_options(self, language: str | None = None) -> _TokenizerOptions:
41
+ def _sanitize_options(self, language: Optional[str] = None) -> TokenizerOptions:
40
42
  config = dataclasses.replace(self._config)
41
43
  if language:
42
44
  config.language = language
43
45
  return config
44
46
 
45
- def tokenize(self, *, text: str, language: str | None = None) -> list[str]:
47
+ def tokenize(
48
+ self, *, text: str, language: Optional[str] = None
49
+ ) -> List[agents.tokenize.SegmentedSentence]:
46
50
  config = self._sanitize_options(language=language)
47
51
  sentences = nltk.tokenize.sent_tokenize(text, config.language)
48
52
  new_sentences = []
@@ -56,19 +60,84 @@ class SentenceTokenizer(agents.tokenize.SentenceTokenizer):
56
60
  if buff:
57
61
  new_sentences.append(buff.rstrip())
58
62
 
59
- return new_sentences
63
+ return [agents.tokenize.SegmentedSentence(text=text) for text in new_sentences]
60
64
 
61
65
  def stream(
62
66
  self,
63
67
  *,
64
- language: str | None = None,
68
+ language: Optional[str] = None,
65
69
  ) -> agents.tokenize.SentenceStream:
66
70
  config = self._sanitize_options(language=language)
67
- return agents.tokenize.BufferedTokenStream(
68
- tokenizer=functools.partial(
69
- nltk.tokenize.sent_tokenize,
70
- language=config.language,
71
- ),
72
- min_token_len=self._config.min_sentence_len,
73
- ctx_len=self._config.stream_context_len,
71
+ return SentenceStream(
72
+ language=config.language,
73
+ min_sentence_len=config.min_sentence_len,
74
+ context_len=config.stream_context_len,
74
75
  )
76
+
77
+
78
+ class SentenceStream(agents.tokenize.SentenceStream):
79
+ def __init__(
80
+ self, *, language: str, min_sentence_len: int, context_len: int
81
+ ) -> None:
82
+ self._language = language
83
+ self._context_len = context_len
84
+ self._min_sentence_len = min_sentence_len
85
+ self._event_queue = asyncio.Queue[agents.tokenize.SegmentedSentence | None]()
86
+ self._closed = False
87
+
88
+ self._incomplete_sentences: List[str] = [] # <= min_sentence_len
89
+ self._buffer = ""
90
+
91
+ def push_text(self, text: str) -> None:
92
+ if self._closed:
93
+ logger.error("Cannot push text to closed stream")
94
+ return
95
+
96
+ for char in text:
97
+ self._buffer += char
98
+
99
+ if len(self._buffer) < self._context_len:
100
+ continue
101
+
102
+ sentences = nltk.tokenize.sent_tokenize(self._buffer, self._language)
103
+ if len(sentences) < 2:
104
+ continue
105
+
106
+ new_sentence = sentences[0]
107
+ self._incomplete_sentences.append(new_sentence)
108
+ s = " ".join(self._incomplete_sentences)
109
+
110
+ if len(s) >= self._min_sentence_len:
111
+ self._event_queue.put_nowait(agents.tokenize.SegmentedSentence(text=s))
112
+ self._incomplete_sentences = []
113
+
114
+ self._buffer = self._buffer[len(new_sentence) :].lstrip()
115
+
116
+ async def flush(self) -> None:
117
+ # try to segment the remaining data inside self._buffer
118
+ buff = " ".join(self._incomplete_sentences)
119
+ sentences = nltk.tokenize.sent_tokenize(self._buffer, self._language)
120
+ for sentence in sentences:
121
+ buff += " " + sentence
122
+ if len(buff) >= self._min_sentence_len:
123
+ await self._event_queue.put(
124
+ agents.tokenize.SegmentedSentence(text=buff)
125
+ )
126
+ buff = ""
127
+
128
+ if buff:
129
+ await self._event_queue.put(agents.tokenize.SegmentedSentence(text=buff))
130
+
131
+ async def aclose(self) -> None:
132
+ self._closed = True
133
+ self._event_queue.put_nowait(None)
134
+
135
+ async def __anext__(self) -> agents.tokenize.SegmentedSentence:
136
+ event = await self._event_queue.get()
137
+ if event is None:
138
+ raise StopAsyncIteration
139
+
140
+ return event
141
+
142
+ def __aiter__(self) -> "SentenceStream":
143
+ return self
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.5.0"
15
+ __version__ = "0.5.dev0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-nltk
3
- Version: 0.5.0
3
+ Version: 0.5.dev0
4
4
  Summary: Agent Framework plugin for NLTK-based text processing.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,7 +20,7 @@ Requires-Python: >=3.9.0
20
20
  Description-Content-Type: text/markdown
21
21
  Requires-Dist: livekit ~=0.11
22
22
  Requires-Dist: nltk <4,>=3
23
- Requires-Dist: livekit-agents ~=0.6.0
23
+ Requires-Dist: livekit-agents ~=0.6.dev0
24
24
 
25
25
  # LiveKit Plugins NLTK
26
26
 
@@ -0,0 +1,9 @@
1
+ livekit/plugins/nltk/__init__.py,sha256=HGbaUwK-0cU-SbvbEl2WSQKwNrkHn8XuO-86Hiy0cy4,1134
2
+ livekit/plugins/nltk/log.py,sha256=12Ta0eunqKM3KyIlyzp2QFyO_Y-PBLAtyjEWDiuJdKk,67
3
+ livekit/plugins/nltk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ livekit/plugins/nltk/sentence_tokenizer.py,sha256=vhdWqMmwJSiqJsK9WibNHkWChiXyn13Yk4jOmfcLEM8,4570
5
+ livekit/plugins/nltk/version.py,sha256=h2gCxcJSMvCrVP7h14ON6HaghqLCkbl3--HZKEopR_8,603
6
+ livekit_plugins_nltk-0.5.dev0.dist-info/METADATA,sha256=75vfOXu6Xgba899ixTspRIE8Fi3sG0EfRoR2C9cNrG4,1188
7
+ livekit_plugins_nltk-0.5.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
8
+ livekit_plugins_nltk-0.5.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
9
+ livekit_plugins_nltk-0.5.dev0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- livekit/plugins/nltk/__init__.py,sha256=deM101uCRxPAD7m2xmPkxspW4ozp83jCJpFxoc9bT5U,1096
2
- livekit/plugins/nltk/log.py,sha256=12Ta0eunqKM3KyIlyzp2QFyO_Y-PBLAtyjEWDiuJdKk,67
3
- livekit/plugins/nltk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- livekit/plugins/nltk/sentence_tokenizer.py,sha256=Y2wXJwqqUxYMqSqPrAXn_OIeMQ1Q-18-Nvd-vog8Z40,2211
5
- livekit/plugins/nltk/version.py,sha256=pZ7bgeWLjw4VCWymU1ntHaHorKRusUkm56y6tZe5gmQ,600
6
- livekit_plugins_nltk-0.5.0.dist-info/METADATA,sha256=A7fC7nNjXotaiPw7r7_Tq-Qdw9k9EmsEBBJRhIeRs1g,1182
7
- livekit_plugins_nltk-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
8
- livekit_plugins_nltk-0.5.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
9
- livekit_plugins_nltk-0.5.0.dist-info/RECORD,,