livekit-plugins-nltk 0.4.dev0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- livekit/plugins/nltk/__init__.py +1 -2
- livekit/plugins/nltk/log.py +3 -0
- livekit/plugins/nltk/sentence_tokenizer.py +15 -83
- livekit/plugins/nltk/version.py +1 -1
- {livekit_plugins_nltk-0.4.dev0.dist-info → livekit_plugins_nltk-0.5.0.dist-info}/METADATA +3 -3
- livekit_plugins_nltk-0.5.0.dist-info/RECORD +9 -0
- livekit_plugins_nltk-0.4.dev0.dist-info/RECORD +0 -8
- {livekit_plugins_nltk-0.4.dev0.dist-info → livekit_plugins_nltk-0.5.0.dist-info}/WHEEL +0 -0
- {livekit_plugins_nltk-0.4.dev0.dist-info → livekit_plugins_nltk-0.5.0.dist-info}/top_level.txt +0 -0
livekit/plugins/nltk/__init__.py
CHANGED
@@ -13,12 +13,11 @@
 # limitations under the License.
 
 
-from .sentence_tokenizer import SentenceStream, SentenceTokenizer
+from .sentence_tokenizer import SentenceTokenizer
 from .version import __version__
 
 __all__ = [
     "SentenceTokenizer",
-    "SentenceStream",
    "__version__",
 ]
 
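livekit/plugins/nltk/log.py
ADDED
@@ -0,0 +1,3 @@
+import logging
+
+logger = logging.getLogger("livekit.plugins.nltk")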
livekit/plugins/nltk/sentence_tokenizer.py
CHANGED
@@ -1,10 +1,8 @@
 from __future__ import annotations
 
-import asyncio
 import dataclasses
-import logging
+import functools
 from dataclasses import dataclass
-from typing import List, Optional
 
 from livekit import agents
 
@@ -17,7 +15,7 @@ import nltk  # type: ignore
 
 
 @dataclass
-class TokenizerOptions:
+class _TokenizerOptions:
     language: str
     min_sentence_len: int
     stream_context_len: int
@@ -26,26 +24,25 @@ class TokenizerOptions:
 class SentenceTokenizer(agents.tokenize.SentenceTokenizer):
     def __init__(
         self,
+        *,
         language: str = "english",
         min_sentence_len: int = 20,
         stream_context_len: int = 10,
     ) -> None:
         super().__init__()
-        self._config = TokenizerOptions(
+        self._config = _TokenizerOptions(
             language=language,
             min_sentence_len=min_sentence_len,
             stream_context_len=stream_context_len,
         )
 
-    def _sanitize_options(self, language: Optional[str] = None) -> TokenizerOptions:
+    def _sanitize_options(self, language: str | None = None) -> _TokenizerOptions:
         config = dataclasses.replace(self._config)
         if language:
             config.language = language
         return config
 
-    def tokenize(
-        self, *, text: str, language: Optional[str] = None
-    ) -> List[agents.tokenize.SegmentedSentence]:
+    def tokenize(self, *, text: str, language: str | None = None) -> list[str]:
         config = self._sanitize_options(language=language)
         sentences = nltk.tokenize.sent_tokenize(text, config.language)
         new_sentences = []
@@ -59,84 +56,19 @@ class SentenceTokenizer(agents.tokenize.SentenceTokenizer):
         if buff:
             new_sentences.append(buff.rstrip())
 
-        return [agents.tokenize.SegmentedSentence(text=text) for text in new_sentences]
+        return new_sentences
 
     def stream(
         self,
         *,
-        language: Optional[str] = None,
+        language: str | None = None,
     ) -> agents.tokenize.SentenceStream:
         config = self._sanitize_options(language=language)
-        return SentenceStream(
-            language=config.language,
-            min_sentence_len=config.min_sentence_len,
-            context_len=config.stream_context_len,
+        return agents.tokenize.BufferedTokenStream(
+            tokenizer=functools.partial(
+                nltk.tokenize.sent_tokenize,
+                language=config.language,
+            ),
+            min_token_len=self._config.min_sentence_len,
+            ctx_len=self._config.stream_context_len,
         )
-
-
-class SentenceStream(agents.tokenize.SentenceStream):
-    def __init__(
-        self, *, language: str, min_sentence_len: int, context_len: int
-    ) -> None:
-        self._language = language
-        self._context_len = context_len
-        self._min_sentence_len = min_sentence_len
-        self._event_queue = asyncio.Queue[agents.tokenize.SegmentedSentence | None]()
-        self._closed = False
-
-        self._incomplete_sentences: List[str] = []  # <= min_sentence_len
-        self._buffer = ""
-
-    def push_text(self, text: str) -> None:
-        if self._closed:
-            logging.error("Cannot push text to closed stream")
-            return
-
-        for char in text:
-            self._buffer += char
-
-            if len(self._buffer) < self._context_len:
-                continue
-
-            sentences = nltk.tokenize.sent_tokenize(self._buffer, self._language)
-            if len(sentences) < 2:
-                continue
-
-            new_sentence = sentences[0]
-            self._incomplete_sentences.append(new_sentence)
-            s = " ".join(self._incomplete_sentences)
-
-            if len(s) >= self._min_sentence_len:
-                self._event_queue.put_nowait(agents.tokenize.SegmentedSentence(text=s))
-                self._incomplete_sentences = []
-
-            self._buffer = self._buffer[len(new_sentence) :].lstrip()
-
-    async def flush(self) -> None:
-        # try to segment the remaining data inside self._text_buffer
-        buff = " ".join(self._incomplete_sentences)
-        sentences = nltk.tokenize.sent_tokenize(self._buffer, self._language)
-        for sentence in sentences:
-            buff += " " + sentence
-            if len(buff) >= self._min_sentence_len:
-                await self._event_queue.put(
-                    agents.tokenize.SegmentedSentence(text=buff)
-                )
-                buff = ""
-
-        if buff:
-            await self._event_queue.put(agents.tokenize.SegmentedSentence(text=buff))
-
-    async def aclose(self) -> None:
-        self._closed = True
-        self._event_queue.put_nowait(None)
-
-    async def __anext__(self) -> agents.tokenize.SegmentedSentence:
-        event = await self._event_queue.get()
-        if event is None:
-            raise StopAsyncIteration
-
-        return event
-
-    def __aiter__(self) -> "SentenceStream":
-        return self
livekit/plugins/nltk/version.py
CHANGED
@@ -15 +15 @@
-__version__ = "0.4.dev0"
+__version__ = "0.5.0"
{livekit_plugins_nltk-0.4.dev0.dist-info → livekit_plugins_nltk-0.5.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-nltk
-Version: 0.4.dev0
+Version: 0.5.0
 Summary: Agent Framework plugin for NLTK-based text processing.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit ~=0.
+Requires-Dist: livekit ~=0.11
 Requires-Dist: nltk <4,>=3
-Requires-Dist: livekit-agents ~=0.
+Requires-Dist: livekit-agents ~=0.6.0
 
 # LiveKit Plugins NLTK
 
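Both new pins use PEP 440 compatible-release specifiers: ~=0.11 means >=0.11, ==0.* and ~=0.6.0 means >=0.6.0, ==0.6.*. A quick way to sanity-check such a pin with the packaging library (illustrative only, not a dependency of this package):

from packaging.specifiers import SpecifierSet

agents_spec = SpecifierSet("~=0.6.0")
print("0.6.3" in agents_spec)  # True: patch releases satisfy the pin
print("0.7.0" in agents_spec)  # False: minor version bumps do not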
livekit_plugins_nltk-0.5.0.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+livekit/plugins/nltk/__init__.py,sha256=deM101uCRxPAD7m2xmPkxspW4ozp83jCJpFxoc9bT5U,1096
+livekit/plugins/nltk/log.py,sha256=12Ta0eunqKM3KyIlyzp2QFyO_Y-PBLAtyjEWDiuJdKk,67
+livekit/plugins/nltk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/nltk/sentence_tokenizer.py,sha256=Y2wXJwqqUxYMqSqPrAXn_OIeMQ1Q-18-Nvd-vog8Z40,2211
+livekit/plugins/nltk/version.py,sha256=pZ7bgeWLjw4VCWymU1ntHaHorKRusUkm56y6tZe5gmQ,600
+livekit_plugins_nltk-0.5.0.dist-info/METADATA,sha256=A7fC7nNjXotaiPw7r7_Tq-Qdw9k9EmsEBBJRhIeRs1g,1182
+livekit_plugins_nltk-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+livekit_plugins_nltk-0.5.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_nltk-0.5.0.dist-info/RECORD,,
livekit_plugins_nltk-0.4.dev0.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-livekit/plugins/nltk/__init__.py,sha256=HGbaUwK-0cU-SbvbEl2WSQKwNrkHn8XuO-86Hiy0cy4,1134
-livekit/plugins/nltk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/nltk/sentence_tokenizer.py,sha256=1Oy0pmvCiD_vHepi-Vp6lEKEOFS1FdHjO9bvBX8dPCk,4561
-livekit/plugins/nltk/version.py,sha256=OwSbVTqWUJKy9w2Jbh1MIrp5cHPvEYsLXDhRGwdZKso,603
-livekit_plugins_nltk-0.4.dev0.dist-info/METADATA,sha256=bamt5z3nOv55hZwNAeMgkG1HYNazCItEEC-zr7XDfqE,1187
-livekit_plugins_nltk-0.4.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-livekit_plugins_nltk-0.4.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_nltk-0.4.dev0.dist-info/RECORD,,
{livekit_plugins_nltk-0.4.dev0.dist-info → livekit_plugins_nltk-0.5.0.dist-info}/WHEEL
RENAMED
File without changes
{livekit_plugins_nltk-0.4.dev0.dist-info → livekit_plugins_nltk-0.5.0.dist-info}/top_level.txt
RENAMED
File without changes