livekit-plugins-nltk 0.4.dev0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/PKG-INFO +3 -3
  2. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit/plugins/nltk/__init__.py +1 -2
  3. livekit_plugins_nltk-0.5.0/livekit/plugins/nltk/log.py +3 -0
  4. livekit_plugins_nltk-0.5.0/livekit/plugins/nltk/sentence_tokenizer.py +74 -0
  5. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit/plugins/nltk/version.py +1 -1
  6. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit_plugins_nltk.egg-info/PKG-INFO +3 -3
  7. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit_plugins_nltk.egg-info/SOURCES.txt +1 -0
  8. livekit_plugins_nltk-0.5.0/livekit_plugins_nltk.egg-info/requires.txt +3 -0
  9. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/setup.py +2 -2
  10. livekit-plugins-nltk-0.4.dev0/livekit/plugins/nltk/sentence_tokenizer.py +0 -142
  11. livekit-plugins-nltk-0.4.dev0/livekit_plugins_nltk.egg-info/requires.txt +0 -3
  12. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/README.md +0 -0
  13. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit/plugins/nltk/py.typed +0 -0
  14. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit_plugins_nltk.egg-info/dependency_links.txt +0 -0
  15. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit_plugins_nltk.egg-info/top_level.txt +0 -0
  16. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/pyproject.toml +0 -0
  17. {livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/setup.cfg +0 -0
{livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-nltk
-Version: 0.4.dev0
+Version: 0.5.0
 Summary: Agent Framework plugin for NLTK-based text processing.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit~=0.9
+Requires-Dist: livekit~=0.11
 Requires-Dist: nltk<4,>=3
-Requires-Dist: livekit-agents~=0.5.dev0
+Requires-Dist: livekit-agents~=0.6.0
 
 # LiveKit Plugins NLTK
 
{livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit/plugins/nltk/__init__.py
@@ -13,12 +13,11 @@
 # limitations under the License.
 
 
-from .sentence_tokenizer import SentenceStream, SentenceTokenizer
+from .sentence_tokenizer import SentenceTokenizer
 from .version import __version__
 
 __all__ = [
     "SentenceTokenizer",
-    "SentenceStream",
     "__version__",
 ]
 
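Note (not part of the diff): SentenceStream is no longer exported from livekit.plugins.nltk in 0.5.0; SentenceTokenizer.stream() now returns a generic agents.tokenize stream rather than a plugin-specific class. A minimal caller-side sketch of the import change, assuming the 0.5.0 API shown later in this diff:

# 0.4.dev0 callers could do:
#   from livekit.plugins.nltk import SentenceStream, SentenceTokenizer
# 0.5.0 callers import only the tokenizer and obtain streams from it:
from livekit.plugins.nltk import SentenceTokenizer

tokenizer = SentenceTokenizer()
stream = tokenizer.stream()  # annotated as agents.tokenize.SentenceStream in 0.5.0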
 
livekit_plugins_nltk-0.5.0/livekit/plugins/nltk/log.py
@@ -0,0 +1,3 @@
+import logging
+
+logger = logging.getLogger("livekit.plugins.nltk")
livekit_plugins_nltk-0.5.0/livekit/plugins/nltk/sentence_tokenizer.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import dataclasses
+import functools
+from dataclasses import dataclass
+
+from livekit import agents
+
+import nltk  # type: ignore
+
+# nltk is using the punkt tokenizer
+# https://www.nltk.org/_modules/nltk/tokenize/punkt.html
+# this code is using a whitespace to concatenate small sentences together
+# (languages such as Chinese and Japanese are not yet supported)
+
+
+@dataclass
+class _TokenizerOptions:
+    language: str
+    min_sentence_len: int
+    stream_context_len: int
+
+
+class SentenceTokenizer(agents.tokenize.SentenceTokenizer):
+    def __init__(
+        self,
+        *,
+        language: str = "english",
+        min_sentence_len: int = 20,
+        stream_context_len: int = 10,
+    ) -> None:
+        super().__init__()
+        self._config = _TokenizerOptions(
+            language=language,
+            min_sentence_len=min_sentence_len,
+            stream_context_len=stream_context_len,
+        )
+
+    def _sanitize_options(self, language: str | None = None) -> _TokenizerOptions:
+        config = dataclasses.replace(self._config)
+        if language:
+            config.language = language
+        return config
+
+    def tokenize(self, *, text: str, language: str | None = None) -> list[str]:
+        config = self._sanitize_options(language=language)
+        sentences = nltk.tokenize.sent_tokenize(text, config.language)
+        new_sentences = []
+        buff = ""
+        for sentence in sentences:
+            buff += sentence + " "
+            if len(buff) - 1 >= config.min_sentence_len:
+                new_sentences.append(buff.rstrip())
+                buff = ""
+
+        if buff:
+            new_sentences.append(buff.rstrip())
+
+        return new_sentences
+
+    def stream(
+        self,
+        *,
+        language: str | None = None,
+    ) -> agents.tokenize.SentenceStream:
+        config = self._sanitize_options(language=language)
+        return agents.tokenize.BufferedTokenStream(
+            tokenizer=functools.partial(
+                nltk.tokenize.sent_tokenize,
+                language=config.language,
+            ),
+            min_token_len=self._config.min_sentence_len,
+            ctx_len=self._config.stream_context_len,
+        )
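A hypothetical usage sketch for the new module (not part of the release; it assumes the NLTK punkt model is available, e.g. after nltk.download("punkt")):

import nltk
from livekit.plugins.nltk import SentenceTokenizer

nltk.download("punkt")  # sent_tokenize requires the punkt sentence model

tokenizer = SentenceTokenizer(min_sentence_len=20)

# tokenize() buffers short sentences, joining them with spaces until the
# accumulated text reaches min_sentence_len, then emits each chunk.
for chunk in tokenizer.tokenize(text="Hi. How are you? This is a longer test sentence."):
    print(chunk)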
{livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit/plugins/nltk/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.4.dev0"
+__version__ = "0.5.0"
{livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit_plugins_nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-nltk
-Version: 0.4.dev0
+Version: 0.5.0
 Summary: Agent Framework plugin for NLTK-based text processing.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit~=0.9
+Requires-Dist: livekit~=0.11
 Requires-Dist: nltk<4,>=3
-Requires-Dist: livekit-agents~=0.5.dev0
+Requires-Dist: livekit-agents~=0.6.0
 
 # LiveKit Plugins NLTK
 
{livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/livekit_plugins_nltk.egg-info/SOURCES.txt
@@ -2,6 +2,7 @@ README.md
 pyproject.toml
 setup.py
 livekit/plugins/nltk/__init__.py
+livekit/plugins/nltk/log.py
 livekit/plugins/nltk/py.typed
 livekit/plugins/nltk/sentence_tokenizer.py
 livekit/plugins/nltk/version.py
livekit_plugins_nltk-0.5.0/livekit_plugins_nltk.egg-info/requires.txt
@@ -0,0 +1,3 @@
+livekit~=0.11
+nltk<4,>=3
+livekit-agents~=0.6.0
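The bumped dependency pins use PEP 440 "compatible release" (~=) specifiers. An illustrative check with the third-party packaging library (not part of this package) shows what the new bounds admit:

from packaging.specifiers import SpecifierSet

# livekit~=0.11 is equivalent to >=0.11, ==0.*  (any later 0.x release)
print(SpecifierSet("~=0.11").contains("0.12.0"))  # True
print(SpecifierSet("~=0.11").contains("1.0.0"))   # False

# livekit-agents~=0.6.0 is equivalent to >=0.6.0, ==0.6.*
print(SpecifierSet("~=0.6.0").contains("0.6.3"))  # True
print(SpecifierSet("~=0.6.0").contains("0.7.0"))  # False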
{livekit-plugins-nltk-0.4.dev0 → livekit_plugins_nltk-0.5.0}/setup.py
@@ -47,9 +47,9 @@ setuptools.setup(
     packages=setuptools.find_namespace_packages(include=["livekit.*"]),
     python_requires=">=3.9.0",
     install_requires=[
-        "livekit~=0.9",
+        "livekit~=0.11",
         "nltk >= 3, < 4",
-        "livekit-agents~=0.5.dev0",
+        "livekit-agents~=0.6.0",
     ],
     package_data={
         "livekit.plugins.nltk": ["py.typed"],
livekit-plugins-nltk-0.4.dev0/livekit/plugins/nltk/sentence_tokenizer.py
@@ -1,142 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import dataclasses
-import logging
-from dataclasses import dataclass
-from typing import List, Optional
-
-from livekit import agents
-
-import nltk  # type: ignore
-
-# nltk is using the punkt tokenizer
-# https://www.nltk.org/_modules/nltk/tokenize/punkt.html
-# this code is using a whitespace to concatenate small sentences together
-# (languages such as Chinese and Japanese are not yet supported)
-
-
-@dataclass
-class TokenizerOptions:
-    language: str
-    min_sentence_len: int
-    stream_context_len: int
-
-
-class SentenceTokenizer(agents.tokenize.SentenceTokenizer):
-    def __init__(
-        self,
-        language: str = "english",
-        min_sentence_len: int = 20,
-        stream_context_len: int = 10,
-    ) -> None:
-        super().__init__()
-        self._config = TokenizerOptions(
-            language=language,
-            min_sentence_len=min_sentence_len,
-            stream_context_len=stream_context_len,
-        )
-
-    def _sanitize_options(self, language: Optional[str] = None) -> TokenizerOptions:
-        config = dataclasses.replace(self._config)
-        if language:
-            config.language = language
-        return config
-
-    def tokenize(
-        self, *, text: str, language: Optional[str] = None
-    ) -> List[agents.tokenize.SegmentedSentence]:
-        config = self._sanitize_options(language=language)
-        sentences = nltk.tokenize.sent_tokenize(text, config.language)
-        new_sentences = []
-        buff = ""
-        for sentence in sentences:
-            buff += sentence + " "
-            if len(buff) - 1 >= config.min_sentence_len:
-                new_sentences.append(buff.rstrip())
-                buff = ""
-
-        if buff:
-            new_sentences.append(buff.rstrip())
-
-        return [agents.tokenize.SegmentedSentence(text=text) for text in new_sentences]
-
-    def stream(
-        self,
-        *,
-        language: Optional[str] = None,
-    ) -> agents.tokenize.SentenceStream:
-        config = self._sanitize_options(language=language)
-        return SentenceStream(
-            language=config.language,
-            min_sentence_len=config.min_sentence_len,
-            context_len=config.stream_context_len,
-        )
-
-
-class SentenceStream(agents.tokenize.SentenceStream):
-    def __init__(
-        self, *, language: str, min_sentence_len: int, context_len: int
-    ) -> None:
-        self._language = language
-        self._context_len = context_len
-        self._min_sentence_len = min_sentence_len
-        self._event_queue = asyncio.Queue[agents.tokenize.SegmentedSentence | None]()
-        self._closed = False
-
-        self._incomplete_sentences: List[str] = []  # <= min_sentence_len
-        self._buffer = ""
-
-    def push_text(self, text: str) -> None:
-        if self._closed:
-            logging.error("Cannot push text to closed stream")
-            return
-
-        for char in text:
-            self._buffer += char
-
-            if len(self._buffer) < self._context_len:
-                continue
-
-            sentences = nltk.tokenize.sent_tokenize(self._buffer, self._language)
-            if len(sentences) < 2:
-                continue
-
-            new_sentence = sentences[0]
-            self._incomplete_sentences.append(new_sentence)
-            s = " ".join(self._incomplete_sentences)
-
-            if len(s) >= self._min_sentence_len:
-                self._event_queue.put_nowait(agents.tokenize.SegmentedSentence(text=s))
-                self._incomplete_sentences = []
-
-            self._buffer = self._buffer[len(new_sentence) :].lstrip()
-
-    async def flush(self) -> None:
-        # try to segment the remaining data inside self._text_buffer
-        buff = " ".join(self._incomplete_sentences)
-        sentences = nltk.tokenize.sent_tokenize(self._buffer, self._language)
-        for sentence in sentences:
-            buff += " " + sentence
-            if len(buff) >= self._min_sentence_len:
-                await self._event_queue.put(
-                    agents.tokenize.SegmentedSentence(text=buff)
-                )
-                buff = ""
-
-        if buff:
-            await self._event_queue.put(agents.tokenize.SegmentedSentence(text=buff))
-
-    async def aclose(self) -> None:
-        self._closed = True
-        self._event_queue.put_nowait(None)
-
-    async def __anext__(self) -> agents.tokenize.SegmentedSentence:
-        event = await self._event_queue.get()
-        if event is None:
-            raise StopAsyncIteration
-
-        return event
-
-    def __aiter__(self) -> "SentenceStream":
-        return self
livekit-plugins-nltk-0.4.dev0/livekit_plugins_nltk.egg-info/requires.txt
@@ -1,3 +0,0 @@
-livekit~=0.9
-nltk<4,>=3
-livekit-agents~=0.5.dev0