livekit-plugins-turn-detector 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit_plugins_turn_detector-0.3.1/PKG-INFO +27 -0
- livekit_plugins_turn_detector-0.3.1/README.md +2 -0
- livekit_plugins_turn_detector-0.3.1/livekit/plugins/turn_detector/__init__.py +39 -0
- livekit_plugins_turn_detector-0.3.1/livekit/plugins/turn_detector/eou.py +158 -0
- livekit_plugins_turn_detector-0.3.1/livekit/plugins/turn_detector/log.py +3 -0
- livekit_plugins_turn_detector-0.3.1/livekit/plugins/turn_detector/version.py +15 -0
- livekit_plugins_turn_detector-0.3.1/livekit_plugins_turn_detector.egg-info/PKG-INFO +27 -0
- livekit_plugins_turn_detector-0.3.1/livekit_plugins_turn_detector.egg-info/SOURCES.txt +12 -0
- livekit_plugins_turn_detector-0.3.1/livekit_plugins_turn_detector.egg-info/dependency_links.txt +1 -0
- livekit_plugins_turn_detector-0.3.1/livekit_plugins_turn_detector.egg-info/requires.txt +3 -0
- livekit_plugins_turn_detector-0.3.1/livekit_plugins_turn_detector.egg-info/top_level.txt +1 -0
- livekit_plugins_turn_detector-0.3.1/pyproject.toml +3 -0
- livekit_plugins_turn_detector-0.3.1/setup.cfg +4 -0
- livekit_plugins_turn_detector-0.3.1/setup.py +59 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: livekit-plugins-turn-detector
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: End of utterance detection for LiveKit Agents
|
|
5
|
+
Home-page: https://github.com/livekit/agents
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://docs.livekit.io
|
|
8
|
+
Project-URL: Website, https://livekit.io/
|
|
9
|
+
Project-URL: Source, https://github.com/livekit/agents
|
|
10
|
+
Keywords: webrtc,realtime,audio,video,livekit
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
14
|
+
Classifier: Topic :: Multimedia :: Video
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
|
+
Requires-Python: >=3.9.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: livekit-agents>=0.11
|
|
23
|
+
Requires-Dist: transformers>=4.46
|
|
24
|
+
Requires-Dist: numpy>=1.26
|
|
25
|
+
|
|
26
|
+
# LiveKit Plugins Turn Detector
|
|
27
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from livekit.agents import Plugin
|
|
16
|
+
from livekit.agents.inference_runner import _InferenceRunner
|
|
17
|
+
|
|
18
|
+
from .eou import EOUModel, _EUORunner
|
|
19
|
+
from .log import logger
|
|
20
|
+
from .version import __version__
|
|
21
|
+
|
|
22
|
+
__all__ = ["EOUModel", "__version__"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class EOUPlugin(Plugin):
    """Plugin entry for the turn detector package.

    Construction forwards this module's name, version, package and logger to
    the ``Plugin`` base class; ``download_files`` pulls the model artefacts.
    """

    def __init__(self) -> None:
        super().__init__(__name__, __version__, __package__, logger)

    def download_files(self) -> None:
        """Load the model and tokenizer named by ``HG_MODEL`` via ``from_pretrained``.

        ``local_files_only`` is not passed here, unlike in the runner's
        ``initialize``, so this is the step that is allowed to fetch them.
        The return values are discarded.
        """
        # Imported lazily so importing the plugin does not import transformers.
        from transformers import AutoModelForCausalLM, AutoTokenizer

        from .eou import HG_MODEL

        # Model first, tokenizer second.
        for loader in (AutoModelForCausalLM, AutoTokenizer):
            loader.from_pretrained(HG_MODEL)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Import-time side effect: instantiate the plugin and register it with Plugin.
Plugin.register_plugin(EOUPlugin())
|
|
39
|
+
# Import-time side effect: register the runner class (not an instance).
# NOTE(review): "_EUORunner" looks like a transposition of "EOU"; the name is
# imported from .eou as spelled, so it must be renamed in both places at once.
_InferenceRunner.register_runner(_EUORunner)
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import string
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from livekit.agents import llm
|
|
9
|
+
from livekit.agents.inference_runner import _InferenceRunner
|
|
10
|
+
from livekit.agents.ipc.inference_executor import InferenceExecutor
|
|
11
|
+
from livekit.agents.job import get_current_job_context
|
|
12
|
+
|
|
13
|
+
from .log import logger
|
|
14
|
+
|
|
15
|
+
# Model identifier passed to from_pretrained for both the causal LM and its tokenizer.
HG_MODEL = "livekit/opt-125m-endpoint-detector-2"
|
|
16
|
+
# Characters deleted from message text during normalization: everything in
# string.punctuation except the apostrophe, which is therefore kept.
PUNCS = string.punctuation.replace("'", "")
|
|
17
|
+
# Number of most recent chat messages kept by EOUModel.predict_eou
# (applied as messages[-MAX_HISTORY:]) before they are sent for inference.
MAX_HISTORY = 4
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _softmax(logits: np.ndarray, axis: int | None = None) -> np.ndarray:
    """Return the numerically stable softmax of ``logits``.

    Args:
        logits: Array (or array-like) of unnormalised scores.
        axis: Axis along which to normalise. ``None`` (the default)
            normalises over every element of the array, which is what the
            one-argument form has always done.

    Returns:
        An array with the same shape as ``logits`` whose entries are
        non-negative and sum to 1 along ``axis`` (or overall when ``axis``
        is ``None``).
    """
    logits = np.asarray(logits)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large logits.
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp_logits = np.exp(shifted)
    return exp_logits / np.sum(exp_logits, axis=axis, keepdims=True)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class _EUORunner(_InferenceRunner):
    """Inference runner that scores whether the latest utterance has ended.

    ``initialize`` loads the causal LM and tokenizer named by ``HG_MODEL``
    from local files only. ``run`` receives a JSON payload containing a
    ``chat_ctx`` list and returns a JSON payload with the probability that
    the model assigns to ``<|im_end|>`` as the next token.
    """

    INFERENCE_METHOD = "lk_end_of_utterance"

    # Built once for the class: deletes every character listed in PUNCS.
    _PUNC_TABLE = str.maketrans("", "", PUNCS)

    def _normalize(self, text: str) -> str:
        """Delete the PUNCS characters, lowercase, and collapse whitespace runs."""
        return " ".join(text.translate(self._PUNC_TABLE).lower().split())

    def _format_chat_ctx(self, chat_ctx: list[dict]) -> str:
        """Render the chat messages as the prompt text fed to the model.

        Each message's ``content`` is normalised; messages whose content
        normalises to an empty string are dropped. The remaining messages go
        through the tokenizer's chat template, and the text is cut just before
        the last ``<|im_end|>`` so the model has to predict that token itself.

        Args:
            chat_ctx: List of message dicts, each with at least a ``content``
                key. The dicts are not modified.

        Returns:
            The templated conversation text, truncated before the final
            ``<|im_end|>`` when one is present.
        """
        new_chat_ctx = []
        for msg in chat_ctx:
            content = self._normalize(msg["content"])
            if not content:
                continue
            # Copy instead of assigning into ``msg`` so the caller's data is untouched.
            new_chat_ctx.append({**msg, "content": content})

        convo_text = self._tokenizer.apply_chat_template(
            new_chat_ctx,
            add_generation_prompt=False,
            add_special_tokens=False,
            tokenize=False,
        )

        # Remove the EOU token from the current utterance. rfind returns -1
        # when the marker is missing; slicing with -1 would silently drop the
        # last character, so the text is returned unchanged in that case.
        ix = convo_text.rfind("<|im_end|>")
        if ix == -1:
            return convo_text
        return convo_text[:ix]

    def initialize(self) -> None:
        """Load the model and tokenizer from local files.

        Raises:
            RuntimeError: If loading raises ``LocalEntryNotFoundError`` or
                ``OSError``; an error with download instructions is logged first.
        """
        from huggingface_hub import errors
        from transformers import AutoModelForCausalLM, AutoTokenizer

        try:
            self._model = AutoModelForCausalLM.from_pretrained(
                HG_MODEL, local_files_only=True
            )
            self._tokenizer = AutoTokenizer.from_pretrained(
                HG_MODEL, local_files_only=True
            )
        except (errors.LocalEntryNotFoundError, OSError):
            logger.error(
                (
                    f"Could not find model {HG_MODEL}. Make sure you have downloaded the model before running the agent. "
                    "Use `python3 your_agent.py download-files` to download the models."
                )
            )
            raise RuntimeError(
                f"livekit-plugins-turn-detector initialization failed. Could not find model {HG_MODEL}."
            ) from None

        # Id of the last token produced when encoding the end-of-utterance marker.
        self._eou_index = self._tokenizer.encode("<|im_end|>")[-1]

    def run(self, data: bytes) -> bytes | None:
        """Score one end-of-utterance request.

        Args:
            data: JSON-encoded object with a non-empty ``chat_ctx`` list.

        Returns:
            JSON-encoded ``{"eou_probability": <float>}``.

        Raises:
            ValueError: If ``chat_ctx`` is missing or empty.
        """
        # Imported lazily, like transformers in ``initialize``.
        import torch

        data_json = json.loads(data)
        chat_ctx = data_json.get("chat_ctx", None)

        if not chat_ctx:
            raise ValueError("chat_ctx is required on the inference input data")

        start_time = time.perf_counter()

        text = self._format_chat_ctx(chat_ctx)
        inputs = self._tokenizer(
            text,
            add_special_tokens=False,
            return_tensors="pt",
        )

        # Pure inference: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = self._model(**inputs)

        # Logits of the first batch element at the last sequence position.
        logits = outputs.logits[0, -1, :].detach().numpy()
        output_probs = _softmax(logits)
        # Convert once to a plain float so both the log record and the JSON
        # response carry a built-in type rather than a numpy scalar.
        eou_probability = float(output_probs[self._eou_index])

        end_time = time.perf_counter()

        logger.debug(
            "eou prediction",
            extra={
                "eou_probability": eou_probability,
                "input": text,
                "duration": round(end_time - start_time, 3),
            },
        )

        return json.dumps({"eou_probability": eou_probability}).encode()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class EOUModel:
    """Handle that requests end-of-utterance scores from an inference executor."""

    def __init__(self, inference_executor: InferenceExecutor | None = None) -> None:
        # With no (or a falsy) executor, use the one on the current job context.
        if not inference_executor:
            inference_executor = get_current_job_context().inference_executor
        self._executor = inference_executor

    async def predict_eou(self, chat_ctx: llm.ChatContext) -> float:
        """Return the end-of-utterance probability for ``chat_ctx``.

        Only ``user`` and ``assistant`` messages are considered. String
        content is used as is; for list content the first string entry is
        used. The last ``MAX_HISTORY`` collected messages are JSON-encoded
        and sent to the executor under ``_EUORunner.INFERENCE_METHOD``.
        """
        # NOTE(review): the listing this was taken from has no indentation;
        # "first string entry only" assumes the original ``break`` sat inside
        # the ``isinstance(cnt, str)`` branch — confirm against upstream.
        history = []
        for msg in chat_ctx.messages:
            if msg.role not in ("user", "assistant"):
                continue

            body = msg.content
            if isinstance(body, str):
                text = body
            elif isinstance(body, list):
                text = next((part for part in body if isinstance(part, str)), None)
            else:
                text = None

            if text is not None:
                history.append({"role": msg.role, "content": text})

        payload = json.dumps({"chat_ctx": history[-MAX_HISTORY:]}).encode()
        result = await self._executor.do_inference(
            _EUORunner.INFERENCE_METHOD, payload
        )

        assert (
            result is not None
        ), "end_of_utterance prediction should always returns a result"

        return json.loads(result.decode())["eou_probability"]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# Package version string; setup.py executes this file to read it.
__version__ = "0.3.1"
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: livekit-plugins-turn-detector
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: End of utterance detection for LiveKit Agents
|
|
5
|
+
Home-page: https://github.com/livekit/agents
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://docs.livekit.io
|
|
8
|
+
Project-URL: Website, https://livekit.io/
|
|
9
|
+
Project-URL: Source, https://github.com/livekit/agents
|
|
10
|
+
Keywords: webrtc,realtime,audio,video,livekit
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
14
|
+
Classifier: Topic :: Multimedia :: Video
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
|
+
Requires-Python: >=3.9.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: livekit-agents>=0.11
|
|
23
|
+
Requires-Dist: transformers>=4.46
|
|
24
|
+
Requires-Dist: numpy>=1.26
|
|
25
|
+
|
|
26
|
+
# LiveKit Plugins Turn Detector
|
|
27
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
setup.py
|
|
4
|
+
livekit/plugins/turn_detector/__init__.py
|
|
5
|
+
livekit/plugins/turn_detector/eou.py
|
|
6
|
+
livekit/plugins/turn_detector/log.py
|
|
7
|
+
livekit/plugins/turn_detector/version.py
|
|
8
|
+
livekit_plugins_turn_detector.egg-info/PKG-INFO
|
|
9
|
+
livekit_plugins_turn_detector.egg-info/SOURCES.txt
|
|
10
|
+
livekit_plugins_turn_detector.egg-info/dependency_links.txt
|
|
11
|
+
livekit_plugins_turn_detector.egg-info/requires.txt
|
|
12
|
+
livekit_plugins_turn_detector.egg-info/top_level.txt
|
livekit_plugins_turn_detector-0.3.1/livekit_plugins_turn_detector.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
livekit
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import pathlib
|
|
17
|
+
|
|
18
|
+
import setuptools
|
|
19
|
+
import setuptools.command.build_py
|
|
20
|
+
|
|
21
|
+
# Directory containing this setup.py; the paths below are resolved against it.
here = pathlib.Path(__file__).parent.resolve()

# Execute version.py in its own namespace so ``about["__version__"]`` can be
# read without importing the package. The file is read explicitly as UTF-8,
# like README.md below, instead of relying on the locale's default encoding.
about = {}
version_file = here / "livekit" / "plugins" / "turn_detector" / "version.py"
exec(version_file.read_text(encoding="utf-8"), about)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Distribution metadata. The version comes from version.py (via ``about``) and
# the long description from README.md next to this file.
setuptools.setup(
    name="livekit-plugins-turn-detector",
    version=about["__version__"],
    description="End of utterance detection for LiveKit Agents",
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/livekit/agents",
    cmdclass={},
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Video",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3 :: Only",
    ],
    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
    license="Apache-2.0",
    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
    python_requires=">=3.9.0",
    install_requires=["livekit-agents>=0.11", "transformers>=4.46", "numpy>=1.26"],
    # The key must name a package that this distribution actually ships. The
    # code lives in livekit/plugins/turn_detector, so the previous key
    # "livekit.plugins.eou" matched nothing and the entry had no effect.
    package_data={"livekit.plugins.turn_detector": ["py.typed"]},
    project_urls={
        "Documentation": "https://docs.livekit.io",
        "Website": "https://livekit.io/",
        "Source": "https://github.com/livekit/agents",
    },
)
|