backchannel-classifier 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backchannel_classifier-0.4.0/LICENSE +21 -0
- backchannel_classifier-0.4.0/MANIFEST.in +2 -0
- backchannel_classifier-0.4.0/PKG-INFO +142 -0
- backchannel_classifier-0.4.0/README.md +115 -0
- backchannel_classifier-0.4.0/backchannel_classifier/__init__.py +177 -0
- backchannel_classifier-0.4.0/backchannel_classifier/backchannel_model.pkl +0 -0
- backchannel_classifier-0.4.0/backchannel_classifier/backchannel_model_ja.pkl +0 -0
- backchannel_classifier-0.4.0/backchannel_classifier/jp.py +249 -0
- backchannel_classifier-0.4.0/backchannel_classifier.egg-info/PKG-INFO +142 -0
- backchannel_classifier-0.4.0/backchannel_classifier.egg-info/SOURCES.txt +16 -0
- backchannel_classifier-0.4.0/backchannel_classifier.egg-info/dependency_links.txt +1 -0
- backchannel_classifier-0.4.0/backchannel_classifier.egg-info/requires.txt +2 -0
- backchannel_classifier-0.4.0/backchannel_classifier.egg-info/top_level.txt +1 -0
- backchannel_classifier-0.4.0/backchannel_model.pkl +0 -0
- backchannel_classifier-0.4.0/pyproject.toml +42 -0
- backchannel_classifier-0.4.0/setup.cfg +4 -0
- backchannel_classifier-0.4.0/tests/test_classifier.py +192 -0
- backchannel_classifier-0.4.0/tests/test_classifier_ja.py +204 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 100x.fi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: backchannel-classifier
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: backchannel classifier - detect backchannels vs real responses in thai and japanese asr output
|
|
5
|
+
Author-email: "100x.fi" <kiri@100x.fi>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/100x-fi/backchannel-classifier
|
|
8
|
+
Project-URL: Repository, https://github.com/100x-fi/backchannel-classifier
|
|
9
|
+
Keywords: thai,japanese,nlp,backchannel,aizuchi,voice,asr,classifier
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: scikit-learn>=1.0
|
|
25
|
+
Requires-Dist: numpy>=1.20
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# backchannel classifier
|
|
29
|
+
|
|
30
|
+
detects backchannel responses vs real user input for voice ai systems. supports **thai** and **japanese** (aizuchi).
|
|
31
|
+
|
|
32
|
+
## install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install backchannel-classifier
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from backchannel_classifier import is_backchannel
|
|
42
|
+
|
|
43
|
+
# thai (default)
|
|
44
|
+
is_backchannel("ครับ") # (True, 0.91)
|
|
45
|
+
is_backchannel("ไม่ครับ") # (False, 0.01)
|
|
46
|
+
is_backchannel("ใช่ แต่ว่า") # (False, 0.01)
|
|
47
|
+
|
|
48
|
+
# japanese
|
|
49
|
+
is_backchannel("はい", lang="ja") # (True, 0.99)
|
|
50
|
+
is_backchannel("そうですね", lang="ja") # (True, 0.99)
|
|
51
|
+
is_backchannel("予約したいです", lang="ja") # (False, 0.0001)
|
|
52
|
+
|
|
53
|
+
# direct import
|
|
54
|
+
from backchannel_classifier.jp import is_backchannel_ja
|
|
55
|
+
is_backchannel_ja("なるほど") # (True, 0.99)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
returns `(is_backchannel: bool, confidence: float)`.
|
|
59
|
+
|
|
60
|
+
## why
|
|
61
|
+
|
|
62
|
+
voice bots using asr → llm → tts pipelines need to distinguish between backchannels (acknowledgment sounds that should be ignored) and real responses that need processing. simple exact matching fails on asr variants and misses edge cases.
|
|
63
|
+
|
|
64
|
+
## approach
|
|
65
|
+
|
|
66
|
+
gradient boosting classifier with handcrafted language-specific features. key idea: strip known backchannel components from the text, measure what's left (`remaining_ratio`). if nothing remains, it's a backchannel.
|
|
67
|
+
|
|
68
|
+
### thai (26 features)
|
|
69
|
+
|
|
70
|
+
| feature | importance |
|
|
71
|
+
|---|---|
|
|
72
|
+
| remaining_ratio | 0.9098 |
|
|
73
|
+
| has_request | 0.0406 |
|
|
74
|
+
| has_negation | 0.0274 |
|
|
75
|
+
| particle_ratio | 0.0108 |
|
|
76
|
+
|
|
77
|
+
- polite particle detection (ครับ/ค่ะ/จ้ะ variants)
|
|
78
|
+
- backchannel sound patterns (อืม/อ๋อ/เออ with tone variants)
|
|
79
|
+
- question/negation/request/continuation markers
|
|
80
|
+
- handles asr misspellings (ค่า→ค่ะ, คับ→ครับ, อื้ม→อืม)
|
|
81
|
+
|
|
82
|
+
### japanese (27 features)
|
|
83
|
+
|
|
84
|
+
| feature | importance |
|
|
85
|
+
|---|---|
|
|
86
|
+
| remaining_ratio | 0.7765 |
|
|
87
|
+
| remaining_len | 0.0484 |
|
|
88
|
+
| katakana | 0.0347 |
|
|
89
|
+
| word_count | 0.0325 |
|
|
90
|
+
| kanji_ratio | 0.0206 |
|
|
91
|
+
|
|
92
|
+
- core aizuchi (はい/ええ/うん/そう)
|
|
93
|
+
- agreement, understanding, surprise, filler, reaction markers
|
|
94
|
+
- question/continuation/request/negation/verb negative indicators
|
|
95
|
+
- handles asr elongation variants (はーーい, えーーー)
|
|
96
|
+
|
|
97
|
+
## results
|
|
98
|
+
|
|
99
|
+
### thai
|
|
100
|
+
- **99.49% f1** (5-fold cv)
|
|
101
|
+
- test suite: **94/94** (100%)
|
|
102
|
+
|
|
103
|
+
### japanese
|
|
104
|
+
- **98.37% f1** (5-fold cv)
|
|
105
|
+
- test suite: **119/119** (100%)
|
|
106
|
+
|
|
107
|
+
## test coverage
|
|
108
|
+
|
|
109
|
+
### thai (94 cases)
|
|
110
|
+
|
|
111
|
+
**backchannels (49):** ครับ, ค่ะ, อืม, ใช่, อ๋อ, เหรอ, ฮัลโหล, asr variants...
|
|
112
|
+
**real responses (45):** สวัสดีครับ, ไม่ครับ, ราคาเท่าไหร่ครับ, edge cases (ใช่ แต่ว่า, ครับ แล้วก็)...
|
|
113
|
+
|
|
114
|
+
### japanese (119 cases)
|
|
115
|
+
|
|
116
|
+
**aizuchi (63):** はい, うん, そうですね, なるほど, へー, まじで, えーと, すごい, 承知しました, compounds...
|
|
117
|
+
**real responses (56):** ありがとうございます, いくらですか, 予約したいです, edge cases (はい、質問があります, そうですね、でも...)...
|
|
118
|
+
|
|
119
|
+
## testing
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
python3 -m pytest tests/ -v
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## files
|
|
126
|
+
|
|
127
|
+
- `backchannel_classifier/__init__.py` - thai classifier + unified api
|
|
128
|
+
- `backchannel_classifier/jp.py` - japanese classifier
|
|
129
|
+
- `train.py` - thai training script
|
|
130
|
+
- `train_ja.py` - japanese training script
|
|
131
|
+
- `tests/test_classifier.py` - thai test suite (94 cases)
|
|
132
|
+
- `tests/test_classifier_ja.py` - japanese test suite (119 cases)
|
|
133
|
+
|
|
134
|
+
## requirements
|
|
135
|
+
|
|
136
|
+
- python 3.8+
|
|
137
|
+
- scikit-learn
|
|
138
|
+
- numpy
|
|
139
|
+
|
|
140
|
+
## memory
|
|
141
|
+
|
|
142
|
+
~3.7 MB per language model, lazy-loaded. if you only use thai, japanese model is never loaded (zero overhead).
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# backchannel classifier
|
|
2
|
+
|
|
3
|
+
detects backchannel responses vs real user input for voice ai systems. supports **thai** and **japanese** (aizuchi).
|
|
4
|
+
|
|
5
|
+
## install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install backchannel-classifier
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from backchannel_classifier import is_backchannel
|
|
15
|
+
|
|
16
|
+
# thai (default)
|
|
17
|
+
is_backchannel("ครับ") # (True, 0.91)
|
|
18
|
+
is_backchannel("ไม่ครับ") # (False, 0.01)
|
|
19
|
+
is_backchannel("ใช่ แต่ว่า") # (False, 0.01)
|
|
20
|
+
|
|
21
|
+
# japanese
|
|
22
|
+
is_backchannel("はい", lang="ja") # (True, 0.99)
|
|
23
|
+
is_backchannel("そうですね", lang="ja") # (True, 0.99)
|
|
24
|
+
is_backchannel("予約したいです", lang="ja") # (False, 0.0001)
|
|
25
|
+
|
|
26
|
+
# direct import
|
|
27
|
+
from backchannel_classifier.jp import is_backchannel_ja
|
|
28
|
+
is_backchannel_ja("なるほど") # (True, 0.99)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
returns `(is_backchannel: bool, confidence: float)`.
|
|
32
|
+
|
|
33
|
+
## why
|
|
34
|
+
|
|
35
|
+
voice bots using asr → llm → tts pipelines need to distinguish between backchannels (acknowledgment sounds that should be ignored) and real responses that need processing. simple exact matching fails on asr variants and misses edge cases.
|
|
36
|
+
|
|
37
|
+
## approach
|
|
38
|
+
|
|
39
|
+
gradient boosting classifier with handcrafted language-specific features. key idea: strip known backchannel components from the text, measure what's left (`remaining_ratio`). if nothing remains, it's a backchannel.
|
|
40
|
+
|
|
41
|
+
### thai (26 features)
|
|
42
|
+
|
|
43
|
+
| feature | importance |
|
|
44
|
+
|---|---|
|
|
45
|
+
| remaining_ratio | 0.9098 |
|
|
46
|
+
| has_request | 0.0406 |
|
|
47
|
+
| has_negation | 0.0274 |
|
|
48
|
+
| particle_ratio | 0.0108 |
|
|
49
|
+
|
|
50
|
+
- polite particle detection (ครับ/ค่ะ/จ้ะ variants)
|
|
51
|
+
- backchannel sound patterns (อืม/อ๋อ/เออ with tone variants)
|
|
52
|
+
- question/negation/request/continuation markers
|
|
53
|
+
- handles asr misspellings (ค่า→ค่ะ, คับ→ครับ, อื้ม→อืม)
|
|
54
|
+
|
|
55
|
+
### japanese (27 features)
|
|
56
|
+
|
|
57
|
+
| feature | importance |
|
|
58
|
+
|---|---|
|
|
59
|
+
| remaining_ratio | 0.7765 |
|
|
60
|
+
| remaining_len | 0.0484 |
|
|
61
|
+
| katakana | 0.0347 |
|
|
62
|
+
| word_count | 0.0325 |
|
|
63
|
+
| kanji_ratio | 0.0206 |
|
|
64
|
+
|
|
65
|
+
- core aizuchi (はい/ええ/うん/そう)
|
|
66
|
+
- agreement, understanding, surprise, filler, reaction markers
|
|
67
|
+
- question/continuation/request/negation/verb negative indicators
|
|
68
|
+
- handles asr elongation variants (はーーい, えーーー)
|
|
69
|
+
|
|
70
|
+
## results
|
|
71
|
+
|
|
72
|
+
### thai
|
|
73
|
+
- **99.49% f1** (5-fold cv)
|
|
74
|
+
- test suite: **94/94** (100%)
|
|
75
|
+
|
|
76
|
+
### japanese
|
|
77
|
+
- **98.37% f1** (5-fold cv)
|
|
78
|
+
- test suite: **119/119** (100%)
|
|
79
|
+
|
|
80
|
+
## test coverage
|
|
81
|
+
|
|
82
|
+
### thai (94 cases)
|
|
83
|
+
|
|
84
|
+
**backchannels (49):** ครับ, ค่ะ, อืม, ใช่, อ๋อ, เหรอ, ฮัลโหล, asr variants...
|
|
85
|
+
**real responses (45):** สวัสดีครับ, ไม่ครับ, ราคาเท่าไหร่ครับ, edge cases (ใช่ แต่ว่า, ครับ แล้วก็)...
|
|
86
|
+
|
|
87
|
+
### japanese (119 cases)
|
|
88
|
+
|
|
89
|
+
**aizuchi (63):** はい, うん, そうですね, なるほど, へー, まじで, えーと, すごい, 承知しました, compounds...
|
|
90
|
+
**real responses (56):** ありがとうございます, いくらですか, 予約したいです, edge cases (はい、質問があります, そうですね、でも...)...
|
|
91
|
+
|
|
92
|
+
## testing
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
python3 -m pytest tests/ -v
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## files
|
|
99
|
+
|
|
100
|
+
- `backchannel_classifier/__init__.py` - thai classifier + unified api
|
|
101
|
+
- `backchannel_classifier/jp.py` - japanese classifier
|
|
102
|
+
- `train.py` - thai training script
|
|
103
|
+
- `train_ja.py` - japanese training script
|
|
104
|
+
- `tests/test_classifier.py` - thai test suite (94 cases)
|
|
105
|
+
- `tests/test_classifier_ja.py` - japanese test suite (119 cases)
|
|
106
|
+
|
|
107
|
+
## requirements
|
|
108
|
+
|
|
109
|
+
- python 3.8+
|
|
110
|
+
- scikit-learn
|
|
111
|
+
- numpy
|
|
112
|
+
|
|
113
|
+
## memory
|
|
114
|
+
|
|
115
|
+
~3.7 MB per language model, lazy-loaded. if you only use thai, japanese model is never loaded (zero overhead).
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Backchannel Classifier (Thai + Japanese)
|
|
3
|
+
Usage:
|
|
4
|
+
from backchannel_classifier import is_backchannel
|
|
5
|
+
is_backchannel("ครับ") # Thai (default)
|
|
6
|
+
is_backchannel("はい", lang="ja") # Japanese
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import pickle
|
|
11
|
+
import numpy as np
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
_MODEL = None
|
|
15
|
+
_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def extract_features(text):
    """Build the 26-dimensional feature vector for one Thai utterance.

    Features fall into four groups: raw size counts, binary markers for
    known backchannel words/particles, binary markers for "real response"
    signals (questions, negation, requests, continuations), and the
    residue measurements (`remaining_len` / `remaining_ratio`) left after
    stripping every known backchannel component from the text.

    Args:
        text: Raw ASR output (Thai).

    Returns:
        numpy.ndarray of 26 numeric features, in the fixed order the
        trained model expects.
    """
    text = text.strip()

    def flag(pattern):
        # 1 if the regex matches anywhere in the utterance, else 0.
        return 1 if re.search(pattern, text) else 0

    # --- size counts ---
    char_len = len(text)
    thai_chars = len(re.findall(r'[\u0E00-\u0E7F]', text))
    word_count = len(text.split())

    # --- polite particles (incl. common ASR misspellings) ---
    has_krab = flag(r'ครับ|คับ')
    has_ka = flag(r'ค่ะ|คะ|ค่า')
    has_ja = flag(r'จ้ะ|จ้า|จ๊ะ')

    # --- backchannel sounds / acknowledgment words ---
    has_hmm = flag(r'อืม|อือ|อื้อ|อื้ม|อึม|อุ้ม|เอิ่ม')
    has_oh = flag(r'อ๋อ|เออ|เอ่อ|อ่า|อ้า|อ๊า+')
    has_aha = flag(r'อ่าฮะ|อาฮะ|อาหะ|อ้าฮะ')
    has_hello = int('ฮัลโหล' in text)
    has_chai = flag(r'ใช่|ช่าย')
    has_jing = int('จริง' in text)
    has_thuk = int('ถูก' in text)
    has_ok = flag(r'โอเค|เค$|เคร$')
    has_naenon = int('แน่นอน' in text)
    has_wama = int('ว่ามา' in text)
    has_na = flag(r'นะ')
    has_ha = flag(r'ฮะ|ฮ่ะ')

    # --- "real response" signals ---
    has_question = flag(r'ไหม|อะไร|ที่ไหน|เมื่อไหร่|ยังไง|ทำไม|กี่|เท่าไหร่')
    has_negation = flag(r'ไม่|ยัง(?!ไง)')
    has_request = flag(r'ขอ|ช่วย|อยาก|ต้องการ')
    has_continuation = flag(r'แต่|แล้ว|แล้วก็|งั้น(?!เหรอ)')
    has_repeat = int('ๆ' in text)

    particle_ratio = (has_krab + has_ka + has_ja) / max(word_count, 1)

    # Strip every known backchannel component; whatever survives is
    # "real content". NOTE: the removal order is significant (longer
    # variants listed before their substrings) and must stay fixed —
    # the model was trained on exactly this sequence.
    leftover = text
    for token in ('ครับ', 'คับ', 'ค่ะ', 'คะ', 'ค่า', 'จ้ะ', 'จ้า', 'ผม',
                  'อืม', 'อือ', 'อื้อ', 'อื้ม', 'อึม', 'เอิ่ม',
                  'เออ', 'เอ่อ', 'อ่า', 'อ้า', 'อ่าฮะ', 'อาฮะ', 'อ้าฮะ',
                  'อ๋อ', 'ใช่', 'ช่าย', 'จริง', 'ด้วย', 'ถูก', 'โอเค', 'เค', 'เคร',
                  'แน่นอน', 'เหรอ', 'หรอ', 'งั้น', 'ได้', 'อ่ะ', 'เอ๊ะ', 'ว่ามา',
                  'อาหะ', 'อือหึ', 'อือฮึ', 'ฮัลโหล',
                  'ไม่เป็นไร',
                  'นะ', 'ฮะ', 'ฮ่ะ', 'ก็', 'ดี', 'อ้าว', 'อะ',
                  'ๆ', ' '):
        leftover = leftover.replace(token, '')
    # Catch elongated "ah" variants (อ๊าา..., อ้าา...) the literal list misses.
    leftover = re.sub(r'อ[๊้]า+', '', leftover)

    remaining_len = len(leftover)
    remaining_ratio = remaining_len / max(char_len, 1)

    remaining = leftover  # alias kept for readability of the vector below

    return np.array([
        char_len, thai_chars, word_count,
        has_krab, has_ka, has_ja,
        has_hmm, has_oh, has_aha, has_hello,
        has_chai, has_jing, has_thuk, has_ok, has_naenon,
        has_question, has_negation, has_request,
        has_continuation, has_repeat,
        particle_ratio, remaining_len, remaining_ratio,
        has_wama, has_na, has_ha,
    ])
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _load_model():
    """Return the Thai classifier, loading it from disk on first call.

    The unpickled model is smoke-tested against the installed sklearn;
    if loading or predicting fails for any reason (version-incompatible
    pickle, corrupt file, ...), the model is retrained from train.py via
    `_retrain_model`. The result is cached in the module-global `_MODEL`.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    import warnings
    import sklearn  # fail fast (and loudly) if sklearn is absent
    pkl_path = os.path.join(_DIR, 'backchannel_model.pkl')
    try:
        # Older/newer sklearn pickles emit version warnings; silence them.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with open(pkl_path, 'rb') as fh:
                _MODEL = pickle.load(fh)['model']
            # Smoke-test: raises if the pickle is incompatible with the
            # sklearn version actually installed.
            _MODEL.predict(extract_features("ครับ").reshape(1, -1))
    except Exception:
        # Pickle unusable — rebuild the model from the training script.
        _MODEL = _retrain_model(pkl_path)
    return _MODEL
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _retrain_model(pkl_path):
    """Rebuild the Thai model from train.py when the pickle is unusable.

    Dynamically imports `train.py` (expected one directory above the
    package) to obtain the labeled corpus, refits a gradient boosting
    classifier with the same seeded shuffle and hyperparameters as the
    original training run, and best-effort rewrites the pickle.

    Args:
        pkl_path: Destination path for the refreshed model pickle.

    Returns:
        The freshly fitted GradientBoostingClassifier.

    Raises:
        RuntimeError: If train.py cannot be found next to the package.
    """
    import random
    import importlib.util
    from sklearn.ensemble import GradientBoostingClassifier

    train_path = os.path.join(os.path.dirname(_DIR), 'train.py')
    if not os.path.exists(train_path):
        raise RuntimeError("Cannot retrain: train.py not found alongside package")

    # Load train.py as a throwaway module to reuse its corpus + augmentation.
    spec = importlib.util.spec_from_file_location("train_module", train_path)
    train_mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(train_mod)
    positives = train_mod.augment_backchannels(train_mod.BACKCHANNELS)
    negatives = list(train_mod.REAL_RESPONSES)

    # Seeded shuffle — keeps retraining deterministic across runs.
    random.seed(42)
    X = np.vstack([
        np.array([extract_features(t) for t in positives]),
        np.array([extract_features(t) for t in negatives]),
    ])
    y = np.array([1] * len(positives) + [0] * len(negatives))
    order = list(range(len(X)))
    random.shuffle(order)
    X = X[order]
    y = y[order]

    model = GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42)
    model.fit(X, y)

    # Best-effort: refresh the on-disk pickle so the next import is fast.
    try:
        with open(pkl_path, 'wb') as fh:
            pickle.dump({'model': model, 'feature_names': [], 'backchannels': []}, fh)
    except Exception:
        pass  # read-only install — keep using the in-memory model

    return model
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def is_backchannel(text: str, threshold: float = 0.5, lang: str = "th") -> tuple:
    """
    Detect if text is a backchannel.

    Args:
        text: Input text from ASR
        threshold: Classification threshold (default 0.5)
        lang: Language code - "th" (Thai, default) or "ja" (Japanese)

    Returns: (is_backchannel: bool, confidence: float)
    """
    # Empty / whitespace-only input is never a backchannel.
    if not text or not text.strip():
        return False, 0.0

    # Japanese is delegated to its own classifier (lazy import keeps the
    # ~3.7 MB Japanese model out of memory for Thai-only users).
    if lang == "ja":
        from backchannel_classifier.jp import is_backchannel_ja
        return is_backchannel_ja(text, threshold=threshold)

    # Thai (default) path.
    feats = extract_features(text).reshape(1, -1)
    probs = _load_model().predict_proba(feats)[0]
    score = probs[1]  # class 1 == backchannel
    return score >= threshold, float(score)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        # One-shot mode: classify the text given on the command line.
        text = ' '.join(cli_args)
        is_bc, conf = is_backchannel(text)
        print(f"'{text}' -> {'BACKCHANNEL' if is_bc else 'REAL RESPONSE'} (confidence: {conf:.4f})")
    else:
        # Interactive mode: classify each line typed at the prompt.
        print("Backchannel Classifier - type text to classify (ctrl+c to exit)")
        while True:
            try:
                text = input("> ")
                is_bc, conf = is_backchannel(text)
                label = "BACKCHANNEL" if is_bc else "REAL RESPONSE"
                print(f" -> {label} (confidence: {conf:.4f})")
            except (KeyboardInterrupt, EOFError):
                break