backchannel-classifier 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ """
2
+ Backchannel Classifier (Thai + Japanese)
3
+ Usage:
4
+ from backchannel_classifier import is_backchannel
5
+ is_backchannel("ครับ") # Thai (default)
6
+ is_backchannel("はい", lang="ja") # Japanese
7
+ """
8
+
9
+ import re
10
+ import pickle
11
+ import numpy as np
12
+ import os
13
+
14
+ _MODEL = None
15
+ _DIR = os.path.dirname(os.path.abspath(__file__))
16
+
17
+
18
def extract_features(text):
    """Extract a 26-element numeric feature vector from Thai ASR text.

    Features cover length/character counts, binary presence flags for
    known backchannel particles and sounds, negative indicators
    (question/negation/request/continuation markers), and "residue"
    features: how much of the text is left after stripping every known
    backchannel component. Returns a 1-D numpy array in the fixed
    column order the pickled model was trained on.
    """
    text = text.strip()
    char_len = len(text)
    # Count characters in the Thai Unicode block only.
    thai_chars = len(re.findall(r'[\u0E00-\u0E7F]', text))
    words = text.split()
    word_count = len(words)

    # Polite particles, including common ASR misspellings (คับ→ครับ, ค่า→ค่ะ).
    has_krab = 1 if re.search(r'ครับ|คับ', text) else 0
    has_ka = 1 if re.search(r'ค่ะ|คะ|ค่า', text) else 0
    has_ja = 1 if re.search(r'จ้ะ|จ้า|จ๊ะ', text) else 0
    # Backchannel sounds ("hmm" / "oh" / "aha") with tone variants.
    has_hmm = 1 if re.search(r'อืม|อือ|อื้อ|อื้ม|อึม|อุ้ม|เอิ่ม', text) else 0
    has_oh = 1 if re.search(r'อ๋อ|เออ|เอ่อ|อ่า|อ้า|อ๊า+', text) else 0
    has_aha = 1 if re.search(r'อ่าฮะ|อาฮะ|อาหะ|อ้าฮะ', text) else 0
    has_hello = 1 if 'ฮัลโหล' in text else 0
    # Agreement / confirmation words.
    has_chai = 1 if re.search(r'ใช่|ช่าย', text) else 0
    has_jing = 1 if 'จริง' in text else 0
    has_thuk = 1 if 'ถูก' in text else 0
    has_ok = 1 if re.search(r'โอเค|เค$|เคร$', text) else 0
    has_naenon = 1 if 'แน่นอน' in text else 0
    has_wama = 1 if 'ว่ามา' in text else 0
    has_na = 1 if re.search(r'นะ', text) else 0
    has_ha = 1 if re.search(r'ฮะ|ฮ่ะ', text) else 0
    # Negative indicators: real responses tend to contain these.
    has_question = 1 if re.search(r'ไหม|อะไร|ที่ไหน|เมื่อไหร่|ยังไง|ทำไม|กี่|เท่าไหร่', text) else 0
    has_negation = 1 if re.search(r'ไม่|ยัง(?!ไง)', text) else 0
    has_request = 1 if re.search(r'ขอ|ช่วย|อยาก|ต้องการ', text) else 0
    has_continuation = 1 if re.search(r'แต่|แล้ว|แล้วก็|งั้น(?!เหรอ)', text) else 0
    has_repeat = 1 if 'ๆ' in text else 0
    # Fraction of space-separated tokens accounted for by polite particles.
    particle_ratio = (has_krab + has_ka + has_ja) / max(word_count, 1)

    # Strip every known backchannel component; what is left over is a
    # proxy for "real content" (the dominant feature per the README).
    # NOTE(review): replacement is sequential and order-dependent —
    # e.g. 'เค' is removed before 'เคร', so the 'เคร' entry can never
    # match and a stray 'ร' is left behind. The shipped model was
    # trained with this exact order; do not reorder without retraining.
    remaining = text
    for pattern in ['ครับ', 'คับ', 'ค่ะ', 'คะ', 'ค่า', 'จ้ะ', 'จ้า', 'ผม',
                    'อืม', 'อือ', 'อื้อ', 'อื้ม', 'อึม', 'เอิ่ม',
                    'เออ', 'เอ่อ', 'อ่า', 'อ้า', 'อ่าฮะ', 'อาฮะ', 'อ้าฮะ',
                    'อ๋อ', 'ใช่', 'ช่าย', 'จริง', 'ด้วย', 'ถูก', 'โอเค', 'เค', 'เคร',
                    'แน่นอน', 'เหรอ', 'หรอ', 'งั้น', 'ได้', 'อ่ะ', 'เอ๊ะ', 'ว่ามา',
                    'อาหะ', 'อือหึ', 'อือฮึ', 'ฮัลโหล',
                    'ไม่เป็นไร',
                    'นะ', 'ฮะ', 'ฮ่ะ', 'ก็', 'ดี', 'อ้าว', 'อะ',
                    'ๆ', ' ']:
        remaining = remaining.replace(pattern, '')
    # Elongated "ah" sounds that the literal list cannot cover.
    remaining = re.sub(r'อ[๊้]า+', '', remaining)
    remaining_len = len(remaining)
    remaining_ratio = remaining_len / max(char_len, 1)

    # Column order is part of the model contract — keep it fixed.
    return np.array([
        char_len, thai_chars, word_count,
        has_krab, has_ka, has_ja,
        has_hmm, has_oh, has_aha, has_hello,
        has_chai, has_jing, has_thuk, has_ok, has_naenon,
        has_question, has_negation, has_request,
        has_continuation, has_repeat,
        particle_ratio, remaining_len, remaining_ratio,
        has_wama, has_na, has_ha,
    ])
73
+
74
+
75
def _load_model():
    """Return the Thai classifier, loading (or retraining) it on first use.

    The estimator is cached in the module-level _MODEL global so the
    pickle is read at most once per process.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    import warnings
    import sklearn  # fail early if sklearn is missing, before unpickling
    pkl_path = os.path.join(_DIR, 'backchannel_model.pkl')
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with open(pkl_path, 'rb') as f:
                _MODEL = pickle.load(f)['model']
        # Smoke-test: one predict verifies the pickled estimator is
        # compatible with the installed scikit-learn version.
        _MODEL.predict(extract_features("ครับ").reshape(1, -1))
    except Exception:
        # Missing or incompatible pickle — rebuild from training data.
        _MODEL = _retrain_model(pkl_path)
    return _MODEL
94
+
95
+
96
def _retrain_model(pkl_path):
    """Rebuild the Thai model when the bundled pickle is unusable.

    Loads train.py from the directory above the package to borrow its
    example lists, trains a fresh GradientBoostingClassifier, and makes
    a best-effort attempt to overwrite the pickle at *pkl_path*.

    Raises:
        RuntimeError: if train.py is not present next to the package
            (e.g. a plain wheel install without the source tree).
    """
    import random
    import importlib.util
    from sklearn.ensemble import GradientBoostingClassifier

    train_path = os.path.join(os.path.dirname(_DIR), 'train.py')
    if not os.path.exists(train_path):
        raise RuntimeError("Cannot retrain: train.py not found alongside package")

    # Execute train.py as an anonymous module to reuse its datasets.
    spec = importlib.util.spec_from_file_location("train_module", train_path)
    train_mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(train_mod)
    pos_examples = train_mod.augment_backchannels(train_mod.BACKCHANNELS)
    neg_examples = list(train_mod.REAL_RESPONSES)

    # Deterministic shuffle so retraining is reproducible.
    random.seed(42)
    X = np.vstack([
        np.array([extract_features(t) for t in pos_examples]),
        np.array([extract_features(t) for t in neg_examples]),
    ])
    y = np.array([1] * len(pos_examples) + [0] * len(neg_examples))
    order = list(range(len(X)))
    random.shuffle(order)
    X = X[order]
    y = y[order]

    model = GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42)
    model.fit(X, y)

    # Best effort: persist the fresh model for next time.
    try:
        with open(pkl_path, 'wb') as f:
            pickle.dump({'model': model, 'feature_names': [], 'backchannels': []}, f)
    except Exception:
        pass  # read-only install, just use in-memory

    return model
134
+
135
+
136
def is_backchannel(text: str, threshold: float = 0.5, lang: str = "th") -> tuple:
    """
    Detect if text is a backchannel.

    Args:
        text: Input text from ASR
        threshold: Classification threshold (default 0.5)
        lang: Language code - "th" (Thai, default) or "ja" (Japanese)

    Returns: (is_backchannel: bool, confidence: float)
    """
    # Empty / whitespace-only input is never a backchannel; skip model load.
    if not text or not text.strip():
        return False, 0.0

    # Lazy import keeps the Japanese model out of memory for Thai-only users.
    if lang == "ja":
        from backchannel_classifier.jp import is_backchannel_ja
        return is_backchannel_ja(text, threshold=threshold)

    clf = _load_model()
    probs = clf.predict_proba(extract_features(text).reshape(1, -1))[0]
    bc_prob = probs[1]  # column 1 = backchannel class probability
    return bc_prob >= threshold, float(bc_prob)
159
+
160
+
161
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        # One-shot mode: classify the joined command-line arguments.
        text = ' '.join(cli_args)
        is_bc, conf = is_backchannel(text)
        print(f"'{text}' -> {'BACKCHANNEL' if is_bc else 'REAL RESPONSE'} (confidence: {conf:.4f})")
    else:
        # Interactive mode: classify lines until ctrl+c / EOF.
        print("Backchannel Classifier - type text to classify (ctrl+c to exit)")
        while True:
            try:
                line = input("> ")
                is_bc, conf = is_backchannel(line)
                verdict = "BACKCHANNEL" if is_bc else "REAL RESPONSE"
                print(f" -> {verdict} (confidence: {conf:.4f})")
            except (KeyboardInterrupt, EOFError):
                break
@@ -0,0 +1,249 @@
1
+ """
2
+ Japanese Backchannel (Aizuchi) Classifier
3
+ Usage: from backchannel_classifier.jp import is_backchannel_ja
4
+ """
5
+
6
+ import re
7
+ import pickle
8
+ import numpy as np
9
+ import os
10
+
11
+ _MODEL = None
12
+ _DIR = os.path.dirname(os.path.abspath(__file__))
13
+
14
+
15
def extract_features_ja(text):
    """Extract a 27-element numeric feature vector from Japanese ASR text.

    Features cover length and script-composition counts, binary presence
    flags for known aizuchi (backchannel) markers, negative indicators
    (question/continuation/request/negation/verb markers), and "residue"
    features: how much text is left after stripping every known aizuchi
    component. Column order matches FEATURE_NAMES_JA and is part of the
    pickled model's contract.
    """
    text = text.strip()
    char_len = len(text)

    # Count Japanese characters (hiragana + katakana + kanji)
    hiragana = len(re.findall(r'[\u3040-\u309F]', text))
    katakana = len(re.findall(r'[\u30A0-\u30FF]', text))
    kanji = len(re.findall(r'[\u4E00-\u9FFF]', text))
    jp_chars = hiragana + katakana + kanji

    # Word-like segments (split on spaces — ASR output is usually space-separated)
    words = text.split()
    word_count = len(words)

    # === Backchannel markers (positive indicators) ===

    # Core acknowledgment: はい, ええ, うん
    has_hai = 1 if re.search(r'はい|はーい|はいはい', text) else 0
    has_ee = 1 if re.search(r'ええ|えぇ', text) else 0
    has_un = 1 if re.search(r'うん|うんうん', text) else 0

    # Agreement: そう, そうですね, そうだね, そっか
    has_sou = 1 if re.search(r'そう(?:です(?:ね|か|よね)|だね|だよね|ね|そう|なんだ|なんですね)?|そっか', text) else 0

    # Understanding: なるほど
    has_naruhodo = 1 if re.search(r'なるほど', text) else 0

    # Surprise/reaction sounds: へー, ほー, えー (standalone)
    has_hee = 1 if re.search(r'へー+|へぇ+|ほー+|ほぉ+', text) else 0
    has_e_surprise = 1 if re.search(r'^えー+$|^えぇ+$|^えっ$', text) else 0

    # Filler/hesitation: えーと, あのー, うーん, んー, あー
    has_filler = 1 if re.search(r'えー?と|えっと|あのー?|うー+ん|んー+|あー+|まあ|まぁ|そのー?|なんか|なんていうか', text) else 0

    # Emotional reaction: すごい, やばい, ほんと, まじ, うそ
    has_reaction = 1 if re.search(r'すご(?:い|ーい|いね|いですね)|やば(?:い|っ)|ほんと(?:う|に|ですか)?|まじ(?:で|っすか)?|マジ(?:で)?|うそ(?:ー)?|うっそ', text) else 0

    # Empathy: よかった, たいへん, 大丈夫, かわいそう, 残念
    has_empathy = 1 if re.search(r'よかった(?:ですね|ね)?|たいへん(?:ですね)?|大丈夫(?:です)?|かわいそう|残念(?:ですね)?', text) else 0

    # Standalone particles as backchannels: ね, よね, だよね, ですよね, でしょ
    has_particle_bc = 1 if re.search(r'^(?:ね|ねー+|よね|だよね|ですよね|でしょ|でしょう|だね|ですね)$', text) else 0

    # Informal: ふーん, あっそ, っす, っすね, じゃん
    has_informal = 1 if re.search(r'ふー+ん|あっそ|っす(?:ね)?$|じゃん$|なー$|ガチ(?:で)?', text) else 0

    # Polite acknowledgment: わかりました, 承知しました, かしこまりました
    has_polite_ack = 1 if re.search(r'わかりました|わかった|承知(?:しました|いたしました)|かしこまりました', text) else 0

    # === Negative indicators (NOT backchannel) ===

    # Question words: 何, どこ, いつ, なぜ, どう, 誰, いくつ, いくら
    has_question = 1 if re.search(r'何|どこ|いつ|なぜ|どう(?:して|やって|いう)|誰|いく(?:つ|ら)|どれ|どの|どちら|ですか$', text) else 0

    # Conjunctions continuing thought: でも, しかし, それで, だから, けど, ただ, が
    has_continuation = 1 if re.search(r'でも|しかし|それで|だから|けど|ただ|ですが|だけど|それから|あと(?!ー)', text) else 0

    # Request/command: ください, ましょう, たい, ほしい, お願い
    has_request = 1 if re.search(r'ください|ましょう|(?:し|き|み|い)たい(?:です)?|ほしい|お願い|してほしい|教えて|頼む', text) else 0

    # Negation: ない, ません, いいえ, 違う, いや
    has_negation = 1 if re.search(r'ない(?:です)?$|ません|いいえ|違(?:う|います)|いや(?!ー)|じゃない', text) else 0

    # Has verb endings (polite/dictionary form) suggesting real content
    has_verb = 1 if re.search(r'ます$|ました$|ません$|する$|した$|できる|できます|あります|います|思います|考え', text) else 0

    # Has kanji content (real responses tend to have more kanji)
    kanji_ratio = kanji / max(char_len, 1)

    # === Remaining ratio (key feature) ===
    # Strip every known aizuchi component; the residue approximates
    # "real content". NOTE(review): replacement is sequential and
    # order-dependent — e.g. 'えー' is removed before 'えーと', so an
    # input containing えーと leaves a stray と behind. The shipped
    # model was trained with this exact order; do not reorder the list
    # without retraining.
    remaining = text
    for pattern in [
        # Core aizuchi
        'はい', 'ええ', 'うん', 'うんうん', 'はいはい',
        # Agreement
        'そうですね', 'そうですか', 'そうだね', 'そうだよね', 'そうなんだ', 'そうなんですね',
        'そうそう', 'そうね', 'そっか', 'そう',
        # Understanding
        'なるほど', 'なるほどね', 'なるほどですね',
        'わかりました', 'わかった', '分かります',
        'あぁそうか', 'あそっか',
        # Surprise
        'へー', 'へぇ', 'ほー', 'ほぉ', 'えー', 'えぇ', 'えっ',
        # Fillers
        'えーと', 'えっと', 'あのー', 'あの', 'うーん', 'んー', 'あー',
        'まあ', 'まぁ', 'そのー', 'その', 'なんか',
        # Reaction
        'すごい', 'すごいね', 'すごいですね', 'やばい', 'やばっ',
        'ほんと', 'ほんとう', 'ほんとに', 'ほんとですか',
        'まじ', 'まじで', 'マジ', 'マジで',
        'うそ', 'うっそ', 'うそー',
        # Empathy
        'よかった', 'よかったね', 'よかったですね',
        'たいへん', 'たいへんですね',
        '大丈夫', '大丈夫です',
        'かわいそう', '残念', '残念ですね',
        # Particles
        'ね', 'ねー', 'よね', 'だよね', 'ですよね', 'でしょ', 'でしょう',
        'だね', 'ですね', 'です', 'よ',
        # Informal
        'ふーん', 'あっそ', 'っす', 'っすね', 'じゃん', 'なー',
        'ガチ', 'ガチで',
        # Polite
        '承知しました', '承知いたしました', 'かしこまりました',
        # Connectors (neutral)
        'ああ', 'あぁ', 'あ',
        # Elongation marks
        'ー',
        ' ',
    ]:
        remaining = remaining.replace(pattern, '')
    # Also strip elongated vowels via regex
    remaining = re.sub(r'ー+', '', remaining)
    remaining = re.sub(r'っ+', '', remaining)
    remaining_len = len(remaining)
    remaining_ratio = remaining_len / max(char_len, 1)

    # Column order matches FEATURE_NAMES_JA — keep it fixed.
    return np.array([
        char_len, jp_chars, word_count,
        hiragana, katakana, kanji,
        has_hai, has_ee, has_un,
        has_sou, has_naruhodo,
        has_hee, has_e_surprise, has_filler,
        has_reaction, has_empathy,
        has_particle_bc, has_informal, has_polite_ack,
        has_question, has_continuation, has_request,
        has_negation, has_verb,
        kanji_ratio, remaining_len, remaining_ratio,
    ])
145
+
146
+
147
# Human-readable names for the 27 columns produced by extract_features_ja(),
# in the exact order of that feature vector.
FEATURE_NAMES_JA = [
    # length / script-composition counts
    'char_len', 'jp_chars', 'word_count',
    'hiragana', 'katakana', 'kanji',
    # positive (aizuchi) markers
    'has_hai', 'has_ee', 'has_un', 'has_sou', 'has_naruhodo',
    'has_hee', 'has_e_surprise', 'has_filler', 'has_reaction',
    'has_empathy', 'has_particle_bc', 'has_informal', 'has_polite_ack',
    # negative (real-response) markers
    'has_question', 'has_continuation', 'has_request',
    'has_negation', 'has_verb',
    # residue features
    'kanji_ratio', 'remaining_len', 'remaining_ratio',
]
159
+
160
+
161
def _load_model_ja():
    """Return the Japanese classifier, loading (or retraining) it on first use.

    The estimator is cached in the module-level _MODEL global so the
    pickle is read at most once per process.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    import warnings
    pkl_path = os.path.join(_DIR, 'backchannel_model_ja.pkl')
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with open(pkl_path, 'rb') as f:
                _MODEL = pickle.load(f)['model']
        # Smoke-test: one predict verifies the pickled estimator is
        # compatible with the installed scikit-learn version.
        _MODEL.predict(extract_features_ja("はい").reshape(1, -1))
    except Exception:
        # Missing or incompatible pickle — rebuild from training data.
        _MODEL = _retrain_model_ja(pkl_path)
    return _MODEL
178
+
179
+
180
def _retrain_model_ja(pkl_path):
    """Rebuild the Japanese model when the bundled pickle is unusable.

    Loads train_ja.py from the directory above the package to borrow
    its example lists, trains a fresh GradientBoostingClassifier, and
    makes a best-effort attempt to overwrite the pickle at *pkl_path*.

    Raises:
        RuntimeError: if train_ja.py is not present next to the package
            (e.g. a plain wheel install without the source tree).
    """
    import random
    import importlib.util
    from sklearn.ensemble import GradientBoostingClassifier

    train_path = os.path.join(os.path.dirname(_DIR), 'train_ja.py')
    if not os.path.exists(train_path):
        raise RuntimeError("Cannot retrain: train_ja.py not found alongside package")

    # Execute train_ja.py as an anonymous module to reuse its datasets.
    spec = importlib.util.spec_from_file_location("train_ja_module", train_path)
    train_mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(train_mod)
    pos_examples = train_mod.augment_backchannels_ja(train_mod.BACKCHANNELS_JA)
    neg_examples = list(train_mod.REAL_RESPONSES_JA)

    # Deterministic shuffle so retraining is reproducible.
    random.seed(42)
    X = np.vstack([
        np.array([extract_features_ja(t) for t in pos_examples]),
        np.array([extract_features_ja(t) for t in neg_examples]),
    ])
    y = np.array([1] * len(pos_examples) + [0] * len(neg_examples))
    order = list(range(len(X)))
    random.shuffle(order)
    X = X[order]
    y = y[order]

    model = GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42)
    model.fit(X, y)

    # Best effort: persist the fresh model for next time.
    try:
        with open(pkl_path, 'wb') as f:
            pickle.dump({'model': model, 'feature_names': FEATURE_NAMES_JA}, f)
    except Exception:
        pass  # read-only install, just use in-memory

    return model
216
+
217
+
218
def is_backchannel_ja(text: str, threshold: float = 0.5) -> tuple:
    """
    Detect if Japanese text is a backchannel (aizuchi).

    Args:
        text: Input text from ASR.
        threshold: Probability cutoff for the positive class (default 0.5).

    Returns: (is_backchannel: bool, confidence: float)
    """
    # Empty / whitespace-only input is never a backchannel; skip model load.
    if not text or not text.strip():
        return False, 0.0

    clf = _load_model_ja()
    probs = clf.predict_proba(extract_features_ja(text).reshape(1, -1))[0]
    bc_prob = probs[1]  # column 1 = aizuchi class probability
    return bc_prob >= threshold, float(bc_prob)
232
+
233
+
234
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        # One-shot mode: classify the joined command-line arguments.
        text = ' '.join(cli_args)
        is_bc, conf = is_backchannel_ja(text)
        print(f"'{text}' -> {'BACKCHANNEL' if is_bc else 'REAL RESPONSE'} (confidence: {conf:.4f})")
    else:
        # Interactive mode: classify lines until ctrl+c / EOF.
        print("Japanese Backchannel (Aizuchi) Detector - type text to classify (ctrl+c to exit)")
        while True:
            try:
                line = input("> ")
                is_bc, conf = is_backchannel_ja(line)
                verdict = "BACKCHANNEL" if is_bc else "REAL RESPONSE"
                print(f" -> {verdict} (confidence: {conf:.4f})")
            except (KeyboardInterrupt, EOFError):
                break
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.4
2
+ Name: backchannel-classifier
3
+ Version: 0.4.0
4
+ Summary: backchannel classifier - detect backchannels vs real responses in thai and japanese asr output
5
+ Author-email: "100x.fi" <kiri@100x.fi>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/100x-fi/backchannel-classifier
8
+ Project-URL: Repository, https://github.com/100x-fi/backchannel-classifier
9
+ Keywords: thai,japanese,nlp,backchannel,aizuchi,voice,asr,classifier
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Text Processing :: Linguistic
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: scikit-learn>=1.0
25
+ Requires-Dist: numpy>=1.20
26
+ Dynamic: license-file
27
+
28
+ # backchannel classifier
29
+
30
+ detects backchannel responses vs real user input for voice ai systems. supports **thai** and **japanese** (aizuchi).
31
+
32
+ ## install
33
+
34
+ ```bash
35
+ pip install backchannel-classifier
36
+ ```
37
+
38
+ ## usage
39
+
40
+ ```python
41
+ from backchannel_classifier import is_backchannel
42
+
43
+ # thai (default)
44
+ is_backchannel("ครับ") # (True, 0.91)
45
+ is_backchannel("ไม่ครับ") # (False, 0.01)
46
+ is_backchannel("ใช่ แต่ว่า") # (False, 0.01)
47
+
48
+ # japanese
49
+ is_backchannel("はい", lang="ja") # (True, 0.99)
50
+ is_backchannel("そうですね", lang="ja") # (True, 0.99)
51
+ is_backchannel("予約したいです", lang="ja") # (False, 0.0001)
52
+
53
+ # direct import
54
+ from backchannel_classifier.jp import is_backchannel_ja
55
+ is_backchannel_ja("なるほど") # (True, 0.99)
56
+ ```
57
+
58
+ returns `(is_backchannel: bool, confidence: float)`.
59
+
60
+ ## why
61
+
62
+ voice bots using asr → llm → tts pipelines need to distinguish between backchannels (acknowledgment sounds that should be ignored) and real responses that need processing. simple exact matching fails on asr variants and misses edge cases.
63
+
64
+ ## approach
65
+
66
+ gradient boosting classifier with handcrafted language-specific features. key idea: strip known backchannel components from the text, measure what's left (`remaining_ratio`). if nothing remains, it's a backchannel.
67
+
68
+ ### thai (26 features)
69
+
70
+ | feature | importance |
71
+ |---|---|
72
+ | remaining_ratio | 0.9098 |
73
+ | has_request | 0.0406 |
74
+ | has_negation | 0.0274 |
75
+ | particle_ratio | 0.0108 |
76
+
77
+ - polite particle detection (ครับ/ค่ะ/จ้ะ variants)
78
+ - backchannel sound patterns (อืม/อ๋อ/เออ with tone variants)
79
+ - question/negation/request/continuation markers
80
+ - handles asr misspellings (ค่า→ค่ะ, คับ→ครับ, อื้ม→อืม)
81
+
82
+ ### japanese (27 features)
83
+
84
+ | feature | importance |
85
+ |---|---|
86
+ | remaining_ratio | 0.7765 |
87
+ | remaining_len | 0.0484 |
88
+ | katakana | 0.0347 |
89
+ | word_count | 0.0325 |
90
+ | kanji_ratio | 0.0206 |
91
+
92
+ - core aizuchi (はい/ええ/うん/そう)
93
+ - agreement, understanding, surprise, filler, reaction markers
94
+ - question/continuation/request/negation/verb negative indicators
95
+ - handles asr elongation variants (はーーい, えーーー)
96
+
97
+ ## results
98
+
99
+ ### thai
100
+ - **99.49% f1** (5-fold cv)
101
+ - test suite: **94/94** (100%)
102
+
103
+ ### japanese
104
+ - **98.37% f1** (5-fold cv)
105
+ - test suite: **119/119** (100%)
106
+
107
+ ## test coverage
108
+
109
+ ### thai (94 cases)
110
+
111
+ **backchannels (49):** ครับ, ค่ะ, อืม, ใช่, อ๋อ, เหรอ, ฮัลโหล, asr variants...
112
+ **real responses (45):** สวัสดีครับ, ไม่ครับ, ราคาเท่าไหร่ครับ, edge cases (ใช่ แต่ว่า, ครับ แล้วก็)...
113
+
114
+ ### japanese (119 cases)
115
+
116
+ **aizuchi (63):** はい, うん, そうですね, なるほど, へー, まじで, えーと, すごい, 承知しました, compounds...
117
+ **real responses (56):** ありがとうございます, いくらですか, 予約したいです, edge cases (はい、質問があります, そうですね、でも...)...
118
+
119
+ ## testing
120
+
121
+ ```bash
122
+ python3 -m pytest tests/ -v
123
+ ```
124
+
125
+ ## files
126
+
127
+ - `backchannel_classifier/__init__.py` - thai classifier + unified api
128
+ - `backchannel_classifier/jp.py` - japanese classifier
129
+ - `train.py` - thai training script
130
+ - `train_ja.py` - japanese training script
131
+ - `tests/test_classifier.py` - thai test suite (94 cases)
132
+ - `tests/test_classifier_ja.py` - japanese test suite (119 cases)
133
+
134
+ ## requirements
135
+
136
+ - python 3.8+
137
+ - scikit-learn
138
+ - numpy
139
+
140
+ ## memory
141
+
142
+ ~3.7 MB per language model, lazy-loaded. if you only use thai, japanese model is never loaded (zero overhead).
@@ -0,0 +1,9 @@
1
+ backchannel_classifier/__init__.py,sha256=1PA00oIqCbPHPdfyCeyc-4MWjrAsWhHHXjuxQQNR2RQ,7564
2
+ backchannel_classifier/backchannel_model.pkl,sha256=kZhZLE575Ms2mJT0NXzCJHMDltHnpHpW8bI-uR8hq-A,3917
3
+ backchannel_classifier/backchannel_model_ja.pkl,sha256=aVrrQFxzIWXP8eHbbfPGTT7aSRwfHJ_Nuv-4ZqjLLKg,219054
4
+ backchannel_classifier/jp.py,sha256=vuiYkFutySn8GQZsJNgriu6xUjyUisIScntI0itu8t8,10689
5
+ backchannel_classifier-0.4.0.dist-info/licenses/LICENSE,sha256=j0I9MBGgesqFL9pKi32WngbDC03dpUw7f4XMI9FDSRQ,1064
6
+ backchannel_classifier-0.4.0.dist-info/METADATA,sha256=e4swSCejamHbpgwfUcAn0Ajk0uQ8-fjoARu3NyL8gT4,4900
7
+ backchannel_classifier-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ backchannel_classifier-0.4.0.dist-info/top_level.txt,sha256=C0EiI_PBu-SmB3oQXafQ_zhwyb4-y4b9YmPW7CvQGp0,23
9
+ backchannel_classifier-0.4.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 100x.fi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ backchannel_classifier