backchannel-classifier 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ """
2
+ Backchannel Classifier (Thai + Japanese)
3
+ Usage:
4
+ from backchannel_classifier import is_backchannel
5
+ is_backchannel("ครับ") # Thai (default)
6
+ is_backchannel("はい", lang="ja") # Japanese
7
+ """
8
+
9
+ import re
10
+ import pickle
11
+ import numpy as np
12
+ import os
13
+
14
+ _MODEL = None
15
+ _DIR = os.path.dirname(os.path.abspath(__file__))
16
+
17
+
18
def extract_features(text):
    """Extract a 26-element numeric feature vector from Thai ASR text.

    Features cover length/character counts, binary presence flags for
    known backchannel particles and sounds, negative indicators
    (question/negation/request/continuation markers), and "residue"
    features: how much of the text is left after stripping every known
    backchannel component. Returns a 1-D numpy array in the fixed
    column order the pickled model was trained on.
    """
    text = text.strip()
    char_len = len(text)
    # Count characters in the Thai Unicode block only.
    thai_chars = len(re.findall(r'[\u0E00-\u0E7F]', text))
    words = text.split()
    word_count = len(words)

    # Polite particles, including common ASR misspellings (คับ→ครับ, ค่า→ค่ะ).
    has_krab = 1 if re.search(r'ครับ|คับ', text) else 0
    has_ka = 1 if re.search(r'ค่ะ|คะ|ค่า', text) else 0
    has_ja = 1 if re.search(r'จ้ะ|จ้า|จ๊ะ', text) else 0
    # Backchannel sounds ("hmm" / "oh" / "aha") with tone variants.
    has_hmm = 1 if re.search(r'อืม|อือ|อื้อ|อื้ม|อึม|อุ้ม|เอิ่ม', text) else 0
    has_oh = 1 if re.search(r'อ๋อ|เออ|เอ่อ|อ่า|อ้า|อ๊า+', text) else 0
    has_aha = 1 if re.search(r'อ่าฮะ|อาฮะ|อาหะ|อ้าฮะ', text) else 0
    has_hello = 1 if 'ฮัลโหล' in text else 0
    # Agreement / confirmation words.
    has_chai = 1 if re.search(r'ใช่|ช่าย', text) else 0
    has_jing = 1 if 'จริง' in text else 0
    has_thuk = 1 if 'ถูก' in text else 0
    has_ok = 1 if re.search(r'โอเค|เค$|เคร$', text) else 0
    has_naenon = 1 if 'แน่นอน' in text else 0
    has_wama = 1 if 'ว่ามา' in text else 0
    has_na = 1 if re.search(r'นะ', text) else 0
    has_ha = 1 if re.search(r'ฮะ|ฮ่ะ', text) else 0
    # Negative indicators: real responses tend to contain these.
    has_question = 1 if re.search(r'ไหม|อะไร|ที่ไหน|เมื่อไหร่|ยังไง|ทำไม|กี่|เท่าไหร่', text) else 0
    has_negation = 1 if re.search(r'ไม่|ยัง(?!ไง)', text) else 0
    has_request = 1 if re.search(r'ขอ|ช่วย|อยาก|ต้องการ', text) else 0
    has_continuation = 1 if re.search(r'แต่|แล้ว|แล้วก็|งั้น(?!เหรอ)', text) else 0
    has_repeat = 1 if 'ๆ' in text else 0
    # Fraction of space-separated tokens accounted for by polite particles.
    particle_ratio = (has_krab + has_ka + has_ja) / max(word_count, 1)

    # Strip every known backchannel component; what is left over is a
    # proxy for "real content" (the dominant feature per the README).
    # NOTE(review): replacement is sequential and order-dependent —
    # e.g. 'เค' is removed before 'เคร', so the 'เคร' entry can never
    # match and a stray 'ร' is left behind. The shipped model was
    # trained with this exact order; do not reorder without retraining.
    remaining = text
    for pattern in ['ครับ', 'คับ', 'ค่ะ', 'คะ', 'ค่า', 'จ้ะ', 'จ้า', 'ผม',
                    'อืม', 'อือ', 'อื้อ', 'อื้ม', 'อึม', 'เอิ่ม',
                    'เออ', 'เอ่อ', 'อ่า', 'อ้า', 'อ่าฮะ', 'อาฮะ', 'อ้าฮะ',
                    'อ๋อ', 'ใช่', 'ช่าย', 'จริง', 'ด้วย', 'ถูก', 'โอเค', 'เค', 'เคร',
                    'แน่นอน', 'เหรอ', 'หรอ', 'งั้น', 'ได้', 'อ่ะ', 'เอ๊ะ', 'ว่ามา',
                    'อาหะ', 'อือหึ', 'อือฮึ', 'ฮัลโหล',
                    'ไม่เป็นไร',
                    'นะ', 'ฮะ', 'ฮ่ะ', 'ก็', 'ดี', 'อ้าว', 'อะ',
                    'ๆ', ' ']:
        remaining = remaining.replace(pattern, '')
    # Elongated "ah" sounds that the literal list cannot cover.
    remaining = re.sub(r'อ[๊้]า+', '', remaining)
    remaining_len = len(remaining)
    remaining_ratio = remaining_len / max(char_len, 1)

    # Column order is part of the model contract — keep it fixed.
    return np.array([
        char_len, thai_chars, word_count,
        has_krab, has_ka, has_ja,
        has_hmm, has_oh, has_aha, has_hello,
        has_chai, has_jing, has_thuk, has_ok, has_naenon,
        has_question, has_negation, has_request,
        has_continuation, has_repeat,
        particle_ratio, remaining_len, remaining_ratio,
        has_wama, has_na, has_ha,
    ])
73
+
74
+
75
def _load_model():
    """Return the Thai classifier, loading (or retraining) it on first use.

    The estimator is cached in the module-level _MODEL global so the
    pickle is read at most once per process.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    import warnings
    import sklearn  # fail early if sklearn is missing, before unpickling
    pkl_path = os.path.join(_DIR, 'backchannel_model.pkl')
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with open(pkl_path, 'rb') as f:
                _MODEL = pickle.load(f)['model']
        # Smoke-test: one predict verifies the pickled estimator is
        # compatible with the installed scikit-learn version.
        _MODEL.predict(extract_features("ครับ").reshape(1, -1))
    except Exception:
        # Missing or incompatible pickle — rebuild from training data.
        _MODEL = _retrain_model(pkl_path)
    return _MODEL
94
+
95
+
96
def _retrain_model(pkl_path):
    """Rebuild the Thai model when the bundled pickle is unusable.

    Loads train.py from the directory above the package to borrow its
    example lists, trains a fresh GradientBoostingClassifier, and makes
    a best-effort attempt to overwrite the pickle at *pkl_path*.

    Raises:
        RuntimeError: if train.py is not present next to the package
            (e.g. a plain wheel install without the source tree).
    """
    import random
    import importlib.util
    from sklearn.ensemble import GradientBoostingClassifier

    train_path = os.path.join(os.path.dirname(_DIR), 'train.py')
    if not os.path.exists(train_path):
        raise RuntimeError("Cannot retrain: train.py not found alongside package")

    # Execute train.py as an anonymous module to reuse its datasets.
    spec = importlib.util.spec_from_file_location("train_module", train_path)
    train_mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(train_mod)
    pos_examples = train_mod.augment_backchannels(train_mod.BACKCHANNELS)
    neg_examples = list(train_mod.REAL_RESPONSES)

    # Deterministic shuffle so retraining is reproducible.
    random.seed(42)
    X = np.vstack([
        np.array([extract_features(t) for t in pos_examples]),
        np.array([extract_features(t) for t in neg_examples]),
    ])
    y = np.array([1] * len(pos_examples) + [0] * len(neg_examples))
    order = list(range(len(X)))
    random.shuffle(order)
    X = X[order]
    y = y[order]

    model = GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42)
    model.fit(X, y)

    # Best effort: persist the fresh model for next time.
    try:
        with open(pkl_path, 'wb') as f:
            pickle.dump({'model': model, 'feature_names': [], 'backchannels': []}, f)
    except Exception:
        pass  # read-only install, just use in-memory

    return model
134
+
135
+
136
def is_backchannel(text: str, threshold: float = 0.5, lang: str = "th") -> tuple:
    """
    Detect if text is a backchannel.

    Args:
        text: Input text from ASR
        threshold: Classification threshold (default 0.5)
        lang: Language code - "th" (Thai, default) or "ja" (Japanese)

    Returns: (is_backchannel: bool, confidence: float)
    """
    # Empty / whitespace-only input is never a backchannel; skip model load.
    if not text or not text.strip():
        return False, 0.0

    # Lazy import keeps the Japanese model out of memory for Thai-only users.
    if lang == "ja":
        from backchannel_classifier.jp import is_backchannel_ja
        return is_backchannel_ja(text, threshold=threshold)

    clf = _load_model()
    probs = clf.predict_proba(extract_features(text).reshape(1, -1))[0]
    bc_prob = probs[1]  # column 1 = backchannel class probability
    return bc_prob >= threshold, float(bc_prob)
159
+
160
+
161
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        # One-shot mode: classify the joined command-line arguments.
        text = ' '.join(cli_args)
        is_bc, conf = is_backchannel(text)
        print(f"'{text}' -> {'BACKCHANNEL' if is_bc else 'REAL RESPONSE'} (confidence: {conf:.4f})")
    else:
        # Interactive mode: classify lines until ctrl+c / EOF.
        print("Backchannel Classifier - type text to classify (ctrl+c to exit)")
        while True:
            try:
                line = input("> ")
                is_bc, conf = is_backchannel(line)
                verdict = "BACKCHANNEL" if is_bc else "REAL RESPONSE"
                print(f" -> {verdict} (confidence: {conf:.4f})")
            except (KeyboardInterrupt, EOFError):
                break
@@ -0,0 +1,249 @@
1
+ """
2
+ Japanese Backchannel (Aizuchi) Classifier
3
+ Usage: from backchannel_classifier.jp import is_backchannel_ja
4
+ """
5
+
6
+ import re
7
+ import pickle
8
+ import numpy as np
9
+ import os
10
+
11
+ _MODEL = None
12
+ _DIR = os.path.dirname(os.path.abspath(__file__))
13
+
14
+
15
def extract_features_ja(text):
    """Extract a 27-element numeric feature vector from Japanese ASR text.

    Features cover length and script-composition counts, binary presence
    flags for known aizuchi (backchannel) markers, negative indicators
    (question/continuation/request/negation/verb markers), and "residue"
    features: how much text is left after stripping every known aizuchi
    component. Column order matches FEATURE_NAMES_JA and is part of the
    pickled model's contract.
    """
    text = text.strip()
    char_len = len(text)

    # Count Japanese characters (hiragana + katakana + kanji)
    hiragana = len(re.findall(r'[\u3040-\u309F]', text))
    katakana = len(re.findall(r'[\u30A0-\u30FF]', text))
    kanji = len(re.findall(r'[\u4E00-\u9FFF]', text))
    jp_chars = hiragana + katakana + kanji

    # Word-like segments (split on spaces — ASR output is usually space-separated)
    words = text.split()
    word_count = len(words)

    # === Backchannel markers (positive indicators) ===

    # Core acknowledgment: はい, ええ, うん
    has_hai = 1 if re.search(r'はい|はーい|はいはい', text) else 0
    has_ee = 1 if re.search(r'ええ|えぇ', text) else 0
    has_un = 1 if re.search(r'うん|うんうん', text) else 0

    # Agreement: そう, そうですね, そうだね, そっか
    has_sou = 1 if re.search(r'そう(?:です(?:ね|か|よね)|だね|だよね|ね|そう|なんだ|なんですね)?|そっか', text) else 0

    # Understanding: なるほど
    has_naruhodo = 1 if re.search(r'なるほど', text) else 0

    # Surprise/reaction sounds: へー, ほー, えー (standalone)
    has_hee = 1 if re.search(r'へー+|へぇ+|ほー+|ほぉ+', text) else 0
    has_e_surprise = 1 if re.search(r'^えー+$|^えぇ+$|^えっ$', text) else 0

    # Filler/hesitation: えーと, あのー, うーん, んー, あー
    has_filler = 1 if re.search(r'えー?と|えっと|あのー?|うー+ん|んー+|あー+|まあ|まぁ|そのー?|なんか|なんていうか', text) else 0

    # Emotional reaction: すごい, やばい, ほんと, まじ, うそ
    has_reaction = 1 if re.search(r'すご(?:い|ーい|いね|いですね)|やば(?:い|っ)|ほんと(?:う|に|ですか)?|まじ(?:で|っすか)?|マジ(?:で)?|うそ(?:ー)?|うっそ', text) else 0

    # Empathy: よかった, たいへん, 大丈夫, かわいそう, 残念
    has_empathy = 1 if re.search(r'よかった(?:ですね|ね)?|たいへん(?:ですね)?|大丈夫(?:です)?|かわいそう|残念(?:ですね)?', text) else 0

    # Standalone particles as backchannels: ね, よね, だよね, ですよね, でしょ
    has_particle_bc = 1 if re.search(r'^(?:ね|ねー+|よね|だよね|ですよね|でしょ|でしょう|だね|ですね)$', text) else 0

    # Informal: ふーん, あっそ, っす, っすね, じゃん
    has_informal = 1 if re.search(r'ふー+ん|あっそ|っす(?:ね)?$|じゃん$|なー$|ガチ(?:で)?', text) else 0

    # Polite acknowledgment: わかりました, 承知しました, かしこまりました
    has_polite_ack = 1 if re.search(r'わかりました|わかった|承知(?:しました|いたしました)|かしこまりました', text) else 0

    # === Negative indicators (NOT backchannel) ===

    # Question words: 何, どこ, いつ, なぜ, どう, 誰, いくつ, いくら
    has_question = 1 if re.search(r'何|どこ|いつ|なぜ|どう(?:して|やって|いう)|誰|いく(?:つ|ら)|どれ|どの|どちら|ですか$', text) else 0

    # Conjunctions continuing thought: でも, しかし, それで, だから, けど, ただ, が
    has_continuation = 1 if re.search(r'でも|しかし|それで|だから|けど|ただ|ですが|だけど|それから|あと(?!ー)', text) else 0

    # Request/command: ください, ましょう, たい, ほしい, お願い
    has_request = 1 if re.search(r'ください|ましょう|(?:し|き|み|い)たい(?:です)?|ほしい|お願い|してほしい|教えて|頼む', text) else 0

    # Negation: ない, ません, いいえ, 違う, いや
    has_negation = 1 if re.search(r'ない(?:です)?$|ません|いいえ|違(?:う|います)|いや(?!ー)|じゃない', text) else 0

    # Has verb endings (polite/dictionary form) suggesting real content
    has_verb = 1 if re.search(r'ます$|ました$|ません$|する$|した$|できる|できます|あります|います|思います|考え', text) else 0

    # Has kanji content (real responses tend to have more kanji)
    kanji_ratio = kanji / max(char_len, 1)

    # === Remaining ratio (key feature) ===
    # Strip every known aizuchi component; the residue approximates
    # "real content". NOTE(review): replacement is sequential and
    # order-dependent — e.g. 'えー' is removed before 'えーと', so an
    # input containing えーと leaves a stray と behind. The shipped
    # model was trained with this exact order; do not reorder the list
    # without retraining.
    remaining = text
    for pattern in [
        # Core aizuchi
        'はい', 'ええ', 'うん', 'うんうん', 'はいはい',
        # Agreement
        'そうですね', 'そうですか', 'そうだね', 'そうだよね', 'そうなんだ', 'そうなんですね',
        'そうそう', 'そうね', 'そっか', 'そう',
        # Understanding
        'なるほど', 'なるほどね', 'なるほどですね',
        'わかりました', 'わかった', '分かります',
        'あぁそうか', 'あそっか',
        # Surprise
        'へー', 'へぇ', 'ほー', 'ほぉ', 'えー', 'えぇ', 'えっ',
        # Fillers
        'えーと', 'えっと', 'あのー', 'あの', 'うーん', 'んー', 'あー',
        'まあ', 'まぁ', 'そのー', 'その', 'なんか',
        # Reaction
        'すごい', 'すごいね', 'すごいですね', 'やばい', 'やばっ',
        'ほんと', 'ほんとう', 'ほんとに', 'ほんとですか',
        'まじ', 'まじで', 'マジ', 'マジで',
        'うそ', 'うっそ', 'うそー',
        # Empathy
        'よかった', 'よかったね', 'よかったですね',
        'たいへん', 'たいへんですね',
        '大丈夫', '大丈夫です',
        'かわいそう', '残念', '残念ですね',
        # Particles
        'ね', 'ねー', 'よね', 'だよね', 'ですよね', 'でしょ', 'でしょう',
        'だね', 'ですね', 'です', 'よ',
        # Informal
        'ふーん', 'あっそ', 'っす', 'っすね', 'じゃん', 'なー',
        'ガチ', 'ガチで',
        # Polite
        '承知しました', '承知いたしました', 'かしこまりました',
        # Connectors (neutral)
        'ああ', 'あぁ', 'あ',
        # Elongation marks
        'ー',
        ' ',
    ]:
        remaining = remaining.replace(pattern, '')
    # Also strip elongated vowels via regex
    remaining = re.sub(r'ー+', '', remaining)
    remaining = re.sub(r'っ+', '', remaining)
    remaining_len = len(remaining)
    remaining_ratio = remaining_len / max(char_len, 1)

    # Column order matches FEATURE_NAMES_JA — keep it fixed.
    return np.array([
        char_len, jp_chars, word_count,
        hiragana, katakana, kanji,
        has_hai, has_ee, has_un,
        has_sou, has_naruhodo,
        has_hee, has_e_surprise, has_filler,
        has_reaction, has_empathy,
        has_particle_bc, has_informal, has_polite_ack,
        has_question, has_continuation, has_request,
        has_negation, has_verb,
        kanji_ratio, remaining_len, remaining_ratio,
    ])
145
+
146
+
147
# Human-readable names for the 27 columns produced by extract_features_ja(),
# in the exact order of that feature vector.
FEATURE_NAMES_JA = [
    # length / script-composition counts
    'char_len', 'jp_chars', 'word_count',
    'hiragana', 'katakana', 'kanji',
    # positive (aizuchi) markers
    'has_hai', 'has_ee', 'has_un', 'has_sou', 'has_naruhodo',
    'has_hee', 'has_e_surprise', 'has_filler', 'has_reaction',
    'has_empathy', 'has_particle_bc', 'has_informal', 'has_polite_ack',
    # negative (real-response) markers
    'has_question', 'has_continuation', 'has_request',
    'has_negation', 'has_verb',
    # residue features
    'kanji_ratio', 'remaining_len', 'remaining_ratio',
]
159
+
160
+
161
def _load_model_ja():
    """Return the Japanese classifier, loading (or retraining) it on first use.

    The estimator is cached in the module-level _MODEL global so the
    pickle is read at most once per process.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    import warnings
    pkl_path = os.path.join(_DIR, 'backchannel_model_ja.pkl')
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with open(pkl_path, 'rb') as f:
                _MODEL = pickle.load(f)['model']
        # Smoke-test: one predict verifies the pickled estimator is
        # compatible with the installed scikit-learn version.
        _MODEL.predict(extract_features_ja("はい").reshape(1, -1))
    except Exception:
        # Missing or incompatible pickle — rebuild from training data.
        _MODEL = _retrain_model_ja(pkl_path)
    return _MODEL
178
+
179
+
180
def _retrain_model_ja(pkl_path):
    """Rebuild the Japanese model when the bundled pickle is unusable.

    Loads train_ja.py from the directory above the package to borrow
    its example lists, trains a fresh GradientBoostingClassifier, and
    makes a best-effort attempt to overwrite the pickle at *pkl_path*.

    Raises:
        RuntimeError: if train_ja.py is not present next to the package
            (e.g. a plain wheel install without the source tree).
    """
    import random
    import importlib.util
    from sklearn.ensemble import GradientBoostingClassifier

    train_path = os.path.join(os.path.dirname(_DIR), 'train_ja.py')
    if not os.path.exists(train_path):
        raise RuntimeError("Cannot retrain: train_ja.py not found alongside package")

    # Execute train_ja.py as an anonymous module to reuse its datasets.
    spec = importlib.util.spec_from_file_location("train_ja_module", train_path)
    train_mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(train_mod)
    pos_examples = train_mod.augment_backchannels_ja(train_mod.BACKCHANNELS_JA)
    neg_examples = list(train_mod.REAL_RESPONSES_JA)

    # Deterministic shuffle so retraining is reproducible.
    random.seed(42)
    X = np.vstack([
        np.array([extract_features_ja(t) for t in pos_examples]),
        np.array([extract_features_ja(t) for t in neg_examples]),
    ])
    y = np.array([1] * len(pos_examples) + [0] * len(neg_examples))
    order = list(range(len(X)))
    random.shuffle(order)
    X = X[order]
    y = y[order]

    model = GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42)
    model.fit(X, y)

    # Best effort: persist the fresh model for next time.
    try:
        with open(pkl_path, 'wb') as f:
            pickle.dump({'model': model, 'feature_names': FEATURE_NAMES_JA}, f)
    except Exception:
        pass  # read-only install, just use in-memory

    return model
216
+
217
+
218
def is_backchannel_ja(text: str, threshold: float = 0.5) -> tuple:
    """
    Detect if Japanese text is a backchannel (aizuchi).

    Args:
        text: Input text from ASR.
        threshold: Probability cutoff for the positive class (default 0.5).

    Returns: (is_backchannel: bool, confidence: float)
    """
    # Empty / whitespace-only input is never a backchannel; skip model load.
    if not text or not text.strip():
        return False, 0.0

    clf = _load_model_ja()
    probs = clf.predict_proba(extract_features_ja(text).reshape(1, -1))[0]
    bc_prob = probs[1]  # column 1 = aizuchi class probability
    return bc_prob >= threshold, float(bc_prob)
232
+
233
+
234
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        # One-shot mode: classify the joined command-line arguments.
        text = ' '.join(cli_args)
        is_bc, conf = is_backchannel_ja(text)
        print(f"'{text}' -> {'BACKCHANNEL' if is_bc else 'REAL RESPONSE'} (confidence: {conf:.4f})")
    else:
        # Interactive mode: classify lines until ctrl+c / EOF.
        print("Japanese Backchannel (Aizuchi) Detector - type text to classify (ctrl+c to exit)")
        while True:
            try:
                line = input("> ")
                is_bc, conf = is_backchannel_ja(line)
                verdict = "BACKCHANNEL" if is_bc else "REAL RESPONSE"
                print(f" -> {verdict} (confidence: {conf:.4f})")
            except (KeyboardInterrupt, EOFError):
                break
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.4
2
+ Name: backchannel-classifier
3
+ Version: 0.4.0
4
+ Summary: backchannel classifier - detect backchannels vs real responses in thai and japanese asr output
5
+ Author-email: "100x.fi" <kiri@100x.fi>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/100x-fi/backchannel-classifier
8
+ Project-URL: Repository, https://github.com/100x-fi/backchannel-classifier
9
+ Keywords: thai,japanese,nlp,backchannel,aizuchi,voice,asr,classifier
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Text Processing :: Linguistic
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: scikit-learn>=1.0
25
+ Requires-Dist: numpy>=1.20
26
+ Dynamic: license-file
27
+
28
+ # backchannel classifier
29
+
30
+ detects backchannel responses vs real user input for voice ai systems. supports **thai** and **japanese** (aizuchi).
31
+
32
+ ## install
33
+
34
+ ```bash
35
+ pip install backchannel-classifier
36
+ ```
37
+
38
+ ## usage
39
+
40
+ ```python
41
+ from backchannel_classifier import is_backchannel
42
+
43
+ # thai (default)
44
+ is_backchannel("ครับ") # (True, 0.91)
45
+ is_backchannel("ไม่ครับ") # (False, 0.01)
46
+ is_backchannel("ใช่ แต่ว่า") # (False, 0.01)
47
+
48
+ # japanese
49
+ is_backchannel("はい", lang="ja") # (True, 0.99)
50
+ is_backchannel("そうですね", lang="ja") # (True, 0.99)
51
+ is_backchannel("予約したいです", lang="ja") # (False, 0.0001)
52
+
53
+ # direct import
54
+ from backchannel_classifier.jp import is_backchannel_ja
55
+ is_backchannel_ja("なるほど") # (True, 0.99)
56
+ ```
57
+
58
+ returns `(is_backchannel: bool, confidence: float)`.
59
+
60
+ ## why
61
+
62
+ voice bots using asr → llm → tts pipelines need to distinguish between backchannels (acknowledgment sounds that should be ignored) and real responses that need processing. simple exact matching fails on asr variants and misses edge cases.
63
+
64
+ ## approach
65
+
66
+ gradient boosting classifier with handcrafted language-specific features. key idea: strip known backchannel components from the text, measure what's left (`remaining_ratio`). if nothing remains, it's a backchannel.
67
+
68
+ ### thai (26 features)
69
+
70
+ | feature | importance |
71
+ |---|---|
72
+ | remaining_ratio | 0.9098 |
73
+ | has_request | 0.0406 |
74
+ | has_negation | 0.0274 |
75
+ | particle_ratio | 0.0108 |
76
+
77
+ - polite particle detection (ครับ/ค่ะ/จ้ะ variants)
78
+ - backchannel sound patterns (อืม/อ๋อ/เออ with tone variants)
79
+ - question/negation/request/continuation markers
80
+ - handles asr misspellings (ค่า→ค่ะ, คับ→ครับ, อื้ม→อืม)
81
+
82
+ ### japanese (27 features)
83
+
84
+ | feature | importance |
85
+ |---|---|
86
+ | remaining_ratio | 0.7765 |
87
+ | remaining_len | 0.0484 |
88
+ | katakana | 0.0347 |
89
+ | word_count | 0.0325 |
90
+ | kanji_ratio | 0.0206 |
91
+
92
+ - core aizuchi (はい/ええ/うん/そう)
93
+ - agreement, understanding, surprise, filler, reaction markers
94
+ - question/continuation/request/negation/verb negative indicators
95
+ - handles asr elongation variants (はーーい, えーーー)
96
+
97
+ ## results
98
+
99
+ ### thai
100
+ - **99.49% f1** (5-fold cv)
101
+ - test suite: **94/94** (100%)
102
+
103
+ ### japanese
104
+ - **98.37% f1** (5-fold cv)
105
+ - test suite: **119/119** (100%)
106
+
107
+ ## test coverage
108
+
109
+ ### thai (94 cases)
110
+
111
+ **backchannels (49):** ครับ, ค่ะ, อืม, ใช่, อ๋อ, เหรอ, ฮัลโหล, asr variants...
112
+ **real responses (45):** สวัสดีครับ, ไม่ครับ, ราคาเท่าไหร่ครับ, edge cases (ใช่ แต่ว่า, ครับ แล้วก็)...
113
+
114
+ ### japanese (119 cases)
115
+
116
+ **aizuchi (63):** はい, うん, そうですね, なるほど, へー, まじで, えーと, すごい, 承知しました, compounds...
117
+ **real responses (56):** ありがとうございます, いくらですか, 予約したいです, edge cases (はい、質問があります, そうですね、でも...)...
118
+
119
+ ## testing
120
+
121
+ ```bash
122
+ python3 -m pytest tests/ -v
123
+ ```
124
+
125
+ ## files
126
+
127
+ - `backchannel_classifier/__init__.py` - thai classifier + unified api
128
+ - `backchannel_classifier/jp.py` - japanese classifier
129
+ - `train.py` - thai training script
130
+ - `train_ja.py` - japanese training script
131
+ - `tests/test_classifier.py` - thai test suite (94 cases)
132
+ - `tests/test_classifier_ja.py` - japanese test suite (119 cases)
133
+
134
+ ## requirements
135
+
136
+ - python 3.8+
137
+ - scikit-learn
138
+ - numpy
139
+
140
+ ## memory
141
+
142
+ ~3.7 MB per language model, lazy-loaded. if you only use thai, japanese model is never loaded (zero overhead).
@@ -0,0 +1,9 @@
1
+ backchannel_classifier/__init__.py,sha256=1PA00oIqCbPHPdfyCeyc-4MWjrAsWhHHXjuxQQNR2RQ,7564
2
+ backchannel_classifier/backchannel_model.pkl,sha256=kZhZLE575Ms2mJT0NXzCJHMDltHnpHpW8bI-uR8hq-A,3917
3
+ backchannel_classifier/backchannel_model_ja.pkl,sha256=aVrrQFxzIWXP8eHbbfPGTT7aSRwfHJ_Nuv-4ZqjLLKg,219054
4
+ backchannel_classifier/jp.py,sha256=vuiYkFutySn8GQZsJNgriu6xUjyUisIScntI0itu8t8,10689
5
+ backchannel_classifier-0.4.0.dist-info/licenses/LICENSE,sha256=j0I9MBGgesqFL9pKi32WngbDC03dpUw7f4XMI9FDSRQ,1064
6
+ backchannel_classifier-0.4.0.dist-info/METADATA,sha256=e4swSCejamHbpgwfUcAn0Ajk0uQ8-fjoARu3NyL8gT4,4900
7
+ backchannel_classifier-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ backchannel_classifier-0.4.0.dist-info/top_level.txt,sha256=C0EiI_PBu-SmB3oQXafQ_zhwyb4-y4b9YmPW7CvQGp0,23
9
+ backchannel_classifier-0.4.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 100x.fi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ backchannel_classifier