filler-classifier 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- filler_classifier-0.1.0/LICENSE +21 -0
- filler_classifier-0.1.0/PKG-INFO +118 -0
- filler_classifier-0.1.0/README.md +93 -0
- filler_classifier-0.1.0/filler_classifier/__init__.py +232 -0
- filler_classifier-0.1.0/filler_classifier.egg-info/PKG-INFO +118 -0
- filler_classifier-0.1.0/filler_classifier.egg-info/SOURCES.txt +9 -0
- filler_classifier-0.1.0/filler_classifier.egg-info/dependency_links.txt +1 -0
- filler_classifier-0.1.0/filler_classifier.egg-info/requires.txt +2 -0
- filler_classifier-0.1.0/filler_classifier.egg-info/top_level.txt +1 -0
- filler_classifier-0.1.0/pyproject.toml +37 -0
- filler_classifier-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 100x.fi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: filler-classifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Thai filler word classifier for voice bots - picks the right acknowledgment phrase while LLM thinks
|
|
5
|
+
Author-email: "100x.fi" <kiri@100x.fi>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/100x-fi/filler-classifier
|
|
8
|
+
Project-URL: Repository, https://github.com/100x-fi/filler-classifier
|
|
9
|
+
Keywords: thai,nlp,filler,voice,classifier,embeddings,voice-bot
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: sentence-transformers>=2.0
|
|
23
|
+
Requires-Dist: numpy>=1.20
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# filler-classifier
|
|
27
|
+
|
|
28
|
+
Thai filler word classifier for voice bots. Classifies customer input into categories and returns the appropriate filler phrase to play instantly while the LLM generates a full response.
|
|
29
|
+
|
|
30
|
+
Built for [ingfah.ai](https://ingfah.ai) voice bot but easily adaptable to any Thai voice AI system.
|
|
31
|
+
|
|
32
|
+
## Why
|
|
33
|
+
|
|
34
|
+
Voice bots have a latency problem: the user speaks, ASR transcribes, then the LLM takes 1-3 seconds to respond. Dead silence feels broken. The solution is to play a short filler phrase ("สักครู่นะคะ", "ขออภัยด้วยน่ะคะ") immediately while the LLM thinks.
|
|
35
|
+
|
|
36
|
+
But you can't play the same filler for everything. If someone is angry, "ได้เลยค่ะ" sounds dismissive. If someone asks a question, "ขออภัยด้วยน่ะคะ" makes no sense.
|
|
37
|
+
|
|
38
|
+
This classifier picks the right filler by category.
|
|
39
|
+
|
|
40
|
+
## Categories
|
|
41
|
+
|
|
42
|
+
| Category | When | Example Fillers |
|
|
43
|
+
|---|---|---|
|
|
44
|
+
| `complaint` | Angry, frustrated, profanity, threats | ขออภัยด้วยน่ะคะ |
|
|
45
|
+
| `question` | Asking for info, pricing, how-to | สักครู่นะคะ, ตรวจสอบให้นะคะ |
|
|
46
|
+
| `default` | Greetings, agreements, requests, everything else | รับทราบค่ะ, ได้เลยค่ะ |
|
|
47
|
+
|
|
48
|
+
## How It Works
|
|
49
|
+
|
|
50
|
+
Uses `intfloat/multilingual-e5-small` embeddings with centroid-based cosine similarity:
|
|
51
|
+
|
|
52
|
+
1. Each category has ~30-60 anchor phrases (real Thai customer service examples)
|
|
53
|
+
2. On init, all anchors are embedded and averaged into category centroids
|
|
54
|
+
3. At inference, the input is embedded and compared to centroids via cosine similarity
|
|
55
|
+
4. The closest category wins, and a random filler from that category is returned
|
|
56
|
+
|
|
57
|
+
## Performance
|
|
58
|
+
|
|
59
|
+
- **Accuracy**: 89.6% on 1,000 Thai customer service sentences
|
|
60
|
+
- **Inference**: <10ms per classification (after model load)
|
|
61
|
+
- **Init**: ~200ms for centroid computation
|
|
62
|
+
- **Model size**: ~118MB (multilingual-e5-small)
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install filler-classifier
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from filler_classifier import FillerClassifier
|
|
74
|
+
|
|
75
|
+
# loads the default model (intfloat/multilingual-e5-small) when no model is passed in
|
|
76
|
+
clf = FillerClassifier()
|
|
77
|
+
|
|
78
|
+
# classify and get category + confidence + filler
|
|
79
|
+
category, confidence, filler = clf.classify("อยากถามเรื่องบิลครับ")
|
|
80
|
+
# ("question", 0.872, "สักครู่นะคะ")
|
|
81
|
+
|
|
82
|
+
category, confidence, filler = clf.classify("ใช้งานไม่ได้เลย")
|
|
83
|
+
# ("complaint", 0.891, "ขออภัยด้วยน่ะคะ")
|
|
84
|
+
|
|
85
|
+
category, confidence, filler = clf.classify("ได้ครับ ตกลง")
|
|
86
|
+
# ("default", 0.845, "ได้เลยค่ะ")
|
|
87
|
+
|
|
88
|
+
# or just get the filler phrase directly
|
|
89
|
+
filler = clf.get_filler("มีโปรอะไรบ้างครับ")
|
|
90
|
+
# "ตรวจสอบให้นะคะ"
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Sharing the model
|
|
94
|
+
|
|
95
|
+
If you already have a `SentenceTransformer` instance loaded (e.g., for other tasks), pass it in to avoid loading twice:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from sentence_transformers import SentenceTransformer
|
|
99
|
+
from filler_classifier import FillerClassifier
|
|
100
|
+
|
|
101
|
+
model = SentenceTransformer("intfloat/multilingual-e5-small")
|
|
102
|
+
clf = FillerClassifier(model=model)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Customizing Fillers
|
|
106
|
+
|
|
107
|
+
Override `CATEGORY_FILLERS` to use your own phrases:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
import filler_classifier
|
|
111
|
+
|
|
112
|
+
filler_classifier.CATEGORY_FILLERS["complaint"] = ["ขออภัยค่ะ", "เข้าใจค่ะ"]
|
|
113
|
+
filler_classifier.CATEGORY_FILLERS["question"] = ["รอสักครู่นะคะ"]
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## License
|
|
117
|
+
|
|
118
|
+
MIT
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# filler-classifier
|
|
2
|
+
|
|
3
|
+
Thai filler word classifier for voice bots. Classifies customer input into categories and returns the appropriate filler phrase to play instantly while the LLM generates a full response.
|
|
4
|
+
|
|
5
|
+
Built for [ingfah.ai](https://ingfah.ai) voice bot but easily adaptable to any Thai voice AI system.
|
|
6
|
+
|
|
7
|
+
## Why
|
|
8
|
+
|
|
9
|
+
Voice bots have a latency problem: the user speaks, ASR transcribes, then the LLM takes 1-3 seconds to respond. Dead silence feels broken. The solution is to play a short filler phrase ("สักครู่นะคะ", "ขออภัยด้วยน่ะคะ") immediately while the LLM thinks.
|
|
10
|
+
|
|
11
|
+
But you can't play the same filler for everything. If someone is angry, "ได้เลยค่ะ" sounds dismissive. If someone asks a question, "ขออภัยด้วยน่ะคะ" makes no sense.
|
|
12
|
+
|
|
13
|
+
This classifier picks the right filler by category.
|
|
14
|
+
|
|
15
|
+
## Categories
|
|
16
|
+
|
|
17
|
+
| Category | When | Example Fillers |
|
|
18
|
+
|---|---|---|
|
|
19
|
+
| `complaint` | Angry, frustrated, profanity, threats | ขออภัยด้วยน่ะคะ |
|
|
20
|
+
| `question` | Asking for info, pricing, how-to | สักครู่นะคะ, ตรวจสอบให้นะคะ |
|
|
21
|
+
| `default` | Greetings, agreements, requests, everything else | รับทราบค่ะ, ได้เลยค่ะ |
|
|
22
|
+
|
|
23
|
+
## How It Works
|
|
24
|
+
|
|
25
|
+
Uses `intfloat/multilingual-e5-small` embeddings with centroid-based cosine similarity:
|
|
26
|
+
|
|
27
|
+
1. Each category has ~30-60 anchor phrases (real Thai customer service examples)
|
|
28
|
+
2. On init, all anchors are embedded and averaged into category centroids
|
|
29
|
+
3. At inference, the input is embedded and compared to centroids via cosine similarity
|
|
30
|
+
4. The closest category wins, and a random filler from that category is returned
|
|
31
|
+
|
|
32
|
+
## Performance
|
|
33
|
+
|
|
34
|
+
- **Accuracy**: 89.6% on 1,000 Thai customer service sentences
|
|
35
|
+
- **Inference**: <10ms per classification (after model load)
|
|
36
|
+
- **Init**: ~200ms for centroid computation
|
|
37
|
+
- **Model size**: ~118MB (multilingual-e5-small)
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install filler-classifier
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Usage
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from filler_classifier import FillerClassifier
|
|
49
|
+
|
|
50
|
+
# loads the default model (intfloat/multilingual-e5-small) when no model is passed in
|
|
51
|
+
clf = FillerClassifier()
|
|
52
|
+
|
|
53
|
+
# classify and get category + confidence + filler
|
|
54
|
+
category, confidence, filler = clf.classify("อยากถามเรื่องบิลครับ")
|
|
55
|
+
# ("question", 0.872, "สักครู่นะคะ")
|
|
56
|
+
|
|
57
|
+
category, confidence, filler = clf.classify("ใช้งานไม่ได้เลย")
|
|
58
|
+
# ("complaint", 0.891, "ขออภัยด้วยน่ะคะ")
|
|
59
|
+
|
|
60
|
+
category, confidence, filler = clf.classify("ได้ครับ ตกลง")
|
|
61
|
+
# ("default", 0.845, "ได้เลยค่ะ")
|
|
62
|
+
|
|
63
|
+
# or just get the filler phrase directly
|
|
64
|
+
filler = clf.get_filler("มีโปรอะไรบ้างครับ")
|
|
65
|
+
# "ตรวจสอบให้นะคะ"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Sharing the model
|
|
69
|
+
|
|
70
|
+
If you already have a `SentenceTransformer` instance loaded (e.g., for other tasks), pass it in to avoid loading twice:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from sentence_transformers import SentenceTransformer
|
|
74
|
+
from filler_classifier import FillerClassifier
|
|
75
|
+
|
|
76
|
+
model = SentenceTransformer("intfloat/multilingual-e5-small")
|
|
77
|
+
clf = FillerClassifier(model=model)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Customizing Fillers
|
|
81
|
+
|
|
82
|
+
Override `CATEGORY_FILLERS` to use your own phrases:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import filler_classifier
|
|
86
|
+
|
|
87
|
+
filler_classifier.CATEGORY_FILLERS["complaint"] = ["ขออภัยค่ะ", "เข้าใจค่ะ"]
|
|
88
|
+
filler_classifier.CATEGORY_FILLERS["question"] = ["รอสักครู่นะคะ"]
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
MIT
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""
|
|
2
|
+
thai filler word classifier for voice bots.
|
|
3
|
+
picks the right acknowledgment filler to play instantly while llm thinks.
|
|
4
|
+
uses e5-small embeddings with centroid-based classification.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import random
|
|
8
|
+
import numpy as np
|
|
9
|
+
from sentence_transformers import SentenceTransformer
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
# Anchor phrases per category, written from the customer/caller point of view
# (what they say to us). Maps category name -> list of example utterances.
# The dict's key order defines the category order used by FillerClassifier.
# e5 requires a "query: " prefix for encoding queries; the phrases are stored
# here WITHOUT it, and the prefix is prepended at encode time.
# Every phrase in a list contributes equally to that category's centroid
# (plain mean of the embeddings), so a phrase listed twice is weighted twice.
CATEGORY_ANCHORS = {
    "complaint": [
        # profanity / cursing
        "พ่อมึงตายหรอไอ้สัส",
        "ไอ้เหี้ย",
        "เอไอหีหมา",
        "แม่เย็ด",
        "เอากับกูเปล่า",
        "ไอ้สัตว์",
        "ไอ้ควาย",
        "กูโกรธมากเลยวะ",
        "มึงทำอะไรอยู่วะ",
        "ห่าอะไรวะ",
        "เฮงซวยมาก",
        "บ้าเหี้ย",
        "สัส",
        "ไอ้บ้า",
        # angry / frustrated (no profanity)
        "ใช้งานไม่ได้เลย",
        "มีปัญหาครับ",
        "เสียแล้ว",
        "ไม่พอใจเลย",
        "ทำไมเป็นแบบนี้",
        "แย่มากเลยครับ",
        "หงุดหงิดมาก",
        "โทรมาหลายรอบแล้ว",
        "ยังไม่ได้แก้เลย",
        "รอนานมากครับ",
        "เน็ตหลุดตลอด",
        "ใช้ไม่ได้มาสามวันแล้ว",
        "เบื่อมากเลยครับ",
        "บริการแย่มาก",
        "ผิดหวังมากครับ",
        "จะฟ้องสคบ",
        "ขอเรื่องร้องเรียน",
        "จะยกเลิกเลย",
        "ไม่มีใครช่วยเลย",
        "โทรมาทั้งวัน",
        # threats / escalation (no profanity)
        "ฟังฉันให้ดีนะ",
        "ฉันจะเอาเรื่องนี้ให้ถึงที่สุด",
        "คุณทำให้ฉันรู้สึกเหมือนถูกดูถูก",
        "จะแจ้งความเลยนะ",
        "จะโพสต์ลงโซเชียลเลย",
        "ขอคุยกับหัวหน้า",
        "ขอพูดกับผู้จัดการ",
        "ไม่ยอมรับได้เลย",
        "นี่มันเรื่องอะไรกัน",
        "คุณทำงานกันยังไงวะ",
        "นี่มันการบริการลูกค้าเหรอวะ",
        "ทำไมถึงแก้ไม่ได้สักที",
        "จะร้องเรียนไปกสทช",
        "พูดแล้วไม่ทำตาม",
        "สัญญาไว้แล้วไม่ทำ",
        "เสียเวลาเปล่า",
        "ทำไมปล่อยให้ลูกค้ารอ",
        "รับผิดชอบหน่อยสิ",
        # rhetorical questions (angry, not real questions)
        # NOTE(review): the next phrase already appears in the
        # "threats / escalation" group above, so it is counted twice in the
        # centroid mean - confirm whether the extra weight is intentional.
        "คุณทำงานกันยังไงวะ",
        "นี่เรียกบริการเหรอ",
        "จะให้ทนอีกนานแค่ไหน",
        "ทำไมถึงทำกับลูกค้าแบบนี้",
    ],
    "question": [
        "อยากถามว่า",
        "คือว่า",
        "ช่วยอธิบายได้ไหม",
        "สอบถามหน่อยครับ",
        "อยากรู้ว่า",
        "มีโปรอะไรบ้างครับ",
        "ราคาเท่าไหร่ครับ",
        "ทำยังไงครับ",
        "ต้องทำอะไรบ้าง",
        "เงื่อนไขเป็นยังไงครับ",
        "คิดค่าบริการยังไง",
        "ขอถามเรื่องบิลหน่อย",
        "มีวิธีไหนบ้าง",
        "แพ็กเกจไหนดีครับ",
        "ช่วยเช็คให้หน่อยได้ไหม",
        "ป้ายสาขาดูได้จากตรงไหนครับ",
        "มองไปดูตรงไหนได้ครับ",
        "ดูยังไงครับ",
        "หาเจอตรงไหนครับ",
        # problem + question pattern
        "อินเทอร์เน็ตช้ามาก มีวิธีแก้ไขไหมคะ",
        "ทำไมสัญญาณโทรศัพท์ถึงไม่ค่อยมีคะ",
        "ซิมการ์ดหาย ต้องทำยังไงคะ",
        "ทำไมเน็ตช้าจังเลยคะ",
        "ทำไมสัญญาณอ่อนจังเลยคะ",
        "ใช้เน็ตหมดเร็วมากเลยค่ะ",
        "สัญญาณโทรศัพท์ไม่ค่อยดีเลยค่ะ",
        "มีปัญหาเรื่องอินเทอร์เน็ตค่ะ",
        "เน็ตบ้านใช้ไม่ได้มาหลายวันแล้วค่ะ",
        "สินค้าที่สั่งไปนานแล้ว ทำไมยังไม่ได้รับคะ",
        # service inquiry questions
        "ต้องการระงับการใช้งานชั่วคราวได้ไหมคะ",
        "ถ้าไม่ใช้บัตร จะยกเลิกได้ไหมครับ",
        "ฉันอยากจะยกเลิกบริการ ต้องติดต่อที่ไหนคะ",
        "บริการเสริมนี้เสียค่าใช้จ่ายเพิ่มไหมคะ",
        "ฉันลืมรหัสผ่านเข้าระบบ ต้องกู้คืนยังไงคะ",
        "สัญญาหมดเมื่อไหร่คะ",
        "ถ้าไม่ต่อสัญญา จะเป็นอะไรไหมคะ",
        "ต้องรอนานไหมคะ",
        "ถ้าจ่ายช้าจะเป็นอะไรไหมคะ",
        "ถ้าจะยกเลิกต้องเสียค่าอะไรไหมคะ",
        "เปลี่ยนจาก AIS ไป TRUE ต้องทำยังไง",
        "โปรนี้ใช้เล่น TikTok ได้ไหมคะ",
    ],
    "default": [
        # acknowledgment, agreement, greeting, decline, info-giving all go here
        "รับทราบครับ",
        "โอเค รับทราบ",
        "ทราบแล้วครับ",
        "เข้าใจแล้วครับ",
        "ได้ครับ",
        "ตกลงครับ",
        "โอเคครับ",
        "ได้เลยครับ",
        "สวัสดีครับ",
        "หวัดดีครับ",
        "ไม่เอาครับ",
        "ไม่ต้องครับ",
        "สาขา 182 ครับ",
        "100245 ครับ",
        "หมายเลข 5678 ครับ",
        "ชื่อ สมชาย ครับ",
        "ถูกต้องครับ",
        "ครับ ติดต่อเบอร์นี้ได้เลย",
        "เอาครับ",
        # polite requests
        "ช่วยเปลี่ยนแพ็กเกจให้หน่อยได้ไหมคะ",
        "รบกวนเช็คให้หน่อยค่ะ",
        "ช่วยแนะนำหน่อยได้ไหมคะ",
        "ขอเปลี่ยนเบอร์ได้ไหมครับ",
        "รบกวนช่วยดูให้หน่อยนะคะ",
        "ช่วยโอนสายให้หน่อยได้ไหม",
        "ขอยกเลิกบริการเสริมได้ไหมคะ",
        "รบกวนส่งรายละเอียดมาให้หน่อยค่ะ",
        "ช่วยตรวจสอบยอดค้างชำระหน่อยได้ไหม",
        "ขอสมัครโปรใหม่ได้ไหมครับ",
        "รบกวนแก้ไขข้อมูลให้หน่อยค่ะ",
        "ช่วยเช็คสถานะให้หน่อยได้ไหมคะ",
        # NOTE(review): the next phrase repeats an entry earlier in this
        # "polite requests" group - confirm whether the repeat is intentional.
        "ช่วยแนะนำหน่อยได้ไหมคะ",
        "ช่วยแนะนำหน่อยครับ",
        "แนะนำให้หน่อยได้ไหม",
        "ช่วยแนะนำโปรหน่อยค่ะ",
        # polite declines
        "ไม่เอาค่ะ",
        # NOTE(review): the next phrase is also listed in the first group of
        # this category - confirm whether the repeat is intentional.
        "ไม่เอาครับ",
        "ไม่เอาแล้วค่ะ",
        "ไม่เอาแล้วครับ",
        "ไม่สนใจค่ะ",
        "ไม่สนใจครับ",
        "ไม่เป็นไรค่ะ",
        "ไม่เป็นไรครับ",
        "ยังไม่เอาค่ะ",
        "ไม่ล่ะค่ะ",
        # statements
        "ขอโทษนะคะ เมื่อกี้พูดว่าอะไรนะคะ",
        "เน็ตหลุดบ่อยมากเลยค่ะ",
        "ที่บ้านไม่มีสัญญาณเลยค่ะ",
        "อยากได้ความเร็วเน็ตที่เสถียรกว่านี้",
        "พอดีไม่ค่อยเข้าใจค่ะ",
    ],
}
|
|
180
|
+
|
|
181
|
+
# Filler phrases mapped to each category (bot response fillers).
# Keys must match the keys of CATEGORY_ANCHORS: classify() indexes this dict
# with the winning category name.
# Each list must be non-empty, because classify() picks one entry with
# random.choice(). The dict is read on every classify() call, so entries
# replaced at runtime take effect immediately.
CATEGORY_FILLERS = {
    "complaint": ["ขออภัยด้วยน่ะคะ"],
    "question": ["สักครู่นะคะ", "สักครู่ค่ะ", "ตรวจสอบให้นะคะ"],
    "default": ["รับทราบค่ะ", "ค่ะ ได้ค่ะ", "ได้เลยค่ะ", "ดีเลยค่ะ", "เข้าใจค่ะ"],
}
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class FillerClassifier:
|
|
190
|
+
"""
|
|
191
|
+
embed-based filler classifier.
|
|
192
|
+
computes category centroids from anchor phrases, then classifies
|
|
193
|
+
new input by cosine similarity to centroids.
|
|
194
|
+
"""
|
|
195
|
+
|
|
196
|
+
def __init__(self, model: SentenceTransformer | None = None, model_name: str = "intfloat/multilingual-e5-small"):
|
|
197
|
+
if model is None:
|
|
198
|
+
model = SentenceTransformer(model_name)
|
|
199
|
+
self.model = model
|
|
200
|
+
self.categories = list(CATEGORY_ANCHORS.keys())
|
|
201
|
+
self.centroids = self._compute_centroids()
|
|
202
|
+
|
|
203
|
+
def _compute_centroids(self) -> np.ndarray:
|
|
204
|
+
"""embed all anchors per category, average to get centroids."""
|
|
205
|
+
centroids = []
|
|
206
|
+
for cat in self.categories:
|
|
207
|
+
phrases = [f"query: {p}" for p in CATEGORY_ANCHORS[cat]]
|
|
208
|
+
embeddings = self.model.encode(phrases, normalize_embeddings=True)
|
|
209
|
+
centroid = np.mean(embeddings, axis=0)
|
|
210
|
+
centroid = centroid / np.linalg.norm(centroid)
|
|
211
|
+
centroids.append(centroid)
|
|
212
|
+
return np.array(centroids)
|
|
213
|
+
|
|
214
|
+
def classify(self, text: str) -> tuple[str, float, str]:
|
|
215
|
+
"""
|
|
216
|
+
classify input text.
|
|
217
|
+
returns (category, confidence, filler_phrase).
|
|
218
|
+
"""
|
|
219
|
+
embedding = self.model.encode(
|
|
220
|
+
[f"query: {text}"], normalize_embeddings=True
|
|
221
|
+
)[0]
|
|
222
|
+
sims = self.centroids @ embedding
|
|
223
|
+
best_idx = int(np.argmax(sims))
|
|
224
|
+
category = self.categories[best_idx]
|
|
225
|
+
confidence = float(sims[best_idx])
|
|
226
|
+
filler = random.choice(CATEGORY_FILLERS[category])
|
|
227
|
+
return category, confidence, filler
|
|
228
|
+
|
|
229
|
+
def get_filler(self, text: str) -> str:
|
|
230
|
+
"""just returns the filler phrase."""
|
|
231
|
+
_, _, filler = self.classify(text)
|
|
232
|
+
return filler
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: filler-classifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Thai filler word classifier for voice bots - picks the right acknowledgment phrase while LLM thinks
|
|
5
|
+
Author-email: "100x.fi" <kiri@100x.fi>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/100x-fi/filler-classifier
|
|
8
|
+
Project-URL: Repository, https://github.com/100x-fi/filler-classifier
|
|
9
|
+
Keywords: thai,nlp,filler,voice,classifier,embeddings,voice-bot
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: sentence-transformers>=2.0
|
|
23
|
+
Requires-Dist: numpy>=1.20
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# filler-classifier
|
|
27
|
+
|
|
28
|
+
Thai filler word classifier for voice bots. Classifies customer input into categories and returns the appropriate filler phrase to play instantly while the LLM generates a full response.
|
|
29
|
+
|
|
30
|
+
Built for [ingfah.ai](https://ingfah.ai) voice bot but easily adaptable to any Thai voice AI system.
|
|
31
|
+
|
|
32
|
+
## Why
|
|
33
|
+
|
|
34
|
+
Voice bots have a latency problem: the user speaks, ASR transcribes, then the LLM takes 1-3 seconds to respond. Dead silence feels broken. The solution is to play a short filler phrase ("สักครู่นะคะ", "ขออภัยด้วยน่ะคะ") immediately while the LLM thinks.
|
|
35
|
+
|
|
36
|
+
But you can't play the same filler for everything. If someone is angry, "ได้เลยค่ะ" sounds dismissive. If someone asks a question, "ขออภัยด้วยน่ะคะ" makes no sense.
|
|
37
|
+
|
|
38
|
+
This classifier picks the right filler by category.
|
|
39
|
+
|
|
40
|
+
## Categories
|
|
41
|
+
|
|
42
|
+
| Category | When | Example Fillers |
|
|
43
|
+
|---|---|---|
|
|
44
|
+
| `complaint` | Angry, frustrated, profanity, threats | ขออภัยด้วยน่ะคะ |
|
|
45
|
+
| `question` | Asking for info, pricing, how-to | สักครู่นะคะ, ตรวจสอบให้นะคะ |
|
|
46
|
+
| `default` | Greetings, agreements, requests, everything else | รับทราบค่ะ, ได้เลยค่ะ |
|
|
47
|
+
|
|
48
|
+
## How It Works
|
|
49
|
+
|
|
50
|
+
Uses `intfloat/multilingual-e5-small` embeddings with centroid-based cosine similarity:
|
|
51
|
+
|
|
52
|
+
1. Each category has ~30-60 anchor phrases (real Thai customer service examples)
|
|
53
|
+
2. On init, all anchors are embedded and averaged into category centroids
|
|
54
|
+
3. At inference, the input is embedded and compared to centroids via cosine similarity
|
|
55
|
+
4. The closest category wins, and a random filler from that category is returned
|
|
56
|
+
|
|
57
|
+
## Performance
|
|
58
|
+
|
|
59
|
+
- **Accuracy**: 89.6% on 1,000 Thai customer service sentences
|
|
60
|
+
- **Inference**: <10ms per classification (after model load)
|
|
61
|
+
- **Init**: ~200ms for centroid computation
|
|
62
|
+
- **Model size**: ~118MB (multilingual-e5-small)
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install filler-classifier
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from filler_classifier import FillerClassifier
|
|
74
|
+
|
|
75
|
+
# loads the default model (intfloat/multilingual-e5-small) when no model is passed in
|
|
76
|
+
clf = FillerClassifier()
|
|
77
|
+
|
|
78
|
+
# classify and get category + confidence + filler
|
|
79
|
+
category, confidence, filler = clf.classify("อยากถามเรื่องบิลครับ")
|
|
80
|
+
# ("question", 0.872, "สักครู่นะคะ")
|
|
81
|
+
|
|
82
|
+
category, confidence, filler = clf.classify("ใช้งานไม่ได้เลย")
|
|
83
|
+
# ("complaint", 0.891, "ขออภัยด้วยน่ะคะ")
|
|
84
|
+
|
|
85
|
+
category, confidence, filler = clf.classify("ได้ครับ ตกลง")
|
|
86
|
+
# ("default", 0.845, "ได้เลยค่ะ")
|
|
87
|
+
|
|
88
|
+
# or just get the filler phrase directly
|
|
89
|
+
filler = clf.get_filler("มีโปรอะไรบ้างครับ")
|
|
90
|
+
# "ตรวจสอบให้นะคะ"
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Sharing the model
|
|
94
|
+
|
|
95
|
+
If you already have a `SentenceTransformer` instance loaded (e.g., for other tasks), pass it in to avoid loading twice:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from sentence_transformers import SentenceTransformer
|
|
99
|
+
from filler_classifier import FillerClassifier
|
|
100
|
+
|
|
101
|
+
model = SentenceTransformer("intfloat/multilingual-e5-small")
|
|
102
|
+
clf = FillerClassifier(model=model)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Customizing Fillers
|
|
106
|
+
|
|
107
|
+
Override `CATEGORY_FILLERS` to use your own phrases:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
import filler_classifier
|
|
111
|
+
|
|
112
|
+
filler_classifier.CATEGORY_FILLERS["complaint"] = ["ขออภัยค่ะ", "เข้าใจค่ะ"]
|
|
113
|
+
filler_classifier.CATEGORY_FILLERS["question"] = ["รอสักครู่นะคะ"]
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## License
|
|
117
|
+
|
|
118
|
+
MIT
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
filler_classifier/__init__.py
|
|
5
|
+
filler_classifier.egg-info/PKG-INFO
|
|
6
|
+
filler_classifier.egg-info/SOURCES.txt
|
|
7
|
+
filler_classifier.egg-info/dependency_links.txt
|
|
8
|
+
filler_classifier.egg-info/requires.txt
|
|
9
|
+
filler_classifier.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
filler_classifier
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "filler-classifier"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Thai filler word classifier for voice bots - picks the right acknowledgment phrase while LLM thinks"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "100x.fi", email = "kiri@100x.fi"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["thai", "nlp", "filler", "voice", "classifier", "embeddings", "voice-bot"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
"Topic :: Text Processing :: Linguistic",
|
|
26
|
+
]
|
|
27
|
+
dependencies = [
|
|
28
|
+
"sentence-transformers>=2.0",
|
|
29
|
+
"numpy>=1.20",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/100x-fi/filler-classifier"
|
|
34
|
+
Repository = "https://github.com/100x-fi/filler-classifier"
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.packages.find]
|
|
37
|
+
include = ["filler_classifier*"]
|