thai_keyboard_corrector 0.1.1 → 0.2.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/thai_keyboard_corrector/detector.rb +10 -44
- data/lib/thai_keyboard_corrector/mapping.rb +27 -3
- data/lib/thai_keyboard_corrector/version.rb +1 -1
- data/lib/thai_keyboard_corrector.rb +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: adf3f3e1587f08281375e2d9971af347e33f9d136ee36c9df2b94895174bdb80
|
|
4
|
+
data.tar.gz: c3585823ef59f640c80d80fa906a4ade237874c237ee898c7fb0da9235b9b875
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3edf85c588c050020ce4a087466067cd83576ba2de3922f8121529f215243cba63defe98845f0a1997c47187ab3e4cb504cd711692d70094de37aeb1c36df8bd
|
|
7
|
+
data.tar.gz: a6d8ce0cbc345143393ba0bbcb841d69fdd17eb376e2cf415815d1be7c76c635fa2ac532001e94d894f56921a968796d5ed07aff444f64564054b0d12a25522d
|
data/README.md
CHANGED
|
data/lib/thai_keyboard_corrector/detector.rb
CHANGED
|
@@ -13,61 +13,27 @@ module ThaiKeyboardCorrector
|
|
|
13
13
|
# Returns :thai_in_en, :en_in_th, :thai, :en, :mixed, :unknown
|
|
14
14
|
def detect_layout(str)
|
|
15
15
|
clean = str.strip
|
|
16
|
-
|
|
16
|
+
clean_no_ws = clean.gsub(/\s+/, '') # strip ALL whitespace
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
return :unknown if thai_cnt.zero? && latin_cnt.zero?
|
|
18
|
+
return :unknown if clean_no_ws.empty?
|
|
20
19
|
|
|
21
|
-
#
|
|
22
|
-
if thai_cnt.zero?
|
|
23
|
-
# ▼ Treat 1-3-letter words as Thai-in-EN (they’re almost never real English)
|
|
24
|
-
return :thai_in_en if clean.length <= 3 &&
|
|
25
|
-
hit_ratio(clean, Mapping::ENG_TO_THAI) >= FULL_HIT
|
|
20
|
+
thai_cnt, latin_cnt = char_stats(clean) # char_stats already ignores ws
|
|
26
21
|
|
|
27
|
-
|
|
28
|
-
|
|
22
|
+
# Majority-vote rule ----------------------------------------------
|
|
23
|
+
return :en_in_th if thai_cnt > latin_cnt # mostly Thai glyphs
|
|
24
|
+
return :thai_in_en if latin_cnt > thai_cnt # mostly Latin letters
|
|
29
25
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
# ---------- pure-Thai ----------
|
|
34
|
-
if latin_cnt.zero?
|
|
35
|
-
return :thai if clean.length < 4 # ignore tiny words like “ดี”
|
|
36
|
-
|
|
37
|
-
eng = Mapping.map_thai_to_eng(clean)
|
|
38
|
-
vowelish = eng.count(VOWELS).positive?
|
|
39
|
-
if eng.match?(/\A[a-z]+\z/i) && vowelish &&
|
|
40
|
-
hit_ratio(clean, Mapping::THAI_TO_ENG) >= THRESHOLD
|
|
41
|
-
return :en_in_th
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
return :thai
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# ---------- mixed ----------
|
|
48
|
-
return :mixed if thai_cnt.positive? && latin_cnt.positive?
|
|
49
|
-
return :thai_in_en if hit_ratio(clean, Mapping::ENG_TO_THAI) >= THRESHOLD
|
|
50
|
-
return :en_in_th if hit_ratio(clean, Mapping::THAI_TO_ENG) >= THRESHOLD
|
|
51
|
-
|
|
52
|
-
:mixed
|
|
26
|
+
# If counts are equal (or zero) we can’t be sure
|
|
27
|
+
thai_cnt.zero? && latin_cnt.zero? ? :unknown : :mixed
|
|
53
28
|
end
|
|
54
29
|
|
|
55
30
|
# helpers ----------------------------------------------------------------
|
|
56
31
|
def char_stats(str)
|
|
57
|
-
clean = str.gsub(/\s+/, '')
|
|
32
|
+
clean = str.gsub(/\s+/, '')
|
|
58
33
|
thai = clean.each_char.count { |c| THAI_RANGE.include?(c.ord) }
|
|
59
|
-
latin = clean.each_char.count { |c| c
|
|
34
|
+
latin = clean.each_char.count { |c| c.match?(/[A-Za-z]/) }
|
|
60
35
|
[thai, latin]
|
|
61
36
|
end
|
|
62
37
|
private_class_method :char_stats
|
|
63
|
-
|
|
64
|
-
def hit_ratio(str, table)
|
|
65
|
-
chars = str.gsub(/\s+/, '').chars # whitespace-free array
|
|
66
|
-
return 0.0 if chars.empty?
|
|
67
|
-
|
|
68
|
-
hits = chars.count { |c| table.key?(c) }
|
|
69
|
-
hits.to_f / chars.length
|
|
70
|
-
end
|
|
71
|
-
private_class_method :hit_ratio
|
|
72
38
|
end
|
|
73
39
|
end
|
data/lib/thai_keyboard_corrector/mapping.rb
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
module ThaiKeyboardCorrector
|
|
4
4
|
# Mapping module provides methods to convert between English and Thai characters
|
|
5
|
-
module
|
|
6
|
-
# Base
|
|
5
|
+
module Mapping
|
|
6
|
+
# 1. Base map – lower-case EN → Thai Kedmanee
|
|
7
7
|
BASE = {
|
|
8
8
|
'q' => 'ๆ', 'w' => 'ไ', 'e' => 'ำ', 'r' => 'พ', 't' => 'ะ',
|
|
9
9
|
'y' => 'ั', 'u' => 'ี', 'i' => 'ร', 'o' => 'น', 'p' => 'ย',
|
|
@@ -14,8 +14,32 @@ module ThaiKeyboardCorrector
|
|
|
14
14
|
'n' => 'ื', 'm' => 'ท', ',' => 'ม', '.' => 'ใ', '/' => 'ฝ'
|
|
15
15
|
}.freeze
|
|
16
16
|
|
|
17
|
+
# 2. Shift-layer Thai glyphs → underlying EN key
|
|
18
|
+
SHIFT = {
|
|
19
|
+
# ── Number row (Shift+1 … Shift+0) – Thai digits
|
|
20
|
+
'๑' => '1', '๒' => '2', '๓' => '3', '๔' => '4', '๕' => '5',
|
|
21
|
+
'๖' => '6', '๗' => '7', '๘' => '8', '๙' => '9', '๐' => '0',
|
|
22
|
+
|
|
23
|
+
# ── Top-letter row (Q–P)
|
|
24
|
+
'ฃ' => 'w', 'ฅ' => 'e', 'ฆ' => 'r', 'ฑ' => 't', 'ํ' => 'y',
|
|
25
|
+
'ฐ' => 'u', 'ณ' => 'i', 'ญ' => 'o', 'ธ' => 't', # (Shift+T duplicate)
|
|
26
|
+
|
|
27
|
+
# ── Home row (A–L)
|
|
28
|
+
'ฤ' => 'a', 'ฦ' => 's', 'ฌ' => 'h', 'ศ' => 'l', 'ษ' => ';', 'ฮ' => "'",
|
|
29
|
+
|
|
30
|
+
# ── Bottom row (Z–/)
|
|
31
|
+
'ฒ' => 'z', 'ฬ' => 'x',
|
|
32
|
+
'ฯ' => 'm', # Thai paiyannoi
|
|
33
|
+
'฿' => '.', # Baht sign (Shift+.)
|
|
34
|
+
'๏' => '/', # Thai “head mark”
|
|
35
|
+
|
|
36
|
+
# Already mapped earlier (duplicates kept for clarity—harmless):
|
|
37
|
+
'๛' => ',' # end-paragraph mark
|
|
38
|
+
}.freeze
|
|
39
|
+
|
|
40
|
+
# 3. Final maps (frozen once, never mutated)
|
|
17
41
|
ENG_TO_THAI = BASE.merge(BASE.transform_keys(&:upcase)).freeze
|
|
18
|
-
THAI_TO_ENG = BASE.invert.freeze
|
|
42
|
+
THAI_TO_ENG = BASE.invert.merge(SHIFT).freeze
|
|
19
43
|
|
|
20
44
|
module_function
|
|
21
45
|
|
data/lib/thai_keyboard_corrector.rb
CHANGED
|
@@ -13,11 +13,11 @@ module ThaiKeyboardCorrector
|
|
|
13
13
|
# @return [String] corrected or original string
|
|
14
14
|
def correct(str)
|
|
15
15
|
case detect_layout(str)
|
|
16
|
-
when :thai_in_en
|
|
16
|
+
when :thai_in_en # mostly Latin → convert EN→TH
|
|
17
17
|
Mapping.map_eng_to_thai(str)
|
|
18
|
-
when :en_in_th
|
|
18
|
+
when :en_in_th # mostly Thai → convert TH→EN
|
|
19
19
|
Mapping.map_thai_to_eng(str)
|
|
20
|
-
else
|
|
20
|
+
else # :thai, :en, :mixed, :unknown
|
|
21
21
|
str
|
|
22
22
|
end
|
|
23
23
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: thai_keyboard_corrector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chayut Orapinpatipat
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-06-
|
|
11
|
+
date: 2025-06-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rspec
|