thailang4r 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.md +17 -0
- data/Rakefile +12 -0
- data/data/tdict-std.txt +15371 -0
- data/data/test_dict.txt +8 -0
- data/lib/thailang4r.rb +42 -0
- data/lib/thailang4r/dict.rb +89 -0
- data/lib/thailang4r/ranges_builder.rb +90 -0
- data/lib/thailang4r/word_breaker.rb +29 -0
- data/lib/thailang4r/word_dag_builder.rb +78 -0
- metadata +71 -0
data/data/test_dict.txt
ADDED
data/lib/thailang4r.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# Helpers that classify characters of the Thai Unicode block by the vertical
# "level" at which they are drawn relative to the base line.
#
# Levels returned by the classification methods:
#   1   - characters that occupy the base line (U+0E01..U+0E30, U+0E32..U+0E33,
#         U+0E3F..U+0E46, U+0E4F..U+0E5B)
#   -1  - marks placed below the base character (U+0E38..U+0E3A)
#   2   - marks placed above the base character (U+0E31, U+0E34..U+0E37,
#         U+0E47, U+0E4C..U+0E4E)
#   3   - tone marks (U+0E48..U+0E4B)
#   nil - anything else, including every non-Thai character
module ThaiLang
  # Returns an Array holding the level of each character of +s+, in order.
  def ThaiLang.string_chlevel(s)
    s.each_char.map { |ch| chlevel(ch) }
  end

  # Returns the level of the single character +ch+ (see the module comment).
  def ThaiLang.chlevel(ch)
    _chlevel(ch.ord)
  end

  # Returns the level for the Unicode code point +code+, or nil when the
  # code point is not a classified Thai character.
  def ThaiLang._chlevel(code)
    case code
    when 0x0E01..0x0E30, 0x0E32..0x0E33, 0x0E3F..0x0E46, 0x0E4F..0x0E5B
      1
    when 0x0E38..0x0E3A
      -1
    # U+0E47 (MAITAIKHU) is an above-base mark like its neighbours; it used to
    # fall through every range and was reported as unclassified.
    when 0x0E31, 0x0E34..0x0E37, 0x0E47, 0x0E4C..0x0E4E
      2
    when 0x0E48..0x0E4B
      3
    end
  end

  # Returns a copy of +s+ with every above/below mark and tone mark removed.
  # Base-line Thai characters (level 1) and unclassified characters (nil,
  # e.g. Latin letters, digits, spaces) are kept.
  def ThaiLang.exclude_thai_lower_upper(s)
    kept = s.each_char.select do |ch|
      level = chlevel(ch)
      level.nil? || level == 1
    end
    kept.join('')
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module ThaiLang
|
2
|
+
# In-memory word list loaded from a text file with one word per line.
#
# Lookups use binary search on a single character position, so the list must
# be sorted in ascending String order for the results to be meaningful.
class Dict
  # file_path - path of the dictionary file to load.
  def initialize(file_path)
    load_dict(file_path)
  end

  # Reads every line of +file_path+ (line terminators stripped) into the
  # word list, replacing any previously loaded list.
  #
  # The file is read explicitly as UTF-8. Relying on the locale's default
  # external encoding makes the words come back as US-ASCII under a C/POSIX
  # locale, which breaks character indexing and comparison against UTF-8 input.
  def load_dict(file_path)
    File.open(file_path, 'r:UTF-8') do |f|
      @str_list = f.readlines.map { |line| line.chomp }
    end
  end

  # Index of the first word in [s, e) whose character at +offset+ equals
  # +prefix+, or nil when there is none. See #find_index_of_needle.
  def find_first_index_of_needle(prefix, offset = nil, s = nil, e = nil)
    find_index_of_needle(:FIRST, prefix, offset, s, e)
  end

  # Index of the last word in [s, e) whose character at +offset+ equals
  # +prefix+, or nil when there is none. See #find_index_of_needle.
  def find_last_index_of_needle(prefix, offset = nil, s = nil, e = nil)
    find_index_of_needle(:LAST, prefix, offset, s, e)
  end

  # Binary-searches the words with index in [s, e) for one whose character at
  # position +offset+ equals +prefix+.
  #
  # pos_type - :FIRST to get the lowest matching index; any other value gives
  #            the highest matching index.
  # prefix   - the single character to look for.
  # offset   - character position inspected in every word (default 0).
  # s, e     - half-open index range to search (default: the whole list).
  #
  # Returns the matching index, or nil when no word in the range matches.
  def find_index_of_needle(pos_type, prefix, offset = nil, s = nil, e = nil)
    offset ||= 0
    s ||= 0
    e ||= @str_list.length

    l = s
    r = e - 1
    ans = nil

    while l <= r
      m = (l + r) / 2
      ch = @str_list[m][offset]
      # A word too short to have a character at +offset+ sorts before every
      # longer word sharing its prefix, so the search continues to the right.
      if ch.nil? || prefix > ch
        l = m + 1
      elsif prefix < ch
        r = m - 1
      else
        ans = m
        # Keep narrowing towards the requested end of the run of matches.
        if pos_type == :FIRST
          r = m - 1
        else
          l = m + 1
        end
      end
    end

    ans
  end

  # Number of words loaded.
  def size
    @str_list.length
  end

  # The word stored at index +i+.
  def [](i)
    @str_list[i]
  end
end
|
58
|
+
|
59
|
+
# Cursor that walks a dictionary one character at a time, narrowing the range
# of candidate words after every accepted character.
#
# The dictionary object must respond to #size, #[], and the
# #find_first_index_of_needle / #find_last_index_of_needle lookups.
class DictIter
  # dict - the dictionary to walk; the cursor starts on the full word range.
  def initialize(dict)
    @dict = dict
    @s = 0
    @e = @dict.size
    @offset = 0
    @state = :ACTIVE
  end

  # Feeds the next character +ch+ to the cursor and returns the new state:
  #
  # :INVALID         - no candidate word continues with +ch+; the cursor stays
  #                    in this state for every later call.
  # :ACTIVE_BOUNDARY - the characters fed so far have the same length as the
  #                    first remaining candidate word.
  # :ACTIVE          - candidates remain, but the first one is longer than the
  #                    characters fed so far.
  def walk(ch)
    return @state if @state == :INVALID

    lowest = @dict.find_first_index_of_needle(ch, @offset, @s, @e)
    if lowest.nil?
      @state = :INVALID
      return @state
    end

    # Shrink the candidate window to the words that carry +ch+ at @offset.
    @s = lowest
    highest = @dict.find_last_index_of_needle(ch, @offset, @s, @e)
    @e = highest + 1
    @offset += 1

    @state = @dict[lowest].length == @offset ? :ACTIVE_BOUNDARY : :ACTIVE
  end
end
|
89
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module ThaiLang
|
2
|
+
# Chooses one segmentation of a string from a DAG of candidate ranges.
#
# A DAG entry is an Array [start, end, link_type]. The result is a list of
# such entries covering 0..len contiguously; stretches that no candidate
# covers are emitted with the link type :UNK.
class RangesBuilder
  # Field positions inside a DAG range [start, end, link_type].
  S = 0
  E = 1
  LINK_TYPE = 2

  # Field positions inside a path entry
  # [previous_position, word_count, unknown_count, link_type].
  POINTER = 0
  WEIGHT = 1
  PATH_UNK = 2
  PATH_LINK_TYPE = 3

  # Groups the ranges of +dag+ by the value of their field +pos+.
  # Returns a Hash mapping that value to the Array of ranges carrying it,
  # in the order they appear in +dag+.
  def _build_index(dag, pos)
    index = {}
    dag.each do |range|
      (index[range[pos]] ||= []) << range
    end
    index
  end

  # Index of the ranges of +dag+ keyed by end position.
  def _build_e_index(dag)
    _build_index(dag, E)
  end

  # Index of the ranges of +dag+ keyed by start position.
  def _build_s_index(dag)
    _build_index(dag, S)
  end

  # Returns true when path entry +a+ is strictly better than +b+: fewer
  # unknown segments first, and for an equal number of unknown segments,
  # fewer segments overall.
  #
  # The comparison used to demand an improvement on both counts at once, so a
  # segmentation with fewer words never replaced the first one found when the
  # unknown counts were tied.
  def _compare_path_info(a, b)
    return a[PATH_UNK] < b[PATH_UNK] if a[PATH_UNK] != b[PATH_UNK]
    a[WEIGHT] < b[WEIGHT]
  end

  # Builds the path entry for an unknown segment starting at +left_boundary+.
  def _unknown_info(path, left_boundary)
    origin = path[left_boundary]
    [left_boundary, origin[WEIGHT] + 1, origin[PATH_UNK] + 1, :UNK]
  end

  # Computes, for every position 0..len, the best way to reach it.
  #
  # Returns an Array of len + 1 slots. A slot is nil when the position is
  # not a segment boundary, otherwise a path entry (see the constants above).
  def _build_path(len, s_index, e_index)
    path = Array.new(len + 1)
    path[0] = [0, 0, 0, :UNK]
    # Right-most position reached so far through a candidate range.
    left_boundary = 0

    (1..len).each do |i|
      if e_index.has_key?(i)
        e_index[i].each do |range|
          origin = path[range[S]]
          next if origin.nil?

          info = [range[S], origin[WEIGHT] + 1, origin[PATH_UNK], range[LINK_TYPE]]
          path[i] = info if path[i].nil? || _compare_path_info(info, path[i])
        end
        left_boundary = i unless path[i].nil?
      end

      # No candidate ends here but one starts here: close the gap since the
      # last reached boundary with an unknown segment.
      if path[i].nil? && s_index.has_key?(i)
        path[i] = _unknown_info(path, left_boundary)
      end
    end

    # The end of the string must always be reachable.
    path[len] = _unknown_info(path, left_boundary) if path[len].nil?
    path
  end

  # Follows the back pointers of +path+ from +len+ down to 0 and returns the
  # visited segments as [start, end, link_type] entries in left-to-right order.
  def _path_to_ranges(path, len)
    ranges = []
    i = len
    while i > 0
      info = path[i]
      s = info[POINTER]
      ranges << [s, i, info[PATH_LINK_TYPE]]
      i = s
    end
    ranges.reverse
  end

  # Returns the selected segmentation of a string of length +len+ given the
  # candidate ranges in +dag+.
  def build_from_dag(dag, len)
    s_index = _build_s_index(dag)
    e_index = _build_e_index(dag)
    path = _build_path(len, s_index, e_index)
    _path_to_ranges(path, len)
  end
end
|
89
|
+
|
90
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'thailang4r/dict.rb'
|
3
|
+
require 'thailang4r/word_dag_builder.rb'
|
4
|
+
require 'thailang4r/ranges_builder.rb'
|
5
|
+
|
6
|
+
module ThaiLang
|
7
|
+
# Splits a string into words using a dictionary file.
class WordBreaker

  # Field positions inside a range [start, end, ...].
  S = 0
  E = 1

  # path - dictionary file to load; when nil, the tdict-std.txt file of the
  #        data directory three levels above this source file is used.
  def initialize(path = nil)
    if path.nil?
      # The resolved path is no longer printed: a library must not write
      # debugging output to the caller's stdout.
      path = File.expand_path('../../../data/tdict-std.txt', __FILE__)
    end
    @dict = Dict.new path
    @dag_builder = WordDagBuilder.new @dict
    @ranges_builder = RangesBuilder.new
  end

  # Returns the Array of substrings of +string+ selected by the ranges
  # builder, in left-to-right order.
  def break_into_words(string)
    len = string.length
    dag = @dag_builder.build(string, len)
    ranges = @ranges_builder.build_from_dag(dag, len)
    ranges.map { |range| string[range[S], range[E] - range[S]] }
  end
end
|
29
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module ThaiLang
|
2
|
+
# Builds the DAG of candidate word ranges for a string.
#
# Every DAG entry is an Array [start, end, type] describing the half-open
# character range start...end of the string.
class WordDagBuilder
  # dict - the dictionary handed to each DictIter created while scanning.
  def initialize(dict)
    @dict = dict
  end

  # Returns the candidate ranges found in the first +len+ characters of
  # +string+, sorted by start, then end, then type.
  def build(string, len)
    dag = []
    _build_by_dict(dag, string, len)
    #_build_by_latin_rule(dag, string, len)
    # Array#sort compares the entries element by element, i.e. by start,
    # then end, then type.
    dag.sort
  end

  # Appends to +dag+ a [i, e, :SPACE] entry for every run of spaces and a
  # [i, e, :LATIN] entry for every run of ASCII letters.
  #
  # Space runs are reported from every position inside the run; Latin runs
  # are reported only from the position where the run begins.
  #
  # NOTE: #build does not call this method at present (the call is commented
  # out there).
  def _build_by_latin_rule(dag, string, len)
    # End of the most recently reported Latin run; positions before it are
    # inside that run and must not start a new one.
    next_latin = 0
    (0...len).each do |i|
      space_e = nil
      latin_e = nil
      space_break = false
      latin_break = false

      (i...len).each do |j|
        break if space_break && latin_break

        ch = string[j]
        unless space_break
          if ch == " "
            space_e = j + 1
          else
            space_break = true
          end
        end

        # Extend the Latin run while it is still unbroken. This used to test
        # `latin_break` (never true at this point) against the pattern
        # /A-Za-z/, which matches the literal text "A-Za-z", so no Latin run
        # was ever detected.
        if !latin_break && j >= next_latin
          if ch =~ /[A-Za-z]/
            latin_e = j + 1
          else
            latin_break = true
          end
        end
      end

      dag << [i, space_e, :SPACE] unless space_e.nil?
      unless latin_e.nil?
        dag << [i, latin_e, :LATIN]
        next_latin = latin_e
      end
    end
  end

  # Appends to +dag+ a [i, j + 1, :DICT] entry for every range i..j of the
  # string on which a fresh DictIter reports :ACTIVE_BOUNDARY.
  def _build_by_dict(dag, string, len)
    (0...len).each do |i|
      iter = DictIter.new @dict
      (i...len).each do |j|
        status = iter.walk string[j]
        break if status == :INVALID
        dag << [i, j + 1, :DICT] if status == :ACTIVE_BOUNDARY
      end
    end
  end
end
|
78
|
+
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: thailang4r
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Vee Satayamas
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-06-08 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: cucumber
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.2.1
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.2.1
|
30
|
+
description: Thai language utility for Ruby
|
31
|
+
email:
|
32
|
+
- v.satayamas@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- lib/thailang4r.rb
|
38
|
+
- lib/thailang4r/word_dag_builder.rb
|
39
|
+
- lib/thailang4r/dict.rb
|
40
|
+
- lib/thailang4r/ranges_builder.rb
|
41
|
+
- lib/thailang4r/word_breaker.rb
|
42
|
+
- LICENSE
|
43
|
+
- README.md
|
44
|
+
- Rakefile
|
45
|
+
- data/test_dict.txt
|
46
|
+
- data/tdict-std.txt
|
47
|
+
homepage: https://github.com/veer66/thailang4r
|
48
|
+
licenses: []
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
require_paths:
|
52
|
+
- lib
|
53
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ! '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 1.9.3
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ! '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '0'
|
65
|
+
requirements: []
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.8.25
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Thai language utility for Ruby
|
71
|
+
test_files: []
|