thailang4r 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.md +17 -0
- data/Rakefile +12 -0
- data/data/tdict-std.txt +15371 -0
- data/data/test_dict.txt +8 -0
- data/lib/thailang4r.rb +42 -0
- data/lib/thailang4r/dict.rb +89 -0
- data/lib/thailang4r/ranges_builder.rb +90 -0
- data/lib/thailang4r/word_breaker.rb +29 -0
- data/lib/thailang4r/word_dag_builder.rb +78 -0
- metadata +71 -0
data/data/test_dict.txt
ADDED
data/lib/thailang4r.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# Character-level helpers for Thai text.
#
# Every character is assigned a "level" by ThaiLang._chlevel:
#    1  -> characters in the ranges U+0E01..U+0E30, U+0E32..U+0E33,
#          U+0E3F..U+0E46 and U+0E4F..U+0E5B
#   -1  -> U+0E38..U+0E3A (marks written below the base character)
#    2  -> U+0E31, U+0E34..U+0E37, U+0E47 and U+0E4C..U+0E4E
#          (vowels and marks written above the base character)
#    3  -> U+0E48..U+0E4B (tone marks)
#   nil -> every other code point (e.g. Latin letters, digits, spaces)
module ThaiLang
  # Returns an Array holding the level of each character of +s+, in order.
  def ThaiLang.string_chlevel(s)
    s.each_char.map { |ch| chlevel(ch) }
  end

  # Returns the level of the single-character string +ch+ (see _chlevel).
  def ThaiLang.chlevel(ch)
    _chlevel(ch.ord)
  end

  # Maps the Unicode code point +code+ to its level, or nil when the code
  # point is not covered by any of the ranges listed on the module.
  def ThaiLang._chlevel(code)
    if (code >= 0x0E01 && code <= 0x0E30) ||
       (code >= 0x0E32 && code <= 0x0E33) ||
       (code >= 0x0E3F && code <= 0x0E46) ||
       (code >= 0x0E4F && code <= 0x0E5B)
      1
    elsif code >= 0x0E38 && code <= 0x0E3A
      -1
    elsif code == 0x0E31 ||
          (code >= 0x0E34 && code <= 0x0E37) ||
          # U+0E47 (MAITAIKHU) sits above the base character like the other
          # level-2 marks; it used to fall through and come back as nil.
          code == 0x0E47 ||
          (code >= 0x0E4C && code <= 0x0E4E)
      2
    elsif code >= 0x0E48 && code <= 0x0E4B
      3
    end
  end

  # Returns a copy of +s+ that keeps only the characters whose level is 1
  # or nil, i.e. drops every character classified as level -1, 2 or 3.
  def ThaiLang.exclude_thai_lower_upper(s)
    kept = s.each_char.select do |ch|
      level = chlevel(ch) # classify once instead of twice per character
      level.nil? || level == 1
    end
    kept.join('')
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module ThaiLang
|
2
|
+
# Word list loaded from a text file with one entry per line.
#
# Lookups use binary search on a single character position, so the entries
# are expected to already be sorted (the search itself does not sort them).
class Dict
  # file_path - path of the dictionary file to load.
  def initialize(file_path)
    load_dict(file_path)
  end

  # Reads every line of +file_path+ (without its line terminator) into the
  # internal list.
  #
  # The file is decoded explicitly as UTF-8. Relying on the process default
  # external encoding would make String#[] index bytes instead of characters
  # under a non-UTF-8 locale, which breaks the per-character lookups below.
  def load_dict(file_path)
    File.open(file_path, 'r:UTF-8') do |f|
      @str_list = f.readlines.map { |line| line.chomp }
    end
  end

  # Index of the first entry in [s, e) whose character at +offset+ equals
  # +prefix+, or nil when there is none.
  def find_first_index_of_needle(prefix, offset = nil, s = nil, e = nil)
    find_index_of_needle(:FIRST, prefix, offset, s, e)
  end

  # Index of the last entry in [s, e) whose character at +offset+ equals
  # +prefix+, or nil when there is none.
  def find_last_index_of_needle(prefix, offset = nil, s = nil, e = nil)
    find_index_of_needle(:LAST, prefix, offset, s, e)
  end

  # Binary search over the entries with index in [s, e).
  #
  # pos_type - :FIRST to get the lowest matching index, anything else to get
  #            the highest one.
  # prefix   - the single character to look for.
  # offset   - character position inspected in each entry (default 0).
  # s, e     - half-open index range to search (default: the whole list).
  #
  # Returns the matching index or nil.
  def find_index_of_needle(pos_type, prefix, offset = nil, s = nil, e = nil)
    offset = 0 if offset.nil?
    lo = s.nil? ? 0 : s
    hi = (e.nil? ? @str_list.length : e) - 1
    found = nil

    while lo <= hi
      mid = (lo + hi) / 2
      ch = @str_list[mid][offset]
      if ch.nil? || prefix > ch
        # An entry too short to have a character at +offset+ is treated as
        # smaller than the needle, so the search continues to the right.
        lo = mid + 1
      elsif prefix < ch
        hi = mid - 1
      else
        found = mid
        # Keep narrowing towards the requested end of the run of matches.
        if pos_type == :FIRST
          hi = mid - 1
        else
          lo = mid + 1
        end
      end
    end

    found
  end

  # Number of entries.
  def size
    @str_list.length
  end

  # Entry at index +i+.
  def [](i)
    @str_list[i]
  end
end
|
58
|
+
|
59
|
+
# Incremental prefix matcher over a dictionary object.
#
# Each call to #walk consumes one more character and narrows the window
# [@s, @e) of dictionary indices whose entries still match everything
# consumed so far.
class DictIter
  # dict - object responding to #size, #[], #find_first_index_of_needle and
  #        #find_last_index_of_needle.
  def initialize(dict)
    @dict = dict
    @s = 0
    @e = @dict.size
    @offset = 0
    @state = :ACTIVE
  end

  # Feeds the character +ch+ to the matcher and returns the new state:
  #   :INVALID         - no entry in the window has +ch+ at this position;
  #                      once reached, every later call returns :INVALID too.
  #   :ACTIVE_BOUNDARY - the first entry of the narrowed window is exactly as
  #                      long as the number of characters consumed.
  #   :ACTIVE          - entries still match, but the first one is longer.
  def walk(ch)
    return @state if @state == :INVALID

    lowest = @dict.find_first_index_of_needle(ch, @offset, @s, @e)
    if lowest.nil?
      @state = :INVALID
      return @state
    end

    # Shrink the window to the run of entries that have +ch+ at @offset.
    @s = lowest
    highest = @dict.find_last_index_of_needle(ch, @offset, @s, @e)
    @e = highest + 1

    @offset += 1
    @state = (@offset == @dict[lowest].length) ? :ACTIVE_BOUNDARY : :ACTIVE
    @state
  end
end
|
89
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module ThaiLang
|
2
|
+
# Picks one chain of ranges covering positions 0..len out of a DAG of
# candidate ranges.
#
# A DAG edge ("range") is an Array [start, end, link_type]; the S, E and
# LINK_TYPE constants are the indices into it.
#
# While searching, every reachable position holds a "path info" Array
# [pointer, weight, unknown_count, link_type] (indices POINTER, WEIGHT,
# PATH_UNK and PATH_LINK_TYPE): the previous position on the chosen path,
# the number of ranges used to get here, how many of those are :UNK
# fillers, and the type of the last range.
class RangesBuilder
  S = 0
  E = 1
  LINK_TYPE = 2

  POINTER = 0
  WEIGHT = 1
  PATH_UNK = 2
  PATH_LINK_TYPE = 3

  # Groups the ranges of +dag+ by the value stored at index +pos+.
  # Returns a Hash mapping that value to the Array of ranges, in DAG order.
  def _build_index(dag, pos)
    index = {}
    dag.each do |range|
      (index[range[pos]] ||= []) << range
    end
    index
  end

  # Ranges of +dag+ grouped by end position.
  def _build_e_index(dag)
    _build_index(dag, E)
  end

  # Ranges of +dag+ grouped by start position.
  def _build_s_index(dag)
    _build_index(dag, S)
  end

  # True when path info +a+ is strictly better than +b+: fewer unknown
  # ranges wins, and with the same number of unknowns the path made of
  # fewer ranges wins.
  #
  # The previous test required BOTH numbers to be strictly smaller, so a
  # candidate with the same unknown count but fewer ranges (or fewer
  # unknowns and equal weight) never replaced the current path.
  def _compare_path_info(a, b)
    return a[PATH_UNK] < b[PATH_UNK] if a[PATH_UNK] != b[PATH_UNK]
    a[WEIGHT] < b[WEIGHT]
  end

  # Path info for an :UNK range running from +left_boundary+ to the
  # position currently being filled.
  def _unknown_path_info(path, left_boundary)
    [left_boundary,
     path[left_boundary][WEIGHT] + 1,
     path[left_boundary][PATH_UNK] + 1,
     :UNK]
  end

  # Fills the path table for positions 0..len.
  #
  # len     - number of positions to cover.
  # s_index - ranges grouped by start position.
  # e_index - ranges grouped by end position.
  #
  # Returns an Array of len + 1 slots holding a path info or nil
  # (nil = position not reachable).
  def _build_path(len, s_index, e_index)
    path = Array.new(len + 1)
    path[0] = [0, 0, 0, :UNK]
    # Last position that was reached by a range ending there; :UNK fillers
    # always start from it.
    left_boundary = 0

    (1..len).each do |i|
      if e_index.has_key?(i)
        e_index[i].each do |range|
          s = range[S]
          next if path[s].nil? # range starts at an unreachable position
          info = [s, path[s][WEIGHT] + 1, path[s][PATH_UNK], range[LINK_TYPE]]
          path[i] = info if path[i].nil? || _compare_path_info(info, path[i])
        end
        left_boundary = i unless path[i].nil?
      end

      # Nothing ends here but some range starts here: bridge the gap with an
      # :UNK range so that the range starting here becomes usable.
      if path[i].nil? && s_index.has_key?(i)
        path[i] = _unknown_path_info(path, left_boundary)
      end
    end

    # Make sure the final position is always reachable.
    path[len] = _unknown_path_info(path, left_boundary) if path[len].nil?
    path
  end

  # Walks the back pointers from +len+ to 0 and returns the chosen ranges
  # as [start, end, link_type] Arrays in left-to-right order.
  def _path_to_ranges(path, len)
    ranges = []
    i = len
    while i > 0
      info = path[i]
      s = info[POINTER]
      ranges << [s, i, info[PATH_LINK_TYPE]]
      i = s
    end
    ranges.reverse
  end

  # Returns the selected ranges covering 0..len for the given +dag+.
  def build_from_dag(dag, len)
    s_index = _build_s_index(dag)
    e_index = _build_e_index(dag)
    path = _build_path(len, s_index, e_index)
    _path_to_ranges(path, len)
  end
end
|
89
|
+
|
90
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'thailang4r/dict.rb'
|
3
|
+
require 'thailang4r/word_dag_builder.rb'
|
4
|
+
require 'thailang4r/ranges_builder.rb'
|
5
|
+
|
6
|
+
module ThaiLang
|
7
|
+
# Splits a string into words using a dictionary file.
class WordBreaker
  # Indices of the start and end positions inside a range Array.
  S = 0
  E = 1

  # path - path of the dictionary file; when nil, data/tdict-std.txt located
  #        three directory levels above this source file is used.
  #
  # The constructor no longer prints the resolved path: writing to stdout
  # from library code pollutes the output of every program that uses it.
  def initialize(path = nil)
    if path.nil?
      path = File.expand_path('../../../data/tdict-std.txt', __FILE__)
    end
    @dict = Dict.new path
    @dag_builder = WordDagBuilder.new @dict
    @ranges_builder = RangesBuilder.new
  end

  # Returns an Array with the pieces of +string+, one per range selected by
  # the ranges builder from the DAG built for the string.
  def break_into_words(string)
    len = string.length
    dag = @dag_builder.build(string, len)
    ranges = @ranges_builder.build_from_dag(dag, len)
    ranges.map { |range| string[range[S], range[E] - range[S]] }
  end
end
|
29
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module ThaiLang
|
2
|
+
# Builds the DAG of candidate ranges for a string. Every edge is an Array
# [start, end, type] where start/end are character positions.
class WordDagBuilder
  # dict - dictionary object handed to DictIter for prefix matching.
  def initialize(dict)
    @dict = dict
  end

  # Returns the edges found for the first +len+ characters of +string+,
  # sorted by start, then end, then type.
  #
  # Only the dictionary rule is applied; the Latin/space rule is left
  # disabled exactly as before.
  def build(string, len)
    dag = []
    _build_by_dict(dag, string, len)
    #_build_by_latin_rule(dag, string, len)
    # Array#<=> already compares element by element, which is what the
    # hand-written three-step comparator did.
    dag.sort
  end

  # Appends to +dag+ a :SPACE edge for every run of spaces starting at each
  # position, and a :LATIN edge for every run of ASCII letters, then
  # returns +dag+.
  #
  # +next_latin+ remembers where the last Latin run ended so that positions
  # inside that run do not produce shorter duplicate edges.
  #
  # Two defects were fixed here: the letter test used /A-Za-z/ (the literal
  # text "A-Za-z") instead of a character class, and the Latin branch was
  # guarded by `latin_break` instead of `not latin_break`, so it never ran.
  def _build_by_latin_rule(dag, string, len)
    next_latin = 0
    (0...len).each do |i|
      space_e = nil
      latin_e = nil
      space_break = false
      latin_break = false

      (i...len).each do |j|
        break if space_break && latin_break # both runs are over
        ch = string[j]

        unless space_break
          if ch == " "
            space_e = j + 1
          else
            space_break = true
          end
        end

        if !latin_break && j >= next_latin
          if /[A-Za-z]/.match(ch)
            latin_e = j + 1
          else
            latin_break = true
          end
        end
      end

      dag << [i, space_e, :SPACE] unless space_e.nil?
      unless latin_e.nil?
        dag << [i, latin_e, :LATIN]
        next_latin = latin_e
      end
    end
    dag
  end

  # Appends to +dag+ a :DICT edge [i, j + 1] every time the characters
  # i..j make DictIter#walk report :ACTIVE_BOUNDARY, then returns +dag+.
  def _build_by_dict(dag, string, len)
    (0...len).each do |i|
      iter = DictIter.new @dict
      (i...len).each do |j|
        status = iter.walk(string[j])
        break if status == :INVALID # nothing longer can match from i
        dag << [i, j + 1, :DICT] if status == :ACTIVE_BOUNDARY
      end
    end
    dag
  end
end
|
78
|
+
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: thailang4r
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Vee Satayamas
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-06-08 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: cucumber
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.2.1
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.2.1
|
30
|
+
description: Thai language utility for Ruby
|
31
|
+
email:
|
32
|
+
- v.satayamas@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- lib/thailang4r.rb
|
38
|
+
- lib/thailang4r/word_dag_builder.rb
|
39
|
+
- lib/thailang4r/dict.rb
|
40
|
+
- lib/thailang4r/ranges_builder.rb
|
41
|
+
- lib/thailang4r/word_breaker.rb
|
42
|
+
- LICENSE
|
43
|
+
- README.md
|
44
|
+
- Rakefile
|
45
|
+
- data/test_dict.txt
|
46
|
+
- data/tdict-std.txt
|
47
|
+
homepage: https://github.com/veer66/thailang4r
|
48
|
+
licenses: []
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
require_paths:
|
52
|
+
- lib
|
53
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ! '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 1.9.3
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ! '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '0'
|
65
|
+
requirements: []
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.8.25
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Thai language utility for Ruby
|
71
|
+
test_files: []
|