thailang4r 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+
2
+ กกต
3
+
4
+ ขคทท
5
+ ขจ
6
+ จก
7
+ มจ
8
+ มม
@@ -0,0 +1,42 @@
1
module ThaiLang
  # Maps every character of +s+ to its display level.
  #
  # s - String to inspect.
  #
  # Returns an Array with one entry per character of +s+; each entry is
  # whatever ThaiLang.chlevel returns for that character.
  def ThaiLang.string_chlevel(s)
    s.each_char.map { |ch| chlevel(ch) }
  end

  # Display level of a single character.
  #
  # ch - one-character String (only its first code point is examined).
  #
  # Returns 1, -1, 2, 3 or nil; see ThaiLang._chlevel.
  def ThaiLang.chlevel(ch)
    _chlevel(ch.ord)
  end

  # Classifies a Unicode code point from the Thai block by the vertical
  # level it occupies.
  #
  # code - Integer code point.
  #
  # Returns
  #    1  for the base-line ranges (U+0E01..0E30, 0E32..0E33, 0E3F..0E46,
  #       0E4F..0E5B),
  #   -1  for the below-base marks U+0E38..0E3A,
  #    2  for the above-base marks U+0E31, 0E34..0E37, 0E47, 0E4C..0E4E,
  #    3  for the tone marks U+0E48..0E4B,
  #   nil for every other code point.
  def ThaiLang._chlevel(code)
    case code
    when 0x0E01..0x0E30, 0x0E32..0x0E33, 0x0E3F..0x0E46, 0x0E4F..0x0E5B
      1
    when 0x0E38..0x0E3A
      -1
    # U+0E47 (MAITAIKHU) sits above the base consonant like the other
    # level-2 marks; it used to fall through and be reported as nil.
    when 0x0E31, 0x0E34..0x0E37, 0x0E47, 0x0E4C..0x0E4E
      2
    when 0x0E48..0x0E4B
      3
    end
  end

  # Drops every character whose level is -1, 2 or 3 and keeps the rest
  # (level 1 and characters with no level at all).
  #
  # s - String to filter.
  #
  # Returns a new String made of the kept characters in original order.
  def ThaiLang.exclude_thai_lower_upper(s)
    kept = s.each_char.select do |ch|
      level = chlevel(ch) # looked up once per character
      level.nil? || level == 1
    end
    kept.join('')
  end
end
@@ -0,0 +1,89 @@
1
module ThaiLang
  # Word list loaded from a text file, one entry per line.
  #
  # The lookups below are binary searches on a single character column, so
  # they presumably rely on the file being sorted already -- nothing here
  # sorts it.
  class Dict
    # file_path - path of the word-list file to load.
    def initialize(file_path)
      load_dict(file_path)
    end

    # Reads +file_path+ and stores its lines, stripped of the trailing
    # line terminator, as the word list.
    def load_dict(file_path)
      File.open(file_path) do |f|
        @str_list = f.readlines.map(&:chomp)
      end
    end

    # Lowest index in [s, e) whose entry has +prefix+ at column +offset+,
    # or nil when there is none.
    def find_first_index_of_needle(prefix, offset = nil, s = nil, e = nil)
      find_index_of_needle(:FIRST, prefix, offset, s, e)
    end

    # Highest index in [s, e) whose entry has +prefix+ at column +offset+,
    # or nil when there is none.
    def find_last_index_of_needle(prefix, offset = nil, s = nil, e = nil)
      find_index_of_needle(:LAST, prefix, offset, s, e)
    end

    # Binary search over entries s...e comparing the character at column
    # +offset+ of each entry with +prefix+.
    #
    # pos_type - :FIRST to return the lowest matching index; any other
    #            value returns the highest.
    # prefix   - String compared against the character at +offset+.
    # offset   - column to compare (nil means 0).
    # s        - first index searched (nil means 0).
    # e        - one past the last index searched (nil means list size).
    #
    # Returns the matching Integer index, or nil when nothing matches.
    def find_index_of_needle(pos_type, prefix, offset = nil, s = nil, e = nil)
      offset = 0 if offset.nil?
      s = 0 if s.nil?
      e = @str_list.length if e.nil?

      lo = s
      hi = e - 1
      found = nil

      until lo > hi
        mid = (lo + hi) / 2
        ch = @str_list[mid][offset]
        # An entry too short to have this column is treated as sorting
        # before +prefix+.
        if ch.nil? || prefix > ch
          lo = mid + 1
        elsif prefix < ch
          hi = mid - 1
        else
          found = mid
          # Keep narrowing toward the requested end of the matching run.
          if pos_type == :FIRST
            hi = mid - 1
          else
            lo = mid + 1
          end
        end
      end

      found
    end

    # Number of entries in the word list.
    def size
      @str_list.length
    end

    # Entry at index +i+.
    def [](i)
      @str_list[i]
    end
  end

  # Walks a Dict one character at a time, narrowing the window of entries
  # that still match everything fed so far.
  class DictIter
    # dict - object answering size, [], find_first_index_of_needle and
    #        find_last_index_of_needle (a Dict).
    def initialize(dict)
      @dict = dict
      @s = 0
      @e = @dict.size
      @offset = 0
      @state = :ACTIVE
    end

    # Feeds the next character.
    #
    # Returns :INVALID once no entry matches (and on every later call),
    # :ACTIVE_BOUNDARY when the first entry of the narrowed window has
    # exactly as many characters as have been fed, and :ACTIVE otherwise.
    def walk(ch)
      return @state if @state == :INVALID

      first = @dict.find_first_index_of_needle(ch, @offset, @s, @e)
      if first.nil?
        @state = :INVALID
        return @state
      end

      @s = first
      last = @dict.find_last_index_of_needle(ch, @offset, @s, @e)
      @e = last + 1
      @offset += 1
      @state = @offset == @dict[first].length ? :ACTIVE_BOUNDARY : :ACTIVE
    end
  end
end
@@ -0,0 +1,90 @@
1
module ThaiLang
  # Picks one chain of ranges covering positions 0..len out of a DAG of
  # candidate ranges.
  #
  # A DAG entry is an Array [start, end, link_type]. A path entry is an
  # Array [previous position, number of ranges so far, number of unknown
  # ranges so far, link type of the range ending here].
  class RangesBuilder
    # Indexes into a DAG entry.
    S = 0
    E = 1
    LINK_TYPE = 2

    # Indexes into a path entry.
    POINTER = 0
    WEIGHT = 1
    PATH_UNK = 2
    PATH_LINK_TYPE = 3

    # Groups the DAG entries by the value found at index +pos+.
    #
    # Returns a Hash from that value to the Array of entries carrying it,
    # in their original order.
    def _build_index(dag, pos)
      dag.group_by { |range| range[pos] }
    end

    # DAG entries grouped by end position.
    def _build_e_index(dag)
      _build_index(dag, E)
    end

    # DAG entries grouped by start position.
    def _build_s_index(dag)
      _build_index(dag, S)
    end

    # True when path entry +a+ is strictly better than +b+: fewer unknown
    # ranges, or the same number of unknown ranges and fewer ranges overall.
    #
    # The previous test required BOTH counts to be smaller, so a candidate
    # that tied on unknowns could never win on range count.
    def _compare_path_info(a, b)
      return a[PATH_UNK] < b[PATH_UNK] if a[PATH_UNK] != b[PATH_UNK]
      a[WEIGHT] < b[WEIGHT]
    end

    # Path entry for an unknown range running from +left_boundary+.
    def _unknown_info(path, left_boundary)
      origin = path[left_boundary]
      [left_boundary, origin[WEIGHT] + 1, origin[PATH_UNK] + 1, :UNK]
    end

    # Fills in the best path entry for each position 1..len.
    #
    # len     - length of the text being segmented.
    # s_index - DAG entries keyed by start position.
    # e_index - DAG entries keyed by end position.
    #
    # Returns an Array of len + 1 slots; a slot is nil when no chain
    # reaches that position. The last slot is always filled.
    def _build_path(len, s_index, e_index)
      path = Array.new(len + 1)
      path[0] = [0, 0, 0, :UNK]
      # Last position reached by a DAG range; unknown ranges start here.
      left_boundary = 0

      (1..len).each do |i|
        if e_index.has_key?(i)
          e_index[i].each do |range|
            s = range[S]
            next if path[s].nil? # start of this range is not reachable

            info = [s, path[s][WEIGHT] + 1, path[s][PATH_UNK], range[LINK_TYPE]]
            path[i] = info if path[i].nil? || _compare_path_info(info, path[i])
          end
          left_boundary = i unless path[i].nil?
        end

        # Nothing ends here but something starts here: bridge the gap with
        # an unknown range so the range starting here becomes usable.
        if path[i].nil? && s_index.has_key?(i)
          path[i] = _unknown_info(path, left_boundary)
        end
      end

      # Make sure the end of the text is always reachable.
      path[len] = _unknown_info(path, left_boundary) if path[len].nil?
      path
    end

    # Follows the back pointers from position +len+ to 0.
    #
    # Returns an Array of [start, end, link_type] in left-to-right order.
    def _path_to_ranges(path, len)
      ranges = []
      i = len
      while i > 0
        info = path[i]
        s = info[POINTER]
        ranges << [s, i, info[PATH_LINK_TYPE]]
        i = s
      end
      ranges.reverse
    end

    # Chooses the ranges that segment a text of length +len+.
    #
    # dag - Array of [start, end, link_type] candidates.
    # len - length of the text.
    #
    # Returns an Array of [start, end, link_type] covering 0..len; gaps
    # not covered by any candidate come back with link type :UNK.
    def build_from_dag(dag, len)
      s_index = _build_s_index(dag)
      e_index = _build_e_index(dag)
      path = _build_path(len, s_index, e_index)
      _path_to_ranges(path, len)
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'rubygems'
2
+ require 'thailang4r/dict.rb'
3
+ require 'thailang4r/word_dag_builder.rb'
4
+ require 'thailang4r/ranges_builder.rb'
5
+
6
module ThaiLang
  # Splits a string into words using a dictionary file.
  class WordBreaker

    # Indexes into a [start, end, ...] range.
    S = 0
    E = 1

    # path - path of the dictionary file; when nil, data/tdict-std.txt
    #        three directories above this source file is used.
    def initialize(path = nil)
      # No debug print of the resolved path: a library must not write to
      # stdout as a side effect of construction.
      path = File.expand_path('../../../data/tdict-std.txt', __FILE__) if path.nil?
      @dict = Dict.new path
      @dag_builder = WordDagBuilder.new @dict
      @ranges_builder = RangesBuilder.new
    end

    # Segments +string+.
    #
    # Returns an Array of substrings, one per range chosen by the ranges
    # builder, in the order the builder returned them.
    def break_into_words(string)
      len = string.length
      dag = @dag_builder.build(string, len)
      ranges = @ranges_builder.build_from_dag(dag, len)
      ranges.map { |range| string[range[S], range[E] - range[S]] }
    end
  end
end
@@ -0,0 +1,78 @@
1
module ThaiLang
  # Builds the DAG of candidate ranges ([start, end, link_type]) for a
  # string.
  class WordDagBuilder
    # dict - dictionary handed to each DictIter.
    def initialize(dict)
      @dict = dict
    end

    # Collects every dictionary word found in +string+.
    #
    # string - text to scan.
    # len    - number of characters of +string+ to scan.
    #
    # Returns a new Array of [start, end, :DICT] sorted by start, then
    # end, then link type.
    def build(string, len)
      dag = []
      _build_by_dict(dag, string, len)
      # The Latin/space rule is intentionally left disabled here, as in
      # the original:
      #_build_by_latin_rule(dag, string, len)
      dag.sort { |a, b| a[0, 3] <=> b[0, 3] }
    end

    # Appends [i, end, :SPACE] for the run of spaces starting at each
    # position i, and [i, end, :LATIN] for each maximal run of ASCII
    # letters (only from the position where the run starts).
    #
    # Two defects made the Latin half dead code before: the guard was
    # `latin_break and ...` (never true, since latin_break starts false)
    # and the pattern /A-Za-z/ matched the literal text "A-Za-z" instead
    # of a letter class.
    def _build_by_latin_rule(dag, string, len)
      # End of the last Latin run emitted; starts inside it are skipped so
      # that suffixes of a run are not emitted as separate ranges.
      next_latin = 0
      (0..(len - 1)).each do |i|
        space_e = nil
        latin_e = nil
        space_break = false
        latin_break = false

        (i..(len - 1)).each do |j|
          break if space_break && latin_break

          ch = string[j]
          unless space_break
            if ch == " "
              space_e = j + 1
            else
              space_break = true
            end
          end

          if !latin_break && j >= next_latin
            if ch =~ /[A-Za-z]/
              latin_e = j + 1
            else
              latin_break = true
            end
          end
        end

        dag << [i, space_e, :SPACE] unless space_e.nil?
        unless latin_e.nil?
          dag << [i, latin_e, :LATIN]
          next_latin = latin_e
        end
      end
    end

    # Appends [i, j + 1, :DICT] for every substring string[i..j] that the
    # dictionary iterator reports as a complete word.
    def _build_by_dict(dag, string, len)
      (0..(len - 1)).each do |i|
        iter = DictIter.new @dict
        (i..(len - 1)).each do |j|
          status = iter.walk string[j]
          break if status == :INVALID # nothing longer can match from i

          dag << [i, j + 1, :DICT] if status == :ACTIVE_BOUNDARY
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: thailang4r
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Vee Satayamas
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: cucumber
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.2.1
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 1.2.1
30
+ description: Thai language utility for Ruby
31
+ email:
32
+ - v.satayamas@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - lib/thailang4r.rb
38
+ - lib/thailang4r/word_dag_builder.rb
39
+ - lib/thailang4r/dict.rb
40
+ - lib/thailang4r/ranges_builder.rb
41
+ - lib/thailang4r/word_breaker.rb
42
+ - LICENSE
43
+ - README.md
44
+ - Rakefile
45
+ - data/test_dict.txt
46
+ - data/tdict-std.txt
47
+ homepage: https://github.com/veer66/thailang4r
48
+ licenses: []
49
+ post_install_message:
50
+ rdoc_options: []
51
+ require_paths:
52
+ - lib
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ none: false
55
+ requirements:
56
+ - - ! '>='
57
+ - !ruby/object:Gem::Version
58
+ version: 1.9.3
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ! '>='
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ requirements: []
66
+ rubyforge_project:
67
+ rubygems_version: 1.8.25
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Thai language utility for Ruby
71
+ test_files: []