thailang4r 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+
2
+ กกต
3
+
4
+ ขคทท
5
+ ขจ
6
+ จก
7
+ มจ
8
+ มม
@@ -0,0 +1,42 @@
1
# Character-level classification helpers for Thai text.
#
# Every helper maps a code point in the Thai Unicode block to a small integer
# "level" (see ThaiLang._chlevel); code points outside the listed ranges map
# to nil.
module ThaiLang
  # Classifies every character of a string.
  #
  # @param s [String] text to classify
  # @return [Array<Integer, nil>] one level per character, in order
  def self.string_chlevel(s)
    s.each_char.map { |ch| chlevel(ch) }
  end

  # Classifies a single character.
  #
  # @param ch [String] a one-character string
  # @return [Integer, nil] the level of the character's code point
  def self.chlevel(ch)
    _chlevel(ch.ord)
  end

  # Maps a code point to its level.
  #
  #  1  -> U+0E01..U+0E30, U+0E32..U+0E33, U+0E3F..U+0E46, U+0E4F..U+0E5B
  # -1  -> U+0E38..U+0E3A
  #  2  -> U+0E31, U+0E34..U+0E37, U+0E47, U+0E4C..U+0E4E
  #  3  -> U+0E48..U+0E4B
  #
  # The levels presumably mean base line / below / above / top (tone marks);
  # that reading follows from the Unicode names of the listed code points.
  #
  # U+0E47 (MAITAIKHU) used to fall through every range and was reported as
  # nil, which made exclude_thai_lower_upper keep it. It is a mark drawn above
  # the base consonant, so it is grouped with the other level-2 marks.
  #
  # @param code [Integer] Unicode code point
  # @return [Integer, nil] 1, -1, 2 or 3; nil for any other code point
  def self._chlevel(code)
    case code
    when 0x0E01..0x0E30, 0x0E32..0x0E33, 0x0E3F..0x0E46, 0x0E4F..0x0E5B
      1
    when 0x0E38..0x0E3A
      -1
    when 0x0E31, 0x0E34..0x0E37, 0x0E47, 0x0E4C..0x0E4E
      2
    when 0x0E48..0x0E4B
      3
    end
  end

  # Drops every character whose level is -1, 2 or 3, keeping level-1
  # characters and everything that has no level (nil).
  #
  # @param s [String] input text
  # @return [String] the kept characters, in their original order
  def self.exclude_thai_lower_upper(s)
    kept = s.each_char.select do |ch|
      # Look the level up once per character instead of twice.
      level = chlevel(ch)
      level.nil? || level == 1
    end
    kept.join('')
  end
end
@@ -0,0 +1,89 @@
1
module ThaiLang
  # In-memory word list read from a text file, one entry per line.
  #
  # The lookup methods binary-search the list one character position at a
  # time, so they only give correct answers when the entries are sorted in
  # String comparison order.
  class Dict
    # @param file_path [String] path of the word-list file
    def initialize(file_path)
      load_dict(file_path)
    end

    # Reads the word list, replacing any previously loaded entries.
    #
    # The file is decoded as UTF-8 explicitly. Relying on the process default
    # external encoding would, under a non-UTF-8 locale, tag the entries with
    # the wrong encoding and make str[offset] index bytes instead of Thai
    # characters, which breaks the character-wise search below.
    #
    # @param file_path [String] path of the word-list file
    # @return [Array<String>] the loaded entries with line endings removed
    def load_dict(file_path)
      File.open(file_path, 'r:UTF-8') do |f|
        @str_list = f.readlines.map { |line| line.chomp }
      end
    end

    # Lowest index in [s, e) whose character at +offset+ equals +prefix+.
    #
    # @return [Integer, nil] see find_index_of_needle
    def find_first_index_of_needle(prefix, offset = nil, s = nil, e = nil)
      find_index_of_needle(:FIRST, prefix, offset, s, e)
    end

    # Highest index in [s, e) whose character at +offset+ equals +prefix+.
    #
    # @return [Integer, nil] see find_index_of_needle
    def find_last_index_of_needle(prefix, offset = nil, s = nil, e = nil)
      find_index_of_needle(:LAST, prefix, offset, s, e)
    end

    # Binary search for an entry whose character at +offset+ equals +prefix+.
    #
    # @param pos_type [Symbol] :FIRST for the lowest matching index; any other
    #   value yields the highest matching index
    # @param prefix [String] the character to look for (compared with < and >)
    # @param offset [Integer, nil] character position inside each entry
    #   (nil means 0)
    # @param s [Integer, nil] first index of the search window (nil means 0)
    # @param e [Integer, nil] index one past the window (nil means list length)
    # @return [Integer, nil] matching index, or nil when nothing matches
    def find_index_of_needle(pos_type, prefix, offset = nil, s = nil, e = nil)
      offset = 0 if offset.nil?
      lo = s.nil? ? 0 : s
      hi = (e.nil? ? @str_list.length : e) - 1
      found = nil

      while lo <= hi
        mid = (lo + hi) / 2
        ch = @str_list[mid][offset]
        if ch.nil? || prefix > ch
          # Entries too short to have a character at +offset+ are treated as
          # smaller than any needle.
          lo = mid + 1
        elsif prefix < ch
          hi = mid - 1
        else
          found = mid
          # Keep narrowing towards the requested end of the run of matches.
          if pos_type == :FIRST
            hi = mid - 1
          else
            lo = mid + 1
          end
        end
      end

      found
    end

    # @return [Integer] number of loaded entries
    def size
      @str_list.length
    end

    # @param i [Integer] entry index
    # @return [String, nil] the entry at +i+
    def [](i)
      @str_list[i]
    end
  end

  # Walks a dictionary one character at a time, narrowing the window of
  # entries that still match the characters consumed so far.
  class DictIter
    # @param dict [#size, #[], #find_first_index_of_needle,
    #   #find_last_index_of_needle] dictionary to walk
    def initialize(dict)
      @dict = dict
      @s = 0
      @e = @dict.size
      @offset = 0
      @state = :ACTIVE
    end

    # Consumes one more character.
    #
    # @param ch [String] next character of the candidate word
    # @return [Symbol] :INVALID when no entry in the window has +ch+ at the
    #   current position (the iterator then stays :INVALID),
    #   :ACTIVE_BOUNDARY when the first entry of the narrowed window is
    #   exactly as long as the consumed text, :ACTIVE otherwise
    def walk(ch)
      return @state if @state == :INVALID

      first = @dict.find_first_index_of_needle(ch, @offset, @s, @e)
      if first.nil?
        @state = :INVALID
        return @state
      end

      @s = first
      last = @dict.find_last_index_of_needle(ch, @offset, @s, @e)
      @e = last + 1
      @offset += 1
      @state = @offset == @dict[first].length ? :ACTIVE_BOUNDARY : :ACTIVE
    end
  end
end
@@ -0,0 +1,90 @@
1
module ThaiLang
  # Picks one chain of ranges covering positions 0..len out of a DAG of
  # candidate ranges, bridging uncovered stretches with :UNK ranges.
  #
  # A DAG range is an array [start, end, link_type]. A path entry is an array
  # [pointer, weight, unknown_count, link_type], where pointer is the start
  # position of the range that ends at the entry's position.
  class RangesBuilder
    # Field positions inside a DAG range.
    S = 0
    E = 1
    LINK_TYPE = 2

    # Field positions inside a path entry.
    POINTER = 0
    WEIGHT = 1
    PATH_UNK = 2
    PATH_LINK_TYPE = 3

    # Groups the ranges of +dag+ by the field at index +pos+.
    #
    # @param dag [Array<Array>] candidate ranges
    # @param pos [Integer] field index to group by (S or E)
    # @return [Hash{Object => Array<Array>}] ranges keyed by that field, each
    #   group in DAG order
    def _build_index(dag, pos)
      index = {}
      dag.each do |range|
        (index[range[pos]] ||= []) << range
      end
      index
    end

    # @return [Hash] ranges grouped by end position
    def _build_e_index(dag)
      _build_index(dag, E)
    end

    # @return [Hash] ranges grouped by start position
    def _build_s_index(dag)
      _build_index(dag, S)
    end

    # Tells whether path entry +a+ is strictly better than +b+.
    #
    # Entries are ranked by unknown count first and by weight (number of
    # ranges used) second. The previous implementation demanded that BOTH
    # fields be smaller, so a candidate with the same unknown count but fewer
    # ranges could never replace a worse entry.
    #
    # @param a [Array] candidate path entry
    # @param b [Array] current path entry
    # @return [Boolean]
    def _compare_path_info(a, b)
      return a[PATH_UNK] < b[PATH_UNK] if a[PATH_UNK] != b[PATH_UNK]
      a[WEIGHT] < b[WEIGHT]
    end

    # Builds an :UNK path entry that starts at +left_boundary+.
    #
    # @return [Array] [left_boundary, weight + 1, unknown_count + 1, :UNK]
    def _unknown_path_info(path, left_boundary)
      origin = path[left_boundary]
      [left_boundary, origin[WEIGHT] + 1, origin[PATH_UNK] + 1, :UNK]
    end

    # Computes the best path entry for every position in 0..len.
    #
    # @param len [Integer] length of the text being segmented
    # @param s_index [Hash] ranges grouped by start position
    # @param e_index [Hash] ranges grouped by end position
    # @return [Array<Array, nil>] path entries indexed by position; entries
    #   for positions the chosen chain never touches stay nil
    def _build_path(len, s_index, e_index)
      path = Array.new(len + 1)
      path[0] = [0, 0, 0, :UNK]
      # Right-most position reached so far through a range from the DAG.
      left_boundary = 0

      (1..len).each do |i|
        if e_index.has_key?(i)
          e_index[i].each do |range|
            s = range[S]
            # A range is usable only if its start is itself reachable.
            next if path[s].nil?
            info = [s, path[s][WEIGHT] + 1, path[s][PATH_UNK], range[LINK_TYPE]]
            path[i] = info if path[i].nil? || _compare_path_info(info, path[i])
          end
          left_boundary = i unless path[i].nil?
        end

        # A range starts here but nothing reaches this position: bridge the
        # gap from the last reachable position with an unknown range.
        if path[i].nil? && s_index.has_key?(i)
          path[i] = _unknown_path_info(path, left_boundary)
        end
      end

      # Make sure the end of the text is always reachable.
      path[len] = _unknown_path_info(path, left_boundary) if path[len].nil?
      path
    end

    # Follows the pointers back from +len+ to 0 and returns the ranges in
    # left-to-right order.
    #
    # @param path [Array<Array, nil>] result of _build_path
    # @param len [Integer] length of the text being segmented
    # @return [Array<Array>] ranges as [start, end, link_type]
    def _path_to_ranges(path, len)
      ranges = []
      i = len
      while i > 0
        info = path[i]
        start = info[POINTER]
        ranges << [start, i, info[PATH_LINK_TYPE]]
        i = start
      end
      ranges.reverse
    end

    # Selects the ranges that segment a text of length +len+.
    #
    # @param dag [Array<Array>] candidate ranges as [start, end, link_type]
    # @param len [Integer] length of the text being segmented
    # @return [Array<Array>] chosen ranges, contiguous from 0 to +len+
    def build_from_dag(dag, len)
      s_index = _build_s_index(dag)
      e_index = _build_e_index(dag)
      path = _build_path(len, s_index, e_index)
      _path_to_ranges(path, len)
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'rubygems'
2
+ require 'thailang4r/dict.rb'
3
+ require 'thailang4r/word_dag_builder.rb'
4
+ require 'thailang4r/ranges_builder.rb'
5
+
6
module ThaiLang
  # Splits a string into words using a dictionary-driven DAG of candidate
  # ranges (WordDagBuilder) and a path selection over it (RangesBuilder).
  class WordBreaker
    # Field positions inside a range returned by the ranges builder.
    S = 0
    E = 1

    # @param path [String, nil] dictionary file; when nil,
    #   data/tdict-std.txt is resolved relative to this source file
    def initialize(path = nil)
      # A library must not write to stdout as a side effect of construction;
      # the leftover debug print of the resolved path has been removed.
      path = File.expand_path('../../../data/tdict-std.txt', __FILE__) if path.nil?
      @dict = Dict.new(path)
      @dag_builder = WordDagBuilder.new(@dict)
      @ranges_builder = RangesBuilder.new
    end

    # Breaks +string+ into consecutive substrings.
    #
    # @param string [String] text to segment
    # @return [Array<String>] one substring per selected range, in order
    def break_into_words(string)
      len = string.length
      dag = @dag_builder.build(string, len)
      ranges = @ranges_builder.build_from_dag(dag, len)
      ranges.map { |range| string[range[S], range[E] - range[S]] }
    end
  end
end
@@ -0,0 +1,78 @@
1
module ThaiLang
  # Builds the DAG of candidate ranges for a string. Every range is an array
  # [start, end, link_type] with an exclusive end.
  class WordDagBuilder
    # @param dict [Object] dictionary handed to DictIter.new
    def initialize(dict)
      @dict = dict
    end

    # Collects the candidate ranges of +string+ and returns them sorted by
    # start, then end, then link type.
    #
    # Only the dictionary rule is applied here; the Latin/space rule is left
    # disabled, as in the original.
    #
    # @param string [String] text to analyse
    # @param len [Integer] number of characters of +string+ to consider
    # @return [Array<Array>] sorted ranges
    def build(string, len)
      dag = []
      _build_by_dict(dag, string, len)
      #_build_by_latin_rule(dag, string, len)
      dag.sort { |a, b| a[0, 3] <=> b[0, 3] }
    end

    # Appends :SPACE ranges (runs of " ") and :LATIN ranges (runs of A-Z/a-z)
    # to +dag+.
    #
    # A :SPACE range is emitted for every start position that sits on a
    # space. A :LATIN range is emitted only from the first character of a
    # run: +next_latin+ remembers where the last emitted run ended, and
    # positions before it are skipped.
    #
    # Two defects are fixed. The Latin branch was guarded by
    # `latin_break and ...`, which could never be true, so no :LATIN range
    # was ever produced. The pattern was also written without brackets, so it
    # matched the literal text "A-Za-z" rather than a letter.
    #
    # @param dag [Array<Array>] accumulator, mutated in place
    # @param string [String] text to analyse
    # @param len [Integer] number of characters of +string+ to consider
    def _build_by_latin_rule(dag, string, len)
      next_latin = 0
      (0..(len - 1)).each do |i|
        space_e = nil
        latin_e = nil
        space_break = false
        latin_break = false

        (i..(len - 1)).each do |j|
          break if space_break && latin_break
          ch = string[j]

          unless space_break
            if ch == " "
              space_e = j + 1
            else
              space_break = true
            end
          end

          if !latin_break && j >= next_latin
            if /[A-Za-z]/.match(ch)
              latin_e = j + 1
            else
              latin_break = true
            end
          end
        end

        dag << [i, space_e, :SPACE] unless space_e.nil?
        unless latin_e.nil?
          dag << [i, latin_e, :LATIN]
          next_latin = latin_e
        end
      end
    end

    # Appends a :DICT range for every substring for which the dictionary
    # iterator reports :ACTIVE_BOUNDARY.
    #
    # @param dag [Array<Array>] accumulator, mutated in place
    # @param string [String] text to analyse
    # @param len [Integer] number of characters of +string+ to consider
    def _build_by_dict(dag, string, len)
      (0..(len - 1)).each do |i|
        iter = DictIter.new(@dict)
        (i..(len - 1)).each do |j|
          status = iter.walk(string[j])
          # Nothing in the dictionary continues this prefix; try the next start.
          break if status == :INVALID
          dag << [i, j + 1, :DICT] if status == :ACTIVE_BOUNDARY
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: thailang4r
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Vee Satayamas
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: cucumber
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.2.1
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 1.2.1
30
+ description: Thai language utility for Ruby
31
+ email:
32
+ - v.satayamas@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - lib/thailang4r.rb
38
+ - lib/thailang4r/word_dag_builder.rb
39
+ - lib/thailang4r/dict.rb
40
+ - lib/thailang4r/ranges_builder.rb
41
+ - lib/thailang4r/word_breaker.rb
42
+ - LICENSE
43
+ - README.md
44
+ - Rakefile
45
+ - data/test_dict.txt
46
+ - data/tdict-std.txt
47
+ homepage: https://github.com/veer66/thailang4r
48
+ licenses: []
49
+ post_install_message:
50
+ rdoc_options: []
51
+ require_paths:
52
+ - lib
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ none: false
55
+ requirements:
56
+ - - ! '>='
57
+ - !ruby/object:Gem::Version
58
+ version: 1.9.3
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ! '>='
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ requirements: []
66
+ rubyforge_project:
67
+ rubygems_version: 1.8.25
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Thai language utility for Ruby
71
+ test_files: []