thailang4r 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e1af7a4c89e108e784964062d2381ae2b2b81715eb5c0f2da4e6c5aa9604426b
4
+ data.tar.gz: 85aaded96eea07d8369007f76adac54abff69fc246ae798da78da1d6ffa51887
5
+ SHA512:
6
+ metadata.gz: fc738de378f475f7eafa0439176293cd0b524cc3dd8cc246421bb7dc3b444b57f9e12ebdcfffebd56ce3cbdeb3226f72b4909674ae5438b942b040304c600799
7
+ data.tar.gz: 53aab1f982c0aeadc812e2e2618a1651f1e620dd745d54c2b42195bc3ffd8c60152f3f8fe6be08d73fb95f95bd7f3dff10edc0e91c50a5e2bc12bc3d3589473b
data/LICENSE CHANGED
@@ -1,20 +1,201 @@
1
- Copyright (c) 2013 Vee Satayamas
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright {yyyy} {name of copyright owner}
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
data/README.md CHANGED
@@ -4,14 +4,37 @@ Thai language utility for Ruby
4
4
 
5
5
  I built this project to collect and share tools for the Thai language that are written in Ruby.
6
6
 
7
+ Installation
8
+ ------------
9
+
10
+ > gem install thailang4r
11
+
12
+ Character level
13
+ ---------------
14
+
7
15
  * chlevel is similar to th_chlevel in [libthai](http://linux.thai.net/projects/libthai).
8
16
  * string_chlevel returns an array of levels; for example, string_chlevel("กี") returns [1, 2]
9
17
 
10
18
  Word breaker
11
19
  ------------
20
+
12
21
  ```ruby
13
22
  # encoding: UTF-8
14
23
  require 'thailang4r/word_breaker'
15
24
  word_breaker = ThaiLang::WordBreaker.new
16
25
  puts word_breaker.break_into_words("ฉันกินข้าว")
26
+ # ["ฉัน", "กิน", "ข้าว"]
27
+ ```
28
+
29
+ Romanization
30
+ ------------
31
+
32
+ A port of royin.py transliteration from [PyThaiNLP](https://pythainlp.github.io/).
33
+
34
+ ```ruby
35
+ # encoding: UTF-8
36
+ require 'thailang4r/roman'
37
+ royin = ThaiLang::Royin.new
38
+ p royin.romanize("ฉันกินข้าว", "-")
39
+ # => "chan-kin-khao"
17
40
  ```
@@ -0,0 +1,169 @@
1
+ # coding: utf-8
2
+
3
+ require_relative 'word_breaker'
4
+
5
+ module ThaiLang
6
+ # Ported from https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/transliterate/royin.py (be1265d)
7
+ class Royin
8
+ vowel_patterns = """เ*ียว,\\1iao
9
+ แ*็ว,\\1aeo
10
+ เ*ือย,\\1ueai
11
+ แ*ว,\\1aeo
12
+ เ*็ว,\\1eo
13
+ เ*ว,\\1eo
14
+ *ิว,\\1io
15
+ *วย,\\1uai
16
+ เ*ย,\\1oei
17
+ *อย,\\1oi
18
+ โ*ย,\\1oi
19
+ *ุย,\\1ui
20
+ *าย,\\1ai
21
+ ไ*ย,\\1ai
22
+ *ัย,\\1ai
23
+ ไ**,\\1\\2ai
24
+ ไ*,\\1ai
25
+ ใ*,\\1ai
26
+ *ว*,\\1ua\\2
27
+ *ัวะ,\\1ua
28
+ *ัว,\\1ua
29
+ เ*ือะ,\\1uea
30
+ เ*ือ,\\1uea
31
+ เ*ียะ,\\1ia
32
+ เ*ีย,\\1ia
33
+ เ*อะ,\\1oe
34
+ เ*อ,\\1oe
35
+ เ*ิ,\\1oe
36
+ *อ,\\1o
37
+ เ*าะ,\\1o
38
+ เ*็,\\1e
39
+ โ*ะ,\\1o
40
+ โ*,\\1o
41
+ แ*ะ,\\1ae
42
+ แ*,\\1ae
43
+ เ*าะ,\\1e
44
+ *าว,\\1ao
45
+ เ*า,\\1ao
46
+ เ*,\\1e
47
+ *ู,\\1u
48
+ *ุ,\\1u
49
+ *ื,\\1ue
50
+ *ึ,\\1ue
51
+ *ี,\\1i
52
+ *ิ,\\1i
53
+ *ำ,\\1am
54
+ *า,\\1a
55
+ *ั,\\1a
56
+ *ะ,\\1a
57
+ #ฤ,\\1rue
58
+ $ฤ,\\1ri"""
59
+
60
+ VOWELS = vowel_patterns.gsub("*", "([ก-ฮ])")
61
+ .gsub("#", "([คนพมห])")
62
+ .gsub("$", "([กตทปศส])")
63
+ .split("\n")
64
+ .map {_1.split(",")}
65
+ .map {[Regexp.new(_1), _2]}
66
+
67
+ # พยัญชนะ ต้น สะกด
68
+ CONSONANTS = {
69
+ "ก" => ["k", "k"],
70
+ "ข" => ["kh", "k"],
71
+ "ฃ" => ["kh", "k"],
72
+ "ค" => ["kh", "k"],
73
+ "ฅ" => ["kh", "k"],
74
+ "ฆ" => ["kh", "k"],
75
+ "ง" => ["ng", "ng"],
76
+ "จ" => ["ch", "t"],
77
+ "ฉ" => ["ch", "t"],
78
+ "ช" => ["ch", "t"],
79
+ "ซ" => ["s", "t"],
80
+ "ฌ" => ["ch", "t"],
81
+ "ญ" => ["y", "n"],
82
+ "ฎ" => ["d", "t"],
83
+ "ฏ" => ["t", "t"],
84
+ "ฐ" => ["th", "t"],
85
+ # ฑ พยัญชนะต้น เป็น d ได้
86
+ "ฑ" => ["th", "t"],
87
+ "ฒ" => ["th", "t"],
88
+ "ณ" => ["n", "n"],
89
+ "ด" => ["d", "t"],
90
+ "ต" => ["t", "t"],
91
+ "ถ" => ["th", "t"],
92
+ "ท" => ["th", "t"],
93
+ "ธ" => ["th", "t"],
94
+ "น" => ["n", "n"],
95
+ "บ" => ["b", "p"],
96
+ "ป" => ["p", "p"],
97
+ "ผ" => ["ph", "p"],
98
+ "ฝ" => ["f", "p"],
99
+ "พ" => ["ph", "p"],
100
+ "ฟ" => ["f", "p"],
101
+ "ภ" => ["ph", "p"],
102
+ "ม" => ["m", "m"],
103
+ "ย" => ["y", ""],
104
+ "ร" => ["r", "n"],
105
+ "ฤ" => ["rue", ""],
106
+ "ล" => ["l", "n"],
107
+ "ว" => ["w", ""],
108
+ "ศ" => ["s", "t"],
109
+ "ษ" => ["s", "t"],
110
+ "ส" => ["s", "t"],
111
+ "ห" => ["h", ""],
112
+ "ฬ" => ["l", "n"],
113
+ "อ" => ["", ""],
114
+ "ฮ" => ["h", ""],
115
+ }
116
+
117
+ def normalize(word)
118
+ word.gsub(/จน์|มณ์|ณฑ์|ทร์|ตร์|[ก-ฮ]์|[ก-ฮ]ะ-ู์|[ฯๆ่-๏๚๛]/, "")
119
+ end
120
+
121
+ def replace_vowel(word)
122
+ VOWELS.each { word.gsub!(_1, _2) }
123
+ return word
124
+ end
125
+
126
+ def replace_consonants(word, consonants)
127
+ return word unless consonants
128
+ return word.gsub(consonants[0], CONSONANTS[consonants[0]][0]) if consonants.length == 1
129
+ consonants.reduce({rom: "", th: word}) do |w, consonant|
130
+ non_thai = w[:th].match(/^[^ก-์]+/)
131
+ if non_thai
132
+ w[:rom] += non_thai.to_s
133
+ w[:th] = w[:th][non_thai.to_s.length..-1]
134
+ end
135
+ if w[:skip]
136
+ {rom: w[:rom], th: w[:th]}
137
+ elsif w[:rom] == "" and w[:th] == "ห"
138
+ {rom: "", th: w[:th][1..-1]}
139
+ elsif w[:rom] == ""
140
+ {rom: CONSONANTS[consonant][0], th: w[:th][consonant.length..-1]}
141
+ elsif consonant == "ร" and w[:th] == "รร"
142
+ {rom: w[:rom] + "an", th: w[:th][2..-1], skip: true}
143
+ elsif consonant == "ร" and w[:th][0..1] == "รร"
144
+ {rom: w[:rom] + "a", th: w[:th][2..-1], skip: true}
145
+ else
146
+ {rom: w[:rom] + CONSONANTS[consonant][1], th: w[:th][consonant.length..-1]}
147
+ end
148
+ end[:rom]
149
+ end
150
+
151
+ def romanize_word(word)
152
+ word = replace_vowel(normalize(word))
153
+ consonants = word.scan(/[ก-ฮ]/)
154
+ if word.length == 2 and consonants.length == 2
155
+ word = word.chars
156
+ word.insert(1, "o")
157
+ word = word.join("")
158
+ end
159
+ word = replace_consonants(word, consonants)
160
+ return word
161
+ end
162
+
163
+ WORDCUT = WordBreaker.new
164
+
165
+ def romanize(text, delim = "")
166
+ WORDCUT.break_into_words(text).map { romanize_word _1 }.join(delim)
167
+ end
168
+ end
169
+ end
@@ -0,0 +1,3 @@
1
+ module ThaiLang
2
+
3
+ end
@@ -1,29 +1,254 @@
1
- require 'rubygems'
2
- require 'thailang4r/dict.rb'
3
- require 'thailang4r/word_dag_builder.rb'
4
- require 'thailang4r/ranges_builder.rb'
1
+ # coding: utf-8
5
2
 
6
3
  module ThaiLang
4
+ NODE_KEY_ROW_NO = 0
5
+ NODE_KEY_OFFSET = 1
6
+ NODE_KEY_CH = 2
7
+
8
+ NODE_PTR_ROW_NO = 0
9
+ NODE_PTR_IS_FINAL = 1
10
+ NODE_PTR_PAYLOAD = 2
11
+
12
+ NodeKey = Struct.new(:row_no, :offset, :ch)
13
+
14
+ class PrefixTree
15
+ def initialize(sorted_words_with_payload)
16
+ @prefix_tree = {}
17
+ sorted_words_with_payload.each.with_index do |(w, payload), i|
18
+ row_no = 0
19
+ ch_vec = w.codepoints
20
+ ch_len = w.length
21
+ ch_vec.each.with_index do |ch, j|
22
+ node_key = NodeKey.new(row_no, j, ch)
23
+ ex_node_ptr = @prefix_tree[node_key]
24
+ if ex_node_ptr
25
+ row_no = ex_node_ptr[NODE_PTR_ROW_NO]
26
+ else
27
+ is_final = (j + 1 == ch_len)
28
+ node_ptr = [i, is_final, if is_final; payload; end]
29
+ @prefix_tree[node_key] = node_ptr
30
+ row_no = i
31
+ end
32
+ end
33
+ end
34
+ end
35
+
36
+ def lookup(row_id, offset, ch)
37
+ @prefix_tree[NodeKey.new(row_id, offset, ch)]
38
+ end
39
+ end
40
+
41
+ DEFAULT_THAI_DICT_PATH = File.expand_path('../../../data/tdict-std.txt', __FILE__)
42
+
7
43
  class WordBreaker
44
+ def initialize(dix_path = DEFAULT_THAI_DICT_PATH)
45
+ @dix = PrefixTree.new(File.open(dix_path).each_line.map { [_1.chomp, 1] })
46
+ end
47
+
48
+ def break_into_words(text)
49
+ tokenize(@dix, text)
50
+ end
51
+
52
+ UNK = 1
53
+ DICT = 2
54
+ INIT = 3
55
+ LATIN = 4
56
+ PUNC = 5
57
+
58
+ LINK_P_IDX = 0
59
+ LINK_W = 1
60
+ LINK_UNK = 2
61
+ LINK_KIND = 3
62
+
63
+ def better_link?(l, r)
64
+ l[LINK_UNK] < r[LINK_UNK] or l[LINK_W] < r[LINK_W]
65
+ end
66
+
67
+ WAITING = 1
68
+ ACTIVATED = 2
69
+ COMPLETED = 3
8
70
 
9
- S = 0
10
- E = 1
71
+ CAP_A = "A".ord
72
+ CAP_Z = "Z".ord
73
+ A = "a".ord
74
+ Z = "z".ord
11
75
 
12
- def initialize(path = nil)
13
- if path.nil?
14
- path = File.expand_path('../../../data/tdict-std.txt', __FILE__)
15
- puts path
76
+ def latin?(ch)
77
+ (ch >= CAP_A and ch <= CAP_Z) or (ch >= A and ch <= Z)
78
+ end
79
+
80
+ TRANSDUCER_S = 0
81
+ TRANSDUCER_E = 1
82
+ TRANSDUCER_STATE = 3
83
+ TRANSDUCER_KIND = 4
84
+
85
+ def update_latin_transducer(transducer, ch, i, ch_vec)
86
+ if transducer[TRANSDUCER_STATE] == WAITING
87
+ if latin?(ch)
88
+ transducer[TRANSDUCER_S] = i
89
+ transducer[TRANSDUCER_STATE] = ACTIVATED
90
+ if i + 1 == ch_vec.length or not latin?(ch_vec[i + 1])
91
+ transducer[TRANSDUCER_E] = i + 1
92
+ transducer[TRANSDUCER_STATE] = COMPLETED
93
+ end
94
+ end
95
+ else
96
+ if latin?(ch)
97
+ transducer[TRANSDUCER_E] = i + 1
98
+ transducer[TRANSDUCER_STATE] = COMPLETED
99
+ else
100
+ transducer[TRANSDUCER_STATE] = WAITING
101
+ end
16
102
  end
17
- @dict = Dict.new path
18
- @dag_builder = WordDagBuilder.new @dict
19
- @ranges_builder = RangesBuilder.new
20
103
  end
104
+
105
+
106
+ SPACE = " ".ord
21
107
 
22
- def break_into_words(string)
23
- len = string.length
24
- dag = @dag_builder.build(string, len)
25
- ranges = @ranges_builder.build_from_dag(dag, len)
26
- ranges.map{|range| string[range[S], range[E] - range[S]]}
27
- end
108
+ def punc?(ch)
109
+ ch == SPACE
110
+ end
111
+
112
+ def update_punc_transducer(transducer, ch, i, ch_vec)
113
+ if transducer[TRANSDUCER_STATE] == WAITING
114
+ if punc?(ch)
115
+ transducer[TRANSDUCER_S] = i
116
+ transducer[TRANSDUCER_STATE] = ACTIVATED
117
+ if i + 1 == ch_vec.length or not punc?(ch_vec[i + 1])
118
+ transducer[TRANSDUCER_E] = i + 1
119
+ transducer[TRANSDUCER_STATE] = COMPLETED
120
+ end
121
+ end
122
+ else
123
+ if punc?(ch)
124
+ transducer[TRANSDUCER_E] = i + 1
125
+ transducer[TRANSDUCER_STATE] = COMPLETED
126
+ else
127
+ transducer[TRANSDUCER_STATE] = WAITING
128
+ end
129
+ end
130
+ end
131
+
132
+ DIX_PTR_S = 0
133
+ DIX_PTR_ROW_NO = 1
134
+ DIX_PTR_IS_FINAL = 2
135
+
136
+ def build_path(dix, s)
137
+ left_boundary = 0
138
+ ch_vec = s.codepoints
139
+ ch_len = ch_vec.length
140
+ path = [[0,0,0,INIT]]
141
+ dix_ptrs = []
142
+ latin_transducer = [0,0,WAITING,LATIN]
143
+ punc_transducer =[0,0,WAITING,PUNC]
144
+ ch_vec.each.with_index do |ch, i|
145
+ dix_ptrs << [i, 0, false]
146
+ unk_link = path[left_boundary]
147
+ link = [left_boundary, unk_link[LINK_W] + 1, unk_link[LINK_UNK] + 1, UNK]
148
+ j = 0
149
+ while j < dix_ptrs.length
150
+ dix_ptr = dix_ptrs[j]
151
+ offset = i - dix_ptr[DIX_PTR_S]
152
+ row_no = dix_ptr[DIX_PTR_ROW_NO]
153
+ child = dix.lookup(row_no, offset, ch)
154
+ # puts "ch:#{ch} offset:#{offset} rowno:#{row_no} child:#{child}"
155
+ if child
156
+ dix_ptrs[j] = [dix_ptr[DIX_PTR_S], child[NODE_PTR_ROW_NO], child[NODE_PTR_IS_FINAL]]
157
+ j += 1
158
+ else
159
+ unless j + 1 == dix_ptrs.length
160
+ dix_ptrs[j] = dix_ptrs.pop
161
+ else
162
+ dix_ptrs.pop
163
+ end
164
+ end
165
+ end
166
+
167
+ update_latin_transducer(latin_transducer, ch, i, ch_vec)
168
+ update_punc_transducer(punc_transducer, ch, i, ch_vec)
169
+
170
+ dix_ptrs.each do |dix_ptr|
171
+ if dix_ptr[DIX_PTR_IS_FINAL]
172
+ new_s = dix_ptr[DIX_PTR_S]
173
+ # puts "NEW_S:#{new_s} DIX-PTR:#{dix_ptr} i:#{i}"
174
+ prev_link = path[new_s]
175
+ w = prev_link[LINK_W]
176
+ unk = prev_link[LINK_UNK]
177
+ new_link = [new_s, w + 1, unk, DICT]
178
+ link = new_link if better_link?(new_link, link)
179
+ end
180
+ end
181
+
182
+ if latin_transducer[TRANSDUCER_STATE] == COMPLETED
183
+ s = latin_transducer[TRANSDUCER_S]
184
+ prev_link = path[s]
185
+ w = prev_link[LINK_W]
186
+ unk = prev_link[LINK_UNK]
187
+ new_link = [s, w + 1, unk, LATIN]
188
+ link = new_link if better_link?(new_link, link)
189
+ end
190
+
191
+ if punc_transducer[TRANSDUCER_STATE] == COMPLETED
192
+ s = punc_transducer[TRANSDUCER_S]
193
+ prev_link = path[s]
194
+ w = prev_link[LINK_W]
195
+ unk = prev_link[LINK_UNK]
196
+ new_link = [s, w + 1, unk, PUNC]
197
+ link = new_link if better_link?(new_link, link)
198
+ end
199
+ left_boundary = i if link[LINK_KIND] != UNK
200
+ path << link
201
+ end
202
+ path
203
+ end
204
+
205
+ RANGE_S = 0
206
+ RANGE_E = 1
207
+
208
+ def path_to_ranges(path)
209
+ e = path.length - 1
210
+ ranges = []
211
+ while e > 0
212
+ link = path[e]
213
+ s = link[LINK_P_IDX]
214
+ ranges << [s,e]
215
+ e = s
216
+ end
217
+ ranges.reverse
218
+ end
219
+
220
+ def ranges_to_toks(ranges, str)
221
+ ranges.map {|s,e| str[s...e]}
222
+ end
223
+
224
+ def tokenize(dix, str)
225
+ ranges_to_toks(path_to_ranges(build_path(dix, str)), str)
226
+ end
227
+
228
+ def tokenize_with_delim(dix, str, delim)
229
+ tokenize(dix, str).join(delim)
230
+ end
28
231
  end
29
- end
232
+ end
233
+
234
+
235
+ #dix = APrefixTree.new([["กา",1],["กาก",1]])
236
+ #p dix
237
+ #p tokenize_with_delim(dix, "บทความนี้ใช้ระบบคริสต์ศักราช เพราะอ้างอิงคริสต์ศักราชและคริสต์ศตวรรษ หรืออย่างใดอย่างหนึ่ง", "|")
238
+
239
+
240
+ #t2 = [0,0,WAITING,LATIN]
241
+ #update_punc_transducer(t2, 32, 0, [32])
242
+ #p t2
243
+ #t1 = PrefixTree.new([["A",1]])
244
+ #p t1.lookup(0, 0, "A".codepoints[0])
245
+
246
+ #t2 = [0,0,WAITING,LATIN]
247
+ #update_punc_transducer(t2, 32, 0, [32])
248
+ #p t2
249
+ #t1 = PrefixTree.new([["A",1]])
250
+ #p t1.lookup(0, 0, "A".codepoints[0])
251
+
252
+
253
+ #word_breaker = ThaiLang::WordBreaker.new
254
+ #puts word_breaker.break_into_words("ฉันกินข้าว")
metadata CHANGED
@@ -1,71 +1,53 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: thailang4r
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
5
- prerelease:
4
+ version: 0.0.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Vee Satayamas
9
- autorequire:
8
+ autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-06-08 00:00:00.000000000 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
15
- name: cucumber
16
- requirement: !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ~>
20
- - !ruby/object:Gem::Version
21
- version: 1.2.1
22
- type: :development
23
- prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ~>
28
- - !ruby/object:Gem::Version
29
- version: 1.2.1
30
- description: Thai language utility for Ruby
11
+ date: 2021-04-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Thai language tools for Ruby, i.e. a word tokenizer, a character level
14
+ identifier, and a romanization tool
31
15
  email:
32
- - v.satayamas@gmail.com
16
+ - 5ssgdxltv@relay.firefox.com
33
17
  executables: []
34
18
  extensions: []
35
19
  extra_rdoc_files: []
36
20
  files:
37
- - lib/thailang4r.rb
38
- - lib/thailang4r/word_dag_builder.rb
39
- - lib/thailang4r/dict.rb
40
- - lib/thailang4r/ranges_builder.rb
41
- - lib/thailang4r/word_breaker.rb
42
21
  - LICENSE
43
22
  - README.md
44
23
  - Rakefile
45
- - data/test_dict.txt
46
24
  - data/tdict-std.txt
25
+ - data/test_dict.txt
26
+ - lib/thailang4r.rb
27
+ - lib/thailang4r/roman.rb
28
+ - lib/thailang4r/roman.rb~
29
+ - lib/thailang4r/word_breaker.rb
47
30
  homepage: https://github.com/veer66/thailang4r
48
- licenses: []
49
- post_install_message:
31
+ licenses:
32
+ - Apache-2.0
33
+ metadata: {}
34
+ post_install_message:
50
35
  rdoc_options: []
51
36
  require_paths:
52
37
  - lib
53
38
  required_ruby_version: !ruby/object:Gem::Requirement
54
- none: false
55
39
  requirements:
56
- - - ! '>='
40
+ - - ">="
57
41
  - !ruby/object:Gem::Version
58
- version: 1.9.3
42
+ version: 3.0.0
59
43
  required_rubygems_version: !ruby/object:Gem::Requirement
60
- none: false
61
44
  requirements:
62
- - - ! '>='
45
+ - - ">="
63
46
  - !ruby/object:Gem::Version
64
47
  version: '0'
65
48
  requirements: []
66
- rubyforge_project:
67
- rubygems_version: 1.8.25
68
- signing_key:
69
- specification_version: 3
49
+ rubygems_version: 3.2.3
50
+ signing_key:
51
+ specification_version: 4
70
52
  summary: Thai language utility for Ruby
71
53
  test_files: []
@@ -1,89 +0,0 @@
1
- module ThaiLang
2
- class Dict
3
- def initialize(file_path)
4
- load_dict(file_path)
5
- end
6
-
7
- def load_dict(file_path)
8
- File.open(file_path) do |f|
9
- @str_list = f.readlines.map{|line| line.chomp}
10
- end
11
- end
12
-
13
- def find_first_index_of_needle(prefix, offset = nil, s = nil, e = nil)
14
- find_index_of_needle(:FIRST, prefix, offset, s, e)
15
- end
16
-
17
- def find_last_index_of_needle(prefix, offset = nil, s = nil, e = nil)
18
- find_index_of_needle(:LAST, prefix, offset, s, e)
19
- end
20
-
21
- def find_index_of_needle(pos_type, prefix, offset = nil, s = nil, e = nil)
22
- offset = offset.nil? ? 0 : offset
23
- s = s.nil? ? 0 : s
24
- e = e.nil? ? @str_list.length : e
25
-
26
- l = s
27
- r = e - 1;
28
- ans = nil
29
-
30
- while l <= r do
31
- m = (l + r) / 2
32
- ch = @str_list[m][offset]
33
- if ch.nil? or prefix > ch
34
- l = m + 1
35
- elsif prefix < ch
36
- r = m - 1
37
- else
38
- ans = m
39
- if pos_type == :FIRST
40
- r = m - 1
41
- else
42
- l = m + 1
43
- end
44
- end
45
- end
46
-
47
- ans
48
- end
49
-
50
- def size
51
- @str_list.length
52
- end
53
-
54
- def [](i)
55
- @str_list[i]
56
- end
57
- end
58
-
59
- class DictIter
60
- def initialize(dict)
61
- @dict = dict
62
- @e = @dict.size
63
- @s = 0
64
- @state = :ACTIVE
65
- @offset = 0
66
- end
67
-
68
- def walk(ch)
69
- if @state != :INVALID
70
- first = @dict.find_first_index_of_needle ch, @offset, @s, @e
71
- if first.nil?
72
- @state = :INVALID
73
- else
74
- @s = first
75
- last = @dict.find_last_index_of_needle ch, @offset, @s, @e
76
- @e = last + 1
77
- len = @dict[first].length
78
- @offset += 1
79
- if(@offset == len)
80
- @state = :ACTIVE_BOUNDARY
81
- else
82
- @state = :ACTIVE
83
- end
84
- end
85
- end
86
- @state
87
- end
88
- end
89
- end
@@ -1,90 +0,0 @@
1
- module ThaiLang
2
- class RangesBuilder
3
- S = 0
4
- E = 1
5
- LINK_TYPE = 2
6
-
7
- POINTER = 0
8
- WEIGHT = 1
9
- PATH_UNK = 2
10
- PATH_LINK_TYPE = 3
11
-
12
- def _build_index(dag, pos)
13
- index = {}
14
- dag.each do |range|
15
- if not index.has_key?(range[pos])
16
- index[range[pos]] = []
17
- end
18
- index[range[pos]] << range
19
- end
20
- index
21
- end
22
-
23
- def _build_e_index(dag)
24
- _build_index(dag, E)
25
- end
26
-
27
- def _build_s_index(dag)
28
- _build_index(dag, S)
29
- end
30
-
31
- def _compare_path_info(a, b)
32
- a[PATH_UNK] < b[PATH_UNK] and a[WEIGHT] < b[WEIGHT]
33
- end
34
-
35
- def _build_path(len, s_index, e_index)
36
- path = Array.new(len + 1) {|i| nil}
37
- path[0] = [0, 0, 0, :UNK]
38
- left_boundary = 0
39
- for i in 1..len
40
- if e_index.has_key?(i)
41
- e_index[i].each do |range|
42
- s = range[S]
43
- if not path[s].nil?
44
- info = [s, path[s][WEIGHT] + 1, path[s][PATH_UNK], range[LINK_TYPE]]
45
- if path[i].nil? or _compare_path_info(info, path[i])
46
- path[i] = info
47
- end
48
- end
49
- end
50
- if not path[i].nil?
51
- left_boundary = i
52
- end
53
- end
54
- if path[i].nil? and s_index.has_key?(i)
55
- info = [left_boundary,
56
- path[left_boundary][WEIGHT] + 1,
57
- path[left_boundary][PATH_UNK] + 1,
58
- :UNK]
59
- path[i] = info;
60
- end
61
- end
62
- if path[len].nil?
63
- path[len] = [left_boundary,
64
- path[left_boundary][WEIGHT] + 1,
65
- path[left_boundary][PATH_UNK] + 1, :UNK]
66
- end
67
- path
68
- end
69
-
70
- def _path_to_ranges(path, len)
71
- ranges = []
72
- i = len
73
- while i > 0
74
- info = path[i]
75
- s = info[POINTER]
76
- ranges << [s, i, info[PATH_LINK_TYPE]]
77
- i = s
78
- end
79
- ranges.reverse
80
- end
81
-
82
- def build_from_dag(dag, len)
83
- s_index = _build_s_index(dag)
84
- e_index = _build_e_index(dag)
85
- path = _build_path(len, s_index, e_index)
86
- _path_to_ranges(path, len)
87
- end
88
- end
89
-
90
- end
@@ -1,78 +0,0 @@
1
- module ThaiLang
2
- class WordDagBuilder
3
- def initialize(dict)
4
- @dict = dict
5
- end
6
-
7
- def build(string, len)
8
- dag = []
9
- _build_by_dict(dag, string, len)
10
- #_build_by_latin_rule(dag, string, len)
11
- dag.sort do |a,b|
12
- r = 0
13
- for i in 0..2
14
- r = a[i] <=> b[i]
15
- if r != 0
16
- break
17
- end
18
- end
19
- r
20
- end
21
- end
22
-
23
- def _build_by_latin_rule(dag, string, len)
24
- next_latin = 0
25
- for i in 0..(len-1)
26
- space_e = nil
27
- latin_e = nil
28
- space_break = false
29
- latin_break = false
30
-
31
- for j in i..(len-1)
32
- if space_break and latin_break
33
- break
34
- end
35
- ch = string[j]
36
- if not space_break
37
- if ch == " "
38
- space_e = j + 1
39
- else
40
- space_break = true
41
- end
42
- end
43
-
44
- if latin_break and j >= next_latin
45
- if /A-Za-z/.match(ch)
46
- latin_e = j + 1
47
- else
48
- latin_break = true
49
- end
50
- end
51
- end
52
-
53
- if not space_e.nil?
54
- dag << [i, space_e, :SPACE]
55
- end
56
- if not latin_e.nil?
57
- dag << [i, latin_e, :LATIN]
58
- next_latin = latin_e;
59
- end
60
- end
61
- end
62
-
63
- def _build_by_dict(dag, string, len)
64
- for i in 0..(len-1)
65
- iter = DictIter.new @dict
66
- for j in i..(len-1)
67
- ch = string[j]
68
- status = iter.walk ch
69
- if status == :INVALID
70
- break
71
- elsif status == :ACTIVE_BOUNDARY
72
- dag << [i, j + 1, :DICT]
73
- end
74
- end
75
- end
76
- end
77
- end
78
- end